865 files changed, 500102 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
new file mode 100644
index 0000000..1c022aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -0,0 +1,49 @@
+//==-- AArch64.h - Top-level interface for AArch64  --------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AArch64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_AArch64_H
+#define TARGET_AArch64_H
+
+#include "Utils/AArch64BaseInfo.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AArch64TargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+
+FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64ConditionalCompares();
+FunctionPass *createAArch64AdvSIMDScalar();
+FunctionPass *createAArch64BranchRelaxation();
+FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
+                                 CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64StorePairSuppressPass();
+FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64LoadStoreOptimizationPass();
+ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64AddressTypePromotionPass();
+/// \brief Creates an ARM-specific Target Transformation Info pass.
+ImmutablePass *
+createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
+
+FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
+
+FunctionPass *createAArch64CollectLOHPass();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
new file mode 100644
index 0000000..e6a27c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -0,0 +1,135 @@
+//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Subtarget features.
+//
+
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
+                                       "Enable ARMv8 FP">;
+
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+  "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
+
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+  "Enable cryptographic instructions">;
+
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+  "Enable ARMv8 CRC-32 checksum instructions">;
+
+/// Cyclone has register move instructions which are "free".
+def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
+                                        "Has zero-cycle register moves">;
+
+/// Cyclone has instructions which zero registers for "free".
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                                        "Has zero-cycle zeroing instructions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AArch64RegisterInfo.td"
+include "AArch64CallingConvention.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AArch64Schedule.td"
+include "AArch64InstrInfo.td"
+
+def AArch64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// AArch64 Processors supported.
+//
+include "AArch64SchedA53.td"
+include "AArch64SchedA57.td"
+include "AArch64SchedCyclone.td"
+
+def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+                                   "Cortex-A53 ARM processors",
+                                   [FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeatureCrypto,
+                                   FeatureCRC]>;
+
+def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+                                   "Cortex-A57 ARM processors",
+                                   [FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeatureCrypto,
+                                   FeatureCRC]>;
+
+def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
+                                   "Cyclone",
+                                   [FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeatureCrypto,
+                                   FeatureCRC,
+                                   FeatureZCRegMove, FeatureZCZeroing]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
+                                              FeatureNEON,
+                                              FeatureCRC]>;
+
+def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def GenericAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+  string Name = "generic";
+}
+
+def AppleAsmParserVariant : AsmParserVariant {
+  int Variant = 1;
+  string Name = "apple-neon";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// AArch64 Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def GenericAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
+
+def AppleAsmWriter : AsmWriter {
+  let AsmWriterClassName = "AppleInstPrinter";
+  int Variant = 1;
+  int isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def AArch64 : Target {
+  let InstructionSet = AArch64InstrInfo;
+  let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
+  let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
new file mode 100644
index 0000000..ab2c4b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -0,0 +1,491 @@
+//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to promote the computations use to obtained a sign extended
+// value used into memory accesses.
+// E.g.
+// a = add nsw i32 b, 3
+// d = sext i32 a to i64
+// e = getelementptr ..., i64 d
+//
+// =>
+// f = sext i32 b to i64
+// a = add nsw i64 f, 3
+// e = getelementptr ..., i64 a
+//
+// This is legal to do so if the computations are markers with either nsw or nuw
+// markers.
+// Moreover, the current heuristic is simple: it does not create new sext
+// operations, i.e., it gives up when a sext would have forked (e.g., if
+// a = add i32 b, c, two sexts are required to promote the computation).
+//
+// FIXME: This pass may be useful for other targets too.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-type-promotion"
+
+static cl::opt<bool>
+EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden,
+                           cl::desc("Enable the type promotion pass"),
+                           cl::init(true));
+static cl::opt<bool>
+EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
+            cl::desc("Enable merging of redundant sexts when one is dominating"
+                     " the other."),
+            cl::init(true));
+
+//===----------------------------------------------------------------------===//
+//                       AArch64AddressTypePromotion
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+void initializeAArch64AddressTypePromotionPass(PassRegistry &);
+}
+
+namespace {
+class AArch64AddressTypePromotion : public FunctionPass {
+
+public:
+  static char ID;
+  AArch64AddressTypePromotion()
+      : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+    initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "AArch64 Address Type Promotion";
+  }
+
+  /// Iterate over the functions and promote the computation of interesting
+  // sext instructions.
+  bool runOnFunction(Function &F) override;
+
+private:
+  /// The current function.
+  Function *Func;
+  /// Filter out all sexts that does not have this type.
+  /// Currently initialized with Int64Ty.
+  Type *ConsideredSExtType;
+
+  // This transformation requires dominator info.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
+  typedef SmallVector<Instruction *, 16> Instructions;
+  typedef DenseMap<Value *, Instructions> ValueToInsts;
+
+  /// Check if it is profitable to move a sext through this instruction.
+  /// Currently, we consider it is profitable if:
+  /// - Inst is used only once (no need to insert truncate).
+  /// - Inst has only one operand that will require a sext operation (we do
+  ///   do not create new sext operation).
+  bool shouldGetThrough(const Instruction *Inst);
+
+  /// Check if it is possible and legal to move a sext through this
+  /// instruction.
+  /// Current heuristic considers that we can get through:
+  /// - Arithmetic operation marked with the nsw or nuw flag.
+  /// - Other sext operation.
+  /// - Truncate operation if it was just dropping sign extended bits.
+  bool canGetThrough(const Instruction *Inst);
+
+  /// Move sext operations through safe to sext instructions.
+  bool propagateSignExtension(Instructions &SExtInsts);
+
+  /// Is this sext should be considered for code motion.
+  /// We look for sext with ConsideredSExtType and uses in at least one
+  // GetElementPtrInst.
+  bool shouldConsiderSExt(const Instruction *SExt) const;
+
+  /// Collect all interesting sext operations, i.e., the ones with the right
+  /// type and used in memory accesses.
+  /// More precisely, a sext instruction is considered as interesting if it
+  /// is used in a "complex" getelementptr or it exits at least another
+  /// sext instruction that sign extended the same initial value.
+  /// A getelementptr is considered as "complex" if it has more than 2
+  // operands.
+  void analyzeSExtension(Instructions &SExtInsts);
+
+  /// Merge redundant sign extension operations in common dominator.
+  void mergeSExts(ValueToInsts &ValToSExtendedUses,
+                  SetOfInstructions &ToRemove);
+};
+} // end anonymous namespace.
+
+char AArch64AddressTypePromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
+                      "AArch64 Type Promotion Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
+                    "AArch64 Type Promotion Pass", false, false)
+
+FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
+  return new AArch64AddressTypePromotion();
+}
+
+bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
+  if (isa<SExtInst>(Inst))
+    return true;
+
+  const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+  if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+      (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
+    return true;
+
+  // sext(trunc(sext)) --> sext
+  if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
+    const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
+    // Check that the truncate just drop sign extended bits.
+    if (Inst->getType()->getIntegerBitWidth() >=
+            Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
+        Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
+            ConsideredSExtType->getIntegerBitWidth())
+      return true;
+  }
+
+  return false;
+}
+
+bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
+  // If the type of the sext is the same as the considered one, this sext
+  // will become useless.
+  // Otherwise, we will have to do something to preserve the original value,
+  // unless it is used once.
+  if (isa<SExtInst>(Inst) &&
+      (Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
+    return true;
+
+  // If the Inst is used more that once, we may need to insert truncate
+  // operations and we don't do that at the moment.
+  if (!Inst->hasOneUse())
+    return false;
+
+  // This truncate is used only once, thus if we can get thourgh, it will become
+  // useless.
+  if (isa<TruncInst>(Inst))
+    return true;
+
+  // If both operands are not constant, a new sext will be created here.
+  // Current heuristic is: each step should be profitable.
+  // Therefore we don't allow to increase the number of sext even if it may
+  // be profitable later on.
+  if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
+    return true;
+
+  return false;
+}
+
+static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
+  if (isa<SelectInst>(Inst) && OpIdx == 0)
+    return false;
+  return true;
+}
+
+bool
+AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
+  if (SExt->getType() != ConsideredSExtType)
+    return false;
+
+  for (const User *U : SExt->users()) {
+    if (isa<GetElementPtrInst>(U))
+      return true;
+  }
+
+  return false;
+}
+
+// Input:
+// - SExtInsts contains all the sext instructions that are use direclty in
+//   GetElementPtrInst, i.e., access to memory.
+// Algorithm:
+// - For each sext operation in SExtInsts:
+//   Let var be the operand of sext.
+//   while it is profitable (see shouldGetThrough), legal, and safe
+//   (see canGetThrough) to move sext through var's definition:
+//   * promote the type of var's definition.
+//   * fold var into sext uses.
+//   * move sext above var's definition.
+//   * update sext operand to use the operand of var that should be sign
+//     extended (by construction there is only one).
+//
+//   E.g.,
+//   a = ... i32 c, 3
+//   b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
+//   ...
+//   = b
+// => Yes, update the code
+//   b = sext i32 c to i64
+//   a = ... i64 b, 3
+//   ...
+//   = a
+// Iterate on 'c'.
+bool
+AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
+  DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
+
+  bool LocalChange = false;
+  SetOfInstructions ToRemove;
+  ValueToInsts ValToSExtendedUses;
+  while (!SExtInsts.empty()) {
+    // Get through simple chain.
+    Instruction *SExt = SExtInsts.pop_back_val();
+
+    DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
+
+    // If this SExt has already been merged continue.
+    if (SExt->use_empty() && ToRemove.count(SExt)) {
+      DEBUG(dbgs() << "No uses => marked as delete\n");
+      continue;
+    }
+
+    // Now try to get through the chain of definitions.
+    while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) {
+      DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
+      if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
+        // We cannot get through something that is not an Instruction
+        // or not safe to SExt.
+        DEBUG(dbgs() << "Cannot get through\n");
+        break;
+      }
+
+      LocalChange = true;
+      // If this is a sign extend, it becomes useless.
+      if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
+        DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
+        // We cannot use replaceAllUsesWith here because we may trigger some
+        // assertion on the type as all involved sext operation may have not
+        // been moved yet.
+        while (!Inst->use_empty()) {
+          Use &U = *Inst->use_begin();
+          Instruction *User = dyn_cast<Instruction>(U.getUser());
+          assert(User && "User of sext is not an Instruction!");
+          User->setOperand(U.getOperandNo(), SExt);
+        }
+        ToRemove.insert(Inst);
+        SExt->setOperand(0, Inst->getOperand(0));
+        SExt->moveBefore(Inst);
+        continue;
+      }
+
+      // Get through the Instruction:
+      // 1. Update its type.
+      // 2. Replace the uses of SExt by Inst.
+      // 3. Sign extend each operand that needs to be sign extended.
+
+      // Step #1.
+      Inst->mutateType(SExt->getType());
+      // Step #2.
+      SExt->replaceAllUsesWith(Inst);
+      // Step #3.
+      Instruction *SExtForOpnd = SExt;
+
+      DEBUG(dbgs() << "Propagate SExt to operands\n");
+      for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
+           ++OpIdx) {
+        DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
+        if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
+            !shouldSExtOperand(Inst, OpIdx)) {
+          DEBUG(dbgs() << "No need to propagate\n");
+          continue;
+        }
+        // Check if we can statically sign extend the operand.
+        Value *Opnd = Inst->getOperand(OpIdx);
+        if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+          DEBUG(dbgs() << "Statically sign extend\n");
+          Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
+                                                         Cst->getSExtValue()));
+          continue;
+        }
+        // UndefValue are typed, so we have to statically sign extend them.
+        if (isa<UndefValue>(Opnd)) {
+          DEBUG(dbgs() << "Statically sign extend\n");
+          Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
+          continue;
+        }
+
+        // Otherwise we have to explicity sign extend it.
+        assert(SExtForOpnd &&
+               "Only one operand should have been sign extended");
+
+        SExtForOpnd->setOperand(0, Opnd);
+
+        DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
+        // Move the sign extension before the insertion point.
+        SExtForOpnd->moveBefore(Inst);
+        Inst->setOperand(OpIdx, SExtForOpnd);
+        // If more sext are required, new instructions will have to be created.
+        SExtForOpnd = nullptr;
+      }
+      if (SExtForOpnd == SExt) {
+        DEBUG(dbgs() << "Sign extension is useless now\n");
+        ToRemove.insert(SExt);
+        break;
+      }
+    }
+
+    // If the use is already of the right type, connect its uses to its argument
+    // and delete it.
+    // This can happen for an Instruction which all uses are sign extended.
+    if (!ToRemove.count(SExt) &&
+        SExt->getType() == SExt->getOperand(0)->getType()) {
+      DEBUG(dbgs() << "Sign extension is useless, attach its use to "
+                      "its argument\n");
+      SExt->replaceAllUsesWith(SExt->getOperand(0));
+      ToRemove.insert(SExt);
+    } else
+      ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
+  }
+
+  if (EnableMerge)
+    mergeSExts(ValToSExtendedUses, ToRemove);
+
+  // Remove all instructions marked as ToRemove.
+  for (Instruction *I: ToRemove)
+    I->eraseFromParent();
+  return LocalChange;
+}
+
+void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+                                             SetOfInstructions &ToRemove) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  for (auto &Entry : ValToSExtendedUses) {
+    Instructions &Insts = Entry.second;
+    Instructions CurPts;
+    for (Instruction *Inst : Insts) {
+      if (ToRemove.count(Inst))
+        continue;
+      bool inserted = false;
+      for (auto &Pt : CurPts) {
+        if (DT.dominates(Inst, Pt)) {
+          DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
+                       << *Inst << '\n');
+          Pt->replaceAllUsesWith(Inst);
+          ToRemove.insert(Pt);
+          Pt = Inst;
+          inserted = true;
+          break;
+        }
+        if (!DT.dominates(Pt, Inst))
+          // Give up if we need to merge in a common dominator as the
+          // expermients show it is not profitable.
+          continue;
+
+        DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
+                     << *Pt << '\n');
+        Inst->replaceAllUsesWith(Pt);
+        ToRemove.insert(Inst);
+        inserted = true;
+        break;
+      }
+      if (!inserted)
+        CurPts.push_back(Inst);
+    }
+  }
+}
+
+void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+  DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
+
+  DenseMap<Value *, Instruction *> SeenChains;
+
+  for (auto &BB : *Func) {
+    for (auto &II : BB) {
+      Instruction *SExt = &II;
+
+      // Collect all sext operation per type.
+      if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
+        continue;
+
+      DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
+
+      // Cases where we actually perform the optimization:
+      // 1. SExt is used in a getelementptr with more than 2 operand =>
+      //    likely we can merge some computation if they are done on 64 bits.
+      // 2. The beginning of the SExt chain is SExt several time. =>
+      //    code sharing is possible.
+
+      bool insert = false;
+      // #1.
+      for (const User *U : SExt->users()) {
+        const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
+        if (Inst && Inst->getNumOperands() > 2) {
+          DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
+                       << '\n');
+          insert = true;
+          break;
+        }
+      }
+
+      // #2.
+      // Check the head of the chain.
+      Instruction *Inst = SExt;
+      Value *Last;
+      do {
+        int OpdIdx = 0;
+        const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+        if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
+          OpdIdx = 1;
+        Last = Inst->getOperand(OpdIdx);
+        Inst = dyn_cast<Instruction>(Last);
+      } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
+
+      DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
+      DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+          SeenChains.find(Last);
+      if (insert || AlreadySeen != SeenChains.end()) {
+        DEBUG(dbgs() << "Insert\n");
+        SExtInsts.push_back(SExt);
+        if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
+          DEBUG(dbgs() << "Insert chain member\n");
+          SExtInsts.push_back(AlreadySeen->second);
+          SeenChains[Last] = nullptr;
+        }
+      } else {
+        DEBUG(dbgs() << "Record its chain membership\n");
+        SeenChains[Last] = SExt;
+      }
+    }
+  }
+}
+
+bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+  if (!EnableAddressTypePromotion || F.isDeclaration())
+    return false;
+  Func = &F;
+  ConsideredSExtType = Type::getInt64Ty(Func->getContext());
+
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
+
+  Instructions SExtInsts;
+  analyzeSExtension(SExtInsts);
+  return propagateSignExtension(SExtInsts);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
new file mode 100644
index 0000000..734fb21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -0,0 +1,387 @@
+//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When profitable, replace GPR targeting i64 instructions with their
+// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
+// as minimizing the number of cross-class register copies.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Graph based predicate heuristics.
+// Walking the instruction list linearly will get many, perhaps most, of
+// the cases, but to do a truly thorough job of this, we need a more
+// wholistic approach.
+//
+// This optimization is very similar in spirit to the register allocator's
+// spill placement, only here we're determining where to place cross-class
+// register copies rather than spills. As such, a similar approach is
+// called for.
+//
+// We want to build up a set of graphs of all instructions which are candidates
+// for transformation along with instructions which generate their inputs and
+// consume their outputs. For each edge in the graph, we assign a weight
+// based on whether there is a copy required there (weight zero if not) and
+// the block frequency of the block containing the defining or using
+// instruction, whichever is less. Our optimization is then a graph problem
+// to minimize the total weight of all the graphs, then transform instructions
+// and add or remove copy instructions as called for to implement the
+// solution.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-simd-scalar"
+
+// Allow forcing all i64 operations with equivalent SIMD instructions to use
+// them. For stress-testing the transformation function.
+static cl::opt<bool>
+TransformAll("aarch64-simd-scalar-force-all",
+             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
+             cl::init(false), cl::Hidden);
+
+STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
+STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
+STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+
+namespace {
+class AArch64AdvSIMDScalar : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const AArch64InstrInfo *TII;
+
+private:
+  // isProfitableToTransform - Predicate function to determine whether an
+  // instruction should be transformed to its equivalent AdvSIMD scalar
+  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+  bool isProfitableToTransform(const MachineInstr *MI) const;
+
+  // transformInstruction - Perform the transformation of an instruction
+  // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+  // to be the correct register class, minimizing cross-class copies.
+  void transformInstruction(MachineInstr *MI);
+
+  // processMachineBasicBlock - Main optimzation loop.
+  bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  const char *getPassName() const override {
+    return "AdvSIMD Scalar Operation Optimization";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64AdvSIMDScalar::ID = 0;
+} // end anonymous namespace
+
+static bool isGPR64(unsigned Reg, unsigned SubReg,
+                    const MachineRegisterInfo *MRI) {
+  if (SubReg)
+    return false;
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
+  return AArch64::GPR64RegClass.contains(Reg);
+}
+
+static bool isFPR64(unsigned Reg, unsigned SubReg,
+                    const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
+            SubReg == 0) ||
+           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
+            SubReg == AArch64::dsub);
+  // Physical register references just check the register class directly.
+  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
+         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
+}
+
+// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
+// copy instruction. Return zero_reg if the instruction is not a copy.
+static unsigned getSrcFromCopy(const MachineInstr *MI,
+                               const MachineRegisterInfo *MRI,
+                               unsigned &SubReg) {
+  SubReg = 0;
+  // The "FMOV Xd, Dn" instruction is the typical form.
+  if (MI->getOpcode() == AArch64::FMOVDXr ||
+      MI->getOpcode() == AArch64::FMOVXDr)
+    return MI->getOperand(1).getReg();
+  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
+  // these at this stage, but it's easy to check for.
+  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
+    SubReg = AArch64::dsub;
+    return MI->getOperand(1).getReg();
+  }
+  // Or just a plain COPY instruction. This can be directly to/from FPR64,
+  // or it can be a dsub subreg reference to an FPR128.
+  if (MI->getOpcode() == AArch64::COPY) {
+    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+                MRI) &&
+        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
+      return MI->getOperand(1).getReg();
+    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+                MRI) &&
+        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
+                MRI)) {
+      SubReg = MI->getOperand(1).getSubReg();
+      return MI->getOperand(1).getReg();
+    }
+  }
+
+  // Otherwise, this is some other kind of instruction.
+  return 0;
+}
+
+// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
+// that we're considering transforming to, return that AdvSIMD opcode. For all
+// others, return the original opcode.
+static int getTransformOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    break;
+  // FIXME: Lots more possibilities.
+  case AArch64::ADDXrr:
+    return AArch64::ADDv1i64;
+  case AArch64::SUBXrr:
+    return AArch64::SUBv1i64;
+  }
+  // No AdvSIMD equivalent, so just return the original opcode.
+  return Opc;
+}
+
+static bool isTransformable(const MachineInstr *MI) {
+  int Opc = MI->getOpcode();
+  return Opc != getTransformOpcode(Opc);
+}
+
+// isProfitableToTransform - Predicate function to determine whether an
+// instruction should be transformed to its equivalent AdvSIMD scalar
+// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+bool
+AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
+  // early exit since that's the common case.
+  if (!isTransformable(MI))
+    return false;
+
+  // Count the number of copies we'll need to add and approximate the number
+  // of copies that a transform will enable us to remove.
+  unsigned NumNewCopies = 3;
+  unsigned NumRemovableCopies = 0;
+
+  unsigned OrigSrc0 = MI->getOperand(1).getReg();
+  unsigned OrigSrc1 = MI->getOperand(2).getReg();
+  unsigned Src0 = 0, SubReg0;
+  unsigned Src1 = 0, SubReg1;
+  if (!MRI->def_empty(OrigSrc0)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc0);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+    // If the source was from a copy, we don't need to insert a new copy.
+    if (Src0)
+      --NumNewCopies;
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
+      ++NumRemovableCopies;
+  }
+  if (!MRI->def_empty(OrigSrc1)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc1);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+    if (Src1)
+      --NumNewCopies;
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
+      ++NumRemovableCopies;
+  }
+
+  // If any of the uses of the original instructions is a cross class copy,
+  // that's a copy that will be removable if we transform. Likewise, if
+  // any of the uses is a transformable instruction, it's likely the tranforms
+  // will chain, enabling us to save a copy there, too. This is an aggressive
+  // heuristic that approximates the graph based cost analysis described above.
+  unsigned Dst = MI->getOperand(0).getReg();
+  bool AllUsesAreCopies = true;
+  for (MachineRegisterInfo::use_instr_nodbg_iterator
+           Use = MRI->use_instr_nodbg_begin(Dst),
+           E = MRI->use_instr_nodbg_end();
+       Use != E; ++Use) {
+    unsigned SubReg;
+    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
+      ++NumRemovableCopies;
+    // If the use is an INSERT_SUBREG, that's still something that can
+    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
+    // preferable to have it use the FPR64 in most cases, as if the source
+    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
+    // Ditto for a lane insert.
+    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
+             Use->getOpcode() == AArch64::INSvi64gpr)
+      ;
+    else
+      AllUsesAreCopies = false;
+  }
+  // If all of the uses of the original destination register are copies to
+  // FPR64, then we won't end up having a new copy back to GPR64 either.
+  if (AllUsesAreCopies)
+    --NumNewCopies;
+
+  // If a transform will not increase the number of cross-class copies required,
+  // return true.
+  if (NumNewCopies <= NumRemovableCopies)
+    return true;
+
+  // Finally, even if we otherwise wouldn't transform, check if we're forcing
+  // transformation of everything.
+  return TransformAll;
+}
+
+static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI,
+                                unsigned Dst, unsigned Src, bool IsKill) {
+  MachineInstrBuilder MIB =
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
+              Dst)
+          .addReg(Src, getKillRegState(IsKill));
+  DEBUG(dbgs() << "    adding copy: " << *MIB);
+  ++NumCopiesInserted;
+  return MIB;
+}
+
+// transformInstruction - Perform the transformation of an instruction
+// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+// to be the correct register class, minimizing cross-class copies.
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
+  DEBUG(dbgs() << "Scalar transform: " << *MI);
+
+  MachineBasicBlock *MBB = MI->getParent();
+  int OldOpc = MI->getOpcode();
+  int NewOpc = getTransformOpcode(OldOpc);
+  assert(OldOpc != NewOpc && "transform an instruction to itself?!");
+
+  // Check if we need a copy for the source registers.
+  unsigned OrigSrc0 = MI->getOperand(1).getReg();
+  unsigned OrigSrc1 = MI->getOperand(2).getReg();
+  unsigned Src0 = 0, SubReg0;
+  unsigned Src1 = 0, SubReg1;
+  if (!MRI->def_empty(OrigSrc0)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc0);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
+      assert(Src0 && "Can't delete copy w/o a valid original source!");
+      Def->eraseFromParent();
+      ++NumCopiesDeleted;
+    }
+  }
+  if (!MRI->def_empty(OrigSrc1)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc1);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
+      assert(Src1 && "Can't delete copy w/o a valid original source!");
+      Def->eraseFromParent();
+      ++NumCopiesDeleted;
+    }
+  }
+  // If we weren't able to reference the original source directly, create a
+  // copy.
+  if (!Src0) {
+    SubReg0 = 0;
+    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+    insertCopy(TII, MI, Src0, OrigSrc0, true);
+  }
+  if (!Src1) {
+    SubReg1 = 0;
+    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+    insertCopy(TII, MI, Src1, OrigSrc1, true);
+  }
+
+  // Create a vreg for the destination.
+  // FIXME: No need to do this if the ultimate user expects an FPR64.
+  // Check for that and avoid the copy if possible.
+  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+
+  // For now, all of the new instructions have the same simple three-register
+  // form, so no need to special case based on what instruction we're
+  // building.
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
+      .addReg(Src0, getKillRegState(true), SubReg0)
+      .addReg(Src1, getKillRegState(true), SubReg1);
+
+  // Now copy the result back out to a GPR.
+  // FIXME: Try to avoid this if all uses could actually just use the FPR64
+  // directly.
+  insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
+
+  // Erase the old instruction.
+  MI->eraseFromParent();
+
+  ++NumScalarInsnsUsed;
+}
+
+// processMachineBasicBlock - Main optimzation loop.
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr *MI = I;
+    ++I;
+    if (isProfitableToTransform(MI)) {
+      transformInstruction(MI);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+// runOnMachineFunction - Pass entry point from PassManager.
+bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+
+  const TargetMachine &TM = mf.getTarget();
+  MRI = &mf.getRegInfo();
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+
+  // Just check things on a one-block-at-a-time basis.
+  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
+    if (processMachineBasicBlock(I))
+      Changed = true;
+  return Changed;
+}
+
+// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
+// to add the pass to the PassManager.
+FunctionPass *llvm::createAArch64AdvSIMDScalar() {
+  return new AArch64AdvSIMDScalar();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
new file mode 100644
index 0000000..cd94e24
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -0,0 +1,524 @@
+//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the AArch64 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64MCInstLower.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+class AArch64AsmPrinter : public AsmPrinter {
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  AArch64MCInstLower MCInstLowering;
+  StackMaps SM;
+
+public:
+  AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
+        MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+        LOHLabelCounter(0) {}
+
+  const char *getPassName() const override {
+    return "AArch64 Assembly Printer";
+  }
+
+  /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+  /// tblgen'erated pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+    return MCInstLowering.lowerOperand(MO, MCOp);
+  }
+
+  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                     const MachineInstr &MI);
+  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                       const MachineInstr &MI);
+  /// \brief tblgen'erated driver function for lowering simple MI->MC
+  /// pseudo instructions.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AsmPrinter::getAnalysisUsage(AU);
+    AU.setPreservesAll();
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    AArch64FI = F.getInfo<AArch64FunctionInfo>();
+    return AsmPrinter::runOnMachineFunction(F);
+  }
+
+private:
+  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+  bool printAsmRegInClass(const MachineOperand &MO,
+                          const TargetRegisterClass *RC, bool isVector,
+                          raw_ostream &O);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  void EmitFunctionBodyEnd() override;
+
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+  void EmitEndOfAsmFile(Module &M) override;
+  AArch64FunctionInfo *AArch64FI;
+
+  /// \brief Emit the LOHs contained in AArch64FI.
+  void EmitLOHs();
+
+  typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+  MInstToMCSymbol LOHInstToLabel;
+  unsigned LOHLabelCounter;
+};
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMachO()) {
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+    SM.serializeToStackMapSection();
+  }
+
+  // Emit a .data.rel section containing any stubs that were created.
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+                                    TD->getPointerSize(0));
+      }
+      Stubs.clear();
+    }
+  }
+
+}
+
+MachineLocation
+AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+  MachineLocation Location;
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  // Frame address.  Currently handles register +- offset only.
+  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+  else {
+    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+  }
+  return Location;
+}
+
+void AArch64AsmPrinter::EmitLOHs() {
+  SmallVector<MCSymbol *, 3> MCArgs;
+
+  for (const auto &D : AArch64FI->getLOHContainer()) {
+    for (const MachineInstr *MI : D.getArgs()) {
+      MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
+      assert(LabelIt != LOHInstToLabel.end() &&
+             "Label hasn't been inserted for LOH related instruction");
+      MCArgs.push_back(LabelIt->second);
+    }
+    OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
+    MCArgs.clear();
+  }
+}
+
+void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+  if (!AArch64FI->getLOHRelated().empty())
+    EmitLOHs();
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  // Darwin uses a linker-private symbol name for constant-pools (to
+  // avoid addends on the relocation?), ELF has no such concept and
+  // uses a normal private symbol.
+  if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
+    return OutContext.GetOrCreateSymbol(
+        Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
+        Twine(getFunctionNumber()) + "_" + Twine(CPID));
+
+  return OutContext.GetOrCreateSymbol(
+      Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
+      Twine(getFunctionNumber()) + "_" + Twine(CPID));
+}
+
+void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    O << AArch64InstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#' << Imm;
+    break;
+  }
+  }
+}
+
+bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+                                          raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default:
+    return true; // Unknown mode.
+  case 'w':
+    Reg = getWRegFromXReg(Reg);
+    break;
+  case 'x':
+    Reg = getXRegFromWReg(Reg);
+    break;
+  }
+
+  O << AArch64InstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+// Prints the register in MO using class RC using the offset in the
+// new register class. This should not be used for cross class
+// printing.
+bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+                                           const TargetRegisterClass *RC,
+                                           bool isVector, raw_ostream &O) {
+  assert(MO.isReg() && "Should only get here with a register!");
+  const AArch64RegisterInfo *RI =
+      static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+  unsigned Reg = MO.getReg();
+  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
+  assert(RI->regsOverlap(RegToPrint, Reg));
+  O << AArch64InstPrinter::getRegisterName(
+           RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+  return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode, raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+
+  // First try the generic code, which knows about modifiers like 'c' and 'n'.
+  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+    return false;
+
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      return true; // Unknown modifier.
+    case 'w':      // Print W register
+    case 'x':      // Print X register
+      if (MO.isReg())
+        return printAsmMRegister(MO, ExtraCode[0], O);
+      if (MO.isImm() && MO.getImm() == 0) {
+        unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR;
+        O << AArch64InstPrinter::getRegisterName(Reg);
+        return false;
+      }
+      printOperand(MI, OpNum, O);
+      return false;
+    case 'b': // Print B register.
+    case 'h': // Print H register.
+    case 's': // Print S register.
+    case 'd': // Print D register.
+    case 'q': // Print Q register.
+      if (MO.isReg()) {
+        const TargetRegisterClass *RC;
+        switch (ExtraCode[0]) {
+        case 'b':
+          RC = &AArch64::FPR8RegClass;
+          break;
+        case 'h':
+          RC = &AArch64::FPR16RegClass;
+          break;
+        case 's':
+          RC = &AArch64::FPR32RegClass;
+          break;
+        case 'd':
+          RC = &AArch64::FPR64RegClass;
+          break;
+        case 'q':
+          RC = &AArch64::FPR128RegClass;
+          break;
+        default:
+          return true;
+        }
+        return printAsmRegInClass(MO, RC, false /* vector */, O);
+      }
+      printOperand(MI, OpNum, O);
+      return false;
+    }
+  }
+
+  // According to ARM, we should emit x and v registers unless we have a
+  // modifier.
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+
+    // If this is a w or x register, print an x register.
+    if (AArch64::GPR32allRegClass.contains(Reg) ||
+        AArch64::GPR64allRegClass.contains(Reg))
+      return printAsmMRegister(MO, 'x', O);
+
+    // If this is a b, h, s, d, or q register, print it as a v register.
+    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
+                              O);
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNum,
+                                              unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                               raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps == 4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '[';
+  printOperand(MI, 0, OS);
+  OS << '+';
+  printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps - 2, OS);
+}
+
+void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                                      const MachineInstr &MI) {
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+  SM.recordStackMap(MI);
+  // Emit padding.
+  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+  for (unsigned i = 0; i < NumNOPBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                                        const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers Opers(&MI);
+
+  int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  unsigned EncodedBytes = 0;
+  if (CallTarget) {
+    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+           "High 16 bits of call target should be zero.");
+    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    EncodedBytes = 16;
+    // Materialize the jump address:
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 32) & 0xFFFF)
+                                    .addImm(32));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 16) & 0xFFFF)
+                                    .addImm(16));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm(CallTarget & 0xFFFF)
+                                    .addImm(0));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
+  }
+  // Emit padding.
+  unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "AArch64GenMCPseudoLowering.inc"
+
+void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(OutStreamer, MI))
+    return;
+
+  if (AArch64FI->getLOHRelated().count(MI)) {
+    // Generate a label for LOH related instruction
+    MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
+    // Associate the instruction with the label
+    LOHInstToLabel[MI] = LOHLabel;
+    OutStreamer.EmitLabel(LOHLabel);
+  }
+
+  // Do any manual lowerings.
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+
+  // Tail calls use pseudo instructions so they have the proper code-gen
+  // attributes (isCall, isReturn, etc.). We lower them to the real
+  // instruction here.
+  case AArch64::TCRETURNri: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::BR);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TCRETURNdi: {
+    MCOperand Dest;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::B);
+    TmpInst.addOperand(Dest);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TLSDESC_BLR: {
+    MCOperand Callee, Sym;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
+    MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
+
+    // First emit a relocation-annotation. This expands to no code, but requests
+    // the following instruction gets an R_AARCH64_TLSDESC_CALL.
+    MCInst TLSDescCall;
+    TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
+    TLSDescCall.addOperand(Sym);
+    EmitToStreamer(OutStreamer, TLSDescCall);
+
+    // Other than that it's just a normal indirect call to the function loaded
+    // from the descriptor.
+    MCInst BLR;
+    BLR.setOpcode(AArch64::BLR);
+    BLR.addOperand(Callee);
+    EmitToStreamer(OutStreamer, BLR);
+
+    return;
+  }
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI);
+  }
+
+  // Finally, do the automated lowerings for everything else.
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmPrinter() {
+  RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
+
+  RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp
new file mode 100644
index 0000000..484e7e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -0,0 +1,510 @@
+//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-branch-relax"
+
+static cl::opt<bool>
+BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true),
+                 cl::desc("Relax out of range conditional branches"));
+
+static cl::opt<unsigned>
+TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
+                    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
+                    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
+                    cl::desc("Restrict range of Bcc instructions (DEBUG)"));
+
+STATISTIC(NumSplit, "Number of basic blocks split");
+STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
+
+namespace {
+class AArch64BranchRelaxation : public MachineFunctionPass {
+  /// BasicBlockInfo - Information about the offset and size of a single
+  /// basic block.
+  struct BasicBlockInfo {
+    /// Offset - Distance from the beginning of the function to the beginning
+    /// of this basic block.
+    ///
+    /// The offset is always aligned as required by the basic block.
+    unsigned Offset;
+
+    /// Size - Size of the basic block in bytes.  If the block contains
+    /// inline assembly, this is a worst case estimate.
+    ///
+    /// The size does not include any alignment padding whether from the
+    /// beginning of the block, or from an aligned jump table at the end.
+    unsigned Size;
+
+    BasicBlockInfo() : Offset(0), Size(0) {}
+
+    /// Compute the offset immediately following this block.  If LogAlign is
+    /// specified, return the offset the successor block will get if it has
+    /// this alignment.
+    unsigned postOffset(unsigned LogAlign = 0) const {
+      unsigned PO = Offset + Size;
+      unsigned Align = 1 << LogAlign;
+      return (PO + Align - 1) / Align * Align;
+    }
+  };
+
+  SmallVector<BasicBlockInfo, 16> BlockInfo;
+
+  MachineFunction *MF;
+  const AArch64InstrInfo *TII;
+
+  bool relaxBranchInstructions();
+  void scanFunction();
+  MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+  void adjustBlockOffsets(MachineBasicBlock &MBB);
+  bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+  bool fixupConditionalBranch(MachineInstr *MI);
+  void computeBlockSize(const MachineBasicBlock &MBB);
+  unsigned getInstrOffset(MachineInstr *MI) const;
+  void dumpBBs();
+  void verify();
+
+public:
+  static char ID;
+  AArch64BranchRelaxation() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AArch64 branch relaxation pass";
+  }
+};
+char AArch64BranchRelaxation::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void AArch64BranchRelaxation::verify() {
+#ifndef NDEBUG
+  unsigned PrevNum = MF->begin()->getNumber();
+  for (MachineBasicBlock &MBB : *MF) {
+    unsigned Align = MBB.getAlignment();
+    unsigned Num = MBB.getNumber();
+    assert(BlockInfo[Num].Offset % (1u << Align) == 0);
+    assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset);
+    PrevNum = Num;
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void AArch64BranchRelaxation::dumpBBs() {
+  for (auto &MBB : *MF) {
+    const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
+    dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
+           << format("size=%#x\n", BBI.Size);
+  }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  MachineBasicBlock *NextBB = std::next(MBBI);
+  if (NextBB == MBB->getParent()->end())
+    return false;
+
+  for (MachineBasicBlock *S : MBB->successors()) 
+    if (S == NextBB)
+      return true;
+
+  return false;
+}
+
+/// scanFunction - Do the initial scan of the function, building up
+/// information about each block.
+void AArch64BranchRelaxation::scanFunction() {
+  BlockInfo.clear();
+  BlockInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineBasicBlock &MBB : *MF)
+    computeBlockSize(MBB);
+
+  // Compute block offsets and known bits.
+  adjustBlockOffsets(*MF->begin());
+}
+
+/// computeBlockSize - Compute the size for MBB.
+/// This function updates BlockInfo directly.
+void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) {
+  unsigned Size = 0;
+  for (const MachineInstr &MI : MBB)
+    Size += TII->GetInstSizeInBytes(&MI);
+  BlockInfo[MBB.getNumber()].Size = Size;
+}
+
+/// getInstrOffset - Return the current offset of the specified machine
+/// instruction from the start of the function.  This offset changes as stuff is
+/// moved around inside the function.
+unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BlockInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
+  unsigned PrevNum = Start.getNumber();
+  for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) {
+    unsigned Num = MBB.getNumber();
+    if (!Num) // block zero is never changed from offset zero.
+      continue;
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned LogAlign = MBB.getAlignment();
+    BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign);
+    PrevNum = Num;
+  }
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+/// NOTE: Successor list of the original BB is out of date after this function,
+/// and must be updated by the caller! Other transforms follow using this
+/// utility function, so no point updating now rather than waiting.
+MachineBasicBlock *
+AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+      MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB;
+  ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB);
+
+  // Insert an entry into BlockInfo to align it properly with the block numbers.
+  BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(*OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(*NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBlockOffsets(*OrigBB);
+
+  ++NumSplit;
+
+  return NewBB;
+}
+
+/// isBlockInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI,
+                                             MachineBasicBlock *DestBB,
+                                             unsigned Bits) {
+  unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2;
+  unsigned BrOffset = getInstrOffset(MI);
+  unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxOffs << " from " << getInstrOffset(MI)
+               << " to " << DestOffset << " offset "
+               << int(DestOffset - BrOffset) << "\t" << *MI);
+
+  // Branch before the Dest.
+  if (BrOffset <= DestOffset)
+    return (DestOffset - BrOffset <= MaxOffs);
+  return (BrOffset - DestOffset <= MaxOffs);
+}
+
+static bool isConditionalBranch(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::TBZW:
+  case AArch64::TBNZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZX:
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+  case AArch64::Bcc:
+    return true;
+  }
+}
+
+static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("unexpected opcode!");
+  case AArch64::TBZW:
+  case AArch64::TBNZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZX:
+    return MI->getOperand(2).getMBB();
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+  case AArch64::Bcc:
+    return MI->getOperand(1).getMBB();
+  }
+}
+
+static unsigned getOppositeConditionOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("unexpected opcode!");
+  case AArch64::TBNZW:   return AArch64::TBZW;
+  case AArch64::TBNZX:   return AArch64::TBZX;
+  case AArch64::TBZW:    return AArch64::TBNZW;
+  case AArch64::TBZX:    return AArch64::TBNZX;
+  case AArch64::CBNZW:   return AArch64::CBZW;
+  case AArch64::CBNZX:   return AArch64::CBZX;
+  case AArch64::CBZW:    return AArch64::CBNZW;
+  case AArch64::CBZX:    return AArch64::CBNZX;
+  case AArch64::Bcc:     return AArch64::Bcc; // Condition is an operand for Bcc.
+  }
+}
+
+static unsigned getBranchDisplacementBits(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("unexpected opcode!");
+  case AArch64::TBNZW:
+  case AArch64::TBZW:
+  case AArch64::TBNZX:
+  case AArch64::TBZX:
+    return TBZDisplacementBits;
+  case AArch64::CBNZW:
+  case AArch64::CBZW:
+  case AArch64::CBNZX:
+  case AArch64::CBZX:
+    return CBZDisplacementBits;
+  case AArch64::Bcc:
+    return BCCDisplacementBits;
+  }
+}
+
+static inline void invertBccCondition(MachineInstr *MI) {
+  assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!");
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm();
+  CC = AArch64CC::getInvertedCondCode(CC);
+  MI->getOperand(0).setImm((int64_t)CC);
+}
+
+/// fixupConditionalBranch - Fix up a conditional branch whose destination is
+/// too far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
+  MachineBasicBlock *DestBB = getDestBlock(MI);
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // tbz L1
+  // =>
+  // tbnz L2
+  // b   L1
+  // L2:
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  if (BMI != MI) {
+    if (std::next(MachineBasicBlock::iterator(MI)) ==
+            std::prev(MBB->getLastNonDebugInstr()) &&
+        BMI->getOpcode() == AArch64::B) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBlockInRange(MI, NewDest,
+                         getBranchDisplacementBits(MI->getOpcode()))) {
+        DEBUG(dbgs() << "  Invert condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        unsigned OpNum = (MI->getOpcode() == AArch64::TBZW ||
+                          MI->getOpcode() == AArch64::TBNZW ||
+                          MI->getOpcode() == AArch64::TBZX ||
+                          MI->getOpcode() == AArch64::TBNZX)
+                             ? 2
+                             : 1;
+        MI->getOperand(OpNum).setMBB(NewDest);
+        MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode())));
+        if (MI->getOpcode() == AArch64::Bcc)
+          invertBccCondition(MI);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    // Analyze the branch so we know how to update the successor lists.
+    MachineBasicBlock *TBB, *FBB;
+    SmallVector<MachineOperand, 2> Cond;
+    TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
+
+    MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BlockInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
+
+    // Update the successor lists according to the transformation to follow.
+    // Do it here since if there's no split, no update is needed.
+    MBB->replaceSuccessor(FBB, NewBB);
+    NewBB->addSuccessor(FBB);
+  }
+  MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << ", invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  MachineInstrBuilder MIB = BuildMI(
+      MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode())))
+                                .addOperand(MI->getOperand(0));
+  if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW ||
+      MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX)
+    MIB.addOperand(MI->getOperand(1));
+  if (MI->getOpcode() == AArch64::Bcc)
+    invertBccCondition(MIB);
+  MIB.addMBB(NextBB);
+  BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB);
+  BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+
+  // Finally, keep the block offsets up to date.
+  adjustBlockOffsets(*MBB);
+  return true;
+}
+
+bool AArch64BranchRelaxation::relaxBranchInstructions() {
+  bool Changed = false;
+  // Relaxing branches involves creating new basic blocks, so re-eval
+  // end() for termination.
+  for (auto &MBB : *MF) {
+    MachineInstr *MI = MBB.getFirstTerminator();
+    if (isConditionalBranch(MI->getOpcode()) &&
+        !isBlockInRange(MI, getDestBlock(MI),
+                        getBranchDisplacementBits(MI->getOpcode()))) {
+      fixupConditionalBranch(MI);
+      ++NumRelaxed;
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+
+  // If the pass is disabled, just bail early.
+  if (!BranchRelaxation)
+    return false;
+
+  DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
+
+  TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block.
+  scanFunction();
+
+  DEBUG(dbgs() << "  Basic blocks before relaxation\n");
+  DEBUG(dumpBBs());
+
+  bool MadeChange = false;
+  while (relaxBranchInstructions())
+    MadeChange = true;
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify();
+
+  DEBUG(dbgs() << "  Basic blocks after relaxation\n");
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BlockInfo.clear();
+
+  return MadeChange;
+}
+
+/// createAArch64BranchRelaxation - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createAArch64BranchRelaxation() {
+  return new AArch64BranchRelaxation();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
new file mode 100644
index 0000000..1fe5138
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -0,0 +1,242 @@
+//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for AArch64 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A> :
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+/// CCIfBigEndian - Match only if we're in big endian mode.
+class CCIfBigEndian<CCAction A> :
+  CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_AArch64_AAPCS : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+                         CCBitConvertToType<f128>>>,
+
+  // An SRet is passed in X8, not X0 like a normal pointer parameter.
+  CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+  // up to eight each of GPR and FPR.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  // i128 is split to two i64s, we can't fit half to register X7.
+  CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
+                                                    [X0, X1, X3, X5]>>>,
+
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+           CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                   [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+           CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  // If more than will fit in registers, pass them on the stack instead.
+  CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+  CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+           CCAssignToStack<16, 16>>
+]>;
+
+def RetCC_AArch64_AAPCS : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+                         CCBitConvertToType<f128>>>,
+
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+      CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                              [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+      CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+
+// Darwin uses a calling convention which differs in only two ways
+// from the standard one at this level:
+//     + i128s (i.e. split i64s) don't need even registers.
+//     + Stack slots are sized as needed rather than being at least 64-bit.
+def CC_AArch64_DarwinPCS : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+  // An SRet is passed in X8, not X0 like a normal pointer parameter.
+  CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+  // up to eight each of GPR and FPR.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  // i128 is split to two i64s, we can't fit half to register X7.
+  CCIfType<[i64],
+           CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
+                                             [W0, W1, W2, W3, W4, W5, W6]>>>,
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+           CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                   [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+           CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  // If more than will fit in registers, pass them on the stack instead.
+  CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
+  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+]>;
+
+def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+  // Handle all scalar types as either i64 or f64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+  CCIfType<[f16, f32],     CCPromoteToType<f64>>,
+
+  // Everything is on the stack.
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],   CCAssignToStack<16, 16>>
+]>;
+
+// The WebKit_JS calling convention only passes the first argument (the callee)
+// in register and the remaining arguments on stack. We allow 32bit stack slots,
+// so that WebKit can write partial values in the stack and define the other
+// 32bit quantity as undef.
+def CC_AArch64_WebKit_JS : CallingConv<[
+  // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
+
+  // Pass the remaining arguments on the stack instead.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_AArch64_WebKit_JS : CallingConv<[
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
+// presumably a callee to someone. External functions may not do so, but this
+// is currently safe since BL has LR as an implicit-def and what happens after a
+// tail call doesn't matter.
+//
+// It would be better to model its preservation semantics properly (create a
+// vreg on entry, use it in RET & tail call generation; make that vreg def if we
+// end up saving LR as part of a call frame). Watch this space...
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+                                           X23, X24, X25, X26, X27, X28,
+                                           D8,  D9,  D10, D11,
+                                           D12, D13, D14, D15)>;
+
+// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
+// 'this' and the pointer return value are both passed in X0 in these cases,
+// this can be partially modelled by treating X0 as a callee-saved register;
+// only the resulting RegMask is used; the SaveList is ignored
+//
+// (For generic ARM 64-bit ABI code, clang will not generate constructors or
+// destructors with 'this' returns, so this RegMask will not be used in that
+// case)
+def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. x16 and x17 are used on the
+// fast path for calculation, but other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_AArch64_TLS_Darwin
+    : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+                           FP,
+                           (sequence "Q%u", 0, 31))>;
+
+// The ELF stub used for TLS-descriptor access saves every feasible
+// register. Only X0 and LR are clobbered.
+def CSR_AArch64_TLS_ELF
+    : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
+                           (sequence "Q%u", 0, 31))>;
+
+def CSR_AArch64_AllRegs
+    : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
+                           (sequence "X%u", 0, 28), FP, LR, SP,
+                           (sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
+                           (sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
+                           (sequence "Q%u", 0, 31))>;
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
new file mode 100644
index 0000000..4d23dc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -0,0 +1,147 @@
+//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Local-dynamic access to thread-local variables proceeds in three stages.
+//
+// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated
+//    in much the same way as a general-dynamic TLS-descriptor access against
+//    the special symbol _TLS_MODULE_BASE.
+// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
+//    instructions with "dtprel" modifiers.
+// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
+//    true address.
+//
+// This is only better than general-dynamic access to the variable if two or
+// more of the first stage TLS-descriptor calculations can be combined. This
+// pass looks through a function and performs such combinations.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+struct LDTLSCleanup : public MachineFunctionPass {
+  static char ID;
+  LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
+      // No point folding accesses if there isn't at least two.
+      return false;
+    }
+
+    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+    return VisitNode(DT->getRootNode(), 0);
+  }
+
+  // Visit the dominator subtree rooted at Node in pre-order.
+  // If TLSBaseAddrReg is non-null, then use that to replace any
+  // TLS_base_addr instructions. Otherwise, create the register
+  // when the first such instruction is seen, and then use it
+  // as we encounter more instructions.
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+    MachineBasicBlock *BB = Node->getBlock();
+    bool Changed = false;
+
+    // Traverse the current block.
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
+      switch (I->getOpcode()) {
+      case AArch64::TLSDESC_BLR:
+        // Make sure it's a local dynamic access.
+        if (!I->getOperand(1).isSymbol() ||
+            strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+          break;
+
+        if (TLSBaseAddrReg)
+          I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+        else
+          I = setRegister(I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
+      }
+    }
+
+    // Visit the children of this block in the dominator tree.
+    for (MachineDomTreeNode *N : *Node) {
+      Changed |= VisitNode(N, TLSBaseAddrReg);
+    }
+
+    return Changed;
+  }
+
+  // Replace the TLS_base_addr instruction I with a copy from
+  // TLSBaseAddrReg, returning the new instruction.
+  MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
+                                       unsigned TLSBaseAddrReg) {
+    MachineFunction *MF = I->getParent()->getParent();
+    const AArch64TargetMachine *TM =
+        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+    const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+    // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+    // code sequence assumes the address will be.
+    MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 AArch64::X0).addReg(TLSBaseAddrReg);
+
+    // Erase the TLS_base_addr instruction.
+    I->eraseFromParent();
+
+    return Copy;
+  }
+
+  // Create a virtal register in *TLSBaseAddrReg, and populate it by
+  // inserting a copy instruction after I. Returns the new instruction.
+  MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+    MachineFunction *MF = I->getParent()->getParent();
+    const AArch64TargetMachine *TM =
+        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+    const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+    // Create a virtual register for the TLS base address.
+    MachineRegisterInfo &RegInfo = MF->getRegInfo();
+    *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
+
+    // Insert a copy from X0 to TLSBaseAddrReg for later.
+    MachineInstr *Next = I->getNextNode();
+    MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 *TLSBaseAddrReg).addReg(AArch64::X0);
+
+    return Copy;
+  }
+
+  const char *getPassName() const override {
+    return "Local Dynamic TLS Access Clean-up";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
+  return new LDTLSCleanup();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
new file mode 100644
index 0000000..6b1f096
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -0,0 +1,1117 @@
+//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that collect the Linker Optimization Hint (LOH).
+// This pass should be run at the very end of the compilation flow, just before
+// assembly printer.
+// To be useful for the linker, the LOH must be printed into the assembly file.
+//
+// A LOH describes a sequence of instructions that may be optimized by the
+// linker.
+// This same sequence cannot be optimized by the compiler because some of
+// the information will be known at link time.
+// For instance, consider the following sequence:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: ldr xC, [xB, #imm]
+// This sequence can be turned into:
+// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB:
+//     L3: ldr xC, sym+#imm
+// It may also be turned into either the following more efficient
+// code sequences:
+// - If sym@PAGEOFF + #imm fits the encoding space of L3.
+//     L1: adrp xA, sym@PAGE
+//     L3: ldr xC, [xB, sym@PAGEOFF + #imm]
+// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB:
+//     L1: adr xA, sym
+//     L3: ldr xC, [xB, #imm]
+//
+// To be valid a LOH must meet all the requirements needed by all the related
+// possible linker transformations.
+// For instance, using the running example, the constraints to emit
+// ".loh AdrpAddLdr" are:
+// - L1, L2, and L3 instructions are of the expected type, i.e.,
+//   respectively ADRP, ADD (immediate), and LD.
+// - The result of L1 is used only by L2.
+// - The register argument (xA) used in the ADD instruction is defined
+//   only by L1.
+// - The result of L2 is used only by L3.
+// - The base address (xB) in L3 is defined only L2.
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+//   @PAGE/@PAGEOFF with no additional constants
+//
+// Currently supported LOHs are:
+// * So called non-ADRP-related:
+//   - .loh AdrpAddLdr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdrGotLdr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdr L1, L3:
+//     L1: adrp xA, sym@PAGE
+//     L3: ldr xC, [xA, sym@PAGEOFF]
+//   - .loh AdrpAddStr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpLdrGotStr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpAdd L1, L2:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//   For all these LOHs, L1, L2, L3 form a simple chain:
+//   L1 result is used only by L2 and L2 result by L3.
+//   L3 LOH-related argument is defined only by L2 and L2 LOH-related argument
+//   by L1.
+// All these LOHs aim at using more efficient load/store patterns by folding
+// some instructions used to compute the address directly into the load/store.
+//
+// * So called ADRP-related:
+//  - .loh AdrpAdrp L2, L1:
+//    L2: ADRP xA, sym1@PAGE
+//    L1: ADRP xA, sym2@PAGE
+//    L2 dominates L1 and xA is not redifined between L2 and L1
+// This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it:
+//     1. Associates them a label.
+//     2. Emits them in a MCStreamer (EmitLOHDirective).
+//         - The MCMachOStreamer records them into the MCAssembler.
+//         - The MCAsmStreamer prints them.
+//         - Other MCStreamers ignore them.
+//     3. Closes the MCStreamer:
+//         - The MachObjectWriter gets them from the MCAssembler and writes
+//           them in the object file.
+//         - Other ObjectWriters ignore them.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-collect-loh"
+
+static cl::opt<bool>
+PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
+                   cl::desc("Restrict analysis to registers invovled"
+                            " in LOHs"),
+                   cl::init(true));
+
+static cl::opt<bool>
+BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
+                    cl::desc("Restrict analysis at basic block scope"),
+                    cl::init(true));
+
+STATISTIC(NumADRPSimpleCandidate,
+          "Number of simplifiable ADRP dominate by another");
+STATISTIC(NumADRPComplexCandidate2,
+          "Number of simplifiable ADRP reachable by 2 defs");
+STATISTIC(NumADRPComplexCandidate3,
+          "Number of simplifiable ADRP reachable by 3 defs");
+STATISTIC(NumADRPComplexCandidateOther,
+          "Number of simplifiable ADRP reachable by 4 or more defs");
+STATISTIC(NumADDToSTRWithImm,
+          "Number of simplifiable STR with imm reachable by ADD");
+STATISTIC(NumLDRToSTRWithImm,
+          "Number of simplifiable STR with imm reachable by LDR");
+STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
+STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
+STATISTIC(NumADDToLDRWithImm,
+          "Number of simplifiable LDR with imm reachable by ADD");
+STATISTIC(NumLDRToLDRWithImm,
+          "Number of simplifiable LDR with imm reachable by LDR");
+STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
+STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
+STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
+STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
+STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
+STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
+STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
+STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
+STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
+
+namespace llvm {
+void initializeAArch64CollectLOHPass(PassRegistry &);
+}
+
+namespace {
+struct AArch64CollectLOH : public MachineFunctionPass {
+  static char ID;
+  AArch64CollectLOH() : MachineFunctionPass(ID) {
+    initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AArch64 Collect Linker Optimization Hint (LOH)";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+  }
+
+private:
+};
+
+/// A set of MachineInstruction.
+typedef SetVector<const MachineInstr *> SetOfMachineInstr;
+/// Map a basic block to a set of instructions per register.
+/// This is used to represent the exposed uses of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+BlockToSetOfInstrsPerColor;
+/// Map a basic block to an instruction per register.
+/// This is used to represent the live-out definitions of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+BlockToInstrPerColor;
+/// Map an instruction to a set of instructions. Used to represent the
+/// mapping def to reachable uses or use to definitions.
+typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs;
+/// Map a basic block to a BitVector.
+/// This is used to record the kill registers per basic block.
+typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet;
+
+/// Map a register to a dense id.
+typedef DenseMap<unsigned, unsigned> MapRegToId;
+/// Map a dense id to a register. Used for debug purposes.
+typedef SmallVector<unsigned, 32> MapIdToReg;
+} // end anonymous namespace.
+
+char AArch64CollectLOH::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
+                      "AArch64 Collect Linker Optimization Hint (LOH)", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
+                    "AArch64 Collect Linker Optimization Hint (LOH)", false,
+                    false)
+
+/// Given a couple (MBB, reg) get the corresponding set of instruction from
+/// the given "sets".
+/// If this couple does not reference any set, an empty set is added to "sets"
+/// for this couple and returned.
+/// \param nbRegs is used internally allocate some memory. It must be consistent
+/// with the way sets is used.
+static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
+                                 const MachineBasicBlock &MBB, unsigned reg,
+                                 unsigned nbRegs) {
+  SetOfMachineInstr *result;
+  BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
+  if (it != sets.end())
+    result = it->second;
+  else
+    result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
+
+  return result[reg];
+}
+
+/// Given a couple (reg, MI) get the corresponding set of instructions from the
+/// the given "sets".
+/// This is used to get the uses record in sets of a definition identified by
+/// MI and reg, i.e., MI defines reg.
+/// If the couple does not reference anything, an empty set is added to
+/// "sets[reg]".
+/// \pre set[reg] is valid.
+static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
+                                  const MachineInstr &MI) {
+  return sets[reg][&MI];
+}
+
+/// Same as getUses but does not modify the input map: sets.
+/// \return NULL if the couple (reg, MI) is not in sets.
+static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
+                                        const MachineInstr &MI) {
+  InstrToInstrs::const_iterator Res = sets[reg].find(&MI);
+  if (Res != sets[reg].end())
+    return &(Res->second);
+  return nullptr;
+}
+
+/// Initialize the reaching definition algorithm:
+/// For each basic block BB in MF, record:
+/// - its kill set.
+/// - its reachable uses (uses that are exposed to BB's predecessors).
+/// - its the generated definitions.
+/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to
+/// the list of uses of exposed defintions.
+/// \param ADRPMode specifies to only consider ADRP instructions for generated
+/// definition. It also consider definitions of ADRP instructions as uses and
+/// ignore other uses. The ADRPMode is used to collect the information for LHO
+/// that involve ADRP operation only.
+static void initReachingDef(MachineFunction &MF,
+                            InstrToInstrs *ColorOpToReachedUses,
+                            BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                            BlockToSetOfInstrsPerColor &ReachableUses,
+                            const MapRegToId &RegToId,
+                            const MachineInstr *DummyOp, bool ADRPMode) {
+  const TargetMachine &TM = MF.getTarget();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+  unsigned NbReg = RegToId.size();
+
+  for (MachineBasicBlock &MBB : MF) {
+    const MachineInstr **&BBGen = Gen[&MBB];
+    BBGen = new const MachineInstr *[NbReg];
+    memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+
+    BitVector &BBKillSet = Kill[&MBB];
+    BBKillSet.resize(NbReg);
+    for (const MachineInstr &MI : MBB) {
+      bool IsADRP = MI.getOpcode() == AArch64::ADRP;
+
+      // Process uses first.
+      if (IsADRP || !ADRPMode)
+        for (const MachineOperand &MO : MI.operands()) {
+          // Treat ADRP def as use, as the goal of the analysis is to find
+          // ADRP defs reached by other ADRP defs.
+          if (!MO.isReg() || (!ADRPMode && !MO.isUse()) ||
+              (ADRPMode && (!IsADRP || !MO.isDef())))
+            continue;
+          unsigned CurReg = MO.getReg();
+          MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+          if (ItCurRegId == RegToId.end())
+            continue;
+          CurReg = ItCurRegId->second;
+
+          // if CurReg has not been defined, this use is reachable.
+          if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
+            getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI);
+          // current basic block definition for this color, if any, is in Gen.
+          if (BBGen[CurReg])
+            getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI);
+        }
+
+      // Process clobbers.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isRegMask())
+          continue;
+        // Clobbers kill the related colors.
+        const uint32_t *PreservedRegs = MO.getRegMask();
+
+        // Set generated regs.
+        for (const auto Entry : RegToId) {
+          unsigned Reg = Entry.second;
+          // Use the global register ID when querying APIs external to this
+          // pass.
+          if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) {
+            // Do not register clobbered definition for no ADRP.
+            // This definition is not used anyway (otherwise register
+            // allocation is wrong).
+            BBGen[Reg] = ADRPMode ? &MI : nullptr;
+            BBKillSet.set(Reg);
+          }
+        }
+      }
+
+      // Process register defs.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        unsigned CurReg = MO.getReg();
+        MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+        if (ItCurRegId == RegToId.end())
+          continue;
+
+        for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
+          MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
+          assert(ItRegId != RegToId.end() &&
+                 "Sub-register of an "
+                 "involved register, not recorded as involved!");
+          BBKillSet.set(ItRegId->second);
+          BBGen[ItRegId->second] = &MI;
+        }
+        BBGen[ItCurRegId->second] = &MI;
+      }
+    }
+
+    // If we restrict our analysis to basic block scope, conservatively add a
+    // dummy
+    // use for each generated value.
+    if (!ADRPMode && DummyOp && !MBB.succ_empty())
+      for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
+        if (BBGen[CurReg])
+          getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp);
+  }
+}
+
+/// Reaching def core algorithm:
+/// while an Out has changed
+///    for each bb
+///       for each color
+///           In[bb][color] = U Out[bb.predecessors][color]
+///           insert reachableUses[bb][color] in each in[bb][color]
+///                 op.reachedUses
+///
+///           Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+static void reachingDefAlgorithm(MachineFunction &MF,
+                                 InstrToInstrs *ColorOpToReachedUses,
+                                 BlockToSetOfInstrsPerColor &In,
+                                 BlockToSetOfInstrsPerColor &Out,
+                                 BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                                 BlockToSetOfInstrsPerColor &ReachableUses,
+                                 unsigned NbReg) {
+  bool HasChanged;
+  do {
+    HasChanged = false;
+    for (MachineBasicBlock &MBB : MF) {
+      unsigned CurReg;
+      for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+        SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg);
+        SetOfMachineInstr &BBReachableUses =
+            getSet(ReachableUses, MBB, CurReg, NbReg);
+        SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg);
+        unsigned Size = BBOutSet.size();
+        //   In[bb][color] = U Out[bb.predecessors][color]
+        for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
+          SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg);
+          BBInSet.insert(PredOutSet.begin(), PredOutSet.end());
+        }
+        //   insert reachableUses[bb][color] in each in[bb][color] op.reachedses
+        for (const MachineInstr *MI : BBInSet) {
+          SetOfMachineInstr &OpReachedUses =
+              getUses(ColorOpToReachedUses, CurReg, *MI);
+          OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end());
+        }
+        //           Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+        if (!Kill[&MBB].test(CurReg))
+          BBOutSet.insert(BBInSet.begin(), BBInSet.end());
+        if (Gen[&MBB][CurReg])
+          BBOutSet.insert(Gen[&MBB][CurReg]);
+        HasChanged |= BBOutSet.size() != Size;
+      }
+    }
+  } while (HasChanged);
+}
+
+/// Release all memory dynamically allocated during the reaching
+/// definition algorithm.
+static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
+                             BlockToSetOfInstrsPerColor &Out,
+                             BlockToInstrPerColor &Gen,
+                             BlockToSetOfInstrsPerColor &ReachableUses) {
+  for (auto &IT : Out)
+    delete[] IT.second;
+  for (auto &IT : In)
+    delete[] IT.second;
+  for (auto &IT : ReachableUses)
+    delete[] IT.second;
+  for (auto &IT : Gen)
+    delete[] IT.second;
+}
+
+/// Reaching definition algorithm.
+/// \param MF function on which the algorithm will operate.
+/// \param[out] ColorOpToReachedUses will contain the result of the reaching
+/// def algorithm.
+/// \param ADRPMode specify whether the reaching def algorithm should be tuned
+/// for ADRP optimization. \see initReachingDef for more details.
+/// \param DummyOp if not NULL, the algorithm will work at
+/// basic block scope and will set for every exposed definition a use to
+/// @p DummyOp.
+/// \pre ColorOpToReachedUses is an array of at least number of registers of
+/// InstrToInstrs.
+static void reachingDef(MachineFunction &MF,
+                        InstrToInstrs *ColorOpToReachedUses,
+                        const MapRegToId &RegToId, bool ADRPMode = false,
+                        const MachineInstr *DummyOp = nullptr) {
+  // structures:
+  // For each basic block.
+  // Out: a set per color of definitions that reach the
+  //      out boundary of this block.
+  // In: Same as Out but for in boundary.
+  // Gen: generated color in this block (one operation per color).
+  // Kill: register set of killed color in this block.
+  // ReachableUses: a set per color of uses (operation) reachable
+  //                for "In" definitions.
+  BlockToSetOfInstrsPerColor Out, In, ReachableUses;
+  BlockToInstrPerColor Gen;
+  BlockToRegSet Kill;
+
+  // Initialize Gen, kill and reachableUses.
+  initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId,
+                  DummyOp, ADRPMode);
+
+  // Algo.
+  if (!DummyOp)
+    reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
+                         ReachableUses, RegToId.size());
+
+  // finit.
+  finitReachingDef(In, Out, Gen, ReachableUses);
+}
+
+#ifndef NDEBUG
+/// print the result of the reaching definition algorithm.
+static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
+                             unsigned NbReg, const TargetRegisterInfo *TRI,
+                             const MapIdToReg &IdToReg) {
+  unsigned CurReg;
+  for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+    if (ColorOpToReachedUses[CurReg].empty())
+      continue;
+    DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n");
+
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      DEBUG(dbgs() << "Def:\n");
+      DEBUG(DefsIt.first->print(dbgs()));
+      DEBUG(dbgs() << "Reachable uses:\n");
+      for (const MachineInstr *MI : DefsIt.second) {
+        DEBUG(MI->print(dbgs()));
+      }
+    }
+  }
+}
+#endif // NDEBUG
+
+/// Answer the following question: Can Def be one of the definition
+/// involved in a part of a LOH?
+static bool canDefBePartOfLOH(const MachineInstr *Def) {
+  unsigned Opc = Def->getOpcode();
+  // Accept ADRP, ADDLow and LOADGot.
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADRP:
+    return true;
+  case AArch64::ADDXri:
+    // Check immediate to see if the immediate is an address.
+    switch (Def->getOperand(2).getType()) {
+    default:
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_JumpTableIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_BlockAddress:
+      return true;
+    }
+  case AArch64::LDRXui:
+    // Check immediate to see if the immediate is an address.
+    switch (Def->getOperand(2).getType()) {
+    default:
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+      return true;
+    }
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction can the end of a LOH chain involving a
+/// store.
+static bool isCandidateStore(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    // In case we have str xA, [xA, #imm], this is two different uses
+    // of xA and we cannot fold, otherwise the xA stored may be wrong,
+    // even if #imm == 0.
+    if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg())
+      return true;
+  }
+  return false;
+}
+
+/// Given the result of a reaching definition algorithm in ColorOpToReachedUses,
+/// Build the Use to Defs information and filter out obvious non-LOH candidates.
+/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
+/// In non-ADRPMode, non-LOH candidates are "uses" with several definition,
+/// i.e., no simple chain.
+/// \param ADRPMode -- \see initReachingDef.
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
+                              const InstrToInstrs *ColorOpToReachedUses,
+                              const MapRegToId &RegToId,
+                              bool ADRPMode = false) {
+
+  SetOfMachineInstr NotCandidate;
+  unsigned NbReg = RegToId.size();
+  MapRegToId::const_iterator EndIt = RegToId.end();
+  for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) {
+    // If this color is never defined, continue.
+    if (ColorOpToReachedUses[CurReg].empty())
+      continue;
+
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      for (const MachineInstr *MI : DefsIt.second) {
+        const MachineInstr *Def = DefsIt.first;
+        MapRegToId::const_iterator It;
+        // if all the reaching defs are not adrp, this use will not be
+        // simplifiable.
+        if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) ||
+            (!ADRPMode && !canDefBePartOfLOH(Def)) ||
+            (!ADRPMode && isCandidateStore(MI) &&
+             // store are LOH candidate iff the end of the chain is used as
+             // base.
+             ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt ||
+              It->second != CurReg))) {
+          NotCandidate.insert(MI);
+          continue;
+        }
+        // Do not consider self reaching as a simplifiable case for ADRP.
+        if (!ADRPMode || MI != DefsIt.first) {
+          UseToReachingDefs[MI].insert(DefsIt.first);
+          // If UsesIt has several reaching definitions, it is not
+          // candidate for simplificaton in non-ADRPMode.
+          if (!ADRPMode && UseToReachingDefs[MI].size() > 1)
+            NotCandidate.insert(MI);
+        }
+      }
+    }
+  }
+  for (const MachineInstr *Elem : NotCandidate) {
+    DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
+    // It would have been better if we could just remove the entry
+    // from the map.  Because of that, we have to filter the garbage
+    // (second.empty) in the subsequence analysis.
+    UseToReachingDefs[Elem].clear();
+  }
+}
+
+/// Based on the use to defs information (in ADRPMode), compute the
+/// opportunities of LOH ADRP-related.
+static void computeADRP(const InstrToInstrs &UseToDefs,
+                        AArch64FunctionInfo &AArch64FI,
+                        const MachineDominatorTree *MDT) {
+  DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
+  for (const auto &Entry : UseToDefs) {
+    unsigned Size = Entry.second.size();
+    if (Size == 0)
+      continue;
+    if (Size == 1) {
+      const MachineInstr *L2 = *Entry.second.begin();
+      const MachineInstr *L1 = Entry.first;
+      if (!MDT->dominates(L2, L1)) {
+        DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
+                     << '\n');
+        continue;
+      }
+      DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
+      SmallVector<const MachineInstr *, 2> Args;
+      Args.push_back(L2);
+      Args.push_back(L1);
+      AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+      ++NumADRPSimpleCandidate;
+    }
+#ifdef DEBUG
+    else if (Size == 2)
+      ++NumADRPComplexCandidate2;
+    else if (Size == 3)
+      ++NumADRPComplexCandidate3;
+    else
+      ++NumADRPComplexCandidateOther;
+#endif
+    // if Size < 1, the use should have been removed from the candidates
+    assert(Size >= 1 && "No reaching defs for that use!");
+  }
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a load.
+static bool isCandidateLoad(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)
+      return false;
+    return true;
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction can load a litteral.
+static bool supportLoadFromLiteral(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDRSWui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    return true;
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction is a LOH candidate.
+/// \param UseToDefs is used to check that Instr is at the end of LOH supported
+/// chain.
+/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are
+/// already been filtered out.
+static bool isCandidate(const MachineInstr *Instr,
+                        const InstrToInstrs &UseToDefs,
+                        const MachineDominatorTree *MDT) {
+  if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
+    return false;
+
+  const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
+  if (Def->getOpcode() != AArch64::ADRP) {
+    // At this point, Def is ADDXri or LDRXui of the right type of
+    // symbol, because we filtered out the uses that were not defined
+    // by these kind of instructions (+ ADRP).
+
+    // Check if this forms a simple chain: each intermediate node must
+    // dominates the next one.
+    if (!MDT->dominates(Def, Instr))
+      return false;
+    // Move one node up in the simple chain.
+    if (UseToDefs.find(Def) ==
+            UseToDefs.end()
+            // The map may contain garbage we have to ignore.
+        ||
+        UseToDefs.find(Def)->second.empty())
+      return false;
+    Instr = Def;
+    Def = *UseToDefs.find(Def)->second.begin();
+  }
+  // Check if we reached the top of the simple chain:
+  // - top is ADRP.
+  // - check the simple chain property: each intermediate node must
+  // dominates the next one.
+  if (Def->getOpcode() == AArch64::ADRP)
+    return MDT->dominates(Def, Instr);
+  return false;
+}
+
+static bool registerADRCandidate(const MachineInstr &Use,
+                                 const InstrToInstrs &UseToDefs,
+                                 const InstrToInstrs *DefsPerColorToUses,
+                                 AArch64FunctionInfo &AArch64FI,
+                                 SetOfMachineInstr *InvolvedInLOHs,
+                                 const MapRegToId &RegToId) {
+  // Look for opportunities to turn ADRP -> ADD or
+  // ADRP -> LDR GOTPAGEOFF into ADR.
+  // If ADRP has more than one use. Give up.
+  if (Use.getOpcode() != AArch64::ADDXri &&
+      (Use.getOpcode() != AArch64::LDRXui ||
+       !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT)))
+    return false;
+  InstrToInstrs::const_iterator It = UseToDefs.find(&Use);
+  // The map may contain garbage that we need to ignore.
+  if (It == UseToDefs.end() || It->second.empty())
+    return false;
+  const MachineInstr &Def = **It->second.begin();
+  if (Def.getOpcode() != AArch64::ADRP)
+    return false;
+  // Check the number of users of ADRP.
+  const SetOfMachineInstr *Users =
+      getUses(DefsPerColorToUses,
+              RegToId.find(Def.getOperand(0).getReg())->second, Def);
+  if (Users->size() > 1) {
+    ++NumADRComplexCandidate;
+    return false;
+  }
+  ++NumADRSimpleCandidate;
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) &&
+         "ADRP already involved in LOH.");
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) &&
+         "ADD already involved in LOH.");
+  DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
+
+  SmallVector<const MachineInstr *, 2> Args;
+  Args.push_back(&Def);
+  Args.push_back(&Use);
+
+  AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd
+                                                           : MCLOH_AdrpLdrGot,
+                          Args);
+  return true;
+}
+
+/// Based on the use to defs information (in non-ADRPMode), compute the
+/// opportunities of LOH non-ADRP-related
+static void computeOthers(const InstrToInstrs &UseToDefs,
+                          const InstrToInstrs *DefsPerColorToUses,
+                          AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId,
+                          const MachineDominatorTree *MDT) {
+  SetOfMachineInstr *InvolvedInLOHs = nullptr;
+#ifdef DEBUG
+  SetOfMachineInstr InvolvedInLOHsStorage;
+  InvolvedInLOHs = &InvolvedInLOHsStorage;
+#endif // DEBUG
+  DEBUG(dbgs() << "*** Compute LOH for Others\n");
+  // ADRP -> ADD/LDR -> LDR/STR pattern.
+  // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern.
+
+  // FIXME: When the statistics are not important,
+  // This initial filtering loop can be merged into the next loop.
+  // Currently, we didn't do it to have the same code for both DEBUG and
+  // NDEBUG builds. Indeed, the iterator of the second loop would need
+  // to be changed.
+  SetOfMachineInstr PotentialCandidates;
+  SetOfMachineInstr PotentialADROpportunities;
+  for (auto &Use : UseToDefs) {
+    // If no definition is available, this is a non candidate.
+    if (Use.second.empty())
+      continue;
+    // Keep only instructions that are load or store and at the end of
+    // a ADRP -> ADD/LDR/Nothing chain.
+    // We already filtered out the no-chain cases.
+    if (!isCandidate(Use.first, UseToDefs, MDT)) {
+      PotentialADROpportunities.insert(Use.first);
+      continue;
+    }
+    PotentialCandidates.insert(Use.first);
+  }
+
+  // Make the following distinctions for statistics as the linker does
+  // know how to decode instructions:
+  // - ADD/LDR/Nothing make there different patterns.
+  // - LDR/STR make two different patterns.
+  // Hence, 6 - 1 base patterns.
+  // (because ADRP-> Nothing -> STR is not simplifiable)
+
+  // The linker is only able to have a simple semantic, i.e., if pattern A
+  // do B.
+  // However, we want to see the opportunity we may miss if we were able to
+  // catch more complex cases.
+
+  // PotentialCandidates are result of a chain ADRP -> ADD/LDR ->
+  // A potential candidate becomes a candidate, if its current immediate
+  // operand is zero and all nodes of the chain have respectively only one user
+#ifdef DEBUG
+  SetOfMachineInstr DefsOfPotentialCandidates;
+#endif
+  for (const MachineInstr *Candidate : PotentialCandidates) {
+    // Get the definition of the candidate i.e., ADD or LDR.
+    const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
+    // Record the elements of the chain.
+    const MachineInstr *L1 = Def;
+    const MachineInstr *L2 = nullptr;
+    unsigned ImmediateDefOpc = Def->getOpcode();
+    if (Def->getOpcode() != AArch64::ADRP) {
+      // Check the number of users of this node.
+      const SetOfMachineInstr *Users =
+          getUses(DefsPerColorToUses,
+                  RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+      if (Users->size() > 1) {
+#ifdef DEBUG
+        // if all the uses of this def are in potential candidate, this is
+        // a complex candidate of level 2.
+        bool IsLevel2 = true;
+        for (const MachineInstr *MI : *Users) {
+          if (!PotentialCandidates.count(MI)) {
+            ++NumTooCplxLvl2;
+            IsLevel2 = false;
+            break;
+          }
+        }
+        if (IsLevel2)
+          ++NumCplxLvl2;
+#endif // DEBUG
+        PotentialADROpportunities.insert(Def);
+        continue;
+      }
+      L2 = Def;
+      Def = *UseToDefs.find(Def)->second.begin();
+      L1 = Def;
+    } // else the element in the middle of the chain is nothing, thus
+      // Def already contains the first element of the chain.
+
+    // Check the number of users of the first node in the chain, i.e., ADRP
+    const SetOfMachineInstr *Users =
+        getUses(DefsPerColorToUses,
+                RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+    if (Users->size() > 1) {
+#ifdef DEBUG
+      // if all the uses of this def are in the defs of the potential candidate,
+      // this is a complex candidate of level 1
+      if (DefsOfPotentialCandidates.empty()) {
+        // lazy init
+        DefsOfPotentialCandidates = PotentialCandidates;
+        for (const MachineInstr *Candidate : PotentialCandidates) {
+          if (!UseToDefs.find(Candidate)->second.empty())
+            DefsOfPotentialCandidates.insert(
+                *UseToDefs.find(Candidate)->second.begin());
+        }
+      }
+      bool Found = false;
+      for (auto &Use : *Users) {
+        if (!DefsOfPotentialCandidates.count(Use)) {
+          ++NumTooCplxLvl1;
+          Found = true;
+          break;
+        }
+      }
+      if (!Found)
+        ++NumCplxLvl1;
+#endif // DEBUG
+      continue;
+    }
+
+    bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
+    // If the chain is three instructions long and ldr is the second element,
+    // then this ldr must load form GOT, otherwise this is not a correct chain.
+    if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT)
+      continue;
+    SmallVector<const MachineInstr *, 3> Args;
+    MCLOHType Kind;
+    if (isCandidateLoad(Candidate)) {
+      if (!L2) {
+        // At this point, the candidate LOH indicates that the ldr instruction
+        // may use a direct access to the symbol. There is not such encoding
+        // for loads of byte and half.
+        if (!supportLoadFromLiteral(Candidate))
+          continue;
+
+        DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate
+                     << '\n');
+        Kind = MCLOH_AdrpLdr;
+        Args.push_back(L1);
+        Args.push_back(Candidate);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+        ++NumADRPToLDR;
+      } else {
+        DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+                     << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+                     << '\n');
+
+        Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr;
+        Args.push_back(L1);
+        Args.push_back(L2);
+        Args.push_back(Candidate);
+
+        PotentialADROpportunities.remove(L2);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+               "L2 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+#ifdef DEBUG
+        // get the immediate of the load
+        if (Candidate->getOperand(2).getImm() == 0)
+          if (ImmediateDefOpc == AArch64::ADDXri)
+            ++NumADDToLDR;
+          else
+            ++NumLDRToLDR;
+        else if (ImmediateDefOpc == AArch64::ADDXri)
+          ++NumADDToLDRWithImm;
+        else
+          ++NumLDRToLDRWithImm;
+#endif // DEBUG
+      }
+    } else {
+      if (ImmediateDefOpc == AArch64::ADRP)
+        continue;
+      else {
+
+        DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+                     << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+                     << '\n');
+
+        Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr;
+        Args.push_back(L1);
+        Args.push_back(L2);
+        Args.push_back(Candidate);
+
+        PotentialADROpportunities.remove(L2);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+               "L2 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+#ifdef DEBUG
+        // get the immediate of the store
+        if (Candidate->getOperand(2).getImm() == 0)
+          if (ImmediateDefOpc == AArch64::ADDXri)
+            ++NumADDToSTR;
+          else
+            ++NumLDRToSTR;
+        else if (ImmediateDefOpc == AArch64::ADDXri)
+          ++NumADDToSTRWithImm;
+        else
+          ++NumLDRToSTRWithImm;
+#endif // DEBUG
+      }
+    }
+    AArch64FI.addLOHDirective(Kind, Args);
+  }
+
+  // Now, we grabbed all the big patterns, check ADR opportunities.
+  for (const MachineInstr *Candidate : PotentialADROpportunities)
+    registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI,
+                         InvolvedInLOHs, RegToId);
+}
+
+/// Look for every register defined by potential LOHs candidates.
+/// Map these registers with dense id in @p RegToId and vice-versa in
+/// @p IdToReg. @p IdToReg is populated only in DEBUG mode.
+static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
+                               MapIdToReg &IdToReg,
+                               const TargetRegisterInfo *TRI) {
+  unsigned CurRegId = 0;
+  if (!PreCollectRegister) {
+    unsigned NbReg = TRI->getNumRegs();
+    for (; CurRegId < NbReg; ++CurRegId) {
+      RegToId[CurRegId] = CurRegId;
+      DEBUG(IdToReg.push_back(CurRegId));
+      DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches"));
+    }
+    return;
+  }
+
+  DEBUG(dbgs() << "** Collect Involved Register\n");
+  for (const auto &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (!canDefBePartOfLOH(&MI))
+        continue;
+
+      // Process defs
+      for (MachineInstr::const_mop_iterator IO = MI.operands_begin(),
+                                            IOEnd = MI.operands_end();
+           IO != IOEnd; ++IO) {
+        if (!IO->isReg() || !IO->isDef())
+          continue;
+        unsigned CurReg = IO->getReg();
+        for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI)
+          if (RegToId.find(*AI) == RegToId.end()) {
+            DEBUG(IdToReg.push_back(*AI);
+                  assert(IdToReg[CurRegId] == *AI &&
+                         "Reg index mismatches insertion index."));
+            RegToId[*AI] = CurRegId++;
+            DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n');
+          }
+      }
+    }
+  }
+}
+
+bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+
+  MapRegToId RegToId;
+  MapIdToReg IdToReg;
+  AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>();
+  assert(AArch64FI && "No MachineFunctionInfo for this function!");
+
+  DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n');
+
+  collectInvolvedReg(MF, RegToId, IdToReg, TRI);
+  if (RegToId.empty())
+    return false;
+
+  MachineInstr *DummyOp = nullptr;
+  if (BasicBlockScopeOnly) {
+    const AArch64InstrInfo *TII =
+        static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+    // For local analysis, create a dummy operation to record uses that are not
+    // local.
+    DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
+  }
+
+  unsigned NbReg = RegToId.size();
+  bool Modified = false;
+
+  // Start with ADRP.
+  InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+  // Compute the reaching def in ADRP mode, meaning ADRP definitions
+  // are first considered as uses.
+  reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp);
+  DEBUG(dbgs() << "ADRP reaching defs\n");
+  DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+  // Translate the definition to uses map into a use to definitions map to ease
+  // statistic computation.
+  InstrToInstrs ADRPToReachingDefs;
+  reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true);
+
+  // Compute LOH for ADRP.
+  computeADRP(ADRPToReachingDefs, *AArch64FI, MDT);
+  delete[] ColorOpToReachedUses;
+
+  // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern.
+  ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+  // first perform a regular reaching def analysis.
+  reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp);
+  DEBUG(dbgs() << "All reaching defs\n");
+  DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+  // Turn that into a use to defs to ease statistic computation.
+  InstrToInstrs UsesToReachingDefs;
+  reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false);
+
+  // Compute other than AdrpAdrp LOH.
+  computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId,
+                MDT);
+  delete[] ColorOpToReachedUses;
+
+  if (BasicBlockScopeOnly)
+    MF.DeleteMachineInstr(DummyOp);
+
+  return Modified;
+}
+
+/// createAArch64CollectLOHPass - returns an instance of the Statistic for
+/// linker optimization pass.
+FunctionPass *llvm::createAArch64CollectLOHPass() {
+  return new AArch64CollectLOH();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
new file mode 100644
index 0000000..452cdec
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -0,0 +1,919 @@
+//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64ConditionalCompares pass which reduces
+// branching and code size by using the conditional compare instructions CCMP,
+// CCMN, and FCMP.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ccmp"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+    "aarch64-ccmp-limit", cl::init(30), cl::Hidden,
+    cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden,
+                            cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
+STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
+STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)");
+STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)");
+
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
+
+//===----------------------------------------------------------------------===//
+//                                 SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+//   From:         Head            To:         Head
+//                 / |                         CmpBB
+//                /  |                         / |
+//               |  CmpBB                     /  |
+//               |  / |                    Tail  |
+//               | /  |                      |   |
+//              Tail  |                      |   |
+//                |   |                      |   |
+//               ... ...                    ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The AArch64 conditional compare instructions have an immediate
+// operand that specifies the NZCV flag values when the condition is false and
+// the compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+//    if (a == 5 || b == 17)
+//      foo();
+//
+//    Head:
+//       cmp  w0, #5
+//       b.eq Tail
+//    CmpBB:
+//       cmp  w1, #17
+//       b.eq Tail
+//    ...
+//    Tail:
+//      bl _foo
+//
+//  Becomes:
+//
+//    Head:
+//       cmp  w0, #5
+//       ccmp w1, #17, 4, ne  ; 4 = nZcv
+//       b.eq Tail
+//    ...
+//    Tail:
+//      bl _foo
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+//
+// FIXME: It should also be possible to speculate a block on the critical edge
+// between Head and Tail, just like if-converting a diamond.
+//
+// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
+
+namespace {
+class SSACCmpConv {
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+public:
+  /// The first block containing a conditional branch, dominating everything
+  /// else.
+  MachineBasicBlock *Head;
+
+  /// The block containing cmp+br.cond with a successor shared with Head.
+  MachineBasicBlock *CmpBB;
+
+  /// The common successor for Head and CmpBB.
+  MachineBasicBlock *Tail;
+
+  /// The compare instruction in CmpBB that can be converted to a ccmp.
+  MachineInstr *CmpMI;
+
+private:
+  /// The branch condition in Head as determined by AnalyzeBranch.
+  SmallVector<MachineOperand, 4> HeadCond;
+
+  /// The condition code that makes Head branch to CmpBB.
+  AArch64CC::CondCode HeadCmpBBCC;
+
+  /// The branch condition in CmpBB.
+  SmallVector<MachineOperand, 4> CmpBBCond;
+
+  /// The condition code that makes CmpBB branch to Tail.
+  AArch64CC::CondCode CmpBBTailCC;
+
+  /// Check if the Tail PHIs are trivially convertible.
+  bool trivialTailPHIs();
+
+  /// Remove CmpBB from the Tail PHIs.
+  void updateTailPHIs();
+
+  /// Check if an operand defining DstReg is dead.
+  bool isDeadDef(unsigned DstReg);
+
+  /// Find the compare instruction in MBB that controls the conditional branch.
+  /// Return NULL if a convertible instruction can't be found.
+  MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+  /// Return true if all non-terminator instructions in MBB can be safely
+  /// speculated.
+  bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+  /// runOnMachineFunction - Initialize per-function data structures.
+  void runOnMachineFunction(MachineFunction &MF) {
+    this->MF = &MF;
+    TII = MF.getTarget().getInstrInfo();
+    TRI = MF.getTarget().getRegisterInfo();
+    MRI = &MF.getRegInfo();
+  }
+
+  /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+  /// internal state, and return true.
+  bool canConvert(MachineBasicBlock *MBB);
+
+  /// Cmo-convert the last block passed to canConvertCmp(), assuming
+  /// it is possible. Add any erased blocks to RemovedBlocks.
+  void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+
+  /// Return the expected code size delta if the conversion into a
+  /// conditional compare is performed.
+  int expectedCodeSizeDelta() const;
+};
+} // end anonymous namespace
+
+// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
+// This means that no if-conversion is required when merging CmpBB into Head.
+bool SSACCmpConv::trivialTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    unsigned HeadReg = 0, CmpBBReg = 0;
+    // PHI operands come in (VReg, MBB) pairs.
+    for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+      MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+      unsigned Reg = I.getOperand(oi).getReg();
+      if (MBB == Head) {
+        assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+        HeadReg = Reg;
+      }
+      if (MBB == CmpBB) {
+        assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+        CmpBBReg = Reg;
+      }
+    }
+    if (HeadReg != CmpBBReg)
+      return false;
+  }
+  return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    // I is a PHI. It can have multiple entries for CmpBB.
+    for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
+      // PHI operands are (Reg, MBB) at (oi-2, oi-1).
+      if (I.getOperand(oi - 1).getMBB() == CmpBB) {
+        I.RemoveOperand(oi - 1);
+        I.RemoveOperand(oi - 2);
+      }
+    }
+  }
+}
+
+// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares
+// are still writing virtual registers without any uses.
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+  // Writes to the zero register are dead.
+  if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
+    return true;
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+  // A virtual register def without any uses will be marked dead later, and
+  // eventually replaced by the zero register.
+  return MRI->use_nodbg_empty(DstReg);
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Return
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+  // A normal br.cond simply has the condition code.
+  if (Cond[0].getImm() != -1) {
+    assert(Cond.size() == 1 && "Unknown Cond array format");
+    CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    return true;
+  }
+  // For tbz and cbz instruction, the opcode is next.
+  switch (Cond[1].getImm()) {
+  default:
+    // This includes tbz / tbnz branches which can't be converted to
+    // ccmp + br.cond.
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+    assert(Cond.size() == 3 && "Unknown Cond array format");
+    CC = AArch64CC::EQ;
+    return true;
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    assert(Cond.size() == 3 && "Unknown Cond array format");
+    CC = AArch64CC::NE;
+    return true;
+  }
+}
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+  if (I == MBB->end())
+    return nullptr;
+  // The terminator must be controlled by the flags.
+  if (!I->readsRegister(AArch64::NZCV)) {
+    switch (I->getOpcode()) {
+    case AArch64::CBZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZW:
+    case AArch64::CBNZX:
+      // These can be converted into a ccmp against #0.
+      return I;
+    }
+    ++NumCmpTermRejs;
+    DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+    return nullptr;
+  }
+
+  // Now find the instruction controlling the terminator.
+  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+    --I;
+    assert(!I->isTerminator() && "Spurious terminator");
+    switch (I->getOpcode()) {
+    // cmp is an alias for subs with a dead destination register.
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri:
+    // cmn is an alias for adds with a dead destination register.
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri:
+      // Check that the immediate operand is within range, ccmp wants a uimm5.
+      // Rd = SUBSri Rn, imm, shift
+      if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
+        DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+        ++NumImmRangeRejs;
+        return nullptr;
+      }
+    // Fall through.
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
+      if (isDeadDef(I->getOperand(0).getReg()))
+        return I;
+      DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+      ++NumLiveDstRejs;
+      return nullptr;
+    case AArch64::FCMPSrr:
+    case AArch64::FCMPDrr:
+    case AArch64::FCMPESrr:
+    case AArch64::FCMPEDrr:
+      return I;
+    }
+
+    // Check for flag reads and clobbers.
+    MIOperands::PhysRegInfo PRI =
+        MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
+
+    if (PRI.Reads) {
+      // The ccmp doesn't produce exactly the same flags as the original
+      // compare, so reject the transform if there are uses of the flags
+      // besides the terminators.
+      DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+      ++NumMultNZCVUses;
+      return nullptr;
+    }
+
+    if (PRI.Clobbers) {
+      DEBUG(dbgs() << "Not convertible compare: " << *I);
+      ++NumUnknNZCVDefs;
+      return nullptr;
+    }
+  }
+  DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+  return nullptr;
+}
+
+/// Determine if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// Only CmpMI is allowed to clobber the flags.
+///
+bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
+                                     const MachineInstr *CmpMI) {
+  // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to
+  // get right.
+  if (!MBB->livein_empty()) {
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+    return false;
+  }
+
+  unsigned InstrCount = 0;
+
+  // Check all instructions, except the terminators. It is assumed that
+  // terminators never have side effects or define any used register values.
+  for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
+    if (I.isDebugValue())
+      continue;
+
+    if (++InstrCount > BlockInstrLimit && !Stress) {
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+                   << BlockInstrLimit << " instructions.\n");
+      return false;
+    }
+
+    // There shouldn't normally be any phis in a single-predecessor block.
+    if (I.isPHI()) {
+      DEBUG(dbgs() << "Can't hoist: " << I);
+      return false;
+    }
+
+    // Don't speculate loads. Note that it may be possible and desirable to
+    // speculate GOT or constant pool loads that are guaranteed not to trap,
+    // but we don't support that for now.
+    if (I.mayLoad()) {
+      DEBUG(dbgs() << "Won't speculate load: " << I);
+      return false;
+    }
+
+    // We never speculate stores, so an AA pointer isn't necessary.
+    bool DontMoveAcrossStore = true;
+    if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) {
+      DEBUG(dbgs() << "Can't speculate: " << I);
+      return false;
+    }
+
+    // Only CmpMI is allowed to clobber the flags.
+    if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
+      DEBUG(dbgs() << "Clobbers flags: " << I);
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
+/// candidate for cmp-conversion. Fill out the internal state.
+///
+bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
+  Head = MBB;
+  Tail = CmpBB = nullptr;
+
+  if (Head->succ_size() != 2)
+    return false;
+  MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+  MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+  // CmpBB can only have a single predecessor. Tail is allowed many.
+  if (Succ0->pred_size() != 1)
+    std::swap(Succ0, Succ1);
+
+  // Succ0 is our candidate for CmpBB.
+  if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
+    return false;
+
+  CmpBB = Succ0;
+  Tail = Succ1;
+
+  if (!CmpBB->isSuccessor(Tail))
+    return false;
+
+  // The CFG topology checks out.
+  DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
+               << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
+  ++NumConsidered;
+
+  // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
+  //
+  // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
+  // defined before The CmpBB cmp clobbers the flags. Alternatively, it should
+  // always be safe to sink the ccmp down to immediately before the CmpBB
+  // terminators.
+  if (!trivialTailPHIs()) {
+    DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+    ++NumPhiRejs;
+    return false;
+  }
+
+  if (!Tail->livein_empty()) {
+    DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // CmpBB should never have PHIs since Head is its only predecessor.
+  // FIXME: Clean them up if it happens.
+  if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+    DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+    ++NumPhi2Rejs;
+    return false;
+  }
+
+  if (!CmpBB->livein_empty()) {
+    DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // The branch we're looking to eliminate must be analyzable.
+  HeadCond.clear();
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
+    DEBUG(dbgs() << "Head branch not analyzable.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // This is weird, probably some sort of degenerate CFG, or an edge to a
+  // landing pad.
+  if (!TBB || HeadCond.empty()) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(HeadCond, HeadCmpBBCC)) {
+    DEBUG(dbgs() << "Unsupported branch type on Head\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // Make sure the branch direction is right.
+  if (TBB != CmpBB) {
+    assert(TBB == Tail && "Unexpected TBB");
+    HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC);
+  }
+
+  CmpBBCond.clear();
+  TBB = FBB = nullptr;
+  if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+    DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!TBB || CmpBBCond.empty()) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+    DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (TBB != Tail)
+    CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
+
+  DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
+               << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
+               << '\n');
+
+  CmpMI = findConvertibleCompare(CmpBB);
+  if (!CmpMI)
+    return false;
+
+  if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+    ++NumSpeculateRejs;
+    return false;
+  }
+  return true;
+}
+
+void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
+  DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
+               << Head->getNumber() << ":\n" << *CmpBB);
+
+  // All CmpBB instructions are moved into Head, and CmpBB is deleted.
+  // Update the CFG first.
+  updateTailPHIs();
+  Head->removeSuccessor(CmpBB);
+  CmpBB->removeSuccessor(Tail);
+  Head->transferSuccessorsAndUpdatePHIs(CmpBB);
+  DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
+  TII->RemoveBranch(*Head);
+
+  // If the Head terminator was one of the cbz / tbz branches with built-in
+  // compare, we need to insert an explicit compare instruction in its place.
+  if (HeadCond[0].getImm() == -1) {
+    ++NumCompBranches;
+    unsigned Opc = 0;
+    switch (HeadCond[1].getImm()) {
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+      Opc = AArch64::SUBSWri;
+      break;
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+      Opc = AArch64::SUBSXri;
+      break;
+    default:
+      llvm_unreachable("Cannot convert Head branch");
+    }
+    const MCInstrDesc &MCID = TII->get(Opc);
+    // Create a dummy virtual register for the SUBS def.
+    unsigned DestReg =
+        MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
+    // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
+    BuildMI(*Head, Head->end(), TermDL, MCID)
+        .addReg(DestReg, RegState::Define | RegState::Dead)
+        .addOperand(HeadCond[2])
+        .addImm(0)
+        .addImm(0);
+    // SUBS uses the GPR*sp register classes.
+    MRI->constrainRegClass(HeadCond[2].getReg(),
+                           TII->getRegClass(MCID, 1, TRI, *MF));
+  }
+
+  Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
+
+  // Now replace CmpMI with a ccmp instruction that also considers the incoming
+  // flags.
+  unsigned Opc = 0;
+  unsigned FirstOp = 1;   // First CmpMI operand to copy.
+  bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
+  switch (CmpMI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown compare opcode");
+  case AArch64::SUBSWri:    Opc = AArch64::CCMPWi; break;
+  case AArch64::SUBSWrr:    Opc = AArch64::CCMPWr; break;
+  case AArch64::SUBSXri:    Opc = AArch64::CCMPXi; break;
+  case AArch64::SUBSXrr:    Opc = AArch64::CCMPXr; break;
+  case AArch64::ADDSWri:    Opc = AArch64::CCMNWi; break;
+  case AArch64::ADDSWrr:    Opc = AArch64::CCMNWr; break;
+  case AArch64::ADDSXri:    Opc = AArch64::CCMNXi; break;
+  case AArch64::ADDSXrr:    Opc = AArch64::CCMNXr; break;
+  case AArch64::FCMPSrr:    Opc = AArch64::FCCMPSrr; FirstOp = 0; break;
+  case AArch64::FCMPDrr:    Opc = AArch64::FCCMPDrr; FirstOp = 0; break;
+  case AArch64::FCMPESrr:   Opc = AArch64::FCCMPESrr; FirstOp = 0; break;
+  case AArch64::FCMPEDrr:   Opc = AArch64::FCCMPEDrr; FirstOp = 0; break;
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+    Opc = AArch64::CCMPWi;
+    FirstOp = 0;
+    isZBranch = true;
+    break;
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    Opc = AArch64::CCMPXi;
+    FirstOp = 0;
+    isZBranch = true;
+    break;
+  }
+
+  // The ccmp instruction should set the flags according to the comparison when
+  // Head would have branched to CmpBB.
+  // The NZCV immediate operand should provide flags for the case where Head
+  // would have branched to Tail. These flags should cause the new Head
+  // terminator to branch to tail.
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+  const MCInstrDesc &MCID = TII->get(Opc);
+  MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
+                         TII->getRegClass(MCID, 0, TRI, *MF));
+  if (CmpMI->getOperand(FirstOp + 1).isReg())
+    MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
+                           TII->getRegClass(MCID, 1, TRI, *MF));
+  MachineInstrBuilder MIB =
+      BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+          .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+  if (isZBranch)
+    MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
+  else
+    MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+  MIB.addImm(NZCV).addImm(HeadCmpBBCC);
+
+  // If CmpMI was a terminator, we need a new conditional branch to replace it.
+  // This now becomes a Head terminator.
+  if (isZBranch) {
+    bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW ||
+                CmpMI->getOpcode() == AArch64::CBNZX;
+    BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
+        .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
+        .addOperand(CmpMI->getOperand(1)); // Branch target.
+  }
+  CmpMI->eraseFromParent();
+  Head->updateTerminator();
+
+  RemovedBlocks.push_back(CmpBB);
+  CmpBB->eraseFromParent();
+  DEBUG(dbgs() << "Result:\n" << *Head);
+  ++NumConverted;
+}
+
+int SSACCmpConv::expectedCodeSizeDelta() const {
+  int delta = 0;
+  // If the Head terminator was one of the cbz / tbz branches with built-in
+  // compare, we need to insert an explicit compare instruction in its place
+  // plus a branch instruction.
+  if (HeadCond[0].getImm() == -1) {
+    switch (HeadCond[1].getImm()) {
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+      // Therefore delta += 1
+      delta = 1;
+      break;
+    default:
+      llvm_unreachable("Cannot convert Head branch");
+    }
+  }
+  // If the Cmp terminator was one of the cbz / tbz branches with
+  // built-in compare, it will be turned into a compare instruction
+  // into Head, but we do not save any instruction.
+  // Otherwise, we save the branch instruction.
+  switch (CmpMI->getOpcode()) {
+  default:
+    --delta;
+    break;
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    break;
+  }
+  return delta;
+}
+
+//===----------------------------------------------------------------------===//
+//                       AArch64ConditionalCompares Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64ConditionalCompares : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const MCSchedModel *SchedModel;
+  // Does the proceeded function has Oz attribute.
+  bool MinSize;
+  MachineRegisterInfo *MRI;
+  MachineDominatorTree *DomTree;
+  MachineLoopInfo *Loops;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+  SSACCmpConv CmpConv;
+
+public:
+  static char ID;
+  AArch64ConditionalCompares() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override {
+    return "AArch64 Conditional Compares";
+  }
+
+private:
+  bool tryConvert(MachineBasicBlock *);
+  void updateDomTree(ArrayRef<MachineBasicBlock *> Removed);
+  void updateLoops(ArrayRef<MachineBasicBlock *> Removed);
+  void invalidateTraces();
+  bool shouldConvert();
+};
+} // end anonymous namespace
+
+char AArch64ConditionalCompares::ID = 0;
+
+namespace llvm {
+void initializeAArch64ConditionalComparesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
+                      "AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
+                    "AArch64 CCMP Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionalCompares() {
+  return new AArch64ConditionalCompares();
+}
+
+void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineBranchProbabilityInfo>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<MachineTraceMetrics>();
+  AU.addPreserved<MachineTraceMetrics>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void AArch64ConditionalCompares::updateDomTree(
+    ArrayRef<MachineBasicBlock *> Removed) {
+  // convert() removes CmpBB which was previously dominated by Head.
+  // CmpBB children should be transferred to Head.
+  MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
+  for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
+    MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+    assert(Node != HeadNode && "Cannot erase the head node");
+    assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
+    while (Node->getNumChildren())
+      DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+    DomTree->eraseNode(Removed[i]);
+  }
+}
+
+/// Update LoopInfo after if-conversion.
+void
+AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
+  if (!Loops)
+    return;
+  for (unsigned i = 0, e = Removed.size(); i != e; ++i)
+    Loops->removeBlock(Removed[i]);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void AArch64ConditionalCompares::invalidateTraces() {
+  Traces->invalidate(CmpConv.Head);
+  Traces->invalidate(CmpConv.CmpBB);
+}
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool AArch64ConditionalCompares::shouldConvert() {
+  // Stress testing mode disables all cost considerations.
+  if (Stress)
+    return true;
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  // Head dominates CmpBB, so it is always included in its trace.
+  MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB);
+
+  // If code size is the main concern
+  if (MinSize) {
+    int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
+    DEBUG(dbgs() << "Code size delta:  " << CodeSizeDelta << '\n');
+    // If we are minimizing the code size, do the conversion whatever
+    // the cost is.
+    if (CodeSizeDelta < 0)
+      return true;
+    if (CodeSizeDelta > 0) {
+      DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+      return false;
+    }
+    // CodeSizeDelta == 0, continue with the regular heuristics
+  }
+
+  // Heuristic: The compare conversion delays the execution of the branch
+  // instruction because we must wait for the inputs to the second compare as
+  // well. The branch has no dependent instructions, but delaying it increases
+  // the cost of a misprediction.
+  //
+  // Set a limit on the delay we will accept.
+  unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
+
+  // Instruction depths can be computed for all trace instructions above CmpBB.
+  unsigned HeadDepth =
+      Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth;
+  unsigned CmpBBDepth =
+      Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth;
+  DEBUG(dbgs() << "Head depth:  " << HeadDepth
+               << "\nCmpBB depth: " << CmpBBDepth << '\n');
+  if (CmpBBDepth > HeadDepth + DelayLimit) {
+    DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+                 << " cycles.\n");
+    return false;
+  }
+
+  // Check the resource depth at the bottom of CmpBB - these instructions will
+  // be speculated.
+  unsigned ResDepth = Trace.getResourceDepth(true);
+  DEBUG(dbgs() << "Resources:   " << ResDepth << '\n');
+
+  // Heuristic: The speculatively executed instructions must all be able to
+  // merge into the Head block. The Head critical path should dominate the
+  // resource cost of the speculated instructions.
+  if (ResDepth > HeadDepth) {
+    DEBUG(dbgs() << "Too many instructions to speculate.\n");
+    return false;
+  }
+  return true;
+}
+
+bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  while (CmpConv.canConvert(MBB) && shouldConvert()) {
+    invalidateTraces();
+    SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+    CmpConv.convert(RemovedBlocks);
+    Changed = true;
+    updateDomTree(RemovedBlocks);
+    updateLoops(RemovedBlocks);
+  }
+  return Changed;
+}
+
+bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  SchedModel =
+      MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+  MRI = &MF.getRegInfo();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = nullptr;
+  MinSize = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::MinSize);
+
+  bool Changed = false;
+  CmpConv.runOnMachineFunction(MF);
+
+  // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+  // cmp-conversions from the same head block.
+  // Note that updateDomTree() modifies the children of the DomTree node
+  // currently being visited. The df_iterator supports that; it doesn't look at
+  // child_begin() / child_end() until after a node has been visited.
+  for (auto *I : depth_first(DomTree))
+    if (tryConvert(I->getBlock()))
+      Changed = true;
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
new file mode 100644
index 0000000..a2d853c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -0,0 +1,134 @@
+//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When allowed by the instruction, replace a dead definition of a GPR with
+// the zero register. This makes the code a bit friendlier towards the
+// hardware's register renamer.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-dead-defs"
+
+STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+
+namespace {
+class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
+private:
+  const TargetRegisterInfo *TRI;
+  bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI);
+  bool processMachineBasicBlock(MachineBasicBlock &MBB);
+  bool usesFrameIndex(const MachineInstr &MI);
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
+
+  virtual bool runOnMachineFunction(MachineFunction &F) override;
+
+  const char *getPassName() const override { return "Dead register definitions"; }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64DeadRegisterDefinitions::ID = 0;
+} // end anonymous namespace
+
+bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg(
+    unsigned Reg, const MachineInstr &MI) {
+  for (const MachineOperand &MO : MI.implicit_operands())
+    if (MO.isReg() && MO.isDef())
+      if (TRI->regsOverlap(Reg, MO.getReg()))
+        return true;
+  return false;
+}
+
+bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) {
+  for (const MachineOperand &Op : MI.uses())
+    if (Op.isFI())
+      return true;
+  return false;
+}
+
+bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
+    MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineInstr &MI : MBB) {
+    if (usesFrameIndex(MI)) {
+      // We need to skip this instruction because while it appears to have a
+      // dead def it uses a frame index which might expand into a multi
+      // instruction sequence during EPI.
+      DEBUG(dbgs() << "    Ignoring, operand is frame index\n");
+      continue;
+    }
+    for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && MO.isDead() && MO.isDef()) {
+        assert(!MO.isImplicit() && "Unexpected implicit def!");
+        DEBUG(dbgs() << "  Dead def operand #" << i << " in:\n    ";
+              MI.print(dbgs()));
+        // Be careful not to change the register if it's a tied operand.
+        if (MI.isRegTiedToUseOperand(i)) {
+          DEBUG(dbgs() << "    Ignoring, def is tied operand.\n");
+          continue;
+        }
+        // Don't change the register if there's an implicit def of a subreg or
+        // supperreg.
+        if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) {
+          DEBUG(dbgs() << "    Ignoring, implicitly defines overlap reg.\n");
+          continue;
+        }
+        // Make sure the instruction take a register class that contains
+        // the zero register and replace it if so.
+        unsigned NewReg;
+        switch (MI.getDesc().OpInfo[i].RegClass) {
+        default:
+          DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
+          continue;
+        case AArch64::GPR32RegClassID:
+          NewReg = AArch64::WZR;
+          break;
+        case AArch64::GPR64RegClassID:
+          NewReg = AArch64::XZR;
+          break;
+        }
+        DEBUG(dbgs() << "    Replacing with zero register. New:\n      ");
+        MO.setReg(NewReg);
+        DEBUG(MI.print(dbgs()));
+        ++NumDeadDefsReplaced;
+      }
+    }
+  }
+  return Changed;
+}
+
+// Scan the function for instructions that have a dead definition of a
+// register. Replace that register with the zero register when possible.
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
+  TRI = MF.getTarget().getRegisterInfo();
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+
+  for (auto &MBB : MF)
+    if (processMachineBasicBlock(MBB))
+      Changed = true;
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64DeadRegisterDefinitions() {
+  return new AArch64DeadRegisterDefinitions();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
new file mode 100644
index 0000000..8839085
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -0,0 +1,736 @@
+//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations.  This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+namespace {
+class AArch64ExpandPseudo : public MachineFunctionPass {
+public:
+  static char ID;
+  AArch64ExpandPseudo() : MachineFunctionPass(ID) {}
+
+  const AArch64InstrInfo *TII;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  const char *getPassName() const override {
+    return "AArch64 pseudo instruction expansion pass";
+  }
+
+private:
+  bool expandMBB(MachineBasicBlock &MBB);
+  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+  bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                    unsigned BitSize);
+};
+char AArch64ExpandPseudo::ID = 0;
+}
+
+/// \brief Transfer implicit operands on the pseudo instruction to the
+/// instructions created from the expansion.
+static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+                           MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
+       ++i) {
+    const MachineOperand &MO = OldMI.getOperand(i);
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.addOperand(MO);
+    else
+      DefMI.addOperand(MO);
+  }
+}
+
+/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// 64-bit value.
+static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
+  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
+}
+
+/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
+/// value. Indices correspond to element numbers in a v4i16.
+static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
+  assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
+  const unsigned ShiftAmt = ToIdx * 16;
+
+  // Replicate the source chunk to the destination position.
+  const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
+  // Clear the destination chunk.
+  Imm &= ~(0xFFFFLL << ShiftAmt);
+  // Insert the replicated chunk.
+  return Imm | Chunk;
+}
+
+/// \brief Helper function which tries to materialize a 64-bit value with an
+/// ORR + MOVK instruction sequence.
+static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
+                       MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator &MBBI,
+                       const AArch64InstrInfo *TII, unsigned ChunkIdx) {
+  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+  const unsigned ShiftAmt = ChunkIdx * 16;
+
+  uint64_t Encoding;
+  if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
+    // Create the ORR-immediate instruction.
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+            .addOperand(MI.getOperand(0))
+            .addReg(AArch64::XZR)
+            .addImm(Encoding);
+
+    // Create the MOVK instruction.
+    const unsigned Imm16 = getChunk(UImm, ChunkIdx);
+    const unsigned DstReg = MI.getOperand(0).getReg();
+    const bool DstIsDead = MI.getOperand(0).isDead();
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+            .addReg(DstReg)
+            .addImm(Imm16)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+    transferImpOps(MI, MIB, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
+/// can be materialized with an ORR instruction.
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
+  Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
+
+  return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
+}
+
+/// \brief Check for identical 16-bit chunks within the constant and if so
+/// materialize them with a single ORR instruction. The remaining one or two
+/// 16-bit chunks will be materialized with MOVK instructions.
+///
+/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
+/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
+/// an ORR instruction.
+///
+static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
+                                 MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI,
+                                 const AArch64InstrInfo *TII) {
+  typedef DenseMap<uint64_t, unsigned> CountMap;
+  CountMap Counts;
+
+  // Scan the constant and count how often every chunk occurs.
+  for (unsigned Idx = 0; Idx < 4; ++Idx)
+    ++Counts[getChunk(UImm, Idx)];
+
+  // Traverse the chunks to find one which occurs more than once.
+  for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
+       Chunk != End; ++Chunk) {
+    const uint64_t ChunkVal = Chunk->first;
+    const unsigned Count = Chunk->second;
+
+    uint64_t Encoding = 0;
+
+    // We are looking for chunks which have two or three instances and can be
+    // materialized with an ORR instruction.
+    if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
+      continue;
+
+    const bool CountThree = Count == 3;
+    // Create the ORR-immediate instruction.
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+            .addOperand(MI.getOperand(0))
+            .addReg(AArch64::XZR)
+            .addImm(Encoding);
+
+    const unsigned DstReg = MI.getOperand(0).getReg();
+    const bool DstIsDead = MI.getOperand(0).isDead();
+
+    unsigned ShiftAmt = 0;
+    uint64_t Imm16 = 0;
+    // Find the first chunk not materialized with the ORR instruction.
+    for (; ShiftAmt < 64; ShiftAmt += 16) {
+      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+      if (Imm16 != ChunkVal)
+        break;
+    }
+
+    // Create the first MOVK instruction.
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+            .addReg(DstReg,
+                    RegState::Define | getDeadRegState(DstIsDead && CountThree))
+            .addReg(DstReg)
+            .addImm(Imm16)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+    // In case we have three instances the whole constant is now materialized
+    // and we can exit.
+    if (CountThree) {
+      transferImpOps(MI, MIB, MIB1);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    // Find the remaining chunk which needs to be materialized.
+    for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
+      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+      if (Imm16 != ChunkVal)
+        break;
+    }
+
+    // Create the second MOVK instruction.
+    MachineInstrBuilder MIB2 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+            .addReg(DstReg)
+            .addImm(Imm16)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+    transferImpOps(MI, MIB, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
+/// starts a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isStartChunk(uint64_t Chunk) {
+  if (Chunk == 0 || Chunk == UINT64_MAX)
+    return false;
+
+  return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64;
+}
+
+/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
+/// ends a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isEndChunk(uint64_t Chunk) {
+  if (Chunk == 0 || Chunk == UINT64_MAX)
+    return false;
+
+  return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64;
+}
+
+/// \brief Clear or set all bits in the chunk at the given index.
+static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
+  const uint64_t Mask = 0xFFFF;
+
+  if (Clear)
+    // Clear chunk in the immediate.
+    Imm &= ~(Mask << (Idx * 16));
+  else
+    // Set all bits in the immediate for the particular chunk.
+    Imm |= Mask << (Idx * 16);
+
+  return Imm;
+}
+
+/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// which might be interrupted by one or two chunks. If so, materialize the
+/// sequence of contiguous ones with an ORR instruction.
+/// Materialize the chunks which are either interrupting the sequence or outside
+/// of the sequence with a MOVK instruction.
+///
+/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
+/// which ends the sequence (0...1...). Then we are looking for constants which
+/// contain at least one S and E chunk.
+/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
+///
+/// We are also looking for constants like |S|A|B|E| where the contiguous
+/// sequence of ones wraps around the MSB into the LSB.
+///
+static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              const AArch64InstrInfo *TII) {
+  const int NotSet = -1;
+  const uint64_t Mask = 0xFFFF;
+
+  int StartIdx = NotSet;
+  int EndIdx = NotSet;
+  // Try to find the chunks which start/end a contiguous sequence of ones.
+  for (int Idx = 0; Idx < 4; ++Idx) {
+    int64_t Chunk = getChunk(UImm, Idx);
+    // Sign extend the 16-bit chunk to 64-bit.
+    Chunk = (Chunk << 48) >> 48;
+
+    if (isStartChunk(Chunk))
+      StartIdx = Idx;
+    else if (isEndChunk(Chunk))
+      EndIdx = Idx;
+  }
+
+  // Early exit in case we can't find a start/end chunk.
+  if (StartIdx == NotSet || EndIdx == NotSet)
+    return false;
+
+  // Outside of the contiguous sequence of ones everything needs to be zero.
+  uint64_t Outside = 0;
+  // Chunks between the start and end chunk need to have all their bits set.
+  uint64_t Inside = Mask;
+
+  // If our contiguous sequence of ones wraps around from the MSB into the LSB,
+  // just swap indices and pretend we are materializing a contiguous sequence
+  // of zeros surrounded by a contiguous sequence of ones.
+  if (StartIdx > EndIdx) {
+    std::swap(StartIdx, EndIdx);
+    std::swap(Outside, Inside);
+  }
+
+  uint64_t OrrImm = UImm;
+  int FirstMovkIdx = NotSet;
+  int SecondMovkIdx = NotSet;
+
+  // Find out which chunks we need to patch up to obtain a contiguous sequence
+  // of ones.
+  for (int Idx = 0; Idx < 4; ++Idx) {
+    const uint64_t Chunk = getChunk(UImm, Idx);
+
+    // Check whether we are looking at a chunk which is not part of the
+    // contiguous sequence of ones.
+    if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
+      OrrImm = updateImm(OrrImm, Idx, Outside == 0);
+
+      // Remember the index we need to patch.
+      if (FirstMovkIdx == NotSet)
+        FirstMovkIdx = Idx;
+      else
+        SecondMovkIdx = Idx;
+
+      // Check whether we are looking a chunk which is part of the contiguous
+      // sequence of ones.
+    } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
+      OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
+
+      // Remember the index we need to patch.
+      if (FirstMovkIdx == NotSet)
+        FirstMovkIdx = Idx;
+      else
+        SecondMovkIdx = Idx;
+    }
+  }
+  assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
+
+  // Create the ORR-immediate instruction.
+  uint64_t Encoding = 0;
+  AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+          .addOperand(MI.getOperand(0))
+          .addReg(AArch64::XZR)
+          .addImm(Encoding);
+
+  const unsigned DstReg = MI.getOperand(0).getReg();
+  const bool DstIsDead = MI.getOperand(0).isDead();
+
+  const bool SingleMovk = SecondMovkIdx == NotSet;
+  // Create the first MOVK instruction.
+  MachineInstrBuilder MIB1 =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+          .addReg(DstReg,
+                  RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
+          .addReg(DstReg)
+          .addImm(getChunk(UImm, FirstMovkIdx))
+          .addImm(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
+
+  // Early exit in case we only need to emit a single MOVK instruction.
+  if (SingleMovk) {
+    transferImpOps(MI, MIB, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Create the second MOVK instruction.
+  MachineInstrBuilder MIB2 =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+          .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+          .addReg(DstReg)
+          .addImm(getChunk(UImm, SecondMovkIdx))
+          .addImm(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
+
+  transferImpOps(MI, MIB, MIB2);
+  MI.eraseFromParent();
+  return true;
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// real move-immediate instructions to synthesize the immediate.
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       unsigned BitSize) {
+  MachineInstr &MI = *MBBI;
+  uint64_t Imm = MI.getOperand(1).getImm();
+  const unsigned Mask = 0xFFFF;
+
+  // Try a MOVI instruction (aka ORR-immediate with the zero register).
+  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+  uint64_t Encoding;
+  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+    unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+            .addOperand(MI.getOperand(0))
+            .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
+            .addImm(Encoding);
+    transferImpOps(MI, MIB, MIB);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Scan the immediate and count the number of 16-bit chunks which are either
+  // all ones or all zeros.
+  unsigned OneChunks = 0;
+  unsigned ZeroChunks = 0;
+  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+    const unsigned Chunk = (Imm >> Shift) & Mask;
+    if (Chunk == Mask)
+      OneChunks++;
+    else if (Chunk == 0)
+      ZeroChunks++;
+  }
+
+  // Since we can't materialize the constant with a single ORR instruction,
+  // let's see whether we can materialize 3/4 of the constant with an ORR
+  // instruction and use an additional MOVK instruction to materialize the
+  // remaining 1/4.
+  //
+  // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
+  //
+  // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
+  // we would create the following instruction sequence:
+  //
+  // ORR x0, xzr, |A|X|A|X|
+  // MOVK x0, |B|, LSL #16
+  //
+  // Only look at 64-bit constants which can't be materialized with a single
+  // instruction e.g. which have less than either three all zero or all one
+  // chunks.
+  //
+  // Ignore 32-bit constants here, they always can be materialized with a
+  // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
+  // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
+  // Thus we fall back to the default code below which in the best case creates
+  // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
+  //
+  if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
+    // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
+    // identical?
+    if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 3 into element 1.
+      uint64_t OrrImm = replicateChunk(UImm, 3, 1);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
+        return true;
+
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 1 into element 3.
+      OrrImm = replicateChunk(UImm, 1, 3);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
+        return true;
+
+      // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
+      // identical?
+    } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 2 into element 0.
+      uint64_t OrrImm = replicateChunk(UImm, 2, 0);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
+        return true;
+
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 1 into element 3.
+      OrrImm = replicateChunk(UImm, 0, 2);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
+        return true;
+    }
+  }
+
+  // Check for identical 16-bit chunks within the constant and if so materialize
+  // them with a single ORR instruction. The remaining one or two 16-bit chunks
+  // will be materialized with MOVK instructions.
+  if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
+    return true;
+
+  // Check whether the constant contains a sequence of contiguous ones, which
+  // might be interrupted by one or two chunks. If so, materialize the sequence
+  // of contiguous ones with an ORR instruction. Materialize the chunks which
+  // are either interrupting the sequence or outside of the sequence with a
+  // MOVK instruction.
+  if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
+    return true;
+
+  // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
+  // more MOVK instructions to insert additional 16-bit portions into the
+  // lower bits.
+  bool isNeg = false;
+
+  // Use MOVN to materialize the high bits if we have more all one chunks
+  // than all zero chunks.
+  if (OneChunks > ZeroChunks) {
+    isNeg = true;
+    Imm = ~Imm;
+  }
+
+  unsigned FirstOpc;
+  if (BitSize == 32) {
+    Imm &= (1LL << 32) - 1;
+    FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
+  } else {
+    FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
+  }
+  unsigned Shift = 0;     // LSL amount for high bits with MOVZ/MOVN
+  unsigned LastShift = 0; // LSL amount for last MOVK
+  if (Imm != 0) {
+    unsigned LZ = countLeadingZeros(Imm);
+    unsigned TZ = countTrailingZeros(Imm);
+    Shift = ((63 - LZ) / 16) * 16;
+    LastShift = (TZ / 16) * 16;
+  }
+  unsigned Imm16 = (Imm >> Shift) & Mask;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  MachineInstrBuilder MIB1 =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
+          .addReg(DstReg, RegState::Define |
+                              getDeadRegState(DstIsDead && Shift == LastShift))
+          .addImm(Imm16)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+
+  // If a MOVN was used for the high bits of a negative value, flip the rest
+  // of the bits back for use with MOVK.
+  if (isNeg)
+    Imm = ~Imm;
+
+  if (Shift == LastShift) {
+    transferImpOps(MI, MIB1, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  MachineInstrBuilder MIB2;
+  unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+  while (Shift != LastShift) {
+    Shift -= 16;
+    Imm16 = (Imm >> Shift) & Mask;
+    if (Imm16 == (isNeg ? Mask : 0))
+      continue; // This 16-bit portion is already set correctly.
+    MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+               .addReg(DstReg,
+                       RegState::Define |
+                           getDeadRegState(DstIsDead && Shift == LastShift))
+               .addReg(DstReg)
+               .addImm(Imm16)
+               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+  }
+
+  transferImpOps(MI, MIB1, MIB2);
+  MI.eraseFromParent();
+  return true;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// do the expansion and return true.  Otherwise return false.
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  default:
+    break;
+
+  case AArch64::ADDWrr:
+  case AArch64::SUBWrr:
+  case AArch64::ADDXrr:
+  case AArch64::SUBXrr:
+  case AArch64::ADDSWrr:
+  case AArch64::SUBSWrr:
+  case AArch64::ADDSXrr:
+  case AArch64::SUBSXrr:
+  case AArch64::ANDWrr:
+  case AArch64::ANDXrr:
+  case AArch64::BICWrr:
+  case AArch64::BICXrr:
+  case AArch64::ANDSWrr:
+  case AArch64::ANDSXrr:
+  case AArch64::BICSWrr:
+  case AArch64::BICSXrr:
+  case AArch64::EONWrr:
+  case AArch64::EONXrr:
+  case AArch64::EORWrr:
+  case AArch64::EORXrr:
+  case AArch64::ORNWrr:
+  case AArch64::ORNXrr:
+  case AArch64::ORRWrr:
+  case AArch64::ORRXrr: {
+    unsigned Opcode;
+    switch (MI.getOpcode()) {
+    default:
+      return false;
+    case AArch64::ADDWrr:      Opcode = AArch64::ADDWrs; break;
+    case AArch64::SUBWrr:      Opcode = AArch64::SUBWrs; break;
+    case AArch64::ADDXrr:      Opcode = AArch64::ADDXrs; break;
+    case AArch64::SUBXrr:      Opcode = AArch64::SUBXrs; break;
+    case AArch64::ADDSWrr:     Opcode = AArch64::ADDSWrs; break;
+    case AArch64::SUBSWrr:     Opcode = AArch64::SUBSWrs; break;
+    case AArch64::ADDSXrr:     Opcode = AArch64::ADDSXrs; break;
+    case AArch64::SUBSXrr:     Opcode = AArch64::SUBSXrs; break;
+    case AArch64::ANDWrr:      Opcode = AArch64::ANDWrs; break;
+    case AArch64::ANDXrr:      Opcode = AArch64::ANDXrs; break;
+    case AArch64::BICWrr:      Opcode = AArch64::BICWrs; break;
+    case AArch64::BICXrr:      Opcode = AArch64::BICXrs; break;
+    case AArch64::ANDSWrr:     Opcode = AArch64::ANDSWrs; break;
+    case AArch64::ANDSXrr:     Opcode = AArch64::ANDSXrs; break;
+    case AArch64::BICSWrr:     Opcode = AArch64::BICSWrs; break;
+    case AArch64::BICSXrr:     Opcode = AArch64::BICSXrs; break;
+    case AArch64::EONWrr:      Opcode = AArch64::EONWrs; break;
+    case AArch64::EONXrr:      Opcode = AArch64::EONXrs; break;
+    case AArch64::EORWrr:      Opcode = AArch64::EORWrs; break;
+    case AArch64::EORXrr:      Opcode = AArch64::EORXrs; break;
+    case AArch64::ORNWrr:      Opcode = AArch64::ORNWrs; break;
+    case AArch64::ORNXrr:      Opcode = AArch64::ORNXrs; break;
+    case AArch64::ORRWrr:      Opcode = AArch64::ORRWrs; break;
+    case AArch64::ORRXrr:      Opcode = AArch64::ORRXrs; break;
+    }
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
+                MI.getOperand(0).getReg())
+            .addOperand(MI.getOperand(1))
+            .addOperand(MI.getOperand(2))
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    transferImpOps(MI, MIB1, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case AArch64::LOADgot: {
+    // Expand into ADRP + LDR.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    const MachineOperand &MO1 = MI.getOperand(1);
+    unsigned Flags = MO1.getTargetFlags();
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
+    MachineInstrBuilder MIB2 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
+            .addOperand(MI.getOperand(0))
+            .addReg(DstReg);
+
+    if (MO1.isGlobal()) {
+      MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
+      MIB2.addGlobalAddress(MO1.getGlobal(), 0,
+                            Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    } else if (MO1.isSymbol()) {
+      MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
+      MIB2.addExternalSymbol(MO1.getSymbolName(),
+                             Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    } else {
+      assert(MO1.isCPI() &&
+             "Only expect globals, externalsymbols, or constant pools");
+      MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+                                Flags | AArch64II::MO_PAGE);
+      MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+                                Flags | AArch64II::MO_PAGEOFF |
+                                    AArch64II::MO_NC);
+    }
+
+    transferImpOps(MI, MIB1, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case AArch64::MOVaddr:
+  case AArch64::MOVaddrJT:
+  case AArch64::MOVaddrCP:
+  case AArch64::MOVaddrBA:
+  case AArch64::MOVaddrTLS:
+  case AArch64::MOVaddrEXT: {
+    // Expand into ADRP + ADD.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
+            .addOperand(MI.getOperand(1));
+
+    MachineInstrBuilder MIB2 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+            .addOperand(MI.getOperand(0))
+            .addReg(DstReg)
+            .addOperand(MI.getOperand(2))
+            .addImm(0);
+
+    transferImpOps(MI, MIB1, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case AArch64::MOVi32imm:
+    return expandMOVImm(MBB, MBBI, 32);
+  case AArch64::MOVi64imm:
+    return expandMOVImm(MBB, MBBI, 64);
+  case AArch64::RET_ReallyLR:
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
+        .addReg(AArch64::LR);
+    MI.eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
+/// \brief Iterate over the instructions in basic block MBB and expand any
+/// pseudo instructions.  Return true if anything was modified.
+bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= expandMI(MBB, MBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF)
+    Modified |= expandMBB(MBB);
+  return Modified;
+}
+
+/// \brief Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createAArch64ExpandPseudoPass() {
+  return new AArch64ExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
new file mode 100644
index 0000000..2164d77
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -0,0 +1,1994 @@
+//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// AArch64GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+namespace {
+
+class AArch64FastISel : public FastISel {
+
+  class Address {
+  public:
+    typedef enum {
+      RegBase,
+      FrameIndexBase
+    } BaseKind;
+
+  private:
+    BaseKind Kind;
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+    int64_t Offset;
+
+  public:
+    Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index  access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+    void setOffset(int64_t O) { Offset = O; }
+    int64_t getOffset() { return Offset; }
+
+    bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
+  };
+
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+  LLVMContext *Context;
+
+private:
+  // Selection routines.
+  bool SelectLoad(const Instruction *I);
+  bool SelectStore(const Instruction *I);
+  bool SelectBranch(const Instruction *I);
+  bool SelectIndirectBr(const Instruction *I);
+  bool SelectCmp(const Instruction *I);
+  bool SelectSelect(const Instruction *I);
+  bool SelectFPExt(const Instruction *I);
+  bool SelectFPTrunc(const Instruction *I);
+  bool SelectFPToInt(const Instruction *I, bool Signed);
+  bool SelectIntToFP(const Instruction *I, bool Signed);
+  bool SelectRem(const Instruction *I, unsigned ISDOpcode);
+  bool SelectCall(const Instruction *I, const char *IntrMemName);
+  bool SelectIntrinsicCall(const IntrinsicInst &I);
+  bool SelectRet(const Instruction *I);
+  bool SelectTrunc(const Instruction *I);
+  bool SelectIntExt(const Instruction *I);
+  bool SelectMul(const Instruction *I);
+
+  // Utility helper routines.
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isLoadStoreTypeLegal(Type *Ty, MVT &VT);
+  bool ComputeAddress(const Value *Obj, Address &Addr);
+  bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
+                       bool UseUnscaled);
+  void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+                            unsigned Flags, bool UseUnscaled);
+  bool IsMemCpySmall(uint64_t Len, unsigned Alignment);
+  bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+                          unsigned Alignment);
+  // Emit functions.
+  bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt);
+  bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+                bool UseUnscaled = false);
+  bool EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 bool UseUnscaled = false);
+  unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+
+  unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned AArch64MaterializeGV(const GlobalValue *GV);
+
+  // Call handling routines.
+private:
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+  bool ProcessCallArgs(SmallVectorImpl<Value *> &Args,
+                       SmallVectorImpl<unsigned> &ArgRegs,
+                       SmallVectorImpl<MVT> &ArgVTs,
+                       SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                       SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+                       unsigned &NumBytes);
+  bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                  const Instruction *I, CallingConv::ID CC, unsigned &NumBytes);
+
+public:
+  // Backend specific FastISel code.
+  unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned TargetMaterializeConstant(const Constant *C) override;
+
+  explicit AArch64FastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo) {
+    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+    Context = &funcInfo.Fn->getContext();
+  }
+
+  bool TargetSelectInstruction(const Instruction *I) override;
+
+#include "AArch64GenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+#include "AArch64GenCallingConv.inc"
+
+CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+  if (CC == CallingConv::WebKit_JS)
+    return CC_AArch64_WebKit_JS;
+  return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
+}
+
+unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+  assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
+         "Alloca should always return a pointer.");
+
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI))
+    return 0;
+
+  DenseMap<const AllocaInst *, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            ResultReg)
+        .addFrameIndex(SI->second)
+        .addImm(0)
+        .addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  const APFloat Val = CFP->getValueAPF();
+  bool is64bit = (VT == MVT::f64);
+
+  // This checks to see if we can use FMOV instructions to materialize
+  // a constant, otherwise we have to materialize via the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm;
+    unsigned Opc;
+    if (is64bit) {
+      Imm = AArch64_AM::getFP64Imm(Val);
+      Opc = AArch64::FMOVDi;
+    } else {
+      Imm = AArch64_AM::getFP32Imm(Val);
+      Opc = AArch64::FMOVSi;
+    }
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+        .addImm(Imm);
+    return ResultReg;
+  }
+
+  // Materialize via constant pool.  MachineConstantPool wants an explicit
+  // alignment.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0)
+    Align = DL.getTypeAllocSize(CFP->getType());
+
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+          ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE);
+
+  unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui;
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(ADRPReg)
+      .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
+  // We can't handle thread-local variables quickly yet.
+  if (GV->isThreadLocal())
+    return 0;
+
+  // MachO still uses GOT for large code-model accesses, but ELF requires
+  // movz/movk sequences, which FastISel doesn't handle yet.
+  if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+    return 0;
+
+  unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+  EVT DestEVT = TLI.getValueType(GV->getType(), true);
+  if (!DestEVT.isSimple())
+    return 0;
+
+  unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+  unsigned ResultReg;
+
+  if (OpFlags & AArch64II::MO_GOT) {
+    // ADRP + LDRX
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+            ADRPReg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+    ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+            ResultReg)
+        .addReg(ADRPReg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+                          AArch64II::MO_NC);
+  } else {
+    // ADRP + ADDX
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+            ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+
+    ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            ResultReg)
+        .addReg(ADRPReg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+        .addImm(0);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  // FIXME: Handle ConstantInt.
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return AArch64MaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return AArch64MaterializeGV(GV);
+
+  return 0;
+}
+
+// Computes the address to get to an object.
+bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast: {
+    // Look through bitcasts.
+    return ComputeAddress(U->getOperand(0), Addr);
+  }
+  case Instruction::IntToPtr: {
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return ComputeAddress(U->getOperand(0), Addr);
+    break;
+  }
+  case Instruction::PtrToInt: {
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return ComputeAddress(U->getOperand(0), Addr);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    Address SavedAddr = Addr;
+    uint64_t TmpOffset = Addr.getOffset();
+
+    // Iterate through the GEP folding the constants into offsets where
+    // we can.
+    gep_type_iterator GTI = gep_type_begin(U);
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
+         ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+        TmpOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+        for (;;) {
+          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+            // Constant-offset addressing.
+            TmpOffset += CI->getSExtValue() * S;
+            break;
+          }
+          if (canFoldAddIntoGEP(U, Op)) {
+            // A compatible add with a constant operand. Fold the constant.
+            ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+            TmpOffset += CI->getSExtValue() * S;
+            // Iterate on the other operand.
+            Op = cast<AddOperator>(Op)->getOperand(0);
+            continue;
+          }
+          // Unsupported
+          goto unsupported_gep;
+        }
+      }
+    }
+
+    // Try to grab the base operand now.
+    Addr.setOffset(TmpOffset);
+    if (ComputeAddress(U->getOperand(0), Addr))
+      return true;
+
+    // We failed, restore everything and try the other options.
+    Addr = SavedAddr;
+
+  unsupported_gep:
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst *AI = cast<AllocaInst>(Obj);
+    DenseMap<const AllocaInst *, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      Addr.setKind(Address::FrameIndexBase);
+      Addr.setFI(SI->second);
+      return true;
+    }
+    break;
+  }
+  }
+
+  // Try to get this in a register if nothing else has worked.
+  if (!Addr.isValid())
+    Addr.setReg(getRegForValue(Obj));
+  return Addr.isValid();
+}
+
+bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple())
+    return false;
+  VT = evt.getSimpleVT();
+
+  // This is a legal type, but it's not something we handle in fast-isel.
+  if (VT == MVT::f128)
+    return false;
+
+  // Handle all other legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT))
+    return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now. For stores, this reflects truncation.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
+                                      int64_t ScaleFactor, bool UseUnscaled) {
+  bool needsLowering = false;
+  int64_t Offset = Addr.getOffset();
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    if (!UseUnscaled)
+      // Using scaled, 12-bit, unsigned immediate offsets.
+      needsLowering = ((Offset & 0xfff) != Offset);
+    else
+      // Using unscaled, 9-bit, signed immediate offsets.
+      needsLowering = (Offset > 256 || Offset < -256);
+    break;
+  }
+
+  //If this is a stack pointer and the offset needs to be simplified then put
+  // the alloca address into a register, set the base type back to register and
+  // continue. This should almost never happen.
+  if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
+    unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            ResultReg)
+        .addFrameIndex(Addr.getFI())
+        .addImm(0)
+        .addImm(0);
+    Addr.setKind(Address::RegBase);
+    Addr.setReg(ResultReg);
+  }
+
+  // Since the offset is too large for the load/store instruction get the
+  // reg+offset into a register.
+  if (needsLowering) {
+    uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor;
+    unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false,
+                                      UnscaledOffset, MVT::i64);
+    if (ResultReg == 0)
+      return false;
+    Addr.setReg(ResultReg);
+    Addr.setOffset(0);
+  }
+  return true;
+}
+
+void AArch64FastISel::AddLoadStoreOperands(Address &Addr,
+                                           const MachineInstrBuilder &MIB,
+                                           unsigned Flags, bool UseUnscaled) {
+  int64_t Offset = Addr.getOffset();
+  // Frame base works a bit differently. Handle it separately.
+  if (Addr.getKind() == Address::FrameIndexBase) {
+    int FI = Addr.getFI();
+    // FIXME: We shouldn't be using getObjectSize/getObjectAlignment.  The size
+    // and alignment should be based on the VT.
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(FI, Offset), Flags,
+        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    // Now add the rest of the operands.
+    MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+  } else {
+    // Now add the rest of the operands.
+    MIB.addReg(Addr.getReg());
+    MIB.addImm(Offset);
+  }
+}
+
+bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+                               bool UseUnscaled) {
+  // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+  // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+  if (!UseUnscaled && Addr.getOffset() < 0)
+    UseUnscaled = true;
+
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  bool VTIsi1 = false;
+  int64_t ScaleFactor = 0;
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+    VTIsi1 = true;
+  // Intentional fall-through.
+  case MVT::i8:
+    Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui;
+    RC = &AArch64::GPR32RegClass;
+    ScaleFactor = 1;
+    break;
+  case MVT::i16:
+    Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui;
+    RC = &AArch64::GPR32RegClass;
+    ScaleFactor = 2;
+    break;
+  case MVT::i32:
+    Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui;
+    RC = &AArch64::GPR32RegClass;
+    ScaleFactor = 4;
+    break;
+  case MVT::i64:
+    Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+    RC = &AArch64::GPR64RegClass;
+    ScaleFactor = 8;
+    break;
+  case MVT::f32:
+    Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui;
+    RC = TLI.getRegClassFor(VT);
+    ScaleFactor = 4;
+    break;
+  case MVT::f64:
+    Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui;
+    RC = TLI.getRegClassFor(VT);
+    ScaleFactor = 8;
+    break;
+  }
+  // Scale the offset.
+  if (!UseUnscaled) {
+    int64_t Offset = Addr.getOffset();
+    if (Offset & (ScaleFactor - 1))
+      // Retry using an unscaled, 9-bit, signed immediate offset.
+      return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true);
+
+    Addr.setOffset(Offset / ScaleFactor);
+  }
+
+  // Simplify this down to something we can handle.
+  if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
+    return false;
+
+  // Create the base instruction, then add the operands.
+  ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled);
+
+  // Loading an i1 requires special handling.
+  if (VTIsi1) {
+    MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass);
+    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+            ANDReg)
+        .addReg(ResultReg)
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+    ResultReg = ANDReg;
+  }
+  return true;
+}
+
+bool AArch64FastISel::SelectLoad(const Instruction *I) {
+  MVT VT;
+  // Verify we have a legal type before going any further.  Currently, we handle
+  // simple types that will directly fit in a register (i32/f32/i64/f64) or
+  // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+  if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg;
+  if (!EmitLoad(VT, ResultReg, Addr))
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+                                bool UseUnscaled) {
+  // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+  // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+  if (!UseUnscaled && Addr.getOffset() < 0)
+    UseUnscaled = true;
+
+  unsigned StrOpc;
+  bool VTIsi1 = false;
+  int64_t ScaleFactor = 0;
+  // Using scaled, 12-bit, unsigned immediate offsets.
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+    VTIsi1 = true;
+  case MVT::i8:
+    StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui;
+    ScaleFactor = 1;
+    break;
+  case MVT::i16:
+    StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui;
+    ScaleFactor = 2;
+    break;
+  case MVT::i32:
+    StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui;
+    ScaleFactor = 4;
+    break;
+  case MVT::i64:
+    StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui;
+    ScaleFactor = 8;
+    break;
+  case MVT::f32:
+    StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui;
+    ScaleFactor = 4;
+    break;
+  case MVT::f64:
+    StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui;
+    ScaleFactor = 8;
+    break;
+  }
+  // Scale the offset.
+  if (!UseUnscaled) {
+    int64_t Offset = Addr.getOffset();
+    if (Offset & (ScaleFactor - 1))
+      // Retry using an unscaled, 9-bit, signed immediate offset.
+      return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true);
+
+    Addr.setOffset(Offset / ScaleFactor);
+  }
+
+  // Simplify this down to something we can handle.
+  if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
+    return false;
+
+  // Storing an i1 requires special handling.
+  if (VTIsi1) {
+    MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+            ANDReg)
+        .addReg(SrcReg)
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+    SrcReg = ANDReg;
+  }
+  // Create the base instruction, then add the operands.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(StrOpc)).addReg(SrcReg);
+  AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled);
+  return true;
+}
+
+bool AArch64FastISel::SelectStore(const Instruction *I) {
+  MVT VT;
+  Value *Op0 = I->getOperand(0);
+  // Verify we have a legal type before going any further.  Currently, we handle
+  // simple types that will directly fit in a register (i32/f32/i64/f64) or
+  // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+  if (!isLoadStoreTypeLegal(Op0->getType(), VT) ||
+      cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Get the value to be stored into a register.
+  unsigned SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!EmitStore(VT, SrcReg, Addr))
+    return false;
+  return true;
+}
+
+static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
+  switch (Pred) {
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_UEQ:
+  default:
+    // AL is our "false" for now. The other two need more compares.
+    return AArch64CC::AL;
+  case CmpInst::ICMP_EQ:
+  case CmpInst::FCMP_OEQ:
+    return AArch64CC::EQ;
+  case CmpInst::ICMP_SGT:
+  case CmpInst::FCMP_OGT:
+    return AArch64CC::GT;
+  case CmpInst::ICMP_SGE:
+  case CmpInst::FCMP_OGE:
+    return AArch64CC::GE;
+  case CmpInst::ICMP_UGT:
+  case CmpInst::FCMP_UGT:
+    return AArch64CC::HI;
+  case CmpInst::FCMP_OLT:
+    return AArch64CC::MI;
+  case CmpInst::ICMP_ULE:
+  case CmpInst::FCMP_OLE:
+    return AArch64CC::LS;
+  case CmpInst::FCMP_ORD:
+    return AArch64CC::VC;
+  case CmpInst::FCMP_UNO:
+    return AArch64CC::VS;
+  case CmpInst::FCMP_UGE:
+    return AArch64CC::PL;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::FCMP_ULT:
+    return AArch64CC::LT;
+  case CmpInst::ICMP_SLE:
+  case CmpInst::FCMP_ULE:
+    return AArch64CC::LE;
+  case CmpInst::FCMP_UNE:
+  case CmpInst::ICMP_NE:
+    return AArch64CC::NE;
+  case CmpInst::ICMP_UGE:
+    return AArch64CC::HS;
+  case CmpInst::ICMP_ULT:
+    return AArch64CC::LO;
+  }
+}
+
+bool AArch64FastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+      // We may not handle every CC for now.
+      AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
+      if (CC == AArch64CC::AL)
+        return false;
+
+      // Emit the cmp.
+      if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+        return false;
+
+      // Emit the branch.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+          .addImm(CC)
+          .addMBB(TBB);
+      FuncInfo.MBB->addSuccessor(TBB);
+
+      FastEmitBranch(FBB, DbgLoc);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SrcVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+      unsigned CondReg = getRegForValue(TI->getOperand(0));
+      if (CondReg == 0)
+        return false;
+
+      // Issue an extract_subreg to get the lower 32-bits.
+      if (SrcVT == MVT::i64)
+        CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+                                             AArch64::sub_32);
+
+      MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+      unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::ANDWri), ANDReg)
+          .addReg(CondReg)
+          .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::SUBSWri))
+          .addReg(ANDReg)
+          .addReg(ANDReg)
+          .addImm(0)
+          .addImm(0);
+
+      unsigned CC = AArch64CC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CC = AArch64CC::EQ;
+      }
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+          .addImm(CC)
+          .addMBB(TBB);
+      FuncInfo.MBB->addSuccessor(TBB);
+      FastEmitBranch(FBB, DbgLoc);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+                 dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
+        .addMBB(Target);
+    FuncInfo.MBB->addSuccessor(Target);
+    return true;
+  }
+
+  unsigned CondReg = getRegForValue(BI->getCondition());
+  if (CondReg == 0)
+    return false;
+
+  // We've been divorced from our compare!  Our block was split, and
+  // now our compare lives in a predecessor block.  We musn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register.  Ergo, we test
+  // the one-bit value left in the virtual register.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
+          AArch64::WZR)
+      .addReg(CondReg)
+      .addImm(0)
+      .addImm(0);
+
+  unsigned CC = AArch64CC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CC = AArch64CC::EQ;
+  }
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+      .addImm(CC)
+      .addMBB(TBB);
+  FuncInfo.MBB->addSuccessor(TBB);
+  FastEmitBranch(FBB, DbgLoc);
+  return true;
+}
+
+bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
+  const IndirectBrInst *BI = cast<IndirectBrInst>(I);
+  unsigned AddrReg = getRegForValue(BI->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  // Emit the indirect branch.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
+      .addReg(AddrReg);
+
+  // Make sure the CFG is up-to-date.
+  for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]);
+
+  return true;
+}
+
+bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
+  Type *Ty = Src1Value->getType();
+  EVT SrcEVT = TLI.getValueType(Ty, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  uint64_t Imm;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+
+      Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
+      if (CIVal.isNegative()) {
+        isNegativeImm = true;
+        Imm = -Imm;
+      }
+      // FIXME: We can handle more immediates using shifts.
+      UseImm = ((Imm & 0xfff) == Imm);
+    }
+  } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+    if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+      if (ConstFP->isZero() && !ConstFP->isNegative())
+        UseImm = true;
+  }
+
+  unsigned ZReg;
+  unsigned CmpOpc;
+  bool isICmp = true;
+  bool needsExt = false;
+  switch (SrcVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+    needsExt = true;
+  // Intentional fall-through.
+  case MVT::i32:
+    ZReg = AArch64::WZR;
+    if (UseImm)
+      CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri;
+    else
+      CmpOpc = AArch64::SUBSWrr;
+    break;
+  case MVT::i64:
+    ZReg = AArch64::XZR;
+    if (UseImm)
+      CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri;
+    else
+      CmpOpc = AArch64::SUBSXrr;
+    break;
+  case MVT::f32:
+    isICmp = false;
+    CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr;
+    break;
+  case MVT::f64:
+    isICmp = false;
+    CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr;
+    break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(Src1Value);
+  if (SrcReg1 == 0)
+    return false;
+
+  unsigned SrcReg2;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(Src2Value);
+    if (SrcReg2 == 0)
+      return false;
+  }
+
+  // We have i1, i8, or i16, we need to either zero extend or sign extend.
+  if (needsExt) {
+    SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+    if (SrcReg1 == 0)
+      return false;
+    if (!UseImm) {
+      SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+      if (SrcReg2 == 0)
+        return false;
+    }
+  }
+
+  if (isICmp) {
+    if (UseImm)
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+          .addReg(ZReg)
+          .addReg(SrcReg1)
+          .addImm(Imm)
+          .addImm(0);
+    else
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+          .addReg(ZReg)
+          .addReg(SrcReg1)
+          .addReg(SrcReg2);
+  } else {
+    if (UseImm)
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+          .addReg(SrcReg1);
+    else
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+          .addReg(SrcReg1)
+          .addReg(SrcReg2);
+  }
+  return true;
+}
+
+bool AArch64FastISel::SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  // We may not handle every CC for now.
+  AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
+  if (CC == AArch64CC::AL)
+    return false;
+
+  // Emit the cmp.
+  if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  // Now set a register based on the comparison.
+  AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
+  unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+          ResultReg)
+      .addReg(AArch64::WZR)
+      .addReg(AArch64::WZR)
+      .addImm(invertedCC);
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::SelectSelect(const Instruction *I) {
+  const SelectInst *SI = cast<SelectInst>(I);
+
+  EVT DestEVT = TLI.getValueType(SI->getType(), true);
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT DestVT = DestEVT.getSimpleVT();
+  if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 &&
+      DestVT != MVT::f64)
+    return false;
+
+  unsigned CondReg = getRegForValue(SI->getCondition());
+  if (CondReg == 0)
+    return false;
+  unsigned TrueReg = getRegForValue(SI->getTrueValue());
+  if (TrueReg == 0)
+    return false;
+  unsigned FalseReg = getRegForValue(SI->getFalseValue());
+  if (FalseReg == 0)
+    return false;
+
+
+  MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+  unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+          ANDReg)
+      .addReg(CondReg)
+      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri))
+      .addReg(ANDReg)
+      .addReg(ANDReg)
+      .addImm(0)
+      .addImm(0);
+
+  unsigned SelectOpc;
+  switch (DestVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i32:
+    SelectOpc = AArch64::CSELWr;
+    break;
+  case MVT::i64:
+    SelectOpc = AArch64::CSELXr;
+    break;
+  case MVT::f32:
+    SelectOpc = AArch64::FCSELSrrr;
+    break;
+  case MVT::f64:
+    SelectOpc = AArch64::FCSELDrrr;
+    break;
+  }
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc),
+          ResultReg)
+      .addReg(TrueReg)
+      .addReg(FalseReg)
+      .addImm(AArch64CC::NE);
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::SelectFPExt(const Instruction *I) {
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
+    return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
+          ResultReg).addReg(Op);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
+    return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
+          ResultReg).addReg(Op);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// FPToUI and FPToSI
+bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+  MVT DestVT;
+  if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+    return false;
+
+  unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (SrcReg == 0)
+    return false;
+
+  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+  if (SrcVT == MVT::f128)
+    return false;
+
+  unsigned Opc;
+  if (SrcVT == MVT::f64) {
+    if (Signed)
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr;
+    else
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr;
+  } else {
+    if (Signed)
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr;
+    else
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr;
+  }
+  unsigned ResultReg = createResultReg(
+      DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+  MVT DestVT;
+  if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+    return false;
+  assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+          "Unexpected value type.");
+
+  unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (SrcReg == 0)
+    return false;
+
+  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+
+  // Handle sign-extension.
+  if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+    SrcReg =
+        EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+    if (SrcReg == 0)
+      return false;
+  }
+
+  MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass
+                                                  : &AArch64::GPR32RegClass);
+
+  unsigned Opc;
+  if (SrcVT == MVT::i64) {
+    if (Signed)
+      Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri;
+    else
+      Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri;
+  } else {
+    if (Signed)
+      Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri;
+    else
+      Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
+  }
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::ProcessCallArgs(
+    SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs,
+    SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+    SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+    unsigned &NumBytes) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+      .addImm(NumBytes);
+
+  // Process the args.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle arg promotion: SExt, ZExt, AExt.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
+      if (Arg == 0)
+        return false;
+      break;
+    }
+    case CCValAssign::AExt:
+    // Intentional fall-through.
+    case CCValAssign::ZExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
+      if (Arg == 0)
+        return false;
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // FIXME: Handle custom args.
+      return false;
+    } else {
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+
+      // Need to store on the stack.
+      unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(AArch64::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      if (!EmitStore(ArgVT, Arg, Addr))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                                 const Instruction *I, CallingConv::ID CC,
+                                 unsigned &NumBytes) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+      .addImm(NumBytes)
+      .addImm(0);
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(RVLocs[0].getLocReg());
+    UsedRegs.push_back(RVLocs[0].getLocReg());
+
+    // Finally update the result.
+    UpdateValueMap(I, ResultReg);
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::SelectCall(const Instruction *I,
+                                 const char *IntrMemName = nullptr) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Don't handle inline asm or intrinsics.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Only handle global variable Callees.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // Let SDISel handle vararg functions.
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  if (FTy->isVarArg())
+    return false;
+
+  // Handle *simple* calls for now.
+  MVT RetVT;
+  Type *RetTy = I->getType();
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value *, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(CS.arg_size());
+  ArgRegs.reserve(CS.arg_size());
+  ArgVTs.reserve(CS.arg_size());
+  ArgFlags.reserve(CS.arg_size());
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e - i <= 2)
+      break;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+        CS.paramHasAttr(AttrInd, Attribute::ByVal))
+      return false;
+
+    MVT ArgVT;
+    Type *ArgTy = (*i)->getType();
+    if (!isTypeLegal(ArgTy, ArgVT) &&
+        !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+      return false;
+
+    // We don't handle vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*i);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+    return false;
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
+  if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, 0);
+  else
+    MIB.addExternalSymbol(IntrMemName, 0);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
+    return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+  if (Alignment)
+    return Len / Alignment <= 4;
+  else
+    return Len < 32;
+}
+
+bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+                                         uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!IsMemCpySmall(Len, Alignment))
+    return false;
+
+  int64_t UnscaledOffset = 0;
+  Address OrigDest = Dest;
+  Address OrigSrc = Src;
+
+  while (Len) {
+    MVT VT;
+    if (!Alignment || Alignment >= 8) {
+      if (Len >= 8)
+        VT = MVT::i64;
+      else if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    } else {
+      // Bound based on alignment.
+      if (Len >= 4 && Alignment == 4)
+        VT = MVT::i32;
+      else if (Len >= 2 && Alignment == 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = EmitLoad(VT, ResultReg, Src);
+    if (!RV)
+      return false;
+
+    RV = EmitStore(VT, ResultReg, Dest);
+    if (!RV)
+      return false;
+
+    int64_t Size = VT.getSizeInBits() / 8;
+    Len -= Size;
+    UnscaledOffset += Size;
+
+    // We need to recompute the unscaled offset for each iteration.
+    Dest.setOffset(OrigDest.getOffset() + UnscaledOffset);
+    Src.setOffset(OrigSrc.getOffset() + UnscaledOffset);
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    // Don't handle volatile.
+    if (MTI.isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      unsigned Alignment = MTI.getAlignment();
+      if (IsMemCpySmall(Len, Alignment)) {
+        Address Dest, Src;
+        if (!ComputeAddress(MTI.getRawDest(), Dest) ||
+            !ComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI.getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+
+    if (!MSI.getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    return SelectCall(&I, "memset");
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+        .addImm(1);
+    return true;
+  }
+  }
+  return false;
+}
+
+bool AArch64FastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (F.isVarArg())
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    CallingConv::ID CC = F.getCallingConv();
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+                   I->getContext());
+    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+                                                     : RetCC_AArch64_AAPCS;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    EVT RVEVT = TLI.getValueType(RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    // Vectors (of > 1 lane) in big endian need tricky handling.
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+      return false;
+
+    MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      bool isZExt = Outs[0].Flags.isZExt();
+      SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt);
+      if (SrcReg == 0)
+        return false;
+    }
+
+    // Make the copy.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(AArch64::RET_ReallyLR));
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+bool AArch64FastISel::SelectTrunc(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Op = I->getOperand(0);
+  Type *SrcTy = Op->getType();
+
+  EVT SrcEVT = TLI.getValueType(SrcTy, true);
+  EVT DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i8)
+    return false;
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 &&
+      DestVT != MVT::i1)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg)
+    return false;
+
+  // If we're truncating from i64 to a smaller non-legal type then generate an
+  // AND.  Otherwise, we know the high bits are undefined and a truncate doesn't
+  // generate any code.
+  if (SrcVT == MVT::i64) {
+    uint64_t Mask = 0;
+    switch (DestVT.SimpleTy) {
+    default:
+      // Trunc i64 to i32 is handled by the target-independent fast-isel.
+      return false;
+    case MVT::i1:
+      Mask = 0x1;
+      break;
+    case MVT::i8:
+      Mask = 0xff;
+      break;
+    case MVT::i16:
+      Mask = 0xffff;
+      break;
+    }
+    // Issue an extract_subreg to get the lower 32-bits.
+    unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
+                                                AArch64::sub_32);
+    MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass);
+    // Create the AND instruction which performs the actual truncation.
+    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+            ANDReg)
+        .addReg(Reg32)
+        .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32));
+    SrcReg = ANDReg;
+  }
+
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+  assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
+          DestVT == MVT::i64) &&
+         "Unexpected value type.");
+  // Handle i8 and i16 as i32.
+  if (DestVT == MVT::i8 || DestVT == MVT::i16)
+    DestVT = MVT::i32;
+
+  if (isZExt) {
+    MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+    unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
+            ResultReg)
+        .addReg(SrcReg)
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+    if (DestVT == MVT::i64) {
+      // We're ZExt i1 to i64.  The ANDWri Wd, Ws, #1 implicitly clears the
+      // upper 32 bits.  Emit a SUBREG_TO_REG to extend from Wd to Xd.
+      unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::SUBREG_TO_REG), Reg64)
+          .addImm(0)
+          .addReg(ResultReg)
+          .addImm(AArch64::sub_32);
+      ResultReg = Reg64;
+    }
+    return ResultReg;
+  } else {
+    if (DestVT == MVT::i64) {
+      // FIXME: We're SExt i1 to i64.
+      return 0;
+    }
+    unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri),
+            ResultReg)
+        .addReg(SrcReg)
+        .addImm(0)
+        .addImm(0);
+    return ResultReg;
+  }
+}
+
+unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                     bool isZExt) {
+  assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
+
+  // FastISel does not have plumbing to deal with extensions where the SrcVT or
+  // DestVT are odd things, so test to make sure that they are both types we can
+  // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise
+  // bail out to SelectionDAG.
+  if (((DestVT != MVT::i8) && (DestVT != MVT::i16) &&
+       (DestVT != MVT::i32) && (DestVT != MVT::i64)) ||
+      ((SrcVT !=  MVT::i1) && (SrcVT !=  MVT::i8) &&
+       (SrcVT !=  MVT::i16) && (SrcVT !=  MVT::i32)))
+    return 0;
+
+  unsigned Opc;
+  unsigned Imm = 0;
+
+  switch (SrcVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1:
+    return Emiti1Ext(SrcReg, DestVT, isZExt);
+  case MVT::i8:
+    if (DestVT == MVT::i64)
+      Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    else
+      Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+    Imm = 7;
+    break;
+  case MVT::i16:
+    if (DestVT == MVT::i64)
+      Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    else
+      Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+    Imm = 15;
+    break;
+  case MVT::i32:
+    assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
+    Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    Imm = 31;
+    break;
+  }
+
+  // Handle i8 and i16 as i32.
+  if (DestVT == MVT::i8 || DestVT == MVT::i16)
+    DestVT = MVT::i32;
+  else if (DestVT == MVT::i64) {
+    unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Src64)
+        .addImm(0)
+        .addReg(SrcReg)
+        .addImm(AArch64::sub_32);
+    SrcReg = Src64;
+  }
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg)
+      .addImm(0)
+      .addImm(Imm);
+
+  return ResultReg;
+}
+
+bool AArch64FastISel::SelectIntExt(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.  The high bits for a type smaller than
+  // the register size are assumed to be undefined.
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool isZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  EVT SrcEVT = TLI.getValueType(SrcTy, true);
+  EVT DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+  unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+  if (ResultReg == 0)
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestEVT = TLI.getValueType(I->getType(), true);
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT DestVT = DestEVT.getSimpleVT();
+  if (DestVT != MVT::i64 && DestVT != MVT::i32)
+    return false;
+
+  unsigned DivOpc;
+  bool is64bit = (DestVT == MVT::i64);
+  switch (ISDOpcode) {
+  default:
+    return false;
+  case ISD::SREM:
+    DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+    break;
+  case ISD::UREM:
+    DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+    break;
+  }
+  unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+
+  unsigned Src1Reg = getRegForValue(I->getOperand(1));
+  if (!Src1Reg)
+    return false;
+
+  unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
+      .addReg(Src0Reg)
+      .addReg(Src1Reg);
+  // The remainder is computed as numerator - (quotient * denominator) using the
+  // MSUB instruction.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
+      .addReg(QuotReg)
+      .addReg(Src1Reg)
+      .addReg(Src0Reg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::SelectMul(const Instruction *I) {
+  EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  // Must be simple value type.  Don't handle vectors.
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i8)
+    return false;
+
+  unsigned Opc;
+  unsigned ZReg;
+  switch (SrcVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    ZReg = AArch64::WZR;
+    Opc = AArch64::MADDWrrr;
+    SrcVT = MVT::i32;
+    break;
+  case MVT::i64:
+    ZReg = AArch64::XZR;
+    Opc = AArch64::MADDXrrr;
+    break;
+  }
+
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+
+  unsigned Src1Reg = getRegForValue(I->getOperand(1));
+  if (!Src1Reg)
+    return false;
+
+  // Create the base instruction, then add the operands.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(Src0Reg)
+      .addReg(Src1Reg)
+      .addReg(ZReg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) {
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Load:
+    return SelectLoad(I);
+  case Instruction::Store:
+    return SelectStore(I);
+  case Instruction::Br:
+    return SelectBranch(I);
+  case Instruction::IndirectBr:
+    return SelectIndirectBr(I);
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return SelectCmp(I);
+  case Instruction::Select:
+    return SelectSelect(I);
+  case Instruction::FPExt:
+    return SelectFPExt(I);
+  case Instruction::FPTrunc:
+    return SelectFPTrunc(I);
+  case Instruction::FPToSI:
+    return SelectFPToInt(I, /*Signed=*/true);
+  case Instruction::FPToUI:
+    return SelectFPToInt(I, /*Signed=*/false);
+  case Instruction::SIToFP:
+    return SelectIntToFP(I, /*Signed=*/true);
+  case Instruction::UIToFP:
+    return SelectIntToFP(I, /*Signed=*/false);
+  case Instruction::SRem:
+    return SelectRem(I, ISD::SREM);
+  case Instruction::URem:
+    return SelectRem(I, ISD::UREM);
+  case Instruction::Call:
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+      return SelectIntrinsicCall(*II);
+    return SelectCall(I);
+  case Instruction::Ret:
+    return SelectRet(I);
+  case Instruction::Trunc:
+    return SelectTrunc(I);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return SelectIntExt(I);
+  case Instruction::Mul:
+    // FIXME: This really should be handled by the target-independent selector.
+    return SelectMul(I);
+  }
+  return false;
+  // Silence warnings.
+  (void)&CC_AArch64_DarwinPCS_VarArg;
+}
+
+namespace llvm {
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo,
+                                        const TargetLibraryInfo *libInfo) {
+  return new AArch64FastISel(funcInfo, libInfo);
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
new file mode 100644
index 0000000..9c33717
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -0,0 +1,892 @@
+//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64FrameLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "frame-info"
+
+static cl::opt<bool> EnableRedZone("aarch64-redzone",
+                                   cl::desc("enable use of redzone on AArch64"),
+                                   cl::init(false), cl::Hidden);
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+static unsigned estimateStackSize(MachineFunction &MF) {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  int Offset = 0;
+  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+    int FixedOff = -FFI->getObjectOffset(i);
+    if (FixedOff > Offset)
+      Offset = FixedOff;
+  }
+  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+    if (FFI->isDeadObjectIndex(i))
+      continue;
+    Offset += FFI->getObjectSize(i);
+    unsigned Align = FFI->getObjectAlignment(i);
+    // Adjust to alignment boundary
+    Offset = (Offset + Align - 1) / Align * Align;
+  }
+  // This does not include the 16 bytes used for fp and lr.
+  return (unsigned)Offset;
+}
+
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+  if (!EnableRedZone)
+    return false;
+  // Don't use the red zone if the function explicitly asks us not to.
+  // This is typically used for kernel code.
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::NoRedZone))
+    return false;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned NumBytes = AFI->getLocalStackSize();
+
+  // Note: currently hasFP() is always true for hasCalls(), but that's an
+  // implementation detail of the current code, not a strict requirement,
+  // so stay safe here and check both.
+  if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
+    return false;
+  return true;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  assert(!RegInfo->needsStackRealignment(MF) &&
+         "No stack realignment on AArch64!");
+#endif
+
+  return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  int Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = RoundUpToAlignment(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 24-bits
+      // because there's no guaranteed temporary register available.
+      //
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable.
+      // 1) For offset <= 12-bit, we use LSL #0
+      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+      // LSL #0, and the other uses LSL #12.
+      //
+      // Mostly call frames will be allocated at the start of a function so
+      // this is OK, but it is a limitation that needs dealing with.
+      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
+    assert(CalleePopAmount < 0xffffff && "call frame too large");
+    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
+                    TII);
+  }
+  MBB.erase(I);
+}
+
+void AArch64FrameLowering::emitCalleeSavedFrameMoves(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    unsigned FramePtr) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  if (CSI.empty())
+    return;
+
+  const DataLayout *TD = MF.getTarget().getDataLayout();
+  bool HasFP = hasFP(MF);
+
+  // Calculate amount of bytes used for return address storing.
+  int stackGrowth = -TD->getPointerSize(0);
+
+  // Calculate offsets.
+  int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
+  unsigned TotalSkipped = 0;
+  for (const auto &Info : CSI) {
+    unsigned Reg = Info.getReg();
+    int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
+                     getOffsetOfLocalArea() + saveAreaOffset;
+
+    // Don't output a new CFI directive if we're re-saving the frame pointer or
+    // link register. This happens when the PrologEpilogInserter has inserted an
+    // extra "STP" of the frame pointer and link register -- the "emitPrologue"
+    // method automatically generates the directives when frame pointers are
+    // used. If we generate CFI directives for the extra "STP"s, the linker will
+    // lose track of the correct values for the frame pointer and link register.
+    if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) {
+      TotalSkipped += stackGrowth;
+      continue;
+    }
+
+    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+    unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+        nullptr, DwarfReg, Offset - TotalSkipped));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+}
+
+void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *Fn = MF.getFunction();
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+  bool HasFP = hasFP(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  int NumBytes = (int)MFI->getStackSize();
+  if (!AFI->hasStackFrame()) {
+    assert(!HasFP && "unexpected function without stack frame but with FP");
+
+    // All of the stack allocation is for locals.
+    AFI->setLocalStackSize(NumBytes);
+
+    // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+    MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+
+    // REDZONE: If the stack size is less than 128 bytes, we don't need
+    // to actually allocate.
+    if (NumBytes && !canUseRedZone(MF)) {
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                      MachineInstr::FrameSetup);
+
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else if (NumBytes) {
+      ++NumRedZoneFunctions;
+    }
+
+    return;
+  }
+
+  // Only set up FP if we actually need to.
+  int FPOffset = 0;
+  if (HasFP) {
+    // First instruction must a) allocate the stack  and b) have an immediate
+    // that is a multiple of -2.
+    assert((MBBI->getOpcode() == AArch64::STPXpre ||
+            MBBI->getOpcode() == AArch64::STPDpre) &&
+           MBBI->getOperand(3).getReg() == AArch64::SP &&
+           MBBI->getOperand(4).getImm() < 0 &&
+           (MBBI->getOperand(4).getImm() & 1) == 0);
+
+    // Frame pointer is fp = sp - 16. Since the  STPXpre subtracts the space
+    // required for the callee saved register area we get the frame pointer
+    // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
+    FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8;
+    assert(FPOffset >= 0 && "Bad Framepointer Offset");
+  }
+
+  // Move past the saves of the callee-saved registers.
+  while (MBBI->getOpcode() == AArch64::STPXi ||
+         MBBI->getOpcode() == AArch64::STPDi ||
+         MBBI->getOpcode() == AArch64::STPXpre ||
+         MBBI->getOpcode() == AArch64::STPDpre) {
+    ++MBBI;
+    NumBytes -= 16;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+  if (HasFP) {
+    // Issue    sub fp, sp, FPOffset or
+    //          mov fp,sp          when FPOffset is zero.
+    // Note: All stores of callee-saved registers are marked as "FrameSetup".
+    // This code marks the instruction(s) that set the FP also.
+    emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
+                    MachineInstr::FrameSetup);
+  }
+
+  // All of the remaining stack allocations are for locals.
+  AFI->setLocalStackSize(NumBytes);
+
+  // Allocate space for the rest of the frame.
+  if (NumBytes) {
+    // If we're a leaf function, try using the red zone.
+    if (!canUseRedZone(MF))
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                      MachineInstr::FrameSetup);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value of the
+  // stack pointer is at this point. Any variable size objects will be allocated
+  // after this, so we can still use the base pointer to reference locals.
+  //
+  // FIXME: Clarify FrameSetup flags here.
+  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+  // needed.
+  //
+  if (RegInfo->hasBasePointer(MF))
+    TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
+
+  if (needsFrameMoves) {
+    const DataLayout *TD = MF.getTarget().getDataLayout();
+    const int StackGrowth = -TD->getPointerSize(0);
+    unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+    // An example of the prologue:
+    //
+    //     .globl __foo
+    //     .align 2
+    //  __foo:
+    // Ltmp0:
+    //     .cfi_startproc
+    //     .cfi_personality 155, ___gxx_personality_v0
+    // Leh_func_begin:
+    //     .cfi_lsda 16, Lexception33
+    //
+    //     stp  xa,bx, [sp, -#offset]!
+    //     ...
+    //     stp  x28, x27, [sp, #offset-32]
+    //     stp  fp, lr, [sp, #offset-16]
+    //     add  fp, sp, #offset - 16
+    //     sub  sp, sp, #1360
+    //
+    // The Stack:
+    //       +-------------------------------------------+
+    // 10000 | ........ | ........ | ........ | ........ |
+    // 10004 | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    // 10008 | ........ | ........ | ........ | ........ |
+    // 1000c | ........ | ........ | ........ | ........ |
+    //       +===========================================+
+    // 10010 |                X28 Register               |
+    // 10014 |                X28 Register               |
+    //       +-------------------------------------------+
+    // 10018 |                X27 Register               |
+    // 1001c |                X27 Register               |
+    //       +===========================================+
+    // 10020 |                Frame Pointer              |
+    // 10024 |                Frame Pointer              |
+    //       +-------------------------------------------+
+    // 10028 |                Link Register              |
+    // 1002c |                Link Register              |
+    //       +===========================================+
+    // 10030 | ........ | ........ | ........ | ........ |
+    // 10034 | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    // 10038 | ........ | ........ | ........ | ........ |
+    // 1003c | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    //
+    //     [sp] = 10030        ::    >>initial value<<
+    //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
+    //     fp = sp == 10020    ::  mov fp, sp
+    //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
+    //     sp == 10010         ::    >>final value<<
+    //
+    // The frame pointer (w29) points to address 10020. If we use an offset of
+    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+    // for w27, and -32 for w28:
+    //
+    //  Ltmp1:
+    //     .cfi_def_cfa w29, 16
+    //  Ltmp2:
+    //     .cfi_offset w30, -8
+    //  Ltmp3:
+    //     .cfi_offset w29, -16
+    //  Ltmp4:
+    //     .cfi_offset w27, -24
+    //  Ltmp5:
+    //     .cfi_offset w28, -32
+
+    if (HasFP) {
+      // Define the current CFA rule to use the provided FP.
+      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Record the location of the stored LR
+      unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Record the location of the stored FP
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else {
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    // Now emit the moves for whatever callee saved regs we have.
+    emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
+  }
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+  unsigned RtIdx = 0;
+  if (MI->getOpcode() == AArch64::LDPXpost ||
+      MI->getOpcode() == AArch64::LDPDpost)
+    RtIdx = 1;
+
+  if (MI->getOpcode() == AArch64::LDPXpost ||
+      MI->getOpcode() == AArch64::LDPDpost ||
+      MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) {
+    if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) ||
+        !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) ||
+        MI->getOperand(RtIdx + 2).getReg() != AArch64::SP)
+      return false;
+    return true;
+  }
+
+  return false;
+}
+
+void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  DebugLoc DL = MBBI->getDebugLoc();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  int NumBytes = MFI->getStackSize();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+  // Initial and residual are named for consitency with the prologue. Note that
+  // in the epilogue, the residual adjustment is executed first.
+  uint64_t ArgumentPopSize = 0;
+  if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) {
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+    // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments, this is
+    // calculated during LowerCall and consumed here...
+    ArgumentPopSize = StackAdjust.getImm();
+  } else {
+    // ... otherwise the amount to pop is *all* of the argument space,
+    // conveniently stored in the MachineFunctionInfo by
+    // LowerFormalArguments. This will, of course, be zero for the C calling
+    // convention.
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
+  }
+
+  // The stack frame should be like below,
+  //
+  //      ----------------------                     ---
+  //      |                    |                      |
+  //      | BytesInStackArgArea|              CalleeArgStackSize
+  //      | (NumReusableBytes) |                (of tail call)
+  //      |                    |                     ---
+  //      |                    |                      |
+  //      ---------------------|        ---           |
+  //      |                    |         |            |
+  //      |   CalleeSavedReg   |         |            |
+  //      | (NumRestores * 16) |         |            |
+  //      |                    |         |            |
+  //      ---------------------|         |         NumBytes
+  //      |                    |     StackSize  (StackAdjustUp)
+  //      |   LocalStackSize   |         |            |
+  //      | (covering callee   |         |            |
+  //      |       args)        |         |            |
+  //      |                    |         |            |
+  //      ----------------------        ---          ---
+  //
+  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+  //             = StackSize + ArgumentPopSize
+  //
+  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+  // it as the 2nd argument of AArch64ISD::TC_RETURN.
+  NumBytes += ArgumentPopSize;
+
+  unsigned NumRestores = 0;
+  // Move past the restores of the callee-saved registers.
+  MachineBasicBlock::iterator LastPopI = MBBI;
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  if (LastPopI != MBB.begin()) {
+    do {
+      ++NumRestores;
+      --LastPopI;
+    } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
+    if (!isCSRestore(LastPopI, CSRegs)) {
+      ++LastPopI;
+      --NumRestores;
+    }
+  }
+  NumBytes -= NumRestores * 16;
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  if (!hasFP(MF)) {
+    // If this was a redzone leaf function, we don't need to restore the
+    // stack pointer.
+    if (!canUseRedZone(MF))
+      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
+                      TII);
+    return;
+  }
+
+  // Restore the original stack pointer.
+  // FIXME: Rather than doing the math here, we should instead just use
+  // non-post-indexed loads for the restores if we aren't actually going to
+  // be able to save any instructions.
+  if (NumBytes || MFI->hasVarSizedObjects())
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+                    -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                              int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                 int FI,
+                                                 unsigned &FrameReg) const {
+  return resolveFrameIndexReference(MF, FI, FrameReg);
+}
+
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+                                                     int FI, unsigned &FrameReg,
+                                                     bool PreferFP) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  int FPOffset = MFI->getObjectOffset(FI) + 16;
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  // Use frame pointer to reference fixed objects. Use it for locals if
+  // there are VLAs (and thus the SP isn't reliable as a base).
+  // Make sure useFPForScavengingIndex() does the right thing for the emergency
+  // spill slot.
+  bool UseFP = false;
+  if (AFI->hasStackFrame()) {
+    // Note: Keeping the following as multiple 'if' statements rather than
+    // merging to a single expression for readability.
+    //
+    // Argument access should always use the FP.
+    if (isFixed) {
+      UseFP = hasFP(MF);
+    } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
+      // Use SP or FP, whichever gives us the best chance of the offset
+      // being in range for direct access. If the FPOffset is positive,
+      // that'll always be best, as the SP will be even further away.
+      // If the FPOffset is negative, we have to keep in mind that the
+      // available offset range for negative offsets is smaller than for
+      // positive ones. If we have variable sized objects, we're stuck with
+      // using the FP regardless, though, as the SP offset is unknown
+      // and we don't have a base pointer available. If an offset is
+      // available via the FP and the SP, use whichever is closest.
+      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
+          (FPOffset >= -256 && Offset > -FPOffset))
+        UseFP = true;
+    }
+  }
+
+  if (UseFP) {
+    FrameReg = RegInfo->getFrameRegister(MF);
+    return FPOffset;
+  }
+
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else {
+    FrameReg = AArch64::SP;
+    // If we're using the red zone for this function, the SP won't actually
+    // be adjusted, so the offsets will be negative. They're also all
+    // within range of the signed 9-bit immediate instructions.
+    if (canUseRedZone(MF))
+      Offset -= AFI->getLocalStackSize();
+  }
+
+  return Offset;
+}
+
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+  if (Reg != AArch64::LR)
+    return getKillRegState(true);
+
+  // LR maybe referred to later by an @llvm.returnaddress intrinsic.
+  bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
+  bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
+  return getKillRegState(LRKill);
+}
+
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned Count = CSI.size();
+  DebugLoc DL;
+  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  for (unsigned i = 0; i < Count; i += 2) {
+    unsigned idx = Count - i - 2;
+    unsigned Reg1 = CSI[idx].getReg();
+    unsigned Reg2 = CSI[idx + 1].getReg();
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the store
+    // pair instructions directly. Assert if we see anything otherwise.
+    //
+    // The order of the registers in the list is controlled by
+    // getCalleeSavedRegs(), so they will always be in-order, as well.
+    assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+           "Out of order callee saved regs!");
+    unsigned StrOpc;
+    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    // Issue sequence of non-sp increment and pi sp spills for cs regs. The
+    // first spill is a pre-increment that allocates the stack.
+    // For example:
+    //    stp     x22, x21, [sp, #-48]!   // addImm(-6)
+    //    stp     x20, x19, [sp, #16]    // addImm(+2)
+    //    stp     fp, lr, [sp, #32]      // addImm(+4)
+    // Rationale: This sequence saves uop updates compared to a sequence of
+    // pre-increment spills like stp xi,xj,[sp,#-16]!
+    // Note: Similar rational and sequence for restores in epilog.
+    if (AArch64::GPR64RegClass.contains(Reg1)) {
+      assert(AArch64::GPR64RegClass.contains(Reg2) &&
+             "Expected GPR64 callee-saved register pair!");
+      // For first spill use pre-increment store.
+      if (i == 0)
+        StrOpc = AArch64::STPXpre;
+      else
+        StrOpc = AArch64::STPXi;
+    } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+      assert(AArch64::FPR64RegClass.contains(Reg2) &&
+             "Expected FPR64 callee-saved register pair!");
+      // For first spill use pre-increment store.
+      if (i == 0)
+        StrOpc = AArch64::STPDpre;
+      else
+        StrOpc = AArch64::STPDi;
+    } else
+      llvm_unreachable("Unexpected callee saved register!");
+    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
+                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
+                 << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
+    // Compute offset: i = 0 => offset = -Count;
+    //                 i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
+    const int Offset = (i == 0) ? -Count : i;
+    assert((Offset >= -64 && Offset <= 63) &&
+           "Offset out of bounds for STP immediate");
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+    if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
+      MIB.addReg(AArch64::SP, RegState::Define);
+
+    MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
+        .addReg(Reg1, getPrologueDeath(MF, Reg1))
+        .addReg(AArch64::SP)
+        .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+  return true;
+}
+
+bool AArch64FrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned Count = CSI.size();
+  DebugLoc DL;
+  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  for (unsigned i = 0; i < Count; i += 2) {
+    unsigned Reg1 = CSI[i].getReg();
+    unsigned Reg2 = CSI[i + 1].getReg();
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the store
+    // pair instructions directly. Assert if we see anything otherwise.
+    assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
+           "Out of order callee saved regs!");
+    // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
+    // the last load is sp-pi post-increment and de-allocates the stack:
+    // For example:
+    //    ldp     fp, lr, [sp, #32]       // addImm(+4)
+    //    ldp     x20, x19, [sp, #16]     // addImm(+2)
+    //    ldp     x22, x21, [sp], #48     // addImm(+6)
+    // Note: see comment in spillCalleeSavedRegisters()
+    unsigned LdrOpc;
+
+    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    if (AArch64::GPR64RegClass.contains(Reg1)) {
+      assert(AArch64::GPR64RegClass.contains(Reg2) &&
+             "Expected GPR64 callee-saved register pair!");
+      if (i == Count - 2)
+        LdrOpc = AArch64::LDPXpost;
+      else
+        LdrOpc = AArch64::LDPXi;
+    } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+      assert(AArch64::FPR64RegClass.contains(Reg2) &&
+             "Expected FPR64 callee-saved register pair!");
+      if (i == Count - 2)
+        LdrOpc = AArch64::LDPDpost;
+      else
+        LdrOpc = AArch64::LDPDi;
+    } else
+      llvm_unreachable("Unexpected callee saved register!");
+    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
+                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
+                 << ", " << CSI[i + 1].getFrameIdx() << ")\n");
+
+    // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
+    // etc.
+    const int Offset = (i == Count - 2) ? Count : Count - i - 2;
+    assert((Offset >= -64 && Offset <= 63) &&
+           "Offset out of bounds for LDP immediate");
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+    if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
+      MIB.addReg(AArch64::SP, RegState::Define);
+
+    MIB.addReg(Reg2, getDefRegState(true))
+        .addReg(Reg1, getDefRegState(true))
+        .addReg(AArch64::SP)
+        .addImm(Offset); // [sp], #offset * 8  or [sp, #offset * 8]
+                         // where the factor * 8 is implicit
+  }
+  return true;
+}
+
+void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
+    MachineFunction &MF, RegScavenger *RS) const {
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  SmallVector<unsigned, 4> UnspilledCSGPRs;
+  SmallVector<unsigned, 4> UnspilledCSFPRs;
+
+  // The frame record needs to be created by saving the appropriate registers
+  if (hasFP(MF)) {
+    MRI->setPhysRegUsed(AArch64::FP);
+    MRI->setPhysRegUsed(AArch64::LR);
+  }
+
+  // Spill the BasePtr if it's used. Do this first thing so that the
+  // getCalleeSavedRegs() below will get the right answer.
+  if (RegInfo->hasBasePointer(MF))
+    MRI->setPhysRegUsed(RegInfo->getBaseRegister());
+
+  // If any callee-saved registers are used, the frame cannot be eliminated.
+  unsigned NumGPRSpilled = 0;
+  unsigned NumFPRSpilled = 0;
+  bool ExtraCSSpill = false;
+  bool CanEliminateFrame = true;
+  DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+  // Check pairs of consecutive callee-saved registers.
+  for (unsigned i = 0; CSRegs[i]; i += 2) {
+    assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
+
+    const unsigned OddReg = CSRegs[i];
+    const unsigned EvenReg = CSRegs[i + 1];
+    assert((AArch64::GPR64RegClass.contains(OddReg) &&
+            AArch64::GPR64RegClass.contains(EvenReg)) ^
+               (AArch64::FPR64RegClass.contains(OddReg) &&
+                AArch64::FPR64RegClass.contains(EvenReg)) &&
+           "Register class mismatch!");
+
+    const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
+    const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
+
+    // Early exit if none of the registers in the register pair is actually
+    // used.
+    if (!OddRegUsed && !EvenRegUsed) {
+      if (AArch64::GPR64RegClass.contains(OddReg)) {
+        UnspilledCSGPRs.push_back(OddReg);
+        UnspilledCSGPRs.push_back(EvenReg);
+      } else {
+        UnspilledCSFPRs.push_back(OddReg);
+        UnspilledCSFPRs.push_back(EvenReg);
+      }
+      continue;
+    }
+
+    unsigned Reg = AArch64::NoRegister;
+    // If only one of the registers of the register pair is used, make sure to
+    // mark the other one as used as well.
+    if (OddRegUsed ^ EvenRegUsed) {
+      // Find out which register is the additional spill.
+      Reg = OddRegUsed ? EvenReg : OddReg;
+      MRI->setPhysRegUsed(Reg);
+    }
+
+    DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
+    DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
+
+    assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
+            (RegInfo->getEncodingValue(OddReg) + 1 ==
+             RegInfo->getEncodingValue(EvenReg))) &&
+           "Register pair of non-adjacent registers!");
+    if (AArch64::GPR64RegClass.contains(OddReg)) {
+      NumGPRSpilled += 2;
+      // If it's not a reserved register, we can use it in lieu of an
+      // emergency spill slot for the register scavenger.
+      // FIXME: It would be better to instead keep looking and choose another
+      // unspilled register that isn't reserved, if there is one.
+      if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
+        ExtraCSSpill = true;
+    } else
+      NumFPRSpilled += 2;
+
+    CanEliminateFrame = false;
+  }
+
+  // FIXME: Set BigStack if any stack slot references may be out of range.
+  // For now, just conservatively guestimate based on unscaled indexing
+  // range. We'll end up allocating an unnecessary spill slot a lot, but
+  // realistically that's not a big deal at this stage of the game.
+  // The CSR spill slots have not been allocated yet, so estimateStackSize
+  // won't include them.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+  DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+  bool BigStack = (CFSize >= 256);
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+    AFI->setHasStackFrame(true);
+
+  // Estimate if we might need to scavenge a register at some point in order
+  // to materialize a stack offset. If so, either spill one additional
+  // callee-saved register or reserve a special spill slot to facilitate
+  // register scavenging. If we already spilled an extra callee-saved register
+  // above to keep the number of spills even, we don't need to do anything else
+  // here.
+  if (BigStack && !ExtraCSSpill) {
+
+    // If we're adding a register to spill here, we have to add two of them
+    // to keep the number of regs to spill even.
+    assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
+    unsigned Count = 0;
+    while (!UnspilledCSGPRs.empty() && Count < 2) {
+      unsigned Reg = UnspilledCSGPRs.back();
+      UnspilledCSGPRs.pop_back();
+      DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
+                   << " to get a scratch register.\n");
+      MRI->setPhysRegUsed(Reg);
+      ExtraCSSpill = true;
+      ++Count;
+    }
+
+    // If we didn't find an extra callee-saved register to spill, create
+    // an emergency spill slot.
+    if (!ExtraCSSpill) {
+      const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+      int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+      RS->addScavengingFrameIndex(FI);
+      DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+                   << " as the emergency spill slot.\n");
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
new file mode 100644
index 0000000..7686e6f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -0,0 +1,68 @@
+//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64_FRAMELOWERING_H
+#define AArch64_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class AArch64FrameLowering : public TargetFrameLowering {
+public:
+  explicit AArch64FrameLowering()
+      : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+                            false /*StackRealignable*/) {}
+
+  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 unsigned FramePtr) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg,
+                                 bool PreferFP = false) const;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  /// \brief Can this function use the red zone for local allocations.
+  bool canUseRedZone(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
new file mode 100644
index 0000000..3f49fab
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -0,0 +1,3033 @@
+//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AArch64 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-isel"
+
+//===--------------------------------------------------------------------===//
+/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+class AArch64DAGToDAGISel : public SelectionDAGISel {
+  AArch64TargetMachine &TM;
+
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  bool ForCodeSize;
+
+public:
+  explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
+                               CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+        ForCodeSize(false) {}
+
+  const char *getPassName() const override {
+    return "AArch64 Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+    ForCodeSize =
+        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                             Attribute::OptimizeForSize) ||
+        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  SDNode *Select(SDNode *Node) override;
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+
+  SDNode *SelectMLAV64LaneV128(SDNode *N);
+  SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, false, Reg, Shift);
+  }
+  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, true, Reg, Shift);
+  }
+  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 16, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
+  }
+
+  template<int Width>
+  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }
+
+  template<int Width>
+  bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }
+
+
+  /// Form sequences of consecutive 64/128-bit registers for use in NEON
+  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+  /// between 1 and 4 elements. If it contains a single element that is returned
+  /// unchanged; otherwise a REG_SEQUENCE value is returned.
+  SDValue createDTuple(ArrayRef<SDValue> Vecs);
+  SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+  /// Generic helper for the createDTuple/createQTuple
+  /// functions. Those should almost always be called instead.
+  SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
+                      unsigned SubRegs[]);
+
+  SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+
+  SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
+
+  SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                     unsigned SubRegIdx);
+  SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                         unsigned SubRegIdx);
+  SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  SDNode *SelectBitfieldExtractOp(SDNode *N);
+  SDNode *SelectBitfieldInsertOp(SDNode *N);
+
+  SDNode *SelectLIBM(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+private:
+  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+                             SDValue &Shift);
+  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+                             SDValue &OffImm);
+  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+                              SDValue &OffImm);
+  bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool isWorthFolding(SDValue V) const;
+  bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
+                         SDValue &Offset, SDValue &SignExtend);
+
+  template<unsigned RegWidth>
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+  }
+
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
+};
+} // end anonymous namespace
+
+/// isIntImmediate - This method tests to see if the node is a constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
+    Imm = C->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isIntImmediate - This method tests to see if a constant operand.
+// If so Imm will receive the value.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+  return isIntImmediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  return N->getOpcode() == Opc &&
+         isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  // Require the address to be in a register.  That is safe for all AArch64
+  // variants and it is hard to do anything much smarter without knowing
+  // how the operand is used.
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12.  If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+                                           SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+  unsigned ShiftAmt;
+
+  if (Immed >> 12 == 0) {
+    ShiftAmt = 0;
+  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+    ShiftAmt = 12;
+    Immed = Immed >> 12;
+  } else
+    return false;
+
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+  return true;
+}
+
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+                                              SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  // The immediate operand must be a 24-bit zero-extended immediate.
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+  // have the opposite effect on the C flag, so this pattern mustn't match under
+  // those circumstances.
+  if (Immed == 0)
+    return false;
+
+  if (N.getValueType() == MVT::i32)
+    Immed = ~((uint32_t)Immed) + 1;
+  else
+    Immed = ~Immed + 1ULL;
+  if (Immed & 0xFFFFFFFFFF000000ULL)
+    return false;
+
+  Immed &= 0xFFFFFFULL;
+  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+}
+
+/// getShiftTypeForNode - Translate a shift node to the corresponding
+/// ShiftType value.
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
+  switch (N.getOpcode()) {
+  default:
+    return AArch64_AM::InvalidShiftExtend;
+  case ISD::SHL:
+    return AArch64_AM::LSL;
+  case ISD::SRL:
+    return AArch64_AM::LSR;
+  case ISD::SRA:
+    return AArch64_AM::ASR;
+  case ISD::ROTR:
+    return AArch64_AM::ROR;
+  }
+}
+
+/// \brief Determine wether it is worth to fold V into an extended register.
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+  // it hurts if the a value is used at least twice, unless we are optimizing
+  // for code size.
+  if (ForCodeSize || V.hasOneUse())
+    return true;
+  return false;
+}
+
+/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
+/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not.  The AllowROR parameter specifies whether ROR is
+/// supported.
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+                                                SDValue &Reg, SDValue &Shift) {
+  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+  if (ShType == AArch64_AM::InvalidShiftExtend)
+    return false;
+  if (!AllowROR && ShType == AArch64_AM::ROR)
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+    unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
+
+    Reg = N.getOperand(0);
+    Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+    return isWorthFolding(N);
+  }
+
+  return false;
+}
+
+/// getExtendTypeForNode - Translate an extend node to the corresponding
+/// ExtendType value.
+static AArch64_AM::ShiftExtendType
+getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
+  if (N.getOpcode() == ISD::SIGN_EXTEND ||
+      N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT SrcVT;
+    if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
+      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
+    else
+      SrcVT = N.getOperand(0).getValueType();
+
+    if (!IsLoadStore && SrcVT == MVT::i8)
+      return AArch64_AM::SXTB;
+    else if (!IsLoadStore && SrcVT == MVT::i16)
+      return AArch64_AM::SXTH;
+    else if (SrcVT == MVT::i32)
+      return AArch64_AM::SXTW;
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+    return AArch64_AM::InvalidShiftExtend;
+  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
+             N.getOpcode() == ISD::ANY_EXTEND) {
+    EVT SrcVT = N.getOperand(0).getValueType();
+    if (!IsLoadStore && SrcVT == MVT::i8)
+      return AArch64_AM::UXTB;
+    else if (!IsLoadStore && SrcVT == MVT::i16)
+      return AArch64_AM::UXTH;
+    else if (SrcVT == MVT::i32)
+      return AArch64_AM::UXTW;
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+    return AArch64_AM::InvalidShiftExtend;
+  } else if (N.getOpcode() == ISD::AND) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return AArch64_AM::InvalidShiftExtend;
+    uint64_t AndMask = CSD->getZExtValue();
+
+    switch (AndMask) {
+    default:
+      return AArch64_AM::InvalidShiftExtend;
+    case 0xFF:
+      return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
+    case 0xFFFF:
+      return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
+    case 0xFFFFFFFF:
+      return AArch64_AM::UXTW;
+    }
+  }
+
+  return AArch64_AM::InvalidShiftExtend;
+}
+
+// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
+  if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
+      DL->getOpcode() != AArch64ISD::DUPLANE32)
+    return false;
+
+  SDValue SV = DL->getOperand(0);
+  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return false;
+
+  SDValue EV = SV.getOperand(1);
+  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+
+  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
+  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
+  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
+  LaneOp = EV.getOperand(0);
+
+  return true;
+}
+
+// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
+// high lane extract.
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
+                             SDValue &LaneOp, int &LaneIdx) {
+
+  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
+    std::swap(Op0, Op1);
+    if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
+      return false;
+  }
+  StdOp = Op1;
+  return true;
+}
+
+/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
+/// is a lane in the upper half of a 128-bit vector.  Recognize and select this
+/// so that we don't emit unnecessary lane extracts.
+SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
+  SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
+  int LaneIdx = -1; // Will hold the lane index.
+
+  if (Op1.getOpcode() != ISD::MUL ||
+      !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+                        LaneIdx)) {
+    std::swap(Op0, Op1);
+    if (Op1.getOpcode() != ISD::MUL ||
+        !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+                          LaneIdx))
+      return nullptr;
+  }
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+
+  SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
+
+  unsigned MLAOpc = ~0U;
+
+  switch (N->getSimpleValueType(0).SimpleTy) {
+  default:
+    llvm_unreachable("Unrecognized MLA.");
+  case MVT::v4i16:
+    MLAOpc = AArch64::MLAv4i16_indexed;
+    break;
+  case MVT::v8i16:
+    MLAOpc = AArch64::MLAv8i16_indexed;
+    break;
+  case MVT::v2i32:
+    MLAOpc = AArch64::MLAv2i32_indexed;
+    break;
+  case MVT::v4i32:
+    MLAOpc = AArch64::MLAv4i32_indexed;
+    break;
+  }
+
+  return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+  SDValue SMULLOp0;
+  SDValue SMULLOp1;
+  int LaneIdx;
+
+  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
+                        LaneIdx))
+    return nullptr;
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+
+  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
+
+  unsigned SMULLOpc = ~0U;
+
+  if (IntNo == Intrinsic::aarch64_neon_smull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::SMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::SMULLv2i32_indexed;
+      break;
+    }
+  } else if (IntNo == Intrinsic::aarch64_neon_umull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::UMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::UMULLv2i32_indexed;
+      break;
+    }
+  } else
+    llvm_unreachable("Unrecognized intrinsic.");
+
+  return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+/// Instructions that accept extend modifiers like UXTW expect the register
+/// being extended to be a GPR32, but the incoming DAG might be acting on a
+/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
+/// this is the case.
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+  if (N.getValueType() == MVT::i32)
+    return N;
+
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                               SDLoc(N), MVT::i32, N, SubReg);
+  return SDValue(Node, 0);
+}
+
+
+/// SelectArithExtendedRegister - Select a "extended register" operand.  This
+/// operand folds in an extend followed by an optional left shift.
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+                                                      SDValue &Shift) {
+  unsigned ShiftVal = 0;
+  AArch64_AM::ShiftExtendType Ext;
+
+  if (N.getOpcode() == ISD::SHL) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return false;
+    ShiftVal = CSD->getZExtValue();
+    if (ShiftVal > 4)
+      return false;
+
+    Ext = getExtendTypeForNode(N.getOperand(0));
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0).getOperand(0);
+  } else {
+    Ext = getExtendTypeForNode(N);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0);
+  }
+
+  // AArch64 mandates that the RHS of the operation must use the smallest
+  // register classs that could contain the size being extended from.  Thus,
+  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+  // there might not be an actual 32-bit value in the program.  We can
+  // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
+  assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
+  Reg = narrowIfNeeded(CurDAG, Reg);
+  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
+  return isWorthFolding(N);
+}
+
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address.  The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
+                                              SDValue &Base, SDValue &OffImm) {
+  const TargetLowering *TLI = getTargetLowering();
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+    OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+    return true;
+  }
+
+  if (N.getOpcode() == AArch64ISD::ADDlow) {
+    GlobalAddressSDNode *GAN =
+        dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
+    Base = N.getOperand(0);
+    OffImm = N.getOperand(1);
+    if (!GAN)
+      return true;
+
+    const GlobalValue *GV = GAN->getGlobal();
+    unsigned Alignment = GV->getAlignment();
+    const DataLayout *DL = TLI->getDataLayout();
+    Type *Ty = GV->getType()->getElementType();
+    if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin())
+      Alignment = DL->getABITypeAlignment(Ty);
+
+    if (Alignment >= Size)
+      return true;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int64_t RHSC = (int64_t)RHS->getZExtValue();
+      unsigned Scale = Log2_32(Size);
+      if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64);
+        return true;
+      }
+    }
+  }
+
+  // Before falling back to our general case, check if the unscaled
+  // instructions can handle this. If so, that's preferable.
+  if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
+    return false;
+
+  // Base only. The address will be materialized into a register before
+  // the memory is accessed.
+  //    add x0, Xbase, #offset
+  //    ldr x0, [x0]
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+  return true;
+}
+
+/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
+/// immediate" address.  This should only match when there is an offset that
+/// is not valid for a scaled immediate addressing mode.  The "Size" argument
+/// is the size in bytes of the memory reference, which is needed here to know
+/// what is valid for a scaled immediate.
+bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
+                                                 SDValue &Base,
+                                                 SDValue &OffImm) {
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int64_t RHSC = RHS->getSExtValue();
+    // If the offset is valid as a scaled immediate, don't match here.
+    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
+        RHSC < (0x1000 << Log2_32(Size)))
+      return false;
+    if (RHSC >= -256 && RHSC < 256) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        const TargetLowering *TLI = getTargetLowering();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64);
+      return true;
+    }
+  }
+  return false;
+}
+
+static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  SDValue ImpDef = SDValue(
+      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64),
+      0);
+  MachineSDNode *Node = CurDAG->getMachineNode(
+      TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg);
+  return SDValue(Node, 0);
+}
+
+/// \brief Check if the given SHL node (\p N), can be used to form an
+/// extended register for an addressing mode.
+bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
+                                            bool WantExtend, SDValue &Offset,
+                                            SDValue &SignExtend) {
+  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
+  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
+    return false;
+
+  if (WantExtend) {
+    AArch64_AM::ShiftExtendType Ext =
+        getExtendTypeForNode(N.getOperand(0), true);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+  } else {
+    Offset = N.getOperand(0);
+    SignExtend = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+
+  unsigned LegalShiftVal = Log2_32(Size);
+  unsigned ShiftVal = CSD->getZExtValue();
+
+  if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
+    return false;
+
+  if (isWorthFolding(N))
+    return true;
+
+  return false;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
+                                            SDValue &Base, SDValue &Offset,
+                                            SDValue &SignExtend,
+                                            SDValue &DoShift) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+  SDValue LHS = N.getOperand(0);
+  SDValue RHS = N.getOperand(1);
+
+  // We don't want to match immediate adds here, because they are better lowered
+  // to the register-immediate addressing modes.
+  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = N.getNode();
+  for (SDNode *UI : Node->uses()) {
+    if (!isa<MemSDNode>(*UI))
+      return false;
+  }
+
+  // Remember if it is worth folding N when it produces extended register.
+  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+  // Try to match a shifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
+    Base = LHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // Try to match a shifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
+    Base = RHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // There was no shift, whatever else we find.
+  DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+
+  AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
+  // Try to match an unshifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding &&
+      (Ext = getExtendTypeForNode(LHS, true)) !=
+          AArch64_AM::InvalidShiftExtend) {
+    Base = RHS;
+    Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    if (isWorthFolding(LHS))
+      return true;
+  }
+
+  // Try to match an unshifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding &&
+      (Ext = getExtendTypeForNode(RHS, true)) !=
+          AArch64_AM::InvalidShiftExtend) {
+    Base = LHS;
+    Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    if (isWorthFolding(RHS))
+      return true;
+  }
+
+  return false;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
+                                            SDValue &Base, SDValue &Offset,
+                                            SDValue &SignExtend,
+                                            SDValue &DoShift) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+  SDValue LHS = N.getOperand(0);
+  SDValue RHS = N.getOperand(1);
+
+  // We don't want to match immediate adds here, because they are better lowered
+  // to the register-immediate addressing modes.
+  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = N.getNode();
+  for (SDNode *UI : Node->uses()) {
+    if (!isa<MemSDNode>(*UI))
+      return false;
+  }
+
+  // Remember if it is worth folding N when it produces extended register.
+  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+  // Try to match a shifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
+    Base = LHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // Try to match a shifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
+    Base = RHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // Match any non-shifted, non-extend, non-immediate add expression.
+  Base = LHS;
+  Offset = RHS;
+  SignExtend = CurDAG->getTargetConstant(false, MVT::i32);
+  DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+  // Reg1 + Reg2 is free: no check needed.
+  return true;
+}
+
+SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+  static unsigned RegClassIDs[] = {
+      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+  static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1,
+                                AArch64::dsub2, AArch64::dsub3 };
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+  static unsigned RegClassIDs[] = {
+      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+  static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1,
+                                AArch64::qsub2, AArch64::qsub3 };
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+                                         unsigned RegClassIDs[],
+                                         unsigned SubRegs[]) {
+  // There's no special register-class for a vector-list of 1 element: it's just
+  // a vector.
+  if (Regs.size() == 1)
+    return Regs[0];
+
+  assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+  SDLoc DL(Regs[0].getNode());
+
+  SmallVector<SDValue, 4> Ops;
+
+  // First operand of REG_SEQUENCE is the desired RegClass.
+  Ops.push_back(
+      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
+
+  // Then we get pairs of source & subregister-position for the components.
+  for (unsigned i = 0; i < Regs.size(); ++i) {
+    Ops.push_back(Regs[i]);
+    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
+  }
+
+  SDNode *N =
+      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+  return SDValue(N, 0);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
+                                         unsigned Opc, bool isExt) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  unsigned ExtOff = isExt;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  unsigned Vec0Off = ExtOff + 1;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
+                               N->op_begin() + Vec0Off + NumVecs);
+  SDValue RegSeq = createQTuple(Regs);
+
+  SmallVector<SDValue, 6> Ops;
+  if (isExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
+  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->isUnindexed())
+    return nullptr;
+  EVT VT = LD->getMemoryVT();
+  EVT DstVT = N->getValueType(0);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+
+  // We're not doing validity checking here. That was done when checking
+  // if we should mark the load as indexed or not. We're just selecting
+  // the right instruction.
+  unsigned Opcode = 0;
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  bool InsertTo64 = false;
+  if (VT == MVT::i64)
+    Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
+  else if (VT == MVT::i32) {
+    if (ExtType == ISD::NON_EXTLOAD)
+      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+    else if (ExtType == ISD::SEXTLOAD)
+      Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
+    else {
+      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+      InsertTo64 = true;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::i16) {
+    if (ExtType == ISD::SEXTLOAD) {
+      if (DstVT == MVT::i64)
+        Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
+      else
+        Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
+    } else {
+      Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
+      InsertTo64 = DstVT == MVT::i64;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::i8) {
+    if (ExtType == ISD::SEXTLOAD) {
+      if (DstVT == MVT::i64)
+        Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
+      else
+        Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
+    } else {
+      Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
+      InsertTo64 = DstVT == MVT::i64;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::f32) {
+    Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
+  } else if (VT == MVT::f64 || VT.is64BitVector()) {
+    Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
+  } else if (VT.is128BitVector()) {
+    Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
+  } else
+    return nullptr;
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+  int OffsetVal = (int)OffsetOp->getZExtValue();
+  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64);
+  SDValue Ops[] = { Base, Offset, Chain };
+  SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT,
+                                       MVT::Other, Ops);
+  // Either way, we're replacing the node, so tell the caller that.
+  Done = true;
+  SDValue LoadedVal = SDValue(Res, 1);
+  if (InsertTo64) {
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+    LoadedVal =
+        SDValue(CurDAG->getMachineNode(
+                    AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64,
+                    CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg),
+                0);
+  }
+
+  ReplaceUses(SDValue(N, 0), LoadedVal);
+  ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
+  ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
+
+  return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
+                                        unsigned Opc, unsigned SubRegIdx) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(2)); // Mem operand;
+  Ops.push_back(Chain);
+
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
+
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Ld, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i),
+        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+  return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+                                            unsigned Opc, unsigned SubRegIdx) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(1)); // Mem operand
+  Ops.push_back(N->getOperand(2)); // Incremental
+  Ops.push_back(Chain);
+
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::i64); // Type of the write back register
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
+
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Update uses of write back register
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+  // Update uses of vector list
+  SDValue SuperReg = SDValue(Ld, 1);
+  if (NumVecs == 1)
+    ReplaceUses(SDValue(N, 0), SuperReg);
+  else
+    for (unsigned i = 0; i < NumVecs; ++i)
+      ReplaceUses(SDValue(N, i),
+          CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+  // Update the chain
+  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+  return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+                                         unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+
+  // Form a REG_SEQUENCE to force register allocation.
+  bool Is128Bit = VT.getSizeInBits() == 128;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + 2));
+  Ops.push_back(N->getOperand(0));
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+  return St;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+                                             unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  SmallVector<EVT, 2> ResTys;
+  ResTys.push_back(MVT::i64);   // Type of the write back register
+  ResTys.push_back(MVT::Other); // Type for the Chain
+
+  // Form a REG_SEQUENCE to force register allocation.
+  bool Is128Bit = VT.getSizeInBits() == 128;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + 1)); // base register
+  Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental
+  Ops.push_back(N->getOperand(0)); // Chain
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  return St;
+}
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+class WidenVector {
+  SelectionDAG &DAG;
+
+public:
+  WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
+
+  SDValue operator()(SDValue V64Reg) {
+    EVT VT = V64Reg.getValueType();
+    unsigned NarrowSize = VT.getVectorNumElements();
+    MVT EltTy = VT.getVectorElementType().getSimpleVT();
+    MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+    SDLoc DL(V64Reg);
+
+    SDValue Undef =
+        SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
+    return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
+  }
+};
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+  EVT VT = V128Reg.getValueType();
+  unsigned WideSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+
+  return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
+                                    V128Reg);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+                                            unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+  Ops.push_back(N->getOperand(NumVecs + 3));
+  Ops.push_back(N->getOperand(0));
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Ld, 0);
+
+  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+  static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
+                              AArch64::qsub3 };
+  for (unsigned i = 0; i < NumVecs; ++i) {
+    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
+    if (Narrow)
+      NV = NarrowVector(NV, *CurDAG);
+    ReplaceUses(SDValue(N, i), NV);
+  }
+
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+
+  return Ld;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+                                                unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::i64); // Type of the write back register
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number
+  Ops.push_back(N->getOperand(NumVecs + 2)); // Base register
+  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
+  Ops.push_back(N->getOperand(0));
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Update uses of the write back register
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+  // Update uses of the vector list
+  SDValue SuperReg = SDValue(Ld, 1);
+  if (NumVecs == 1) {
+    ReplaceUses(SDValue(N, 0),
+                Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
+  } else {
+    EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+    static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
+                                AArch64::qsub3 };
+    for (unsigned i = 0; i < NumVecs; ++i) {
+      SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
+                                                  SuperReg);
+      if (Narrow)
+        NV = NarrowVector(NV, *CurDAG);
+      ReplaceUses(SDValue(N, i), NV);
+    }
+  }
+
+  // Update the Chain
+  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+
+  return Ld;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+                                             unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+  Ops.push_back(N->getOperand(NumVecs + 3));
+  Ops.push_back(N->getOperand(0));
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  return St;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+                                                 unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  SmallVector<EVT, 2> ResTys;
+  ResTys.push_back(MVT::i64);   // Type of the write back register
+  ResTys.push_back(MVT::Other);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+  Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register
+  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
+  Ops.push_back(N->getOperand(0));
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  return St;
+}
+
+static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
+                                       unsigned &Opc, SDValue &Opd0,
+                                       unsigned &LSB, unsigned &MSB,
+                                       unsigned NumberOfIgnoredLowBits,
+                                       bool BiggerPattern) {
+  assert(N->getOpcode() == ISD::AND &&
+         "N must be a AND operation to call this function");
+
+  EVT VT = N->getValueType(0);
+
+  // Here we can test the type of VT and return false when the type does not
+  // match, but since it is done prior to that call in the current context
+  // we turned that into an assert to avoid redundant code.
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  // FIXME: simplify-demanded-bits in DAGCombine will probably have
+  // changed the AND node to a 32-bit mask operation. We'll have to
+  // undo that as part of the transform here if we want to catch all
+  // the opportunities.
+  // Currently the NumberOfIgnoredLowBits argument helps to recover
+  // form these situations when matching bigger pattern (bitfield insert).
+
+  // For unsigned extracts, check for a shift right and mask
+  uint64_t And_imm = 0;
+  if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
+    return false;
+
+  const SDNode *Op0 = N->getOperand(0).getNode();
+
+  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
+  // simplified. Try to undo that
+  And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
+
+  // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+  if (And_imm & (And_imm + 1))
+    return false;
+
+  bool ClampMSB = false;
+  uint64_t Srl_imm = 0;
+  // Handle the SRL + ANY_EXTEND case.
+  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
+      isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
+    // Extend the incoming operand of the SRL to 64-bit.
+    Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
+    // Make sure to clamp the MSB so that we preserve the semantics of the
+    // original operations.
+    ClampMSB = true;
+  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+                                   Srl_imm)) {
+    // If the shift result was truncated, we can still combine them.
+    Opd0 = Op0->getOperand(0).getOperand(0);
+
+    // Use the type of SRL node.
+    VT = Opd0->getValueType(0);
+  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
+    Opd0 = Op0->getOperand(0);
+  } else if (BiggerPattern) {
+    // Let's pretend a 0 shift right has been performed.
+    // The resulting code will be at least as good as the original one
+    // plus it may expose more opportunities for bitfield insert pattern.
+    // FIXME: Currently we limit this to the bigger pattern, because
+    // some optimizations expect AND and not UBFM
+    Opd0 = N->getOperand(0);
+  } else
+    return false;
+
+  assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
+         "bad amount in shift node!");
+
+  LSB = Srl_imm;
+  MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
+                                  : CountTrailingOnes_64(And_imm)) -
+        1;
+  if (ClampMSB)
+    // Since we're moving the extend before the right shift operation, we need
+    // to clamp the MSB to make sure we don't shift in undefined bits instead of
+    // the zeros which would get shifted in with the original right shift
+    // operation.
+    MSB = MSB > 31 ? 31 : MSB;
+
+  Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+  return true;
+}
+
+static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+                                     unsigned &LSB, unsigned &MSB) {
+  // We are looking for the following pattern which basically extracts a single
+  // bit from the source value and places it in the LSB of the destination
+  // value, all other bits of the destination value or set to zero:
+  //
+  // Value2 = AND Value, MaskImm
+  // SRL Value2, ShiftImm
+  //
+  // with MaskImm >> ShiftImm == 1.
+  //
+  // This gets selected into a single UBFM:
+  //
+  // UBFM Value, ShiftImm, ShiftImm
+  //
+
+  if (N->getOpcode() != ISD::SRL)
+    return false;
+
+  uint64_t And_mask = 0;
+  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
+    return false;
+
+  Opd0 = N->getOperand(0).getOperand(0);
+
+  uint64_t Srl_imm = 0;
+  if (!isIntImmediate(N->getOperand(1), Srl_imm))
+    return false;
+
+  // Check whether we really have a one bit extract here.
+  if (And_mask >> Srl_imm == 0x1) {
+    if (N->getValueType(0) == MVT::i32)
+      Opc = AArch64::UBFMWri;
+    else
+      Opc = AArch64::UBFMXri;
+
+    LSB = MSB = Srl_imm;
+
+    return true;
+  }
+
+  return false;
+}
+
+static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+                                       unsigned &LSB, unsigned &MSB,
+                                       bool BiggerPattern) {
+  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+         "N must be a SHR/SRA operation to call this function");
+
+  EVT VT = N->getValueType(0);
+
+  // Here we can test the type of VT and return false when the type does not
+  // match, but since it is done prior to that call in the current context
+  // we turned that into an assert to avoid redundant code.
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  // Check for AND + SRL doing a one bit extract.
+  if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+    return true;
+
+  // we're looking for a shift of a shift
+  uint64_t Shl_imm = 0;
+  uint64_t Trunc_bits = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+    Opd0 = N->getOperand(0).getOperand(0);
+  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+    // We are looking for a shift of truncate. Truncate from i64 to i32 could
+    // be considered as setting high 32 bits as zero. Our strategy here is to
+    // always generate 64bit UBFM. This consistency will help the CSE pass
+    // later find more redundancy.
+    Opd0 = N->getOperand(0).getOperand(0);
+    Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+    VT = Opd0->getValueType(0);
+    assert(VT == MVT::i64 && "the promoted type should be i64");
+  } else if (BiggerPattern) {
+    // Let's pretend a 0 shift left has been performed.
+    // FIXME: Currently we limit this to the bigger pattern case,
+    // because some optimizations expect AND and not UBFM
+    Opd0 = N->getOperand(0);
+  } else
+    return false;
+
+  assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
+  uint64_t Srl_imm = 0;
+  if (!isIntImmediate(N->getOperand(1), Srl_imm))
+    return false;
+
+  assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
+         "bad amount in shift node!");
+  // Note: The width operand is encoded as width-1.
+  unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
+  int sLSB = Srl_imm - Shl_imm;
+  if (sLSB < 0)
+    return false;
+  LSB = sLSB;
+  MSB = LSB + Width;
+  // SRA requires a signed extraction
+  if (VT == MVT::i32)
+    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
+  else
+    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
+  return true;
+}
+
+static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
+                                SDValue &Opd0, unsigned &LSB, unsigned &MSB,
+                                unsigned NumberOfIgnoredLowBits = 0,
+                                bool BiggerPattern = false) {
+  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
+    return false;
+
+  switch (N->getOpcode()) {
+  default:
+    if (!N->isMachineOpcode())
+      return false;
+    break;
+  case ISD::AND:
+    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
+                                      NumberOfIgnoredLowBits, BiggerPattern);
+  case ISD::SRL:
+  case ISD::SRA:
+    return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
+  }
+
+  unsigned NOpc = N->getMachineOpcode();
+  switch (NOpc) {
+  default:
+    return false;
+  case AArch64::SBFMWri:
+  case AArch64::UBFMWri:
+  case AArch64::SBFMXri:
+  case AArch64::UBFMXri:
+    Opc = NOpc;
+    Opd0 = N->getOperand(0);
+    LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+    MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+    return true;
+  }
+  // Unreachable
+  return false;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
+  unsigned Opc, LSB, MSB;
+  SDValue Opd0;
+  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
+    return nullptr;
+
+  EVT VT = N->getValueType(0);
+
+  // If the bit extract operation is 64bit but the original type is 32bit, we
+  // need to add one EXTRACT_SUBREG.
+  if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
+    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
+                       CurDAG->getTargetConstant(MSB, MVT::i64)};
+
+    SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+    MachineSDNode *Node =
+        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
+                               SDValue(BFM, 0), SubReg);
+    return Node;
+  }
+
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
+                   CurDAG->getTargetConstant(MSB, VT)};
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
+/// Does DstMask form a complementary pair with the mask provided by
+/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
+/// this asks whether DstMask zeroes precisely those bits that will be set by
+/// the other half.
+static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
+                              unsigned NumberOfIgnoredHighBits, EVT VT) {
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "i32 or i64 mask type expected!");
+  unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
+
+  APInt SignificantDstMask = APInt(BitWidth, DstMask);
+  APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
+
+  return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
+         (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
+}
+
+// Look for bits that will be useful for later uses.
+// A bit is consider useless as soon as it is dropped and never used
+// before it as been dropped.
+// E.g., looking for useful bit of x
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// After #1, x useful bits are 0x7, then the useful bits of x, live through
+// y.
+// After #2, the useful bits of x are 0x4.
+// However, if x is used on an unpredicatable instruction, then all its bits
+// are useful.
+// E.g.
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// 3. str x, [@x]
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
+
+static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+  Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
+  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
+  getUsefulBits(Op, UsefulBits, Depth + 1);
+}
+
+static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
+                                             uint64_t Imm, uint64_t MSB,
+                                             unsigned Depth) {
+  // inherit the bitwidth value
+  APInt OpUsefulBits(UsefulBits);
+  OpUsefulBits = 1;
+
+  if (MSB >= Imm) {
+    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+    --OpUsefulBits;
+    // The interesting part will be in the lower part of the result
+    getUsefulBits(Op, OpUsefulBits, Depth + 1);
+    // The interesting part was starting at Imm in the argument
+    OpUsefulBits = OpUsefulBits.shl(Imm);
+  } else {
+    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+    --OpUsefulBits;
+    // The interesting part will be shifted in the result
+    OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+    getUsefulBits(Op, OpUsefulBits, Depth + 1);
+    // The interesting part was at zero in the argument
+    OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+  }
+
+  UsefulBits &= OpUsefulBits;
+}
+
+static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
+                                  unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+  uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+
+  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+}
+
+static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  uint64_t ShiftTypeAndValue =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  APInt Mask(UsefulBits);
+  Mask.clearAllBits();
+  Mask.flipAllBits();
+
+  if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
+    // Shift Left
+    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+    Mask = Mask.shl(ShiftAmt);
+    getUsefulBits(Op, Mask, Depth + 1);
+    Mask = Mask.lshr(ShiftAmt);
+  } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
+    // Shift Right
+    // We do not handle AArch64_AM::ASR, because the sign will change the
+    // number of useful bits
+    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+    Mask = Mask.lshr(ShiftAmt);
+    getUsefulBits(Op, Mask, Depth + 1);
+    Mask = Mask.shl(ShiftAmt);
+  } else
+    return;
+
+  UsefulBits &= Mask;
+}
+
+static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
+                                 unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
+
+  if (Op.getOperand(1) == Orig)
+    return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+
+  APInt OpUsefulBits(UsefulBits);
+  OpUsefulBits = 1;
+
+  if (MSB >= Imm) {
+    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+    --OpUsefulBits;
+    UsefulBits &= ~OpUsefulBits;
+    getUsefulBits(Op, UsefulBits, Depth + 1);
+  } else {
+    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+    --OpUsefulBits;
+    UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm));
+    getUsefulBits(Op, UsefulBits, Depth + 1);
+  }
+}
+
+static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
+                                SDValue Orig, unsigned Depth) {
+
+  // Users of this node should have already been instruction selected
+  // FIXME: Can we turn that into an assert?
+  if (!UserNode->isMachineOpcode())
+    return;
+
+  switch (UserNode->getMachineOpcode()) {
+  default:
+    return;
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+  case AArch64::ANDWri:
+  case AArch64::ANDXri:
+    // We increment Depth only when we call the getUsefulBits
+    return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
+                                             Depth);
+  case AArch64::UBFMWri:
+  case AArch64::UBFMXri:
+    return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
+
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+    if (UserNode->getOperand(1) != Orig)
+      return;
+    return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
+                                             Depth);
+  case AArch64::BFMWri:
+  case AArch64::BFMXri:
+    return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+  }
+}
+
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
+  if (Depth >= 6)
+    return;
+  // Initialize UsefulBits
+  if (!Depth) {
+    unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits();
+    // At the beginning, assume every produced bits is useful
+    UsefulBits = APInt(Bitwidth, 0);
+    UsefulBits.flipAllBits();
+  }
+  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
+
+  for (SDNode *Node : Op.getNode()->uses()) {
+    // A use cannot produce useful bits
+    APInt UsefulBitsForUse = APInt(UsefulBits);
+    getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
+    UsersUsefulBits |= UsefulBitsForUse;
+  }
+  // UsefulBits contains the produced bits that are meaningful for the
+  // current definition, thus a user cannot make a bit meaningful at
+  // this point
+  UsefulBits &= UsersUsefulBits;
+}
+
+/// Create a machine node performing a notional SHL of Op by ShlAmount. If
+/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
+/// 0, return Op unchanged.
+static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
+  if (ShlAmount == 0)
+    return Op;
+
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+
+  SDNode *ShiftNode;
+  if (ShlAmount > 0) {
+    // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, SDLoc(Op), VT, Op,
+        CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
+        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
+  } else {
+    // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
+    assert(ShlAmount < 0 && "expected right shift");
+    int ShrAmount = -ShlAmount;
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, VT));
+  }
+
+  return SDValue(ShiftNode, 0);
+}
+
+/// Does this tree qualify as an attempt to move a bitfield into position,
+/// essentially "(and (shl VAL, N), Mask)".
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    SDValue &Src, int &ShiftAmount,
+                                    int &MaskWidth) {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  (void)BitWidth;
+  assert(BitWidth == 32 || BitWidth == 64);
+
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value
+  uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+
+  // Discard a constant AND mask if present. It's safe because the node will
+  // already have been factored into the computeKnownBits calculation above.
+  uint64_t AndImm;
+  if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
+    assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+    Op = Op.getOperand(0);
+  }
+
+  uint64_t ShlImm;
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
+    return false;
+  Op = Op.getOperand(0);
+
+  if (!isShiftedMask_64(NonZeroBits))
+    return false;
+
+  ShiftAmount = countTrailingZeros(NonZeroBits);
+  MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
+
+  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
+  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
+  // amount.
+  Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
+
+  return true;
+}
+
+// Given a OR operation, check if we have the following pattern
+// ubfm c, b, imm, imm2 (or something that does the same jobs, see
+//                       isBitfieldExtractOp)
+// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
+//                 countTrailingZeros(mask2) == imm2 - imm + 1
+// f = d | c
+// if yes, given reference arguments will be update so that one can replace
+// the OR instruction with:
+// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
+static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
+                                     SDValue &Src, unsigned &ImmR,
+                                     unsigned &ImmS, SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+  // Set Opc
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i32)
+    Opc = AArch64::BFMWri;
+  else if (VT == MVT::i64)
+    Opc = AArch64::BFMXri;
+  else
+    return false;
+
+  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
+  // have the expected shape. Try to undo that.
+  APInt UsefulBits;
+  getUsefulBits(SDValue(N, 0), UsefulBits);
+
+  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
+  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+
+  // OR is commutative, check both possibilities (does llvm provide a
+  // way to do that directely, e.g., via code matcher?)
+  SDValue OrOpd1Val = N->getOperand(1);
+  SDNode *OrOpd0 = N->getOperand(0).getNode();
+  SDNode *OrOpd1 = N->getOperand(1).getNode();
+  for (int i = 0; i < 2;
+       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+    unsigned BFXOpc;
+    int DstLSB, Width;
+    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
+                            NumberOfIgnoredLowBits, true)) {
+      // Check that the returned opcode is compatible with the pattern,
+      // i.e., same type and zero extended (U and not S)
+      if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
+          (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
+        continue;
+
+      // Compute the width of the bitfield insertion
+      DstLSB = 0;
+      Width = ImmS - ImmR + 1;
+      // FIXME: This constraint is to catch bitfield insertion we may
+      // want to widen the pattern if we want to grab general bitfied
+      // move case
+      if (Width <= 0)
+        continue;
+
+      // If the mask on the insertee is correct, we have a BFXIL operation. We
+      // can share the ImmR and ImmS values from the already-computed UBFM.
+    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
+                                       DstLSB, Width)) {
+      ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+      ImmS = Width - 1;
+    } else
+      continue;
+
+    // Check the second part of the pattern
+    EVT VT = OrOpd1->getValueType(0);
+    assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
+
+    // Compute the Known Zero for the candidate of the first operand.
+    // This allows to catch more general case than just looking for
+    // AND with imm. Indeed, simplify-demanded-bits may have removed
+    // the AND instruction because it proves it was useless.
+    APInt KnownZero, KnownOne;
+    CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+
+    // Check if there is enough room for the second operand to appear
+    // in the first one
+    APInt BitsToBeInserted =
+        APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+
+    if ((BitsToBeInserted & ~KnownZero) != 0)
+      continue;
+
+    // Set the first operand
+    uint64_t Imm;
+    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
+        isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
+      // In that case, we can eliminate the AND
+      Dst = OrOpd1->getOperand(0);
+    else
+      // Maybe the AND has been removed by simplify-demanded-bits
+      // or is useful because it discards more bits
+      Dst = OrOpd1Val;
+
+    // both parts match
+    return true;
+  }
+
+  return false;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return nullptr;
+
+  unsigned Opc;
+  unsigned LSB, MSB;
+  SDValue Opd0, Opd1;
+
+  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+    return nullptr;
+
+  EVT VT = N->getValueType(0);
+  SDValue Ops[] = { Opd0,
+                    Opd1,
+                    CurDAG->getTargetConstant(LSB, VT),
+                    CurDAG->getTargetConstant(MSB, VT) };
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  unsigned Variant;
+  unsigned Opc;
+  unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr };
+
+  if (VT == MVT::f32) {
+    Variant = 0;
+  } else if (VT == MVT::f64) {
+    Variant = 1;
+  } else
+    return nullptr; // Unrecognized argument type. Fall back on default codegen.
+
+  // Pick the FRINTX variant needed to set the flags.
+  unsigned FRINTXOpc = FRINTXOpcs[Variant];
+
+  switch (N->getOpcode()) {
+  default:
+    return nullptr; // Unrecognized libm ISD node. Fall back on default codegen.
+  case ISD::FCEIL: {
+    unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr };
+    Opc = FRINTPOpcs[Variant];
+    break;
+  }
+  case ISD::FFLOOR: {
+    unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr };
+    Opc = FRINTMOpcs[Variant];
+    break;
+  }
+  case ISD::FTRUNC: {
+    unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr };
+    Opc = FRINTZOpcs[Variant];
+    break;
+  }
+  case ISD::FROUND: {
+    unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr };
+    Opc = FRINTAOpcs[Variant];
+    break;
+  }
+  }
+
+  SDLoc dl(N);
+  SDValue In = N->getOperand(0);
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(In);
+
+  if (!TM.Options.UnsafeFPMath) {
+    SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
+    Ops.push_back(SDValue(FRINTX, 1));
+  }
+
+  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+}
+
+bool
+AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+                                              unsigned RegWidth) {
+  APFloat FVal(0.0);
+  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+    FVal = CN->getValueAPF();
+  else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
+    // Some otherwise illegal constants are allowed in this case.
+    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
+        !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
+      return false;
+
+    ConstantPoolSDNode *CN =
+        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+    FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
+  } else
+    return false;
+
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
+
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32);
+  return true;
+}
+
+SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return nullptr;
+  }
+
+  // Few custom selection stuff.
+  SDNode *ResNode = nullptr;
+  EVT VT = Node->getValueType(0);
+
+  switch (Node->getOpcode()) {
+  default:
+    break;
+
+  case ISD::ADD:
+    if (SDNode *I = SelectMLAV64LaneV128(Node))
+      return I;
+    break;
+
+  case ISD::LOAD: {
+    // Try to select as an indexed load. Fall through to normal processing
+    // if we can't.
+    bool Done = false;
+    SDNode *I = SelectIndexedLoad(Node, Done);
+    if (Done)
+      return I;
+    break;
+  }
+
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::SRA:
+    if (SDNode *I = SelectBitfieldExtractOp(Node))
+      return I;
+    break;
+
+  case ISD::OR:
+    if (SDNode *I = SelectBitfieldInsertOp(Node))
+      return I;
+    break;
+
+  case ISD::EXTRACT_VECTOR_ELT: {
+    // Extracting lane zero is a special case where we can just use a plain
+    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+    // the rest of the compiler, especially the register allocator and copyi
+    // propagation, to reason about, so is preferred when it's possible to
+    // use it.
+    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+    // Bail and use the default Select() for non-zero lanes.
+    if (LaneNode->getZExtValue() != 0)
+      break;
+    // If the element type is not the same as the result type, likewise
+    // bail and use the default Select(), as there's more to do than just
+    // a cross-class COPY. This catches extracts of i8 and i16 elements
+    // since they will need an explicit zext.
+    if (VT != Node->getOperand(0).getValueType().getVectorElementType())
+      break;
+    unsigned SubReg;
+    switch (Node->getOperand(0)
+                .getValueType()
+                .getVectorElementType()
+                .getSizeInBits()) {
+    default:
+      llvm_unreachable("Unexpected vector element type!");
+    case 64:
+      SubReg = AArch64::dsub;
+      break;
+    case 32:
+      SubReg = AArch64::ssub;
+      break;
+    case 16: // FALLTHROUGH
+    case 8:
+      llvm_unreachable("unexpected zext-requiring extract element!");
+    }
+    SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
+                                                     Node->getOperand(0));
+    DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+    DEBUG(Extract->dumpr(CurDAG));
+    DEBUG(dbgs() << "\n");
+    return Extract.getNode();
+  }
+  case ISD::Constant: {
+    // Materialize zero constants as copies from WZR/XZR.  This allows
+    // the coalescer to propagate these into other instructions.
+    ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+    if (ConstNode->isNullValue()) {
+      if (VT == MVT::i32)
+        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+                                      AArch64::WZR, MVT::i32).getNode();
+      else if (VT == MVT::i64)
+        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+                                      AArch64::XZR, MVT::i64).getNode();
+    }
+    break;
+  }
+
+  case ISD::FrameIndex: {
+    // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+    const TargetLowering *TLI = getTargetLowering();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                      CurDAG->getTargetConstant(Shifter, MVT::i32) };
+    return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_ldaxp:
+    case Intrinsic::aarch64_ldxp: {
+      unsigned Op =
+          IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
+      SDValue MemAddr = Node->getOperand(2);
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+
+      SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
+                                          MVT::Other, MemAddr, Chain);
+
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+      return Ld;
+    }
+    case Intrinsic::aarch64_stlxp:
+    case Intrinsic::aarch64_stxp: {
+      unsigned Op =
+          IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+      SDValue ValLo = Node->getOperand(2);
+      SDValue ValHi = Node->getOperand(3);
+      SDValue MemAddr = Node->getOperand(4);
+
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(ValLo);
+      Ops.push_back(ValHi);
+      Ops.push_back(MemAddr);
+      Ops.push_back(Chain);
+
+      SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      return St;
+    }
+    case Intrinsic::aarch64_neon_ld1x2:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld1x3:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld1x4:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld2:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld3:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld4:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld2r:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld3r:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld4r:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld2lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectLoadLane(Node, 2, AArch64::LD2i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectLoadLane(Node, 2, AArch64::LD2i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectLoadLane(Node, 2, AArch64::LD2i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectLoadLane(Node, 2, AArch64::LD2i64);
+      break;
+    case Intrinsic::aarch64_neon_ld3lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectLoadLane(Node, 3, AArch64::LD3i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectLoadLane(Node, 3, AArch64::LD3i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectLoadLane(Node, 3, AArch64::LD3i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectLoadLane(Node, 3, AArch64::LD3i64);
+      break;
+    case Intrinsic::aarch64_neon_ld4lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectLoadLane(Node, 4, AArch64::LD4i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectLoadLane(Node, 4, AArch64::LD4i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectLoadLane(Node, 4, AArch64::LD4i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectLoadLane(Node, 4, AArch64::LD4i64);
+      break;
+    }
+  } break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_tbl2:
+      return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two
+                                                  : AArch64::TBLv16i8Two,
+                         false);
+    case Intrinsic::aarch64_neon_tbl3:
+      return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+                                                  : AArch64::TBLv16i8Three,
+                         false);
+    case Intrinsic::aarch64_neon_tbl4:
+      return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+                                                  : AArch64::TBLv16i8Four,
+                         false);
+    case Intrinsic::aarch64_neon_tbx2:
+      return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two
+                                                  : AArch64::TBXv16i8Two,
+                         true);
+    case Intrinsic::aarch64_neon_tbx3:
+      return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+                                                  : AArch64::TBXv16i8Three,
+                         true);
+    case Intrinsic::aarch64_neon_tbx4:
+      return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+                                                  : AArch64::TBXv16i8Four,
+                         true);
+    case Intrinsic::aarch64_neon_smull:
+    case Intrinsic::aarch64_neon_umull:
+      if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
+        return N;
+      break;
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    if (Node->getNumOperands() >= 3)
+      VT = Node->getOperand(2)->getValueType(0);
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_st1x2: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 2, AArch64::ST1Twov8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 2, AArch64::ST1Twov16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 2, AArch64::ST1Twov4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 2, AArch64::ST1Twov8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 2, AArch64::ST1Twov2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 2, AArch64::ST1Twov4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov1d);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st1x3: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 3, AArch64::ST1Threev8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 3, AArch64::ST1Threev16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 3, AArch64::ST1Threev4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 3, AArch64::ST1Threev8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 3, AArch64::ST1Threev2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 3, AArch64::ST1Threev4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev1d);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st1x4: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 4, AArch64::ST1Fourv8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 4, AArch64::ST1Fourv16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 4, AArch64::ST1Fourv4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 4, AArch64::ST1Fourv8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 4, AArch64::ST1Fourv2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 4, AArch64::ST1Fourv4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st2: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 2, AArch64::ST2Twov8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 2, AArch64::ST2Twov16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 2, AArch64::ST2Twov4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 2, AArch64::ST2Twov8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 2, AArch64::ST2Twov2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 2, AArch64::ST2Twov4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 2, AArch64::ST2Twov2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov1d);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st3: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 3, AArch64::ST3Threev8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 3, AArch64::ST3Threev16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 3, AArch64::ST3Threev4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 3, AArch64::ST3Threev8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 3, AArch64::ST3Threev2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 3, AArch64::ST3Threev4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 3, AArch64::ST3Threev2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev1d);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st4: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 4, AArch64::ST4Fourv8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 4, AArch64::ST4Fourv16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 4, AArch64::ST4Fourv4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 4, AArch64::ST4Fourv8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 4, AArch64::ST4Fourv2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 4, AArch64::ST4Fourv4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 4, AArch64::ST4Fourv2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st2lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 2, AArch64::ST2i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 2, AArch64::ST2i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 2, AArch64::ST2i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 2, AArch64::ST2i64);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st3lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 3, AArch64::ST3i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 3, AArch64::ST3i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 3, AArch64::ST3i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 3, AArch64::ST3i64);
+      break;
+    }
+    case Intrinsic::aarch64_neon_st4lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 4, AArch64::ST4i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 4, AArch64::ST4i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 4, AArch64::ST4i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 4, AArch64::ST4i64);
+      break;
+    }
+    }
+  }
+  case AArch64ISD::LD2post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD3post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD4post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x2post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x3post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x4post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD2DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD3DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD4DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+    break;
+  }
+  case AArch64ISD::LD2LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+    break;
+  }
+  case AArch64ISD::LD3LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+    break;
+  }
+  case AArch64ISD::LD4LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+    break;
+  }
+  case AArch64ISD::ST2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    break;
+  }
+  case AArch64ISD::ST3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    break;
+  }
+  case AArch64ISD::ST4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+    break;
+  }
+  case AArch64ISD::ST2LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+    break;
+  }
+  case AArch64ISD::ST3LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+    break;
+  }
+  case AArch64ISD::ST4LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+    break;
+  }
+
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FTRUNC:
+  case ISD::FROUND:
+    if (SDNode *I = SelectLIBM(Node))
+      return I;
+    break;
+  }
+
+  // Select the default instruction
+  ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == nullptr || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  return ResNode;
+}
+
+/// createAArch64ISelDag - This pass converts a legalized DAG into a
+/// AArch64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new AArch64DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
new file mode 100644
index 0000000..f2004ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -0,0 +1,8095 @@
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ISelLowering.h"
+#include "AArch64PerfectShuffle.h"
+#include "AArch64Subtarget.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64TargetObjectFile.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+enum AlignMode {
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(NoStrictAlign),
+      cl::values(
+          clEnumValN(StrictAlign,   "aarch64-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
+
+// Place holder until extr generation is tested fully.
+static cl::opt<bool>
+EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
+                          cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
+                          cl::init(true));
+
+static cl::opt<bool>
+EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
+                         cl::desc("Allow AArch64 SLI/SRI formation"),
+                         cl::init(false));
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO())
+    return new AArch64_MachoTargetObjectFile();
+
+  return new AArch64_ELFTargetObjectFile();
+}
+
+AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+  Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+
+  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
+  // we have to make something up. Arbitrarily, choose ZeroOrOne.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // When comparing vectors the result sets the different elements in the
+  // vector to all-one or all-zero.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
+  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
+
+  if (Subtarget->hasFPARMv8()) {
+    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+  }
+
+  if (Subtarget->hasNEON()) {
+    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+    // Someone set us up the NEON.
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+    addDRTypeForNEON(MVT::v1f64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+  }
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Provide all sorts of operation actions
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::f64, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
+
+  // Custom lowering hooks are needed for XOR
+  // to fold it into CSINC/CSINV.
+  setOperationAction(ISD::XOR, MVT::i32, Custom);
+  setOperationAction(ISD::XOR, MVT::i64, Custom);
+
+  // Virtually no operation on f128 is legal, but LLVM can't expand them when
+  // there's a valid register class, so we need custom operations in most cases.
+  setOperationAction(ISD::FABS, MVT::f128, Expand);
+  setOperationAction(ISD::FADD, MVT::f128, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+  setOperationAction(ISD::FCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FDIV, MVT::f128, Custom);
+  setOperationAction(ISD::FMA, MVT::f128, Expand);
+  setOperationAction(ISD::FMUL, MVT::f128, Custom);
+  setOperationAction(ISD::FNEG, MVT::f128, Expand);
+  setOperationAction(ISD::FPOW, MVT::f128, Expand);
+  setOperationAction(ISD::FREM, MVT::f128, Expand);
+  setOperationAction(ISD::FRINT, MVT::f128, Expand);
+  setOperationAction(ISD::FSIN, MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+  setOperationAction(ISD::FSUB, MVT::f128, Custom);
+  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
+  // Lowering for many of the conversions is actually specified by the non-f128
+  // type. The LowerXXX function will be trivial when f128 isn't involved.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+
+  // Variable arguments.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+  // Variable-sized objects.
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  // Exception handling.
+  // FIXME: These are guesses. Has this been defined yet?
+  setExceptionPointerRegister(AArch64::X0);
+  setExceptionSelectorRegister(AArch64::X1);
+
+  // Constant pool entries
+  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+
+  // BlockAddress
+  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+
+  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
+  setOperationAction(ISD::ADDC, MVT::i32, Custom);
+  setOperationAction(ISD::ADDE, MVT::i32, Custom);
+  setOperationAction(ISD::SUBC, MVT::i32, Custom);
+  setOperationAction(ISD::SUBE, MVT::i32, Custom);
+  setOperationAction(ISD::ADDC, MVT::i64, Custom);
+  setOperationAction(ISD::ADDE, MVT::i64, Custom);
+  setOperationAction(ISD::SUBC, MVT::i64, Custom);
+  setOperationAction(ISD::SUBE, MVT::i64, Custom);
+
+  // AArch64 lacks both left-rotate and popcount instructions.
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+
+  // AArch64 doesn't have {U|S}MUL_LOHI.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+
+  // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
+  // counterparts, which AArch64 supports directly.
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+
+  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Custom lower Add/Sub/Mul with overflow.
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SADDO, MVT::i64, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i64, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i64, Custom);
+  setOperationAction(ISD::SMULO, MVT::i32, Custom);
+  setOperationAction(ISD::SMULO, MVT::i64, Custom);
+  setOperationAction(ISD::UMULO, MVT::i32, Custom);
+  setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+  // AArch64 has implementations of a lot of rounding-like FP operations.
+  static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
+  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
+    MVT Ty = RoundingTypes[I];
+    setOperationAction(ISD::FFLOOR, Ty, Legal);
+    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+    setOperationAction(ISD::FCEIL, Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FTRUNC, Ty, Legal);
+    setOperationAction(ISD::FROUND, Ty, Legal);
+  }
+
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+  if (Subtarget->isTargetMachO()) {
+    // For iOS, we don't want to the normal expansion of a libcall to
+    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+    // traffic.
+    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+  } else {
+    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  }
+
+  // AArch64 does not have floating-point extending loads, i1 sign-extending
+  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+  // Indexed loads and stores are supported.
+  for (unsigned im = (unsigned)ISD::PRE_INC;
+       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+    setIndexedLoadAction(im, MVT::i8, Legal);
+    setIndexedLoadAction(im, MVT::i16, Legal);
+    setIndexedLoadAction(im, MVT::i32, Legal);
+    setIndexedLoadAction(im, MVT::i64, Legal);
+    setIndexedLoadAction(im, MVT::f64, Legal);
+    setIndexedLoadAction(im, MVT::f32, Legal);
+    setIndexedStoreAction(im, MVT::i8, Legal);
+    setIndexedStoreAction(im, MVT::i16, Legal);
+    setIndexedStoreAction(im, MVT::i32, Legal);
+    setIndexedStoreAction(im, MVT::i64, Legal);
+    setIndexedStoreAction(im, MVT::f64, Legal);
+    setIndexedStoreAction(im, MVT::f32, Legal);
+  }
+
+  // Trap.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // We combine OR nodes for bitfield operations.
+  setTargetDAGCombine(ISD::OR);
+
+  // Vector add and sub nodes may conceal a high-half opportunity.
+  // Also, try to fold ADD into CSINC/CSINV..
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+
+  setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::BITCAST);
+  setTargetDAGCombine(ISD::CONCAT_VECTORS);
+  setTargetDAGCombine(ISD::STORE);
+
+  setTargetDAGCombine(ISD::MUL);
+
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::VSELECT);
+
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+
+  setStackPointerRegisterToSaveRestore(AArch64::SP);
+
+  setSchedulingPreference(Sched::Hybrid);
+
+  // Enable TBZ/TBNZ
+  MaskAndBranchFoldingIsLegal = true;
+
+  setMinFunctionAlignment(2);
+
+  RequireStrictAlign = (Align == StrictAlign);
+
+  setHasExtractBitsInsn(true);
+
+  if (Subtarget->hasNEON()) {
+    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+    // silliness like this:
+    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+
+    // AArch64 doesn't have a direct vector ->f32 conversion instructions for
+    // elements smaller than i32, so promote the input to i32 first.
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+
+    // AArch64 doesn't have MUL.2d:
+    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+    // Likewise, narrowing and extending vector loads/stores aren't handled
+    // directly.
+    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+
+      setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+                         Expand);
+
+      setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+      setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+
+      for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+           InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+        setTruncStoreAction((MVT::SimpleValueType)VT,
+                            (MVT::SimpleValueType)InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    }
+
+    // AArch64 has implementations of a lot of rounding-like FP operations.
+    static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
+    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
+      MVT Ty = RoundingVecTypes[I];
+      setOperationAction(ISD::FFLOOR, Ty, Legal);
+      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+      setOperationAction(ISD::FCEIL, Ty, Legal);
+      setOperationAction(ISD::FRINT, Ty, Legal);
+      setOperationAction(ISD::FTRUNC, Ty, Legal);
+      setOperationAction(ISD::FROUND, Ty, Legal);
+    }
+  }
+}
+
+void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+  if (VT == MVT::v2f32) {
+    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+
+    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+
+    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+  }
+
+  // Mark vector float intrinsics as expand.
+  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
+    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+  }
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
+
+  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+
+  // CNT supports only B element sizes.
+  if (VT != MVT::v8i8 && VT != MVT::v16i8)
+    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+
+  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+
+  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+
+  if (Subtarget->isLittleEndian()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
+      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+    }
+  }
+}
+
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR64RegClass);
+  addTypeForNEON(VT, MVT::v2i32);
+}
+
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR128RegClass);
+  addTypeForNEON(VT, MVT::v4i32);
+}
+
+EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// computeKnownBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+   ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larget doesn't need
+      // this as those are legal types and will be handled by isel directly.
+      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+      unsigned BitWidth = KnownZero.getBitWidth();
+      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+        assert(BitWidth >= 8 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+        KnownZero |= Mask;
+      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+        assert(BitWidth >= 16 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+        KnownZero |= Mask;
+      }
+      break;
+    } break;
+    }
+  }
+  }
+}
+
+MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+  return MVT::i64;
+}
+
+unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
+  // and the offset has to be a multiple of the related size in bytes.
+  return 4095;
+}
+
+FastISel *
+AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                      const TargetLibraryInfo *libInfo) const {
+  return AArch64::createFastISel(funcInfo, libInfo);
+}
+
+const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return nullptr;
+  case AArch64ISD::CALL:              return "AArch64ISD::CALL";
+  case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
+  case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
+  case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
+  case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
+  case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
+  case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
+  case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
+  case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
+  case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
+  case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
+  case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
+  case AArch64ISD::TLSDESC_CALL:      return "AArch64ISD::TLSDESC_CALL";
+  case AArch64ISD::ADC:               return "AArch64ISD::ADC";
+  case AArch64ISD::SBC:               return "AArch64ISD::SBC";
+  case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
+  case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
+  case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
+  case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
+  case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
+  case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
+  case AArch64ISD::FMIN:              return "AArch64ISD::FMIN";
+  case AArch64ISD::FMAX:              return "AArch64ISD::FMAX";
+  case AArch64ISD::DUP:               return "AArch64ISD::DUP";
+  case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
+  case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
+  case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
+  case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
+  case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
+  case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
+  case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
+  case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
+  case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
+  case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
+  case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
+  case AArch64ISD::BICi:              return "AArch64ISD::BICi";
+  case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
+  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
+  case AArch64ISD::NEG:               return "AArch64ISD::NEG";
+  case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
+  case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
+  case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
+  case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
+  case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
+  case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
+  case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
+  case AArch64ISD::REV16:             return "AArch64ISD::REV16";
+  case AArch64ISD::REV32:             return "AArch64ISD::REV32";
+  case AArch64ISD::REV64:             return "AArch64ISD::REV64";
+  case AArch64ISD::EXT:               return "AArch64ISD::EXT";
+  case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
+  case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
+  case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
+  case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
+  case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
+  case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
+  case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
+  case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
+  case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
+  case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
+  case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
+  case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
+  case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
+  case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
+  case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
+  case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
+  case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
+  case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
+  case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
+  case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
+  case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
+  case AArch64ISD::NOT:               return "AArch64ISD::NOT";
+  case AArch64ISD::BIT:               return "AArch64ISD::BIT";
+  case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
+  case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
+  case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
+  case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
+  case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
+  case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
+  case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
+  case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
+  case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
+  case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
+  case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
+  case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
+  case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
+  case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
+  case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
+  case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
+  case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
+  case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
+  case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
+  case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
+  case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
+  case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
+  case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
+  case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
+  case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
+  case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
+  case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
+  case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
+  case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
+  case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
+  case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
+  case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
+  case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
+  case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
+  case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
+  case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  }
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  // We materialise the F128CSEL pseudo-instruction as some control flow and a
+  // phi node:
+
+  // OrigBB:
+  //     [... previous instrs leading to comparison ...]
+  //     b.ne TrueBB
+  //     b EndBB
+  // TrueBB:
+  //     ; Fallthrough
+  // EndBB:
+  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineFunction *MF = MBB->getParent();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction::iterator It = MBB;
+  ++It;
+
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned IfTrueReg = MI->getOperand(1).getReg();
+  unsigned IfFalseReg = MI->getOperand(2).getReg();
+  unsigned CondCode = MI->getOperand(3).getImm();
+  bool NZCVKilled = MI->getOperand(4).isKill();
+
+  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, TrueBB);
+  MF->insert(It, EndBB);
+
+  // Transfer rest of current basic-block to EndBB
+  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
+                MBB->end());
+  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
+  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
+  MBB->addSuccessor(TrueBB);
+  MBB->addSuccessor(EndBB);
+
+  // TrueBB falls through to the end.
+  TrueBB->addSuccessor(EndBB);
+
+  if (!NZCVKilled) {
+    TrueBB->addLiveIn(AArch64::NZCV);
+    EndBB->addLiveIn(AArch64::NZCV);
+  }
+
+  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
+      .addReg(IfTrueReg)
+      .addMBB(TrueBB)
+      .addReg(IfFalseReg)
+      .addMBB(MBB);
+
+  MI->eraseFromParent();
+  return EndBB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    MI->dump();
+#endif
+    llvm_unreachable("Unexpected instruction for custom inserter!");
+
+  case AArch64::F128CSEL:
+    return EmitF128CSEL(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
+/// CC
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:
+    return AArch64CC::NE;
+  case ISD::SETEQ:
+    return AArch64CC::EQ;
+  case ISD::SETGT:
+    return AArch64CC::GT;
+  case ISD::SETGE:
+    return AArch64CC::GE;
+  case ISD::SETLT:
+    return AArch64CC::LT;
+  case ISD::SETLE:
+    return AArch64CC::LE;
+  case ISD::SETUGT:
+    return AArch64CC::HI;
+  case ISD::SETUGE:
+    return AArch64CC::HS;
+  case ISD::SETULT:
+    return AArch64CC::LO;
+  case ISD::SETULE:
+    return AArch64CC::LS;
+  }
+}
+
+/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
+static void changeFPCCToAArch64CC(ISD::CondCode CC,
+                                  AArch64CC::CondCode &CondCode,
+                                  AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    CondCode = AArch64CC::EQ;
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    CondCode = AArch64CC::GT;
+    break;
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    CondCode = AArch64CC::GE;
+    break;
+  case ISD::SETOLT:
+    CondCode = AArch64CC::MI;
+    break;
+  case ISD::SETOLE:
+    CondCode = AArch64CC::LS;
+    break;
+  case ISD::SETONE:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GT;
+    break;
+  case ISD::SETO:
+    CondCode = AArch64CC::VC;
+    break;
+  case ISD::SETUO:
+    CondCode = AArch64CC::VS;
+    break;
+  case ISD::SETUEQ:
+    CondCode = AArch64CC::EQ;
+    CondCode2 = AArch64CC::VS;
+    break;
+  case ISD::SETUGT:
+    CondCode = AArch64CC::HI;
+    break;
+  case ISD::SETUGE:
+    CondCode = AArch64CC::PL;
+    break;
+  case ISD::SETLT:
+  case ISD::SETULT:
+    CondCode = AArch64CC::LT;
+    break;
+  case ISD::SETLE:
+  case ISD::SETULE:
+    CondCode = AArch64CC::LE;
+    break;
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    CondCode = AArch64CC::NE;
+    break;
+  }
+}
+
+/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
+/// CC usable with the vector instructions. Fewer operations are available
+/// without a real NZCV register, so we have to use less efficient combinations
+/// to get the same effect.
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
+                                        AArch64CC::CondCode &CondCode,
+                                        AArch64CC::CondCode &CondCode2,
+                                        bool &Invert) {
+  Invert = false;
+  switch (CC) {
+  default:
+    // Mostly the scalar mappings work fine.
+    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+    break;
+  case ISD::SETUO:
+    Invert = true; // Fallthrough
+  case ISD::SETO:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GE;
+    break;
+  case ISD::SETUEQ:
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    // All of the compare-mask comparisons are ordered, but we can switch
+    // between the two by a double inversion. E.g. ULE == !OGT.
+    Invert = true;
+    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
+    break;
+  }
+}
+
+static bool isLegalArithImmed(uint64_t C) {
+  // Matches AArch64DAGToDAGISel::SelectArithImmed().
+  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
+}
+
+static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                              SDLoc dl, SelectionDAG &DAG) {
+  EVT VT = LHS.getValueType();
+
+  if (VT.isFloatingPoint())
+    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+
+  // The CMP instruction is just an alias for SUBS, and representing it as
+  // SUBS means that it's possible to get CSE with subtract operations.
+  // A later phase can perform the optimization of setting the destination
+  // register to WZR/XZR if it ends up being unused.
+  unsigned Opcode = AArch64ISD::SUBS;
+
+  if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
+      cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
+    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
+    // can be set differently by this operation. It comes down to whether
+    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+    // everything is fine. If not then the optimization is wrong. Thus general
+    // comparisons are only valid if op2 != 0.
+
+    // So, finally, the only LLVM-native comparisons that don't mention C and V
+    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+    // the absence of information about op2.
+    Opcode = AArch64ISD::ADDS;
+    RHS = RHS.getOperand(1);
+  } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
+             cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+             !isUnsignedIntSetCC(CC)) {
+    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+    // of the signed comparisons.
+    Opcode = AArch64ISD::ANDS;
+    RHS = LHS.getOperand(1);
+    LHS = LHS.getOperand(0);
+  }
+
+  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+      .getValue(1);
+}
+
+static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    EVT VT = RHS.getValueType();
+    uint64_t C = RHSC->getZExtValue();
+    if (!isLegalArithImmed(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if ((VT == MVT::i32 && C != 0x80000000 &&
+             isLegalArithImmed((uint32_t)(C - 1))) ||
+            (VT == MVT::i64 && C != 0x80000000ULL &&
+             isLegalArithImmed(C - 1ULL))) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if ((VT == MVT::i32 && C != 0 &&
+             isLegalArithImmed((uint32_t)(C - 1))) ||
+            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if ((VT == MVT::i32 && C != 0x7fffffff &&
+             isLegalArithImmed((uint32_t)(C + 1))) ||
+            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+             isLegalArithImmed(C + 1ULL))) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if ((VT == MVT::i32 && C != 0xffffffff &&
+             isLegalArithImmed((uint32_t)(C + 1))) ||
+            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+             isLegalArithImmed(C + 1ULL))) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      }
+    }
+  }
+
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+  return Cmp;
+}
+
+static std::pair<SDValue, SDValue>
+getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
+  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
+         "Unsupported value type");
+  SDValue Value, Overflow;
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  unsigned Opc = 0;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown overflow instruction!");
+  case ISD::SADDO:
+    Opc = AArch64ISD::ADDS;
+    CC = AArch64CC::VS;
+    break;
+  case ISD::UADDO:
+    Opc = AArch64ISD::ADDS;
+    CC = AArch64CC::HS;
+    break;
+  case ISD::SSUBO:
+    Opc = AArch64ISD::SUBS;
+    CC = AArch64CC::VS;
+    break;
+  case ISD::USUBO:
+    Opc = AArch64ISD::SUBS;
+    CC = AArch64CC::LO;
+    break;
+  // Multiply needs a little bit extra work.
+  case ISD::SMULO:
+  case ISD::UMULO: {
+    CC = AArch64CC::NE;
+    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
+    if (Op.getValueType() == MVT::i32) {
+      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      // For a 32 bit multiply with overflow check we want the instruction
+      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
+      // need to generate the following pattern:
+      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
+      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
+      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
+                                DAG.getConstant(0, MVT::i64));
+      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
+      // operation. We need to clear out the upper 32 bits, because we used a
+      // widening multiply that wrote all 64 bits. In the end this should be a
+      // noop.
+      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+      if (IsSigned) {
+        // The signed overflow check requires more than just a simple check for
+        // any bit set in the upper 32 bits of the result. These bits could be
+        // just the sign bits of a negative number. To perform the overflow
+        // check we have to arithmetic shift right the 32nd bit of the result by
+        // 31 bits. Then we compare the result to the upper 32 bits.
+        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
+                                        DAG.getConstant(32, MVT::i64));
+        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
+        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
+                                        DAG.getConstant(31, MVT::i64));
+        // It is important that LowerBits is last, otherwise the arithmetic
+        // shift will not be folded into the compare (SUBS).
+        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
+        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+                       .getValue(1);
+      } else {
+        // The overflow check for unsigned multiply is easy. We only need to
+        // check if any of the upper 32 bits are set. This can be done with a
+        // CMP (shifted register). For that we need to generate the following
+        // pattern:
+        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
+        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+                                        DAG.getConstant(32, MVT::i64));
+        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+        Overflow =
+            DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+                        UpperBits).getValue(1);
+      }
+      break;
+    }
+    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
+    // For the 64 bit multiply
+    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+    if (IsSigned) {
+      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
+      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
+                                      DAG.getConstant(63, MVT::i64));
+      // It is important that LowerBits is last, otherwise the arithmetic
+      // shift will not be folded into the compare (SUBS).
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+                     .getValue(1);
+    } else {
+      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+      Overflow =
+          DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+                      UpperBits).getValue(1);
+    }
+    break;
+  }
+  } // switch (...)
+
+  if (Opc) {
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+
+    // Emit the AArch64 operation with overflow check.
+    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
+    Overflow = Value.getValue(1);
+  }
+  return std::make_pair(Value, Overflow);
+}
+
+SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                                             RTLIB::Libcall Call) const {
+  SmallVector<SDValue, 2> Ops;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
+                     SDLoc(Op)).first;
+}
+
+static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
+  SDValue Sel = Op.getOperand(0);
+  SDValue Other = Op.getOperand(1);
+
+  // If neither operand is a SELECT_CC, give up.
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    std::swap(Sel, Other);
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    return Op;
+
+  // The folding we want to perform is:
+  // (xor x, (select_cc a, b, cc, 0, -1) )
+  //   -->
+  // (csel x, (xor x, -1), cc ...)
+  //
+  // The latter will get matched to a CSINV instruction.
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+  SDValue LHS = Sel.getOperand(0);
+  SDValue RHS = Sel.getOperand(1);
+  SDValue TVal = Sel.getOperand(2);
+  SDValue FVal = Sel.getOperand(3);
+  SDLoc dl(Sel);
+
+  // FIXME: This could be generalized to non-integer comparisons.
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return Op;
+
+  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+  // The the values aren't constants, this isn't the pattern we're looking for.
+  if (!CFVal || !CTVal)
+    return Op;
+
+  // We can commute the SELECT_CC by inverting the condition.  This
+  // might be needed to make this fit into a CSINV pattern.
+  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+    std::swap(TVal, FVal);
+    std::swap(CTVal, CFVal);
+    CC = ISD::getSetCCInverse(CC, true);
+  }
+
+  // If the constants line up, perform the transform!
+  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+    FVal = Other;
+    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
+                       DAG.getConstant(-1ULL, Other.getValueType()));
+
+    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+                       CCVal, Cmp);
+  }
+
+  return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Invalid code");
+  case ISD::ADDC:
+    Opc = AArch64ISD::ADDS;
+    break;
+  case ISD::SUBC:
+    Opc = AArch64ISD::SUBS;
+    break;
+  case ISD::ADDE:
+    Opc = AArch64ISD::ADCS;
+    ExtraOp = true;
+    break;
+  case ISD::SUBE:
+    Opc = AArch64ISD::SBCS;
+    ExtraOp = true;
+    break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+                     Op.getOperand(2));
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
+
+  AArch64CC::CondCode CC;
+  // The actual operation that sets the overflow or carry flag.
+  SDValue Value, Overflow;
+  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
+
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, MVT::i32);
+
+  // We use an inverted condition, because the conditional select is inverted
+  // too. This will allow it to be selected to a single instruction:
+  // CSINC Wd, WZR, WZR, invert(cond).
+  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
+  Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
+                         CCVal, Overflow);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+}
+
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+  // The data thing is not used.
+  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+  bool IsStream = !Locality;
+  // When the locality number is set
+  if (Locality) {
+    // The front-end should have filtered out the out-of-range values
+    assert(Locality <= 3 && "Prefetch locality out-of-range");
+    // The locality degree is the opposite of the cache speed.
+    // Put the number the other way around.
+    // The encoding starts at 0 for level 1
+    Locality = 3 - Locality;
+  }
+
+  // built the mask value encoding the expected behavior.
+  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                   (Locality << 1) |    // Cache level bits
+                   (unsigned)IsStream;  // Stream bit
+  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  // FP_ROUND node has a second operand indicating whether it is known to be
+  // precise. That doesn't take part in the LibCall so we can't directly use
+  // LowerF128Call.
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
+                     /*isSigned*/ false, SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT InVT = Op.getOperand(0).getValueType();
+  EVT VT = Op.getValueType();
+
+  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+    SDLoc dl(Op);
+    SDValue Cv =
+        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+                    Op.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+  }
+
+  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
+    SDLoc dl(Op);
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
+  }
+
+  // Type changing conversions are illegal.
+  return Op;
+}
+
+SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType().isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
+
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  if (Op.getOpcode() == ISD::FP_TO_SINT)
+    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  SmallVector<SDValue, 2> Ops;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
+                     SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  SDValue In = Op.getOperand(0);
+  EVT InVT = In.getValueType();
+
+  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+    MVT CastVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
+                         InVT.getVectorNumElements());
+    In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
+    return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
+  }
+
+  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
+    unsigned CastOpc =
+        Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    EVT CastVT = VT.changeVectorElementTypeToInteger();
+    In = DAG.getNode(CastOpc, dl, CastVT, In);
+    return DAG.getNode(Op.getOpcode(), dl, VT, In);
+  }
+
+  return Op;
+}
+
+SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+
+  // i128 conversions are libcalls.
+  if (Op.getOperand(0).getValueType() == MVT::i128)
+    return SDValue();
+
+  // Other conversions are legal, unless it's to the completely software-based
+  // fp128.
+  if (Op.getValueType() != MVT::f128)
+    return Op;
+
+  RTLIB::Libcall LC;
+  if (Op.getOpcode() == ISD::SINT_TO_FP)
+    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // which returns the values in two S / D registers.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListTy Args;
+  ArgListEntry Entry;
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  const char *LibcallName =
+      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() != MVT::f16)
+    return SDValue();
+
+  assert(Op.getOperand(0).getValueType() == MVT::i16);
+  SDLoc DL(Op);
+
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
+  return SDValue(
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
+      0);
+}
+
+
+SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unimplemented operand");
+    return SDValue();
+  case ISD::BITCAST:
+    return LowerBITCAST(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::SELECT:
+    return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG);
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:
+    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    return LowerXALUO(Op, DAG);
+  case ISD::FADD:
+    return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+  case ISD::FSUB:
+    return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+  case ISD::FMUL:
+    return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV:
+    return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+  case ISD::FP_ROUND:
+    return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND:
+    return LowerFP_EXTEND(Op, DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:
+    return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:
+    return LowerVectorSRA_SRL_SHL(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  case ISD::CTPOP:
+    return LowerCTPOP(Op, DAG);
+  case ISD::FCOPYSIGN:
+    return LowerFCOPYSIGN(Op, DAG);
+  case ISD::AND:
+    return LowerVectorAND(Op, DAG);
+  case ISD::OR:
+    return LowerVectorOR(Op, DAG);
+  case ISD::XOR:
+    return LowerXOR(Op, DAG);
+  case ISD::PREFETCH:
+    return LowerPREFETCH(Op, DAG);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FSINCOS:
+    return LowerFSINCOS(Op, DAG);
+  }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
+  return 2;
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "AArch64GenCallingConv.inc"
+
+/// Selects the correct CCAssignFn for a the given CallingConvention
+/// value.
+CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                     bool IsVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention.");
+  case CallingConv::WebKit_JS:
+    return CC_AArch64_WebKit_JS;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    if (!Subtarget->isTargetDarwin())
+      return CC_AArch64_AAPCS;
+    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
+  }
+}
+
+SDValue AArch64TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // At this point, Ins[].VT may already be promoted to i32. To correctly
+  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
+  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
+  // LocVT.
+  unsigned NumArgs = Ins.size();
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ValVT = Ins[i].VT;
+    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[i].OrigArgIndex;
+
+    // Get type of the original argument.
+    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+      ValVT = MVT::i8;
+    else if (ActualMVT == MVT::i16)
+      ValVT = MVT::i16;
+
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+    bool Res =
+        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+  assert(ArgLocs.size() == Ins.size());
+  SmallVector<SDValue, 16> ArgValues;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (Ins[i].Flags.isByVal()) {
+      // Byval is used for HFAs in the PCS, but the system should work in a
+      // non-compliant manner for larger structs.
+      EVT PtrTy = getPointerTy();
+      int Size = Ins[i].Flags.getByValSize();
+      unsigned NumRegs = (Size + 7) / 8;
+
+      // FIXME: This works on big-endian for composite byvals, which are the common
+      // case. It should also work for fundamental types too.
+      unsigned FrameIdx =
+        MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
+      InVals.push_back(FrameIdxN);
+
+      continue;
+    }
+    
+    if (VA.isRegLoc()) {
+      // Arguments stored in registers.
+      EVT RegVT = VA.getLocVT();
+
+      SDValue ArgValue;
+      const TargetRegisterClass *RC;
+
+      if (RegVT == MVT::i32)
+        RC = &AArch64::GPR32RegClass;
+      else if (RegVT == MVT::i64)
+        RC = &AArch64::GPR64RegClass;
+      else if (RegVT == MVT::f16)
+        RC = &AArch64::FPR16RegClass;
+      else if (RegVT == MVT::f32)
+        RC = &AArch64::FPR32RegClass;
+      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
+        RC = &AArch64::FPR64RegClass;
+      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
+        RC = &AArch64::FPR128RegClass;
+      else
+        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+      // Transform the arguments in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      // If this is an 8, 16 or 32-bit value, it is really passed promoted
+      // to 64 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::AExt:
+      case CCValAssign::SExt:
+      case CCValAssign::ZExt:
+        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
+        // nodes after our lowering.
+        assert(RegVT == Ins[i].VT && "incorrect register location selected");
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
+      unsigned ArgOffset = VA.getLocMemOffset();
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+      uint32_t BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
+      int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue ArgValue;
+
+      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
+      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+      MVT MemVT = VA.getValVT();
+
+      switch (VA.getLocInfo()) {
+      default:
+        break;
+      case CCValAssign::BCvt:
+        MemVT = VA.getLocVT();
+        break;
+      case CCValAssign::SExt:
+        ExtType = ISD::SEXTLOAD;
+        break;
+      case CCValAssign::ZExt:
+        ExtType = ISD::ZEXTLOAD;
+        break;
+      case CCValAssign::AExt:
+        ExtType = ISD::EXTLOAD;
+        break;
+      }
+
+      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
+                                MachinePointerInfo::getFixedStack(FI),
+                                MemVT, false, false, false, nullptr);
+
+      InVals.push_back(ArgValue);
+    }
+  }
+
+  // varargs
+  if (isVarArg) {
+    if (!Subtarget->isTargetDarwin()) {
+      // The AAPCS variadic function ABI is identical to the non-variadic
+      // one. As a result there may be more arguments in registers and we should
+      // save them for future reference.
+      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
+    }
+
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    // This will point to the next argument passed via stack.
+    unsigned StackOffset = CCInfo.getNextStackOffset();
+    // We currently pass all varargs at 8-byte alignment.
+    StackOffset = ((StackOffset + 7) & ~7);
+    AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+  }
+
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+    // This is a non-standard ABI so by fiat I say we're allowed to make full
+    // use of the stack area to be popped, which must be aligned to 16 bytes in
+    // any case:
+    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+    // a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+    // This realignment carries over to the available bytes below. Our own
+    // callers will guarantee the space is free by giving an aligned value to
+    // CALLSEQ_START.
+  }
+  // Even if we're not expected to free up the space, it's useful to know how
+  // much is there while considering tail calls (because we can reuse it).
+  FuncInfo->setBytesInStackArgArea(StackArgSize);
+
+  return Chain;
+}
+
+void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
+                                                SelectionDAG &DAG, SDLoc DL,
+                                                SDValue &Chain) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  SmallVector<SDValue, 8> MemOps;
+
+  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
+                                          AArch64::X3, AArch64::X4, AArch64::X5,
+                                          AArch64::X6, AArch64::X7 };
+  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+  unsigned FirstVariadicGPR =
+      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
+
+  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
+  int GPRIdx = 0;
+  if (GPRSaveSize != 0) {
+    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
+
+    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+
+    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
+      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                       MachinePointerInfo::getStack(i * 8), false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+                        DAG.getConstant(8, getPointerTy()));
+    }
+  }
+  FuncInfo->setVarArgsGPRIndex(GPRIdx);
+  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
+
+  if (Subtarget->hasFPARMv8()) {
+    static const MCPhysReg FPRArgRegs[] = {
+        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
+        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
+    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+    unsigned FirstVariadicFPR =
+        CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+
+    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+    int FPRIdx = 0;
+    if (FPRSaveSize != 0) {
+      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+        unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+
+        SDValue Store =
+            DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                         MachinePointerInfo::getStack(i * 16), false, false, 0);
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+                          DAG.getConstant(16, getPointerTy()));
+      }
+    }
+    FuncInfo->setVarArgsFPRIndex(FPRIdx);
+    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
+  }
+
+  if (!MemOps.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue AArch64TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+    SDValue ThisVal) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
+
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+bool AArch64TargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    bool isCalleeStructRet, bool isCallerStructRet,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  // For CallingConv::C this function knows whether the ABI needs
+  // changing. That's not true for other conventions so they will have to opt in
+  // manually.
+  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+    return false;
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible (see
+  // X86) but less efficient and uglier in LowerCall.
+  for (Function::const_arg_iterator i = CallerF->arg_begin(),
+                                    e = CallerF->arg_end();
+       i != e; ++i)
+    if (i->hasByValAttr())
+      return false;
+
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+    if (IsTailCallConvention(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // Now we search for cases where we can use a tail call without changing the
+  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+  // concept.
+
+  // I want anyone implementing a new calling convention to think long and hard
+  // about this assert.
+  assert((!isVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
+
+  if (isVarArg && !Outs.empty()) {
+    // At least two cases here: if caller is fastcc then we can't have any
+    // memory arguments (we'd be expected to clean up the stack afterwards). If
+    // caller is C then we could potentially use its argument area.
+
+    // FIXME: for now we take the most conservative of these in both cases:
+    // disallow all variadic memory operands.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                   getTargetMachine(), ArgLocs, *DAG.getContext());
+
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // Nothing more to check if the callee is taking no arguments
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  // If the stack arguments for this call would fit into our own save area then
+  // the call can be made tail.
+  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+}
+
+SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
+                                                   SelectionDAG &DAG,
+                                                   MachineFrameInfo *MFI,
+                                                   int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument corresponding
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+                            UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U)
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+                                                   bool TailCallOpt) const {
+  return CallCC == CallingConv::Fast && TailCallOpt;
+}
+
+bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+  return CallCC == CallingConv::Fast;
+}
+
+/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
+/// and add input and output parameter nodes.
+SDValue
+AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool IsThisReturn = false;
+
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  bool IsSibCall = false;
+
+  if (IsTailCall) {
+    // Check if it's really possible to do a tail call.
+    IsTailCall = isEligibleForTailCallOptimization(
+        Callee, CallConv, IsVarArg, IsStructRet,
+        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+    if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+
+    if (IsTailCall)
+      ++NumTailCalls;
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  if (IsVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
+                                               /*IsVarArg=*/ !Outs[i].IsFixed);
+      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+      assert(!Res && "Call operand has unhandled type");
+      (void)Res;
+    }
+  } else {
+    // At this point, Outs[].VT may already be promoted to i32. To correctly
+    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
+    // we use a special version of AnalyzeCallOperands to pass in ValVT and
+    // LocVT.
+    unsigned NumArgs = Outs.size();
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ValVT = Outs[i].VT;
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+                                  /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
+      assert(!Res && "Call operand has unhandled type");
+      (void)Res;
+    }
+  }
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  if (IsSibCall) {
+    // Since we're not changing the ABI to make this a tail call, the memory
+    // operands are already available in the caller's incoming argument space.
+    NumBytes = 0;
+  }
+
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0. Completely unused for non-tail calls.
+  int FPDiff = 0;
+
+  if (IsTailCall && !IsSibCall) {
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+    // Since callee will pop argument stack as a tail call, we must keep the
+    // popped size 16-byte aligned.
+    NumBytes = RoundUpToAlignment(NumBytes, 16);
+
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // can actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
+
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started at
+    // a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+  }
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!IsSibCall)
+    Chain =
+        DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
+
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      if (Outs[realArgIdx].ArgVT == MVT::i1) {
+        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+      }
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::FPExt:
+      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+        assert(VA.getLocVT() == MVT::i64 &&
+               "unexpected calling convention register assignment");
+        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
+               "unexpected use of 'returned'");
+        IsThisReturn = true;
+      }
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
+
+      // FIXME: This works on big-endian for composite byvals, which are the
+      // common case. It should also work for fundamental types too.
+      uint32_t BEAlign = 0;
+      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                        : VA.getLocVT().getSizeInBits();
+      OpSize = (OpSize + 7) / 8;
+      if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+        if (OpSize < 8)
+          BEAlign = 8 - OpSize;
+      }
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      int32_t Offset = LocMemOffset + BEAlign;
+      SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+      PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+
+      if (IsTailCall) {
+        Offset = Offset + FPDiff;
+        int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+        DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+      } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+
+        DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+        DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+      }
+
+      if (Outs[i].Flags.isByVal()) {
+        SDValue SizeNode =
+            DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
+        SDValue Cpy = DAG.getMemcpy(
+            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+            /*isVolatile = */ false,
+            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+
+        MemOpChains.push_back(Cpy);
+      } else {
+        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+        // promoted to a legal register type i32, we should truncate Arg back to
+        // i1/i8/i16.
+        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
+            VA.getValVT() == MVT::i16)
+          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+        SDValue Store =
+            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+        MemOpChains.push_back(Store);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      Subtarget->isTargetMachO()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      bool InternalLinkage = GV->hasInternalLinkage();
+      if (InternalLinkage)
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+      else {
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
+                                            AArch64II::MO_GOT);
+        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+      }
+    } else if (ExternalSymbolSDNode *S =
+                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+      Callee =
+          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
+      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+  }
+
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+  }
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask;
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  if (IsThisReturn) {
+    // For 'this' returns, use the X0-preserving mask if applicable
+    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    if (!Mask) {
+      IsThisReturn = false;
+      Mask = ARI->getCallPreservedMask(CallConv);
+    }
+  } else
+    Mask = ARI->getCallPreservedMask(CallConv);
+
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // If we're doing a tall call, use a TC_RETURN here rather than an
+  // actual call instruction.
+  if (IsTailCall)
+    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+                                ? RoundUpToAlignment(NumBytes, 16)
+                                : 0;
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(CalleePopBytes, true),
+                             InFlag, DL);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+                         InVals, IsThisReturn,
+                         IsThisReturn ? OutVals[0] : SDValue());
+}
+
+bool AArch64TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC);
+}
+
+SDValue
+AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                   bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   SDLoc DL, SelectionDAG &DAG) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC);
+
+  // Copy the result values into the output registers.
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      if (Outs[i].ArgVT == MVT::i1) {
+        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
+        // value. This is strictly redundant on Darwin (which uses "zeroext
+        // i1"), but will be optimised out before ISel.
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      }
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain; // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  unsigned char OpFlags =
+      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+         "unexpected offset in global node");
+
+  // This also catched the large code model case for Darwin.
+  if ((OpFlags & AArch64II::MO_GOT) != 0) {
+    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes instead of using a wrapper node.
+    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+  }
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+  } else {
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
+    // the only correct model on Darwin.
+    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                            OpFlags | AArch64II::MO_PAGE);
+    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
+
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address (for Darwin, currently) and
+/// return an SDValue containing the final node.
+
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+///     + "extern __thread" declaration.
+///     + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i64] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first xword, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "x0".
+///
+/// Since this descriptor may be in a different unit, in general even the
+/// descriptor must be accessed via an indirect load. The "ideal" code sequence
+/// is:
+///     adrp x0, _var@TLVPPAGE
+///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
+///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
+///                                      ; the function pointer
+///     blr x1                           ; Uses descriptor address in x0
+///     ; Address of _var is now in x0.
+///
+/// If the address of _var's descriptor *is* known to the linker, then it can
+/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
+/// a slight efficiency gain.
+SDValue
+AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+
+  SDLoc DL(Op);
+  MVT PtrVT = getPointerTy();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  SDValue TLVPAddr =
+      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
+
+  // The first entry in the descriptor is a function pointer that we must call
+  // to obtain the address of the variable.
+  SDValue Chain = DAG.getEntryNode();
+  SDValue FuncTLVGet =
+      DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
+                  false, true, true, 8);
+  Chain = FuncTLVGet.getValue(1);
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setAdjustsStack(true);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+  // silly).
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+
+  // Finally, we can make the call. This is just a degenerate version of a
+  // normal AArch64 call node: x0 takes the address of the descriptor, and
+  // returns the address of the variable in this thread.
+  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
+  Chain =
+      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
+}
+
+/// When accessing thread-local variables under either the general-dynamic or
+/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
+/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
+/// is a function pointer to carry out the resolution. This function takes the
+/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
+/// other registers (except LR, NZCV) are preserved.
+///
+/// Thus, the ideal call sequence on AArch64 is:
+///
+///     adrp x0, :tlsdesc:thread_var
+///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
+///     add x0, x0, :tlsdesc_lo12:thread_var
+///     .tlsdesccall thread_var
+///     blr x8
+///     (TPIDR_EL0 offset now in x0).
+///
+/// The ".tlsdesccall" directive instructs the assembler to insert a particular
+/// relocation to help the linker relax this sequence if it turns out to be too
+/// conservative.
+///
+/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
+/// is harmless.
+SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
+                                                   SDValue DescAddr, SDLoc DL,
+                                                   SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+
+  // The function we need to call is simply the first entry in the GOT for this
+  // descriptor, load it in preparation.
+  SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+  // silly).
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+
+  // The function takes only one argument: the address of the descriptor itself
+  // in X0.
+  SDValue Glue, Chain;
+  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
+  Glue = Chain.getValue(1);
+
+  // We're now ready to populate the argument list, as with a normal call:
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Func);
+  Ops.push_back(SymAddr);
+  Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
+  Ops.push_back(DAG.getRegisterMask(Mask));
+  Ops.push_back(Glue);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
+}
+
+SDValue
+AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
+  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+         "ELF TLS only supported in small memory model");
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  SDValue TPOff;
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
+
+  if (Model == TLSModel::LocalExec) {
+    SDValue HiVar = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+    SDValue LoVar = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0,
+        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+                                       DAG.getTargetConstant(16, MVT::i32)),
+                    0);
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
+                                       DAG.getTargetConstant(0, MVT::i32)),
+                    0);
+  } else if (Model == TLSModel::InitialExec) {
+    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
+  } else if (Model == TLSModel::LocalDynamic) {
+    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
+    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
+    // the beginning of the module's TLS region, followed by a DTPREL offset
+    // calculation.
+
+    // These accesses will need deduplicating if there's more than one.
+    AArch64FunctionInfo *MFI =
+        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+    MFI->incNumLocalDynamicTLSAccesses();
+
+    // Accesses used in this sequence go via the TLS descriptor which lives in
+    // the GOT. Prepare an address we can use to handle this.
+    SDValue HiDesc = DAG.getTargetExternalSymbol(
+        "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    SDValue LoDesc = DAG.getTargetExternalSymbol(
+        "_TLS_MODULE_BASE_", PtrVT,
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    // First argument to the descriptor call is the address of the descriptor
+    // itself.
+    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
+    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+                                                  AArch64II::MO_TLS);
+
+    // Now we can calculate the offset from TPIDR_EL0 to this module's
+    // thread-local area.
+    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+
+    // Now use :dtprel_whatever: operations to calculate this variable's offset
+    // in its thread-storage area.
+    SDValue HiVar = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+    SDValue LoVar = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i64, 0,
+        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    SDValue DTPOff =
+        SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+                                   DAG.getTargetConstant(16, MVT::i32)),
+                0);
+    DTPOff =
+        SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
+                                   DAG.getTargetConstant(0, MVT::i32)),
+                0);
+
+    TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
+  } else if (Model == TLSModel::GeneralDynamic) {
+    // Accesses used in this sequence go via the TLS descriptor which lives in
+    // the GOT. Prepare an address we can use to handle this.
+    SDValue HiDesc = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    SDValue LoDesc = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0,
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    // First argument to the descriptor call is the address of the descriptor
+    // itself.
+    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
+    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr =
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+
+    // Finally we can make a call to calculate the offset from tpidr_el0.
+    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+  } else
+    llvm_unreachable("Unsupported ELF TLS access model");
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+}
+
+SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerDarwinGlobalTLSAddress(Op, DAG);
+  else if (Subtarget->isTargetELF())
+    return LowerELFGlobalTLSAddress(Op, DAG);
+
+  llvm_unreachable("Unexpected platform trying to use TLS");
+}
+SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  // Handle f128 first, since lowering it will result in comparing the return
+  // value of a libcall against zero, which is just what the rest of LowerBR_CC
+  // is expecting to deal with.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+  // instruction.
+  unsigned Opc = LHS.getOpcode();
+  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->isOne() &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+           "Unexpected condition code.");
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+      return SDValue();
+
+    // The actual operation with overflow check.
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
+
+    if (CC == ISD::SETNE)
+      OFCC = getInvertedCondCode(OFCC);
+    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
+                       CCVal, Overflow);
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    // If the RHS of the comparison is zero, we can potentially fold this
+    // to a specialized branch.
+    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if (RHSC && RHSC->getZExtValue() == 0) {
+      if (CC == ISD::SETEQ) {
+        // See if we can use a TBZ to fold in an AND as well.
+        // TBZ has a smaller branch displacement than CBZ.  If the offset is
+        // out of bounds, a late MI-layer pass rewrites branches.
+        // 403.gcc is an example that hits this case.
+        if (LHS.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(LHS.getOperand(1)) &&
+            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+          SDValue Test = LHS.getOperand(0);
+          uint64_t Mask = LHS.getConstantOperandVal(1);
+
+          // TBZ only operates on i64's, but the ext should be free.
+          if (Test.getValueType() == MVT::i32)
+            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
+                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+        }
+
+        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETNE) {
+        // See if we can use a TBZ to fold in an AND as well.
+        // TBZ has a smaller branch displacement than CBZ.  If the offset is
+        // out of bounds, a late MI-layer pass rewrites branches.
+        // 403.gcc is an example that hits this case.
+        if (LHS.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(LHS.getOperand(1)) &&
+            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+          SDValue Test = LHS.getOperand(0);
+          uint64_t Mask = LHS.getConstantOperandVal(1);
+
+          // TBNZ only operates on i64's, but the ext should be free.
+          if (Test.getValueType() == MVT::i32)
+            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
+                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+        }
+
+        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+      }
+    }
+
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+  SDValue BR1 =
+      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+                       Cmp);
+  }
+
+  return BR1;
+}
+
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  SDValue In1 = Op.getOperand(0);
+  SDValue In2 = Op.getOperand(1);
+  EVT SrcVT = In2.getValueType();
+  if (SrcVT != VT) {
+    if (SrcVT == MVT::f32 && VT == MVT::f64)
+      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+    else if (SrcVT == MVT::f64 && VT == MVT::f32)
+      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
+    else
+      // FIXME: Src type is different, bail out for now. Can VT really be a
+      // vector type?
+      return SDValue();
+  }
+
+  EVT VecVT;
+  EVT EltVT;
+  SDValue EltMask, VecVal1, VecVal2;
+  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+    EltVT = MVT::i32;
+    VecVT = MVT::v4i32;
+    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+    EltVT = MVT::i64;
+    VecVT = MVT::v2i64;
+
+    // We want to materialize a mask with the the high bit set, but the AdvSIMD
+    // immediate moves cannot materialize that in a single instruction for
+    // 64-bit elements. Instead, materialize zero and then negate it.
+    EltMask = DAG.getConstant(0, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else {
+    llvm_unreachable("Invalid type for copysign!");
+  }
+
+  std::vector<SDValue> BuildVectorOps;
+  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
+    BuildVectorOps.push_back(EltMask);
+
+  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
+
+  // If we couldn't materialize the mask above, then the mask vector will be
+  // the zero vector, and we need to negate it here.
+  if (VT == MVT::f64 || VT == MVT::v2f64) {
+    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
+    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
+    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
+  }
+
+  SDValue Sel =
+      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
+
+  if (VT == MVT::f32)
+    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
+  else if (VT == MVT::f64)
+    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
+  else
+    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+}
+
+SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+    return SDValue();
+
+  // While there is no integer popcount instruction, it can
+  // be more efficiently lowered to the following sequence that uses
+  // AdvSIMD registers/instructions as long as the copies to/from
+  // the AdvSIMD registers are cheap.
+  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
+  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
+  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
+  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
+  SDValue Val = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
+
+  SDValue VecVal;
+  if (VT == MVT::i32) {
+    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
+    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
+                                       VecVal);
+  } else {
+    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
+  }
+
+  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+  SDValue UaddLV = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
+
+  if (VT == MVT::i64)
+    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+  return UaddLV;
+}
+
+SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+  if (Op.getValueType().isVector())
+    return LowerVSETCC(Op, DAG);
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc dl(Op);
+
+  // We chose ZeroOrOneBooleanContents, so use zero and one.
+  EVT VT = Op.getValueType();
+  SDValue TVal = DAG.getConstant(1, VT);
+  SDValue FVal = DAG.getConstant(0, VT);
+
+  // Handle f128 first, since one possible outcome is a normal integer
+  // comparison which gets picked up by the next if statement.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, use it.
+    if (!RHS.getNode()) {
+      assert(LHS.getValueType() == Op.getValueType() &&
+             "Unexpected setcc expansion!");
+      return LHS;
+    }
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    SDValue CCVal;
+    SDValue Cmp =
+        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
+
+    // Note that we inverted the condition above, so we reverse the order of
+    // the true and false operands here.  This will allow the setcc to be
+    // matched to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+  }
+
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
+  // and do the comparison.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  if (CC2 == AArch64CC::AL) {
+    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+
+    // Note that we inverted the condition above, so we reverse the order of
+    // the true and false operands here.  This will allow the setcc to be
+    // matched to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+  } else {
+    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
+    // totally clean.  Some of them require two CSELs to implement.  As is in
+    // this case, we emit the first CSEL and then emit a second using the output
+    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
+
+    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
+    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+    SDValue CS1 =
+        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+  }
+}
+
+/// A SELECT_CC operation is really some kind of max or min if both values being
+/// compared are, in some sense, equal to the results in either case. However,
+/// it is permissible to compare f32 values and produce directly extended f64
+/// values.
+///
+/// Extending the comparison operands would also be allowed, but is less likely
+/// to happen in practice since their use is right here. Note that truncate
+/// operations would *not* be semantically equivalent.
+static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
+  if (Cmp == Result)
+    return true;
+
+  ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
+  ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
+  if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
+      Result.getValueType() == MVT::f64) {
+    bool Lossy;
+    APFloat CmpVal = CCmp->getValueAPF();
+    CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
+    return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
+  }
+
+  return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
+}
+
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue CC = Op->getOperand(0);
+  SDValue TVal = Op->getOperand(1);
+  SDValue FVal = Op->getOperand(2);
+  SDLoc DL(Op);
+
+  unsigned Opc = CC.getOpcode();
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+  // instruction.
+  if (CC.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
+      return SDValue();
+
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
+    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
+                       CCVal, Overflow);
+  }
+
+  if (CC.getOpcode() == ISD::SETCC)
+    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
+                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
+  else
+    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
+                           FVal, ISD::SETNE);
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TVal = Op.getOperand(2);
+  SDValue FVal = Op.getOperand(3);
+  SDLoc dl(Op);
+
+  // Handle f128 first, because it will result in a comparison of some RTLIB
+  // call result against zero.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  // Handle integers first.
+  if (LHS.getValueType().isInteger()) {
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    unsigned Opcode = AArch64ISD::CSEL;
+
+    // If both the TVal and the FVal are constants, see if we can swap them in
+    // order to for a CSINV or CSINC out of them.
+    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+      std::swap(TVal, FVal);
+      std::swap(CTVal, CFVal);
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+      std::swap(TVal, FVal);
+      std::swap(CTVal, CFVal);
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (TVal.getOpcode() == ISD::XOR) {
+      // If TVal is a NOT we want to swap TVal and FVal so that we can match
+      // with a CSINV rather than a CSEL.
+      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
+
+      if (CVal && CVal->isAllOnesValue()) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+    } else if (TVal.getOpcode() == ISD::SUB) {
+      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
+      // that we can match with a CSNEG rather than a CSEL.
+      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
+
+      if (CVal && CVal->isNullValue()) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+    } else if (CTVal && CFVal) {
+      const int64_t TrueVal = CTVal->getSExtValue();
+      const int64_t FalseVal = CFVal->getSExtValue();
+      bool Swap = false;
+
+      // If both TVal and FVal are constants, see if FVal is the
+      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
+      // instead of a CSEL in that case.
+      if (TrueVal == ~FalseVal) {
+        Opcode = AArch64ISD::CSINV;
+      } else if (TrueVal == -FalseVal) {
+        Opcode = AArch64ISD::CSNEG;
+      } else if (TVal.getValueType() == MVT::i32) {
+        // If our operands are only 32-bit wide, make sure we use 32-bit
+        // arithmetic for the check whether we can use CSINC. This ensures that
+        // the addition in the check will wrap around properly in case there is
+        // an overflow (which would not be the case if we do the check with
+        // 64-bit arithmetic).
+        const uint32_t TrueVal32 = CTVal->getZExtValue();
+        const uint32_t FalseVal32 = CFVal->getZExtValue();
+
+        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
+          Opcode = AArch64ISD::CSINC;
+
+          if (TrueVal32 > FalseVal32) {
+            Swap = true;
+          }
+        }
+        // 64-bit check whether we can use CSINC.
+      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
+        Opcode = AArch64ISD::CSINC;
+
+        if (TrueVal > FalseVal) {
+          Swap = true;
+        }
+      }
+
+      // Swap TVal and FVal if necessary.
+      if (Swap) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+
+      if (Opcode != AArch64ISD::CSEL) {
+        // Drop FVal since we can get its value by simply inverting/negating
+        // TVal.
+        FVal = TVal;
+      }
+    }
+
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+    EVT VT = Op.getValueType();
+    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
+  }
+
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+  assert(LHS.getValueType() == RHS.getValueType());
+  EVT VT = Op.getValueType();
+
+  // Try to match this select into a max/min operation, which have dedicated
+  // opcode in the instruction set.
+  // FIXME: This is not correct in the presence of NaNs, so we only enable this
+  // in no-NaNs mode.
+  if (getTargetMachine().Options.NoNaNsFPMath) {
+    SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
+    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
+        selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
+      CC = ISD::getSetCCSwappedOperands(CC);
+      std::swap(MinMaxLHS, MinMaxRHS);
+    }
+
+    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
+        selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETGT:
+      case ISD::SETGE:
+      case ISD::SETUGT:
+      case ISD::SETUGE:
+      case ISD::SETOGT:
+      case ISD::SETOGE:
+        return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
+        break;
+      case ISD::SETLT:
+      case ISD::SETLE:
+      case ISD::SETULT:
+      case ISD::SETULE:
+      case ISD::SETOLT:
+      case ISD::SETOLE:
+        return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
+        break;
+      }
+    }
+  }
+
+  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
+  // and do the comparison.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two CSELs to implement.
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+  // If we need a second CSEL, emit it, using the output of the first as the
+  // RHS.  We're effectively OR'ing the two CC's together.
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+  }
+
+  // Otherwise, return the output of the first CSEL.
+  return CS1;
+}
+
+SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // Jump table entries as PC relative offsets. No additional tweaking
+  // is necessary here. Just get the address of the jump table.
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO()) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                               AArch64II::MO_G0 | MO_NC));
+  }
+
+  SDValue Hi =
+      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
+  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                      AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+}
+
+SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    // Use the GOT for the large code model on iOS.
+    if (Subtarget->isTargetMachO()) {
+      SDValue GotAddr = DAG.getTargetConstantPool(
+          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+          AArch64II::MO_GOT);
+      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+    }
+
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G3),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G0 | MO_NC));
+  } else {
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
+    // ELF, the only valid one on Darwin.
+    SDValue Hi =
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_PAGE);
+    SDValue Lo = DAG.getTargetConstantPool(
+        CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+        AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO()) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+  } else {
+    SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
+    SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
+                                                             AArch64II::MO_NC);
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  AArch64FunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+  SDLoc DL(Op);
+  SDValue FR =
+      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  // The layout of the va_list struct is specified in the AArch64 Procedure Call
+  // Standard, section B.3.
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  SDLoc DL(Op);
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAList = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SmallVector<SDValue, 4> MemOps;
+
+  // void *__stack at offset 0
+  SDValue Stack =
+      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
+                                MachinePointerInfo(SV), false, false, 8));
+
+  // void *__gr_top at offset 8
+  int GPRSize = FuncInfo->getVarArgsGPRSize();
+  if (GPRSize > 0) {
+    SDValue GRTop, GRTopAddr;
+
+    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                            DAG.getConstant(8, getPointerTy()));
+
+    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
+    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
+                        DAG.getConstant(GPRSize, getPointerTy()));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
+                                  MachinePointerInfo(SV, 8), false, false, 8));
+  }
+
+  // void *__vr_top at offset 16
+  int FPRSize = FuncInfo->getVarArgsFPRSize();
+  if (FPRSize > 0) {
+    SDValue VRTop, VRTopAddr;
+    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                            DAG.getConstant(16, getPointerTy()));
+
+    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
+    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
+                        DAG.getConstant(FPRSize, getPointerTy()));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
+                                  MachinePointerInfo(SV, 16), false, false, 8));
+  }
+
+  // int __gr_offs at offset 24
+  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                                   DAG.getConstant(24, getPointerTy()));
+  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
+                                GROffsAddr, MachinePointerInfo(SV, 24), false,
+                                false, 4));
+
+  // int __vr_offs at offset 28
+  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                                   DAG.getConstant(28, getPointerTy()));
+  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
+                                VROffsAddr, MachinePointerInfo(SV, 28), false,
+                                false, 4));
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
+                                     : LowerAAPCS_VASTART(Op, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
+  // pointer.
+  unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
+                       Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
+                       8, false, false, MachinePointerInfo(DestSV),
+                       MachinePointerInfo(SrcSV));
+}
+
+SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() &&
+         "automatic va_arg instruction only works on Darwin");
+
+  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  unsigned Align = Op.getConstantOperandVal(3);
+
+  SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
+                               MachinePointerInfo(V), false, false, false, 0);
+  Chain = VAList.getValue(1);
+
+  if (Align > 8) {
+    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+    VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                         DAG.getConstant(Align - 1, getPointerTy()));
+    VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
+                         DAG.getConstant(-(int64_t)Align, getPointerTy()));
+  }
+
+  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+  uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+
+  // Scalar integer and FP values smaller than 64 bits are implicitly extended
+  // up to 64 bits.  At the very least, we have to increase the striding of the
+  // vaargs list to match this, and for FP values we need to introduce
+  // FP_ROUND nodes as well.
+  if (VT.isInteger() && !VT.isVector())
+    ArgSize = 8;
+  bool NeedFPTrunc = false;
+  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
+    ArgSize = 8;
+    NeedFPTrunc = true;
+  }
+
+  // Increment the pointer, VAList, to the next vaarg
+  SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                               DAG.getConstant(ArgSize, getPointerTy()));
+  // Store the incremented VAList to the legalized pointer
+  SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
+                                 false, false, 0);
+
+  // Load the actual argument out of the pointer VAList
+  if (NeedFPTrunc) {
+    // Load the value as an f64.
+    SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
+                                 MachinePointerInfo(), false, false, false, 0);
+    // Round the value down to an f32.
+    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
+                                   DAG.getIntPtrConstant(1));
+    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+    // Merge the rounded value with the chain output of the load.
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
+                     false, false, 0);
+}
+
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(), false, false, false, 0);
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
+                                                  EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("sp", AArch64::SP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, getPointerTy());
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  SDValue ARMcc;
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i64));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+                               ISD::SETGE, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+
+  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+  // AArch64 shifts larger than the register width are wrapped rather than
+  // clamped, so we can't just emit "hi >> x".
+  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue TrueValHi = Opc == ISD::SRA
+                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                                        DAG.getConstant(VTBits - 1, MVT::i64))
+                          : DAG.getConstant(0, VT);
+  SDValue Hi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  SDValue ARMcc;
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i64));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+
+  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+                               ISD::SETGE, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+  SDValue Hi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+
+  // AArch64 shifts of larger than register sizes are wrapped rather than
+  // clamped, so we can't just emit "lo << a" if a is too big.
+  SDValue TrueValLo = DAG.getConstant(0, VT);
+  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+bool AArch64TargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  // The AArch64 target doesn't support folding offsets into global addresses.
+  return false;
+}
+
+bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
+  // FIXME: We should be able to handle f128 as well with a clever lowering.
+  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
+    return true;
+
+  if (VT == MVT::f64)
+    return AArch64_AM::getFP64Imm(Imm) != -1;
+  else if (VT == MVT::f32)
+    return AArch64_AM::getFP32Imm(Imm) != -1;
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                          AArch64 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//                          AArch64 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Table of Constraints
+// TODO: This is the current set of constraints supported by ARM for the
+// compiler, not all of them may make sense, e.g. S may be difficult to support.
+//
+// r - A general register
+// w - An FP/SIMD register of some size in the range v0-v31
+// x - An FP/SIMD register of some size in the range v0-v15
+// I - Constant that can be used with an ADD instruction
+// J - Constant that can be used with a SUB instruction
+// K - Constant that can be used with a 32-bit logical instruction
+// L - Constant that can be used with a 64-bit logical instruction
+// M - Constant that can be used as a 32-bit MOV immediate
+// N - Constant that can be used as a 64-bit MOV immediate
+// Q - A memory reference with base register and no offset
+// S - A symbolic address
+// Y - Floating point constant zero
+// Z - Integer constant zero
+//
+//   Note that general register operands will be output using their 64-bit x
+// register name, whatever the size of the variable, unless the asm operand
+// is prefixed by the %w modifier. Floating-point and SIMD register operands
+// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
+// %q modifier.
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AArch64TargetLowering::ConstraintType
+AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'z':
+      return C_Other;
+    case 'x':
+    case 'w':
+      return C_RegisterClass;
+    // An address with a single base register. Due to the way we
+    // currently handle addresses it is the same as 'r'.
+    case 'Q':
+      return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+AArch64TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'x':
+  case 'w':
+    if (type->isFloatingPointTy() || type->isVectorTy())
+      weight = CW_Register;
+    break;
+  case 'z':
+    weight = CW_Constant;
+    break;
+  }
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+AArch64TargetLowering::getRegForInlineAsmConstraint(
+    const std::string &Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
+      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
+    case 'w':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, &AArch64::FPR32RegClass);
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
+      break;
+    // The instructions that this constraint is designed for can
+    // only take 128-bit registers so just use that regclass.
+    case 'x':
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
+      break;
+    }
+  }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass *> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (!Res.second) {
+    unsigned Size = Constraint.size();
+    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
+      const std::string Reg =
+          std::string(&Constraint[2], &Constraint[Size - 1]);
+      int RegNo = atoi(Reg.c_str());
+      if (RegNo >= 0 && RegNo <= 31) {
+        // v0 - v31 are aliases of q0 - q31.
+        // By default we'll emit v0-v31 for this unless there's a modifier where
+        // we'll emit the correct register as well.
+        Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+        Res.second = &AArch64::FPR128RegClass;
+      }
+    }
+  }
+
+  return Res;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void AArch64TargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1)
+    return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default:
+    break;
+
+  // This set of constraints deal with valid constants for various instructions.
+  // Validate and return a target constant for them if we can.
+  case 'z': {
+    // 'z' maps to xzr or wzr so it needs an input of 0.
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C || C->getZExtValue() != 0)
+      return;
+
+    if (Op.getValueType() == MVT::i64)
+      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
+    else
+      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
+    break;
+  }
+
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    // Grab the value and do some validation.
+    uint64_t CVal = C->getZExtValue();
+    switch (ConstraintLetter) {
+    // The I constraint applies only to simple ADD or SUB immediate operands:
+    // i.e. 0 to 4095 with optional shift by 12
+    // The J constraint applies only to ADD or SUB immediates that would be
+    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
+    // instruction [or vice versa], in other words -1 to -4095 with optional
+    // left shift by 12.
+    case 'I':
+      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
+        break;
+      return;
+    case 'J': {
+      uint64_t NVal = -C->getSExtValue();
+      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+        break;
+      return;
+    }
+    // The K and L constraints apply *only* to logical immediates, including
+    // what used to be the MOVI alias for ORR (though the MOVI alias has now
+    // been removed and MOV should be used). So these constraints have to
+    // distinguish between bit patterns that are valid 32-bit or 64-bit
+    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
+    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
+    // versa.
+    case 'K':
+      if (AArch64_AM::isLogicalImmediate(CVal, 32))
+        break;
+      return;
+    case 'L':
+      if (AArch64_AM::isLogicalImmediate(CVal, 64))
+        break;
+      return;
+    // The M and N constraints are a superset of K and L respectively, for use
+    // with the MOV (immediate) alias. As well as the logical immediates they
+    // also match 32 or 64-bit immediates that can be loaded either using a
+    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
+    // (M) or 64-bit 0x1234000000000000 (N) etc.
+    // As a note some of this code is liberally stolen from the asm parser.
+    case 'M': {
+      if (!isUInt<32>(CVal))
+        return;
+      if (AArch64_AM::isLogicalImmediate(CVal, 32))
+        break;
+      if ((CVal & 0xFFFF) == CVal)
+        break;
+      if ((CVal & 0xFFFF0000ULL) == CVal)
+        break;
+      uint64_t NCVal = ~(uint32_t)CVal;
+      if ((NCVal & 0xFFFFULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF0000ULL) == NCVal)
+        break;
+      return;
+    }
+    case 'N': {
+      if (AArch64_AM::isLogicalImmediate(CVal, 64))
+        break;
+      if ((CVal & 0xFFFFULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF0000ULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF00000000ULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF000000000000ULL) == CVal)
+        break;
+      uint64_t NCVal = ~CVal;
+      if ((NCVal & 0xFFFFULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF0000ULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
+        break;
+      return;
+    }
+    default:
+      return;
+    }
+
+    // All assembler immediates are 64-bit integers.
+    Result = DAG.getTargetConstant(CVal, MVT::i64);
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+//                     AArch64 Advanced SIMD Support
+//===----------------------------------------------------------------------===//
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
+  EVT VT = V64Reg.getValueType();
+  unsigned NarrowSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+  SDLoc DL(V64Reg);
+
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
+                     V64Reg, DAG.getConstant(0, MVT::i32));
+}
+
+/// getExtFactor - Determine the adjustment factor for the position when
+/// generating an "extract from vector registers" instruction.
+static unsigned getExtFactor(SDValue &V) {
+  EVT EltType = V.getValueType().getVectorElementType();
+  return EltType.getSizeInBits() / 8;
+}
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+  EVT VT = V128Reg.getValueType();
+  unsigned WideSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+  SDLoc DL(V128Reg);
+
+  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 2> SourceVecs;
+  SmallVector<unsigned, 2> MinElts;
+  SmallVector<unsigned, 2> MaxElts;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
+    }
+
+    // Record this extraction against the appropriate vector if possible...
+    SDValue SourceVec = V.getOperand(0);
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    bool FoundSource = false;
+    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+      if (SourceVecs[j] == SourceVec) {
+        if (MinElts[j] > EltNo)
+          MinElts[j] = EltNo;
+        if (MaxElts[j] < EltNo)
+          MaxElts[j] = EltNo;
+        FoundSource = true;
+        break;
+      }
+    }
+
+    // Or record a new source if not...
+    if (!FoundSource) {
+      SourceVecs.push_back(SourceVec);
+      MinElts.push_back(EltNo);
+      MaxElts.push_back(EltNo);
+    }
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // involved.
+  if (SourceVecs.size() > 2)
+    return SDValue();
+
+  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+  int VEXTOffsets[2] = { 0, 0 };
+  int OffsetMultipliers[2] = { 1, 1 };
+
+  // This loop extracts the usage patterns of the source vectors
+  // and prepares appropriate SDValues for a shuffle if possible.
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
+    SDValue CurSource = SourceVecs[i];
+    if (SourceVecs[i].getValueType().getVectorElementType() !=
+        VT.getVectorElementType()) {
+      // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
+      // Then bitcast it to the vector which holds asserted element type,
+      // and record the multiplier of element width between SourceVecs and
+      // Build_vector which is needed to extract the correct lanes later.
+      EVT CastVT =
+          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                           SourceVecs[i].getValueSizeInBits() /
+                               VT.getVectorElementType().getSizeInBits());
+
+      CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
+      OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
+      NumSrcElts *= OffsetMultipliers[i];
+      MaxElts[i] *= OffsetMultipliers[i];
+      MinElts[i] *= OffsetMultipliers[i];
+    }
+
+    if (CurSource.getValueType() == VT) {
+      // No VEXT necessary
+      ShuffleSrcs[i] = CurSource;
+      VEXTOffsets[i] = 0;
+      continue;
+    } else if (NumSrcElts < NumElts) {
+      // We can pad out the smaller vector for free, so if it's part of a
+      // shuffle...
+      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
+                                   DAG.getUNDEF(CurSource.getValueType()));
+      continue;
+    }
+
+    // Since only 64-bit and 128-bit vectors are legal on ARM and
+    // we've eliminated the other cases...
+    assert(NumSrcElts == 2 * NumElts &&
+           "unexpected vector sizes in ReconstructShuffle");
+
+    if (MaxElts[i] - MinElts[i] >= NumElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (MinElts[i] >= NumElts) {
+      // The extraction can just take the second half
+      VEXTOffsets[i] = NumElts;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                   DAG.getIntPtrConstant(NumElts));
+    } else if (MaxElts[i] < NumElts) {
+      // The extraction can just take the first half
+      VEXTOffsets[i] = 0;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                   DAG.getIntPtrConstant(0));
+    } else {
+      // An actual VEXT is needed
+      VEXTOffsets[i] = MinElts[i];
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                     DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                     DAG.getIntPtrConstant(NumElts));
+      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
+      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
+                                   DAG.getConstant(Imm, MVT::i32));
+    }
+  }
+
+  SmallVector<int, 8> Mask;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    SDValue ExtractVec = Entry.getOperand(0);
+    int ExtractElt =
+        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
+    if (ExtractVec == SourceVecs[0]) {
+      Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
+    } else {
+      Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
+                     VEXTOffsets[1]);
+    }
+  }
+
+  // Final check before we try to produce nonsense...
+  if (isShuffleMaskLegal(Mask, VT))
+    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+                                &Mask[0]);
+
+  return SDValue();
+}
+
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are different.
+static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+                      unsigned &Imm) {
+  // Look for the first non-undef element.
+  const int *FirstRealElt = std::find_if(M.begin(), M.end(),
+      [](int Elt) {return Elt >= 0;});
+
+  // Benefit form APInt to handle overflow when calculating expected element.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+  // The following shuffle indices must be the successive elements after the
+  // first real element.
+  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+  if (FirstWrongElt != M.end())
+    return false;
+
+  // The index of an EXT is the first element if it is not UNDEF.
+  // Watch out for the beginning UNDEFs. The EXT index should be the expected
+  // value of the first element.  E.g. 
+  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+  // ExpectedElt is the last mask index plus 1.
+  Imm = ExpectedElt.getZExtValue();
+
+  // There are two difference cases requiring to reverse input vectors.
+  // For example, for vector <4 x i32> we have the following cases,
+  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+  // to reverse two input vectors.
+  if (Imm < NumElts)
+    ReverseEXT = true;
+  else
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for REV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+
+  return true;
+}
+
+static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != 2 * i + WhichResult)
+      return false;
+  }
+
+  return true;
+}
+
+static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
+      return false;
+    Idx += 1;
+  }
+
+  return true;
+}
+
+/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned j = 0; j != 2; ++j) {
+    unsigned Idx = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      int MIdx = M[i + j * Half];
+      if (MIdx >= 0 && (unsigned)MIdx != Idx)
+        return false;
+      Idx += 2;
+    }
+  }
+
+  return true;
+}
+
+/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+static bool isINSMask(ArrayRef<int> M, int NumInputElements,
+                      bool &DstIsLeft, int &Anomaly) {
+  if (M.size() != static_cast<size_t>(NumInputElements))
+    return false;
+
+  int NumLHSMatch = 0, NumRHSMatch = 0;
+  int LastLHSMismatch = -1, LastRHSMismatch = -1;
+
+  for (int i = 0; i < NumInputElements; ++i) {
+    if (M[i] == -1) {
+      ++NumLHSMatch;
+      ++NumRHSMatch;
+      continue;
+    }
+
+    if (M[i] == i)
+      ++NumLHSMatch;
+    else
+      LastLHSMismatch = i;
+
+    if (M[i] == i + NumInputElements)
+      ++NumRHSMatch;
+    else
+      LastRHSMismatch = i;
+  }
+
+  if (NumLHSMatch == NumInputElements - 1) {
+    DstIsLeft = true;
+    Anomaly = LastLHSMismatch;
+    return true;
+  } else if (NumRHSMatch == NumInputElements - 1) {
+    DstIsLeft = false;
+    Anomaly = LastRHSMismatch;
+    return true;
+  }
+
+  return false;
+}
+
+static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
+  if (VT.getSizeInBits() != 128)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (int I = 0, E = NumElts / 2; I != E; I++) {
+    if (Mask[I] != I)
+      return false;
+  }
+
+  int Offset = NumElts / 2;
+  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
+    if (Mask[I] != I + SplitLHS * Offset)
+      return false;
+  }
+
+  return true;
+}
+
+static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
+      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
+    return SDValue();
+
+  bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
+
+  if (!isConcatMask(Mask, VT, SplitV0))
+    return SDValue();
+
+  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+  if (SplitV0) {
+    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
+                     DAG.getConstant(0, MVT::i64));
+  }
+  if (V1.getValueType().getSizeInBits() == 128) {
+    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
+                     DAG.getConstant(0, MVT::i64));
+  }
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      SDLoc dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1 * 9 + 2) * 9 + 3)
+      return LHS;
+    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default:
+    llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> REV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> REV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3: {
+    EVT EltTy = VT.getVectorElementType();
+    unsigned Opcode;
+    if (EltTy == MVT::i8)
+      Opcode = AArch64ISD::DUPLANE8;
+    else if (EltTy == MVT::i16)
+      Opcode = AArch64ISD::DUPLANE16;
+    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
+      Opcode = AArch64ISD::DUPLANE32;
+    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
+      Opcode = AArch64ISD::DUPLANE64;
+    else
+      llvm_unreachable("Invalid vector element type?");
+
+    if (VT.getSizeInBits() == 64)
+      OpLHS = WidenVector(OpLHS, DAG);
+    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
+    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
+  }
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3: {
+    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
+    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
+                       DAG.getConstant(Imm, MVT::i32));
+  }
+  case OP_VUZPL:
+    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VUZPR:
+    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VZIPL:
+    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VZIPR:
+    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VTRNL:
+    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VTRNR:
+    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  }
+}
+
+static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
+                           SelectionDAG &DAG) {
+  // Check to see if we can use the TBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
+
+  SmallVector<SDValue, 8> TBLMask;
+  for (int Val : ShuffleMask) {
+    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+      unsigned Offset = Byte + Val * BytesPerElt;
+      TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
+    }
+  }
+
+  MVT IndexVT = MVT::v8i8;
+  unsigned IndexLen = 8;
+  if (Op.getValueType().getSizeInBits() == 128) {
+    IndexVT = MVT::v16i8;
+    IndexLen = 16;
+  }
+
+  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
+  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
+
+  SDValue Shuffle;
+  if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+    if (IndexLen == 8)
+      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
+    Shuffle = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+                    makeArrayRef(TBLMask.data(), IndexLen)));
+  } else {
+    if (IndexLen == 8) {
+      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
+      Shuffle = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
+          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+                      makeArrayRef(TBLMask.data(), IndexLen)));
+    } else {
+      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
+      // cannot currently represent the register constraints on the input
+      // table registers.
+      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
+      //                   DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+      //                               &TBLMask[0], IndexLen));
+      Shuffle = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
+          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+                      makeArrayRef(TBLMask.data(), IndexLen)));
+    }
+  }
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
+static unsigned getDUPLANEOp(EVT EltType) {
+  if (EltType == MVT::i8)
+    return AArch64ISD::DUPLANE8;
+  if (EltType == MVT::i16)
+    return AArch64ISD::DUPLANE16;
+  if (EltType == MVT::i32 || EltType == MVT::f32)
+    return AArch64ISD::DUPLANE32;
+  if (EltType == MVT::i64 || EltType == MVT::f64)
+    return AArch64ISD::DUPLANE64;
+
+  llvm_unreachable("Invalid vector element type?");
+}
+
+SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+
+  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
+                                       V1.getValueType().getSimpleVT())) {
+    int Lane = SVN->getSplatIndex();
+    // If this is undef splat, generate it via "just" vdup, if possible.
+    if (Lane == -1)
+      Lane = 0;
+
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+                         V1.getOperand(0));
+    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+    // constant. If so, we can just reference the lane's definition directly.
+    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(Lane)))
+      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+    // Otherwise, duplicate from the lane of the input vector.
+    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+    // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
+    // to make a vector of the same size as this SHUFFLE. We can ignore the
+    // extract entirely, and canonicalise the concat using WidenVector.
+    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+      V1 = V1.getOperand(0);
+    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+      Lane -= Idx * VT.getVectorNumElements() / 2;
+      V1 = WidenVector(V1.getOperand(Idx), DAG);
+    } else if (VT.getSizeInBits() == 64)
+      V1 = WidenVector(V1, DAG);
+
+    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
+  }
+
+  if (isREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+  bool ReverseEXT = false;
+  unsigned Imm;
+  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+    if (ReverseEXT)
+      std::swap(V1, V2);
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+                       DAG.getConstant(Imm, MVT::i32));
+  } else if (V2->getOpcode() == ISD::UNDEF &&
+             isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+                       DAG.getConstant(Imm, MVT::i32));
+  }
+
+  unsigned WhichResult;
+  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+
+  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+
+  SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
+  if (Concat.getNode())
+    return Concat;
+
+  bool DstIsLeft;
+  int Anomaly;
+  int NumInputElements = V1.getValueType().getVectorNumElements();
+  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
+    SDValue DstVec = DstIsLeft ? V1 : V2;
+    SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
+
+    SDValue SrcVec = V1;
+    int SrcLane = ShuffleMask[Anomaly];
+    if (SrcLane >= NumInputElements) {
+      SrcVec = V2;
+      SrcLane -= VT.getVectorNumElements();
+    }
+    SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
+
+    EVT ScalarVT = VT.getVectorElementType();
+    if (ScalarVT.getSizeInBits() < 32)
+      ScalarVT = MVT::i32;
+
+    return DAG.getNode(
+        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
+        DstLaneV);
+  }
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                            PFIndexes[2] * 9 + PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  return GenerateTBL(Op, ShuffleMask, DAG);
+}
+
+static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
+                               APInt &UndefBits) {
+  EVT VT = BVN->getValueType(0);
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
+
+    for (unsigned i = 0; i < NumSplats; ++i) {
+      CnstBits <<= SplatBitSize;
+      UndefBits <<= SplatBitSize;
+      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
+      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  SDValue LHS = Op.getOperand(0);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  if (!BVN)
+    return Op;
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We only have BIC vector immediate instruction, which is and-not.
+    CnstBits = ~CnstBits;
+
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = ~UndefBits;
+    goto AttemptModImm;
+  }
+
+// We can always fall back to a non-immediate AND.
+FailedModImm:
+  return Op;
+}
+
+// Specialized code to quickly find if PotentialBVec is a BuildVector that
+// consists of only the same constant int value, returned in reference arg
+// ConstVal
+static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
+                                     uint64_t &ConstVal) {
+  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
+  if (!Bvec)
+    return false;
+  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
+  if (!FirstElt)
+    return false;
+  EVT VT = Bvec->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 1; i < NumElts; ++i)
+    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
+      return false;
+  ConstVal = FirstElt->getZExtValue();
+  return true;
+}
+
+static unsigned getIntrinsicID(const SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  default:
+    return Intrinsic::not_intrinsic;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return IID;
+    return Intrinsic::not_intrinsic;
+  }
+  }
+}
+
+// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
+// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
+// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
+// Also, logical shift right -> sri, with the same structure.
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Is the first op an AND?
+  const SDValue And = N->getOperand(0);
+  if (And.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // Is the second op an shl or lshr?
+  SDValue Shift = N->getOperand(1);
+  // This will have been turned into: AArch64ISD::VSHL vector, #shift
+  // or AArch64ISD::VLSHR vector, #shift
+  unsigned ShiftOpc = Shift.getOpcode();
+  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
+    return SDValue();
+  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+
+  // Is the shift amount constant?
+  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!C2node)
+    return SDValue();
+
+  // Is the and mask vector all constant?
+  uint64_t C1;
+  if (!isAllConstantBuildVector(And.getOperand(1), C1))
+    return SDValue();
+
+  // Is C1 == ~C2, taking into account how much one can shift elements of a
+  // particular size?
+  uint64_t C2 = C2node->getZExtValue();
+  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
+  if (C2 > ElemSizeInBits)
+    return SDValue();
+  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
+  if ((C1 & ElemMask) != (~C2 & ElemMask))
+    return SDValue();
+
+  SDValue X = And.getOperand(0);
+  SDValue Y = Shift.getOperand(0);
+
+  unsigned Intrin =
+      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
+  SDValue ResultSLI =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
+
+  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+  DEBUG(N->dump(&DAG));
+  DEBUG(dbgs() << "into: \n");
+  DEBUG(ResultSLI->dump(&DAG));
+
+  ++NumShiftInserts;
+  return ResultSLI;
+}
+
+SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+  if (EnableAArch64SlrGeneration) {
+    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+  SDValue LHS = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // OR commutes, so try swapping the operands.
+  if (!BVN) {
+    LHS = Op.getOperand(0);
+    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  }
+  if (!BVN)
+    return Op;
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = UndefBits;
+    goto AttemptModImm;
+  }
+
+// We can always fall back to a non-immediate OR.
+FailedModImm:
+  return Op;
+}
+
+// Normalize the operands of BUILD_VECTOR. The value of constant operands will
+// be truncated to fit element width.
+static SDValue NormalizeBuildVector(SDValue Op,
+                                    SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  EVT EltTy= VT.getVectorElementType();
+
+  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
+    return Op;
+
+  SmallVector<SDValue, 16> Ops;
+  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
+    SDValue Lane = Op.getOperand(I);
+    if (Lane.getOpcode() == ISD::Constant) {
+      APInt LowBits(EltTy.getSizeInBits(),
+                    cast<ConstantSDNode>(Lane)->getZExtValue());
+      Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
+    }
+    Ops.push_back(Lane);
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  Op = NormalizeBuildVector(Op, DAG);
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      // Certain magic vector constants (used to express things like NOT
+      // and NEG) are passed through unmodified.  This allows codegen patterns
+      // for these operations to match.  Special-purpose patterns will lower
+      // these immediates to MOVIs if it proves necessary.
+      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
+        return Op;
+
+      // The many faces of MOVI...
+      if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
+        if (VT.getSizeInBits() == 128) {
+          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
+                                    DAG.getConstant(CnstVal, MVT::i32));
+          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        }
+
+        // Support the V64 version via subregister insertion.
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(264, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(272, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      // The few faces of FMOV...
+      if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
+        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
+          VT.getSizeInBits() == 128) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
+        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      // The many faces of MVNI...
+      CnstVal = ~CnstVal;
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(264, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(272, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = UndefBits;
+    goto AttemptModImm;
+  }
+FailedModImm:
+
+  // Scan through the operands to find some interesting properties we can
+  // exploit:
+  //   1) If only one value is used, we can use a DUP, or
+  //   2) if only the low element is not undef, we can just insert that, or
+  //   3) if only one constant value is used (w/ some non-constant lanes),
+  //      we can splat the constant value into the whole vector then fill
+  //      in the non-constant lanes.
+  //   4) FIXME: If different constant values are used, but we can intelligently
+  //             select the values we'll be overwriting for the non-constant
+  //             lanes such that we can directly materialize the vector
+  //             some other way (MOVI, e.g.), we can be sneaky.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool usesOnlyOneConstantValue = true;
+  bool isConstant = true;
+  unsigned NumConstantLanes = 0;
+  SDValue Value;
+  SDValue ConstantValue;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
+      ++NumConstantLanes;
+      if (!ConstantValue.getNode())
+        ConstantValue = V;
+      else if (ConstantValue != V)
+        usesOnlyOneConstantValue = false;
+    }
+
+    if (!Value.getNode())
+      Value = V;
+    else if (V != Value)
+      usesOnlyOneValue = false;
+  }
+
+  if (!Value.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  // Use DUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (usesOnlyOneValue) {
+    if (!isConstant) {
+      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+          Value.getValueType() != VT)
+        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
+
+      // This is actually a DUPLANExx operation, which keeps everything vectory.
+
+      // DUPLANE works on 128-bit vectors, widen it if necessary.
+      SDValue Lane = Value.getOperand(1);
+      Value = Value.getOperand(0);
+      if (Value.getValueType().getSizeInBits() == 64)
+        Value = WidenVector(Value, DAG);
+
+      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+      return DAG.getNode(Opcode, dl, VT, Value, Lane);
+    }
+
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      MVT NewType =
+          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+      Val = LowerBUILD_VECTOR(Val, DAG);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+    }
+  }
+
+  // If there was only one constant value used and for more than one lane,
+  // start by splatting that value, then replace the non-constant lanes. This
+  // is better than the default, which will perform a separate initialization
+  // for each lane.
+  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+    SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+    // Now insert the non-constant lanes.
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+        // Note that type legalization likely mucked about with the VT of the
+        // source operand, so we may have to convert it here before inserting.
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
+      }
+    }
+    return Val;
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
+  }
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    SDValue Op0 = Op.getOperand(0);
+    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
+    unsigned i = 0;
+    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+    // a) Avoid a RMW dependency on the full vector register, and
+    // b) Allow the register coalescer to fold away the copy if the
+    //    value is already in an S or D register.
+    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
+      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
+      MachineSDNode *N =
+          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
+                             DAG.getTargetConstant(SubIdx, MVT::i32));
+      Vec = SDValue(N, 0);
+      ++i;
+    }
+    for (; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
+  // Just use the default expansion. We failed to find a better alternative.
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
+    return SDValue();
+
+
+  // Insertion/extraction are legal for V128 types.
+  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+    return Op;
+
+  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+      VT != MVT::v1i64 && VT != MVT::v2f32)
+    return SDValue();
+
+  // For V64 types, we perform insertion by expanding the value
+  // to a V128 type and perform the insertion on that.
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+  EVT WideTy = WideVec.getValueType();
+
+  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
+                             Op.getOperand(1), Op.getOperand(2));
+  // Re-narrow the resultant vector.
+  return NarrowVector(Node, DAG);
+}
+
+SDValue
+AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
+    return SDValue();
+
+
+  // Insertion/extraction are legal for V128 types.
+  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+    return Op;
+
+  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+      VT != MVT::v1i64 && VT != MVT::v2f32)
+    return SDValue();
+
+  // For V64 types, we perform extraction by expanding the value
+  // to a V128 type and perform the extraction on that.
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+  EVT WideTy = WideVec.getValueType();
+
+  EVT ExtrTy = WideTy.getVectorElementType();
+  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
+    ExtrTy = MVT::i32;
+
+  // For extractions, we just return the result directly.
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
+                     Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getOperand(0).getValueType();
+  SDLoc dl(Op);
+  // Just in case...
+  if (!VT.isVector())
+    return SDValue();
+
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Cst)
+    return SDValue();
+  unsigned Val = Cst->getZExtValue();
+
+  unsigned Size = Op.getValueType().getSizeInBits();
+  if (Val == 0) {
+    switch (Size) {
+    case 8:
+      return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    case 16:
+      return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    case 32:
+      return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    case 64:
+      return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    default:
+      llvm_unreachable("Unexpected vector type in extract_subvector!");
+    }
+  }
+  // If this is extracting the upper 64-bits of a 128-bit vector, we match
+  // that directly.
+  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
+    return Op;
+
+  return SDValue();
+}
+
+bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                               EVT VT) const {
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                            PFIndexes[2] * 9 + PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool DummyBool;
+  int DummyInt;
+  unsigned DummyUnsigned;
+
+  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
+          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
+          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
+          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
+          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
+          isZIPMask(M, VT, DummyUnsigned) ||
+          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
+          isConcatMask(M, VT, VT.getSizeInBits() == 128));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                    HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation.  For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (isIntrinsic)
+    Cnt = -Cnt;
+  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+}
+
+SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  int64_t Cnt;
+
+  if (!Op.getOperand(1).getValueType().isVector())
+    return Op;
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
+      return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
+                       Op.getOperand(0), Op.getOperand(1));
+  case ISD::SRA:
+  case ISD::SRL:
+    // Right shift immediate
+    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
+        Cnt < EltSize) {
+      unsigned Opc =
+          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
+      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    }
+
+    // Right shift register.  Note, there is not a shift right register
+    // instruction, but the shift left register instruction takes a signed
+    // value, where negative numbers specify a right shift.
+    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
+                                                : Intrinsic::aarch64_neon_ushl;
+    // negate the shift amount
+    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
+    SDValue NegShiftLeft =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
+    return NegShiftLeft;
+  }
+
+  return SDValue();
+}
+
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    SDLoc dl, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+  bool IsZero = IsCnst && (CnstBits == 0);
+
+  if (SrcVT.getVectorElementType().isFloatingPoint()) {
+    switch (CC) {
+    default:
+      return SDValue();
+    case AArch64CC::NE: {
+      SDValue Fcmeq;
+      if (IsZero)
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      else
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+    }
+    case AArch64CC::EQ:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+    case AArch64CC::GE:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
+    case AArch64CC::GT:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+    case AArch64CC::LS:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
+    case AArch64CC::LT:
+      if (!NoNans)
+        return SDValue();
+    // If we ignore NaNs then we can use to the MI implementation.
+    // Fallthrough.
+    case AArch64CC::MI:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
+    }
+  }
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
+    SDValue Cmeq;
+    if (IsZero)
+      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+    else
+      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+    return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
+  }
+  case AArch64CC::EQ:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+  case AArch64CC::GE:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
+  case AArch64CC::GT:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
+  case AArch64CC::LE:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
+  case AArch64CC::LO:
+    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
+  case AArch64CC::HI:
+    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
+  case AArch64CC::HS:
+    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  if (LHS.getValueType().getVectorElementType().isInteger()) {
+    assert(LHS.getValueType() == RHS.getValueType());
+    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+    return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
+                                dl, DAG);
+  }
+
+  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
+         LHS.getValueType().getVectorElementType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  AArch64CC::CondCode CC1, CC2;
+  bool ShouldInvert;
+  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+
+  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+  SDValue Cmp =
+      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+  if (!Cmp.getNode())
+    return SDValue();
+
+  if (CC2 != AArch64CC::AL) {
+    SDValue Cmp2 =
+        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+    if (!Cmp2.getNode())
+      return SDValue();
+
+    Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
+  }
+
+  if (ShouldInvert)
+    return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
+
+  return Cmp;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                               const CallInst &I,
+                                               unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+  case Intrinsic::aarch64_neon_ld1x2:
+  case Intrinsic::aarch64_neon_ld1x3:
+  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld2lane:
+  case Intrinsic::aarch64_neon_ld3lane:
+  case Intrinsic::aarch64_neon_ld4lane:
+  case Intrinsic::aarch64_neon_ld2r:
+  case Intrinsic::aarch64_neon_ld3r:
+  case Intrinsic::aarch64_neon_ld4r: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.offset = 0;
+    Info.align = 0;
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4:
+  case Intrinsic::aarch64_neon_st1x2:
+  case Intrinsic::aarch64_neon_st1x3:
+  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st2lane:
+  case Intrinsic::aarch64_neon_st3lane:
+  case Intrinsic::aarch64_neon_st4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.offset = 0;
+    Info.align = 0;
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::aarch64_ldaxr:
+  case Intrinsic::aarch64_ldxr: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_stlxr:
+  case Intrinsic::aarch64_stxr: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::aarch64_ldaxp:
+  case Intrinsic::aarch64_ldxp: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 16;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_stlxp:
+  case Intrinsic::aarch64_stxp: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 16;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// Truncations from 64-bit GPR to 32-bit GPR is free.
+bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 > NumBits2;
+}
+bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+// All 32-bit GPR operations implicitly zero the high-half of the corresponding
+// 64-bit GPR.
+bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 == 32 && NumBits2 == 64;
+}
+bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 == 32 && NumBits2 == 64;
+}
+
+bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2)) {
+    return true;
+  }
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
+  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
+          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
+          VT1.getSizeInBits() <= 32);
+}
+
+bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
+                                          unsigned &RequiredAligment) const {
+  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
+    return false;
+  // Cyclone supports unaligned accesses.
+  RequiredAligment = 0;
+  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
+  return NumBits == 32 || NumBits == 64;
+}
+
+bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
+                                          unsigned &RequiredAligment) const {
+  if (!LoadedType.isSimple() ||
+      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
+    return false;
+  // Cyclone supports unaligned accesses.
+  RequiredAligment = 0;
+  unsigned NumBits = LoadedType.getSizeInBits();
+  return NumBits == 32 || NumBits == 64;
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                       unsigned AlignCheck) {
+  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+          (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                               unsigned SrcAlign, bool IsMemset,
+                                               bool ZeroMemset,
+                                               bool MemcpyStrSrc,
+                                               MachineFunction &MF) const {
+  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+  // instruction to materialize the v2i64 zero and one store (with restrictive
+  // addressing mode). Just do two i64 store of zero-registers.
+  bool Fast;
+  const Function *F = MF.getFunction();
+  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat) &&
+      (memOpAlign(SrcAlign, DstAlign, 16) ||
+       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+    return MVT::f128;
+
+  return Size >= 8 ? MVT::i64 : MVT::i32;
+}
+
+// 12-bit optionally shifted immediates are legal for adds.
+bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
+  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
+    return true;
+  return false;
+}
+
+// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
+// immediates is the same as for an add or a sub.
+bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
+  if (Immed < 0)
+    Immed *= -1;
+  return isLegalAddImmediate(Immed);
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                  Type *Ty) const {
+  // AArch64 has five basic addressing modes:
+  //  reg
+  //  reg + 9-bit signed offset
+  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
+  //  reg1 + reg2
+  //  reg + SIZE_IN_BYTES * reg
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // No reg+reg+imm addressing.
+  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
+    return false;
+
+  // check reg + imm case:
+  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
+  uint64_t NumBytes = 0;
+  if (Ty->isSized()) {
+    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
+    NumBytes = NumBits / 8;
+    if (!isPowerOf2_64(NumBits))
+      NumBytes = 0;
+  }
+
+  if (!AM.Scale) {
+    int64_t Offset = AM.BaseOffs;
+
+    // 9-bit signed offset
+    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
+      return true;
+
+    // 12-bit unsigned offset
+    unsigned shift = Log2_64(NumBytes);
+    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
+        // Must be a multiple of NumBytes (NumBytes is a power of 2)
+        (Offset >> shift) << shift == Offset)
+      return true;
+    return false;
+  }
+
+  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
+
+  if (!AM.Scale || AM.Scale == 1 ||
+      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
+    return true;
+  return false;
+}
+
+int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                                Type *Ty) const {
+  // Scaling factors are not free at all.
+  // Operands                     | Rt Latency
+  // -------------------------------------------
+  // Rt, [Xn, Xm]                 | 4
+  // -------------------------------------------
+  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
+  // Rt, [Xn, Wm, <extend> #imm]  |
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1 if
+    // it is not equal to 0 or 1.
+    return AM.Scale != 0 && AM.Scale != 1;
+  return -1;
+}
+
+bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+const MCPhysReg *
+AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  // LR is a callee-save register, but we must treat it as clobbered by any call
+  // site. Hence we include LR in the scratch registers, which are in turn added
+  // as implicit-defs for stackmaps and patchpoints.
+  static const MCPhysReg ScratchRegs[] = {
+    AArch64::X16, AArch64::X17, AArch64::LR, 0
+  };
+  return ScratchRegs;
+}
+
+bool
+AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+  EVT VT = N->getValueType(0);
+    // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
+    // it with shift to let it be lowered to UBFX.
+  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+      isa<ConstantSDNode>(N->getOperand(1))) {
+    uint64_t TruncMask = N->getConstantOperandVal(1);
+    if (isMask_64(TruncMask) &&
+      N->getOperand(0).getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+      return false;
+  }
+  return true;
+}
+
+bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                              Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return false;
+
+  int64_t Val = Imm.getSExtValue();
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
+    return true;
+
+  if ((int64_t)Val < 0)
+    Val = ~Val;
+  if (BitSize == 32)
+    Val &= (1LL << 32) - 1;
+
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  unsigned Shift = (63 - LZ) / 16;
+  // MOVZ is free so return true for one or fewer MOVK.
+  return (Shift < 3) ? true : false;
+}
+
+// Generate SUBS and CSEL for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CSEL.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                  N0.getOperand(0));
+        // Generate SUBS & CSEL.
+        SDValue Cmp =
+            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+                        N0.getOperand(0), DAG.getConstant(0, VT));
+        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
+                           DAG.getConstant(AArch64CC::PL, MVT::i32),
+                           SDValue(Cmp.getNode(), 1));
+      }
+  return SDValue();
+}
+
+// performXorCombine - Attempts to handle integer ABS.
+static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  return performIntegerAbsCombine(N, DAG);
+}
+
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Multiplication of a power of two plus/minus one can be done more
+  // cheaply as as shift+add/sub. For now, this is true unilaterally. If
+  // future CPUs have a cheaper MADD instruction, this may need to be
+  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+  // 64-bit is 5 cycles, so this is always a win.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    APInt Value = C->getAPIntValue();
+    EVT VT = N->getValueType(0);
+    if (Value.isNonNegative()) {
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      APInt VM1 = Value - 1;
+      if (VM1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VM1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
+                           N->getOperand(0));
+      }
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      APInt VP1 = Value + 1;
+      if (VP1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VP1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
+                           N->getOperand(0));
+      }
+    } else {
+      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+      APInt VNM1 = -Value - 1;
+      if (VNM1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VNM1.logBase2(), MVT::i64));
+        SDValue Add =
+            DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
+      }
+      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+      APInt VNP1 = -Value + 1;
+      if (VNP1.isPowerOf2()) {
+        SDValue ShiftedVal =
+            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
+                           ShiftedVal);
+      }
+    }
+  }
+  return SDValue();
+}
+
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+  // First try to optimize away the conversion when it's conditionally from
+  // a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // Only optimize when the source and destination types have the same width.
+  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
+    return SDValue();
+
+  // If the result of an integer load is only used by an integer-to-float
+  // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
+  // This eliminates an "integer-to-vector-move UOP and improve throughput.
+  SDValue N0 = N->getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+                               LN0->getPointerInfo(), LN0->isVolatile(),
+                               LN0->isNonTemporal(), LN0->isInvariant(),
+                               LN0->getAlignment());
+
+    // Make sure successors of the original load stay after it by updating them
+    // to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
+
+    unsigned Opcode =
+        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
+  }
+
+  return SDValue();
+}
+
+/// An EXTR instruction is made up of two shifts, ORed together. This helper
+/// searches for and classifies those shifts.
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
+                         bool &FromHi) {
+  if (N.getOpcode() == ISD::SHL)
+    FromHi = false;
+  else if (N.getOpcode() == ISD::SRL)
+    FromHi = true;
+  else
+    return false;
+
+  if (!isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  ShiftAmount = N->getConstantOperandVal(1);
+  Src = N->getOperand(0);
+  return true;
+}
+
+/// EXTR instruction extracts a contiguous chunk of bits from two existing
+/// registers viewed as a high/low pair. This function looks for the pattern:
+/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
+/// EXTR. Can't quite be done in TableGen because the two immediates aren't
+/// independent.
+static SDValue tryCombineToEXTR(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue LHS;
+  uint32_t ShiftLHS = 0;
+  bool LHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
+    return SDValue();
+
+  SDValue RHS;
+  uint32_t ShiftRHS = 0;
+  bool RHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
+    return SDValue();
+
+  // If they're both trying to come from the high part of the register, they're
+  // not really an EXTR.
+  if (LHSFromHi == RHSFromHi)
+    return SDValue();
+
+  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
+    return SDValue();
+
+  if (LHSFromHi) {
+    std::swap(LHS, RHS);
+    std::swap(ShiftLHS, ShiftRHS);
+  }
+
+  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
+                     DAG.getConstant(ShiftRHS, MVT::i64));
+}
+
+static SDValue tryCombineToBSL(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // We only have to look for constant vectors here since the general, variable
+  // case can be handled in TableGen.
+  unsigned Bits = VT.getVectorElementType().getSizeInBits();
+  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
+  for (int i = 1; i >= 0; --i)
+    for (int j = 1; j >= 0; --j) {
+      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+      if (!BVN0 || !BVN1)
+        continue;
+
+      bool FoundMatch = true;
+      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+        if (!CN0 || !CN1 ||
+            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+          FoundMatch = false;
+          break;
+        }
+      }
+
+      if (FoundMatch)
+        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+                           N0->getOperand(1 - i), N1->getOperand(1 - j));
+    }
+
+  return SDValue();
+}
+
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                                const AArch64Subtarget *Subtarget) {
+  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
+  if (!EnableAArch64ExtrGeneration)
+    return SDValue();
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDValue Res = tryCombineToEXTR(N, DCI);
+  if (Res.getNode())
+    return Res;
+
+  Res = tryCombineToBSL(N, DCI);
+  if (Res.getNode())
+    return Res;
+
+  return SDValue();
+}
+
+static SDValue performBitcastCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Remove extraneous bitcasts around an extract_subvector.
+  // For example,
+  //    (v4i16 (bitconvert
+  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
+  //  becomes
+  //    (extract_subvector ((v8i16 ...), (i64 4)))
+
+  // Only interested in 64-bit vectors as the ultimate result.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+  if (VT.getSimpleVT().getSizeInBits() != 64)
+    return SDValue();
+  // Is the operand an extract_subvector starting at the beginning or halfway
+  // point of the vector? A low half may also come through as an
+  // EXTRACT_SUBREG, so look for that, too.
+  SDValue Op0 = N->getOperand(0);
+  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
+      !(Op0->isMachineOpcode() &&
+        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
+    return SDValue();
+  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
+  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
+      return SDValue();
+  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
+    if (idx != AArch64::dsub)
+      return SDValue();
+    // The dsub reference is equivalent to a lane zero subvector reference.
+    idx = 0;
+  }
+  // Look through the bitcast of the input to the extract.
+  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue Source = Op0->getOperand(0)->getOperand(0);
+  // If the source type has twice the number of elements as our destination
+  // type, we know this is an extract of the high or low half of the vector.
+  EVT SVT = Source->getValueType(0);
+  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+
+  // Create the simplified form to just extract the low or high half of the
+  // vector directly rather than bothering with the bitcasts.
+  SDLoc dl(N);
+  unsigned NumElements = VT.getVectorNumElements();
+  if (idx) {
+    SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
+  } else {
+    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
+    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
+                                      Source, SubReg),
+                   0);
+  }
+}
+
+static SDValue performConcatVectorsCombine(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
+  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
+  // canonicalise to that.
+  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
+    assert(VT.getVectorElementType().getSizeInBits() == 64);
+    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
+                       WidenVector(N->getOperand(0), DAG),
+                       DAG.getConstant(0, MVT::i64));
+  }
+
+  // Canonicalise concat_vectors so that the right-hand vector has as few
+  // bit-casts as possible before its real operation. The primary matching
+  // destination for these operations will be the narrowing "2" instructions,
+  // which depend on the operation being performed on this right-hand vector.
+  // For example,
+  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
+  // becomes
+  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
+
+  SDValue Op1 = N->getOperand(1);
+  if (Op1->getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue RHS = Op1->getOperand(0);
+  MVT RHSTy = RHS.getValueType().getSimpleVT();
+  // If the RHS is not a vector, this is not the pattern we're looking for.
+  if (!RHSTy.isVector())
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+
+  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
+                                  RHSTy.getVectorNumElements() * 2);
+  return DAG.getNode(
+      ISD::BITCAST, dl, VT,
+      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
+}
+
+static SDValue tryCombineFixedPointConvert(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  // Transform a scalar conversion of a value from a lane extract into a
+  // lane extract of a vector conversion. E.g., from foo1 to foo2:
+  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
+  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
+  //
+  // The second form interacts better with instruction selection and the
+  // register allocator to avoid cross-class register copies that aren't
+  // coalescable due to a lane reference.
+
+  // Check the operand and see if it originates from a lane extract.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    // Yep, no additional predication needed. Perform the transform.
+    SDValue IID = N->getOperand(0);
+    SDValue Shift = N->getOperand(2);
+    SDValue Vec = Op1.getOperand(0);
+    SDValue Lane = Op1.getOperand(1);
+    EVT ResTy = N->getValueType(0);
+    EVT VecResTy;
+    SDLoc DL(N);
+
+    // The vector width should be 128 bits by the time we get here, even
+    // if it started as 64 bits (the extract_vector handling will have
+    // done so).
+    assert(Vec.getValueType().getSizeInBits() == 128 &&
+           "unexpected vector size on extract_vector_elt!");
+    if (Vec.getValueType() == MVT::v4i32)
+      VecResTy = MVT::v4f32;
+    else if (Vec.getValueType() == MVT::v2i64)
+      VecResTy = MVT::v2f64;
+    else
+      llvm_unreachable("unexpected vector type!");
+
+    SDValue Convert =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
+  }
+  return SDValue();
+}
+
+// AArch64 high-vector "long" operations are formed by performing the non-high
+// version on an extract_subvector of each operand which gets the high half:
+//
+//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
+//
+// However, there are cases which don't have an extract_high explicitly, but
+// have another operation that can be made compatible with one for free. For
+// example:
+//
+//  (dupv64 scalar) --> (extract_high (dup128 scalar))
+//
+// This routine does the actual conversion of such DUPs, once outer routines
+// have determined that everything else is in order.
+static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  // We can handle most types of duplicate, but the lane ones have an extra
+  // operand saying *which* lane, so we need to know.
+  bool IsDUPLANE;
+  switch (N.getOpcode()) {
+  case AArch64ISD::DUP:
+    IsDUPLANE = false;
+    break;
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
+    IsDUPLANE = true;
+    break;
+  default:
+    return SDValue();
+  }
+
+  MVT NarrowTy = N.getSimpleValueType();
+  if (!NarrowTy.is64BitVector())
+    return SDValue();
+
+  MVT ElementTy = NarrowTy.getVectorElementType();
+  unsigned NumElems = NarrowTy.getVectorNumElements();
+  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+
+  SDValue NewDUP;
+  if (IsDUPLANE)
+    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
+                         N.getOperand(1));
+  else
+    NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
+
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
+                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
+}
+
+static bool isEssentiallyExtractSubvector(SDValue N) {
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+    return true;
+
+  return N.getOpcode() == ISD::BITCAST &&
+         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+}
+
+/// \brief Helper structure to keep track of ISD::SET_CC operands.
+struct GenericSetCCInfo {
+  const SDValue *Opnd0;
+  const SDValue *Opnd1;
+  ISD::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+struct AArch64SetCCInfo {
+  const SDValue *Cmp;
+  AArch64CC::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of SetCC information.
+union SetCCInfo {
+  GenericSetCCInfo Generic;
+  AArch64SetCCInfo AArch64;
+};
+
+/// \brief Helper structure to be able to read SetCC information.  If set to
+/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
+/// GenericSetCCInfo.
+struct SetCCInfoAndKind {
+  SetCCInfo Info;
+  bool IsAArch64;
+};
+
+/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
+/// an
+/// AArch64 lowered one.
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meanginfull only when this function returns true.
+/// \return True when Op is a kind of SET_CC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+  // If this is a setcc, this is straight forward.
+  if (Op.getOpcode() == ISD::SETCC) {
+    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
+    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
+    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    SetCCInfo.IsAArch64 = false;
+    return true;
+  }
+  // Otherwise, check if this is a matching csel instruction.
+  // In other words:
+  // - csel 1, 0, cc
+  // - csel 0, 1, !cc
+  if (Op.getOpcode() != AArch64ISD::CSEL)
+    return false;
+  // Set the information about the operands.
+  // TODO: we want the operands of the Cmp not the csel
+  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
+  SetCCInfo.IsAArch64 = true;
+  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // Check that the operands matches the constraints:
+  // (1) Both operands must be constants.
+  // (2) One must be 1 and the other must be 0.
+  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+  // Check (1).
+  if (!TValue || !FValue)
+    return false;
+
+  // Check (2).
+  if (!TValue->isOne()) {
+    // Update the comparison when we are interested in !cc.
+    std::swap(TValue, FValue);
+    SetCCInfo.Info.AArch64.CC =
+        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
+  }
+  return TValue->isOne() && FValue->isNullValue();
+}
+
+// Returns true if Op is setcc or zext of setcc.
+static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+  if (isSetCC(Op, Info))
+    return true;
+  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
+    isSetCC(Op->getOperand(0), Info));
+}
+
+// The folding we want to perform is:
+// (add x, [zext] (setcc cc ...) )
+//   -->
+// (csel x, (add x, 1), !cc ...)
+//
+// The latter will get matched to a CSINC instruction.
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+  SDValue LHS = Op->getOperand(0);
+  SDValue RHS = Op->getOperand(1);
+  SetCCInfoAndKind InfoAndKind;
+
+  // If neither operand is a SET_CC, give up.
+  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
+    std::swap(LHS, RHS);
+    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
+      return SDValue();
+  }
+
+  // FIXME: This could be generatized to work for FP comparisons.
+  EVT CmpVT = InfoAndKind.IsAArch64
+                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
+                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
+  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+    return SDValue();
+
+  SDValue CCVal;
+  SDValue Cmp;
+  SDLoc dl(Op);
+  if (InfoAndKind.IsAArch64) {
+    CCVal = DAG.getConstant(
+        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
+    Cmp = *InfoAndKind.Info.AArch64.Cmp;
+  } else
+    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
+                      *InfoAndKind.Info.Generic.Opnd1,
+                      ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
+                      CCVal, DAG, dl);
+
+  EVT VT = Op->getValueType(0);
+  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
+  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+// (add (zeroext (extract_high LHS)),
+//      (zeroext (extract_high RHS)))
+// -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+static SDValue performAddSubLongCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector()) {
+    if (N->getOpcode() == ISD::ADD)
+      return performSetccAddFolding(N, DAG);
+    return SDValue();
+  }
+
+  // Make sure both branches are extended in the same way.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+      LHS.getOpcode() != RHS.getOpcode())
+    return SDValue();
+
+  unsigned ExtType = LHS.getOpcode();
+
+  // It's not worth doing if at least one of the inputs isn't already an
+  // extract, but we don't know which it'll be so we have to try both.
+  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
+    if (!RHS.getNode())
+      return SDValue();
+
+    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
+  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
+    if (!LHS.getNode())
+      return SDValue();
+
+    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
+  }
+
+  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
+}
+
+// Massage DAGs which we can use the high-half "long" operations on into
+// something isel will recognize better. E.g.
+//
+// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
+//   (aarch64_neon_umull (extract_high (v2i64 vec)))
+//                     (extract_high (v2i64 (dup128 scalar)))))
+//
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  assert(LHS.getValueType().is64BitVector() &&
+         RHS.getValueType().is64BitVector() &&
+         "unexpected shape for long operation");
+
+  // Either node could be a DUP, but it's not worth doing both of them (you'd
+  // just as well use the non-high version) so look for a corresponding extract
+  // operation on the other "wing".
+  if (isEssentiallyExtractSubvector(LHS)) {
+    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
+    if (!RHS.getNode())
+      return SDValue();
+  } else if (isEssentiallyExtractSubvector(RHS)) {
+    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
+    if (!LHS.getNode())
+      return SDValue();
+  }
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0), LHS, RHS);
+}
+
+static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
+  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
+  unsigned ElemBits = ElemTy.getSizeInBits();
+
+  int64_t ShiftAmount;
+  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, ElemBits) ||
+        SplatBitSize != ElemBits)
+      return SDValue();
+
+    ShiftAmount = SplatValue.getSExtValue();
+  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+    ShiftAmount = CVN->getSExtValue();
+  } else
+    return SDValue();
+
+  unsigned Opcode;
+  bool IsRightShift;
+  switch (IID) {
+  default:
+    llvm_unreachable("Unknown shift intrinsic");
+  case Intrinsic::aarch64_neon_sqshl:
+    Opcode = AArch64ISD::SQSHL_I;
+    IsRightShift = false;
+    break;
+  case Intrinsic::aarch64_neon_uqshl:
+    Opcode = AArch64ISD::UQSHL_I;
+    IsRightShift = false;
+    break;
+  case Intrinsic::aarch64_neon_srshl:
+    Opcode = AArch64ISD::SRSHR_I;
+    IsRightShift = true;
+    break;
+  case Intrinsic::aarch64_neon_urshl:
+    Opcode = AArch64ISD::URSHR_I;
+    IsRightShift = true;
+    break;
+  case Intrinsic::aarch64_neon_sqshlu:
+    Opcode = AArch64ISD::SQSHLU_I;
+    IsRightShift = false;
+    break;
+  }
+
+  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
+    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+                       DAG.getConstant(-ShiftAmount, MVT::i32));
+  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits)
+    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+                       DAG.getConstant(ShiftAmount, MVT::i32));
+
+  return SDValue();
+}
+
+// The CRC32[BH] instructions ignore the high bits of their data operand. Since
+// the intrinsics must be legal and take an i32, this means there's almost
+// certainly going to be a zext in the DAG which we can eliminate.
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
+  SDValue AndN = N->getOperand(2);
+  if (AndN.getOpcode() != ISD::AND)
+    return SDValue();
+
+  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
+  if (!CMask || CMask->getZExtValue() != Mask)
+    return SDValue();
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
+                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned IID = getIntrinsicID(N);
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::aarch64_neon_vcvtfxs2fp:
+  case Intrinsic::aarch64_neon_vcvtfxu2fp:
+    return tryCombineFixedPointConvert(N, DCI, DAG);
+    break;
+  case Intrinsic::aarch64_neon_fmax:
+    return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fmin:
+    return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull:
+  case Intrinsic::aarch64_neon_pmull:
+  case Intrinsic::aarch64_neon_sqdmull:
+    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+  case Intrinsic::aarch64_neon_sqshl:
+  case Intrinsic::aarch64_neon_uqshl:
+  case Intrinsic::aarch64_neon_sqshlu:
+  case Intrinsic::aarch64_neon_srshl:
+  case Intrinsic::aarch64_neon_urshl:
+    return tryCombineShiftImm(IID, N, DAG);
+  case Intrinsic::aarch64_crc32b:
+  case Intrinsic::aarch64_crc32cb:
+    return tryCombineCRC32(0xff, N, DAG);
+  case Intrinsic::aarch64_crc32h:
+  case Intrinsic::aarch64_crc32ch:
+    return tryCombineCRC32(0xffff, N, DAG);
+  }
+  return SDValue();
+}
+
+static SDValue performExtendCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
+  // we can convert that DUP into another extract_high (of a bigger DUP), which
+  // helps the backend to decide that an sabdl2 would be useful, saving a real
+  // extract_high operation.
+  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    SDNode *ABDNode = N->getOperand(0).getNode();
+    unsigned IID = getIntrinsicID(ABDNode);
+    if (IID == Intrinsic::aarch64_neon_sabd ||
+        IID == Intrinsic::aarch64_neon_uabd) {
+      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+      if (!NewABD.getNode())
+        return SDValue();
+
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+                         NewABD);
+    }
+  }
+
+  // This is effectively a custom type legalization for AArch64.
+  //
+  // Type legalization will split an extend of a small, legal, type to a larger
+  // illegal type by first splitting the destination type, often creating
+  // illegal source types, which then get legalized in isel-confusing ways,
+  // leading to really terrible codegen. E.g.,
+  //   %result = v8i32 sext v8i8 %value
+  // becomes
+  //   %losrc = extract_subreg %value, ...
+  //   %hisrc = extract_subreg %value, ...
+  //   %lo = v4i32 sext v4i8 %losrc
+  //   %hi = v4i32 sext v4i8 %hisrc
+  // Things go rapidly downhill from there.
+  //
+  // For AArch64, the [sz]ext vector instructions can only go up one element
+  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
+  // take two instructions.
+  //
+  // This implies that the most efficient way to do the extend from v8i8
+  // to two v4i32 values is to first extend the v8i8 to v8i16, then do
+  // the normal splitting to happen for the v8i16->v8i32.
+
+  // This is pre-legalization to catch some cases where the default
+  // type legalization will create ill-tempered code.
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // We're only interested in cleaning things up for non-legal vector types
+  // here. If both the source and destination are legal, things will just
+  // work naturally without any fiddling.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT ResVT = N->getValueType(0);
+  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
+    return SDValue();
+  // If the vector type isn't a simple VT, it's beyond the scope of what
+  // we're  worried about here. Let legalization do its thing and hope for
+  // the best.
+  if (!ResVT.isSimple())
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  MVT SrcVT = Src->getValueType(0).getSimpleVT();
+  // If the source VT is a 64-bit vector, we can play games and get the
+  // better results we want.
+  if (SrcVT.getSizeInBits() != 64)
+    return SDValue();
+
+  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+  unsigned ElementCount = SrcVT.getVectorNumElements();
+  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+  SDLoc DL(N);
+  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
+
+  // Now split the rest of the operation into two halves, each with a 64
+  // bit source.
+  EVT LoVT, HiVT;
+  SDValue Lo, Hi;
+  unsigned NumElements = ResVT.getVectorNumElements();
+  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+                                 ResVT.getVectorElementType(), NumElements / 2);
+
+  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+                               LoVT.getVectorNumElements());
+  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+                   DAG.getIntPtrConstant(0));
+  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
+
+  // Now combine the parts back together so we still have a single result
+  // like the combiner expects.
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
+/// value. The load store optimizer pass will merge them to store pair stores.
+/// This has better performance than a splat of the scalar followed by a split
+/// vector store. Even if the stores are not merged it is four stores vs a dup,
+/// followed by an ext.b and two stores.
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+
+  // Don't replace floating point stores, they possibly won't be transformed to
+  // stp because of the store pair suppress pass.
+  if (VT.isFloatingPoint())
+    return SDValue();
+
+  // Check for insert vector elements.
+  if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  // We can express a splat as store pair(s) for 2 or 4 elements.
+  unsigned NumVecElts = VT.getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 2)
+    return SDValue();
+  SDValue SplatVal = StVal.getOperand(1);
+  unsigned RemainInsertElts = NumVecElts - 1;
+
+  // Check that this is a splat.
+  while (--RemainInsertElts) {
+    SDValue NextInsertElt = StVal.getOperand(0);
+    if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+      return SDValue();
+    if (NextInsertElt.getOperand(1) != SplatVal)
+      return SDValue();
+    StVal = NextInsertElt;
+  }
+  unsigned OrigAlignment = St->getAlignment();
+  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
+  unsigned Alignment = std::min(OrigAlignment, EltOffset);
+
+  // Create scalar stores. This is at least as good as the code sequence for a
+  // split unaligned store wich is a dup.s, ext.b, and two stores.
+  // Most of the time the three stores should be replaced by store pair
+  // instructions (stp).
+  SDLoc DL(St);
+  SDValue BasePtr = St->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
+                   St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+
+  unsigned Offset = EltOffset;
+  while (--NumVecElts) {
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                    DAG.getConstant(Offset, MVT::i64));
+    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+                          St->getPointerInfo(), St->isVolatile(),
+                          St->isNonTemporal(), Alignment);
+    Offset += EltOffset;
+  }
+  return NewST1;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *S = cast<StoreSDNode>(N);
+  if (S->isVolatile())
+    return SDValue();
+
+  // Cyclone has bad performance on unaligned 16B stores when crossing line and
+  // page boundries. We want to split such stores.
+  if (!Subtarget->isCyclone())
+    return SDValue();
+
+  // Don't split at Oz.
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::MinSize);
+  if (IsMinSize)
+    return SDValue();
+
+  SDValue StVal = S->getValue();
+  EVT VT = StVal.getValueType();
+
+  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+  // those up regresses performance on micro-benchmarks and olden/bh.
+  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+    return SDValue();
+
+  // Split unaligned 16B stores. They are terrible for performance.
+  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+  // extensions can use this to mark that it does not want splitting to happen
+  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+      S->getAlignment() <= 2)
+    return SDValue();
+
+  // If we get a splat of a scalar convert this vector store to a store of
+  // scalars. They will be merged into store pairs thereby removing two
+  // instructions.
+  SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
+  if (ReplacedSplat != SDValue())
+    return ReplacedSplat;
+
+  SDLoc DL(S);
+  unsigned NumElts = VT.getVectorNumElements() / 2;
+  // Split VT into two.
+  EVT HalfVT =
+      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getIntPtrConstant(0));
+  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getIntPtrConstant(NumElts));
+  SDValue BasePtr = S->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+                   S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                  DAG.getConstant(8, MVT::i64));
+  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
+                      S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
+                      S->getAlignment());
+}
+
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     bool IsLaneOp) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  unsigned LoadIdx = IsLaneOp ? 1 : 0;
+  SDNode *LD = N->getOperand(LoadIdx).getNode();
+  // If it is not LOAD, can not do such combine.
+  if (LD->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
+  EVT MemVT = LoadSDN->getMemoryVT();
+  // Check if memory operand is the same type as the vector element.
+  if (MemVT != VT.getVectorElementType())
+    return SDValue();
+
+  // Check if there are other uses. If so, do not combine as it will introduce
+  // an extra load.
+  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+       ++UI) {
+    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+      continue;
+    if (*UI != N)
+      return SDValue();
+  }
+
+  SDValue Addr = LD->getOperand(1);
+  SDValue Vector = N->getOperand(0);
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
+       Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD
+        || UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load.  Otherwise, folding it
+    // would create a cycle.
+    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+      continue;
+    // Also check that add is not used in the vector operand.  This would also
+    // create a cycle.
+    if (User->isPredecessorOf(Vector.getNode()))
+      continue;
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(LD->getOperand(0));  // Chain
+    if (IsLaneOp) {
+      Ops.push_back(Vector);           // The vector to be inserted
+      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+    }
+    Ops.push_back(Addr);
+    Ops.push_back(Inc);
+
+    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+                                           MemVT,
+                                           LoadSDN->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    NewResults.push_back(SDValue(LD, 0));             // The result of load
+    NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+    DCI.CombineTo(LD, NewResults);
+    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
+
+    break;
+  }
+  return SDValue();
+}
+
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  unsigned AddrOpIdx = N->getNumOperands() - 1;
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool IsStore = false;
+    bool IsLaneOp = false;
+    bool IsDupOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
+      NumVecs = 2; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
+      NumVecs = 3; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
+      NumVecs = 4; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
+      NumVecs = 2; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
+      NumVecs = 3; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
+      NumVecs = 4; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
+      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
+      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
+      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+    }
+
+    EVT VecTy;
+    if (IsStore)
+      VecTy = N->getOperand(2).getValueType();
+    else
+      VecTy = N->getValueType(0);
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+      if (IsLaneOp || IsDupOp)
+        NumBytes /= VecTy.getVectorNumElements();
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // Incoming chain
+    // Load lane and store have vector list as input.
+    if (IsLaneOp || IsStore)
+      for (unsigned i = 2; i < AddrOpIdx; ++i)
+        Ops.push_back(N->getOperand(i));
+    Ops.push_back(Addr); // Base register
+    Ops.push_back(Inc);
+
+    // Return Types.
+    EVT Tys[6];
+    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i64;  // Type of write back register
+    Tys[n] = MVT::Other;  // Type of the chain
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+// Optimize compare with zero and branch.
+static SDValue performBRCONDCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue CCVal = N->getOperand(2);
+  SDValue Cmp = N->getOperand(3);
+
+  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
+  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+    return SDValue();
+
+  unsigned CmpOpc = Cmp.getOpcode();
+  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
+    return SDValue();
+
+  // Only attempt folding if there is only one use of the flag and no use of the
+  // value.
+  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
+    return SDValue();
+
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected the value type to be the same for both operands!");
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return SDValue();
+
+  if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+    std::swap(LHS, RHS);
+
+  if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+    return SDValue();
+
+  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
+      LHS.getOpcode() == ISD::SRL)
+    return SDValue();
+
+  // Fold the compare into the branch instruction.
+  SDValue BR;
+  if (CC == AArch64CC::EQ)
+    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+  else
+    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, BR, false);
+
+  return SDValue();
+}
+
+// vselect (v1i1 setcc) ->
+//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
+// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
+// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
+// such VSELECT.
+static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  EVT CCVT = N0.getValueType();
+
+  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
+      CCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  EVT ResVT = N->getValueType(0);
+  EVT CmpVT = N0.getOperand(0).getValueType();
+  // Only combine when the result type is of the same size as the compared
+  // operands.
+  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
+    return SDValue();
+
+  SDValue IfTrue = N->getOperand(1);
+  SDValue IfFalse = N->getOperand(2);
+  SDValue SetCC =
+      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
+                   N0.getOperand(0), N0.getOperand(1),
+                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
+  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
+                     IfTrue, IfFalse);
+}
+
+/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
+/// the compare-mask instructions rather than going via NZCV, even if LHS and
+/// RHS are really scalar. This replaces any scalar setcc in the above pattern
+/// with a vector one followed by a DUP shuffle on the result.
+static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  EVT ResVT = N->getValueType(0);
+
+  if (!N->getOperand(1).getValueType().isVector())
+    return SDValue();
+
+  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+    return SDValue();
+
+  SDLoc DL(N0);
+
+  EVT SrcVT = N0.getOperand(0).getValueType();
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
+                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
+
+  // First perform a vector comparison, where lane 0 is the one we're interested
+  // in.
+  SDValue LHS =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+  SDValue RHS =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
+
+  // Now duplicate the comparison mask we want across all other lanes.
+  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
+  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
+                     Mask);
+
+  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
+}
+
+SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::ADD:
+  case ISD::SUB:
+    return performAddSubLongCombine(N, DCI, DAG);
+  case ISD::XOR:
+    return performXorCombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMulCombine(N, DAG, DCI, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return performIntToFpCombine(N, DAG);
+  case ISD::OR:
+    return performORCombine(N, DCI, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return performIntrinsicCombine(N, DCI, Subtarget);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+    return performExtendCombine(N, DCI, DAG);
+  case ISD::BITCAST:
+    return performBitcastCombine(N, DCI, DAG);
+  case ISD::CONCAT_VECTORS:
+    return performConcatVectorsCombine(N, DCI, DAG);
+  case ISD::SELECT:
+    return performSelectCombine(N, DAG);
+  case ISD::VSELECT:
+    return performVSelectCombine(N, DCI.DAG);
+  case ISD::STORE:
+    return performSTORECombine(N, DCI, DAG, Subtarget);
+  case AArch64ISD::BRCOND:
+    return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::DUP:
+    return performPostLD1Combine(N, DCI, false);
+  case ISD::INSERT_VECTOR_ELT:
+    return performPostLD1Combine(N, DCI, true);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::aarch64_neon_ld2:
+    case Intrinsic::aarch64_neon_ld3:
+    case Intrinsic::aarch64_neon_ld4:
+    case Intrinsic::aarch64_neon_ld1x2:
+    case Intrinsic::aarch64_neon_ld1x3:
+    case Intrinsic::aarch64_neon_ld1x4:
+    case Intrinsic::aarch64_neon_ld2lane:
+    case Intrinsic::aarch64_neon_ld3lane:
+    case Intrinsic::aarch64_neon_ld4lane:
+    case Intrinsic::aarch64_neon_ld2r:
+    case Intrinsic::aarch64_neon_ld3r:
+    case Intrinsic::aarch64_neon_ld4r:
+    case Intrinsic::aarch64_neon_st2:
+    case Intrinsic::aarch64_neon_st3:
+    case Intrinsic::aarch64_neon_st4:
+    case Intrinsic::aarch64_neon_st1x2:
+    case Intrinsic::aarch64_neon_st1x3:
+    case Intrinsic::aarch64_neon_st1x4:
+    case Intrinsic::aarch64_neon_st2lane:
+    case Intrinsic::aarch64_neon_st3lane:
+    case Intrinsic::aarch64_neon_st4lane:
+      return performNEONPostLDSTCombine(N, DCI, DAG);
+    default:
+      break;
+    }
+  }
+  return SDValue();
+}
+
+// Check if the return value is used as only a return value, as otherwise
+// we can't perform a tail-call. In particular, we need to check for
+// target ISD nodes that are returns and any other "odd" constructs
+// that the generic analysis code won't necessarily catch.
+bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
+                                               SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
+        MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
+
+  bool HasRet = false;
+  for (SDNode *Node : Copy->uses()) {
+    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+// Return whether the an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction.
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+
+  return true;
+}
+
+bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   bool &IsInc,
+                                                   SelectionDAG &DAG) const {
+  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Op->getOperand(0);
+  // All of the indexed addressing mode instructions take a signed
+  // 9 bit immediate offset.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    int64_t RHSC = (int64_t)RHS->getZExtValue();
+    if (RHSC >= 256 || RHSC <= -256)
+      return false;
+    IsInc = (Op->getOpcode() == ISD::ADD);
+    Offset = Op->getOperand(1);
+    return true;
+  }
+  return false;
+}
+
+bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+    return false;
+  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+bool AArch64TargetLowering::getPostIndexedAddressParts(
+    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+    return false;
+  // Post-indexing updates the base, so it's not a valid transform
+  // if that's not the same as the load's pointer.
+  if (Ptr != Base)
+    return false;
+  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::i16)
+    return;
+
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  assert(Op.getValueType() == MVT::f16 &&
+         "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+  Op = SDValue(
+      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
+                         DAG.getUNDEF(MVT::i32), Op,
+                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
+      0);
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
+  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
+}
+
+void AArch64TargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this");
+  case ISD::BITCAST:
+    ReplaceBITCASTResults(N, Results, DAG);
+    return;
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+    // Let normal code take care of it by not adding anything to Results.
+    return;
+  }
+}
+
+bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+  // Loads and stores less than 128-bits are already atomic; ones above that
+  // are doomed anyway, so defer to the default libcall and blame the OS when
+  // things go wrong:
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
+  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+    return LI->getType()->getPrimitiveSizeInBits() == 128;
+
+  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
+  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+}
+
+TargetLoweringBase::LegalizeTypeAction
+AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
+  MVT SVT = VT.getSimpleVT();
+  // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
+  // v4i16, v2i32 instead of to promote.
+  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
+      || SVT == MVT::v1f32)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                             AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire =
+      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i64, i64} and we have to recombine them into a
+  // single i128 here.
+  if (ValTy->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
+    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+  }
+
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int =
+      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
+  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldxr, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
+}
+
+Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+                                                   Value *Val, Value *Addr,
+                                                   AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease =
+      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since the intrinsics must have legal type, the i128 intrinsics take two
+  // parameters: "i64, i64". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
+    Function *Stxr = Intrinsic::getDeclaration(M, Int);
+    Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
+  }
+
+  Intrinsic::ID Int =
+      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
+  Type *Tys[] = { Addr->getType() };
+  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateCall2(
+      Stxr, Builder.CreateZExtOrBitCast(
+                Val, Stxr->getFunctionType()->getParamType(0)),
+      Addr);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
new file mode 100644
index 0000000..cb0b9ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -0,0 +1,467 @@
+//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H
+#define LLVM_TARGET_AArch64_ISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace AArch64ISD {
+
+enum {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+  CALL,         // Function call.
+
+  // Almost the same as a normal call node, except that a TLSDesc relocation is
+  // needed so the linker can relax it correctly if possible.
+  TLSDESC_CALL,
+  ADRP,     // Page address of a TargetGlobalAddress operand.
+  ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
+  LOADgot,  // Load from automatically generated descriptor (e.g. Global
+            // Offset Table, TLS record).
+  RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
+  BRCOND,   // Conditional branch instruction; "b.cond".
+  CSEL,
+  FCSEL, // Conditional move instruction.
+  CSINV, // Conditional select invert.
+  CSNEG, // Conditional select negate.
+  CSINC, // Conditional select increment.
+
+  // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
+  // ELF.
+  THREAD_POINTER,
+  ADC,
+  SBC, // adc, sbc instructions
+
+  // Arithmetic instructions which write flags.
+  ADDS,
+  SUBS,
+  ADCS,
+  SBCS,
+  ANDS,
+
+  // Floating point comparison
+  FCMP,
+
+  // Floating point max and min instructions.
+  FMAX,
+  FMIN,
+
+  // Scalar extract
+  EXTR,
+
+  // Scalar-to-vector duplication
+  DUP,
+  DUPLANE8,
+  DUPLANE16,
+  DUPLANE32,
+  DUPLANE64,
+
+  // Vector immedate moves
+  MOVI,
+  MOVIshift,
+  MOVIedit,
+  MOVImsl,
+  FMOV,
+  MVNIshift,
+  MVNImsl,
+
+  // Vector immediate ops
+  BICi,
+  ORRi,
+
+  // Vector bit select: similar to ISD::VSELECT but not all bits within an
+  // element must be identical.
+  BSL,
+
+  // Vector arithmetic negation
+  NEG,
+
+  // Vector shuffles
+  ZIP1,
+  ZIP2,
+  UZP1,
+  UZP2,
+  TRN1,
+  TRN2,
+  REV16,
+  REV32,
+  REV64,
+  EXT,
+
+  // Vector shift by scalar
+  VSHL,
+  VLSHR,
+  VASHR,
+
+  // Vector shift by scalar (again)
+  SQSHL_I,
+  UQSHL_I,
+  SQSHLU_I,
+  SRSHR_I,
+  URSHR_I,
+
+  // Vector comparisons
+  CMEQ,
+  CMGE,
+  CMGT,
+  CMHI,
+  CMHS,
+  FCMEQ,
+  FCMGE,
+  FCMGT,
+
+  // Vector zero comparisons
+  CMEQz,
+  CMGEz,
+  CMGTz,
+  CMLEz,
+  CMLTz,
+  FCMEQz,
+  FCMGEz,
+  FCMGTz,
+  FCMLEz,
+  FCMLTz,
+
+  // Vector bitwise negation
+  NOT,
+
+  // Vector bitwise selection
+  BIT,
+
+  // Compare-and-branch
+  CBZ,
+  CBNZ,
+  TBZ,
+  TBNZ,
+
+  // Tail calls
+  TC_RETURN,
+
+  // Custom prefetch handling
+  PREFETCH,
+
+  // {s|u}int to FP within a FP register.
+  SITOF,
+  UITOF,
+
+  // NEON Load/Store with post-increment base updates
+  LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LD3post,
+  LD4post,
+  ST2post,
+  ST3post,
+  ST4post,
+  LD1x2post,
+  LD1x3post,
+  LD1x4post,
+  ST1x2post,
+  ST1x3post,
+  ST1x4post,
+  LD1DUPpost,
+  LD2DUPpost,
+  LD3DUPpost,
+  LD4DUPpost,
+  LD1LANEpost,
+  LD2LANEpost,
+  LD3LANEpost,
+  LD4LANEpost,
+  ST2LANEpost,
+  ST3LANEpost,
+  ST4LANEpost
+};
+
+} // end namespace AArch64ISD
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64TargetLowering : public TargetLowering {
+  bool RequireStrictAlign;
+
+public:
+  explicit AArch64TargetLowering(TargetMachine &TM);
+
+  /// Selects the correct CCAssignFn for a the given CallingConvention
+  /// value.
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+
+  /// computeKnownBitsForTargetNode - Determine which of the bits specified in
+  /// Mask are known to be either zero or one and return them in the
+  /// KnownZero/KnownOne bitsets.
+  void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+                                     APInt &KnownOne, const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  MVT getScalarShiftAmountTy(EVT LHSTy) const override;
+
+  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+  /// unaligned memory accesses. of the specified type.
+  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+                                     bool *Fast = nullptr) const override {
+    if (RequireStrictAlign)
+      return false;
+    // FIXME: True for Cyclone, but not necessary others.
+    if (Fast)
+      *Fast = true;
+    return true;
+  }
+
+  /// LowerOperation - Provide custom lowering hooks for some operations.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  /// getFunctionAlignment - Return the Log2 alignment of this function.
+  unsigned getFunctionAlignment(const Function *F) const;
+
+  /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+  /// be used for loads / stores from the global.
+  unsigned getMaximalGlobalOffset() const override;
+
+  /// Returns true if a cast between SrcAS and DestAS is a noop.
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+    // Addrspacecasts are always noops.
+    return true;
+  }
+
+  /// createFastISel - This method returns a target specific FastISel object,
+  /// or null if the target does not support "fast" ISel.
+  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                           const TargetLibraryInfo *libInfo) const override;
+
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+  /// isShuffleMaskLegal - Return true if the given shuffle mask can be
+  /// codegen'd directly, or if it should be stack expanded.
+  bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+
+  /// getSetCCResultType - Return the ISD::SETCC ValueType
+  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+  SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+  MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr *MI,
+                              MachineBasicBlock *MBB) const override;
+
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          unsigned Intrinsic) const override;
+
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+  bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+  bool isZExtFree(EVT VT1, EVT VT2) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+  bool hasPairedLoad(Type *LoadedType,
+                     unsigned &RequiredAligment) const override;
+  bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+
+  bool isLegalAddImmediate(int64_t) const override;
+  bool isLegalICmpImmediate(int64_t) const override;
+
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                          bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                          MachineFunction &MF) const override;
+
+  /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type.
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+  /// \brief Return the cost of the scaling factor used in the addressing
+  /// mode represented by AM for this target, for a load/store
+  /// of the specified type.
+  /// If the AM is supported, the return value must be >= 0.
+  /// If the AM is not supported, it returns a negative value.
+  int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
+
+  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+  /// expanded to FMAs when this method returns true, otherwise fmuladd is
+  /// expanded to fmul + fadd.
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+  const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+  /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+  bool isDesirableToCommuteWithShift(const SDNode *N) const override;
+
+  /// \brief Returns true if it is beneficial to convert a load of a constant
+  /// to just the constant itself.
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                         Type *Ty) const override;
+
+  Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                        AtomicOrdering Ord) const override;
+  Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                              Value *Addr, AtomicOrdering Ord) const override;
+
+  bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+private:
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
+  void addDRTypeForNEON(MVT VT);
+  void addQRTypeForNEON(MVT VT);
+
+  SDValue
+  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                       const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+                       SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCall(CallLoweringInfo & /*CLI*/,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool isVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+                          SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+                          bool isThisReturn, SDValue ThisVal) const;
+
+  bool isEligibleForTailCallOptimization(
+      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+      bool isCalleeStructRet, bool isCallerStructRet,
+      const SmallVectorImpl<ISD::OutputArg> &Outs,
+      const SmallVectorImpl<SDValue> &OutVals,
+      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
+  /// Finds the incoming stack arguments which overlap the given fixed stack
+  /// object and incorporates their load into the current chain. This prevents
+  /// an upcoming store from clobbering the stack argument before it's used.
+  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+                              MachineFrameInfo *MFI, int ClobberedFI) const;
+
+  bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+  bool IsTailCallConvention(CallingConv::ID CallCC) const;
+
+  void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+                           SDValue &Chain) const;
+
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+                      SelectionDAG &DAG) const override;
+
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
+                              SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                        RTLIB::Libcall Call) const;
+  SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+  ConstraintType
+  getConstraintType(const std::string &Constraint) const override;
+  unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
+  /// Examine constraint string and operand type and determine a weight value.
+  /// The operand object must already have been set up with the operand type.
+  ConstraintWeight
+  getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                 const char *constraint) const override;
+
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const std::string &Constraint,
+                               MVT VT) const override;
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+  bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+  bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
+                              ISD::MemIndexedMode &AM, bool &IsInc,
+                              SelectionDAG &DAG) const;
+  bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+                                 ISD::MemIndexedMode &AM,
+                                 SelectionDAG &DAG) const override;
+  bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+                                  SDValue &Offset, ISD::MemIndexedMode &AM,
+                                  SelectionDAG &DAG) const override;
+
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+};
+
+namespace AArch64 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo);
+} // end namespace AArch64
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_AArch64_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
new file mode 100644
index 0000000..3b9e3c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -0,0 +1,364 @@
+//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Atomic operand code-gen constructs.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------
+// Atomic fences
+//===----------------------------------
+def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+
+//===----------------------------------
+// Atomic loads
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A atomic load operation that actually needs acquire semantics.
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  assert(Ordering != AcquireRelease && "unexpected load ordering");
+  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit loads
+def : Pat<(acquiring_load<atomic_load_8>  GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                                     ro_Wextend8:$offset)),
+          (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                                     ro_Xextend8:$offset)),
+          (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8>
+               (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit loads
+def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend16:$extend)),
+          (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend16:$extend)),
+          (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_load<atomic_load_16>
+               (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit loads
+def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend32:$extend)),
+          (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend32:$extend)),
+          (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)),
+          (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_load<atomic_load_32>
+               (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+          (LDURWi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit loads
+def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend64:$extend)),
+          (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend64:$extend)),
+          (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn,
+                                                      uimm12s8:$offset)),
+          (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_64>
+               (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (LDURXi GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Atomic stores
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A store operation that actually needs release semantics.
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  assert(Ordering != AcquireRelease && "unexpected store ordering");
+  return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic store operation that doesn't actually need to be atomic on AArch64.
+class relaxed_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit stores
+def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
+          (STLRB GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+               GPR32:$val),
+          (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+               GPR32:$val),
+          (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val),
+          (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit stores
+def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
+          (STLRH GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend16:$extend),
+                                          GPR32:$val),
+          (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend16:$extend),
+                                          GPR32:$val),
+          (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16>
+              (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val),
+          (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_store<atomic_store_16>
+               (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit stores
+def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
+          (STLRW GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend32:$extend),
+                                          GPR32:$val),
+          (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend32:$extend),
+                                          GPR32:$val),
+          (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32>
+              (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val),
+          (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_32>
+               (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit stores
+def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
+          (STLRX GPR64:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend16:$extend),
+                                          GPR64:$val),
+          (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend16:$extend),
+                                          GPR64:$val),
+          (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64>
+              (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val),
+          (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_store<atomic_store_64>
+               (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val),
+          (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Low-level exclusive operations
+//===----------------------------------
+
+// Load-exclusives.
+
+def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldxr_1 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_2 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_4 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff),
+          (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff),
+          (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
+          (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+
+// Load-exclusives.
+
+def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldaxr_1 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_2 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_4 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff),
+          (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff),
+          (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
+          (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+
+// Store-exclusives.
+
+def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
+          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr),
+          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr),
+          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr),
+          (STXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+          (STXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+          (STXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr),
+          (STXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+// Store-release-exclusives.
+
+def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
+          (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr),
+          (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr),
+          (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr),
+          (STLXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+          (STLXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+          (STLXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr),
+          (STLXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+          (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+          (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+          (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+
+// And clear exclusive.
+
+def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
new file mode 100644
index 0000000..e88c0c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -0,0 +1,8625 @@
+//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe AArch64 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def PseudoFrm   : Format<0>;
+def NormalFrm   : Format<1>; // Do we need any others?
+
+// AArch64 Instruction Format
+class AArch64Inst<Format f, string cstr> : Instruction {
+  field bits<32> Inst; // Instruction encoding.
+  // Mask of bits that cause an encoding to be UNPREDICTABLE.
+  // If a bit is set, then if the corresponding bit in the
+  // target encoding differs from its value in the "Inst" field,
+  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+  field bits<32> Unpredictable = 0;
+  // SoftFail is the generic name for this field, but we alias it so
+  // as to make it more obvious what it means in ARM-land.
+  field bits<32> SoftFail = Unpredictable;
+  let Namespace   = "AArch64";
+  Format F        = f;
+  bits<2> Form    = F.Value;
+  let Pattern     = [];
+  let Constraints = cstr;
+}
+
+// Pseudo instructions (don't have encoding information)
+class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
+    : AArch64Inst<PseudoFrm, cstr> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let Pattern        = pattern;
+  let isCodeGenOnly  = 1;
+}
+
+// Real instructions (have encoding information)
+class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
+  let Pattern = pattern;
+  let Size = 4;
+}
+
+// Normal instructions
+class I<dag oops, dag iops, string asm, string operands, string cstr,
+        list<dag> pattern>
+    : EncodedI<cstr, pattern> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let AsmString      = !strconcat(asm, operands);
+}
+
+class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res>  : PatFrag<(ops node:$LHS), res>;
+
+// Helper fragment for an extract of the high portion of a 128-bit vector.
+def extract_high_v16i8 :
+   UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+def extract_high_v8i16 :
+   UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+def extract_high_v4i32 :
+   UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
+def extract_high_v2i64 :
+   UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+
+//===----------------------------------------------------------------------===//
+// Asm Operand Classes.
+//
+
+// Shifter operand for arithmetic shifted encodings.
+def ShifterOperand : AsmOperandClass {
+  let Name = "Shifter";
+}
+
+// Shifter operand for mov immediate encodings.
+def MovImm32ShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MovImm32Shifter";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "InvalidMovImm32Shift";
+}
+def MovImm64ShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MovImm64Shifter";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "InvalidMovImm64Shift";
+}
+
+// Shifter operand for arithmetic register shifted encodings.
+class ArithmeticShifterOperand<int width> : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "ArithmeticShifter" # width;
+  let PredicateMethod = "isArithmeticShifter<" # width # ">";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
+def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
+
+// Shifter operand for logical register shifted encodings.
+class LogicalShifterOperand<int width> : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "LogicalShifter" # width;
+  let PredicateMethod = "isLogicalShifter<" # width # ">";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def LogicalShifterOperand32 : LogicalShifterOperand<32>;
+def LogicalShifterOperand64 : LogicalShifterOperand<64>;
+
+// Shifter operand for logical vector 128/64-bit shifted encodings.
+def LogicalVecShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "LogicalVecShifter";
+  let RenderMethod = "addShifterOperands";
+}
+def LogicalVecHalfWordShifterOperand : AsmOperandClass {
+  let SuperClasses = [LogicalVecShifterOperand];
+  let Name = "LogicalVecHalfWordShifter";
+  let RenderMethod = "addShifterOperands";
+}
+
+// The "MSL" shifter on the vector MOVI instruction.
+def MoveVecShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MoveVecShifter";
+  let RenderMethod = "addShifterOperands";
+}
+
+// Extend operand for arithmetic encodings.
+def ExtendOperand : AsmOperandClass {
+  let Name = "Extend";
+  let DiagnosticType = "AddSubRegExtendLarge";
+}
+def ExtendOperand64 : AsmOperandClass {
+  let SuperClasses = [ExtendOperand];
+  let Name = "Extend64";
+  let DiagnosticType = "AddSubRegExtendSmall";
+}
+// 'extend' that's a lsl of a 64-bit register.
+def ExtendOperandLSL64 : AsmOperandClass {
+  let SuperClasses = [ExtendOperand];
+  let Name = "ExtendLSL64";
+  let RenderMethod = "addExtend64Operands";
+  let DiagnosticType = "AddSubRegExtendLarge";
+}
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "tryParseFPImm";
+  let DiagnosticType = "InvalidFPImm";
+}
+
+def CondCode : AsmOperandClass {
+  let Name = "CondCode";
+  let DiagnosticType = "InvalidCondCode";
+}
+
+// A 32-bit register pasrsed as 64-bit
+def GPR32as64Operand : AsmOperandClass {
+  let Name = "GPR32as64";
+}
+def GPR32as64 : RegisterOperand<GPR32> {
+  let ParserMatchClass = GPR32as64Operand;
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// ADR[P] instruction labels.
+def AdrpOperand : AsmOperandClass {
+  let Name = "AdrpLabel";
+  let ParserMethod = "tryParseAdrpLabel";
+  let DiagnosticType = "InvalidLabel";
+}
+def adrplabel : Operand<i64> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let PrintMethod = "printAdrpLabel";
+  let ParserMatchClass = AdrpOperand;
+}
+
+def AdrOperand : AsmOperandClass {
+  let Name = "AdrLabel";
+  let ParserMethod = "tryParseAdrLabel";
+  let DiagnosticType = "InvalidLabel";
+}
+def adrlabel : Operand<i64> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let ParserMatchClass = AdrOperand;
+}
+
+// simm9 predicate - True if the immediate is in the range [-256, 255].
+def SImm9Operand : AsmOperandClass {
+  let Name = "SImm9";
+  let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
+  let ParserMatchClass = SImm9Operand;
+}
+
+// simm7sN predicate - True if the immediate is a multiple of N in the range
+// [-64 * N, 63 * N].
+class SImm7Scaled<int Scale> : AsmOperandClass {
+  let Name = "SImm7s" # Scale;
+  let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
+}
+
+def SImm7s4Operand : SImm7Scaled<4>;
+def SImm7s8Operand : SImm7Scaled<8>;
+def SImm7s16Operand : SImm7Scaled<16>;
+
+def simm7s4 : Operand<i32> {
+  let ParserMatchClass = SImm7s4Operand;
+  let PrintMethod = "printImmScale<4>";
+}
+
+def simm7s8 : Operand<i32> {
+  let ParserMatchClass = SImm7s8Operand;
+  let PrintMethod = "printImmScale<8>";
+}
+
+def simm7s16 : Operand<i32> {
+  let ParserMatchClass = SImm7s16Operand;
+  let PrintMethod = "printImmScale<16>";
+}
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+  let Name = "Imm" # Low # "_" # High;
+  let DiagnosticType = "InvalidImm" # Low # "_" # High;
+}
+
+def Imm1_8Operand : AsmImmRange<1, 8>;
+def Imm1_16Operand : AsmImmRange<1, 16>;
+def Imm1_32Operand : AsmImmRange<1, 32>;
+def Imm1_64Operand : AsmImmRange<1, 64>;
+
+def MovZSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG3";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG3AsmOperand;
+}
+
+def MovZSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG2";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG2AsmOperand;
+}
+
+def MovZSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG1";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG1AsmOperand;
+}
+
+def MovZSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG0";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG0AsmOperand;
+}
+
+def MovKSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG3";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG3AsmOperand;
+}
+
+def MovKSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG2";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG2AsmOperand;
+}
+
+def MovKSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG1";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG1AsmOperand;
+}
+
+def MovKSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG0";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG0AsmOperand;
+}
+
+class fixedpoint_i32<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
+  let EncoderMethod = "getFixedPointScaleOpValue";
+  let DecoderMethod = "DecodeFixedPointScaleImm32";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
+class fixedpoint_i64<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
+  let EncoderMethod = "getFixedPointScaleOpValue";
+  let DecoderMethod = "DecodeFixedPointScaleImm64";
+  let ParserMatchClass = Imm1_64Operand;
+}
+
+def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
+def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+
+def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
+def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
+
+def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR8OpValue";
+  let DecoderMethod = "DecodeVecShiftR8Imm";
+  let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16Imm";
+  let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
+  let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32Imm";
+  let ParserMatchClass = Imm1_32Operand;
+}
+def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
+  let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+  let EncoderMethod = "getVecShiftR64OpValue";
+  let DecoderMethod = "DecodeVecShiftR64Imm";
+  let ParserMatchClass = Imm1_64Operand;
+}
+def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR64OpValue";
+  let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
+def Imm0_7Operand : AsmImmRange<0, 7>;
+def Imm0_15Operand : AsmImmRange<0, 15>;
+def Imm0_31Operand : AsmImmRange<0, 31>;
+def Imm0_63Operand : AsmImmRange<0, 63>;
+
+def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 8);
+}]> {
+  let EncoderMethod = "getVecShiftL8OpValue";
+  let DecoderMethod = "DecodeVecShiftL8Imm";
+  let ParserMatchClass = Imm0_7Operand;
+}
+def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 16);
+}]> {
+  let EncoderMethod = "getVecShiftL16OpValue";
+  let DecoderMethod = "DecodeVecShiftL16Imm";
+  let ParserMatchClass = Imm0_15Operand;
+}
+def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 32);
+}]> {
+  let EncoderMethod = "getVecShiftL32OpValue";
+  let DecoderMethod = "DecodeVecShiftL32Imm";
+  let ParserMatchClass = Imm0_31Operand;
+}
+def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 64);
+}]> {
+  let EncoderMethod = "getVecShiftL64OpValue";
+  let DecoderMethod = "DecodeVecShiftL64Imm";
+  let ParserMatchClass = Imm0_63Operand;
+}
+
+
+// Crazy immediate formats used by 32-bit and 64-bit logical immediate
+// instructions for splatting repeating bit patterns across the immediate.
+def logical_imm32_XFORM : SDNodeXForm<imm, [{
+  uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
+  return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+def logical_imm64_XFORM : SDNodeXForm<imm, [{
+  uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
+  return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+
+let DiagnosticType = "LogicalSecondSource" in {
+  def LogicalImm32Operand : AsmOperandClass {
+    let Name = "LogicalImm32";
+  }
+  def LogicalImm64Operand : AsmOperandClass {
+    let Name = "LogicalImm64";
+  }
+  def LogicalImm32NotOperand : AsmOperandClass {
+    let Name = "LogicalImm32Not";
+  }
+  def LogicalImm64NotOperand : AsmOperandClass {
+    let Name = "LogicalImm64Not";
+  }
+}
+def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
+  return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
+}], logical_imm32_XFORM> {
+  let PrintMethod = "printLogicalImm32";
+  let ParserMatchClass = LogicalImm32Operand;
+}
+def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
+  return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64);
+}], logical_imm64_XFORM> {
+  let PrintMethod = "printLogicalImm64";
+  let ParserMatchClass = LogicalImm64Operand;
+}
+def logical_imm32_not : Operand<i32> {
+  let ParserMatchClass = LogicalImm32NotOperand;
+}
+def logical_imm64_not : Operand<i64> {
+  let ParserMatchClass = LogicalImm64NotOperand;
+}
+
+// imm0_65535 predicate - True if the immediate is in the range [0,65535].
+def Imm0_65535Operand : AsmImmRange<0, 65535>;
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535Operand;
+  let PrintMethod = "printHexImm";
+}
+
+// imm0_255 predicate - True if the immediate is in the range [0,255].
+def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 256;
+}]> {
+  let ParserMatchClass = Imm0_255Operand;
+  let PrintMethod = "printHexImm";
+}
+
+// imm0_127 predicate - True if the immediate is in the range [0,127]
+def Imm0_127Operand : AsmImmRange<0, 127>;
+def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 128;
+}]> {
+  let ParserMatchClass = Imm0_127Operand;
+  let PrintMethod = "printHexImm";
+}
+
+// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
+// for all shift-amounts.
+
+// imm0_63 predicate - True if the immediate is in the range [0,63]
+def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 64;
+}]> {
+  let ParserMatchClass = Imm0_63Operand;
+}
+
+// imm0_31 predicate - True if the immediate is in the range [0,31]
+def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 32;
+}]> {
+  let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_15 predicate - True if the immediate is in the range [0,15]
+def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
+// imm0_7 predicate - True if the immediate is in the range [0,7]
+def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = Imm0_7Operand;
+}
+
+// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
+def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 16;
+}]>;
+
+// An arithmetic shifter operand:
+//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
+//  {5-0} - imm6
+class arith_shift<ValueType Ty, int width> : Operand<Ty> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = !cast<AsmOperandClass>(
+                         "ArithmeticShifterOperand" # width);
+}
+
+def arith_shift32 : arith_shift<i32, 32>;
+def arith_shift64 : arith_shift<i64, 64>;
+
+class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
+    : Operand<Ty>,
+      ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
+  let PrintMethod = "printShiftedRegister";
+  let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width));
+}
+
+def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
+def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+
+// An arithmetic shifter operand:
+//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
+//  {5-0} - imm6
+class logical_shift<int width> : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = !cast<AsmOperandClass>(
+                         "LogicalShifterOperand" # width);
+}
+
+def logical_shift32 : logical_shift<32>;
+def logical_shift64 : logical_shift<64>;
+
+class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
+    : Operand<Ty>,
+      ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
+  let PrintMethod = "printShiftedRegister";
+  let MIOperandInfo = (ops regclass, shiftop);
+}
+
+def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
+def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+
+// A logical vector shifter operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0, #8, #16, or #24
+def logical_vec_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getVecShifterOpValue";
+  let ParserMatchClass = LogicalVecShifterOperand;
+}
+
+// A logical vector half-word shifter operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0 or #8
+def logical_vec_hw_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getVecShifterOpValue";
+  let ParserMatchClass = LogicalVecHalfWordShifterOperand;
+}
+
+// A vector move shifter operand:
+//  {0} - imm1: #8 or #16
+def move_vec_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getMoveVecShifterOpValue";
+  let ParserMatchClass = MoveVecShifterOperand;
+}
+
+def AddSubImmOperand : AsmOperandClass {
+  let Name = "AddSubImm";
+  let ParserMethod = "tryParseAddSubImm";
+  let DiagnosticType = "AddSubSecondSource";
+}
+// An ADD/SUB immediate shifter operand:
+//  second operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0 or #12
+class addsub_shifted_imm<ValueType Ty>
+    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
+  let PrintMethod = "printAddSubImm";
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
+def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
+
+class neg_addsub_shifted_imm<ValueType Ty>
+    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
+  let PrintMethod = "printAddSubImm";
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
+def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+
+// An extend operand:
+//  {5-3} - extend type
+//  {2-0} - imm3
+def arith_extend : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperand;
+}
+def arith_extend64 : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperand64;
+}
+
+// 'extend' that's a lsl of a 64-bit register.
+def arith_extendlsl64 : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperandLSL64;
+}
+
+class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
+                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+  let PrintMethod = "printExtendedRegister";
+  let MIOperandInfo = (ops GPR32, arith_extend);
+}
+
+class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
+                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+  let PrintMethod = "printExtendedRegister";
+  let MIOperandInfo = (ops GPR32, arith_extend64);
+}
+
+// Floating-point immediate.
+def fpimm32 : Operand<f32>,
+              PatLeaf<(f32 fpimm), [{
+      return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+def fpimm64 : Operand<f64>,
+              PatLeaf<(f64 fpimm), [{
+      return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm8 : Operand<i32> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+// Vector lane operands
+class AsmVectorIndex<string Suffix> : AsmOperandClass {
+  let Name = "VectorIndex" # Suffix;
+  let DiagnosticType = "InvalidIndex" # Suffix;
+}
+def VectorIndex1Operand : AsmVectorIndex<"1">;
+def VectorIndexBOperand : AsmVectorIndex<"B">;
+def VectorIndexHOperand : AsmVectorIndex<"H">;
+def VectorIndexSOperand : AsmVectorIndex<"S">;
+def VectorIndexDOperand : AsmVectorIndex<"D">;
+
+def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) == 1;
+}]> {
+  let ParserMatchClass = VectorIndex1Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = VectorIndexBOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndexHOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndexSOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndexDOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def simdimmtype10 : Operand<i32>,
+                    PatLeaf<(f64 fpimm), [{
+      return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF()
+                                               .bitcastToAPInt()
+                                               .getZExtValue());
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
+                                                           .bitcastToAPInt()
+                                                           .getZExtValue());
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let ParserMatchClass = SIMDImmType10Operand;
+  let PrintMethod = "printSIMDType10Operand";
+}
+
+
+//---
+// System management
+//---
+
+// Base encoding for system instruction operands.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
+                  list<dag> pattern = []>
+    : I<oops, iops, asm, operands, "", pattern> {
+  let Inst{31-22} = 0b1101010100;
+  let Inst{21}    = L;
+}
+
+// System instructions which do not have an Rt register.
+class SimpleSystemI<bit L, dag iops, string asm, string operands,
+                    list<dag> pattern = []>
+    : BaseSystemI<L, (outs), iops, asm, operands, pattern> {
+  let Inst{4-0} = 0b11111;
+}
+
+// System instructions which have an Rt register.
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
+    : BaseSystemI<L, oops, iops, asm, operands>,
+      Sched<[WriteSys]> {
+  bits<5> Rt;
+  let Inst{4-0} = Rt;
+}
+
+// Hint instructions that take both a CRm and a 3-bit immediate.
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity
+let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
+  class HintI<string mnemonic>
+      : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "",
+                      [(int_aarch64_hint imm0_127:$imm)]>,
+        Sched<[WriteHint]> {
+    bits <7> imm;
+    let Inst{20-12} = 0b000110010;
+    let Inst{11-5} = imm;
+  }
+
+// System instructions taking a single literal operand which encodes into
+// CRm. op2 differentiates the opcodes.
+def BarrierAsmOperand : AsmOperandClass {
+  let Name = "Barrier";
+  let ParserMethod = "tryParseBarrierOperand";
+}
+def barrier_op : Operand<i32> {
+  let PrintMethod = "printBarrierOption";
+  let ParserMatchClass = BarrierAsmOperand;
+}
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
+                 list<dag> pattern = []>
+    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
+      Sched<[WriteBarrier]> {
+  bits<4> CRm;
+  let Inst{20-12} = 0b000110011;
+  let Inst{11-8} = CRm;
+  let Inst{7-5} = opc;
+}
+
+// MRS/MSR system instructions. These have different operand classes because
+// a different subset of registers can be accessed through each instruction.
+def MRSSystemRegisterOperand : AsmOperandClass {
+  let Name = "MRSSystemRegister";
+  let ParserMethod = "tryParseSysReg";
+  let DiagnosticType = "MRS";
+}
+// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+def mrs_sysreg_op : Operand<i32> {
+  let ParserMatchClass = MRSSystemRegisterOperand;
+  let DecoderMethod = "DecodeMRSSystemRegister";
+  let PrintMethod = "printMRSSystemRegister";
+}
+
+def MSRSystemRegisterOperand : AsmOperandClass {
+  let Name = "MSRSystemRegister";
+  let ParserMethod = "tryParseSysReg";
+  let DiagnosticType = "MSR";
+}
+def msr_sysreg_op : Operand<i32> {
+  let ParserMatchClass = MSRSystemRegisterOperand;
+  let DecoderMethod = "DecodeMSRSystemRegister";
+  let PrintMethod = "printMSRSystemRegister";
+}
+
+class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
+                       "mrs", "\t$Rt, $systemreg"> {
+  bits<15> systemreg;
+  let Inst{20} = 1;
+  let Inst{19-5} = systemreg;
+}
+
+// FIXME: Some of these def NZCV, others don't. Best way to model that?
+// Explicitly modeling each of the system register as a register class
+// would do it, but feels like overkill at this point.
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
+                       "msr", "\t$systemreg, $Rt"> {
+  bits<15> systemreg;
+  let Inst{20} = 1;
+  let Inst{19-5} = systemreg;
+}
+
+def SystemPStateFieldOperand : AsmOperandClass {
+  let Name = "SystemPStateField";
+  let ParserMethod = "tryParseSysReg";
+}
+def pstatefield_op : Operand<i32> {
+  let ParserMatchClass = SystemPStateFieldOperand;
+  let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateI
+  : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm),
+                  "msr", "\t$pstate_field, $imm">,
+    Sched<[WriteSys]> {
+  bits<6> pstatefield;
+  bits<4> imm;
+  let Inst{20-19} = 0b00;
+  let Inst{18-16} = pstatefield{5-3};
+  let Inst{15-12} = 0b0100;
+  let Inst{11-8} = imm;
+  let Inst{7-5} = pstatefield{2-0};
+
+  let DecoderMethod = "DecodeSystemPStateInstruction";
+}
+
+// SYS and SYSL generic system instructions.
+def SysCRAsmOperand : AsmOperandClass {
+  let Name = "SysCR";
+  let ParserMethod = "tryParseSysCROperand";
+}
+
+def sys_cr_op : Operand<i32> {
+  let PrintMethod = "printSysCROperand";
+  let ParserMatchClass = SysCRAsmOperand;
+}
+
+class SystemXtI<bit L, string asm>
+  : RtSystemI<L, (outs),
+       (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
+       asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
+  bits<3> op1;
+  bits<4> Cn;
+  bits<4> Cm;
+  bits<3> op2;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = op1;
+  let Inst{15-12} = Cn;
+  let Inst{11-8}  = Cm;
+  let Inst{7-5}   = op2;
+}
+
+class SystemLXtI<bit L, string asm>
+  : RtSystemI<L, (outs),
+       (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+       asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
+  bits<3> op1;
+  bits<4> Cn;
+  bits<4> Cm;
+  bits<3> op2;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = op1;
+  let Inst{15-12} = Cn;
+  let Inst{11-8}  = Cm;
+  let Inst{7-5}   = op2;
+}
+
+
+// Branch (register) instructions:
+//
+//  case opc of
+//    0001 blr
+//    0000 br
+//    0101 dret
+//    0100 eret
+//    0010 ret
+//    otherwise UNDEFINED
+class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
+                    string operands, list<dag> pattern>
+    : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
+  let Inst{31-25} = 0b1101011;
+  let Inst{24-21} = opc;
+  let Inst{20-16} = 0b11111;
+  let Inst{15-10} = 0b000000;
+  let Inst{4-0}   = 0b00000;
+}
+
+class BranchReg<bits<4> opc, string asm, list<dag> pattern>
+    : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
+  bits<5> Rn;
+  let Inst{9-5} = Rn;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
+class SpecialReturn<bits<4> opc, string asm>
+    : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
+  let Inst{9-5} = 0b11111;
+}
+
+//---
+// Conditional branch instruction.
+//---
+
+// Condition code.
+// 4-bit immediate. Pretty-printed as <cc>
+def ccode : Operand<i32> {
+  let PrintMethod = "printCondCode";
+  let ParserMatchClass = CondCode;
+}
+def inv_ccode : Operand<i32> {
+  // AL and NV are invalid in the aliases which use inv_ccode
+  let PrintMethod = "printInverseCondCode";
+  let ParserMatchClass = CondCode;
+  let MCOperandPredicate = [{
+    return MCOp.isImm() &&
+           MCOp.getImm() != AArch64CC::AL &&
+           MCOp.getImm() != AArch64CC::NV;
+  }];
+}
+
+// Conditional branch target. 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def PCRelLabel19Operand : AsmOperandClass {
+  let Name = "PCRelLabel19";
+  let DiagnosticType = "InvalidLabel";
+}
+def am_brcond : Operand<OtherVT> {
+  let EncoderMethod = "getCondBranchTargetOpValue";
+  let DecoderMethod = "DecodePCRelLabel19";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = PCRelLabel19Operand;
+}
+
+class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
+                     "b", ".$cond\t$target", "",
+                     [(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
+                   Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let Uses = [NZCV];
+
+  bits<4> cond;
+  bits<19> target;
+  let Inst{31-24} = 0b01010100;
+  let Inst{23-5} = target;
+  let Inst{4} = 0;
+  let Inst{3-0} = cond;
+}
+
+//---
+// Compare-and-branch instructions.
+//---
+class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
+    : I<(outs), (ins regtype:$Rt, am_brcond:$target),
+         asm, "\t$Rt, $target", "",
+         [(node regtype:$Rt, bb:$target)]>,
+      Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+
+  bits<5> Rt;
+  bits<19> target;
+  let Inst{30-25} = 0b011010;
+  let Inst{24}    = op;
+  let Inst{23-5}  = target;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass CmpBranch<bit op, string asm, SDNode node> {
+  def W : BaseCmpBranch<GPR32, op, asm, node> {
+    let Inst{31} = 0;
+  }
+  def X : BaseCmpBranch<GPR64, op, asm, node> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Test-bit-and-branch instructions.
+//---
+// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
+// the target offset are implied zero and so are not part of the immediate.
+def BranchTarget14Operand : AsmOperandClass {
+  let Name = "BranchTarget14";
+}
+def am_tbrcond : Operand<OtherVT> {
+  let EncoderMethod = "getTestBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget14Operand;
+}
+
+// AsmOperand classes to emit (or not) special diagnostics
+def TBZImm0_31Operand : AsmOperandClass {
+  let Name = "TBZImm0_31";
+  let PredicateMethod = "isImm0_31";
+  let RenderMethod = "addImm0_31Operands";
+}
+def TBZImm32_63Operand : AsmOperandClass {
+  let Name = "Imm32_63";
+  let DiagnosticType = "InvalidImm0_63";
+}
+
+class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
+  return (((uint32_t)Imm) < 32);
+}]> {
+  let ParserMatchClass = matcher;
+}
+
+def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
+def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
+
+def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
+  return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64);
+}]> {
+  let ParserMatchClass = TBZImm32_63Operand;
+}
+
+class BaseTestBranch<RegisterClass regtype, Operand immtype,
+                     bit op, string asm, SDNode node>
+    : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
+       asm, "\t$Rt, $bit_off, $target", "",
+       [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
+      Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+
+  bits<5> Rt;
+  bits<6> bit_off;
+  bits<14> target;
+
+  let Inst{30-25} = 0b011011;
+  let Inst{24}    = op;
+  let Inst{23-19} = bit_off{4-0};
+  let Inst{18-5}  = target;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeTestAndBranch";
+}
+
+multiclass TestBranch<bit op, string asm, SDNode node> {
+  def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> {
+    let Inst{31} = 0;
+  }
+
+  def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> {
+    let Inst{31} = 1;
+  }
+
+  // Alias X-reg with 0-31 imm to W-Reg.
+  def : InstAlias<asm # "\t$Rd, $imm, $target",
+                  (!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
+                  tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
+  def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target),
+            (!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
+            tbz_imm0_31_diag:$imm, bb:$target)>;
+}
+
+//---
+// Unconditional branch (immediate) instructions.
+//---
+def BranchTarget26Operand : AsmOperandClass {
+  let Name = "BranchTarget26";
+  let DiagnosticType = "InvalidLabel";
+}
+def am_b_target : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget26Operand;
+}
+def am_bl_target : Operand<i64> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget26Operand;
+}
+
+class BImm<bit op, dag iops, string asm, list<dag> pattern>
+    : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
+  bits<26> addr;
+  let Inst{31}    = op;
+  let Inst{30-26} = 0b00101;
+  let Inst{25-0}  = addr;
+
+  let DecoderMethod = "DecodeUnconditionalBranch";
+}
+
+class BranchImm<bit op, string asm, list<dag> pattern>
+    : BImm<op, (ins am_b_target:$addr), asm, pattern>;
+class CallImm<bit op, string asm, list<dag> pattern>
+    : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
+
+//---
+// Basic one-operand data processing instructions.
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
+                         SDPatternOperator node>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+      [(set regtype:$Rd, (node regtype:$Rn))]>,
+    Sched<[WriteI, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+
+  let Inst{30-13} = 0b101101011000000000;
+  let Inst{12-10} = opc;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass OneOperandData<bits<3> opc, string asm,
+                          SDPatternOperator node = null_frag> {
+  def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
+    let Inst{31} = 0;
+  }
+
+  def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
+    let Inst{31} = 1;
+  }
+}
+
+class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
+    : BaseOneOperandData<opc, GPR32, asm, node> {
+  let Inst{31} = 0;
+}
+
+class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
+    : BaseOneOperandData<opc, GPR64, asm, node> {
+  let Inst{31} = 1;
+}
+
+//---
+// Basic two-operand data processing instructions.
+//---
+class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+                          list<dag> pattern>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{30}    = isSub;
+  let Inst{28-21} = 0b11010000;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+                      SDNode OpNode>
+    : BaseBaseAddSubCarry<isSub, regtype, asm,
+        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
+
+class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
+                              SDNode OpNode>
+    : BaseBaseAddSubCarry<isSub, regtype, asm,
+        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
+         (implicit NZCV)]> {
+  let Defs = [NZCV];
+}
+
+multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
+                       SDNode OpNode, SDNode OpNode_setflags> {
+  def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
+    let Inst{31} = 0;
+    let Inst{29} = 0;
+  }
+  def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
+    let Inst{31} = 1;
+    let Inst{29} = 0;
+  }
+
+  // Sets flags.
+  def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
+                                    OpNode_setflags> {
+    let Inst{31} = 0;
+    let Inst{29} = 1;
+  }
+  def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
+                                    OpNode_setflags> {
+    let Inst{31} = 1;
+    let Inst{29} = 1;
+  }
+}
+
+class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
+                     SDPatternOperator OpNode>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{30-21} = 0b0011010110;
+  let Inst{20-16} = Rm;
+  let Inst{15-14} = 0b00;
+  let Inst{13-10} = opc;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
+              SDPatternOperator OpNode>
+    : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
+  let Inst{10}    = isSigned;
+}
+
+multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
+  def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
+           Sched<[WriteID32, ReadID, ReadID]> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
+           Sched<[WriteID64, ReadID, ReadID]> {
+    let Inst{31} = 1;
+  }
+}
+
+class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
+                SDPatternOperator OpNode = null_frag>
+  : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
+    Sched<[WriteIS, ReadI]> {
+  let Inst{11-10} = shift_type;
+}
+
+multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
+  def Wr : BaseShift<shift_type, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
+                                             (EXTRACT_SUBREG i64:$Rm, sub_32))>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+}
+
+class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
+
+class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
+                       RegisterClass addtype, string asm,
+                       list<dag> pattern>
+  : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
+      asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<5> Ra;
+  let Inst{30-24} = 0b0011011;
+  let Inst{23-21} = opc;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = isSub;
+  let Inst{14-10} = Ra;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+  def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
+      [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+      Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
+    let Inst{31} = 0;
+  }
+
+  def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
+      [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+      Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
+    let Inst{31} = 1;
+  }
+}
+
+class WideMulAccum<bit isSub, bits<3> opc, string asm,
+                   SDNode AccNode, SDNode ExtNode>
+  : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
+    [(set GPR64:$Rd, (AccNode GPR64:$Ra,
+                            (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
+    Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
+  let Inst{31} = 1;
+}
+
+class MulHi<bits<3> opc, string asm, SDNode OpNode>
+  : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
+    Sched<[WriteIM64, ReadIM, ReadIM]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-24} = 0b10011011;
+  let Inst{23-21} = opc;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+  // (i.e. all bits 1) but is ignored by the processor.
+  let PostEncoderMethod = "fixMulHigh";
+}
+
+class MulAccumWAlias<string asm, Instruction inst>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
+class MulAccumXAlias<string asm, Instruction inst>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
+class WideMulAccumAlias<string asm, Instruction inst>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
+
+class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
+              SDPatternOperator OpNode, string asm>
+  : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
+    Sched<[WriteISReg, ReadI, ReadISReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31} = sf;
+  let Inst{30-21} = 0b0011010110;
+  let Inst{20-16} = Rm;
+  let Inst{15-13} = 0b010;
+  let Inst{12} = C;
+  let Inst{11-10} = sz;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+  let Predicates = [HasCRC];
+}
+
+//---
+// Address generation.
+//---
+
+class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
+    : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
+        pattern>,
+      Sched<[WriteI]> {
+  bits<5>  Xd;
+  bits<21> label;
+  let Inst{31}    = page;
+  let Inst{30-29} = label{1-0};
+  let Inst{28-24} = 0b10000;
+  let Inst{23-5}  = label{20-2};
+  let Inst{4-0}   = Xd;
+
+  let DecoderMethod = "DecodeAdrInstruction";
+}
+
+//---
+// Move immediate.
+//---
+
+def movimm32_imm : Operand<i32> {
+  let ParserMatchClass = Imm0_65535Operand;
+  let EncoderMethod = "getMoveWideImmOpValue";
+  let PrintMethod = "printHexImm";
+}
+def movimm32_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = MovImm32ShifterOperand;
+}
+def movimm64_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = MovImm64ShifterOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+                        string asm>
+  : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
+       asm, "\t$Rd, $imm$shift", "", []>,
+    Sched<[WriteImm]> {
+  bits<5> Rd;
+  bits<16> imm;
+  bits<6> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = shift{5-4};
+  let Inst{20-5}  = imm;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass MoveImmediate<bits<2> opc, string asm> {
+  def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+                          string asm>
+  : I<(outs regtype:$Rd),
+      (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
+       asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
+    Sched<[WriteI, ReadI]> {
+  bits<5> Rd;
+  bits<16> imm;
+  bits<6> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = shift{5-4};
+  let Inst{20-5}  = imm;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass InsertImmediate<bits<2> opc, string asm> {
+  def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Add/Subtract
+//---
+
+class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                    RegisterClass srcRegtype, addsub_shifted_imm immtype,
+                    string asm, SDPatternOperator OpNode>
+    : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
+        asm, "\t$Rd, $Rn, $imm", "",
+        [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
+      Sched<[WriteI, ReadI]>  {
+  bits<5>  Rd;
+  bits<5>  Rn;
+  bits<14> imm;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b10001;
+  let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
+  let Inst{21-10} = imm{11-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+  let DecoderMethod = "DecodeBaseAddSubImm";
+}
+
+class BaseAddSubRegPseudo<RegisterClass regtype,
+                          SDPatternOperator OpNode>
+    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+      Sched<[WriteI, ReadI, ReadI]>;
+
+class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
+                     arith_shifted_reg shifted_regtype, string asm,
+                     SDPatternOperator OpNode>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "",
+        [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
+  // The operands are in order to match the 'addr' MI operands, so we
+  // don't need an encoder method and by-name matching. Just use the default
+  // in-order handling. Since we're using by-order, make sure the names
+  // do not match.
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<8> shift;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-22} = shift{7-6};
+  let Inst{21}    = 0;
+  let Inst{20-16} = src2;
+  let Inst{15-10} = shift{5-0};
+  let Inst{9-5}   = src1;
+  let Inst{4-0}   = dst;
+
+  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                     RegisterClass src1Regtype, Operand src2Regtype,
+                     string asm, SDPatternOperator OpNode>
+    : I<(outs dstRegtype:$R1),
+        (ins src1Regtype:$R2, src2Regtype:$R3),
+        asm, "\t$R1, $R2, $R3", "",
+        [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> ext;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rm;
+  let Inst{15-13} = ext{5-3};
+  let Inst{12-10} = ext{2-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                       RegisterClass src1Regtype, RegisterClass src2Regtype,
+                       Operand ext_op, string asm>
+    : I<(outs dstRegtype:$Rd),
+        (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
+        asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> ext;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = ext{5};
+  let Inst{12-10} = ext{2-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+// Aliases for register+register add/subtract.
+class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
+                     RegisterClass src1Regtype, RegisterClass src2Regtype,
+                     int shiftExt>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
+                      shiftExt)>;
+
+multiclass AddSub<bit isSub, string mnemonic,
+                  SDPatternOperator OpNode = null_frag> {
+  let hasSideEffects = 0 in {
+  // Add/Subtract immediate
+  def Wri  : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
+                           mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xri  : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
+                           mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract register - Only used for CodeGen
+  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+  // Add/Subtract shifted register
+  def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
+                           OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
+                           OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  // Add/Subtract extended register
+  let AddedComplexity = 1, hasSideEffects = 0 in {
+  def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
+                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
+                           arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
+                               arith_extendlsl64, mnemonic> {
+    // UXTX and SXTX only.
+    let Inst{14-13} = 0b11;
+    let Inst{31} = 1;
+  }
+
+  // Register/register aliases with no shift when SP is not used.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                       GPR32, GPR32, GPR32, 0>;
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                       GPR64, GPR64, GPR64, 0>;
+
+  // Register/register aliases with no shift when either the destination or
+  // first source register is SP.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp> {
+  let isCompare = 1, Defs = [NZCV] in {
+  // Add/Subtract immediate
+  def Wri  : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
+                           mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xri  : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
+                           mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract register
+  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+  // Add/Subtract shifted register
+  def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
+                           OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
+                           OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract extended register
+  let AddedComplexity = 1 in {
+  def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
+                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
+                           arith_extended_reg32<i64>, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
+                               arith_extendlsl64, mnemonic> {
+    // UXTX and SXTX only.
+    let Inst{14-13} = 0b11;
+    let Inst{31} = 1;
+  }
+  } // Defs = [NZCV]
+
+  // Compare aliases
+  def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+                  WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
+  def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+                  XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+                  WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+                  XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+                  XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+                  WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+                  XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+
+  // Compare shorthands
+  def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+                  WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+  def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+                  XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+  def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx")
+                  WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
+  def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
+                  XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
+
+  // Register/register aliases with no shift when SP is not used.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                       GPR32, GPR32, GPR32, 0>;
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                       GPR64, GPR64, GPR64, 0>;
+
+  // Register/register aliases with no shift when the first source register
+  // is SP.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+//---
+// Extract
+//---
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                      SDTCisPtrTy<3>]>;
+def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
+
+class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
+                     list<dag> patterns>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
+         asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
+      Sched<[WriteExtr, ReadExtrHi]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> imm;
+
+  let Inst{30-23} = 0b00100111;
+  let Inst{21}    = 0;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = imm;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass ExtractImm<string asm> {
+  def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
+                      [(set GPR32:$Rd,
+                        (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imm<5> must be zero.
+    let imm{5}   = 0;
+  }
+  def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
+                      [(set GPR64:$Rd,
+                        (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+//---
+// Bitfield
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImm<bits<2> opc,
+                      RegisterClass regtype, Operand imm_type, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
+         asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
+      Sched<[WriteIS, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> immr;
+  bits<6> imms;
+
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{21-16} = immr;
+  let Inst{15-10} = imms;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass BitfieldImm<bits<2> opc, string asm> {
+  def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imms<5> and immr<5> must be zero, else ReservedValue().
+    let Inst{21} = 0;
+    let Inst{15} = 0;
+  }
+  def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImmWith2RegArgs<bits<2> opc,
+                      RegisterClass regtype, Operand imm_type, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
+                             imm_type:$imms),
+         asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
+      Sched<[WriteIS, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> immr;
+  bits<6> imms;
+
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{21-16} = immr;
+  let Inst{15-10} = imms;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
+  def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imms<5> and immr<5> must be zero, else ReservedValue().
+    let Inst{21} = 0;
+    let Inst{15} = 0;
+  }
+  def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+//---
+// Logical
+//---
+
+// Logical (immediate)
+class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
+                     RegisterClass sregtype, Operand imm_type, string asm,
+                     list<dag> pattern>
+    : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
+         asm, "\t$Rd, $Rn, $imm", "", pattern>,
+      Sched<[WriteI, ReadI]> {
+  bits<5>  Rd;
+  bits<5>  Rn;
+  bits<13> imm;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100100;
+  let Inst{22}    = imm{12};
+  let Inst{21-16} = imm{11-6};
+  let Inst{15-10} = imm{5-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeLogicalImmInstruction";
+}
+
+// Logical (shifted register)
+class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
+                      logical_shifted_reg shifted_regtype, string asm,
+                      list<dag> pattern>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
+  // The operands are in order to match the 'addr' MI operands, so we
+  // don't need an encoder method and by-name matching. Just use the default
+  // in-order handling. Since we're using by-order, make sure the names
+  // do not match.
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<8> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-24} = 0b01010;
+  let Inst{23-22} = shift{7-6};
+  let Inst{21}    = N;
+  let Inst{20-16} = src2;
+  let Inst{15-10} = shift{5-0};
+  let Inst{9-5}   = src1;
+  let Inst{4-0}   = dst;
+
+  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Aliases for register+register logical instructions.
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
+                      string Alias> {
+  let AddedComplexity = 6 in
+  def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
+                           [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
+                                               logical_imm32:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+  }
+  let AddedComplexity = 6 in
+  def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
+                           [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
+                                               logical_imm64:$imm))]> {
+    let Inst{31} = 1;
+  }
+
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
+                      logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
+                       logical_imm64_not:$imm), 0>;
+}
+
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
+                       string Alias> {
+  let isCompare = 1, Defs = [NZCV] in {
+  def Wri  : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
+      [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+  }
+  def Xri  : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
+      [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
+    let Inst{31} = 1;
+  }
+  } // end Defs = [NZCV]
+
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
+                      logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # " $Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
+                       logical_imm64_not:$imm), 0>;
+}
+
+class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
+    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+      Sched<[WriteI, ReadI, ReadI]>;
+
+// Split from LogicalImm as not all instructions have both.
+multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
+                      SDPatternOperator OpNode> {
+  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+                            [(set GPR32:$Rd, (OpNode GPR32:$Rn,
+                                                 logical_shifted_reg32:$Rm))]> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+                            [(set GPR64:$Rd, (OpNode GPR64:$Rn,
+                                                 logical_shifted_reg64:$Rm))]> {
+    let Inst{31} = 1;
+  }
+
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+// Split from LogicalReg to allow setting NZCV Defs
+multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
+                       SDPatternOperator OpNode = null_frag> {
+  let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+            [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+            [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
+    let Inst{31} = 1;
+  }
+  } // Defs = [NZCV]
+
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+//---
+// Conditionally set flags
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
+    : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
+         asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+      Sched<[WriteI, ReadI]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> imm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b111010010;
+  let Inst{20-16} = imm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = nzcv;
+}
+
+multiclass CondSetFlagsImm<bit op, string asm> {
+  def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b111010010;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = nzcv;
+}
+
+multiclass CondSetFlagsReg<bit op, string asm> {
+  def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Conditional select
+//---
+
+class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+  def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+                       PatFrag frag>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel regtype:$Rn, (frag regtype:$Rm),
+               (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+  AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
+  return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32);
+}]>;
+
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+  def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
+                                           (inv_cond_XFORM imm:$cond))>;
+
+  def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
+                                           (inv_cond_XFORM imm:$cond))>;
+}
+
+//---
+// Special Mask Value
+//---
+def maski8_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+def maski16_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
+def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
+def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
+def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
+def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
+def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
+
+class UImm12OffsetOperand<int Scale> : AsmOperandClass {
+  let Name = "UImm12Offset" # Scale;
+  let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
+  let PredicateMethod = "isUImm12Offset<" # Scale # ">";
+  let DiagnosticType = "InvalidMemoryIndexed" # Scale;
+}
+
+def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
+def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
+def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
+def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
+def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
+
+class uimm12_scaled<int Scale> : Operand<i64> {
+  let ParserMatchClass
+   = !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
+  let EncoderMethod
+   = "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
+  let PrintMethod = "printUImm12Offset<" # Scale # ">";
+}
+
+def uimm12s1 : uimm12_scaled<1>;
+def uimm12s2 : uimm12_scaled<2>;
+def uimm12s4 : uimm12_scaled<4>;
+def uimm12s8 : uimm12_scaled<8>;
+def uimm12s16 : uimm12_scaled<16>;
+
+class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                      string asm, list<dag> pattern>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+  bits<5> Rt;
+
+  bits<5> Rn;
+  bits<12> offset;
+
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b01;
+  let Inst{23-22} = opc;
+  let Inst{21-10} = offset;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeUnsignedLdStInstruction";
+}
+
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                  Operand indextype, string asm, list<dag> pattern> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
+                           (ins GPR64sp:$Rn, indextype:$offset),
+                           asm, pattern>,
+           Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             Operand indextype, string asm, list<dag> pattern> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def ui : BaseLoadStoreUI<sz, V, opc, (outs),
+                           (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
+                           asm, pattern>,
+           Sched<[WriteST]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+def PrefetchOperand : AsmOperandClass {
+  let Name = "Prefetch";
+  let ParserMethod = "tryParsePrefetch";
+}
+def prfop : Operand<i32> {
+  let PrintMethod = "printPrefetchOp";
+  let ParserMatchClass = PrefetchOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+    : BaseLoadStoreUI<sz, V, opc,
+                      (outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
+                      asm, pat>,
+      Sched<[WriteLD]>;
+
+//---
+// Load literal
+//---
+
+// Load literal address: 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def am_ldrlit : Operand<OtherVT> {
+  let EncoderMethod = "getLoadLiteralOpValue";
+  let DecoderMethod = "DecodePCRelLabel19";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = PCRelLabel19Operand;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+    : I<(outs regtype:$Rt), (ins am_ldrlit:$label),
+        asm, "\t$Rt, $label", "", []>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<19> label;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b011;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-5}  = label;
+  let Inst{4-0}   = Rt;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
+    : I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
+        asm, "\t$Rt, $label", "", pat>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<19> label;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b011;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-5}  = label;
+  let Inst{4-0}   = Rt;
+}
+
+//---
+// Load/store register offset
+//---
+
+def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
+def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
+def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
+def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
+def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+
+def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
+def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
+def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
+def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
+def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
+
+class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
+  let Name = "Mem" # Reg # "Extend" # Width;
+  let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
+  let RenderMethod = "addMemExtendOperands";
+  let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
+}
+
+def MemWExtend8Operand : MemExtendOperand<"W", 8> {
+  // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+  // the trivial shift.
+  let RenderMethod = "addMemExtend8Operands";
+}
+def MemWExtend16Operand : MemExtendOperand<"W", 16>;
+def MemWExtend32Operand : MemExtendOperand<"W", 32>;
+def MemWExtend64Operand : MemExtendOperand<"W", 64>;
+def MemWExtend128Operand : MemExtendOperand<"W", 128>;
+
+def MemXExtend8Operand : MemExtendOperand<"X", 8> {
+  // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+  // the trivial shift.
+  let RenderMethod = "addMemExtend8Operands";
+}
+def MemXExtend16Operand : MemExtendOperand<"X", 16>;
+def MemXExtend32Operand : MemExtendOperand<"X", 32>;
+def MemXExtend64Operand : MemExtendOperand<"X", 64>;
+def MemXExtend128Operand : MemExtendOperand<"X", 128>;
+
+class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
+        : Operand<i32> {
+  let ParserMatchClass = ParserClass;
+  let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
+  let DecoderMethod = "DecodeMemExtend";
+  let EncoderMethod = "getMemExtendOpValue";
+  let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
+}
+
+def ro_Wextend8   : ro_extend<MemWExtend8Operand,   "w", 8>;
+def ro_Wextend16  : ro_extend<MemWExtend16Operand,  "w", 16>;
+def ro_Wextend32  : ro_extend<MemWExtend32Operand,  "w", 32>;
+def ro_Wextend64  : ro_extend<MemWExtend64Operand,  "w", 64>;
+def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
+
+def ro_Xextend8   : ro_extend<MemXExtend8Operand,   "x", 8>;
+def ro_Xextend16  : ro_extend<MemXExtend16Operand,  "x", 16>;
+def ro_Xextend32  : ro_extend<MemXExtend32Operand,  "x", 32>;
+def ro_Xextend64  : ro_extend<MemXExtend64Operand,  "x", 64>;
+def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
+
+class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
+                  Operand wextend, Operand xextend>  {
+  // CodeGen-level pattern covering the entire addressing mode.
+  ComplexPattern Wpat = windex;
+  ComplexPattern Xpat = xindex;
+
+  // Asm-level Operand covering the valid "uxtw #3" style syntax.
+  Operand Wext = wextend;
+  Operand Xext = xextend;
+}
+
+def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
+def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
+def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
+def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
+def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
+                       ro_Xextend128>;
+
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+  : InstAlias<asm # " $Rt, [$Rn, $Rm]",
+              (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore8RO<sz, V, opc, regtype, asm,
+                 (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                             ro_Wextend8:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore8RO<sz, V, opc, regtype, asm,
+                 (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                             ro_Xextend8:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+                 (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+                 [(storeop (Ty regtype:$Rt),
+                           (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend8:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+                 (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+                 [(storeop (Ty regtype:$Rt),
+                           (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend8:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                              ro_Wextend16:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                             ro_Xextend16:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend16:$extend))]>,
+           Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend16:$extend))]>,
+           Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                              ro_Wextend32:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                              ro_Xextend32:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend32:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                        ro_Xextend32:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                [(set (Ty regtype:$Rt),
+                      (loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                             ro_Wextend64:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                              ro_Xextend64:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend64:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend64:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                               ro_Wextend128:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                               ro_Xextend128:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+               (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                          ro_Wextend128:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+               (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                          ro_Xextend128:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
+                     string asm, list<dag> pat>
+    : I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
+  def roW : BasePrefetchRO<sz, V, opc, (outs),
+                (ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                asm, [(AArch64Prefetch imm:$Rt,
+                                     (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                    ro_Wextend64:$extend))]> {
+    let Inst{13} = 0b0;
+  }
+
+  def roX : BasePrefetchRO<sz, V, opc, (outs),
+                (ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                asm,  [(AArch64Prefetch imm:$Rt,
+                                      (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                     ro_Xextend64:$extend))]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
+               (!cast<Instruction>(NAME # "roX") prfop:$Rt,
+                                                 GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+}
+
+//---
+// Load/store unscaled immediate
+//---
+
+def am_unscaled8 :  ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
+def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
+def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
+def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
+def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
+
+class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                           string asm, list<dag> pattern>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, list<dag> pattern> {
+  let AddedComplexity = 1 in // try this before LoadUI
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
+                               (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                         string asm, list<dag> pattern> {
+  let AddedComplexity = 1 in // try this before StoreUI
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+                               (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                               asm, pattern>,
+          Sched<[WriteST]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
+                            list<dag> pat> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+                               (ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
+                               asm, pat>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store unscaled immediate, unprivileged
+//---
+
+class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                                dag oops, dag iops, string asm>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                            RegisterClass regtype, string asm> {
+  let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+  def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
+                                    (ins GPR64sp:$Rn, simm9:$offset), asm>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                             RegisterClass regtype, string asm> {
+  let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+  def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
+                                 (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                                 asm>,
+          Sched<[WriteST]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store pre-indexed
+//---
+
+class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                          string asm, string cstr, list<dag> pat>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b11;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             string asm>
+    : BaseLoadStorePreIdx<sz, V, opc,
+                     (outs GPR64sp:$wback, regtype:$Rt),
+                     (ins GPR64sp:$Rn, simm9:$offset), asm,
+                     "$Rn = $wback", []>,
+      Sched<[WriteLD, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                  string asm, SDPatternOperator storeop, ValueType Ty>
+    : BaseLoadStorePreIdx<sz, V, opc,
+                      (outs GPR64sp:$wback),
+                      (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                      asm, "$Rn = $wback",
+      [(set GPR64sp:$wback,
+            (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+      Sched<[WriteAdr, WriteST]>;
+} // hasSideEffects = 0
+
+//---
+// Load/store post-indexed
+//---
+
+// (pre-index) load/stores.
+class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                          string asm, string cstr, list<dag> pat>
+    : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0b0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             string asm>
+    : BaseLoadStorePostIdx<sz, V, opc,
+                      (outs GPR64sp:$wback, regtype:$Rt),
+                      (ins GPR64sp:$Rn, simm9:$offset),
+                      asm, "$Rn = $wback", []>,
+      Sched<[WriteLD, WriteI]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, SDPatternOperator storeop, ValueType Ty>
+    : BaseLoadStorePostIdx<sz, V, opc,
+                      (outs GPR64sp:$wback),
+                      (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                       asm, "$Rn = $wback",
+      [(set GPR64sp:$wback,
+            (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+    Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+} // hasSideEffects = 0
+
+
+//---
+// Load/store pair
+//---
+
+// (indexed, offset)
+
+class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b010;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+                          Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStorePairOffset<opc, V, 1,
+                                  (outs regtype:$Rt, regtype:$Rt2),
+                                  (ins GPR64sp:$Rn, indextype:$offset), asm>,
+          Sched<[WriteLD, WriteLDHi]>;
+
+  def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+                           Operand indextype, string asm> {
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+  def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
+                                  (ins regtype:$Rt, regtype:$Rt2,
+                                       GPR64sp:$Rn, indextype:$offset),
+                                  asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+// (pre-indexed)
+class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b011;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+                     Operand indextype, string asm>
+    : BaseLoadStorePairPreIdx<opc, V, 1,
+                              (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+                              (ins GPR64sp:$Rn, indextype:$offset), asm>,
+      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand indextype, string asm>
+    : BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
+                             (ins regtype:$Rt, regtype:$Rt2,
+                                  GPR64sp:$Rn, indextype:$offset),
+                             asm>,
+      Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (post-indexed)
+
+class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b001;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand idxtype, string asm>
+    : BaseLoadStorePairPostIdx<opc, V, 1,
+                              (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+                              (ins GPR64sp:$Rn, idxtype:$offset), asm>,
+      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+                       Operand idxtype, string asm>
+    : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
+                             (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2,
+                                  GPR64sp:$Rn, idxtype:$offset),
+                             asm>,
+      Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+//  (no-allocate)
+
+class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b000;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+                           Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStorePairNoAlloc<opc, V, 1,
+                                   (outs regtype:$Rt, regtype:$Rt2),
+                                   (ins GPR64sp:$Rn, indextype:$offset), asm>,
+          Sched<[WriteLD, WriteLDHi]>;
+
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
+  def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
+                                   (ins regtype:$Rt, regtype:$Rt2,
+                                        GPR64sp:$Rn, indextype:$offset),
+                                   asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store exclusive
+//---
+
+// True exclusive operations write to and/or read from the system's exclusive
+// monitors, which as far as a compiler is concerned can be modelled as a
+// random shared memory address. Hence LoadExclusive mayStore.
+//
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fields since Rt and Rn are always used.
+//
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                             dag oops, dag iops, string asm, string operands>
+    : I<oops, iops, asm, operands, "", []> {
+  let Inst{31-30} = sz;
+  let Inst{29-24} = 0b001000;
+  let Inst{23}    = o2;
+  let Inst{22}    = L;
+  let Inst{21}    = o1;
+  let Inst{15}    = o0;
+
+  let DecoderMethod = "DecodeExclusiveLdStInstruction";
+}
+
+// Neither Rs nor Rt2 operands.
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                               dag oops, dag iops, string asm, string operands>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
+  bits<5> Rt;
+  bits<5> Rn;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+// Simple load acquires don't set the exclusive monitor
+let mayLoad = 1, mayStore = 0 in
+class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                  RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+                               (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteLD]>;
+
+class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                    RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+                               (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteLD]>;
+
+class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                       RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+                             (outs regtype:$Rt, regtype:$Rt2),
+                             (ins GPR64sp0:$Rn), asm,
+                             "\t$Rt, $Rt2, [$Rn]">,
+      Sched<[WriteLD, WriteLDHi]> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+}
+
+// Simple store release operations do not check the exclusive monitor.
+let mayLoad = 0, mayStore = 1 in
+class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                   RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
+                               (ins regtype:$Rt, GPR64sp0:$Rn),
+                               asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteST]>;
+
+let mayLoad = 1, mayStore = 1 in
+class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                     RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
+                             (ins regtype:$Rt, GPR64sp0:$Rn),
+                             asm, "\t$Ws, $Rt, [$Rn]">,
+      Sched<[WriteSTX]> {
+  bits<5> Ws;
+  bits<5> Rt;
+  bits<5> Rn;
+  let Inst{20-16} = Ws;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let Constraints = "@earlyclobber $Ws";
+  let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+}
+
+class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                         RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+                             (outs GPR32:$Ws),
+                             (ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
+                              asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
+      Sched<[WriteSTX]> {
+  bits<5> Ws;
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{20-16} = Ws;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let Constraints = "@earlyclobber $Ws";
+}
+
+//---
+// Exception generation
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
+    : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+      Sched<[WriteSys]> {
+  bits<16> imm;
+  let Inst{31-24} = 0b11010100;
+  let Inst{23-21} = op1;
+  let Inst{20-5}  = imm;
+  let Inst{4-2}   = 0b000;
+  let Inst{1-0}   = ll;
+}
+
+let Predicates = [HasFPARMv8] in {
+
+//---
+// Floating point to integer conversion
+//---
+
+class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn),
+         asm, "\t$Rd, $Rn", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-29} = 0b00;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      Operand immType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+         asm, "\t$Rd, $Rn, $scale", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-29} = 0b00;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21}    = 0;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = scale;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
+           SDPatternOperator OpN> {
+  // Unscaled single-precision to 32-bit
+  def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+  }
+
+  // Unscaled single-precision to 64-bit
+  def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+
+  // Unscaled double-precision to 32-bit
+  def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+  }
+
+  // Unscaled double-precision to 64-bit
+  def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+}
+
+multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
+                             SDPatternOperator OpN> {
+  // Scaled single-precision to 32-bit
+  def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
+                              fixedpoint_f32_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
+                                          fixedpoint_f32_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+  }
+
+  // Scaled single-precision to 64-bit
+  def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
+                              fixedpoint_f32_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
+                                          fixedpoint_f32_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+
+  // Scaled double-precision to 32-bit
+  def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
+                              fixedpoint_f64_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
+                                          fixedpoint_f64_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+  }
+
+  // Scaled double-precision to 64-bit
+  def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
+                              fixedpoint_f64_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
+                                          fixedpoint_f64_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+}
+
+//---
+// Integer to floating point conversion
+//---
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseIntegerToFP<bit isUnsigned,
+                      RegisterClass srcType, RegisterClass dstType,
+                      Operand immType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+         asm, "\t$Rd, $Rn, $scale", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-23} = 0b00111100;
+  let Inst{21-17} = 0b00001;
+  let Inst{16}    = isUnsigned;
+  let Inst{15-10} = scale;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseIntegerToFPUnscaled<bit isUnsigned,
+                      RegisterClass srcType, RegisterClass dstType,
+                      ValueType dvt, string asm, SDNode node>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn),
+         asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-23} = 0b00111100;
+  let Inst{21-17} = 0b10001;
+  let Inst{16}    = isUnsigned;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+  // Unscaled
+  def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  // Scaled
+  def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
+                             [(set FPR32:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f32_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+    let scale{5} = 1;
+  }
+
+  def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
+                             [(set FPR64:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f64_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+    let scale{5} = 1;
+  }
+
+  def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
+                             [(set FPR32:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f32_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
+                             [(set FPR64:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f64_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+}
+
+//---
+// Unscaled integer <-> floating point conversion (i.e. FMOV)
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      string asm>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
+        // We use COPY_TO_REGCLASS for these bitconvert operations.
+        // copyPhysReg() expands the resultant COPY instructions after
+        // regalloc is done. This gives greater freedom for the allocator
+        // and related passes (coalescing, copy propagation, et. al.) to
+        // be more effective.
+        [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111100;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
+                     RegisterClass srcType, RegisterOperand dstType, string asm,
+                     string kind>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+        "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111101;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod =  "DecodeFMOVLaneInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
+                     RegisterOperand srcType, RegisterClass dstType, string asm,
+                     string kind>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+        "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111101;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod =  "DecodeFMOVLaneInstruction";
+}
+
+
+
+multiclass UnscaledConversion<string asm> {
+  def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
+                                             asm, ".d"> {
+    let Inst{31} = 1;
+    let Inst{22} = 0;
+  }
+
+  def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
+                                               asm, ".d"> {
+    let Inst{31} = 1;
+    let Inst{22} = 0;
+  }
+}
+
+//---
+// Floating point conversion
+//---
+
+class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
+                       RegisterClass srcType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-24} = 0b00011110;
+  let Inst{23-22} = type;
+  let Inst{21-17} = 0b10001;
+  let Inst{16-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPConversion<string asm> {
+  // Double-precision to Half-precision
+  def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
+                             [(set FPR16:$Rd, (fround FPR64:$Rn))]>;
+
+  // Double-precision to Single-precision
+  def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+                             [(set FPR32:$Rd, (fround FPR64:$Rn))]>;
+
+  // Half-precision to Double-precision
+  def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
+                             [(set FPR64:$Rd, (fextend FPR16:$Rn))]>;
+
+  // Half-precision to Single-precision
+  def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
+                             [(set FPR32:$Rd, (fextend FPR16:$Rn))]>;
+
+  // Single-precision to Double-precision
+  def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+                             [(set FPR64:$Rd, (fextend FPR32:$Rn))]>;
+
+  // Single-precision to Half-precision
+  def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
+                             [(set FPR16:$Rd, (fround FPR32:$Rn))]>;
+}
+
+//---
+// Single operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
+                              ValueType vt, string asm, SDPatternOperator node>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+         [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21-19} = 0b100;
+  let Inst{18-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SingleOperandFPData<bits<4> opcode, string asm,
+                               SDPatternOperator node = null_frag> {
+  def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+//---
+// Two operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
+                           string asm, list<dag> pat>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+         asm, "\t$Rd, $Rn, $Rm", "", pat>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass TwoOperandFPData<bits<4> opcode, string asm,
+                            SDPatternOperator node = null_frag> {
+  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+                         [(set (f32 FPR32:$Rd),
+                               (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+                         [(set (f64 FPR64:$Rd),
+                               (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+                  [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+                  [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+
+//---
+// Three operand floating point data processing
+//---
+
+class BaseThreeOperandFPData<bit isNegated, bit isSub,
+                             RegisterClass regtype, string asm, list<dag> pat>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
+         asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
+      Sched<[WriteFMul]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<5> Ra;
+  let Inst{31-23} = 0b000111110;
+  let Inst{21}    = isNegated;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = isSub;
+  let Inst{14-10} = Ra;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
+                              SDPatternOperator node> {
+  def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
+            [(set FPR32:$Rd,
+                  (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
+            [(set FPR64:$Rd,
+                  (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+//---
+// Floating point data comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandFPComparison<bit signalAllNans,
+                                 RegisterClass regtype, string asm,
+                                 list<dag> pat>
+    : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rn;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+
+  let Inst{15-10} = 0b001000;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = 0b1000;
+
+  // Rm should be 0b00000 canonically, but we need to accept any value.
+  let PostEncoderMethod = "fixOneOperandFPComparison";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
+                                string asm, list<dag> pat>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rm;
+  bits<5> Rn;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = 0b001000;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = 0b0000;
+}
+
+multiclass FPComparison<bit signalAllNans, string asm,
+                        SDPatternOperator OpNode = null_frag> {
+  let Defs = [NZCV] in {
+  def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
+      [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
+    let Inst{22} = 0;
+  }
+
+  def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
+      [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{22} = 0;
+  }
+
+  def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
+      [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
+    let Inst{22} = 1;
+  }
+
+  def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
+      [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{22} = 1;
+  }
+  } // Defs = [NZCV]
+}
+
+//---
+// Floating point conditional comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPCondComparison<bit signalAllNans,
+                              RegisterClass regtype, string asm>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = nzcv;
+}
+
+multiclass FPCondComparison<bit signalAllNans, string asm> {
+  let Defs = [NZCV], Uses = [NZCV] in {
+  def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
+    let Inst{22} = 0;
+  }
+
+  def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
+    let Inst{22} = 1;
+  }
+  } // Defs = [NZCV], Uses = [NZCV]
+}
+
+//---
+// Floating point conditional select
+//---
+
+class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel (vt regtype:$Rn), regtype:$Rm,
+                          (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b11;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPCondSelect<string asm> {
+  let Uses = [NZCV] in {
+  def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
+    let Inst{22} = 0;
+  }
+
+  def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
+    let Inst{22} = 1;
+  }
+  } // Uses = [NZCV]
+}
+
+//---
+// Floating move immediate
+//---
+
+class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
+  : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
+      [(set regtype:$Rd, fpimmtype:$imm)]>,
+    Sched<[WriteFImm]> {
+  bits<5> Rd;
+  bits<8> imm;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-13} = imm;
+  let Inst{12-5}  = 0b10000000;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPMoveImmediate<string asm> {
+  def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
+    let Inst{22} = 0;
+  }
+
+  def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
+    let Inst{22} = 1;
+  }
+}
+} // end of 'let Predicates = [HasFPARMv8]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON] in {
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register vector instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string kind,
+                        list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string kind,
+                        list<dag> pattern>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// All operand sizes distinguished in the encoding.
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+         [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+         [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+         [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+         [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+         [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+         [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
+                                      asm, ".2d",
+         [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
+// As above, but D sized elements unsupported.
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+        [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+        [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+        [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+        [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+        [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+        [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+      [(set (v8i8 V64:$dst),
+            (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+      [(set (v16i8 V128:$dst),
+            (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+      [(set (v4i16 V64:$dst),
+            (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+      [(set (v8i16 V128:$dst),
+            (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+      [(set (v2i32 V64:$dst),
+            (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+      [(set (v4i32 V128:$dst),
+            (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+    [(set (v16i8 V128:$Rd),
+          (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+}
+
+// As above, but only S and D sized floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+                                    string asm,
+                                    SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+        [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+        [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+        [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+     [(set (v2f32 V64:$dst),
+           (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+     [(set (v4f32 V128:$dst),
+           (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+     [(set (v2f64 V128:$dst),
+           (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+        [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+        [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+        [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+        [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+                                     asm, ".8b",
+                         [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+  def v16i8  : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+                                     asm, ".16b",
+                         [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
+
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+                                  string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+                                     asm, ".8b",
+             [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+                                     asm, ".16b",
+             [(set (v16i8 V128:$dst),
+                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                           (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string dstkind,
+                        string srckind, list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind #
+      "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                            RegisterOperand regtype, string asm, string dstkind,
+                            string srckind, list<dag> pattern>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind #
+      "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// Supports B, H, and S element sizes.
+multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
+                            SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b", ".8b",
+                          [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b", ".16b",
+                          [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h", ".4h",
+                          [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h", ".8h",
+                          [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
+                            RegisterOperand regtype, string asm, string dstkind,
+                            string srckind, string amount>
+  : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
+      "|" # dstkind # "\t$Rd, $Rn, #" #  amount # "}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-24} = 0b101110;
+  let Inst{23-22} = size;
+  let Inst{21-10} = 0b100001001110;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDVectorLShiftLongBySizeBHS {
+  let neverHasSideEffects = 1 in {
+  def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
+                                             "shll", ".8h",  ".8b", "8">;
+  def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
+                                             "shll2", ".8h", ".16b", "8">;
+  def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
+                                             "shll", ".4s",  ".4h", "16">;
+  def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
+                                             "shll2", ".4s", ".8h", "16">;
+  def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
+                                             "shll", ".2d",  ".2s", "32">;
+  def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
+                                             "shll2", ".2d", ".4s", "32">;
+  }
+}
+
+// Supports all element sizes.
+multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v8i8_v4i16  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".4h", ".8b",
+               [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".8h", ".16b",
+               [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".2s", ".4h",
+               [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".4s", ".8h",
+               [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".1d", ".2s",
+               [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".2d", ".4s",
+               [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v8i8_v4i16  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+                                          asm, ".4h", ".8b",
+      [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
+                                      (v8i8 V64:$Rn)))]>;
+  def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+                                          asm, ".8h", ".16b",
+      [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
+                                      (v16i8 V128:$Rn)))]>;
+  def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+                                          asm, ".2s", ".4h",
+      [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
+                                      (v4i16 V64:$Rn)))]>;
+  def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+                                          asm, ".4s", ".8h",
+      [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+                                      (v8i16 V128:$Rn)))]>;
+  def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+                                          asm, ".1d", ".2s",
+      [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
+                                      (v2i32 V64:$Rn)))]>;
+  def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+                                          asm, ".2d", ".4s",
+      [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
+                                      (v4i32 V128:$Rn)))]>;
+}
+
+// Supports all element sizes, except 1xD.
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+                                    asm, ".8b", ".8b",
+    [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+                                    asm, ".16b", ".16b",
+    [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+                                    asm, ".4h", ".4h",
+    [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+                                    asm, ".8h", ".8h",
+    [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+                                    asm, ".2s", ".2s",
+    [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+                                    asm, ".4s", ".4s",
+    [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+  def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
+                                    asm, ".2d", ".2d",
+    [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                asm, ".8b", ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                asm, ".16b", ".16b",
+    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                asm, ".4h", ".4h",
+    [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                asm, ".8h", ".8h",
+    [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+                                asm, ".2s", ".2s",
+    [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+                                asm, ".4s", ".4s",
+    [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
+                                asm, ".2d", ".2d",
+    [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Supports only B element sizes.
+multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
+                          SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+                                asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+                                asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+}
+
+// Supports only B and H element sizes.
+multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                asm, ".4h", ".4h",
+                    [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                asm, ".8h", ".8h",
+                    [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
+}
+
+// Supports only S and D element sizes, uses high bit of the size field
+// as an extra opcode bit.
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+// Supports only S element size.
+multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+
+multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand inreg, RegisterOperand outreg,
+                           string asm, string outkind, string inkind,
+                           list<dag> pattern>
+  : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind #
+      "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand inreg, RegisterOperand outreg,
+                           string asm, string outkind, string inkind,
+                           list<dag> pattern>
+  : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind #
+      "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
+                                      asm, ".8b", ".8h",
+        [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
+                                      asm#"2", ".16b", ".8h", []>;
+  def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
+                                      asm, ".4h", ".4s",
+        [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
+                                      asm#"2", ".8h", ".4s", []>;
+  def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
+                                      asm, ".2s", ".2d",
+        [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+  def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
+                                      asm#"2", ".4s", ".2d", []>;
+
+  def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v16i8")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v8i16")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v4i32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand regtype,
+                           string asm, string kind, string zero,
+                           ValueType dty, ValueType sty, SDNode OpNode>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
+      "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
+      [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// Comparisons support all element sizes, except 1xD.
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
+                            SDNode OpNode> {
+  def v8i8rz  : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
+                                     asm, ".8b", "0",
+                                     v8i8, v8i8, OpNode>;
+  def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
+                                     asm, ".16b", "0",
+                                     v16i8, v16i8, OpNode>;
+  def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
+                                     asm, ".4h", "0",
+                                     v4i16, v4i16, OpNode>;
+  def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
+                                     asm, ".8h", "0",
+                                     v8i16, v8i16, OpNode>;
+  def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
+                                     asm, ".2s", "0",
+                                     v2i32, v2i32, OpNode>;
+  def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
+                                     asm, ".4s", "0",
+                                     v4i32, v4i32, OpNode>;
+  def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
+                                     asm, ".2d", "0",
+                                     v2i64, v2i64, OpNode>;
+}
+
+// FP Comparisons support only S and D element sizes.
+multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
+                              string asm, SDNode OpNode> {
+
+  def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
+                                     asm, ".2s", "0.0",
+                                     v2i32, v2f32, OpNode>;
+  def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
+                                     asm, ".4s", "0.0",
+                                     v4i32, v4f32, OpNode>;
+  def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
+                                     asm, ".2d", "0.0",
+                                     v2i64, v2f64, OpNode>;
+
+  def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0",
+                  (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0",
+                  (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0",
+                  (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # ".2s $Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # ".4s $Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # ".2d $Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                             RegisterOperand outtype, RegisterOperand intype,
+                             string asm, string VdTy, string VnTy,
+                             list<dag> pattern>
+  : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
+      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                             RegisterOperand outtype, RegisterOperand intype,
+                             string asm, string VdTy, string VnTy,
+                             list<dag> pattern>
+  : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
+      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
+  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
+                                    asm, ".4s", ".4h", []>;
+  def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
+                                    asm#"2", ".4s", ".8h", []>;
+  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
+                                    asm, ".2d", ".2s", []>;
+  def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".2d", ".4s", []>;
+}
+
+multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
+  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
+                                    asm, ".4h", ".4s", []>;
+  def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
+                                    asm#"2", ".8h", ".4s", []>;
+  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+                                    asm, ".2s", ".2d", []>;
+  def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".4s", ".2d", []>;
+}
+
+multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
+                                     Intrinsic OpNode> {
+  def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+                                     asm, ".2s", ".2d",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+  def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".4s", ".2d", []>;
+
+  def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v4f32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register different-size vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
+                      RegisterOperand outtype, RegisterOperand intype1,
+                      RegisterOperand intype2, string asm,
+                      string outkind, string inkind1, string inkind2,
+                      list<dag> pattern>
+  : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+      "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
+                      RegisterOperand outtype, RegisterOperand intype1,
+                      RegisterOperand intype2, string asm,
+                      string outkind, string inkind1, string inkind2,
+                      list<dag> pattern>
+  : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+      "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// FIXME: TableGen doesn't know how to deal with expanded types that also
+//        change the element count (in this case, placing the results in
+//        the high elements of the result register rather than the low
+//        elements). Until that's fixed, we can't code-gen those.
+multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                    Intrinsic IntOp> {
+  def v8i16_v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".8b", ".8h", ".8h",
+     [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v8i16_v16i8  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".16b", ".8h", ".8h",
+     []>;
+  def v4i32_v4i16  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".4h", ".4s", ".4s",
+     [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v4i32_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".8h", ".4s", ".4s",
+     []>;
+  def v2i64_v2i32  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".2s", ".2d", ".2d",
+     [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+  def v2i64_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".2d", ".2d",
+     []>;
+
+
+  // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
+  // a version attached to an instruction.
+  def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v8i16_v16i8")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
+                                                    (v4i32 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v4i32_v8i16")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
+                                                    (v2i64 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v2i64_v4i32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
+                                      Intrinsic IntOp> {
+  def v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                            V128, V64, V64,
+                                            asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                            V128, V128, V128,
+                                            asm#"2", ".8h", ".16b", ".16b", []>;
+  let Predicates = [HasCrypto] in {
+    def v1i64  : BaseSIMDDifferentThreeVector<U, 0b110, opc,
+                                              V128, V64, V64,
+                                              asm, ".1q", ".1d", ".1d", []>;
+    def v2i64  : BaseSIMDDifferentThreeVector<U, 0b111, opc,
+                                              V128, V128, V128,
+                                              asm#"2", ".1q", ".2d", ".2d", []>;
+  }
+
+  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
+                          (v8i8 (extract_high_v16i8 V128:$Rm)))),
+      (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+                                      (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+                                      (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd),
+            (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+      [(set (v8i16 V128:$Rd),
+            (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+                                (extract_high_v16i8 V128:$Rm)))))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd),
+            (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd),
+            (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+                                  (extract_high_v8i16 V128:$Rm)))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd),
+            (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd),
+            (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+                                 (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
+                                          string asm,
+                                          SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+    [(set (v8i16 V128:$dst),
+          (add (v8i16 V128:$Rd),
+               (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+    [(set (v8i16 V128:$dst),
+          (add (v8i16 V128:$Rd),
+               (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+                                   (extract_high_v16i8 V128:$Rm))))))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (add (v4i32 V128:$Rd),
+               (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (add (v4i32 V128:$Rd),
+               (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+                                    (extract_high_v8i16 V128:$Rm))))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (add (v2i64 V128:$Rd),
+               (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (add (v2i64 V128:$Rd),
+               (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+                                    (extract_high_v4i32 V128:$Rm))))))]>;
+}
+
+multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+      [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
+                                      (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+                                      (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+                                      (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
+                                      string asm,
+                                      SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd),
+                  (extract_high_v16i8 V128:$Rn),
+                  (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd),
+                  (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd),
+                  (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
+                                           SDPatternOperator Accum> {
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+                                                (v4i16 V64:$Rm)))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
+                                            (extract_high_v8i16 V128:$Rm)))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
+                                                (v2i32 V64:$Rm)))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
+                                            (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".8h", ".8h", ".8b",
+       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".8h", ".8h", ".16b",
+       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                                       (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".4s", ".4s", ".4h",
+       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".4s", ".8h",
+       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                                       (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".2d", ".2d", ".2s",
+       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".2d", ".4s",
+       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                                       (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
+                             string asm, string kind>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
+      "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
+      [(set (vty regtype:$Rd),
+            (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> imm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size;
+  let Inst{29-21} = 0b101110000;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-11} = imm;
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+
+multiclass SIMDBitwiseExtract<string asm> {
+  def v8i8  : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
+    let imm{3} = 0;
+  }
+  def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
+                        string asm, string kind, SDNode OpNode, ValueType valty>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm}", "",
+      [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 0;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDZipVector<bits<3>opc, string asm,
+                         SDNode OpNode> {
+  def v8i8   : BaseSIMDZipVector<0b000, opc, V64,
+      asm, ".8b", OpNode, v8i8>;
+  def v16i8  : BaseSIMDZipVector<0b001, opc, V128,
+      asm, ".16b", OpNode, v16i8>;
+  def v4i16  : BaseSIMDZipVector<0b010, opc, V64,
+      asm, ".4h", OpNode, v4i16>;
+  def v8i16  : BaseSIMDZipVector<0b011, opc, V128,
+      asm, ".8h", OpNode, v8i16>;
+  def v2i32  : BaseSIMDZipVector<0b100, opc, V64,
+      asm, ".2s", OpNode, v2i32>;
+  def v4i32  : BaseSIMDZipVector<0b101, opc, V128,
+      asm, ".4s", OpNode, v4i32>;
+  def v2i64  : BaseSIMDZipVector<0b111, opc, V128,
+      asm, ".2d", OpNode, v2i64>;
+
+  def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
+  def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, string asm,
+                        list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "\t$Rd, $Rn, $Rm", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
+                            SDPatternOperator OpNode> {
+  def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+}
+
+multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+  def v1i32  : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
+  def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+  def v1i8   : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
+
+  def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+  def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+            (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v1i32  : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
+                             [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+  def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+}
+
+multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+      [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+      [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+  }
+
+  def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
+                                SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+      [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+      [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
+              dag oops, dag iops, string asm, string cstr, list<dag> pat>
+  : I<oops, iops, asm,
+      "\t$Rd, $Rn, $Rm", cstr, pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+                                      (outs FPR32:$Rd),
+                                      (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
+  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+                                      (outs FPR64:$Rd),
+                                      (ins FPR32:$Rn, FPR32:$Rm), asm, "",
+            [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+                                      (outs FPR32:$dst),
+                                      (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
+                                      asm, "$Rd = $dst", []>;
+  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+                                      (outs FPR64:$dst),
+                                      (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
+                                      asm, "$Rd = $dst",
+            [(set (i64 FPR64:$dst),
+                  (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, RegisterClass regtype2,
+                        string asm, list<dag> pat>
+  : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
+      "\t$Rd, $Rn", "", pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, RegisterClass regtype2,
+                        string asm, list<dag> pat>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
+      "\t$Rd, $Rn", "$Rd = $dst", pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, string asm, string zero>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "\t$Rd, $Rn, #" # zero, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
+  : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
+     [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-17} = 0b011111100110000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v1i64rz  : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">;
+
+  def : Pat<(v1i64 (OpNode FPR64:$Rn)),
+            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v1i64rz  : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm, "0.0">;
+  def v1i32rz  : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">;
+
+  def : InstAlias<asm # " $Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
+  def : InstAlias<asm # " $Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
+
+  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
+                          SDPatternOperator OpNode = null_frag> {
+  def v1i64       : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
+
+  def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
+  def v1i64       : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
+  def v1i32       : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
+}
+
+multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
+                                [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
+  def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
+                                [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+}
+
+multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def v1i64  : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+           [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+    def v1i32  : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
+           [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+    def v1i16  : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
+    def v1i8   : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
+                                 Intrinsic OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def v1i64  : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
+        [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
+    def v1i32  : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
+        [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
+    def v1i16  : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
+    def v1i8   : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
+}
+
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def v1i32  : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
+        [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+  def v1i16  : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
+  def v1i8   : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, RegisterOperand vectype,
+                        string asm, string kind>
+  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
+  def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
+                                      asm, ".2d">;
+}
+
+multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
+  def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
+                                      asm, ".2s">;
+  def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
+                                      asm, ".2d">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
+                          RegisterClass regtype, RegisterOperand vectype,
+                          string asm, string kind, list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
+                              string asm> {
+  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8,  V64,
+                                   asm, ".8b", []>;
+  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8,  V128,
+                                   asm, ".16b", []>;
+  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
+                                   asm, ".4h", []>;
+  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
+                                   asm, ".8h", []>;
+  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
+                                   asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
+  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
+                                   asm, ".8b", []>;
+  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
+                                   asm, ".16b", []>;
+  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
+                                   asm, ".4h", []>;
+  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
+                                   asm, ".8h", []>;
+  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
+                                   asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
+                            Intrinsic intOp> {
+  def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
+                                   asm, ".4s",
+        [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// FIXME: There has got to be a better way to factor these. ugh.
+
+class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
+                     string operands, string constraints, list<dag> pattern>
+  : I<outs, ins, asm, operands, constraints, pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{15} = 0;
+  let Inst{10} = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
+                      RegisterOperand vecreg, RegisterClass regtype>
+  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
+                   "{\t$Rd" # size # ", $Rn" #
+                   "|" # size # "\t$Rd, $Rn}", "",
+                   [(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
+  let Inst{20-16} = imm5;
+  let Inst{14-11} = 0b0001;
+}
+
+class SIMDDupFromElement<bit Q, string dstkind, string srckind,
+                         ValueType vectype, ValueType insreg,
+                         RegisterOperand vecreg, Operand idxtype,
+                         ValueType elttype, SDNode OpNode>
+  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
+                   "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
+                   "|" # dstkind # "\t$Rd, $Rn$idx}", "",
+                 [(set (vectype vecreg:$Rd),
+                       (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
+  let Inst{14-11} = 0b0000;
+}
+
+class SIMDDup64FromElement
+  : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
+                       VectorIndexD, i64, AArch64duplane64> {
+  bits<1> idx;
+  let Inst{20} = idx;
+  let Inst{19-16} = 0b1000;
+}
+
+class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
+                           RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
+                       VectorIndexS, i64, AArch64duplane32> {
+  bits<2> idx;
+  let Inst{20-19} = idx;
+  let Inst{18-16} = 0b100;
+}
+
+class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
+                           RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
+                       VectorIndexH, i64, AArch64duplane16> {
+  bits<3> idx;
+  let Inst{20-18} = idx;
+  let Inst{17-16} = 0b10;
+}
+
+class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
+                          RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
+                       VectorIndexB, i64, AArch64duplane8> {
+  bits<4> idx;
+  let Inst{20-17} = idx;
+  let Inst{16} = 1;
+}
+
+class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
+                  Operand idxtype, string asm, list<dag> pattern>
+  : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
+                   "{\t$Rd, $Rn" # size # "$idx" #
+                   "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
+  let Inst{14-11} = imm4;
+}
+
+class SIMDSMov<bit Q, string size, RegisterClass regtype,
+               Operand idxtype>
+  : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
+class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
+               Operand idxtype>
+  : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
+      [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
+
+class SIMDMovAlias<string asm, string size, Instruction inst,
+                   RegisterClass regtype, Operand idxtype>
+    : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
+                    "|" # size # "\t$dst, $src$idx}",
+                (inst regtype:$dst, V128:$src, idxtype:$idx)>;
+
+multiclass SMov {
+  def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+}
+
+multiclass UMov {
+  def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+  def : SIMDMovAlias<"mov", ".s",
+                     !cast<Instruction>(NAME#"vi32"),
+                     GPR32, VectorIndexS>;
+  def : SIMDMovAlias<"mov", ".d",
+                     !cast<Instruction>(NAME#"vi64"),
+                     GPR64, VectorIndexD>;
+}
+
+class SIMDInsFromMain<string size, ValueType vectype,
+                      RegisterClass regtype, Operand idxtype>
+  : BaseSIMDInsDup<1, 0, (outs V128:$dst),
+                   (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
+                   "{\t$Rd" # size # "$idx, $Rn" #
+                   "|" # size # "\t$Rd$idx, $Rn}",
+                   "$Rd = $dst",
+            [(set V128:$dst,
+              (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+  let Inst{14-11} = 0b0011;
+}
+
+class SIMDInsFromElement<string size, ValueType vectype,
+                         ValueType elttype, Operand idxtype>
+  : BaseSIMDInsDup<1, 1, (outs V128:$dst),
+                   (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
+                   "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
+                   "|" # size # "\t$Rd$idx, $Rn$idx2}",
+                   "$Rd = $dst",
+         [(set V128:$dst,
+               (vector_insert
+                 (vectype V128:$Rd),
+                 (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
+                 idxtype:$idx))]>;
+
+class SIMDInsMainMovAlias<string size, Instruction inst,
+                          RegisterClass regtype, Operand idxtype>
+    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
+                        "|" # size #"\t$dst$idx, $src}",
+                (inst V128:$dst, idxtype:$idx, regtype:$src)>;
+class SIMDInsElementMovAlias<string size, Instruction inst,
+                             Operand idxtype>
+    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+                      # "|" # size #" $dst$idx, $src$idx2}",
+                (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
+
+
+multiclass SIMDIns {
+  def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
+    bits<4> idx;
+    bits<4> idx2;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+    let Inst{14-11} = idx2;
+  }
+  def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
+    bits<3> idx;
+    bits<3> idx2;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+    let Inst{14-12} = idx2;
+    let Inst{11} = 0;
+  }
+  def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
+    bits<2> idx;
+    bits<2> idx2;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+    let Inst{14-13} = idx2;
+    let Inst{12-11} = 0;
+  }
+  def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
+    bits<1> idx;
+    bits<1> idx2;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+    let Inst{14} = idx2;
+    let Inst{13-11} = 0;
+  }
+
+  // For all forms of the INS instruction, the "mov" mnemonic is the
+  // preferred alias. Why they didn't just call the instruction "mov" in
+  // the first place is a very good question indeed...
+  def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+                         GPR32, VectorIndexB>;
+  def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+                         GPR32, VectorIndexH>;
+  def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+                         GPR32, VectorIndexS>;
+  def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+                         GPR64, VectorIndexD>;
+
+  def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+                         VectorIndexB>;
+  def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+                         VectorIndexH>;
+  def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+                         VectorIndexS>;
+  def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+                         VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15}    = 0;
+  let Inst{14-13} = len;
+  let Inst{12}    = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Vn;
+  let Inst{4-0}   = Vd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15}    = 0;
+  let Inst{14-13} = len;
+  let Inst{12}    = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Vn;
+  let Inst{4-0}   = Vd;
+}
+
+class SIMDTableLookupAlias<string asm, Instruction inst,
+                          RegisterOperand vectype, RegisterOperand listtype>
+    : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
+                (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+multiclass SIMDTableLookup<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+multiclass SIMDTableLookupTied<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+                        string kind, Operand idxtype>
+  : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+       "{\t$dst, $src" # kind # "$idx" #
+       "|\t$dst, $src$idx}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> dst;
+  bits<5> src;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{15-10} = 0b000001;
+  let Inst{9-5}   = src;
+  let Inst{4-0}   = dst;
+}
+
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+      RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+    : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+                    # "|\t$dst, $src$index}",
+                (inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
+
+
+multiclass SIMDScalarCPY<string asm> {
+  def i8  : BaseSIMDScalarCPY<FPR8,  V128, ".b", VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
+                                                          VectorIndexD:$idx)))),
+            (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
+
+  // 'DUP' mnemonic aliases.
+  def : SIMDScalarCPYAlias<"dup", ".b",
+                           !cast<Instruction>(NAME#"i8"),
+                           FPR8, V128, VectorIndexB>;
+  def : SIMDScalarCPYAlias<"dup", ".h",
+                           !cast<Instruction>(NAME#"i16"),
+                           FPR16, V128, VectorIndexH>;
+  def : SIMDScalarCPYAlias<"dup", ".s",
+                           !cast<Instruction>(NAME#"i32"),
+                           FPR32, V128, VectorIndexS>;
+  def : SIMDScalarCPYAlias<"dup", ".d",
+                           !cast<Instruction>(NAME#"i64"),
+                           FPR64, V128, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//----------------------------------------------------------------------------
+
+class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
+                          string asm, string op_string,
+                          string cstr, list<dag> pattern>
+  : I<oops, iops, asm, op_string, cstr, pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<8> imm8;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = op;
+  let Inst{28-19} = 0b0111100000;
+  let Inst{18-16} = imm8{7-5};
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = imm8{4-0};
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
+                                Operand immtype, dag opt_shift_iop,
+                                string opt_shift, string asm, string kind,
+                                list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
+                        !con((ins immtype:$imm8), opt_shift_iop), asm,
+                        "{\t$Rd" # kind # ", $imm8" # opt_shift #
+                        "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+                        "", pattern> {
+  let DecoderMethod = "DecodeModImmInstruction";
+}
+
+class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
+                                Operand immtype, dag opt_shift_iop,
+                                string opt_shift, string asm, string kind,
+                                list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
+                        !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
+                        asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
+                             "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+                        "$Rd = $dst", pattern> {
+  let DecoderMethod = "DecodeModImmTiedInstruction";
+}
+
+class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
+                                     RegisterOperand vectype, string asm,
+                                     string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+                              (ins logical_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15}    = b15_b12{1};
+  let Inst{14-13} = shift;
+  let Inst{12}    = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
+                                     RegisterOperand vectype, string asm,
+                                     string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+                              (ins logical_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15}    = b15_b12{1};
+  let Inst{14-13} = shift;
+  let Inst{12}    = b15_b12{0};
+}
+
+
+class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
+                                         RegisterOperand vectype, string asm,
+                                         string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+                              (ins logical_vec_hw_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15} = b15_b12{1};
+  let Inst{14} = 0;
+  let Inst{13} = shift{0};
+  let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
+                                         RegisterOperand vectype, string asm,
+                                         string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+                              (ins logical_vec_hw_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15} = b15_b12{1};
+  let Inst{14} = 0;
+  let Inst{13} = shift{0};
+  let Inst{12} = b15_b12{0};
+}
+
+multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
+                                      string asm> {
+  def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
+                                                 asm, ".4h", []>;
+  def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
+                                                 asm, ".8h", []>;
+
+  def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
+                                             asm, ".2s", []>;
+  def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
+                                             asm, ".4s", []>;
+}
+
+multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
+                                      bits<2> w_cmode, string asm,
+                                      SDNode OpNode> {
+  def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
+                                                 asm, ".4h",
+             [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
+                                             imm0_255:$imm8,
+                                             (i32 imm:$shift)))]>;
+  def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
+                                                 asm, ".8h",
+             [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
+                                              imm0_255:$imm8,
+                                              (i32 imm:$shift)))]>;
+
+  def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
+                                             asm, ".2s",
+             [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
+                                             imm0_255:$imm8,
+                                             (i32 imm:$shift)))]>;
+  def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
+                                             asm, ".4s",
+             [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
+                                              imm0_255:$imm8,
+                                              (i32 imm:$shift)))]>;
+}
+
+class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
+                             RegisterOperand vectype, string asm,
+                             string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+                              (ins move_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<1> shift;
+  let Inst{15-13} = cmode{3-1};
+  let Inst{12}    = shift;
+}
+
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
+                                   RegisterOperand vectype,
+                                   Operand imm_type, string asm,
+                                   string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
+                              asm, kind, pattern> {
+  let Inst{15-12} = cmode;
+}
+
+class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
+                                   list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+                        "\t$Rd, $imm8", "", pattern> {
+  let Inst{15-12} = cmode;
+  let DecoderMethod = "DecodeModImmInstruction";
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
+                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
+                      string apple_kind, string dst_kind, string lhs_kind,
+                      string rhs_kind, list<dag> pattern>
+  : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
+      asm,
+      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28}    = Scalar;
+  let Inst{27-24} = 0b1111;
+  let Inst{23-22} = size;
+  // Bit 21 must be set by the derived class.
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opc;
+  // Bit 11 must be set by the derived class.
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
+                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
+                      string apple_kind, string dst_kind, string lhs_kind,
+                      string rhs_kind, list<dag> pattern>
+  : I<(outs dst_reg:$dst),
+      (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
+      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28}    = Scalar;
+  let Inst{27-24} = 0b1111;
+  let Inst{23-22} = size;
+  // Bit 21 must be set by the derived class.
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opc;
+  // Bit 11 must be set by the derived class.
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2f32 V64:$Rd),
+        (OpNode (v2f32 V64:$Rn),
+         (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4f32 V128:$Rd),
+        (OpNode (v4f32 V128:$Rn),
+         (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
+                                      V128, V128,
+                                      V128, VectorIndexD,
+                                      asm, ".2d", ".2d", ".2d", ".d",
+    [(set (v2f64 V128:$Rd),
+        (OpNode (v2f64 V128:$Rn),
+         (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+    [(set (f32 FPR32Op:$Rd),
+          (OpNode (f32 FPR32Op:$Rn),
+                  (f32 (vector_extract (v4f32 V128:$Rm),
+                                       VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d",
+    [(set (f64 FPR64Op:$Rd),
+          (OpNode (f64 FPR64Op:$Rn),
+                  (f64 (vector_extract (v2f64 V128:$Rm),
+                                       VectorIndexD:$idx))))]> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
+  // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 V128:$Rm),
+                                           VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # v2i32_indexed)
+                V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 FPR32Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+
+  // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 V128:$Rm),
+                                           VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v4i32_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 FPR32Op:$Rm)))),
+            (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 V128:$Rm),
+                                           VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v2i64_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 FPR64Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexD:$idx)>;
+}
+
+multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+                                          V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+                                      V128, V128,
+                                      V128, VectorIndexD,
+                                      asm, ".2d", ".2d", ".2d", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+                         SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s",  ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+       (OpNode (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+                                      FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+      [(set (i32 FPR32Op:$Rd),
+            (OpNode FPR32Op:$Rn,
+                    (i32 (vector_extract (v4i32 V128:$Rm),
+                                         VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+       (OpNode (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
+                                          V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$dst),
+        (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$dst),
+       (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$dst),
+       (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$dst),
+       (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$Rd),
+          (OpNode (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$Rd),
+        (OpNode (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$Rd),
+          (OpNode (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR64Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
+                                       SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull
+                             (v4i16 V64:$Rn),
+                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
+  // intermediate EXTRACT_SUBREG would be untyped.
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                (i32 (vector_extract (v4i32
+                         (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx)))),
+                         (i64 0))))),
+            (EXTRACT_SUBREG
+                (!cast<Instruction>(NAME # v4i16_indexed)
+                    (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
+                    V128_lo:$Rm, VectorIndexH:$idx),
+                ssub)>;
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull
+                            (extract_high_v8i16 V128:$Rn),
+                            (extract_high_v8i16
+                                (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$dst),
+        (Accum (v2i64 V128:$Rd),
+               (v2i64 (int_aarch64_neon_sqdmull
+                          (v2i32 V64:$Rn),
+                          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull
+                            (extract_high_v4i32 V128:$Rn),
+                            (extract_high_v4i32
+                                (AArch64duplane32 (v4i32 V128:$Rm),
+                                                VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR64Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+    [(set (i64 FPR64Op:$dst),
+          (Accum (i64 FPR64Op:$Rd),
+                 (i64 (int_aarch64_neon_sqdmulls_scalar
+                            (i32 FPR32Op:$Rn),
+                            (i32 (vector_extract (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$Rd),
+          (OpNode (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$Rd),
+        (OpNode (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$Rd),
+          (OpNode (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+  }
+}
+
+multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
+                                       SDPatternOperator OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$dst),
+        (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd),
+                  (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$dst),
+        (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd),
+                  (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+  }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterClass regtype1, RegisterClass regtype2,
+                     Operand immtype, string asm, list<dag> pattern>
+  : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
+      asm, "\t$Rd, $Rn, $imm", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<7> imm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterClass regtype1, RegisterClass regtype2,
+                     Operand immtype, string asm, list<dag> pattern>
+  : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
+      asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<7> imm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftR32, asm, []> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm,
+  [(set (i64 FPR64:$Rd),
+     (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm,
+  [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
+                                                   (i32 vecshiftR64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                           (i32 vecshiftR64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
+                                            vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm,
+    [(set (v1i64 FPR64:$Rd),
+       (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
+  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode = null_frag> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR16, vecshiftR8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR32, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR64, vecshiftR32, asm,
+    [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
+    let Inst{20-16} = imm{4-0};
+  }
+}
+
+multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR8, vecshiftL8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftL16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftL32, asm,
+    [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm,
+    [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR8, vecshiftR8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftR32, asm, []> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector x indexed element
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterOperand dst_reg, RegisterOperand src_reg,
+                     Operand immtype,
+                     string asm, string dst_kind, string src_kind,
+                     list<dag> pattern>
+  : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
+      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterOperand vectype1, RegisterOperand vectype2,
+                     Operand immtype,
+                     string asm, string dst_kind, string src_kind,
+                     list<dag> pattern>
+  : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
+      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
+                              Intrinsic OpNode> {
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+      [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+      [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+      [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
+                                  Intrinsic OpNode> {
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+      [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+      [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+      [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+                                     SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V128, vecshiftR16Narrow,
+                                  asm, ".8b", ".8h",
+      [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR16Narrow,
+                                  asm#"2", ".16b", ".8h", []> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V128, vecshiftR32Narrow,
+                                  asm, ".4h", ".4s",
+      [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR32Narrow,
+                                  asm#"2", ".8h", ".4s", []> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V128, vecshiftR64Narrow,
+                                  asm, ".2s", ".2d",
+      [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR64Narrow,
+                                  asm#"2", ".4s", ".2d", []> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions
+  // themselves, so put them here instead.
+
+  // Patterns involving what's effectively an insert high and a normal
+  // intrinsic, represented by CONCAT_VECTORS.
+  def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
+                                                   vecshiftR16Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v16i8_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR16Narrow:$imm)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
+                                                     vecshiftR32Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v8i16_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR32Narrow:$imm)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
+                                                     vecshiftR64Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v4i32_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR64Narrow:$imm)>;
+}
+
+multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftL8,
+                                  asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+                       (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm, ".16b", ".16b",
+             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+                   (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftL16,
+                                  asm, ".4h", ".4h",
+              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+                    (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm, ".8h", ".8h",
+            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                  (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftL32,
+                                  asm, ".2s", ".2s",
+              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+                    (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm, ".4s", ".4s",
+            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftL64,
+                                  asm, ".2d", ".2d",
+            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                  (i32 vecshiftL64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftR8,
+                                  asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+                       (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR8,
+                                  asm, ".16b", ".16b",
+             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+                   (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16,
+                                  asm, ".4h", ".4h",
+              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+                    (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16,
+                                  asm, ".8h", ".8h",
+            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                  (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+                    (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                  (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                  (i32 vecshiftR64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
+                                    SDPatternOperator OpNode = null_frag> {
+  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftR8, asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+                           (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR8, asm, ".16b", ".16b",
+             [(set (v16i8 V128:$dst),
+               (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                       (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16, asm, ".4h", ".4h",
+              [(set (v4i16 V64:$dst),
+                (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                        (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16, asm, ".8h", ".8h",
+            [(set (v8i16 V128:$dst),
+              (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                      (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32, asm, ".2s", ".2s",
+              [(set (v2i32 V64:$dst),
+                (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                        (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32, asm, ".4s", ".4s",
+            [(set (v4i32 V128:$dst),
+              (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                      (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
+              (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                      (i32 vecshiftR64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
+                                    SDPatternOperator OpNode = null_frag> {
+  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftL8,
+                                  asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$dst),
+                          (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+                                  (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$dst),
+                          (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                                  (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftL16,
+                                  asm, ".4h", ".4h",
+                    [(set (v4i16 V64:$dst),
+                           (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                                   (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm, ".8h", ".8h",
+                    [(set (v8i16 V128:$dst),
+                          (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                                  (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftL32,
+                                  asm, ".2s", ".2s",
+                    [(set (v2i32 V64:$dst),
+                          (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm, ".4s", ".4s",
+                    [(set (v4i32 V128:$dst),
+                          (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftL64,
+                                  asm, ".2d", ".2d",
+                    [(set (v2i64 V128:$dst),
+                          (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                                  (i32 vecshiftL64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V64, vecshiftL8, asm, ".8h", ".8b",
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm#"2", ".8h", ".16b",
+      [(set (v8i16 V128:$Rd),
+            (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V64, vecshiftL16, asm, ".4s", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm#"2", ".4s", ".8h",
+      [(set (v4i32 V128:$Rd),
+            (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V64, vecshiftL32, asm, ".2d", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm#"2", ".2d", ".4s",
+      [(set (v2i64 V128:$Rd),
+            (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+}
+
+
+//---
+// Vector load/store
+//---
+// SIMD ldX/stX no-index memory references don't allow the optional
+// ", #0" constant and handle post-indexing explicitly, so we use
+// a more specialized parse method for them. Otherwise, it's the same as
+// the general GPR64sp handling.
+
+class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
+                   string asm, dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29-23} = 0b0011000;
+  let Inst{22} = L;
+  let Inst{21-16} = 0b000000;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
+                       string asm, dag oops, dag iops>
+  : I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {
+  bits<5> Vt;
+  bits<5> Rn;
+  bits<5> Xm;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29-23} = 0b0011001;
+  let Inst{22} = L;
+  let Inst{21} = 0;
+  let Inst{20-16} = Xm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+// The immediate form of AdvSIMD post-indexed addressing is encoded with
+// register post-index addressing from the zero register.
+multiclass SIMDLdStAliases<string asm, string layout, string Count,
+                           int Offset, int Size> {
+  // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
+  //      "ld1\t$Vt, [$Rn], #16"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      XZR), 1>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1], #16"
+  //      "ld1.8b\t$Vt, [$Rn], #16"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      XZR), 0>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1]"
+  //      "ld1\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+                  (!cast<Instruction>(NAME # Count # "v" # layout)
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1], x2"
+  //      "ld1\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
+                       int Offset64, bits<4> opcode> {
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
+                           (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
+                           (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
+                           (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
+                           (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
+                           (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
+                           (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
+                           (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+
+
+    def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "16b"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "8h"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "4s"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "2d"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "8b"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "4h"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "2s"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+// Only ld1/st1 has a v1d version.
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
+                       int Offset64, bits<4> opcode> {
+  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
+    def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
+                            (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+                                 GPR64sp:$Rn), []>;
+    def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+                                GPR64sp:$Rn), []>;
+
+    def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+                       int Offset128, int Offset64, bits<4> opcode>
+  : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+  // LD1 instructions have extra "1d" variants.
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
+                           (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+
+    def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "1d"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+                       int Offset128, int Offset64, bits<4> opcode>
+  : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+  // ST1 instructions have extra "1d" variants.
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+    def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+                                GPR64sp:$Rn), []>;
+
+    def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass SIMDLd1Multiple<string asm> {
+  defm One   : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8,  0b0111>;
+  defm Two   : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+  defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+  defm Four  : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDSt1Multiple<string asm> {
+  defm One   : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8,  0b0111>;
+  defm Two   : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+  defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+  defm Four  : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDLd2Multiple<string asm> {
+  defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDSt2Multiple<string asm> {
+  defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDLd3Multiple<string asm> {
+  defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDSt3Multiple<string asm> {
+  defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDLd4Multiple<string asm> {
+  defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+multiclass SIMDSt4Multiple<string asm> {
+  defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+//---
+// AdvSIMD Load/store single-element
+//---
+
+class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
+                         string asm, string operands, string cst,
+                         dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, operands, cst, pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{29-24} = 0b001101;
+  let Inst{22} = L;
+  let Inst{21} = R;
+  let Inst{15-13} = opcode;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
+                         string asm, string operands, string cst,
+                         dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{29-24} = 0b001101;
+  let Inst{22} = L;
+  let Inst{21} = R;
+  let Inst{15-13} = opcode;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
+                  Operand listtype>
+  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
+                       (outs listtype:$Vt), (ins GPR64sp:$Rn),
+                       []> {
+  let Inst{30} = Q;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = S;
+  let Inst{11-10} = size;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
+                      string asm, Operand listtype, Operand GPR64pi>
+  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
+                       "$Rn = $wback",
+                       (outs GPR64sp:$wback, listtype:$Vt),
+                       (ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
+  bits<5> Xm;
+  let Inst{30} = Q;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = S;
+  let Inst{11-10} = size;
+}
+
+multiclass SIMDLdrAliases<string asm, string layout, string Count,
+                          int Offset, int Size> {
+  // E.g. "ld1r { v0.8b }, [x1], #1"
+  //      "ld1r.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      XZR), 1>;
+
+  // E.g. "ld1r.8b { v0 }, [x1], #1"
+  //      "ld1r.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      XZR), 0>;
+
+  // E.g. "ld1r.8b { v0 }, [x1]"
+  //      "ld1r.8b\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+                  (!cast<Instruction>(NAME # "v" # layout)
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1r.8b { v0 }, [x1], x2"
+  //      "ld1r.8b\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
+  int Offset1, int Offset2, int Offset4, int Offset8> {
+  def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
+                        !cast<Operand>("VecList" # Count # "8b")>;
+  def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
+                        !cast<Operand>("VecList" # Count #"16b")>;
+  def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
+                        !cast<Operand>("VecList" # Count #"4h")>;
+  def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
+                        !cast<Operand>("VecList" # Count #"8h")>;
+  def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
+                        !cast<Operand>("VecList" # Count #"2s")>;
+  def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
+                        !cast<Operand>("VecList" # Count #"4s")>;
+  def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
+                        !cast<Operand>("VecList" # Count #"1d")>;
+  def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
+                        !cast<Operand>("VecList" # Count #"2d")>;
+
+  def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
+                                 !cast<Operand>("VecList" # Count # "8b"),
+                                 !cast<Operand>("GPR64pi" # Offset1)>;
+  def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
+                                 !cast<Operand>("VecList" # Count # "16b"),
+                                 !cast<Operand>("GPR64pi" # Offset1)>;
+  def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
+                                 !cast<Operand>("VecList" # Count # "4h"),
+                                 !cast<Operand>("GPR64pi" # Offset2)>;
+  def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
+                                 !cast<Operand>("VecList" # Count # "8h"),
+                                 !cast<Operand>("GPR64pi" # Offset2)>;
+  def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
+                                 !cast<Operand>("VecList" # Count # "2s"),
+                                 !cast<Operand>("GPR64pi" # Offset4)>;
+  def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
+                                 !cast<Operand>("VecList" # Count # "4s"),
+                                 !cast<Operand>("GPR64pi" # Offset4)>;
+  def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
+                                 !cast<Operand>("VecList" # Count # "1d"),
+                                 !cast<Operand>("GPR64pi" # Offset8)>;
+  def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
+                                 !cast<Operand>("VecList" # Count # "2d"),
+                                 !cast<Operand>("GPR64pi" # Offset8)>;
+
+  defm : SIMDLdrAliases<asm, "8b",  Count, Offset1,  64>;
+  defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
+  defm : SIMDLdrAliases<asm, "4h",  Count, Offset2,  64>;
+  defm : SIMDLdrAliases<asm, "8h",  Count, Offset2, 128>;
+  defm : SIMDLdrAliases<asm, "2s",  Count, Offset4,  64>;
+  defm : SIMDLdrAliases<asm, "4s",  Count, Offset4, 128>;
+  defm : SIMDLdrAliases<asm, "1d",  Count, Offset8,  64>;
+  defm : SIMDLdrAliases<asm, "2d",  Count, Offset8, 128>;
+}
+
+class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  let Inst{30} = idx{3};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  let Inst{30} = idx{3};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{3};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{3};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+
+class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  let Inst{30} = idx{2};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  let Inst{30} = idx{2};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+
+class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{2};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{2};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  let Inst{30} = idx{1};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  let Inst{30} = idx{1};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{1};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{1};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  let Inst{30} = idx;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  let Inst{30} = idx;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  bits<5> Xm;
+  let Inst{30} = idx;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  bits<5> Xm;
+  let Inst{30} = idx;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
+                           (outs listtype:$dst),
+                           (ins listtype:$Vt, VectorIndexB:$idx,
+                                GPR64sp:$Rn), []>;
+
+  def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexB:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i8 : SIMDLdStSingleB<0, R, opcode, asm,
+                           (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+                                        GPR64sp:$Rn), []>;
+
+  def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
+                                    (outs GPR64sp:$wback),
+                                    (ins listtype:$Vt, VectorIndexB:$idx,
+                                         GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+
+multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
+                                 string Count, int Offset, Operand idxtype> {
+  // E.g. "ld1 { v0.8b }[0], [x1], #1"
+  //      "ld1\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Type  # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      idxtype:$idx, XZR), 1>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1], #1"
+  //      "ld1.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Type # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                      idxtype:$idx, XZR), 0>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1]"
+  //      "ld1.8b\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
+                      (!cast<Instruction>(NAME # Type)
+                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                         idxtype:$idx, GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1], x2"
+  //      "ld1.8b\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
+                      (!cast<Instruction>(NAME # Type # "_POST")
+                         GPR64sp:$Rn,
+                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                         idxtype:$idx,
+                         !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdSt1SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "One", 1, VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+}
+
+multiclass SIMDLdSt2SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Two", 2,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8,  VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+}
+
+multiclass SIMDLdSt3SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Three", 3,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+}
+
+multiclass SIMDLdSt4SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Four", 4,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+}
+} // end of 'let Predicates = [HasNEON]'
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+let Predicates = [HasCrypto] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
+              list<dag> pat>
+  : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-16} = 0b0100111000101000;
+  let Inst{15-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
+  : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
+            [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
+  : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
+            "$Rd = $dst",
+            [(set (v16i8 V128:$dst),
+                  (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
+                     dag oops, dag iops, list<dag> pat>
+  : I<oops, iops, asm,
+      "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
+      "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+                   (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
+                   [(set (v4i32 FPR128:$dst),
+                         (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
+                   (ins V128:$Rd, V128:$Rn, V128:$Rm),
+                   [(set (v4i32 V128:$dst),
+                         (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+                   (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
+                   [(set (v4i32 FPR128:$dst),
+                         (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA2OpInst<bits<4> opc, string asm, string kind,
+                 string cstr, dag oops, dag iops,
+                 list<dag> pat>
+  : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
+                       "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-16} = 0b0101111000101000;
+  let Inst{15-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
+  : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
+               (ins V128:$Rd, V128:$Rn),
+               [(set (v4i32 V128:$dst),
+                     (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+
+class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
+  : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
+               [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+} // end of 'let Predicates = [HasCrypto]'
+
+// Allow the size specifier tokens to be upper case, not just lower.
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
new file mode 100644
index 0000000..ce85b2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -0,0 +1,2089 @@
+//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AArch64GenInstrInfo.inc"
+
+AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
+    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+      RI(this, &STI), Subtarget(STI) {}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MachineBasicBlock &MBB = *MI->getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  if (MI->getOpcode() == AArch64::INLINEASM)
+    return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+
+  const MCInstrDesc &Desc = MI->getDesc();
+  switch (Desc.getOpcode()) {
+  default:
+    // Anything not explicitly designated otherwise is a nomal 4-byte insn.
+    return 4;
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    return 0;
+  }
+
+  llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size");
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  // Block ends with fall-through condbranch.
+  switch (LastInst->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown branch instruction?");
+  case AArch64::Bcc:
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(LastInst->getOperand(0));
+    break;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+    Cond.push_back(LastInst->getOperand(0));
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    Target = LastInst->getOperand(2).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+    Cond.push_back(LastInst->getOperand(0));
+    Cond.push_back(LastInst->getOperand(1));
+  }
+}
+
+// Branch analysis.
+bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      parseCondBranch(LastInst, TBB, Cond);
+      return false;
+    }
+    return true; // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+        // Return now the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    parseCondBranch(SecondLastInst, TBB, Cond);
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // ...likewise if it ends with an indirect branch followed by an unconditional
+  // branch.
+  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+bool AArch64InstrInfo::ReverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond[0].getImm() != -1) {
+    // Regular Bcc
+    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
+  } else {
+    // Folded compare-and-branch
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown conditional branch!");
+    case AArch64::CBZW:
+      Cond[1].setImm(AArch64::CBNZW);
+      break;
+    case AArch64::CBNZW:
+      Cond[1].setImm(AArch64::CBZW);
+      break;
+    case AArch64::CBZX:
+      Cond[1].setImm(AArch64::CBNZX);
+      break;
+    case AArch64::CBNZX:
+      Cond[1].setImm(AArch64::CBZX);
+      break;
+    case AArch64::TBZW:
+      Cond[1].setImm(AArch64::TBNZW);
+      break;
+    case AArch64::TBNZW:
+      Cond[1].setImm(AArch64::TBZW);
+      break;
+    case AArch64::TBZX:
+      Cond[1].setImm(AArch64::TBNZX);
+      break;
+    case AArch64::TBNZX:
+      Cond[1].setImm(AArch64::TBZX);
+      break;
+    }
+  }
+
+  return false;
+}
+
+unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin())
+    return 1;
+  --I;
+  if (!isCondBranchOpcode(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+void AArch64InstrInfo::instantiateCondBranch(
+    MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
+    const SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond[0].getImm() != -1) {
+    // Regular Bcc
+    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
+  } else {
+    // Folded compare-and-branch
+    const MachineInstrBuilder MIB =
+        BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+    if (Cond.size() > 3)
+      MIB.addImm(Cond[3].getImm());
+    MIB.addMBB(TBB);
+  }
+}
+
+unsigned AArch64InstrInfo::InsertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (!FBB) {
+    if (Cond.empty()) // Unconditional branch?
+      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
+    else
+      instantiateCondBranch(MBB, DL, TBB, Cond);
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  instantiateCondBranch(MBB, DL, TBB, Cond);
+  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
+  return 2;
+}
+
+// Find the original register that VReg is copied from.
+static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+    if (!DefMI->isFullCopy())
+      return VReg;
+    VReg = DefMI->getOperand(1).getReg();
+  }
+  return VReg;
+}
+
+// Determine if VReg is defined by an instruction that can be folded into a
+// csel instruction. If so, return the folded opcode, and the replacement
+// register.
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
+                                unsigned *NewVReg = nullptr) {
+  VReg = removeCopies(MRI, VReg);
+  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+    return 0;
+
+  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
+  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+  unsigned Opc = 0;
+  unsigned SrcOpNum = 0;
+  switch (DefMI->getOpcode()) {
+  case AArch64::ADDSXri:
+  case AArch64::ADDSWri:
+    // if NZCV is used, do not fold.
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return 0;
+  // fall-through to ADDXri and ADDWri.
+  case AArch64::ADDXri:
+  case AArch64::ADDWri:
+    // add x, 1 -> csinc.
+    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
+        DefMI->getOperand(3).getImm() != 0)
+      return 0;
+    SrcOpNum = 1;
+    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+    break;
+
+  case AArch64::ORNXrr:
+  case AArch64::ORNWrr: {
+    // not x -> csinv, represented as orn dst, xzr, src.
+    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+      return 0;
+    SrcOpNum = 2;
+    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
+    break;
+  }
+
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSWrr:
+    // if NZCV is used, do not fold.
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return 0;
+  // fall-through to SUBXrr and SUBWrr.
+  case AArch64::SUBXrr:
+  case AArch64::SUBWrr: {
+    // neg x -> csneg, represented as sub dst, xzr, src.
+    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+      return 0;
+    SrcOpNum = 2;
+    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
+    break;
+  }
+  default:
+    return 0;
+  }
+  assert(Opc && SrcOpNum && "Missing parameters");
+
+  if (NewVReg)
+    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+  return Opc;
+}
+
+bool AArch64InstrInfo::canInsertSelect(
+    const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond,
+    unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
+    int &FalseCycles) const {
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
+  unsigned ExtraCondLat = Cond.size() != 1;
+
+  // GPRs are handled by csel.
+  // FIXME: Fold in x+1, -x, and ~x when applicable.
+  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
+      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+    // Single-cycle csel, csinc, csinv, and csneg.
+    CondCycles = 1 + ExtraCondLat;
+    TrueCycles = FalseCycles = 1;
+    if (canFoldIntoCSel(MRI, TrueReg))
+      TrueCycles = 0;
+    else if (canFoldIntoCSel(MRI, FalseReg))
+      FalseCycles = 0;
+    return true;
+  }
+
+  // Scalar floating point is handled by fcsel.
+  // FIXME: Form fabs, fmin, and fmax when applicable.
+  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
+      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
+    CondCycles = 5 + ExtraCondLat;
+    TrueCycles = FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I, DebugLoc DL,
+                                    unsigned DstReg,
+                                    const SmallVectorImpl<MachineOperand> &Cond,
+                                    unsigned TrueReg, unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Parse the condition code, see parseCondBranch() above.
+  AArch64CC::CondCode CC;
+  switch (Cond.size()) {
+  default:
+    llvm_unreachable("Unknown condition opcode in Cond");
+  case 1: // b.cc
+    CC = AArch64CC::CondCode(Cond[0].getImm());
+    break;
+  case 3: { // cbz/cbnz
+    // We must insert a compare against 0.
+    bool Is64Bit;
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown branch opcode in Cond");
+    case AArch64::CBZW:
+      Is64Bit = 0;
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::CBZX:
+      Is64Bit = 1;
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::CBNZW:
+      Is64Bit = 0;
+      CC = AArch64CC::NE;
+      break;
+    case AArch64::CBNZX:
+      Is64Bit = 1;
+      CC = AArch64CC::NE;
+      break;
+    }
+    unsigned SrcReg = Cond[2].getReg();
+    if (Is64Bit) {
+      // cmp reg, #0 is actually subs xzr, reg, #0.
+      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
+      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
+          .addReg(SrcReg)
+          .addImm(0)
+          .addImm(0);
+    } else {
+      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
+      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
+          .addReg(SrcReg)
+          .addImm(0)
+          .addImm(0);
+    }
+    break;
+  }
+  case 4: { // tbz/tbnz
+    // We must insert a tst instruction.
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown branch opcode in Cond");
+    case AArch64::TBZW:
+    case AArch64::TBZX:
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::TBNZW:
+    case AArch64::TBNZX:
+      CC = AArch64CC::NE;
+      break;
+    }
+    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
+    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
+      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
+          .addReg(Cond[2].getReg())
+          .addImm(
+              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
+    else
+      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
+          .addReg(Cond[2].getReg())
+          .addImm(
+              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
+    break;
+  }
+  }
+
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  bool TryFold = false;
+  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
+    RC = &AArch64::GPR64RegClass;
+    Opc = AArch64::CSELXr;
+    TryFold = true;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
+    RC = &AArch64::GPR32RegClass;
+    Opc = AArch64::CSELWr;
+    TryFold = true;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FCSELDrrr;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
+    RC = &AArch64::FPR32RegClass;
+    Opc = AArch64::FCSELSrrr;
+  }
+  assert(RC && "Unsupported regclass");
+
+  // Try folding simple instructions into the csel.
+  if (TryFold) {
+    unsigned NewVReg = 0;
+    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+    if (FoldedOpc) {
+      // The folded opcodes csinc, csinc and csneg apply the operation to
+      // FalseReg, so we need to invert the condition.
+      CC = AArch64CC::getInvertedCondCode(CC);
+      TrueReg = FalseReg;
+    } else
+      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+
+    // Fold the operation. Leave any dead instructions for DCE to clean up.
+    if (FoldedOpc) {
+      FalseReg = NewVReg;
+      Opc = FoldedOpc;
+      // The extends the live range of NewVReg.
+      MRI.clearKillFlags(NewVReg);
+    }
+  }
+
+  // Pull all virtual register into the appropriate class.
+  MRI.constrainRegClass(TrueReg, RC);
+  MRI.constrainRegClass(FalseReg, RC);
+
+  // Insert the csel.
+  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
+      CC);
+}
+
+bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                             unsigned &SrcReg, unsigned &DstReg,
+                                             unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::SBFMXri: // aka sxtw
+  case AArch64::UBFMXri: // aka uxtw
+    // Check for the 32 -> 64 bit extension case, these instructions can do
+    // much more.
+    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
+      return false;
+    // This is a signed or unsigned 32 -> 64 bit extension.
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SubIdx = AArch64::sub_32;
+    return true;
+  }
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+/// Return true if the comparison instruction can be analyzed.
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &CmpMask,
+                                      int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBSXrx:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDSXrx:
+    // Replace SUBSWrr with SUBWrr if NZCV is not used.
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case AArch64::SUBSWri:
+  case AArch64::ADDSWri:
+  case AArch64::SUBSXri:
+  case AArch64::ADDSXri:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(2).getImm();
+    return true;
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+    // ANDS does not use the same encoding scheme as the others xxxS
+    // instructions.
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = AArch64_AM::decodeLogicalImmediate(
+        MI->getOperand(2).getImm(),
+        MI->getOpcode() == AArch64::ANDSWri ? 32 : 64);
+    return true;
+  }
+
+  return false;
+}
+
+static bool UpdateOperandRegClass(MachineInstr *Instr) {
+  MachineBasicBlock *MBB = Instr->getParent();
+  assert(MBB && "Can't get MachineBasicBlock here");
+  MachineFunction *MF = MBB->getParent();
+  assert(MF && "Can't get MachineFunction here");
+  const TargetMachine *TM = &MF->getTarget();
+  const TargetInstrInfo *TII = TM->getInstrInfo();
+  const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+  for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
+       ++OpIdx) {
+    MachineOperand &MO = Instr->getOperand(OpIdx);
+    const TargetRegisterClass *OpRegCstraints =
+        Instr->getRegClassConstraint(OpIdx, TII, TRI);
+
+    // If there's no constraint, there's nothing to do.
+    if (!OpRegCstraints)
+      continue;
+    // If the operand is a frame index, there's nothing to do here.
+    // A frame index operand will resolve correctly during PEI.
+    if (MO.isFI())
+      continue;
+
+    assert(MO.isReg() &&
+           "Operand has register constraints without being a register!");
+
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (!OpRegCstraints->contains(Reg))
+        return false;
+    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
+               !MRI->constrainRegClass(Reg, OpRegCstraints))
+      return false;
+  }
+
+  return true;
+}
+
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register.
+bool AArch64InstrInfo::optimizeCompareInstr(
+    MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+    int CmpValue, const MachineRegisterInfo *MRI) const {
+
+  // Replace SUBSWrr with SUBWrr if NZCV is not used.
+  int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
+  if (Cmp_NZCV != -1) {
+    unsigned NewOpc;
+    switch (CmpInstr->getOpcode()) {
+    default:
+      return false;
+    case AArch64::ADDSWrr:      NewOpc = AArch64::ADDWrr; break;
+    case AArch64::ADDSWri:      NewOpc = AArch64::ADDWri; break;
+    case AArch64::ADDSWrs:      NewOpc = AArch64::ADDWrs; break;
+    case AArch64::ADDSWrx:      NewOpc = AArch64::ADDWrx; break;
+    case AArch64::ADDSXrr:      NewOpc = AArch64::ADDXrr; break;
+    case AArch64::ADDSXri:      NewOpc = AArch64::ADDXri; break;
+    case AArch64::ADDSXrs:      NewOpc = AArch64::ADDXrs; break;
+    case AArch64::ADDSXrx:      NewOpc = AArch64::ADDXrx; break;
+    case AArch64::SUBSWrr:      NewOpc = AArch64::SUBWrr; break;
+    case AArch64::SUBSWri:      NewOpc = AArch64::SUBWri; break;
+    case AArch64::SUBSWrs:      NewOpc = AArch64::SUBWrs; break;
+    case AArch64::SUBSWrx:      NewOpc = AArch64::SUBWrx; break;
+    case AArch64::SUBSXrr:      NewOpc = AArch64::SUBXrr; break;
+    case AArch64::SUBSXri:      NewOpc = AArch64::SUBXri; break;
+    case AArch64::SUBSXrs:      NewOpc = AArch64::SUBXrs; break;
+    case AArch64::SUBSXrx:      NewOpc = AArch64::SUBXrx; break;
+    }
+
+    const MCInstrDesc &MCID = get(NewOpc);
+    CmpInstr->setDesc(MCID);
+    CmpInstr->RemoveOperand(Cmp_NZCV);
+    bool succeeded = UpdateOperandRegClass(CmpInstr);
+    (void)succeeded;
+    assert(succeeded && "Some operands reg class are incompatible!");
+    return true;
+  }
+
+  // Continue only if we have a "ri" where immediate is zero.
+  if (CmpValue != 0 || SrcReg2 != 0)
+    return false;
+
+  // CmpInstr is a Compare instruction if destination register is not used.
+  if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+    return false;
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI)
+    return false;
+
+  // We iterate backward, starting from the instruction before CmpInstr and
+  // stop when reaching the definition of the source register or done with the
+  // basic block, to check whether NZCV is used or modified in between.
+  MachineBasicBlock::iterator I = CmpInstr, E = MI,
+                              B = CmpInstr->getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B)
+    return false;
+
+  // Check whether the definition of SrcReg is in the same basic block as
+  // Compare. If not, we can't optimize away the Compare.
+  if (MI->getParent() != CmpInstr->getParent())
+    return false;
+
+  // Check that NZCV isn't set between the comparison instruction and the one we
+  // want to change.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  for (--I; I != E; --I) {
+    const MachineInstr &Instr = *I;
+
+    if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
+        Instr.readsRegister(AArch64::NZCV, TRI))
+      // This instruction modifies or uses NZCV after the one we want to
+      // change. We can't do this transformation.
+      return false;
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  unsigned NewOpc = MI->getOpcode();
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXri:
+    break;
+  case AArch64::ADDWrr:    NewOpc = AArch64::ADDSWrr; break;
+  case AArch64::ADDWri:    NewOpc = AArch64::ADDSWri; break;
+  case AArch64::ADDXrr:    NewOpc = AArch64::ADDSXrr; break;
+  case AArch64::ADDXri:    NewOpc = AArch64::ADDSXri; break;
+  case AArch64::ADCWr:     NewOpc = AArch64::ADCSWr; break;
+  case AArch64::ADCXr:     NewOpc = AArch64::ADCSXr; break;
+  case AArch64::SUBWrr:    NewOpc = AArch64::SUBSWrr; break;
+  case AArch64::SUBWri:    NewOpc = AArch64::SUBSWri; break;
+  case AArch64::SUBXrr:    NewOpc = AArch64::SUBSXrr; break;
+  case AArch64::SUBXri:    NewOpc = AArch64::SUBSXri; break;
+  case AArch64::SBCWr:     NewOpc = AArch64::SBCSWr; break;
+  case AArch64::SBCXr:     NewOpc = AArch64::SBCSXr; break;
+  case AArch64::ANDWri:    NewOpc = AArch64::ANDSWri; break;
+  case AArch64::ANDXri:    NewOpc = AArch64::ANDSXri; break;
+  }
+
+  // Scan forward for the use of NZCV.
+  // When checking against MI: if it's a conditional code requires
+  // checking of V bit, then this is not safe to do.
+  // It is safe to remove CmpInstr if NZCV is redefined or killed.
+  // If we are done with the basic block, we need to check whether NZCV is
+  // live-out.
+  bool IsSafe = false;
+  for (MachineBasicBlock::iterator I = CmpInstr,
+                                   E = CmpInstr->getParent()->end();
+       !IsSafe && ++I != E;) {
+    const MachineInstr &Instr = *I;
+    for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
+         ++IO) {
+      const MachineOperand &MO = Instr.getOperand(IO);
+      if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) {
+        IsSafe = true;
+        break;
+      }
+      if (!MO.isReg() || MO.getReg() != AArch64::NZCV)
+        continue;
+      if (MO.isDef()) {
+        IsSafe = true;
+        break;
+      }
+
+      // Decode the condition code.
+      unsigned Opc = Instr.getOpcode();
+      AArch64CC::CondCode CC;
+      switch (Opc) {
+      default:
+        return false;
+      case AArch64::Bcc:
+        CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm();
+        break;
+      case AArch64::CSINVWr:
+      case AArch64::CSINVXr:
+      case AArch64::CSINCWr:
+      case AArch64::CSINCXr:
+      case AArch64::CSELWr:
+      case AArch64::CSELXr:
+      case AArch64::CSNEGWr:
+      case AArch64::CSNEGXr:
+      case AArch64::FCSELSrrr:
+      case AArch64::FCSELDrrr:
+        CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm();
+        break;
+      }
+
+      // It is not safe to remove Compare instruction if Overflow(V) is used.
+      switch (CC) {
+      default:
+        // NZCV can be used multiple times, we should continue.
+        break;
+      case AArch64CC::VS:
+      case AArch64CC::VC:
+      case AArch64CC::GE:
+      case AArch64CC::LT:
+      case AArch64CC::GT:
+      case AArch64CC::LE:
+        return false;
+      }
+    }
+  }
+
+  // If NZCV is not killed nor re-defined, we should check whether it is
+  // live-out. If it is live-out, do not optimize.
+  if (!IsSafe) {
+    MachineBasicBlock *ParentBlock = CmpInstr->getParent();
+    for (auto *MBB : ParentBlock->successors())
+      if (MBB->isLiveIn(AArch64::NZCV))
+        return false;
+  }
+
+  // Update the instruction to set NZCV.
+  MI->setDesc(get(NewOpc));
+  CmpInstr->eraseFromParent();
+  bool succeeded = UpdateOperandRegClass(MI);
+  (void)succeeded;
+  assert(succeeded && "Some operands reg class are incompatible!");
+  MI->addRegisterDefined(AArch64::NZCV, TRI);
+  return true;
+}
+
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::ANDSWrs:
+  case AArch64::ANDSXrs:
+  case AArch64::ANDWrs:
+  case AArch64::ANDXrs:
+  case AArch64::BICSWrs:
+  case AArch64::BICSXrs:
+  case AArch64::BICWrs:
+  case AArch64::BICXrs:
+  case AArch64::CRC32Brr:
+  case AArch64::CRC32CBrr:
+  case AArch64::CRC32CHrr:
+  case AArch64::CRC32CWrr:
+  case AArch64::CRC32CXrr:
+  case AArch64::CRC32Hrr:
+  case AArch64::CRC32Wrr:
+  case AArch64::CRC32Xrr:
+  case AArch64::EONWrs:
+  case AArch64::EONXrs:
+  case AArch64::EORWrs:
+  case AArch64::EORXrs:
+  case AArch64::ORNWrs:
+  case AArch64::ORNXrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+  return false;
+}
+
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrx:
+  case AArch64::ADDSXrx64:
+  case AArch64::ADDWrx:
+  case AArch64::ADDXrx:
+  case AArch64::ADDXrx64:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrx:
+  case AArch64::SUBSXrx64:
+  case AArch64::SUBWrx:
+  case AArch64::SUBXrx:
+  case AArch64::SUBXrx64:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+
+  return false;
+}
+
+// Return true if this instruction simply sets its single destination register
+// to zero. This is equivalent to a register rename of the zero-register.
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::MOVZWi:
+  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
+    if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
+      assert(MI->getDesc().getNumOperands() == 3 &&
+             MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+      return true;
+    }
+    break;
+  case AArch64::ANDWri: // and Rd, Rzr, #imm
+    return MI->getOperand(1).getReg() == AArch64::WZR;
+  case AArch64::ANDXri:
+    return MI->getOperand(1).getReg() == AArch64::XZR;
+  case TargetOpcode::COPY:
+    return MI->getOperand(1).getReg() == AArch64::WZR;
+  }
+  return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // GPR32 copies will by lowered to ORRXrs
+    unsigned DstReg = MI->getOperand(0).getReg();
+    return (AArch64::GPR32RegClass.contains(DstReg) ||
+            AArch64::GPR64RegClass.contains(DstReg));
+  }
+  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+    if (MI->getOperand(1).getReg() == AArch64::XZR) {
+      assert(MI->getDesc().getNumOperands() == 4 &&
+             MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+      return true;
+    }
+  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+    if (MI->getOperand(2).getImm() == 0) {
+      assert(MI->getDesc().getNumOperands() == 4 &&
+             MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+      return true;
+    }
+  }
+  return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // FPR64 copies will by lowered to ORR.16b
+    unsigned DstReg = MI->getOperand(0).getReg();
+    return (AArch64::FPR64RegClass.contains(DstReg) ||
+            AArch64::FPR128RegClass.contains(DstReg));
+  }
+  case AArch64::ORRv16i8:
+    if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+      assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+             "invalid ORRv16i8 operands");
+      return true;
+    }
+  }
+  return false;
+}
+
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                               int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// Return true if this is load/store scales or extends its register offset.
+/// This refers to scaling a dynamic index as opposed to scaled immediates.
+/// MI should be a memory op that allows scaled addressing.
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRBBroW:
+  case AArch64::LDRBroW:
+  case AArch64::LDRDroW:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRHroW:
+  case AArch64::LDRQroW:
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSWroW:
+  case AArch64::LDRSroW:
+  case AArch64::LDRWroW:
+  case AArch64::LDRXroW:
+  case AArch64::STRBBroW:
+  case AArch64::STRBroW:
+  case AArch64::STRDroW:
+  case AArch64::STRHHroW:
+  case AArch64::STRHroW:
+  case AArch64::STRQroW:
+  case AArch64::STRSroW:
+  case AArch64::STRWroW:
+  case AArch64::STRXroW:
+  case AArch64::LDRBBroX:
+  case AArch64::LDRBroX:
+  case AArch64::LDRDroX:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRHroX:
+  case AArch64::LDRQroX:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRSroX:
+  case AArch64::LDRWroX:
+  case AArch64::LDRXroX:
+  case AArch64::STRBBroX:
+  case AArch64::STRBroX:
+  case AArch64::STRDroX:
+  case AArch64::STRHHroX:
+  case AArch64::STRHroX:
+  case AArch64::STRQroX:
+  case AArch64::STRSroX:
+  case AArch64::STRWroX:
+  case AArch64::STRXroX:
+
+    unsigned Val = MI->getOperand(3).getImm();
+    AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
+    return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
+  }
+  return false;
+}
+
+/// Check all MachineMemOperands for a hint to suppress pairing.
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
+  assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+         "Too many target MO flags");
+  for (auto *MM : MI->memoperands()) {
+    if (MM->getFlags() &
+        (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Set a flag on the first MachineMemOperand to suppress pairing.
+void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
+  if (MI->memoperands_empty())
+    return;
+
+  assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+         "Too many target MO flags");
+  (*MI->memoperands_begin())
+      ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
+}
+
+bool
+AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+                                       unsigned &Offset,
+                                       const TargetRegisterInfo *TRI) const {
+  switch (LdSt->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+  case AArch64::STRXui:
+  case AArch64::STRWui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+  case AArch64::LDRXui:
+  case AArch64::LDRWui:
+    if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+      return false;
+    BaseReg = LdSt->getOperand(1).getReg();
+    MachineFunction &MF = *LdSt->getParent()->getParent();
+    unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
+    Offset = LdSt->getOperand(2).getImm() * Width;
+    return true;
+  };
+}
+
+/// Detect opportunities for ldp/stp formation.
+///
+/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
+bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
+                                          MachineInstr *SecondLdSt,
+                                          unsigned NumLoads) const {
+  // Only cluster up to a single pair.
+  if (NumLoads > 1)
+    return false;
+  if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+    return false;
+  // getLdStBaseRegImmOfs guarantees that oper 2 isImm.
+  unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
+  // Allow 6 bits of positive range.
+  if (Ofs1 > 64)
+    return false;
+  // The caller should already have ordered First/SecondLdSt by offset.
+  unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
+  return Ofs1 + 1 == Ofs2;
+}
+
+bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
+                                              MachineInstr *Second) const {
+  // Cyclone can fuse CMN, CMP followed by Bcc.
+
+  // FIXME: B0 can also fuse:
+  // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
+  if (Second->getOpcode() != AArch64::Bcc)
+    return false;
+  switch (First->getOpcode()) {
+  default:
+    return false;
+  case AArch64::SUBSWri:
+  case AArch64::ADDSWri:
+  case AArch64::ANDSWri:
+  case AArch64::SUBSXri:
+  case AArch64::ADDSXri:
+  case AArch64::ANDSXri:
+    return true;
+  }
+}
+
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                                         int FrameIx,
+                                                         uint64_t Offset,
+                                                         const MDNode *MDPtr,
+                                                         DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
+                                .addFrameIndex(FrameIx)
+                                .addImm(0)
+                                .addImm(Offset)
+                                .addMetadata(MDPtr);
+  return &*MIB;
+}
+
+static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg, unsigned SubIdx,
+                                            unsigned State,
+                                            const TargetRegisterInfo *TRI) {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
+
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
+
+static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+                                        unsigned NumRegs) {
+  // We really want the positive remainder mod 32 here, that happens to be
+  // easily obtainable with a mask.
+  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
+}
+
+void AArch64InstrInfo::copyPhysRegTuple(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+    unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
+    llvm::ArrayRef<unsigned> Indices) const {
+  assert(Subtarget.hasNEON() &&
+         "Unexpected register copy without NEON");
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+  unsigned NumRegs = Indices.size();
+
+  int SubReg = 0, End = NumRegs, Incr = 1;
+  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
+    SubReg = NumRegs - 1;
+    End = -1;
+    Incr = -1;
+  }
+
+  for (; SubReg != End; SubReg += Incr) {
+    const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode));
+    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
+    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
+    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
+  }
+}
+
+void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I, DebugLoc DL,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   bool KillSrc) const {
+  if (AArch64::GPR32spRegClass.contains(DestReg) &&
+      (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
+      // If either operand is WSP, expand to ADD #0.
+      if (Subtarget.hasZeroCycleRegMove()) {
+        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
+        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                    &AArch64::GPR64spRegClass);
+        // This instruction is reading and writing X registers.  This may upset
+        // the register scavenger and machine verifier, so we need to indicate
+        // that we are reading an undefined value from SrcRegX, but a proper
+        // value from SrcReg.
+        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
+            .addReg(SrcRegX, RegState::Undef)
+            .addImm(0)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      } else {
+        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc))
+            .addImm(0)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+      }
+    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else {
+      if (Subtarget.hasZeroCycleRegMove()) {
+        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
+        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                    &AArch64::GPR64spRegClass);
+        // This instruction is reading and writing X registers.  This may upset
+        // the register scavenger and machine verifier, so we need to indicate
+        // that we are reading an undefined value from SrcRegX, but a proper
+        // value from SrcReg.
+        BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
+            .addReg(AArch64::XZR)
+            .addReg(SrcRegX, RegState::Undef)
+            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      } else {
+        // Otherwise, expand to ORR WZR.
+        BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
+            .addReg(AArch64::WZR)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+      }
+    }
+    return;
+  }
+
+  if (AArch64::GPR64spRegClass.contains(DestReg) &&
+      (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
+    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
+      // If either operand is SP, expand to ADD #0.
+      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else {
+      // Otherwise, expand to ORR XZR.
+      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+          .addReg(AArch64::XZR)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  // Copy a DDDD register quad by copying the individual sub-registers.
+  if (AArch64::DDDDRegClass.contains(DestReg) &&
+      AArch64::DDDDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                        AArch64::dsub2, AArch64::dsub3 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a DDD register triple by copying the individual sub-registers.
+  if (AArch64::DDDRegClass.contains(DestReg) &&
+      AArch64::DDDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                        AArch64::dsub2 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a DD register pair by copying the individual sub-registers.
+  if (AArch64::DDRegClass.contains(DestReg) &&
+      AArch64::DDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQQQ register quad by copying the individual sub-registers.
+  if (AArch64::QQQQRegClass.contains(DestReg) &&
+      AArch64::QQQQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                        AArch64::qsub2, AArch64::qsub3 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQQ register triple by copying the individual sub-registers.
+  if (AArch64::QQQRegClass.contains(DestReg) &&
+      AArch64::QQQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                        AArch64::qsub2 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQ register pair by copying the individual sub-registers.
+  if (AArch64::QQRegClass.contains(DestReg) &&
+      AArch64::QQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  if (AArch64::FPR128RegClass.contains(DestReg) &&
+      AArch64::FPR128RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .addReg(AArch64::SP)
+        .addImm(-16);
+      BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(DestReg, RegState::Define)
+        .addReg(AArch64::SP)
+        .addImm(16);
+    }
+    return;
+  }
+
+  if (AArch64::FPR64RegClass.contains(DestReg) &&
+      AArch64::FPR64RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  if (AArch64::FPR32RegClass.contains(DestReg) &&
+      AArch64::FPR32RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  if (AArch64::FPR16RegClass.contains(DestReg) &&
+      AArch64::FPR16RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  if (AArch64::FPR8RegClass.contains(DestReg) &&
+      AArch64::FPR8RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  // Copies between GPR64 and FPR64.
+  if (AArch64::FPR64RegClass.contains(DestReg) &&
+      AArch64::GPR64RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (AArch64::GPR64RegClass.contains(DestReg) &&
+      AArch64::FPR64RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  // Copies between GPR32 and FPR32.
+  if (AArch64::FPR32RegClass.contains(DestReg) &&
+      AArch64::GPR32RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (AArch64::GPR32RegClass.contains(DestReg) &&
+      AArch64::FPR32RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (DestReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
+    BuildMI(MBB, I, DL, get(AArch64::MSR))
+      .addImm(AArch64SysReg::NZCV)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
+    return;
+  }
+
+  if (SrcReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
+    BuildMI(MBB, I, DL, get(AArch64::MRS))
+      .addReg(DestReg)
+      .addImm(AArch64SysReg::NZCV)
+      .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
+    return;
+  }
+
+  llvm_unreachable("unimplemented reg-to-reg copy");
+}
+
+void AArch64InstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+    bool isKill, int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+  unsigned Opc = 0;
+  bool Offset = true;
+  switch (RC->getSize()) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRHui;
+    break;
+  case 4:
+    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::STRWui;
+      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+      else
+        assert(SrcReg != AArch64::WSP);
+    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRSui;
+    break;
+  case 8:
+    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::STRXui;
+      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+      else
+        assert(SrcReg != AArch64::SP);
+    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRQui;
+    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Twov1d, Offset = false;
+    }
+    break;
+  case 24:
+    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Threev1d, Offset = false;
+    }
+    break;
+  case 32:
+    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Fourv1d, Offset = false;
+    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Twov2d, Offset = false;
+    }
+    break;
+  case 48:
+    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Threev2d, Offset = false;
+    }
+    break;
+  case 64:
+    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Fourv2d, Offset = false;
+    }
+    break;
+  }
+  assert(Opc && "Unknown register class");
+
+  const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+                                      .addReg(SrcReg, getKillRegState(isKill))
+                                      .addFrameIndex(FI);
+
+  if (Offset)
+    MI.addImm(0);
+  MI.addMemOperand(MMO);
+}
+
+void AArch64InstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+    int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+
+  unsigned Opc = 0;
+  bool Offset = true;
+  switch (RC->getSize()) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRHui;
+    break;
+  case 4:
+    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::LDRWui;
+      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
+      else
+        assert(DestReg != AArch64::WSP);
+    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRSui;
+    break;
+  case 8:
+    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::LDRXui;
+      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
+      else
+        assert(DestReg != AArch64::SP);
+    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRQui;
+    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Twov1d, Offset = false;
+    }
+    break;
+  case 24:
+    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Threev1d, Offset = false;
+    }
+    break;
+  case 32:
+    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Fourv1d, Offset = false;
+    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Twov2d, Offset = false;
+    }
+    break;
+  case 48:
+    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Threev2d, Offset = false;
+    }
+    break;
+  case 64:
+    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Fourv2d, Offset = false;
+    }
+    break;
+  }
+  assert(Opc && "Unknown register class");
+
+  const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+                                      .addReg(DestReg, getDefRegState(true))
+                                      .addFrameIndex(FI);
+  if (Offset)
+    MI.addImm(0);
+  MI.addMemOperand(MMO);
+}
+
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg, int Offset,
+                           const TargetInstrInfo *TII,
+                           MachineInstr::MIFlag Flag, bool SetNZCV) {
+  if (DestReg == SrcReg && Offset == 0)
+    return;
+
+  bool isSub = Offset < 0;
+  if (isSub)
+    Offset = -Offset;
+
+  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
+  // scratch register.  If DestReg is a virtual register, use it as the
+  // scratch register; otherwise, create a new virtual register (to be
+  // replaced by the scavenger at the end of PEI).  That case can be optimized
+  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
+  // register can be loaded with offset%8 and the add/sub can use an extending
+  // instruction with LSL#3.
+  // Currently the function handles any offsets but generates a poor sequence
+  // of code.
+  //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
+
+  unsigned Opc;
+  if (SetNZCV)
+    Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
+  else
+    Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
+  const unsigned MaxEncoding = 0xfff;
+  const unsigned ShiftSize = 12;
+  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+  while (((unsigned)Offset) >= (1 << ShiftSize)) {
+    unsigned ThisVal;
+    if (((unsigned)Offset) > MaxEncodableValue) {
+      ThisVal = MaxEncodableValue;
+    } else {
+      ThisVal = Offset & MaxEncodableValue;
+    }
+    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
+           "Encoding cannot handle value that big");
+    BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+        .addReg(SrcReg)
+        .addImm(ThisVal >> ShiftSize)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
+        .setMIFlag(Flag);
+
+    SrcReg = DestReg;
+    Offset -= ThisVal;
+    if (Offset == 0)
+      return;
+  }
+  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+      .addReg(SrcReg)
+      .addImm(Offset)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlag(Flag);
+}
+
+MachineInstr *
+AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        int FrameIndex) const {
+  // This is a bit of a hack. Consider this instruction:
+  //
+  //   %vreg0<def> = COPY %SP; GPR64all:%vreg0
+  //
+  // We explicitly chose GPR64all for the virtual register so such a copy might
+  // be eliminated by RegisterCoalescer. However, that may not be possible, and
+  // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
+  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
+  //
+  // To prevent that, we are going to constrain the %vreg0 register class here.
+  //
+  // <rdar://problem/11522048>
+  //
+  if (MI->isCopy()) {
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    if (SrcReg == AArch64::SP &&
+        TargetRegisterInfo::isVirtualRegister(DstReg)) {
+      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
+      return nullptr;
+    }
+    if (DstReg == AArch64::SP &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+      return nullptr;
+    }
+  }
+
+  // Cannot fold.
+  return nullptr;
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+                                    bool *OutUseUnscaledOp,
+                                    unsigned *OutUnscaledOp,
+                                    int *EmittableOffset) {
+  int Scale = 1;
+  bool IsSigned = false;
+  // The ImmIdx should be changed case by case if it is not 2.
+  unsigned ImmIdx = 2;
+  unsigned UnscaledOp = 0;
+  // Set output values in case of early exit.
+  if (EmittableOffset)
+    *EmittableOffset = 0;
+  if (OutUseUnscaledOp)
+    *OutUseUnscaledOp = false;
+  if (OutUnscaledOp)
+    *OutUnscaledOp = 0;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex");
+  // Vector spills/fills can't take an immediate offset.
+  case AArch64::LD1Twov2d:
+  case AArch64::LD1Threev2d:
+  case AArch64::LD1Fourv2d:
+  case AArch64::LD1Twov1d:
+  case AArch64::LD1Threev1d:
+  case AArch64::LD1Fourv1d:
+  case AArch64::ST1Twov2d:
+  case AArch64::ST1Threev2d:
+  case AArch64::ST1Fourv2d:
+  case AArch64::ST1Twov1d:
+  case AArch64::ST1Threev1d:
+  case AArch64::ST1Fourv1d:
+    return AArch64FrameOffsetCannotUpdate;
+  case AArch64::PRFMui:
+    Scale = 8;
+    UnscaledOp = AArch64::PRFUMi;
+    break;
+  case AArch64::LDRXui:
+    Scale = 8;
+    UnscaledOp = AArch64::LDURXi;
+    break;
+  case AArch64::LDRWui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURWi;
+    break;
+  case AArch64::LDRBui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURBi;
+    break;
+  case AArch64::LDRHui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURHi;
+    break;
+  case AArch64::LDRSui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURSi;
+    break;
+  case AArch64::LDRDui:
+    Scale = 8;
+    UnscaledOp = AArch64::LDURDi;
+    break;
+  case AArch64::LDRQui:
+    Scale = 16;
+    UnscaledOp = AArch64::LDURQi;
+    break;
+  case AArch64::LDRBBui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURBBi;
+    break;
+  case AArch64::LDRHHui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURHHi;
+    break;
+  case AArch64::LDRSBXui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURSBXi;
+    break;
+  case AArch64::LDRSBWui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURSBWi;
+    break;
+  case AArch64::LDRSHXui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURSHXi;
+    break;
+  case AArch64::LDRSHWui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURSHWi;
+    break;
+  case AArch64::LDRSWui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURSWi;
+    break;
+
+  case AArch64::STRXui:
+    Scale = 8;
+    UnscaledOp = AArch64::STURXi;
+    break;
+  case AArch64::STRWui:
+    Scale = 4;
+    UnscaledOp = AArch64::STURWi;
+    break;
+  case AArch64::STRBui:
+    Scale = 1;
+    UnscaledOp = AArch64::STURBi;
+    break;
+  case AArch64::STRHui:
+    Scale = 2;
+    UnscaledOp = AArch64::STURHi;
+    break;
+  case AArch64::STRSui:
+    Scale = 4;
+    UnscaledOp = AArch64::STURSi;
+    break;
+  case AArch64::STRDui:
+    Scale = 8;
+    UnscaledOp = AArch64::STURDi;
+    break;
+  case AArch64::STRQui:
+    Scale = 16;
+    UnscaledOp = AArch64::STURQi;
+    break;
+  case AArch64::STRBBui:
+    Scale = 1;
+    UnscaledOp = AArch64::STURBBi;
+    break;
+  case AArch64::STRHHui:
+    Scale = 2;
+    UnscaledOp = AArch64::STURHHi;
+    break;
+
+  case AArch64::LDPXi:
+  case AArch64::LDPDi:
+  case AArch64::STPXi:
+  case AArch64::STPDi:
+    IsSigned = true;
+    Scale = 8;
+    break;
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+    IsSigned = true;
+    Scale = 16;
+    break;
+  case AArch64::LDPWi:
+  case AArch64::LDPSi:
+  case AArch64::STPWi:
+  case AArch64::STPSi:
+    IsSigned = true;
+    Scale = 4;
+    break;
+
+  case AArch64::LDURXi:
+  case AArch64::LDURWi:
+  case AArch64::LDURBi:
+  case AArch64::LDURHi:
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::STURWi:
+  case AArch64::STURBi:
+  case AArch64::STURHi:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURBBi:
+  case AArch64::STURHHi:
+    Scale = 1;
+    break;
+  }
+
+  Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+
+  bool useUnscaledOp = false;
+  // If the offset doesn't match the scale, we rewrite the instruction to
+  // use the unscaled instruction instead. Likewise, if we have a negative
+  // offset (and have an unscaled op to use).
+  if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
+    useUnscaledOp = true;
+
+  // Use an unscaled addressing mode if the instruction has a negative offset
+  // (or if the instruction is already using an unscaled addressing mode).
+  unsigned MaskBits;
+  if (IsSigned) {
+    // ldp/stp instructions.
+    MaskBits = 7;
+    Offset /= Scale;
+  } else if (UnscaledOp == 0 || useUnscaledOp) {
+    MaskBits = 9;
+    IsSigned = true;
+    Scale = 1;
+  } else {
+    MaskBits = 12;
+    IsSigned = false;
+    Offset /= Scale;
+  }
+
+  // Attempt to fold address computation.
+  int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
+  int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
+  if (Offset >= MinOff && Offset <= MaxOff) {
+    if (EmittableOffset)
+      *EmittableOffset = Offset;
+    Offset = 0;
+  } else {
+    int NewOff = Offset < 0 ? MinOff : MaxOff;
+    if (EmittableOffset)
+      *EmittableOffset = NewOff;
+    Offset = (Offset - NewOff) * Scale;
+  }
+  if (OutUseUnscaledOp)
+    *OutUseUnscaledOp = useUnscaledOp;
+  if (OutUnscaledOp)
+    *OutUnscaledOp = UnscaledOp;
+  return AArch64FrameOffsetCanUpdate |
+         (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+}
+
+bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                    unsigned FrameReg, int &Offset,
+                                    const AArch64InstrInfo *TII) {
+  unsigned Opcode = MI.getOpcode();
+  unsigned ImmIdx = FrameRegIdx + 1;
+
+  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
+    Offset += MI.getOperand(ImmIdx).getImm();
+    emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
+                    MI.getOperand(0).getReg(), FrameReg, Offset, TII,
+                    MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
+    MI.eraseFromParent();
+    Offset = 0;
+    return true;
+  }
+
+  int NewOffset;
+  unsigned UnscaledOp;
+  bool UseUnscaledOp;
+  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
+                                         &UnscaledOp, &NewOffset);
+  if (Status & AArch64FrameOffsetCanUpdate) {
+    if (Status & AArch64FrameOffsetIsLegal)
+      // Replace the FrameIndex with FrameReg.
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+    if (UseUnscaledOp)
+      MI.setDesc(TII->get(UnscaledOp));
+
+    MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
+    return Offset == 0;
+  }
+
+  return false;
+}
+
+void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(AArch64::HINT);
+  NopInst.addOperand(MCOperand::CreateImm(0));
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
new file mode 100644
index 0000000..f70b82b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -0,0 +1,229 @@
+//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64INSTRINFO_H
+#define LLVM_TARGET_AArch64INSTRINFO_H
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AArch64GenInstrInfo.inc"
+
+namespace llvm {
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64InstrInfo : public AArch64GenInstrInfo {
+  // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
+  // They will be shifted into MOTargetHintStart when accessed.
+  enum TargetMemOperandFlags {
+    MOSuppressPair = 1
+  };
+
+  const AArch64RegisterInfo RI;
+  const AArch64Subtarget &Subtarget;
+
+public:
+  explicit AArch64InstrInfo(const AArch64Subtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
+
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+                             unsigned &DstReg, unsigned &SubIdx) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  /// Returns true if there is a shiftable register and that the shift value
+  /// is non-zero.
+  bool hasShiftedReg(const MachineInstr *MI) const;
+
+  /// Returns true if there is an extendable register and that the extending
+  /// value is non-zero.
+  bool hasExtendedReg(const MachineInstr *MI) const;
+
+  /// \brief Does this instruction set its full destination register to zero?
+  bool isGPRZero(const MachineInstr *MI) const;
+
+  /// \brief Does this instruction rename a GPR without modifying bits?
+  bool isGPRCopy(const MachineInstr *MI) const;
+
+  /// \brief Does this instruction rename an FPR without modifying bits?
+  bool isFPRCopy(const MachineInstr *MI) const;
+
+  /// Return true if this is load/store scales or extends its register offset.
+  /// This refers to scaling a dynamic index as opposed to scaled immediates.
+  /// MI should be a memory op that allows scaled addressing.
+  bool isScaledAddr(const MachineInstr *MI) const;
+
+  /// Return true if pairing the given load or store is hinted to be
+  /// unprofitable.
+  bool isLdStPairSuppressed(const MachineInstr *MI) const;
+
+  /// Hint that pairing the given load or store is unprofitable.
+  void suppressLdStPair(MachineInstr *MI) const;
+
+  bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+                            unsigned &Offset,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool enableClusterLoads() const override { return true; }
+
+  bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
+                          unsigned NumLoads) const override;
+
+  bool shouldScheduleAdjacent(MachineInstr *First,
+                              MachineInstr *Second) const override;
+
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *MDPtr,
+                                         DebugLoc DL) const;
+  void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                        DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                        bool KillSrc, unsigned Opcode,
+                        llvm::ArrayRef<unsigned> Indices) const;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                        const SmallVectorImpl<unsigned> &Ops,
+                        int FrameIndex) const override;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  bool canInsertSelect(const MachineBasicBlock &,
+                       const SmallVectorImpl<MachineOperand> &Cond, unsigned,
+                       unsigned, int &, int &, int &) const override;
+  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    DebugLoc DL, unsigned DstReg,
+                    const SmallVectorImpl<MachineOperand> &Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+  /// Return true if the comparison instruction can be analyzed.
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+  /// optimizeCompareInstr - Convert the instruction supplying the argument to
+  /// the comparison into one that sets the zero bit in the flags register.
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+private:
+  void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
+                             MachineBasicBlock *TBB,
+                             const SmallVectorImpl<MachineOperand> &Cond) const;
+};
+
+/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
+/// plus Offset.  This is intended to be used from within the prolog/epilog
+/// insertion (PEI) pass, where a virtual scratch register may be allocated
+/// if necessary, to be replaced by the scavenger at the end of PEI.
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
+                     const TargetInstrInfo *TII,
+                     MachineInstr::MIFlag = MachineInstr::NoFlags,
+                     bool SetNZCV = false);
+
+/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
+/// FP. Return false if the offset could not be handled directly in MI, and
+/// return the left-over portion by reference.
+bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                            unsigned FrameReg, int &Offset,
+                            const AArch64InstrInfo *TII);
+
+/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal.
+enum AArch64FrameOffsetStatus {
+  AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
+  AArch64FrameOffsetIsLegal = 0x1,      ///< Offset is legal.
+  AArch64FrameOffsetCanUpdate = 0x2     ///< Offset can apply, at least partly.
+};
+
+/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// The returned value reports the validity of the frame offset for @p MI.
+/// It uses the values defined by AArch64FrameOffsetStatus for that.
+/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
+/// use an offset.eq
+/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
+/// rewriten in @p MI.
+/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
+/// amount that is off the limit of the legal offset.
+/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be
+/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp.
+/// If set, @p EmittableOffset contains the amount that can be set in @p MI
+/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
+/// is a legal offset.
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+                            bool *OutUseUnscaledOp = nullptr,
+                            unsigned *OutUnscaledOp = nullptr,
+                            int *EmittableOffset = nullptr);
+
+static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
+
+static inline bool isCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case AArch64::Bcc:
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; }
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
new file mode 100644
index 0000000..0ba069e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -0,0 +1,5290 @@
+//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                               AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicate<"FeatureNEON", "neon">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
+def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
+def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
+
+//===----------------------------------------------------------------------===//
+// AArch64-specific DAG Nodes.
+//
+
+// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
+def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
+                                              [SDTCisSameAs<0, 2>,
+                                               SDTCisSameAs<0, 3>,
+                                               SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<3, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+
+def SDT_AArch64Brcond  : SDTypeProfile<0, 3,
+                                     [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+                                      SDTCisVT<2, i32>]>;
+def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
+def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                        SDTCisVT<2, OtherVT>]>;
+
+
+def SDT_AArch64CSel  : SDTypeProfile<1, 4,
+                                   [SDTCisSameAs<0, 1>,
+                                    SDTCisSameAs<0, 2>,
+                                    SDTCisInt<3>,
+                                    SDTCisVT<4, i32>]>;
+def SDT_AArch64FCmp   : SDTypeProfile<0, 2,
+                                   [SDTCisFP<0>,
+                                    SDTCisSameAs<0, 1>]>;
+def SDT_AArch64Dup   : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDT_AArch64DupLane   : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_AArch64Zip   : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisSameAs<0, 1>,
+                                          SDTCisSameAs<0, 2>]>;
+def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisInt<2>, SDTCisInt<3>]>;
+def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                          SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+
+def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
+def SDT_AArch64fcmp  : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
+def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisSameAs<0,2>]>;
+def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisSameAs<0,2>,
+                                           SDTCisSameAs<0,3>]>;
+def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
+
+def SDT_AArch64ITOF  : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
+
+def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
+                                                 SDTCisPtrTy<1>]>;
+def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
+                                        [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
+                                         SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
+                                         SDTCisSameAs<1, 4>]>;
+
+
+// Node definitions.
+def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
+def AArch64addlow        : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
+def AArch64LOADgot       : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
+                                SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+                                [SDNPHasChain, SDNPOutGlue]>;
+def AArch64callseq_end   : SDNode<"ISD::CALLSEQ_END",
+                                SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                               SDTCisVT<1, i32> ]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64call          : SDNode<"AArch64ISD::CALL",
+                                SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                 SDNPVariadic]>;
+def AArch64brcond        : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
+                                [SDNPHasChain]>;
+def AArch64cbz           : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
+                                [SDNPHasChain]>;
+def AArch64cbnz           : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
+                                [SDNPHasChain]>;
+def AArch64tbz           : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
+                                [SDNPHasChain]>;
+def AArch64tbnz           : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
+                                [SDNPHasChain]>;
+
+
+def AArch64csel          : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
+def AArch64csinv         : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
+def AArch64csneg         : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
+def AArch64csinc         : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
+def AArch64retflag       : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AArch64adc       : SDNode<"AArch64ISD::ADC",  SDTBinaryArithWithFlagsIn >;
+def AArch64sbc       : SDNode<"AArch64ISD::SBC",  SDTBinaryArithWithFlagsIn>;
+def AArch64add_flag  : SDNode<"AArch64ISD::ADDS",  SDTBinaryArithWithFlagsOut,
+                            [SDNPCommutative]>;
+def AArch64sub_flag  : SDNode<"AArch64ISD::SUBS",  SDTBinaryArithWithFlagsOut>;
+def AArch64and_flag  : SDNode<"AArch64ISD::ANDS",  SDTBinaryArithWithFlagsOut,
+                            [SDNPCommutative]>;
+def AArch64adc_flag  : SDNode<"AArch64ISD::ADCS",  SDTBinaryArithWithFlagsInOut>;
+def AArch64sbc_flag  : SDNode<"AArch64ISD::SBCS",  SDTBinaryArithWithFlagsInOut>;
+
+def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def AArch64fcmp      : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+
+def AArch64fmax      : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>;
+def AArch64fmin      : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>;
+
+def AArch64dup       : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
+def AArch64duplane8  : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
+def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
+def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
+def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+
+def AArch64zip1      : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
+def AArch64zip2      : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
+def AArch64uzp1      : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
+def AArch64uzp2      : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
+def AArch64trn1      : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
+def AArch64trn2      : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
+
+def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
+def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
+def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
+def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
+def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
+def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
+def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
+
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
+
+def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
+def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
+def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
+def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
+def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
+def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
+def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+
+def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
+def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
+def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+
+def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
+def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
+def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
+def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
+def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;
+
+def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
+def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
+def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;
+
+def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
+def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
+def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
+def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
+def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
+def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
+                        (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
+
+def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
+def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
+def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
+def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
+def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
+
+def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
+def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
+
+def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
+
+def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
+                  [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def AArch64Prefetch        : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
+                               [SDNPHasChain, SDNPSideEffect]>;
+
+def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
+def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
+
+def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL",
+                                 SDT_AArch64TLSDescCall,
+                                 [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+                                  SDNPVariadic]>;
+
+def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
+                                 SDT_AArch64WrapperLarge>;
+
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// AArch64 Instruction Predicate Definitions.
+//
+def HasZCZ    : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def NoZCZ     : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
+def IsDarwin  : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
+def ForCodeSize   : Predicate<"ForCodeSize">;
+def NotForCodeSize   : Predicate<"!ForCodeSize">;
+
+include "AArch64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous instructions.
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              [(AArch64callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions.  When that changes, they can be
+// removed, along with the AArch64Wrapper node.
+
+let AddedComplexity = 10 in
+def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
+                     [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
+              Sched<[WriteLDAdr]>;
+
+// The MOVaddr instruction should match only when the add is not folded
+// into a load or store address.
+def MOVaddr
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
+                                            tglobaladdr:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrJT
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
+                                             tjumptable:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrCP
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
+                                             tconstpool:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrBA
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
+                                             tblockaddress:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrTLS
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
+                                            tglobaltlsaddr:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrEXT
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
+                                            texternalsym:$low))]>,
+      Sched<[WriteAdrAdr]>;
+
+} // isReMaterializable, isCodeGenOnly
+
+def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
+          (LOADgot tglobaltlsaddr:$addr)>;
+
+def : Pat<(AArch64LOADgot texternalsym:$addr),
+          (LOADgot texternalsym:$addr)>;
+
+def : Pat<(AArch64LOADgot tconstpool:$addr),
+          (LOADgot tconstpool:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// System instructions.
+//===----------------------------------------------------------------------===//
+
+def HINT : HintI<"hint">;
+def : InstAlias<"nop",  (HINT 0b000)>;
+def : InstAlias<"yield",(HINT 0b001)>;
+def : InstAlias<"wfe",  (HINT 0b010)>;
+def : InstAlias<"wfi",  (HINT 0b011)>;
+def : InstAlias<"sev",  (HINT 0b100)>;
+def : InstAlias<"sevl", (HINT 0b101)>;
+
+// As far as LLVM is concerned this writes to the system's exclusive monitors.
+let mayLoad = 1, mayStore = 1 in
+def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
+
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity.
+let mayLoad = ?, mayStore = ? in {
+def DMB   : CRmSystemI<barrier_op, 0b101, "dmb",
+                       [(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;
+
+def DSB   : CRmSystemI<barrier_op, 0b100, "dsb",
+                       [(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;
+
+def ISB   : CRmSystemI<barrier_op, 0b110, "isb",
+                       [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+}
+
+def : InstAlias<"clrex", (CLREX 0xf)>;
+def : InstAlias<"isb", (ISB 0xf)>;
+
+def MRS    : MRSI;
+def MSR    : MSRI;
+def MSRpstate: MSRpstateI;
+
+// The thread pointer (on Linux, at least, where this has been implemented) is
+// TPIDR_EL0.
+def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+
+// Generic system instructions
+def SYSxt  : SystemXtI<0, "sys">;
+def SYSLxt : SystemLXtI<1, "sysl">;
+
+def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
+                (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
+                 sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
+
+//===----------------------------------------------------------------------===//
+// Move immediate instructions.
+//===----------------------------------------------------------------------===//
+
+defm MOVK : InsertImmediate<0b11, "movk">;
+defm MOVN : MoveImmediate<0b00, "movn">;
+
+let PostEncoderMethod = "fixMOVZ" in
+defm MOVZ : MoveImmediate<0b10, "movz">;
+
+// First group of aliases covers an implicit "lsl #0".
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+
+// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
+
+// Final group of aliases covers true "mov $Rd, $imm" cases.
+multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
+                          int width, int shift> {
+  def _asmoperand : AsmOperandClass {
+    let Name = basename # width # "_lsl" # shift # "MovAlias";
+    let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
+                               # shift # ">";
+    let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
+  }
+
+  def _movimm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
+  }
+
+  def : InstAlias<"mov $Rd, $imm",
+                  (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
+}
+
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;
+
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;
+
+let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
+    isAsCheapAsAMove = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions.  When that changes, we can select
+// directly to the real instructions and get rid of these pseudos.
+
+def MOVi32imm
+    : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
+             [(set GPR32:$dst, imm:$src)]>,
+      Sched<[WriteImm]>;
+def MOVi64imm
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
+             [(set GPR64:$dst, imm:$src)]>,
+      Sched<[WriteImm]>;
+} // isReMaterializable, isCodeGenOnly
+
+// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
+// eventual expansion code fewer bits to worry about getting right. Marshalling
+// the types is a little tricky though:
+def i64imm_32bit : ImmLeaf<i64, [{
+  return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
+def trunc_imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(i64 i64imm_32bit:$src),
+          (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+
+// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
+// sequences.
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
+                             tglobaladdr:$g1, tglobaladdr:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
+                                  tglobaladdr:$g2, 32),
+                          tglobaladdr:$g1, 16),
+                  tglobaladdr:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
+                             tblockaddress:$g1, tblockaddress:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
+                                  tblockaddress:$g2, 32),
+                          tblockaddress:$g1, 16),
+                  tblockaddress:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
+                             tconstpool:$g1, tconstpool:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
+                                  tconstpool:$g2, 32),
+                          tconstpool:$g1, 16),
+                  tconstpool:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
+                             tjumptable:$g1, tjumptable:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
+                                  tjumptable:$g2, 32),
+                          tjumptable:$g1, 16),
+                  tjumptable:$g0, 0)>;
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic instructions.
+//===----------------------------------------------------------------------===//
+
+// Add/subtract with carry.
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
+defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;
+
+def : InstAlias<"ngc $dst, $src",  (SBCWr  GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngc $dst, $src",  (SBCXr  GPR64:$dst, XZR, GPR64:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
+
+// Add/subtract
+defm ADD : AddSub<0, "add", add>;
+defm SUB : AddSub<1, "sub">;
+
+def : InstAlias<"mov $dst, $src",
+                (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+
+defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">;
+defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">;
+
+// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
+def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
+          (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
+          (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
+          (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
+          (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
+          (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
+          (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+//  These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+//  These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src$shift",
+                (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"neg $dst, $src$shift",
+                (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src$shift",
+                (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"negs $dst, $src$shift",
+                (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+
+// Unsigned/Signed divide
+defm UDIV : Div<0, "udiv", udiv>;
+defm SDIV : Div<1, "sdiv", sdiv>;
+let isCodeGenOnly = 1 in {
+defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>;
+defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>;
+}
+
+// Variable shift
+defm ASRV : Shift<0b10, "asr", sra>;
+defm LSLV : Shift<0b00, "lsl", shl>;
+defm LSRV : Shift<0b01, "lsr", srl>;
+defm RORV : Shift<0b11, "ror", rotr>;
+
+def : ShiftAlias<"asrv", ASRVWr, GPR32>;
+def : ShiftAlias<"asrv", ASRVXr, GPR64>;
+def : ShiftAlias<"lslv", LSLVWr, GPR32>;
+def : ShiftAlias<"lslv", LSLVXr, GPR64>;
+def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
+def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
+def : ShiftAlias<"rorv", RORVWr, GPR32>;
+def : ShiftAlias<"rorv", RORVXr, GPR64>;
+
+// Multiply-add
+let AddedComplexity = 7 in {
+defm MADD : MulAccum<0, "madd", add>;
+defm MSUB : MulAccum<1, "msub", sub>;
+
+def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
+          (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
+          (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+
+def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
+          (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
+          (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+} // AddedComplexity = 7
+
+let AddedComplexity = 5 in {
+def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
+def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
+def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
+def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
+          (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
+          (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
+          (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
+          (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+} // AddedComplexity = 5
+
+def : MulAccumWAlias<"mul", MADDWrrr>;
+def : MulAccumXAlias<"mul", MADDXrrr>;
+def : MulAccumWAlias<"mneg", MSUBWrrr>;
+def : MulAccumXAlias<"mneg", MSUBXrrr>;
+def : WideMulAccumAlias<"smull", SMADDLrrr>;
+def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
+def : WideMulAccumAlias<"umull", UMADDLrrr>;
+def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
+
+// Multiply-high
+def SMULHrr : MulHi<0b010, "smulh", mulhs>;
+def UMULHrr : MulHi<0b110, "umulh", mulhu>;
+
+// CRC32
+def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
+def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
+def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
+def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;
+
+def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
+def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
+def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
+def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
+
+
+//===----------------------------------------------------------------------===//
+// Logical instructions.
+//===----------------------------------------------------------------------===//
+
+// (immediate)
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
+defm AND  : LogicalImm<0b00, "and", and, "bic">;
+defm EOR  : LogicalImm<0b10, "eor", xor, "eon">;
+defm ORR  : LogicalImm<0b01, "orr", or, "orn">;
+
+// FIXME: these aliases *are* canonical sometimes (when movz can't be
+// used). Actually, it seems to be working right now, but putting logical_immXX
+// here is a bit dodgy on the AsmParser side too.
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
+                                          logical_imm32:$imm), 0>;
+def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
+                                          logical_imm64:$imm), 0>;
+
+
+// (register)
+defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
+defm BICS : LogicalRegS<0b11, 1, "bics",
+                        BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
+defm AND  : LogicalReg<0b00, 0, "and", and>;
+defm BIC  : LogicalReg<0b00, 1, "bic",
+                       BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm EON  : LogicalReg<0b10, 1, "eon",
+                       BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
+defm EOR  : LogicalReg<0b10, 0, "eor", xor>;
+defm ORN  : LogicalReg<0b01, 1, "orn",
+                       BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+defm ORR  : LogicalReg<0b01, 0, "orr", or>;
+
+def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
+def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
+def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+
+def : InstAlias<"mvn $Wd, $Wm$sh",
+                (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+def : InstAlias<"mvn $Xd, $Xm$sh",
+                (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+                (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
+def : InstAlias<"tst $src1, $src2",
+                (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+                        (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+def : InstAlias<"tst $src1, $src2",
+                        (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+
+def : InstAlias<"tst $src1, $src2$sh",
+               (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+def : InstAlias<"tst $src1, $src2$sh",
+               (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+
+
+def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
+def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
+
+
+//===----------------------------------------------------------------------===//
+// One operand data processing instructions.
+//===----------------------------------------------------------------------===//
+
+defm CLS    : OneOperandData<0b101, "cls">;
+defm CLZ    : OneOperandData<0b100, "clz", ctlz>;
+defm RBIT   : OneOperandData<0b000, "rbit">;
+
+def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
+def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;
+
+def  REV16Wr : OneWRegData<0b001, "rev16",
+                                  UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+def  REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
+
+def : Pat<(cttz GPR32:$Rn),
+          (CLZWr (RBITWr GPR32:$Rn))>;
+def : Pat<(cttz GPR64:$Rn),
+          (CLZXr (RBITXr GPR64:$Rn))>;
+def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
+                (i32 1))),
+          (CLSWr GPR32:$Rn)>;
+def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
+                (i64 1))),
+          (CLSXr GPR64:$Rn)>;
+
+// Unlike the other one operand instructions, the instructions with the "rev"
+// mnemonic do *not* just different in the size bit, but actually use different
+// opcode bits for the different sizes.
+def REVWr   : OneWRegData<0b010, "rev", bswap>;
+def REVXr   : OneXRegData<0b011, "rev", bswap>;
+def REV32Xr : OneXRegData<0b010, "rev32",
+                                 UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+
+// The bswap commutes with the rotr so we want a pattern for both possible
+// orders.
+def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
+def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield immediate extraction instruction.
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in
+defm EXTR : ExtractImm<"extr">;
+def : InstAlias<"ror $dst, $src, $shift",
+            (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
+def : InstAlias<"ror $dst, $src, $shift",
+            (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
+
+def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
+          (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
+def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
+          (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Other bitfield immediate instructions.
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in {
+defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
+defm SBFM : BitfieldImm<0b00, "sbfm">;
+defm UBFM : BitfieldImm<0b10, "ubfm">;
+}
+
+def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(7, 31 - shift_amt)
+def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  enc = enc > 7 ? 7 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(15, 31 - shift_amt)
+def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  enc = enc > 15 ? 15 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(7, 63 - shift_amt)
+def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 7 ? 7 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(15, 63 - shift_amt)
+def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 15 ? 15 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(31, 63 - shift_amt)
+def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 31 ? 31 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                              (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_b imm0_63:$imm)))>;
+
+let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
+
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+//===----------------------------------------------------------------------===//
+// Conditionally set flags instructions.
+//===----------------------------------------------------------------------===//
+defm CCMN : CondSetFlagsImm<0, "ccmn">;
+defm CCMP : CondSetFlagsImm<1, "ccmp">;
+
+defm CCMN : CondSetFlagsReg<0, "ccmn">;
+defm CCMP : CondSetFlagsReg<1, "ccmp">;
+
+//===----------------------------------------------------------------------===//
+// Conditional select instructions.
+//===----------------------------------------------------------------------===//
+defm CSEL  : CondSelect<0, 0b00, "csel">;
+
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
+          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
+          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
+          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
+          (CSINVXr XZR, XZR, (i32 imm:$cc))>;
+
+// The inverse of the condition code from the alias instruction is what is used
+// in the aliased instruction. The parser all ready inverts the condition code
+// for these aliases.
+def : InstAlias<"cset $dst, $cc",
+                (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc",
+                (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative instructions.
+//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in {
+let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+def ADR  : ADRI<0, "adr", adrlabel, []>;
+} // neverHasSideEffects = 1
+
+def ADRP : ADRI<1, "adrp", adrplabel,
+                [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
+
+// page address of a constant pool entry, block address
+def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions.
+//===----------------------------------------------------------------------===//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET  : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
+
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+} // isCall
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR  : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
+
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+}
+
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+  let AsmString = ".tlsdesccall $sym";
+}
+
+// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
+// gets expanded to two MCInsts during lowering.
+let isCall = 1, Defs = [LR] in
+def TLSDESC_BLR
+    : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
+             [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
+
+def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym),
+          (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
+//===----------------------------------------------------------------------===//
+// Conditional branch (immediate) instruction.
+//===----------------------------------------------------------------------===//
+def Bcc : BranchCond;
+
+//===----------------------------------------------------------------------===//
+// Compare-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm CBZ  : CmpBranch<0, "cbz", AArch64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
+
+//===----------------------------------------------------------------------===//
+// Test-bit-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm TBZ  : TestBranch<0, "tbz", AArch64tbz>;
+defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (immediate) instructions.
+//===----------------------------------------------------------------------===//
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B  : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
+
+//===----------------------------------------------------------------------===//
+// Exception generation instructions.
+//===----------------------------------------------------------------------===//
+def BRK   : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT   : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC   : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC   : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC   : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+
+// Pair (no allocate)
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+// Integer
+defm LDRBB : Load8RO<0b00,  0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
+defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
+defm LDRW  : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
+defm LDRX  : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
+
+// Floating-point
+defm LDRB : Load8RO<0b00,   1, 0b01, FPR8,   "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01,  1, 0b01, FPR16,  "ldr", f16, load>;
+defm LDRS : Load32RO<0b10,  1, 0b01, FPR32,  "ldr", f32, load>;
+defm LDRD : Load64RO<0b11,  1, 0b01, FPR64,  "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+
+// Load sign-extended half-word
+defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
+defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
+
+// Load sign-extended byte
+defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
+defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
+
+// Load sign-extended word
+defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
+
+// Pre-fetch.
+defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
+                              ValueType ScalTy, ValueType VecTy,
+                              Instruction LOADW, Instruction LOADX,
+                              SubRegIndex sub> {
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
+                           sub)>;
+
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
+                           sub)>;
+}
+
+let AddedComplexity = 10 in {
+defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v8i8,  LDRBroW, LDRBroX, bsub>;
+defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v16i8, LDRBroW, LDRBroX, bsub>;
+
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
+
+defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro32, load,       f32, v2f32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load,       f32, v4f32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
+
+defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
+
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                           ro_Wextend64:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                           ro_Xextend64:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+}
+
+// Match all load 64 bits width whose type is compatible with FPR64
+multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
+                        Instruction LOADW, Instruction LOADX> {
+
+  def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v8i8,  LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+}
+
+defm : VecROLoadPat<ro64, v1i64,  LDRDroW, LDRDroX>;
+defm : VecROLoadPat<ro64, v1f64,  LDRDroW, LDRDroX>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat<ro128, v2i64,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v2f64,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v4i32,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v4f32,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v8i16,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v16i8,  LDRQroW, LDRQroX>;
+}
+} // AddedComplexity = 10
+
+// zextload -> i64
+multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                           sub_32)>;
+
+  def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                           sub_32)>;
+}
+
+let AddedComplexity = 10 in {
+  defm : ExtLoadTo64ROPat<ro8,  zextloadi8,  LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW,  LDRWroX>;
+
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat<ro8,  zextloadi1,  LDRBBroW, LDRBBroX>;
+
+  // extload -> zextload
+  defm : ExtLoadTo64ROPat<ro8,  extloadi8,   LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo64ROPat<ro16, extloadi16,  LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo64ROPat<ro32, extloadi32,  LDRWroW,  LDRWroX>;
+
+  // extloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat<ro8,  extloadi1,   LDRBBroW, LDRBBroX>;
+}
+
+
+// zextload -> i64
+multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+
+}
+
+let AddedComplexity = 10 in {
+  // extload -> zextload
+  defm : ExtLoadTo32ROPat<ro8,  extloadi8,   LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo32ROPat<ro16, extloadi16,  LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo32ROPat<ro32, extloadi32,  LDRWroW,  LDRWroX>;
+
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+}
+
+//---
+// (unsigned immediate)
+//---
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
+                   [(set GPR64:$Rt,
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
+                   [(set GPR32:$Rt,
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
+                   [(set FPR8:$Rt,
+                         (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
+                   [(set (f16 FPR16:$Rt),
+                         (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
+                   [(set (f32 FPR32:$Rt),
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
+                   [(set (f64 FPR64:$Rt),
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
+                 [(set (f128 FPR128:$Rt),
+                       (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32
+               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32
+               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32
+               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32
+               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v2i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(f128  (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+          (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+
+defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
+                    [(set GPR32:$Rt,
+                          (zextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                     uimm12s2:$offset)))]>;
+defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
+                    [(set GPR32:$Rt,
+                          (zextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                   uimm12s1:$offset)))]>;
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
+                     [(set GPR32:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
+                     [(set GPR64:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+
+// load sign-extended byte
+defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
+                     [(set GPR32:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
+                     [(set GPR64:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+
+// load sign-extended word
+defm LDRSW  : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
+                     [(set GPR64:$Rt,
+                           (sextloadi32 (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)))]>;
+
+// load zero-extended word
+def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+      (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+                        [(AArch64Prefetch imm:$Rt,
+                                        (am_indexed64 GPR64sp:$Rn,
+                                                      uimm12s8:$offset))]>;
+
+def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
+
+//---
+// (literal)
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+//                   [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
+                    [(set GPR64:$Rt,
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
+                    [(set GPR32:$Rt,
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
+                    [(set FPR8:$Rt,
+                          (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
+                    [(set FPR16:$Rt,
+                          (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
+                    [(set (f32 FPR32:$Rt),
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
+                    [(set (f64 FPR64:$Rt),
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
+                    [(set (f128 FPR128:$Rt),
+                          (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
+
+defm LDURHH
+    : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
+             [(set GPR32:$Rt,
+                    (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURBB
+    : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
+             [(set GPR32:$Rt,
+                    (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+//  anyext -> zext
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// the don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+  let Name = "SImm9OffsetFB" # Width;
+  let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+  let RenderMethod = "addImmOperands";
+}
+
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+               (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+  (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+  (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+    : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+               [(set GPR32:$Rt,
+                    (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+    : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+              [(set GPR64:$Rt,
+                    (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+    : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+                [(set GPR32:$Rt,
+                      (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+    : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+                [(set GPR64:$Rt,
+                      (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+    : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+              [(set GPR64:$Rt,
+                    (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
+                (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
+                (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+                (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+                (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+                (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+                (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
+                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+
+// Pre-fetch.
+defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
+                  [(AArch64Prefetch imm:$Rt,
+                                  (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
+defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
+
+defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
+defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
+
+// load sign-extended half-word
+defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
+defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
+
+// load sign-extended byte
+defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
+defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
+
+// load sign-extended word
+defm LDTRSW  : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
+
+//---
+// (immediate pre-indexed)
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8,  "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//---
+// (immediate post-indexed)
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8,  "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//===----------------------------------------------------------------------===//
+// Store instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+
+//---
+// (Register offset)
+
+// Integer
+defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
+defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
+defm STRW  : Store32RO<0b10, 0, 0b00, GPR32, "str",  i32, store>;
+defm STRX  : Store64RO<0b11, 0, 0b00, GPR64, "str",  i64, store>;
+
+
+// Floating-point
+defm STRB : Store8RO< 0b00,  1, 0b00, FPR8,   "str", untyped, store>;
+defm STRH : Store16RO<0b01,  1, 0b00, FPR16,  "str", f16,     store>;
+defm STRS : Store32RO<0b10,  1, 0b00, FPR32,  "str", f32,     store>;
+defm STRD : Store64RO<0b11,  1, 0b00, FPR64,  "str", f64,     store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128,    store>;
+
+multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
+                                 Instruction STRW, Instruction STRX> {
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+  // truncstore i64
+  defm : TruncStoreFrom64ROPat<ro8,  truncstorei8,  STRBBroW, STRBBroX>;
+  defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
+  defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW,  STRWroX>;
+}
+
+multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
+                         Instruction STRW, Instruction STRX> {
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+}
+
+defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
+defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+}
+} // AddedComplexity = 10
+
+//---
+// (unsigned immediate)
+defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
+                   [(store GPR64:$Rt,
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
+                    [(store GPR32:$Rt,
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
+                    [(store FPR8:$Rt,
+                            (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
+                    [(store (f16 FPR16:$Rt),
+                            (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
+                    [(store (f32 FPR32:$Rt),
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
+                    [(store (f64 FPR64:$Rt),
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+
+defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
+                     [(truncstorei16 GPR32:$Rt,
+                                     (am_indexed16 GPR64sp:$Rn,
+                                                   uimm12s2:$offset))]>;
+defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1,  "strb",
+                     [(truncstorei8 GPR32:$Rt,
+                                    (am_indexed8 GPR64sp:$Rn,
+                                                 uimm12s1:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(store (f128  FPR128:$Rt),
+                 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+          (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
+// truncstore i64
+def : Pat<(truncstorei32 GPR64:$Rt,
+                         (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+  (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt,
+                         (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+  (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+  (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unscaled immediate)
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
+                         [(store GPR64:$Rt,
+                                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
+                         [(store GPR32:$Rt,
+                                 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
+                         [(store FPR8:$Rt,
+                                 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
+                         [(store (f16 FPR16:$Rt),
+                                 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
+                         [(store (f32 FPR32:$Rt),
+                                 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
+                         [(store (f64 FPR64:$Rt),
+                                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
+                         [(store (f128 FPR128:$Rt),
+                                 (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
+                         [(truncstorei16 GPR32:$Rt,
+                                 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
+                         [(truncstorei8 GPR32:$Rt,
+                                  (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+  (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+  (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+  (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+def : InstAlias<"strb $Rt, [$Rn, $offset]",
+                (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"strh $Rt, [$Rn, $offset]",
+                (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str",  pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str",  pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8,  "str",  pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str",  pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str",  pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str",  pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8,  i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+           simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+
+def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//---
+// (immediate post-indexed)
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32,  "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64,  "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8,   "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16,  "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32,  "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64,  "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+             simm9:$off)>;
+def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+             simm9:$off)>;
+
+def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//===----------------------------------------------------------------------===//
+// Load/store exclusive instructions.
+//===----------------------------------------------------------------------===//
+
+def LDARW  : LoadAcquire   <0b10, 1, 1, 0, 1, GPR32, "ldar">;
+def LDARX  : LoadAcquire   <0b11, 1, 1, 0, 1, GPR64, "ldar">;
+def LDARB  : LoadAcquire   <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
+def LDARH  : LoadAcquire   <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
+
+def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
+def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
+def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
+def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
+
+def LDXRW  : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
+def LDXRX  : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
+def LDXRB  : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
+def LDXRH  : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
+
+def STLRW  : StoreRelease  <0b10, 1, 0, 0, 1, GPR32, "stlr">;
+def STLRX  : StoreRelease  <0b11, 1, 0, 0, 1, GPR64, "stlr">;
+def STLRB  : StoreRelease  <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
+def STLRH  : StoreRelease  <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
+
+def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
+def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
+def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
+def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
+
+def STXRW  : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
+def STXRX  : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
+def STXRB  : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
+def STXRH  : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
+
+def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
+def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
+
+def LDXPW  : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
+def LDXPX  : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
+
+def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
+def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
+
+def STXPW  : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
+def STXPX  : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
+
+//===----------------------------------------------------------------------===//
+// Scaled floating point to integer conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled integer to floating point conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+
+//===----------------------------------------------------------------------===//
+// Unscaled integer to floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FMOV : UnscaledConversion<"fmov">;
+
+def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
+def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCVT : FPConversion<"fcvt">;
+
+//===----------------------------------------------------------------------===//
+// Floating point single operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FABS   : SingleOperandFPData<0b0001, "fabs", fabs>;
+defm FMOV   : SingleOperandFPData<0b0000, "fmov">;
+defm FNEG   : SingleOperandFPData<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
+
+def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
+          (FRINTNDr FPR64:$Rn)>;
+
+// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
+// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
+// <rdar://problem/13715968>
+// TODO: We should really model the FPSR flags correctly. This is really ugly.
+let hasSideEffects = 1 in {
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
+}
+
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
+
+let SchedRW = [WriteFDiv] in {
+defm FSQRT  : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point two operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FADD   : TwoOperandFPData<0b0010, "fadd", fadd>;
+let SchedRW = [WriteFDiv] in {
+defm FDIV   : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+}
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAX   : TwoOperandFPData<0b0100, "fmax", AArch64fmax>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>;
+defm FMIN   : TwoOperandFPData<0b0101, "fmin", AArch64fmin>;
+let SchedRW = [WriteFMul] in {
+defm FMUL   : TwoOperandFPData<0b0000, "fmul", fmul>;
+defm FNMUL  : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+}
+defm FSUB   : TwoOperandFPData<0b0011, "fsub", fsub>;
+
+def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point three operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FMADD  : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMSUB  : ThreeOperandFPData<0, 1, "fmsub",
+     TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
+     TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
+     TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+
+// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
+// the NEON variant.
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
+          (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
+          (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
+// "(-a) + b*(-c)".
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
+          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
+          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
+          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
+          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCMPE : FPComparison<1, "fcmpe">;
+defm FCMP  : FPComparison<0, "fcmp", AArch64fcmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCCMPE : FPCondComparison<1, "fccmpe">;
+defm FCCMP  : FPCondComparison<0, "fccmp">;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional select instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCSEL : FPCondSelect<"fcsel">;
+
+// CSEL instructions providing f128 types need to be handled by a
+// pseudo-instruction since the eventual code will need to introduce basic
+// blocks and control flow.
+def F128CSEL : Pseudo<(outs FPR128:$Rd),
+                      (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
+                      [(set (f128 FPR128:$Rd),
+                            (AArch64csel FPR128:$Rn, FPR128:$Rm,
+                                       (i32 imm:$cond), NZCV))]> {
+  let Uses = [NZCV];
+  let usesCustomInserter = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Floating point immediate move.
+//===----------------------------------------------------------------------===//
+
+let isReMaterializable = 1 in {
+defm FMOV : FPMoveImmediate<"fmov">;
+}
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS    : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CLS    : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
+defm CLZ    : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
+defm CMEQ   : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
+defm CNT    : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
+defm FABS   : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+
+defm FCMEQ  : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE  : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT  : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
+defm FCVTL  : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
+          (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+                                                              (i64 4)))),
+          (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+                                                    (i64 2))))),
+          (FCVTLv4i32 V128:$Rn)>;
+
+defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
+defm FCVTN  : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
+def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
+          (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd,
+                          (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+          (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
+          (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
+defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
+                                        int_aarch64_neon_fcvtxn>;
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
+                                       int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
+                                       int_aarch64_neon_fcvtzu>;
+}
+defm FNEG   : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
+defm FSQRT  : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+defm NEG    : SIMDTwoVectorBHSD<1, 0b01011, "neg",
+                               UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm NOT    : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
+// Aliases for MVN -> NOT.
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
+                (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
+                (NOTv16i8 V128:$Vd, V128:$Vn)>;
+
+def : Pat<(AArch64neg (v8i8  V64:$Rn)),  (NEGv8i8  V64:$Rn)>;
+def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
+def : Pat<(AArch64neg (v4i16 V64:$Rn)),  (NEGv4i16 V64:$Rn)>;
+def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i32 V64:$Rn)),  (NEGv2i32 V64:$Rn)>;
+def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
+
+def : Pat<(AArch64not (v8i8 V64:$Rn)),   (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v1i64 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+def : Pat<(vnot (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+defm RBIT   : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm REV16  : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
+defm REV32  : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
+defm REV64  : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
+defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
+       BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
+defm SCVTF  : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+defm SHLL   : SIMDVectorLShiftLongBySizeBHS;
+defm SQABS  : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG  : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN  : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
+defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
+defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
+       BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
+                    int_aarch64_neon_uaddlp>;
+defm UCVTF  : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UQXTN  : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
+defm XTN    : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Patterns for vector long shift (by element width). These need to match all
+// three of zext, sext and anyext so it's easier to pull the patterns out of the
+// definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+  def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD     : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP    : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ    : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE    : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT    : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI    : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS    : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST   : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD    : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE   : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
+defm FACGT   : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP   : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
+defm FADD    : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
+defm FCMEQ   : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE   : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT   : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FDIV    : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM  : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAXP   : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX    : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM  : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
+defm FMINP   : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN    : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA     : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+            TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS     : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+            TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
+          (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+          (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+          (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+defm FMULX    : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL     : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
+defm FRECPS   : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS  : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB     : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
+defm MLA      : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
+                      TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLS      : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
+                      TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MUL      : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+defm PMUL     : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
+defm SABA     : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
+      TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
+defm SABD     : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
+defm SHADD    : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
+defm SHSUB    : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
+defm SMAXP    : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
+defm SMAX     : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>;
+defm SMINP    : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
+defm SMIN     : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>;
+defm SQADD    : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH  : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
+defm SQRSHL   : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
+defm SQSHL    : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB    : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SRHADD   : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRSHL    : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
+defm SSHL     : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
+defm SUB      : SIMDThreeSameVector<1,0b10000,"sub", sub>;
+defm UABA     : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
+      TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
+defm UABD     : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
+defm UHADD    : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
+defm UHSUB    : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
+defm UMAXP    : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
+defm UMAX     : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>;
+defm UMINP    : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
+defm UMIN     : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>;
+defm UQADD    : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL   : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
+defm UQSHL    : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB    : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm URHADD   : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URSHL    : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
+defm USHL     : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
+
+defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
+                                  BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
+defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
+    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
+defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
+                                  BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
+defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
+
+def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
+def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+
+def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
+def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+
+def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmls.8b\t$dst, $src1, $src2}",
+                (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmls.16b\t$dst, $src1, $src2}",
+                (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmls.4h\t$dst, $src1, $src2}",
+                (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmls.8h\t$dst, $src1, $src2}",
+                (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmls.2s\t$dst, $src1, $src2}",
+                (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmls.4s\t$dst, $src1, $src2}",
+                (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmls.2d\t$dst, $src1, $src2}",
+                (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmlo.8b\t$dst, $src1, $src2}",
+                (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmlo.16b\t$dst, $src1, $src2}",
+                (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmlo.4h\t$dst, $src1, $src2}",
+                (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmlo.8h\t$dst, $src1, $src2}",
+                (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmlo.2s\t$dst, $src1, $src2}",
+                (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmlo.4s\t$dst, $src1, $src2}",
+                (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmlo.2d\t$dst, $src1, $src2}",
+                (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmle.8b\t$dst, $src1, $src2}",
+                (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmle.16b\t$dst, $src1, $src2}",
+                (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmle.4h\t$dst, $src1, $src2}",
+                (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmle.8h\t$dst, $src1, $src2}",
+                (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmle.2s\t$dst, $src1, $src2}",
+                (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmle.4s\t$dst, $src1, $src2}",
+                (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmle.2d\t$dst, $src1, $src2}",
+                (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmlt.8b\t$dst, $src1, $src2}",
+                (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmlt.16b\t$dst, $src1, $src2}",
+                (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmlt.4h\t$dst, $src1, $src2}",
+                (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmlt.8h\t$dst, $src1, $src2}",
+                (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmlt.2s\t$dst, $src1, $src2}",
+                (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmlt.4s\t$dst, $src1, $src2}",
+                (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmlt.2d\t$dst, $src1, $src2}",
+                (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|fcmle.2s\t$dst, $src1, $src2}",
+                (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|fcmle.4s\t$dst, $src1, $src2}",
+                (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|fcmle.2d\t$dst, $src1, $src2}",
+                (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|fcmlt.2s\t$dst, $src1, $src2}",
+                (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|fcmlt.4s\t$dst, $src1, $src2}",
+                (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|fcmlt.2d\t$dst, $src1, $src2}",
+                (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|facle.2s\t$dst, $src1, $src2}",
+                (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|facle.4s\t$dst, $src1, $src2}",
+                (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|facle.2d\t$dst, $src1, $src2}",
+                (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|faclt.2s\t$dst, $src1, $src2}",
+                (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|faclt.4s\t$dst, $src1, $src2}",
+                (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|faclt.2d\t$dst, $src1, $src2}",
+                (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD      : SIMDThreeScalarD<0, 0b10000, "add", add>;
+defm CMEQ     : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE     : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT     : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI     : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS     : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST    : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD     : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>;
+def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FABD64 FPR64:$Rn, FPR64:$Rm)>;
+defm FACGE    : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
+                                     int_aarch64_neon_facge>;
+defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
+                                     int_aarch64_neon_facgt>;
+defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FMULX    : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS   : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS  : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQRSHL   : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
+defm SQSHL    : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB    : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
+defm SRSHL    : SIMDThreeScalarD<   0, 0b01010, "srshl", int_aarch64_neon_srshl>;
+defm SSHL     : SIMDThreeScalarD<   0, 0b01000, "sshl", int_aarch64_neon_sshl>;
+defm SUB      : SIMDThreeScalarD<   1, 0b10000, "sub", sub>;
+defm UQADD    : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL   : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
+defm UQSHL    : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB    : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
+defm URSHL    : SIMDThreeScalarD<   1, 0b01010, "urshl", int_aarch64_neon_urshl>;
+defm USHL     : SIMDThreeScalarD<   1, 0b01000, "ushl", int_aarch64_neon_ushl>;
+
+def : InstAlias<"cmls $dst, $src1, $src2",
+                (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmle $dst, $src1, $src2",
+                (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlo $dst, $src1, $src2",
+                (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlt $dst, $src1, $src2",
+                (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+                (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+                (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+                (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+                (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+                (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+                (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+                (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+                (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions (mixed operands).
+//===----------------------------------------------------------------------===//
+defm SQDMULL  : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+                                       int_aarch64_neon_sqdmulls_scalar>;
+defm SQDMLAL  : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL  : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                        (i32 FPR32:$Rm))))),
+          (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                        (i32 FPR32:$Rm))))),
+          (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS    : SIMDTwoScalarD<    0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CMEQ   : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
+defm FCMEQ  : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE  : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT  : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoScalarSD<   0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDTwoScalarSD<   1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDTwoScalarSD<   0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDTwoScalarSD<   1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDTwoScalarSD<   0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDTwoScalarSD<   1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDTwoScalarSD<   0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDTwoScalarSD<   1, 1, 0b11010, "fcvtpu">;
+def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
+defm FCVTZS : SIMDTwoScalarSD<   0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDTwoScalarSD<   1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDTwoScalarSD<   0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDTwoScalarSD<   0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDTwoScalarSD<  1, 1, 0b11101, "frsqrte">;
+defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
+                                 UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm SCVTF  : SIMDTwoScalarCVTSD<   0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SQABS  : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG  : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN  : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
+defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
+defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
+                                     int_aarch64_neon_suqadd>;
+defm UCVTF  : SIMDTwoScalarCVTSD<   1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UQXTN  : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
+defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
+                                    int_aarch64_neon_usqadd>;
+
+def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
+          (FCVTASv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
+          (FCVTAUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
+          (FCVTMSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
+          (FCVTMUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
+          (FCVTNSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
+          (FCVTNUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
+          (FCVTPSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
+          (FCVTPUv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
+          (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
+          (FRECPXv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
+          (FRECPXv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
+          (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// Here are the patterns for 8 and 16-bits to float.
+// 8-bits -> float.
+multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
+                             SDPatternOperator loadop, Instruction UCVTF,
+                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
+                             SubRegIndex sub> {
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                                 sub))>;
+
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                                 sub))>;
+}
+
+defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
+                         UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+                     (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> float.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
+                         UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f32 (uint_to_fp (i32
+                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// UCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
+                         UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f64 (uint_to_fp (i32
+                    (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
+                         UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, load,
+                         UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three different-sized vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADDHN  : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+                                             int_aarch64_neon_sabd>;
+defm SABDL   : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+                                          int_aarch64_neon_sabd>;
+defm SADDL   : SIMDLongThreeVectorBHS<   0, 0b0000, "saddl",
+            BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW   : SIMDWideThreeVectorBHS<   0, 0b0001, "saddw",
+                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+                                               int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+                                               int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+                                     int_aarch64_neon_sqdmull>;
+defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW   : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+                                              int_aarch64_neon_uabd>;
+defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+                                          int_aarch64_neon_uabd>;
+defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL   : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW   : SIMDWideThreeVectorBHS<   1, 0b0011, "usubw",
+                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Patterns for 64-bit pmull
+def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
+          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
+                                  (vector_extract (v2i64 V128:$Rm), (i64 1))),
+          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
+
+// ADDHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
+          (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                           (i32 16))))),
+          (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                           (i32 32))))),
+          (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+                          (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 8))))),
+          (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+                          (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 16))))),
+          (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+                          (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 32))))),
+          (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+
+// SUBHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
+          (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                           (i32 16))))),
+          (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                           (i32 32))))),
+          (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+                          (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 8))))),
+          (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+                          (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 16))))),
+          (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+                          (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 32))))),
+          (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector instruction.
+//----------------------------------------------------------------------------
+
+defm EXT : SIMDBitwiseExtract<"ext">;
+
+def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+
+// We use EXT to handle extract_subvector to copy the upper 64-bits of a
+// 128-bit vector.
+def : Pat<(v8i8  (extract_subvector V128:$Rn, (i64 8))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
+defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
+defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
+defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
+defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
+defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX instructions
+//----------------------------------------------------------------------------
+
+defm TBL : SIMDTableLookup<    0, "tbl">;
+defm TBX : SIMDTableLookupTied<1, "tbx">;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+          (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+          (TBLv16i8One V128:$Ri, V128:$Rn)>;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
+                  (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+          (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
+                   (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+          (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY instruction
+//----------------------------------------------------------------------------
+
+defm CPY : SIMDScalarCPY<"cpy">;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+defm ADDP    : SIMDPairwiseScalarD<0, 0b11011, "addp">;
+defm FADDP   : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
+defm FMAXP   : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
+defm FMINP   : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
+def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))),
+          (ADDPv2i64p V128:$Rn)>;
+def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))),
+          (ADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
+          (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
+          (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
+          (FADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
+          (FMAXNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
+          (FMAXNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
+          (FMAXPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
+          (FMAXPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
+          (FMINNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
+          (FMINNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
+          (FMINPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
+          (FMINPv2i64p V128:$Rn)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+def DUPv8i8gpr  : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>;
+def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>;
+def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>;
+def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>;
+def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>;
+def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>;
+def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>;
+
+def DUPv2i64lane : SIMDDup64FromElement;
+def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
+def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
+def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
+def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
+def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
+def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+
+def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
+          (v2f32 (DUPv2i32lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+            (i64 0)))>;
+def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
+          (v4f32 (DUPv4i32lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+            (i64 0)))>;
+def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
+          (v2f64 (DUPv2i64lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
+            (i64 0)))>;
+
+def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+         (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
+          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
+// instruction even if the types don't match: we just have to remap the lane
+// carefully. N.b. this trick only applies to truncations.
+def VecIndex_x2 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x4 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
+}]>;
+
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
+                            ValueType Src128VT, ValueType ScalVT,
+                            Instruction DUP, SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+                                                     imm:$idx)))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+                                                     imm:$idx)))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTruncPats<v8i8,   v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8,   v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16,  v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+                               SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+                                                         imm:$idx))))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
+                                                         imm:$idx))))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,   VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane,  VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane,  VecIndex_x2>;
+
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
+               (i32 0xff)),
+          (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
+               (i32 0xffff)),
+          (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+
+defm INS : SIMDIns;
+
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+            (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                                  (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+            (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                                  (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+            (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                                  (i64 FPR64:$Rn), dsub))>;
+
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+
+def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
+            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi32lane
+              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+              VectorIndexS:$imm,
+              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+              (i64 0)),
+            dsub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
+            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (INSvi32lane
+            V128:$Rn, VectorIndexS:$imm,
+            (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+            (i64 0))>;
+def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
+            (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
+          (INSvi64lane
+            V128:$Rn, VectorIndexD:$imm,
+            (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
+            (i64 0))>;
+
+// Copy an element at a constant index in one vector into a constant indexed
+// element of another.
+// FIXME refactor to a shared class/dev parameterized on vector type, vector
+// index type and INS extension
+def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
+                   (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+                   VectorIndexB:$idx2)),
+          (v16i8 (INSvi8lane
+                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+          )>;
+def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
+                   (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+                   VectorIndexH:$idx2)),
+          (v8i16 (INSvi16lane
+                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+          )>;
+def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
+                   (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+                   VectorIndexS:$idx2)),
+          (v4i32 (INSvi32lane
+                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+          )>;
+def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
+                   (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+                   VectorIndexD:$idx2)),
+          (v2i64 (INSvi64lane
+                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+          )>;
+
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
+                                ValueType VTScal, Instruction INS> {
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd,
+                 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+                                 imm:$Immd, V128:$Rn, imm:$Immn),
+                            dsub)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG
+                (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
+                     (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
+                dsub)>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8,  i32, INSvi8lane>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi32lane>;
+
+
+// Floating point vector extractions are codegen'd as either a sequence of
+// subregister extractions, possibly fed by an INS if the lane number is
+// anything other than zero.
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+          (f64 (EXTRACT_SUBREG
+            (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
+                         V128:$Rn, VectorIndexD:$idx),
+            dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+          (f32 (EXTRACT_SUBREG
+            (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
+                         V128:$Rn, VectorIndexS:$idx),
+            ssub))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// AArch64. In the general case we need an instruction, which had just as well be
+// INS.
+class ConcatPat<ValueType DstTy, ValueType SrcTy>
+  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+def : ConcatPat<v2i64, v1i64>;
+def : ConcatPat<v2f64, v1f64>;
+def : ConcatPat<v4i32, v2i32>;
+def : ConcatPat<v4f32, v2f32>;
+def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v16i8, v8i8>;
+
+// If the high lanes are undef, though, we can just ignore them:
+class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
+  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+def : ConcatUndefPat<v2i64, v1i64>;
+def : ConcatUndefPat<v2f64, v1f64>;
+def : ConcatUndefPat<v4i32, v2i32>;
+def : ConcatUndefPat<v4f32, v2f32>;
+def : ConcatUndefPat<v8i16, v4i16>;
+def : ConcatUndefPat<v16i8, v8i8>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+defm ADDV    : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
+defm SMAXV   : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
+defm SMINV   : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
+defm UMAXV   : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
+defm UMINV   : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
+defm SADDLV  : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
+defm UADDLV  : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
+defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV   : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV   : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+
+multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          (i64 0)))>;
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
+          (i32 (SMOVvi16to32
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+           (i64 0)))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (SMOVvi16to32
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+           (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+          ssub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+  def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          ssub))>;
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          ssub))>;
+
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+            ssub))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+            ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          ssub))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+          ssub))>;
+
+}
+
+multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+          (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+           ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+          ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+        (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+          dsub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
+                                                Intrinsic intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+          ssub))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+            ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+          ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+        (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+          dsub))>;
+}
+
+defm : SIMDAcrossLanesSignedIntrinsic<"ADDV",  int_aarch64_neon_saddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
+          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV",  int_aarch64_neon_uaddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
+          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>;
+def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>;
+def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>;
+def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>;
+def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
+defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
+
+// The vaddlv_s32 intrinsic gets mapped to SADDLP.
+def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
+          (i64 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (SADDLPv2i32_v1i64 V64:$Rn), dsub),
+            dsub))>;
+// The vaddlv_u32 intrinsic gets mapped to UADDLP.
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
+          (i64 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (UADDLPv2i32_v1i64 V64:$Rn), dsub),
+            dsub))>;
+
+//------------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//------------------------------------------------------------------------------
+
+// AdvSIMD BIC
+defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
+// AdvSIMD ORR
+defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
+
+def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// AdvSIMD FMOV
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
+                                              "fmov", ".2d",
+                       [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64,  fpimm8,
+                                              "fmov", ".2s",
+                       [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
+                                              "fmov", ".4s",
+                       [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+
+// AdvSIMD MOVI
+
+// EDIT byte mask: scalar
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
+                    [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 here.
+def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
+          (MOVID imm0_255:$shift)>;
+
+def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
+
+def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
+
+// EDIT byte mask: 2d
+
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 in the pattern
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVIv2d_ns   : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
+                                                simdimmtype10,
+                                                "movi", ".2d",
+                   [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
+
+
+// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
+// Complexity is added to break a tie with a plain MOVI.
+let AddedComplexity = 1 in {
+def : Pat<(f32   fpimm0),
+          (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
+      Requires<[HasZCZ]>;
+def : Pat<(f64   fpimm0),
+          (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
+      Requires<[HasZCZ]>;
+}
+
+def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+
+def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+
+def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MOVIv2s_msl  : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
+                      [(set (v2i32 V64:$Rd),
+                            (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MOVIv4s_msl  : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
+                      [(set (v4i32 V128:$Rd),
+                            (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+// Per byte: 8b & 16b
+def MOVIv8b_ns   : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64,  imm0_255,
+                                                 "movi", ".8b",
+                       [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+def MOVIv16b_ns  : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
+                                                 "movi", ".16b",
+                       [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
+// AdvSIMD MVNI
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MVNI      : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
+
+def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MVNIv2s_msl   : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
+                      [(set (v2i32 V64:$Rd),
+                            (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
+                      [(set (v4i32 V128:$Rd),
+                            (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let neverHasSideEffects = 1 in {
+  defm FMLA  : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
+  defm FMLS  : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
+}
+
+// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
+// instruction expects the addend first, while the intrinsic expects it last.
+
+// On the other hand, there are quite a few valid combinatorial options due to
+// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+
+multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
+  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                           VectorIndexS:$idx))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (v2f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                           VectorIndexS:$idx))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (v4f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
+  // (DUPLANE from 64-bit would be trivial).
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 (fneg V128:$Rm)),
+                                           VectorIndexD:$idx))),
+            (FMLSv2i64_indexed
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 (fneg FPR64Op:$Rm))))),
+            (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v2f32 (fneg V64:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+}
+
+defm : FMLSIndexedAfterNegPatterns<
+           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm : FMLSIndexedAfterNegPatterns<
+           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+
+defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL  : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
+
+def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv2i32_indexed V64:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+            (i64 0))>;
+def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv4i32_indexed V128:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+            (i64 0))>;
+def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+          (FMULv2i64_indexed V128:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
+            (i64 0))>;
+
+defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
+              TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
+              TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MUL   : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
+defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
+                int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+                                           int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+                                           int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
+defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
+                int_aarch64_neon_umull>;
+
+// A scalar sqdmull with the second operand being a vector lane can be
+// handled directly with the indexed instruction encoding.
+def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                          (vector_extract (v4i32 V128:$Vm),
+                                                           VectorIndexS:$idx)),
+          (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
+defm SCVTF  : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
+defm UCVTF  : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
+// Codegen patterns for the above. We don't put these directly on the
+// instructions because TableGen's type inference can't handle the truth.
+// Having the same base pattern for fp <--> int totally freaks it out.
+def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+          (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
+          (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+
+defm SHL      : SIMDScalarLShiftD<   0, 0b01010, "shl", AArch64vshl>;
+defm SLI      : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
+defm SQRSHRN  : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
+                                     int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
+                                     int_aarch64_neon_sqrshrun>;
+defm SQSHLU   : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL    : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN   : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
+                                     int_aarch64_neon_sqshrn>;
+defm SQSHRUN  : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
+                                     int_aarch64_neon_sqshrun>;
+defm SRI      : SIMDScalarRShiftDTied<   1, 0b01000, "sri">;
+defm SRSHR    : SIMDScalarRShiftD<   0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA    : SIMDScalarRShiftDTied<   0, 0b00110, "srsra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64srshri node:$MHS, node:$RHS))>>;
+defm SSHR     : SIMDScalarRShiftD<   0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA     : SIMDScalarRShiftDTied<   0, 0b00010, "ssra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UQRSHRN  : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
+                                     int_aarch64_neon_uqrshrn>;
+defm UQSHL    : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN   : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
+                                     int_aarch64_neon_uqshrn>;
+defm URSHR    : SIMDScalarRShiftD<   1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA    : SIMDScalarRShiftDTied<   1, 0b00110, "ursra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64urshri node:$MHS, node:$RHS))>>;
+defm USHR     : SIMDScalarRShiftD<   1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA     : SIMDScalarRShiftDTied<   1, 0b00010, "usra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64vlshr node:$MHS, node:$RHS))>>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
+defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
+defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
+                                   int_aarch64_neon_vcvtfxs2fp>;
+defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
+                                         int_aarch64_neon_rshrn>;
+defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
+defm SHRN    : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
+                          BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
+defm SLI     : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
+def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                      (i32 vecshiftL64:$imm))),
+          (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
+defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
+                                         int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
+                                         int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL  : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN  : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
+                                         int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
+                                         int_aarch64_neon_sqshrun>;
+defm SRI     : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
+def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                      (i32 vecshiftR64:$imm))),
+          (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
+defm SRSHR   : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA   : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
+                 TriOpFrag<(add node:$LHS,
+                                (AArch64srshri node:$MHS, node:$RHS))> >;
+defm SSHLL   : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
+                BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;
+
+defm SSHR    : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA    : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
+                TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UCVTF   : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
+                        int_aarch64_neon_vcvtfxu2fp>;
+defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
+                                         int_aarch64_neon_uqrshrn>;
+defm UQSHL   : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN  : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
+                                         int_aarch64_neon_uqshrn>;
+defm URSHR   : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA   : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
+                TriOpFrag<(add node:$LHS,
+                               (AArch64urshri node:$MHS, node:$RHS))> >;
+defm USHLL   : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
+                BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
+                TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
+def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
+          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
+          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
+          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
+                                                    vecshiftR16Narrow:$imm)))),
+          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
+                                                    vecshiftR32Narrow:$imm)))),
+          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
+                                                    vecshiftR64Narrow:$imm)))),
+          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USSHLL.
+// Anyexts are implemented as zexts.
+def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+// Also match an extend from the upper half of a 128 bit source register.
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (sext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
+
+// Vector shift sxtl aliases
+def : InstAlias<"sxtl.8h $dst, $src1",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.8h, $src1.8b",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.4s $dst, $src1",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.4s, $src1.4h",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.2d $dst, $src1",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.2d, $src1.2s",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift sxtl2 aliases
+def : InstAlias<"sxtl2.8h $dst, $src1",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.4s $dst, $src1",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.2d $dst, $src1",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// Vector shift uxtl aliases
+def : InstAlias<"uxtl.8h $dst, $src1",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.8h, $src1.8b",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.4s $dst, $src1",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.4s, $src1.4h",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.2d $dst, $src1",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.2d, $src1.2s",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift uxtl2 aliases
+def : InstAlias<"uxtl2.8h $dst, $src1",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.4s $dst, $src1",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.2d $dst, $src1",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// These patterns are more complex because floating point loads do not
+// support sign extension.
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cyclces.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still being faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                               0),
+                             ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW  GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX  GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW   GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX   GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                 (f64
+                                  (EXTRACT_SUBREG
+                                    (SSHLLv4i16_shift
+                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        hsub),
+                                     0),
+                                   dsub)),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
+class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  ssub),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
+                           (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
+                           (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
+                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
+                           (LDURSi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD Load-Store Structure
+//----------------------------------------------------------------------------
+defm LD1 : SIMDLd1Multiple<"ld1">;
+defm LD2 : SIMDLd2Multiple<"ld2">;
+defm LD3 : SIMDLd3Multiple<"ld3">;
+defm LD4 : SIMDLd4Multiple<"ld4">;
+
+defm ST1 : SIMDSt1Multiple<"st1">;
+defm ST2 : SIMDSt2Multiple<"st2">;
+defm ST3 : SIMDSt3Multiple<"st3">;
+defm ST4 : SIMDSt4Multiple<"st4">;
+
+class Ld1Pat<ValueType ty, Instruction INST>
+  : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>;
+
+def : Ld1Pat<v16i8, LD1Onev16b>;
+def : Ld1Pat<v8i16, LD1Onev8h>;
+def : Ld1Pat<v4i32, LD1Onev4s>;
+def : Ld1Pat<v2i64, LD1Onev2d>;
+def : Ld1Pat<v8i8,  LD1Onev8b>;
+def : Ld1Pat<v4i16, LD1Onev4h>;
+def : Ld1Pat<v2i32, LD1Onev2s>;
+def : Ld1Pat<v1i64, LD1Onev1d>;
+
+class St1Pat<ValueType ty, Instruction INST>
+  : Pat<(store ty:$Vt, GPR64sp:$Rn),
+        (INST ty:$Vt, GPR64sp:$Rn)>;
+
+def : St1Pat<v16i8, ST1Onev16b>;
+def : St1Pat<v8i16, ST1Onev8h>;
+def : St1Pat<v4i32, ST1Onev4s>;
+def : St1Pat<v2i64, ST1Onev2d>;
+def : St1Pat<v8i8,  ST1Onev8b>;
+def : St1Pat<v4i16, ST1Onev4h>;
+def : St1Pat<v2i32, ST1Onev2s>;
+def : St1Pat<v1i64, ST1Onev1d>;
+
+//---
+// Single-element
+//---
+
+defm LD1R          : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
+defm LD2R          : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
+defm LD3R          : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
+defm LD4R          : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LD1 : SIMDLdSingleBTied<0, 0b000,       "ld1", VecListOneb,   GPR64pi1>;
+defm LD1 : SIMDLdSingleHTied<0, 0b010, 0,    "ld1", VecListOneh,   GPR64pi2>;
+defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes,   GPR64pi4>;
+defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned,   GPR64pi8>;
+defm LD2 : SIMDLdSingleBTied<1, 0b000,       "ld2", VecListTwob,   GPR64pi2>;
+defm LD2 : SIMDLdSingleHTied<1, 0b010, 0,    "ld2", VecListTwoh,   GPR64pi4>;
+defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos,   GPR64pi8>;
+defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod,   GPR64pi16>;
+defm LD3 : SIMDLdSingleBTied<0, 0b001,       "ld3", VecListThreeb, GPR64pi3>;
+defm LD3 : SIMDLdSingleHTied<0, 0b011, 0,    "ld3", VecListThreeh, GPR64pi6>;
+defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
+defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
+defm LD4 : SIMDLdSingleBTied<1, 0b001,       "ld4", VecListFourb,  GPR64pi4>;
+defm LD4 : SIMDLdSingleHTied<1, 0b011, 0,    "ld4", VecListFourh,  GPR64pi8>;
+defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours,  GPR64pi16>;
+defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd,  GPR64pi32>;
+}
+
+def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+// Grab the floating point version too
+def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne128:$Rd),
+           (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+           (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                          VecIndex:$idx, GPR64sp:$Rn),
+            dsub)>;
+
+def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
+
+
+defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
+defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
+defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
+defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
+
+// Stores
+defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 15 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+             GPR64sp:$Rn),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+
+let AddedComplexity = 15 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+             GPR64sp:$Rn),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
+
+multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                             ValueType VTy, ValueType STy, Instruction ST1,
+                             int offset> {
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, offset),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, GPR64:$Rm),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
+defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
+                        2>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+
+multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                             ValueType VTy, ValueType STy, Instruction ST1,
+                             int offset> {
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, offset),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, GPR64:$Rm),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
+                         1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
+                         2>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
+def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
+def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
+def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
+def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
+def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;
+
+def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",    int_aarch64_crypto_sha1h>;
+def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",  int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREF into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use an unsigned bitfield move
+// instruction (UBFM) on the enclosing super-reg.
+def : Pat<(i64 (zext GPR32:$src)),
+ (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+   (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)),  (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)),  (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)),  (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)),  (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a       imm0_31:$imm)),
+                              (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a        imm0_31:$imm)),
+                              (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a        imm0_63:$imm)),
+                              (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a        imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of the
+// original value which is to be sign extended. E.g. we support shifts up to
+// bitwidth-1 bits.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// To truncate, we can simply extract from a subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() uses the BRK instruction on AArch64.
+def : Pat<(trap), (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+// But because we need a consistent lane ordering, in big endian many
+// conversions require one or more REV instructions.
+//
+// Consider a simple memory load followed by a bitconvert then a store.
+//   v0 = load v2i32
+//   v1 = BITCAST v2i32 v0 to v4i16
+//        store v4i16 v2
+//
+// In big endian mode every memory access has an implicit byte swap. LDR and
+// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
+// is, they treat the vector as a sequence of elements to be byte-swapped.
+// The two pairs of instructions are fundamentally incompatible. We've decided
+// to use LD1/ST1 only to simplify compiler implementation.
+//
+// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
+// the original code sequence:
+//   v0 = load v2i32
+//   v1 = REV v2i32                  (implicit)
+//   v2 = BITCAST v2i32 v1 to v4i16
+//   v3 = REV v4i16 v2               (implicit)
+//        store v4i16 v3
+//
+// But this is now broken - the value stored is different to the value loaded
+// due to lane reordering. To fix this, on every BITCAST we must perform two
+// other REVs:
+//   v0 = load v2i32
+//   v1 = REV v2i32                  (implicit)
+//   v2 = REV v2i32
+//   v3 = BITCAST v2i32 v2 to v4i16
+//   v4 = REV v4i16
+//   v5 = REV v4i16 v4               (implicit)
+//        store v4i16 v5
+//
+// This means an extra two instructions, but actually in most cases the two REV
+// instructions can be combined into one. For example:
+//   (REV64_2s (REV64_4h X)) === (REV32_4h X)
+//
+// There is also no 128-bit REV instruction. This must be synthesized with an
+// EXT instruction.
+//
+// Most bitconverts require some sort of conversion. The only exceptions are:
+//   a) Identity conversions -  vNfX <-> vNiX
+//   b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
+//
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)),
+                 (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
+                 (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
+                 (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
+                 (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+}
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
+                             (v1i64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
+                             (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))),
+                             (v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
+                             (v1i64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
+                             (v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))),
+                             (v2i32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))),
+                             (v4i16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))), (v8i8  FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))),
+                             (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))),
+                             (v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))),
+                             (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))), (f64   FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))),
+                             (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))),
+                             (f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))),
+                             (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))),
+                             (f64 (REV64v8i8 FPR64:$src))>;
+}
+def : Pat<(f64   (bitconvert (v1i64 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v1f64 FPR64:$src))), (f64   FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
+                             (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
+                             (v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))),
+                             (v1f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
+                             (v1f64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
+                             (v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))),
+                             (v2f32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
+                            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                            (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                            (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
+                            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                            (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                                            (REV64v16i8 FPR128:$src), (i32 8)))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))),
+                             (v2f64 (EXTv16i8 FPR128:$src,
+                                              FPR128:$src, (i32 8)))>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
+                             (v2f64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
+                             (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
+                             (v2f64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
+                             (v2f64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))),
+                             (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                    (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
+                             (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
+                             (v4f32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
+                             (v4f32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
+                             (v4f32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))),
+                             (v2i64 (EXTv16i8 FPR128:$src,
+                                              FPR128:$src, (i32 8)))>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
+                             (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
+                             (v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
+                             (v2i64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
+                             (v2i64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))),
+                             (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                              (REV64v4i32 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
+                             (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
+                             (v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
+                             (v4i32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
+                             (v4i32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))),
+                             (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                              (REV64v8i16 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
+                             (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
+                             (v8i16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
+                             (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))),
+                             (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                                              (REV64v16i8 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
+                             (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
+                             (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
+                             (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
+                             (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
+                             (v16i8 (REV32v16i8 FPR128:$src))>;
+}
+
+def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+
+// A 64-bit subvector insert to the first 128-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
+// or v2f32.
+def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
+                    (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
+           (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
+def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+                     (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+           (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
+    // vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
+    // so we match on v4f32 here, not v2f32. This will also catch adding
+    // the low two lanes of a true v4f32 vector.
+def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+                (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+          (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
+
+// Scalar 64-bit shifts in FPR64 registers.
+def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+
+// Tail call return handling. These are all compiler pseudo-instructions,
+// so no encoding information or anything like that.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+  def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+  def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+include "AArch64InstrAtomics.td"
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
new file mode 100644
index 0000000..3df9c4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -0,0 +1,951 @@
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
+/// load / store instructions to form ldp / stp instructions.
+
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+          "Number of load/store from unscaled generated");
+
+static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
+                                   cl::init(20), cl::Hidden);
+
+// Place holder while testing unscaled load/store combining
+static cl::opt<bool> EnableAArch64UnscaledMemOp(
+    "aarch64-unscaled-mem-op", cl::Hidden,
+    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
+
+namespace {
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
+
+  const AArch64InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+  // Scan the instructions looking for a load/store that can be combined
+  // with the current instruction into a load/store pair.
+  // Return the matching instruction if one is found, else MBB->end().
+  // If a matching instruction is found, MergeForward is set to true if the
+  // merge is to remove the first instruction and replace the second with
+  // a pair-wise insn, and false if the reverse is true.
+  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+                                               bool &MergeForward,
+                                               unsigned Limit);
+  // Merge the two instructions indicated into a single pair-wise instruction.
+  // If MergeForward is true, erase the first instruction and fold its
+  // operation into the second. If false, the reverse. Return the instruction
+  // following the first instruction (which may change during processing).
+  MachineBasicBlock::iterator
+  mergePairedInsns(MachineBasicBlock::iterator I,
+                   MachineBasicBlock::iterator Paired, bool MergeForward);
+
+  // Scan the instruction list to find a base register update that can
+  // be combined with the current instruction (a load or store) using
+  // pre or post indexed addressing with writeback. Scan forwards.
+  MachineBasicBlock::iterator
+  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
+                                int Value);
+
+  // Scan the instruction list to find a base register update that can
+  // be combined with the current instruction (a load or store) using
+  // pre or post indexed addressing with writeback. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
+
+  // Merge a pre-index base register update into a ld/st instruction.
+  MachineBasicBlock::iterator
+  mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+                        MachineBasicBlock::iterator Update);
+
+  // Merge a post-index base register update into a ld/st instruction.
+  MachineBasicBlock::iterator
+  mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
+                         MachineBasicBlock::iterator Update);
+
+  bool optimizeBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  const char *getPassName() const override {
+    return "AArch64 load / store optimization pass";
+  }
+
+private:
+  int getMemSize(MachineInstr *MemMI);
+};
+char AArch64LoadStoreOpt::ID = 0;
+}
+
+static bool isUnscaledLdst(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::STURSi:
+    return true;
+  case AArch64::STURDi:
+    return true;
+  case AArch64::STURQi:
+    return true;
+  case AArch64::STURWi:
+    return true;
+  case AArch64::STURXi:
+    return true;
+  case AArch64::LDURSi:
+    return true;
+  case AArch64::LDURDi:
+    return true;
+  case AArch64::LDURQi:
+    return true;
+  case AArch64::LDURWi:
+    return true;
+  case AArch64::LDURXi:
+    return true;
+  }
+}
+
+// Size in bytes of the data moved by an unscaled load or store
+int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
+  switch (MemMI->getOpcode()) {
+  default:
+    llvm_unreachable("Opcode has unknown size!");
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+    return 4;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+    return 8;
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return 16;
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return 4;
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    return 8;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return 4;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return 8;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return 16;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return 4;
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return 8;
+  }
+}
+
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no pairwise equivalent!");
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+    return AArch64::STPSi;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+    return AArch64::STPDi;
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return AArch64::STPQi;
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return AArch64::STPWi;
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    return AArch64::STPXi;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return AArch64::LDPSi;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return AArch64::LDPDi;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return AArch64::LDPQi;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return AArch64::LDPWi;
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return AArch64::LDPXi;
+  }
+}
+
+static unsigned getPreIndexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no pre-indexed equivalent!");
+  case AArch64::STRSui:
+    return AArch64::STRSpre;
+  case AArch64::STRDui:
+    return AArch64::STRDpre;
+  case AArch64::STRQui:
+    return AArch64::STRQpre;
+  case AArch64::STRWui:
+    return AArch64::STRWpre;
+  case AArch64::STRXui:
+    return AArch64::STRXpre;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpre;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpre;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpre;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpre;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpre;
+  }
+}
+
+static unsigned getPostIndexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
+  case AArch64::STRSui:
+    return AArch64::STRSpost;
+  case AArch64::STRDui:
+    return AArch64::STRDpost;
+  case AArch64::STRQui:
+    return AArch64::STRQpost;
+  case AArch64::STRWui:
+    return AArch64::STRWpost;
+  case AArch64::STRXui:
+    return AArch64::STRXpost;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpost;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpost;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpost;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpost;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpost;
+  }
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator Paired,
+                                      bool MergeForward) {
+  MachineBasicBlock::iterator NextI = I;
+  ++NextI;
+  // If NextI is the second of the two instructions to be merged, we need
+  // to skip one further. Either way we merge will invalidate the iterator,
+  // and we don't need to scan the new instruction, as it's a pairwise
+  // instruction, which we're not considering for further action anyway.
+  if (NextI == Paired)
+    ++NextI;
+
+  bool IsUnscaled = isUnscaledLdst(I->getOpcode());
+  int OffsetStride =
+      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;
+
+  unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
+  // Insert our new paired instruction after whichever of the paired
+  // instructions MergeForward indicates.
+  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+  // Also based on MergeForward is from where we copy the base register operand
+  // so we get the flags compatible with the input code.
+  MachineOperand &BaseRegOp =
+      MergeForward ? Paired->getOperand(1) : I->getOperand(1);
+
+  // Which register is Rt and which is Rt2 depends on the offset order.
+  MachineInstr *RtMI, *Rt2MI;
+  if (I->getOperand(2).getImm() ==
+      Paired->getOperand(2).getImm() + OffsetStride) {
+    RtMI = Paired;
+    Rt2MI = I;
+  } else {
+    RtMI = I;
+    Rt2MI = Paired;
+  }
+  // Handle Unscaled
+  int OffsetImm = RtMI->getOperand(2).getImm();
+  if (IsUnscaled && EnableAArch64UnscaledMemOp)
+    OffsetImm /= OffsetStride;
+
+  // Construct the new instruction.
+  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
+                                    I->getDebugLoc(), TII->get(NewOpc))
+                                .addOperand(RtMI->getOperand(0))
+                                .addOperand(Rt2MI->getOperand(0))
+                                .addOperand(BaseRegOp)
+                                .addImm(OffsetImm);
+  (void)MIB;
+
+  // FIXME: Do we need/want to copy the mem operands from the source
+  //        instructions? Probably. What uses them after this?
+
+  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Paired->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  return NextI;
+}
+
+/// trackRegDefsUses - Remember what registers the specified instruction uses
+/// and modifies.
+static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+                             BitVector &UsedRegs,
+                             const TargetRegisterInfo *TRI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegMask())
+      ModifiedRegs.setBitsNotInMask(MO.getRegMask());
+
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (MO.isDef()) {
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        ModifiedRegs.set(*AI);
+    } else {
+      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        UsedRegs.set(*AI);
+    }
+  }
+}
+
+static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
+  if (!IsUnscaled && (Offset > 63 || Offset < -64))
+    return false;
+  if (IsUnscaled) {
+    // Convert the byte-offset used by unscaled into an "element" offset used
+    // by the scaled pair load/store instructions.
+    int ElemOffset = Offset / OffsetStride;
+    if (ElemOffset > 63 || ElemOffset < -64)
+      return false;
+  }
+  return true;
+}
+
+// Do alignment, specialized to power of 2 and for signed ints,
+// avoiding having to do a C-style cast from uint_64t to int when
+// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
+// FIXME: Move this function to include/MathExtras.h?
+static int alignTo(int Num, int PowOf2) {
+  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
+}
+
+/// findMatchingInsn - Scan the instructions looking for a load/store that can
+/// be combined with the current instruction into a load/store pair.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+                                      bool &MergeForward, unsigned Limit) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr *FirstMI = I;
+  ++MBBI;
+
+  int Opc = FirstMI->getOpcode();
+  bool MayLoad = FirstMI->mayLoad();
+  bool IsUnscaled = isUnscaledLdst(Opc);
+  unsigned Reg = FirstMI->getOperand(0).getReg();
+  unsigned BaseReg = FirstMI->getOperand(1).getReg();
+  int Offset = FirstMI->getOperand(2).getImm();
+
+  // Early exit if the first instruction modifies the base register.
+  // e.g., ldr x0, [x0]
+  // Early exit if the offset if not possible to match. (6 bits of positive
+  // range, plus allow an extra one in case we find a later insn that matches
+  // with Offset-1
+  if (FirstMI->modifiesRegister(BaseReg, TRI))
+    return E;
+  int OffsetStride =
+      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
+  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+    return E;
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+    MachineInstr *MI = MBBI;
+    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+    // optimization by changing how far we scan.
+    if (MI->isDebugValue())
+      continue;
+
+    // Now that we know this is a real instruction, count it.
+    ++Count;
+
+    if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
+      // If we've found another instruction with the same opcode, check to see
+      // if the base and offset are compatible with our starting instruction.
+      // These instructions all have scaled immediate operands, so we just
+      // check for +1/-1. Make sure to check the new instruction offset is
+      // actually an immediate and not a symbolic reference destined for
+      // a relocation.
+      //
+      // Pairwise instructions have a 7-bit signed offset field. Single insns
+      // have a 12-bit unsigned offset field. To be a valid combine, the
+      // final offset must be in range.
+      unsigned MIBaseReg = MI->getOperand(1).getReg();
+      int MIOffset = MI->getOperand(2).getImm();
+      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+                                   (Offset + OffsetStride == MIOffset))) {
+        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+        // If this is a volatile load/store that otherwise matched, stop looking
+        // as something is going on that we don't have enough information to
+        // safely transform. Similarly, stop if we see a hint to avoid pairs.
+        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+          return E;
+        // If the resultant immediate offset of merging these instructions
+        // is out of range for a pairwise instruction, bail and keep looking.
+        bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
+        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+        // If the alignment requirements of the paired (scaled) instruction
+        // can't express the offset of the unscaled input, bail and keep
+        // looking.
+        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
+            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+        // If the destination register of the loads is the same register, bail
+        // and keep looking. A load-pair instruction with both destination
+        // registers the same is UNPREDICTABLE and will result in an exception.
+        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+
+        // If the Rt of the second instruction was not modified or used between
+        // the two instructions, we can combine the second into the first.
+        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
+            !UsedRegs[MI->getOperand(0).getReg()]) {
+          MergeForward = false;
+          return MBBI;
+        }
+
+        // Likewise, if the Rt of the first instruction is not modified or used
+        // between the two instructions, we can combine the first into the
+        // second.
+        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
+            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
+          MergeForward = true;
+          return MBBI;
+        }
+        // Unable to combine these instructions due to interference in between.
+        // Keep looking.
+      }
+    }
+
+    // If the instruction wasn't a matching load or store, but does (or can)
+    // modify memory, stop searching, as we don't have alias analysis or
+    // anything like that to tell us whether the access is tromping on the
+    // locations we care about. The big one we want to catch is calls.
+    //
+    // FIXME: Theoretically, we can do better than that for SP and FP based
+    // references since we can effectively know where those are touching. It's
+    // unclear if it's worth the extra code, though. Most paired instructions
+    // will be sequential, perhaps with a few intervening non-memory related
+    // instructions.
+    if (MI->mayStore() || MI->isCall())
+      return E;
+    // Likewise, if we're matching a store instruction, we don't want to
+    // move across a load, as it may be reading the same location.
+    if (FirstMI->mayStore() && MI->mayLoad())
+      return E;
+
+    // Update modified / uses register lists.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg])
+      return E;
+  }
+  return E;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+                                           MachineBasicBlock::iterator Update) {
+  assert((Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) &&
+         "Unexpected base register update instruction to merge!");
+  MachineBasicBlock::iterator NextI = I;
+  // Return the instruction following the merged instruction, which is
+  // the instruction following our unmerged load. Unless that's the add/sub
+  // instruction we're merging, in which case it's the one after that.
+  if (++NextI == Update)
+    ++NextI;
+
+  int Value = Update->getOperand(2).getImm();
+  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+         "Can't merge 1 << 12 offset into pre-indexed load / store");
+  if (Update->getOpcode() == AArch64::SUBXri)
+    Value = -Value;
+
+  unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
+  MachineInstrBuilder MIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(Update->getOperand(0))
+          .addOperand(I->getOperand(0))
+          .addOperand(I->getOperand(1))
+          .addImm(Value);
+  (void)MIB;
+
+  DEBUG(dbgs() << "Creating pre-indexed load/store.");
+  DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Update->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
+    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
+  assert((Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) &&
+         "Unexpected base register update instruction to merge!");
+  MachineBasicBlock::iterator NextI = I;
+  // Return the instruction following the merged instruction, which is
+  // the instruction following our unmerged load. Unless that's the add/sub
+  // instruction we're merging, in which case it's the one after that.
+  if (++NextI == Update)
+    ++NextI;
+
+  int Value = Update->getOperand(2).getImm();
+  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+         "Can't merge 1 << 12 offset into post-indexed load / store");
+  if (Update->getOpcode() == AArch64::SUBXri)
+    Value = -Value;
+
+  unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
+  MachineInstrBuilder MIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(Update->getOperand(0))
+          .addOperand(I->getOperand(0))
+          .addOperand(I->getOperand(1))
+          .addImm(Value);
+  (void)MIB;
+
+  DEBUG(dbgs() << "Creating post-indexed load/store.");
+  DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Update->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
+static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
+                                 int Offset) {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::SUBXri:
+    // Negate the offset for a SUB instruction.
+    Offset *= -1;
+  // FALLTHROUGH
+  case AArch64::ADDXri:
+    // Make sure it's a vanilla immediate operand, not a relocation or
+    // anything else we can't handle.
+    if (!MI->getOperand(2).isImm())
+      break;
+    // Watch out for 1 << 12 shifted value.
+    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
+      break;
+    // If the instruction has the base register as source and dest and the
+    // immediate will fit in a signed 9-bit integer, then we have a match.
+    if (MI->getOperand(0).getReg() == BaseReg &&
+        MI->getOperand(1).getReg() == BaseReg &&
+        MI->getOperand(2).getImm() <= 255 &&
+        MI->getOperand(2).getImm() >= -256) {
+      // If we have a non-zero Offset, we check that it matches the amount
+      // we're adding to the register.
+      if (!Offset || Offset == MI->getOperand(2).getImm())
+        return true;
+    }
+    break;
+  }
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
+    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr *MemMI = I;
+  MachineBasicBlock::iterator MBBI = I;
+  const MachineFunction &MF = *MemMI->getParent()->getParent();
+
+  unsigned DestReg = MemMI->getOperand(0).getReg();
+  unsigned BaseReg = MemMI->getOperand(1).getReg();
+  int Offset = MemMI->getOperand(2).getImm() *
+               TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+
+  // If the base register overlaps the destination register, we can't
+  // merge the update.
+  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+    return E;
+
+  // Scan forward looking for post-index opportunities.
+  // Updating instructions can't be formed if the memory insn already
+  // has an offset other than the value we're looking for.
+  if (Offset != Value)
+    return E;
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+  ++MBBI;
+  for (unsigned Count = 0; MBBI != E; ++MBBI) {
+    MachineInstr *MI = MBBI;
+    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+    // optimization by changing how far we scan.
+    if (MI->isDebugValue())
+      continue;
+
+    // Now that we know this is a real instruction, count it.
+    ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingUpdateInsn(MI, BaseReg, Value))
+      return MBBI;
+
+    // Update the status of what the instruction clobbered and used.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is used or modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+      return E;
+  }
+  return E;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
+    MachineBasicBlock::iterator I, unsigned Limit) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr *MemMI = I;
+  MachineBasicBlock::iterator MBBI = I;
+  const MachineFunction &MF = *MemMI->getParent()->getParent();
+
+  unsigned DestReg = MemMI->getOperand(0).getReg();
+  unsigned BaseReg = MemMI->getOperand(1).getReg();
+  int Offset = MemMI->getOperand(2).getImm();
+  unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+
+  // If the load/store is the first instruction in the block, there's obviously
+  // not any matching update. Ditto if the memory offset isn't zero.
+  if (MBBI == B || Offset != 0)
+    return E;
+  // If the base register overlaps the destination register, we can't
+  // merge the update.
+  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+    return E;
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+  --MBBI;
+  for (unsigned Count = 0; MBBI != B; --MBBI) {
+    MachineInstr *MI = MBBI;
+    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+    // optimization by changing how far we scan.
+    if (MI->isDebugValue())
+      continue;
+
+    // Now that we know this is a real instruction, count it.
+    ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
+      return MBBI;
+
+    // Update the status of what the instruction clobbered and used.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is used or modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+      return E;
+  }
+  return E;
+}
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+  // Two tranformations to do here:
+  // 1) Find loads and stores that can be merged into a single load or store
+  //    pair instruction.
+  //      e.g.,
+  //        ldr x0, [x2]
+  //        ldr x1, [x2, #8]
+  //        ; becomes
+  //        ldp x0, x1, [x2]
+  // 2) Find base register updates that can be merged into the load or store
+  //    as a base-reg writeback.
+  //      e.g.,
+  //        ldr x0, [x2]
+  //        add x2, x2, #4
+  //        ; becomes
+  //        ldr x0, [x2], #4
+
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    case AArch64::STRSui:
+    case AArch64::STRDui:
+    case AArch64::STRQui:
+    case AArch64::STRXui:
+    case AArch64::STRWui:
+    case AArch64::LDRSui:
+    case AArch64::LDRDui:
+    case AArch64::LDRQui:
+    case AArch64::LDRXui:
+    case AArch64::LDRWui:
+    // do the unscaled versions as well
+    case AArch64::STURSi:
+    case AArch64::STURDi:
+    case AArch64::STURQi:
+    case AArch64::STURWi:
+    case AArch64::STURXi:
+    case AArch64::LDURSi:
+    case AArch64::LDURDi:
+    case AArch64::LDURQi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi: {
+      // If this is a volatile load/store, don't mess with it.
+      if (MI->hasOrderedMemoryRef()) {
+        ++MBBI;
+        break;
+      }
+      // Make sure this is a reg+imm (as opposed to an address reloc).
+      if (!MI->getOperand(2).isImm()) {
+        ++MBBI;
+        break;
+      }
+      // Check if this load/store has a hint to avoid pair formation.
+      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+      if (TII->isLdStPairSuppressed(MI)) {
+        ++MBBI;
+        break;
+      }
+      // Look ahead up to ScanLimit instructions for a pairable instruction.
+      bool MergeForward = false;
+      MachineBasicBlock::iterator Paired =
+          findMatchingInsn(MBBI, MergeForward, ScanLimit);
+      if (Paired != E) {
+        // Merge the loads into a pair. Keeping the iterator straight is a
+        // pain, so we let the merge routine tell us what the next instruction
+        // is after it's done mucking about.
+        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);
+
+        Modified = true;
+        ++NumPairCreated;
+        if (isUnscaledLdst(MI->getOpcode()))
+          ++NumUnscaledPairCreated;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+      // FIXME: Do the other instructions.
+    }
+  }
+
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    // Do update merging. It's simpler to keep this separate from the above
+    // switch, though not strictly necessary.
+    int Opc = MI->getOpcode();
+    switch (Opc) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    case AArch64::STRSui:
+    case AArch64::STRDui:
+    case AArch64::STRQui:
+    case AArch64::STRXui:
+    case AArch64::STRWui:
+    case AArch64::LDRSui:
+    case AArch64::LDRDui:
+    case AArch64::LDRQui:
+    case AArch64::LDRXui:
+    case AArch64::LDRWui:
+    // do the unscaled versions as well
+    case AArch64::STURSi:
+    case AArch64::STURDi:
+    case AArch64::STURQi:
+    case AArch64::STURWi:
+    case AArch64::STURXi:
+    case AArch64::LDURSi:
+    case AArch64::LDURDi:
+    case AArch64::LDURQi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi: {
+      // Make sure this is a reg+imm (as opposed to an address reloc).
+      if (!MI->getOperand(2).isImm()) {
+        ++MBBI;
+        break;
+      }
+      // Look ahead up to ScanLimit instructions for a mergable instruction.
+      MachineBasicBlock::iterator Update =
+          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergePostIdxUpdateInsn(MBBI, Update);
+        Modified = true;
+        ++NumPostFolded;
+        break;
+      }
+      // Don't know how to handle pre/post-index versions, so move to the next
+      // instruction.
+      if (isUnscaledLdst(Opc)) {
+        ++MBBI;
+        break;
+      }
+
+      // Look back to try to find a pre-index instruction. For example,
+      // add x0, x0, #8
+      // ldr x1, [x0]
+      //   merged into:
+      // ldr x1, [x0, #8]!
+      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+        Modified = true;
+        ++NumPreFolded;
+        break;
+      }
+
+      // Look forward to try to find a post-index instruction. For example,
+      // ldr x1, [x0, #64]
+      // add x0, x0, #64
+      //   merged into:
+      // ldr x1, [x0, #64]!
+
+      // The immediate in the load/store is scaled by the size of the register
+      // being loaded. The immediate in the add we're looking for,
+      // however, is not, so adjust here.
+      int Value = MI->getOperand(2).getImm() *
+                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
+                      ->getSize();
+      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+        Modified = true;
+        ++NumPreFolded;
+        break;
+      }
+
+      // Nothing found. Just move to the next instruction.
+      ++MBBI;
+      break;
+    }
+      // FIXME: Do the other instructions.
+    }
+  }
+
+  return Modified;
+}
+
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : Fn)
+    Modified |= optimizeBlock(MBB);
+
+  return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
+// loads and stores near one another?
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+  return new AArch64LoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 0000000..75a17b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,202 @@
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCInstLower.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
+                                       AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
+
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                                       MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+  if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_GOT on GV operand");
+  } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
+  } else {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_PAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_PAGEOFF;
+  }
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(
+        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
+                                                    MCSymbol *Sym) const {
+  uint32_t RefFlags = 0;
+
+  if (MO.getTargetFlags() & AArch64II::MO_GOT)
+    RefFlags |= AArch64MCExpr::VK_GOT;
+  else if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+    TLSModel::Model Model;
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      Model = Printer.TM.getTLSModel(GV);
+    } else {
+      assert(MO.isSymbol() &&
+             StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
+             "unexpected external TLS symbol");
+      Model = TLSModel::GeneralDynamic;
+    }
+    switch (Model) {
+    case TLSModel::InitialExec:
+      RefFlags |= AArch64MCExpr::VK_GOTTPREL;
+      break;
+    case TLSModel::LocalExec:
+      RefFlags |= AArch64MCExpr::VK_TPREL;
+      break;
+    case TLSModel::LocalDynamic:
+      RefFlags |= AArch64MCExpr::VK_DTPREL;
+      break;
+    case TLSModel::GeneralDynamic:
+      RefFlags |= AArch64MCExpr::VK_TLSDESC;
+      break;
+    }
+  } else {
+    // No modifier means this is a generic reference, classified as absolute for
+    // the cases where it matters (:abs_g0: etc).
+    RefFlags |= AArch64MCExpr::VK_ABS;
+  }
+
+  if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+    RefFlags |= AArch64MCExpr::VK_PAGE;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+           AArch64II::MO_PAGEOFF)
+    RefFlags |= AArch64MCExpr::VK_PAGEOFF;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3)
+    RefFlags |= AArch64MCExpr::VK_G3;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2)
+    RefFlags |= AArch64MCExpr::VK_G2;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1)
+    RefFlags |= AArch64MCExpr::VK_G1;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0)
+    RefFlags |= AArch64MCExpr::VK_G0;
+
+  if (MO.getTargetFlags() & AArch64II::MO_NC)
+    RefFlags |= AArch64MCExpr::VK_NC;
+
+  const MCExpr *Expr =
+      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(
+        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+
+  AArch64MCExpr::VariantKind RefKind;
+  RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags);
+  Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx);
+
+  return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                                 MCSymbol *Sym) const {
+  if (TargetTriple.isOSDarwin())
+    return lowerSymbolOperandDarwin(MO, Sym);
+
+  assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target");
+  return lowerSymbolOperandELF(MO, Sym);
+}
+
+bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+                                      MCOperand &MCOp) const {
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit())
+      return false;
+    MCOp = MCOperand::CreateReg(MO.getReg());
+    break;
+  case MachineOperand::MO_RegisterMask:
+    // Regmasks are like implicit defs.
+    return false;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(
+        MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = LowerSymbolOperand(
+        MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  }
+  return true;
+}
+
+void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MCOperand MCOp;
+    if (lowerOperand(MI->getOperand(i), MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
new file mode 100644
index 0000000..ba50ba9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
@@ -0,0 +1,52 @@
+//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64_MCINSTLOWER_H
+#define AArch64_MCINSTLOWER_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCAsmInfo;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+/// AArch64MCInstLower - This class is used to lower an MachineInstr
+/// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+  Triple TargetTriple;
+
+public:
+  AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                     MCSymbol *Sym) const;
+  MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
+                                  MCSymbol *Sym) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
new file mode 100644
index 0000000..7c257ba
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -0,0 +1,163 @@
+//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AArch64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64MACHINEFUNCTIONINFO_H
+#define AArch64MACHINEFUNCTIONINFO_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+
+namespace llvm {
+
+/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private AArch64-specific information for each MachineFunction.
+class AArch64FunctionInfo : public MachineFunctionInfo {
+
+  /// Number of bytes of arguments this function has on the stack. If the callee
+  /// is expected to restore the argument stack this should be a multiple of 16,
+  /// all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8-bytes of stack-argument
+  /// space to a function with 16-bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea;
+
+  /// The number of bytes to restore to deallocate space for incoming
+  /// arguments. Canonically 0 in the C calling convention, but non-zero when
+  /// callee is expected to pop the args.
+  unsigned ArgumentStackToRestore;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
+
+  /// \brief Amount of stack frame size, not including callee-saved registers.
+  unsigned LocalStackSize;
+
+  /// \brief Number of TLS accesses using the special (combinable)
+  /// _TLS_MODULE_BASE_ symbol.
+  unsigned NumLocalDynamicTLSAccesses;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed on the
+  /// stack.
+  int VarArgsStackIndex;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// general purpose registers.
+  int VarArgsGPRIndex;
+
+  /// \brief Size of the varargs area for arguments passed in general purpose
+  /// registers.
+  unsigned VarArgsGPRSize;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// floating-point registers.
+  int VarArgsFPRIndex;
+
+  /// \brief Size of the varargs area for arguments passed in floating-point
+  /// registers.
+  unsigned VarArgsFPRSize;
+
+public:
+  AArch64FunctionInfo()
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+
+  explicit AArch64FunctionInfo(MachineFunction &MF)
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+    (void)MF;
+  }
+
+  unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+  void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
+
+  unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+  void setArgumentStackToRestore(unsigned bytes) {
+    ArgumentStackToRestore = bytes;
+  }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
+  unsigned getLocalStackSize() const { return LocalStackSize; }
+
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
+  unsigned getNumLocalDynamicTLSAccesses() const {
+    return NumLocalDynamicTLSAccesses;
+  }
+
+  int getVarArgsStackIndex() const { return VarArgsStackIndex; }
+  void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
+
+  int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
+  void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
+
+  unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
+  void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
+
+  int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
+  void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
+
+  unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
+  void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
+
+  typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
+
+  const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
+
+  // Shortcuts for LOH related types.
+  class MILOHDirective {
+    MCLOHType Kind;
+
+    /// Arguments of this directive. Order matters.
+    SmallVector<const MachineInstr *, 3> Args;
+
+  public:
+    typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
+
+    MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
+        : Kind(Kind), Args(Args.begin(), Args.end()) {
+      assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
+    }
+
+    MCLOHType getKind() const { return Kind; }
+    const LOHArgs &getArgs() const { return Args; }
+  };
+
+  typedef MILOHDirective::LOHArgs MILOHArgs;
+  typedef SmallVector<MILOHDirective, 32> MILOHContainer;
+
+  const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
+
+  /// Add a LOH directive of this @p Kind and this @p Args.
+  void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
+    LOHContainerSet.push_back(MILOHDirective(Kind, Args));
+    LOHRelated.insert(Args.begin(), Args.end());
+  }
+
+private:
+  // Hold the lists of LOHs.
+  MILOHContainer LOHContainerSet;
+  SetOfInstructions LOHRelated;
+};
+} // End llvm namespace
+
+#endif // AArch64MACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
new file mode 100644
index 0000000..b22fa24
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using AdvSIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
new file mode 100644
index 0000000..4723cc4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -0,0 +1,578 @@
+//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64PromoteConstant pass which promotes constants
+// to global variables when this is likely to be more efficient. Currently only
+// types related to constant vector (i.e., constant vector, array of constant
+// vectors, constant structure with a constant vector field, etc.) are promoted
+// to global variables. Constant vectors are likely to be lowered in target
+// constant pool during instruction selection already; therefore, the access
+// will remain the same (memory load), but the structure types are not split
+// into different constant pool accesses for each field. A bonus side effect is
+// that created globals may be merged by the global merge pass.
+//
+// FIXME: This pass may be useful for other targets too.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+                            cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+//                       AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constant into global variables.
+/// The motivating example is:
+/// static const uint16_t TableA[32] = {
+///   41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+///   31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+///   25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+///   21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+/// };
+///
+/// uint8x16x4_t LoadStatic(void) {
+///   uint8x16x4_t ret;
+///   ret.val[0] = vld1q_u16(TableA +  0);
+///   ret.val[1] = vld1q_u16(TableA +  8);
+///   ret.val[2] = vld1q_u16(TableA + 16);
+///   ret.val[3] = vld1q_u16(TableA + 24);
+///   return ret;
+/// }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is vector the cheapest way to create them is to load them
+/// for the memory.
+///
+/// Therefore the final assembly final has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+  static char ID;
+  AArch64PromoteConstant() : ModulePass(ID) {}
+
+  const char *getPassName() const override { return "AArch64 Promote Constant"; }
+
+  /// Iterate over the functions and promote the interesting constants into
+  /// global variables with module scope.
+  bool runOnModule(Module &M) override {
+    DEBUG(dbgs() << getPassName() << '\n');
+    bool Changed = false;
+    for (auto &MF : M) {
+      Changed |= runOnFunction(MF);
+    }
+    return Changed;
+  }
+
+private:
+  /// Look for interesting constants used within the given function.
+  /// Promote them into global variables, load these global variables within
+  /// the related function, so that the number of inserted load is minimal.
+  bool runOnFunction(Function &F);
+
+  // This transformation requires dominator info
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  /// Type to store a list of User.
+  typedef SmallVector<Value::user_iterator, 4> Users;
+  /// Map an insertion point to all the uses it dominates.
+  typedef DenseMap<Instruction *, Users> InsertionPoints;
+  /// Map a function to the required insertion point of load for a
+  /// global variable.
+  typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
+
+  /// Find the closest point that dominates the given Use.
+  Instruction *findInsertionPoint(Value::user_iterator &Use);
+
+  /// Check if the given insertion point is dominated by an existing
+  /// insertion point.
+  /// If true, the given use is added to the list of dominated uses for
+  /// the related existing point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instruction in InsertPts belong to the same function
+  /// \return true if one of the insertion point in InsertPts dominates NewPt,
+  ///         false otherwise
+  bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Check if the given insertion point can be merged with an existing
+  /// insertion point in a common dominator.
+  /// If true, the given use is added to the list of the created insertion
+  /// point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instruction in InsertPts belong to the same function
+  /// \pre isDominated returns false for the exact same parameters.
+  /// \return true if it exists an insertion point in InsertPts that could
+  ///         have been merged with NewPt in a common dominator,
+  ///         false otherwise
+  bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Compute the minimal insertion points to dominates all the interesting
+  /// uses of value.
+  /// Insertion points are group per function and each insertion point
+  /// contains a list of all the uses it dominates within the related function
+  /// \param Val constant to be examined
+  /// \param[out] InsPtsPerFunc output storage of the analysis
+  void computeInsertionPoints(Constant *Val,
+                              InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Insert a definition of a new global variable at each point contained in
+  /// InsPtsPerFunc and update the related uses (also contained in
+  /// InsPtsPerFunc).
+  bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val and insert a definition of a new global variable
+  /// at these points.
+  /// Also update the uses of Val accordingly.
+  /// Currently a use of Val is considered interesting if:
+  /// - Val is not UndefValue
+  /// - Val is not zeroinitialized
+  /// - Replacing Val per a load of a global variable is valid.
+  /// \see shouldConvert for more details
+  bool computeAndInsertDefinitions(Constant *Val);
+
+  /// Promote the given constant into a global variable if it is expected to
+  /// be profitable.
+  /// \return true if Cst has been promoted
+  bool promoteConstant(Constant *Cst);
+
+  /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+  /// Append UseIt to this list and delete the entry of IPI in InsertPts.
+  static void appendAndTransferDominatedUses(Instruction *NewPt,
+                                             Value::user_iterator &UseIt,
+                                             InsertionPoints::iterator &IPI,
+                                             InsertionPoints &InsertPts) {
+    // Record the dominated use.
+    IPI->second.push_back(UseIt);
+    // Transfer the dominated uses of IPI to NewPt
+    // Inserting into the DenseMap may invalidate existing iterator.
+    // Keep a copy of the key to find the iterator to erase.
+    Instruction *OldInstr = IPI->first;
+    InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+    // Erase IPI.
+    IPI = InsertPts.find(OldInstr);
+    InsertPts.erase(IPI);
+  }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+namespace llvm {
+void initializeAArch64PromoteConstantPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+                      "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+                    "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+  return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+  if (CstTy->isVectorTy())
+    return true;
+  if (CstTy->isStructTy()) {
+    for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+         EltIdx < EndEltIdx; ++EltIdx)
+      if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+        return true;
+  } else if (CstTy->isArrayTy())
+    return isConstantUsingVectorTy(CstTy->getArrayElementType());
+  return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+                             unsigned OpIdx) {
+  // shufflevector instruction expects a const for the mask argument, i.e., the
+  // third argument. Do not promote this use in that case.
+  if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
+    return false;
+
+  if (isa<const AllocaInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<const LoadInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<const StoreInst>(Instr) && OpIdx > 1)
+    return false;
+
+  // Index must be constant.
+  if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Personality function and filters must be constant.
+  // Give up on that instruction.
+  if (isa<const LandingPadInst>(Instr))
+    return false;
+
+  // Switch instruction expects constants to compare to.
+  if (isa<const SwitchInst>(Instr))
+    return false;
+
+  // Expected address must be a constant.
+  if (isa<const IndirectBrInst>(Instr))
+    return false;
+
+  // Do not mess with intrinsics.
+  if (isa<const IntrinsicInst>(Instr))
+    return false;
+
+  // Do not mess with inline asm.
+  const CallInst *CI = dyn_cast<const CallInst>(Instr);
+  if (CI && isa<const InlineAsm>(CI->getCalledValue()))
+    return false;
+
+  return true;
+}
+
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector related types.
+/// Also we give up on all simple vector type to keep the existing
+/// behavior. Otherwise, we should push here all the check of the lowering of
+/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
+/// constant via global merge and the fact that the same constant is stored
+/// only once with this method (versus, as many function that uses the constant
+/// for the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvert(const Constant *Cst) {
+  if (isa<const UndefValue>(Cst))
+    return false;
+
+  // FIXME: In some cases, it may be interesting to promote in memory
+  // a zero initialized constant.
+  // E.g., when the type of Cst require more instructions than the
+  // adrp/add/load sequence or when this sequence can be shared by several
+  // instances of Cst.
+  // Ideally, we could promote this into a global and rematerialize the constant
+  // when it was a bad idea.
+  if (Cst->isZeroValue())
+    return false;
+
+  if (Stress)
+    return true;
+
+  // FIXME: see function \todo
+  if (Cst->getType()->isVectorTy())
+    return false;
+  return isConstantUsingVectorTy(Cst->getType());
+}
+
+Instruction *
+AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+  // If this user is a phi, the insertion point is in the related
+  // incoming basic block.
+  PHINode *PhiInst = dyn_cast<PHINode>(*Use);
+  Instruction *InsertionPoint;
+  if (PhiInst)
+    InsertionPoint =
+        PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+  else
+    InsertionPoint = dyn_cast<Instruction>(*Use);
+  assert(InsertionPoint && "User is not an instruction!");
+  return InsertionPoint;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+
+  // Traverse all the existing insertion points and check if one is dominating
+  // NewPt. If it is, remember that.
+  for (auto &IPI : InsertPts) {
+    if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+        // When IPI.first is a terminator instruction, DT may think that
+        // the result is defined on the edge.
+        // Here we are testing the insertion point, not the definition.
+        (IPI.first->getParent() != NewPt->getParent() &&
+         DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+      // No need to insert this point. Just record the dominated use.
+      DEBUG(dbgs() << "Insertion point dominated by:\n");
+      DEBUG(IPI.first->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      IPI.second.push_back(UseIt);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+  BasicBlock *NewBB = NewPt->getParent();
+
+  // Traverse all the existing insertion point and check if one is dominated by
+  // NewPt and thus useless or can be combined with NewPt into a common
+  // dominator.
+  for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                 EndIPI = InsertPts.end();
+       IPI != EndIPI; ++IPI) {
+    BasicBlock *CurBB = IPI->first->getParent();
+    if (NewBB == CurBB) {
+      // Instructions are in the same block.
+      // By construction, NewPt is dominating the other.
+      // Indeed, isDominated returned false with the exact same arguments.
+      DEBUG(dbgs() << "Merge insertion point with:\n");
+      DEBUG(IPI->first->print(dbgs()));
+      DEBUG(dbgs() << "\nat considered insertion point.\n");
+      appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+      return true;
+    }
+
+    // Look for a common dominator
+    BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+    // If none exists, we cannot merge these two points.
+    if (!CommonDominator)
+      continue;
+
+    if (CommonDominator != NewBB) {
+      // By construction, the CommonDominator cannot be CurBB.
+      assert(CommonDominator != CurBB &&
+             "Instruction has not been rejected during isDominated check!");
+      // Take the last instruction of the CommonDominator as insertion point
+      NewPt = CommonDominator->getTerminator();
+    }
+    // else, CommonDominator is the block of NewBB, hence NewBB is the last
+    // possible insertion point in that block.
+    DEBUG(dbgs() << "Merge insertion point with:\n");
+    DEBUG(IPI->first->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    DEBUG(NewPt->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+    return true;
+  }
+  return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoints(
+    Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
+  DEBUG(dbgs() << "** Compute insertion points **\n");
+  for (Value::user_iterator UseIt = Val->user_begin(),
+                            EndUseIt = Val->user_end();
+       UseIt != EndUseIt; ++UseIt) {
+    // If the user is not an Instruction, we cannot modify it.
+    if (!isa<Instruction>(*UseIt))
+      continue;
+
+    // Filter out uses that should not be converted.
+    if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+      continue;
+
+    DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
+    DEBUG((*UseIt)->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    Instruction *InsertionPoint = findInsertionPoint(UseIt);
+
+    DEBUG(dbgs() << "Considered insertion point:\n");
+    DEBUG(InsertionPoint->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    // Check if the current insertion point is useless, i.e., it is dominated
+    // by another one.
+    InsertionPoints &InsertPts =
+        InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
+    if (isDominated(InsertionPoint, UseIt, InsertPts))
+      continue;
+    // This insertion point is useful, check if we can merge some insertion
+    // point in a common dominator or if NewPt dominates an existing one.
+    if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+      continue;
+
+    DEBUG(dbgs() << "Keep considered insertion point\n");
+
+    // It is definitely useful by its own
+    InsertPts[InsertionPoint].push_back(UseIt);
+  }
+}
+
+bool AArch64PromoteConstant::insertDefinitions(
+    Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
+  // We will create one global variable per Module.
+  DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
+  bool HasChanged = false;
+
+  // Traverse all insertion points in all the function.
+  for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
+                                        EndIt = InsPtsPerFunc.end();
+       FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
+    InsertionPoints &InsertPts = FctToInstPtsIt->second;
+// Do more checking for debug purposes.
+#ifndef NDEBUG
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+        *FctToInstPtsIt->first).getDomTree();
+#endif
+    GlobalVariable *PromotedGV;
+    assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+    Module *M = FctToInstPtsIt->first->getParent();
+    DenseMap<Module *, GlobalVariable *>::iterator MapIt =
+        ModuleToMergedGV.find(M);
+    if (MapIt == ModuleToMergedGV.end()) {
+      PromotedGV = new GlobalVariable(
+          *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
+          "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+      PromotedGV->setInitializer(Cst);
+      ModuleToMergedGV[M] = PromotedGV;
+      DEBUG(dbgs() << "Global replacement: ");
+      DEBUG(PromotedGV->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      ++NumPromoted;
+      HasChanged = true;
+    } else {
+      PromotedGV = MapIt->second;
+    }
+
+    for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                   EndIPI = InsertPts.end();
+         IPI != EndIPI; ++IPI) {
+      // Create the load of the global variable.
+      IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+      LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
+      DEBUG(dbgs() << "**********\n");
+      DEBUG(dbgs() << "New def: ");
+      DEBUG(LoadedCst->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+
+      // Update the dominated uses.
+      Users &DominatedUsers = IPI->second;
+      for (Value::user_iterator Use : DominatedUsers) {
+#ifndef NDEBUG
+        assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
+                (isa<PHINode>(*Use) &&
+                 DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
+               "Inserted definition does not dominate all its uses!");
+#endif
+        DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
+        DEBUG(Use->print(dbgs()));
+        DEBUG(dbgs() << '\n');
+        Use->setOperand(Use.getOperandNo(), LoadedCst);
+        ++NumPromotedUses;
+      }
+    }
+  }
+  return HasChanged;
+}
+
+bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+  InsertionPointsPerFunc InsertPtsPerFunc;
+  computeInsertionPoints(Val, InsertPtsPerFunc);
+  return insertDefinitions(Val, InsertPtsPerFunc);
+}
+
+bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
+  assert(Cst && "Given variable is not a valid constant.");
+
+  if (!shouldConvert(Cst))
+    return false;
+
+  DEBUG(dbgs() << "******************************\n");
+  DEBUG(dbgs() << "Candidate constant: ");
+  DEBUG(Cst->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+
+  return computeAndInsertDefinitions(Cst);
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F) {
+  // Look for instructions using constant vector. Promote that constant to a
+  // global variable. Create as few loads of this variable as possible and
+  // update the uses accordingly.
+  bool LocalChange = false;
+  SmallSet<Constant *, 8> AlreadyChecked;
+
+  for (auto &MBB : F) {
+    for (auto &MI : MBB) {
+      // Traverse the operand, looking for constant vectors. Replace them by a
+      // load of a global variable of constant vector type.
+      for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
+           OpIdx != EndOpIdx; ++OpIdx) {
+        Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
+        // There is no point in promoting global values as they are already
+        // global. Do not promote constant expressions either, as they may
+        // require some code expansion.
+        if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+            AlreadyChecked.insert(Cst))
+          LocalChange |= promoteConstant(Cst);
+      }
+    }
+  }
+  return LocalChange;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
new file mode 100644
index 0000000..01b9587
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -0,0 +1,404 @@
+//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterInfo.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
+                                         const AArch64Subtarget *sti)
+    : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {}
+
+const MCPhysReg *
+AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_SaveList;
+  else
+    return CSR_AArch64_AAPCS_SaveList;
+}
+
+const uint32_t *
+AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_RegMask;
+  else
+    return CSR_AArch64_AAPCS_RegMask;
+}
+
+const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
+  if (STI->isTargetDarwin())
+    return CSR_AArch64_TLS_Darwin_RegMask;
+
+  assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
+  return CSR_AArch64_TLS_ELF_RegMask;
+}
+
+const uint32_t *
+AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i64 argument (which must also be the register used to return a
+  // single i64 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both, the function should return NULL (does not currently apply)
+  return CSR_AArch64_AAPCS_ThisReturn_RegMask;
+}
+
+BitVector
+AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AArch64::SP);
+  Reserved.set(AArch64::XZR);
+  Reserved.set(AArch64::WSP);
+  Reserved.set(AArch64::WZR);
+
+  if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
+    Reserved.set(AArch64::FP);
+    Reserved.set(AArch64::W29);
+  }
+
+  if (STI->isTargetDarwin()) {
+    Reserved.set(AArch64::X18); // Platform register
+    Reserved.set(AArch64::W18);
+  }
+
+  if (hasBasePointer(MF)) {
+    Reserved.set(AArch64::X19);
+    Reserved.set(AArch64::W19);
+  }
+
+  return Reserved;
+}
+
+bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
+                                      unsigned Reg) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  switch (Reg) {
+  default:
+    break;
+  case AArch64::SP:
+  case AArch64::XZR:
+  case AArch64::WSP:
+  case AArch64::WZR:
+    return true;
+  case AArch64::X18:
+  case AArch64::W18:
+    return STI->isTargetDarwin();
+  case AArch64::FP:
+  case AArch64::W29:
+    return TFI->hasFP(MF) || STI->isTargetDarwin();
+  case AArch64::W19:
+  case AArch64::X19:
+    return hasBasePointer(MF);
+  }
+
+  return false;
+}
+
+const TargetRegisterClass *
+AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  return &AArch64::GPR64RegClass;
+}
+
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &AArch64::CCRRegClass)
+    return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
+  return RC;
+}
+
+unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
+
+bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // In the presence of variable sized objects, if the fixed stack size is
+  // large enough that referencing from the FP won't result in things being
+  // in range relatively often, we can use a base pointer to allow access
+  // from the other direction like the SP normally works.
+  if (MFI->hasVarSizedObjects()) {
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, we'll materialize the constant and still get to the
+    // object; it's just suboptimal. Negative offsets use the unscaled
+    // load/store instructions, which have a 9-bit signed immediate.
+    if (MFI->getLocalFrameSize() < 256)
+      return false;
+    return true;
+  }
+
+  return false;
+}
+
+unsigned
+AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
+}
+
+bool AArch64RegisterInfo::requiresRegisterScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
+  // to the stack pointer, so only put the emergency spill slot next to the
+  // FP when there's no better way to access it (SP or base pointer).
+  return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
+}
+
+bool AArch64RegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Only consider eliminating leaf frames.
+  if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
+                          MFI->adjustsStack()))
+    return true;
+  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+                                            int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
+    assert(i < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  if (!MI->mayLoad() && !MI->mayStore())
+    return false;
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all GPR callee-saved registers get pushed.
+  // FP, LR, X19-X28, D8-D15. 64-bits each.
+  int64_t FPOffset = Offset - 16 * 20;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI->getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
+    return false;
+
+  // If we can reference via the stack pointer or base pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (isFrameOffsetLegal(MI, Offset))
+    return false;
+
+  // The offset likely isn't legal; we want to allocate a virtual base register.
+  return true;
+}
+
+bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             int64_t Offset) const {
+  assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+  assert(MI && "Unable to get the legal offset for nil instruction.");
+  int SaveOffset = Offset;
+  return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
+}
+
+/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
+/// at the beginning of the basic block.
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                       unsigned BaseReg,
+                                                       int FrameIdx,
+                                                       int64_t Offset) const {
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const MachineFunction &MF = *MBB->getParent();
+  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
+  unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+      .addFrameIndex(FrameIdx)
+      .addImm(Offset)
+      .addImm(Shifter);
+}
+
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
+  assert(Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
+      MF.getTarget().getFrameLowering());
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+  int Offset;
+
+  // Special handling of dbg_value, stackmap and patchpoint instructions.
+  if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+    Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+                                             /*PreferFP=*/true);
+    Offset += MI.getOperand(FIOperandNum + 1).getImm();
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+         "Emergency spill slot is out of reach");
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above.  Handle the rest, providing a register that is
+  // SP+LargeImm.
+  unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
+  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+}
+
+namespace llvm {
+
+unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                                  MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR64commonRegClassID:
+    return 32 - 1                                      // XZR/SP
+           - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
+           - STI->isTargetDarwin() // X18 reserved as platform register
+           - hasBasePointer(MF);   // X19
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+    return 32;
+
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return 32;
+
+  case AArch64::FPR128_loRegClassID:
+    return 16;
+  }
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
new file mode 100644
index 0000000..76af1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -0,0 +1,101 @@
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
+#define LLVM_TARGET_AArch64REGISTERINFO_H
+
+#define GET_REGINFO_HEADER
+#include "AArch64GenRegisterInfo.inc"
+
+namespace llvm {
+
+class AArch64InstrInfo;
+class AArch64Subtarget;
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
+
+struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
+private:
+  const AArch64InstrInfo *TII;
+  const AArch64Subtarget *STI;
+
+public:
+  AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti);
+
+  bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+
+  unsigned getCSRFirstUseCost() const override {
+    // The cost will be compared against BlockFrequency where entry has the
+    // value of 1 << 14. A value of 5 will choose to spill or split really
+    // cold path instead of using a callee-saved register.
+    return 5;
+  }
+
+  // Calls involved in thread-local variable lookup save more registers than
+  // normal calls, so they need a different mask to represent this.
+  const uint32_t *getTLSCallPreservedMask() const;
+
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i64 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i64 argument and an i64 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI,
+                          int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+                                    int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+  bool hasBasePointer(const MachineFunction &MF) const;
+  unsigned getBaseRegister() const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_AArch64REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
new file mode 100644
index 0000000..a30e4ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -0,0 +1,593 @@
+//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+
+class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [],
+               list<string> altNames = []>
+        : Register<n, altNames> {
+  let HWEncoding = enc;
+  let Namespace = "AArch64";
+  let SubRegs = subregs;
+}
+
+let Namespace = "AArch64" in {
+  def sub_32 : SubRegIndex<32>;
+
+  def bsub : SubRegIndex<8>;
+  def hsub : SubRegIndex<16>;
+  def ssub : SubRegIndex<32>;
+  def dsub : SubRegIndex<32>;
+  def qhisub : SubRegIndex<64>;
+  def qsub : SubRegIndex<64>;
+  // Note: Code depends on these having consecutive numbers
+  def dsub0 : SubRegIndex<64>;
+  def dsub1 : SubRegIndex<64>;
+  def dsub2 : SubRegIndex<64>;
+  def dsub3 : SubRegIndex<64>;
+  // Note: Code depends on these having consecutive numbers
+  def qsub0 : SubRegIndex<128>;
+  def qsub1 : SubRegIndex<128>;
+  def qsub2 : SubRegIndex<128>;
+  def qsub3 : SubRegIndex<128>;
+}
+
+let Namespace = "AArch64" in {
+  def vreg : RegAltNameIndex;
+  def vlist1 : RegAltNameIndex;
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+def W0    : AArch64Reg<0,   "w0" >, DwarfRegNum<[0]>;
+def W1    : AArch64Reg<1,   "w1" >, DwarfRegNum<[1]>;
+def W2    : AArch64Reg<2,   "w2" >, DwarfRegNum<[2]>;
+def W3    : AArch64Reg<3,   "w3" >, DwarfRegNum<[3]>;
+def W4    : AArch64Reg<4,   "w4" >, DwarfRegNum<[4]>;
+def W5    : AArch64Reg<5,   "w5" >, DwarfRegNum<[5]>;
+def W6    : AArch64Reg<6,   "w6" >, DwarfRegNum<[6]>;
+def W7    : AArch64Reg<7,   "w7" >, DwarfRegNum<[7]>;
+def W8    : AArch64Reg<8,   "w8" >, DwarfRegNum<[8]>;
+def W9    : AArch64Reg<9,   "w9" >, DwarfRegNum<[9]>;
+def W10   : AArch64Reg<10, "w10">, DwarfRegNum<[10]>;
+def W11   : AArch64Reg<11, "w11">, DwarfRegNum<[11]>;
+def W12   : AArch64Reg<12, "w12">, DwarfRegNum<[12]>;
+def W13   : AArch64Reg<13, "w13">, DwarfRegNum<[13]>;
+def W14   : AArch64Reg<14, "w14">, DwarfRegNum<[14]>;
+def W15   : AArch64Reg<15, "w15">, DwarfRegNum<[15]>;
+def W16   : AArch64Reg<16, "w16">, DwarfRegNum<[16]>;
+def W17   : AArch64Reg<17, "w17">, DwarfRegNum<[17]>;
+def W18   : AArch64Reg<18, "w18">, DwarfRegNum<[18]>;
+def W19   : AArch64Reg<19, "w19">, DwarfRegNum<[19]>;
+def W20   : AArch64Reg<20, "w20">, DwarfRegNum<[20]>;
+def W21   : AArch64Reg<21, "w21">, DwarfRegNum<[21]>;
+def W22   : AArch64Reg<22, "w22">, DwarfRegNum<[22]>;
+def W23   : AArch64Reg<23, "w23">, DwarfRegNum<[23]>;
+def W24   : AArch64Reg<24, "w24">, DwarfRegNum<[24]>;
+def W25   : AArch64Reg<25, "w25">, DwarfRegNum<[25]>;
+def W26   : AArch64Reg<26, "w26">, DwarfRegNum<[26]>;
+def W27   : AArch64Reg<27, "w27">, DwarfRegNum<[27]>;
+def W28   : AArch64Reg<28, "w28">, DwarfRegNum<[28]>;
+def W29   : AArch64Reg<29, "w29">, DwarfRegNum<[29]>;
+def W30   : AArch64Reg<30, "w30">, DwarfRegNum<[30]>;
+def WSP   : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR   : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>;
+
+let SubRegIndices = [sub_32] in {
+def X0    : AArch64Reg<0,   "x0",  [W0]>, DwarfRegAlias<W0>;
+def X1    : AArch64Reg<1,   "x1",  [W1]>, DwarfRegAlias<W1>;
+def X2    : AArch64Reg<2,   "x2",  [W2]>, DwarfRegAlias<W2>;
+def X3    : AArch64Reg<3,   "x3",  [W3]>, DwarfRegAlias<W3>;
+def X4    : AArch64Reg<4,   "x4",  [W4]>, DwarfRegAlias<W4>;
+def X5    : AArch64Reg<5,   "x5",  [W5]>, DwarfRegAlias<W5>;
+def X6    : AArch64Reg<6,   "x6",  [W6]>, DwarfRegAlias<W6>;
+def X7    : AArch64Reg<7,   "x7",  [W7]>, DwarfRegAlias<W7>;
+def X8    : AArch64Reg<8,   "x8",  [W8]>, DwarfRegAlias<W8>;
+def X9    : AArch64Reg<9,   "x9",  [W9]>, DwarfRegAlias<W9>;
+def X10   : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
+def X11   : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
+def X12   : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
+def X13   : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
+def X14   : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
+def X15   : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
+def X16   : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
+def X17   : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
+def X18   : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
+def X19   : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
+def X20   : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
+def X21   : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
+def X22   : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
+def X23   : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
+def X24   : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
+def X25   : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
+def X26   : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
+def X27   : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
+def X28   : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
+def FP    : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>;
+def LR    : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>;
+def SP    : AArch64Reg<31, "sp",  [WSP]>, DwarfRegAlias<WSP>;
+def XZR   : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
+}
+
+// Condition code register.
+def NZCV  : AArch64Reg<0, "nzcv">;
+
+// GPR register classes with the intersections of GPR32/GPR32sp and
+// GPR64/GPR64sp for use by the coalescer.
+def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
+  let AltOrders = [(rotl GPR32common, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64common : RegisterClass<"AArch64", [i64], 64,
+                                (add (sequence "X%u", 0, 28), FP, LR)> {
+  let AltOrders = [(rotl GPR64common, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+// GPR register classes which exclude SP/WSP.
+def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> {
+  let AltOrders = [(rotl GPR32, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> {
+  let AltOrders = [(rotl GPR64, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// GPR register classes which include SP/WSP.
+def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> {
+  let AltOrders = [(rotl GPR32sp, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> {
+  let AltOrders = [(rotl GPR64sp, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>;
+def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
+
+def GPR64spPlus0Operand : AsmOperandClass {
+  let Name = "GPR64sp0";
+  let RenderMethod = "addRegOperands";
+  let ParserMethod = "tryParseGPR64sp0Operand";
+}
+
+def GPR64sp0 : RegisterOperand<GPR64sp> {
+  let ParserMatchClass = GPR64spPlus0Operand;
+}
+
+// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
+// constraint used by any instructions, it is used as a common super-class.
+def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
+def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// This is for indirect tail calls to store the address of the destination.
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
+                                                     X22, X23, X24, X25, X26,
+                                                     X27, X28, FP, LR)>;
+
+// GPR register classes for post increment amount of vector load/store that
+// has alternate printing when Rm=31 and prints a constant immediate value
+// equal to the total number of bytes transferred.
+
+// FIXME: TableGen *should* be able to do these itself now. There appears to be
+// a bug in counting how many operands a Post-indexed MCInst should have which
+// means the aliases don't trigger.
+def GPR64pi1  : RegisterOperand<GPR64, "printPostIncOperand<1>">;
+def GPR64pi2  : RegisterOperand<GPR64, "printPostIncOperand<2>">;
+def GPR64pi3  : RegisterOperand<GPR64, "printPostIncOperand<3>">;
+def GPR64pi4  : RegisterOperand<GPR64, "printPostIncOperand<4>">;
+def GPR64pi6  : RegisterOperand<GPR64, "printPostIncOperand<6>">;
+def GPR64pi8  : RegisterOperand<GPR64, "printPostIncOperand<8>">;
+def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">;
+def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">;
+def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">;
+def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">;
+def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
+def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
+
+// Condition code regclass.
+def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+
+  // CCR is not allocatable.
+  let isAllocatable = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Scalar Registers
+//===----------------------------------------------------------------------===//
+
+def B0    : AArch64Reg<0,   "b0">, DwarfRegNum<[64]>;
+def B1    : AArch64Reg<1,   "b1">, DwarfRegNum<[65]>;
+def B2    : AArch64Reg<2,   "b2">, DwarfRegNum<[66]>;
+def B3    : AArch64Reg<3,   "b3">, DwarfRegNum<[67]>;
+def B4    : AArch64Reg<4,   "b4">, DwarfRegNum<[68]>;
+def B5    : AArch64Reg<5,   "b5">, DwarfRegNum<[69]>;
+def B6    : AArch64Reg<6,   "b6">, DwarfRegNum<[70]>;
+def B7    : AArch64Reg<7,   "b7">, DwarfRegNum<[71]>;
+def B8    : AArch64Reg<8,   "b8">, DwarfRegNum<[72]>;
+def B9    : AArch64Reg<9,   "b9">, DwarfRegNum<[73]>;
+def B10   : AArch64Reg<10, "b10">, DwarfRegNum<[74]>;
+def B11   : AArch64Reg<11, "b11">, DwarfRegNum<[75]>;
+def B12   : AArch64Reg<12, "b12">, DwarfRegNum<[76]>;
+def B13   : AArch64Reg<13, "b13">, DwarfRegNum<[77]>;
+def B14   : AArch64Reg<14, "b14">, DwarfRegNum<[78]>;
+def B15   : AArch64Reg<15, "b15">, DwarfRegNum<[79]>;
+def B16   : AArch64Reg<16, "b16">, DwarfRegNum<[80]>;
+def B17   : AArch64Reg<17, "b17">, DwarfRegNum<[81]>;
+def B18   : AArch64Reg<18, "b18">, DwarfRegNum<[82]>;
+def B19   : AArch64Reg<19, "b19">, DwarfRegNum<[83]>;
+def B20   : AArch64Reg<20, "b20">, DwarfRegNum<[84]>;
+def B21   : AArch64Reg<21, "b21">, DwarfRegNum<[85]>;
+def B22   : AArch64Reg<22, "b22">, DwarfRegNum<[86]>;
+def B23   : AArch64Reg<23, "b23">, DwarfRegNum<[87]>;
+def B24   : AArch64Reg<24, "b24">, DwarfRegNum<[88]>;
+def B25   : AArch64Reg<25, "b25">, DwarfRegNum<[89]>;
+def B26   : AArch64Reg<26, "b26">, DwarfRegNum<[90]>;
+def B27   : AArch64Reg<27, "b27">, DwarfRegNum<[91]>;
+def B28   : AArch64Reg<28, "b28">, DwarfRegNum<[92]>;
+def B29   : AArch64Reg<29, "b29">, DwarfRegNum<[93]>;
+def B30   : AArch64Reg<30, "b30">, DwarfRegNum<[94]>;
+def B31   : AArch64Reg<31, "b31">, DwarfRegNum<[95]>;
+
+let SubRegIndices = [bsub] in {
+def H0    : AArch64Reg<0,   "h0", [B0]>, DwarfRegAlias<B0>;
+def H1    : AArch64Reg<1,   "h1", [B1]>, DwarfRegAlias<B1>;
+def H2    : AArch64Reg<2,   "h2", [B2]>, DwarfRegAlias<B2>;
+def H3    : AArch64Reg<3,   "h3", [B3]>, DwarfRegAlias<B3>;
+def H4    : AArch64Reg<4,   "h4", [B4]>, DwarfRegAlias<B4>;
+def H5    : AArch64Reg<5,   "h5", [B5]>, DwarfRegAlias<B5>;
+def H6    : AArch64Reg<6,   "h6", [B6]>, DwarfRegAlias<B6>;
+def H7    : AArch64Reg<7,   "h7", [B7]>, DwarfRegAlias<B7>;
+def H8    : AArch64Reg<8,   "h8", [B8]>, DwarfRegAlias<B8>;
+def H9    : AArch64Reg<9,   "h9", [B9]>, DwarfRegAlias<B9>;
+def H10   : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
+def H11   : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
+def H12   : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
+def H13   : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
+def H14   : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
+def H15   : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
+def H16   : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
+def H17   : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
+def H18   : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
+def H19   : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
+def H20   : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
+def H21   : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
+def H22   : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
+def H23   : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
+def H24   : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
+def H25   : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
+def H26   : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
+def H27   : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
+def H28   : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
+def H29   : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
+def H30   : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
+def H31   : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [hsub] in {
+def S0    : AArch64Reg<0,   "s0", [H0]>, DwarfRegAlias<B0>;
+def S1    : AArch64Reg<1,   "s1", [H1]>, DwarfRegAlias<B1>;
+def S2    : AArch64Reg<2,   "s2", [H2]>, DwarfRegAlias<B2>;
+def S3    : AArch64Reg<3,   "s3", [H3]>, DwarfRegAlias<B3>;
+def S4    : AArch64Reg<4,   "s4", [H4]>, DwarfRegAlias<B4>;
+def S5    : AArch64Reg<5,   "s5", [H5]>, DwarfRegAlias<B5>;
+def S6    : AArch64Reg<6,   "s6", [H6]>, DwarfRegAlias<B6>;
+def S7    : AArch64Reg<7,   "s7", [H7]>, DwarfRegAlias<B7>;
+def S8    : AArch64Reg<8,   "s8", [H8]>, DwarfRegAlias<B8>;
+def S9    : AArch64Reg<9,   "s9", [H9]>, DwarfRegAlias<B9>;
+def S10   : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
+def S11   : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
+def S12   : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
+def S13   : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
+def S14   : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
+def S15   : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
+def S16   : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
+def S17   : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
+def S18   : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
+def S19   : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
+def S20   : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
+def S21   : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
+def S22   : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
+def S23   : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
+def S24   : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
+def S25   : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
+def S26   : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
+def S27   : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
+def S28   : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
+def S29   : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
+def S30   : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
+def S31   : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
+def D0    : AArch64Reg<0,   "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
+def D1    : AArch64Reg<1,   "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
+def D2    : AArch64Reg<2,   "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
+def D3    : AArch64Reg<3,   "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
+def D4    : AArch64Reg<4,   "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
+def D5    : AArch64Reg<5,   "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
+def D6    : AArch64Reg<6,   "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
+def D7    : AArch64Reg<7,   "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
+def D8    : AArch64Reg<8,   "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
+def D9    : AArch64Reg<9,   "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
+def D10   : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
+def D11   : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
+def D12   : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
+def D13   : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
+def D14   : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
+def D15   : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
+def D16   : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
+def D17   : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
+def D18   : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
+def D19   : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
+def D20   : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
+def D21   : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
+def D22   : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
+def D23   : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
+def D24   : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
+def D25   : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
+def D26   : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
+def D27   : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
+def D28   : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
+def D29   : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
+def D30   : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
+def D31   : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
+def Q0    : AArch64Reg<0,   "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
+def Q1    : AArch64Reg<1,   "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
+def Q2    : AArch64Reg<2,   "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
+def Q3    : AArch64Reg<3,   "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
+def Q4    : AArch64Reg<4,   "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
+def Q5    : AArch64Reg<5,   "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
+def Q6    : AArch64Reg<6,   "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
+def Q7    : AArch64Reg<7,   "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
+def Q8    : AArch64Reg<8,   "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
+def Q9    : AArch64Reg<9,   "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
+def Q10   : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
+def Q11   : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
+def Q12   : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
+def Q13   : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
+def Q14   : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
+def Q15   : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
+def Q16   : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
+def Q17   : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
+def Q18   : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
+def Q19   : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
+def Q20   : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
+def Q21   : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
+def Q22   : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
+def Q23   : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
+def Q24   : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
+def Q25   : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
+def Q26   : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
+def Q27   : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
+def Q28   : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
+def Q29   : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
+def Q30   : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
+def Q31   : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+def FPR8  : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+  let Size = 8;
+}
+def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+  let Size = 16;
+}
+def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
+def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
+                                    v1i64],
+                                    64, (sequence "D%u", 0, 31)>;
+// We don't (yet) have an f128 legal type, so don't use that here. We
+// normalize 128-bit vectors to v2f64 for arg passing and such, so use
+// that here.
+def FPR128 : RegisterClass<"AArch64",
+                           [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+                           128, (sequence "Q%u", 0, 31)>;
+
+// The lower 16 vector registers.  Some instructions can only take registers
+// in this range.
+def FPR128_lo : RegisterClass<"AArch64",
+                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                              128, (trunc FPR128, 16)>;
+
+// Pairs, triples, and quads of 64-bit vector registers.
+def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
+                                 [(rotl FPR64, 0), (rotl FPR64, 1),
+                                  (rotl FPR64, 2)]>;
+def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
+                               [(rotl FPR64, 0), (rotl FPR64, 1),
+                                (rotl FPR64, 2), (rotl FPR64, 3)]>;
+def DD   : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> {
+  let Size = 128;
+}
+def DDD  : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> {
+  let Size = 196;
+}
+def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> {
+  let Size = 256;
+}
+
+// Pairs, triples, and quads of 128-bit vector registers.
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
+                                 [(rotl FPR128, 0), (rotl FPR128, 1),
+                                  (rotl FPR128, 2)]>;
+def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
+                               [(rotl FPR128, 0), (rotl FPR128, 1),
+                                (rotl FPR128, 2), (rotl FPR128, 3)]>;
+def QQ   : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> {
+  let Size = 256;
+}
+def QQQ  : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> {
+  let Size = 384;
+}
+def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
+  let Size = 512;
+}
+
+
+// Vector operand versions of the FP registers. Alternate name printing and
+// assmebler matching.
+def VectorReg64AsmOperand : AsmOperandClass {
+  let Name = "VectorReg64";
+  let PredicateMethod = "isVectorReg";
+}
+def VectorReg128AsmOperand : AsmOperandClass {
+  let Name = "VectorReg128";
+  let PredicateMethod = "isVectorReg";
+}
+
+def V64  : RegisterOperand<FPR64, "printVRegOperand"> {
+  let ParserMatchClass = VectorReg64AsmOperand;
+}
+
+def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
+  let ParserMatchClass = VectorReg128AsmOperand;
+}
+
+def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
+  let ParserMatchClass = VectorRegLoAsmOperand;
+}
+
+class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+    : AsmOperandClass {
+  let Name = "TypedVectorList" # count # "_" # lanes # kind;
+
+  let PredicateMethod
+      = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
+  let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+}
+
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+    : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
+                                                   # kind # "'>">;
+
+multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
+  // With implicit types (probably on instruction instead). E.g. { v0, v1 }
+  def _64AsmOperand : AsmOperandClass {
+    let Name = NAME # "64";
+    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+    let RenderMethod = "addVectorList64Operands<" # count # ">";
+  }
+
+  def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
+  }
+
+  def _128AsmOperand : AsmOperandClass {
+    let Name = NAME # "128";
+    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+    let RenderMethod = "addVectorList128Operands<" # count # ">";
+  }
+
+  def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
+  }
+
+  // 64-bit register lists with explicit type.
+
+  // { v0.8b, v1.8b }
+  def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+  def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
+  }
+
+  // { v0.4h, v1.4h }
+  def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+  def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
+  }
+
+  // { v0.2s, v1.2s }
+  def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+  def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
+  }
+
+  // { v0.1d, v1.1d }
+  def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+  def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
+  }
+
+  // 128-bit register lists with explicit type
+
+  // { v0.16b, v1.16b }
+  def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+  def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
+  }
+
+  // { v0.8h, v1.8h }
+  def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+  def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
+  }
+
+  // { v0.4s, v1.4s }
+  def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+  def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
+  }
+
+  // { v0.2d, v1.2d }
+  def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+  def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
+  }
+
+  // { v0.b, v1.b }
+  def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+  def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
+  }
+
+  // { v0.h, v1.h }
+  def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+  def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
+  }
+
+  // { v0.s, v1.s }
+  def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+  def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
+  }
+
+  // { v0.d, v1.d }
+  def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+  def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
+  }
+
+
+}
+
+defm VecListOne   : VectorList<1, FPR64, FPR128>;
+defm VecListTwo   : VectorList<2, DD,    QQ>;
+defm VecListThree : VectorList<3, DDD,   QQQ>;
+defm VecListFour  : VectorList<4, DDDD,  QQQQ>;
+
+
+// Register operand versions of the scalar FP registers.
+def FPR16Op : RegisterOperand<FPR16, "printOperand">;
+def FPR32Op : RegisterOperand<FPR32, "printOperand">;
+def FPR64Op : RegisterOperand<FPR64, "printOperand">;
+def FPR128Op : RegisterOperand<FPR128, "printOperand">;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
new file mode 100644
index 0000000..d709bee
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -0,0 +1,291 @@
+//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A53 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
+def CortexA53Model : SchedMachineModel {
+  let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
+  let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
+  let MinLatency = 1 ;       // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 3;       // Optimistic load latency assuming bypass.
+                             // This is overriden by OperandCycles if the
+                             // Itineraries are queried instead.
+  let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
+                             // Specification - Instruction Timings"
+                             // v 1.0 Spreadsheet
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-A53 is in-order.
+
+def A53UnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def A53UnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def A53UnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division
+def A53UnitLdSt   : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def A53UnitB      : ProcResource<1> { let BufferSize = 0; } // Branch
+def A53UnitFPALU  : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def A53UnitFPMDS  : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map the ProcResources and
+// set the latency.
+
+let SchedModel = CortexA53Model in {
+
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+//       forward a cycle earlier and then two cycles earlier in the case of a
+//       shift-only instruction. These latencies will be incorrect when the
+//       result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; }
+
+// MAC
+def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
+def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; }
+
+// Div
+def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; }
+def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; }
+
+// Load
+def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+//               below, choosing the median of 3 which makes the latency 6.
+//               May model this more carefully in the future. The remaining
+//               A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6;
+                                          let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+                                                  let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+                                                  let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+//                     accounted for in the WriteST* latencies below
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5;
+                                          let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+
+// Branch
+def : WriteRes<WriteBr, [A53UnitB]>;
+def : WriteRes<WriteBrReg, [A53UnitB]>;
+def : WriteRes<WriteSys, [A53UnitB]>;
+def : WriteRes<WriteBarrier, [A53UnitB]>;
+def : WriteRes<WriteHint, [A53UnitB]>;
+
+// FP ALU
+def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
+                                            let ResourceCycles = [29]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+                                                     let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+                                                     let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+                                                      let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+                                                      let ResourceCycles = [28]; }
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// No forwarding for these reads.
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+//       operands are needed one cycle later if and only if they are to be
+//       shifted. Otherwise, they too are needed two cycles later. This same
+//       ReadAdvance applies to Extended registers as well, even though there is
+//       a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+                             WriteISReg, WriteIEReg,WriteIS,
+                             WriteID32,WriteID64,
+                             WriteIM32,WriteIM64]>;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+                                          WriteISReg, WriteIEReg,WriteIS,
+                                          WriteID32,WriteID64,
+                                          WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+                                             WriteISReg, WriteIEReg,WriteIS,
+                                             WriteID32,WriteID64,
+                                             WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+	SchedVar<RegShiftedPred, [A53ReadShifted]>,
+	SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, A53ReadISReg>;
+
+def A53ReadIEReg : SchedReadVariant<[
+	SchedVar<RegExtendedPred, [A53ReadShifted]>,
+	SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, A53ReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+//       Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg,WriteIS,
+                              WriteID32,WriteID64,
+                              WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
+                               WriteISReg, WriteIEReg,WriteIS,
+                               WriteID32,WriteID64,
+                               WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg,WriteIS,
+                              WriteID32,WriteID64,
+                              WriteIM32,WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
new file mode 100644
index 0000000..8209f96
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -0,0 +1,304 @@
+//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexA57Model : SchedMachineModel {
+  let IssueWidth        =   8; // 3-way decode and 8-way issue
+  let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+  let LoadLatency       =   4; // Optimistic load latency
+  let MispredictPenalty =  14; // Fetch + Decode/Rename/Dispatch + Branch
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1> { let BufferSize = 8; }  // Type B micro-ops
+def A57UnitI : ProcResource<2> { let BufferSize = 8; }  // Type I micro-ops
+def A57UnitM : ProcResource<1> { let BufferSize = 8; }  // Type M micro-ops
+def A57UnitL : ProcResource<1> { let BufferSize = 8; }  // Type L micro-ops
+def A57UnitS : ProcResource<1> { let BufferSize = 8; }  // Type S micro-ops
+def A57UnitX : ProcResource<1> { let BufferSize = 8; }  // Type X micro-ops
+def A57UnitW : ProcResource<1> { let BufferSize = 8; }  // Type W micro-ops
+let SchedModel = CortexA57Model in {
+  def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
+}
+
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "AArch64SchedA57WriteRes.td"
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Cortex-A57. The Cortex-A57 types are directly associated with resources, so
+// defining the aliases precludes the need for mapping them using WriteRes. The
+// aliases are sufficient for creating a coarse, working model. As the model
+// evolves, InstRWs will be used to override these SchedAliases.
+
+def : SchedAlias<WriteImm,   A57Write_1cyc_1I>;
+def : SchedAlias<WriteI,     A57Write_1cyc_1I>;
+def : SchedAlias<WriteISReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteIEReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteExtr,  A57Write_1cyc_1I>;
+def : SchedAlias<WriteIS,    A57Write_1cyc_1I>;
+def : SchedAlias<WriteID32,  A57Write_19cyc_1M>;
+def : SchedAlias<WriteID64,  A57Write_35cyc_1M>;
+def : SchedAlias<WriteIM32,  A57Write_3cyc_1M>;
+def : SchedAlias<WriteIM64,  A57Write_5cyc_1M>;
+def : SchedAlias<WriteBr,    A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrReg, A57Write_1cyc_1B>;
+def : SchedAlias<WriteLD,    A57Write_4cyc_1L>;
+def : SchedAlias<WriteST,    A57Write_1cyc_1S>;
+def : SchedAlias<WriteSTP,   A57Write_1cyc_1S>;
+def : SchedAlias<WriteAdr,   A57Write_1cyc_1I>;
+def : SchedAlias<WriteLDIdx, A57Write_4cyc_1I_1L>;
+def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>;
+def : SchedAlias<WriteF,     A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCmp,  A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCvt,  A57Write_5cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFImm,  A57Write_3cyc_1V>;
+def : SchedAlias<WriteFMul,  A57Write_5cyc_1V>;
+def : SchedAlias<WriteFDiv,  A57Write_18cyc_1X>;
+def : SchedAlias<WriteV,     A57Write_3cyc_1V>;
+def : SchedAlias<WriteVLD,   A57Write_5cyc_1L>;
+def : SchedAlias<WriteVST,   A57Write_1cyc_1S>;
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
+
+// Forwarding logic is not [yet] explicitly modeled beyond what is captured
+// in the latencies of the A57 Generic SchedWriteRes's.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the modeled is refined, this will override most
+// of the above ShchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>;
+
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+
+// Multiply high
+def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>;
+
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I],    (instrs EXTRWrri)>;
+def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>;
+def : InstRW<[A57Write_2cyc_1M],    (instregex "BFM")>;
+
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>;
+
+
+// Vector Load
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],            (instregex "LD1i(64)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],  (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],            (instregex "LD1Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],  (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_7cyc_3L],            (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_7cyc_3L, WriteAdr],  (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L],           (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V],           (instregex "LD2i(8|16)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],            (instregex "LD2i(32)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],  (instregex "LD2i(32)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V],            (instregex "LD2i(64)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],  (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V],            (instregex "LD2Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],  (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],             (instregex "LD2Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],   (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V],           (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V],             (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],   (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_2V],           (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD2Twov(2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_1L_3V],           (instregex "LD3i(8|16)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V],            (instregex "LD3i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],  (instregex "LD3i(32)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD3i(64)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V],             (instregex "LD3Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],   (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],              (instregex "LD3Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],    (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_1L_3V],            (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr],  (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD3Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V],               (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr],     (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_10cyc_3L_4V],           (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L],               (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr],     (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD4i(8|16)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V],             (instregex "LD4i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],   (instregex "LD4i(32)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD4i(64)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V],              (instregex "LD4Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],    (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],               (instregex "LD4Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],     (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V],            (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr],  (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_4V],           (instregex "LD4Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V],                (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr],      (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_11cyc_4L_4V],           (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L],                (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr],      (instregex "LD4Fourv(2d)_POST$")>;
+
+// Vector Store
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1S],            (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[A57Write_1cyc_1S, WriteAdr],  (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_3cyc_1S_1V],           (instregex "ST1i(64)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[A57Write_1cyc_1S],                  (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_1cyc_1S, WriteAdr],        (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S],                 (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr],       (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S],                 (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr],       (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],               (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr],     (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S],                (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr],      (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S],             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr],   (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],               (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr],     (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S],           (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V],           (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S],           (instregex "ST2i(64)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_2S_1V],              (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr],    (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S_2V],           (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],             (instregex "ST2Twov(2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr],   (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V],            (instregex "ST3i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr],  (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S],           (instregex "ST3i(32)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V],           (instregex "ST3i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_3S_2V],                 (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr],       (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S_4V],           (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S],                (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr],      (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V],             (instregex "ST4i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr],   (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],           (instregex "ST4i(32)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V],            (instregex "ST4i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr],  (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[A57Write_4cyc_4S_2V],                  (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr],        (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S_4V],           (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S],                (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr],      (instregex "ST4Fourv(2d)_POST$")>;
+
+} // SchedModel = CortexA57Model
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
new file mode 100644
index 0000000..a8f421b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -0,0 +1,512 @@
+//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming conventions is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+//   Prefix: A57Write
+//   Latency: #cyc
+//   MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+//      11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1L  : SchedWriteRes<[A57UnitL]> { let Latency = 5;  }
+def A57Write_5cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 5;  }
+def A57Write_5cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
+def A57Write_5cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; }
+def A57Write_1cyc_1B  : SchedWriteRes<[A57UnitB]> { let Latency = 1;  }
+def A57Write_1cyc_1I  : SchedWriteRes<[A57UnitI]> { let Latency = 1;  }
+def A57Write_1cyc_1S  : SchedWriteRes<[A57UnitS]> { let Latency = 1;  }
+def A57Write_2cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 2;  }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; }
+def A57Write_3cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 3;  }
+def A57Write_3cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 3;  }
+def A57Write_3cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 3;  }
+def A57Write_3cyc_1X  : SchedWriteRes<[A57UnitX]> { let Latency = 3;  }
+def A57Write_4cyc_1L  : SchedWriteRes<[A57UnitL]> { let Latency = 4;  }
+def A57Write_4cyc_1X  : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
+def A57Write_9cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
+def A57Write_6cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 6;  }
+def A57Write_6cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 6;  }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2X    : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 64;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_1I_1L  : SchedWriteRes<[A57UnitI,
+                                          A57UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X  : SchedWriteRes<[A57UnitV,
+                                          A57UnitX]> {
+  let Latency     = 7;
+  let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V  : SchedWriteRes<[A57UnitL,
+                                          A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L     : SchedWriteRes<[A57UnitL, A57UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W     : SchedWriteRes<[A57UnitW, A57UnitW]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L  : SchedWriteRes<[A57UnitI,
+                                          A57UnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+                                          A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V    : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I  : SchedWriteRes<[A57UnitB,
+                                          A57UnitI]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S  : SchedWriteRes<[A57UnitI,
+                                          A57UnitS]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I  : SchedWriteRes<[A57UnitB,
+                                          A57UnitI]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S     : SchedWriteRes<[A57UnitS, A57UnitS]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_36cyc_2X    : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 36;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1I_1M  : SchedWriteRes<[A57UnitI,
+                                          A57UnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1I_1S  : SchedWriteRes<[A57UnitI,
+                                          A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V  : SchedWriteRes<[A57UnitS,
+                                          A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1L  : SchedWriteRes<[A57UnitI,
+                                          A57UnitL]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+def A57Write_4cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S     : SchedWriteRes<[A57UnitI,
+                                             A57UnitS, A57UnitS]> {
+  let Latency     = 2;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitS,
+                                             A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_1M_2S     : SchedWriteRes<[A57UnitM,
+                                             A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_3S        : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_2S_1V     : SchedWriteRes<[A57UnitS, A57UnitS,
+                                             A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_5cyc_1I_2L     : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 3;
+}
+def A57Write_6cyc_1I_2L     : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 3;
+}
+def A57Write_6cyc_3V        : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 3;
+}
+def A57Write_7cyc_3L        : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 3;
+}
+def A57Write_8cyc_1I_1L_1V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitL,
+                                             A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_2V     : SchedWriteRes<[A57UnitL,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+def A57Write_8cyc_3V        : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+def A57Write_9cyc_3V        : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 3;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 4 micro-op types
+
+def A57Write_2cyc_2I_2S    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 2;
+  let NumMicroOps = 4;
+}
+def A57Write_3cyc_2I_2S    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_3S    : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 4;
+}
+def A57Write_4cyc_4S       : SchedWriteRes<[A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 4;
+  let NumMicroOps = 4;
+}
+def A57Write_7cyc_1I_3L    : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL, A57UnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 4;
+}
+def A57Write_5cyc_2I_2L    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitL, A57UnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 4;
+}
+def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+def A57Write_8cyc_4L       : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitL, A57UnitL]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+def A57Write_9cyc_2L_2V    : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 4;
+}
+def A57Write_9cyc_1L_3V    : SchedWriteRes<[A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 4;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 5 micro-op types
+
+def A57Write_3cyc_3S_2V    : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 5;
+}
+def A57Write_8cyc_1I_4L    : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL,
+                                            A57UnitL, A57UnitL]> {
+  let Latency     = 8;
+  let NumMicroOps = 5;
+}
+def A57Write_4cyc_1I_4S    : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 4;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_2L_3V    : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 6 micro-op types
+
+def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 6;
+}
+def A57Write_4cyc_2I_4S    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 4;
+  let NumMicroOps = 6;
+}
+def A57Write_4cyc_4S_2V    : SchedWriteRes<[A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 6;
+}
+def A57Write_6cyc_6S       : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 6;
+  let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+def A57Write_9cyc_2L_4V    : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 7 micro-op types
+
+def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL,
+                                          A57UnitV, A57UnitV,
+                                          A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 7;
+}
+def A57Write_4cyc_1I_4S_2V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitS, A57UnitS,
+                                             A57UnitS, A57UnitS,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 7;
+}
+def A57Write_6cyc_1I_6S  : SchedWriteRes<[A57UnitI,
+                                          A57UnitS, A57UnitS, A57UnitS,
+                                          A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 6;
+  let NumMicroOps = 7;
+}
+def A57Write_9cyc_1I_2L_4V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 7;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 8 micro-op types
+
+def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL, A57UnitL,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 8;
+}
+def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL,
+                                          A57UnitL, A57UnitL,
+                                          A57UnitV, A57UnitV,
+                                          A57UnitV, A57UnitV]> {
+  let Latency     = 11;
+  let NumMicroOps = 8;
+}
+def A57Write_8cyc_8S  : SchedWriteRes<[A57UnitS, A57UnitS,
+                                       A57UnitS, A57UnitS,
+                                       A57UnitS, A57UnitS,
+                                       A57UnitS, A57UnitS]> {
+  let Latency     = 8;
+  let NumMicroOps = 8;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 9 micro-op types
+
+def A57Write_8cyc_1I_8S  : SchedWriteRes<[A57UnitI,
+                                          A57UnitS, A57UnitS,
+                                          A57UnitS, A57UnitS,
+                                          A57UnitS, A57UnitS,
+                                          A57UnitS, A57UnitS]> {
+  let Latency     = 8;
+  let NumMicroOps = 9;
+}
+def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL,
+                                             A57UnitL, A57UnitL,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 11;
+  let NumMicroOps = 9;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 10 micro-op types
+
+def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitV, A57UnitV,
+                                         A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 10;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 11 micro-op types
+
+def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 12 micro-op types
+
+def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitV, A57UnitV,
+                                         A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 13 micro-op types
+
+def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 13;
+}
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
new file mode 100644
index 0000000..a2a1802
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -0,0 +1,865 @@
+//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AArch64 Cyclone to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CycloneModel : SchedMachineModel {
+  let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 192; // Based on the reorder buffer.
+  let LoadLatency = 4; // Optimistic load latency.
+  let MispredictPenalty = 16; // 14-19 cycles are typical.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cyclone.
+
+// 4 integer pipes
+def CyUnitI : ProcResource<4> {
+  let BufferSize = 48;
+}
+
+// 2 branch units: I[0..1]
+def CyUnitB : ProcResource<2> {
+  let Super  = CyUnitI;
+  let BufferSize = 24;
+}
+
+// 1 indirect-branch unit: I[0]
+def CyUnitBR : ProcResource<1> {
+  let Super  = CyUnitB;
+}
+
+// 2 shifter pipes: I[2..3]
+// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
+def CyUnitIS : ProcResource<2> {
+  let Super = CyUnitI;
+  let BufferSize = 24;
+}
+
+// 1 mul pipe: I[0]
+def CyUnitIM : ProcResource<1> {
+  let Super = CyUnitBR;
+  let BufferSize = 32;
+}
+
+// 1 div pipe: I[1]
+def CyUnitID : ProcResource<1> {
+  let Super = CyUnitB;
+  let BufferSize = 16;
+}
+
+// 1 integer division unit. This is driven by the ID pipe, but only
+// consumes the pipe for one cycle at issue and another cycle at writeback.
+def CyUnitIntDiv : ProcResource<1>;
+
+// 2 ld/st pipes.
+def CyUnitLS : ProcResource<2> {
+  let BufferSize = 28;
+}
+
+// 3 fp/vector pipes.
+def CyUnitV : ProcResource<3> {
+  let BufferSize = 48;
+}
+// 2 fp/vector arithmetic and multiply pipes: V[0-1]
+def CyUnitVM : ProcResource<2> {
+  let Super = CyUnitV;
+  let BufferSize = 32;
+}
+// 1 fp/vector division/sqrt pipe: V[2]
+def CyUnitVD : ProcResource<1> {
+  let Super = CyUnitV;
+  let BufferSize = 16;
+}
+// 1 fp compare pipe: V[0]
+def CyUnitVC : ProcResource<1> {
+  let Super = CyUnitVM;
+  let BufferSize = 16;
+}
+
+// 2 fp division/square-root units.  These are driven by the VD pipe,
+// but only consume the pipe for one cycle at issue and a cycle at writeback.
+def CyUnitFloatDiv : ProcResource<2>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write resources and latency on Cyclone.
+// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
+
+let SchedModel = CycloneModel in {
+
+//---
+// 7.8.1. Moves
+//---
+
+// A single nop micro-op (uX).
+def WriteX : SchedWriteRes<[]> { let Latency = 0; }
+
+// Move zero is a register rename (to machine register zero).
+// The move is replaced by a single nop micro-op.
+// MOVZ Rd, #0
+// AND Rd, Rzr, #imm
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
+def WriteImmZ  : SchedWriteVariant<[
+                   SchedVar<WriteZPred, [WriteX]>,
+                   SchedVar<NoSchedPred, [WriteImm]>]>;
+def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
+
+// Move GPR is a register rename and single nop micro-op.
+// ORR Xd, XZR, Xm
+// ADD Xd, Xn, #0
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
+def WriteMov      : SchedWriteVariant<[
+                      SchedVar<WriteIMovPred, [WriteX]>,
+                      SchedVar<WriteVMovPred, [WriteX]>,
+                      SchedVar<NoSchedPred,   [WriteI]>]>;
+def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
+
+// Move non-zero immediate is an integer ALU op.
+// MOVN,MOVZ,MOVK
+def : WriteRes<WriteImm, [CyUnitI]>;
+
+//---
+// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
+//              Shifts and Bitfield Operations
+//---
+
+// ADR,ADRP
+// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
+// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
+// ADC(S),SBC(S)
+// Aliases: CMN, CMP, TST
+//
+// Conditional operations.
+// CCMNi,CCMPi,CCMNr,CCMPr,
+// CSEL,CSINC,CSINV,CSNEG
+//
+// Bit counting and reversal operations.
+// CLS,CLZ,RBIT,REV,REV16,REV32
+def : WriteRes<WriteI, [CyUnitI]>;
+
+// ADD with shifted register operand is a single micro-op that
+// consumes a shift pipeline for two cycles.
+// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
+// EXAMPLE: ADDrs Xn, Xm LSL #imm
+def : WriteRes<WriteISReg, [CyUnitIS]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+
+// ADD with extended register operand is the same as shifted reg operand.
+// ADD(S)re,SUB(S)re
+// EXAMPLE: ADDXre Xn, Xm, UXTB #1
+def : WriteRes<WriteIEReg, [CyUnitIS]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+
+// Variable shift and bitfield operations.
+// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
+def : WriteRes<WriteIS, [CyUnitIS]>;
+
+// EXTR Shifts a pair of registers and requires two micro-ops.
+// The second micro-op is delayed, as modeled by ReadExtrHi.
+// EXTR Xn, Xm, #imm
+def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+// EXTR's first register read is delayed by one cycle, effectively
+// shortening its writer's latency.
+// EXTR Xn, Xm, #imm
+def : ReadAdvance<ReadExtrHi, 1>;
+
+//---
+// 7.8.6. Multiplies
+//---
+
+// MUL/MNEG are aliases for MADD/MSUB.
+// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
+def : WriteRes<WriteIM32, [CyUnitIM]> {
+  let Latency = 4;
+}
+// MADDX,MSUBX,SMULH,UMULH
+def : WriteRes<WriteIM64, [CyUnitIM]> {
+  let Latency = 5;
+}
+
+//---
+// 7.8.7. Divide
+//---
+
+// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVW,UDIVW
+def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 10];
+}
+// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVX,UDIVX
+def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
+  let Latency = 13;
+  let ResourceCycles = [2, 13];
+}
+
+//---
+// 7.8.8,7.8.10. Load/Store, single element
+//---
+
+// Integer loads take 4 cycles and use one LS unit for one cycle.
+def : WriteRes<WriteLD, [CyUnitLS]> {
+  let Latency = 4;
+}
+
+// Store-load forwarding is 4 cycles.
+//
+// Note: The store-exclusive sequence incorporates this
+// latency. However, general heuristics should not model the
+// dependence between a store and subsequent may-alias load because
+// hardware speculation works.
+def : WriteRes<WriteST, [CyUnitLS]> {
+  let Latency = 4;
+}
+
+// Load from base address plus an optionally scaled register offset.
+// Rt latency is latency WriteIS + WriteLD.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def CyWriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
+  SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
+def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
+
+// EXAMPLE: STR Xn, Xm [, lsl 3]
+def CyWriteSTIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
+  SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
+def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
+
+// Read the (unshifted) base register Xn in the second micro-op one cycle later.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def ReadBaseRS : SchedReadAdvance<1>;
+def CyReadAdrBase : SchedReadVariant<[
+  SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
+  SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
+def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
+
+//---
+// 7.8.9,7.8.11. Load/Store, paired
+//---
+
+// Address pre/post increment is a simple ALU op with one cycle latency.
+def : WriteRes<WriteAdr, [CyUnitI]>;
+
+// LDP high register write is fused with the load, but a nop micro-op remains.
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 4;
+}
+
+// STP is a vector op and store, except for QQ, which is just two stores.
+def : SchedAlias<WriteSTP, WriteVSTShuffle>;
+def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
+
+//---
+// 7.8.13. Branches
+//---
+
+// Branches take a single micro-op.
+// The misprediction penalty is defined as a SchedMachineModel property.
+def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
+def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
+
+//---
+// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
+//---
+
+// NOP,SEV,SEVL,WFE,WFI,YIELD
+def : WriteRes<WriteHint, []> {let Latency = 0;}
+// ISB
+def : InstRW<[WriteI], (instrs ISB)>;
+// SLREX,DMB,DSB
+def : WriteRes<WriteBarrier, [CyUnitLS]>;
+
+// System instructions get an invalid latency because the latency of
+// other operations across them is meaningless.
+def : WriteRes<WriteSys, []> {let Latency = -1;}
+
+//===----------------------------------------------------------------------===//
+// 7.9 Vector Unit Instructions
+
+// Simple vector operations take 2 cycles.
+def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+
+// Define some longer latency vector op types for Cyclone.
+def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
+def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
+def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
+
+// Simple floating-point operations take 2 cycles.
+def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
+
+//---
+// 7.9.1 Vector Moves
+//---
+
+// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
+// generates expensive int-float conversion instead:
+// FMOVDi Dd, #0.0
+// FMOVv2f64ns Vd.2d, #0.0
+
+// FMOVSi,FMOVDi
+def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
+
+// MOVI,MVNI are WriteV
+// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
+
+// Move FPR is a register rename and single nop micro-op.
+// ORR.16b Vd,Vn,Vn
+// COPY is handled above in the WriteMov Variant.
+def WriteVMov    : SchedWriteVariant<[
+                     SchedVar<WriteVMovPred, [WriteX]>,
+                     SchedVar<NoSchedPred,   [WriteV]>]>;
+def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
+
+// FMOVSr,FMOVDr are WriteF.
+
+// MOV V,V is a WriteV.
+
+// CPY D,V[x] is a WriteV
+
+// INS V[x],V[y] is a WriteV.
+
+// FMOVWSr,FMOVXDr,FMOVXDHighr
+def : WriteRes<WriteFCopy, [CyUnitLS]> {
+  let Latency = 5;
+}
+
+// FMOVSWr,FMOVDXr
+def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
+
+// INS V[x],R
+def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
+
+// SMOV,UMOV R,V[x]
+def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
+def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
+
+// DUP V,R
+def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
+
+// DUP V,V[x] is a WriteV.
+
+//---
+// 7.9.2 Integer Arithmetic, Logical, and Comparisons
+//---
+
+// BIC,ORR V,#imm are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "ABSv")>;
+
+// MVN,NEG,NOT are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
+
+// ADDP is a WriteV.
+def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
+
+def : InstRW<[CyWriteV3],
+             (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
+
+// ADD,SUB are WriteV
+
+// Forward declare.
+def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+
+// Add/Diff and accumulate uses the vector multiply unit.
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVAccum  : SchedReadAdvance<1,
+                    [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+             (instregex "SADALP","UADALP")>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+             (instregex "SABAv","UABAv","SABALv","UABALv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
+
+def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
+
+// WriteV includes:
+// AND,BIC,CMTST,EOR,ORN,ORR
+// ADDP
+// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
+// SADDL,SSUBL,UADDL,USUBL
+// SADDW,SSUBW,UADDW,USUBW
+
+def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
+                                     "CMLEv","CMLTv",
+                                     "CMHIv","CMHSv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
+                                     "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
+
+def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
+                                       "SABDLv","UABDLv")>;
+
+//---
+// 7.9.3 Floating Point Arithmetic and Comparisons
+//---
+
+// FABS,FNEG are WriteF
+
+def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
+def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
+
+def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
+                                     "FMINPv2i","FMINNMPv2i")>;
+
+def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
+
+def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
+                                  FSUBSrr,FSUBv2f32,FSUBv4f32,
+                                  FADDPv2f32,FADDPv4f32,
+                                  FABD32,FABDv2f32,FABDv4f32)>;
+def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
+                                  FSUBDrr,FSUBv2f64,
+                                  FADDPv2f64,
+                                  FABD64,FABDv2f64)>;
+
+def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
+
+def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
+                                     "FMAXS","FMAXD","FMAXv",
+                                     "FMINS","FMIND","FMINv",
+                                     "FMAXNMS","FMAXNMD","FMAXNMv",
+                                     "FMINNMS","FMINNMD","FMINNMv",
+                                     "FMAXPv2f","FMAXPv4f",
+                                     "FMINPv2f","FMINPv4f",
+                                     "FMAXNMPv2f","FMAXNMPv4f",
+                                     "FMINNMPv2f","FMINNMPv4f")>;
+
+// FCMP,FCMPE,FCCMP,FCCMPE
+def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
+
+// FCSEL is a WriteF.
+
+//---
+// 7.9.4 Shifts and Bitfield Operations
+//---
+
+// SHL is a WriteV
+
+def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
+
+def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
+
+// Shift and accumulate uses the vector multiply unit.
+def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVShiftAcc  : SchedReadAdvance<1,
+                        [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
+def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
+             (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// SSHL,USHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
+
+// SQSHL,SQSHLU,UQSHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
+
+// WriteV includes:
+// SHLL,SSHLL,USHLL
+// SLI,SRI
+// BIF,BIT,BSL
+// EXT
+// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
+// XTN2
+
+def : InstRW<[CyWriteV4],
+             (instregex "RSHRNv","SHRNv",
+                        "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
+                        "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+
+//---
+// 7.9.5 Multiplication
+//---
+
+def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
+def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
+                             "SQDMULLv","SQDMULHv","SQRDMULHv")>;
+
+// FMUL,FMULX,FNMUL default to WriteFMul.
+def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
+
+def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
+def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
+                               FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
+
+def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
+def : InstRW<[CyWriteVMul, CyReadVMulAcc],
+             (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
+              "SQDMLAL","SQDMLSL")>;
+
+def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
+def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
+def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
+def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
+
+def : InstRW<[CyWriteSMul, CyReadSMul],
+             (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
+              FMLAv2f32,FMLAv4f32,
+              FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
+def : InstRW<[CyWriteDMul, CyReadDMul],
+             (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
+              FMLAv2f64,FMLAv2i64_indexed,
+              FMLSv2f64,FMLSv2i64_indexed)>;
+
+def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
+def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
+
+//---
+// 7.9.6 Divide and Square Root
+//---
+
+// FDIV,FSQRT
+// TODO: Add 64-bit variant with 19 cycle latency.
+// TODO: Specialize FSQRT for longer latency.
+def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
+  let Latency = 17;
+  let ResourceCycles = [2, 17];
+}
+
+def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
+
+def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
+def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
+
+def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
+def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
+def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
+def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
+
+//---
+// 7.9.7 Integer-FP Conversions
+//---
+
+// FCVT lengthen f16/s32
+def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
+
+// FCVT,FCVTN,FCVTXN
+// SCVTF,UCVTF V,V
+// FRINT(AIMNPXZ) V,V
+def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
+
+// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
+def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
+
+// FCVT Rd, S/D = V6+LD4: 10 cycles
+def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
+def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
+
+// FCVTL is a WriteV
+
+//---
+// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
+//---
+
+def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
+def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
+                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
+                                       SHA1SU0rrr)>;
+
+def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
+def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
+
+def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
+def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
+                                       SHA256Hrrr,SHA256H2rrr)>;
+
+// TRN,UZP,ZUP are WriteV.
+
+// TBL,TBX are WriteV.
+
+//---
+// 7.9.11-7.9.14 Load/Store, single element and paired
+//---
+
+// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
+def : WriteRes<WriteVLD, [CyUnitLS]> {
+  let Latency = 5;
+}
+
+// Store-load forwarding is 4 cycles.
+def : WriteRes<WriteVST, [CyUnitLS]> {
+  let Latency = 4;
+}
+
+// WriteVLDPair/VSTPair sequences are expanded by the target description.
+
+//---
+// 7.9.15 Load, element operations
+//---
+
+// Only the first WriteVLD and WriteAdr for writeback matches def operands.
+// Subsequent WriteVLDs consume resources. Since all loaded values have the
+// same latency, this is acceptable.
+
+// Vd is read 5 cycles after issuing the vector load.
+def : ReadAdvance<ReadVLD, 5>;
+
+def : InstRW<[WriteVLD],
+             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+// Register writes from the load's high half are fused micro-ops.
+def : InstRW<[WriteVLD],
+             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD],
+             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
+             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
+             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
+             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
+             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],
+             (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
+             (instregex "LD1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle],
+             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr],
+             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+             (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
+             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
+             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+             (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+             (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+             (instregex "LD2i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+             (instregex "LD2i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+             (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+             (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
+             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
+             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
+             (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
+             (instregex "LD3i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+             (instregex "LD3i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+             (instregex "LD3i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+             (instrs LD3Rv1d,LD3Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+             (instrs LD3Rv2d_POST,LD3Rv2d_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
+              WriteVLDPairShuffle, WriteVLDPairShuffle],
+             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
+              WriteVLDPairShuffle, WriteVLDPairShuffle],
+             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+             (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+             (instregex "LD4i(8|16|32)_POST")>;
+
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+             (instrs LD4i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4Rv1d,LD4Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
+
+//---
+// 7.9.16 Store, element operations
+//---
+
+// Only the WriteAdr for writeback matches a def operands.
+// Subsequent WriteVLDs only consume resources.
+
+def : InstRW<[WriteVST],
+             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST],
+             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],
+             (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+             (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST],
+             (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST],
+             (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVST],
+             (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
+             (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle],
+             (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+             (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
+
+def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
+def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
+
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
+            (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
+            (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
+              WriteVSTPairShuffle, WriteVSTPairShuffle],
+             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
+              WriteVSTPairShuffle, WriteVSTPairShuffle],
+             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+
+//---
+// Unused SchedRead types
+//---
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+
+} // SchedModel = CycloneModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
new file mode 100644
index 0000000..eaa9110
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
@@ -0,0 +1,104 @@
+//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Define TII for use in SchedVariant Predicates.
+// const MachineInstr *MI and const TargetSchedModel *SchedModel
+// are defined by default.
+def : PredicateProlog<[{
+  const AArch64InstrInfo *TII =
+    static_cast<const AArch64InstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+// AArch64 Scheduler Definitions
+
+def WriteImm       : SchedWrite; // MOVN, MOVZ
+// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
+// select the correct sequence of WriteImms.
+
+def WriteI         : SchedWrite; // ALU
+def WriteISReg     : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg     : SchedWrite; // ALU of Extended-Reg
+def ReadI          : SchedRead;  // ALU
+def ReadISReg      : SchedRead;  // ALU of Shifted-Reg
+def ReadIEReg      : SchedRead;  // ALU of Extended-Reg
+def WriteExtr      : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi     : SchedRead;  // Read the high reg of the EXTR pair
+def WriteIS        : SchedWrite; // Shift/Scale
+def WriteID32      : SchedWrite; // 32-bit Divide
+def WriteID64      : SchedWrite; // 64-bit Divide
+def ReadID         : SchedRead;  // 32/64-bit Divide
+def WriteIM32      : SchedWrite; // 32-bit Multiply
+def WriteIM64      : SchedWrite; // 64-bit Multiply
+def ReadIM         : SchedRead;  // 32/64-bit Multiply
+def ReadIMA        : SchedRead;  // 32/64-bit Multiply Accumulate
+def WriteBr        : SchedWrite; // Branch
+def WriteBrReg     : SchedWrite; // Indirect Branch
+
+def WriteLD        : SchedWrite; // Load from base addr plus immediate offset
+def WriteST        : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP       : SchedWrite; // Store a register pair.
+def WriteAdr       : SchedWrite; // Address pre/post increment.
+
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
+
+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
+
+// Predicate for determining when a extendedable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
+
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
+
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
+
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
+
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
+
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
+
+def WriteSys     : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint    : SchedWrite; // Hint instruction.
+
+def WriteF       : SchedWrite; // General floating-point ops.
+def WriteFCmp    : SchedWrite; // Floating-point compare.
+def WriteFCvt    : SchedWrite; // Float conversion.
+def WriteFCopy   : SchedWrite; // Float-int register copy.
+def WriteFImm    : SchedWrite; // Floating-point immediate.
+def WriteFMul    : SchedWrite; // Floating-point multiply.
+def WriteFDiv    : SchedWrite; // Floating-point division.
+
+def WriteV   : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
+
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle     : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
new file mode 100644
index 0000000..1bf64fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -0,0 +1,60 @@
+//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-selectiondag-info"
+
+AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL)
+    : TargetSelectionDAGInfo(DL) {}
+
+AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {}
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  // Check to see if there is a specialized entry-point for memory zeroing.
+  ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+  ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
+  const char *bzeroEntry =
+      (V && V->isNullValue())
+          ? DAG.getTarget().getSubtarget<AArch64Subtarget>().getBZeroEntry()
+          : nullptr;
+  // For small size (< 256), it is not beneficial to use bzero
+  // instead of memset.
+  if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+    const AArch64TargetLowering &TLI =
+        *static_cast<const AArch64TargetLowering *>(
+            DAG.getTarget().getTargetLowering());
+
+    EVT IntPtr = TLI.getPointerTy();
+    Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Dst;
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    Entry.Node = Size;
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl).setChain(Chain)
+      .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                 DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0)
+      .setDiscardResult();
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    return CallResult.second;
+  }
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
new file mode 100644
index 0000000..1180eea
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -0,0 +1,33 @@
+//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64SELECTIONDAGINFO_H
+#define AArch64SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit AArch64SelectionDAGInfo(const DataLayout *DL);
+  ~AArch64SelectionDAGInfo();
+
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+                                  SDValue Dst, SDValue Src, SDValue Size,
+                                  unsigned Align, bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
new file mode 100644
index 0000000..45f8ddb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -0,0 +1,168 @@
+//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies floating point stores that should not be combined into
+// store pairs. Later we may do the same for floating point loads.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-stp-suppress"
+
+namespace {
+class AArch64StorePairSuppress : public MachineFunctionPass {
+  const AArch64InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  MachineFunction *MF;
+  TargetSchedModel SchedModel;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+
+public:
+  static char ID;
+  AArch64StorePairSuppress() : MachineFunctionPass(ID) {}
+
+  virtual const char *getPassName() const override {
+    return "AArch64 Store Pair Suppression";
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+  bool shouldAddSTPToBlock(const MachineBasicBlock *BB);
+
+  bool isNarrowFPStore(const MachineInstr &MI);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineTraceMetrics>();
+    AU.addPreserved<MachineTraceMetrics>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64StorePairSuppress::ID = 0;
+} // anonymous
+
+FunctionPass *llvm::createAArch64StorePairSuppressPass() {
+  return new AArch64StorePairSuppress();
+}
+
+/// Return true if an STP can be added to this block without increasing the
+/// critical resource height. STP is good to form in Ld/St limited blocks and
+/// bad to form in float-point limited blocks. This is true independent of the
+/// critical path. If the critical path is longer than the resource height, the
+/// extra vector ops can limit physreg renaming. Otherwise, it could simply
+/// oversaturate the vector units.
+bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
+  unsigned ResLength = BBTrace.getResourceLength();
+
+  // Get the machine model's scheduling class for STPQi.
+  // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
+  unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass();
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+
+  // If a subtarget does not define resources for STPQi, bail here.
+  if (SCDesc->isValid() && !SCDesc->isVariant()) {
+    unsigned ResLenWithSTP = BBTrace.getResourceLength(
+        ArrayRef<const MachineBasicBlock *>(), SCDesc);
+    if (ResLenWithSTP > ResLength) {
+      DEBUG(dbgs() << "  Suppress STP in BB: " << BB->getNumber()
+                   << " resources " << ResLength << " -> " << ResLenWithSTP
+                   << "\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Return true if this is a floating-point store smaller than the V reg. On
+/// cyclone, these require a vector shuffle before storing a pair.
+/// Ideally we would call getMatchingPairOpcode() and have the machine model
+/// tell us if it's profitable with no cpu knowledge here.
+///
+/// FIXME: We plan to develop a decent Target abstraction for simple loads and
+/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer.
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+    return true;
+  }
+}
+
+bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo());
+  TRI = MF->getTarget().getRegisterInfo();
+  MRI = &MF->getRegInfo();
+  const TargetSubtargetInfo &ST =
+      MF->getTarget().getSubtarget<TargetSubtargetInfo>();
+  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = nullptr;
+
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n');
+
+  if (!SchedModel.hasInstrSchedModel()) {
+    DEBUG(dbgs() << "  Skipping pass: no machine model present.\n");
+    return false;
+  }
+
+  // Check for a sequence of stores to the same base address. We don't need to
+  // precisely determine whether a store pair can be formed. But we do want to
+  // filter out most situations where we can't form store pairs to avoid
+  // computing trace metrics in those cases.
+  for (auto &MBB : *MF) {
+    bool SuppressSTP = false;
+    unsigned PrevBaseReg = 0;
+    for (auto &MI : MBB) {
+      if (!isNarrowFPStore(MI))
+        continue;
+      unsigned BaseReg;
+      unsigned Offset;
+      if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) {
+        if (PrevBaseReg == BaseReg) {
+          // If this block can take STPs, skip ahead to the next block.
+          if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
+            break;
+          // Otherwise, continue unpairing the stores in this block.
+          DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+          SuppressSTP = true;
+          TII->suppressLdStPair(&MI);
+        }
+        PrevBaseReg = BaseReg;
+      } else
+        PrevBaseReg = 0;
+    }
+  }
+  // This pass just sets some internal MachineMemOperand flags. It can't really
+  // invalidate anything.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
new file mode 100644
index 0000000..bb0b72c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -0,0 +1,130 @@
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+static cl::opt<bool>
+EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
+                     "converter pass"), cl::init(true), cl::Hidden);
+
+AArch64Subtarget &
+AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
+  // Determine default and user-specified characteristics
+
+  if (CPUString.empty())
+    CPUString = "generic";
+
+  ParseSubtargetFeatures(CPUString, FS);
+  return *this;
+}
+
+AArch64Subtarget::AArch64Subtarget(const std::string &TT,
+                                   const std::string &CPU,
+                                   const std::string &FS, TargetMachine &TM,
+                                   bool LittleEndian)
+    : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
+      HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
+      HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
+      TargetTriple(TT),
+      // This nested ternary is horrible, but DL needs to be properly
+      // initialized
+      // before TLInfo is constructed.
+      DL(isTargetMachO()
+             ? "e-m:o-i64:64-i128:128-n32:64-S128"
+             : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
+                             : "E-m:e-i64:64-i128:128-n32:64-S128")),
+      FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)),
+      TSInfo(&DL), TLInfo(TM) {}
+
+/// ClassifyGlobalReference - Find the target operand flags that describe
+/// how a global value should be referenced for the current subtarget.
+unsigned char
+AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const {
+
+  // Determine whether this is a reference to a definition or a declaration.
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
+
+  // MachO large model always goes via a GOT, simply to get a single 8-byte
+  // absolute relocation on all global addresses.
+  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
+    return AArch64II::MO_GOT;
+
+  // The small code mode's direct accesses use ADRP, which cannot necessarily
+  // produce the value 0 (if the code is above 4GB). Therefore they must use the
+  // GOT.
+  if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
+    return AArch64II::MO_GOT;
+
+  // If symbol visibility is hidden, the extra load is not needed if
+  // the symbol is definitely defined in the current translation unit.
+
+  // The handling of non-hidden symbols in PIC mode is rather target-dependent:
+  //   + On MachO, if the symbol is defined in this module the GOT can be
+  //     skipped.
+  //   + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
+  //     defined could end up in unexpected places. Use a GOT.
+  if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
+    if (isTargetMachO())
+      return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT
+                                               : AArch64II::MO_NO_FLAG;
+    else
+      // No need to go through the GOT for local symbols on ELF.
+      return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
+  }
+
+  return AArch64II::MO_NO_FLAG;
+}
+
+/// This function returns the name of a function which has an interface
+/// like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over
+/// memset with zero passed as the second argument. Otherwise it
+/// returns null.
+const char *AArch64Subtarget::getBZeroEntry() const {
+  // Prefer bzero on Darwin only.
+  if(isTargetDarwin())
+    return "bzero";
+
+  return nullptr;
+}
+
+void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                         MachineInstr *begin, MachineInstr *end,
+                                         unsigned NumRegionInstrs) const {
+  // LNT run (at least on Cyclone) showed reasonably significant gains for
+  // bi-directional scheduling. 253.perlbmk.
+  Policy.OnlyTopDown = false;
+  Policy.OnlyBottomUp = false;
+}
+
+bool AArch64Subtarget::enableEarlyIfConversion() const {
+  return EnableEarlyIfConvert;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
new file mode 100644
index 0000000..52124f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -0,0 +1,132 @@
+//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64SUBTARGET_H
+#define AArch64SUBTARGET_H
+
+#include "AArch64InstrInfo.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64ISelLowering.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64SelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AArch64GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+
+class AArch64Subtarget : public AArch64GenSubtargetInfo {
+protected:
+  enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone};
+
+  /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
+  ARMProcFamilyEnum ARMProcFamily;
+
+  bool HasFPARMv8;
+  bool HasNEON;
+  bool HasCrypto;
+  bool HasCRC;
+
+  // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
+  bool HasZeroCycleRegMove;
+
+  // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
+  bool HasZeroCycleZeroing;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  const DataLayout DL;
+  AArch64FrameLowering FrameLowering;
+  AArch64InstrInfo InstrInfo;
+  AArch64SelectionDAGInfo TSInfo;
+  AArch64TargetLowering TLInfo;
+private:
+  /// initializeSubtargetDependencies - Initializes using CPUString and the
+  /// passed in feature string so that we can use initializer lists for
+  /// subtarget initialization.
+  AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  AArch64Subtarget(const std::string &TT, const std::string &CPU,
+		   const std::string &FS, TargetMachine &TM, bool LittleEndian);
+
+  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const AArch64FrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  const AArch64TargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+  const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  bool enableMachineScheduler() const override { return true; }
+
+  bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
+
+  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+
+  bool hasFPARMv8() const { return HasFPARMv8; }
+  bool hasNEON() const { return HasNEON; }
+  bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+
+  bool isLittleEndian() const { return DL.isLittleEndian(); }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool isCyclone() const { return CPUString == "cyclone"; }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return 64; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// ClassifyGlobalReference - Find the target operand flags that describe
+  /// how a global value should be referenced for the current subtarget.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered prefereable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
+                           MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
+
+  bool enableEarlyIfConversion() const override;
+};
+} // End llvm namespace
+
+#endif // AArch64SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
new file mode 100644
index 0000000..f99b90b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -0,0 +1,216 @@
+//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
+           cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
+                     cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar"
+                    " integer instructions"), cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote "
+                      "constant pass"), cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the"
+                 " linker optimization hints (LOH)"), cl::init(true),
+                 cl::Hidden);
+
+static cl::opt<bool>
+EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden,
+                              cl::desc("Enable the pass that removes dead"
+                                       " definitons and replaces stores to"
+                                       " them with stores to the zero"
+                                       " register"),
+                              cl::init(true));
+
+static cl::opt<bool>
+EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
+                   " optimization pass"), cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
+                 cl::desc("Run SimplifyCFG after expanding atomic operations"
+                          " to make use of cmpxchg flow-based information"),
+                 cl::init(true));
+
+extern "C" void LLVMInitializeAArch64Target() {
+  // Register the target.
+  RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
+  RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
+
+  RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget);
+  RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget);
+}
+
+/// TargetMachine ctor - Create an AArch64 architecture model.
+///
+AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL,
+                                           bool LittleEndian)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this, LittleEndian) {
+  initAsmInfo();
+}
+
+void AArch64leTargetMachine::anchor() { }
+
+AArch64leTargetMachine::
+AArch64leTargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL)
+  : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void AArch64beTargetMachine::anchor() { }
+
+AArch64beTargetMachine::
+AArch64beTargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL)
+  : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+namespace {
+/// AArch64 Code Generator Pass Configuration Options.
+class AArch64PassConfig : public TargetPassConfig {
+public:
+  AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  AArch64TargetMachine &getAArch64TargetMachine() const {
+    return getTM<AArch64TargetMachine>();
+  }
+
+  void addIRPasses()  override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addILPOpts() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our AArch64 pass. This
+  // allows the AArch64 pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createAArch64TargetTransformInfoPass(this));
+}
+
+TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AArch64PassConfig(this, PM);
+}
+
+void AArch64PassConfig::addIRPasses() {
+  // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
+  // ourselves.
+  addPass(createAtomicExpandLoadLinkedPass(TM));
+
+  // Cmpxchg instructions are often used with a subsequent comparison to
+  // determine whether it succeeded. We can exploit existing control-flow in
+  // ldrex/strex loops to simplify this, but it needs tidying up.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+    addPass(createCFGSimplificationPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
+// Pass Pipeline Configuration
+bool AArch64PassConfig::addPreISel() {
+  // Run promote constant before global merge, so that the promoted constants
+  // get a chance to be merged
+  if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
+    addPass(createAArch64PromoteConstantPass());
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createGlobalMergePass(TM));
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64AddressTypePromotionPass());
+
+  return false;
+}
+
+bool AArch64PassConfig::addInstSelector() {
+  addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
+  // references to _TLS_MODULE_BASE_ as possible.
+  if (TM->getSubtarget<AArch64Subtarget>().isTargetELF() &&
+      getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64CleanupLocalDynamicTLSPass());
+
+  return false;
+}
+
+bool AArch64PassConfig::addILPOpts() {
+  if (EnableCCMP)
+    addPass(createAArch64ConditionalCompares());
+  addPass(&EarlyIfConverterID);
+  if (EnableStPairSuppress)
+    addPass(createAArch64StorePairSuppressPass());
+  return true;
+}
+
+bool AArch64PassConfig::addPreRegAlloc() {
+  // Use AdvSIMD scalar instructions whenever profitable.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+    addPass(createAArch64AdvSIMDScalar());
+  return true;
+}
+
+bool AArch64PassConfig::addPostRegAlloc() {
+  // Change dead register definitions to refer to the zero register.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
+    addPass(createAArch64DeadRegisterDefinitions());
+  return true;
+}
+
+bool AArch64PassConfig::addPreSched2() {
+  // Expand some pseudo instructions to allow proper scheduling.
+  addPass(createAArch64ExpandPseudoPass());
+  // Use load/store pair instructions when possible.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
+    addPass(createAArch64LoadStoreOptimizationPass());
+  return true;
+}
+
+bool AArch64PassConfig::addPreEmitPass() {
+  // Relax conditional branch instructions if they're otherwise out of
+  // range of their destination.
+  addPass(createAArch64BranchRelaxation());
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
+      TM->getSubtarget<AArch64Subtarget>().isTargetMachO())
+    addPass(createAArch64CollectLOHPass());
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
new file mode 100644
index 0000000..852cb3f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -0,0 +1,87 @@
+//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64TARGETMACHINE_H
+#define AArch64TARGETMACHINE_H
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64TargetMachine : public LLVMTargetMachine {
+protected:
+  AArch64Subtarget Subtarget;
+
+public:
+  AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL, bool IsLittleEndian);
+
+  const AArch64Subtarget *getSubtargetImpl() const override {
+    return &Subtarget;
+  }
+  const AArch64TargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const AArch64FrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const AArch64InstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const AArch64RegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  /// \brief Register AArch64 analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
+};
+
+// AArch64leTargetMachine - AArch64 little endian target machine.
+//
+class AArch64leTargetMachine : public AArch64TargetMachine {
+  virtual void anchor();
+public:
+  AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
+                         Reloc::Model RM, CodeModel::Model CM,
+                         CodeGenOpt::Level OL);
+};
+
+// AArch64beTargetMachine - AArch64 big endian target machine.
+//
+class AArch64beTargetMachine : public AArch64TargetMachine {
+  virtual void anchor();
+public:
+  AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
+                         Reloc::Model RM, CodeModel::Model CM,
+                         CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
new file mode 100644
index 0000000..4069038
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -0,0 +1,52 @@
+//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetObjectFile.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Dwarf.h"
+using namespace llvm;
+using namespace dwarf;
+
+void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
+                                             const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+    const TargetMachine &TM, MachineModuleInfo *MMI,
+    MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+    const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+    const MCExpr *Res =
+        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+    MCSymbol *PCSym = getContext().CreateTempSymbol();
+    Streamer.EmitLabel(PCSym);
+    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
+    return MCBinaryExpr::CreateSub(Res, PC, getContext());
+  }
+
+  return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+      GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV, Mang);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
new file mode 100644
index 0000000..de63cb4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+class AArch64TargetMachine;
+
+/// This implementation is used for AArch64 ELF targets (Linux in particular).
+class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+public:
+  const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+                                        unsigned Encoding, Mangler &Mang,
+                                        const TargetMachine &TM,
+                                        MachineModuleInfo *MMI,
+                                        MCStreamer &Streamer) const override;
+
+  MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+                                    const TargetMachine &TM,
+                                    MachineModuleInfo *MMI) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
new file mode 100644
index 0000000..1dac14b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -0,0 +1,500 @@
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// AArch64 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64tti"
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeAArch64TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
+  const AArch64TargetMachine *TM;
+  const AArch64Subtarget *ST;
+  const AArch64TargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  AArch64TTI(const AArch64TargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  void initializePass() override { pushTTIStack(this); }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo *)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  unsigned getIntImmCost(int64_t Val) const;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 32;
+      return 0;
+    }
+    return 31;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+    return 64;
+  }
+
+  unsigned getMaximumUnrollFactor() const override { return 2; }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
+      override;
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
+      override;
+
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                  OperandValueKind Opd1Info = OK_AnyValue,
+                                  OperandValueKind Opd2Info = OK_AnyValue) const
+      override;
+
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
+      override;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
+                   "AArch64 Target Transform Info", true, true, false)
+char AArch64TTI::ID = 0;
+
+ImmutablePass *
+llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
+  return new AArch64TTI(TM);
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+  // Check if the immediate can be encoded within an instruction.
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
+    return 0;
+
+  if (Val < 0)
+    Val = ~Val;
+
+  // Calculate how many moves we will need to materialize this constant.
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  return (64 - LZ + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialze the constant.
+  return std::max(1U, Cost);
+}
+
+unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                 const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default:
+    return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+      ? static_cast<unsigned>(TCC_Free) : Cost;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                 const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  switch (IID) {
+  default:
+    return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if (Idx == 1) {
+      unsigned NumConstants = (BitSize + 63) / 64;
+      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TCC_Basic)
+        ? static_cast<unsigned>(TCC_Free) : Cost;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+AArch64TTI::PopcntSupportKind
+AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (TyWidth == 32 || TyWidth == 64)
+    return PSK_FastHardware;
+  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+  return PSK_Software;
+}
+
+unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+    // LowerVectorINT_TO_FP:
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+
+    // Complex: to v2f32
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+
+    // Complex: to v4f32
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+
+    // Complex: to v2f64
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+
+
+    // LowerVectorFP_TO_INT
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+
+    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
+
+    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
+    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
+
+    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
+  };
+
+  int Idx = ConvertCostTableLookup<MVT>(
+      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
+      SrcTy.getSimpleVT());
+  if (Idx != -1)
+    return ConversionTbl[Idx].Cost;
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // The element at index zero is already inside the vector.
+    if (Index == 0)
+      return 0;
+  }
+
+  // All other insert/extracts cost this much.
+  return 2;
+}
+
+unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind Opd1Info,
+                                          OperandValueKind Opd2Info) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  switch (ISD) {
+  default:
+    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
+                                                       Opd2Info);
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    // These nodes are marked as 'custom' for combining purposes only.
+    // We know that they are legal. See LowerAdd in ISelLowering.
+    return 1 * LT.first;
+  }
+}
+
+unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // We don't lower vector selects well that are wider than the register width.
+  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // We would need this many instructions to hide the scalarization happening.
+    unsigned AmortizationCost = 20;
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    VectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx =
+          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
+                                 SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return VectorSelectTbl[Idx].Cost;
+    }
+  }
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isIntegerTy(64)) {
+    // Unaligned stores are extremely inefficient. We don't split
+    // unaligned v2i64 stores because the negative impact that has shown in
+    // practice on inlined memcpy code.
+    // We make v2i64 stores expensive so that we will only vectorize if there
+    // are 6 other instructions getting vectorized.
+    unsigned AmortizationCost = 6;
+
+    return LT.first * 2 * AmortizationCost;
+  }
+
+  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+      Src->getVectorNumElements() < 8) {
+    // We scalarize the loads/stores because there is not v.4b register and we
+    // have to promote the elements to v.4h.
+    unsigned NumVecElts = Src->getVectorNumElements();
+    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+    // We generate 2 instructions per vector element.
+    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  }
+
+  return LT.first;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
new file mode 100644
index 0000000..37e9296
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -0,0 +1,4213 @@
+//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cstdio>
+using namespace llvm;
+
+namespace {
+
+class AArch64Operand;
+
+class AArch64AsmParser : public MCTargetAsmParser {
+private:
+  StringRef Mnemonic; ///< Instruction mnemonic.
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+
+  // Map of register aliases registers via the .req directive.
+  StringMap<std::pair<bool, unsigned> > RegisterReqs;
+
+  AArch64TargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<AArch64TargetStreamer &>(TS);
+  }
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+
+  bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+  AArch64CC::CondCode parseCondCodeString(StringRef Cond);
+  bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+  unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
+  int tryParseRegister();
+  int tryMatchVectorRegister(StringRef &Kind, bool expected);
+  bool parseRegister(OperandVector &Operands);
+  bool parseSymbolicImmVal(const MCExpr *&ImmVal);
+  bool parseVectorList(OperandVector &Operands);
+  bool parseOperand(OperandVector &Operands, bool isCondCode,
+                    bool invertCondCode);
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  bool showMatchError(SMLoc Loc, unsigned ErrCode);
+
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveTLSDescCall(SMLoc L);
+
+  bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+  bool parseDirectiveLtorg(SMLoc L);
+
+  bool parseDirectiveReq(StringRef Name, SMLoc L);
+  bool parseDirectiveUnreq(SMLoc L);
+
+  bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AArch64GenAsmMatcher.inc"
+
+  /// }
+
+  OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
+  OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+  OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
+  OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
+  OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+  OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+  OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
+  OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+  OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
+  OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+  OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
+  bool tryParseVectorRegister(OperandVector &Operands);
+
+public:
+  enum AArch64MatchResultTy {
+    Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "AArch64GenAsmMatcher.inc"
+  };
+  AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+                 const MCInstrInfo &MII,
+                 const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+    MCAsmParserExtension::Initialize(_Parser);
+    if (Parser.getStreamer().getTargetStreamer() == nullptr)
+      new AArch64TargetStreamer(Parser.getStreamer());
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  static bool classifySymbolRef(const MCExpr *Expr,
+                                AArch64MCExpr::VariantKind &ELFRefKind,
+                                MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                int64_t &Addend);
+};
+} // end anonymous namespace
+
+namespace {
+
+/// AArch64Operand - Instances of this class represent a parsed AArch64 machine
+/// instruction.
+class AArch64Operand : public MCParsedAsmOperand {
+private:
+  enum KindTy {
+    k_Immediate,
+    k_ShiftedImm,
+    k_CondCode,
+    k_Register,
+    k_VectorList,
+    k_VectorIndex,
+    k_Token,
+    k_SysReg,
+    k_SysCR,
+    k_Prefetch,
+    k_ShiftExtend,
+    k_FPImm,
+    k_Barrier
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+    bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+    bool isVector;
+  };
+
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned NumElements;
+    unsigned ElementKind;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct ShiftedImmOp {
+    const MCExpr *Val;
+    unsigned ShiftAmount;
+  };
+
+  struct CondCodeOp {
+    AArch64CC::CondCode Code;
+  };
+
+  struct FPImmOp {
+    unsigned Val; // Encoded 8-bit representation.
+  };
+
+  struct BarrierOp {
+    unsigned Val; // Not the enum since not all values have names.
+  };
+
+  struct SysRegOp {
+    const char *Data;
+    unsigned Length;
+    uint64_t FeatureBits; // We need to pass through information about which
+                          // core we are compiling for so that the SysReg
+                          // Mappers can appropriately conditionalize.
+  };
+
+  struct SysCRImmOp {
+    unsigned Val;
+  };
+
+  struct PrefetchOp {
+    unsigned Val;
+  };
+
+  struct ShiftExtendOp {
+    AArch64_AM::ShiftExtendType Type;
+    unsigned Amount;
+    bool HasExplicitAmount;
+  };
+
+  struct ExtendOp {
+    unsigned Val;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct ShiftedImmOp ShiftedImm;
+    struct CondCodeOp CondCode;
+    struct FPImmOp FPImm;
+    struct BarrierOp Barrier;
+    struct SysRegOp SysReg;
+    struct SysCRImmOp SysCRImm;
+    struct PrefetchOp Prefetch;
+    struct ShiftExtendOp ShiftExtend;
+  };
+
+  // Keep the MCContext around as the MCExprs may need manipulated during
+  // the add<>Operands() calls.
+  MCContext &Ctx;
+
+public:
+  AArch64Operand(KindTy K, MCContext &_Ctx)
+      : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {}
+
+  AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case k_Token:
+      Tok = o.Tok;
+      break;
+    case k_Immediate:
+      Imm = o.Imm;
+      break;
+    case k_ShiftedImm:
+      ShiftedImm = o.ShiftedImm;
+      break;
+    case k_CondCode:
+      CondCode = o.CondCode;
+      break;
+    case k_FPImm:
+      FPImm = o.FPImm;
+      break;
+    case k_Barrier:
+      Barrier = o.Barrier;
+      break;
+    case k_Register:
+      Reg = o.Reg;
+      break;
+    case k_VectorList:
+      VectorList = o.VectorList;
+      break;
+    case k_VectorIndex:
+      VectorIndex = o.VectorIndex;
+      break;
+    case k_SysReg:
+      SysReg = o.SysReg;
+      break;
+    case k_SysCR:
+      SysCRImm = o.SysCRImm;
+      break;
+    case k_Prefetch:
+      Prefetch = o.Prefetch;
+      break;
+    case k_ShiftExtend:
+      ShiftExtend = o.ShiftExtend;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  bool isTokenSuffix() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return Tok.IsSuffix;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == k_Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getShiftedImmVal() const {
+    assert(Kind == k_ShiftedImm && "Invalid access!");
+    return ShiftedImm.Val;
+  }
+
+  unsigned getShiftedImmShift() const {
+    assert(Kind == k_ShiftedImm && "Invalid access!");
+    return ShiftedImm.ShiftAmount;
+  }
+
+  AArch64CC::CondCode getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CondCode.Code;
+  }
+
+  unsigned getFPImm() const {
+    assert(Kind == k_FPImm && "Invalid access!");
+    return FPImm.Val;
+  }
+
+  unsigned getBarrier() const {
+    assert(Kind == k_Barrier && "Invalid access!");
+    return Barrier.Val;
+  }
+
+  unsigned getReg() const override {
+    assert(Kind == k_Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  unsigned getVectorListStart() const {
+    assert(Kind == k_VectorList && "Invalid access!");
+    return VectorList.RegNum;
+  }
+
+  unsigned getVectorListCount() const {
+    assert(Kind == k_VectorList && "Invalid access!");
+    return VectorList.Count;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
+  StringRef getSysReg() const {
+    assert(Kind == k_SysReg && "Invalid access!");
+    return StringRef(SysReg.Data, SysReg.Length);
+  }
+
+  uint64_t getSysRegFeatureBits() const {
+    assert(Kind == k_SysReg && "Invalid access!");
+    return SysReg.FeatureBits;
+  }
+
+  unsigned getSysCR() const {
+    assert(Kind == k_SysCR && "Invalid access!");
+    return SysCRImm.Val;
+  }
+
+  unsigned getPrefetch() const {
+    assert(Kind == k_Prefetch && "Invalid access!");
+    return Prefetch.Val;
+  }
+
+  AArch64_AM::ShiftExtendType getShiftExtendType() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.Type;
+  }
+
+  unsigned getShiftExtendAmount() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.Amount;
+  }
+
+  bool hasShiftExtendAmount() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.HasExplicitAmount;
+  }
+
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override { return false; }
+  bool isSImm9() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -256 && Val < 256);
+  }
+  bool isSImm7s4() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+  }
+  bool isSImm7s8() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+  }
+  bool isSImm7s16() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+  }
+
+  bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
+                                           Addend)) {
+      // If we don't understand the expression, assume the best and
+      // let the fixup and relocation code deal with it.
+      return true;
+    }
+
+    if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+        ELFRefKind == AArch64MCExpr::VK_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) {
+      // Note that we don't range-check the addend. It's adjusted modulo page
+      // size when converted, so there is no "out of range" condition when using
+      // @pageoff.
+      return Addend >= 0 && (Addend % Scale) == 0;
+    } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
+               DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) {
+      // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
+      return Addend == 0;
+    }
+
+    return false;
+  }
+
+  template <int Scale> bool isUImm12Offset() const {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return isSymbolicUImm12Offset(getImm(), Scale);
+
+    int64_t Val = MCE->getValue();
+    return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
+  }
+
+  bool isImm0_7() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 8);
+  }
+  bool isImm1_8() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val > 0 && Val < 9);
+  }
+  bool isImm0_15() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 16);
+  }
+  bool isImm1_16() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val > 0 && Val < 17);
+  }
+  bool isImm0_31() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 32);
+  }
+  bool isImm1_31() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 32);
+  }
+  bool isImm1_32() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 33);
+  }
+  bool isImm0_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 64);
+  }
+  bool isImm1_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 64);
+  }
+  bool isImm1_64() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 65);
+  }
+  bool isImm0_127() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 128);
+  }
+  bool isImm0_255() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 256);
+  }
+  bool isImm0_65535() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 65536);
+  }
+  bool isImm32_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 32 && Val < 64);
+  }
+  bool isLogicalImm32() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    if (Val >> 32 != 0 && Val >> 32 != ~0LL)
+      return false;
+    Val &= 0xFFFFFFFF;
+    return AArch64_AM::isLogicalImmediate(Val, 32);
+  }
+  bool isLogicalImm64() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
+  }
+  bool isLogicalImm32Not() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+    return AArch64_AM::isLogicalImmediate(Val, 32);
+  }
+  bool isLogicalImm64Not() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64);
+  }
+  bool isShiftedImm() const { return Kind == k_ShiftedImm; }
+  bool isAddSubImm() const {
+    if (!isShiftedImm() && !isImm())
+      return false;
+
+    const MCExpr *Expr;
+
+    // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+    if (isShiftedImm()) {
+      unsigned Shift = ShiftedImm.ShiftAmount;
+      Expr = ShiftedImm.Val;
+      if (Shift != 0 && Shift != 12)
+        return false;
+    } else {
+      Expr = getImm();
+    }
+
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
+                                          DarwinRefKind, Addend)) {
+      return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
+          || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
+          || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
+          || ELFRefKind == AArch64MCExpr::VK_LO12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
+          || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12;
+    }
+
+    // Otherwise it should be a real immediate in range:
+    const MCConstantExpr *CE = cast<MCConstantExpr>(Expr);
+    return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+  }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isSIMDImmType10() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
+  }
+  bool isBranchTarget26() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
+  }
+  bool isPCRelLabel19() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
+  }
+  bool isBranchTarget14() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+  }
+
+  bool
+  isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
+    if (!isImm())
+      return false;
+
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind,
+                                             DarwinRefKind, Addend)) {
+      return false;
+    }
+    if (DarwinRefKind != MCSymbolRefExpr::VK_None)
+      return false;
+
+    for (unsigned i = 0; i != AllowedModifiers.size(); ++i) {
+      if (ELFRefKind == AllowedModifiers[i])
+        return Addend == 0;
+    }
+
+    return false;
+  }
+
+  bool isMovZSymbolG3() const {
+    static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 };
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovZSymbolG2() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+        AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2};
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovZSymbolG1() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G1,      AArch64MCExpr::VK_ABS_G1_S,
+        AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
+        AArch64MCExpr::VK_DTPREL_G1,
+    };
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovZSymbolG0() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+        AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0};
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovKSymbolG3() const {
+    static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 };
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovKSymbolG2() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G2_NC};
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovKSymbolG1() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+      AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC,
+      AArch64MCExpr::VK_DTPREL_G1_NC
+    };
+    return isMovWSymbol(Variants);
+  }
+
+  bool isMovKSymbolG0() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+      AArch64MCExpr::VK_ABS_G0_NC,   AArch64MCExpr::VK_GOTTPREL_G0_NC,
+      AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC
+    };
+    return isMovWSymbol(Variants);
+  }
+
+  template<int RegWidth, int Shift>
+  bool isMOVZMovAlias() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+
+    if (RegWidth == 32)
+      Value &= 0xffffffffULL;
+
+    // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+    if (Value == 0 && Shift != 0)
+      return false;
+
+    return (Value & ~(0xffffULL << Shift)) == 0;
+  }
+
+  template<int RegWidth, int Shift>
+  bool isMOVNMovAlias() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+
+    // MOVZ takes precedence over MOVN.
+    for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
+      if ((Value & ~(0xffffULL << MOVZShift)) == 0)
+        return false;
+
+    Value = ~Value;
+    if (RegWidth == 32)
+      Value &= 0xffffffffULL;
+
+    return (Value & ~(0xffffULL << Shift)) == 0;
+  }
+
+  bool isFPImm() const { return Kind == k_FPImm; }
+  bool isBarrier() const { return Kind == k_Barrier; }
+  bool isSysReg() const { return Kind == k_SysReg; }
+  bool isMRSSystemRegister() const {
+    if (!isSysReg()) return false;
+
+    bool IsKnownRegister;
+    auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
+    Mapper.fromString(getSysReg(), IsKnownRegister);
+
+    return IsKnownRegister;
+  }
+  bool isMSRSystemRegister() const {
+    if (!isSysReg()) return false;
+
+    bool IsKnownRegister;
+    auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
+    Mapper.fromString(getSysReg(), IsKnownRegister);
+
+    return IsKnownRegister;
+  }
+  bool isSystemPStateField() const {
+    if (!isSysReg()) return false;
+
+    bool IsKnownRegister;
+    AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister);
+
+    return IsKnownRegister;
+  }
+  bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
+  bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+  bool isVectorRegLo() const {
+    return Kind == k_Register && Reg.isVector &&
+           AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+               Reg.RegNum);
+  }
+  bool isGPR32as64() const {
+    return Kind == k_Register && !Reg.isVector &&
+      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
+  }
+
+  bool isGPR64sp0() const {
+    return Kind == k_Register && !Reg.isVector &&
+      AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
+  }
+
+  /// Is this a vector list with the type implicit (presumably attached to the
+  /// instruction itself)?
+  template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+    return Kind == k_VectorList && VectorList.Count == NumRegs &&
+           !VectorList.ElementKind;
+  }
+
+  template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+  bool isTypedVectorList() const {
+    if (Kind != k_VectorList)
+      return false;
+    if (VectorList.Count != NumRegs)
+      return false;
+    if (VectorList.ElementKind != ElementKind)
+      return false;
+    return VectorList.NumElements == NumElements;
+  }
+
+  bool isVectorIndex1() const {
+    return Kind == k_VectorIndex && VectorIndex.Val == 1;
+  }
+  bool isVectorIndexB() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 16;
+  }
+  bool isVectorIndexH() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 8;
+  }
+  bool isVectorIndexS() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 4;
+  }
+  bool isVectorIndexD() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 2;
+  }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isTokenEqual(StringRef Str) const {
+    return Kind == k_Token && getToken() == Str;
+  }
+  bool isSysCR() const { return Kind == k_SysCR; }
+  bool isPrefetch() const { return Kind == k_Prefetch; }
+  bool isShiftExtend() const { return Kind == k_ShiftExtend; }
+  bool isShifter() const {
+    if (!isShiftExtend())
+      return false;
+
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
+            ST == AArch64_AM::MSL);
+  }
+  bool isExtend() const {
+    if (!isShiftExtend())
+      return false;
+
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
+            ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
+            ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW ||
+            ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
+  }
+
+  bool isExtend64() const {
+    if (!isExtend())
+      return false;
+    // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX;
+  }
+  bool isExtendLSL64() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
+  }
+
+  template<int Width> bool isMemXExtend() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
+  }
+
+  template<int Width> bool isMemWExtend() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
+  }
+
+  template <unsigned width>
+  bool isArithmeticShifter() const {
+    if (!isShifter())
+      return false;
+
+    // An arithmetic shifter is LSL, LSR, or ASR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
+  }
+
+  template <unsigned width>
+  bool isLogicalShifter() const {
+    if (!isShifter())
+      return false;
+
+    // A logical shifter is LSL, LSR, ASR or ROR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) &&
+           getShiftExtendAmount() < width;
+  }
+
+  bool isMovImm32Shifter() const {
+    if (!isShifter())
+      return false;
+
+    // A MOVi shifter is LSL of 0, 16, 32, or 48.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16);
+  }
+
+  bool isMovImm64Shifter() const {
+    if (!isShifter())
+      return false;
+
+    // A MOVi shifter is LSL of 0 or 16.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
+  }
+
+  bool isLogicalVecShifter() const {
+    if (!isShifter())
+      return false;
+
+    // A logical vector shifter is a left shift by 0, 8, 16, or 24.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
+  }
+
+  bool isLogicalVecHalfWordShifter() const {
+    if (!isLogicalVecShifter())
+      return false;
+
+    // A logical vector shifter is a left shift by 0 or 8.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8);
+  }
+
+  bool isMoveVecShifter() const {
+    if (!isShiftExtend())
+      return false;
+
+    // A logical vector shifter is a left shift by 8 or 16.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::MSL &&
+           (Shift == 8 || Shift == 16);
+  }
+
+  // Fallback unscaled operands are for aliases of LDR/STR that fall back
+  // to LDUR/STUR when the offset is not legal for the former but is for
+  // the latter. As such, in addition to checking for being a legal unscaled
+  // address, also check that it is not a legal scaled address. This avoids
+  // ambiguity in the matcher.
+  template<int Width>
+  bool isSImm9OffsetFB() const {
+    return isSImm9() && !isUImm12Offset<Width / 8>();
+  }
+
+  bool isAdrpLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+        return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (4096 * (1LL << (21 - 1)));
+      int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
+      return (Val % 4096) == 0 && Val >= Min && Val <= Max;
+    }
+
+    return true;
+  }
+
+  bool isAdrLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+        return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (1LL << (21 - 1));
+      int64_t Max = ((1LL << (21 - 1)) - 1);
+      return Val >= Min && Val <= Max;
+    }
+
+    return true;
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg()));
+
+    const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+    uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister(
+        RI->getEncodingValue(getReg()));
+
+    Inst.addOperand(MCOperand::CreateReg(Reg));
+  }
+
+  void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+    Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0));
+  }
+
+  void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  template <unsigned NumRegs>
+  void addVectorList64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    static unsigned FirstRegs[] = { AArch64::D0,       AArch64::D0_D1,
+                                    AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 };
+    unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+    Inst.addOperand(
+        MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0));
+  }
+
+  template <unsigned NumRegs>
+  void addVectorList128Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    static unsigned FirstRegs[] = { AArch64::Q0,       AArch64::Q0_Q1,
+                                    AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 };
+    unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+    Inst.addOperand(
+        MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0));
+  }
+
+  void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // If this is a pageoff symrefexpr with an addend, adjust the addend
+    // to be only the page-offset portion. Otherwise, just add the expr
+    // as-is.
+    addExpr(Inst, getImm());
+  }
+
+  void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (isShiftedImm()) {
+      addExpr(Inst, getShiftedImmVal());
+      Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift()));
+    } else {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+    }
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCondCode()));
+  }
+
+  void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      addExpr(Inst, getImm());
+    else
+      Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12));
+  }
+
+  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  template<int Scale>
+  void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+
+    if (!MCE) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      return;
+    }
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale));
+  }
+
+  void addSImm9Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
+  }
+
+  void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
+  }
+
+  void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
+  }
+
+  void addImm0_7Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm1_8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm0_15Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm1_31Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm0_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm1_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm1_64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm0_127Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm0_255Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addImm32_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
+
+  void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding =
+        AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32);
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
+
+  void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
+
+  void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+    uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32);
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
+
+  void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding =
+        AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64);
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
+
+  void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
+
+  void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+  }
+
+  void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+  }
+
+  void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+  }
+
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getFPImm()));
+  }
+
+  void addBarrierOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getBarrier()));
+  }
+
+  void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    bool Valid;
+    auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
+    uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    bool Valid;
+    auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
+    uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    bool Valid;
+    uint32_t Bits =
+        AArch64PState::PStateMapper().fromString(getSysReg(), Valid);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addSysCROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getSysCR()));
+  }
+
+  void addPrefetchOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getPrefetch()));
+  }
+
+  void addShifterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned Imm =
+        AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount());
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW;
+    unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addExtend64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX;
+    unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addMemExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+    Inst.addOperand(MCOperand::CreateImm(IsSigned));
+    Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0));
+  }
+
+  // For 8-bit load/store instructions with a register offset, both the
+  // "DoShift" and "NoShift" variants have a shift of 0. Because of this,
+  // they're disambiguated by whether the shift was explicit or implicit rather
+  // than its size.
+  void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+    Inst.addOperand(MCOperand::CreateImm(IsSigned));
+    Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount()));
+  }
+
+  template<int Shift>
+  void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff));
+  }
+
+  template<int Shift>
+  void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff));
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<AArch64Operand>
+  CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Token, Ctx);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->Tok.IsSuffix = IsSuffix;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
+    Op->Reg.RegNum = RegNum;
+    Op->Reg.isVector = isVector;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
+                   char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.NumElements = NumElements;
+    Op->VectorList.ElementKind = ElementKind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                   SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val,
+                                                          unsigned ShiftAmount,
+                                                          SMLoc S, SMLoc E,
+                                                          MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
+    Op->ShiftedImm .Val = Val;
+    Op->ShiftedImm.ShiftAmount = ShiftAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx);
+    Op->CondCode.Code = Code;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S,
+                                                     MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
+    Op->FPImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, SMLoc S,
+                                                       MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx);
+    Op->Barrier.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateSysReg(StringRef Str, SMLoc S, uint64_t FeatureBits, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx);
+    Op->SysReg.Data = Str.data();
+    Op->SysReg.Length = Str.size();
+    Op->SysReg.FeatureBits = FeatureBits;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
+                                                     SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx);
+    Op->SysCRImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val, SMLoc S,
+                                                        MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
+    Op->Prefetch.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
+                    bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
+    Op->ShiftExtend.Type = ShOp;
+    Op->ShiftExtend.Amount = Val;
+    Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+};
+
+} // end anonymous namespace.
+
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_FPImm:
+    OS << "<fpimm " << getFPImm() << "("
+       << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+    break;
+  case k_Barrier: {
+    bool Valid;
+    StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid);
+    if (Valid)
+      OS << "<barrier " << Name << ">";
+    else
+      OS << "<barrier invalid #" << getBarrier() << ">";
+    break;
+  }
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_ShiftedImm: {
+    unsigned Shift = getShiftedImmShift();
+    OS << "<shiftedimm ";
+    getShiftedImmVal()->print(OS);
+    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+    break;
+  }
+  case k_CondCode:
+    OS << "<condcode " << getCondCode() << ">";
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_VectorList: {
+    OS << "<vectorlist ";
+    unsigned Reg = getVectorListStart();
+    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+      OS << Reg + i << " ";
+    OS << ">";
+    break;
+  }
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  case k_SysReg:
+    OS << "<sysreg: " << getSysReg() << '>';
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_SysCR:
+    OS << "c" << getSysCR();
+    break;
+  case k_Prefetch: {
+    bool Valid;
+    StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid);
+    if (Valid)
+      OS << "<prfop " << Name << ">";
+    else
+      OS << "<prfop invalid #" << getPrefetch() << ">";
+    break;
+  }
+  case k_ShiftExtend: {
+    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+       << getShiftExtendAmount();
+    if (!hasShiftExtendAmount())
+      OS << "<imp>";
+    OS << '>';
+    break;
+  }
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static unsigned matchVectorRegName(StringRef Name) {
+  return StringSwitch<unsigned>(Name)
+      .Case("v0", AArch64::Q0)
+      .Case("v1", AArch64::Q1)
+      .Case("v2", AArch64::Q2)
+      .Case("v3", AArch64::Q3)
+      .Case("v4", AArch64::Q4)
+      .Case("v5", AArch64::Q5)
+      .Case("v6", AArch64::Q6)
+      .Case("v7", AArch64::Q7)
+      .Case("v8", AArch64::Q8)
+      .Case("v9", AArch64::Q9)
+      .Case("v10", AArch64::Q10)
+      .Case("v11", AArch64::Q11)
+      .Case("v12", AArch64::Q12)
+      .Case("v13", AArch64::Q13)
+      .Case("v14", AArch64::Q14)
+      .Case("v15", AArch64::Q15)
+      .Case("v16", AArch64::Q16)
+      .Case("v17", AArch64::Q17)
+      .Case("v18", AArch64::Q18)
+      .Case("v19", AArch64::Q19)
+      .Case("v20", AArch64::Q20)
+      .Case("v21", AArch64::Q21)
+      .Case("v22", AArch64::Q22)
+      .Case("v23", AArch64::Q23)
+      .Case("v24", AArch64::Q24)
+      .Case("v25", AArch64::Q25)
+      .Case("v26", AArch64::Q26)
+      .Case("v27", AArch64::Q27)
+      .Case("v28", AArch64::Q28)
+      .Case("v29", AArch64::Q29)
+      .Case("v30", AArch64::Q30)
+      .Case("v31", AArch64::Q31)
+      .Default(0);
+}
+
+static bool isValidVectorKind(StringRef Name) {
+  return StringSwitch<bool>(Name.lower())
+      .Case(".8b", true)
+      .Case(".16b", true)
+      .Case(".4h", true)
+      .Case(".8h", true)
+      .Case(".2s", true)
+      .Case(".4s", true)
+      .Case(".1d", true)
+      .Case(".2d", true)
+      .Case(".1q", true)
+      // Accept the width neutral ones, too, for verbose syntax. If those
+      // aren't used in the right places, the token operand won't match so
+      // all will work out.
+      .Case(".b", true)
+      .Case(".h", true)
+      .Case(".s", true)
+      .Case(".d", true)
+      .Default(false);
+}
+
+static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
+                                 char &ElementKind) {
+  assert(isValidVectorKind(Name));
+
+  ElementKind = Name.lower()[Name.size() - 1];
+  NumElements = 0;
+
+  if (Name.size() == 2)
+    return;
+
+  // Parse the lane count
+  Name = Name.drop_front();
+  while (isdigit(Name.front())) {
+    NumElements = 10 * NumElements + (Name.front() - '0');
+    Name = Name.drop_front();
+  }
+}
+
+bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                     SMLoc &EndLoc) {
+  StartLoc = getLoc();
+  RegNo = tryParseRegister();
+  EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  return (RegNo == (unsigned)-1);
+}
+
+// Matches a register name or register alias previously defined by '.req'
+unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
+                                                  bool isVector) {
+  unsigned RegNum = isVector ? matchVectorRegName(Name)
+                             : MatchRegisterName(Name);
+
+  if (RegNum == 0) {
+    // Check for aliases registered via .req. Canonicalize to lower case.
+    // That's more consistent since register names are case insensitive, and
+    // it's how the original entry was passed in from MC/MCParser/AsmParser.
+    auto Entry = RegisterReqs.find(Name.lower());
+    if (Entry == RegisterReqs.end())
+      return 0;
+    // set RegNum if the match is the right kind of register
+    if (isVector == Entry->getValue().first)
+      RegNum = Entry->getValue().second;
+  }
+  return RegNum;
+}
+
+/// tryParseRegister - Try to parse a register name. The token must be an
+/// Identifier when called, and if it is a register name the token is eaten and
+/// the register is added to the operand list.
+int AArch64AsmParser::tryParseRegister() {
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  std::string lowerCase = Tok.getString().lower();
+  unsigned RegNum = matchRegisterNameAlias(lowerCase, false);
+  // Also handle a few aliases of registers.
+  if (RegNum == 0)
+    RegNum = StringSwitch<unsigned>(lowerCase)
+                 .Case("fp",  AArch64::FP)
+                 .Case("lr",  AArch64::LR)
+                 .Case("x31", AArch64::XZR)
+                 .Case("w31", AArch64::WZR)
+                 .Default(0);
+
+  if (RegNum == 0)
+    return -1;
+
+  Parser.Lex(); // Eat identifier token.
+  return RegNum;
+}
+
+/// tryMatchVectorRegister - Try to parse a vector register name with optional
+/// kind specifier. If it is a register specifier, eat the token and return it.
+int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    TokError("vector register expected");
+    return -1;
+  }
+
+  StringRef Name = Parser.getTok().getString();
+  // If there is a kind specifier, it's separated from the register name by
+  // a '.'.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+  unsigned RegNum = matchRegisterNameAlias(Head, true);
+
+  if (RegNum) {
+    if (Next != StringRef::npos) {
+      Kind = Name.slice(Next, StringRef::npos);
+      if (!isValidVectorKind(Kind)) {
+        TokError("invalid vector kind qualifier");
+        return -1;
+      }
+    }
+    Parser.Lex(); // Eat the register token.
+    return RegNum;
+  }
+
+  if (expected)
+    TokError("vector register expected");
+  return -1;
+}
+
+/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+  SMLoc S = getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  StringRef Tok = Parser.getTok().getIdentifier();
+  if (Tok[0] != 'c' && Tok[0] != 'C') {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  uint32_t CRNum;
+  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
+  if (BadNum || CRNum > 15) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(
+      AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
+  return MatchOperand_Success;
+}
+
+/// tryParsePrefetch - Try to parse a prefetch operand.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  // Either an identifier for named values or a 5-bit immediate.
+  bool Hash = Tok.is(AsmToken::Hash);
+  if (Hash || Tok.is(AsmToken::Integer)) {
+    if (Hash)
+      Parser.Lex(); // Eat hash token.
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for prefetch operand");
+      return MatchOperand_ParseFail;
+    }
+    unsigned prfop = MCE->getValue();
+    if (prfop > 31) {
+      TokError("prefetch operand out of range, [0,31] expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("pre-fetch hint expected");
+    return MatchOperand_ParseFail;
+  }
+
+  bool Valid;
+  unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid);
+  if (!Valid) {
+    TokError("pre-fetch hint expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+  return MatchOperand_Success;
+}
+
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
+
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
+  }
+
+  if (parseSymbolicImmVal(Expr))
+    return MatchOperand_ParseFail;
+
+  AArch64MCExpr::VariantKind ELFRefKind;
+  MCSymbolRefExpr::VariantKind DarwinRefKind;
+  int64_t Addend;
+  if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+    if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+        ELFRefKind == AArch64MCExpr::VK_INVALID) {
+      // No modifier was specified at all; this is the syntax for an ELF basic
+      // ADRP relocation (unfortunately).
+      Expr =
+          AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+    } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+                DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+               Addend != 0) {
+      Error(S, "gotpage label reference not allowed an addend");
+      return MatchOperand_ParseFail;
+    } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
+      // The operand must be an @page or @gotpage qualified symbolref.
+      Error(S, "page or gotpage label reference expected");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  // We have either a label reference possibly with addend or an immediate. The
+  // addend is a raw value here. The linker will adjust it to only reference the
+  // page.
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+  return MatchOperand_Success;
+}
+
+/// tryParseAdrLabel - Parse and validate a source label for the ADR
+/// instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
+
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
+  }
+
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
+
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+  return MatchOperand_Success;
+}
+
+/// tryParseFPImm - A floating point immediate expression operand.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+  SMLoc S = getLoc();
+
+  bool Hash = false;
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat '#'
+    Hash = true;
+  }
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.is(AsmToken::Real)) {
+    APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 63;
+    int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    Parser.Lex(); // Eat the token.
+    // Check for out of range values. As an exception, we let Zero through,
+    // as we handle that special case in post-processing before matching in
+    // order to use the zero register for it.
+    if (Val == -1 && !RealVal.isZero()) {
+      TokError("expected compatible register or floating-point constant");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+  if (Tok.is(AsmToken::Integer)) {
+    int64_t Val;
+    if (!isNegative && Tok.getString().startswith("0x")) {
+      Val = Tok.getIntVal();
+      if (Val > 255 || Val < 0) {
+        TokError("encoded floating point value out of range");
+        return MatchOperand_ParseFail;
+      }
+    } else {
+      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      // If we had a '-' in front, toggle the sign bit.
+      IntVal ^= (uint64_t)isNegative << 63;
+      Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    }
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (!Hash)
+    return MatchOperand_NoMatch;
+
+  TokError("invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+  SMLoc S = getLoc();
+
+  if (Parser.getTok().is(AsmToken::Hash))
+    Parser.Lex(); // Eat '#'
+  else if (Parser.getTok().isNot(AsmToken::Integer))
+    // Operand should start from # or should be integer, emit error otherwise.
+    return MatchOperand_NoMatch;
+
+  const MCExpr *Imm;
+  if (parseSymbolicImmVal(Imm))
+    return MatchOperand_ParseFail;
+  else if (Parser.getTok().isNot(AsmToken::Comma)) {
+    uint64_t ShiftAmount = 0;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
+    if (MCE) {
+      int64_t Val = MCE->getValue();
+      if (Val > 0xfff && (Val & 0xfff) == 0) {
+        Imm = MCConstantExpr::Create(Val >> 12, getContext());
+        ShiftAmount = 12;
+      }
+    }
+    SMLoc E = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
+                                                        getContext()));
+    return MatchOperand_Success;
+  }
+
+  // Eat ','
+  Parser.Lex();
+
+  // The optional operand must be "lsl #N" where N is non-negative.
+  if (!Parser.getTok().is(AsmToken::Identifier) ||
+      !Parser.getTok().getIdentifier().equals_lower("lsl")) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  // Eat 'lsl'
+  Parser.Lex();
+
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex();
+  }
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t ShiftAmount = Parser.getTok().getIntVal();
+
+  if (ShiftAmount < 0) {
+    Error(Parser.getTok().getLoc(), "positive shift amount required");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the number
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
+                                                      S, E, getContext()));
+  return MatchOperand_Success;
+}
+
+/// parseCondCodeString - Parse a Condition Code string.
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+  AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+                    .Case("eq", AArch64CC::EQ)
+                    .Case("ne", AArch64CC::NE)
+                    .Case("cs", AArch64CC::HS)
+                    .Case("hs", AArch64CC::HS)
+                    .Case("cc", AArch64CC::LO)
+                    .Case("lo", AArch64CC::LO)
+                    .Case("mi", AArch64CC::MI)
+                    .Case("pl", AArch64CC::PL)
+                    .Case("vs", AArch64CC::VS)
+                    .Case("vc", AArch64CC::VC)
+                    .Case("hi", AArch64CC::HI)
+                    .Case("ls", AArch64CC::LS)
+                    .Case("ge", AArch64CC::GE)
+                    .Case("lt", AArch64CC::LT)
+                    .Case("gt", AArch64CC::GT)
+                    .Case("le", AArch64CC::LE)
+                    .Case("al", AArch64CC::AL)
+                    .Case("nv", AArch64CC::NV)
+                    .Default(AArch64CC::Invalid);
+  return CC;
+}
+
+/// parseCondCode - Parse a Condition Code operand.
+bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
+                                     bool invertCondCode) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  StringRef Cond = Tok.getString();
+  AArch64CC::CondCode CC = parseCondCodeString(Cond);
+  if (CC == AArch64CC::Invalid)
+    return TokError("invalid condition code");
+  Parser.Lex(); // Eat identifier token.
+
+  if (invertCondCode) {
+    if (CC == AArch64CC::AL || CC == AArch64CC::NV)
+      return TokError("condition codes AL and NV are invalid for this instruction");
+    CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC));
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext()));
+  return false;
+}
+
+/// tryParseOptionalShift - Some operands take an optional shift argument. Parse
+/// them if present.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  std::string LowerID = Tok.getString().lower();
+  AArch64_AM::ShiftExtendType ShOp =
+      StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
+          .Case("lsl", AArch64_AM::LSL)
+          .Case("lsr", AArch64_AM::LSR)
+          .Case("asr", AArch64_AM::ASR)
+          .Case("ror", AArch64_AM::ROR)
+          .Case("msl", AArch64_AM::MSL)
+          .Case("uxtb", AArch64_AM::UXTB)
+          .Case("uxth", AArch64_AM::UXTH)
+          .Case("uxtw", AArch64_AM::UXTW)
+          .Case("uxtx", AArch64_AM::UXTX)
+          .Case("sxtb", AArch64_AM::SXTB)
+          .Case("sxth", AArch64_AM::SXTH)
+          .Case("sxtw", AArch64_AM::SXTW)
+          .Case("sxtx", AArch64_AM::SXTX)
+          .Default(AArch64_AM::InvalidShiftExtend);
+
+  if (ShOp == AArch64_AM::InvalidShiftExtend)
+    return MatchOperand_NoMatch;
+
+  SMLoc S = Tok.getLoc();
+  Parser.Lex();
+
+  bool Hash = getLexer().is(AsmToken::Hash);
+  if (!Hash && getLexer().isNot(AsmToken::Integer)) {
+    if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR ||
+        ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR ||
+        ShOp == AArch64_AM::MSL) {
+      // We expect a number here.
+      TokError("expected #imm after shift specifier");
+      return MatchOperand_ParseFail;
+    }
+
+    // "extend" type operatoins don't need an immediate, #0 is implicit.
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(
+        AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (Hash)
+    Parser.Lex(); // Eat the '#'.
+
+  // Make sure we do actually have a number
+  if (!Parser.getTok().is(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(),
+          "expected integer shift amount");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *ImmVal;
+  if (getParser().parseExpression(ImmVal))
+    return MatchOperand_ParseFail;
+
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+  if (!MCE) {
+    TokError("expected #imm after shift specifier");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateShiftExtend(
+      ShOp, MCE->getValue(), true, S, E, getContext()));
+  return MatchOperand_Success;
+}
+
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+                                   OperandVector &Operands) {
+  if (Name.find('.') != StringRef::npos)
+    return TokError("invalid operand");
+
+  Mnemonic = Name;
+  Operands.push_back(
+      AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+
+  const AsmToken &Tok = Parser.getTok();
+  StringRef Op = Tok.getString();
+  SMLoc S = Tok.getLoc();
+
+  const MCExpr *Expr = nullptr;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
+  do {                                                                         \
+    Expr = MCConstantExpr::Create(op1, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));           \
+    Expr = MCConstantExpr::Create(op2, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+  } while (0)
+
+  if (Mnemonic == "ic") {
+    if (!Op.compare_lower("ialluis")) {
+      // SYS #0, C7, C1, #0
+      SYS_ALIAS(0, 7, 1, 0);
+    } else if (!Op.compare_lower("iallu")) {
+      // SYS #0, C7, C5, #0
+      SYS_ALIAS(0, 7, 5, 0);
+    } else if (!Op.compare_lower("ivau")) {
+      // SYS #3, C7, C5, #1
+      SYS_ALIAS(3, 7, 5, 1);
+    } else {
+      return TokError("invalid operand for IC instruction");
+    }
+  } else if (Mnemonic == "dc") {
+    if (!Op.compare_lower("zva")) {
+      // SYS #3, C7, C4, #1
+      SYS_ALIAS(3, 7, 4, 1);
+    } else if (!Op.compare_lower("ivac")) {
+      // SYS #3, C7, C6, #1
+      SYS_ALIAS(0, 7, 6, 1);
+    } else if (!Op.compare_lower("isw")) {
+      // SYS #0, C7, C6, #2
+      SYS_ALIAS(0, 7, 6, 2);
+    } else if (!Op.compare_lower("cvac")) {
+      // SYS #3, C7, C10, #1
+      SYS_ALIAS(3, 7, 10, 1);
+    } else if (!Op.compare_lower("csw")) {
+      // SYS #0, C7, C10, #2
+      SYS_ALIAS(0, 7, 10, 2);
+    } else if (!Op.compare_lower("cvau")) {
+      // SYS #3, C7, C11, #1
+      SYS_ALIAS(3, 7, 11, 1);
+    } else if (!Op.compare_lower("civac")) {
+      // SYS #3, C7, C14, #1
+      SYS_ALIAS(3, 7, 14, 1);
+    } else if (!Op.compare_lower("cisw")) {
+      // SYS #0, C7, C14, #2
+      SYS_ALIAS(0, 7, 14, 2);
+    } else {
+      return TokError("invalid operand for DC instruction");
+    }
+  } else if (Mnemonic == "at") {
+    if (!Op.compare_lower("s1e1r")) {
+      // SYS #0, C7, C8, #0
+      SYS_ALIAS(0, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e2r")) {
+      // SYS #4, C7, C8, #0
+      SYS_ALIAS(4, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e3r")) {
+      // SYS #6, C7, C8, #0
+      SYS_ALIAS(6, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e1w")) {
+      // SYS #0, C7, C8, #1
+      SYS_ALIAS(0, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e2w")) {
+      // SYS #4, C7, C8, #1
+      SYS_ALIAS(4, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e3w")) {
+      // SYS #6, C7, C8, #1
+      SYS_ALIAS(6, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e0r")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 2);
+    } else if (!Op.compare_lower("s1e0w")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 3);
+    } else if (!Op.compare_lower("s12e1r")) {
+      // SYS #4, C7, C8, #4
+      SYS_ALIAS(4, 7, 8, 4);
+    } else if (!Op.compare_lower("s12e1w")) {
+      // SYS #4, C7, C8, #5
+      SYS_ALIAS(4, 7, 8, 5);
+    } else if (!Op.compare_lower("s12e0r")) {
+      // SYS #4, C7, C8, #6
+      SYS_ALIAS(4, 7, 8, 6);
+    } else if (!Op.compare_lower("s12e0w")) {
+      // SYS #4, C7, C8, #7
+      SYS_ALIAS(4, 7, 8, 7);
+    } else {
+      return TokError("invalid operand for AT instruction");
+    }
+  } else if (Mnemonic == "tlbi") {
+    if (!Op.compare_lower("vmalle1is")) {
+      // SYS #0, C8, C3, #0
+      SYS_ALIAS(0, 8, 3, 0);
+    } else if (!Op.compare_lower("alle2is")) {
+      // SYS #4, C8, C3, #0
+      SYS_ALIAS(4, 8, 3, 0);
+    } else if (!Op.compare_lower("alle3is")) {
+      // SYS #6, C8, C3, #0
+      SYS_ALIAS(6, 8, 3, 0);
+    } else if (!Op.compare_lower("vae1is")) {
+      // SYS #0, C8, C3, #1
+      SYS_ALIAS(0, 8, 3, 1);
+    } else if (!Op.compare_lower("vae2is")) {
+      // SYS #4, C8, C3, #1
+      SYS_ALIAS(4, 8, 3, 1);
+    } else if (!Op.compare_lower("vae3is")) {
+      // SYS #6, C8, C3, #1
+      SYS_ALIAS(6, 8, 3, 1);
+    } else if (!Op.compare_lower("aside1is")) {
+      // SYS #0, C8, C3, #2
+      SYS_ALIAS(0, 8, 3, 2);
+    } else if (!Op.compare_lower("vaae1is")) {
+      // SYS #0, C8, C3, #3
+      SYS_ALIAS(0, 8, 3, 3);
+    } else if (!Op.compare_lower("alle1is")) {
+      // SYS #4, C8, C3, #4
+      SYS_ALIAS(4, 8, 3, 4);
+    } else if (!Op.compare_lower("vale1is")) {
+      // SYS #0, C8, C3, #5
+      SYS_ALIAS(0, 8, 3, 5);
+    } else if (!Op.compare_lower("vaale1is")) {
+      // SYS #0, C8, C3, #7
+      SYS_ALIAS(0, 8, 3, 7);
+    } else if (!Op.compare_lower("vmalle1")) {
+      // SYS #0, C8, C7, #0
+      SYS_ALIAS(0, 8, 7, 0);
+    } else if (!Op.compare_lower("alle2")) {
+      // SYS #4, C8, C7, #0
+      SYS_ALIAS(4, 8, 7, 0);
+    } else if (!Op.compare_lower("vale2is")) {
+      // SYS #4, C8, C3, #5
+      SYS_ALIAS(4, 8, 3, 5);
+    } else if (!Op.compare_lower("vale3is")) {
+      // SYS #6, C8, C3, #5
+      SYS_ALIAS(6, 8, 3, 5);
+    } else if (!Op.compare_lower("alle3")) {
+      // SYS #6, C8, C7, #0
+      SYS_ALIAS(6, 8, 7, 0);
+    } else if (!Op.compare_lower("vae1")) {
+      // SYS #0, C8, C7, #1
+      SYS_ALIAS(0, 8, 7, 1);
+    } else if (!Op.compare_lower("vae2")) {
+      // SYS #4, C8, C7, #1
+      SYS_ALIAS(4, 8, 7, 1);
+    } else if (!Op.compare_lower("vae3")) {
+      // SYS #6, C8, C7, #1
+      SYS_ALIAS(6, 8, 7, 1);
+    } else if (!Op.compare_lower("aside1")) {
+      // SYS #0, C8, C7, #2
+      SYS_ALIAS(0, 8, 7, 2);
+    } else if (!Op.compare_lower("vaae1")) {
+      // SYS #0, C8, C7, #3
+      SYS_ALIAS(0, 8, 7, 3);
+    } else if (!Op.compare_lower("alle1")) {
+      // SYS #4, C8, C7, #4
+      SYS_ALIAS(4, 8, 7, 4);
+    } else if (!Op.compare_lower("vale1")) {
+      // SYS #0, C8, C7, #5
+      SYS_ALIAS(0, 8, 7, 5);
+    } else if (!Op.compare_lower("vale2")) {
+      // SYS #4, C8, C7, #5
+      SYS_ALIAS(4, 8, 7, 5);
+    } else if (!Op.compare_lower("vale3")) {
+      // SYS #6, C8, C7, #5
+      SYS_ALIAS(6, 8, 7, 5);
+    } else if (!Op.compare_lower("vaale1")) {
+      // SYS #0, C8, C7, #7
+      SYS_ALIAS(0, 8, 7, 7);
+    } else if (!Op.compare_lower("ipas2e1")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 4, 1);
+    } else if (!Op.compare_lower("ipas2le1")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 4, 5);
+    } else if (!Op.compare_lower("ipas2e1is")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 0, 1);
+    } else if (!Op.compare_lower("ipas2le1is")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 0, 5);
+    } else if (!Op.compare_lower("vmalls12e1")) {
+      // SYS #4, C8, C7, #6
+      SYS_ALIAS(4, 8, 7, 6);
+    } else if (!Op.compare_lower("vmalls12e1is")) {
+      // SYS #4, C8, C3, #6
+      SYS_ALIAS(4, 8, 3, 6);
+    } else {
+      return TokError("invalid operand for TLBI instruction");
+    }
+  }
+
+#undef SYS_ALIAS
+
+  Parser.Lex(); // Eat operand.
+
+  bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
+  bool HasRegister = false;
+
+  // Check for the optional register operand.
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat comma.
+
+    if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+      return TokError("expected register operand");
+
+    HasRegister = true;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    return TokError("unexpected token in argument list");
+  }
+
+  if (ExpectRegister && !HasRegister) {
+    return TokError("specified " + Mnemonic + " op requires a register");
+  }
+  else if (!ExpectRegister && HasRegister) {
+    return TokError("specified " + Mnemonic + " op does not use a register");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+
+  // Can be either a #imm style literal or an option name
+  bool Hash = Tok.is(AsmToken::Hash);
+  if (Hash || Tok.is(AsmToken::Integer)) {
+    // Immediate operand.
+    if (Hash)
+      Parser.Lex(); // Eat the '#'
+    const MCExpr *ImmVal;
+    SMLoc ExprLoc = getLoc();
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      Error(ExprLoc, "immediate value expected for barrier operand");
+      return MatchOperand_ParseFail;
+    }
+    if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+      Error(ExprLoc, "barrier operand out of range");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(
+        AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  bool Valid;
+  unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid);
+  if (!Valid) {
+    TokError("invalid barrier option name");
+    return MatchOperand_ParseFail;
+  }
+
+  // The only valid named option for ISB is 'sy'
+  if (Mnemonic == "isb" && Opt != AArch64DB::SY) {
+    TokError("'sy' or #imm operand expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateBarrier(Opt, getLoc(), getContext()));
+  Parser.Lex(); // Consume the option
+
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(),
+                     STI.getFeatureBits(), getContext()));
+  Parser.Lex(); // Eat identifier
+
+  return MatchOperand_Success;
+}
+
+/// tryParseVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return true;
+
+  SMLoc S = getLoc();
+  // Check for a vector register specifier first.
+  StringRef Kind;
+  int64_t Reg = tryMatchVectorRegister(Kind, false);
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+  // If there was an explicit qualifier, that goes on as a literal text
+  // operand.
+  if (!Kind.empty())
+    Operands.push_back(
+        AArch64Operand::CreateToken(Kind, false, S, getContext()));
+
+  // If there is an index specifier following the register, parse that too.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
+    }
+
+    SMLoc E = getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return false;
+    }
+
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
+  }
+
+  return false;
+}
+
+/// parseRegister - Parse a non-vector register operand.
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  // Try for a vector register.
+  if (!tryParseVectorRegister(Operands))
+    return false;
+
+  // Try for a scalar register.
+  int64_t Reg = tryParseRegister();
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
+
+  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+  // as a string token in the instruction itself.
+  if (getLexer().getKind() == AsmToken::LBrac) {
+    SMLoc LBracS = getLoc();
+    Parser.Lex();
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Integer)) {
+      SMLoc IntS = getLoc();
+      int64_t Val = Tok.getIntVal();
+      if (Val == 1) {
+        Parser.Lex();
+        if (getLexer().getKind() == AsmToken::RBrac) {
+          SMLoc RBracS = getLoc();
+          Parser.Lex();
+          Operands.push_back(
+              AArch64Operand::CreateToken("[", false, LBracS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("1", false, IntS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("]", false, RBracS, getContext()));
+          return false;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+  bool HasELFModifier = false;
+  AArch64MCExpr::VariantKind RefKind;
+
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat ':"
+    HasELFModifier = true;
+
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(),
+            "expect relocation specifier in operand after ':'");
+      return true;
+    }
+
+    std::string LowerCase = Parser.getTok().getIdentifier().lower();
+    RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+                  .Case("lo12", AArch64MCExpr::VK_LO12)
+                  .Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
+                  .Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
+                  .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
+                  .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
+                  .Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
+                  .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
+                  .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
+                  .Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
+                  .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
+                  .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+                  .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
+                  .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
+                  .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
+                  .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
+                  .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
+                  .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
+                  .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
+                  .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+                  .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
+                  .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
+                  .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
+                  .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
+                  .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
+                  .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
+                  .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
+                  .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
+                  .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
+                  .Case("got", AArch64MCExpr::VK_GOT_PAGE)
+                  .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+                  .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
+                  .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
+                  .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
+                  .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
+                  .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+                  .Default(AArch64MCExpr::VK_INVALID);
+
+    if (RefKind == AArch64MCExpr::VK_INVALID) {
+      Error(Parser.getTok().getLoc(),
+            "expect relocation specifier in operand after ':'");
+      return true;
+    }
+
+    Parser.Lex(); // Eat identifier
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier");
+      return true;
+    }
+    Parser.Lex(); // Eat ':'
+  }
+
+  if (getParser().parseExpression(ImmVal))
+    return true;
+
+  if (HasELFModifier)
+    ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext());
+
+  return false;
+}
+
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat left bracket token.
+  StringRef Kind;
+  int64_t FirstReg = tryMatchVectorRegister(Kind, true);
+  if (FirstReg == -1)
+    return true;
+  int64_t PrevReg = FirstReg;
+  unsigned Count = 1;
+
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the minus.
+
+    SMLoc Loc = getLoc();
+    StringRef NextKind;
+    int64_t Reg = tryMatchVectorRegister(NextKind, true);
+    if (Reg == -1)
+      return true;
+    // Any Kind suffices must match on all regs in the list.
+    if (Kind != NextKind)
+      return Error(Loc, "mismatched register size suffix");
+
+    unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
+
+    if (Space == 0 || Space > 3) {
+      return Error(Loc, "invalid number of vectors");
+    }
+
+    Count += Space;
+  }
+  else {
+    while (Parser.getTok().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma token.
+
+      SMLoc Loc = getLoc();
+      StringRef NextKind;
+      int64_t Reg = tryMatchVectorRegister(NextKind, true);
+      if (Reg == -1)
+        return true;
+      // Any Kind suffices must match on all regs in the list.
+      if (Kind != NextKind)
+        return Error(Loc, "mismatched register size suffix");
+
+      // Registers must be incremental (with wraparound at 31)
+      if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+          (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+       return Error(Loc, "registers must be sequential");
+
+      PrevReg = Reg;
+      ++Count;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(getLoc(), "'}' expected");
+  Parser.Lex(); // Eat the '}' token.
+
+  if (Count > 4)
+    return Error(S, "invalid number of vectors");
+
+  unsigned NumElements = 0;
+  char ElementKind = 0;
+  if (!Kind.empty())
+    parseValidVectorKind(Kind, NumElements, ElementKind);
+
+  Operands.push_back(AArch64Operand::CreateVectorList(
+      FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+
+  // If there is an index specifier following the list, parse that too.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
+    }
+
+    SMLoc E = getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return false;
+    }
+
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
+  }
+  return false;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);
+
+  MCContext &Ctx = getContext();
+  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+  if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
+    return MatchOperand_NoMatch;
+
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat register
+
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Operands.push_back(
+        AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+    return MatchOperand_Success;
+  }
+  Parser.Lex(); // Eat comma.
+
+  if (Parser.getTok().is(AsmToken::Hash))
+    Parser.Lex(); // Eat hash
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *ImmVal;
+  if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+      cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+  return MatchOperand_Success;
+}
+
+/// parseOperand - Parse a arm instruction operand.  For now this parses the
+/// operand regardless of the mnemonic.
+bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+                                  bool invertCondCode) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  // Nothing custom, so do general case parsing.
+  SMLoc S, E;
+  switch (getLexer().getKind()) {
+  default: {
+    SMLoc S = getLoc();
+    const MCExpr *Expr;
+    if (parseSymbolicImmVal(Expr))
+      return Error(S, "invalid operand");
+
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+    return false;
+  }
+  case AsmToken::LBrac: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
+                                                   getContext()));
+    Parser.Lex(); // Eat '['
+
+    // There's no comma after a '[', so we can parse the next operand
+    // immediately.
+    return parseOperand(Operands, false, false);
+  }
+  case AsmToken::LCurly:
+    return parseVectorList(Operands);
+  case AsmToken::Identifier: {
+    // If we're expecting a Condition Code operand, then just parse that.
+    if (isCondCode)
+      return parseCondCode(Operands, invertCondCode);
+
+    // If it's a register name, parse it.
+    if (!parseRegister(Operands))
+      return false;
+
+    // This could be an optional "shift" or "extend" operand.
+    OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
+    // We can only continue if no tokens were eaten.
+    if (GotShift != MatchOperand_NoMatch)
+      return GotShift;
+
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
+    return false;
+  }
+  case AsmToken::Integer:
+  case AsmToken::Real:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = getLoc();
+    if (getLexer().is(AsmToken::Hash))
+      Parser.Lex();
+
+    // Parse a negative sign
+    bool isNegative = false;
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      isNegative = true;
+      // We need to consume this token only when we have a Real, otherwise
+      // we let parseSymbolicImmVal take care of it
+      if (Parser.getLexer().peekTok().is(AsmToken::Real))
+        Parser.Lex();
+    }
+
+    // The only Real that should come through here is a literal #0.0 for
+    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+    // so convert the value.
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Real)) {
+      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
+          Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
+          Mnemonic != "fcmlt")
+        return TokError("unexpected floating point literal");
+      else if (IntVal != 0 || isNegative)
+        return TokError("expected floating-point constant #0.0");
+      Parser.Lex(); // Eat the token.
+
+      Operands.push_back(
+          AArch64Operand::CreateToken("#0", false, S, getContext()));
+      Operands.push_back(
+          AArch64Operand::CreateToken(".0", false, S, getContext()));
+      return false;
+    }
+
+    const MCExpr *ImmVal;
+    if (parseSymbolicImmVal(ImmVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
+    return false;
+  }
+  case AsmToken::Equal: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+      return Error(Loc, "unexpected token in operand");
+    Parser.Lex(); // Eat '='
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    if (Operands.size() < 2 ||
+        !static_cast<AArch64Operand &>(*Operands[1]).isReg())
+      return true;
+
+    bool IsXReg =
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Operands[1]->getReg());
+
+    MCContext& Ctx = getContext();
+    E = SMLoc::getFromPointer(Loc.getPointer() - 1);
+    // If the op is an imm and can be fit into a mov, then replace ldr with mov.
+    if (isa<MCConstantExpr>(SubExprVal)) {
+      uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
+      uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
+      while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
+        ShiftAmt += 16;
+        Imm >>= 16;
+      }
+      if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
+          Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
+          Operands.push_back(AArch64Operand::CreateImm(
+                     MCConstantExpr::Create(Imm, Ctx), S, E, Ctx));
+        if (ShiftAmt)
+          Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
+                     ShiftAmt, true, S, E, Ctx));
+        return false;
+      }
+      APInt Simm = APInt(64, Imm << ShiftAmt);
+      // check if the immediate is an unsigned or signed 32-bit int for W regs
+      if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
+        return Error(Loc, "Immediate too large for register");
+    }
+    // If it is a label or an imm that cannot fit in a movz, put it into CP.
+    const MCExpr *CPLoc =
+        getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4);
+    Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
+    return false;
+  }
+  }
+}
+
+/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
+/// operands.
+bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                        StringRef Name, SMLoc NameLoc,
+                                        OperandVector &Operands) {
+  Name = StringSwitch<StringRef>(Name.lower())
+             .Case("beq", "b.eq")
+             .Case("bne", "b.ne")
+             .Case("bhs", "b.hs")
+             .Case("bcs", "b.cs")
+             .Case("blo", "b.lo")
+             .Case("bcc", "b.cc")
+             .Case("bmi", "b.mi")
+             .Case("bpl", "b.pl")
+             .Case("bvs", "b.vs")
+             .Case("bvc", "b.vc")
+             .Case("bhi", "b.hi")
+             .Case("bls", "b.ls")
+             .Case("bge", "b.ge")
+             .Case("blt", "b.lt")
+             .Case("bgt", "b.gt")
+             .Case("ble", "b.le")
+             .Case("bal", "b.al")
+             .Case("bnv", "b.nv")
+             .Default(Name);
+
+  // First check for the AArch64-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the 'instruction."
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+
+  // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
+  if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") {
+    bool IsError = parseSysAlias(Head, NameLoc, Operands);
+    if (IsError && getLexer().isNot(AsmToken::EndOfStatement))
+      Parser.eatToEndOfStatement();
+    return IsError;
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
+  Mnemonic = Head;
+
+  // Handle condition codes for a branch mnemonic
+  if (Head == "b" && Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    Head = Name.slice(Start + 1, Next);
+
+    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+                                            (Head.data() - Name.data()));
+    AArch64CC::CondCode CC = parseCondCodeString(Head);
+    if (CC == AArch64CC::Invalid)
+      return Error(SuffixLoc, "invalid condition code");
+    Operands.push_back(
+        AArch64Operand::CreateToken(".", true, SuffixLoc, getContext()));
+    Operands.push_back(
+        AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    Head = Name.slice(Start, Next);
+    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+                                            (Head.data() - Name.data()) + 1);
+    Operands.push_back(
+        AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
+  }
+
+  // Conditional compare instructions have a Condition Code operand, which needs
+  // to be parsed and an immediate operand created.
+  bool condCodeFourthOperand =
+      (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
+       Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
+       Head == "csinc" || Head == "csinv" || Head == "csneg");
+
+  // These instructions are aliases to some of the conditional select
+  // instructions. However, the condition code is inverted in the aliased
+  // instruction.
+  //
+  // FIXME: Is this the correct way to handle these? Or should the parser
+  //        generate the aliased instructions directly?
+  bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
+  bool condCodeThirdOperand =
+      (Head == "cinc" || Head == "cinv" || Head == "cneg");
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, false, false)) {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    unsigned N = 2;
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+
+      // Parse and remember the operand.
+      if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
+                                     (N == 3 && condCodeThirdOperand) ||
+                                     (N == 2 && condCodeSecondOperand),
+                       condCodeSecondOperand || condCodeThirdOperand)) {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+
+      // After successfully parsing some operands there are two special cases to
+      // consider (i.e. notional operands not separated by commas). Both are due
+      // to memory specifiers:
+      //  + An RBrac will end an address for load/store/prefetch
+      //  + An '!' will indicate a pre-indexed operation.
+      //
+      // It's someone else's responsibility to make sure these tokens are sane
+      // in the given context!
+      if (Parser.getTok().is(AsmToken::RBrac)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(AArch64Operand::CreateToken("]", false, Loc,
+                                                       getContext()));
+        Parser.Lex();
+      }
+
+      if (Parser.getTok().is(AsmToken::Exclaim)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(AArch64Operand::CreateToken("!", false, Loc,
+                                                       getContext()));
+        Parser.Lex();
+      }
+
+      ++N;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = Parser.getTok().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+// FIXME: This entire function is a giant hack to provide us with decent
+// operand range validation/diagnostics until TableGen/MC can be extended
+// to support autogeneration of this kind of validation.
+bool AArch64AsmParser::validateInstruction(MCInst &Inst,
+                                         SmallVectorImpl<SMLoc> &Loc) {
+  const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  // Check for indexed addressing modes w/ the base register being the
+  // same as a destination/source register or pair load where
+  // the Rt == Rt2. All of those are undefined behaviour.
+  switch (Inst.getOpcode()) {
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::LDPXpost:
+  case AArch64::LDPXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    unsigned Rn = Inst.getOperand(3).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable LDP instruction, writeback base "
+                           "is also a destination");
+    if (RI->isSubRegisterEq(Rn, Rt2))
+      return Error(Loc[1], "unpredictable LDP instruction, writeback base "
+                           "is also a destination");
+    // FALLTHROUGH
+  }
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi: {
+    unsigned Rt = Inst.getOperand(0).getReg();
+    unsigned Rt2 = Inst.getOperand(1).getReg();
+    if (Rt == Rt2)
+      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+    break;
+  }
+  case AArch64::LDPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::LDPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::LDPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::LDPSWpost: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    if (Rt == Rt2)
+      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+    break;
+  }
+  case AArch64::STPDpost:
+  case AArch64::STPDpre:
+  case AArch64::STPQpost:
+  case AArch64::STPQpre:
+  case AArch64::STPSpost:
+  case AArch64::STPSpre:
+  case AArch64::STPWpost:
+  case AArch64::STPWpre:
+  case AArch64::STPXpost:
+  case AArch64::STPXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    unsigned Rn = Inst.getOperand(3).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable STP instruction, writeback base "
+                           "is also a source");
+    if (RI->isSubRegisterEq(Rn, Rt2))
+      return Error(Loc[1], "unpredictable STP instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  case AArch64::LDRBBpre:
+  case AArch64::LDRBpre:
+  case AArch64::LDRHHpre:
+  case AArch64::LDRHpre:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRBBpost:
+  case AArch64::LDRBpost:
+  case AArch64::LDRHHpost:
+  case AArch64::LDRHpost:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRXpost: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rn = Inst.getOperand(2).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable LDR instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  case AArch64::STRBBpost:
+  case AArch64::STRBpost:
+  case AArch64::STRHHpost:
+  case AArch64::STRHpost:
+  case AArch64::STRWpost:
+  case AArch64::STRXpost:
+  case AArch64::STRBBpre:
+  case AArch64::STRBpre:
+  case AArch64::STRHHpre:
+  case AArch64::STRHpre:
+  case AArch64::STRWpre:
+  case AArch64::STRXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rn = Inst.getOperand(2).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable STR instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  }
+
+  // Now check immediate ranges. Separate from the above as there is overlap
+  // in the instructions being checked and this keeps the nested conditionals
+  // to a minimum.
+  switch (Inst.getOpcode()) {
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXri:
+  case AArch64::ADDWri:
+  case AArch64::ADDXri:
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+  case AArch64::SUBWri:
+  case AArch64::SUBXri: {
+    // Annoyingly we can't do this in the isAddSubImm predicate, so there is
+    // some slight duplication here.
+    if (Inst.getOperand(2).isExpr()) {
+      const MCExpr *Expr = Inst.getOperand(2).getExpr();
+      AArch64MCExpr::VariantKind ELFRefKind;
+      MCSymbolRefExpr::VariantKind DarwinRefKind;
+      int64_t Addend;
+      if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+        return Error(Loc[2], "invalid immediate expression");
+      }
+
+      // Only allow these with ADDXri.
+      if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+          DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
+          Inst.getOpcode() == AArch64::ADDXri)
+        return false;
+
+      // Only allow these with ADDXri/ADDWri
+      if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
+          ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+          ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+          ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+          ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+          ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+          ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+          ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+          (Inst.getOpcode() == AArch64::ADDXri ||
+          Inst.getOpcode() == AArch64::ADDWri))
+        return false;
+
+      // Don't allow expressions in the immediate field otherwise
+      return Error(Loc[2], "invalid immediate expression");
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+}
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+  switch (ErrCode) {
+  case Match_MissingFeature:
+    return Error(Loc,
+                 "instruction requires a CPU feature not currently enabled");
+  case Match_InvalidOperand:
+    return Error(Loc, "invalid operand for instruction");
+  case Match_InvalidSuffix:
+    return Error(Loc, "invalid type suffix for instruction");
+  case Match_InvalidCondCode:
+    return Error(Loc, "expected AArch64 condition code");
+  case Match_AddSubRegExtendSmall:
+    return Error(Loc,
+      "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubRegExtendLarge:
+    return Error(Loc,
+      "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubSecondSource:
+    return Error(Loc,
+      "expected compatible register, symbol or integer in range [0, 4095]");
+  case Match_LogicalSecondSource:
+    return Error(Loc, "expected compatible register or logical immediate");
+  case Match_InvalidMovImm32Shift:
+    return Error(Loc, "expected 'lsl' with optional integer 0 or 16");
+  case Match_InvalidMovImm64Shift:
+    return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48");
+  case Match_AddSubRegShift32:
+    return Error(Loc,
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
+  case Match_AddSubRegShift64:
+    return Error(Loc,
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
+  case Match_InvalidFPImm:
+    return Error(Loc,
+                 "expected compatible register or floating-point constant");
+  case Match_InvalidMemoryIndexedSImm9:
+    return Error(Loc, "index must be an integer in range [-256, 255].");
+  case Match_InvalidMemoryIndexed4SImm7:
+    return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
+  case Match_InvalidMemoryIndexed8SImm7:
+    return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
+  case Match_InvalidMemoryIndexed16SImm7:
+    return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
+  case Match_InvalidMemoryWExtend8:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0");
+  case Match_InvalidMemoryWExtend16:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
+  case Match_InvalidMemoryWExtend32:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
+  case Match_InvalidMemoryWExtend64:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
+  case Match_InvalidMemoryWExtend128:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4");
+  case Match_InvalidMemoryXExtend8:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0");
+  case Match_InvalidMemoryXExtend16:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
+  case Match_InvalidMemoryXExtend32:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
+  case Match_InvalidMemoryXExtend64:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
+  case Match_InvalidMemoryXExtend128:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
+  case Match_InvalidMemoryIndexed1:
+    return Error(Loc, "index must be an integer in range [0, 4095].");
+  case Match_InvalidMemoryIndexed2:
+    return Error(Loc, "index must be a multiple of 2 in range [0, 8190].");
+  case Match_InvalidMemoryIndexed4:
+    return Error(Loc, "index must be a multiple of 4 in range [0, 16380].");
+  case Match_InvalidMemoryIndexed8:
+    return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
+  case Match_InvalidMemoryIndexed16:
+    return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+  case Match_InvalidImm0_7:
+    return Error(Loc, "immediate must be an integer in range [0, 7].");
+  case Match_InvalidImm0_15:
+    return Error(Loc, "immediate must be an integer in range [0, 15].");
+  case Match_InvalidImm0_31:
+    return Error(Loc, "immediate must be an integer in range [0, 31].");
+  case Match_InvalidImm0_63:
+    return Error(Loc, "immediate must be an integer in range [0, 63].");
+  case Match_InvalidImm0_127:
+    return Error(Loc, "immediate must be an integer in range [0, 127].");
+  case Match_InvalidImm0_65535:
+    return Error(Loc, "immediate must be an integer in range [0, 65535].");
+  case Match_InvalidImm1_8:
+    return Error(Loc, "immediate must be an integer in range [1, 8].");
+  case Match_InvalidImm1_16:
+    return Error(Loc, "immediate must be an integer in range [1, 16].");
+  case Match_InvalidImm1_32:
+    return Error(Loc, "immediate must be an integer in range [1, 32].");
+  case Match_InvalidImm1_64:
+    return Error(Loc, "immediate must be an integer in range [1, 64].");
+  case Match_InvalidIndex1:
+    return Error(Loc, "expected lane specifier '[1]'");
+  case Match_InvalidIndexB:
+    return Error(Loc, "vector lane must be an integer in range [0, 15].");
+  case Match_InvalidIndexH:
+    return Error(Loc, "vector lane must be an integer in range [0, 7].");
+  case Match_InvalidIndexS:
+    return Error(Loc, "vector lane must be an integer in range [0, 3].");
+  case Match_InvalidIndexD:
+    return Error(Loc, "vector lane must be an integer in range [0, 1].");
+  case Match_InvalidLabel:
+    return Error(Loc, "expected label or encodable integer pc offset");
+  case Match_MRS:
+    return Error(Loc, "expected readable system register");
+  case Match_MSR:
+    return Error(Loc, "expected writable system register or pstate");
+  case Match_MnemonicFail:
+    return Error(Loc, "unrecognized instruction mnemonic");
+  default:
+    llvm_unreachable("unexpected error code!");
+  }
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+
+bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                               OperandVector &Operands,
+                                               MCStreamer &Out,
+                                               unsigned &ErrorInfo,
+                                               bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
+  assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+
+  StringRef Tok = Op.getToken();
+  unsigned NumOperands = Operands.size();
+
+  if (NumOperands == 4 && Tok == "lsl") {
+    AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
+    AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+    if (Op2.isReg() && Op3.isImm()) {
+      const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+      if (Op3CE) {
+        uint64_t Op3Val = Op3CE->getValue();
+        uint64_t NewOp3Val = 0;
+        uint64_t NewOp4Val = 0;
+        if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                Op2.getReg())) {
+          NewOp3Val = (32 - Op3Val) & 0x1f;
+          NewOp4Val = 31 - Op3Val;
+        } else {
+          NewOp3Val = (64 - Op3Val) & 0x3f;
+          NewOp4Val = 63 - Op3Val;
+        }
+
+        const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext());
+        const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
+
+        Operands[0] = AArch64Operand::CreateToken(
+            "ubfm", false, Op.getStartLoc(), getContext());
+        Operands.push_back(AArch64Operand::CreateImm(
+            NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
+        Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
+                                                Op3.getEndLoc(), getContext());
+      }
+    }
+  } else if (NumOperands == 5) {
+    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+    // UBFIZ -> UBFM aliases.
+    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+      AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+      AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+      AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
+
+      if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1.getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3.getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4.getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp3Val = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                  Op1.getReg()))
+            NewOp3Val = (32 - Op3Val) & 0x1f;
+          else
+            NewOp3Val = (64 - Op3Val) & 0x3f;
+
+          uint64_t NewOp4Val = Op4Val - 1;
+
+          if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+            return Error(Op4.getStartLoc(),
+                         "requested insert overflows register");
+
+          const MCExpr *NewOp3 =
+              MCConstantExpr::Create(NewOp3Val, getContext());
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[3] = AArch64Operand::CreateImm(
+              NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
+          if (Tok == "bfi")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "sbfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "ubfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op.getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+        }
+      }
+
+      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+      // UBFX -> UBFM aliases.
+    } else if (NumOperands == 5 &&
+               (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+      AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+      AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+      AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
+
+      if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1.getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3.getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4.getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+          if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+            return Error(Op4.getStartLoc(),
+                         "requested extract overflows register");
+
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
+          if (Tok == "bfxil")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "sbfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "ubfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op.getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+        }
+      }
+    }
+  }
+  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+  //        InstAlias can't quite handle this since the reg classes aren't
+  //        subclasses.
+  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+    // The source register can be Wn here, but the matcher expects a
+    // GPR64. Twiddle it here if necessary.
+    AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+    if (Op.isReg()) {
+      unsigned Reg = getXRegFromWReg(Op.getReg());
+      Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+                                              Op.getEndLoc(), getContext());
+    }
+  }
+  // FIXME: Likewise for sxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+    AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+    if (Op.isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op.getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR64. Twiddle it here if necessary.
+      AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+      if (Op.isReg()) {
+        unsigned Reg = getXRegFromWReg(Op.getReg());
+        Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+                                                Op.getEndLoc(), getContext());
+      }
+    }
+  }
+  // FIXME: Likewise for uxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+    AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+    if (Op.isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op.getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR32. Twiddle it here if necessary.
+      AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+      if (Op.isReg()) {
+        unsigned Reg = getWRegFromXReg(Op.getReg());
+        Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+                                                Op.getEndLoc(), getContext());
+      }
+    }
+  }
+
+  // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
+  if (NumOperands == 3 && Tok == "fmov") {
+    AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]);
+    AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
+    if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
+      unsigned zreg =
+          AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains(
+              RegOp.getReg())
+              ? AArch64::WZR
+              : AArch64::XZR;
+      Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(),
+                                              Op.getEndLoc(), getContext());
+    }
+  }
+
+  MCInst Inst;
+  // First try to match against the secondary set of tables containing the
+  // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+
+  // If that fails, try against the alternate table containing long-form NEON:
+  // "fadd v0.2s, v1.2s, v2.2s"
+  if (MatchResult != Match_Success)
+    MatchResult =
+        MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+
+  switch (MatchResult) {
+  case Match_Success: {
+    // Perform range checking and other semantic validations
+    SmallVector<SMLoc, 8> OperandLocs;
+    NumOperands = Operands.size();
+    for (unsigned i = 1; i < NumOperands; ++i)
+      OperandLocs.push_back(Operands[i]->getStartLoc());
+    if (validateInstruction(Inst, OperandLocs))
+      return true;
+
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  }
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing (neon, e.g.).
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+  case Match_MnemonicFail:
+    return showMatchError(IDLoc, MatchResult);
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    // If the match failed on a suffix token operand, tweak the diagnostic
+    // accordingly.
+    if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() &&
+        ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
+      MatchResult = Match_InvalidSuffix;
+
+    return showMatchError(ErrorLoc, MatchResult);
+  }
+  case Match_InvalidMemoryIndexed1:
+  case Match_InvalidMemoryIndexed2:
+  case Match_InvalidMemoryIndexed4:
+  case Match_InvalidMemoryIndexed8:
+  case Match_InvalidMemoryIndexed16:
+  case Match_InvalidCondCode:
+  case Match_AddSubRegExtendSmall:
+  case Match_AddSubRegExtendLarge:
+  case Match_AddSubSecondSource:
+  case Match_LogicalSecondSource:
+  case Match_AddSubRegShift32:
+  case Match_AddSubRegShift64:
+  case Match_InvalidMovImm32Shift:
+  case Match_InvalidMovImm64Shift:
+  case Match_InvalidFPImm:
+  case Match_InvalidMemoryWExtend8:
+  case Match_InvalidMemoryWExtend16:
+  case Match_InvalidMemoryWExtend32:
+  case Match_InvalidMemoryWExtend64:
+  case Match_InvalidMemoryWExtend128:
+  case Match_InvalidMemoryXExtend8:
+  case Match_InvalidMemoryXExtend16:
+  case Match_InvalidMemoryXExtend32:
+  case Match_InvalidMemoryXExtend64:
+  case Match_InvalidMemoryXExtend128:
+  case Match_InvalidMemoryIndexed4SImm7:
+  case Match_InvalidMemoryIndexed8SImm7:
+  case Match_InvalidMemoryIndexed16SImm7:
+  case Match_InvalidMemoryIndexedSImm9:
+  case Match_InvalidImm0_7:
+  case Match_InvalidImm0_15:
+  case Match_InvalidImm0_31:
+  case Match_InvalidImm0_63:
+  case Match_InvalidImm0_127:
+  case Match_InvalidImm0_65535:
+  case Match_InvalidImm1_8:
+  case Match_InvalidImm1_16:
+  case Match_InvalidImm1_32:
+  case Match_InvalidImm1_64:
+  case Match_InvalidIndex1:
+  case Match_InvalidIndexB:
+  case Match_InvalidIndexH:
+  case Match_InvalidIndexS:
+  case Match_InvalidIndexD:
+  case Match_InvalidLabel:
+  case Match_MSR:
+  case Match_MRS: {
+    if (ErrorInfo >= Operands.size())
+      return Error(IDLoc, "too few operands for instruction");
+    // Any time we get here, there's nothing fancy to do. Just get the
+    // operand SMLoc and display the diagnostic.
+    SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc())
+      ErrorLoc = IDLoc;
+    return showMatchError(ErrorLoc, MatchResult);
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+  return true;
+}
+
+/// ParseDirective parses the arm specific directives
+bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  SMLoc Loc = DirectiveID.getLoc();
+  if (IDVal == ".hword")
+    return parseDirectiveWord(2, Loc);
+  if (IDVal == ".word")
+    return parseDirectiveWord(4, Loc);
+  if (IDVal == ".xword")
+    return parseDirectiveWord(8, Loc);
+  if (IDVal == ".tlsdesccall")
+    return parseDirectiveTLSDescCall(Loc);
+  if (IDVal == ".ltorg" || IDVal == ".pool")
+    return parseDirectiveLtorg(Loc);
+  if (IDVal == ".unreq")
+    return parseDirectiveUnreq(DirectiveID.getLoc());
+
+  return parseDirectiveLOH(IDVal, Loc);
+}
+
+/// parseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+// parseDirectiveTLSDescCall:
+//   ::= .tlsdesccall symbol
+bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected symbol after directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+  Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
+
+  MCInst Inst;
+  Inst.setOpcode(AArch64::TLSDESCCALL);
+  Inst.addOperand(MCOperand::CreateExpr(Expr));
+
+  getParser().getStreamer().EmitInstruction(Inst, STI);
+  return false;
+}
+
+/// ::= .loh <lohName | lohId> label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+  if (IDVal != MCLOHDirectiveName())
+    return true;
+  MCLOHType Kind;
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    if (getParser().getTok().isNot(AsmToken::Integer))
+      return TokError("expected an identifier or a number in directive");
+    // We successfully get a numeric value for the identifier.
+    // Check if it is valid.
+    int64_t Id = getParser().getTok().getIntVal();
+    Kind = (MCLOHType)Id;
+    // Check that Id does not overflow MCLOHType.
+    if (!isValidMCLOHType(Kind) || Id != Kind)
+      return TokError("invalid numeric identifier in directive");
+  } else {
+    StringRef Name = getTok().getIdentifier();
+    // We successfully parse an identifier.
+    // Check if it is a recognized one.
+    int Id = MCLOHNameToId(Name);
+
+    if (Id == -1)
+      return TokError("invalid identifier in directive");
+    Kind = (MCLOHType)Id;
+  }
+  // Consume the identifier.
+  Lex();
+  // Get the number of arguments of this LOH.
+  int NbArgs = MCLOHIdToNbArgs(Kind);
+
+  assert(NbArgs != -1 && "Invalid number of arguments");
+
+  SmallVector<MCSymbol *, 3> Args;
+  for (int Idx = 0; Idx < NbArgs; ++Idx) {
+    StringRef Name;
+    if (getParser().parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+    Args.push_back(getContext().GetOrCreateSymbol(Name));
+
+    if (Idx + 1 == NbArgs)
+      break;
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+    Lex();
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+
+  getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+  return false;
+}
+
+/// parseDirectiveLtorg
+///  ::= .ltorg | .pool
+bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
+  getTargetStreamer().emitCurrentConstantPool();
+  return false;
+}
+
+/// parseDirectiveReq
+///  ::= name .req registername
+bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  Parser.Lex(); // Eat the '.req' token.
+  SMLoc SRegLoc = getLoc();
+  unsigned RegNum = tryParseRegister();
+  bool IsVector = false;
+
+  if (RegNum == static_cast<unsigned>(-1)) {
+    StringRef Kind;
+    RegNum = tryMatchVectorRegister(Kind, false);
+    if (!Kind.empty()) {
+      Error(SRegLoc, "vector register without type specifier expected");
+      return false;
+    }
+    IsVector = true;
+  }
+
+  if (RegNum == static_cast<unsigned>(-1)) {
+    Parser.eatToEndOfStatement();
+    Error(SRegLoc, "register name or alias expected");
+    return false;
+  }
+
+  // Shouldn't be anything else.
+  if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected input in .req directive");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  auto pair = std::make_pair(IsVector, RegNum);
+  if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair)
+    Warning(L, "ignoring redefinition of register alias '" + Name + "'");
+
+  return true;
+}
+
+/// parseDirectiveUneq
+///  ::= .unreq registername
+bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive.");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+  Parser.Lex(); // Eat the identifier.
+  return false;
+}
+
+bool
+AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+                                    AArch64MCExpr::VariantKind &ELFRefKind,
+                                    MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                    int64_t &Addend) {
+  ELFRefKind = AArch64MCExpr::VK_INVALID;
+  DarwinRefKind = MCSymbolRefExpr::VK_None;
+  Addend = 0;
+
+  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+    ELFRefKind = AE->getKind();
+    Expr = AE->getSubExpr();
+  }
+
+  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+  if (SE) {
+    // It's a simple symbol reference with no addend.
+    DarwinRefKind = SE->getKind();
+    return true;
+  }
+
+  const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+  if (!BE)
+    return false;
+
+  SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  if (!SE)
+    return false;
+  DarwinRefKind = SE->getKind();
+
+  if (BE->getOpcode() != MCBinaryExpr::Add &&
+      BE->getOpcode() != MCBinaryExpr::Sub)
+    return false;
+
+  // See if the addend is is a constant, otherwise there's more going
+  // on here than we can deal with.
+  auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
+  if (!AddendExpr)
+    return false;
+
+  Addend = AddendExpr->getValue();
+  if (BE->getOpcode() == MCBinaryExpr::Sub)
+    Addend = -Addend;
+
+  // It's some symbol reference + a constant addend, but really
+  // shouldn't use both Darwin and ELF syntax.
+  return ELFRefKind == AArch64MCExpr::VK_INVALID ||
+         DarwinRefKind == MCSymbolRefExpr::VK_None;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmParser() {
+  RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
+
+  RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                      unsigned Kind) {
+  AArch64Operand &Op = static_cast<AArch64Operand &>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ExpectedVal;
+  switch (Kind) {
+  default:
+    return Match_InvalidOperand;
+  case MCK__35_0:
+    ExpectedVal = 0;
+    break;
+  case MCK__35_1:
+    ExpectedVal = 1;
+    break;
+  case MCK__35_12:
+    ExpectedVal = 12;
+    break;
+  case MCK__35_16:
+    ExpectedVal = 16;
+    break;
+  case MCK__35_2:
+    ExpectedVal = 2;
+    break;
+  case MCK__35_24:
+    ExpectedVal = 24;
+    break;
+  case MCK__35_3:
+    ExpectedVal = 3;
+    break;
+  case MCK__35_32:
+    ExpectedVal = 32;
+    break;
+  case MCK__35_4:
+    ExpectedVal = 4;
+    break;
+  case MCK__35_48:
+    ExpectedVal = 48;
+    break;
+  case MCK__35_6:
+    ExpectedVal = 6;
+    break;
+  case MCK__35_64:
+    ExpectedVal = 64;
+    break;
+  case MCK__35_8:
+    ExpectedVal = 8;
+    break;
+  }
+  if (!Op.isImm())
+    return Match_InvalidOperand;
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
+  if (!CE)
+    return Match_InvalidOperand;
+  if (CE->getValue() == ExpectedVal)
+    return Match_Success;
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
new file mode 100644
index 0000000..6de27d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -0,0 +1,1559 @@
+//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Disassembler.h"
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+// Pull DecodeStatus and its enum values into the global namespace.
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
+                                              unsigned RegNo, uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+#define SoftFail llvm::MCDisassembler::SoftFail
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new AArch64Disassembler(STI, Ctx);
+}
+
+DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                               const MemoryObject &Region,
+                                               uint64_t Address,
+                                               raw_ostream &os,
+                                               raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  Size = 0;
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+    return Fail;
+  Size = 4;
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t insn =
+      (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
+
+  // Calling the auto-generated decoder function.
+  return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+}
+
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+                              LLVMSymbolLookupCallback SymbolLookUp,
+                              void *DisInfo, MCContext *Ctx,
+                              MCRelocationInfo *RelInfo) {
+  return new llvm::AArch64ExternalSymbolizer(
+                                     *Ctx,
+                                     std::unique_ptr<MCRelocationInfo>(RelInfo),
+                                     GetOpInfo, SymbolLookUp, DisInfo);
+}
+
+extern "C" void LLVMInitializeAArch64Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget,
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
+                                       createAArch64ExternalSymbolizer);
+
+  TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+                                       createAArch64ExternalSymbolizer);
+}
+
+static const unsigned FPR128DecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
+
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR128DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  if (RegNo > 15)
+    return Fail;
+  return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
+}
+
+static const unsigned FPR64DecoderTable[] = {
+    AArch64::D0,  AArch64::D1,  AArch64::D2,  AArch64::D3,  AArch64::D4,
+    AArch64::D5,  AArch64::D6,  AArch64::D7,  AArch64::D8,  AArch64::D9,
+    AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
+    AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
+    AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
+    AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
+    AArch64::D30, AArch64::D31
+};
+
+static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR64DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned FPR32DecoderTable[] = {
+    AArch64::S0,  AArch64::S1,  AArch64::S2,  AArch64::S3,  AArch64::S4,
+    AArch64::S5,  AArch64::S6,  AArch64::S7,  AArch64::S8,  AArch64::S9,
+    AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
+    AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
+    AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
+    AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
+    AArch64::S30, AArch64::S31
+};
+
+static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR32DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned FPR16DecoderTable[] = {
+    AArch64::H0,  AArch64::H1,  AArch64::H2,  AArch64::H3,  AArch64::H4,
+    AArch64::H5,  AArch64::H6,  AArch64::H7,  AArch64::H8,  AArch64::H9,
+    AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
+    AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
+    AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
+    AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
+    AArch64::H30, AArch64::H31
+};
+
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR16DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned FPR8DecoderTable[] = {
+    AArch64::B0,  AArch64::B1,  AArch64::B2,  AArch64::B3,  AArch64::B4,
+    AArch64::B5,  AArch64::B6,  AArch64::B7,  AArch64::B8,  AArch64::B9,
+    AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
+    AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
+    AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
+    AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
+    AArch64::B30, AArch64::B31
+};
+
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR8DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned GPR64DecoderTable[] = {
+    AArch64::X0,  AArch64::X1,  AArch64::X2,  AArch64::X3,  AArch64::X4,
+    AArch64::X5,  AArch64::X6,  AArch64::X7,  AArch64::X8,  AArch64::X9,
+    AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+    AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
+    AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
+    AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
+    AArch64::LR,  AArch64::XZR
+};
+
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = GPR64DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = GPR64DecoderTable[RegNo];
+  if (Register == AArch64::XZR)
+    Register = AArch64::SP;
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned GPR32DecoderTable[] = {
+    AArch64::W0,  AArch64::W1,  AArch64::W2,  AArch64::W3,  AArch64::W4,
+    AArch64::W5,  AArch64::W6,  AArch64::W7,  AArch64::W8,  AArch64::W9,
+    AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
+    AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
+    AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
+    AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
+    AArch64::W30, AArch64::WZR
+};
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = GPR32DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = GPR32DecoderTable[RegNo];
+  if (Register == AArch64::WZR)
+    Register = AArch64::WSP;
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned VectorDecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
+
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = VectorDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned QQDecoderTable[] = {
+  AArch64::Q0_Q1,   AArch64::Q1_Q2,   AArch64::Q2_Q3,   AArch64::Q3_Q4,
+  AArch64::Q4_Q5,   AArch64::Q5_Q6,   AArch64::Q6_Q7,   AArch64::Q7_Q8,
+  AArch64::Q8_Q9,   AArch64::Q9_Q10,  AArch64::Q10_Q11, AArch64::Q11_Q12,
+  AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
+  AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
+  AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
+  AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
+  AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
+};
+
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned QQQDecoderTable[] = {
+  AArch64::Q0_Q1_Q2,    AArch64::Q1_Q2_Q3,    AArch64::Q2_Q3_Q4,
+  AArch64::Q3_Q4_Q5,    AArch64::Q4_Q5_Q6,    AArch64::Q5_Q6_Q7,
+  AArch64::Q6_Q7_Q8,    AArch64::Q7_Q8_Q9,    AArch64::Q8_Q9_Q10,
+  AArch64::Q9_Q10_Q11,  AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
+  AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
+  AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
+  AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
+  AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
+  AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
+  AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
+  AArch64::Q30_Q31_Q0,  AArch64::Q31_Q0_Q1
+};
+
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned QQQQDecoderTable[] = {
+  AArch64::Q0_Q1_Q2_Q3,     AArch64::Q1_Q2_Q3_Q4,     AArch64::Q2_Q3_Q4_Q5,
+  AArch64::Q3_Q4_Q5_Q6,     AArch64::Q4_Q5_Q6_Q7,     AArch64::Q5_Q6_Q7_Q8,
+  AArch64::Q6_Q7_Q8_Q9,     AArch64::Q7_Q8_Q9_Q10,    AArch64::Q8_Q9_Q10_Q11,
+  AArch64::Q9_Q10_Q11_Q12,  AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
+  AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
+  AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
+  AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
+  AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
+  AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
+  AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
+  AArch64::Q30_Q31_Q0_Q1,   AArch64::Q31_Q0_Q1_Q2
+};
+
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned DDDecoderTable[] = {
+  AArch64::D0_D1,   AArch64::D1_D2,   AArch64::D2_D3,   AArch64::D3_D4,
+  AArch64::D4_D5,   AArch64::D5_D6,   AArch64::D6_D7,   AArch64::D7_D8,
+  AArch64::D8_D9,   AArch64::D9_D10,  AArch64::D10_D11, AArch64::D11_D12,
+  AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
+  AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
+  AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
+  AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
+  AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
+};
+
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned DDDDecoderTable[] = {
+  AArch64::D0_D1_D2,    AArch64::D1_D2_D3,    AArch64::D2_D3_D4,
+  AArch64::D3_D4_D5,    AArch64::D4_D5_D6,    AArch64::D5_D6_D7,
+  AArch64::D6_D7_D8,    AArch64::D7_D8_D9,    AArch64::D8_D9_D10,
+  AArch64::D9_D10_D11,  AArch64::D10_D11_D12, AArch64::D11_D12_D13,
+  AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
+  AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
+  AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
+  AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
+  AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
+  AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
+  AArch64::D30_D31_D0,  AArch64::D31_D0_D1
+};
+
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static const unsigned DDDDDecoderTable[] = {
+  AArch64::D0_D1_D2_D3,     AArch64::D1_D2_D3_D4,     AArch64::D2_D3_D4_D5,
+  AArch64::D3_D4_D5_D6,     AArch64::D4_D5_D6_D7,     AArch64::D5_D6_D7_D8,
+  AArch64::D6_D7_D8_D9,     AArch64::D7_D8_D9_D10,    AArch64::D8_D9_D10_D11,
+  AArch64::D9_D10_D11_D12,  AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
+  AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
+  AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
+  AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
+  AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
+  AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
+  AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
+  AArch64::D30_D31_D0_D1,   AArch64::D31_D0_D1_D2
+};
+
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  // scale{5} is asserted as 1 in tblgen.
+  Imm |= 0x20;  
+  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+  return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+  return Success;
+}
+
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  int64_t ImmVal = Imm;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 19-bit immediate.
+  if (ImmVal & (1 << (19 - 1)))
+    ImmVal |= ~((1LL << 19) - 1);
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+                                     Inst.getOpcode() != AArch64::LDRXl, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  return Success;
+}
+
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm((Imm  >> 1) & 1));
+  Inst.addOperand(MCOperand::CreateImm(Imm & 1));
+  return Success;
+}
+
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
+
+  Imm |= 0x8000;
+  Inst.addOperand(MCOperand::CreateImm(Imm));
+
+  bool ValidNamed;
+  (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
+      .toString(Imm, ValidNamed);
+
+  return ValidNamed ? Success : Fail;
+}
+
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
+
+  Imm |= 0x8000;
+  Inst.addOperand(MCOperand::CreateImm(Imm));
+
+  bool ValidNamed;
+  (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
+      .toString(Imm, ValidNamed);
+
+  return ValidNamed ? Success : Fail;
+}
+
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // This decoder exists to add the dummy Lane operand to the MCInst, which must
+  // be 1 in assembly but has no other real manifestation.
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned IsToVec = fieldFromInstruction(Insn, 16, 1);
+
+  if (IsToVec) {
+    DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
+  } else {
+    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  // Add the lane
+  Inst.addOperand(MCOperand::CreateImm(1));
+
+  return Success;
+}
+
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::CreateImm(Add - Imm));
+  return Success;
+}
+
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1)));
+  return Success;
+}
+
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
+}
+
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
+}
+
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
+}
+
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+  unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
+  unsigned shift = (shiftHi << 6) | shiftLo;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrs:
+  case AArch64::ADDSWrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBSWrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDWrs:
+  case AArch64::ANDSWrs:
+  case AArch64::BICWrs:
+  case AArch64::BICSWrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORNWrs:
+  case AArch64::EORWrs:
+  case AArch64::EONWrs: {
+    // if sf == '0' and imm6<5> == '1' then ReservedValue()
+    if (shiftLo >> 5 == 1)
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+  case AArch64::ADDXrs:
+  case AArch64::ADDSXrs:
+  case AArch64::SUBXrs:
+  case AArch64::SUBSXrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDXrs:
+  case AArch64::ANDSXrs:
+  case AArch64::BICXrs:
+  case AArch64::BICSXrs:
+  case AArch64::ORRXrs:
+  case AArch64::ORNXrs:
+  case AArch64::EORXrs:
+  case AArch64::EONXrs:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(shift));
+  return Success;
+}
+
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned imm = fieldFromInstruction(insn, 5, 16);
+  unsigned shift = fieldFromInstruction(insn, 21, 2);
+  shift <<= 4;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::MOVZWi:
+  case AArch64::MOVNWi:
+  case AArch64::MOVKWi:
+    if (shift & (1U << 5))
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  case AArch64::MOVZXi:
+  case AArch64::MOVNXi:
+  case AArch64::MOVKXi:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  }
+
+  if (Inst.getOpcode() == AArch64::MOVKWi ||
+      Inst.getOpcode() == AArch64::MOVKXi)
+    Inst.addOperand(Inst.getOperand(0));
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  Inst.addOperand(MCOperand::CreateImm(shift));
+  return Success;
+}
+
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned offset = fieldFromInstruction(insn, 10, 12);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFMui:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::CreateImm(Rt));
+    break;
+  case AArch64::STRBBui:
+  case AArch64::LDRBBui:
+  case AArch64::LDRSBWui:
+  case AArch64::STRHHui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRSHWui:
+  case AArch64::STRWui:
+  case AArch64::LDRWui:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::STRXui:
+  case AArch64::LDRXui:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRDui:
+  case AArch64::STRDui:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSui:
+  case AArch64::STRSui:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRHui:
+  case AArch64::STRHui:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRBui:
+  case AArch64::STRBui:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(offset));
+  return Success;
+}
+
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  int64_t offset = fieldFromInstruction(insn, 12, 9);
+
+  // offset is a 9-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (9 - 1)))
+    offset |= ~((1LL << 9) - 1);
+
+  // First operand is always the writeback to the address register, if needed.
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
+  }
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFUMi:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::CreateImm(Rt));
+    break;
+  case AArch64::STURBBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURHHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURWi:
+  case AArch64::LDURWi:
+  case AArch64::LDTRSBWi:
+  case AArch64::LDTRSHWi:
+  case AArch64::STTRWi:
+  case AArch64::LDTRWi:
+  case AArch64::STTRHi:
+  case AArch64::LDTRHi:
+  case AArch64::LDTRBi:
+  case AArch64::STTRBi:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::LDURXi:
+  case AArch64::LDTRSBXi:
+  case AArch64::LDTRSHXi:
+  case AArch64::LDTRSWi:
+  case AArch64::STTRXi:
+  case AArch64::LDTRXi:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURDi:
+  case AArch64::STURDi:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSi:
+  case AArch64::STURSi:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURHi:
+  case AArch64::STURHi:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURBi:
+  case AArch64::STURBi:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(offset));
+
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
+  bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0;
+  bool IsFP = fieldFromInstruction(insn, 26, 1);
+
+  // Cannot write back to a transfer register (but xzr != sp).
+  if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn)
+    return SoftFail;
+
+  return Success;
+}
+
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  unsigned Rs = fieldFromInstruction(insn, 16, 5);
+
+  unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::STLXRW:
+  case AArch64::STLXRB:
+  case AArch64::STLXRH:
+  case AArch64::STXRW:
+  case AArch64::STXRB:
+  case AArch64::STXRH:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDARW:
+  case AArch64::LDARB:
+  case AArch64::LDARH:
+  case AArch64::LDAXRW:
+  case AArch64::LDAXRB:
+  case AArch64::LDAXRH:
+  case AArch64::LDXRW:
+  case AArch64::LDXRB:
+  case AArch64::LDXRH:
+  case AArch64::STLRW:
+  case AArch64::STLRB:
+  case AArch64::STLRH:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::STLXRX:
+  case AArch64::STXRX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDARX:
+  case AArch64::LDAXRX:
+  case AArch64::LDXRX:
+  case AArch64::STLRX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::STLXPW:
+  case AArch64::STXPW:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDAXPW:
+  case AArch64::LDXPW:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::STLXPX:
+  case AArch64::STXPX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDAXPX:
+  case AArch64::LDXPX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+
+  // You shouldn't load to the same register twice in an instruction...
+  if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW ||
+       Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) &&
+      Rt == Rt2)
+    return SoftFail;
+
+  return Success;
+}
+
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  int64_t offset = fieldFromInstruction(insn, 15, 7);
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
+
+  // offset is a 7-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (7 - 1)))
+    offset |= ~((1LL << 7) - 1);
+
+  unsigned Opcode = Inst.getOpcode();
+  bool NeedsDisjointWritebackTransfer = false;
+
+  // First operand is always writeback of base register.
+  switch (Opcode) {
+  default:
+    break;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
+  }
+
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+    NeedsDisjointWritebackTransfer = true;
+    // Fallthrough
+  case AArch64::LDNPXi:
+  case AArch64::STNPXi:
+  case AArch64::LDPXi:
+  case AArch64::STPXi:
+  case AArch64::LDPSWi:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+    NeedsDisjointWritebackTransfer = true;
+    // Fallthrough
+  case AArch64::LDNPWi:
+  case AArch64::STNPWi:
+  case AArch64::LDPWi:
+  case AArch64::STPWi:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPQi:
+  case AArch64::STNPQi:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPDi:
+  case AArch64::STNPDi:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDi:
+  case AArch64::STPDi:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPSi:
+  case AArch64::STNPSi:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSi:
+  case AArch64::STPSi:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(offset));
+
+  // You shouldn't load to the same register twice in an instruction...
+  if (IsLoad && Rt == Rt2)
+    return SoftFail;
+
+  // ... or do any operation that writes-back to a transfer register. But note
+  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
+  if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn))
+    return SoftFail;
+
+  return Success;
+}
+
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned extend = fieldFromInstruction(insn, 10, 6);
+
+  unsigned shift = extend & 0x7;
+  if (shift > 4)
+    return Fail;
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrx:
+  case AArch64::SUBWrx:
+    DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::SUBSWrx:
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDXrx:
+  case AArch64::SUBXrx:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDSXrx:
+  case AArch64::SUBSXrx:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDXrx64:
+  case AArch64::SUBXrx64:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::SUBSXrx64:
+  case AArch64::ADDSXrx64:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(extend));
+  return Success;
+}
+
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+  unsigned imm;
+
+  if (Datasize) {
+    if (Inst.getOpcode() == AArch64::ANDSXri)
+      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    imm = fieldFromInstruction(insn, 10, 13);
+    if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
+      return Fail;
+  } else {
+    if (Inst.getOpcode() == AArch64::ANDSWri)
+      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    imm = fieldFromInstruction(insn, 10, 12);
+    if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32))
+      return Fail;
+  }
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  return Success;
+}
+
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned cmode = fieldFromInstruction(insn, 12, 4);
+  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+  imm |= fieldFromInstruction(insn, 5, 5);
+
+  if (Inst.getOpcode() == AArch64::MOVID)
+    DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
+  else
+    DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case AArch64::MOVIv4i16:
+  case AArch64::MOVIv8i16:
+  case AArch64::MVNIv4i16:
+  case AArch64::MVNIv8i16:
+  case AArch64::MOVIv2i32:
+  case AArch64::MOVIv4i32:
+  case AArch64::MVNIv2i32:
+  case AArch64::MVNIv4i32:
+    Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
+    break;
+  case AArch64::MOVIv2s_msl:
+  case AArch64::MOVIv4s_msl:
+  case AArch64::MVNIv2s_msl:
+  case AArch64::MVNIv4s_msl:
+    Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108));
+    break;
+  }
+
+  return Success;
+}
+
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned cmode = fieldFromInstruction(insn, 12, 4);
+  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+  imm |= fieldFromInstruction(insn, 5, 5);
+
+  // Tied operands added twice.
+  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
+
+  return Success;
+}
+
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Addr, const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
+  imm |= fieldFromInstruction(insn, 29, 2);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 21-bit immediate.
+  if (imm & (1 << (21 - 1)))
+    imm |= ~((1LL << 21) - 1);
+
+  DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Addr, const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Imm = fieldFromInstruction(insn, 10, 14);
+  unsigned S = fieldFromInstruction(insn, 29, 1);
+  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+
+  unsigned ShifterVal = (Imm >> 12) & 3;
+  unsigned ImmVal = Imm & 0xFFF;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  if (ShifterVal != 0 && ShifterVal != 1)
+    return Fail;
+
+  if (Datasize) {
+    if (Rd == 31 && !S)
+      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  } else {
+    if (Rd == 31 && !S)
+      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+  }
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal));
+  return Success;
+}
+
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  int64_t imm = fieldFromInstruction(insn, 0, 26);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 26-bit immediate.
+  if (imm & (1 << (26 - 1)))
+    imm |= ~((1LL << 26) - 1);
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  uint64_t op1 = fieldFromInstruction(insn, 16, 3);
+  uint64_t op2 = fieldFromInstruction(insn, 5, 3);
+  uint64_t crm = fieldFromInstruction(insn, 8, 4);
+
+  uint64_t pstate_field = (op1 << 3) | op2;
+
+  Inst.addOperand(MCOperand::CreateImm(pstate_field));
+  Inst.addOperand(MCOperand::CreateImm(crm));
+
+  bool ValidNamed;
+  (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed);
+  
+  return ValidNamed ? Success : Fail;
+}
+
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Addr, const void *Decoder) {
+  uint64_t Rt = fieldFromInstruction(insn, 0, 5);
+  uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
+  bit |= fieldFromInstruction(insn, 19, 5);
+  int64_t dst = fieldFromInstruction(insn, 5, 14);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 14-bit immediate.
+  if (dst & (1 << (14 - 1)))
+    dst |= ~((1LL << 14) - 1);
+
+  if (fieldFromInstruction(insn, 31, 1) == 0)
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+  else
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(bit));
+  if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(dst));
+
+  return Success;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
new file mode 100644
index 0000000..68d4867
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -0,0 +1,40 @@
+//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64DISASSEMBLER_H
+#define AArch64DISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+
+class AArch64Disassembler : public MCDisassembler {
+public:
+  AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+
+  ~AArch64Disassembler() {}
+
+  /// getInstruction - See MCDisassembler.
+  MCDisassembler::DecodeStatus
+  getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region,
+                 uint64_t address, raw_ostream &vStream,
+                 raw_ostream &cStream) const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
new file mode 100644
index 0000000..2057c51
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -0,0 +1,220 @@
+//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+static MCSymbolRefExpr::VariantKind
+getVariant(uint64_t LLVMDisassembler_VariantKind) {
+  switch (LLVMDisassembler_VariantKind) {
+  case LLVMDisassembler_VariantKind_None:
+    return MCSymbolRefExpr::VK_None;
+  case LLVMDisassembler_VariantKind_ARM64_PAGE:
+    return MCSymbolRefExpr::VK_PAGE;
+  case LLVMDisassembler_VariantKind_ARM64_PAGEOFF:
+    return MCSymbolRefExpr::VK_PAGEOFF;
+  case LLVMDisassembler_VariantKind_ARM64_GOTPAGE:
+    return MCSymbolRefExpr::VK_GOTPAGE;
+  case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
+    return MCSymbolRefExpr::VK_GOTPAGEOFF;
+  case LLVMDisassembler_VariantKind_ARM64_TLVP:
+  case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
+  default:
+    llvm_unreachable("bad LLVMDisassembler_VariantKind");
+  }
+}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+/// operand in place of the immediate Value in the MCInst.  The immediate
+/// Value has not had any PC adjustment made by the caller. If the instruction
+/// is a branch that adds the PC to the immediate Value then isBranch is
+/// Success, else Fail. If GetOpInfo is non-null, then it is called to get any
+/// symbolic information at the Address for this instrution.  If that returns
+/// non-zero then the symbolic information it returns is used to create an
+/// MCExpr and that is added as an operand to the MCInst.  If GetOpInfo()
+/// returns zero and isBranch is Success then a symbol look up for
+/// Address + Value is done and if a symbol is found an MCExpr is created with
+/// that, else an MCExpr with Address + Value is created.  If GetOpInfo()
+/// returns zero and isBranch is Fail then the the Opcode of the MCInst is
+/// tested and for ADRP an other instructions that help to load of pointers
+/// a symbol look up is done to see it is returns a specific reference type
+/// to add to the comment stream.  This function returns Success if it adds
+/// an operand to the MCInst and Fail otherwise.
+bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
+    MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
+    bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+  // FIXME: This method shares a lot of code with
+  //        MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible
+  //        refactor the MCExternalSymbolizer interface to allow more of this
+  //        implementation to be shared.
+  //
+  struct LLVMOpInfo1 SymbolicOp;
+  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+  uint64_t ReferenceType;
+  const char *ReferenceName;
+  if (!GetOpInfo ||
+      !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+    if (IsBranch) {
+      ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+      const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
+                                      Address, &ReferenceName);
+      if (Name) {
+        SymbolicOp.AddSymbol.Name = Name;
+        SymbolicOp.AddSymbol.Present = true;
+        SymbolicOp.Value = 0;
+      } else {
+        SymbolicOp.Value = Address + Value;
+      }
+      if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+        CommentStream << "symbol stub for: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        CommentStream << "Objc message: " << ReferenceName;
+    } else if (MI.getOpcode() == AArch64::ADRP) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
+        // otool expects the fully encoded ADRP instruction to be passed in as
+        // the value here, so reconstruct it:
+        const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+        uint32_t EncodedInst = 0x90000000;
+        EncodedInst |= (Value & 0x3) << 29; // immlo
+        EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi
+        EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
+        SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                     &ReferenceName);
+        CommentStream << format("0x%llx",
+                                0xfffffffffffff000LL & (Address + Value));
+    } else if (MI.getOpcode() == AArch64::ADDXri ||
+               MI.getOpcode() == AArch64::LDRXui ||
+               MI.getOpcode() == AArch64::LDRXl ||
+               MI.getOpcode() == AArch64::ADR) {
+      if (MI.getOpcode() == AArch64::ADDXri)
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
+      else if (MI.getOpcode() == AArch64::LDRXui)
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
+      if (MI.getOpcode() == AArch64::LDRXl) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
+        SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                     &ReferenceName);
+      } else if (MI.getOpcode() == AArch64::ADR) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
+        SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                            &ReferenceName);
+      } else {
+        const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+        // otool expects the fully encoded ADD/LDR instruction to be passed in
+        // as the value here, so reconstruct it:
+        unsigned EncodedInst =
+          MI.getOpcode() == AArch64::ADDXri ? 0x91000000: 0xF9400000;
+        EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD]
+        EncodedInst |=
+          MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn
+        EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd
+
+        SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                     &ReferenceName);
+      }
+      if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+        CommentStream << "literal pool symbol address: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+        CommentStream << "literal pool for: \"" << ReferenceName << "\"";
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+        CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        CommentStream << "Objc message: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+        CommentStream << "Objc message ref: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+        CommentStream << "Objc selector ref: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+        CommentStream << "Objc class ref: " << ReferenceName;
+      // For these instructions, the SymbolLookUp() above is just to get the
+      // ReferenceType and ReferenceName.  We want to make sure not to
+      // fall through so we don't build an MCExpr to leave the disassembly
+      // of the immediate values of these instructions to the InstPrinter.
+      return false;
+    } else {
+      return false;
+    }
+  }
+
+  const MCExpr *Add = nullptr;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
+      if (Variant != MCSymbolRefExpr::VK_None)
+        Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx);
+      else
+        Add = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Sub = nullptr;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Off = nullptr;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
+    if (Off)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, Ctx);
+  }
+
+  MI.addOperand(MCOperand::CreateExpr(Expr));
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
new file mode 100644
index 0000000..171d31c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -0,0 +1,38 @@
+//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Symbolize AArch64 assembly code during disassembly using callbacks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64EXTERNALSYMBOLIZER_H
+#define AArch64EXTERNALSYMBOLIZER_H
+
+#include "llvm/MC/MCExternalSymbolizer.h"
+
+namespace llvm {
+
+class AArch64ExternalSymbolizer : public MCExternalSymbolizer {
+public:
+  AArch64ExternalSymbolizer(MCContext &Ctx,
+                            std::unique_ptr<MCRelocationInfo> RelInfo,
+                            LLVMOpInfoCallback GetOpInfo,
+                            LLVMSymbolLookupCallback SymbolLookUp,
+                            void *DisInfo)
+      : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp,
+                             DisInfo) {}
+
+  bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
+                                int64_t Value, uint64_t Address, bool IsBranch,
+                                uint64_t Offset, uint64_t InstSize) override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
new file mode 100644
index 0000000..8a21f06
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -0,0 +1,1316 @@
+//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter.inc"
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter1.inc"
+
+AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI,
+                                       const MCInstrInfo &MII,
+                                       const MCRegisterInfo &MRI,
+                                       const MCSubtargetInfo &STI)
+    : MCInstPrinter(MAI, MII, MRI) {
+  // Initialize the set of available features.
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI)
+    : AArch64InstPrinter(MAI, MII, MRI, STI) {}
+
+void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // This is for .cfi directives.
+  OS << getRegisterName(RegNo);
+}
+
+void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  // Check for special encodings and print the canonical alias instead.
+
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == AArch64::SYSxt)
+    if (printSysAlias(MI, O)) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+  // SBFM/UBFM should print to a nicer aliased form if possible.
+  if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri ||
+      Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) {
+    const MCOperand &Op0 = MI->getOperand(0);
+    const MCOperand &Op1 = MI->getOperand(1);
+    const MCOperand &Op2 = MI->getOperand(2);
+    const MCOperand &Op3 = MI->getOperand(3);
+
+    bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri);
+    bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri);
+    if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
+      const char *AsmMnemonic = nullptr;
+
+      switch (Op3.getImm()) {
+      default:
+        break;
+      case 7:
+        if (IsSigned)
+          AsmMnemonic = "sxtb";
+        else if (!Is64Bit)
+          AsmMnemonic = "uxtb";
+        break;
+      case 15:
+        if (IsSigned)
+          AsmMnemonic = "sxth";
+        else if (!Is64Bit)
+          AsmMnemonic = "uxth";
+        break;
+      case 31:
+        // *xtw is only valid for signed 64-bit operations.
+        if (Is64Bit && IsSigned)
+          AsmMnemonic = "sxtw";
+        break;
+      }
+
+      if (AsmMnemonic) {
+        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+          << ", " << getRegisterName(getWRegFromXReg(Op1.getReg()));
+        printAnnotation(O, Annot);
+        return;
+      }
+    }
+
+    // All immediate shifts are aliases, implemented using the Bitfield
+    // instruction. In all cases the immediate shift amount shift must be in
+    // the range 0 to (reg.size -1).
+    if (Op2.isImm() && Op3.isImm()) {
+      const char *AsmMnemonic = nullptr;
+      int shift = 0;
+      int64_t immr = Op2.getImm();
+      int64_t imms = Op3.getImm();
+      if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
+        AsmMnemonic = "lsl";
+        shift = 31 - imms;
+      } else if (Opcode == AArch64::UBFMXri && imms != 0x3f &&
+                 ((imms + 1 == immr))) {
+        AsmMnemonic = "lsl";
+        shift = 63 - imms;
+      } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) {
+        AsmMnemonic = "lsr";
+        shift = immr;
+      } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) {
+        AsmMnemonic = "lsr";
+        shift = immr;
+      } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) {
+        AsmMnemonic = "asr";
+        shift = immr;
+      } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) {
+        AsmMnemonic = "asr";
+        shift = immr;
+      }
+      if (AsmMnemonic) {
+        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+          << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
+        printAnnotation(O, Annot);
+        return;
+      }
+    }
+
+    // SBFIZ/UBFIZ aliases
+    if (Op2.getImm() > Op3.getImm()) {
+      O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t'
+        << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+        << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1;
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    // Otherwise SBFX/UBFX is the preferred form
+    O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t'
+      << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+      << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1;
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) {
+    const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0
+    const MCOperand &Op2 = MI->getOperand(2);
+    int ImmR = MI->getOperand(3).getImm();
+    int ImmS = MI->getOperand(4).getImm();
+
+    // BFI alias
+    if (ImmS < ImmR) {
+      int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+      int LSB = (BitWidth - ImmR) % BitWidth;
+      int Width = ImmS + 1;
+      O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", "
+        << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width;
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    int LSB = ImmR;
+    int Width = ImmS - ImmR + 1;
+    // Otherwise BFXIL the preferred form
+    O << "\tbfxil\t"
+      << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg())
+      << ", #" << LSB << ", #" << Width;
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
+  // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
+  // printed.
+  if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi ||
+       Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+      MI->getOperand(1).isExpr()) {
+    if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi)
+      O << "\tmovz\t";
+    else
+      O << "\tmovn\t";
+
+    O << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+      << *MI->getOperand(1).getExpr();
+    return;
+  }
+
+  if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) &&
+      MI->getOperand(2).isExpr()) {
+    O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+      << *MI->getOperand(2).getExpr();
+    return;
+  }
+
+  if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
+
+  printAnnotation(O, Annot);
+}
+
+static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
+                                bool &IsTbx) {
+  switch (Opcode) {
+  case AArch64::TBXv8i8One:
+  case AArch64::TBXv8i8Two:
+  case AArch64::TBXv8i8Three:
+  case AArch64::TBXv8i8Four:
+    IsTbx = true;
+    Layout = ".8b";
+    return true;
+  case AArch64::TBLv8i8One:
+  case AArch64::TBLv8i8Two:
+  case AArch64::TBLv8i8Three:
+  case AArch64::TBLv8i8Four:
+    IsTbx = false;
+    Layout = ".8b";
+    return true;
+  case AArch64::TBXv16i8One:
+  case AArch64::TBXv16i8Two:
+  case AArch64::TBXv16i8Three:
+  case AArch64::TBXv16i8Four:
+    IsTbx = true;
+    Layout = ".16b";
+    return true;
+  case AArch64::TBLv16i8One:
+  case AArch64::TBLv16i8Two:
+  case AArch64::TBLv16i8Three:
+  case AArch64::TBLv16i8Four:
+    IsTbx = false;
+    Layout = ".16b";
+    return true;
+  default:
+    return false;
+  }
+}
+
+struct LdStNInstrDesc {
+  unsigned Opcode;
+  const char *Mnemonic;
+  const char *Layout;
+  int ListOperand;
+  bool HasLane;
+  int NaturalOffset;
+};
+
+static LdStNInstrDesc LdStNInstInfo[] = {
+  { AArch64::LD1i8,             "ld1",  ".b",     1, true,  0  },
+  { AArch64::LD1i16,            "ld1",  ".h",     1, true,  0  },
+  { AArch64::LD1i32,            "ld1",  ".s",     1, true,  0  },
+  { AArch64::LD1i64,            "ld1",  ".d",     1, true,  0  },
+  { AArch64::LD1i8_POST,        "ld1",  ".b",     2, true,  1  },
+  { AArch64::LD1i16_POST,       "ld1",  ".h",     2, true,  2  },
+  { AArch64::LD1i32_POST,       "ld1",  ".s",     2, true,  4  },
+  { AArch64::LD1i64_POST,       "ld1",  ".d",     2, true,  8  },
+  { AArch64::LD1Rv16b,          "ld1r", ".16b",   0, false, 0  },
+  { AArch64::LD1Rv8h,           "ld1r", ".8h",    0, false, 0  },
+  { AArch64::LD1Rv4s,           "ld1r", ".4s",    0, false, 0  },
+  { AArch64::LD1Rv2d,           "ld1r", ".2d",    0, false, 0  },
+  { AArch64::LD1Rv8b,           "ld1r", ".8b",    0, false, 0  },
+  { AArch64::LD1Rv4h,           "ld1r", ".4h",    0, false, 0  },
+  { AArch64::LD1Rv2s,           "ld1r", ".2s",    0, false, 0  },
+  { AArch64::LD1Rv1d,           "ld1r", ".1d",    0, false, 0  },
+  { AArch64::LD1Rv16b_POST,     "ld1r", ".16b",   1, false, 1  },
+  { AArch64::LD1Rv8h_POST,      "ld1r", ".8h",    1, false, 2  },
+  { AArch64::LD1Rv4s_POST,      "ld1r", ".4s",    1, false, 4  },
+  { AArch64::LD1Rv2d_POST,      "ld1r", ".2d",    1, false, 8  },
+  { AArch64::LD1Rv8b_POST,      "ld1r", ".8b",    1, false, 1  },
+  { AArch64::LD1Rv4h_POST,      "ld1r", ".4h",    1, false, 2  },
+  { AArch64::LD1Rv2s_POST,      "ld1r", ".2s",    1, false, 4  },
+  { AArch64::LD1Rv1d_POST,      "ld1r", ".1d",    1, false, 8  },
+  { AArch64::LD1Onev16b,        "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Onev8h,         "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Onev4s,         "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Onev2d,         "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Onev8b,         "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Onev4h,         "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Onev2s,         "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Onev1d,         "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Onev16b_POST,   "ld1",  ".16b",   1, false, 16 },
+  { AArch64::LD1Onev8h_POST,    "ld1",  ".8h",    1, false, 16 },
+  { AArch64::LD1Onev4s_POST,    "ld1",  ".4s",    1, false, 16 },
+  { AArch64::LD1Onev2d_POST,    "ld1",  ".2d",    1, false, 16 },
+  { AArch64::LD1Onev8b_POST,    "ld1",  ".8b",    1, false, 8  },
+  { AArch64::LD1Onev4h_POST,    "ld1",  ".4h",    1, false, 8  },
+  { AArch64::LD1Onev2s_POST,    "ld1",  ".2s",    1, false, 8  },
+  { AArch64::LD1Onev1d_POST,    "ld1",  ".1d",    1, false, 8  },
+  { AArch64::LD1Twov16b,        "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Twov8h,         "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Twov4s,         "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Twov2d,         "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Twov8b,         "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Twov4h,         "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Twov2s,         "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Twov1d,         "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Twov16b_POST,   "ld1",  ".16b",   1, false, 32 },
+  { AArch64::LD1Twov8h_POST,    "ld1",  ".8h",    1, false, 32 },
+  { AArch64::LD1Twov4s_POST,    "ld1",  ".4s",    1, false, 32 },
+  { AArch64::LD1Twov2d_POST,    "ld1",  ".2d",    1, false, 32 },
+  { AArch64::LD1Twov8b_POST,    "ld1",  ".8b",    1, false, 16 },
+  { AArch64::LD1Twov4h_POST,    "ld1",  ".4h",    1, false, 16 },
+  { AArch64::LD1Twov2s_POST,    "ld1",  ".2s",    1, false, 16 },
+  { AArch64::LD1Twov1d_POST,    "ld1",  ".1d",    1, false, 16 },
+  { AArch64::LD1Threev16b,      "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Threev8h,       "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Threev4s,       "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Threev2d,       "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Threev8b,       "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Threev4h,       "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Threev2s,       "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Threev1d,       "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Threev16b_POST, "ld1",  ".16b",   1, false, 48 },
+  { AArch64::LD1Threev8h_POST,  "ld1",  ".8h",    1, false, 48 },
+  { AArch64::LD1Threev4s_POST,  "ld1",  ".4s",    1, false, 48 },
+  { AArch64::LD1Threev2d_POST,  "ld1",  ".2d",    1, false, 48 },
+  { AArch64::LD1Threev8b_POST,  "ld1",  ".8b",    1, false, 24 },
+  { AArch64::LD1Threev4h_POST,  "ld1",  ".4h",    1, false, 24 },
+  { AArch64::LD1Threev2s_POST,  "ld1",  ".2s",    1, false, 24 },
+  { AArch64::LD1Threev1d_POST,  "ld1",  ".1d",    1, false, 24 },
+  { AArch64::LD1Fourv16b,       "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Fourv8h,        "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Fourv4s,        "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Fourv2d,        "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Fourv8b,        "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Fourv4h,        "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Fourv2s,        "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Fourv1d,        "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Fourv16b_POST,  "ld1",  ".16b",   1, false, 64 },
+  { AArch64::LD1Fourv8h_POST,   "ld1",  ".8h",    1, false, 64 },
+  { AArch64::LD1Fourv4s_POST,   "ld1",  ".4s",    1, false, 64 },
+  { AArch64::LD1Fourv2d_POST,   "ld1",  ".2d",    1, false, 64 },
+  { AArch64::LD1Fourv8b_POST,   "ld1",  ".8b",    1, false, 32 },
+  { AArch64::LD1Fourv4h_POST,   "ld1",  ".4h",    1, false, 32 },
+  { AArch64::LD1Fourv2s_POST,   "ld1",  ".2s",    1, false, 32 },
+  { AArch64::LD1Fourv1d_POST,   "ld1",  ".1d",    1, false, 32 },
+  { AArch64::LD2i8,             "ld2",  ".b",     1, true,  0  },
+  { AArch64::LD2i16,            "ld2",  ".h",     1, true,  0  },
+  { AArch64::LD2i32,            "ld2",  ".s",     1, true,  0  },
+  { AArch64::LD2i64,            "ld2",  ".d",     1, true,  0  },
+  { AArch64::LD2i8_POST,        "ld2",  ".b",     2, true,  2  },
+  { AArch64::LD2i16_POST,       "ld2",  ".h",     2, true,  4  },
+  { AArch64::LD2i32_POST,       "ld2",  ".s",     2, true,  8  },
+  { AArch64::LD2i64_POST,       "ld2",  ".d",     2, true,  16  },
+  { AArch64::LD2Rv16b,          "ld2r", ".16b",   0, false, 0  },
+  { AArch64::LD2Rv8h,           "ld2r", ".8h",    0, false, 0  },
+  { AArch64::LD2Rv4s,           "ld2r", ".4s",    0, false, 0  },
+  { AArch64::LD2Rv2d,           "ld2r", ".2d",    0, false, 0  },
+  { AArch64::LD2Rv8b,           "ld2r", ".8b",    0, false, 0  },
+  { AArch64::LD2Rv4h,           "ld2r", ".4h",    0, false, 0  },
+  { AArch64::LD2Rv2s,           "ld2r", ".2s",    0, false, 0  },
+  { AArch64::LD2Rv1d,           "ld2r", ".1d",    0, false, 0  },
+  { AArch64::LD2Rv16b_POST,     "ld2r", ".16b",   1, false, 2  },
+  { AArch64::LD2Rv8h_POST,      "ld2r", ".8h",    1, false, 4  },
+  { AArch64::LD2Rv4s_POST,      "ld2r", ".4s",    1, false, 8  },
+  { AArch64::LD2Rv2d_POST,      "ld2r", ".2d",    1, false, 16 },
+  { AArch64::LD2Rv8b_POST,      "ld2r", ".8b",    1, false, 2  },
+  { AArch64::LD2Rv4h_POST,      "ld2r", ".4h",    1, false, 4  },
+  { AArch64::LD2Rv2s_POST,      "ld2r", ".2s",    1, false, 8  },
+  { AArch64::LD2Rv1d_POST,      "ld2r", ".1d",    1, false, 16 },
+  { AArch64::LD2Twov16b,        "ld2",  ".16b",   0, false, 0  },
+  { AArch64::LD2Twov8h,         "ld2",  ".8h",    0, false, 0  },
+  { AArch64::LD2Twov4s,         "ld2",  ".4s",    0, false, 0  },
+  { AArch64::LD2Twov2d,         "ld2",  ".2d",    0, false, 0  },
+  { AArch64::LD2Twov8b,         "ld2",  ".8b",    0, false, 0  },
+  { AArch64::LD2Twov4h,         "ld2",  ".4h",    0, false, 0  },
+  { AArch64::LD2Twov2s,         "ld2",  ".2s",    0, false, 0  },
+  { AArch64::LD2Twov16b_POST,   "ld2",  ".16b",   1, false, 32 },
+  { AArch64::LD2Twov8h_POST,    "ld2",  ".8h",    1, false, 32 },
+  { AArch64::LD2Twov4s_POST,    "ld2",  ".4s",    1, false, 32 },
+  { AArch64::LD2Twov2d_POST,    "ld2",  ".2d",    1, false, 32 },
+  { AArch64::LD2Twov8b_POST,    "ld2",  ".8b",    1, false, 16 },
+  { AArch64::LD2Twov4h_POST,    "ld2",  ".4h",    1, false, 16 },
+  { AArch64::LD2Twov2s_POST,    "ld2",  ".2s",    1, false, 16 },
+  { AArch64::LD3i8,             "ld3",  ".b",     1, true,  0  },
+  { AArch64::LD3i16,            "ld3",  ".h",     1, true,  0  },
+  { AArch64::LD3i32,            "ld3",  ".s",     1, true,  0  },
+  { AArch64::LD3i64,            "ld3",  ".d",     1, true,  0  },
+  { AArch64::LD3i8_POST,        "ld3",  ".b",     2, true,  3  },
+  { AArch64::LD3i16_POST,       "ld3",  ".h",     2, true,  6  },
+  { AArch64::LD3i32_POST,       "ld3",  ".s",     2, true,  12  },
+  { AArch64::LD3i64_POST,       "ld3",  ".d",     2, true,  24  },
+  { AArch64::LD3Rv16b,          "ld3r", ".16b",   0, false, 0  },
+  { AArch64::LD3Rv8h,           "ld3r", ".8h",    0, false, 0  },
+  { AArch64::LD3Rv4s,           "ld3r", ".4s",    0, false, 0  },
+  { AArch64::LD3Rv2d,           "ld3r", ".2d",    0, false, 0  },
+  { AArch64::LD3Rv8b,           "ld3r", ".8b",    0, false, 0  },
+  { AArch64::LD3Rv4h,           "ld3r", ".4h",    0, false, 0  },
+  { AArch64::LD3Rv2s,           "ld3r", ".2s",    0, false, 0  },
+  { AArch64::LD3Rv1d,           "ld3r", ".1d",    0, false, 0  },
+  { AArch64::LD3Rv16b_POST,     "ld3r", ".16b",   1, false, 3  },
+  { AArch64::LD3Rv8h_POST,      "ld3r", ".8h",    1, false, 6  },
+  { AArch64::LD3Rv4s_POST,      "ld3r", ".4s",    1, false, 12 },
+  { AArch64::LD3Rv2d_POST,      "ld3r", ".2d",    1, false, 24 },
+  { AArch64::LD3Rv8b_POST,      "ld3r", ".8b",    1, false, 3  },
+  { AArch64::LD3Rv4h_POST,      "ld3r", ".4h",    1, false, 6  },
+  { AArch64::LD3Rv2s_POST,      "ld3r", ".2s",    1, false, 12 },
+  { AArch64::LD3Rv1d_POST,      "ld3r", ".1d",    1, false, 24 },
+  { AArch64::LD3Threev16b,      "ld3",  ".16b",   0, false, 0  },
+  { AArch64::LD3Threev8h,       "ld3",  ".8h",    0, false, 0  },
+  { AArch64::LD3Threev4s,       "ld3",  ".4s",    0, false, 0  },
+  { AArch64::LD3Threev2d,       "ld3",  ".2d",    0, false, 0  },
+  { AArch64::LD3Threev8b,       "ld3",  ".8b",    0, false, 0  },
+  { AArch64::LD3Threev4h,       "ld3",  ".4h",    0, false, 0  },
+  { AArch64::LD3Threev2s,       "ld3",  ".2s",    0, false, 0  },
+  { AArch64::LD3Threev16b_POST, "ld3",  ".16b",   1, false, 48 },
+  { AArch64::LD3Threev8h_POST,  "ld3",  ".8h",    1, false, 48 },
+  { AArch64::LD3Threev4s_POST,  "ld3",  ".4s",    1, false, 48 },
+  { AArch64::LD3Threev2d_POST,  "ld3",  ".2d",    1, false, 48 },
+  { AArch64::LD3Threev8b_POST,  "ld3",  ".8b",    1, false, 24 },
+  { AArch64::LD3Threev4h_POST,  "ld3",  ".4h",    1, false, 24 },
+  { AArch64::LD3Threev2s_POST,  "ld3",  ".2s",    1, false, 24 },
+  { AArch64::LD4i8,             "ld4",  ".b",     1, true,  0  },
+  { AArch64::LD4i16,            "ld4",  ".h",     1, true,  0  },
+  { AArch64::LD4i32,            "ld4",  ".s",     1, true,  0  },
+  { AArch64::LD4i64,            "ld4",  ".d",     1, true,  0  },
+  { AArch64::LD4i8_POST,        "ld4",  ".b",     2, true,  4  },
+  { AArch64::LD4i16_POST,       "ld4",  ".h",     2, true,  8  },
+  { AArch64::LD4i32_POST,       "ld4",  ".s",     2, true,  16 },
+  { AArch64::LD4i64_POST,       "ld4",  ".d",     2, true,  32 },
+  { AArch64::LD4Rv16b,          "ld4r", ".16b",   0, false, 0  },
+  { AArch64::LD4Rv8h,           "ld4r", ".8h",    0, false, 0  },
+  { AArch64::LD4Rv4s,           "ld4r", ".4s",    0, false, 0  },
+  { AArch64::LD4Rv2d,           "ld4r", ".2d",    0, false, 0  },
+  { AArch64::LD4Rv8b,           "ld4r", ".8b",    0, false, 0  },
+  { AArch64::LD4Rv4h,           "ld4r", ".4h",    0, false, 0  },
+  { AArch64::LD4Rv2s,           "ld4r", ".2s",    0, false, 0  },
+  { AArch64::LD4Rv1d,           "ld4r", ".1d",    0, false, 0  },
+  { AArch64::LD4Rv16b_POST,     "ld4r", ".16b",   1, false, 4  },
+  { AArch64::LD4Rv8h_POST,      "ld4r", ".8h",    1, false, 8  },
+  { AArch64::LD4Rv4s_POST,      "ld4r", ".4s",    1, false, 16 },
+  { AArch64::LD4Rv2d_POST,      "ld4r", ".2d",    1, false, 32 },
+  { AArch64::LD4Rv8b_POST,      "ld4r", ".8b",    1, false, 4  },
+  { AArch64::LD4Rv4h_POST,      "ld4r", ".4h",    1, false, 8  },
+  { AArch64::LD4Rv2s_POST,      "ld4r", ".2s",    1, false, 16 },
+  { AArch64::LD4Rv1d_POST,      "ld4r", ".1d",    1, false, 32 },
+  { AArch64::LD4Fourv16b,       "ld4",  ".16b",   0, false, 0  },
+  { AArch64::LD4Fourv8h,        "ld4",  ".8h",    0, false, 0  },
+  { AArch64::LD4Fourv4s,        "ld4",  ".4s",    0, false, 0  },
+  { AArch64::LD4Fourv2d,        "ld4",  ".2d",    0, false, 0  },
+  { AArch64::LD4Fourv8b,        "ld4",  ".8b",    0, false, 0  },
+  { AArch64::LD4Fourv4h,        "ld4",  ".4h",    0, false, 0  },
+  { AArch64::LD4Fourv2s,        "ld4",  ".2s",    0, false, 0  },
+  { AArch64::LD4Fourv16b_POST,  "ld4",  ".16b",   1, false, 64 },
+  { AArch64::LD4Fourv8h_POST,   "ld4",  ".8h",    1, false, 64 },
+  { AArch64::LD4Fourv4s_POST,   "ld4",  ".4s",    1, false, 64 },
+  { AArch64::LD4Fourv2d_POST,   "ld4",  ".2d",    1, false, 64 },
+  { AArch64::LD4Fourv8b_POST,   "ld4",  ".8b",    1, false, 32 },
+  { AArch64::LD4Fourv4h_POST,   "ld4",  ".4h",    1, false, 32 },
+  { AArch64::LD4Fourv2s_POST,   "ld4",  ".2s",    1, false, 32 },
+  { AArch64::ST1i8,             "st1",  ".b",     0, true,  0  },
+  { AArch64::ST1i16,            "st1",  ".h",     0, true,  0  },
+  { AArch64::ST1i32,            "st1",  ".s",     0, true,  0  },
+  { AArch64::ST1i64,            "st1",  ".d",     0, true,  0  },
+  { AArch64::ST1i8_POST,        "st1",  ".b",     1, true,  1  },
+  { AArch64::ST1i16_POST,       "st1",  ".h",     1, true,  2  },
+  { AArch64::ST1i32_POST,       "st1",  ".s",     1, true,  4  },
+  { AArch64::ST1i64_POST,       "st1",  ".d",     1, true,  8  },
+  { AArch64::ST1Onev16b,        "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Onev8h,         "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Onev4s,         "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Onev2d,         "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Onev8b,         "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Onev4h,         "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Onev2s,         "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Onev1d,         "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Onev16b_POST,   "st1",  ".16b",   1, false, 16 },
+  { AArch64::ST1Onev8h_POST,    "st1",  ".8h",    1, false, 16 },
+  { AArch64::ST1Onev4s_POST,    "st1",  ".4s",    1, false, 16 },
+  { AArch64::ST1Onev2d_POST,    "st1",  ".2d",    1, false, 16 },
+  { AArch64::ST1Onev8b_POST,    "st1",  ".8b",    1, false, 8  },
+  { AArch64::ST1Onev4h_POST,    "st1",  ".4h",    1, false, 8  },
+  { AArch64::ST1Onev2s_POST,    "st1",  ".2s",    1, false, 8  },
+  { AArch64::ST1Onev1d_POST,    "st1",  ".1d",    1, false, 8  },
+  { AArch64::ST1Twov16b,        "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Twov8h,         "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Twov4s,         "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Twov2d,         "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Twov8b,         "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Twov4h,         "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Twov2s,         "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Twov1d,         "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Twov16b_POST,   "st1",  ".16b",   1, false, 32 },
+  { AArch64::ST1Twov8h_POST,    "st1",  ".8h",    1, false, 32 },
+  { AArch64::ST1Twov4s_POST,    "st1",  ".4s",    1, false, 32 },
+  { AArch64::ST1Twov2d_POST,    "st1",  ".2d",    1, false, 32 },
+  { AArch64::ST1Twov8b_POST,    "st1",  ".8b",    1, false, 16 },
+  { AArch64::ST1Twov4h_POST,    "st1",  ".4h",    1, false, 16 },
+  { AArch64::ST1Twov2s_POST,    "st1",  ".2s",    1, false, 16 },
+  { AArch64::ST1Twov1d_POST,    "st1",  ".1d",    1, false, 16 },
+  { AArch64::ST1Threev16b,      "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Threev8h,       "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Threev4s,       "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Threev2d,       "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Threev8b,       "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Threev4h,       "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Threev2s,       "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Threev1d,       "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Threev16b_POST, "st1",  ".16b",   1, false, 48 },
+  { AArch64::ST1Threev8h_POST,  "st1",  ".8h",    1, false, 48 },
+  { AArch64::ST1Threev4s_POST,  "st1",  ".4s",    1, false, 48 },
+  { AArch64::ST1Threev2d_POST,  "st1",  ".2d",    1, false, 48 },
+  { AArch64::ST1Threev8b_POST,  "st1",  ".8b",    1, false, 24 },
+  { AArch64::ST1Threev4h_POST,  "st1",  ".4h",    1, false, 24 },
+  { AArch64::ST1Threev2s_POST,  "st1",  ".2s",    1, false, 24 },
+  { AArch64::ST1Threev1d_POST,  "st1",  ".1d",    1, false, 24 },
+  { AArch64::ST1Fourv16b,       "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Fourv8h,        "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Fourv4s,        "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Fourv2d,        "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Fourv8b,        "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Fourv4h,        "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Fourv2s,        "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Fourv1d,        "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Fourv16b_POST,  "st1",  ".16b",   1, false, 64 },
+  { AArch64::ST1Fourv8h_POST,   "st1",  ".8h",    1, false, 64 },
+  { AArch64::ST1Fourv4s_POST,   "st1",  ".4s",    1, false, 64 },
+  { AArch64::ST1Fourv2d_POST,   "st1",  ".2d",    1, false, 64 },
+  { AArch64::ST1Fourv8b_POST,   "st1",  ".8b",    1, false, 32 },
+  { AArch64::ST1Fourv4h_POST,   "st1",  ".4h",    1, false, 32 },
+  { AArch64::ST1Fourv2s_POST,   "st1",  ".2s",    1, false, 32 },
+  { AArch64::ST1Fourv1d_POST,   "st1",  ".1d",    1, false, 32 },
+  { AArch64::ST2i8,             "st2",  ".b",     0, true,  0  },
+  { AArch64::ST2i16,            "st2",  ".h",     0, true,  0  },
+  { AArch64::ST2i32,            "st2",  ".s",     0, true,  0  },
+  { AArch64::ST2i64,            "st2",  ".d",     0, true,  0  },
+  { AArch64::ST2i8_POST,        "st2",  ".b",     1, true,  2  },
+  { AArch64::ST2i16_POST,       "st2",  ".h",     1, true,  4  },
+  { AArch64::ST2i32_POST,       "st2",  ".s",     1, true,  8  },
+  { AArch64::ST2i64_POST,       "st2",  ".d",     1, true,  16 },
+  { AArch64::ST2Twov16b,        "st2",  ".16b",   0, false, 0  },
+  { AArch64::ST2Twov8h,         "st2",  ".8h",    0, false, 0  },
+  { AArch64::ST2Twov4s,         "st2",  ".4s",    0, false, 0  },
+  { AArch64::ST2Twov2d,         "st2",  ".2d",    0, false, 0  },
+  { AArch64::ST2Twov8b,         "st2",  ".8b",    0, false, 0  },
+  { AArch64::ST2Twov4h,         "st2",  ".4h",    0, false, 0  },
+  { AArch64::ST2Twov2s,         "st2",  ".2s",    0, false, 0  },
+  { AArch64::ST2Twov16b_POST,   "st2",  ".16b",   1, false, 32 },
+  { AArch64::ST2Twov8h_POST,    "st2",  ".8h",    1, false, 32 },
+  { AArch64::ST2Twov4s_POST,    "st2",  ".4s",    1, false, 32 },
+  { AArch64::ST2Twov2d_POST,    "st2",  ".2d",    1, false, 32 },
+  { AArch64::ST2Twov8b_POST,    "st2",  ".8b",    1, false, 16 },
+  { AArch64::ST2Twov4h_POST,    "st2",  ".4h",    1, false, 16 },
+  { AArch64::ST2Twov2s_POST,    "st2",  ".2s",    1, false, 16 },
+  { AArch64::ST3i8,             "st3",  ".b",     0, true,  0  },
+  { AArch64::ST3i16,            "st3",  ".h",     0, true,  0  },
+  { AArch64::ST3i32,            "st3",  ".s",     0, true,  0  },
+  { AArch64::ST3i64,            "st3",  ".d",     0, true,  0  },
+  { AArch64::ST3i8_POST,        "st3",  ".b",     1, true,  3  },
+  { AArch64::ST3i16_POST,       "st3",  ".h",     1, true,  6  },
+  { AArch64::ST3i32_POST,       "st3",  ".s",     1, true,  12 },
+  { AArch64::ST3i64_POST,       "st3",  ".d",     1, true,  24 },
+  { AArch64::ST3Threev16b,      "st3",  ".16b",   0, false, 0  },
+  { AArch64::ST3Threev8h,       "st3",  ".8h",    0, false, 0  },
+  { AArch64::ST3Threev4s,       "st3",  ".4s",    0, false, 0  },
+  { AArch64::ST3Threev2d,       "st3",  ".2d",    0, false, 0  },
+  { AArch64::ST3Threev8b,       "st3",  ".8b",    0, false, 0  },
+  { AArch64::ST3Threev4h,       "st3",  ".4h",    0, false, 0  },
+  { AArch64::ST3Threev2s,       "st3",  ".2s",    0, false, 0  },
+  { AArch64::ST3Threev16b_POST, "st3",  ".16b",   1, false, 48 },
+  { AArch64::ST3Threev8h_POST,  "st3",  ".8h",    1, false, 48 },
+  { AArch64::ST3Threev4s_POST,  "st3",  ".4s",    1, false, 48 },
+  { AArch64::ST3Threev2d_POST,  "st3",  ".2d",    1, false, 48 },
+  { AArch64::ST3Threev8b_POST,  "st3",  ".8b",    1, false, 24 },
+  { AArch64::ST3Threev4h_POST,  "st3",  ".4h",    1, false, 24 },
+  { AArch64::ST3Threev2s_POST,  "st3",  ".2s",    1, false, 24 },
+  { AArch64::ST4i8,             "st4",  ".b",     0, true,  0  },
+  { AArch64::ST4i16,            "st4",  ".h",     0, true,  0  },
+  { AArch64::ST4i32,            "st4",  ".s",     0, true,  0  },
+  { AArch64::ST4i64,            "st4",  ".d",     0, true,  0  },
+  { AArch64::ST4i8_POST,        "st4",  ".b",     1, true,  4  },
+  { AArch64::ST4i16_POST,       "st4",  ".h",     1, true,  8  },
+  { AArch64::ST4i32_POST,       "st4",  ".s",     1, true,  16 },
+  { AArch64::ST4i64_POST,       "st4",  ".d",     1, true,  32 },
+  { AArch64::ST4Fourv16b,       "st4",  ".16b",   0, false, 0  },
+  { AArch64::ST4Fourv8h,        "st4",  ".8h",    0, false, 0  },
+  { AArch64::ST4Fourv4s,        "st4",  ".4s",    0, false, 0  },
+  { AArch64::ST4Fourv2d,        "st4",  ".2d",    0, false, 0  },
+  { AArch64::ST4Fourv8b,        "st4",  ".8b",    0, false, 0  },
+  { AArch64::ST4Fourv4h,        "st4",  ".4h",    0, false, 0  },
+  { AArch64::ST4Fourv2s,        "st4",  ".2s",    0, false, 0  },
+  { AArch64::ST4Fourv16b_POST,  "st4",  ".16b",   1, false, 64 },
+  { AArch64::ST4Fourv8h_POST,   "st4",  ".8h",    1, false, 64 },
+  { AArch64::ST4Fourv4s_POST,   "st4",  ".4s",    1, false, 64 },
+  { AArch64::ST4Fourv2d_POST,   "st4",  ".2d",    1, false, 64 },
+  { AArch64::ST4Fourv8b_POST,   "st4",  ".8b",    1, false, 32 },
+  { AArch64::ST4Fourv4h_POST,   "st4",  ".4h",    1, false, 32 },
+  { AArch64::ST4Fourv2s_POST,   "st4",  ".2s",    1, false, 32 },
+};
+
+static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+  unsigned Idx;
+  for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
+    if (LdStNInstInfo[Idx].Opcode == Opcode)
+      return &LdStNInstInfo[Idx];
+
+  return nullptr;
+}
+
+void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                        StringRef Annot) {
+  unsigned Opcode = MI->getOpcode();
+  StringRef Layout, Mnemonic;
+
+  bool IsTbx;
+  if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) {
+    O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
+      << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", ";
+
+    unsigned ListOpNum = IsTbx ? 2 : 1;
+    printVectorList(MI, ListOpNum, O, "");
+
+    O << ", "
+      << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+    O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
+
+    // Now onto the operands: first a vector list with possible lane
+    // specifier. E.g. { v0 }[2]
+    int OpNum = LdStDesc->ListOperand;
+    printVectorList(MI, OpNum++, O, "");
+
+    if (LdStDesc->HasLane)
+      O << '[' << MI->getOperand(OpNum++).getImm() << ']';
+
+    // Next the address: [xN]
+    unsigned AddrReg = MI->getOperand(OpNum++).getReg();
+    O << ", [" << getRegisterName(AddrReg) << ']';
+
+    // Finally, there might be a post-indexed offset.
+    if (LdStDesc->NaturalOffset != 0) {
+      unsigned Reg = MI->getOperand(OpNum++).getReg();
+      if (Reg != AArch64::XZR)
+        O << ", " << getRegisterName(Reg);
+      else {
+        assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?");
+        O << ", #" << LdStDesc->NaturalOffset;
+      }
+    }
+
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  AArch64InstPrinter::printInst(MI, O, Annot);
+}
+
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
+#ifndef NDEBUG
+  unsigned Opcode = MI->getOpcode();
+  assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
+#endif
+
+  const char *Asm = nullptr;
+  const MCOperand &Op1 = MI->getOperand(0);
+  const MCOperand &Cn = MI->getOperand(1);
+  const MCOperand &Cm = MI->getOperand(2);
+  const MCOperand &Op2 = MI->getOperand(3);
+
+  unsigned Op1Val = Op1.getImm();
+  unsigned CnVal = Cn.getImm();
+  unsigned CmVal = Cm.getImm();
+  unsigned Op2Val = Op2.getImm();
+
+  if (CnVal == 7) {
+    switch (CmVal) {
+    default:
+      break;
+
+    // IC aliases
+    case 1:
+      if (Op1Val == 0 && Op2Val == 0)
+        Asm = "ic\tialluis";
+      break;
+    case 5:
+      if (Op1Val == 0 && Op2Val == 0)
+        Asm = "ic\tiallu";
+      else if (Op1Val == 3 && Op2Val == 1)
+        Asm = "ic\tivau";
+      break;
+
+    // DC aliases
+    case 4:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tzva";
+      break;
+    case 6:
+      if (Op1Val == 0 && Op2Val == 1)
+        Asm = "dc\tivac";
+      if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tisw";
+      break;
+    case 10:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcvac";
+      else if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tcsw";
+      break;
+    case 11:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcvau";
+      break;
+    case 14:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcivac";
+      else if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tcisw";
+      break;
+
+    // AT aliases
+    case 8:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e1r"; break;
+        case 1: Asm = "at\ts1e1w"; break;
+        case 2: Asm = "at\ts1e0r"; break;
+        case 3: Asm = "at\ts1e0w"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e2r"; break;
+        case 1: Asm = "at\ts1e2w"; break;
+        case 4: Asm = "at\ts12e1r"; break;
+        case 5: Asm = "at\ts12e1w"; break;
+        case 6: Asm = "at\ts12e0r"; break;
+        case 7: Asm = "at\ts12e0w"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e3r"; break;
+        case 1: Asm = "at\ts1e3w"; break;
+        }
+        break;
+      }
+      break;
+    }
+  } else if (CnVal == 8) {
+    // TLBI aliases
+    switch (CmVal) {
+    default:
+      break;
+    case 3:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\tvmalle1is"; break;
+        case 1: Asm = "tlbi\tvae1is"; break;
+        case 2: Asm = "tlbi\taside1is"; break;
+        case 3: Asm = "tlbi\tvaae1is"; break;
+        case 5: Asm = "tlbi\tvale1is"; break;
+        case 7: Asm = "tlbi\tvaale1is"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle2is"; break;
+        case 1: Asm = "tlbi\tvae2is"; break;
+        case 4: Asm = "tlbi\talle1is"; break;
+        case 5: Asm = "tlbi\tvale2is"; break;
+        case 6: Asm = "tlbi\tvmalls12e1is"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle3is"; break;
+        case 1: Asm = "tlbi\tvae3is"; break;
+        case 5: Asm = "tlbi\tvale3is"; break;
+        }
+        break;
+      }
+      break;
+    case 0:
+      switch (Op1Val) {
+      default:
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 1: Asm = "tlbi\tipas2e1is"; break;
+        case 5: Asm = "tlbi\tipas2le1is"; break;
+        }
+        break;
+      }
+      break;
+    case 4:
+      switch (Op1Val) {
+      default:
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 1: Asm = "tlbi\tipas2e1"; break;
+        case 5: Asm = "tlbi\tipas2le1"; break;
+        }
+        break;
+      }
+      break;
+    case 7:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\tvmalle1"; break;
+        case 1: Asm = "tlbi\tvae1"; break;
+        case 2: Asm = "tlbi\taside1"; break;
+        case 3: Asm = "tlbi\tvaae1"; break;
+        case 5: Asm = "tlbi\tvale1"; break;
+        case 7: Asm = "tlbi\tvaale1"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle2"; break;
+        case 1: Asm = "tlbi\tvae2"; break;
+        case 4: Asm = "tlbi\talle1"; break;
+        case 5: Asm = "tlbi\tvale2"; break;
+        case 6: Asm = "tlbi\tvmalls12e1"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle3"; break;
+        case 1: Asm = "tlbi\tvae3";  break;
+        case 5: Asm = "tlbi\tvale3"; break;
+        }
+        break;
+      }
+      break;
+    }
+  }
+
+  if (Asm) {
+    unsigned Reg = MI->getOperand(4).getReg();
+
+    O << '\t' << Asm;
+    if (StringRef(Asm).lower().find("all") == StringRef::npos)
+      O << ", " << getRegisterName(Reg);
+  }
+
+  return Asm != nullptr;
+}
+
+void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    O << getRegisterName(Reg);
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  O << format("#%#llx", Op.getImm());
+}
+
+void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                                             unsigned Imm, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Reg == AArch64::XZR)
+      O << "#" << Imm;
+    else
+      O << getRegisterName(Reg);
+  } else
+    llvm_unreachable("unknown operand kind in printPostIncOperand64");
+}
+
+void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isReg() && "Non-register vreg operand!");
+  unsigned Reg = Op.getReg();
+  O << getRegisterName(Reg, AArch64::vreg);
+}
+
+void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
+  O << "c" << Op.getImm();
+}
+
+void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    unsigned Val = (MO.getImm() & 0xfff);
+    assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+    unsigned Shift =
+        AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
+    O << '#' << Val;
+    if (Shift != 0)
+      printShifter(MI, OpNum + 1, O);
+
+    if (CommentStream)
+      *CommentStream << '=' << (Val << Shift) << '\n';
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    O << *MO.getExpr();
+    printShifter(MI, OpNum + 1, O);
+  }
+}
+
+void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32));
+}
+
+void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64));
+}
+
+void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  // LSL #0 should not be printed.
+  if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL &&
+      AArch64_AM::getShiftValue(Val) == 0)
+    return;
+  O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val))
+    << " #" << AArch64_AM::getShiftValue(Val);
+}
+
+void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printShifter(MI, OpNum + 1, O);
+}
+
+void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printArithExtend(MI, OpNum + 1, O);
+}
+
+void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val);
+  unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val);
+
+  // If the destination or first source register operand is [W]SP, print
+  // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+  // all.
+  if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) {
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src1 = MI->getOperand(1).getReg();
+    if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) &&
+          ExtType == AArch64_AM::UXTX) ||
+         ((Dest == AArch64::WSP || Src1 == AArch64::WSP) &&
+          ExtType == AArch64_AM::UXTW) ) {
+      if (ShiftVal != 0)
+        O << ", lsl #" << ShiftVal;
+      return;
+    }
+  }
+  O << ", " << AArch64_AM::getShiftExtendName(ExtType);
+  if (ShiftVal != 0)
+    O << " #" << ShiftVal;
+}
+
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O, char SrcRegKind,
+                                        unsigned Width) {
+  unsigned SignExtend = MI->getOperand(OpNum).getImm();
+  unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
+
+  // sxtw, sxtx, uxtw or lsl (== uxtx)
+  bool IsLSL = !SignExtend && SrcRegKind == 'x';
+  if (IsLSL)
+    O << "lsl";
+  else
+    O << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind;
+
+  if (DoShift || IsLSL)
+    O << " #" << Log2_32(Width / 8);
+}
+
+void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+  O << AArch64CC::getCondCodeName(CC);
+}
+
+void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+  O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC));
+}
+
+void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']';
+}
+
+template<int Scale>
+void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  O << '#' << Scale * MI->getOperand(OpNum).getImm();
+}
+
+void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
+                                           unsigned Scale, raw_ostream &O) {
+  const MCOperand MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "#" << (MO.getImm() * Scale);
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    O << *MO.getExpr();
+  }
+}
+
+void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+                                          unsigned Scale, raw_ostream &O) {
+  const MCOperand MO1 = MI->getOperand(OpNum + 1);
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
+  if (MO1.isImm()) {
+      O << ", #" << (MO1.getImm() * Scale);
+  } else {
+    assert(MO1.isExpr() && "Unexpected operand type!");
+    O << ", " << *MO1.getExpr();
+  }
+  O << ']';
+}
+
+void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned prfop = MI->getOperand(OpNum).getImm();
+  bool Valid;
+  StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid);
+  if (Valid)
+    O << Name;
+  else
+    O << '#' << prfop;
+}
+
+void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  float FPImm =
+      MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm());
+
+  // 8 decimal places are enough to perfectly represent permitted floats.
+  O << format("#%.8f", FPImm);
+}
+
+static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
+  while (Stride--) {
+    switch (Reg) {
+    default:
+      llvm_unreachable("Vector register expected!");
+    case AArch64::Q0:  Reg = AArch64::Q1;  break;
+    case AArch64::Q1:  Reg = AArch64::Q2;  break;
+    case AArch64::Q2:  Reg = AArch64::Q3;  break;
+    case AArch64::Q3:  Reg = AArch64::Q4;  break;
+    case AArch64::Q4:  Reg = AArch64::Q5;  break;
+    case AArch64::Q5:  Reg = AArch64::Q6;  break;
+    case AArch64::Q6:  Reg = AArch64::Q7;  break;
+    case AArch64::Q7:  Reg = AArch64::Q8;  break;
+    case AArch64::Q8:  Reg = AArch64::Q9;  break;
+    case AArch64::Q9:  Reg = AArch64::Q10; break;
+    case AArch64::Q10: Reg = AArch64::Q11; break;
+    case AArch64::Q11: Reg = AArch64::Q12; break;
+    case AArch64::Q12: Reg = AArch64::Q13; break;
+    case AArch64::Q13: Reg = AArch64::Q14; break;
+    case AArch64::Q14: Reg = AArch64::Q15; break;
+    case AArch64::Q15: Reg = AArch64::Q16; break;
+    case AArch64::Q16: Reg = AArch64::Q17; break;
+    case AArch64::Q17: Reg = AArch64::Q18; break;
+    case AArch64::Q18: Reg = AArch64::Q19; break;
+    case AArch64::Q19: Reg = AArch64::Q20; break;
+    case AArch64::Q20: Reg = AArch64::Q21; break;
+    case AArch64::Q21: Reg = AArch64::Q22; break;
+    case AArch64::Q22: Reg = AArch64::Q23; break;
+    case AArch64::Q23: Reg = AArch64::Q24; break;
+    case AArch64::Q24: Reg = AArch64::Q25; break;
+    case AArch64::Q25: Reg = AArch64::Q26; break;
+    case AArch64::Q26: Reg = AArch64::Q27; break;
+    case AArch64::Q27: Reg = AArch64::Q28; break;
+    case AArch64::Q28: Reg = AArch64::Q29; break;
+    case AArch64::Q29: Reg = AArch64::Q30; break;
+    case AArch64::Q30: Reg = AArch64::Q31; break;
+    // Vector lists can wrap around.
+    case AArch64::Q31:
+      Reg = AArch64::Q0;
+      break;
+    }
+  }
+  return Reg;
+}
+
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O,
+                                         StringRef LayoutSuffix) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+
+  O << "{ ";
+
+  // Work out how many registers there are in the list (if there is an actual
+  // list).
+  unsigned NumRegs = 1;
+  if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+      MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
+    NumRegs = 2;
+  else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
+    NumRegs = 3;
+  else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
+    NumRegs = 4;
+
+  // Now forget about the list and find out what the first register is.
+  if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0))
+    Reg = FirstReg;
+  else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
+    Reg = FirstReg;
+
+  // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+  // printing (otherwise getRegisterName fails).
+  if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+    const MCRegisterClass &FPR128RC =
+        MRI.getRegClass(AArch64::FPR128RegClassID);
+    Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+  }
+
+  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+    O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+    if (i + 1 != NumRegs)
+      O << ", ";
+  }
+
+  O << " }";
+}
+
+void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  printVectorList(MI, OpNum, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  std::string Suffix(".");
+  if (NumLanes)
+    Suffix += itostr(NumLanes) + LaneKind;
+  else
+    Suffix += LaneKind;
+
+  printVectorList(MI, OpNum, O, Suffix);
+}
+
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 2);
+    return;
+  }
+
+  // If the branch target is simply an address then print it in hex.
+  const MCConstantExpr *BranchTarget =
+      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+  int64_t Address;
+  if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+    O << "0x";
+    O.write_hex(Address);
+  } else {
+    // Otherwise, just print the expression.
+    O << *MI->getOperand(OpNum).getExpr();
+  }
+}
+
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 12);
+    return;
+  }
+
+  // Otherwise, just print the expression.
+  O << *MI->getOperand(OpNum).getExpr();
+}
+
+void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  unsigned Opcode = MI->getOpcode();
+
+  bool Valid;
+  StringRef Name;
+  if (Opcode == AArch64::ISB)
+    Name = AArch64ISB::ISBMapper().toString(Val, Valid);
+  else
+    Name = AArch64DB::DBarrierMapper().toString(Val, Valid);
+  if (Valid)
+    O << Name;
+  else
+    O << "#" << Val;
+}
+
+void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  bool Valid;
+  auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures());
+  std::string Name = Mapper.toString(Val, Valid);
+
+  if (Valid)
+    O << StringRef(Name).upper();
+}
+
+void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  bool Valid;
+  auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures());
+  std::string Name = Mapper.toString(Val, Valid);
+
+  if (Valid)
+    O << StringRef(Name).upper();
+}
+
+void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  bool Valid;
+  StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid);
+  if (Valid)
+    O << StringRef(Name.str()).upper();
+  else
+    O << "#" << Val;
+}
+
+void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned RawVal = MI->getOperand(OpNo).getImm();
+  uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal);
+  O << format("#%#016llx", Val);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
new file mode 100644
index 0000000..fe7666e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -0,0 +1,140 @@
+//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64INSTPRINTER_H
+#define AArch64INSTPRINTER_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class AArch64InstPrinter : public MCInstPrinter {
+public:
+  AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+  // Autogenerated by tblgen.
+  virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx, raw_ostream &O);
+  virtual StringRef getRegName(unsigned RegNo) const {
+    return getRegisterName(RegNo);
+  }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+
+protected:
+  bool printSysAlias(const MCInst *MI, raw_ostream &O);
+  // Operand printers
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+                           raw_ostream &O);
+  template<int Amount>
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printPostIncOperand(MI, OpNo, Amount, O);
+  }
+
+  void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                      char SrcRegKind, unsigned Width);
+  template <char SrcRegKind, unsigned Width>
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printMemExtend(MI, OpNum, O, SrcRegKind, Width);
+  }
+
+  void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                         raw_ostream &O);
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                        raw_ostream &O);
+
+  template<int Scale>
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printUImm12Offset(MI, OpNum, Scale, O);
+  }
+
+  template<int BitWidth>
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
+  }
+
+  void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template<int Scale>
+  void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                       StringRef LayoutSuffix);
+
+  /// Print a list of vector registers where the type suffix is implicit
+  /// (i.e. attached to the instruction rather than the registers).
+  void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O);
+
+  template <unsigned NumLanes, char LaneKind>
+  void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+};
+
+class AArch64AppleInstPrinter : public AArch64InstPrinter {
+public:
+  AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                        const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+
+  void printInstruction(const MCInst *MI, raw_ostream &O) override;
+  bool printAliasInstr(const MCInst *MI, raw_ostream &O) override;
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx, raw_ostream &O);
+  StringRef getRegName(unsigned RegNo) const override {
+    return getRegisterName(RegNo);
+  }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
new file mode 100644
index 0000000..8b1e44e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -0,0 +1,738 @@
+//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// AArch64_AM - AArch64 Addressing Mode Stuff
+namespace AArch64_AM {
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//
+
+enum ShiftExtendType {
+  InvalidShiftExtend = -1,
+  LSL = 0,
+  LSR,
+  ASR,
+  ROR,
+  MSL,
+
+  UXTB,
+  UXTH,
+  UXTW,
+  UXTX,
+
+  SXTB,
+  SXTH,
+  SXTW,
+  SXTX,
+};
+
+/// getShiftName - Get the string encoding for the shift type.
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
+  switch (ST) {
+  default: assert(false && "unhandled shift type!");
+  case AArch64_AM::LSL: return "lsl";
+  case AArch64_AM::LSR: return "lsr";
+  case AArch64_AM::ASR: return "asr";
+  case AArch64_AM::ROR: return "ror";
+  case AArch64_AM::MSL: return "msl";
+  case AArch64_AM::UXTB: return "uxtb";
+  case AArch64_AM::UXTH: return "uxth";
+  case AArch64_AM::UXTW: return "uxtw";
+  case AArch64_AM::UXTX: return "uxtx";
+  case AArch64_AM::SXTB: return "sxtb";
+  case AArch64_AM::SXTH: return "sxth";
+  case AArch64_AM::SXTW: return "sxtw";
+  case AArch64_AM::SXTX: return "sxtx";
+  }
+  return nullptr;
+}
+
+/// getShiftType - Extract the shift type.
+static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) {
+  switch ((Imm >> 6) & 0x7) {
+  default: return AArch64_AM::InvalidShiftExtend;
+  case 0: return AArch64_AM::LSL;
+  case 1: return AArch64_AM::LSR;
+  case 2: return AArch64_AM::ASR;
+  case 3: return AArch64_AM::ROR;
+  case 4: return AArch64_AM::MSL;
+  }
+}
+
+/// getShiftValue - Extract the shift value.
+static inline unsigned getShiftValue(unsigned Imm) {
+  return Imm & 0x3f;
+}
+
+/// getShifterImm - Encode the shift type and amount:
+///   imm:     6-bit shift amount
+///   shifter: 000 ==> lsl
+///            001 ==> lsr
+///            010 ==> asr
+///            011 ==> ror
+///            100 ==> msl
+///   {8-6}  = shifter
+///   {5-0}  = imm
+static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
+                                     unsigned Imm) {
+  assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
+  unsigned STEnc = 0;
+  switch (ST) {
+  default:  llvm_unreachable("Invalid shift requested");
+  case AArch64_AM::LSL: STEnc = 0; break;
+  case AArch64_AM::LSR: STEnc = 1; break;
+  case AArch64_AM::ASR: STEnc = 2; break;
+  case AArch64_AM::ROR: STEnc = 3; break;
+  case AArch64_AM::MSL: STEnc = 4; break;
+  }
+  return (STEnc << 6) | (Imm & 0x3f);
+}
+
+//===----------------------------------------------------------------------===//
+// Extends
+//
+
+/// getArithShiftValue - get the arithmetic shift value.
+static inline unsigned getArithShiftValue(unsigned Imm) {
+  return Imm & 0x7;
+}
+
+/// getExtendType - Extract the extend type for operands of arithmetic ops.
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) {
+  assert((Imm & 0x7) == Imm && "invalid immediate!");
+  switch (Imm) {
+  default: llvm_unreachable("Compiler bug!");
+  case 0: return AArch64_AM::UXTB;
+  case 1: return AArch64_AM::UXTH;
+  case 2: return AArch64_AM::UXTW;
+  case 3: return AArch64_AM::UXTX;
+  case 4: return AArch64_AM::SXTB;
+  case 5: return AArch64_AM::SXTH;
+  case 6: return AArch64_AM::SXTW;
+  case 7: return AArch64_AM::SXTX;
+  }
+}
+
+static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) {
+  return getExtendType((Imm >> 3) & 0x7);
+}
+
+/// Mapping from extend bits to required operation:
+///   shifter: 000 ==> uxtb
+///            001 ==> uxth
+///            010 ==> uxtw
+///            011 ==> uxtx
+///            100 ==> sxtb
+///            101 ==> sxth
+///            110 ==> sxtw
+///            111 ==> sxtx
+inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) {
+  switch (ET) {
+  default: llvm_unreachable("Invalid extend type requested");
+  case AArch64_AM::UXTB: return 0; break;
+  case AArch64_AM::UXTH: return 1; break;
+  case AArch64_AM::UXTW: return 2; break;
+  case AArch64_AM::UXTX: return 3; break;
+  case AArch64_AM::SXTB: return 4; break;
+  case AArch64_AM::SXTH: return 5; break;
+  case AArch64_AM::SXTW: return 6; break;
+  case AArch64_AM::SXTX: return 7; break;
+  }
+}
+
+/// getArithExtendImm - Encode the extend type and shift amount for an
+///                     arithmetic instruction:
+///   imm:     3-bit extend amount
+///   {5-3}  = shifter
+///   {2-0}  = imm3
+static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET,
+                                         unsigned Imm) {
+  assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
+  return (getExtendEncoding(ET) << 3) | (Imm & 0x7);
+}
+
+/// getMemDoShift - Extract the "do shift" flag value for load/store
+/// instructions.
+static inline bool getMemDoShift(unsigned Imm) {
+  return (Imm & 0x1) != 0;
+}
+
+/// getExtendType - Extract the extend type for the offset operand of
+/// loads/stores.
+static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) {
+  return getExtendType((Imm >> 1) & 0x7);
+}
+
+/// getExtendImm - Encode the extend type and amount for a load/store inst:
+///   doshift:     should the offset be scaled by the access size
+///   shifter: 000 ==> uxtb
+///            001 ==> uxth
+///            010 ==> uxtw
+///            011 ==> uxtx
+///            100 ==> sxtb
+///            101 ==> sxth
+///            110 ==> sxtw
+///            111 ==> sxtx
+///   {3-1}  = shifter
+///   {0}  = doshift
+static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET,
+                                       bool DoShift) {
+  return (getExtendEncoding(ET) << 1) | unsigned(DoShift);
+}
+
+static inline uint64_t ror(uint64_t elt, unsigned size) {
+  return ((elt & 1) << (size-1)) | (elt >> 1);
+}
+
+/// processLogicalImmediate - Determine if an immediate value can be encoded
+/// as the immediate operand of a logical instruction for the given register
+/// size.  If so, return true with "encoding" set to the encoded value in
+/// the form N:immr:imms.
+static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
+                                           uint64_t &encoding) {
+  if (imm == 0ULL || imm == ~0ULL ||
+      (regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
+    return false;
+
+  unsigned size = 2;
+  uint64_t eltVal = imm;
+
+  // First, determine the element size.
+  while (size < regSize) {
+    unsigned numElts = regSize / size;
+    unsigned mask = (1ULL << size) - 1;
+    uint64_t lowestEltVal = imm & mask;
+
+    bool allMatched = true;
+    for (unsigned i = 1; i < numElts; ++i) {
+     uint64_t currEltVal = (imm >> (i*size)) & mask;
+      if (currEltVal != lowestEltVal) {
+        allMatched = false;
+        break;
+      }
+    }
+
+    if (allMatched) {
+      eltVal = lowestEltVal;
+      break;
+    }
+
+    size *= 2;
+  }
+
+  // Second, determine the rotation to make the element be: 0^m 1^n.
+  for (unsigned i = 0; i < size; ++i) {
+    eltVal = ror(eltVal, size);
+    uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
+    uint32_t cto = CountTrailingOnes_64(eltVal);
+
+    if (clz + cto == size) {
+      // Encode in immr the number of RORs it would take to get *from* this
+      // element value to our target value, where i+1 is the number of RORs
+      // to go the opposite direction.
+      unsigned immr = size - (i + 1);
+
+      // If size has a 1 in the n'th bit, create a value that has zeroes in
+      // bits [0, n] and ones above that.
+      uint64_t nimms = ~(size-1) << 1;
+
+      // Or the CTO value into the low bits, which must be below the Nth bit
+      // bit mentioned above.
+      nimms |= (cto-1);
+
+      // Extract the seventh bit and toggle it to create the N field.
+      unsigned N = ((nimms >> 6) & 1) ^ 1;
+
+      encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// isLogicalImmediate - Return true if the immediate is valid for a logical
+/// immediate instruction of the given register size. Return false otherwise.
+static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) {
+  uint64_t encoding;
+  return processLogicalImmediate(imm, regSize, encoding);
+}
+
+/// encodeLogicalImmediate - Return the encoded immediate value for a logical
+/// immediate instruction of the given register size.
+static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) {
+  uint64_t encoding = 0;
+  bool res = processLogicalImmediate(imm, regSize, encoding);
+  assert(res && "invalid logical immediate");
+  (void)res;
+  return encoding;
+}
+
+/// decodeLogicalImmediate - Decode a logical immediate value in the form
+/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the
+/// integer value it represents with regSize bits.
+static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) {
+  // Extract the N, imms, and immr fields.
+  unsigned N = (val >> 12) & 1;
+  unsigned immr = (val >> 6) & 0x3f;
+  unsigned imms = val & 0x3f;
+
+  assert((regSize == 64 || N == 0) && "undefined logical immediate encoding");
+  int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+  assert(len >= 0 && "undefined logical immediate encoding");
+  unsigned size = (1 << len);
+  unsigned R = immr & (size - 1);
+  unsigned S = imms & (size - 1);
+  assert(S != size - 1 && "undefined logical immediate encoding");
+  uint64_t pattern = (1ULL << (S + 1)) - 1;
+  for (unsigned i = 0; i < R; ++i)
+    pattern = ror(pattern, size);
+
+  // Replicate the pattern to fill the regSize.
+  while (size != regSize) {
+    pattern |= (pattern << size);
+    size *= 2;
+  }
+  return pattern;
+}
+
+/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value
+/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits)
+/// is a valid encoding for an integer value with regSize bits.
+static inline bool isValidDecodeLogicalImmediate(uint64_t val,
+                                                 unsigned regSize) {
+  // Extract the N and imms fields needed for checking.
+  unsigned N = (val >> 12) & 1;
+  unsigned imms = val & 0x3f;
+
+  if (regSize == 32 && N != 0) // undefined logical immediate encoding
+    return false;
+  int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+  if (len < 0) // undefined logical immediate encoding
+    return false;
+  unsigned size = (1 << len);
+  unsigned S = imms & (size - 1);
+  if (S == size - 1) // undefined logical immediate encoding
+    return false;
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point Immediates
+//
+static inline float getFPImmFloat(unsigned Imm) {
+  // We expect an 8-bit binary encoding of a floating-point number here.
+  union {
+    uint32_t I;
+    float F;
+  } FPUnion;
+
+  uint8_t Sign = (Imm >> 7) & 0x1;
+  uint8_t Exp = (Imm >> 4) & 0x7;
+  uint8_t Mantissa = Imm & 0xf;
+
+  //   8-bit FP    iEEEE Float Encoding
+  //   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+  //
+  // where B = NOT(b);
+
+  FPUnion.I = 0;
+  FPUnion.I |= Sign << 31;
+  FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+  FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+  FPUnion.I |= (Exp & 0x3) << 23;
+  FPUnion.I |= Mantissa << 19;
+  return FPUnion.F;
+}
+
+/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP32Imm(const APInt &Imm) {
+  uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+  int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
+  int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
+
+  // We can handle 4 bits of mantissa.
+  // mantissa = (16+UInt(e:f:g:h))/16.
+  if (Mantissa & 0x7ffff)
+    return -1;
+  Mantissa >>= 19;
+  if ((Mantissa & 0xf) != Mantissa)
+    return -1;
+
+  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+  if (Exp < -3 || Exp > 4)
+    return -1;
+  Exp = ((Exp+3) & 0x7) ^ 4;
+
+  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP32Imm(const APFloat &FPImm) {
+  return getFP32Imm(FPImm.bitcastToAPInt());
+}
+
+/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP64Imm(const APInt &Imm) {
+  uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+  int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023;   // -1022 to 1023
+  uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+  // We can handle 4 bits of mantissa.
+  // mantissa = (16+UInt(e:f:g:h))/16.
+  if (Mantissa & 0xffffffffffffULL)
+    return -1;
+  Mantissa >>= 48;
+  if ((Mantissa & 0xf) != Mantissa)
+    return -1;
+
+  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+  if (Exp < -3 || Exp > 4)
+    return -1;
+  Exp = ((Exp+3) & 0x7) ^ 4;
+
+  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP64Imm(const APFloat &FPImm) {
+  return getFP64Imm(FPImm.bitcastToAPInt());
+}
+
+//===--------------------------------------------------------------------===//
+// AdvSIMD Modified Immediates
+//===--------------------------------------------------------------------===//
+
+// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType1(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xffffff00ffffff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) {
+  return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 32) | EncVal;
+}
+
+// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType2(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xffff00ffffff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) {
+  return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 40) | (EncVal << 8);
+}
+
+// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00
+static inline bool isAdvSIMDModImmType3(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xff00ffffff00ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) {
+  return (Imm & 0xff0000ULL) >> 16;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 48) | (EncVal << 16);
+}
+
+// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType4(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0x00ffffff00ffffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) {
+  return (Imm & 0xff000000ULL) >> 24;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 56) | (EncVal << 24);
+}
+
+// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType5(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) &&
+         ((Imm & 0xff00ff00ff00ff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) {
+  return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal;
+}
+
+// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType6(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) &&
+         ((Imm & 0x00ff00ff00ff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) {
+  return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8);
+}
+
+// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF
+static inline bool isAdvSIMDModImmType7(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) {
+  return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL;
+}
+
+// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF
+static inline bool isAdvSIMDModImmType8(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL;
+}
+
+static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) {
+  return (Imm & 0x00ff0000ULL) >> 16;
+}
+
+// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
+static inline bool isAdvSIMDModImmType9(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm >> 48) == (Imm & 0x0000ffffULL)) &&
+         ((Imm >> 56) == (Imm & 0x000000ffULL));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) {
+  return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  EncVal |= (EncVal << 8);
+  EncVal |= (EncVal << 16);
+  EncVal |= (EncVal << 32);
+  return EncVal;
+}
+
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// cmode: 1110, op: 1
+static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
+  uint64_t ByteA = Imm & 0xff00000000000000ULL;
+  uint64_t ByteB = Imm & 0x00ff000000000000ULL;
+  uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
+  uint64_t ByteD = Imm & 0x000000ff00000000ULL;
+  uint64_t ByteE = Imm & 0x00000000ff000000ULL;
+  uint64_t ByteF = Imm & 0x0000000000ff0000ULL;
+  uint64_t ByteG = Imm & 0x000000000000ff00ULL;
+  uint64_t ByteH = Imm & 0x00000000000000ffULL;
+
+  return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) &&
+         (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) &&
+         (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) &&
+         (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) &&
+         (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) &&
+         (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
+         (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
+         (ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) {
+  uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0;
+  uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0;
+  uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0;
+  uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0;
+  uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0;
+  uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0;
+  uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0;
+  uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0;
+
+  uint8_t EncVal = BitA;
+  EncVal <<= 1;
+  EncVal |= BitB;
+  EncVal <<= 1;
+  EncVal |= BitC;
+  EncVal <<= 1;
+  EncVal |= BitD;
+  EncVal <<= 1;
+  EncVal |= BitE;
+  EncVal <<= 1;
+  EncVal |= BitF;
+  EncVal <<= 1;
+  EncVal |= BitG;
+  EncVal <<= 1;
+  EncVal |= BitH;
+  return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) {
+  uint64_t EncVal = 0;
+  if (Imm & 0x80) EncVal |= 0xff00000000000000ULL;
+  if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL;
+  if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL;
+  if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL;
+  if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL;
+  if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL;
+  if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL;
+  if (Imm & 0x01) EncVal |= 0x00000000000000ffULL;
+  return EncVal;
+}
+
+// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00
+static inline bool isAdvSIMDModImmType11(uint64_t Imm) {
+  uint64_t BString = (Imm & 0x7E000000ULL) >> 25;
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         (BString == 0x1f || BString == 0x20) &&
+         ((Imm & 0x0007ffff0007ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) {
+  uint8_t BitA = (Imm & 0x80000000ULL) != 0;
+  uint8_t BitB = (Imm & 0x20000000ULL) != 0;
+  uint8_t BitC = (Imm & 0x01000000ULL) != 0;
+  uint8_t BitD = (Imm & 0x00800000ULL) != 0;
+  uint8_t BitE = (Imm & 0x00400000ULL) != 0;
+  uint8_t BitF = (Imm & 0x00200000ULL) != 0;
+  uint8_t BitG = (Imm & 0x00100000ULL) != 0;
+  uint8_t BitH = (Imm & 0x00080000ULL) != 0;
+
+  uint8_t EncVal = BitA;
+  EncVal <<= 1;
+  EncVal |= BitB;
+  EncVal <<= 1;
+  EncVal |= BitC;
+  EncVal <<= 1;
+  EncVal |= BitD;
+  EncVal <<= 1;
+  EncVal |= BitE;
+  EncVal <<= 1;
+  EncVal |= BitF;
+  EncVal <<= 1;
+  EncVal |= BitG;
+  EncVal <<= 1;
+  EncVal |= BitH;
+  return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) {
+  uint64_t EncVal = 0;
+  if (Imm & 0x80) EncVal |= 0x80000000ULL;
+  if (Imm & 0x40) EncVal |= 0x3e000000ULL;
+  else            EncVal |= 0x40000000ULL;
+  if (Imm & 0x20) EncVal |= 0x01000000ULL;
+  if (Imm & 0x10) EncVal |= 0x00800000ULL;
+  if (Imm & 0x08) EncVal |= 0x00400000ULL;
+  if (Imm & 0x04) EncVal |= 0x00200000ULL;
+  if (Imm & 0x02) EncVal |= 0x00100000ULL;
+  if (Imm & 0x01) EncVal |= 0x00080000ULL;
+  return (EncVal << 32) | EncVal;
+}
+
+// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType12(uint64_t Imm) {
+  uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54;
+  return ((BString == 0xff || BString == 0x100) &&
+         ((Imm & 0x0000ffffffffffffULL) == 0));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) {
+  uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0;
+  uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0;
+  uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0;
+  uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0;
+  uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0;
+  uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0;
+  uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0;
+  uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0;
+
+  uint8_t EncVal = BitA;
+  EncVal <<= 1;
+  EncVal |= BitB;
+  EncVal <<= 1;
+  EncVal |= BitC;
+  EncVal <<= 1;
+  EncVal |= BitD;
+  EncVal <<= 1;
+  EncVal |= BitE;
+  EncVal <<= 1;
+  EncVal |= BitF;
+  EncVal <<= 1;
+  EncVal |= BitG;
+  EncVal <<= 1;
+  EncVal |= BitH;
+  return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
+  uint64_t EncVal = 0;
+  if (Imm & 0x80) EncVal |= 0x8000000000000000ULL;
+  if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL;
+  else            EncVal |= 0x4000000000000000ULL;
+  if (Imm & 0x20) EncVal |= 0x0020000000000000ULL;
+  if (Imm & 0x10) EncVal |= 0x0010000000000000ULL;
+  if (Imm & 0x08) EncVal |= 0x0008000000000000ULL;
+  if (Imm & 0x04) EncVal |= 0x0004000000000000ULL;
+  if (Imm & 0x02) EncVal |= 0x0002000000000000ULL;
+  if (Imm & 0x01) EncVal |= 0x0001000000000000ULL;
+  return (EncVal << 32) | EncVal;
+}
+
+} // end namespace AArch64_AM
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
new file mode 100644
index 0000000..a917616
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -0,0 +1,566 @@
+//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+
+class AArch64AsmBackend : public MCAsmBackend {
+  static const unsigned PCRelFlagVal =
+      MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+
+public:
+  AArch64AsmBackend(const Target &T) : MCAsmBackend() {}
+
+  unsigned getNumFixupKinds() const override {
+    return AArch64::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // AArch64FixupKinds.h.
+      //
+      // Name                           Offset (bits) Size (bits)     Flags
+      { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
+      { "fixup_aarch64_add_imm12", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
+      { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
+      { "fixup_aarch64_movw", 5, 16, 0 },
+      { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
+      { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
+
+  unsigned getPointerSize() const { return 8; }
+};
+
+} // end anonymous namespace
+
+/// \brief The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case AArch64::fixup_aarch64_tlsdesc_call:
+    return 0;
+
+  case FK_Data_1:
+    return 1;
+
+  case FK_Data_2:
+  case AArch64::fixup_aarch64_movw:
+    return 2;
+
+  case AArch64::fixup_aarch64_pcrel_branch14:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    return 3;
+
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+  case FK_Data_4:
+    return 4;
+
+  case FK_Data_8:
+    return 8;
+  }
+}
+
+static unsigned AdrImmBits(unsigned Value) {
+  unsigned lo2 = Value & 0x3;
+  unsigned hi19 = (Value & 0x1ffffc) >> 2;
+  return (hi19 << 5) | (lo2 << 29);
+}
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  int64_t SignedValue = static_cast<int64_t>(Value);
+  switch (Kind) {
+  default:
+    assert(false && "Unknown fixup kind!");
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    return AdrImmBits(Value & 0x1fffffULL);
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    // Signed 21-bit immediate
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded.
+    return (Value >> 2) & 0x7ffff;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+    // Unsigned 12-bit immediate
+    if (Value >= 0x1000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value;
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+    // Unsigned 12-bit immediate which gets multiplied by 2
+    if (Value & 1 || Value >= 0x2000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 1;
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+    // Unsigned 12-bit immediate which gets multiplied by 4
+    if (Value & 3 || Value >= 0x4000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 2;
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+    // Unsigned 12-bit immediate which gets multiplied by 8
+    if (Value & 7 || Value >= 0x8000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 3;
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    // Unsigned 12-bit immediate which gets multiplied by 16
+    if (Value & 15 || Value >= 0x10000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 4;
+  case AArch64::fixup_aarch64_movw:
+    report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+    return Value;
+  case AArch64::fixup_aarch64_pcrel_branch14:
+    // Signed 16-bit immediate
+    if (SignedValue > 32767 || SignedValue < -32768)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
+    if (Value & 0x3)
+      report_fatal_error("fixup not sufficiently aligned");
+    return (Value >> 2) & 0x3fff;
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    // Signed 28-bit immediate
+    if (SignedValue > 134217727 || SignedValue < -134217728)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
+    if (Value & 0x3)
+      report_fatal_error("fixup not sufficiently aligned");
+    return (Value >> 2) & 0x3ffffff;
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+    return Value;
+  }
+}
+
+void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                   unsigned DataSize, uint64_t Value,
+                                   bool IsPCRel) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  if (!Value)
+    return; // Doesn't change encoding.
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+  // Apply any target-specific value adjustments.
+  Value = adjustFixupValue(Fixup.getKind(), Value);
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  return false;
+}
+
+bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                             uint64_t Value,
+                                             const MCRelaxableFragment *DF,
+                                             const MCAsmLayout &Layout) const {
+  // FIXME:  This isn't correct for AArch64. Just moving the "generic" logic
+  // into the targets for now.
+  //
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+                                         MCInst &Res) const {
+  assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented");
+}
+
+bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // If the count is not 4-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  if ((Count & 3) != 0) {
+    for (uint64_t i = 0, e = (Count & 3); i != e; ++i)
+      OW->Write8(0);
+  }
+
+  // We are properly aligned, so write NOPs as requested.
+  Count /= 4;
+  for (uint64_t i = 0; i != Count; ++i)
+    OW->Write32(0xd503201f);
+  return true;
+}
+
+namespace {
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+  /// \brief A "frameless" leaf function, where no non-volatile registers are
+  /// saved. The return remains in LR throughout the function.
+  UNWIND_AArch64_MODE_FRAMELESS = 0x02000000,
+
+  /// \brief No compact unwind encoding available. Instead the low 23-bits of
+  /// the compact unwind encoding is the offset of the DWARF FDE in the
+  /// __eh_frame section. This mode is never used in object files. It is only
+  /// generated by the linker in final linked images, which have only DWARF info
+  /// for a function.
+  UNWIND_AArch64_MODE_DWARF = 0x03000000,
+
+  /// \brief This is a standard arm64 prologue where FP/LR are immediately
+  /// pushed on the stack, then SP is copied to FP. If there are any
+  /// non-volatile register saved, they are copied into the stack fame in pairs
+  /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the
+  /// five X pairs and four D pairs can be saved, but the memory layout must be
+  /// in register number order.
+  UNWIND_AArch64_MODE_FRAME = 0x04000000,
+
+  /// \brief Frame register pair encodings.
+  UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001,
+  UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002,
+  UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004,
+  UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008,
+  UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010,
+  UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100,
+  UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200,
+  UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400,
+  UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800
+};
+
+} // end CU namespace
+
+// FIXME: This should be in a separate file.
+class DarwinAArch64AsmBackend : public AArch64AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Encode compact unwind stack adjustment for frameless functions.
+  /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+  /// The stack size always needs to be 16 byte aligned.
+  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+    return (StackSize / 16) << 12;
+  }
+
+public:
+  DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+      : AArch64AsmBackend(T), MRI(MRI) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+                                         MachO::CPU_SUBTYPE_ARM64_ALL);
+  }
+
+  bool doesSectionRequireSymbols(const MCSection &Section) const override {
+    // Any section for which the linker breaks things into atoms needs to
+    // preserve symbols, including assembler local symbols, to identify
+    // those atoms. These sections are:
+    // Sections of type:
+    //
+    //    S_CSTRING_LITERALS  (e.g. __cstring)
+    //    S_LITERAL_POINTERS  (e.g.  objc selector pointers)
+    //    S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
+    //
+    // Sections named:
+    //
+    //    __TEXT,__eh_frame
+    //    __TEXT,__ustring
+    //    __DATA,__cfstring
+    //    __DATA,__objc_classrefs
+    //    __DATA,__objc_catlist
+    //
+    // FIXME: It would be better if the compiler used actual linker local
+    // symbols for each of these sections rather than preserving what
+    // are ostensibly assembler local symbols.
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
+    return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
+            SMO.getType() == MachO::S_4BYTE_LITERALS ||
+            SMO.getType() == MachO::S_8BYTE_LITERALS ||
+            SMO.getType() == MachO::S_16BYTE_LITERALS ||
+            SMO.getType() == MachO::S_LITERAL_POINTERS ||
+            (SMO.getSegmentName() == "__TEXT" &&
+             (SMO.getSectionName() == "__eh_frame" ||
+              SMO.getSectionName() == "__ustring")) ||
+            (SMO.getSegmentName() == "__DATA" &&
+             (SMO.getSectionName() == "__cfstring" ||
+              SMO.getSectionName() == "__objc_classrefs" ||
+              SMO.getSectionName() == "__objc_catlist")));
+  }
+
+  /// \brief Generate the compact unwind encoding from the CFI directives.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    if (Instrs.empty())
+      return CU::UNWIND_AArch64_MODE_FRAMELESS;
+
+    bool HasFP = false;
+    unsigned StackSize = 0;
+
+    uint32_t CompactUnwindEncoding = 0;
+    for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Cannot handle this directive:  bail out.
+        return CU::UNWIND_AArch64_MODE_DWARF;
+      case MCCFIInstruction::OpDefCfa: {
+        // Defines a frame pointer.
+        assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
+                   AArch64::FP &&
+               "Invalid frame pointer!");
+        assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+
+        const MCCFIInstruction &LRPush = Instrs[++i];
+        assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
+               "Link register not pushed!");
+        const MCCFIInstruction &FPPush = Instrs[++i];
+        assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
+               "Frame pointer not pushed!");
+
+        unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
+        unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+
+        LRReg = getXRegFromWReg(LRReg);
+        FPReg = getXRegFromWReg(FPReg);
+
+        assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
+               "Pushing invalid registers for frame!");
+
+        // Indicate that the function has a frame.
+        CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME;
+        HasFP = true;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        assert(StackSize == 0 && "We already have the CFA offset!");
+        StackSize = std::abs(Inst.getOffset());
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Registers are saved in pairs. We expect there to be two consecutive
+        // `.cfi_offset' instructions with the appropriate registers specified.
+        unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        if (i + 1 == e)
+          return CU::UNWIND_AArch64_MODE_DWARF;
+
+        const MCCFIInstruction &Inst2 = Instrs[++i];
+        if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
+          return CU::UNWIND_AArch64_MODE_DWARF;
+        unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+
+        // N.B. The encodings must be in register number order, and the X
+        // registers before the D registers.
+
+        // X19/X20 pair = 0x00000001,
+        // X21/X22 pair = 0x00000002,
+        // X23/X24 pair = 0x00000004,
+        // X25/X26 pair = 0x00000008,
+        // X27/X28 pair = 0x00000010
+        Reg1 = getXRegFromWReg(Reg1);
+        Reg2 = getXRegFromWReg(Reg2);
+
+        if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
+            (CompactUnwindEncoding & 0xF1E) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR;
+        else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
+                 (CompactUnwindEncoding & 0xF1C) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR;
+        else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
+                 (CompactUnwindEncoding & 0xF18) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR;
+        else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
+                 (CompactUnwindEncoding & 0xF10) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR;
+        else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
+                 (CompactUnwindEncoding & 0xF00) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR;
+        else {
+          Reg1 = getDRegFromBReg(Reg1);
+          Reg2 = getDRegFromBReg(Reg2);
+
+          // D8/D9 pair   = 0x00000100,
+          // D10/D11 pair = 0x00000200,
+          // D12/D13 pair = 0x00000400,
+          // D14/D15 pair = 0x00000800
+          if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
+              (CompactUnwindEncoding & 0xE00) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR;
+          else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
+                   (CompactUnwindEncoding & 0xC00) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR;
+          else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
+                   (CompactUnwindEncoding & 0x800) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR;
+          else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR;
+          else
+            // A pair was pushed which we cannot handle.
+            return CU::UNWIND_AArch64_MODE_DWARF;
+        }
+
+        break;
+      }
+      }
+    }
+
+    if (!HasFP) {
+      // With compact unwind info we can only represent stack adjustments of up
+      // to 65520 bytes.
+      if (StackSize > 65520)
+        return CU::UNWIND_AArch64_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS;
+      CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
+    }
+
+    return CompactUnwindEncoding;
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+class ELFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+  uint8_t OSABI;
+  bool IsLittleEndian;
+
+  ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian)
+    : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian);
+  }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+};
+
+void ELFAArch64AsmBackend::processFixupValue(
+    const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
+    const MCFragment *DF, const MCValue &Target, uint64_t &Value,
+    bool &IsResolved) {
+  // The ADRP instruction adds some multiple of 0x1000 to the current PC &
+  // ~0xfff. This means that the required offset to reach a symbol can vary by
+  // up to one step depending on where the ADRP is in memory. For example:
+  //
+  //     ADRP x0, there
+  //  there:
+  //
+  // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
+  // we'll need that as an offset. At any other address "there" will be in the
+  // same page as the ADRP and the instruction should encode 0x0. Assuming the
+  // section isn't 0x1000-aligned, we therefore need to delegate this decision
+  // to the linker -- a relocation!
+  if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+    IsResolved = false;
+}
+
+void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                      unsigned DataSize, uint64_t Value,
+                                      bool IsPCRel) const {
+  // store fixups in .eh_frame section in big endian order
+  if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
+    const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
+    const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
+    if (SecELF->getSectionName() == ".eh_frame")
+      Value = ByteSwap_32(unsigned(Value));
+  }
+  AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
+}
+}
+
+MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+                                            const MCRegisterInfo &MRI,
+                                            StringRef TT, StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    return new DarwinAArch64AsmBackend(T, MRI);
+
+  assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+  return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
+}
+
+MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+                                            const MCRegisterInfo &MRI,
+                                            StringRef TT, StringRef CPU) {
+  Triple TheTriple(TT);
+
+  assert(TheTriple.isOSBinFormatELF() &&
+         "Big endian is only supported for ELF targets!");
+  return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+                                  /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
new file mode 100644
index 0000000..e05191e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -0,0 +1,257 @@
+//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file handles ELF-specific object emission, converting LLVM's internal
+// fixups into the appropriate relocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian);
+
+  virtual ~AArch64ELFObjectWriter();
+
+protected:
+  unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsPCRel) const override;
+
+private:
+};
+}
+
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
+                                               bool IsLittleEndian)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+                              /*HasRelocationAddend*/ true) {}
+
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}
+
+unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
+                                            const MCFixup &Fixup,
+                                            bool IsPCRel) const {
+  AArch64MCExpr::VariantKind RefKind =
+      static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+  AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
+  bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+
+  assert((!Target.getSymA() ||
+          Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
+  assert((!Target.getSymB() ||
+          Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    case FK_Data_2:
+      return ELF::R_AARCH64_PREL16;
+    case FK_Data_4:
+      return ELF::R_AARCH64_PREL32;
+    case FK_Data_8:
+      return ELF::R_AARCH64_PREL64;
+    case AArch64::fixup_aarch64_pcrel_adr_imm21:
+      assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation");
+      return ELF::R_AARCH64_ADR_PREL_LO21;
+    case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+      if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
+        return ELF::R_AARCH64_ADR_PREL_PG_HI21;
+      if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
+        return ELF::R_AARCH64_ADR_GOT_PAGE;
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
+      if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
+        return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+      llvm_unreachable("invalid symbol kind for ADRP relocation");
+    case AArch64::fixup_aarch64_pcrel_branch26:
+      return ELF::R_AARCH64_JUMP26;
+    case AArch64::fixup_aarch64_pcrel_call26:
+      return ELF::R_AARCH64_CALL26;
+    case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL)
+        return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
+      return ELF::R_AARCH64_LD_PREL_LO19;
+    case AArch64::fixup_aarch64_pcrel_branch14:
+      return ELF::R_AARCH64_TSTBR14;
+    case AArch64::fixup_aarch64_pcrel_branch19:
+      return ELF::R_AARCH64_CONDBR19;
+    default:
+      llvm_unreachable("Unsupported pc-relative fixup kind");
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+    case FK_Data_2:
+      return ELF::R_AARCH64_ABS16;
+    case FK_Data_4:
+      return ELF::R_AARCH64_ABS32;
+    case FK_Data_8:
+      return ELF::R_AARCH64_ABS64;
+    case AArch64::fixup_aarch64_add_imm12:
+      if (RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12;
+      if (RefKind == AArch64MCExpr::VK_TPREL_HI12)
+        return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC)
+        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_LO12)
+        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
+      if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC)
+        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
+        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
+      if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
+        return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_ADD_ABS_LO12_NC;
+
+      report_fatal_error("invalid fixup for add (uimm12) instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale1:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
+
+      report_fatal_error("invalid fixup for 8-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale2:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
+
+      report_fatal_error("invalid fixup for 16-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale4:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
+
+      report_fatal_error("invalid fixup for 32-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale8:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
+        return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
+        return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
+        return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+
+      report_fatal_error("invalid fixup for 64-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale16:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
+
+      report_fatal_error("invalid fixup for 128-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_movw:
+      if (RefKind == AArch64MCExpr::VK_ABS_G3)
+        return ELF::R_AARCH64_MOVW_UABS_G3;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2)
+        return ELF::R_AARCH64_MOVW_UABS_G2;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2_S)
+        return ELF::R_AARCH64_MOVW_SABS_G2;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G2_NC;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1)
+        return ELF::R_AARCH64_MOVW_UABS_G1;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1_S)
+        return ELF::R_AARCH64_MOVW_SABS_G1;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0)
+        return ELF::R_AARCH64_MOVW_UABS_G0;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0_S)
+        return ELF::R_AARCH64_MOVW_SABS_G0;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G0_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G0)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G2)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G1)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G0)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
+      if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1)
+        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
+        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+      report_fatal_error("invalid fixup for movz/movk instruction");
+      return 0;
+    case AArch64::fixup_aarch64_tlsdesc_call:
+      return ELF::R_AARCH64_TLSDESC_CALL;
+    default:
+      llvm_unreachable("Unknown ELF relocation type");
+    }
+  }
+
+  llvm_unreachable("Unimplemented fixup -> relocation");
+}
+
+MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS,
+                                                 uint8_t OSABI,
+                                                 bool IsLittleEndian) {
+  MCELFObjectTargetWriter *MOTW =
+      new AArch64ELFObjectWriter(OSABI, IsLittleEndian);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
new file mode 100644
index 0000000..a79406d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -0,0 +1,160 @@
+//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits AArch64 ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit
+// regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// AArch64 ELF ABI:
+///    infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
+///
+/// In brief: $x or $d should be emitted at the start of each contiguous region
+/// of A64 code or data in a section. In practice, this emission does not rely
+/// on explicit assembler directives but on inherent properties of the
+/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
+/// instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class AArch64ELFStreamer : public MCELFStreamer {
+public:
+  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                   MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
+        LastEMS(EMS_None) {}
+
+  ~AArch64ELFStreamer() {}
+
+  void ChangeSection(const MCSection *Section,
+                     const MCExpr *Subsection) override {
+    // We have to keep track of the mapping symbol state of any sections we
+    // use. Each one should start off as EMS_None, which is provided as the
+    // default constructor by DenseMap::lookup.
+    LastMappingSymbols[getPreviousSection().first] = LastEMS;
+    LastEMS = LastMappingSymbols.lookup(Section);
+
+    MCELFStreamer::ChangeSection(Section, Subsection);
+  }
+
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer. We override it to add the appropriate mapping symbol if
+  /// necessary.
+  void EmitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
+    EmitA64MappingSymbol();
+    MCELFStreamer::EmitInstruction(Inst, STI);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+  /// if necessary.
+  void EmitBytes(StringRef Data) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitBytes(Data);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+  /// if necessary.
+  void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                     const SMLoc &Loc) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitValueImpl(Value, Size);
+  }
+
+private:
+  enum ElfMappingSymbol {
+    EMS_None,
+    EMS_A64,
+    EMS_Data
+  };
+
+  void EmitDataMappingSymbol() {
+    if (LastEMS == EMS_Data)
+      return;
+    EmitMappingSymbol("$d");
+    LastEMS = EMS_Data;
+  }
+
+  void EmitA64MappingSymbol() {
+    if (LastEMS == EMS_A64)
+      return;
+    EmitMappingSymbol("$x");
+    LastEMS = EMS_A64;
+  }
+
+  void EmitMappingSymbol(StringRef Name) {
+    MCSymbol *Start = getContext().CreateTempSymbol();
+    EmitLabel(Start);
+
+    MCSymbol *Symbol = getContext().GetOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++));
+
+    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+    MCELF::SetType(SD, ELF::STT_NOTYPE);
+    MCELF::SetBinding(SD, ELF::STB_LOCAL);
+    SD.setExternal(false);
+    Symbol->setSection(*getCurrentSection().first);
+
+    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+    Symbol->setVariableValue(Value);
+  }
+
+  int64_t MappingSymbolCounter;
+
+  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+  ElfMappingSymbol LastEMS;
+
+  /// @}
+};
+}
+
+namespace llvm {
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_ostream &OS, MCCodeEmitter *Emitter,
+                                        bool RelaxAll, bool NoExecStack) {
+  AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  if (NoExecStack)
+    S->getAssembler().setNoExecStack(true);
+  return S;
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
new file mode 100644
index 0000000..bc6973b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -0,0 +1,26 @@
+//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_ELF_STREAMER_H
+#define LLVM_AARCH64_ELF_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_ostream &OS, MCCodeEmitter *Emitter,
+                                        bool RelaxAll, bool NoExecStack);
+}
+
+#endif // AArch64_ELF_STREAMER_H
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
new file mode 100644
index 0000000..bf405fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -0,0 +1,76 @@
+//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AArch64FIXUPKINDS_H
+#define LLVM_AArch64FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AArch64 {
+
+enum Fixups {
+  // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
+  // an ADR instruction.
+  fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
+
+  // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
+  // an ADRP instruction.
+  fixup_aarch64_pcrel_adrp_imm21,
+
+  // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
+  //     No alignment adjustment. All value bits are encoded.
+  fixup_aarch64_add_imm12,
+
+  // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
+  // store instructions.
+  fixup_aarch64_ldst_imm12_scale1,
+  fixup_aarch64_ldst_imm12_scale2,
+  fixup_aarch64_ldst_imm12_scale4,
+  fixup_aarch64_ldst_imm12_scale8,
+  fixup_aarch64_ldst_imm12_scale16,
+
+  // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+  // pc-relative loads and generates relocations directly when necessary.
+  fixup_aarch64_ldr_pcrel_imm19,
+
+  // FIXME: comment
+  fixup_aarch64_movw,
+
+  // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch14,
+
+  // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
+  // b.cc and generates relocations directly when necessary.
+  fixup_aarch64_pcrel_branch19,
+
+  // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch26,
+
+  // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+  // immediate. Distinguished from branch26 only on ELF.
+  fixup_aarch64_pcrel_call26,
+
+  // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
+  // R_AARCH64_TLSDESC_CALL relocation.
+  fixup_aarch64_tlsdesc_call,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace AArch64
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
new file mode 100644
index 0000000..1763b40
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -0,0 +1,101 @@
+//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AArch64MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterVariantTy {
+  Default = -1,
+  Generic = 0,
+  Apple = 1
+};
+
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+    "aarch64-neon-syntax", cl::init(Default),
+    cl::desc("Choose style of NEON code to emit from AArch64 backend:"),
+    cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+               clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
+               clEnumValEnd));
+
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+  PrivateGlobalPrefix = "L";
+  SeparatorString = "%%";
+  CommentString = ";";
+  PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AlignmentIsInBytes = false;
+  UsesELFSectionDirectiveForBSS = true;
+  SupportsDebugInformation = true;
+  UseDataRegionDirectives = true;
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+    const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
+  MCSymbol *PCSym = Context.CreateTempSymbol();
+  Streamer.EmitLabel(PCSym);
+  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
+  return MCBinaryExpr::CreateSub(Res, PC, Context);
+}
+
+AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
+  Triple T(TT);
+  if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be)
+    IsLittleEndian = false;
+
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+
+  PointerSize = 8;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "//";
+  PrivateGlobalPrefix = ".L";
+  Code32Directive = ".code\t32";
+
+  Data16bitsDirective = "\t.hword\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = "\t.xword\t";
+
+  UseDataRegionDirectives = false;
+
+  WeakRefDirective = "\t.weak\t";
+
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  UseIntegratedAssembler = true;
+
+  HasIdentDirective = true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
new file mode 100644
index 0000000..42a031d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -0,0 +1,36 @@
+//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AArch64MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64TARGETASMINFO_H
+#define AArch64TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+class Target;
+class StringRef;
+class MCStreamer;
+struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
+  explicit AArch64MCAsmInfoDarwin();
+  const MCExpr *
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+};
+
+struct AArch64MCAsmInfoELF : public MCAsmInfo {
+  explicit AArch64MCAsmInfoELF(StringRef TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
new file mode 100644
index 0000000..f051357
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -0,0 +1,650 @@
+//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+
+class AArch64MCCodeEmitter : public MCCodeEmitter {
+  MCContext &Ctx;
+
+  AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const AArch64MCCodeEmitter &);     // DO NOT IMPLEMENT
+public:
+  AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                     MCContext &ctx)
+      : Ctx(ctx) {}
+
+  ~AArch64MCCodeEmitter() {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate
+  /// attached to a load, store or prfm instruction. If operand requires a
+  /// relocation, record it and return zero in that part of the encoding.
+  template <uint32_t FixupKind>
+  uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+  /// target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+  /// the 2-bit shift field.
+  uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+  /// branch target.
+  uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  /// getLoadLiteralOpValue - Return the encoded value for a load-literal
+  /// pc-relative address.
+  uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store
+  /// instruction: bit 0 is whether a shift is present, bit 1 is whether the
+  /// operation is a sign extend (as opposed to a zero extend).
+  uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+  /// branch target.
+  uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  /// getBranchTargetOpValue - Return the encoded value for an unconditional
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+  /// of a MOVZ or MOVK instruction.
+  uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+  uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+  /// shifter (MSL).
+  uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getFixedPointScaleOpValue - Return the encoded value for the
+  // FP-to-fixed-point scale factor.
+  uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getSIMDShift64OpValue - Return the encoded value for the
+  // shift-by-immediate AdvSIMD instructions.
+  uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                   const MCSubtargetInfo &STI) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
+                      const MCSubtargetInfo &STI) const;
+
+  template<int hasRs, int hasRt2> unsigned
+  fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
+                        const MCSubtargetInfo &STI) const;
+
+  unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+                                     const MCSubtargetInfo &STI) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new AArch64MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  assert(MO.isImm() && "did not expect relocated expression");
+  return static_cast<unsigned>(MO.getImm());
+}
+
+template<unsigned FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  uint32_t ImmVal = 0;
+
+  if (MO.isImm())
+    ImmVal = static_cast<uint32_t>(MO.getImm());
+  else {
+    assert(MO.isExpr() && "unable to encode load/store imm operand");
+    MCFixupKind Kind = MCFixupKind(FixupKind);
+    Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+    ++MCNumFixups;
+  }
+
+  return ImmVal;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+  const MCExpr *Expr = MO.getExpr();
+
+  MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  MCNumFixups += 1;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the 2-bit shift field.  The shift field is stored in bits 13-14 of the
+/// return value.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // Suboperands are [imm, shifter].
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+         "unexpected shift type for add/sub immediate");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+  assert((ShiftVal == 0 || ShiftVal == 12) &&
+         "unexpected shift value for add/sub immediate");
+  if (MO.isImm())
+    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+  assert(MO.isExpr() && "Unable to encode MCOperand!");
+  const MCExpr *Expr = MO.getExpr();
+
+  // Encode the 12 bits of the fixup.
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getLoadLiteralOpValue - Return the encoded value for a load-literal
+/// pc-relative address.
+uint32_t
+AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned SignExtend = MI.getOperand(OpIdx).getImm();
+  unsigned DoShift = MI.getOperand(OpIdx + 1).getImm();
+  return (SignExtend << 1) | DoShift;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+  Fixups.push_back(MCFixup::Create(
+      0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
+
+  MCFixupKind Kind = MI.getOpcode() == AArch64::BL
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+///   00 -> 0
+///   01 -> 8
+///   10 -> 16
+///   11 -> 24
+uint32_t
+AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+
+  switch (MO.getImm()) {
+  default:
+    break;
+  case 0:
+    return 0;
+  case 8:
+    return 1;
+  case 16:
+    return 2;
+  case 24:
+    return 3;
+  }
+
+  assert(false && "Invalid value for vector shift amount!");
+  return 0;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - (MO.getImm());
+}
+
+uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - (MO.getImm() | 32);
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 32 - (MO.getImm() | 16);
+}
+
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 16 - (MO.getImm() | 8);
+}
+
+/// getFixedPointScaleOpValue - Return the encoded value for the
+// FP-to-fixed-point scale factor.
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 32 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 16 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 8 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 64;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 32;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 16;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL).
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() &&
+         "Expected an immediate value for the move shift amount!");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm());
+  assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+  return ShiftVal == 8 ? 0 : 1;
+}
+
+unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                                       const MCSubtargetInfo &STI) const {
+  // If one of the signed fixup kinds is applied to a MOVZ instruction, the
+  // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
+  // job to ensure that any bits possibly affected by this are 0. This means we
+  // must zero out bit 30 (essentially emitting a MOVN).
+  MCOperand UImm16MO = MI.getOperand(1);
+
+  // Nothing to do if there's no fixup.
+  if (UImm16MO.isImm())
+    return EncodedValue;
+
+  const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
+  switch (A64E->getKind()) {
+  case AArch64MCExpr::VK_DTPREL_G2:
+  case AArch64MCExpr::VK_DTPREL_G1:
+  case AArch64MCExpr::VK_DTPREL_G0:
+  case AArch64MCExpr::VK_GOTTPREL_G1:
+  case AArch64MCExpr::VK_TPREL_G2:
+  case AArch64MCExpr::VK_TPREL_G1:
+  case AArch64MCExpr::VK_TPREL_G0:
+    return EncodedValue & ~(1u << 30);
+  default:
+    // Nothing to do for an unsigned fixup.
+    return EncodedValue;
+  }
+
+
+  return EncodedValue & ~(1u << 30);
+}
+
+void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+    // following (BLR) instruction. It doesn't emit any code itself so it
+    // doesn't go through the normal TableGenerated channels.
+    MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
+    Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
+    return;
+  }
+
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  EmitConstant(Binary, 4, OS);
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned
+AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI,
+                                 unsigned EncodedValue,
+                                 const MCSubtargetInfo &STI) const {
+  // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+  // (i.e. all bits 1) but is ignored by the processor.
+  EncodedValue |= 0x1f << 10;
+  return EncodedValue;
+}
+
+template<int hasRs, int hasRt2> unsigned
+AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
+                                            unsigned EncodedValue,
+                                            const MCSubtargetInfo &STI) const {
+  if (!hasRs) EncodedValue |= 0x001F0000;
+  if (!hasRt2) EncodedValue |= 0x00007C00;
+
+  return EncodedValue;
+}
+
+unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
+    const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const {
+  // The Rm field of FCMP and friends is unused - it should be assembled
+  // as 0, but is ignored by the processor.
+  EncodedValue &= ~(0x1f << 16);
+  return EncodedValue;
+}
+
+#include "AArch64GenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
new file mode 100644
index 0000000..42a6787
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -0,0 +1,145 @@
+//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
+const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
+                                       MCContext &Ctx) {
+  return new (Ctx) AArch64MCExpr(Expr, Kind);
+}
+
+StringRef AArch64MCExpr::getVariantKindName() const {
+  switch (static_cast<uint32_t>(getKind())) {
+  case VK_CALL:                return "";
+  case VK_LO12:                return ":lo12:";
+  case VK_ABS_G3:              return ":abs_g3:";
+  case VK_ABS_G2:              return ":abs_g2:";
+  case VK_ABS_G2_S:            return ":abs_g2_s:";
+  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case VK_ABS_G1:              return ":abs_g1:";
+  case VK_ABS_G1_S:            return ":abs_g1_s:";
+  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case VK_ABS_G0:              return ":abs_g0:";
+  case VK_ABS_G0_S:            return ":abs_g0_s:";
+  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case VK_DTPREL_G2:           return ":dtprel_g2:";
+  case VK_DTPREL_G1:           return ":dtprel_g1:";
+  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case VK_DTPREL_G0:           return ":dtprel_g0:";
+  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case VK_TPREL_G2:            return ":tprel_g2:";
+  case VK_TPREL_G1:            return ":tprel_g1:";
+  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case VK_TPREL_G0:            return ":tprel_g0:";
+  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case VK_TPREL_HI12:          return ":tprel_hi12:";
+  case VK_TPREL_LO12:          return ":tprel_lo12:";
+  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case VK_ABS_PAGE:            return "";
+  case VK_GOT_PAGE:            return ":got:";
+  case VK_GOT_LO12:            return ":got_lo12:";
+  case VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case VK_TLSDESC:             return "";
+  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  default:
+    llvm_unreachable("Invalid ELF symbol kind");
+  }
+}
+
+void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
+  if (getKind() != VK_NONE)
+    OS << getVariantKindName();
+  OS << *Expr;
+}
+
+void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
+
+const MCSection *AArch64MCExpr::FindAssociatedSection() const {
+  llvm_unreachable("FIXME: what goes here?");
+}
+
+bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                            const MCAsmLayout *Layout) const {
+  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+    return false;
+
+  Res =
+      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+  return true;
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expression");
+    break;
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    // We're known to be under a TLS fixup, so any symbol should be
+    // modified. There should be only one.
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+    MCELF::SetType(SD, ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getSymbolLoc(Kind)) {
+  default:
+    return;
+  case VK_DTPREL:
+  case VK_GOTTPREL:
+  case VK_TPREL:
+  case VK_TLSDESC:
+    break;
+  }
+
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
new file mode 100644
index 0000000..5422f9d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -0,0 +1,168 @@
+//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes AArch64-specific MCExprs, used for modifiers like
+// ":lo12:" or ":gottprel_g1:".
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AArch64MCEXPR_H
+#define LLVM_AArch64MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class AArch64MCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_NONE     = 0x000,
+
+    // Symbol locations specifying (roughly speaking) what calculation should be
+    // performed to construct the final address for the relocated
+    // symbol. E.g. direct, via the GOT, ...
+    VK_ABS      = 0x001,
+    VK_SABS     = 0x002,
+    VK_GOT      = 0x003,
+    VK_DTPREL   = 0x004,
+    VK_GOTTPREL = 0x005,
+    VK_TPREL    = 0x006,
+    VK_TLSDESC  = 0x007,
+    VK_SymLocBits = 0x00f,
+
+    // Variants specifying which part of the final address calculation is
+    // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+    // MOVZ/MOVK.
+    VK_PAGE     = 0x010,
+    VK_PAGEOFF  = 0x020,
+    VK_HI12     = 0x030,
+    VK_G0       = 0x040,
+    VK_G1       = 0x050,
+    VK_G2       = 0x060,
+    VK_G3       = 0x070,
+    VK_AddressFragBits = 0x0f0,
+
+    // Whether the final relocation is a checked one (where a linker should
+    // perform a range-check on the final address) or not. Note that this field
+    // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+    // on its own is a non-checked relocation. We side with ELF on being
+    // explicit about this!
+    VK_NC       = 0x100,
+
+    // Convenience definitions for referring to specific textual representations
+    // of relocation specifiers. Note that this means the "_NC" is sometimes
+    // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+    // since a user would write ":lo12:").
+    VK_CALL              = VK_ABS,
+    VK_ABS_PAGE          = VK_ABS      | VK_PAGE,
+    VK_ABS_G3            = VK_ABS      | VK_G3,
+    VK_ABS_G2            = VK_ABS      | VK_G2,
+    VK_ABS_G2_S          = VK_SABS     | VK_G2,
+    VK_ABS_G2_NC         = VK_ABS      | VK_G2      | VK_NC,
+    VK_ABS_G1            = VK_ABS      | VK_G1,
+    VK_ABS_G1_S          = VK_SABS     | VK_G1,
+    VK_ABS_G1_NC         = VK_ABS      | VK_G1      | VK_NC,
+    VK_ABS_G0            = VK_ABS      | VK_G0,
+    VK_ABS_G0_S          = VK_SABS     | VK_G0,
+    VK_ABS_G0_NC         = VK_ABS      | VK_G0      | VK_NC,
+    VK_LO12              = VK_ABS      | VK_PAGEOFF | VK_NC,
+    VK_GOT_LO12          = VK_GOT      | VK_PAGEOFF | VK_NC,
+    VK_GOT_PAGE          = VK_GOT      | VK_PAGE,
+    VK_DTPREL_G2         = VK_DTPREL   | VK_G2,
+    VK_DTPREL_G1         = VK_DTPREL   | VK_G1,
+    VK_DTPREL_G1_NC      = VK_DTPREL   | VK_G1      | VK_NC,
+    VK_DTPREL_G0         = VK_DTPREL   | VK_G0,
+    VK_DTPREL_G0_NC      = VK_DTPREL   | VK_G0      | VK_NC,
+    VK_DTPREL_HI12       = VK_DTPREL   | VK_HI12,
+    VK_DTPREL_LO12       = VK_DTPREL   | VK_PAGEOFF,
+    VK_DTPREL_LO12_NC    = VK_DTPREL   | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_PAGE     = VK_GOTTPREL | VK_PAGE,
+    VK_GOTTPREL_LO12_NC  = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_G1       = VK_GOTTPREL | VK_G1,
+    VK_GOTTPREL_G0_NC    = VK_GOTTPREL | VK_G0      | VK_NC,
+    VK_TPREL_G2          = VK_TPREL    | VK_G2,
+    VK_TPREL_G1          = VK_TPREL    | VK_G1,
+    VK_TPREL_G1_NC       = VK_TPREL    | VK_G1      | VK_NC,
+    VK_TPREL_G0          = VK_TPREL    | VK_G0,
+    VK_TPREL_G0_NC       = VK_TPREL    | VK_G0      | VK_NC,
+    VK_TPREL_HI12        = VK_TPREL    | VK_HI12,
+    VK_TPREL_LO12        = VK_TPREL    | VK_PAGEOFF,
+    VK_TPREL_LO12_NC     = VK_TPREL    | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_LO12      = VK_TLSDESC  | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_PAGE      = VK_TLSDESC  | VK_PAGE,
+
+    VK_INVALID  = 0xfff
+  };
+
+private:
+  const MCExpr *Expr;
+  const VariantKind Kind;
+
+  explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind)
+    : Expr(Expr), Kind(Kind) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
+                                   MCContext &Ctx);
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// Get the kind of this expression.
+  VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
+
+  /// Get the expression this modifier applies to.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+  /// @name VariantKind information extractors.
+  /// @{
+
+  static VariantKind getSymbolLoc(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_SymLocBits);
+  }
+
+  static VariantKind getAddressFrag(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_AddressFragBits);
+  }
+
+  static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
+
+  /// @}
+
+  /// Convert the variant kind into an ELF-appropriate modifier
+  /// (e.g. ":got:", ":lo12:").
+  StringRef getVariantKindName() const;
+
+  void PrintImpl(raw_ostream &OS) const override;
+
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+
+  const MCSection *FindAssociatedSection() const override;
+
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const override;
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  static bool classof(const AArch64MCExpr *) { return true; }
+
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
new file mode 100644
index 0000000..ae698c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -0,0 +1,225 @@
+//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCTargetDesc.h"
+#include "AArch64ELFStreamer.h"
+#include "AArch64MCAsmInfo.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+static MCInstrInfo *createAArch64MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAArch64MCInstrInfo(X);
+  return X;
+}
+
+static MCSubtargetInfo *
+createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+
+  if (CPU.empty())
+    CPU = "generic";
+
+  InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAArch64MCRegisterInfo(X, AArch64::LR);
+  return X;
+}
+
+static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
+                                         StringRef TT) {
+  Triple TheTriple(TT);
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin())
+    MAI = new AArch64MCAsmInfoDarwin();
+  else {
+    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+    MAI = new AArch64MCAsmInfoELF(TT);
+  }
+
+  // Initial state of the frame pointer is SP.
+  unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                 CodeModel::Model CM,
+                                                 CodeGenOpt::Level OL) {
+  Triple TheTriple(TT);
+  assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+         "Only expect Darwin and ELF targets");
+
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  // The default MCJIT memory managers make no guarantees about where they can
+  // find an executable page; JITed code needs to be able to refer to globals
+  // no matter how far away they are.
+  else if (CM == CodeModel::JITDefault)
+    CM = CodeModel::Large;
+  else if (CM != CodeModel::Small && CM != CodeModel::Large)
+    report_fatal_error(
+        "Only small and large code models are allowed on AArch64");
+
+  // AArch64 Darwin is always PIC.
+  if (TheTriple.isOSDarwin())
+    RM = Reloc::PIC_;
+  // On ELF platforms the default static relocation model has a smart enough
+  // linker to cope with referencing external symbols defined in a shared
+  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+  else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+    RM = Reloc::Static;
+
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new AArch64InstPrinter(MAI, MII, MRI, STI);
+  if (SyntaxVariant == 1)
+    return new AArch64AppleInstPrinter(MAI, MII, MRI, STI);
+
+  return nullptr;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &TAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+                               /*LabelSections*/ true);
+
+  return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAArch64TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
+                                        createAArch64MCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
+                                        createAArch64MCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget,
+                                        createAArch64MCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget,
+                                        createAArch64MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget,
+                                      createAArch64MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
+                                      createAArch64MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget,
+                                      createAArch64MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget,
+                                      createAArch64MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget,
+                                    createAArch64MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
+                                    createAArch64MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheARM64leTarget,
+                                    createAArch64MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheARM64beTarget,
+                                    createAArch64MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget,
+                                          createAArch64MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
+                                          createAArch64MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget,
+                                          createAArch64MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget,
+                                          createAArch64MCSubtargetInfo);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget,
+                                       createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
+                                       createAArch64beAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget,
+                                       createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget,
+                                       createAArch64beAsmBackend);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
+                                        createAArch64MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
+                                        createAArch64MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget,
+                                        createAArch64MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget,
+                                        createAArch64MCCodeEmitter);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget,
+                                           createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
+                                           createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
+                                        createAArch64MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
+                                        createAArch64MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget,
+                                        createAArch64MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget,
+                                        createAArch64MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
new file mode 100644
index 0000000..d886ea2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -0,0 +1,70 @@
+//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64MCTARGETDESC_H
+#define AArch64MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCObjectWriter;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheAArch64leTarget;
+extern Target TheAArch64beTarget;
+extern Target TheARM64leTarget;
+extern Target TheARM64beTarget;
+
+MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        const MCSubtargetInfo &STI,
+                                        MCContext &Ctx);
+MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI, StringRef TT,
+                                        StringRef CPU);
+MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI, StringRef TT,
+                                        StringRef CPU);
+
+MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
+                                             bool IsLittleEndian);
+
+MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
+                                            uint32_t CPUSubtype);
+
+} // End llvm namespace
+
+// Defines symbolic names for AArch64 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "AArch64GenRegisterInfo.inc"
+
+// Defines symbolic names for the AArch64 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AArch64GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
new file mode 100644
index 0000000..ba95366
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -0,0 +1,396 @@
+//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
+  bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+                                  const MCSymbolRefExpr *Sym,
+                                  unsigned &Log2Size, const MCAssembler &Asm);
+
+public:
+  AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
+                                 /*UseAggressiveSymbolFolding=*/true) {}
+
+  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+};
+}
+
+bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
+    const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
+    unsigned &Log2Size, const MCAssembler &Asm) {
+  RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
+  Log2Size = ~0U;
+
+  switch ((unsigned)Fixup.getKind()) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    Log2Size = llvm::Log2_32(4);
+    switch (Sym->getKind()) {
+    default:
+      llvm_unreachable("Unexpected symbol reference variant kind!");
+    case MCSymbolRefExpr::VK_PAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
+      return true;
+    }
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    Log2Size = llvm::Log2_32(4);
+    // This encompasses the relocation for the whole 21-bit value.
+    switch (Sym->getKind()) {
+    default:
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "ADR/ADRP relocations must be GOT relative");
+    case MCSymbolRefExpr::VK_PAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
+      return true;
+    }
+    return true;
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    Log2Size = llvm::Log2_32(4);
+    RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
+    return true;
+  }
+}
+
+void AArch64MachObjectWriter::RecordRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
+  unsigned Log2Size = 0;
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+  unsigned Kind = Fixup.getKind();
+
+  FixupOffset += Fixup.getOffset();
+
+  // AArch64 pcrel relocation addends do not include the section offset.
+  if (IsPCRel)
+    FixedValue += FixupOffset;
+
+  // ADRP fixups use relocations for the whole symbol value and only
+  // put the addend in the instruction itself. Clear out any value the
+  // generic code figured out from the sybmol definition.
+  if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+    FixedValue = 0;
+
+  // imm19 relocations are for conditional branches, which require
+  // assembler local symbols. If we got here, that's not what we have,
+  // so complain loudly.
+  if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                                "conditional branch requires assembler-local"
+                                " label. '" +
+                                    Target.getSymA()->getSymbol().getName() +
+                                    "' is external.");
+    return;
+  }
+
+  // 14-bit branch relocations should only target internal labels, and so
+  // should never get here.
+  if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                                "Invalid relocation on conditional branch!");
+    return;
+  }
+
+  if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+                                  Asm)) {
+    Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
+    return;
+  }
+
+  Value = Target.getConstant();
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME: Should this always be extern?
+    // SymbolNum of 0 indicates the absolute section.
+    Type = MachO::ARM64_RELOC_UNSIGNED;
+    Index = 0;
+
+    if (IsPCRel) {
+      IsExtern = 1;
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "PC relative absolute relocation!");
+
+      // FIXME: x86_64 sets the type to a branch reloc here. Should we do
+      // something similar?
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    const MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    const MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+    // Check for "_foo@got - .", which comes through here as:
+    // Ltmp0:
+    //    ... _foo@got - Ltmp0
+    if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
+        Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
+        Layout.getSymbolOffset(&B_SD) ==
+            Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
+      // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+      Index = A_Base->getIndex();
+      IsExtern = 1;
+      Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
+      IsPCRel = 1;
+      MachO::any_relocation_info MRE;
+      MRE.r_word0 = FixupOffset;
+      MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                     (IsExtern << 27) | (Type << 28));
+      Writer->addRelocation(Fragment->getParent(), MRE);
+      return;
+    } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+               Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+      // Otherwise, neither symbol can be modified.
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported relocation of modified symbol");
+
+    // We don't support PCrel relocations of differences.
+    if (IsPCRel)
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported pc-relative relocation of "
+                                  "difference");
+
+    // AArch64 always uses external relocations. If there is no symbol to use as
+    // a base address (a local symbol with no preceding non-local symbol),
+    // error out.
+    //
+    // FIXME: We should probably just synthesize an external symbol and use
+    // that.
+    if (!A_Base)
+      Asm.getContext().FatalError(
+          Fixup.getLoc(),
+          "unsupported relocation of local symbol '" + A->getName() +
+              "'. Must have non-local symbol earlier in section.");
+    if (!B_Base)
+      Asm.getContext().FatalError(
+          Fixup.getLoc(),
+          "unsupported relocation of local symbol '" + B->getName() +
+              "'. Must have non-local symbol earlier in section.");
+
+    if (A_Base == B_Base && A_Base)
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported relocation with identical base");
+
+    Value += (!A_SD.getFragment() ? 0
+                                  : Writer->getSymbolAddress(&A_SD, Layout)) -
+             (!A_Base || !A_Base->getFragment()
+                  ? 0
+                  : Writer->getSymbolAddress(A_Base, Layout));
+    Value -= (!B_SD.getFragment() ? 0
+                                  : Writer->getSymbolAddress(&B_SD, Layout)) -
+             (!B_Base || !B_Base->getFragment()
+                  ? 0
+                  : Writer->getSymbolAddress(B_Base, Layout));
+
+    Index = A_Base->getIndex();
+    IsExtern = 1;
+    Type = MachO::ARM64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                   (IsExtern << 27) | (Type << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    Index = B_Base->getIndex();
+    IsExtern = 1;
+    Type = MachO::ARM64_RELOC_SUBTRACTOR;
+  } else { // A + constant
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData *Base = Asm.getAtom(&SD);
+    const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
+        Fragment->getParent()->getSection());
+
+    // If the symbol is a variable and we weren't able to get a Base for it
+    // (i.e., it's not in the symbol table associated with a section) resolve
+    // the relocation based its expansion instead.
+    if (Symbol->isVariable() && !Base) {
+      // If the evaluation is an absolute value, just use that directly
+      // to keep things easy.
+      int64_t Res;
+      if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+
+      // FIXME: Will the Target we already have ever have any data in it
+      // we need to preserve and merge with the new Target? How about
+      // the FixedValue?
+      if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+        Asm.getContext().FatalError(Fixup.getLoc(),
+                                    "unable to resolve variable '" +
+                                        Symbol->getName() + "'");
+      return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              FixedValue);
+    }
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand relocation entries and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+        Base = nullptr;
+    }
+
+    // AArch64 uses external relocations as much as possible. For debug
+    // sections, and for pointer-sized relocations (.quad), we allow section
+    // relocations.  It's code sections that run into trouble.
+    if (Base) {
+      Index = Base->getIndex();
+      IsExtern = 1;
+
+      // Add the local offset, if needed.
+      if (Base != &SD)
+        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+    } else if (Symbol->isInSection()) {
+      // Pointer-sized relocations can use a local relocation. Otherwise,
+      // we have to be in a debug info section.
+      if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+        Asm.getContext().FatalError(
+            Fixup.getLoc(),
+            "unsupported relocation of local symbol '" + Symbol->getName() +
+                "'. Must have non-local symbol earlier in section.");
+      // Adjust the relocation to be section-relative.
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD =
+          Asm.getSectionData(SD.getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      IsExtern = 0;
+      Value += Writer->getSymbolAddress(&SD, Layout);
+
+      if (IsPCRel)
+        Value -= Writer->getFragmentAddress(Fragment, Layout) +
+                 Fixup.getOffset() + (1ULL << Log2Size);
+    } else {
+      // Resolve constant variables.
+      if (SD.getSymbol().isVariable()) {
+        int64_t Res;
+        if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+                Res, Layout, Writer->getSectionAddressMap())) {
+          FixedValue = Res;
+          return;
+        }
+      }
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported relocation of variable '" +
+                                      Symbol->getName() + "'");
+    }
+  }
+
+  // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
+  // is represented via an Addend relocation, not encoded directly into
+  // the instruction.
+  if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
+       Type == MachO::ARM64_RELOC_PAGE21 ||
+       Type == MachO::ARM64_RELOC_PAGEOFF12) &&
+      Value) {
+    assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                   (IsExtern << 27) | (Type << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    // Now set up the Addend relocation.
+    Type = MachO::ARM64_RELOC_ADDEND;
+    Index = Value;
+    IsPCRel = 0;
+    Log2Size = 2;
+    IsExtern = 0;
+
+    // Put zero into the instruction itself. The addend is in the relocation.
+    Value = 0;
+  }
+
+  // If there's any addend left to handle, encode it in the instruction.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                 (IsExtern << 27) | (Type << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
+                                                  uint32_t CPUType,
+                                                  uint32_t CPUSubtype) {
+  return createMachObjectWriter(
+      new AArch64MachObjectWriter(CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
new file mode 100644
index 0000000..dcc1a3c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -0,0 +1,41 @@
+//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class --*- C++ -*---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+//
+// AArch64TargetStreamer Implemenation
+//
+AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+AArch64TargetStreamer::~AArch64TargetStreamer() {}
+
+// The constant pool handling is shared by all AArch64TargetStreamer
+// implementations.
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
+                                                          unsigned Size) {
+  return ConstantPools->addEntry(Streamer, Expr, Size);
+}
+
+void AArch64TargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
diff --git a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
new file mode 100644
index 0000000..3a382c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -0,0 +1,31 @@
+//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+Target TheAArch64leTarget;
+Target TheAArch64beTarget;
+Target TheARM64leTarget;
+Target TheARM64beTarget;
+} // end namespace llvm
+
+extern "C" void LLVMInitializeAArch64TargetInfo() {
+  RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64",
+                                                   "AArch64 (little endian)");
+  RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be",
+                                                      "AArch64 (big endian)");
+
+  RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
+      TheAArch64leTarget, "aarch64", "AArch64 (little endian)");
+  RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
+      TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
new file mode 100644
index 0000000..3c24bb3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -0,0 +1,901 @@
+//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides basic encoding and assembly information for AArch64.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+
+StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
+  for (unsigned i = 0; i < NumPairs; ++i) {
+    if (Pairs[i].Value == Value) {
+      Valid = true;
+      return Pairs[i].Name;
+    }
+  }
+
+  Valid = false;
+  return StringRef();
+}
+
+uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
+  std::string LowerCaseName = Name.lower();
+  for (unsigned i = 0; i < NumPairs; ++i) {
+    if (Pairs[i].Name == LowerCaseName) {
+      Valid = true;
+      return Pairs[i].Value;
+    }
+  }
+
+  Valid = false;
+  return -1;
+}
+
+bool AArch64NamedImmMapper::validImm(uint32_t Value) const {
+  return Value < TooBigImm;
+}
+
+const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = {
+  {"s1e1r", S1E1R},
+  {"s1e2r", S1E2R},
+  {"s1e3r", S1E3R},
+  {"s1e1w", S1E1W},
+  {"s1e2w", S1E2W},
+  {"s1e3w", S1E3W},
+  {"s1e0r", S1E0R},
+  {"s1e0w", S1E0W},
+  {"s12e1r", S12E1R},
+  {"s12e1w", S12E1W},
+  {"s12e0r", S12E0R},
+  {"s12e0w", S12E0W},
+};
+
+AArch64AT::ATMapper::ATMapper()
+  : AArch64NamedImmMapper(ATPairs, 0) {}
+
+const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = {
+  {"oshld", OSHLD},
+  {"oshst", OSHST},
+  {"osh", OSH},
+  {"nshld", NSHLD},
+  {"nshst", NSHST},
+  {"nsh", NSH},
+  {"ishld", ISHLD},
+  {"ishst", ISHST},
+  {"ish", ISH},
+  {"ld", LD},
+  {"st", ST},
+  {"sy", SY}
+};
+
+AArch64DB::DBarrierMapper::DBarrierMapper()
+  : AArch64NamedImmMapper(DBarrierPairs, 16u) {}
+
+const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = {
+  {"zva", ZVA},
+  {"ivac", IVAC},
+  {"isw", ISW},
+  {"cvac", CVAC},
+  {"csw", CSW},
+  {"cvau", CVAU},
+  {"civac", CIVAC},
+  {"cisw", CISW}
+};
+
+AArch64DC::DCMapper::DCMapper()
+  : AArch64NamedImmMapper(DCPairs, 0) {}
+
+const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = {
+  {"ialluis",  IALLUIS},
+  {"iallu", IALLU},
+  {"ivau", IVAU}
+};
+
+AArch64IC::ICMapper::ICMapper()
+  : AArch64NamedImmMapper(ICPairs, 0) {}
+
+const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = {
+  {"sy",  SY},
+};
+
+AArch64ISB::ISBMapper::ISBMapper()
+  : AArch64NamedImmMapper(ISBPairs, 16) {}
+
+const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = {
+  {"pldl1keep", PLDL1KEEP},
+  {"pldl1strm", PLDL1STRM},
+  {"pldl2keep", PLDL2KEEP},
+  {"pldl2strm", PLDL2STRM},
+  {"pldl3keep", PLDL3KEEP},
+  {"pldl3strm", PLDL3STRM},
+  {"plil1keep", PLIL1KEEP},
+  {"plil1strm", PLIL1STRM},
+  {"plil2keep", PLIL2KEEP},
+  {"plil2strm", PLIL2STRM},
+  {"plil3keep", PLIL3KEEP},
+  {"plil3strm", PLIL3STRM},
+  {"pstl1keep", PSTL1KEEP},
+  {"pstl1strm", PSTL1STRM},
+  {"pstl2keep", PSTL2KEEP},
+  {"pstl2strm", PSTL2STRM},
+  {"pstl3keep", PSTL3KEEP},
+  {"pstl3strm", PSTL3STRM}
+};
+
+AArch64PRFM::PRFMMapper::PRFMMapper()
+  : AArch64NamedImmMapper(PRFMPairs, 32) {}
+
+const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = {
+  {"spsel", SPSel},
+  {"daifset", DAIFSet},
+  {"daifclr", DAIFClr}
+};
+
+AArch64PState::PStateMapper::PStateMapper()
+  : AArch64NamedImmMapper(PStatePairs, 0) {}
+
+const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = {
+  {"mdccsr_el0", MDCCSR_EL0},
+  {"dbgdtrrx_el0", DBGDTRRX_EL0},
+  {"mdrar_el1", MDRAR_EL1},
+  {"oslsr_el1", OSLSR_EL1},
+  {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1},
+  {"pmceid0_el0", PMCEID0_EL0},
+  {"pmceid1_el0", PMCEID1_EL0},
+  {"midr_el1", MIDR_EL1},
+  {"ccsidr_el1", CCSIDR_EL1},
+  {"clidr_el1", CLIDR_EL1},
+  {"ctr_el0", CTR_EL0},
+  {"mpidr_el1", MPIDR_EL1},
+  {"revidr_el1", REVIDR_EL1},
+  {"aidr_el1", AIDR_EL1},
+  {"dczid_el0", DCZID_EL0},
+  {"id_pfr0_el1", ID_PFR0_EL1},
+  {"id_pfr1_el1", ID_PFR1_EL1},
+  {"id_dfr0_el1", ID_DFR0_EL1},
+  {"id_afr0_el1", ID_AFR0_EL1},
+  {"id_mmfr0_el1", ID_MMFR0_EL1},
+  {"id_mmfr1_el1", ID_MMFR1_EL1},
+  {"id_mmfr2_el1", ID_MMFR2_EL1},
+  {"id_mmfr3_el1", ID_MMFR3_EL1},
+  {"id_isar0_el1", ID_ISAR0_EL1},
+  {"id_isar1_el1", ID_ISAR1_EL1},
+  {"id_isar2_el1", ID_ISAR2_EL1},
+  {"id_isar3_el1", ID_ISAR3_EL1},
+  {"id_isar4_el1", ID_ISAR4_EL1},
+  {"id_isar5_el1", ID_ISAR5_EL1},
+  {"id_aa64pfr0_el1", ID_A64PFR0_EL1},
+  {"id_aa64pfr1_el1", ID_A64PFR1_EL1},
+  {"id_aa64dfr0_el1", ID_A64DFR0_EL1},
+  {"id_aa64dfr1_el1", ID_A64DFR1_EL1},
+  {"id_aa64afr0_el1", ID_A64AFR0_EL1},
+  {"id_aa64afr1_el1", ID_A64AFR1_EL1},
+  {"id_aa64isar0_el1", ID_A64ISAR0_EL1},
+  {"id_aa64isar1_el1", ID_A64ISAR1_EL1},
+  {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1},
+  {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1},
+  {"mvfr0_el1", MVFR0_EL1},
+  {"mvfr1_el1", MVFR1_EL1},
+  {"mvfr2_el1", MVFR2_EL1},
+  {"rvbar_el1", RVBAR_EL1},
+  {"rvbar_el2", RVBAR_EL2},
+  {"rvbar_el3", RVBAR_EL3},
+  {"isr_el1", ISR_EL1},
+  {"cntpct_el0", CNTPCT_EL0},
+  {"cntvct_el0", CNTVCT_EL0},
+
+  // Trace registers
+  {"trcstatr", TRCSTATR},
+  {"trcidr8", TRCIDR8},
+  {"trcidr9", TRCIDR9},
+  {"trcidr10", TRCIDR10},
+  {"trcidr11", TRCIDR11},
+  {"trcidr12", TRCIDR12},
+  {"trcidr13", TRCIDR13},
+  {"trcidr0", TRCIDR0},
+  {"trcidr1", TRCIDR1},
+  {"trcidr2", TRCIDR2},
+  {"trcidr3", TRCIDR3},
+  {"trcidr4", TRCIDR4},
+  {"trcidr5", TRCIDR5},
+  {"trcidr6", TRCIDR6},
+  {"trcidr7", TRCIDR7},
+  {"trcoslsr", TRCOSLSR},
+  {"trcpdsr", TRCPDSR},
+  {"trcdevaff0", TRCDEVAFF0},
+  {"trcdevaff1", TRCDEVAFF1},
+  {"trclsr", TRCLSR},
+  {"trcauthstatus", TRCAUTHSTATUS},
+  {"trcdevarch", TRCDEVARCH},
+  {"trcdevid", TRCDEVID},
+  {"trcdevtype", TRCDEVTYPE},
+  {"trcpidr4", TRCPIDR4},
+  {"trcpidr5", TRCPIDR5},
+  {"trcpidr6", TRCPIDR6},
+  {"trcpidr7", TRCPIDR7},
+  {"trcpidr0", TRCPIDR0},
+  {"trcpidr1", TRCPIDR1},
+  {"trcpidr2", TRCPIDR2},
+  {"trcpidr3", TRCPIDR3},
+  {"trccidr0", TRCCIDR0},
+  {"trccidr1", TRCCIDR1},
+  {"trccidr2", TRCCIDR2},
+  {"trccidr3", TRCCIDR3},
+
+  // GICv3 registers
+  {"icc_iar1_el1", ICC_IAR1_EL1},
+  {"icc_iar0_el1", ICC_IAR0_EL1},
+  {"icc_hppir1_el1", ICC_HPPIR1_EL1},
+  {"icc_hppir0_el1", ICC_HPPIR0_EL1},
+  {"icc_rpr_el1", ICC_RPR_EL1},
+  {"ich_vtr_el2", ICH_VTR_EL2},
+  {"ich_eisr_el2", ICH_EISR_EL2},
+  {"ich_elsr_el2", ICH_ELSR_EL2}
+};
+
+AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits)
+  : SysRegMapper(FeatureBits) {
+    InstPairs = &MRSPairs[0];
+    NumInstPairs = llvm::array_lengthof(MRSPairs);
+}
+
+const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = {
+  {"dbgdtrtx_el0", DBGDTRTX_EL0},
+  {"oslar_el1", OSLAR_EL1},
+  {"pmswinc_el0", PMSWINC_EL0},
+
+  // Trace registers
+  {"trcoslar", TRCOSLAR},
+  {"trclar", TRCLAR},
+
+  // GICv3 registers
+  {"icc_eoir1_el1", ICC_EOIR1_EL1},
+  {"icc_eoir0_el1", ICC_EOIR0_EL1},
+  {"icc_dir_el1", ICC_DIR_EL1},
+  {"icc_sgi1r_el1", ICC_SGI1R_EL1},
+  {"icc_asgi1r_el1", ICC_ASGI1R_EL1},
+  {"icc_sgi0r_el1", ICC_SGI0R_EL1}
+};
+
+AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits)
+  : SysRegMapper(FeatureBits) {
+    InstPairs = &MSRPairs[0];
+    NumInstPairs = llvm::array_lengthof(MSRPairs);
+}
+
+
+const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = {
+  {"osdtrrx_el1", OSDTRRX_EL1},
+  {"osdtrtx_el1",  OSDTRTX_EL1},
+  {"teecr32_el1", TEECR32_EL1},
+  {"mdccint_el1", MDCCINT_EL1},
+  {"mdscr_el1", MDSCR_EL1},
+  {"dbgdtr_el0", DBGDTR_EL0},
+  {"oseccr_el1", OSECCR_EL1},
+  {"dbgvcr32_el2", DBGVCR32_EL2},
+  {"dbgbvr0_el1", DBGBVR0_EL1},
+  {"dbgbvr1_el1", DBGBVR1_EL1},
+  {"dbgbvr2_el1", DBGBVR2_EL1},
+  {"dbgbvr3_el1", DBGBVR3_EL1},
+  {"dbgbvr4_el1", DBGBVR4_EL1},
+  {"dbgbvr5_el1", DBGBVR5_EL1},
+  {"dbgbvr6_el1", DBGBVR6_EL1},
+  {"dbgbvr7_el1", DBGBVR7_EL1},
+  {"dbgbvr8_el1", DBGBVR8_EL1},
+  {"dbgbvr9_el1", DBGBVR9_EL1},
+  {"dbgbvr10_el1", DBGBVR10_EL1},
+  {"dbgbvr11_el1", DBGBVR11_EL1},
+  {"dbgbvr12_el1", DBGBVR12_EL1},
+  {"dbgbvr13_el1", DBGBVR13_EL1},
+  {"dbgbvr14_el1", DBGBVR14_EL1},
+  {"dbgbvr15_el1", DBGBVR15_EL1},
+  {"dbgbcr0_el1", DBGBCR0_EL1},
+  {"dbgbcr1_el1", DBGBCR1_EL1},
+  {"dbgbcr2_el1", DBGBCR2_EL1},
+  {"dbgbcr3_el1", DBGBCR3_EL1},
+  {"dbgbcr4_el1", DBGBCR4_EL1},
+  {"dbgbcr5_el1", DBGBCR5_EL1},
+  {"dbgbcr6_el1", DBGBCR6_EL1},
+  {"dbgbcr7_el1", DBGBCR7_EL1},
+  {"dbgbcr8_el1", DBGBCR8_EL1},
+  {"dbgbcr9_el1", DBGBCR9_EL1},
+  {"dbgbcr10_el1", DBGBCR10_EL1},
+  {"dbgbcr11_el1", DBGBCR11_EL1},
+  {"dbgbcr12_el1", DBGBCR12_EL1},
+  {"dbgbcr13_el1", DBGBCR13_EL1},
+  {"dbgbcr14_el1", DBGBCR14_EL1},
+  {"dbgbcr15_el1", DBGBCR15_EL1},
+  {"dbgwvr0_el1", DBGWVR0_EL1},
+  {"dbgwvr1_el1", DBGWVR1_EL1},
+  {"dbgwvr2_el1", DBGWVR2_EL1},
+  {"dbgwvr3_el1", DBGWVR3_EL1},
+  {"dbgwvr4_el1", DBGWVR4_EL1},
+  {"dbgwvr5_el1", DBGWVR5_EL1},
+  {"dbgwvr6_el1", DBGWVR6_EL1},
+  {"dbgwvr7_el1", DBGWVR7_EL1},
+  {"dbgwvr8_el1", DBGWVR8_EL1},
+  {"dbgwvr9_el1", DBGWVR9_EL1},
+  {"dbgwvr10_el1", DBGWVR10_EL1},
+  {"dbgwvr11_el1", DBGWVR11_EL1},
+  {"dbgwvr12_el1", DBGWVR12_EL1},
+  {"dbgwvr13_el1", DBGWVR13_EL1},
+  {"dbgwvr14_el1", DBGWVR14_EL1},
+  {"dbgwvr15_el1", DBGWVR15_EL1},
+  {"dbgwcr0_el1", DBGWCR0_EL1},
+  {"dbgwcr1_el1", DBGWCR1_EL1},
+  {"dbgwcr2_el1", DBGWCR2_EL1},
+  {"dbgwcr3_el1", DBGWCR3_EL1},
+  {"dbgwcr4_el1", DBGWCR4_EL1},
+  {"dbgwcr5_el1", DBGWCR5_EL1},
+  {"dbgwcr6_el1", DBGWCR6_EL1},
+  {"dbgwcr7_el1", DBGWCR7_EL1},
+  {"dbgwcr8_el1", DBGWCR8_EL1},
+  {"dbgwcr9_el1", DBGWCR9_EL1},
+  {"dbgwcr10_el1", DBGWCR10_EL1},
+  {"dbgwcr11_el1", DBGWCR11_EL1},
+  {"dbgwcr12_el1", DBGWCR12_EL1},
+  {"dbgwcr13_el1", DBGWCR13_EL1},
+  {"dbgwcr14_el1", DBGWCR14_EL1},
+  {"dbgwcr15_el1", DBGWCR15_EL1},
+  {"teehbr32_el1", TEEHBR32_EL1},
+  {"osdlr_el1", OSDLR_EL1},
+  {"dbgprcr_el1", DBGPRCR_EL1},
+  {"dbgclaimset_el1", DBGCLAIMSET_EL1},
+  {"dbgclaimclr_el1", DBGCLAIMCLR_EL1},
+  {"csselr_el1", CSSELR_EL1},
+  {"vpidr_el2", VPIDR_EL2},
+  {"vmpidr_el2", VMPIDR_EL2},
+  {"sctlr_el1", SCTLR_EL1},
+  {"sctlr_el2", SCTLR_EL2},
+  {"sctlr_el3", SCTLR_EL3},
+  {"actlr_el1", ACTLR_EL1},
+  {"actlr_el2", ACTLR_EL2},
+  {"actlr_el3", ACTLR_EL3},
+  {"cpacr_el1", CPACR_EL1},
+  {"hcr_el2", HCR_EL2},
+  {"scr_el3", SCR_EL3},
+  {"mdcr_el2", MDCR_EL2},
+  {"sder32_el3", SDER32_EL3},
+  {"cptr_el2", CPTR_EL2},
+  {"cptr_el3", CPTR_EL3},
+  {"hstr_el2", HSTR_EL2},
+  {"hacr_el2", HACR_EL2},
+  {"mdcr_el3", MDCR_EL3},
+  {"ttbr0_el1", TTBR0_EL1},
+  {"ttbr0_el2", TTBR0_EL2},
+  {"ttbr0_el3", TTBR0_EL3},
+  {"ttbr1_el1", TTBR1_EL1},
+  {"tcr_el1", TCR_EL1},
+  {"tcr_el2", TCR_EL2},
+  {"tcr_el3", TCR_EL3},
+  {"vttbr_el2", VTTBR_EL2},
+  {"vtcr_el2", VTCR_EL2},
+  {"dacr32_el2", DACR32_EL2},
+  {"spsr_el1", SPSR_EL1},
+  {"spsr_el2", SPSR_EL2},
+  {"spsr_el3", SPSR_EL3},
+  {"elr_el1", ELR_EL1},
+  {"elr_el2", ELR_EL2},
+  {"elr_el3", ELR_EL3},
+  {"sp_el0", SP_EL0},
+  {"sp_el1", SP_EL1},
+  {"sp_el2", SP_EL2},
+  {"spsel", SPSel},
+  {"nzcv", NZCV},
+  {"daif", DAIF},
+  {"currentel", CurrentEL},
+  {"spsr_irq", SPSR_irq},
+  {"spsr_abt", SPSR_abt},
+  {"spsr_und", SPSR_und},
+  {"spsr_fiq", SPSR_fiq},
+  {"fpcr", FPCR},
+  {"fpsr", FPSR},
+  {"dspsr_el0", DSPSR_EL0},
+  {"dlr_el0", DLR_EL0},
+  {"ifsr32_el2", IFSR32_EL2},
+  {"afsr0_el1", AFSR0_EL1},
+  {"afsr0_el2", AFSR0_EL2},
+  {"afsr0_el3", AFSR0_EL3},
+  {"afsr1_el1", AFSR1_EL1},
+  {"afsr1_el2", AFSR1_EL2},
+  {"afsr1_el3", AFSR1_EL3},
+  {"esr_el1", ESR_EL1},
+  {"esr_el2", ESR_EL2},
+  {"esr_el3", ESR_EL3},
+  {"fpexc32_el2", FPEXC32_EL2},
+  {"far_el1", FAR_EL1},
+  {"far_el2", FAR_EL2},
+  {"far_el3", FAR_EL3},
+  {"hpfar_el2", HPFAR_EL2},
+  {"par_el1", PAR_EL1},
+  {"pmcr_el0", PMCR_EL0},
+  {"pmcntenset_el0", PMCNTENSET_EL0},
+  {"pmcntenclr_el0", PMCNTENCLR_EL0},
+  {"pmovsclr_el0", PMOVSCLR_EL0},
+  {"pmselr_el0", PMSELR_EL0},
+  {"pmccntr_el0", PMCCNTR_EL0},
+  {"pmxevtyper_el0", PMXEVTYPER_EL0},
+  {"pmxevcntr_el0", PMXEVCNTR_EL0},
+  {"pmuserenr_el0", PMUSERENR_EL0},
+  {"pmintenset_el1", PMINTENSET_EL1},
+  {"pmintenclr_el1", PMINTENCLR_EL1},
+  {"pmovsset_el0", PMOVSSET_EL0},
+  {"mair_el1", MAIR_EL1},
+  {"mair_el2", MAIR_EL2},
+  {"mair_el3", MAIR_EL3},
+  {"amair_el1", AMAIR_EL1},
+  {"amair_el2", AMAIR_EL2},
+  {"amair_el3", AMAIR_EL3},
+  {"vbar_el1", VBAR_EL1},
+  {"vbar_el2", VBAR_EL2},
+  {"vbar_el3", VBAR_EL3},
+  {"rmr_el1", RMR_EL1},
+  {"rmr_el2", RMR_EL2},
+  {"rmr_el3", RMR_EL3},
+  {"contextidr_el1", CONTEXTIDR_EL1},
+  {"tpidr_el0", TPIDR_EL0},
+  {"tpidr_el2", TPIDR_EL2},
+  {"tpidr_el3", TPIDR_EL3},
+  {"tpidrro_el0", TPIDRRO_EL0},
+  {"tpidr_el1", TPIDR_EL1},
+  {"cntfrq_el0", CNTFRQ_EL0},
+  {"cntvoff_el2", CNTVOFF_EL2},
+  {"cntkctl_el1", CNTKCTL_EL1},
+  {"cnthctl_el2", CNTHCTL_EL2},
+  {"cntp_tval_el0", CNTP_TVAL_EL0},
+  {"cnthp_tval_el2", CNTHP_TVAL_EL2},
+  {"cntps_tval_el1", CNTPS_TVAL_EL1},
+  {"cntp_ctl_el0", CNTP_CTL_EL0},
+  {"cnthp_ctl_el2", CNTHP_CTL_EL2},
+  {"cntps_ctl_el1", CNTPS_CTL_EL1},
+  {"cntp_cval_el0", CNTP_CVAL_EL0},
+  {"cnthp_cval_el2", CNTHP_CVAL_EL2},
+  {"cntps_cval_el1", CNTPS_CVAL_EL1},
+  {"cntv_tval_el0", CNTV_TVAL_EL0},
+  {"cntv_ctl_el0", CNTV_CTL_EL0},
+  {"cntv_cval_el0", CNTV_CVAL_EL0},
+  {"pmevcntr0_el0", PMEVCNTR0_EL0},
+  {"pmevcntr1_el0", PMEVCNTR1_EL0},
+  {"pmevcntr2_el0", PMEVCNTR2_EL0},
+  {"pmevcntr3_el0", PMEVCNTR3_EL0},
+  {"pmevcntr4_el0", PMEVCNTR4_EL0},
+  {"pmevcntr5_el0", PMEVCNTR5_EL0},
+  {"pmevcntr6_el0", PMEVCNTR6_EL0},
+  {"pmevcntr7_el0", PMEVCNTR7_EL0},
+  {"pmevcntr8_el0", PMEVCNTR8_EL0},
+  {"pmevcntr9_el0", PMEVCNTR9_EL0},
+  {"pmevcntr10_el0", PMEVCNTR10_EL0},
+  {"pmevcntr11_el0", PMEVCNTR11_EL0},
+  {"pmevcntr12_el0", PMEVCNTR12_EL0},
+  {"pmevcntr13_el0", PMEVCNTR13_EL0},
+  {"pmevcntr14_el0", PMEVCNTR14_EL0},
+  {"pmevcntr15_el0", PMEVCNTR15_EL0},
+  {"pmevcntr16_el0", PMEVCNTR16_EL0},
+  {"pmevcntr17_el0", PMEVCNTR17_EL0},
+  {"pmevcntr18_el0", PMEVCNTR18_EL0},
+  {"pmevcntr19_el0", PMEVCNTR19_EL0},
+  {"pmevcntr20_el0", PMEVCNTR20_EL0},
+  {"pmevcntr21_el0", PMEVCNTR21_EL0},
+  {"pmevcntr22_el0", PMEVCNTR22_EL0},
+  {"pmevcntr23_el0", PMEVCNTR23_EL0},
+  {"pmevcntr24_el0", PMEVCNTR24_EL0},
+  {"pmevcntr25_el0", PMEVCNTR25_EL0},
+  {"pmevcntr26_el0", PMEVCNTR26_EL0},
+  {"pmevcntr27_el0", PMEVCNTR27_EL0},
+  {"pmevcntr28_el0", PMEVCNTR28_EL0},
+  {"pmevcntr29_el0", PMEVCNTR29_EL0},
+  {"pmevcntr30_el0", PMEVCNTR30_EL0},
+  {"pmccfiltr_el0", PMCCFILTR_EL0},
+  {"pmevtyper0_el0", PMEVTYPER0_EL0},
+  {"pmevtyper1_el0", PMEVTYPER1_EL0},
+  {"pmevtyper2_el0", PMEVTYPER2_EL0},
+  {"pmevtyper3_el0", PMEVTYPER3_EL0},
+  {"pmevtyper4_el0", PMEVTYPER4_EL0},
+  {"pmevtyper5_el0", PMEVTYPER5_EL0},
+  {"pmevtyper6_el0", PMEVTYPER6_EL0},
+  {"pmevtyper7_el0", PMEVTYPER7_EL0},
+  {"pmevtyper8_el0", PMEVTYPER8_EL0},
+  {"pmevtyper9_el0", PMEVTYPER9_EL0},
+  {"pmevtyper10_el0", PMEVTYPER10_EL0},
+  {"pmevtyper11_el0", PMEVTYPER11_EL0},
+  {"pmevtyper12_el0", PMEVTYPER12_EL0},
+  {"pmevtyper13_el0", PMEVTYPER13_EL0},
+  {"pmevtyper14_el0", PMEVTYPER14_EL0},
+  {"pmevtyper15_el0", PMEVTYPER15_EL0},
+  {"pmevtyper16_el0", PMEVTYPER16_EL0},
+  {"pmevtyper17_el0", PMEVTYPER17_EL0},
+  {"pmevtyper18_el0", PMEVTYPER18_EL0},
+  {"pmevtyper19_el0", PMEVTYPER19_EL0},
+  {"pmevtyper20_el0", PMEVTYPER20_EL0},
+  {"pmevtyper21_el0", PMEVTYPER21_EL0},
+  {"pmevtyper22_el0", PMEVTYPER22_EL0},
+  {"pmevtyper23_el0", PMEVTYPER23_EL0},
+  {"pmevtyper24_el0", PMEVTYPER24_EL0},
+  {"pmevtyper25_el0", PMEVTYPER25_EL0},
+  {"pmevtyper26_el0", PMEVTYPER26_EL0},
+  {"pmevtyper27_el0", PMEVTYPER27_EL0},
+  {"pmevtyper28_el0", PMEVTYPER28_EL0},
+  {"pmevtyper29_el0", PMEVTYPER29_EL0},
+  {"pmevtyper30_el0", PMEVTYPER30_EL0},
+
+  // Trace registers
+  {"trcprgctlr", TRCPRGCTLR},
+  {"trcprocselr", TRCPROCSELR},
+  {"trcconfigr", TRCCONFIGR},
+  {"trcauxctlr", TRCAUXCTLR},
+  {"trceventctl0r", TRCEVENTCTL0R},
+  {"trceventctl1r", TRCEVENTCTL1R},
+  {"trcstallctlr", TRCSTALLCTLR},
+  {"trctsctlr", TRCTSCTLR},
+  {"trcsyncpr", TRCSYNCPR},
+  {"trcccctlr", TRCCCCTLR},
+  {"trcbbctlr", TRCBBCTLR},
+  {"trctraceidr", TRCTRACEIDR},
+  {"trcqctlr", TRCQCTLR},
+  {"trcvictlr", TRCVICTLR},
+  {"trcviiectlr", TRCVIIECTLR},
+  {"trcvissctlr", TRCVISSCTLR},
+  {"trcvipcssctlr", TRCVIPCSSCTLR},
+  {"trcvdctlr", TRCVDCTLR},
+  {"trcvdsacctlr", TRCVDSACCTLR},
+  {"trcvdarcctlr", TRCVDARCCTLR},
+  {"trcseqevr0", TRCSEQEVR0},
+  {"trcseqevr1", TRCSEQEVR1},
+  {"trcseqevr2", TRCSEQEVR2},
+  {"trcseqrstevr", TRCSEQRSTEVR},
+  {"trcseqstr", TRCSEQSTR},
+  {"trcextinselr", TRCEXTINSELR},
+  {"trccntrldvr0", TRCCNTRLDVR0},
+  {"trccntrldvr1", TRCCNTRLDVR1},
+  {"trccntrldvr2", TRCCNTRLDVR2},
+  {"trccntrldvr3", TRCCNTRLDVR3},
+  {"trccntctlr0", TRCCNTCTLR0},
+  {"trccntctlr1", TRCCNTCTLR1},
+  {"trccntctlr2", TRCCNTCTLR2},
+  {"trccntctlr3", TRCCNTCTLR3},
+  {"trccntvr0", TRCCNTVR0},
+  {"trccntvr1", TRCCNTVR1},
+  {"trccntvr2", TRCCNTVR2},
+  {"trccntvr3", TRCCNTVR3},
+  {"trcimspec0", TRCIMSPEC0},
+  {"trcimspec1", TRCIMSPEC1},
+  {"trcimspec2", TRCIMSPEC2},
+  {"trcimspec3", TRCIMSPEC3},
+  {"trcimspec4", TRCIMSPEC4},
+  {"trcimspec5", TRCIMSPEC5},
+  {"trcimspec6", TRCIMSPEC6},
+  {"trcimspec7", TRCIMSPEC7},
+  {"trcrsctlr2", TRCRSCTLR2},
+  {"trcrsctlr3", TRCRSCTLR3},
+  {"trcrsctlr4", TRCRSCTLR4},
+  {"trcrsctlr5", TRCRSCTLR5},
+  {"trcrsctlr6", TRCRSCTLR6},
+  {"trcrsctlr7", TRCRSCTLR7},
+  {"trcrsctlr8", TRCRSCTLR8},
+  {"trcrsctlr9", TRCRSCTLR9},
+  {"trcrsctlr10", TRCRSCTLR10},
+  {"trcrsctlr11", TRCRSCTLR11},
+  {"trcrsctlr12", TRCRSCTLR12},
+  {"trcrsctlr13", TRCRSCTLR13},
+  {"trcrsctlr14", TRCRSCTLR14},
+  {"trcrsctlr15", TRCRSCTLR15},
+  {"trcrsctlr16", TRCRSCTLR16},
+  {"trcrsctlr17", TRCRSCTLR17},
+  {"trcrsctlr18", TRCRSCTLR18},
+  {"trcrsctlr19", TRCRSCTLR19},
+  {"trcrsctlr20", TRCRSCTLR20},
+  {"trcrsctlr21", TRCRSCTLR21},
+  {"trcrsctlr22", TRCRSCTLR22},
+  {"trcrsctlr23", TRCRSCTLR23},
+  {"trcrsctlr24", TRCRSCTLR24},
+  {"trcrsctlr25", TRCRSCTLR25},
+  {"trcrsctlr26", TRCRSCTLR26},
+  {"trcrsctlr27", TRCRSCTLR27},
+  {"trcrsctlr28", TRCRSCTLR28},
+  {"trcrsctlr29", TRCRSCTLR29},
+  {"trcrsctlr30", TRCRSCTLR30},
+  {"trcrsctlr31", TRCRSCTLR31},
+  {"trcssccr0", TRCSSCCR0},
+  {"trcssccr1", TRCSSCCR1},
+  {"trcssccr2", TRCSSCCR2},
+  {"trcssccr3", TRCSSCCR3},
+  {"trcssccr4", TRCSSCCR4},
+  {"trcssccr5", TRCSSCCR5},
+  {"trcssccr6", TRCSSCCR6},
+  {"trcssccr7", TRCSSCCR7},
+  {"trcsscsr0", TRCSSCSR0},
+  {"trcsscsr1", TRCSSCSR1},
+  {"trcsscsr2", TRCSSCSR2},
+  {"trcsscsr3", TRCSSCSR3},
+  {"trcsscsr4", TRCSSCSR4},
+  {"trcsscsr5", TRCSSCSR5},
+  {"trcsscsr6", TRCSSCSR6},
+  {"trcsscsr7", TRCSSCSR7},
+  {"trcsspcicr0", TRCSSPCICR0},
+  {"trcsspcicr1", TRCSSPCICR1},
+  {"trcsspcicr2", TRCSSPCICR2},
+  {"trcsspcicr3", TRCSSPCICR3},
+  {"trcsspcicr4", TRCSSPCICR4},
+  {"trcsspcicr5", TRCSSPCICR5},
+  {"trcsspcicr6", TRCSSPCICR6},
+  {"trcsspcicr7", TRCSSPCICR7},
+  {"trcpdcr", TRCPDCR},
+  {"trcacvr0", TRCACVR0},
+  {"trcacvr1", TRCACVR1},
+  {"trcacvr2", TRCACVR2},
+  {"trcacvr3", TRCACVR3},
+  {"trcacvr4", TRCACVR4},
+  {"trcacvr5", TRCACVR5},
+  {"trcacvr6", TRCACVR6},
+  {"trcacvr7", TRCACVR7},
+  {"trcacvr8", TRCACVR8},
+  {"trcacvr9", TRCACVR9},
+  {"trcacvr10", TRCACVR10},
+  {"trcacvr11", TRCACVR11},
+  {"trcacvr12", TRCACVR12},
+  {"trcacvr13", TRCACVR13},
+  {"trcacvr14", TRCACVR14},
+  {"trcacvr15", TRCACVR15},
+  {"trcacatr0", TRCACATR0},
+  {"trcacatr1", TRCACATR1},
+  {"trcacatr2", TRCACATR2},
+  {"trcacatr3", TRCACATR3},
+  {"trcacatr4", TRCACATR4},
+  {"trcacatr5", TRCACATR5},
+  {"trcacatr6", TRCACATR6},
+  {"trcacatr7", TRCACATR7},
+  {"trcacatr8", TRCACATR8},
+  {"trcacatr9", TRCACATR9},
+  {"trcacatr10", TRCACATR10},
+  {"trcacatr11", TRCACATR11},
+  {"trcacatr12", TRCACATR12},
+  {"trcacatr13", TRCACATR13},
+  {"trcacatr14", TRCACATR14},
+  {"trcacatr15", TRCACATR15},
+  {"trcdvcvr0", TRCDVCVR0},
+  {"trcdvcvr1", TRCDVCVR1},
+  {"trcdvcvr2", TRCDVCVR2},
+  {"trcdvcvr3", TRCDVCVR3},
+  {"trcdvcvr4", TRCDVCVR4},
+  {"trcdvcvr5", TRCDVCVR5},
+  {"trcdvcvr6", TRCDVCVR6},
+  {"trcdvcvr7", TRCDVCVR7},
+  {"trcdvcmr0", TRCDVCMR0},
+  {"trcdvcmr1", TRCDVCMR1},
+  {"trcdvcmr2", TRCDVCMR2},
+  {"trcdvcmr3", TRCDVCMR3},
+  {"trcdvcmr4", TRCDVCMR4},
+  {"trcdvcmr5", TRCDVCMR5},
+  {"trcdvcmr6", TRCDVCMR6},
+  {"trcdvcmr7", TRCDVCMR7},
+  {"trccidcvr0", TRCCIDCVR0},
+  {"trccidcvr1", TRCCIDCVR1},
+  {"trccidcvr2", TRCCIDCVR2},
+  {"trccidcvr3", TRCCIDCVR3},
+  {"trccidcvr4", TRCCIDCVR4},
+  {"trccidcvr5", TRCCIDCVR5},
+  {"trccidcvr6", TRCCIDCVR6},
+  {"trccidcvr7", TRCCIDCVR7},
+  {"trcvmidcvr0", TRCVMIDCVR0},
+  {"trcvmidcvr1", TRCVMIDCVR1},
+  {"trcvmidcvr2", TRCVMIDCVR2},
+  {"trcvmidcvr3", TRCVMIDCVR3},
+  {"trcvmidcvr4", TRCVMIDCVR4},
+  {"trcvmidcvr5", TRCVMIDCVR5},
+  {"trcvmidcvr6", TRCVMIDCVR6},
+  {"trcvmidcvr7", TRCVMIDCVR7},
+  {"trccidcctlr0", TRCCIDCCTLR0},
+  {"trccidcctlr1", TRCCIDCCTLR1},
+  {"trcvmidcctlr0", TRCVMIDCCTLR0},
+  {"trcvmidcctlr1", TRCVMIDCCTLR1},
+  {"trcitctrl", TRCITCTRL},
+  {"trcclaimset", TRCCLAIMSET},
+  {"trcclaimclr", TRCCLAIMCLR},
+
+  // GICv3 registers
+  {"icc_bpr1_el1", ICC_BPR1_EL1},
+  {"icc_bpr0_el1", ICC_BPR0_EL1},
+  {"icc_pmr_el1", ICC_PMR_EL1},
+  {"icc_ctlr_el1", ICC_CTLR_EL1},
+  {"icc_ctlr_el3", ICC_CTLR_EL3},
+  {"icc_sre_el1", ICC_SRE_EL1},
+  {"icc_sre_el2", ICC_SRE_EL2},
+  {"icc_sre_el3", ICC_SRE_EL3},
+  {"icc_igrpen0_el1", ICC_IGRPEN0_EL1},
+  {"icc_igrpen1_el1", ICC_IGRPEN1_EL1},
+  {"icc_igrpen1_el3", ICC_IGRPEN1_EL3},
+  {"icc_seien_el1", ICC_SEIEN_EL1},
+  {"icc_ap0r0_el1", ICC_AP0R0_EL1},
+  {"icc_ap0r1_el1", ICC_AP0R1_EL1},
+  {"icc_ap0r2_el1", ICC_AP0R2_EL1},
+  {"icc_ap0r3_el1", ICC_AP0R3_EL1},
+  {"icc_ap1r0_el1", ICC_AP1R0_EL1},
+  {"icc_ap1r1_el1", ICC_AP1R1_EL1},
+  {"icc_ap1r2_el1", ICC_AP1R2_EL1},
+  {"icc_ap1r3_el1", ICC_AP1R3_EL1},
+  {"ich_ap0r0_el2", ICH_AP0R0_EL2},
+  {"ich_ap0r1_el2", ICH_AP0R1_EL2},
+  {"ich_ap0r2_el2", ICH_AP0R2_EL2},
+  {"ich_ap0r3_el2", ICH_AP0R3_EL2},
+  {"ich_ap1r0_el2", ICH_AP1R0_EL2},
+  {"ich_ap1r1_el2", ICH_AP1R1_EL2},
+  {"ich_ap1r2_el2", ICH_AP1R2_EL2},
+  {"ich_ap1r3_el2", ICH_AP1R3_EL2},
+  {"ich_hcr_el2", ICH_HCR_EL2},
+  {"ich_misr_el2", ICH_MISR_EL2},
+  {"ich_vmcr_el2", ICH_VMCR_EL2},
+  {"ich_vseir_el2", ICH_VSEIR_EL2},
+  {"ich_lr0_el2", ICH_LR0_EL2},
+  {"ich_lr1_el2", ICH_LR1_EL2},
+  {"ich_lr2_el2", ICH_LR2_EL2},
+  {"ich_lr3_el2", ICH_LR3_EL2},
+  {"ich_lr4_el2", ICH_LR4_EL2},
+  {"ich_lr5_el2", ICH_LR5_EL2},
+  {"ich_lr6_el2", ICH_LR6_EL2},
+  {"ich_lr7_el2", ICH_LR7_EL2},
+  {"ich_lr8_el2", ICH_LR8_EL2},
+  {"ich_lr9_el2", ICH_LR9_EL2},
+  {"ich_lr10_el2", ICH_LR10_EL2},
+  {"ich_lr11_el2", ICH_LR11_EL2},
+  {"ich_lr12_el2", ICH_LR12_EL2},
+  {"ich_lr13_el2", ICH_LR13_EL2},
+  {"ich_lr14_el2", ICH_LR14_EL2},
+  {"ich_lr15_el2", ICH_LR15_EL2}
+};
+
+const AArch64NamedImmMapper::Mapping
+AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = {
+  {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3}
+};
+
+uint32_t
+AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
+  std::string NameLower = Name.lower();
+
+  // First search the registers shared by all
+  for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
+    if (SysRegPairs[i].Name == NameLower) {
+      Valid = true;
+      return SysRegPairs[i].Value;
+    }
+  }
+
+  // Next search for target specific registers
+  if (FeatureBits & AArch64::ProcCyclone) {
+    for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
+      if (CycloneSysRegPairs[i].Name == NameLower) {
+        Valid = true;
+        return CycloneSysRegPairs[i].Value;
+      }
+    }
+  }
+
+  // Now try the instruction-specific registers (either read-only or
+  // write-only).
+  for (unsigned i = 0; i < NumInstPairs; ++i) {
+    if (InstPairs[i].Name == NameLower) {
+      Valid = true;
+      return InstPairs[i].Value;
+    }
+  }
+
+  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the bits
+  // are: 11 xxx 1x11 xxxx xxx
+  Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$");
+
+  SmallVector<StringRef, 4> Ops;
+  if (!GenericRegPattern.match(NameLower, &Ops)) {
+    Valid = false;
+    return -1;
+  }
+
+  uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+  uint32_t Bits;
+  Ops[1].getAsInteger(10, Op1);
+  Ops[2].getAsInteger(10, CRn);
+  Ops[3].getAsInteger(10, CRm);
+  Ops[4].getAsInteger(10, Op2);
+  Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
+
+  Valid = true;
+  return Bits;
+}
+
+std::string
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+  // First search the registers shared by all
+  for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
+    if (SysRegPairs[i].Value == Bits) {
+      Valid = true;
+      return SysRegPairs[i].Name;
+    }
+  }
+
+  // Next search for target specific registers
+  if (FeatureBits & AArch64::ProcCyclone) {
+    for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
+      if (CycloneSysRegPairs[i].Value == Bits) {
+        Valid = true;
+        return CycloneSysRegPairs[i].Name;
+      }
+    }
+  }
+
+  // Now try the instruction-specific registers (either read-only or
+  // write-only).
+  for (unsigned i = 0; i < NumInstPairs; ++i) {
+    if (InstPairs[i].Value == Bits) {
+      Valid = true;
+      return InstPairs[i].Name;
+    }
+  }
+
+  uint32_t Op0 = (Bits >> 14) & 0x3;
+  uint32_t Op1 = (Bits >> 11) & 0x7;
+  uint32_t CRn = (Bits >> 7) & 0xf;
+  uint32_t CRm = (Bits >> 3) & 0xf;
+  uint32_t Op2 = Bits & 0x7;
+
+  // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic
+  // name.
+  if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
+      Valid = false;
+      return "";
+  }
+
+  assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
+
+  Valid = true;
+  return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+               + "_c" + utostr(CRm) + "_" + utostr(Op2);
+}
+
+const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = {
+  {"ipas2e1is", IPAS2E1IS},
+  {"ipas2le1is", IPAS2LE1IS},
+  {"vmalle1is", VMALLE1IS},
+  {"alle2is", ALLE2IS},
+  {"alle3is", ALLE3IS},
+  {"vae1is", VAE1IS},
+  {"vae2is", VAE2IS},
+  {"vae3is", VAE3IS},
+  {"aside1is", ASIDE1IS},
+  {"vaae1is", VAAE1IS},
+  {"alle1is", ALLE1IS},
+  {"vale1is", VALE1IS},
+  {"vale2is", VALE2IS},
+  {"vale3is", VALE3IS},
+  {"vmalls12e1is", VMALLS12E1IS},
+  {"vaale1is", VAALE1IS},
+  {"ipas2e1", IPAS2E1},
+  {"ipas2le1", IPAS2LE1},
+  {"vmalle1", VMALLE1},
+  {"alle2", ALLE2},
+  {"alle3", ALLE3},
+  {"vae1", VAE1},
+  {"vae2", VAE2},
+  {"vae3", VAE3},
+  {"aside1", ASIDE1},
+  {"vaae1", VAAE1},
+  {"alle1", ALLE1},
+  {"vale1", VALE1},
+  {"vale2", VALE2},
+  {"vale3", VALE3},
+  {"vmalls12e1", VMALLS12E1},
+  {"vaale1", VAALE1}
+};
+
+AArch64TLBI::TLBIMapper::TLBIMapper()
+  : AArch64NamedImmMapper(TLBIPairs, 0) {}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
new file mode 100644
index 0000000..9d2ce21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -0,0 +1,1280 @@
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the AArch64 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64BASEINFO_H
+#define AArch64BASEINFO_H
+
+// FIXME: Is it easiest to fix this layering violation by moving the .inc
+// #includes from AArch64MCTargetDesc.h to here?
+#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends.
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+inline static unsigned getWRegFromXReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::X0: return AArch64::W0;
+  case AArch64::X1: return AArch64::W1;
+  case AArch64::X2: return AArch64::W2;
+  case AArch64::X3: return AArch64::W3;
+  case AArch64::X4: return AArch64::W4;
+  case AArch64::X5: return AArch64::W5;
+  case AArch64::X6: return AArch64::W6;
+  case AArch64::X7: return AArch64::W7;
+  case AArch64::X8: return AArch64::W8;
+  case AArch64::X9: return AArch64::W9;
+  case AArch64::X10: return AArch64::W10;
+  case AArch64::X11: return AArch64::W11;
+  case AArch64::X12: return AArch64::W12;
+  case AArch64::X13: return AArch64::W13;
+  case AArch64::X14: return AArch64::W14;
+  case AArch64::X15: return AArch64::W15;
+  case AArch64::X16: return AArch64::W16;
+  case AArch64::X17: return AArch64::W17;
+  case AArch64::X18: return AArch64::W18;
+  case AArch64::X19: return AArch64::W19;
+  case AArch64::X20: return AArch64::W20;
+  case AArch64::X21: return AArch64::W21;
+  case AArch64::X22: return AArch64::W22;
+  case AArch64::X23: return AArch64::W23;
+  case AArch64::X24: return AArch64::W24;
+  case AArch64::X25: return AArch64::W25;
+  case AArch64::X26: return AArch64::W26;
+  case AArch64::X27: return AArch64::W27;
+  case AArch64::X28: return AArch64::W28;
+  case AArch64::FP: return AArch64::W29;
+  case AArch64::LR: return AArch64::W30;
+  case AArch64::SP: return AArch64::WSP;
+  case AArch64::XZR: return AArch64::WZR;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+inline static unsigned getXRegFromWReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::W0: return AArch64::X0;
+  case AArch64::W1: return AArch64::X1;
+  case AArch64::W2: return AArch64::X2;
+  case AArch64::W3: return AArch64::X3;
+  case AArch64::W4: return AArch64::X4;
+  case AArch64::W5: return AArch64::X5;
+  case AArch64::W6: return AArch64::X6;
+  case AArch64::W7: return AArch64::X7;
+  case AArch64::W8: return AArch64::X8;
+  case AArch64::W9: return AArch64::X9;
+  case AArch64::W10: return AArch64::X10;
+  case AArch64::W11: return AArch64::X11;
+  case AArch64::W12: return AArch64::X12;
+  case AArch64::W13: return AArch64::X13;
+  case AArch64::W14: return AArch64::X14;
+  case AArch64::W15: return AArch64::X15;
+  case AArch64::W16: return AArch64::X16;
+  case AArch64::W17: return AArch64::X17;
+  case AArch64::W18: return AArch64::X18;
+  case AArch64::W19: return AArch64::X19;
+  case AArch64::W20: return AArch64::X20;
+  case AArch64::W21: return AArch64::X21;
+  case AArch64::W22: return AArch64::X22;
+  case AArch64::W23: return AArch64::X23;
+  case AArch64::W24: return AArch64::X24;
+  case AArch64::W25: return AArch64::X25;
+  case AArch64::W26: return AArch64::X26;
+  case AArch64::W27: return AArch64::X27;
+  case AArch64::W28: return AArch64::X28;
+  case AArch64::W29: return AArch64::FP;
+  case AArch64::W30: return AArch64::LR;
+  case AArch64::WSP: return AArch64::SP;
+  case AArch64::WZR: return AArch64::XZR;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+static inline unsigned getBRegFromDReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::D0:  return AArch64::B0;
+  case AArch64::D1:  return AArch64::B1;
+  case AArch64::D2:  return AArch64::B2;
+  case AArch64::D3:  return AArch64::B3;
+  case AArch64::D4:  return AArch64::B4;
+  case AArch64::D5:  return AArch64::B5;
+  case AArch64::D6:  return AArch64::B6;
+  case AArch64::D7:  return AArch64::B7;
+  case AArch64::D8:  return AArch64::B8;
+  case AArch64::D9:  return AArch64::B9;
+  case AArch64::D10: return AArch64::B10;
+  case AArch64::D11: return AArch64::B11;
+  case AArch64::D12: return AArch64::B12;
+  case AArch64::D13: return AArch64::B13;
+  case AArch64::D14: return AArch64::B14;
+  case AArch64::D15: return AArch64::B15;
+  case AArch64::D16: return AArch64::B16;
+  case AArch64::D17: return AArch64::B17;
+  case AArch64::D18: return AArch64::B18;
+  case AArch64::D19: return AArch64::B19;
+  case AArch64::D20: return AArch64::B20;
+  case AArch64::D21: return AArch64::B21;
+  case AArch64::D22: return AArch64::B22;
+  case AArch64::D23: return AArch64::B23;
+  case AArch64::D24: return AArch64::B24;
+  case AArch64::D25: return AArch64::B25;
+  case AArch64::D26: return AArch64::B26;
+  case AArch64::D27: return AArch64::B27;
+  case AArch64::D28: return AArch64::B28;
+  case AArch64::D29: return AArch64::B29;
+  case AArch64::D30: return AArch64::B30;
+  case AArch64::D31: return AArch64::B31;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+
+static inline unsigned getDRegFromBReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::B0:  return AArch64::D0;
+  case AArch64::B1:  return AArch64::D1;
+  case AArch64::B2:  return AArch64::D2;
+  case AArch64::B3:  return AArch64::D3;
+  case AArch64::B4:  return AArch64::D4;
+  case AArch64::B5:  return AArch64::D5;
+  case AArch64::B6:  return AArch64::D6;
+  case AArch64::B7:  return AArch64::D7;
+  case AArch64::B8:  return AArch64::D8;
+  case AArch64::B9:  return AArch64::D9;
+  case AArch64::B10: return AArch64::D10;
+  case AArch64::B11: return AArch64::D11;
+  case AArch64::B12: return AArch64::D12;
+  case AArch64::B13: return AArch64::D13;
+  case AArch64::B14: return AArch64::D14;
+  case AArch64::B15: return AArch64::D15;
+  case AArch64::B16: return AArch64::D16;
+  case AArch64::B17: return AArch64::D17;
+  case AArch64::B18: return AArch64::D18;
+  case AArch64::B19: return AArch64::D19;
+  case AArch64::B20: return AArch64::D20;
+  case AArch64::B21: return AArch64::D21;
+  case AArch64::B22: return AArch64::D22;
+  case AArch64::B23: return AArch64::D23;
+  case AArch64::B24: return AArch64::D24;
+  case AArch64::B25: return AArch64::D25;
+  case AArch64::B26: return AArch64::D26;
+  case AArch64::B27: return AArch64::D27;
+  case AArch64::B28: return AArch64::D28;
+  case AArch64::B29: return AArch64::D29;
+  case AArch64::B30: return AArch64::D30;
+  case AArch64::B31: return AArch64::D31;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+namespace AArch64CC {
+
+// The CondCodes constants map directly to the 4-bit encoding of the condition
+// field for predicated instructions.
+enum CondCode {  // Meaning (integer)          Meaning (floating-point)
+  EQ = 0x0,      // Equal                      Equal
+  NE = 0x1,      // Not equal                  Not equal, or unordered
+  HS = 0x2,      // Unsigned higher or same    >, ==, or unordered
+  LO = 0x3,      // Unsigned lower             Less than
+  MI = 0x4,      // Minus, negative            Less than
+  PL = 0x5,      // Plus, positive or zero     >, ==, or unordered
+  VS = 0x6,      // Overflow                   Unordered
+  VC = 0x7,      // No overflow                Not unordered
+  HI = 0x8,      // Unsigned higher            Greater than, or unordered
+  LS = 0x9,      // Unsigned lower or same     Less than or equal
+  GE = 0xa,      // Greater than or equal      Greater than or equal
+  LT = 0xb,      // Less than                  Less than, or unordered
+  GT = 0xc,      // Greater than               Greater than
+  LE = 0xd,      // Less than or equal         <, ==, or unordered
+  AL = 0xe,      // Always (unconditional)     Always (unconditional)
+  NV = 0xf,      // Always (unconditional)     Always (unconditional)
+  // Note the NV exists purely to disassemble 0b1111. Execution is "always".
+  Invalid
+};
+
+inline static const char *getCondCodeName(CondCode Code) {
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ:  return "eq";
+  case NE:  return "ne";
+  case HS:  return "hs";
+  case LO:  return "lo";
+  case MI:  return "mi";
+  case PL:  return "pl";
+  case VS:  return "vs";
+  case VC:  return "vc";
+  case HI:  return "hi";
+  case LS:  return "ls";
+  case GE:  return "ge";
+  case LT:  return "lt";
+  case GT:  return "gt";
+  case LE:  return "le";
+  case AL:  return "al";
+  case NV:  return "nv";
+  }
+}
+
+inline static CondCode getInvertedCondCode(CondCode Code) {
+  // To reverse a condition it's necessary to only invert the low bit:
+
+  return static_cast<CondCode>(static_cast<unsigned>(Code) ^ 0x1);
+}
+
+/// Given a condition code, return NZCV flags that would satisfy that condition.
+/// The flag bits are in the format expected by the ccmp instructions.
+/// Note that many different flag settings can satisfy a given condition code,
+/// this function just returns one of them.
+inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
+  // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
+  enum { N = 8, Z = 4, C = 2, V = 1 };
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ: return Z; // Z == 1
+  case NE: return 0; // Z == 0
+  case HS: return C; // C == 1
+  case LO: return 0; // C == 0
+  case MI: return N; // N == 1
+  case PL: return 0; // N == 0
+  case VS: return V; // V == 1
+  case VC: return 0; // V == 0
+  case HI: return C; // C == 1 && Z == 0
+  case LS: return 0; // C == 0 || Z == 1
+  case GE: return 0; // N == V
+  case LT: return N; // N != V
+  case GT: return 0; // Z == 0 && N == V
+  case LE: return Z; // Z == 1 || N != V
+  }
+}
+} // end namespace AArch64CC
+
+/// Instances of this class can perform bidirectional mapping from random
+/// identifier strings to operand encodings. For example "MSR" takes a named
+/// system-register which must be encoded somehow and decoded for printing. This
+/// central location means that the information for those transformations is not
+/// duplicated and remains in sync.
+///
+/// FIXME: currently the algorithm is a completely unoptimised linear
+/// search. Obviously this could be improved, but we would probably want to work
+/// out just how often these instructions are emitted before working on it. It
+/// might even be optimal to just reorder the tables for the common instructions
+/// rather than changing the algorithm.
+struct AArch64NamedImmMapper {
+  struct Mapping {
+    const char *Name;
+    uint32_t Value;
+  };
+
+  template<int N>
+  AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
+    : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {}
+
+  StringRef toString(uint32_t Value, bool &Valid) const;
+  uint32_t fromString(StringRef Name, bool &Valid) const;
+
+  /// Many of the instructions allow an alternative assembly form consisting of
+  /// a simple immediate. Currently the only valid forms are ranges [0, N) where
+  /// N being 0 indicates no immediate syntax-form is allowed.
+  bool validImm(uint32_t Value) const;
+protected:
+  const Mapping *Pairs;
+  size_t NumPairs;
+  uint32_t TooBigImm;
+};
+
+namespace AArch64AT {
+  enum ATValues {
+    Invalid = -1,    // Op0 Op1  CRn   CRm   Op2
+    S1E1R = 0x43c0,  // 01  000  0111  1000  000
+    S1E2R = 0x63c0,  // 01  100  0111  1000  000
+    S1E3R = 0x73c0,  // 01  110  0111  1000  000
+    S1E1W = 0x43c1,  // 01  000  0111  1000  001
+    S1E2W = 0x63c1,  // 01  100  0111  1000  001
+    S1E3W = 0x73c1,  // 01  110  0111  1000  001
+    S1E0R = 0x43c2,  // 01  000  0111  1000  010
+    S1E0W = 0x43c3,  // 01  000  0111  1000  011
+    S12E1R = 0x63c4, // 01  100  0111  1000  100
+    S12E1W = 0x63c5, // 01  100  0111  1000  101
+    S12E0R = 0x63c6, // 01  100  0111  1000  110
+    S12E0W = 0x63c7  // 01  100  0111  1000  111
+  };
+
+  struct ATMapper : AArch64NamedImmMapper {
+    const static Mapping ATPairs[];
+
+    ATMapper();
+  };
+
+}
+namespace AArch64DB {
+  enum DBValues {
+    Invalid = -1,
+    OSHLD = 0x1,
+    OSHST = 0x2,
+    OSH =   0x3,
+    NSHLD = 0x5,
+    NSHST = 0x6,
+    NSH =   0x7,
+    ISHLD = 0x9,
+    ISHST = 0xa,
+    ISH =   0xb,
+    LD =    0xd,
+    ST =    0xe,
+    SY =    0xf
+  };
+
+  struct DBarrierMapper : AArch64NamedImmMapper {
+    const static Mapping DBarrierPairs[];
+
+    DBarrierMapper();
+  };
+}
+
+namespace  AArch64DC {
+  enum DCValues {
+    Invalid = -1,   // Op1  CRn   CRm   Op2
+    ZVA   = 0x5ba1, // 01  011  0111  0100  001
+    IVAC  = 0x43b1, // 01  000  0111  0110  001
+    ISW   = 0x43b2, // 01  000  0111  0110  010
+    CVAC  = 0x5bd1, // 01  011  0111  1010  001
+    CSW   = 0x43d2, // 01  000  0111  1010  010
+    CVAU  = 0x5bd9, // 01  011  0111  1011  001
+    CIVAC = 0x5bf1, // 01  011  0111  1110  001
+    CISW  = 0x43f2  // 01  000  0111  1110  010
+  };
+
+  struct DCMapper : AArch64NamedImmMapper {
+    const static Mapping DCPairs[];
+
+    DCMapper();
+  };
+
+}
+
+namespace  AArch64IC {
+  enum ICValues {
+    Invalid = -1,     // Op1  CRn   CRm   Op2
+    IALLUIS = 0x0388, // 000  0111  0001  000
+    IALLU = 0x03a8,   // 000  0111  0101  000
+    IVAU = 0x1ba9     // 011  0111  0101  001
+  };
+
+
+  struct ICMapper : AArch64NamedImmMapper {
+    const static Mapping ICPairs[];
+
+    ICMapper();
+  };
+
+  static inline bool NeedsRegister(ICValues Val) {
+    return Val == IVAU;
+  }
+}
+
+namespace  AArch64ISB {
+  enum ISBValues {
+    Invalid = -1,
+    SY = 0xf
+  };
+  struct ISBMapper : AArch64NamedImmMapper {
+    const static Mapping ISBPairs[];
+
+    ISBMapper();
+  };
+}
+
+namespace AArch64PRFM {
+  enum PRFMValues {
+    Invalid = -1,
+    PLDL1KEEP = 0x00,
+    PLDL1STRM = 0x01,
+    PLDL2KEEP = 0x02,
+    PLDL2STRM = 0x03,
+    PLDL3KEEP = 0x04,
+    PLDL3STRM = 0x05,
+    PLIL1KEEP = 0x08,
+    PLIL1STRM = 0x09,
+    PLIL2KEEP = 0x0a,
+    PLIL2STRM = 0x0b,
+    PLIL3KEEP = 0x0c,
+    PLIL3STRM = 0x0d,
+    PSTL1KEEP = 0x10,
+    PSTL1STRM = 0x11,
+    PSTL2KEEP = 0x12,
+    PSTL2STRM = 0x13,
+    PSTL3KEEP = 0x14,
+    PSTL3STRM = 0x15
+  };
+
+  struct PRFMMapper : AArch64NamedImmMapper {
+    const static Mapping PRFMPairs[];
+
+    PRFMMapper();
+  };
+}
+
+namespace AArch64PState {
+  enum PStateValues {
+    Invalid = -1,
+    SPSel = 0x05,
+    DAIFSet = 0x1e,
+    DAIFClr = 0x1f
+  };
+
+  struct PStateMapper : AArch64NamedImmMapper {
+    const static Mapping PStatePairs[];
+
+    PStateMapper();
+  };
+
+}
+
+namespace AArch64SE {
+    enum ShiftExtSpecifiers {
+        Invalid = -1,
+        LSL,
+        MSL,
+        LSR,
+        ASR,
+        ROR,
+
+        UXTB,
+        UXTH,
+        UXTW,
+        UXTX,
+
+        SXTB,
+        SXTH,
+        SXTW,
+        SXTX
+    };
+}
+
+namespace AArch64Layout {
+    enum VectorLayout {
+        Invalid = -1,
+        VL_8B,
+        VL_4H,
+        VL_2S,
+        VL_1D,
+
+        VL_16B,
+        VL_8H,
+        VL_4S,
+        VL_2D,
+
+        // Bare layout for the 128-bit vector
+        // (only show ".b", ".h", ".s", ".d" without vector number)
+        VL_B,
+        VL_H,
+        VL_S,
+        VL_D
+    };
+}
+
+inline static const char *
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) {
+  switch (Layout) {
+  case AArch64Layout::VL_8B:  return ".8b";
+  case AArch64Layout::VL_4H:  return ".4h";
+  case AArch64Layout::VL_2S:  return ".2s";
+  case AArch64Layout::VL_1D:  return ".1d";
+  case AArch64Layout::VL_16B:  return ".16b";
+  case AArch64Layout::VL_8H:  return ".8h";
+  case AArch64Layout::VL_4S:  return ".4s";
+  case AArch64Layout::VL_2D:  return ".2d";
+  case AArch64Layout::VL_B:  return ".b";
+  case AArch64Layout::VL_H:  return ".h";
+  case AArch64Layout::VL_S:  return ".s";
+  case AArch64Layout::VL_D:  return ".d";
+  default: llvm_unreachable("Unknown Vector Layout");
+  }
+}
+
+inline static AArch64Layout::VectorLayout
+AArch64StringToVectorLayout(StringRef LayoutStr) {
+  return StringSwitch<AArch64Layout::VectorLayout>(LayoutStr)
+             .Case(".8b", AArch64Layout::VL_8B)
+             .Case(".4h", AArch64Layout::VL_4H)
+             .Case(".2s", AArch64Layout::VL_2S)
+             .Case(".1d", AArch64Layout::VL_1D)
+             .Case(".16b", AArch64Layout::VL_16B)
+             .Case(".8h", AArch64Layout::VL_8H)
+             .Case(".4s", AArch64Layout::VL_4S)
+             .Case(".2d", AArch64Layout::VL_2D)
+             .Case(".b", AArch64Layout::VL_B)
+             .Case(".h", AArch64Layout::VL_H)
+             .Case(".s", AArch64Layout::VL_S)
+             .Case(".d", AArch64Layout::VL_D)
+             .Default(AArch64Layout::Invalid);
+}
+
+namespace AArch64SysReg {
+  enum SysRegROValues {
+    MDCCSR_EL0        = 0x9808, // 10  011  0000  0001  000
+    DBGDTRRX_EL0      = 0x9828, // 10  011  0000  0101  000
+    MDRAR_EL1         = 0x8080, // 10  000  0001  0000  000
+    OSLSR_EL1         = 0x808c, // 10  000  0001  0001  100
+    DBGAUTHSTATUS_EL1 = 0x83f6, // 10  000  0111  1110  110
+    PMCEID0_EL0       = 0xdce6, // 11  011  1001  1100  110
+    PMCEID1_EL0       = 0xdce7, // 11  011  1001  1100  111
+    MIDR_EL1          = 0xc000, // 11  000  0000  0000  000
+    CCSIDR_EL1        = 0xc800, // 11  001  0000  0000  000
+    CLIDR_EL1         = 0xc801, // 11  001  0000  0000  001
+    CTR_EL0           = 0xd801, // 11  011  0000  0000  001
+    MPIDR_EL1         = 0xc005, // 11  000  0000  0000  101
+    REVIDR_EL1        = 0xc006, // 11  000  0000  0000  110
+    AIDR_EL1          = 0xc807, // 11  001  0000  0000  111
+    DCZID_EL0         = 0xd807, // 11  011  0000  0000  111
+    ID_PFR0_EL1       = 0xc008, // 11  000  0000  0001  000
+    ID_PFR1_EL1       = 0xc009, // 11  000  0000  0001  001
+    ID_DFR0_EL1       = 0xc00a, // 11  000  0000  0001  010
+    ID_AFR0_EL1       = 0xc00b, // 11  000  0000  0001  011
+    ID_MMFR0_EL1      = 0xc00c, // 11  000  0000  0001  100
+    ID_MMFR1_EL1      = 0xc00d, // 11  000  0000  0001  101
+    ID_MMFR2_EL1      = 0xc00e, // 11  000  0000  0001  110
+    ID_MMFR3_EL1      = 0xc00f, // 11  000  0000  0001  111
+    ID_ISAR0_EL1      = 0xc010, // 11  000  0000  0010  000
+    ID_ISAR1_EL1      = 0xc011, // 11  000  0000  0010  001
+    ID_ISAR2_EL1      = 0xc012, // 11  000  0000  0010  010
+    ID_ISAR3_EL1      = 0xc013, // 11  000  0000  0010  011
+    ID_ISAR4_EL1      = 0xc014, // 11  000  0000  0010  100
+    ID_ISAR5_EL1      = 0xc015, // 11  000  0000  0010  101
+    ID_A64PFR0_EL1    = 0xc020, // 11  000  0000  0100  000
+    ID_A64PFR1_EL1    = 0xc021, // 11  000  0000  0100  001
+    ID_A64DFR0_EL1    = 0xc028, // 11  000  0000  0101  000
+    ID_A64DFR1_EL1    = 0xc029, // 11  000  0000  0101  001
+    ID_A64AFR0_EL1    = 0xc02c, // 11  000  0000  0101  100
+    ID_A64AFR1_EL1    = 0xc02d, // 11  000  0000  0101  101
+    ID_A64ISAR0_EL1   = 0xc030, // 11  000  0000  0110  000
+    ID_A64ISAR1_EL1   = 0xc031, // 11  000  0000  0110  001
+    ID_A64MMFR0_EL1   = 0xc038, // 11  000  0000  0111  000
+    ID_A64MMFR1_EL1   = 0xc039, // 11  000  0000  0111  001
+    MVFR0_EL1         = 0xc018, // 11  000  0000  0011  000
+    MVFR1_EL1         = 0xc019, // 11  000  0000  0011  001
+    MVFR2_EL1         = 0xc01a, // 11  000  0000  0011  010
+    RVBAR_EL1         = 0xc601, // 11  000  1100  0000  001
+    RVBAR_EL2         = 0xe601, // 11  100  1100  0000  001
+    RVBAR_EL3         = 0xf601, // 11  110  1100  0000  001
+    ISR_EL1           = 0xc608, // 11  000  1100  0001  000
+    CNTPCT_EL0        = 0xdf01, // 11  011  1110  0000  001
+    CNTVCT_EL0        = 0xdf02,  // 11  011  1110  0000  010
+
+    // Trace registers
+    TRCSTATR          = 0x8818, // 10  001  0000  0011  000
+    TRCIDR8           = 0x8806, // 10  001  0000  0000  110
+    TRCIDR9           = 0x880e, // 10  001  0000  0001  110
+    TRCIDR10          = 0x8816, // 10  001  0000  0010  110
+    TRCIDR11          = 0x881e, // 10  001  0000  0011  110
+    TRCIDR12          = 0x8826, // 10  001  0000  0100  110
+    TRCIDR13          = 0x882e, // 10  001  0000  0101  110
+    TRCIDR0           = 0x8847, // 10  001  0000  1000  111
+    TRCIDR1           = 0x884f, // 10  001  0000  1001  111
+    TRCIDR2           = 0x8857, // 10  001  0000  1010  111
+    TRCIDR3           = 0x885f, // 10  001  0000  1011  111
+    TRCIDR4           = 0x8867, // 10  001  0000  1100  111
+    TRCIDR5           = 0x886f, // 10  001  0000  1101  111
+    TRCIDR6           = 0x8877, // 10  001  0000  1110  111
+    TRCIDR7           = 0x887f, // 10  001  0000  1111  111
+    TRCOSLSR          = 0x888c, // 10  001  0001  0001  100
+    TRCPDSR           = 0x88ac, // 10  001  0001  0101  100
+    TRCDEVAFF0        = 0x8bd6, // 10  001  0111  1010  110
+    TRCDEVAFF1        = 0x8bde, // 10  001  0111  1011  110
+    TRCLSR            = 0x8bee, // 10  001  0111  1101  110
+    TRCAUTHSTATUS     = 0x8bf6, // 10  001  0111  1110  110
+    TRCDEVARCH        = 0x8bfe, // 10  001  0111  1111  110
+    TRCDEVID          = 0x8b97, // 10  001  0111  0010  111
+    TRCDEVTYPE        = 0x8b9f, // 10  001  0111  0011  111
+    TRCPIDR4          = 0x8ba7, // 10  001  0111  0100  111
+    TRCPIDR5          = 0x8baf, // 10  001  0111  0101  111
+    TRCPIDR6          = 0x8bb7, // 10  001  0111  0110  111
+    TRCPIDR7          = 0x8bbf, // 10  001  0111  0111  111
+    TRCPIDR0          = 0x8bc7, // 10  001  0111  1000  111
+    TRCPIDR1          = 0x8bcf, // 10  001  0111  1001  111
+    TRCPIDR2          = 0x8bd7, // 10  001  0111  1010  111
+    TRCPIDR3          = 0x8bdf, // 10  001  0111  1011  111
+    TRCCIDR0          = 0x8be7, // 10  001  0111  1100  111
+    TRCCIDR1          = 0x8bef, // 10  001  0111  1101  111
+    TRCCIDR2          = 0x8bf7, // 10  001  0111  1110  111
+    TRCCIDR3          = 0x8bff, // 10  001  0111  1111  111
+
+    // GICv3 registers
+    ICC_IAR1_EL1      = 0xc660, // 11  000  1100  1100  000
+    ICC_IAR0_EL1      = 0xc640, // 11  000  1100  1000  000
+    ICC_HPPIR1_EL1    = 0xc662, // 11  000  1100  1100  010
+    ICC_HPPIR0_EL1    = 0xc642, // 11  000  1100  1000  010
+    ICC_RPR_EL1       = 0xc65b, // 11  000  1100  1011  011
+    ICH_VTR_EL2       = 0xe659, // 11  100  1100  1011  001
+    ICH_EISR_EL2      = 0xe65b, // 11  100  1100  1011  011
+    ICH_ELSR_EL2      = 0xe65d  // 11  100  1100  1011  101
+  };
+
+  enum SysRegWOValues {
+    DBGDTRTX_EL0      = 0x9828, // 10  011  0000  0101  000
+    OSLAR_EL1         = 0x8084, // 10  000  0001  0000  100
+    PMSWINC_EL0       = 0xdce4,  // 11  011  1001  1100  100
+
+    // Trace Registers
+    TRCOSLAR          = 0x8884, // 10  001  0001  0000  100
+    TRCLAR            = 0x8be6, // 10  001  0111  1100  110
+
+    // GICv3 registers
+    ICC_EOIR1_EL1     = 0xc661, // 11  000  1100  1100  001
+    ICC_EOIR0_EL1     = 0xc641, // 11  000  1100  1000  001
+    ICC_DIR_EL1       = 0xc659, // 11  000  1100  1011  001
+    ICC_SGI1R_EL1     = 0xc65d, // 11  000  1100  1011  101
+    ICC_ASGI1R_EL1    = 0xc65e, // 11  000  1100  1011  110
+    ICC_SGI0R_EL1     = 0xc65f  // 11  000  1100  1011  111
+  };
+
+  enum SysRegValues {
+    Invalid = -1,               // Op0 Op1  CRn   CRm   Op2
+    OSDTRRX_EL1       = 0x8002, // 10  000  0000  0000  010
+    OSDTRTX_EL1       = 0x801a, // 10  000  0000  0011  010
+    TEECR32_EL1       = 0x9000, // 10  010  0000  0000  000
+    MDCCINT_EL1       = 0x8010, // 10  000  0000  0010  000
+    MDSCR_EL1         = 0x8012, // 10  000  0000  0010  010
+    DBGDTR_EL0        = 0x9820, // 10  011  0000  0100  000
+    OSECCR_EL1        = 0x8032, // 10  000  0000  0110  010
+    DBGVCR32_EL2      = 0xa038, // 10  100  0000  0111  000
+    DBGBVR0_EL1       = 0x8004, // 10  000  0000  0000  100
+    DBGBVR1_EL1       = 0x800c, // 10  000  0000  0001  100
+    DBGBVR2_EL1       = 0x8014, // 10  000  0000  0010  100
+    DBGBVR3_EL1       = 0x801c, // 10  000  0000  0011  100
+    DBGBVR4_EL1       = 0x8024, // 10  000  0000  0100  100
+    DBGBVR5_EL1       = 0x802c, // 10  000  0000  0101  100
+    DBGBVR6_EL1       = 0x8034, // 10  000  0000  0110  100
+    DBGBVR7_EL1       = 0x803c, // 10  000  0000  0111  100
+    DBGBVR8_EL1       = 0x8044, // 10  000  0000  1000  100
+    DBGBVR9_EL1       = 0x804c, // 10  000  0000  1001  100
+    DBGBVR10_EL1      = 0x8054, // 10  000  0000  1010  100
+    DBGBVR11_EL1      = 0x805c, // 10  000  0000  1011  100
+    DBGBVR12_EL1      = 0x8064, // 10  000  0000  1100  100
+    DBGBVR13_EL1      = 0x806c, // 10  000  0000  1101  100
+    DBGBVR14_EL1      = 0x8074, // 10  000  0000  1110  100
+    DBGBVR15_EL1      = 0x807c, // 10  000  0000  1111  100
+    DBGBCR0_EL1       = 0x8005, // 10  000  0000  0000  101
+    DBGBCR1_EL1       = 0x800d, // 10  000  0000  0001  101
+    DBGBCR2_EL1       = 0x8015, // 10  000  0000  0010  101
+    DBGBCR3_EL1       = 0x801d, // 10  000  0000  0011  101
+    DBGBCR4_EL1       = 0x8025, // 10  000  0000  0100  101
+    DBGBCR5_EL1       = 0x802d, // 10  000  0000  0101  101
+    DBGBCR6_EL1       = 0x8035, // 10  000  0000  0110  101
+    DBGBCR7_EL1       = 0x803d, // 10  000  0000  0111  101
+    DBGBCR8_EL1       = 0x8045, // 10  000  0000  1000  101
+    DBGBCR9_EL1       = 0x804d, // 10  000  0000  1001  101
+    DBGBCR10_EL1      = 0x8055, // 10  000  0000  1010  101
+    DBGBCR11_EL1      = 0x805d, // 10  000  0000  1011  101
+    DBGBCR12_EL1      = 0x8065, // 10  000  0000  1100  101
+    DBGBCR13_EL1      = 0x806d, // 10  000  0000  1101  101
+    DBGBCR14_EL1      = 0x8075, // 10  000  0000  1110  101
+    DBGBCR15_EL1      = 0x807d, // 10  000  0000  1111  101
+    DBGWVR0_EL1       = 0x8006, // 10  000  0000  0000  110
+    DBGWVR1_EL1       = 0x800e, // 10  000  0000  0001  110
+    DBGWVR2_EL1       = 0x8016, // 10  000  0000  0010  110
+    DBGWVR3_EL1       = 0x801e, // 10  000  0000  0011  110
+    DBGWVR4_EL1       = 0x8026, // 10  000  0000  0100  110
+    DBGWVR5_EL1       = 0x802e, // 10  000  0000  0101  110
+    DBGWVR6_EL1       = 0x8036, // 10  000  0000  0110  110
+    DBGWVR7_EL1       = 0x803e, // 10  000  0000  0111  110
+    DBGWVR8_EL1       = 0x8046, // 10  000  0000  1000  110
+    DBGWVR9_EL1       = 0x804e, // 10  000  0000  1001  110
+    DBGWVR10_EL1      = 0x8056, // 10  000  0000  1010  110
+    DBGWVR11_EL1      = 0x805e, // 10  000  0000  1011  110
+    DBGWVR12_EL1      = 0x8066, // 10  000  0000  1100  110
+    DBGWVR13_EL1      = 0x806e, // 10  000  0000  1101  110
+    DBGWVR14_EL1      = 0x8076, // 10  000  0000  1110  110
+    DBGWVR15_EL1      = 0x807e, // 10  000  0000  1111  110
+    DBGWCR0_EL1       = 0x8007, // 10  000  0000  0000  111
+    DBGWCR1_EL1       = 0x800f, // 10  000  0000  0001  111
+    DBGWCR2_EL1       = 0x8017, // 10  000  0000  0010  111
+    DBGWCR3_EL1       = 0x801f, // 10  000  0000  0011  111
+    DBGWCR4_EL1       = 0x8027, // 10  000  0000  0100  111
+    DBGWCR5_EL1       = 0x802f, // 10  000  0000  0101  111
+    DBGWCR6_EL1       = 0x8037, // 10  000  0000  0110  111
+    DBGWCR7_EL1       = 0x803f, // 10  000  0000  0111  111
+    DBGWCR8_EL1       = 0x8047, // 10  000  0000  1000  111
+    DBGWCR9_EL1       = 0x804f, // 10  000  0000  1001  111
+    DBGWCR10_EL1      = 0x8057, // 10  000  0000  1010  111
+    DBGWCR11_EL1      = 0x805f, // 10  000  0000  1011  111
+    DBGWCR12_EL1      = 0x8067, // 10  000  0000  1100  111
+    DBGWCR13_EL1      = 0x806f, // 10  000  0000  1101  111
+    DBGWCR14_EL1      = 0x8077, // 10  000  0000  1110  111
+    DBGWCR15_EL1      = 0x807f, // 10  000  0000  1111  111
+    TEEHBR32_EL1      = 0x9080, // 10  010  0001  0000  000
+    OSDLR_EL1         = 0x809c, // 10  000  0001  0011  100
+    DBGPRCR_EL1       = 0x80a4, // 10  000  0001  0100  100
+    DBGCLAIMSET_EL1   = 0x83c6, // 10  000  0111  1000  110
+    DBGCLAIMCLR_EL1   = 0x83ce, // 10  000  0111  1001  110
+    CSSELR_EL1        = 0xd000, // 11  010  0000  0000  000
+    VPIDR_EL2         = 0xe000, // 11  100  0000  0000  000
+    VMPIDR_EL2        = 0xe005, // 11  100  0000  0000  101
+    CPACR_EL1         = 0xc082, // 11  000  0001  0000  010
+    SCTLR_EL1         = 0xc080, // 11  000  0001  0000  000
+    SCTLR_EL2         = 0xe080, // 11  100  0001  0000  000
+    SCTLR_EL3         = 0xf080, // 11  110  0001  0000  000
+    ACTLR_EL1         = 0xc081, // 11  000  0001  0000  001
+    ACTLR_EL2         = 0xe081, // 11  100  0001  0000  001
+    ACTLR_EL3         = 0xf081, // 11  110  0001  0000  001
+    HCR_EL2           = 0xe088, // 11  100  0001  0001  000
+    SCR_EL3           = 0xf088, // 11  110  0001  0001  000
+    MDCR_EL2          = 0xe089, // 11  100  0001  0001  001
+    SDER32_EL3        = 0xf089, // 11  110  0001  0001  001
+    CPTR_EL2          = 0xe08a, // 11  100  0001  0001  010
+    CPTR_EL3          = 0xf08a, // 11  110  0001  0001  010
+    HSTR_EL2          = 0xe08b, // 11  100  0001  0001  011
+    HACR_EL2          = 0xe08f, // 11  100  0001  0001  111
+    MDCR_EL3          = 0xf099, // 11  110  0001  0011  001
+    TTBR0_EL1         = 0xc100, // 11  000  0010  0000  000
+    TTBR0_EL2         = 0xe100, // 11  100  0010  0000  000
+    TTBR0_EL3         = 0xf100, // 11  110  0010  0000  000
+    TTBR1_EL1         = 0xc101, // 11  000  0010  0000  001
+    TCR_EL1           = 0xc102, // 11  000  0010  0000  010
+    TCR_EL2           = 0xe102, // 11  100  0010  0000  010
+    TCR_EL3           = 0xf102, // 11  110  0010  0000  010
+    VTTBR_EL2         = 0xe108, // 11  100  0010  0001  000
+    VTCR_EL2          = 0xe10a, // 11  100  0010  0001  010
+    DACR32_EL2        = 0xe180, // 11  100  0011  0000  000
+    SPSR_EL1          = 0xc200, // 11  000  0100  0000  000
+    SPSR_EL2          = 0xe200, // 11  100  0100  0000  000
+    SPSR_EL3          = 0xf200, // 11  110  0100  0000  000
+    ELR_EL1           = 0xc201, // 11  000  0100  0000  001
+    ELR_EL2           = 0xe201, // 11  100  0100  0000  001
+    ELR_EL3           = 0xf201, // 11  110  0100  0000  001
+    SP_EL0            = 0xc208, // 11  000  0100  0001  000
+    SP_EL1            = 0xe208, // 11  100  0100  0001  000
+    SP_EL2            = 0xf208, // 11  110  0100  0001  000
+    SPSel             = 0xc210, // 11  000  0100  0010  000
+    NZCV              = 0xda10, // 11  011  0100  0010  000
+    DAIF              = 0xda11, // 11  011  0100  0010  001
+    CurrentEL         = 0xc212, // 11  000  0100  0010  010
+    SPSR_irq          = 0xe218, // 11  100  0100  0011  000
+    SPSR_abt          = 0xe219, // 11  100  0100  0011  001
+    SPSR_und          = 0xe21a, // 11  100  0100  0011  010
+    SPSR_fiq          = 0xe21b, // 11  100  0100  0011  011
+    FPCR              = 0xda20, // 11  011  0100  0100  000
+    FPSR              = 0xda21, // 11  011  0100  0100  001
+    DSPSR_EL0         = 0xda28, // 11  011  0100  0101  000
+    DLR_EL0           = 0xda29, // 11  011  0100  0101  001
+    IFSR32_EL2        = 0xe281, // 11  100  0101  0000  001
+    AFSR0_EL1         = 0xc288, // 11  000  0101  0001  000
+    AFSR0_EL2         = 0xe288, // 11  100  0101  0001  000
+    AFSR0_EL3         = 0xf288, // 11  110  0101  0001  000
+    AFSR1_EL1         = 0xc289, // 11  000  0101  0001  001
+    AFSR1_EL2         = 0xe289, // 11  100  0101  0001  001
+    AFSR1_EL3         = 0xf289, // 11  110  0101  0001  001
+    ESR_EL1           = 0xc290, // 11  000  0101  0010  000
+    ESR_EL2           = 0xe290, // 11  100  0101  0010  000
+    ESR_EL3           = 0xf290, // 11  110  0101  0010  000
+    FPEXC32_EL2       = 0xe298, // 11  100  0101  0011  000
+    FAR_EL1           = 0xc300, // 11  000  0110  0000  000
+    FAR_EL2           = 0xe300, // 11  100  0110  0000  000
+    FAR_EL3           = 0xf300, // 11  110  0110  0000  000
+    HPFAR_EL2         = 0xe304, // 11  100  0110  0000  100
+    PAR_EL1           = 0xc3a0, // 11  000  0111  0100  000
+    PMCR_EL0          = 0xdce0, // 11  011  1001  1100  000
+    PMCNTENSET_EL0    = 0xdce1, // 11  011  1001  1100  001
+    PMCNTENCLR_EL0    = 0xdce2, // 11  011  1001  1100  010
+    PMOVSCLR_EL0      = 0xdce3, // 11  011  1001  1100  011
+    PMSELR_EL0        = 0xdce5, // 11  011  1001  1100  101
+    PMCCNTR_EL0       = 0xdce8, // 11  011  1001  1101  000
+    PMXEVTYPER_EL0    = 0xdce9, // 11  011  1001  1101  001
+    PMXEVCNTR_EL0     = 0xdcea, // 11  011  1001  1101  010
+    PMUSERENR_EL0     = 0xdcf0, // 11  011  1001  1110  000
+    PMINTENSET_EL1    = 0xc4f1, // 11  000  1001  1110  001
+    PMINTENCLR_EL1    = 0xc4f2, // 11  000  1001  1110  010
+    PMOVSSET_EL0      = 0xdcf3, // 11  011  1001  1110  011
+    MAIR_EL1          = 0xc510, // 11  000  1010  0010  000
+    MAIR_EL2          = 0xe510, // 11  100  1010  0010  000
+    MAIR_EL3          = 0xf510, // 11  110  1010  0010  000
+    AMAIR_EL1         = 0xc518, // 11  000  1010  0011  000
+    AMAIR_EL2         = 0xe518, // 11  100  1010  0011  000
+    AMAIR_EL3         = 0xf518, // 11  110  1010  0011  000
+    VBAR_EL1          = 0xc600, // 11  000  1100  0000  000
+    VBAR_EL2          = 0xe600, // 11  100  1100  0000  000
+    VBAR_EL3          = 0xf600, // 11  110  1100  0000  000
+    RMR_EL1           = 0xc602, // 11  000  1100  0000  010
+    RMR_EL2           = 0xe602, // 11  100  1100  0000  010
+    RMR_EL3           = 0xf602, // 11  110  1100  0000  010
+    CONTEXTIDR_EL1    = 0xc681, // 11  000  1101  0000  001
+    TPIDR_EL0         = 0xde82, // 11  011  1101  0000  010
+    TPIDR_EL2         = 0xe682, // 11  100  1101  0000  010
+    TPIDR_EL3         = 0xf682, // 11  110  1101  0000  010
+    TPIDRRO_EL0       = 0xde83, // 11  011  1101  0000  011
+    TPIDR_EL1         = 0xc684, // 11  000  1101  0000  100
+    CNTFRQ_EL0        = 0xdf00, // 11  011  1110  0000  000
+    CNTVOFF_EL2       = 0xe703, // 11  100  1110  0000  011
+    CNTKCTL_EL1       = 0xc708, // 11  000  1110  0001  000
+    CNTHCTL_EL2       = 0xe708, // 11  100  1110  0001  000
+    CNTP_TVAL_EL0     = 0xdf10, // 11  011  1110  0010  000
+    CNTHP_TVAL_EL2    = 0xe710, // 11  100  1110  0010  000
+    CNTPS_TVAL_EL1    = 0xff10, // 11  111  1110  0010  000
+    CNTP_CTL_EL0      = 0xdf11, // 11  011  1110  0010  001
+    CNTHP_CTL_EL2     = 0xe711, // 11  100  1110  0010  001
+    CNTPS_CTL_EL1     = 0xff11, // 11  111  1110  0010  001
+    CNTP_CVAL_EL0     = 0xdf12, // 11  011  1110  0010  010
+    CNTHP_CVAL_EL2    = 0xe712, // 11  100  1110  0010  010
+    CNTPS_CVAL_EL1    = 0xff12, // 11  111  1110  0010  010
+    CNTV_TVAL_EL0     = 0xdf18, // 11  011  1110  0011  000
+    CNTV_CTL_EL0      = 0xdf19, // 11  011  1110  0011  001
+    CNTV_CVAL_EL0     = 0xdf1a, // 11  011  1110  0011  010
+    PMEVCNTR0_EL0     = 0xdf40, // 11  011  1110  1000  000
+    PMEVCNTR1_EL0     = 0xdf41, // 11  011  1110  1000  001
+    PMEVCNTR2_EL0     = 0xdf42, // 11  011  1110  1000  010
+    PMEVCNTR3_EL0     = 0xdf43, // 11  011  1110  1000  011
+    PMEVCNTR4_EL0     = 0xdf44, // 11  011  1110  1000  100
+    PMEVCNTR5_EL0     = 0xdf45, // 11  011  1110  1000  101
+    PMEVCNTR6_EL0     = 0xdf46, // 11  011  1110  1000  110
+    PMEVCNTR7_EL0     = 0xdf47, // 11  011  1110  1000  111
+    PMEVCNTR8_EL0     = 0xdf48, // 11  011  1110  1001  000
+    PMEVCNTR9_EL0     = 0xdf49, // 11  011  1110  1001  001
+    PMEVCNTR10_EL0    = 0xdf4a, // 11  011  1110  1001  010
+    PMEVCNTR11_EL0    = 0xdf4b, // 11  011  1110  1001  011
+    PMEVCNTR12_EL0    = 0xdf4c, // 11  011  1110  1001  100
+    PMEVCNTR13_EL0    = 0xdf4d, // 11  011  1110  1001  101
+    PMEVCNTR14_EL0    = 0xdf4e, // 11  011  1110  1001  110
+    PMEVCNTR15_EL0    = 0xdf4f, // 11  011  1110  1001  111
+    PMEVCNTR16_EL0    = 0xdf50, // 11  011  1110  1010  000
+    PMEVCNTR17_EL0    = 0xdf51, // 11  011  1110  1010  001
+    PMEVCNTR18_EL0    = 0xdf52, // 11  011  1110  1010  010
+    PMEVCNTR19_EL0    = 0xdf53, // 11  011  1110  1010  011
+    PMEVCNTR20_EL0    = 0xdf54, // 11  011  1110  1010  100
+    PMEVCNTR21_EL0    = 0xdf55, // 11  011  1110  1010  101
+    PMEVCNTR22_EL0    = 0xdf56, // 11  011  1110  1010  110
+    PMEVCNTR23_EL0    = 0xdf57, // 11  011  1110  1010  111
+    PMEVCNTR24_EL0    = 0xdf58, // 11  011  1110  1011  000
+    PMEVCNTR25_EL0    = 0xdf59, // 11  011  1110  1011  001
+    PMEVCNTR26_EL0    = 0xdf5a, // 11  011  1110  1011  010
+    PMEVCNTR27_EL0    = 0xdf5b, // 11  011  1110  1011  011
+    PMEVCNTR28_EL0    = 0xdf5c, // 11  011  1110  1011  100
+    PMEVCNTR29_EL0    = 0xdf5d, // 11  011  1110  1011  101
+    PMEVCNTR30_EL0    = 0xdf5e, // 11  011  1110  1011  110
+    PMCCFILTR_EL0     = 0xdf7f, // 11  011  1110  1111  111
+    PMEVTYPER0_EL0    = 0xdf60, // 11  011  1110  1100  000
+    PMEVTYPER1_EL0    = 0xdf61, // 11  011  1110  1100  001
+    PMEVTYPER2_EL0    = 0xdf62, // 11  011  1110  1100  010
+    PMEVTYPER3_EL0    = 0xdf63, // 11  011  1110  1100  011
+    PMEVTYPER4_EL0    = 0xdf64, // 11  011  1110  1100  100
+    PMEVTYPER5_EL0    = 0xdf65, // 11  011  1110  1100  101
+    PMEVTYPER6_EL0    = 0xdf66, // 11  011  1110  1100  110
+    PMEVTYPER7_EL0    = 0xdf67, // 11  011  1110  1100  111
+    PMEVTYPER8_EL0    = 0xdf68, // 11  011  1110  1101  000
+    PMEVTYPER9_EL0    = 0xdf69, // 11  011  1110  1101  001
+    PMEVTYPER10_EL0   = 0xdf6a, // 11  011  1110  1101  010
+    PMEVTYPER11_EL0   = 0xdf6b, // 11  011  1110  1101  011
+    PMEVTYPER12_EL0   = 0xdf6c, // 11  011  1110  1101  100
+    PMEVTYPER13_EL0   = 0xdf6d, // 11  011  1110  1101  101
+    PMEVTYPER14_EL0   = 0xdf6e, // 11  011  1110  1101  110
+    PMEVTYPER15_EL0   = 0xdf6f, // 11  011  1110  1101  111
+    PMEVTYPER16_EL0   = 0xdf70, // 11  011  1110  1110  000
+    PMEVTYPER17_EL0   = 0xdf71, // 11  011  1110  1110  001
+    PMEVTYPER18_EL0   = 0xdf72, // 11  011  1110  1110  010
+    PMEVTYPER19_EL0   = 0xdf73, // 11  011  1110  1110  011
+    PMEVTYPER20_EL0   = 0xdf74, // 11  011  1110  1110  100
+    PMEVTYPER21_EL0   = 0xdf75, // 11  011  1110  1110  101
+    PMEVTYPER22_EL0   = 0xdf76, // 11  011  1110  1110  110
+    PMEVTYPER23_EL0   = 0xdf77, // 11  011  1110  1110  111
+    PMEVTYPER24_EL0   = 0xdf78, // 11  011  1110  1111  000
+    PMEVTYPER25_EL0   = 0xdf79, // 11  011  1110  1111  001
+    PMEVTYPER26_EL0   = 0xdf7a, // 11  011  1110  1111  010
+    PMEVTYPER27_EL0   = 0xdf7b, // 11  011  1110  1111  011
+    PMEVTYPER28_EL0   = 0xdf7c, // 11  011  1110  1111  100
+    PMEVTYPER29_EL0   = 0xdf7d, // 11  011  1110  1111  101
+    PMEVTYPER30_EL0   = 0xdf7e, // 11  011  1110  1111  110
+
+    // Trace registers
+    TRCPRGCTLR        = 0x8808, // 10  001  0000  0001  000
+    TRCPROCSELR       = 0x8810, // 10  001  0000  0010  000
+    TRCCONFIGR        = 0x8820, // 10  001  0000  0100  000
+    TRCAUXCTLR        = 0x8830, // 10  001  0000  0110  000
+    TRCEVENTCTL0R     = 0x8840, // 10  001  0000  1000  000
+    TRCEVENTCTL1R     = 0x8848, // 10  001  0000  1001  000
+    TRCSTALLCTLR      = 0x8858, // 10  001  0000  1011  000
+    TRCTSCTLR         = 0x8860, // 10  001  0000  1100  000
+    TRCSYNCPR         = 0x8868, // 10  001  0000  1101  000
+    TRCCCCTLR         = 0x8870, // 10  001  0000  1110  000
+    TRCBBCTLR         = 0x8878, // 10  001  0000  1111  000
+    TRCTRACEIDR       = 0x8801, // 10  001  0000  0000  001
+    TRCQCTLR          = 0x8809, // 10  001  0000  0001  001
+    TRCVICTLR         = 0x8802, // 10  001  0000  0000  010
+    TRCVIIECTLR       = 0x880a, // 10  001  0000  0001  010
+    TRCVISSCTLR       = 0x8812, // 10  001  0000  0010  010
+    TRCVIPCSSCTLR     = 0x881a, // 10  001  0000  0011  010
+    TRCVDCTLR         = 0x8842, // 10  001  0000  1000  010
+    TRCVDSACCTLR      = 0x884a, // 10  001  0000  1001  010
+    TRCVDARCCTLR      = 0x8852, // 10  001  0000  1010  010
+    TRCSEQEVR0        = 0x8804, // 10  001  0000  0000  100
+    TRCSEQEVR1        = 0x880c, // 10  001  0000  0001  100
+    TRCSEQEVR2        = 0x8814, // 10  001  0000  0010  100
+    TRCSEQRSTEVR      = 0x8834, // 10  001  0000  0110  100
+    TRCSEQSTR         = 0x883c, // 10  001  0000  0111  100
+    TRCEXTINSELR      = 0x8844, // 10  001  0000  1000  100
+    TRCCNTRLDVR0      = 0x8805, // 10  001  0000  0000  101
+    TRCCNTRLDVR1      = 0x880d, // 10  001  0000  0001  101
+    TRCCNTRLDVR2      = 0x8815, // 10  001  0000  0010  101
+    TRCCNTRLDVR3      = 0x881d, // 10  001  0000  0011  101
+    TRCCNTCTLR0       = 0x8825, // 10  001  0000  0100  101
+    TRCCNTCTLR1       = 0x882d, // 10  001  0000  0101  101
+    TRCCNTCTLR2       = 0x8835, // 10  001  0000  0110  101
+    TRCCNTCTLR3       = 0x883d, // 10  001  0000  0111  101
+    TRCCNTVR0         = 0x8845, // 10  001  0000  1000  101
+    TRCCNTVR1         = 0x884d, // 10  001  0000  1001  101
+    TRCCNTVR2         = 0x8855, // 10  001  0000  1010  101
+    TRCCNTVR3         = 0x885d, // 10  001  0000  1011  101
+    TRCIMSPEC0        = 0x8807, // 10  001  0000  0000  111
+    TRCIMSPEC1        = 0x880f, // 10  001  0000  0001  111
+    TRCIMSPEC2        = 0x8817, // 10  001  0000  0010  111
+    TRCIMSPEC3        = 0x881f, // 10  001  0000  0011  111
+    TRCIMSPEC4        = 0x8827, // 10  001  0000  0100  111
+    TRCIMSPEC5        = 0x882f, // 10  001  0000  0101  111
+    TRCIMSPEC6        = 0x8837, // 10  001  0000  0110  111
+    TRCIMSPEC7        = 0x883f, // 10  001  0000  0111  111
+    TRCRSCTLR2        = 0x8890, // 10  001  0001  0010  000
+    TRCRSCTLR3        = 0x8898, // 10  001  0001  0011  000
+    TRCRSCTLR4        = 0x88a0, // 10  001  0001  0100  000
+    TRCRSCTLR5        = 0x88a8, // 10  001  0001  0101  000
+    TRCRSCTLR6        = 0x88b0, // 10  001  0001  0110  000
+    TRCRSCTLR7        = 0x88b8, // 10  001  0001  0111  000
+    TRCRSCTLR8        = 0x88c0, // 10  001  0001  1000  000
+    TRCRSCTLR9        = 0x88c8, // 10  001  0001  1001  000
+    TRCRSCTLR10       = 0x88d0, // 10  001  0001  1010  000
+    TRCRSCTLR11       = 0x88d8, // 10  001  0001  1011  000
+    TRCRSCTLR12       = 0x88e0, // 10  001  0001  1100  000
+    TRCRSCTLR13       = 0x88e8, // 10  001  0001  1101  000
+    TRCRSCTLR14       = 0x88f0, // 10  001  0001  1110  000
+    TRCRSCTLR15       = 0x88f8, // 10  001  0001  1111  000
+    TRCRSCTLR16       = 0x8881, // 10  001  0001  0000  001
+    TRCRSCTLR17       = 0x8889, // 10  001  0001  0001  001
+    TRCRSCTLR18       = 0x8891, // 10  001  0001  0010  001
+    TRCRSCTLR19       = 0x8899, // 10  001  0001  0011  001
+    TRCRSCTLR20       = 0x88a1, // 10  001  0001  0100  001
+    TRCRSCTLR21       = 0x88a9, // 10  001  0001  0101  001
+    TRCRSCTLR22       = 0x88b1, // 10  001  0001  0110  001
+    TRCRSCTLR23       = 0x88b9, // 10  001  0001  0111  001
+    TRCRSCTLR24       = 0x88c1, // 10  001  0001  1000  001
+    TRCRSCTLR25       = 0x88c9, // 10  001  0001  1001  001
+    TRCRSCTLR26       = 0x88d1, // 10  001  0001  1010  001
+    TRCRSCTLR27       = 0x88d9, // 10  001  0001  1011  001
+    TRCRSCTLR28       = 0x88e1, // 10  001  0001  1100  001
+    TRCRSCTLR29       = 0x88e9, // 10  001  0001  1101  001
+    TRCRSCTLR30       = 0x88f1, // 10  001  0001  1110  001
+    TRCRSCTLR31       = 0x88f9, // 10  001  0001  1111  001
+    TRCSSCCR0         = 0x8882, // 10  001  0001  0000  010
+    TRCSSCCR1         = 0x888a, // 10  001  0001  0001  010
+    TRCSSCCR2         = 0x8892, // 10  001  0001  0010  010
+    TRCSSCCR3         = 0x889a, // 10  001  0001  0011  010
+    TRCSSCCR4         = 0x88a2, // 10  001  0001  0100  010
+    TRCSSCCR5         = 0x88aa, // 10  001  0001  0101  010
+    TRCSSCCR6         = 0x88b2, // 10  001  0001  0110  010
+    TRCSSCCR7         = 0x88ba, // 10  001  0001  0111  010
+    TRCSSCSR0         = 0x88c2, // 10  001  0001  1000  010
+    TRCSSCSR1         = 0x88ca, // 10  001  0001  1001  010
+    TRCSSCSR2         = 0x88d2, // 10  001  0001  1010  010
+    TRCSSCSR3         = 0x88da, // 10  001  0001  1011  010
+    TRCSSCSR4         = 0x88e2, // 10  001  0001  1100  010
+    TRCSSCSR5         = 0x88ea, // 10  001  0001  1101  010
+    TRCSSCSR6         = 0x88f2, // 10  001  0001  1110  010
+    TRCSSCSR7         = 0x88fa, // 10  001  0001  1111  010
+    TRCSSPCICR0       = 0x8883, // 10  001  0001  0000  011
+    TRCSSPCICR1       = 0x888b, // 10  001  0001  0001  011
+    TRCSSPCICR2       = 0x8893, // 10  001  0001  0010  011
+    TRCSSPCICR3       = 0x889b, // 10  001  0001  0011  011
+    TRCSSPCICR4       = 0x88a3, // 10  001  0001  0100  011
+    TRCSSPCICR5       = 0x88ab, // 10  001  0001  0101  011
+    TRCSSPCICR6       = 0x88b3, // 10  001  0001  0110  011
+    TRCSSPCICR7       = 0x88bb, // 10  001  0001  0111  011
+    TRCPDCR           = 0x88a4, // 10  001  0001  0100  100
+    TRCACVR0          = 0x8900, // 10  001  0010  0000  000
+    TRCACVR1          = 0x8910, // 10  001  0010  0010  000
+    TRCACVR2          = 0x8920, // 10  001  0010  0100  000
+    TRCACVR3          = 0x8930, // 10  001  0010  0110  000
+    TRCACVR4          = 0x8940, // 10  001  0010  1000  000
+    TRCACVR5          = 0x8950, // 10  001  0010  1010  000
+    TRCACVR6          = 0x8960, // 10  001  0010  1100  000
+    TRCACVR7          = 0x8970, // 10  001  0010  1110  000
+    TRCACVR8          = 0x8901, // 10  001  0010  0000  001
+    TRCACVR9          = 0x8911, // 10  001  0010  0010  001
+    TRCACVR10         = 0x8921, // 10  001  0010  0100  001
+    TRCACVR11         = 0x8931, // 10  001  0010  0110  001
+    TRCACVR12         = 0x8941, // 10  001  0010  1000  001
+    TRCACVR13         = 0x8951, // 10  001  0010  1010  001
+    TRCACVR14         = 0x8961, // 10  001  0010  1100  001
+    TRCACVR15         = 0x8971, // 10  001  0010  1110  001
+    TRCACATR0         = 0x8902, // 10  001  0010  0000  010
+    TRCACATR1         = 0x8912, // 10  001  0010  0010  010
+    TRCACATR2         = 0x8922, // 10  001  0010  0100  010
+    TRCACATR3         = 0x8932, // 10  001  0010  0110  010
+    TRCACATR4         = 0x8942, // 10  001  0010  1000  010
+    TRCACATR5         = 0x8952, // 10  001  0010  1010  010
+    TRCACATR6         = 0x8962, // 10  001  0010  1100  010
+    TRCACATR7         = 0x8972, // 10  001  0010  1110  010
+    TRCACATR8         = 0x8903, // 10  001  0010  0000  011
+    TRCACATR9         = 0x8913, // 10  001  0010  0010  011
+    TRCACATR10        = 0x8923, // 10  001  0010  0100  011
+    TRCACATR11        = 0x8933, // 10  001  0010  0110  011
+    TRCACATR12        = 0x8943, // 10  001  0010  1000  011
+    TRCACATR13        = 0x8953, // 10  001  0010  1010  011
+    TRCACATR14        = 0x8963, // 10  001  0010  1100  011
+    TRCACATR15        = 0x8973, // 10  001  0010  1110  011
+    TRCDVCVR0         = 0x8904, // 10  001  0010  0000  100
+    TRCDVCVR1         = 0x8924, // 10  001  0010  0100  100
+    TRCDVCVR2         = 0x8944, // 10  001  0010  1000  100
+    TRCDVCVR3         = 0x8964, // 10  001  0010  1100  100
+    TRCDVCVR4         = 0x8905, // 10  001  0010  0000  101
+    TRCDVCVR5         = 0x8925, // 10  001  0010  0100  101
+    TRCDVCVR6         = 0x8945, // 10  001  0010  1000  101
+    TRCDVCVR7         = 0x8965, // 10  001  0010  1100  101
+    TRCDVCMR0         = 0x8906, // 10  001  0010  0000  110
+    TRCDVCMR1         = 0x8926, // 10  001  0010  0100  110
+    TRCDVCMR2         = 0x8946, // 10  001  0010  1000  110
+    TRCDVCMR3         = 0x8966, // 10  001  0010  1100  110
+    TRCDVCMR4         = 0x8907, // 10  001  0010  0000  111
+    TRCDVCMR5         = 0x8927, // 10  001  0010  0100  111
+    TRCDVCMR6         = 0x8947, // 10  001  0010  1000  111
+    TRCDVCMR7         = 0x8967, // 10  001  0010  1100  111
+    TRCCIDCVR0        = 0x8980, // 10  001  0011  0000  000
+    TRCCIDCVR1        = 0x8990, // 10  001  0011  0010  000
+    TRCCIDCVR2        = 0x89a0, // 10  001  0011  0100  000
+    TRCCIDCVR3        = 0x89b0, // 10  001  0011  0110  000
+    TRCCIDCVR4        = 0x89c0, // 10  001  0011  1000  000
+    TRCCIDCVR5        = 0x89d0, // 10  001  0011  1010  000
+    TRCCIDCVR6        = 0x89e0, // 10  001  0011  1100  000
+    TRCCIDCVR7        = 0x89f0, // 10  001  0011  1110  000
+    TRCVMIDCVR0       = 0x8981, // 10  001  0011  0000  001
+    TRCVMIDCVR1       = 0x8991, // 10  001  0011  0010  001
+    TRCVMIDCVR2       = 0x89a1, // 10  001  0011  0100  001
+    TRCVMIDCVR3       = 0x89b1, // 10  001  0011  0110  001
+    TRCVMIDCVR4       = 0x89c1, // 10  001  0011  1000  001
+    TRCVMIDCVR5       = 0x89d1, // 10  001  0011  1010  001
+    TRCVMIDCVR6       = 0x89e1, // 10  001  0011  1100  001
+    TRCVMIDCVR7       = 0x89f1, // 10  001  0011  1110  001
+    TRCCIDCCTLR0      = 0x8982, // 10  001  0011  0000  010
+    TRCCIDCCTLR1      = 0x898a, // 10  001  0011  0001  010
+    TRCVMIDCCTLR0     = 0x8992, // 10  001  0011  0010  010
+    TRCVMIDCCTLR1     = 0x899a, // 10  001  0011  0011  010
+    TRCITCTRL         = 0x8b84, // 10  001  0111  0000  100
+    TRCCLAIMSET       = 0x8bc6, // 10  001  0111  1000  110
+    TRCCLAIMCLR       = 0x8bce, // 10  001  0111  1001  110
+
+    // GICv3 registers
+    ICC_BPR1_EL1      = 0xc663, // 11  000  1100  1100  011
+    ICC_BPR0_EL1      = 0xc643, // 11  000  1100  1000  011
+    ICC_PMR_EL1       = 0xc230, // 11  000  0100  0110  000
+    ICC_CTLR_EL1      = 0xc664, // 11  000  1100  1100  100
+    ICC_CTLR_EL3      = 0xf664, // 11  110  1100  1100  100
+    ICC_SRE_EL1       = 0xc665, // 11  000  1100  1100  101
+    ICC_SRE_EL2       = 0xe64d, // 11  100  1100  1001  101
+    ICC_SRE_EL3       = 0xf665, // 11  110  1100  1100  101
+    ICC_IGRPEN0_EL1   = 0xc666, // 11  000  1100  1100  110
+    ICC_IGRPEN1_EL1   = 0xc667, // 11  000  1100  1100  111
+    ICC_IGRPEN1_EL3   = 0xf667, // 11  110  1100  1100  111
+    ICC_SEIEN_EL1     = 0xc668, // 11  000  1100  1101  000
+    ICC_AP0R0_EL1     = 0xc644, // 11  000  1100  1000  100
+    ICC_AP0R1_EL1     = 0xc645, // 11  000  1100  1000  101
+    ICC_AP0R2_EL1     = 0xc646, // 11  000  1100  1000  110
+    ICC_AP0R3_EL1     = 0xc647, // 11  000  1100  1000  111
+    ICC_AP1R0_EL1     = 0xc648, // 11  000  1100  1001  000
+    ICC_AP1R1_EL1     = 0xc649, // 11  000  1100  1001  001
+    ICC_AP1R2_EL1     = 0xc64a, // 11  000  1100  1001  010
+    ICC_AP1R3_EL1     = 0xc64b, // 11  000  1100  1001  011
+    ICH_AP0R0_EL2     = 0xe640, // 11  100  1100  1000  000
+    ICH_AP0R1_EL2     = 0xe641, // 11  100  1100  1000  001
+    ICH_AP0R2_EL2     = 0xe642, // 11  100  1100  1000  010
+    ICH_AP0R3_EL2     = 0xe643, // 11  100  1100  1000  011
+    ICH_AP1R0_EL2     = 0xe648, // 11  100  1100  1001  000
+    ICH_AP1R1_EL2     = 0xe649, // 11  100  1100  1001  001
+    ICH_AP1R2_EL2     = 0xe64a, // 11  100  1100  1001  010
+    ICH_AP1R3_EL2     = 0xe64b, // 11  100  1100  1001  011
+    ICH_HCR_EL2       = 0xe658, // 11  100  1100  1011  000
+    ICH_MISR_EL2      = 0xe65a, // 11  100  1100  1011  010
+    ICH_VMCR_EL2      = 0xe65f, // 11  100  1100  1011  111
+    ICH_VSEIR_EL2     = 0xe64c, // 11  100  1100  1001  100
+    ICH_LR0_EL2       = 0xe660, // 11  100  1100  1100  000
+    ICH_LR1_EL2       = 0xe661, // 11  100  1100  1100  001
+    ICH_LR2_EL2       = 0xe662, // 11  100  1100  1100  010
+    ICH_LR3_EL2       = 0xe663, // 11  100  1100  1100  011
+    ICH_LR4_EL2       = 0xe664, // 11  100  1100  1100  100
+    ICH_LR5_EL2       = 0xe665, // 11  100  1100  1100  101
+    ICH_LR6_EL2       = 0xe666, // 11  100  1100  1100  110
+    ICH_LR7_EL2       = 0xe667, // 11  100  1100  1100  111
+    ICH_LR8_EL2       = 0xe668, // 11  100  1100  1101  000
+    ICH_LR9_EL2       = 0xe669, // 11  100  1100  1101  001
+    ICH_LR10_EL2      = 0xe66a, // 11  100  1100  1101  010
+    ICH_LR11_EL2      = 0xe66b, // 11  100  1100  1101  011
+    ICH_LR12_EL2      = 0xe66c, // 11  100  1100  1101  100
+    ICH_LR13_EL2      = 0xe66d, // 11  100  1100  1101  101
+    ICH_LR14_EL2      = 0xe66e, // 11  100  1100  1101  110
+    ICH_LR15_EL2      = 0xe66f, // 11  100  1100  1101  111
+  };
+
+  // Cyclone specific system registers
+  enum CycloneSysRegValues {
+    CPM_IOACC_CTL_EL3 = 0xff90
+  };
+
+  // Note that these do not inherit from AArch64NamedImmMapper. This class is
+  // sufficiently different in its behaviour that I don't believe it's worth
+  // burdening the common AArch64NamedImmMapper with abstractions only needed in
+  // this one case.
+  struct SysRegMapper {
+    static const AArch64NamedImmMapper::Mapping SysRegPairs[];
+    static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[];
+
+    const AArch64NamedImmMapper::Mapping *InstPairs;
+    size_t NumInstPairs;
+    uint64_t FeatureBits;
+
+    SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }
+    uint32_t fromString(StringRef Name, bool &Valid) const;
+    std::string toString(uint32_t Bits, bool &Valid) const;
+  };
+
+  struct MSRMapper : SysRegMapper {
+    static const AArch64NamedImmMapper::Mapping MSRPairs[];
+    MSRMapper(uint64_t FeatureBits);
+  };
+
+  struct MRSMapper : SysRegMapper {
+    static const AArch64NamedImmMapper::Mapping MRSPairs[];
+    MRSMapper(uint64_t FeatureBits);
+  };
+
+  uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+}
+
+namespace AArch64TLBI {
+  enum TLBIValues {
+    Invalid = -1,          // Op0 Op1  CRn   CRm   Op2
+    IPAS2E1IS    = 0x6401, // 01  100  1000  0000  001
+    IPAS2LE1IS   = 0x6405, // 01  100  1000  0000  101
+    VMALLE1IS    = 0x4418, // 01  000  1000  0011  000
+    ALLE2IS      = 0x6418, // 01  100  1000  0011  000
+    ALLE3IS      = 0x7418, // 01  110  1000  0011  000
+    VAE1IS       = 0x4419, // 01  000  1000  0011  001
+    VAE2IS       = 0x6419, // 01  100  1000  0011  001
+    VAE3IS       = 0x7419, // 01  110  1000  0011  001
+    ASIDE1IS     = 0x441a, // 01  000  1000  0011  010
+    VAAE1IS      = 0x441b, // 01  000  1000  0011  011
+    ALLE1IS      = 0x641c, // 01  100  1000  0011  100
+    VALE1IS      = 0x441d, // 01  000  1000  0011  101
+    VALE2IS      = 0x641d, // 01  100  1000  0011  101
+    VALE3IS      = 0x741d, // 01  110  1000  0011  101
+    VMALLS12E1IS = 0x641e, // 01  100  1000  0011  110
+    VAALE1IS     = 0x441f, // 01  000  1000  0011  111
+    IPAS2E1      = 0x6421, // 01  100  1000  0100  001
+    IPAS2LE1     = 0x6425, // 01  100  1000  0100  101
+    VMALLE1      = 0x4438, // 01  000  1000  0111  000
+    ALLE2        = 0x6438, // 01  100  1000  0111  000
+    ALLE3        = 0x7438, // 01  110  1000  0111  000
+    VAE1         = 0x4439, // 01  000  1000  0111  001
+    VAE2         = 0x6439, // 01  100  1000  0111  001
+    VAE3         = 0x7439, // 01  110  1000  0111  001
+    ASIDE1       = 0x443a, // 01  000  1000  0111  010
+    VAAE1        = 0x443b, // 01  000  1000  0111  011
+    ALLE1        = 0x643c, // 01  100  1000  0111  100
+    VALE1        = 0x443d, // 01  000  1000  0111  101
+    VALE2        = 0x643d, // 01  100  1000  0111  101
+    VALE3        = 0x743d, // 01  110  1000  0111  101
+    VMALLS12E1   = 0x643e, // 01  100  1000  0111  110
+    VAALE1       = 0x443f  // 01  000  1000  0111  111
+  };
+
+  struct TLBIMapper : AArch64NamedImmMapper {
+    const static Mapping TLBIPairs[];
+
+    TLBIMapper();
+  };
+
+  static inline bool NeedsRegister(TLBIValues Val) {
+    switch (Val) {
+    case VMALLE1IS:
+    case ALLE2IS:
+    case ALLE3IS:
+    case ALLE1IS:
+    case VMALLS12E1IS:
+    case VMALLE1:
+    case ALLE2:
+    case ALLE3:
+    case ALLE1:
+    case VMALLS12E1:
+      return false;
+    default:
+      return true;
+    }
+  }
+} 
+
+namespace AArch64II {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // AArch64 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    MO_FRAGMENT = 0x7,
+
+    /// MO_PAGE - A symbol operand with this flag represents the pc-relative
+    /// offset of the 4K page containing the symbol.  This is used with the
+    /// ADRP instruction.
+    MO_PAGE = 1,
+
+    /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
+    /// that symbol within a 4K page.  This offset is added to the page address
+    /// to produce the complete address.
+    MO_PAGEOFF = 2,
+
+    /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+    /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G3 = 3,
+
+    /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+    /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G2 = 4,
+
+    /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+    /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G1 = 5,
+
+    /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+    /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G0 = 6,
+
+    /// MO_GOT - This flag indicates that a symbol operand represents the
+    /// address of the GOT entry for the symbol, rather than the address of
+    /// the symbol itself.
+    MO_GOT = 8,
+
+    /// MO_NC - Indicates whether the linker is expected to check the symbol
+    /// reference for overflow. For example in an ADRP/ADD pair of relocations
+    /// the ADRP usually does check, but not the ADD.
+    MO_NC = 0x10,
+
+    /// MO_TLS - Indicates that the operand being accessed is some kind of
+    /// thread-local symbol. On Darwin, only one type of thread-local access
+    /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+    /// referee will affect interpretation.
+    MO_TLS = 0x20
+  };
+} // end namespace AArch64II
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
new file mode 100644
index 0000000..92eaf9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -0,0 +1,710 @@
+//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Cortex-A15 processor employs a tracking scheme in its register renaming
+// in order to process each instruction's micro-ops speculatively and
+// out-of-order with appropriate forwarding. The ARM architecture allows VFP
+// instructions to read and write 32-bit S-registers.  Each S-register
+// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
+//
+// There are several instruction patterns which can be used to provide this
+// capability which can provide higher performance than other, potentially more
+// direct patterns, specifically around when one micro-op reads a D-register
+// operand that has recently been written as one or more S-register results.
+//
+// This file defines a pre-regalloc pass which looks for SPR producers which
+// are going to be used by a DPR (or QPR) consumers and creates the more
+// optimized access pattern.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "a15-sd-optimizer"
+
+namespace {
+  struct A15SDOptimizer : public MachineFunctionPass {
+    static char ID;
+    A15SDOptimizer() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "ARM A15 S->D optimizer";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool runOnInstruction(MachineInstr *MI);
+
+    //
+    // Instruction builder helpers
+    //
+    unsigned createDupLane(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           DebugLoc DL,
+                           unsigned Reg, unsigned Lane,
+                           bool QPR=false);
+
+    unsigned createExtractSubreg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator InsertBefore,
+                                 DebugLoc DL,
+                                 unsigned DReg, unsigned Lane,
+                                 const TargetRegisterClass *TRC);
+
+    unsigned createVExt(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator InsertBefore,
+                        DebugLoc DL,
+                        unsigned Ssub0, unsigned Ssub1);
+
+    unsigned createRegSequence(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               DebugLoc DL,
+                               unsigned Reg1, unsigned Reg2);
+
+    unsigned createInsertSubreg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator InsertBefore,
+                                DebugLoc DL, unsigned DReg, unsigned Lane,
+                                unsigned ToInsert);
+
+    unsigned createImplicitDef(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               DebugLoc DL);
+
+    //
+    // Various property checkers
+    //
+    bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
+    bool hasPartialWrite(MachineInstr *MI);
+    SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
+    unsigned getDPRLaneFromSPR(unsigned SReg);
+
+    //
+    // Methods used for getting the definitions of partial registers
+    //
+
+    MachineInstr *elideCopies(MachineInstr *MI);
+    void elideCopiesAndPHIs(MachineInstr *MI,
+                            SmallVectorImpl<MachineInstr*> &Outs);
+
+    //
+    // Pattern optimization methods
+    //
+    unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
+    unsigned optimizeSDPattern(MachineInstr *MI);
+    unsigned getPrefSPRLane(unsigned SReg);
+
+    //
+    // Sanitizing method - used to make sure if don't leave dead code around.
+    //
+    void eraseInstrWithNoUses(MachineInstr *MI);
+
+    //
+    // A map used to track the changes done by this pass.
+    //
+    std::map<MachineInstr*, unsigned> Replacements;
+    std::set<MachineInstr *> DeadInstr;
+  };
+  char A15SDOptimizer::ID = 0;
+} // end anonymous namespace
+
+// Returns true if this is a use of a SPR register.
+bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
+                                  const TargetRegisterClass *TRC) {
+  if (!MO.isReg())
+    return false;
+  unsigned Reg = MO.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
+  else
+    return TRC->contains(Reg);
+}
+
+unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
+                                           &ARM::DPRRegClass);
+  if (DReg != ARM::NoRegister) return ARM::ssub_1;
+  return ARM::ssub_0;
+}
+
+// Get the subreg type that is most likely to be coalesced
+// for an SPR register that will be used in VDUP32d pseudo.
+unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
+  if (!TRI->isVirtualRegister(SReg))
+    return getDPRLaneFromSPR(SReg);
+
+  MachineInstr *MI = MRI->getVRegDef(SReg);
+  if (!MI) return ARM::ssub_0;
+  MachineOperand *MO = MI->findRegisterDefOperand(SReg);
+
+  assert(MO->isReg() && "Non-register operand found!");
+  if (!MO) return ARM::ssub_0;
+
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
+                                    &ARM::SPRRegClass)) {
+    SReg = MI->getOperand(1).getReg();
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
+    return ARM::ssub_0;
+  }
+  return getDPRLaneFromSPR(SReg);
+}
+
+// MI is known to be dead. Figure out what instructions
+// are also made dead by this and mark them for removal.
+void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
+  SmallVector<MachineInstr *, 8> Front;
+  DeadInstr.insert(MI);
+
+  DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+  Front.push_back(MI);
+
+  while (Front.size() != 0) {
+    MI = Front.back();
+    Front.pop_back();
+
+    // MI is already known to be dead. We need to see
+    // if other instructions can also be removed.
+    for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if ((!MO.isReg()) || (!MO.isUse()))
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!TRI->isVirtualRegister(Reg))
+        continue;
+      MachineOperand *Op = MI->findRegisterDefOperand(Reg);
+
+      if (!Op)
+        continue;
+
+      MachineInstr *Def = Op->getParent();
+
+      // We don't need to do anything if we have already marked
+      // this instruction as being dead.
+      if (DeadInstr.find(Def) != DeadInstr.end())
+        continue;
+
+      // Check if all the uses of this instruction are marked as
+      // dead. If so, we can also mark this instruction as being
+      // dead.
+      bool IsDead = true;
+      for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
+        MachineOperand &MODef = Def->getOperand(j);
+        if ((!MODef.isReg()) || (!MODef.isDef()))
+          continue;
+        unsigned DefReg = MODef.getReg();
+        if (!TRI->isVirtualRegister(DefReg)) {
+          IsDead = false;
+          break;
+        }
+        for (MachineRegisterInfo::use_instr_iterator
+             II = MRI->use_instr_begin(Reg), EE = MRI->use_instr_end();
+             II != EE; ++II) {
+          // We don't care about self references.
+          if (&*II == Def)
+            continue;
+          if (DeadInstr.find(&*II) == DeadInstr.end()) {
+            IsDead = false;
+            break;
+          }
+        }
+      }
+
+      if (!IsDead) continue;
+
+      DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+      DeadInstr.insert(Def);
+    }
+  }
+}
+
+// Creates the more optimized patterns and generally does all the code
+// transformations in this pass.
+unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
+  if (MI->isCopy()) {
+    return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
+  }
+
+  if (MI->isInsertSubreg()) {
+    unsigned DPRReg = MI->getOperand(1).getReg();
+    unsigned SPRReg = MI->getOperand(2).getReg();
+
+    if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+      MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+      MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
+
+      if (DPRMI && SPRMI) {
+        // See if the first operand of this insert_subreg is IMPLICIT_DEF
+        MachineInstr *ECDef = elideCopies(DPRMI);
+        if (ECDef && ECDef->isImplicitDef()) {
+          // Another corner case - if we're inserting something that is purely
+          // a subreg copy of a DPR, just use that DPR.
+
+          MachineInstr *EC = elideCopies(SPRMI);
+          // Is it a subreg copy of ssub_0?
+          if (EC && EC->isCopy() &&
+              EC->getOperand(1).getSubReg() == ARM::ssub_0) {
+            DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+
+            // Find the thing we're subreg copying out of - is it of the same
+            // regclass as DPRMI? (i.e. a DPR or QPR).
+            unsigned FullReg = SPRMI->getOperand(1).getReg();
+            const TargetRegisterClass *TRC =
+              MRI->getRegClass(MI->getOperand(1).getReg());
+            if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
+              DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+              DEBUG(dbgs() << PrintReg(FullReg) << "\n");
+              eraseInstrWithNoUses(MI);
+              return FullReg;
+            }
+          }
+
+          return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
+        }
+      }
+    }
+    return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
+                                          &ARM::SPRRegClass)) {
+    // See if all bar one of the operands are IMPLICIT_DEF and insert the
+    // optimizer pattern accordingly.
+    unsigned NumImplicit = 0, NumTotal = 0;
+    unsigned NonImplicitReg = ~0U;
+
+    for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
+      if (!MI->getOperand(I).isReg())
+        continue;
+      ++NumTotal;
+      unsigned OpReg = MI->getOperand(I).getReg();
+
+      if (!TRI->isVirtualRegister(OpReg))
+        break;
+
+      MachineInstr *Def = MRI->getVRegDef(OpReg);
+      if (!Def)
+        break;
+      if (Def->isImplicitDef())
+        ++NumImplicit;
+      else
+        NonImplicitReg = MI->getOperand(I).getReg();
+    }
+
+    if (NumImplicit == NumTotal - 1)
+      return optimizeAllLanesPattern(MI, NonImplicitReg);
+    else
+      return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  llvm_unreachable("Unhandled update pattern!");
+}
+
+// Return true if this MachineInstr inserts a scalar (SPR) value into
+// a D or Q register.
+bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
+  // The only way we can do a partial register update is through a COPY,
+  // INSERT_SUBREG or REG_SEQUENCE.
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
+                                           &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  return false;
+}
+
+// Looks through full copies to get the instruction that defines the input
+// operand for MI.
+MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
+  if (!MI->isFullCopy())
+    return MI;
+  if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+    return nullptr;
+  MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
+  if (!Def)
+    return nullptr;
+  return elideCopies(Def);
+}
+
+// Look through full copies and PHIs to get the set of non-copy MachineInstrs
+// that can produce MI.
+void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
+                                        SmallVectorImpl<MachineInstr*> &Outs) {
+   // Looking through PHIs may create loops so we need to track what
+   // instructions we have visited before.
+   std::set<MachineInstr *> Reached;
+   SmallVector<MachineInstr *, 8> Front;
+   Front.push_back(MI);
+   while (Front.size() != 0) {
+     MI = Front.back();
+     Front.pop_back();
+
+     // If we have already explored this MachineInstr, ignore it.
+     if (Reached.find(MI) != Reached.end())
+       continue;
+     Reached.insert(MI);
+     if (MI->isPHI()) {
+       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+         unsigned Reg = MI->getOperand(I).getReg();
+         if (!TRI->isVirtualRegister(Reg)) {
+           continue;
+         }
+         MachineInstr *NewMI = MRI->getVRegDef(Reg);
+         if (!NewMI)
+           continue;
+         Front.push_back(NewMI);
+       }
+     } else if (MI->isFullCopy()) {
+       if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+         continue;
+       MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+       if (!NewMI)
+         continue;
+       Front.push_back(NewMI);
+     } else {
+       DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
+       Outs.push_back(MI);
+     }
+   }
+}
+
+// Return the DPR virtual registers that are read by this machine instruction
+// (if any).
+SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
+  if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
+      MI->isKill())
+    return SmallVector<unsigned, 8>();
+
+  SmallVector<unsigned, 8> Defs;
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    if (!usesRegClass(MO, &ARM::DPRRegClass) &&
+        !usesRegClass(MO, &ARM::QPRRegClass) &&
+        !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR
+      continue;
+
+    Defs.push_back(MO.getReg());
+  }
+  return Defs;
+}
+
+// Creates a DPR register from an SPR one by using a VDUP.
+unsigned
+A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator InsertBefore,
+                              DebugLoc DL,
+                              unsigned Reg, unsigned Lane, bool QPR) {
+  unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
+                                                  &ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
+                         Out)
+                   .addReg(Reg)
+                   .addImm(Lane));
+
+  return Out;
+}
+
+// Creates a SPR register from a DPR by copying the value in lane 0.
+unsigned
+A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator InsertBefore,
+                                    DebugLoc DL,
+                                    unsigned DReg, unsigned Lane,
+                                    const TargetRegisterClass *TRC) {
+  unsigned Out = MRI->createVirtualRegister(TRC);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::COPY), Out)
+    .addReg(DReg, 0, Lane);
+
+  return Out;
+}
+
+// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
+unsigned
+A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  DebugLoc DL,
+                                  unsigned Reg1, unsigned Reg2) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::REG_SEQUENCE), Out)
+    .addReg(Reg1)
+    .addImm(ARM::dsub_0)
+    .addReg(Reg2)
+    .addImm(ARM::dsub_1);
+  return Out;
+}
+
+// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
+// and merges them into one DPR register.
+unsigned
+A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           DebugLoc DL,
+                           unsigned Ssub0, unsigned Ssub1) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(ARM::VEXTd32), Out)
+                   .addReg(Ssub0)
+                   .addReg(Ssub1)
+                   .addImm(1));
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator InsertBefore,
+                                   DebugLoc DL, unsigned DReg, unsigned Lane,
+                                   unsigned ToInsert) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::INSERT_SUBREG), Out)
+    .addReg(DReg)
+    .addReg(ToInsert)
+    .addImm(Lane);
+
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  DebugLoc DL) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::IMPLICIT_DEF), Out);
+  return Out;
+}
+
+// This function inserts instructions in order to optimize interactions between
+// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
+// lanes, and the using VEXT instructions to recompose the result.
+unsigned
+A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
+  MachineBasicBlock::iterator InsertPt(MI);
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock &MBB = *MI->getParent();
+  InsertPt++;
+  unsigned Out;
+
+  // DPair has the same length as QPR and also has two DPRs as subreg.
+  // Treat DPair as QPR.
+  if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
+      MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
+    unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_0, &ARM::DPRRegClass);
+    unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_1, &ARM::DPRRegClass);
+
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+    unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
+    unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
+    Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
+
+    Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
+
+  } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+  } else {
+    assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
+           "Found unexpected regclass!");
+
+    unsigned PrefLane = getPrefSPRLane(Reg);
+    unsigned Lane;
+    switch (PrefLane) {
+      case ARM::ssub_0: Lane = 0; break;
+      case ARM::ssub_1: Lane = 1; break;
+      default: llvm_unreachable("Unknown preferred lane!");
+    }
+
+    // Treat DPair as QPR
+    bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
+                   usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);
+
+    Out = createImplicitDef(MBB, InsertPt, DL);
+    Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
+    Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
+    eraseInstrWithNoUses(MI);
+  }
+  return Out;
+}
+
+bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
+  // We look for instructions that write S registers that are then read as
+  // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
+  // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
+  // merge two SPR values to form a DPR register.  In order avoid false
+  // positives we make sure that there is an SPR producer so we look past
+  // COPY and PHI nodes to find it.
+  //
+  // The best code pattern for when an SPR producer is going to be used by a
+  // DPR or QPR consumer depends on whether the other lanes of the
+  // corresponding DPR/QPR are currently defined.
+  //
+  // We can handle these efficiently, depending on the type of
+  // pseudo-instruction that is producing the pattern
+  //
+  //   * COPY:          * VDUP all lanes and merge the results together
+  //                      using VEXTs.
+  //
+  //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
+  //                      lane, and the other lane(s) of the DPR/QPR register
+  //                      that we are inserting in are undefined, use the
+  //                      original DPR/QPR value.
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+  //   * REG_SEQUENCE:  * If all except one of the input operands are
+  //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
+  //                      defined input operand
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+
+  // First, get all the reads of D-registers done by this instruction.
+  SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
+  bool Modified = false;
+
+  for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
+     I != E; ++I) {
+    // Follow the def-use chain for this DPR through COPYs, and also through
+    // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
+    // we can end up with multiple defs of this DPR.
+
+    SmallVector<MachineInstr *, 8> DefSrcs;
+    if (!TRI->isVirtualRegister(*I))
+      continue;
+    MachineInstr *Def = MRI->getVRegDef(*I);
+    if (!Def)
+      continue;
+
+    elideCopiesAndPHIs(Def, DefSrcs);
+
+    for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
+      EE = DefSrcs.end(); II != EE; ++II) {
+      MachineInstr *MI = *II;
+
+      // If we've already analyzed and replaced this operand, don't do
+      // anything.
+      if (Replacements.find(MI) != Replacements.end())
+        continue;
+
+      // Now, work out if the instruction causes a SPR->DPR dependency.
+      if (!hasPartialWrite(MI))
+        continue;
+
+      // Collect all the uses of this MI's DPR def for updating later.
+      SmallVector<MachineOperand*, 8> Uses;
+      unsigned DPRDefReg = MI->getOperand(0).getReg();
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
+             E = MRI->use_end(); I != E; ++I)
+        Uses.push_back(&*I);
+
+      // We can optimize this.
+      unsigned NewReg = optimizeSDPattern(MI);
+
+      if (NewReg != 0) {
+        Modified = true;
+        for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
+               E = Uses.end(); I != E; ++I) {
+          // Make sure to constrain the register class of the new register to
+          // match what we're replacing. Otherwise we can optimize a DPR_VFP2
+          // reference into a plain DPR, and that will end poorly. NewReg is
+          // always virtual here, so there will always be a matching subclass
+          // to find.
+          MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
+
+          DEBUG(dbgs() << "Replacing operand "
+                       << **I << " with "
+                       << PrintReg(NewReg) << "\n");
+          (*I)->substVirtReg(NewReg, 0, *TRI);
+        }
+      }
+      Replacements[MI] = NewReg;
+    }
+  }
+  return Modified;
+}
+
+bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
+  TRI = Fn.getTarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  bool Modified = false;
+
+  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+
+  DeadInstr.clear();
+  Replacements.clear();
+
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+
+    for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
+      MI != ME;) {
+      Modified |= runOnInstruction(MI++);
+    }
+
+  }
+
+  for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
+                                            E = DeadInstr.end();
+                                            I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+
+  return Modified;
+}
+
+FunctionPass *llvm::createA15SDOptimizerPass() {
+  return new A15SDOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
new file mode 100644
index 0000000..55df29c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -0,0 +1,57 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_H
+#define TARGET_ARM_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class ARMAsmPrinter;
+class ARMBaseTargetMachine;
+class FunctionPass;
+class ImmutablePass;
+class JITCodeEmitter;
+class MachineInstr;
+class MCInst;
+class TargetLowering;
+class TargetMachine;
+
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+FunctionPass *createA15SDOptimizerPass();
+FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMGlobalBaseRegPass();
+FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
+FunctionPass *createARMConstantIslandPass();
+FunctionPass *createMLxExpansionPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createARMOptimizeBarriersPass();
+FunctionPass *createThumb2SizeReductionPass();
+
+/// \brief Creates an ARM-specific Target Transformation Info pass.
+ImmutablePass *createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM);
+
+void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                  ARMAsmPrinter &AP);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
new file mode 100644
index 0000000..7916ccc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -0,0 +1,440 @@
+//===-- ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget state.
+//
+
+def ModeThumb  : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
+                                  "Thumb mode">;
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+                                   "Enable VFP2 instructions">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
+                                   "Enable VFP3 instructions",
+                                   [FeatureVFP2]>;
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+                                   "Enable NEON instructions",
+                                   [FeatureVFP3]>;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
+                                     "Enable Thumb2 instructions">;
+def FeatureNoARM  : SubtargetFeature<"noarm", "NoARM", "true",
+                                     "Does not support ARM mode execution",
+                                     [ModeThumb]>;
+def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
+                                     "Enable half-precision floating point">;
+def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+                                     "Enable VFP4 instructions",
+                                     [FeatureVFP3, FeatureFP16]>;
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
+                                   "true", "Enable ARMv8 FP",
+                                   [FeatureVFP4]>;
+def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
+                                     "Restrict VFP3 to 16 double registers">;
+def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
+                                     "Enable divide instructions">;
+def FeatureHWDivARM  : SubtargetFeature<"hwdiv-arm",
+                                        "HasHardwareDivideInARM", "true",
+                                      "Enable divide instructions in ARM mode">;
+def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
+                                 "Enable Thumb2 extract and pack instructions">;
+def FeatureDB     : SubtargetFeature<"db", "HasDataBarrier", "true",
+                                   "Has data barrier (dmb / dsb) instructions">;
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+                                         "FP compare + branch is slow">;
+def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
+                          "Floating point unit supports single precision only">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+                           "Enable support for Performance Monitor extensions">;
+def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
+                          "Enable support for TrustZone security extensions">;
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+                          "Enable support for Cryptography extensions",
+                          [FeatureNEON]>;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+                          "Enable support for CRC instructions">;
+
+// Cyclone has preferred instructions for zeroing VFP registers, which can
+// execute in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                                        "Has zero-cycle zeroing instructions">;
+
+// Some processors have FP multiply-accumulate instructions that don't
+// play nicely with other VFP / NEON instructions, and it's generally better
+// to just not use them.
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+                                         "Disable VFP / NEON MAC instructions">;
+
+// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
+def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
+                                       "HasVMLxForwarding", "true",
+                                       "Has multiplier accumulator forwarding">;
+
+// Some processors benefit from using NEON instructions for scalar
+// single-precision FP operations.
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
+                                        "true",
+                                        "Use NEON for single precision FP">;
+
+// Disable 32-bit to 16-bit narrowing for experimentation.
+def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
+                                             "Prefer 32-bit Thumb instrs">;
+
+/// Some instructions update CPSR partially, which can add false dependency for
+/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
+/// mapped to a separate physical register. Avoid partial CPSR update for these
+/// processors.
+def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
+                                               "AvoidCPSRPartialUpdate", "true",
+                                 "Avoid CPSR partial update for OOO execution">;
+
+def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
+                                            "AvoidMOVsShifterOperand", "true",
+                                "Avoid movs instructions with shifter operand">;
+
+// Some processors perform return stack prediction. CodeGen should avoid issue
+// "normal" call instructions to callees which do not return.
+def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true",
+                                     "Has return address stack">;
+
+/// Some M architectures don't have the DSP extension (v7E-M vs. v7M)
+def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
+                                 "Supports v7 DSP instructions in Thumb2">;
+
+// Multiprocessing extension.
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+                                 "Supports Multiprocessing extension">;
+
+// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
+def FeatureVirtualization : SubtargetFeature<"virtualization",
+                                 "HasVirtualization", "true",
+                                 "Supports Virtualization extension",
+                                 [FeatureHWDiv, FeatureHWDivARM]>;
+
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
+                                     "Is microcontroller profile ('M' series)">;
+
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+                                     "Is realtime profile ('R' series)">;
+
+// A-series ISA
+def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
+                                     "Is application profile ('A' series)">;
+
+// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
+// See ARMInstrInfo.td for details.
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
+                                       "NaCl trap">;
+
+// ARM ISAs.
+def HasV4TOps   : SubtargetFeature<"v4t", "HasV4TOps", "true",
+                                   "Support ARM v4T instructions">;
+def HasV5TOps   : SubtargetFeature<"v5t", "HasV5TOps", "true",
+                                   "Support ARM v5T instructions",
+                                   [HasV4TOps]>;
+def HasV5TEOps  : SubtargetFeature<"v5te", "HasV5TEOps", "true",
+                             "Support ARM v5TE, v5TEj, and v5TExp instructions",
+                                   [HasV5TOps]>;
+def HasV6Ops    : SubtargetFeature<"v6", "HasV6Ops", "true",
+                                   "Support ARM v6 instructions",
+                                   [HasV5TEOps]>;
+def HasV6MOps   : SubtargetFeature<"v6m", "HasV6MOps", "true",
+                                   "Support ARM v6M instructions",
+                                   [HasV6Ops]>;
+def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
+                                   "Support ARM v6t2 instructions",
+                                   [HasV6MOps, FeatureThumb2]>;
+def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
+                                   "Support ARM v7 instructions",
+                                   [HasV6T2Ops, FeaturePerfMon]>;
+def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
+                                   "Support ARM v8 instructions",
+                                   [HasV7Ops, FeatureVirtualization,
+                                    FeatureMP]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Processors supported.
+//
+
+include "ARMSchedule.td"
+
+// ARM processor families.
+def ProcA5      : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
+                                   "Cortex-A5 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk,
+                                    FeatureTrustZone, FeatureMP]>;
+def ProcA7      : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7",
+                                   "Cortex-A7 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk,
+                                    FeatureVFP4, FeatureMP,
+                                    FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureVirtualization]>;
+def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
+                                   "Cortex-A8 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk,
+                                    FeatureTrustZone]>;
+def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
+                                   "Cortex-A9 ARM processors",
+                                   [FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureFP16,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureTrustZone]>;
+def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+                                   "Swift ARM processors",
+                                   [FeatureNEONForFP, FeatureT2XtPk,
+                                    FeatureVFP4, FeatureMP, FeatureHWDiv,
+                                    FeatureHWDivARM, FeatureAvoidPartialCPSR,
+                                    FeatureAvoidMOVsShOp,
+                                    FeatureHasSlowFPVMLx, FeatureTrustZone]>;
+def ProcA12     : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12",
+                                   "Cortex-A12 ARM processors",
+                                   [FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureVFP4,
+                                    FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureVirtualization,
+                                    FeatureTrustZone]>;
+
+
+// FIXME: It has not been determined if A15 has these features.
+def ProcA15      : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+                                   "Cortex-A15 ARM processors",
+                                   [FeatureT2XtPk, FeatureVFP4,
+                                    FeatureMP, FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureTrustZone, FeatureVirtualization]>;
+
+def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+                                   "Cortex-A53 ARM processors",
+                                   [FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureT2XtPk,
+                                    FeatureCrypto, FeatureCRC]>;
+
+def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+                                   "Cortex-A57 ARM processors",
+                                   [FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureT2XtPk,
+                                    FeatureCrypto, FeatureCRC]>;
+
+def ProcR5      : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
+                                   "Cortex-R5 ARM processors",
+                                   [FeatureSlowFPBrcc,
+                                    FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureHasSlowFPVMLx,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureT2XtPk]>;
+
+// FIXME: krait has currently the same features as A9
+// plus VFP4 and hardware division features.
+def ProcKrait   : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
+                                   "Qualcomm ARM processors",
+                                   [FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureFP16,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureTrustZone,
+                                    FeatureVFP4,
+                                    FeatureHWDiv,
+                                    FeatureHWDivARM]>;
+
+
+def FeatureAPCS  : SubtargetFeature<"apcs", "TargetABI", "ARM_ABI_APCS",
+                                   "Use the APCS ABI">;
+
+def FeatureAAPCS : SubtargetFeature<"aapcs", "TargetABI", "ARM_ABI_AAPCS",
+                                   "Use the AAPCS ABI">;
+
+
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// V4 Processors.
+def : ProcNoItin<"generic",         []>;
+def : ProcNoItin<"arm8",            []>;
+def : ProcNoItin<"arm810",          []>;
+def : ProcNoItin<"strongarm",       []>;
+def : ProcNoItin<"strongarm110",    []>;
+def : ProcNoItin<"strongarm1100",   []>;
+def : ProcNoItin<"strongarm1110",   []>;
+
+// V4T Processors.
+def : ProcNoItin<"arm7tdmi",        [HasV4TOps]>;
+def : ProcNoItin<"arm7tdmi-s",      [HasV4TOps]>;
+def : ProcNoItin<"arm710t",         [HasV4TOps]>;
+def : ProcNoItin<"arm720t",         [HasV4TOps]>;
+def : ProcNoItin<"arm9",            [HasV4TOps]>;
+def : ProcNoItin<"arm9tdmi",        [HasV4TOps]>;
+def : ProcNoItin<"arm920",          [HasV4TOps]>;
+def : ProcNoItin<"arm920t",         [HasV4TOps]>;
+def : ProcNoItin<"arm922t",         [HasV4TOps]>;
+def : ProcNoItin<"arm940t",         [HasV4TOps]>;
+def : ProcNoItin<"ep9312",          [HasV4TOps]>;
+
+// V5T Processors.
+def : ProcNoItin<"arm10tdmi",       [HasV5TOps]>;
+def : ProcNoItin<"arm1020t",        [HasV5TOps]>;
+
+// V5TE Processors.
+def : ProcNoItin<"arm9e",           [HasV5TEOps]>;
+def : ProcNoItin<"arm926ej-s",      [HasV5TEOps]>;
+def : ProcNoItin<"arm946e-s",       [HasV5TEOps]>;
+def : ProcNoItin<"arm966e-s",       [HasV5TEOps]>;
+def : ProcNoItin<"arm968e-s",       [HasV5TEOps]>;
+def : ProcNoItin<"arm10e",          [HasV5TEOps]>;
+def : ProcNoItin<"arm1020e",        [HasV5TEOps]>;
+def : ProcNoItin<"arm1022e",        [HasV5TEOps]>;
+def : ProcNoItin<"xscale",          [HasV5TEOps]>;
+def : ProcNoItin<"iwmmxt",          [HasV5TEOps]>;
+
+// V6 Processors.
+def : Processor<"arm1136j-s",       ARMV6Itineraries, [HasV6Ops]>;
+def : Processor<"arm1136jf-s",      ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
+def : Processor<"arm1176jz-s",      ARMV6Itineraries, [HasV6Ops]>;
+def : Processor<"arm1176jzf-s",     ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
+def : Processor<"mpcorenovfp",      ARMV6Itineraries, [HasV6Ops]>;
+def : Processor<"mpcore",           ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
+
+// V6M Processors.
+def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
+                                                       FeatureDB, FeatureMClass]>;
+
+// V6T2 Processors.
+def : Processor<"arm1156t2-s",      ARMV6Itineraries, [HasV6T2Ops,
+                                                       FeatureDSPThumb2]>;
+def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx,
+                                                       FeatureDSPThumb2]>;
+
+// V7a Processors.
+// FIXME: A5 has currently the same Schedule model as A8
+def : ProcessorModel<"cortex-a5",   CortexA8Model,
+                                    [ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureVFP4, FeatureDSPThumb2,
+                                     FeatureHasRAS, FeatureAClass]>;
+def : ProcessorModel<"cortex-a7",   CortexA8Model,
+                                    [ProcA7, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
+def : ProcessorModel<"cortex-a8",   CortexA8Model,
+                                    [ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
+def : ProcessorModel<"cortex-a9",   CortexA9Model,
+                                    [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
+def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
+                                    [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureMP,
+                                     FeatureHasRAS, FeatureAClass]>;
+
+// FIXME: A12 has currently the same Schedule model as A9
+def : ProcessorModel<"cortex-a12", CortexA9Model,
+                                    [ProcA12, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureMP,
+                                     FeatureHasRAS, FeatureAClass]>;
+
+// FIXME: A15 has currently the same ProcessorModel as A9.
+def : ProcessorModel<"cortex-a15",   CortexA9Model,
+                                    [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
+
+// FIXME: krait has currently the same Schedule model as A9
+def : ProcessorModel<"krait",       CortexA9Model,
+                                    [ProcKrait, HasV7Ops,
+                                     FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
+
+// FIXME: R5 has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r5",   CortexA8Model,
+                                    [ProcR5, HasV7Ops, FeatureDB,
+                                     FeatureVFP3, FeatureDSPThumb2,
+                                     FeatureHasRAS, FeatureVFPOnlySP,
+                                     FeatureD16, FeatureRClass]>;
+
+// V7M Processors.
+def : ProcNoItin<"cortex-m3",       [HasV7Ops,
+                                     FeatureThumb2, FeatureNoARM, FeatureDB,
+                                     FeatureHWDiv, FeatureMClass]>;
+
+// V7EM Processors.
+def : ProcNoItin<"cortex-m4",       [HasV7Ops,
+                                     FeatureThumb2, FeatureNoARM, FeatureDB,
+                                     FeatureHWDiv, FeatureDSPThumb2,
+                                     FeatureT2XtPk, FeatureVFP4,
+                                     FeatureVFPOnlySP, FeatureD16,
+                                     FeatureMClass]>;
+
+// Swift uArch Processors.
+def : ProcessorModel<"swift",       SwiftModel,
+                                    [ProcSwift, HasV7Ops, FeatureNEON,
+                                     FeatureDB, FeatureDSPThumb2,
+                                     FeatureHasRAS, FeatureAClass]>;
+
+// V8 Processors
+def : ProcNoItin<"cortex-a53",      [ProcA53, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
+def : ProcNoItin<"cortex-a57",      [ProcA57, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
+
+// Cyclone is very similar to swift
+def : ProcessorModel<"cyclone",     SwiftModel,
+                                    [ProcSwift, HasV8Ops, HasV7Ops,
+                                     FeatureCrypto, FeatureFPARMv8,
+                                     FeatureDB,FeatureDSPThumb2,
+                                     FeatureHasRAS, FeatureZCZeroing]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+include "ARMCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARM : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = ARMInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
new file mode 100644
index 0000000..28d2610
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -0,0 +1,1834 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMAsmPrinter.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMFPUName.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+void ARMAsmPrinter::EmitFunctionBodyEnd() {
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitFunctionEntryLabel() {
+  if (AFI->isThumbFunction()) {
+    OutStreamer.EmitAssemblerFlag(MCAF_Code16);
+    OutStreamer.EmitThumbFunc(CurrentFnSym);
+  }
+
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
+  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+  assert(Size && "C++ constructor pointer had zero size!");
+
+  const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
+  assert(GV && "C++ constructor pointer was not a GlobalValue!");
+
+  const MCExpr *E = MCSymbolRefExpr::Create(GetARMGVSymbol(GV,
+                                                           ARMII::MO_NO_FLAG),
+                                            (Subtarget->isTargetELF()
+                                             ? MCSymbolRefExpr::VK_ARM_TARGET1
+                                             : MCSymbolRefExpr::VK_None),
+                                            OutContext);
+
+  OutStreamer.EmitValue(E, Size);
+}
+
+/// runOnMachineFunction - This uses the EmitInstruction()
+/// method to print assembly for each instruction.
+///
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  AFI = MF.getInfo<ARMFunctionInfo>();
+  MCP = MF.getConstantPool();
+
+  SetupMachineFunction(MF);
+
+  if (Subtarget->isTargetCOFF()) {
+    bool Internal = MF.getFunction()->hasInternalLinkage();
+    COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC
+                                            : COFF::IMAGE_SYM_CLASS_EXTERNAL;
+    int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+
+    OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer.EmitCOFFSymbolStorageClass(Scl);
+    OutStreamer.EmitCOFFSymbolType(Type);
+    OutStreamer.EndCOFFSymbolDef();
+  }
+
+  // Have common code print out the function header with linkage info etc.
+  EmitFunctionHeader();
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // We didn't modify anything.
+  return false;
+}
+
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                 raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned TF = MO.getTargetFlags();
+
+  switch (MO.getType()) {
+  default: llvm_unreachable("<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    if(ARM::GPRPairRegClass.contains(Reg)) {
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+      Reg = TRI->getSubReg(Reg, ARM::gsub_0);
+    }
+    O << ARMInstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#';
+    if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
+        (TF == ARMII::MO_LO16))
+      O << ":lower16:";
+    else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
+             (TF == ARMII::MO_HI16))
+      O << ":upper16:";
+    O << Imm;
+    break;
+  }
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
+        (TF & ARMII::MO_LO16))
+      O << ":lower16:";
+    else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
+             (TF & ARMII::MO_HI16))
+      O << ":upper16:";
+    O << *GetARMGVSymbol(GV, TF);
+
+    printOffset(MO.getOffset(), O);
+    if (TF == ARMII::MO_PLT)
+      O << "(PLT)";
+    break;
+  }
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    break;
+  }
+}
+
+//===--------------------------------------------------------------------===//
+
+MCSymbol *ARMAsmPrinter::
+GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
+  const DataLayout *DL = TM.getDataLayout();
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
+    << getFunctionNumber() << '_' << uid << '_' << uid2;
+  return OutContext.GetOrCreateSymbol(Name.str());
+}
+
+
+MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const {
+  const DataLayout *DL = TM.getDataLayout();
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH"
+    << getFunctionNumber();
+  return OutContext.GetOrCreateSymbol(Name.str());
+}
+
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                    unsigned AsmVariant, const char *ExtraCode,
+                                    raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+    case 'a': // Print as a memory address.
+      if (MI->getOperand(OpNum).isReg()) {
+        O << "["
+          << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg())
+          << "]";
+        return false;
+      }
+      // Fallthrough
+    case 'c': // Don't print "#" before an immediate operand.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << MI->getOperand(OpNum).getImm();
+      return false;
+    case 'P': // Print a VFP double precision register.
+    case 'q': // Print a NEON quad precision register.
+      printOperand(MI, OpNum, O);
+      return false;
+    case 'y': // Print a VFP single precision register as indexed double.
+      if (MI->getOperand(OpNum).isReg()) {
+        unsigned Reg = MI->getOperand(OpNum).getReg();
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        // Find the 'd' register that has this 's' register as a sub-register,
+        // and determine the lane number.
+        for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+          if (!ARM::DPRRegClass.contains(*SR))
+            continue;
+          bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg;
+          O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]");
+          return false;
+        }
+      }
+      return true;
+    case 'B': // Bitwise inverse of integer or symbol without a preceding #.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << ~(MI->getOperand(OpNum).getImm());
+      return false;
+    case 'L': // The low 16 bits of an immediate constant.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << (MI->getOperand(OpNum).getImm() & 0xffff);
+      return false;
+    case 'M': { // A register range suitable for LDM/STM.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      unsigned RegBegin = MO.getReg();
+      // This takes advantage of the 2 operand-ness of ldm/stm and that we've
+      // already got the operands in registers that are operands to the
+      // inline asm statement.
+      O << "{";
+      if (ARM::GPRPairRegClass.contains(RegBegin)) {
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
+        RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
+      }
+      O << ARMInstPrinter::getRegisterName(RegBegin);
+
+      // FIXME: The register allocator not only may not have given us the
+      // registers in sequence, but may not be in ascending registers. This
+      // will require changes in the register allocator that'll need to be
+      // propagated down here if the operands change.
+      unsigned RegOps = OpNum + 1;
+      while (MI->getOperand(RegOps).isReg()) {
+        O << ", "
+          << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
+        RegOps++;
+      }
+
+      O << "}";
+
+      return false;
+    }
+    case 'R': // The most significant register of a pair.
+    case 'Q': { // The least significant register of a pair.
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+
+      // This operand may not be the one that actually provides the register. If
+      // it's tied to a previous one then we should refer instead to that one
+      // for registers and their classes.
+      unsigned TiedIdx;
+      if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) {
+        for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) {
+          unsigned OpFlags = MI->getOperand(OpNum).getImm();
+          OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+        }
+        Flags = MI->getOperand(OpNum).getImm();
+
+        // Later code expects OpNum to be pointing at the register rather than
+        // the flags.
+        OpNum += 1;
+      }
+
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      unsigned RC;
+      InlineAsm::hasRegClassConstraint(Flags, RC);
+      if (RC == ARM::GPRPairRegClassID) {
+        if (NumVals != 1)
+          return true;
+        const MachineOperand &MO = MI->getOperand(OpNum);
+        if (!MO.isReg())
+          return true;
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
+            ARM::gsub_0 : ARM::gsub_1);
+        O << ARMInstPrinter::getRegisterName(Reg);
+        return false;
+      }
+      if (NumVals != 2)
+        return true;
+      unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+      if (RegOp >= MI->getNumOperands())
+        return true;
+      const MachineOperand &MO = MI->getOperand(RegOp);
+      if (!MO.isReg())
+        return true;
+      unsigned Reg = MO.getReg();
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+
+    case 'e': // The low doubleword register of a NEON quad register.
+    case 'f': { // The high doubleword register of a NEON quad register.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      unsigned Reg = MI->getOperand(OpNum).getReg();
+      if (!ARM::QPRRegClass.contains(Reg))
+        return true;
+      const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+      unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
+                                       ARM::dsub_0 : ARM::dsub_1);
+      O << ARMInstPrinter::getRegisterName(SubReg);
+      return false;
+    }
+
+    // This modifier is not yet supported.
+    case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
+      return true;
+    case 'H': { // The highest-numbered register of a pair.
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (!MO.isReg())
+        return true;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+      unsigned Reg = MO.getReg();
+      if(!ARM::GPRPairRegClass.contains(Reg))
+        return false;
+      Reg = TRI->getSubReg(Reg, ARM::gsub_1);
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNum, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+      case 'A': // A memory operand for a VLD1/VST1 instruction.
+      default: return true;  // Unknown modifier.
+      case 'm': // The base register of a memory operand.
+        if (!MI->getOperand(OpNum).isReg())
+          return true;
+        O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg());
+        return false;
+    }
+  }
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+static bool isThumb(const MCSubtargetInfo& STI) {
+  return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+}
+
+void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                                     const MCSubtargetInfo *EndInfo) const {
+  // If either end mode is unknown (EndInfo == NULL) or different than
+  // the start mode, then restore the start mode.
+  const bool WasThumb = isThumb(StartInfo);
+  if (!EndInfo || WasThumb != isThumb(*EndInfo)) {
+    OutStreamer.EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
+  }
+}
+
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMachO()) {
+    Reloc::Model RelocM = TM.getRelocationModel();
+    if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
+      // Declare all the text sections up front (before the DWARF sections
+      // emitted by AsmPrinter::doInitialization) so the assembler will keep
+      // them together at the beginning of the object file.  This helps
+      // avoid out-of-range branches that are due a fundamental limitation of
+      // the way symbol offsets are encoded with the current Darwin ARM
+      // relocations.
+      const TargetLoweringObjectFileMachO &TLOFMacho =
+        static_cast<const TargetLoweringObjectFileMachO &>(
+          getObjFileLowering());
+
+      // Collect the set of sections our functions will go into.
+      SetVector<const MCSection *, SmallVector<const MCSection *, 8>,
+        SmallPtrSet<const MCSection *, 8> > TextSections;
+      // Default text section comes first.
+      TextSections.insert(TLOFMacho.getTextSection());
+      // Now any user defined text sections from function attributes.
+      for (Module::iterator F = M.begin(), e = M.end(); F != e; ++F)
+        if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage())
+          TextSections.insert(TLOFMacho.SectionForGlobal(F, *Mang, TM));
+      // Now the coalescable sections.
+      TextSections.insert(TLOFMacho.getTextCoalSection());
+      TextSections.insert(TLOFMacho.getConstTextCoalSection());
+
+      // Emit the sections in the .s file header to fix the order.
+      for (unsigned i = 0, e = TextSections.size(); i != e; ++i)
+        OutStreamer.SwitchSection(TextSections[i]);
+
+      if (RelocM == Reloc::DynamicNoPIC) {
+        const MCSection *sect =
+          OutContext.getMachOSection("__TEXT", "__symbol_stub4",
+                                     MachO::S_SYMBOL_STUBS,
+                                     12, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      } else {
+        const MCSection *sect =
+          OutContext.getMachOSection("__TEXT", "__picsymbolstub4",
+                                     MachO::S_SYMBOL_STUBS,
+                                     16, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      }
+      const MCSection *StaticInitSect =
+        OutContext.getMachOSection("__TEXT", "__StaticInit",
+                                   MachO::S_REGULAR |
+                                   MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   SectionKind::getText());
+      OutStreamer.SwitchSection(StaticInitSect);
+    }
+
+    // Compiling with debug info should not affect the code
+    // generation.  Ensure the cstring section comes before the
+    // optional __DWARF secion. Otherwise, PC-relative loads would
+    // have to use different instruction sequences at "-g" in order to
+    // reach global data in the same object file.
+    OutStreamer.SwitchSection(getObjFileLowering().getCStringSection());
+  }
+
+  // Use unified assembler syntax.
+  OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified);
+
+  // Emit ARM Build Attributes
+  if (Subtarget->isTargetELF())
+    emitAttributes();
+}
+
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  //   .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+  if (MCSym.getInt())
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+  else
+    // Internal to current translation unit.
+    //
+    // When we place the LSDA into the TEXT section, the type info
+    // pointers need to be indirect and pc-rel. We accomplish this by
+    // using NLPs; however, sometimes the types are local to the file.
+    // We need to fill in the value for the NLP in those cases.
+    OutStreamer.EmitValue(
+        MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+        4 /*size*/);
+}
+
+
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMachO()) {
+    // All darwin targets use mach-o.
+    const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output non-lazy-pointers for external and common global variables.
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(2);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(2);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  // Emit a .data.rel section containing any stubs that were created.
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (auto &stub: Stubs) {
+        OutStreamer.EmitLabel(stub.first);
+        OutStreamer.EmitSymbolValue(stub.second.getPointer(),
+                                    TD->getPointerSize(0));
+      }
+      Stubs.clear();
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// FIXME:
+// The following seem like one-off assembler flags, but they actually need
+// to appear in the .ARM.attributes section in ELF.
+// Instead of subclassing the MCELFStreamer, we do the work here.
+
+static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
+                                            const ARMSubtarget *Subtarget) {
+  if (CPU == "xscale")
+    return ARMBuildAttrs::v5TEJ;
+
+  if (Subtarget->hasV8Ops())
+    return ARMBuildAttrs::v8;
+  else if (Subtarget->hasV7Ops()) {
+    if (Subtarget->isMClass() && Subtarget->hasThumb2DSP())
+      return ARMBuildAttrs::v7E_M;
+    return ARMBuildAttrs::v7;
+  } else if (Subtarget->hasV6T2Ops())
+    return ARMBuildAttrs::v6T2;
+  else if (Subtarget->hasV6MOps())
+    return ARMBuildAttrs::v6S_M;
+  else if (Subtarget->hasV6Ops())
+    return ARMBuildAttrs::v6;
+  else if (Subtarget->hasV5TEOps())
+    return ARMBuildAttrs::v5TE;
+  else if (Subtarget->hasV5TOps())
+    return ARMBuildAttrs::v5T;
+  else if (Subtarget->hasV4TOps())
+    return ARMBuildAttrs::v4T;
+  else
+    return ARMBuildAttrs::v4;
+}
+
+void ARMAsmPrinter::emitAttributes() {
+  MCTargetStreamer &TS = *OutStreamer.getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+  ATS.switchVendor("aeabi");
+
+  std::string CPUString = Subtarget->getCPUString();
+
+  // FIXME: remove krait check when GNU tools support krait cpu
+  if (CPUString != "generic" && CPUString != "krait")
+    ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+
+  ATS.emitAttribute(ARMBuildAttrs::CPU_arch,
+                    getArchForCPU(CPUString, Subtarget));
+
+  // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
+  // profile is not applicable (e.g. pre v7, or cross-profile code)".
+  if (Subtarget->hasV7Ops()) {
+    if (Subtarget->isAClass()) {
+      ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                        ARMBuildAttrs::ApplicationProfile);
+    } else if (Subtarget->isRClass()) {
+      ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                        ARMBuildAttrs::RealTimeProfile);
+    } else if (Subtarget->isMClass()) {
+      ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                        ARMBuildAttrs::MicroControllerProfile);
+    }
+  }
+
+  ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ?
+                      ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed);
+  if (Subtarget->isThumb1Only()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::Allowed);
+  } else if (Subtarget->hasThumb2()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::AllowThumb32);
+  }
+
+  if (Subtarget->hasNEON()) {
+    /* NEON is not exactly a VFP architecture, but GAS emit one of
+     * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+    if (Subtarget->hasFPARMv8()) {
+      if (Subtarget->hasCrypto())
+        ATS.emitFPU(ARM::CRYPTO_NEON_FP_ARMV8);
+      else
+        ATS.emitFPU(ARM::NEON_FP_ARMV8);
+    }
+    else if (Subtarget->hasVFP4())
+      ATS.emitFPU(ARM::NEON_VFPV4);
+    else
+      ATS.emitFPU(ARM::NEON);
+    // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+    if (Subtarget->hasV8Ops())
+      ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                        ARMBuildAttrs::AllowNeonARMv8);
+  } else {
+    if (Subtarget->hasFPARMv8())
+      ATS.emitFPU(ARM::FP_ARMV8);
+    else if (Subtarget->hasVFP4())
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
+    else if (Subtarget->hasVFP3())
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3);
+    else if (Subtarget->hasVFP2())
+      ATS.emitFPU(ARM::VFPV2);
+  }
+
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // PIC specific attributes.
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
+                      ARMBuildAttrs::AddressRWPCRel);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data,
+                      ARMBuildAttrs::AddressROPCRel);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+                      ARMBuildAttrs::AddressGOT);
+  } else {
+    // Allow direct addressing of imported data for all other relocation models.
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+                      ARMBuildAttrs::AddressDirect);
+  }
+
+  // Signal various FP modes.
+  if (!TM.Options.UnsafeFPMath) {
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+                      ARMBuildAttrs::Allowed);
+  }
+
+  if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::Allowed);
+  else
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::AllowIEE754);
+
+  // FIXME: add more flags to ARMBuildAttributes.h
+  // 8-bytes alignment stuff.
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1);
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
+
+  // ABI_HardFP_use attribute to indicate single precision FP.
+  if (Subtarget->isFPOnlySP())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+                      ARMBuildAttrs::HardFPSinglePrecision);
+
+  // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
+  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
+
+  // FIXME: Should we signal R9 usage?
+
+  if (Subtarget->hasFP16())
+      ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+  if (Subtarget->hasMPExtension())
+      ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+  // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
+  // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
+  // It is not possible to produce DisallowDIV: if hwdiv is present in the base
+  // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
+  // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
+  // otherwise, the default value (AllowDIVIfExists) applies.
+  if (Subtarget->hasDivideInARMMode() && !Subtarget->hasV8Ops())
+      ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+
+  if (MMI) {
+    if (const Module *SourceModule = MMI->getModule()) {
+      // ABI_PCS_wchar_t to indicate wchar_t width
+      // FIXME: There is no way to emit value 0 (wchar_t prohibited).
+      if (auto WCharWidthValue = cast_or_null<ConstantInt>(
+              SourceModule->getModuleFlag("wchar_size"))) {
+        int WCharWidth = WCharWidthValue->getZExtValue();
+        assert((WCharWidth == 2 || WCharWidth == 4) &&
+               "wchar_t width must be 2 or 4 bytes");
+        ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth);
+      }
+
+      // ABI_enum_size to indicate enum width
+      // FIXME: There is no way to emit value 0 (enums prohibited) or value 3
+      //        (all enums contain a value needing 32 bits to encode).
+      if (auto EnumWidthValue = cast_or_null<ConstantInt>(
+              SourceModule->getModuleFlag("min_enum_size"))) {
+        int EnumWidth = EnumWidthValue->getZExtValue();
+        assert((EnumWidth == 1 || EnumWidth == 4) &&
+               "Minimum enum width must be 1 or 4 bytes");
+        int EnumBuildAttr = EnumWidth == 1 ? 1 : 2;
+        ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr);
+      }
+    }
+  }
+
+  if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowTZVirtualization);
+  else if (Subtarget->hasTrustZone())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowTZ);
+  else if (Subtarget->hasVirtualization())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowVirtualization);
+
+  ATS.finishAttributeSection();
+}
+
+//===----------------------------------------------------------------------===//
+
+static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber,
+                             unsigned LabelId, MCContext &Ctx) {
+
+  MCSymbol *Label = Ctx.GetOrCreateSymbol(Twine(Prefix)
+                       + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId));
+  return Label;
+}
+
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
+  switch (Modifier) {
+  case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
+  case ARMCP::TLSGD:       return MCSymbolRefExpr::VK_TLSGD;
+  case ARMCP::TPOFF:       return MCSymbolRefExpr::VK_TPOFF;
+  case ARMCP::GOTTPOFF:    return MCSymbolRefExpr::VK_GOTTPOFF;
+  case ARMCP::GOT:         return MCSymbolRefExpr::VK_GOT;
+  case ARMCP::GOTOFF:      return MCSymbolRefExpr::VK_GOTOFF;
+  }
+  llvm_unreachable("Invalid ARMCPModifier!");
+}
+
+MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
+                                        unsigned char TargetFlags) {
+  if (Subtarget->isTargetMachO()) {
+    bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) &&
+      Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+
+    if (!IsIndirect)
+      return getSymbol(GV);
+
+    // FIXME: Remove this when Darwin transition to @GOT like syntax.
+    MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    MachineModuleInfoMachO &MMIMachO =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym)
+                                : MMIMachO.getGVStubEntry(MCSym);
+    if (!StubSym.getPointer())
+      StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+                                                   !GV->hasInternalLinkage());
+    return MCSym;
+  } else if (Subtarget->isTargetCOFF()) {
+    assert(Subtarget->isTargetWindows() &&
+           "Windows is the only supported COFF target");
+
+    bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT);
+    if (!IsIndirect)
+      return getSymbol(GV);
+
+    SmallString<128> Name;
+    Name = "__imp_";
+    getNameWithPrefix(Name, GV);
+
+    return OutContext.GetOrCreateSymbol(Name);
+  } else if (Subtarget->isTargetELF()) {
+    return getSymbol(GV);
+  }
+  llvm_unreachable("unexpected target");
+}
+
+void ARMAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  const DataLayout *DL = TM.getDataLayout();
+  int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
+
+  ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+
+  MCSymbol *MCSym;
+  if (ACPV->isLSDA()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+    OS << DL->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
+    MCSym = OutContext.GetOrCreateSymbol(OS.str());
+  } else if (ACPV->isBlockAddress()) {
+    const BlockAddress *BA =
+      cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
+    MCSym = GetBlockAddressSymbol(BA);
+  } else if (ACPV->isGlobalValue()) {
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
+
+    // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so
+    // flag the global as MO_NONLAZY.
+    unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0;
+    MCSym = GetARMGVSymbol(GV, TF);
+  } else if (ACPV->isMachineBasicBlock()) {
+    const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
+    MCSym = MBB->getSymbol();
+  } else {
+    assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+    const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+    MCSym = GetExternalSymbolSymbol(Sym);
+  }
+
+  // Create an MCSymbol for the reference.
+  const MCExpr *Expr =
+    MCSymbolRefExpr::Create(MCSym, getModifierVariantKind(ACPV->getModifier()),
+                            OutContext);
+
+  if (ACPV->getPCAdjustment()) {
+    MCSymbol *PCLabel = getPICLabel(DL->getPrivateGlobalPrefix(),
+                                    getFunctionNumber(),
+                                    ACPV->getLabelId(),
+                                    OutContext);
+    const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, OutContext);
+    PCRelExpr =
+      MCBinaryExpr::CreateAdd(PCRelExpr,
+                              MCConstantExpr::Create(ACPV->getPCAdjustment(),
+                                                     OutContext),
+                              OutContext);
+    if (ACPV->mustAddCurrentAddress()) {
+      // We want "(<expr> - .)", but MC doesn't have a concept of the '.'
+      // label, so just emit a local label end reference that instead.
+      MCSymbol *DotSym = OutContext.CreateTempSymbol();
+      OutStreamer.EmitLabel(DotSym);
+      const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+      PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext);
+    }
+    Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext);
+  }
+  OutStreamer.EmitValue(Expr, Size);
+}
+
+void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  int OpNum = 1;
+  if (Opcode == ARM::BR_JTadd)
+    OpNum = 2;
+  else if (Opcode == ARM::BR_JTm)
+    OpNum = 3;
+
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+
+  // Emit a label for the jump table.
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  // Mark the jump table as data-in-code.
+  OutStreamer.EmitDataRegion(MCDR_DataRegionJT32);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    // Construct an MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr)
+    //
+    // For example, a table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .word (LBB0 - LJTI_0_0)
+    //    .word (LBB1 - LJTI_0_0)
+    const MCExpr *Expr = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+
+    if (TM.getRelocationModel() == Reloc::PIC_)
+      Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol,
+                                                                   OutContext),
+                                     OutContext);
+    // If we're generating a table of Thumb addresses in static relocation
+    // model, we need to add one to keep interworking correctly.
+    else if (AFI->isThumbFunction())
+      Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(1,OutContext),
+                                     OutContext);
+    OutStreamer.EmitValue(Expr, 4);
+  }
+  // Mark the end of jump table data-in-code region.
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  int OpNum = (Opcode == ARM::t2BR_JT) ? 2 : 1;
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  unsigned OffsetWidth = 4;
+  if (MI->getOpcode() == ARM::t2TBB_JT) {
+    OffsetWidth = 1;
+    // Mark the jump table as data-in-code.
+    OutStreamer.EmitDataRegion(MCDR_DataRegionJT8);
+  } else if (MI->getOpcode() == ARM::t2TBH_JT) {
+    OffsetWidth = 2;
+    // Mark the jump table as data-in-code.
+    OutStreamer.EmitDataRegion(MCDR_DataRegionJT16);
+  }
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                                          OutContext);
+    // If this isn't a TBB or TBH, the entries are direct branch instructions.
+    if (OffsetWidth == 4) {
+      EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2B)
+        .addExpr(MBBSymbolExpr)
+        .addImm(ARMCC::AL)
+        .addReg(0));
+      continue;
+    }
+    // Otherwise it's an offset from the dispatch instruction. Construct an
+    // MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr) / 2
+    //
+    // For example, a TBB table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .byte (LBB0 - LJTI_0_0) / 2
+    //    .byte (LBB1 - LJTI_0_0) / 2
+    const MCExpr *Expr =
+      MCBinaryExpr::CreateSub(MBBSymbolExpr,
+                              MCSymbolRefExpr::Create(JTISymbol, OutContext),
+                              OutContext);
+    Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(2, OutContext),
+                                   OutContext);
+    OutStreamer.EmitValue(Expr, OffsetWidth);
+  }
+  // Mark the end of jump table data-in-code region. 32-bit offsets use
+  // actual branch instructions here, so we don't mark those as a data-region
+  // at all.
+  if (OffsetWidth != 4)
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
+  assert(MI->getFlag(MachineInstr::FrameSetup) &&
+      "Only instruction which are involved into frame setup code are allowed");
+
+  MCTargetStreamer &TS = *OutStreamer.getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
+
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned Opc = MI->getOpcode();
+  unsigned SrcReg, DstReg;
+
+  if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
+    // Two special cases:
+    // 1) tPUSH does not have src/dst regs.
+    // 2) for Thumb1 code we sometimes materialize the constant via constpool
+    // load. Yes, this is pretty fragile, but for now I don't see better
+    // way... :(
+    SrcReg = DstReg = ARM::SP;
+  } else {
+    SrcReg = MI->getOperand(1).getReg();
+    DstReg = MI->getOperand(0).getReg();
+  }
+
+  // Try to figure out the unwinding opcode out of src / dst regs.
+  if (MI->mayStore()) {
+    // Register saves.
+    assert(DstReg == ARM::SP &&
+           "Only stack pointer as a destination reg is supported");
+
+    SmallVector<unsigned, 4> RegList;
+    // Skip src & dst reg, and pred ops.
+    unsigned StartOp = 2 + 2;
+    // Use all the operands.
+    unsigned NumOffset = 0;
+
+    switch (Opc) {
+    default:
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    case ARM::tPUSH:
+      // Special case here: no src & dst reg, but two extra imp ops.
+      StartOp = 2; NumOffset = 2;
+    case ARM::STMDB_UPD:
+    case ARM::t2STMDB_UPD:
+    case ARM::VSTMDDB_UPD:
+      assert(SrcReg == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset;
+           i != NumOps; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        // Actually, there should never be any impdef stuff here. Skip it
+        // temporary to workaround PR11902.
+        if (MO.isImplicit())
+          continue;
+        RegList.push_back(MO.getReg());
+      }
+      break;
+    case ARM::STR_PRE_IMM:
+    case ARM::STR_PRE_REG:
+    case ARM::t2STR_PRE:
+      assert(MI->getOperand(2).getReg() == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      RegList.push_back(SrcReg);
+      break;
+    }
+    if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+      ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+  } else {
+    // Changes of stack / frame pointer.
+    if (SrcReg == ARM::SP) {
+      int64_t Offset = 0;
+      switch (Opc) {
+      default:
+        MI->dump();
+        llvm_unreachable("Unsupported opcode for unwinding information");
+      case ARM::MOVr:
+      case ARM::tMOVr:
+        Offset = 0;
+        break;
+      case ARM::ADDri:
+        Offset = -MI->getOperand(2).getImm();
+        break;
+      case ARM::SUBri:
+      case ARM::t2SUBri:
+        Offset = MI->getOperand(2).getImm();
+        break;
+      case ARM::tSUBspi:
+        Offset = MI->getOperand(2).getImm()*4;
+        break;
+      case ARM::tADDspi:
+      case ARM::tADDrSPi:
+        Offset = -MI->getOperand(2).getImm()*4;
+        break;
+      case ARM::tLDRpci: {
+        // Grab the constpool index and check, whether it corresponds to
+        // original or cloned constpool entry.
+        unsigned CPI = MI->getOperand(1).getIndex();
+        const MachineConstantPool *MCP = MF.getConstantPool();
+        if (CPI >= MCP->getConstants().size())
+          CPI = AFI.getOriginalCPIdx(CPI);
+        assert(CPI != -1U && "Invalid constpool index");
+
+        // Derive the actual offset.
+        const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+        assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+        // FIXME: Check for user, it should be "add" instruction!
+        Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+        break;
+      }
+      }
+
+      if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
+        if (DstReg == FramePtr && FramePtr != ARM::SP)
+          // Set-up of the frame pointer. Positive values correspond to "add"
+          // instruction.
+          ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
+        else if (DstReg == ARM::SP) {
+          // Change of SP by an offset. Positive values correspond to "sub"
+          // instruction.
+          ATS.emitPad(Offset);
+        } else {
+          // Move of SP to a register.  Positive values correspond to an "add"
+          // instruction.
+          ATS.emitMovSP(DstReg, -Offset);
+        }
+      }
+    } else if (DstReg == ARM::SP) {
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+    else {
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+  }
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "ARMGenMCPseudoLowering.inc"
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  const DataLayout *DL = TM.getDataLayout();
+
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+
+  // Emit unwinding stuff for frame-related instructions
+  if (Subtarget->isTargetEHABICompatible() &&
+       MI->getFlag(MachineInstr::FrameSetup))
+    EmitUnwindingInstruction(MI);
+
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(OutStreamer, MI))
+    return;
+
+  assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+         "Pseudo flag setting opcode should be expanded early");
+
+  // Check for manual lowerings.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
+  case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
+  case ARM::LEApcrel:
+  case ARM::tLEApcrel:
+  case ARM::t2LEApcrel: {
+    // FIXME: Need to also handle globals and externals
+    MCSymbol *CPISymbol = GetCPISymbol(MI->getOperand(1).getIndex());
+    EmitToStreamer(OutStreamer, MCInstBuilder(MI->getOpcode() ==
+                                              ARM::t2LEApcrel ? ARM::t2ADR
+                  : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
+                     : ARM::ADR))
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(MCSymbolRefExpr::Create(CPISymbol, OutContext))
+      // Add predicate operands.
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(MI->getOperand(3).getReg()));
+    return;
+  }
+  case ARM::LEApcrelJT:
+  case ARM::tLEApcrelJT:
+  case ARM::t2LEApcrelJT: {
+    MCSymbol *JTIPICSymbol =
+      GetARMJTIPICJumpTableLabel2(MI->getOperand(1).getIndex(),
+                                  MI->getOperand(2).getImm());
+    EmitToStreamer(OutStreamer, MCInstBuilder(MI->getOpcode() ==
+                                              ARM::t2LEApcrelJT ? ARM::t2ADR
+                  : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
+                     : ARM::ADR))
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(MCSymbolRefExpr::Create(JTIPICSymbol, OutContext))
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg()));
+    return;
+  }
+  // Darwin call instructions are just normal call instructions with different
+  // clobber semantics (they clobber R9).
+  case ARM::BX_CALL: {
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::BX)
+      .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+  case ARM::tBX_CALL: {
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBX)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::BMOVPCRX_CALL: {
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::BMOVPCB_CALL: {
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    const MachineOperand &Op = MI->getOperand(0);
+    const GlobalValue *GV = Op.getGlobal();
+    const unsigned TF = Op.getTargetFlags();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::Bcc)
+      .addExpr(GVSymExpr)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+
+    unsigned TF = MI->getOperand(1).getTargetFlags();
+    const GlobalValue *GV = MI->getOperand(1).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+
+    MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
+                                     getFunctionNumber(),
+                                     MI->getOperand(2).getImm(), OutContext);
+    const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+    unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
+    const MCExpr *PCRelExpr =
+      ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr,
+                                      MCBinaryExpr::CreateAdd(LabelSymExpr,
+                                      MCConstantExpr::Create(PCAdj, OutContext),
+                                      OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
+
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel
+                      ? ARM::MOVTi16 : ARM::t2MOVTi16);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+
+    unsigned TF = MI->getOperand(2).getTargetFlags();
+    const GlobalValue *GV = MI->getOperand(2).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+
+    MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
+                                     getFunctionNumber(),
+                                     MI->getOperand(3).getImm(), OutContext);
+    const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+    unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
+    const MCExpr *PCRelExpr =
+        ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr,
+                                   MCBinaryExpr::CreateAdd(LabelSymExpr,
+                                      MCConstantExpr::Create(PCAdj, OutContext),
+                                          OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case ARM::tPICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the add.
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tADDhirr)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::PICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the add.
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDrr)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(1).getReg())
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg())
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::PICSTR:
+  case ARM::PICSTRB:
+  case ARM::PICSTRH:
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICLDRH:
+  case ARM::PICLDRSB:
+  case ARM::PICLDRSH: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     OP r0, [pc, r0]
+    // The LCP0 label is referenced by a constant pool entry in order to get
+    // a PC-relative address at the ldr instruction.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the load
+    unsigned Opcode;
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+    case ARM::PICSTR:   Opcode = ARM::STRrs; break;
+    case ARM::PICSTRB:  Opcode = ARM::STRBrs; break;
+    case ARM::PICSTRH:  Opcode = ARM::STRH; break;
+    case ARM::PICLDR:   Opcode = ARM::LDRrs; break;
+    case ARM::PICLDRB:  Opcode = ARM::LDRBrs; break;
+    case ARM::PICLDRH:  Opcode = ARM::LDRH; break;
+    case ARM::PICLDRSB: Opcode = ARM::LDRSB; break;
+    case ARM::PICLDRSH: Opcode = ARM::LDRSH; break;
+    }
+    EmitToStreamer(OutStreamer, MCInstBuilder(Opcode)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(0)
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg()));
+
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: {
+    /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+    /// in the function.  The first operand is the ID# for this instruction, the
+    /// second is the index into the MachineConstantPool that this is, the third
+    /// is the size in bytes of this constant pool entry.
+    /// The required alignment is specified on the basic block holding this MI.
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer.EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    return;
+  }
+  case ARM::t2BR_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    return;
+  }
+  case ARM::t2TBB_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2TBB)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    // Make sure the next instruction is 2-byte aligned.
+    EmitAlignment(1);
+    return;
+  }
+  case ARM::t2TBH_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2TBH)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    return;
+  }
+  case ARM::tBR_JTr:
+  case ARM::BR_JTr: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // mov pc, target
+    MCInst TmpInst;
+    unsigned Opc = MI->getOpcode() == ARM::BR_JTr ?
+      ARM::MOVr : ARM::tMOVr;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    if (Opc == ARM::MOVr)
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+    EmitToStreamer(OutStreamer, TmpInst);
+
+    // Make sure the Thumb jump table is 4-byte aligned.
+    if (Opc == ARM::tMOVr)
+      EmitAlignment(2);
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::BR_JTm: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // ldr pc, target
+    MCInst TmpInst;
+    if (MI->getOperand(1).getReg() == 0) {
+      // literal offset
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+    } else {
+      TmpInst.setOpcode(ARM::LDRrs);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
+    }
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    EmitToStreamer(OutStreamer, TmpInst);
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::BR_JTadd: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // add pc, target, idx
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDrr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::TRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetMachO()) {
+      //.long 0xe7ffdefe @ trap
+      uint32_t Val = 0xe7ffdefeUL;
+      OutStreamer.AddComment("trap");
+      OutStreamer.EmitIntValue(Val, 4);
+      return;
+    }
+    break;
+  }
+  case ARM::TRAPNaCl: {
+    //.long 0xe7fedef0 @ trap
+    uint32_t Val = 0xe7fedef0UL;
+    OutStreamer.AddComment("trap");
+    OutStreamer.EmitIntValue(Val, 4);
+    return;
+  }
+  case ARM::tTRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetMachO()) {
+      //.short 57086 @ trap
+      uint16_t Val = 0xdefe;
+      OutStreamer.AddComment("trap");
+      OutStreamer.EmitIntValue(Val, 2);
+      return;
+    }
+    break;
+  }
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // mov $val, pc
+    // adds $val, #7
+    // str $val, [$src, #4]
+    // movs r0, #0
+    // b 1f
+    // movs r0, #1
+    // 1:
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+    MCSymbol *Label = GetARMSJLJEHLabel();
+    OutStreamer.AddComment("eh_setjmp begin");
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ValReg)
+      .addReg(ARM::PC)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tADDi3)
+      .addReg(ValReg)
+      // 's' bit operand
+      .addReg(ARM::CPSR)
+      .addReg(ValReg)
+      .addImm(7)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tSTRi)
+      .addReg(ValReg)
+      .addReg(SrcReg)
+      // The offset immediate is #4. The operand value is scaled by 4 for the
+      // tSTR instruction.
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVi8)
+      .addReg(ARM::R0)
+      .addReg(ARM::CPSR)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tB)
+      .addExpr(SymbolExpr)
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.AddComment("eh_setjmp end");
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVi8)
+      .addReg(ARM::R0)
+      .addReg(ARM::CPSR)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitLabel(Label);
+    return;
+  }
+
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::Int_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // add $val, pc, #8
+    // str $val, [$src, #+4]
+    // mov r0, #0
+    // add pc, pc, #0
+    // mov r0, #1
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+
+    OutStreamer.AddComment("eh_setjmp begin");
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDri)
+      .addReg(ValReg)
+      .addReg(ARM::PC)
+      .addImm(8)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::STRi12)
+      .addReg(ValReg)
+      .addReg(SrcReg)
+      .addImm(4)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVi)
+      .addReg(ARM::R0)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDri)
+      .addReg(ARM::PC)
+      .addReg(ARM::PC)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    OutStreamer.AddComment("eh_setjmp end");
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVi)
+      .addReg(ARM::R0)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+    return;
+  }
+  case ARM::Int_eh_sjlj_longjmp: {
+    // ldr sp, [$src, #8]
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::LDRi12)
+      .addReg(ARM::SP)
+      .addReg(SrcReg)
+      .addImm(8)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::LDRi12)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      .addImm(4)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::LDRi12)
+      .addReg(ARM::R7)
+      .addReg(SrcReg)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::BX)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::tInt_eh_sjlj_longjmp: {
+    // ldr $scratch, [$src, #8]
+    // mov sp, $scratch
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tLDRi)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      // The offset immediate is #8. The operand value is scaled by 4 for the
+      // tLDR instruction.
+      .addImm(2)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::SP)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tLDRi)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tLDRi)
+      .addReg(ARM::R7)
+      .addReg(SrcReg)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBX)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  }
+
+  MCInst TmpInst;
+  LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
+
+  EmitToStreamer(OutStreamer, TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMAsmPrinter() {
+  RegisterAsmPrinter<ARMAsmPrinter> X(TheARMLETarget);
+  RegisterAsmPrinter<ARMAsmPrinter> Y(TheARMBETarget);
+  RegisterAsmPrinter<ARMAsmPrinter> A(TheThumbLETarget);
+  RegisterAsmPrinter<ARMAsmPrinter> B(TheThumbBETarget);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
new file mode 100644
index 0000000..7c103c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -0,0 +1,126 @@
+//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMASMPRINTER_H
+#define ARMASMPRINTER_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMFunctionInfo;
+class MCOperand;
+class MachineConstantPool;
+class MachineOperand;
+
+namespace ARM {
+  enum DW_ISA {
+    DW_ISA_ARM_thumb = 1,
+    DW_ISA_ARM_arm = 2
+  };
+}
+
+class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const ARMSubtarget *Subtarget;
+
+  /// AFI - Keep a pointer to ARMFunctionInfo for the current
+  /// MachineFunction.
+  ARMFunctionInfo *AFI;
+
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+public:
+  explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer), AFI(nullptr), MCP(nullptr),
+      InConstantPool(false) {
+    Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  }
+
+  const char *getPassName() const override {
+    return "ARM Assembly / Object Emitter";
+  }
+
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                        const MCSubtargetInfo *EndInfo) const override;
+
+  void EmitJumpTable(const MachineInstr *MI);
+  void EmitJump2Table(const MachineInstr *MI);
+  void EmitInstruction(const MachineInstr *MI) override;
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  void EmitConstantPool() override {
+    // we emit constant pools customly!
+  }
+  void EmitFunctionBodyEnd() override;
+  void EmitFunctionEntryLabel() override;
+  void EmitStartOfAsmFile(Module &M) override;
+  void EmitEndOfAsmFile(Module &M) override;
+  void EmitXXStructor(const Constant *CV) override;
+
+  // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+private:
+  // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+  void emitAttributes();
+
+  // Generic helper used to emit e.g. ARMv5 mul pseudos
+  void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc);
+
+  void EmitUnwindingInstruction(const MachineInstr *MI);
+
+  // emitPseudoExpansionLowering - tblgen'erated.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+public:
+  unsigned getISAEncoding() override {
+    // ARM/Darwin adds ISA to the DWARF info for each function.
+    if (!Subtarget->isTargetMachO())
+      return 0;
+    return Subtarget->isThumb() ?
+      ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
+  }
+
+private:
+  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const;
+
+  MCSymbol *GetARMSJLJEHLabel() const;
+
+  MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags);
+
+public:
+  /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+  /// the .s file.
+  void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 0000000..0288db9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,4403 @@
+//===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMFeatures.h"
+#include "ARMHazardRecognizer.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "ARMGenInstrInfo.inc"
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+               cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+static cl::opt<bool>
+WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
+           cl::desc("Widen ARM vmovs to vmovd when possible"));
+
+static cl::opt<unsigned>
+SwiftPartialUpdateClearance("swift-partial-update-clearance",
+     cl::Hidden, cl::init(12),
+     cl::desc("Clearance before partial register updates"));
+
+/// ARM_MLxEntry - Record information about MLA / MLS instructions.
+struct ARM_MLxEntry {
+  uint16_t MLxOpc;     // MLA / MLS opcode
+  uint16_t MulOpc;     // Expanded multiplication opcode
+  uint16_t AddSubOpc;  // Expanded add / sub opcode
+  bool NegAcc;         // True if the acc is negated before the add / sub.
+  bool HasLane;        // True if instruction has an extra "lane" operand.
+};
+
+static const ARM_MLxEntry ARM_MLxTable[] = {
+  // MLxOpc,          MulOpc,           AddSubOpc,       NegAcc, HasLane
+  // fp scalar ops
+  { ARM::VMLAS,       ARM::VMULS,       ARM::VADDS,      false,  false },
+  { ARM::VMLSS,       ARM::VMULS,       ARM::VSUBS,      false,  false },
+  { ARM::VMLAD,       ARM::VMULD,       ARM::VADDD,      false,  false },
+  { ARM::VMLSD,       ARM::VMULD,       ARM::VSUBD,      false,  false },
+  { ARM::VNMLAS,      ARM::VNMULS,      ARM::VSUBS,      true,   false },
+  { ARM::VNMLSS,      ARM::VMULS,       ARM::VSUBS,      true,   false },
+  { ARM::VNMLAD,      ARM::VNMULD,      ARM::VSUBD,      true,   false },
+  { ARM::VNMLSD,      ARM::VMULD,       ARM::VSUBD,      true,   false },
+
+  // fp SIMD ops
+  { ARM::VMLAfd,      ARM::VMULfd,      ARM::VADDfd,     false,  false },
+  { ARM::VMLSfd,      ARM::VMULfd,      ARM::VSUBfd,     false,  false },
+  { ARM::VMLAfq,      ARM::VMULfq,      ARM::VADDfq,     false,  false },
+  { ARM::VMLSfq,      ARM::VMULfq,      ARM::VSUBfq,     false,  false },
+  { ARM::VMLAslfd,    ARM::VMULslfd,    ARM::VADDfd,     false,  true  },
+  { ARM::VMLSslfd,    ARM::VMULslfd,    ARM::VSUBfd,     false,  true  },
+  { ARM::VMLAslfq,    ARM::VMULslfq,    ARM::VADDfq,     false,  true  },
+  { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
+};
+
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
+  : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+    Subtarget(STI) {
+  for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) {
+    if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
+      assert(false && "Duplicated entries?");
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc);
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc);
+  }
+}
+
+// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl
+// currently defaults to no prepass hazard recognizer.
+ScheduleHazardRecognizer *
+ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                                               const ScheduleDAG *DAG) const {
+  if (usePreRAHazardRecognizer()) {
+    const InstrItineraryData *II =
+        &static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData();
+    return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched");
+  }
+  return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
+}
+
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                   const ScheduleDAG *DAG) const {
+  if (Subtarget.isThumb2() || Subtarget.hasVFP2())
+    return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG);
+  return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+MachineInstr *
+ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                        MachineBasicBlock::iterator &MBBI,
+                                        LiveVariables *LV) const {
+  // FIXME: Thumb2 support.
+
+  if (!EnableARM3Addr)
+    return nullptr;
+
+  MachineInstr *MI = MBBI;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  uint64_t TSFlags = MI->getDesc().TSFlags;
+  bool isPre = false;
+  switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+  default: return nullptr;
+  case ARMII::IndexModePre:
+    isPre = true;
+    break;
+  case ARMII::IndexModePost:
+    break;
+  }
+
+  // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+  // operation.
+  unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+  if (MemOpc == 0)
+    return nullptr;
+
+  MachineInstr *UpdateMI = nullptr;
+  MachineInstr *MemMI = nullptr;
+  unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned NumOps = MCID.getNumOperands();
+  bool isLoad = !MI->mayStore();
+  const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+  const MachineOperand &Base = MI->getOperand(2);
+  const MachineOperand &Offset = MI->getOperand(NumOps-3);
+  unsigned WBReg = WB.getReg();
+  unsigned BaseReg = Base.getReg();
+  unsigned OffReg = Offset.getReg();
+  unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+  switch (AddrMode) {
+  default: llvm_unreachable("Unknown indexed op!");
+  case ARMII::AddrMode2: {
+    bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+    if (OffReg == 0) {
+      if (ARM_AM::getSOImmVal(Amt) == -1)
+        // Can't encode it in a so_imm operand. This transformation will
+        // add more than 1 instruction. Abandon!
+        return nullptr;
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else if (Amt != 0) {
+      ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+      unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
+        .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  case ARMII::AddrMode3 : {
+    bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+    if (OffReg == 0)
+      // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    else
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  }
+
+  std::vector<MachineInstr*> NewMIs;
+  if (isPre) {
+    if (isLoad)
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(WBReg).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+    NewMIs.push_back(MemMI);
+    NewMIs.push_back(UpdateMI);
+  } else {
+    if (isLoad)
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(BaseReg).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+    if (WB.isDead())
+      UpdateMI->getOperand(0).setIsDead();
+    NewMIs.push_back(UpdateMI);
+    NewMIs.push_back(MemMI);
+  }
+
+  // Transfer LiveVariables states, kill / dead info.
+  if (LV) {
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        unsigned Reg = MO.getReg();
+
+        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+        if (MO.isDef()) {
+          MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+          if (MO.isDead())
+            LV->addVirtualRegisterDead(Reg, NewMI);
+        }
+        if (MO.isUse() && MO.isKill()) {
+          for (unsigned j = 0; j < 2; ++j) {
+            // Look at the two new MI's in reverse order.
+            MachineInstr *NewMI = NewMIs[j];
+            if (!NewMI->readsRegister(Reg))
+              continue;
+            LV->addVirtualRegisterKilled(Reg, NewMI);
+            if (VI.removeKill(MI))
+              VI.Kills.push_back(NewMI);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  MFI->insert(MBBI, NewMIs[1]);
+  MFI->insert(MBBI, NewMIs[0]);
+  return NewMIs[0];
+}
+
+// Branch analysis.
+bool
+ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  TBB = nullptr;
+  FBB = nullptr;
+
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false; // Empty blocks are easy.
+  --I;
+
+  // Walk backwards from the end of the basic block until the branch is
+  // analyzed or we give up.
+  while (isPredicated(I) || I->isTerminator() || I->isDebugValue()) {
+
+    // Flag to be raised on unanalyzeable instructions. This is useful in cases
+    // where we want to clean up on the end of the basic block before we bail
+    // out.
+    bool CantAnalyze = false;
+
+    // Skip over DEBUG values and predicated nonterminators.
+    while (I->isDebugValue() || !I->isTerminator()) {
+      if (I == MBB.begin())
+        return false;
+      --I;
+    }
+
+    if (isIndirectBranchOpcode(I->getOpcode()) ||
+        isJumpTableBranchOpcode(I->getOpcode())) {
+      // Indirect branches and jump tables can't be analyzed, but we still want
+      // to clean up any instructions at the tail of the basic block.
+      CantAnalyze = true;
+    } else if (isUncondBranchOpcode(I->getOpcode())) {
+      TBB = I->getOperand(0).getMBB();
+    } else if (isCondBranchOpcode(I->getOpcode())) {
+      // Bail out if we encounter multiple conditional branches.
+      if (!Cond.empty())
+        return true;
+
+      assert(!FBB && "FBB should have been null.");
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(I->getOperand(1));
+      Cond.push_back(I->getOperand(2));
+    } else if (I->isReturn()) {
+      // Returns can't be analyzed, but we should run cleanup.
+      CantAnalyze = !isPredicated(I);
+    } else {
+      // We encountered other unrecognized terminator. Bail out immediately.
+      return true;
+    }
+
+    // Cleanup code - to be run for unpredicated unconditional branches and
+    //                returns.
+    if (!isPredicated(I) &&
+          (isUncondBranchOpcode(I->getOpcode()) ||
+           isIndirectBranchOpcode(I->getOpcode()) ||
+           isJumpTableBranchOpcode(I->getOpcode()) ||
+           I->isReturn())) {
+      // Forget any previous condition branch information - it no longer applies.
+      Cond.clear();
+      FBB = nullptr;
+
+      // If we can modify the function, delete everything below this
+      // unconditional branch.
+      if (AllowModify) {
+        MachineBasicBlock::iterator DI = std::next(I);
+        while (DI != MBB.end()) {
+          MachineInstr *InstToDelete = DI;
+          ++DI;
+          InstToDelete->eraseFromParent();
+        }
+      }
+    }
+
+    if (CantAnalyze)
+      return true;
+
+    if (I == MBB.begin())
+      return false;
+
+    --I;
+  }
+
+  // We made it past the terminators without bailing out - we must have
+  // analyzed this branch successfully.
+  return false;
+}
+
+
+unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!isCondBranchOpcode(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                               MachineBasicBlock *FBB,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               DebugLoc DL) const {
+  ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+  int BOpc   = !AFI->isThumbFunction()
+    ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
+  int BccOpc = !AFI->isThumbFunction()
+    ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+  bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function();
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "ARM branch conditions have two components!");
+
+  if (!FBB) {
+    if (Cond.empty()) { // Unconditional branch?
+      if (isThumb)
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+      else
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+    } else
+      BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
+        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
+    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+  if (isThumb)
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+  return 2;
+}
+
+bool ARMBaseInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+  return false;
+}
+
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
+  if (MI->isBundle()) {
+    MachineBasicBlock::const_instr_iterator I = MI;
+    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      int PIdx = I->findFirstPredOperandIdx();
+      if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL)
+        return true;
+    }
+    return false;
+  }
+
+  int PIdx = MI->findFirstPredOperandIdx();
+  return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+}
+
+bool ARMBaseInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  unsigned Opc = MI->getOpcode();
+  if (isUncondBranchOpcode(Opc)) {
+    MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
+    MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+      .addImm(Pred[0].getImm())
+      .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI->getOperand(PIdx);
+    PMO.setImm(Pred[0].getImm());
+    MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+    return true;
+  }
+  return false;
+}
+
+bool ARMBaseInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  if (Pred1.size() > 2 || Pred2.size() > 2)
+    return false;
+
+  ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+  ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+  if (CC1 == CC2)
+    return true;
+
+  switch (CC1) {
+  default:
+    return false;
+  case ARMCC::AL:
+    return true;
+  case ARMCC::HS:
+    return CC2 == ARMCC::HI;
+  case ARMCC::LS:
+    return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+  case ARMCC::GE:
+    return CC2 == ARMCC::GT;
+  case ARMCC::LE:
+    return CC2 == ARMCC::LT;
+  }
+}
+
+bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
+        (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
+      Pred.push_back(MO);
+      Found = true;
+    }
+  }
+
+  return Found;
+}
+
+/// isPredicable - Return true if the specified instruction can be predicated.
+/// By default, this returns true for every instruction with a
+/// PredicateOperand.
+bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
+  if (!MI->isPredicable())
+    return false;
+
+  ARMFunctionInfo *AFI =
+    MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+
+  if (AFI->isThumb2Function()) {
+    if (getSubtarget().restrictIT())
+      return isV8EligibleForIT(MI);
+  } else { // non-Thumb
+    if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+      return false;
+  }
+
+  return true;
+}
+
+namespace llvm {
+template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    if (!MO.isDead())
+      return false;
+  }
+  // all definitions of CPSR are dead
+  return true;
+}
+}
+
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
+LLVM_ATTRIBUTE_NOINLINE
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI);
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI) {
+  assert(JTI < JT.size());
+  return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MachineBasicBlock &MBB = *MI->getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.getSize())
+    return MCID.getSize();
+
+  // If this machine instr is an inline asm, measure it.
+  if (MI->getOpcode() == ARM::INLINEASM)
+    return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    // pseudo-instruction sizes are zero.
+    return 0;
+  case TargetOpcode::BUNDLE:
+    return getInstBundleLength(MI);
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel:
+    return 4;
+  case ARM::MOVi32imm:
+  case ARM::t2MOVi32imm:
+    return 8;
+  case ARM::CONSTPOOL_ENTRY:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI->getOperand(2).getImm();
+  case ARM::Int_eh_sjlj_longjmp:
+    return 16;
+  case ARM::tInt_eh_sjlj_longjmp:
+    return 10;
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+    return 20;
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    return 12;
+  case ARM::BR_JTr:
+  case ARM::BR_JTm:
+  case ARM::BR_JTadd:
+  case ARM::tBR_JTr:
+  case ARM::t2BR_JT:
+  case ARM::t2TBB_JT:
+  case ARM::t2TBH_JT: {
+    // These are jumptable branches, i.e. a branch followed by an inlined
+    // jumptable. The size is 4 + 4 * number of entries. For TBB, each
+    // entry is one byte; TBH two byte each.
+    unsigned EntrySize = (Opc == ARM::t2TBB_JT)
+      ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4);
+    unsigned NumOps = MCID.getNumOperands();
+    MachineOperand JTOP =
+      MI->getOperand(NumOps - (MI->isPredicable() ? 3 : 2));
+    unsigned JTI = JTOP.getIndex();
+    const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+    assert(MJTI != nullptr);
+    const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+    assert(JTI < JT.size());
+    // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
+    // 4 aligned. The assembler / linker may add 2 byte padding just before
+    // the JT entries.  The size does not include this padding; the
+    // constant islands pass does separate bookkeeping for it.
+    // FIXME: If we know the size of the function is less than (1 << 16) *2
+    // bytes, we can use 16-bit entries instead. Then there won't be an
+    // alignment issue.
+    unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
+    unsigned NumEntries = getNumJTEntries(JT, JTI);
+    if (Opc == ARM::t2TBB_JT && (NumEntries & 1))
+      // Make sure the instruction that follows TBB is 2-byte aligned.
+      // FIXME: Constant island pass should insert an "ALIGN" instruction
+      // instead.
+      ++NumEntries;
+    return NumEntries * EntrySize + InstSize;
+  }
+  }
+}
+
+unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const {
+  unsigned Size = 0;
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+  while (++I != E && I->isInsideBundle()) {
+    assert(!I->isBundle() && "No nested bundle!");
+    Size += GetInstSizeInBytes(&*I);
+  }
+  return Size;
+}
+
+void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I, DebugLoc DL,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   bool KillSrc) const {
+  bool GPRDest = ARM::GPRRegClass.contains(DestReg);
+  bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
+
+  if (GPRDest && GPRSrc) {
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+                                    .addReg(SrcReg, getKillRegState(KillSrc))));
+    return;
+  }
+
+  bool SPRDest = ARM::SPRRegClass.contains(DestReg);
+  bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
+
+  unsigned Opc = 0;
+  if (SPRDest && SPRSrc)
+    Opc = ARM::VMOVS;
+  else if (GPRDest && SPRSrc)
+    Opc = ARM::VMOVRS;
+  else if (SPRDest && GPRSrc)
+    Opc = ARM::VMOVSR;
+  else if (ARM::DPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD;
+  else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VORRq;
+
+  if (Opc) {
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    if (Opc == ARM::VORRq)
+      MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(MIB);
+    return;
+  }
+
+  // Handle register classes that require multiple instructions.
+  unsigned BeginIdx = 0;
+  unsigned SubRegs = 0;
+  int Spacing = 1;
+
+  // Use VORRq when possible.
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 2;
+  } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 4;
+  // Fall back to VMOVD.
+  } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+  } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+  } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+  } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr;
+    BeginIdx = ARM::gsub_0;
+    SubRegs = 2;
+  } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+    Spacing = 2;
+  } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+    Spacing = 2;
+  } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+    Spacing = 2;
+  }
+
+  assert(Opc && "Impossible reg-to-reg copy");
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstrBuilder Mov;
+
+  // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing);
+    Spacing = -Spacing;
+  }
+#ifndef NDEBUG
+  SmallSet<unsigned, 4> DstRegs;
+#endif
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+    assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+    assert(!DstRegs.count(Src) && "destructive vector copy");
+    DstRegs.insert(Dst);
+#endif
+    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src);
+    // VORR takes two source operands.
+    if (Opc == ARM::VORRq)
+      Mov.addReg(Src);
+    Mov = AddDefaultPred(Mov);
+    // MOVr can set CC.
+    if (Opc == ARM::MOVr)
+      Mov = AddDefaultCC(Mov);
+  }
+  // Add implicit super-register defs and kills to the last instruction.
+  Mov->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    Mov->addRegisterKilled(SrcReg, TRI);
+}
+
+const MachineInstrBuilder &
+ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+                          unsigned SubIdx, unsigned State,
+                          const TargetRegisterInfo *TRI) const {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
+
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
+
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FI),
+                            Align);
+
+  switch (RC->getSize()) {
+    case 4:
+      if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 8:
+      if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+        if (Subtarget.hasV5TEOps()) {
+          MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD));
+          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+          AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+          MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
+
+          AddDefaultPred(MIB);
+        } else {
+          // Fallback to STM instruction, which has existed since the dawn of
+          // time.
+          MachineInstrBuilder MIB =
+            AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
+                             .addFrameIndex(FI).addMemOperand(MMO));
+          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+          AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 16:
+      if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+        // Use aligned spills if the stack can be realigned.
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addFrameIndex(FI)
+                     .addMemOperand(MMO));
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 24:
+      if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+        // Use aligned spills if the stack can be realigned.
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 32:
+      if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          // FIXME: It's possible to only store part of the QQ register if the
+          // spilled def has a sub-register index.
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+                AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 64:
+      if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                         .addFrameIndex(FI))
+                         .addMemOperand(MMO);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
+              AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    default:
+      llvm_unreachable("Unknown reg class!");
+  }
+}
+
+unsigned
+ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                     int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::STRrs:
+  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::STRi12:
+  case ARM::t2STRi12:
+  case ARM::tSTRspi:
+  case ARM::VSTRD:
+  case ARM::VSTRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VST1q64:
+  case ARM::VST1d64TPseudo:
+  case ARM::VST1d64QPseudo:
+    if (MI->getOperand(0).isFI() &&
+        MI->getOperand(2).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+    break;
+  case ARM::VSTMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                    int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+}
+
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(
+                    MachinePointerInfo::getFixedStack(FI),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FI),
+                            Align);
+
+  switch (RC->getSize()) {
+  case 4:
+    if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+
+    } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 8:
+    if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+      MachineInstrBuilder MIB;
+
+      if (Subtarget.hasV5TEOps()) {
+        MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
+        AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+        AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+        MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
+
+        AddDefaultPred(MIB);
+      } else {
+        // Fallback to LDM instruction, which has existed since the dawn of
+        // time.
+        MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA))
+                                 .addFrameIndex(FI).addMemOperand(MMO));
+        MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+      }
+
+      if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+        MIB.addReg(DestReg, RegState::ImplicitDefine);
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 16:
+    if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+                       .addFrameIndex(FI)
+                       .addMemOperand(MMO));
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 24:
+    if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                         .addFrameIndex(FI)
+                         .addMemOperand(MMO));
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+   case 32:
+    if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+        if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 64:
+    if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+      MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                     .addFrameIndex(FI))
+                     .addMemOperand(MMO);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
+      if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+        MIB.addReg(DestReg, RegState::ImplicitDefine);
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  default:
+    llvm_unreachable("Unknown regclass!");
+  }
+}
+
+unsigned
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::LDRrs:
+  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::LDRi12:
+  case ARM::t2LDRi12:
+  case ARM::tLDRspi:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLD1q64:
+  case ARM::VLD1d64TPseudo:
+  case ARM::VLD1d64QPseudo:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLDMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+}
+
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
+  // This hook gets to expand COPY instructions before they become
+  // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
+  // widened to VMOVD.  We prefer the VMOVD when possible because it may be
+  // changed into a VORR that can go down the NEON pipeline.
+  if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15())
+    return false;
+
+  // Look for a copy between even S-registers.  That is where we keep floats
+  // when using NEON v2f32 instructions for f32 arithmetic.
+  unsigned DstRegS = MI->getOperand(0).getReg();
+  unsigned SrcRegS = MI->getOperand(1).getReg();
+  if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
+    return false;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  if (!DstRegD || !SrcRegD)
+    return false;
+
+  // We want to widen this into a DstRegD = VMOVD SrcRegD copy.  This is only
+  // legal if the COPY already defines the full DstRegD, and it isn't a
+  // sub-register insertion.
+  if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI))
+    return false;
+
+  // A dead copy shouldn't show up here, but reject it just in case.
+  if (MI->getOperand(0).isDead())
+    return false;
+
+  // All clear, widen the COPY.
+  DEBUG(dbgs() << "widening:    " << *MI);
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+
+  // Get rid of the old <imp-def> of DstRegD.  Leave it if it defines a Q-reg
+  // or some other super-register.
+  int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD);
+  if (ImpDefIdx != -1)
+    MI->RemoveOperand(ImpDefIdx);
+
+  // Change the opcode and operands.
+  MI->setDesc(get(ARM::VMOVD));
+  MI->getOperand(0).setReg(DstRegD);
+  MI->getOperand(1).setReg(SrcRegD);
+  AddDefaultPred(MIB);
+
+  // We are now reading SrcRegD instead of SrcRegS.  This may upset the
+  // register scavenger and machine verifier, so we need to indicate that we
+  // are reading an undefined value from SrcRegD, but a proper value from
+  // SrcRegS.
+  MI->getOperand(1).setIsUndef();
+  MIB.addReg(SrcRegS, RegState::Implicit);
+
+  // SrcRegD may actually contain an unrelated value in the ssub_1
+  // sub-register.  Don't kill it.  Only kill the ssub_0 sub-register.
+  if (MI->getOperand(1).isKill()) {
+    MI->getOperand(1).setIsKill(false);
+    MI->addRegisterKilled(SrcRegS, TRI, true);
+  }
+
+  DEBUG(dbgs() << "replaced by: " << *MI);
+  return true;
+}
+
+/// Create a copy of a const pool value. Update CPI to the new index and return
+/// the label UID.
+static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
+  MachineConstantPool *MCP = MF.getConstantPool();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
+  assert(MCPE.isMachineConstantPoolEntry() &&
+         "Expecting a machine constantpool entry!");
+  ARMConstantPoolValue *ACPV =
+    static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  ARMConstantPoolValue *NewCPV = nullptr;
+
+  // FIXME: The below assumes PIC relocation model and that the function
+  // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
+  // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR
+  // instructions, so that's probably OK, but is PIC always correct when
+  // we get here?
+  if (ACPV->isGlobalValue())
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId,
+             ARMCP::CPValue, 4);
+  else if (ACPV->isExtSymbol())
+    NewCPV = ARMConstantPoolSymbol::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4);
+  else if (ACPV->isBlockAddress())
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId,
+             ARMCP::CPBlockAddress, 4);
+  else if (ACPV->isLSDA())
+    NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId,
+                                             ARMCP::CPLSDA, 4);
+  else if (ACPV->isMachineBasicBlock())
+    NewCPV = ARMConstantPoolMBB::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
+  else
+    llvm_unreachable("Unexpected ARM constantpool value type!!");
+  CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
+  return PCLabelId;
+}
+
+void ARMBaseInstrInfo::
+reMaterialize(MachineBasicBlock &MBB,
+              MachineBasicBlock::iterator I,
+              unsigned DestReg, unsigned SubIdx,
+              const MachineInstr *Orig,
+              const TargetRegisterInfo &TRI) const {
+  unsigned Opcode = Orig->getOpcode();
+  switch (Opcode) {
+  default: {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+    MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+    MBB.insert(I, MI);
+    break;
+  }
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    MachineFunction &MF = *MBB.getParent();
+    unsigned CPI = Orig->getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode),
+                                      DestReg)
+      .addConstantPoolIndex(CPI).addImm(PCLabelId);
+    MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
+    break;
+  }
+  }
+}
+
+MachineInstr *
+ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const {
+  MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF);
+  switch(Orig->getOpcode()) {
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    unsigned CPI = Orig->getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    Orig->getOperand(1).setIndex(CPI);
+    Orig->getOperand(2).setImm(PCLabelId);
+    break;
+  }
+  }
+  return MI;
+}
+
+bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
+                                        const MachineInstr *MI1,
+                                        const MachineRegisterInfo *MRI) const {
+  int Opcode = MI0->getOpcode();
+  if (Opcode == ARM::t2LDRpci ||
+      Opcode == ARM::t2LDRpci_pic ||
+      Opcode == ARM::tLDRpci ||
+      Opcode == ARM::tLDRpci_pic ||
+      Opcode == ARM::LDRLIT_ga_pcrel ||
+      Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+      Opcode == ARM::tLDRLIT_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel_ldr ||
+      Opcode == ARM::t2MOV_ga_pcrel) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    const MachineOperand &MO0 = MI0->getOperand(1);
+    const MachineOperand &MO1 = MI1->getOperand(1);
+    if (MO0.getOffset() != MO1.getOffset())
+      return false;
+
+    if (Opcode == ARM::LDRLIT_ga_pcrel ||
+        Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+        Opcode == ARM::tLDRLIT_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel_ldr ||
+        Opcode == ARM::t2MOV_ga_pcrel)
+      // Ignore the PC labels.
+      return MO0.getGlobal() == MO1.getGlobal();
+
+    const MachineFunction *MF = MI0->getParent()->getParent();
+    const MachineConstantPool *MCP = MF->getConstantPool();
+    int CPI0 = MO0.getIndex();
+    int CPI1 = MO1.getIndex();
+    const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0];
+    const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1];
+    bool isARMCP0 = MCPE0.isMachineConstantPoolEntry();
+    bool isARMCP1 = MCPE1.isMachineConstantPoolEntry();
+    if (isARMCP0 && isARMCP1) {
+      ARMConstantPoolValue *ACPV0 =
+        static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal);
+      ARMConstantPoolValue *ACPV1 =
+        static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal);
+      return ACPV0->hasSameValue(ACPV1);
+    } else if (!isARMCP0 && !isARMCP1) {
+      return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal;
+    }
+    return false;
+  } else if (Opcode == ARM::PICLDR) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    unsigned Addr0 = MI0->getOperand(1).getReg();
+    unsigned Addr1 = MI1->getOperand(1).getReg();
+    if (Addr0 != Addr1) {
+      if (!MRI ||
+          !TargetRegisterInfo::isVirtualRegister(Addr0) ||
+          !TargetRegisterInfo::isVirtualRegister(Addr1))
+        return false;
+
+      // This assumes SSA form.
+      MachineInstr *Def0 = MRI->getVRegDef(Addr0);
+      MachineInstr *Def1 = MRI->getVRegDef(Addr1);
+      // Check if the loaded value, e.g. a constantpool of a global address, are
+      // the same.
+      if (!produceSameValue(Def0, Def1, MRI))
+        return false;
+    }
+
+    for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) {
+      // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
+      const MachineOperand &MO0 = MI0->getOperand(i);
+      const MachineOperand &MO1 = MI1->getOperand(i);
+      if (!MO0.isIdenticalTo(MO1))
+        return false;
+    }
+    return true;
+  }
+
+  return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+/// determine if two loads are loading from the same base address. It should
+/// only return true if the base pointers are the same and the only differences
+/// between the two addresses is the offset. It also returns the offsets by
+/// reference.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                               int64_t &Offset1,
+                                               int64_t &Offset2) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+
+  switch (Load1->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  switch (Load2->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  // Check if base addresses and chain operands match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+
+  // Index should be Reg0.
+  if (Load1->getOperand(3) != Load2->getOperand(3))
+    return false;
+
+  // Determine the offsets.
+  if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+      isa<ConstantSDNode>(Load2->getOperand(1))) {
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+    Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+/// be scheduled togther. On some targets if two loads are loading from
+/// addresses in the same cache line, it's better if they are scheduled
+/// together. This function takes two integers that represent the load offsets
+/// from the common base address. It returns true if it decides it's desirable
+/// to schedule the two loads together. "NumLoads" is the number of loads that
+/// have already been scheduled after Load1.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                               int64_t Offset1, int64_t Offset2,
+                                               unsigned NumLoads) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  assert(Offset2 > Offset1);
+
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  // Check if the machine opcodes are different. If they are different
+  // then we consider them to not be of the same base address,
+  // EXCEPT in the case of Thumb2 byte loads where one is LDRBi8 and the other LDRBi12.
+  // In this case, they are considered to be the same because they are different
+  // encoding forms of the same basic instruction.
+  if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) &&
+      !((Load1->getMachineOpcode() == ARM::t2LDRBi8 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi12) ||
+        (Load1->getMachineOpcode() == ARM::t2LDRBi12 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi8)))
+    return false;  // FIXME: overly conservative?
+
+  // Four loads in a row should be sufficient.
+  if (NumLoads >= 3)
+    return false;
+
+  return true;
+}
+
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->isTerminator() || MI->isPosition())
+    return true;
+
+  // Treat the start of the IT block as a scheduling boundary, but schedule
+  // t2IT along with all instructions following it.
+  // FIXME: This is a big hammer. But the alternative is to add all potential
+  // true and anti dependencies to IT block instructions as implicit operands
+  // to the t2IT instruction. The added compile time and complexity does not
+  // seem worth it.
+  MachineBasicBlock::const_iterator I = MI;
+  // Make sure to skip any dbg_value instructions
+  while (++I != MBB->end() && I->isDebugValue())
+    ;
+  if (I != MBB->end() && I->getOpcode() == ARM::t2IT)
+    return true;
+
+  // Don't attempt to schedule around any instruction that defines
+  // a stack-oriented pointer, as it's unlikely to be profitable. This
+  // saves compile time, because it doesn't require every single
+  // stack slot reference to depend on the instruction that does the
+  // modification.
+  // Calls don't actually change the stack pointer, even if they have imp-defs.
+  // No ARM calling conventions change the stack pointer. (X86 calling
+  // conventions sometimes do).
+  if (!MI->isCall() && MI->definesRegister(ARM::SP))
+    return true;
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    const BranchProbability &Probability) const {
+  if (!NumCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  unsigned UnpredCost = Probability.getNumerator() * NumCycles;
+  UnpredCost /= Probability.getDenominator();
+  UnpredCost += 1; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() / 10;
+
+  return (NumCycles + ExtraPredCycles) <= UnpredCost;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned TCycles, unsigned TExtra,
+                    MachineBasicBlock &FMBB,
+                    unsigned FCycles, unsigned FExtra,
+                    const BranchProbability &Probability) const {
+  if (!TCycles || !FCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  unsigned TUnpredCost = Probability.getNumerator() * TCycles;
+  TUnpredCost /= Probability.getDenominator();
+
+  uint32_t Comp = Probability.getDenominator() - Probability.getNumerator();
+  unsigned FUnpredCost = Comp * FCycles;
+  FUnpredCost /= Probability.getDenominator();
+
+  unsigned UnpredCost = TUnpredCost + FUnpredCost;
+  UnpredCost += 1; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() / 10;
+
+  return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost;
+}
+
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                            MachineBasicBlock &FMBB) const {
+  // Reduce false anti-dependencies to let Swift's out-of-order execution
+  // engine do its thing.
+  return Subtarget.isSwift();
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes
+llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx == -1) {
+    PredReg = 0;
+    return ARMCC::AL;
+  }
+
+  PredReg = MI->getOperand(PIdx+1).getReg();
+  return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+}
+
+
+int llvm::getMatchingCondBranchOpcode(int Opc) {
+  if (Opc == ARM::B)
+    return ARM::Bcc;
+  if (Opc == ARM::tB)
+    return ARM::tBcc;
+  if (Opc == ARM::t2B)
+    return ARM::t2Bcc;
+
+  llvm_unreachable("Unknown unconditional branch opcode!");
+}
+
+/// commuteInstruction - Handle commutable instructions.
+MachineInstr *
+ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+  switch (MI->getOpcode()) {
+  case ARM::MOVCCr:
+  case ARM::t2MOVCCr: {
+    // MOVCC can be commuted by inverting the condition.
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
+    // MOVCC AL can't be inverted. Shouldn't happen.
+    if (CC == ARMCC::AL || PredReg != ARM::CPSR)
+      return nullptr;
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+    if (!MI)
+      return nullptr;
+    // After swapping the MOVCC operands, also invert the condition.
+    MI->getOperand(MI->findFirstPredOperandIdx())
+      .setImm(ARMCC::getOppositeCondition(CC));
+    return MI;
+  }
+  }
+  return TargetInstrInfo::commuteInstruction(MI, NewMI);
+}
+
+/// Identify instructions that can be folded into a MOVCC instruction, and
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      const TargetInstrInfo *TII) {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+  if (!MRI.hasOneNonDBGUse(Reg))
+    return nullptr;
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  if (!MI)
+    return nullptr;
+  // MI is folded into the MOVCC by predicating it.
+  if (!MI->isPredicable())
+    return nullptr;
+  // Check if MI has any non-dead defs or physreg uses. This also detects
+  // predicated instructions which will be reading CPSR.
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    // Reject frame index operands, PEI can't handle the predicated pseudos.
+    if (MO.isFI() || MO.isCPI() || MO.isJTI())
+      return nullptr;
+    if (!MO.isReg())
+      continue;
+    // MI can't have any tied operands, that would conflict with predication.
+    if (MO.isTied())
+      return nullptr;
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      return nullptr;
+    if (MO.isDef() && !MO.isDead())
+      return nullptr;
+  }
+  bool DontMoveAcrossStores = true;
+  if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ nullptr,
+                        DontMoveAcrossStores))
+    return nullptr;
+  return MI;
+}
+
+bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     unsigned &TrueOp, unsigned &FalseOp,
+                                     bool &Optimizable) const {
+  assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+         "Unknown select instruction");
+  // MOVCC operands:
+  // 0: Def.
+  // 1: True use.
+  // 2: False use.
+  // 3: Condition code.
+  // 4: CPSR use.
+  TrueOp = 1;
+  FalseOp = 2;
+  Cond.push_back(MI->getOperand(3));
+  Cond.push_back(MI->getOperand(4));
+  // We can always fold a def.
+  Optimizable = true;
+  return false;
+}
+
+MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
+                                               bool PreferFalse) const {
+  assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+         "Unknown select instruction");
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+  bool Invert = !DefMI;
+  if (!DefMI)
+    DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+  if (!DefMI)
+    return nullptr;
+
+  // Find new register class to use.
+  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+  unsigned       DestReg  = MI->getOperand(0).getReg();
+  const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+  if (!MRI.constrainRegClass(DestReg, PreviousClass))
+    return nullptr;
+
+  // Create a new predicated version of DefMI.
+  // Rfalse is the first use.
+  MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                                      DefMI->getDesc(), DestReg);
+
+  // Copy all the DefMI operands, excluding its (null) predicate.
+  const MCInstrDesc &DefDesc = DefMI->getDesc();
+  for (unsigned i = 1, e = DefDesc.getNumOperands();
+       i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+    NewMI.addOperand(DefMI->getOperand(i));
+
+  unsigned CondCode = MI->getOperand(3).getImm();
+  if (Invert)
+    NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
+  else
+    NewMI.addImm(CondCode);
+  NewMI.addOperand(MI->getOperand(4));
+
+  // DefMI is not the -S version that sets CPSR, so add an optional %noreg.
+  if (NewMI->hasOptionalDef())
+    AddDefaultCC(NewMI);
+
+  // The output register value when the predicate is false is an implicit
+  // register operand tied to the first def.
+  // The tie makes the register allocator ensure the FalseReg is allocated the
+  // same register as operand 0.
+  FalseReg.setImplicit();
+  NewMI.addOperand(FalseReg);
+  NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
+  // The caller will erase MI, but not DefMI.
+  DefMI->eraseFromParent();
+  return NewMI;
+}
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+  uint16_t PseudoOpc;
+  uint16_t MachineOpc;
+};
+
+static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADDSri, ARM::ADDri},
+  {ARM::ADDSrr, ARM::ADDrr},
+  {ARM::ADDSrsi, ARM::ADDrsi},
+  {ARM::ADDSrsr, ARM::ADDrsr},
+
+  {ARM::SUBSri, ARM::SUBri},
+  {ARM::SUBSrr, ARM::SUBrr},
+  {ARM::SUBSrsi, ARM::SUBrsi},
+  {ARM::SUBSrsr, ARM::SUBrsr},
+
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrsi, ARM::RSBrsi},
+  {ARM::RSBSrsr, ARM::RSBrsr},
+
+  {ARM::t2ADDSri, ARM::t2ADDri},
+  {ARM::t2ADDSrr, ARM::t2ADDrr},
+  {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+  {ARM::t2SUBSri, ARM::t2SUBri},
+  {ARM::t2SUBSrr, ARM::t2SUBrr},
+  {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+  for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i)
+    if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc)
+      return AddSubFlagsOpcodeMap[i].MachineOpc;
+  return 0;
+}
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg, int NumBytes,
+                               ARMCC::CondCodes Pred, unsigned PredReg,
+                               const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    return;
+  }
+
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  while (NumBytes) {
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+    unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+    assert(ThisVal && "Didn't extract field correctly");
+
+    // We will handle these bits from offset, clear them.
+    NumBytes &= ~ThisVal;
+
+    assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+    // Build the new ADD / SUB.
+    unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+    BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+      .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    BaseReg = DestReg;
+  }
+}
+
+static bool isAnySubRegLive(unsigned Reg, const TargetRegisterInfo *TRI,
+                      MachineInstr *MI) {
+  for (MCSubRegIterator Subreg(Reg, TRI, /* IncludeSelf */ true);
+       Subreg.isValid(); ++Subreg)
+    if (MI->getParent()->computeRegisterLiveness(TRI, *Subreg, MI) !=
+        MachineBasicBlock::LQR_Dead)
+      return true;
+  return false;
+}
+bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+                                      MachineFunction &MF, MachineInstr *MI,
+                                      unsigned NumBytes) {
+  // This optimisation potentially adds lots of load and store
+  // micro-operations, it's only really a great benefit to code-size.
+  if (!MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::MinSize))
+    return false;
+
+  // If only one register is pushed/popped, LLVM can use an LDR/STR
+  // instead. We can't modify those so make sure we're dealing with an
+  // instruction we understand.
+  bool IsPop = isPopOpcode(MI->getOpcode());
+  bool IsPush = isPushOpcode(MI->getOpcode());
+  if (!IsPush && !IsPop)
+    return false;
+
+  bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+                      MI->getOpcode() == ARM::VLDMDIA_UPD;
+  bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+                     MI->getOpcode() == ARM::tPOP ||
+                     MI->getOpcode() == ARM::tPOP_RET;
+
+  assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+                          MI->getOperand(1).getReg() == ARM::SP)) &&
+         "trying to fold sp update into non-sp-updating push/pop");
+
+  // The VFP push & pop act on D-registers, so we can only fold an adjustment
+  // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
+  // if this is violated.
+  if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+    return false;
+
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4. Thumb1 starts after the predicate.
+  int RegListIdx = IsT1PushPop ? 2 : 4;
+
+  // Calculate the space we'll need in terms of registers.
+  unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
+  unsigned RD0Reg, RegsNeeded;
+  if (IsVFPPushPop) {
+    RD0Reg = ARM::D0;
+    RegsNeeded = NumBytes / 8;
+  } else {
+    RD0Reg = ARM::R0;
+    RegsNeeded = NumBytes / 4;
+  }
+
+  // We're going to have to strip all list operands off before
+  // re-adding them since the order matters, so save the existing ones
+  // for later.
+  SmallVector<MachineOperand, 4> RegList;
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    RegList.push_back(MI->getOperand(i));
+
+  const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+  // Now try to find enough space in the reglist to allocate NumBytes.
+  for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
+       --CurReg) {
+    if (!IsPop) {
+      // Pushing any register is completely harmless, mark the
+      // register involved as undef since we don't care about it in
+      // the slightest.
+      RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+                                                  false, false, true));
+      --RegsNeeded;
+      continue;
+    }
+
+    // However, we can only pop an extra register if it's not live. For
+    // registers live within the function we might clobber a return value
+    // register; the other way a register can be live here is if it's
+    // callee-saved.
+    // TODO: Currently, computeRegisterLiveness() does not report "live" if a
+    // sub reg is live. When computeRegisterLiveness() works for sub reg, it
+    // can replace isAnySubRegLive().
+    if (isCalleeSavedRegister(CurReg, CSRegs) ||
+        isAnySubRegLive(CurReg, TRI, MI)) {
+      // VFP pops don't allow holes in the register list, so any skip is fatal
+      // for our transformation. GPR pops do, so we should just keep looking.
+      if (IsVFPPushPop)
+        return false;
+      else
+        continue;
+    }
+
+    // Mark the unimportant registers as <def,dead> in the POP.
+    RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+                                                true));
+    --RegsNeeded;
+  }
+
+  if (RegsNeeded > 0)
+    return false;
+
+  // Finally we know we can profitably perform the optimisation so go
+  // ahead: strip all existing registers off and add them back again
+  // in the right order.
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    MI->RemoveOperand(i);
+
+  // Add the complete list back in.
+  MachineInstrBuilder MIB(MF, &*MI);
+  for (int i = RegList.size() - 1; i >= 0; --i)
+    MIB.addOperand(RegList[i]);
+
+  return true;
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrMode2.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrMode2;
+
+  if (Opcode == ARM::ADDri) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::MOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(FrameRegIdx+1);
+      Offset = 0;
+      return true;
+    } else if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::SUBri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getSOImmVal(Offset) != -1) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, pull as much of the immedidate into this ADDri/SUBri
+    // as possible.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    // Get the properly encoded SOImmVal field.
+    assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    switch (AddrMode) {
+    case ARMII::AddrMode_i12: {
+      ImmIdx = FrameRegIdx + 1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode2: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode3: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      break;
+    }
+    case ARMII::AddrMode4:
+    case ARMII::AddrMode6:
+      // Can't fold any offset even if it's zero.
+      return false;
+    case ARMII::AddrMode5: {
+      ImmIdx = FrameRegIdx+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      break;
+    }
+    default:
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+    }
+
+    // Attempt to fold address comp. if opcode has offset bits
+    if (NumBits > 0) {
+      // Common case: small offset, fits into instruction.
+      MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+      int ImmedOffset = Offset / Scale;
+      unsigned Mask = (1 << NumBits) - 1;
+      if ((unsigned)Offset <= Mask * Scale) {
+        // Replace the FrameIndex with sp
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        // FIXME: When addrmode2 goes away, this will simplify (like the
+        // T2 version), as the LDR.i12 versions don't need the encoding
+        // tricks for the offset value.
+        if (isSub) {
+          if (AddrMode == ARMII::AddrMode_i12)
+            ImmedOffset = -ImmedOffset;
+          else
+            ImmedOffset |= 1 << NumBits;
+        }
+        ImmOp.ChangeToImmediate(ImmedOffset);
+        Offset = 0;
+        return true;
+      }
+
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode_i12)
+          ImmedOffset = -ImmedOffset;
+        else
+          ImmedOffset |= 1 << NumBits;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask*Scale);
+    }
+  }
+
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
+bool ARMBaseInstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::CMPri:
+  case ARM::t2CMPri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  case ARM::CMPrr:
+  case ARM::t2CMPrr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case ARM::TSTri:
+  case ARM::t2TSTri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = MI->getOperand(1).getImm();
+    CmpValue = 0;
+    return true;
+  }
+
+  return false;
+}
+
+/// isSuitableForMask - Identify a suitable 'and' instruction that
+/// operates on the given source register and applies the same mask
+/// as a 'tst' instruction. Provide a limited look-through for copies.
+/// When successful, MI will hold the found instruction.
+static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+                              int CmpMask, bool CommonUse) {
+  switch (MI->getOpcode()) {
+    case ARM::ANDri:
+    case ARM::t2ANDri:
+      if (CmpMask != MI->getOperand(2).getImm())
+        return false;
+      if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg())
+        return true;
+      break;
+    case ARM::COPY: {
+      // Walk down one instruction which is potentially an 'and'.
+      const MachineInstr &Copy = *MI;
+      MachineBasicBlock::iterator AND(
+        std::next(MachineBasicBlock::iterator(MI)));
+      if (AND == MI->getParent()->end()) return false;
+      MI = AND;
+      return isSuitableForMask(MI, Copy.getOperand(0).getReg(),
+                               CmpMask, true);
+    }
+  }
+
+  return false;
+}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: return ARMCC::AL;
+  case ARMCC::EQ: return ARMCC::EQ;
+  case ARMCC::NE: return ARMCC::NE;
+  case ARMCC::HS: return ARMCC::LS;
+  case ARMCC::LO: return ARMCC::HI;
+  case ARMCC::HI: return ARMCC::LO;
+  case ARMCC::LS: return ARMCC::HS;
+  case ARMCC::GE: return ARMCC::LE;
+  case ARMCC::LT: return ARMCC::GT;
+  case ARMCC::GT: return ARMCC::LT;
+  case ARMCC::LE: return ARMCC::GE;
+  }
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// CMPri can be made redundant by SUBri if the operands are the same.
+/// This function can be extended later on.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr *OI) {
+  if ((CmpI->getOpcode() == ARM::CMPrr ||
+       CmpI->getOpcode() == ARM::t2CMPrr) &&
+      (OI->getOpcode() == ARM::SUBrr ||
+       OI->getOpcode() == ARM::t2SUBrr) &&
+      ((OI->getOperand(1).getReg() == SrcReg &&
+        OI->getOperand(2).getReg() == SrcReg2) ||
+       (OI->getOperand(1).getReg() == SrcReg2 &&
+        OI->getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if ((CmpI->getOpcode() == ARM::CMPri ||
+       CmpI->getOpcode() == ARM::t2CMPri) &&
+      (OI->getOpcode() == ARM::SUBri ||
+       OI->getOpcode() == ARM::t2SUBri) &&
+      OI->getOperand(1).getReg() == SrcReg &&
+      OI->getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register;
+/// Remove a redundant Compare instruction if an earlier instruction can set the
+/// flags in the same way as Compare.
+/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
+/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
+/// condition code of instructions which use the flags.
+bool ARMBaseInstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+                     int CmpMask, int CmpValue,
+                     const MachineRegisterInfo *MRI) const {
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // Masked compares sometimes use the same register as the corresponding 'and'.
+  if (CmpMask != ~0) {
+    if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
+      MI = nullptr;
+      for (MachineRegisterInfo::use_instr_iterator
+           UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
+           UI != UE; ++UI) {
+        if (UI->getParent() != CmpInstr->getParent()) continue;
+        MachineInstr *PotentialAND = &*UI;
+        if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+            isPredicated(PotentialAND))
+          continue;
+        MI = PotentialAND;
+        break;
+      }
+      if (!MI) return false;
+    }
+  }
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator I = CmpInstr, E = MI,
+                              B = CmpInstr->getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B) return false;
+
+  // There are two possible candidates which can be changed to set CPSR:
+  // One is MI, the other is a SUB instruction.
+  // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
+  MachineInstr *Sub = nullptr;
+  if (SrcReg2 != 0)
+    // MI is not a candidate for CMPrr.
+    MI = nullptr;
+  else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
+    // Conservatively refuse to convert an instruction which isn't in the same
+    // BB as the comparison.
+    // For CMPri, we need to check Sub, thus we can't return here.
+    if (CmpInstr->getOpcode() == ARM::CMPri ||
+       CmpInstr->getOpcode() == ARM::t2CMPri)
+      MI = nullptr;
+    else
+      return false;
+  }
+
+  // Check that CPSR isn't set between the comparison instruction and the one we
+  // want to change. At the same time, search for Sub.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  --I;
+  for (; I != E; --I) {
+    const MachineInstr &Instr = *I;
+
+    if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
+        Instr.readsRegister(ARM::CPSR, TRI))
+      // This instruction modifies or uses CPSR after the one we want to
+      // change. We can't do this transformation.
+      return false;
+
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+      Sub = &*I;
+      break;
+    }
+
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  // Return false if no candidates exist.
+  if (!MI && !Sub)
+    return false;
+
+  // The single candidate is called MI.
+  if (!MI) MI = Sub;
+
+  // We can't use a predicated instruction - it doesn't always write the flags.
+  if (isPredicated(MI))
+    return false;
+
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::RSBrr:
+  case ARM::RSBri:
+  case ARM::RSCrr:
+  case ARM::RSCri:
+  case ARM::ADDrr:
+  case ARM::ADDri:
+  case ARM::ADCrr:
+  case ARM::ADCri:
+  case ARM::SUBrr:
+  case ARM::SUBri:
+  case ARM::SBCrr:
+  case ARM::SBCri:
+  case ARM::t2RSBri:
+  case ARM::t2ADDrr:
+  case ARM::t2ADDri:
+  case ARM::t2ADCrr:
+  case ARM::t2ADCri:
+  case ARM::t2SUBrr:
+  case ARM::t2SUBri:
+  case ARM::t2SBCrr:
+  case ARM::t2SBCri:
+  case ARM::ANDrr:
+  case ARM::ANDri:
+  case ARM::t2ANDrr:
+  case ARM::t2ANDri:
+  case ARM::ORRrr:
+  case ARM::ORRri:
+  case ARM::t2ORRrr:
+  case ARM::t2ORRri:
+  case ARM::EORrr:
+  case ARM::EORri:
+  case ARM::t2EORrr:
+  case ARM::t2EORri: {
+    // Scan forward for the use of CPSR
+    // When checking against MI: if it's a conditional code requires
+    // checking of V bit, then this is not safe to do.
+    // It is safe to remove CmpInstr if CPSR is redefined or killed.
+    // If we are done with the basic block, we need to check whether CPSR is
+    // live-out.
+    SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+        OperandsToUpdate;
+    bool isSafe = false;
+    I = CmpInstr;
+    E = CmpInstr->getParent()->end();
+    while (!isSafe && ++I != E) {
+      const MachineInstr &Instr = *I;
+      for (unsigned IO = 0, EO = Instr.getNumOperands();
+           !isSafe && IO != EO; ++IO) {
+        const MachineOperand &MO = Instr.getOperand(IO);
+        if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
+          isSafe = true;
+          break;
+        }
+        if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+          continue;
+        if (MO.isDef()) {
+          isSafe = true;
+          break;
+        }
+        // Condition code is after the operand before CPSR except for VSELs.
+        ARMCC::CondCodes CC;
+        bool IsInstrVSel = true;
+        switch (Instr.getOpcode()) {
+        default:
+          IsInstrVSel = false;
+          CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+          break;
+        case ARM::VSELEQD:
+        case ARM::VSELEQS:
+          CC = ARMCC::EQ;
+          break;
+        case ARM::VSELGTD:
+        case ARM::VSELGTS:
+          CC = ARMCC::GT;
+          break;
+        case ARM::VSELGED:
+        case ARM::VSELGES:
+          CC = ARMCC::GE;
+          break;
+        case ARM::VSELVSS:
+        case ARM::VSELVSD:
+          CC = ARMCC::VS;
+          break;
+        }
+
+        if (Sub) {
+          ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+          if (NewCC == ARMCC::AL)
+            return false;
+          // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+          // on CMP needs to be updated to be based on SUB.
+          // Push the condition code operands to OperandsToUpdate.
+          // If it is safe to remove CmpInstr, the condition code of these
+          // operands will be modified.
+          if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+              Sub->getOperand(2).getReg() == SrcReg) {
+            // VSel doesn't support condition code update.
+            if (IsInstrVSel)
+              return false;
+            OperandsToUpdate.push_back(
+                std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+          }
+        } else
+          switch (CC) {
+          default:
+            // CPSR can be used multiple times, we should continue.
+            break;
+          case ARMCC::VS:
+          case ARMCC::VC:
+          case ARMCC::GE:
+          case ARMCC::LT:
+          case ARMCC::GT:
+          case ARMCC::LE:
+            return false;
+          }
+      }
+    }
+
+    // If CPSR is not killed nor re-defined, we should check whether it is
+    // live-out. If it is live-out, do not optimize.
+    if (!isSafe) {
+      MachineBasicBlock *MBB = CmpInstr->getParent();
+      for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+               SE = MBB->succ_end(); SI != SE; ++SI)
+        if ((*SI)->isLiveIn(ARM::CPSR))
+          return false;
+    }
+
+    // Toggle the optional operand to CPSR.
+    MI->getOperand(5).setReg(ARM::CPSR);
+    MI->getOperand(5).setIsDef(true);
+    assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
+    CmpInstr->eraseFromParent();
+
+    // Modify the condition code of operands in OperandsToUpdate.
+    // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+    // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+    for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+      OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
+                                     MachineInstr *DefMI, unsigned Reg,
+                                     MachineRegisterInfo *MRI) const {
+  // Fold large immediates into add, sub, or, xor.
+  unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
+    return false;
+  if (!DefMI->getOperand(1).isImm())
+    // Could be t2MOVi32imm <ga:xx>
+    return false;
+
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  const MCInstrDesc &DefMCID = DefMI->getDesc();
+  if (DefMCID.hasOptionalDef()) {
+    unsigned NumOps = DefMCID.getNumOperands();
+    const MachineOperand &MO = DefMI->getOperand(NumOps-1);
+    if (MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If DefMI defines CPSR and it is not dead, it's obviously not safe
+      // to delete DefMI.
+      return false;
+  }
+
+  const MCInstrDesc &UseMCID = UseMI->getDesc();
+  if (UseMCID.hasOptionalDef()) {
+    unsigned NumOps = UseMCID.getNumOperands();
+    if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR)
+      // If the instruction sets the flag, do not attempt this optimization
+      // since it may change the semantics of the code.
+      return false;
+  }
+
+  unsigned UseOpc = UseMI->getOpcode();
+  unsigned NewUseOpc = 0;
+  uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm();
+  uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
+  bool Commute = false;
+  switch (UseOpc) {
+  default: return false;
+  case ARM::SUBrr:
+  case ARM::ADDrr:
+  case ARM::ORRrr:
+  case ARM::EORrr:
+  case ARM::t2SUBrr:
+  case ARM::t2ADDrr:
+  case ARM::t2ORRrr:
+  case ARM::t2EORrr: {
+    Commute = UseMI->getOperand(2).getReg() != Reg;
+    switch (UseOpc) {
+    default: break;
+    case ARM::SUBrr: {
+      if (Commute)
+        return false;
+      ImmVal = -ImmVal;
+      NewUseOpc = ARM::SUBri;
+      // Fallthrough
+    }
+    case ARM::ADDrr:
+    case ARM::ORRrr:
+    case ARM::EORrr: {
+      if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::ADDrr: NewUseOpc = ARM::ADDri; break;
+      case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
+      case ARM::EORrr: NewUseOpc = ARM::EORri; break;
+      }
+      break;
+    }
+    case ARM::t2SUBrr: {
+      if (Commute)
+        return false;
+      ImmVal = -ImmVal;
+      NewUseOpc = ARM::t2SUBri;
+      // Fallthrough
+    }
+    case ARM::t2ADDrr:
+    case ARM::t2ORRrr:
+    case ARM::t2EORrr: {
+      if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break;
+      case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
+      case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
+      }
+      break;
+    }
+    }
+  }
+  }
+
+  unsigned OpIdx = Commute ? 2 : 1;
+  unsigned Reg1 = UseMI->getOperand(OpIdx).getReg();
+  bool isKill = UseMI->getOperand(OpIdx).isKill();
+  unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+  AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(),
+                                      UseMI, UseMI->getDebugLoc(),
+                                      get(NewUseOpc), NewReg)
+                              .addReg(Reg1, getKillRegState(isKill))
+                              .addImm(SOImmValV1)));
+  UseMI->setDesc(get(NewUseOpc));
+  UseMI->getOperand(1).setReg(NewReg);
+  UseMI->getOperand(1).setIsKill();
+  UseMI->getOperand(2).ChangeToImmediate(SOImmValV2);
+  DefMI->eraseFromParent();
+  return true;
+}
+
+static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
+                                        const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: {
+    const MCInstrDesc &Desc = MI->getDesc();
+    int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
+    assert(UOps >= 0 && "bad # UOps");
+    return UOps;
+  }
+
+  case ARM::LDRrs:
+  case ARM::LDRBrs:
+  case ARM::STRrs:
+  case ARM::STRBrs: {
+    unsigned ShOpVal = MI->getOperand(3).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 1;
+    return 2;
+  }
+
+  case ARM::LDRH:
+  case ARM::STRH: {
+    if (!MI->getOperand(2).getReg())
+      return 1;
+
+    unsigned ShOpVal = MI->getOperand(3).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 1;
+    return 2;
+  }
+
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+    return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2;
+
+  case ARM::LDRSB_POST:
+  case ARM::LDRSH_POST: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    return (Rt == Rm) ? 4 : 3;
+  }
+
+  case ARM::LDR_PRE_REG:
+  case ARM::LDRB_PRE_REG: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rt == Rm)
+      return 3;
+    unsigned ShOpVal = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 2;
+    return 3;
+  }
+
+  case ARM::STR_PRE_REG:
+  case ARM::STRB_PRE_REG: {
+    unsigned ShOpVal = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 2;
+    return 3;
+  }
+
+  case ARM::LDRH_PRE:
+  case ARM::STRH_PRE: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (!Rm)
+      return 2;
+    if (Rt == Rm)
+      return 3;
+    return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub)
+      ? 3 : 2;
+  }
+
+  case ARM::LDR_POST_REG:
+  case ARM::LDRB_POST_REG:
+  case ARM::LDRH_POST: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    return (Rt == Rm) ? 3 : 2;
+  }
+
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDRB_POST_IMM:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRH_POST:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+  case ARM::STR_PRE_IMM:
+    return 2;
+
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSH_PRE: {
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rm == 0)
+      return 3;
+    unsigned Rt = MI->getOperand(0).getReg();
+    if (Rt == Rm)
+      return 4;
+    unsigned ShOpVal = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 3;
+    return 4;
+  }
+
+  case ARM::LDRD: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(2).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::STRD: {
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+    return 2;
+  }
+
+  case ARM::LDRD_POST:
+  case ARM::t2LDRD_POST:
+    return 3;
+
+  case ARM::STRD_POST:
+  case ARM::t2STRD_POST:
+    return 4;
+
+  case ARM::LDRD_PRE: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(3).getReg();
+    unsigned Rm = MI->getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+    return (Rt == Rn) ? 4 : 3;
+  }
+
+  case ARM::t2LDRD_PRE: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(3).getReg();
+    return (Rt == Rn) ? 4 : 3;
+  }
+
+  case ARM::STRD_PRE: {
+    unsigned Rm = MI->getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+    return 3;
+  }
+
+  case ARM::t2STRD_PRE:
+    return 3;
+
+  case ARM::t2LDR_POST:
+  case ARM::t2LDRB_POST:
+  case ARM::t2LDRB_PRE:
+  case ARM::t2LDRSBi12:
+  case ARM::t2LDRSBi8:
+  case ARM::t2LDRSBpci:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRH_POST:
+  case ARM::t2LDRH_PRE:
+  case ARM::t2LDRSBT:
+  case ARM::t2LDRSB_POST:
+  case ARM::t2LDRSB_PRE:
+  case ARM::t2LDRSH_POST:
+  case ARM::t2LDRSH_PRE:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSHpci:
+  case ARM::t2LDRSHs:
+    return 2;
+
+  case ARM::t2LDRDi8: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(2).getReg();
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::t2STRB_POST:
+  case ARM::t2STRB_PRE:
+  case ARM::t2STRBs:
+  case ARM::t2STRDi8:
+  case ARM::t2STRH_POST:
+  case ARM::t2STRH_PRE:
+  case ARM::t2STRHs:
+  case ARM::t2STR_POST:
+  case ARM::t2STR_PRE:
+  case ARM::t2STRs:
+    return 2;
+  }
+}
+
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is a the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+  unsigned Size = 0;
+  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+         E = MI->memoperands_end(); I != E; ++I) {
+    Size += (*I)->getSize();
+  }
+  return Size / 4;
+}
+
+unsigned
+ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                 const MachineInstr *MI) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned Class = Desc.getSchedClass();
+  int ItinUOps = ItinData->getNumMicroOps(Class);
+  if (ItinUOps >= 0) {
+    if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+      return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
+    return ItinUOps;
+  }
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unexpected multi-uops instruction!");
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+
+  // The number of uOps for load / store multiple are determined by the number
+  // registers.
+  //
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the address is not 64-bit aligned.
+  //
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then AGU would take an extra cycle.  For VFP / NEON
+  // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1.
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
+    return (NumRegs / 2) + (NumRegs % 2) + 1;
+  }
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
+    if (Subtarget.isSwift()) {
+      int UOps = 1 + NumRegs;  // One for address computation, one for each ld / st.
+      switch (Opc) {
+      default: break;
+      case ARM::VLDMDIA_UPD:
+      case ARM::VLDMDDB_UPD:
+      case ARM::VLDMSIA_UPD:
+      case ARM::VLDMSDB_UPD:
+      case ARM::VSTMDIA_UPD:
+      case ARM::VSTMDDB_UPD:
+      case ARM::VSTMSIA_UPD:
+      case ARM::VSTMSDB_UPD:
+      case ARM::LDMIA_UPD:
+      case ARM::LDMDA_UPD:
+      case ARM::LDMDB_UPD:
+      case ARM::LDMIB_UPD:
+      case ARM::STMIA_UPD:
+      case ARM::STMDA_UPD:
+      case ARM::STMDB_UPD:
+      case ARM::STMIB_UPD:
+      case ARM::tLDMIA_UPD:
+      case ARM::tSTMIA_UPD:
+      case ARM::t2LDMIA_UPD:
+      case ARM::t2LDMDB_UPD:
+      case ARM::t2STMIA_UPD:
+      case ARM::t2STMDB_UPD:
+        ++UOps; // One for base register writeback.
+        break;
+      case ARM::LDMIA_RET:
+      case ARM::tPOP_RET:
+      case ARM::t2LDMIA_RET:
+        UOps += 2; // One for base reg wb, one for write to pc.
+        break;
+      }
+      return UOps;
+    } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+      if (NumRegs < 4)
+        return 2;
+      // 4 registers would be issued: 2, 2.
+      // 5 registers would be issued: 2, 2, 1.
+      int A8UOps = (NumRegs / 2);
+      if (NumRegs % 2)
+        ++A8UOps;
+      return A8UOps;
+    } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+      int A9UOps = (NumRegs / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) ||
+          !MI->hasOneMemOperand() ||
+          (*MI->memoperands_begin())->getAlignment() < 8)
+        ++A9UOps;
+      return A9UOps;
+    } else {
+      // Assume the worst.
+      return NumRegs;
+    }
+  }
+  }
+}
+
+int
+ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &DefMCID,
+                                  unsigned DefClass,
+                                  unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // (regno / 2) + (regno % 2) + 1
+    DefCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++DefCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = RegNo;
+    bool isSLoad = false;
+
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLDMSIA:
+    case ARM::VLDMSIA_UPD:
+    case ARM::VLDMSDB_UPD:
+      isSLoad = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+      ++DefCycle;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &DefMCID,
+                                 unsigned DefClass,
+                                 unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // 4 registers would be issued: 1, 2, 1.
+    // 5 registers would be issued: 1, 2, 2.
+    DefCycle = RegNo / 2;
+    if (DefCycle < 1)
+      DefCycle = 1;
+    // Result latency is issue cycle + 2: E2.
+    DefCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || DefAlign < 8)
+      ++DefCycle;
+    // Result latency is AGU cycles + 2.
+    DefCycle += 2;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &UseMCID,
+                                  unsigned UseClass,
+                                  unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // (regno / 2) + (regno % 2) + 1
+    UseCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++UseCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = RegNo;
+    bool isSStore = false;
+
+    switch (UseMCID.getOpcode()) {
+    default: break;
+    case ARM::VSTMSIA:
+    case ARM::VSTMSIA_UPD:
+    case ARM::VSTMSDB_UPD:
+      isSStore = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = RegNo + 2;
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &UseMCID,
+                                 unsigned UseClass,
+                                 unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    UseCycle = RegNo / 2;
+    if (UseCycle < 2)
+      UseCycle = 2;
+    // Read in E3.
+    UseCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = 1;
+  }
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MCInstrDesc &DefMCID,
+                                    unsigned DefIdx, unsigned DefAlign,
+                                    const MCInstrDesc &UseMCID,
+                                    unsigned UseIdx, unsigned UseAlign) const {
+  unsigned DefClass = DefMCID.getSchedClass();
+  unsigned UseClass = UseMCID.getSchedClass();
+
+  if (DefIdx < DefMCID.getNumDefs() && UseIdx < UseMCID.getNumOperands())
+    return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+
+  // This may be a def / use of a variable_ops instruction, the operand
+  // latency might be determinable dynamically. Let the target try to
+  // figure it out.
+  int DefCycle = -1;
+  bool LdmBypass = false;
+  switch (DefMCID.getOpcode()) {
+  default:
+    DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    break;
+
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+    DefCycle = getVLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+    break;
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+    LdmBypass = 1;
+    DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+    break;
+  }
+
+  if (DefCycle == -1)
+    // We can't seem to determine the result latency of the def, assume it's 2.
+    DefCycle = 2;
+
+  int UseCycle = -1;
+  switch (UseMCID.getOpcode()) {
+  default:
+    UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+    break;
+
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+    UseCycle = getVSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+    break;
+
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    UseCycle = getSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+    break;
+  }
+
+  if (UseCycle == -1)
+    // Assume it's read in the first stage.
+    UseCycle = 1;
+
+  UseCycle = DefCycle - UseCycle + 1;
+  if (UseCycle > 0) {
+    if (LdmBypass) {
+      // It's a variable_ops instruction so we can't use DefIdx here. Just use
+      // first def operand.
+      if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1,
+                                          UseClass, UseIdx))
+        --UseCycle;
+    } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
+                                               UseClass, UseIdx)) {
+      --UseCycle;
+    }
+  }
+
+  return UseCycle;
+}
+
+static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr *MI, unsigned Reg,
+                                           unsigned &DefIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_iterator I = MI; ++I;
+  MachineBasicBlock::const_instr_iterator II = std::prev(I.getInstrIterator());
+  assert(II->isInsideBundle() && "Empty bundle?");
+
+  int Idx = -1;
+  while (II->isInsideBundle()) {
+    Idx = II->findRegisterDefOperandIdx(Reg, false, true, TRI);
+    if (Idx != -1)
+      break;
+    --II;
+    ++Dist;
+  }
+
+  assert(Idx != -1 && "Cannot find bundled definition!");
+  DefIdx = Idx;
+  return II;
+}
+
+static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr *MI, unsigned Reg,
+                                           unsigned &UseIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_instr_iterator II = MI; ++II;
+  assert(II->isInsideBundle() && "Empty bundle?");
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+  // FIXME: This doesn't properly handle multiple uses.
+  int Idx = -1;
+  while (II != E && II->isInsideBundle()) {
+    Idx = II->findRegisterUseOperandIdx(Reg, false, TRI);
+    if (Idx != -1)
+      break;
+    if (II->getOpcode() != ARM::t2IT)
+      ++Dist;
+    ++II;
+  }
+
+  if (Idx == -1) {
+    Dist = 0;
+    return nullptr;
+  }
+
+  UseIdx = Idx;
+  return II;
+}
+
+/// Return the number of cycles to add to (or subtract from) the static
+/// itinerary based on the def opcode and alignment. The caller will ensure that
+/// adjusted latency is at least one cycle.
+static int adjustDefLatency(const ARMSubtarget &Subtarget,
+                            const MachineInstr *DefMI,
+                            const MCInstrDesc *DefMCID, unsigned DefAlign) {
+  int Adjust = 0;
+  if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI->getOperand(3).getImm();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Adjust;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI->getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Adjust;
+      break;
+    }
+    }
+  } else if (Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI->getOperand(3).getImm();
+      bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (!isSub &&
+          (ShImm == 0 ||
+           ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+            ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+        Adjust -= 2;
+      else if (!isSub &&
+               ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Adjust;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI->getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+        Adjust -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.isLikeA9()) {
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD1d64T:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD1d64Q:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8:
+    case ARM::VLD4DUPd16:
+    case ARM::VLD4DUPd32:
+    case ARM::VLD4DUPd8_UPD:
+    case ARM::VLD4DUPd16_UPD:
+    case ARM::VLD4DUPd32_UPD:
+    case ARM::VLD1LNd8:
+    case ARM::VLD1LNd16:
+    case ARM::VLD1LNd32:
+    case ARM::VLD1LNd8_UPD:
+    case ARM::VLD1LNd16_UPD:
+    case ARM::VLD1LNd32_UPD:
+    case ARM::VLD2LNd8:
+    case ARM::VLD2LNd16:
+    case ARM::VLD2LNd32:
+    case ARM::VLD2LNq16:
+    case ARM::VLD2LNq32:
+    case ARM::VLD2LNd8_UPD:
+    case ARM::VLD2LNd16_UPD:
+    case ARM::VLD2LNd32_UPD:
+    case ARM::VLD2LNq16_UPD:
+    case ARM::VLD2LNq32_UPD:
+    case ARM::VLD4LNd8:
+    case ARM::VLD4LNd16:
+    case ARM::VLD4LNd32:
+    case ARM::VLD4LNq16:
+    case ARM::VLD4LNq32:
+    case ARM::VLD4LNd8_UPD:
+    case ARM::VLD4LNd16_UPD:
+    case ARM::VLD4LNd32_UPD:
+    case ARM::VLD4LNq16_UPD:
+    case ARM::VLD4LNq32_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Adjust;
+      break;
+    }
+  }
+  return Adjust;
+}
+
+
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MachineInstr *DefMI, unsigned DefIdx,
+                                    const MachineInstr *UseMI,
+                                    unsigned UseIdx) const {
+  // No operand latency. The caller may fall back to getInstrLatency.
+  if (!ItinData || ItinData->isEmpty())
+    return -1;
+
+  const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+  unsigned Reg = DefMO.getReg();
+  const MCInstrDesc *DefMCID = &DefMI->getDesc();
+  const MCInstrDesc *UseMCID = &UseMI->getDesc();
+
+  unsigned DefAdj = 0;
+  if (DefMI->isBundle()) {
+    DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj);
+    DefMCID = &DefMI->getDesc();
+  }
+  if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
+      DefMI->isRegSequence() || DefMI->isImplicitDef()) {
+    return 1;
+  }
+
+  unsigned UseAdj = 0;
+  if (UseMI->isBundle()) {
+    unsigned NewUseIdx;
+    const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI,
+                                                   Reg, NewUseIdx, UseAdj);
+    if (!NewUseMI)
+      return -1;
+
+    UseMI = NewUseMI;
+    UseIdx = NewUseIdx;
+    UseMCID = &UseMI->getDesc();
+  }
+
+  if (Reg == ARM::CPSR) {
+    if (DefMI->getOpcode() == ARM::FMSTAT) {
+      // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
+      return Subtarget.isLikeA9() ? 1 : 20;
+    }
+
+    // CPSR set and branch can be paired in the same cycle.
+    if (UseMI->isBranch())
+      return 0;
+
+    // Otherwise it takes the instruction latency (generally one).
+    unsigned Latency = getInstrLatency(ItinData, DefMI);
+
+    // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to
+    // its uses. Instructions which are otherwise scheduled between them may
+    // incur a code size penalty (not able to use the CPSR setting 16-bit
+    // instructions).
+    if (Latency > 0 && Subtarget.isThumb2()) {
+      const MachineFunction *MF = DefMI->getParent()->getParent();
+      if (MF->getFunction()->getAttributes().
+            hasAttribute(AttributeSet::FunctionIndex,
+                         Attribute::OptimizeForSize))
+        --Latency;
+    }
+    return Latency;
+  }
+
+  if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit())
+    return -1;
+
+  unsigned DefAlign = DefMI->hasOneMemOperand()
+    ? (*DefMI->memoperands_begin())->getAlignment() : 0;
+  unsigned UseAlign = UseMI->hasOneMemOperand()
+    ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+
+  // Get the itinerary's latency if possible, and handle variable_ops.
+  int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign,
+                                  *UseMCID, UseIdx, UseAlign);
+  // Unable to find operand latency. The caller may resort to getInstrLatency.
+  if (Latency < 0)
+    return Latency;
+
+  // Adjust for IT block position.
+  int Adj = DefAdj + UseAdj;
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  // Return the itinerary latency, which may be zero but not less than zero.
+  return Latency;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    SDNode *DefNode, unsigned DefIdx,
+                                    SDNode *UseNode, unsigned UseIdx) const {
+  if (!DefNode->isMachineOpcode())
+    return 1;
+
+  const MCInstrDesc &DefMCID = get(DefNode->getMachineOpcode());
+
+  if (isZeroCost(DefMCID.Opcode))
+    return 0;
+
+  if (!ItinData || ItinData->isEmpty())
+    return DefMCID.mayLoad() ? 3 : 1;
+
+  if (!UseNode->isMachineOpcode()) {
+    int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
+    if (Subtarget.isLikeA9() || Subtarget.isSwift())
+      return Latency <= 2 ? 1 : Latency - 1;
+    else
+      return Latency <= 3 ? 1 : Latency - 2;
+  }
+
+  const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
+  const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+  unsigned DefAlign = !DefMN->memoperands_empty()
+    ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+  const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+  unsigned UseAlign = !UseMN->memoperands_empty()
+    ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
+                                  UseMCID, UseIdx, UseAlign);
+
+  if (Latency > 1 &&
+      (Subtarget.isCortexA8() || Subtarget.isLikeA9() ||
+       Subtarget.isCortexA7())) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Latency;
+      break;
+    }
+    }
+  } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+           ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        Latency -= 2;
+      else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl 0-3 only.
+      Latency -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.isLikeA9())
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Latency;
+      break;
+    }
+
+  return Latency;
+}
+
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
+   if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 0;
+
+  if (MI->isBundle())
+    return 0;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    return 1;
+  }
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr *MI,
+                                           unsigned *PredCost) const {
+  if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 1;
+
+  // An instruction scheduler typically runs on unbundled instructions, however
+  // other passes may query the latency of a bundled instruction.
+  if (MI->isBundle()) {
+    unsigned Latency = 0;
+    MachineBasicBlock::const_instr_iterator I = MI;
+    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      if (I->getOpcode() != ARM::t2IT)
+        Latency += getInstrLatency(ItinData, I, PredCost);
+    }
+    return Latency;
+  }
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    *PredCost = 1;
+  }
+  // Be sure to call getStageLatency for an empty itinerary in case it has a
+  // valid MinLatency property.
+  if (!ItinData)
+    return MI->mayLoad() ? 3 : 1;
+
+  unsigned Class = MCID.getSchedClass();
+
+  // For instructions with variable uops, use uops as latency.
+  if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0)
+    return getNumMicroOps(ItinData, MI);
+
+  // For the common case, fall back on the itinerary's latency.
+  unsigned Latency = ItinData->getStageLatency(Class);
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  unsigned DefAlign = MI->hasOneMemOperand()
+    ? (*MI->memoperands_begin())->getAlignment() : 0;
+  int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  return Latency;
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  switch (Opcode) {
+  default:
+    return ItinData->getStageLatency(get(Opcode).getSchedClass());
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+  }
+}
+
+bool ARMBaseInstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (Subtarget.isCortexA8() &&
+      (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
+    // CortexA8 VFP instructions are not pipelined.
+    return true;
+
+  // Hoist VFP / NEON instructions with 4 or higher latency.
+  int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+  if (Latency < 0)
+    Latency = getInstrLatency(ItinData, DefMI);
+  if (Latency <= 3)
+    return false;
+  return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
+         UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON;
+}
+
+bool ARMBaseInstrInfo::
+hasLowDefLatency(const InstrItineraryData *ItinData,
+                 const MachineInstr *DefMI, unsigned DefIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return false;
+
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (DDomain == ARMII::DomainGeneral) {
+    unsigned DefClass = DefMI->getDesc().getSchedClass();
+    int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    return (DefCycle != -1 && DefCycle <= 2);
+  }
+  return false;
+}
+
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
+                                         StringRef &ErrInfo) const {
+  if (convertAddSubFlagsOpcode(MI->getOpcode())) {
+    ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
+    return false;
+  }
+  return true;
+}
+
+bool
+ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                                     unsigned &AddSubOpc,
+                                     bool &NegAcc, bool &HasLane) const {
+  DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode);
+  if (I == MLxEntryMap.end())
+    return false;
+
+  const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
+  MulOpc = Entry.MulOpc;
+  AddSubOpc = Entry.AddSubOpc;
+  NegAcc = Entry.NegAcc;
+  HasLane = Entry.HasLane;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Execution domains.
+//===----------------------------------------------------------------------===//
+//
+// Some instructions go down the NEON pipeline, some go down the VFP pipeline,
+// and some can go down both.  The vmov instructions go down the VFP pipeline,
+// but they can be changed to vorr equivalents that are executed by the NEON
+// pipeline.
+//
+// We use the following execution domain numbering:
+//
+enum ARMExeDomain {
+  ExeGeneric = 0,
+  ExeVFP = 1,
+  ExeNEON = 2
+};
+//
+// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
+//
+std::pair<uint16_t, uint16_t>
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+  // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON
+  // if they are not predicated.
+  if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
+    return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
+
+  // CortexA9 is particularly picky about mixing the two and wants these
+  // converted.
+  if (Subtarget.isCortexA9() && !isPredicated(MI) &&
+      (MI->getOpcode() == ARM::VMOVRS ||
+       MI->getOpcode() == ARM::VMOVSR ||
+       MI->getOpcode() == ARM::VMOVS))
+    return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
+
+  // No other instructions can be swizzled, so just determine their domain.
+  unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+
+  if (Domain & ARMII::DomainNEON)
+    return std::make_pair(ExeNEON, 0);
+
+  // Certain instructions can go either way on Cortex-A8.
+  // Treat them as NEON instructions.
+  if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8())
+    return std::make_pair(ExeNEON, 0);
+
+  if (Domain & ARMII::DomainVFP)
+    return std::make_pair(ExeVFP, 0);
+
+  return std::make_pair(ExeGeneric, 0);
+}
+
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+                                            unsigned SReg, unsigned &Lane) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+  Lane = 0;
+
+  if (DReg != ARM::NoRegister)
+   return DReg;
+
+  Lane = 1;
+  DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+
+  assert(DReg && "S-register with no D super-register?");
+  return DReg;
+}
+
+/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane,
+/// set ImplicitSReg to a register number that must be marked as implicit-use or
+/// zero if no register needs to be defined as implicit-use.
+///
+/// If the function cannot determine if an SPR should be marked implicit use or
+/// not, it returns false.
+///
+/// This function handles cases where an instruction is being modified from taking
+/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict
+/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other
+/// lane of the DPR).
+///
+/// If the other SPR is defined, an implicit-use of it should be added. Else,
+/// (including the case where the DPR itself is defined), it should not.
+///
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+                                       MachineInstr *MI,
+                                       unsigned DReg, unsigned Lane,
+                                       unsigned &ImplicitSReg) {
+  // If the DPR is defined or used already, the other SPR lane will be chained
+  // correctly, so there is nothing to be done.
+  if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+    ImplicitSReg = 0;
+    return true;
+  }
+
+  // Otherwise we need to go searching to see if the SPR is set explicitly.
+  ImplicitSReg = TRI->getSubReg(DReg,
+                                (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
+  MachineBasicBlock::LivenessQueryResult LQR =
+    MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+
+  if (LQR == MachineBasicBlock::LQR_Live)
+    return true;
+  else if (LQR == MachineBasicBlock::LQR_Unknown)
+    return false;
+
+  // If the register is known not to be live, there is no need to add an
+  // implicit-use.
+  ImplicitSReg = 0;
+  return true;
+}
+
+void
+ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+  unsigned DstReg, SrcReg, DReg;
+  unsigned Lane;
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("cannot handle opcode!");
+      break;
+    case ARM::VMOVD:
+      if (Domain != ExeNEON)
+        break;
+
+      // Zap the predicate operands.
+      assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+
+      // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+      MI->setDesc(get(ARM::VORRd));
+      AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                        .addReg(SrcReg)
+                        .addReg(SrcReg));
+      break;
+    case ARM::VMOVRS:
+      if (Domain != ExeNEON)
+        break;
+      assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+
+      // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+
+      // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+      // Note that DSrc has been widened and the other lane may be undef, which
+      // contaminates the entire register.
+      MI->setDesc(get(ARM::VGETLNi32));
+      AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                        .addReg(DReg, RegState::Undef)
+                        .addImm(Lane));
+
+      // The old source should be an implicit use, otherwise we might think it
+      // was dead before here.
+      MIB.addReg(SrcReg, RegState::Implicit);
+      break;
+    case ARM::VMOVSR: {
+      if (Domain != ExeNEON)
+        break;
+      assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+
+      // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+
+      unsigned ImplicitSReg;
+      if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+        break;
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+      // Again DDst may be undefined at the beginning of this instruction.
+      MI->setDesc(get(ARM::VSETLNi32));
+      MIB.addReg(DReg, RegState::Define)
+         .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
+         .addReg(SrcReg)
+         .addImm(Lane);
+      AddDefaultPred(MIB);
+
+      // The narrower destination must be marked as set to keep previous chains
+      // in place.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      if (ImplicitSReg != 0)
+        MIB.addReg(ImplicitSReg, RegState::Implicit);
+      break;
+    }
+    case ARM::VMOVS: {
+      if (Domain != ExeNEON)
+        break;
+
+      // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+      DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+      DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+      unsigned ImplicitSReg;
+      if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+        break;
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      if (DSrc == DDst) {
+        // Destination can be:
+        //     %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+        MI->setDesc(get(ARM::VDUPLN32d));
+        MIB.addReg(DDst, RegState::Define)
+           .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
+           .addImm(SrcLane);
+        AddDefaultPred(MIB);
+
+        // Neither the source or the destination are naturally represented any
+        // more, so add them in manually.
+        MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+        MIB.addReg(SrcReg, RegState::Implicit);
+        if (ImplicitSReg != 0)
+          MIB.addReg(ImplicitSReg, RegState::Implicit);
+        break;
+      }
+
+      // In general there's no single instruction that can perform an S <-> S
+      // move in NEON space, but a pair of VEXT instructions *can* do the
+      // job. It turns out that the VEXTs needed will only use DSrc once, with
+      // the position based purely on the combination of lane-0 and lane-1
+      // involved. For example
+      //     vmov s0, s2 -> vext.32 d0, d0, d1, #1  vext.32 d0, d0, d0, #1
+      //     vmov s1, s3 -> vext.32 d0, d1, d0, #1  vext.32 d0, d0, d0, #1
+      //     vmov s0, s3 -> vext.32 d0, d0, d0, #1  vext.32 d0, d1, d0, #1
+      //     vmov s1, s2 -> vext.32 d0, d0, d0, #1  vext.32 d0, d0, d1, #1
+      //
+      // Pattern of the MachineInstrs is:
+      //     %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+      MachineInstrBuilder NewMIB;
+      NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                       get(ARM::VEXTd32), DDst);
+
+      // On the first instruction, both DSrc and DDst may be <undef> if present.
+      // Specifically when the original instruction didn't have them as an
+      // <imp-use>.
+      unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+      bool CurUndef = !MI->readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = !MI->readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      NewMIB.addImm(1);
+      AddDefaultPred(NewMIB);
+
+      if (SrcLane == DstLane)
+        NewMIB.addReg(SrcReg, RegState::Implicit);
+
+      MI->setDesc(get(ARM::VEXTd32));
+      MIB.addReg(DDst, RegState::Define);
+
+      // On the second instruction, DDst has definitely been defined above, so
+      // it is not <undef>. DSrc, if present, can be <undef> as above.
+      CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      MIB.addImm(1);
+      AddDefaultPred(MIB);
+
+      if (SrcLane != DstLane)
+        MIB.addReg(SrcReg, RegState::Implicit);
+
+      // As before, the original destination is no longer represented, add it
+      // implicitly.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      if (ImplicitSReg != 0)
+        MIB.addReg(ImplicitSReg, RegState::Implicit);
+      break;
+    }
+  }
+
+}
+
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity.  That means any
+// instruction writing an S-reg implicitly reads the containing D-reg.  The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+unsigned ARMBaseInstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI,
+                             unsigned OpNum,
+                             const TargetRegisterInfo *TRI) const {
+  if (!SwiftPartialUpdateClearance ||
+      !(Subtarget.isSwift() || Subtarget.isCortexA15()))
+    return 0;
+
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  if (MO.readsReg())
+    return 0;
+  unsigned Reg = MO.getReg();
+  int UseOp = -1;
+
+  switch(MI->getOpcode()) {
+    // Normal instructions writing only an S-register.
+  case ARM::VLDRS:
+  case ARM::FCONSTS:
+  case ARM::VMOVSR:
+  case ARM::VMOVv8i8:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv1i64:
+    UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+    break;
+
+    // Explicitly reads the dependency.
+  case ARM::VLD1LNd32:
+    UseOp = 3;
+    break;
+  default:
+    return 0;
+  }
+
+  // If this instruction actually reads a value from Reg, there is no unwanted
+  // dependency.
+  if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+    return 0;
+
+  // We must be able to clobber the whole D-reg.
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // Virtual register must be a foo:ssub_0<def,undef> operand.
+    if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+      return 0;
+  } else if (ARM::SPRRegClass.contains(Reg)) {
+    // Physical register: MI must define the full D-reg.
+    unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
+                                             &ARM::DPRRegClass);
+    if (!DReg || !MI->definesRegister(DReg, TRI))
+      return 0;
+  }
+
+  // MI has an unwanted D-register dependency.
+  // Avoid defs in the previous N instructrions.
+  return SwiftPartialUpdateClearance;
+}
+
+// Break a partial register dependency after getPartialRegUpdateClearance
+// returned non-zero.
+void ARMBaseInstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI,
+                          unsigned OpNum,
+                          const TargetRegisterInfo *TRI) const {
+  assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned Reg = MO.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "Can't break virtual register dependencies.");
+  unsigned DReg = Reg;
+
+  // If MI defines an S-reg, find the corresponding D super-register.
+  if (ARM::SPRRegClass.contains(Reg)) {
+    DReg = ARM::D0 + (Reg - ARM::S0) / 2;
+    assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken");
+  }
+
+  assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
+  assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+
+  // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
+  // the full D-register by loading the same value to both lanes.  The
+  // instruction is micro-coded with 2 uops, so don't do this until we can
+  // properly schedule micro-coded instructions.  The dispatcher stalls cause
+  // too big regressions.
+
+  // Insert the dependency-breaking FCONSTD before MI.
+  // 96 is the encoding of 0.5, but the actual value doesn't matter here.
+  AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                         get(ARM::FCONSTD), DReg).addImm(96));
+  MI->addRegisterKilled(DReg, TRI, true);
+}
+
+void ARMBaseInstrInfo::getUnconditionalBranch(
+    MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
+  if (Subtarget.isThumb())
+    Branch.setOpcode(ARM::tB);
+  else if (Subtarget.isThumb2())
+    Branch.setOpcode(ARM::t2B);
+  else
+    Branch.setOpcode(ARM::Bcc);
+
+  Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+  Branch.addOperand(MCOperand::CreateImm(ARMCC::AL));
+  Branch.addOperand(MCOperand::CreateReg(0));
+}
+
+void ARMBaseInstrInfo::getTrap(MCInst &MI) const {
+  if (Subtarget.isThumb())
+    MI.setOpcode(ARM::tTRAP);
+  else if (Subtarget.useNaClTrap())
+    MI.setOpcode(ARM::TRAPNaCl);
+  else
+    MI.setOpcode(ARM::TRAP);
+}
+
+bool ARMBaseInstrInfo::hasNOP() const {
+  return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0;
+}
+
+bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
+  if (MI->getNumOperands() < 4)
+    return true;
+  unsigned ShOpVal = MI->getOperand(3).getImm();
+  unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal);
+  // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1.
+  if ((ShImm == 1 && ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsr) ||
+      ((ShImm == 1 || ShImm == 2) &&
+       ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsl))
+    return true;
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 0000000..b8d6758
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,441 @@
+//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINSTRUCTIONINFO_H
+#define ARMBASEINSTRUCTIONINFO_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "ARMGenInstrInfo.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseRegisterInfo;
+
+class ARMBaseInstrInfo : public ARMGenInstrInfo {
+  const ARMSubtarget &Subtarget;
+
+protected:
+  // Can be only subclassed.
+  explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+
+public:
+  // Return whether the target has an explicit NOP encoding.
+  bool hasNOP() const;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      LiveVariables *LV) const override;
+
+  virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+  const ARMSubtarget &getSubtarget() const { return Subtarget; }
+
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                               const ScheduleDAG *DAG) const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  // Branch analysis.
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  // Predication support.
+  bool isPredicated(const MachineInstr *MI) const override;
+
+  ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
+    int PIdx = MI->findFirstPredOperandIdx();
+    return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+                      : ARMCC::AL;
+  }
+
+  bool PredicateInstruction(MachineInstr *MI,
+                    const SmallVectorImpl<MachineOperand> &Pred) const override;
+
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
+
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const override;
+
+  bool isPredicable(MachineInstr *MI) const override;
+
+  /// GetInstSize - Returns the size of the specified MachineInstr.
+  ///
+  virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const override;
+  unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                    int &FrameIndex) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr *Orig,
+                     const TargetRegisterInfo &TRI) const override;
+
+  MachineInstr *duplicate(MachineInstr *Orig,
+                          MachineFunction &MF) const override;
+
+  MachineInstr *commuteInstruction(MachineInstr*,
+                                   bool=false) const override;
+
+  const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+                                     unsigned SubIdx, unsigned State,
+                                     const TargetRegisterInfo *TRI) const;
+
+  bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1,
+                        const MachineRegisterInfo *MRI) const override;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+  /// determine if two loads are loading from the same base address. It should
+  /// only return true if the base pointers are the same and the only
+  /// differences between the two addresses is the offset. It also returns the
+  /// offsets by reference.
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled togther. On some targets if two loads are loading from
+  /// addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load offsets
+  /// from the common base address. It returns true if it decides it's desirable
+  /// to schedule the two loads together. "NumLoads" is the number of loads that
+  /// have already been scheduled after Load1.
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+  bool isSchedulingBoundary(const MachineInstr *MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+                           unsigned NumCycles, unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
+                           unsigned ExtraT, MachineBasicBlock &FMBB,
+                           unsigned NumF, unsigned ExtraF,
+                           const BranchProbability &Probability) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                          const BranchProbability &Probability) const override {
+    return NumCycles == 1;
+  }
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+
+  /// optimizeCompareInstr - Convert the instruction to set the zero flag so
+  /// that we can remove a "comparison with zero"; Remove a redundant CMP
+  /// instruction if the flags can be updated in the same way by an earlier
+  /// instruction such as SUB.
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+  bool analyzeSelect(const MachineInstr *MI,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     unsigned &TrueOp, unsigned &FalseOp,
+                     bool &Optimizable) const override;
+
+  MachineInstr *optimizeSelect(MachineInstr *MI, bool) const override;
+
+  /// FoldImmediate - 'Reg' is known to be defined by a move immediate
+  /// instruction, try to fold the immediate into the use instruction.
+  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                     unsigned Reg, MachineRegisterInfo *MRI) const override;
+
+  unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+                          const MachineInstr *MI) const override;
+
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx,
+                        const MachineInstr *UseMI,
+                        unsigned UseIdx) const override;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const override;
+
+  /// VFP/NEON execution domains.
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr *MI) const override;
+  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+
+  unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
+                                      const TargetRegisterInfo*) const override;
+  void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  void
+  getUnconditionalBranch(MCInst &Branch,
+                         const MCSymbolRefExpr *BranchTarget) const override;
+
+  void getTrap(MCInst &MI) const override;
+
+  /// Get the number of addresses by LDM or VLDM or zero for unknown.
+  unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+
+private:
+  unsigned getInstBundleLength(const MachineInstr *MI) const;
+
+  int getVLDMDefCycle(const InstrItineraryData *ItinData,
+                      const MCInstrDesc &DefMCID,
+                      unsigned DefClass,
+                      unsigned DefIdx, unsigned DefAlign) const;
+  int getLDMDefCycle(const InstrItineraryData *ItinData,
+                     const MCInstrDesc &DefMCID,
+                     unsigned DefClass,
+                     unsigned DefIdx, unsigned DefAlign) const;
+  int getVSTMUseCycle(const InstrItineraryData *ItinData,
+                      const MCInstrDesc &UseMCID,
+                      unsigned UseClass,
+                      unsigned UseIdx, unsigned UseAlign) const;
+  int getSTMUseCycle(const InstrItineraryData *ItinData,
+                     const MCInstrDesc &UseMCID,
+                     unsigned UseClass,
+                     unsigned UseIdx, unsigned UseAlign) const;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MCInstrDesc &DefMCID,
+                        unsigned DefIdx, unsigned DefAlign,
+                        const MCInstrDesc &UseMCID,
+                        unsigned UseIdx, unsigned UseAlign) const;
+
+  unsigned getPredicationCost(const MachineInstr *MI) const override;
+
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           const MachineInstr *MI,
+                           unsigned *PredCost = nullptr) const override;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const override;
+
+  bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI,
+                             unsigned UseIdx) const override;
+  bool hasLowDefLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI,
+                        unsigned DefIdx) const override;
+
+  /// verifyInstruction - Perform target specific instruction verification.
+  bool verifyInstruction(const MachineInstr *MI,
+                         StringRef &ErrInfo) const override;
+
+private:
+  /// Modeling special VFP / NEON fp MLA / MLS hazards.
+
+  /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
+  /// MLx table.
+  DenseMap<unsigned, unsigned> MLxEntryMap;
+
+  /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
+  /// stalls when scheduled together with fp MLA / MLS opcodes.
+  SmallSet<unsigned, 16> MLxHazardOpcodes;
+
+public:
+  /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
+  /// instruction.
+  bool isFpMLxInstruction(unsigned Opcode) const {
+    return MLxEntryMap.count(Opcode);
+  }
+
+  /// isFpMLxInstruction - This version also returns the multiply opcode and the
+  /// addition / subtraction opcode to expand to. Return true for 'HasLane' for
+  /// the MLX instructions with an extra lane operand.
+  bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                          unsigned &AddSubOpc, bool &NegAcc,
+                          bool &HasLane) const;
+
+  /// canCauseFpMLxStall - Return true if an instruction of the specified opcode
+  /// will cause stalls when scheduled after (within 4-cycle window) a fp
+  /// MLA / MLS instruction.
+  bool canCauseFpMLxStall(unsigned Opcode) const {
+    return MLxHazardOpcodes.count(Opcode);
+  }
+
+  /// Returns true if the instruction has a shift by immediate that can be
+  /// executed in one cycle less.
+  bool isSwiftFastImmShift(const MachineInstr *MI) const;
+};
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+  return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+                                          bool isDead = false) {
+  return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+bool isUncondBranchOpcode(int Opc) {
+  return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
+}
+
+static inline
+bool isCondBranchOpcode(int Opc) {
+  return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
+}
+
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+  return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd ||
+    Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT;
+}
+
+static inline
+bool isIndirectBranchOpcode(int Opc) {
+  return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
+}
+
+static inline bool isPopOpcode(int Opc) {
+  return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
+         Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
+         Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
+}
+
+static inline bool isPushOpcode(int Opc) {
+  return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
+         Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+int getMatchingCondBranchOpcode(int Opc);
+
+/// Determine if MI can be folded into an ARM MOVCC instruction, and return the
+/// opcode of the SSA instruction representing the conditional MI.
+unsigned canFoldARMInstrIntoMOVCC(unsigned Reg,
+                                  MachineInstr *&MI,
+                                  const MachineRegisterInfo &MRI);
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materializea destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                             unsigned DestReg, unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                            unsigned DestReg, unsigned BaseReg, int NumBytes,
+                            ARMCC::CondCodes Pred, unsigned PredReg,
+                            const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, const TargetInstrInfo &TII,
+                               const ARMBaseRegisterInfo& MRI,
+                               unsigned MIFlags = 0);
+
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes. This can save a few bytes per function in code-size, but
+/// obviously generates more memory traffic. As such, it only takes
+/// effect in functions being optimised for size.
+bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+                                MachineFunction &MF, MachineInstr *MI,
+                                unsigned NumBytes);
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 0000000..32b5f4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,836 @@
+//===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseRegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMFrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define DEBUG_TYPE "arm-register-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "ARMGenRegisterInfo.inc"
+
+using namespace llvm;
+
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
+    : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) {
+  if (STI.isTargetMachO()) {
+    if (STI.isTargetDarwin() || STI.isThumb1Only())
+      FramePtr = ARM::R7;
+    else
+      FramePtr = ARM::R11;
+  } else if (STI.isTargetWindows())
+    FramePtr = ARM::R11;
+  else // ARM EABI
+    FramePtr = STI.isThumb() ? ARM::R7 : ARM::R11;
+}
+
+const MCPhysReg*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const MCPhysReg *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+                                ? CSR_iOS_SaveList
+                                : CSR_AAPCS_SaveList;
+
+  if (!MF) return RegList;
+
+  const Function *F = MF->getFunction();
+  if (F->getCallingConv() == CallingConv::GHC) {
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_NoRegs_SaveList;
+  } else if (F->hasFnAttribute("interrupt")) {
+    if (STI.isMClass()) {
+      // M-class CPUs have hardware which saves the registers needed to allow a
+      // function conforming to the AAPCS to function as a handler.
+      return CSR_AAPCS_SaveList;
+    } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
+      // Fast interrupt mode gives the handler a private copy of R8-R14, so less
+      // need to be saved to restore user-mode state.
+      return CSR_FIQ_SaveList;
+    } else {
+      // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+      // exception handling.
+      return CSR_GenericInt_SaveList;
+    }
+  }
+
+  return RegList;
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return CSR_NoRegs_RegMask;
+  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+    ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i32 argument (which must also be the register used to return a
+  // single i32 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both or otherwise does not want to enable this optimization, the function
+  // should return NULL
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return nullptr;
+  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+    ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
+}
+
+BitVector ARMBaseRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(ARM::SP);
+  Reserved.set(ARM::PC);
+  Reserved.set(ARM::FPSCR);
+  Reserved.set(ARM::APSR_NZCV);
+  if (TFI->hasFP(MF))
+    Reserved.set(FramePtr);
+  if (hasBasePointer(MF))
+    Reserved.set(BasePtr);
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    Reserved.set(ARM::R9);
+  // Reserve D16-D31 if the subtarget doesn't support them.
+  if (!STI.hasVFP3() || STI.hasD16()) {
+    assert(ARM::D31 == ARM::D16 + 15);
+    for (unsigned i = 0; i != 16; ++i)
+      Reserved.set(ARM::D16 + i);
+  }
+  const TargetRegisterClass *RC  = &ARM::GPRPairRegClass;
+  for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
+    for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI)
+      if (Reserved.test(*SI)) Reserved.set(*I);
+
+  return Reserved;
+}
+
+const TargetRegisterClass*
+ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                         const {
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+  do {
+    switch (Super->getID()) {
+    case ARM::GPRRegClassID:
+    case ARM::SPRRegClassID:
+    case ARM::DPRRegClassID:
+    case ARM::QPRRegClassID:
+    case ARM::QQPRRegClassID:
+    case ARM::QQQQPRRegClassID:
+    case ARM::GPRPairRegClassID:
+      return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &ARM::GPRRegClass;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &ARM::CCRRegClass)
+    return nullptr;  // Can't copy CCR registers.
+  return RC;
+}
+
+unsigned
+ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                         MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case ARM::tGPRRegClassID:
+    return TFI->hasFP(MF) ? 4 : 5;
+  case ARM::GPRRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 10 - FP - (STI.isR9Reserved() ? 1 : 0);
+  }
+  case ARM::SPRRegClassID:  // Currently not used as 'rep' register class.
+  case ARM::DPRRegClassID:
+    return 32 - 10;
+  }
+}
+
+// Get the other register in a GPRPair.
+static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
+  for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers)
+    if (ARM::GPRPairRegClass.contains(*Supers))
+      return RI->getSubReg(*Supers, Odd ? ARM::gsub_1 : ARM::gsub_0);
+  return 0;
+}
+
+// Resolve the RegPairEven / RegPairOdd register allocator hints.
+void
+ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  unsigned Odd;
+  switch (Hint.first) {
+  case ARMRI::RegPairEven:
+    Odd = 0;
+    break;
+  case ARMRI::RegPairOdd:
+    Odd = 1;
+    break;
+  default:
+    TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+    return;
+  }
+
+  // This register should preferably be even (Odd == 0) or odd (Odd == 1).
+  // Check if the other part of the pair has already been assigned, and provide
+  // the paired register as the first hint.
+  unsigned PairedPhys = 0;
+  if (VRM && VRM->hasPhys(Hint.second)) {
+    PairedPhys = getPairedGPR(VRM->getPhys(Hint.second), Odd, this);
+    if (PairedPhys && MRI.isReserved(PairedPhys))
+      PairedPhys = 0;
+  }
+
+  // First prefer the paired physreg.
+  if (PairedPhys &&
+      std::find(Order.begin(), Order.end(), PairedPhys) != Order.end())
+    Hints.push_back(PairedPhys);
+
+  // Then prefer even or odd registers.
+  for (unsigned I = 0, E = Order.size(); I != E; ++I) {
+    unsigned Reg = Order[I];
+    if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd)
+      continue;
+    // Don't provide hints that are paired to a reserved register.
+    unsigned Paired = getPairedGPR(Reg, !Odd, this);
+    if (!Paired || MRI.isReserved(Paired))
+      continue;
+    Hints.push_back(Reg);
+  }
+}
+
+void
+ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+                                        MachineFunction &MF) const {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
+  if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
+       Hint.first == (unsigned)ARMRI::RegPairEven) &&
+      TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+    // If 'Reg' is one of the even / odd register pair and it's now changed
+    // (e.g. coalesced) into a different register. The other register of the
+    // pair allocation hint must be updated to reflect the relationship
+    // change.
+    unsigned OtherReg = Hint.second;
+    Hint = MRI->getRegAllocationHint(OtherReg);
+    if (Hint.second == Reg)
+      // Make sure the pair has not already divorced.
+      MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
+  }
+}
+
+bool
+ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
+  // CortexA9 has a Write-after-write hazard for NEON registers.
+  if (!STI.isLikeA9())
+    return false;
+
+  switch (RC->getID()) {
+  case ARM::DPRRegClassID:
+  case ARM::DPR_8RegClassID:
+  case ARM::DPR_VFP2RegClassID:
+  case ARM::QPRRegClassID:
+  case ARM::QPR_8RegClassID:
+  case ARM::QPR_VFP2RegClassID:
+  case ARM::SPRRegClassID:
+  case ARM::SPR_8RegClassID:
+    // Avoid reusing S, D, and Q registers.
+    // Don't increase register pressure for QQ and QQQQ.
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // When outgoing call frames are so large that we adjust the stack pointer
+  // around the call, we can no longer use the stack pointer to reach the
+  // emergency spill slot.
+  if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
+    return true;
+
+  // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
+  // negative range for ldr/str (255), and thumb1 is positive offsets only.
+  // It's going to be better to use the SP or Base Pointer instead. When there
+  // are variable sized objects, we can't reference off of the SP, so we
+  // reserve a Base Pointer.
+  if (AFI->isThumbFunction() && MFI->hasVarSizedObjects()) {
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, the scavenger will still enable access to work, it just
+    // won't be optimal.
+    if (AFI->isThumb2Function() && MFI->getLocalFrameSize() < 128)
+      return false;
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  // We can't realign the stack if:
+  // 1. Dynamic stack realignment is explicitly disabled,
+  // 2. This is a Thumb1 function (it's not useful, so we don't bother), or
+  // 3. There are VLAs in the function and the base pointer is disabled.
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+    return false;
+  if (AFI->isThumb1OnlyFunction())
+    return false;
+  // Stack realignment requires a frame pointer.  If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(FramePtr))
+    return false;
+  // We may also need a base pointer if there are dynamic allocas or stack
+  // pointer adjustments around calls.
+  if (MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF))
+    return true;
+  // A base pointer is required and allowed.  Check that it isn't too late to
+  // reserve it.
+  return MRI->canReserveReg(BasePtr);
+}
+
+bool ARMBaseRegisterInfo::
+needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackAlignment));
+
+  return requiresRealignment && canRealignStack(MF);
+}
+
+bool ARMBaseRegisterInfo::
+cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack())
+    return true;
+  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken()
+    || needsStackRealignment(MF);
+}
+
+unsigned
+ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (TFI->hasFP(MF))
+    return FramePtr;
+  return ARM::SP;
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void ARMBaseRegisterInfo::
+emitLoadConstPool(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator &MBBI,
+                  DebugLoc dl,
+                  unsigned DestReg, unsigned SubIdx, int Val,
+                  ARMCC::CondCodes Pred,
+                  unsigned PredReg, unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C =
+        ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx)
+    .addImm(0).addImm(Pred).addReg(PredReg)
+    .setMIFlags(MIFlags);
+}
+
+bool ARMBaseRegisterInfo::mayOverrideLocalAssignment() const {
+  // The native linux build hits a downstream codegen bug when this is enabled.
+  return STI.isTargetDarwin();
+}
+
+bool ARMBaseRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+requiresVirtualBaseRegisters(const MachineFunction &MF) const {
+  return true;
+}
+
+int64_t ARMBaseRegisterInfo::
+getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  int64_t InstrOffs = 0;
+  int Scale = 1;
+  unsigned ImmIdx = 0;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+  case ARMII::AddrMode_i12:
+    InstrOffs = MI->getOperand(Idx+1).getImm();
+    Scale = 1;
+    break;
+  case ARMII::AddrMode5: {
+    // VFP address mode.
+    const MachineOperand &OffOp = MI->getOperand(Idx+1);
+    InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+    if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    Scale = 4;
+    break;
+  }
+  case ARMII::AddrMode2: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrMode3: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrModeT1_s: {
+    ImmIdx = Idx+1;
+    InstrOffs = MI->getOperand(ImmIdx).getImm();
+    Scale = 4;
+    break;
+  }
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  return InstrOffs * Scale;
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool ARMBaseRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) {
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12:
+  case ARM::STRi12: case ARM::STRH: case ARM::STRBi12:
+  case ARM::t2LDRi12: case ARM::t2LDRi8:
+  case ARM::t2STRi12: case ARM::t2STRi8:
+  case ARM::VLDRS: case ARM::VLDRD:
+  case ARM::VSTRS: case ARM::VSTRD:
+  case ARM::tSTRspi: case ARM::tLDRspi:
+    break;
+  default:
+    return false;
+  }
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed. R4-R6
+  // will be earlier than the FP, so we ignore those.
+  // R7, LR
+  int64_t FPOffset = Offset - 8;
+  // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15
+  if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction())
+    FPOffset -= 80;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset = -Offset;
+  Offset += MFI->getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  unsigned StackAlign = TFI->getStackAlignment();
+  if (TFI->hasFP(MF) &&
+      !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+    if (isFrameOffsetLegal(MI, FPOffset))
+      return false;
+  }
+  // If we can reference via the stack pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, Offset))
+    return false;
+
+  // The offset likely isn't legal, we want to allocate a virtual base register.
+  return true;
+}
+
+/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void ARMBaseRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
+    (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri);
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;                  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset));
+
+  if (!AFI->isThumb1OnlyFunction())
+    AddDefaultCC(MIB);
+}
+
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This resolveFrameIndex does not support Thumb1!");
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+  }
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             int64_t Offset) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  unsigned i = 0;
+
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // AddrMode4 and AddrMode6 cannot handle any offset.
+  if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+    return Offset == 0;
+
+  unsigned NumBits = 0;
+  unsigned Scale = 1;
+  bool isSigned = true;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+    // i8 supports only negative, and i12 supports only positive, so
+    // based on Offset sign, consider the appropriate instruction
+    Scale = 1;
+    if (Offset < 0) {
+      NumBits = 8;
+      Offset = -Offset;
+    } else {
+      NumBits = 12;
+    }
+    break;
+  case ARMII::AddrMode5:
+    // VFP address mode.
+    NumBits = 8;
+    Scale = 4;
+    break;
+  case ARMII::AddrMode_i12:
+  case ARMII::AddrMode2:
+    NumBits = 12;
+    break;
+  case ARMII::AddrMode3:
+    NumBits = 8;
+    break;
+  case ARMII::AddrModeT1_s:
+    NumBits = 5;
+    Scale = 4;
+    isSigned = false;
+    break;
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  Offset += getFrameIndexInstrOffset(MI, i);
+  // Make sure the offset is encodable for instructions that scale the
+  // immediate.
+  if ((Offset & (Scale-1)) != 0)
+    return false;
+
+  if (isSigned && Offset < 0)
+    Offset = -Offset;
+
+  unsigned Mask = (1 << NumBits) - 1;
+  if ((unsigned)Offset <= Mask * Scale)
+    return true;
+
+  return false;
+}
+
+void
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  const ARMFrameLowering *TFI =
+    static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateFrameIndex does not support Thumb1!");
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+
+  int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated.  That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+    assert(TFI->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  assert(!MI.isDebugValue() && "DBG_VALUEs should be handled in target-independent code");
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  }
+  if (Done)
+    return;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert((Offset ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
+         "This code isn't needed if offset already handled!");
+
+  unsigned ScratchReg = 0;
+  int PIdx = MI.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+    ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+  unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+  if (Offset == 0)
+    // Must be addrmode4/6.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
+  else {
+    ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+    if (!AFI->isThumbFunction())
+      emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                              Offset, Pred, PredReg, TII);
+    else {
+      assert(AFI->isThumb2Function());
+      emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                             Offset, Pred, PredReg, TII);
+    }
+    // Update the original instruction to use the scratch register.
+    MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true);
+  }
+}
+
+bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If not copying into a sub-register this should be ok because we shouldn't
+  // need to split the reg.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If the source register class is more expensive than the destination, the
+  // coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow coalescing
+  // unfortunately we don't know yet if we will be constrained.
+  // The goal of this heuristic is to restrict how many expensive registers
+  // we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+    << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+    << NewRCWeight.RegWeight << "\n");
+
+  // This number is the largest round number that which meets the criteria:
+  //  (1) addresses PR18825
+  //  (2) generates better code in some test cases (like vldm-shed-a9.ll)
+  //  (3) Doesn't regress any test cases (in-tree, test-suite, and SPEC)
+  // In practice the SizeMultiplier will only factor in for straight line code
+  // that uses a lot of NEON vectors, which isn't terribly common.
+  unsigned SizeMultiplier = MBB->size()/100;
+  SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1;
+  if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) {
+    It->second += NewRCWeight.RegWeight;
+    return true;
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 0000000..833d3f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,202 @@
+//===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEREGISTERINFO_H
+#define ARMBASEREGISTERINFO_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "ARMGenRegisterInfo.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+  class Type;
+
+/// Register allocation hints.
+namespace ARMRI {
+  enum {
+    RegPairOdd  = 1,
+    RegPairEven = 2
+  };
+}
+
+/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
+/// or a stack/pc register that we should push/pop.
+static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case R0:  case R1:  case R2:  case R3:
+    case R4:  case R5:  case R6:  case R7:
+    case LR:  case SP:  case PC:
+      return true;
+    case R8:  case R9:  case R10: case R11: case R12:
+      // For iOS we want r7 and lr to be next to each other.
+      return !isIOS;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case R8: case R9: case R10: case R11: case R12:
+      // iOS has this second area.
+      return isIOS;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case D15: case D14: case D13: case D12:
+    case D11: case D10: case D9:  case D8:
+      return true;
+    default:
+      return false;
+  }
+}
+
+static inline bool isCalleeSavedRegister(unsigned Reg,
+                                         const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+  const ARMSubtarget &STI;
+
+  /// FramePtr - ARM physical register used as frame ptr.
+  unsigned FramePtr;
+
+  /// BasePtr - ARM physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
+  // Can be only subclassed.
+  explicit ARMBaseRegisterInfo(const ARMSubtarget &STI);
+
+  // Return the opcode that implements 'Op', or 0 if no opcode
+  unsigned getOpcode(int Op) const;
+
+public:
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+  const uint32_t *getNoPreservedMask() const;
+
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i32 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i32 argument and an i32 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  void getRegAllocationHints(unsigned VirtReg,
+                             ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF,
+                             const VirtRegMap *VRM) const override;
+
+  void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+                          MachineFunction &MF) const override;
+
+  bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const override;
+
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const;
+  bool needsStackRealignment(const MachineFunction &MF) const override;
+  int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+                                   int Idx) const override;
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                    unsigned BaseReg, int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI,
+                          int64_t Offset) const override;
+
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getBaseRegister() const { return BasePtr; }
+
+  bool isLowRegister(unsigned Reg) const;
+
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  virtual void emitLoadConstPool(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI,
+                                 DebugLoc dl, unsigned DestReg, unsigned SubIdx,
+                                 int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+                                 unsigned PredReg = 0,
+                                 unsigned MIFlags = MachineInstr::NoFlags)const;
+
+  /// Code Generation virtual methods...
+  bool mayOverrideLocalAssignment() const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
new file mode 100644
index 0000000..dc41c1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -0,0 +1,264 @@
+//=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMCALLINGCONV_H
+#define ARMCALLINGCONV_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+// APCS f64 is in register pairs, possibly split to stack
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          CCState &State, bool CanFail) {
+  static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  // Try to get the first register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else {
+    // For the 2nd half of a v2f64, do not fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get the second register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4, 4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+  static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
+  static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2);
+  if (Reg == 0) {
+
+    // If we had R3 unallocated only, now we still must to waste it.
+    Reg = State.AllocateReg(GPRArgRegs, 4);
+    assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
+
+    // For the 2nd half of a v2f64, do not just fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  unsigned T = State.AllocateReg(LoRegList[i]);
+  (void)T;
+  assert(T == LoRegList[i] && "Could not allocate register");
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
+  if (Reg == 0)
+    return false; // we didn't handle it
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  return true;  // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+                                   State);
+}
+
+static const uint16_t SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+                                     ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+                                     ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
+                                     ARM::S12, ARM::S13, ARM::S14,  ARM::S15 };
+static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+                                     ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
+static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
+
+// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
+// has InConsecutiveRegs set, and that the last member also has
+// InConsecutiveRegsLast set. We must process all members of the HA before
+// we can allocate it, as we need to know the total number of registers that
+// will be needed in order to (attempt to) allocate a contiguous block.
+static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
+  // AAPCS HFAs must have 1-4 elements, all of the same type
+  assert(PendingHAMembers.size() < 8);
+  if (PendingHAMembers.size() > 0)
+    assert(PendingHAMembers[0].getLocVT() == LocVT);
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // HA
+  PendingHAMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (ArgFlags.isInConsecutiveRegsLast()) {
+    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 8 &&
+           "Homogeneous aggregates must have between 1 and 4 members");
+
+    // Try to allocate a contiguous block of registers, each of the correct
+    // size to hold one member.
+    const uint16_t *RegList;
+    unsigned NumRegs;
+    switch (LocVT.SimpleTy) {
+    case MVT::i32:
+    case MVT::f32:
+      RegList = SRegList;
+      NumRegs = 16;
+      break;
+    case MVT::f64:
+      RegList = DRegList;
+      NumRegs = 8;
+      break;
+    case MVT::v2f64:
+      RegList = QRegList;
+      NumRegs = 4;
+      break;
+    default:
+      llvm_unreachable("Unexpected member type for HA");
+      break;
+    }
+
+    unsigned RegResult =
+        State.AllocateRegBlock(RegList, NumRegs, PendingHAMembers.size());
+
+    if (RegResult) {
+      for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
+           It != PendingHAMembers.end(); ++It) {
+        It->convertToReg(RegResult);
+        State.addLoc(*It);
+        ++RegResult;
+      }
+      PendingHAMembers.clear();
+      return true;
+    }
+
+    // Register allocation failed, fall back to the stack
+
+    // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp)
+    for (unsigned regNo = 0; regNo < 16; ++regNo)
+      State.AllocateReg(SRegList[regNo]);
+
+    unsigned Size = LocVT.getSizeInBits() / 8;
+    unsigned Align = Size;
+
+    if (LocVT.SimpleTy == MVT::v2f64 || LocVT.SimpleTy == MVT::i32) {
+      // Vectors are always aligned to 8 bytes. If we've seen an i32 here
+      // it's because it's been split from a larger type, also with align 8.
+      Align = 8;
+    }
+
+    for (auto It : PendingHAMembers) {
+      It.convertToMem(State.AllocateStack(Size, Align));
+      State.addLoc(It);
+
+      // Only the first member needs to be aligned.
+      Align = 1;
+    }
+
+    // All pending members have now been allocated
+    PendingHAMembers.clear();
+  }
+
+  // This will be allocated by the last member of the HA
+  return true;
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 0000000..526089b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,241 @@
+//===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A>:
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+    
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 4>>
+]>;
+
+def RetCC_ARM_APCS : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+def FastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+
+  // CPRCs may be allocated to co-processor registers or the stack - they
+  // may never be allocated to core registers. 
+  CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>,
+
+  CCDelegateTo<CC_ARM_APCS>
+]>;
+
+def RetFastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_APCS>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_APCS_GHC : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
+  CCIfType<[f32], CCAssignToReg<[S16, S17, S18, S19, S20, S21, S22, S23]>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, SpLim
+  CCIfType<[i32], CCAssignToReg<[R4, R5, R6, R7, R8, R9, R10, R11]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention, common parts
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_Common : CallingConv<[
+
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // i64/f64 is passed in even pairs of GPRs
+  // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
+  // (and the same is true for f64 if VFP is not enabled)
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
+  CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8",
+                       CCAssignToReg<[R0, R1, R2, R3]>>>,
+
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
+  CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
+  CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>>
+]>;
+
+def RetCC_ARM_AAPCS_Common : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS-VFP (EABI) Calling Convention
+// Also used for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_VFP : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // HFAs are passed in a contiguous block of registers, or on the stack
+  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
+                                     (sequence "D%u", 15, 8))>;
+
+// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
+// and the pointer return value are both passed in R0 in these cases, this can
+// be partially modelled by treating R0 as a callee-saved register
+// Only the resulting RegMask is used; the SaveList is ignored
+def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
+                                            R5, R4, (sequence "D%u", 15, 8),
+                                            R0)>;
+
+// iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register.
+// Also save R7-R4 first to match the stack frame fixed spill areas.
+def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+
+def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+                                         (sub CSR_AAPCS_ThisReturn, R9))>;
+
+// The "interrupt" attribute is used to generate code that is acceptable in
+// exception-handlers of various kinds. It makes us use a different return
+// instruction (handled elsewhere) and affects which registers we must return to
+// our "caller" in the same state as we receive them.
+
+// For most interrupts, all registers except SP and LR are shared with
+// user-space. We mark LR to be saved anyway, since this is what the ARM backend
+// generally does rather than tracking its liveness as a normal register.
+def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>;
+
+// The fast interrupt handlers have more private state and get their own copies
+// of R8-R12, in addition to SP and LR. As before, mark LR for saving too.
+
+// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and
+// current frame lowering expects to encounter it while processing callee-saved
+// registers.
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
new file mode 100644
index 0000000..5fb6ebfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -0,0 +1,1909 @@
+//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the ARM machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+  class ARMCodeEmitter : public MachineFunctionPass {
+    ARMJITInfo                *JTI;
+    const ARMBaseInstrInfo    *II;
+    const DataLayout          *TD;
+    const ARMSubtarget        *Subtarget;
+    TargetMachine             &TM;
+    JITCodeEmitter            &MCE;
+    MachineModuleInfo *MMI;
+    const std::vector<MachineConstantPoolEntry> *MCPEs;
+    const std::vector<MachineJumpTableEntry> *MJTEs;
+    bool IsPIC;
+    bool IsThumb;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    static char ID;
+  public:
+    ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+      : MachineFunctionPass(ID), JTI(nullptr),
+        II((const ARMBaseInstrInfo *)tm.getInstrInfo()),
+        TD(tm.getDataLayout()), TM(tm),
+        MCE(mce), MCPEs(nullptr), MJTEs(nullptr),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "ARM Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+
+    void emitWordLE(unsigned Binary);
+    void emitDWordLE(uint64_t Binary);
+    void emitConstPoolInstruction(const MachineInstr &MI);
+    void emitMOVi32immInstruction(const MachineInstr &MI);
+    void emitMOVi2piecesInstruction(const MachineInstr &MI);
+    void emitLEApcrelJTInstruction(const MachineInstr &MI);
+    void emitPseudoMoveInstruction(const MachineInstr &MI);
+    void addPCLabel(unsigned LabelID);
+    void emitPseudoInstruction(const MachineInstr &MI);
+    unsigned getMachineSoRegOpValue(const MachineInstr &MI,
+                                    const MCInstrDesc &MCID,
+                                    const MachineOperand &MO,
+                                    unsigned OpIdx);
+
+    unsigned getMachineSoImmOpValue(unsigned SoImm);
+    unsigned getAddrModeSBit(const MachineInstr &MI,
+                             const MCInstrDesc &MCID) const;
+
+    void emitDataProcessingInstruction(const MachineInstr &MI,
+                                       unsigned ImplicitRd = 0,
+                                       unsigned ImplicitRn = 0);
+
+    void emitLoadStoreInstruction(const MachineInstr &MI,
+                                  unsigned ImplicitRd = 0,
+                                  unsigned ImplicitRn = 0);
+
+    void emitMiscLoadStoreInstruction(const MachineInstr &MI,
+                                      unsigned ImplicitRn = 0);
+
+    void emitLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+    void emitMulFrmInstruction(const MachineInstr &MI);
+
+    void emitExtendInstruction(const MachineInstr &MI);
+
+    void emitMiscArithInstruction(const MachineInstr &MI);
+
+    void emitSaturateInstruction(const MachineInstr &MI);
+
+    void emitBranchInstruction(const MachineInstr &MI);
+
+    void emitInlineJumpTable(unsigned JTIndex);
+
+    void emitMiscBranchInstruction(const MachineInstr &MI);
+
+    void emitVFPArithInstruction(const MachineInstr &MI);
+
+    void emitVFPConversionInstruction(const MachineInstr &MI);
+
+    void emitVFPLoadStoreInstruction(const MachineInstr &MI);
+
+    void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+    void emitNEONLaneInstruction(const MachineInstr &MI);
+    void emitNEONDupInstruction(const MachineInstr &MI);
+    void emitNEON1RegModImmInstruction(const MachineInstr &MI);
+    void emitNEON2RegInstruction(const MachineInstr &MI);
+    void emitNEON3RegInstruction(const MachineInstr &MI);
+
+    /// getMachineOpValue - Return binary encoding of operand. If the machine
+    /// operand requires relocation, record the relocation and return zero.
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO) const;
+    unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) const {
+      return getMachineOpValue(MI, MI.getOperand(OpIdx));
+    }
+
+    // FIXME: The legacy JIT ARMCodeEmitter doesn't rely on the the
+    //  TableGen'erated getBinaryCodeForInstr() function to encode any
+    //  operand values, instead querying getMachineOpValue() directly for
+    //  each operand it needs to encode. Thus, any of the new encoder
+    //  helper functions can simply return 0 as the values the return
+    //  are already handled elsewhere. They are placeholders to allow this
+    //  encoder to continue to function until the MC encoder is sufficiently
+    //  far along that this one can be eliminated entirely.
+    unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val)
+      const { return 0; }
+    unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
+    unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
+    unsigned NEONThumb2V8PostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
+    unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
+      const { return 0; }
+    unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBLTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBRTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBCCTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbCBTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getUnconditionalBranchTargetOpValue(const MachineInstr &MI,
+      unsigned Op) const { return 0; }
+    unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getARMBLTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getARMBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSORegRegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSORegImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm0_1020s4OpValue(const MachineInstr &MI,unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI,
+                                                 unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
+                                            unsigned Op) const { return 0; }
+    uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0; }
+
+    unsigned getAddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
+      const {
+      // {17-13} = reg
+      // {12}    = (U)nsigned (add == '1', sub == '0')
+      // {11-0}  = imm12
+      const MachineOperand &MO  = MI.getOperand(Op);
+      const MachineOperand &MO1 = MI.getOperand(Op + 1);
+      if (!MO.isReg()) {
+        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
+        return 0;
+      }
+      unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
+      int32_t Imm12 = MO1.getImm();
+      uint32_t Binary;
+      Binary = Imm12 & 0xfff;
+      if (Imm12 >= 0)
+        Binary |= (1 << 12);
+      Binary |= (Reg << 13);
+      return Binary;
+    }
+
+    unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const {
+      return 0;
+    }
+
+    uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const {
+      // {17-13} = reg
+      // {12}    = (U)nsigned (add == '1', sub == '0')
+      // {11-0}  = imm12
+      const MachineOperand &MO  = MI.getOperand(Op);
+      const MachineOperand &MO1 = MI.getOperand(Op + 1);
+      if (!MO.isReg()) {
+        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
+        return 0;
+      }
+      unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
+      int32_t Imm12 = MO1.getImm();
+
+      // Special value for #-0
+      if (Imm12 == INT32_MIN)
+        Imm12 = 0;
+
+      // Immediate is always encoded as positive. The 'U' bit controls add vs
+      // sub.
+      bool isAdd = true;
+      if (Imm12 < 0) {
+        Imm12 = -Imm12;
+        isAdd = false;
+      }
+
+      uint32_t Binary = Imm12 & 0xfff;
+      if (isAdd)
+        Binary |= (1 << 12);
+      Binary |= (Reg << 13);
+      return Binary;
+    }
+    unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    unsigned getShiftRight8Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getShiftRight16Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getShiftRight32Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getShiftRight64Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    /// getMovi32Value - Return binary encoding of operand for movw/movt. If the
+    /// machine operand requires relocation, record the relocation and return
+    /// zero.
+    unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO,
+                            unsigned Reloc);
+
+    /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+    ///
+    unsigned getShiftOp(unsigned Imm) const ;
+
+    /// Routines that handle operands which add machine relocations which are
+    /// fixed up by the relocation stage.
+    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                           bool MayNeedFarStub,  bool Indirect,
+                           intptr_t ACPV = 0) const;
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
+    void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
+                               intptr_t JTBase = 0) const;
+    unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) const;
+  };
+}
+
+char ARMCodeEmitter::ID = 0;
+
+/// createARMJITCodeEmitterPass - Return a pass that emits the collected ARM
+/// code to the specified MCE object.
+FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new ARMCodeEmitter(TM, JCE);
+}
+
+bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  TargetMachine &Target = const_cast<TargetMachine&>(MF.getTarget());
+
+  assert((Target.getRelocationModel() != Reloc::Default ||
+          Target.getRelocationModel() != Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+
+  JTI = static_cast<ARMJITInfo*>(Target.getJITInfo());
+  II = static_cast<const ARMBaseInstrInfo*>(Target.getInstrInfo());
+  TD = Target.getDataLayout();
+
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  MJTEs = nullptr;
+  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+  IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction();
+  JTI->Initialize(MF, IsPIC);
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+          << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+///
+unsigned ARMCodeEmitter::getShiftOp(unsigned Imm) const {
+  switch (ARM_AM::getAM2ShiftOpc(Imm)) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::asr: return 2;
+  case ARM_AM::lsl: return 0;
+  case ARM_AM::lsr: return 1;
+  case ARM_AM::ror:
+  case ARM_AM::rrx: return 3;
+  }
+}
+
+/// getMovi32Value - Return binary encoding of operand for movw/movt. If the
+/// machine operand requires relocation, record the relocation and return zero.
+unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI,
+                                        const MachineOperand &MO,
+                                        unsigned Reloc) {
+  assert(((Reloc == ARM::reloc_arm_movt) || (Reloc == ARM::reloc_arm_movw))
+      && "Relocation to this function should be for movt or movw");
+
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), Reloc, true, false);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), Reloc);
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), Reloc);
+  else {
+#ifndef NDEBUG
+    errs() << MO;
+#endif
+    llvm_unreachable("Unsupported operand type for movw/movt");
+  }
+  return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) const {
+  if (MO.isReg())
+    return II->getRegisterInfo().getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
+  else if (MO.isCPI()) {
+    const MCInstrDesc &MCID = MI.getDesc();
+    // For VFP load, the immediate offset is multiplied by 4.
+    unsigned Reloc =  ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm)
+      ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry;
+    emitConstPoolAddress(MO.getIndex(), Reloc);
+  } else if (MO.isJTI())
+    emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative);
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream.
+///
+void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                                       bool MayNeedFarStub, bool Indirect,
+                                       intptr_t ACPV) const {
+  MachineRelocation MR = Indirect
+    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+                                           const_cast<GlobalValue *>(GV),
+                                           ACPV, MayNeedFarStub)
+    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                               const_cast<GlobalValue *>(GV), ACPV,
+                               MayNeedFarStub);
+  MCE.addRelocation(MR);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES));
+}
+
+/// emitConstPoolAddress - Arrange for the address of an constant pool
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  // Tell JIT emitter we'll resolve the address.
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, true));
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTIndex, 0, true));
+}
+
+/// emitMachineBasicBlock - Emit the specified address basic block.
+void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                           unsigned Reloc,
+                                           intptr_t JTBase) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB, JTBase));
+}
+
+void ARMCodeEmitter::emitWordLE(unsigned Binary) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Binary) << "\n");
+  MCE.emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitDWordLE(uint64_t Binary) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Binary) << "\n");
+  MCE.emitDWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  ++NumEmitted;  // Keep track of the # of mi's emitted
+  switch (MI.getDesc().TSFlags & ARMII::FormMask) {
+  default: {
+    llvm_unreachable("Unhandled instruction encoding format!");
+  }
+  case ARMII::MiscFrm:
+    if (MI.getOpcode() == ARM::LEApcrelJT) {
+      // Materialize jumptable address.
+      emitLEApcrelJTInstruction(MI);
+      break;
+    }
+    llvm_unreachable("Unhandled instruction encoding!");
+  case ARMII::Pseudo:
+    emitPseudoInstruction(MI);
+    break;
+  case ARMII::DPFrm:
+  case ARMII::DPSoRegFrm:
+    emitDataProcessingInstruction(MI);
+    break;
+  case ARMII::LdFrm:
+  case ARMII::StFrm:
+    emitLoadStoreInstruction(MI);
+    break;
+  case ARMII::LdMiscFrm:
+  case ARMII::StMiscFrm:
+    emitMiscLoadStoreInstruction(MI);
+    break;
+  case ARMII::LdStMulFrm:
+    emitLoadStoreMultipleInstruction(MI);
+    break;
+  case ARMII::MulFrm:
+    emitMulFrmInstruction(MI);
+    break;
+  case ARMII::ExtFrm:
+    emitExtendInstruction(MI);
+    break;
+  case ARMII::ArithMiscFrm:
+    emitMiscArithInstruction(MI);
+    break;
+  case ARMII::SatFrm:
+    emitSaturateInstruction(MI);
+    break;
+  case ARMII::BrFrm:
+    emitBranchInstruction(MI);
+    break;
+  case ARMII::BrMiscFrm:
+    emitMiscBranchInstruction(MI);
+    break;
+  // VFP instructions.
+  case ARMII::VFPUnaryFrm:
+  case ARMII::VFPBinaryFrm:
+    emitVFPArithInstruction(MI);
+    break;
+  case ARMII::VFPConv1Frm:
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+  case ARMII::VFPConv4Frm:
+  case ARMII::VFPConv5Frm:
+    emitVFPConversionInstruction(MI);
+    break;
+  case ARMII::VFPLdStFrm:
+    emitVFPLoadStoreInstruction(MI);
+    break;
+  case ARMII::VFPLdStMulFrm:
+    emitVFPLoadStoreMultipleInstruction(MI);
+    break;
+
+  // NEON instructions.
+  case ARMII::NGetLnFrm:
+  case ARMII::NSetLnFrm:
+    emitNEONLaneInstruction(MI);
+    break;
+  case ARMII::NDupFrm:
+    emitNEONDupInstruction(MI);
+    break;
+  case ARMII::N1RegModImmFrm:
+    emitNEON1RegModImmInstruction(MI);
+    break;
+  case ARMII::N2RegFrm:
+    emitNEON2RegInstruction(MI);
+    break;
+  case ARMII::N3RegFrm:
+    emitNEON3RegInstruction(MI);
+    break;
+  }
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
+  unsigned CPI = MI.getOperand(0).getImm();       // CP instruction index.
+  unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
+  const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
+
+  // Remember the CONSTPOOL_ENTRY address for later relocation.
+  JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
+
+  // Emit constpool island entry. In most cases, the actual values will be
+  // resolved and relocated after code emission.
+  if (MCPE.isMachineConstantPoolEntry()) {
+    ARMConstantPoolValue *ACPV =
+      static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+    DEBUG(errs() << "  ** ARM constant pool #" << CPI << " @ "
+          << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
+
+    assert(ACPV->isGlobalValue() && "unsupported constant pool value");
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
+    if (GV) {
+      Reloc::Model RelocM = TM.getRelocationModel();
+      emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
+                        isa<Function>(GV),
+                        Subtarget->GVIsIndirectSymbol(GV, RelocM),
+                        (intptr_t)ACPV);
+    } else  {
+      const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+      emitExternalSymbolAddress(Sym, ARM::reloc_arm_absolute);
+    }
+    emitWordLE(0);
+  } else {
+    const Constant *CV = MCPE.Val.ConstVal;
+
+    DEBUG({
+        errs() << "  ** Constant pool #" << CPI << " @ "
+               << (void*)MCE.getCurrentPCValue() << " ";
+        if (const Function *F = dyn_cast<Function>(CV))
+          errs() << F->getName();
+        else
+          errs() << *CV;
+        errs() << '\n';
+      });
+
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+      emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
+      emitWordLE(0);
+    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+      uint32_t Val = uint32_t(*CI->getValue().getRawData());
+      emitWordLE(Val);
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+      if (CFP->getType()->isFloatTy())
+        emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      else if (CFP->getType()->isDoubleTy())
+        emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      else {
+        llvm_unreachable("Unable to handle this constantpool entry!");
+      }
+    } else {
+      llvm_unreachable("Unable to handle this constantpool entry!");
+    }
+  }
+}
+
+void ARMCodeEmitter::emitMOVi32immInstruction(const MachineInstr &MI) {
+  const MachineOperand &MO0 = MI.getOperand(0);
+  const MachineOperand &MO1 = MI.getOperand(1);
+
+  // Emit the 'movw' instruction.
+  unsigned Binary = 0x30 << 20;  // mov: Insts{27-20} = 0b00110000
+
+  unsigned Lo16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movw) & 0xFFFF;
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode imm16 as imm4:imm12
+  Binary |= Lo16 & 0xFFF; // Insts{11-0} = imm12
+  Binary |= ((Lo16 >> 12) & 0xF) << 16; // Insts{19-16} = imm4
+  emitWordLE(Binary);
+
+  unsigned Hi16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movt) >> 16;
+  // Emit the 'movt' instruction.
+  Binary = 0x34 << 20; // movt: Insts{27-20} = 0b00110100
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode imm16 as imm4:imm1, same as movw above.
+  Binary |= Hi16 & 0xFFF;
+  Binary |= ((Hi16 >> 12) & 0xF) << 16;
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) {
+  const MachineOperand &MO0 = MI.getOperand(0);
+  const MachineOperand &MO1 = MI.getOperand(1);
+  assert(MO1.isImm() && ARM_AM::isSOImmTwoPartVal(MO1.getImm()) &&
+                                                  "Not a valid so_imm value!");
+  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
+  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
+
+  // Emit the 'mov' instruction.
+  unsigned Binary = 0xd << 21;  // mov: Insts{24-21} = 0b1101
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode so_imm.
+  // Set bit I(25) to identify this is the immediate form of <shifter_op>
+  Binary |= 1 << ARMII::I_BitShift;
+  Binary |= getMachineSoImmOpValue(V1);
+  emitWordLE(Binary);
+
+  // Now the 'orr' instruction.
+  Binary = 0xc << 21;  // orr: Insts{24-21} = 0b1100
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode Rn.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift;
+
+  // Encode so_imm.
+  // Set bit I(25) to identify this is the immediate form of <shifter_op>
+  Binary |= 1 << ARMII::I_BitShift;
+  Binary |= getMachineSoImmOpValue(V2);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
+  // It's basically add r, pc, (LJTI - $+8)
+
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Emit the 'add' instruction.
+  unsigned Binary = 0x4 << 21;  // add: Insts{24-21} = 0b0100
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, MCID);
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode Rn which is PC.
+  Binary |= II->getRegisterInfo().getEncodingValue(ARM::PC) << ARMII::RegRnShift;
+
+  // Encode the displacement.
+  Binary |= 1 << ARMII::I_BitShift;
+  emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) {
+  unsigned Opcode = MI.getDesc().Opcode;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag)
+    Binary |= 1 << ARMII::S_BitShift;
+
+  // Encode register def if there is one.
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode the shift operation.
+  switch (Opcode) {
+  default: break;
+  case ARM::RRX:
+    // rrx
+    Binary |= 0x6 << 4;
+    break;
+  case ARM::MOVsrl_flag:
+    // lsr #1
+    Binary |= (0x2 << 4) | (1 << 7);
+    break;
+  case ARM::MOVsra_flag:
+    // asr #1
+    Binary |= (0x4 << 4) | (1 << 7);
+    break;
+  }
+
+  // Encode register Rm.
+  Binary |= getMachineOpValue(MI, 1);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::addPCLabel(unsigned LabelID) {
+  DEBUG(errs() << "  ** LPC" << LabelID << " @ "
+        << (void*)MCE.getCurrentPCValue() << '\n');
+  JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
+}
+
+void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
+  unsigned Opcode = MI.getDesc().Opcode;
+  switch (Opcode) {
+  default:
+    llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
+  case ARM::BX_CALL:
+  case ARM::BMOVPCRX_CALL: {
+    // First emit mov lr, pc
+    unsigned Binary = 0x01a0e00f;
+    Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+    emitWordLE(Binary);
+
+    // and then emit the branch.
+    emitMiscBranchInstruction(MI);
+    break;
+  }
+  case TargetOpcode::INLINEASM: {
+    // We allow inline assembler nodes with empty bodies - they can
+    // implicitly define registers, which is ok for JIT.
+    if (MI.getOperand(0).getSymbolName()[0]) {
+      report_fatal_error("JIT does not support inline asm!");
+    }
+    break;
+  }
+  case TargetOpcode::CFI_INSTRUCTION:
+    break;
+  case TargetOpcode::EH_LABEL:
+    MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+    break;
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    // Do nothing.
+    break;
+  case ARM::CONSTPOOL_ENTRY:
+    emitConstPoolInstruction(MI);
+    break;
+  case ARM::PICADD: {
+    // Remember of the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // PICADD is just an add instruction that implicitly read pc.
+    emitDataProcessingInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICSTR:
+  case ARM::PICSTRB: {
+    // Remember of the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitLoadStoreInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDRH:
+  case ARM::PICLDRSH:
+  case ARM::PICLDRSB:
+  case ARM::PICSTRH: {
+    // Remember of the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitMiscLoadStoreInstruction(MI, ARM::PC);
+    break;
+  }
+
+  case ARM::MOVi32imm:
+    // Two instructions to materialize a constant.
+    if (Subtarget->hasV6T2Ops())
+      emitMOVi32immInstruction(MI);
+    else
+      emitMOVi2piecesInstruction(MI);
+    break;
+
+  case ARM::LEApcrelJT:
+    // Materialize jumptable address.
+    emitLEApcrelJTInstruction(MI);
+    break;
+  case ARM::RRX:
+  case ARM::MOVsrl_flag:
+  case ARM::MOVsra_flag:
+    emitPseudoMoveInstruction(MI);
+    break;
+  }
+}
+
+unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI,
+                                                const MCInstrDesc &MCID,
+                                                const MachineOperand &MO,
+                                                unsigned OpIdx) {
+  unsigned Binary = getMachineOpValue(MI, MO);
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    // RRX - 0110 and bit[11:8] clear.
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    case ARM_AM::rrx: SBits = 0x6; break;
+    }
+  } else {
+    // Set shift operand (bit[6:4]).
+    // LSL - 000
+    // LSR - 010
+    // ASR - 100
+    // ROR - 110
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x0; break;
+    case ARM_AM::lsr: SBits = 0x2; break;
+    case ARM_AM::asr: SBits = 0x4; break;
+    case ARM_AM::ror: SBits = 0x6; break;
+    }
+  }
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode the shift operation Rs or shift_imm (except rrx).
+  if (Rs) {
+    // Encode Rs bit[11:8].
+    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+    return Binary | (II->getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
+  }
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+}
+
+unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
+  int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  // Encode rotate_imm.
+  unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+    << ARMII::SoRotImmShift;
+
+  // Encode immed_8.
+  Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI,
+                                         const MCInstrDesc &MCID) const {
+  for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e;--i){
+    const MachineOperand &MO = MI.getOperand(i-1);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
+      return 1 << ARMII::S_BitShift;
+  }
+  return 0;
+}
+
+void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
+                                                   unsigned ImplicitRd,
+                                                   unsigned ImplicitRn) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, MCID);
+
+  // Encode register def if there is one.
+  unsigned NumDefs = MCID.getNumDefs();
+  unsigned OpIdx = 0;
+  if (NumDefs)
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+  else if (ImplicitRd)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
+
+  if (MCID.Opcode == ARM::MOVi16) {
+      // Get immediate from MI.
+      unsigned Lo16 = getMovi32Value(MI, MI.getOperand(OpIdx),
+                      ARM::reloc_arm_movw);
+      // Encode imm which is the same as in emitMOVi32immInstruction().
+      Binary |= Lo16 & 0xFFF;
+      Binary |= ((Lo16 >> 12) & 0xF) << 16;
+      emitWordLE(Binary);
+      return;
+  } else if(MCID.Opcode == ARM::MOVTi16) {
+      unsigned Hi16 = (getMovi32Value(MI, MI.getOperand(OpIdx),
+                       ARM::reloc_arm_movt) >> 16);
+      Binary |= Hi16 & 0xFFF;
+      Binary |= ((Hi16 >> 12) & 0xF) << 16;
+      emitWordLE(Binary);
+      return;
+  } else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) {
+      uint32_t v = ~MI.getOperand(2).getImm();
+      int32_t lsb = countTrailingZeros(v);
+      int32_t msb = (32 - countLeadingZeros(v)) - 1;
+      // Instr{20-16} = msb, Instr{11-7} = lsb
+      Binary |= (msb & 0x1F) << 16;
+      Binary |= (lsb & 0x1F) << 7;
+      emitWordLE(Binary);
+      return;
+  } else if ((MCID.Opcode == ARM::UBFX) || (MCID.Opcode == ARM::SBFX)) {
+      // Encode Rn in Instr{0-3}
+      Binary |= getMachineOpValue(MI, OpIdx++);
+
+      uint32_t lsb = MI.getOperand(OpIdx++).getImm();
+      uint32_t widthm1 = MI.getOperand(OpIdx++).getImm() - 1;
+
+      // Instr{20-16} = widthm1, Instr{11-7} = lsb
+      Binary |= (widthm1 & 0x1F) << 16;
+      Binary |= (lsb & 0x1F) << 7;
+      emitWordLE(Binary);
+      return;
+  }
+
+  // If this is a two-address operand, skip it. e.g. MOVCCr operand 1.
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // Encode first non-shifter register operand if there is one.
+  bool isUnary = MCID.TSFlags & ARMII::UnaryDP;
+  if (!isUnary) {
+    if (ImplicitRn)
+      // Special handling for implicit use (e.g. PC).
+      Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
+    else {
+      Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
+      ++OpIdx;
+    }
+  }
+
+  // Encode shifter operand.
+  const MachineOperand &MO = MI.getOperand(OpIdx);
+  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) {
+    // Encode SoReg.
+    emitWordLE(Binary | getMachineSoRegOpValue(MI, MCID, MO, OpIdx));
+    return;
+  }
+
+  if (MO.isReg()) {
+    // Encode register Rm.
+    emitWordLE(Binary | II->getRegisterInfo().getEncodingValue(MO.getReg()));
+    return;
+  }
+
+  // Encode so_imm.
+  Binary |= getMachineSoImmOpValue((unsigned)MO.getImm());
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
+                                              unsigned ImplicitRd,
+                                              unsigned ImplicitRn) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Form = MCID.TSFlags & ARMII::FormMask;
+  bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done.
+  if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp ||
+      MI.getOpcode() == ARM::STRi12) {
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Operand 0 of a pre- and post-indexed store is the address base
+  // writeback. Skip it.
+  bool Skipped = false;
+  if (IsPrePost && Form == ARMII::StFrm) {
+    ++OpIdx;
+    Skipped = true;
+  }
+
+  // Set first operand
+  if (ImplicitRd)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // Set second operand
+  if (ImplicitRn)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it. e.g. LDR_PRE.
+  if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  unsigned AM2Opc = (ImplicitRn == ARM::PC)
+    ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+  // Set bit U(23) according to sign of immed value (positive or negative).
+  Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) <<
+             ARMII::U_BitShift);
+  if (!MO2.getReg()) { // is immediate
+    if (ARM_AM::getAM2Offset(AM2Opc))
+      // Set the value of offset_12 field
+      Binary |= ARM_AM::getAM2Offset(AM2Opc);
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Set bit I(25), because this is not in immediate encoding.
+  Binary |= 1 << ARMII::I_BitShift;
+  assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
+  // Set bit[3:0] to the corresponding Rm register
+  Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
+
+  // If this instr is in scaled register offset/index instruction, set
+  // shift_immed(bit[11:7]) and shift(bit[6:5]) fields.
+  if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) {
+    Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift;  // shift
+    Binary |= ShImm              << ARMII::ShiftShift;     // shift_immed
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
+                                                  unsigned ImplicitRn) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Form = MCID.TSFlags & ARMII::FormMask;
+  bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Operand 0 of a pre- and post-indexed store is the address base
+  // writeback. Skip it.
+  bool Skipped = false;
+  if (IsPrePost && Form == ARMII::StMiscFrm) {
+    ++OpIdx;
+    Skipped = true;
+  }
+
+  // Set first operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // Skip LDRD and STRD's second operand.
+  if (MCID.Opcode == ARM::LDRD || MCID.Opcode == ARM::STRD)
+    ++OpIdx;
+
+  // Set second operand
+  if (ImplicitRn)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it. e.g. LDRH_POST.
+  if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  unsigned AM3Opc = (ImplicitRn == ARM::PC)
+    ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+  // Set bit U(23) according to sign of immed value (positive or negative)
+  Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) <<
+             ARMII::U_BitShift);
+
+  // If this instr is in register offset/index encoding, set bit[3:0]
+  // to the corresponding Rm register.
+  if (MO2.getReg()) {
+    Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
+    emitWordLE(Binary);
+    return;
+  }
+
+  // This instr is in immediate offset/index encoding, set bit 22 to 1.
+  Binary |= 1 << ARMII::AM3_I_BitShift;
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) {
+    // Set operands
+    Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift;  // immedH
+    Binary |= (ImmOffs & 0xF);                      // immedL
+  }
+
+  emitWordLE(Binary);
+}
+
+static unsigned getAddrModeUPBits(unsigned Mode) {
+  unsigned Binary = 0;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  // IA - Increment after  - bit U = 1 and bit P = 0
+  // IB - Increment before - bit U = 1 and bit P = 1
+  // DA - Decrement after  - bit U = 0 and bit P = 0
+  // DB - Decrement before - bit U = 0 and bit P = 1
+  switch (Mode) {
+  default: llvm_unreachable("Unknown addressing sub-mode!");
+  case ARM_AM::da:                                     break;
+  case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break;
+  case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break;
+  case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break;
+  }
+
+  return Binary;
+}
+
+void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Skip operand 0 of an instruction with base register update.
+  unsigned OpIdx = 0;
+  if (IsUpdating)
+    ++OpIdx;
+
+  // Set base address operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
+
+  // Set bit W(21)
+  if (IsUpdating)
+    Binary |= 0x1 << ARMII::W_BitShift;
+
+  // Set registers
+  for (unsigned i = OpIdx+2, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      break;
+    unsigned RegNum = II->getRegisterInfo().getEncodingValue(MO.getReg());
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           RegNum < 16);
+    Binary |= 0x1 << RegNum;
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMulFrmInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, MCID);
+
+  // 32x32->64bit operations have two destination registers. The number
+  // of register definitions will tell us if that's what we're dealing with.
+  unsigned OpIdx = 0;
+  if (MCID.getNumDefs() == 2)
+    Binary |= getMachineOpValue (MI, OpIdx++) << ARMII::RegRdLoShift;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift;
+
+  // Encode Rm
+  Binary |= getMachineOpValue(MI, OpIdx++);
+
+  // Encode Rs
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift;
+
+  // Many multiple instructions (e.g. MLA) have three src operands. Encode
+  // it as Rn (for multiply, that's in the same offset as RdLo.
+  if (MCID.getNumOperands() > OpIdx &&
+      !MCID.OpInfo[OpIdx].isPredicate() &&
+      !MCID.OpInfo[OpIdx].isOptionalDef())
+    Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitExtendInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx++);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  if (MO2.isReg()) {
+    // Two register operand form.
+    // Encode Rn.
+    Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift;
+
+    // Encode Rm.
+    Binary |= getMachineOpValue(MI, MO2);
+    ++OpIdx;
+  } else {
+    Binary |= getMachineOpValue(MI, MO1);
+  }
+
+  // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand.
+  if (MI.getOperand(OpIdx).isImm() &&
+      !MCID.OpInfo[OpIdx].isPredicate() &&
+      !MCID.OpInfo[OpIdx].isOptionalDef())
+    Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // PKH instructions are finished at this point
+  if (MCID.Opcode == ARM::PKHBT || MCID.Opcode == ARM::PKHTB) {
+    emitWordLE(Binary);
+    return;
+  }
+
+  unsigned OpIdx = 0;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  const MachineOperand &MO = MI.getOperand(OpIdx++);
+  if (OpIdx == MCID.getNumOperands() ||
+      MCID.OpInfo[OpIdx].isPredicate() ||
+      MCID.OpInfo[OpIdx].isOptionalDef()) {
+    // Encode Rm and it's done.
+    Binary |= getMachineOpValue(MI, MO);
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Encode Rn.
+  Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift;
+
+  // Encode Rm.
+  Binary |= getMachineOpValue(MI, OpIdx++);
+
+  // Encode shift_imm.
+  unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
+  if (MCID.Opcode == ARM::PKHTB) {
+    assert(ShiftAmt != 0 && "PKHTB shift_imm is 0!");
+    if (ShiftAmt == 32)
+      ShiftAmt = 0;
+  }
+  assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
+  Binary |= ShiftAmt << ARMII::ShiftShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitSaturateInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode saturate bit position.
+  unsigned Pos = MI.getOperand(1).getImm();
+  if (MCID.Opcode == ARM::SSAT || MCID.Opcode == ARM::SSAT16)
+    Pos -= 1;
+  assert((Pos < 16 || (Pos < 32 &&
+                       MCID.Opcode != ARM::SSAT16 &&
+                       MCID.Opcode != ARM::USAT16)) &&
+         "saturate bit position out of range");
+  Binary |= Pos << 16;
+
+  // Encode Rm
+  Binary |= getMachineOpValue(MI, 2);
+
+  // Encode shift_imm.
+  if (MCID.getNumOperands() == 4) {
+    unsigned ShiftOp = MI.getOperand(3).getImm();
+    ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp);
+    if (Opc == ARM_AM::asr)
+      Binary |= (1 << 6);
+    unsigned ShiftAmt = MI.getOperand(3).getImm();
+    if (ShiftAmt == 32 && Opc == ARM_AM::asr)
+      ShiftAmt = 0;
+    assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
+    Binary |= ShiftAmt << ARMII::ShiftShift;
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  if (MCID.Opcode == ARM::TPsoft) {
+    llvm_unreachable("ARM::TPsoft FIXME"); // FIXME
+  }
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Set signed_immed_24 field
+  Binary |= getMachineOpValue(MI, 0);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitInlineJumpTable(unsigned JTIndex) {
+  // Remember the base address of the inline jump table.
+  uintptr_t JTBase = MCE.getCurrentPCValue();
+  JTI->addJumpTableBaseAddr(JTIndex, JTBase);
+  DEBUG(errs() << "  ** Jump Table #" << JTIndex << " @ " << (void*)JTBase
+               << '\n');
+
+  // Now emit the jump table entries.
+  const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
+  for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
+    if (IsPIC)
+      // DestBB address - JT base.
+      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase);
+    else
+      // Absolute DestBB address.
+      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute);
+    emitWordLE(0);
+  }
+}
+
+void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Handle jump tables.
+  if (MCID.Opcode == ARM::BR_JTr || MCID.Opcode == ARM::BR_JTadd) {
+    // First emit a ldr pc, [] instruction.
+    emitDataProcessingInstruction(MI, ARM::PC);
+
+    // Then emit the inline jump table.
+    unsigned JTIndex =
+      (MCID.Opcode == ARM::BR_JTr)
+      ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
+    emitInlineJumpTable(JTIndex);
+    return;
+  } else if (MCID.Opcode == ARM::BR_JTm) {
+    // First emit a ldr pc, [] instruction.
+    emitLoadStoreInstruction(MI, ARM::PC);
+
+    // Then emit the inline jump table.
+    emitInlineJumpTable(MI.getOperand(3).getIndex());
+    return;
+  }
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  if (MCID.Opcode == ARM::BX_RET || MCID.Opcode == ARM::MOVPCLR)
+    // The return register is LR.
+    Binary |= II->getRegisterInfo().getEncodingValue(ARM::LR);
+  else
+    // otherwise, set the return register
+    Binary |= getMachineOpValue(MI, 0);
+
+  emitWordLE(Binary);
+}
+
+unsigned ARMCodeEmitter::encodeVFPRd(const MachineInstr &MI,
+                                     unsigned OpIdx) const {
+  unsigned RegD = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  bool isSPVFP = ARM::SPRRegClass.contains(RegD);
+  RegD = II->getRegisterInfo().getEncodingValue(RegD);
+  if (!isSPVFP)
+    Binary |=   RegD               << ARMII::RegRdShift;
+  else {
+    Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift;
+    Binary |=  (RegD & 0x01)       << ARMII::D_BitShift;
+  }
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeVFPRn(const MachineInstr &MI,
+                                     unsigned OpIdx) const {
+  unsigned RegN = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  bool isSPVFP = ARM::SPRRegClass.contains(RegN);
+  RegN = II->getRegisterInfo().getEncodingValue(RegN);
+  if (!isSPVFP)
+    Binary |=   RegN               << ARMII::RegRnShift;
+  else {
+    Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift;
+    Binary |=  (RegN & 0x01)       << ARMII::N_BitShift;
+  }
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeVFPRm(const MachineInstr &MI,
+                                     unsigned OpIdx) const {
+  unsigned RegM = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  bool isSPVFP = ARM::SPRRegClass.contains(RegM);
+  RegM = II->getRegisterInfo().getEncodingValue(RegM);
+  if (!isSPVFP)
+    Binary |=   RegM;
+  else {
+    Binary |= ((RegM & 0x1E) >> 1);
+    Binary |=  (RegM & 0x01)       << ARMII::M_BitShift;
+  }
+  return Binary;
+}
+
+void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+  assert((Binary & ARMII::D_BitShift) == 0 &&
+         (Binary & ARMII::N_BitShift) == 0 &&
+         (Binary & ARMII::M_BitShift) == 0 && "VFP encoding bug!");
+
+  // Encode Dd / Sd.
+  Binary |= encodeVFPRd(MI, OpIdx++);
+
+  // If this is a two-address operand, skip it, e.g. FMACD.
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // Encode Dn / Sn.
+  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm)
+    Binary |= encodeVFPRn(MI, OpIdx++);
+
+  if (OpIdx == MCID.getNumOperands() ||
+      MCID.OpInfo[OpIdx].isPredicate() ||
+      MCID.OpInfo[OpIdx].isOptionalDef()) {
+    // FCMPEZD etc. has only one operand.
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Encode Dm / Sm.
+  Binary |= encodeVFPRm(MI, OpIdx);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPConversionInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Form = MCID.TSFlags & ARMII::FormMask;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  switch (Form) {
+  default: break;
+  case ARMII::VFPConv1Frm:
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+    // Encode Dd / Sd.
+    Binary |= encodeVFPRd(MI, 0);
+    break;
+  case ARMII::VFPConv4Frm:
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 0);
+    break;
+  case ARMII::VFPConv5Frm:
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 0);
+    break;
+  }
+
+  switch (Form) {
+  default: break;
+  case ARMII::VFPConv1Frm:
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 1);
+    break;
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 1);
+    break;
+  case ARMII::VFPConv4Frm:
+  case ARMII::VFPConv5Frm:
+    // Encode Dd / Sd.
+    Binary |= encodeVFPRd(MI, 1);
+    break;
+  }
+
+  if (Form == ARMII::VFPConv5Frm)
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 2);
+  else if (Form == ARMII::VFPConv3Frm)
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 2);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) {
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Dd / Sd.
+  Binary |= encodeVFPRd(MI, OpIdx++);
+
+  // Encode address base.
+  const MachineOperand &Base = MI.getOperand(OpIdx++);
+  Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift;
+
+  // If there is a non-zero immediate offset, encode it.
+  if (Base.isReg()) {
+    const MachineOperand &Offset = MI.getOperand(OpIdx);
+    if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) {
+      if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add)
+        Binary |= 1 << ARMII::U_BitShift;
+      Binary |= ImmOffs;
+      emitWordLE(Binary);
+      return;
+    }
+  }
+
+  // If immediate offset is omitted, default to +0.
+  Binary |= 1 << ARMII::U_BitShift;
+
+  emitWordLE(Binary);
+}
+
+void
+ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Skip operand 0 of an instruction with base register update.
+  unsigned OpIdx = 0;
+  if (IsUpdating)
+    ++OpIdx;
+
+  // Set base address operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
+
+  // Set bit W(21)
+  if (IsUpdating)
+    Binary |= 0x1 << ARMII::W_BitShift;
+
+  // First register is encoded in Dd.
+  Binary |= encodeVFPRd(MI, OpIdx+2);
+
+  // Count the number of registers.
+  unsigned NumRegs = 1;
+  for (unsigned i = OpIdx+3, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      break;
+    ++NumRegs;
+  }
+  // Bit 8 will be set if <list> is consecutive 64-bit registers (e.g., D0)
+  // Otherwise, it will be 0, in the case of 32-bit registers.
+  if(Binary & 0x100)
+    Binary |= NumRegs * 2;
+  else
+    Binary |= NumRegs;
+
+  emitWordLE(Binary);
+}
+
+unsigned ARMCodeEmitter::encodeNEONRd(const MachineInstr &MI,
+                                      unsigned OpIdx) const {
+  unsigned RegD = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  RegD = II->getRegisterInfo().getEncodingValue(RegD);
+  Binary |= (RegD & 0xf) << ARMII::RegRdShift;
+  Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift;
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeNEONRn(const MachineInstr &MI,
+                                      unsigned OpIdx) const {
+  unsigned RegN = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  RegN = II->getRegisterInfo().getEncodingValue(RegN);
+  Binary |= (RegN & 0xf) << ARMII::RegRnShift;
+  Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift;
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeNEONRm(const MachineInstr &MI,
+                                      unsigned OpIdx) const {
+  unsigned RegM = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  RegM = II->getRegisterInfo().getEncodingValue(RegM);
+  Binary |= (RegM & 0xf);
+  Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift;
+  return Binary;
+}
+
+/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON
+/// data-processing instruction to the corresponding Thumb encoding.
+static unsigned convertNEONDataProcToThumb(unsigned Binary) {
+  assert((Binary & 0xfe000000) == 0xf2000000 &&
+         "not an ARM NEON data-processing instruction");
+  unsigned UBit = (Binary >> 24) & 1;
+  return 0xef000000 | (UBit << 28) | (Binary & 0xffffff);
+}
+
+void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) {
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  unsigned RegTOpIdx, RegNOpIdx, LnOpIdx;
+  const MCInstrDesc &MCID = MI.getDesc();
+  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) {
+    RegTOpIdx = 0;
+    RegNOpIdx = 1;
+    LnOpIdx = 2;
+  } else { // ARMII::NSetLnFrm
+    RegTOpIdx = 2;
+    RegNOpIdx = 0;
+    LnOpIdx = 3;
+  }
+
+  // Set the conditional execution predicate
+  Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
+
+  unsigned RegT = MI.getOperand(RegTOpIdx).getReg();
+  RegT = II->getRegisterInfo().getEncodingValue(RegT);
+  Binary |= (RegT << ARMII::RegRdShift);
+  Binary |= encodeNEONRn(MI, RegNOpIdx);
+
+  unsigned LaneShift;
+  if ((Binary & (1 << 22)) != 0)
+    LaneShift = 0; // 8-bit elements
+  else if ((Binary & (1 << 5)) != 0)
+    LaneShift = 1; // 16-bit elements
+  else
+    LaneShift = 2; // 32-bit elements
+
+  unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift;
+  unsigned Opc1 = Lane >> 2;
+  unsigned Opc2 = Lane & 3;
+  assert((Opc1 & 3) == 0 && "out-of-range lane number operand");
+  Binary |= (Opc1 << 21);
+  Binary |= (Opc2 << 5);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) {
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
+
+  unsigned RegT = MI.getOperand(1).getReg();
+  RegT = II->getRegisterInfo().getEncodingValue(RegT);
+  Binary |= (RegT << ARMII::RegRdShift);
+  Binary |= encodeNEONRn(MI, 0);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) {
+  unsigned Binary = getBinaryCodeForInstr(MI);
+  // Destination register is encoded in Dd.
+  Binary |= encodeNEONRd(MI, 0);
+  // Immediate fields: Op, Cmode, I, Imm3, Imm4
+  unsigned Imm = MI.getOperand(1).getImm();
+  unsigned Op = (Imm >> 12) & 1;
+  unsigned Cmode = (Imm >> 8) & 0xf;
+  unsigned I = (Imm >> 7) & 1;
+  unsigned Imm3 = (Imm >> 4) & 0x7;
+  unsigned Imm4 = Imm & 0xf;
+  Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4;
+  if (IsThumb)
+    Binary = convertNEONDataProcToThumb(Binary);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Binary = getBinaryCodeForInstr(MI);
+  // Destination register is encoded in Dd; source register in Dm.
+  unsigned OpIdx = 0;
+  Binary |= encodeNEONRd(MI, OpIdx++);
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+  Binary |= encodeNEONRm(MI, OpIdx);
+  if (IsThumb)
+    Binary = convertNEONDataProcToThumb(Binary);
+  // FIXME: This does not handle VDUPfdf or VDUPfqf.
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Binary = getBinaryCodeForInstr(MI);
+  // Destination register is encoded in Dd; source registers in Dn and Dm.
+  unsigned OpIdx = 0;
+  Binary |= encodeNEONRd(MI, OpIdx++);
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+  Binary |= encodeNEONRn(MI, OpIdx++);
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+  Binary |= encodeNEONRm(MI, OpIdx);
+  if (IsThumb)
+    Binary = convertNEONDataProcToThumb(Binary);
+  // FIXME: This does not handle VMOVDneon or VMOVQ.
+  emitWordLE(Binary);
+}
+
+#include "ARMGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 0000000..ce264ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,2062 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered through-out the function.  This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-cp-islands"
+
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+STATISTIC(NumTBs,        "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
+STATISTIC(NumCBZ,        "Number of CBZ / CBNZ formed");
+STATISTIC(NumJTMoved,    "Number of jump table destination blocks moved");
+STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+
+
+static cl::opt<bool>
+AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
+          cl::desc("Adjust basic block layout to better use TB[BH]"));
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("arm-align-constant-islands", cl::Hidden, cl::init(true),
+          cl::desc("Align constant islands in code"));
+
+/// UnknownPadding - Return the worst case padding that could result from
+/// unknown offset bits.  This does not include alignment padding caused by
+/// known offset bits.
+///
+/// @param LogAlign log2(alignment)
+/// @param KnownBits Number of known low offset bits.
+static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
+  if (KnownBits < LogAlign)
+    return (1u << LogAlign) - (1u << KnownBits);
+  return 0;
+}
+
+namespace {
+  /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class ARMConstantIslands : public MachineFunctionPass {
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      /// KnownBits - The number of low bits in Offset that are known to be
+      /// exact.  The remaining bits of Offset are an upper bound.
+      uint8_t KnownBits;
+
+      /// Unalign - When non-zero, the block contains instructions (inline asm)
+      /// of unknown size.  The real size may be smaller than Size bytes by a
+      /// multiple of 1 << Unalign.
+      uint8_t Unalign;
+
+      /// PostAlign - When non-zero, the block terminator contains a .align
+      /// directive, so the end of the block is aligned to 1 << PostAlign
+      /// bytes.
+      uint8_t PostAlign;
+
+      BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
+        PostAlign(0) {}
+
+      /// Compute the number of known offset bits internally to this block.
+      /// This number should be used to predict worst case padding when
+      /// splitting the block.
+      unsigned internalKnownBits() const {
+        unsigned Bits = Unalign ? Unalign : KnownBits;
+        // If the block size isn't a multiple of the known bits, assume the
+        // worst case padding.
+        if (Size & ((1u << Bits) - 1))
+          Bits = countTrailingZeros(Size);
+        return Bits;
+      }
+
+      /// Compute the offset immediately following this block.  If LogAlign is
+      /// specified, return the offset the successor block will get if it has
+      /// this alignment.
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        unsigned LA = std::max(unsigned(PostAlign), LogAlign);
+        if (!LA)
+          return PO;
+        // Add alignment padding from the terminator.
+        return PO + UnknownPadding(LA, internalKnownBits());
+      }
+
+      /// Compute the number of known low bits of postOffset.  If this block
+      /// contains inline asm, the number of known bits drops to the
+      /// instruction alignment.  An aligned terminator may increase the number
+      /// of know bits.
+      /// If LogAlign is given, also consider the alignment of the next block.
+      unsigned postKnownBits(unsigned LogAlign = 0) const {
+        return std::max(std::max(unsigned(PostAlign), LogAlign),
+                        internalKnownBits());
+      }
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+    public:
+      bool NegOk;
+      bool IsSoImm;
+      bool KnownAlignment;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg, bool soimm)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm),
+          KnownAlignment(false) {
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      /// Correct for unknown alignment.
+      /// Conservatively subtract 2 bytes to handle weird alignment effects.
+      unsigned getMaxDisp() const {
+        return (KnownAlignment ? MaxDisp : MaxDisp - 2) - 2;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+    /// CPEntry - One per constant pool entry, keeping the machine instruction
+    /// pointer, the constpool index, and the number of CPUser's which
+    /// reference this entry.
+    struct CPEntry {
+      MachineInstr *CPEMI;
+      unsigned CPI;
+      unsigned RefCount;
+      CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+        : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+    };
+
+    /// CPEntries - Keep track of all of the constant pool entry machine
+    /// instructions. For each original constpool index (i.e. those that
+    /// existed upon entry to this pass), it keeps a vector of entries.
+    /// Original elements are cloned as we go along; the clones are
+    /// put in the vector of the original element, but have distinct CPIs.
+    std::vector<std::vector<CPEntry> > CPEntries;
+
+    /// ImmBranch - One per immediate branch, keeping the machine instruction
+    /// pointer, conditional or unconditional, the max displacement,
+    /// and (if isCond is true) the corresponding unconditional branch
+    /// opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned MaxDisp : 31;
+      bool isCond : 1;
+      int UncondBr;
+      ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+        : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+    };
+
+    /// ImmBranches - Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+    ///
+    SmallVector<MachineInstr*, 4> PushPopMIs;
+
+    /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+    SmallVector<MachineInstr*, 4> T2JumpTables;
+
+    /// HasFarJump - True if any far jump instruction has been emitted during
+    /// the branch fix up pass.
+    bool HasFarJump;
+
+    MachineFunction *MF;
+    MachineConstantPool *MCP;
+    const ARMBaseInstrInfo *TII;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+    bool isThumb;
+    bool isThumb1;
+    bool isThumb2;
+  public:
+    static char ID;
+    ARMConstantIslands() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "ARM constant island placement and branch shortening pass";
+    }
+
+  private:
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr *CPEMI);
+    void scanFunctionJumpTables();
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+    bool undoLRSpillRestore();
+    bool mayOptimizeThumb2Instruction(const MachineInstr *MI) const;
+    bool optimizeThumb2Instructions();
+    bool optimizeThumb2Branches();
+    bool reorderThumb2JumpTables();
+    bool optimizeThumb2JumpTables();
+    MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB,
+                                                  MachineBasicBlock *JTBB);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+    void verify();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK, bool IsSoImm = false);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U) {
+      return isOffsetInRange(UserOffset, TrialOffset,
+                             U.getMaxDisp(), U.NegOk, U.IsSoImm);
+    }
+  };
+  char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify() {
+#ifndef NDEBUG
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    unsigned MBBId = MBB->getNumber();
+    assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
+  }
+  DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned UserOffset = getUserOffset(U);
+    // Verify offset using the real max displacement without the safety
+    // adjustment.
+    if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk,
+                         /* DoDump = */ true)) {
+      DEBUG(dbgs() << "OK\n");
+      continue;
+    }
+    DEBUG(dbgs() << "Out of range.\n");
+    dumpBBs();
+    DEBUG(MF->dump());
+    llvm_unreachable("Constant pool entry out of range!");
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << " kb=" << unsigned(BBI.KnownBits)
+             << " ua=" << unsigned(BBI.Unalign)
+             << " pa=" << unsigned(BBI.PostAlign)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  MCP = mf.getConstantPool();
+
+  DEBUG(dbgs() << "***** ARMConstantIslands: "
+               << MCP->getConstants().size() << " CP entries, aligned to "
+               << MCP->getConstantPoolAlignment() << " bytes *****\n");
+
+  TII = (const ARMBaseInstrInfo*)MF->getTarget().getInstrInfo();
+  AFI = MF->getInfo<ARMFunctionInfo>();
+  STI = &MF->getTarget().getSubtarget<ARMSubtarget>();
+
+  isThumb = AFI->isThumbFunction();
+  isThumb1 = AFI->isThumb1OnlyFunction();
+  isThumb2 = AFI->isThumb2Function();
+
+  HasFarJump = false;
+
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  // Try to reorder and otherwise adjust the block layout to make good use
+  // of the TB[BH] instructions.
+  bool MadeChange = false;
+  if (isThumb2 && AdjustJumpTableBlocks) {
+    scanFunctionJumpTables();
+    MadeChange |= reorderThumb2JumpTables();
+    // Data is out of date, so clear it. It'll be re-computed later.
+    T2JumpTables.clear();
+    // Blocks may have shifted around. Keep the numbering up to date.
+    MF->RenumberBlocks();
+  }
+
+  // Thumb1 functions containing constant pools get 4-byte alignment.
+  // This is so we can keep exact track of where the alignment padding goes.
+
+  // ARM and Thumb2 functions need to be 4-byte aligned.
+  if (!isThumb1)
+    MF->ensureAlignment(2);  // 2 = log2(4)
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  AFI->initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  // Shrink 32-bit Thumb2 branch, load, and store instructions.
+  if (isThumb2 && !STI->prefers32BitThumb())
+    MadeChange |= optimizeThumb2Instructions();
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify();
+
+  // If LR has been forced spilled and no far jump (i.e. BL) has been issued,
+  // undo the spill / restore of LR if possible.
+  if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
+    MadeChange |= undoLRSpillRestore();
+
+  // Save the mapping between original and cloned constpool entries.
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
+      const CPEntry & CPE = CPEntries[i][j];
+      AFI->recordCPEClone(i, CPE.CPI);
+    }
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  PushPopMIs.clear();
+  T2JumpTables.clear();
+
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void
+ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  if (std::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = std::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return nullptr;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  assert(CPEMI && CPEMI->getOpcode() == ARM::CONSTPOOL_ENTRY);
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  if (!AlignConstantIslands)
+    return 2;
+
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// scanFunctionJumpTables - Do a scan of the function, building up
+/// information about the sizes of each block and the locations of all
+/// the jump tables.
+void ARMConstantIslands::scanFunctionJumpTables() {
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I)
+      if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT)
+        T2JumpTables.push_back(I);
+  }
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void ARMConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(I);
+
+  // The known bits of the entry block offset are determined by the function
+  // alignment.
+  BBInfo.front().KnownBits = MF->getAlignment();
+
+  // Compute block offsets and known bits.
+  adjustBBOffsetsAfter(MF->begin());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (I->isDebugValue())
+        continue;
+
+      int Opc = I->getOpcode();
+      if (I->isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other JT branches
+        case ARM::t2BR_JT:
+          T2JumpTables.push_back(I);
+          continue;   // Does not get an entry in ImmBranches
+        case ARM::Bcc:
+          isCond = true;
+          UOpc = ARM::B;
+          // Fallthrough
+        case ARM::B:
+          Bits = 24;
+          Scale = 4;
+          break;
+        case ARM::tBcc:
+          isCond = true;
+          UOpc = ARM::tB;
+          Bits = 8;
+          Scale = 2;
+          break;
+        case ARM::tB:
+          Bits = 11;
+          Scale = 2;
+          break;
+        case ARM::t2Bcc:
+          isCond = true;
+          UOpc = ARM::t2B;
+          Bits = 20;
+          Scale = 2;
+          break;
+        case ARM::t2B:
+          Bits = 24;
+          Scale = 2;
+          break;
+        }
+
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+        PushPopMIs.push_back(I);
+
+      if (Opc == ARM::CONSTPOOL_ENTRY)
+        continue;
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isCPI()) {
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          bool IsSoImm = false;
+
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+
+          // Taking the address of a CP entry.
+          case ARM::LEApcrel:
+            // This takes a SoImm, which is 8 bit immediate rotated. We'll
+            // pretend the maximum offset is 255 * 4. Since each instruction
+            // 4 byte wide, this is always correct. We'll check for other
+            // displacements that fits in a SoImm as well.
+            Bits = 8;
+            Scale = 4;
+            NegOk = true;
+            IsSoImm = true;
+            break;
+          case ARM::t2LEApcrel:
+            Bits = 12;
+            NegOk = true;
+            break;
+          case ARM::tLEApcrel:
+            Bits = 8;
+            Scale = 4;
+            break;
+
+          case ARM::LDRBi12:
+          case ARM::LDRi12:
+          case ARM::LDRcp:
+          case ARM::t2LDRpci:
+            Bits = 12;  // +-offset_12
+            NegOk = true;
+            break;
+
+          case ARM::tLDRpci:
+            Bits = 8;
+            Scale = 4;  // +(offset_8*4)
+            break;
+
+          case ARM::VLDRD:
+          case ARM::VLDRS:
+            Bits = 8;
+            Scale = 4;  // +-(offset_8*4)
+            NegOk = true;
+            break;
+          }
+
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+  }
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+  BBI.Unalign = 0;
+  BBI.PostAlign = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I) {
+    BBI.Size += TII->GetInstSizeInBytes(I);
+    // For inline asm, GetInstSizeInBytes returns a conservative estimate.
+    // The actual size may be smaller, but still a multiple of the instr size.
+    if (I->isInlineAsm())
+      BBI.Unalign = isThumb ? 1 : 2;
+    // Also consider instructions that may be shrunk later.
+    else if (isThumb && mayOptimizeThumb2Instruction(I))
+      BBI.Unalign = 1;
+  }
+
+  // tBR_JTr contains a .align 2 directive.
+  if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
+    BBI.PostAlign = 2;
+    MBB->getParent()->ensureAlignment(2);
+  }
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
+  if (!isThumb)
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+  else
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
+            .addImm(ARMCC::AL).addReg(0);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(std::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+/// getUserOffset - Compute the offset of U.MI as seen by the hardware
+/// displacement computation.  Update U.KnownAlignment to match its current
+/// basic block location.
+unsigned ARMConstantIslands::getUserOffset(CPUser &U) const {
+  unsigned UserOffset = getOffsetOf(U.MI);
+  const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()];
+  unsigned KnownBits = BBI.internalKnownBits();
+
+  // The value read from PC is offset from the actual instruction address.
+  UserOffset += (isThumb ? 4 : 8);
+
+  // Because of inline assembly, we may not know the alignment (mod 4) of U.MI.
+  // Make sure U.getMaxDisp() returns a constrained range.
+  U.KnownAlignment = (KnownBits >= 2);
+
+  // On Thumb, offsets==2 mod 4 are rounded down by the hardware for
+  // purposes of the displacement computation; compensate for that here.
+  // For unknown alignments, getMaxDisp() constrains the range instead.
+  if (isThumb && U.KnownAlignment)
+    UserOffset &= ~3u;
+
+  return UserOffset;
+}
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+/// UserOffset is computed by getUserOffset above to include PC adjustments. If
+/// the mod 4 alignment of UserOffset is not known, the uncertainty must be
+/// subtracted from MaxDisp instead. CPUser::getMaxDisp() does that.
+bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK, bool IsSoImm) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use full range of soimm values.
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use full range of soimm values.
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water;
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign);
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI, unsigned MaxDisp,
+                                      bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB
+      || PredMI->getOpcode() == ARM::t2B)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif // NDEBUG
+
+void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
+    unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
+    unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+
+    // This is where block i begins.  Stop if the offset is already correct,
+    // and we have updated 2 blocks.  This is the maximum number of blocks
+    // changed before calling this function.
+    if (i > BBNum + 2 &&
+        BBInfo[i].Offset == Offset &&
+        BBInfo[i].KnownBits == KnownBits)
+      break;
+
+    BBInfo[i].Offset = Offset;
+    BBInfo[i].KnownBits = KnownBits;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = nullptr;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == nullptr)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case ARM::tB:
+    return ((1<<10)-1)*2;
+  case ARM::t2B:
+    return ((1<<23)-1)*2;
+  default:
+    break;
+  }
+
+  return ((1<<23)-1)*4;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  For Thumb, prefer water that will not
+/// introduce padding to water that will.  To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                      water_iterator &WaterIter) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      // Keep looking unless it is perfect.
+      if (BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  (The addition
+  // below is for the unconditional branch we will be adding: 4 bytes on ARM +
+  // Thumb2, 2 on Thumb1.
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = isThumb1 ? 2 : 4;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = std::next(MachineFunction::iterator(UserMBB));
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
+      if (!isThumb)
+        BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      else
+        BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
+          .addImm(ARMCC::AL).addReg(0);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += Delta;
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  This is a
+  // little tricky on Thumb1 since instructions are 2 bytes and constant pool
+  // entries are 4 bytes: if instruction I references island CPE, and
+  // instruction I+1 references CPE', it will not work well to put CPE as far
+  // forward as possible, since then CPE' cannot immediately follow it (that
+  // location is 2 bytes farther away from I+1 than CPE was from I) and we'd
+  // need to create a new island.  So, we make a first guess, then walk through
+  // the instructions between the one currently being looked at and the
+  // possible insertion point, and make sure any other instructions that
+  // reference CPEs will be able to use the same island area; if not, we back
+  // up the insertion point.
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned KnownBits = UserBBI.internalKnownBits();
+  unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // (allows for long branch on Thumb1).  Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign
+               << " kb=" << KnownBits
+               << " up=" << UPad << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - UPad - 8;
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  MachineInstr *LastIT = nullptr;
+  for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+
+    // Remember the last IT instruction.
+    if (MI->getOpcode() == ARM::t2IT)
+      LastIT = MI;
+  }
+
+  --MI;
+
+  // Avoid splitting an IT block.
+  if (LastIT) {
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+    if (CC != ARMCC::AL)
+      MI = LastIT;
+  }
+  NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = AFI->createPICLabelUId();
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = std::next(MachineFunction::iterator(WaterBB));
+
+  } else {
+    // No water found.
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland)));
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool ARMConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = nullptr;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  unsigned PCAdj      = isThumb ? 4 : 8;
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  if (!isThumb1)
+    llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
+
+  // Use BL to implement far jump.
+  Br.MaxDisp = (1 << 21) * 2;
+  MI->setDesc(TII->get(ARM::tBfar));
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+  CC = ARMCC::getOppositeCondition(CC);
+  unsigned CCReg = MI->getOperand(2).getReg();
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(0).setMBB(NewDest);
+        MI->getOperand(1).setImm(CC);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+    .addMBB(NextBB).addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  if (isThumb)
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
+            .addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills
+/// LR / restores LR to pc. FIXME: This is done here because it's only possible
+/// to do this if tBfar is not used.
+bool ARMConstantIslands::undoLRSpillRestore() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
+    MachineInstr *MI = PushPopMIs[i];
+    // First two operands are predicates.
+    if (MI->getOpcode() == ARM::tPOP_RET &&
+        MI->getOperand(2).getReg() == ARM::PC &&
+        MI->getNumExplicitOperands() == 3) {
+      // Create the new insn and copy the predicate from the old.
+      BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
+        .addOperand(MI->getOperand(0))
+        .addOperand(MI->getOperand(1));
+      MI->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions
+// below may shrink MI.
+bool
+ARMConstantIslands::mayOptimizeThumb2Instruction(const MachineInstr *MI) const {
+  switch(MI->getOpcode()) {
+    // optimizeThumb2Instructions.
+    case ARM::t2LEApcrel:
+    case ARM::t2LDRpci:
+    // optimizeThumb2Branches.
+    case ARM::t2B:
+    case ARM::t2Bcc:
+    case ARM::tBcc:
+    // optimizeThumb2JumpTables.
+    case ARM::t2BR_JT:
+      return true;
+  }
+  return false;
+}
+
+bool ARMConstantIslands::optimizeThumb2Instructions() {
+  bool MadeChange = false;
+
+  // Shrink ADR and LDR from constantpool.
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned Opcode = U.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2LEApcrel:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLEApcrel;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    case ARM::t2LDRpci:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLDRpci;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    }
+
+    if (!NewOpc)
+      continue;
+
+    unsigned UserOffset = getUserOffset(U);
+    unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+
+    // Be conservative with inline asm.
+    if (!U.KnownAlignment)
+      MaxOffs -= 2;
+
+    // FIXME: Check if offset is multiple of scale if scale is not 4.
+    if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
+      DEBUG(dbgs() << "Shrink: " << *U.MI);
+      U.MI->setDesc(TII->get(NewOpc));
+      MachineBasicBlock *MBB = U.MI->getParent();
+      BBInfo[MBB->getNumber()].Size -= 2;
+      adjustBBOffsetsAfter(MBB);
+      ++NumT2CPShrunk;
+      MadeChange = true;
+    }
+  }
+
+  MadeChange |= optimizeThumb2Branches();
+  MadeChange |= optimizeThumb2JumpTables();
+  return MadeChange;
+}
+
+bool ARMConstantIslands::optimizeThumb2Branches() {
+  bool MadeChange = false;
+
+  for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) {
+    ImmBranch &Br = ImmBranches[i];
+    unsigned Opcode = Br.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2B:
+      NewOpc = ARM::tB;
+      Bits = 11;
+      Scale = 2;
+      break;
+    case ARM::t2Bcc: {
+      NewOpc = ARM::tBcc;
+      Bits = 8;
+      Scale = 2;
+      break;
+    }
+    }
+    if (NewOpc) {
+      unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+      MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+      if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
+        DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
+        Br.MI->setDesc(TII->get(NewOpc));
+        MachineBasicBlock *MBB = Br.MI->getParent();
+        BBInfo[MBB->getNumber()].Size -= 2;
+        adjustBBOffsetsAfter(MBB);
+        ++NumT2BrShrunk;
+        MadeChange = true;
+      }
+    }
+
+    Opcode = Br.MI->getOpcode();
+    if (Opcode != ARM::tBcc)
+      continue;
+
+    // If the conditional branch doesn't kill CPSR, then CPSR can be liveout
+    // so this transformation is not safe.
+    if (!Br.MI->killsRegister(ARM::CPSR))
+      continue;
+
+    NewOpc = 0;
+    unsigned PredReg = 0;
+    ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg);
+    if (Pred == ARMCC::EQ)
+      NewOpc = ARM::tCBZ;
+    else if (Pred == ARMCC::NE)
+      NewOpc = ARM::tCBNZ;
+    if (!NewOpc)
+      continue;
+    MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+    // Check if the distance is within 126. Subtract starting offset by 2
+    // because the cmp will be eliminated.
+    unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2;
+    unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+    if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
+      MachineBasicBlock::iterator CmpMI = Br.MI;
+      if (CmpMI != Br.MI->getParent()->begin()) {
+        --CmpMI;
+        if (CmpMI->getOpcode() == ARM::tCMPi8) {
+          unsigned Reg = CmpMI->getOperand(0).getReg();
+          Pred = getInstrPredicate(CmpMI, PredReg);
+          if (Pred == ARMCC::AL &&
+              CmpMI->getOperand(1).getImm() == 0 &&
+              isARMLowRegister(Reg)) {
+            MachineBasicBlock *MBB = Br.MI->getParent();
+            DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+            MachineInstr *NewBR =
+              BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+              .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
+            CmpMI->eraseFromParent();
+            Br.MI->eraseFromParent();
+            Br.MI = NewBR;
+            BBInfo[MBB->getNumber()].Size -= 2;
+            adjustBBOffsetsAfter(MBB);
+            ++NumCBZ;
+            MadeChange = true;
+          }
+        }
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::optimizeThumb2JumpTables() {
+  bool MadeChange = false;
+
+  // FIXME: After the tables are shrunk, can we get rid some of the
+  // constantpool tables?
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 3 : 2);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    bool ByteOk = true;
+    bool HalfWordOk = true;
+    unsigned JTOffset = getOffsetOf(MI) + 4;
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
+      // Negative offset is not ok. FIXME: We should change BB layout to make
+      // sure all the branches are forward.
+      if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+        ByteOk = false;
+      unsigned TBHLimit = ((1<<16)-1)*2;
+      if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+        HalfWordOk = false;
+      if (!ByteOk && !HalfWordOk)
+        break;
+    }
+
+    if (ByteOk || HalfWordOk) {
+      MachineBasicBlock *MBB = MI->getParent();
+      unsigned BaseReg = MI->getOperand(0).getReg();
+      bool BaseRegKill = MI->getOperand(0).isKill();
+      if (!BaseRegKill)
+        continue;
+      unsigned IdxReg = MI->getOperand(1).getReg();
+      bool IdxRegKill = MI->getOperand(1).isKill();
+
+      // Scan backwards to find the instruction that defines the base
+      // register. Due to post-RA scheduling, we can't count on it
+      // immediately preceding the branch instruction.
+      MachineBasicBlock::iterator PrevI = MI;
+      MachineBasicBlock::iterator B = MBB->begin();
+      while (PrevI != B && !PrevI->definesRegister(BaseReg))
+        --PrevI;
+
+      // If for some reason we didn't find it, we can't do anything, so
+      // just skip this one.
+      if (!PrevI->definesRegister(BaseReg))
+        continue;
+
+      MachineInstr *AddrMI = PrevI;
+      bool OptOk = true;
+      // Examine the instruction that calculates the jumptable entry address.
+      // Make sure it only defines the base register and kills any uses
+      // other than the index register.
+      for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) {
+        const MachineOperand &MO = AddrMI->getOperand(k);
+        if (!MO.isReg() || !MO.getReg())
+          continue;
+        if (MO.isDef() && MO.getReg() != BaseReg) {
+          OptOk = false;
+          break;
+        }
+        if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) {
+          OptOk = false;
+          break;
+        }
+      }
+      if (!OptOk)
+        continue;
+
+      // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction
+      // that gave us the initial base register definition.
+      for (--PrevI; PrevI != B && !PrevI->definesRegister(BaseReg); --PrevI)
+        ;
+
+      // The instruction should be a tLEApcrel or t2LEApcrelJT; we want
+      // to delete it as well.
+      MachineInstr *LeaMI = PrevI;
+      if ((LeaMI->getOpcode() != ARM::tLEApcrelJT &&
+           LeaMI->getOpcode() != ARM::t2LEApcrelJT) ||
+          LeaMI->getOperand(0).getReg() != BaseReg)
+        OptOk = false;
+
+      if (!OptOk)
+        continue;
+
+      DEBUG(dbgs() << "Shrink JT: " << *MI << "     addr: " << *AddrMI
+                   << "      lea: " << *LeaMI);
+      unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
+      MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
+        .addReg(IdxReg, getKillRegState(IdxRegKill))
+        .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+        .addImm(MI->getOperand(JTOpIdx+1).getImm());
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
+      // FIXME: Insert an "ALIGN" instruction to ensure the next instruction
+      // is 2-byte aligned. For now, asm printer will fix it up.
+      unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+      unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI);
+      OrigSize += TII->GetInstSizeInBytes(LeaMI);
+      OrigSize += TII->GetInstSizeInBytes(MI);
+
+      AddrMI->eraseFromParent();
+      LeaMI->eraseFromParent();
+      MI->eraseFromParent();
+
+      int delta = OrigSize - NewSize;
+      BBInfo[MBB->getNumber()].Size -= delta;
+      adjustBBOffsetsAfter(MBB);
+
+      ++NumTBs;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// reorderThumb2JumpTables - Adjust the function's block layout to ensure that
+/// jump tables always branch forwards, since that's what tbb and tbh need.
+bool ARMConstantIslands::reorderThumb2JumpTables() {
+  bool MadeChange = false;
+
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 3 : 2);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    // We prefer if target blocks for the jump table come after the jump
+    // instruction so we can use TB[BH]. Loop through the target blocks
+    // and try to adjust them such that that's true.
+    int JTNumber = MI->getParent()->getNumber();
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      int DTNumber = MBB->getNumber();
+
+      if (DTNumber < JTNumber) {
+        // The destination precedes the switch. Try to move the block forward
+        // so we have a positive offset.
+        MachineBasicBlock *NewBB =
+          adjustJTTargetBlockForward(MBB, MI->getParent());
+        if (NewBB)
+          MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB);
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+MachineBasicBlock *ARMConstantIslands::
+adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
+  // If the destination block is terminated by an unconditional branch,
+  // try to move it; otherwise, create a new block following the jump
+  // table that branches back to the actual target. This is a very simple
+  // heuristic. FIXME: We can definitely improve it.
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineOperand, 4> CondPrior;
+  MachineFunction::iterator BBi = BB;
+  MachineFunction::iterator OldPrior = std::prev(BBi);
+
+  // If the block terminator isn't analyzable, don't try to move the block
+  bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond);
+
+  // If the block ends in an unconditional branch, move it. The prior block
+  // has to have an analyzable terminator for us to move this one. Be paranoid
+  // and make sure we're not trying to move the entry block of the function.
+  if (!B && Cond.empty() && BB != MF->begin() &&
+      !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
+    BB->moveAfter(JTBB);
+    OldPrior->updateTerminator();
+    BB->updateTerminator();
+    // Update numbering to account for the block being moved.
+    MF->RenumberBlocks();
+    ++NumJTMoved;
+    return nullptr;
+  }
+
+  // Create a new MBB for the code after the jump BB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(JTBB->getBasicBlock());
+  MachineFunction::iterator MBBI = JTBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Add an unconditional branch from NewBB to BB.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond directly to anything in the source.
+  assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?");
+  BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB)
+          .addImm(ARMCC::AL).addReg(0);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Update the CFG.
+  NewBB->addSuccessor(BB);
+  JTBB->removeSuccessor(BB);
+  JTBB->addSuccessor(NewBB);
+
+  ++NumJTInserted;
+  return NewBB;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 0000000..7d41c69
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,262 @@
+//===-- ARMConstantPoolValue.cpp - ARM constantpool value -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind),
+    PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)),
+    LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::~ARMConstantPoolValue() {}
+
+const char *ARMConstantPoolValue::getModifierText() const {
+  switch (Modifier) {
+    // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+    // strings if that's legal.
+  case ARMCP::no_modifier: return "none";
+  case ARMCP::TLSGD:       return "tlsgd";
+  case ARMCP::GOT:         return "GOT";
+  case ARMCP::GOTOFF:      return "GOTOFF";
+  case ARMCP::GOTTPOFF:    return "gottpoff";
+  case ARMCP::TPOFF:       return "tpoff";
+  }
+  llvm_unreachable("Unknown modifier!");
+}
+
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                    unsigned Alignment) {
+  llvm_unreachable("Shouldn't be calling this directly!");
+}
+
+void
+ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddInteger(LabelId);
+  ID.AddInteger(PCAdjust);
+}
+
+bool
+ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
+  if (ACPV->Kind == Kind &&
+      ACPV->PCAdjust == PCAdjust &&
+      ACPV->Modifier == Modifier) {
+    if (ACPV->LabelId == LabelId)
+      return true;
+    // Two PC relative constpool entries containing the same GV address or
+    // external symbols. FIXME: What about blockaddress?
+    if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol)
+      return true;
+  }
+  return false;
+}
+
+void ARMConstantPoolValue::dump() const {
+  errs() << "  " << *this;
+}
+
+void ARMConstantPoolValue::print(raw_ostream &O) const {
+  if (Modifier) O << "(" << getModifierText() << ")";
+  if (PCAdjust != 0) {
+    O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
+    if (AddCurrentAddress) O << "-.";
+    O << ")";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty,
+                                                 const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier,
+                         AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) {
+  return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalValue *GV,
+                                ARMCP::ARMCPModifier Modifier) {
+  return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()),
+                                     GV, 0, ARMCP::CPValue, 0,
+                                     Modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                                ARMCP::ARMCPModifier Modifier,
+                                bool AddCurrentAddress) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier,
+                                     AddCurrentAddress);
+}
+
+const GlobalValue *ARMConstantPoolConstant::getGV() const {
+  return dyn_cast_or_null<GlobalValue>(CVal);
+}
+
+const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
+  return dyn_cast_or_null<BlockAddress>(CVal);
+}
+
+int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                       unsigned Alignment) {
+  return getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
+}
+
+bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV);
+  return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(CVal);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolConstant::print(raw_ostream &O) const {
+  O << CVal->getName();
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, const char *s,
+                                             unsigned id,
+                                             unsigned char PCAdj,
+                                             ARMCP::ARMCPModifier Modifier,
+                                             bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
+                         AddCurrentAddress),
+    S(s) {}
+
+ARMConstantPoolSymbol *
+ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
+                              unsigned ID, unsigned char PCAdj) {
+  return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                     unsigned Alignment) {
+  return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
+}
+
+bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV);
+  return ACPS && ACPS->S == S && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddString(S);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolSymbol::print(raw_ostream &O) const {
+  O << S;
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C,
+                                       const MachineBasicBlock *mbb,
+                                       unsigned id, unsigned char PCAdj,
+                                       ARMCP::ARMCPModifier Modifier,
+                                       bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj,
+                         Modifier, AddCurrentAddress),
+    MBB(mbb) {}
+
+ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
+                                               const MachineBasicBlock *mbb,
+                                               unsigned ID,
+                                               unsigned char PCAdj) {
+  return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                  unsigned Alignment) {
+  return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
+}
+
+bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV);
+  return ACPMBB && ACPMBB->MBB == MBB &&
+    ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(MBB);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolMBB::print(raw_ostream &O) const {
+  O << "BB#" << MBB->getNumber();
+  ARMConstantPoolValue::print(O);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 0000000..c7a8415
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,259 @@
+//===-- ARMConstantPoolValue.h - ARM constantpool value ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+
+namespace llvm {
+
+class BlockAddress;
+class Constant;
+class GlobalValue;
+class LLVMContext;
+class MachineBasicBlock;
+
+namespace ARMCP {
+  enum ARMCPKind {
+    CPValue,
+    CPExtSymbol,
+    CPBlockAddress,
+    CPLSDA,
+    CPMachineBasicBlock
+  };
+
+  enum ARMCPModifier {
+    no_modifier,
+    TLSGD,
+    GOT,
+    GOTOFF,
+    GOTTPOFF,
+    TPOFF
+  };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC-relative displacement between the address of the load
+/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  unsigned LabelId;        // Label id of the load.
+  ARMCP::ARMCPKind Kind;   // Kind of constant.
+  unsigned char PCAdjust;  // Extra adjustment if constantpool is pc-relative.
+                           // 8 for ARM, 4 for Thumb.
+  ARMCP::ARMCPModifier Modifier;   // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;
+
+protected:
+  ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  template <typename Derived>
+  int getExistingMachineCPValueImpl(MachineConstantPool *CP,
+                                    unsigned Alignment) {
+    unsigned AlignMask = Alignment - 1;
+    const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+    for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+      if (Constants[i].isMachineConstantPoolEntry() &&
+          (Constants[i].getAlignment() & AlignMask) == 0) {
+        ARMConstantPoolValue *CPV =
+            (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+        if (Derived *APC = dyn_cast<Derived>(CPV))
+          if (cast<Derived>(this)->equals(APC))
+            return i;
+      }
+    }
+
+    return -1;
+  }
+
+public:
+  virtual ~ARMConstantPoolValue();
+
+  ARMCP::ARMCPModifier getModifier() const { return Modifier; }
+  const char *getModifierText() const;
+  bool hasModifier() const { return Modifier != ARMCP::no_modifier; }
+
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+
+  unsigned getLabelId() const { return LabelId; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+
+  bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
+  bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
+  bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+  bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+  bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
+
+  unsigned getRelocationInfo() const override { return 2; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  bool equals(const ARMConstantPoolValue *A) const {
+    return this->LabelId == A->LabelId &&
+      this->PCAdjust == A->PCAdjust &&
+      this->Modifier == A->Modifier;
+  }
+
+  void print(raw_ostream &O) const override;
+  void print(raw_ostream *O) const { if (O) print(*O); }
+  void dump() const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
+  V.print(O);
+  return O;
+}
+
+/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class ARMConstantPoolConstant : public ARMConstantPoolValue {
+  const Constant *CVal;         // Constant being loaded.
+
+  ARMConstantPoolConstant(const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(Type *Ty, const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID);
+  static ARMConstantPoolConstant *Create(const GlobalValue *GV,
+                                         ARMCP::ARMCPModifier Modifier);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj,
+                                         ARMCP::ARMCPModifier Modifier,
+                                         bool AddCurrentAddress);
+
+  const GlobalValue *getGV() const;
+  const BlockAddress *getBlockAddress() const;
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  void print(raw_ostream &O) const override;
+  static bool classof(const ARMConstantPoolValue *APV) {
+    return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
+  }
+
+  bool equals(const ARMConstantPoolConstant *A) const {
+    return CVal == A->CVal && ARMConstantPoolValue::equals(A);
+  }
+};
+
+/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
+/// symbols.
+class ARMConstantPoolSymbol : public ARMConstantPoolValue {
+  const std::string S;          // ExtSymbol being loaded.
+
+  ARMConstantPoolSymbol(LLVMContext &C, const char *s, unsigned id,
+                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                        bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolSymbol *Create(LLVMContext &C, const char *s,
+                                       unsigned ID, unsigned char PCAdj);
+
+  const char *getSymbol() const { return S.c_str(); }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void print(raw_ostream &O) const override;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isExtSymbol();
+  }
+
+  bool equals(const ARMConstantPoolSymbol *A) const {
+    return S == A->S && ARMConstantPoolValue::equals(A);
+  }
+};
+
+/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
+/// block.
+class ARMConstantPoolMBB : public ARMConstantPoolValue {
+  const MachineBasicBlock *MBB; // Machine basic block.
+
+  ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id,
+                     unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                     bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolMBB *Create(LLVMContext &C,
+                                    const MachineBasicBlock *mbb,
+                                    unsigned ID, unsigned char PCAdj);
+
+  const MachineBasicBlock *getMBB() const { return MBB; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void print(raw_ostream &O) const override;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isMachineBasicBlock();
+  }
+
+  bool equals(const ARMConstantPoolMBB *A) const {
+    return MBB == A->MBB && ARMConstantPoolValue::equals(A);
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
new file mode 100644
index 0000000..51d3dbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -0,0 +1,1364 @@
+//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-pseudo"
+
+static cl::opt<bool>
+VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
+                cl::desc("Verify machine code after expanding ARM pseudos"));
+
+namespace {
+  class ARMExpandPseudo : public MachineFunctionPass {
+  public:
+    static char ID;
+    ARMExpandPseudo() : MachineFunctionPass(ID) {}
+
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "ARM pseudo instruction expansion pass";
+    }
+
+  private:
+    void TransferImpOps(MachineInstr &OldMI,
+                        MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
+    bool ExpandMI(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MBBI);
+    bool ExpandMBB(MachineBasicBlock &MBB);
+    void ExpandVLD(MachineBasicBlock::iterator &MBBI);
+    void ExpandVST(MachineBasicBlock::iterator &MBBI);
+    void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
+    void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                    unsigned Opc, bool IsExt);
+    void ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI);
+  };
+  char ARMExpandPseudo::ID = 0;
+}
+
+/// TransferImpOps - Transfer implicit operands on the pseudo instruction to
+/// the instructions created from the expansion.
+void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
+                                     MachineInstrBuilder &UseMI,
+                                     MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = OldMI.getOperand(i);
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.addOperand(MO);
+    else
+      DefMI.addOperand(MO);
+  }
+}
+
+namespace {
+  // Constants for register spacing in NEON load/store instructions.
+  // For quad-register load-lane and store-lane pseudo instructors, the
+  // spacing is initially assumed to be EvenDblSpc, and that is changed to
+  // OddDblSpc depending on the lane number operand.
+  enum NEONRegSpacing {
+    SingleSpc,
+    EvenDblSpc,
+    OddDblSpc
+  };
+
+  // Entries for NEON load/store information table.  The table is sorted by
+  // PseudoOpc for fast binary-search lookups.
+  struct NEONLdStTableEntry {
+    uint16_t PseudoOpc;
+    uint16_t RealOpc;
+    bool IsLoad;
+    bool isUpdating;
+    bool hasWritebackOperand;
+    uint8_t RegSpacing; // One of type NEONRegSpacing
+    uint8_t NumRegs; // D registers loaded or stored
+    uint8_t RegElts; // elements per D register; used for lane ops
+    // FIXME: Temporary flag to denote whether the real instruction takes
+    // a single register (like the encoding) or all of the registers in
+    // the list (like the asm syntax and the isel DAG). When all definitions
+    // are converted to take only the single encoded register, this will
+    // go away.
+    bool copyAllListRegs;
+
+    // Comparison methods for binary search of the table.
+    bool operator<(const NEONLdStTableEntry &TE) const {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+    friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
+      return TE.PseudoOpc < PseudoOpc;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
+                                                const NEONLdStTableEntry &TE) {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+  };
+}
+
+static const NEONLdStTableEntry NEONLdStTable[] = {
+{ ARM::VLD1LNq16Pseudo,     ARM::VLD1LNd16,     true, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true,  EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq32Pseudo,     ARM::VLD1LNd32,     true, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true,  EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq8Pseudo,      ARM::VLD1LNd8,      true, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VLD1LNq8Pseudo_UPD,  ARM::VLD1LNd8_UPD, true, true, true,  EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VLD1d64QPseudo,      ARM::VLD1d64Q,     true,  false, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_fixed,  ARM::VLD1d64Qwb_fixed,   true,  true, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64TPseudo,      ARM::VLD1d64T,     true,  false, false, SingleSpc,  3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_fixed,  ARM::VLD1d64Twb_fixed,   true,  true, false, SingleSpc,  3, 1 ,false},
+
+{ ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc,  2, 4 ,true},
+{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true,  SingleSpc,  2, 4 ,true},
+{ ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, false, SingleSpc,  2, 2 ,true},
+{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true,  SingleSpc,  2, 2 ,true},
+{ ARM::VLD2LNd8Pseudo,      ARM::VLD2LNd8,      true, false, false, SingleSpc,  2, 8 ,true},
+{ ARM::VLD2LNd8Pseudo_UPD,  ARM::VLD2LNd8_UPD, true, true, true,  SingleSpc,  2, 8 ,true},
+{ ARM::VLD2LNq16Pseudo,     ARM::VLD2LNq16,     true, false, false, EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true,  EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq32Pseudo,     ARM::VLD2LNq32,     true, false, false, EvenDblSpc, 2, 2 ,true},
+{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true,  EvenDblSpc, 2, 2 ,true},
+
+{ ARM::VLD2q16Pseudo,       ARM::VLD2q16,      true,  false, false, SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_fixed,   ARM::VLD2q16wb_fixed, true, true, false,  SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_register,   ARM::VLD2q16wb_register, true, true, true,  SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q32Pseudo,       ARM::VLD2q32,      true,  false, false, SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_fixed,   ARM::VLD2q32wb_fixed, true, true, false,  SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_register,   ARM::VLD2q32wb_register, true, true, true,  SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q8Pseudo,        ARM::VLD2q8,       true,  false, false, SingleSpc,  4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_fixed,    ARM::VLD2q8wb_fixed, true, true, false,  SingleSpc,  4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_register,    ARM::VLD2q8wb_register, true, true, true,  SingleSpc,  4, 8 ,false},
+
+{ ARM::VLD3DUPd16Pseudo,     ARM::VLD3DUPd16,     true, false, false, SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true,  SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd32Pseudo,     ARM::VLD3DUPd32,     true, false, false, SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true,  SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, false, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD, true, true, true,  SingleSpc, 3, 8,true},
+
+{ ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VLD3LNd32Pseudo,     ARM::VLD3LNd32,     true, false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VLD3LNd8Pseudo,      ARM::VLD3LNd8,      true, false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VLD3LNd8Pseudo_UPD,  ARM::VLD3LNd8_UPD, true, true, true,  SingleSpc,  3, 8 ,true},
+{ ARM::VLD3LNq16Pseudo,     ARM::VLD3LNq16,     true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq32Pseudo,     ARM::VLD3LNq32,     true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true,  EvenDblSpc, 3, 2 ,true},
+
+{ ARM::VLD3d16Pseudo,       ARM::VLD3d16,      true,  false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VLD3d16Pseudo_UPD,   ARM::VLD3d16_UPD, true, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VLD3d32Pseudo,       ARM::VLD3d32,      true,  false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VLD3d32Pseudo_UPD,   ARM::VLD3d32_UPD, true, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VLD3d8Pseudo,        ARM::VLD3d8,       true,  false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VLD3d8Pseudo_UPD,    ARM::VLD3d8_UPD, true, true, true,  SingleSpc,  3, 8 ,true},
+
+{ ARM::VLD3q16Pseudo_UPD,    ARM::VLD3q16_UPD, true, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3q16oddPseudo,     ARM::VLD3q16,     true,  false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true,  OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3q32Pseudo_UPD,    ARM::VLD3q32_UPD, true, true, true,  EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3q32oddPseudo,     ARM::VLD3q32,     true,  false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true,  OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3q8Pseudo_UPD,     ARM::VLD3q8_UPD, true, true, true,  EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3q8oddPseudo,      ARM::VLD3q8,      true,  false, false, OddDblSpc,  3, 8 ,true},
+{ ARM::VLD3q8oddPseudo_UPD,  ARM::VLD3q8_UPD, true, true, true,  OddDblSpc,  3, 8 ,true},
+
+{ ARM::VLD4DUPd16Pseudo,     ARM::VLD4DUPd16,     true, false, false, SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true,  SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd32Pseudo,     ARM::VLD4DUPd32,     true, false, false, SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true,  SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, false, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD, true, true, true,  SingleSpc, 4, 8,true},
+
+{ ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VLD4LNd32Pseudo,     ARM::VLD4LNd32,     true, false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VLD4LNd8Pseudo,      ARM::VLD4LNd8,      true, false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VLD4LNd8Pseudo_UPD,  ARM::VLD4LNd8_UPD, true, true, true,  SingleSpc,  4, 8 ,true},
+{ ARM::VLD4LNq16Pseudo,     ARM::VLD4LNq16,     true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq32Pseudo,     ARM::VLD4LNq32,     true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, true,  EvenDblSpc, 4, 2 ,true},
+
+{ ARM::VLD4d16Pseudo,       ARM::VLD4d16,      true,  false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VLD4d16Pseudo_UPD,   ARM::VLD4d16_UPD, true, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VLD4d32Pseudo,       ARM::VLD4d32,      true,  false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VLD4d32Pseudo_UPD,   ARM::VLD4d32_UPD, true, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VLD4d8Pseudo,        ARM::VLD4d8,       true,  false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VLD4d8Pseudo_UPD,    ARM::VLD4d8_UPD, true, true, true,  SingleSpc,  4, 8 ,true},
+
+{ ARM::VLD4q16Pseudo_UPD,    ARM::VLD4q16_UPD, true, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4q16oddPseudo,     ARM::VLD4q16,     true,  false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true,  OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4q32Pseudo_UPD,    ARM::VLD4q32_UPD, true, true, true,  EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4q32oddPseudo,     ARM::VLD4q32,     true,  false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true,  OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4q8Pseudo_UPD,     ARM::VLD4q8_UPD, true, true, true,  EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4q8oddPseudo,      ARM::VLD4q8,      true,  false, false, OddDblSpc,  4, 8 ,true},
+{ ARM::VLD4q8oddPseudo_UPD,  ARM::VLD4q8_UPD, true, true, true,  OddDblSpc,  4, 8 ,true},
+
+{ ARM::VST1LNq16Pseudo,     ARM::VST1LNd16,    false, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true,  EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq32Pseudo,     ARM::VST1LNd32,    false, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true,  EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq8Pseudo,      ARM::VST1LNd8,     false, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VST1LNq8Pseudo_UPD,  ARM::VST1LNd8_UPD, false, true, true,  EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VST1d64QPseudo,      ARM::VST1d64Q,     false, false, false, SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_fixed,  ARM::VST1d64Qwb_fixed, false, true, false,  SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true,  SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64TPseudo,      ARM::VST1d64T,     false, false, false, SingleSpc,  3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_fixed,  ARM::VST1d64Twb_fixed, false, true, false,  SingleSpc,  3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_register,  ARM::VST1d64Twb_register, false, true, true,  SingleSpc,  3, 1 ,false},
+
+{ ARM::VST2LNd16Pseudo,     ARM::VST2LNd16,     false, false, false, SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true,  SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd32Pseudo,     ARM::VST2LNd32,     false, false, false, SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true,  SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd8Pseudo,      ARM::VST2LNd8,      false, false, false, SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNd8Pseudo_UPD,  ARM::VST2LNd8_UPD, false, true, true,  SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNq16Pseudo,     ARM::VST2LNq16,     false, false, false, EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true,  EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq32Pseudo,     ARM::VST2LNq32,     false, false, false, EvenDblSpc, 2, 2,true},
+{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true,  EvenDblSpc, 2, 2,true},
+
+{ ARM::VST2q16Pseudo,       ARM::VST2q16,      false, false, false, SingleSpc,  4, 4 ,false},
+{ ARM::VST2q16PseudoWB_fixed,   ARM::VST2q16wb_fixed, false, true, false,  SingleSpc,  4, 4 ,false},
+{ ARM::VST2q16PseudoWB_register,   ARM::VST2q16wb_register, false, true, true,  SingleSpc,  4, 4 ,false},
+{ ARM::VST2q32Pseudo,       ARM::VST2q32,      false, false, false, SingleSpc,  4, 2 ,false},
+{ ARM::VST2q32PseudoWB_fixed,   ARM::VST2q32wb_fixed, false, true, false,  SingleSpc,  4, 2 ,false},
+{ ARM::VST2q32PseudoWB_register,   ARM::VST2q32wb_register, false, true, true,  SingleSpc,  4, 2 ,false},
+{ ARM::VST2q8Pseudo,        ARM::VST2q8,       false, false, false, SingleSpc,  4, 8 ,false},
+{ ARM::VST2q8PseudoWB_fixed,    ARM::VST2q8wb_fixed, false, true, false,  SingleSpc,  4, 8 ,false},
+{ ARM::VST2q8PseudoWB_register,    ARM::VST2q8wb_register, false, true, true,  SingleSpc,  4, 8 ,false},
+
+{ ARM::VST3LNd16Pseudo,     ARM::VST3LNd16,     false, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true,  SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd32Pseudo,     ARM::VST3LNd32,     false, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true,  SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd8Pseudo,      ARM::VST3LNd8,      false, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNd8Pseudo_UPD,  ARM::VST3LNd8_UPD, false, true, true,  SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNq16Pseudo,     ARM::VST3LNq16,     false, false, false, EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true,  EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq32Pseudo,     ARM::VST3LNq32,     false, false, false, EvenDblSpc, 3, 2,true},
+{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true,  EvenDblSpc, 3, 2,true},
+
+{ ARM::VST3d16Pseudo,       ARM::VST3d16,      false, false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VST3d16Pseudo_UPD,   ARM::VST3d16_UPD, false, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VST3d32Pseudo,       ARM::VST3d32,      false, false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VST3d32Pseudo_UPD,   ARM::VST3d32_UPD, false, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VST3d8Pseudo,        ARM::VST3d8,       false, false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VST3d8Pseudo_UPD,    ARM::VST3d8_UPD, false, true, true,  SingleSpc,  3, 8 ,true},
+
+{ ARM::VST3q16Pseudo_UPD,    ARM::VST3q16_UPD, false, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VST3q16oddPseudo,     ARM::VST3q16,     false, false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true,  OddDblSpc,  3, 4 ,true},
+{ ARM::VST3q32Pseudo_UPD,    ARM::VST3q32_UPD, false, true, true,  EvenDblSpc, 3, 2 ,true},
+{ ARM::VST3q32oddPseudo,     ARM::VST3q32,     false, false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true,  OddDblSpc,  3, 2 ,true},
+{ ARM::VST3q8Pseudo_UPD,     ARM::VST3q8_UPD, false, true, true,  EvenDblSpc, 3, 8 ,true},
+{ ARM::VST3q8oddPseudo,      ARM::VST3q8,      false, false, false, OddDblSpc,  3, 8 ,true},
+{ ARM::VST3q8oddPseudo_UPD,  ARM::VST3q8_UPD, false, true, true,  OddDblSpc,  3, 8 ,true},
+
+{ ARM::VST4LNd16Pseudo,     ARM::VST4LNd16,     false, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true,  SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd32Pseudo,     ARM::VST4LNd32,     false, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true,  SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd8Pseudo,      ARM::VST4LNd8,      false, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNd8Pseudo_UPD,  ARM::VST4LNd8_UPD, false, true, true,  SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNq16Pseudo,     ARM::VST4LNq16,     false, false, false, EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true,  EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq32Pseudo,     ARM::VST4LNq32,     false, false, false, EvenDblSpc, 4, 2,true},
+{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true,  EvenDblSpc, 4, 2,true},
+
+{ ARM::VST4d16Pseudo,       ARM::VST4d16,      false, false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VST4d16Pseudo_UPD,   ARM::VST4d16_UPD, false, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VST4d32Pseudo,       ARM::VST4d32,      false, false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VST4d32Pseudo_UPD,   ARM::VST4d32_UPD, false, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VST4d8Pseudo,        ARM::VST4d8,       false, false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VST4d8Pseudo_UPD,    ARM::VST4d8_UPD, false, true, true,  SingleSpc,  4, 8 ,true},
+
+{ ARM::VST4q16Pseudo_UPD,    ARM::VST4q16_UPD, false, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VST4q16oddPseudo,     ARM::VST4q16,     false, false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true,  OddDblSpc,  4, 4 ,true},
+{ ARM::VST4q32Pseudo_UPD,    ARM::VST4q32_UPD, false, true, true,  EvenDblSpc, 4, 2 ,true},
+{ ARM::VST4q32oddPseudo,     ARM::VST4q32,     false, false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true,  OddDblSpc,  4, 2 ,true},
+{ ARM::VST4q8Pseudo_UPD,     ARM::VST4q8_UPD, false, true, true,  EvenDblSpc, 4, 8 ,true},
+{ ARM::VST4q8oddPseudo,      ARM::VST4q8,      false, false, false, OddDblSpc,  4, 8 ,true},
+{ ARM::VST4q8oddPseudo_UPD,  ARM::VST4q8_UPD, false, true, true,  OddDblSpc,  4, 8 ,true}
+};
+
+/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON
+/// load or store pseudo instruction.
+static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
+  const unsigned NumEntries = array_lengthof(NEONLdStTable);
+
+#ifndef NDEBUG
+  // Make sure the table is sorted.
+  static bool TableChecked = false;
+  if (!TableChecked) {
+    for (unsigned i = 0; i != NumEntries-1; ++i)
+      assert(NEONLdStTable[i] < NEONLdStTable[i+1] &&
+             "NEONLdStTable is not sorted!");
+    TableChecked = true;
+  }
+#endif
+
+  const NEONLdStTableEntry *I =
+    std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode);
+  if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode)
+    return I;
+  return nullptr;
+}
+
+/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register,
+/// corresponding to the specified register spacing.  Not all of the results
+/// are necessarily valid, e.g., a Q register only has 2 D subregisters.
+static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
+                        const TargetRegisterInfo *TRI, unsigned &D0,
+                        unsigned &D1, unsigned &D2, unsigned &D3) {
+  if (RegSpc == SingleSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_3);
+  } else if (RegSpc == EvenDblSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_4);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_6);
+  } else {
+    assert(RegSpc == OddDblSpc && "unknown register spacing");
+    D0 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_3);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_5);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_7);
+  }
+}
+
+/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VLD instructions with D register operands.
+void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+
+  bool DstIsDead = MI.getOperand(OpIdx).isDead();
+  unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 1 && TableEntry->copyAllListRegs)
+    MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 2 && TableEntry->copyAllListRegs)
+    MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 3 && TableEntry->copyAllListRegs)
+    MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // For an instruction writing double-spaced subregs, the pseudo instruction
+  // has an extra operand that is a use of the super-register.  Record the
+  // operand index and skip over it.
+  unsigned SrcOpIdx = 0;
+  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
+    SrcOpIdx = OpIdx++;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source operand used for double-spaced subregs over
+  // to the new instruction as an implicit operand.
+  if (SrcOpIdx != 0) {
+    MachineOperand MO = MI.getOperand(SrcOpIdx);
+    MO.setImplicit(true);
+    MIB.addOperand(MO);
+  }
+  // Add an implicit def for the super-register.
+  MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+}
+
+/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VST instructions with D register operands.
+void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 1 && TableEntry->copyAllListRegs)
+    MIB.addReg(D1, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 2 && TableEntry->copyAllListRegs)
+    MIB.addReg(D2, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 3 && TableEntry->copyAllListRegs)
+    MIB.addReg(D3, getUndefRegState(SrcIsUndef));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
+    MIB->addRegisterKilled(SrcReg, TRI, true);
+  else if (!SrcIsUndef)
+    MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg.
+  TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+}
+
+/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+  unsigned RegElts = TableEntry->RegElts;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  // The lane operand is always the 3rd from last operand, before the 2
+  // predicate operands.
+  unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm();
+
+  // Adjust the lane and spacing as needed for Q registers.
+  assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane");
+  if (RegSpc == EvenDblSpc && Lane >= RegElts) {
+    RegSpc = OddDblSpc;
+    Lane -= RegElts;
+  }
+  assert(Lane < RegElts && "out of range lane for VLD/VST-lane");
+
+  unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0;
+  unsigned DstReg = 0;
+  bool DstIsDead = false;
+  if (TableEntry->IsLoad) {
+    DstIsDead = MI.getOperand(OpIdx).isDead();
+    DstReg = MI.getOperand(OpIdx++).getReg();
+    GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+    MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 1)
+      MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 2)
+      MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 3)
+      MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  }
+
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Grab the super-register source.
+  MachineOperand MO = MI.getOperand(OpIdx++);
+  if (!TableEntry->IsLoad)
+    GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3);
+
+  // Add the subregs as sources of the new instruction.
+  unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
+                       getKillRegState(MO.isKill()));
+  MIB.addReg(D0, SrcFlags);
+  if (NumRegs > 1)
+    MIB.addReg(D1, SrcFlags);
+  if (NumRegs > 2)
+    MIB.addReg(D2, SrcFlags);
+  if (NumRegs > 3)
+    MIB.addReg(D3, SrcFlags);
+
+  // Add the lane number operand.
+  MIB.addImm(Lane);
+  OpIdx += 1;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source to be an implicit source.
+  MO.setImplicit(true);
+  MIB.addOperand(MO);
+  if (TableEntry->IsLoad)
+    // Add an implicit def for the super-register.
+    MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MI.eraseFromParent();
+}
+
+/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool IsExt) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  unsigned OpIdx = 0;
+
+  // Transfer the destination register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  if (IsExt)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0);
+
+  // Copy the other source register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Add an implicit kill and use for the super-reg.
+  MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
+  TransferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+}
+
+static bool IsAnAddressOperand(const MachineOperand &MO) {
+  // This check is overly conservative.  Unless we are certain that the machine
+  // operand is not a symbol reference, we return that it is a symbol reference.
+  // This is important as the load pair may not be split up Windows.
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+  case MachineOperand::MO_Immediate:
+  case MachineOperand::MO_CImmediate:
+  case MachineOperand::MO_FPImmediate:
+    return false;
+  case MachineOperand::MO_MachineBasicBlock:
+    return true;
+  case MachineOperand::MO_FrameIndex:
+    return false;
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_TargetIndex:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  case MachineOperand::MO_RegisterMask:
+  case MachineOperand::MO_RegisterLiveOut:
+    return false;
+  case MachineOperand::MO_Metadata:
+  case MachineOperand::MO_MCSymbol:
+    return true;
+  case MachineOperand::MO_CFIIndex:
+    return false;
+  }
+  llvm_unreachable("unhandled machine operand type");
+}
+
+void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
+  const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
+  bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO);
+  MachineInstrBuilder LO16, HI16;
+
+  if (!STI->hasV6T2Ops() &&
+      (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
+    // FIXME Windows CE supports older ARM CPUs
+    assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+");
+
+    // Expand into a movi + orr.
+    LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
+    HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstReg);
+
+    assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
+    unsigned ImmVal = (unsigned)MO.getImm();
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    LO16 = LO16.addImm(SOImmValV1);
+    HI16 = HI16.addImm(SOImmValV2);
+    LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    LO16.addImm(Pred).addReg(PredReg).addReg(0);
+    HI16.addImm(Pred).addReg(PredReg).addReg(0);
+    TransferImpOps(MI, LO16, HI16);
+    MI.eraseFromParent();
+    return;
+  }
+
+  unsigned LO16Opc = 0;
+  unsigned HI16Opc = 0;
+  if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
+    LO16Opc = ARM::t2MOVi16;
+    HI16Opc = ARM::t2MOVTi16;
+  } else {
+    LO16Opc = ARM::MOVi16;
+    HI16Opc = ARM::MOVTi16;
+  }
+
+  LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg);
+  HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc))
+    .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstReg);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate: {
+    unsigned Imm = MO.getImm();
+    unsigned Lo16 = Imm & 0xffff;
+    unsigned Hi16 = (Imm >> 16) & 0xffff;
+    LO16 = LO16.addImm(Lo16);
+    HI16 = HI16.addImm(Hi16);
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    const char *ES = MO.getSymbolName();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16);
+    HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16);
+    break;
+  }
+  default: {
+    const GlobalValue *GV = MO.getGlobal();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
+    HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+    break;
+  }
+  }
+
+  LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  LO16.addImm(Pred).addReg(PredReg);
+  HI16.addImm(Pred).addReg(PredReg);
+
+  if (RequiresBundling)
+    finalizeBundle(MBB, &*LO16, &*MBBI);
+
+  TransferImpOps(MI, LO16, HI16);
+  MI.eraseFromParent();
+}
+
+bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+    default:
+      return false;
+    case ARM::VMOVScc:
+    case ARM::VMOVDcc: {
+      unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4));
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCr:
+    case ARM::MOVCCr: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOVCCsi: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+              (MI.getOperand(1).getReg()))
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addOperand(MI.getOperand(5))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOVCCsr: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
+              (MI.getOperand(1).getReg()))
+        .addOperand(MI.getOperand(2))
+        .addOperand(MI.getOperand(3))
+        .addImm(MI.getOperand(4).getImm())
+        .addImm(MI.getOperand(5).getImm()) // 'pred'
+        .addOperand(MI.getOperand(6))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCi16:
+    case ARM::MOVCCi16: {
+      unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4));
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCi:
+    case ARM::MOVCCi: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MVNCCi:
+    case ARM::MVNCCi: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCClsl:
+    case ARM::t2MOVCClsr:
+    case ARM::t2MOVCCasr:
+    case ARM::t2MOVCCror: {
+      unsigned NewOpc;
+      switch (Opcode) {
+      case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+      case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+      case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+      case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+      default: llvm_unreachable("unexpeced conditional move");
+      }
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addOperand(MI.getOperand(5))
+        .addReg(0); // 's' bit
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::Int_eh_sjlj_dispatchsetup: {
+      MachineFunction &MF = *MI.getParent()->getParent();
+      const ARMBaseInstrInfo *AII =
+        static_cast<const ARMBaseInstrInfo*>(TII);
+      const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+      // For functions using a base pointer, we rematerialize it (via the frame
+      // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
+      // for us. Otherwise, expand to nothing.
+      if (RI.hasBasePointer(MF)) {
+        int32_t NumBytes = AFI->getFramePtrSpillOffset();
+        unsigned FramePtr = RI.getFrameRegister(MF);
+        assert(MF.getTarget().getFrameLowering()->hasFP(MF) &&
+               "base pointer without frame pointer?");
+
+        if (AFI->isThumb2Function()) {
+          emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                 FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
+        } else if (AFI->isThumbFunction()) {
+          emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                    FramePtr, -NumBytes, *TII, RI);
+        } else {
+          emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                  FramePtr, -NumBytes, ARMCC::AL, 0,
+                                  *TII);
+        }
+        // If there's dynamic realignment, adjust for it.
+        if (RI.needsStackRealignment(MF)) {
+          MachineFrameInfo  *MFI = MF.getFrameInfo();
+          unsigned MaxAlign = MFI->getMaxAlignment();
+          assert (!AFI->isThumb1OnlyFunction());
+          // Emit bic r6, r6, MaxAlign
+          unsigned bicOpc = AFI->isThumbFunction() ?
+            ARM::t2BICri : ARM::BICri;
+          AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                              TII->get(bicOpc), ARM::R6)
+                                      .addReg(ARM::R6, RegState::Kill)
+                                      .addImm(MaxAlign-1)));
+        }
+
+      }
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVsrl_flag:
+    case ARM::MOVsra_flag: {
+      // These are just fancy MOVs instructions.
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+                             MI.getOperand(0).getReg())
+                     .addOperand(MI.getOperand(1))
+                     .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
+                                                  ARM_AM::lsr : ARM_AM::asr),
+                                                 1)))
+        .addReg(ARM::CPSR, RegState::Define);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::RRX: {
+      // This encodes as "MOVs Rd, Rm, rrx
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
+                               MI.getOperand(0).getReg())
+                       .addOperand(MI.getOperand(1))
+                       .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
+        .addReg(0);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tTPsoft:
+    case ARM::TPsoft: {
+      MachineInstrBuilder MIB;
+      if (Opcode == ARM::tTPsoft)
+        MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                      TII->get( ARM::tBL))
+              .addImm((unsigned)ARMCC::AL).addReg(0)
+              .addExternalSymbol("__aeabi_read_tp", 0);
+      else
+        MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                      TII->get( ARM::BL))
+              .addExternalSymbol("__aeabi_read_tp", 0);
+
+      MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tLDRpci_pic:
+    case ARM::t2LDRpci_pic: {
+      unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
+        ? ARM::tLDRpci : ARM::t2LDRpci;
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      MachineInstrBuilder MIB1 =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                               TII->get(NewLdOpc), DstReg)
+                       .addOperand(MI.getOperand(1)));
+      MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(ARM::tPICADD))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg)
+        .addOperand(MI.getOperand(2));
+      TransferImpOps(MI, MIB1, MIB2);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::LDRLIT_ga_abs:
+    case ARM::LDRLIT_ga_pcrel:
+    case ARM::LDRLIT_ga_pcrel_ldr:
+    case ARM::tLDRLIT_ga_abs:
+    case ARM::tLDRLIT_ga_pcrel: {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      const MachineOperand &MO1 = MI.getOperand(1);
+      const GlobalValue *GV = MO1.getGlobal();
+      bool IsARM =
+          Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs;
+      bool IsPIC =
+          Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs;
+      unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
+      unsigned PICAddOpc =
+          IsARM
+              ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICADD : ARM::PICLDR)
+              : ARM::tPICADD;
+
+      // We need a new const-pool entry to load from.
+      MachineConstantPool *MCP = MBB.getParent()->getConstantPool();
+      unsigned ARMPCLabelIndex = 0;
+      MachineConstantPoolValue *CPV;
+
+      if (IsPIC) {
+        unsigned PCAdj = IsARM ? 8 : 4;
+        ARMPCLabelIndex = AFI->createPICLabelUId();
+        CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex,
+                                              ARMCP::CPValue, PCAdj);
+      } else
+        CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier);
+
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
+            .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+      if (IsARM)
+        MIB.addImm(0);
+      AddDefaultPred(MIB);
+
+      if (IsPIC) {
+        MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+            .addReg(DstReg)
+            .addImm(ARMPCLabelIndex);
+
+        if (IsARM)
+          AddDefaultPred(MIB);
+      }
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOV_ga_pcrel:
+    case ARM::MOV_ga_pcrel_ldr:
+    case ARM::t2MOV_ga_pcrel: {
+      // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
+      unsigned LabelId = AFI->createPICLabelUId();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      const MachineOperand &MO1 = MI.getOperand(1);
+      const GlobalValue *GV = MO1.getGlobal();
+      unsigned TF = MO1.getTargetFlags();
+      bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
+      unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
+      unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel;
+      unsigned LO16TF = TF | ARMII::MO_LO16;
+      unsigned HI16TF = TF | ARMII::MO_HI16;
+      unsigned PICAddOpc = isARM
+        ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+        : ARM::tPICADD;
+      MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(LO16Opc), DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
+        .addImm(LabelId);
+
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg)
+        .addReg(DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
+        .addImm(LabelId);
+
+      MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(PICAddOpc))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg).addImm(LabelId);
+      if (isARM) {
+        AddDefaultPred(MIB3);
+        if (Opcode == ARM::MOV_ga_pcrel_ldr)
+          MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      }
+      TransferImpOps(MI, MIB1, MIB3);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVi32imm:
+    case ARM::MOVCCi32imm:
+    case ARM::t2MOVi32imm:
+    case ARM::t2MOVCCi32imm:
+      ExpandMOV32BitImm(MBB, MBBI);
+      return true;
+
+    case ARM::SUBS_PC_LR: {
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
+              .addReg(ARM::LR)
+              .addOperand(MI.getOperand(0))
+              .addOperand(MI.getOperand(1))
+              .addOperand(MI.getOperand(2))
+              .addReg(ARM::CPSR, RegState::Undef);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::VLDMQIA: {
+      unsigned NewOpc = ARM::VLDMDIA;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register destination.
+      bool DstIsDead = MI.getOperand(OpIdx).isDead();
+      unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the source register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the destination operands (D subregs).
+      unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+      MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+
+      // Add an implicit def for the super-register.
+      MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+      TransferImpOps(MI, MIB, MIB);
+      MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VSTMQIA: {
+      unsigned NewOpc = ARM::VSTMDIA;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register source.
+      bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+      unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the destination register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the source operands (D subregs).
+      unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+      MIB.addReg(D0).addReg(D1);
+
+      if (SrcIsKill)      // Add an implicit kill for the Q register.
+        MIB->addRegisterKilled(SrcReg, TRI, true);
+
+      TransferImpOps(MI, MIB, MIB);
+      MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD3DUPd8Pseudo:
+    case ARM::VLD3DUPd16Pseudo:
+    case ARM::VLD3DUPd32Pseudo:
+    case ARM::VLD3DUPd8Pseudo_UPD:
+    case ARM::VLD3DUPd16Pseudo_UPD:
+    case ARM::VLD3DUPd32Pseudo_UPD:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+      ExpandVLD(MBBI);
+      return true;
+
+    case ARM::VST2q8Pseudo:
+    case ARM::VST2q16Pseudo:
+    case ARM::VST2q32Pseudo:
+    case ARM::VST2q8PseudoWB_fixed:
+    case ARM::VST2q16PseudoWB_fixed:
+    case ARM::VST2q32PseudoWB_fixed:
+    case ARM::VST2q8PseudoWB_register:
+    case ARM::VST2q16PseudoWB_register:
+    case ARM::VST2q32PseudoWB_register:
+    case ARM::VST3d8Pseudo:
+    case ARM::VST3d16Pseudo:
+    case ARM::VST3d32Pseudo:
+    case ARM::VST1d64TPseudo:
+    case ARM::VST3d8Pseudo_UPD:
+    case ARM::VST3d16Pseudo_UPD:
+    case ARM::VST3d32Pseudo_UPD:
+    case ARM::VST1d64TPseudoWB_fixed:
+    case ARM::VST1d64TPseudoWB_register:
+    case ARM::VST3q8Pseudo_UPD:
+    case ARM::VST3q16Pseudo_UPD:
+    case ARM::VST3q32Pseudo_UPD:
+    case ARM::VST3q8oddPseudo:
+    case ARM::VST3q16oddPseudo:
+    case ARM::VST3q32oddPseudo:
+    case ARM::VST3q8oddPseudo_UPD:
+    case ARM::VST3q16oddPseudo_UPD:
+    case ARM::VST3q32oddPseudo_UPD:
+    case ARM::VST4d8Pseudo:
+    case ARM::VST4d16Pseudo:
+    case ARM::VST4d32Pseudo:
+    case ARM::VST1d64QPseudo:
+    case ARM::VST4d8Pseudo_UPD:
+    case ARM::VST4d16Pseudo_UPD:
+    case ARM::VST4d32Pseudo_UPD:
+    case ARM::VST1d64QPseudoWB_fixed:
+    case ARM::VST1d64QPseudoWB_register:
+    case ARM::VST4q8Pseudo_UPD:
+    case ARM::VST4q16Pseudo_UPD:
+    case ARM::VST4q32Pseudo_UPD:
+    case ARM::VST4q8oddPseudo:
+    case ARM::VST4q16oddPseudo:
+    case ARM::VST4q32oddPseudo:
+    case ARM::VST4q8oddPseudo_UPD:
+    case ARM::VST4q16oddPseudo_UPD:
+    case ARM::VST4q32oddPseudo_UPD:
+      ExpandVST(MBBI);
+      return true;
+
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD3LNd8Pseudo:
+    case ARM::VLD3LNd16Pseudo:
+    case ARM::VLD3LNd32Pseudo:
+    case ARM::VLD3LNq16Pseudo:
+    case ARM::VLD3LNq32Pseudo:
+    case ARM::VLD3LNd8Pseudo_UPD:
+    case ARM::VLD3LNd16Pseudo_UPD:
+    case ARM::VLD3LNd32Pseudo_UPD:
+    case ARM::VLD3LNq16Pseudo_UPD:
+    case ARM::VLD3LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+    case ARM::VST1LNq8Pseudo:
+    case ARM::VST1LNq16Pseudo:
+    case ARM::VST1LNq32Pseudo:
+    case ARM::VST1LNq8Pseudo_UPD:
+    case ARM::VST1LNq16Pseudo_UPD:
+    case ARM::VST1LNq32Pseudo_UPD:
+    case ARM::VST2LNd8Pseudo:
+    case ARM::VST2LNd16Pseudo:
+    case ARM::VST2LNd32Pseudo:
+    case ARM::VST2LNq16Pseudo:
+    case ARM::VST2LNq32Pseudo:
+    case ARM::VST2LNd8Pseudo_UPD:
+    case ARM::VST2LNd16Pseudo_UPD:
+    case ARM::VST2LNd32Pseudo_UPD:
+    case ARM::VST2LNq16Pseudo_UPD:
+    case ARM::VST2LNq32Pseudo_UPD:
+    case ARM::VST3LNd8Pseudo:
+    case ARM::VST3LNd16Pseudo:
+    case ARM::VST3LNd32Pseudo:
+    case ARM::VST3LNq16Pseudo:
+    case ARM::VST3LNq32Pseudo:
+    case ARM::VST3LNd8Pseudo_UPD:
+    case ARM::VST3LNd16Pseudo_UPD:
+    case ARM::VST3LNd32Pseudo_UPD:
+    case ARM::VST3LNq16Pseudo_UPD:
+    case ARM::VST3LNq32Pseudo_UPD:
+    case ARM::VST4LNd8Pseudo:
+    case ARM::VST4LNd16Pseudo:
+    case ARM::VST4LNd32Pseudo:
+    case ARM::VST4LNq16Pseudo:
+    case ARM::VST4LNq32Pseudo:
+    case ARM::VST4LNd8Pseudo_UPD:
+    case ARM::VST4LNd16Pseudo_UPD:
+    case ARM::VST4LNd32Pseudo_UPD:
+    case ARM::VST4LNq16Pseudo_UPD:
+    case ARM::VST4LNq32Pseudo_UPD:
+      ExpandLaneOp(MBBI);
+      return true;
+
+    case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
+    case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
+    case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
+    case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+  }
+}
+
+bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= ExpandMI(MBB, MBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+  STI = &TM.getSubtarget<ARMSubtarget>();
+  AFI = MF.getInfo<ARMFunctionInfo>();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+       ++MFI)
+    Modified |= ExpandMBB(*MFI);
+  if (VerifyARMPseudo)
+    MF.verify(this, "After expanding ARM pseudo instructions.");
+  return Modified;
+}
+
+/// createARMExpandPseudoPass - returns an instance of the pseudo instruction
+/// expansion pass.
+FunctionPass *llvm::createARMExpandPseudoPass() {
+  return new ARMExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFPUName.def b/contrib/llvm/lib/Target/ARM/ARMFPUName.def
new file mode 100644
index 0000000..1fef3b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFPUName.def
@@ -0,0 +1,33 @@
+//===-- ARMFPUName.def - List of the ARM FPU names --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM FPU names.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_FPU_NAME
+#error "You must define ARM_FPU_NAME(NAME, ID) before including ARMFPUName.h"
+#endif
+
+ARM_FPU_NAME("vfp", VFP)
+ARM_FPU_NAME("vfpv2", VFPV2)
+ARM_FPU_NAME("vfpv3", VFPV3)
+ARM_FPU_NAME("vfpv3-d16", VFPV3_D16)
+ARM_FPU_NAME("vfpv4", VFPV4)
+ARM_FPU_NAME("vfpv4-d16", VFPV4_D16)
+ARM_FPU_NAME("fp-armv8", FP_ARMV8)
+ARM_FPU_NAME("neon", NEON)
+ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4)
+ARM_FPU_NAME("neon-fp-armv8", NEON_FP_ARMV8)
+ARM_FPU_NAME("crypto-neon-fp-armv8", CRYPTO_NEON_FP_ARMV8)
+ARM_FPU_NAME("softvfp", SOFTVFP)
+
+#undef ARM_FPU_NAME
diff --git a/contrib/llvm/lib/Target/ARM/ARMFPUName.h b/contrib/llvm/lib/Target/ARM/ARMFPUName.h
new file mode 100644
index 0000000..2a64cce
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFPUName.h
@@ -0,0 +1,26 @@
+//===-- ARMFPUName.h - List of the ARM FPU names ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMFPUNAME_H
+#define ARMFPUNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum FPUKind {
+  INVALID_FPU = 0
+
+#define ARM_FPU_NAME(NAME, ID) , ID
+#include "ARMFPUName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif // ARMFPUNAME_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
new file mode 100644
index 0000000..e2d90cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -0,0 +1,3081 @@
+//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// ARMGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern cl::opt<bool> EnableARMLongCalls;
+
+namespace {
+
+  // All possible address modes, plus some.
+  typedef struct Address {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int Offset;
+
+    // Innocuous defaults for our address.
+    Address()
+     : BaseType(RegBase), Offset(0) {
+       Base.Reg = 0;
+     }
+  } Address;
+
+class ARMFastISel final : public FastISel {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+  Module &M;
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  ARMFunctionInfo *AFI;
+
+  // Convenience variables to avoid some queries.
+  bool isThumb2;
+  LLVMContext *Context;
+
+  public:
+    explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo)
+    : FastISel(funcInfo, libInfo),
+      M(const_cast<Module&>(*funcInfo.Fn->getParent())),
+      TM(funcInfo.MF->getTarget()),
+      TII(*TM.getInstrInfo()),
+      TLI(*TM.getTargetLowering()) {
+      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+      AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
+      isThumb2 = AFI->isThumbFunction();
+      Context = &funcInfo.Fn->getContext();
+    }
+
+    // Code from FastISel.cpp.
+  private:
+    unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            unsigned Op0, bool Op0IsKill);
+    unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             unsigned Op1, bool Op1IsKill);
+    unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
+                              const TargetRegisterClass *RC,
+                              unsigned Op0, bool Op0IsKill,
+                              unsigned Op1, bool Op1IsKill,
+                              unsigned Op2, bool Op2IsKill);
+    unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             uint64_t Imm);
+    unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
+                              const TargetRegisterClass *RC,
+                              unsigned Op0, bool Op0IsKill,
+                              unsigned Op1, bool Op1IsKill,
+                              uint64_t Imm);
+    unsigned FastEmitInst_i(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            uint64_t Imm);
+
+    // Backend specific FastISel code.
+  private:
+    bool TargetSelectInstruction(const Instruction *I) override;
+    unsigned TargetMaterializeConstant(const Constant *C) override;
+    unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+    bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                             const LoadInst *LI) override;
+    bool FastLowerArguments() override;
+  private:
+  #include "ARMGenFastISel.inc"
+
+    // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectIToFP(const Instruction *I, bool isSigned);
+    bool SelectFPToI(const Instruction *I, bool isSigned);
+    bool SelectDiv(const Instruction *I, bool isSigned);
+    bool SelectRem(const Instruction *I, bool isSigned);
+    bool SelectCall(const Instruction *I, const char *IntrMemName);
+    bool SelectIntrinsicCall(const IntrinsicInst &I);
+    bool SelectSelect(const Instruction *I);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
+    bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
+
+    // Utility routines.
+  private:
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt);
+    bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     unsigned Alignment = 0, bool isZExt = true,
+                     bool allocReg = true);
+    bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                      unsigned Alignment = 0);
+    bool ARMComputeAddress(const Value *Obj, Address &Addr);
+    void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3);
+    bool ARMIsMemCpySmall(uint64_t Len);
+    bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+                               unsigned Alignment);
+    unsigned ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+    unsigned ARMMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned ARMMaterializeInt(const Constant *C, MVT VT);
+    unsigned ARMMaterializeGV(const GlobalValue *GV, MVT VT);
+    unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg);
+    unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg);
+    unsigned ARMSelectCallOp(bool UseReg);
+    unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
+
+    const TargetLowering *getTargetLowering() { return TM.getTargetLowering(); }
+
+    // Call handling routines.
+  private:
+    CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
+                                  bool Return,
+                                  bool isVarArg);
+    bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool isVarArg);
+    unsigned getLibcallReg(const Twine &Name);
+    bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool isVarArg);
+    bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
+
+    // OptionalDef handling routines.
+  private:
+    bool isARMNEONPred(const MachineInstr *MI);
+    bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
+    const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
+    void AddLoadStoreOperands(MVT VT, Address &Addr,
+                              const MachineInstrBuilder &MIB,
+                              unsigned Flags, bool useAM3);
+};
+
+} // end anonymous namespace
+
+#include "ARMGenCallingConv.inc"
+
+// DefinesOptionalPredicate - This is different from DefinesPredicate in that
+// we don't care about implicit defs here, just places we'll need to add a
+// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR.
+bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
+  if (!MI->hasOptionalDef())
+    return false;
+
+  // Look to see if our OptionalDef is defining CPSR or CCR.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    if (MO.getReg() == ARM::CPSR)
+      *CPSR = true;
+  }
+  return true;
+}
+
+bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  // If we're a thumb2 or not NEON function we'll be handled via isPredicable.
+  if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
+       AFI->isThumb2Function())
+    return MI->isPredicable();
+
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+    if (MCID.OpInfo[i].isPredicate())
+      return true;
+
+  return false;
+}
+
+// If the machine is predicable go ahead and add the predicate operands, if
+// it needs default CC operands add those.
+// TODO: If we want to support thumb1 then we'll need to deal with optional
+// CPSR defs that need to be added before the remaining operands. See s_cc_out
+// for descriptions why.
+const MachineInstrBuilder &
+ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
+  MachineInstr *MI = &*MIB;
+
+  // Do we use a predicate? or...
+  // Are we NEON in ARM mode and have a predicate operand? If so, I know
+  // we're not predicable but add it anyways.
+  if (isARMNEONPred(MI))
+    AddDefaultPred(MIB);
+
+  // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
+  // defines CPSR. All other OptionalDefines in ARM are the CCR register.
+  bool CPSR = false;
+  if (DefinesOptionalPredicate(MI, &CPSR)) {
+    if (CPSR)
+      AddDefaultT1CC(MIB);
+    else
+      AddDefaultCC(MIB);
+  }
+  return MIB;
+}
+
+unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+                            ResultReg).addReg(Op0, Op0IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                   TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addReg(Op1, Op1IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill,
+                                       unsigned Op2, bool Op2IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+  Op2 = constrainOperandRegClass(II, Op1, 3);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addReg(Op1, Op1IsKill * RegState::Kill)
+            .addReg(Op2, Op2IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill)
+                   .addReg(Op2, Op2IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill,
+                                       uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addReg(Op1, Op1IsKill * RegState::Kill)
+            .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+                            ResultReg).addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+// TODO: Don't worry about 64-bit now, but when this is fixed remove the
+// checks from the various callers.
+unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::f64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VMOVSR), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::i64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VMOVRS), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+// For double width floating point we need to materialize two constants
+// (the high and the low) into integer registers then use a move to get
+// the combined constant into an FP reg.
+unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  const APFloat Val = CFP->getValueAPF();
+  bool is64bit = VT == MVT::f64;
+
+  // This checks to see if we can use VFP3 instructions to materialize
+  // a constant, otherwise we have to go through the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm;
+    unsigned Opc;
+    if (is64bit) {
+      Imm = ARM_AM::getFP64Imm(Val);
+      Opc = ARM::FCONSTD;
+    } else {
+      Imm = ARM_AM::getFP32Imm(Val);
+      Opc = ARM::FCONSTS;
+    }
+    unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), DestReg).addImm(Imm));
+    return DestReg;
+  }
+
+  // Require VFP2 for loading fp constants.
+  if (!Subtarget->hasVFP2()) return false;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = DL.getTypeAllocSize(CFP->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
+
+  // The extra reg is for addrmode5.
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+          .addConstantPoolIndex(Idx)
+          .addReg(0));
+  return DestReg;
+}
+
+unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
+
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return false;
+
+  // If we can do this in a single instruction without a constant pool entry
+  // do so now.
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getZExtValue())) {
+    unsigned Opc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16;
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
+      &ARM::GPRRegClass;
+    unsigned ImmReg = createResultReg(RC);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ImmReg)
+                    .addImm(CI->getZExtValue()));
+    return ImmReg;
+  }
+
+  // Use MVN to emit negative constants.
+  if (VT == MVT::i32 && Subtarget->hasV6T2Ops() && CI->isNegative()) {
+    unsigned Imm = (unsigned)~(CI->getSExtValue());
+    bool UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+      (ARM_AM::getSOImmVal(Imm) != -1);
+    if (UseImm) {
+      unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
+      unsigned ImmReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(Opc), ImmReg)
+                      .addImm(Imm));
+      return ImmReg;
+    }
+  }
+
+  // Load from constant pool.  For now 32-bit only.
+  if (VT != MVT::i32)
+    return false;
+
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = DL.getTypeAllocSize(C->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(C, Align);
+
+  if (isThumb2)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::t2LDRpci), DestReg)
+                    .addConstantPoolIndex(Idx));
+  else {
+    // The extra immediate is for addrmode2.
+    DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::LDRcp), DestReg)
+                    .addConstantPoolIndex(Idx)
+                    .addImm(0));
+  }
+
+  return DestReg;
+}
+
+unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32) return 0;
+
+  Reloc::Model RelocM = TM.getRelocationModel();
+  bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+  const TargetRegisterClass *RC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // FastISel TLS support on non-MachO is broken, punt to SelectionDAG.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0;
+
+  // Use movw+movt when possible, it avoids constant pool entries.
+  // Non-darwin targets only support static movt relocations in FastISel.
+  if (Subtarget->useMovt(*FuncInfo.MF) &&
+      (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) {
+    unsigned Opc;
+    unsigned char TF = 0;
+    if (Subtarget->isTargetMachO())
+      TF = ARMII::MO_NONLAZY;
+
+    switch (RelocM) {
+    case Reloc::PIC_:
+      Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
+      break;
+    default:
+      Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
+      break;
+    }
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
+  } else {
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = DL.getPrefTypeAlignment(GV->getType());
+    if (Align == 0) {
+      // TODO: Figure out if this is correct.
+      Align = DL.getTypeAllocSize(GV->getType());
+    }
+
+    if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_)
+      return ARMLowerPICELF(GV, Align, VT);
+
+    // Grab index.
+    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 :
+      (Subtarget->isThumb() ? 4 : 8);
+    unsigned Id = AFI->createPICLabelUId();
+    ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
+                                                                ARMCP::CPValue,
+                                                                PCAdj);
+    unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+    // Load value.
+    MachineInstrBuilder MIB;
+    if (isThumb2) {
+      unsigned Opc = (RelocM!=Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic;
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                    DestReg).addConstantPoolIndex(Idx);
+      if (RelocM == Reloc::PIC_)
+        MIB.addImm(Id);
+      AddOptionalDefs(MIB);
+    } else {
+      // The extra immediate is for addrmode2.
+      DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::LDRcp), DestReg)
+                .addConstantPoolIndex(Idx)
+                .addImm(0);
+      AddOptionalDefs(MIB);
+
+      if (RelocM == Reloc::PIC_) {
+        unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+        unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+        MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                          DbgLoc, TII.get(Opc), NewDestReg)
+                                  .addReg(DestReg)
+                                  .addImm(Id);
+        AddOptionalDefs(MIB);
+        return NewDestReg;
+      }
+    }
+  }
+
+  if (IsIndirect) {
+    MachineInstrBuilder MIB;
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    if (isThumb2)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::t2LDRi12), NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    else
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::LDRi12), NewDestReg)
+                .addReg(DestReg)
+                .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+
+  return DestReg;
+}
+
+unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return ARMMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return ARMMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return ARMMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
+
+unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  // This will get lowered later into the correct offsets and registers
+  // via rewriteXFrameIndex.
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    const TargetRegisterClass* RC = TLI.getRegClassFor(VT);
+    unsigned ResultReg = createResultReg(RC);
+    ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0);
+
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(SI->second)
+                            .addImm(0));
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple()) return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+// Computes the address to get to an object.
+bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
+  // Some boilerplate from the X86 FastISel.
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+    default:
+    break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return ARMComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      int TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+           i != e; ++i, ++GTI) {
+        const Value *Op = *i;
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = DL.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (ARMComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
+  bool needsLowering = false;
+  switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unhandled load/store type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      if (!useAM3) {
+        // Integer loads/stores handle 12-bit offsets.
+        needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset);
+        // Handle negative offsets.
+        if (needsLowering && isThumb2)
+          needsLowering = !(Subtarget->hasV6T2Ops() && Addr.Offset < 0 &&
+                            Addr.Offset > -256);
+      } else {
+        // ARM halfword load/stores and signed byte loads use +/-imm8 offsets.
+        needsLowering = (Addr.Offset > 255 || Addr.Offset < -255);
+      }
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      // Floating point operands handle 8-bit offsets.
+      needsLowering = ((Addr.Offset & 0xff) != Addr.Offset);
+      break;
+  }
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
+    const TargetRegisterClass *RC = isThumb2 ?
+      (const TargetRegisterClass*)&ARM::tGPRRegClass :
+      (const TargetRegisterClass*)&ARM::GPRRegClass;
+    unsigned ResultReg = createResultReg(RC);
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(Addr.Base.FI)
+                            .addImm(0));
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  // Since the offset is too large for the load/store instruction
+  // get the reg+offset into a register.
+  if (needsLowering) {
+    Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
+                                 /*Op0IsKill*/false, Addr.Offset, MVT::i32);
+    Addr.Offset = 0;
+  }
+}
+
+void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
+                                       const MachineInstrBuilder &MIB,
+                                       unsigned Flags, bool useAM3) {
+  // addrmode5 output depends on the selection dag addressing dividing the
+  // offset by 4 that it then later multiplies. Do this here as well.
+  if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64)
+    Addr.Offset /= 4;
+
+  // Frame base works a bit differently. Handle it separately.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    int FI = Addr.Base.FI;
+    int Offset = Addr.Offset;
+    MachineMemOperand *MMO =
+          FuncInfo.MF->getMachineMemOperand(
+                                  MachinePointerInfo::getFixedStack(FI, Offset),
+                                  Flags,
+                                  MFI.getObjectSize(FI),
+                                  MFI.getObjectAlignment(FI));
+    // Now add the rest of the operands.
+    MIB.addFrameIndex(FI);
+
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
+    if (useAM3) {
+      signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+    MIB.addMemOperand(MMO);
+  } else {
+    // Now add the rest of the operands.
+    MIB.addReg(Addr.Base.Reg);
+
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
+    if (useAM3) {
+      signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+  }
+  AddOptionalDefs(MIB);
+}
+
+bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              unsigned Alignment, bool isZExt, bool allocReg) {
+  unsigned Opc;
+  bool useAM3 = false;
+  bool needVMOV = false;
+  const TargetRegisterClass *RC;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRBi8 : ARM::t2LDRSBi8;
+        else
+          Opc = isZExt ? ARM::t2LDRBi12 : ARM::t2LDRSBi12;
+      } else {
+        if (isZExt) {
+          Opc = ARM::LDRBi12;
+        } else {
+          Opc = ARM::LDRSB;
+          useAM3 = true;
+        }
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
+        else
+          Opc = isZExt ? ARM::t2LDRHi12 : ARM::t2LDRSHi12;
+      } else {
+        Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
+        useAM3 = true;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = ARM::t2LDRi8;
+        else
+          Opc = ARM::t2LDRi12;
+      } else {
+        Opc = ARM::LDRi12;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned loads need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        needVMOV = true;
+        VT = MVT::i32;
+        Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+        RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      } else {
+        Opc = ARM::VLDRS;
+        RC = TLI.getRegClassFor(VT);
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned loads need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+        return false;
+
+      Opc = ARM::VLDRD;
+      RC = TLI.getRegClassFor(VT);
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  if (allocReg)
+    ResultReg = createResultReg(RC);
+  assert (ResultReg > 255 && "Expected an allocated virtual register.");
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
+
+  // If we had an unaligned load of a float we've converted it to an regular
+  // load.  Now we must move from the GRP to the FP register.
+  if (needVMOV) {
+    unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::VMOVSR), MoveReg)
+                    .addReg(ResultReg));
+    ResultReg = MoveReg;
+  }
+  return true;
+}
+
+bool ARMFastISel::SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
+
+  unsigned ResultReg;
+  if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                               unsigned Alignment) {
+  unsigned StrOpc;
+  bool useAM3 = false;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1: {
+      unsigned Res = createResultReg(isThumb2 ?
+        (const TargetRegisterClass*)&ARM::tGPRRegClass :
+        (const TargetRegisterClass*)&ARM::GPRRegClass);
+      unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(Opc), Res)
+                      .addReg(SrcReg).addImm(1));
+      SrcReg = Res;
+    } // Fallthrough here.
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRBi8;
+        else
+          StrOpc = ARM::t2STRBi12;
+      } else {
+        StrOpc = ARM::STRBi12;
+      }
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRHi8;
+        else
+          StrOpc = ARM::t2STRHi12;
+      } else {
+        StrOpc = ARM::STRH;
+        useAM3 = true;
+      }
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRi8;
+        else
+          StrOpc = ARM::t2STRi12;
+      } else {
+        StrOpc = ARM::STRi12;
+      }
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned stores need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(ARM::VMOVRS), MoveReg)
+                        .addReg(SrcReg));
+        SrcReg = MoveReg;
+        VT = MVT::i32;
+        StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12;
+      } else {
+        StrOpc = ARM::VSTRS;
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned stores need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+          return false;
+
+      StrOpc = ARM::VSTRD;
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(StrOpc))
+                            .addReg(SrcReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3);
+  return true;
+}
+
+bool ARMFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0) return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // Needs two compares...
+    case CmpInst::FCMP_ONE:
+    case CmpInst::FCMP_UEQ:
+    default:
+      // AL is our "false" for now. The other two need more compares.
+      return ARMCC::AL;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::FCMP_OEQ:
+      return ARMCC::EQ;
+    case CmpInst::ICMP_SGT:
+    case CmpInst::FCMP_OGT:
+      return ARMCC::GT;
+    case CmpInst::ICMP_SGE:
+    case CmpInst::FCMP_OGE:
+      return ARMCC::GE;
+    case CmpInst::ICMP_UGT:
+    case CmpInst::FCMP_UGT:
+      return ARMCC::HI;
+    case CmpInst::FCMP_OLT:
+      return ARMCC::MI;
+    case CmpInst::ICMP_ULE:
+    case CmpInst::FCMP_OLE:
+      return ARMCC::LS;
+    case CmpInst::FCMP_ORD:
+      return ARMCC::VC;
+    case CmpInst::FCMP_UNO:
+      return ARMCC::VS;
+    case CmpInst::FCMP_UGE:
+      return ARMCC::PL;
+    case CmpInst::ICMP_SLT:
+    case CmpInst::FCMP_ULT:
+      return ARMCC::LT;
+    case CmpInst::ICMP_SLE:
+    case CmpInst::FCMP_ULE:
+      return ARMCC::LE;
+    case CmpInst::FCMP_UNE:
+    case CmpInst::ICMP_NE:
+      return ARMCC::NE;
+    case CmpInst::ICMP_UGE:
+      return ARMCC::HS;
+    case CmpInst::ICMP_ULT:
+      return ARMCC::LO;
+  }
+}
+
+bool ARMFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Simple branch support.
+
+  // If we can, avoid recomputing the compare - redoing it could lead to wonky
+  // behavior.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+
+      // Get the compare predicate.
+      // Try to take advantage of fallthrough opportunities.
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      ARMCC::CondCodes ARMPred = getComparePred(Predicate);
+
+      // We may not handle every CC for now.
+      if (ARMPred == ARMCC::AL) return false;
+
+      // Emit the compare.
+      if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+        return false;
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
+      FastEmitBranch(FBB, DbgLoc);
+      FuncInfo.MBB->addSuccessor(TBB);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
+      unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+      unsigned OpReg = getRegForValue(TI->getOperand(0));
+      OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(TstOpc))
+                      .addReg(OpReg).addImm(1));
+
+      unsigned CCMode = ARMCC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CCMode = ARMCC::EQ;
+      }
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+
+      FastEmitBranch(FBB, DbgLoc);
+      FuncInfo.MBB->addSuccessor(TBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    FastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  unsigned CmpReg = getRegForValue(BI->getCondition());
+  if (CmpReg == 0) return false;
+
+  // We've been divorced from our compare!  Our block was split, and
+  // now our compare lives in a predecessor block.  We musn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register.  Ergo, we test
+  // the one-bit value left in the virtual register.
+  unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+          .addReg(CmpReg)
+          .addImm(1));
+
+  unsigned CCMode = ARMCC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CCMode = ARMCC::EQ;
+  }
+
+  unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+                  .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+  FastEmitBranch(FBB, DbgLoc);
+  FuncInfo.MBB->addSuccessor(TBB);
+  return true;
+}
+
+bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0) return false;
+
+  unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc)).addReg(AddrReg));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
+  return true;
+}
+
+bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                             bool isZExt) {
+  Type *Ty = Src1Value->getType();
+  EVT SrcEVT = TLI.getValueType(Ty, true);
+  if (!SrcEVT.isSimple()) return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  // FIXME: At -O0 we don't have anything that canonicalizes operand order.
+  // Thus, Src1Value may be a ConstantInt, but we're missing it.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+    if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 ||
+        SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue();
+      // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather
+      // then a cmn, because there is no way to represent 2147483648 as a
+      // signed 32-bit int.
+      if (Imm < 0 && Imm != (int)0x80000000) {
+        isNegativeImm = true;
+        Imm = -Imm;
+      }
+      UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+        (ARM_AM::getSOImmVal(Imm) != -1);
+    }
+  } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+    if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+      if (ConstFP->isZero() && !ConstFP->isNegative())
+        UseImm = true;
+  }
+
+  unsigned CmpOpc;
+  bool isICmp = true;
+  bool needsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    // TODO: Verify compares.
+    case MVT::f32:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+      break;
+    case MVT::f64:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      needsExt = true;
+    // Intentional fall-through.
+    case MVT::i32:
+      if (isThumb2) {
+        if (!UseImm)
+          CmpOpc = ARM::t2CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri;
+      } else {
+        if (!UseImm)
+          CmpOpc = ARM::CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri;
+      }
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(Src1Value);
+  if (SrcReg1 == 0) return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(Src2Value);
+    if (SrcReg2 == 0) return false;
+  }
+
+  // We have i1, i8, or i16, we need to either zero extend or sign extend.
+  if (needsExt) {
+    SrcReg1 = ARMEmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+    if (SrcReg1 == 0) return false;
+    if (!UseImm) {
+      SrcReg2 = ARMEmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+      if (SrcReg2 == 0) return false;
+    }
+  }
+
+  const MCInstrDesc &II = TII.get(CmpOpc);
+  SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
+  if (!UseImm) {
+    SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addReg(SrcReg1).addReg(SrcReg2));
+  } else {
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+      .addReg(SrcReg1);
+
+    // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
+    if (isICmp)
+      MIB.addImm(Imm);
+    AddOptionalDefs(MIB);
+  }
+
+  // For floating point we need to move the result to a comparison register
+  // that we can then use for branches.
+  if (Ty->isFloatTy() || Ty->isDoubleTy())
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::FMSTAT)));
+  return true;
+}
+
+bool ARMFastISel::SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  // Get the compare predicate.
+  ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+
+  // We may not handle every CC for now.
+  if (ARMPred == ARMCC::AL) return false;
+
+  // Emit the compare.
+  if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  // Now set a register based on the comparison. Explicitly set the predicates
+  // here.
+  unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+  const TargetRegisterClass *RC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+  Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
+  unsigned ZeroReg = TargetMaterializeConstant(Zero);
+  // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), DestReg)
+          .addReg(ZeroReg).addImm(1)
+          .addImm(ARMPred).addReg(ARM::CPSR);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPExt(const Instruction *I) {
+  // Make sure we have VFP and that we're extending float to double.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isDoubleTy() ||
+      !V->getType()->isFloatTy()) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(&ARM::DPRRegClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VCVTDS), Result)
+                  .addReg(Op));
+  UpdateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
+  // Make sure we have VFP and that we're truncating double to float.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!(I->getType()->isFloatTy() &&
+        V->getType()->isDoubleTy())) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(&ARM::SPRRegClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VCVTSD), Result)
+                  .addReg(Op));
+  UpdateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, DstVT))
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0) return false;
+
+  // Handle sign-extension.
+  if (SrcVT == MVT::i16 || SrcVT == MVT::i8) {
+    SrcReg = ARMEmitIntExt(SrcVT, SrcReg, MVT::i32,
+                                       /*isZExt*/!isSigned);
+    if (SrcReg == 0) return false;
+  }
+
+  // The conversion routine works on fp-reg to fp-reg and the operand above
+  // was an integer, move it to the fp registers if possible.
+  unsigned FP = ARMMoveToFPReg(MVT::f32, SrcReg);
+  if (FP == 0) return false;
+
+  unsigned Opc;
+  if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
+  else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+  else return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg).addReg(FP));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  Type *RetTy = I->getType();
+  if (!isTypeLegal(RetTy, DstVT))
+    return false;
+
+  unsigned Op = getRegForValue(I->getOperand(0));
+  if (Op == 0) return false;
+
+  unsigned Opc;
+  Type *OpTy = I->getOperand(0)->getType();
+  if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
+  else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+  else return false;
+
+  // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg).addReg(Op));
+
+  // This result needs to be in an integer register, but the conversion only
+  // takes place in fp-regs.
+  unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg);
+  if (IntReg == 0) return false;
+
+  UpdateValueMap(I, IntReg);
+  return true;
+}
+
+bool ARMFastISel::SelectSelect(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  // Things need to be register sized for register moves.
+  if (VT != MVT::i32) return false;
+
+  unsigned CondReg = getRegForValue(I->getOperand(0));
+  if (CondReg == 0) return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+
+  // Check to see if we can use an immediate in the conditional move.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) {
+    assert (VT == MVT::i32 && "Expecting an i32.");
+    Imm = (int)ConstInt->getValue().getZExtValue();
+    if (Imm < 0) {
+      isNegativeImm = true;
+      Imm = ~Imm;
+    }
+    UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+      (ARM_AM::getSOImmVal(Imm) != -1);
+  }
+
+  unsigned Op2Reg = 0;
+  if (!UseImm) {
+    Op2Reg = getRegForValue(I->getOperand(2));
+    if (Op2Reg == 0) return false;
+  }
+
+  unsigned CmpOpc = isThumb2 ? ARM::t2CMPri : ARM::CMPri;
+  CondReg = constrainOperandRegClass(TII.get(CmpOpc), CondReg, 0);
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+          .addReg(CondReg)
+          .addImm(0));
+
+  unsigned MovCCOpc;
+  const TargetRegisterClass *RC;
+  if (!UseImm) {
+    RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+    MovCCOpc = isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr;
+  } else {
+    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
+    if (!isNegativeImm)
+      MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+    else
+      MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
+  }
+  unsigned ResultReg = createResultReg(RC);
+  if (!UseImm) {
+    Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+            ResultReg)
+        .addReg(Op2Reg)
+        .addReg(Op1Reg)
+        .addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  } else {
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+            ResultReg)
+        .addReg(Op1Reg)
+        .addImm(Imm)
+        .addImm(ARMCC::EQ)
+        .addReg(ARM::CPSR);
+  }
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  // If we have integer div support we should have selected this automagically.
+  // In case we have a real miss go ahead and return false and we'll pick
+  // it up later.
+  if (Subtarget->hasDivide()) return false;
+
+  // Otherwise emit a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SDIV_I8 : RTLIB::UDIV_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SDIV_I16 : RTLIB::UDIV_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SDIV_I32 : RTLIB::UDIV_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SDIV_I64 : RTLIB::UDIV_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SDIV_I128 : RTLIB::UDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SREM_I16 : RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SREM_I32 : RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SREM_I64 : RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SREM_I128 : RTLIB::UREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT  = TLI.getValueType(I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = isThumb2 ? ARM::t2ADDrr : ARM::ADDrr;
+      break;
+    case ISD::OR:
+      Opc = isThumb2 ? ARM::t2ORRrr : ARM::ORRrr;
+      break;
+    case ISD::SUB:
+      Opc = isThumb2 ? ARM::t2SUBrr : ARM::SUBrr;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // TODO: Often the 2nd operand is an immediate, which can be encoded directly
+  // in the instruction, rather then materializing the value in a register.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
+  SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                  .addReg(SrcReg1).addReg(SrcReg2));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT FPVT = TLI.getValueType(I->getType(), true);
+  if (!FPVT.isSimple()) return false;
+  MVT VT = FPVT.getSimpleVT();
+
+  // We can get here in the case when we want to use NEON for our fp
+  // operations, but can't figure out how to. Just use the vfp instructions
+  // if we have them.
+  // FIXME: It'd be nice to use NEON instructions.
+  Type *Ty = I->getType();
+  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  unsigned Opc;
+  bool is64bit = VT == MVT::f64 || VT == MVT::i64;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::FADD:
+      Opc = is64bit ? ARM::VADDD : ARM::VADDS;
+      break;
+    case ISD::FSUB:
+      Opc = is64bit ? ARM::VSUBD : ARM::VSUBS;
+      break;
+    case ISD::FMUL:
+      Opc = is64bit ? ARM::VMULD : ARM::VMULS;
+      break;
+  }
+  unsigned Op1 = getRegForValue(I->getOperand(0));
+  if (Op1 == 0) return false;
+
+  unsigned Op2 = getRegForValue(I->getOperand(1));
+  if (Op2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                  .addReg(Op1).addReg(Op2));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Call Handling Code
+
+// This is largely taken directly from CCAssignFnForNode
+// TODO: We may not support all of this.
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
+                                           bool Return,
+                                           bool isVarArg) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+    if (Subtarget->hasVFP2() && !isVarArg) {
+      if (!Subtarget->isAAPCS_ABI())
+        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+      // For AAPCS ABI targets, just use VFP variant of the calling convention.
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    }
+    // Fallthrough
+  case CallingConv::C:
+    // Use target triple & subtarget features to do actual dispatch.
+    if (Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() &&
+          TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
+        return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+      else
+        return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+    } else
+        return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::ARM_AAPCS_VFP:
+    if (!isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+    // Fall through to soft float variant, variadic functions don't
+    // use hard floating point ABI.
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::GHC:
+    if (Return)
+      llvm_unreachable("Can't return in GHC call convention");
+    else
+      return CC_ARM_APCS_GHC;
+  }
+}
+
+bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool isVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+                             CCAssignFnForCall(CC, false, isVarArg));
+
+  // Check that we can handle all of the arguments. If we can't, then bail out
+  // now before we add code to the MBB.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // We don't handle NEON/vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      continue;
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      if (VA.getLocVT() != MVT::f64 ||
+          // TODO: Only handle register args for now.
+          !VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
+        return false;
+    } else {
+      switch (ArgVT.SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32:
+        break;
+      case MVT::f32:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      case MVT::f64:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      }
+    }
+  }
+
+  // At the point, we are able to handle the call's arguments in fast isel.
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(AdjStackDown))
+                  .addImm(NumBytes));
+
+  // Process the args.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) &&
+           "We don't handle NEON/vector parameters yet.");
+
+    // Handle arg promotion, etc.
+    switch (VA.getLocInfo()) {
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false);
+        assert (Arg != 0 && "Failed to emit a sext");
+        ArgVT = DestVT;
+        break;
+      }
+      case CCValAssign::AExt:
+        // Intentional fall-through.  Handle AExt and ZExt.
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
+        assert (Arg != 0 && "Failed to emit a zext");
+        ArgVT = DestVT;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        unsigned BC = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
+                                 /*TODO: Kill=*/false);
+        assert(BC != 0 && "Failed to emit a bitcast!");
+        Arg = BC;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      default: llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      assert(VA.getLocVT() == MVT::f64 &&
+             "Custom lowering for v2f64 args not available");
+
+      CCValAssign &NextVA = ArgLocs[++i];
+
+      assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+             "We only handle register args!");
+
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(ARM::VMOVRRD), VA.getLocReg())
+                      .addReg(NextVA.getLocReg(), RegState::Define)
+                      .addReg(Arg));
+      RegArgs.push_back(VA.getLocReg());
+      RegArgs.push_back(NextVA.getLocReg());
+    } else {
+      assert(VA.isMemLoc());
+      // Need to store on the stack.
+      Address Addr;
+      Addr.BaseType = Address::RegBase;
+      Addr.Base.Reg = ARM::SP;
+      Addr.Offset = VA.getLocMemOffset();
+
+      bool EmitRet = ARMEmitStore(ArgVT, Arg, Addr); (void)EmitRet;
+      assert(EmitRet && "Could not emit a store for argument!");
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool isVarArg) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(AdjStackUp))
+                  .addImm(NumBytes).addImm(0));
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+
+    // Copy all of the result registers out of their specified physreg.
+    if (RVLocs.size() == 2 && RetVT == MVT::f64) {
+      // For this move we copy into two registers and then move into the
+      // double fp reg we want.
+      MVT DestVT = RVLocs[0].getValVT();
+      const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
+      unsigned ResultReg = createResultReg(DstRC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(ARM::VMOVDRR), ResultReg)
+                      .addReg(RVLocs[0].getLocReg())
+                      .addReg(RVLocs[1].getLocReg()));
+
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[1].getLocReg());
+
+      // Finally update the result.
+      UpdateValueMap(I, ResultReg);
+    } else {
+      assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!");
+      MVT CopyVT = RVLocs[0].getValVT();
+
+      // Special handling for extended integers.
+      if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
+        CopyVT = MVT::i32;
+
+      const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+
+      unsigned ResultReg = createResultReg(DstRC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY),
+              ResultReg).addReg(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+
+      // Finally update the result.
+      UpdateValueMap(I, ResultReg);
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext());
+    CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */,
+                                                 F.isVarArg()));
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    EVT RVEVT = TLI.getValueType(RV->getType());
+    if (!RVEVT.isSimple()) return false;
+    MVT RVVT = RVEVT.getSimpleVT();
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      assert(DestVT == MVT::i32 && "ARM should always ext to i32");
+
+      // Perform extension if flagged as either zext or sext.  Otherwise, do
+      // nothing.
+      if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) {
+        SrcReg = ARMEmitIntExt(RVVT, SrcReg, DestVT, Outs[0].Flags.isZExt());
+        if (SrcReg == 0) return false;
+      }
+    }
+
+    // Make the copy.
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET;
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(RetOpc));
+  AddOptionalDefs(MIB);
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
+  if (UseReg)
+    return isThumb2 ? ARM::tBLXr : ARM::BLX;
+  else
+    return isThumb2 ? ARM::tBL : ARM::BL;
+}
+
+unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+  // Manually compute the global's type to avoid building it when unnecessary.
+  Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0);
+  EVT LCREVT = TLI.getValueType(GVTy);
+  if (!LCREVT.isSimple()) return 0;
+
+  GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
+                                       GlobalValue::ExternalLinkage, nullptr,
+                                       Name);
+  assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
+  return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
+}
+
+// A quick function that will emit a call for a named libcall in F with the
+// vector of passed arguments for the Instruction in I. We can assume that we
+// can emit a call for any libcall we can produce. This is an abridged version
+// of the full call infrastructure since we won't need to worry about things
+// like computed function pointers or strange arguments at call sites.
+// TODO: Try to unify this and the normal call bits for ARM, then try to unify
+// with X86.
+bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
+  CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
+
+  // Handle *simple* calls for now.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Can't handle non-double multi-reg retvals.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
+    if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+      return false;
+  }
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(I->getNumOperands());
+  ArgRegs.reserve(I->getNumOperands());
+  ArgVTs.reserve(I->getNumOperands());
+  ArgFlags.reserve(I->getNumOperands());
+  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+    Value *Op = I->getOperand(i);
+    unsigned Arg = getRegForValue(Op);
+    if (Arg == 0) return false;
+
+    Type *ArgTy = Op->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT)) return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(Op);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, false))
+    return false;
+
+  unsigned CalleeReg = 0;
+  if (EnableARMLongCalls) {
+    CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
+    if (CalleeReg == 0) return false;
+  }
+
+  // Issue the call.
+  unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DbgLoc, TII.get(CallOpc));
+  // BL / BLX don't take a predicate, but tBL / tBLX do.
+  if (isThumb2)
+    AddDefaultPred(MIB);
+  if (EnableARMLongCalls)
+    MIB.addReg(CalleeReg);
+  else
+    MIB.addExternalSymbol(TLI.getLibcallName(Call));
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::SelectCall(const Instruction *I,
+                             const char *IntrMemName = nullptr) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee)) return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall()) return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // TODO: Avoid some calling conventions?
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool isVarArg = FTy->isVarArg();
+
+  // Handle *simple* calls for now.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8  && RetVT != MVT::i1)
+    return false;
+
+  // Can't handle non-double multi-reg retvals.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 &&
+      RetVT != MVT::i16 && RetVT != MVT::i32) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+    if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+      return false;
+  }
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  unsigned arg_size = CS.arg_size();
+  Args.reserve(arg_size);
+  ArgRegs.reserve(arg_size);
+  ArgVTs.reserve(arg_size);
+  ArgFlags.reserve(arg_size);
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e-i <= 2)
+      break;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+        CS.paramHasAttr(AttrInd, Attribute::ByVal))
+      return false;
+
+    Type *ArgTy = (*i)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 &&
+        ArgVT != MVT::i1)
+      return false;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*i);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, isVarArg))
+    return false;
+
+  bool UseReg = false;
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV || EnableARMLongCalls) UseReg = true;
+
+  unsigned CalleeReg = 0;
+  if (UseReg) {
+    if (IntrMemName)
+      CalleeReg = getLibcallReg(IntrMemName);
+    else
+      CalleeReg = getRegForValue(Callee);
+
+    if (CalleeReg == 0) return false;
+  }
+
+  // Issue the call.
+  unsigned CallOpc = ARMSelectCallOp(UseReg);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DbgLoc, TII.get(CallOpc));
+
+  unsigned char OpFlags = 0;
+
+  // Add MO_PLT for global address or external symbol in the PIC relocation
+  // model.
+  if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_)
+    OpFlags = ARMII::MO_PLT;
+
+  // ARM calls don't take a predicate, but tBL / tBLX do.
+  if(isThumb2)
+    AddDefaultPred(MIB);
+  if (UseReg)
+    MIB.addReg(CalleeReg);
+  else if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, OpFlags);
+  else
+    MIB.addExternalSymbol(IntrMemName, OpFlags);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
+    return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) {
+  return Len <= 16;
+}
+
+bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
+                                        uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!ARMIsMemCpySmall(Len))
+    return false;
+
+  while (Len) {
+    MVT VT;
+    if (!Alignment || Alignment >= 4) {
+      if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+      else {
+        assert (Len == 1 && "Expected a length of 1!");
+        VT = MVT::i8;
+      }
+    } else {
+      // Bound based on alignment.
+      if (Len >= 2 && Alignment == 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = ARMEmitLoad(VT, ResultReg, Src);
+    assert (RV == true && "Should be able to handle this load.");
+    RV = ARMEmitStore(VT, ResultReg, Dest);
+    assert (RV == true && "Should be able to handle this store.");
+    (void)RV;
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    Dest.Offset += Size;
+    Src.Offset += Size;
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+    MFI->setFrameAddressIsTaken(true);
+
+    unsigned LdrOpc;
+    const TargetRegisterClass *RC;
+    if (isThumb2) {
+      LdrOpc =  ARM::t2LDRi12;
+      RC = (const TargetRegisterClass*)&ARM::tGPRRegClass;
+    } else {
+      LdrOpc =  ARM::LDRi12;
+      RC = (const TargetRegisterClass*)&ARM::GPRRegClass;
+    }
+
+    const ARMBaseRegisterInfo *RegInfo =
+          static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
+    unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    unsigned SrcReg = FramePtr;
+
+    // Recursively load frame address
+    // ldr r0 [fp]
+    // ldr r0 [r0]
+    // ldr r0 [r0]
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = createResultReg(RC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(LdrOpc), DestReg)
+                      .addReg(SrcReg).addImm(0));
+      SrcReg = DestReg;
+    }
+    UpdateValueMap(&I, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    // Don't handle volatile.
+    if (MTI.isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      if (ARMIsMemCpySmall(Len)) {
+        Address Dest, Src;
+        if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
+            !ARMComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        unsigned Alignment = MTI.getAlignment();
+        if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI.getLength()->getType()->isIntegerTy(32))
+      return false;
+
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+
+    if (!MSI.getLength()->getType()->isIntegerTy(32))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+
+    return SelectCall(&I, "memset");
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(
+      Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP));
+    return true;
+  }
+  }
+}
+
+bool ARMFastISel::SelectTrunc(const Instruction *I) {
+  // The high bits for a type smaller than the register size are assumed to be
+  // undefined.
+  Value *Op = I->getOperand(0);
+
+  EVT SrcVT, DestVT;
+  SrcVT = TLI.getValueType(Op->getType(), true);
+  DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg) return false;
+
+  // Because the high bits are undefined, a truncate doesn't generate
+  // any code.
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                    bool isZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return 0;
+  if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1)
+    return 0;
+
+  // Table of which combinations can be emitted as a single instruction,
+  // and which will require two.
+  static const uint8_t isSingleInstrTbl[3][2][2][2] = {
+    //            ARM                     Thumb
+    //           !hasV6Ops  hasV6Ops     !hasV6Ops  hasV6Ops
+    //    ext:     s  z      s  z          s  z      s  z
+    /*  1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } },
+    /*  8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } },
+    /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }
+  };
+
+  // Target registers for:
+  //  - For ARM can never be PC.
+  //  - For 16-bit Thumb are restricted to lower 8 registers.
+  //  - For 32-bit Thumb are restricted to non-SP and non-PC.
+  static const TargetRegisterClass *RCTbl[2][2] = {
+    // Instructions: Two                     Single
+    /* ARM      */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass },
+    /* Thumb    */ { &ARM::tGPRRegClass,    &ARM::rGPRRegClass    }
+  };
+
+  // Table governing the instruction(s) to be emitted.
+  static const struct InstructionTable {
+    uint32_t Opc   : 16;
+    uint32_t hasS  :  1; // Some instructions have an S bit, always set it to 0.
+    uint32_t Shift :  7; // For shift operand addressing mode, used by MOVsi.
+    uint32_t Imm   :  8; // All instructions have either a shift or a mask.
+  } IT[2][2][3][2] = {
+    { // Two instructions (first is left shift, second is in this table).
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  31 },
+        /*  1 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  31 } },
+        /*  8 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  24 },
+        /*  8 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  24 } },
+        /* 16 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  16 },
+        /* 16 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  16 } }
+      },
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  31 },
+        /*  1 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  31 } },
+        /*  8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  24 },
+        /*  8 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  24 } },
+        /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  16 },
+        /* 16 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  16 } }
+      }
+    },
+    { // Single instruction.
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::SXTB   , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::SXTH   , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::UXTH   , 0, ARM_AM::no_shift,   0 } }
+      },
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::t2UXTH , 0, ARM_AM::no_shift,   0 } }
+      }
+    }
+  };
+
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  unsigned DestBits = DestVT.getSizeInBits();
+  (void) DestBits;
+  assert((SrcBits < DestBits) && "can only extend to larger types");
+  assert((DestBits == 32 || DestBits == 16 || DestBits == 8) &&
+         "other sizes unimplemented");
+  assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) &&
+         "other sizes unimplemented");
+
+  bool hasV6Ops = Subtarget->hasV6Ops();
+  unsigned Bitness = SrcBits / 8;  // {1,8,16}=>{0,1,2}
+  assert((Bitness < 3) && "sanity-check table bounds");
+
+  bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt];
+  const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr];
+  const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt];
+  unsigned Opc = ITP->Opc;
+  assert(ARM::KILL != Opc && "Invalid table entry");
+  unsigned hasS = ITP->hasS;
+  ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift;
+  assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) &&
+         "only MOVsi has shift operand addressing mode");
+  unsigned Imm = ITP->Imm;
+
+  // 16-bit Thumb instructions always set CPSR (unless they're in an IT block).
+  bool setsCPSR = &ARM::tGPRRegClass == RC;
+  unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi;
+  unsigned ResultReg;
+  // MOVsi encodes shift and immediate in shift operand addressing mode.
+  // The following condition has the same value when emitting two
+  // instruction sequences: both are shifts.
+  bool ImmIsSO = (Shift != ARM_AM::no_shift);
+
+  // Either one or two instructions are emitted.
+  // They're always of the form:
+  //   dst = in OP imm
+  // CPSR is set only by 16-bit Thumb instructions.
+  // Predicate, if any, is AL.
+  // S bit, if available, is always 0.
+  // When two are emitted the first's result will feed as the second's input,
+  // that value is then dead.
+  unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2;
+  for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) {
+    ResultReg = createResultReg(RC);
+    bool isLsl = (0 == Instr) && !isSingleInstr;
+    unsigned Opcode = isLsl ? LSLOpc : Opc;
+    ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift;
+    unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm;
+    bool isKill = 1 == Instr;
+    MachineInstrBuilder MIB = BuildMI(
+        *FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode), ResultReg);
+    if (setsCPSR)
+      MIB.addReg(ARM::CPSR, RegState::Define);
+    SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
+    AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
+    if (hasS)
+      AddDefaultCC(MIB);
+    // Second instruction consumes the first's result.
+    SrcReg = ResultReg;
+  }
+
+  return ResultReg;
+}
+
+bool ARMFastISel::SelectIntExt(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool isZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(SrcTy, true);
+  DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple()) return false;
+  if (!DestEVT.isSimple()) return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+  unsigned ResultReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+  if (ResultReg == 0) return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectShift(const Instruction *I,
+                              ARM_AM::ShiftOpc ShiftTy) {
+  // We handle thumb2 mode by target independent selector
+  // or SelectionDAG ISel.
+  if (isThumb2)
+    return false;
+
+  // Only handle i32 now.
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+  if (DestVT != MVT::i32)
+    return false;
+
+  unsigned Opc = ARM::MOVsr;
+  unsigned ShiftImm;
+  Value *Src2Value = I->getOperand(1);
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Src2Value)) {
+    ShiftImm = CI->getZExtValue();
+
+    // Fall back to selection DAG isel if the shift amount
+    // is zero or greater than the width of the value type.
+    if (ShiftImm == 0 || ShiftImm >=32)
+      return false;
+
+    Opc = ARM::MOVsi;
+  }
+
+  Value *Src1Value = I->getOperand(0);
+  unsigned Reg1 = getRegForValue(Src1Value);
+  if (Reg1 == 0) return false;
+
+  unsigned Reg2 = 0;
+  if (Opc == ARM::MOVsr) {
+    Reg2 = getRegForValue(Src2Value);
+    if (Reg2 == 0) return false;
+  }
+
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  if(ResultReg == 0) return false;
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg)
+                            .addReg(Reg1);
+
+  if (Opc == ARM::MOVsi)
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, ShiftImm));
+  else if (Opc == ARM::MOVsr) {
+    MIB.addReg(Reg2);
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, 0));
+  }
+
+  AddOptionalDefs(MIB);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// TODO: SoftFP support.
+bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+      return SelectCmp(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*isSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*isSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*isSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*isSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::FAdd:
+      return SelectBinaryFPOp(I, ISD::FADD);
+    case Instruction::FSub:
+      return SelectBinaryFPOp(I, ISD::FSUB);
+    case Instruction::FMul:
+      return SelectBinaryFPOp(I, ISD::FMUL);
+    case Instruction::SDiv:
+      return SelectDiv(I, /*isSigned*/ true);
+    case Instruction::UDiv:
+      return SelectDiv(I, /*isSigned*/ false);
+    case Instruction::SRem:
+      return SelectRem(I, /*isSigned*/ true);
+    case Instruction::URem:
+      return SelectRem(I, /*isSigned*/ false);
+    case Instruction::Call:
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+        return SelectIntrinsicCall(*II);
+      return SelectCall(I);
+    case Instruction::Select:
+      return SelectSelect(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    case Instruction::Shl:
+      return SelectShift(I, ARM_AM::lsl);
+    case Instruction::LShr:
+      return SelectShift(I, ARM_AM::lsr);
+    case Instruction::AShr:
+      return SelectShift(I, ARM_AM::asr);
+    default: break;
+  }
+  return false;
+}
+
+namespace {
+// This table describes sign- and zero-extend instructions which can be
+// folded into a preceding load. All of these extends have an immediate
+// (sometimes a mask and sometimes a shift) that's applied after
+// extension.
+const struct FoldableLoadExtendsStruct {
+  uint16_t Opc[2];  // ARM, Thumb.
+  uint8_t ExpectedImm;
+  uint8_t isZExt     : 1;
+  uint8_t ExpectedVT : 7;
+} FoldableLoadExtends[] = {
+  { { ARM::SXTH,  ARM::t2SXTH  },   0, 0, MVT::i16 },
+  { { ARM::UXTH,  ARM::t2UXTH  },   0, 1, MVT::i16 },
+  { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8  },
+  { { ARM::SXTB,  ARM::t2SXTB  },   0, 0, MVT::i8  },
+  { { ARM::UXTB,  ARM::t2UXTB  },   0, 1, MVT::i8  }
+};
+}
+
+/// \brief The specified machine instr operand is a vreg, and that
+/// vreg is being provided by the specified load instruction.  If possible,
+/// try to fold the load as an operand to the instruction, returning true if
+/// successful.
+bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  // ldrb r1, [r0]       ldrb r1, [r0]
+  // uxtb r2, r1     =>
+  // mov  r3, r2         mov  r3, r1
+  if (MI->getNumOperands() < 3 || !MI->getOperand(2).isImm())
+    return false;
+  const uint64_t Imm = MI->getOperand(2).getImm();
+
+  bool Found = false;
+  bool isZExt;
+  for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends);
+       i != e; ++i) {
+    if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() &&
+        (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm &&
+        MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) {
+      Found = true;
+      isZExt = FoldableLoadExtends[i].isZExt;
+    }
+  }
+  if (!Found) return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+  if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
+    return false;
+  MI->eraseFromParent();
+  return true;
+}
+
+unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
+                                     unsigned Align, MVT VT) {
+  bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+  ARMConstantPoolConstant *CPV =
+    ARMConstantPoolConstant::Create(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+  unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+  unsigned Opc;
+  unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
+  // Load value.
+  if (isThumb2) {
+    DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::t2LDRpci), DestReg1)
+                    .addConstantPoolIndex(Idx));
+    Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
+  } else {
+    // The extra immediate is for addrmode2.
+    DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                            DbgLoc, TII.get(ARM::LDRcp), DestReg1)
+                    .addConstantPoolIndex(Idx).addImm(0));
+    Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs;
+  }
+
+  unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+  if (GlobalBaseReg == 0) {
+    GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT));
+    AFI->setGlobalBaseReg(GlobalBaseReg);
+  }
+
+  unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
+  DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0);
+  DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1);
+  GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DbgLoc, TII.get(Opc), DestReg2)
+                            .addReg(DestReg1)
+                            .addReg(GlobalBaseReg);
+  if (!UseGOTOFF)
+    MIB.addImm(0);
+  AddOptionalDefs(MIB);
+
+  return DestReg2;
+}
+
+bool ARMFastISel::FastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  switch (CC) {
+  default:
+    return false;
+  case CallingConv::Fast:
+  case CallingConv::C:
+  case CallingConv::ARM_AAPCS_VFP:
+  case CallingConv::ARM_AAPCS:
+  case CallingConv::ARM_APCS:
+    break;
+  }
+
+  // Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments
+  // which are passed in r0 - r3.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (Idx > 4)
+      return false;
+
+    if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::ByVal))
+      return false;
+
+    Type *ArgTy = I->getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      break;
+    default:
+      return false;
+    }
+  }
+
+
+  static const uint16_t GPRArgRegs[] = {
+    ARM::R0, ARM::R1, ARM::R2, ARM::R3
+  };
+
+  const TargetRegisterClass *RC = &ARM::rGPRRegClass;
+  Idx = 0;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    unsigned SrcReg = GPRArgRegs[Idx];
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(DstReg, getKillRegState(true));
+    UpdateValueMap(I, ResultReg);
+  }
+
+  return true;
+}
+
+namespace llvm {
+  FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    const TargetMachine &TM = funcInfo.MF->getTarget();
+
+    const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
+    bool UseFastISel = false;
+    UseFastISel |= Subtarget->isTargetMachO() && !Subtarget->isThumb1Only();
+    UseFastISel |= Subtarget->isTargetLinux() && !Subtarget->isThumb();
+    UseFastISel |= Subtarget->isTargetNaCl() && !Subtarget->isThumb();
+
+    if (UseFastISel) {
+      // iOS always has a FP for backtracking, force other targets
+      // to keep their FP when doing FastISel. The emitted code is
+      // currently superior, and in cases like test-suite's lencod
+      // FastISel isn't quite correct when FP is eliminated.
+      TM.Options.NoFramePointerElim = true;
+      return new ARMFastISel(funcInfo, libInfo);
+    }
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
new file mode 100644
index 0000000..e191a3c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
@@ -0,0 +1,97 @@
+//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the code shared between ARM CodeGen and ARM MC
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_FEATURES_H
+#define TARGET_ARM_FEATURES_H
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+
+namespace llvm {
+
+template<typename InstrType> // could be MachineInstr or MCInst
+bool IsCPSRDead(InstrType *Instr);
+
+template<typename InstrType> // could be MachineInstr or MCInst
+inline bool isV8EligibleForIT(InstrType *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case ARM::tADC:
+  case ARM::tADDi3:
+  case ARM::tADDi8:
+  case ARM::tADDrr:
+  case ARM::tAND:
+  case ARM::tASRri:
+  case ARM::tASRrr:
+  case ARM::tBIC:
+  case ARM::tEOR:
+  case ARM::tLSLri:
+  case ARM::tLSLrr:
+  case ARM::tLSRri:
+  case ARM::tLSRrr:
+  case ARM::tMOVi8:
+  case ARM::tMUL:
+  case ARM::tMVN:
+  case ARM::tORR:
+  case ARM::tROR:
+  case ARM::tRSB:
+  case ARM::tSBC:
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+  case ARM::tSUBrr:
+    // Outside of an IT block, these set CPSR.
+    return IsCPSRDead(Instr);
+  case ARM::tADDrSPi:
+  case ARM::tCMNz:
+  case ARM::tCMPi8:
+  case ARM::tCMPr:
+  case ARM::tLDRBi:
+  case ARM::tLDRBr:
+  case ARM::tLDRHi:
+  case ARM::tLDRHr:
+  case ARM::tLDRSB:
+  case ARM::tLDRSH:
+  case ARM::tLDRi:
+  case ARM::tLDRr:
+  case ARM::tLDRspi:
+  case ARM::tSTRBi:
+  case ARM::tSTRBr:
+  case ARM::tSTRHi:
+  case ARM::tSTRHr:
+  case ARM::tSTRi:
+  case ARM::tSTRr:
+  case ARM::tSTRspi:
+  case ARM::tTST:
+    return true;
+// there are some "conditionally deprecated" opcodes
+  case ARM::tADDspr:
+  case ARM::tBLXr:
+    return Instr->getOperand(2).getReg() != ARM::PC;
+  // ADD PC, SP and BLX PC were always unpredictable,
+  // now on top of it they're deprecated
+  case ARM::tADDrSP:
+  case ARM::tBX:
+    return Instr->getOperand(0).getReg() != ARM::PC;
+  case ARM::tADDhirr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(2).getReg() != ARM::PC;
+  case ARM::tCMPhir:
+  case ARM::tMOVr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(1).getReg() != ARM::PC;
+  }
+}
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
new file mode 100644
index 0000000..a67b360
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -0,0 +1,2039 @@
+//===-- ARMFrameLowering.cpp - ARM Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
+                     cl::desc("Align ARM NEON spills in prolog and epilog"));
+
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs);
+
+ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
+    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+      STI(sti) {}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+
+  // iOS requires FP not to be clobbered for backtracing purpose.
+  if (STI.isTargetIOS())
+    return true;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Always eliminate non-leaf frame pointers.
+  return ((MF.getTarget().Options.DisableFramePointerElim(MF) &&
+           MFI->hasCalls()) ||
+          RegInfo->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Unlike most targets, having a FP
+/// is not sufficient here since we still may reference some objects via SP
+/// even when FP is available in Thumb2 mode.
+bool
+ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static bool isCSRestore(MachineInstr *MI,
+                        const ARMBaseInstrInfo &TII,
+                        const MCPhysReg *CSRegs) {
+  // Integer spill area is handled with "pop".
+  if (isPopOpcode(MI->getOpcode())) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  if ((MI->getOpcode() == ARM::LDR_POST_IMM ||
+       MI->getOpcode() == ARM::LDR_POST_REG ||
+       MI->getOpcode() == ARM::t2LDR_POST) &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
+      MI->getOperand(1).getReg() == ARM::SP)
+    return true;
+
+  return false;
+}
+
+static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                                 const ARMBaseInstrInfo &TII, unsigned DestReg,
+                                 unsigned SrcReg, int NumBytes,
+                                 unsigned MIFlags = MachineInstr::NoFlags,
+                                 ARMCC::CondCodes Pred = ARMCC::AL,
+                                 unsigned PredReg = 0) {
+  if (isARM)
+    emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
+                            Pred, PredReg, TII, MIFlags);
+  else
+    emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
+                           Pred, PredReg, TII, MIFlags);
+}
+
+static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                         const ARMBaseInstrInfo &TII, int NumBytes,
+                         unsigned MIFlags = MachineInstr::NoFlags,
+                         ARMCC::CondCodes Pred = ARMCC::AL,
+                         unsigned PredReg = 0) {
+  emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes,
+                       MIFlags, Pred, PredReg);
+}
+
+static int sizeOfSPAdjustment(const MachineInstr *MI) {
+  assert(MI->getOpcode() == ARM::VSTMDDB_UPD);
+  int count = 0;
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4.
+  for (int i = MI->getNumOperands() - 1; i >= 4; --i)
+    count += 8;
+  return count;
+}
+
+static bool WindowsRequiresStackProbe(const MachineFunction &MF,
+                                      size_t StackSizeInBytes) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MFI->getStackProtectorIndex() > 0)
+    return StackSizeInBytes >= 4080;
+  return StackSizeInBytes >= 4096;
+}
+
+void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineModuleInfo &MMI = MF.getMMI();
+  MCContext &Context = MMI.getContext();
+  const TargetMachine &TM = MF.getTarget();
+  const MCRegisterInfo *MRI = Context.getRegisterInfo();
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitPrologue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+  unsigned Align = TM.getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  int CFAOffset = 0;
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+  int D8SpillFI = 0;
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  // Allocate the vararg register save area.
+  if (ArgRegsSaveSize) {
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
+                 MachineInstr::FrameSetup);
+    CFAOffset -= ArgRegsSaveSize;
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+
+  if (!AFI->hasStackFrame() &&
+      (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) {
+    if (NumBytes - ArgRegsSaveSize != 0) {
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
+                   MachineInstr::FrameSetup);
+      CFAOffset -= NumBytes - ArgRegsSaveSize;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+    return;
+  }
+
+  // Determine spill area sizes.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12:
+      if (STI.isTargetDarwin()) {
+        GPRCS2Size += 4;
+        break;
+      }
+      // fallthrough
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      GPRCS1Size += 4;
+      break;
+    default:
+      // This is a DPR. Exclude the aligned DPRCS2 spills.
+      if (Reg == ARM::D8)
+        D8SpillFI = FI;
+      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
+        DPRCSSize += 8;
+    }
+  }
+
+  // Move past area 1.
+  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push,
+      DPRCSPush;
+  if (GPRCS1Size > 0)
+    GPRCS1Push = LastPush = MBBI++;
+
+  // Determine starting offsets of spill areas.
+  bool HasFP = hasFP(MF);
+  unsigned DPRCSOffset  = NumBytes - (ArgRegsSaveSize + GPRCS1Size
+                                      + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  int FramePtrOffsetInPush = 0;
+  if (HasFP) {
+    FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI)
+                           + GPRCS1Size + ArgRegsSaveSize;
+    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  }
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  // Move past area 2.
+  if (GPRCS2Size > 0)
+    GPRCS2Push = LastPush = MBBI++;
+
+  // Move past area 3.
+  if (DPRCSSize > 0) {
+    DPRCSPush = MBBI;
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
+      LastPush = MBBI++;
+  }
+
+  // Move past the aligned DPRCS2 area.
+  if (AFI->getNumAlignedDPRCS2Regs() > 0) {
+    MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
+    // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
+    // leaves the stack pointer pointing to the DPRCS2 area.
+    //
+    // Adjust NumBytes to represent the stack slots below the DPRCS2 area.
+    NumBytes += MFI->getObjectOffset(D8SpillFI);
+  } else
+    NumBytes = DPRCSOffset;
+
+  if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) {
+    uint32_t NumWords = NumBytes >> 2;
+
+    if (NumWords < 65536)
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
+                     .addImm(NumWords)
+                     .setMIFlags(MachineInstr::FrameSetup));
+    else
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
+        .addImm(NumWords)
+        .setMIFlags(MachineInstr::FrameSetup);
+
+    switch (TM.getCodeModel()) {
+    case CodeModel::Small:
+    case CodeModel::Medium:
+    case CodeModel::Default:
+    case CodeModel::Kernel:
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addExternalSymbol("__chkstk")
+        .addReg(ARM::R4, RegState::Implicit)
+        .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    case CodeModel::Large:
+    case CodeModel::JITDefault:
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12)
+        .addExternalSymbol("__chkstk")
+        .setMIFlags(MachineInstr::FrameSetup);
+
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addReg(ARM::R12, RegState::Kill)
+        .addReg(ARM::R4, RegState::Implicit)
+        .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
+                                        ARM::SP)
+                                .addReg(ARM::SP, RegState::Define)
+                                .addReg(ARM::R4, RegState::Kill)
+                                .setMIFlags(MachineInstr::FrameSetup)));
+    NumBytes = 0;
+  }
+
+  unsigned adjustedGPRCS1Size = GPRCS1Size;
+  if (NumBytes) {
+    // Adjust SP after all the callee-save spills.
+    if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) {
+      if (LastPush == GPRCS1Push) {
+        FramePtrOffsetInPush += NumBytes;
+        adjustedGPRCS1Size += NumBytes;
+        NumBytes = 0;
+      }
+    } else
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+                   MachineInstr::FrameSetup);
+
+    if (HasFP && isARM)
+      // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
+      // Note it's not safe to do this in Thumb2 mode because it would have
+      // taken two instructions:
+      // mov sp, r7
+      // sub sp, #24
+      // If an interrupt is taken between the two instructions, then sp is in
+      // an inconsistent state (pointing to the middle of callee-saved area).
+      // The interrupt handler can end up clobbering the registers.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (adjustedGPRCS1Size > 0) {
+    CFAOffset -= adjustedGPRCS1Size;
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    MachineBasicBlock::iterator Pos = ++GPRCS1Push;
+    BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
+      switch (Reg) {
+      case ARM::R8:
+      case ARM::R9:
+      case ARM::R10:
+      case ARM::R11:
+      case ARM::R12:
+        if (STI.isTargetDarwin())
+          break;
+        // fallthrough
+      case ARM::R0:
+      case ARM::R1:
+      case ARM::R2:
+      case ARM::R3:
+      case ARM::R4:
+      case ARM::R5:
+      case ARM::R6:
+      case ARM::R7:
+      case ARM::LR:
+        CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
+        BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+        break;
+      }
+    }
+  }
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For iOS, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not iOS, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it
+  // is in area one and the adjustment needs to take place just after
+  // that push.
+  if (HasFP) {
+    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, GPRCS1Push, dl, TII,
+                         FramePtr, ARM::SP, FramePtrOffsetInPush,
+                         MachineInstr::FrameSetup);
+    if (FramePtrOffsetInPush) {
+      CFAOffset += FramePtrOffsetInPush;
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+      BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+    } else {
+      unsigned CFIIndex =
+          MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  if (GPRCS2Size > 0) {
+    MachineBasicBlock::iterator Pos = ++GPRCS2Push;
+    if (!HasFP) {
+      CFAOffset -= GPRCS2Size;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
+      switch (Reg) {
+      case ARM::R8:
+      case ARM::R9:
+      case ARM::R10:
+      case ARM::R11:
+      case ARM::R12:
+        if (STI.isTargetDarwin()) {
+          unsigned DwarfReg =  MRI->getDwarfRegNum(Reg, true);
+          unsigned Offset = MFI->getObjectOffset(FI);
+          unsigned CFIIndex = MMI.addFrameInst(
+              MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+          BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+              .addCFIIndex(CFIIndex);
+        }
+        break;
+      }
+    }
+  }
+
+  if (DPRCSSize > 0) {
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    do {
+      MachineBasicBlock::iterator Push = DPRCSPush++;
+      if (!HasFP) {
+        CFAOffset -= sizeOfSPAdjustment(Push);
+        unsigned CFIIndex = MMI.addFrameInst(
+            MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+        BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+    } while (DPRCSPush->getOpcode() == ARM::VSTMDDB_UPD);
+
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
+      if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
+          (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
+        unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+        unsigned Offset = MFI->getObjectOffset(FI);
+        unsigned CFIIndex = MMI.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+        BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+    }
+  }
+
+  if (NumBytes) {
+    if (!HasFP) {
+      CFAOffset -= NumBytes;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  if (STI.isTargetELF() && hasFP(MF))
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // If we need dynamic stack realignment, do it here. Be paranoid and make
+  // sure if we also have VLAs, we have a base pointer for frame access.
+  // If aligned NEON registers were spilled, the stack has already been
+  // realigned.
+  if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
+    unsigned MaxAlign = MFI->getMaxAlignment();
+    assert (!AFI->isThumb1OnlyFunction());
+    if (!AFI->isThumbFunction()) {
+      // Emit bic sp, sp, MaxAlign
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::BICri), ARM::SP)
+                                  .addReg(ARM::SP, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+    } else {
+      // We cannot use sp as source/dest register here, thus we're emitting the
+      // following sequence:
+      // mov r4, sp
+      // bic r4, r4, MaxAlign
+      // mov sp, r4
+      // FIXME: It will be better just to find spare register here.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+        .addReg(ARM::SP, RegState::Kill));
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::t2BICri), ARM::R4)
+                                  .addReg(ARM::R4, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+        .addReg(ARM::R4, RegState::Kill));
+    }
+
+    AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  // FIXME: Clarify FrameSetup flags here.
+  if (RegInfo->hasBasePointer(MF)) {
+    if (isARM)
+      BuildMI(MBB, MBBI, dl,
+              TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+        .addReg(ARM::SP)
+        .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+    else
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                             RegInfo->getBaseRegister())
+        .addReg(ARM::SP));
+  }
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitEpilogue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
+  int NumBytes = (int)MFI->getStackSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+    if (MBBI != MBB.begin()) {
+      do {
+        --MBBI;
+      } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      if (!isCSRestore(MBBI, TII, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (ArgRegsSaveSize +
+                 AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    // Reset SP based on frame pointer only if the stack frame extends beyond
+    // frame pointer stack slot or target is ELF and the function has FP.
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      if (NumBytes) {
+        if (isARM)
+          emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                  ARMCC::AL, 0, TII);
+        else {
+          // It's not possible to restore SP from FP in a single instruction.
+          // For iOS, this looks like:
+          // mov sp, r7
+          // sub sp, #24
+          // This is bad, if an interrupt is taken after the mov, sp is in an
+          // inconsistent state.
+          // Use the first callee-saved register as a scratch register.
+          assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+                 "No scratch register to restore SP from FP!");
+          emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                 ARMCC::AL, 0, TII);
+          AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                                 ARM::SP)
+            .addReg(ARM::R4));
+        }
+      } else {
+        // Thumb2 or ARM.
+        if (isARM)
+          BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+            .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+        else
+          AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                                 ARM::SP)
+            .addReg(FramePtr));
+      }
+    } else if (NumBytes &&
+               !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
+        emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+    // Increment past our save areas.
+    if (AFI->getDPRCalleeSavedAreaSize()) {
+      MBBI++;
+      // Since vpop register list cannot have gaps, there may be multiple vpop
+      // instructions in the epilogue.
+      while (MBBI->getOpcode() == ARM::VLDMDIA_UPD)
+        MBBI++;
+    }
+    if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
+    if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
+  }
+
+  if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri) {
+    // Tail call return: adjust the stack pointer and jump to callee.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+    // Jump to label or value in register.
+    if (RetOpcode == ARM::TCRETURNdi) {
+      unsigned TCOpcode = STI.isThumb() ?
+               (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
+               ARM::TAILJMPd;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+      if (JumpTarget.isGlobal())
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      else {
+        assert(JumpTarget.isSymbol());
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+
+      // Add the default predicate in Thumb mode.
+      if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
+    } else if (RetOpcode == ARM::TCRETURNri) {
+      BuildMI(MBB, MBBI, dl,
+              TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    MachineInstr *NewMI = std::prev(MBBI);
+    for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+      NewMI->addOperand(MBBI->getOperand(i));
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+    MBBI = NewMI;
+  }
+
+  if (ArgRegsSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int
+ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                         unsigned &FrameReg) const {
+  return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
+}
+
+int
+ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+                                             int FI, unsigned &FrameReg,
+                                             int SPAdj) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  int FPOffset = Offset - AFI->getFramePtrSpillOffset();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  FrameReg = ARM::SP;
+  Offset += SPAdj;
+
+  // SP can move around if there are allocas.  We may also lose track of SP
+  // when emergency spilling inside a non-reserved call frame setup.
+  bool hasMovingSP = !hasReservedCallFrame(MF);
+
+  // When dynamically realigning the stack, use the frame pointer for
+  // parameters, and the stack/base pointer for locals.
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+    if (isFixed) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      Offset = FPOffset;
+    } else if (hasMovingSP) {
+      assert(RegInfo->hasBasePointer(MF) &&
+             "VLAs and dynamic stack alignment, but missing base pointer!");
+      FrameReg = RegInfo->getBaseRegister();
+    }
+    return Offset;
+  }
+
+  // If there is a frame pointer, use it when we can.
+  if (hasFP(MF) && AFI->hasStackFrame()) {
+    // Use frame pointer to reference fixed objects. Use it for locals if
+    // there are VLAs (and thus the SP isn't reliable as a base).
+    if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    } else if (hasMovingSP) {
+      assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
+      if (AFI->isThumb2Function()) {
+        // Try to use the frame pointer if we can, else use the base pointer
+        // since it's available. This is handy for the emergency spill slot, in
+        // particular.
+        if (FPOffset >= -255 && FPOffset < 0) {
+          FrameReg = RegInfo->getFrameRegister(MF);
+          return FPOffset;
+        }
+      }
+    } else if (AFI->isThumb2Function()) {
+      // Use  add <rd>, sp, #<imm8>
+      //      ldr <rd>, [sp, #<imm8>]
+      // if at all possible to save space.
+      if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
+        return Offset;
+      // In Thumb2 mode, the negative offset is very limited. Try to avoid
+      // out of range references. ldr <rt>,[<rn>, #-<imm8>]
+      if (FPOffset >= -255 && FPOffset < 0) {
+        FrameReg = RegInfo->getFrameRegister(MF);
+        return FPOffset;
+      }
+    } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) {
+      // Otherwise, use SP or FP, whichever is closer to the stack slot.
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    }
+  }
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  return Offset;
+}
+
+int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                          int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}
+
+void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    unsigned StmOpc, unsigned StrOpc,
+                                    bool NoGap,
+                                    bool(*Func)(unsigned, bool),
+                                    unsigned NumAlignedDPRCS2Regs,
+                                    unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  SmallVector<std::pair<unsigned,bool>, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+
+      // D-registers in the aligned area DPRCS2 are NOT spilled here.
+      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+        continue;
+
+      // Add the callee-saved register as live-in unless it's LR and
+      // @llvm.returnaddress is called. If LR is returned for
+      // @llvm.returnaddress then it's already added to the function and
+      // entry block live-in sets.
+      bool isKill = true;
+      if (Reg == ARM::LR) {
+        if (MF.getFrameInfo()->isReturnAddressTaken() &&
+            MF.getRegInfo().isLiveIn(Reg))
+          isKill = false;
+      }
+
+      if (isKill)
+        MBB.addLiveIn(Reg);
+
+      // If NoGap is true, push consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+      LastReg = Reg;
+      Regs.push_back(std::make_pair(Reg, isKill));
+    }
+
+    if (Regs.empty())
+      continue;
+    if (Regs.size() > 1 || StrOpc== 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+                       .addReg(ARM::SP).setMIFlags(MIFlags));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
+    } else if (Regs.size() == 1) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
+                                        ARM::SP)
+        .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+        .addReg(ARM::SP).setMIFlags(MIFlags)
+        .addImm(-4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+
+    // Put any subsequent vpush instructions before this one: they will refer to
+    // higher register numbers so need to be pushed first in order to preserve
+    // monotonicity.
+    --MI;
+  }
+}
+
+void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   unsigned LdmOpc, unsigned LdrOpc,
+                                   bool isVarArg, bool NoGap,
+                                   bool(*Func)(unsigned, bool),
+                                   unsigned NumAlignedDPRCS2Regs) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned RetOpcode = MI->getOpcode();
+  bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
+                     RetOpcode == ARM::TCRETURNri);
+  bool isInterrupt =
+      RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
+
+  SmallVector<unsigned, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    bool DeleteRet = false;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+
+      // The aligned reloads from area DPRCS2 are not inserted here.
+      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+        continue;
+
+      if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
+          STI.hasV5TOps()) {
+        Reg = ARM::PC;
+        LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+        // Fold the return instruction into the LDM.
+        DeleteRet = true;
+      }
+
+      // If NoGap is true, pop consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+
+      LastReg = Reg;
+      Regs.push_back(Reg);
+    }
+
+    if (Regs.empty())
+      continue;
+    if (Regs.size() > 1 || LdrOpc == 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+                       .addReg(ARM::SP));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i], getDefRegState(true));
+      if (DeleteRet) {
+        MIB.copyImplicitOps(&*MI);
+        MI->eraseFromParent();
+      }
+      MI = MIB;
+    } else if (Regs.size() == 1) {
+      // If we adjusted the reg to PC from LR above, switch it back here. We
+      // only do that for LDM.
+      if (Regs[0] == ARM::PC)
+        Regs[0] = ARM::LR;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
+          .addReg(ARM::SP, RegState::Define)
+          .addReg(ARM::SP);
+      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
+      // that refactoring is complete (eventually).
+      if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
+        MIB.addReg(0);
+        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
+      } else
+        MIB.addImm(4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+
+    // Put any subsequent vpop instructions after this one: they will refer to
+    // higher register numbers so need to be popped afterwards.
+    ++MI;
+  }
+}
+
+/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8.  Also insert stack realignment code and leave the stack
+/// pointer pointing to the d8 spill slot.
+static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned NumAlignedDPRCS2Regs,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    const TargetRegisterInfo *TRI) {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  // Mark the D-register spill slots as properly aligned.  Since MFI computes
+  // stack slot layout backwards, this can actually mean that the d-reg stack
+  // slot offsets can be wrong. The offset for d8 will always be correct.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned DNum = CSI[i].getReg() - ARM::D8;
+    if (DNum >= 8)
+      continue;
+    int FI = CSI[i].getFrameIdx();
+    // The even-numbered registers will be 16-byte aligned, the odd-numbered
+    // registers will be 8-byte aligned.
+    MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16);
+
+    // The stack slot for D8 needs to be maximally aligned because this is
+    // actually the point where we align the stack pointer.  MachineFrameInfo
+    // computes all offsets relative to the incoming stack pointer which is a
+    // bit weird when realigning the stack.  Any extra padding for this
+    // over-alignment is not realized because the code inserted below adjusts
+    // the stack pointer by numregs * 8 before aligning the stack pointer.
+    if (DNum == 0)
+      MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
+  }
+
+  // Move the stack pointer to the d8 spill slot, and align it at the same
+  // time. Leave the stack slot address in the scratch register r4.
+  //
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  //
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+  AFI->setShouldRestoreSPFromFP(true);
+
+  // sub r4, sp, #numregs * 8
+  // The immediate is <= 64, so it doesn't need any special encoding.
+  unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addReg(ARM::SP)
+                              .addImm(8 * NumAlignedDPRCS2Regs)));
+
+  // bic r4, r4, #align-1
+  Opc = isThumb ? ARM::t2BICri : ARM::BICri;
+  unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment();
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addReg(ARM::R4, RegState::Kill)
+                              .addImm(MaxAlign - 1)));
+
+  // mov sp, r4
+  // The stack pointer must be adjusted before spilling anything, otherwise
+  // the stack slots could be clobbered by an interrupt handler.
+  // Leave r4 live, it is used below.
+  Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
+                            .addReg(ARM::R4);
+  MIB = AddDefaultPred(MIB);
+  if (!isThumb)
+    AddDefaultCC(MIB);
+
+  // Now spill NumAlignedDPRCS2Regs registers starting from d8.
+  // r4 holds the stack slot address.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vst1.64 with 4 d-regs and address writeback.
+  // The writeback is only needed when emitting two vst1.64 instructions.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
+                           ARM::R4)
+                   .addReg(ARM::R4, RegState::Kill).addImm(16)
+                   .addReg(NextReg)
+                   .addReg(SupReg, RegState::ImplicitKill));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point.  It currently points to the next
+  // register to be spilled.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vst1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
+                   .addReg(ARM::R4).addImm(16).addReg(NextReg)
+                   .addReg(SupReg, RegState::ImplicitKill));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vst1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
+                   .addReg(ARM::R4).addImm(16).addReg(SupReg));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vstr.64 for the odd last register.
+  if (NumAlignedDPRCS2Regs) {
+    MBB.addLiveIn(NextReg);
+    // vstr.64 uses addrmode5 which has an offset scale of 4.
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
+                   .addReg(NextReg)
+                   .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2));
+  }
+
+  // The last spill instruction inserted should kill the scratch register r4.
+  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
+/// iterator to the following instruction.
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs) {
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  ++MI; ++MI; ++MI;
+  assert(MI->mayStore() && "Expecting spill instruction");
+
+  // These switches all fall through.
+  switch(NumAlignedDPRCS2Regs) {
+  case 7:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+  default:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+  case 1:
+  case 2:
+  case 4:
+    assert(MI->killsRegister(ARM::R4) && "Missed kill flag");
+    ++MI;
+  }
+  return MI;
+}
+
+/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8.  These instructions are assumed to execute while the
+/// stack is still aligned, unlike the code inserted by emitPopInst.
+static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      unsigned NumAlignedDPRCS2Regs,
+                                      const std::vector<CalleeSavedInfo> &CSI,
+                                      const TargetRegisterInfo *TRI) {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // Find the frame index assigned to d8.
+  int D8SpillFI = 0;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    if (CSI[i].getReg() == ARM::D8) {
+      D8SpillFI = CSI[i].getFrameIdx();
+      break;
+    }
+
+  // Materialize the address of the d8 spill slot into the scratch register r4.
+  // This can be fairly complicated if the stack frame is large, so just use
+  // the normal frame index elimination mechanism to do it.  This code runs as
+  // the initial part of the epilog where the stack and base pointers haven't
+  // been changed yet.
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+
+  unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addFrameIndex(D8SpillFI).addImm(0)));
+
+  // Now restore NumAlignedDPRCS2Regs registers starting from d8.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vld1.64 with 4 d-regs and writeback.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+                   .addReg(ARM::R4, RegState::Define)
+                   .addReg(ARM::R4, RegState::Kill).addImm(16)
+                   .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point.  It currently points to the next
+  // register to be spilled.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+                   .addReg(ARM::R4).addImm(16)
+                   .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vld1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+                   .addReg(ARM::R4).addImm(16));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vldr.64 for the remaining odd register.
+  if (NumAlignedDPRCS2Regs)
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+                   .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
+
+  // Last store kills r4.
+  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
+  unsigned PushOneOpc = AFI->isThumbFunction() ?
+    ARM::t2STR_PRE : ARM::STR_PRE_IMM;
+  unsigned FltOpc = ARM::VSTMDDB_UPD;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register,
+               NumAlignedDPRCS2Regs, MachineInstr::FrameSetup);
+
+  // The code above does not insert spill code for the aligned DPRCS2 registers.
+  // The stack realignment code will be inserted between the push instructions
+  // and these spills.
+  if (NumAlignedDPRCS2Regs)
+    emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+
+  return true;
+}
+
+bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isVarArg = AFI->getArgRegsSaveSize() > 0;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+
+  // The emitPopInst calls below do not insert reloads for the aligned DPRCS2
+  // registers. Do that here instead.
+  if (NumAlignedDPRCS2Regs)
+    emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+
+  unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+  unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM;
+  unsigned FltOpc = ARM::VLDMDIA_UPD;
+  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
+              NumAlignedDPRCS2Regs);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea2Register, 0);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea1Register, 0);
+
+  return true;
+}
+
+// FIXME: Make generic?
+static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
+                                       const ARMBaseInstrInfo &TII) {
+  unsigned FnSize = 0;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB)
+      FnSize += TII.GetInstSizeInBytes(&MI);
+  }
+  return FnSize;
+}
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+// FIXME: Move to TII?
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
+                                         const TargetFrameLowering *TFI) {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned Limit = (1 << 12) - 1;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+
+        // When using ADDri to get the address of a stack object, 255 is the
+        // largest offset guaranteed to fit in the immediate offset.
+        if (MI.getOpcode() == ARM::ADDri) {
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        }
+
+        // Otherwise check the addressing mode.
+        switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) {
+        case ARMII::AddrMode3:
+        case ARMII::AddrModeT2_i8:
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode5:
+        case ARMII::AddrModeT2_i8s4:
+          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+          break;
+        case ARMII::AddrModeT2_i12:
+          // i12 supports only positive offset so these will be converted to
+          // i8 opcodes. See llvm::rewriteT2FrameIndex.
+          if (TFI->hasFP(MF) && AFI->hasStackFrame())
+            Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode4:
+        case ARMII::AddrMode6:
+          // Addressing modes 4 & 6 (load/store) instructions can't encode an
+          // immediate offset for stack references.
+          return 0;
+        default:
+          break;
+        }
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+
+// In functions that realign the stack, it can be an advantage to spill the
+// callee-saved vector registers after realigning the stack. The vst1 and vld1
+// instructions take alignment hints that can improve performance.
+//
+static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
+  if (!SpillAlignedNEONRegs)
+    return;
+
+  // Naked functions don't spill callee-saved registers.
+  if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                     Attribute::Naked))
+    return;
+
+  // We are planning to use NEON instructions vst1 / vld1.
+  if (!MF.getTarget().getSubtarget<ARMSubtarget>().hasNEON())
+    return;
+
+  // Don't bother if the default stack alignment is sufficiently high.
+  if (MF.getTarget().getFrameLowering()->getStackAlignment() >= 8)
+    return;
+
+  // Aligned spills require stack realignment.
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  if (!RegInfo->canRealignStack(MF))
+    return;
+
+  // We always spill contiguous d-registers starting from d8. Count how many
+  // needs spilling.  The register allocator will almost always use the
+  // callee-saved registers in order, but it can happen that there are holes in
+  // the range.  Registers above the hole will be spilled to the standard DPRCS
+  // area.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned NumSpills = 0;
+  for (; NumSpills < 8; ++NumSpills)
+    if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills))
+      break;
+
+  // Don't do this for just one d-register. It's not worth it.
+  if (NumSpills < 2)
+    return;
+
+  // Spill the first NumSpills D-registers after realigning the stack.
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
+
+  // A scratch register is required for the vst1 / vld1 instructions.
+  MF.getRegInfo().setPhysRegUsed(ARM::R4);
+}
+
+void
+ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage the eliminateFrameIndex machinery. This also ensures it
+  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
+  // to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+  // scratch register. Also spill R4 if Thumb2 function has varsized objects,
+  // since it's not always possible to restore sp from fp in a single
+  // instruction.
+  // FIXME: It will be better just to find spare register here.
+  if (AFI->isThumb2Function() &&
+      (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+    MRI.setPhysRegUsed(ARM::R4);
+
+  if (AFI->isThumb1OnlyFunction()) {
+    // Spill LR if Thumb1 function uses variable length argument lists.
+    if (AFI->getArgRegsSaveSize() > 0)
+      MRI.setPhysRegUsed(ARM::LR);
+
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
+    // for sure what the stack size will be, but for this, an estimate is good
+    // enough. If there anything changes it, it'll be a spill, which implies
+    // we've used all the registers and so R4 is already used, so not marking
+    // it here will be OK.
+    // FIXME: It will be better just to find spare register here.
+    unsigned StackSize = MFI->estimateStackSize(MF);
+    if (MFI->hasVarSizedObjects() || StackSize > 508)
+      MRI.setPhysRegUsed(ARM::R4);
+  }
+
+  // See if we can spill vector registers to aligned stack.
+  checkNumAlignedDPRCS2Regs(MF);
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    MRI.setPhysRegUsed(RegInfo->getBaseRegister());
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is used.
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (MRI.isPhysRegUsed(Reg)) {
+      Spilled = true;
+      CanEliminateFrame = false;
+    }
+
+    if (!ARM::GPRRegClass.contains(Reg))
+      continue;
+
+    if (Spilled) {
+      NumGPRSpills++;
+
+      if (!STI.isTargetDarwin()) {
+        if (Reg == ARM::LR)
+          LRSpilled = true;
+        CS1Spilled = true;
+        continue;
+      }
+
+      // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
+      switch (Reg) {
+      case ARM::LR:
+        LRSpilled = true;
+        // Fallthrough
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+        CS1Spilled = true;
+        break;
+      default:
+        break;
+      }
+    } else {
+      if (!STI.isTargetDarwin()) {
+        UnspilledCS1GPRs.push_back(Reg);
+        continue;
+      }
+
+      switch (Reg) {
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        UnspilledCS1GPRs.push_back(Reg);
+        break;
+      default:
+        UnspilledCS2GPRs.push_back(Reg);
+        break;
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+    // Force LR to be spilled if the Thumb function size is > 2048. This enables
+    // use of BL to implement far jump. If it turns out that it's not needed
+    // then the branch fix up path will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  // If any of the stack slot references may be out of range of an immediate
+  // offset, make sure a register (or a spill slot) is available for the
+  // register scavenger. Note that if we're indexing off the frame pointer, the
+  // effective stack size is 4 bytes larger since the FP points to the stack
+  // slot of the previous FP. Also, if we have variable sized objects in the
+  // function, stack slot references will often be negative, and some of
+  // our instructions are positive-offset only, so conservatively consider
+  // that case to want a spill slot (or register) as well. Similarly, if
+  // the function adjusts the stack pointer during execution and the
+  // adjustments aren't already part of our stack size estimate, our offset
+  // calculations may be off, so be conservative.
+  // FIXME: We could add logic to be more precise about negative offsets
+  //        and which instructions will need a scratch register for them. Is it
+  //        worth the effort and added fragility?
+  bool BigStack =
+    (RS &&
+     (MFI->estimateStackSize(MF) +
+      ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
+      estimateRSStackSizeLimit(MF, this)))
+    || MFI->hasVarSizedObjects()
+    || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
+
+  bool ExtraCSSpill = false;
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+    AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
+    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      MRI.setPhysRegUsed(ARM::LR);
+      NumGPRSpills++;
+      SmallVectorImpl<unsigned>::iterator LRPos;
+      LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                        (unsigned)ARM::LR);
+      if (LRPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(LRPos);
+
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    if (hasFP(MF)) {
+      MRI.setPhysRegUsed(FramePtr);
+      auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                             FramePtr);
+      if (FPPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(FPPos);
+      NumGPRSpills++;
+    }
+
+    // If stack and double are 8-byte aligned and we are spilling an odd number
+    // of GPRs, spill one extra callee save GPR so we won't have to pad between
+    // the integer and double callee save areas.
+    unsigned TargetAlign = getStackAlignment();
+    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill high register if the function is thumb1
+          if (!AFI->isThumb1OnlyFunction() ||
+              isARMLowRegister(Reg) || Reg == ARM::LR) {
+            MRI.setPhysRegUsed(Reg);
+            if (!MRI.isReserved(Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        MRI.setPhysRegUsed(Reg);
+        if (!MRI.isReserved(Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 needs a spill slot for stack pointer
+    // adjustments also, even when the frame itself is small.
+    if (BigStack && !ExtraCSSpill) {
+      // If any non-reserved CS register isn't spilled, just spill one or two
+      // extra. That should take care of it!
+      unsigned NumExtras = TargetAlign / 4;
+      SmallVector<unsigned, 2> Extras;
+      while (NumExtras && !UnspilledCS1GPRs.empty()) {
+        unsigned Reg = UnspilledCS1GPRs.back();
+        UnspilledCS1GPRs.pop_back();
+        if (!MRI.isReserved(Reg) &&
+            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
+             Reg == ARM::LR)) {
+          Extras.push_back(Reg);
+          NumExtras--;
+        }
+      }
+      // For non-Thumb1 functions, also check for hi-reg CS registers
+      if (!AFI->isThumb1OnlyFunction()) {
+        while (NumExtras && !UnspilledCS2GPRs.empty()) {
+          unsigned Reg = UnspilledCS2GPRs.back();
+          UnspilledCS2GPRs.pop_back();
+          if (!MRI.isReserved(Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+      }
+      if (Extras.size() && NumExtras == 0) {
+        for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+          MRI.setPhysRegUsed(Extras[i]);
+        }
+      } else if (!AFI->isThumb1OnlyFunction()) {
+        // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
+        // closest to SP or frame pointer.
+        const TargetRegisterClass *RC = &ARM::GPRRegClass;
+        RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                           RC->getAlignment(),
+                                                           false));
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    MRI.setPhysRegUsed(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
+
+
+void ARMFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      assert(!AFI->isThumb1OnlyFunction() &&
+             "This eliminateCallFramePseudoInstr does not support Thumb1!");
+      bool isARM = !AFI->isThumbFunction();
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      int PIdx = Old->findFirstPredOperandIdx();
+      ARMCC::CondCodes Pred = (PIdx == -1)
+        ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+        unsigned PredReg = Old->getOperand(2).getReg();
+        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      } else {
+        // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+        unsigned PredReg = Old->getOperand(3).getReg();
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+/// Get the minimum constant for ARM that is greater than or equal to the
+/// argument. In ARM, constants can have any value that can be produced by
+/// rotating an 8-bit value to the right by an even number of bits within a
+/// 32-bit word.
+static uint32_t alignToARMConstant(uint32_t Value) {
+  unsigned Shifted = 0;
+
+  if (Value == 0)
+      return 0;
+
+  while (!(Value & 0xC0000000)) {
+      Value = Value << 2;
+      Shifted += 2;
+  }
+
+  bool Carry = (Value & 0x00FFFFFF);
+  Value = ((Value & 0xFF000000) >> 24) + Carry;
+
+  if (Value & 0x0000100)
+      Value = Value & 0x000001FC;
+
+  if (Shifted > 24)
+      Value = Value >> (Shifted - 24);
+  else
+      Value = Value << (24 - Shifted);
+
+  return Value;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual
+// stack limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+// Adjust the function prologue to enable split stacks. This currently only
+// supports android and linux.
+//
+// The ABI of the segmented stack prologue is a little arbitrarily chosen, but
+// must be well defined in order to allow for consistent implementations of the
+// __morestack helper function. The ABI is also not a normal ABI in that it
+// doesn't follow the normal calling conventions because this allows the
+// prologue of each function to be optimized further.
+//
+// Currently, the ABI looks like (when calling __morestack)
+//
+//  * r4 holds the minimum stack size requested for this function call
+//  * r5 holds the stack size of the arguments to the function
+//  * the beginning of the function is 3 instructions after the call to
+//    __morestack
+//
+// Implementations of __morestack should use r4 to allocate a new stack, r5 to
+// place the arguments on to the new stack, and the 3-instruction knowledge to
+// jump directly to the body of the function when working on the new stack.
+//
+// An old (and possibly no longer compatible) implementation of __morestack for
+// ARM can be found at [1].
+//
+// [1] - https://github.com/mozilla/rust/blob/86efd9/src/rt/arch/arm/morestack.S
+void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+  unsigned Opcode;
+  unsigned CFIIndex;
+  const ARMSubtarget *ST = &MF.getTarget().getSubtarget<ARMSubtarget>();
+  bool Thumb = ST->isThumb();
+
+  // Sadly, this currently doesn't support varargs, platforms other than
+  // android/linux. Note that thumb1/thumb2 are support for android/linux.
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!ST->isTargetAndroid() && !ST->isTargetLinux())
+    report_fatal_error("Segmented stacks not supported on this platform.");
+
+  MachineBasicBlock &prologueMBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  MCContext &Context = MMI.getContext();
+  const MCRegisterInfo *MRI = Context.getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL;
+
+  uint64_t StackSize = MFI->getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero
+  if (StackSize == 0)
+    return;
+
+  // Use R4 and R5 as scratch registers.
+  // We save R4 and R5 before use and restore them before leaving the function.
+  unsigned ScratchReg0 = ARM::R4;
+  unsigned ScratchReg1 = ARM::R5;
+  uint64_t AlignedStackSize;
+
+  MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *PostStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *AllocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();
+
+  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+                                          e = prologueMBB.livein_end();
+       i != e; ++i) {
+    AllocMBB->addLiveIn(*i);
+    GetMBB->addLiveIn(*i);
+    McrMBB->addLiveIn(*i);
+    PrevStackMBB->addLiveIn(*i);
+    PostStackMBB->addLiveIn(*i);
+  }
+
+  MF.push_front(PostStackMBB);
+  MF.push_front(AllocMBB);
+  MF.push_front(GetMBB);
+  MF.push_front(McrMBB);
+  MF.push_front(PrevStackMBB);
+
+  // The required stack size that is aligned to ARM constant criterion.
+  AlignedStackSize = alignToARMConstant(StackSize);
+
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;
+
+  // We will use two of the callee save registers as scratch registers so we
+  // need to save those registers onto the stack.
+  // We will use SR0 to hold stack limit and SR1 to hold the stack size
+  // requested and arguments for __morestack().
+  // SR0: Scratch Register #0
+  // SR1: Scratch Register #1
+  // push {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH)))
+        .addReg(ScratchReg0).addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
+                   .addReg(ARM::SP, RegState::Define).addReg(ARM::SP))
+        .addReg(ScratchReg0).addReg(ScratchReg1);
+  }
+
+  // Emit the relevant DWARF information about the change in stack pointer as
+  // well as where to find both r4 and r5 (the callee-save registers)
+  CFIIndex =
+      MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
+  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+      nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4));
+  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+      nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8));
+  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // mov SR1, sp
+  if (Thumb) {
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+                      .addReg(ARM::SP));
+  } else if (CompareStackPointer) {
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+                      .addReg(ARM::SP)).addReg(0);
+  }
+
+  // sub SR1, sp, #StackSize
+  if (!CompareStackPointer && Thumb) {
+    AddDefaultPred(
+        AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
+            .addReg(ScratchReg1).addImm(AlignedStackSize));
+  } else if (!CompareStackPointer) {
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+                      .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0);
+  }
+
+  if (Thumb && ST->isThumb1Only()) {
+    unsigned PCLabelId = ARMFI->createPICLabelUId();
+    ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
+        MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0);
+    MachineConstantPool *MCP = MF.getConstantPool();
+    unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment());
+
+    // ldr SR0, [pc, offset(STACK_LIMIT)]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+                      .addConstantPoolIndex(CPI));
+
+    // ldr SR0, [SR0]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+                      .addReg(ScratchReg0).addImm(0));
+  } else {
+    // Get TLS base address from the coprocessor
+    // mrc p15, #0, SR0, c13, c0, #3
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+                     .addImm(15)
+                     .addImm(0)
+                     .addImm(13)
+                     .addImm(0)
+                     .addImm(3));
+
+    // Use the last tls slot on android and a private field of the TCP on linux.
+    assert(ST->isTargetAndroid() || ST->isTargetLinux());
+    unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;
+
+    // Get the stack limit from the right offset
+    // ldr SR0, [sr0, #4 * TlsOffset]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+                      .addReg(ScratchReg0).addImm(4 * TlsOffset));
+  }
+
+  // Compare stack limit with stack size requested.
+  // cmp SR0, SR1
+  Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
+  AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
+                    .addReg(ScratchReg0)
+                    .addReg(ScratchReg1));
+
+  // This jump is taken if StackLimit < SP - stack required.
+  Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
+  BuildMI(GetMBB, DL, TII.get(Opcode)).addMBB(PostStackMBB)
+       .addImm(ARMCC::LO)
+       .addReg(ARM::CPSR);
+
+
+  // Calling __morestack(StackSize, Size of stack arguments).
+  // __morestack knows that the stack size requested is in SR0(r4)
+  // and amount size of stack arguments is in SR1(r5).
+
+  // Pass first argument for the __morestack by Scratch Register #0.
+  //   The amount size of stack required
+  if (Thumb) {
+    AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
+                                        ScratchReg0)).addImm(AlignedStackSize));
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+                      .addImm(AlignedStackSize)).addReg(0);
+  }
+  // Pass second argument for the __morestack by Scratch Register #1.
+  //   The amount size of stack consumed to save function arguments.
+  if (Thumb) {
+    AddDefaultPred(
+        AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
+            .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+                   .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
+                   .addReg(0);
+  }
+
+  // push {lr} - Save return address of this function.
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+        .addReg(ARM::LR);
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+        .addReg(ARM::LR);
+  }
+
+  // Emit the DWARF info about the change in stack as well as where to find the
+  // previous link register
+  CFIIndex =
+      MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+        nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Call __morestack().
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+        .addExternalSymbol("__morestack");
+  } else {
+    BuildMI(AllocMBB, DL, TII.get(ARM::BL))
+        .addExternalSymbol("__morestack");
+  }
+
+  // pop {lr} - Restore return address of this original function.
+  if (Thumb) {
+    if (ST->isThumb1Only()) {
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+                     .addReg(ScratchReg0);
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+                     .addReg(ScratchReg0));
+    } else {
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+                     .addReg(ARM::LR, RegState::Define)
+                     .addReg(ARM::SP, RegState::Define)
+                     .addReg(ARM::SP)
+                     .addImm(4));
+    }
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+      .addReg(ARM::LR);
+  }
+
+  // Restore SR0 and SR1 in case of __morestack() was called.
+  // __morestack() will skip PostStackMBB block so we need to restore
+  // scratch registers from here.
+  // pop {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  }
+
+  // Update the CFA offset now that we've popped
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // bx lr - Return from this function.
+  Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
+  AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+
+  // Restore SR0 and SR1 in case of __morestack() was not called.
+  // pop {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  }
+
+  // Update the CFA offset now that we've popped
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Tell debuggers that r4 and r5 are now the same as they were in the
+  // previous function, that they're the "Same Value".
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createSameValue(
+      nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createSameValue(
+      nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Organizing MBB lists
+  PostStackMBB->addSuccessor(&prologueMBB);
+
+  AllocMBB->addSuccessor(PostStackMBB);
+
+  GetMBB->addSuccessor(PostStackMBB);
+  GetMBB->addSuccessor(AllocMBB);
+
+  McrMBB->addSuccessor(GetMBB);
+
+  PrevStackMBB->addSuccessor(McrMBB);
+
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
new file mode 100644
index 0000000..709afbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -0,0 +1,78 @@
+//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_FRAMEINFO_H
+#define ARM_FRAMEINFO_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMFrameLowering : public TargetFrameLowering {
+protected:
+  const ARMSubtarget &STI;
+
+public:
+  explicit ARMFrameLowering(const ARMSubtarget &sti);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg, int SPAdj) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
+
+  void adjustForSegmentedStacks(MachineFunction &MF) const override;
+
+ private:
+  void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
+                    unsigned StrOpc, bool NoGap,
+                    bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs,
+                    unsigned MIFlags = 0) const;
+  void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+                   unsigned LdrOpc, bool isVarArg, bool NoGap,
+                   bool(*Func)(unsigned, bool),
+                   unsigned NumAlignedDPRCS2Regs) const;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
new file mode 100644
index 0000000..0885c4e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -0,0 +1,104 @@
+//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMHazardRecognizer.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
+                         const TargetRegisterInfo &TRI) {
+  // FIXME: Detect integer instructions properly.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  return false;
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (!MI->isDebugValue()) {
+    // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
+    // a VMLA / VMLS will cause 4 cycle stall.
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
+      MachineInstr *DefMI = LastMI;
+      const MCInstrDesc &LastMCID = LastMI->getDesc();
+      const TargetMachine &TM =
+        MI->getParent()->getParent()->getTarget();
+      const ARMBaseInstrInfo &TII =
+        *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+
+      // Skip over one non-VFP / NEON instruction.
+      if (!LastMI->isBarrier() &&
+          // On A9, AGU and NEON/FPU are muxed.
+          !(TII.getSubtarget().isLikeA9() &&
+            (LastMI->mayLoad() || LastMI->mayStore())) &&
+          (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
+        MachineBasicBlock::iterator I = LastMI;
+        if (I != LastMI->getParent()->begin()) {
+          I = std::prev(I);
+          DefMI = &*I;
+        }
+      }
+
+      if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
+          (TII.canCauseFpMLxStall(MI->getOpcode()) ||
+           hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) {
+        // Try to schedule another instruction for the next 4 cycles.
+        if (FpMLxStalls == 0)
+          FpMLxStalls = 4;
+        return Hazard;
+      }
+    }
+  }
+
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+void ARMHazardRecognizer::Reset() {
+  LastMI = nullptr;
+  FpMLxStalls = 0;
+  ScoreboardHazardRecognizer::Reset();
+}
+
+void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+  if (!MI->isDebugValue()) {
+    LastMI = MI;
+    FpMLxStalls = 0;
+  }
+
+  ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void ARMHazardRecognizer::AdvanceCycle() {
+  if (FpMLxStalls && --FpMLxStalls == 0)
+    // Stalled for 4 cycles but still can't schedule any other instructions.
+    LastMI = nullptr;
+  ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void ARMHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("reverse ARM hazard checking unsupported");
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
new file mode 100644
index 0000000..a8198e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -0,0 +1,49 @@
+//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling ARM functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMHAZARDRECOGNIZER_H
+#define ARMHAZARDRECOGNIZER_H
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+
+namespace llvm {
+
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
+class MachineInstr;
+
+/// ARMHazardRecognizer handles special constraints that are not expressed in
+/// the scheduling itinerary. This is only used during postRA scheduling. The
+/// ARM preRA scheduler uses an unspecialized instance of the
+/// ScoreboardHazardRecognizer.
+class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
+  MachineInstr *LastMI;
+  unsigned FpMLxStalls;
+
+public:
+  ARMHazardRecognizer(const InstrItineraryData *ItinData,
+                      const ScheduleDAG *DAG)
+    : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
+      LastMI(nullptr) {}
+
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void Reset() override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif // ARMHAZARDRECOGNIZER_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 0000000..38547cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,3468 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+static cl::opt<bool>
+DisableShifterOp("disable-shifter-op", cl::Hidden,
+  cl::desc("Disable isel of shifter-op"),
+  cl::init(false));
+
+static cl::opt<bool>
+CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
+  cl::desc("Check fp vmla / vmls hazard at isel time"),
+  cl::init(true));
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+enum AddrMode2Type {
+  AM2_BASE, // Simple AM2 (+-imm12)
+  AM2_SHOP  // Shifter-op AM2
+};
+
+class ARMDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+
+public:
+  explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // Reset the subtarget each time through.
+    Subtarget = &MF.getTarget().getSubtarget<ARMSubtarget>();
+    SelectionDAGISel::runOnMachineFunction(MF);
+    return true;
+  }
+
+  const char *getPassName() const override {
+    return "ARM Instruction Selection";
+  }
+
+  void PreprocessISelDAG() override;
+
+  /// getI32Imm - Return a target constant of type i32 with the specified
+  /// value.
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+  SDNode *Select(SDNode *N) override;
+
+
+  bool hasNoVMLxHazardUse(SDNode *N) const;
+  bool isShifterOpProfitable(const SDValue &Shift,
+                             ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
+  bool SelectRegShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, SDValue &C,
+                               bool CheckProfitability = true);
+  bool SelectImmShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, bool CheckProfitability = true);
+  bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B, SDValue &C) {
+    // Don't apply the profitability check
+    return SelectRegShifterOperand(N, A, B, C, false);
+  }
+  bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B) {
+    // Don't apply the profitability check
+    return SelectImmShifterOperand(N, A, B, false);
+  }
+
+  bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
+
+  AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base,
+                                      SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE;
+  }
+
+  bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP;
+  }
+
+  bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset,
+                       SDValue &Opc) {
+    SelectAddrMode2Worker(N, Base, Offset, Opc);
+//    return SelectAddrMode2ShOp(N, Base, Offset, Opc);
+    // This always matches one way or another.
+    return true;
+  }
+
+  bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) {
+    const ConstantSDNode *CN = cast<ConstantSDNode>(N);
+    Pred = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32);
+    return true;
+  }
+
+  bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrOffsetNone(SDValue N, SDValue &Base);
+  bool SelectAddrMode3(SDValue N, SDValue &Base,
+                       SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode5(SDValue N, SDValue &Base,
+                       SDValue &Offset);
+  bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+  bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
+
+  bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
+
+  // Thumb Addressing Modes:
+  bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset,
+                             unsigned Scale);
+  bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
+                                SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+
+  // Thumb 2 Addressing Modes:
+  bool SelectT2ShifterOperandReg(SDValue N,
+                                 SDValue &BaseReg, SDValue &Opc);
+  bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+                            SDValue &OffImm);
+  bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                 SDValue &OffImm);
+  bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
+                             SDValue &OffReg, SDValue &ShImm);
+  bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
+
+  inline bool is_so_imm(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }
+
+  inline bool is_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(~Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(~Imm) != -1;
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+
+private:
+  /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for
+  /// ARM.
+  SDNode *SelectARMIndexedLoad(SDNode *N);
+  SDNode *SelectT2IndexedLoad(SDNode *N);
+
+  /// SelectVLD - Select NEON load intrinsics.  NumVecs should be
+  /// 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// loads of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs <= 2, QOpcodes1 is not used.
+  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+
+  /// SelectVST - Select NEON store intrinsics.  NumVecs should
+  /// be 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// stores of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs <= 2, QOpcodes1 is not used.
+  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+
+  /// SelectVLDSTLane - Select NEON load/store lane intrinsics.  NumVecs should
+  /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// load/store of D registers and Q registers.
+  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
+                          bool isUpdating, unsigned NumVecs,
+                          const uint16_t *DOpcodes, const uint16_t *QOpcodes);
+
+  /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
+  /// should be 2, 3 or 4.  The opcode array specifies the instructions used
+  /// for loading D registers.  (Q registers are not supported.)
+  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                       const uint16_t *Opcodes);
+
+  /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
+  /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
+  /// generated to force the table registers to be consecutive.
+  SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+
+  /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
+  SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+
+  // Select special operations if node forms integer ABS pattern
+  SDNode *SelectABSOp(SDNode *N);
+
+  SDNode *SelectInlineAsm(SDNode *N);
+
+  SDNode *SelectConcatVector(SDNode *N);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+
+  // Form pairs of consecutive R, S, D, or Q registers.
+  SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createSRegPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createDRegPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createQRegPairNode(EVT VT, SDValue V0, SDValue V1);
+
+  // Form sequences of 4 consecutive S, D, or Q registers.
+  SDNode *createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+  SDNode *createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+  SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+
+  // Get the alignment operand for a NEON VLD or VST instruction.
+  SDValue GetVLDSTAlign(SDValue Align, unsigned NumVecs, bool is64BitVector);
+};
+}
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc &&
+         isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+/// \brief Check whether a particular node is a constant value representable as
+/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
+///
+/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
+static bool isScaledConstantInRange(SDValue Node, int Scale,
+                                    int RangeMin, int RangeMax,
+                                    int &ScaledConstant) {
+  assert(Scale > 0 && "Invalid scale!");
+
+  // Check that this is a constant.
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
+  if (!C)
+    return false;
+
+  ScaledConstant = (int) C->getZExtValue();
+  if ((ScaledConstant % Scale) != 0)
+    return false;
+
+  ScaledConstant /= Scale;
+  return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
+}
+
+void ARMDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->hasV6T2Ops())
+    return;
+
+  bool isThumb2 = Subtarget->isThumb();
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+
+    if (N->getOpcode() != ISD::ADD)
+      continue;
+
+    // Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
+    // leading zeros, followed by consecutive set bits, followed by 1 or 2
+    // trailing zeros, e.g. 1020.
+    // Transform the expression to
+    // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
+    // of trailing zeros of c2. The left shift would be folded as an shifter
+    // operand of 'add' and the 'and' and 'srl' would become a bits extraction
+    // node (UBFX).
+
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    unsigned And_imm = 0;
+    if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
+      if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
+        std::swap(N0, N1);
+    }
+    if (!And_imm)
+      continue;
+
+    // Check if the AND mask is an immediate of the form: 000.....1111111100
+    unsigned TZ = countTrailingZeros(And_imm);
+    if (TZ != 1 && TZ != 2)
+      // Be conservative here. Shifter operands aren't always free. e.g. On
+      // Swift, left shifter operand of 1 / 2 for free but others are not.
+      // e.g.
+      //  ubfx   r3, r1, #16, #8
+      //  ldr.w  r3, [r0, r3, lsl #2]
+      // vs.
+      //  mov.w  r9, #1020
+      //  and.w  r2, r9, r1, lsr #14
+      //  ldr    r2, [r0, r2]
+      continue;
+    And_imm >>= TZ;
+    if (And_imm & (And_imm + 1))
+      continue;
+
+    // Look for (and (srl X, c1), c2).
+    SDValue Srl = N1.getOperand(0);
+    unsigned Srl_imm = 0;
+    if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
+        (Srl_imm <= 2))
+      continue;
+
+    // Make sure first operand is not a shifter operand which would prevent
+    // folding of the left shift.
+    SDValue CPTmp0;
+    SDValue CPTmp1;
+    SDValue CPTmp2;
+    if (isThumb2) {
+      if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1))
+        continue;
+    } else {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
+          SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
+        continue;
+    }
+
+    // Now make the transformation.
+    Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32,
+                          Srl.getOperand(0),
+                          CurDAG->getConstant(Srl_imm+TZ, MVT::i32));
+    N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
+                         Srl, CurDAG->getConstant(And_imm, MVT::i32));
+    N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
+                         N1, CurDAG->getConstant(TZ, MVT::i32));
+    CurDAG->UpdateNodeOperands(N, N0, N1);
+  }
+}
+
+/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
+/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
+/// least on current ARM implementations) which should be avoidded.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!CheckVMLxHazard)
+    return true;
+
+  if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() &&
+      !Subtarget->isCortexA9() && !Subtarget->isSwift())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+        CurDAG->getTarget().getInstrInfo());
+
+    const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
+    if (MCID.mayStore())
+      return true;
+    unsigned Opcode = MCID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the use later in the MLxExpansion pass. e.g.
+    // vmla
+    // vmla (stall 8 cycles)
+    //
+    // vmul (5 cycles)
+    // vadd (5 cycles)
+    // vmla
+    // This adds up to about 18 - 19 cycles.
+    //
+    // vmla
+    // vmul (stall 4 cycles)
+    // vadd adds up to about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
+                                            ARM_AM::ShiftOpc ShOpcVal,
+                                            unsigned ShAmt) {
+  if (!Subtarget->isLikeA9() && !Subtarget->isSwift())
+    return true;
+  if (Shift.hasOneUse())
+    return true;
+  // R << 2 is free.
+  return ShOpcVal == ARM_AM::lsl &&
+         (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
+}
+
+bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS) return false;
+  ShImmVal = RHS->getZExtValue() & 31;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &ShReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (RHS) return false;
+
+  ShReg = N.getOperand(1);
+  if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
+    return false;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+      OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+      return true;
+    }
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+      Base = N.getOperand(0);
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+
+bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ISD::ADD.
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave simple R +/- imm12 operands for LDRi12
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) // 12 bits.
+      return false;
+  }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+        N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+//-----
+
+AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
+                                                     SDValue &Base,
+                                                     SDValue &Offset,
+                                                     SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return AM2_SHOP;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ADD.
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return AM2_BASE;
+  }
+
+  // Match simple R +/- imm12 operands.
+  if (N.getOpcode() != ISD::SUB) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) { // 12 bits.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
+      }
+      Offset = CurDAG->getRegister(0, MVT::i32);
+
+      ARM_AM::AddrOpc AddSub = ARM_AM::add;
+      if (RHSC < 0) {
+        AddSub = ARM_AM::sub;
+        RHSC = - RHSC;
+      }
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+                                                        ARM_AM::no_shift),
+                                      MVT::i32);
+      return AM2_BASE;
+    }
+  }
+
+  if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) {
+    // Compute R +/- (R << N) and reuse it.
+    Base = N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return AM2_BASE;
+  }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+        N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return AM2_SHOP;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val))
+    return false;
+
+  Offset = N;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+  unsigned ShAmt = 0;
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(N, ShOpcVal, ShAmt))
+        Offset = N.getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    if (AddSub == ARM_AM::sub) Val *= -1;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(Val, MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) {
+  Base = N;
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
+                                      SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::SUB) {
+    // X - C  is canonicalize to X + -C, no need to handle it here.
+    Base = N.getOperand(0);
+    Offset = N.getOperand(1);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                              -256 + 1, 256, RHSC)) { // 8 bits.
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
+    return true;
+  }
+
+  Offset = N;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+                                      SDValue &Base, SDValue &Offset) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
+                              -256 + 1, 256, RHSC)) {
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+    }
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                       MVT::i32);
+    return true;
+  }
+
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                     MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
+                                      SDValue &Align) {
+  Addr = N;
+
+  unsigned Alignment = 0;
+  if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) {
+    // This case occurs only for VLD1-lane/dup and VST1-lane instructions.
+    // The maximum alignment is equal to the memory size being referenced.
+    unsigned LSNAlign = LSN->getAlignment();
+    unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8;
+    if (LSNAlign >= MemSize && MemSize > 1)
+      Alignment = MemSize;
+  } else {
+    // All other uses of addrmode6 are for intrinsics.  For now just record
+    // the raw alignment value; it will be refined later based on the legal
+    // alignment operands for the intrinsic.
+    Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment();
+  }
+
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset) {
+  LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op);
+  ISD::MemIndexedMode AM = LdSt->getAddressingMode();
+  if (AM != ISD::POST_INC)
+    return false;
+  Offset = N;
+  if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) {
+    if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits())
+      Offset = CurDAG->getRegister(0, MVT::i32);
+  }
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
+                                       SDValue &Offset, SDValue &Label) {
+  if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+    Offset = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+                                      MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Thumb Addressing Modes
+//===----------------------------------------------------------------------===//
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
+                                            SDValue &Base, SDValue &Offset){
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
+    ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+    if (!NC || !NC->isNullValue())
+      return false;
+
+    Base = Offset = N;
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base,
+                                       SDValue &Offset, unsigned Scale) {
+  if (Scale == 4) {
+    SDValue TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Thumb does not have [sp, r] address mode.
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP))
+    return false;
+
+  // FIXME: Why do we explicitly check for a match here and then return false?
+  // Presumably to allow something else to match, but shouldn't this be
+  // documented?
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC))
+    return false;
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 1);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 2);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 4);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
+                                          SDValue &Base, SDValue &OffImm) {
+  if (Scale == 4) {
+    SDValue TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+      Base = N.getOperand(0);
+    } else {
+      Base = N;
+    }
+
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP)) {
+    ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0));
+    ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    unsigned LHSC = LHS ? LHS->getZExtValue() : 0;
+    unsigned RHSC = RHS ? RHS->getZExtValue() : 0;
+
+    // Thumb does not have [sp, #imm5] address mode for non-zero imm5.
+    if (LHSC != 0 || RHSC != 0) return false;
+
+    Base = N;
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  // If the RHS is + imm5 * scale, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
+    Base = N.getOperand(0);
+    OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 4, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 2, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 1, Base, OffImm);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI,
+                                       getTargetLowering()->getPointerTy());
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+      (LHSR && LHSR->getReg() == ARM::SP)) {
+    // If the RHS is + imm8 * scale, fold into addr mode.
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                        Thumb 2 Addressing Modes
+//===----------------------------------------------------------------------===//
+
+
+bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg,
+                                                SDValue &Opc) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    ShImmVal = RHS->getZExtValue() & 31;
+    Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal));
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
+      OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+      return true;
+    }
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::TargetConstantPool)
+        return false;  // We want to select t2LDRpci instead.
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    if (SelectT2AddrModeImm8(N, Base, OffImm))
+      // Let t2LDRi8 handle (R - imm8).
+      return false;
+
+    int RHSC = (int)RHS->getZExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  // Match simple R - imm8 operands.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getSExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                                 SDValue &OffImm){
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  int RHSC;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits.
+    OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+      ? CurDAG->getTargetConstant(RHSC, MVT::i32)
+      : CurDAG->getTargetConstant(-RHSC, MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
+                                            SDValue &Base,
+                                            SDValue &OffReg, SDValue &ShImm) {
+  // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+      return false;
+    else if (RHSC < 0 && RHSC >= -255) // 8 bits
+      return false;
+  }
+
+  // Look for (R + R) or (R + (R << [1,2,3])).
+  unsigned ShAmt = 0;
+  Base   = N.getOperand(0);
+  OffReg = N.getOperand(1);
+
+  // Swap if it is ((R << c) + R).
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode());
+  if (ShOpcVal != ARM_AM::lsl) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode());
+    if (ShOpcVal == ARM_AM::lsl)
+      std::swap(Base, OffReg);
+  }
+
+  if (ShOpcVal == ARM_AM::lsl) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt))
+        OffReg = OffReg.getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32);
+
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
+                                                SDValue &OffImm) {
+  // This *must* succeed since it's used for the irreplaceable ldrex and strex
+  // instructions.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+
+  if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N))
+    return true;
+
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS)
+    return true;
+
+  uint32_t RHSC = (int)RHS->getZExtValue();
+  if (RHSC > 1020 || RHSC % 4 != 0)
+    return true;
+
+  Base = N.getOperand(0);
+  if (Base.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy());
+  }
+
+  OffImm = CurDAG->getTargetConstant(RHSC / 4, MVT::i32);
+  return true;
+}
+
+//===--------------------------------------------------------------------===//
+
+/// getAL - Returns a ARMCC::AL immediate node.
+static inline SDValue getAL(SelectionDAG *CurDAG) {
+  return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32);
+}
+
+SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return nullptr;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Offset, AMOpc;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (LoadedVT == MVT::i32 && isPre &&
+      SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_PRE_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 && !isPre &&
+      SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_POST_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 &&
+      SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG;
+    Match = true;
+
+  } else if (LoadedVT == MVT::i16 &&
+             SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+    Match = true;
+    Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+      ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+      : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+  } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+    if (LD->getExtensionType() == ISD::SEXTLOAD) {
+      if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+      }
+    } else {
+      if (isPre &&
+          SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_PRE_IMM;
+      } else if (!isPre &&
+                  SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_POST_IMM;
+      } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG;
+      }
+    }
+  }
+
+  if (Match) {
+    if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, AMOpc, getAL(CurDAG),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+                                    MVT::i32, MVT::Other, Ops);
+    } else {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+                                    MVT::i32, MVT::Other, Ops);
+    }
+  }
+
+  return nullptr;
+}
+
+SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return nullptr;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+  SDValue Offset;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) {
+    switch (LoadedVT.getSimpleVT().SimpleTy) {
+    case MVT::i32:
+      Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
+      break;
+    case MVT::i16:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST;
+      break;
+    case MVT::i8:
+    case MVT::i1:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
+      break;
+    default:
+      return nullptr;
+    }
+    Match = true;
+  }
+
+  if (Match) {
+    SDValue Chain = LD->getChain();
+    SDValue Base = LD->getBasePtr();
+    SDValue Ops[]= { Base, Offset, getAL(CurDAG),
+                     CurDAG->getRegister(0, MVT::i32), Chain };
+    return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+                                  MVT::Other, Ops);
+  }
+
+  return nullptr;
+}
+
+/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
+SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::GPRPairRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form a D register from a pair of S registers.
+SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form a quad register from a pair of D registers.
+SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive D registers from a pair of Q registers.
+SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive S registers.
+SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive D registers.
+SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive Q registers.
+SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
+/// of a NEON VLD or VST instruction.  The supported values depend on the
+/// number of registers being loaded.
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs,
+                                       bool is64BitVector) {
+  unsigned NumRegs = NumVecs;
+  if (!is64BitVector && NumVecs < 3)
+    NumRegs *= 2;
+
+  unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+  if (Alignment >= 32 && NumRegs == 4)
+    Alignment = 32;
+  else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4))
+    Alignment = 16;
+  else if (Alignment >= 8)
+    Alignment = 8;
+  else
+    Alignment = 0;
+
+  return CurDAG->getTargetConstant(Alignment, MVT::i32);
+}
+
+static bool isVLDfixed(unsigned Opc)
+{
+  switch (Opc) {
+  default: return false;
+  case ARM::VLD1d8wb_fixed : return true;
+  case ARM::VLD1d16wb_fixed : return true;
+  case ARM::VLD1d64Qwb_fixed : return true;
+  case ARM::VLD1d32wb_fixed : return true;
+  case ARM::VLD1d64wb_fixed : return true;
+  case ARM::VLD1d64TPseudoWB_fixed : return true;
+  case ARM::VLD1d64QPseudoWB_fixed : return true;
+  case ARM::VLD1q8wb_fixed : return true;
+  case ARM::VLD1q16wb_fixed : return true;
+  case ARM::VLD1q32wb_fixed : return true;
+  case ARM::VLD1q64wb_fixed : return true;
+  case ARM::VLD2d8wb_fixed : return true;
+  case ARM::VLD2d16wb_fixed : return true;
+  case ARM::VLD2d32wb_fixed : return true;
+  case ARM::VLD2q8PseudoWB_fixed : return true;
+  case ARM::VLD2q16PseudoWB_fixed : return true;
+  case ARM::VLD2q32PseudoWB_fixed : return true;
+  case ARM::VLD2DUPd8wb_fixed : return true;
+  case ARM::VLD2DUPd16wb_fixed : return true;
+  case ARM::VLD2DUPd32wb_fixed : return true;
+  }
+}
+
+static bool isVSTfixed(unsigned Opc)
+{
+  switch (Opc) {
+  default: return false;
+  case ARM::VST1d8wb_fixed : return true;
+  case ARM::VST1d16wb_fixed : return true;
+  case ARM::VST1d32wb_fixed : return true;
+  case ARM::VST1d64wb_fixed : return true;
+  case ARM::VST1q8wb_fixed : return true;
+  case ARM::VST1q16wb_fixed : return true;
+  case ARM::VST1q32wb_fixed : return true;
+  case ARM::VST1q64wb_fixed : return true;
+  case ARM::VST1d64TPseudoWB_fixed : return true;
+  case ARM::VST1d64QPseudoWB_fixed : return true;
+  case ARM::VST2d8wb_fixed : return true;
+  case ARM::VST2d16wb_fixed : return true;
+  case ARM::VST2d32wb_fixed : return true;
+  case ARM::VST2q8PseudoWB_fixed : return true;
+  case ARM::VST2q16PseudoWB_fixed : return true;
+  case ARM::VST2q32PseudoWB_fixed : return true;
+  }
+}
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+  assert((isVLDfixed(Opc) || isVSTfixed(Opc))
+    && "Incorrect fixed stride updating instruction.");
+  switch (Opc) {
+  default: break;
+  case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register;
+  case ARM::VLD1d16wb_fixed: return ARM::VLD1d16wb_register;
+  case ARM::VLD1d32wb_fixed: return ARM::VLD1d32wb_register;
+  case ARM::VLD1d64wb_fixed: return ARM::VLD1d64wb_register;
+  case ARM::VLD1q8wb_fixed: return ARM::VLD1q8wb_register;
+  case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register;
+  case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register;
+  case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register;
+  case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register;
+  case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
+  case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
+  case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
+
+  case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
+  case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
+  case ARM::VST1d32wb_fixed: return ARM::VST1d32wb_register;
+  case ARM::VST1d64wb_fixed: return ARM::VST1d64wb_register;
+  case ARM::VST1q8wb_fixed: return ARM::VST1q8wb_register;
+  case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register;
+  case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register;
+  case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register;
+  case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register;
+  case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register;
+
+  case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register;
+  case ARM::VLD2d16wb_fixed: return ARM::VLD2d16wb_register;
+  case ARM::VLD2d32wb_fixed: return ARM::VLD2d32wb_register;
+  case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register;
+  case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register;
+  case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register;
+
+  case ARM::VST2d8wb_fixed: return ARM::VST2d8wb_register;
+  case ARM::VST2d16wb_fixed: return ARM::VST2d16wb_register;
+  case ARM::VST2d32wb_fixed: return ARM::VST2d32wb_register;
+  case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register;
+  case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register;
+  case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register;
+
+  case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register;
+  case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register;
+  case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register;
+  }
+  return Opc; // If not one we handle, return it unchanged.
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return nullptr;
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2i64: OpcodeIndex = 3;
+    assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
+    break;
+  }
+
+  EVT ResTy;
+  if (NumVecs == 1)
+    ResTy = VT;
+  else {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+  }
+  std::vector<EVT> ResTys;
+  ResTys.push_back(ResTy);
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDNode *VLd;
+  SmallVector<SDValue, 7> Ops;
+
+  // Double registers and VLD1/VLD2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
+      // case entirely when the rest are updated to that form, too.
+      if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode()))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for that explicitly too. Horribly hacky, but temporary.
+      if ((NumVecs > 2 && !isVLDfixed(Opc)) ||
+          !isa<ConstantSDNode>(Inc.getNode()))
+        Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  } else {
+    // Otherwise, quad registers are loaded with two separate instructions,
+    // where one loads the even registers and the other loads the odd registers.
+    EVT AddrTy = MemAddr.getValueType();
+
+    // Load the even subregs.  This is always an updating load, so that it
+    // provides the address to the second load for the odd subregs.
+    SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+    const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                          ResTy, AddrTy, MVT::Other, OpsA);
+    Chain = SDValue(VLdA, 2);
+
+    // Load the odd subregs.
+    Ops.push_back(SDValue(VLdA, 1));
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      assert(isa<ConstantSDNode>(Inc.getNode()) &&
+             "only constant post-increment update allowed for VLD3/4");
+      (void)Inc;
+      Ops.push_back(Reg0);
+    }
+    Ops.push_back(SDValue(VLdA, 0));
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops);
+  }
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+  if (NumVecs == 1)
+    return VLd;
+
+  // Extract out the subregisters.
+  SDValue SuperReg = SDValue(VLd, 0);
+  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
+  return nullptr;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return nullptr;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vst type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2i64: OpcodeIndex = 3;
+    assert(NumVecs == 1 && "v2i64 type only supported for VST1");
+    break;
+  }
+
+  std::vector<EVT> ResTys;
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SmallVector<SDValue, 7> Ops;
+
+  // Double registers and VST1/VST2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    SDValue SrcReg;
+    if (NumVecs == 1) {
+      SrcReg = N->getOperand(Vec0Idx);
+    } else if (is64BitVector) {
+      // Form a REG_SEQUENCE to force register allocation.
+      SDValue V0 = N->getOperand(Vec0Idx + 0);
+      SDValue V1 = N->getOperand(Vec0Idx + 1);
+      if (NumVecs == 2)
+        SrcReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+      else {
+        SDValue V2 = N->getOperand(Vec0Idx + 2);
+        // If it's a vst3, form a quad D-register and leave the last part as
+        // an undef.
+        SDValue V3 = (NumVecs == 3)
+          ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
+          : N->getOperand(Vec0Idx + 3);
+        SrcReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+      }
+    } else {
+      // Form a QQ register.
+      SDValue Q0 = N->getOperand(Vec0Idx);
+      SDValue Q1 = N->getOperand(Vec0Idx + 1);
+      SrcReg = SDValue(createQRegPairNode(MVT::v4i64, Q0, Q1), 0);
+    }
+
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
+      // case entirely when the rest are updated to that form, too.
+      if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for that explicitly too. Horribly hacky, but temporary.
+      if  (!isa<ConstantSDNode>(Inc.getNode()))
+        Ops.push_back(Inc);
+      else if (NumVecs > 2 && !isVSTfixed(Opc))
+        Ops.push_back(Reg0);
+    }
+    Ops.push_back(SrcReg);
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+    // Transfer memoperands.
+    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+    return VSt;
+  }
+
+  // Otherwise, quad registers are stored with two separate instructions,
+  // where one stores the even registers and the other stores the odd registers.
+
+  // Form the QQQQ REG_SEQUENCE.
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  SDValue V2 = N->getOperand(Vec0Idx + 2);
+  SDValue V3 = (NumVecs == 3)
+    ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+    : N->getOperand(Vec0Idx + 3);
+  SDValue RegSeq = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+
+  // Store the even D registers.  This is always an updating store, so that it
+  // provides the address to the second store for the odd subregs.
+  const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
+  SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                        MemAddr.getValueType(),
+                                        MVT::Other, OpsA);
+  cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1);
+  Chain = SDValue(VStA, 1);
+
+  // Store the odd D registers.
+  Ops.push_back(SDValue(VStA, 0));
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    assert(isa<ConstantSDNode>(Inc.getNode()) &&
+           "only constant post-increment update allowed for VST3/4");
+    (void)Inc;
+    Ops.push_back(Reg0);
+  }
+  Ops.push_back(RegSeq);
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+  SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                        Ops);
+  cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
+  return VStB;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+                                         bool isUpdating, unsigned NumVecs,
+                                         const uint16_t *DOpcodes,
+                                         const uint16_t *QOpcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return nullptr;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  unsigned Lane =
+    cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld/vst lane type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+    // Quad-register operations:
+  case MVT::v8i16: OpcodeIndex = 0; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 1; break;
+  }
+
+  std::vector<EVT> ResTys;
+  if (IsLoad) {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
+                                      MVT::i64, ResTyElts));
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+  }
+
+  SDValue SuperReg;
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  if (NumVecs == 2) {
+    if (is64BitVector)
+      SuperReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+    else
+      SuperReg = SDValue(createQRegPairNode(MVT::v4i64, V0, V1), 0);
+  } else {
+    SDValue V2 = N->getOperand(Vec0Idx + 2);
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(Vec0Idx + 3);
+    if (is64BitVector)
+      SuperReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+    else
+      SuperReg = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+  }
+  Ops.push_back(SuperReg);
+  Ops.push_back(getI32Imm(Lane));
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+
+  unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                                  QOpcodes[OpcodeIndex]);
+  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+  if (!IsLoad)
+    return VLdLn;
+
+  // Extract the subregisters.
+  SuperReg = SDValue(VLdLn, 0);
+  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+  return nullptr;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+                                      unsigned NumVecs,
+                                      const uint16_t *Opcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+    return nullptr;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld-dup type");
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  }
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDValue SuperReg;
+  unsigned Opc = Opcodes[OpcodeIndex];
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    // fixed-stride update instructions don't have an explicit writeback
+    // operand. It's implicit in the opcode itself.
+    SDValue Inc = N->getOperand(2);
+    if (!isa<ConstantSDNode>(Inc.getNode()))
+      Ops.push_back(Inc);
+    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+    else if (NumVecs > 2)
+      Ops.push_back(Reg0);
+  }
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+
+  unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  std::vector<EVT> ResTys;
+  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+  SuperReg = SDValue(VLdDup, 0);
+
+  // Extract the subregisters.
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  unsigned SubIdx = ARM::dsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+  return nullptr;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+                                    unsigned Opc) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  unsigned FirstTblReg = IsExt ? 2 : 1;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SDValue RegSeq;
+  SDValue V0 = N->getOperand(FirstTblReg + 0);
+  SDValue V1 = N->getOperand(FirstTblReg + 1);
+  if (NumVecs == 2)
+    RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+  else {
+    SDValue V2 = N->getOperand(FirstTblReg + 2);
+    // If it's a vtbl3, form a quad D-register and leave the last part as
+    // an undef.
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(FirstTblReg + 3);
+    RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+  }
+
+  SmallVector<SDValue, 6> Ops;
+  if (IsExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
+  Ops.push_back(getAL(CurDAG)); // predicate
+  Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
+  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+}
+
+SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
+                                                     bool isSigned) {
+  if (!Subtarget->hasV6T2Ops())
+    return nullptr;
+
+  unsigned Opc = isSigned
+    ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
+    : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX);
+
+  // For unsigned extracts, check for a shift right and mask
+  unsigned And_imm = 0;
+  if (N->getOpcode() == ISD::AND) {
+    if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) {
+
+      // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+      if (And_imm & (And_imm + 1))
+        return nullptr;
+
+      unsigned Srl_imm = 0;
+      if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
+                                Srl_imm)) {
+        assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+
+        // Note: The width operand is encoded as width-1.
+        unsigned Width = CountTrailingOnes_32(And_imm) - 1;
+        unsigned LSB = Srl_imm;
+
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+        if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) {
+          // It's cheaper to use a right shift to extract the top bits.
+          if (Subtarget->isThumb()) {
+            Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri;
+            SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                              CurDAG->getTargetConstant(LSB, MVT::i32),
+                              getAL(CurDAG), Reg0, Reg0 };
+            return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+          }
+
+          // ARM models shift instructions as MOVsi with shifter operand.
+          ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL);
+          SDValue ShOpc =
+            CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB),
+                                      MVT::i32);
+          SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
+                            getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+        }
+
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          CurDAG->getTargetConstant(LSB, MVT::i32),
+                          CurDAG->getTargetConstant(Width, MVT::i32),
+                          getAL(CurDAG), Reg0 };
+        return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      }
+    }
+    return nullptr;
+  }
+
+  // Otherwise, we're looking for a shift of a shift
+  unsigned Shl_imm = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+    assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+    unsigned Srl_imm = 0;
+    if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
+      assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+      // Note: The width operand is encoded as width-1.
+      unsigned Width = 32 - Srl_imm - 1;
+      int LSB = Srl_imm - Shl_imm;
+      if (LSB < 0)
+        return nullptr;
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                        CurDAG->getTargetConstant(LSB, MVT::i32),
+                        CurDAG->getTargetConstant(Width, MVT::i32),
+                        getAL(CurDAG), Reg0 };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    }
+  }
+  return nullptr;
+}
+
+/// Target-specific DAG combining for ISD::XOR.
+/// Target-independent combining lowers SELECT_CC nodes of the form
+/// select_cc setg[ge] X,  0,  X, -X
+/// select_cc setgt    X, -1,  X, -X
+/// select_cc setl[te] X,  0, -X,  X
+/// select_cc setlt    X,  1, -X,  X
+/// which represent Integer ABS into:
+/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+/// ARM instruction selection detects the latter and matches it to
+/// ARM::ABS or ARM::t2ABS machine node.
+SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+  SDValue XORSrc0 = N->getOperand(0);
+  SDValue XORSrc1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  if (Subtarget->isThumb1Only())
+    return nullptr;
+
+  if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
+    return nullptr;
+
+  SDValue ADDSrc0 = XORSrc0.getOperand(0);
+  SDValue ADDSrc1 = XORSrc0.getOperand(1);
+  SDValue SRASrc0 = XORSrc1.getOperand(0);
+  SDValue SRASrc1 = XORSrc1.getOperand(1);
+  ConstantSDNode *SRAConstant =  dyn_cast<ConstantSDNode>(SRASrc1);
+  EVT XType = SRASrc0.getValueType();
+  unsigned Size = XType.getSizeInBits() - 1;
+
+  if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 &&
+      XType.isInteger() && SRAConstant != nullptr &&
+      Size == SRAConstant->getZExtValue()) {
+    unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
+    return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+  }
+
+  return nullptr;
+}
+
+SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  EVT VT = N->getValueType(0);
+  if (!VT.is128BitVector() || N->getNumOperands() != 2)
+    llvm_unreachable("unexpected CONCAT_VECTORS");
+  return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
+}
+
+SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return nullptr;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::INLINEASM: {
+    SDNode *ResNode = SelectInlineAsm(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
+  case ISD::XOR: {
+    // Select special operations if XOR node forms integer ABS pattern
+    SDNode *ResNode = SelectABSOp(N);
+    if (ResNode)
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::Constant: {
+    unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+    bool UseCP = true;
+    if (Subtarget->useMovt(*MF))
+      // Thumb2-aware targets have the MOVT instruction, so all immediates can
+      // be done with MOV + MOVT, at worst.
+      UseCP = false;
+    else {
+      if (Subtarget->isThumb()) {
+        UseCP = (Val > 255 &&                                  // MOV
+                 ~Val > 255 &&                                 // MOV + MVN
+                 !ARM_AM::isThumbImmShiftedVal(Val) &&         // MOV + LSL
+                 !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
+      } else
+        UseCP = (ARM_AM::getSOImmVal(Val) == -1 &&             // MOV
+                 ARM_AM::getSOImmVal(~Val) == -1 &&            // MVN
+                 !ARM_AM::isSOImmTwoPartVal(Val) &&            // two instrs.
+                 !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
+    }
+
+    if (UseCP) {
+      SDValue CPIdx =
+        CurDAG->getTargetConstantPool(ConstantInt::get(
+                                  Type::getInt32Ty(*CurDAG->getContext()), Val),
+                                      getTargetLowering()->getPointerTy());
+
+      SDNode *ResNode;
+      if (Subtarget->isThumb()) {
+        SDValue Pred = getAL(CurDAG);
+        SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+        SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+        ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
+                                         Ops);
+      } else {
+        SDValue Ops[] = {
+          CPIdx,
+          CurDAG->getTargetConstant(0, MVT::i32),
+          getAL(CurDAG),
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getEntryNode()
+        };
+        ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+                                       Ops);
+      }
+      ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+      return nullptr;
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::FrameIndex: {
+    // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
+    if (Subtarget->isThumb1Only()) {
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops);
+    } else {
+      unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+                      ARM::t2ADDri : ARM::ADDri);
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    }
+  }
+  case ISD::SRL:
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+      return I;
+    break;
+  case ISD::SRA:
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
+      return I;
+    break;
+  case ISD::MUL:
+    if (Subtarget->isThumb1Only())
+      break;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      unsigned RHSV = C->getZExtValue();
+      if (!RHSV) break;
+      if (isPowerOf2_32(RHSV-1)) {  // 2^n+1?
+        unsigned ShImm = Log2_32(RHSV-1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+        }
+      }
+      if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
+        unsigned ShImm = Log2_32(RHSV+1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+        }
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check for unsigned bitfield extract
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+      return I;
+
+    // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+    // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+    // are entirely contributed by c2 and lower 16-bits are entirely contributed
+    // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+    // Select it to: "movt x, ((c1 & 0xffff) >> 16)
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+      }
+    }
+    break;
+  }
+  case ARMISD::VMOVRRD:
+    return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+                                  N->getOperand(0), getAL(CurDAG),
+                                  CurDAG->getRegister(0, MVT::i32));
+  case ISD::UMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                    ARM::UMULL : ARM::UMULLv5,
+                                    dl, MVT::i32, MVT::i32, Ops);
+    }
+  }
+  case ISD::SMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                    ARM::SMULL : ARM::SMULLv5,
+                                    dl, MVT::i32, MVT::i32, Ops);
+    }
+  }
+  case ARMISD::UMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops);
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                      ARM::UMLAL : ARM::UMLALv5,
+                                      dl, MVT::i32, MVT::i32, Ops);
+    }
+  }
+  case ARMISD::SMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops);
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                      ARM::SMLAL : ARM::SMLALv5,
+                                      dl, MVT::i32, MVT::i32, Ops);
+    }
+  }
+  case ISD::LOAD: {
+    SDNode *ResNode = nullptr;
+    if (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ResNode = SelectT2IndexedLoad(N);
+    else
+      ResNode = SelectARMIndexedLoad(N);
+    if (ResNode)
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  }
+  case ARMISD::BRCOND: {
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    unsigned Opc = Subtarget->isThumb() ?
+      ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
+    SDValue Chain = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    SDValue InFlag = N->getOperand(4);
+    assert(N1.getOpcode() == ISD::BasicBlock);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getZExtValue()),
+                               MVT::i32);
+    SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+    SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+                                             MVT::Glue, Ops);
+    Chain = SDValue(ResNode, 0);
+    if (N->getNumValues() == 2) {
+      InFlag = SDValue(ResNode, 1);
+      ReplaceUses(SDValue(N, 1), InFlag);
+    }
+    ReplaceUses(SDValue(N, 0),
+                SDValue(Chain.getNode(), Chain.getResNo()));
+    return nullptr;
+  }
+  case ARMISD::VZIP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return nullptr;
+    case MVT::v8i8:  Opc = ARM::VZIPd8; break;
+    case MVT::v4i16: Opc = ARM::VZIPd16; break;
+    case MVT::v2f32:
+    // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VZIPq8; break;
+    case MVT::v8i16: Opc = ARM::VZIPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VZIPq32; break;
+    }
+    SDValue Pred = getAL(CurDAG);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+  }
+  case ARMISD::VUZP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return nullptr;
+    case MVT::v8i8:  Opc = ARM::VUZPd8; break;
+    case MVT::v4i16: Opc = ARM::VUZPd16; break;
+    case MVT::v2f32:
+    // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VUZPq8; break;
+    case MVT::v8i16: Opc = ARM::VUZPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VUZPq32; break;
+    }
+    SDValue Pred = getAL(CurDAG);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+  }
+  case ARMISD::VTRN: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return nullptr;
+    case MVT::v8i8:  Opc = ARM::VTRNd8; break;
+    case MVT::v4i16: Opc = ARM::VTRNd16; break;
+    case MVT::v2f32:
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VTRNq8; break;
+    case MVT::v8i16: Opc = ARM::VTRNq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VTRNq32; break;
+    }
+    SDValue Pred = getAL(CurDAG);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+  }
+  case ARMISD::BUILD_VECTOR: {
+    EVT VecVT = N->getValueType(0);
+    EVT EltVT = VecVT.getVectorElementType();
+    unsigned NumElts = VecVT.getVectorNumElements();
+    if (EltVT == MVT::f64) {
+      assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
+      return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+    }
+    assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
+    if (NumElts == 2)
+      return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+    assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
+    return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+                     N->getOperand(2), N->getOperand(3));
+  }
+
+  case ARMISD::VLD2DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+                                        ARM::VLD2DUPd32 };
+    return SelectVLDDup(N, false, 2, Opcodes);
+  }
+
+  case ARMISD::VLD3DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
+                                        ARM::VLD3DUPd16Pseudo,
+                                        ARM::VLD3DUPd32Pseudo };
+    return SelectVLDDup(N, false, 3, Opcodes);
+  }
+
+  case ARMISD::VLD4DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
+                                        ARM::VLD4DUPd16Pseudo,
+                                        ARM::VLD4DUPd32Pseudo };
+    return SelectVLDDup(N, false, 4, Opcodes);
+  }
+
+  case ARMISD::VLD2DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
+                                        ARM::VLD2DUPd16wb_fixed,
+                                        ARM::VLD2DUPd32wb_fixed };
+    return SelectVLDDup(N, true, 2, Opcodes);
+  }
+
+  case ARMISD::VLD3DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+                                        ARM::VLD3DUPd16Pseudo_UPD,
+                                        ARM::VLD3DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 3, Opcodes);
+  }
+
+  case ARMISD::VLD4DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+                                        ARM::VLD4DUPd16Pseudo_UPD,
+                                        ARM::VLD4DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 4, Opcodes);
+  }
+
+  case ARMISD::VLD1_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed,
+                                         ARM::VLD1d16wb_fixed,
+                                         ARM::VLD1d32wb_fixed,
+                                         ARM::VLD1d64wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed,
+                                         ARM::VLD1q16wb_fixed,
+                                         ARM::VLD1q32wb_fixed,
+                                         ARM::VLD1q64wb_fixed };
+    return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+  }
+
+  case ARMISD::VLD2_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
+                                         ARM::VLD2d16wb_fixed,
+                                         ARM::VLD2d32wb_fixed,
+                                         ARM::VLD1q64wb_fixed};
+    static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
+                                         ARM::VLD2q16PseudoWB_fixed,
+                                         ARM::VLD2q32PseudoWB_fixed };
+    return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+  }
+
+  case ARMISD::VLD3_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD,
+                                         ARM::VLD3d16Pseudo_UPD,
+                                         ARM::VLD3d32Pseudo_UPD,
+                                         ARM::VLD1d64TPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                                          ARM::VLD3q16Pseudo_UPD,
+                                          ARM::VLD3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+                                          ARM::VLD3q16oddPseudo_UPD,
+                                          ARM::VLD3q32oddPseudo_UPD };
+    return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VLD4_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
+                                         ARM::VLD4d16Pseudo_UPD,
+                                         ARM::VLD4d32Pseudo_UPD,
+                                         ARM::VLD1d64QPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                                          ARM::VLD4q16Pseudo_UPD,
+                                          ARM::VLD4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+                                          ARM::VLD4q16oddPseudo_UPD,
+                                          ARM::VLD4q32oddPseudo_UPD };
+    return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VLD2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD,
+                                         ARM::VLD2LNd16Pseudo_UPD,
+                                         ARM::VLD2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+                                         ARM::VLD2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VLD3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD,
+                                         ARM::VLD3LNd16Pseudo_UPD,
+                                         ARM::VLD3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+                                         ARM::VLD3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VLD4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD,
+                                         ARM::VLD4LNd16Pseudo_UPD,
+                                         ARM::VLD4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+                                         ARM::VLD4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST1_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed,
+                                         ARM::VST1d16wb_fixed,
+                                         ARM::VST1d32wb_fixed,
+                                         ARM::VST1d64wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed,
+                                         ARM::VST1q16wb_fixed,
+                                         ARM::VST1q32wb_fixed,
+                                         ARM::VST1q64wb_fixed };
+    return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+  }
+
+  case ARMISD::VST2_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
+                                         ARM::VST2d16wb_fixed,
+                                         ARM::VST2d32wb_fixed,
+                                         ARM::VST1q64wb_fixed};
+    static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
+                                         ARM::VST2q16PseudoWB_fixed,
+                                         ARM::VST2q32PseudoWB_fixed };
+    return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+  }
+
+  case ARMISD::VST3_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD,
+                                         ARM::VST3d16Pseudo_UPD,
+                                         ARM::VST3d32Pseudo_UPD,
+                                         ARM::VST1d64TPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                          ARM::VST3q16Pseudo_UPD,
+                                          ARM::VST3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+                                          ARM::VST3q16oddPseudo_UPD,
+                                          ARM::VST3q32oddPseudo_UPD };
+    return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST4_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
+                                         ARM::VST4d16Pseudo_UPD,
+                                         ARM::VST4d32Pseudo_UPD,
+                                         ARM::VST1d64QPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                          ARM::VST4q16Pseudo_UPD,
+                                          ARM::VST4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+                                          ARM::VST4q16oddPseudo_UPD,
+                                          ARM::VST4q32oddPseudo_UPD };
+    return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
+                                         ARM::VST2LNd16Pseudo_UPD,
+                                         ARM::VST2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+                                         ARM::VST2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD,
+                                         ARM::VST3LNd16Pseudo_UPD,
+                                         ARM::VST3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+                                         ARM::VST3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD,
+                                         ARM::VST4LNd16Pseudo_UPD,
+                                         ARM::VST4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+                                         ARM::VST4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+  }
+
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_ldaexd:
+    case Intrinsic::arm_ldrexd: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue MemAddr = N->getOperand(2);
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+
+      bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
+      unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
+                                : (IsAcquire ? ARM::LDAEXD : ARM::LDREXD);
+
+      // arm_ldrexd returns a i64 value in {i32, i32}
+      std::vector<EVT> ResTys;
+      if (isThumb) {
+        ResTys.push_back(MVT::i32);
+        ResTys.push_back(MVT::i32);
+      } else
+        ResTys.push_back(MVT::Untyped);
+      ResTys.push_back(MVT::Other);
+
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+      SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+      // Remap uses.
+      SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
+      if (!SDValue(N, 0).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 0);
+        else {
+          SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode,0);
+        }
+        ReplaceUses(SDValue(N, 0), Result);
+      }
+      if (!SDValue(N, 1).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 1);
+        else {
+          SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_1, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode,0);
+        }
+        ReplaceUses(SDValue(N, 1), Result);
+      }
+      ReplaceUses(SDValue(N, 2), OutChain);
+      return nullptr;
+    }
+    case Intrinsic::arm_stlexd:
+    case Intrinsic::arm_strexd: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue Val0 = N->getOperand(2);
+      SDValue Val1 = N->getOperand(3);
+      SDValue MemAddr = N->getOperand(4);
+
+      // Store exclusive double return a i32 value which is the return status
+      // of the issued store.
+      EVT ResTys[] = { MVT::i32, MVT::Other };
+
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      if (isThumb) {
+        Ops.push_back(Val0);
+        Ops.push_back(Val1);
+      } else
+        // arm_strexd uses GPRPair.
+        Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, Val0, Val1), 0));
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+
+      bool IsRelease = IntNo == Intrinsic::arm_stlexd;
+      unsigned NewOpc = isThumb ? (IsRelease ? ARM::t2STLEXD : ARM::t2STREXD)
+                                : (IsRelease ? ARM::STLEXD : ARM::STREXD);
+
+      SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      return St;
+    }
+
+    case Intrinsic::arm_neon_vld1: {
+      static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
+                                           ARM::VLD1d32, ARM::VLD1d64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+                                           ARM::VLD1q32, ARM::VLD1q64};
+      return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+    }
+
+    case Intrinsic::arm_neon_vld2: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+                                           ARM::VLD2d32, ARM::VLD1q64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
+                                           ARM::VLD2q32Pseudo };
+      return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+    }
+
+    case Intrinsic::arm_neon_vld3: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo,
+                                           ARM::VLD3d16Pseudo,
+                                           ARM::VLD3d32Pseudo,
+                                           ARM::VLD1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                                            ARM::VLD3q16Pseudo_UPD,
+                                            ARM::VLD3q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
+                                            ARM::VLD3q16oddPseudo,
+                                            ARM::VLD3q32oddPseudo };
+      return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld4: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo,
+                                           ARM::VLD4d16Pseudo,
+                                           ARM::VLD4d32Pseudo,
+                                           ARM::VLD1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                                            ARM::VLD4q16Pseudo_UPD,
+                                            ARM::VLD4q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
+                                            ARM::VLD4q16oddPseudo,
+                                            ARM::VLD4q32oddPseudo };
+      return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld2lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
+                                           ARM::VLD2LNd16Pseudo,
+                                           ARM::VLD2LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
+                                           ARM::VLD2LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vld3lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo,
+                                           ARM::VLD3LNd16Pseudo,
+                                           ARM::VLD3LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
+                                           ARM::VLD3LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vld4lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo,
+                                           ARM::VLD4LNd16Pseudo,
+                                           ARM::VLD4LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
+                                           ARM::VLD4LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vst1: {
+      static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16,
+                                           ARM::VST1d32, ARM::VST1d64 };
+      static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+                                           ARM::VST1q32, ARM::VST1q64 };
+      return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+    }
+
+    case Intrinsic::arm_neon_vst2: {
+      static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+                                           ARM::VST2d32, ARM::VST1q64 };
+      static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+                                     ARM::VST2q32Pseudo };
+      return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+    }
+
+    case Intrinsic::arm_neon_vst3: {
+      static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+                                           ARM::VST3d16Pseudo,
+                                           ARM::VST3d32Pseudo,
+                                           ARM::VST1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                            ARM::VST3q16Pseudo_UPD,
+                                            ARM::VST3q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+                                            ARM::VST3q16oddPseudo,
+                                            ARM::VST3q32oddPseudo };
+      return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst4: {
+      static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+                                           ARM::VST4d16Pseudo,
+                                           ARM::VST4d32Pseudo,
+                                           ARM::VST1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                            ARM::VST4q16Pseudo_UPD,
+                                            ARM::VST4q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+                                            ARM::VST4q16oddPseudo,
+                                            ARM::VST4q32oddPseudo };
+      return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst2lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+                                           ARM::VST2LNd16Pseudo,
+                                           ARM::VST2LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+                                           ARM::VST2LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vst3lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+                                           ARM::VST3LNd16Pseudo,
+                                           ARM::VST3LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+                                           ARM::VST3LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vst4lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+                                           ARM::VST4LNd16Pseudo,
+                                           ARM::VST4LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+                                           ARM::VST4LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_neon_vtbl2:
+      return SelectVTBL(N, false, 2, ARM::VTBL2);
+    case Intrinsic::arm_neon_vtbl3:
+      return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+    case Intrinsic::arm_neon_vtbl4:
+      return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+
+    case Intrinsic::arm_neon_vtbx2:
+      return SelectVTBL(N, true, 2, ARM::VTBX2);
+    case Intrinsic::arm_neon_vtbx3:
+      return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+    case Intrinsic::arm_neon_vtbx4:
+      return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+    }
+    break;
+  }
+
+  case ARMISD::VTBL1: {
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+    SmallVector<SDValue, 6> Ops;
+
+    Ops.push_back(N->getOperand(0));
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops);
+  }
+  case ARMISD::VTBL2: {
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+
+    // Form a REG_SEQUENCE to force register allocation.
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(RegSeq);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops);
+  }
+
+  case ISD::CONCAT_VECTORS:
+    return SelectConcatVector(N);
+  }
+
+  return SelectCode(N);
+}
+
+SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint.
+  // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require
+  // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
+  // respectively. Since there is no constraint to explicitly specify a
+  // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb,
+  // the 64-bit data may be referred by H, Q, R modifiers, so we still pack
+  // them into a GPRPair.
+
+  SDLoc dl(N);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+                                   : SDValue(nullptr,0);
+
+  SmallVector<bool, 8> OpChanged;
+  // Glue node will be appended late.
+  for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
+    SDValue op = N->getOperand(i);
+    AsmNodeOperands.push_back(op);
+
+    if (i < InlineAsm::Op_FirstOperand)
+      continue;
+
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+      Flag = C->getZExtValue();
+      Kind = InlineAsm::getKind(Flag);
+    }
+    else
+      continue;
+
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
+    if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+      IsTiedToChangedOp = OpChanged[DefIdx];
+
+    if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+        && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+      continue;
+
+    unsigned RC;
+    bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+    if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID))
+        || NumRegs != 2)
+      continue;
+
+    assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
+    SDValue V0 = N->getOperand(i+1);
+    SDValue V1 = N->getOperand(i+2);
+    unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+    unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+    SDValue PairedReg;
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    if (Kind == InlineAsm::Kind_RegDef ||
+        Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+      // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+      // the original GPRs.
+
+      unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+      SDValue Chain = SDValue(N,0);
+
+      SDNode *GU = N->getGluedUser();
+      SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped,
+                                               Chain.getValue(1));
+
+      // Extract values from a GPRPair reg and copy to the original GPR reg.
+      SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+                                        RegCopy.getValue(1));
+      SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+      // Update the original glue user.
+      std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+      Ops.push_back(T1.getValue(1));
+      CurDAG->UpdateNodeOperands(GU, Ops);
+      GU = T1.getNode();
+    }
+    else {
+      // For Kind  == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+      // GPRPair and then pass the GPRPair to the inline asm.
+      SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+      // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+      SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+                                          Chain.getValue(1));
+      SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+                                          T0.getValue(1));
+      SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0);
+
+      // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+      // i32 VRs of inline asm with it.
+      unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+      Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+      AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+      Glue = Chain.getValue(1);
+    }
+
+    Changed = true;
+
+    if(PairedReg.getNode()) {
+      OpChanged[OpChanged.size() -1 ] = true;
+      Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+      if (IsTiedToChangedOp)
+        Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+      else
+        Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+      // Replace the current flag.
+      AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+          Flag, MVT::i32);
+      // Add the new register node and skip the original two GPRs.
+      AsmNodeOperands.push_back(PairedReg);
+      // Skip the next two GPRs.
+      i += 2;
+    }
+  }
+
+  if (Glue.getNode())
+    AsmNodeOperands.push_back(Glue);
+  if (!Changed)
+    return nullptr;
+
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+      CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+  New->setNodeId(-1);
+  return New.getNode();
+}
+
+
+bool ARMDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  // Require the address to be in a register.  That is safe for all ARM
+  // variants and it is hard to do anything much smarter without knowing
+  // how the operand is used.
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// createARMISelDag - This pass converts a legalized DAG into a
+/// ARM-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new ARMDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 0000000..a76531a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,10949 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMISelLowering.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
+
+cl::opt<bool>
+EnableARMLongCalls("arm-long-calls", cl::Hidden,
+  cl::desc("Generate calls via indirect call instructions"),
+  cl::init(false));
+
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+  cl::desc("Enable / disable ARM interworking (for debugging only)"),
+  cl::init(true));
+
+namespace {
+  class ARMCCState : public CCState {
+  public:
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
+               LLVMContext &C, ParmContext PC)
+        : CCState(CC, isVarArg, MF, TM, locs, C) {
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call"
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
+// The APCS parameter registers.
+static const MCPhysReg GPRArgRegs[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3
+};
+
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+  }
+
+  MVT ElemTy = VT.getVectorElementType();
+  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
+    setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  if (ElemTy == MVT::i32) {
+    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+  } else {
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+  }
+  setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+  setOperationAction(ISD::SELECT,            VT, Expand);
+  setOperationAction(ISD::SELECT_CC,         VT, Expand);
+  setOperationAction(ISD::VSELECT,           VT, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+  if (VT.isInteger()) {
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+  }
+
+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR,  VT, Promote);
+    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+  }
+
+  // Neon does not support vector divide/remainder operations.
+  setOperationAction(ISD::SDIV, VT, Expand);
+  setOperationAction(ISD::UDIV, VT, Expand);
+  setOperationAction(ISD::FDIV, VT, Expand);
+  setOperationAction(ISD::SREM, VT, Expand);
+  setOperationAction(ISD::UREM, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);
+}
+
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::DPRRegClass);
+  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+}
+
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::DPairRegClass);
+  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+}
+
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO())
+    return new TargetLoweringObjectFileMachO();
+  if (TT.isOSWindows())
+    return new TargetLoweringObjectFileCOFF();
+  return new ARMElfTargetObjectFile();
+}
+
+ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  RegInfo = TM.getRegisterInfo();
+  Itins = TM.getInstrItineraryData();
+
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  if (Subtarget->isTargetMachO()) {
+    // Uses VFP for Thumb libfuncs if available.
+    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+        Subtarget->hasARMOps() && !TM.Options.UseSoftFloat) {
+      // Single-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
+      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
+      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
+      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
+
+      // Double-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
+      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
+      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
+      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
+
+      // Single-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
+      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
+      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
+      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
+      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
+      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
+      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
+      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
+
+      // Double-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
+      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
+      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
+      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
+      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
+      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
+      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
+      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
+
+      // Floating-point to integer conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
+      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
+
+      // Conversions between floating types.
+      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
+      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
+
+      // Integer to floating-point conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+      // e.g., __floatunsidf vs. __floatunssidfvfp.
+      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
+      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
+    }
+  }
+
+  // These libcalls are not available in 32-bit.
+  setLibcallName(RTLIB::SHL_I128, nullptr);
+  setLibcallName(RTLIB::SRL_I128, nullptr);
+  setLibcallName(RTLIB::SRA_I128, nullptr);
+
+  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
+      !Subtarget->isTargetWindows()) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+      const CallingConv::ID CC;
+      const ISD::CondCode Cond;
+    } LibraryCalls[] = {
+      // Double-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 2
+      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Double-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 3
+      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+      // Single-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 4
+      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Single-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 5
+      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+      // Floating-point to integer conversions.
+      // RTABI chapter 4.1.2, Table 6
+      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Conversions between floating types.
+      // RTABI chapter 4.1.2, Table 7
+      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Integer to floating-point conversions.
+      // RTABI chapter 4.1.2, Table 8
+      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Long long helper functions
+      // RTABI chapter 4.2, Table 9
+      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Integer division functions
+      // RTABI chapter 4.3.1
+      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Memory operations
+      // RTABI chapter 4.3.4
+      { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+      setLibcallCallingConv(LC.Op, LC.CC);
+      if (LC.Cond != ISD::SETCC_INVALID)
+        setCmpLibcallCC(LC.Op, LC.Cond);
+    }
+  }
+
+  if (Subtarget->isTargetWindows()) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+      const CallingConv::ID CC;
+    } LibraryCalls[] = {
+      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+      setLibcallCallingConv(LC.Op, LC.CC);
+    }
+  }
+
+  // Use divmod compiler-rt calls for iOS 5.0 and later.
+  if (Subtarget->getTargetTriple().isiOS() &&
+      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
+    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
+
+  if (Subtarget->isThumb1Only())
+    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
+  else
+    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
+    if (!Subtarget->isFPOnlySP())
+      addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+  }
+
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction((MVT::SimpleValueType)VT,
+                          (MVT::SimpleValueType)InnerVT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+
+    setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+  }
+
+  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+  if (Subtarget->hasNEON()) {
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+
+    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+    // neither Neon nor VFP support any arithmetic operations on it.
+    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+    // supported for v4f32.
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    // FIXME: Code duplication: FDIV and FREM are expanded always, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+    // FIXME: Create unittest.
+    // In another words, find a way when "copysign" appears in DAG with vector
+    // operands.
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+    // FIXME: Code duplication: SETCC has custom operation action, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FNEG and for FABS.
+    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+
+    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
+
+    // Mark v2f32 intrinsics.
+    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
+    // Neon does not support some operations on v1i64 and v2i64 types.
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+    // Custom handling for some quad-vector types to detect VMULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    // Custom handling for some vector types to avoid expensive expansions
+    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
+    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
+    // a destination type that is wider than the source, and nor does
+    // it have a FP_TO_[SU]INT instruction with a narrower destination than
+    // source.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+
+    setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
+    setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
+
+    // NEON does not have single instruction CTPOP for vectors with element
+    // types wider than 8-bits.  However, custom lowering can leverage the
+    // v8i8/v16i8 vcnt instruction.
+    setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
+
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::SELECT_CC);
+    setTargetDAGCombine(ISD::BUILD_VECTOR);
+    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
+    setTargetDAGCombine(ISD::FP_TO_SINT);
+    setTargetDAGCombine(ISD::FP_TO_UINT);
+    setTargetDAGCombine(ISD::FDIV);
+
+    // It is legal to extload from v4i8 to v4i16 or v4i32.
+    MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
+                  MVT::v4i16, MVT::v2i16,
+                  MVT::v2i32};
+    for (unsigned i = 0; i < 6; ++i) {
+      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
+      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
+    }
+  }
+
+  // ARM and Thumb2 support UMLAL/SMLAL.
+  if (!Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::ADDC);
+
+
+  computeRegisterProperties();
+
+  // ARM does not have floating-point extending loads.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+
+  // ... or truncating stores
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
+  // ARM does not have i1 sign extending load.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // ARM supports all 4 flavors of integer indexed load / store.
+  if (!Subtarget->isThumb1Only()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im,  MVT::i1,  Legal);
+      setIndexedLoadAction(im,  MVT::i8,  Legal);
+      setIndexedLoadAction(im,  MVT::i16, Legal);
+      setIndexedLoadAction(im,  MVT::i32, Legal);
+      setIndexedStoreAction(im, MVT::i1,  Legal);
+      setIndexedStoreAction(im, MVT::i8,  Legal);
+      setIndexedStoreAction(im, MVT::i16, Legal);
+      setIndexedStoreAction(im, MVT::i32, Legal);
+    }
+  }
+
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+
+  // i64 operation support.
+  setOperationAction(ISD::MUL,     MVT::i64, Expand);
+  setOperationAction(ISD::MULHU,   MVT::i32, Expand);
+  if (Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  }
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
+      || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
+    setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL,       MVT::i64, Custom);
+  setOperationAction(ISD::SRA,       MVT::i64, Custom);
+
+  if (!Subtarget->isThumb1Only()) {
+    // FIXME: We should do this for Thumb1 as well.
+    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
+    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
+  }
+
+  // ARM does not have ROTL.
+  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+  // These just redirect to CTTZ and CTLZ on ARM.
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
+
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
+  // Only ARMv6 has BSWAP.
+  if (!Subtarget->hasV6Ops())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
+      !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+    // These are expanded into libcalls if the cpu doesn't have HW divider.
+    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
+    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+  }
+
+  // FIXME: Also set divmod for SREM on EABI
+  setOperationAction(ISD::SREM,  MVT::i32, Expand);
+  setOperationAction(ISD::UREM,  MVT::i32, Expand);
+  // Register based DivRem for AEABI (RTABI 4.2)
+  if (Subtarget->isTargetAEABI()) {
+    setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
+    setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
+
+    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
+
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  } else {
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  }
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
+  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+
+  if (!Subtarget->isTargetMachO()) {
+    // Non-MachO platforms may return values in these registers via the
+    // personality function.
+    setExceptionPointerRegister(ARM::R0);
+    setExceptionSelectorRegister(ARM::R1);
+  }
+
+  if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
+  // the default expansion.
+  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
+    // to ldrex/strex loops already.
+    setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
+
+    // On v8, we have particularly efficient implementations of atomic fences
+    // if they can be combined with nearby atomic loads and stores.
+    if (!Subtarget->hasV8Ops()) {
+      // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+      setInsertFencesForAtomic(true);
+    }
+  } else {
+    // If there's anything we can use as a barrier, go through custom lowering
+    // for ATOMIC_FENCE.
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
+                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
+    // Set them all for expansion, which will force libcalls.
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
+    // Unordered/Monotonic case.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+  }
+
+  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
+
+  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
+  if (!Subtarget->hasV6Ops()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+  }
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
+    // iff target supports vfp2.
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  if (Subtarget->isTargetDarwin()) {
+    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+  }
+
+  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
+  setOperationAction(ISD::SELECT,    MVT::i32, Custom);
+  setOperationAction(ISD::SELECT,    MVT::f32, Custom);
+  setOperationAction(ISD::SELECT,    MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
+  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
+
+  // We don't support sin/cos/fmod/copysign/pow
+  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
+  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
+  setOperationAction(ISD::FREM,      MVT::f64, Expand);
+  setOperationAction(ISD::FREM,      MVT::f32, Expand);
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+  }
+  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
+  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
+
+  if (!Subtarget->hasVFP4()) {
+    setOperationAction(ISD::FMA, MVT::f64, Expand);
+    setOperationAction(ISD::FMA, MVT::f32, Expand);
+  }
+
+  // Various VFP goodness
+  if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
+    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
+    if (Subtarget->hasVFP2()) {
+      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    }
+
+    // v8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasV8Ops()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+    }
+
+    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
+    if (!Subtarget->hasFP16()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+    }
+  }
+
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+      // For iOS, we don't want to the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
+
+  if (Subtarget->hasV6Ops())
+    setTargetDAGCombine(ISD::SRL);
+
+  setStackPointerRegisterToSaveRestore(ARM::SP);
+
+  if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() ||
+      !Subtarget->hasVFP2())
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  //// temporary - rewrite interface to use type
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+
+  // On ARM arguments smaller than 4 bytes are extended, so all arguments
+  // are at least 4 bytes aligned.
+  setMinStackArgumentAlignment(4);
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+  PredictableSelectIsExpensive = Subtarget->isLikeA9();
+
+  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+}
+
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of it's super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurrence of cross class copies and subregister insertions
+// and extractions.
+std::pair<const TargetRegisterClass*, uint8_t>
+ARMTargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  // Use DPR as representative register class for all floating point
+  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
+  // the cost is 1 for both f32 and f64.
+  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
+  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
+    RRC = &ARM::DPRRegClass;
+    // When NEON is used for SP, only half of the register file is available
+    // because operations that define both SP and DP results will be constrained
+    // to the VFP2 class (D0-D15). We currently model this constraint prior to
+    // coalescing by double-counting the SP regs. See the FIXME above.
+    if (Subtarget->useNEONForSinglePrecisionFP())
+      Cost = 2;
+    break;
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 2;
+    break;
+  case MVT::v4i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 4;
+    break;
+  case MVT::v8i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 8;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
+  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
+  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
+  case ARMISD::CALL:          return "ARMISD::CALL";
+  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
+  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
+  case ARMISD::tCALL:         return "ARMISD::tCALL";
+  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
+  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
+  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
+  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
+  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
+  case ARMISD::CMP:           return "ARMISD::CMP";
+  case ARMISD::CMN:           return "ARMISD::CMN";
+  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
+  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
+  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
+  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
+  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
+
+  case ARMISD::CMOV:          return "ARMISD::CMOV";
+
+  case ARMISD::RBIT:          return "ARMISD::RBIT";
+
+  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
+  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
+  case ARMISD::SITOF:         return "ARMISD::SITOF";
+  case ARMISD::UITOF:         return "ARMISD::UITOF";
+
+  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
+  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
+  case ARMISD::RRX:           return "ARMISD::RRX";
+
+  case ARMISD::ADDC:          return "ARMISD::ADDC";
+  case ARMISD::ADDE:          return "ARMISD::ADDE";
+  case ARMISD::SUBC:          return "ARMISD::SUBC";
+  case ARMISD::SUBE:          return "ARMISD::SUBE";
+
+  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
+  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
+
+  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
+  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
+
+  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
+
+  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
+
+  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
+
+  case ARMISD::WIN__CHKSTK:   return "ARMISD:::WIN__CHKSTK";
+
+  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
+  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
+  case ARMISD::VCGE:          return "ARMISD::VCGE";
+  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
+  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
+  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
+  case ARMISD::VCGT:          return "ARMISD::VCGT";
+  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
+  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
+  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
+  case ARMISD::VTST:          return "ARMISD::VTST";
+
+  case ARMISD::VSHL:          return "ARMISD::VSHL";
+  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
+  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
+  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
+  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
+  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
+  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
+  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
+  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
+  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
+  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
+  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
+  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
+  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
+  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
+  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
+  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
+  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
+  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
+  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
+  case ARMISD::VDUP:          return "ARMISD::VDUP";
+  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
+  case ARMISD::VEXT:          return "ARMISD::VEXT";
+  case ARMISD::VREV64:        return "ARMISD::VREV64";
+  case ARMISD::VREV32:        return "ARMISD::VREV32";
+  case ARMISD::VREV16:        return "ARMISD::VREV16";
+  case ARMISD::VZIP:          return "ARMISD::VZIP";
+  case ARMISD::VUZP:          return "ARMISD::VUZP";
+  case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
+  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
+  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
+  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
+  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
+  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
+  case ARMISD::FMAX:          return "ARMISD::FMAX";
+  case ARMISD::FMIN:          return "ARMISD::FMIN";
+  case ARMISD::VMAXNM:        return "ARMISD::VMAX";
+  case ARMISD::VMINNM:        return "ARMISD::VMIN";
+  case ARMISD::BFI:           return "ARMISD::BFI";
+  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
+  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
+  case ARMISD::VBSL:          return "ARMISD::VBSL";
+  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
+  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
+  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
+  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
+  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
+  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
+  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
+  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
+  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
+  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
+  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
+  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
+  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
+  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
+  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
+  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
+  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
+  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
+  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
+  }
+}
+
+EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector()) return getPointerTy();
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// getRegClassFor - Return the register class that should be used for the
+/// specified value type.
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
+  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
+  // load / store 4 to 8 consecutive D registers.
+  if (Subtarget->hasNEON()) {
+    if (VT == MVT::v4i64)
+      return &ARM::QQPRRegClass;
+    if (VT == MVT::v8i64)
+      return &ARM::QQQQPRRegClass;
+  }
+  return TargetLowering::getRegClassFor(VT);
+}
+
+// Create a fast isel object.
+FastISel *
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return ARM::createFastISel(funcInfo, libInfo);
+}
+
+/// getMaximalGlobalOffset - Returns the maximal possible offset which can
+/// be used for loads / stores from the global.
+unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
+  return (Subtarget->isThumb1Only() ? 127 : 4095);
+}
+
+Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
+  unsigned NumVals = N->getNumValues();
+  if (!NumVals)
+    return Sched::RegPressure;
+
+  for (unsigned i = 0; i != NumVals; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue || VT == MVT::Other)
+      continue;
+    if (VT.isFloatingPoint() || VT.isVector())
+      return Sched::ILP;
+  }
+
+  if (!N->isMachineOpcode())
+    return Sched::RegPressure;
+
+  // Load are scheduled for latency even if there instruction itinerary
+  // is not available.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+
+  if (MCID.getNumDefs() == 0)
+    return Sched::RegPressure;
+  if (!Itins->isEmpty() &&
+      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
+    return Sched::ILP;
+
+  return Sched::RegPressure;
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:  return ARMCC::NE;
+  case ISD::SETEQ:  return ARMCC::EQ;
+  case ISD::SETGT:  return ARMCC::GT;
+  case ISD::SETGE:  return ARMCC::GE;
+  case ISD::SETLT:  return ARMCC::LT;
+  case ISD::SETLE:  return ARMCC::LE;
+  case ISD::SETUGT: return ARMCC::HI;
+  case ISD::SETUGE: return ARMCC::HS;
+  case ISD::SETULT: return ARMCC::LO;
+  case ISD::SETULE: return ARMCC::LS;
+  }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                        ARMCC::CondCodes &CondCode2) {
+  CondCode2 = ARMCC::AL;
+  switch (CC) {
+  default: llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+  case ISD::SETGT:
+  case ISD::SETOGT: CondCode = ARMCC::GT; break;
+  case ISD::SETGE:
+  case ISD::SETOGE: CondCode = ARMCC::GE; break;
+  case ISD::SETOLT: CondCode = ARMCC::MI; break;
+  case ISD::SETOLE: CondCode = ARMCC::LS; break;
+  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+  case ISD::SETO:   CondCode = ARMCC::VC; break;
+  case ISD::SETUO:  CondCode = ARMCC::VS; break;
+  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+  case ISD::SETUGT: CondCode = ARMCC::HI; break;
+  case ISD::SETUGE: CondCode = ARMCC::PL; break;
+  case ISD::SETLT:
+  case ISD::SETULT: CondCode = ARMCC::LT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: CondCode = ARMCC::LE; break;
+  case ISD::SETNE:
+  case ISD::SETUNE: CondCode = ARMCC::NE; break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "ARMGenCallingConv.inc"
+
+/// getEffectiveCallingConv - Get the effective calling convention, taking into
+/// account presence of floating point hardware and calling convention
+/// limitations, such as support for variadic functions.
+CallingConv::ID
+ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
+                                           bool isVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::ARM_AAPCS:
+  case CallingConv::ARM_APCS:
+  case CallingConv::GHC:
+    return CC;
+  case CallingConv::ARM_AAPCS_VFP:
+    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
+  case CallingConv::C:
+    if (!Subtarget->isAAPCS_ABI())
+      return CallingConv::ARM_APCS;
+    else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
+             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
+             !isVarArg)
+      return CallingConv::ARM_AAPCS_VFP;
+    else
+      return CallingConv::ARM_AAPCS;
+  case CallingConv::Fast:
+    if (!Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+        return CallingConv::Fast;
+      return CallingConv::ARM_APCS;
+    } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+      return CallingConv::ARM_AAPCS_VFP;
+    else
+      return CallingConv::ARM_AAPCS;
+  }
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// CallingConvention.
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+                                                 bool Return,
+                                                 bool isVarArg) const {
+  switch (getEffectiveCallingConv(CC, isVarArg)) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  case CallingConv::ARM_AAPCS_VFP:
+    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+  case CallingConv::Fast:
+    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+  case CallingConv::GHC:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   SDLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals,
+                                   bool isThisReturn, SDValue ThisVal) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+  CCInfo.AnalyzeCallResult(Ins,
+                           CCAssignFnForNode(CallConv, /* Return*/ true,
+                                             isVarArg));
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
+
+    SDValue Val;
+    if (VA.needsCustom()) {
+      // Handle f64 or half of a v2f64.
+      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Lo.getValue(1);
+      InFlag = Lo.getValue(2);
+      VA = RVLocs[++i]; // skip ahead to next loc
+      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Hi.getValue(1);
+      InFlag = Hi.getValue(2);
+      if (!Subtarget->isLittle())
+        std::swap (Lo, Hi);
+      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(0, MVT::i32));
+
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Lo.getValue(1);
+        InFlag = Lo.getValue(2);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Hi.getValue(1);
+        InFlag = Hi.getValue(2);
+        if (!Subtarget->isLittle())
+          std::swap (Lo, Hi);
+        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(1, MVT::i32));
+      }
+    } else {
+      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+                               InFlag);
+      Chain = Val.getValue(1);
+      InFlag = Val.getValue(2);
+    }
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack.
+SDValue
+ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
+                                    SDValue StackPtr, SDValue Arg,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    ISD::ArgFlagsTy Flags) const {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+  return DAG.getStore(Chain, dl, Arg, PtrOff,
+                      MachinePointerInfo::getStack(LocMemOffset),
+                      false, false, 0);
+}
+
+void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
+                                         SDValue Chain, SDValue &Arg,
+                                         RegsToPassVector &RegsToPass,
+                                         CCValAssign &VA, CCValAssign &NextVA,
+                                         SDValue &StackPtr,
+                                         SmallVectorImpl<SDValue> &MemOpChains,
+                                         ISD::ArgFlagsTy Flags) const {
+
+  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
+  unsigned id = Subtarget->isLittle() ? 0 : 1;
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
+
+  if (NextVA.isRegLoc())
+    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
+  else {
+    assert(NextVA.isMemLoc());
+    if (!StackPtr.getNode())
+      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
+                                           dl, DAG, NextVA,
+                                           Flags));
+  }
+}
+
+/// LowerCall - Lowering a call into a callseq_start <-
+/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDValue
+ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                          = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool doesNotRet                       = CLI.DoesNotReturn;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool isThisReturn   = false;
+  bool isSibCall      = false;
+
+  // Disable tail calls if they're not supported.
+  if (!Subtarget->supportsTailCall() || MF.getTarget().Options.DisableTailCalls)
+    isTailCall = false;
+
+  if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
+                                                   Outs, OutVals, Ins, DAG);
+    if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+    // We don't support GuaranteedTailCallOpt for ARM, only automatically
+    // detected sibcalls.
+    if (isTailCall) {
+      ++NumTailCalls;
+      isSibCall = true;
+    }
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+  CCInfo.AnalyzeCallOperands(Outs,
+                             CCAssignFnForNode(CallConv, /* Return*/ false,
+                                               isVarArg));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // For tail calls, memory operands are available in our caller's stack.
+  if (isSibCall)
+    NumBytes = 0;
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!isSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                                 dl);
+
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+  RegsToPassVector RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization, arguments are handled later.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(0, MVT::i32));
+        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(1, MVT::i32));
+
+        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
+                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+        VA = ArgLocs[++i]; // skip ahead to next loc
+        if (VA.isRegLoc()) {
+          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
+                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+        } else {
+          assert(VA.isMemLoc());
+
+          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+                                                 dl, DAG, VA, Flags));
+        }
+      } else {
+        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+                         StackPtr, MemOpChains, Flags);
+      }
+    } else if (VA.isRegLoc()) {
+      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
+        assert(VA.getLocVT() == MVT::i32 &&
+               "unexpected calling convention register assignment");
+        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
+               "unexpected use of 'returned'");
+        isThisReturn = true;
+      }
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else if (isByVal) {
+      assert(VA.isMemLoc());
+      unsigned offset = 0;
+
+      // True if this byval aggregate will be split between registers
+      // and memory.
+      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
+      unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
+
+      if (CurByValIdx < ByValArgsCount) {
+
+        unsigned RegBegin, RegEnd;
+        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
+
+        EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+        unsigned int i, j;
+        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
+          SDValue Const = DAG.getConstant(4*i, MVT::i32);
+          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false,
+                                     DAG.InferPtrAlignment(AddArg));
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(j, Load));
+        }
+
+        // If parameter size outsides register area, "offset" value
+        // helps us to calculate stack slot for remained part properly.
+        offset = RegEnd - RegBegin;
+
+        CCInfo.nextInRegsParam();
+      }
+
+      if (Flags.getByValSize() > 4*offset) {
+        unsigned LocMemOffset = VA.getLocMemOffset();
+        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                                  StkPtrOff);
+        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
+        SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
+                                           MVT::i32);
+        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
+
+        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
+        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
+                                          Ops));
+      }
+    } else if (!isSibCall) {
+      assert(VA.isMemLoc());
+
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  // Tail call byval lowering might overwrite argument registers so in case of
+  // tail call optimization the copies to registers are lowered later.
+  if (!isTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag = SDValue();
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool isDirect = false;
+  bool isARMFunc = false;
+  bool isLocalARMFunc = false;
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  if (EnableARMLongCalls) {
+    assert((Subtarget->isTargetWindows() ||
+            getTargetMachine().getRelocationModel() == Reloc::Static) &&
+           "long-calls with non-static relocation model!");
+    // Handle a global address or an external symbol. If it's not one of
+    // those, the target's already in a register, so we don't need to do
+    // anything extra.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      // Create a constant pool entry for the callee address
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+
+      // Get the address of the callee into a register
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+
+      // Create a constant pool entry for the callee address
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 0);
+      // Get the address of the callee into a register
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    isDirect = true;
+    bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
+    bool isStub = (isExt && Subtarget->isTargetMachO()) &&
+                   getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // ARM call to a local ARM function is predicable.
+    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
+    // tBX takes a register source operand.
+    if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
+      Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
+                           DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
+    } else if (Subtarget->isTargetCOFF()) {
+      assert(Subtarget->isTargetWindows() &&
+             "Windows is the only supported COFF target");
+      unsigned TargetFlags = GV->hasDLLImportStorageClass()
+                                 ? ARMII::MO_DLLIMPORT
+                                 : ARMII::MO_NO_FLAG;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
+                                          TargetFlags);
+      if (GV->hasDLLImportStorageClass())
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                             DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
+                                         Callee), MachinePointerInfo::getGOT(),
+                             false, false, false, 0);
+    } else {
+      // On ELF targets for PIC code, direct calls should go through the PLT
+      unsigned OpFlags = 0;
+      if (Subtarget->isTargetELF() &&
+          getTargetMachine().getRelocationModel() == Reloc::PIC_)
+        OpFlags = ARMII::MO_PLT;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    isDirect = true;
+    bool isStub = Subtarget->isTargetMachO() &&
+                  getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // tBX takes a register source operand.
+    const char *Sym = S->getSymbol();
+    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 4);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+                           getPointerTy(), Callee, PICLabel);
+    } else {
+      unsigned OpFlags = 0;
+      // On ELF targets for PIC code, direct calls should go through the PLT
+      if (Subtarget->isTargetELF() &&
+                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+        OpFlags = ARMII::MO_PLT;
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
+    }
+  }
+
+  // FIXME: handle tail calls differently.
+  unsigned CallOpc;
+  bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::MinSize);
+  if (Subtarget->isThumb()) {
+    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+  } else {
+    if (!isDirect && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+               // Emit regular call when code size is the priority
+               !HasMinSizeAttr)
+      // "mov lr, pc; b _foo" to avoid confusing the RSP
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  if (!isTailCall) {
+    const uint32_t *Mask;
+    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+    if (isThisReturn) {
+      // For 'this' returns, use the R0-preserving mask if applicable
+      Mask = ARI->getThisReturnPreservedMask(CallConv);
+      if (!Mask) {
+        // Set isThisReturn to false if the calling convention is not one that
+        // allows 'returned' to be modeled in this way, so LowerCallResult does
+        // not try to pass 'this' straight through
+        isThisReturn = false;
+        Mask = ARI->getCallPreservedMask(CallConv);
+      }
+    } else
+      Mask = ARI->getCallPreservedMask(CallConv);
+
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (isTailCall)
+    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, isThisReturn,
+                         isThisReturn ? OutVals[0] : SDValue());
+}
+
+/// HandleByVal - Every parameter *after* a byval parameter is passed
+/// on the stack.  Remember the next parameter register to allocate,
+/// and then confiscate the rest of the parameter registers to insure
+/// this.
+void
+ARMTargetLowering::HandleByVal(
+    CCState *State, unsigned &size, unsigned Align) const {
+  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
+  assert((State->getCallOrPrologue() == Prologue ||
+          State->getCallOrPrologue() == Call) &&
+         "unhandled ParmContext");
+
+  if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
+    if (Subtarget->isAAPCS_ABI() && Align > 4) {
+      unsigned AlignInRegs = Align / 4;
+      unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
+      for (unsigned i = 0; i < Waste; ++i)
+        reg = State->AllocateReg(GPRArgRegs, 4);
+    }
+    if (reg != 0) {
+      unsigned excess = 4 * (ARM::R4 - reg);
+
+      // Special case when NSAA != SP and parameter size greater than size of
+      // all remained GPR regs. In that case we can't split parameter, we must
+      // send it to stack. We also must set NCRN to R4, so waste all
+      // remained registers.
+      const unsigned NSAAOffset = State->getNextStackOffset();
+      if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
+        while (State->AllocateReg(GPRArgRegs, 4))
+          ;
+        return;
+      }
+
+      // First register for byval parameter is the first register that wasn't
+      // allocated before this method call, so it would be "reg".
+      // If parameter is small enough to be saved in range [reg, r4), then
+      // the end (first after last) register would be reg + param-size-in-regs,
+      // else parameter would be splitted between registers and stack,
+      // end register would be r4 in this case.
+      unsigned ByValRegBegin = reg;
+      unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
+      State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
+      // Note, first register is allocated in the beginning of function already,
+      // allocate remained amount of registers we need.
+      for (unsigned i = reg+1; i != ByValRegEnd; ++i)
+        State->AllocateReg(GPRArgRegs, 4);
+      // A byval parameter that is split between registers and memory needs its
+      // size truncated here.
+      // In the case where the entire structure fits in registers, we set the
+      // size in memory to zero.
+      if (size < excess)
+        size = 0;
+      else
+        size -= excess;
+    }
+  }
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const TargetInstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))
+        return false;
+    } else {
+      return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Do not sibcall optimize vararg calls unless the call site is not passing
+  // any arguments.
+  if (isVarArg && !Outs.empty())
+    return false;
+
+  // Exception-handling functions need a special set of instructions to indicate
+  // a return to the hardware. Tail-calling another function would probably
+  // break this.
+  if (CallerF->hasFnAttribute("interrupt"))
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
+  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
+  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
+  // support in the assembler and linker to be used. This would need to be
+  // fixed to fully support tail calls in Thumb1.
+  //
+  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+  // LR.  This means if we need to reload LR, it takes an extra instructions,
+  // which outweighs the value of the tail call; but here we don't know yet
+  // whether LR is going to be used.  Probably the right approach is to
+  // generate the tail call here and turn it back into CALL/RET in
+  // emitEpilogue if LR is used.
+
+  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
+  // but we need to make sure there are enough registers; the only valid
+  // registers are the 4 used for parameters.  We don't currently do this
+  // case.
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // If Caller's vararg or byval argument has been split between registers and
+  // stack, do not perform tail call, since part of the argument is in caller's
+  // local frame.
+  const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
+                                      getInfo<ARMFunctionInfo>();
+  if (AFI_Caller->getArgRegsSaveSize())
+    return false;
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+    CCInfo.AnalyzeCallOperands(Outs,
+                               CCAssignFnForNode(CalleeCC, false, isVarArg));
+    if (CCInfo.getNextStackOffset()) {
+      MachineFunction &MF = DAG.getMachineFunction();
+
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+           i != e;
+           ++i, ++realArgIdx) {
+        CCValAssign &VA = ArgLocs[i];
+        EVT RegVT = VA.getLocVT();
+        SDValue Arg = OutVals[realArgIdx];
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (VA.needsCustom()) {
+          // f64 and vector types are split into multiple registers or
+          // register/stack-slot combinations.  The types will not match
+          // the registers; give up on memory f64 refs until we figure
+          // out what to do about this.
+          if (!VA.isRegLoc())
+            return false;
+          if (!ArgLocs[++i].isRegLoc())
+            return false;
+          if (RegVT == MVT::v2f64) {
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+          }
+        } else if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
+                                                    isVarArg));
+}
+
+static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                    SDLoc DL, SelectionDAG &DAG) {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+
+  StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
+
+  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
+  // version of the "preferred return address". These offsets affect the return
+  // instruction if this is a return from PL1 without hypervisor extensions.
+  //    IRQ/FIQ: +4     "subs pc, lr, #4"
+  //    SWI:     0      "subs pc, lr, #0"
+  //    ABORT:   +4     "subs pc, lr, #4"
+  //    UNDEF:   +4/+2  "subs pc, lr, #0"
+  // UNDEF varies depending on where the exception came from ARM or Thumb
+  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
+
+  int64_t LROffset;
+  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
+      IntKind == "ABORT")
+    LROffset = 4;
+  else if (IntKind == "SWI" || IntKind == "UNDEF")
+    LROffset = 0;
+  else
+    report_fatal_error("Unsupported interrupt attribute. If present, value "
+                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
+
+  RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false));
+
+  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+
+  // Analyze outgoing return values.
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
+                                               isVarArg));
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  bool isLittleEndian = Subtarget->isLittle();
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        // Extract the first half and return it in two registers.
+        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                   DAG.getConstant(0, MVT::i32));
+        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
+                                 Flag);
+        Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
+                                 Flag);
+        Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+        VA = RVLocs[++i]; // skip ahead to next loc
+
+        // Extract the 2nd half and fall through to handle it as an f64 value.
+        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                          DAG.getConstant(1, MVT::i32));
+      }
+      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
+      // available.
+      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                               fmrrd.getValue(isLittleEndian ? 0 : 1),
+                               Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+      VA = RVLocs[++i]; // skip ahead to next loc
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                               fmrrd.getValue(isLittleEndian ? 1 : 0),
+                               Flag);
+    } else
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+
+    // Guarantee that all emitted copies are
+    // stuck together, avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  // CPUs which aren't M-class use a special sequence to return from
+  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
+  // though we use "subs pc, lr, #N").
+  //
+  // M-class CPUs actually use a normal return sequence with a special
+  // (hardware-provided) value in LR, so the normal code path works.
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
+      !Subtarget->isMClass()) {
+    if (Subtarget->isThumb1Only())
+      report_fatal_error("interrupt attribute is not supported in Thumb1");
+    return LowerInterruptReturn(RetOps, dl, DAG);
+  }
+
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
+    SDNode *VMov = Copy;
+    // f64 returned in a pair of GPRs.
+    SmallPtrSet<SDNode*, 2> Copies;
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      if (UI->getOpcode() != ISD::CopyToReg)
+        return false;
+      Copies.insert(*UI);
+    }
+    if (Copies.size() > 2)
+      return false;
+
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      SDValue UseChain = UI->getOperand(0);
+      if (Copies.count(UseChain.getNode()))
+        // Second CopyToReg
+        Copy = *UI;
+      else
+        // First CopyToReg
+        TCChain = UseChain;
+    }
+  } else if (Copy->getOpcode() == ISD::BITCAST) {
+    // f32 returned in a single GPR.
+    if (!Copy->hasOneUse())
+      return false;
+    Copy = *Copy->use_begin();
+    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else {
+    return false;
+  }
+
+  bool HasRet = false;
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != ARMISD::RET_FLAG &&
+        UI->getOpcode() != ARMISD::INTRET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!Subtarget->supportsTailCall())
+    return false;
+
+  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+    return false;
+
+  return !Subtarget->isThumb1Only();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOVi.
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  // FIXME there is no actual debug info here
+  SDLoc dl(Op);
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry())
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment());
+  else
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment());
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
+}
+
+unsigned ARMTargetLowering::getJumpTableEncoding() const {
+  return MachineJumpTableInfo::EK_Inline;
+}
+
+SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = 0;
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  SDValue CPAddr;
+  if (RelocM == Reloc::Static) {
+    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
+  } else {
+    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMPCLabelIndex = AFI->createPICLabelUId();
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
+                                      ARMCP::CPBlockAddress, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
+  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
+                               MachinePointerInfo::getConstantPool(),
+                               false, false, false, 0);
+  if (RelocM == Reloc::Static)
+    return Result;
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+SDValue
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  EVT PtrVT = getPointerTy();
+  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
+  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
+  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+  SDValue Chain = Argument.getValue(1);
+
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
+
+  // call __tls_get_addr.
+  ArgListTy Args;
+  ArgListEntry Entry;
+  Entry.Node = Argument;
+  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
+  Args.push_back(Entry);
+
+  // FIXME: is there useful debug info available here?
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+               DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
+               0);
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model.
+SDValue
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                        SelectionDAG &DAG,
+                                        TLSModel::Model model) const {
+  const GlobalValue *GV = GA->getGlobal();
+  SDLoc dl(GA);
+  SDValue Offset;
+  SDValue Chain = DAG.getEntryNode();
+  EVT PtrVT = getPointerTy();
+  // Get the Thread Pointer
+  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+
+  if (model == TLSModel::InitialExec) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    // Initial exec model.
+    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
+                                      true);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+    Chain = Offset.getValue(1);
+
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
+
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+  } else {
+    // local exec model
+    assert(model == TLSModel::LocalExec);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+  // TODO: implement the "local dynamic" model
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  switch (model) {
+    case TLSModel::GeneralDynamic:
+    case TLSModel::LocalDynamic:
+      return LowerToTLSGeneralDynamicModel(GA, DAG);
+    case TLSModel::InitialExec:
+    case TLSModel::LocalExec:
+      return LowerToTLSExecModels(GA, DAG, model);
+  }
+  llvm_unreachable("bogus TLS model");
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+  SDLoc dl(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV,
+                                      UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                                 CPAddr,
+                                 MachinePointerInfo::getConstantPool(),
+                                 false, false, false, 0);
+    SDValue Chain = Result.getValue(1);
+    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
+    if (!UseGOTOFF)
+      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+                           MachinePointerInfo::getGOT(),
+                           false, false, false, 0);
+    return Result;
+  }
+
+  // If we have T2 ops, we can materialize the address directly via movt/movw
+  // pair. This is always cheaper.
+  if (Subtarget->useMovt(DAG.getMachineFunction())) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+  } else {
+    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, 0);
+  }
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+  SDLoc dl(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+
+  if (Subtarget->useMovt(DAG.getMachineFunction()))
+    ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into multiple nodes
+  unsigned Wrapper =
+      RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper;
+
+  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
+  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
+
+  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  return Result;
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
+  assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+         "Windows on ARM expects to use movw/movt");
+
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const ARMII::TOF TargetFlags =
+    (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
+  EVT PtrVT = getPointerTy();
+  SDValue Result;
+  SDLoc DL(Op);
+
+  ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into two nodes.
+  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+                                                  TargetFlags));
+  if (GV->hasDLLImportStorageClass())
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  return Result;
+}
+
+SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() &&
+         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+  EVT PtrVT = getPointerTy();
+  SDLoc dl(Op);
+  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
+                                  ARMPCLabelIndex, PCAdj);
+  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                               MachinePointerInfo::getConstantPool(),
+                               false, false, false, 0);
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Val = DAG.getConstant(0, MVT::i32);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
+                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+                     Op.getOperand(1), Val);
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
+}
+
+SDValue
+ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                          const ARMSubtarget *Subtarget) const {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  case Intrinsic::arm_rbit: {
+    assert(Op.getOperand(1).getValueType() == MVT::i32 &&
+           "RBIT intrinsic must have i32 type!");
+    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
+  }
+  case Intrinsic::arm_thread_pointer: {
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+  }
+  case Intrinsic::eh_sjlj_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    EVT PtrVT = getPointerTy();
+    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+    SDValue CPAddr;
+    unsigned PCAdj = (RelocM != Reloc::PIC_)
+      ? 0 : (Subtarget->isThumb() ? 4 : 8);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
+                                      ARMCP::CPLSDA, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                  MachinePointerInfo::getConstantPool(),
+                  false, false, false, 0);
+
+    if (RelocM == Reloc::PIC_) {
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+    }
+    return Result;
+  }
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
+      ? ARMISD::VMULLs : ARMISD::VMULLu;
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+  }
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
+                                 const ARMSubtarget *Subtarget) {
+  // FIXME: handle "fence singlethread" more efficiently.
+  SDLoc dl(Op);
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
+  unsigned Domain = ARM_MB::ISH;
+  if (Subtarget->isMClass()) {
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = ARM_MB::SY;
+  } else if (Subtarget->isSwift() && Ord == Release) {
+    // Swift happens to implement ISHST barriers in a way that's compatible with
+    // Release semantics but weaker than ISH so we'd be fools not to use
+    // it. Beware: other processors probably don't!
+    Domain = ARM_MB::ISHST;
+  }
+
+  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(Intrinsic::arm_dmb, MVT::i32),
+                     DAG.getConstant(Domain, MVT::i32));
+}
+
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  if (!(Subtarget->isThumb2() ||
+        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  SDLoc dl(Op);
+  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
+  if (!isRead &&
+      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
+    // ARMv7 with MP extension has PLDW.
+    return Op.getOperand(0);
+
+  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (Subtarget->isThumb()) {
+    // Invert the bits.
+    isRead = ~isRead & 1;
+    isData = ~isData & 1;
+  }
+
+  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
+                     DAG.getConstant(isData, MVT::i32));
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDLoc dl(Op);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+SDValue
+ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                        SDValue &Root, SelectionDAG &DAG,
+                                        SDLoc dl) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const TargetRegisterClass *RC;
+  if (AFI->isThumb1OnlyFunction())
+    RC = &ARM::tGPRRegClass;
+  else
+    RC = &ARM::GPRRegClass;
+
+  // Transform the arguments stored in physical registers into virtual ones.
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+
+  SDValue ArgValue2;
+  if (NextVA.isMemLoc()) {
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
+
+    // Create load node to retrieve arguments from the stack.
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
+                            MachinePointerInfo::getFixedStack(FI),
+                            false, false, false, 0);
+  } else {
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+  }
+  if (!Subtarget->isLittle())
+    std::swap (ArgValue, ArgValue2);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
+}
+
+void
+ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
+                                  unsigned InRegsParamRecordIdx,
+                                  unsigned ArgSize,
+                                  unsigned &ArgRegsSize,
+                                  unsigned &ArgRegsSaveSize)
+  const {
+  unsigned NumGPRs;
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    unsigned RBegin, REnd;
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+    NumGPRs = REnd - RBegin;
+  } else {
+    unsigned int firstUnalloced;
+    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
+                                                sizeof(GPRArgRegs) /
+                                                sizeof(GPRArgRegs[0]));
+    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
+  }
+
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  ArgRegsSize = NumGPRs * 4;
+
+  // If parameter is split between stack and GPRs...
+  if (NumGPRs && Align > 4 &&
+      (ArgRegsSize < ArgSize ||
+        InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
+    // Add padding for part of param recovered from GPRs.  For example,
+    // if Align == 8, its last byte must be at address K*8 - 1.
+    // We need to do it, since remained (stack) part of parameter has
+    // stack alignment, and we need to "attach" "GPRs head" without gaps
+    // to it:
+    // Stack:
+    // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
+    // [ [padding] [GPRs head] ] [        Tail passed via stack       ....
+    //
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned Padding =
+        OffsetToAlignment(ArgRegsSize + AFI->getArgRegsSaveSize(), Align);
+    ArgRegsSaveSize = ArgRegsSize + Padding;
+  } else
+    // We don't need to extend regs save size for byval parameters if they
+    // are passed via GPRs only.
+    ArgRegsSaveSize = ArgRegsSize;
+}
+
+// The remaining GPRs hold either the beginning of variable-argument
+// data, or the beginning of an aggregate passed by value (usually
+// byval).  Either way, we allocate stack slots adjacent to the data
+// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with
+// these values; otherwise, this reassembles a (byval) structure that
+// was split between registers and memory.
+// Return: The frame index registers were stored into.
+int
+ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+                                  SDLoc dl, SDValue &Chain,
+                                  const Value *OrigArg,
+                                  unsigned InRegsParamRecordIdx,
+                                  unsigned OffsetFromOrigArg,
+                                  unsigned ArgOffset,
+                                  unsigned ArgSize,
+                                  bool ForceMutable,
+                                  unsigned ByValStoreOffset,
+                                  unsigned TotalArgRegsSaveSize) const {
+
+  // Currently, two use-cases possible:
+  // Case #1. Non-var-args function, and we meet first byval parameter.
+  //          Setup first unallocated register as first byval register;
+  //          eat all remained registers
+  //          (these two actions are performed by HandleByVal method).
+  //          Then, here, we initialize stack frame with
+  //          "store-reg" instructions.
+  // Case #2. Var-args function, that doesn't contain byval parameters.
+  //          The same: eat all remained unallocated registers,
+  //          initialize stack frame.
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned firstRegToSaveIndex, lastRegToSaveIndex;
+  unsigned RBegin, REnd;
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+    firstRegToSaveIndex = RBegin - ARM::R0;
+    lastRegToSaveIndex = REnd - ARM::R0;
+  } else {
+    firstRegToSaveIndex = CCInfo.getFirstUnallocated
+      (GPRArgRegs, array_lengthof(GPRArgRegs));
+    lastRegToSaveIndex = 4;
+  }
+
+  unsigned ArgRegsSize, ArgRegsSaveSize;
+  computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
+                 ArgRegsSize, ArgRegsSaveSize);
+
+  // Store any by-val regs to their spots on the stack so that they may be
+  // loaded by deferencing the result of formal parameter pointer or va_next.
+  // Note: once stack area for byval/varargs registers
+  // was initialized, it can't be initialized again.
+  if (ArgRegsSaveSize) {
+    unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
+
+    if (Padding) {
+      assert(AFI->getStoredByValParamsPadding() == 0 &&
+             "The only parameter may be padded.");
+      AFI->setStoredByValParamsPadding(Padding);
+    }
+
+    int FrameIndex = MFI->CreateFixedObject(ArgRegsSaveSize,
+                                            Padding +
+                                              ByValStoreOffset -
+                                              (int64_t)TotalArgRegsSaveSize,
+                                            false);
+    SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
+    if (Padding) {
+       MFI->CreateFixedObject(Padding,
+                              ArgOffset + ByValStoreOffset -
+                                (int64_t)ArgRegsSaveSize,
+                              false);
+    }
+
+    SmallVector<SDValue, 4> MemOps;
+    for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
+         ++firstRegToSaveIndex, ++i) {
+      const TargetRegisterClass *RC;
+      if (AFI->isThumb1OnlyFunction())
+        RC = &ARM::tGPRRegClass;
+      else
+        RC = &ARM::GPRRegClass;
+
+      unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+      SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                     MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
+                     false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+                        DAG.getConstant(4, getPointerTy()));
+    }
+
+    AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
+
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+    return FrameIndex;
+  } else {
+    if (ArgSize == 0) {
+      // We cannot allocate a zero-byte object for the first variadic argument,
+      // so just make up a size.
+      ArgSize = 4;
+    }
+    // This will point to the next argument passed via stack.
+    return MFI->CreateFixedObject(
+      ArgSize, ArgOffset, !ForceMutable);
+  }
+}
+
+// Setup stack frame, the va_list pointer will start from.
+void
+ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                        SDLoc dl, SDValue &Chain,
+                                        unsigned ArgOffset,
+                                        unsigned TotalArgRegsSaveSize,
+                                        bool ForceMutable) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Try to store any remaining integer argument regs
+  // to their spots on the stack so that they may be loaded by deferencing
+  // the result of va_next.
+  // If there is no regs to be stored, just point address after last
+  // argument passed via stack.
+  int FrameIndex =
+    StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
+                   CCInfo.getInRegsParamsCount(), 0, ArgOffset, 0, ForceMutable,
+                   0, TotalArgRegsSaveSize);
+
+  AFI->setVarArgsFrameIndex(FrameIndex);
+}
+
+SDValue
+ARMTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        SDLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
+  CCInfo.AnalyzeFormalArguments(Ins,
+                                CCAssignFnForNode(CallConv, /* Return*/ false,
+                                                  isVarArg));
+
+  SmallVector<SDValue, 16> ArgValues;
+  int lastInsIndex = -1;
+  SDValue ArgValue;
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+
+  // Initially ArgRegsSaveSize is zero.
+  // Then we increase this value each time we meet byval parameter.
+  // We also increase this value in case of varargs function.
+  AFI->setArgRegsSaveSize(0);
+
+  unsigned ByValStoreOffset = 0;
+  unsigned TotalArgRegsSaveSize = 0;
+  unsigned ArgRegsSaveSizeMaxAlign = 4;
+
+  // Calculate the amount of stack space that we need to allocate to store
+  // byval and variadic arguments that are passed in registers.
+  // We need to know this before we allocate the first byval or variadic
+  // argument, as they will be allocated a stack slot below the CFA (Canonical
+  // Frame Address, the stack pointer at entry to the function).
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isMemLoc()) {
+      int index = VA.getValNo();
+      if (index != lastInsIndex) {
+        ISD::ArgFlagsTy Flags = Ins[index].Flags;
+        if (Flags.isByVal()) {
+          unsigned ExtraArgRegsSize;
+          unsigned ExtraArgRegsSaveSize;
+          computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProceed(),
+                         Flags.getByValSize(),
+                         ExtraArgRegsSize, ExtraArgRegsSaveSize);
+
+          TotalArgRegsSaveSize += ExtraArgRegsSaveSize;
+          if (Flags.getByValAlign() > ArgRegsSaveSizeMaxAlign)
+              ArgRegsSaveSizeMaxAlign = Flags.getByValAlign();
+          CCInfo.nextInRegsParam();
+        }
+        lastInsIndex = index;
+      }
+    }
+  }
+  CCInfo.rewindByValRegsInfo();
+  lastInsIndex = -1;
+  if (isVarArg) {
+    unsigned ExtraArgRegsSize;
+    unsigned ExtraArgRegsSaveSize;
+    computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0,
+                   ExtraArgRegsSize, ExtraArgRegsSaveSize);
+    TotalArgRegsSaveSize += ExtraArgRegsSaveSize;
+  }
+  // If the arg regs save area contains N-byte aligned values, the
+  // bottom of it must be at least N-byte aligned.
+  TotalArgRegsSaveSize = RoundUpToAlignment(TotalArgRegsSaveSize, ArgRegsSaveSizeMaxAlign);
+  TotalArgRegsSaveSize = std::min(TotalArgRegsSaveSize, 16U);
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+
+      if (VA.needsCustom()) {
+        // f64 and vector types are split up into multiple registers or
+        // combinations of registers and stack slots.
+        if (VA.getLocVT() == MVT::v2f64) {
+          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Chain, DAG, dl);
+          VA = ArgLocs[++i]; // skip ahead to next loc
+          SDValue ArgValue2;
+          if (VA.isMemLoc()) {
+            int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
+            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
+                                    MachinePointerInfo::getFixedStack(FI),
+                                    false, false, false, 0);
+          } else {
+            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                             Chain, DAG, dl);
+          }
+          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
+        } else
+          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+
+      } else {
+        const TargetRegisterClass *RC;
+
+        if (RegVT == MVT::f32)
+          RC = &ARM::SPRRegClass;
+        else if (RegVT == MVT::f64)
+          RC = &ARM::DPRRegClass;
+        else if (RegVT == MVT::v2f64)
+          RC = &ARM::QPRRegClass;
+        else if (RegVT == MVT::i32)
+          RC = AFI->isThumb1OnlyFunction() ?
+            (const TargetRegisterClass*)&ARM::tGPRRegClass :
+            (const TargetRegisterClass*)&ARM::GPRRegClass;
+        else
+          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+        // Transform the arguments in physical registers into virtual ones.
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      }
+
+      // If this is an 8 or 16-bit value, it is really passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::SExt:
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::ZExt:
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
+
+      int index = ArgLocs[i].getValNo();
+
+      // Some Ins[] entries become multiple ArgLoc[] entries.
+      // Process them only once.
+      if (index != lastInsIndex)
+        {
+          ISD::ArgFlagsTy Flags = Ins[index].Flags;
+          // FIXME: For now, all byval parameter objects are marked mutable.
+          // This can be changed with more analysis.
+          // In case of tail call optimization mark all arguments mutable.
+          // Since they could be overwritten by lowering of arguments in case of
+          // a tail call.
+          if (Flags.isByVal()) {
+            unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
+
+            ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign());
+            int FrameIndex = StoreByValRegs(
+                CCInfo, DAG, dl, Chain, CurOrigArg,
+                CurByValIndex,
+                Ins[VA.getValNo()].PartOffset,
+                VA.getLocMemOffset(),
+                Flags.getByValSize(),
+                true /*force mutable frames*/,
+                ByValStoreOffset,
+                TotalArgRegsSaveSize);
+            ByValStoreOffset += Flags.getByValSize();
+            ByValStoreOffset = std::min(ByValStoreOffset, 16U);
+            InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
+            CCInfo.nextInRegsParam();
+          } else {
+            unsigned FIOffset = VA.getLocMemOffset();
+            int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
+                                            FIOffset, true);
+
+            // Create load nodes to retrieve arguments from the stack.
+            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                         MachinePointerInfo::getFixedStack(FI),
+                                         false, false, false, 0));
+          }
+          lastInsIndex = index;
+        }
+    }
+  }
+
+  // varargs
+  if (isVarArg)
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
+                         CCInfo.getNextStackOffset(),
+                         TotalArgRegsSaveSize);
+
+  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+
+  return Chain;
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isPosZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
+      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+          return CFP->getValueAPF().isPosZero();
+    }
+  }
+  return false;
+}
+
+/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.
+SDValue
+ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &ARMcc, SelectionDAG &DAG,
+                             SDLoc dl) const {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    unsigned C = RHSC->getZExtValue();
+    if (!isLegalICmpImmediate(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (C != 0 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      }
+    }
+  }
+
+  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  ARMISD::NodeType CompareType;
+  switch (CondCode) {
+  default:
+    CompareType = ARMISD::CMP;
+    break;
+  case ARMCC::EQ:
+  case ARMCC::NE:
+    // Uses only Z Flag
+    CompareType = ARMISD::CMPZ;
+    break;
+  }
+  ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
+}
+
+/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
+SDValue
+ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+                             SDLoc dl) const {
+  SDValue Cmp;
+  if (!isFloatingPointZero(RHS))
+    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+  else
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
+}
+
+/// duplicateCmp - Glue values can have only one use, so this function
+/// duplicates a comparison node.
+SDValue
+ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
+  unsigned Opc = Cmp.getOpcode();
+  SDLoc DL(Cmp);
+  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
+    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+
+  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
+  Cmp = Cmp.getOperand(0);
+  Opc = Cmp.getOpcode();
+  if (Opc == ARMISD::CMPFP)
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+  else {
+    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+  }
+  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
+}
+
+std::pair<SDValue, SDValue>
+ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
+                                 SDValue &ARMcc) const {
+  assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
+
+  SDValue Value, OverflowCmp;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+
+  // FIXME: We are currently always generating CMPs because we don't support
+  // generating CMN through the backend. This is not as good as the natural
+  // CMP case because it causes a register dependency and cannot be folded
+  // later.
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown overflow instruction!");
+  case ISD::SADDO:
+    ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS);
+    break;
+  case ISD::UADDO:
+    ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS);
+    break;
+  case ISD::SSUBO:
+    ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS);
+    break;
+  case ISD::USUBO:
+    ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS);
+    break;
+  } // switch (...)
+
+  return std::make_pair(Value, OverflowCmp);
+}
+
+
+SDValue
+ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
+
+  SDValue Value, OverflowCmp;
+  SDValue ARMcc;
+  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, MVT::i32);
+  EVT VT = Op.getValueType();
+
+  SDValue Overflow = DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, TVal, FVal,
+                                 ARMcc, CCR, OverflowCmp);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+}
+
+
+SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue SelectTrue = Op.getOperand(1);
+  SDValue SelectFalse = Op.getOperand(2);
+  SDLoc dl(Op);
+  unsigned Opc = Cond.getOpcode();
+
+  if (Cond.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO)) {
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+      return SDValue();
+
+    SDValue Value, OverflowCmp;
+    SDValue ARMcc;
+    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    EVT VT = Op.getValueType();
+
+    return DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, SelectTrue, SelectFalse,
+                       ARMcc, CCR, OverflowCmp);
+
+  }
+
+  // Convert:
+  //
+  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
+  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
+  //
+  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
+    const ConstantSDNode *CMOVTrue =
+      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
+    const ConstantSDNode *CMOVFalse =
+      dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+
+    if (CMOVTrue && CMOVFalse) {
+      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
+      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
+
+      SDValue True;
+      SDValue False;
+      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
+        True = SelectTrue;
+        False = SelectFalse;
+      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
+        True = SelectFalse;
+        False = SelectTrue;
+      }
+
+      if (True.getNode() && False.getNode()) {
+        EVT VT = Op.getValueType();
+        SDValue ARMcc = Cond.getOperand(2);
+        SDValue CCR = Cond.getOperand(3);
+        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
+        assert(True.getValueType() == VT);
+        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
+      }
+    }
+  }
+
+  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
+  // undefined bits before doing a full-word comparison with zero.
+  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
+                     DAG.getConstant(1, Cond.getValueType()));
+
+  return DAG.getSelectCC(dl, Cond,
+                         DAG.getConstant(0, Cond.getValueType()),
+                         SelectTrue, SelectFalse, ISD::SETNE);
+}
+
+static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
+  if (CC == ISD::SETNE)
+    return ISD::SETEQ;
+  return ISD::getSetCCInverse(CC, true);
+}
+
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                                 bool &swpCmpOps, bool &swpVselOps) {
+  // Start by selecting the GE condition code for opcodes that return true for
+  // 'equality'
+  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
+      CC == ISD::SETULE)
+    CondCode = ARMCC::GE;
+
+  // and GT for opcodes that return false for 'equality'.
+  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
+           CC == ISD::SETULT)
+    CondCode = ARMCC::GT;
+
+  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
+  // to swap the compare operands.
+  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
+      CC == ISD::SETULT)
+    swpCmpOps = true;
+
+  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
+  // If we have an unordered opcode, we need to swap the operands to the VSEL
+  // instruction (effectively negating the condition).
+  //
+  // This also has the effect of swapping which one of 'less' or 'greater'
+  // returns true, so we also swap the compare operands. It also switches
+  // whether we return true for 'equality', so we compensate by picking the
+  // opposite condition code to our original choice.
+  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
+      CC == ISD::SETUGT) {
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
+  }
+
+  // 'ordered' is 'anything but unordered', so use the VS condition code and
+  // swap the VSEL operands.
+  if (CC == ISD::SETO) {
+    CondCode = ARMCC::VS;
+    swpVselOps = true;
+  }
+
+  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
+  // code and swap the VSEL operands.
+  if (CC == ISD::SETUNE) {
+    CondCode = ARMCC::EQ;
+    swpVselOps = true;
+  }
+}
+
+SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  SDLoc dl(Op);
+
+  if (LHS.getValueType() == MVT::i32) {
+    // Try to generate VSEL on ARMv8.
+    // The VSEL instruction can't use all the usual ARM condition
+    // codes: it only has two bits to select the condition code, so it's
+    // constrained to use only GE, GT, VS and EQ.
+    //
+    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+    // swap the operands of the previous compare instruction (effectively
+    // inverting the compare condition, swapping 'less' and 'greater') and
+    // sometimes need to swap the operands to the VSEL (which inverts the
+    // condition in the sense of firing whenever the previous condition didn't)
+    if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                      TrueVal.getValueType() == MVT::f64)) {
+      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+        CC = getInverseCCForVSEL(CC);
+        std::swap(TrueVal, FalseVal);
+      }
+    }
+
+    SDValue ARMcc;
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  // Try to generate VSEL on ARMv8.
+  if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
+    // We can select VMAXNM/VMINNM from a compare followed by a select with the
+    // same operands, as follows:
+    //   c = fcmp [ogt, olt, ugt, ult] a, b
+    //   select c, a, b
+    // We only do this in unsafe-fp-math, because signed zeros and NaNs are
+    // handled differently than the original code sequence.
+    if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal &&
+        RHS == FalseVal) {
+      if (CC == ISD::SETOGT || CC == ISD::SETUGT)
+        return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
+      if (CC == ISD::SETOLT || CC == ISD::SETULT)
+        return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+    }
+
+    bool swpCmpOps = false;
+    bool swpVselOps = false;
+    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+      if (swpCmpOps)
+        std::swap(LHS, RHS);
+      if (swpVselOps)
+        std::swap(TrueVal, FalseVal);
+    }
+  }
+
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+                               ARMcc, CCR, Cmp);
+  if (CondCode2 != ARMCC::AL) {
+    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
+    // FIXME: Needs another CMP because flag can have but one use.
+    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
+                         Result, TrueVal, ARMcc2, CCR, Cmp2);
+  }
+  return Result;
+}
+
+/// canChangeToInt - Given the fp compare operand, return true if it is suitable
+/// to morph to an integer compare sequence.
+static bool canChangeToInt(SDValue Op, bool &SeenZero,
+                           const ARMSubtarget *Subtarget) {
+  SDNode *N = Op.getNode();
+  if (!N->hasOneUse())
+    // Otherwise it requires moving the value from fp to integer registers.
+    return false;
+  if (!N->getNumValues())
+    return false;
+  EVT VT = Op.getValueType();
+  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
+    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
+    // vmrs are very slow, e.g. cortex-a8.
+    return false;
+
+  if (isFloatingPointZero(Op)) {
+    SeenZero = true;
+    return true;
+  }
+  return ISD::isNormalLoad(N);
+}
+
+static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
+  if (isFloatingPointZero(Op))
+    return DAG.getConstant(0, MVT::i32);
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
+    return DAG.getLoad(MVT::i32, SDLoc(Op),
+                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
+                       Ld->isVolatile(), Ld->isNonTemporal(),
+                       Ld->isInvariant(), Ld->getAlignment());
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
+                           SDValue &RetVal1, SDValue &RetVal2) {
+  if (isFloatingPointZero(Op)) {
+    RetVal1 = DAG.getConstant(0, MVT::i32);
+    RetVal2 = DAG.getConstant(0, MVT::i32);
+    return;
+  }
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
+    SDValue Ptr = Ld->getBasePtr();
+    RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
+                          Ld->getChain(), Ptr,
+                          Ld->getPointerInfo(),
+                          Ld->isVolatile(), Ld->isNonTemporal(),
+                          Ld->isInvariant(), Ld->getAlignment());
+
+    EVT PtrType = Ptr.getValueType();
+    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
+    SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
+                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
+    RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
+                          Ld->getChain(), NewPtr,
+                          Ld->getPointerInfo().getWithOffset(4),
+                          Ld->isVolatile(), Ld->isNonTemporal(),
+                          Ld->isInvariant(), NewAlign);
+    return;
+  }
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
+/// f32 and even f64 comparisons to integer ones.
+SDValue
+ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  bool LHSSeenZero = false;
+  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
+  bool RHSSeenZero = false;
+  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
+  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
+    // If unsafe fp math optimization is enabled and there are no other uses of
+    // the CMP operands, and the condition code is EQ or NE, we can optimize it
+    // to an integer comparison.
+    if (CC == ISD::SETOEQ)
+      CC = ISD::SETEQ;
+    else if (CC == ISD::SETUNE)
+      CC = ISD::SETNE;
+
+    SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
+    SDValue ARMcc;
+    if (LHS.getValueType() == MVT::f32) {
+      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+                        bitcastf32Toi32(LHS, DAG), Mask);
+      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+                        bitcastf32Toi32(RHS, DAG), Mask);
+      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                         Chain, Dest, ARMcc, CCR, Cmp);
+    }
+
+    SDValue LHS1, LHS2;
+    SDValue RHS1, RHS2;
+    expandf64Toi32(LHS, DAG, LHS1, LHS2);
+    expandf64Toi32(RHS, DAG, RHS1, RHS2);
+    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
+    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
+    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+    ARMcc = DAG.getConstant(CondCode, MVT::i32);
+    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
+    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
+  }
+
+  return SDValue();
+}
+
+SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue ARMcc;
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                       Chain, Dest, ARMcc, CCR, Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  if (getTargetMachine().Options.UnsafeFPMath &&
+      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
+       CC == ISD::SETNE || CC == ISD::SETUNE)) {
+    SDValue Result = OptimizeVFPBrcond(Op, DAG);
+    if (Result.getNode())
+      return Result;
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
+  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
+  if (CondCode2 != ARMCC::AL) {
+    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
+    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
+    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
+  }
+  return Res;
+}
+
+SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  EVT PTy = getPointerTy();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
+  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+  if (Subtarget->isThumb2()) {
+    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
+    // which does another jump to the destination. This also makes it easier
+    // to translate it to TBB / TBH later.
+    // FIXME: This might not work if the function is extremely large.
+    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
+                       Addr, Op.getOperand(2), JTI, UId);
+  }
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
+                       MachinePointerInfo::getJumpTable(),
+                       false, false, false, 0);
+    Chain = Addr.getValue(1);
+    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+  } else {
+    Addr = DAG.getLoad(PTy, dl, Chain, Addr,
+                       MachinePointerInfo::getJumpTable(),
+                       false, false, false, 0);
+    Chain = Addr.getValue(1);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+  }
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+
+  if (Op.getValueType().getVectorElementType() == MVT::i32) {
+    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
+      return Op;
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
+         "Invalid type for custom lowering!");
+  if (VT != MVT::v4i16)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+}
+
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
+
+  SDLoc dl(Op);
+  unsigned Opc;
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::FP_TO_SINT:
+    Opc = ARMISD::FTOSI;
+    break;
+  case ISD::FP_TO_UINT:
+    Opc = ARMISD::FTOUI;
+    break;
+  }
+  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+
+  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
+    if (VT.getVectorElementType() == MVT::f32)
+      return Op;
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
+         "Invalid type for custom lowering!");
+  if (VT != MVT::v4f32)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  unsigned CastOpc;
+  unsigned Opc;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::SINT_TO_FP:
+    CastOpc = ISD::SIGN_EXTEND;
+    Opc = ISD::SINT_TO_FP;
+    break;
+  case ISD::UINT_TO_FP:
+    CastOpc = ISD::ZERO_EXTEND;
+    Opc = ISD::UINT_TO_FP;
+    break;
+  }
+
+  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+
+  SDLoc dl(Op);
+  unsigned Opc;
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::SINT_TO_FP:
+    Opc = ARMISD::SITOF;
+    break;
+  case ISD::UINT_TO_FP:
+    Opc = ARMISD::UITOF;
+    break;
+  }
+
+  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
+SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  // Implement fcopysign with a fabs and a conditional fneg.
+  SDValue Tmp0 = Op.getOperand(0);
+  SDValue Tmp1 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  EVT SrcVT = Tmp1.getValueType();
+  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
+    Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool UseNEON = !InGPR && Subtarget->hasNEON();
+
+  if (UseNEON) {
+    // Use VBSL to copy the sign bit.
+    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
+                               DAG.getTargetConstant(EncodedVal, MVT::i32));
+    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
+    if (VT == MVT::f64)
+      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
+                         DAG.getConstant(32, MVT::i32));
+    else /*if (VT == MVT::f32)*/
+      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
+    if (SrcVT == MVT::f32) {
+      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
+      if (VT == MVT::f64)
+        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
+                           DAG.getConstant(32, MVT::i32));
+    } else if (VT == MVT::f32)
+      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
+                         DAG.getConstant(32, MVT::i32));
+    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
+
+    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+                                            MVT::i32);
+    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
+    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
+                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
+
+    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
+    if (VT == MVT::f32) {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+                        DAG.getConstant(0, MVT::i32));
+    } else {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
+    }
+
+    return Res;
+  }
+
+  // Bitcast operand 1 to i32.
+  if (SrcVT == MVT::f64)
+    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       Tmp1).getValue(1);
+  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+  // Or in the signbit with integer operations.
+  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+  if (VT == MVT::f32) {
+    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+  }
+
+  // f64: Or the high part with signbit and then combine two parts.
+  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                     Tmp0);
+  SDValue Lo = Tmp0.getValue(0);
+  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+}
+
+SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const ARMBaseRegisterInfo &ARI =
+    *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = ARI.getFrameRegister(MF);
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned ARMTargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("sp", ARM::SP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+/// ExpandBITCAST - If the target supports VFP, this function is called to
+/// expand a bit convert where either the source or destination type is i64 to
+/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
+/// operand type is illegal (e.g., v2f32 for a target that doesn't support
+/// vectors), since the legalizer won't know what to do with that.
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(N);
+  SDValue Op = N->getOperand(0);
+
+  // This function is only supposed to be called for i64 types, either as the
+  // source or destination of the bit convert.
+  EVT SrcVT = Op.getValueType();
+  EVT DstVT = N->getValueType(0);
+  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
+         "ExpandBITCAST called for non-i64 type");
+
+  // Turn i64->f64 into VMOVDRR.
+  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(0, MVT::i32));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(1, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, DstVT,
+                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
+  }
+
+  // Turn f64->i64 into VMOVRRD.
+  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
+    SDValue Cvt;
+    if (TLI.isBigEndian() && SrcVT.isVector() &&
+        SrcVT.getVectorNumElements() > 1)
+      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                        DAG.getVTList(MVT::i32, MVT::i32),
+                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
+    else
+      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                        DAG.getVTList(MVT::i32, MVT::i32), Op);
+    // Merge the pieces into a single i64 value.
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
+  }
+
+  return SDValue();
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+/// Zero vectors are used to represent vector negation and in those cases
+/// will be implemented with the NEON VNEG instruction.  However, VNEG does
+/// not support i64 elements, so sometimes the zero vectors will need to be
+/// explicitly constructed.  Regardless, use a canonical VMOV to create the
+/// zero vector.
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+  // The canonical modified immediate encoding of a zero vector is....0!
+  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
+  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMcc;
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
+                          ARMcc, DAG, dl);
+  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
+                           CCR, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMcc;
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
+                          ARMcc, DAG, dl);
+  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
+                           CCR, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // The rounding mode is in bits 23:22 of the FPSCR.
+  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
+  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
+  // so that the shift + and get folded into a bitfield extract.
+  SDLoc dl(Op);
+  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+                              DAG.getConstant(Intrinsic::arm_get_fpscr,
+                                              MVT::i32));
+  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
+                                  DAG.getConstant(1U << 22, MVT::i32));
+  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
+                              DAG.getConstant(22, MVT::i32));
+  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+                     DAG.getConstant(3, MVT::i32));
+}
+
+static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
+                         const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (!ST->hasV6T2Ops())
+    return SDValue();
+
+  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
+  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
+}
+
+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from operand, repeated.  The basic idea is to
+/// leverage vcnt to get the 8-bit counts, gather and add the results.
+///
+/// Trace for v4i16:
+/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+///            [b0 b1 b2 b3 b4 b5 b6 b7]
+///           +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand.  We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
+/// v4i16:Extracted = [k0    k1    k2    k3    ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+                                    BitCounts, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+  }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand.  The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input    = [v0    v1    ] (vi: 32-bit elements)
+/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0 = [k1 k0 k3 k2 ]
+///            [k0 k1 k2 k3 ]
+///       N1 =+[k1 k0 k3 k2 ]
+///            [k0 k2 k1 k3 ]
+///       N2 =+[k1 k3 k0 k2 ]
+///            [k0    k2    k1    k3    ]
+/// Extended =+[k1    k3    k0    k2    ]
+///            [k0    k2    ]
+/// Extracted=+[k1    k3    ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+                                    DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+  }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+          VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  if (VT.getVectorElementType() == MVT::i32)
+    return lowerCTPOP32BitElements(N, DAG);
+  else
+    return lowerCTPOP16BitElements(N, DAG);
+}
+
+static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  // Lower vector shifts on NEON to use VSHL.
+  assert(ST->hasNEON() && "unexpected vector shift");
+
+  // Left shifts translate directly to the vshiftu intrinsic.
+  if (N->getOpcode() == ISD::SHL)
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
+                       N->getOperand(0), N->getOperand(1));
+
+  assert((N->getOpcode() == ISD::SRA ||
+          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+
+  // NEON uses the same intrinsics for both left and right shifts.  For
+  // right shifts, the shift amounts are negative, so negate the vector of
+  // shift amounts.
+  EVT ShiftVT = N->getOperand(1).getValueType();
+  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
+                                     getZeroVector(ShiftVT, DAG, dl),
+                                     N->getOperand(1));
+  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
+                             Intrinsic::arm_neon_vshifts :
+                             Intrinsic::arm_neon_vshiftu);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(vshiftInt, MVT::i32),
+                     N->getOperand(0), NegatedCount);
+}
+
+static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
+                                const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  // We can get here for a node like i32 = ISD::SHL i32, i64
+  if (VT != MVT::i64)
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+         "Unknown shift to lower!");
+
+  // We only lower SRA, SRL of 1 here, all others use generic lowering.
+  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
+      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
+    return SDValue();
+
+  // If we are in thumb mode, we don't have RRX.
+  if (ST->isThumb1Only()) return SDValue();
+
+  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                           DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                           DAG.getConstant(1, MVT::i32));
+
+  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
+  // captures the result into a carry flag.
+  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
+
+  // The low part is an ARMISD::RRX operand, which shifts the carry in.
+  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
+
+  // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue TmpOp0, TmpOp1;
+  bool Invert = false;
+  bool Swap = false;
+  unsigned Opc = 0;
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  EVT VT = Op.getValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDLoc dl(Op);
+
+  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal FP comparison");
+    case ISD::SETUNE:
+    case ISD::SETNE:  Invert = true; // Fallthrough
+    case ISD::SETOEQ:
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETOLT:
+    case ISD::SETLT: Swap = true; // Fallthrough
+    case ISD::SETOGT:
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETOLE:
+    case ISD::SETLE:  Swap = true; // Fallthrough
+    case ISD::SETOGE:
+    case ISD::SETGE: Opc = ARMISD::VCGE; break;
+    case ISD::SETUGE: Swap = true; // Fallthrough
+    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+    case ISD::SETUGT: Swap = true; // Fallthrough
+    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+    case ISD::SETUEQ: Invert = true; // Fallthrough
+    case ISD::SETONE:
+      // Expand this to (OLT | OGT).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
+      break;
+    case ISD::SETUO: Invert = true; // Fallthrough
+    case ISD::SETO:
+      // Expand this to (OLT | OGE).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
+      break;
+    }
+  } else {
+    // Integer comparisons.
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal integer comparison");
+    case ISD::SETNE:  Invert = true;
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETLT:  Swap = true;
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETLE:  Swap = true;
+    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
+    case ISD::SETULT: Swap = true;
+    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+    case ISD::SETULE: Swap = true;
+    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+    }
+
+    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
+    if (Opc == ARMISD::VCEQ) {
+
+      SDValue AndOp;
+      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+        AndOp = Op0;
+      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
+        AndOp = Op1;
+
+      // Ignore bitconvert.
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
+        AndOp = AndOp.getOperand(0);
+
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
+        Opc = ARMISD::VTST;
+        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
+        Invert = !Invert;
+      }
+    }
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // If one of the operands is a constant vector zero, attempt to fold the
+  // comparison to a specialized compare-against-zero form.
+  SDValue SingleOp;
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+    SingleOp = Op0;
+  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+    if (Opc == ARMISD::VCGE)
+      Opc = ARMISD::VCLEZ;
+    else if (Opc == ARMISD::VCGT)
+      Opc = ARMISD::VCLTZ;
+    SingleOp = Op1;
+  }
+
+  SDValue Result;
+  if (SingleOp.getNode()) {
+    switch (Opc) {
+    case ARMISD::VCEQ:
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGE:
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLEZ:
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGT:
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLTZ:
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
+    default:
+      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+    }
+  } else {
+     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+  }
+
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+/// isNEONModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON instruction with a "modified immediate"
+/// operand (e.g., VMOV).  If so, return the encoded value.
+static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 EVT &VT, bool is128Bits, NEONModImmType type) {
+  unsigned OpCmode, Imm;
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8.  However, NEON modified
+  // immediate instructions others than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  switch (SplatBitSize) {
+  case 8:
+    if (type != VMOVModImm)
+      return SDValue();
+    // Any 1-byte value is OK.  Op=0, Cmode=1110.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      OpCmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      OpCmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      OpCmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+      OpCmode = 0x6;
+      Imm = SplatBits >> 24;
+      break;
+    }
+
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff: Op=x, Cmode=1100.
+      OpCmode = 0xc;
+      Imm = SplatBits >> 8;
+      break;
+    }
+
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff: Op=x, Cmode=1101.
+      OpCmode = 0xd;
+      Imm = SplatBits >> 16;
+      break;
+    }
+
+    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+    // VMOV.I32.  A (very) minor optimization would be to replicate the value
+    // and fall through here to test for a valid 64-bit splat.  But, then the
+    // caller would also need to check and handle the change in size.
+    return SDValue();
+
+  case 64: {
+    if (type != VMOVModImm)
+      return SDValue();
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    uint64_t BitMask = 0xff;
+    uint64_t Val = 0;
+    unsigned ImmMask = 1;
+    Imm = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+        Val |= BitMask;
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
+        return SDValue();
+      }
+      BitMask <<= 8;
+      ImmMask <<= 1;
+    }
+
+    if (DAG.getTargetLoweringInfo().isBigEndian())
+      // swap higher and lower 32 bit word
+      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+
+    // Op=1, Cmode=1110.
+    OpCmode = 0x1e;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+    break;
+  }
+
+  default:
+    llvm_unreachable("unexpected size for isNEONModifiedImm");
+  }
+
+  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+  return DAG.getTargetConstant(EncodedVal, MVT::i32);
+}
+
+SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                                           const ARMSubtarget *ST) const {
+  if (!ST->hasVFP3())
+    return SDValue();
+
+  bool IsDouble = Op.getValueType() == MVT::f64;
+  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
+
+  // Try splatting with a VMOV.f32...
+  APFloat FPVal = CFP->getValueAPF();
+  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
+
+  if (ImmVal != -1) {
+    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
+      // We have code in place to select a valid ConstantFP already, no need to
+      // do any mangling.
+      return Op;
+    }
+
+    // It's a float and we are trying to use NEON operations where
+    // possible. Lower it to a splat followed by an extract.
+    SDLoc DL(Op);
+    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
+                                      NewVal);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  // The rest of our options are NEON only, make sure that's allowed before
+  // proceeding..
+  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
+    return SDValue();
+
+  EVT VMovVT;
+  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
+
+  // It wouldn't really be worth bothering for doubles except for one very
+  // important value, which does happen to match: 0.0. So make sure we don't do
+  // anything stupid.
+  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
+    return SDValue();
+
+  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
+  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+                                     false, VMOVModImm);
+  if (NewVal != SDValue()) {
+    SDLoc DL(Op);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
+                                      NewVal);
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  // Finally, try a VMVN.i32
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+                             false, VMVNModImm);
+  if (NewVal != SDValue()) {
+    SDLoc DL(Op);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+// check if an VEXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+
+static bool isVEXTMask(ArrayRef<int> M, EVT VT,
+                       bool &ReverseVEXT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ReverseVEXT = false;
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, it may still be
+    // a VEXT but the source vectors must be swapped.
+    ExpectedElt += 1;
+    if (ExpectedElt == NumElts * 2) {
+      ExpectedElt = 0;
+      ReverseVEXT = true;
+    }
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  // Adjust the index value if the source operands will be swapped.
+  if (ReverseVEXT)
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
+  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
+  // range, then 0 is placed into the resulting vector. So pretty much any mask
+  // of 8 elements can work here.
+  return VT == MVT::v8i8 && M.size() == 8;
+}
+
+static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if ((unsigned) M[i] != 2 * i + WhichResult)
+      return false;
+  }
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned j = 0; j != 2; ++j) {
+    unsigned Idx = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      int MIdx = M[i + j * Half];
+      if (MIdx >= 0 && (unsigned) MIdx != Idx)
+        return false;
+      Idx += 2;
+    }
+  }
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
+      return false;
+    Idx += 1;
+  }
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// \return true if this is a reverse operation on an vector.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+      return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
+// If N is an integer constant that can be moved into a register in one
+// instruction, return an SDValue of such a constant (will become a MOV
+// instruction).  Otherwise return null.
+static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST, SDLoc dl) {
+  uint64_t Val;
+  if (!isa<ConstantSDNode>(N))
+    return SDValue();
+  Val = cast<ConstantSDNode>(N)->getZExtValue();
+
+  if (ST->isThumb1Only()) {
+    if (Val <= 255 || ~Val <= 255)
+      return DAG.getConstant(Val, MVT::i32);
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
+      return DAG.getConstant(Val, MVT::i32);
+  }
+  return SDValue();
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                                             const ARMSubtarget *ST) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      // Check if an immediate VMOV works.
+      EVT VmovVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMOVModImm);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+      }
+
+      // Try an immediate VMVN.
+      uint64_t NegatedImm = (~SplatBits).getZExtValue();
+      Val = isNEONModifiedImm(NegatedImm,
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMVNModImm);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+      }
+
+      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
+      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
+        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
+        if (ImmVal != -1) {
+          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
+          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
+        }
+      }
+    }
+  }
+
+  // Scan through the operands to see if only one value is used.
+  //
+  // As an optimisation, even if more than one value is used it may be more
+  // profitable to splat with one value then change some lanes.
+  //
+  // Heuristically we decide to do this if the vector has a "dominant" value,
+  // defined as splatted to more than half of the lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (hasDominantValue && EltSize <= 32) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are VDUPing a value that comes directly from a vector, that will
+      // cause an unnecessary move to and from a GPR, where instead we could
+      // just use VDUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the VDUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+        // We need to create a new undef vector to use for the VDUPLANE if the
+        // size of the vector from which we get the value is different than the
+        // size of the vector that we need to create. We will insert the element
+        // such that the register coalescer will remove unnecessary copies.
+        if (VT != Value->getOperand(0).getValueType()) {
+          ConstantSDNode *constIndex;
+          constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
+          assert(constIndex && "The index is not a constant!");
+          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+                             VT.getVectorNumElements();
+          N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+                        Value, DAG.getConstant(index, MVT::i32)),
+                           DAG.getConstant(index, MVT::i32));
+        } else
+          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
+        }
+      }
+      return N;
+    }
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+                                  Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+      Val = LowerBUILD_VECTOR(Val, DAG, ST);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+    }
+    if (usesOnlyOneValue) {
+      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+      if (isConstant && Val.getNode())
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    }
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
+  }
+
+  // Vectors with 32- or 64-bit elements can be built by directly assigning
+  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
+  // will be legalized.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i)
+      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+  }
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
+  return SDValue();
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 2> SourceVecs;
+  SmallVector<unsigned, 2> MinElts;
+  SmallVector<unsigned, 2> MaxElts;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
+    } else if (V.getOperand(0).getValueType().getVectorElementType() !=
+               VT.getVectorElementType()) {
+      // This code doesn't know how to handle shuffles where the vector
+      // element types do not match (this happens because type legalization
+      // promotes the return type of EXTRACT_VECTOR_ELT).
+      // FIXME: It might be appropriate to extend this code to handle
+      // mismatched types.
+      return SDValue();
+    }
+
+    // Record this extraction against the appropriate vector if possible...
+    SDValue SourceVec = V.getOperand(0);
+    // If the element number isn't a constant, we can't effectively
+    // analyze what's going on.
+    if (!isa<ConstantSDNode>(V.getOperand(1)))
+      return SDValue();
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    bool FoundSource = false;
+    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+      if (SourceVecs[j] == SourceVec) {
+        if (MinElts[j] > EltNo)
+          MinElts[j] = EltNo;
+        if (MaxElts[j] < EltNo)
+          MaxElts[j] = EltNo;
+        FoundSource = true;
+        break;
+      }
+    }
+
+    // Or record a new source if not...
+    if (!FoundSource) {
+      SourceVecs.push_back(SourceVec);
+      MinElts.push_back(EltNo);
+      MaxElts.push_back(EltNo);
+    }
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // involved.
+  if (SourceVecs.size() > 2)
+    return SDValue();
+
+  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+  int VEXTOffsets[2] = {0, 0};
+
+  // This loop extracts the usage patterns of the source vectors
+  // and prepares appropriate SDValues for a shuffle if possible.
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    if (SourceVecs[i].getValueType() == VT) {
+      // No VEXT necessary
+      ShuffleSrcs[i] = SourceVecs[i];
+      VEXTOffsets[i] = 0;
+      continue;
+    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+      // It probably isn't worth padding out a smaller vector just to
+      // break it down again in a shuffle.
+      return SDValue();
+    }
+
+    // Since only 64-bit and 128-bit vectors are legal on ARM and
+    // we've eliminated the other cases...
+    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
+           "unexpected vector sizes in ReconstructShuffle");
+
+    if (MaxElts[i] - MinElts[i] >= NumElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (MinElts[i] >= NumElts) {
+      // The extraction can just take the second half
+      VEXTOffsets[i] = NumElts;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(NumElts));
+    } else if (MaxElts[i] < NumElts) {
+      // The extraction can just take the first half
+      VEXTOffsets[i] = 0;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(0));
+    } else {
+      // An actual VEXT is needed
+      VEXTOffsets[i] = MinElts[i];
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(NumElts));
+      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
+                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
+    }
+  }
+
+  SmallVector<int, 8> Mask;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    SDValue ExtractVec = Entry.getOperand(0);
+    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
+                                          .getOperand(1))->getSExtValue();
+    if (ExtractVec == SourceVecs[0]) {
+      Mask.push_back(ExtractElt - VEXTOffsets[0]);
+    } else {
+      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+    }
+  }
+
+  // Final check before we try to produce nonsense...
+  if (isShuffleMaskLegal(Mask, VT))
+    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+                                &Mask[0]);
+
+  return SDValue();
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool ReverseVEXT;
+  unsigned Imm, WhichResult;
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return (EltSize >= 32 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isVREVMask(M, VT, 64) ||
+          isVREVMask(M, VT, 32) ||
+          isVREVMask(M, VT, 16) ||
+          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+          isVTBLMask(M, VT) ||
+          isVTRNMask(M, VT, WhichResult) ||
+          isVUZPMask(M, VT, WhichResult) ||
+          isVZIPMask(M, VT, WhichResult) ||
+          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
+          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
+          isVZIP_v_undef_Mask(M, VT, WhichResult) ||
+          ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      SDLoc dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> VREV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> VREV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3:
+    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3:
+    return DAG.getNode(ARMISD::VEXT, dl, VT,
+                       OpLHS, OpRHS,
+                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
+  case OP_VUZPL:
+  case OP_VUZPR:
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+  case OP_VZIPL:
+  case OP_VZIPR:
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+  case OP_VTRNL:
+  case OP_VTRNR:
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+  }
+}
+
+static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
+                                       ArrayRef<int> ShuffleMask,
+                                       SelectionDAG &DAG) {
+  // Check to see if we can use the VTBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  SmallVector<SDValue, 8> VTBLMask;
+  for (ArrayRef<int>::iterator
+         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
+    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
+
+  if (V2.getNode()->getOpcode() == ISD::UNDEF)
+    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
+
+  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
+}
+
+static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
+                                                      SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue OpLHS = Op.getOperand(0);
+  EVT VT = OpLHS.getValueType();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+         "Expect an v8i16/v16i8 type");
+  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
+  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
+  // extract the first 8 bytes into the top double word and the last 8 bytes
+  // into the bottom double word. The v8i16 case is similar.
+  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
+  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
+                     DAG.getConstant(ExtractNum, MVT::i32));
+}
+
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same time so that they get CSEd properly.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize <= 32) {
+    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+      int Lane = SVN->getSplatIndex();
+      // If this is undef splat, generate it via "just" vdup, if possible.
+      if (Lane == -1) Lane = 0;
+
+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+      // reaches it).
+      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+          !isa<ConstantSDNode>(V1.getOperand(0))) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+          if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+                         DAG.getConstant(Lane, MVT::i32));
+    }
+
+    bool ReverseVEXT;
+    unsigned Imm;
+    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+      if (ReverseVEXT)
+        std::swap(V1, V2);
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
+
+    if (isVREVMask(ShuffleMask, VT, 64))
+      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 32))
+      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 16))
+      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+    if (V2->getOpcode() == ISD::UNDEF &&
+        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
+
+    // Check for Neon shuffles that modify both input vectors in place.
+    // If both results are used, i.e., if there are two shuffles with the same
+    // source operands and with masks corresponding to both results of one of
+    // these operations, DAG memoization will ensure that a single node is
+    // used for both shuffles.
+    unsigned WhichResult;
+    if (isVTRNMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+    if (isVUZPMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+    if (isVZIPMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+
+    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+  }
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      if (ShuffleMask[i] < 0)
+        Ops.push_back(DAG.getUNDEF(EltVT));
+      else
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
+                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
+                                                  MVT::i32)));
+    }
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+  }
+
+  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+
+  if (VT == MVT::v8i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // INSERT_VECTOR_ELT is legal only for immediate indexes.
+  SDValue Lane = Op.getOperand(2);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  return Op;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
+  SDValue Lane = Op.getOperand(1);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  if (Op.getValueType() == MVT::i32 &&
+      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
+    SDLoc dl(Op);
+    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  }
+
+  return Op;
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+         "unexpected CONCAT_VECTORS");
+  SDLoc dl(Op);
+  SDValue Val = DAG.getUNDEF(MVT::v2f64);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  if (Op0.getOpcode() != ISD::UNDEF)
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
+                      DAG.getIntPtrConstant(0));
+  if (Op1.getOpcode() != ISD::UNDEF)
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
+                      DAG.getIntPtrConstant(1));
+  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
+}
+
+/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
+/// element has been zero/sign-extended, depending on the isSigned parameter,
+/// from an integer type half its size.
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    if (BVN->getValueType(0) != MVT::v4i32 ||
+        BVN->getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+    unsigned HiElt = 1 - LoElt;
+    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
+    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
+    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
+    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
+    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
+      return false;
+    if (isSigned) {
+      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
+          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
+        return true;
+    } else {
+      if (Hi0->isNullValue() && Hi1->isNullValue())
+        return true;
+    }
+    return false;
+  }
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/// isSignExtended - Check if a node is a vector value that is sign-extended
+/// or a constant BUILD_VECTOR with sign-extended elements.
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+/// isZeroExtended - Check if a node is a vector value that is zero-extended
+/// or a constant BUILD_VECTOR with zero-extended elements.
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D register.
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+                                            const EVT &OrigTy,
+                                            const EVT &ExtTy,
+                                            unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
+
+  // The load already has the right type.
+  if (ExtendedTy == LD->getMemoryVT())
+    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
+                LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+                LD->isNonTemporal(), LD->isInvariant(),
+                LD->getAlignment());
+
+  // We need to create a zextload/sextload. We cannot just create a load
+  // followed by a zext/zext node because LowerMUL is also run during normal
+  // operation legalization where we can't create illegal types.
+  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
+                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+                        LD->getMemoryVT(), LD->isVolatile(),
+                        LD->isNonTemporal(), LD->getAlignment());
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add a an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+                                        N->getOperand(0)->getValueType(0),
+                                        N->getValueType(0),
+                                        N->getOpcode());
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return SkipLoadExtensionForVMULL(LD, DAG);
+
+  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
+  // have been legalized as a BITCAST from v4i32.
+  if (N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
+           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
+    unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
+                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
+  }
+  // Construct a new BUILD_VECTOR with elements truncated to half the size.
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = ARMISD::VMULLs;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = ARMISD::VMULLu;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLs;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a VMULL instruction.
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = SkipExtensionForVMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+
+  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back vmul + vmla.
+  //   vmull q0, d4, d6
+  //   vmlal q0, d5, d6
+  // is faster than
+  //   vaddl q0, d4, d5
+  //   vmovl q1, d6
+  //   vmul  q0, q0, q1
+  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
+
+static SDValue
+LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
+  // Convert to float
+  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
+  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
+  // Get reciprocal estimate.
+  // float4 recip = vrecpeq_f32(yf);
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
+  // Because char has a smaller range than uchar, we can actually get away
+  // without any newton steps.  This requires that we use a weird bias
+  // of 0xb000, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
+  Y = DAG.getConstant(0xb000, MVT::i32);
+  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
+  // Convert back to short.
+  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
+  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
+  return X;
+}
+
+static SDValue
+LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
+  SDValue N2;
+  // Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and one refinement step.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Because short has a smaller range than ushort, we can actually get away
+  // with only a single newton step.  This requires that we use a weird bias
+  // of 89, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(0x89, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_s32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::SDIV");
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+    return N0;
+  }
+  return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::UDIV");
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
+                     N0);
+    return N0;
+  }
+
+  // v4i16 sdiv ... Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and two refinement steps.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   BN1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   BN1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Simply multiplying by the reciprocal estimate can leave us a few ulps
+  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+  // and that it will never cause us to return an answer too large).
+  // float4 result = as_float4(as_int4(xf*recip) + 2);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(2, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_u32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getNode()->getValueType(0);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid code");
+  case ISD::ADDC: Opc = ARMISD::ADDC; break;
+  case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
+  case ISD::SUBC: Opc = ARMISD::SUBC; break;
+  case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin());
+
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // return values are passed via sret.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Pair of floats / doubles used to pass the result.
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+
+  // Create stack object for sret.
+  const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
+  const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy);
+  int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+  SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy());
+
+  ArgListTy Args;
+  ArgListEntry Entry;
+
+  Entry.Node = SRet;
+  Entry.Ty = RetTy->getPointerTo();
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Entry.isSRet = true;
+  Args.push_back(Entry);
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  const char *LibcallName  = (ArgVT == MVT::f64)
+  ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee,
+               std::move(Args), 0)
+    .setDiscardResult();
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
+                                MachinePointerInfo(), false, false, false, 0);
+
+  // Address of cos field.
+  SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet,
+                            DAG.getIntPtrConstant(ArgVT.getStoreSize()));
+  SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
+                                MachinePointerInfo(), false, false, false, 0);
+
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+                     LoadSin.getValue(0), LoadCos.getValue(0));
+}
+
+static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
+  // Monotonic load/store is legal for all targets
+  if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
+    return Op;
+
+  // Acquire/Release load/store is not legal for targets without a
+  // dmb or equivalent available.
+  return SDValue();
+}
+
+static void ReplaceREADCYCLECOUNTER(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Cycles32, OutChain;
+
+  if (Subtarget->hasPerfMon()) {
+    // Under Power Management extensions, the cycle-count is:
+    //    mrc p15, #0, <Rt>, c9, c13, #0
+    SDValue Ops[] = { N->getOperand(0), // Chain
+                      DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
+                      DAG.getConstant(15, MVT::i32),
+                      DAG.getConstant(0, MVT::i32),
+                      DAG.getConstant(9, MVT::i32),
+                      DAG.getConstant(13, MVT::i32),
+                      DAG.getConstant(0, MVT::i32)
+    };
+
+    Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                           DAG.getVTList(MVT::i32, MVT::Other), Ops);
+    OutChain = Cycles32.getValue(1);
+  } else {
+    // Intrinsic is defined to return 0 on unsupported platforms. Technically
+    // there are older ARM CPUs that have implementation-specific ways of
+    // obtaining this information (FIXME!).
+    Cycles32 = DAG.getConstant(0, MVT::i32);
+    OutChain = DAG.getEntryNode();
+  }
+
+
+  SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+                                 Cycles32, DAG.getConstant(0, MVT::i32));
+  Results.push_back(Cycles64);
+  Results.push_back(OutChain);
+}
+
+SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:
+    switch (Subtarget->getTargetTriple().getObjectFormat()) {
+    default: llvm_unreachable("unknown object format");
+    case Triple::COFF:
+      return LowerGlobalAddressWindows(Op, DAG);
+    case Triple::ELF:
+      return LowerGlobalAddressELF(Op, DAG);
+    case Triple::MachO:
+      return LowerGlobalAddressDarwin(Op, DAG);
+    }
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SELECT:        return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
+  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
+  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
+  case ISD::VASTART:       return LowerVASTART(Op, DAG);
+  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
+  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
+  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
+  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
+                                                               Subtarget);
+  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
+  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
+  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
+  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
+  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
+  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
+  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::MUL:           return LowerMUL(Op, DAG);
+  case ISD::SDIV:          return LowerSDIV(Op, DAG);
+  case ISD::UDIV:          return LowerUDIV(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+    return LowerXALUO(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+      return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    llvm_unreachable("Don't know how to custom lower this!");
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  SDValue Res;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::BITCAST:
+    Res = ExpandBITCAST(N, DAG);
+    break;
+  case ISD::SRL:
+  case ISD::SRA:
+    Res = Expand64BitShift(N, DAG, Subtarget);
+    break;
+  case ISD::READCYCLECOUNTER:
+    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
+    return;
+  }
+  if (Res.getNode())
+    Results.push_back(Res);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
+/// registers the function context.
+void ARMTargetLowering::
+SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
+                       MachineBasicBlock *DispatchBB, int FI) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  MachineConstantPool *MCP = MF->getConstantPool();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  const Function *F = MF->getFunction();
+
+  bool isThumb = Subtarget->isThumb();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
+  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+
+  const TargetRegisterClass *TRC = isThumb ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+
+  // Grab constant pool and fixed stack memory operands.
+  MachineMemOperand *CPMMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
+                             MachineMemOperand::MOLoad, 4, 4);
+
+  MachineMemOperand *FIMMOSt =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOStore, 4, 4);
+
+  // Load the address of the dispatch MBB into the jump buffer.
+  if (isThumb2) {
+    // Incoming value: jbuf
+    //   ldr.n  r5, LCPI1_1
+    //   orr    r5, r5, #1
+    //   add    r5, pc
+    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+                     .addReg(NewVReg1, RegState::Kill)
+                     .addImm(0x01)));
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
+      .addReg(NewVReg2, RegState::Kill)
+      .addImm(PCLabelId);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+                   .addReg(NewVReg3, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  } else if (isThumb) {
+    // Incoming value: jbuf
+    //   ldr.n  r1, LCPI1_4
+    //   add    r1, pc
+    //   mov    r2, #1
+    //   orrs   r1, r2
+    //   add    r2, $jbuf, #+4 ; &jbuf[1]
+    //   str    r1, [r2]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
+      .addReg(NewVReg1, RegState::Kill)
+      .addImm(PCLabelId);
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addImm(1));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3, RegState::Kill));
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
+                   .addFrameIndex(FI)
+                   .addImm(36)); // &jbuf[1] :: pc
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addReg(NewVReg5, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(FIMMOSt));
+  } else {
+    // Incoming value: jbuf
+    //   ldr  r1, LCPI1_1
+    //   add  r1, pc, r1
+    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addImm(0)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+                   .addReg(NewVReg1, RegState::Kill)
+                   .addImm(PCLabelId));
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  }
+}
+
+MachineBasicBlock *ARMTargetLowering::
+EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF->getFrameInfo();
+  int FI = MFI->getFunctionContextIndex();
+
+  const TargetRegisterClass *TRC = Subtarget->isThumb() ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
+
+  // Get a mapping of the call site numbers to all of the landing pads they're
+  // associated with.
+  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+  unsigned MaxCSNum = 0;
+  MachineModuleInfo &MMI = MF->getMMI();
+  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
+       ++BB) {
+    if (!BB->isLandingPad()) continue;
+
+    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
+    // pad.
+    for (MachineBasicBlock::iterator
+           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+      if (!II->isEHLabel()) continue;
+
+      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
+      if (!MMI.hasCallSiteLandingPad(Sym)) continue;
+
+      SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
+      for (SmallVectorImpl<unsigned>::iterator
+             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
+           CSI != CSE; ++CSI) {
+        CallSiteNumToLPad[*CSI].push_back(BB);
+        MaxCSNum = std::max(MaxCSNum, *CSI);
+      }
+      break;
+    }
+  }
+
+  // Get an ordered list of the machine basic blocks for the jump table.
+  std::vector<MachineBasicBlock*> LPadList;
+  SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+  for (unsigned I = 1; I <= MaxCSNum; ++I) {
+    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+    for (SmallVectorImpl<MachineBasicBlock*>::iterator
+           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
+      LPadList.push_back(*II);
+      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the jump table and associated information.
+  MachineJumpTableInfo *JTI =
+    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+  unsigned UId = AFI->createJumpTableUId();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsLandingPad();
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  unsigned trap_opcode;
+  if (Subtarget->isThumb())
+    trap_opcode = ARM::tTRAP;
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
+  BuildMI(TrapBB, dl, TII->get(trap_opcode));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert and MBBs.
+  MF->insert(MF->end(), DispatchBB);
+  MF->insert(MF->end(), DispContBB);
+  MF->insert(MF->end(), TrapBB);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+  MachineMemOperand *FIMMOLd =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOLoad |
+                             MachineMemOperand::MOVolatile, 4, 4);
+
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers.  This results in all
+  // registers being marked as clobbered.
+  MIB.addRegMask(RI.getNoPreservedMask());
+
+  unsigned NumLPads = LPadList.size();
+  if (Subtarget->isThumb2()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+                     .addReg(NewVReg1)
+                     .addImm(LPadList.size()));
+    } else {
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                     .addImm(NumLPads & 0xFFFF));
+
+      unsigned VReg2 = VReg1;
+      if ((NumLPads & 0xFFFF0000) != 0) {
+        VReg2 = MRI->createVirtualRegister(TRC);
+        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
+                       .addReg(VReg1)
+                       .addImm(NumLPads >> 16));
+      }
+
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg2));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(
+        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
+        .addReg(NewVReg3, RegState::Kill)
+        .addReg(NewVReg1)
+        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+
+    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
+      .addReg(NewVReg4, RegState::Kill)
+      .addReg(NewVReg1)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  } else if (Subtarget->isThumb()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(1)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+                     .addReg(NewVReg1)
+                     .addImm(NumLPads));
+    } else {
+      MachineConstantPool *ConstantPool = MF->getConstantPool();
+      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+      // MachineConstantPool wants an explicit alignment.
+      unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+      if (Align == 0)
+        Align = getDataLayout()->getTypeAllocSize(C->getType());
+      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
+                     .addReg(VReg1, RegState::Define)
+                     .addConstantPoolIndex(Idx));
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg1));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg1)
+                   .addImm(2));
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3));
+
+    MachineMemOperand *JTMMOLd =
+      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+                               MachineMemOperand::MOLoad, 4, 4);
+
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(JTMMOLd));
+
+    unsigned NewVReg6 = NewVReg5;
+    if (RelocM == Reloc::PIC_) {
+      NewVReg6 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                     .addReg(ARM::CPSR, RegState::Define)
+                     .addReg(NewVReg5, RegState::Kill)
+                     .addReg(NewVReg3));
+    }
+
+    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
+      .addReg(NewVReg6, RegState::Kill)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  } else {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+                     .addReg(NewVReg1)
+                     .addImm(NumLPads));
+    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
+                     .addImm(NumLPads & 0xFFFF));
+
+      unsigned VReg2 = VReg1;
+      if ((NumLPads & 0xFFFF0000) != 0) {
+        VReg2 = MRI->createVirtualRegister(TRC);
+        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
+                       .addReg(VReg1)
+                       .addImm(NumLPads >> 16));
+      }
+
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg2));
+    } else {
+      MachineConstantPool *ConstantPool = MF->getConstantPool();
+      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+      // MachineConstantPool wants an explicit alignment.
+      unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+      if (Align == 0)
+        Align = getDataLayout()->getTypeAllocSize(C->getType());
+      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
+                     .addReg(VReg1, RegState::Define)
+                     .addConstantPoolIndex(Idx)
+                     .addImm(0));
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg1, RegState::Kill));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
+                     .addReg(NewVReg1)
+                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    MachineMemOperand *JTMMOLd =
+      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+                               MachineMemOperand::MOLoad, 4, 4);
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(
+      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
+      .addReg(NewVReg3, RegState::Kill)
+      .addReg(NewVReg4)
+      .addImm(0)
+      .addMemOperand(JTMMOLd));
+
+    if (RelocM == Reloc::PIC_) {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+        .addReg(NewVReg5, RegState::Kill)
+        .addReg(NewVReg4)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    } else {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+        .addReg(NewVReg5, RegState::Kill)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    }
+  }
+
+  // Add the jump table entries as successors to the MBB.
+  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
+  for (std::vector<MachineBasicBlock*>::iterator
+         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
+    MachineBasicBlock *CurMBB = *I;
+    if (SeenMBBs.insert(CurMBB))
+      DispContBB->addSuccessor(CurMBB);
+  }
+
+  // N.B. the order the invoke BBs are processed in doesn't matter here.
+  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
+  SmallVector<MachineBasicBlock*, 64> MBBLPads;
+  for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
+         I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
+    MachineBasicBlock *BB = *I;
+
+    // Remove the landing pad successor from the invoke block and replace it
+    // with the new dispatch block.
+    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
+                                                  BB->succ_end());
+    while (!Successors.empty()) {
+      MachineBasicBlock *SMBB = Successors.pop_back_val();
+      if (SMBB->isLandingPad()) {
+        BB->removeSuccessor(SMBB);
+        MBBLPads.push_back(SMBB);
+      }
+    }
+
+    BB->addSuccessor(DispatchBB);
+
+    // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled. This prevents code from
+    // moving instructions to before the EH block, where they will never be
+    // executed.
+    for (MachineBasicBlock::reverse_iterator
+           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
+      if (!II->isCall()) continue;
+
+      DenseMap<unsigned, bool> DefRegs;
+      for (MachineInstr::mop_iterator
+             OI = II->operands_begin(), OE = II->operands_end();
+           OI != OE; ++OI) {
+        if (!OI->isReg()) continue;
+        DefRegs[OI->getReg()] = true;
+      }
+
+      MachineInstrBuilder MIB(*MF, &*II);
+
+      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
+        unsigned Reg = SavedRegs[i];
+        if (Subtarget->isThumb2() &&
+            !ARM::tGPRRegClass.contains(Reg) &&
+            !ARM::hGPRRegClass.contains(Reg))
+          continue;
+        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
+          continue;
+        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
+          continue;
+        if (!DefRegs[Reg])
+          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+      }
+
+      break;
+    }
+  }
+
+  // Mark all former landing pads as non-landing pads. The dispatch is the only
+  // landing pad now.
+  for (SmallVectorImpl<MachineBasicBlock*>::iterator
+         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
+    (*I)->setIsLandingPad(false);
+
+  // The instruction is gone now.
+  MI->eraseFromParent();
+
+  return MBB;
+}
+
+static
+MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I != Succ)
+      return *I;
+  llvm_unreachable("Expecting a BB with two successors!");
+}
+
+/// Return the load opcode for a given load size. If load size >= 8,
+/// neon opcode will be returned.
+static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
+  if (LdSize >= 8)
+    return LdSize == 16 ? ARM::VLD1q32wb_fixed
+                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
+  if (IsThumb1)
+    return LdSize == 4 ? ARM::tLDRi
+                       : LdSize == 2 ? ARM::tLDRHi
+                                     : LdSize == 1 ? ARM::tLDRBi : 0;
+  if (IsThumb2)
+    return LdSize == 4 ? ARM::t2LDR_POST
+                       : LdSize == 2 ? ARM::t2LDRH_POST
+                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
+  return LdSize == 4 ? ARM::LDR_POST_IMM
+                     : LdSize == 2 ? ARM::LDRH_POST
+                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
+}
+
+/// Return the store opcode for a given store size. If store size >= 8,
+/// neon opcode will be returned.
+static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
+  if (StSize >= 8)
+    return StSize == 16 ? ARM::VST1q32wb_fixed
+                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
+  if (IsThumb1)
+    return StSize == 4 ? ARM::tSTRi
+                       : StSize == 2 ? ARM::tSTRHi
+                                     : StSize == 1 ? ARM::tSTRBi : 0;
+  if (IsThumb2)
+    return StSize == 4 ? ARM::t2STR_POST
+                       : StSize == 2 ? ARM::t2STRH_POST
+                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
+  return StSize == 4 ? ARM::STR_POST_IMM
+                     : StSize == 2 ? ARM::STRH_POST
+                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
+}
+
+/// Emit a post-increment load operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
+                       const TargetInstrInfo *TII, DebugLoc dl,
+                       unsigned LdSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
+  assert(LdOpc != 0 && "Should have a load opcode");
+  if (LdSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(0));
+  } else if (IsThumb1) {
+    // load + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(LdSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(LdSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addReg(0).addImm(LdSize));
+  }
+}
+
+/// Emit a post-increment store operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
+                       const TargetInstrInfo *TII, DebugLoc dl,
+                       unsigned StSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
+  assert(StOpc != 0 && "Should have a store opcode");
+  if (StSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(AddrIn).addImm(0).addReg(Data));
+  } else if (IsThumb1) {
+    // store + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(StSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addImm(StSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addReg(0)
+                       .addImm(StSize));
+  }
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const {
+  // This pseudo instruction has 3 operands: dst, src, size
+  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+  // Otherwise, we will generate unrolled scalar copies.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned src = MI->getOperand(1).getReg();
+  unsigned SizeVal = MI->getOperand(2).getImm();
+  unsigned Align = MI->getOperand(3).getImm();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned UnitSize = 0;
+  const TargetRegisterClass *TRC = nullptr;
+  const TargetRegisterClass *VecTRC = nullptr;
+
+  bool IsThumb1 = Subtarget->isThumb1Only();
+  bool IsThumb2 = Subtarget->isThumb2();
+
+  if (Align & 1) {
+    UnitSize = 1;
+  } else if (Align & 2) {
+    UnitSize = 2;
+  } else {
+    // Check whether we can use NEON instructions.
+    if (!MF->getFunction()->getAttributes().
+          hasAttribute(AttributeSet::FunctionIndex,
+                       Attribute::NoImplicitFloat) &&
+        Subtarget->hasNEON()) {
+      if ((Align % 16 == 0) && SizeVal >= 16)
+        UnitSize = 16;
+      else if ((Align % 8 == 0) && SizeVal >= 8)
+        UnitSize = 8;
+    }
+    // Can't use NEON instructions.
+    if (UnitSize == 0)
+      UnitSize = 4;
+  }
+
+  // Select the correct opcode and register class for unit size load/store
+  bool IsNeon = UnitSize >= 8;
+  TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass
+                               : (const TargetRegisterClass *)&ARM::GPRRegClass;
+  if (IsNeon)
+    VecTRC = UnitSize == 16
+                 ? (const TargetRegisterClass *)&ARM::DPairRegClass
+                 : UnitSize == 8
+                       ? (const TargetRegisterClass *)&ARM::DPRRegClass
+                       : nullptr;
+
+  unsigned BytesLeft = SizeVal % UnitSize;
+  unsigned LoopSize = SizeVal - BytesLeft;
+
+  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+    // Use LDR and STR to copy.
+    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+    // [destOut] = STR_POST(scratch, destIn, UnitSize)
+    unsigned srcIn = src;
+    unsigned destIn = dest;
+    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+
+    // Handle the leftover bytes with LDRB and STRB.
+    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+    // [destOut] = STRB_POST(scratch, destIn, 1)
+    for (unsigned i = 0; i < BytesLeft; i++) {
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+    MI->eraseFromParent();   // The instruction is gone now.
+    return BB;
+  }
+
+  // Expand the pseudo op to a loop.
+  // thisMBB:
+  //   ...
+  //   movw varEnd, # --> with thumb2
+  //   movt varEnd, #
+  //   ldrcp varEnd, idx --> without thumb2
+  //   fallthrough --> loopMBB
+  // loopMBB:
+  //   PHI varPhi, varEnd, varLoop
+  //   PHI srcPhi, src, srcLoop
+  //   PHI destPhi, dst, destLoop
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  //   subs varLoop, varPhi, #UnitSize
+  //   bne loopMBB
+  //   fallthrough --> exitMBB
+  // exitMBB:
+  //   epilogue to handle left-over bytes
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Load an immediate to varEnd.
+  unsigned varEnd = MRI.createVirtualRegister(TRC);
+  if (IsThumb2) {
+    unsigned Vtmp = varEnd;
+    if ((LoopSize & 0xFFFF0000) != 0)
+      Vtmp = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
+                       .addImm(LoopSize & 0xFFFF));
+
+    if ((LoopSize & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
+                         .addReg(Vtmp).addImm(LoopSize >> 16));
+  } else {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = getDataLayout()->getTypeAllocSize(C->getType());
+    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+    if (IsThumb1)
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx));
+    else
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+  }
+  BB->addSuccessor(loopMBB);
+
+  // Generate the loop body:
+  //   varPhi = PHI(varLoop, varEnd)
+  //   srcPhi = PHI(srcLoop, src)
+  //   destPhi = PHI(destLoop, dst)
+  MachineBasicBlock *entryBB = BB;
+  BB = loopMBB;
+  unsigned varLoop = MRI.createVirtualRegister(TRC);
+  unsigned varPhi = MRI.createVirtualRegister(TRC);
+  unsigned srcLoop = MRI.createVirtualRegister(TRC);
+  unsigned srcPhi = MRI.createVirtualRegister(TRC);
+  unsigned destLoop = MRI.createVirtualRegister(TRC);
+  unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+    .addReg(varLoop).addMBB(loopMBB)
+    .addReg(varEnd).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+    .addReg(srcLoop).addMBB(loopMBB)
+    .addReg(src).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+    .addReg(destLoop).addMBB(loopMBB)
+    .addReg(dest).addMBB(entryBB);
+
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
+  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+             IsThumb1, IsThumb2);
+  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+             IsThumb1, IsThumb2);
+
+  // Decrement loop variable by UnitSize.
+  if (IsThumb1) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(varPhi).addImm(UnitSize);
+    AddDefaultPred(MIB);
+  } else {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl,
+                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+  BuildMI(*BB, BB->end(), dl,
+          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  // loopMBB can loop back to loopMBB or fall through to exitMBB.
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  // Add epilogue to handle BytesLeft.
+  BB = exitMBB;
+  MachineInstr *StartOfExit = exitMBB->begin();
+
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  unsigned srcIn = srcLoop;
+  unsigned destIn = destLoop;
+  for (unsigned i = 0; i < BytesLeft; i++) {
+    unsigned srcOut = MRI.createVirtualRegister(TRC);
+    unsigned destOut = MRI.createVirtualRegister(TRC);
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+               IsThumb1, IsThumb2);
+    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+               IsThumb1, IsThumb2);
+    srcIn = srcOut;
+    destIn = destOut;
+  }
+
+  MI->eraseFromParent();   // The instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetMachine &TM = getTargetMachine();
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(Subtarget->isTargetWindows() &&
+         "__chkstk is only supported on Windows");
+  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
+
+  // __chkstk takes the number of words to allocate on the stack in R4, and
+  // returns the stack adjustment in number of bytes in R4.  This will not
+  // clober any other registers (other than the obvious lr).
+  //
+  // Although, technically, IP should be considered a register which may be
+  // clobbered, the call itself will not touch it.  Windows on ARM is a pure
+  // thumb-2 environment, so there is no interworking required.  As a result, we
+  // do not expect a veneer to be emitted by the linker, clobbering IP.
+  //
+  // Each module receives its own copy of __chkstk, so no import thunk is
+  // required, again, ensuring that IP is not clobbered.
+  //
+  // Finally, although some linkers may theoretically provide a trampoline for
+  // out of range calls (which is quite common due to a 32M range limitation of
+  // branches for Thumb), we can generate the long-call version via
+  // -mcmodel=large, alleviating the need for the trampoline which may clobber
+  // IP.
+
+  switch (TM.getCodeModel()) {
+  case CodeModel::Small:
+  case CodeModel::Medium:
+  case CodeModel::Default:
+  case CodeModel::Kernel:
+    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
+      .addImm((unsigned)ARMCC::AL).addReg(0)
+      .addExternalSymbol("__chkstk")
+      .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+      .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+      .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+    break;
+  case CodeModel::Large:
+  case CodeModel::JITDefault: {
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+    unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+
+    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
+      .addExternalSymbol("__chkstk");
+    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
+      .addImm((unsigned)ARMCC::AL).addReg(0)
+      .addReg(Reg, RegState::Kill)
+      .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+      .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+      .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+    break;
+  }
+  }
+
+  AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
+                                      ARM::SP)
+                              .addReg(ARM::SP).addReg(ARM::R4)));
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+  switch (MI->getOpcode()) {
+  default: {
+    MI->dump();
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+  // The Thumb2 pre-indexed stores have the same MI operands, they just
+  // define them differently in the .td files from the isel patterns, so
+  // they need pseudos.
+  case ARM::t2STR_preidx:
+    MI->setDesc(TII->get(ARM::t2STR_PRE));
+    return BB;
+  case ARM::t2STRB_preidx:
+    MI->setDesc(TII->get(ARM::t2STRB_PRE));
+    return BB;
+  case ARM::t2STRH_preidx:
+    MI->setDesc(TII->get(ARM::t2STRH_PRE));
+    return BB;
+
+  case ARM::STRi_preidx:
+  case ARM::STRBi_preidx: {
+    unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
+      ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
+    // Decode the offset.
+    unsigned Offset = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
+    Offset = ARM_AM::getAM2Offset(Offset);
+    if (isSub)
+      Offset = -Offset;
+
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    BuildMI(*BB, MI, dl, TII->get(NewOpc))
+      .addOperand(MI->getOperand(0))  // Rn_wb
+      .addOperand(MI->getOperand(1))  // Rt
+      .addOperand(MI->getOperand(2))  // Rn
+      .addImm(Offset)                 // offset (skip GPR==zero_reg)
+      .addOperand(MI->getOperand(5))  // pred
+      .addOperand(MI->getOperand(6))
+      .addMemOperand(MMO);
+    MI->eraseFromParent();
+    return BB;
+  }
+  case ARM::STRr_preidx:
+  case ARM::STRBr_preidx:
+  case ARM::STRH_preidx: {
+    unsigned NewOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
+    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
+    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
+    }
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+      MIB.addOperand(MI->getOperand(i));
+    MI->eraseFromParent();
+    return BB;
+  }
+
+  case ARM::tMOVCCr_pseudo: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator It = BB;
+    ++It;
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB  = BB;
+    MachineFunction *F = BB->getParent();
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
+      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(*BB, BB->begin(), dl,
+            TII->get(ARM::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::BCCi64:
+  case ARM::BCCZi64: {
+    // If there is an unconditional branch to the other successor, remove it.
+    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
+
+    // Compare both parts that make up the double comparison separately for
+    // equality.
+    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
+
+    unsigned LHS1 = MI->getOperand(1).getReg();
+    unsigned LHS2 = MI->getOperand(2).getReg();
+    if (RHSisZero) {
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                     .addReg(LHS1).addImm(0));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+        .addReg(LHS2).addImm(0)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    } else {
+      unsigned RHS1 = MI->getOperand(3).getReg();
+      unsigned RHS2 = MI->getOperand(4).getReg();
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                     .addReg(LHS1).addReg(RHS1));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+        .addReg(LHS2).addReg(RHS2)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    }
+
+    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
+    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
+    if (MI->getOperand(0).getImm() == ARMCC::NE)
+      std::swap(destMBB, exitMBB);
+
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    if (isThumb2)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+    else
+      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    EmitSjLjDispatchBlock(MI, BB);
+    return BB;
+
+  case ARM::ABS:
+  case ARM::t2ABS: {
+    // To insert an ABS instruction, we have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // source vreg to test against 0, the destination vreg to set,
+    // the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    // It transforms
+    //     V1 = ABS V0
+    // into
+    //     V2 = MOVS V0
+    //     BCC                      (branch to SinkBB if V0 >= 0)
+    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
+    //     SinkBB: V1 = PHI(V2, V3)
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator BBI = BB;
+    ++BBI;
+    MachineFunction *Fn = BB->getParent();
+    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
+    Fn->insert(BBI, RSBBB);
+    Fn->insert(BBI, SinkBB);
+
+    unsigned int ABSSrcReg = MI->getOperand(1).getReg();
+    unsigned int ABSDstReg = MI->getOperand(0).getReg();
+    bool isThumb2 = Subtarget->isThumb2();
+    MachineRegisterInfo &MRI = Fn->getRegInfo();
+    // In Thumb mode S must not be specified if source register is the SP or
+    // PC and if destination register is the SP, so restrict register class
+    unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
+      (const TargetRegisterClass*)&ARM::rGPRRegClass :
+      (const TargetRegisterClass*)&ARM::GPRRegClass);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    SinkBB->splice(SinkBB->begin(), BB,
+                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(RSBBB);
+    BB->addSuccessor(SinkBB);
+
+    // fall through to SinkMBB
+    RSBBB->addSuccessor(SinkBB);
+
+    // insert a cmp at the end of BB
+    AddDefaultPred(BuildMI(BB, dl,
+                           TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                   .addReg(ABSSrcReg).addImm(0));
+
+    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
+    BuildMI(BB, dl,
+      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
+      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
+
+    // insert rsbri in RSBBB
+    // Note: BCC and rsbri will be converted into predicated rsbmi
+    // by if-conversion pass
+    BuildMI(*RSBBB, RSBBB->begin(), dl,
+      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+      .addReg(ABSSrcReg, RegState::Kill)
+      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+
+    // insert PHI in SinkBB,
+    // reuse ABSDstReg to not change uses of ABS instruction
+    BuildMI(*SinkBB, SinkBB->begin(), dl,
+      TII->get(ARM::PHI), ABSDstReg)
+      .addReg(NewRsbDstReg).addMBB(RSBBB)
+      .addReg(ABSSrcReg).addMBB(BB);
+
+    // remove ABS instruction
+    MI->eraseFromParent();
+
+    // return last added BB
+    return SinkBB;
+  }
+  case ARM::COPY_STRUCT_BYVAL_I32:
+    ++NumLoopByVals;
+    return EmitStructByval(MI, BB);
+  case ARM::WIN__CHKSTK:
+    return EmitLowered__chkstk(MI, BB);
+  }
+}
+
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                                      SDNode *Node) const {
+  if (!MI->hasPostISelHook()) {
+    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
+    return;
+  }
+
+  const MCInstrDesc *MCID = &MI->getDesc();
+  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
+  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
+  // operand is still set to noreg. If needed, set the optional operand's
+  // register to CPSR, and remove the redundant implicit def.
+  //
+  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
+
+  // Rename pseudo opcodes.
+  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
+  if (NewOpc) {
+    const ARMBaseInstrInfo *TII =
+      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
+    MCID = &TII->get(NewOpc);
+
+    assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
+           "converted opcode should be the same except for cc_out");
+
+    MI->setDesc(*MCID);
+
+    // Add the optional cc_out operand
+    MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
+  }
+  unsigned ccOutIdx = MCID->getNumOperands() - 1;
+
+  // Any ARM instruction that sets the 's' bit should specify an optional
+  // "cc_out" operand in the last operand position.
+  if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
+  // since we already have an optional CPSR def.
+  bool definesCPSR = false;
+  bool deadCPSR = false;
+  for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
+      definesCPSR = true;
+      if (MO.isDead())
+        deadCPSR = true;
+      MI->RemoveOperand(i);
+      break;
+    }
+  }
+  if (!definesCPSR) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
+  if (deadCPSR) {
+    assert(!MI->getOperand(ccOutIdx).getReg() &&
+           "expect uninitialized optional cc_out operand");
+    return;
+  }
+
+  // If this instruction was defined with an optional CPSR def and its dag node
+  // had a live implicit CPSR def, then activate the optional CPSR def.
+  MachineOperand &MO = MI->getOperand(ccOutIdx);
+  MO.setReg(ARM::CPSR);
+  MO.setIsDef(true);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+// Helper function that checks if N is a null or all ones constant.
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+  if (!C)
+    return false;
+  return AllOnes ? C->isAllOnesValue() : C->isNullValue();
+}
+
+// Return true if N is conditionally 0 or all ones.
+// Detects these expressions where cc is an i1 value:
+//
+//   (select cc 0, y)   [AllOnes=0]
+//   (select cc y, 0)   [AllOnes=0]
+//   (zext cc)          [AllOnes=0]
+//   (sext cc)          [AllOnes=0/1]
+//   (select cc -1, y)  [AllOnes=1]
+//   (select cc y, -1)  [AllOnes=1]
+//
+// Invert is set when N is the null/all ones constant when CC is false.
+// OtherOp is set to the alternative value of N.
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
+                                       SDValue &CC, bool &Invert,
+                                       SDValue &OtherOp,
+                                       SelectionDAG &DAG) {
+  switch (N->getOpcode()) {
+  default: return false;
+  case ISD::SELECT: {
+    CC = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    if (isZeroOrAllOnes(N1, AllOnes)) {
+      Invert = false;
+      OtherOp = N2;
+      return true;
+    }
+    if (isZeroOrAllOnes(N2, AllOnes)) {
+      Invert = true;
+      OtherOp = N1;
+      return true;
+    }
+    return false;
+  }
+  case ISD::ZERO_EXTEND:
+    // (zext cc) can never be the all ones value.
+    if (AllOnes)
+      return false;
+    // Fall through.
+  case ISD::SIGN_EXTEND: {
+    EVT VT = N->getValueType(0);
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    Invert = !AllOnes;
+    if (AllOnes)
+      // When looking for an AllOnes constant, N is an sext, and the 'other'
+      // value is 0.
+      OtherOp = DAG.getConstant(0, VT);
+    else if (N->getOpcode() == ISD::ZERO_EXTEND)
+      // When looking for a 0 constant, N can be zext or sext.
+      OtherOp = DAG.getConstant(1, VT);
+    else
+      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+    return true;
+  }
+  }
+}
+
+// Combine a constant select operand into its use:
+//
+//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
+//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// The transform is rejected if the select doesn't have a constant operand that
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+//   (add (zext cc), x) -> (select cc (add x, 1), x)
+//   (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+//
+// @param N       The node to transform.
+// @param Slct    The N operand that is a select.
+// @param OtherOp The other N operand (x above).
+// @param DCI     Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
+// @returns The new node, or SDValue() on failure.
+static
+SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            bool AllOnes = false) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue NonConstantVal;
+  SDValue CCOp;
+  bool SwapSelectOps;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+                                  NonConstantVal, DAG))
+    return SDValue();
+
+  // Slct is now know to be the desired identity constant when CC is true.
+  SDValue TrueVal = OtherOp;
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                 OtherOp, NonConstantVal);
+  // Unless SwapSelectOps says CC should be false.
+  if (SwapSelectOps)
+    std::swap(TrueVal, FalseVal);
+
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
+                     CCOp, TrueVal, FalseVal);
+}
+
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static
+SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
+    if (Result.getNode())
+      return Result;
+  }
+  if (N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
+    if (Result.getNode())
+      return Result;
+  }
+  return SDValue();
+}
+
+// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
+// (only after legalization).
+static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Only perform optimization if after legalize, and if NEON is available. We
+  // also expected both operands to be BUILD_VECTORs.
+  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
+      || N0.getOpcode() != ISD::BUILD_VECTOR
+      || N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Check that the vector operands are of the right form.
+  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
+  // operands, where N is the size of the formed vector.
+  // Each EXTRACT_VECTOR should have the same input vector and odd or even
+  // index such that we have a pair wise add pattern.
+
+  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
+  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = N0->getOperand(0)->getOperand(0);
+  SDNode *V = Vec.getNode();
+  unsigned nextIndex = 0;
+
+  // For each operands to the ADD which are BUILD_VECTORs,
+  // check to see if each of their operands are an EXTRACT_VECTOR with
+  // the same vector and appropriate index.
+  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
+    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
+        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+      SDValue ExtVec0 = N0->getOperand(i);
+      SDValue ExtVec1 = N1->getOperand(i);
+
+      // First operand is the vector, verify its the same.
+      if (V != ExtVec0->getOperand(0).getNode() ||
+          V != ExtVec1->getOperand(0).getNode())
+        return SDValue();
+
+      // Second is the constant, verify its correct.
+      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
+      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
+
+      // For the constant, we want to see all the even or all the odd.
+      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
+          || C1->getZExtValue() != nextIndex+1)
+        return SDValue();
+
+      // Increment index.
+      nextIndex+=2;
+    } else
+      return SDValue();
+  }
+
+  // Create VPADDL node.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
+                                TLI.getPointerTy()));
+
+  // Input is the vector.
+  Ops.push_back(Vec);
+
+  // Get widened type and narrowed type.
+  MVT widenType;
+  unsigned numElem = VT.getVectorNumElements();
+  
+  EVT inputLaneType = Vec.getValueType().getVectorElementType();
+  switch (inputLaneType.getSimpleVT().SimpleTy) {
+    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
+    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
+    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
+    default:
+      llvm_unreachable("Invalid vector element type for padd optimization.");
+  }
+
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, Ops);
+  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
+  return DAG.getNode(ExtOp, SDLoc(N), VT, tmp);
+}
+
+static SDValue findMUL_LOHI(SDValue V) {
+  if (V->getOpcode() == ISD::UMUL_LOHI ||
+      V->getOpcode() == ISD::SMUL_LOHI)
+    return V;
+  return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+
+  if (Subtarget->isThumb1Only()) return SDValue();
+
+  // Only perform the checks after legalize when the pattern is available.
+  if (DCI.isBeforeLegalize()) return SDValue();
+
+  // Look for multiply add opportunities.
+  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
+  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
+  // a glue link from the first add to the second add.
+  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+  // a S/UMLAL instruction.
+  //          loAdd   UMUL_LOHI
+  //            \    / :lo    \ :hi
+  //             \  /          \          [no multiline comment]
+  //              ADDC         |  hiAdd
+  //                 \ :glue  /  /
+  //                  \      /  /
+  //                    ADDE
+  //
+  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+  SDValue AddcOp0 = AddcNode->getOperand(0);
+  SDValue AddcOp1 = AddcNode->getOperand(1);
+
+  // Check if the two operands are from the same mul_lohi node.
+  if (AddcOp0.getNode() == AddcOp1.getNode())
+    return SDValue();
+
+  assert(AddcNode->getNumValues() == 2 &&
+         AddcNode->getValueType(0) == MVT::i32 &&
+         "Expect ADDC with two result values. First: i32");
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
+
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+    return SDValue();
+
+  // Look for the glued ADDE.
+  SDNode* AddeNode = AddcNode->getGluedUser();
+  if (!AddeNode)
+    return SDValue();
+
+  // Make sure it is really an ADDE.
+  if (AddeNode->getOpcode() != ISD::ADDE)
+    return SDValue();
+
+  assert(AddeNode->getNumOperands() == 3 &&
+         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+         "ADDE node has the wrong inputs");
+
+  // Check for the triangle shape.
+  SDValue AddeOp0 = AddeNode->getOperand(0);
+  SDValue AddeOp1 = AddeNode->getOperand(1);
+
+  // Make sure that the ADDE operands are not coming from the same node.
+  if (AddeOp0.getNode() == AddeOp1.getNode())
+    return SDValue();
+
+  // Find the MUL_LOHI node walking up ADDE's operands.
+  bool IsLeftOperandMUL = false;
+  SDValue MULOp = findMUL_LOHI(AddeOp0);
+  if (MULOp == SDValue())
+   MULOp = findMUL_LOHI(AddeOp1);
+  else
+    IsLeftOperandMUL = true;
+  if (MULOp == SDValue())
+     return SDValue();
+
+  // Figure out the right opcode.
+  unsigned Opc = MULOp->getOpcode();
+  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+  // Figure out the high and low input values to the MLAL node.
+  SDValue* HiMul = &MULOp;
+  SDValue* HiAdd = nullptr;
+  SDValue* LoMul = nullptr;
+  SDValue* LowAdd = nullptr;
+
+  if (IsLeftOperandMUL)
+    HiAdd = &AddeOp1;
+  else
+    HiAdd = &AddeOp0;
+
+
+  if (AddcOp0->getOpcode() == Opc) {
+    LoMul = &AddcOp0;
+    LowAdd = &AddcOp1;
+  }
+  if (AddcOp1->getOpcode() == Opc) {
+    LoMul = &AddcOp1;
+    LowAdd = &AddcOp0;
+  }
+
+  if (!LoMul)
+    return SDValue();
+
+  if (LoMul->getNode() != HiMul->getNode())
+    return SDValue();
+
+  // Create the merged node.
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(LoMul->getOperand(0));
+  Ops.push_back(LoMul->getOperand(1));
+  Ops.push_back(*LowAdd);
+  Ops.push_back(*HiAdd);
+
+  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
+                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+  // Replace the ADDs' nodes uses by the MLA node's values.
+  SDValue HiMLALResult(MLALNode.getNode(), 1);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+  SDValue LoMLALResult(MLALNode.getNode(), 0);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+  // Return original node to notify the driver to stop replacing.
+  SDValue resNode(AddcNode, 0);
+  return resNode;
+}
+
+/// PerformADDCCombine - Target-specific dag combine transform from
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+static SDValue PerformADDCCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1.  This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const ARMSubtarget *Subtarget){
+
+  // Attempt to create vpaddl for this add.
+  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
+  if (Result.getNode())
+    return Result;
+
+  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
+  if (N0.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
+    if (Result.getNode()) return Result;
+  }
+  return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // First try with the default operand order.
+  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
+  if (Result.getNode())
+    return Result;
+
+  // If that didn't work, try again with the operands commuted.
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
+}
+
+/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+///
+static SDValue PerformSUBCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+  if (N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
+    if (Result.getNode()) return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformVMULCombine
+/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
+/// special multiplier accumulator forwarding.
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+//  However, for (A + B) * (A + B),
+//    vadd d2, d0, d1
+//    vmul d3, d0, d2
+//    vmla d3, d1, d2
+//  is slower than
+//    vadd d2, d0, d1
+//    vmul d3, d2, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N1.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  if (N0 == N1)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+
+  if (Subtarget->isThumb1Only())
+    return SDValue();
+
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+
+  int64_t MulAmt = C->getSExtValue();
+  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
+
+  ShiftAmt = ShiftAmt & (32 - 1);
+  SDValue V = N->getOperand(0);
+  SDLoc DL(N);
+
+  SDValue Res;
+  MulAmt >>= ShiftAmt;
+
+  if (MulAmt >= 0) {
+    if (isPowerOf2_32(MulAmt - 1)) {
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      Res = DAG.getNode(ISD::ADD, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmt - 1),
+                                                    MVT::i32)));
+    } else if (isPowerOf2_32(MulAmt + 1)) {
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmt + 1),
+                                                    MVT::i32)),
+                        V);
+    } else
+      return SDValue();
+  } else {
+    uint64_t MulAmtAbs = -MulAmt;
+    if (isPowerOf2_32(MulAmtAbs + 1)) {
+      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmtAbs + 1),
+                                                    MVT::i32)));
+    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
+      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+      Res = DAG.getNode(ISD::ADD, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmtAbs-1),
+                                                    MVT::i32)));
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        DAG.getConstant(0, MVT::i32),Res);
+
+    } else
+      return SDValue();
+  }
+
+  if (ShiftAmt != 0)
+    Res = DAG.getNode(ISD::SHL, DL, VT,
+                      Res, DAG.getConstant(ShiftAmt, MVT::i32));
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, Res, false);
+  return SDValue();
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Attempt to use immediate-form VBIC
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VbicVT;
+      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VbicVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
+        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
+    SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
+    if (Result.getNode())
+      return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
+static SDValue PerformORCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *Subtarget) {
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
+    if (Result.getNode())
+      return Result;
+  }
+
+  // The code below optimizes (or (and X, Y), Z).
+  // The AND operand needs to have a single user to make these optimizations
+  // profitable.
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+    return SDValue();
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    APInt SplatBits0, SplatBits1;
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+    // Ensure that the second operand of both ands are constants
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                          HasAnyUndefs) && !HasAnyUndefs) {
+            // Ensure that the bit width of the constants are the same and that
+            // the splat arguments are logical inverses as per the pattern we
+            // are trying to simplify.
+            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+                SplatBits0 == ~SplatBits1) {
+                // Canonicalize the vector type to make instruction selection
+                // simpler.
+                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                             N0->getOperand(1),
+                                             N0->getOperand(0),
+                                             N1->getOperand(0));
+                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+            }
+        }
+    }
+  }
+
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+
+  // BFI is only available on V6T2+
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
+    return SDValue();
+
+  SDLoc DL(N);
+  // 1) or (and A, mask), val => ARMbfi A, val, mask
+  //      iff (val & mask) == val
+  //
+  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+  //          && mask == ~mask2
+  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+  //          && ~mask == mask2
+  //  (i.e., copy a bitfield value into another bitfield of the same width)
+
+  if (VT != MVT::i32)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+
+  // The value and the mask need to be constants so we can verify this is
+  // actually a bitfield set. If the mask is 0xffff, we can do better
+  // via a movt instruction, so don't use BFI in that case.
+  SDValue MaskOp = N0.getOperand(1);
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+  if (!MaskC)
+    return SDValue();
+  unsigned Mask = MaskC->getZExtValue();
+  if (Mask == 0xffff)
+    return SDValue();
+  SDValue Res;
+  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N1C) {
+    unsigned Val = N1C->getZExtValue();
+    if ((Val & ~Mask) != Val)
+      return SDValue();
+
+    if (ARM::isBitFieldInvertedMask(Mask)) {
+      Val >>= countTrailingZeros(~Mask);
+
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
+                        DAG.getConstant(Val, MVT::i32),
+                        DAG.getConstant(Mask, MVT::i32));
+
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
+  } else if (N1.getOpcode() == ISD::AND) {
+    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned Mask2 = N11C->getZExtValue();
+
+    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
+    // as is to match.
+    if (ARM::isBitFieldInvertedMask(Mask) &&
+        (Mask == ~Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask == 0xffff || Mask == 0xffff0000))
+        return SDValue();
+      // 2a
+      unsigned amt = countTrailingZeros(Mask2);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+                        DAG.getConstant(amt, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
+                        DAG.getConstant(Mask, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+               (~Mask == Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask2 == 0xffff || Mask2 == 0xffff0000))
+        return SDValue();
+      // 2b
+      unsigned lsb = countTrailingZeros(Mask);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
+                        DAG.getConstant(lsb, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+                        DAG.getConstant(Mask2, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
+  }
+
+  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
+      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
+      ARM::isBitFieldInvertedMask(~Mask)) {
+    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
+    // where lsb(mask) == #shamt and masked bits of B are known zero.
+    SDValue ShAmt = N00.getOperand(1);
+    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    unsigned LSB = countTrailingZeros(Mask);
+    if (ShAmtC != LSB)
+      return SDValue();
+
+    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
+                      DAG.getConstant(~Mask, MVT::i32));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, Res, false);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformXORCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
+    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
+    if (Result.getNode())
+      return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+/// the bits being cleared by the AND are not demanded by the BFI.
+static SDValue PerformBFICombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() == ISD::AND) {
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned LSB = countTrailingZeros(~InvMask);
+    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
+    unsigned Mask = (1 << Width)-1;
+    unsigned Mask2 = N11C->getZExtValue();
+    if ((Mask & (~Mask2)) == 0)
+      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
+                             N->getOperand(0), N1.getOperand(0),
+                             N->getOperand(2));
+  }
+  return SDValue();
+}
+
+/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVRRD.
+static SDValue PerformVMOVRRDCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  // vmovrrd(vmovdrr x, y) -> x,y
+  SDValue InDouble = N->getOperand(0);
+  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
+    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+  // vmovrrd(load f64) -> (load i32), (load i32)
+  SDNode *InNode = InDouble.getNode();
+  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+      InNode->getValueType(0) == MVT::f64 &&
+      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+      !cast<LoadSDNode>(InNode)->isVolatile()) {
+    // TODO: Should this be done for non-FrameIndex operands?
+    LoadSDNode *LD = cast<LoadSDNode>(InNode);
+
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc DL(LD);
+    SDValue BasePtr = LD->getBasePtr();
+    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
+                                 LD->getPointerInfo(), LD->isVolatile(),
+                                 LD->isNonTemporal(), LD->isInvariant(),
+                                 LD->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
+                                 LD->getPointerInfo(), LD->isVolatile(),
+                                 LD->isNonTemporal(), LD->isInvariant(),
+                                 std::min(4U, LD->getAlignment() / 2));
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+    if (DCI.DAG.getTargetLoweringInfo().isBigEndian())
+      std::swap (NewLD1, NewLD2);
+    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
+    DCI.RemoveFromWorklist(LD);
+    DAG.DeleteNode(LD);
+    return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
+static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
+  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() == ISD::BITCAST)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::BITCAST)
+    Op1 = Op1.getOperand(0);
+  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
+      Op0.getNode() == Op1.getNode() &&
+      Op0.getResNo() == 0 && Op1.getResNo() == 1)
+    return DAG.getNode(ISD::BITCAST, SDLoc(N),
+                       N->getValueType(0), Op0.getOperand(0));
+  return SDValue();
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
+  // pack all of the elements in one place.  Next, store to memory in fewer
+  // chunks.
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+  if (St->isTruncatingStore() && VT.isVector()) {
+    SelectionDAG &DAG = DCI.DAG;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    EVT StVT = St->getMemoryVT();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+
+    unsigned SizeRatio  = FromEltSz / ToEltSz;
+    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle.
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                     NumElems*SizeRatio);
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDLoc DL(St);
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; ++i)
+      ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+        StoreType = Tp;
+    }
+    // Didn't find a legal store type.
+    if (!TLI.isTypeLegal(StoreType))
+      return SDValue();
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue BasePtr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
+    for (unsigned I = 0; I < E; I++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(I));
+      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                            Increment);
+      Chains.push_back(Ch);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (!ISD::isNormalStore(St))
+    return SDValue();
+
+  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
+  // ARM stores of arguments in the same cache line.
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
+    SDLoc DL(St);
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
+                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
+                                  St->isNonTemporal(), St->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL,
+                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
+                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(),
+                        std::min(4U, St->getAlignment() / 2));
+  }
+
+  if (StVal.getValueType() != MVT::i64 ||
+      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // Bitcast an i64 store extracted from a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(StVal);
+  SDValue IntVec = StVal.getOperand(0);
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 IntVec.getValueType().getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                               Vec, StVal.getOperand(1));
+  dl = SDLoc(N);
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(ExtElt.getNode());
+  DCI.AddToWorklist(V.getNode());
+  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                      St->getPointerInfo(), St->isVolatile(),
+                      St->isNonTemporal(), St->getAlignment(),
+                      St->getTBAAInfo());
+}
+
+/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
+/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
+/// i64 vector to have f64 elements, since the value can then be loaded
+/// directly into a VFP register.
+static bool hasNormalLoadOperand(SDNode *N) {
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
+      return true;
+  }
+  return false;
+}
+
+/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
+/// ISD::BUILD_VECTOR.
+static SDValue PerformBUILD_VECTORCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI){
+  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
+  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
+  // into a pair of GPRs, which is fine when the value is used as a scalar,
+  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
+  SelectionDAG &DAG = DCI.DAG;
+  if (N->getNumOperands() == 2) {
+    SDValue RV = PerformVMOVDRRCombine(N, DAG);
+    if (RV.getNode())
+      return RV;
+  }
+
+  // Load i64 elements as f64 values so that type legalization does not split
+  // them up into i32 values.
+  EVT VT = N->getValueType(0);
+  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
+    return SDValue();
+  SDLoc dl(N);
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
+    Ops.push_back(V);
+    // Make the DAGCombiner fold the bitcast.
+    DCI.AddToWorklist(V.getNode());
+  }
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
+  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops);
+  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+}
+
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR in something more vector friendly, i.e., that does not
+  // force to use floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to a integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands are 32-bits (64-bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // Model is, if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit cast to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
+/// PerformInsertEltCombine - Target-specific dag combine xforms for
+/// ISD::INSERT_VECTOR_ELT.
+static SDValue PerformInsertEltCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  // Bitcast an i64 load inserted into a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  EVT VT = N->getValueType(0);
+  SDNode *Elt = N->getOperand(1).getNode();
+  if (VT.getVectorElementType() != MVT::i64 ||
+      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 VT.getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(V.getNode());
+  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
+                               Vec, V, N->getOperand(2));
+  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
+}
+
+/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
+/// ISD::VECTOR_SHUFFLE.
+static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+  // The LLVM shufflevector instruction does not require the shuffle mask
+  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
+  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
+  // operands do not match the mask length, they are extended by concatenating
+  // them with undef vectors.  That is probably the right thing for other
+  // targets, but for NEON it is better to concatenate two double-register
+  // size vector operands into a single quad-register size vector.  Do that
+  // transformation here:
+  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
+  //   shuffle(concat(v1, v2), undef)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op0.getNumOperands() != 2 ||
+      Op1.getNumOperands() != 2)
+    return SDValue();
+  SDValue Concat0Op1 = Op0.getOperand(1);
+  SDValue Concat1Op1 = Op1.getOperand(1);
+  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
+      Concat1Op1.getOpcode() != ISD::UNDEF)
+    return SDValue();
+  // Skip the transformation if any of the types are illegal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  if (!TLI.isTypeLegal(VT) ||
+      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
+      !TLI.isTypeLegal(Concat1Op1.getValueType()))
+    return SDValue();
+
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
+                                  Op0.getOperand(0), Op1.getOperand(0));
+  // Translate the shuffle mask.
+  SmallVector<int, 16> NewMask;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfElts = NumElts/2;
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+  for (unsigned n = 0; n < NumElts; ++n) {
+    int MaskElt = SVN->getMaskElt(n);
+    int NewElt = -1;
+    if (MaskElt < (int)HalfElts)
+      NewElt = MaskElt;
+    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
+      NewElt = HalfElts + MaskElt - NumElts;
+    NewMask.push_back(NewElt);
+  }
+  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
+                              DAG.getUNDEF(VT), NewMask.data());
+}
+
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
+/// NEON load/store intrinsics to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: llvm_unreachable("unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode for Neon base update");
+      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else
+      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint64_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+    } else if (NumBytes >= 3 * 16) {
+      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+      // separate instructions that make it harder to use a non-constant update.
+      continue;
+    }
+
+    // Create the new updating load/store node.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs+2));
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
+                                           Ops, MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
+/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
+/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
+/// return true.
+static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // vldN-dup instructions only support 64-bit vectors for N > 1.
+  if (!VT.is64BitVector())
+    return false;
+
+  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = ARMISD::VLD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = ARMISD::VLD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = ARMISD::VLD4DUP;
+  } else {
+    return false;
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != ARMISD::VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return false;
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumVecs+1));
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
+                                           Ops, VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return true;
+}
+
+/// PerformVDUPLANECombine - Target-specific dag combine xforms for
+/// ARMISD::VDUPLANE.
+static SDValue PerformVDUPLANECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op = N->getOperand(0);
+
+  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
+  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
+  if (CombineVLDDUP(N, DCI))
+    return SDValue(N, 0);
+
+  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
+  // redundant.  Ignore bit_converts for now; element sizes are checked below.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
+    return SDValue();
+
+  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
+  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
+  // The canonical VMOV for a zero vector uses a 32-bit element size.
+  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned EltBits;
+  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
+    EltSize = 8;
+  EVT VT = N->getValueType(0);
+  if (EltSize > VT.getVectorElementType().getSizeInBits())
+    return SDValue();
+
+  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+}
+
+// isConstVecPow2 - Return true if each vector element is a power of 2, all
+// elements are the same constant, C, and Log2(C) ranges from 1 to 32.
+static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
+{
+  integerPart cN;
+  integerPart c0 = 0;
+  for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
+       I != E; I++) {
+    ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
+    if (!C)
+      return false;
+
+    bool isExact;
+    APFloat APF = C->getValueAPF();
+    if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
+        != APFloat::opOK || !isExact)
+      return false;
+
+    c0 = (I == 0) ? cN : c0;
+    if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
+      return false;
+  }
+  C = c0;
+  return true;
+}
+
+/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
+/// can replace combinations of VMUL and VCVT (floating-point to integer)
+/// when the VMUL has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vmul.f32        d16, d17, d16
+///  vcvt.s32.f32    d16, d16
+/// becomes:
+///  vcvt.s32.f32    d16, d16, #3
+static SDValue PerformVCVTCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+
+  if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
+      Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  uint64_t C;
+  SDValue N0 = Op->getOperand(0);
+  SDValue ConstVec = Op->getOperand(1);
+  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
+      !isConstVecPow2(ConstVec, isSigned, C))
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from f32 to i32. We can handle
+    // smaller integers by generating an extra truncate, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
+    Intrinsic::arm_neon_vcvtfp2fxu;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+                                 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                                 DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
+                                 DAG.getConstant(Log2_64(C), MVT::i32));
+
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
+/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
+/// can replace combinations of VCVT (integer to floating-point) and VDIV
+/// when the VDIV has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vcvt.f32.s32    d16, d16
+///  vdiv.f32        d16, d17, d16
+/// becomes:
+///  vcvt.f32.s32    d16, d16, #3
+static SDValue PerformVDIVCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+  unsigned OpOpcode = Op.getNode()->getOpcode();
+
+  if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
+      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
+    return SDValue();
+
+  uint64_t C;
+  SDValue ConstVec = N->getOperand(1);
+  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+
+  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
+      !isConstVecPow2(ConstVec, isSigned, C))
+    return SDValue();
+
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from i32 to f32. We can handle
+    // smaller integers by generating an extra extend, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
+  SDValue ConvInput = Op.getOperand(0);
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                            SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            ConvInput);
+
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
+    Intrinsic::arm_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+                     Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, MVT::i32),
+                     ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
+}
+
+/// Getvshiftimm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation.  For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (isIntrinsic)
+    Cnt = -Cnt;
+  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+}
+
+/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default:
+    // Don't do anything for most intrinsics.
+    break;
+
+  // Vector shifts: check for immediate versions and lower them.
+  // Note: This is done during DAG combining instead of DAG legalizing because
+  // the build_vectors for 64-bit vector element shift counts are generally
+  // not legal, and it is hard to see their values after they get legalized to
+  // loads from a constant pool.
+  case Intrinsic::arm_neon_vshifts:
+  case Intrinsic::arm_neon_vshiftu:
+  case Intrinsic::arm_neon_vrshifts:
+  case Intrinsic::arm_neon_vrshiftu:
+  case Intrinsic::arm_neon_vrshiftn:
+  case Intrinsic::arm_neon_vqshifts:
+  case Intrinsic::arm_neon_vqshiftu:
+  case Intrinsic::arm_neon_vqshiftsu:
+  case Intrinsic::arm_neon_vqshiftns:
+  case Intrinsic::arm_neon_vqshiftnu:
+  case Intrinsic::arm_neon_vqshiftnsu:
+  case Intrinsic::arm_neon_vqrshiftns:
+  case Intrinsic::arm_neon_vqrshiftnu:
+  case Intrinsic::arm_neon_vqrshiftnsu: {
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    unsigned VShiftOpc = 0;
+
+    switch (IntNo) {
+    case Intrinsic::arm_neon_vshifts:
+    case Intrinsic::arm_neon_vshiftu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
+        VShiftOpc = ARMISD::VSHL;
+        break;
+      }
+      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
+        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
+                     ARMISD::VSHRs : ARMISD::VSHRu);
+        break;
+      }
+      return SDValue();
+
+    case Intrinsic::arm_neon_vrshifts:
+    case Intrinsic::arm_neon_vrshiftu:
+      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
+        break;
+      return SDValue();
+
+    case Intrinsic::arm_neon_vqshifts:
+    case Intrinsic::arm_neon_vqshiftu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+        break;
+      return SDValue();
+
+    case Intrinsic::arm_neon_vqshiftsu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for vqshlu intrinsic");
+
+    case Intrinsic::arm_neon_vrshiftn:
+    case Intrinsic::arm_neon_vqshiftns:
+    case Intrinsic::arm_neon_vqshiftnu:
+    case Intrinsic::arm_neon_vqshiftnsu:
+    case Intrinsic::arm_neon_vqrshiftns:
+    case Intrinsic::arm_neon_vqrshiftnu:
+    case Intrinsic::arm_neon_vqrshiftnsu:
+      // Narrowing shifts require an immediate right shift.
+      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for narrowing vector shift "
+                       "intrinsic");
+
+    default:
+      llvm_unreachable("unhandled vector shift");
+    }
+
+    switch (IntNo) {
+    case Intrinsic::arm_neon_vshifts:
+    case Intrinsic::arm_neon_vshiftu:
+      // Opcode already set above.
+      break;
+    case Intrinsic::arm_neon_vrshifts:
+      VShiftOpc = ARMISD::VRSHRs; break;
+    case Intrinsic::arm_neon_vrshiftu:
+      VShiftOpc = ARMISD::VRSHRu; break;
+    case Intrinsic::arm_neon_vrshiftn:
+      VShiftOpc = ARMISD::VRSHRN; break;
+    case Intrinsic::arm_neon_vqshifts:
+      VShiftOpc = ARMISD::VQSHLs; break;
+    case Intrinsic::arm_neon_vqshiftu:
+      VShiftOpc = ARMISD::VQSHLu; break;
+    case Intrinsic::arm_neon_vqshiftsu:
+      VShiftOpc = ARMISD::VQSHLsu; break;
+    case Intrinsic::arm_neon_vqshiftns:
+      VShiftOpc = ARMISD::VQSHRNs; break;
+    case Intrinsic::arm_neon_vqshiftnu:
+      VShiftOpc = ARMISD::VQSHRNu; break;
+    case Intrinsic::arm_neon_vqshiftnsu:
+      VShiftOpc = ARMISD::VQSHRNsu; break;
+    case Intrinsic::arm_neon_vqrshiftns:
+      VShiftOpc = ARMISD::VQRSHRNs; break;
+    case Intrinsic::arm_neon_vqrshiftnu:
+      VShiftOpc = ARMISD::VQRSHRNu; break;
+    case Intrinsic::arm_neon_vqrshiftnsu:
+      VShiftOpc = ARMISD::VQRSHRNsu; break;
+    }
+
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+  }
+
+  case Intrinsic::arm_neon_vshiftins: {
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    unsigned VShiftOpc = 0;
+
+    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
+      VShiftOpc = ARMISD::VSLI;
+    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
+      VShiftOpc = ARMISD::VSRI;
+    else {
+      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
+    }
+
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2),
+                       DAG.getConstant(Cnt, MVT::i32));
+  }
+
+  case Intrinsic::arm_neon_vqrshifts:
+  case Intrinsic::arm_neon_vqrshiftu:
+    // No immediate versions of these to check for.
+    break;
+  }
+
+  return SDValue();
+}
+
+/// PerformShiftCombine - Checks for immediate versions of vector shifts and
+/// lowers them.  As with the vector shift intrinsics, this is done during DAG
+/// combining instead of DAG legalizing because the build_vectors for 64-bit
+/// vector element shift counts are generally not legal, and it is hard to see
+/// their values after they get legalized to loads from a constant pool.
+static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+                                   const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
+    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
+    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
+    SDValue N1 = N->getOperand(1);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      SDValue N0 = N->getOperand(0);
+      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
+          DAG.MaskedValueIsZero(N0.getOperand(0),
+                                APInt::getHighBitsSet(32, 16)))
+        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
+    }
+  }
+
+  // Nothing to be done for scalar shifts.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  assert(ST->hasNEON() && "unexpected vector shift");
+  int64_t Cnt;
+
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+      return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    break;
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
+                            ARMISD::VSHRs : ARMISD::VSHRu);
+      return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    }
+  }
+  return SDValue();
+}
+
+/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
+/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
+static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  SDValue N0 = N->getOperand(0);
+
+  // Check for sign- and zero-extensions of vector extract operations of 8-
+  // and 16-bit vector elements.  NEON supports these directly.  They are
+  // handled during DAG combining because type legalization will promote them
+  // to 32-bit types and it is messy to recognize the operations after that.
+  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue Vec = N0.getOperand(0);
+    SDValue Lane = N0.getOperand(1);
+    EVT VT = N->getValueType(0);
+    EVT EltVT = N0.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    if (VT == MVT::i32 &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
+        TLI.isTypeLegal(Vec.getValueType()) &&
+        isa<ConstantSDNode>(Lane)) {
+
+      unsigned Opc = 0;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ISD::SIGN_EXTEND:
+        Opc = ARMISD::VGETLANEs;
+        break;
+      case ISD::ZERO_EXTEND:
+      case ISD::ANY_EXTEND:
+        Opc = ARMISD::VGETLANEu;
+        break;
+      }
+      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
+/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
+static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
+                                       const ARMSubtarget *ST) {
+  // If the target supports NEON, try to use vmax/vmin instructions for f32
+  // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
+  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
+  // a NaN; only do the transformation when it matches that behavior.
+
+  // For now only do this when using NEON for FP operations; if using VFP, it
+  // is not obvious that the benefit outweighs the cost of switching to the
+  // NEON pipeline.
+  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
+      N->getValueType(0) != MVT::f32)
+    return SDValue();
+
+  SDValue CondLHS = N->getOperand(0);
+  SDValue CondRHS = N->getOperand(1);
+  SDValue LHS = N->getOperand(2);
+  SDValue RHS = N->getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+  unsigned Opcode = 0;
+  bool IsReversed;
+  if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
+    IsReversed = false; // x CC y ? x : y
+  } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
+    IsReversed = true ; // x CC y ? y : x
+  } else {
+    return SDValue();
+  }
+
+  bool IsUnordered;
+  switch (CC) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETULT:
+  case ISD::SETULE:
+    // If LHS is NaN, an ordered comparison will be false and the result will
+    // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
+    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
+    IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
+    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
+      break;
+    // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
+    // will return -0, so vmin can only be used for unsafe math or if one of
+    // the operands is known to be nonzero.
+    if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
+        !DAG.getTarget().Options.UnsafeFPMath &&
+        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+      break;
+    Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
+    break;
+
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    // If LHS is NaN, an ordered comparison will be false and the result will
+    // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
+    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
+    IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
+    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
+      break;
+    // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
+    // will return +0, so vmax can only be used for unsafe math or if one of
+    // the operands is known to be nonzero.
+    if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
+        !DAG.getTarget().Options.UnsafeFPMath &&
+        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+      break;
+    Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
+    break;
+  }
+
+  if (!Opcode)
+    return SDValue();
+  return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
+}
+
+/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
+SDValue
+ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
+  SDValue Cmp = N->getOperand(4);
+  if (Cmp.getOpcode() != ARMISD::CMPZ)
+    // Only looking at EQ and NE cases.
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+  SDValue FalseVal = N->getOperand(0);
+  SDValue TrueVal = N->getOperand(1);
+  SDValue ARMcc = N->getOperand(2);
+  ARMCC::CondCodes CC =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+  // Simplify
+  //   mov     r1, r0
+  //   cmp     r1, x
+  //   mov     r0, y
+  //   moveq   r0, x
+  // to
+  //   cmp     r0, x
+  //   movne   r0, y
+  //
+  //   mov     r1, r0
+  //   cmp     r1, x
+  //   mov     r0, x
+  //   movne   r0, y
+  // to
+  //   cmp     r0, x
+  //   movne   r0, y
+  /// FIXME: Turn this into a target neutral optimization?
+  SDValue Res;
+  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
+    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
+                      N->getOperand(3), Cmp);
+  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
+    SDValue ARMcc;
+    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
+    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
+                      N->getOperand(3), NewCmp);
+  }
+
+  if (Res.getNode()) {
+    APInt KnownZero, KnownOne;
+    DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
+    // Capture demanded bits information that would be otherwise lost.
+    if (KnownZero == 0xfffffffe)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i1));
+    else if (KnownZero == 0xffffff00)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i8));
+    else if (KnownZero == 0xffff0000)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i16));
+  }
+
+  return Res;
+}
+
+SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
+  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
+  case ISD::SUB:        return PerformSUBCombine(N, DCI);
+  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
+  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
+  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
+  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
+  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
+  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
+  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
+  case ISD::STORE:      return PerformSTORECombine(N, DCI);
+  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
+  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
+  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
+  case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+  case ARMISD::VLD2DUP:
+  case ARMISD::VLD3DUP:
+  case ARMISD::VLD4DUP:
+    return CombineBaseUpdate(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return CombineBaseUpdate(N, DCI);
+    default: break;
+    }
+    break;
+  }
+  return SDValue();
+}
+
+bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+                                                          EVT VT) const {
+  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
+
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned,
+                                                      bool *Fast) const {
+  // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
+  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32: {
+    // Unaligned access can use (for example) LRDB, LRDH, LDR
+    if (AllowsUnaligned) {
+      if (Fast)
+        *Fast = Subtarget->hasV7Ops();
+      return true;
+    }
+    return false;
+  }
+  case MVT::f64:
+  case MVT::v2f64: {
+    // For any little-endian targets with neon, we can support unaligned ld/st
+    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+    // A big-endian target may also explicitly support unaligned accesses
+    if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
+  }
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                       unsigned AlignCheck) {
+  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+          (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+
+  // See if we can use NEON instructions for this...
+  if ((!IsMemset || ZeroMemset) &&
+      Subtarget->hasNEON() &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat)) {
+    bool Fast;
+    if (Size >= 16 &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsUnalignedMemoryAccesses(MVT::v2f64, 0, &Fast) && Fast))) {
+      return MVT::v2f64;
+    } else if (Size >= 8 &&
+               (memOpAlign(SrcAlign, DstAlign, 8) ||
+                (allowsUnalignedMemoryAccesses(MVT::f64, 0, &Fast) && Fast))) {
+      return MVT::f64;
+    }
+  }
+
+  // Lowering to i32/i16 if the size permits.
+  if (Size >= 4)
+    return MVT::i32;
+  else if (Size >= 2)
+    return MVT::i16;
+
+  // Let the target-independent logic figure it out.
+  return MVT::Other;
+}
+
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
+
+static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+  if (V < 0)
+    return false;
+
+  unsigned Scale = 1;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    // Scale == 1;
+    break;
+  case MVT::i16:
+    // Scale == 2;
+    Scale = 2;
+    break;
+  case MVT::i32:
+    // Scale == 4;
+    Scale = 4;
+    break;
+  }
+
+  if ((V & (Scale - 1)) != 0)
+    return false;
+  V /= Scale;
+  return V == (V & ((1LL << 5) - 1));
+}
+
+static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+                                      const ARMSubtarget *Subtarget) {
+  bool isNeg = false;
+  if (V < 0) {
+    isNeg = true;
+    V = - V;
+  }
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // + imm12 or - imm8
+    if (isNeg)
+      return V == (V & ((1LL << 8) - 1));
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    // Same as ARM mode. FIXME: NEON?
+    if (!Subtarget->hasVFP2())
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+static bool isLegalAddressImmediate(int64_t V, EVT VT,
+                                    const ARMSubtarget *Subtarget) {
+  if (V == 0)
+    return true;
+
+  if (!VT.isSimple())
+    return false;
+
+  if (Subtarget->isThumb1Only())
+    return isLegalT1AddressImmediate(V, VT);
+  else if (Subtarget->isThumb2())
+    return isLegalT2AddressImmediate(V, VT, Subtarget);
+
+  // ARM mode.
+  if (V < 0)
+    V = - V;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i32:
+    // +- imm12
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::i16:
+    // +- imm8
+    return V == (V & ((1LL << 8) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    if (!Subtarget->hasVFP2()) // FIXME: NEON?
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+                                                      EVT VT) const {
+  int Scale = AM.Scale;
+  if (Scale < 0)
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    if (Scale == 1)
+      return true;
+    // r + r << imm
+    Scale = Scale & ~1;
+    return Scale == 2 || Scale == 4 || Scale == 8;
+  case MVT::i64:
+    // r + r
+    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+      return true;
+    return false;
+  case MVT::isVoid:
+    // Note, we allow "void" uses (basically, uses that aren't loads or
+    // stores), because arm allows folding a scale into many arithmetic
+    // operations.  This should be made more precise and revisited later.
+
+    // Allow r << imm, but the imm has to be a multiple of two.
+    if (Scale & 1) return false;
+    return isPowerOf2_32(Scale);
+  }
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  EVT VT = getValueType(Ty, true);
+  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
+    return false;
+
+  // Can never fold addr of global into load/store.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AM.Scale) {
+  case 0:  // no scale reg, must be "r+i" or "r", or "i".
+    break;
+  case 1:
+    if (Subtarget->isThumb1Only())
+      return false;
+    // FALL THROUGH.
+  default:
+    // ARM doesn't support any R+R*scale+imm addr modes.
+    if (AM.BaseOffs)
+      return false;
+
+    if (!VT.isSimple())
+      return false;
+
+    if (Subtarget->isThumb2())
+      return isLegalT2ScaledAddressingMode(AM, VT);
+
+    int Scale = AM.Scale;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i32:
+      if (Scale < 0) Scale = -Scale;
+      if (Scale == 1)
+        return true;
+      // r + r << imm
+      return isPowerOf2_32(Scale & ~1);
+    case MVT::i16:
+    case MVT::i64:
+      // r + r
+      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+        return true;
+      return false;
+
+    case MVT::isVoid:
+      // Note, we allow "void" uses (basically, uses that aren't loads or
+      // stores), because arm allows folding a scale into many arithmetic
+      // operations.  This should be made more precise and revisited later.
+
+      // Allow r << imm, but the imm has to be a multiple of two.
+      if (Scale & 1) return false;
+      return isPowerOf2_32(Scale);
+    }
+  }
+  return true;
+}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // Thumb2 and ARM modes can use cmn for negative immediates.
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
+  // Thumb1 doesn't have cmn, and only 8-bit immediates.
+  return Imm >= 0 && Imm <= 255;
+}
+
+/// isLegalAddImmediate - Return true if the specified immediate is a legal add
+/// *or sub* immediate, that is the target has add or sub instructions which can
+/// add a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Same encoding for add/sub, just flip the sign.
+  int64_t AbsImm = llvm::abs64(Imm);
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(AbsImm) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
+  // Thumb1 only has 8-bit unsigned immediate.
+  return AbsImm >= 0 && AbsImm <= 255;
+}
+
+static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                      bool isSEXTLoad, SDValue &Base,
+                                      SDValue &Offset, bool &isInc,
+                                      SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
+    // AddressingMode 3
+    Base = Ptr->getOperand(0);
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC < 0 && RHSC > -256) {
+        assert(Ptr->getOpcode() == ISD::ADD);
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        return true;
+      }
+    }
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Offset = Ptr->getOperand(1);
+    return true;
+  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
+    // AddressingMode 2
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC < 0 && RHSC > -0x1000) {
+        assert(Ptr->getOpcode() == ISD::ADD);
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        Base = Ptr->getOperand(0);
+        return true;
+      }
+    }
+
+    if (Ptr->getOpcode() == ISD::ADD) {
+      isInc = true;
+      ARM_AM::ShiftOpc ShOpcVal=
+        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
+      if (ShOpcVal != ARM_AM::no_shift) {
+        Base = Ptr->getOperand(1);
+        Offset = Ptr->getOperand(0);
+      } else {
+        Base = Ptr->getOperand(0);
+        Offset = Ptr->getOperand(1);
+      }
+      return true;
+    }
+
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    return true;
+  }
+
+  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
+  return false;
+}
+
+static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
+                                     bool isSEXTLoad, SDValue &Base,
+                                     SDValue &Offset, bool &isInc,
+                                     SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Ptr->getOperand(0);
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
+      assert(Ptr->getOpcode() == ISD::ADD);
+      isInc = false;
+      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+      return true;
+    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
+      isInc = Ptr->getOpcode() == ISD::ADD;
+      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool
+ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                             SDValue &Offset,
+                                             ISD::MemIndexedMode &AM,
+                                             SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                       Offset, isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                        Offset, isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                   SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                       isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  if (Ptr != Base) {
+    // Swap base ptr and offset to catch more post-index load / store when
+    // it's legal. In Thumb2 mode, offset must be an immediate.
+    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
+        !Subtarget->isThumb2())
+      std::swap(Base, Offset);
+
+    // Post-indexed load / store update the base pointer.
+    if (Ptr != Base)
+      return false;
+  }
+
+  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  unsigned BitWidth = KnownOne.getBitWidth();
+  KnownZero = KnownOne = APInt(BitWidth, 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case ARMISD::ADDC:
+  case ARMISD::ADDE:
+  case ARMISD::SUBC:
+  case ARMISD::SUBE:
+    // These nodes' second result is a boolean
+    if (Op.getResNo() == 0)
+      break;
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
+  case ARMISD::CMOV: {
+    // Bits are known zero/one if known on the LHS and RHS.
+    DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    if (KnownZero == 0 && KnownOne == 0) return;
+
+    APInt KnownZeroRHS, KnownOneRHS;
+    DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
+    KnownZero &= KnownZeroRHS;
+    KnownOne  &= KnownOneRHS;
+    return;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::arm_ldaex:
+    case Intrinsic::arm_ldrex: {
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  // Looking for "rev" which is V6+.
+  if (!Subtarget->hasV6Ops())
+    return false;
+
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::string AsmStr = IA->getAsmString();
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    AsmStr = AsmPieces[0];
+    AsmPieces.clear();
+    SplitString(AsmStr, AsmPieces, " \t,");
+
+    // rev $0, $1
+    if (AsmPieces.size() == 3 &&
+        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
+        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
+      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      if (Ty && Ty->getBitWidth() == 32)
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  }
+
+  return false;
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'l': return C_RegisterClass;
+    case 'w': return C_RegisterClass;
+    case 'h': return C_RegisterClass;
+    case 'x': return C_RegisterClass;
+    case 't': return C_RegisterClass;
+    case 'j': return C_Other; // Constant for movw.
+      // An address with a single base register. Due to the way we
+      // currently handle addresses it is the same as an 'r' memory constraint.
+    case 'Q': return C_Memory;
+    }
+  } else if (Constraint.size() == 2) {
+    switch (Constraint[0]) {
+    default: break;
+    // All 'U+' constraints are addresses.
+    case 'U': return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ARMTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'l':
+    if (type->isIntegerTy()) {
+      if (Subtarget->isThumb())
+        weight = CW_SpecificReg;
+      else
+        weight = CW_Register;
+    }
+    break;
+  case 'w':
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
+typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
+RCPair
+ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC ARM Constraint Letters
+    switch (Constraint[0]) {
+    case 'l': // Low regs or general regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::tGPRRegClass);
+      return RCPair(0U, &ARM::GPRRegClass);
+    case 'h': // High regs or no regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::hGPRRegClass);
+      break;
+    case 'r':
+      return RCPair(0U, &ARM::GPRRegClass);
+    case 'w':
+      if (VT == MVT::Other)
+        break;
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      if (VT.getSizeInBits() == 64)
+        return RCPair(0U, &ARM::DPRRegClass);
+      if (VT.getSizeInBits() == 128)
+        return RCPair(0U, &ARM::QPRRegClass);
+      break;
+    case 'x':
+      if (VT == MVT::Other)
+        break;
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPR_8RegClass);
+      if (VT.getSizeInBits() == 64)
+        return RCPair(0U, &ARM::DPR_8RegClass);
+      if (VT.getSizeInBits() == 128)
+        return RCPair(0U, &ARM::QPR_8RegClass);
+      break;
+    case 't':
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      break;
+    }
+  }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'j':
+  case 'I': case 'J': case 'K': case 'L':
+  case 'M': case 'N': case 'O':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    int64_t CVal64 = C->getSExtValue();
+    int CVal = (int) CVal64;
+    // None of these constraints allow values larger than 32 bits.  Check
+    // that the value fits in an int.
+    if (CVal != CVal64)
+      return;
+
+    switch (ConstraintLetter) {
+      case 'j':
+        // Constant suitable for movw, must be between 0 and
+        // 65535.
+        if (Subtarget->hasV6T2Ops())
+          if (CVal >= 0 && CVal <= 65535)
+            break;
+        return;
+      case 'I':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between 0 and 255, for ADD
+          // immediates.
+          if (CVal >= 0 && CVal <= 255)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getT2SOImmVal(CVal) != -1)
+            break;
+        } else {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getSOImmVal(CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'J':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between -255 and -1, for negated ADD
+          // immediates. This can be used in GCC with an "n" modifier that
+          // prints the negated value, for use with SUB instructions. It is
+          // not useful otherwise but is implemented for compatibility.
+          if (CVal >= -255 && CVal <= -1)
+            break;
+        } else {
+          // This must be a constant between -4095 and 4095. It is not clear
+          // what this constraint is intended for. Implemented for
+          // compatibility with GCC.
+          if (CVal >= -4095 && CVal <= 4095)
+            break;
+        }
+        return;
+
+      case 'K':
+        if (Subtarget->isThumb1Only()) {
+          // A 32-bit value where only one byte has a nonzero value. Exclude
+          // zero to match GCC. This constraint is used by GCC internally for
+          // constants that can be loaded with a move/shift combination.
+          // It is not useful otherwise but is implemented for compatibility.
+          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+            break;
+        } else {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getSOImmVal(~CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'L':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -7 and 7,
+          // for 3-operand ADD/SUB immediate instructions.
+          if (CVal >= -7 && CVal < 7)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
+            break;
+        } else {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getSOImmVal(-CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'M':
+        if (Subtarget->isThumb()) { // FIXME thumb2
+          // This must be a multiple of 4 between 0 and 1020, for
+          // ADD sp + immediate.
+          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
+            break;
+        } else {
+          // A power of two or a constant between 0 and 32.  This is used in
+          // GCC for the shift amount on shifted register operands, but it is
+          // useful in general for any shift amounts.
+          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
+            break;
+        }
+        return;
+
+      case 'N':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between 0 and 31, for shift amounts.
+          if (CVal >= 0 && CVal <= 31)
+            break;
+        }
+        return;
+
+      case 'O':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a multiple of 4 between -508 and 508, for
+          // ADD/SUB sp = sp + immediate.
+          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
+            break;
+        }
+        return;
+    }
+    Result = DAG.getTargetConstant(CVal, Op.getValueType());
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
+  unsigned Opcode = Op->getOpcode();
+  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+      "Invalid opcode for Div/Rem lowering");
+  bool isSigned = (Opcode == ISD::SDIVREM);
+  EVT VT = Op->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+  RTLIB::Libcall LC;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  }
+
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Op->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
+
+  Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
+
+  SDLoc dl(Op);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return CallInfo.first;
+}
+
+SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "unsupported target platform");
+  SDLoc DL(Op);
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+
+  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
+                              DAG.getConstant(2, MVT::i32));
+
+  SDValue Flag;
+  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
+  Flag = Chain.getValue(1);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
+
+  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+  Chain = NewSP.getValue(1);
+
+  SDValue Ops[2] = { NewSP, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+bool
+ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The ARM target isn't yet aware of offsets.
+  return false;
+}
+
+bool ARM::isBitFieldInvertedMask(unsigned v) {
+  if (v == 0xffffffff)
+    return false;
+
+  // there can be 1's on either or both "outsides", all the "inside"
+  // bits must be 0's
+  unsigned TO = CountTrailingOnes_32(v);
+  unsigned LO = CountLeadingOnes_32(v);
+  v = (v >> TO) << TO;
+  v = (v << LO) >> LO;
+  return v == 0;
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (!Subtarget->hasVFP3())
+    return false;
+  if (VT == MVT::f32)
+    return ARM_AM::getFP32Imm(Imm) != -1;
+  if (VT == MVT::f64)
+    return ARM_AM::getFP64Imm(Imm) != -1;
+  return false;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldaex:
+  case Intrinsic::arm_ldrex: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_stlex:
+  case Intrinsic::arm_strex: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_stlexd:
+  case Intrinsic::arm_strexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldaexd:
+  case Intrinsic::arm_ldrexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned Bits = Ty->getPrimitiveSizeInBits();
+  if (Bits == 0 || Bits > 32)
+    return false;
+  return true;
+}
+
+bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+  // Loads and stores less than 64-bits are already atomic; ones above that
+  // are doomed anyway, so defer to the default libcall and blame the OS when
+  // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
+  // anything for those.
+  bool IsMClass = Subtarget->isMClass();
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+    return Size == 64 && !IsMClass;
+  } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass;
+  }
+
+  // For the real atomic operations, we have ldrex/strex up to 32 bits,
+  // and up to 64 bits on the non-M profiles
+  unsigned AtomicLimit = IsMClass ? 32 : 64;
+  return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit;
+}
+
+Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                         AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire =
+      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i32, i32} and we have to recombine them into a
+  // single i64 here.
+  if (ValTy->getPrimitiveSizeInBits() == 64) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
+    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    if (!Subtarget->isLittle())
+      std::swap (Lo, Hi);
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
+  }
+
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
+  Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldrex, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
+}
+
+Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                               Value *Addr,
+                                               AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease =
+      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since the intrinsics must have legal type, the i64 intrinsics take two
+  // parameters: "i32, i32". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
+    Function *Strex = Intrinsic::getDeclaration(M, Int);
+    Type *Int32Ty = Type::getInt32Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
+    if (!Subtarget->isLittle())
+      std::swap (Lo, Hi);
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall3(Strex, Lo, Hi, Addr);
+  }
+
+  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
+  Type *Tys[] = { Addr->getType() };
+  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateCall2(
+      Strex, Builder.CreateZExtOrBitCast(
+                 Val, Strex->getFunctionType()->getParamType(0)),
+      Addr);
+}
+
+enum HABaseType {
+  HA_UNKNOWN = 0,
+  HA_FLOAT,
+  HA_DOUBLE,
+  HA_VECT64,
+  HA_VECT128
+};
+
+static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
+                                   uint64_t &Members) {
+  if (const StructType *ST = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
+      uint64_t SubMembers = 0;
+      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
+        return false;
+      Members += SubMembers;
+    }
+  } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+    uint64_t SubMembers = 0;
+    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
+      return false;
+    Members += SubMembers * AT->getNumElements();
+  } else if (Ty->isFloatTy()) {
+    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
+      return false;
+    Members = 1;
+    Base = HA_FLOAT;
+  } else if (Ty->isDoubleTy()) {
+    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
+      return false;
+    Members = 1;
+    Base = HA_DOUBLE;
+  } else if (const VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    Members = 1;
+    switch (Base) {
+    case HA_FLOAT:
+    case HA_DOUBLE:
+      return false;
+    case HA_VECT64:
+      return VT->getBitWidth() == 64;
+    case HA_VECT128:
+      return VT->getBitWidth() == 128;
+    case HA_UNKNOWN:
+      switch (VT->getBitWidth()) {
+      case 64:
+        Base = HA_VECT64;
+        return true;
+      case 128:
+        Base = HA_VECT128;
+        return true;
+      default:
+        return false;
+      }
+    }
+  }
+
+  return (Members > 0 && Members <= 4);
+}
+
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
+bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  if (getEffectiveCallingConv(CallConv, isVarArg) !=
+      CallingConv::ARM_AAPCS_VFP)
+    return false;
+
+  HABaseType Base = HA_UNKNOWN;
+  uint64_t Members = 0;
+  bool result = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n");
+  return result;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 0000000..1ace0f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,601 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMISELLOWERING_H
+#define ARMISELLOWERING_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include <vector>
+
+namespace llvm {
+  class ARMConstantPoolValue;
+  class ARMSubtarget;
+
+  namespace ARMISD {
+    // ARM Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
+                    // TargetExternalSymbol, and TargetGlobalAddress.
+      WrapperPIC,   // WrapperPIC - A wrapper node for TargetGlobalAddress in
+                    // PIC mode.
+      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
+
+      // Add pseudo op to model memcpy for struct byval.
+      COPY_STRUCT_BYVAL,
+
+      CALL,         // Function call.
+      CALL_PRED,    // Function call that's predicable.
+      CALL_NOLINK,  // Function call with branch not branch-and-link.
+      tCALL,        // Thumb function call.
+      BRCOND,       // Conditional branch.
+      BR_JT,        // Jumptable branch.
+      BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
+      RET_FLAG,     // Return with a flag operand.
+      INTRET_FLAG,  // Interrupt return with an LR-offset and a flag operand.
+
+      PIC_ADD,      // Add with a PC operand and a PIC label.
+
+      CMP,          // ARM compare instructions.
+      CMN,          // ARM CMN instructions.
+      CMPZ,         // ARM compare that sets only Z flag.
+      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
+      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
+      FMSTAT,       // ARM fmstat instruction.
+
+      CMOV,         // ARM conditional move instructions.
+
+      BCC_i64,
+
+      RBIT,         // ARM bitreverse instruction
+
+      FTOSI,        // FP to sint within a FP register.
+      FTOUI,        // FP to uint within a FP register.
+      SITOF,        // sint to FP within a FP register.
+      UITOF,        // uint to FP within a FP register.
+
+      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+      RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
+
+      ADDC,         // Add with carry
+      ADDE,         // Add using carry
+      SUBC,         // Sub with carry
+      SUBE,         // Sub using carry
+
+      VMOVRRD,      // double to two gprs.
+      VMOVDRR,      // Two gprs to double.
+
+      EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
+      EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
+
+      TC_RETURN,    // Tail call return pseudo.
+
+      THREAD_POINTER,
+
+      DYN_ALLOC,    // Dynamic allocation on the stack.
+
+      MEMBARRIER_MCR, // Memory barrier (MCR)
+
+      PRELOAD,      // Preload
+
+      WIN__CHKSTK,  // Windows' __chkstk call to do stack probing.
+
+      VCEQ,         // Vector compare equal.
+      VCEQZ,        // Vector compare equal to zero.
+      VCGE,         // Vector compare greater than or equal.
+      VCGEZ,        // Vector compare greater than or equal to zero.
+      VCLEZ,        // Vector compare less than or equal to zero.
+      VCGEU,        // Vector compare unsigned greater than or equal.
+      VCGT,         // Vector compare greater than.
+      VCGTZ,        // Vector compare greater than zero.
+      VCLTZ,        // Vector compare less than zero.
+      VCGTU,        // Vector compare unsigned greater than.
+      VTST,         // Vector test bits.
+
+      // Vector shift by immediate:
+      VSHL,         // ...left
+      VSHRs,        // ...right (signed)
+      VSHRu,        // ...right (unsigned)
+
+      // Vector rounding shift by immediate:
+      VRSHRs,       // ...right (signed)
+      VRSHRu,       // ...right (unsigned)
+      VRSHRN,       // ...right narrow
+
+      // Vector saturating shift by immediate:
+      VQSHLs,       // ...left (signed)
+      VQSHLu,       // ...left (unsigned)
+      VQSHLsu,      // ...left (signed to unsigned)
+      VQSHRNs,      // ...right narrow (signed)
+      VQSHRNu,      // ...right narrow (unsigned)
+      VQSHRNsu,     // ...right narrow (signed to unsigned)
+
+      // Vector saturating rounding shift by immediate:
+      VQRSHRNs,     // ...right narrow (signed)
+      VQRSHRNu,     // ...right narrow (unsigned)
+      VQRSHRNsu,    // ...right narrow (signed to unsigned)
+
+      // Vector shift and insert:
+      VSLI,         // ...left
+      VSRI,         // ...right
+
+      // Vector get lane (VMOV scalar to ARM core register)
+      // (These are used for 8- and 16-bit element types only.)
+      VGETLANEu,    // zero-extend vector extract element
+      VGETLANEs,    // sign-extend vector extract element
+
+      // Vector move immediate and move negated immediate:
+      VMOVIMM,
+      VMVNIMM,
+
+      // Vector move f32 immediate:
+      VMOVFPIMM,
+
+      // Vector duplicate:
+      VDUP,
+      VDUPLANE,
+
+      // Vector shuffles:
+      VEXT,         // extract
+      VREV64,       // reverse elements within 64-bit doublewords
+      VREV32,       // reverse elements within 32-bit words
+      VREV16,       // reverse elements within 16-bit halfwords
+      VZIP,         // zip (interleave)
+      VUZP,         // unzip (deinterleave)
+      VTRN,         // transpose
+      VTBL1,        // 1-register shuffle with mask
+      VTBL2,        // 2-register shuffle with mask
+
+      // Vector multiply long:
+      VMULLs,       // ...signed
+      VMULLu,       // ...unsigned
+
+      UMLAL,        // 64bit Unsigned Accumulate Multiply
+      SMLAL,        // 64bit Signed Accumulate Multiply
+
+      // Operands of the standard BUILD_VECTOR node are not legalized, which
+      // is fine if BUILD_VECTORs are always lowered to shuffles or other
+      // operations, but for ARM some BUILD_VECTORs are legal as-is and their
+      // operands need to be legalized.  Define an ARM-specific version of
+      // BUILD_VECTOR for this purpose.
+      BUILD_VECTOR,
+
+      // Floating-point max and min:
+      FMAX,
+      FMIN,
+      VMAXNM,
+      VMINNM,
+
+      // Bit-field insert
+      BFI,
+
+      // Vector OR with immediate
+      VORRIMM,
+      // Vector AND with NOT of immediate
+      VBICIMM,
+
+      // Vector bitwise select
+      VBSL,
+
+      // Vector load N-element structure to all lanes:
+      VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD3DUP,
+      VLD4DUP,
+
+      // NEON loads with post-increment base updates:
+      VLD1_UPD,
+      VLD2_UPD,
+      VLD3_UPD,
+      VLD4_UPD,
+      VLD2LN_UPD,
+      VLD3LN_UPD,
+      VLD4LN_UPD,
+      VLD2DUP_UPD,
+      VLD3DUP_UPD,
+      VLD4DUP_UPD,
+
+      // NEON stores with post-increment base updates:
+      VST1_UPD,
+      VST2_UPD,
+      VST3_UPD,
+      VST4_UPD,
+      VST2LN_UPD,
+      VST3LN_UPD,
+      VST4LN_UPD
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace ARM {
+    bool isBitFieldInvertedMask(unsigned v);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface
+
+  class ARMTargetLowering : public TargetLowering {
+  public:
+    explicit ARMTargetLowering(TargetMachine &TM);
+
+    unsigned getJumpTableEncoding() const override;
+
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    bool isSelectSupported(SelectSupportKind Kind) const override {
+      // ARM does not support scalar condition selects on vectors.
+      return (Kind != ScalarCondVectorVal);
+    }
+
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+    MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const override;
+
+    void AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                       SDNode *Node) const override;
+
+    SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type. Returns whether it
+    /// is "fast" by reference in the second argument.
+    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                       bool *Fast) const override;
+
+    EVT getOptimalMemOpType(uint64_t Size,
+                            unsigned DstAlign, unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset,
+                            bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    using TargetLowering::isZExtFree;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+                                   ISD::MemIndexedMode &AM,
+                                   SelectionDAG &DAG) const override;
+
+    /// getPostIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if this node can be
+    /// combined with a load / store to form a post-indexed load / store.
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+                                    SDValue &Offset, ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
+
+    void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth) const override;
+
+
+    bool ExpandInlineAsm(CallInst *CI) const override;
+
+    ConstraintType
+      getConstraintType(const std::string &Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const override;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT VT) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    const ARMSubtarget* getSubtarget() const {
+      return Subtarget;
+    }
+
+    /// getRegClassFor - Return the register class that should be used for the
+    /// specified value type.
+    const TargetRegisterClass *getRegClassFor(MVT VT) const override;
+
+    /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+    /// be used for loads / stores from the global.
+    unsigned getMaximalGlobalOffset() const override;
+
+    /// Returns true if a cast between SrcAS and DestAS is a noop.
+    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+      // Addrspacecasts are always noops.
+      return true;
+    }
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
+
+    Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+    bool
+    isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                            const CallInst &I,
+                            unsigned Intrinsic) const override;
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+        Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
+
+    Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                          AtomicOrdering Ord) const override;
+    Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                Value *Addr, AtomicOrdering Ord) const override;
+
+    bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+
+  protected:
+    std::pair<const TargetRegisterClass*, uint8_t>
+    findRepresentativeClass(MVT VT) const override;
+
+  private:
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    const TargetRegisterInfo *RegInfo;
+
+    const InstrItineraryData *Itins;
+
+    /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
+    ///
+    unsigned ARMPCLabelIndex;
+
+    void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+    void addDRTypeForNEON(MVT VT);
+    void addQRTypeForNEON(MVT VT);
+    std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
+
+    typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+    void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
+                          SDValue Chain, SDValue &Arg,
+                          RegsToPassVector &RegsToPass,
+                          CCValAssign &VA, CCValAssign &NextVA,
+                          SDValue &StackPtr,
+                          SmallVectorImpl<SDValue> &MemOpChains,
+                          ISD::ArgFlagsTy Flags) const;
+    SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                 SDValue &Root, SelectionDAG &DAG,
+                                 SDLoc dl) const;
+
+    CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
+                                            bool isVarArg) const;
+    CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
+                                  bool isVarArg) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             SDLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
+    SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG) const;
+    SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                 SelectionDAG &DAG,
+                                 TLSModel::Model model) const;
+    SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                            const ARMSubtarget *ST) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                              const ARMSubtarget *ST) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    ///
+    /// ARM supports both fused and unfused multiply-add operations; we already
+    /// lower a pair of fmul and fadd to the latter so it's not clear that there
+    /// would be a gain or that the gain would be worthwhile enough to risk
+    /// correctness bugs.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+
+    SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            SDLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            bool isThisReturn, SDValue ThisVal) const;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+
+    int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+                       SDLoc dl, SDValue &Chain,
+                       const Value *OrigArg,
+                       unsigned InRegsParamRecordIdx,
+                       unsigned OffsetFromOrigArg,
+                       unsigned ArgOffset,
+                       unsigned ArgSize,
+                       bool ForceMutable,
+                       unsigned ByValStoreOffset,
+                       unsigned TotalArgRegsSaveSize) const;
+
+    void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                              SDLoc dl, SDValue &Chain,
+                              unsigned ArgOffset,
+                              unsigned TotalArgRegsSaveSize,
+                              bool ForceMutable = false) const;
+
+    void computeRegArea(CCState &CCInfo, MachineFunction &MF,
+                        unsigned InRegsParamRecordIdx,
+                        unsigned ArgSize,
+                        unsigned &ArgRegsSize,
+                        unsigned &ArgRegsSaveSize) const;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    /// HandleByVal - Target-specific cleanup for ByVal support.
+    void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+
+    bool CanLowerReturn(CallingConv::ID CallConv,
+                        MachineFunction &MF, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  SDLoc dl, SelectionDAG &DAG) const override;
+
+    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+    bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+    SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                      SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
+    SDValue getVFPCmp(SDValue LHS, SDValue RHS,
+                      SelectionDAG &DAG, SDLoc dl) const;
+    SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
+
+    SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
+
+    void SetupEntryBlockForSjLj(MachineInstr *MI,
+                                MachineBasicBlock *MBB,
+                                MachineBasicBlock *DispatchBB, int FI) const;
+
+    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr *MI,
+                                             MachineBasicBlock *MBB) const;
+
+    bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitStructByval(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const;
+  };
+
+  enum NEONModImmType {
+    VMOVModImm,
+    VMVNModImm,
+    OtherModImm
+  };
+
+  namespace ARM {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif  // ARMISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
new file mode 100644
index 0000000..59e9260
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -0,0 +1,2357 @@
+//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// ARM Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo        : Format<0>;
+def MulFrm        : Format<1>;
+def BrFrm         : Format<2>;
+def BrMiscFrm     : Format<3>;
+
+def DPFrm         : Format<4>;
+def DPSoRegRegFrm    : Format<5>;
+
+def LdFrm         : Format<6>;
+def StFrm         : Format<7>;
+def LdMiscFrm     : Format<8>;
+def StMiscFrm     : Format<9>;
+def LdStMulFrm    : Format<10>;
+
+def LdStExFrm     : Format<11>;
+
+def ArithMiscFrm  : Format<12>;
+def SatFrm        : Format<13>;
+def ExtFrm        : Format<14>;
+
+def VFPUnaryFrm   : Format<15>;
+def VFPBinaryFrm  : Format<16>;
+def VFPConv1Frm   : Format<17>;
+def VFPConv2Frm   : Format<18>;
+def VFPConv3Frm   : Format<19>;
+def VFPConv4Frm   : Format<20>;
+def VFPConv5Frm   : Format<21>;
+def VFPLdStFrm    : Format<22>;
+def VFPLdStMulFrm : Format<23>;
+def VFPMiscFrm    : Format<24>;
+
+def ThumbFrm      : Format<25>;
+def MiscFrm       : Format<26>;
+
+def NGetLnFrm     : Format<27>;
+def NSetLnFrm     : Format<28>;
+def NDupFrm       : Format<29>;
+def NLdStFrm      : Format<30>;
+def N1RegModImmFrm: Format<31>;
+def N2RegFrm      : Format<32>;
+def NVCVTFrm      : Format<33>;
+def NVDupLnFrm    : Format<34>;
+def N2RegVShLFrm  : Format<35>;
+def N2RegVShRFrm  : Format<36>;
+def N3RegFrm      : Format<37>;
+def N3RegVShFrm   : Format<38>;
+def NVExtFrm      : Format<39>;
+def NVMulSLFrm    : Format<40>;
+def NVTBLFrm      : Format<41>;
+def DPSoRegImmFrm  : Format<42>;
+
+// Misc flags.
+
+// The instruction has an Rn register operand.
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have a Rn operand.
+class UnaryDP    { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags.  These need to match ARMBaseInstrInfo.h.
+//
+
+// FIXME: Once the JIT is MC-ized, these can go away.
+// Addressing mode.
+class AddrMode<bits<5> val> {
+  bits<5> Value = val;
+}
+def AddrModeNone    : AddrMode<0>;
+def AddrMode1       : AddrMode<1>;
+def AddrMode2       : AddrMode<2>;
+def AddrMode3       : AddrMode<3>;
+def AddrMode4       : AddrMode<4>;
+def AddrMode5       : AddrMode<5>;
+def AddrMode6       : AddrMode<6>;
+def AddrModeT1_1    : AddrMode<7>;
+def AddrModeT1_2    : AddrMode<8>;
+def AddrModeT1_4    : AddrMode<9>;
+def AddrModeT1_s    : AddrMode<10>;
+def AddrModeT2_i12  : AddrMode<11>;
+def AddrModeT2_i8   : AddrMode<12>;
+def AddrModeT2_so   : AddrMode<13>;
+def AddrModeT2_pc   : AddrMode<14>;
+def AddrModeT2_i8s4 : AddrMode<15>;
+def AddrMode_i12    : AddrMode<16>;
+
+// Load / store index mode.
+class IndexMode<bits<2> val> {
+  bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre  : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+def IndexModeUpd  : IndexMode<3>;
+
+// Instruction execution domain.
+class Domain<bits<3> val> {
+  bits<3> Value = val;
+}
+def GenericDomain : Domain<0>;
+def VFPDomain     : Domain<1>; // Instructions in VFP domain only
+def NeonDomain    : Domain<2>; // Instructions in Neon domain only
+def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
+
+//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM imod and iflag operands, used only by the CPS instruction.
+def imod_op : Operand<i32> {
+  let PrintMethod = "printCPSIMod";
+}
+
+def ProcIFlagsOperand : AsmOperandClass {
+  let Name = "ProcIFlags";
+  let ParserMethod = "parseProcIFlagsOperand";
+}
+def iflags_op : Operand<i32> {
+  let PrintMethod = "printCPSIFlag";
+  let ParserMatchClass = ProcIFlagsOperand;
+}
+
+// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
+// register whose default is 0 (no register).
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
+                                     (ops (i32 14), (i32 zero_reg))> {
+  let PrintMethod = "printPredicateOperand";
+  let ParserMatchClass = CondCodeOperand;
+  let DecoderMethod = "DecodePredicateOperand";
+}
+
+// Selectable predicate operand for CMOV instructions. We can't use a normal
+// predicate because the default values interfere with instruction selection. In
+// all other respects it is identical though: pseudo-instruction expansion
+// relies on the MachineOperands being compatible.
+def cmovpred : Operand<i32>, PredicateOp,
+               ComplexPattern<i32, 2, "SelectCMOVPred"> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+  let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+  let EncoderMethod = "getCCOutOpValue";
+  let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+  let EncoderMethod = "getCCOutOpValue";
+  let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// ARM special operands for disassembly only.
+//
+def SetEndAsmOperand : ImmAsmOperand {
+  let Name = "SetEndImm";
+  let ParserMethod = "parseSetEndImm";
+}
+def setend_op : Operand<i32> {
+  let PrintMethod = "printSetendOperand";
+  let ParserMatchClass = SetEndAsmOperand;
+}
+
+def MSRMaskOperand : AsmOperandClass {
+  let Name = "MSRMask";
+  let ParserMethod = "parseMSRMaskOperand";
+}
+def msr_mask : Operand<i32> {
+  let PrintMethod = "printMSRMaskOperand";
+  let DecoderMethod = "DecodeMSRMask";
+  let ParserMatchClass = MSRMaskOperand;
+}
+
+// Shift Right Immediate - A shift right immediate is encoded differently from
+// other shift immediates. The imm6 field is encoded like so:
+//
+//    Offset    Encoding
+//     8        imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0>
+//     16       imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
+//     32       imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
+//     64       64 - <imm> is encoded in imm6<5:0>
+def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8  : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
+  let EncoderMethod = "getShiftRight8Imm";
+  let DecoderMethod = "DecodeShiftRight8Imm";
+  let ParserMatchClass = shr_imm8_asm_operand;
+}
+def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
+  let EncoderMethod = "getShiftRight16Imm";
+  let DecoderMethod = "DecodeShiftRight16Imm";
+  let ParserMatchClass = shr_imm16_asm_operand;
+}
+def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
+  let EncoderMethod = "getShiftRight32Imm";
+  let DecoderMethod = "DecodeShiftRight32Imm";
+  let ParserMatchClass = shr_imm32_asm_operand;
+}
+def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
+  let EncoderMethod = "getShiftRight64Imm";
+  let DecoderMethod = "DecodeShiftRight64Imm";
+  let ParserMatchClass = shr_imm64_asm_operand;
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Assembler alias templates.
+//
+class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>;
+class  tInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>;
+class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
+class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasNEON]>;
+
+
+class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
+          Requires<[HasVFP2]>;
+class NEONMnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
+          Requires<[HasNEON]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction templates.
+//
+
+
+class InstTemplate<AddrMode am, int sz, IndexMode im,
+                   Format f, Domain d, string cstr, InstrItinClass itin>
+  : Instruction {
+  let Namespace = "ARM";
+
+  AddrMode AM = am;
+  int Size = sz;
+  IndexMode IM = im;
+  bits<2> IndexModeBits = IM.Value;
+  Format F = f;
+  bits<6> Form = F.Value;
+  Domain D = d;
+  bit isUnaryDataProc = 0;
+  bit canXformTo16Bit = 0;
+  // The instruction is a 16-bit flag setting Thumb instruction. Used
+  // by the parser to determine whether to require the 'S' suffix on the
+  // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+  bit thumbArithFlagSetting = 0;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  // The layout of TSFlags should be kept in sync with ARMBaseInfo.h.
+  let TSFlags{4-0}   = AM.Value;
+  let TSFlags{6-5}   = IndexModeBits;
+  let TSFlags{12-7} = Form;
+  let TSFlags{13}    = isUnaryDataProc;
+  let TSFlags{14}    = canXformTo16Bit;
+  let TSFlags{17-15} = D.Value;
+  let TSFlags{18}    = thumbArithFlagSetting;
+
+  let Constraints = cstr;
+  let Itinerary = itin;
+}
+
+class Encoding {
+  field bits<32> Inst;
+  // Mask of bits that cause an encoding to be UNPREDICTABLE.
+  // If a bit is set, then if the corresponding bit in the
+  // target encoding differs from its value in the "Inst" field,
+  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+  field bits<32> Unpredictable = 0;
+  // SoftFail is the generic name for this field, but we alias it so
+  // as to make it more obvious what it means in ARM-land.
+  field bits<32> SoftFail = Unpredictable;
+}
+
+class InstARM<AddrMode am, int sz, IndexMode im,
+              Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding {
+  let DecoderNamespace = "ARM";
+}
+
+// This Encoding-less class is used by Thumb1 to specify the encoding bits later
+// on by adding flavors to specific instructions.
+class InstThumb<AddrMode am, int sz, IndexMode im,
+                Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin> {
+  let DecoderNamespace = "Thumb";
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
+  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "", NoItinerary> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let Pattern = [];
+  let isCodeGenOnly = 0; // So we get asm matcher for it.
+  let AsmString = asm;
+  let isPseudo = 1;
+}
+
+class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[IsARM]>;
+class tAsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb]>;
+class t2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb2]>;
+class VFP2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[HasVFP2]>;
+class NEONAsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[HasNEON]>;
+
+// Pseudo instructions for the code generator.
+class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
+  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo,
+                 GenericDomain, "", itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let Pattern = pattern;
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// PseudoInst that's ARM-mode only.
+class ARMPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// PseudoInst that's Thumb-mode only.
+class tPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// PseudoInst that's Thumb2-mode only.
+class t2PseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+class ARMPseudoExpand<dag oops, dag iops, int sz,
+                      InstrItinClass itin, list<dag> pattern,
+                      dag Result>
+  : ARMPseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+class tPseudoExpand<dag oops, dag iops, int sz,
+                    InstrItinClass itin, list<dag> pattern,
+                    dag Result>
+  : tPseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+class t2PseudoExpand<dag oops, dag iops, int sz,
+                    InstrItinClass itin, list<dag> pattern,
+                    dag Result>
+  : t2PseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+// Almost all ARM instructions are predicable.
+class I<dag oops, dag iops, AddrMode am, int sz,
+        IndexMode im, Format f, InstrItinClass itin,
+        string opc, string asm, string cstr,
+        list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// A few are not predicable
+class InoP<dag oops, dag iops, AddrMode am, int sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr,
+           list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, asm);
+  let Pattern = pattern;
+  let isPredicable = 0;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Same as I except it can optionally modify CPSR. Note it's modeled as an input
+// operand since by default it's a zero register. It will become an implicit def
+// once it's "flipped".
+class sI<dag oops, dag iops, AddrMode am, int sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string opc, string asm, string cstr,
+         list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p; // Predicate operand
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{31-28} = p;
+  let Inst{20} = s;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Special cases
+class XI<dag oops, dag iops, AddrMode am, int sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+class AI<dag oops, dag iops, Format f, InstrItinClass itin,
+         string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+       opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+       asm, "", pattern>;
+class AXIM<dag oops, dag iops, AddrMode am, Format f, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, am, 4, IndexModeNone, f, itin,
+       asm, "", pattern>;
+class AInoP<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+         opc, asm, "", pattern>;
+
+// Ctrl flow instructions
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+       asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+
+// BR_JT instructions
+class JTI<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
+       asm, "", pattern>;
+
+class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = 1;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-0}   = 0b10011111;
+}
+class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = 0;
+  let Inst{19-16} = addr;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-4}   = 0b1001;
+  let Inst{3-0}   = Rt;
+}
+// Atomic load/store instructions
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>;
+
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
+// Exclusive load/store instructions
+
+class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]>;
+
+class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
+class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
+  : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  let Inst{27-23} = 0b00010;
+  let Inst{22} = b;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4} = 0b00001001;
+  let Inst{3-0} = Rt2;
+
+  let Unpredictable{11-8} = 0b1111;
+  let DecoderMethod = "DecodeSwap";
+}
+// Acquire/Release load/store instructions
+class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]>;
+
+class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]> {
+  let Inst{15-12}   = 0b1111;
+}
+
+// addrmode1 instructions
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+       opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+
+// loads
+
+// LDR/LDRB/STR/STRB/...
+class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am,
+             Format f, InstrItinClass itin, string opc, string asm,
+             list<dag> pattern>
+  : I<oops, iops, am, 4, IndexModeNone, f, itin, opc, asm,
+      "", pattern> {
+  let Inst{27-25} = op;
+  let Inst{24} = 1;  // 24 == P
+  // 23 == U
+  let Inst{22} = isByte;
+  let Inst{21} = 0;  // 21 == W
+  let Inst{20} = isLd;
+}
+// Indexed load/stores
+class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-26} = 0b01;
+  let Inst{24}    = isPre; // P bit
+  let Inst{22}    = isByte; // B bit
+  let Inst{21}    = isPre; // W bit
+  let Inst{20}    = isLd; // L bit
+  let Inst{15-12} = Rt;
+}
+class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+}
+
+class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = offset{11-0};
+}
+
+
+// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB
+// but for now use this class for STRT and STRBT.
+class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {17-14}  Rn
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<18> addr;
+  let Inst{25} = addr{13};
+  let Inst{23} = addr{12};
+  let Inst{19-16} = addr{17-14};
+  let Inst{11-0} = addr{11-0};
+}
+
+// addrmode3 instructions
+class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = isPre;        // P bit
+  let Inst{21}    = isPre;        // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+
+// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
+// but for now use this class for LDRSBT, LDRHT, LDSHT.
+class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops,
+                  IndexMode im, Format f, InstrItinClass itin, string opc,
+                  string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 0;            // P bit
+  let Inst{21}    = 1;
+  let Inst{20}    = isLoad;       // L bit
+  let Inst{19-16} = addr;         // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+
+// stores
+class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = 0;            // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+// addrmode4 instructions
+class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, 4, im, f, itin, asm, cstr, pattern> {
+  bits<4>  p;
+  bits<16> regs;
+  bits<4>  Rn;
+  let Inst{31-28} = p;
+  let Inst{27-25} = 0b100;
+  let Inst{22}    = 0; // S bit
+  let Inst{19-16} = Rn;
+  let Inst{15-0}  = regs;
+}
+
+// Unsigned multiply, multiply-accumulate instructions.
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{20}    = 0; // S bit
+  let Inst{27-21} = opcod;
+}
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+       opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{27-21} = opcod;
+}
+
+// Most significant word multiply
+class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+             InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{7-4}   = opc7_4;
+  let Inst{20}    = 1;
+  let Inst{27-21} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+// MSW multiple w/ Ra operand
+class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
+class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{4}     = 0;
+  let Inst{7}     = 1;
+  let Inst{20}    = 0;
+  let Inst{27-21} = opcod;
+  let Inst{6-5}   = bit6_5;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+}
+
+// AMulxyI with Ra operand
+class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+// SMLAL*
+class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+// Extend instructions.
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ExtFrm, itin,
+      opc, asm, "", pattern> {
+  // All AExtI instructions have Rd and Rm register operands.
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{7-4}   = 0b0111;
+  let Inst{9-8}   = 0b00;
+  let Inst{27-20} = opcod;
+
+  let Unpredictable{9-8} = 0b11;
+}
+
+// Misc Arithmetic instructions.
+class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+// Division instructions.
+class ADivA1I<bits<3> opcod, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-23} = 0b01110;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{11-8}  = Rm;
+  let Inst{7-4}   = 0b0001;
+  let Inst{3-0}   = Rn;
+}
+
+// PKH instructions
+def PKHLSLAsmOperand : ImmAsmOperand {
+  let Name = "PKHLSLImm";
+  let ParserMethod = "parsePKHLSLImm";
+}
+def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{
+  let PrintMethod = "printPKHLSLShiftImm";
+  let ParserMatchClass = PKHLSLAsmOperand;
+}
+def PKHASRAsmOperand : AsmOperandClass {
+  let Name = "PKHASRImm";
+  let ParserMethod = "parsePKHASRImm";
+}
+def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{
+  let PrintMethod = "printPKHASRShiftImm";
+  let ParserMatchClass = PKHASRAsmOperand;
+}
+
+class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<5> sh;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = sh;
+  let Inst{6}     = tb;
+  let Inst{5-4}   = 0b01;
+  let Inst{3-0}   = Rm;
+}
+
+//===----------------------------------------------------------------------===//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM];
+}
+class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5T];
+}
+class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE];
+}
+// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps.
+class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps];
+}
+class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV6];
+}
+
+//===----------------------------------------------------------------------===//
+// Thumb Instruction Format Definitions.
+//
+
+class ThumbI<dag oops, dag iops, AddrMode am, int sz,
+             InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// TI - Thumb instruction.
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+
+// Two-address instructions
+class TIt<dag oops, dag iops, InstrItinClass itin, string asm,
+          list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "$lhs = $dst",
+           pattern>;
+
+// tBL, tBX 32-bit instructions
+class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3,
+           dag oops, dag iops, InstrItinClass itin, string asm,
+           list<dag> pattern>
+    : ThumbI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>,
+      Encoding {
+  let Inst{31-27} = opcod1;
+  let Inst{15-14} = opcod2;
+  let Inst{12}    = opcod3;
+}
+
+// BR_JT instructions
+class TJTI<dag oops, dag iops, InstrItinClass itin, string asm,
+           list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Thumb1 only
+class Thumb1I<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+class T1I<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+class T1Ix2<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+
+// Two-address instructions
+class T1It<dag oops, dag iops, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 2, itin,
+            asm, cstr, pattern>;
+
+// Thumb1 instruction that can either be predicated or set CPSR.
+class Thumb1sI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = !con(oops, (outs s_cc_out:$s));
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  let thumbArithFlagSetting = 1;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "ThumbSBit";
+}
+
+class T1sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1sIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm,
+             "$Rn = $Rdn", pattern>;
+
+// Thumb1 instruction that can be predicated.
+class Thumb1pI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+class T1pI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1pIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm,
+             "$Rn = $Rdn", pattern>;
+
+class T1pIs<dag oops, dag iops,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_s, 2, itin, opc, asm, "", pattern>;
+
+class Encoding16 : Encoding {
+  let Inst{31-16} = 0x0000;
+}
+
+// A6.2 16-bit Thumb instruction encoding
+class T1Encoding<bits<6> opcode> : Encoding16 {
+  let Inst{15-10} = opcode;
+}
+
+// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding.
+class T1General<bits<5> opcode> : Encoding16 {
+  let Inst{15-14} = 0b00;
+  let Inst{13-9} = opcode;
+}
+
+// A6.2.2 Data-processing encoding.
+class T1DataProcessing<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010000;
+  let Inst{9-6} = opcode;
+}
+
+// A6.2.3 Special data instructions and branch and exchange encoding.
+class T1Special<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010001;
+  let Inst{9-6}   = opcode;
+}
+
+// A6.2.4 Load/store single data item encoding.
+class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
+  let Inst{15-12} = opA;
+  let Inst{11-9}  = opB;
+}
+class T1LdStSP<bits<3> opB>   : T1LoadStore<0b1001, opB>; // SP relative
+
+class T1BranchCond<bits<4> opcode> : Encoding16 {
+  let Inst{15-12} = opcode;
+}
+
+// Helper classes to encode Thumb1 loads and stores. For immediates, the
+// following bits are used for "opA" (see A6.2.4):
+//
+//   0b0110 => Immediate, 4 bytes
+//   0b1000 => Immediate, 2 bytes
+//   0b0111 => Immediate, 1 byte
+class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am,
+                     InstrItinClass itin, string opc, string asm,
+                     list<dag> pattern>
+  : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+    T1LoadStore<0b0101, opcode> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{8-6} = addr{5-3};    // Rm
+  let Inst{5-3} = addr{2-0};    // Rn
+  let Inst{2-0} = Rt;
+}
+class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+    T1LoadStore<opA, {opB,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-6} = addr{7-3};   // imm5
+  let Inst{5-3}  = addr{2-0};   // Rn
+  let Inst{2-0}  = Rt;
+}
+
+// A6.2.5 Miscellaneous 16-bit instructions encoding.
+class T1Misc<bits<7> opcode> : Encoding16 {
+  let Inst{15-12} = 0b1011;
+  let Inst{11-5} = opcode;
+}
+
+// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
+class Thumb2I<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
+// input operand since by default it's a zero register. It will become an
+// implicit def once it's "flipped".
+//
+// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
+// more consistent.
+class Thumb2sI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{20} = s;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+// Special cases
+class Thumb2XI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin,
+              string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "Thumb";
+}
+
+class T2I<dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+class T2Ii12<dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>;
+class T2Ii8<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>;
+class T2Iso<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>;
+class T2Ipc<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>;
+class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<13> addr;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = addr{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = addr{7-0};
+}
+class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm, string cstr,
+                  list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  bits<9> imm;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = imm{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb2sI<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+
+class T2XI<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+class T2JTI<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Move to/from coprocessor instructions
+class T2Cop<bits<4> opc, dag oops, dag iops, string opcstr, string asm,
+            list<dag> pattern>
+  : T2I <oops, iops, NoItinerary, opcstr, asm, pattern>, Requires<[IsThumb2]> {
+  let Inst{31-28} = opc;
+}
+
+// Two-address instructions
+class T2XIt<dag oops, dag iops, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>;
+
+// T2Ipreldst - Thumb2 pre-indexed load / store instructions.
+class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = addr{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// T2Ipostldst - Thumb2 post-indexed load / store instructions.
+class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<4> Rn;
+  bits<9> offset;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = offset{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = offset{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
+class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T];
+}
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode.
+class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasV6T2];
+}
+
+// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
+class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// Almost all VFP instructions are predicable.
+class VFPI<dag oops, dag iops, AddrMode am, int sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Special cases
+class VFPXI<dag oops, dag iops, AddrMode am, int sz,
+            IndexMode im, Format f, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+         opc, asm, "", pattern> {
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+}
+
+// ARM VFP addrmode5 loads and stores
+class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Dd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Dd{4};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Dd{3-0};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Sd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Sd{0};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Sd{4-1};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+// VFP Load / store multiple pseudo instructions.
+class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
+                     list<dag> pattern>
+  : InstARM<AddrMode4, 4, IndexModeNone, Pseudo, VFPNeonDomain,
+            cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Load / store multiple
+
+// Unknown precision
+class AXXI4<dag oops, dag iops, IndexMode im,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStFrm, NoItinerary, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = 0;
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-1}   = regs{7-1};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-8}  = 0b1011;
+  let Inst{0}     = 1;
+}
+
+// Double precision
+class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{12};
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-1}   = regs{7-1};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{0}     = 0;
+}
+
+// Single Precision
+class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{8};
+  let Inst{15-12} = regs{12-9};
+  let Inst{7-0}   = regs{7-0};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+}
+
+// Double precision, unary
+class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Double precision, unary, not-predicated
+class ADuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Double precision, binary
+class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+           dag iops, InstrItinClass itin, string opc, string asm,
+           list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// FP, binary, not predicated
+class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPBinaryFrm, itin,
+          asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1; // double precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Single precision, unary, predicated
+class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Single precision, unary, non-predicated
+class ASuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+             bit opcod5, dag oops, dag iops, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Single precision unary, if no NEON. Same as ASuI except not available if
+// NEON is enabled.
+class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+            string asm, list<dag> pattern>
+  : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm,
+         pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+// Single precision, binary
+class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+           InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+// Single precision, binary, not predicated
+class ASbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPBinaryFrm, itin, asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0; // Single precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+}
+
+// Single precision binary, if no NEON. Same as ASbI except not available if
+// NEON is enabled.
+class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+            dag iops, InstrItinClass itin, string opc, string asm,
+            list<dag> pattern>
+  : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+// VFP conversion instructions
+class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+               dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+               list<dag> pattern>
+  : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = opcod4;
+  let Inst{6}     = 1;
+  let Inst{4}     = 0;
+}
+
+// VFP conversion between floating-point and fixed-point
+class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5,
+                dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+                list<dag> pattern>
+  : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> {
+  bits<5> fbits;
+  // size (fixed-point number): sx == 0 ? 16 : 32
+  let Inst{7} = op5; // sx
+  let Inst{5} = fbits{0};
+  let Inst{3-0} = fbits{4-1};
+}
+
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
+               InstrItinClass itin,
+               string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, f, itin, opc, asm, pattern> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{4}     = 1;
+}
+
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>;
+
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>;
+
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>;
+
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>;
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM NEON Instruction templates.
+//
+
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it does not have a "data type" specifier.
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+             InstrItinClass itin, string opc, string asm, string cstr,
+             list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it is not predicated
+class NeonInp<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+
+  let Inst{31-28} = 0b1111;
+}
+
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm,
+          cstr, pattern> {
+  let Inst{31-24} = 0b11110100;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{7-4}   = op7_4;
+
+  let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+  let DecoderNamespace = "NEONLoadStore";
+
+  bits<5> Vd;
+  bits<6> Rn;
+  bits<4> Rm;
+
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Rn{3-0};
+  let Inst{3-0}   = Rm{3-0};
+}
+
+class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc,
+          dt, asm, cstr, pattern> {
+  bits<3> lane;
+}
+
+class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
+  : InstARM<AddrMode6, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  list<Predicate> Predicates = [HasNEON];
+}
+
+class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr,
+                  list<dag> pattern>
+  : InstARM<AddrModeNone, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+
+class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,
+          pattern> {
+  let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
+}
+
+class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm,
+           cstr, pattern> {
+  let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
+}
+
+// NEON "one register and a modified immediate" format.
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
+               bit op5, bit op4,
+               dag oops, dag iops, InstrItinClass itin,
+               string opc, string dt, string asm, string cstr,
+               list<dag> pattern>
+  : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{23}    = op23;
+  let Inst{21-19} = op21_19;
+  let Inst{11-8}  = op11_8;
+  let Inst{7}     = op7;
+  let Inst{6}     = op6;
+  let Inst{5}     = op5;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<13> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{24}    = SIMM{7};
+  let Inst{18-16} = SIMM{6-4};
+  let Inst{3-0}   = SIMM{3-0};
+  let DecoderMethod = "DecodeNEONModImmInstruction";
+}
+
+// NEON 2 vector register format.
+class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+          bits<5> op11_7, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7}  = op11_7;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// Same as N2V but not predicated.
+class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+            dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
+            string Dt, list<dag> pattern>
+   : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
+             OpcodeStr, Dt, "$Vd, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0}   = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = 0b00111;
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11} = 0;
+  let Inst{10-8} = op10_8;
+  let Inst{7} = op7;
+  let Inst{6} = op6;
+  let Inst{4} = 0;
+
+  let DecoderNamespace = "NEON";
+}
+
+// Same as N2V except it doesn't have a datatype suffix.
+class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+           bits<5> op11_7, bit op6, bit op4,
+           dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7}  = op11_7;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// NEON 2 vector register with immediate.
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24}   = op24;
+  let Inst{23}   = op23;
+  let Inst{11-8} = op11_8;
+  let Inst{7}    = op7;
+  let Inst{6}    = op6;
+  let Inst{4}    = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+  bits<6> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{21-16} = SIMM{5-0};
+}
+
+// NEON 3 vector register format.
+
+class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24}    = op24;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+          dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops,Format f, InstrItinClass itin,
+                string OpcodeStr, string Dt, list<dag> pattern>
+  : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr,
+            Dt, "$Vd, $Vn, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22} = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7} = Vn{4};
+  let Inst{5} = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = op27_23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  bit lane;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = lane;
+}
+
+class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  bits<2> lane;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{2-0}   = Vm{2-0};
+  let Inst{5}     = lane{1};
+  let Inst{3}     = lane{0};
+}
+
+// Same as N3V except it doesn't have a data type suffix.
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+           bit op4,
+           dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> {
+  let Inst{24}    = op24;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// NEON VMOVs between scalar and core registers.
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+               dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string dt, string asm, list<dag> pattern>
+  : InstARM<AddrModeNone, 4, IndexModeNone, f, NeonDomain,
+            "", itin> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{6-5}   = opcod3;
+  let Inst{4}     = 1;
+  // A8.6.303, A8.6.328, A8.6.329
+  let Inst{3-0}   = 0b0000;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+
+  let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+  let DecoderNamespace = "NEONDup";
+
+  bits<5> V;
+  bits<4> R;
+  bits<4> p;
+  bits<4> lane;
+
+  let Inst{31-28} = p{3-0};
+  let Inst{7}     = V{4};
+  let Inst{19-16} = V{3-0};
+  let Inst{15-12} = R{3-0};
+}
+class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin,
+             opc, dt, asm, pattern>;
+
+// Vector Duplicate Lane (from scalar to all elements)
+class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
+                InstrItinClass itin, string opc, string dt, string asm,
+                list<dag> pattern>
+  : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> {
+  let Inst{24-23} = 0b11;
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = op19_16;
+  let Inst{11-7}  = 0b11000;
+  let Inst{6}     = op6;
+  let Inst{4}     = 0;
+
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{22}     = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+}
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP.
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasNEON,UseNEONForFP];
+}
+
+// VFP/NEON Instruction aliases for type suffices.
+class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result> :
+  InstAlias<!strconcat(opc, dt, "\t", asm), Result>, Requires<[HasVFP2]>;
+
+multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result> {
+  def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+}
+
+multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result> {
+  let Predicates = [HasNEON] in {
+  def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+}
+}
+
+// The same alias classes using AsmPseudo instead, for the more complex
+// stuff in NEON that InstAlias can't quite handle.
+// Note that we can't use anonymous defm references here like we can
+// above, as we care about the ultimate instruction enum names generated, unlike
+// for instalias defs.
+class NEONDataTypeAsmPseudoInst<string opc, string dt, string asm, dag iops> :
+  AsmPseudoInst<!strconcat(opc, dt, "\t", asm), iops>, Requires<[HasNEON]>;
+
+// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM.
+def : TokenAlias<".s8", ".i8">;
+def : TokenAlias<".u8", ".i8">;
+def : TokenAlias<".s16", ".i16">;
+def : TokenAlias<".u16", ".i16">;
+def : TokenAlias<".s32", ".i32">;
+def : TokenAlias<".u32", ".i32">;
+def : TokenAlias<".s64", ".i64">;
+def : TokenAlias<".u64", ".i64">;
+
+def : TokenAlias<".i8", ".8">;
+def : TokenAlias<".i16", ".16">;
+def : TokenAlias<".i32", ".32">;
+def : TokenAlias<".i64", ".64">;
+
+def : TokenAlias<".p8", ".8">;
+def : TokenAlias<".p16", ".16">;
+
+def : TokenAlias<".f32", ".32">;
+def : TokenAlias<".f64", ".64">;
+def : TokenAlias<".f", ".f32">;
+def : TokenAlias<".d", ".f64">;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 0000000..f235ac2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,162 @@
+//===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+using namespace llvm;
+
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(STI) {
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  if (hasNOP()) {
+    NopInst.setOpcode(ARM::HINT);
+    NopInst.addOperand(MCOperand::CreateImm(0));
+    NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    NopInst.addOperand(MCOperand::CreateReg(0));
+  } else {
+    NopInst.setOpcode(ARM::MOVr);
+    NopInst.addOperand(MCOperand::CreateReg(ARM::R0));
+    NopInst.addOperand(MCOperand::CreateReg(ARM::R0));
+    NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    NopInst.addOperand(MCOperand::CreateReg(0));
+    NopInst.addOperand(MCOperand::CreateReg(0));
+  }
+}
+
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  switch (Opc) {
+  default: break;
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDR_PRE_REG:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDR_POST_REG:
+    return ARM::LDRi12;
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+    return ARM::LDRH;
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDRB_PRE_REG:
+  case ARM::LDRB_POST_IMM:
+  case ARM::LDRB_POST_REG:
+    return ARM::LDRBi12;
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+    return ARM::LDRSH;
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST:
+    return ARM::LDRSB;
+  case ARM::STR_PRE_IMM:
+  case ARM::STR_PRE_REG:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+    return ARM::STRi12;
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+    return ARM::STRH;
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRB_PRE_REG:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
+    return ARM::STRBi12;
+  }
+
+  return 0;
+}
+
+namespace {
+  /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC
+  /// global base register for ARM ELF.
+  struct ARMCGBR : public MachineFunctionPass {
+    static char ID;
+    ARMCGBR() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      if (AFI->getGlobalBaseReg() == 0)
+        return false;
+
+      const ARMTargetMachine *TM =
+        static_cast<const ARMTargetMachine *>(&MF.getTarget());
+      if (TM->getRelocationModel() != Reloc::PIC_)
+        return false;
+
+      LLVMContext *Context = &MF.getFunction()->getContext();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      unsigned PCAdj = TM->getSubtarget<ARMSubtarget>().isThumb() ? 4 : 8;
+      ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
+          *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj);
+
+      unsigned Align = TM->getDataLayout()
+          ->getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+      unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
+
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      unsigned TempReg =
+          MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
+      unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
+                     ARM::t2LDRpci : ARM::LDRcp;
+      const TargetInstrInfo &TII = *TM->getInstrInfo();
+      MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
+                                        TII.get(Opc), TempReg)
+                                .addConstantPoolIndex(Idx);
+      if (Opc == ARM::LDRcp)
+        MIB.addImm(0);
+      AddDefaultPred(MIB);
+
+      // Fix the GOT address by adding pc.
+      unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+      Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ? ARM::tPICADD
+                                                        : ARM::PICADD;
+      MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg)
+                .addReg(TempReg)
+                .addImm(ARMPCLabelIndex);
+      if (Opc == ARM::PICADD)
+        AddDefaultPred(MIB);
+
+
+      return true;
+    }
+
+    const char *getPassName() const override {
+      return "ARM PIC Global Base Reg Initialization";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char ARMCGBR::ID = 0;
+FunctionPass*
+llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); }
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 0000000..b09958a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,44 @@
+//===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTRUCTIONINFO_H
+#define ARMINSTRUCTIONINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMInstrInfo : public ARMBaseInstrInfo {
+  ARMRegisterInfo RI;
+public:
+  explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 0000000..a02d997
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,5596 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+def SDT_ARMStructByVal : SDTypeProfile<0, 4,
+                                       [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+def SDT_ARMcall    : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMCMov    : SDTypeProfile<1, 3,
+                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                    SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond  : SDTypeProfile<0, 2,
+                                   [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT    : SDTypeProfile<0, 3,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>]>;
+
+def SDT_ARMBr2JT   : SDTypeProfile<0, 4,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
+                                  [SDTCisVT<0, i32>,
+                                   SDTCisVT<1, i32>, SDTCisVT<2, i32>,
+                                   SDTCisVT<3, i32>, SDTCisVT<4, i32>,
+                                   SDTCisVT<5, OtherVT>]>;
+
+def SDT_ARMAnd     : SDTypeProfile<1, 2,
+                                   [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                    SDTCisVT<2, i32>]>;
+
+def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                          SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                                 SDTCisInt<2>]>;
+def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+
+def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
+                                           SDTCisInt<1>]>;
+
+def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                      SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+
+def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+                                        SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
+def ARMUmlal         : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
+def ARMSmlal         : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
+
+// Node definitions.
+def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
+def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
+def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
+                              [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
+                              [SDNPHasChain, SDNPSideEffect,
+                               SDNPOptInGlue, SDNPOutGlue]>;
+def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
+                                SDT_ARMStructByVal,
+                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                 SDNPMayStore, SDNPMayLoad]>;
+
+def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def ARMcall_pred    : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+
+def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMintretflag    : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+                              [SDNPInGlue]>;
+
+def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+                              [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+                              [SDNPHasChain]>;
+def ARMbr2jt         : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+                              [SDNPHasChain]>;
+
+def ARMBcci64        : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
+                              [SDNPHasChain]>;
+
+def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+                              [SDNPOutGlue]>;
+
+def ARMcmn           : SDNode<"ARMISD::CMN", SDT_ARMCmp,
+                              [SDNPOutGlue]>;
+
+def ARMcmpZ          : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
+                              [SDNPOutGlue, SDNPCommutative]>;
+
+def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
+
+def ARMaddc          : SDNode<"ARMISD::ADDC",  SDTBinaryArithWithFlags,
+                              [SDNPCommutative]>;
+def ARMsubc          : SDNode<"ARMISD::SUBC",  SDTBinaryArithWithFlags>;
+def ARMadde          : SDNode<"ARMISD::ADDE",  SDTBinaryArithWithFlagsInOut>;
+def ARMsube          : SDNode<"ARMISD::SUBE",  SDTBinaryArithWithFlagsInOut>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
+                               SDT_ARMEH_SJLJ_Setjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
+                               SDT_ARMEH_SJLJ_Longjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
+
+def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
+                               [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
+
+def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
+
+def ARMvmaxnm        : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
+def ARMvminnm        : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV4T           : Predicate<"Subtarget->hasV4TOps()">,
+                                 AssemblerPredicate<"HasV4TOps", "armv4t">;
+def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
+def HasV5T           : Predicate<"Subtarget->hasV5TOps()">,
+                                 AssemblerPredicate<"HasV5TOps", "armv5t">;
+def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
+                                 AssemblerPredicate<"HasV5TEOps", "armv5te">;
+def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
+                                 AssemblerPredicate<"HasV6Ops", "armv6">;
+def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M           : Predicate<"Subtarget->hasV6MOps()">,
+                                 AssemblerPredicate<"HasV6MOps",
+                                                    "armv6m or armv6t2">;
+def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
+                                 AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
+                                 AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8            : Predicate<"Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8            : Predicate<"!Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
+def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
+                                 AssemblerPredicate<"FeatureVFP2", "VFP2">;
+def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
+                                 AssemblerPredicate<"FeatureVFP3", "VFP3">;
+def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
+                                 AssemblerPredicate<"FeatureVFP4", "VFP4">;
+def HasDPVFP         : Predicate<"!Subtarget->isFPOnlySP()">,
+                                 AssemblerPredicate<"!FeatureVFPOnlySP",
+                                                    "double precision VFP">;
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                                 AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
+def HasFP16          : Predicate<"Subtarget->hasFP16()">,
+                                 AssemblerPredicate<"FeatureFP16","half-float">;
+def HasDivide        : Predicate<"Subtarget->hasDivide()">,
+                                 AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
+def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
+                                 AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
+                                 AssemblerPredicate<"FeatureT2XtPk",
+                                                     "pack/extract">;
+def HasThumb2DSP     : Predicate<"Subtarget->hasThumb2DSP()">,
+                                 AssemblerPredicate<"FeatureDSPThumb2",
+                                                    "thumb2-dsp">;
+def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,
+                                 AssemblerPredicate<"FeatureDB",
+                                                    "data-barriers">;
+def HasMP            : Predicate<"Subtarget->hasMPExtension()">,
+                                 AssemblerPredicate<"FeatureMP",
+                                                    "mp-extensions">;
+def HasTrustZone     : Predicate<"Subtarget->hasTrustZone()">,
+                                 AssemblerPredicate<"FeatureTrustZone",
+                                                    "TrustZone">;
+def HasZCZ           : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def UseNEONForFP     : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb          : Predicate<"Subtarget->isThumb()">,
+                                 AssemblerPredicate<"ModeThumb", "thumb">;
+def IsThumb1Only     : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
+                                 AssemblerPredicate<"ModeThumb,FeatureThumb2",
+                                                    "thumb2">;
+def IsMClass         : Predicate<"Subtarget->isMClass()">,
+                                 AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass      : Predicate<"!Subtarget->isMClass()">,
+                                 AssemblerPredicate<"!FeatureMClass",
+                                                    "!armv*m">;
+def IsARM            : Predicate<"!Subtarget->isThumb()">,
+                                 AssemblerPredicate<"!ModeThumb", "arm-mode">;
+def IsIOS            : Predicate<"Subtarget->isTargetIOS()">;
+def IsNotIOS         : Predicate<"!Subtarget->isTargetIOS()">;
+def IsMachO          : Predicate<"Subtarget->isTargetMachO()">;
+def IsNotMachO       : Predicate<"!Subtarget->isTargetMachO()">;
+def IsNaCl           : Predicate<"Subtarget->isTargetNaCl()">;
+def UseNaClTrap      : Predicate<"Subtarget->useNaClTrap()">,
+                                 AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap  : Predicate<"!Subtarget->useNaClTrap()">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+def UseMovt          : Predicate<"Subtarget->useMovt(*MF)">;
+def DontUseMovt      : Predicate<"!Subtarget->useMovt(*MF)">;
+def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">;
+def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
+
+// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
+// But only select them if more precision in FP computation is allowed.
+// Do not use them for Darwin platforms.
+def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast && "
+                                 " Subtarget->hasVFP4()) && "
+                                 "!Subtarget->isTargetDarwin()">;
+def DontUseFusedMAC  : Predicate<"!(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast &&"
+                                 " Subtarget->hasVFP4()) || "
+                                 "Subtarget->isTargetDarwin()">;
+
+// VGETLNi32 is microcoded on Swift - prefer VMOV.
+def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">;
+
+// VDUP.32 is microcoded on Swift - prefer VMOV.
+def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
+
+// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as
+// this allows more effective execution domain optimization. See
+// setExecutionDomain().
+def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
+
+def IsLE             : Predicate<"getTargetLowering()->isLittleEndian()">;
+def IsBE             : Predicate<"getTargetLowering()->isBigEndian()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+//  ARM specific transformation functions and pattern fragments.
+//
+
+// imm_neg_XFORM - Return the negation of an i32 immediate value.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
+}]>;
+
+// imm_not_XFORM - Return the complement of a i32 immediate value.
+def imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : ImmLeaf<i32, [{
+  return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
+}]>;
+
+def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
+def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return Value && ARM_AM::getSOImmVal(Value) != -1;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = so_imm_neg_asmoperand;
+}
+
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use so_imm.
+def so_imm_not_asmoperand : AsmOperandClass { let Name = "ARMSOImmNot"; }
+def so_imm_not : Operand<i32>, PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
+  }], imm_not_XFORM> {
+  let ParserMatchClass = so_imm_not_asmoperand;
+}
+
+// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+  return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
+}]>;
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def hi16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+  // Returns true if all low 16-bits are 0.
+  return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+}], hi16>;
+
+class BinOpWithFlagFrag<dag res> :
+      PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'xor' node with a single use.
+def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+  return N->hasOneUse();
+}]>;
+
+// An 'fadd' node which checks for single non-hazardous use.
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+// An 'fsub' node which checks for single non-hazardous use.
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// Immediate operands with a shared generic asm render method.
+class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+
+// Branch target.
+// FIXME: rename brtarget to t2_brtarget
+def brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeT2BROperand";
+}
+
+// FIXME: get rid of this one?
+def uncondbrtarget : Operand<OtherVT> {
+  let EncoderMethod = "getUnconditionalBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Branch target for ARM. Handles conditional/unconditional
+def br_target : Operand<OtherVT> {
+  let EncoderMethod = "getARMBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Call target.
+// FIXME: rename bltarget to t2_bl_target?
+def bltarget : Operand<i32> {
+  // Encoded the same as branch targets.
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Call target for ARM. Handles conditional/unconditional
+// FIXME: rename bl_target to t2_bltarget?
+def bl_target : Operand<i32> {
+  let EncoderMethod = "getARMBLTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+def blx_target : Operand<i32> {
+  let EncoderMethod = "getARMBLXTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
+def reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = RegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeRegListOperand";
+}
+
+def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;
+
+def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
+def dpr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = DPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeDPRRegListOperand";
+}
+
+def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
+def spr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = SPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeSPRRegListOperand";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  let PrintMethod = "printCPInstOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+  let PrintMethod = "printPCLabel";
+}
+
+// ADR instruction labels.
+def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
+def adrlabel : Operand<i32> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let ParserMatchClass = AdrLabelAsmOperand;
+  let PrintMethod = "printAdrLabelOperand<0>";
+}
+
+def neon_vcvt_imm32 : Operand<i32> {
+  let EncoderMethod = "getNEONVcvtImm32OpValue";
+  let DecoderMethod = "DecodeVCVTImmOperand";
+}
+
+// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
+def rot_imm_XFORM: SDNodeXForm<imm, [{
+  switch (N->getZExtValue()){
+  default: llvm_unreachable(nullptr);
+  case 0:  return CurDAG->getTargetConstant(0, MVT::i32);
+  case 8:  return CurDAG->getTargetConstant(1, MVT::i32);
+  case 16: return CurDAG->getTargetConstant(2, MVT::i32);
+  case 24: return CurDAG->getTargetConstant(3, MVT::i32);
+  }
+}]>;
+def RotImmAsmOperand : AsmOperandClass {
+  let Name = "RotImm";
+  let ParserMethod = "parseRotImm";
+}
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+    int32_t v = N->getZExtValue();
+    return v == 8 || v == 16 || v == 24; }],
+    rot_imm_XFORM> {
+  let PrintMethod = "printRotImmOperand";
+  let ParserMatchClass = RotImmAsmOperand;
+}
+
+// shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 encoded as imm5 == 0.
+def ShifterImmAsmOperand : AsmOperandClass {
+  let Name = "ShifterImm";
+  let ParserMethod = "parseShifterImm";
+}
+def shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+}
+
+// shifter_operand operands: so_reg_reg, so_reg_imm, and so_imm.
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>,  // reg reg imm
+                 ComplexPattern<i32, 3, "SelectRegShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
+}
+
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>, // reg imm
+                 ComplexPattern<i32, 2, "SelectImmShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_imm : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+
+// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits.
+def SOImmAsmOperand: ImmAsmOperand { let Name = "ARMSOImm"; }
+def so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
+  let EncoderMethod = "getSOImmOpValue";
+  let ParserMatchClass = SOImmAsmOperand;
+  let DecoderMethod = "DecodeSOImmOperand";
+}
+
+// Break so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses so_imm2part to match and so_imm2part_[12] to
+// get the first/second pieces.
+def so_imm2part : PatLeaf<(imm), [{
+      return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
+
+/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true.
+///
+def arm_i32imm : PatLeaf<(imm), [{
+  if (Subtarget->useMovt(*MF))
+    return true;
+  return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
+
+/// imm0_1 predicate - Immediate in the range [0,1].
+def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
+
+/// imm0_3 predicate - Immediate in the range [0,3].
+def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
+
+/// imm0_7 predicate - Immediate in the range [0,7].
+def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 8;
+}]> {
+  let ParserMatchClass = Imm0_7AsmOperand;
+}
+
+/// imm8 predicate - Immediate is exactly 8.
+def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
+  let ParserMatchClass = Imm8AsmOperand;
+}
+
+/// imm16 predicate - Immediate is exactly 16.
+def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
+  let ParserMatchClass = Imm16AsmOperand;
+}
+
+/// imm32 predicate - Immediate is exactly 32.
+def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
+  let ParserMatchClass = Imm32AsmOperand;
+}
+
+/// imm1_7 predicate - Immediate in the range [1,7].
+def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
+  let ParserMatchClass = Imm1_7AsmOperand;
+}
+
+/// imm1_15 predicate - Immediate in the range [1,15].
+def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
+  let ParserMatchClass = Imm1_15AsmOperand;
+}
+
+/// imm1_31 predicate - Immediate in the range [1,31].
+def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
+  let ParserMatchClass = Imm1_31AsmOperand;
+}
+
+/// imm0_15 predicate - Immediate in the range [0,15].
+def Imm0_15AsmOperand: ImmAsmOperand {
+  let Name = "Imm0_15";
+  let DiagnosticType = "ImmRange0_15";
+}
+def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 16;
+}]> {
+  let ParserMatchClass = Imm0_15AsmOperand;
+}
+
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_31AsmOperand;
+}
+
+/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
+def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_32AsmOperand;
+}
+
+/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
+def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 64;
+}]> {
+  let ParserMatchClass = Imm0_63AsmOperand;
+}
+
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+  let Name = "Imm0_239";
+  let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+  let ParserMatchClass = Imm0_239AsmOperand;
+}
+
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+  let ParserMatchClass = Imm0_255AsmOperand;
+}
+
+/// imm0_65535 - An immediate is in the range [0.65535].
+def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535AsmOperand;
+}
+
+// imm0_65535_neg - An immediate whose negative value is in the range [0.65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+  return -Imm >= 0 && -Imm < 65536;
+}]>;
+
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
+//
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
+  let EncoderMethod = "getHiLo16ImmOpValue";
+  let ParserMatchClass = Imm0_65535ExprAsmOperand;
+}
+
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+  let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+  let ParserMatchClass = Imm24bitAsmOperand;
+}
+
+
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+  let Name = "Bitfield";
+  let ParserMethod = "parseBitfield";
+}
+
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{
+  return ARM::isBitFieldInvertedMask(N->getZExtValue());
+}] > {
+  let EncoderMethod = "getBitfieldInvertedMaskOpValue";
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+  let DecoderMethod = "DecodeBitfieldMaskOperand";
+  let ParserMatchClass = BitfieldAsmOperand;
+}
+
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
+}]>;
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+   uint64_t Imm = N->getZExtValue();
+   return Imm > 0 && Imm <= 32;
+ }],
+    imm1_32_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+    imm1_16_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_16AsmOperand;
+}
+
+// Define ARM specific addressing modes.
+// addrmode_imm12 := reg +/- imm12
+//
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
+class AddrMode_Imm12 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
+  // 12-bit immediate operand. Note that instructions using this encode
+  // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other
+  // immediate values are as normal.
+
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeAddrModeImm12Operand";
+  let ParserMatchClass = MemImm12OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def addrmode_imm12 : AddrMode_Imm12 {
+  let PrintMethod = "printAddrModeImm12Operand<false>";
+}
+
+def addrmode_imm12_pre : AddrMode_Imm12 {
+  let PrintMethod = "printAddrModeImm12Operand<true>";
+}
+
+// ldst_so_reg := reg +/- reg shop imm
+//
+def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
+def ldst_so_reg : Operand<i32>,
+                  ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
+  let EncoderMethod = "getLdStSORegOpValue";
+  // FIXME: Simplify the printer
+  let PrintMethod = "printAddrMode2Operand";
+  let DecoderMethod = "DecodeSORegMemOperand";
+  let ParserMatchClass = MemRegOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
+}
+
+// postidx_imm8 := +/- [0,255]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : Operand<i32> {
+  let PrintMethod = "printPostIdxImm8Operand";
+  let ParserMatchClass = PostIdxImm8AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : Operand<i32> {
+  let PrintMethod = "printPostIdxImm8s4Operand";
+  let ParserMatchClass = PostIdxImm8s4AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+
+// postidx_reg := +/- reg
+//
+def PostIdxRegAsmOperand : AsmOperandClass {
+  let Name = "PostIdxReg";
+  let ParserMethod = "parsePostIdxReg";
+}
+def postidx_reg : Operand<i32> {
+  let EncoderMethod = "getPostIdxRegOpValue";
+  let DecoderMethod = "DecodePostIdxReg";
+  let PrintMethod = "printPostIdxRegOperand";
+  let ParserMatchClass = PostIdxRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode2 := reg +/- imm12
+//           := reg +/- reg shop imm
+//
+// FIXME: addrmode2 should be refactored the rest of the way to always
+// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
+def addrmode2 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let EncoderMethod = "getAddrMode2OpValue";
+  let PrintMethod = "printAddrMode2Operand";
+  let ParserMatchClass = AddrMode2AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+  let Name = "PostIdxRegShifted";
+  let ParserMethod = "parsePostIdxReg";
+}
+def am2offset_reg : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  // When using this for assembly, it's always as a post-index offset.
+  let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigal at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
+def am2offset_imm : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  let ParserMatchClass = AM2OffsetImmAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+// FIXME: split into imm vs. reg versions.
+def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; }
+class AddrMode3 : Operand<i32>,
+                  ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+  let EncoderMethod = "getAddrMode3OpValue";
+  let ParserMatchClass = AddrMode3AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def addrmode3 : AddrMode3
+{
+  let PrintMethod = "printAddrMode3Operand<false>";
+}
+
+def addrmode3_pre : AddrMode3
+{
+  let PrintMethod = "printAddrMode3Operand<true>";
+}
+
+// FIXME: split into imm vs. reg versions.
+// FIXME: parser method to handle +/- register.
+def AM3OffsetAsmOperand : AsmOperandClass {
+  let Name = "AM3Offset";
+  let ParserMethod = "parseAM3Offset";
+}
+def am3offset : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode3Offset",
+                               [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode3OffsetOpValue";
+  let PrintMethod = "printAddrMode3OffsetOperand";
+  let ParserMatchClass = AM3OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// ldstm_mode := {ia, ib, da, db}
+//
+def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
+  let EncoderMethod = "getLdStmModeOpValue";
+  let PrintMethod = "printLdStmModeOperand";
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; }
+class AddrMode5 : Operand<i32>,
+                  ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+  let EncoderMethod = "getAddrMode5OpValue";
+  let DecoderMethod = "DecodeAddrMode5Operand";
+  let ParserMatchClass = AddrMode5AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm);
+}
+
+def addrmode5 : AddrMode5 {
+   let PrintMethod = "printAddrMode5Operand<false>";
+}
+
+def addrmode5_pre : AddrMode5 {
+   let PrintMethod = "printAddrMode5Operand<true>";
+}
+
+// addrmode6 := reg with optional alignment
+//
+def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
+def addrmode6 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+def am6offset : Operand<i32>,
+                ComplexPattern<i32, 1, "SelectAddrMode6Offset",
+                               [], [SDNPWantRoot]> {
+  let PrintMethod = "printAddrMode6OffsetOperand";
+  let MIOperandInfo = (ops GPR);
+  let EncoderMethod = "getAddrMode6OffsetOpValue";
+  let DecoderMethod = "DecodeGPRRegisterClass";
+}
+
+// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
+// (single element from one lane) for size 32.
+def addrmode6oneL32 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
+}
+
+// Base class for addrmode6 with specific alignment restrictions.
+class AddrMode6Align : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+}
+
+// Special version of addrmode6 to handle no allowed alignment encoding for
+// VLD/VST instructions and checking the alignment is not specified.
+def AddrMode6AlignNoneAsmOperand : AsmOperandClass {
+  let Name = "AlignedMemoryNone";
+  let DiagnosticType = "AlignedMemoryRequiresNone";
+}
+def addrmode6alignNone : AddrMode6Align {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6AlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle 16-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align16AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory16";
+  let DiagnosticType = "AlignedMemoryRequires16";
+}
+def addrmode6align16 : AddrMode6Align {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6Align16AsmOperand;
+}
+
+// Special version of addrmode6 to handle 32-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align32AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory32";
+  let DiagnosticType = "AlignedMemoryRequires32";
+}
+def addrmode6align32 : AddrMode6Align {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6Align32AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64";
+  let DiagnosticType = "AlignedMemoryRequires64";
+}
+def addrmode6align64 : AddrMode6Align {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6Align64AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
+// for VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64or128AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128";
+  let DiagnosticType = "AlignedMemoryRequires64or128";
+}
+def addrmode6align64or128 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit, 128-bit or 256-bit alignment
+// encoding for VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64or128or256AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128or256";
+  let DiagnosticType = "AlignedMemoryRequires64or128or256";
+}
+def addrmode6align64or128or256 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128, 256 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128or256AsmOperand;
+}
+
+// Special version of addrmode6 to handle alignment encoding for VLD-dup
+// instructions, specifically VLD4-dup.
+def addrmode6dup : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+  // FIXME: This is close, but not quite right. The alignment specifier is
+  // different.
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+// Base class for addrmode6dup with specific alignment restrictions.
+class AddrMode6DupAlign : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+}
+
+// Special version of addrmode6 to handle no allowed alignment encoding for
+// VLD-dup instruction and checking the alignment is not specified.
+def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemoryNone";
+  let DiagnosticType = "DupAlignedMemoryRequiresNone";
+}
+def addrmode6dupalignNone : AddrMode6DupAlign {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle 16-bit alignment encoding for VLD-dup
+// instruction and checking the alignment value.
+def AddrMode6dupAlign16AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory16";
+  let DiagnosticType = "DupAlignedMemoryRequires16";
+}
+def addrmode6dupalign16 : AddrMode6DupAlign {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign16AsmOperand;
+}
+
+// Special version of addrmode6 to handle 32-bit alignment encoding for VLD-dup
+// instruction and checking the alignment value.
+def AddrMode6dupAlign32AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory32";
+  let DiagnosticType = "DupAlignedMemoryRequires32";
+}
+def addrmode6dupalign32 : AddrMode6DupAlign {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign32AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit alignment encoding for VLD
+// instructions and checking the alignment value.
+def AddrMode6dupAlign64AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64";
+  let DiagnosticType = "DupAlignedMemoryRequires64";
+}
+def addrmode6dupalign64 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
+// for VLD instructions and checking the alignment value.
+def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64or128";
+  let DiagnosticType = "DupAlignedMemoryRequires64or128";
+}
+def addrmode6dupalign64or128 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand;
+}
+
+// addrmodepc := pc + reg
+//
+def addrmodepc : Operand<i32>,
+                 ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+  let PrintMethod = "printAddrModePCOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addr_offset_none := reg
+//
+def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; }
+def addr_offset_none : Operand<i32>,
+                       ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
+  let PrintMethod = "printAddrMode7Operand";
+  let DecoderMethod = "DecodeAddrMode7Operand";
+  let ParserMatchClass = MemNoOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base);
+}
+
+def nohash_imm : Operand<i32> {
+  let PrintMethod = "printNoHashImmediate";
+}
+
+def CoprocNumAsmOperand : AsmOperandClass {
+  let Name = "CoprocNum";
+  let ParserMethod = "parseCoprocNumOperand";
+}
+def p_imm : Operand<i32> {
+  let PrintMethod = "printPImmediate";
+  let ParserMatchClass = CoprocNumAsmOperand;
+  let DecoderMethod = "DecodeCoprocessor";
+}
+
+def CoprocRegAsmOperand : AsmOperandClass {
+  let Name = "CoprocReg";
+  let ParserMethod = "parseCoprocRegOperand";
+}
+def c_imm : Operand<i32> {
+  let PrintMethod = "printCImmediate";
+  let ParserMatchClass = CoprocRegAsmOperand;
+}
+def CoprocOptionAsmOperand : AsmOperandClass {
+  let Name = "CoprocOption";
+  let ParserMethod = "parseCoprocOptionOperand";
+}
+def coproc_option_imm : Operand<i32> {
+  let PrintMethod = "printCoprocOptionImm";
+  let ParserMatchClass = CoprocOptionAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
+/// binop that produces a value.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                        PatFrag opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
+  }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the AsI1_bin_irs counterpart.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                        PatFrag opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
+  }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, PatFrag opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>,
+                         Sched<[WriteALU, ReadALU]>;
+
+  def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>,
+                         Sched<[WriteALU, ReadALU, ReadALU]> {
+    let isCommutable = Commutable;
+  }
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+                                                so_reg_imm:$shift))]>,
+                          Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+                                                so_reg_reg:$shift))]>,
+                          Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, PatFrag opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]>;
+
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift,
+                                             GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift,
+                                             GPR:$Rn))]>,
+            Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// a explicit result, only implicitly set CPSR.
+let isCompare = 1, Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0> {
+  def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii,
+               opc, "\t$Rn, $imm",
+               [(opnode GPR:$Rn, so_imm:$imm)]>,
+           Sched<[WriteCMP, ReadALU]> {
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-0} = imm;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
+               opc, "\t$Rn, $Rm",
+               [(opnode GPR:$Rn, GPR:$Rm)]>,
+           Sched<[WriteCMP, ReadALU, ReadALU]> {
+    bits<4> Rn;
+    bits<4> Rm;
+    let isCommutable = Commutable;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsi : AI1<opcod, (outs),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPR:$Rn, so_reg_imm:$shift)]>,
+            Sched<[WriteCMPsi, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsr : AI1<opcod, (outs),
+               (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>,
+            Sched<[WriteCMPsr, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+
+}
+}
+
+/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_ext_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{11-10} = rot;
+ }
+
+/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode GPR:$Rn,
+                                     (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+        Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{9-4}   = 0b000111;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_exta_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{11-10} = rot;
+}
+
+/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+                (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPRnopc:$Rd),
+                (ins GPRnopc:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPRnopc:$Rd, CPSR,
+                    (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+  }
+}
+
+/// AI1_rsc_irs - Define instructions and patterns for rsc
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+  }
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                  [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift),
+                  AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                 [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt),
+                   (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                   [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt),
+                   (ins ldst_so_reg:$shift),
+                   AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                   [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+
+multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPR:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPR:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift),
+                  AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                 [(opnode GPR:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+                   AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                   [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+/// the function.  The first operand is the ID# for this instruction, the second
+/// is the index into the MachineConstantPool that this is, the third is the
+/// size in bytes of this constant pool entry.
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                    i32imm:$size), NoItinerary, []>;
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
+           [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
+           [(ARMcallseq_start timm:$amt)]>;
+}
+
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
+              "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>,
+           Requires<[IsARM, HasV6]> {
+  bits<8> imm;
+  let Inst{27-8} = 0b00110010000011110000;
+  let Inst{7-0} = imm;
+}
+
+def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+             "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{27-20} = 0b01101000;
+  let Inst{7-4} = 0b1011;
+  let Inst{11-8} = 0b1111;
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// The 16-bit operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "bkpt", "\t$val", []>, Requires<[IsARM]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010010;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (BKPT 0)>, Requires<[IsARM]>;
+
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010000;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
+class CPS<dag iops, string asm_ops>
+  : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
+        []>, Requires<[IsARM]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-28} = 0b1111;
+  let Inst{27-20} = 0b00010000;
+  let Inst{19-18} = imod;
+  let Inst{17}    = M; // Enabled if mode is set;
+  let Inst{16-9}  = 0b00000000;
+  let Inst{8-6}   = iflags;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = mode;
+}
+
+let DecoderMethod = "DecodeCPSInstruction" in {
+let M = 1 in
+  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
+                  "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
+
+let imod = 0, iflags = 0, M = 1 in
+  def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+}
+
+// Preload signals the memory system of possible future data/instruction access.
+multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
+
+  def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm,
+                IIC_Preload, !strconcat(opc, "\t$addr"),
+                [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
+                Sched<[WritePreLd]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-26} = 0b111101;
+    let Inst{25} = 0; // 0 for immediate form
+    let Inst{24} = data;
+    let Inst{23} = addr{12};        // U (add = ('U' == 1))
+    let Inst{22} = read;
+    let Inst{21-20} = 0b01;
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+
+  def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
+               !strconcat(opc, "\t$shift"),
+               [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>,
+               Sched<[WritePreLd]> {
+    bits<17> shift;
+    let Inst{31-26} = 0b111101;
+    let Inst{25} = 1; // 1 for register form
+    let Inst{24} = data;
+    let Inst{23} = shift{12};    // U (add = ('U' == 1))
+    let Inst{22} = read;
+    let Inst{21-20} = 0b01;
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = shift{11-0};
+    let Inst{4} = 0;
+  }
+}
+
+defm PLD  : APreLoad<1, 1, "pld">,  Requires<[IsARM]>;
+defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
+defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
+
+def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
+                 "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
+  bits<1> end;
+  let Inst{31-10} = 0b1111000100000001000000;
+  let Inst{9} = end;
+  let Inst{8-0} = 0;
+}
+
+def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
+             []>, Requires<[IsARM, HasV7]> {
+  bits<4> opt;
+  let Inst{27-4} = 0b001100100000111100001111;
+  let Inst{3-0} = opt;
+}
+
+// A8.8.247  UDF - Undefined (Encoding A1)
+def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary,
+                "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> {
+  bits<16> imm16;
+  let Inst{31-28} = 0b1110; // AL
+  let Inst{27-25} = 0b011;
+  let Inst{24-20} = 0b11111;
+  let Inst{19-8} = imm16{15-4};
+  let Inst{7-4} = 0b1111;
+  let Inst{3-0} = imm16{3-0};
+}
+
+/*
+ * A5.4 Permanently UNDEFINED instructions.
+ *
+ * For most targets use UDF #65006, for which the OS will generate SIGTRAP.
+ * Other UDF encodings generate SIGILL.
+ *
+ * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
+ * Encoding A1:
+ *  1110 0111 1111 iiii iiii iiii 1111 iiii
+ * Encoding T1:
+ *  1101 1110 iiii iiii
+ * It uses the following encoding:
+ *  1110 0111 1111 1110 1101 1110 1111 0000
+ *  - In ARM: UDF #60896;
+ *  - In Thumb: UDF #254 followed by a branch-to-self.
+ */
+let isBarrier = 1, isTerminator = 1 in
+def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
+           Requires<[IsARM,UseNaClTrap]> {
+  let Inst = 0xe7fedef0;
+}
+let isBarrier = 1, isTerminator = 1 in
+def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
+           Requires<[IsARM,DontUseNaClTrap]> {
+  let Inst = 0xe7ffdefe;
+}
+
+// Address computation and loads and stores in PIC mode.
+let isNotDuplicable = 1 in {
+def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+                            4, IIC_iALUr,
+                            [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>,
+                            Sched<[WriteALU, ReadALU]>;
+
+let AddedComplexity = 10 in {
+def PICLDR  : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_r,
+                            [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>;
+}
+let AddedComplexity = 10 in {
+def PICSTR  : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src,
+                                                   addrmodepc:$addr)]>;
+
+def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+} // isNotDuplicable = 1
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+// The 'adr' mnemonic encodes differently if the label is before or after
+// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
+// know until then which form of the instruction will be used.
+def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
+                 MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>,
+                 Sched<[WriteALU, ReadALU]> {
+  bits<4> Rd;
+  bits<14> label;
+  let Inst{27-25} = 0b001;
+  let Inst{24} = 0;
+  let Inst{23-22} = label{13-12};
+  let Inst{21} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = label{11-0};
+}
+
+let hasSideEffects = 1 in {
+def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
+                    4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+
+def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
+                      (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                      4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  // ARMV4T and above
+  def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+                  "bx", "\tlr", [(ARMretflag)]>,
+               Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+    let Inst{27-0}  = 0b0001001011111111111100011110;
+  }
+
+  // ARMV4 only
+  def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+                  "mov", "\tpc, lr", [(ARMretflag)]>,
+               Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
+    let Inst{27-0} = 0b0001101000001111000000001110;
+  }
+
+  // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
+  // the user-space one).
+  def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
+                                 4, IIC_Br,
+                                 [(ARMintretflag imm:$offset)]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  // ARMV4T and above
+  def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
+                  [(brind GPR:$dst)]>,
+              Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+    bits<4> dst;
+    let Inst{31-4} = 0b1110000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+
+  def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
+                  "bx", "\t$dst", [/* pattern left blank */]>,
+              Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+    bits<4> dst;
+    let Inst{27-4} = 0b000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+}
+
+// SP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead.
+let isCall = 1,
+  // FIXME:  Do we really need a non-predicated version? If so, it should
+  // at least be a pseudo instruction expanding to the predicated version
+  // at MC lowering time.
+  Defs = [LR], Uses = [SP] in {
+  def BL  : ABXI<0b1011, (outs), (ins bl_target:$func),
+                IIC_Br, "bl\t$func",
+                [(ARMcall tglobaladdr:$func)]>,
+            Requires<[IsARM]>, Sched<[WriteBrL]> {
+    let Inst{31-28} = 0b1110;
+    bits<24> func;
+    let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
+                   IIC_Br, "bl", "\t$func",
+                   [(ARMcall_pred tglobaladdr:$func)]>,
+                Requires<[IsARM]>, Sched<[WriteBrL]> {
+    bits<24> func;
+    let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  // ARMv5T and above
+  def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
+                IIC_Br, "blx\t$func",
+                [(ARMcall GPR:$func)]>,
+            Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+    bits<4> func;
+    let Inst{31-4} = 0b1110000100101111111111110011;
+    let Inst{3-0}  = func;
+  }
+
+  def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
+                    IIC_Br, "blx", "\t$func",
+                    [(ARMcall_pred GPR:$func)]>,
+                 Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+    bits<4> func;
+    let Inst{27-4} = 0b000100101111111111110011;
+    let Inst{3-0}  = func;
+  }
+
+  // ARMv4T
+  // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
+  def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+                   8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>;
+
+  // ARMv4
+  def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+                   8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+  // mov lr, pc; b if callee is marked noreturn to avoid confusing the
+  // return stack predictor.
+  def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
+                               8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
+                      Requires<[IsARM]>, Sched<[WriteBr]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+  // a two-value operand where a dag node expects two operands. :(
+  def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
+               IIC_Br, "b", "\t$target",
+               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
+               Sched<[WriteBr]>  {
+    bits<24> target;
+    let Inst{23-0} = target;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  let isBarrier = 1 in {
+    // B is "predicable" since it's just a Bcc with an 'always' condition.
+    let isPredicable = 1 in
+    // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
+    // should be sufficient.
+    // FIXME: Is B really a Barrier? That doesn't seem right.
+    def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br,
+                [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>,
+                Sched<[WriteBr]>;
+
+    let isNotDuplicable = 1, isIndirectBranch = 1 in {
+    def BR_JTr : ARMPseudoInst<(outs),
+                      (ins GPR:$target, i32imm:$jt, i32imm:$id),
+                      0, IIC_Br,
+                      [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>,
+                      Sched<[WriteBr]>;
+    // FIXME: This shouldn't use the generic "addrmode2," but rather be split
+    // into i12 and rs suffixed versions.
+    def BR_JTm : ARMPseudoInst<(outs),
+                     (ins addrmode2:$target, i32imm:$jt, i32imm:$id),
+                     0, IIC_Br,
+                     [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+                       imm:$id)]>, Sched<[WriteBrTbl]>;
+    def BR_JTadd : ARMPseudoInst<(outs),
+                   (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id),
+                   0, IIC_Br,
+                   [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
+                     imm:$id)]>, Sched<[WriteBrTbl]>;
+    } // isNotDuplicable = 1, isIndirectBranch = 1
+  } // isBarrier = 1
+
+}
+
+// BLX (immediate)
+def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
+               "blx\t$target", []>,
+           Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+  let Inst{31-25} = 0b1111101;
+  bits<25> target;
+  let Inst{23-0} = target{24-1};
+  let Inst{24} = target{0};
+}
+
+// Branch and Exchange Jazelle
+def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
+              [/* pattern left blank */]>, Sched<[WriteBr]> {
+  bits<4> func;
+  let Inst{23-20} = 0b0010;
+  let Inst{19-8} = 0xfff;
+  let Inst{7-4} = 0b0010;
+  let Inst{3-0} = func;
+}
+
+// Tail calls.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+                   Sched<[WriteBr]>;
+
+  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+                   Sched<[WriteBr]>;
+
+  def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
+                                 4, IIC_Br, [],
+                                 (Bcc br_target:$dst, (ops 14, zero_reg))>,
+                                 Requires<[IsARM]>, Sched<[WriteBr]>;
+
+  def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
+                                 4, IIC_Br, [],
+                                 (BX GPR:$dst)>, Sched<[WriteBr]>,
+                                 Requires<[IsARM]>;
+}
+
+// Secure Monitor Call is a system instruction.
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+              []>, Requires<[IsARM, HasTrustZone]> {
+  bits<4> opt;
+  let Inst{23-4} = 0b01100000000000000111;
+  let Inst{3-0} = opt;
+}
+
+// Supervisor Call (Software Interrupt)
+let isCall = 1, Uses = [SP] in {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
+          Sched<[WriteBr]> {
+  bits<24> svc;
+  let Inst{23-0} = svc;
+}
+}
+
+// Store Return State
+class SRSI<bit wb, string asm>
+  : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<5> mode;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 1;
+  let Inst{21} = wb;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1101;  // SP
+  let Inst{15-5} = 0b00000101000;
+  let Inst{4-0} = mode;
+}
+
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+
+def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>;
+def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>;
+def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>;
+def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>;
+def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>;
+
+// Return From Exception
+class RFEI<bit wb, string asm>
+  : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<4> Rn;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 0;
+  let Inst{21} = wb;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xa00;
+}
+
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+  let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+  let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+  let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+  let Inst{24-23} = 0b11;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / Store Instructions.
+//
+
+// Load
+
+
+defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
+                    UnOpFrag<(load node:$Src)>>;
+defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+                    UnOpFrag<(zextloadi8 node:$Src)>>;
+defm STR  : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
+                   BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+                   BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
+    isReMaterializable = 1, isCodeGenOnly = 1 in
+def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                 AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
+                 []> {
+  bits<4> Rt;
+  bits<17> addr;
+  let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rt;
+  let Inst{11-0}  = addr{11-0};   // imm12
+}
+
+// Loads with zero extension
+def LDRH  : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                  IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
+                  [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+  // Load doubleword
+  def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
+                   LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
+             Requires<[IsARM, HasV5TE]>;
+}
+
+def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "lda", "\t$Rt, $addr", []>;
+def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldab", "\t$Rt, $addr", []>;
+def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldah", "\t$Rt, $addr", []>;
+
+// Indexed loads
+multiclass AI2_ldridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins addrmode_imm12_pre:$addr), IndexModePre, LdFrm, iii,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let DecoderMethod = "DecodeLDRPreImm";
+  }
+
+  def _PRE_REG  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let Inst{4} = 0;
+    let DecoderMethod = "DecodeLDRPreReg";
+  }
+
+  def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                       IndexModePost, LdFrm, iir,
+                       opc, "\t$Rt, $addr, $offset",
+                       "$addr.base = $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+     let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                      IndexModePost, LdFrm, iii,
+                      opc, "\t$Rt, $addr, $offset",
+                      "$addr.base = $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+
+}
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+// FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or
+// IIC_iLoad_siu depending on whether it the offset register is shifted.
+defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
+defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
+}
+
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
+  def _PRE  : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addrmode3_pre:$addr), IndexModePre,
+                        LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<14> addr;
+    let Inst{23}    = addr{8};      // U bit
+    let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr{12-9};   // Rn
+    let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+    let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+  def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addr_offset_none:$addr, am3offset:$offset),
+                        IndexModePost, LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                        []> {
+    bits<10> offset;
+    bits<4> addr;
+    let Inst{23}    = offset{8};      // U bit
+    let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr;
+    let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+    let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+}
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1 in {
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addrmode3_pre:$addr), IndexModePre,
+                          LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addr_offset_none:$addr, am3offset:$offset),
+                          IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, neverHasSideEffects = 1
+
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                    (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                    IndexModePost, LdFrm, IIC_iLoad_ru,
+                    "ldrt", "\t$Rt, $addr, $offset",
+                    "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRT_POST_IMM
+  : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+               (ins addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, LdFrm, IIC_iLoad_ru,
+               "ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                     (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                     IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                     "ldrbt", "\t$Rt, $addr, $offset",
+                     "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_IMM
+  : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+               (ins addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+               "ldrbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+multiclass AI3ldrT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_imm8:$offset),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+  }
+  def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Unpredictable{11-8} = 0b1111;
+    let Inst{3-0} = Rm{3-0};
+    let DecoderMethod = "DecodeLDR";
+  }
+}
+
+defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
+defm LDRHT  : AI3ldrT<0b1011, "ldrht">;
+defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
+}
+
+def LDRT_POST
+  : ARMAsmPseudo<"ldrt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
+                 (outs GPR:$Rt)>;
+
+def LDRBT_POST
+  : ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
+                 (outs GPR:$Rt)>;
+
+// Store
+
+// Stores with truncate
+def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
+               IIC_iStore_bh_r, "strh", "\t$Rt, $addr",
+               [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
+
+// Store doubleword
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+  def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+                    StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>,
+             Requires<[IsARM, HasV5TE]> {
+    let Inst{21} = 0;
+  }
+}
+
+// Indexed stores
+multiclass AI2_stridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                            (ins GPR:$Rt, addrmode_imm12_pre:$addr), IndexModePre,
+                            StFrm, iii,
+                            opc, "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{11-0}  = addr{11-0};   // imm12
+    let DecoderMethod = "DecodeSTRPreImm";
+  }
+
+  def _PRE_REG  : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, ldst_so_reg:$addr),
+                      IndexModePre, StFrm, iir,
+                      opc, "\t$Rt, $addr!",
+                      "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23}    = addr{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0}  = addr{11-0};
+    let Inst{4}     = 0;           // Inst{4} = 0
+    let DecoderMethod = "DecodeSTRPreReg";
+  }
+  def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                IndexModePost, StFrm, iir,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+     let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                IndexModePost, StFrm, iii,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+}
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+// FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or
+// IIC_iStore_siu depending on whether it the offset register is shifted.
+defm STR  : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
+}
+
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_reg:$offset),
+             (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_imm:$offset),
+             (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_reg:$offset),
+             (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_imm:$offset),
+             (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
+}
+
+
+
+def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+                           (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre,
+                           StMiscFrm, IIC_iStore_bh_ru,
+                           "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+                       (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
+                       IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+                       "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                   [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
+                                                      addr_offset_none:$addr,
+                                                      am3offset:$offset))]> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr),
+                          IndexModePre, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
+                               am3offset:$offset),
+                          IndexModePost, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// STRT, STRBT, and STRHT
+
+def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST_IMM
+  : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+               (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, StFrm, IIC_iStore_bh_ru,
+               "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST
+  : ARMAsmPseudo<"strbt${q} $Rt, $addr",
+                 (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_ru,
+                   "strt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRT_POST_IMM
+  : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+               (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, StFrm, IIC_iStore_ru,
+               "strt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+}
+
+def STRT_POST
+  : ARMAsmPseudo<"strt${q} $Rt, $addr",
+                 (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
+multiclass AI3strT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                    (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
+                    IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                    "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+  }
+  def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                      (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Inst{3-0} = Rm{3-0};
+  }
+}
+
+
+defm STRHT : AI3strT<0b1011, "strht">;
+
+def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                   NoItinerary, "stl", "\t$Rt, $addr", []>;
+def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlb", "\t$Rt, $addr", []>;
+def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlh", "\t$Rt, $addr", []>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // IA is the default, so no need for an explicit suffix on the
+  // mnemonic here. Without it is the canonical spelling.
+  def IA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def IB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+}
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
+                         IIC_iLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
+                         IIC_iStore_mu>;
+
+} // neverHasSideEffects
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                 reglist:$regs, variable_ops),
+                     4, IIC_iLoad_mBr, [],
+                     (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+      RegConstraint<"$Rn = $wb">;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m,
+                               IIC_iLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
+                               IIC_iStore_mu>;
+
+
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let neverHasSideEffects = 1 in
+def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
+                "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{19-16} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+}
+
+// A version for the smaller set of tail call registers.
+let neverHasSideEffects = 1 in
+def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
+                IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+}
+
+def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
+                DPSoRegRegFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src",
+                [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP,
+                Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-8} = src{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = src{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
+                DPSoRegImmFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
+                UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-5} = src{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
+                "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP,
+                Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-0} = imm;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
+                 DPFrm, IIC_iMOVi,
+                 "movw", "\t$Rd, $imm",
+                 [(set GPR:$Rd, imm0_65535:$imm)]>,
+                 Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+}
+
+def : InstAlias<"mov${p} $Rd, $imm",
+                (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>,
+        Requires<[IsARM]>;
+
+def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                      Sched<[WriteALU]>;
+
+let Constraints = "$src = $Rd" in {
+def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
+                  (ins GPR:$src, imm0_65535_expr:$imm),
+                  DPFrm, IIC_iMOVi,
+                  "movt", "\t$Rd, $imm",
+                  [(set GPRnopc:$Rd,
+                        (or (and GPR:$src, 0xffff),
+                            lo16AllZero:$imm))]>, UnaryDP,
+                  Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+}
+
+def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                      (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                      Sched<[WriteALU]>;
+
+} // Constraints
+
+def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
+let Uses = [CPSR] in
+def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
+                    [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
+                    Requires<[IsARM]>, Sched<[WriteALU]>;
+
+// These aren't really mov instructions, but we have to define them this way
+// due to flag operands.
+
+let Defs = [CPSR] in {
+def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
+                      Sched<[WriteALU]>, Requires<[IsARM]>;
+def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
+                      Sched<[WriteALU]>, Requires<[IsARM]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+def SXTB  : AI_ext_rrot<0b01101010,
+                         "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def SXTH  : AI_ext_rrot<0b01101011,
+                         "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+def SXTAB : AI_exta_rrot<0b01101010,
+               "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def SXTAH : AI_exta_rrot<0b01101011,
+               "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+def SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
+
+def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+def UXTB   : AI_ext_rrot<0b01101110,
+                          "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def UXTH   : AI_ext_rrot<0b01101111,
+                          "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def UXTB16 : AI_ext_rrot<0b01101100,
+                          "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+//        The transformation should probably be done as a combiner action
+//        instead so we can include a check for masking back in the upper
+//        eight bits of the source into the lower eight bits of the result.
+//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
+//               (UXTB16r_rot GPR:$Src, 3)>;
+def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
+               (UXTB16 GPR:$Src, 1)>;
+
+def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+}
+
+// This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
+def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
+
+
+def SBFX  : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{27-21} = 0b0111101;
+  let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
+}
+
+def UBFX  : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{27-21} = 0b0111111;
+  let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+defm ADD  : AsI1_bin_irs<0b0100, "add",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                         BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
+defm SUB  : AsI1_bin_irs<0b0010, "sub",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                         BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set.
+//
+// Currently, ADDS/SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real ADD/SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
+defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+defm ADC : AI1_adde_sube_irs<0b0101, "adc",
+              BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
+              BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+
+defm RSB  : AsI1_rbin_irs<0b0011, "rsb",
+                          IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                          BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+defm RSC : AI1_rsc_irs<0b0111, "rsc",
+                BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+def : ARMPat<(add     GPR:$src, so_imm_neg:$imm),
+             (SUBri   GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
+             (SUBSri  GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(add     GPR:$src, imm0_65535_neg:$imm),
+             (SUBrr   GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
+             (SUBSrr  GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
+             (SBCri   GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
+             (SBCrr   GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that a xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
+
+// ARM Arithmetic Instruction
+// GPR:$dst = GPR:$a op GPR:$b
+class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
+          list<dag> pattern = [],
+          dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
+          string asm = "\t$Rd, $Rn, $Rm">
+  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
+    Sched<[WriteALU, ReadALU, ReadALU]> {
+  bits<4> Rn;
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = op27_20;
+  let Inst{11-4} = op11_4;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// Saturating add/subtract
+
+let DecoderMethod = "DecodeQADDInstruction" in
+def QADD    : AAI<0b00010000, 0b00000101, "qadd",
+                  [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+
+def QSUB    : AAI<0b00010010, 0b00000101, "qsub",
+                  [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDADD   : AAI<0b00010100, 0b00000101, "qdadd", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+def QDSUB   : AAI<0b00010110, 0b00000101, "qdsub", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+
+def QADD16  : AAI<0b01100010, 0b11110001, "qadd16">;
+def QADD8   : AAI<0b01100010, 0b11111001, "qadd8">;
+def QASX    : AAI<0b01100010, 0b11110011, "qasx">;
+def QSAX    : AAI<0b01100010, 0b11110101, "qsax">;
+def QSUB16  : AAI<0b01100010, 0b11110111, "qsub16">;
+def QSUB8   : AAI<0b01100010, 0b11111111, "qsub8">;
+def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">;
+def UQADD8  : AAI<0b01100110, 0b11111001, "uqadd8">;
+def UQASX   : AAI<0b01100110, 0b11110011, "uqasx">;
+def UQSAX   : AAI<0b01100110, 0b11110101, "uqsax">;
+def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
+def UQSUB8  : AAI<0b01100110, 0b11111111, "uqsub8">;
+
+// Signed/Unsigned add/subtract
+
+def SASX   : AAI<0b01100001, 0b11110011, "sasx">;
+def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
+def SADD8  : AAI<0b01100001, 0b11111001, "sadd8">;
+def SSAX   : AAI<0b01100001, 0b11110101, "ssax">;
+def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">;
+def SSUB8  : AAI<0b01100001, 0b11111111, "ssub8">;
+def UASX   : AAI<0b01100101, 0b11110011, "uasx">;
+def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">;
+def UADD8  : AAI<0b01100101, 0b11111001, "uadd8">;
+def USAX   : AAI<0b01100101, 0b11110101, "usax">;
+def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
+def USUB8  : AAI<0b01100101, 0b11111111, "usub8">;
+
+// Signed/Unsigned halving add/subtract
+
+def SHASX   : AAI<0b01100011, 0b11110011, "shasx">;
+def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
+def SHADD8  : AAI<0b01100011, 0b11111001, "shadd8">;
+def SHSAX   : AAI<0b01100011, 0b11110101, "shsax">;
+def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">;
+def SHSUB8  : AAI<0b01100011, 0b11111111, "shsub8">;
+def UHASX   : AAI<0b01100111, 0b11110011, "uhasx">;
+def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">;
+def UHADD8  : AAI<0b01100111, 0b11111001, "uhadd8">;
+def UHSAX   : AAI<0b01100111, 0b11110101, "uhsax">;
+def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
+def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+
+def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                MulFrm /* for convenience */, NoItinerary, "usad8",
+                "\t$Rd, $Rn, $Rm", []>,
+             Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-20} = 0b01111000;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                MulFrm /* for convenience */, NoItinerary, "usada8",
+                "\t$Rd, $Rn, $Rm, $Ra", []>,
+             Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+  let Inst{27-20} = 0b01111000;
+  let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+
+// Signed/Unsigned saturate
+
+def SSAT : AI<(outs GPRnopc:$Rd),
+              (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{27-21} = 0b0110101;
+  let Inst{5-4} = 0b01;
+  let Inst{20-16} = sat_imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
+  let Inst{3-0} = Rn;
+}
+
+def SSAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> {
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{27-20} = 0b01101010;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def USAT : AI<(outs GPRnopc:$Rd),
+              (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{27-21} = 0b0110111;
+  let Inst{5-4} = 0b01;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
+  let Inst{20-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def USAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> {
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{27-20} = 0b01101110;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos),
+               (SSAT imm:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos),
+               (USAT imm:$pos, GPRnopc:$a, 0)>;
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm AND   : AsI1_bin_irs<0b0000, "and",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm ORR   : AsI1_bin_irs<0b1100, "orr",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm EOR   : AsI1_bin_irs<0b0001, "eor",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+defm BIC   : AsI1_bin_irs<0b1110, "bic",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
+// like in the actual instruction encoding. The complexity of mapping the mask
+// to the lsb/msb pair should be handled by ISel, not encapsulated in the
+// instruction description.
+def BFC    : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "bfc", "\t$Rd, $imm", "$src = $Rd",
+               [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-0}   = 0b0011111;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // msb
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding A1)
+def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
+          AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+          "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+          [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
+                           bf_inv_mask_imm:$imm))]>,
+          Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-4}   = 0b001; // Rn: Inst{3-0} != 15
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // width
+  let Inst{3-0}   = Rn;
+}
+
+def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
+                  "mvn", "\t$Rd, $Rm",
+                  [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{15-12} = Rd;
+  let Inst{3-0} = Rm;
+}
+def  MVNsi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
+                  DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP,
+                  Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+}
+def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+                  DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+                  Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
+                  IIC_iMVNi, "mvn", "\t$Rd, $imm",
+                  [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = imm;
+}
+
+def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
+             (BICri GPR:$src, so_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+// FIXME: The v5 pseudos are only necessary for the additional Constraint
+//        property. Remove them when it's possible to add those properties
+//        on an individual MachineInstr, not just an instruction description.
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
+                    (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                    IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
+                  [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
+                  Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b0000;
+  let Unpredictable{15-12} = 0b1111;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
+                                                    pred:$p, cc_out:$s),
+                           4, IIC_iMUL32,
+               [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
+               (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+               Requires<[IsARM, NoV6, UseMulOps]>;
+}
+
+def MLA  : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
+                     (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
+                     IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
+        [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
+                     Requires<[IsARM, HasV6, UseMulOps]> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
+                           (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+                            pred:$p, cc_out:$s), 4, IIC_iMAC32,
+         [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
+  (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+
+def MLS  : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                   IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                   [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
+                   Requires<[IsARM, HasV6T2, UseMulOps]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<4> Ra;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
+                                 (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+
+def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
+                                 (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            4, IIC_iMUL64, [],
+          (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+
+def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            4, IIC_iMUL64, [],
+          (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+}
+}
+
+// Multiply + accumulate
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+                    "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+                    "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+
+def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+                    "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+let Constraints =
+    "@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in {
+def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+                              4, IIC_iMAC64, [],
+             (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+                              4, IIC_iMAC64, [],
+             (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+}
+
+} // neverHasSideEffects
+
+// Most significant word multiply
+def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b1111;
+}
+
+def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b1111;
+}
+
+def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+            Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6]>;
+
+def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6]>;
+
+multiclass AI_smul<string opc, PatFrag opnode> {
+  def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]>;
+
+  def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
+                                    (sext_inreg GPR:$Rm, i16)), (i32 16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
+                                    (sra GPR:$Rm, (i32 16))), (i32 16)))]>,
+            Requires<[IsARM, HasV5TE]>;
+}
+
+
+multiclass AI_smla<string opc, PatFrag opnode> {
+  let DecoderMethod = "DecodeSMLAInstruction" in {
+  def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd, (add GPR:$Ra,
+                               (opnode (sext_inreg GPRnopc:$Rn, i16),
+                                       (sext_inreg GPRnopc:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
+                                          (sra GPRnopc:$Rm, (i32 16)))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+                                          (sext_inreg GPRnopc:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set GPRnopc:$Rd,
+                   (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+                                         (sra GPRnopc:$Rm, (i32 16)))))]>,
+            Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+                                  (sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                 (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+                                    (sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
+            Requires<[IsARM, HasV5TE, UseMulOps]>;
+  }
+}
+
+defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// Halfword multiply accumulate long: SMLAL<x><y>.
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+// Helper class for AI_smld.
+class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
+                    InstrItinClass itin, string opc, string asm>
+  : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-23} = 0b01110;
+  let Inst{22}    = long;
+  let Inst{21-20} = 0b00;
+  let Inst{11-8}  = Rm;
+  let Inst{7}     = 0;
+  let Inst{6}     = sub;
+  let Inst{5}     = swap;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = Rn;
+}
+class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{19-16} = Rd;
+}
+class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Ra;
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+}
+class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+multiclass AI_smld<bit sub, string opc> {
+
+  def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+  def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+  def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+                  !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+
+  def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+                  !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
+
+}
+
+defm SMLA : AI_smld<0, "smla">;
+defm SMLS : AI_smld<1, "smls">;
+
+multiclass AI_sdml<bit sub, string opc> {
+
+  def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+  def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+}
+
+defm SMUA : AI_sdml<0, "smua">;
+defm SMUS : AI_sdml<1, "smus">;
+
+//===----------------------------------------------------------------------===//
+//  Division Instructions (ARMv7-A with virtualization extension)
+//
+def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "sdiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "udiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+def CLZ  : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "clz", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
+           Sched<[WriteALU]>;
+
+def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rbit", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>,
+           Requires<[IsARM, HasV6T2]>,
+           Sched<[WriteALU]>;
+
+def REV  : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rev", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+let AddedComplexity = 5 in
+def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "rev16", "\t$Rd, $Rm",
+               [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)),
+              (REV16 (LDRH addrmode3:$addr))>;
+def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr),
+               (STRH (REV16 GPR:$Rn), addrmode3:$addr)>;
+
+let AddedComplexity = 5 in
+def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "revsh", "\t$Rd, $Rm",
+               [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
+                   (and (srl GPR:$Rm, (i32 8)), 0xFF)),
+               (REVSH GPR:$Rm)>;
+
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
+               IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+                                      (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
+               IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+                                      (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+                                           0xFFFF)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (srl GPRnopc:$src2, imm16:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (sra GPRnopc:$src2, imm16_31:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
+
+//===----------------------------------------------------------------------===//
+// CRC Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
+               !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
+               [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
+               Requires<[IsARM, HasV8, HasCRC]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-28} = 0b1110;
+  let Inst{27-23} = 0b00010;
+  let Inst{22-21} = sz;
+  let Inst{20}    = 0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = 0b00;
+  let Inst{9}     = C;
+  let Inst{8}     = 0;
+  let Inst{7-4}   = 0b0100;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1101;
+}
+
+def CRC32B  : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
+def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def CRC32H  : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
+def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def CRC32W  : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
+def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+defm CMP  : AI1_cmp_irs<0b1010, "cmp",
+                        IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
+                        BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+
+// ARMcmpZ can re-use the above instruction definitions.
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm),
+             (CMPri   GPR:$src, so_imm:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
+             (CMPrr   GPR:$src, GPR:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
+             (CMPrsi   GPR:$src, so_reg_imm:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
+             (CMPrsr   GPR:$src, so_reg_reg:$rhs)>;
+
+// CMN register-integer
+let isCompare = 1, Defs = [CPSR] in {
+def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
+                "cmn", "\t$Rn, $imm",
+                [(ARMcmn GPR:$Rn, so_imm:$imm)]>,
+                Sched<[WriteCMP, ReadALU]> {
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-0} = imm;
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+// CMN register-register/shift
+def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
+                 "cmn", "\t$Rn, $Rm",
+                 [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                   GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let isCommutable = 1;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rm;
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsi : AI1<0b1011, (outs),
+                  (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
+                  "cmn", "\t$Rn, $shift",
+                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                    GPR:$Rn, so_reg_imm:$shift)]>,
+                    Sched<[WriteCMPsi, ReadALU]> {
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsr : AI1<0b1011, (outs),
+                  (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
+                  "cmn", "\t$Rn, $shift",
+                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                    GPRnopc:$Rn, so_reg_reg:$shift)]>,
+                    Sched<[WriteCMPsr, ReadALU]> {
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+}
+
+def : ARMPat<(ARMcmp  GPR:$src, so_imm_neg:$imm),
+             (CMNri   GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
+             (CMNri   GPR:$src, so_imm_neg:$imm)>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+defm TST  : AI1_cmp_irs<0b1000, "tst",
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1>;
+defm TEQ  : AI1_cmp_irs<0b1001, "teq",
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
+
+// Pseudo i64 compares for some floating point compares.
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
+    Defs = [CPSR] in {
+def BCCi64 : PseudoInst<(outs),
+    (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
+     IIC_Br,
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>,
+    Sched<[WriteBr]>;
+
+def BCCZi64 : PseudoInst<(outs),
+     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>,
+    Sched<[WriteBr]>;
+} // usesCustomInserter
+
+
+// Conditional moves
+let neverHasSideEffects = 1 in {
+
+let isCommutable = 1, isSelect = 1 in
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, GPR:$Rm, cmovpred:$p),
+                           4, IIC_iCMOVr,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
+                                                   cmovpred:$p))]>,
+             RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
+                            (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
+                            4, IIC_iCMOVsr,
+                            [(set GPR:$Rd,
+                                  (ARMcmov GPR:$false, so_reg_imm:$shift,
+                                           cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
+                            (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
+                           4, IIC_iCMOVsr,
+  [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+
+let isMoveImm = 1 in
+def MOVCCi16
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                    4, IIC_iMOVi,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
+      Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
+                           4, IIC_iCMOVi,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm,
+                                                   cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+// Two instruction predicate mov immediate.
+let isMoveImm = 1 in
+def MOVCCi32imm
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, i32imm:$src, cmovpred:$p),
+                    8, IIC_iCMOVix2,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+
+let isMoveImm = 1 in
+def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
+                           4, IIC_iCMOVi,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm,
+                                                   cmovpred:$p))]>,
+                RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+} // neverHasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+def MemBarrierOptOperand : AsmOperandClass {
+  let Name = "MemBarrierOpt";
+  let ParserMethod = "parseMemBarrierOptOperand";
+}
+def memb_opt : Operand<i32> {
+  let PrintMethod = "printMemBOption";
+  let ParserMatchClass = MemBarrierOptOperand;
+  let DecoderMethod = "DecodeMemBarrierOption";
+}
+
+def InstSyncBarrierOptOperand : AsmOperandClass {
+  let Name = "InstSyncBarrierOpt";
+  let ParserMethod = "parseInstSyncBarrierOptOperand";
+}
+def instsyncb_opt : Operand<i32> {
+  let PrintMethod = "printInstSyncBOption";
+  let ParserMatchClass = InstSyncBarrierOptOperand;
+  let DecoderMethod = "DecodeInstSyncBarrierOption";
+}
+
+// Memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff05;
+  let Inst{3-0} = opt;
+}
+
+def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff04;
+  let Inst{3-0} = opt;
+}
+
+// ISB has only full system option
+def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
+                "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff06;
+  let Inst{3-0} = opt;
+}
+}
+
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+  def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+}
+
+let usesCustomInserter = 1 in {
+    def COPY_STRUCT_BYVAL_I32 : PseudoInst<
+      (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
+      NoItinerary,
+      [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
+}
+
+def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def strex_1 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def strex_2 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def strex_4 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaex_1 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaex_2 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaex_4 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlex_1 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_stlex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlex_2 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_stlex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlex_4 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_stlex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+let mayLoad = 1 in {
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrexb", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrexh", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
+def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrex", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
+let hasExtraDefRegAllocReq = 1 in
+def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
+
+def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexb", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>;
+def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexh", "\t$Rt, $addr",
+                    [(set GPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>;
+def LDAEX  : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaex", "\t$Rt, $addr",
+                    [(set GPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>;
+let hasExtraDefRegAllocReq = 1 in
+def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_1 GPR:$Rt,
+                                            addr_offset_none:$addr))]>;
+def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_2 GPR:$Rt,
+                                            addr_offset_none:$addr))]>;
+def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strex", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_4 GPR:$Rt,
+                                            addr_offset_none:$addr))]>;
+let hasExtraSrcRegAllocReq = 1 in
+def STREXD : AIstrex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
+def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd,
+                          (stlex_1 GPR:$Rt, addr_offset_none:$addr))]>;
+def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd,
+                          (stlex_2 GPR:$Rt, addr_offset_none:$addr))]>;
+def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd,
+                          (stlex_4 GPR:$Rt, addr_offset_none:$addr))]>;
+let hasExtraSrcRegAllocReq = 1 in
+def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
+}
+
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
+                [(int_arm_clrex)]>,
+            Requires<[IsARM, HasV7]>  {
+  let Inst{31-0} = 0b11110101011111111111000000011111;
+}
+
+def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+             (STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+             (STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
+def : ARMPat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+             (STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+             (STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
+
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_store_release_8  : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+
+let AddedComplexity = 8 in {
+  def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr),  (LDAB addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA  addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (STLB GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
+// SWP/SWPB are deprecated in V6/V7.
+let mayLoad = 1, mayStore = 1 in {
+def SWP : AIswp<0, (outs GPRnopc:$Rt),
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
+                Requires<[PreV8]>;
+def SWPB: AIswp<1, (outs GPRnopc:$Rt),
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
+                Requires<[PreV8]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor Instructions.
+//
+
+def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+            c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+            NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+            [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                          imm:$CRm, imm:$opc2)]>,
+            Requires<[PreV8]> {
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+               c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+               NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+               [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                              imm:$CRm, imm:$opc2)]>,
+               Requires<[PreV8]> {
+  let Inst{31-28} = 0b1111;
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+class ACI<dag oops, dag iops, string opc, string asm,
+          IndexMode im = IndexModeNone>
+  : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+      opc, asm, "", []> {
+  let Inst{27-25} = 0b110;
+}
+class ACInoP<dag oops, dag iops, string opc, string asm,
+          IndexMode im = IndexModeNone>
+  : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+         opc, asm, "", []> {
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b110;
+}
+multiclass LdStCop<bit load, bit Dbit, string asm> {
+  def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                    asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                 asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                              postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : ACI<(outs),
+                    (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                         coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
+  def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                       asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                    asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                                 postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : ACInoP<(outs),
+                       (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                            coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm LDC   : LdStCop <1, 0, "ldc">;
+defm LDCL  : LdStCop <1, 1, "ldcl">;
+defm STC   : LdStCop <0, 0, "stc">;
+defm STCL  : LdStCop <0, 1, "stcl">;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>;
+defm STC2  : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>;
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register.
+//
+
+class MovRCopro<string opc, bit direction, dag oops, dag iops,
+                list<dag> pattern>
+  : ABI<0b1110, oops, iops, NoItinerary, opc,
+        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
+                    (outs),
+                    (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, imm0_7:$opc2),
+                    [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]>,
+                    ComplexDeprecationPredicate<"MCR">;
+def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                        c_imm:$CRm, 0, pred:$p)>;
+def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
+                    (outs GPRwithAPSR:$Rt),
+                    (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                         imm0_7:$opc2), []>;
+def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                        c_imm:$CRm, 0, pred:$p)>;
+
+def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+             (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRCopro2<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
+  : ABXI<0b1110, oops, iops, NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
+  let Inst{31-24} = 0b11111110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
+                      (outs),
+                      (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                           c_imm:$CRm, imm0_7:$opc2),
+                      [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                     imm:$CRm, imm:$opc2)]>,
+                      Requires<[PreV8]>;
+def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
+                      (outs GPRwithAPSR:$Rt),
+                      (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                           imm0_7:$opc2), []>,
+                      Requires<[PreV8]>;
+def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+
+def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                              imm:$CRm, imm:$opc2),
+                (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
+  : ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+        GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm),
+        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                      [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                     GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
+
+class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
+  : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+         GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
+    Requires<[PreV8]> {
+  let Inst{31-28} = 0b1111;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+
+  let DecoderMethod = "DecodeMRRC2";
+}
+
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                        GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register
+//
+
+// Move to ARM core register from Special Register
+def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+              "mrs", "\t$Rd, apsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b00001111;
+  let Unpredictable{19-17} = 0b111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>,
+         Requires<[IsARM]>;
+
+// The MRSsys instruction is the MRS instruction from the ARM ARM,
+// section B9.3.9, with the R bit set to 1.
+def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+                 "mrs", "\t$Rd, spsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b01001111;
+  let Unpredictable{19-16} = 0b1111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
+              "msr", "\t$mask, $Rn", []> {
+  bits<5> mask;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rn;
+}
+
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  so_imm:$a), NoItinerary,
+               "msr", "\t$mask, $a", []> {
+  bits<5> mask;
+  bits<12> a;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-0} = a;
+}
+
+// Dynamic stack allocation yields a _chkstk for Windows targets.  These calls
+// are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
+                      [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+  def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+// TPsoft is valid for ARM mode only, in case of Thumb mode a tTPsoft pattern
+// is defined in "ARMInstrThumb.td".
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
+  def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br,
+               [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+//   A constant value is passed in $val, and we use the location as a scratch.
+//
+// These are pseudo-instructions and are lowered to individual MC-insts, so
+// no encoding information is necessary.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                               NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                           Requires<[IsARM, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                                   NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                                Requires<[IsARM, NoVFP]>;
+}
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
+    Defs = [ R7, LR, SP ] in {
+def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
+                             NoItinerary,
+                         [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                                Requires<[IsARM, IsIOS]>;
+}
+
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
+def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ARMv4 indirect branch using (MOVr PC, dst)
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+  def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
+                    4, IIC_Br, [(brind GPR:$dst)],
+                    (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
+                  Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+// Large immediate handling.
+
+// 32-bit immediate using two piece so_imms or movw + movt.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                           [(set GPR:$dst, (arm_i32imm:$src))]>,
+                           Requires<[IsARM]>;
+
+def LDRLIT_ga_abs : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iLoad_i,
+                               [(set GPR:$dst, (ARMWrapper tglobaladdr:$src))]>,
+                    Requires<[IsARM, DontUseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if PIC).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                              IIC_iMOVix2addpc,
+                        [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                        Requires<[IsARM, UseMovt]>;
+
+def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                                 IIC_iLoadiALU,
+                                 [(set GPR:$dst,
+                                       (ARMWrapperPIC tglobaladdr:$addr))]>,
+                      Requires<[IsARM, DontUseMovt]>;
+
+def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                              NoItinerary,
+                              [(set GPR:$dst,
+                                    (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+                          Requires<[IsARM, DontUseMovt]>;
+
+let AddedComplexity = 10 in
+def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2ld,
+                    [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+                    Requires<[IsARM, UseMovt]>;
+} // isReMaterializable
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : ARMPat<(ARMWrapper  tconstpool  :$dst), (LEApcrel tconstpool  :$dst)>;
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
+            Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+             (LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// TODO: add,sub,and, 3-instr forms?
+
+// Tail calls. These patterns also apply to Thumb mode.
+def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+
+// Direct calls
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
+def : ARMPat<(ARMcall_nolink texternalsym:$func),
+             (BMOVPCB_CALL texternalsym:$func)>;
+
+// zextload i1 -> zextload i8
+def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(zextloadi1 ldst_so_reg:$addr),    (LDRBrs ldst_so_reg:$addr)>;
+
+// extload -> zextload
+def : ARMPat<(extloadi1 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi1 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+def : ARMPat<(extloadi8 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi8 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+
+def : ARMPat<(extloadi16 addrmode3:$addr),  (LDRH addrmode3:$addr)>;
+
+def : ARMPat<(extloadi8  addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
+def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
+
+// smul* and smla*
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                      (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                      (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)),
+                      (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                 (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
+                (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                      (i32 16)),
+                 (SMULWB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
+                 (SMULWB GPR:$a, GPR:$b)>;
+
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                           (sra (shl GPR:$b, (i32 16)), (i32 16)))),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, sext_16_node:$b)),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                           (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)),
+                           (sra (shl GPR:$b, (i32 16)), (i32 16)))),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                           (i32 16))),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+
+
+// Pre-v7 uses MCR for synchronization barriers.
+def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
+         Requires<[IsARM, HasV6]>;
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)),
+               (UXTAB GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)),
+               (UXTAH GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+def : ARMV6Pat<(sext_inreg GPR:$Src, i8),  (SXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>;
+
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)),
+               (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)),
+               (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
+
+// Atomic load/store patterns
+def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
+             (LDRBrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
+             (LDRBi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_load_16 addrmode3:$src),
+             (LDRH addrmode3:$src)>;
+def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
+             (LDRrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
+             (LDRi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
+             (STRBrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
+             (STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
+def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
+             (STRH GPR:$val, addrmode3:$ptr)>;
+def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
+             (STRrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
+             (STRi12 GPR:$val, addrmode_imm12:$ptr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Thumb2 Support
+//
+
+include "ARMInstrThumb2.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "ARMInstrNEON.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Memory barriers
+def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>;
+
+// System instructions
+def : MnemonicAlias<"swi", "svc">;
+
+// Load / Store Multiple
+def : MnemonicAlias<"ldmfd", "ldm">;
+def : MnemonicAlias<"ldmia", "ldm">;
+def : MnemonicAlias<"ldmea", "ldmdb">;
+def : MnemonicAlias<"stmfd", "stmdb">;
+def : MnemonicAlias<"stmia", "stm">;
+def : MnemonicAlias<"stmea", "stm">;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+        Requires<[IsARM, HasV6]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+        Requires<[IsARM, HasV6]>;
+
+// PUSH/POP aliases for STM/LDM
+def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// SSAT/USAT optional shift operand.
+def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+
+
+// Extend instruction optional rotate operand.
+def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
+                (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
+                (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxth${p} $Rd, $Rm",
+                (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
+                (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
+                (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxth${p} $Rd, $Rm",
+                (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+
+// RFE aliases
+def : MnemonicAlias<"rfefa", "rfeda">;
+def : MnemonicAlias<"rfeea", "rfedb">;
+def : MnemonicAlias<"rfefd", "rfeia">;
+def : MnemonicAlias<"rfeed", "rfeib">;
+def : MnemonicAlias<"rfe", "rfeia">;
+
+// SRS aliases
+def : MnemonicAlias<"srsfa", "srsib">;
+def : MnemonicAlias<"srsea", "srsia">;
+def : MnemonicAlias<"srsfd", "srsdb">;
+def : MnemonicAlias<"srsed", "srsda">;
+def : MnemonicAlias<"srs", "srsia">;
+
+// QSAX == QSUBADDX
+def : MnemonicAlias<"qsubaddx", "qsax">;
+// SASX == SADDSUBX
+def : MnemonicAlias<"saddsubx", "sasx">;
+// SHASX == SHADDSUBX
+def : MnemonicAlias<"shaddsubx", "shasx">;
+// SHSAX == SHSUBADDX
+def : MnemonicAlias<"shsubaddx", "shsax">;
+// SSAX == SSUBADDX
+def : MnemonicAlias<"ssubaddx", "ssax">;
+// UASX == UADDSUBX
+def : MnemonicAlias<"uaddsubx", "uasx">;
+// UHASX == UHADDSUBX
+def : MnemonicAlias<"uhaddsubx", "uhasx">;
+// UHSAX == UHSUBADDX
+def : MnemonicAlias<"uhsubaddx", "uhsax">;
+// UQASX == UQADDSUBX
+def : MnemonicAlias<"uqaddsubx", "uqasx">;
+// UQSAX == UQSUBADDX
+def : MnemonicAlias<"uqsubaddx", "uqsax">;
+// USAX == USUBADDX
+def : MnemonicAlias<"usubaddx", "usax">;
+
+// "mov Rd, so_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+                   (MVNi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+                   (MOVi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>;
+// Same for AND <--> BIC
+def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+                   (ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+                   (ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+                   (BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+                   (BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+
+// Likewise, "add Rd, so_imm_neg" -> sub
+def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+                 (SUBri GPR:$Rd, GPR:$Rn, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+                 (SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via so_imm_neg
+def : ARMInstAlias<"cmp${p} $Rd, $imm",
+                   (CMNri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+def : ARMInstAlias<"cmn${p} $Rd, $imm",
+                   (CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+
+// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
+// LSR, ROR, and RRX instructions.
+// FIXME: We need C++ parser hooks to map the alias to the MOV
+//        encoding. It seems we should be able to do that sort of thing
+//        in tblgen, but it could get ugly.
+let TwoOperandAliasConstraint = "$Rm = $Rd" in {
+def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+                             cc_out:$s)>;
+def LSRi : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+                             cc_out:$s)>;
+def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+                             cc_out:$s)>;
+def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+                             cc_out:$s)>;
+}
+def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
+                        (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>;
+let TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+}
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
+                   (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// Pre-v6, 'mov r0, r0' was used as a NOP encoding.
+def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
+         Requires<[IsARM, NoV6]>;
+
+// MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
+// the instruction definitions need difference constraints pre-v6.
+// Use these aliases for the assembly parsing on pre-v6.
+def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
+            (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
+            (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+             pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+
+// 'it' blocks in ARM mode just validate the predicates. The IT itself
+// is discarded.
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
+         ComplexDeprecationPredicate<"IT">;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
new file mode 100644
index 0000000..c02bb3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -0,0 +1,7626 @@
+//===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NEON-specific Operands.
+//===----------------------------------------------------------------------===//
+def nModImm : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+}
+
+def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
+def nImmSplatI8 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI8AsmOperand;
+}
+def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
+def nImmSplatI16 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI16AsmOperand;
+}
+def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
+def nImmSplatI32 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI32AsmOperand;
+}
+def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
+def nImmVMOVI32 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32AsmOperand;
+}
+
+def nImmVMOVI16AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi16vmovByteReplicate";
+  let PredicateMethod = "isNEONi16ByteReplicate";
+  let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMOVI32AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi32vmovByteReplicate";
+  let PredicateMethod = "isNEONi32ByteReplicate";
+  let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMVNI16AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi16invByteReplicate";
+  let PredicateMethod = "isNEONi16ByteReplicate";
+  let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+def nImmVMVNI32AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi32invByteReplicate";
+  let PredicateMethod = "isNEONi32ByteReplicate";
+  let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+
+def nImmVMOVI16ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate;
+}
+def nImmVMOVI32ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate;
+}
+def nImmVMVNI16ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate;
+}
+def nImmVMVNI32ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate;
+}
+
+def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
+def nImmVMOVI32Neg : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32NegAsmOperand;
+}
+def nImmVMOVF32 : Operand<i32> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
+def nImmSplatI64 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI64AsmOperand;
+}
+
+def VectorIndex8Operand  : AsmOperandClass { let Name = "VectorIndex8"; }
+def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; }
+def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndex8Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndex16Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndex32Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+
+// Register list of one D register.
+def VecListOneDAsmOperand : AsmOperandClass {
+  let Name = "VecListOneD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> {
+  let ParserMatchClass = VecListOneDAsmOperand;
+}
+// Register list of two sequential D registers.
+def VecListDPairAsmOperand : AsmOperandClass {
+  let Name = "VecListDPair";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPair : RegisterOperand<DPair, "printVectorListTwo"> {
+  let ParserMatchClass = VecListDPairAsmOperand;
+}
+// Register list of three sequential D registers.
+def VecListThreeDAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> {
+  let ParserMatchClass = VecListThreeDAsmOperand;
+}
+// Register list of four sequential D registers.
+def VecListFourDAsmOperand : AsmOperandClass {
+  let Name = "VecListFourD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> {
+  let ParserMatchClass = VecListFourDAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpaced";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListTwoSpaced"> {
+  let ParserMatchClass = VecListDPairSpacedAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListThreeQAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQ : RegisterOperand<DPR, "printVectorListThreeSpaced"> {
+  let ParserMatchClass = VecListThreeQAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListFourQAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQ : RegisterOperand<DPR, "printVectorListFourSpaced"> {
+  let ParserMatchClass = VecListFourQAsmOperand;
+}
+
+// Register list of one D register, with "all lanes" subscripting.
+def VecListOneDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> {
+  let ParserMatchClass = VecListOneDAllLanesAsmOperand;
+}
+// Register list of two D registers, with "all lanes" subscripting.
+def VecListDPairAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairAllLanes : RegisterOperand<DPair,
+                                           "printVectorListTwoAllLanes"> {
+  let ParserMatchClass = VecListDPairAllLanesAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpacedAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+                                         "printVectorListTwoSpacedAllLanes"> {
+  let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
+}
+// Register list of three D registers, with "all lanes" subscripting.
+def VecListThreeDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeDAllLanes : RegisterOperand<DPR,
+                                            "printVectorListThreeAllLanes"> {
+  let ParserMatchClass = VecListThreeDAllLanesAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three sequential Q regs).
+def VecListThreeQAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQAllLanes : RegisterOperand<DPR,
+                                         "printVectorListThreeSpacedAllLanes"> {
+  let ParserMatchClass = VecListThreeQAllLanesAsmOperand;
+}
+// Register list of four D registers, with "all lanes" subscripting.
+def VecListFourDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourDAllLanes : RegisterOperand<DPR, "printVectorListFourAllLanes"> {
+  let ParserMatchClass = VecListFourDAllLanesAsmOperand;
+}
+// Register list of four D registers spaced by 2 (four sequential Q regs).
+def VecListFourQAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQAllLanes : RegisterOperand<DPR,
+                                         "printVectorListFourSpacedAllLanes"> {
+  let ParserMatchClass = VecListFourQAllLanesAsmOperand;
+}
+
+
+// Register list of one D register, with byte lane subscripting.
+def VecListOneDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListOneDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListOneDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of two D registers with byte lane subscripting.
+def VecListTwoDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListTwoDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of two Q registers with half-word lane subscripting.
+def VecListTwoQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+
+// Register list of three D registers with byte lane subscripting.
+def VecListThreeDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListThreeDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of three Q registers with half-word lane subscripting.
+def VecListThreeQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of four D registers with byte lane subscripting.
+def VecListFourDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListFourDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of four Q registers with half-word lane subscripting.
+def VecListFourQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
+def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 2;
+}]>;
+def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 2;
+}]>;
+def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 1;
+}]>;
+def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 1;
+}]>;
+def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTARMVCMP    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def SDTARMVCMPZ   : SDTypeProfile<1, 1, []>;
+
+def NEONvceq      : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvceqz     : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
+def NEONvcge      : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgez     : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
+def NEONvclez     : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
+def NEONvcgeu     : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
+def NEONvcgt      : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtz     : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
+def NEONvcltz     : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
+def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
+def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+
+// Types for vector shift by immediates.  The "SHX" version is for long and
+// narrow operations where the source and destination vectors have different
+// types.  The "SHINS" version is for shift and insert operations.
+def SDTARMVSH     : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHX    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHINS  : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+
+def NEONvshl      : SDNode<"ARMISD::VSHL", SDTARMVSH>;
+def NEONvshrs     : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
+def NEONvshru     : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
+def NEONvshrn     : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
+
+def NEONvrshrs    : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
+def NEONvrshru    : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
+def NEONvrshrn    : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+
+def NEONvqshls    : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
+def NEONvqshlu    : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
+def NEONvqshlsu   : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
+def NEONvqshrns   : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
+def NEONvqshrnu   : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
+def NEONvqshrnsu  : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+
+def NEONvqrshrns  : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
+def NEONvqrshrnu  : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
+def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+
+def NEONvsli      : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
+def NEONvsri      : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+
+def SDTARMVGETLN  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def NEONvmovImm   : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
+def NEONvmvnImm   : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
+def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisVT<2, i32>]>;
+def NEONvorrImm   : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def NEONvbicImm   : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
+
+def NEONvbsl      : SDNode<"ARMISD::VBSL",
+                           SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                                SDTCisSameAs<0, 1>,
+                                                SDTCisSameAs<0, 2>,
+                                                SDTCisSameAs<0, 3>]>>;
+
+def NEONvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane  : SDNode<"ARMISD::VDUPLANE",
+                           SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                                SDTCisVT<2, i32>]>>;
+
+def SDTARMVEXT    : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+def NEONvext      : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+def SDTARMVSHUF   : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def NEONvrev64    : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32    : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16    : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVSHUF2  : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>,
+                                         SDTCisSameAs<0, 3>]>;
+def NEONzip       : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp       : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn       : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
+
+def SDTARMVMULL   : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisSameAs<1, 2>]>;
+def NEONvmulls    : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def NEONvmullu    : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
+def SDTARMFMAX    : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>]>;
+def NEONfmax      : SDNode<"ARMISD::FMAX", SDTARMFMAX>;
+def NEONfmin      : SDNode<"ARMISD::FMIN", SDTARMFMAX>;
+
+def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 32 && EltVal == 0);
+}]>;
+
+def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 8 && EltVal == 0xff);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON load / store instructions
+//===----------------------------------------------------------------------===//
+
+// Use VLDM to load a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VLDMD after reg alloc.
+def VLDMQIA
+  : PseudoVFPLdStM<(outs DPair:$dst), (ins GPR:$Rn),
+                    IIC_fpLoad_m, "",
+                   [(set DPair:$dst, (v2f64 (load GPR:$Rn)))]>;
+
+// Use VSTM to store a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VSTMD after reg alloc.
+def VSTMQIA
+  : PseudoVFPLdStM<(outs), (ins DPair:$src, GPR:$Rn),
+                    IIC_fpStore_m, "",
+                   [(store (v2f64 DPair:$src), GPR:$Rn)]>;
+
+// Classes for VLD* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset), itin,
+                "$addr.addr = $wb">;
+class VLDQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr), itin,
+                "$addr.addr = $wb">;
+class VLDQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset), itin,
+                "$addr.addr = $wb">;
+
+class VLDQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset), itin,
+                "$addr.addr = $wb">;
+class VLDQQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr), itin,
+                "$addr.addr = $wb">;
+class VLDQQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset), itin,
+                "$addr.addr = $wb">;
+
+
+class VLDQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,
+                "$src = $dst">;
+class VLDQQQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+                "$addr.addr = $wb, $src = $dst">;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+//   VLD1     : Vector Load (multiple single elements)
+class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1,
+          "vld1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1x2,
+          "vld1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+
+def  VLD1d8   : VLD1D<{0,0,0,?}, "8",  addrmode6align64>;
+def  VLD1d16  : VLD1D<{0,1,0,?}, "16", addrmode6align64>;
+def  VLD1d32  : VLD1D<{1,0,0,?}, "32", addrmode6align64>;
+def  VLD1d64  : VLD1D<{1,1,0,?}, "64", addrmode6align64>;
+
+def  VLD1q8   : VLD1Q<{0,0,?,?}, "8",  addrmode6align64or128>;
+def  VLD1q16  : VLD1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def  VLD1q32  : VLD1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def  VLD1q64  : VLD1Q<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with address register writeback:
+multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD1u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+defm VLD1d8wb  : VLD1DWB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64", addrmode6align64>;
+defm VLD1q8wb  : VLD1QWB<{0,0,?,?}, "8",  addrmode6align64or128>;
+defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with 3 registers
+class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt,
+          "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VLD1d8T      : VLD1D3<{0,0,0,?}, "8",  addrmode6align64>;
+def VLD1d16T     : VLD1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VLD1d32T     : VLD1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VLD1d64T     : VLD1D3<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VLD1d8Twb  : VLD1D3WB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
+
+// ...with 4 registers
+class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt,
+          "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+def VLD1d16Q     : VLD1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VLD1d32Q     : VLD1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VLD1d64Q     : VLD1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+defm VLD1d8Qwb   : VLD1D4WB<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+defm VLD1d16Qwb  : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VLD1d32Qwb  : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VLD1d64Qwb  : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
+
+//   VLD2     : Vector Load (multiple 2-element structures)
+class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+           InstrItinClass itin, Operand AddrMode>
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd),
+          (ins AddrMode:$Rn), itin,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST2Instruction";
+}
+
+def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+def  VLD2d16  : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+def  VLD2d32  : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+
+def  VLD2q8   : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+def  VLD2q16  : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+def  VLD2q32  : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+
+def  VLD2q8Pseudo  : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+
+// ...with address register writeback:
+multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
+                  RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> {
+  def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), itin,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+  def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), itin,
+                        "vld2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+}
+
+defm VLD2d8wb  : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+
+defm VLD2q8wb  : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+
+def VLD2q8PseudoWB_fixed     : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q8PseudoWB_register  : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+
+// ...with double-spaced registers
+def  VLD2b8    : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+def  VLD2b16   : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+def  VLD2b32   : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+defm VLD2b8wb  : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+
+//   VLD3     : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn), IIC_VLD3,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
+def  VLD3d16  : VLD3D<0b0100, {0,1,0,?}, "16">;
+def  VLD3d32  : VLD3D<0b0100, {1,0,0,?}, "32">;
+
+def  VLD3d8Pseudo  : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
+
+// ...with address register writeback:
+class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
+def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
+def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VLD3d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+
+// ...with double-spaced registers:
+def VLD3q8      : VLD3D<0b0101, {0,0,0,?}, "8">;
+def VLD3q16     : VLD3D<0b0101, {0,1,0,?}, "16">;
+def VLD3q32     : VLD3D<0b0101, {1,0,0,?}, "32">;
+def VLD3q8_UPD  : VLD3DWB<0b0101, {0,0,0,?}, "8">;
+def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
+def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VLD3q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD3q8oddPseudo   : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q16oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q32oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+
+def VLD3q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+//   VLD4     : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD4,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
+def  VLD4d16  : VLD4D<0b0000, {0,1,?,?}, "16">;
+def  VLD4d32  : VLD4D<0b0000, {1,0,?,?}, "32">;
+
+def  VLD4d8Pseudo  : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
+
+// ...with address register writeback:
+class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
+def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
+def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VLD4d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+
+// ...with double-spaced registers:
+def VLD4q8      : VLD4D<0b0001, {0,0,?,?}, "8">;
+def VLD4q16     : VLD4D<0b0001, {0,1,?,?}, "16">;
+def VLD4q32     : VLD4D<0b0001, {1,0,?,?}, "32">;
+def VLD4q8_UPD  : VLD4DWB<0b0001, {0,0,?,?}, "8">;
+def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
+def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VLD4q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD4q8oddPseudo   : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q16oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q32oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+
+def VLD4q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+// Classes for VLD*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst),
+                (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst),
+                (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst),
+                (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+
+//   VLD1LN   : Vector Load (single element to one lane)
+class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6oneL32:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
+  let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
+                                               (i32 (LoadOp addrmode6:$addr)),
+                                               imm:$lane))];
+}
+
+def VLD1LNd8  : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{5-4} = Rn{5-4};
+}
+def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> {
+  let Inst{7} = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VLD1LNq8Pseudo  : VLD1QLNPseudo<v16i8, extloadi8>;
+def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
+def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
+
+def : Pat<(vector_insert (v2f32 DPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v4f32 QPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+// ...with address register writeback:
+class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$src = $Vd, $Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVLD1LN";
+}
+
+def VLD1LNd8_UPD  : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{4};
+  let Inst{4} = Rn{4};
+}
+
+def VLD1LNq8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+
+//   VLD2LN   : Vector Load (single 2-element structure to one lane)
+class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+          IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8  : VLD2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo  : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+
+// ...with double-spaced registers:
+def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+
+// ...with address register writeback:
+class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt,
+          "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8_UPD  : VLD2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+
+def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+
+//   VLD3LN   : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3,
+          nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8  : VLD3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+
+// ...with double-spaced registers:
+def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+
+// ...with address register writeback:
+class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VLD3lnu, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
+          []> {
+  let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8_UPD  : VLD3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD3LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+
+def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+
+//   VLD4LN   : Vector Load (single 4-element structure to one lane)
+class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+          nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN";
+}
+
+def VLD4LNd8  : VLD4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+
+// ...with double-spaced registers:
+def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+
+// ...with address register writeback:
+class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VLD4lnu, "vld4", Dt,
+"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm",
+"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb",
+          []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN"  ;
+}
+
+def VLD4LNd8_UPD  : VLD4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+
+def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+//   VLD1DUP  : Vector Load (single element to all lanes)
+class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+              Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd),
+          (ins AddrMode:$Rn),
+          IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
+          [(set VecListOneDAllLanes:$Vd,
+                (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+def VLD1DUPd8  : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8,
+                         addrmode6dupalignNone>;
+def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16,
+                         addrmode6dupalign16>;
+def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load,
+                         addrmode6dupalign32>;
+
+def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPd32 addrmode6:$addr)>;
+
+class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+               Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1dup,
+          "vld1", Dt, "$Vd, $Rn", "",
+          [(set VecListDPairAllLanes:$Vd,
+                (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+
+def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8,
+                          addrmode6dupalignNone>;
+def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16,
+                          addrmode6dupalign16>;
+def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load,
+                          addrmode6dupalign32>;
+
+def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPq32 addrmode6:$addr)>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+// ...with address register writeback:
+multiclass VLD1DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+                     (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD1dupu,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+}
+multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+                     (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD1dupu,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+}
+
+defm VLD1DUPd8wb  : VLD1DUPWB<{0,0,0,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32", addrmode6dupalign32>;
+
+defm VLD1DUPq8wb  : VLD1QDUPWB<{0,0,1,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32", addrmode6dupalign32>;
+
+//   VLD2DUP  : Vector Load (single 2-element structure to all lanes)
+class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy, Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD2dup,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD2DupInstruction";
+}
+
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8",  VecListDPairAllLanes,
+                         addrmode6dupalign16>;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes,
+                         addrmode6dupalign32>;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes,
+                         addrmode6dupalign64>;
+
+// HACK this one, VLD2DUPd8x2 must be changed at the same time with VLD2b8 or
+// "vld2.8 {d0[], d2[]}, [r4:32]" will become "vld2.8 {d0, d2}, [r4:32]".
+// ...with double-spaced registers
+def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8",  VecListDPairSpacedAllLanes,
+                           addrmode6dupalign16>;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign32>;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign64>;
+
+// ...with address register writeback:
+multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
+                     Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1101, op7_4,
+                     (outs VdTy:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD2dupu,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD2DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1101, op7_4,
+                        (outs VdTy:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu,
+                        "vld2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD2DupInstruction";
+  }
+}
+
+defm VLD2DUPd8wb    : VLD2DUPWB<{0,0,0,0}, "8",  VecListDPairAllLanes,
+                                addrmode6dupalign16>;
+defm VLD2DUPd16wb   : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes,
+                                addrmode6dupalign32>;
+defm VLD2DUPd32wb   : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes,
+                                addrmode6dupalign64>;
+
+defm VLD2DUPd8x2wb  : VLD2DUPWB<{0,0,1,0}, "8",  VecListDPairSpacedAllLanes,
+                                addrmode6dupalign16>;
+defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                                addrmode6dupalign32>;
+defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                                addrmode6dupalign64>;
+
+//   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
+class VLD3DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6dup:$Rn), IIC_VLD3dup,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
+}
+
+def VLD3DUPd8  : VLD3DUP<{0,0,0,?}, "8">;
+def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
+def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
+
+def VLD3DUPd8Pseudo  : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD3DUPq8  : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
+}
+
+def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8",  addrmode6dupalign64>;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16", addrmode6dupalign64>;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32", addrmode6dupalign64>;
+
+def VLD3DUPq8_UPD  : VLD3DUPWB<{0,0,1,0}, "8",  addrmode6dupalign64>;
+def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>;
+def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>;
+
+def VLD3DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+
+//   VLD4DUP  : Vector Load (single 4-element structure to all lanes)
+class VLD4DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6dup:$Rn), IIC_VLD4dup,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
+}
+
+def VLD4DUPd8  : VLD4DUP<{0,0,0,?}, "8">;
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo  : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPq8  : VLD4DUP<{0,0,1,?}, "8">;
+def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
+def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// ...with address register writeback:
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
+}
+
+def VLD4DUPd8_UPD  : VLD4DUPWB<{0,0,0,0}, "8">;
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">;
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPq8_UPD  : VLD4DUPWB<{0,0,1,0}, "8">;
+def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
+def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+
+// Classes for VST* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">;
+class VSTQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">;
+class VSTQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+
+class VSTQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">;
+class VSTQQQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+                "$addr.addr = $wb">;
+
+//   VST1     : Vector Store (multiple single elements)
+class VST1D<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd),
+          IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd),
+          IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+
+def  VST1d8   : VST1D<{0,0,0,?}, "8",  addrmode6align64>;
+def  VST1d16  : VST1D<{0,1,0,?}, "16", addrmode6align64>;
+def  VST1d32  : VST1D<{1,0,0,?}, "32", addrmode6align64>;
+def  VST1d64  : VST1D<{1,1,0,?}, "64", addrmode6align64>;
+
+def  VST1q8   : VST1Q<{0,0,?,?}, "8",  addrmode6align64or128>;
+def  VST1q16  : VST1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def  VST1q32  : VST1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def  VST1q64  : VST1Q<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with address register writeback:
+multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
+                     (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd),
+                        IIC_VLD1u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+                    (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd),
+                        IIC_VLD1x2u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+defm VST1d8wb  : VST1DWB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VST1d16wb : VST1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32wb : VST1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64wb : VST1DWB<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VST1q8wb  : VST1QWB<{0,0,?,?}, "8",  addrmode6align64or128>;
+defm VST1q16wb : VST1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VST1q32wb : VST1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with 3 registers
+class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0, 0b00, 0b0110, op7_4, (outs),
+          (ins AddrMode:$Rn, VecListThreeD:$Vd),
+          IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+                    (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
+                        IIC_VLD1x3u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VST1d8T     : VST1D3<{0,0,0,?}, "8",  addrmode6align64>;
+def VST1d16T    : VST1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VST1d32T    : VST1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VST1d64T    : VST1D3<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VST1d8Twb  : VST1D3WB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+
+def VST1d64TPseudo            : VSTQQPseudo<IIC_VST1x3>;
+def VST1d64TPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x3u>;
+def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
+
+// ...with 4 registers
+class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0, 0b00, 0b0010, op7_4, (outs),
+          (ins AddrMode:$Rn, VecListFourD:$Vd),
+          IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
+          []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+                    (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        IIC_VLD1x4u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VST1d8Q     : VST1D4<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+def VST1d16Q    : VST1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VST1d32Q    : VST1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VST1d64Q    : VST1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+defm VST1d8Qwb  : VST1D4WB<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+def VST1d64QPseudo            : VSTQQPseudo<IIC_VST1x4>;
+def VST1d64QPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x4u>;
+def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
+
+//   VST2     : Vector Store (multiple 2-element structures)
+class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+            InstrItinClass itin, Operand AddrMode>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, VdTy:$Vd),
+          itin, "vst2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST2Instruction";
+}
+
+def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+def  VST2d16  : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+def  VST2d32  : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+
+def  VST2q8   : VST2<0b0011, {0,0,?,?}, "8",  VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+def  VST2q16  : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+def  VST2q32  : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+
+def  VST2q8Pseudo  : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
+
+// ...with address register writeback:
+multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
+                   RegisterOperand VdTy, Operand AddrMode> {
+  def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+                     (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u,
+                     "vst2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+  def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
+                        "vst2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+}
+multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+                     (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u,
+                     "vst2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+  def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        IIC_VLD1u,
+                        "vst2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+}
+
+defm VST2d8wb    : VST2DWB<0b1000, {0,0,?,?}, "8",  VecListDPair,
+                           addrmode6align64or128>;
+defm VST2d16wb   : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair,
+                           addrmode6align64or128>;
+defm VST2d32wb   : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair,
+                           addrmode6align64or128>;
+
+defm VST2q8wb    : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>;
+defm VST2q16wb   : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST2q32wb   : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+
+def VST2q8PseudoWB_fixed     : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q8PseudoWB_register  : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+
+// ...with double-spaced registers
+def VST2b8      : VST2<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+def VST2b16     : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+def VST2b32     : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+defm VST2b8wb   : VST2DWB<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced,
+                          addrmode6align64or128>;
+defm VST2b16wb  : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced,
+                          addrmode6align64or128>;
+defm VST2b32wb  : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced,
+                          addrmode6align64or128>;
+
+//   VST3     : Vector Store (multiple 3-element structures)
+class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
+def  VST3d16  : VST3D<0b0100, {0,1,0,?}, "16">;
+def  VST3d32  : VST3D<0b0100, {1,0,0,?}, "32">;
+
+def  VST3d8Pseudo  : VSTQQPseudo<IIC_VST3>;
+def  VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
+def  VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
+
+// ...with address register writeback:
+class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
+def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
+def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VST3d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+
+// ...with double-spaced registers:
+def VST3q8      : VST3D<0b0101, {0,0,0,?}, "8">;
+def VST3q16     : VST3D<0b0101, {0,1,0,?}, "16">;
+def VST3q32     : VST3D<0b0101, {1,0,0,?}, "32">;
+def VST3q8_UPD  : VST3DWB<0b0101, {0,0,0,?}, "8">;
+def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
+def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VST3q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST3q8oddPseudo   : VSTQQQQPseudo<IIC_VST3>;
+def VST3q16oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+def VST3q32oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+
+def VST3q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+//   VST4     : Vector Store (multiple 4-element structures)
+class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
+def  VST4d16  : VST4D<0b0000, {0,1,?,?}, "16">;
+def  VST4d32  : VST4D<0b0000, {1,0,?,?}, "32">;
+
+def  VST4d8Pseudo  : VSTQQPseudo<IIC_VST4>;
+def  VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
+def  VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
+
+// ...with address register writeback:
+class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u,
+           "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
+def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
+def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VST4d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+
+// ...with double-spaced registers:
+def VST4q8      : VST4D<0b0001, {0,0,?,?}, "8">;
+def VST4q16     : VST4D<0b0001, {0,1,?,?}, "16">;
+def VST4q32     : VST4D<0b0001, {1,0,?,?}, "32">;
+def VST4q8_UPD  : VST4DWB<0b0001, {0,0,?,?}, "8">;
+def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
+def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VST4q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST4q8oddPseudo   : VSTQQQQPseudo<IIC_VST4>;
+def VST4q16oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+def VST4q32oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+
+def VST4q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// Classes for VST*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+
+//   VST1LN   : Vector Store (single element from one lane)
+class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane),
+          IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
+          [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST1LN";
+}
+class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNPseudo<IIC_VST1ln> {
+  let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                          addrmode6:$addr)];
+}
+
+def VST1LNd8  : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
+                       NEONvgetlaneu, addrmode6> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
+                       NEONvgetlaneu, addrmode6> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+
+def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
+                       addrmode6oneL32> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo  : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
+
+def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+// ...with address register writeback:
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+               PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins AdrMode:$Rn, am6offset:$Rm,
+           DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb",
+          [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
+                                  AdrMode:$Rn, am6offset:$Rm))]> {
+  let DecoderMethod = "DecodeVST1LN";
+}
+class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNWBPseudo<IIC_VST1lnu> {
+  let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                                        addrmode6:$addr, am6offset:$offset))];
+}
+
+def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
+                             NEONvgetlaneu, addrmode6> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
+                             NEONvgetlaneu, addrmode6> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
+                             extractelt, addrmode6oneL32> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo_UPD  : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+
+//   VST2LN   : Vector Store (single 2-element structure from one lane)
+class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
+          IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
+}
+
+def VST2LNd8  : VST2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo  : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+
+// ...with double-spaced registers:
+def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{4}   = Rn{4};
+}
+
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+
+// ...with address register writeback:
+class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt,
+          "\\{$Vd[$lane], $src2[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
+}
+
+def VST2LNd8_UPD  : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+
+def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+
+//   VST3LN   : Vector Store (single 3-element structure from one lane)
+class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
+           nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST3LN";
+}
+
+def VST3LNd8  : VST3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNd8Pseudo  : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+
+// ...with double-spaced registers:
+def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+
+// ...with address register writeback:
+class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VST3lnu, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVST3LN";
+}
+
+def VST3LNd8_UPD  : VST3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+
+def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+
+//   VST4LN   : Vector Store (single 4-element structure from one lane)
+class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
+           nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8  : VST4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo  : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+
+// ...with double-spaced registers:
+def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+
+// ...with address register writeback:
+class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VST4lnu, "vst4", Dt,
+  "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8_UPD  : VST4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+
+def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// Use vld1/vst1 for unaligned f64 load / store
+def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
+          (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+          (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+          (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
+//===----------------------------------------------------------------------===//
+// NEON pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Extract D sub-registers of Q registers.
+def DSubReg_i8_reg  : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane  : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes
+//===----------------------------------------------------------------------===//
+
+// Basic 2-register operations: double- and quad-register.
+class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>;
+class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>;
+
+// Basic 2-register intrinsics, both double- and quad-register.
+class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as above, but not predicated.
+class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<0b10, op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<0b10, op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Similar to NV2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
+// Narrow 2-register operations.
+class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyD, ValueType TyQ, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>;
+
+// Narrow 2-register intrinsics.
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
+
+// Long 2-register operations (currently only used for VMOVL).
+class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>;
+
+// Long 2-register intrinsics.
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
+
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm),
+        (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+        OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
+                  InstrItinClass itin, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm),
+        (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
+
+// Basic 3-register operations: double- and quad-register.
+class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+// Same as N3VD but no data type.
+class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 0, op4,
+         (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VDSL<bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType Ty, SDNode ShOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
+               string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+
+class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 1, op4,
+         (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+
+// Basic 3-register intrinsics, both double- and quad-register.
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+                                           imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> {
+  let TwoOperandAliasConstraint = "$Vm = $Vd";
+  let isCommutable = 0;
+}
+
+class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Same as N3VQIntnp but with Vd as a src register.
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm),
+          f, itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
+                                       (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> {
+  let TwoOperandAliasConstraint = "$Vm = $Vd";
+  let isCommutable = 0;
+}
+
+// Multiply-Add/Sub operations: double- and quad-register.
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>;
+
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+                                                     imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType Ty, SDNode MulOp, SDNode ShOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_8:$Vm),
+                                                     imm:$lane)))))))]>;
+
+class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
+                SDPatternOperator MulOp, SDPatternOperator OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                  SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                        imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType ResTy, ValueType OpTy,
+                    SDNode MulOp, SDNode ShOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                        imm:$lane)))))))]>;
+
+// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
+class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>;
+
+// Neon 3-argument intrinsics, both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1),
+                                      (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1),
+                                      (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Long Multiply-Add/Sub operations.
+class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (MulOp (TyD DPR:$Vn),
+                                            (TyD DPR:$Vm)))))]>;
+class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+                  InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (OpNode (TyQ QPR:$src1),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
+                                                 imm:$lane))))))]>;
+class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (OpNode (TyQ QPR:$src1),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_8:$Vm),
+                                                 imm:$lane))))))]>;
+
+// Long Intrinsic-Op vector operations with explicit extend (VABAL).
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+                   SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                        (TyD DPR:$Vm)))))))]>;
+
+// Neon Long 3-argument intrinsic.  The destination register is
+// a quad-register and is also used as the first source operand register.
+class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]>;
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]>;
+
+// Narrowing 3-register intrinsics.
+class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
+              SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register operations.
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
+class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
+
+// Long 3-register operations with explicitly extended operands.
+class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+              bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics with explicit extend (VABDL).
+class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                 InstrItinClass itin, string OpcodeStr, string Dt,
+                 ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+                 bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                (TyD DPR:$Vm))))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics.
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+// Same as above, but not predicated.
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]>;
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                  InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]>;
+
+// Wide 3-register operations.
+class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
+           SDNode OpNode, SDNode ExtOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+// Pairwise long 2-register intrinsics, both double- and quad-register.
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Pairwise long 2-register accumulate intrinsics,
+// both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>;
+class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>;
+
+// Shift by immediate,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             Format f, InstrItinClass itin, Operand ImmTy,
+             string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             Format f, InstrItinClass itin, Operand ImmTy,
+             string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>;
+}
+
+// Long shift by immediate.
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, Operand ImmTy,
+             SDPatternOperator OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm,
+           IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), ImmTy:$SIMM)))]>;
+
+// Narrow shift by immediate.
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, Operand ImmTy,
+             SDPatternOperator OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
+                                          (i32 ImmTy:$SIMM))))]>;
+
+// Shift right by immediate and accumulate,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (add DPR:$src1,
+                                (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>;
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (add QPR:$src1,
+                                (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>;
+}
+
+// Shift by immediate and insert,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, Format f, string OpcodeStr, string Dt,
+                ValueType Ty,SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>;
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, Format f, string OpcodeStr, string Dt,
+                ValueType Ty,SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>;
+}
+
+// Convert, with fractional bits immediate,
+// both double- and quad-register.
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              SDPatternOperator IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              SDPatternOperator IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Abbreviations used in multiclass suffixes:
+//   Q = quarter int (8 bit) elements
+//   H = half int (16 bit) elements
+//   S = single int (32 bit) elements
+//   D = double int (64 bit) elements
+
+// Neon 2-register vector operations and intrinsics.
+
+// Neon 2-register comparisons.
+//   source operand element sizes of 8, 16 and 32 bits:
+multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op4, string opc, string Dt,
+                       string asm, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+  def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+  def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+  def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+
+  // 128-bit vector types.
+  def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+  def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+  def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+  def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+}
+
+
+// Neon 2-register vector intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                      bits<5> op11_7, bit op4,
+                      InstrItinClass itinD, InstrItinClass itinQ,
+                      string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>;
+  def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>;
+  def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>;
+  def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>;
+}
+
+
+// Neon Narrowing 2-register vector operations,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                    bits<5> op11_7, bit op6, bit op4,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    SDNode OpNode> {
+  def v8i8  : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "16"),
+                   v8i8, v8i16, OpNode>;
+  def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "32"),
+                   v4i16, v4i32, OpNode>;
+  def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "64"),
+                   v2i32, v2i64, OpNode>;
+}
+
+// Neon Narrowing 2-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op6, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp> {
+  def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp>;
+  def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp>;
+  def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                    string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
+  def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+
+// Neon 3-register vector operations.
+
+// First with only element sizes of 8, 16 and 32 bits:
+multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                   InstrItinClass itinD16, InstrItinClass itinD32,
+                   InstrItinClass itinQ16, InstrItinClass itinQ32,
+                   string OpcodeStr, string Dt,
+                   SDNode OpNode, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v8i8  : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i8, v8i8, OpNode, Commutable>;
+  def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i16, v4i16, OpNode, Commutable>;
+  def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i32, v2i32, OpNode, Commutable>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v16i8, v16i8, OpNode, Commutable>;
+  def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v8i16, v8i16, OpNode, Commutable>;
+  def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v4i32, v4i32, OpNode, Commutable>;
+}
+
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
+  def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, "i16", v4i16, ShOp>;
+  def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, "i32", v2i32, ShOp>;
+  def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, "i16", v8i16, v4i16, ShOp>;
+  def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, "i32",
+                     v4i32, v2i32, ShOp>;
+}
+
+// ....then also with element size 64 bits:
+multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itinD, InstrItinClass itinQ,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0>
+  : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ,
+            OpcodeStr, Dt, OpNode, Commutable> {
+  def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v1i64, v1i64, OpNode, Commutable>;
+  def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v2i64, v2i64, OpNode, Commutable>;
+}
+
+
+// Neon 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     SDPatternOperator IntOp, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp, Commutable>;
+  def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp, Commutable>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp, Commutable>;
+  def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp, Commutable>;
+}
+multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp>;
+}
+
+multiclass N3VIntSL_HS<bits<4> op11_8,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
+  def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>;
+  def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+                          OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>;
+  def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp, Commutable> {
+  def v8i8  : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp, Commutable>;
+  def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp>
+  : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp> {
+  def v8i8  : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp>;
+  def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp>;
+}
+
+
+// ....then also with element size of 64 bits:
+multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp, Commutable> {
+  def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp, Commutable>;
+  def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp>
+  : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp> {
+  def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp>;
+  def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp>;
+}
+
+// Neon Narrowing 3-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0> {
+  def v8i8  : N3VNInt<op24, op23, 0b00, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp, Commutable>;
+  def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp, Commutable>;
+  def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Long 3-register vector operations.
+
+multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itin16, InstrItinClass itin32,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0> {
+  def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, Commutable>;
+  def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, Commutable>;
+  def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, Commutable>;
+}
+
+multiclass N3VLSL_HS<bit op24, bits<4> op11_8,
+                     InstrItinClass itin, string OpcodeStr, string Dt,
+                     SDNode OpNode> {
+  def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr,
+                       !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr,
+                     !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt,
+                       SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+// Neon Long 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                      InstrItinClass itin16, InstrItinClass itin32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp, bit Commutable = 0> {
+  def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, IntOp, Commutable>;
+  def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, IntOp, Commutable>;
+}
+
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
+                        InstrItinClass itin, string OpcodeStr, string Dt,
+                        SDPatternOperator IntOp> {
+  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt,
+               IntOp, Commutable> {
+  def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, IntOp, Commutable>;
+}
+
+// ....with explicit extend (VABDL).
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "8"),
+                         v8i16, v8i8, IntOp, ExtOp, Commutable>;
+  def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "16"),
+                         v4i32, v4i16, IntOp, ExtOp, Commutable>;
+  def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "32"),
+                         v2i64, v2i32, IntOp, ExtOp, Commutable>;
+}
+
+
+// Neon Wide 3-register vector intrinsics,
+//   source operand element sizes of 8, 16 and 32 bits:
+multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+
+// Neon Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD16, InstrItinClass itinD32,
+                        InstrItinClass itinQ16, InstrItinClass itinQ32,
+                        string OpcodeStr, string Dt, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>;
+  def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>;
+  def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>;
+  def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>;
+  def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>;
+}
+
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
+                         InstrItinClass itinD16, InstrItinClass itinD32,
+                         InstrItinClass itinQ16, InstrItinClass itinQ32,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+                            OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>;
+  def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+                          OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>;
+  def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+                            OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16,
+                            mul, ShOp>;
+  def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+                          OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32,
+                          mul, ShOp>;
+}
+
+// Neon Intrinsic-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD, InstrItinClass itinQ,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp,
+                        SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
+  def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
+  def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
+  def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
+  def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
+}
+
+// Neon 3-argument intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itinD, InstrItinClass itinQ,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ,
+                       OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
+  def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ,
+                       OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ,
+                       OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
+}
+
+
+// Neon Long Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         InstrItinClass itin16, InstrItinClass itin32,
+                         string OpcodeStr, string Dt, SDNode MulOp,
+                         SDNode OpNode> {
+  def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr,
+                        !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
+  def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr,
+                        !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
+  def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr,
+                        !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
+                          string Dt, SDNode MulOp, SDNode OpNode> {
+  def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr,
+                            !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
+  def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr,
+                          !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+
+// Neon Long 3-argument intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+                           OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itin16, InstrItinClass itin32,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp>
+  : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> {
+  def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
+}
+
+// ....with explicit extend (VABAL).
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                            InstrItinClass itin, string OpcodeStr, string Dt,
+                            SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> {
+  def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
+                           IntOp, ExtOp, OpNode>;
+  def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
+                           IntOp, ExtOp, OpNode>;
+  def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
+                           IntOp, ExtOp, OpNode>;
+}
+
+
+// Neon Pairwise long 2-register intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                        bits<5> op11_7, bit op4,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register accumulate intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                         bits<5> op11_7, bit op4,
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon 2-register vector shift by immediate,
+//   with f of either N2RegVShLFrm or N2RegVShRFrm
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       string baseOpc, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift-Accumulate vector operations,
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+                        OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+                        OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift-Insert vector operations,
+//   with f of either N2RegVShLFrm or N2RegVShRFrm
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                          string OpcodeStr> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>;
+                             // imm6 = xxxxxx
+}
+multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                          string OpcodeStr> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+                        N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+                        N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+                        N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+                        N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+                        N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+                        N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+                        N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+                        N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations,
+//   element sizes of 8, 16, 32 bits:
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, string OpcodeStr, string Dt,
+                      SDPatternOperator OpNode> {
+  def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+              OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+               OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, imm1_15, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+               OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, imm1_31, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+// Neon Shift Narrow operations,
+//   element sizes of 16, 32, 64 bits:
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
+                      SDPatternOperator OpNode> {
+  def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                    OpcodeStr, !strconcat(Dt, "16"),
+                    v8i8, v8i16, shr_imm8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "32"),
+                     v4i16, v4i32, shr_imm16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "64"),
+                     v2i32, v2i64, shr_imm32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+//===----------------------------------------------------------------------===//
+
+// Vector Add Operations.
+
+//   VADD     : Vector Add (integer and floating-point)
+defm VADD     : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i",
+                         add, 1>;
+def  VADDfd   : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
+                     v2f32, v2f32, fadd, 1>;
+def  VADDfq   : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
+                     v4f32, v4f32, fadd, 1>;
+//   VADDL    : Vector Add Long (Q = D + D)
+defm VADDLs   : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vaddl", "s", add, sext, 1>;
+defm VADDLu   : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vaddl", "u", add, zext, 1>;
+//   VADDW    : Vector Add Wide (Q = Q + D)
+defm VADDWs   : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
+defm VADDWu   : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
+//   VHADD    : Vector Halving Add
+defm VHADDs   : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhadd", "s", int_arm_neon_vhadds, 1>;
+defm VHADDu   : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhadd", "u", int_arm_neon_vhaddu, 1>;
+//   VRHADD   : Vector Rounding Halving Add
+defm VRHADDs  : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vrhadd", "s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu  : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vrhadd", "u", int_arm_neon_vrhaddu, 1>;
+//   VQADD    : Vector Saturating Add
+defm VQADDs   : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm,
+                            IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqadd", "s", int_arm_neon_vqadds, 1>;
+defm VQADDu   : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
+                            IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqadd", "u", int_arm_neon_vqaddu, 1>;
+//   VADDHN   : Vector Add and Narrow Returning High Half (D = Q + Q)
+defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
+//   VRADDHN  : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
+defm VRADDHN  : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
+                            int_arm_neon_vraddhn, 1>;
+
+def : Pat<(v8i8  (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Multiply Operations.
+
+//   VMUL     : Vector Multiply (integer, polynomial and floating-point)
+defm VMUL     : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D,
+                        IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>;
+def  VMULpd   : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul",
+                        "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>;
+def  VMULpq   : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul",
+                        "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>;
+def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
+                     v2f32, v2f32, fmul, 1>;
+def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
+                     v4f32, v4f32, fmul, 1>;
+defm VMULsl   : N3VSL_HS<0b1000, "vmul", mul>;
+def  VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
+def  VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32,
+                       v2f32, fmul>;
+
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+                      (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+          (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+                      (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+          (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+                       (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+          (v4f32 (VMULslfq (v4f32 QPR:$src1),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src2,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfd DPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfq QPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+
+
+//   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
+defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
+                          IIC_VMULi16Q, IIC_VMULi32Q,
+                          "vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+                            IIC_VMULi16Q, IIC_VMULi32Q,
+                            "vqdmulh", "s",  int_arm_neon_vqdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+                                       (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                            imm:$lane)))),
+          (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+                                 (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                         (DSubReg_i16_reg imm:$lane))),
+                                 (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+                                       (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                            imm:$lane)))),
+          (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+                                 (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                         (DSubReg_i32_reg imm:$lane))),
+                                 (SubReg_i32_lane imm:$lane)))>;
+
+//   VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
+defm VQRDMULH   : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
+                            IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
+                            "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+                              IIC_VMULi16Q, IIC_VMULi32Q,
+                              "vqrdmulh", "s",  int_arm_neon_vqrdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+                                        (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                             imm:$lane)))),
+          (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+                                  (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                          (DSubReg_i16_reg imm:$lane))),
+                                  (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+                                        (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                             imm:$lane)))),
+          (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+                                  (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                          (DSubReg_i32_reg imm:$lane))),
+                                  (SubReg_i32_lane imm:$lane)))>;
+
+//   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "NEONData" in {
+  defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "s", NEONvmulls, 1>;
+  defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "u", NEONvmullu, 1>;
+  def  VMULLp8   :  N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+                            v8i16, v8i8, int_arm_neon_vmullp, 1>;
+  def  VMULLp64  : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
+                          "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
+                    Requires<[HasV8, HasCrypto]>;
+}
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+
+//   VQDMULL  : Vector Saturating Doubling Multiply Long (Q = D * D)
+defm VQDMULL  : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vqdmull", "s", int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D,
+                             "vqdmull", "s", int_arm_neon_vqdmull>;
+
+// Vector Multiply-Accumulate and Multiply-Subtract Operations.
+
+//   VMLA     : Vector Multiply Accumulate (integer and floating-point)
+defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
+                            v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
+                            v4f32, v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+                  (mul (v8i16 QPR:$src2),
+                       (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+                  (mul (v4i32 QPR:$src2),
+                       (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
+                        (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+                           (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
+
+//   VMLAL    : Vector Multiply Accumulate Long (Q += D * D)
+defm VMLALs   : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlal", "s", NEONvmulls, add>;
+defm VMLALu   : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlal", "u", NEONvmullu, add>;
+
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+
+//   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
+defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                            "vqdmlal", "s", null_frag>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
+
+//   VMLS     : Vector Multiply Subtract (integer and floating-point)
+defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
+                            v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
+                            v4f32, v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
+                  (mul (v8i16 QPR:$src2),
+                       (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
+                  (mul (v4i32 QPR:$src2),
+                     (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
+                        (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
+
+//   VMLSL    : Vector Multiply Subtract Long (Q -= D * D)
+defm VMLSLs   : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLu   : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlsl", "u", NEONvmullu, sub>;
+
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+
+//   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
+defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
+                            "vqdmlsl", "s", null_frag>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
+
+// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
+def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+
+def  VFMAfq   : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+
+//   Fused Vector Multiply Subtract (floating-point)
+def  VFMSfd   : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
+          (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+          Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
+          (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+          Requires<[HasVFP4]>;
+def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)),
+          (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
+          (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+      Requires<[HasVFP4]>;
+
+// Vector Subtract Operations.
+
+//   VSUB     : Vector Subtract (integer and floating-point)
+defm VSUB     : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ,
+                         "vsub", "i", sub, 0>;
+def  VSUBfd   : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
+                     v2f32, v2f32, fsub, 0>;
+def  VSUBfq   : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
+                     v4f32, v4f32, fsub, 0>;
+//   VSUBL    : Vector Subtract Long (Q = D - D)
+defm VSUBLs   : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vsubl", "s", sub, sext, 0>;
+defm VSUBLu   : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vsubl", "u", sub, zext, 0>;
+//   VSUBW    : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs   : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
+defm VSUBWu   : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
+//   VHSUB    : Vector Halving Subtract
+defm VHSUBs   : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vhsub", "s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu   : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vhsub", "u", int_arm_neon_vhsubu, 0>;
+//   VQSUB    : Vector Saturing Subtract
+defm VQSUBs   : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                            "vqsub", "s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu   : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                            "vqsub", "u", int_arm_neon_vqsubu, 0>;
+//   VSUBHN   : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
+//   VRSUBHN  : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN  : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
+                            int_arm_neon_vrsubhn, 0>;
+
+def : Pat<(v8i8  (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Comparisons.
+
+//   VCEQ     : Vector Compare Equal
+defm VCEQ     : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
+def  VCEQfd   : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+                     NEONvceq, 1>;
+def  VCEQfq   : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+                     NEONvceq, 1>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in
+defm VCEQz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
+                            "$Vd, $Vm, #0", NEONvceqz>;
+
+//   VCGE     : Vector Compare Greater Than or Equal
+defm VCGEs    : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
+defm VCGEu    : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
+def  VCGEfd   : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+                     NEONvcge, 0>;
+def  VCGEfq   : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+                     NEONvcge, 0>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
+                            "$Vd, $Vm, #0", NEONvcgez>;
+defm VCLEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
+                            "$Vd, $Vm, #0", NEONvclez>;
+}
+
+//   VCGT     : Vector Compare Greater Than
+defm VCGTs    : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
+defm VCGTu    : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
+def  VCGTfd   : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+                     NEONvcgt, 0>;
+def  VCGTfq   : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+                     NEONvcgt, 0>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
+                            "$Vd, $Vm, #0", NEONvcgtz>;
+defm VCLTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
+                            "$Vd, $Vm, #0", NEONvcltz>;
+}
+
+//   VACGE    : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
+def  VACGEd   : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+                        "f32", v2i32, v2f32, int_arm_neon_vacge, 0>;
+def  VACGEq   : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+                        "f32", v4i32, v4f32, int_arm_neon_vacge, 0>;
+//   VACGT    : Vector Absolute Compare Greater Than (aka VCAGT)
+def  VACGTd   : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+                        "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>;
+def  VACGTq   : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+                        "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>;
+//   VTST     : Vector Test Bits
+defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                        IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+
+// Vector Bitwise Operations.
+
+def vnotd : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+def vnotq : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
+
+
+//   VAND     : Vector Bitwise AND
+def  VANDd    : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand",
+                      v2i32, v2i32, and, 1>;
+def  VANDq    : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand",
+                      v4i32, v4i32, and, 1>;
+
+//   VEOR     : Vector Bitwise Exclusive OR
+def  VEORd    : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor",
+                      v2i32, v2i32, xor, 1>;
+def  VEORq    : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor",
+                      v4i32, v4i32, xor, 1>;
+
+//   VORR     : Vector Bitwise OR
+def  VORRd    : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
+                      v2i32, v2i32, or, 1>;
+def  VORRq    : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
+                      v4i32, v4i32, or, 1>;
+
+def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+
+//   VBIC     : Vector Bitwise Bit Clear (AND NOT)
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (and DPR:$Vn,
+                                                 (vnotd DPR:$Vm))))]>;
+def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (and QPR:$Vn,
+                                                 (vnotq QPR:$Vm))))]>;
+}
+
+def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+//   VORN     : Vector Bitwise OR NOT
+def  VORNd    : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (or DPR:$Vn,
+                                                (vnotd DPR:$Vm))))]>;
+def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (or QPR:$Vn,
+                                                (vnotq QPR:$Vm))))]>;
+
+//   VMVN     : Vector Bitwise NOT (Immediate)
+
+let isReMaterializable = 1 in {
+
+def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+}
+
+//   VMVN     : Vector Bitwise NOT
+def  VMVNd    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
+                     (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>;
+def  VMVNq    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
+                     (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
+def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
+def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
+
+//   VBSL     : Vector Bitwise Select
+def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VCNTiD,
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set DPR:$Vd,
+                           (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+                                   (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+                                    (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+                                    (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
+                                    (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
+                                    (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VCNTiQ,
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set QPR:$Vd,
+                           (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+                                   (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+                                    (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+                                    (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
+                                    (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
+                                    (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+//   VBIF     : Vector Bitwise Insert if False
+//              like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def  VBIFd    : N3VX<1, 0, 0b11, 0b0001, 0, 1,
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VBINiD,
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+def  VBIFq    : N3VX<1, 0, 0b11, 0b0001, 1, 1,
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VBINiQ,
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+
+//   VBIT     : Vector Bitwise Insert if True
+//              like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def  VBITd    : N3VX<1, 0, 0b10, 0b0001, 0, 1,
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VBINiD,
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+def  VBITq    : N3VX<1, 0, 0b10, 0b0001, 1, 1,
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VBINiQ,
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+
+// VBIT/VBIF are not yet implemented.  The TwoAddress pass will not go looking
+// for equivalent operations with different register constraints; it just
+// inserts copies.
+
+// Vector Absolute Differences.
+
+//   VABD     : Vector Absolute Difference
+defm VABDs    : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vabd", "s", int_arm_neon_vabds, 1>;
+defm VABDu    : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vabd", "u", int_arm_neon_vabdu, 1>;
+def  VABDfd   : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
+                        "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
+def  VABDfq   : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+                        "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
+
+//   VABDL    : Vector Absolute Difference Long (Q = | D - D |)
+defm VABDLs   : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+defm VABDLu   : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+
+//   VABA     : Vector Absolute Difference and Accumulate
+defm VABAs    : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "s", int_arm_neon_vabds, add>;
+defm VABAu    : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "u", int_arm_neon_vabdu, add>;
+
+//   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
+defm VABALs   : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "s", int_arm_neon_vabds, zext, add>;
+defm VABALu   : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "u", int_arm_neon_vabdu, zext, add>;
+
+// Vector Maximum and Minimum.
+
+//   VMAX     : Vector Maximum
+defm VMAXs    : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmax", "s", int_arm_neon_vmaxs, 1>;
+defm VMAXu    : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmax", "u", int_arm_neon_vmaxu, 1>;
+def  VMAXfd   : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmax", "f32",
+                        v2f32, v2f32, int_arm_neon_vmaxs, 1>;
+def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmax", "f32",
+                        v4f32, v4f32, int_arm_neon_vmaxs, 1>;
+
+// VMAXNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMAXNMND  : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMAXNMNQ  : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
+//   VMIN     : Vector Minimum
+defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmin", "s", int_arm_neon_vmins, 1>;
+defm VMINu    : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmin", "u", int_arm_neon_vminu, 1>;
+def  VMINfd   : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmin", "f32",
+                        v2f32, v2f32, int_arm_neon_vmins, 1>;
+def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmin", "f32",
+                        v4f32, v4f32, int_arm_neon_vmins, 1>;
+
+// VMINNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMINNMND  : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMINNMNQ  : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
+// Vector Pairwise Operations.
+
+//   VPADD    : Vector Pairwise Add
+def  VPADDi8  : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i8",
+                        v8i8, v8i8, int_arm_neon_vpadd, 0>;
+def  VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i16",
+                        v4i16, v4i16, int_arm_neon_vpadd, 0>;
+def  VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i32",
+                        v2i32, v2i32, int_arm_neon_vpadd, 0>;
+def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm,
+                        IIC_VPBIND, "vpadd", "f32",
+                        v2f32, v2f32, int_arm_neon_vpadd, 0>;
+
+//   VPADDL   : Vector Pairwise Add Long
+defm VPADDLs  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s",
+                             int_arm_neon_vpaddls>;
+defm VPADDLu  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u",
+                             int_arm_neon_vpaddlu>;
+
+//   VPADAL   : Vector Pairwise Add and Accumulate Long
+defm VPADALs  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s",
+                              int_arm_neon_vpadals>;
+defm VPADALu  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u",
+                              int_arm_neon_vpadalu>;
+
+//   VPMAX    : Vector Pairwise Maximum
+def  VPMAXs8  : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>;
+def  VPMAXu8  : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
+def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+                        "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
+
+//   VPMIN    : Vector Pairwise Minimum
+def  VPMINs8  : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>;
+def  VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>;
+def  VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>;
+def  VPMINu8  : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>;
+def  VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>;
+def  VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
+def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+                        "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;
+
+// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
+
+//   VRECPE   : Vector Reciprocal Estimate
+def  VRECPEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+                        IIC_VUNAD, "vrecpe", "u32",
+                        v2i32, v2i32, int_arm_neon_vrecpe>;
+def  VRECPEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+                        IIC_VUNAQ, "vrecpe", "u32",
+                        v4i32, v4i32, int_arm_neon_vrecpe>;
+def  VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAD, "vrecpe", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecpe>;
+def  VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAQ, "vrecpe", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecpe>;
+
+//   VRECPS   : Vector Reciprocal Step
+def  VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrecps", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecps, 1>;
+def  VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrecps", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecps, 1>;
+
+//   VRSQRTE  : Vector Reciprocal Square Root Estimate
+def  VRSQRTEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAD, "vrsqrte", "u32",
+                         v2i32, v2i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAQ, "vrsqrte", "u32",
+                         v4i32, v4i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAD, "vrsqrte", "f32",
+                         v2f32, v2f32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAQ, "vrsqrte", "f32",
+                         v4f32, v4f32, int_arm_neon_vrsqrte>;
+
+//   VRSQRTS  : Vector Reciprocal Square Root Step
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrsqrts", "f32",
+                        v2f32, v2f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrsqrts", "f32",
+                        v4f32, v4f32, int_arm_neon_vrsqrts, 1>;
+
+// Vector Shifts.
+
+//   VSHL     : Vector Shift
+defm VSHLs    : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm,
+                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+                            "vshl", "s", int_arm_neon_vshifts>;
+defm VSHLu    : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm,
+                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+                            "vshl", "u", int_arm_neon_vshiftu>;
+
+//   VSHL     : Vector Shift Left (Immediate)
+defm VSHLi    : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+
+//   VSHR     : Vector Shift Right (Immediate)
+defm VSHRs    : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
+                            NEONvshrs>;
+defm VSHRu    : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
+                            NEONvshru>;
+
+//   VSHLL    : Vector Shift Left Long
+defm VSHLLs   : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s",
+  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>;
+defm VSHLLu   : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u",
+  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>;
+
+//   VSHLL    : Vector Shift Left Long (with maximum shift count)
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+                bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy,
+                ValueType OpTy, Operand ImmTy>
+  : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
+           ResTy, OpTy, ImmTy, null_frag> {
+  let Inst{21-16} = op21_16;
+  let DecoderMethod = "DecodeVSHLMaxInstruction";
+}
+def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
+                          v8i16, v8i8, imm8>;
+def  VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
+                          v4i32, v4i16, imm16>;
+def  VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
+                          v2i64, v2i32, imm32>;
+
+def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))),
+          (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))),
+          (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))),
+          (VSHLLi32 DPR:$Rn, 32)>;
+def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))),
+          (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))),
+          (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
+          (VSHLLi32 DPR:$Rn, 32)>;
+
+//   VSHRN    : Vector Shift Right and Narrow
+defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
+                           PatFrag<(ops node:$Rn, node:$amt),
+                                   (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+
+def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+          (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
+def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+          (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
+def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+          (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
+
+//   VRSHL    : Vector Rounding Shift
+defm VRSHLs   : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vrshl", "s", int_arm_neon_vrshifts>;
+defm VRSHLu   : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vrshl", "u", int_arm_neon_vrshiftu>;
+//   VRSHR    : Vector Rounding Shift Right
+defm VRSHRs   : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
+                            NEONvrshrs>;
+defm VRSHRu   : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
+                            NEONvrshru>;
+
+//   VRSHRN   : Vector Rounding Shift Right and Narrow
+defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
+                           NEONvrshrn>;
+
+//   VQSHL    : Vector Saturating Shift
+defm VQSHLs   : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqshl", "s", int_arm_neon_vqshifts>;
+defm VQSHLu   : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqshl", "u", int_arm_neon_vqshiftu>;
+//   VQSHL    : Vector Saturating Shift Left (Immediate)
+defm VQSHLsi  : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>;
+defm VQSHLui  : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>;
+
+//   VQSHLU   : Vector Saturating Shift Left (Immediate, Unsigned)
+defm VQSHLsu  : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>;
+
+//   VQSHRN   : Vector Saturating Shift Right and Narrow
+defm VQSHRNs  : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
+                           NEONvqshrns>;
+defm VQSHRNu  : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
+                           NEONvqshrnu>;
+
+//   VQSHRUN  : Vector Saturating Shift Right and Narrow (Unsigned)
+defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
+                           NEONvqshrnsu>;
+
+//   VQRSHL   : Vector Saturating Rounding Shift
+defm VQRSHLs  : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqrshl", "s", int_arm_neon_vqrshifts>;
+defm VQRSHLu  : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqrshl", "u", int_arm_neon_vqrshiftu>;
+
+//   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
+                           NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
+                           NEONvqrshrnu>;
+
+//   VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
+                           NEONvqrshrnsu>;
+
+//   VSRA     : Vector Shift Right and Accumulate
+defm VSRAs    : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
+defm VSRAu    : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+//   VRSRA    : Vector Rounding Shift Right and Accumulate
+defm VRSRAs   : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
+defm VRSRAu   : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+
+//   VSLI     : Vector Shift Left and Insert
+defm VSLI     : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">;
+
+//   VSRI     : Vector Shift Right and Insert
+defm VSRI     : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">;
+
+// Vector Absolute and Saturating Absolute.
+
+//   VABS     : Vector Absolute Value
+defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+                           IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
+                           int_arm_neon_vabs>;
+def  VABSfd   : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                     v2f32, v2f32, fabs>;
+def  VABSfq   : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                      v4f32, v4f32, fabs>;
+
+def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))),
+               (v2i32 (bitconvert (v8i8 (add DPR:$src,
+                                             (NEONvshrs DPR:$src, (i32 7))))))),
+          (VABSv8i8 DPR:$src)>;
+def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))),
+               (v2i32 (bitconvert (v4i16 (add DPR:$src,
+                                            (NEONvshrs DPR:$src, (i32 15))))))),
+          (VABSv4i16 DPR:$src)>;
+def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))),
+               (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))),
+          (VABSv2i32 DPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))),
+               (v4i32 (bitconvert (v16i8 (add QPR:$src,
+                                             (NEONvshrs QPR:$src, (i32 7))))))),
+          (VABSv16i8 QPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))),
+               (v4i32 (bitconvert (v8i16 (add QPR:$src,
+                                            (NEONvshrs QPR:$src, (i32 15))))))),
+          (VABSv8i16 QPR:$src)>;
+def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))),
+               (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))),
+          (VABSv4i32 QPR:$src)>;
+
+//   VQABS    : Vector Saturating Absolute Value
+defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
+                           int_arm_neon_vqabs>;
+
+// Vector Negate.
+
+def vnegd  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+def vnegq  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
+
+class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+        IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>;
+class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm),
+        IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>;
+
+//   VNEG     : Vector Negate (integer)
+def  VNEGs8d  : VNEGD<0b00, "vneg", "s8", v8i8>;
+def  VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>;
+def  VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>;
+def  VNEGs8q  : VNEGQ<0b00, "vneg", "s8", v16i8>;
+def  VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>;
+def  VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>;
+
+//   VNEG     : Vector Negate (floating-point)
+def  VNEGfd   : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+                    (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>;
+def  VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
+                    (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>;
+
+def : Pat<(v8i8  (vnegd  DPR:$src)), (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vnegd  DPR:$src)), (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vnegd  DPR:$src)), (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>;
+
+//   VQNEG    : Vector Saturating Negate
+defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s",
+                           int_arm_neon_vqneg>;
+
+// Vector Bit Counting Operations.
+
+//   VCLS     : Vector Count Leading Sign Bits
+defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+                           IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
+                           int_arm_neon_vcls>;
+//   VCLZ     : Vector Count Leading Zeros
+defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+                           IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
+                           ctlz>;
+//   VCNT     : Vector Count One Bits
+def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiD, "vcnt", "8",
+                        v8i8, v8i8, ctpop>;
+def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiQ, "vcnt", "8",
+                        v16i8, v16i8, ctpop>;
+
+// Vector Swap
+def  VSWPd    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
+                     (outs DPR:$Vd, DPR:$Vm), (ins DPR:$in1, DPR:$in2),
+                     NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+                     []>;
+def  VSWPq    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
+                     (outs QPR:$Vd, QPR:$Vm), (ins QPR:$in1, QPR:$in2),
+                     NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+                     []>;
+
+// Vector Move Operations.
+
+//   VMOV     : Vector Move (Register)
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+//   VMOV     : Vector Move (Immediate)
+
+let isReMaterializable = 1 in {
+def VMOVv8i8  : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>;
+def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
+} // isReMaterializable
+
+// Add support for bytes replication feature, so it could be GAS compatible.
+// E.g. instructions below:
+// "vmov.i32 d0, 0xffffffff"
+// "vmov.i32 d0, 0xabababab"
+// "vmov.i16 d0, 0xabab"
+// are incorrect, but we could deal with such cases.
+// For last two instructions, for example, it should emit:
+// "vmov.i8 d0, 0xab"
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+
+// Also add same support for VMVN instructions. So instruction:
+// "vmvn.i32 d0, 0xabababab"
+// actually means:
+// "vmov.i8 d0, 0x54"
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+
+// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
+// require zero cycles to execute so they should be used wherever possible for
+// setting a register to zero.
+
+// Even without these pseudo-insts we would probably end up with the correct
+// instruction, but we could not mark the general ones with "isAsCheapAsAMove"
+// since they are sometimes rather expensive (in general).
+
+let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+  def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+                               (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+  def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+                               (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+}
+
+//   VMOV     : Vector Get Lane (move scalar to ARM core register)
+
+def VGETLNs8  : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                          IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VGETLNu8  : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                          IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
+                          IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
+                          [(set GPR:$R, (extractelt (v2i32 DPR:$V),
+                                           imm:$lane))]>,
+                Requires<[HasNEON, HasFastVGETLNi32]> {
+  let Inst{21} = lane{0};
+}
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+          (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+          (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+          (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i32_reg imm:$lane))),
+                     (SubReg_i32_lane imm:$lane))>,
+      Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+//          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+
+
+//   VMOV     : Vector Set Lane (move ARM core register to scalar)
+
+let Constraints = "$src1 = $V" in {
+def VSETLNi8  : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex8:$lane),
+                          IIC_VMOVISL, "vmov", "8", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v8i8 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex16:$lane),
+                          IIC_VMOVISL, "vmov", "16", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v4i16 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex32:$lane),
+                          IIC_VMOVISL, "vmov", "32", "$V$lane, $R",
+                          [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{0};
+}
+}
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+          (v16i8 (INSERT_SUBREG QPR:$src1,
+                  (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+                                   (DSubReg_i8_reg imm:$lane))),
+                            GPR:$src2, (SubReg_i8_lane imm:$lane))),
+                  (DSubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+          (v8i16 (INSERT_SUBREG QPR:$src1,
+                  (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i16_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i16_lane imm:$lane))),
+                  (DSubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+          (v4i32 (INSERT_SUBREG QPR:$src1,
+                  (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i32_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i32_lane imm:$lane))),
+                  (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+          (VSETLNi8  (v8i8  (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+          (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+          (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                         (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                         (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                         (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+
+//   VDUP     : Vector Duplicate (from ARM core register to all elements)
+
+class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+
+def  VDUP8d   : VDUPD<0b11101100, 0b00, "8", v8i8>;
+def  VDUP16d  : VDUPD<0b11101000, 0b01, "16", v4i16>;
+def  VDUP32d  : VDUPD<0b11101000, 0b00, "32", v2i32>,
+                Requires<[HasNEON, HasFastVDUP32]>;
+def  VDUP8q   : VDUPQ<0b11101110, 0b00, "8", v16i8>;
+def  VDUP16q  : VDUPQ<0b11101010, 0b01, "16", v8i16>;
+def  VDUP32q  : VDUPQ<0b11101010, 0b00, "32", v4i32>;
+
+// NEONvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+      Requires<[HasNEON,HasFastVDUP32]>;
+def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+
+//   VDUP     : Vector Duplicate Lane (from scalar to all elements)
+
+class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
+              ValueType Ty, Operand IdxTy>
+  : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
+              [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
+
+class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Operand IdxTy>
+  : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
+              [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
+                                      VectorIndex32:$lane)))]>;
+
+// Inst{19-16} is partially specified depending on the element size.
+
+def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> {
+  bits<3> lane;
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> {
+  bits<2> lane;
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> {
+  bits<1> lane;
+  let Inst{19} = lane{0};
+}
+def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> {
+  bits<3> lane;
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> {
+  bits<2> lane;
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
+  bits<1> lane;
+  let Inst{19} = lane{0};
+}
+
+def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+          (VDUPLN32d DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+          (VDUPLN32q DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+          (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+                                  (DSubReg_i8_reg imm:$lane))),
+                           (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+          (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i16_reg imm:$lane))),
+                            (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+          (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i32_reg imm:$lane))),
+                            (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+          (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
+          (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+                             SPR:$src, ssub_0), (i32 0)))>;
+def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
+          (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+                             SPR:$src, ssub_0), (i32 0)))>;
+
+//   VMOVN    : Vector Narrowing Move
+defm VMOVN    : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
+                         "vmovn", "i", trunc>;
+//   VQMOVN   : Vector Saturating Narrowing Move
+defm VQMOVNs  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD,
+                            "vqmovn", "s", int_arm_neon_vqmovns>;
+defm VQMOVNu  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD,
+                            "vqmovn", "u", int_arm_neon_vqmovnu>;
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
+                            "vqmovun", "s", int_arm_neon_vqmovnsu>;
+//   VMOVL    : Vector Lengthening Move
+defm VMOVLs   : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>;
+defm VMOVLu   : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>;
+def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>;
+def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>;
+def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>;
+
+// Vector Conversions.
+
+//   VCVT     : Vector Convert Between Floating-Point and Integers
+def  VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v2i32, v2f32, fp_to_sint>;
+def  VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v2i32, v2f32, fp_to_uint>;
+def  VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v2f32, v2i32, sint_to_fp>;
+def  VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v2f32, v2i32, uint_to_fp>;
+
+def  VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v4i32, v4f32, fp_to_sint>;
+def  VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v4i32, v4f32, fp_to_uint>;
+def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v4f32, v4i32, sint_to_fp>;
+def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v4f32, v4i32, uint_to_fp>;
+
+// VCVT{A, N, P, M}
+multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
+                    SDPatternOperator IntU> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
+    def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+  }
+}
+
+defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>;
+defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>;
+defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>;
+defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>;
+
+//   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
+let DecoderMethod = "DecodeVCVTD" in {
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+}
+
+let DecoderMethod = "DecodeVCVTQ" in {
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+}
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0",
+                    (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0",
+                    (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0",
+                    (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0",
+                    (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0",
+                    (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0",
+                    (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0",
+                    (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0",
+                    (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+
+//   VCVT     : Vector Convert Between Half-Precision and Single-Precision.
+def  VCVTf2h  : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f16.f32",
+                        v4i16, v4f32, int_arm_neon_vcvtfp2hf>,
+                Requires<[HasNEON, HasFP16]>;
+def  VCVTh2f  : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f32.f16",
+                        v4f32, v4i16, int_arm_neon_vcvthf2fp>,
+                Requires<[HasNEON, HasFP16]>;
+
+// Vector Reverse.
+
+//   VREV64   : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
+
+def VREV64d8  : VREV64D<0b00, "vrev64", "8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
+def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+
+def VREV64q8  : VREV64Q<0b00, "vrev64", "8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
+def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+
+//   VREV32   : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
+
+def VREV32d8  : VREV32D<0b00, "vrev32", "8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
+
+def VREV32q8  : VREV32Q<0b00, "vrev32", "8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
+
+//   VREV16   : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
+
+def VREV16d8  : VREV16D<0b00, "vrev16", "8", v8i8>;
+def VREV16q8  : VREV16Q<0b00, "vrev16", "8", v16i8>;
+
+// Other Vector Shuffles.
+
+//  Aligned extractions: really just dropping registers
+
+class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT>
+      : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))),
+             (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>;
+
+def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>;
+
+def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>;
+
+def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>;
+
+def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>;
+
+def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
+
+
+//   VEXT     : Vector Extract
+
+
+// All of these have a two-operand InstAlias.
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+  : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm,
+        IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
+                                     (Ty DPR:$Vm), imm:$index)))]> {
+  bits<3> index;
+  let Inst{11} = 0b0;
+  let Inst{10-8} = index{2-0};
+}
+
+class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+  : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd),
+        (ins QPR:$Vn, QPR:$Vm, imm0_15:$index), NVExtFrm,
+        IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn),
+                                     (Ty QPR:$Vm), imm:$index)))]> {
+  bits<4> index;
+  let Inst{11-8} = index{3-0};
+}
+}
+
+def VEXTd8  : VEXTd<"vext", "8",  v8i8, imm0_7> {
+  let Inst{10-8} = index{2-0};
+}
+def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
+  let Inst{10-9} = index{1-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
+  let Inst{10}     = index{0};
+  let Inst{9-8}    = 0b00;
+}
+def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
+                           (v2f32 DPR:$Vm),
+                           (i32 imm:$index))),
+          (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>;
+
+def VEXTq8  : VEXTq<"vext", "8",  v16i8, imm0_15> {
+  let Inst{11-8} = index{3-0};
+}
+def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
+  let Inst{11-9} = index{2-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
+  let Inst{11-10} = index{1-0};
+  let Inst{9-8}    = 0b00;
+}
+def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> {
+  let Inst{11} = index{0};
+  let Inst{10-8}    = 0b000;
+}
+def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn),
+                           (v4f32 QPR:$Vm),
+                           (i32 imm:$index))),
+          (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>;
+
+//   VTRN     : Vector Transpose
+
+def  VTRNd8   : N2VDShuffle<0b00, 0b00001, "vtrn", "8">;
+def  VTRNd16  : N2VDShuffle<0b01, 0b00001, "vtrn", "16">;
+def  VTRNd32  : N2VDShuffle<0b10, 0b00001, "vtrn", "32">;
+
+def  VTRNq8   : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">;
+def  VTRNq16  : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">;
+def  VTRNq32  : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">;
+
+//   VUZP     : Vector Unzip (Deinterleave)
+
+def  VUZPd8   : N2VDShuffle<0b00, 0b00010, "vuzp", "8">;
+def  VUZPd16  : N2VDShuffle<0b01, 0b00010, "vuzp", "16">;
+// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+def : NEONInstAlias<"vuzp${p}.32 $Dd, $Dm",
+                    (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def  VUZPq8   : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">;
+def  VUZPq16  : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">;
+def  VUZPq32  : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">;
+
+//   VZIP     : Vector Zip (Interleave)
+
+def  VZIPd8   : N2VDShuffle<0b00, 0b00011, "vzip", "8">;
+def  VZIPd16  : N2VDShuffle<0b01, 0b00011, "vzip", "16">;
+// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+def : NEONInstAlias<"vzip${p}.32 $Dd, $Dm",
+                    (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def  VZIPq8   : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">;
+def  VZIPq16  : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">;
+def  VZIPq32  : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
+
+// Vector Table Lookup and Table Extension.
+
+//   VTBL     : Vector Table Lookup
+let DecoderMethod = "DecodeTBLInstruction" in {
+def  VTBL1
+  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
+        (ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBL2
+  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd),
+        (ins VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB2,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def  VTBL3
+  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd),
+        (ins VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB3,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def  VTBL4
+  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd),
+        (ins VecListFourD:$Vn, DPR:$Vm),
+        NVTBLFrm, IIC_VTB4,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+def  VTBL3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>;
+def  VTBL4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>;
+
+//   VTBX     : Vector Table Extension
+def  VTBX1
+  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1,
+        "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1
+                               DPR:$orig, VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBX2
+  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX2,
+        "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>;
+def  VTBX3
+  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListThreeD:$Vn, DPR:$Vm),
+        NVTBLFrm, IIC_VTBX3,
+        "vtbx", "8", "$Vd, $Vn, $Vm",
+        "$orig = $Vd", []>;
+def  VTBX4
+  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX4,
+        "vtbx", "8", "$Vd, $Vn, $Vm",
+        "$orig = $Vd", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+def  VTBX3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX3, "$orig = $dst", []>;
+def  VTBX4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX4, "$orig = $dst", []>;
+} // DecoderMethod = "DecodeTBLInstruction"
+
+// VRINT      : Vector Rounding
+multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+  }
+
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+                  (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
+}
+
+defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
+defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>;
+defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>;
+defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
+defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
+defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+
+// Cryptography instructions
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "v8Crypto", hasSideEffects = 0 in {
+  class AES<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
+    : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
+                !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
+      Requires<[HasV8, HasCrypto]>;
+}
+
+def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
+def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
+def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
+def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, null_frag>;
+def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
+def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, null_frag>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, null_frag>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, null_frag>;
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
+def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
+def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
+def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+
+def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
+          (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG
+              (SHA1H (SUBREG_TO_REG (i64 0),
+                                    (f32 (COPY_TO_REGCLASS i32:$Rn, SPR)),
+                                    ssub_0)),
+              ssub_0)), GPR)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+          (SHA1C v4i32:$hash_abcd,
+                 (SUBREG_TO_REG (i64 0),
+                                (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+                                ssub_0),
+                 v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+          (SHA1M v4i32:$hash_abcd,
+                 (SUBREG_TO_REG (i64 0),
+                                (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+                                ssub_0),
+                 v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+          (SHA1P v4i32:$hash_abcd,
+                 (SUBREG_TO_REG (i64 0),
+                                (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+                                ssub_0),
+                 v4i32:$wk)>;
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+class N2VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class N3VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$acc, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+def : N3VSPat<fadd, VADDfd>;
+def : N3VSPat<fsub, VSUBfd>;
+def : N3VSPat<fmul, VMULfd>;
+def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
+      Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
+      Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N2VSPat<fabs, VABSfd>;
+def : N2VSPat<fneg, VNEGfd>;
+def : N3VSPat<NEONfmax, VMAXfd>;
+def : N3VSPat<NEONfmin, VMINfd>;
+def : N2VSPat<arm_ftosi, VCVTf2sd>;
+def : N2VSPat<arm_ftoui, VCVTf2ud>;
+def : N2VSPat<arm_sitof, VCVTs2fd>;
+def : N2VSPat<arm_uitof, VCVTu2fd>;
+
+// Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers.
+def : Pat<(f32 (bitconvert GPR:$a)),
+          (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+        Requires<[HasNEON, DontUseVMOVSR]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// bit_convert
+let Predicates = [IsLE] in {
+  def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (v1i64 DPR:$src)>;
+}
+def : Pat<(v1i64 (bitconvert (f64   DPR:$src))), (v1i64 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (v2i32 DPR:$src)>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (v8i8  DPR:$src)>;
+}
+def : Pat<(f64   (bitconvert (v1i64 DPR:$src))), (f64   DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (v2f32 DPR:$src)>;
+}
+
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+}
+
+let Predicates = [IsBE] in {
+  // 64 bit conversions
+  def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (VREV16d8  DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (VREV16d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (VREV32d8  DPR:$src)>;
+
+  // 128 bit conversions
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8  QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+}
+
+// Fold extracting an element out of a v2i32 into a vfp register.
+def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
+          (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+
+// Vector lengthening move with load, matching extending loads.
+
+// extload, zextload and sextload for a standard lengthening load. Example:
+// Lengthen_Single<"8", "i16", "8"> =
+//     Pat<(v8i16 (extloadvi8 addrmode6:$addr))
+//         (VMOVLuv8i16 (VLD1d8 addrmode6:$addr,
+//                              (f64 (IMPLICIT_DEF)), (i32 0)))>;
+multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
+  let AddedComplexity = 10 in {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                    (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
+                  (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+  def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                  (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
+                (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+  def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                  (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
+                (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+  }
+}
+
+// extload, zextload and sextload for a lengthening load which only uses
+// half the lanes available. Example:
+// Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> =
+//     Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)),
+//         (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+//                                      (f64 (IMPLICIT_DEF)), (i32 0))),
+//                         dsub_0)>;
+multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
+                               string InsnLanes, string InsnTy> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+}
+
+// The following class definition is basically a copy of the
+// Lengthen_HalfSingle definition above, however with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This is to convert
+// data loaded by VLD1LN into proper vector format in big endian mode.
+multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+                               string InsnLanes, string InsnTy, string RevLanes> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (!cast<Instruction>("VREV32d" # RevLanes)
+           (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+         dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (!cast<Instruction>("VREV32d" # RevLanes)
+           (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+         dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+         (!cast<Instruction>("VREV32d" # RevLanes)
+           (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+         dsub_0)>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length.
+//
+// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> =
+//     Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr))
+//         (EXTRACT_SUBREG (VMOVLuv4i32
+//           (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+//                                                   (f64 (IMPLICIT_DEF)),
+//                                                   (i32 0))),
+//                           dsub_0)),
+//           dsub_0)>;
+multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+}
+
+// The following class definition is basically a copy of the
+// Lengthen_Double definition above, however with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This is to convert
+// data loaded by VLD1LN into proper vector format in big endian mode.
+multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty, string RevLanes> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV32d" # RevLanes)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0))>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV32d" # RevLanes)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0))>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV32d" # RevLanes)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0))>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length, but which ends up only
+// requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
+//
+// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr))
+//     (EXTRACT_SUBREG (VMOVLuv4i32
+//       (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr,
+//                                               (f64 (IMPLICIT_DEF)), (i32 0))),
+//                       dsub_0)),
+//       dsub_0)>;
+multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+}
+
+// The following class definition is basically a copy of the
+// Lengthen_HalfDouble definition above, however with an additional VREV16d8
+// instruction to convert data loaded by VLD1LN into proper vector format
+// in big endian mode.
+multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0)),
+          dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0)),
+          dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0)),
+          dsub_0)>;
+}
+
+defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
+defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
+defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
+
+let Predicates = [IsLE] in {
+  defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
+  defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
+
+  // Double lengthening - v4i8 -> v4i16 -> v4i32
+  defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">;
+  // v2i8 -> v2i16 -> v2i32
+  defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
+  // v2i16 -> v2i32 -> v2i64
+  defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
+}
+
+let Predicates = [IsBE] in {
+  defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16
+  defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32
+
+  // Double lengthening - v4i8 -> v4i16 -> v4i32
+  defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">;
+  // v2i8 -> v2i16 -> v2i32
+  defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">;
+  // v2i16 -> v2i32 -> v2i64
+  defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">;
+}
+
+// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (VLD1LNd16 addrmode6:$addr,
+                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (VLD1LNd16 addrmode6:$addr,
+                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+        (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+           (VLD1LNd16 addrmode6:$addr,
+                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+}
+// The following patterns are basically a copy of the patterns above, 
+// however with an additional VREV16d instruction to convert data
+// loaded by VLD1LN into proper vector format in big endian mode.
+let Predicates = [IsBE] in {
+  def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr,
+                        (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr,
+                        (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+        (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+           (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr,
+                        (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn",
+                    (VSETLNi32 DPR:$Dd, GPR:$Rn, 1, pred:$p)>;
+def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn",
+                    (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>;
+
+// VAND/VBIC/VEOR/VORR accept but do not require a type suffix.
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+                         (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+                         (VANDq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+                         (VBICd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+                         (VBICq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+                         (VEORd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+                         (VEORq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+                         (VORRd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+                         (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+// ... two-operand aliases
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+                         (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+                         (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+                         (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+                         (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+                         (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+                         (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+
+// VLD1 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+
+def VLD1LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VLD1LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VLD1LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD1LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST1 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+
+def VST1LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VST1LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST1LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST1LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD2 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                  pred:$p)>;
+def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+
+def VLD2LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VLD2LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VLD2LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VLD2LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST2 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+
+def VST2LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST2LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VST2LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VST2LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD3 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+
+def VLD3DUPdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VLD3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+
+def VLD3LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+
+def VLD3dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VST3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+
+def VST3LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+
+def VST3dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD4 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+
+def VLD4DUPdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+def VLD4DUPdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list,
+                       addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list,
+                       addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+// VLD4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+
+def VLD4LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VLD4LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VLD4LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VLD4LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+
+// VLD4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+
+def VLD4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VST4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+
+def VST4LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VST4LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VST4LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VST4LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+// VST4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+
+def VST4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VMOV/VMVN takes an optional datatype suffix
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                         (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                         (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",
+                    (VCGEsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Dd, $Dn, $Dm",
+                    (VCGEsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Dd, $Dn, $Dm",
+                    (VCGEsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Dd, $Dn, $Dm",
+                    (VCGEuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Dd, $Dn, $Dm",
+                    (VCGEuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm",
+                    (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm",
+                    (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm",
+                    (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Qd, $Qn, $Qm",
+                    (VCGEsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Qd, $Qn, $Qm",
+                    (VCGEsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Qd, $Qn, $Qm",
+                    (VCGEuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Qd, $Qn, $Qm",
+                    (VCGEuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm",
+                    (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm",
+                    (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Dd, $Dn, $Dm",
+                    (VCGTsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Dd, $Dn, $Dm",
+                    (VCGTsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Dd, $Dn, $Dm",
+                    (VCGTsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Dd, $Dn, $Dm",
+                    (VCGTuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Dd, $Dn, $Dm",
+                    (VCGTuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm",
+                    (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm",
+                    (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm",
+                    (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Qd, $Qn, $Qm",
+                    (VCGTsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Qd, $Qn, $Qm",
+                    (VCGTsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Qd, $Qn, $Qm",
+                    (VCGTuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Qd, $Qn, $Qm",
+                    (VCGTuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm",
+                    (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm",
+                    (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VSWP allows, but does not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+                         (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+                         (VSWPq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VBIF, VBIT, and VBSL allow, but do not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+                         (VBIFd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+                         (VBITd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+                         (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+                         (VBIFq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+                         (VBITq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+                         (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+
+// "vmov Rd, #-imm" can be handled via "vmvn".
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+                    (VMVNv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+                    (VMVNv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+                    (VMOVv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+                    (VMOVv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+
+// 'gas' compatibility aliases for quad-word instructions. Strictly speaking,
+// these should restrict to just the Q register variants, but the register
+// classes are enough to match correctly regardless, so we keep it simple
+// and just use MnemonicAlias.
+def : NEONMnemonicAlias<"vbicq", "vbic">;
+def : NEONMnemonicAlias<"vandq", "vand">;
+def : NEONMnemonicAlias<"veorq", "veor">;
+def : NEONMnemonicAlias<"vorrq", "vorr">;
+
+def : NEONMnemonicAlias<"vmovq", "vmov">;
+def : NEONMnemonicAlias<"vmvnq", "vmvn">;
+// Explicit versions for floating point so that the FPImm variants get
+// handled early. The parser gets confused otherwise.
+def : NEONMnemonicAlias<"vmovq.f32", "vmov.f32">;
+def : NEONMnemonicAlias<"vmovq.f64", "vmov.f64">;
+
+def : NEONMnemonicAlias<"vaddq", "vadd">;
+def : NEONMnemonicAlias<"vsubq", "vsub">;
+
+def : NEONMnemonicAlias<"vminq", "vmin">;
+def : NEONMnemonicAlias<"vmaxq", "vmax">;
+
+def : NEONMnemonicAlias<"vmulq", "vmul">;
+
+def : NEONMnemonicAlias<"vabsq", "vabs">;
+
+def : NEONMnemonicAlias<"vshlq", "vshl">;
+def : NEONMnemonicAlias<"vshrq", "vshr">;
+
+def : NEONMnemonicAlias<"vcvtq", "vcvt">;
+
+def : NEONMnemonicAlias<"vcleq", "vcle">;
+def : NEONMnemonicAlias<"vceqq", "vceq">;
+
+def : NEONMnemonicAlias<"vzipq", "vzip">;
+def : NEONMnemonicAlias<"vswpq", "vswp">;
+
+def : NEONMnemonicAlias<"vrecpeq.f32", "vrecpe.f32">;
+def : NEONMnemonicAlias<"vrecpeq.u32", "vrecpe.u32">;
+
+
+// Alias for loading floating point immediates that aren't representable
+// using the vmov.f32 encoding but the bitpattern is representable using
+// the .i32 encoding.
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+                     (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+                     (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 0000000..e17f73af
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,1486 @@
+//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+
+def imm_sr_XFORM: SDNodeXForm<imm, [{
+  unsigned Imm = N->getZExtValue();
+  return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), MVT::i32);
+}]>;
+def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def imm_sr : Operand<i32>, PatLeaf<(imm), [{
+  uint64_t Imm = N->getZExtValue();
+  return Imm > 0 && Imm <= 32;
+}], imm_sr_XFORM> {
+  let PrintMethod = "printThumbSRImm";
+  let ParserMatchClass = ThumbSRImmAsmOperand;
+}
+
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+  return (uint32_t)-N->getZExtValue() < 8;
+}], imm_neg_XFORM>;
+
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+  return ~((uint32_t)N->getZExtValue()) < 256;
+}]>;
+
+def imm8_255 : ImmLeaf<i32, [{
+  return Imm >= 8 && Imm < 256;
+}]>;
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+  unsigned Val = -N->getZExtValue();
+  return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
+
+// Break imm's up into two pieces: an immediate + a left shift. This uses
+// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
+// to get the val/shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+  return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+// Scaled 4 immediate.
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_1020s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_508s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// Alias use only, so no printer is necessary.
+def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
+def t_imm0_508s4_neg : Operand<i32> {
+  let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Define Thumb specific addressing modes.
+
+// unsigned 8-bit, 2-scaled memory offset
+class OperandUnsignedOffset_b8s2 : AsmOperandClass {
+  let Name = "UnsignedOffset_b8s2";
+  let PredicateMethod = "isUnsignedOffset<8, 2>";
+}
+
+def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
+
+// thumb style PC relative operand. signed, 8 bits magnitude,
+// two bits shift. can be represented as either [pc, #imm], #imm,
+// or relocatable expression...
+def ThumbMemPC : AsmOperandClass {
+  let Name = "ThumbMemPC";
+}
+
+let OperandType = "OPERAND_PCREL" in {
+def t_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getThumbBRTargetOpValue";
+  let DecoderMethod = "DecodeThumbBROperand";
+}
+
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+  let EncoderMethod = "getThumbAdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand<2>";
+  let ParserMatchClass = UnsignedOffset_b8s2;
+}
+
+def t_bcctarget : Operand<i32> {
+  let EncoderMethod = "getThumbBCCTargetOpValue";
+  let DecoderMethod = "DecodeThumbBCCTargetOperand";
+}
+
+def t_cbtarget : Operand<i32> {
+  let EncoderMethod = "getThumbCBTargetOpValue";
+  let DecoderMethod = "DecodeThumbCmpBROperand";
+}
+
+def t_bltarget : Operand<i32> {
+  let EncoderMethod = "getThumbBLTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLTargetOperand";
+}
+
+def t_blxtarget : Operand<i32> {
+  let EncoderMethod = "getThumbBLXTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLXOffset";
+}
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : Operand<i32> {
+  let EncoderMethod = "getAddrModePCOpValue";
+  let DecoderMethod = "DecodeThumbAddrModePC";
+  let PrintMethod = "printThumbLdrLabelOperand";
+  let ParserMatchClass = ThumbMemPC;
+}
+}
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
+def t_addrmode_rr : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_rrs := reg + reg
+//
+// We use separate scaled versions because the Select* functions need
+// to explicitly check for a matching constant and return false here so that
+// the reg+imm forms will match instead. This is a horrible way to do that,
+// as it forces tight coupling between the methods, but it's how selectiondag
+// currently works.
+def t_addrmode_rrs1 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs2 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs4 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_is4 := reg + imm5 * 4
+//
+def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
+def t_addrmode_is4 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S4Operand";
+  let ParserMatchClass = t_addrmode_is4_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is2 := reg + imm5 * 2
+//
+def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
+def t_addrmode_is2 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S2Operand";
+  let ParserMatchClass = t_addrmode_is2_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is1 := reg + imm5
+//
+def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
+def t_addrmode_is1 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S1Operand";
+  let ParserMatchClass = t_addrmode_is1_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+// FIXME: This really shouldn't have an explicit SP operand at all. It should
+// be implicit, just like in the instruction encoding itself.
+def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
+def t_addrmode_sp : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+  let EncoderMethod = "getAddrModeThumbSPOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeSP";
+  let PrintMethod = "printThumbAddrModeSPOperand";
+  let ParserMatchClass = t_addrmode_sp_asm_operand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def tADJCALLSTACKUP :
+  PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
+             [(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+
+def tADJCALLSTACKDOWN :
+  PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
+             [(ARMcallseq_start imm:$amt)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+}
+
+class T1SystemEncoding<bits<8> opc>
+  : T1Encoding<0b101111> {
+  let Inst{9-8} = 0b11;
+  let Inst{7-0} = opc;
+}
+
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
+                 [(int_arm_hint imm0_15:$imm)]>,
+            T1SystemEncoding<0x00>,
+            Requires<[IsThumb, HasV6M]> {
+  bits<4> imm;
+  let Inst{7-4} = imm;
+}
+
+class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> {
+  let Predicates = [IsThumb, HasV6M];
+}
+
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
+  let Predicates = [IsThumb2, HasV8];
+}
+
+// The imm operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
+                []>,
+           T1Encoding<0b101111> {
+  let Inst{9-8} = 0b10;
+  // A8.6.22
+  bits<8> val;
+  let Inst{7-0} = val;
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (tBKPT 0)>, Requires<[IsThumb]>;
+
+def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
+                []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
+  let Inst{9-6} = 0b1010;
+  bits<6> val;
+  let Inst{5-0} = val;
+}
+
+def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
+                  []>, T1Encoding<0b101101>, Deprecated<HasV8Ops> {
+  bits<1> end;
+  // A8.6.156
+  let Inst{9-5} = 0b10010;
+  let Inst{4}   = 1;
+  let Inst{3}   = end;
+  let Inst{2-0} = 0b000;
+}
+
+// Change Processor State is a system instruction -- for disassembly only.
+def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
+                NoItinerary, "cps$imod $iflags", []>,
+           T1Misc<0b0110011> {
+  // A8.6.38 & B6.1.1
+  bit imod;
+  bits<3> iflags;
+
+  let Inst{4}   = imod;
+  let Inst{3}   = 0;
+  let Inst{2-0} = iflags;
+  let DecoderMethod = "DecodeThumbCPS";
+}
+
+// For both thumb1 and thumb2.
+let isNotDuplicable = 1, isCodeGenOnly = 1 in
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
+                  [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.6
+  bits<3> dst;
+  let Inst{6-3} = 0b1111; // Rm = pc
+  let Inst{2-0} = dst;
+}
+
+// ADD <Rd>, sp, #<imm8>
+// FIXME: This should not be marked as having side effects, and it should be
+// rematerializable. Clearing the side effect bit causes miscompilations,
+// probably because the instruction can be moved around.
+def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
+                    IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
+               T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
+  // A6.2 & A8.6.8
+  bits<3> dst;
+  bits<8> imm;
+  let Inst{10-8} = dst;
+  let Inst{7-0}  = imm;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+// ADD sp, sp, #<imm7>
+def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                     IIC_iALUi, "add", "\t$Rdn, $imm", []>,
+              T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
+  // A6.2.5 & A8.6.8
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+// SUB sp, sp, #<imm7>
+// FIXME: The encoding and the ASM string don't match up.
+def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                    IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
+              T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
+  // A6.2.5 & A8.6.214
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+def : tInstAlias<"add${p} sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+
+// Can optionally specify SP as a three operand instruction.
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
+def : tInstAlias<"sub${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
+
+// ADD <Rm>, sp
+def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
+                   "add", "\t$Rdn, $sp, $Rn", []>,
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.9 Encoding T1
+  bits<4> Rdn;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = 0b1101;
+  let Inst{2-0} = Rdn{2-0};
+  let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+// ADD sp, <Rm>
+def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
+                  "add", "\t$Rdn, $Rm", []>,
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.9 Encoding T2
+  bits<4> Rm;
+  let Inst{7} = 1;
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = 0b101;
+  let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
+            T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+    // A6.2.3 & A8.6.25
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b000;
+    let Unpredictable{2-0} = 0b111;
+  }
+}
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
+                   [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
+
+  // Alternative return instruction used by vararg functions.
+  def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
+                   2, IIC_Br, [],
+                   (tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+// All calls clobber the non-callee saved registers. SP is marked as a use to
+// prevent stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
+let isCall = 1,
+  Defs = [LR], Uses = [SP] in {
+  // Also used for Thumb2
+  def tBL  : TIx2<0b11110, 0b11, 1,
+                  (outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
+                  "bl${p}\t$func",
+                  [(ARMtcall tglobaladdr:$func)]>,
+             Requires<[IsThumb]>, Sched<[WriteBrL]> {
+    bits<24> func;
+    let Inst{26} = func{23};
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = func{22};
+    let Inst{11} = func{21};
+    let Inst{10-0} = func{10-0};
+  }
+
+  // ARMv5T and above, also used for Thumb2
+  def tBLXi : TIx2<0b11110, 0b11, 0,
+                 (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
+                   "blx${p}\t$func",
+                   [(ARMcall tglobaladdr:$func)]>,
+              Requires<[IsThumb, HasV5T]>, Sched<[WriteBrL]> {
+    bits<24> func;
+    let Inst{26} = func{23};
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = func{22};
+    let Inst{11} = func{21};
+    let Inst{10-1} = func{10-1};
+    let Inst{0} = 0; // func{0} is assumed zero
+  }
+
+  // Also used for Thumb2
+  def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
+                  "blx${p}\t$func",
+                  [(ARMtcall GPR:$func)]>,
+              Requires<[IsThumb, HasV5T]>,
+              T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b000;
+  }
+
+  // ARMv4T
+  def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
+                  4, IIC_Br,
+                  [(ARMcall_nolink tGPR:$func)]>,
+            Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
+}
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+  let isPredicable = 1 in
+  def tB   : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
+                 "b", "\t$target", [(br bb:$target)]>,
+             T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
+    bits<11> target;
+    let Inst{10-0} = target;
+    let AsmMatchConverter = "cvtThumbBranches";
+ }
+
+  // Far jump
+  // Just a pseudo for a tBL instruction. Needed to let regalloc know about
+  // the clobber of LR.
+  let Defs = [LR] in
+  def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
+                          4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>,
+                          Sched<[WriteBrTbl]>;
+
+  def tBR_JTr : tPseudoInst<(outs),
+                      (ins tGPR:$target, i32imm:$jt, i32imm:$id),
+                      0, IIC_Br,
+                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>,
+                      Sched<[WriteBrTbl]> {
+    list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+  def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
+                 "b${p}\t$target",
+                 [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
+             T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
+  bits<4> p;
+  bits<8> target;
+  let Inst{11-8} = p;
+  let Inst{7-0} = target;
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+
+// Tail calls
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS versions.
+  let Uses = [SP] in {
+    def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
+                     4, IIC_Br, [],
+                     (tBX GPR:$dst, (ops 14, zero_reg))>,
+                     Requires<[IsThumb]>, Sched<[WriteBr]>;
+  }
+  // tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls
+  // on MachO), so it's in ARMInstrThumb2.td.
+  // Non-MachO version:
+  let Uses = [SP] in {
+    def tTAILJMPdND : tPseudoExpand<(outs),
+                   (ins t_brtarget:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (tB t_brtarget:$dst, pred:$p)>,
+                 Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>;
+  }
+}
+
+
+// A8.6.218 Supervisor Call (Software Interrupt)
+// A8.6.16 B: Encoding T1
+// If Inst{11-8} == 0b1111 then SEE SVC
+let isCall = 1, Uses = [SP] in
+def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
+                "svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
+  bits<8> imm;
+  let Inst{15-12} = 0b1101;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-0}   = imm;
+}
+
+// The assembler uses 0xDEFE for a trap instruction.
+let isBarrier = 1, isTerminator = 1 in
+def tTRAP : TI<(outs), (ins), IIC_Br,
+               "trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
+  let Inst = 0xdefe;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load Store Instructions.
+//
+
+// Loads: reg/reg and reg/imm5
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 1 /* Load */,
+                      (outs tGPR:$Rt), (ins AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
+}
+// Stores: reg/reg and reg/imm5
+multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 0 /* Store */,
+                      (outs), (ins tGPR:$Rt, AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+}
+
+// A8.6.57 & A8.6.60
+defm tLDR  : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iLoad_r, IIC_iLoad_i, "ldr",
+                                UnOpFrag<(load node:$Src)>>;
+
+// A8.6.64 & A8.6.61
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
+                                UnOpFrag<(zextloadi8 node:$Src)>>;
+
+// A8.6.76 & A8.6.73
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
+                                UnOpFrag<(zextloadi16 node:$Src)>>;
+
+let AddedComplexity = 10 in
+def tLDRSB :                    // A8.6.80
+  T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_1, IIC_iLoad_bh_r,
+                 "ldrsb", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+let AddedComplexity = 10 in
+def tLDRSH :                    // A8.6.84
+  T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_2, IIC_iLoad_bh_r,
+                 "ldrsh", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+let canFoldAsLoad = 1 in
+def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+                    "ldr", "\t$Rt, $addr",
+                    [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
+              T1LdStSP<{1,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+                  "ldr", "\t$Rt, $addr",
+                  [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
+              T1Encoding<{0,1,0,0,1,?}> {
+  // A6.2 & A8.6.59
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0}  = addr;
+}
+
+// A8.6.194 & A8.6.192
+defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iStore_r, IIC_iStore_i, "str",
+                                BinOpFrag<(store node:$LHS, node:$RHS)>>;
+
+// A8.6.197 & A8.6.195
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
+                                BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+
+// A8.6.207 & A8.6.205
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2,
+                               t_addrmode_is2, AddrModeT1_2,
+                               IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+                               BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
+
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+                    "str", "\t$Rt, $addr",
+                    [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+              T1LdStSP<{0,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// These require base address to be written back or one of the loaded regs.
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+        IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
+def tLDMIA_UPD :
+    InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "$Rn = $wb", IIC_iLoad_mu>,
+    PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
+  let Size = 2;
+  let OutOperandList = (outs GPR:$wb);
+  let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+  let Pattern = [];
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// There is no non-writeback version of STM for Thumb.
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+                         (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+                         AddrModeNone, 2, IIC_iStore_mu,
+                         "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
+                     T1Encoding<{1,1,0,0,0,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+} // neverHasSideEffects
+
+def : InstAlias<"ldm${p} $Rn!, $regs",
+                (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
+        Requires<[IsThumb, IsThumb1Only]>;
+
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+               IIC_iPop,
+               "pop${p}\t$regs", []>,
+           T1Misc<{1,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{15};
+  let Inst{7-0} = regs{7-0};
+}
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                IIC_iStore_m,
+                "push${p}\t$regs", []>,
+            T1Misc<{0,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{14};
+  let Inst{7-0} = regs{7-0};
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+// Helper classes for encoding T1pI patterns:
+class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rn;
+}
+class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
+                     string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1Misc<opA> {
+  bits<3> Rm;
+  bits<3> Rd;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sI patterns:
+class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  bits<3> Rd;
+  let Inst{8-6} = Rm;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                       string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sIt patterns:
+class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rdn;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rdn;
+}
+class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                        string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rdn;
+  bits<8> imm8;
+  let Inst{10-8} = Rdn;
+  let Inst{7-0}  = imm8;
+}
+
+// Add with carry register
+let isCommutable = 1, Uses = [CPSR] in
+def tADC :                      // A8.6.2
+  T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
+                "adc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Add immediate
+def tADDi3 :                    // A8.6.4 T1
+  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                   IIC_iALUi,
+                   "add", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
+                   Sched<[WriteALU]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
+
+def tADDi8 :                    // A8.6.4 T2
+  T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+                    "add", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>,
+                    Sched<[WriteALU]>;
+
+// Add register
+let isCommutable = 1 in
+def tADDrr :                    // A8.6.6 T1
+  T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "add", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+let neverHasSideEffects = 1 in
+def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
+                     "add", "\t$Rdn, $Rm", []>,
+               T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.6 T2
+  bits<4> Rdn;
+  bits<4> Rm;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rdn{2-0};
+}
+
+// AND register
+let isCommutable = 1 in
+def tAND :                      // A8.6.12
+  T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "and", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// ASR immediate
+def tASRri :                    // A8.6.14
+  T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+                   IIC_iMOVsi,
+                   "asr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+                   Sched<[WriteALU]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// ASR register
+def tASRrr :                    // A8.6.15
+  T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "asr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// BIC register
+def tBIC :                      // A8.6.20
+  T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "bic", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
+                Sched<[WriteALU]>;
+
+// CMN register
+let isCompare = 1, Defs = [CPSR] in {
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+//       Compare-to-zero still works out, just not the relationals
+//def tCMN :                     // A8.6.33
+//  T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
+//               IIC_iCMPr,
+//               "cmn", "\t$lhs, $rhs",
+//               [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMNz :                     // A8.6.33
+  T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmn", "\t$Rn, $Rm",
+               [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;
+
+} // isCompare = 1, Defs = [CPSR]
+
+// CMP immediate
+let isCompare = 1, Defs = [CPSR] in {
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
+                  "cmp", "\t$Rn, $imm8",
+                  [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
+             T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
+  // A8.6.35
+  bits<3> Rn;
+  bits<8> imm8;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = imm8;
+}
+
+// CMP register
+def tCMPr :                     // A8.6.36 T1
+  T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmp", "\t$Rn, $Rm",
+               [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
+                   "cmp", "\t$Rn, $Rm", []>,
+              T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
+  // A8.6.36 T2
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{7}   = Rn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rn{2-0};
+}
+} // isCompare = 1, Defs = [CPSR]
+
+
+// XOR register
+let isCommutable = 1 in
+def tEOR :                      // A8.6.45
+  T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "eor", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// LSL immediate
+def tLSLri :                    // A8.6.88
+  T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
+                   IIC_iMOVsi,
+                   "lsl", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
+                   Sched<[WriteALU]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// LSL register
+def tLSLrr :                    // A8.6.89
+  T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsl", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// LSR immediate
+def tLSRri :                    // A8.6.90
+  T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+                   IIC_iMOVsi,
+                   "lsr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+                   Sched<[WriteALU]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// LSR register
+def tLSRrr :                    // A8.6.91
+  T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Move register
+let isMoveImm = 1 in
+def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
+                  "mov", "\t$Rd, $imm8",
+                  [(set tGPR:$Rd, imm0_255:$imm8)]>,
+             T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.96
+  bits<3> Rd;
+  bits<8> imm8;
+  let Inst{10-8} = Rd;
+  let Inst{7-0}  = imm8;
+}
+// Because we have an explicit tMOVSr below, we need an alias to handle
+// the immediate "movs" form here. Blech.
+def : tInstAlias <"movs $Rdn, $imm",
+                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
+
+// A7-73: MOV(2) - mov setting flag.
+
+let neverHasSideEffects = 1 in {
+def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
+                      2, IIC_iMOVr,
+                      "mov", "\t$Rd, $Rm", "", []>,
+                  T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{7}   = Rd{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rd{2-0};
+}
+let Defs = [CPSR] in
+def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+                      "movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
+  // A8.6.97
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{15-6} = 0b0000000000;
+  let Inst{5-3}  = Rm;
+  let Inst{2-0}  = Rd;
+}
+} // neverHasSideEffects
+
+// Multiply register
+let isCommutable = 1 in
+def tMUL :                      // A8.6.105 T1
+  Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
+           IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
+           [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
+      T1DataProcessing<0b1101> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+  let AsmMatchConverter = "cvtThumbMultiply";
+}
+
+def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
+                                               pred:$p)>;
+
+// Move inverse register
+def tMVN :                      // A8.6.107
+  T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
+               "mvn", "\t$Rd, $Rn",
+               [(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;
+
+// Bitwise or register
+let isCommutable = 1 in
+def tORR :                      // A8.6.114
+  T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "orr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Swaps
+def tREV :                      // A8.6.134
+  T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+def tREV16 :                    // A8.6.135
+  T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev16", "\t$Rd, $Rm",
+             [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
+                Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+def tREVSH :                    // A8.6.136
+  T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "revsh", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+// Rotate right register
+def tROR :                      // A8.6.139
+  T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "ror", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
+
+// Negate register
+def tRSB :                      // A8.6.141
+  T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
+               IIC_iALUi,
+               "rsb", "\t$Rd, $Rn, #0",
+               [(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;
+
+// Subtract with carry register
+let Uses = [CPSR] in
+def tSBC :                      // A8.6.151
+  T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sbc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
+
+// Subtract immediate
+def tSUBi3 :                    // A8.6.210 T1
+  T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                   IIC_iALUi,
+                   "sub", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
+                   Sched<[WriteALU]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
+
+def tSUBi8 :                    // A8.6.210 T2
+  T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+                    "sub", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
+                    Sched<[WriteALU]>;
+
+// Subtract register
+def tSUBrr :                    // A8.6.212
+  T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sub", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
+
+// Sign-extend byte
+def tSXTB :                     // A8.6.222
+  T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
+
+// Sign-extend short
+def tSXTH :                     // A8.6.224
+  T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
+
+// Test
+let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
+def tTST :                      // A8.6.230
+  T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
+               "tst", "\t$Rn, $Rm",
+               [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
+               Sched<[WriteALU]>;
+
+// A8.8.247  UDF - Undefined (Encoding T1)
+def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
+              [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 {
+  bits<8> imm8;
+  let Inst{15-12} = 0b1101;
+  let Inst{11-8} = 0b1110;
+  let Inst{7-0} = imm8;
+}
+
+// Zero-extend byte
+def tUXTB :                     // A8.6.262
+  T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
+
+// Zero-extend short
+def tUXTH :                     // A8.6.264
+  T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
+// Expanded after instruction selection into a branch sequence.
+let usesCustomInserter = 1 in  // Expanded after instruction selection.
+  def tMOVCCr_pseudo :
+  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
+             NoItinerary,
+             [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+
+def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
+               IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
+               T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
+  bits<3> Rd;
+  bits<8> addr;
+  let Inst{10-8} = Rd;
+  let Inst{7-0} = addr;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
+                              2, IIC_iALUi, []>, Sched<[WriteALU]>;
+
+let hasSideEffects = 1 in
+def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
+                              (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                              2, IIC_iALUi, []>, Sched<[WriteALU]>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
+def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
+                          [(set R0, ARMthread_pointer)]>,
+                          Sched<[WriteBr]>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//
+
+// eh_sjlj_setjmp() is an instruction sequence to store the return address and
+// save #0 in R0 for the non-longjmp case.  Since by its nature we may be coming
+// from some other function to get here, and we're using the stack frame for the
+// containing function to save/restore registers, we can't keep anything live in
+// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
+// tromped upon when we get here from a longjmp(). We force everything out of
+// registers except for our own input by listing the relevant registers in
+// Defs. By doing so, we also cause the prologue/epilogue code to actively
+// preserve all of the callee-saved resgisters, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12, CPSR ],
+    hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+                                  AddrModeNone, 0, NoItinerary, "","",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+    Defs = [ R7, LR, SP ] in
+def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
+                              AddrModeNone, 0, IndexModeNone,
+                              Pseudo, NoItinerary, "", "",
+                              [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                             Requires<[IsThumb, IsIOS]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Comparisons
+def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
+            (tCMPi8  tGPR:$Rn, imm0_255:$imm8)>;
+def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
+            (tCMPr   tGPR:$Rn, tGPR:$Rm)>;
+
+// Add with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
+            (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255:$rhs),
+            (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, tGPR:$rhs),
+            (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7_neg:$rhs),
+            (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
+            (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
+            (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Bswap 16 with load/store
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)),
+            (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>;
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
+            (tREV16 (tLDRHi t_addrmode_is2:$addr))>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_rrs2:$addr),
+            (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_is2:$addr),
+            (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
+
+// ConstantPool
+def : T1Pat<(ARMWrapper  tconstpool  :$dst), (tLEApcrel tconstpool  :$dst)>;
+
+// GlobalAddress
+def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
+                                  IIC_iLoadiALU,
+                                  [(set tGPR:$dst,
+                                        (ARMWrapperPIC tglobaladdr:$addr))]>,
+                       Requires<[IsThumb, DontUseMovt]>;
+
+def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
+                                IIC_iLoad_i,
+                                [(set tGPR:$dst,
+                                      (ARMWrapper tglobaladdr:$src))]>,
+                     Requires<[IsThumb, DontUseMovt]>;
+
+
+// JumpTable
+def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+            (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Direct calls
+def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+      Requires<[IsThumb]>;
+
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
+      Requires<[IsThumb, HasV5T]>;
+
+// Indirect calls to ARM routines
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
+      Requires<[IsThumb, HasV5T]>;
+
+// zextload i1 -> zextload i8
+def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr),
+            (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
+            (tLDRBi t_addrmode_is1:$addr)>;
+
+// extload -> zextload
+def : T1Pat<(extloadi1  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(extloadi1  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_is2:$addr),  (tLDRHi t_addrmode_is2:$addr)>;
+
+// If it's impossible to use [r,r] address mode for sextload, select to
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tSXTB (tLDRBi t_addrmode_is1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
+            (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tSXTH (tLDRHi t_addrmode_is2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
+            (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
+            (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
+            (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
+
+def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
+             (tLDRBi t_addrmode_is1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src),
+             (tLDRBr t_addrmode_rrs1:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
+             (tLDRHi t_addrmode_is2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src),
+             (tLDRHr t_addrmode_rrs2:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
+             (tLDRi t_addrmode_is4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src),
+             (tLDRr t_addrmode_rrs4:$src)>;
+def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
+             (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val),
+             (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
+             (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val),
+             (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
+             (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val),
+             (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>;
+
+// Large immediate handling.
+
+// Two piece imms.
+def : T1Pat<(i32 thumb_immshifted:$src),
+            (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+                    (thumb_immshifted_shamt imm:$src))>;
+
+def : T1Pat<(i32 imm0_255_comp:$src),
+            (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let isReMaterializable = 1 in
+def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                             NoItinerary,
+               [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb, IsThumb1Only]>;
+
+// Pseudo-instruction for merged POP and return.
+// FIXME: remove when we have a way to marking a MI with these properties.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                           2, IIC_iPop_Br, [],
+                           (tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
+
+// Indirect branch using "mov pc, $Rm"
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
+                  2, IIC_Br, [(brind GPR:$Rm)],
+                  (tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extension and assembles to defined behaviour (the insn does
+// nothing).
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+                 (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
+
+
+// Implied destination operand forms for shifts.
+def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
+             (tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
+def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
+             (tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+def : tInstAlias<"asr${s}${p} $Rdm, $imm",
+             (tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 0000000..85e9351
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,4643 @@
+//===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// IT block predicate field
+def it_pred_asmoperand : AsmOperandClass {
+  let Name = "ITCondCode";
+  let ParserMethod = "parseITCondCode";
+}
+def it_pred : Operand<i32> {
+  let PrintMethod = "printMandatoryPredicateOperand";
+  let ParserMatchClass = it_pred_asmoperand;
+}
+
+// IT block condition mask
+def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
+def it_mask : Operand<i32> {
+  let PrintMethod = "printThumbITMask";
+  let ParserMatchClass = it_mask_asmoperand;
+}
+
+// t2_shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 not allowed
+def t2_shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+  let DecoderMethod = "DecodeT2ShifterImmOperand";
+}
+
+// Shifted operands. No register controlled shifts for Thumb2.
+// Note: We do not support rrx shifted operands yet.
+def t2_so_reg : Operand<i32>,    // reg imm
+                ComplexPattern<i32, 2, "SelectT2ShifterOperandReg",
+                               [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getT2SORegOpValue";
+  let PrintMethod = "printT2SOOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops rGPR, i32imm);
+}
+
+// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
+def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
+def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32);
+}]>;
+
+// so_imm_notSext_XFORM - Return a so_imm value packed into the format
+// described for so_imm_notSext def below, with sign extension from 16
+// bits.
+def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
+  APInt apIntN = N->getAPIntValue();
+  unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+  return CurDAG->getTargetConstant(~N16bitSignExt, MVT::i32);
+}]>;
+
+// t2_so_imm - Match a 32-bit immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
+// immediate splatted into multiple bytes of the word.
+def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }]> {
+  let ParserMatchClass = t2_so_imm_asmoperand;
+  let EncoderMethod = "getT2SOImmOpValue";
+  let DecoderMethod = "DecodeT2SOImm";
+}
+
+// t2_so_imm_not - Match an immediate that is a complement
+// of a t2_so_imm.
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use t2_so_imm.
+def t2_so_imm_not_asmoperand : AsmOperandClass { let Name = "T2SOImmNot"; }
+def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{
+  return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm
+// if the upper 16 bits are zero.
+def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
+    APInt apIntN = N->getAPIntValue();
+    if (!apIntN.isIntN(16)) return false;
+    unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+    return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1;
+  }], t2_so_imm_notSext16_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
+def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
+def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+  int64_t Value = -(int)N->getZExtValue();
+  return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+}], t2_so_imm_neg_XFORM> {
+  let ParserMatchClass = t2_so_imm_neg_asmoperand;
+}
+
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
+def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 4096;
+}]> {
+  let ParserMatchClass = imm0_4095_asmoperand;
+}
+
+def imm0_4095_neg_asmoperand: AsmOperandClass { let Name = "Imm0_4095Neg"; }
+def imm0_4095_neg : Operand<i32>, PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 4096;
+}], imm_neg_XFORM> {
+  let ParserMatchClass = imm0_4095_neg_asmoperand;
+}
+
+def imm1_255_neg : PatLeaf<(i32 imm), [{
+  uint32_t Val = -N->getZExtValue();
+  return (Val > 0 && Val < 255);
+}], imm_neg_XFORM>;
+
+def imm0_255_not : PatLeaf<(i32 imm), [{
+  return (uint32_t)(~N->getZExtValue()) < 255;
+}], imm_comp_XFORM>;
+
+def lo5AllOne : PatLeaf<(i32 imm), [{
+  // Returns true if all low 5-bits are 1.
+  return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL;
+}]>;
+
+// Define Thumb2 specific addressing modes.
+
+// t2addrmode_imm12  := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
+def t2addrmode_imm12 : Operand<i32>,
+                       ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
+  let PrintMethod = "printAddrModeImm12Operand<false>";
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm12";
+  let ParserMatchClass = t2addrmode_imm12_asmoperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2ldrlabel  := imm12
+def t2ldrlabel : Operand<i32> {
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printThumbLdrLabelOperand";
+}
+
+def t2ldr_pcrel_imm12_asmoperand : AsmOperandClass {let Name = "MemPCRelImm12";}
+def t2ldr_pcrel_imm12 : Operand<i32> {
+  let ParserMatchClass = t2ldr_pcrel_imm12_asmoperand;
+  // used for assembler pseudo instruction and maps to t2ldrlabel, so
+  // doesn't need encoder or print methods of its own.
+}
+
+// ADR instruction labels.
+def t2adrlabel : Operand<i32> {
+  let EncoderMethod = "getT2AdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand<0>";
+}
+
+// t2addrmode_posimm8  := reg + imm8
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def t2addrmode_posimm8 : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemPosImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_negimm8  := reg - imm8
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def t2addrmode_negimm8 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemNegImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_imm8  := reg +/- imm8
+def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
+class T2AddrMode_Imm8 : Operand<i32>,
+                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm8 : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+def t2addrmode_imm8_pre : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<true>";
+}
+
+def t2am_imm8_offset : Operand<i32>,
+                       ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
+                                      [], [SDNPWantRoot]> {
+  let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+  let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
+  let DecoderMethod = "DecodeT2Imm8";
+}
+
+// t2addrmode_imm8s4  := reg +/- (imm8 << 2)
+def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
+class T2AddrMode_Imm8s4 : Operand<i32> {
+  let EncoderMethod = "getT2AddrModeImm8s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8s4";
+  let ParserMatchClass = MemImm8s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm8s4 : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<false>";
+}
+
+def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<true>";
+}
+
+def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
+def t2am_imm8s4_offset : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+  let EncoderMethod = "getT2Imm8s4OpValue";
+  let DecoderMethod = "DecodeT2Imm8S4";
+}
+
+// t2addrmode_imm0_1020s4  := reg + (imm8 << 2)
+def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
+  let Name = "MemImm0_1020s4Offset";
+}
+def t2addrmode_imm0_1020s4 : Operand<i32>,
+                         ComplexPattern<i32, 2, "SelectT2AddrModeExclusive"> {
+  let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
+  let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
+  let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_so_reg  := reg + (reg << imm2)
+def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";}
+def t2addrmode_so_reg : Operand<i32>,
+                        ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
+  let PrintMethod = "printT2AddrModeSoRegOperand";
+  let EncoderMethod = "getT2AddrModeSORegOpValue";
+  let DecoderMethod = "DecodeT2AddrModeSOReg";
+  let ParserMatchClass = t2addrmode_so_reg_asmoperand;
+  let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
+}
+
+// Addresses for the TBB/TBH instructions.
+def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; }
+def addrmode_tbb : Operand<i32> {
+  let PrintMethod = "printAddrModeTBB";
+  let ParserMatchClass = addrmode_tbb_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; }
+def addrmode_tbh : Operand<i32> {
+  let PrintMethod = "printAddrModeTBH";
+  let ParserMatchClass = addrmode_tbh_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+
+class T2OneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{19-16}  = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2TwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sTwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+
+class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2FourReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+
+/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// binary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0,
+                       string wide = ""> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
+                 opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+                 Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+                 Sched<[WriteALUsi, ReadALU]>  {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+  // Assembly aliases for optional destination operand when it's the same
+  // as the source operand.
+  def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_imm:$imm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_reg:$shift, pred:$p,
+                                                    cc_out:$s)>;
+}
+
+/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
+//  the ".w" suffix to indicate that they are wide.
+multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> :
+    T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> {
+  // Assembler aliases w/ the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p,
+                                    cc_out:$s)>;
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rd, rGPR:$Rn, t2_so_reg:$shift,
+                                    pred:$p, cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm,
+                                    pred:$p, cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$shift,
+                                    pred:$p, cc_out:$s)>;
+}
+
+/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the T2I_bin_irs counterpart.
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                 opc, ".w\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]>,
+                 Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, "\t$Rd, $Rn, $Rm",
+                 [/* For disassembly only; pattern left blank */]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>,
+                 Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+
+/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                         InstrItinClass iis, PatFrag opnode,
+                         bit Commutable = 0> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_imm:$imm))]>,
+            Sched<[WriteALU, ReadALU]>;
+   // register
+   def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                rGPR:$Rm))]>,
+            Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+   }
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, iis,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_reg:$ShiftedRm))]>,
+            Sched<[WriteALUsi, ReadALUsr]>;
+}
+}
+
+/// T2I_rbin_s_is -  Same as T2I_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_rbin_s_is<PatFrag opnode> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, IIC_iALUi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALU, ReadALU]>;
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, IIC_iALUsi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+}
+}
+
+/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
+/// patterns for a binary operation that produces a value.
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
+                          bit Commutable = 0> {
+   // shifted imm
+   // The register-immediate version is re-materializable. This is useful
+   // in particular for taking the address of a local.
+   let isReMaterializable = 1 in {
+   def ri : T2sTwoRegImm<
+               (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+               opc, ".w\t$Rd, $Rn, $imm",
+               [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+               Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+     let Inst{15} = 0;
+   }
+   }
+   // 12-bit imm
+   def ri12 : T2I<
+                  (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+                  !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
+                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+                  Sched<[WriteALU, ReadALU]> {
+     bits<4> Rd;
+     bits<4> Rn;
+     bits<12> imm;
+     let Inst{31-27} = 0b11110;
+     let Inst{26} = imm{11};
+     let Inst{25-24} = 0b10;
+     let Inst{23-21} = op23_21;
+     let Inst{20} = 0; // The S bit.
+     let Inst{19-16} = Rn;
+     let Inst{15} = 0;
+     let Inst{14-12} = imm{10-8};
+     let Inst{11-8} = Rd;
+     let Inst{7-0} = imm{7-0};
+   }
+   // register
+   def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+                 IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+              [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]>,
+              Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+   }
+}
+
+/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
+/// for a binary operation that produces a value and use the carry
+/// bit. It's not predicable.
+let Defs = [CPSR], Uses = [CPSR] in {
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+   // shifted imm
+   def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+                 IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
+                 Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
+                 Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+         [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
+                 Requires<[IsThumb2]>, Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+}
+
+/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
+//  rotate operation that produces a value.
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
+   // 5-bit imm
+   def ri : T2sTwoRegShiftImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
+                 opc, ".w\t$Rd, $Rm, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]>,
+                 Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-21} = 0b010010;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{5-4} = opcod;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11111;
+     let Inst{26-23} = 0b0100;
+     let Inst{22-21} = opcod;
+     let Inst{15-12} = 0b1111;
+     let Inst{7-4} = 0b0000;
+   }
+
+  // Optional destination register
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+}
+
+/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to T2I_bin_irs except the instruction does not produce
+/// a explicit result, only implicitly set CPSR.
+multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode> {
+let isCompare = 1, Defs = [CPSR] in {
+   // shifted imm
+   def ri : T2OneRegCmpImm<
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
+                opc, ".w\t$Rn, $imm",
+                [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{15} = 0;
+     let Inst{11-8} = 0b1111; // Rd
+   }
+   // register
+   def rr : T2TwoRegCmp<
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+                opc, ".w\t$Rn, $Rm",
+                [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{11-8} = 0b1111; // Rd
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rn, $ShiftedRm",
+                [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+                Sched<[WriteCMPsi]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{11-8} = 0b1111; // Rd
+   }
+}
+
+  // Assembler aliases w/o the ".w" suffix.
+  // No alias here for 'rr' version as not all instantiations of this
+  // multiclass want one (CMP in particular, does not).
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+}
+
+/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
+multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = signed;
+    let Inst{23} = 1;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm12";
+  }
+  def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+    bits<4> Rt;
+    bits<13> addr;
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = 0;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11} = 1;
+    // Offset: index==TRUE, wback==FALSE
+    let Inst{10} = 1; // The P bit.
+    let Inst{9}     = addr{8};    // U
+    let Inst{8} = 0; // The W bit.
+    let Inst{7-0}   = addr{7-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm8";
+  }
+  def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = 0;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
+
+    let DecoderMethod = "DecodeT2LoadShift";
+  }
+
+  // pci variant is very similar to i12, but supports negative offsets
+  // from the PC.
+  def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+    let isReMaterializable = 1;
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = 0b1111; // Rn
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt{3-0};
+
+    bits<13> addr;
+    let Inst{23} = addr{12}; // add = (U == '1')
+    let Inst{11-0}  = addr{11-0};
+
+    let DecoderMethod = "DecodeT2LoadLabel";
+  }
+}
+
+/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
+multiclass T2I_st<bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0001;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm
+  }
+  def i8  : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0000;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+    let Inst{11} = 1;
+    // Offset: index==TRUE, wback==FALSE
+    let Inst{10} = 1; // The P bit.
+    let Inst{8} = 0; // The W bit.
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{9}     = addr{8};    // U
+    let Inst{7-0}   = addr{7-0};  // imm
+  }
+  def s   : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0000;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+    let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16}   = addr{9-6}; // Rn
+    let Inst{3-0} = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
+  }
+}
+
+/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, ".w\t$Rd, $Rm$rot",
+             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+             Requires<[IsThumb2]> {
+   let Inst{31-27} = 0b11111;
+   let Inst{26-23} = 0b0100;
+   let Inst{22-20} = opcod;
+   let Inst{19-16} = 0b1111; // Rn
+   let Inst{15-12} = 0b1111;
+   let Inst{7} = 1;
+
+   bits<2> rot;
+   let Inst{5-4} = rot{1-0}; // rotate
+}
+
+// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+             IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+            [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+          Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
+// supported yet.
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, "\t$Rd, $Rm$rot", []>,
+          Requires<[IsThumb2, HasT2ExtractPack]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2ThreeReg<(outs rGPR:$Rd),
+               (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+             [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
+           Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+class T2I_exta_rrot_np<bits<3> opcod, string opc>
+  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<12> label;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = label{11};
+  let Inst{14-12} = label{10-8};
+  let Inst{7-0}   = label{7-0};
+}
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
+              (ins t2adrlabel:$addr, pred:$p),
+              IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []>,
+              Sched<[WriteALU, ReadALU]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-24} = 0b10;
+  // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
+  let Inst{22} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<13> addr;
+  let Inst{11-8} = Rd;
+  let Inst{23}    = addr{12};
+  let Inst{21}    = addr{12};
+  let Inst{26}    = addr{11};
+  let Inst{14-12} = addr{10-8};
+  let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2Adr";
+}
+
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
+                                4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+let hasSideEffects = 1 in
+def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                                4, IIC_iALUi,
+                                []>, Sched<[WriteALU, ReadALU]>;
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+// Load
+let canFoldAsLoad = 1, isReMaterializable = 1  in
+defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
+                      UnOpFrag<(load node:$Src)>>;
+
+// Loads with zero extension
+defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPR, UnOpFrag<(zextloadi16 node:$Src)>>;
+defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPR, UnOpFrag<(zextloadi8  node:$Src)>>;
+
+// Loads with sign extension
+defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPR, UnOpFrag<(sextloadi16 node:$Src)>>;
+defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPR, UnOpFrag<(sextloadi8  node:$Src)>>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+// Load doubleword
+def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
+                        (ins t2addrmode_imm8s4:$addr),
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+// zextload i1 -> zextload i8
+def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+// extload -> zextload
+// FIXME: Reduce the number of patterns by legalizing extload to zextload
+// earlier?
+def : T2Pat<(extloadi1  t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi1  (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+def : T2Pat<(extloadi8  t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi8  (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
+            (t2LDRHpci  tconstpool:$addr)>;
+
+// FIXME: The destination register of the loads and stores can't be PC, but
+//        can be SP. We need another regclass (similar to rGPR) to represent
+//        that. Not a pressing issue since these are selected manually,
+//        not via pattern.
+
+// Indexed loads
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+                          "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []>;
+
+def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []>;
+
+def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+} // mayLoad = 1, neverHasSideEffects = 1
+
+// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
+// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
+class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = signed;
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 1; // load
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt;
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW.
+  let Inst{7-0} = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LoadT";
+}
+
+def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
+def t2LDRBT  : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>;
+def t2LDRHT  : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
+def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
+def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
+
+class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
+               string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
+            opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-24} = 0b000;
+  let Inst{23-20} = bits23_20;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
+                     (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+
+// Store
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
+                   BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
+// Store doubleword
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
+def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
+                       (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+
+// Indexed stores
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                            "str", "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
+def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                        "strh", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
+def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
+                        "strb", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+} // mayStore = 1, neverHasSideEffects = 1
+
+def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins GPRnopc:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+                          "str", "\t$Rt, $Rn$offset",
+                          "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+             [(set GPRnopc:$Rn_wb,
+                  (post_store GPRnopc:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strh", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+       [(set GPRnopc:$Rn_wb,
+             (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strb", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+        [(set GPRnopc:$Rn_wb,
+              (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+}
+
+// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
+// only.
+// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
+class T2IstT<bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = 0; // not signed
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 0; // store
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = addr{12-9};
+  let Inst{7-0}   = addr{7-0};
+}
+
+def t2STRT   : T2IstT<0b10, "strt", IIC_iStore_i>;
+def t2STRBT  : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
+def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
+
+// ldrd / strd pre / post variants
+// For disassembly only.
+
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+  let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
+
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+                 IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+                 "$addr.base = $wb", []> {
+  let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
+
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+                      t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+                string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+            asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2STL  : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stl", "\t$Rt, $addr", []>;
+def t2STLB : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlb", "\t$Rt, $addr", []>;
+def t2STLH : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlh", "\t$Rt, $addr", []>;
+
+// T2Ipl (Preload Data/Instruction) signals the memory system of possible future
+// data/instruction access.
+// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
+// (prefetch 1) -> (preload 2),  (prefetch 2) -> (preload 1).
+multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
+
+  def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
+                "\t$addr",
+              [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]>,
+              Sched<[WritePreLd]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 1;
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+
+    bits<17> addr;
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0}  = addr{11-0};  // imm12
+
+    let DecoderMethod = "DecodeT2LoadImm12";
+  }
+
+  def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
+                "\t$addr",
+            [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>,
+            Sched<[WritePreLd]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 0; // U = 0
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+    let Inst{11-8} = 0b1100;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{7-0}   = addr{7-0};  // imm8
+
+    let DecoderMethod = "DecodeT2LoadImm8";
+  }
+
+  def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
+               "\t$addr",
+             [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]>,
+             Sched<[WritePreLd]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 0; // add = TRUE for T1
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+    let Inst{11-6} = 0b000000;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm2
+
+    let DecoderMethod = "DecodeT2LoadShift";
+  }
+}
+
+defm t2PLD    : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
+defm t2PLDW   : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI    : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+// pci variant is very similar to i12, but supports negative offsets
+// from the PC. Only PLD and PLI have pci variants (not PLDW)
+class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
+               IIC_Preload, opc, "\t$addr",
+               [(ARMPreload (ARMWrapper tconstpool:$addr),
+                (i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
+  let Inst{31-25} = 0b1111100;
+  let Inst{24} = inst;
+  let Inst{22-20} = 0b001;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1111;
+
+  bits<13> addr;
+  let Inst{23}   = addr{12};   // add = (U == '1')
+  let Inst{11-0} = addr{11-0}; // imm12
+
+  let DecoderMethod = "DecodeT2LoadLabel";
+}
+
+def t2PLDpci : T2Iplpci<0, "pld">,  Requires<[IsThumb2]>;
+def t2PLIpci : T2Iplpci<1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+}
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+multiclass thumb2_st_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+}
+
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+
+} // neverHasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let neverHasSideEffects = 1 in
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+                   "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0000;
+}
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                pred:$p, zero_reg)>;
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                 pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                               pred:$p, CPSR)>;
+
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    AddedComplexity = 1 in
+def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
+                   "mov", ".w\t$Rd, $imm",
+                   [(set rGPR:$Rd, t2_so_imm:$imm)]>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
+// Use aliases to get that to play nice here.
+def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+
+def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                 pred:$p, zero_reg)>;
+def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                               pred:$p, zero_reg)>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
+                   "movw", "\t$Rd, $imm",
+                   [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def : t2InstAlias<"mov${p} $Rd, $imm",
+                  (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
+
+def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+let Constraints = "$src = $Rd" in {
+def t2MOVTi16 : T2I<(outs rGPR:$Rd),
+                    (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
+                    "movt", "\t$Rd, $imm",
+                    [(set rGPR:$Rd,
+                          (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
+                          Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0110;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                     (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                     Sched<[WriteALU]>;
+} // Constraints
+
+def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+def t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
+                              UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def t2SXTH  : T2I_ext_rrot<0b000, "sxth",
+                              UnOpFrag<(sext_inreg node:$Src, i16)>>;
+def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+def t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
+                               UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def t2UXTH   : T2I_ext_rrot<0b001, "uxth",
+                               UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
+                               UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+//        The transformation should probably be done as a combiner action
+//        instead so we can include a check for masking back in the upper
+//        eight bits of the source into the lower eight bits of the result.
+//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
+//            (t2UXTB16 rGPR:$Src, 3)>,
+//          Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
+            (t2UXTB16 rGPR:$Src, 1)>,
+        Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+defm t2ADD  : T2I_bin_ii12rs<0b000, "add",
+                             BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
+defm t2SUB  : T2I_bin_ii12rs<0b101, "sub",
+                             BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
+                             BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
+defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
+                             BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+let hasPostISelHook = 1 in {
+defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc",
+              BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
+defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc",
+              BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+}
+
+// RSB
+defm t2RSB  : T2I_rbin_irs  <0b1110, "rsb",
+                             BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm t2RSBS : T2I_rbin_s_is <BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+// The AddedComplexity preferences the first variant over the others since
+// it can be shrunk to a 16-bit wide encoding, while the others cannot.
+let AddedComplexity = 1 in
+def : T2Pat<(add        GPR:$src, imm1_255_neg:$imm),
+            (t2SUBri    GPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBri    GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, imm0_4095_neg:$imm),
+            (t2SUBri12  GPR:$src, imm0_4095_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, imm0_65535_neg:$imm),
+            (t2SUBrr    GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(ARMaddc    rGPR:$src, imm1_255_neg:$imm),
+            (t2SUBSri   rGPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(ARMaddc    rGPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBSri   rGPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMaddc    rGPR:$src, imm0_65535_neg:$imm),
+            (t2SUBSrr   rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+let AddedComplexity = 1 in
+def : T2Pat<(ARMadde    rGPR:$src, imm0_255_not:$imm, CPSR),
+            (t2SBCri    rGPR:$src, imm0_255_not:$imm)>;
+def : T2Pat<(ARMadde    rGPR:$src, t2_so_imm_not:$imm, CPSR),
+            (t2SBCri    rGPR:$src, t2_so_imm_not:$imm)>;
+def : T2Pat<(ARMadde    rGPR:$src, imm0_65535_neg:$imm, CPSR),
+            (t2SBCrr    rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>;
+
+// Select Bytes -- for disassembly only
+
+def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b010;
+  let Inst{23} = 0b1;
+  let Inst{22-20} = 0b010;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 0b1;
+  let Inst{6-4} = 0b000;
+}
+
+// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned)
+// And Miscellaneous operations -- for disassembly only
+class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
+              list<dag> pat = [/* For disassembly only; pattern left blank */],
+              dag iops = (ins rGPR:$Rn, rGPR:$Rm),
+              string asm = "\t$Rd, $Rn, $Rm">
+  : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>,
+    Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0101;
+  let Inst{22-20} = op22_20;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = op7_4;
+
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+// Saturating add/subtract -- for disassembly only
+
+def t2QADD    : T2I_pam<0b000, 0b1000, "qadd",
+                        [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QADD16  : T2I_pam<0b001, 0b0001, "qadd16">;
+def t2QADD8   : T2I_pam<0b000, 0b0001, "qadd8">;
+def t2QASX    : T2I_pam<0b010, 0b0001, "qasx">;
+def t2QDADD   : T2I_pam<0b000, 0b1001, "qdadd", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QDSUB   : T2I_pam<0b000, 0b1011, "qdsub", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSAX    : T2I_pam<0b110, 0b0001, "qsax">;
+def t2QSUB    : T2I_pam<0b000, 0b1010, "qsub",
+                        [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSUB16  : T2I_pam<0b101, 0b0001, "qsub16">;
+def t2QSUB8   : T2I_pam<0b100, 0b0001, "qsub8">;
+def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">;
+def t2UQADD8  : T2I_pam<0b000, 0b0101, "uqadd8">;
+def t2UQASX   : T2I_pam<0b010, 0b0101, "uqasx">;
+def t2UQSAX   : T2I_pam<0b110, 0b0101, "uqsax">;
+def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">;
+def t2UQSUB8  : T2I_pam<0b100, 0b0101, "uqsub8">;
+
+// Signed/Unsigned add/subtract -- for disassembly only
+
+def t2SASX    : T2I_pam<0b010, 0b0000, "sasx">;
+def t2SADD16  : T2I_pam<0b001, 0b0000, "sadd16">;
+def t2SADD8   : T2I_pam<0b000, 0b0000, "sadd8">;
+def t2SSAX    : T2I_pam<0b110, 0b0000, "ssax">;
+def t2SSUB16  : T2I_pam<0b101, 0b0000, "ssub16">;
+def t2SSUB8   : T2I_pam<0b100, 0b0000, "ssub8">;
+def t2UASX    : T2I_pam<0b010, 0b0100, "uasx">;
+def t2UADD16  : T2I_pam<0b001, 0b0100, "uadd16">;
+def t2UADD8   : T2I_pam<0b000, 0b0100, "uadd8">;
+def t2USAX    : T2I_pam<0b110, 0b0100, "usax">;
+def t2USUB16  : T2I_pam<0b101, 0b0100, "usub16">;
+def t2USUB8   : T2I_pam<0b100, 0b0100, "usub8">;
+
+// Signed/Unsigned halving add/subtract -- for disassembly only
+
+def t2SHASX   : T2I_pam<0b010, 0b0010, "shasx">;
+def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">;
+def t2SHADD8  : T2I_pam<0b000, 0b0010, "shadd8">;
+def t2SHSAX   : T2I_pam<0b110, 0b0010, "shsax">;
+def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">;
+def t2SHSUB8  : T2I_pam<0b100, 0b0010, "shsub8">;
+def t2UHASX   : T2I_pam<0b010, 0b0110, "uhasx">;
+def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">;
+def t2UHADD8  : T2I_pam<0b000, 0b0110, "uhadd8">;
+def t2UHSAX   : T2I_pam<0b110, 0b0110, "uhsax">;
+def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">;
+def t2UHSUB8  : T2I_pam<0b100, 0b0110, "uhsub8">;
+
+// Helper class for disassembly only
+// A6.3.16 & A6.3.17
+// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions.
+class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2FourReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+def t2USAD8   : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                                           (ins rGPR:$Rn, rGPR:$Rm),
+                        NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2USADA8  : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                       (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
+                        "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+
+// Signed/Unsigned saturate.
+class T2SatI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> sat_imm;
+  bits<7> sh;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{4-0}   = sat_imm;
+  let Inst{21}    = sh{5};
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+def t2SSAT: T2SatI<
+              (outs rGPR:$Rd),
+              (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1100;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{5}  = 0;
+}
+
+def t2SSAT16: T2SatI<
+                (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
+                "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1100;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{21} = 1;        // sh = '1'
+  let Inst{14-12} = 0b000; // imm3 = '000'
+  let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
+}
+
+def t2USAT: T2SatI<
+               (outs rGPR:$Rd),
+               (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1110;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+}
+
+def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
+                     NoItinerary,
+                     "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-22} = 0b1111001110;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{21} = 1;        // sh = '1'
+  let Inst{14-12} = 0b000; // imm3 = '000'
+  let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
+}
+
+def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>;
+
+//===----------------------------------------------------------------------===//
+//  Shift and rotate Instructions.
+//
+
+defm t2LSL  : T2I_sh_ir<0b00, "lsl", imm0_31,
+                        BinOpFrag<(shl  node:$LHS, node:$RHS)>>;
+defm t2LSR  : T2I_sh_ir<0b01, "lsr", imm_sr,
+                        BinOpFrag<(srl  node:$LHS, node:$RHS)>>;
+defm t2ASR  : T2I_sh_ir<0b10, "asr", imm_sr,
+                        BinOpFrag<(sra  node:$LHS, node:$RHS)>>;
+defm t2ROR  : T2I_sh_ir<0b11, "ror", imm0_31,
+                        BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
+
+// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
+def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
+            (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
+
+let Uses = [CPSR] in {
+def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                   "rrx", "\t$Rd, $Rm",
+                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0011;
+}
+}
+
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+def t2MOVsrl_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "lsrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]>,
+                        Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b01; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+def t2MOVsra_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "asrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]>,
+                        Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b10; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+}
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm t2AND  : T2I_bin_w_irs<0b0000, "and",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR  : T2I_bin_w_irs<0b0010, "orr",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm t2EOR  : T2I_bin_w_irs<0b0100, "eor",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+
+defm t2BIC  : T2I_bin_w_irs<0b0001, "bic",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+class T2BitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<5> msb;
+  bits<5> lsb;
+
+  let Inst{11-8}  = Rd;
+  let Inst{4-0}   = msb{4-0};
+  let Inst{14-12} = lsb{4-2};
+  let Inst{7-6}   = lsb{1-0};
+}
+
+class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2BitFI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+
+  let Inst{19-16} = Rn;
+}
+
+let Constraints = "$src = $Rd" in
+def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iUNAsi, "bfc", "\t$Rd, $imm",
+                [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26} = 0; // should be 0.
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+  let Inst{5} = 0; // should be 0.
+
+  bits<10> imm;
+  let msb{4-0} = imm{9-5};
+  let lsb{4-0} = imm{4-0};
+}
+
+def t2SBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+                 IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10100;
+  let Inst{15} = 0;
+}
+
+def t2UBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+                 IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b11100;
+  let Inst{15} = 0;
+}
+
+// A8.8.247  UDF - Undefined (Encoding T2)
+def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16",
+                 [(int_arm_undefined imm0_65535:$imm16)]> {
+  bits<16> imm16;
+  let Inst{31-29} = 0b111;
+  let Inst{28-27} = 0b10;
+  let Inst{26-20} = 0b1111111;
+  let Inst{19-16} = imm16{15-12};
+  let Inst{15} = 0b1;
+  let Inst{14-12} = 0b010;
+  let Inst{11-0} = imm16{11-0};
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding T1)
+let Constraints = "$src = $Rd" in {
+  def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd),
+                  (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm),
+                  IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm",
+                  [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
+                                   bf_inv_mask_imm:$imm))]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{26} = 0; // should be 0.
+    let Inst{25} = 1;
+    let Inst{24-20} = 0b10110;
+    let Inst{15} = 0;
+    let Inst{5} = 0; // should be 0.
+
+    bits<10> imm;
+    let msb{4-0} = imm{9-5};
+    let lsb{4-0} = imm{4-0};
+  }
+}
+
+defm t2ORN  : T2I_bin_irs<0b0011, "orn",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                          BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">;
+
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                      PatFrag opnode,
+                      bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
+   // shifted imm
+   def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+                opc, "\t$Rd, $imm",
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]>, Sched<[WriteALU]> {
+     let isAsCheapAsAMove = Cheap;
+     let isReMaterializable = ReMat;
+     let isMoveImm = MoveImm;
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{15} = 0;
+   }
+   // register
+   def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+                opc, ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rd, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]>,
+                Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+   }
+}
+
+// Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
+let AddedComplexity = 1 in
+defm t2MVN  : T2I_un_irs <0b0011, "mvn",
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
+                          UnOpFrag<(not node:$Src)>, 1, 1, 1>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(and     rGPR:$src, t2_so_imm_not:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
+
+// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 rGPR:$src), [{
+  return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+  }]>;
+
+// so_imm_notSext is needed instead of so_imm_not, as the value of imm
+// will match the extended, not the original bitWidth for $src.
+def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>;
+
+
+// FIXME: Disable this pattern on Darwin to workaround an assembler bug.
+def : T2Pat<(or      rGPR:$src, t2_so_imm_not:$imm),
+            (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
+            Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+            (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+let isCommutable = 1 in
+def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                "mul", "\t$Rd, $Rn, $Rm",
+                [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+def t2MLA: T2FourReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "mla", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]>,
+           Requires<[IsThumb2, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+def t2MLS: T2FourReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]>,
+           Requires<[IsThumb2, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{7-4} = 0b0001; // Multiply and Subtract
+}
+
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def t2SMULL : T2MulLong<0b000, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+                   "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+
+def t2UMULL : T2MulLong<0b010, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+                   "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+} // isCommutable
+
+// Multiply + accumulate
+def t2SMLAL : T2MlaLong<0b100, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+                  "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                  RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
+
+def t2UMLAL : T2MlaLong<0b110, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+                  "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                  RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
+
+def t2UMAAL : T2MulLong<0b110, 0b0110,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+                  "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+} // neverHasSideEffects
+
+// Rounding variants of the below included for disassembly only
+
+// Most significant word multiply
+def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                  "smmul", "\t$Rd, $Rn, $Rm",
+                  [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                  "smmulr", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
+}
+
+def t2SMMLA : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>,
+              Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMLAR: T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                  "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
+}
+
+def t2SMMLS: T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmls", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>,
+             Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b110;
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMLSR:T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b110;
+  let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
+}
+
+multiclass T2I_smul<string opc, PatFrag opnode> {
+  def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
+                                      (sext_inreg rGPR:$Rm, i16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
+                                      (sra rGPR:$Rm, (i32 16))))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
+                                      (sext_inreg rGPR:$Rm, i16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
+                                      (sra rGPR:$Rm, (i32 16))))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
+                                    (sext_inreg rGPR:$Rm, i16)), (i32 16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
+                                    (sra rGPR:$Rm, (i32 16))), (i32 16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+
+multiclass T2I_smla<string opc, PatFrag opnode> {
+  def BB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra,
+                               (opnode (sext_inreg rGPR:$Rn, i16),
+                                       (sext_inreg rGPR:$Rm, i16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+             !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
+                                                 (sra rGPR:$Rm, (i32 16)))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                               (sext_inreg rGPR:$Rm, i16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                                 (sra rGPR:$Rm, (i32 16)))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
+                                    (sext_inreg rGPR:$Rm, i16)), (i32 16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
+                                      (sra rGPR:$Rm, (i32 16))), (i32 16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// Halfword multiple accumulate long: SMLAL<x><y>
+def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+
+// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+def t2SMUAD: T2ThreeReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMUADX:T2ThreeReg_mac<
+            0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMUSD: T2ThreeReg_mac<
+            0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMUSDX:T2ThreeReg_mac<
+            0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMLAD   : T2FourReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLADX  : T2FourReg_mac<
+            0, 0b010, 0b0001, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSD   : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSDX  : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALD  : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSLD  : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+
+//===----------------------------------------------------------------------===//
+//  Division Instructions.
+//  Signed and unsigned division on v7-M
+//
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+                 "sdiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb2]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b011100;
+  let Inst{20} = 0b1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b1111;
+}
+
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+                 "udiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb2]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b011101;
+  let Inst{20} = 0b1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
+      InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-22} = 0b01010;
+  let Inst{21-20} = op1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6} = 0b10;
+  let Inst{5-4} = op2;
+  let Rn{3-0} = Rm;
+}
+
+def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                    "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>,
+                    Sched<[WriteALU]>;
+
+def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                      "rbit", "\t$Rd, $Rm",
+                      [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>,
+                      Sched<[WriteALU]>;
+
+def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                 "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>,
+                 Sched<[WriteALU]>;
+
+def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "rev16", ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>,
+                Sched<[WriteALU]>;
+
+def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "revsh", ".w\t$Rd, $Rm",
+                 [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>,
+                 Sched<[WriteALU]>;
+
+def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
+                (and (srl rGPR:$Rm, (i32 8)), 0xFF)),
+            (t2REVSH rGPR:$Rm)>;
+
+def t2PKHBT : T2ThreeReg<
+            (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh),
+                  IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
+                                      (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Sched<[WriteALUsi, ReadALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 0; // BT form
+  let Inst{4} = 0;
+
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
+            (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
+            (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def t2PKHTB : T2ThreeReg<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh),
+                  IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
+                                       (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
+                                            0xFFFF)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Sched<[WriteALUsi, ReadALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 1; // TB form
+  let Inst{4} = 0;
+
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
+                (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+//===----------------------------------------------------------------------===//
+// CRC32 Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary,
+               !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"),
+               [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>,
+               Requires<[IsThumb2, HasV8, HasCRC]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b010110;
+  let Inst{20}    = C;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6}   = 0b10;
+  let Inst{5-4}   = sz;
+}
+
+def t2CRC32B  : T2I_crc32<0, 0b00, "b", int_arm_crc32b>;
+def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def t2CRC32H  : T2I_crc32<0, 0b01, "h", int_arm_crc32h>;
+def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def t2CRC32W  : T2I_crc32<0, 0b10, "w", int_arm_crc32w>;
+def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
+                          IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
+                          BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_imm:$imm),
+            (t2CMPri  GPRnopc:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, rGPR:$rhs),
+            (t2CMPrr  GPRnopc:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_reg:$rhs),
+            (t2CMPrs  GPRnopc:$lhs, t2_so_reg:$rhs)>;
+
+let isCompare = 1, Defs = [CPSR] in {
+   // shifted imm
+   def t2CMNri : T2OneRegCmpImm<
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
+                "cmn", ".w\t$Rn, $imm",
+                [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]>,
+                Sched<[WriteCMP, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{15} = 0;
+     let Inst{11-8} = 0b1111; // Rd
+   }
+   // register
+   def t2CMNzrr : T2TwoRegCmp<
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
+                "cmn", ".w\t$Rn, $Rm",
+                [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                  GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{11-8} = 0b1111; // Rd
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def t2CMNzrs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
+                "cmn", ".w\t$Rn, $ShiftedRm",
+                [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                  GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+                  Sched<[WriteCMPsi, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{11-8} = 0b1111; // Rd
+   }
+}
+
+// Assembler aliases w/o the ".w" suffix.
+// No alias here for 'rr' version as not all instantiations of this multiclass
+// want one (CMP in particular, does not).
+def : t2InstAlias<"cmn${p} $Rn, $imm",
+   (t2CMNri GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rn, $shift",
+   (t2CMNzrs GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+
+def : T2Pat<(ARMcmp  GPR:$src, t2_so_imm_neg:$imm),
+            (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+
+def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
+            (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>;
+
+defm t2TST  : T2I_cmp_irs<0b0000, "tst",
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
+defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
+
+// Conditional moves
+let neverHasSideEffects = 1 in {
+
+let isCommutable = 1, isSelect = 1 in
+def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
+                            (ins rGPR:$false, rGPR:$Rm, cmovpred:$p),
+                            4, IIC_iCMOVr,
+                            [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm,
+                                                     cmovpred:$p))]>,
+               RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def t2MOVCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isCodeGenOnly = 1 in {
+let isMoveImm = 1 in
+def t2MOVCCi16
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins  rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def t2MVNCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd,
+                         (ARMcmov rGPR:$false, t2_so_imm_not:$imm,
+                                  cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+class MOVCCShPseudo<SDPatternOperator opnode, Operand ty>
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVsi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,
+                                            (opnode rGPR:$Rm, (i32 ty:$imm)),
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def t2MOVCClsl : MOVCCShPseudo<shl,  imm0_31>;
+def t2MOVCClsr : MOVCCShPseudo<srl,  imm_sr>;
+def t2MOVCCasr : MOVCCShPseudo<sra,  imm_sr>;
+def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>;
+
+let isMoveImm = 1 in
+def t2MOVCCi32imm
+    : t2PseudoInst<(outs rGPR:$dst),
+                   (ins rGPR:$false, i32imm:$src, cmovpred:$p),
+                   8, IIC_iCMOVix2,
+                   [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src,
+                                             cmovpred:$p))]>,
+      RegConstraint<"$false = $dst">;
+} // isCodeGenOnly = 1
+
+} // neverHasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+                Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f5;
+  let Inst{3-0} = opt;
+}
+
+def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+                Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f4;
+  let Inst{3-0} = opt;
+}
+
+def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
+                "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+                Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f6;
+  let Inst{3-0} = opt;
+}
+}
+
+class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{11-8} = rt2;
+  let Inst{7-4} = opcod;
+  let Inst{3-0} = 0b1111;
+
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-8} = rt2;
+  let Inst{7-4} = opcod;
+
+  bits<4> Rd;
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{3-0}  = Rd;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+let mayLoad = 1 in {
+def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexb", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
+def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexh", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
+def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldrex", "\t$Rt, $addr", "",
+                     [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]> {
+  bits<4> Rt;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000101;
+  let Inst{19-16} = addr{11-8};
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = addr{7-0};
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexb", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8]>;
+def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexh", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8]>;
+def t2LDAEX  : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldaex", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = 0b11101111;
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+
+  let Inst{7} = 1;
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexb", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>;
+def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexh", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>;
+
+def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             t2addrmode_imm0_1020s4:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "strex", "\t$Rd, $Rt, $addr", "",
+                  [(set rGPR:$Rd,
+                        (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000100;
+  let Inst{19-16} = addr{11-8};
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = Rd;
+  let Inst{7-0} = addr{7-0};
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexb", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+                         Requires<[IsThumb, HasV8]>;
+
+def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexh", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+                         Requires<[IsThumb, HasV8]>;
+
+def t2STLEX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             addr_offset_none:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "stlex", "\t$Rd, $Rt, $addr", "",
+                  [(set rGPR:$Rd,
+                        (stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
+                  Requires<[IsThumb, HasV8]> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4}  = 0b11111110;
+  let Inst{3-0}   = Rd;
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+}
+
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
+            Requires<[IsThumb2, HasV7]>  {
+  let Inst{31-16} = 0xf3bf;
+  let Inst{15-14} = 0b10;
+  let Inst{13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-4} = 0b0010;
+  let Inst{3-0} = 0b1111;
+}
+
+def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+            (t2LDREXB addr_offset_none:$addr)>;
+def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+            (t2LDREXH addr_offset_none:$addr)>;
+def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+            (t2STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+            (t2STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
+def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff),
+            (t2LDAEXB addr_offset_none:$addr)>;
+def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff),
+            (t2LDAEXH addr_offset_none:$addr)>;
+def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+            (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+            (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+//   $val is a scratch register for our use.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                             Requires<[IsThumb2, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                                  Requires<[IsThumb2, NoVFP]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                   reglist:$regs, variable_ops),
+                              4, IIC_iLoad_mBr, [],
+            (t2LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+                         RegConstraint<"$Rn = $wb">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+let isPredicable = 1 in
+def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+                 "b", ".w\t$target",
+                 [(br bb:$target)]>, Sched<[WriteBr]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 1;
+
+  bits<24> target;
+  let Inst{26} = target{23};
+  let Inst{13} = target{22};
+  let Inst{11} = target{21};
+  let Inst{25-16} = target{20-11};
+  let Inst{10-0} = target{10-0};
+  let DecoderMethod = "DecodeT2BInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+let isNotDuplicable = 1, isIndirectBranch = 1 in {
+def t2BR_JT : t2PseudoInst<(outs),
+          (ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id),
+           0, IIC_Br,
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>,
+          Sched<[WriteBr]>;
+
+// FIXME: Add a non-pc based case that can be predicated.
+def t2TBB_JT : t2PseudoInst<(outs),
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
+
+def t2TBH_JT : t2PseudoInst<(outs),
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
+
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+                    "tbb", "\t$addr", []>, Sched<[WriteBrTbl]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 0; // B form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+                   "tbh", "\t$addr", []>, Sched<[WriteBrTbl]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 1; // H form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+} // isNotDuplicable, isIndirectBranch
+
+} // isBranch, isTerminator, isBarrier
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects ", "two operands. :(
+let isBranch = 1, isTerminator = 1 in
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+                "b", ".w\t$target",
+                [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 0;
+
+  bits<4> p;
+  let Inst{25-22} = p;
+
+  bits<21> target;
+  let Inst{26} = target{20};
+  let Inst{11} = target{19};
+  let Inst{13} = target{18};
+  let Inst{21-16} = target{17-12};
+  let Inst{10-0} = target{11-1};
+
+  let DecoderMethod = "DecodeThumb2BCCInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+// Tail calls. The MachO version of thumb tail calls uses a t2 branch, so
+// it goes here.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS version.
+  let Uses = [SP] in
+  def tTAILJMPd: tPseudoExpand<(outs),
+                   (ins uncondbrtarget:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (t2B uncondbrtarget:$dst, pred:$p)>,
+                 Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
+}
+
+// IT block
+let Defs = [ITSTATE] in
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+                    AddrModeNone, 2,  IIC_iALUx,
+                    "it$mask\t$cc", "", []>,
+           ComplexDeprecationPredicate<"IT"> {
+  // 16-bit instruction.
+  let Inst{31-16} = 0x0000;
+  let Inst{15-8} = 0b10111111;
+
+  bits<4> cc;
+  bits<4> mask;
+  let Inst{7-4} = cc;
+  let Inst{3-0} = mask;
+
+  let DecoderMethod = "DecodeIT";
+}
+
+// Branch and Exchange Jazelle -- for disassembly only
+// Rm = Inst{19-16}
+def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []>,
+    Sched<[WriteBr]> {
+  bits<4> func;
+  let Inst{31-27} = 0b11110;
+  let Inst{26} = 0;
+  let Inst{25-20} = 0b111100;
+  let Inst{19-16} = func;
+  let Inst{15-0} = 0b1000111100000000;
+}
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+  def tCBZ  : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbz\t$Rn, $target", []>,
+              T1Misc<{0,0,?,1,?,?,?}>,
+              Requires<[IsThumb2]>, Sched<[WriteBr]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+
+  def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbnz\t$Rn, $target", []>,
+              T1Misc<{1,0,?,1,?,?,?}>,
+              Requires<[IsThumb2]>, Sched<[WriteBr]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+}
+
+
+// Change Processor State is a system instruction.
+// FIXME: Since the asm parser has currently no clean way to handle optional
+// operands, create 3 versions of the same instruction. Once there's a clean
+// framework to represent optional operands, change this behavior.
+class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
+            !strconcat("cps", asm_op), []> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-11} = 0b111100111010111110000;
+  let Inst{10-9}  = imod;
+  let Inst{8}     = M;
+  let Inst{7-5}   = iflags;
+  let Inst{4-0}   = mode;
+  let DecoderMethod = "DecodeT2CPSInstruction";
+}
+
+let M = 1 in
+  def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+                      "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
+                      "$imod.w\t$iflags">;
+let imod = 0, iflags = 0, M = 1 in
+  def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
+
+def : t2InstAlias<"cps$imod.w $iflags, $mode",
+                   (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>;
+def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
+
+// A6.3.4 Branches and miscellaneous control
+// Table A6-14 Change Processor State, and hint instructions
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",
+                  [(int_arm_hint imm0_239:$imm)]> {
+  bits<8> imm;
+  let Inst{31-3} = 0b11110011101011111000000000000;
+  let Inst{7-0} = imm;
+}
+
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> {
+  let Predicates = [IsThumb2, HasV8];
+}
+
+def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
+  bits<4> opt;
+  let Inst{31-20} = 0b111100111010;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-8} = 0b10000000;
+  let Inst{7-4} = 0b1111;
+  let Inst{3-0} = opt;
+}
+
+// Secure Monitor Call is a system instruction.
+// Option = Inst{19-16}
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+                []>, Requires<[IsThumb2, HasTrustZone]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111111;
+  let Inst{15-12} = 0b1000;
+
+  bits<4> opt;
+  let Inst{19-16} = opt;
+}
+
+class T2DCPS<bits<2> opt, string opc>
+  : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111000;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-2} = 0b0000000000;
+  let Inst{1-0} = opt;
+}
+
+def t2DCPS1 : T2DCPS<0b01, "dcps1">;
+def t2DCPS2 : T2DCPS<0b10, "dcps2">;
+def t2DCPS3 : T2DCPS<0b11, "dcps3">;
+
+class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<5> mode;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24-23} = Op;
+  let Inst{22} = 0;
+  let Inst{21} = W;
+  let Inst{20-16} = 0b01101;
+  let Inst{15-5} = 0b11000000000;
+  let Inst{4-0} = mode{4-0};
+}
+
+// Store Return State is a system instruction.
+def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsdb", "\tsp!, $mode", []>;
+def t2SRSDB  : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsdb","\tsp, $mode", []>;
+def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsia","\tsp!, $mode", []>;
+def t2SRSIA  : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsia","\tsp, $mode", []>;
+
+
+def : t2InstAlias<"srsdb${p} $mode", (t2SRSDB imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsdb${p} $mode!", (t2SRSDB_UPD imm0_31:$mode, pred:$p)>;
+
+def : t2InstAlias<"srsia${p} $mode", (t2SRSIA imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsia${p} $mode!", (t2SRSIA_UPD imm0_31:$mode, pred:$p)>;
+
+// Return From Exception is a system instruction.
+class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-20} = op31_20{11-0};
+
+  bits<4> Rn;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xc000;
+}
+
+def t2RFEDBW : T2RFE<0b111010000011,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEDB  : T2RFE<0b111010000001,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIAW : T2RFE<0b111010011011,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIA  : T2RFE<0b111010011001,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+
+// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
+// Exception return instruction is "subs pc, lr, #imm".
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
+                        "subs", "\tpc, lr, $imm",
+                        [(ARMintretflag imm0_255:$imm)]>,
+                   Requires<[IsThumb2]> {
+  let Inst{31-8} = 0b111100111101111010001111;
+
+  bits<8> imm;
+  let Inst{7-0} = imm;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                            [(set rGPR:$dst, (i32 imm:$src))]>,
+                            Requires<[IsThumb, UseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if pic).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2addpc,
+                          [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                          Requires<[IsThumb2, UseMovt]>;
+
+}
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : T2Pat<(ARMWrapper  tconstpool  :$dst), (t2LEApcrel tconstpool  :$dst)>;
+def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+           Requires<[IsThumb2, UseMovt]>;
+
+def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+            (t2LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   IIC_iLoadiALU,
+              [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb2]>;
+
+// Pseudo isntruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+                       NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
+  : T2I<oops, iops, NoItinerary, opc, asm, []> {
+  let Inst{31-28} = op31_28;
+  let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+  def _OFFSET : T2CI<op31_28,
+                     (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                     asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                  asm, "\t$cop, $CRd, $addr!"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                               postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset"> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : T2CI<op31_28, (outs),
+                     (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                          coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc">;
+defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
+defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
+defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8]>;
+
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register -- for disassembly only
+//
+// Move to ARM core register from Special Register
+
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
+                  []>, Requires<[IsThumb2,IsNotMClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b0000;
+}
+
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+                   []>, Requires<[IsThumb2,IsNotMClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111111111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b0000;
+}
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
+                  "mrs", "\t$Rd, $mask", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<4> Rd;
+  bits<8> mask;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{19-16} = 0b1111;
+  let Inst{7-0} = mask;
+}
+
+
+// Move from ARM core register to Special Register
+//
+// A/R class MSR.
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+                   NoItinerary, "msr", "\t$mask, $Rn", []>,
+               Requires<[IsThumb2,IsNotMClass]> {
+  bits<5> mask;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = mask{4}; // R Bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8}  = mask{3-0};
+  let Inst{7-0}   = 0;
+}
+
+// M class MSR.
+//
+// Move from ARM core register to Special Register
+def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
+                  NoItinerary, "msr", "\t$SYSm, $Rn", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<12> SYSm;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = 0b0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-0}  = SYSm;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register
+//
+
+class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
+                  list<dag> pattern>
+  : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
+          pattern> {
+  let Inst{27-24} = 0b1110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+class t2MovRRCopro<bits<4> Op, string opc, bit direction,
+                   list<dag> pattern = []>
+  : T2Cop<Op, (outs),
+          (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
+          opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+/* from ARM core register to coprocessor */
+def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
+           (outs),
+           (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                c_imm:$CRm, imm0_7:$opc2),
+           [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                         imm:$CRm, imm:$opc2)]>,
+           ComplexDeprecationPredicate<"MCR">;
+def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, 0, pred:$p)>;
+def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
+             (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                          c_imm:$CRm, imm0_7:$opc2),
+             [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                            imm:$CRm, imm:$opc2)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                          c_imm:$CRm, 0, pred:$p)>;
+
+/* from coprocessor to ARM core register */
+def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []>;
+def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0, pred:$p)>;
+
+def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []> {
+  let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                          c_imm:$CRm, 0, pred:$p)>;
+
+def : T2v6Pat<(int_arm_mrc  imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+              (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+              (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+
+/* from ARM core register to coprocessor */
+def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0,
+                        [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                       imm:$CRm)]>;
+def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0,
+                           [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+                                           GPR:$Rt2, imm:$CRm)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+
+/* from coprocessor to ARM core register */
+def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>;
+
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1> {
+  let Predicates = [IsThumb2, PreV8];
+}
+
+//===----------------------------------------------------------------------===//
+// Other Coprocessor Instructions.
+//
+
+def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+                 c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+                 "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                 [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                               imm:$CRm, imm:$opc2)]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
+}
+
+def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+                   c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+                   "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                   [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
+            (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
+            (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+}
+
+def : T2Pat<(sext_inreg rGPR:$Src, i8),  (t2SXTB rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
+            (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
+            (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Atomic load/store patterns
+def : T2Pat<(atomic_load_8   t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_imm12:$addr),
+            (t2LDRi12   t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_negimm8:$addr),
+            (t2LDRi8    t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_so_reg:$addr),
+            (t2LDRs     t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRBi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRBi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRBs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRHi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRHi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRHs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRi12   GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRi8    GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
+
+let AddedComplexity = 8 in {
+  def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr),  (t2LDAB addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA  addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (t2STLB GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Aliases for ADC without the ".w" optional width specifier.
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm",
+                  (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for SBC without the ".w" optional width specifier.
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm",
+                  (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
+         cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
+              (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+      (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+           (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $Rm",
+            (t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// add w/ negative immediates is just a sub.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+                 cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+               cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+                 cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+               cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
+              (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"sub${s}${p} $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm",
+            (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $Rm",
+            (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Alias for compares without the ".w" optional width specifier.
+def : t2InstAlias<"cmn${p} $Rn, $Rm",
+                  (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"teq${p} $Rn, $Rm",
+                  (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"tst${p} $Rn, $Rm",
+                  (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+
+// Memory barriers
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>;
+
+// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRpci GPRnopc:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+
+// Alias for MVN with(out) the ".w" optional width specifier.
+def : t2InstAlias<"mvn${s}${p}.w $Rd, $imm",
+           (t2MVNi rGPR:$Rd, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
+           (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
+           (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// PUSH/POP aliases for STM/LDM
+def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// STMIA/STMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stm${p} $Rn!, $regs",
+                  (t2STMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMIA/LDMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"ldm${p} $Rn, $regs",
+                  (t2LDMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldm${p} $Rn!, $regs",
+                  (t2LDMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// STMDB/STMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"stmdb${p}.w $Rn, $regs",
+                  (t2STMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stmdb${p}.w $Rn!, $regs",
+                  (t2STMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMDB/LDMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"ldmdb${p}.w $Rn, $regs",
+                  (t2LDMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldmdb${p}.w $Rn!, $regs",
+                  (t2LDMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for REV/REV16/REVSH without the ".w" optional width specifier.
+def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+
+
+// Alias for RSB without the ".w" optional width specifier, and with optional
+// implied destination register.
+def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
+           (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
+           (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm",
+           (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
+           (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
+                    cc_out:$s)>;
+
+// SSAT/USAT optional shift operand.
+def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                  (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                  (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+
+// STM w/o the .w suffix.
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for STR, STRB, and STRH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Extend instruction optional rotate operand.
+def : t2InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm",
+                (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb${p} $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm",
+                (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+// Extend instruction w/o the ".w" optional width specifier.
+def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
+                  (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+                  (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
+                  (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
+                  (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+                  (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
+                  (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+
+// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : t2InstAlias<"mov${p} $Rd, $imm",
+                  (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstAlias<"mvn${p} $Rd, $imm",
+                  (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+// Same for AND <--> BIC
+def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+                  (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+                  (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+// Likewise, "add Rd, t2_so_imm_neg" -> sub
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+                  (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $imm",
+                  (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
+                           pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via t2_so_imm_neg
+def : t2InstAlias<"cmp${p} $Rd, $imm",
+                  (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rd, $imm",
+                  (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+
+
+// Wide 'mul' encoding can be specified with only two operands.
+def : t2InstAlias<"mul${p} $Rn, $Rm",
+                  (t2MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p)>;
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
+                  (t2RSBri rGPR:$Rd, rGPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
+// these, unfortunately.
+def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
+                         (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
+                          (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+
+def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift",
+                         (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift",
+                          (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+
+// ADR w/o the .w suffix
+def : t2InstAlias<"adr${p} $Rd, $addr",
+                  (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>;
+
+// LDR(literal) w/ alternate [pc, #imm] syntax.
+def t2LDRpcrel   : t2AsmPseudo<"ldr${p} $Rt, $addr",
+                         (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRBpcrel  : t2AsmPseudo<"ldrb${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRHpcrel  : t2AsmPseudo<"ldrh${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSBpcrel  : t2AsmPseudo<"ldrsb${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSHpcrel  : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+    // Version w/ the .w suffix.
+def : t2InstAlias<"ldr${p}.w $Rt, $addr",
+                  (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
+def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
+                  (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
+                  (t2LDRHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p}.w $Rt, $addr",
+                  (t2LDRSBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
+                  (t2LDRSHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"add${p} $Rd, pc, $imm",
+                  (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+
+// PLD/PLDW/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+                  (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+                 (t2PLIpci  t2ldr_pcrel_imm12:$addr, pred:$p)>,
+      Requires<[IsThumb2,HasV7]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 0000000..55a6efc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,1782 @@
+//===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_FTOI    : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDT_ITOF    : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+                                       SDTCisSameAs<1, 2>]>;
+
+def arm_ftoui  : SDNode<"ARMISD::FTOUI",   SDT_FTOI>;
+def arm_ftosi  : SDNode<"ARMISD::FTOSI",   SDT_FTOI>;
+def arm_sitof  : SDNode<"ARMISD::SITOF",   SDT_ITOF>;
+def arm_uitof  : SDNode<"ARMISD::UITOF",   SDT_ITOF>;
+def arm_fmstat : SDNode<"ARMISD::FMSTAT",  SDTNone, [SDNPInGlue, SDNPOutGlue]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "parseFPImm";
+}
+
+def vfp_f32imm : Operand<f32>,
+                 PatLeaf<(f32 fpimm), [{
+      return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def vfp_f64imm : Operand<f64>,
+                 PatLeaf<(f64 fpimm), [{
+      return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+// The VCVT to/from fixed-point instructions encode the 'fbits' operand
+// (the number of fixed bits) differently than it appears in the assembly
+// source. It's encoded as "Size - fbits" where Size is the size of the
+// fixed-point representation (32 or 16) and fbits is the value appearing
+// in the assembly source, an integer in [0,16] or (0,32], depending on size.
+def fbits32_asm_operand : AsmOperandClass { let Name = "FBits32"; }
+def fbits32 : Operand<i32> {
+  let PrintMethod = "printFBits32";
+  let ParserMatchClass = fbits32_asm_operand;
+}
+
+def fbits16_asm_operand : AsmOperandClass { let Name = "FBits16"; }
+def fbits16 : Operand<i32> {
+  let PrintMethod = "printFBits16";
+  let ParserMatchClass = fbits16_asm_operand;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+
+def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
+                 IIC_fpLoad64, "vldr", "\t$Dd, $addr",
+                 [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>;
+
+def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
+                 IIC_fpLoad32, "vldr", "\t$Sd, $addr",
+                 [(set SPR:$Sd, (load addrmode5:$addr))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
+
+def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
+                 IIC_fpStore64, "vstr", "\t$Dd, $addr",
+                 [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>;
+
+def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
+                 IIC_fpStore32, "vstr", "\t$Sd, $addr",
+                 [(store SPR:$Sd, addrmode5:$addr)]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass vfp_ldst_mult<string asm, bit L_bit,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // Double Precision
+  def DIA :
+    AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DIA_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def DDB_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+
+  // Single Precision
+  def SIA :
+    AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SIA_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SDB_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+}
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
+
+} // neverHasSideEffects
+
+def : MnemonicAlias<"vldm", "vldmia">;
+def : MnemonicAlias<"vstm", "vstmia">;
+
+// FLDM/FSTM - Load / Store multiple single / double precision registers for
+// pre-ARMv6 cores.
+// These instructions are deprecated!
+def : VFP2MnemonicAlias<"fldmias", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbs", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmeas", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfds", "vldmia">;
+def : VFP2MnemonicAlias<"fldmiad", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbd", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmead", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfdd", "vldmia">;
+
+def : VFP2MnemonicAlias<"fstmias", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbs", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmeas", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfds", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmiad", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmead", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
+
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
+
+// FLDMX, FSTMX - Load and store multiple unknown precision registers for
+// pre-armv6 cores.
+// These instruction are deprecated so we don't want them to get selected.
+multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+  // Unknown precision
+  def XIA :
+    AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, !strconcat(asm, "iax${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def XIA_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "iax${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;         // Increment After
+    let Inst{21}    = 1;            // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def XDB_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;         // Decrement Before
+    let Inst{21}    = 1;            // Writeback
+    let Inst{20}    = L_bit;
+  }
+}
+
+defm FLDM : vfp_ldstx_mult<"fldm", 1>;
+defm FSTM : vfp_ldstx_mult<"fstm", 0>;
+
+def : VFP2MnemonicAlias<"fldmeax", "fldmdbx">;
+def : VFP2MnemonicAlias<"fldmfdx", "fldmiax">;
+
+def : VFP2MnemonicAlias<"fstmeax", "fstmiax">;
+def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">;
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VADDD  : ADbI<0b11100, 0b11, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VSUBD  : ADbI<0b11100, 0b11, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBS  : ASbIn<0b11100, 0b11, 1, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VDIVD  : ADbI<0b11101, 0b00, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVS  : ASbI<0b11101, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VMULD  : ADbI<0b11100, 0b10, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULS  : ASbIn<0b11100, 0b10, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VNMULD : ADbI<0b11100, 0b10, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+
+def VNMULS : ASbI<0b11100, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+      Uses = [CPSR], AddedComplexity = 4 in {
+    def S : ASbInp<0b11100, opc, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
+                   [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+                   Requires<[HasFPARMv8]>;
+
+    def D : ADbInp<0b11100, opc, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
+                   [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
+  }
+}
+
+// The CC constants here match ARMCC::CondCodes.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
+
+multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+    def S : ASbInp<0b11101, 0b00, opc,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
+                   [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>,
+                   Requires<[HasFPARMv8]>;
+
+    def D : ADbInp<0b11101, 0b00, opc,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
+                   [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
+  }
+}
+
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>;
+
+// Match reassociated forms only if not sign dependent rounding.
+def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
+          (VNMULD DPR:$a, DPR:$b)>,
+          Requires<[NoHonorSignDependentRounding,HasDPVFP]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+          (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+
+// These are encoded as unary instructions.
+let Defs = [FPSCR_NZCV] in {
+def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+
+def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
+                  [/* For disassembly only; pattern left blank */]>;
+
+def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
+                  [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+} // Defs = [FPSCR_NZCV]
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+def VABSD  : ADuI<0b11101, 0b11, 0b0000, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>;
+
+def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fabs SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let Defs = [FPSCR_NZCV] in {
+def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
+                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
+                   [(arm_cmpfp0 SPR:$Sd)]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+} // Defs = [FPSCR_NZCV]
+
+def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
+                   [(set DPR:$Dd, (fextend SPR:$Sm))]> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+}
+
+// Special case encoding: bits 11-8 is 0b1011.
+def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
+                    IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
+                    [(set SPR:$Sd, (fround DPR:$Dm))]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = 0b11101;
+  let Inst{21-16} = 0b110111;
+  let Inst{11-8}  = 0b1011;
+  let Inst{7-6}   = 0b11;
+  let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Between half, single and double-precision.  For disassembly only.
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}     = Dm{3-0};
+  let Inst{5}       = Dm{4};
+  let Inst{15-12}   = Sd{4-1};
+  let Inst{22}      = Sd{0};
+}
+
+def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+}
+
+def : Pat<(fp_to_f16 SPR:$a),
+          (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
+def : Pat<(fp_to_f16 (f64 DPR:$a)),
+          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+
+def : Pat<(f16_to_fp GPR:$a),
+          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+def : Pat<(f64 (f16_to_fp GPR:$a)),
+          (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+
+multiclass vcvt_inst<string opc, bits<2> rm> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
+                    []>, Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+
+    def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
+                    []>, Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+
+    def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
+                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0} = Dm{3-0};
+      let Inst{5}   = Dm{4};
+      let Inst{8} = 1;
+    }
+
+    def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
+                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0}  = Dm{3-0};
+      let Inst{5}    = Dm{4};
+      let Inst{8} = 1;
+    }
+  }
+}
+
+defm VCVTA : vcvt_inst<"a", 0b00>;
+defm VCVTN : vcvt_inst<"n", 0b01>;
+defm VCVTP : vcvt_inst<"p", 0b10>;
+defm VCVTM : vcvt_inst<"m", 0b11>;
+
+def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>;
+
+def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fneg SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+multiclass vrint_inst_zrx<string opc, bit op, bit op2> {
+  def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
+               (outs SPR:$Sd), (ins SPR:$Sm),
+               NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
+               []>, Requires<[HasFPARMv8]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+  def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
+                (outs DPR:$Dd), (ins DPR:$Dm),
+                NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
+                []>, Requires<[HasFPARMv8, HasDPVFP]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0>;
+
+multiclass vrint_inst_anpm<string opc, bits<2> rm> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
+                   []>, Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+    def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dm),
+                   NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+      let Inst{17-16} = rm;
+    }
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+defm VRINTA : vrint_inst_anpm<"a", 0b00>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11>;
+
+def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
+
+def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
+                  [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+
+let neverHasSideEffects = 1 in {
+def VMOVD  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
+
+def VMOVS  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+} // neverHasSideEffects
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies.  Int <-> FP Conversions.
+//
+
+def VMOVRS : AVConv2I<0b11100001, 0b1010,
+                      (outs GPR:$Rt), (ins SPR:$Sn),
+                      IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
+                      [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+  // Instruction operands.
+  bits<4> Rt;
+  bits<5> Sn;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+// Bitcast i32 -> f32.  NEON prefers to use VMOVDRR.
+def VMOVSR : AVConv4I<0b11100000, 0b1010,
+                      (outs SPR:$Sn), (ins GPR:$Rt),
+                      IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
+                      [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+             Requires<[HasVFP2, UseVMOVSR]> {
+  // Instruction operands.
+  bits<5> Sn;
+  bits<4> Rt;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+let neverHasSideEffects = 1 in {
+def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
+                        (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
+                        IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
+                 [/* FIXME: Can't write pattern for multiple result instr*/]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
+                      (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+                 IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
+                 [/* For disassembly only; pattern left blank */]> {
+  bits<5> src1;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = src1{4-1};
+  let Inst{5}     = src1{0};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+  let DecoderMethod = "DecodeVMOVRRS";
+}
+} // neverHasSideEffects
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+def VMOVDRR : AVConv5I<0b11000100, 0b1011,
+                      (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
+                      IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
+                      [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6}   = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+let neverHasSideEffects = 1 in
+def VMOVSRR : AVConv5I<0b11000100, 0b1010,
+                     (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
+                IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
+                [/* For disassembly only; pattern left blank */]> {
+  // Instruction operands.
+  bits<5> dst1;
+  bits<4> src1;
+  bits<4> src2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = dst1{4-1};
+  let Inst{5}     = dst1{0};
+  let Inst{15-12} = src1;
+  let Inst{19-16} = src2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  let DecoderMethod = "DecodeVMOVSRR";
+}
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX:  SPR system reg -> GPR
+// FMSRR: GPR -> SPR
+// FMXR:  GPR -> VFP system reg
+
+
+// Int -> FP:
+
+class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,InstrItinClass itin,
+                         string opc, string asm, list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
+                               [(set DPR:$Dd, (f64 (arm_sitof SPR:$Sm)))]> {
+  let Inst{7} = 1; // s32
+}
+
+def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd),(ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
+                                [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> {
+  let Inst{7} = 1; // s32
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
+                               [(set DPR:$Dd, (f64 (arm_uitof SPR:$Sm)))]> {
+  let Inst{7} = 0; // u32
+}
+
+def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd), (ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
+                                [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> {
+  let Inst{7} = 0; // u32
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// FP -> Int:
+
+class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,
+                         InstrItinClass itin, string opc, string asm,
+                         list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (arm_ftosi (f64 DPR:$Dm)))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> {
+  let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                               (outs SPR:$Sd), (ins DPR:$Dm),
+                               IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
+                               [(set SPR:$Sd, (arm_ftoui (f64 DPR:$Dm)))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> {
+  let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
+let Uses = [FPSCR] in {
+// FIXME: Verify encoding after integrated assembler is working.
+def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
+  let Inst{7} = 0; // Z bit
+}
+}
+
+// Convert between floating-point and fixed-point
+// Data type for fixed-point naming convention:
+//   S16 (U=0, sx=0) -> SH
+//   U16 (U=1, sx=0) -> UH
+//   S32 (U=0, sx=1) -> SL
+//   U32 (U=1, sx=1) -> UL
+
+let Constraints = "$a = $dst" in {
+
+// FP to Fixed-Point:
+
+// Single Precision register
+class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+                          bit op5, dag oops, dag iops, InstrItinClass itin,
+                          string opc, string asm, list<dag> pattern>
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+  Sched<[WriteCvtFP]> {
+  bits<5> dst;
+  // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
+  let Inst{22} = dst{0};
+  let Inst{15-12} = dst{4-1};
+}
+
+// Double Precision register
+class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+                          bit op5, dag oops, dag iops, InstrItinClass itin,
+                          string opc, string asm, list<dag> pattern>
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+    Sched<[WriteCvtFP]> {
+  bits<5> dst;
+  // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
+  let Inst{22} = dst{4};
+  let Inst{15-12} = dst{3-0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>;
+
+def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>;
+
+def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>;
+
+def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>;
+
+// Fixed-Point to FP:
+
+def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>;
+
+def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>;
+
+def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>;
+
+def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>;
+
+} // End of 'let Constraints = "$a = $dst" in'
+
+//===----------------------------------------------------------------------===//
+// FP Multiply-Accumulate Operations.
+//
+
+def VMLAD : ADbI<0b11100, 0b00, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSD : ADbI<0b11100, 0b00, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+//===----------------------------------------------------------------------===//
+// Fused FP Multiply-Accumulate Operations.
+//
+def VFMAD : ADbI<0b11101, 0b10, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma x, y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+          (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+          (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFMSD : ADbI<0b11101, 0b10, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma (fneg x), y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma x, (fneg y), z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fneg (fma x, y, z)) -> (vfnma z, x, y)
+def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
+          (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
+          (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
+          (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
+          (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                  Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+
+// (fma x, y, (fneg z)) -> (vfnms z, x, y))
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+let neverHasSideEffects = 1 in {
+def VMOVDcc  : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+                    IIC_fpUNA64,
+                    [(set (f64 DPR:$Dd),
+                          (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+               RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc  : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+                    IIC_fpUNA32,
+                    [(set (f32 SPR:$Sd),
+                          (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+               RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
+} // neverHasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Move from VFP System Register to ARM core register.
+//
+
+class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+                 list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> Rt;
+
+  let Inst{27-20} = 0b11101111;
+  let Inst{19-16} = opc19_16;
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{6-5}   = 0b00;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = 0b0000;
+}
+
+// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
+// to APSR.
+let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in
+def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+                        "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+
+// Application level FPSCR -> GPR
+let hasSideEffects = 1, Uses = [FPSCR] in
+def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins),
+                      "vmrs", "\t$Rt, fpscr",
+                      [(set GPR:$Rt, (int_arm_get_fpscr))]>;
+
+// System level FPEXC, FPSID -> GPR
+let Uses = [FPSCR] in {
+  def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpexc", []>;
+  def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpsid", []>;
+  def VMRS_MVFR0 : MovFromVFP<0b0111 /* mvfr0 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr0", []>;
+  def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr1", []>;
+  def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
+  def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpinst", []>;
+  def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
+                                "vmrs", "\t$Rt, fpinst2", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move from ARM core register to VFP System Register.
+//
+
+class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+               list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> src;
+
+  // Encode instruction operand.
+  let Inst{15-12} = src;
+
+  let Inst{27-20} = 0b11101110;
+  let Inst{19-16} = opc19_16;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+
+let Defs = [FPSCR] in {
+  // Application level GPR -> FPSCR
+  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>;
+  // System level GPR -> FPEXC
+  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpexc, $src", []>;
+  // System level GPR -> FPSID
+  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpsid, $src", []>;
+
+  def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPR:$src),
+                              "vmsr", "\tfpinst, $src", []>;
+  def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPR:$src),
+                                "vmsr", "\tfpinst2, $src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+// Materialize FP immediates. VFP3 only.
+let isReMaterializable = 1 in {
+def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
+                    VFPMiscFrm, IIC_fpUNA64,
+                    "vmov", ".f64\t$Dd, $imm",
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+              Requires<[HasVFP3,HasDPVFP]> {
+  bits<5> Dd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Dd{4};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+
+def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
+                     VFPMiscFrm, IIC_fpUNA32,
+                     "vmov", ".f32\t$Sd, $imm",
+                     [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+  bits<5> Sd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Sd{0};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+// A few mnemonic aliases for pre-unifixed syntax. We don't guarantee to
+// support them all, but supporting at least some of the basics is
+// good to be friendly.
+def : VFP2MnemonicAlias<"flds", "vldr">;
+def : VFP2MnemonicAlias<"fldd", "vldr">;
+def : VFP2MnemonicAlias<"fmrs", "vmov">;
+def : VFP2MnemonicAlias<"fmsr", "vmov">;
+def : VFP2MnemonicAlias<"fsqrts", "vsqrt">;
+def : VFP2MnemonicAlias<"fsqrtd", "vsqrt">;
+def : VFP2MnemonicAlias<"fadds", "vadd.f32">;
+def : VFP2MnemonicAlias<"faddd", "vadd.f64">;
+def : VFP2MnemonicAlias<"fmrdd", "vmov">;
+def : VFP2MnemonicAlias<"fmrds", "vmov">;
+def : VFP2MnemonicAlias<"fmrrd", "vmov">;
+def : VFP2MnemonicAlias<"fmdrr", "vmov">;
+def : VFP2MnemonicAlias<"fmuls", "vmul.f32">;
+def : VFP2MnemonicAlias<"fmuld", "vmul.f64">;
+def : VFP2MnemonicAlias<"fnegs", "vneg.f32">;
+def : VFP2MnemonicAlias<"fnegd", "vneg.f64">;
+def : VFP2MnemonicAlias<"ftosizd", "vcvt.s32.f64">;
+def : VFP2MnemonicAlias<"ftosid", "vcvtr.s32.f64">;
+def : VFP2MnemonicAlias<"ftosizs", "vcvt.s32.f32">;
+def : VFP2MnemonicAlias<"ftosis", "vcvtr.s32.f32">;
+def : VFP2MnemonicAlias<"ftouizd", "vcvt.u32.f64">;
+def : VFP2MnemonicAlias<"ftouid", "vcvtr.u32.f64">;
+def : VFP2MnemonicAlias<"ftouizs", "vcvt.u32.f32">;
+def : VFP2MnemonicAlias<"ftouis", "vcvtr.u32.f32">;
+def : VFP2MnemonicAlias<"fsitod", "vcvt.f64.s32">;
+def : VFP2MnemonicAlias<"fsitos", "vcvt.f32.s32">;
+def : VFP2MnemonicAlias<"fuitod", "vcvt.f64.u32">;
+def : VFP2MnemonicAlias<"fuitos", "vcvt.f32.u32">;
+def : VFP2MnemonicAlias<"fsts", "vstr">;
+def : VFP2MnemonicAlias<"fstd", "vstr">;
+def : VFP2MnemonicAlias<"fmacd", "vmla.f64">;
+def : VFP2MnemonicAlias<"fmacs", "vmla.f32">;
+def : VFP2MnemonicAlias<"fcpys", "vmov.f32">;
+def : VFP2MnemonicAlias<"fcpyd", "vmov.f64">;
+def : VFP2MnemonicAlias<"fcmps", "vcmp.f32">;
+def : VFP2MnemonicAlias<"fcmpd", "vcmp.f64">;
+def : VFP2MnemonicAlias<"fdivs", "vdiv.f32">;
+def : VFP2MnemonicAlias<"fdivd", "vdiv.f64">;
+def : VFP2MnemonicAlias<"fmrx", "vmrs">;
+def : VFP2MnemonicAlias<"fmxr", "vmsr">;
+
+// Be friendly and accept the old form of zero-compare
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
+
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
+                    (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+                      (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
+                    (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+                      (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+
+// No need for the size suffix on VSQRT. It's implied by the register classes.
+def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// VLDR/VSTR accept an optional type suffix.
+def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
+                    (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.32 $Sd, $addr",
+                    (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr",
+                    (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr",
+                    (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+
+// VMOV can accept optional 32-bit or less data type suffix suffix.
+def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.8 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+
+def : VFP2InstAlias<"vmov${p}.f64 $Rt, $Rt2, $Dn",
+                    (VMOVRRD GPR:$Rt, GPR:$Rt2, DPR:$Dn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2",
+                    (VMOVDRR DPR:$Dn, GPR:$Rt, GPR:$Rt2, pred:$p)>;
+
+// VMOVS doesn't need the .f32 to disambiguate from the NEON encoding the way
+// VMOVD does.
+def : VFP2InstAlias<"vmov${p} $Sd, $Sm",
+                    (VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>;
+
+// FCONSTD/FCONSTS alias for vmov.f64/vmov.f32
+// These aliases provide added functionality over vmov.f instructions by
+// allowing users to write assembly containing encoded floating point constants
+// (e.g. #0x70 vs #1.0).  Without these alises there is no way for the
+// assembler to accept encoded fp constants (but the equivalent fp-literal is
+// accepted directly by vmovf).
+def : VFP3InstAlias<"fconstd${p} $Dd, $val",
+                    (FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>;
+def : VFP3InstAlias<"fconsts${p} $Sd, $val",
+                    (FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
new file mode 100644
index 0000000..6d1114d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
@@ -0,0 +1,344 @@
+//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMJITInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRelocations.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction");
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. (We need
+// to preserve more context and manipulate the stack directly).  Instead,
+// write our own wrapper, which does things our way, so we have complete
+// control over register saving and restoring.
+extern "C" {
+#if defined(__arm__)
+  void ARMCompilationCallback();
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl " ASMPREFIX "ARMCompilationCallback\n"
+    ASMPREFIX "ARMCompilationCallback:\n"
+    // Save caller saved registers since they may contain stuff
+    // for the real target function right now. We have to act as if this
+    // whole compilation callback doesn't exist as far as the caller is
+    // concerned, so we can't just preserve the callee saved regs.
+    "stmdb sp!, {r0, r1, r2, r3, lr}\n"
+#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+    "vstmdb sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+    // The LR contains the address of the stub function on entry.
+    // pass it as the argument to the C part of the callback
+    "mov  r0, lr\n"
+    "sub  sp, sp, #4\n"
+    // Call the C portion of the callback
+    "bl   " ASMPREFIX "ARMCompilationCallbackC\n"
+    "add  sp, sp, #4\n"
+    // Restoring the LR to the return address of the function that invoked
+    // the stub and de-allocating the stack space for it requires us to
+    // swap the two saved LR values on the stack, as they're backwards
+    // for what we need since the pop instruction has a pre-determined
+    // order for the registers.
+    //      +--------+
+    //   0  | LR     | Original return address
+    //      +--------+
+    //   1  | LR     | Stub address (start of stub)
+    // 2-5  | R3..R0 | Saved registers (we need to preserve all regs)
+    // 6-20 | D0..D7 | Saved VFP registers
+    //      +--------+
+    //
+#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+    // Restore VFP caller-saved registers.
+    "vldmia sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+    //
+    //      We need to exchange the values in slots 0 and 1 so we can
+    //      return to the address in slot 1 with the address in slot 0
+    //      restored to the LR.
+    "ldr  r0, [sp,#20]\n"
+    "ldr  r1, [sp,#16]\n"
+    "str  r1, [sp,#20]\n"
+    "str  r0, [sp,#16]\n"
+    // Return to the (newly modified) stub to invoke the real function.
+    // The above twiddling of the saved return addresses allows us to
+    // deallocate everything, including the LR the stub saved, with two
+    // updating load instructions.
+    "ldmia  sp!, {r0, r1, r2, r3, lr}\n"
+    "ldr    pc, [sp], #4\n"
+      );
+#else  // Not an ARM host
+  void ARMCompilationCallback() {
+    llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!");
+  }
+#endif
+}
+
+/// ARMCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call. We're replacing the first two instructions of the
+  // stub with:
+  //   ldr pc, [pc,#-4]
+  //   <addr>
+  if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
+    llvm_unreachable("ERROR: Unable to mark stub writable");
+  }
+  *(intptr_t *)StubAddr = 0xe51ff004;  // ldr pc, [pc, #-4]
+  *(intptr_t *)(StubAddr+4) = NewVal;
+  if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
+    llvm_unreachable("ERROR: Unable to mark stub executable");
+  }
+}
+
+TargetJITInfo::LazyResolverFn
+ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return ARMCompilationCallback;
+}
+
+void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
+                                             JITCodeEmitter &JCE) {
+  uint8_t Buffer[4];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)Ptr);
+  void *PtrAddr = JCE.allocIndirectGV(
+      GV, Buffer, sizeof(Buffer), /*Alignment=*/4);
+  addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
+  return PtrAddr;
+}
+
+TargetJITInfo::StubLayout ARMJITInfo::getStubLayout() {
+  // The stub contains up to 3 4-byte instructions, aligned at 4 bytes, and a
+  // 4-byte address.  See emitFunctionStub for details.
+  StubLayout Result = {16, 4};
+  return Result;
+}
+
+void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE) {
+  void *Addr;
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)ARMCompilationCallback) {
+    // Branch to the corresponding function addr.
+    if (IsPIC) {
+      // The stub is 16-byte size and 4-aligned.
+      intptr_t LazyPtr = getIndirectSymAddr(Fn);
+      if (!LazyPtr) {
+        // In PIC mode, the function stub is loading a lazy-ptr.
+        LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE);
+        DEBUG(if (F)
+                errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
+                       << "] for GV '" << F->getName() << "'\n";
+              else
+                errs() << "JIT: Stub emitted at [" << LazyPtr
+                       << "] for external function at '" << Fn << "'\n");
+      }
+      JCE.emitAlignment(4);
+      Addr = (void*)JCE.getCurrentPCValue();
+      if (!sys::Memory::setRangeWritable(Addr, 16)) {
+        llvm_unreachable("ERROR: Unable to mark stub writable");
+      }
+      JCE.emitWordLE(0xe59fc004);            // ldr ip, [pc, #+4]
+      JCE.emitWordLE(0xe08fc00c);            // L_func$scv: add ip, pc, ip
+      JCE.emitWordLE(0xe59cf000);            // ldr pc, [ip]
+      JCE.emitWordLE(LazyPtr - (intptr_t(Addr)+4+8));  // func - (L_func$scv+8)
+      sys::Memory::InvalidateInstructionCache(Addr, 16);
+      if (!sys::Memory::setRangeExecutable(Addr, 16)) {
+        llvm_unreachable("ERROR: Unable to mark stub executable");
+      }
+    } else {
+      // The stub is 8-byte size and 4-aligned.
+      JCE.emitAlignment(4);
+      Addr = (void*)JCE.getCurrentPCValue();
+      if (!sys::Memory::setRangeWritable(Addr, 8)) {
+        llvm_unreachable("ERROR: Unable to mark stub writable");
+      }
+      JCE.emitWordLE(0xe51ff004);    // ldr pc, [pc, #-4]
+      JCE.emitWordLE((intptr_t)Fn);  // addr of function
+      sys::Memory::InvalidateInstructionCache(Addr, 8);
+      if (!sys::Memory::setRangeExecutable(Addr, 8)) {
+        llvm_unreachable("ERROR: Unable to mark stub executable");
+      }
+    }
+  } else {
+    // The compilation callback will overwrite the first two words of this
+    // stub with indirect branch instructions targeting the compiled code.
+    // This stub sets the return address to restart the stub, so that
+    // the new branch will be invoked when we come back.
+    //
+    // Branch and link to the compilation callback.
+    // The stub is 16-byte size and 4-byte aligned.
+    JCE.emitAlignment(4);
+    Addr = (void*)JCE.getCurrentPCValue();
+    if (!sys::Memory::setRangeWritable(Addr, 16)) {
+      llvm_unreachable("ERROR: Unable to mark stub writable");
+    }
+    // Save LR so the callback can determine which stub called it.
+    // The compilation callback is responsible for popping this prior
+    // to returning.
+    JCE.emitWordLE(0xe92d4000); // push {lr}
+    // Set the return address to go back to the start of this stub.
+    JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12
+    // Invoke the compilation callback.
+    JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
+    // The address of the compilation callback.
+    JCE.emitWordLE((intptr_t)ARMCompilationCallback);
+    sys::Memory::InvalidateInstructionCache(Addr, 16);
+    if (!sys::Memory::setRangeExecutable(Addr, 16)) {
+      llvm_unreachable("ERROR: Unable to mark stub executable");
+    }
+  }
+
+  return Addr;
+}
+
+intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const {
+  ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType();
+  switch (RT) {
+  default:
+    return (intptr_t)(MR->getResultPointer());
+  case ARM::reloc_arm_pic_jt:
+    // Destination address - jump table base.
+    return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal();
+  case ARM::reloc_arm_jt_base:
+    // Jump table base address.
+    return getJumpTableBaseAddr(MR->getJumpTableIndex());
+  case ARM::reloc_arm_cp_entry:
+  case ARM::reloc_arm_vfp_cp_entry:
+    // Constant pool entry address.
+    return getConstantPoolEntryAddr(MR->getConstantPoolIndex());
+  case ARM::reloc_arm_machine_cp_entry: {
+    ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal();
+    assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) &&
+           "Can't handle this machine constant pool entry yet!");
+    intptr_t Addr = (intptr_t)(MR->getResultPointer());
+    Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment();
+    return Addr;
+  }
+  }
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = resolveRelocDestAddr(MR);
+    switch ((ARM::RelocationType)MR->getRelocationType()) {
+    case ARM::reloc_arm_cp_entry:
+    case ARM::reloc_arm_vfp_cp_entry:
+    case ARM::reloc_arm_relative: {
+      // It is necessary to calculate the correct PC relative value. We
+      // subtract the base addr from the target addr to form a byte offset.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      // If the result is positive, set bit U(23) to 1.
+      if (ResultPtr >= 0)
+        *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift;
+      else {
+        // Otherwise, obtain the absolute value and set bit U(23) to 0.
+        *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift);
+        ResultPtr = - ResultPtr;
+      }
+      // Set the immed value calculated.
+      // VFP immediate offset is multiplied by 4.
+      if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry)
+        ResultPtr = ResultPtr >> 2;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      // Set register Rn to PC (which is register 15 on all architectures).
+      // FIXME: This avoids the need for register info in the JIT class.
+      *((intptr_t*)RelocPos) |= 15 << ARMII::RegRnShift;
+      break;
+    }
+    case ARM::reloc_arm_pic_jt:
+    case ARM::reloc_arm_machine_cp_entry:
+    case ARM::reloc_arm_absolute: {
+      // These addresses have already been resolved.
+      *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_branch: {
+      // It is necessary to calculate the correct value of signed_immed_24
+      // field. We subtract the base addr from the target addr to form a
+      // byte offset, which must be inside the range -33554432 and +33554428.
+      // Then, we set the signed_immed_24 field of the instruction to bits
+      // [25:2] of the byte offset. More details ARM-ARM p. A4-11.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2;
+      assert(ResultPtr >= -33554432 && ResultPtr <= 33554428);
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_jt_base: {
+      // JT base - (instruction addr + 8)
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_movw: {
+      ResultPtr = ResultPtr & 0xFFFF;
+      *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
+      *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
+      break;
+    }
+    case ARM::reloc_arm_movt: {
+      ResultPtr = (ResultPtr >> 16) & 0xFFFF;
+      *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
+      *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
+      break;
+    }
+    }
+  }
+}
+
+void ARMJITInfo::Initialize(const MachineFunction &MF, bool isPIC) {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  ConstPoolId2AddrMap.resize(AFI->getNumPICLabels());
+  JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
+  IsPIC = isPIC;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
new file mode 100644
index 0000000..27e2a20
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
@@ -0,0 +1,177 @@
+//===-- ARMJITInfo.h - ARM implementation of the JIT interface  -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMJITINFO_H
+#define ARMJITINFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class ARMTargetMachine;
+
+  class ARMJITInfo : public TargetJITInfo {
+    // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding
+    // CONSTPOOL_ENTRY addresses.
+    SmallVector<intptr_t, 16> ConstPoolId2AddrMap;
+
+    // JumpTableId2AddrMap - A map from inline jumptable ids to the
+    // corresponding inline jump table bases.
+    SmallVector<intptr_t, 16> JumpTableId2AddrMap;
+
+    // PCLabelMap - A map from PC labels to addresses.
+    DenseMap<unsigned, intptr_t> PCLabelMap;
+
+    // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol)
+    // addresses to their indirect symbol addresses.
+    DenseMap<void*, intptr_t> Sym2IndirectSymMap;
+
+    // IsPIC - True if the relocation model is PIC. This is used to determine
+    // how to codegen function stubs.
+    bool IsPIC;
+
+  public:
+    explicit ARMJITInfo() : IsPIC(false) { useGOT = false; }
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    void replaceMachineCodeForFunction(void *Old, void *New) override;
+
+    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+    /// to emit an indirect symbol which contains the address of the specified
+    /// ptr.
+    void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                    JITCodeEmitter &JCE) override;
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on ARM.
+    StubLayout getStubLayout() override;
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    void *emitFunctionStub(const Function* F, void *Fn,
+                           JITCodeEmitter &JCE) override;
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    void relocate(void *Function, MachineRelocation *MR,
+                  unsigned NumRelocs, unsigned char* GOTBase) override;
+
+    /// hasCustomConstantPool - Allows a target to specify that constant
+    /// pool address resolution is handled by the target.
+    bool hasCustomConstantPool() const override { return true; }
+
+    /// hasCustomJumpTables - Allows a target to specify that jumptables
+    /// are emitted by the target.
+    bool hasCustomJumpTables() const override { return true; }
+
+    /// allocateSeparateGVMemory - If true, globals should be placed in
+    /// separately allocated heap memory rather than in the same
+    /// code memory allocated by JITCodeEmitter.
+    bool allocateSeparateGVMemory() const override {
+#ifdef __APPLE__
+      return true;
+#else
+      return false;
+#endif
+    }
+
+    /// Initialize - Initialize internal stage for the function being JITted.
+    /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize
+    /// jump table ids to jump table bases map; remember if codegen relocation
+    /// model is PIC.
+    void Initialize(const MachineFunction &MF, bool isPIC);
+
+    /// getConstantPoolEntryAddr - The ARM target puts all constant
+    /// pool entries into constant islands. This returns the address of the
+    /// constant pool entry of the specified index.
+    intptr_t getConstantPoolEntryAddr(unsigned CPI) const {
+      assert(CPI < ConstPoolId2AddrMap.size());
+      return ConstPoolId2AddrMap[CPI];
+    }
+
+    /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address
+    /// where its associated value is stored. When relocations are processed,
+    /// this value will be used to resolve references to the constant.
+    void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) {
+      assert(CPI < ConstPoolId2AddrMap.size());
+      ConstPoolId2AddrMap[CPI] = Addr;
+    }
+
+    /// getJumpTableBaseAddr - The ARM target inline all jump tables within
+    /// text section of the function. This returns the address of the base of
+    /// the jump table of the specified index.
+    intptr_t getJumpTableBaseAddr(unsigned JTI) const {
+      assert(JTI < JumpTableId2AddrMap.size());
+      return JumpTableId2AddrMap[JTI];
+    }
+
+    /// addJumpTableBaseAddr - Map a jump table index to the address where
+    /// the corresponding inline jump table is emitted. When relocations are
+    /// processed, this value will be used to resolve references to the
+    /// jump table.
+    void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) {
+      assert(JTI < JumpTableId2AddrMap.size());
+      JumpTableId2AddrMap[JTI] = Addr;
+    }
+
+    /// getPCLabelAddr - Retrieve the address of the PC label of the
+    /// specified id.
+    intptr_t getPCLabelAddr(unsigned Id) const {
+      DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id);
+      assert(I != PCLabelMap.end());
+      return I->second;
+    }
+
+    /// addPCLabelAddr - Remember the address of the specified PC label.
+    void addPCLabelAddr(unsigned Id, intptr_t Addr) {
+      PCLabelMap.insert(std::make_pair(Id, Addr));
+    }
+
+    /// getIndirectSymAddr - Retrieve the address of the indirect symbol of the
+    /// specified symbol located at address. Returns 0 if the indirect symbol
+    /// has not been emitted.
+    intptr_t getIndirectSymAddr(void *Addr) const {
+      DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr);
+      if (I != Sym2IndirectSymMap.end())
+        return I->second;
+      return 0;
+    }
+
+    /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to
+    /// its indirect symbol address.
+    void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) {
+      Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr));
+    }
+
+  private:
+    /// resolveRelocDestAddr - Resolve the resulting address of the relocation
+    /// if it's not already solved. Constantpool entries must be resolved by
+    /// ARM target.
+    intptr_t resolveRelocDestAddr(MachineRelocation *MR) const;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 0000000..a03bcdb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,2220 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb1RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-ldst-opt"
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
+STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
+STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
+STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
+STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
+STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
+STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
+STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
+STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
+
+/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine
+/// load / store instructions to form ldm / stm instructions.
+
+namespace {
+  struct ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    const TargetLowering *TL;
+    ARMFunctionInfo *AFI;
+    RegScavenger *RS;
+    bool isThumb1, isThumb2;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "ARM load / store optimization pass";
+    }
+
+  private:
+    struct MemOpQueueEntry {
+      int Offset;
+      unsigned Reg;
+      bool isKill;
+      unsigned Position;
+      MachineBasicBlock::iterator MBBI;
+      bool Merged;
+      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
+                      MachineBasicBlock::iterator i)
+        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
+    };
+    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+    typedef MemOpQueue::iterator MemOpQueueIter;
+
+    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
+                          const MemOpQueue &MemOps, unsigned DefReg,
+                          unsigned RangeBegin, unsigned RangeEnd);
+    void UpdateBaseRegUses(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           DebugLoc dl, unsigned Base, unsigned WordOffset,
+                           ARMCC::CondCodes Pred, unsigned PredReg);
+    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  int Offset, unsigned Base, bool BaseKill, int Opcode,
+                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
+                  DebugLoc dl,
+                  ArrayRef<std::pair<unsigned, bool> > Regs,
+                  ArrayRef<unsigned> ImpDefs);
+    void MergeOpsUpdate(MachineBasicBlock &MBB,
+                        MemOpQueue &MemOps,
+                        unsigned memOpsBegin,
+                        unsigned memOpsEnd,
+                        unsigned insertAfter,
+                        int Offset,
+                        unsigned Base,
+                        bool BaseKill,
+                        int Opcode,
+                        ARMCC::CondCodes Pred,
+                        unsigned PredReg,
+                        unsigned Scratch,
+                        DebugLoc dl,
+                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
+    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
+                      int Opcode, unsigned Size,
+                      ARMCC::CondCodes Pred, unsigned PredReg,
+                      unsigned Scratch, MemOpQueue &MemOps,
+                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
+    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI);
+    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const TargetInstrInfo *TII,
+                                  bool &Advance,
+                                  MachineBasicBlock::iterator &I);
+    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   bool &Advance,
+                                   MachineBasicBlock::iterator &I);
+    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+  };
+  char ARMLoadStoreOpt::ID = 0;
+}
+
+static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDRi12:
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA;
+    case ARM_AM::da: return ARM::LDMDA;
+    case ARM_AM::db: return ARM::LDMDB;
+    case ARM_AM::ib: return ARM::LDMIB;
+    }
+  case ARM::STRi12:
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA;
+    case ARM_AM::da: return ARM::STMDA;
+    case ARM_AM::db: return ARM::STMDB;
+    case ARM_AM::ib: return ARM::STMIB;
+    }
+  case ARM::tLDRi:
+    // tLDMIA is writeback-only - unless the base register is in the input
+    // reglist.
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::tLDMIA;
+    }
+  case ARM::tSTRi:
+    // There is no non-writeback tSTMIA either.
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::tSTMIA_UPD;
+    }
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA;
+    case ARM_AM::db: return ARM::t2LDMDB;
+    }
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA;
+    case ARM_AM::db: return ARM::t2STMDB;
+    }
+  case ARM::VLDRS:
+    ++NumVLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA;
+    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
+    }
+  case ARM::VSTRS:
+    ++NumVSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA;
+    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
+    }
+  case ARM::VLDRD:
+    ++NumVLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA;
+    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
+    }
+  case ARM::VSTRD:
+    ++NumVSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA;
+    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
+    }
+  }
+}
+
+namespace llvm {
+  namespace ARM_AM {
+
+AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMIA_UPD:
+  case ARM::STMIA:
+  case ARM::STMIA_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMIA_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+    return ARM_AM::ia;
+
+  case ARM::LDMDA:
+  case ARM::LDMDA_UPD:
+  case ARM::STMDA:
+  case ARM::STMDA_UPD:
+    return ARM_AM::da;
+
+  case ARM::LDMDB:
+  case ARM::LDMDB_UPD:
+  case ARM::STMDB:
+  case ARM::STMDB_UPD:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMDB:
+  case ARM::t2STMDB_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMSDB_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    return ARM_AM::db;
+
+  case ARM::LDMIB:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIB:
+  case ARM::STMIB_UPD:
+    return ARM_AM::ib;
+  }
+}
+
+  } // end namespace ARM_AM
+} // end namespace llvm
+
+static bool isT1i32Load(unsigned Opc) {
+  return Opc == ARM::tLDRi;
+}
+
+static bool isT2i32Load(unsigned Opc) {
+  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
+}
+
+static bool isi32Load(unsigned Opc) {
+  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ;
+}
+
+static bool isT1i32Store(unsigned Opc) {
+  return Opc == ARM::tSTRi;
+}
+
+static bool isT2i32Store(unsigned Opc) {
+  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
+}
+
+static bool isi32Store(unsigned Opc) {
+  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
+}
+
+static unsigned getImmScale(unsigned Opc) {
+  switch (Opc) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+    return 1;
+  case ARM::tLDRHi:
+  case ARM::tSTRHi:
+    return 2;
+  case ARM::tLDRBi:
+  case ARM::tSTRBi:
+    return 4;
+  }
+}
+
+/// Update future uses of the base register with the offset introduced
+/// due to writeback. This function only works on Thumb1.
+void
+ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   DebugLoc dl, unsigned Base,
+                                   unsigned WordOffset,
+                                   ARMCC::CondCodes Pred, unsigned PredReg) {
+  assert(isThumb1 && "Can only update base register uses for Thumb1!");
+
+  // Start updating any instructions with immediate offsets. Insert a sub before
+  // the first non-updateable instruction (if any).
+  for (; MBBI != MBB.end(); ++MBBI) {
+    if (MBBI->readsRegister(Base)) {
+      unsigned Opc = MBBI->getOpcode();
+      int Offset;
+      bool InsertSub = false;
+
+      if (Opc == ARM::tLDRi  || Opc == ARM::tSTRi  ||
+          Opc == ARM::tLDRHi || Opc == ARM::tSTRHi ||
+          Opc == ARM::tLDRBi || Opc == ARM::tSTRBi) {
+        // Loads and stores with immediate offsets can be updated, but only if
+        // the new offset isn't negative.
+        // The MachineOperand containing the offset immediate is the last one
+        // before predicates.
+        MachineOperand &MO =
+          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+        // The offsets are scaled by 1, 2 or 4 depending on the Opcode
+        Offset = MO.getImm() - WordOffset * getImmScale(Opc);
+        if (Offset >= 0)
+          MO.setImm(Offset);
+        else
+          InsertSub = true;
+
+      } else if (Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) {
+        // SUB/ADD using this register. Merge it with the update.
+        // If the merged offset is too large, insert a new sub instead.
+        MachineOperand &MO =
+          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+        Offset = (Opc == ARM::tSUBi8) ?
+          MO.getImm() + WordOffset * 4 :
+          MO.getImm() - WordOffset * 4 ;
+        if (TL->isLegalAddImmediate(Offset)) {
+          MO.setImm(Offset);
+          // The base register has now been reset, so exit early.
+          return;
+        } else {
+          InsertSub = true;
+        }
+
+      } else {
+        // Can't update the instruction.
+        InsertSub = true;
+      }
+
+      if (InsertSub) {
+        // An instruction above couldn't be updated, so insert a sub.
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base))
+          .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
+          .addImm(Pred).addReg(PredReg);
+        return;
+      }
+    }
+
+    if (MBBI->killsRegister(Base))
+      // Register got killed. Stop updating.
+      return;
+  }
+
+  // The end of the block was reached. This means register liveness escapes the
+  // block, and it's necessary to insert a sub before the last instruction.
+  if (MBB.succ_size() > 0)
+    // But only insert the SUB if there is actually a successor block.
+    // FIXME: Check more carefully if register is live at this point, e.g. by
+    // also examining the successor block's register liveness information.
+    AddDefaultT1CC(BuildMI(MBB, --MBBI, dl, TII->get(ARM::tSUBi8), Base))
+      .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
+      .addImm(Pred).addReg(PredReg);
+}
+
+/// MergeOps - Create and insert a LDM or STM with Base as base register and
+/// registers in Regs as the register operands that would be loaded / stored.
+/// It returns true if the transformation is done.
+bool
+ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          int Offset, unsigned Base, bool BaseKill,
+                          int Opcode, ARMCC::CondCodes Pred,
+                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
+                          ArrayRef<std::pair<unsigned, bool> > Regs,
+                          ArrayRef<unsigned> ImpDefs) {
+  // Only a single register to load / store. Don't bother.
+  unsigned NumRegs = Regs.size();
+  if (NumRegs <= 1)
+    return false;
+
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;
+
+  if (Offset == 4 && haveIBAndDA) {
+    Mode = ARM_AM::ib;
+  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
+    Mode = ARM_AM::da;
+  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
+    // VLDM/VSTM do not support DB mode without also updating the base reg.
+    Mode = ARM_AM::db;
+  } else if (Offset != 0) {
+    // Check if this is a supported opcode before inserting instructions to
+    // calculate a new base register.
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
+
+    // If starting offset isn't zero, insert a MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
+    if (NumRegs <= 2)
+      return false;
+
+    unsigned NewBase;
+    if (isi32Load(Opcode)) {
+      // If it is a load, then just use one of the destination register to
+      // use as the new base.
+      NewBase = Regs[NumRegs-1].first;
+    } else {
+      // Use the scratch register to use as a new base.
+      NewBase = Scratch;
+      if (NewBase == 0)
+        return false;
+    }
+
+    int BaseOpc =
+      isThumb2 ? ARM::t2ADDri :
+      isThumb1 ? ARM::tADDi8  : ARM::ADDri;
+
+    if (Offset < 0) {
+      BaseOpc =
+        isThumb2 ? ARM::t2SUBri :
+        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
+      Offset = - Offset;
+    }
+
+    if (!TL->isLegalAddImmediate(Offset))
+      // FIXME: Try add with register operand?
+      return false; // Probably not worth it then.
+
+    if (isThumb1) {
+      if (Base != NewBase) {
+        // Need to insert a MOV to the new base first.
+        // FIXME: If the immediate fits in 3 bits, use ADD instead.
+        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
+          .addReg(Base, getKillRegState(BaseKill))
+          .addImm(Pred).addReg(PredReg);
+      }
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))
+        .addReg(NewBase, getKillRegState(true)).addImm(Offset)
+        .addImm(Pred).addReg(PredReg);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+        .addImm(Pred).addReg(PredReg).addReg(0);
+    }
+
+    Base = NewBase;
+    BaseKill = true; // New base is always killed straight away.
+  }
+
+  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
+                Opcode == ARM::VLDRD);
+
+  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
+  // base register writeback.
+  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
+  if (!Opcode) return false;
+
+  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
+
+  // Exception: If the base register is in the input reglist, Thumb1 LDM is
+  // non-writeback. Check for this.
+  if (Opcode == ARM::tLDMIA && isThumb1)
+    for (unsigned I = 0; I < NumRegs; ++I)
+      if (Base == Regs[I].first) {
+        Writeback = false;
+        break;
+      }
+
+  MachineInstrBuilder MIB;
+
+  if (Writeback) {
+    if (Opcode == ARM::tLDMIA)
+      // Update tLDMIA with writeback if necessary.
+      Opcode = ARM::tLDMIA_UPD;
+
+    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+
+    // Thumb1: we might need to set base writeback when building the MI.
+    MIB.addReg(Base, getDefRegState(true))
+       .addReg(Base, getKillRegState(BaseKill));
+
+    // The base isn't dead after a merged instruction with writeback. Update
+    // future uses of the base with the added offset (if possible), or reset
+    // the base register as necessary.
+    if (!BaseKill)
+      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+  } else {
+    // No writeback, simply build the MachineInstr.
+    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB.addReg(Base, getKillRegState(BaseKill));
+  }
+
+  MIB.addImm(Pred).addReg(PredReg);
+
+  for (unsigned i = 0; i != NumRegs; ++i)
+    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
+                     | getKillRegState(Regs[i].second));
+
+  // Add implicit defs for super-registers.
+  for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i)
+    MIB.addReg(ImpDefs[i], RegState::ImplicitDefine);
+
+  return true;
+}
+
+/// \brief Find all instructions using a given imp-def within a range.
+///
+/// We are trying to combine a range of instructions, one of which (located at
+/// position RangeBegin) implicitly defines a register. The final LDM/STM will
+/// be placed at RangeEnd, and so any uses of this definition between RangeStart
+/// and RangeEnd must be modified to use an undefined value.
+///
+/// The live range continues until we find a second definition or one of the
+/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
+/// we must consider all uses and decide which are relevant in a second pass.
+void ARMLoadStoreOpt::findUsesOfImpDef(
+    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
+    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
+  std::map<unsigned, MachineOperand *> Uses;
+  unsigned LastLivePos = RangeEnd;
+
+  // First we find all uses of this register with Position between RangeBegin
+  // and RangeEnd, any or all of these could be uses of a definition at
+  // RangeBegin. We also record the latest position a definition at RangeBegin
+  // would be considered live.
+  for (unsigned i = 0; i < MemOps.size(); ++i) {
+    MachineInstr &MI = *MemOps[i].MBBI;
+    unsigned MIPosition = MemOps[i].Position;
+    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
+      continue;
+
+    // If this instruction defines the register, then any later use will be of
+    // that definition rather than ours.
+    if (MI.definesRegister(DefReg))
+      LastLivePos = std::min(LastLivePos, MIPosition);
+
+    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
+    if (!UseOp)
+      continue;
+
+    // If this instruction kills the register then (assuming liveness is
+    // correct when we start) we don't need to think about anything after here.
+    if (UseOp->isKill())
+      LastLivePos = std::min(LastLivePos, MIPosition);
+
+    Uses[MIPosition] = UseOp;
+  }
+
+  // Now we traverse the list of all uses, and append the ones that actually use
+  // our definition to the requested list.
+  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
+                                                      E = Uses.end();
+       I != E; ++I) {
+    // List is sorted by position so once we've found one out of range there
+    // will be no more to consider.
+    if (I->first > LastLivePos)
+      break;
+    UsesOfImpDefs.push_back(I->second);
+  }
+}
+
+// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
+// success.
+void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
+                                     MemOpQueue &memOps,
+                                     unsigned memOpsBegin, unsigned memOpsEnd,
+                                     unsigned insertAfter, int Offset,
+                                     unsigned Base, bool BaseKill,
+                                     int Opcode,
+                                     ARMCC::CondCodes Pred, unsigned PredReg,
+                                     unsigned Scratch,
+                                     DebugLoc dl,
+                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
+  // First calculate which of the registers should be killed by the merged
+  // instruction.
+  const unsigned insertPos = memOps[insertAfter].Position;
+  SmallSet<unsigned, 4> KilledRegs;
+  DenseMap<unsigned, unsigned> Killer;
+  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
+    if (i == memOpsBegin) {
+      i = memOpsEnd;
+      if (i == e)
+        break;
+    }
+    if (memOps[i].Position < insertPos && memOps[i].isKill) {
+      unsigned Reg = memOps[i].Reg;
+      KilledRegs.insert(Reg);
+      Killer[Reg] = i;
+    }
+  }
+
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 8> ImpDefs;
+  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
+  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
+    unsigned Reg = memOps[i].Reg;
+    // If we are inserting the merged operation after an operation that
+    // uses the same register, make sure to transfer any kill flag.
+    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
+    Regs.push_back(std::make_pair(Reg, isKill));
+
+    // Collect any implicit defs of super-registers. They must be preserved.
+    for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) {
+      if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead())
+        continue;
+      unsigned DefReg = MO->getReg();
+      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
+        ImpDefs.push_back(DefReg);
+
+      // There may be other uses of the definition between this instruction and
+      // the eventual LDM/STM position. These should be marked undef if the
+      // merge takes place.
+      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
+                       insertPos);
+    }
+  }
+
+  // Try to do the merge.
+  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
+  ++Loc;
+  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
+                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
+    return;
+
+  // Merge succeeded, update records.
+  Merges.push_back(std::prev(Loc));
+
+  // In gathering loads together, we may have moved the imp-def of a register
+  // past one of its uses. This is OK, since we know better than the rest of
+  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
+  // affected uses.
+  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
+                                                   E = UsesOfImpDefs.end();
+                                                   I != E; ++I)
+    (*I)->setIsUndef();
+
+  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
+    // Remove kill flags from any memops that come before insertPos.
+    if (Regs[i-memOpsBegin].second) {
+      unsigned Reg = Regs[i-memOpsBegin].first;
+      if (KilledRegs.count(Reg)) {
+        unsigned j = Killer[Reg];
+        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
+        assert(Idx >= 0 && "Cannot find killing operand");
+        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
+        memOps[j].isKill = false;
+      }
+      memOps[i].isKill = true;
+    }
+    MBB.erase(memOps[i].MBBI);
+    // Update this memop to refer to the merged instruction.
+    // We may need to move kill flags again.
+    memOps[i].Merged = true;
+    memOps[i].MBBI = Merges.back();
+    memOps[i].Position = insertPos;
+  }
+}
+
+/// MergeLDR_STR - Merge a number of load / store instructions into one or more
+/// load / store multiple instructions.
+void
+ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
+                         unsigned Base, int Opcode, unsigned Size,
+                         ARMCC::CondCodes Pred, unsigned PredReg,
+                         unsigned Scratch, MemOpQueue &MemOps,
+                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  int Offset = MemOps[SIndex].Offset;
+  int SOffset = Offset;
+  unsigned insertAfter = SIndex;
+  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
+  DebugLoc dl = Loc->getDebugLoc();
+  const MachineOperand &PMO = Loc->getOperand(0);
+  unsigned PReg = PMO.getReg();
+  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+  unsigned Count = 1;
+  unsigned Limit = ~0U;
+
+  // vldm / vstm limit are 32 for S variants, 16 for D variants.
+
+  switch (Opcode) {
+  default: break;
+  case ARM::VSTRS:
+    Limit = 32;
+    break;
+  case ARM::VSTRD:
+    Limit = 16;
+    break;
+  case ARM::VLDRD:
+    Limit = 16;
+    break;
+  case ARM::VLDRS:
+    Limit = 32;
+    break;
+  }
+
+  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
+    int NewOffset = MemOps[i].Offset;
+    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
+    unsigned Reg = MO.getReg();
+    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+    // Register numbers must be in ascending order. For VFP / NEON load and
+    // store multiples, the registers must also be consecutive and within the
+    // limit on the number of registers per instruction.
+    if (Reg != ARM::SP &&
+        NewOffset == Offset + (int)Size &&
+        ((isNotVFP && RegNum > PRegNum) ||
+         ((Count < Limit) && RegNum == PRegNum+1)) &&
+        // On Swift we don't want vldm/vstm to start with a odd register num
+        // because Q register unaligned vldm/vstm need more uops.
+        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
+      Offset += Size;
+      PRegNum = RegNum;
+      ++Count;
+    } else {
+      // Can't merge this in. Try merge the earlier ones first.
+      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
+                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
+      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
+                   MemOps, Merges);
+      return;
+    }
+
+    if (MemOps[i].Position > MemOps[insertAfter].Position)
+      insertAfter = i;
+  }
+
+  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
+  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
+                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
+}
+
+static bool definesCPSR(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If the instruction has live CPSR def, then it's not safe to fold it
+      // into load / store.
+      return true;
+  }
+
+  return false;
+}
+
+static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+                                unsigned Bytes, unsigned Limit,
+                                ARMCC::CondCodes Pred, unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  if (!MI)
+    return false;
+
+  bool CheckCPSRDef = false;
+  switch (MI->getOpcode()) {
+  default: return false;
+  case ARM::tSUBi8:
+  case ARM::t2SUBri:
+  case ARM::SUBri:
+    CheckCPSRDef = true;
+  // fallthrough
+  case ARM::tSUBspi:
+    break;
+  }
+
+  // Make sure the offset fits in 8 bits.
+  if (Bytes == 0 || (Limit && Bytes >= Limit))
+    return false;
+
+  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
+                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
+  if (!(MI->getOperand(0).getReg() == Base &&
+        MI->getOperand(1).getReg() == Base &&
+        (MI->getOperand(2).getImm() * Scale) == Bytes &&
+        getInstrPredicate(MI, MyPredReg) == Pred &&
+        MyPredReg == PredReg))
+    return false;
+
+  return CheckCPSRDef ? !definesCPSR(MI) : true;
+}
+
+static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+                                unsigned Bytes, unsigned Limit,
+                                ARMCC::CondCodes Pred, unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  if (!MI)
+    return false;
+
+  bool CheckCPSRDef = false;
+  switch (MI->getOpcode()) {
+  default: return false;
+  case ARM::tADDi8:
+  case ARM::t2ADDri:
+  case ARM::ADDri:
+    CheckCPSRDef = true;
+  // fallthrough
+  case ARM::tADDspi:
+    break;
+  }
+
+  if (Bytes == 0 || (Limit && Bytes >= Limit))
+    // Make sure the offset fits in 8 bits.
+    return false;
+
+  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
+                    MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
+  if (!(MI->getOperand(0).getReg() == Base &&
+        MI->getOperand(1).getReg() == Base &&
+        (MI->getOperand(2).getImm() * Scale) == Bytes &&
+        getInstrPredicate(MI, MyPredReg) == Pred &&
+        MyPredReg == PredReg))
+    return false;
+
+  return CheckCPSRDef ? !definesCPSR(MI) : true;
+}
+
+static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+  }
+}
+
+static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
+                                            ARM_AM::AMSubMode Mode) {
+  switch (Opc) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA_UPD;
+    case ARM_AM::ib: return ARM::LDMIB_UPD;
+    case ARM_AM::da: return ARM::LDMDA_UPD;
+    case ARM_AM::db: return ARM::LDMDB_UPD;
+    }
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA_UPD;
+    case ARM_AM::ib: return ARM::STMIB_UPD;
+    case ARM_AM::da: return ARM::STMDA_UPD;
+    case ARM_AM::db: return ARM::STMDB_UPD;
+    }
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
+    case ARM_AM::db: return ARM::t2LDMDB_UPD;
+    }
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA_UPD;
+    case ARM_AM::db: return ARM::t2STMDB_UPD;
+    }
+  case ARM::VLDMSIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
+    case ARM_AM::db: return ARM::VLDMSDB_UPD;
+    }
+  case ARM::VLDMDIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
+    case ARM_AM::db: return ARM::VLDMDDB_UPD;
+    }
+  case ARM::VSTMSIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
+    case ARM_AM::db: return ARM::VSTMSDB_UPD;
+    }
+  case ARM::VSTMDIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
+    case ARM_AM::db: return ARM::VSTMDDB_UPD;
+    }
+  }
+}
+
+/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
+/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               bool &Advance,
+                                               MachineBasicBlock::iterator &I) {
+  // Thumb1 is already using updating loads/stores.
+  if (isThumb1) return false;
+
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(0).getReg();
+  bool BaseKill = MI->getOperand(0).isKill();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  int Opcode = MI->getOpcode();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // Can't use an updating ld/st if the base register is also a dest
+  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+    if (MI->getOperand(i).getReg() == Base)
+      return false;
+
+  bool DoMerge = false;
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);
+
+  // Try merging with the previous instruction.
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  if (MBBI != BeginMBBI) {
+    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
+      --PrevMBBI;
+    if (Mode == ARM_AM::ia &&
+        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      Mode = ARM_AM::db;
+      DoMerge = true;
+    } else if (Mode == ARM_AM::ib &&
+               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      Mode = ARM_AM::da;
+      DoMerge = true;
+    }
+    if (DoMerge)
+      MBB.erase(PrevMBBI);
+  }
+
+  // Try merging with the next instruction.
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (!DoMerge && MBBI != EndMBBI) {
+    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+      ++NextMBBI;
+    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      DoMerge = true;
+    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      DoMerge = true;
+    }
+    if (DoMerge) {
+      if (NextMBBI == I) {
+        Advance = true;
+        ++I;
+      }
+      MBB.erase(NextMBBI);
+    }
+  }
+
+  if (!DoMerge)
+    return false;
+
+  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+    .addReg(Base, getDefRegState(true)) // WB base register
+    .addReg(Base, getKillRegState(BaseKill))
+    .addImm(Pred).addReg(PredReg);
+
+  // Transfer the rest of operands.
+  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  MBB.erase(MBBI);
+  return true;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
+                                             ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_PRE_IMM;
+  case ARM::STRi12:
+    return ARM::STR_PRE_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_PRE;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_PRE;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
+                                              ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_POST_IMM;
+  case ARM::STRi12:
+    return ARM::STR_POST_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_POST;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_POST;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
+/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               const TargetInstrInfo *TII,
+                                               bool &Advance,
+                                               MachineBasicBlock::iterator &I) {
+  // Thumb1 doesn't have updating LDR/STR.
+  // FIXME: Use LDM/STM with single register instead.
+  if (isThumb1) return false;
+
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(1).getReg();
+  bool BaseKill = MI->getOperand(1).isKill();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  int Opcode = MI->getOpcode();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
+                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
+  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
+  if (isi32Load(Opcode) || isi32Store(Opcode))
+    if (MI->getOperand(2).getImm() != 0)
+      return false;
+  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+    return false;
+
+  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
+  // Can't do the merge if the destination register is the same as the would-be
+  // writeback register.
+  if (MI->getOperand(0).getReg() == Base)
+    return false;
+
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool DoMerge = false;
+  ARM_AM::AddrOpc AddSub = ARM_AM::add;
+  unsigned NewOpc = 0;
+  // AM2 - 12 bits, thumb2 - 8 bits.
+  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
+
+  // Try merging with the previous instruction.
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  if (MBBI != BeginMBBI) {
+    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
+      --PrevMBBI;
+    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+    } else if (!isAM5 &&
+               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
+      DoMerge = true;
+    }
+    if (DoMerge) {
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
+      MBB.erase(PrevMBBI);
+    }
+  }
+
+  // Try merging with the next instruction.
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (!DoMerge && MBBI != EndMBBI) {
+    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+      ++NextMBBI;
+    if (!isAM5 &&
+        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
+      DoMerge = true;
+    }
+    if (DoMerge) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
+      if (NextMBBI == I) {
+        Advance = true;
+        ++I;
+      }
+      MBB.erase(NextMBBI);
+    }
+  }
+
+  if (!DoMerge)
+    return false;
+
+  if (isAM5) {
+    // VLDM[SD]_UPD, VSTM[SD]_UPD
+    // (There are no base-updating versions of VLDR/VSTR instructions, but the
+    // updating load/store-multiple instructions can be used with only one
+    // register.)
+    MachineOperand &MO = MI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+      .addReg(Base, getDefRegState(true)) // WB base register
+      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
+      .addImm(Pred).addReg(PredReg)
+      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
+                            getKillRegState(MO.isKill())));
+  } else if (isLd) {
+    if (isAM2) {
+      // LDR_PRE, LDR_POST
+      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+      } else {
+        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+      }
+    } else {
+      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+      // t2LDR_PRE, t2LDR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, RegState::Define)
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    // FIXME: post-indexed stores use am2offset_imm, which still encodes
+    // the vestigal zero-reg offset register. When that's fixed, this clause
+    // can be removed entirely.
+    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
+      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+      // STR_PRE, STR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    } else {
+      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+      // t2STR_PRE, t2STR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  }
+  MBB.erase(MBBI);
+
+  return true;
+}
+
+/// isMemoryOp - Returns true if instruction is a memory operation that this
+/// pass is capable of operating on.
+static bool isMemoryOp(const MachineInstr *MI) {
+  // When no memory operands are present, conservatively assume unaligned,
+  // volatile, unfoldable.
+  if (!MI->hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI->memoperands_begin();
+
+  // Don't touch volatile memory accesses - we may be changing their order.
+  if (MMO->isVolatile())
+    return false;
+
+  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
+  // not.
+  if (MMO->getAlignment() < 4)
+    return false;
+
+  // str <undef> could probably be eliminated entirely, but for now we just want
+  // to avoid making a mess of it.
+  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
+  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
+      MI->getOperand(0).isUndef())
+    return false;
+
+  // Likewise don't mess with references to undefined addresses.
+  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
+      MI->getOperand(1).isUndef())
+    return false;
+
+  int Opcode = MI->getOpcode();
+  switch (Opcode) {
+  default: break;
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return MI->getOperand(1).isReg();
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return MI->getOperand(1).isReg();
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return MI->getOperand(1).isReg();
+  }
+  return false;
+}
+
+/// AdvanceRS - Advance register scavenger to just before the earliest memory
+/// op that is being merged.
+void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
+  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
+  unsigned Position = MemOps[0].Position;
+  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
+    if (MemOps[i].Position < Position) {
+      Position = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  if (Loc != MBB.begin())
+    RS->forward(std::prev(Loc));
+}
+
+static int getMemoryOpOffset(const MachineInstr *MI) {
+  int Opcode = MI->getOpcode();
+  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
+      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
+    return OffField;
+
+  // Thumb1 immediate offsets are scaled by 4
+  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
+    return OffField * 4;
+
+  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
+    : ARM_AM::getAM5Offset(OffField) * 4;
+  if (isAM3) {
+    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  } else {
+    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  }
+  return Offset;
+}
+
+static void InsertLDR_STR(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI,
+                          int Offset, bool isDef,
+                          DebugLoc dl, unsigned NewOpc,
+                          unsigned Reg, bool RegDeadKill, bool RegUndef,
+                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
+                          bool OffKill, bool OffUndef,
+                          ARMCC::CondCodes Pred, unsigned PredReg,
+                          const TargetInstrInfo *TII, bool isT2) {
+  if (isDef) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  }
+}
+
+bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = &*MBBI;
+  unsigned Opcode = MI->getOpcode();
+  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
+    const MachineOperand &BaseOp = MI->getOperand(2);
+    unsigned BaseReg = BaseOp.getReg();
+    unsigned EvenReg = MI->getOperand(0).getReg();
+    unsigned OddReg  = MI->getOperand(1).getReg();
+    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
+    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
+    // ARM errata 602117: LDRD with base in list may result in incorrect base
+    // register when interrupted or faulted.
+    bool Errata602117 = EvenReg == BaseReg && STI->isCortexM3();
+    if (!Errata602117 &&
+        ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum))
+      return false;
+
+    MachineBasicBlock::iterator NewBBI = MBBI;
+    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
+    bool EvenDeadKill = isLd ?
+      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
+    bool EvenUndef = MI->getOperand(0).isUndef();
+    bool OddDeadKill  = isLd ?
+      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+    bool OddUndef = MI->getOperand(1).isUndef();
+    bool BaseKill = BaseOp.isKill();
+    bool BaseUndef = BaseOp.isUndef();
+    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
+    int OffImm = getMemoryOpOffset(MI);
+    unsigned PredReg = 0;
+    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+
+    if (OddRegNum > EvenRegNum && OffImm == 0) {
+      // Ascending register numbers and no offset. It's safe to change it to a
+      // ldm or stm.
+      unsigned NewOpc = (isLd)
+        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
+        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
+      if (isLd) {
+        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+          .addReg(BaseReg, getKillRegState(BaseKill))
+          .addImm(Pred).addReg(PredReg)
+          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
+          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+        ++NumLDRD2LDM;
+      } else {
+        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+          .addReg(BaseReg, getKillRegState(BaseKill))
+          .addImm(Pred).addReg(PredReg)
+          .addReg(EvenReg,
+                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+          .addReg(OddReg,
+                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
+        ++NumSTRD2STM;
+      }
+      NewBBI = std::prev(MBBI);
+    } else {
+      // Split into two instructions.
+      unsigned NewOpc = (isLd)
+        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+      // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
+      // so adjust and use t2LDRi12 here for that.
+      unsigned NewOpc2 = (isLd)
+        ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+        : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+      DebugLoc dl = MBBI->getDebugLoc();
+      // If this is a load and base register is killed, it may have been
+      // re-defed by the load, make sure the first load does not clobber it.
+      if (isLd &&
+          (BaseKill || OffKill) &&
+          (TRI->regsOverlap(EvenReg, BaseReg))) {
+        assert(!TRI->regsOverlap(OddReg, BaseReg));
+        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+                      OddReg, OddDeadKill, false,
+                      BaseReg, false, BaseUndef, false, OffUndef,
+                      Pred, PredReg, TII, isT2);
+        NewBBI = std::prev(MBBI);
+        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                      EvenReg, EvenDeadKill, false,
+                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+                      Pred, PredReg, TII, isT2);
+      } else {
+        if (OddReg == EvenReg && EvenDeadKill) {
+          // If the two source operands are the same, the kill marker is
+          // probably on the first one. e.g.
+          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
+          EvenDeadKill = false;
+          OddDeadKill = true;
+        }
+        // Never kill the base register in the first instruction.
+        if (EvenReg == BaseReg)
+          EvenDeadKill = false;
+        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                      EvenReg, EvenDeadKill, EvenUndef,
+                      BaseReg, false, BaseUndef, false, OffUndef,
+                      Pred, PredReg, TII, isT2);
+        NewBBI = std::prev(MBBI);
+        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+                      OddReg, OddDeadKill, OddUndef,
+                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+                      Pred, PredReg, TII, isT2);
+      }
+      if (isLd)
+        ++NumLDRD2LDR;
+      else
+        ++NumSTRD2STR;
+    }
+
+    MBB.erase(MI);
+    MBBI = NewBBI;
+    return true;
+  }
+  return false;
+}
+
+/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
+/// ops of the same base and incrementing offset into LDM / STM ops.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+  unsigned NumMerges = 0;
+  unsigned NumMemOps = 0;
+  MemOpQueue MemOps;
+  unsigned CurrBase = 0;
+  int CurrOpc = -1;
+  unsigned CurrSize = 0;
+  ARMCC::CondCodes CurrPred = ARMCC::AL;
+  unsigned CurrPredReg = 0;
+  unsigned Position = 0;
+  SmallVector<MachineBasicBlock::iterator,4> Merges;
+
+  RS->enterBasicBlock(&MBB);
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    if (FixInvalidRegPairOp(MBB, MBBI))
+      continue;
+
+    bool Advance  = false;
+    bool TryMerge = false;
+    bool Clobber  = false;
+
+    bool isMemOp = isMemoryOp(MBBI);
+    if (isMemOp) {
+      int Opcode = MBBI->getOpcode();
+      unsigned Size = getLSMultipleTransferSize(MBBI);
+      const MachineOperand &MO = MBBI->getOperand(0);
+      unsigned Reg = MO.getReg();
+      bool isKill = MO.isDef() ? false : MO.isKill();
+      unsigned Base = MBBI->getOperand(1).getReg();
+      unsigned PredReg = 0;
+      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+      int Offset = getMemoryOpOffset(MBBI);
+      // Watch out for:
+      // r4 := ldr [r5]
+      // r5 := ldr [r5, #4]
+      // r6 := ldr [r5, #8]
+      //
+      // The second ldr has effectively broken the chain even though it
+      // looks like the later ldr(s) use the same base register. Try to
+      // merge the ldr's so far, including this one. But don't try to
+      // combine the following ldr(s).
+      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
+
+      // Watch out for:
+      // r4 := ldr [r0, #8]
+      // r4 := ldr [r0, #4]
+      //
+      // The optimization may reorder the second ldr in front of the first
+      // ldr, which violates write after write(WAW) dependence. The same as
+      // str. Try to merge inst(s) already in MemOps.
+      bool Overlap = false;
+      for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) {
+        if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) {
+          Overlap = true;
+          break;
+        }
+      }
+
+      if (CurrBase == 0 && !Clobber) {
+        // Start of a new chain.
+        CurrBase = Base;
+        CurrOpc  = Opcode;
+        CurrSize = Size;
+        CurrPred = Pred;
+        CurrPredReg = PredReg;
+        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
+        ++NumMemOps;
+        Advance = true;
+      } else if (!Overlap) {
+        if (Clobber) {
+          TryMerge = true;
+          Advance = true;
+        }
+
+        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+          // No need to match PredReg.
+          // Continue adding to the queue.
+          if (Offset > MemOps.back().Offset) {
+            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
+                                             Position, MBBI));
+            ++NumMemOps;
+            Advance = true;
+          } else {
+            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
+                 I != E; ++I) {
+              if (Offset < I->Offset) {
+                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
+                                                 Position, MBBI));
+                ++NumMemOps;
+                Advance = true;
+                break;
+              } else if (Offset == I->Offset) {
+                // Collision! This can't be merged!
+                break;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    if (MBBI->isDebugValue()) {
+      ++MBBI;
+      if (MBBI == E)
+        // Reach the end of the block, try merging the memory instructions.
+        TryMerge = true;
+    } else if (Advance) {
+      ++Position;
+      ++MBBI;
+      if (MBBI == E)
+        // Reach the end of the block, try merging the memory instructions.
+        TryMerge = true;
+    } else {
+      TryMerge = true;
+    }
+
+    if (TryMerge) {
+      if (NumMemOps > 1) {
+        // Try to find a free register to use as a new base in case it's needed.
+        // First advance to the instruction just before the start of the chain.
+        AdvanceRS(MBB, MemOps);
+
+        // Find a scratch register.
+        unsigned Scratch =
+          RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass);
+
+        // Process the load / store instructions.
+        RS->forward(std::prev(MBBI));
+
+        // Merge ops.
+        Merges.clear();
+        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
+                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);
+
+        // Try folding preceding/trailing base inc/dec into the generated
+        // LDM/STM ops.
+        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
+          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
+            ++NumMerges;
+        NumMerges += Merges.size();
+
+        // Try folding preceding/trailing base inc/dec into those load/store
+        // that were not merged to form LDM/STM ops.
+        for (unsigned i = 0; i != NumMemOps; ++i)
+          if (!MemOps[i].Merged)
+            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
+              ++NumMerges;
+
+        // RS may be pointing to an instruction that's deleted.
+        RS->skipTo(std::prev(MBBI));
+      } else if (NumMemOps == 1) {
+        // Try folding preceding/trailing base inc/dec into the single
+        // load/store.
+        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
+          ++NumMerges;
+          RS->forward(std::prev(MBBI));
+        }
+      }
+
+      CurrBase = 0;
+      CurrOpc = -1;
+      CurrSize = 0;
+      CurrPred = ARMCC::AL;
+      CurrPredReg = 0;
+      if (NumMemOps) {
+        MemOps.clear();
+        NumMemOps = 0;
+      }
+
+      // If iterator hasn't been advanced and this is not a memory op, skip it.
+      // It can't start a new chain anyway.
+      if (!Advance && !isMemOp && MBBI != E) {
+        ++Position;
+        ++MBBI;
+      }
+    }
+  }
+  return NumMerges > 0;
+}
+
+/// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops
+/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
+/// directly restore the value of LR into pc.
+///   ldmfd sp!, {..., lr}
+///   bx lr
+/// or
+///   ldmfd sp!, {..., lr}
+///   mov pc, lr
+/// =>
+///   ldmfd sp!, {..., pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  // Thumb1 LDM doesn't allow high registers.
+  if (isThumb1) return false;
+  if (MBB.empty()) return false;
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  if (MBBI != MBB.begin() &&
+      (MBBI->getOpcode() == ARM::BX_RET ||
+       MBBI->getOpcode() == ARM::tBX_RET ||
+       MBBI->getOpcode() == ARM::MOVPCLR)) {
+    MachineInstr *PrevMI = std::prev(MBBI);
+    unsigned Opcode = PrevMI->getOpcode();
+    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
+        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
+        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+      if (MO.getReg() != ARM::LR)
+        return false;
+      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
+      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
+              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
+      PrevMI->setDesc(TII->get(NewOpc));
+      MO.setReg(ARM::PC);
+      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
+      MBB.erase(MBBI);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  TL = TM.getTargetLowering();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  STI = &TM.getSubtarget<ARMSubtarget>();
+  RS = new RegScavenger();
+  isThumb2 = AFI->isThumb2Function();
+  isThumb1 = AFI->isThumbFunction() && !isThumb2;
+
+  // FIXME: Temporarily disabling for Thumb-1 due to miscompiles
+  if (isThumb1) {
+    delete RS;
+    return false;
+  }
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
+      Modified |= MergeReturnIntoLDM(MBB);
+  }
+
+  delete RS;
+  return Modified;
+}
+
+
+/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move
+/// load / stores from consecutive locations close to make it more
+/// likely they will be combined later.
+
+namespace {
+  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
+    static char ID;
+    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const DataLayout *TD;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    MachineRegisterInfo *MRI;
+    MachineFunction *MF;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "ARM pre- register allocation load / store optimization pass";
+    }
+
+  private:
+    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
+                          unsigned &NewOpc, unsigned &EvenReg,
+                          unsigned &OddReg, unsigned &BaseReg,
+                          int &Offset,
+                          unsigned &PredReg, ARMCC::CondCodes &Pred,
+                          bool &isT2);
+    bool RescheduleOps(MachineBasicBlock *MBB,
+                       SmallVectorImpl<MachineInstr *> &Ops,
+                       unsigned Base, bool isLd,
+                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
+    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+  };
+  char ARMPreAllocLoadStoreOpt::ID = 0;
+}
+
+bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  TD  = Fn.getTarget().getDataLayout();
+  TII = Fn.getTarget().getInstrInfo();
+  TRI = Fn.getTarget().getRegisterInfo();
+  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  MRI = &Fn.getRegInfo();
+  MF  = &Fn;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI)
+    Modified |= RescheduleLoadStoreInstrs(MFI);
+
+  return Modified;
+}
+
+static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
+                                      MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator E,
+                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
+                                      SmallSet<unsigned, 4> &MemRegs,
+                                      const TargetRegisterInfo *TRI) {
+  // Are there stores / loads / calls between them?
+  // FIXME: This is overly conservative. We should make use of alias information
+  // some day.
+  SmallSet<unsigned, 4> AddedRegPressure;
+  while (++I != E) {
+    if (I->isDebugValue() || MemOps.count(&*I))
+      continue;
+    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
+      return false;
+    if (isLd && I->mayStore())
+      return false;
+    if (!isLd) {
+      if (I->mayLoad())
+        return false;
+      // It's not safe to move the first 'str' down.
+      // str r1, [r0]
+      // strh r5, [r0]
+      // str r4, [r0, #+4]
+      if (I->mayStore())
+        return false;
+    }
+    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
+      MachineOperand &MO = I->getOperand(j);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
+        return false;
+      if (Reg != Base && !MemRegs.count(Reg))
+        AddedRegPressure.insert(Reg);
+    }
+  }
+
+  // Estimate register pressure increase due to the transformation.
+  if (MemRegs.size() <= 4)
+    // Ok if we are moving small number of instructions.
+    return true;
+  return AddedRegPressure.size() <= MemRegs.size() * 2;
+}
+
+
+/// Copy Op0 and Op1 operands into a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+                                   MachineInstr *Op1) {
+  assert(MI->memoperands_empty() && "expected a new machineinstr");
+  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
+    + (Op1->memoperands_end() - Op1->memoperands_begin());
+
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+  MachineSDNode::mmo_iterator MemEnd =
+    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+  MemEnd =
+    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+  MI->setMemRefs(MemBegin, MemEnd);
+}
+
+bool
+ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
+                                          DebugLoc &dl,
+                                          unsigned &NewOpc, unsigned &EvenReg,
+                                          unsigned &OddReg, unsigned &BaseReg,
+                                          int &Offset, unsigned &PredReg,
+                                          ARMCC::CondCodes &Pred,
+                                          bool &isT2) {
+  // Make sure we're allowed to generate LDRD/STRD.
+  if (!STI->hasV5TEOps())
+    return false;
+
+  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
+  unsigned Scale = 1;
+  unsigned Opcode = Op0->getOpcode();
+  if (Opcode == ARM::LDRi12) {
+    NewOpc = ARM::LDRD;
+  } else if (Opcode == ARM::STRi12) {
+    NewOpc = ARM::STRD;
+  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+    NewOpc = ARM::t2LDRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+    NewOpc = ARM::t2STRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else {
+    return false;
+  }
+
+  // Make sure the base address satisfies i64 ld / st alignment requirement.
+  // At the moment, we ignore the memoryoperand's value.
+  // If we want to use AliasAnalysis, we should check it accordingly.
+  if (!Op0->hasOneMemOperand() ||
+      (*Op0->memoperands_begin())->isVolatile())
+    return false;
+
+  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+  const Function *Func = MF->getFunction();
+  unsigned ReqAlign = STI->hasV6Ops()
+    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
+    : 8;  // Pre-v6 need 8-byte align
+  if (Align < ReqAlign)
+    return false;
+
+  // Then make sure the immediate offset fits.
+  int OffImm = getMemoryOpOffset(Op0);
+  if (isT2) {
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
+      return false;
+    Offset = OffImm;
+  } else {
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (OffImm < 0) {
+      AddSub = ARM_AM::sub;
+      OffImm = - OffImm;
+    }
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm & (Scale-1)))
+      return false;
+    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
+  }
+  EvenReg = Op0->getOperand(0).getReg();
+  OddReg  = Op1->getOperand(0).getReg();
+  if (EvenReg == OddReg)
+    return false;
+  BaseReg = Op0->getOperand(1).getReg();
+  Pred = getInstrPredicate(Op0, PredReg);
+  dl = Op0->getDebugLoc();
+  return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
+                                 SmallVectorImpl<MachineInstr *> &Ops,
+                                 unsigned Base, bool isLd,
+                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
+  bool RetVal = false;
+
+  // Sort by offset (in reverse order).
+  std::sort(Ops.begin(), Ops.end(),
+            [](const MachineInstr *LHS, const MachineInstr *RHS) {
+    int LOffset = getMemoryOpOffset(LHS);
+    int ROffset = getMemoryOpOffset(RHS);
+    assert(LHS == RHS || LOffset != ROffset);
+    return LOffset > ROffset;
+  });
+
+  // The loads / stores of the same base are in order. Scan them from first to
+  // last and check for the following:
+  // 1. Any def of base.
+  // 2. Any gaps.
+  while (Ops.size() > 1) {
+    unsigned FirstLoc = ~0U;
+    unsigned LastLoc = 0;
+    MachineInstr *FirstOp = nullptr;
+    MachineInstr *LastOp = nullptr;
+    int LastOffset = 0;
+    unsigned LastOpcode = 0;
+    unsigned LastBytes = 0;
+    unsigned NumMove = 0;
+    for (int i = Ops.size() - 1; i >= 0; --i) {
+      MachineInstr *Op = Ops[i];
+      unsigned Loc = MI2LocMap[Op];
+      if (Loc <= FirstLoc) {
+        FirstLoc = Loc;
+        FirstOp = Op;
+      }
+      if (Loc >= LastLoc) {
+        LastLoc = Loc;
+        LastOp = Op;
+      }
+
+      unsigned LSMOpcode
+        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
+      if (LastOpcode && LSMOpcode != LastOpcode)
+        break;
+
+      int Offset = getMemoryOpOffset(Op);
+      unsigned Bytes = getLSMultipleTransferSize(Op);
+      if (LastBytes) {
+        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
+          break;
+      }
+      LastOffset = Offset;
+      LastBytes = Bytes;
+      LastOpcode = LSMOpcode;
+      if (++NumMove == 8) // FIXME: Tune this limit.
+        break;
+    }
+
+    if (NumMove <= 1)
+      Ops.pop_back();
+    else {
+      SmallPtrSet<MachineInstr*, 4> MemOps;
+      SmallSet<unsigned, 4> MemRegs;
+      for (int i = NumMove-1; i >= 0; --i) {
+        MemOps.insert(Ops[i]);
+        MemRegs.insert(Ops[i]->getOperand(0).getReg());
+      }
+
+      // Be conservative, if the instructions are too far apart, don't
+      // move them. We want to limit the increase of register pressure.
+      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
+      if (DoMove)
+        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
+                                           MemOps, MemRegs, TRI);
+      if (!DoMove) {
+        for (unsigned i = 0; i != NumMove; ++i)
+          Ops.pop_back();
+      } else {
+        // This is the new location for the loads / stores.
+        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
+        while (InsertPos != MBB->end()
+               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
+          ++InsertPos;
+
+        // If we are moving a pair of loads / stores, see if it makes sense
+        // to try to allocate a pair of registers that can form register pairs.
+        MachineInstr *Op0 = Ops.back();
+        MachineInstr *Op1 = Ops[Ops.size()-2];
+        unsigned EvenReg = 0, OddReg = 0;
+        unsigned BaseReg = 0, PredReg = 0;
+        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
+        unsigned NewOpc = 0;
+        int Offset = 0;
+        DebugLoc dl;
+        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
+                                             EvenReg, OddReg, BaseReg,
+                                             Offset, PredReg, Pred, isT2)) {
+          Ops.pop_back();
+          Ops.pop_back();
+
+          const MCInstrDesc &MCID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+          MRI->constrainRegClass(EvenReg, TRC);
+          MRI->constrainRegClass(OddReg, TRC);
+
+          // Form the pair instruction.
+          if (isLd) {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(EvenReg, RegState::Define)
+              .addReg(OddReg, RegState::Define)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming LDRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            concatenateMemOperands(MIB, Op0, Op1);
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumLDRDFormed;
+          } else {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(EvenReg)
+              .addReg(OddReg)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming STRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            concatenateMemOperands(MIB, Op0, Op1);
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumSTRDFormed;
+          }
+          MBB->erase(Op0);
+          MBB->erase(Op1);
+
+          // Add register allocation hints to form register pairs.
+          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
+          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
+        } else {
+          for (unsigned i = 0; i != NumMove; ++i) {
+            MachineInstr *Op = Ops.back();
+            Ops.pop_back();
+            MBB->splice(InsertPos, MBB, Op);
+          }
+        }
+
+        NumLdStMoved += NumMove;
+        RetVal = true;
+      }
+    }
+  }
+
+  return RetVal;
+}
+
+bool
+ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
+  bool RetVal = false;
+
+  DenseMap<MachineInstr*, unsigned> MI2LocMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
+  SmallVector<unsigned, 4> LdBases;
+  SmallVector<unsigned, 4> StBases;
+
+  unsigned Loc = 0;
+  MachineBasicBlock::iterator MBBI = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+  while (MBBI != E) {
+    for (; MBBI != E; ++MBBI) {
+      MachineInstr *MI = MBBI;
+      if (MI->isCall() || MI->isTerminator()) {
+        // Stop at barriers.
+        ++MBBI;
+        break;
+      }
+
+      if (!MI->isDebugValue())
+        MI2LocMap[MI] = ++Loc;
+
+      if (!isMemoryOp(MI))
+        continue;
+      unsigned PredReg = 0;
+      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+        continue;
+
+      int Opc = MI->getOpcode();
+      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+      unsigned Base = MI->getOperand(1).getReg();
+      int Offset = getMemoryOpOffset(MI);
+
+      bool StopHere = false;
+      if (isLd) {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2LdsMap.find(Base);
+        if (BI != Base2LdsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(MI);
+        } else {
+          Base2LdsMap[Base].push_back(MI);
+          LdBases.push_back(Base);
+        }
+      } else {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2StsMap.find(Base);
+        if (BI != Base2StsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(MI);
+        } else {
+          Base2StsMap[Base].push_back(MI);
+          StBases.push_back(Base);
+        }
+      }
+
+      if (StopHere) {
+        // Found a duplicate (a base+offset combination that's seen earlier).
+        // Backtrack.
+        --Loc;
+        break;
+      }
+    }
+
+    // Re-schedule loads.
+    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
+      unsigned Base = LdBases[i];
+      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
+      if (Lds.size() > 1)
+        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
+    }
+
+    // Re-schedule stores.
+    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
+      unsigned Base = StBases[i];
+      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
+      if (Sts.size() > 1)
+        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
+    }
+
+    if (MBBI != E) {
+      Base2LdsMap.clear();
+      Base2StsMap.clear();
+      LdBases.clear();
+      StBases.clear();
+    }
+  }
+
+  return RetVal;
+}
+
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
+  if (PreAlloc)
+    return new ARMPreAllocLoadStoreOpt();
+  return new ARMLoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
new file mode 100644
index 0000000..023f5f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -0,0 +1,129 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAsmPrinter.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+using namespace llvm;
+
+
+MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                      const MCSymbol *Symbol) {
+  const MCExpr *Expr;
+  unsigned Option = MO.getTargetFlags() & ARMII::MO_OPTION_MASK;
+  switch (Option) {
+  default: {
+    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                   OutContext);
+    switch (Option) {
+    default: llvm_unreachable("Unknown target flag on symbol operand");
+    case ARMII::MO_NO_FLAG:
+      break;
+    case ARMII::MO_LO16:
+      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                     OutContext);
+      Expr = ARMMCExpr::CreateLower16(Expr, OutContext);
+      break;
+    case ARMII::MO_HI16:
+      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                     OutContext);
+      Expr = ARMMCExpr::CreateUpper16(Expr, OutContext);
+      break;
+    }
+    break;
+  }
+
+  case ARMII::MO_PLT:
+    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_PLT,
+                                   OutContext);
+    break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(),
+                                                          OutContext),
+                                   OutContext);
+  return MCOperand::CreateExpr(Expr);
+
+}
+
+bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                 MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all non-CPSR implicit register operands.
+    if (MO.isImplicit() && MO.getReg() != ARM::CPSR)
+      return false;
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    MCOp = MCOperand::CreateReg(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+        MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    MCOp = GetSymbolRef(MO,
+                        GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags()));
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol:
+   MCOp = GetSymbolRef(MO,
+                        GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = GetSymbolRef(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    APFloat Val = MO.getFPImm()->getValueAPF();
+    bool ignored;
+    Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+    MCOp = MCOperand::CreateFPImm(Val.convertToDouble());
+    break;
+  }
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers.
+    return false;
+  }
+  return true;
+}
+
+void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        ARMAsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (AP.lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
new file mode 100644
index 0000000..892b269
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -0,0 +1,24 @@
+//===-- ARMMachineFuctionInfo.cpp - ARM machine function info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void ARMFunctionInfo::anchor() { }
+
+ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
+    : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+      hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+      StByValParamsPadding(0), ArgRegsSaveSize(0), HasStackFrame(false),
+      RestoreSPFromFP(false), LRSpilledForFarJump(false),
+      FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+      GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), JumpTableUId(0),
+      PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false),
+      GlobalBaseReg(0) {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 0000000..d3fabc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,241 @@
+//===-- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMACHINEFUNCTIONINFO_H
+#define ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialized Align, so must precede it.
+  bool isThumb;
+
+  /// hasThumb2 - True if the target architecture supports Thumb2. Do not use
+  /// to determine if function is compiled under Thumb mode, for that use
+  /// 'isThumb'.
+  bool hasThumb2;
+
+  /// StByValParamsPadding - For parameter that is split between
+  /// GPRs and memory; while recovering GPRs part, when
+  /// StackAlignment > 4, and GPRs-part-size mod StackAlignment != 0,
+  /// we need to insert gap before parameter start address. It allows to
+  /// "attach" GPR-part to the part that was passed via stack.
+  unsigned StByValParamsPadding;
+
+  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  ///
+  unsigned ArgRegsSaveSize;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
+
+  /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
+  /// emitPrologue.
+  bool RestoreSPFromFP;
+
+  /// LRSpilledForFarJump - True if the LR register has been for spilled to
+  /// enable far jump.
+  bool LRSpilledForFarJump;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spills areas. For Mac OS X:
+  ///
+  /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+  /// --------------------------------------------
+  /// GPR callee-saved (2) : r8, r10, r11
+  /// --------------------------------------------
+  /// DPR callee-saved : d8 - d15
+  ///
+  /// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3.
+  /// Some may be spilled after the stack has been realigned.
+  unsigned GPRCS1Offset;
+  unsigned GPRCS2Offset;
+  unsigned DPRCSOffset;
+
+  /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+  /// areas.
+  unsigned GPRCS1Size;
+  unsigned GPRCS2Size;
+  unsigned DPRCSSize;
+
+  /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
+  /// the aligned portion of the stack frame.  This is always a contiguous
+  /// sequence of D-registers starting from d8.
+  ///
+  /// We do not keep track of the frame indices used for these registers - they
+  /// behave like any other frame index in the aligned stack frame.  These
+  /// registers also aren't included in DPRCSSize above.
+  unsigned NumAlignedDPRCS2Regs;
+
+  /// JumpTableUId - Unique id for jumptables.
+  ///
+  unsigned JumpTableUId;
+
+  unsigned PICLabelUId;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// HasITBlocks - True if IT blocks have been inserted.
+  bool HasITBlocks;
+
+  /// CPEClones - Track constant pool entries clones created by Constant Island
+  /// pass.
+  DenseMap<unsigned, unsigned> CPEClones;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// ArgumentStackSize - amount of bytes on stack consumed by the arguments
+  /// being passed on the stack
+  unsigned ArgumentStackSize;
+
+  /// CoalescedWeights - mapping of basic blocks to the rolling counter of
+  /// coalesced weights.
+  DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
+
+public:
+  ARMFunctionInfo() :
+    isThumb(false),
+    hasThumb2(false),
+    ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
+    LRSpilledForFarJump(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    NumAlignedDPRCS2Regs(0),
+    JumpTableUId(0), PICLabelUId(0),
+    VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
+
+  explicit ARMFunctionInfo(MachineFunction &MF);
+
+  bool isThumbFunction() const { return isThumb; }
+  bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
+  bool isThumb2Function() const { return isThumb && hasThumb2; }
+
+  unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
+  void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
+
+  unsigned getArgRegsSaveSize(unsigned Align = 0) const {
+    if (!Align)
+      return ArgRegsSaveSize;
+    return (ArgRegsSaveSize + Align - 1) & ~(Align - 1);
+  }
+  void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; }
+  void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; }
+
+  bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+  void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+  unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+  void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+  unsigned getNumAlignedDPRCS2Regs() const { return NumAlignedDPRCS2Regs; }
+  void setNumAlignedDPRCS2Regs(unsigned n) { NumAlignedDPRCS2Regs = n; }
+
+  unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset()  const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o)  { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+  unsigned createJumpTableUId() {
+    return JumpTableUId++;
+  }
+
+  unsigned getNumJumpTables() const {
+    return JumpTableUId;
+  }
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+  unsigned getNumPICLabels() const {
+    return PICLabelUId;
+  }
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasITBlocks() const { return HasITBlocks; }
+  void setHasITBlocks(bool h) { HasITBlocks = h; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
+    if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
+      llvm_unreachable("Duplicate entries!");
+  }
+
+  unsigned getOriginalCPIdx(unsigned CloneIdx) const {
+    DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx);
+    if (I != CPEClones.end())
+      return I->second;
+    else
+      return -1U;
+  }
+
+  DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+                                                  MachineBasicBlock* MBB) {
+    auto It = CoalescedWeights.find(MBB);
+    if (It == CoalescedWeights.end()) {
+      It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+    }
+    return It;
+  }
+};
+} // End llvm namespace
+
+#endif // ARMMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
new file mode 100644
index 0000000..2a49255
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -0,0 +1,101 @@
+//===-- ARMOptimizeBarriersPass - two DMBs without a memory access in between,
+//removed one -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "double barriers"
+
+STATISTIC(NumDMBsRemoved, "Number of DMBs removed");
+
+namespace {
+class ARMOptimizeBarriersPass : public MachineFunctionPass {
+public:
+  static char ID;
+  ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  const char *getPassName() const override {
+    return "optimise barriers pass";
+  }
+
+private:
+};
+char ARMOptimizeBarriersPass::ID = 0;
+}
+
+// Returns whether the instruction can safely move past a DMB instruction
+// The current implementation allows this iif MI does not have any possible
+// memory access
+static bool CanMovePastDMB(const MachineInstr *MI) {
+  return !(MI->mayLoad() ||
+          MI->mayStore() ||
+          MI->hasUnmodeledSideEffects() ||
+          MI->isCall() ||
+          MI->isReturn());
+}
+
+bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+  // Vector to store the DMBs we will remove after the first iteration
+  std::vector<MachineInstr *> ToRemove;
+  // DMBType is the Imm value of the first operand. It determines whether it's a
+  // DMB ish, dmb sy, dmb osh, etc
+  int64_t DMBType = -1;
+
+  // Find a dmb. If we can move it until the next dmb, tag the second one for
+  // removal
+  for (auto &MBB : MF) {
+    // Will be true when we have seen a DMB, and not seen any instruction since
+    // that cannot move past a DMB
+    bool IsRemovableNextDMB = false;
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == ARM::DMB) {
+        if (IsRemovableNextDMB) {
+          // If the Imm of this DMB is the same as that of the last DMB, we can
+          // tag this second DMB for removal
+          if (MI.getOperand(0).getImm() == DMBType) {
+            ToRemove.push_back(&MI);
+          } else {
+            // If it has a different DMBType, we cannot remove it, but will scan
+            // for the next DMB, recording this DMB's type as last seen DMB type
+            DMBType = MI.getOperand(0).getImm();
+          }
+        } else {
+          // After we see a DMB, a next one is removable
+          IsRemovableNextDMB = true;
+          DMBType = MI.getOperand(0).getImm();
+        }
+      } else if (!CanMovePastDMB(&MI)) {
+        // If we find an instruction unable to pass past a DMB, a next DMB is
+        // not removable
+        IsRemovableNextDMB = false;
+      }
+    }
+  }
+  // Remove the tagged DMB
+  for (auto MI : ToRemove) {
+    MI->eraseFromParent();
+    ++NumDMBsRemoved;
+  }
+
+  return NumDMBsRemoved > 0;
+}
+
+/// createARMOptimizeBarriersPass - Returns an instance of the remove double
+/// barriers
+/// pass.
+FunctionPass *llvm::createARMOptimizeBarriersPass() {
+  return new ARMOptimizeBarriersPass();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 0000000..efa22fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using neon instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 0000000..80b4b48
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,21 @@
+//===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterInfo.h"
+using namespace llvm;
+
+void ARMRegisterInfo::anchor() { }
+
+ARMRegisterInfo::ARMRegisterInfo(const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(sti) {
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 0000000..3e6af3f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,31 @@
+//===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMREGISTERINFO_H
+#define ARMREGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+
+namespace llvm {
+
+class ARMSubtarget;
+
+struct ARMRegisterInfo : public ARMBaseRegisterInfo {
+  virtual void anchor();
+public:
+  ARMRegisterInfo(const ARMSubtarget &STI);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 0000000..b290e7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,419 @@
+//===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "ARM";
+  let SubRegs = subregs;
+  // All bits of ARM registers with sub-registers are covered by sub-registers.
+  let CoveredBySubRegs = 1;
+}
+
+class ARMFReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "ARM";
+}
+
+// Subregister indices.
+let Namespace = "ARM" in {
+def qqsub_0 : SubRegIndex<256>;
+def qqsub_1 : SubRegIndex<256, 256>;
+
+// Note: Code depends on these having consecutive numbers.
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub_1, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub_1, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
+def dsub_5 : ComposedSubRegIndex<qsub_2, dsub_1>;
+def dsub_6 : ComposedSubRegIndex<qsub_3, dsub_0>;
+def dsub_7 : ComposedSubRegIndex<qsub_3, dsub_1>;
+
+def ssub_0  : SubRegIndex<32>;
+def ssub_1  : SubRegIndex<32, 32>;
+def ssub_2  : ComposedSubRegIndex<dsub_1, ssub_0>;
+def ssub_3  : ComposedSubRegIndex<dsub_1, ssub_1>;
+
+def gsub_0  : SubRegIndex<32>;
+def gsub_1  : SubRegIndex<32, 32>;
+// Let TableGen synthesize the remaining 12 ssub_* indices.
+// We don't need to name them.
+}
+
+// Integer registers
+def R0  : ARMReg< 0, "r0">,  DwarfRegNum<[0]>;
+def R1  : ARMReg< 1, "r1">,  DwarfRegNum<[1]>;
+def R2  : ARMReg< 2, "r2">,  DwarfRegNum<[2]>;
+def R3  : ARMReg< 3, "r3">,  DwarfRegNum<[3]>;
+def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
+def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
+def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
+def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
+// These require 32-bit instructions.
+let CostPerUse = 1 in {
+def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
+def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
+def SP  : ARMReg<13, "sp">,  DwarfRegNum<[13]>;
+def LR  : ARMReg<14, "lr">,  DwarfRegNum<[14]>;
+def PC  : ARMReg<15, "pc">,  DwarfRegNum<[15]>;
+}
+
+// Float registers
+def S0  : ARMFReg< 0, "s0">;  def S1  : ARMFReg< 1, "s1">;
+def S2  : ARMFReg< 2, "s2">;  def S3  : ARMFReg< 3, "s3">;
+def S4  : ARMFReg< 4, "s4">;  def S5  : ARMFReg< 5, "s5">;
+def S6  : ARMFReg< 6, "s6">;  def S7  : ARMFReg< 7, "s7">;
+def S8  : ARMFReg< 8, "s8">;  def S9  : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+let SubRegIndices = [ssub_0, ssub_1] in {
+def D0  : ARMReg< 0,  "d0", [S0,   S1]>, DwarfRegNum<[256]>;
+def D1  : ARMReg< 1,  "d1", [S2,   S3]>, DwarfRegNum<[257]>;
+def D2  : ARMReg< 2,  "d2", [S4,   S5]>, DwarfRegNum<[258]>;
+def D3  : ARMReg< 3,  "d3", [S6,   S7]>, DwarfRegNum<[259]>;
+def D4  : ARMReg< 4,  "d4", [S8,   S9]>, DwarfRegNum<[260]>;
+def D5  : ARMReg< 5,  "d5", [S10, S11]>, DwarfRegNum<[261]>;
+def D6  : ARMReg< 6,  "d6", [S12, S13]>, DwarfRegNum<[262]>;
+def D7  : ARMReg< 7,  "d7", [S14, S15]>, DwarfRegNum<[263]>;
+def D8  : ARMReg< 8,  "d8", [S16, S17]>, DwarfRegNum<[264]>;
+def D9  : ARMReg< 9,  "d9", [S18, S19]>, DwarfRegNum<[265]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>;
+}
+
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>;
+def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>;
+def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>;
+def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>;
+def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>;
+def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>;
+def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>;
+def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>;
+def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>;
+def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>;
+def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>;
+def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>;
+def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>;
+def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>;
+def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>;
+def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q0  : ARMReg< 0,  "q0", [D0,   D1]>;
+def Q1  : ARMReg< 1,  "q1", [D2,   D3]>;
+def Q2  : ARMReg< 2,  "q2", [D4,   D5]>;
+def Q3  : ARMReg< 3,  "q3", [D6,   D7]>;
+def Q4  : ARMReg< 4,  "q4", [D8,   D9]>;
+def Q5  : ARMReg< 5,  "q5", [D10, D11]>;
+def Q6  : ARMReg< 6,  "q6", [D12, D13]>;
+def Q7  : ARMReg< 7,  "q7", [D14, D15]>;
+}
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q8  : ARMReg< 8,  "q8", [D16, D17]>;
+def Q9  : ARMReg< 9,  "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+}
+
+// Current Program Status Register.
+// We model fpscr with two registers: FPSCR models the control bits and will be
+// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
+// models the APSR when it's accessed by some special instructions. In such cases
+// it has the same encoding as PC.
+def CPSR       : ARMReg<0,  "cpsr">;
+def APSR       : ARMReg<1,  "apsr">;
+def APSR_NZCV  : ARMReg<15, "apsr_nzcv">;
+def SPSR       : ARMReg<2,  "spsr">;
+def FPSCR      : ARMReg<3,  "fpscr">;
+def FPSCR_NZCV : ARMReg<3,  "fpscr_nzcv"> {
+  let Aliases = [FPSCR];
+}
+def ITSTATE    : ARMReg<4, "itstate">;
+
+// Special Registers - only available in privileged mode.
+def FPSID   : ARMReg<0,  "fpsid">;
+def MVFR2   : ARMReg<5,  "mvfr2">;
+def MVFR1   : ARMReg<6,  "mvfr1">;
+def MVFR0   : ARMReg<7,  "mvfr0">;
+def FPEXC   : ARMReg<8,  "fpexc">;
+def FPINST  : ARMReg<9,  "fpinst">;
+def FPINST2 : ARMReg<10, "fpinst2">;
+
+// Register classes.
+//
+// pc  == Program Counter
+// lr  == Link Register
+// sp  == Stack Pointer
+// r12 == ip (scratch)
+// r7  == Frame Pointer (thumb-style backtraces)
+// r9  == May be reserved as Thread Register
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
+                                               SP, LR, PC)> {
+  // Allocate LR as the first CSR since it is always saved anyway.
+  // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't
+  // know how to spill them. If we make our prologue/epilogue code smarter at
+  // some point, we can go back to using the above allocation orders for the
+  // Thumb1 instructions that know how to use hi regs.
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRs without the PC.  Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination.  Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRs without the PC but with APSR. Some instructions allow accessing the
+// APSR, while actually encoding PC in the register field. This is useful
+// for assembly and disassembly only.
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP an an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
+// restricted GPR register class. Many Thumb2 instructions allow the full
+// register range for operands, but have undefined behaviours when PC
+// or SP (R13 or R15) are used. The ARM ISA refers to these operands
+// via the BadReg() pseudo-code description.
+def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// Thumb registers are R0-R7 normally. Some instructions can still use
+// the general GPR register class above (MOV, e.g.)
+def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+
+// The high registers in thumb mode, R8-R15.
+def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
+// this class and the preceding one(!)  This is what we want.
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
+  let AltOrders = [(and tcGPR, tGPR)];
+  let AltOrderSelect = [{
+      return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+
+// Scalar single precision floating point register class..
+// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
+// avoid partial-write dependencies on D registers (S registers are
+// renamed as portions of D registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
+                                                (sequence "S%u", 0, 31), 2),
+                                               (sequence "S%u", 0, 31))>;
+
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
+
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
+// ARM requires only word alignment for double. It's more performant if it
+// is double-word alignment though.
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                        (sequence "D%u", 0, 31)> {
+  // Allocate non-VFP2 registers D16-D31 first.
+  let AltOrders = [(rotl DPR, 16)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of DPR that are accessible with VFP2 (and so that also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                             (trunc DPR, 16)>;
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                          (trunc DPR, 8)>;
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+                        (sequence "Q%u", 0, 15)> {
+  // Allocate non-VFP2 aliases Q8-Q15 first.
+  let AltOrders = [(rotl QPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                             128, (trunc QPR, 8)>;
+
+// Subset of QPR that have DPR_8 and SPR_8 subregs.
+def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                           128, (trunc QPR, 4)>;
+
+// Pseudo-registers representing odd-even pairs of D registers. The even-odd
+// pairs are already represented by the Q registers.
+// These are needed by NEON instructions requiring two consecutive D registers.
+// There is no D31_D0 register as that is always an UNPREDICTABLE encoding.
+def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1],
+                                [(decimate (shl DPR, 1), 2),
+                                 (decimate (shl DPR, 2), 2)]>;
+
+// Register class representing a pair of consecutive D registers.
+// Use the Q registers for the even-odd pairs.
+def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (interleave QPR, TuplesOE2D)> {
+  // Allocate starting at non-VFP2 registers D16-D31 first.
+  // Prefer even-odd pairs as they are easier to copy.
+  let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
+// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
+def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
+                              [(add R0, R2, R4, R6, R8, R10, R12),
+                               (add R1, R3, R5, R7, R9, R11, SP)]>;
+
+// Register class representing a pair of even-odd GPRs.
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+  let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Pseudo-registers representing 3 consecutive D registers.
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+                              [(shl DPR, 0),
+                               (shl DPR, 1),
+                               (shl DPR, 2)]>;
+
+// 3 consecutive D registers.
+def DTriple : RegisterClass<"ARM", [untyped], 64, (add Tuples3D)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+// Pseudo 256-bit registers to represent pairs of Q registers. These should
+// never be present in the emitted code.
+// These are used for NEON load / store instructions, e.g., vld4, vst3.
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>;
+
+// Pseudo 256-bit vector register class to model pairs of Q registers
+// (4 consecutive D registers).
+def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
+  // Allocate non-VFP2 aliases first.
+  let AltOrders = [(rotl QQPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Tuples of 4 D regs that isn't also a pair of Q regs.
+def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+                                [(decimate (shl DPR, 1), 2),
+                                 (decimate (shl DPR, 2), 2),
+                                 (decimate (shl DPR, 3), 2),
+                                 (decimate (shl DPR, 4), 2)]>;
+
+// 4 consecutive D registers.
+def DQuad : RegisterClass<"ARM", [v4i64], 256,
+                          (interleave Tuples2Q, TuplesOE4D)>;
+
+// Pseudo 512-bit registers to represent four consecutive Q registers.
+def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1],
+                               [(shl QQPR, 0), (shl QQPR, 2)]>;
+
+// Pseudo 512-bit vector register class to model 4 consecutive Q registers
+// (8 consecutive D registers).
+def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> {
+  // Allocate non-VFP2 aliases first.
+  let AltOrders = [(rotl QQQQPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+
+// Pseudo-registers representing 2-spaced consecutive D registers.
+def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2)]>;
+
+// Spaced pairs of D registers.
+def DPairSpc : RegisterClass<"ARM", [v2i64], 64, (add Tuples2DSpc)>;
+
+def Tuples3DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2),
+                                  (shl DPR, 4)]>;
+
+// Spaced triples of D registers.
+def DTripleSpc : RegisterClass<"ARM", [untyped], 64, (add Tuples3DSpc)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2),
+                                  (shl DPR, 4),
+                                  (shl DPR, 6)]>;
+
+// Spaced quads of D registers.
+def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMRelocations.h b/contrib/llvm/lib/Target/ARM/ARMRelocations.h
new file mode 100644
index 0000000..21877fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRelocations.h
@@ -0,0 +1,62 @@
+//===-- ARMRelocations.h - ARM Code Relocations -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMRELOCATIONS_H
+#define ARMRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace ARM {
+    enum RelocationType {
+      // reloc_arm_absolute - Absolute relocation, just add the relocated value
+      // to the value already in memory.
+      reloc_arm_absolute,
+
+      // reloc_arm_relative - PC relative relocation, add the relocated value to
+      // the value already in memory, after we adjust it for where the PC is.
+      reloc_arm_relative,
+
+      // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose
+      // addresses are kept locally in a map.
+      reloc_arm_cp_entry,
+
+      // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset
+      // should be divided by 4.
+      reloc_arm_vfp_cp_entry,
+
+      // reloc_arm_machine_cp_entry - Relocation of a ARM machine constantpool
+      // entry.
+      reloc_arm_machine_cp_entry,
+
+      // reloc_arm_jt_base - PC relative relocation for jump tables whose
+      // addresses are kept locally in a map.
+      reloc_arm_jt_base,
+
+      // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base.
+      reloc_arm_pic_jt,
+
+      // reloc_arm_branch - Branch address relocation.
+      reloc_arm_branch,
+
+      // reloc_arm_movt  - MOVT immediate relocation.
+      reloc_arm_movt,
+
+      // reloc_arm_movw  - MOVW immediate relocation.
+      reloc_arm_movw
+    };
+  }
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
new file mode 100644
index 0000000..528c4ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -0,0 +1,354 @@
+//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Instruction scheduling annotations for out-of-order CPUs.
+// These annotations are independent of the itinerary class defined below.
+// Here we define the subtarget independent read/write per-operand resources.
+// The subtarget schedule definitions will then map these to the subtarget's
+// resource usages.
+// For example:
+// The instruction cycle timings table might contain an entry for an operation
+// like the following:
+// Rd <- ADD Rn, Rm, <shift> Rs
+//  Uops | Latency from register | Uops - resource requirements - latency
+//  2    | Rn: 1 Rm: 4 Rs: 4     | uop T0, Rm, Rs - P01 - 3
+//       |                       | uopc Rd, Rn, T0 -  P01 - 1
+// This is telling us that the result will be available in destination register
+// Rd after a minimum of three cycles after the result in Rm and Rs is available
+// and one cycle after the result in Rn is available. The micro-ops can execute
+// on resource P01.
+// To model this, we need to express that we need to dispatch two micro-ops,
+// that the resource P01 is needed and that the latency to Rn is different than
+// the latency to Rm and Rs. The scheduler can decrease Rn's producer latency by
+// two.
+// We will do this by assigning (abstract) resources to register defs/uses.
+// ARMSchedule.td:
+//   def WriteALUsr : SchedWrite;
+//   def ReadAdvanceALUsr : ScheRead;
+//
+// ARMInstrInfo.td:
+//   def ADDrs : I<>, Sched<[WriteALUsr, ReadAdvanceALUsr, ReadDefault,
+//                           ReadDefault]> { ...}
+// ReadAdvance read resources allow us to define "pipeline by-passes" or
+// shorter latencies to certain registers as needed in the example above.
+// The "ReadDefault" can be omitted.
+// Next, the subtarget td file assigns resources to the abstract resources
+// defined here.
+// ARMScheduleSubtarget.td:
+//  // Resources.
+//  def P01 : ProcResource<3>; // ALU unit (3 of it).
+//  ...
+//  // Resource usages.
+//  def : WriteRes<WriteALUsr, [P01, P01]> {
+//    Latency = 4; // Latency of 4.
+//    NumMicroOps = 2; // Dispatch 2 micro-ops.
+//    // The two instances of resource P01 are occupied for one cycle. It is one
+//    // cycle because these resources happen to be pipelined.
+//    ResourceCycles = [1, 1];
+//  }
+//  def : ReadAdvance<ReadAdvanceALUsr, 3>;
+
+// Basic ALU operation.
+def WriteALU : SchedWrite;
+def ReadALU : SchedRead;
+
+// Basic ALU with shifts.
+def WriteALUsi : SchedWrite; // Shift by immediate.
+def WriteALUsr : SchedWrite; // Shift by register.
+def WriteALUSsr : SchedWrite; // Shift by register (flag setting).
+def ReadALUsr : SchedRead; // Some operands are read later.
+
+// Compares.
+def WriteCMP : SchedWrite;
+def WriteCMPsi : SchedWrite;
+def WriteCMPsr : SchedWrite;
+
+// Division.
+def WriteDiv : SchedWrite;
+
+// Loads.
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+
+// Branches.
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def WriteBrTbl : SchedWrite;
+
+// Fixpoint conversions.
+def WriteCvtFP : SchedWrite;
+
+// Noop.
+def WriteNoop : SchedWrite;
+
+// Define TII for use in SchedVariant Predicates.
+def : PredicateProlog<[{
+  const ARMBaseInstrInfo *TII =
+    static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(MI)}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for ARM
+//
+def IIC_iALUx      : InstrItinClass;
+def IIC_iALUi      : InstrItinClass;
+def IIC_iALUr      : InstrItinClass;
+def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsir    : InstrItinClass;
+def IIC_iALUsr     : InstrItinClass;
+def IIC_iBITi      : InstrItinClass;
+def IIC_iBITr      : InstrItinClass;
+def IIC_iBITsi     : InstrItinClass;
+def IIC_iBITsr     : InstrItinClass;
+def IIC_iUNAr      : InstrItinClass;
+def IIC_iUNAsi     : InstrItinClass;
+def IIC_iEXTr      : InstrItinClass;
+def IIC_iEXTAr     : InstrItinClass;
+def IIC_iEXTAsr    : InstrItinClass;
+def IIC_iCMPi      : InstrItinClass;
+def IIC_iCMPr      : InstrItinClass;
+def IIC_iCMPsi     : InstrItinClass;
+def IIC_iCMPsr     : InstrItinClass;
+def IIC_iTSTi      : InstrItinClass;
+def IIC_iTSTr      : InstrItinClass;
+def IIC_iTSTsi     : InstrItinClass;
+def IIC_iTSTsr     : InstrItinClass;
+def IIC_iMOVi      : InstrItinClass;
+def IIC_iMOVr      : InstrItinClass;
+def IIC_iMOVsi     : InstrItinClass;
+def IIC_iMOVsr     : InstrItinClass;
+def IIC_iMOVix2    : InstrItinClass;
+def IIC_iMOVix2addpc : InstrItinClass;
+def IIC_iMOVix2ld  : InstrItinClass;
+def IIC_iMVNi      : InstrItinClass;
+def IIC_iMVNr      : InstrItinClass;
+def IIC_iMVNsi     : InstrItinClass;
+def IIC_iMVNsr     : InstrItinClass;
+def IIC_iCMOVi     : InstrItinClass;
+def IIC_iCMOVr     : InstrItinClass;
+def IIC_iCMOVsi    : InstrItinClass;
+def IIC_iCMOVsr    : InstrItinClass;
+def IIC_iCMOVix2   : InstrItinClass;
+def IIC_iMUL16     : InstrItinClass;
+def IIC_iMAC16     : InstrItinClass;
+def IIC_iMUL32     : InstrItinClass;
+def IIC_iMAC32     : InstrItinClass;
+def IIC_iMUL64     : InstrItinClass;
+def IIC_iMAC64     : InstrItinClass;
+def IIC_iDIV     : InstrItinClass;
+def IIC_iLoad_i    : InstrItinClass;
+def IIC_iLoad_r    : InstrItinClass;
+def IIC_iLoad_si   : InstrItinClass;
+def IIC_iLoad_iu   : InstrItinClass;
+def IIC_iLoad_ru   : InstrItinClass;
+def IIC_iLoad_siu  : InstrItinClass;
+def IIC_iLoad_bh_i   : InstrItinClass;
+def IIC_iLoad_bh_r   : InstrItinClass;
+def IIC_iLoad_bh_si  : InstrItinClass;
+def IIC_iLoad_bh_iu  : InstrItinClass;
+def IIC_iLoad_bh_ru  : InstrItinClass;
+def IIC_iLoad_bh_siu : InstrItinClass;
+def IIC_iLoad_d_i  : InstrItinClass;
+def IIC_iLoad_d_r  : InstrItinClass;
+def IIC_iLoad_d_ru : InstrItinClass;
+def IIC_iLoad_m    : InstrItinClass;
+def IIC_iLoad_mu   : InstrItinClass;
+def IIC_iLoad_mBr  : InstrItinClass;
+def IIC_iPop       : InstrItinClass;
+def IIC_iPop_Br    : InstrItinClass;
+def IIC_iLoadiALU  : InstrItinClass;
+def IIC_iStore_i   : InstrItinClass;
+def IIC_iStore_r   : InstrItinClass;
+def IIC_iStore_si  : InstrItinClass;
+def IIC_iStore_iu  : InstrItinClass;
+def IIC_iStore_ru  : InstrItinClass;
+def IIC_iStore_siu : InstrItinClass;
+def IIC_iStore_bh_i   : InstrItinClass;
+def IIC_iStore_bh_r   : InstrItinClass;
+def IIC_iStore_bh_si  : InstrItinClass;
+def IIC_iStore_bh_iu  : InstrItinClass;
+def IIC_iStore_bh_ru  : InstrItinClass;
+def IIC_iStore_bh_siu : InstrItinClass;
+def IIC_iStore_d_i   : InstrItinClass;
+def IIC_iStore_d_r   : InstrItinClass;
+def IIC_iStore_d_ru  : InstrItinClass;
+def IIC_iStore_m   : InstrItinClass;
+def IIC_iStore_mu  : InstrItinClass;
+def IIC_Preload    : InstrItinClass;
+def IIC_Br         : InstrItinClass;
+def IIC_fpSTAT     : InstrItinClass;
+def IIC_fpUNA32    : InstrItinClass;
+def IIC_fpUNA64    : InstrItinClass;
+def IIC_fpCMP32    : InstrItinClass;
+def IIC_fpCMP64    : InstrItinClass;
+def IIC_fpCVTSD    : InstrItinClass;
+def IIC_fpCVTDS    : InstrItinClass;
+def IIC_fpCVTSH    : InstrItinClass;
+def IIC_fpCVTHS    : InstrItinClass;
+def IIC_fpCVTIS    : InstrItinClass;
+def IIC_fpCVTID    : InstrItinClass;
+def IIC_fpCVTSI    : InstrItinClass;
+def IIC_fpCVTDI    : InstrItinClass;
+def IIC_fpMOVIS    : InstrItinClass;
+def IIC_fpMOVID    : InstrItinClass;
+def IIC_fpMOVSI    : InstrItinClass;
+def IIC_fpMOVDI    : InstrItinClass;
+def IIC_fpALU32    : InstrItinClass;
+def IIC_fpALU64    : InstrItinClass;
+def IIC_fpMUL32    : InstrItinClass;
+def IIC_fpMUL64    : InstrItinClass;
+def IIC_fpMAC32    : InstrItinClass;
+def IIC_fpMAC64    : InstrItinClass;
+def IIC_fpFMAC32   : InstrItinClass;
+def IIC_fpFMAC64   : InstrItinClass;
+def IIC_fpDIV32    : InstrItinClass;
+def IIC_fpDIV64    : InstrItinClass;
+def IIC_fpSQRT32   : InstrItinClass;
+def IIC_fpSQRT64   : InstrItinClass;
+def IIC_fpLoad32   : InstrItinClass;
+def IIC_fpLoad64   : InstrItinClass;
+def IIC_fpLoad_m   : InstrItinClass;
+def IIC_fpLoad_mu  : InstrItinClass;
+def IIC_fpStore32  : InstrItinClass;
+def IIC_fpStore64  : InstrItinClass;
+def IIC_fpStore_m  : InstrItinClass;
+def IIC_fpStore_mu : InstrItinClass;
+def IIC_VLD1       : InstrItinClass;
+def IIC_VLD1x2     : InstrItinClass;
+def IIC_VLD1x3     : InstrItinClass;
+def IIC_VLD1x4     : InstrItinClass;
+def IIC_VLD1u      : InstrItinClass;
+def IIC_VLD1x2u    : InstrItinClass;
+def IIC_VLD1x3u    : InstrItinClass;
+def IIC_VLD1x4u    : InstrItinClass;
+def IIC_VLD1ln     : InstrItinClass;
+def IIC_VLD1lnu    : InstrItinClass;
+def IIC_VLD1dup    : InstrItinClass;
+def IIC_VLD1dupu   : InstrItinClass;
+def IIC_VLD2       : InstrItinClass;
+def IIC_VLD2x2     : InstrItinClass;
+def IIC_VLD2u      : InstrItinClass;
+def IIC_VLD2x2u    : InstrItinClass;
+def IIC_VLD2ln     : InstrItinClass;
+def IIC_VLD2lnu    : InstrItinClass;
+def IIC_VLD2dup    : InstrItinClass;
+def IIC_VLD2dupu   : InstrItinClass;
+def IIC_VLD3       : InstrItinClass;
+def IIC_VLD3ln     : InstrItinClass;
+def IIC_VLD3u      : InstrItinClass;
+def IIC_VLD3lnu    : InstrItinClass;
+def IIC_VLD3dup    : InstrItinClass;
+def IIC_VLD3dupu   : InstrItinClass;
+def IIC_VLD4       : InstrItinClass;
+def IIC_VLD4ln     : InstrItinClass;
+def IIC_VLD4u      : InstrItinClass;
+def IIC_VLD4lnu    : InstrItinClass;
+def IIC_VLD4dup    : InstrItinClass;
+def IIC_VLD4dupu   : InstrItinClass;
+def IIC_VST1       : InstrItinClass;
+def IIC_VST1x2     : InstrItinClass;
+def IIC_VST1x3     : InstrItinClass;
+def IIC_VST1x4     : InstrItinClass;
+def IIC_VST1u      : InstrItinClass;
+def IIC_VST1x2u    : InstrItinClass;
+def IIC_VST1x3u    : InstrItinClass;
+def IIC_VST1x4u    : InstrItinClass;
+def IIC_VST1ln     : InstrItinClass;
+def IIC_VST1lnu    : InstrItinClass;
+def IIC_VST2       : InstrItinClass;
+def IIC_VST2x2     : InstrItinClass;
+def IIC_VST2u      : InstrItinClass;
+def IIC_VST2x2u    : InstrItinClass;
+def IIC_VST2ln     : InstrItinClass;
+def IIC_VST2lnu    : InstrItinClass;
+def IIC_VST3       : InstrItinClass;
+def IIC_VST3u      : InstrItinClass;
+def IIC_VST3ln     : InstrItinClass;
+def IIC_VST3lnu    : InstrItinClass;
+def IIC_VST4       : InstrItinClass;
+def IIC_VST4u      : InstrItinClass;
+def IIC_VST4ln     : InstrItinClass;
+def IIC_VST4lnu    : InstrItinClass;
+def IIC_VUNAD      : InstrItinClass;
+def IIC_VUNAQ      : InstrItinClass;
+def IIC_VBIND      : InstrItinClass;
+def IIC_VBINQ      : InstrItinClass;
+def IIC_VPBIND     : InstrItinClass;
+def IIC_VFMULD     : InstrItinClass;
+def IIC_VFMULQ     : InstrItinClass;
+def IIC_VMOV       : InstrItinClass;
+def IIC_VMOVImm    : InstrItinClass;
+def IIC_VMOVD      : InstrItinClass;
+def IIC_VMOVQ      : InstrItinClass;
+def IIC_VMOVIS     : InstrItinClass;
+def IIC_VMOVID     : InstrItinClass;
+def IIC_VMOVISL    : InstrItinClass;
+def IIC_VMOVSI     : InstrItinClass;
+def IIC_VMOVDI     : InstrItinClass;
+def IIC_VMOVN      : InstrItinClass;
+def IIC_VPERMD     : InstrItinClass;
+def IIC_VPERMQ     : InstrItinClass;
+def IIC_VPERMQ3    : InstrItinClass;
+def IIC_VMACD      : InstrItinClass;
+def IIC_VMACQ      : InstrItinClass;
+def IIC_VFMACD     : InstrItinClass;
+def IIC_VFMACQ     : InstrItinClass;
+def IIC_VRECSD     : InstrItinClass;
+def IIC_VRECSQ     : InstrItinClass;
+def IIC_VCNTiD     : InstrItinClass;
+def IIC_VCNTiQ     : InstrItinClass;
+def IIC_VUNAiD     : InstrItinClass;
+def IIC_VUNAiQ     : InstrItinClass;
+def IIC_VQUNAiD    : InstrItinClass;
+def IIC_VQUNAiQ    : InstrItinClass;
+def IIC_VBINiD     : InstrItinClass;
+def IIC_VBINiQ     : InstrItinClass;
+def IIC_VSUBiD     : InstrItinClass;
+def IIC_VSUBiQ     : InstrItinClass;
+def IIC_VBINi4D    : InstrItinClass;
+def IIC_VBINi4Q    : InstrItinClass;
+def IIC_VSUBi4D    : InstrItinClass;
+def IIC_VSUBi4Q    : InstrItinClass;
+def IIC_VABAD      : InstrItinClass;
+def IIC_VABAQ      : InstrItinClass;
+def IIC_VSHLiD     : InstrItinClass;
+def IIC_VSHLiQ     : InstrItinClass;
+def IIC_VSHLi4D    : InstrItinClass;
+def IIC_VSHLi4Q    : InstrItinClass;
+def IIC_VPALiD     : InstrItinClass;
+def IIC_VPALiQ     : InstrItinClass;
+def IIC_VMULi16D   : InstrItinClass;
+def IIC_VMULi32D   : InstrItinClass;
+def IIC_VMULi16Q   : InstrItinClass;
+def IIC_VMULi32Q   : InstrItinClass;
+def IIC_VMACi16D   : InstrItinClass;
+def IIC_VMACi32D   : InstrItinClass;
+def IIC_VMACi16Q   : InstrItinClass;
+def IIC_VMACi32Q   : InstrItinClass;
+def IIC_VEXTD      : InstrItinClass;
+def IIC_VEXTQ      : InstrItinClass;
+def IIC_VTB1       : InstrItinClass;
+def IIC_VTB2       : InstrItinClass;
+def IIC_VTB3       : InstrItinClass;
+def IIC_VTB4       : InstrItinClass;
+def IIC_VTBX1      : InstrItinClass;
+def IIC_VTBX2      : InstrItinClass;
+def IIC_VTBX3      : InstrItinClass;
+def IIC_VTBX4      : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "ARMScheduleV6.td"
+include "ARMScheduleA8.td"
+include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
new file mode 100644
index 0000000..2c63825
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
@@ -0,0 +1,1075 @@
+//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A8 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+// Functional Units.
+def A8_Pipe0   : FuncUnit; // pipeline 0
+def A8_Pipe1   : FuncUnit; // pipeline 1
+def A8_LSPipe  : FuncUnit; // Load / store pipeline
+def A8_NPipe   : FuncUnit; // NEON ALU/MUL pipe
+def A8_NLSPipe : FuncUnit; // NEON LS pipe
+//
+// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<
+  [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe],
+  [], [
+  // Two fully-pipelined integer ALU pipelines
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
+  InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                             InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LSPipe]>], [5]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                              InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+
+  // Integer multiply pipeline
+  // Result written in E5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_d_i,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  // FIXME: lsl by 2 takes 1 cycle.
+  InstrItinData<IIC_iLoad_si  , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                              [1, 2, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Push, def is the 3th operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>,
+
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                   InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Store multiple. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                              InstrStage<1, [A8_NLSPipe]>], [20]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NPipe], 0>,
+                               InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NPipe], 0>,
+                               InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<7, [A8_NPipe], 0>,
+                               InstrStage<7, [A8_NLSPipe]>], [7, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NPipe], 0>,
+                               InstrStage<5, [A8_NLSPipe]>], [5, 1]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<8, [A8_NPipe], 0>,
+                               InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<8, [A8_NPipe], 0>,
+                               InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<9, [A8_NPipe], 0>,
+                               InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<11, [A8_NPipe], 0>,
+                               InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<20, [A8_NPipe], 0>,
+                               InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<29, [A8_NPipe], 0>,
+                               InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<29, [A8_NPipe], 0>,
+                               InstrStage<29, [A8_NLSPipe]>], [29, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 20, 1]>,
+
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 2], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 2], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  // Issue through integer pipeline, and execute in NEON unit.
+  //
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1]>,
+  //
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1]>,
+  //
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 2]>,
+  //
+  // Double-register FP Binary
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 1]>,
+
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 1]>,
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+  //
+  // Quad-register Permute Move
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [2, 1]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 2, 2]>,
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 2, 2]>,
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [5, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>,
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                            InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// This following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA8Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
new file mode 100644
index 0000000..9a1d222
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -0,0 +1,2529 @@
+//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A9 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+//
+// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
+// Reference Manual".
+//
+// Functional units
+def A9_Issue0  : FuncUnit; // Issue 0
+def A9_Issue1  : FuncUnit; // Issue 1
+def A9_Branch  : FuncUnit; // Branch
+def A9_ALU0    : FuncUnit; // ALU / MUL pipeline 0
+def A9_ALU1    : FuncUnit; // ALU pipeline 1
+def A9_AGU     : FuncUnit; // Address generation unit for ld / st
+def A9_NPipe   : FuncUnit; // NEON pipeline
+def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer
+def A9_LSUnit  : FuncUnit; // L/S Unit
+def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
+def A9_DRegsN  : FuncUnit; // FP register set, NEON side
+
+// Bypasses
+def A9_LdBypass : Bypass;
+
+def CortexA9Itineraries : ProcessorItineraries<
+  [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
+   A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
+  [A9_LdBypass], [
+  // Two fully-pipelined integer ALU pipelines
+
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_AGU], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [5]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                              [2, 1]>,
+  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [3, 1, 1]>,
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                            [3, 1, 1, 1],
+                            [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+
+  // CLZ, RBIT, etc.
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1], [A9_LdBypass]>,
+  InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                                [1, 1], [A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iMAC16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [3, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [4, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
+  InstrItinData<IIC_iMAC64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>],
+                              [4, 5, 1, 1]>,
+  // Integer load pipeline
+  // FIXME: The timings are some rough approximations
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1], [A9_LdBypass]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1], [A9_LdBypass]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit], 0>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [5, 1, 1], [A9_LdBypass]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1], [A9_LdBypass]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [5, 4, 1, 1], [A9_LdBypass]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [2, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 2, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass],
+                               -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass],
+                               -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<1, [A9_LSUnit]>,
+                                InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [2, 1]>,
+
+  // Integer store pipeline
+  ///
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<1, [A9_AGU], 0>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<2, [A9_AGU], 1>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Issue0], 0>,
+                                InstrStage<1, [A9_Issue1], 0>,
+                                InstrStage<1, [A9_Branch]>]>,
+
+  // VFP and NEON shares the same register file. This means that every VFP
+  // instruction should wait for full completion of the consecutive NEON
+  // instruction and vice-versa. We model this behavior with two artificial FUs:
+  // DRegsVFP and DRegsVFP.
+  //
+  // Every VFP instruction:
+  //  - Acquires DRegsVFP resource for 1 cycle
+  //  - Reserves DRegsN resource for the whole duration (including time to
+  //    register file writeback!).
+  // Every NEON instruction does the same but with FUs swapped.
+  //
+  // Since the reserved FU cannot be acquired, this models precisely
+  // "cross-domain" stalls.
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit.
+
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                              InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                              InstrStage<1, [A9_NPipe]>],
+                             [1]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 4 cycles
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 4 cycles
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+
+  //
+  // Single to Half FP Convert
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Half to Single FP Convert
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<6, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<7, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<16, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<10, [A9_NPipe]>],
+                              [15, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<26, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<20, [A9_NPipe]>],
+                              [25, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<18, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<13, [A9_NPipe]>],
+                              [17, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<33, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<28, [A9_NPipe]>],
+                              [32, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra 1 latency cycle since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra 1 latency cycle since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1]>,
+  //
+  // Double-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Load
+  // FIXME: Result latency is 1 if address is 64-bit aligned.
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [2, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                                InstrStage<1, [A9_NPipe], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [2, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 2, 1]>,
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
+
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 2]>,
+
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 3, 2, 1]>,
+
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1,1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Quad-register Permute Move
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 1]>,
+
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2]>,
+  //
+  // Double-register FP Binary
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 2]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 1]>,
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 10 cycles
+                               InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [9, 2, 2]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 11 cycles
+                               InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [10, 2, 2]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 4, 1, 1]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
+class A9WriteLMOpsListType<list<WriteSequence> writes> {
+  list <WriteSequence> Writes = writes;
+  SchedMachineModel SchedModel = ?;
+}
+
+// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
+def CortexA9Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 56; // Based on available renamed registers.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 8; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA9Itineraries;
+
+  // FIXME: Many vector operations were never given an itinerary. We
+  // haven't mapped these to the new model either.
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+//
+// The AGU unit has BufferSize=1 so that the latency between operations
+// that use it are considered to stall other operations.
+//
+// The FP unit has BufferSize=0 so that it is a hard dispatch
+// hazard. No instruction may be dispatched while the unit is reserved.
+
+let SchedModel = CortexA9Model in {
+
+def A9UnitALU : ProcResource<2>;
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
+def A9UnitAGU : ProcResource<1> { let BufferSize = 1; }
+def A9UnitLS  : ProcResource<1>;
+def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
+def A9UnitB   : ProcResource<1>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer shifted-by register
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate.
+def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; }
+// ALU with operand shifted by register.
+def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
+
+// Multiplication
+def A9WriteM   : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
+def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
+                                              let NumMicroOps = 0; }
+def A9WriteM16   : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
+def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
+                                                let NumMicroOps = 0; }
+
+// Floating-point
+// Only one FP or AGU instruction may issue per cycle. We model this
+// by having FP instructions consume the AGU resource.
+def A9WriteF      : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteFMov   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteFMulS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteFMulD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteFMAS   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+def A9WriteFMAD   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteFDivS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
+def A9WriteFDivD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
+def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
+def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
+
+// NEON has an odd mix of latencies. Simply name the write types by latency.
+def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
+def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
+def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
+def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+
+// Reserve A9UnitFP for 2 consecutive cycles.
+def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 7;
+  let ResourceCycles = [2];
+}
+def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
+// Branches don't have a def operand but still consume resources.
+def A9WriteB : SchedWriteRes<[A9UnitB]>;
+
+// Address generation.
+def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
+
+// Load Integer.
+def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+// Load the upper 32-bits using the same micro-op.
+def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
+                                     let NumMicroOps = 0; }
+// Offset shifted by register.
+def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+// Load (and zero extend) a byte.
+def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
+
+// Load or Store Float, aligned.
+def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
+
+// Store Integer.
+def A9WriteS : SchedWriteRes<[A9UnitLS]>;
+
+//===----------------------------------------------------------------------===//
+// Define resources dynamically for load multiple variants.
+
+// Define helpers for extra latency without consuming resources.
+def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+foreach NumCycles = 2-8 in {
+def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
+} // foreach NumCycles
+
+// Define address generation sequences and predicates for 8 flavors of LDMs.
+foreach NumAddr = 1-8 in {
+
+// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
+// latency for instructions that generate multiple loads or stores.
+def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
+
+// Define a predicate to select the LDM based on number of memory addresses.
+def A9LMAdr#NumAddr#Pred :
+  SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>;
+
+} // foreach NumAddr
+
+// Fall-back for unknown LDMs.
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">;
+
+// LDM/VLDM/VLDn address generation latency & resources.
+// Dynamically select the A9WriteAdrN sequence using a predicate.
+def A9WriteLMAdr : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
+  // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
+  SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
+
+// Define LDM Resources.
+// These take no issue resource, so they can be combined with other
+// writes like WriteB.
+// A9WriteLMLo takes a single LS resource and 2 cycles.
+def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
+                                              let NumMicroOps = 0; }
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
+                                      let NumMicroOps = 0; }
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+foreach NumAddr = 1-8 in {
+def A9WriteL#NumAddr : WriteSequence<
+  [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+def A9WriteL#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// LDM: Load multiple into 32-bit integer registers.
+
+def A9WriteLMOpsList : A9WriteLMOpsListType<
+                 [A9WriteL1, A9WriteL1Hi,
+                  A9WriteL2, A9WriteL2Hi,
+                  A9WriteL3, A9WriteL3Hi,
+                  A9WriteL4, A9WriteL4Hi,
+                  A9WriteL5, A9WriteL5Hi,
+                  A9WriteL6, A9WriteL6Hi,
+                  A9WriteL7, A9WriteL7Hi,
+                  A9WriteL8, A9WriteL8Hi]>;
+
+// A9WriteLM variants expand into a pair of writes for each 64-bit
+// value loaded. When the number of registers is odd, the last
+// A9WriteLnHi is naturally ignored because the instruction has no
+// following def operands.  These variants take no issue resource, so
+// they may need to be part of a WriteSequence that includes A9WriteIssue.
+def A9WriteLM : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources.
+  SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+                             A9WriteL2, A9WriteL2Hi,
+                             A9WriteL3Hi, A9WriteL3Hi,
+                             A9WriteL4Hi, A9WriteL4Hi,
+                             A9WriteL5Hi, A9WriteL5Hi,
+                             A9WriteL6Hi, A9WriteL6Hi,
+                             A9WriteL7Hi, A9WriteL7Hi,
+                             A9WriteL8Hi, A9WriteL8Hi]>]> {
+  let Variadic = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources
+// so can be used in WriteSequences for in single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+  let Latency = 1;
+  let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence<
+  [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence<
+  [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA are dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+  // For unknown VLDM/VSTM PreRA, assume 2xS registers.
+  SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+  [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
+// pair of writes for each 64-bit data loaded. When the number of
+// registers is odd, the last WriteLMfpnHi is naturally ignored because
+// the instruction has no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
+                 [A9WriteLMfp1, A9WriteLMfp2,       // 0-1
+                  A9WriteLMfp3, A9WriteLMfp4,       // 2-3
+                  A9WriteLMfp5, A9WriteLMfp6,       // 4-5
+                  A9WriteLMfp7, A9WriteLMfp8,       // 6-7
+                  A9WriteLMfp1Hi,                   // 8-8
+                  A9WriteLMfp2Hi, A9WriteLMfp2Hi,   // 9-10
+                  A9WriteLMfp3Hi, A9WriteLMfp3Hi,   // 11-12
+                  A9WriteLMfp4Hi, A9WriteLMfp4Hi,   // 13-14
+                  A9WriteLMfp5Hi, A9WriteLMfp5Hi,   // 15-16
+                  A9WriteLMfp6Hi, A9WriteLMfp6Hi,   // 17-18
+                  A9WriteLMfp7Hi, A9WriteLMfp7Hi,   // 19-20
+                  A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources. We are optimizing for the case
+  // where the operands are DPRs, and this determines the first eight
+  // types. The remaining eight types are filled to cover the case
+  // where the operands are SPRs.
+  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+                             A9WriteLMfp3Hi, A9WriteLMfp4Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp8Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+                             A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+                             A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+  let Variadic = 1;
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions.
+def A9PreRA : SchedPredicate<
+  "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate<
+  "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+  SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
+  SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non-LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>;
+def A9WriteI2ld  : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadALU : SchedReadAdvance<1,
+  [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+   A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+   A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+   A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+   A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues, decreases producer latency by N-1.
+def A9Read2 : SchedReadAdvance<1>;
+def A9Read3 : SchedReadAdvance<2>;
+def A9Read4 : SchedReadAdvance<3>;
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manuals,
+// mostly in order.
+
+def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+                         IIC_iMVNi,IIC_iMVNsi,
+                         IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
+
+def :ItinRW<[A9WriteI2],   [IIC_iMOVix2,IIC_iCMOVix2]>;
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
+def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>;
+def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>;
+def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteHi ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+                                     IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
+                            IIC_fpUNA32, IIC_fpUNA64,
+                            IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
+                         IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+                         IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes AGU regardless address writeback. But it's
+// latency is only relevant for users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+                                     IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
+                                       IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
+                                            IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+                                     IIC_iStore_iu, IIC_iStore_ru,
+                                     IIC_iStore_d_i, IIC_iStore_d_r,
+                                     IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+                                      IIC_iStore_bh_i, IIC_iStore_bh_r,
+                                      IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteML will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical, For stores only the address writeback
+// has a def operand so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+                                                      IIC_iStore_m,
+                                                      IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
+def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+                                        IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+                                         IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+                                         IIC_VLD1x4, IIC_VLD1x4u,
+                                         IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+                                            IIC_VLD2, IIC_VLD2u,
+                                            IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+                                            IIC_VLD2x2, IIC_VLD2x2u,
+                                            IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+                                            IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+                                            IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+                                         IIC_VST1x2, IIC_VST1x2u,
+                                         IIC_VST1ln, IIC_VST1lnu,
+                                         IIC_VST2, IIC_VST2u,
+                                         IIC_VST2x2, IIC_VST2x2u,
+                                         IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+                                         IIC_VST1x4, IIC_VST1x4u,
+                                         IIC_VST3, IIC_VST3u,
+                                         IIC_VST3ln, IIC_VST3lnu,
+                                         IIC_VST4, IIC_VST4u,
+                                         IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+
+// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
+// VQNEG/VQABS
+def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
+// VABS
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
+// VPADD/VPADDL are mapped later under IIC_SHLi.
+// ...
+// VCLSQ/VCLZQ/VCNTQ, takes two cycles.
+def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
+// VMOVimm/VMVNimm/VORRimm/VBICimm
+def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
+def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
+
+// NEON integer multiply
+//
+// Note: these don't quite match the timing docs, but they do match
+// the original A9 itinerary.
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
+def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
+def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
+def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
+def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
+
+// NEON integer shift
+// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
+def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
+def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
+
+// NEON permute
+def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
+            [IIC_VPERMQ3, IIC_VEXTQ]>;
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
+            [IIC_VTBX4]>;
+
+// NEON floating-point
+def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
+def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
+def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
+def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
+def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
+def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
+def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
+
+// Map SchedRWs that are identical for cortexa9 to existing resources.
+def : SchedAlias<WriteALU, A9WriteALU>;
+def : SchedAlias<WriteALUsr, A9WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
+def : SchedAlias<ReadALU, A9ReadALU>;
+def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : InstRW< [WriteALU],
+      (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
+                 "BICrr")>;
+def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
+def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+
+
+def : SchedAlias<WriteCMP, A9WriteALU>;
+def : SchedAlias<WriteCMPsi, A9WriteALU>;
+def : SchedAlias<WriteCMPsr, A9WriteALU>;
+
+def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
+                                       "MOVCCsr")>;
+def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
+                                      "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+def : InstRW< [WriteALU], (instregex "SEL")>;
+
+def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
+
+def : InstRW< [A9WriteM],
+      (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
+      "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
+      "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+      "SMLALTT")>;
+// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
+      "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
+
+def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
+def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
+def : InstRW<[A9WriteLb],
+      (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
+      "LDRH", "LDRSH", "LDRSB")>;
+def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
+
+def : WriteRes<WriteDiv, []> { let Latency = 0; }
+
+def : WriteRes<WriteBr, [A9UnitB]>;
+def : WriteRes<WriteBrL, [A9UnitB]>;
+def : WriteRes<WriteBrTbl, [A9UnitB]>;
+def : WriteRes<WritePreLd, []>;
+def : SchedAlias<WriteCvtFP, A9WriteF>;
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+} // SchedModel = CortexA9Model
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 0000000..b03d5ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,2076 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor..
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+def SW_DIS0 : FuncUnit;
+def SW_DIS1 : FuncUnit;
+def SW_DIS2 : FuncUnit;
+
+def SW_ALU0 : FuncUnit;
+def SW_ALU1 : FuncUnit;
+def SW_LS   : FuncUnit;
+def SW_IDIV : FuncUnit;
+def SW_FDIV : FuncUnit;
+
+// FIXME: Need bypasses.
+// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and
+//        IIC_iMOVix2ld better.
+// FIXME: Model the special immediate shifts that are not microcoded.
+// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it
+//        to issue on pipe 1?
+// FIXME: Model the pipelined behavior of CMP / TST instructions.
+// FIXME: Better model the microcode stages of multiply instructions, especially
+//        conditional variants.
+// FIXME: Add preload instruction when it is documented.
+// FIXME: Model non-pipelined nature of FP div / sqrt unit.
+
+def SwiftItineraries : ProcessorItineraries<
+  [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                 [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_LS]>],
+                              [5]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1]>,
+  InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1]>,
+  InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+  InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1]>,
+  InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1]>,
+  InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+
+  // CLZ, RBIT, etc.
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iCMPr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iTSTr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [3, 1, 1]>,
+  InstrItinData<IIC_iMAC16  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [3, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL32  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  InstrItinData<IIC_iMAC32  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL64  , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0], 1>,
+                               InstrStage<1, [SW_ALU0], 3>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [5, 5, 1, 1]>,
+  InstrItinData<IIC_iMAC64  , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0], 1>,
+                               InstrStage<1, [SW_ALU0], 1>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [5, 6, 1, 1]>,
+  //
+  // Integer divide
+  InstrItinData<IIC_iDIV  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0], 0>,
+                             InstrStage<14, [SW_IDIV]>],
+                            [14, 1, 1]>,
+
+  // Integer load pipeline
+  // FIXME: The timings are some rough approximations
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1]>,
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_LS], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 4, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS], 1>,
+                                 InstrStage<1, [SW_LS], 3>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [3, 4, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                 InstrStage<1, [SW_LS]>],
+                                [5, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                 InstrStage<1, [SW_LS]>],
+                                [5, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+                                 InstrStage<1, [SW_LS], 3>,
+                                 InstrStage<1, [SW_LS], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [3, 4, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                 InstrStage<1, [SW_LS], 3>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [5, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_DIS2], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                  InstrStage<1, [SW_LS], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [5, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+                                InstrStage<1, [SW_LS], 3>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                               [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                InstrStage<1, [SW_LS], 3>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                               [4, 1]>,
+
+  // Integer store pipeline
+  ///
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [SW_DIS0], 0>,
+                                    InstrStage<1, [SW_DIS1], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                    InstrStage<1, [SW_LS], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+                                   [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>,
+                                    InstrStage<1, [SW_DIS1], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                    InstrStage<1, [SW_LS], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+                                   [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [2], [], -1>, // dynamic uops
+
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [SW_DIS0], 0>], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br       , [InstrStage<1, [SW_DIS0], 0>]>,
+
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                              InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                             [1]>,
+  //
+  // Single-precision FP Unary
+  //
+  // Most floating-point moves get issued on ALU0.
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [1, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+
+  //
+  // Single to Half FP Convert
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_ALU1], 4>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1]>,
+  //
+  // Half to Single FP Convert
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [12, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [12, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<15, [SW_FDIV]>],
+                              [17, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<30, [SW_FDIV]>],
+                              [32, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<15, [SW_FDIV]>],
+                              [17, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<30, [SW_FDIV]>],
+                              [32, 1, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [6, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 3>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 4, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1, 1, 4], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2, 1, 1, 1, 4], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_LS], 4>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                               [2, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Count
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1]>,
+  //
+  // Quad-register Permute Move
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [6, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 3>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 4, 1]>,
+  //
+  // Integer to Lane Move
+  // FIXME: I think this is correct, but it is not clear from the tuning guide.
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [6, 1]>,
+
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1]>,
+  //
+  // Double-register FP Unary
+  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+  //        and they issue on a different pipeline.
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Quad-register FP Unary
+  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+  //        and they issue on a different pipeline.
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Double-register FP Binary
+  // FIXME: We're using this itin for many instructions.
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register FP Binary
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Quad-register FusedF P Multiple-Accumulate
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-register Permute
+  // FIXME: The latencies are unclear from the documentation.
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [3, 4, 3, 4]>,
+  //
+  // Quad-register Permute
+  // FIXME: The latencies are unclear from the documentation.
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [3, 4, 3, 4]>,
+  //
+  // Quad-register Permute (3 cycle issue on A9)
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [3, 4, 3, 4]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 3, 3]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1, 3, 5, 5]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 3, 5, 7, 7]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 3, 3]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1, 3, 5, 5]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 3, 5, 7, 7]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// This following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Swift machine model for scheduling and other instruction cost heuristics.
+def SwiftModel : SchedMachineModel {
+  let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 45; // Based on NEON renamed registers.
+  let LoadLatency = 3;
+  let MispredictPenalty = 14; // A branch direction mispredict.
+
+  let Itineraries = SwiftItineraries;
+}
+
+// Swift predicates.
+def IsFastImmShiftSwiftPred : SchedPredicate<[{TII->isSwiftFastImmShift(MI)}]>;
+
+// Swift resource mapping.
+let SchedModel = SwiftModel in {
+  // Processor resources.
+  def SwiftUnitP01 : ProcResource<2>; // ALU unit.
+  def SwiftUnitP0 : ProcResource<1> { let Super = SwiftUnitP01; } // Mul unit.
+  def SwiftUnitP1 : ProcResource<1> { let Super = SwiftUnitP01; } // Br unit.
+  def SwiftUnitP2 : ProcResource<1>; // LS unit.
+  def SwiftUnitDiv : ProcResource<1>;
+
+  // Generic resource requirements.
+  def SwiftWriteP0OneCycle : SchedWriteRes<[SwiftUnitP0]>;
+  def SwiftWriteP0TwoCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 2; }
+  def SwiftWriteP0FourCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 4; }
+  def SwiftWriteP0SixCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 6; }
+  def SwiftWriteP0P1FourCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+    let Latency = 4;
+  }
+  def SwiftWriteP0P1SixCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+    let Latency = 6;
+  }
+  def SwiftWriteP01OneCycle : SchedWriteRes<[SwiftUnitP01]>;
+  def SwiftWriteP1TwoCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 2; }
+  def SwiftWriteP1FourCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 4; }
+  def SwiftWriteP1SixCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 6; }
+  def SwiftWriteP1EightCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 8; }
+  def SwiftWriteP1TwelveCyc : SchedWriteRes<[SwiftUnitP1]> { let Latency = 12; }
+  def SwiftWriteP01OneCycle2x : WriteSequence<[SwiftWriteP01OneCycle], 2>;
+  def SwiftWriteP01OneCycle3x : WriteSequence<[SwiftWriteP01OneCycle], 3>;
+  def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; }
+  def SwiftWriteP01ThreeCycleTwoUops : SchedWriteRes<[SwiftUnitP01,
+                                                      SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP0ThreeCycleThreeUops : SchedWriteRes<[SwiftUnitP0]> {
+    let Latency = 3;
+    let NumMicroOps = 3;
+    let ResourceCycles = [3];
+  }
+  // Plain load without writeback.
+  def SwiftWriteP2ThreeCycle : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 3;
+  }
+  def SwiftWriteP2FourCycle : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 4;
+  }
+  // A store does not write to a register.
+  def SwiftWriteP2 : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 0;
+  }
+  foreach Num = 1-4 in {
+    def SwiftWrite#Num#xP2 : WriteSequence<[SwiftWriteP2], Num>;
+  }
+  def SwiftWriteP01OneCycle2x_load : WriteSequence<[SwiftWriteP01OneCycle,
+                                                    SwiftWriteP01OneCycle,
+                                                    SwiftWriteP2ThreeCycle]>;
+  // 4.2.4 Arithmetic and Logical.
+  // ALU operation register shifted by immediate variant.
+  def SwiftWriteALUsi : SchedWriteVariant<[
+    // lsl #2, lsl #1, or lsr #1.
+    SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01TwoCycle]>,
+    SchedVar<NoSchedPred,             [WriteALU]>
+  ]>;
+  def SwiftWriteALUsr : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [SwiftWriteP01ThreeCycleTwoUops]>,
+    SchedVar<NoSchedPred,      [SwiftWriteP01TwoCycle]>
+  ]>;
+  def SwiftWriteALUSsr : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [SwiftWriteP0ThreeCycleThreeUops]>,
+    SchedVar<NoSchedPred,      [SwiftWriteP01TwoCycle]>
+  ]>;
+  def SwiftReadAdvanceALUsr : SchedReadVariant<[
+    SchedVar<IsPredicatedPred, [SchedReadAdvance<2>]>,
+    SchedVar<NoSchedPred,      [NoReadAdvance]>
+  ]>;
+  // ADC,ADD,NEG,RSB,RSC,SBC,SUB,ADR
+  // AND,BIC,EOR,ORN,ORR
+  // CLZ,RBIT,REV,REV16,REVSH,PKH
+  def : WriteRes<WriteALU, [SwiftUnitP01]>;
+  def : SchedAlias<WriteALUsi, SwiftWriteALUsi>;
+  def : SchedAlias<WriteALUsr, SwiftWriteALUsr>;
+  def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
+  def : ReadAdvance<ReadALU, 0>;
+  def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+
+
+  def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
+    SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01OneCycle]>,
+    SchedVar<NoSchedPred,             [SwiftWriteP01TwoCycle]>
+  ]>;
+
+  // 4.2.5 Integer comparison
+  def : WriteRes<WriteCMP, [SwiftUnitP01]>;
+  def : SchedAlias<WriteCMPsi, SwiftChooseShiftKindP01OneOrTwoCycle>;
+  def : SchedAlias<WriteCMPsr, SwiftWriteP01TwoCycle>;
+
+  // 4.2.6 Shift, Move
+  // Shift
+  //  ASR,LSL,ROR,RRX
+  //  MOV(register-shiftedregister)  MVN(register-shiftedregister)
+  // Move
+  //  MOV,MVN
+  //  MOVT
+  // Sign/Zero extension
+  def : InstRW<[SwiftWriteP01OneCycle],
+               (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+                          "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH",
+                          "t2UXTB16")>;
+  // Pseudo instructions.
+  def : InstRW<[SwiftWriteP01OneCycle2x],
+        (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
+                   "t2MOVi32imm", "t2MOV_ga_dyn")>;
+  def : InstRW<[SwiftWriteP01OneCycle3x],
+        (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
+  def : InstRW<[SwiftWriteP01OneCycle2x_load],
+        (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+  def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+
+  def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+    SchedVar<NoSchedPred,     [ SwiftWriteP0OneCycle ]>
+  ]>;
+
+  // 4.2.7 Select
+  // SEL
+  def : InstRW<[SwiftPredP0OneOrTwoCycle], (instregex "SEL", "t2SEL")>;
+
+  // 4.2.8 Bitfield
+  // BFI,BFC, SBFX,UBFX
+  def : InstRW< [SwiftWriteP01TwoCycle],
+        (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+        "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+  // 4.2.9 Saturating arithmetic
+  def : InstRW< [SwiftWriteP01TwoCycle],
+        (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+        "USAT16", "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+        "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+        "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+        "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+        "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>;
+
+  // 4.2.10 Parallel Arithmetic
+  // Not flag setting.
+  def : InstRW< [SwiftWriteALUsr],
+        (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+        "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+        "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+        "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+  // Flag setting.
+  def : InstRW< [SwiftWriteP01TwoCycle],
+       (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+       "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+       "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+       "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+       "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+       "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+  // 4.2.11 Sum of Absolute Difference
+  def : InstRW< [SwiftWriteP0P1FourCycle], (instregex "USAD8") >;
+  def : InstRW<[SwiftWriteP0P1FourCycle, ReadALU, ReadALU, SchedReadAdvance<2>],
+        (instregex "USADA8")>;
+
+  // 4.2.12 Integer Multiply (32-bit result)
+  // Two sources.
+  def : InstRW< [SwiftWriteP0FourCycle],
+        (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+        "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+        "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+        "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+  def SwiftWriteP0P01FiveCycleTwoUops :
+      SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]>  {
+    let Latency = 5;
+  }
+
+  def SwiftPredP0P01FourFiveCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0P01FiveCycleTwoUops ]>,
+    SchedVar<NoSchedPred,      [ SwiftWriteP0FourCycle ]>
+  ]>;
+
+  def SwiftReadAdvanceFourCyclesPred : SchedReadVariant<[
+     SchedVar<IsPredicatedPred, [SchedReadAdvance<4>]>,
+     SchedVar<NoSchedPred,      [ReadALU]>
+  ]>;
+
+  // Multiply accumulate, three sources
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+        (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+        "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+        "t2SMMLSR")>;
+
+  // 4.2.13 Integer Multiply (32-bit result, Q flag)
+  def : InstRW< [SwiftWriteP0FourCycle],
+        (instregex "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+        (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+        "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+        "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle],
+        (instregex "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX")>;
+
+  def SwiftP0P0P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ResourceCycles = [2, 1];
+  }
+  def SwiftWrite1Cycle : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite5Cycle : SchedWriteRes<[]> {
+    let Latency = 5;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite6Cycle : SchedWriteRes<[]> {
+    let Latency = 6;
+    let NumMicroOps = 0;
+  }
+
+  // 4.2.14 Integer Multiply, Long
+  def : InstRW< [SwiftP0P0P01FiveCycle, SwiftWrite5Cycle],
+        (instregex "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$")>;
+
+  def Swift2P03P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 7;
+    let NumMicroOps = 5;
+    let ResourceCycles = [2, 3];
+  }
+
+  // 4.2.15 Integer Multiply Accumulate, Long
+  // 4.2.16 Integer Multiply Accumulate, Dual
+  // 4.2.17 Integer Multiply Accumulate Accumulate, Long
+  // We are being a bit inaccurate here.
+  def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
+                 SchedReadAdvance<4>, SchedReadAdvance<3>],
+        (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+        "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+        "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB", "t2SMLALBT",
+        "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
+        "t2UMAAL")>;
+
+  def SwiftDiv : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 14;
+    let ResourceCycles = [1, 14];
+  }
+  // 4.2.18 Integer Divide
+  def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+  def : InstRW <[SwiftDiv],
+        (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+  // 4.2.19 Integer Load Single Element
+  // 4.2.20 Integer Load Signextended
+  def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 4;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01,
+                                                   SwiftUnitP01]> {
+    let Latency = 4;
+    let NumMicroOps = 3;
+  }
+  def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2,
+                                                   SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 3;
+  }
+  def SwiftWrBackOne : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWriteLdFour : SchedWriteRes<[]> {
+    let Latency = 4;
+    let NumMicroOps = 0;
+  }
+   // Not accurate.
+  def : InstRW<[SwiftWriteP2ThreeCycle],
+        (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+        "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "tLDR[BH](r|i|spi|pci|pciASM)",
+        "tLDR(r|i|spi|pci|pciASM)")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle],
+        (instregex "LDRH$",  "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
+  def : InstRW<[SwiftWriteP2P01FourCyle],
+        (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+        "t2LDRpci_pic", "tLDRS(B|H)")>;
+  def : InstRW<[SwiftWriteP2P01ThreeCycle,  SwiftWrBackOne],
+        (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+        "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+        "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
+  def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
+        (instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+        "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+
+  // 4.2.21 Integer Dual Load
+  // Not accurate.
+  def : InstRW<[SwiftWriteP2P2ThreeCycle, SwiftWriteLdFour],
+        (instregex "t2LDRDi8", "LDRD$")>;
+  def : InstRW<[SwiftWriteP2P2P01ThreeCycle, SwiftWriteLdFour, SwiftWrBackOne],
+        (instregex "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+  // 4.2.22 Integer Load, Multiple
+  // NumReg = 1 .. 16
+  foreach Lat = 3-25 in {
+    def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = Lat;
+    }
+    def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> {
+      let Latency = Lat;
+      let NumMicroOps = 0;
+    }
+  }
+  // Predicate.
+  foreach NumAddr = 1-16 in {
+    def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+  }
+  def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
+  def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
+  def SwiftWriteLM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18Cy]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5CyNo, SwiftWriteLM6CyNo,
+                                SwiftWriteLM7CyNo, SwiftWriteLM8CyNo,
+                                SwiftWriteLM9CyNo, SwiftWriteLM10CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM12CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM15CyNo, SwiftWriteLM16CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo]>
+
+  ]> { let Variadic=1; }
+
+  def : InstRW<[SwiftWriteLM, SwiftWriteLDMAddrNoWB],
+        (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+        "(t|sys)LDM(IA|DA|DB|IB)$")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM],
+        (instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
+        "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
+        (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+  // 4.2.23 Integer Store, Single Element
+  def : InstRW<[SwiftWriteP2],
+        (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
+        "t2STR(i12|i8|s)$", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2],
+        (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+        "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+        "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+        "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+  // 4.2.24 Integer Store, Dual
+  def : InstRW<[SwiftWriteP2, SwiftWriteP2, SwiftWriteP01OneCycle],
+        (instregex "STRD$", "t2STRDi8")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2, SwiftWriteP2,
+                SwiftWriteP01OneCycle],
+        (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+  // 4.2.25 Integer Store, Multiple
+  def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 0;
+    let NumMicroOps = 2;
+  }
+  foreach NumAddr = 1-16 in {
+     def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
+  }
+  def SwiftWriteSTM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM2]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM5]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM7]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM9]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteSTM10]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteSTM11]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteSTM12]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteSTM13]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteSTM14]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteSTM15]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteSTM16]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteSTM2]>
+  ]>;
+  def : InstRW<[SwiftWriteSTM],
+        (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
+        (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+        "PUSH", "tPUSH")>;
+
+  // 4.2.26 Branch
+  def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; }
+  def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; }
+  def : WriteRes<WriteBrTbl, [SwiftUnitP1, SwiftUnitP2]> { let Latency = 0; }
+
+  // 4.2.27 Not issued
+  def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+  def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+
+  // 4.2.28 Advanced SIMD, Integer, 2 cycle
+  def : InstRW<[SwiftWriteP0TwoCycle],
+        (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
+                   "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
+                   "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
+                   "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+                   "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
+
+  def : InstRW<[SwiftWriteP1TwoCycle],
+        (instregex "VEXT", "VREV16", "VREV32", "VREV64")>;
+
+  // 4.2.29 Advanced SIMD, Integer, 4 cycle
+  // 4.2.30 Advanced SIMD, Integer with Accumulate
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
+        "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+        "VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
+        "VQSUB")>;
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VRECPE", "VRSQRTE")>;
+
+  // 4.2.31 Advanced SIMD, Add and Shift with Narrow
+  def : InstRW<[SwiftWriteP0P1FourCycle],
+        (instregex "VADDHN", "VSUBHN", "VSHRN")>;
+  def : InstRW<[SwiftWriteP0P1SixCycle],
+        (instregex "VRADDHN", "VRSUBHN", "VRSHRN", "VQSHRN", "VQSHRUN",
+                   "VQRSHRN", "VQRSHRUN")>;
+
+  // 4.2.32 Advanced SIMD, Vector Table Lookup
+  foreach Num = 1-4 in {
+    def SwiftWrite#Num#xP1TwoCycle : WriteSequence<[SwiftWriteP1TwoCycle], Num>;
+  }
+  def : InstRW<[SwiftWrite1xP1TwoCycle],
+        (instregex "VTB(L|X)1")>;
+  def : InstRW<[SwiftWrite2xP1TwoCycle],
+        (instregex "VTB(L|X)2")>;
+  def : InstRW<[SwiftWrite3xP1TwoCycle],
+        (instregex "VTB(L|X)3")>;
+  def : InstRW<[SwiftWrite4xP1TwoCycle],
+        (instregex "VTB(L|X)4")>;
+
+  // 4.2.33 Advanced SIMD, Transpose
+  def : InstRW<[SwiftWriteP1FourCycle, SwiftWriteP1FourCycle,
+                SwiftWriteP1TwoCycle/*RsrcOnly*/, SchedReadAdvance<2>],
+        (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+  // 4.2.34 Advanced SIMD and VFP, Floating Point
+  def : InstRW<[SwiftWriteP0TwoCycle], (instregex "VABS(S|D)$", "VNEG(S|D)$")>;
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VCMP(D|S|ZD|ZS)$", "VCMPE(D|S|ZD|ZS)")>;
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VADD(S|f)", "VSUB(S|f)", "VABD", "VPADDf", "VMAX", "VMIN", "VPMAX",
+                   "VPMIN")>;
+  def : InstRW<[SwiftWriteP0SixCycle], (instregex "VADDD$", "VSUBD$")>;
+  def : InstRW<[SwiftWriteP1EightCycle], (instregex "VRECPS", "VRSQRTS")>;
+
+  // 4.2.35 Advanced SIMD and VFP, Multiply
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
+                   "VMULL", "VQDMULL")>;
+  def : InstRW<[SwiftWriteP1SixCycle],
+        (instregex "VMULD", "VNMULD")>;
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
+        "VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
+  def : InstRW<[SwiftWriteP1EightCycle], (instregex "VFMAfd", "VFMSfd")>;
+  def : InstRW<[SwiftWriteP1TwelveCyc], (instregex "VFMAfq", "VFMSfq")>;
+
+  // 4.2.36 Advanced SIMD and VFP, Convert
+  def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
+  // Fixpoint conversions.
+  def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
+
+  // 4.2.37 Advanced SIMD and VFP, Move
+  def : InstRW<[SwiftWriteP0TwoCycle],
+        (instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
+                   "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+                   "FCONST(D|S)")>;
+  def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
+        (instregex "VQMOVN")>;
+  def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
+        (instregex "VDUP(8|16|32)")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP0TwoCycle]>],
+        (instregex "VMOVSR$", "VSETLN")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2FourCycle],
+        (instregex "VMOVRR(D|S)$")>;
+  def : InstRW<[SwiftWriteP2FourCycle], (instregex "VMOVDRR$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>,
+                WriteSequence<[SwiftWrite1Cycle, SwiftWriteP2FourCycle,
+                               SwiftWriteP1TwoCycle]>],
+                (instregex "VMOVSRR$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle]>],
+        (instregex "VGETLN(u|i)")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle,
+                               SwiftWriteP01OneCycle]>],
+        (instregex "VGETLNs")>;
+
+  // 4.2.38 Advanced SIMD and VFP, Move FPSCR
+  // Serializing instructions.
+  def SwiftWaitP0For15Cy : SchedWriteRes<[SwiftUnitP0]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def SwiftWaitP1For15Cy : SchedWriteRes<[SwiftUnitP1]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def SwiftWaitP2For15Cy : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+        (instregex "VMRS")>;
+  def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+        (instregex "VMSR")>;
+  // Not serializing.
+  def : InstRW<[SwiftWriteP0TwoCycle], (instregex "FMSTAT")>;
+
+  // 4.2.39 Advanced SIMD and VFP, Load Single Element
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDRD$", "VLDRS$")>;
+
+  // 4.2.40 Advanced SIMD and VFP, Store Single Element
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VSTRD$", "VSTRS$")>;
+
+  // 4.2.41 Advanced SIMD and VFP, Load Multiple
+  // 4.2.42 Advanced SIMD and VFP, Store Multiple
+
+  // Resource requirement for permuting, just reserves the resources.
+  foreach Num = 1-28 in {
+    def SwiftVLDMPerm#Num : SchedWriteRes<[SwiftUnitP1]> {
+      let Latency = 0;
+      let NumMicroOps = Num;
+      let ResourceCycles = [Num];
+    }
+  }
+
+  // Pre RA pseudos - load/store to a Q register as a D register pair.
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDMQIA$", "VSTMQIA$")>;
+
+  // Post RA not modelled accurately. We assume that register use of width 64
+  // bit maps to a D register, 128 maps to a Q register. Not all different kinds
+  // are accurately represented.
+  def SwiftWriteVLDM : SchedWriteVariant<[
+    // Load of one S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteLM4Cy]>,
+    // Load of one D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo]>,
+    // Load of 3 S register.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm3]>,
+    // Load of a Q register (not necessarily true). We should not be mapping to
+    // 4 S registers, either.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
+                                SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
+    // Load of 5 S registers.
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo,  SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm5]>,
+    // Load of 3 D registers. (Must also be able to handle s register list -
+    // though, not accurate)
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Load of 7 S registers.
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm7]>,
+    // Load of two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle,  SwiftVLDMPerm2]>,
+    // Load of 9 S registers.
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 5 D registers.
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Inaccurate: reuse describtion from 9 S registers.
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of three Q registers.
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
+    // Inaccurate: reuse describtion from 9 S registers.
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 7 D registers inaccurate.
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm7]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 4 Q registers.
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM7Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm4]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle,  SwiftVLDMPerm2]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVLDM],
+        (instregex "VLDM[SD](IA|DB)_UPD$")>;
+
+  def SwiftWriteVSTM : SchedWriteVariant<[
+    // One S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteSTM1]>,
+    // One D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM1]>,
+    // Three S registers.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM4]>,
+    // Assume one Q register.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM1]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM6]>,
+    // Assume three D registers.
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM8]>,
+    // Assume two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM10]>,
+    // Assume 5 D registers.
+    SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
+    // Assume three Q registers.
+    SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
+    // Assume 7 D registers.
+    SchedVar<SwiftLMAddr14Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr15Pred, [SwiftWriteSTM16]>,
+    // Assume four Q registers.
+    SchedVar<SwiftLMAddr16Pred, [SwiftWriteSTM5]>,
+    // Asumme two Q registers.
+    SchedVar<NoSchedPred, [SwiftWriteSTM3]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVSTM], (instregex "VSTM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVSTM],
+        (instregex "VSTM[SD](IA|DB)_UPD")>;
+
+  // 4.2.43 Advanced SIMD, Element or Structure Load and Store
+  def SwiftWrite2xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = 4;
+      let ResourceCycles = [2];
+  }
+  def SwiftWrite3xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = 4;
+      let ResourceCycles = [3];
+  }
+  foreach Num = 1-2 in {
+    def SwiftExt#Num#xP0 : SchedWriteRes<[SwiftUnitP0]> {
+      let Latency = 0;
+      let NumMicroOps = Num;
+      let ResourceCycles = [Num];
+    }
+  }
+  // VLDx
+  // Multiple structures.
+  // Single element structure loads.
+  // We assume aligned.
+  // Single/two register.
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteLM4Cy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+  // Three register.
+  def : InstRW<[SwiftWrite3xP2FourCy],
+        (instregex "VLD1(d|q)(8|16|32|64)T$", "VLD1d64TPseudo")>;
+  def : InstRW<[SwiftWrite3xP2FourCy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)Twb")>;
+  /// Four Register.
+  def : InstRW<[SwiftWrite2xP2FourCy],
+        (instregex "VLD1(d|q)(8|16|32|64)Q$", "VLD1d64QPseudo")>;
+  def : InstRW<[SwiftWrite2xP2FourCy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)Qwb")>;
+  // Two element structure loads.
+  // Two/four register.
+  def : InstRW<[SwiftWriteLM9Cy, SwiftExt2xP0, SwiftVLDMPerm2],
+        (instregex "VLD2(d|q|b)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm2],
+        (instregex "VLD2(d|q|b)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+                SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+                SwiftWriteP01OneCycle, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm3,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+  // Four element structure loads.
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteLM11Cy,  SwiftExt2xP0, SwiftVLDMPerm4,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+        (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+  // Single all/lane loads.
+  // One element structure.
+  def : InstRW<[SwiftWriteLM6Cy, SwiftVLDMPerm2],
+        (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm2],
+        (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
+                  "VLD1LNq(8|16|32)Pseudo_UPD")>;
+  // Two element structure.
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+                   "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftWriteP01OneCycle,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0,
+                SwiftVLDMPerm3],
+        (instregex "VLD3(DUP|LN)(d|q)(8|16|32)$",
+                   "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy,
+                SwiftWriteP01OneCycle, SwiftExt1xP0, SwiftVLDMPerm3],
+        (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
+                SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
+        (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element struture.
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
+        (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
+                   "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftWriteP01OneCycle, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+        (instregex "VLD4(DUP|LN)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteP01OneCycle, SwiftWriteLM9Cy,
+                SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+        (instregex "VLD4(DUP|LN)(d|q)(8|16|32)Pseudo_UPD")>;
+  // VSTx
+  // Multiple structures.
+  // Single element structure store.
+  def : InstRW<[SwiftWrite1xP2], (instregex "VST1d(8|16|32|64)$")>;
+  def : InstRW<[SwiftWrite2xP2], (instregex "VST1q(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2],
+        (instregex "VST1d(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2],
+        (instregex "VST1q(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWrite3xP2],
+        (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite3xP2],
+        (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+  def : InstRW<[SwiftWrite4xP2],
+        (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2],
+        (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+  // Two element structure store.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST2(d|b)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST2(b|d)(8|16|32)wb")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+  // Three element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3(d|q)(8|16|32)_UPD",
+                   "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Four element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm4],
+        (instregex "VST4(d|q)(8|16|32)_UPD",
+                   "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Single/all lane store.
+  // One element structure.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+  // Two element structure.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm2],
+        (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm2],
+        (instregex "VST2LN(d|q)(8|16|32)_UPD",
+                   "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3LN(d|q)(8|16|32)_UPD",
+                   "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element structure.
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST4LN(d|q)(8|16|32)_UPD",
+                   "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+  // 4.2.44 VFP, Divide and Square Root
+  def SwiftDiv17 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 17;
+    let ResourceCycles = [1, 15];
+  }
+  def SwiftDiv32 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 32;
+    let ResourceCycles = [1, 30];
+  }
+  def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
+  def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+
+  // Not specified.
+  def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
+  // Preload.
+  def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0;
+    let ResourceCycles = [0];
+  }
+
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
new file mode 100644
index 0000000..57d0bfb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
@@ -0,0 +1,300 @@
+//===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v6 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Model based on ARM1176
+//
+// Functional Units
+def V6_Pipe : FuncUnit; // pipeline
+
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual"
+//
+def ARMV6Itineraries : ProcessorItineraries<
+  [V6_Pipe], [], [
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [V6_Pipe]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iALUr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iBITr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr    , [InstrStage<1, [V6_Pipe]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr  , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iCMPr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iTSTr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMOVsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMOVsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iMOVix2  , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>], [5]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi   , [InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iCMOVr   , [InstrStage<1, [V6_Pipe]>], [3, 2]>,
+  InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [V6_Pipe]>], [3, 1]>,
+  InstrItinData<IIC_iCMOVsr  , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [4]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMVNr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMVNsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMVNsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>,
+
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iLoad_si   , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoad_siu,   [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+
+  //
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>,
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>,
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [3, 1]>,
+
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop     , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 4]>,
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r   , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si   , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru,   [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu,   [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m  , [InstrStage<3, [V6_Pipe]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [V6_Pipe]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+  //
+  // FP Load Multiple
+  InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  //
+  // Double-precision FP Store
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
+]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
new file mode 100644
index 0000000..3dcc0df
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -0,0 +1,199 @@
+//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-selectiondag-info"
+
+ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
+}
+
+SDValue
+ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                                             SDValue Chain,
+                                             SDValue Dst, SDValue Src,
+                                             SDValue Size, unsigned Align,
+                                             bool isVolatile, bool AlwaysInline,
+                                             MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const {
+  const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>();
+  // Do repeated 4-byte loads and stores. To be improved.
+  // This requires 4-byte alignment.
+  if ((Align & 3) != 0)
+    return SDValue();
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
+
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned i = 0;
+  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+  const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6;
+  SDValue TFOps[6];
+  SDValue Loads[6];
+  uint64_t SrcOff = 0, DstOff = 0;
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores.  The loads and stores will get combined into
+  // ldm/stm later on.
+  while (EmittedNumMemOps < NumMemOps) {
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      Loads[i] = DAG.getLoad(VT, dl, Chain,
+                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                         DAG.getConstant(SrcOff, MVT::i32)),
+                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
+                             false, false, 0);
+      TFOps[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        makeArrayRef(TFOps, i));
+
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                              DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                          DAG.getConstant(DstOff, MVT::i32)),
+                              DstPtrInfo.getWithOffset(DstOff),
+                              isVolatile, false, 0);
+      DstOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        makeArrayRef(TFOps, i));
+
+    EmittedNumMemOps += i;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                       DAG.getConstant(SrcOff, MVT::i32)),
+                           SrcPtrInfo.getWithOffset(SrcOff),
+                           false, false, false, 0);
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                      makeArrayRef(TFOps, i));
+
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                        DAG.getConstant(DstOff, MVT::i32)),
+                            DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     makeArrayRef(TFOps, i));
+}
+
+// Adjust parameters for memset, EABI uses format (ptr, size, value),
+// GNU library uses (ptr, value, size)
+// See RTABI section 4.3.4
+SDValue ARMSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
+                        SDValue Chain, SDValue Dst,
+                        SDValue Src, SDValue Size,
+                        unsigned Align, bool isVolatile,
+                        MachinePointerInfo DstPtrInfo) const {
+  const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>();
+  // Use default for non-AAPCS (or MachO) subtargets
+  if (!Subtarget.isAAPCS_ABI() || Subtarget.isTargetMachO() ||
+      Subtarget.isTargetWindows())
+    return SDValue();
+
+  const ARMTargetLowering &TLI =
+    *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  // First argument: data pointer
+  Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+
+  // Second argument: buffer size
+  Entry.Node = Size;
+  Entry.Ty = IntPtrTy;
+  Entry.isSExt = false;
+  Args.push_back(Entry);
+
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType().bitsGT(MVT::i32))
+    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  else
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+  // Third argument: value to fill
+  Entry.Node = Src;
+  Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+  Entry.isSExt = true;
+  Args.push_back(Entry);
+
+  // Emit __eabi_memset call
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET),
+               Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                     TLI.getPointerTy()), std::move(Args), 0)
+    .setDiscardResult();
+
+  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+  return CallResult.second;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
new file mode 100644
index 0000000..13769dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -0,0 +1,62 @@
+//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSELECTIONDAGINFO_H
+#define ARMSELECTIONDAGINFO_H
+
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+namespace ARM_AM {
+  static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) {
+    switch (Opcode) {
+    default:          return ARM_AM::no_shift;
+    case ISD::SHL:    return ARM_AM::lsl;
+    case ISD::SRL:    return ARM_AM::lsr;
+    case ISD::SRA:    return ARM_AM::asr;
+    case ISD::ROTR:   return ARM_AM::ror;
+    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
+    // Can't handle RRX here, because it would require folding a flag into
+    // the addressing mode.  :(  This causes us to miss certain things.
+    //case ARMISD::RRX: return ARM_AM::rrx;
+    }
+  }
+}  // end namespace ARM_AM
+
+class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit ARMSelectionDAGInfo(const DataLayout &DL);
+  ~ARMSelectionDAGInfo();
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  // Adjust parameters for memset, see RTABI section 4.3.4
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
+                                  SDValue Chain,
+                                  SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 0000000..c1b4562
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,440 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "ARMGenSubtargetInfo.inc"
+
+static cl::opt<bool>
+ReserveR9("arm-reserve-r9", cl::Hidden,
+          cl::desc("Reserve R9, making it unavailable as GPR"));
+
+static cl::opt<bool>
+ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+UseFusedMulOps("arm-use-mulops",
+               cl::init(true), cl::Hidden);
+
+enum AlignMode {
+  DefaultAlign,
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(DefaultAlign),
+      cl::values(
+          clEnumValN(DefaultAlign,  "arm-default-align",
+                     "Generate unaligned accesses only on hardware/OS "
+                     "combinations that are known to support them"),
+          clEnumValN(StrictAlign,   "arm-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "arm-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
+
+enum ITMode {
+  DefaultIT,
+  RestrictedIT,
+  NoRestrictedIT
+};
+
+static cl::opt<ITMode>
+IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
+   cl::ZeroOrMore,
+   cl::values(clEnumValN(DefaultIT, "arm-default-it",
+                         "Generate IT block based on arch"),
+              clEnumValN(RestrictedIT, "arm-restrict-it",
+                         "Disallow deprecated IT based on ARMv8"),
+              clEnumValN(NoRestrictedIT, "arm-no-restrict-it",
+                         "Allow IT blocks based on ARMv7"),
+              clEnumValEnd));
+
+static std::string computeDataLayout(ARMSubtarget &ST) {
+  std::string Ret = "";
+
+  if (ST.isLittle())
+    // Little endian.
+    Ret += "e";
+  else
+    // Big endian.
+    Ret += "E";
+
+  Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+
+  // Pointers are 32 bits and aligned to 32 bits.
+  Ret += "-p:32:32";
+
+  // On thumb, i16,i18 and i1 have natural aligment requirements, but we try to
+  // align to 32.
+  if (ST.isThumb())
+    Ret += "-i1:8:32-i8:8:32-i16:16:32";
+
+  // ABIs other than APCS have 64 bit integers with natural alignment.
+  if (!ST.isAPCS_ABI())
+    Ret += "-i64:64";
+
+  // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
+  // bits, others to 64 bits. We always try to align to 64 bits.
+  if (ST.isAPCS_ABI())
+    Ret += "-f64:32:64";
+
+  // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+  // to 64. We always ty to give them natural alignment.
+  if (ST.isAPCS_ABI())
+    Ret += "-v64:32:64-v128:32:128";
+  else
+    Ret += "-v128:64:128";
+
+  // On thumb and APCS, only try to align aggregates to 32 bits (the default is
+  // 64 bits).
+  if (ST.isThumb() || ST.isAPCS_ABI())
+    Ret += "-a:0:32";
+
+  // Integer registers are 32 bits.
+  Ret += "-n32";
+
+  // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
+  // aligned everywhere else.
+  if (ST.isTargetNaCl())
+    Ret += "-S128";
+  else if (ST.isAAPCS_ABI())
+    Ret += "-S64";
+  else
+    Ret += "-S32";
+
+  return Ret;
+}
+
+/// initializeSubtargetDependencies - Initializes using a CPU and feature string
+/// so that we can use initializer lists for subtarget initialization.
+ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                            StringRef FS) {
+  initializeEnvironment();
+  resetSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, TargetMachine &TM,
+                           bool IsLittle, const TargetOptions &Options)
+    : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
+      ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle),
+      TargetTriple(TT), Options(Options), TargetABI(ARM_ABI_UNKNOWN),
+      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
+      TSInfo(DL), JITInfo(),
+      InstrInfo(isThumb1Only()
+                    ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
+                    : !isThumb()
+                          ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
+                          : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
+      TLInfo(TM),
+      FrameLowering(!isThumb1Only()
+                        ? new ARMFrameLowering(*this)
+                        : (ARMFrameLowering *)new Thumb1FrameLowering(*this)) {}
+
+void ARMSubtarget::initializeEnvironment() {
+  HasV4TOps = false;
+  HasV5TOps = false;
+  HasV5TEOps = false;
+  HasV6Ops = false;
+  HasV6MOps = false;
+  HasV6T2Ops = false;
+  HasV7Ops = false;
+  HasV8Ops = false;
+  HasVFPv2 = false;
+  HasVFPv3 = false;
+  HasVFPv4 = false;
+  HasFPARMv8 = false;
+  HasNEON = false;
+  UseNEONForSinglePrecisionFP = false;
+  UseMulOps = UseFusedMulOps;
+  SlowFPVMLx = false;
+  HasVMLxForwarding = false;
+  SlowFPBrcc = false;
+  InThumbMode = false;
+  HasThumb2 = false;
+  NoARM = false;
+  IsR9Reserved = ReserveR9;
+  UseMovt = false;
+  SupportsTailCall = false;
+  HasFP16 = false;
+  HasD16 = false;
+  HasHardwareDivide = false;
+  HasHardwareDivideInARM = false;
+  HasT2ExtractPack = false;
+  HasDataBarrier = false;
+  Pref32BitThumb = false;
+  AvoidCPSRPartialUpdate = false;
+  AvoidMOVsShifterOperand = false;
+  HasRAS = false;
+  HasMPExtension = false;
+  HasVirtualization = false;
+  FPOnlySP = false;
+  HasPerfMon = false;
+  HasTrustZone = false;
+  HasCrypto = false;
+  HasCRC = false;
+  HasZeroCycleZeroing = false;
+  AllowsUnalignedMem = false;
+  Thumb2DSP = false;
+  UseNaClTrap = false;
+  UnsafeFPMath = false;
+}
+
+void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                           "target-cpu");
+  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                          "target-features");
+  std::string CPU =
+    !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
+  std::string FS =
+    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+  if (!FS.empty()) {
+    initializeEnvironment();
+    resetSubtargetFeatures(CPU, FS);
+  }
+}
+
+void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+  if (CPUString.empty()) {
+    if (isTargetIOS() && TargetTriple.getArchName().endswith("v7s"))
+      // Default to the Swift CPU when targeting armv7s/thumbv7s.
+      CPUString = "swift";
+    else
+      CPUString = "generic";
+  }
+
+  // Insert the architecture feature derived from the target triple into the
+  // feature string. This is important for setting features that are implied
+  // based on the architecture version.
+  std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple.getTriple(),
+                                              CPUString);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+  ParseSubtargetFeatures(CPUString, ArchFS);
+
+  // FIXME: This used enable V6T2 support implicitly for Thumb2 mode.
+  // Assert this for now to make the change obvious.
+  assert(hasV6T2Ops() || !hasThumb2());
+
+  // Keep a pointer to static instruction cost data for the specified CPU.
+  SchedModel = getSchedModelForCPU(CPUString);
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  if (TargetABI == ARM_ABI_UNKNOWN) {
+    switch (TargetTriple.getEnvironment()) {
+    case Triple::Android:
+    case Triple::EABI:
+    case Triple::EABIHF:
+    case Triple::GNUEABI:
+    case Triple::GNUEABIHF:
+      TargetABI = ARM_ABI_AAPCS;
+      break;
+    default:
+      if ((isTargetIOS() && isMClass()) ||
+          (TargetTriple.isOSBinFormatMachO() &&
+           TargetTriple.getOS() == Triple::UnknownOS))
+        TargetABI = ARM_ABI_AAPCS;
+      else
+        TargetABI = ARM_ABI_APCS;
+      break;
+    }
+  }
+
+  // FIXME: this is invalid for WindowsCE
+  if (isTargetWindows()) {
+    TargetABI = ARM_ABI_AAPCS;
+    NoARM = true;
+  }
+
+  if (isAAPCS_ABI())
+    stackAlignment = 8;
+  if (isTargetNaCl())
+    stackAlignment = 16;
+
+  UseMovt = hasV6T2Ops() && ArmUseMOVT;
+
+  if (isTargetMachO()) {
+    IsR9Reserved = ReserveR9 | !HasV6Ops;
+    SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0);
+  } else {
+    IsR9Reserved = ReserveR9;
+    SupportsTailCall = !isThumb1Only();
+  }
+
+  switch (Align) {
+    case DefaultAlign:
+      // Assume pre-ARMv6 doesn't support unaligned accesses.
+      //
+      // ARMv6 may or may not support unaligned accesses depending on the
+      // SCTLR.U bit, which is architecture-specific. We assume ARMv6
+      // Darwin and NetBSD targets support unaligned accesses, and others don't.
+      //
+      // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
+      // which raises an alignment fault on unaligned accesses. Linux
+      // defaults this bit to 0 and handles it as a system-wide (not
+      // per-process) setting. It is therefore safe to assume that ARMv7+
+      // Linux targets support unaligned accesses. The same goes for NaCl.
+      //
+      // The above behavior is consistent with GCC.
+      AllowsUnalignedMem =
+          (hasV7Ops() && (isTargetLinux() || isTargetNaCl() ||
+                          isTargetNetBSD())) ||
+          (hasV6Ops() && (isTargetMachO() || isTargetNetBSD()));
+      // The one exception is cortex-m0, which despite being v6, does not
+      // support unaligned accesses. Rather than make the above boolean
+      // expression even more obtuse, just override the value here.
+      if (isThumb1Only() && isMClass())
+        AllowsUnalignedMem = false;
+      break;
+    case StrictAlign:
+      AllowsUnalignedMem = false;
+      break;
+    case NoStrictAlign:
+      AllowsUnalignedMem = true;
+      break;
+  }
+
+  switch (IT) {
+  case DefaultIT:
+    RestrictIT = hasV8Ops() ? true : false;
+    break;
+  case RestrictedIT:
+    RestrictIT = true;
+    break;
+  case NoRestrictedIT:
+    RestrictIT = false;
+    break;
+  }
+
+  // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
+  uint64_t Bits = getFeatureBits();
+  if ((Bits & ARM::ProcA5 || Bits & ARM::ProcA8) && // Where this matters
+      (Options.UnsafeFPMath || isTargetDarwin()))
+    UseNEONForSinglePrecisionFP = true;
+}
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
+bool
+ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
+                                 Reloc::Model RelocM) const {
+  if (RelocM == Reloc::Static)
+    return false;
+
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
+
+  if (!isTargetMachO()) {
+    // Extra load is needed for all externally visible.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return false;
+    return true;
+  } else {
+    if (RelocM == Reloc::PIC_) {
+      // If this is a strong reference to a definition, it is definitely not
+      // through a stub.
+      if (!isDecl && !GV->isWeakForLinker())
+        return false;
+
+      // Unless we have a symbol with hidden visibility, we have to go through a
+      // normal $non_lazy_ptr stub because this symbol might be resolved late.
+      if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+        return true;
+
+      // If symbol visibility is hidden, we have a stub for common symbol
+      // references and external declarations.
+      if (isDecl || GV->hasCommonLinkage())
+        // Hidden $non_lazy_ptr reference.
+        return true;
+
+      return false;
+    } else {
+      // If this is a strong reference to a definition, it is definitely not
+      // through a stub.
+      if (!isDecl && !GV->isWeakForLinker())
+        return false;
+
+      // Unless we have a symbol with hidden visibility, we have to go through a
+      // normal $non_lazy_ptr stub because this symbol might be resolved late.
+      if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+        return true;
+    }
+  }
+
+  return false;
+}
+
+unsigned ARMSubtarget::getMispredictionPenalty() const {
+  return SchedModel->MispredictPenalty;
+}
+
+bool ARMSubtarget::hasSinCos() const {
+  return getTargetTriple().getOS() == Triple::IOS &&
+    !getTargetTriple().isOSVersionLT(7, 0);
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool ARMSubtarget::enablePostMachineScheduler() const {
+  return (!isThumb() || hasThumb2());
+}
+
+bool ARMSubtarget::enableAtomicExpandLoadLinked() const {
+  return hasAnyDataBarrier() && !isThumb1Only();
+}
+
+bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
+  // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
+  // immediates as it is inherently position independent, and may be out of
+  // range otherwise.
+  return UseMovt && (isTargetWindows() ||
+                     !MF.getFunction()->getAttributes().hasAttribute(
+                         AttributeSet::FunctionIndex, Attribute::MinSize));
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 0000000..f8283b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,450 @@
+//===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSUBTARGET_H
+#define ARMSUBTARGET_H
+
+
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "ARMJITInfo.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "ARMGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetOptions;
+
+class ARMSubtarget : public ARMGenSubtargetInfo {
+protected:
+  enum ARMProcFamilyEnum {
+    Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
+    CortexR5, Swift, CortexA53, CortexA57, Krait
+  };
+  enum ARMProcClassEnum {
+    None, AClass, RClass, MClass
+  };
+
+  /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+  ARMProcFamilyEnum ARMProcFamily;
+
+  /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+  ARMProcClassEnum ARMProcClass;
+
+  /// HasV4TOps, HasV5TOps, HasV5TEOps,
+  /// HasV6Ops, HasV6MOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
+  /// Specify whether target support specific ARM ISA variants.
+  bool HasV4TOps;
+  bool HasV5TOps;
+  bool HasV5TEOps;
+  bool HasV6Ops;
+  bool HasV6MOps;
+  bool HasV6T2Ops;
+  bool HasV7Ops;
+  bool HasV8Ops;
+
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
+  /// floating point ISAs are supported.
+  bool HasVFPv2;
+  bool HasVFPv3;
+  bool HasVFPv4;
+  bool HasFPARMv8;
+  bool HasNEON;
+
+  /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
+  /// specified. Use the method useNEONForSinglePrecisionFP() to
+  /// determine if NEON should actually be used.
+  bool UseNEONForSinglePrecisionFP;
+
+  /// UseMulOps - True if non-microcoded fused integer multiply-add and
+  /// multiply-subtract instructions should be used.
+  bool UseMulOps;
+
+  /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+  /// whether the FP VML[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVMLx;
+
+  /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+  /// forwarding to allow mul + mla being issued back to back.
+  bool HasVMLxForwarding;
+
+  /// SlowFPBrcc - True if floating point compare + branch is slow.
+  bool SlowFPBrcc;
+
+  /// InThumbMode - True if compiling for Thumb, false for ARM.
+  bool InThumbMode;
+
+  /// HasThumb2 - True if Thumb2 instructions are supported.
+  bool HasThumb2;
+
+  /// NoARM - True if subtarget does not support ARM mode execution.
+  bool NoARM;
+
+  /// IsR9Reserved - True if R9 is a not available as general purpose register.
+  bool IsR9Reserved;
+
+  /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit
+  /// imms (including global addresses).
+  bool UseMovt;
+
+  /// SupportsTailCall - True if the OS supports tail call. The dynamic linker
+  /// must be able to synthesize call stubs for interworking between ARM and
+  /// Thumb.
+  bool SupportsTailCall;
+
+  /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF
+  /// only so far)
+  bool HasFP16;
+
+  /// HasD16 - True if subtarget is limited to 16 double precision
+  /// FP registers for VFPv3.
+  bool HasD16;
+
+  /// HasHardwareDivide - True if subtarget supports [su]div
+  bool HasHardwareDivide;
+
+  /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+  bool HasHardwareDivideInARM;
+
+  /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
+  /// instructions.
+  bool HasT2ExtractPack;
+
+  /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
+  /// instructions.
+  bool HasDataBarrier;
+
+  /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
+  /// over 16-bit ones.
+  bool Pref32BitThumb;
+
+  /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
+  /// that partially update CPSR and add false dependency on the previous
+  /// CPSR setting instruction.
+  bool AvoidCPSRPartialUpdate;
+
+  /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
+  /// movs with shifter operand (i.e. asr, lsl, lsr).
+  bool AvoidMOVsShifterOperand;
+
+  /// HasRAS - Some processors perform return stack prediction. CodeGen should
+  /// avoid issue "normal" call instructions to callees which do not return.
+  bool HasRAS;
+
+  /// HasMPExtension - True if the subtarget supports Multiprocessing
+  /// extension (ARMv7 only).
+  bool HasMPExtension;
+
+  /// HasVirtualization - True if the subtarget supports the Virtualization
+  /// extension.
+  bool HasVirtualization;
+
+  /// FPOnlySP - If true, the floating point unit only supports single
+  /// precision.
+  bool FPOnlySP;
+
+  /// If true, the processor supports the Performance Monitor Extensions. These
+  /// include a generic cycle-counter as well as more fine-grained (often
+  /// implementation-specific) events.
+  bool HasPerfMon;
+
+  /// HasTrustZone - if true, processor supports TrustZone security extensions
+  bool HasTrustZone;
+
+  /// HasCrypto - if true, processor supports Cryptography extensions
+  bool HasCrypto;
+
+  /// HasCRC - if true, processor supports CRC instructions
+  bool HasCRC;
+
+  /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
+  /// particularly effective at zeroing a VFP register.
+  bool HasZeroCycleZeroing;
+
+  /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
+  /// accesses for some types.  For details, see
+  /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
+  bool AllowsUnalignedMem;
+
+  /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
+  ///  blocks to conform to ARMv8 rule.
+  bool RestrictIT;
+
+  /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith
+  /// and such) instructions in Thumb2 code.
+  bool Thumb2DSP;
+
+  /// NaCl TRAP instruction is generated instead of the regular TRAP.
+  bool UseNaClTrap;
+
+  /// Target machine allowed unsafe FP math (such as use of NEON fp)
+  bool UnsafeFPMath;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+  /// IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// SchedModel - Processor specific instruction costs.
+  const MCSchedModel *SchedModel;
+
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+
+  /// Options passed via command line that could influence the target
+  const TargetOptions &Options;
+
+ public:
+  enum {
+    ARM_ABI_UNKNOWN,
+    ARM_ABI_APCS,
+    ARM_ABI_AAPCS // ARM EABI
+  } TargetABI;
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  ARMSubtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS, TargetMachine &TM, bool IsLittle,
+               const TargetOptions &Options);
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const {
+    return 64;
+  }
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// \brief Reset the features for the ARM target.
+  void resetSubtargetFeatures(const MachineFunction *MF) override;
+
+  /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+  /// so that we can use initializer lists for subtarget initialization.
+  ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  const DataLayout *getDataLayout() const { return &DL; }
+  const ARMSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  ARMJITInfo *getJITInfo() { return &JITInfo; }
+  const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo.get(); }
+  const ARMTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const ARMFrameLowering *getFrameLowering() const { return FrameLowering.get(); }
+  const ARMBaseRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo->getRegisterInfo();
+  }
+
+private:
+  const DataLayout DL;
+  ARMSelectionDAGInfo TSInfo;
+  ARMJITInfo JITInfo;
+  // Either Thumb1InstrInfo or Thumb2InstrInfo.
+  std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
+  ARMTargetLowering   TLInfo;
+  // Either Thumb1FrameLowering or ARMFrameLowering.
+  std::unique_ptr<ARMFrameLowering> FrameLowering;
+
+  void initializeEnvironment();
+  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+  void computeIssueWidth();
+
+  bool hasV4TOps()  const { return HasV4TOps;  }
+  bool hasV5TOps()  const { return HasV5TOps;  }
+  bool hasV5TEOps() const { return HasV5TEOps; }
+  bool hasV6Ops()   const { return HasV6Ops;   }
+  bool hasV6MOps()  const { return HasV6MOps;  }
+  bool hasV6T2Ops() const { return HasV6T2Ops; }
+  bool hasV7Ops()   const { return HasV7Ops;  }
+  bool hasV8Ops()   const { return HasV8Ops;  }
+
+  bool isCortexA5() const { return ARMProcFamily == CortexA5; }
+  bool isCortexA7() const { return ARMProcFamily == CortexA7; }
+  bool isCortexA8() const { return ARMProcFamily == CortexA8; }
+  bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+  bool isCortexA15() const { return ARMProcFamily == CortexA15; }
+  bool isSwift()    const { return ARMProcFamily == Swift; }
+  bool isCortexM3() const { return CPUString == "cortex-m3"; }
+  bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
+  bool isCortexR5() const { return ARMProcFamily == CortexR5; }
+  bool isKrait() const { return ARMProcFamily == Krait; }
+
+  bool hasARMOps() const { return !NoARM; }
+
+  bool hasVFP2() const { return HasVFPv2; }
+  bool hasVFP3() const { return HasVFPv3; }
+  bool hasVFP4() const { return HasVFPv4; }
+  bool hasFPARMv8() const { return HasFPARMv8; }
+  bool hasNEON() const { return HasNEON;  }
+  bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+  bool hasVirtualization() const { return HasVirtualization; }
+  bool useNEONForSinglePrecisionFP() const {
+    return hasNEON() && UseNEONForSinglePrecisionFP; }
+
+  bool hasDivide() const { return HasHardwareDivide; }
+  bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
+  bool hasT2ExtractPack() const { return HasT2ExtractPack; }
+  bool hasDataBarrier() const { return HasDataBarrier; }
+  bool hasAnyDataBarrier() const {
+    return HasDataBarrier || (hasV6Ops() && !isThumb());
+  }
+  bool useMulOps() const { return UseMulOps; }
+  bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool hasVMLxForwarding() const { return HasVMLxForwarding; }
+  bool isFPBrccSlow() const { return SlowFPBrcc; }
+  bool isFPOnlySP() const { return FPOnlySP; }
+  bool hasPerfMon() const { return HasPerfMon; }
+  bool hasTrustZone() const { return HasTrustZone; }
+  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+  bool prefers32BitThumb() const { return Pref32BitThumb; }
+  bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+  bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
+  bool hasRAS() const { return HasRAS; }
+  bool hasMPExtension() const { return HasMPExtension; }
+  bool hasThumb2DSP() const { return Thumb2DSP; }
+  bool useNaClTrap() const { return UseNaClTrap; }
+
+  bool hasFP16() const { return HasFP16; }
+  bool hasD16() const { return HasD16; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetIOS() const { return TargetTriple.isiOS(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isTargetNetBSD() const { return TargetTriple.getOS() == Triple::NetBSD; }
+  bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+
+  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  // ARM EABI is the bare-metal EABI described in ARM ABI documents and
+  // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
+  // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
+  // even for GNUEABI, so we can make a distinction here and still conform to
+  // the EABI on GNU (and Android) mode. This requires change in Clang, too.
+  // FIXME: The Darwin exception is temporary, while we move users to
+  // "*-*-*-macho" triples as quickly as possible.
+  bool isTargetAEABI() const {
+    return (TargetTriple.getEnvironment() == Triple::EABI ||
+            TargetTriple.getEnvironment() == Triple::EABIHF) &&
+           !isTargetDarwin() && !isTargetWindows();
+  }
+
+  // ARM Targets that support EHABI exception handling standard
+  // Darwin uses SjLj. Other targets might need more checks.
+  bool isTargetEHABICompatible() const {
+    return (TargetTriple.getEnvironment() == Triple::EABI ||
+            TargetTriple.getEnvironment() == Triple::GNUEABI ||
+            TargetTriple.getEnvironment() == Triple::EABIHF ||
+            TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+            TargetTriple.getEnvironment() == Triple::Android) &&
+           !isTargetDarwin() && !isTargetWindows();
+  }
+
+  bool isTargetHardFloat() const {
+    // FIXME: this is invalid for WindowsCE
+    return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+           TargetTriple.getEnvironment() == Triple::EABIHF ||
+           isTargetWindows();
+  }
+  bool isTargetAndroid() const {
+    return TargetTriple.getEnvironment() == Triple::Android;
+  }
+
+  bool isAPCS_ABI() const {
+    assert(TargetABI != ARM_ABI_UNKNOWN);
+    return TargetABI == ARM_ABI_APCS;
+  }
+  bool isAAPCS_ABI() const {
+    assert(TargetABI != ARM_ABI_UNKNOWN);
+    return TargetABI == ARM_ABI_AAPCS;
+  }
+
+  bool isThumb() const { return InThumbMode; }
+  bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
+  bool isThumb2() const { return InThumbMode && HasThumb2; }
+  bool hasThumb2() const { return HasThumb2; }
+  bool isMClass() const { return ARMProcClass == MClass; }
+  bool isRClass() const { return ARMProcClass == RClass; }
+  bool isAClass() const { return ARMProcClass == AClass; }
+
+  bool isR9Reserved() const { return IsR9Reserved; }
+
+  bool useMovt(const MachineFunction &MF) const;
+
+  bool supportsTailCall() const { return SupportsTailCall; }
+
+  bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
+
+  bool restrictIT() const { return RestrictIT; }
+
+  const std::string & getCPUString() const { return CPUString; }
+
+  bool isLittle() const { return IsLittle; }
+
+  unsigned getMispredictionPenalty() const;
+
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
+
+  /// True for some subtargets at > -O0.
+  bool enablePostMachineScheduler() const override;
+
+  // enableAtomicExpandLoadLinked - True if we need to expand our atomics.
+  bool enableAtomicExpandLoadLinked() const override;
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
+  /// symbol.
+  bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+
+};
+} // End llvm namespace
+
+#endif  // ARMSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 0000000..d85194b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,253 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "ARMFrameLowering.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool>
+DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
+                   cl::desc("Inhibit optimization of S->D register accesses on A15"),
+                   cl::init(false));
+
+static cl::opt<bool>
+EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
+                 cl::desc("Run SimplifyCFG after expanding atomic operations"
+                          " to make use of cmpxchg flow-based information"),
+                 cl::init(true));
+
+extern "C" void LLVMInitializeARMTarget() {
+  // Register the target.
+  RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget);
+  RegisterTargetMachine<ARMBETargetMachine> Y(TheARMBETarget);
+  RegisterTargetMachine<ThumbLETargetMachine> A(TheThumbLETarget);
+  RegisterTargetMachine<ThumbBETargetMachine> B(TheThumbBETarget);
+}
+
+
+/// TargetMachine ctor - Create an ARM architecture model.
+///
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL, bool isLittle)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this, isLittle, Options) {
+
+  // Default to triple-appropriate float ABI
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType =
+        Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
+}
+
+void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our ARM pass. This
+  // allows the ARM pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createARMTargetTransformInfoPass(this));
+}
+
+
+void ARMTargetMachine::anchor() { }
+
+ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                                   StringRef FS, const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL, bool isLittle)
+    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+  initAsmInfo();
+  if (!Subtarget.hasARMOps())
+    report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
+                       "support ARM mode execution!");
+}
+
+void ARMLETargetMachine::anchor() { }
+
+ARMLETargetMachine::ARMLETargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ARMBETargetMachine::anchor() { }
+
+ARMBETargetMachine::ARMBETargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void ThumbTargetMachine::anchor() { }
+
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL, bool isLittle)
+    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+                           isLittle) {
+  initAsmInfo();
+}
+
+void ThumbLETargetMachine::anchor() { }
+
+ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ThumbBETargetMachine::anchor() { }
+
+ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+namespace {
+/// ARM Code Generator Pass Configuration Options.
+class ARMPassConfig : public TargetPassConfig {
+public:
+  ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  ARMBaseTargetMachine &getARMTargetMachine() const {
+    return getTM<ARMBaseTargetMachine>();
+  }
+
+  const ARMSubtarget &getARMSubtarget() const {
+    return *getARMTargetMachine().getSubtargetImpl();
+  }
+
+  void addIRPasses() override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new ARMPassConfig(this, PM);
+}
+
+void ARMPassConfig::addIRPasses() {
+  addPass(createAtomicExpandLoadLinkedPass(TM));
+
+  // Cmpxchg instructions are often used with a subsequent comparison to
+  // determine whether it succeeded. We can exploit existing control-flow in
+  // ldrex/strex loops to simplify this, but it needs tidying up.
+  const ARMSubtarget *Subtarget = &getARMSubtarget();
+  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
+    if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+      addPass(createCFGSimplificationPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool ARMPassConfig::addPreISel() {
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createGlobalMergePass(TM));
+
+  return false;
+}
+
+bool ARMPassConfig::addInstSelector() {
+  addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
+
+  const ARMSubtarget *Subtarget = &getARMSubtarget();
+  if (Subtarget->isTargetELF() && !Subtarget->isThumb1Only() &&
+      TM->Options.EnableFastISel)
+    addPass(createARMGlobalBaseRegPass());
+  return false;
+}
+
+bool ARMPassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createARMLoadStoreOptimizationPass(true));
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
+    addPass(createMLxExpansionPass());
+  // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
+  // enabled when NEON is available.
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA15() &&
+    getARMSubtarget().hasNEON() && !DisableA15SDOptimization) {
+    addPass(createA15SDOptimizerPass());
+  }
+  return true;
+}
+
+bool ARMPassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(createARMLoadStoreOptimizationPass());
+    printAndVerify("After ARM load / store optimizer");
+
+    if (getARMSubtarget().hasNEON())
+      addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+  }
+
+  // Expand some pseudo instructions into multiple instructions to allow
+  // proper scheduling.
+  addPass(createARMExpandPseudoPass());
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (!getARMSubtarget().isThumb1Only()) {
+      // in v8, IfConversion depends on Thumb instruction widths
+      if (getARMSubtarget().restrictIT() &&
+          !getARMSubtarget().prefers32BitThumb())
+        addPass(createThumb2SizeReductionPass());
+      addPass(&IfConverterID);
+    }
+  }
+  if (getARMSubtarget().isThumb2())
+    addPass(createThumb2ITBlockPass());
+
+  return true;
+}
+
+bool ARMPassConfig::addPreEmitPass() {
+  if (getARMSubtarget().isThumb2()) {
+    if (!getARMSubtarget().prefers32BitThumb())
+      addPass(createThumb2SizeReductionPass());
+
+    // Constant island pass work on unbundled instructions.
+    addPass(&UnpackMachineBundlesID);
+  }
+
+  addPass(createARMOptimizeBarriersPass());
+  addPass(createARMConstantIslandPass());
+
+  return true;
+}
+
+bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                          JITCodeEmitter &JCE) {
+  // Machine code emitter pass for ARM.
+  PM.add(createARMJITCodeEmitterPass(*this, JCE));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 0000000..b72b1df
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,135 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETMACHINE_H
+#define ARMTARGETMACHINE_H
+
+#include "ARMInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMBaseTargetMachine : public LLVMTargetMachine {
+protected:
+  ARMSubtarget        Subtarget;
+public:
+  ARMBaseTargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL,
+                       bool isLittle);
+
+  const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const ARMBaseRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const ARMTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  const ARMBaseInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const ARMFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &getSubtargetImpl()->getInstrItineraryData();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  ARMJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
+
+  /// \brief Register ARM analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &MCE) override;
+};
+
+/// ARMTargetMachine - ARM target machine.
+///
+class ARMTargetMachine : public ARMBaseTargetMachine {
+  virtual void anchor();
+ public:
+   ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                    const TargetOptions &Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
+};
+
+/// ARMLETargetMachine - ARM little endian target machine.
+///
+class ARMLETargetMachine : public ARMTargetMachine {
+  void anchor() override;
+public:
+  ARMLETargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+/// ARMBETargetMachine - ARM big endian target machine.
+///
+class ARMBETargetMachine : public ARMTargetMachine {
+  void anchor() override;
+public:
+  ARMBETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                     const TargetOptions &Options, Reloc::Model RM,
+                     CodeModel::Model CM, CodeGenOpt::Level OL);
+};
+
+/// ThumbTargetMachine - Thumb target machine.
+/// Due to the way architectures are handled, this represents both
+///   Thumb-1 and Thumb-2.
+///
+class ThumbTargetMachine : public ARMBaseTargetMachine {
+  virtual void anchor();
+public:
+  ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                     const TargetOptions &Options, Reloc::Model RM,
+                     CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
+};
+
+/// ThumbLETargetMachine - Thumb little endian target machine.
+///
+class ThumbLETargetMachine : public ThumbTargetMachine {
+  void anchor() override;
+public:
+  ThumbLETargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+/// ThumbBETargetMachine - Thumb big endian target machine.
+///
+class ThumbBETargetMachine : public ThumbTargetMachine {
+  void anchor() override;
+public:
+  ThumbBETargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
new file mode 100644
index 0000000..48238bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -0,0 +1,63 @@
+//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetObjectFile.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+using namespace dwarf;
+
+//===----------------------------------------------------------------------===//
+//                               ELF Target
+//===----------------------------------------------------------------------===//
+
+void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
+                                        const TargetMachine &TM) {
+  bool isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI();
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(isAAPCS_ABI);
+
+  if (isAAPCS_ABI) {
+    LSDASection = nullptr;
+  }
+
+  AttributesSection =
+    getContext().getELFSection(".ARM.attributes",
+                               ELF::SHT_ARM_ATTRIBUTES,
+                               0,
+                               SectionKind::getMetadata());
+}
+
+const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+    const TargetMachine &TM, MachineModuleInfo *MMI,
+    MCStreamer &Streamer) const {
+  if (TM.getMCAsmInfo()->getExceptionHandlingType() != ExceptionHandling::ARM)
+    return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+        GV, Encoding, Mang, TM, MMI, Streamer);
+
+  assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
+
+  return MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang),
+                                 MCSymbolRefExpr::VK_ARM_TARGET2, getContext());
+}
+
+const MCExpr *ARMElfTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO,
+                                 getContext());
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 0000000..c926421
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,43 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+protected:
+  const MCSection *AttributesSection;
+public:
+  ARMElfTargetObjectFile() :
+    TargetLoweringObjectFileELF(),
+    AttributesSection(nullptr)
+  {}
+
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+  const MCExpr *
+  getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+                          Mangler &Mang, const TargetMachine &TM,
+                          MachineModuleInfo *MMI,
+                          MCStreamer &Streamer) const override;
+
+  /// \brief Describe a TLS variable address within debug info.
+  const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
new file mode 100644
index 0000000..a2ace62
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -0,0 +1,586 @@
+//===-- ARMTargetTransformInfo.cpp - ARM specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// ARM target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "armtti"
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeARMTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class ARMTTI final : public ImmutablePass, public TargetTransformInfo {
+  const ARMBaseTargetMachine *TM;
+  const ARMSubtarget *ST;
+  const ARMTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  ARMTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  ARMTTI(const ARMBaseTargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializeARMTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  void initializePass() override {
+    pushTTIStack(this);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  using TargetTransformInfo::getIntImmCost;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+  /// @}
+
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 16;
+      return 0;
+    }
+
+    if (ST->isThumb1Only())
+      return 8;
+    return 13;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+
+    return 32;
+  }
+
+  unsigned getMaximumUnrollFactor() const override {
+    // These are out of order CPUs:
+    if (ST->isCortexA15() || ST->isSwift())
+      return 2;
+    return 1;
+  }
+
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index, Type *SubTp) const override;
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                            Type *Src) const override;
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy) const override;
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index) const override;
+
+  unsigned getAddressComputationCost(Type *Val,
+                                     bool IsComplex) const override;
+
+  unsigned
+  getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                         OperandValueKind Op1Info = OK_AnyValue,
+                         OperandValueKind Op2Info = OK_AnyValue) const override;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(ARMTTI, TargetTransformInfo, "armtti",
+                   "ARM Target Transform Info", true, true, false)
+char ARMTTI::ID = 0;
+
+ImmutablePass *
+llvm::createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM) {
+  return new ARMTTI(TM);
+}
+
+
+unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned Bits = Ty->getPrimitiveSizeInBits();
+  if (Bits == 0 || Bits > 32)
+    return 4;
+
+  int32_t SImmVal = Imm.getSExtValue();
+  uint32_t ZImmVal = Imm.getZExtValue();
+  if (!ST->isThumb()) {
+    if ((SImmVal >= 0 && SImmVal < 65536) ||
+        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
+        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  }
+  if (ST->isThumb2()) {
+    if ((SImmVal >= 0 && SImmVal < 65536) ||
+        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
+        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  }
+  // Thumb1.
+  if (SImmVal >= 0 && SImmVal < 256)
+    return 1;
+  if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
+    return 2;
+  // Load from constantpool.
+  return 3;
+}
+
+unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                  Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Single to/from double precision conversions.
+  static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = {
+    // Vector fptrunc/fpext conversions.
+    { ISD::FP_ROUND,   MVT::v2f64, 2 },
+    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
+    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
+  };
+
+  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
+                                          ISD == ISD::FP_EXTEND)) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+    int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * NEONFltDblTbl[Idx].Cost;
+  }
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  // Some arithmetic, load and store operations have specific instructions
+  // to cast up/down their types automatically at no extra cost.
+  // TODO: Get these tables to know at least what the related operations are.
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONVectorConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
+    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
+
+    // The number of vmovl instructions for the extension.
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
+    // Operations that we legalize using splitting.
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
+    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
+
+    // Vector float <-> i32 conversions.
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+
+    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
+
+    // Vector double <-> i32 conversions.
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
+  };
+
+  if (SrcTy.isVector() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONVectorConversionTbl[Idx].Cost;
+  }
+
+  // Scalar float to integer conversions.
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONFloatConversionTbl[] = {
+    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
+    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
+  };
+  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+        return NEONFloatConversionTbl[Idx].Cost;
+  }
+
+  // Scalar integer to float conversions.
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONIntegerConversionTbl[] = {
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
+  };
+
+  if (SrcTy.isInteger() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONIntegerConversionTbl[Idx].Cost;
+  }
+
+  // Scalar integer conversion costs.
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  ARMIntegerConversionTbl[] = {
+    // i16 -> i64 requires two dependent operations.
+    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
+
+    // Truncates on i64 are assumed to be free.
+    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
+  };
+
+  if (SrcTy.isInteger()) {
+    int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return ARMIntegerConversionTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                    unsigned Index) const {
+  // Penalize inserting into an D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
+  if (ST->isSwift() &&
+      Opcode == Instruction::InsertElement &&
+      ValTy->isVectorTy() &&
+      ValTy->getScalarSizeInBits() <= 32)
+    return 3;
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
+}
+
+unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // On NEON a a vector select gets lowered to vbsl.
+  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // Lowering of some vector selects is currently far from perfect.
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    NEONVectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
+                                       SelCondTy.getSimpleVT(),
+                                       SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return NEONVectorSelectTbl[Idx].Cost;
+    }
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+    return LT.first;
+  }
+
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  // We only handle costs of reverse and alternate shuffles for now.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  if (Kind == SK_Reverse) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+        // Reverse shuffle cost one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+    return LT.first * NEONShuffleTbl[Idx].Cost;
+  }
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    int Idx =
+        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * NEONAltShuffleTbl[Idx].Cost;
+  }
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                        OperandValueKind Op1Info,
+                                        OperandValueKind Op2Info) const {
+
+  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  const unsigned FunctionCallDivCost = 20;
+  const unsigned ReciprocalDivCost = 10;
+  static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = {
+    // Division.
+    // These costs are somewhat random. Choose a cost of 20 to indicate that
+    // vectorizing devision (added function call) is going to be very expensive.
+    // Double registers types.
+    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
+    { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
+    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
+    { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
+    { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
+    // Quad register types.
+    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+    // Multiplication.
+  };
+
+  int Idx = -1;
+
+  if (ST->hasNEON())
+    Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second);
+
+  if (Idx != -1)
+    return LT.first * CostTbl[Idx].Cost;
+
+  unsigned Cost =
+      TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+
+  // This is somewhat of a hack. The problem that we are facing is that SROA
+  // creates a sequence of shift, and, or instructions to construct values.
+  // These sequences are recognized by the ISel and have zero-cost. Not so for
+  // the vectorized code. Because we have support for v2i64 but not i64 those
+  // sequences look particularly beneficial to vectorize.
+  // To work around this we increase the cost of v2i64 operations to make them
+  // seem less beneficial.
+  if (LT.second == MVT::v2i64 &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+    Cost += 4;
+
+  return Cost;
+}
+
+unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isDoubleTy()) {
+    // Unaligned loads/stores are extremely inefficient.
+    // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+    return LT.first * 4;
+  }
+  return LT.first;
+}
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 0000000..b62706c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,9461 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFPUName.h"
+#include "ARMFeatures.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMArchName.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class ARMOperand;
+
+enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
+
+class UnwindContext {
+  MCAsmParser &Parser;
+
+  typedef SmallVector<SMLoc, 4> Locs;
+
+  Locs FnStartLocs;
+  Locs CantUnwindLocs;
+  Locs PersonalityLocs;
+  Locs PersonalityIndexLocs;
+  Locs HandlerDataLocs;
+  int FPReg;
+
+public:
+  UnwindContext(MCAsmParser &P) : Parser(P), FPReg(ARM::SP) {}
+
+  bool hasFnStart() const { return !FnStartLocs.empty(); }
+  bool cantUnwind() const { return !CantUnwindLocs.empty(); }
+  bool hasHandlerData() const { return !HandlerDataLocs.empty(); }
+  bool hasPersonality() const {
+    return !(PersonalityLocs.empty() && PersonalityIndexLocs.empty());
+  }
+
+  void recordFnStart(SMLoc L) { FnStartLocs.push_back(L); }
+  void recordCantUnwind(SMLoc L) { CantUnwindLocs.push_back(L); }
+  void recordPersonality(SMLoc L) { PersonalityLocs.push_back(L); }
+  void recordHandlerData(SMLoc L) { HandlerDataLocs.push_back(L); }
+  void recordPersonalityIndex(SMLoc L) { PersonalityIndexLocs.push_back(L); }
+
+  void saveFPReg(int Reg) { FPReg = Reg; }
+  int getFPReg() const { return FPReg; }
+
+  void emitFnStartLocNotes() const {
+    for (Locs::const_iterator FI = FnStartLocs.begin(), FE = FnStartLocs.end();
+         FI != FE; ++FI)
+      Parser.Note(*FI, ".fnstart was specified here");
+  }
+  void emitCantUnwindLocNotes() const {
+    for (Locs::const_iterator UI = CantUnwindLocs.begin(),
+                              UE = CantUnwindLocs.end(); UI != UE; ++UI)
+      Parser.Note(*UI, ".cantunwind was specified here");
+  }
+  void emitHandlerDataLocNotes() const {
+    for (Locs::const_iterator HI = HandlerDataLocs.begin(),
+                              HE = HandlerDataLocs.end(); HI != HE; ++HI)
+      Parser.Note(*HI, ".handlerdata was specified here");
+  }
+  void emitPersonalityLocNotes() const {
+    for (Locs::const_iterator PI = PersonalityLocs.begin(),
+                              PE = PersonalityLocs.end(),
+                              PII = PersonalityIndexLocs.begin(),
+                              PIE = PersonalityIndexLocs.end();
+         PI != PE || PII != PIE;) {
+      if (PI != PE && (PII == PIE || PI->getPointer() < PII->getPointer()))
+        Parser.Note(*PI++, ".personality was specified here");
+      else if (PII != PIE && (PI == PE || PII->getPointer() < PI->getPointer()))
+        Parser.Note(*PII++, ".personalityindex was specified here");
+      else
+        llvm_unreachable(".personality and .personalityindex cannot be "
+                         "at the same location");
+    }
+  }
+
+  void reset() {
+    FnStartLocs = Locs();
+    CantUnwindLocs = Locs();
+    PersonalityLocs = Locs();
+    HandlerDataLocs = Locs();
+    PersonalityIndexLocs = Locs();
+    FPReg = ARM::SP;
+  }
+};
+
+class ARMAsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCInstrInfo &MII;
+  const MCRegisterInfo *MRI;
+  UnwindContext UC;
+
+  ARMTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<ARMTargetStreamer &>(TS);
+  }
+
+  // Map of register aliases registers via the .req directive.
+  StringMap<unsigned> RegisterReqs;
+
+  bool NextSymbolIsThumb;
+
+  struct {
+    ARMCC::CondCodes Cond;    // Condition for IT block.
+    unsigned Mask:4;          // Condition mask for instructions.
+                              // Starting at first 1 (from lsb).
+                              //   '1'  condition as indicated in IT.
+                              //   '0'  inverse of condition (else).
+                              // Count of instructions in IT block is
+                              // 4 - trailingzeroes(mask)
+
+    bool FirstCond;           // Explicit flag for when we're parsing the
+                              // First instruction in the IT block. It's
+                              // implied in the mask, so needs special
+                              // handling.
+
+    unsigned CurPosition;     // Current position in parsing of IT
+                              // block. In range [0,3]. Initialized
+                              // according to count of instructions in block.
+                              // ~0U if no active IT block.
+  } ITState;
+  bool inITBlock() { return ITState.CurPosition != ~0U;}
+  void forwardITPosition() {
+    if (!inITBlock()) return;
+    // Move to the next instruction in the IT block, if there is one. If not,
+    // mark the block as done.
+    unsigned TZ = countTrailingZeros(ITState.Mask);
+    if (++ITState.CurPosition == 5 - TZ)
+      ITState.CurPosition = ~0U; // Done with the IT block after this.
+  }
+
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  void Note(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None) {
+    return Parser.Note(L, Msg, Ranges);
+  }
+  bool Warning(SMLoc L, const Twine &Msg,
+               ArrayRef<SMRange> Ranges = None) {
+    return Parser.Warning(L, Msg, Ranges);
+  }
+  bool Error(SMLoc L, const Twine &Msg,
+             ArrayRef<SMRange> Ranges = None) {
+    return Parser.Error(L, Msg, Ranges);
+  }
+
+  int tryParseRegister();
+  bool tryParseRegisterWithWriteBack(OperandVector &);
+  int tryParseShiftRegister(OperandVector &);
+  bool parseRegisterList(OperandVector &);
+  bool parseMemory(OperandVector &);
+  bool parseOperand(OperandVector &, StringRef Mnemonic);
+  bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
+  bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
+                              unsigned &ShiftAmount);
+  bool parseLiteralValues(unsigned Size, SMLoc L);
+  bool parseDirectiveThumb(SMLoc L);
+  bool parseDirectiveARM(SMLoc L);
+  bool parseDirectiveThumbFunc(SMLoc L);
+  bool parseDirectiveCode(SMLoc L);
+  bool parseDirectiveSyntax(SMLoc L);
+  bool parseDirectiveReq(StringRef Name, SMLoc L);
+  bool parseDirectiveUnreq(SMLoc L);
+  bool parseDirectiveArch(SMLoc L);
+  bool parseDirectiveEabiAttr(SMLoc L);
+  bool parseDirectiveCPU(SMLoc L);
+  bool parseDirectiveFPU(SMLoc L);
+  bool parseDirectiveFnStart(SMLoc L);
+  bool parseDirectiveFnEnd(SMLoc L);
+  bool parseDirectiveCantUnwind(SMLoc L);
+  bool parseDirectivePersonality(SMLoc L);
+  bool parseDirectiveHandlerData(SMLoc L);
+  bool parseDirectiveSetFP(SMLoc L);
+  bool parseDirectivePad(SMLoc L);
+  bool parseDirectiveRegSave(SMLoc L, bool IsVector);
+  bool parseDirectiveInst(SMLoc L, char Suffix = '\0');
+  bool parseDirectiveLtorg(SMLoc L);
+  bool parseDirectiveEven(SMLoc L);
+  bool parseDirectivePersonalityIndex(SMLoc L);
+  bool parseDirectiveUnwindRaw(SMLoc L);
+  bool parseDirectiveTLSDescSeq(SMLoc L);
+  bool parseDirectiveMovSP(SMLoc L);
+  bool parseDirectiveObjectArch(SMLoc L);
+  bool parseDirectiveArchExtension(SMLoc L);
+  bool parseDirectiveAlign(SMLoc L);
+  bool parseDirectiveThumbSet(SMLoc L);
+
+  StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
+                          bool &CarrySetting, unsigned &ProcessorIMod,
+                          StringRef &ITMask);
+  void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                             bool &CanAcceptCarrySet,
+                             bool &CanAcceptPredicationCode);
+
+  bool isThumb() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+  }
+  bool isThumbOne() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) == 0;
+  }
+  bool isThumbTwo() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2);
+  }
+  bool hasThumb() const {
+    return STI.getFeatureBits() & ARM::HasV4TOps;
+  }
+  bool hasV6Ops() const {
+    return STI.getFeatureBits() & ARM::HasV6Ops;
+  }
+  bool hasV6MOps() const {
+    return STI.getFeatureBits() & ARM::HasV6MOps;
+  }
+  bool hasV7Ops() const {
+    return STI.getFeatureBits() & ARM::HasV7Ops;
+  }
+  bool hasV8Ops() const {
+    return STI.getFeatureBits() & ARM::HasV8Ops;
+  }
+  bool hasARM() const {
+    return !(STI.getFeatureBits() & ARM::FeatureNoARM);
+  }
+
+  void SwitchMode() {
+    unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
+    setAvailableFeatures(FB);
+  }
+  bool isMClass() const {
+    return STI.getFeatureBits() & ARM::FeatureMClass;
+  }
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "ARMGenAsmMatcher.inc"
+
+  /// }
+
+  OperandMatchResultTy parseITCondCode(OperandVector &);
+  OperandMatchResultTy parseCoprocNumOperand(OperandVector &);
+  OperandMatchResultTy parseCoprocRegOperand(OperandVector &);
+  OperandMatchResultTy parseCoprocOptionOperand(OperandVector &);
+  OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &);
+  OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
+  OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
+  OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
+  OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low,
+                                   int High);
+  OperandMatchResultTy parsePKHLSLImm(OperandVector &O) {
+    return parsePKHImm(O, "lsl", 0, 31);
+  }
+  OperandMatchResultTy parsePKHASRImm(OperandVector &O) {
+    return parsePKHImm(O, "asr", 1, 32);
+  }
+  OperandMatchResultTy parseSetEndImm(OperandVector &);
+  OperandMatchResultTy parseShifterImm(OperandVector &);
+  OperandMatchResultTy parseRotImm(OperandVector &);
+  OperandMatchResultTy parseBitfield(OperandVector &);
+  OperandMatchResultTy parsePostIdxReg(OperandVector &);
+  OperandMatchResultTy parseAM3Offset(OperandVector &);
+  OperandMatchResultTy parseFPImm(OperandVector &);
+  OperandMatchResultTy parseVectorList(OperandVector &);
+  OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index,
+                                       SMLoc &EndLoc);
+
+  // Asm Match Converter Methods
+  void cvtThumbMultiply(MCInst &Inst, const OperandVector &);
+  void cvtThumbBranches(MCInst &Inst, const OperandVector &);
+
+  bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+  bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
+  bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+
+public:
+  enum ARMMatchResultTy {
+    Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
+    Match_RequiresNotITBlock,
+    Match_RequiresV6,
+    Match_RequiresThumb2,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "ARMGenAsmMatcher.inc"
+
+  };
+
+  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &MII,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), UC(_Parser) {
+    MCAsmParserExtension::Initialize(_Parser);
+
+    // Cache the MCRegisterInfo.
+    MRI = getContext().getRegisterInfo();
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+    // Not in an ITBlock to start with.
+    ITState.CurPosition = ~0U;
+
+    NextSymbolIsThumb = false;
+  }
+
+  // Implementation of the MCTargetAsmParser interface:
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+  unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  void onLabelParsed(MCSymbol *Symbol) override;
+};
+} // end anonymous namespace
+
+namespace {
+
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// operand.
+class ARMOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    k_CondCode,
+    k_CCOut,
+    k_ITCondMask,
+    k_CoprocNum,
+    k_CoprocReg,
+    k_CoprocOption,
+    k_Immediate,
+    k_MemBarrierOpt,
+    k_InstSyncBarrierOpt,
+    k_Memory,
+    k_PostIndexRegister,
+    k_MSRMask,
+    k_ProcIFlags,
+    k_VectorIndex,
+    k_Register,
+    k_RegisterList,
+    k_DPRRegisterList,
+    k_SPRRegisterList,
+    k_VectorList,
+    k_VectorListAllLanes,
+    k_VectorListIndexed,
+    k_ShiftedRegister,
+    k_ShiftedImmediate,
+    k_ShifterImmediate,
+    k_RotateImmediate,
+    k_BitfieldDescriptor,
+    k_Token
+  } Kind;
+
+  SMLoc StartLoc, EndLoc, AlignmentLoc;
+  SmallVector<unsigned, 8> Registers;
+
+  struct CCOp {
+    ARMCC::CondCodes Val;
+  };
+
+  struct CopOp {
+    unsigned Val;
+  };
+
+  struct CoprocOptionOp {
+    unsigned Val;
+  };
+
+  struct ITMaskOp {
+    unsigned Mask:4;
+  };
+
+  struct MBOptOp {
+    ARM_MB::MemBOpt Val;
+  };
+
+  struct ISBOptOp {
+    ARM_ISB::InstSyncBOpt Val;
+  };
+
+  struct IFlagsOp {
+    ARM_PROC::IFlags Val;
+  };
+
+  struct MMaskOp {
+    unsigned Val;
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  // A vector register list is a sequential list of 1 to 4 registers.
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned LaneIndex;
+    bool isDoubleSpaced;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  /// Combined record for all forms of ARM address expressions.
+  struct MemoryOp {
+    unsigned BaseRegNum;
+    // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+    // was specified.
+    const MCConstantExpr *OffsetImm;  // Offset immediate value
+    unsigned OffsetRegNum;    // Offset register num, when OffsetImm == NULL
+    ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+    unsigned ShiftImm;        // shift for OffsetReg.
+    unsigned Alignment;       // 0 = no alignment specified
+    // n = alignment in bytes (2, 4, 8, 16, or 32)
+    unsigned isNegative : 1;  // Negated OffsetReg? (~'U' bit)
+  };
+
+  struct PostIdxRegOp {
+    unsigned RegNum;
+    bool isAdd;
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned ShiftImm;
+  };
+
+  struct ShifterImmOp {
+    bool isASR;
+    unsigned Imm;
+  };
+
+  struct RegShiftedRegOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftReg;
+    unsigned ShiftImm;
+  };
+
+  struct RegShiftedImmOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftImm;
+  };
+
+  struct RotImmOp {
+    unsigned Imm;
+  };
+
+  struct BitfieldOp {
+    unsigned LSB;
+    unsigned Width;
+  };
+
+  union {
+    struct CCOp CC;
+    struct CopOp Cop;
+    struct CoprocOptionOp CoprocOption;
+    struct MBOptOp MBOpt;
+    struct ISBOptOp ISBOpt;
+    struct ITMaskOp ITMask;
+    struct IFlagsOp IFlags;
+    struct MMaskOp MMask;
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct MemoryOp Memory;
+    struct PostIdxRegOp PostIdxReg;
+    struct ShifterImmOp ShifterImm;
+    struct RegShiftedRegOp RegShiftedReg;
+    struct RegShiftedImmOp RegShiftedImm;
+    struct RotImmOp RotImm;
+    struct BitfieldOp Bitfield;
+  };
+
+public:
+  ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+  ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case k_CondCode:
+      CC = o.CC;
+      break;
+    case k_ITCondMask:
+      ITMask = o.ITMask;
+      break;
+    case k_Token:
+      Tok = o.Tok;
+      break;
+    case k_CCOut:
+    case k_Register:
+      Reg = o.Reg;
+      break;
+    case k_RegisterList:
+    case k_DPRRegisterList:
+    case k_SPRRegisterList:
+      Registers = o.Registers;
+      break;
+    case k_VectorList:
+    case k_VectorListAllLanes:
+    case k_VectorListIndexed:
+      VectorList = o.VectorList;
+      break;
+    case k_CoprocNum:
+    case k_CoprocReg:
+      Cop = o.Cop;
+      break;
+    case k_CoprocOption:
+      CoprocOption = o.CoprocOption;
+      break;
+    case k_Immediate:
+      Imm = o.Imm;
+      break;
+    case k_MemBarrierOpt:
+      MBOpt = o.MBOpt;
+      break;
+    case k_InstSyncBarrierOpt:
+      ISBOpt = o.ISBOpt;
+    case k_Memory:
+      Memory = o.Memory;
+      break;
+    case k_PostIndexRegister:
+      PostIdxReg = o.PostIdxReg;
+      break;
+    case k_MSRMask:
+      MMask = o.MMask;
+      break;
+    case k_ProcIFlags:
+      IFlags = o.IFlags;
+      break;
+    case k_ShifterImmediate:
+      ShifterImm = o.ShifterImm;
+      break;
+    case k_ShiftedRegister:
+      RegShiftedReg = o.RegShiftedReg;
+      break;
+    case k_ShiftedImmediate:
+      RegShiftedImm = o.RegShiftedImm;
+      break;
+    case k_RotateImmediate:
+      RotImm = o.RotImm;
+      break;
+    case k_BitfieldDescriptor:
+      Bitfield = o.Bitfield;
+      break;
+    case k_VectorIndex:
+      VectorIndex = o.VectorIndex;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
+  /// getAlignmentLoc - Get the location of the Alignment token of this operand.
+  SMLoc getAlignmentLoc() const {
+    assert(Kind == k_Memory && "Invalid access!");
+    return AlignmentLoc;
+  }
+
+  ARMCC::CondCodes getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CC.Val;
+  }
+
+  unsigned getCoproc() const {
+    assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
+    return Cop.Val;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+            Kind == k_SPRRegisterList) && "Invalid access!");
+    return Registers;
+  }
+
+  const MCExpr *getImm() const {
+    assert(isImm() && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
+  ARM_MB::MemBOpt getMemBarrierOpt() const {
+    assert(Kind == k_MemBarrierOpt && "Invalid access!");
+    return MBOpt.Val;
+  }
+
+  ARM_ISB::InstSyncBOpt getInstSyncBarrierOpt() const {
+    assert(Kind == k_InstSyncBarrierOpt && "Invalid access!");
+    return ISBOpt.Val;
+  }
+
+  ARM_PROC::IFlags getProcIFlags() const {
+    assert(Kind == k_ProcIFlags && "Invalid access!");
+    return IFlags.Val;
+  }
+
+  unsigned getMSRMask() const {
+    assert(Kind == k_MSRMask && "Invalid access!");
+    return MMask.Val;
+  }
+
+  bool isCoprocNum() const { return Kind == k_CoprocNum; }
+  bool isCoprocReg() const { return Kind == k_CoprocReg; }
+  bool isCoprocOption() const { return Kind == k_CoprocOption; }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isCCOut() const { return Kind == k_CCOut; }
+  bool isITMask() const { return Kind == k_ITCondMask; }
+  bool isITCondCode() const { return Kind == k_CondCode; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  // checks whether this operand is an unsigned offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isUnsignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << width) - 1);
+      return ((Val % Align) == 0) && (Val >= 0) && (Val <= Max);
+    }
+    return false;
+  }
+  // checks whether this operand is an signed offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isSignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << (width-1)) - 1);
+      int64_t Min = -Align * (1LL << (width-1));
+      return ((Val % Align) == 0) && (Val >= Min) && (Val <= Max);
+    }
+    return false;
+  }
+
+  // checks whether this operand is a memory operand computed as an offset
+  // applied to PC. the offset may have 8 bits of magnitude and is represented
+  // with two bits of shift. textually it may be either [pc, #imm], #imm or 
+  // relocable expression...
+  bool isThumbMemPC() const {
+    int64_t Val = 0;
+    if (isImm()) {
+      if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+      if (!CE) return false;
+      Val = CE->getValue();
+    }
+    else if (isMem()) {
+      if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
+      if(Memory.BaseRegNum != ARM::PC) return false;
+      Val = Memory.OffsetImm->getValue();
+    }
+    else return false;
+    return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
+  }
+  bool isFPImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    return Val != -1;
+  }
+  bool isFBits16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 16;
+  }
+  bool isFBits32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 1 && Value <= 32;
+  }
+  bool isImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+  }
+  bool isImm0_1020s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+  }
+  bool isImm0_508s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+  }
+  bool isImm0_508s4Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    // explicitly exclude zero. we want that to use the normal 0_508 version.
+    return ((Value & 3) == 0) && Value > 0 && Value <= 508;
+  }
+  bool isImm0_239() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 240;
+  }
+  bool isImm0_255() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 256;
+  }
+  bool isImm0_4095() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 4096;
+  }
+  bool isImm0_4095Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    return Value > 0 && Value < 4096;
+  }
+  bool isImm0_1() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 2;
+  }
+  bool isImm0_3() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 4;
+  }
+  bool isImm0_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 8;
+  }
+  bool isImm0_15() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 16;
+  }
+  bool isImm0_31() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isImm0_63() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 64;
+  }
+  bool isImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 8;
+  }
+  bool isImm16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 16;
+  }
+  bool isImm32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 32;
+  }
+  bool isShrImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 8;
+  }
+  bool isShrImm16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 16;
+  }
+  bool isShrImm32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isShrImm64() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 64;
+  }
+  bool isImm1_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 8;
+  }
+  bool isImm1_15() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 16;
+  }
+  bool isImm1_31() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 32;
+  }
+  bool isImm1_16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 17;
+  }
+  bool isImm1_32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
+  bool isImm0_32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 33;
+  }
+  bool isImm0_65535() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 65536;
+  }
+  bool isImm256_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 256 && Value < 65536;
+  }
+  bool isImm0_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 65536;
+  }
+  bool isImm24bit() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 0xffffff;
+  }
+  bool isImmThumbSR() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
+  bool isPKHLSLImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isPKHASRImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isAdrLabel() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, but it can't fit 
+    // into shift immediate encoding, we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm())) return true;
+    else return (isARMSOImm() || isARMSOImmNeg());
+  }
+  bool isARMSOImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(Value) != -1;
+  }
+  bool isARMSOImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(~Value) != -1;
+  }
+  bool isARMSOImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // Only use this when not representable as a plain so_imm.
+    return ARM_AM::getSOImmVal(Value) == -1 &&
+      ARM_AM::getSOImmVal(-Value) != -1;
+  }
+  bool isT2SOImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(Value) != -1;
+  }
+  bool isT2SOImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(~Value) != -1;
+  }
+  bool isT2SOImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // Only use this when not representable as a plain so_imm.
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(-Value) != -1;
+  }
+  bool isSetEndImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 1 || Value == 0;
+  }
+  bool isReg() const override { return Kind == k_Register; }
+  bool isRegList() const { return Kind == k_RegisterList; }
+  bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
+  bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+  bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
+  bool isMem() const override { return Kind == k_Memory; }
+  bool isShifterImm() const { return Kind == k_ShifterImmediate; }
+  bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
+  bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+  bool isRotImm() const { return Kind == k_RotateImmediate; }
+  bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
+  bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+  bool isPostIdxReg() const {
+    return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
+  }
+  bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
+    if (!isMem())
+      return false;
+    // No offset of any kind.
+    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+     (alignOK || Memory.Alignment == Alignment);
+  }
+  bool isMemPCRelImm12() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base register must be PC.
+    if (Memory.BaseRegNum != ARM::PC)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isAlignedMemory() const {
+    return isMemNoOffset(true);
+  }
+  bool isAlignedMemoryNone() const {
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemoryNone() const {
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory16() const {
+    if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory16() const {
+    if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory32() const {
+    if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory32() const {
+    if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory64() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64or128() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory64or128() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64or128or256() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    if (isMemNoOffset(false, 32)) // alignment in bytes for 256-bits is 32.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAddrMode2() const {
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val > -4096 && Val < 4096;
+  }
+  bool isAM2OffsetImm() const {
+    if (!isImm()) return false;
+    // Immediate offset in range [-4095, 4095].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return (Val == INT32_MIN) || (Val > -4096 && Val < 4096);
+  }
+  bool isAddrMode3() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // No shifts are legal for AM3.
+    if (Memory.ShiftType != ARM_AM::no_shift) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // The #-0 offset is encoded as INT32_MIN, and we have to check 
+    // for this too.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAM3Offset() const {
+    if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+      return false;
+    if (Kind == k_PostIndexRegister)
+      return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+    // Immediate offset in range [-255, 255].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAddrMode5() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return false;
+    // Immediate offset in range [-1020, 1020] and a multiple of 4.
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+      Val == INT32_MIN;
+  }
+  bool isMemTBB() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isMemTBH() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
+        Memory.Alignment != 0 )
+      return false;
+    return true;
+  }
+  bool isMemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isT2MemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.Alignment != 0)
+      return false;
+    // Only lsl #{0, 1, 2, 3} allowed.
+    if (Memory.ShiftType == ARM_AM::no_shift)
+      return true;
+    if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3)
+      return false;
+    return true;
+  }
+  bool isMemThumbRR() const {
+    // Thumb reg+reg addressing is simple. Just two registers, a base and
+    // an offset. No shifts, negations or any other complicating factors.
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return isARMLowRegister(Memory.BaseRegNum) &&
+      (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
+  }
+  bool isMemThumbRIs4() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 124].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+  }
+  bool isMemThumbRIs2() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 62].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+  }
+  bool isMemThumbRIs1() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 31].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 31;
+  }
+  bool isMemThumbSPI() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+  }
+  bool isMemImm8s4Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [-1020, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN;
+  }
+  bool isMemImm0_1020s4Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+  }
+  bool isMemImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base reg of PC isn't allowed for these encodings.
+    if (Memory.BaseRegNum == ARM::PC) return false;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 256);
+  }
+  bool isMemPosImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val < 256;
+  }
+  bool isMemNegImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base reg of PC isn't allowed for these encodings.
+    if (Memory.BaseRegNum == ARM::PC) return false;
+    // Immediate offset in range [-255, -1].
+    if (!Memory.OffsetImm) return false;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 0);
+  }
+  bool isMemUImm12Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= 0 && Val < 4096);
+  }
+  bool isMemImm12Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return (Val > -256 && Val < 256) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) ||
+      (Val == INT32_MIN);
+  }
+
+  bool isMSRMask() const { return Kind == k_MSRMask; }
+  bool isProcIFlags() const { return Kind == k_ProcIFlags; }
+
+  // NEON operands.
+  bool isSingleSpacedVectorList() const {
+    return Kind == k_VectorList && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorList() const {
+    return Kind == k_VectorList && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 1;
+  }
+
+  bool isVecListDPair() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListThreeD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isVecListDPairSpaced() const {
+    if (Kind != k_VectorList) return false;
+    if (isSingleSpacedVectorList()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairSpcRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListThreeQ() const {
+    if (!isDoubleSpacedVectorList()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourQ() const {
+    if (!isDoubleSpacedVectorList()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isSingleSpacedVectorAllLanes() const {
+    return Kind == k_VectorListAllLanes && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorAllLanes() const {
+    return Kind == k_VectorListAllLanes && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 1;
+  }
+
+  bool isVecListDPairAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListDPairSpacedAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 2;
+  }
+
+  bool isVecListThreeDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListThreeQAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isVecListFourQAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isSingleSpacedVectorIndexed() const {
+    return Kind == k_VectorListIndexed && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorIndexed() const {
+    return Kind == k_VectorListIndexed && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListOneDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListOneDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListTwoDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListTwoDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListTwoQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListTwoQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListTwoDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListThreeDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListThreeDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListThreeQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListThreeQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListThreeDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListFourDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListFourDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListFourQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListFourQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListFourDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVectorIndex8() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 8;
+  }
+  bool isVectorIndex16() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 4;
+  }
+  bool isVectorIndex32() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 2;
+  }
+
+  bool isNEONi8splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i8 value splatted across 8 bytes. The immediate is just the 8 byte
+    // value.
+    return Value >= 0 && Value < 256;
+  }
+
+  bool isNEONi16splat() const {
+    if (isNEONByteReplicate(2))
+      return false; // Leave that for bytes replication and forbid by default.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i16 value in the range [0,255] or [0x0100, 0xff00]
+    return (Value >= 0 && Value < 256) || (Value >= 0x0100 && Value <= 0xff00);
+  }
+
+  bool isNEONi32splat() const {
+    if (isNEONByteReplicate(4))
+      return false; // Leave that for bytes replication and forbid by default.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000);
+  }
+
+  bool isNEONByteReplicate(unsigned NumBytes) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    if (!Value)
+      return false; // Don't bother with zero.
+
+    unsigned char B = Value & 0xff;
+    for (unsigned i = 1; i < NumBytes; ++i) {
+      Value >>= 8;
+      if ((Value & 0xff) != B)
+        return false;
+    }
+    return true;
+  }
+  bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
+  bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
+  bool isNEONi32vmov() const {
+    if (isNEONByteReplicate(4))
+      return false; // Let it to be classified as byte-replicate case.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000) ||
+      (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+      (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+  bool isNEONi32vmovNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = ~CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000) ||
+      (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+      (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+
+  bool isNEONi64splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+    // i64 value with each byte being either 0 or 0xff.
+    for (unsigned i = 0; i < 8; ++i)
+      if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) return false;
+    return true;
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
+    unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR;
+    Inst.addOperand(MCOperand::CreateReg(RegNum));
+  }
+
+  void addCoprocNumOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCoproc()));
+  }
+
+  void addCoprocRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCoproc()));
+  }
+
+  void addCoprocOptionOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(CoprocOption.Val));
+  }
+
+  void addITMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(ITMask.Mask));
+  }
+
+  void addITCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
+  }
+
+  void addCCOutOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    assert(isRegShiftedReg() &&
+           "addRegShiftedRegOperands() on non-RegShiftedReg!");
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.SrcReg));
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.ShiftReg));
+    Inst.addOperand(MCOperand::CreateImm(
+      ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm)));
+  }
+
+  void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    assert(isRegShiftedImm() &&
+           "addRegShiftedImmOperands() on non-RegShiftedImm!");
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg));
+    // Shift of #32 is encoded as 0 where permitted
+    unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 0 : RegShiftedImm.ShiftImm);
+    Inst.addOperand(MCOperand::CreateImm(
+      ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm)));
+  }
+
+  void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm((ShifterImm.isASR << 5) |
+                                         ShifterImm.Imm));
+  }
+
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ++I)
+      Inst.addOperand(MCOperand::CreateReg(*I));
+  }
+
+  void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addSPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addRotImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Encoded as val>>3. The printer handles display as 8, 16, 24.
+    Inst.addOperand(MCOperand::CreateImm(RotImm.Imm >> 3));
+  }
+
+  void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Munge the lsb/width into a bitfield mask.
+    unsigned lsb = Bitfield.LSB;
+    unsigned width = Bitfield.Width;
+    // Make a 32-bit mask w/ the referenced bits clear and all other bits set.
+    uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >>
+                      (32 - (lsb + width)));
+    Inst.addOperand(MCOperand::CreateImm(Mask));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addFBits16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(16 - CE->getValue()));
+  }
+
+  void addFBits32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(32 - CE->getValue()));
+  }
+
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // FIXME: We really want to scale the value here, but the LDRD/STRD
+    // instruction don't encode operands that way yet.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  }
+
+  void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+  }
+
+  void addImm0_508s4NegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-(CE->getValue() / 4)));
+  }
+
+  void addImm0_508s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+  }
+
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+  }
+
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+  }
+
+  void addImmThumbSROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate, except for 32, which encodes as
+    // zero.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Imm = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((Imm == 32 ? 0 : Imm)));
+  }
+
+  void addPKHASRImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // An ASR value of 32 encodes as 0, so that's how we want to add it to
+    // the instruction as well.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Val == 32 ? 0 : Val));
+  }
+
+  void addT2SOImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a t2_so_imm, but we have its bitwise
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(~CE->getValue()));
+  }
+
+  void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a t2_so_imm, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
+  }
+
+  void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually an imm0_4095, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
+  }
+
+  void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
+    if(const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue() >> 2));
+      return;
+    }
+
+    const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+    assert(SR && "Unknown value type!");
+    Inst.addOperand(MCOperand::CreateExpr(SR));
+  }
+
+  void addThumbMemPCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (isImm()) {
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+      if (CE) {
+        Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+        return;
+      }
+
+      const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+      assert(SR && "Unknown value type!");
+      Inst.addOperand(MCOperand::CreateExpr(SR));
+      return;
+    }
+
+    assert(isMem()  && "Unknown value type!");
+    assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
+    Inst.addOperand(MCOperand::CreateImm(Memory.OffsetImm->getValue()));
+  }
+
+  void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a so_imm, but we have its bitwise
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(~CE->getValue()));
+  }
+
+  void addARMSOImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a so_imm, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
+  }
+
+  void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
+  }
+
+  void addInstSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getInstSyncBarrierOpt())));
+  }
+
+  void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+  }
+
+  void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    int32_t Imm = Memory.OffsetImm->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(isImm() && "Not an immediate!");
+
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. 
+    if (!isa<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      return;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Memory.Alignment));
+  }
+
+  void addDupAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64or128or256Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                              Memory.ShiftImm, Memory.ShiftType);
+    }
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant AM2OffsetImm operand!");
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAddrMode3Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      Inst.addOperand(MCOperand::CreateReg(0));
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM3Opc(AddSub, Val);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+    }
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (Kind == k_PostIndexRegister) {
+      int32_t Val =
+        ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0);
+      Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+      Inst.addOperand(MCOperand::CreateImm(Val));
+      return;
+    }
+
+    // Constant offset.
+    const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm());
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM3Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAddrMode5Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM5Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
+
+  void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
+
+  void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (isImm()) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (isImm()) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemTBBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
+
+  void addMemTBHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
+
+  void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    unsigned Val =
+      ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                        Memory.ShiftImm, Memory.ShiftType);
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Memory.ShiftImm));
+  }
+
+  void addMemThumbRROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
+
+  void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8s4 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    // Immediate is scaled by 4.
+    Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addPostIdxRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+    Inst.addOperand(MCOperand::CreateImm(PostIdxReg.isAdd));
+  }
+
+  void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+    // The sign, shift type, and shift amount are encoded in a single operand
+    // using the AM2 encoding helpers.
+    ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub;
+    unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm,
+                                     PostIdxReg.ShiftTy);
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMSRMask())));
+  }
+
+  void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags())));
+  }
+
+  void addVecListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+  }
+
+  void addVecListIndexedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+    Inst.addOperand(MCOperand::CreateImm(VectorList.LaneIndex));
+  }
+
+  void addVectorIndex8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndex16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndex32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addNEONi8splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    // Mask in that this is an i8 splat.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() | 0xe00));
+  }
+
+  void addNEONi16splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256)
+      Value = (Value >> 8) | 0xa00;
+    else
+      Value |= 0x800;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi32splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256 && Value <= 0xff00)
+      Value = (Value >> 8) | 0x200;
+    else if (Value > 0xffff && Value <= 0xff0000)
+      Value = (Value >> 16) | 0x400;
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All vmvn instructions that wants to replicate non-zero byte "
+           "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = ((~Value) & 0xff);
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::CreateImm(B));
+  }
+  void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All instructions that wants to replicate non-zero byte "
+           "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = Value & 0xff;
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::CreateImm(B));
+  }
+  void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = ~CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    unsigned Imm = 0;
+    for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
+      Imm |= (Value & 1) << i;
+    }
+    Inst.addOperand(MCOperand::CreateImm(Imm | 0x1e00));
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_ITCondMask);
+    Op->ITMask.Mask = Mask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC,
+                                                    SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CondCode);
+    Op->CC.Val = CC;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CoprocNum);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CoprocReg);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S,
+                                                        SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_CoprocOption);
+    Op->Cop.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CCOut);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S,
+                                               SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+                        unsigned ShiftReg, unsigned ShiftImm, SMLoc S,
+                        SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ShiftedRegister);
+    Op->RegShiftedReg.ShiftTy = ShTy;
+    Op->RegShiftedReg.SrcReg = SrcReg;
+    Op->RegShiftedReg.ShiftReg = ShiftReg;
+    Op->RegShiftedReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+                         unsigned ShiftImm, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ShiftedImmediate);
+    Op->RegShiftedImm.ShiftTy = ShTy;
+    Op->RegShiftedImm.SrcReg = SrcReg;
+    Op->RegShiftedImm.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm,
+                                                      SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ShifterImmediate);
+    Op->ShifterImm.isASR = isASR;
+    Op->ShifterImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S,
+                                                  SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_RotateImmediate);
+    Op->RotImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
+    Op->Bitfield.LSB = LSB;
+    Op->Bitfield.Width = Width;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
+                SMLoc StartLoc, SMLoc EndLoc) {
+    assert (Regs.size() > 0 && "RegList contains no registers?");
+    KindTy Kind = k_RegisterList;
+
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second))
+      Kind = k_DPRRegisterList;
+    else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
+             contains(Regs.front().second))
+      Kind = k_SPRRegisterList;
+
+    // Sort based on the register encoding values.
+    array_pod_sort(Regs.begin(), Regs.end());
+
+    auto Op = make_unique<ARMOperand>(Kind);
+    for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator
+           I = Regs.begin(), E = Regs.end(); I != E; ++I)
+      Op->Registers.push_back(I->second);
+    Op->StartLoc = StartLoc;
+    Op->EndLoc = EndLoc;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateVectorList(unsigned RegNum,
+                                                      unsigned Count,
+                                                      bool isDoubleSpaced,
+                                                      SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_VectorList);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced,
+                           SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_VectorListAllLanes);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index,
+                          bool isDoubleSpaced, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_VectorListIndexed);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.LaneIndex = Index;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<ARMOperand>(k_VectorIndex);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                               SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm,
+            unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
+            unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
+            SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
+    auto Op = make_unique<ARMOperand>(k_Memory);
+    Op->Memory.BaseRegNum = BaseRegNum;
+    Op->Memory.OffsetImm = OffsetImm;
+    Op->Memory.OffsetRegNum = OffsetRegNum;
+    Op->Memory.ShiftType = ShiftType;
+    Op->Memory.ShiftImm = ShiftImm;
+    Op->Memory.Alignment = Alignment;
+    Op->Memory.isNegative = isNegative;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->AlignmentLoc = AlignmentLoc;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy,
+                   unsigned ShiftImm, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_PostIndexRegister);
+    Op->PostIdxReg.RegNum = RegNum;
+    Op->PostIdxReg.isAdd = isAdd;
+    Op->PostIdxReg.ShiftTy = ShiftTy;
+    Op->PostIdxReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt,
+                                                         SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_MemBarrierOpt);
+    Op->MBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt);
+    Op->ISBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
+                                                      SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_ProcIFlags);
+    Op->IFlags.Val = IFlags;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_MSRMask);
+    Op->MMask.Val = MMask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+};
+
+} // end anonymous namespace.
+
+void ARMOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_CondCode:
+    OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
+    break;
+  case k_CCOut:
+    OS << "<ccout " << getReg() << ">";
+    break;
+  case k_ITCondMask: {
+    static const char *const MaskStr[] = {
+      "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
+      "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
+    };
+    assert((ITMask.Mask & 0xf) == ITMask.Mask);
+    OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
+    break;
+  }
+  case k_CoprocNum:
+    OS << "<coprocessor number: " << getCoproc() << ">";
+    break;
+  case k_CoprocReg:
+    OS << "<coprocessor register: " << getCoproc() << ">";
+    break;
+  case k_CoprocOption:
+    OS << "<coprocessor option: " << CoprocOption.Val << ">";
+    break;
+  case k_MSRMask:
+    OS << "<mask: " << getMSRMask() << ">";
+    break;
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_MemBarrierOpt:
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">";
+    break;
+  case k_InstSyncBarrierOpt:
+    OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
+    break;
+  case k_Memory:
+    OS << "<memory "
+       << " base:" << Memory.BaseRegNum;
+    OS << ">";
+    break;
+  case k_PostIndexRegister:
+    OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
+       << PostIdxReg.RegNum;
+    if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
+      OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
+         << PostIdxReg.ShiftImm;
+    OS << ">";
+    break;
+  case k_ProcIFlags: {
+    OS << "<ARM_PROC::";
+    unsigned IFlags = getProcIFlags();
+    for (int i=2; i >= 0; --i)
+      if (IFlags & (1 << i))
+        OS << ARM_PROC::IFlagsToString(1 << i);
+    OS << ">";
+    break;
+  }
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_ShifterImmediate:
+    OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
+       << " #" << ShifterImm.Imm << ">";
+    break;
+  case k_ShiftedRegister:
+    OS << "<so_reg_reg "
+       << RegShiftedReg.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy)
+       << " " << RegShiftedReg.ShiftReg << ">";
+    break;
+  case k_ShiftedImmediate:
+    OS << "<so_reg_imm "
+       << RegShiftedImm.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy)
+       << " #" << RegShiftedImm.ShiftImm << ">";
+    break;
+  case k_RotateImmediate:
+    OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
+    break;
+  case k_BitfieldDescriptor:
+    OS << "<bitfield " << "lsb: " << Bitfield.LSB
+       << ", width: " << Bitfield.Width << ">";
+    break;
+  case k_RegisterList:
+  case k_DPRRegisterList:
+  case k_SPRRegisterList: {
+    OS << "<register_list ";
+
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ) {
+      OS << *I;
+      if (++I < E) OS << ", ";
+    }
+
+    OS << ">";
+    break;
+  }
+  case k_VectorList:
+    OS << "<vector_list " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListAllLanes:
+    OS << "<vector_list(all lanes) " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListIndexed:
+    OS << "<vector_list(lane " << VectorList.LaneIndex << ") "
+       << VectorList.Count << " * " << VectorList.RegNum << ">";
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+bool ARMAsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  StartLoc = Parser.getTok().getLoc();
+  EndLoc = Parser.getTok().getEndLoc();
+  RegNo = tryParseRegister();
+
+  return (RegNo == (unsigned)-1);
+}
+
+/// Try to parse a register name.  The token must be an Identifier when called,
+/// and if it is a register name the token is eaten and the register number is
+/// returned.  Otherwise return -1.
+///
+int ARMAsmParser::tryParseRegister() {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) return -1;
+
+  std::string lowerCase = Tok.getString().lower();
+  unsigned RegNum = MatchRegisterName(lowerCase);
+  if (!RegNum) {
+    RegNum = StringSwitch<unsigned>(lowerCase)
+      .Case("r13", ARM::SP)
+      .Case("r14", ARM::LR)
+      .Case("r15", ARM::PC)
+      .Case("ip", ARM::R12)
+      // Additional register name aliases for 'gas' compatibility.
+      .Case("a1", ARM::R0)
+      .Case("a2", ARM::R1)
+      .Case("a3", ARM::R2)
+      .Case("a4", ARM::R3)
+      .Case("v1", ARM::R4)
+      .Case("v2", ARM::R5)
+      .Case("v3", ARM::R6)
+      .Case("v4", ARM::R7)
+      .Case("v5", ARM::R8)
+      .Case("v6", ARM::R9)
+      .Case("v7", ARM::R10)
+      .Case("v8", ARM::R11)
+      .Case("sb", ARM::R9)
+      .Case("sl", ARM::R10)
+      .Case("fp", ARM::R11)
+      .Default(0);
+  }
+  if (!RegNum) {
+    // Check for aliases registered via .req. Canonicalize to lower case.
+    // That's more consistent since register names are case insensitive, and
+    // it's how the original entry was passed in from MC/MCParser/AsmParser.
+    StringMap<unsigned>::const_iterator Entry = RegisterReqs.find(lowerCase);
+    // If no match, return failure.
+    if (Entry == RegisterReqs.end())
+      return -1;
+    Parser.Lex(); // Eat identifier token.
+    return Entry->getValue();
+  }
+
+  Parser.Lex(); // Eat identifier token.
+
+  return RegNum;
+}
+
+// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
+// If a recoverable error occurs, return 1. If an irrecoverable error
+// occurs, return -1. An irrecoverable error is one where tokens have been
+// consumed in the process of trying to parse the shifter (i.e., when it is
+// indeed a shifter operand, but malformed).
+int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return -1; 
+
+  std::string lowerCase = Tok.getString().lower();
+  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+      .Case("asl", ARM_AM::lsl)
+      .Case("lsl", ARM_AM::lsl)
+      .Case("lsr", ARM_AM::lsr)
+      .Case("asr", ARM_AM::asr)
+      .Case("ror", ARM_AM::ror)
+      .Case("rrx", ARM_AM::rrx)
+      .Default(ARM_AM::no_shift);
+
+  if (ShiftTy == ARM_AM::no_shift)
+    return 1;
+
+  Parser.Lex(); // Eat the operator.
+
+  // The source register for the shift has already been added to the
+  // operand list, so we need to pop it off and combine it into the shifted
+  // register operand instead.
+  std::unique_ptr<ARMOperand> PrevOp(
+      (ARMOperand *)Operands.pop_back_val().release());
+  if (!PrevOp->isReg())
+    return Error(PrevOp->getStartLoc(), "shift must be of a register");
+  int SrcReg = PrevOp->getReg();
+
+  SMLoc EndLoc;
+  int64_t Imm = 0;
+  int ShiftReg = 0;
+  if (ShiftTy == ARM_AM::rrx) {
+    // RRX Doesn't have an explicit shift amount. The encoder expects
+    // the shift register to be the same as the source register. Seems odd,
+    // but OK.
+    ShiftReg = SrcReg;
+  } else {
+    // Figure out if this is shifted by a constant or a register (for non-RRX).
+    if (Parser.getTok().is(AsmToken::Hash) ||
+        Parser.getTok().is(AsmToken::Dollar)) {
+      Parser.Lex(); // Eat hash.
+      SMLoc ImmLoc = Parser.getTok().getLoc();
+      const MCExpr *ShiftExpr = nullptr;
+      if (getParser().parseExpression(ShiftExpr, EndLoc)) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // The expression must be evaluatable as an immediate.
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftExpr);
+      if (!CE) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // Range check the immediate.
+      // lsl, ror: 0 <= imm <= 31
+      // lsr, asr: 0 <= imm <= 32
+      Imm = CE->getValue();
+      if (Imm < 0 ||
+          ((ShiftTy == ARM_AM::lsl || ShiftTy == ARM_AM::ror) && Imm > 31) ||
+          ((ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr) && Imm > 32)) {
+        Error(ImmLoc, "immediate shift value out of range");
+        return -1;
+      }
+      // shift by zero is a nop. Always send it through as lsl.
+      // ('as' compatibility)
+      if (Imm == 0)
+        ShiftTy = ARM_AM::lsl;
+    } else if (Parser.getTok().is(AsmToken::Identifier)) {
+      SMLoc L = Parser.getTok().getLoc();
+      EndLoc = Parser.getTok().getEndLoc();
+      ShiftReg = tryParseRegister();
+      if (ShiftReg == -1) {
+        Error(L, "expected immediate or register in shift operand");
+        return -1;
+      }
+    } else {
+      Error(Parser.getTok().getLoc(),
+            "expected immediate or register in shift operand");
+      return -1;
+    }
+  }
+
+  if (ShiftReg && ShiftTy != ARM_AM::rrx)
+    Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+                                                         ShiftReg, Imm,
+                                                         S, EndLoc));
+  else
+    Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
+                                                          S, EndLoc));
+
+  return 0;
+}
+
+
+/// Try to parse a register name.  The token must be an Identifier when called.
+/// If it's a register, an AsmOperand is created. Another AsmOperand is created
+/// if there is a "writeback". 'true' if it's not a register.
+///
+/// TODO this is likely to change to allow different register types and or to
+/// parse for a specific register type.
+bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) {
+  const AsmToken &RegTok = Parser.getTok();
+  int RegNo = tryParseRegister();
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(ARMOperand::CreateReg(RegNo, RegTok.getLoc(),
+                                           RegTok.getEndLoc()));
+
+  const AsmToken &ExclaimTok = Parser.getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
+                                               ExclaimTok.getLoc()));
+    Parser.Lex(); // Eat exclaim token
+    return false;
+  }
+
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return true;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE)
+      return TokError("immediate value expected for vector index");
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+
+    SMLoc E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
+  }
+
+  return false;
+}
+
+/// MatchCoprocessorOperandName - Try to parse an coprocessor related
+/// instruction with a symbolic operand name.
+/// We accept "crN" syntax for GAS compatibility.
+/// <operand-name> ::= <prefix><number>
+/// If CoprocOp is 'c', then:
+///   <prefix> ::= c | cr
+/// If CoprocOp is 'p', then :
+///   <prefix> ::= p
+/// <number> ::= integer in range [0, 15]
+static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
+  // Use the same layout as the tablegen'erated register name matcher. Ugly,
+  // but efficient.
+  if (Name.size() < 2 || Name[0] != CoprocOp)
+    return -1;
+  Name = (Name[1] == 'r') ? Name.drop_front(2) : Name.drop_front();
+
+  switch (Name.size()) {
+  default: return -1;
+  case 1:
+    switch (Name[0]) {
+    default:  return -1;
+    case '0': return 0;
+    case '1': return 1;
+    case '2': return 2;
+    case '3': return 3;
+    case '4': return 4;
+    case '5': return 5;
+    case '6': return 6;
+    case '7': return 7;
+    case '8': return 8;
+    case '9': return 9;
+    }
+  case 2:
+    if (Name[0] != '1')
+      return -1;
+    switch (Name[1]) {
+    default:  return -1;
+    // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON)
+    case '0': return CoprocOp == 'p'? -1: 10;
+    case '1': return CoprocOp == 'p'? -1: 11;
+    case '2': return 12;
+    case '3': return 13;
+    case '4': return 14;
+    case '5': return 15;
+    }
+  }
+}
+
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseITCondCode(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  unsigned CC = StringSwitch<unsigned>(Tok.getString().lower())
+    .Case("eq", ARMCC::EQ)
+    .Case("ne", ARMCC::NE)
+    .Case("hs", ARMCC::HS)
+    .Case("cs", ARMCC::HS)
+    .Case("lo", ARMCC::LO)
+    .Case("cc", ARMCC::LO)
+    .Case("mi", ARMCC::MI)
+    .Case("pl", ARMCC::PL)
+    .Case("vs", ARMCC::VS)
+    .Case("vc", ARMCC::VC)
+    .Case("hi", ARMCC::HI)
+    .Case("ls", ARMCC::LS)
+    .Case("ge", ARMCC::GE)
+    .Case("lt", ARMCC::LT)
+    .Case("gt", ARMCC::GT)
+    .Case("le", ARMCC::LE)
+    .Case("al", ARMCC::AL)
+    .Default(~0U);
+  if (CC == ~0U)
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the token.
+
+  Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S));
+
+  return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
+  if (Num == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocNum(Num, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+  if (Reg == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocOptionOperand - Try to parse an coprocessor option operand.
+/// coproc_option : '{' imm0_255 '}'
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+
+  // If this isn't a '{', this isn't a coprocessor immediate operand.
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '{'
+
+  const MCExpr *Expr;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+  if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+    Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+
+  // Check for and consume the closing '}'
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return MatchOperand_ParseFail;
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the '}'
+
+  Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+  return MatchOperand_Success;
+}
+
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+static unsigned getNextRegister(unsigned Reg) {
+  // If this is a GPR, we need to do it manually, otherwise we can rely
+  // on the sort ordering of the enumeration since the other reg-classes
+  // are sane.
+  if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    return Reg + 1;
+  switch(Reg) {
+  default: llvm_unreachable("Invalid GPR number!");
+  case ARM::R0:  return ARM::R1;  case ARM::R1:  return ARM::R2;
+  case ARM::R2:  return ARM::R3;  case ARM::R3:  return ARM::R4;
+  case ARM::R4:  return ARM::R5;  case ARM::R5:  return ARM::R6;
+  case ARM::R6:  return ARM::R7;  case ARM::R7:  return ARM::R8;
+  case ARM::R8:  return ARM::R9;  case ARM::R9:  return ARM::R10;
+  case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12;
+  case ARM::R12: return ARM::SP;  case ARM::SP:  return ARM::LR;
+  case ARM::LR:  return ARM::PC;  case ARM::PC:  return ARM::R0;
+  }
+}
+
+// Return the low-subreg of a given Q register.
+static unsigned getDRegFromQReg(unsigned QReg) {
+  switch (QReg) {
+  default: llvm_unreachable("expected a Q register!");
+  case ARM::Q0:  return ARM::D0;
+  case ARM::Q1:  return ARM::D2;
+  case ARM::Q2:  return ARM::D4;
+  case ARM::Q3:  return ARM::D6;
+  case ARM::Q4:  return ARM::D8;
+  case ARM::Q5:  return ARM::D10;
+  case ARM::Q6:  return ARM::D12;
+  case ARM::Q7:  return ARM::D14;
+  case ARM::Q8:  return ARM::D16;
+  case ARM::Q9:  return ARM::D18;
+  case ARM::Q10: return ARM::D20;
+  case ARM::Q11: return ARM::D22;
+  case ARM::Q12: return ARM::D24;
+  case ARM::Q13: return ARM::D26;
+  case ARM::Q14: return ARM::D28;
+  case ARM::Q15: return ARM::D30;
+  }
+}
+
+/// Parse a register list.
+bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  // Check the first register in the list to see what register class
+  // this is a list of.
+  int Reg = tryParseRegister();
+  if (Reg == -1)
+    return Error(RegLoc, "register expected");
+
+  // The reglist instructions have at most 16 registers, so reserve
+  // space for that many.
+  int EReg = 0;
+  SmallVector<std::pair<unsigned, unsigned>, 16> Registers;
+
+  // Allow Q regs and just interpret them as the two D sub-registers.
+  if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+    Reg = getDRegFromQReg(Reg);
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    ++Reg;
+  }
+  const MCRegisterClass *RC;
+  if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::GPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+  else
+    return Error(RegLoc, "invalid register in register list");
+
+  // Store the register.
+  EReg = MRI->getEncodingValue(Reg);
+  Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+
+  // This starts immediately after the first register token in the list,
+  // so we can see either a comma or a minus (range separator) as a legal
+  // next token.
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      Parser.Lex(); // Eat the minus.
+      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1)
+        return Error(AfterMinusLoc, "register expected");
+      // Allow Q regs and just interpret them as the two D sub-registers.
+      if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+        EndReg = getDRegFromQReg(EndReg) + 1;
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!RC->contains(EndReg))
+        return Error(AfterMinusLoc, "invalid register in register list");
+      // Ranges must go from low to high.
+      if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg))
+        return Error(AfterMinusLoc, "bad range in register list");
+
+      // Add all the registers in the range to the register list.
+      while (Reg != EndReg) {
+        Reg = getNextRegister(Reg);
+        EReg = MRI->getEncodingValue(Reg);
+        Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+      }
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    const AsmToken RegTok = Parser.getTok();
+    Reg = tryParseRegister();
+    if (Reg == -1)
+      return Error(RegLoc, "register expected");
+    // Allow Q regs and just interpret them as the two D sub-registers.
+    bool isQReg = false;
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      Reg = getDRegFromQReg(Reg);
+      isQReg = true;
+    }
+    // The register must be in the same register class as the first.
+    if (!RC->contains(Reg))
+      return Error(RegLoc, "invalid register in register list");
+    // List must be monotonically increasing.
+    if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
+      if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+        Warning(RegLoc, "register list not in ascending order");
+      else
+        return Error(RegLoc, "register list not in ascending order");
+    }
+    if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
+      Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+              ") in register list");
+      continue;
+    }
+    // VFP register lists must also be contiguous.
+    if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+        Reg != OldReg + 1)
+      return Error(RegLoc, "non-contiguous register range");
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    if (isQReg) {
+      EReg = MRI->getEncodingValue(++Reg);
+      Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(Parser.getTok().getLoc(), "'}' expected");
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  // Push the register list operand.
+  Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
+
+  // The ARM system instruction variants for LDM/STM have a '^' token here.
+  if (Parser.getTok().is(AsmToken::Caret)) {
+    Operands.push_back(ARMOperand::CreateToken("^",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat '^' token.
+  }
+
+  return false;
+}
+
+// Helper function to parse the lane index for vector lists.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
+  Index = 0; // Always return a defined index value.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    Parser.Lex(); // Eat the '['.
+    if (Parser.getTok().is(AsmToken::RBrac)) {
+      // "Dn[]" is the 'all lanes' syntax.
+      LaneKind = AllLanes;
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat the ']'.
+      return MatchOperand_Success;
+    }
+
+    // There's an optional '#' token here. Normally there wouldn't be, but
+    // inline assemble puts one in, and it's friendly to accept that.
+    if (Parser.getTok().is(AsmToken::Hash))
+      Parser.Lex(); // Eat '#' or '$'.
+
+    const MCExpr *LaneIndex;
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (getParser().parseExpression(LaneIndex)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LaneIndex);
+    if (!CE) {
+      Error(Loc, "lane index must be empty or an integer");
+      return MatchOperand_ParseFail;
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "']' expected");
+      return MatchOperand_ParseFail;
+    }
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat the ']'.
+    int64_t Val = CE->getValue();
+
+    // FIXME: Make this range check context sensitive for .8, .16, .32.
+    if (Val < 0 || Val > 7) {
+      Error(Parser.getTok().getLoc(), "lane index out of range");
+      return MatchOperand_ParseFail;
+    }
+    Index = Val;
+    LaneKind = IndexedLane;
+    return MatchOperand_Success;
+  }
+  LaneKind = NoLanes;
+  return MatchOperand_Success;
+}
+
+// parse a vector register list
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseVectorList(OperandVector &Operands) {
+  VectorLaneTy LaneKind;
+  unsigned LaneIndex;
+  SMLoc S = Parser.getTok().getLoc();
+  // As an extension (to match gas), support a plain D register or Q register
+  // (without encosing curly braces) as a single or double entry list,
+  // respectively.
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    SMLoc E = Parser.getTok().getEndLoc();
+    int Reg = tryParseRegister();
+    if (Reg == -1)
+      return MatchOperand_NoMatch;
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+      OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+      if (Res != MatchOperand_Success)
+        return Res;
+      switch (LaneKind) {
+      case NoLanes:
+        Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E));
+        break;
+      case AllLanes:
+        Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false,
+                                                                S, E));
+        break;
+      case IndexedLane:
+        Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 1,
+                                                               LaneIndex,
+                                                               false, S, E));
+        break;
+      }
+      return MatchOperand_Success;
+    }
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      Reg = getDRegFromQReg(Reg);
+      OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+      if (Res != MatchOperand_Success)
+        return Res;
+      switch (LaneKind) {
+      case NoLanes:
+        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+        Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E));
+        break;
+      case AllLanes:
+        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+        Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, false,
+                                                                S, E));
+        break;
+      case IndexedLane:
+        Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 2,
+                                                               LaneIndex,
+                                                               false, S, E));
+        break;
+      }
+      return MatchOperand_Success;
+    }
+    Error(S, "vector register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    Error(RegLoc, "register expected");
+    return MatchOperand_ParseFail;
+  }
+  unsigned Count = 1;
+  int Spacing = 0;
+  unsigned FirstReg = Reg;
+  // The list is of D registers, but we also allow Q regs and just interpret
+  // them as the two D sub-registers.
+  if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+    FirstReg = Reg = getDRegFromQReg(Reg);
+    Spacing = 1; // double-spacing requires explicit D registers, otherwise
+                 // it's ambiguous with four-register single spaced.
+    ++Reg;
+    ++Count;
+  }
+
+  SMLoc E;
+  if (parseVectorLane(LaneKind, LaneIndex, E) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(Parser.getTok().getLoc(),
+              "sequential registers in double spaced list");
+        return MatchOperand_ParseFail;
+      }
+      Parser.Lex(); // Eat the minus.
+      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1) {
+        Error(AfterMinusLoc, "register expected");
+        return MatchOperand_ParseFail;
+      }
+      // Allow Q regs and just interpret them as the two D sub-registers.
+      if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+        EndReg = getDRegFromQReg(EndReg) + 1;
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) {
+        Error(AfterMinusLoc, "invalid register in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Ranges must go from low to high.
+      if (Reg > EndReg) {
+        Error(AfterMinusLoc, "bad range in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(AfterMinusLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+
+      // Add all the registers in the range to the register list.
+      Count += EndReg - Reg;
+      Reg = EndReg;
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    Reg = tryParseRegister();
+    if (Reg == -1) {
+      Error(RegLoc, "register expected");
+      return MatchOperand_ParseFail;
+    }
+    // vector register lists must be contiguous.
+    // It's OK to use the enumeration values directly here rather, as the
+    // VFP register classes have the enum sorted properly.
+    //
+    // The list is of D registers, but we also allow Q regs and just interpret
+    // them as the two D sub-registers.
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(RegLoc,
+              "invalid register in double-spaced list (must be 'D' register')");
+        return MatchOperand_ParseFail;
+      }
+      Reg = getDRegFromQReg(Reg);
+      if (Reg != OldReg + 1) {
+        Error(RegLoc, "non-contiguous register range");
+        return MatchOperand_ParseFail;
+      }
+      ++Reg;
+      Count += 2;
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      SMLoc LaneLoc = Parser.getTok().getLoc();
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(LaneLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+      continue;
+    }
+    // Normal D register.
+    // Figure out the register spacing (single or double) of the list if
+    // we don't know it already.
+    if (!Spacing)
+      Spacing = 1 + (Reg == OldReg + 2);
+
+    // Just check that it's contiguous and keep going.
+    if (Reg != OldReg + Spacing) {
+      Error(RegLoc, "non-contiguous register range");
+      return MatchOperand_ParseFail;
+    }
+    ++Count;
+    // Parse the lane specifier if present.
+    VectorLaneTy NextLaneKind;
+    unsigned NextLaneIndex;
+    SMLoc EndLoc = Parser.getTok().getLoc();
+    if (parseVectorLane(NextLaneKind, NextLaneIndex, E) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+    if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+      Error(EndLoc, "mismatched lane index in register list");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  switch (LaneKind) {
+  case NoLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+        &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+        &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+      FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+    }
+
+    Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count,
+                                                    (Spacing == 2), S, E));
+    break;
+  case AllLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+        &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+        &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+      FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+    }
+    Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count,
+                                                            (Spacing == 2),
+                                                            S, E));
+    break;
+  case IndexedLane:
+    Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count,
+                                                           LaneIndex,
+                                                           (Spacing == 2),
+                                                           S, E));
+    break;
+  }
+  return MatchOperand_Success;
+}
+
+/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
+      .Case("sy",    ARM_MB::SY)
+      .Case("st",    ARM_MB::ST)
+      .Case("ld",    ARM_MB::LD)
+      .Case("sh",    ARM_MB::ISH)
+      .Case("ish",   ARM_MB::ISH)
+      .Case("shst",  ARM_MB::ISHST)
+      .Case("ishst", ARM_MB::ISHST)
+      .Case("ishld", ARM_MB::ISHLD)
+      .Case("nsh",   ARM_MB::NSH)
+      .Case("un",    ARM_MB::NSH)
+      .Case("nshst", ARM_MB::NSHST)
+      .Case("nshld", ARM_MB::NSHLD)
+      .Case("unst",  ARM_MB::NSHST)
+      .Case("osh",   ARM_MB::OSH)
+      .Case("oshst", ARM_MB::OSHST)
+      .Case("oshld", ARM_MB::OSHLD)
+      .Default(~0U);
+
+    // ishld, oshld, nshld and ld are only available from ARMv8.
+    if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD ||
+                        Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD))
+      Opt = ~0U;
+
+    if (Opt == ~0U)
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *MemBarrierID;
+    if (getParser().parseExpression(MemBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_MB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    if (OptStr.equals_lower("sy"))
+      Opt = ARM_ISB::SY;
+    else
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *ISBarrierID;
+    if (getParser().parseExpression(ISBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ISBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_ISB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateInstSyncBarrierOpt(
+          (ARM_ISB::InstSyncBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+
+/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier)) 
+    return MatchOperand_NoMatch;
+  StringRef IFlagsStr = Tok.getString();
+
+  // An iflags string of "none" is interpreted to mean that none of the AIF
+  // bits are set.  Not a terribly useful instruction, but a valid encoding.
+  unsigned IFlags = 0;
+  if (IFlagsStr != "none") {
+        for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+        .Case("a", ARM_PROC::A)
+        .Case("i", ARM_PROC::I)
+        .Case("f", ARM_PROC::F)
+        .Default(~0U);
+
+      // If some specific iflag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (Flag == ~0U || (IFlags & Flag))
+        return MatchOperand_NoMatch;
+
+      IFlags |= Flag;
+    }
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S));
+  return MatchOperand_Success;
+}
+
+/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef Mask = Tok.getString();
+
+  if (isMClass()) {
+    // See ARMv6-M 10.1.1
+    std::string Name = Mask.lower();
+    unsigned FlagsVal = StringSwitch<unsigned>(Name)
+      // Note: in the documentation:
+      //  ARM deprecates using MSR APSR without a _<bits> qualifier as an alias
+      //  for MSR APSR_nzcvq.
+      // but we do make it an alias here.  This is so to get the "mask encoding"
+      // bits correct on MSR APSR writes.
+      //
+      // FIXME: Note the 0xc00 "mask encoding" bits version of the registers
+      // should really only be allowed when writing a special register.  Note
+      // they get dropped in the MRS instruction reading a special register as
+      // the SYSm field is only 8 bits.
+      //
+      // FIXME: the _g and _nzcvqg versions are only allowed if the processor
+      // includes the DSP extension but that is not checked.
+      .Case("apsr", 0x800)
+      .Case("apsr_nzcvq", 0x800)
+      .Case("apsr_g", 0x400)
+      .Case("apsr_nzcvqg", 0xc00)
+      .Case("iapsr", 0x801)
+      .Case("iapsr_nzcvq", 0x801)
+      .Case("iapsr_g", 0x401)
+      .Case("iapsr_nzcvqg", 0xc01)
+      .Case("eapsr", 0x802)
+      .Case("eapsr_nzcvq", 0x802)
+      .Case("eapsr_g", 0x402)
+      .Case("eapsr_nzcvqg", 0xc02)
+      .Case("xpsr", 0x803)
+      .Case("xpsr_nzcvq", 0x803)
+      .Case("xpsr_g", 0x403)
+      .Case("xpsr_nzcvqg", 0xc03)
+      .Case("ipsr", 0x805)
+      .Case("epsr", 0x806)
+      .Case("iepsr", 0x807)
+      .Case("msp", 0x808)
+      .Case("psp", 0x809)
+      .Case("primask", 0x810)
+      .Case("basepri", 0x811)
+      .Case("basepri_max", 0x812)
+      .Case("faultmask", 0x813)
+      .Case("control", 0x814)
+      .Default(~0U);
+
+    if (FlagsVal == ~0U)
+      return MatchOperand_NoMatch;
+
+    if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813)
+      // basepri, basepri_max and faultmask only valid for V7m.
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+    Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+    return MatchOperand_Success;
+  }
+
+  // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
+  size_t Start = 0, Next = Mask.find('_');
+  StringRef Flags = "";
+  std::string SpecReg = Mask.slice(Start, Next).lower();
+  if (Next != StringRef::npos)
+    Flags = Mask.slice(Next+1, Mask.size());
+
+  // FlagsVal contains the complete mask:
+  // 3-0: Mask
+  // 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  unsigned FlagsVal = 0;
+
+  if (SpecReg == "apsr") {
+    FlagsVal = StringSwitch<unsigned>(Flags)
+    .Case("nzcvq",  0x8) // same as CPSR_f
+    .Case("g",      0x4) // same as CPSR_s
+    .Case("nzcvqg", 0xc) // same as CPSR_fs
+    .Default(~0U);
+
+    if (FlagsVal == ~0U) {
+      if (!Flags.empty())
+        return MatchOperand_NoMatch;
+      else
+        FlagsVal = 8; // No flag
+    }
+  } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+    // cpsr_all is an alias for cpsr_fc, as is plain cpsr.
+    if (Flags == "all" || Flags == "")
+      Flags = "fc";
+    for (int i = 0, e = Flags.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
+      .Case("c", 1)
+      .Case("x", 2)
+      .Case("s", 4)
+      .Case("f", 8)
+      .Default(~0U);
+
+      // If some specific flag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (FlagsVal == ~0U || (FlagsVal & Flag))
+        return MatchOperand_NoMatch;
+      FlagsVal |= Flag;
+    }
+  } else // No match for special register.
+    return MatchOperand_NoMatch;
+
+  // Special register without flags is NOT equivalent to "fc" flags.
+  // NOTE: This is a divergence from gas' behavior.  Uncommenting the following
+  // two lines would enable gas compatibility at the expense of breaking
+  // round-tripping.
+  //
+  // if (!FlagsVal)
+  //  FlagsVal = 0x9;
+
+  // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  if (SpecReg == "spsr")
+    FlagsVal |= 16;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low,
+                          int High) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  std::string LowerOp = Op.lower();
+  std::string UpperOp = Op.upper();
+  if (ShiftName != LowerOp && ShiftName != UpperOp) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat shift type token.
+
+  // There must be a '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc Loc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(Loc, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+  if (Val < Low || Val > High) {
+    Error(Loc, "immediate value out of range");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateImm(CE, Loc, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = StringSwitch<int>(Tok.getString().lower())
+    .Case("be", 1)
+    .Case("le", 0)
+    .Default(-1);
+  Parser.Lex(); // Eat the token.
+
+  if (Val == -1) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::Create(Val,
+                                                                  getContext()),
+                                           S, Tok.getEndLoc()));
+  return MatchOperand_Success;
+}
+
+/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+/// instructions. Legal values are:
+///     lsl #n  'n' in [0,31]
+///     asr #n  'n' in [1,32]
+///             n == 32 encoded as n == 0.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseShifterImm(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  bool isASR;
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    isASR = false;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    isASR = true;
+  else {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed shift expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "shift amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  if (isASR) {
+    // Shift amount must be in [1,32]
+    if (Val < 1 || Val > 32) {
+      Error(ExLoc, "'asr' shift amount must be in range [1,32]");
+      return MatchOperand_ParseFail;
+    }
+    // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+    if (isThumb() && Val == 32) {
+      Error(ExLoc, "'asr #32' shift amount not allowed in Thumb mode");
+      return MatchOperand_ParseFail;
+    }
+    if (Val == 32) Val = 0;
+  } else {
+    // Shift amount must be in [1,32]
+    if (Val < 0 || Val > 31) {
+      Error(ExLoc, "'lsr' shift amount must be in range [0,31]");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+/// of instructions. Legal values are:
+///     ror #n  'n' in {0, 8, 16, 24}
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseRotImm(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName != "ror" && ShiftName != "ROR")
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a rotate amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed rotate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "rotate amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  // Shift amount must be in {0, 8, 16, 24} (0 is undocumented extension)
+  // normally, zero is represented in asm by omitting the rotate operand
+  // entirely.
+  if (Val != 8 && Val != 16 && Val != 24 && Val != 0) {
+    Error(ExLoc, "'ror' rotate amount must be 8, 16, or 24");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateRotImm(Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseBitfield(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  // The bitfield descriptor is really two operands, the LSB and the width.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *LSBExpr;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().parseExpression(LSBExpr)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+  if (!CE) {
+    Error(E, "'lsb' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t LSB = CE->getValue();
+  // The LSB must be in the range [0,31]
+  if (LSB < 0 || LSB > 31) {
+    Error(E, "'lsb' operand must be in the range [0,31]");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getLoc();
+
+  // Expect another immediate operand.
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "too few operands");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *WidthExpr;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(WidthExpr, EndLoc)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  CE = dyn_cast<MCConstantExpr>(WidthExpr);
+  if (!CE) {
+    Error(E, "'width' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Width = CE->getValue();
+  // The LSB must be in the range [1,32-lsb]
+  if (Width < 1 || Width > 32 - LSB) {
+    Error(E, "'width' operand must be in the range [1,32-lsb]");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parsePostIdxReg(OperandVector &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // postidx_reg := '+' register {, shift}
+  //              | '-' register {, shift}
+  //              | register {, shift}
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  SMLoc E = Parser.getTok().getEndLoc();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Parser.getTok().getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+      return MatchOperand_ParseFail;
+
+    // FIXME: Only approximates end...may include intervening whitespace.
+    E = Parser.getTok().getLoc();
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+                                                  ShiftImm, S, E));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // am3offset := '+' register
+  //              | '-' register
+  //              | register
+  //              | # imm
+  //              | # + imm
+  //              | # - imm
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+
+  // Do immediates first, as we always parse those if we have a '#'.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    Parser.Lex(); // Eat '#' or '$'.
+    // Explicitly look for a '-', as we need to encode negative zero
+    // differently.
+    bool isNegative = Parser.getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    SMLoc E;
+    if (getParser().parseExpression(Offset, E))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE) {
+      Error(S, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+    // Negative zero is encoded as the flag value INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      Val = INT32_MIN;
+
+    Operands.push_back(
+      ARMOperand::CreateImm(MCConstantExpr::Create(Val, getContext()), S, E));
+
+    return MatchOperand_Success;
+  }
+
+
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  Tok = Parser.getTok();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Tok.getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+                                                  0, S, Tok.getEndLoc()));
+
+  return MatchOperand_Success;
+}
+
+/// Convert parsed operands to MCInst.  Needed here because this instruction
+/// only has two register operands, but multiplication is commutative so
+/// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
+void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1);
+  ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1);
+  // If we have a three-operand form, make sure to set Rn to be the operand
+  // that isn't the same as Rd.
+  unsigned RegOp = 4;
+  if (Operands.size() == 6 &&
+      ((ARMOperand &)*Operands[4]).getReg() ==
+          ((ARMOperand &)*Operands[3]).getReg())
+    RegOp = 5;
+  ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1);
+  Inst.addOperand(Inst.getOperand(0));
+  ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  int CondOp = -1, ImmOp = -1;
+  switch(Inst.getOpcode()) {
+    case ARM::tB:
+    case ARM::tBcc:  CondOp = 1; ImmOp = 2; break;
+
+    case ARM::t2B:
+    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+
+    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+  }
+  // first decide whether or not the branch should be conditional
+  // by looking at it's location relative to an IT block
+  if(inITBlock()) {
+    // inside an IT block we cannot have any conditional branches. any 
+    // such instructions needs to be converted to unconditional form
+    switch(Inst.getOpcode()) {
+      case ARM::tBcc: Inst.setOpcode(ARM::tB); break;
+      case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
+    }
+  } else {
+    // outside IT blocks we can only have unconditional branches with AL
+    // condition code or conditional branches with non-AL condition code
+    unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode();
+    switch(Inst.getOpcode()) {
+      case ARM::tB:
+      case ARM::tBcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc); 
+        break;
+      case ARM::t2B:
+      case ARM::t2Bcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::t2B : ARM::t2Bcc);
+        break;
+    }
+  }
+
+  // now decide on encoding size based on branch target range
+  switch(Inst.getOpcode()) {
+    // classify tB as either t2B or t1B based on range of immediate operand
+    case ARM::tB: {
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      if (!op.isSignedOffset<11, 1>() && isThumbTwo())
+        Inst.setOpcode(ARM::t2B);
+      break;
+    }
+    // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
+    case ARM::tBcc: {
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      if (!op.isSignedOffset<8, 1>() && isThumbTwo())
+        Inst.setOpcode(ARM::t2Bcc);
+      break;
+    }
+  }
+  ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1);
+  ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
+}
+
+/// Parse an ARM memory expression, return false if successful else return true
+/// or an error.  The first token must be a '[' when called.
+bool ARMAsmParser::parseMemory(OperandVector &Operands) {
+  SMLoc S, E;
+  assert(Parser.getTok().is(AsmToken::LBrac) &&
+         "Token is not a Left Bracket");
+  S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat left bracket token.
+
+  const AsmToken &BaseRegTok = Parser.getTok();
+  int BaseRegNum = tryParseRegister();
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
+
+  // The next token must either be a comma, a colon or a closing bracket.
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Colon) && !Tok.is(AsmToken::Comma) &&
+      !Tok.is(AsmToken::RBrac))
+    return Error(Tok.getLoc(), "malformed memory operand");
+
+  if (Tok.is(AsmToken::RBrac)) {
+    E = Tok.getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, 0, false,
+                                             S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand. It's rather odd, but syntactically valid.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  assert((Tok.is(AsmToken::Colon) || Tok.is(AsmToken::Comma)) &&
+         "Lost colon or comma in memory operand?!");
+  if (Tok.is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+  }
+
+  // If we have a ':', it's an alignment specifier.
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat the ':'.
+    E = Parser.getTok().getLoc();
+    SMLoc AlignmentLoc = Tok.getLoc();
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    unsigned Align = 0;
+    switch (CE->getValue()) {
+    default:
+      return Error(E,
+                   "alignment specifier must be 16, 32, 64, 128, or 256 bits");
+    case 16:  Align = 2; break;
+    case 32:  Align = 4; break;
+    case 64:  Align = 8; break;
+    case 128: Align = 16; break;
+    case 256: Align = 32; break;
+    }
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, Align,
+                                             false, S, E, AlignmentLoc));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // If we have a '#', it's an immediate offset, else assume it's a register
+  // offset. Be friendly and also accept a plain integer (without a leading
+  // hash) for gas compatibility.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar) ||
+      Parser.getTok().is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    E = Parser.getTok().getLoc();
+
+    bool isNegative = getParser().getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().parseExpression(Offset))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // If the constant was #-0, represent it as INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      CE = MCConstantExpr::Create(INT32_MIN, getContext());
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+                                             ARM_AM::no_shift, 0, 0,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // The register offset is optionally preceded by a '+' or '-'
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex(); // Eat the '-'.
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    // Nothing to do.
+    Parser.Lex(); // Eat the '+'.
+  }
+
+  E = Parser.getTok().getLoc();
+  int OffsetRegNum = tryParseRegister();
+  if (OffsetRegNum == -1)
+    return Error(E, "register expected");
+
+  // If there's a shift operator, handle it.
+  ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+      return true;
+  }
+
+  // Now we should have the closing ']'
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return Error(Parser.getTok().getLoc(), "']' expected");
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat right bracket token.
+
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum,
+                                           ShiftType, ShiftImm, 0, isNegative,
+                                           S, E));
+
+  // If there's a pre-indexing writeback marker, '!', just add it as a token
+  // operand.
+  if (Parser.getTok().is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the '!'.
+  }
+
+  return false;
+}
+
+/// parseMemRegOffsetShift - one of these two:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// return true if it parses a shift otherwise it returns false.
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+                                          unsigned &Amount) {
+  SMLoc Loc = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL" ||
+      ShiftName == "asl" || ShiftName == "ASL")
+    St = ARM_AM::lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    St = ARM_AM::lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    St = ARM_AM::asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    St = ARM_AM::ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    St = ARM_AM::rrx;
+  else
+    return Error(Loc, "illegal shift operator");
+  Parser.Lex(); // Eat shift type token.
+
+  // rrx stands alone.
+  Amount = 0;
+  if (St != ARM_AM::rrx) {
+    Loc = Parser.getTok().getLoc();
+    // A '#' and a shift amount.
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash) &&
+        HashTok.isNot(AsmToken::Dollar))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+    // Range check the immediate.
+    // lsl, ror: 0 <= imm <= 31
+    // lsr, asr: 0 <= imm <= 32
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error(Loc, "shift amount must be an immediate");
+    int64_t Imm = CE->getValue();
+    if (Imm < 0 ||
+        ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+        ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+      return Error(Loc, "immediate shift value out of range");
+    // If <ShiftTy> #0, turn it into a no_shift.
+    if (Imm == 0)
+      St = ARM_AM::lsl;
+    // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+    if (Imm == 32)
+      Imm = 0;
+    Amount = Imm;
+  }
+
+  return false;
+}
+
+/// parseFPImm - A floating point immediate expression operand.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  // Anything that can accept a floating point constant as an operand
+  // needs to go through here, as the regular parseExpression is
+  // integer only.
+  //
+  // This routine still creates a generic Immediate operand, containing
+  // a bitcast of the 64-bit floating point value. The various operands
+  // that accept floats can check whether the value is valid for them
+  // via the standard is*() predicates.
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  // Disambiguate the VMOV forms that can accept an FP immediate.
+  // vmov.f32 <sreg>, #imm
+  // vmov.f64 <dreg>, #imm
+  // vmov.f32 <dreg>, #imm  @ vector f32x2
+  // vmov.f32 <qreg>, #imm  @ vector f32x4
+  //
+  // There are also the NEON VMOV instructions which expect an
+  // integer constant. Make sure we don't try to parse an FPImm
+  // for these:
+  // vmov.i{8|16|32|64} <dreg|qreg>, #imm
+  ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
+  bool isVmovf = TyOp.isToken() &&
+                 (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64");
+  ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
+  bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
+                                         Mnemonic.getToken() == "fconsts");
+  if (!(isVmovf || isFconst))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '#' or '$'.
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Loc = Tok.getLoc();
+  if (Tok.is(AsmToken::Real) && isVmovf) {
+    APFloat RealVal(APFloat::IEEEsingle, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 31;
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::Create(IntVal, getContext()),
+          S, Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+  // Also handle plain integers. Instructions which allow floating point
+  // immediates also allow a raw encoded 8-bit value.
+  if (Tok.is(AsmToken::Integer) && isFconst) {
+    int64_t Val = Tok.getIntVal();
+    Parser.Lex(); // Eat the token.
+    if (Val > 255 || Val < 0) {
+      Error(Loc, "encoded floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    float RealVal = ARM_AM::getFPImmFloat(Val);
+    Val = APFloat(RealVal).bitcastToAPInt().getZExtValue();
+
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::Create(Val, getContext()), S,
+        Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+
+  Error(Loc, "invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// Parse a arm instruction operand.  For now this parses the operand regardless
+/// of the mnemonic.
+bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  SMLoc S, E;
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Identifier: {
+    // If we've seen a branch mnemonic, the next operand must be a label.  This
+    // is true even if the label is a register name.  So "br r1" means branch to
+    // label "r1".
+    bool ExpectLabel = Mnemonic == "b" || Mnemonic == "bl";
+    if (!ExpectLabel) {
+      if (!tryParseRegisterWithWriteBack(Operands))
+        return false;
+      int Res = tryParseShiftRegister(Operands);
+      if (Res == 0) // success
+        return false;
+      else if (Res == -1) // irrecoverable error
+        return true;
+      // If this is VMRS, check for the apsr_nzcv operand.
+      if (Mnemonic == "vmrs" &&
+          Parser.getTok().getString().equals_lower("apsr_nzcv")) {
+        S = Parser.getTok().getLoc();
+        Parser.Lex();
+        Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S));
+        return false;
+      }
+    }
+
+    // Fall though for the Identifier case that is not a register or a
+    // special name.
+  }
+  case AsmToken::LParen:  // parenthesized expressions like (_strcmp-4)
+  case AsmToken::Integer: // things like 1f and 2b as a branch targets
+  case AsmToken::String:  // quoted label names.
+  case AsmToken::Dot: {   // . as a branch target
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = Parser.getTok().getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(IdVal, S, E));
+    return false;
+  }
+  case AsmToken::LBrac:
+    return parseMemory(Operands);
+  case AsmToken::LCurly:
+    return parseRegisterList(Operands);
+  case AsmToken::Dollar:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = Parser.getTok().getLoc();
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      bool isNegative = Parser.getTok().is(AsmToken::Minus);
+      const MCExpr *ImmVal;
+      if (getParser().parseExpression(ImmVal))
+        return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+      if (CE) {
+        int32_t Val = CE->getValue();
+        if (isNegative && Val == 0)
+          ImmVal = MCConstantExpr::Create(INT32_MIN, getContext());
+      }
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
+
+      // There can be a trailing '!' on operands that we want as a separate
+      // '!' Token operand. Handle that here. For example, the compatibility
+      // alias for 'srsdb sp!, #imm' is 'srsdb #imm!'.
+      if (Parser.getTok().is(AsmToken::Exclaim)) {
+        Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(),
+                                                   Parser.getTok().getLoc()));
+        Parser.Lex(); // Eat exclaim token
+      }
+      return false;
+    }
+    // w/ a ':' after the '#', it's just like a plain ':'.
+    // FALLTHROUGH
+  }
+  case AsmToken::Colon: {
+    // ":lower16:" and ":upper16:" expression prefixes
+    // FIXME: Check it's an expression prefix,
+    // e.g. (FOO - :lower16:BAR) isn't legal.
+    ARMMCExpr::VariantKind RefKind;
+    if (parsePrefix(RefKind))
+      return true;
+
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal,
+                                              getContext());
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
+    return false;
+  }
+  case AsmToken::Equal: {
+    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+      return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+
+    Parser.Lex(); // Eat '='
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal);
+    Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E));
+    return false;
+  }
+  }
+}
+
+// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
+//  :lower16: and :upper16:.
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  RefKind = ARMMCExpr::VK_ARM_None;
+
+  // consume an optional '#' (GNU compatibility)
+  if (getLexer().is(AsmToken::Hash))
+    Parser.Lex();
+
+  // :lower16: and :upper16: modifiers
+  assert(getLexer().is(AsmToken::Colon) && "expected a :");
+  Parser.Lex(); // Eat ':'
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "expected prefix identifier in operand");
+    return true;
+  }
+
+  StringRef IDVal = Parser.getTok().getIdentifier();
+  if (IDVal == "lower16") {
+    RefKind = ARMMCExpr::VK_ARM_LO16;
+  } else if (IDVal == "upper16") {
+    RefKind = ARMMCExpr::VK_ARM_HI16;
+  } else {
+    Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
+    return true;
+  }
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(), "unexpected token after prefix");
+    return true;
+  }
+  Parser.Lex(); // Eat the last ':'
+  return false;
+}
+
+/// \brief Given a mnemonic, split out possible predication code and carry
+/// setting letters to form a canonical mnemonic and flags.
+//
+// FIXME: Would be nice to autogen this.
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+                                      unsigned &PredicationCode,
+                                      bool &CarrySetting,
+                                      unsigned &ProcessorIMod,
+                                      StringRef &ITMask) {
+  PredicationCode = ARMCC::AL;
+  CarrySetting = false;
+  ProcessorIMod = 0;
+
+  // Ignore some mnemonics we know aren't predicated forms.
+  //
+  // FIXME: Would be nice to autogen this.
+  if ((Mnemonic == "movs" && isThumb()) ||
+      Mnemonic == "teq"   || Mnemonic == "vceq"   || Mnemonic == "svc"   ||
+      Mnemonic == "mls"   || Mnemonic == "smmls"  || Mnemonic == "vcls"  ||
+      Mnemonic == "vmls"  || Mnemonic == "vnmls"  || Mnemonic == "vacge" ||
+      Mnemonic == "vcge"  || Mnemonic == "vclt"   || Mnemonic == "vacgt" ||
+      Mnemonic == "vaclt" || Mnemonic == "vacle"  || Mnemonic == "hlt" ||
+      Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
+      Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
+      Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
+      Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
+      Mnemonic == "vcvta" || Mnemonic == "vcvtn"  || Mnemonic == "vcvtp" ||
+      Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
+      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic.startswith("vsel"))
+    return Mnemonic;
+
+  // First, split out any predication code. Ignore mnemonics we know aren't
+  // predicated but do have a carry-set and so weren't caught above.
+  if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+      Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+      Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+      Mnemonic != "sbcs" && Mnemonic != "rscs") {
+    unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
+      .Case("eq", ARMCC::EQ)
+      .Case("ne", ARMCC::NE)
+      .Case("hs", ARMCC::HS)
+      .Case("cs", ARMCC::HS)
+      .Case("lo", ARMCC::LO)
+      .Case("cc", ARMCC::LO)
+      .Case("mi", ARMCC::MI)
+      .Case("pl", ARMCC::PL)
+      .Case("vs", ARMCC::VS)
+      .Case("vc", ARMCC::VC)
+      .Case("hi", ARMCC::HI)
+      .Case("ls", ARMCC::LS)
+      .Case("ge", ARMCC::GE)
+      .Case("lt", ARMCC::LT)
+      .Case("gt", ARMCC::GT)
+      .Case("le", ARMCC::LE)
+      .Case("al", ARMCC::AL)
+      .Default(~0U);
+    if (CC != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
+      PredicationCode = CC;
+    }
+  }
+
+  // Next, determine if we have a carry setting bit. We explicitly ignore all
+  // the instructions we know end in 's'.
+  if (Mnemonic.endswith("s") &&
+      !(Mnemonic == "cps" || Mnemonic == "mls" ||
+        Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+        Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+        Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+        Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" ||
+        Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
+        Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
+        Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+        Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
+        (Mnemonic == "movs" && isThumb()))) {
+    Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
+    CarrySetting = true;
+  }
+
+  // The "cps" instruction can have a interrupt mode operand which is glued into
+  // the mnemonic. Check if this is the case, split it and parse the imod op
+  if (Mnemonic.startswith("cps")) {
+    // Split out any imod code.
+    unsigned IMod =
+      StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2))
+      .Case("ie", ARM_PROC::IE)
+      .Case("id", ARM_PROC::ID)
+      .Default(~0U);
+    if (IMod != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2);
+      ProcessorIMod = IMod;
+    }
+  }
+
+  // The "it" instruction has the condition mask on the end of the mnemonic.
+  if (Mnemonic.startswith("it")) {
+    ITMask = Mnemonic.slice(2, Mnemonic.size());
+    Mnemonic = Mnemonic.slice(0, 2);
+  }
+
+  return Mnemonic;
+}
+
+/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// inclusion of carry set or predication code operands.
+//
+// FIXME: It would be nice to autogen this.
+void ARMAsmParser::
+getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                     bool &CanAcceptCarrySet, bool &CanAcceptPredicationCode) {
+  if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
+      Mnemonic == "add" || Mnemonic == "adc" ||
+      Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
+      Mnemonic == "orr" || Mnemonic == "mvn" ||
+      Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
+      Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+      Mnemonic == "vfm" || Mnemonic == "vfnm" ||
+      (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
+                      Mnemonic == "mla" || Mnemonic == "smlal" ||
+                      Mnemonic == "umlal" || Mnemonic == "umull"))) {
+    CanAcceptCarrySet = true;
+  } else
+    CanAcceptCarrySet = false;
+
+  if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
+      Mnemonic == "cps" ||  Mnemonic == "it" ||  Mnemonic == "cbz" ||
+      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic == "udf" ||
+      Mnemonic.startswith("crc32") || Mnemonic.startswith("cps") ||
+      Mnemonic.startswith("vsel") ||
+      Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
+      Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
+      Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
+      Mnemonic == "vrintm" || Mnemonic.startswith("aes") ||
+      Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
+      (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
+    // These mnemonics are never predicable
+    CanAcceptPredicationCode = false;
+  } else if (!isThumb()) {
+    // Some instructions are only predicable in Thumb mode
+    CanAcceptPredicationCode
+      = Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
+        Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
+        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
+        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
+        Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
+        Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+        !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
+  } else if (isThumbOne()) {
+    if (hasV6MOps())
+      CanAcceptPredicationCode = Mnemonic != "movs";
+    else
+      CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
+  } else
+    CanAcceptPredicationCode = true;
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+                                          OperandVector &Operands) {
+  // FIXME: This is all horribly hacky. We really need a better way to deal
+  // with optional operands like this in the matcher table.
+
+  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+  // another does not. Specifically, the MOVW instruction does not. So we
+  // special case it here and remove the defaulted (non-setting) cc_out
+  // operand if that's the instruction we're trying to match.
+  //
+  // We do this as post-processing of the explicit operands rather than just
+  // conditionally adding the cc_out in the first place because we need
+  // to check the type of the parsed immediate operand.
+  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+      !static_cast<ARMOperand &>(*Operands[4]).isARMSOImm() &&
+      static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+    return true;
+
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when there are only two register operands.
+  if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+    return true;
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
+  // have to check the immediate range here since Thumb2 has a variant
+  // that can handle a different range and has a cc_out operand.
+  if (((isThumb() && Mnemonic == "add") ||
+       (isThumbTwo() && Mnemonic == "sub")) &&
+      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) ||
+       static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4()))
+    return true;
+  // For Thumb2, add/sub immediate does not have a cc_out operand for the
+  // imm0_4095 variant. That's the least-preferred variant when
+  // selecting via the generic "add" mnemonic, so to know that we
+  // should remove the cc_out operand, we have to explicitly check that
+  // it's not one of the other variants. Ugh.
+  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+    // Nest conditions rather than one big 'if' statement for readability.
+    //
+    // If both registers are low, we're in an IT block, and the immediate is
+    // in range, we should use encoding T1 instead, which has a cc_out.
+    if (inITBlock() &&
+        isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) &&
+        isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) &&
+        static_cast<ARMOperand &>(*Operands[5]).isImm0_7())
+      return false;
+    // Check against T3. If the second register is the PC, this is an
+    // alternate form of ADR, which uses encoding T4, so check for that too.
+    if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
+        static_cast<ARMOperand &>(*Operands[5]).isT2SOImm())
+      return false;
+
+    // Otherwise, we use encoding T4, which does not have a cc_out
+    // operand.
+    return true;
+  }
+
+  // The thumb2 multiply instruction doesn't have a CCOut register, so
+  // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
+  // use the 16-bit encoding or not.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[5]).isReg() &&
+      // If the registers aren't low regs, the destination reg isn't the
+      // same as one of the source regs, or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) ||
+       !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+                            static_cast<ARMOperand &>(*Operands[5]).getReg() &&
+                        static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+                            static_cast<ARMOperand &>(*Operands[4]).getReg())))
+    return true;
+
+  // Also check the 'mul' syntax variant that doesn't specify an explicit
+  // destination register.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      // If the registers aren't low regs  or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+       !inITBlock()))
+    return true;
+
+
+
+  // Register-register 'add/sub' for thumb does not have a cc_out operand
+  // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
+  // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
+  // right, this will result in better diagnostics (which operand is off)
+  // anyway.
+  if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      (Operands.size() == 5 || Operands.size() == 6) &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      (static_cast<ARMOperand &>(*Operands[4]).isImm() ||
+       (Operands.size() == 6 &&
+        static_cast<ARMOperand &>(*Operands[5]).isImm())))
+    return true;
+
+  return false;
+}
+
+bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
+                                              OperandVector &Operands) {
+  // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON
+  unsigned RegIdx = 3;
+  if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") &&
+      static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") {
+    if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+        static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32")
+      RegIdx = 4;
+
+    if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() &&
+        (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
+             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) ||
+         ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
+             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg())))
+      return true;
+  }
+  return false;
+}
+
+static bool isDataTypeToken(StringRef Tok) {
+  return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
+    Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
+    Tok == ".u8" || Tok == ".u16" || Tok == ".u32" || Tok == ".u64" ||
+    Tok == ".s8" || Tok == ".s16" || Tok == ".s32" || Tok == ".s64" ||
+    Tok == ".p8" || Tok == ".p16" || Tok == ".f32" || Tok == ".f64" ||
+    Tok == ".f" || Tok == ".d";
+}
+
+// FIXME: This bit should probably be handled via an explicit match class
+// in the .td files that matches the suffix instead of having it be
+// a literal string token the way it is now.
+static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
+  return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm");
+}
+static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
+                                 unsigned VariantID);
+
+static bool RequiresVFPRegListValidation(StringRef Inst,
+                                         bool &AcceptSinglePrecisionOnly,
+                                         bool &AcceptDoublePrecisionOnly) {
+  if (Inst.size() < 7)
+    return false;
+
+  if (Inst.startswith("fldm") || Inst.startswith("fstm")) {
+    StringRef AddressingMode = Inst.substr(4, 2);
+    if (AddressingMode == "ia" || AddressingMode == "db" ||
+        AddressingMode == "ea" || AddressingMode == "fd") {
+      AcceptSinglePrecisionOnly = Inst[6] == 's';
+      AcceptDoublePrecisionOnly = Inst[6] == 'd' || Inst[6] == 'x';
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Parse an arm instruction mnemonic followed by its operands.
+bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  // FIXME: Can this be done via tablegen in some fashion?
+  bool RequireVFPRegisterListCheck;
+  bool AcceptSinglePrecisionOnly;
+  bool AcceptDoublePrecisionOnly;
+  RequireVFPRegisterListCheck =
+    RequiresVFPRegListValidation(Name, AcceptSinglePrecisionOnly,
+                                 AcceptDoublePrecisionOnly);
+
+  // Apply mnemonic aliases before doing anything else, as the destination
+  // mnemonic may include suffices and we want to handle them normally.
+  // The generic tblgen'erated code does this later, at the start of
+  // MatchInstructionImpl(), but that's too late for aliases that include
+  // any sort of suffix.
+  unsigned AvailableFeatures = getAvailableFeatures();
+  unsigned AssemblerDialect = getParser().getAssemblerDialect();
+  applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect);
+
+  // First check for the ARM-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the 'instruction."
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Mnemonic = Name.slice(Start, Next);
+
+  // Split out the predication code and carry setting flag from the mnemonic.
+  unsigned PredicationCode;
+  unsigned ProcessorIMod;
+  bool CarrySetting;
+  StringRef ITMask;
+  Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+                           ProcessorIMod, ITMask);
+
+  // In Thumb1, only the branch (B) instruction can be predicated.
+  if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "conditional execution not supported in Thumb1");
+  }
+
+  Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
+
+  // Handle the IT instruction ITMask. Convert it to a bitmask. This
+  // is the mask as it will be for the IT encoding if the conditional
+  // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case
+  // where the conditional bit0 is zero, the instruction post-processing
+  // will adjust the mask accordingly.
+  if (Mnemonic == "it") {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+    if (ITMask.size() > 3) {
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "too many conditions on IT instruction");
+    }
+    unsigned Mask = 8;
+    for (unsigned i = ITMask.size(); i != 0; --i) {
+      char pos = ITMask[i - 1];
+      if (pos != 't' && pos != 'e') {
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+      }
+      Mask >>= 1;
+      if (ITMask[i - 1] == 't')
+        Mask |= 8;
+    }
+    Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+  }
+
+  // FIXME: This is all a pretty gross hack. We should automatically handle
+  // optional operands like this via tblgen.
+
+  // Next, add the CCOut and ConditionCode operands, if needed.
+  //
+  // For mnemonics which can ever incorporate a carry setting bit or predication
+  // code, our matching model involves us always generating CCOut and
+  // ConditionCode operands to match the mnemonic "as written" and then we let
+  // the matcher deal with finding the right instruction or generating an
+  // appropriate error.
+  bool CanAcceptCarrySet, CanAcceptPredicationCode;
+  getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode);
+
+  // If we had a carry-set on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptCarrySet && CarrySetting) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' can not set flags, but 's' suffix specified");
+  }
+  // If we had a predication code on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' is not predicable, but condition code specified");
+  }
+
+  // Add the carry setting operand, if necessary.
+  if (CanAcceptCarrySet) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
+    Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
+                                               Loc));
+  }
+
+  // Add the predication code operand, if necessary.
+  if (CanAcceptPredicationCode) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+                                      CarrySetting);
+    Operands.push_back(ARMOperand::CreateCondCode(
+                         ARMCC::CondCodes(PredicationCode), Loc));
+  }
+
+  // Add the processor imod operand, if necessary.
+  if (ProcessorIMod) {
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::Create(ProcessorIMod, getContext()),
+                                 NameLoc, NameLoc));
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    StringRef ExtraToken = Name.slice(Start, Next);
+
+    // Some NEON instructions have an optional datatype suffix that is
+    // completely ignored. Check for that.
+    if (isDataTypeToken(ExtraToken) &&
+        doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
+      continue;
+
+    // For for ARM mode generate an error if the .n qualifier is used.
+    if (ExtraToken == ".n" && !isThumb()) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "instruction with .n (narrow) qualifier not allowed in "
+                   "arm mode");
+    }
+
+    // The .n qualifier is always discarded as that is what the tables
+    // and matcher expect.  In ARM mode the .w qualifier has no effect,
+    // so discard it to avoid errors that can be caused by the matcher.
+    if (ExtraToken != ".n" && (isThumb() || ExtraToken != ".w")) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Mnemonic)) {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Mnemonic)) {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  if (RequireVFPRegisterListCheck) {
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands.back());
+    if (AcceptSinglePrecisionOnly && !Op.isSPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon single precision register expected");
+    if (AcceptDoublePrecisionOnly && !Op.isDPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon double precision register expected");
+  }
+
+  // Some instructions, mostly Thumb, have forms for the same mnemonic that
+  // do and don't have a cc_out optional-def operand. With some spot-checks
+  // of the operand list, we can figure out which variant we're trying to
+  // parse and adjust accordingly before actually matching. We shouldn't ever
+  // try to remove a cc_out operand that was explicitly set on the the
+  // mnemonic, of course (CarrySetting == true). Reason number #317 the
+  // table driven matcher doesn't fit well with the ARM instruction set.
+  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // Some instructions have the same mnemonic, but don't always
+  // have a predicate. Distinguish them here and delete the
+  // predicate if needed.
+  if (shouldOmitPredicateOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+      static_cast<ARMOperand &>(*Operands[2]).isImm())
+    Operands.erase(Operands.begin() + 1);
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > 4 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
+    unsigned Idx = isLoad ? 2 : 3;
+    ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+    const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+        MRC.contains(Op2.getReg())) {
+      unsigned Reg1 = Op1.getReg();
+      unsigned Reg2 = Op2.getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        Error(Op2.getStartLoc(), isLoad
+                                     ? "destination operands must be sequential"
+                                     : "source operands must be sequential");
+        return true;
+      }
+      unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
+          &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands[Idx] =
+          ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+      Operands.erase(Operands.begin() + Idx + 1);
+    }
+  }
+
+  // GNU Assembler extension (compatibility)
+  if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
+    ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+    if (Op3.isMem()) {
+      assert(Op2.isReg() && "expected register argument");
+
+      unsigned SuperReg = MRI->getMatchingSuperReg(
+          Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+
+      assert(SuperReg && "expected register pair");
+
+      unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
+
+      Operands.insert(
+          Operands.begin() + 3,
+          ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
+    }
+  }
+
+  // FIXME: As said above, this is all a pretty gross hack.  This instruction
+  // does not fit with other "subs" and tblgen.
+  // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
+  // so the Mnemonic is the original name "subs" and delete the predicate
+  // operand so it will match the table entry.
+  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
+      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+    Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
+    Operands.erase(Operands.begin() + 1);
+  }
+  return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// return 'true' if register list contains non-low GPR registers,
+// 'false' otherwise. If Reg is in the register list or is HiReg, set
+// 'containsReg' to true.
+static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
+                                 unsigned HiReg, bool &containsReg) {
+  containsReg = false;
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      containsReg = true;
+    // Anything other than a low register isn't legal here.
+    if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg))
+      return true;
+  }
+  return false;
+}
+
+// Check if the specified regisgter is in the register list of the inst,
+// starting at the indicated operand number.
+static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      return true;
+  }
+  return false;
+}
+
+// Return true if instruction has the interesting property of being
+// allowed in IT blocks, but not being predicable.
+static bool instIsBreakpoint(const MCInst &Inst) {
+    return Inst.getOpcode() == ARM::tBKPT ||
+           Inst.getOpcode() == ARM::BKPT ||
+           Inst.getOpcode() == ARM::tHLT ||
+           Inst.getOpcode() == ARM::HLT;
+
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool ARMAsmParser::validateInstruction(MCInst &Inst,
+                                       const OperandVector &Operands) {
+  const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+  SMLoc Loc = Operands[0]->getStartLoc();
+
+  // Check the IT block state first.
+  // NOTE: BKPT and HLT instructions have the interesting property of being
+  // allowed in IT blocks, but not being predicable. They just always execute.
+  if (inITBlock() && !instIsBreakpoint(Inst)) {
+    unsigned Bit = 1;
+    if (ITState.FirstCond)
+      ITState.FirstCond = false;
+    else
+      Bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+    // The instruction must be predicable.
+    if (!MCID.isPredicable())
+      return Error(Loc, "instructions in IT block must be predicable");
+    unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+    unsigned ITCond = Bit ? ITState.Cond :
+      ARMCC::getOppositeCondition(ITState.Cond);
+    if (Cond != ITCond) {
+      // Find the condition code Operand to get its SMLoc information.
+      SMLoc CondLoc;
+      for (unsigned I = 1; I < Operands.size(); ++I)
+        if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
+          CondLoc = Operands[I]->getStartLoc();
+      return Error(CondLoc, "incorrect condition in IT block; got '" +
+                   StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
+                   "', but expected '" +
+                   ARMCondCodeToString(ARMCC::CondCodes(ITCond)) + "'");
+    }
+  // Check for non-'al' condition codes outside of the IT block.
+  } else if (isThumbTwo() && MCID.isPredicable() &&
+             Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+             ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
+             Inst.getOpcode() != ARM::t2Bcc)
+    return Error(Loc, "predicated instructions must be in IT block");
+
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case ARM::LDRD:
+  case ARM::LDRD_PRE:
+  case ARM::LDRD_POST: {
+    const unsigned RtReg = Inst.getOperand(0).getReg();
+
+    // Rt can't be R14.
+    if (RtReg == ARM::LR)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt can't be R14");
+
+    const unsigned Rt = MRI->getEncodingValue(RtReg);
+    // Rt must be even-numbered.
+    if ((Rt & 1) == 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt must be even-numbered");
+
+    // Rt2 must be Rt + 1.
+    const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands must be sequential");
+
+    if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
+      const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+      // For addressing modes with writeback, the base register needs to be
+      // different from the destination registers.
+      if (Rn == Rt || Rn == Rt2)
+        return Error(Operands[3]->getStartLoc(),
+                     "base register needs to be different from destination "
+                     "registers");
+    }
+
+    return false;
+  }
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRD_PRE:
+  case ARM::t2LDRD_POST: {
+    // Rt2 must be different from Rt.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 == Rt)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands can't be identical");
+    return false;
+  }
+  case ARM::STRD: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::STRD_PRE:
+  case ARM::STRD_POST: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::SBFX:
+  case ARM::UBFX: {
+    // Width must be in range [1, 32-lsb].
+    unsigned LSB = Inst.getOperand(2).getImm();
+    unsigned Widthm1 = Inst.getOperand(3).getImm();
+    if (Widthm1 >= 32 - LSB)
+      return Error(Operands[5]->getStartLoc(),
+                   "bitfield width must be in range [1,32-lsb]");
+    return false;
+  }
+  // Notionally handles ARM::tLDMIA_UPD too.
+  case ARM::tLDMIA: {
+    // If we're parsing Thumb2, the .w variant is available and handles
+    // most cases that are normally illegal for a Thumb1 LDM instruction.
+    // We'll make the transformation in processInstruction() if necessary.
+    //
+    // Thumb LDM instructions are writeback iff the base register is not
+    // in the register list.
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool HasWritebackToken =
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
+      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
+                   "registers must be in range r0-r7");
+    // If we should have writeback, then there should be a '!' token.
+    if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "writeback operator '!' expected");
+    // If we should not have writeback, there must not be a '!'. This is
+    // true even for the 32-bit wide encodings.
+    if (ListContainsBase && HasWritebackToken)
+      return Error(Operands[3]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+
+    break;
+  }
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+    // ARM variants loading and updating the same register are only officially
+    // UNPREDICTABLE on v7 upwards. Goodness knows what they did before.
+    if (!hasV7Ops())
+      break;
+    // Fallthrough
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
+    break;
+  }
+  case ARM::sysLDMIA_UPD:
+  case ARM::sysLDMDA_UPD:
+  case ARM::sysLDMDB_UPD:
+  case ARM::sysLDMIB_UPD:
+    if (!listContainsReg(Inst, 3, ARM::PC))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback register only allowed on system LDM "
+                   "if PC in register-list");
+    break;
+  case ARM::sysSTMIA_UPD:
+  case ARM::sysSTMDA_UPD:
+  case ARM::sysSTMDB_UPD:
+  case ARM::sysSTMIB_UPD:
+    return Error(Operands[2]->getStartLoc(),
+                 "system STM cannot have writeback register");
+  case ARM::tMUL: {
+    // The second source operand must be the same register as the destination
+    // operand.
+    //
+    // In this case, we must directly check the parsed operands because the
+    // cvtThumbMultiply() function is written in such a way that it guarantees
+    // this first statement is always true for the new Inst.  Essentially, the
+    // destination is unconditionally copied into the second source operand
+    // without checking to see if it matches what we actually parsed.
+    if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
+                                 ((ARMOperand &)*Operands[5]).getReg()) &&
+        (((ARMOperand &)*Operands[3]).getReg() !=
+         ((ARMOperand &)*Operands[4]).getReg())) {
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register must match source register");
+    }
+    break;
+  }
+  // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
+  // so only issue a diagnostic for thumb1. The instructions will be
+  // switched to the t2 encodings in processInstruction() if necessary.
+  case ARM::tPOP: {
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or pc");
+    break;
+  }
+  case ARM::tPUSH: {
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or lr");
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    bool ListContainsBase, InvalidLowList;
+    InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+                                          0, ListContainsBase);
+    if (InvalidLowList && !isThumbTwo())
+      return Error(Operands[4]->getStartLoc(),
+                   "registers must be in range r0-r7");
+
+    // This would be converted to a 32-bit stm, but that's not valid if the
+    // writeback register is in the list.
+    if (InvalidLowList && ListContainsBase)
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+    break;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need thumb2 (for the wide encoding), or we have an error.
+    if (!isThumbTwo() &&
+        Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      return Error(Operands[4]->getStartLoc(),
+                   "source register must be the same as destination");
+    }
+    break;
+  }
+  // Final range checking for Thumb unconditional branch instructions.
+  case ARM::tB:
+    if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2B: {
+    int op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>())
+      return Error(Operands[op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  // Final range checking for Thumb conditional branch instructions.
+  case ARM::tBcc:
+    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2Bcc: {
+    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
+      return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  case ARM::MOVi16:
+  case ARM::t2MOVi16:
+  case ARM::t2MOVTi16:
+    {
+    // We want to avoid misleadingly allowing something like "mov r0, <symbol>"
+    // especially when we turn it into a movw and the expression <symbol> does
+    // not have a :lower16: or :upper16 as part of the expression.  We don't
+    // want the behavior of silently truncating, which can be unexpected and
+    // lead to bugs that are difficult to find since this is an easy mistake
+    // to make.
+    int i = (Operands[3]->isImm()) ? 3 : 4;
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]);
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
+    if (CE) break;
+    const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
+    if (!E) break;
+    const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+    if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 &&
+                       ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16))
+      return Error(
+          Op.getStartLoc(),
+          "immediate expression for mov requires :lower16: or :upper16");
+    break;
+  }
+  }
+
+  return false;
+}
+
+static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
+  switch(Opc) {
+  default: llvm_unreachable("unexpected opcode!");
+  // VST1LN
+  case ARM::VST1LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST1LNd8_UPD;
+  case ARM::VST1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+  case ARM::VST1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+  case ARM::VST1LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST1LNd8_UPD;
+  case ARM::VST1LNdWB_register_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+  case ARM::VST1LNdWB_register_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+  case ARM::VST1LNdAsm_8:  Spacing = 1; return ARM::VST1LNd8;
+  case ARM::VST1LNdAsm_16: Spacing = 1; return ARM::VST1LNd16;
+  case ARM::VST1LNdAsm_32: Spacing = 1; return ARM::VST1LNd32;
+
+  // VST2LN
+  case ARM::VST2LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST2LNd8_UPD;
+  case ARM::VST2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+  case ARM::VST2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+  case ARM::VST2LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+  case ARM::VST2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+  case ARM::VST2LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST2LNd8_UPD;
+  case ARM::VST2LNdWB_register_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+  case ARM::VST2LNdWB_register_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+  case ARM::VST2LNqWB_register_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+  case ARM::VST2LNqWB_register_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+  case ARM::VST2LNdAsm_8:  Spacing = 1; return ARM::VST2LNd8;
+  case ARM::VST2LNdAsm_16: Spacing = 1; return ARM::VST2LNd16;
+  case ARM::VST2LNdAsm_32: Spacing = 1; return ARM::VST2LNd32;
+  case ARM::VST2LNqAsm_16: Spacing = 2; return ARM::VST2LNq16;
+  case ARM::VST2LNqAsm_32: Spacing = 2; return ARM::VST2LNq32;
+
+  // VST3LN
+  case ARM::VST3LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST3LNd8_UPD;
+  case ARM::VST3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+  case ARM::VST3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+  case ARM::VST3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNq16_UPD;
+  case ARM::VST3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+  case ARM::VST3LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST3LNd8_UPD;
+  case ARM::VST3LNdWB_register_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+  case ARM::VST3LNdWB_register_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+  case ARM::VST3LNqWB_register_Asm_16: Spacing = 2; return ARM::VST3LNq16_UPD;
+  case ARM::VST3LNqWB_register_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+  case ARM::VST3LNdAsm_8:  Spacing = 1; return ARM::VST3LNd8;
+  case ARM::VST3LNdAsm_16: Spacing = 1; return ARM::VST3LNd16;
+  case ARM::VST3LNdAsm_32: Spacing = 1; return ARM::VST3LNd32;
+  case ARM::VST3LNqAsm_16: Spacing = 2; return ARM::VST3LNq16;
+  case ARM::VST3LNqAsm_32: Spacing = 2; return ARM::VST3LNq32;
+
+  // VST3
+  case ARM::VST3dWB_fixed_Asm_8:  Spacing = 1; return ARM::VST3d8_UPD;
+  case ARM::VST3dWB_fixed_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+  case ARM::VST3dWB_fixed_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+  case ARM::VST3qWB_fixed_Asm_8:  Spacing = 2; return ARM::VST3q8_UPD;
+  case ARM::VST3qWB_fixed_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+  case ARM::VST3qWB_fixed_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+  case ARM::VST3dWB_register_Asm_8:  Spacing = 1; return ARM::VST3d8_UPD;
+  case ARM::VST3dWB_register_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+  case ARM::VST3dWB_register_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+  case ARM::VST3qWB_register_Asm_8:  Spacing = 2; return ARM::VST3q8_UPD;
+  case ARM::VST3qWB_register_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+  case ARM::VST3qWB_register_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+  case ARM::VST3dAsm_8:  Spacing = 1; return ARM::VST3d8;
+  case ARM::VST3dAsm_16: Spacing = 1; return ARM::VST3d16;
+  case ARM::VST3dAsm_32: Spacing = 1; return ARM::VST3d32;
+  case ARM::VST3qAsm_8:  Spacing = 2; return ARM::VST3q8;
+  case ARM::VST3qAsm_16: Spacing = 2; return ARM::VST3q16;
+  case ARM::VST3qAsm_32: Spacing = 2; return ARM::VST3q32;
+
+  // VST4LN
+  case ARM::VST4LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST4LNd8_UPD;
+  case ARM::VST4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+  case ARM::VST4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+  case ARM::VST4LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNq16_UPD;
+  case ARM::VST4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+  case ARM::VST4LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST4LNd8_UPD;
+  case ARM::VST4LNdWB_register_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+  case ARM::VST4LNdWB_register_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+  case ARM::VST4LNqWB_register_Asm_16: Spacing = 2; return ARM::VST4LNq16_UPD;
+  case ARM::VST4LNqWB_register_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+  case ARM::VST4LNdAsm_8:  Spacing = 1; return ARM::VST4LNd8;
+  case ARM::VST4LNdAsm_16: Spacing = 1; return ARM::VST4LNd16;
+  case ARM::VST4LNdAsm_32: Spacing = 1; return ARM::VST4LNd32;
+  case ARM::VST4LNqAsm_16: Spacing = 2; return ARM::VST4LNq16;
+  case ARM::VST4LNqAsm_32: Spacing = 2; return ARM::VST4LNq32;
+
+  // VST4
+  case ARM::VST4dWB_fixed_Asm_8:  Spacing = 1; return ARM::VST4d8_UPD;
+  case ARM::VST4dWB_fixed_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+  case ARM::VST4dWB_fixed_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+  case ARM::VST4qWB_fixed_Asm_8:  Spacing = 2; return ARM::VST4q8_UPD;
+  case ARM::VST4qWB_fixed_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+  case ARM::VST4qWB_fixed_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+  case ARM::VST4dWB_register_Asm_8:  Spacing = 1; return ARM::VST4d8_UPD;
+  case ARM::VST4dWB_register_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+  case ARM::VST4dWB_register_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+  case ARM::VST4qWB_register_Asm_8:  Spacing = 2; return ARM::VST4q8_UPD;
+  case ARM::VST4qWB_register_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+  case ARM::VST4qWB_register_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+  case ARM::VST4dAsm_8:  Spacing = 1; return ARM::VST4d8;
+  case ARM::VST4dAsm_16: Spacing = 1; return ARM::VST4d16;
+  case ARM::VST4dAsm_32: Spacing = 1; return ARM::VST4d32;
+  case ARM::VST4qAsm_8:  Spacing = 2; return ARM::VST4q8;
+  case ARM::VST4qAsm_16: Spacing = 2; return ARM::VST4q16;
+  case ARM::VST4qAsm_32: Spacing = 2; return ARM::VST4q32;
+  }
+}
+
+static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
+  switch(Opc) {
+  default: llvm_unreachable("unexpected opcode!");
+  // VLD1LN
+  case ARM::VLD1LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD1LNd8_UPD;
+  case ARM::VLD1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+  case ARM::VLD1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+  case ARM::VLD1LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD1LNd8_UPD;
+  case ARM::VLD1LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+  case ARM::VLD1LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+  case ARM::VLD1LNdAsm_8:  Spacing = 1; return ARM::VLD1LNd8;
+  case ARM::VLD1LNdAsm_16: Spacing = 1; return ARM::VLD1LNd16;
+  case ARM::VLD1LNdAsm_32: Spacing = 1; return ARM::VLD1LNd32;
+
+  // VLD2LN
+  case ARM::VLD2LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD2LNd8_UPD;
+  case ARM::VLD2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+  case ARM::VLD2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+  case ARM::VLD2LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNq16_UPD;
+  case ARM::VLD2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+  case ARM::VLD2LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD2LNd8_UPD;
+  case ARM::VLD2LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+  case ARM::VLD2LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+  case ARM::VLD2LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD2LNq16_UPD;
+  case ARM::VLD2LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+  case ARM::VLD2LNdAsm_8:  Spacing = 1; return ARM::VLD2LNd8;
+  case ARM::VLD2LNdAsm_16: Spacing = 1; return ARM::VLD2LNd16;
+  case ARM::VLD2LNdAsm_32: Spacing = 1; return ARM::VLD2LNd32;
+  case ARM::VLD2LNqAsm_16: Spacing = 2; return ARM::VLD2LNq16;
+  case ARM::VLD2LNqAsm_32: Spacing = 2; return ARM::VLD2LNq32;
+
+  // VLD3DUP
+  case ARM::VLD3DUPdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
+  case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+  case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+  case ARM::VLD3DUPdAsm_8:  Spacing = 1; return ARM::VLD3DUPd8;
+  case ARM::VLD3DUPdAsm_16: Spacing = 1; return ARM::VLD3DUPd16;
+  case ARM::VLD3DUPdAsm_32: Spacing = 1; return ARM::VLD3DUPd32;
+  case ARM::VLD3DUPqAsm_8: Spacing = 2; return ARM::VLD3DUPq8;
+  case ARM::VLD3DUPqAsm_16: Spacing = 2; return ARM::VLD3DUPq16;
+  case ARM::VLD3DUPqAsm_32: Spacing = 2; return ARM::VLD3DUPq32;
+
+  // VLD3LN
+  case ARM::VLD3LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3LNd8_UPD;
+  case ARM::VLD3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+  case ARM::VLD3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+  case ARM::VLD3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNq16_UPD;
+  case ARM::VLD3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+  case ARM::VLD3LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3LNd8_UPD;
+  case ARM::VLD3LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+  case ARM::VLD3LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+  case ARM::VLD3LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD3LNq16_UPD;
+  case ARM::VLD3LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+  case ARM::VLD3LNdAsm_8:  Spacing = 1; return ARM::VLD3LNd8;
+  case ARM::VLD3LNdAsm_16: Spacing = 1; return ARM::VLD3LNd16;
+  case ARM::VLD3LNdAsm_32: Spacing = 1; return ARM::VLD3LNd32;
+  case ARM::VLD3LNqAsm_16: Spacing = 2; return ARM::VLD3LNq16;
+  case ARM::VLD3LNqAsm_32: Spacing = 2; return ARM::VLD3LNq32;
+
+  // VLD3
+  case ARM::VLD3dWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3d8_UPD;
+  case ARM::VLD3dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+  case ARM::VLD3dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+  case ARM::VLD3qWB_fixed_Asm_8:  Spacing = 2; return ARM::VLD3q8_UPD;
+  case ARM::VLD3qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+  case ARM::VLD3qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+  case ARM::VLD3dWB_register_Asm_8:  Spacing = 1; return ARM::VLD3d8_UPD;
+  case ARM::VLD3dWB_register_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+  case ARM::VLD3dWB_register_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+  case ARM::VLD3qWB_register_Asm_8:  Spacing = 2; return ARM::VLD3q8_UPD;
+  case ARM::VLD3qWB_register_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+  case ARM::VLD3qWB_register_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+  case ARM::VLD3dAsm_8:  Spacing = 1; return ARM::VLD3d8;
+  case ARM::VLD3dAsm_16: Spacing = 1; return ARM::VLD3d16;
+  case ARM::VLD3dAsm_32: Spacing = 1; return ARM::VLD3d32;
+  case ARM::VLD3qAsm_8:  Spacing = 2; return ARM::VLD3q8;
+  case ARM::VLD3qAsm_16: Spacing = 2; return ARM::VLD3q16;
+  case ARM::VLD3qAsm_32: Spacing = 2; return ARM::VLD3q32;
+
+  // VLD4LN
+  case ARM::VLD4LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4LNd8_UPD;
+  case ARM::VLD4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+  case ARM::VLD4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+  case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+  case ARM::VLD4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+  case ARM::VLD4LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD4LNd8_UPD;
+  case ARM::VLD4LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+  case ARM::VLD4LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+  case ARM::VLD4LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+  case ARM::VLD4LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+  case ARM::VLD4LNdAsm_8:  Spacing = 1; return ARM::VLD4LNd8;
+  case ARM::VLD4LNdAsm_16: Spacing = 1; return ARM::VLD4LNd16;
+  case ARM::VLD4LNdAsm_32: Spacing = 1; return ARM::VLD4LNd32;
+  case ARM::VLD4LNqAsm_16: Spacing = 2; return ARM::VLD4LNq16;
+  case ARM::VLD4LNqAsm_32: Spacing = 2; return ARM::VLD4LNq32;
+
+  // VLD4DUP
+  case ARM::VLD4DUPdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4DUPd8_UPD;
+  case ARM::VLD4DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+  case ARM::VLD4DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4DUPq8_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPq16_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD4DUPd8_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD4DUPq8_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD4DUPq16_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+  case ARM::VLD4DUPdAsm_8:  Spacing = 1; return ARM::VLD4DUPd8;
+  case ARM::VLD4DUPdAsm_16: Spacing = 1; return ARM::VLD4DUPd16;
+  case ARM::VLD4DUPdAsm_32: Spacing = 1; return ARM::VLD4DUPd32;
+  case ARM::VLD4DUPqAsm_8: Spacing = 2; return ARM::VLD4DUPq8;
+  case ARM::VLD4DUPqAsm_16: Spacing = 2; return ARM::VLD4DUPq16;
+  case ARM::VLD4DUPqAsm_32: Spacing = 2; return ARM::VLD4DUPq32;
+
+  // VLD4
+  case ARM::VLD4dWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4d8_UPD;
+  case ARM::VLD4dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+  case ARM::VLD4dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+  case ARM::VLD4qWB_fixed_Asm_8:  Spacing = 2; return ARM::VLD4q8_UPD;
+  case ARM::VLD4qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+  case ARM::VLD4qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+  case ARM::VLD4dWB_register_Asm_8:  Spacing = 1; return ARM::VLD4d8_UPD;
+  case ARM::VLD4dWB_register_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+  case ARM::VLD4dWB_register_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+  case ARM::VLD4qWB_register_Asm_8:  Spacing = 2; return ARM::VLD4q8_UPD;
+  case ARM::VLD4qWB_register_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+  case ARM::VLD4qWB_register_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+  case ARM::VLD4dAsm_8:  Spacing = 1; return ARM::VLD4d8;
+  case ARM::VLD4dAsm_16: Spacing = 1; return ARM::VLD4d16;
+  case ARM::VLD4dAsm_32: Spacing = 1; return ARM::VLD4d32;
+  case ARM::VLD4qAsm_8:  Spacing = 2; return ARM::VLD4q8;
+  case ARM::VLD4qAsm_16: Spacing = 2; return ARM::VLD4q16;
+  case ARM::VLD4qAsm_32: Spacing = 2; return ARM::VLD4q32;
+  }
+}
+
+bool ARMAsmParser::processInstruction(MCInst &Inst,
+                                      const OperandVector &Operands) {
+  switch (Inst.getOpcode()) {
+  // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
+  case ARM::LDRT_POST:
+  case ARM::LDRBT_POST: {
+    const unsigned Opcode =
+      (Inst.getOpcode() == ARM::LDRT_POST) ? ARM::LDRT_POST_IMM
+                                           : ARM::LDRBT_POST_IMM;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
+  case ARM::STRT_POST:
+  case ARM::STRBT_POST: {
+    const unsigned Opcode =
+      (Inst.getOpcode() == ARM::STRT_POST) ? ARM::STRT_POST_IMM
+                                           : ARM::STRBT_POST_IMM;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode);
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  // Alias for alternate form of 'ADR Rd, #imm' instruction.
+  case ARM::ADDri: {
+    if (Inst.getOperand(1).getReg() != ARM::PC ||
+        Inst.getOperand(5).getReg() != 0)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::ADR);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  // Aliases for alternate PC+imm syntax of LDR instructions.
+  case ARM::t2LDRpcrel:
+    // Select the narrow version if the immediate will fit.
+    if (Inst.getOperand(1).getImm() > 0 &&
+        Inst.getOperand(1).getImm() <= 0xff &&
+        !(static_cast<ARMOperand &>(*Operands[2]).isToken() &&
+          static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w"))
+      Inst.setOpcode(ARM::tLDRpci);
+    else
+      Inst.setOpcode(ARM::t2LDRpci);
+    return true;
+  case ARM::t2LDRBpcrel:
+    Inst.setOpcode(ARM::t2LDRBpci);
+    return true;
+  case ARM::t2LDRHpcrel:
+    Inst.setOpcode(ARM::t2LDRHpci);
+    return true;
+  case ARM::t2LDRSBpcrel:
+    Inst.setOpcode(ARM::t2LDRSBpci);
+    return true;
+  case ARM::t2LDRSHpcrel:
+    Inst.setOpcode(ARM::t2LDRSHpci);
+    return true;
+  // Handle NEON VST complex aliases.
+  case ARM::VST1LNdWB_register_Asm_8:
+  case ARM::VST1LNdWB_register_Asm_16:
+  case ARM::VST1LNdWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdWB_register_Asm_8:
+  case ARM::VST2LNdWB_register_Asm_16:
+  case ARM::VST2LNdWB_register_Asm_32:
+  case ARM::VST2LNqWB_register_Asm_16:
+  case ARM::VST2LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdWB_register_Asm_8:
+  case ARM::VST3LNdWB_register_Asm_16:
+  case ARM::VST3LNdWB_register_Asm_32:
+  case ARM::VST3LNqWB_register_Asm_16:
+  case ARM::VST3LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdWB_register_Asm_8:
+  case ARM::VST4LNdWB_register_Asm_16:
+  case ARM::VST4LNdWB_register_Asm_32:
+  case ARM::VST4LNqWB_register_Asm_16:
+  case ARM::VST4LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST1LNdWB_fixed_Asm_8:
+  case ARM::VST1LNdWB_fixed_Asm_16:
+  case ARM::VST1LNdWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdWB_fixed_Asm_8:
+  case ARM::VST2LNdWB_fixed_Asm_16:
+  case ARM::VST2LNdWB_fixed_Asm_32:
+  case ARM::VST2LNqWB_fixed_Asm_16:
+  case ARM::VST2LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdWB_fixed_Asm_8:
+  case ARM::VST3LNdWB_fixed_Asm_16:
+  case ARM::VST3LNdWB_fixed_Asm_32:
+  case ARM::VST3LNqWB_fixed_Asm_16:
+  case ARM::VST3LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdWB_fixed_Asm_8:
+  case ARM::VST4LNdWB_fixed_Asm_16:
+  case ARM::VST4LNdWB_fixed_Asm_32:
+  case ARM::VST4LNqWB_fixed_Asm_16:
+  case ARM::VST4LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST1LNdAsm_8:
+  case ARM::VST1LNdAsm_16:
+  case ARM::VST1LNdAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdAsm_8:
+  case ARM::VST2LNdAsm_16:
+  case ARM::VST2LNdAsm_32:
+  case ARM::VST2LNqAsm_16:
+  case ARM::VST2LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdAsm_8:
+  case ARM::VST3LNdAsm_16:
+  case ARM::VST3LNdAsm_32:
+  case ARM::VST3LNqAsm_16:
+  case ARM::VST3LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdAsm_8:
+  case ARM::VST4LNdAsm_16:
+  case ARM::VST4LNdAsm_32:
+  case ARM::VST4LNqAsm_16:
+  case ARM::VST4LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // Handle NEON VLD complex aliases.
+  case ARM::VLD1LNdWB_register_Asm_8:
+  case ARM::VLD1LNdWB_register_Asm_16:
+  case ARM::VLD1LNdWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdWB_register_Asm_8:
+  case ARM::VLD2LNdWB_register_Asm_16:
+  case ARM::VLD2LNdWB_register_Asm_32:
+  case ARM::VLD2LNqWB_register_Asm_16:
+  case ARM::VLD2LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdWB_register_Asm_8:
+  case ARM::VLD3LNdWB_register_Asm_16:
+  case ARM::VLD3LNdWB_register_Asm_32:
+  case ARM::VLD3LNqWB_register_Asm_16:
+  case ARM::VLD3LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdWB_register_Asm_8:
+  case ARM::VLD4LNdWB_register_Asm_16:
+  case ARM::VLD4LNdWB_register_Asm_32:
+  case ARM::VLD4LNqWB_register_Asm_16:
+  case ARM::VLD4LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD1LNdWB_fixed_Asm_8:
+  case ARM::VLD1LNdWB_fixed_Asm_16:
+  case ARM::VLD1LNdWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdWB_fixed_Asm_8:
+  case ARM::VLD2LNdWB_fixed_Asm_16:
+  case ARM::VLD2LNdWB_fixed_Asm_32:
+  case ARM::VLD2LNqWB_fixed_Asm_16:
+  case ARM::VLD2LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdWB_fixed_Asm_8:
+  case ARM::VLD3LNdWB_fixed_Asm_16:
+  case ARM::VLD3LNdWB_fixed_Asm_32:
+  case ARM::VLD3LNqWB_fixed_Asm_16:
+  case ARM::VLD3LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdWB_fixed_Asm_8:
+  case ARM::VLD4LNdWB_fixed_Asm_16:
+  case ARM::VLD4LNdWB_fixed_Asm_32:
+  case ARM::VLD4LNqWB_fixed_Asm_16:
+  case ARM::VLD4LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD1LNdAsm_8:
+  case ARM::VLD1LNdAsm_16:
+  case ARM::VLD1LNdAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdAsm_8:
+  case ARM::VLD2LNdAsm_16:
+  case ARM::VLD2LNdAsm_32:
+  case ARM::VLD2LNqAsm_16:
+  case ARM::VLD2LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdAsm_8:
+  case ARM::VLD3LNdAsm_16:
+  case ARM::VLD3LNdAsm_32:
+  case ARM::VLD3LNqAsm_16:
+  case ARM::VLD3LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdAsm_8:
+  case ARM::VLD4LNdAsm_16:
+  case ARM::VLD4LNdAsm_32:
+  case ARM::VLD4LNqAsm_16:
+  case ARM::VLD4LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD3DUP single 3-element structure to all lanes instructions.
+  case ARM::VLD3DUPdAsm_8:
+  case ARM::VLD3DUPdAsm_16:
+  case ARM::VLD3DUPdAsm_32:
+  case ARM::VLD3DUPqAsm_8:
+  case ARM::VLD3DUPqAsm_16:
+  case ARM::VLD3DUPqAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3DUPdWB_fixed_Asm_8:
+  case ARM::VLD3DUPdWB_fixed_Asm_16:
+  case ARM::VLD3DUPdWB_fixed_Asm_32:
+  case ARM::VLD3DUPqWB_fixed_Asm_8:
+  case ARM::VLD3DUPqWB_fixed_Asm_16:
+  case ARM::VLD3DUPqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3DUPdWB_register_Asm_8:
+  case ARM::VLD3DUPdWB_register_Asm_16:
+  case ARM::VLD3DUPdWB_register_Asm_32:
+  case ARM::VLD3DUPqWB_register_Asm_8:
+  case ARM::VLD3DUPqWB_register_Asm_16:
+  case ARM::VLD3DUPqWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD3 multiple 3-element structure instructions.
+  case ARM::VLD3dAsm_8:
+  case ARM::VLD3dAsm_16:
+  case ARM::VLD3dAsm_32:
+  case ARM::VLD3qAsm_8:
+  case ARM::VLD3qAsm_16:
+  case ARM::VLD3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_fixed_Asm_8:
+  case ARM::VLD3dWB_fixed_Asm_16:
+  case ARM::VLD3dWB_fixed_Asm_32:
+  case ARM::VLD3qWB_fixed_Asm_8:
+  case ARM::VLD3qWB_fixed_Asm_16:
+  case ARM::VLD3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_register_Asm_8:
+  case ARM::VLD3dWB_register_Asm_16:
+  case ARM::VLD3dWB_register_Asm_32:
+  case ARM::VLD3qWB_register_Asm_8:
+  case ARM::VLD3qWB_register_Asm_16:
+  case ARM::VLD3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4DUP single 3-element structure to all lanes instructions.
+  case ARM::VLD4DUPdAsm_8:
+  case ARM::VLD4DUPdAsm_16:
+  case ARM::VLD4DUPdAsm_32:
+  case ARM::VLD4DUPqAsm_8:
+  case ARM::VLD4DUPqAsm_16:
+  case ARM::VLD4DUPqAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4DUPdWB_fixed_Asm_8:
+  case ARM::VLD4DUPdWB_fixed_Asm_16:
+  case ARM::VLD4DUPdWB_fixed_Asm_32:
+  case ARM::VLD4DUPqWB_fixed_Asm_8:
+  case ARM::VLD4DUPqWB_fixed_Asm_16:
+  case ARM::VLD4DUPqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4DUPdWB_register_Asm_8:
+  case ARM::VLD4DUPdWB_register_Asm_16:
+  case ARM::VLD4DUPdWB_register_Asm_32:
+  case ARM::VLD4DUPqWB_register_Asm_8:
+  case ARM::VLD4DUPqWB_register_Asm_16:
+  case ARM::VLD4DUPqWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4 multiple 4-element structure instructions.
+  case ARM::VLD4dAsm_8:
+  case ARM::VLD4dAsm_16:
+  case ARM::VLD4dAsm_32:
+  case ARM::VLD4qAsm_8:
+  case ARM::VLD4qAsm_16:
+  case ARM::VLD4qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4dWB_fixed_Asm_8:
+  case ARM::VLD4dWB_fixed_Asm_16:
+  case ARM::VLD4dWB_fixed_Asm_32:
+  case ARM::VLD4qWB_fixed_Asm_8:
+  case ARM::VLD4qWB_fixed_Asm_16:
+  case ARM::VLD4qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4dWB_register_Asm_8:
+  case ARM::VLD4dWB_register_Asm_16:
+  case ARM::VLD4dWB_register_Asm_32:
+  case ARM::VLD4qWB_register_Asm_8:
+  case ARM::VLD4qWB_register_Asm_16:
+  case ARM::VLD4qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST3 multiple 3-element structure instructions.
+  case ARM::VST3dAsm_8:
+  case ARM::VST3dAsm_16:
+  case ARM::VST3dAsm_32:
+  case ARM::VST3qAsm_8:
+  case ARM::VST3qAsm_16:
+  case ARM::VST3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_fixed_Asm_8:
+  case ARM::VST3dWB_fixed_Asm_16:
+  case ARM::VST3dWB_fixed_Asm_32:
+  case ARM::VST3qWB_fixed_Asm_8:
+  case ARM::VST3qWB_fixed_Asm_16:
+  case ARM::VST3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_register_Asm_8:
+  case ARM::VST3dWB_register_Asm_16:
+  case ARM::VST3dWB_register_Asm_32:
+  case ARM::VST3qWB_register_Asm_8:
+  case ARM::VST3qWB_register_Asm_16:
+  case ARM::VST3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST4 multiple 3-element structure instructions.
+  case ARM::VST4dAsm_8:
+  case ARM::VST4dAsm_16:
+  case ARM::VST4dAsm_32:
+  case ARM::VST4qAsm_8:
+  case ARM::VST4qAsm_16:
+  case ARM::VST4qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4dWB_fixed_Asm_8:
+  case ARM::VST4dWB_fixed_Asm_16:
+  case ARM::VST4dWB_fixed_Asm_32:
+  case ARM::VST4qWB_fixed_Asm_8:
+  case ARM::VST4qWB_fixed_Asm_16:
+  case ARM::VST4qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4dWB_register_Asm_8:
+  case ARM::VST4dWB_register_Asm_16:
+  case ARM::VST4dWB_register_Asm_32:
+  case ARM::VST4qWB_register_Asm_8:
+  case ARM::VST4qWB_register_Asm_16:
+  case ARM::VST4qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // Handle encoding choice for the shift-immediate instructions.
+  case ARM::t2LSLri:
+  case ARM::t2LSRri:
+  case ARM::t2ASRri: {
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+          static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ARM::t2LSLri: NewOpc = ARM::tLSLri; break;
+      case ARM::t2LSRri: NewOpc = ARM::tLSRri; break;
+      case ARM::t2ASRri: NewOpc = ARM::tASRri; break;
+      }
+      // The Thumb1 operands aren't in the same order. Awesome, eh?
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+
+  // Handle the Thumb2 mode MOV complex aliases.
+  case ARM::t2MOVsr:
+  case ARM::t2MOVSsr: {
+    // Which instruction to expand to depends on the CCOut operand and
+    // whether we're in an IT block if the register operands are low
+    // registers.
+    bool isNarrow = false;
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        isARMLowRegister(Inst.getOperand(2).getReg()) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr))
+      isNarrow = true;
+    MCInst TmpInst;
+    unsigned newOpc;
+    switch(ARM_AM::getSORegShOp(Inst.getOperand(3).getImm())) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRrr : ARM::t2ASRrr; break;
+    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRrr : ARM::t2LSRrr; break;
+    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLrr : ARM::t2LSLrr; break;
+    case ARM_AM::ror: newOpc = isNarrow ? ARM::tROR   : ARM::t2RORrr; break;
+    }
+    TmpInst.setOpcode(newOpc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    if (isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    if (!isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2MOVsi:
+  case ARM::t2MOVSsi: {
+    // Which instruction to expand to depends on the CCOut operand and
+    // whether we're in an IT block if the register operands are low
+    // registers.
+    bool isNarrow = false;
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi))
+      isNarrow = true;
+    MCInst TmpInst;
+    unsigned newOpc;
+    switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+    case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+    case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+    }
+    unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+    if (Amount == 32) Amount = 0;
+    TmpInst.setOpcode(newOpc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    if (isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    if (newOpc != ARM::t2RRX)
+      TmpInst.addOperand(MCOperand::CreateImm(Amount));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    if (!isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+    Inst = TmpInst;
+    return true;
+  }
+  // Handle the ARM mode MOV complex aliases.
+  case ARM::ASRr:
+  case ARM::LSRr:
+  case ARM::LSLr:
+  case ARM::RORr: {
+    ARM_AM::ShiftOpc ShiftTy;
+    switch(Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ASRr: ShiftTy = ARM_AM::asr; break;
+    case ARM::LSRr: ShiftTy = ARM_AM::lsr; break;
+    case ARM::LSLr: ShiftTy = ARM_AM::lsl; break;
+    case ARM::RORr: ShiftTy = ARM_AM::ror; break;
+    }
+    unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, 0);
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVsr);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // Rm
+    TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::ASRi:
+  case ARM::LSRi:
+  case ARM::LSLi:
+  case ARM::RORi: {
+    ARM_AM::ShiftOpc ShiftTy;
+    switch(Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ASRi: ShiftTy = ARM_AM::asr; break;
+    case ARM::LSRi: ShiftTy = ARM_AM::lsr; break;
+    case ARM::LSLi: ShiftTy = ARM_AM::lsl; break;
+    case ARM::RORi: ShiftTy = ARM_AM::ror; break;
+    }
+    // A shift by zero is a plain MOVr, not a MOVsi.
+    unsigned Amt = Inst.getOperand(2).getImm();
+    unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi;
+    // A shift by 32 should be encoded as 0 when permitted
+    if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr))
+      Amt = 0;
+    unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt);
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    if (Opc == ARM::MOVsi)
+      TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::RRXi: {
+    unsigned Shifter = ARM_AM::getSORegOpc(ARM_AM::rrx, 0);
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVsi);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2LDMIA_UPD: {
+    // If this is a load of a single register, then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (Inst.getNumOperands() != 5)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::t2LDR_POST);
+    TmpInst.addOperand(Inst.getOperand(4)); // Rt
+    TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::CreateImm(4));
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2STMDB_UPD: {
+    // If this is a store of a single register, then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (Inst.getNumOperands() != 5)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::t2STR_PRE);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(4)); // Rt
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::CreateImm(-4));
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::LDMIA_UPD:
+    // If this is a load of a single register via a 'pop', then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "pop" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::LDR_POST_IMM);
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(1)); // Rn
+      TmpInst.addOperand(MCOperand::CreateReg(0));  // am2offset
+      TmpInst.addOperand(MCOperand::CreateImm(4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  case ARM::STMDB_UPD:
+    // If this is a store of a single register via a 'push', then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "push" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::STR_PRE_IMM);
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12
+      TmpInst.addOperand(MCOperand::CreateImm(-4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  case ARM::t2ADDri12:
+    // If the immediate fits for encoding T3 (t2ADDri) and the generic "add"
+    // mnemonic was used (not "addw"), encoding T3 is preferred.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" ||
+        ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+      break;
+    Inst.setOpcode(ARM::t2ADDri);
+    Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+    break;
+  case ARM::t2SUBri12:
+    // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub"
+    // mnemonic was used (not "subw"), encoding T3 is preferred.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" ||
+        ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+      break;
+    Inst.setOpcode(ARM::t2SUBri);
+    Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+    break;
+  case ARM::tADDi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+      Inst.setOpcode(ARM::tADDi3);
+      return true;
+    }
+    break;
+  case ARM::tSUBi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+      Inst.setOpcode(ARM::tSUBi3);
+      return true;
+    }
+    break;
+  case ARM::t2ADDri:
+  case ARM::t2SUBri: {
+    // If the destination and first source operand are the same, and
+    // the flags are compatible with the current IT status, use encoding T2
+    // instead of T3. For compatibility with the system 'as'. Make sure the
+    // wide encoding wasn't explicit.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+        !isARMLowRegister(Inst.getOperand(0).getReg()) ||
+        (unsigned)Inst.getOperand(2).getImm() > 255 ||
+        ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+      break;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
+                      ARM::tADDi8 : ARM::tSUBi8);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(5));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2ADDrr: {
+    // If the destination and first source operand are the same, and
+    // there's no setting of the flags, use encoding T2 instead of T3.
+    // Note that this is only for ADD, not SUB. This mirrors the system
+    // 'as' behaviour. Make sure the wide encoding wasn't explicit.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+        Inst.getOperand(5).getReg() != 0 ||
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+      break;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::tADDhirr);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need to use the 32-bit encoding if it's available.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      Inst.setOpcode(ARM::t2ADDrr);
+      Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+      return true;
+    }
+    break;
+  }
+  case ARM::tB:
+    // A Thumb conditional branch outside of an IT block is a tBcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) {
+      Inst.setOpcode(ARM::tBcc);
+      return true;
+    }
+    break;
+  case ARM::t2B:
+    // A Thumb2 conditional branch outside of an IT block is a t2Bcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()){
+      Inst.setOpcode(ARM::t2Bcc);
+      return true;
+    }
+    break;
+  case ARM::t2Bcc:
+    // If the conditional is AL or we're in an IT block, we really want t2B.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock()) {
+      Inst.setOpcode(ARM::t2B);
+      return true;
+    }
+    break;
+  case ARM::tBcc:
+    // If the conditional is AL, we really want tB.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL) {
+      Inst.setOpcode(ARM::tB);
+      return true;
+    }
+    break;
+  case ARM::tLDMIA: {
+    // If the register list contains any high registers, or if the writeback
+    // doesn't match what tLDMIA can do, we need to use the 32-bit encoding
+    // instead if we're in Thumb2. Otherwise, this should have generated
+    // an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool hasWritebackToken =
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
+        (!listContainsBase && !hasWritebackToken) ||
+        (listContainsBase && hasWritebackToken)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA);
+      // If we're switching to the updating version, we need to insert
+      // the writeback tied operand.
+      if (hasWritebackToken)
+        Inst.insert(Inst.begin(),
+                    MCOperand::CreateReg(Inst.getOperand(0).getReg()));
+      return true;
+    }
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(ARM::t2STMIA_UPD);
+      return true;
+    }
+    break;
+  }
+  case ARM::tPOP: {
+    bool listContainsBase;
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    if (!checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase))
+      return false;
+    assert (isThumbTwo());
+    Inst.setOpcode(ARM::t2LDMIA_UPD);
+    // Add the base register and writeback operands.
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    return true;
+  }
+  case ARM::tPUSH: {
+    bool listContainsBase;
+    if (!checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase))
+      return false;
+    assert (isThumbTwo());
+    Inst.setOpcode(ARM::t2STMDB_UPD);
+    // Add the base register and writeback operands.
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    return true;
+  }
+  case ARM::t2MOVi: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        (unsigned)Inst.getOperand(1).getImm() <= 255 &&
+        ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
+          Inst.getOperand(4).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+        (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+         static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+      // The operands aren't in the same order for tMOVi8...
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVi8);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::t2MOVr: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == ARMCC::AL &&
+        Inst.getOperand(4).getReg() == ARM::CPSR &&
+        (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+         static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+      // The operands aren't the same for tMOV[S]r... (no cc_out)
+      MCInst TmpInst;
+      TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::t2SXTH:
+  case ARM::t2SXTB:
+  case ARM::t2UXTH:
+  case ARM::t2UXTB: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+         static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("Illegal opcode!");
+      case ARM::t2SXTH: NewOpc = ARM::tSXTH; break;
+      case ARM::t2SXTB: NewOpc = ARM::tSXTB; break;
+      case ARM::t2UXTH: NewOpc = ARM::tUXTH; break;
+      case ARM::t2UXTB: NewOpc = ARM::tUXTB; break;
+      }
+      // The operands aren't the same for thumb1 (no rotate operand).
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::MOVsi: {
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
+    // rrx shifts and asr/lsr of #32 is encoded as 0
+    if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) 
+      return false;
+    if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) {
+      // Shifting by zero is accepted as a vanilla 'MOVr'
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ANDrsi:
+  case ARM::ORRrsi:
+  case ARM::EORrsi:
+  case ARM::BICrsi:
+  case ARM::SUBrsi:
+  case ARM::ADDrsi: {
+    unsigned newOpc;
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(3).getImm());
+    if (SOpc == ARM_AM::rrx) return false;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ANDrsi: newOpc = ARM::ANDrr; break;
+    case ARM::ORRrsi: newOpc = ARM::ORRrr; break;
+    case ARM::EORrsi: newOpc = ARM::EORrr; break;
+    case ARM::BICrsi: newOpc = ARM::BICrr; break;
+    case ARM::SUBrsi: newOpc = ARM::SUBrr; break;
+    case ARM::ADDrsi: newOpc = ARM::ADDrr; break;
+    }
+    // If the shift is by zero, use the non-shifted instruction definition.
+    // The exception is for right shifts, where 0 == 32
+    if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 &&
+        !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(newOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(6));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ITasm:
+  case ARM::t2IT: {
+    // The mask bits for all but the first condition are represented as
+    // the low bit of the condition code value implies 't'. We currently
+    // always have 1 implies 't', so XOR toggle the bits if the low bit
+    // of the condition code is zero. 
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned Mask = MO.getImm();
+    unsigned OrigMask = Mask;
+    unsigned TZ = countTrailingZeros(Mask);
+    if ((Inst.getOperand(0).getImm() & 1) == 0) {
+      assert(Mask && TZ <= 3 && "illegal IT mask value!");
+      Mask ^= (0xE << TZ) & 0xF;
+    }
+    MO.setImm(Mask);
+
+    // Set up the IT block state according to the IT instruction we just
+    // matched.
+    assert(!inITBlock() && "nested IT blocks?!");
+    ITState.Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+    ITState.Mask = OrigMask; // Use the original mask, not the updated one.
+    ITState.CurPosition = 0;
+    ITState.FirstCond = true;
+    break;
+  }
+  case ARM::t2LSLrr:
+  case ARM::t2LSRrr:
+  case ARM::t2ASRrr:
+  case ARM::t2SBCrr:
+  case ARM::t2RORrr:
+  case ARM::t2BICrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions when permissible.
+    if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+         isARMLowRegister(Inst.getOperand(2).getReg())) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+        (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+         !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+             ".w"))) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+        default: llvm_unreachable("unexpected opcode");
+        case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break;
+        case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break;
+        case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break;
+        case ARM::t2SBCrr: NewOpc = ARM::tSBC; break;
+        case ARM::t2RORrr: NewOpc = ARM::tROR; break;
+        case ARM::t2BICrr: NewOpc = ARM::tBIC; break;
+      }
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::t2ANDrr:
+  case ARM::t2EORrr:
+  case ARM::t2ADCrr:
+  case ARM::t2ORRrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions when permissible.
+    // These instructions are special in that they are commutable, so shorter encodings
+    // are available more often.
+    if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+         isARMLowRegister(Inst.getOperand(2).getReg())) &&
+        (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
+         Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
+        ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+        (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+         !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+             ".w"))) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+        default: llvm_unreachable("unexpected opcode");
+        case ARM::t2ADCrr: NewOpc = ARM::tADC; break;
+        case ARM::t2ANDrr: NewOpc = ARM::tAND; break;
+        case ARM::t2EORrr: NewOpc = ARM::tEOR; break;
+        case ARM::t2ORRrr: NewOpc = ARM::tORR; break;
+      }
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) {
+        TmpInst.addOperand(Inst.getOperand(1));
+        TmpInst.addOperand(Inst.getOperand(2));
+      } else {
+        TmpInst.addOperand(Inst.getOperand(2));
+        TmpInst.addOperand(Inst.getOperand(1));
+      }
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  }
+  return false;
+}
+
+unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  // 16-bit thumb arithmetic instructions either require or preclude the 'S'
+  // suffix depending on whether they're in an IT block or not.
+  unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &MCID = MII.get(Opc);
+  if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+    assert(MCID.hasOptionalDef() &&
+           "optionally flag setting instruction missing optional def operand");
+    assert(MCID.NumOperands == Inst.getNumOperands() &&
+           "operand count mismatch!");
+    // Find the optional-def operand (cc_out).
+    unsigned OpNo;
+    for (OpNo = 0;
+         !MCID.OpInfo[OpNo].isOptionalDef() && OpNo < MCID.NumOperands;
+         ++OpNo)
+      ;
+    // If we're parsing Thumb1, reject it completely.
+    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+      return Match_MnemonicFail;
+    // If we're parsing Thumb2, which form is legal depends on whether we're
+    // in an IT block.
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
+        !inITBlock())
+      return Match_RequiresITBlock;
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
+        inITBlock())
+      return Match_RequiresNotITBlock;
+  }
+  // Some high-register supporting Thumb1 encodings only allow both registers
+  // to be from r0-r7 when in Thumb2.
+  else if (Opc == ARM::tADDhirr && isThumbOne() &&
+           isARMLowRegister(Inst.getOperand(1).getReg()) &&
+           isARMLowRegister(Inst.getOperand(2).getReg()))
+    return Match_RequiresThumb2;
+  // Others only require ARMv6 or later.
+  else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() &&
+           isARMLowRegister(Inst.getOperand(0).getReg()) &&
+           isARMLowRegister(Inst.getOperand(1).getReg()))
+    return Match_RequiresV6;
+  return Match_Success;
+}
+
+namespace llvm {
+template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
+  return true; // In an assembly source, no need to second-guess
+}
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, unsigned &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult;
+
+  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                     MatchingInlineAsm);
+  switch (MatchResult) {
+  default: break;
+  case Match_Success:
+    // Context sensitive operand constraints aren't handled by the matcher,
+    // so check them here.
+    if (validateInstruction(Inst, Operands)) {
+      // Still progress the IT block, otherwise one wrong condition causes
+      // nasty cascading errors.
+      forwardITPosition();
+      return true;
+    }
+
+    { // processInstruction() updates inITBlock state, we need to save it away
+      bool wasInITBlock = inITBlock();
+
+      // Some instructions need post-processing to, for example, tweak which
+      // encoding is selected. Loop on it while changes happen so the
+      // individual transformations can chain off each other. E.g.,
+      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+      while (processInstruction(Inst, Operands))
+        ;
+
+      // Only after the instruction is fully processed, we can validate it
+      if (wasInITBlock && hasV8Ops() && isThumb() &&
+          !isV8EligibleForIT(&Inst)) {
+        Warning(IDLoc, "deprecated instruction in IT block");
+      }
+    }
+
+    // Only move forward at the very end so that everything in validate
+    // and process gets a consistent answer about whether we're in an IT
+    // block.
+    forwardITPosition();
+
+    // ITasm is an ARM mode pseudo-instruction that just sets the ITblock and
+    // doesn't actually encode.
+    if (Inst.getOpcode() == ARM::ITasm)
+      return false;
+
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing (Thumb vs. ARM, e.g.).
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction",
+                 ((ARMOperand &)*Operands[0]).getLocRange());
+  case Match_RequiresNotITBlock:
+    return Error(IDLoc, "flag setting instruction only valid outside IT block");
+  case Match_RequiresITBlock:
+    return Error(IDLoc, "instruction only valid inside IT block");
+  case Match_RequiresV6:
+    return Error(IDLoc, "instruction variant requires ARMv6 or later");
+  case Match_RequiresThumb2:
+    return Error(IDLoc, "instruction variant requires Thumb2");
+  case Match_ImmRange0_15: {
+    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+  }
+  case Match_ImmRange0_239: {
+    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
+  }
+  case Match_AlignedMemoryRequiresNone:
+  case Match_DupAlignedMemoryRequiresNone:
+  case Match_AlignedMemoryRequires16:
+  case Match_DupAlignedMemoryRequires16:
+  case Match_AlignedMemoryRequires32:
+  case Match_DupAlignedMemoryRequires32:
+  case Match_AlignedMemoryRequires64:
+  case Match_DupAlignedMemoryRequires64:
+  case Match_AlignedMemoryRequires64or128:
+  case Match_DupAlignedMemoryRequires64or128:
+  case Match_AlignedMemoryRequires64or128or256:
+  {
+    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    switch (MatchResult) {
+      default:
+        llvm_unreachable("Missing Match_Aligned type");
+      case Match_AlignedMemoryRequiresNone:
+      case Match_DupAlignedMemoryRequiresNone:
+        return Error(ErrorLoc, "alignment must be omitted");
+      case Match_AlignedMemoryRequires16:
+      case Match_DupAlignedMemoryRequires16:
+        return Error(ErrorLoc, "alignment must be 16 or omitted");
+      case Match_AlignedMemoryRequires32:
+      case Match_DupAlignedMemoryRequires32:
+        return Error(ErrorLoc, "alignment must be 32 or omitted");
+      case Match_AlignedMemoryRequires64:
+      case Match_DupAlignedMemoryRequires64:
+        return Error(ErrorLoc, "alignment must be 64 or omitted");
+      case Match_AlignedMemoryRequires64or128:
+      case Match_DupAlignedMemoryRequires64or128:
+        return Error(ErrorLoc, "alignment must be 64, 128 or omitted");
+      case Match_AlignedMemoryRequires64or128or256:
+        return Error(ErrorLoc, "alignment must be 64, 128, 256 or omitted");
+    }
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+/// parseDirective parses the arm specific directives
+bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+  const MCObjectFileInfo::Environment Format =
+    getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return parseLiteralValues(4, DirectiveID.getLoc());
+  else if (IDVal == ".short" || IDVal == ".hword")
+    return parseLiteralValues(2, DirectiveID.getLoc());
+  else if (IDVal == ".thumb")
+    return parseDirectiveThumb(DirectiveID.getLoc());
+  else if (IDVal == ".arm")
+    return parseDirectiveARM(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_func")
+    return parseDirectiveThumbFunc(DirectiveID.getLoc());
+  else if (IDVal == ".code")
+    return parseDirectiveCode(DirectiveID.getLoc());
+  else if (IDVal == ".syntax")
+    return parseDirectiveSyntax(DirectiveID.getLoc());
+  else if (IDVal == ".unreq")
+    return parseDirectiveUnreq(DirectiveID.getLoc());
+  else if (IDVal == ".fnend")
+    return parseDirectiveFnEnd(DirectiveID.getLoc());
+  else if (IDVal == ".cantunwind")
+    return parseDirectiveCantUnwind(DirectiveID.getLoc());
+  else if (IDVal == ".personality")
+    return parseDirectivePersonality(DirectiveID.getLoc());
+  else if (IDVal == ".handlerdata")
+    return parseDirectiveHandlerData(DirectiveID.getLoc());
+  else if (IDVal == ".setfp")
+    return parseDirectiveSetFP(DirectiveID.getLoc());
+  else if (IDVal == ".pad")
+    return parseDirectivePad(DirectiveID.getLoc());
+  else if (IDVal == ".save")
+    return parseDirectiveRegSave(DirectiveID.getLoc(), false);
+  else if (IDVal == ".vsave")
+    return parseDirectiveRegSave(DirectiveID.getLoc(), true);
+  else if (IDVal == ".ltorg" || IDVal == ".pool")
+    return parseDirectiveLtorg(DirectiveID.getLoc());
+  else if (IDVal == ".even")
+    return parseDirectiveEven(DirectiveID.getLoc());
+  else if (IDVal == ".personalityindex")
+    return parseDirectivePersonalityIndex(DirectiveID.getLoc());
+  else if (IDVal == ".unwind_raw")
+    return parseDirectiveUnwindRaw(DirectiveID.getLoc());
+  else if (IDVal == ".movsp")
+    return parseDirectiveMovSP(DirectiveID.getLoc());
+  else if (IDVal == ".arch_extension")
+    return parseDirectiveArchExtension(DirectiveID.getLoc());
+  else if (IDVal == ".align")
+    return parseDirectiveAlign(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_set")
+    return parseDirectiveThumbSet(DirectiveID.getLoc());
+
+  if (!IsMachO) {
+    if (IDVal == ".arch")
+      return parseDirectiveArch(DirectiveID.getLoc());
+    else if (IDVal == ".cpu")
+      return parseDirectiveCPU(DirectiveID.getLoc());
+    else if (IDVal == ".eabi_attribute")
+      return parseDirectiveEabiAttr(DirectiveID.getLoc());
+    else if (IDVal == ".fpu")
+      return parseDirectiveFPU(DirectiveID.getLoc());
+    else if (IDVal == ".fnstart")
+      return parseDirectiveFnStart(DirectiveID.getLoc());
+    else if (IDVal == ".inst")
+      return parseDirectiveInst(DirectiveID.getLoc());
+    else if (IDVal == ".inst.n")
+      return parseDirectiveInst(DirectiveID.getLoc(), 'n');
+    else if (IDVal == ".inst.w")
+      return parseDirectiveInst(DirectiveID.getLoc(), 'w');
+    else if (IDVal == ".object_arch")
+      return parseDirectiveObjectArch(DirectiveID.getLoc());
+    else if (IDVal == ".tlsdescseq")
+      return parseDirectiveTLSDescSeq(DirectiveID.getLoc());
+  }
+
+  return true;
+}
+
+/// parseLiteralValues
+///  ::= .hword expression [, expression]*
+///  ::= .short expression [, expression]*
+///  ::= .word expression [, expression]*
+bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value)) {
+        Parser.eatToEndOfStatement();
+        return false;
+      }
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma)) {
+        Error(L, "unexpected token in directive");
+        return false;
+      }
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveThumb
+///  ::= .thumb
+bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (!hasThumb()) {
+    Error(L, "target does not support Thumb mode");
+    return false;
+  }
+
+  if (!isThumb())
+    SwitchMode();
+
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  return false;
+}
+
+/// parseDirectiveARM
+///  ::= .arm
+bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (!hasARM()) {
+    Error(L, "target does not support ARM mode");
+    return false;
+  }
+
+  if (isThumb())
+    SwitchMode();
+
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  return false;
+}
+
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+  if (NextSymbolIsThumb) {
+    getParser().getStreamer().EmitThumbFunc(Symbol);
+    NextSymbolIsThumb = false;
+  }
+}
+
+/// parseDirectiveThumbFunc
+///  ::= .thumbfunc symbol_name
+bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
+  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+  bool isMachO = MAI->hasSubsectionsViaSymbols();
+
+  // Darwin asm has (optionally) function name after .thumb_func direction
+  // ELF doesn't
+  if (isMachO) {
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.isNot(AsmToken::EndOfStatement)) {
+      if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) {
+        Error(L, "unexpected token in .thumb_func directive");
+        return false;
+      }
+
+      MCSymbol *Func =
+          getParser().getContext().GetOrCreateSymbol(Tok.getIdentifier());
+      getParser().getStreamer().EmitThumbFunc(Func);
+      Parser.Lex(); // Consume the identifier token.
+      return false;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  NextSymbolIsThumb = true;
+  return false;
+}
+
+/// parseDirectiveSyntax
+///  ::= .syntax unified | divided
+bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(L, "unexpected token in .syntax directive");
+    return false;
+  }
+
+  StringRef Mode = Tok.getString();
+  if (Mode == "unified" || Mode == "UNIFIED") {
+    Parser.Lex();
+  } else if (Mode == "divided" || Mode == "DIVIDED") {
+    Error(L, "'.syntax divided' arm asssembly not supported");
+    return false;
+  } else {
+    Error(L, "unrecognized syntax mode in .syntax directive");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// parseDirectiveCode
+///  ::= .code 16 | 32
+bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Integer)) {
+    Error(L, "unexpected token in .code directive");
+    return false;
+  }
+  int64_t Val = Parser.getTok().getIntVal();
+  if (Val != 16 && Val != 32) {
+    Error(L, "invalid operand to .code directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (Val == 16) {
+    if (!hasThumb()) {
+      Error(L, "target does not support Thumb mode");
+      return false;
+    }
+
+    if (!isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  } else {
+    if (!hasARM()) {
+      Error(L, "target does not support ARM mode");
+      return false;
+    }
+
+    if (isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  }
+
+  return false;
+}
+
+/// parseDirectiveReq
+///  ::= name .req registername
+bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  Parser.Lex(); // Eat the '.req' token.
+  unsigned Reg;
+  SMLoc SRegLoc, ERegLoc;
+  if (ParseRegister(Reg, SRegLoc, ERegLoc)) {
+    Parser.eatToEndOfStatement();
+    Error(SRegLoc, "register name expected");
+    return false;
+  }
+
+  // Shouldn't be anything else.
+  if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    Error(Parser.getTok().getLoc(), "unexpected input in .req directive.");
+    return false;
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  if (RegisterReqs.GetOrCreateValue(Name, Reg).getValue() != Reg) {
+    Error(SRegLoc, "redefinition of '" + Name + "' does not match original.");
+    return false;
+  }
+
+  return false;
+}
+
+/// parseDirectiveUneq
+///  ::= .unreq registername
+bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Parser.eatToEndOfStatement();
+    Error(L, "unexpected input in .unreq directive.");
+    return false;
+  }
+  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+  Parser.Lex(); // Eat the identifier.
+  return false;
+}
+
+/// parseDirectiveArch
+///  ::= .arch token
+bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
+  StringRef Arch = getParser().parseStringToEndOfStatement().trim();
+
+  unsigned ID = StringSwitch<unsigned>(Arch)
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+    .Case(NAME, ARM::ID)
+#define ARM_ARCH_ALIAS(NAME, ID) \
+    .Case(NAME, ARM::ID)
+#include "MCTargetDesc/ARMArchName.def"
+    .Default(ARM::INVALID_ARCH);
+
+  if (ID == ARM::INVALID_ARCH) {
+    Error(L, "Unknown arch name");
+    return false;
+  }
+
+  getTargetStreamer().emitArch(ID);
+  return false;
+}
+
+/// parseDirectiveEabiAttr
+///  ::= .eabi_attribute int, int [, "str"]
+///  ::= .eabi_attribute Tag_name, int [, "str"]
+bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
+  int64_t Tag;
+  SMLoc TagLoc;
+  TagLoc = Parser.getTok().getLoc();
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    StringRef Name = Parser.getTok().getIdentifier();
+    Tag = ARMBuildAttrs::AttrTypeFromString(Name);
+    if (Tag == -1) {
+      Error(TagLoc, "attribute name not recognised: " + Name);
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+    Parser.Lex();
+  } else {
+    const MCExpr *AttrExpr;
+
+    TagLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(AttrExpr)) {
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+    if (!CE) {
+      Error(TagLoc, "expected numeric constant");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    Tag = CE->getValue();
+  }
+
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "comma expected");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  Parser.Lex(); // skip comma
+
+  StringRef StringValue = "";
+  bool IsStringValue = false;
+
+  int64_t IntegerValue = 0;
+  bool IsIntegerValue = false;
+
+  if (Tag == ARMBuildAttrs::CPU_raw_name || Tag == ARMBuildAttrs::CPU_name)
+    IsStringValue = true;
+  else if (Tag == ARMBuildAttrs::compatibility) {
+    IsStringValue = true;
+    IsIntegerValue = true;
+  } else if (Tag < 32 || Tag % 2 == 0)
+    IsIntegerValue = true;
+  else if (Tag % 2 == 1)
+    IsStringValue = true;
+  else
+    llvm_unreachable("invalid tag type");
+
+  if (IsIntegerValue) {
+    const MCExpr *ValueExpr;
+    SMLoc ValueExprLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(ValueExpr)) {
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+    if (!CE) {
+      Error(ValueExprLoc, "expected numeric constant");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    IntegerValue = CE->getValue();
+  }
+
+  if (Tag == ARMBuildAttrs::compatibility) {
+    if (Parser.getTok().isNot(AsmToken::Comma))
+      IsStringValue = false;
+    else
+      Parser.Lex();
+  }
+
+  if (IsStringValue) {
+    if (Parser.getTok().isNot(AsmToken::String)) {
+      Error(Parser.getTok().getLoc(), "bad string constant");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    StringValue = Parser.getTok().getStringContents();
+    Parser.Lex();
+  }
+
+  if (IsIntegerValue && IsStringValue) {
+    assert(Tag == ARMBuildAttrs::compatibility);
+    getTargetStreamer().emitIntTextAttribute(Tag, IntegerValue, StringValue);
+  } else if (IsIntegerValue)
+    getTargetStreamer().emitAttribute(Tag, IntegerValue);
+  else if (IsStringValue)
+    getTargetStreamer().emitTextAttribute(Tag, StringValue);
+  return false;
+}
+
+/// parseDirectiveCPU
+///  ::= .cpu str
+bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+  StringRef CPU = getParser().parseStringToEndOfStatement().trim();
+  getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
+  return false;
+}
+
+/// parseDirectiveFPU
+///  ::= .fpu str
+bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+  StringRef FPU = getParser().parseStringToEndOfStatement().trim();
+
+  unsigned ID = StringSwitch<unsigned>(FPU)
+#define ARM_FPU_NAME(NAME, ID) .Case(NAME, ARM::ID)
+#include "ARMFPUName.def"
+    .Default(ARM::INVALID_FPU);
+
+  if (ID == ARM::INVALID_FPU) {
+    Error(L, "Unknown FPU name");
+    return false;
+  }
+
+  getTargetStreamer().emitFPU(ID);
+  return false;
+}
+
+/// parseDirectiveFnStart
+///  ::= .fnstart
+bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
+  if (UC.hasFnStart()) {
+    Error(L, ".fnstart starts before the end of previous one");
+    UC.emitFnStartLocNotes();
+    return false;
+  }
+
+  // Reset the unwind directives parser state
+  UC.reset();
+
+  getTargetStreamer().emitFnStart();
+
+  UC.recordFnStart(L);
+  return false;
+}
+
+/// parseDirectiveFnEnd
+///  ::= .fnend
+bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .fnend directive");
+    return false;
+  }
+
+  // Reset the unwind directives parser state
+  getTargetStreamer().emitFnEnd();
+
+  UC.reset();
+  return false;
+}
+
+/// parseDirectiveCantUnwind
+///  ::= .cantunwind
+bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
+  UC.recordCantUnwind(L);
+
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .cantunwind directive");
+    return false;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".cantunwind can't be used with .handlerdata directive");
+    UC.emitHandlerDataLocNotes();
+    return false;
+  }
+  if (UC.hasPersonality()) {
+    Error(L, ".cantunwind can't be used with .personality directive");
+    UC.emitPersonalityLocNotes();
+    return false;
+  }
+
+  getTargetStreamer().emitCantUnwind();
+  return false;
+}
+
+/// parseDirectivePersonality
+///  ::= .personality name
+bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+  bool HasExistingPersonality = UC.hasPersonality();
+
+  UC.recordPersonality(L);
+
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .personality directive");
+    return false;
+  }
+  if (UC.cantUnwind()) {
+    Error(L, ".personality can't be used with .cantunwind directive");
+    UC.emitCantUnwindLocNotes();
+    return false;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".personality must precede .handlerdata directive");
+    UC.emitHandlerDataLocNotes();
+    return false;
+  }
+  if (HasExistingPersonality) {
+    Parser.eatToEndOfStatement();
+    Error(L, "multiple personality directives");
+    UC.emitPersonalityLocNotes();
+    return false;
+  }
+
+  // Parse the name of the personality routine
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Parser.eatToEndOfStatement();
+    Error(L, "unexpected input in .personality directive.");
+    return false;
+  }
+  StringRef Name(Parser.getTok().getIdentifier());
+  Parser.Lex();
+
+  MCSymbol *PR = getParser().getContext().GetOrCreateSymbol(Name);
+  getTargetStreamer().emitPersonality(PR);
+  return false;
+}
+
+/// parseDirectiveHandlerData
+///  ::= .handlerdata
+bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
+  UC.recordHandlerData(L);
+
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .personality directive");
+    return false;
+  }
+  if (UC.cantUnwind()) {
+    Error(L, ".handlerdata can't be used with .cantunwind directive");
+    UC.emitCantUnwindLocNotes();
+    return false;
+  }
+
+  getTargetStreamer().emitHandlerData();
+  return false;
+}
+
+/// parseDirectiveSetFP
+///  ::= .setfp fpreg, spreg [, offset]
+bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .setfp directive");
+    return false;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".setfp must precede .handlerdata directive");
+    return false;
+  }
+
+  // Parse fpreg
+  SMLoc FPRegLoc = Parser.getTok().getLoc();
+  int FPReg = tryParseRegister();
+  if (FPReg == -1) {
+    Error(FPRegLoc, "frame pointer register expected");
+    return false;
+  }
+
+  // Consume comma
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "comma expected");
+    return false;
+  }
+  Parser.Lex(); // skip comma
+
+  // Parse spreg
+  SMLoc SPRegLoc = Parser.getTok().getLoc();
+  int SPReg = tryParseRegister();
+  if (SPReg == -1) {
+    Error(SPRegLoc, "stack pointer register expected");
+    return false;
+  }
+
+  if (SPReg != ARM::SP && SPReg != UC.getFPReg()) {
+    Error(SPRegLoc, "register should be either $sp or the latest fp register");
+    return false;
+  }
+
+  // Update the frame pointer register
+  UC.saveFPReg(FPReg);
+
+  // Parse offset
+  int64_t Offset = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // skip comma
+
+    if (Parser.getTok().isNot(AsmToken::Hash) &&
+        Parser.getTok().isNot(AsmToken::Dollar)) {
+      Error(Parser.getTok().getLoc(), "'#' expected");
+      return false;
+    }
+    Parser.Lex(); // skip hash token.
+
+    const MCExpr *OffsetExpr;
+    SMLoc ExLoc = Parser.getTok().getLoc();
+    SMLoc EndLoc;
+    if (getParser().parseExpression(OffsetExpr, EndLoc)) {
+      Error(ExLoc, "malformed setfp offset");
+      return false;
+    }
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+    if (!CE) {
+      Error(ExLoc, "setfp offset must be an immediate");
+      return false;
+    }
+
+    Offset = CE->getValue();
+  }
+
+  getTargetStreamer().emitSetFP(static_cast<unsigned>(FPReg),
+                                static_cast<unsigned>(SPReg), Offset);
+  return false;
+}
+
+/// parseDirective
+///  ::= .pad offset
+bool ARMAsmParser::parseDirectivePad(SMLoc L) {
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .pad directive");
+    return false;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".pad must precede .handlerdata directive");
+    return false;
+  }
+
+  // Parse the offset
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return false;
+  }
+  Parser.Lex(); // skip hash token.
+
+  const MCExpr *OffsetExpr;
+  SMLoc ExLoc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(OffsetExpr, EndLoc)) {
+    Error(ExLoc, "malformed pad offset");
+    return false;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+  if (!CE) {
+    Error(ExLoc, "pad offset must be an immediate");
+    return false;
+  }
+
+  getTargetStreamer().emitPad(CE->getValue());
+  return false;
+}
+
+/// parseDirectiveRegSave
+///  ::= .save  { registers }
+///  ::= .vsave { registers }
+bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart()) {
+    Error(L, ".fnstart must precede .save or .vsave directives");
+    return false;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".save or .vsave must precede .handlerdata directive");
+    return false;
+  }
+
+  // RAII object to make sure parsed operands are deleted.
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
+
+  // Parse the register list
+  if (parseRegisterList(Operands))
+    return false;
+  ARMOperand &Op = (ARMOperand &)*Operands[0];
+  if (!IsVector && !Op.isRegList()) {
+    Error(L, ".save expects GPR registers");
+    return false;
+  }
+  if (IsVector && !Op.isDPRRegList()) {
+    Error(L, ".vsave expects DPR registers");
+    return false;
+  }
+
+  getTargetStreamer().emitRegSave(Op.getRegList(), IsVector);
+  return false;
+}
+
+/// parseDirectiveInst
+///  ::= .inst opcode [, ...]
+///  ::= .inst.n opcode [, ...]
+///  ::= .inst.w opcode [, ...]
+bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) {
+  int Width;
+
+  if (isThumb()) {
+    switch (Suffix) {
+    case 'n':
+      Width = 2;
+      break;
+    case 'w':
+      Width = 4;
+      break;
+    default:
+      Parser.eatToEndOfStatement();
+      Error(Loc, "cannot determine Thumb instruction size, "
+                 "use inst.n/inst.w instead");
+      return false;
+    }
+  } else {
+    if (Suffix) {
+      Parser.eatToEndOfStatement();
+      Error(Loc, "width suffixes are invalid in ARM mode");
+      return false;
+    }
+    Width = 4;
+  }
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    Error(Loc, "expected expression following directive");
+    return false;
+  }
+
+  for (;;) {
+    const MCExpr *Expr;
+
+    if (getParser().parseExpression(Expr)) {
+      Error(Loc, "expected expression");
+      return false;
+    }
+
+    const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+    if (!Value) {
+      Error(Loc, "expected constant expression");
+      return false;
+    }
+
+    switch (Width) {
+    case 2:
+      if (Value->getValue() > 0xffff) {
+        Error(Loc, "inst.n operand is too big, use inst.w instead");
+        return false;
+      }
+      break;
+    case 4:
+      if (Value->getValue() > 0xffffffff) {
+        Error(Loc,
+              StringRef(Suffix ? "inst.w" : "inst") + " operand is too big");
+        return false;
+      }
+      break;
+    default:
+      llvm_unreachable("only supported widths are 2 and 4");
+    }
+
+    getTargetStreamer().emitInst(Value->getValue(), Suffix);
+
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    if (getLexer().isNot(AsmToken::Comma)) {
+      Error(Loc, "unexpected token in directive");
+      return false;
+    }
+
+    Parser.Lex();
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveLtorg
+///  ::= .ltorg | .pool
+bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) {
+  getTargetStreamer().emitCurrentConstantPool();
+  return false;
+}
+
+bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
+  const MCSection *Section = getStreamer().getCurrentSection().first;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    TokError("unexpected token in directive");
+    return false;
+  }
+
+  if (!Section) {
+    getStreamer().InitSections();
+    Section = getStreamer().getCurrentSection().first;
+  }
+
+  assert(Section && "must have section to emit alignment");
+  if (Section->UseCodeAlign())
+    getStreamer().EmitCodeAlignment(2);
+  else
+    getStreamer().EmitValueToAlignment(2);
+
+  return false;
+}
+
+/// parseDirectivePersonalityIndex
+///   ::= .personalityindex index
+bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) {
+  bool HasExistingPersonality = UC.hasPersonality();
+
+  UC.recordPersonalityIndex(L);
+
+  if (!UC.hasFnStart()) {
+    Parser.eatToEndOfStatement();
+    Error(L, ".fnstart must precede .personalityindex directive");
+    return false;
+  }
+  if (UC.cantUnwind()) {
+    Parser.eatToEndOfStatement();
+    Error(L, ".personalityindex cannot be used with .cantunwind");
+    UC.emitCantUnwindLocNotes();
+    return false;
+  }
+  if (UC.hasHandlerData()) {
+    Parser.eatToEndOfStatement();
+    Error(L, ".personalityindex must precede .handlerdata directive");
+    UC.emitHandlerDataLocNotes();
+    return false;
+  }
+  if (HasExistingPersonality) {
+    Parser.eatToEndOfStatement();
+    Error(L, "multiple personality directives");
+    UC.emitPersonalityLocNotes();
+    return false;
+  }
+
+  const MCExpr *IndexExpression;
+  SMLoc IndexLoc = Parser.getTok().getLoc();
+  if (Parser.parseExpression(IndexExpression)) {
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IndexExpression);
+  if (!CE) {
+    Parser.eatToEndOfStatement();
+    Error(IndexLoc, "index must be a constant number");
+    return false;
+  }
+  if (CE->getValue() < 0 ||
+      CE->getValue() >= ARM::EHABI::NUM_PERSONALITY_INDEX) {
+    Parser.eatToEndOfStatement();
+    Error(IndexLoc, "personality routine index should be in range [0-3]");
+    return false;
+  }
+
+  getTargetStreamer().emitPersonalityIndex(CE->getValue());
+  return false;
+}
+
+/// parseDirectiveUnwindRaw
+///   ::= .unwind_raw offset, opcode [, opcode...]
+bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
+  if (!UC.hasFnStart()) {
+    Parser.eatToEndOfStatement();
+    Error(L, ".fnstart must precede .unwind_raw directives");
+    return false;
+  }
+
+  int64_t StackOffset;
+
+  const MCExpr *OffsetExpr;
+  SMLoc OffsetLoc = getLexer().getLoc();
+  if (getLexer().is(AsmToken::EndOfStatement) ||
+      getParser().parseExpression(OffsetExpr)) {
+    Error(OffsetLoc, "expected expression");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+  if (!CE) {
+    Error(OffsetLoc, "offset must be a constant");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  StackOffset = CE->getValue();
+
+  if (getLexer().isNot(AsmToken::Comma)) {
+    Error(getLexer().getLoc(), "expected comma");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  Parser.Lex();
+
+  SmallVector<uint8_t, 16> Opcodes;
+  for (;;) {
+    const MCExpr *OE;
+
+    SMLoc OpcodeLoc = getLexer().getLoc();
+    if (getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE)) {
+      Error(OpcodeLoc, "expected opcode expression");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    const MCConstantExpr *OC = dyn_cast<MCConstantExpr>(OE);
+    if (!OC) {
+      Error(OpcodeLoc, "opcode value must be a constant");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    const int64_t Opcode = OC->getValue();
+    if (Opcode & ~0xff) {
+      Error(OpcodeLoc, "invalid opcode");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    Opcodes.push_back(uint8_t(Opcode));
+
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    if (getLexer().isNot(AsmToken::Comma)) {
+      Error(getLexer().getLoc(), "unexpected token in directive");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+
+    Parser.Lex();
+  }
+
+  getTargetStreamer().emitUnwindRaw(StackOffset, Opcodes);
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveTLSDescSeq
+///   ::= .tlsdescseq tls-variable
+bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    TokError("expected variable after '.tlsdescseq' directive");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  const MCSymbolRefExpr *SRE =
+    MCSymbolRefExpr::Create(Parser.getTok().getIdentifier(),
+                            MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext());
+  Lex();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected token");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  getTargetStreamer().AnnotateTLSDescriptorSequence(SRE);
+  return false;
+}
+
+/// parseDirectiveMovSP
+///  ::= .movsp reg [, #offset]
+bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) {
+  if (!UC.hasFnStart()) {
+    Parser.eatToEndOfStatement();
+    Error(L, ".fnstart must precede .movsp directives");
+    return false;
+  }
+  if (UC.getFPReg() != ARM::SP) {
+    Parser.eatToEndOfStatement();
+    Error(L, "unexpected .movsp directive");
+    return false;
+  }
+
+  SMLoc SPRegLoc = Parser.getTok().getLoc();
+  int SPReg = tryParseRegister();
+  if (SPReg == -1) {
+    Parser.eatToEndOfStatement();
+    Error(SPRegLoc, "register expected");
+    return false;
+  }
+
+  if (SPReg == ARM::SP || SPReg == ARM::PC) {
+    Parser.eatToEndOfStatement();
+    Error(SPRegLoc, "sp and pc are not permitted in .movsp directive");
+    return false;
+  }
+
+  int64_t Offset = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Hash)) {
+      Error(Parser.getTok().getLoc(), "expected #constant");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+    Parser.Lex();
+
+    const MCExpr *OffsetExpr;
+    SMLoc OffsetLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(OffsetExpr)) {
+      Parser.eatToEndOfStatement();
+      Error(OffsetLoc, "malformed offset expression");
+      return false;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+    if (!CE) {
+      Parser.eatToEndOfStatement();
+      Error(OffsetLoc, "offset must be an immediate constant");
+      return false;
+    }
+
+    Offset = CE->getValue();
+  }
+
+  getTargetStreamer().emitMovSP(SPReg, Offset);
+  UC.saveFPReg(SPReg);
+
+  return false;
+}
+
+/// parseDirectiveObjectArch
+///   ::= .object_arch name
+bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(getLexer().getLoc(), "unexpected token");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  StringRef Arch = Parser.getTok().getString();
+  SMLoc ArchLoc = Parser.getTok().getLoc();
+  getLexer().Lex();
+
+  unsigned ID = StringSwitch<unsigned>(Arch)
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+    .Case(NAME, ARM::ID)
+#define ARM_ARCH_ALIAS(NAME, ID) \
+    .Case(NAME, ARM::ID)
+#include "MCTargetDesc/ARMArchName.def"
+#undef ARM_ARCH_NAME
+#undef ARM_ARCH_ALIAS
+    .Default(ARM::INVALID_ARCH);
+
+  if (ID == ARM::INVALID_ARCH) {
+    Error(ArchLoc, "unknown architecture '" + Arch + "'");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  getTargetStreamer().emitObjectArch(ID);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(getLexer().getLoc(), "unexpected token");
+    Parser.eatToEndOfStatement();
+  }
+
+  return false;
+}
+
+/// parseDirectiveAlign
+///   ::= .align
+bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
+  // NOTE: if this is not the end of the statement, fall back to the target
+  // agnostic handling for this directive which will correctly handle this.
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return true;
+
+  // '.align' is target specifically handled to mean 2**2 byte alignment.
+  if (getStreamer().getCurrentSection().first->UseCodeAlign())
+    getStreamer().EmitCodeAlignment(4, 0);
+  else
+    getStreamer().EmitValueToAlignment(4, 0, 1, 0);
+
+  return false;
+}
+
+/// parseDirectiveThumbSet
+///  ::= .thumb_set name, value
+bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
+  StringRef Name;
+  if (Parser.parseIdentifier(Name)) {
+    TokError("expected identifier after '.thumb_set'");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::Comma)) {
+    TokError("expected comma after name '" + Name + "'");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  Lex();
+
+  const MCExpr *Value;
+  if (Parser.parseExpression(Value)) {
+    TokError("missing expression");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    TokError("unexpected token");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  Lex();
+
+  MCSymbol *Alias = getContext().GetOrCreateSymbol(Name);
+  getTargetStreamer().emitThumbSet(Alias, Value);
+  return false;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeARMAsmParser() {
+  RegisterMCAsmParser<ARMAsmParser> X(TheARMLETarget);
+  RegisterMCAsmParser<ARMAsmParser> Y(TheARMBETarget);
+  RegisterMCAsmParser<ARMAsmParser> A(TheThumbLETarget);
+  RegisterMCAsmParser<ARMAsmParser> B(TheThumbBETarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "ARMGenAsmMatcher.inc"
+
+static const struct ExtMapEntry {
+  const char *Extension;
+  const unsigned ArchCheck;
+  const uint64_t Features;
+} Extensions[] = {
+  { "crc", Feature_HasV8, ARM::FeatureCRC },
+  { "crypto",  Feature_HasV8,
+    ARM::FeatureCrypto | ARM::FeatureNEON | ARM::FeatureFPARMv8 },
+  { "fp", Feature_HasV8, ARM::FeatureFPARMv8 },
+  { "idiv", Feature_HasV7 | Feature_IsNotMClass,
+    ARM::FeatureHWDiv | ARM::FeatureHWDivARM },
+  // FIXME: iWMMXT not supported
+  { "iwmmxt", Feature_None, 0 },
+  // FIXME: iWMMXT2 not supported
+  { "iwmmxt2", Feature_None, 0 },
+  // FIXME: Maverick not supported
+  { "maverick", Feature_None, 0 },
+  { "mp", Feature_HasV7 | Feature_IsNotMClass, ARM::FeatureMP },
+  // FIXME: ARMv6-m OS Extensions feature not checked
+  { "os", Feature_None, 0 },
+  // FIXME: Also available in ARMv6-K
+  { "sec", Feature_HasV7, ARM::FeatureTrustZone },
+  { "simd", Feature_HasV8, ARM::FeatureNEON | ARM::FeatureFPARMv8 },
+  // FIXME: Only available in A-class, isel not predicated
+  { "virt", Feature_HasV7, ARM::FeatureVirtualization },
+  // FIXME: xscale not supported
+  { "xscale", Feature_None, 0 },
+};
+
+/// parseDirectiveArchExtension
+///   ::= .arch_extension [no]feature
+bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(getLexer().getLoc(), "unexpected token");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  StringRef Extension = Parser.getTok().getString();
+  SMLoc ExtLoc = Parser.getTok().getLoc();
+  getLexer().Lex();
+
+  bool EnableFeature = true;
+  if (Extension.startswith_lower("no")) {
+    EnableFeature = false;
+    Extension = Extension.substr(2);
+  }
+
+  for (unsigned EI = 0, EE = array_lengthof(Extensions); EI != EE; ++EI) {
+    if (Extensions[EI].Extension != Extension)
+      continue;
+
+    unsigned FB = getAvailableFeatures();
+    if ((FB & Extensions[EI].ArchCheck) != Extensions[EI].ArchCheck) {
+      Error(ExtLoc, "architectural extension '" + Extension + "' is not "
+            "allowed for the current base architecture");
+      return false;
+    }
+
+    if (!Extensions[EI].Features)
+      report_fatal_error("unsupported architectural extension: " + Extension);
+
+    if (EnableFeature)
+      FB |= ComputeAvailableFeatures(Extensions[EI].Features);
+    else
+      FB &= ~ComputeAvailableFeatures(Extensions[EI].Features);
+
+    setAvailableFeatures(FB);
+    return false;
+  }
+
+  Error(ExtLoc, "unknown architectural extension: " + Extension);
+  Parser.eatToEndOfStatement();
+  return false;
+}
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned Kind) {
+  ARMOperand &Op = static_cast<ARMOperand &>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  switch (Kind) {
+  default: break;
+  case MCK__35_0:
+    if (Op.isImm())
+      if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
+        if (CE->getValue() == 0)
+          return Match_Success;
+    break;
+  case MCK_ARMSOImm:
+    if (Op.isImm()) {
+      const MCExpr *SOExpr = Op.getImm();
+      int64_t Value;
+      if (!SOExpr->EvaluateAsAbsolute(Value))
+        return Match_Success;
+      assert((Value >= INT32_MIN && Value <= UINT32_MAX) &&
+             "expression value must be representable in 32 bits");
+    }
+    break;
+  case MCK_GPRPair:
+    if (Op.isReg() &&
+        MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
+      return Match_Success;
+    break;
+  }
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
new file mode 100644
index 0000000..4d4038d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -0,0 +1,4992 @@
+//===-- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+  // Handles the condition code status of instructions in IT blocks
+  class ITStatus
+  {
+    public:
+      // Returns the condition code for instruction in IT block
+      unsigned getITCC() {
+        unsigned CC = ARMCC::AL;
+        if (instrInITBlock())
+          CC = ITStates.back();
+        return CC;
+      }
+
+      // Advances the IT block state to the next T or E
+      void advanceITState() {
+        ITStates.pop_back();
+      }
+
+      // Returns true if the current instruction is in an IT block
+      bool instrInITBlock() {
+        return !ITStates.empty();
+      }
+
+      // Returns true if current instruction is the last instruction in an IT block
+      bool instrLastInITBlock() {
+        return ITStates.size() == 1;
+      }
+
+      // Called when decoding an IT instruction. Sets the IT state for the following
+      // instructions that for the IT block. Firstcond and Mask correspond to the 
+      // fields in the IT instruction encoding.
+      void setITState(char Firstcond, char Mask) {
+        // (3 - the number of trailing zeros) is the number of then / else.
+        unsigned CondBit0 = Firstcond & 1;
+        unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
+        unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
+        assert(NumTZ <= 3 && "Invalid IT mask!");
+        // push condition codes onto the stack the correct order for the pops
+        for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
+          bool T = ((Mask >> Pos) & 1) == CondBit0;
+          if (T)
+            ITStates.push_back(CCBits);
+          else
+            ITStates.push_back(CCBits ^ 1);
+        }
+        ITStates.push_back(CCBits);
+      }
+
+    private:
+      std::vector<unsigned char> ITStates;
+  };
+}
+
+namespace {
+/// ARMDisassembler - ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
+  }
+
+  ~ARMDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+                              const MemoryObject &region, uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+};
+
+/// ThumbDisassembler - Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
+  }
+
+  ~ThumbDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+                              const MemoryObject &region, uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+
+private:
+  mutable ITStatus ITBlock;
+  DecodeStatus AddThumbPredicate(MCInst&) const;
+  void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                               unsigned RegNo, uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst & Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Adddress,
+                                                  const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+#include "ARMGenDisassemblerTables.inc"
+
+static MCDisassembler *createARMDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new ARMDisassembler(STI, Ctx);
+}
+
+static MCDisassembler *createThumbDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new ThumbDisassembler(STI, Ctx);
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             const MemoryObject &Region,
+                                             uint64_t Address,
+                                             raw_ostream &os,
+                                             raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  assert(!(STI.getFeatureBits() & ARM::ModeThumb) &&
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, bytes) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t insn = (bytes[3] << 24) |
+                  (bytes[2] << 16) |
+                  (bytes[1] <<  8) |
+                  (bytes[0] <<  0);
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus result = decodeInstruction(DecoderTableARM32, MI, insn,
+                                          Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  // VFP and NEON instructions, similarly, are shared between ARM
+  // and Thumb modes.
+  MI.clear();
+  result = decodeInstruction(DecoderTableVFP32, MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableVFPV832, MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONData32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONLoadStore32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONDup32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTablev8NEON32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTablev8Crypto32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.  The immediate Value has had any PC
+/// adjustment made by the caller.  If the instruction is a branch instruction
+/// then isBranch is true, else false.  If the getOpInfo() function was set as
+/// part of the setupForSymbolicDisassembly() call then that function is called
+/// to get any symbolic information at the Address for this instruction.  If
+/// that returns non-zero then the symbolic information it returns is used to
+/// create an MCExpr and that is added as an operand to the MCInst.  If
+/// getOpInfo() returns zero and isBranch is true then a symbol look up for
+/// Value is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with Value is created.  This function returns true if it adds an
+/// operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
+                                     bool isBranch, uint64_t InstSize,
+                                     MCInst &MI, const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  // FIXME: Does it make sense for value to be negative?
+  return Dis->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, isBranch,
+                                       /* Offset */ 0, InstSize);
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the Pc.
+/// These can often be values in a literal pool near the Address of the
+/// instruction.  The Address of the instruction and its immediate Value are
+/// used as a possible literal pool entry.  The SymbolLookUp call back will
+/// return the name of a symbol referenced by the literal pool's entry if
+/// the referenced address is that of a symbol.  Or it will return a pointer to
+/// a literal 'C' string if the referenced address of the literal pool's entry
+/// is an address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  Dis->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+// Thumb1 instructions don't have explicit S bits.  Rather, they
+// implicitly set CPSR.  Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand.  We need to fix
+// that as a post-pass.
+static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) {
+      if (i > 0 && OpInfo[i-1].isPredicate()) continue;
+      MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+      return;
+    }
+  }
+
+  MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context.  We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+MCDisassembler::DecodeStatus
+ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+  MCDisassembler::DecodeStatus S = Success;
+
+  // A few instructions actually have predicates encoded in them.  Don't
+  // try to overwrite it if we're seeing one of those.
+  switch (MI.getOpcode()) {
+    case ARM::tBcc:
+    case ARM::t2Bcc:
+    case ARM::tCBZ:
+    case ARM::tCBNZ:
+    case ARM::tCPS:
+    case ARM::t2CPS3p:
+    case ARM::t2CPS2p:
+    case ARM::t2CPS1p:
+    case ARM::tMOVSr:
+    case ARM::tSETEND:
+      // Some instructions (mostly conditional branches) are not
+      // allowed in IT blocks.
+      if (ITBlock.instrInITBlock())
+        S = SoftFail;
+      else
+        return Success;
+      break;
+    case ARM::tB:
+    case ARM::t2B:
+    case ARM::t2TBB:
+    case ARM::t2TBH:
+      // Some instructions (mostly unconditional branches) can
+      // only appears at the end of, or outside of, an IT.
+      if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
+        S = SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  // If we're in an IT block, base the predicate on that.  Otherwise,
+  // assume a predicate of AL.
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (CC == 0xF) 
+    CC = ARMCC::AL;
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isPredicate()) {
+      I = MI.insert(I, MCOperand::CreateImm(CC));
+      ++I;
+      if (CC == ARMCC::AL)
+        MI.insert(I, MCOperand::CreateReg(0));
+      else
+        MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+      return S;
+    }
+  }
+
+  I = MI.insert(I, MCOperand::CreateImm(CC));
+  ++I;
+  if (CC == ARMCC::AL)
+    MI.insert(I, MCOperand::CreateReg(0));
+  else
+    MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+
+  return S;
+}
+
+// Thumb VFP instructions are a special case.  Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand.  We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  MCInst::iterator I = MI.begin();
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (OpInfo[i].isPredicate() ) {
+      I->setImm(CC);
+      ++I;
+      if (CC == ARMCC::AL)
+        I->setReg(0);
+      else
+        I->setReg(ARM::CPSR);
+      return;
+    }
+  }
+}
+
+DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                               const MemoryObject &Region,
+                                               uint64_t Address,
+                                               raw_ostream &os,
+                                               raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  assert((STI.getFeatureBits() & ARM::ModeThumb) &&
+         "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+  // We want to read exactly 2 bytes of data.
+  if (Region.readBytes(Address, 2, bytes) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint16_t insn16 = (bytes[1] << 8) | bytes[0];
+  DecodeStatus result = decodeInstruction(DecoderTableThumb16, MI, insn16,
+                                          Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 2;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumbSBit16, MI, insn16,
+                             Address, this, STI);
+  if (result) {
+    Size = 2;
+    bool InITBlock = ITBlock.instrInITBlock();
+    Check(result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumb216, MI, insn16,
+                             Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 2;
+
+    // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
+    // the Thumb predicate.
+    if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
+      result = MCDisassembler::SoftFail;
+
+    Check(result, AddThumbPredicate(MI));
+
+    // If we find an IT instruction, we need to parse its condition
+    // code and mask operands so that we can apply them correctly
+    // to the subsequent instructions.
+    if (MI.getOpcode() == ARM::t2IT) {
+
+      unsigned Firstcond = MI.getOperand(0).getImm();
+      unsigned Mask = MI.getOperand(1).getImm();
+      ITBlock.setITState(Firstcond, Mask);
+    }
+
+    return result;
+  }
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, bytes) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint32_t insn32 = (bytes[3] <<  8) |
+                    (bytes[2] <<  0) |
+                    (bytes[1] << 24) |
+                    (bytes[0] << 16);
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumb32, MI, insn32, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    bool InITBlock = ITBlock.instrInITBlock();
+    Check(result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumb232, MI, insn32, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+    MI.clear();
+    result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      UpdateThumbVFPPredicate(MI);
+      return result;
+    }
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableVFPV832, MI, insn32, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+    MI.clear();
+    result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
+                               this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+  }
+
+  if (fieldFromInstruction(insn32, 24, 8) == 0xF9) {
+    MI.clear();
+    uint32_t NEONLdStInsn = insn32;
+    NEONLdStInsn &= 0xF0FFFFFF;
+    NEONLdStInsn |= 0x04000000;
+    result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+  }
+
+  if (fieldFromInstruction(insn32, 24, 4) == 0xF) {
+    MI.clear();
+    uint32_t NEONDataInsn = insn32;
+    NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+    result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+
+    MI.clear();
+    uint32_t NEONCryptoInsn = insn32;
+    NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+    result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      return result;
+    }
+
+    MI.clear();
+    uint32_t NEONv8Insn = insn32;
+    NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+    result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+                               this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      return result;
+    }
+  }
+
+  MI.clear();
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+
+extern "C" void LLVMInitializeARMDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheARMLETarget,
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheARMBETarget,
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheThumbLETarget,
+                                         createThumbDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheThumbBETarget,
+                                         createThumbDisassembler);
+}
+
+static const uint16_t GPRDecoderTable[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+  ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+  ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+  ARM::R12, ARM::SP, ARM::LR, ARM::PC
+};
+
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Register = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  
+  if (RegNo == 15) 
+    S = MCDisassembler::SoftFail;
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+
+  return S;
+}
+
+static DecodeStatus
+DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo == 15)
+  {
+    Inst.addOperand(MCOperand::CreateReg(ARM::APSR_NZCV));
+    return MCDisassembler::Success;
+  }
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
+}
+
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const uint16_t GPRPairDecoderTable[] = {
+  ARM::R0_R1, ARM::R2_R3,   ARM::R4_R5,  ARM::R6_R7,
+  ARM::R8_R9, ARM::R10_R11, ARM::R12_SP
+};
+
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo > 13)
+    return MCDisassembler::Fail;
+
+  if ((RegNo & 1) || RegNo == 0xe)
+     S = MCDisassembler::SoftFail;
+
+  unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
+  Inst.addOperand(MCOperand::CreateReg(RegisterPair));
+  return S;
+}
+
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned Register = 0;
+  switch (RegNo) {
+    case 0:
+      Register = ARM::R0;
+      break;
+    case 1:
+      Register = ARM::R1;
+      break;
+    case 2:
+      Register = ARM::R2;
+      break;
+    case 3:
+      Register = ARM::R3;
+      break;
+    case 9:
+      Register = ARM::R9;
+      break;
+    case 12:
+      Register = ARM::R12;
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  if (RegNo == 13 || RegNo == 15)
+    S = MCDisassembler::SoftFail;
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
+}
+
+static const uint16_t SPRDecoderTable[] = {
+     ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+     ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+     ARM::S8,  ARM::S9, ARM::S10, ARM::S11,
+    ARM::S12, ARM::S13, ARM::S14, ARM::S15,
+    ARM::S16, ARM::S17, ARM::S18, ARM::S19,
+    ARM::S20, ARM::S21, ARM::S22, ARM::S23,
+    ARM::S24, ARM::S25, ARM::S26, ARM::S27,
+    ARM::S28, ARM::S29, ARM::S30, ARM::S31
+};
+
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = SPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPRDecoderTable[] = {
+     ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+     ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+     ARM::D8,  ARM::D9, ARM::D10, ARM::D11,
+    ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+    ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+    ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+    ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+    ARM::D28, ARM::D29, ARM::D30, ARM::D31
+};
+
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const uint16_t QPRDecoderTable[] = {
+     ARM::Q0,  ARM::Q1,  ARM::Q2,  ARM::Q3,
+     ARM::Q4,  ARM::Q5,  ARM::Q6,  ARM::Q7,
+     ARM::Q8,  ARM::Q9, ARM::Q10, ARM::Q11,
+    ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
+};
+
+
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31 || (RegNo & 1) != 0)
+    return MCDisassembler::Fail;
+  RegNo >>= 1;
+
+  unsigned Register = QPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPairDecoderTable[] = {
+  ARM::Q0,  ARM::D1_D2,   ARM::Q1,  ARM::D3_D4,   ARM::Q2,  ARM::D5_D6,
+  ARM::Q3,  ARM::D7_D8,   ARM::Q4,  ARM::D9_D10,  ARM::Q5,  ARM::D11_D12,
+  ARM::Q6,  ARM::D13_D14, ARM::Q7,  ARM::D15_D16, ARM::Q8,  ARM::D17_D18,
+  ARM::Q9,  ARM::D19_D20, ARM::Q10, ARM::D21_D22, ARM::Q11, ARM::D23_D24,
+  ARM::Q12, ARM::D25_D26, ARM::Q13, ARM::D27_D28, ARM::Q14, ARM::D29_D30,
+  ARM::Q15
+};
+
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 30)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPairDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPairSpacedDecoderTable[] = {
+  ARM::D0_D2,   ARM::D1_D3,   ARM::D2_D4,   ARM::D3_D5,
+  ARM::D4_D6,   ARM::D5_D7,   ARM::D6_D8,   ARM::D7_D9,
+  ARM::D8_D10,  ARM::D9_D11,  ARM::D10_D12, ARM::D11_D13,
+  ARM::D12_D14, ARM::D13_D15, ARM::D14_D16, ARM::D15_D17,
+  ARM::D16_D18, ARM::D17_D19, ARM::D18_D20, ARM::D19_D21,
+  ARM::D20_D22, ARM::D21_D23, ARM::D22_D24, ARM::D23_D25,
+  ARM::D24_D26, ARM::D25_D27, ARM::D26_D28, ARM::D27_D29,
+  ARM::D28_D30, ARM::D29_D31
+};
+
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  if (RegNo > 29)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPairSpacedDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val == 0xF) return MCDisassembler::Fail;
+  // AL predicate is not allowed on Thumb1 branches.
+  if (Inst.getOpcode() == ARM::tBcc && Val == 0xE)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  if (Val == ARMCC::AL) {
+    Inst.addOperand(MCOperand::CreateReg(0));
+  } else
+    Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val)
+    Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+  else
+    Inst.addOperand(MCOperand::CreateReg(0));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  uint32_t imm = Val & 0xFF;
+  uint32_t rot = (Val & 0xF00) >> 7;
+  uint32_t rot_imm = (imm >> rot) | (imm << ((32-rot) & 0x1F));
+  Inst.addOperand(MCOperand::CreateImm(rot_imm));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned imm = fieldFromInstruction(Val, 7, 5);
+
+  // Register-immediate
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
+  }
+
+  if (Shift == ARM_AM::ror && imm == 0)
+    Shift = ARM_AM::rrx;
+
+  unsigned Op = Shift | (imm << 3);
+  Inst.addOperand(MCOperand::CreateImm(Op));
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned Rs = fieldFromInstruction(Val, 8, 4);
+
+  // Register-register
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(Shift));
+
+  return S;
+}
+
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  bool NeedDisjointWriteback = false;
+  unsigned WritebackReg = 0;
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    NeedDisjointWriteback = true;
+    WritebackReg = Inst.getOperand(0).getReg();
+    break;
+  }
+
+  // Empty register lists are not allowed.
+  if (Val == 0) return MCDisassembler::Fail;
+  for (unsigned i = 0; i < 16; ++i) {
+    if (Val & (1 << i)) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
+        return MCDisassembler::Fail;
+      // Writeback not allowed if Rn is in the target list.
+      if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
+        Check(S, MCDisassembler::SoftFail);
+    }
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction(Val, 8, 5);
+  unsigned regs = fieldFromInstruction(Val, 0, 8);
+
+  // In case of unpredictable encoding, tweak the operands.
+  if (regs == 0 || (Vd + regs) > 32) {
+    regs = Vd + regs > 32 ? 32 - Vd : regs;
+    regs = std::max( 1u, regs);
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction(Val, 8, 5);
+  unsigned regs = fieldFromInstruction(Val, 1, 7);
+
+  // In case of unpredictable encoding, tweak the operands.
+  if (regs == 0 || regs > 16 || (Vd + regs) > 32) {
+    regs = Vd + regs > 32 ? 32 - Vd : regs;
+    regs = std::max( 1u, regs);
+    regs = std::min(16u, regs);
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  // This operand encodes a mask of contiguous zeros between a specified MSB
+  // and LSB.  To decode it, we create the mask of all bits MSB-and-lower,
+  // the mask of all bits LSB-and-lower, and then xor them to create
+  // the mask of that's all ones on [msb, lsb].  Finally we not it to
+  // create the final mask.
+  unsigned msb = fieldFromInstruction(Val, 5, 5);
+  unsigned lsb = fieldFromInstruction(Val, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+  if (lsb > msb) {
+    Check(S, MCDisassembler::SoftFail);
+    // The check above will cause the warning for the "potentially undefined
+    // instruction encoding" but we can't build a bad MCOperand value here
+    // with a lsb > msb or else printing the MCInst will cause a crash.
+    lsb = msb;
+  }
+
+  uint32_t msb_mask = 0xFFFFFFFF;
+  if (msb != 31) msb_mask = (1U << (msb+1)) - 1;
+  uint32_t lsb_mask = (1U << lsb) - 1;
+
+  Inst.addOperand(MCOperand::CreateImm(~(msb_mask ^ lsb_mask)));
+  return S;
+}
+
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned CRd = fieldFromInstruction(Insn, 12, 4);
+  unsigned coproc = fieldFromInstruction(Insn, 8, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDC_OPTION:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2LDCL_POST:
+    case ARM::t2LDCL_OPTION:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STC_POST:
+    case ARM::t2STC_OPTION:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STCL_PRE:
+    case ARM::t2STCL_POST:
+    case ARM::t2STCL_OPTION:
+      if (coproc == 0xA || coproc == 0xB)
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if ((featureBits & ARM::HasV8Ops) && (coproc != 14))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(coproc));
+  Inst.addOperand(MCOperand::CreateImm(CRd));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDC2_OFFSET:
+    case ARM::t2LDC2L_OFFSET:
+    case ARM::t2LDC2_PRE:
+    case ARM::t2LDC2L_PRE:
+    case ARM::t2STC2_OFFSET:
+    case ARM::t2STC2L_OFFSET:
+    case ARM::t2STC2_PRE:
+    case ARM::t2STC2L_PRE:
+    case ARM::LDC2_OFFSET:
+    case ARM::LDC2L_OFFSET:
+    case ARM::LDC2_PRE:
+    case ARM::LDC2L_PRE:
+    case ARM::STC2_OFFSET:
+    case ARM::STC2L_OFFSET:
+    case ARM::STC2_PRE:
+    case ARM::STC2L_PRE:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STCL_PRE:
+    case ARM::LDC_OFFSET:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDCL_PRE:
+    case ARM::STC_OFFSET:
+    case ARM::STCL_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STCL_PRE:
+      imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm);
+      Inst.addOperand(MCOperand::CreateImm(imm));
+      break;
+    case ARM::t2LDC2_POST:
+    case ARM::t2LDC2L_POST:
+    case ARM::t2STC2_POST:
+    case ARM::t2STC2L_POST:
+    case ARM::LDC2_POST:
+    case ARM::LDC2L_POST:
+    case ARM::STC2_POST:
+    case ARM::STC2L_POST:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDCL_POST:
+    case ARM::t2STC_POST:
+    case ARM::t2STCL_POST:
+    case ARM::LDC_POST:
+    case ARM::LDCL_POST:
+    case ARM::STC_POST:
+    case ARM::STCL_POST:
+      imm |= U << 8;
+      // fall through.
+    default:
+      // The 'option' variant doesn't encode 'U' in the immediate since
+      // the immediate is unsigned [0,255].
+      Inst.addOperand(MCOperand::CreateImm(imm));
+      break;
+  }
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+      if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned reg = fieldFromInstruction(Insn, 25, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+
+  // On stores, the writeback operand precedes Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::STR_POST_IMM:
+    case ARM::STR_POST_REG:
+    case ARM::STRB_POST_IMM:
+    case ARM::STRB_POST_REG:
+    case ARM::STRT_POST_REG:
+    case ARM::STRT_POST_IMM:
+    case ARM::STRBT_POST_REG:
+    case ARM::STRBT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // On loads, the writeback operand comes after Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::LDR_POST_IMM:
+    case ARM::LDR_POST_REG:
+    case ARM::LDRB_POST_IMM:
+    case ARM::LDRB_POST_REG:
+    case ARM::LDRBT_POST_REG:
+    case ARM::LDRBT_POST_IMM:
+    case ARM::LDRT_POST_REG:
+    case ARM::LDRT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::AddrOpc Op = ARM_AM::add;
+  if (!fieldFromInstruction(Insn, 23, 1))
+    Op = ARM_AM::sub;
+
+  bool writeback = (P == 0) || (W == 1);
+  unsigned idx_mode = 0;
+  if (P && writeback)
+    idx_mode = ARMII::IndexModePre;
+  else if (!P && writeback)
+    idx_mode = ARMII::IndexModePost;
+
+  if (writeback && (Rn == 15 || Rn == Rt))
+    S = MCDisassembler::SoftFail; // UNPREDICTABLE
+
+  if (reg) {
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    ARM_AM::ShiftOpc Opc = ARM_AM::lsl;
+    switch( fieldFromInstruction(Insn, 5, 2)) {
+      case 0:
+        Opc = ARM_AM::lsl;
+        break;
+      case 1:
+        Opc = ARM_AM::lsr;
+        break;
+      case 2:
+        Opc = ARM_AM::asr;
+        break;
+      case 3:
+        Opc = ARM_AM::ror;
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+    unsigned amt = fieldFromInstruction(Insn, 7, 5);
+    if (Opc == ARM_AM::ror && amt == 0)
+      Opc = ARM_AM::rrx;
+    unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
+
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  } else {
+    Inst.addOperand(MCOperand::CreateReg(0));
+    unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode);
+    Inst.addOperand(MCOperand::CreateImm(tmp));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+  unsigned Rm = fieldFromInstruction(Val,  0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned imm = fieldFromInstruction(Val, 7, 5);
+  unsigned U = fieldFromInstruction(Val, 12, 1);
+
+  ARM_AM::ShiftOpc ShOp = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      ShOp = ARM_AM::lsl;
+      break;
+    case 1:
+      ShOp = ARM_AM::lsr;
+      break;
+    case 2:
+      ShOp = ARM_AM::asr;
+      break;
+    case 3:
+      ShOp = ARM_AM::ror;
+      break;
+  }
+
+  if (ShOp == ARM_AM::ror && imm == 0)
+    ShOp = ARM_AM::rrx;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  unsigned shift;
+  if (U)
+    shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp);
+  else
+    shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp);
+  Inst.addOperand(MCOperand::CreateImm(shift));
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned type = fieldFromInstruction(Insn, 22, 1);
+  unsigned imm = fieldFromInstruction(Insn, 8, 4);
+  unsigned U = ((~fieldFromInstruction(Insn, 23, 1)) & 1) << 8;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  unsigned Rt2 = Rt + 1;
+
+  bool writeback = (W == 1) | (P == 0);
+
+  // For {LD,ST}RD, Rt must be even, else undefined.
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (Rt & 0x1) S = MCDisassembler::SoftFail;
+      break;
+    default:
+      break;
+  }
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+      if (P == 0 && W == 1)
+        S = MCDisassembler::SoftFail;
+      
+      if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2))
+        S = MCDisassembler::SoftFail;
+      if (type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      if (Rt2 == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && fieldFromInstruction(Insn, 8, 4))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (Rt == 15)
+        S = MCDisassembler::SoftFail;
+      if (writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      if (!type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (type && Rn == 15){
+        if (Rt2 == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (P == 0 && W == 1)
+        S = MCDisassembler::SoftFail;
+      if (!type && (Rt2 == 15 || Rm == 15 || Rm == Rt || Rm == Rt2))
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && Rn == 15)
+        S = MCDisassembler::SoftFail;
+      if (writeback && (Rn == Rt || Rn == Rt2))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+      if (type && Rn == 15){
+        if (Rt == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (Rt == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+      if (type && Rn == 15){
+        if (Rt == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (type && (Rt == 15 || (writeback && Rn == Rt)))
+        S = MCDisassembler::SoftFail;
+      if (!type && (Rt == 15 || Rm == 15))
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) { // Writeback
+    if (P)
+      U |= ARMII::IndexModePre << 9;
+    else
+      U |= ARMII::IndexModePost << 9;
+
+    // On stores, the writeback operand precedes Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) {
+    // On loads, the writeback operand comes after Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+    case ARM::LDRHTr:
+    case ARM::LDRSBTr:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (type) {
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(U | (imm << 4) | Rm));
+  } else {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateImm(U));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned mode = fieldFromInstruction(Insn, 23, 2);
+
+  switch (mode) {
+    case 0:
+      mode = ARM_AM::da;
+      break;
+    case 1:
+      mode = ARM_AM::ia;
+      break;
+    case 2:
+      mode = ARM_AM::db;
+      break;
+    case 3:
+      mode = ARM_AM::ib;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(mode));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned reglist = fieldFromInstruction(Insn, 0, 16);
+
+  if (pred == 0xF) {
+    // Ambiguous with RFE and SRS
+    switch (Inst.getOpcode()) {
+      case ARM::LDMDA:
+        Inst.setOpcode(ARM::RFEDA);
+        break;
+      case ARM::LDMDA_UPD:
+        Inst.setOpcode(ARM::RFEDA_UPD);
+        break;
+      case ARM::LDMDB:
+        Inst.setOpcode(ARM::RFEDB);
+        break;
+      case ARM::LDMDB_UPD:
+        Inst.setOpcode(ARM::RFEDB_UPD);
+        break;
+      case ARM::LDMIA:
+        Inst.setOpcode(ARM::RFEIA);
+        break;
+      case ARM::LDMIA_UPD:
+        Inst.setOpcode(ARM::RFEIA_UPD);
+        break;
+      case ARM::LDMIB:
+        Inst.setOpcode(ARM::RFEIB);
+        break;
+      case ARM::LDMIB_UPD:
+        Inst.setOpcode(ARM::RFEIB_UPD);
+        break;
+      case ARM::STMDA:
+        Inst.setOpcode(ARM::SRSDA);
+        break;
+      case ARM::STMDA_UPD:
+        Inst.setOpcode(ARM::SRSDA_UPD);
+        break;
+      case ARM::STMDB:
+        Inst.setOpcode(ARM::SRSDB);
+        break;
+      case ARM::STMDB_UPD:
+        Inst.setOpcode(ARM::SRSDB_UPD);
+        break;
+      case ARM::STMIA:
+        Inst.setOpcode(ARM::SRSIA);
+        break;
+      case ARM::STMIA_UPD:
+        Inst.setOpcode(ARM::SRSIA_UPD);
+        break;
+      case ARM::STMIB:
+        Inst.setOpcode(ARM::SRSIB);
+        break;
+      case ARM::STMIB_UPD:
+        Inst.setOpcode(ARM::SRSIB_UPD);
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+
+    // For stores (which become SRS's, the only operand is the mode.
+    if (fieldFromInstruction(Insn, 20, 1) == 0) {
+      // Check SRS encoding constraints
+      if (!(fieldFromInstruction(Insn, 22, 1) == 1 &&
+            fieldFromInstruction(Insn, 20, 1) == 0))
+        return MCDisassembler::Fail;
+
+      Inst.addOperand(
+          MCOperand::CreateImm(fieldFromInstruction(Insn, 0, 4)));
+      return S;
+    }
+
+    return DecodeRFEInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail; // Tied
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 18, 2);
+  unsigned M = fieldFromInstruction(Insn, 17, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 6, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // This decoder is called from multiple location that do not check
+  // the full encoding is valid before they do.
+  if (fieldFromInstruction(Insn, 5, 1) != 0 ||
+      fieldFromInstruction(Insn, 16, 1) != 0 ||
+      fieldFromInstruction(Insn, 20, 8) != 0x10)
+    return MCDisassembler::Fail;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::CPS3p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    Inst.addOperand(MCOperand::CreateImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::CPS2p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 9, 2);
+  unsigned M = fieldFromInstruction(Insn, 8, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 5, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::t2CPS3p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    Inst.addOperand(MCOperand::CreateImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::t2CPS2p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> this is a HINT instruction
+    int imm = fieldFromInstruction(Insn, 0, 8);
+    // HINT are defined only for immediate in [0..4]
+    if(imm > 4) return MCDisassembler::Fail;
+    Inst.setOpcode(ARM::t2HINT);
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 8, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction(Insn, 0, 8) << 0);
+  imm |= (fieldFromInstruction(Insn, 12, 3) << 8);
+  imm |= (fieldFromInstruction(Insn, 16, 4) << 12);
+  imm |= (fieldFromInstruction(Insn, 26, 1) << 11);
+
+  if (Inst.getOpcode() == ARM::t2MOVTi16)
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction(Insn, 0, 12) << 0);
+  imm |= (fieldFromInstruction(Insn, 16, 4) << 12);
+
+  if (Inst.getOpcode() == ARM::MOVTi16)
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 8, 4);
+  unsigned Ra = fieldFromInstruction(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned add = fieldFromInstruction(Val, 12, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!add) imm *= -1;
+  if (imm == 0 && !add) imm = INT32_MIN;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  if (Rn == 15)
+    tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder);
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned U = fieldFromInstruction(Val, 8, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (U)
+    Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add, imm)));
+  else
+    Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm)));
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                     uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  // Note the J1 and J2 values are from the encoded instruction.  So here
+  // change them to I1 and I2 values via as documented:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with one trailing zero as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+  unsigned S = fieldFromInstruction(Insn, 26, 1);
+  unsigned J1 = fieldFromInstruction(Insn, 13, 1);
+  unsigned J2 = fieldFromInstruction(Insn, 11, 1);
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned imm10 = fieldFromInstruction(Insn, 16, 10);
+  unsigned imm11 = fieldFromInstruction(Insn, 0, 11);
+  unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
+  int imm32 = SignExtend32<25>(tmp << 1);
+  if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm32));
+
+  return Status;
+}
+
+static DecodeStatus
+DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 24) << 2;
+
+  if (pred == 0xF) {
+    Inst.setOpcode(ARM::BLXi);
+    imm |= fieldFromInstruction(Insn, 24, 1) << 1;
+    if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+                                  true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+    return S;
+  }
+
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned align = fieldFromInstruction(Val, 4, 2);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!align)
+    Inst.addOperand(MCOperand::CreateImm(0));
+  else
+    Inst.addOperand(MCOperand::CreateImm(4 << align));
+
+  return S;
+}
+
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // First output register
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1q16: case ARM::VLD1q32: case ARM::VLD1q64: case ARM::VLD1q8:
+  case ARM::VLD1q16wb_fixed: case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_fixed: case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_fixed: case ARM::VLD1q64wb_register:
+  case ARM::VLD1q8wb_fixed: case ARM::VLD1q8wb_register:
+  case ARM::VLD2d16: case ARM::VLD2d32: case ARM::VLD2d8:
+  case ARM::VLD2d16wb_fixed: case ARM::VLD2d16wb_register:
+  case ARM::VLD2d32wb_fixed: case ARM::VLD2d32wb_register:
+  case ARM::VLD2d8wb_fixed: case ARM::VLD2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2b16:
+  case ARM::VLD2b32:
+  case ARM::VLD2b8:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b16wb_register:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2b32wb_register:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+    default:
+      break;
+  }
+
+  // Third output register
+  switch(Inst.getOpcode()) {
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Writeback operand
+  switch (Inst.getOpcode()) {
+    case ARM::VLD1d8wb_fixed:
+    case ARM::VLD1d16wb_fixed:
+    case ARM::VLD1d32wb_fixed:
+    case ARM::VLD1d64wb_fixed:
+    case ARM::VLD1d8wb_register:
+    case ARM::VLD1d16wb_register:
+    case ARM::VLD1d32wb_register:
+    case ARM::VLD1d64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1d8Twb_fixed:
+    case ARM::VLD1d8Twb_register:
+    case ARM::VLD1d16Twb_fixed:
+    case ARM::VLD1d16Twb_register:
+    case ARM::VLD1d32Twb_fixed:
+    case ARM::VLD1d32Twb_register:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD1d8Qwb_fixed:
+    case ARM::VLD1d8Qwb_register:
+    case ARM::VLD1d16Qwb_fixed:
+    case ARM::VLD1d16Qwb_register:
+    case ARM::VLD1d32Qwb_fixed:
+    case ARM::VLD1d32Qwb_register:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD2b8wb_fixed:
+    case ARM::VLD2b16wb_fixed:
+    case ARM::VLD2b32wb_fixed:
+    case ARM::VLD2b8wb_register:
+    case ARM::VLD2b16wb_register:
+    case ARM::VLD2b32wb_register:
+      Inst.addOperand(MCOperand::CreateImm(0));
+      break;
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+  default:
+    // The below have been updated to have explicit am6offset split
+    // between fixed and register offset. For those instructions not
+    // yet updated, we need to add an additional reg0 operand for the
+    // fixed variant.
+    //
+    // The fixed offset encodes as Rm == 0xd, so we check for that.
+    if (Rm == 0xd) {
+      Inst.addOperand(MCOperand::CreateReg(0));
+      break;
+    }
+    // Fall through to handle the register offset variant.
+  case ARM::VLD1d8wb_fixed:
+  case ARM::VLD1d16wb_fixed:
+  case ARM::VLD1d32wb_fixed:
+  case ARM::VLD1d64wb_fixed:
+  case ARM::VLD1d8Twb_fixed:
+  case ARM::VLD1d16Twb_fixed:
+  case ARM::VLD1d32Twb_fixed:
+  case ARM::VLD1d64Twb_fixed:
+  case ARM::VLD1d8Qwb_fixed:
+  case ARM::VLD1d16Qwb_fixed:
+  case ARM::VLD1d32Qwb_fixed:
+  case ARM::VLD1d64Qwb_fixed:
+  case ARM::VLD1d8wb_register:
+  case ARM::VLD1d16wb_register:
+  case ARM::VLD1d32wb_register:
+  case ARM::VLD1d64wb_register:
+  case ARM::VLD1q8wb_fixed:
+  case ARM::VLD1q16wb_fixed:
+  case ARM::VLD1q32wb_fixed:
+  case ARM::VLD1q64wb_fixed:
+  case ARM::VLD1q8wb_register:
+  case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_register:
+    // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+    // variant encodes Rm == 0xf. Anything else is a register offset post-
+    // increment and we need to add the register operand to the instruction.
+    if (Rm != 0xD && Rm != 0xF &&
+        !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2d8wb_fixed:
+  case ARM::VLD2d16wb_fixed:
+  case ARM::VLD2d32wb_fixed:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2q8wb_fixed:
+  case ARM::VLD2q16wb_fixed:
+  case ARM::VLD2q32wb_fixed:
+    break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 6 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 7 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 10 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 8 && align == 3) return MCDisassembler::Fail;
+  if (type == 9 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (align & 2) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // Writeback Operand
+  switch (Inst.getOpcode()) {
+    case ARM::VST1d8wb_fixed:
+    case ARM::VST1d16wb_fixed:
+    case ARM::VST1d32wb_fixed:
+    case ARM::VST1d64wb_fixed:
+    case ARM::VST1d8wb_register:
+    case ARM::VST1d16wb_register:
+    case ARM::VST1d32wb_register:
+    case ARM::VST1d64wb_register:
+    case ARM::VST1q8wb_fixed:
+    case ARM::VST1q16wb_fixed:
+    case ARM::VST1q32wb_fixed:
+    case ARM::VST1q64wb_fixed:
+    case ARM::VST1q8wb_register:
+    case ARM::VST1q16wb_register:
+    case ARM::VST1q32wb_register:
+    case ARM::VST1q64wb_register:
+    case ARM::VST1d8Twb_fixed:
+    case ARM::VST1d16Twb_fixed:
+    case ARM::VST1d32Twb_fixed:
+    case ARM::VST1d64Twb_fixed:
+    case ARM::VST1d8Twb_register:
+    case ARM::VST1d16Twb_register:
+    case ARM::VST1d32Twb_register:
+    case ARM::VST1d64Twb_register:
+    case ARM::VST1d8Qwb_fixed:
+    case ARM::VST1d16Qwb_fixed:
+    case ARM::VST1d32Qwb_fixed:
+    case ARM::VST1d64Qwb_fixed:
+    case ARM::VST1d8Qwb_register:
+    case ARM::VST1d16Qwb_register:
+    case ARM::VST1d32Qwb_register:
+    case ARM::VST1d64Qwb_register:
+    case ARM::VST2d8wb_fixed:
+    case ARM::VST2d16wb_fixed:
+    case ARM::VST2d32wb_fixed:
+    case ARM::VST2d8wb_register:
+    case ARM::VST2d16wb_register:
+    case ARM::VST2d32wb_register:
+    case ARM::VST2q8wb_fixed:
+    case ARM::VST2q16wb_fixed:
+    case ARM::VST2q32wb_fixed:
+    case ARM::VST2q8wb_register:
+    case ARM::VST2q16wb_register:
+    case ARM::VST2q32wb_register:
+    case ARM::VST2b8wb_fixed:
+    case ARM::VST2b16wb_fixed:
+    case ARM::VST2b32wb_fixed:
+    case ARM::VST2b8wb_register:
+    case ARM::VST2b16wb_register:
+    case ARM::VST2b32wb_register:
+      if (Rm == 0xF)
+        return MCDisassembler::Fail;
+      Inst.addOperand(MCOperand::CreateImm(0));
+      break;
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+    default:
+      if (Rm == 0xD)
+        Inst.addOperand(MCOperand::CreateReg(0));
+      else if (Rm != 0xF) {
+        if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+          return MCDisassembler::Fail;
+      }
+      break;
+    case ARM::VST1d8wb_fixed:
+    case ARM::VST1d16wb_fixed:
+    case ARM::VST1d32wb_fixed:
+    case ARM::VST1d64wb_fixed:
+    case ARM::VST1q8wb_fixed:
+    case ARM::VST1q16wb_fixed:
+    case ARM::VST1q32wb_fixed:
+    case ARM::VST1q64wb_fixed:
+    case ARM::VST1d8Twb_fixed:
+    case ARM::VST1d16Twb_fixed:
+    case ARM::VST1d32Twb_fixed:
+    case ARM::VST1d64Twb_fixed:
+    case ARM::VST1d8Qwb_fixed:
+    case ARM::VST1d16Qwb_fixed:
+    case ARM::VST1d32Qwb_fixed:
+    case ARM::VST1d64Qwb_fixed:
+    case ARM::VST2d8wb_fixed:
+    case ARM::VST2d16wb_fixed:
+    case ARM::VST2d32wb_fixed:
+    case ARM::VST2q8wb_fixed:
+    case ARM::VST2q16wb_fixed:
+    case ARM::VST2q32wb_fixed:
+    case ARM::VST2b8wb_fixed:
+    case ARM::VST2b16wb_fixed:
+    case ARM::VST2b32wb_fixed:
+      break;
+  }
+
+
+  // First input register
+  switch (Inst.getOpcode()) {
+  case ARM::VST1q16:
+  case ARM::VST1q32:
+  case ARM::VST1q64:
+  case ARM::VST1q8:
+  case ARM::VST1q16wb_fixed:
+  case ARM::VST1q16wb_register:
+  case ARM::VST1q32wb_fixed:
+  case ARM::VST1q32wb_register:
+  case ARM::VST1q64wb_fixed:
+  case ARM::VST1q64wb_register:
+  case ARM::VST1q8wb_fixed:
+  case ARM::VST1q8wb_register:
+  case ARM::VST2d16:
+  case ARM::VST2d32:
+  case ARM::VST2d8:
+  case ARM::VST2d16wb_fixed:
+  case ARM::VST2d16wb_register:
+  case ARM::VST2d32wb_fixed:
+  case ARM::VST2d32wb_register:
+  case ARM::VST2d8wb_fixed:
+  case ARM::VST2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VST2b16:
+  case ARM::VST2b32:
+  case ARM::VST2b8:
+  case ARM::VST2b16wb_fixed:
+  case ARM::VST2b16wb_register:
+  case ARM::VST2b32wb_fixed:
+  case ARM::VST2b32wb_register:
+  case ARM::VST2b8wb_fixed:
+  case ARM::VST2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Third input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+
+  if (size == 0 && align == 1)
+    return MCDisassembler::Fail;
+  align *= (1 << size);
+
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1DUPq16: case ARM::VLD1DUPq32: case ARM::VLD1DUPq8:
+  case ARM::VLD1DUPq16wb_fixed: case ARM::VLD1DUPq16wb_register:
+  case ARM::VLD1DUPq32wb_fixed: case ARM::VLD1DUPq32wb_register:
+  case ARM::VLD1DUPq8wb_fixed: case ARM::VLD1DUPq8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  }
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+  // variant encodes Rm == 0xf. Anything else is a register offset post-
+  // increment and we need to add the register operand to the instruction.
+  if (Rm != 0xD && Rm != 0xF &&
+      !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+  unsigned size = 1 << fieldFromInstruction(Insn, 6, 2);
+  align *= 2*size;
+
+  switch (Inst.getOpcode()) {
+  case ARM::VLD2DUPd16: case ARM::VLD2DUPd32: case ARM::VLD2DUPd8:
+  case ARM::VLD2DUPd16wb_fixed: case ARM::VLD2DUPd16wb_register:
+  case ARM::VLD2DUPd32wb_fixed: case ARM::VLD2DUPd32wb_register:
+  case ARM::VLD2DUPd8wb_fixed: case ARM::VLD2DUPd8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2DUPd16x2: case ARM::VLD2DUPd32x2: case ARM::VLD2DUPd8x2:
+  case ARM::VLD2DUPd16x2wb_fixed: case ARM::VLD2DUPd16x2wb_register:
+  case ARM::VLD2DUPd32x2wb_fixed: case ARM::VLD2DUPd32x2wb_register:
+  case ARM::VLD2DUPd8x2wb_fixed: case ARM::VLD2DUPd8x2wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  }
+
+  if (Rm != 0xF)
+    Inst.addOperand(MCOperand::CreateImm(0));
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm != 0xD && Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1;
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+
+  if (size == 0x3) {
+    if (align == 0)
+      return MCDisassembler::Fail;
+    size = 4;
+    align = 16;
+  } else {
+    if (size == 2) {
+      size = 1 << size;
+      align *= 8;
+    } else {
+      size = 1 << size;
+      align *= 4*size;
+    }
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned imm = fieldFromInstruction(Insn, 0, 4);
+  imm |= fieldFromInstruction(Insn, 16, 3) << 4;
+  imm |= fieldFromInstruction(Insn, 24, 1) << 7;
+  imm |= fieldFromInstruction(Insn, 8, 4) << 8;
+  imm |= fieldFromInstruction(Insn, 5, 1) << 12;
+  unsigned Q = fieldFromInstruction(Insn, 6, 1);
+
+  if (Q) {
+    if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  switch (Inst.getOpcode()) {
+    case ARM::VORRiv4i16:
+    case ARM::VORRiv2i32:
+    case ARM::VBICiv4i16:
+    case ARM::VBICiv2i32:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VORRiv8i16:
+    case ARM::VORRiv4i32:
+    case ARM::VBICiv8i16:
+    case ARM::VBICiv4i32:
+      if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  Rm |= fieldFromInstruction(Insn, 5, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 18, 2);
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(8 << size));
+
+  return S;
+}
+
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(8 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(16 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(32 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 7, 1) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  Rm |= fieldFromInstruction(Insn, 5, 1) << 4;
+  unsigned op = fieldFromInstruction(Insn, 6, 1);
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (op) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail; // Writeback
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::VTBL2:
+  case ARM::VTBX2:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                                     uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned dst = fieldFromInstruction(Insn, 8, 3);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch(Inst.getOpcode()) {
+    default:
+      return MCDisassembler::Fail;
+    case ARM::tADR:
+      break; // tADR does not explicitly represent the PC as an operand.
+    case ARM::tADDrSPi:
+      Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  return S;
+}
+
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<12>(Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + (Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(Val << 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 0, 3);
+  unsigned Rm = fieldFromInstruction(Val, 3, 3);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 0, 3);
+  unsigned imm = fieldFromInstruction(Val, 3, 5);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  unsigned imm = Val << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder);
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateImm(Val));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 6, 4);
+  unsigned Rm = fieldFromInstruction(Val, 2, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 2);
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRHs:
+  case ARM::t2STRBs:
+  case ARM::t2STRs:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRBs:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHs:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHs:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRSBs:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRs:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2PLDs:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIs:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHs:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHs:
+      // FIXME: this instruction is only available with MP extensions,
+      // this should be checked first but we don't have access to the
+      // feature bits here.
+      Inst.setOpcode(ARM::t2PLDWs);
+      break;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2PLDs:
+    case ARM::t2PLDWs:
+    case ARM::t2PLIs:
+      break;
+    default:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+        return MCDisassembler::Fail;
+  }
+
+  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
+  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
+  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
+  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 9, 1);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (U << 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRi8:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBi8:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi8:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRHi8:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi8:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2PLDi8:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi8:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi8:
+      return MCDisassembler::Fail;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi8:
+  case ARM::t2PLIi8:
+  case ARM::t2PLDWi8:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= (Rn << 13);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRi12:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi12:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRBi12:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi12:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2PLDi12:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi12:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi12:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2PLDi12);
+      break;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi12:
+  case ARM::t2PLDWi12:
+  case ARM::t2PLIi12:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBT:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHT:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSBT:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSHT:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  int imm = fieldFromInstruction(Insn, 0, 12);
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBpci:
+      case ARM::t2LDRHpci:
+        Inst.setOpcode(ARM::t2PLDpci);
+        break;
+      case ARM::t2LDRSBpci:
+        Inst.setOpcode(ARM::t2PLIpci);
+        break;
+      case ARM::t2LDRSHpci:
+        return MCDisassembler::Fail;
+      default:
+        break;
+    }
+  }
+
+  switch(Inst.getOpcode()) {
+  case ARM::t2PLDpci:
+  case ARM::t2PLIpci:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!U) {
+    // Special case for #-0.
+    if (imm == 0)
+      imm = INT32_MIN;
+    else
+      imm = -imm;
+  }
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  if (Val == 0)
+    Inst.addOperand(MCOperand::CreateImm(INT32_MIN));
+  else {
+    int imm = Val & 0xFF;
+
+    if (!(Val & 0x100)) imm *= -1;
+    Inst.addOperand(MCOperand::CreateImm(imm * 4));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 9);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 8, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                         uint64_t Address, const void *Decoder) {
+  int imm = Val & 0xFF;
+  if (Val == 0)
+    imm = INT32_MIN;
+  else if (!(Val & 0x100))
+    imm *= -1;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 9);
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRT:
+  case ARM::t2STRBT:
+  case ARM::t2STRHT:
+  case ARM::t2STRi8:
+  case ARM::t2STRHi8:
+  case ARM::t2STRBi8:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
+  // Some instructions always use an additive offset.
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+    case ARM::t2LDRBT:
+    case ARM::t2LDRHT:
+    case ARM::t2LDRSBT:
+    case ARM::t2LDRSHT:
+    case ARM::t2STRT:
+    case ARM::t2STRBT:
+    case ARM::t2STRHT:
+      imm |= 0x100;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  addr |= fieldFromInstruction(Insn, 9, 1) << 8;
+  addr |= Rn << 9;
+  unsigned load = fieldFromInstruction(Insn, 20, 1);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDR_PRE:
+    case ARM::t2LDR_POST:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRB_PRE:
+    case ARM::t2LDRB_POST:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRH_PRE:
+    case ARM::t2LDRH_POST:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSB_PRE:
+    case ARM::t2LDRSB_POST:
+      if (Rt == 15)
+        Inst.setOpcode(ARM::t2PLIpci);
+      else
+        Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSH_PRE:
+    case ARM::t2LDRSH_POST:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (!load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned imm = fieldFromInstruction(Insn, 0, 7);
+
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Inst.getOpcode() == ARM::tADDrSP) {
+    unsigned Rdm = fieldFromInstruction(Insn, 0, 3);
+    Rdm |= fieldFromInstruction(Insn, 7, 1) << 3;
+
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else if (Inst.getOpcode() == ARM::tADDspr) {
+    unsigned Rm = fieldFromInstruction(Insn, 3, 4);
+
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                           uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 4, 1) | 0x2;
+  unsigned flags = fieldFromInstruction(Insn, 0, 3);
+
+  Inst.addOperand(MCOperand::CreateImm(imod));
+  Inst.addOperand(MCOperand::CreateImm(flags));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned add = fieldFromInstruction(Insn, 4, 1);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(add));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  // Val is passed in as S:J1:J2:imm10H:imm10L:'0'
+  // Note only one trailing zero not two.  Also the J1 and J2 values are from
+  // the encoded instruction.  So here change to I1 and I2 values via:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with two trailing zeros as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32);
+  unsigned S = (Val >> 23) & 1;
+  unsigned J1 = (Val >> 22) & 1;
+  unsigned J2 = (Val >> 21) & 1;
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  int imm32 = SignExtend32<25>(tmp << 1);
+
+  if (!tryAddingSymbolicOperand(Address,
+                                (Address & ~2u) + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm32));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
+                              uint64_t Address, const void *Decoder) {
+  if (Val == 0xA || Val == 0xB)
+    return MCDisassembler::Fail;
+
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if ((featureBits & ARM::HasV8Ops) && !(Val == 14 || Val == 15))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
+                       uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Rn == ARM::SP) S = MCDisassembler::SoftFail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus
+DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 22, 4);
+  if (pred == 0xE || pred == 0xF) {
+    unsigned opc = fieldFromInstruction(Insn, 4, 28);
+    switch (opc) {
+      default:
+        return MCDisassembler::Fail;
+      case 0xf3bf8f4:
+        Inst.setOpcode(ARM::t2DSB);
+        break;
+      case 0xf3bf8f5:
+        Inst.setOpcode(ARM::t2DMB);
+        break;
+      case 0xf3bf8f6:
+        Inst.setOpcode(ARM::t2ISB);
+        break;
+    }
+
+    unsigned imm = fieldFromInstruction(Insn, 0, 4);
+    return DecodeMemBarrierOption(Inst, imm, Address, Decoder);
+  }
+
+  unsigned brtarget = fieldFromInstruction(Insn, 0, 11) << 1;
+  brtarget |= fieldFromInstruction(Insn, 11, 1) << 19;
+  brtarget |= fieldFromInstruction(Insn, 13, 1) << 18;
+  brtarget |= fieldFromInstruction(Insn, 16, 6) << 12;
+  brtarget |= fieldFromInstruction(Insn, 26, 1) << 20;
+
+  if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+// Decode a shifted immediate operand.  These basically consist
+// of an 8-bit value, and a 4-bit directive that specifies either
+// a splat operation or a rotation.
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  unsigned ctrl = fieldFromInstruction(Val, 10, 2);
+  if (ctrl == 0) {
+    unsigned byte = fieldFromInstruction(Val, 8, 2);
+    unsigned imm = fieldFromInstruction(Val, 0, 8);
+    switch (byte) {
+      case 0:
+        Inst.addOperand(MCOperand::CreateImm(imm));
+        break;
+      case 1:
+        Inst.addOperand(MCOperand::CreateImm((imm << 16) | imm));
+        break;
+      case 2:
+        Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 8)));
+        break;
+      case 3:
+        Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 16) |
+                                             (imm << 8)  |  imm));
+        break;
+    }
+  } else {
+    unsigned unrot = fieldFromInstruction(Val, 0, 7) | 0x80;
+    unsigned rot = fieldFromInstruction(Val, 7, 5);
+    unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31));
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
+                            uint64_t Address, const void *Decoder){
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<9>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                       uint64_t Address, const void *Decoder){
+  // Val is passed in as S:J1:J2:imm10:imm11
+  // Note no trailing zero after imm11.  Also the J1 and J2 values are from
+  // the encoded instruction.  So here change to I1 and I2 values via:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with one trailing zero as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+  unsigned S = (Val >> 23) & 1;
+  unsigned J1 = (Val >> 22) & 1;
+  unsigned J2 = (Val >> 21) & 1;
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  int imm32 = SignExtend32<25>(tmp << 1);
+
+  if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm32));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  if (Val & ~0xf)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  if (Val & ~0xf)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  if (!Val) return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder){
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rn == 0xF || Rd == Rn || Rd == Rt || Rd == Rt+1)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+  if (Rm == 0xF) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0 :
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0: 
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction(Insn, 5, 3);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction(Insn, 5, 3);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction(Insn,  5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction(Insn,  5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned pred = fieldFromInstruction(Insn, 4, 4);
+  unsigned mask = fieldFromInstruction(Insn, 0, 4);
+
+  if (pred == 0xF) {
+    pred = 0xE;
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (mask == 0x0)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(pred));
+  Inst.addOperand(MCOperand::CreateImm(mask));
+  return S;
+}
+
+static DecodeStatus
+DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+  if (Rt == Rt2)
+    Check(S, MCDisassembler::SoftFail);
+
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus
+DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned sign1 = fieldFromInstruction(Insn, 21, 1);
+  unsigned sign2 = fieldFromInstruction(Insn, 23, 1);
+  if (sign1 != sign2) return MCDisassembler::Fail;
+
+  unsigned Val = fieldFromInstruction(Insn, 0, 8);
+  Val |= fieldFromInstruction(Insn, 12, 3) << 8;
+  Val |= fieldFromInstruction(Insn, 26, 1) << 11;
+  Val |= sign1 << 12;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<13>(Val)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  // Shift of "asr #32" is not allowed in Thumb2 mode.
+  if (Val == 0x20) S = MCDisassembler::SoftFail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return S;
+}
+
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  unsigned Rt   = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2  = fieldFromInstruction(Insn, 0,  4);
+  unsigned Rn   = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Rt == Rn || Rn == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // VMOVv2f32 is ambiguous with these decodings.
+  if (!(imm & 0x38) && cmode == 0xF) {
+    if (op == 1) return MCDisassembler::Fail;
+    Inst.setOpcode(ARM::VMOVv2f32);
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // VMOVv4f32 is ambiguous with these decodings.
+  if (!(imm & 0x38) && cmode == 0xF) {
+    if (op == 1) return MCDisassembler::Fail;
+    Inst.setOpcode(ARM::VMOVv4f32);
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 16, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  Rm |= (fieldFromInstruction(Val, 23, 1) << 4);
+  unsigned Cond = fieldFromInstruction(Val, 28, 4);
+ 
+  if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder))) 
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePostIdxReg(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned CRm = fieldFromInstruction(Val, 0, 4);
+  unsigned opc1 = fieldFromInstruction(Val, 4, 4);
+  unsigned cop = fieldFromInstruction(Val, 8, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Val, 16, 4);
+
+  if ((cop & ~0x1) == 0xa)
+    return MCDisassembler::Fail;
+
+  if (Rt == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  Inst.addOperand(MCOperand::CreateImm(cop));
+  Inst.addOperand(MCOperand::CreateImm(opc1));
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(CRm));
+
+  return S;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
new file mode 100644
index 0000000..228fb57
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,1501 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "ARMGenAsmWriter.inc"
+
+/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
+///
+/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
+static unsigned translateShiftImm(unsigned imm) {
+  // lsr #32 and asr #32 exist, but should be encoded as a 0.
+  assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
+  if (imm == 0)
+    return 32;
+  return imm;
+}
+
+/// Prints the shift value with an immediate value.
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+                          unsigned ShImm, bool UseMarkup) {
+  if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
+    return;
+  O << ", ";
+
+  assert (!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+  O << getShiftOpcStr(ShOpc);
+
+  if (ShOpc != ARM_AM::rrx) {
+    O << " ";
+    if (UseMarkup)
+      O << "<imm:";
+    O << "#" << translateShiftImm(ShImm);
+    if (UseMarkup)
+      O << ">";
+  }
+}
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
+                               const MCInstrInfo &MII,
+                               const MCRegisterInfo &MRI,
+                               const MCSubtargetInfo &STI) :
+  MCInstPrinter(MAI, MII, MRI) {
+  // Initialize the set of available features.
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:")
+     << getRegisterName(RegNo)
+     << markup(">");
+}
+
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  unsigned Opcode = MI->getOpcode();
+
+  switch(Opcode) {
+
+  // Check for HINT instructions w/ canonical names.
+  case ARM::HINT:
+  case ARM::tHINT:
+  case ARM::t2HINT:
+    switch (MI->getOperand(0).getImm()) {
+    case 0: O << "\tnop"; break;
+    case 1: O << "\tyield"; break;
+    case 2: O << "\twfe"; break;
+    case 3: O << "\twfi"; break;
+    case 4: O << "\tsev"; break;
+    case 5:
+      if ((getAvailableFeatures() & ARM::HasV8Ops)) {
+        O << "\tsevl";
+        break;
+      } // Fallthrough for non-v8
+    default:
+      // Anything else should just print normally.
+      printInstruction(MI, O);
+      printAnnotation(O, Annot);
+      return;
+    }
+    printPredicateOperand(MI, 1, O);
+    if (Opcode == ARM::t2HINT)
+      O << ".w";
+    printAnnotation(O, Annot);
+    return;
+
+  // Check for MOVs and print canonical forms, instead.
+  case ARM::MOVsr: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+    const MCOperand &MO3 = MI->getOperand(3);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()));
+    printSBitModifierOperand(MI, 6, O);
+    printPredicateOperand(MI, 4, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    O << ", ";
+    printRegName(O, MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  case ARM::MOVsi: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+    printSBitModifierOperand(MI, 5, O);
+    printPredicateOperand(MI, 3, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    O << ", "
+      << markup("<imm:")
+      << "#" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()))
+      << markup(">");
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // A8.6.123 PUSH
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print PUSH if there are at least two registers in the list.
+      O << '\t' << "push";
+      printPredicateOperand(MI, 2, O);
+      if (Opcode == ARM::t2STMDB_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::STR_PRE_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(3).getImm() == -4) {
+      O << '\t' << "push";
+      printPredicateOperand(MI, 4, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(1).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.122 POP
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print POP if there are at least two registers in the list.
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 2, O);
+      if (Opcode == ARM::t2LDMIA_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::LDR_POST_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(4).getImm() == 4) {
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 5, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(0).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.355 VPUSH
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpush";
+      printPredicateOperand(MI, 2, O);
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.354 VPOP
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpop";
+      printPredicateOperand(MI, 2, O);
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::tLDMIA: {
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, O);
+    O << '\t';
+    printRegName(O, BaseReg);
+    if (Writeback) O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Combine 2 GPRs from disassember into a GPRPair to match with instr def.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when decoding them, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  case ARM::LDREXD: case ARM::STREXD:
+  case ARM::LDAEXD: case ARM::STLEXD:
+    const MCRegisterClass& MRC = MRI.getRegClass(ARM::GPRRegClassID);
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
+    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
+    if (MRC.contains(Reg)) {
+      MCInst NewMI;
+      MCOperand NewReg;
+      NewMI.setOpcode(Opcode);
+
+      if (isStore)
+        NewMI.addOperand(MI->getOperand(0));
+      NewReg = MCOperand::CreateReg(MRI.getMatchingSuperReg(Reg, ARM::gsub_0,
+        &MRI.getRegClass(ARM::GPRPairRegClassID)));
+      NewMI.addOperand(NewReg);
+
+      // Copy the rest operands into NewMI.
+      for(unsigned i= isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
+        NewMI.addOperand(MI->getOperand(i));
+      printInstruction(&NewMI, O);
+      return;
+    }
+  }
+
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:")
+      << '#' << formatImm(Op.getImm())
+      << markup(">");
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    const MCExpr *Expr = Op.getExpr();
+    switch (Expr->getKind()) {
+    case MCExpr::Binary:
+      O << '#' << *Expr;
+      break;
+    case MCExpr::Constant: {
+      // If a symbolic branch target was added as a constant expression then
+      // print that address in hex. And only print 32 unsigned bits for the
+      // address.
+      const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
+      int64_t TargetAddress;
+      if (!Constant->EvaluateAsAbsolute(TargetAddress)) {
+        O << '#' << *Expr;
+      } else {
+        O << "0x";
+        O.write_hex(static_cast<uint32_t>(TargetAddress));
+      }
+      break;
+    }
+    default:
+      // FIXME: Should we always treat this as if it is a constant literal and
+      // prefix it with '#'?
+      O << *Expr;
+      break;
+    }
+  }
+}
+
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  if (MO1.isExpr()) {
+    O << *MO1.getExpr();
+    return;
+  }
+
+  O << markup("<mem:") << "[pc, ";
+
+  int32_t OffImm = (int32_t)MO1.getImm();
+  bool isSub = OffImm < 0;
+
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << markup("<imm:")
+      << "#-" << formatImm(-OffImm)
+      << markup(">");
+  } else {
+    O << markup("<imm:")
+      << "#" << formatImm(OffImm)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+  if (ShOpc == ARM_AM::rrx)
+    return;
+
+  O << ' ';
+  printRegName(O, MO2.getReg());
+  assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #2
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm())) { // Don't print +0.
+      O << ", "
+        << markup("<imm:")
+        << "#"
+        << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
+        << ARM_AM::getAM2Offset(MO3.getImm())
+        << markup(">");
+    }
+    O << "]" << markup(">");
+    return;
+  }
+
+  O << ", ";
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()));
+  printRegName(O, MO2.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()),
+                   ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup);
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << ", lsl " << markup("<imm:") << "#1" << markup(">") << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+#ifndef NDEBUG
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm());
+  assert(IdxMode != ARMII::IndexModePost &&
+         "Should be pre or offset index op");
+#endif
+
+  printAM2PreOrOffsetIndexOp(MI, Op, O);
+}
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    O << markup("<imm:")
+      << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
+      << ImmOffs
+      << markup(">");
+    return;
+  }
+
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()));
+  printRegName(O, MO1.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()),
+                   ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #3
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << "], " << markup(">");
+
+  if (MO2.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO3.getImm());
+    printRegName(O, MO2.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+  O << markup("<imm:")
+    << '#'
+    << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
+    << ImmOffs
+    << markup(">");
+}
+
+void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O,
+                                                bool AlwaysPrintImm0) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+
+  O << markup("<mem:") << '[';
+  printRegName(O, MO1.getReg());
+
+  if (MO2.getReg()) {
+    O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()));
+    printRegName(O, MO2.getReg());
+    O << ']' << markup(">");
+    return;
+  }
+
+  //If the op is sub we have to print the immediate even if it is 0
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+  ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
+
+  if (AlwaysPrintImm0 || ImmOffs || (op == ARM_AM::sub)) {
+    O << ", "
+      << markup("<imm:")
+      << "#"
+      << ARM_AM::getAddrOpcStr(op)
+      << ImmOffs
+      << markup(">");
+  }
+  O << ']' << markup(">");
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  if (!MO1.isReg()) {   //  For label symbolic references.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  unsigned IdxMode = ARM_AM::getAM3IdxMode(MO3.getImm());
+
+  if (IdxMode == ARMII::IndexModePost) {
+    printAM3PostIndexOp(MI, Op, O);
+    return;
+  }
+  printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0);
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (MO1.getReg()) {
+    O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()));
+    printRegName(O, MO1.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  O << markup("<imm:")
+    << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI,
+                                             unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:")
+    << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff)
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << (MO2.getImm() ? "" : "-");
+  printRegName(O, MO1.getReg());
+}
+
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI,
+                                             unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:")
+    << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2)
+    << markup(">");
+}
+
+
+void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum)
+                                                 .getImm());
+  O << ARM_AM::getAMSubModeStr(Mode);
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+  unsigned Op = ARM_AM::getAM5Op(MO2.getImm());
+  if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) {
+    O << ", "
+      << markup("<imm:")
+      << "#"
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
+      << ImmOffs * 4
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ":" << (MO2.getImm() << 3);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.getReg() == 0)
+    O << "!";
+  else {
+    O << ", ";
+    printRegName(O, MO.getReg());
+  }
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = countTrailingZeros(v);
+  int32_t width = (32 - countLeadingZeros (v)) - lsb;
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  O << markup("<imm:") << '#' << lsb << markup(">")
+    << ", "
+    << markup("<imm:") << '#' << width << markup(">");
+}
+
+void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_MB::MemBOptToString(val, (getAvailableFeatures() & ARM::HasV8Ops));
+}
+
+void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_ISB::InstSyncBOptToString(val);
+}
+
+void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned ShiftOp = MI->getOperand(OpNum).getImm();
+  bool isASR = (ShiftOp & (1 << 5)) != 0;
+  unsigned Amt = ShiftOp & 0x1f;
+  if (isASR) {
+    O << ", asr "
+      << markup("<imm:")
+      << "#" << (Amt == 0 ? 32 : Amt)
+      << markup(">");
+  }
+  else if (Amt) {
+    O << ", lsl "
+      << markup("<imm:")
+      << "#" << Amt
+      << markup(">");
+  }
+}
+
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
+  O << ", lsl " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  // A shift amount of 32 is encoded as 0.
+  if (Imm == 0)
+    Imm = 32;
+  assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
+  O << ", asr " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  O << "{";
+  for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum) O << ", ";
+    printRegName(O, MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_0));
+  O << ", ";
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_1));
+}
+
+
+void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  if (Op.getImm())
+    O << "be";
+  else
+    O << "le";
+}
+
+void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  O << ARM_PROC::IModToString(Op.getImm());
+}
+
+void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned IFlags = Op.getImm();
+  for (int i=2; i >= 0; --i)
+    if (IFlags & (1 << i))
+      O << ARM_PROC::IFlagsToString(1 << i);
+
+  if (IFlags == 0)
+    O << "none";
+}
+
+void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned SpecRegRBit = Op.getImm() >> 4;
+  unsigned Mask = Op.getImm() & 0xf;
+
+  if (getAvailableFeatures() & ARM::FeatureMClass) {
+    unsigned SYSm = Op.getImm();
+    unsigned Opcode = MI->getOpcode();
+    // For reads of the special registers ignore the "mask encoding" bits
+    // which are only for writes.
+    if (Opcode == ARM::t2MRS_M)
+      SYSm &= 0xff;
+    switch (SYSm) {
+    default: llvm_unreachable("Unexpected mask value!");
+    case     0:
+    case 0x800: O << "apsr"; return; // with _nzcvq bits is an alias for aspr
+    case 0x400: O << "apsr_g"; return;
+    case 0xc00: O << "apsr_nzcvqg"; return;
+    case     1:
+    case 0x801: O << "iapsr"; return; // with _nzcvq bits is an alias for iapsr
+    case 0x401: O << "iapsr_g"; return;
+    case 0xc01: O << "iapsr_nzcvqg"; return;
+    case     2:
+    case 0x802: O << "eapsr"; return; // with _nzcvq bits is an alias for eapsr
+    case 0x402: O << "eapsr_g"; return;
+    case 0xc02: O << "eapsr_nzcvqg"; return;
+    case     3:
+    case 0x803: O << "xpsr"; return; // with _nzcvq bits is an alias for xpsr
+    case 0x403: O << "xpsr_g"; return;
+    case 0xc03: O << "xpsr_nzcvqg"; return;
+    case     5:
+    case 0x805: O << "ipsr"; return;
+    case     6:
+    case 0x806: O << "epsr"; return;
+    case     7:
+    case 0x807: O << "iepsr"; return;
+    case     8:
+    case 0x808: O << "msp"; return;
+    case     9:
+    case 0x809: O << "psp"; return;
+    case  0x10:
+    case 0x810: O << "primask"; return;
+    case  0x11:
+    case 0x811: O << "basepri"; return;
+    case  0x12:
+    case 0x812: O << "basepri_max"; return;
+    case  0x13:
+    case 0x813: O << "faultmask"; return;
+    case  0x14:
+    case 0x814: O << "control"; return;
+    }
+  }
+
+  // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+  // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively.
+  if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+    O << "APSR_";
+    switch (Mask) {
+    default: llvm_unreachable("Unexpected mask value!");
+    case 4:  O << "g"; return;
+    case 8:  O << "nzcvq"; return;
+    case 12: O << "nzcvqg"; return;
+    }
+  }
+
+  if (SpecRegRBit)
+    O << "SPSR";
+  else
+    O << "CPSR";
+
+  if (Mask) {
+    O << '_';
+    if (Mask & 8) O << 'f';
+    if (Mask & 4) O << 's';
+    if (Mask & 2) O << 'x';
+    if (Mask & 1) O << 'c';
+  }
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  // Handle the undefined 15 CC value here for printing so we don't abort().
+  if ((unsigned)CC == 15)
+    O << "<und>";
+  else if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  O << "p" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  O << "c" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "{" << MI->getOperand(OpNum).getImm() << "}";
+}
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
+}
+
+template<unsigned scale>
+void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  if (MO.isExpr()) {
+    O << *MO.getExpr();
+    return;
+  }
+
+  int32_t OffImm = (int32_t)MO.getImm() << scale;
+
+  O << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  O << markup("<imm:")
+    << "#" << formatImm(MI->getOperand(OpNum).getImm() * 4)
+    << markup(">");
+}
+
+void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:")
+    << "#" << formatImm((Imm == 0 ? 32 : Imm))
+    << markup(">");
+}
+
+void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  // (3 - the number of trailing zeros) is the number of then / else.
+  unsigned Mask = MI->getOperand(OpNum).getImm();
+  unsigned Firstcond = MI->getOperand(OpNum-1).getImm();
+  unsigned CondBit0 = Firstcond & 1;
+  unsigned NumTZ = countTrailingZeros(Mask);
+  assert(NumTZ <= 3 && "Invalid IT mask!");
+  for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+    bool T = ((Mask >> Pos) & 1) == CondBit0;
+    if (T)
+      O << 't';
+    else
+      O << 'e';
+  }
+}
+
+void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (unsigned RegNum = MO2.getReg()) {
+    O << ", ";
+    printRegName(O, RegNum);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
+                                                    unsigned Op,
+                                                    raw_ostream &O,
+                                                    unsigned Scale) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (unsigned ImmOffs = MO2.getImm()) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << formatImm(ImmOffs * Scale)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 1);
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 2);
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 4);
+}
+
+void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op,
+                                                 raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 4);
+}
+
+// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
+// register with shift forms.
+// REG 0   0           - e.g. R5
+// REG IMM, SH_OPC     - e.g. R5, LSL #3
+void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  unsigned Reg = MO1.getReg();
+  printRegName(O, Reg);
+
+  // Print the shift opc.
+  assert(MO2.isImm() && "Not a valid t2_so_reg value!");
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << formatImm(-OffImm)
+      << markup(">");
+  }
+  else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << formatImm(OffImm)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+template<bool AlwaysPrintImm0>
+void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+  // Don't print +0.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << -OffImm
+      << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << OffImm
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+template<bool AlwaysPrintImm0>
+void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   //  For label symbolic references.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  // Don't print +0.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << -OffImm
+      << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << OffImm
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI,
+                                                       unsigned OpNum,
+                                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << formatImm(MO2.getImm() * 4)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
+                                                      unsigned OpNum,
+                                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+  O << ", " << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  O << ", " << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  assert(MO2.getReg() && "Invalid so_reg load / store address!");
+  O << ", ";
+  printRegName(O, MO2.getReg());
+
+  unsigned ShAmt = MO3.getImm();
+  if (ShAmt) {
+    assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+    O << ", lsl "
+      << markup("<imm:")
+      << "#" << ShAmt
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  O << markup("<imm:")
+    << '#' << ARM_AM::getFPImmFloat(MO.getImm())
+    << markup(">");
+}
+
+void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  unsigned EncodedImm = MI->getOperand(OpNum).getImm();
+  unsigned EltBits;
+  uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
+  O << markup("<imm:")
+    << "#0x";
+  O.write_hex(Val);
+  O << markup(">");
+}
+
+void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:")
+    << "#" << formatImm(Imm + 1)
+    << markup(">");
+}
+
+void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  O << ", ror "
+    << markup("<imm:")
+    << "#";
+  switch (Imm) {
+  default: assert (0 && "illegal ror immediate!");
+  case 1: O << "8"; break;
+  case 2: O << "16"; break;
+  case 3: O << "24"; break;
+  }
+  O << markup(">");
+}
+
+void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  O << markup("<imm:")
+    << "#" << 16 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  O << markup("<imm:")
+    << "#" << 32 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI,
+                                              unsigned OpNum,
+                                              raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << "[], ";
+  printRegName(O, Reg1);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpacedAllLanes(const MCInst *MI,
+                                                      unsigned OpNum,
+                                                      raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << "[], ";
+  printRegName(O, Reg1);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeSpacedAllLanes(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O  << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListFourSpacedAllLanes(const MCInst *MI,
+                                                       unsigned OpNum,
+                                                       raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+  O << "}";
+}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
new file mode 100644
index 0000000..f671fe4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -0,0 +1,170 @@
+//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTPRINTER_H
+#define ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class ARMInstPrinter : public MCInstPrinter {
+public:
+  ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printSORegRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSORegImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printAddrModeTBB(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrModeTBH(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O,
+                                  bool AlwaysPrintImm0);
+  void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+  void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+
+  void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O);
+  void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printInstSyncBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template <unsigned scale>
+  void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O, unsigned Scale);
+  void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  template<bool AlwaysPrintImm0>
+  void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O);
+  template<bool AlwaysPrintImm0>
+  void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  template<bool AlwaysPrintImm0>
+  void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
+  void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
+  void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O);
+  void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O);
+  void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O);
+  void printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                raw_ostream &O);
+  void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printGPRPairOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O);
+  void printFBits16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printFBits32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListOne(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListTwo(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+  void printVectorListThree(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListFour(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
+  void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O);
+  void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O);
+  void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O);
+  void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/LICENSE.TXT b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
new file mode 100755
index 0000000..68afea1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
@@ -0,0 +1,47 @@
+ARM Limited
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, ARM Limited ("ARM") reserves all
+right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+   Agreement, ARM hereby grants to you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable copyright license to reproduce, prepare derivative
+   works of, publicly display, publicly perform, sublicense, and distribute the
+   Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+   Agreement, ARM hereby grants you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable (except as stated in this section) patent license
+   to make, have made, use, offer to sell, sell, import, and otherwise transfer
+   the Work, where such license applies only to those patent claims licensable
+   by ARM that are necessarily infringed by ARM's Software alone or by
+   combination of the Software with the Work to which such Software was
+   submitted. If any entity institutes patent litigation against ARM or any
+   other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+   that ARM's Software, or the Work to which ARM has contributed constitutes
+   direct or contributory patent infringement, then any patent licenses granted
+   to that entity under this Agreement for the Software or Work shall terminate
+   as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
new file mode 100644
index 0000000..b6c85c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -0,0 +1,668 @@
+//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+  enum ShiftOpc {
+    no_shift = 0,
+    asr,
+    lsl,
+    lsr,
+    ror,
+    rrx
+  };
+
+  enum AddrOpc {
+    sub = 0,
+    add
+  };
+
+  static inline const char *getAddrOpcStr(AddrOpc Op) {
+    return Op == sub ? "-" : "";
+  }
+
+  static inline const char *getShiftOpcStr(ShiftOpc Op) {
+    switch (Op) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::asr: return "asr";
+    case ARM_AM::lsl: return "lsl";
+    case ARM_AM::lsr: return "lsr";
+    case ARM_AM::ror: return "ror";
+    case ARM_AM::rrx: return "rrx";
+    }
+  }
+
+  static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
+    switch (Op) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::asr: return 2;
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::ror: return 3;
+    }
+  }
+
+  enum AMSubMode {
+    bad_am_submode = 0,
+    ia,
+    ib,
+    da,
+    db
+  };
+
+  static inline const char *getAMSubModeStr(AMSubMode Mode) {
+    switch (Mode) {
+    default: llvm_unreachable("Unknown addressing sub-mode!");
+    case ARM_AM::ia: return "ia";
+    case ARM_AM::ib: return "ib";
+    case ARM_AM::da: return "da";
+    case ARM_AM::db: return "db";
+    }
+  }
+
+  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+  ///
+  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val >> Amt) | (Val << ((32-Amt)&31));
+  }
+
+  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+  ///
+  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val << Amt) | (Val >> ((32-Amt)&31));
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #1: shift_operand with registers
+  //===--------------------------------------------------------------------===//
+  //
+  // This 'addressing mode' is used for arithmetic instructions.  It can
+  // represent things like:
+  //   reg
+  //   reg [asr|lsl|lsr|ror|rrx] reg
+  //   reg [asr|lsl|lsr|ror|rrx] imm
+  //
+  // This is stored three operands [rega, regb, opc].  The first is the base
+  // reg, the second is the shift amount (or reg0 if not present or imm).  The
+  // third operand encodes the shift opcode and the imm if a reg isn't present.
+  //
+  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+    return ShOp | (Imm << 3);
+  }
+  static inline unsigned getSORegOffset(unsigned Op) {
+    return Op >> 3;
+  }
+  static inline ShiftOpc getSORegShOp(unsigned Op) {
+    return (ShiftOpc)(Op & 7);
+  }
+
+  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+  /// the 8-bit imm value.
+  static inline unsigned getSOImmValImm(unsigned Imm) {
+    return Imm & 0xFF;
+  }
+  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+  /// the rotate amount.
+  static inline unsigned getSOImmValRot(unsigned Imm) {
+    return (Imm >> 8) * 2;
+  }
+
+  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+  /// computing the rotate amount to use.  If this immediate value cannot be
+  /// handled with a single shifter-op, determine a good rotate amount that will
+  /// take a maximal chunk of bits out of the immediate.
+  static inline unsigned getSOImmValRotate(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the rotate amount.
+    unsigned TZ = countTrailingZeros(Imm);
+
+    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
+    // not 9.
+    unsigned RotAmt = TZ & ~1;
+
+    // If we can handle this spread, return it.
+    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+      return (32-RotAmt)&31;  // HW rotates right, not left.
+
+    // For values like 0xF000000F, we should ignore the low 6 bits, then
+    // retry the hunt.
+    if (Imm & 63U) {
+      unsigned TZ2 = countTrailingZeros(Imm & ~63U);
+      unsigned RotAmt2 = TZ2 & ~1;
+      if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
+        return (32-RotAmt2)&31;  // HW rotates right, not left.
+    }
+
+    // Otherwise, we have no way to cover this span of bits with a single
+    // shifter_op immediate.  Return a chunk of bits that will be useful to
+    // handle.
+    return (32-RotAmt)&31;  // HW rotates right, not left.
+  }
+
+  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into an shifter_operand immediate operand, return the 12-bit encoding for
+  /// it.  If not, return -1.
+  static inline int getSOImmVal(unsigned Arg) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Arg & ~255U) == 0) return Arg;
+
+    unsigned RotAmt = getSOImmValRotate(Arg);
+
+    // If this cannot be handled with a single shifter_op, bail out.
+    if (rotr32(~255U, RotAmt) & Arg)
+      return -1;
+
+    // Encode this correctly.
+    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+  }
+
+  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+  /// or'ing together two SOImmVal's.
+  static inline bool isSOImmTwoPartVal(unsigned V) {
+    // If this can be handled with a single shifter_op, bail out.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled with two shifter_op's, accept.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    return V == 0;
+  }
+
+  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the first chunk of it.
+  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+    return rotr32(255U, getSOImmValRotate(V)) & V;
+  }
+
+  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the second chunk of it.
+  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+    // Mask out the first hunk.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+    // Take what's left.
+    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+    return V;
+  }
+
+  /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return countTrailingZeros(Imm);
+  }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting a 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~255U << getThumbImmValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImm16ValShift(unsigned Imm) {
+    // 16-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~65535U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return countTrailingZeros(Imm);
+  }
+
+  /// isThumbImm16ShiftedVal - Return true if the specified value can be
+  /// obtained by left shifting a 16-bit immediate.
+  static inline bool isThumbImm16ShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~65535U << getThumbImm16ValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shiftd value.
+  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+    return V >> getThumbImmValShift(V);
+  }
+
+
+  /// getT2SOImmValSplat - Return the 12-bit encoded representation
+  /// if the specified value can be obtained by splatting the low 8 bits
+  /// into every other byte or every byte of a 32-bit value. i.e.,
+  ///     00000000 00000000 00000000 abcdefgh    control = 0
+  ///     00000000 abcdefgh 00000000 abcdefgh    control = 1
+  ///     abcdefgh 00000000 abcdefgh 00000000    control = 2
+  ///     abcdefgh abcdefgh abcdefgh abcdefgh    control = 3
+  /// Return -1 if none of the above apply.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValSplatVal(unsigned V) {
+    unsigned u, Vs, Imm;
+    // control = 0
+    if ((V & 0xffffff00) == 0)
+      return V;
+
+    // If the value is zeroes in the first byte, just shift those off
+    Vs = ((V & 0xff) == 0) ? V >> 8 : V;
+    // Any passing value only has 8 bits of payload, splatted across the word
+    Imm = Vs & 0xff;
+    // Likewise, any passing values have the payload splatted into the 3rd byte
+    u = Imm | (Imm << 16);
+
+    // control = 1 or 2
+    if (Vs == u)
+      return (((Vs == V) ? 1 : 2) << 8) | Imm;
+
+    // control = 3
+    if (Vs == (u | (u << 8)))
+      return (3 << 8) | Imm;
+
+    return -1;
+  }
+
+  /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
+  /// specified value is a rotated 8-bit value. Return -1 if no rotation
+  /// encoding is possible.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValRotateVal(unsigned V) {
+    unsigned RotAmt = countLeadingZeros(V);
+    if (RotAmt >= 24)
+      return -1;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    if ((rotr32(0xff000000U, RotAmt) & V) == V)
+      return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
+
+    return -1;
+  }
+
+  /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+  /// encoding for it.  If not, return -1.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmVal(unsigned Arg) {
+    // If 'Arg' is an 8-bit splat, then get the encoded value.
+    int Splat = getT2SOImmValSplatVal(Arg);
+    if (Splat != -1)
+      return Splat;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    int Rot = getT2SOImmValRotateVal(Arg);
+    if (Rot != -1)
+      return Rot;
+
+    return -1;
+  }
+
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if ((V & ~255U) == 0) return 0;
+    // Use CTZ to compute the rotate amount.
+    unsigned RotAmt = countTrailingZeros(V);
+    return (32 - RotAmt) & 31;
+  }
+
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // Passing values can be any combination of splat values and shifter
+    // values. If this can be handled with a single shifter or splat, bail
+    // out. Those should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Otherwise, do not accept.
+    return false;
+  }
+
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immedate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part
+    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+
+    // Try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+    return Imm & 0x00ff00ffU;
+  }
+
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    // Mask out the first hunk
+    Imm ^= getT2SOImmTwoPartFirst(Imm);
+    // Return what's left
+    assert (getT2SOImmVal(Imm) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Imm;
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #2
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for most simple load/store instructions.
+  //
+  // addrmode2 := reg +/- reg shop imm
+  // addrmode2 := reg +/- imm12
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
+  // fourth operand 16-17 encodes the index mode.
+  //
+  // If this addressing mode is a frame index (before prolog/epilog insertion
+  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
+  // with no shift amount for the frame offset.
+  //
+  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
+                                   unsigned IdxMode = 0) {
+    assert(Imm12 < (1 << 12) && "Imm too large!");
+    bool isSub = Opc == sub;
+    return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
+  }
+  static inline unsigned getAM2Offset(unsigned AM2Opc) {
+    return AM2Opc & ((1 << 12)-1);
+  }
+  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+    return ((AM2Opc >> 12) & 1) ? sub : add;
+  }
+  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+    return (ShiftOpc)((AM2Opc >> 13) & 7);
+  }
+  static inline unsigned getAM2IdxMode(unsigned AM2Opc) {
+    return (AM2Opc >> 16);
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #3
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for sign-extending loads, and load/store-pair instructions.
+  //
+  // addrmode3 := reg +/- reg
+  // addrmode3 := reg +/- imm8
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
+  // index mode.
+
+  /// getAM3Opc - This function encodes the addrmode3 opc field.
+  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
+                                   unsigned IdxMode = 0) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset | (IdxMode << 9);
+  }
+  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+    return AM3Opc & 0xFF;
+  }
+  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+    return ((AM3Opc >> 8) & 1) ? sub : add;
+  }
+  static inline unsigned getAM3IdxMode(unsigned AM3Opc) {
+    return (AM3Opc >> 9);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #4
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for load / store multiple instructions.
+  //
+  // addrmode4 := reg, <mode>
+  //
+  // The four modes are:
+  //    IA - Increment after
+  //    IB - Increment before
+  //    DA - Decrement after
+  //    DB - Decrement before
+  // For VFP instructions, only the IA and DB modes are valid.
+
+  static inline AMSubMode getAM4SubMode(unsigned Mode) {
+    return (AMSubMode)(Mode & 0x7);
+  }
+
+  static inline unsigned getAM4ModeImm(AMSubMode SubMode) {
+    return (int)SubMode;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as FP load/stores.
+  //
+  // addrmode5 := reg +/- imm8*4
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation in bit 8 and the immediate in bits 0-7.
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field.
+  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #6
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for NEON load / store instructions.
+  //
+  // addrmode6 := reg with optional alignment
+  //
+  // This is stored in two operands [regaddr, align].  The first is the
+  // address register.  The second operand is the value of the alignment
+  // specifier in bytes or zero if no explicit alignment.
+  // Valid alignments depend on the specific instruction.
+
+  //===--------------------------------------------------------------------===//
+  // NEON Modified Immediates
+  //===--------------------------------------------------------------------===//
+  //
+  // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+  // vector operand, where a small immediate encoded in the instruction
+  // specifies a full NEON vector value.  These modified immediates are
+  // represented here as encoded integers.  The low 8 bits hold the immediate
+  // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
+  // the "Cmode" field of the instruction.  The interfaces below treat the
+  // Op and Cmode values as a single 5-bit value.
+
+  static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
+    return (OpCmode << 8) | Val;
+  }
+  static inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
+    return (ModImm >> 8) & 0x1f;
+  }
+  static inline unsigned getNEONModImmVal(unsigned ModImm) {
+    return ModImm & 0xff;
+  }
+
+  /// decodeNEONModImm - Decode a NEON modified immediate value into the
+  /// element value and the element size in bits.  (If the element size is
+  /// smaller than the vector, it is splatted into all the elements.)
+  static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
+    unsigned OpCmode = getNEONModImmOpCmode(ModImm);
+    unsigned Imm8 = getNEONModImmVal(ModImm);
+    uint64_t Val = 0;
+
+    if (OpCmode == 0xe) {
+      // 8-bit vector elements
+      Val = Imm8;
+      EltBits = 8;
+    } else if ((OpCmode & 0xc) == 0x8) {
+      // 16-bit vector elements
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 16;
+    } else if ((OpCmode & 0x8) == 0) {
+      // 32-bit vector elements, zero with one byte set
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 32;
+    } else if ((OpCmode & 0xe) == 0xc) {
+      // 32-bit vector elements, one byte with low bits set
+      unsigned ByteNum = 1 + (OpCmode & 0x1);
+      Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
+      EltBits = 32;
+    } else if (OpCmode == 0x1e) {
+      // 64-bit vector elements
+      for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+        if ((ModImm >> ByteNum) & 1)
+          Val |= (uint64_t)0xff << (8 * ByteNum);
+      }
+      EltBits = 64;
+    } else {
+      llvm_unreachable("Unsupported NEON immediate");
+    }
+    return Val;
+  }
+
+  AMSubMode getLoadStoreMultipleSubMode(int Opcode);
+
+  //===--------------------------------------------------------------------===//
+  // Floating-point Immediates
+  //
+  static inline float getFPImmFloat(unsigned Imm) {
+    // We expect an 8-bit binary encoding of a floating-point number here.
+    union {
+      uint32_t I;
+      float F;
+    } FPUnion;
+
+    uint8_t Sign = (Imm >> 7) & 0x1;
+    uint8_t Exp = (Imm >> 4) & 0x7;
+    uint8_t Mantissa = Imm & 0xf;
+
+    //   8-bit FP    iEEEE Float Encoding
+    //   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+    //
+    // where B = NOT(b);
+
+    FPUnion.I = 0;
+    FPUnion.I |= Sign << 31;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+    FPUnion.I |= (Exp & 0x3) << 23;
+    FPUnion.I |= Mantissa << 19;
+    return FPUnion.F;
+  }
+
+  /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP32Imm(const APInt &Imm) {
+    uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+    int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
+    int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0x7ffff)
+      return -1;
+    Mantissa >>= 19;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP32Imm(const APFloat &FPImm) {
+    return getFP32Imm(FPImm.bitcastToAPInt());
+  }
+
+  /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP64Imm(const APInt &Imm) {
+    uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+    int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+    uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0xffffffffffffULL)
+      return -1;
+    Mantissa >>= 48;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP64Imm(const APFloat &FPImm) {
+    return getFP64Imm(FPImm.bitcastToAPInt());
+  }
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMArchName.def b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMArchName.def
new file mode 100644
index 0000000..9f007a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMArchName.def
@@ -0,0 +1,50 @@
+//===-- ARMArchName.def - List of the ARM arch names ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM architecture names,
+// i.e. the supported value for -march= option.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_ARCH_NAME
+#error "You must define ARM_ARCH_NAME before including ARMArchName.def"
+#endif
+
+// ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH)
+ARM_ARCH_NAME("armv2",   ARMV2,   "2",       v4)
+ARM_ARCH_NAME("armv2a",  ARMV2A,  "2A",      v4)
+ARM_ARCH_NAME("armv3",   ARMV3,   "3",       v4)
+ARM_ARCH_NAME("armv3m",  ARMV3M,  "3M",      v4)
+ARM_ARCH_NAME("armv4",   ARMV4,   "4",       v4)
+ARM_ARCH_NAME("armv4t",  ARMV4T,  "4T",      v4T)
+ARM_ARCH_NAME("armv5",   ARMV5,   "5",       v5T)
+ARM_ARCH_NAME("armv5t",  ARMV5T,  "5T",      v5T)
+ARM_ARCH_NAME("armv5te", ARMV5TE, "5TE",     v5TE)
+ARM_ARCH_NAME("armv6",   ARMV6,   "6",       v6)
+ARM_ARCH_NAME("armv6j",  ARMV6J,  "6J",      v6)
+ARM_ARCH_NAME("armv6t2", ARMV6T2, "6T2",     v6T2)
+ARM_ARCH_NAME("armv6z",  ARMV6Z,  "6Z",      v6KZ)
+ARM_ARCH_NAME("armv6zk", ARMV6ZK, "6ZK",     v6KZ)
+ARM_ARCH_NAME("armv6-m", ARMV6M,  "6-M",     v6_M)
+ARM_ARCH_NAME("armv7",   ARMV7,   "7",       v7)
+ARM_ARCH_NAME("armv7-a", ARMV7A,  "7-A",     v7)
+ARM_ARCH_ALIAS("armv7a", ARMV7A)
+ARM_ARCH_NAME("armv7-r", ARMV7R,  "7-R",     v7)
+ARM_ARCH_ALIAS("armv7r", ARMV7R)
+ARM_ARCH_NAME("armv7-m", ARMV7M,  "7-M",     v7)
+ARM_ARCH_ALIAS("armv7m", ARMV7M)
+ARM_ARCH_NAME("armv8-a", ARMV8A,  "8-A",     v8)
+ARM_ARCH_ALIAS("armv8a", ARMV8A)
+ARM_ARCH_NAME("iwmmxt",  IWMMXT,  "iwmmxt",  v5TE)
+ARM_ARCH_NAME("iwmmxt2", IWMMXT2, "iwmmxt2", v5TE)
+
+#undef ARM_ARCH_NAME
+#undef ARM_ARCH_ALIAS
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMArchName.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMArchName.h
new file mode 100644
index 0000000..34b9fc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMArchName.h
@@ -0,0 +1,27 @@
+//===-- ARMArchName.h - List of the ARM arch names --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMARCHNAME_H
+#define ARMARCHNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum ArchKind {
+  INVALID_ARCH = 0
+
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) , ID
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif // ARMARCHNAME_H
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
new file mode 100644
index 0000000..7acd9cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -0,0 +1,860 @@
+//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  ARMELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
+                              /*HasRelocationAddend*/ false) {}
+};
+
+class ARMAsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo* STI;
+  bool isThumbMode;     // Currently emitting Thumb code.
+  bool IsLittleEndian;  // Big or little endian.
+public:
+  ARMAsmBackend(const Target &T, const StringRef TT, bool IsLittle)
+    : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+      isThumbMode(TT.startswith("thumb")), IsLittleEndian(IsLittle) {}
+
+  ~ARMAsmBackend() {
+    delete STI;
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return ARM::NumTargetFixupKinds;
+  }
+
+  bool hasNOP() const {
+    return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// ARMFixupKinds.h.
+//
+// Name                      Offset (bits) Size (bits)     Flags
+{ "fixup_arm_ldst_pcrel_12", 0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_pcrel_10_unscaled", 0,        32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_pcrel_10",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_thumb_adr_pcrel_10",0,            8,   MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_adr_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_condbranch",    0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbranch",  0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbl",      0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_condbl",        0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_blx",           0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp",      0,             8,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
+// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
+{ "fixup_arm_movt_hi16",     0,            20,  0 },
+{ "fixup_arm_movw_lo16",     0,            20,  0 },
+{ "fixup_t2_movt_hi16",      0,            20,  0 },
+{ "fixup_t2_movw_lo16",      0,            20,  0 },
+    };
+    const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// ARMFixupKinds.h.
+//
+// Name                      Offset (bits) Size (bits)     Flags
+{ "fixup_arm_ldst_pcrel_12", 0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_pcrel_10_unscaled", 0,        32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_pcrel_10",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_thumb_adr_pcrel_10",8,            8,   MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_adr_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_condbranch",    8,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbranch",  8,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbl",      8,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_condbl",        8,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_blx",           8,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp",      8,             8,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_thumb_bcc",     8,             8,  MCFixupKindInfo::FKF_IsPCRel },
+// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
+{ "fixup_arm_movt_hi16",     12,           20,  0 },
+{ "fixup_arm_movw_lo16",     12,           20,  0 },
+{ "fixup_t2_movt_hi16",      12,           20,  0 },
+{ "fixup_t2_movw_lo16",      12,           20,  0 },
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+  }
+
+  /// processFixupValue - Target hook to process the literal value of a fixup
+  /// if necessary.
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void handleAssemblerFlag(MCAssemblerFlag Flag) override {
+    switch (Flag) {
+    default: break;
+    case MCAF_Code16:
+      setIsThumb(true);
+      break;
+    case MCAF_Code32:
+      setIsThumb(false);
+      break;
+    }
+  }
+
+  unsigned getPointerSize() const { return 4; }
+  bool isThumb() const { return isThumbMode; }
+  void setIsThumb(bool it) { isThumbMode = it; }
+  bool isLittle() const { return IsLittleEndian; }
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+  switch (Op) {
+  default: return Op;
+  case ARM::tBcc:       return ARM::t2Bcc;
+  case ARM::tLDRpci:    return ARM::t2LDRpci;
+  case ARM::tADR:       return ARM::t2ADR;
+  case ARM::tB:         return ARM::t2B;
+  case ARM::tCBZ:       return ARM::tHINT;
+  case ARM::tCBNZ:      return ARM::tHINT;
+  }
+}
+
+bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+  return false;
+}
+
+bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  switch ((unsigned)Fixup.getKind()) {
+  case ARM::fixup_arm_thumb_br: {
+    // Relaxing tB to t2B. tB has a signed 12-bit displacement with the
+    // low bit being an implied zero. There's an implied +4 offset for the
+    // branch, so we adjust the other way here to determine what's
+    // encodable.
+    //
+    // Relax if the value is too big for a (signed) i8.
+    int64_t Offset = int64_t(Value) - 4;
+    return Offset > 2046 || Offset < -2048;
+  }
+  case ARM::fixup_arm_thumb_bcc: {
+    // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the
+    // low bit being an implied zero. There's an implied +4 offset for the
+    // branch, so we adjust the other way here to determine what's
+    // encodable.
+    //
+    // Relax if the value is too big for a (signed) i8.
+    int64_t Offset = int64_t(Value) - 4;
+    return Offset > 254 || Offset < -256;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+  case ARM::fixup_arm_thumb_cp: {
+    // If the immediate is negative, greater than 1020, or not a multiple
+    // of four, the wide version of the instruction must be used.
+    int64_t Offset = int64_t(Value) - 4;
+    return Offset > 1020 || Offset < 0 || Offset & 3;
+  }
+  case ARM::fixup_arm_thumb_cb:
+    // If we have a Thumb CBZ or CBNZ instruction and its target is the next
+    // instruction it is is actually out of range for the instruction.
+    // It will be changed to a NOP.
+    int64_t Offset = (Value & ~1);
+    return Offset == 2;
+  }
+  llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!");
+}
+
+void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  // Sanity check w/ diagnostic if we get here w/ a bogus instruction.
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  // If we are changing Thumb CBZ or CBNZ instruction to a NOP, aka tHINT, we
+  // have to change the operands too.
+  if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) &&
+      RelaxedOp == ARM::tHINT) {
+    Res.setOpcode(RelaxedOp);
+    Res.addOperand(MCOperand::CreateImm(0));
+    Res.addOperand(MCOperand::CreateImm(14));
+    Res.addOperand(MCOperand::CreateReg(0));
+    return;
+  } 
+
+  // The rest of instructions we're relaxing have the same operands.
+  // We just need to update to the proper opcode.
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
+  const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
+  const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
+  const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP
+  if (isThumb()) {
+    const uint16_t nopEncoding = hasNOP() ? Thumb2_16bitNopEncoding
+                                          : Thumb1_16bitNopEncoding;
+    uint64_t NumNops = Count / 2;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write16(nopEncoding);
+    if (Count & 1)
+      OW->Write8(0);
+    return true;
+  }
+  // ARM mode
+  const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding
+                                        : ARMv4_NopEncoding;
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->Write32(nopEncoding);
+  // FIXME: should this function return false when unable to write exactly
+  // 'Count' bytes with NOP encodings?
+  switch (Count % 4) {
+  default: break; // No leftover bytes to write
+  case 1: OW->Write8(0); break;
+  case 2: OW->Write16(0); break;
+  case 3: OW->Write16(0); OW->Write8(0xa0); break;
+  }
+
+  return true;
+}
+
+static uint32_t swapHalfWords(uint32_t Value, bool IsLittleEndian) {
+  if (IsLittleEndian) {
+    // Note that the halfwords are stored high first and low second in thumb;
+    // so we need to swap the fixup value here to map properly.
+    uint32_t Swapped = (Value & 0xFFFF0000) >> 16;
+    Swapped |= (Value & 0x0000FFFF) << 16;
+    return Swapped;
+  }
+  else
+    return Value;
+}
+
+static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
+                              bool IsLittleEndian) {
+  uint32_t Value;
+
+  if (IsLittleEndian) {
+    Value = (SecondHalf & 0xFFFF) << 16;
+    Value |= (FirstHalf & 0xFFFF);
+  } else {
+    Value = (SecondHalf & 0xFFFF);
+    Value |= (FirstHalf & 0xFFFF) << 16;
+  }
+
+  return Value;
+}
+
+static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 bool IsPCRel, MCContext *Ctx,
+                                 bool IsLittleEndian) {
+  unsigned Kind = Fixup.getKind();
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+    return Value;
+  case FK_SecRel_2:
+    return Value;
+  case FK_SecRel_4:
+    return Value;
+  case ARM::fixup_arm_movt_hi16:
+    if (!IsPCRel)
+      Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_arm_movw_lo16: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned Lo12 = Value & 0x0FFF;
+    // inst{19-16} = Hi4;
+    // inst{11-0} = Lo12;
+    Value = (Hi4 << 16) | (Lo12);
+    return Value;
+  }
+  case ARM::fixup_t2_movt_hi16:
+    if (!IsPCRel)
+      Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned i = (Value & 0x800) >> 11;
+    unsigned Mid3 = (Value & 0x700) >> 8;
+    unsigned Lo8 = Value & 0x0FF;
+    // inst{19-16} = Hi4;
+    // inst{26} = i;
+    // inst{14-12} = Mid3;
+    // inst{7-0} = Lo8;
+    Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
+    return swapHalfWords(Value, IsLittleEndian);
+  }
+  case ARM::fixup_arm_ldst_pcrel_12:
+    // ARM PC-relative values are offset by 8.
+    Value -= 4;
+    // FALLTHROUGH
+  case ARM::fixup_t2_ldst_pcrel_12: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value -= 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    if (Ctx && Value >= 4096)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_ldst_pcrel_12)
+      return swapHalfWords(Value, IsLittleEndian);
+
+    return Value;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return ((Value - 4) >> 2) & 0xff;
+  case ARM::fixup_arm_adr_pcrel_12: {
+    // ARM PC-relative values are offset by 8.
+    Value -= 8;
+    unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 2; // 0b0010
+    }
+    if (Ctx && ARM_AM::getSOImmVal(Value) == -1)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    // Encode the immediate and shift the opcode into place.
+    return ARM_AM::getSOImmVal(Value) | (opc << 21);
+  }
+
+  case ARM::fixup_t2_adr_pcrel_12: {
+    Value -= 4;
+    unsigned opc = 0;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 5;
+    }
+
+    uint32_t out = (opc << 21);
+    out |= (Value & 0x800) << 15;
+    out |= (Value & 0x700) << 4;
+    out |= (Value & 0x0FF);
+
+    return swapHalfWords(out, IsLittleEndian);
+  }
+
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    // These values don't encode the low two bits since they're always zero.
+    // Offset by 8 just as above.
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+        return 0;
+    return 0xffffff & ((Value - 8) >> 2);
+  case ARM::fixup_t2_uncondbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint32_t out = 0;
+    bool I =  Value & 0x800000;
+    bool J1 = Value & 0x400000;
+    bool J2 = Value & 0x200000;
+    J1 ^= I;
+    J2 ^= I;
+
+    out |= I  << 26; // S bit
+    out |= !J1 << 13; // J1 bit
+    out |= !J2 << 11; // J2 bit
+    out |= (Value & 0x1FF800)  << 5; // imm6 field
+    out |= (Value & 0x0007FF);        // imm11 field
+
+    return swapHalfWords(out, IsLittleEndian);
+  }
+  case ARM::fixup_t2_condbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint64_t out = 0;
+    out |= (Value & 0x80000) << 7; // S bit
+    out |= (Value & 0x40000) >> 7; // J2 bit
+    out |= (Value & 0x20000) >> 4; // J1 bit
+    out |= (Value & 0x1F800) << 5; // imm6 field
+    out |= (Value & 0x007FF);      // imm11 field
+
+    return swapHalfWords(out, IsLittleEndian);
+  }
+  case ARM::fixup_arm_thumb_bl: {
+    // The value doesn't encode the low bit (always zero) and is offset by
+    // four. The 32-bit immediate value is encoded as
+    //   imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
+    // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+    // The value is encoded into disjoint bit positions in the destination
+    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+    // J = either J1 or J2 bit
+    //
+    //   BL:  xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    uint32_t offset = (Value - 4) >> 1;
+    uint32_t signBit = (offset & 0x800000) >> 23;
+    uint32_t I1Bit = (offset & 0x400000) >> 22;
+    uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+    uint32_t I2Bit = (offset & 0x200000) >> 21;
+    uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+    uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
+    uint32_t imm11Bits = (offset & 0x000007FF);
+
+    uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+    uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+                          (uint16_t)imm11Bits);
+    return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+  }
+  case ARM::fixup_arm_thumb_blx: {
+    // The value doesn't encode the low two bits (always zero) and is offset by
+    // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
+    //   imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
+    // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+    // The value is encoded into disjoint bit positions in the destination
+    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+    // J = either J1 or J2 bit, 0 = zero.
+    //
+    //   BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    uint32_t offset = (Value - 2) >> 2;
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+        offset = 0;
+    uint32_t signBit = (offset & 0x400000) >> 22;
+    uint32_t I1Bit = (offset & 0x200000) >> 21;
+    uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+    uint32_t I2Bit = (offset & 0x100000) >> 20;
+    uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+    uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
+    uint32_t imm10LBits = (offset & 0x3FF);
+
+    uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+    uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+                          ((uint16_t)imm10LBits) << 1);
+    return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+  }
+  case ARM::fixup_arm_thumb_cp:
+    // Offset by 4, and don't encode the low two bits. Two bytes of that
+    // 'off by 4' is implicitly handled by the half-word ordering of the
+    // Thumb encoding, so we only need to adjust by 2 here.
+    return ((Value - 2) >> 2) & 0xff;
+  case ARM::fixup_arm_thumb_cb: {
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    uint32_t Binary = (Value - 4) >> 1;
+    return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
+  }
+  case ARM::fixup_arm_thumb_br:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0x7ff;
+  case ARM::fixup_arm_thumb_bcc:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0xff;
+  case ARM::fixup_arm_pcrel_10_unscaled: {
+    Value = Value - 8; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
+    if (Ctx && Value >= 256)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    Value = (Value & 0xf) | ((Value & 0xf0) << 4);
+    return Value | (isAdd << 23);
+  }
+  case ARM::fixup_arm_pcrel_10:
+    Value = Value - 4; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    // Fall through.
+  case ARM::fixup_t2_pcrel_10: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value = Value - 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // These values don't encode the low two bits since they're always zero.
+    Value >>= 2;
+    if (Ctx && Value >= 256)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
+    // swapped.
+    if (Kind == ARM::fixup_t2_pcrel_10)
+      return swapHalfWords(Value, IsLittleEndian);
+
+    return Value;
+  }
+  }
+}
+
+void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                      const MCAsmLayout &Layout,
+                                      const MCFixup &Fixup,
+                                      const MCFragment *DF,
+                                      const MCValue &Target, uint64_t &Value,
+                                      bool &IsResolved) {
+  const MCSymbolRefExpr *A = Target.getSymA();
+  // Some fixups to thumb function symbols need the low bit (thumb bit)
+  // twiddled.
+  if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) {
+    if (A) {
+      const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
+      if (Asm.isThumbFunc(&Sym))
+        Value |= 1;
+    }
+  }
+  // For Thumb1 BL instruction, it is possible to be a long jump between
+  // the basic blocks of the same function.  Thus, we would like to resolve
+  // the offset when the destination has the same MCFragment.
+  if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+    const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
+    const MCSymbolData &SymData = Asm.getSymbolData(Sym);
+    IsResolved = (SymData.getFragment() == DF);
+  }
+  // We must always generate a relocation for BL/BLX instructions if we have
+  // a symbol to reference, as the linker relies on knowing the destination
+  // symbol's thumb-ness to get interworking right.
+  if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
+    IsResolved = false;
+
+  // Try to get the encoded value for the fixup as-if we're mapping it into
+  // the instruction. This allows adjustFixupValue() to issue a diagnostic
+  // if the value aren't invalid.
+  (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
+                         IsLittleEndian);
+}
+
+/// getFixupKindNumBytes - The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return 1;
+
+  case FK_Data_2:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+    return 2;
+
+  case ARM::fixup_arm_pcrel_10_unscaled:
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    return 3;
+
+  case FK_Data_4:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+    return 4;
+
+  case FK_SecRel_2:
+    return 2;
+  case FK_SecRel_4:
+    return 4;
+  }
+}
+
+/// getFixupKindContainerSizeBytes - The number of bytes of the
+/// container involved in big endian.
+static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+    return 2;
+  case FK_Data_4:
+    return 4;
+
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+    // Instruction size is 2 bytes.
+    return 2;
+
+  case ARM::fixup_arm_pcrel_10_unscaled:
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+    // Instruction size is 4 bytes.
+    return 4;
+  }
+}
+
+void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian);
+  if (!Value) return;           // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // Used to point to big endian bytes.
+  unsigned FullSizeBytes;
+  if (!IsLittleEndian) {
+    FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+    assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!");
+    assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
+  }
+
+  // For each byte of the fragment that the fixup touches, mask in the bits from
+  // the fixup value. The Value has been "split up" into the appropriate
+  // bitfields above.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i);
+    Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+  }
+}
+
+namespace {
+// FIXME: This should be in a separate file.
+class ARMWinCOFFAsmBackend : public ARMAsmBackend {
+public:
+  ARMWinCOFFAsmBackend(const Target &T, const StringRef &Triple)
+    : ARMAsmBackend(T, Triple, true) { }
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+  }
+};
+
+// FIXME: This should be in a separate file.
+// ELF is an ELF of course...
+class ELFARMAsmBackend : public ARMAsmBackend {
+public:
+  uint8_t OSABI;
+  ELFARMAsmBackend(const Target &T, const StringRef TT,
+                   uint8_t OSABI, bool IsLittle)
+    : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) { }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMELFObjectWriter(OS, OSABI, isLittle());
+  }
+};
+
+// FIXME: This should be in a separate file.
+class DarwinARMAsmBackend : public ARMAsmBackend {
+public:
+  const MachO::CPUSubTypeARM Subtype;
+  DarwinARMAsmBackend(const Target &T, const StringRef TT,
+                      MachO::CPUSubTypeARM st)
+    : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) {
+      HasDataInCodeSupport = true;
+    }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
+                                     MachO::CPU_TYPE_ARM,
+                                     Subtype);
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU,
+                                        bool isLittle) {
+  Triple TheTriple(TT);
+
+  switch (TheTriple.getObjectFormat()) {
+  default: llvm_unreachable("unsupported object format");
+  case Triple::MachO: {
+    MachO::CPUSubTypeARM CS =
+      StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
+      .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
+      .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
+      .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
+      .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
+      .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
+      .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
+      .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
+      .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
+      .Default(MachO::CPU_SUBTYPE_ARM_V7);
+
+    return new DarwinARMAsmBackend(T, TT, CS);
+  }
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
+    return new ARMWinCOFFAsmBackend(T, TT);
+  case Triple::ELF:
+    assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
+    uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+    return new ELFARMAsmBackend(T, TT, OSABI, isLittle);
+  }
+}
+
+MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          StringRef TT, StringRef CPU) {
+  return createARMAsmBackend(T, MRI, TT, CPU, true);
+}
+
+MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          StringRef TT, StringRef CPU) {
+  return createARMAsmBackend(T, MRI, TT, CPU, false);
+}
+
+MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          StringRef TT, StringRef CPU) {
+  return createARMAsmBackend(T, MRI, TT, CPU, true);
+}
+
+MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          StringRef TT, StringRef CPU) {
+  return createARMAsmBackend(T, MRI, TT, CPU, false);
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
new file mode 100644
index 0000000..1686d76
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -0,0 +1,464 @@
+//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINFO_H
+#define ARMBASEINFO_H
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
+    EQ,            // Equal                      Equal
+    NE,            // Not equal                  Not equal, or unordered
+    HS,            // Carry set                  >, ==, or unordered
+    LO,            // Carry clear                Less than
+    MI,            // Minus, negative            Less than
+    PL,            // Plus, positive or zero     >, ==, or unordered
+    VS,            // Overflow                   Unordered
+    VC,            // No overflow                Not unordered
+    HI,            // Unsigned higher            Greater than, or unordered
+    LS,            // Unsigned lower or same     Less than or equal
+    GE,            // Greater than or equal      Greater than or equal
+    LT,            // Less than                  Less than, or unordered
+    GT,            // Greater than               Greater than
+    LE,            // Less than or equal         <, ==, or unordered
+    AL             // Always (unconditional)     Always (unconditional)
+  };
+
+  inline static CondCodes getOppositeCondition(CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+} // namespace ARMCC
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+  llvm_unreachable("Unknown condition code");
+}
+
+namespace ARM_PROC {
+  enum IMod {
+    IE = 2,
+    ID = 3
+  };
+
+  enum IFlags {
+    F = 1,
+    I = 2,
+    A = 4
+  };
+
+  inline static const char *IFlagsToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown iflags operand");
+    case F: return "f";
+    case I: return "i";
+    case A: return "a";
+    }
+  }
+
+  inline static const char *IModToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown imod operand");
+    case IE: return "ie";
+    case ID: return "id";
+    }
+  }
+}
+
+namespace ARM_MB {
+  // The Memory Barrier Option constants map directly to the 4-bit encoding of
+  // the option field for memory barrier operations.
+  enum MemBOpt {
+    RESERVED_0 = 0,
+    OSHLD = 1,
+    OSHST = 2,
+    OSH   = 3,
+    RESERVED_4 = 4,
+    NSHLD = 5,
+    NSHST = 6,
+    NSH   = 7,
+    RESERVED_8 = 8,
+    ISHLD = 9,
+    ISHST = 10,
+    ISH   = 11,
+    RESERVED_12 = 12,
+    LD = 13,
+    ST    = 14,
+    SY    = 15
+  };
+
+  inline static const char *MemBOptToString(unsigned val, bool HasV8) {
+    switch (val) {
+    default: llvm_unreachable("Unknown memory operation");
+    case SY:    return "sy";
+    case ST:    return "st";
+    case LD: return HasV8 ? "ld" : "#0xd";
+    case RESERVED_12: return "#0xc";
+    case ISH:   return "ish";
+    case ISHST: return "ishst";
+    case ISHLD: return HasV8 ?  "ishld" : "#0x9";
+    case RESERVED_8: return "#0x8";
+    case NSH:   return "nsh";
+    case NSHST: return "nshst";
+    case NSHLD: return HasV8 ? "nshld" : "#0x5";
+    case RESERVED_4: return "#0x4";
+    case OSH:   return "osh";
+    case OSHST: return "oshst";
+    case OSHLD: return HasV8 ? "oshld" : "#0x1";
+    case RESERVED_0: return "#0x0";
+    }
+  }
+} // namespace ARM_MB
+
+namespace ARM_ISB {
+  enum InstSyncBOpt {
+    RESERVED_0 = 0,
+    RESERVED_1 = 1,
+    RESERVED_2 = 2,
+    RESERVED_3 = 3,
+    RESERVED_4 = 4,
+    RESERVED_5 = 5,
+    RESERVED_6 = 6,
+    RESERVED_7 = 7,
+    RESERVED_8 = 8,
+    RESERVED_9 = 9,
+    RESERVED_10 = 10,
+    RESERVED_11 = 11,
+    RESERVED_12 = 12,
+    RESERVED_13 = 13,
+    RESERVED_14 = 14,
+    SY = 15
+  };
+
+  inline static const char *InstSyncBOptToString(unsigned val) {
+    switch (val) {
+    default:
+      llvm_unreachable("Unknown memory operation");
+      case RESERVED_0:  return "#0x0";
+      case RESERVED_1:  return "#0x1";
+      case RESERVED_2:  return "#0x2";
+      case RESERVED_3:  return "#0x3";
+      case RESERVED_4:  return "#0x4";
+      case RESERVED_5:  return "#0x5";
+      case RESERVED_6:  return "#0x6";
+      case RESERVED_7:  return "#0x7";
+      case RESERVED_8:  return "#0x8";
+      case RESERVED_9:  return "#0x9";
+      case RESERVED_10: return "#0xa";
+      case RESERVED_11: return "#0xb";
+      case RESERVED_12: return "#0xc";
+      case RESERVED_13: return "#0xd";
+      case RESERVED_14: return "#0xe";
+      case SY:          return "sy";
+    }
+  }
+} // namespace ARM_ISB
+
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  case R0:  case R1:  case R2:  case R3:
+  case R4:  case R5:  case R6:  case R7:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+
+  /// ARM Index Modes
+  enum IndexMode {
+    IndexModeNone  = 0,
+    IndexModePre   = 1,
+    IndexModePost  = 2,
+    IndexModeUpd   = 3
+  };
+
+  /// ARM Addressing Modes
+  enum AddrMode {
+    AddrModeNone    = 0,
+    AddrMode1       = 1,
+    AddrMode2       = 2,
+    AddrMode3       = 3,
+    AddrMode4       = 4,
+    AddrMode5       = 5,
+    AddrMode6       = 6,
+    AddrModeT1_1    = 7,
+    AddrModeT1_2    = 8,
+    AddrModeT1_4    = 9,
+    AddrModeT1_s    = 10, // i8 * 4 for pc and sp relative data
+    AddrModeT2_i12  = 11,
+    AddrModeT2_i8   = 12,
+    AddrModeT2_so   = 13,
+    AddrModeT2_pc   = 14, // +/- i12 for pc relative data
+    AddrModeT2_i8s4 = 15, // i8 * 4
+    AddrMode_i12    = 16
+  };
+
+  inline static const char *AddrModeToString(AddrMode addrmode) {
+    switch (addrmode) {
+    case AddrModeNone:    return "AddrModeNone";
+    case AddrMode1:       return "AddrMode1";
+    case AddrMode2:       return "AddrMode2";
+    case AddrMode3:       return "AddrMode3";
+    case AddrMode4:       return "AddrMode4";
+    case AddrMode5:       return "AddrMode5";
+    case AddrMode6:       return "AddrMode6";
+    case AddrModeT1_1:    return "AddrModeT1_1";
+    case AddrModeT1_2:    return "AddrModeT1_2";
+    case AddrModeT1_4:    return "AddrModeT1_4";
+    case AddrModeT1_s:    return "AddrModeT1_s";
+    case AddrModeT2_i12:  return "AddrModeT2_i12";
+    case AddrModeT2_i8:   return "AddrModeT2_i8";
+    case AddrModeT2_so:   return "AddrModeT2_so";
+    case AddrModeT2_pc:   return "AddrModeT2_pc";
+    case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
+    case AddrMode_i12:    return "AddrMode_i12";
+    }
+  }
+
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // ARM Specific MachineOperand flags.
+
+    MO_NO_FLAG = 0,
+
+    /// MO_LO16 - On a symbol operand, this represents a relocation containing
+    /// lower 16 bit of the address. Used only via movw instruction.
+    MO_LO16 = 0x1,
+
+    /// MO_HI16 - On a symbol operand, this represents a relocation containing
+    /// higher 16 bit of the address. Used only via movt instruction.
+    MO_HI16 = 0x2,
+
+    /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
+    /// call operand.
+    MO_PLT = 0x3,
+
+    /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
+    /// just that part of the flag set.
+    MO_OPTION_MASK = 0x3f,
+
+    /// MO_DLLIMPORT - On a symbol operand, this represents that the reference
+    /// to the symbol is for an import stub.  This is used for DLL import
+    /// storage class indication on Windows.
+    MO_DLLIMPORT = 0x40,
+
+    /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
+    /// represents a symbol which, if indirect, will get special Darwin mangling
+    /// as a non-lazy-ptr indirect symbol (i.e. "L_FOO$non_lazy_ptr"). Can be
+    /// combined with MO_LO16, MO_HI16 or MO_NO_FLAG (in a constant-pool, for
+    /// example).
+    MO_NONLAZY = 0x80,
+
+    // It's undefined behaviour if an enum overflows the range between its
+    // smallest and largest values, but since these are |ed together, it can
+    // happen. Put a sentinel in (values of this enum are stored as "unsigned
+    // char").
+    MO_UNUSED_MAXIMUM = 0xff
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction Flags.
+
+    //===------------------------------------------------------------------===//
+    // This four-bit field describes the addressing mode used.
+    AddrModeMask  = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+
+    // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
+    // and store ops only.  Generic "updating" flag is used for ld/st multiple.
+    // The index mode enums are declared in ARMBaseInfo.h
+    IndexModeShift = 5,
+    IndexModeMask  = 3 << IndexModeShift,
+
+    //===------------------------------------------------------------------===//
+    // Instruction encoding formats.
+    //
+    FormShift     = 7,
+    FormMask      = 0x3f << FormShift,
+
+    // Pseudo instructions
+    Pseudo        = 0  << FormShift,
+
+    // Multiply instructions
+    MulFrm        = 1  << FormShift,
+
+    // Branch instructions
+    BrFrm         = 2  << FormShift,
+    BrMiscFrm     = 3  << FormShift,
+
+    // Data Processing instructions
+    DPFrm         = 4  << FormShift,
+    DPSoRegFrm    = 5  << FormShift,
+
+    // Load and Store
+    LdFrm         = 6  << FormShift,
+    StFrm         = 7  << FormShift,
+    LdMiscFrm     = 8  << FormShift,
+    StMiscFrm     = 9  << FormShift,
+    LdStMulFrm    = 10 << FormShift,
+
+    LdStExFrm     = 11 << FormShift,
+
+    // Miscellaneous arithmetic instructions
+    ArithMiscFrm  = 12 << FormShift,
+    SatFrm        = 13 << FormShift,
+
+    // Extend instructions
+    ExtFrm        = 14 << FormShift,
+
+    // VFP formats
+    VFPUnaryFrm   = 15 << FormShift,
+    VFPBinaryFrm  = 16 << FormShift,
+    VFPConv1Frm   = 17 << FormShift,
+    VFPConv2Frm   = 18 << FormShift,
+    VFPConv3Frm   = 19 << FormShift,
+    VFPConv4Frm   = 20 << FormShift,
+    VFPConv5Frm   = 21 << FormShift,
+    VFPLdStFrm    = 22 << FormShift,
+    VFPLdStMulFrm = 23 << FormShift,
+    VFPMiscFrm    = 24 << FormShift,
+
+    // Thumb format
+    ThumbFrm      = 25 << FormShift,
+
+    // Miscelleaneous format
+    MiscFrm       = 26 << FormShift,
+
+    // NEON formats
+    NGetLnFrm     = 27 << FormShift,
+    NSetLnFrm     = 28 << FormShift,
+    NDupFrm       = 29 << FormShift,
+    NLdStFrm      = 30 << FormShift,
+    N1RegModImmFrm= 31 << FormShift,
+    N2RegFrm      = 32 << FormShift,
+    NVCVTFrm      = 33 << FormShift,
+    NVDupLnFrm    = 34 << FormShift,
+    N2RegVShLFrm  = 35 << FormShift,
+    N2RegVShRFrm  = 36 << FormShift,
+    N3RegFrm      = 37 << FormShift,
+    N3RegVShFrm   = 38 << FormShift,
+    NVExtFrm      = 39 << FormShift,
+    NVMulSLFrm    = 40 << FormShift,
+    NVTBLFrm      = 41 << FormShift,
+
+    //===------------------------------------------------------------------===//
+    // Misc flags.
+
+    // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+    // it doesn't have a Rn operand.
+    UnaryDP       = 1 << 13,
+
+    // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+    // a 16-bit Thumb instruction if certain conditions are met.
+    Xform16Bit    = 1 << 14,
+
+    // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb
+    // instruction. Used by the parser to determine whether to require the 'S'
+    // suffix on the mnemonic (when not in an IT block) or preclude it (when
+    // in an IT block).
+    ThumbArithFlagSetting = 1 << 18,
+
+    //===------------------------------------------------------------------===//
+    // Code domain.
+    DomainShift   = 15,
+    DomainMask    = 7 << DomainShift,
+    DomainGeneral = 0 << DomainShift,
+    DomainVFP     = 1 << DomainShift,
+    DomainNEON    = 2 << DomainShift,
+    DomainNEONA8  = 4 << DomainShift,
+
+    //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set field while generating
+    // machine instructions.
+    //
+    // FIXME: This list will need adjusting/fixing as the MC code emitter
+    // takes shape and the ARMCodeEmitter.cpp bits go away.
+    ShiftTypeShift = 4,
+
+    M_BitShift     = 5,
+    ShiftImmShift  = 5,
+    ShiftShift     = 7,
+    N_BitShift     = 7,
+    ImmHiShift     = 8,
+    SoRotImmShift  = 8,
+    RegRsShift     = 8,
+    ExtRotImmShift = 10,
+    RegRdLoShift   = 12,
+    RegRdShift     = 12,
+    RegRdHiShift   = 16,
+    RegRnShift     = 16,
+    S_BitShift     = 20,
+    W_BitShift     = 21,
+    AM3_I_BitShift = 22,
+    D_BitShift     = 22,
+    U_BitShift     = 23,
+    P_BitShift     = 24,
+    I_BitShift     = 25,
+    CondShift      = 28
+  };
+
+} // end namespace ARMII
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
new file mode 100644
index 0000000..a86601b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -0,0 +1,236 @@
+//===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+  class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+    enum { DefaultEABIVersion = 0x05000000U };
+    unsigned GetRelocTypeInner(const MCValue &Target,
+                               const MCFixup &Fixup,
+                               bool IsPCRel) const;
+
+
+  public:
+    ARMELFObjectWriter(uint8_t OSABI);
+
+    virtual ~ARMELFObjectWriter();
+
+    unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsPCRel) const override;
+
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
+  };
+}
+
+ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI,
+                            ELF::EM_ARM,
+                            /*HasRelocationAddend*/ false) {}
+
+ARMELFObjectWriter::~ARMELFObjectWriter() {}
+
+bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                                 unsigned Type) const {
+  // FIXME: This is extremelly conservative. This really needs to use a
+  // whitelist with a clear explanation for why each realocation needs to
+  // point to the symbol, not to the section.
+  switch (Type) {
+  default:
+    return true;
+
+  case ELF::R_ARM_PREL31:
+  case ELF::R_ARM_ABS32:
+    return false;
+  }
+}
+
+// Need to examine the Fixup when determining whether to 
+// emit the relocation as an explicit symbol or as a section relative
+// offset
+unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  return GetRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const  {
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+
+  unsigned Type = 0;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("Unimplemented");
+    case FK_Data_4:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_REL32;
+        break;
+      case MCSymbolRefExpr::VK_TLSGD:
+        llvm_unreachable("unimplemented");
+      case MCSymbolRefExpr::VK_GOTTPOFF:
+        Type = ELF::R_ARM_TLS_IE32;
+        break;
+      case MCSymbolRefExpr::VK_GOTPCREL:
+        Type = ELF::R_ARM_GOT_PREL;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_blx:
+    case ARM::fixup_arm_uncondbl:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_PLT:
+        Type = ELF::R_ARM_PLT32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSCALL:
+        Type = ELF::R_ARM_TLS_CALL;
+        break;
+      default:
+        Type = ELF::R_ARM_CALL;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_condbl:
+    case ARM::fixup_arm_condbranch:
+    case ARM::fixup_arm_uncondbranch:
+      Type = ELF::R_ARM_JUMP24;
+      break;
+    case ARM::fixup_t2_condbranch:
+    case ARM::fixup_t2_uncondbranch:
+      Type = ELF::R_ARM_THM_JUMP24;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+      Type = ELF::R_ARM_MOVT_PREL;
+      break;
+    case ARM::fixup_arm_movw_lo16:
+      Type = ELF::R_ARM_MOVW_PREL_NC;
+      break;
+    case ARM::fixup_t2_movt_hi16:
+      Type = ELF::R_ARM_THM_MOVT_PREL;
+      break;
+    case ARM::fixup_t2_movw_lo16:
+      Type = ELF::R_ARM_THM_MOVW_PREL_NC;
+      break;
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_blx:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_ARM_TLSCALL:
+        Type = ELF::R_ARM_THM_TLS_CALL;
+        break;
+      default:
+        Type = ELF::R_ARM_THM_CALL;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("invalid fixup kind!");
+    case FK_Data_4:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_ARM_NONE:
+        Type = ELF::R_ARM_NONE;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_ARM_GOT_BREL;
+        break;
+      case MCSymbolRefExpr::VK_TLSGD:
+        Type = ELF::R_ARM_TLS_GD32;
+        break;
+      case MCSymbolRefExpr::VK_TPOFF:
+        Type = ELF::R_ARM_TLS_LE32;
+        break;
+      case MCSymbolRefExpr::VK_GOTTPOFF:
+        Type = ELF::R_ARM_TLS_IE32;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS32;
+        break;
+      case MCSymbolRefExpr::VK_GOTOFF:
+        Type = ELF::R_ARM_GOTOFF32;
+        break;
+      case MCSymbolRefExpr::VK_GOTPCREL:
+        Type = ELF::R_ARM_GOT_PREL;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TARGET1:
+        Type = ELF::R_ARM_TARGET1;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TARGET2:
+        Type = ELF::R_ARM_TARGET2;
+        break;
+      case MCSymbolRefExpr::VK_ARM_PREL31:
+        Type = ELF::R_ARM_PREL31;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSLDO:
+        Type = ELF::R_ARM_TLS_LDO32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSCALL:
+        Type = ELF::R_ARM_TLS_CALL;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSDESC:
+        Type = ELF::R_ARM_TLS_GOTDESC;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ:
+        Type = ELF::R_ARM_TLS_DESCSEQ;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_ldst_pcrel_12:
+    case ARM::fixup_arm_pcrel_10:
+    case ARM::fixup_arm_adr_pcrel_12:
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_cb:
+    case ARM::fixup_arm_thumb_cp:
+    case ARM::fixup_arm_thumb_br:
+      llvm_unreachable("Unimplemented");
+    case ARM::fixup_arm_condbranch:
+    case ARM::fixup_arm_uncondbranch:
+      Type = ELF::R_ARM_JUMP24;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+      Type = ELF::R_ARM_MOVT_ABS;
+      break;
+    case ARM::fixup_arm_movw_lo16:
+      Type = ELF::R_ARM_MOVW_ABS_NC;
+      break;
+    case ARM::fixup_t2_movt_hi16:
+      Type = ELF::R_ARM_THM_MOVT_ABS;
+      break;
+    case ARM::fixup_t2_movw_lo16:
+      Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+      break;
+    }
+  }
+
+  return Type;
+}
+
+MCObjectWriter *llvm::createARMELFObjectWriter(raw_ostream &OS,
+                                               uint8_t OSABI,
+                                               bool IsLittleEndian) {
+  MCELFObjectTargetWriter *MOTW = new ARMELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
new file mode 100644
index 0000000..7b5d8b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -0,0 +1,1362 @@
+//===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits ARM ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($a, $t and $d) to
+// delimit regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMArchName.h"
+#include "ARMFPUName.h"
+#include "ARMRegisterInfo.h"
+#include "ARMUnwindOpAsm.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
+  assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+         "Invalid personality index");
+  return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
+}
+
+static const char *GetFPUName(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown FPU kind");
+    break;
+#define ARM_FPU_NAME(NAME, ID) case ARM::ID: return NAME;
+#include "ARMFPUName.def"
+  }
+  return nullptr;
+}
+
+static const char *GetArchName(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown ARCH kind");
+    break;
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+  case ARM::ID: return NAME;
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+  }
+  return nullptr;
+}
+
+static const char *GetArchDefaultCPUName(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown ARCH kind");
+    break;
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+  case ARM::ID: return DEFAULT_CPU_NAME;
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+  }
+  return nullptr;
+}
+
+static unsigned GetArchDefaultCPUArch(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown ARCH kind");
+    break;
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+  case ARM::ID: return ARMBuildAttrs::DEFAULT_CPU_ARCH;
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+  }
+  return 0;
+}
+
+namespace {
+
+class ARMELFStreamer;
+
+class ARMTargetAsmStreamer : public ARMTargetStreamer {
+  formatted_raw_ostream &OS;
+  MCInstPrinter &InstPrinter;
+  bool IsVerboseAsm;
+
+  void emitFnStart() override;
+  void emitFnEnd() override;
+  void emitCantUnwind() override;
+  void emitPersonality(const MCSymbol *Personality) override;
+  void emitPersonalityIndex(unsigned Index) override;
+  void emitHandlerData() override;
+  void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+  void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+  void emitPad(int64_t Offset) override;
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                   bool isVector) override;
+  void emitUnwindRaw(int64_t Offset,
+                     const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+  void switchVendor(StringRef Vendor) override;
+  void emitAttribute(unsigned Attribute, unsigned Value) override;
+  void emitTextAttribute(unsigned Attribute, StringRef String) override;
+  void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+                            StringRef StrinValue) override;
+  void emitArch(unsigned Arch) override;
+  void emitObjectArch(unsigned Arch) override;
+  void emitFPU(unsigned FPU) override;
+  void emitInst(uint32_t Inst, char Suffix = '\0') override;
+  void finishAttributeSection() override;
+
+  void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+  void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
+
+public:
+  ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                       MCInstPrinter &InstPrinter, bool VerboseAsm);
+};
+
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
+                                           formatted_raw_ostream &OS,
+                                           MCInstPrinter &InstPrinter,
+                                           bool VerboseAsm)
+    : ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
+      IsVerboseAsm(VerboseAsm) {}
+void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
+void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
+void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
+  OS << "\t.personality " << Personality->getName() << '\n';
+}
+void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) {
+  OS << "\t.personalityindex " << Index << '\n';
+}
+void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  OS << "\t.setfp\t";
+  InstPrinter.printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter.printRegName(OS, SpReg);
+  if (Offset)
+    OS << ", #" << Offset;
+  OS << '\n';
+}
+void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+  assert((Reg != ARM::SP && Reg != ARM::PC) &&
+         "the operand of .movsp cannot be either sp or pc");
+
+  OS << "\t.movsp\t";
+  InstPrinter.printRegName(OS, Reg);
+  if (Offset)
+    OS << ", #" << Offset;
+  OS << '\n';
+}
+void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
+  OS << "\t.pad\t#" << Offset << '\n';
+}
+void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  assert(RegList.size() && "RegList should not be empty");
+  if (isVector)
+    OS << "\t.vsave\t{";
+  else
+    OS << "\t.save\t{";
+
+  InstPrinter.printRegName(OS, RegList[0]);
+
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter.printRegName(OS, RegList[i]);
+  }
+
+  OS << "}\n";
+}
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
+}
+void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
+  if (IsVerboseAsm) {
+    StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+    if (!Name.empty())
+      OS << "\t@ " << Name;
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef String) {
+  switch (Attribute) {
+  case ARMBuildAttrs::CPU_name:
+    OS << "\t.cpu\t" << String.lower();
+    break;
+  default:
+    OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
+    if (IsVerboseAsm) {
+      StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+      if (!Name.empty())
+        OS << "\t@ " << Name;
+    }
+    break;
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
+                                                unsigned IntValue,
+                                                StringRef StringValue) {
+  switch (Attribute) {
+  default: llvm_unreachable("unsupported multi-value attribute in asm mode");
+  case ARMBuildAttrs::compatibility:
+    OS << "\t.eabi_attribute\t" << Attribute << ", " << IntValue;
+    if (!StringValue.empty())
+      OS << ", \"" << StringValue << "\"";
+    if (IsVerboseAsm)
+      OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute);
+    break;
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
+  OS << "\t.arch\t" << GetArchName(Arch) << "\n";
+}
+void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
+  OS << "\t.object_arch\t" << GetArchName(Arch) << '\n';
+}
+void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
+  OS << "\t.fpu\t" << GetFPUName(FPU) << "\n";
+}
+void ARMTargetAsmStreamer::finishAttributeSection() {
+}
+void
+ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+  OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
+}
+
+void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  OS << "\t.thumb_set\t" << *Symbol << ", " << *Value << '\n';
+}
+
+void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
+  OS << "\t.inst";
+  if (Suffix)
+    OS << "." << Suffix;
+  OS << "\t0x" << utohexstr(Inst) << "\n";
+}
+
+void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+  OS << "\t.unwind_raw " << Offset;
+  for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(),
+                                                OCE = Opcodes.end();
+       OCI != OCE; ++OCI)
+    OS << ", 0x" << utohexstr(*OCI);
+  OS << '\n';
+}
+
+class ARMTargetELFStreamer : public ARMTargetStreamer {
+private:
+  // This structure holds all attributes, accounting for
+  // their string/numeric value, so we can later emmit them
+  // in declaration order, keeping all in the same vector
+  struct AttributeItem {
+    enum {
+      HiddenAttribute = 0,
+      NumericAttribute,
+      TextAttribute,
+      NumericAndTextAttributes
+    } Type;
+    unsigned Tag;
+    unsigned IntValue;
+    StringRef StringValue;
+
+    static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
+      return (LHS.Tag < RHS.Tag);
+    }
+  };
+
+  StringRef CurrentVendor;
+  unsigned FPU;
+  unsigned Arch;
+  unsigned EmittedArch;
+  SmallVector<AttributeItem, 64> Contents;
+
+  const MCSection *AttributeSection;
+
+  AttributeItem *getAttributeItem(unsigned Attribute) {
+    for (size_t i = 0; i < Contents.size(); ++i)
+      if (Contents[i].Tag == Attribute)
+        return &Contents[i];
+    return nullptr;
+  }
+
+  void setAttributeItem(unsigned Attribute, unsigned Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeItem::NumericAttribute;
+      Item->IntValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::NumericAttribute,
+      Attribute,
+      Value,
+      StringRef("")
+    };
+    Contents.push_back(Item);
+  }
+
+  void setAttributeItem(unsigned Attribute, StringRef Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeItem::TextAttribute;
+      Item->StringValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::TextAttribute,
+      Attribute,
+      0,
+      Value
+    };
+    Contents.push_back(Item);
+  }
+
+  void setAttributeItems(unsigned Attribute, unsigned IntValue,
+                         StringRef StringValue, bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeItem::NumericAndTextAttributes;
+      Item->IntValue = IntValue;
+      Item->StringValue = StringValue;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::NumericAndTextAttributes,
+      Attribute,
+      IntValue,
+      StringValue
+    };
+    Contents.push_back(Item);
+  }
+
+  void emitArchDefaultAttributes();
+  void emitFPUDefaultAttributes();
+
+  ARMELFStreamer &getStreamer();
+
+  void emitFnStart() override;
+  void emitFnEnd() override;
+  void emitCantUnwind() override;
+  void emitPersonality(const MCSymbol *Personality) override;
+  void emitPersonalityIndex(unsigned Index) override;
+  void emitHandlerData() override;
+  void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+  void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+  void emitPad(int64_t Offset) override;
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                   bool isVector) override;
+  void emitUnwindRaw(int64_t Offset,
+                     const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+  void switchVendor(StringRef Vendor) override;
+  void emitAttribute(unsigned Attribute, unsigned Value) override;
+  void emitTextAttribute(unsigned Attribute, StringRef String) override;
+  void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+                            StringRef StringValue) override;
+  void emitArch(unsigned Arch) override;
+  void emitObjectArch(unsigned Arch) override;
+  void emitFPU(unsigned FPU) override;
+  void emitInst(uint32_t Inst, char Suffix = '\0') override;
+  void finishAttributeSection() override;
+  void emitLabel(MCSymbol *Symbol) override;
+
+  void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+  void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
+
+  size_t calculateContentSize() const;
+
+public:
+  ARMTargetELFStreamer(MCStreamer &S)
+    : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
+      Arch(ARM::INVALID_ARCH), EmittedArch(ARM::INVALID_ARCH),
+      AttributeSection(nullptr) {}
+};
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
+///
+/// In brief: $a, $t or $d should be emitted at the start of each contiguous
+/// region of ARM code, Thumb code or data in a section. In practice, this
+/// emission does not rely on explicit assembler directives but on inherent
+/// properties of the directives doing the emission (e.g. ".byte" is data, "add
+/// r0, r0, r0" an instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class ARMELFStreamer : public MCELFStreamer {
+public:
+  friend class ARMTargetELFStreamer;
+
+  ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                 MCCodeEmitter *Emitter, bool IsThumb)
+      : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb),
+        MappingSymbolCounter(0), LastEMS(EMS_None) {
+    Reset();
+  }
+
+  ~ARMELFStreamer() {}
+
+  void FinishImpl() override;
+
+  // ARM exception handling directives
+  void emitFnStart();
+  void emitFnEnd();
+  void emitCantUnwind();
+  void emitPersonality(const MCSymbol *Per);
+  void emitPersonalityIndex(unsigned index);
+  void emitHandlerData();
+  void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+  void emitMovSP(unsigned Reg, int64_t Offset = 0);
+  void emitPad(int64_t Offset);
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
+  void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
+
+  void ChangeSection(const MCSection *Section,
+                     const MCExpr *Subsection) override {
+    // We have to keep track of the mapping symbol state of any sections we
+    // use. Each one should start off as EMS_None, which is provided as the
+    // default constructor by DenseMap::lookup.
+    LastMappingSymbols[getPreviousSection().first] = LastEMS;
+    LastEMS = LastMappingSymbols.lookup(Section);
+
+    MCELFStreamer::ChangeSection(Section, Subsection);
+  }
+
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer. We override it to add the appropriate mapping symbol if
+  /// necessary.
+  void EmitInstruction(const MCInst& Inst,
+                       const MCSubtargetInfo &STI) override {
+    if (IsThumb)
+      EmitThumbMappingSymbol();
+    else
+      EmitARMMappingSymbol();
+
+    MCELFStreamer::EmitInstruction(Inst, STI);
+  }
+
+  void emitInst(uint32_t Inst, char Suffix) {
+    unsigned Size;
+    char Buffer[4];
+    const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+    switch (Suffix) {
+    case '\0':
+      Size = 4;
+
+      assert(!IsThumb);
+      EmitARMMappingSymbol();
+      for (unsigned II = 0, IE = Size; II != IE; II++) {
+        const unsigned I = LittleEndian ? (Size - II - 1) : II;
+        Buffer[Size - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+      }
+
+      break;
+    case 'n':
+    case 'w':
+      Size = (Suffix == 'n' ? 2 : 4);
+
+      assert(IsThumb);
+      EmitThumbMappingSymbol();
+      for (unsigned II = 0, IE = Size; II != IE; II = II + 2) {
+        const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1);
+        const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2);
+        Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
+        Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
+      }
+
+      break;
+    default:
+      llvm_unreachable("Invalid Suffix");
+    }
+
+    MCELFStreamer::EmitBytes(StringRef(Buffer, Size));
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+  /// necessary.
+  void EmitBytes(StringRef Data) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitBytes(Data);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+  /// necessary.
+  void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                     const SMLoc &Loc) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitValueImpl(Value, Size);
+  }
+
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
+    MCELFStreamer::EmitAssemblerFlag(Flag);
+
+    switch (Flag) {
+    case MCAF_SyntaxUnified:
+      return; // no-op here.
+    case MCAF_Code16:
+      IsThumb = true;
+      return; // Change to Thumb mode
+    case MCAF_Code32:
+      IsThumb = false;
+      return; // Change to ARM mode
+    case MCAF_Code64:
+      return;
+    case MCAF_SubsectionsViaSymbols:
+      return;
+    }
+  }
+
+private:
+  enum ElfMappingSymbol {
+    EMS_None,
+    EMS_ARM,
+    EMS_Thumb,
+    EMS_Data
+  };
+
+  void EmitDataMappingSymbol() {
+    if (LastEMS == EMS_Data) return;
+    EmitMappingSymbol("$d");
+    LastEMS = EMS_Data;
+  }
+
+  void EmitThumbMappingSymbol() {
+    if (LastEMS == EMS_Thumb) return;
+    EmitMappingSymbol("$t");
+    LastEMS = EMS_Thumb;
+  }
+
+  void EmitARMMappingSymbol() {
+    if (LastEMS == EMS_ARM) return;
+    EmitMappingSymbol("$a");
+    LastEMS = EMS_ARM;
+  }
+
+  void EmitMappingSymbol(StringRef Name) {
+    MCSymbol *Start = getContext().CreateTempSymbol();
+    EmitLabel(Start);
+
+    MCSymbol *Symbol =
+      getContext().GetOrCreateSymbol(Name + "." +
+                                     Twine(MappingSymbolCounter++));
+
+    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+    MCELF::SetType(SD, ELF::STT_NOTYPE);
+    MCELF::SetBinding(SD, ELF::STB_LOCAL);
+    SD.setExternal(false);
+    AssignSection(Symbol, getCurrentSection().first);
+
+    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+    Symbol->setVariableValue(Value);
+  }
+
+  void EmitThumbFunc(MCSymbol *Func) override {
+    getAssembler().setIsThumbFunc(Func);
+    EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
+  }
+
+  // Helper functions for ARM exception handling directives
+  void Reset();
+
+  void EmitPersonalityFixup(StringRef Name);
+  void FlushPendingOffset();
+  void FlushUnwindOpcodes(bool NoHandlerData);
+
+  void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags,
+                         SectionKind Kind, const MCSymbol &Fn);
+  void SwitchToExTabSection(const MCSymbol &FnStart);
+  void SwitchToExIdxSection(const MCSymbol &FnStart);
+
+  void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
+
+  bool IsThumb;
+  int64_t MappingSymbolCounter;
+
+  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+  ElfMappingSymbol LastEMS;
+
+  // ARM Exception Handling Frame Information
+  MCSymbol *ExTab;
+  MCSymbol *FnStart;
+  const MCSymbol *Personality;
+  unsigned PersonalityIndex;
+  unsigned FPReg; // Frame pointer register
+  int64_t FPOffset; // Offset: (final frame pointer) - (initial $sp)
+  int64_t SPOffset; // Offset: (final $sp) - (initial $sp)
+  int64_t PendingOffset; // Offset: (final $sp) - (emitted $sp)
+  bool UsedFP;
+  bool CantUnwind;
+  SmallVector<uint8_t, 64> Opcodes;
+  UnwindOpcodeAssembler UnwindOpAsm;
+};
+} // end anonymous namespace
+
+ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
+  return static_cast<ARMELFStreamer &>(Streamer);
+}
+
+void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
+void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
+void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
+  getStreamer().emitPersonality(Personality);
+}
+void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) {
+  getStreamer().emitPersonalityIndex(Index);
+}
+void ARMTargetELFStreamer::emitHandlerData() {
+  getStreamer().emitHandlerData();
+}
+void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  getStreamer().emitSetFP(FpReg, SpReg, Offset);
+}
+void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+  getStreamer().emitMovSP(Reg, Offset);
+}
+void ARMTargetELFStreamer::emitPad(int64_t Offset) {
+  getStreamer().emitPad(Offset);
+}
+void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  getStreamer().emitRegSave(RegList, isVector);
+}
+void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+  getStreamer().emitUnwindRaw(Offset, Opcodes);
+}
+void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
+  assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+  if (CurrentVendor == Vendor)
+    return;
+
+  if (!CurrentVendor.empty())
+    finishAttributeSection();
+
+  assert(Contents.empty() &&
+         ".ARM.attributes should be flushed before changing vendor");
+  CurrentVendor = Vendor;
+
+}
+void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
+                                                unsigned IntValue,
+                                                StringRef StringValue) {
+  setAttributeItems(Attribute, IntValue, StringValue,
+                    /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitArch(unsigned Value) {
+  Arch = Value;
+}
+void ARMTargetELFStreamer::emitObjectArch(unsigned Value) {
+  EmittedArch = Value;
+}
+void ARMTargetELFStreamer::emitArchDefaultAttributes() {
+  using namespace ARMBuildAttrs;
+
+  setAttributeItem(CPU_name, GetArchDefaultCPUName(Arch), false);
+  if (EmittedArch == ARM::INVALID_ARCH)
+    setAttributeItem(CPU_arch, GetArchDefaultCPUArch(Arch), false);
+  else
+    setAttributeItem(CPU_arch, GetArchDefaultCPUArch(EmittedArch), false);
+
+  switch (Arch) {
+  case ARM::ARMV2:
+  case ARM::ARMV2A:
+  case ARM::ARMV3:
+  case ARM::ARMV3M:
+  case ARM::ARMV4:
+  case ARM::ARMV5:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    break;
+
+  case ARM::ARMV4T:
+  case ARM::ARMV5T:
+  case ARM::ARMV5TE:
+  case ARM::ARMV6:
+  case ARM::ARMV6J:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    break;
+
+  case ARM::ARMV6T2:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::ARMV6Z:
+  case ARM::ARMV6ZK:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    setAttributeItem(Virtualization_use, AllowTZ, false);
+    break;
+
+  case ARM::ARMV6M:
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    break;
+
+  case ARM::ARMV7:
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::ARMV7A:
+    setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::ARMV7R:
+    setAttributeItem(CPU_arch_profile, RealTimeProfile, false);
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::ARMV7M:
+    setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::ARMV8A:
+    setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    setAttributeItem(MPextension_use, Allowed, false);
+    setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
+    break;
+
+  case ARM::IWMMXT:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    setAttributeItem(WMMX_arch, AllowWMMXv1, false);
+    break;
+
+  case ARM::IWMMXT2:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    setAttributeItem(WMMX_arch, AllowWMMXv2, false);
+    break;
+
+  default:
+    report_fatal_error("Unknown Arch: " + Twine(Arch));
+    break;
+  }
+}
+void ARMTargetELFStreamer::emitFPU(unsigned Value) {
+  FPU = Value;
+}
+void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
+  switch (FPU) {
+  case ARM::VFP:
+  case ARM::VFPV2:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV3:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV3_D16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV4:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV4_D16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv4B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON_VFPV4:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON_FP_ARMV8:
+  case ARM::CRYPTO_NEON_FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeonARMv8,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::SOFTVFP:
+    break;
+
+  default:
+    report_fatal_error("Unknown FPU: " + Twine(FPU));
+    break;
+  }
+}
+size_t ARMTargetELFStreamer::calculateContentSize() const {
+  size_t Result = 0;
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    switch (item.Type) {
+    case AttributeItem::HiddenAttribute:
+      break;
+    case AttributeItem::NumericAttribute:
+      Result += getULEB128Size(item.Tag);
+      Result += getULEB128Size(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Result += getULEB128Size(item.Tag);
+      Result += item.StringValue.size() + 1; // string + '\0'
+      break;
+    case AttributeItem::NumericAndTextAttributes:
+      Result += getULEB128Size(item.Tag);
+      Result += getULEB128Size(item.IntValue);
+      Result += item.StringValue.size() + 1; // string + '\0';
+      break;
+    }
+  }
+  return Result;
+}
+void ARMTargetELFStreamer::finishAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  if (FPU != ARM::INVALID_FPU)
+    emitFPUDefaultAttributes();
+
+  if (Arch != ARM::INVALID_ARCH)
+    emitArchDefaultAttributes();
+
+  if (Contents.empty())
+    return;
+
+  std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+
+  ARMELFStreamer &Streamer = getStreamer();
+
+  // Switch to .ARM.attributes section
+  if (AttributeSection) {
+    Streamer.SwitchSection(AttributeSection);
+  } else {
+    AttributeSection =
+      Streamer.getContext().getELFSection(".ARM.attributes",
+                                          ELF::SHT_ARM_ATTRIBUTES,
+                                          0,
+                                          SectionKind::getMetadata());
+    Streamer.SwitchSection(AttributeSection);
+
+    // Format version
+    Streamer.EmitIntValue(0x41, 1);
+  }
+
+  // Vendor size + Vendor name + '\0'
+  const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+  // Tag + Tag Size
+  const size_t TagHeaderSize = 1 + 4;
+
+  const size_t ContentsSize = calculateContentSize();
+
+  Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+  Streamer.EmitBytes(CurrentVendor);
+  Streamer.EmitIntValue(0, 1); // '\0'
+
+  Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+  Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+  // Size should have been accounted for already, now
+  // emit each field as its type (ULEB or String)
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    Streamer.EmitULEB128IntValue(item.Tag);
+    switch (item.Type) {
+    default: llvm_unreachable("Invalid attribute type");
+    case AttributeItem::NumericAttribute:
+      Streamer.EmitULEB128IntValue(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Streamer.EmitBytes(item.StringValue.upper());
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    case AttributeItem::NumericAndTextAttributes:
+      Streamer.EmitULEB128IntValue(item.IntValue);
+      Streamer.EmitBytes(item.StringValue.upper());
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    }
+  }
+
+  Contents.clear();
+  FPU = ARM::INVALID_FPU;
+}
+
+void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+  ARMELFStreamer &Streamer = getStreamer();
+  if (!Streamer.IsThumb)
+    return;
+
+  const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol);
+  unsigned Type = MCELF::GetType(SD);
+  if (Type == ELF_STT_Func || Type == ELF_STT_GnuIFunc)
+    Streamer.EmitThumbFunc(Symbol);
+}
+
+void
+ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+  getStreamer().EmitFixup(S, FK_Data_4);
+}
+
+void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
+    const MCSymbol &Sym = SRE->getSymbol();
+    if (!Sym.isDefined()) {
+      getStreamer().EmitAssignment(Symbol, Value);
+      return;
+    }
+  }
+
+  getStreamer().EmitThumbFunc(Symbol);
+  getStreamer().EmitAssignment(Symbol, Value);
+}
+
+void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
+  getStreamer().emitInst(Inst, Suffix);
+}
+
+void ARMELFStreamer::FinishImpl() {
+  MCTargetStreamer &TS = *getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  ATS.finishAttributeSection();
+
+  MCELFStreamer::FinishImpl();
+}
+
+inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
+                                              unsigned Type,
+                                              unsigned Flags,
+                                              SectionKind Kind,
+                                              const MCSymbol &Fn) {
+  const MCSectionELF &FnSection =
+    static_cast<const MCSectionELF &>(Fn.getSection());
+
+  // Create the name for new section
+  StringRef FnSecName(FnSection.getSectionName());
+  SmallString<128> EHSecName(Prefix);
+  if (FnSecName != ".text") {
+    EHSecName += FnSecName;
+  }
+
+  // Get .ARM.extab or .ARM.exidx section
+  const MCSectionELF *EHSection = nullptr;
+  if (const MCSymbol *Group = FnSection.getGroup()) {
+    EHSection = getContext().getELFSection(
+      EHSecName, Type, Flags | ELF::SHF_GROUP, Kind,
+      FnSection.getEntrySize(), Group->getName());
+  } else {
+    EHSection = getContext().getELFSection(EHSecName, Type, Flags, Kind);
+  }
+  assert(EHSection && "Failed to get the required EH section");
+
+  // Switch to .ARM.extab or .ARM.exidx section
+  SwitchSection(EHSection);
+  EmitCodeAlignment(4);
+}
+
+inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
+  SwitchToEHSection(".ARM.extab",
+                    ELF::SHT_PROGBITS,
+                    ELF::SHF_ALLOC,
+                    SectionKind::getDataRel(),
+                    FnStart);
+}
+
+inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
+  SwitchToEHSection(".ARM.exidx",
+                    ELF::SHT_ARM_EXIDX,
+                    ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
+                    SectionKind::getDataRel(),
+                    FnStart);
+}
+void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
+  MCDataFragment *Frag = getOrCreateDataFragment();
+  Frag->getFixups().push_back(MCFixup::Create(Frag->getContents().size(), Expr,
+                                              Kind));
+}
+
+void ARMELFStreamer::Reset() {
+  ExTab = nullptr;
+  FnStart = nullptr;
+  Personality = nullptr;
+  PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
+  FPReg = ARM::SP;
+  FPOffset = 0;
+  SPOffset = 0;
+  PendingOffset = 0;
+  UsedFP = false;
+  CantUnwind = false;
+
+  Opcodes.clear();
+  UnwindOpAsm.Reset();
+}
+
+void ARMELFStreamer::emitFnStart() {
+  assert(FnStart == nullptr);
+  FnStart = getContext().CreateTempSymbol();
+  EmitLabel(FnStart);
+}
+
+void ARMELFStreamer::emitFnEnd() {
+  assert(FnStart && ".fnstart must precedes .fnend");
+
+  // Emit unwind opcodes if there is no .handlerdata directive
+  if (!ExTab && !CantUnwind)
+    FlushUnwindOpcodes(true);
+
+  // Emit the exception index table entry
+  SwitchToExIdxSection(*FnStart);
+
+  if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX)
+    EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
+
+  const MCSymbolRefExpr *FnStartRef =
+    MCSymbolRefExpr::Create(FnStart,
+                            MCSymbolRefExpr::VK_ARM_PREL31,
+                            getContext());
+
+  EmitValue(FnStartRef, 4);
+
+  if (CantUnwind) {
+    EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4);
+  } else if (ExTab) {
+    // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
+    const MCSymbolRefExpr *ExTabEntryRef =
+      MCSymbolRefExpr::Create(ExTab,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+    EmitValue(ExTabEntryRef, 4);
+  } else {
+    // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
+    // the second word of exception index table entry.  The size of the unwind
+    // opcodes should always be 4 bytes.
+    assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 &&
+           "Compact model must use __aeabi_unwind_cpp_pr0 as personality");
+    assert(Opcodes.size() == 4u &&
+           "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be equal to 4");
+    uint64_t Intval = Opcodes[0] |
+                      Opcodes[1] << 8 |
+                      Opcodes[2] << 16 |
+                      Opcodes[3] << 24;
+    EmitIntValue(Intval, Opcodes.size());
+  }
+
+  // Switch to the section containing FnStart
+  SwitchSection(&FnStart->getSection());
+
+  // Clean exception handling frame information
+  Reset();
+}
+
+void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
+
+// Add the R_ARM_NONE fixup at the same position
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+  const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
+
+  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
+      PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
+
+  visitUsedExpr(*PersonalityRef);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(),
+                                            PersonalityRef,
+                                            MCFixup::getKindForSize(4, false)));
+}
+
+void ARMELFStreamer::FlushPendingOffset() {
+  if (PendingOffset != 0) {
+    UnwindOpAsm.EmitSPOffset(-PendingOffset);
+    PendingOffset = 0;
+  }
+}
+
+void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
+  // Emit the unwind opcode to restore $sp.
+  if (UsedFP) {
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    int64_t LastRegSaveSPOffset = SPOffset - PendingOffset;
+    UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset);
+    UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+  } else {
+    FlushPendingOffset();
+  }
+
+  // Finalize the unwind opcode sequence
+  UnwindOpAsm.Finalize(PersonalityIndex, Opcodes);
+
+  // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
+  // section.  Thus, we don't have to create an entry in the .ARM.extab
+  // section.
+  if (NoHandlerData && PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0)
+    return;
+
+  // Switch to .ARM.extab section.
+  SwitchToExTabSection(*FnStart);
+
+  // Create .ARM.extab label for offset in .ARM.exidx
+  assert(!ExTab);
+  ExTab = getContext().CreateTempSymbol();
+  EmitLabel(ExTab);
+
+  // Emit personality
+  if (Personality) {
+    const MCSymbolRefExpr *PersonalityRef =
+      MCSymbolRefExpr::Create(Personality,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+
+    EmitValue(PersonalityRef, 4);
+  }
+
+  // Emit unwind opcodes
+  assert((Opcodes.size() % 4) == 0 &&
+         "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be multiple of 4");
+  for (unsigned I = 0; I != Opcodes.size(); I += 4) {
+    uint64_t Intval = Opcodes[I] |
+                      Opcodes[I + 1] << 8 |
+                      Opcodes[I + 2] << 16 |
+                      Opcodes[I + 3] << 24;
+    EmitIntValue(Intval, 4);
+  }
+
+  // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
+  // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted
+  // after the unwind opcodes.  The handler data consists of several 32-bit
+  // words, and should be terminated by zero.
+  //
+  // In case that the .handlerdata directive is not specified by the
+  // programmer, we should emit zero to terminate the handler data.
+  if (NoHandlerData && !Personality)
+    EmitIntValue(0, 4);
+}
+
+void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
+
+void ARMELFStreamer::emitPersonality(const MCSymbol *Per) {
+  Personality = Per;
+  UnwindOpAsm.setPersonality(Per);
+}
+
+void ARMELFStreamer::emitPersonalityIndex(unsigned Index) {
+  assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX && "invalid index");
+  PersonalityIndex = Index;
+}
+
+void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
+                               int64_t Offset) {
+  assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
+         "the operand of .setfp directive should be either $sp or $fp");
+
+  UsedFP = true;
+  FPReg = NewFPReg;
+
+  if (NewSPReg == ARM::SP)
+    FPOffset = SPOffset + Offset;
+  else
+    FPOffset += Offset;
+}
+
+void ARMELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+  assert((Reg != ARM::SP && Reg != ARM::PC) &&
+         "the operand of .movsp cannot be either sp or pc");
+  assert(FPReg == ARM::SP && "current FP must be SP");
+
+  FlushPendingOffset();
+
+  FPReg = Reg;
+  FPOffset = SPOffset + Offset;
+
+  const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+  UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+}
+
+void ARMELFStreamer::emitPad(int64_t Offset) {
+  // Track the change of the $sp offset
+  SPOffset -= Offset;
+
+  // To squash multiple .pad directives, we should delay the unwind opcode
+  // until the .save, .vsave, .handlerdata, or .fnend directives.
+  PendingOffset -= Offset;
+}
+
+void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                 bool IsVector) {
+  // Collect the registers in the register list
+  unsigned Count = 0;
+  uint32_t Mask = 0;
+  const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+  for (size_t i = 0; i < RegList.size(); ++i) {
+    unsigned Reg = MRI->getEncodingValue(RegList[i]);
+    assert(Reg < (IsVector ? 32U : 16U) && "Register out of range");
+    unsigned Bit = (1u << Reg);
+    if ((Mask & Bit) == 0) {
+      Mask |= Bit;
+      ++Count;
+    }
+  }
+
+  // Track the change the $sp offset: For the .save directive, the
+  // corresponding push instruction will decrease the $sp by (4 * Count).
+  // For the .vsave directive, the corresponding vpush instruction will
+  // decrease $sp by (8 * Count).
+  SPOffset -= Count * (IsVector ? 8 : 4);
+
+  // Emit the opcode
+  FlushPendingOffset();
+  if (IsVector)
+    UnwindOpAsm.EmitVFPRegSave(Mask);
+  else
+    UnwindOpAsm.EmitRegSave(Mask);
+}
+
+void ARMELFStreamer::emitUnwindRaw(int64_t Offset,
+                                   const SmallVectorImpl<uint8_t> &Opcodes) {
+  FlushPendingOffset();
+  SPOffset = SPOffset - Offset;
+  UnwindOpAsm.EmitRaw(Opcodes);
+}
+
+namespace llvm {
+
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                bool isVerboseAsm, bool useDwarfDirectory,
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst) {
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+  new ARMTargetAsmStreamer(*S, OS, *InstPrint, isVerboseAsm);
+  return S;
+}
+
+MCStreamer *createARMNullStreamer(MCContext &Ctx) {
+  MCStreamer *S = llvm::createNullStreamer(Ctx);
+  new ARMTargetStreamer(*S);
+  return S;
+}
+
+  MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                      raw_ostream &OS, MCCodeEmitter *Emitter,
+                                      bool RelaxAll, bool NoExecStack,
+                                      bool IsThumb) {
+    ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+    new ARMTargetELFStreamer(*S);
+    // FIXME: This should eventually end up somewhere else where more
+    // intelligent flag decisions can be made. For now we are just maintaining
+    // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+    S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+
+    if (RelaxAll)
+      S->getAssembler().setRelaxAll(true);
+    if (NoExecStack)
+      S->getAssembler().setNoExecStack(true);
+    return S;
+  }
+
+}
+
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
new file mode 100644
index 0000000..bfd9e33
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -0,0 +1,110 @@
+//===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM_ARMFIXUPKINDS_H
+#define LLVM_ARM_ARMFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM {
+enum Fixups {
+  // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
+  // addresses
+  fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
+
+  // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
+  // the 16-bit halfwords reordered.
+  fixup_t2_ldst_pcrel_12,
+
+  // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol
+  // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
+  fixup_arm_pcrel_10_unscaled,
+  // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
+  // used in VFP instructions where the lower 2 bits are not encoded
+  // (so it's encoded as an 8-bit immediate).
+  fixup_arm_pcrel_10,
+  // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
+  // the short-swapped encoding of Thumb2 instructions.
+  fixup_t2_pcrel_10,
+  // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
+  // addresses where the lower 2 bits are not encoded (so it's encoded as an
+  // 8-bit immediate).
+  fixup_thumb_adr_pcrel_10,
+  // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_arm_adr_pcrel_12,
+  // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_t2_adr_pcrel_12,
+  // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
+  // instructions. 
+  fixup_arm_condbranch,
+  // fixup_arm_uncondbranch - 24-bit PC relative relocation for 
+  // branch instructions. (unconditional)
+  fixup_arm_uncondbranch,
+  // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
+  // uconditional branch instructions.
+  fixup_t2_condbranch,
+  // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
+  // branch unconditional branch instructions.
+  fixup_t2_uncondbranch,
+
+  // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+  fixup_arm_thumb_br,
+
+  // The following fixups handle the ARM BL instructions. These can be
+  // conditionalised; however, the ARM ELF ABI requires a different relocation
+  // in that case: R_ARM_JUMP24 instead of R_ARM_CALL. The difference is that
+  // R_ARM_CALL is allowed to change the instruction to a BLX inline, which has
+  // no conditional version; R_ARM_JUMP24 would have to insert a veneer.
+  //
+  // MachO does not draw a distinction between the two cases, so it will treat
+  // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups.
+
+  // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+  fixup_arm_uncondbl,
+
+  // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial
+  // conditionalisation.
+  fixup_arm_condbl,
+
+  // fixup_arm_blx - Fixup for ARM BLX instructions.
+  fixup_arm_blx,
+
+  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+  fixup_arm_thumb_bl,
+
+  // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+  fixup_arm_thumb_blx,
+
+  // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+  fixup_arm_thumb_cb,
+
+  // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+  fixup_arm_thumb_cp,
+
+  // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+  fixup_arm_thumb_bcc,
+
+  // The next two are for the movt/movw pair
+  // the 16bit imm field are split into imm{15-12} and imm{11-0}
+  fixup_arm_movt_hi16, // :upper16:
+  fixup_arm_movw_lo16, // :lower16:
+  fixup_t2_movt_hi16, // :upper16:
+  fixup_t2_movw_lo16, // :lower16:
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
new file mode 100644
index 0000000..7a19208
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -0,0 +1,114 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void ARMMCAsmInfoDarwin::anchor() { }
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(StringRef TT) {
+  Triple TheTriple(TT);
+  if ((TheTriple.getArch() == Triple::armeb) ||
+      (TheTriple.getArch() == Triple::thumbeb))
+    IsLittleEndian = false;
+
+  Data64bitsDirective = nullptr;
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  UseDataRegionDirectives = true;
+
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::SjLj;
+
+  UseIntegratedAssembler = true;
+}
+
+void ARMELFMCAsmInfo::anchor() { }
+
+ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  if ((TheTriple.getArch() == Triple::armeb) ||
+      (TheTriple.getArch() == Triple::thumbeb))
+    IsLittleEndian = false;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  Data64bitsDirective = nullptr;
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  switch (TheTriple.getOS()) {
+  case Triple::NetBSD:
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+    break;
+  default:
+    ExceptionsType = ExceptionHandling::ARM;
+    break;
+  }
+
+  // foo(plt) instead of foo@plt
+  UseParensForSymbolVariant = true;
+
+  UseIntegratedAssembler = true;
+}
+
+void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) {
+  UseIntegratedAssembler = Value;
+  if (!UseIntegratedAssembler) {
+    // gas doesn't handle VFP register names in cfi directives,
+    // so don't use register names with external assembler.
+    // See https://sourceware.org/bugzilla/show_bug.cgi?id=16694
+    DwarfRegNumForCFI = true;
+  }
+}
+
+void ARMCOFFMCAsmInfoMicrosoft::anchor() { }
+
+ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
+  AlignmentIsInBytes = false;
+
+  PrivateGlobalPrefix = "$M";
+}
+
+void ARMCOFFMCAsmInfoGNU::anchor() { }
+
+ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
+  AlignmentIsInBytes = false;
+  HasSingleParameterDotFile = true;
+
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  PrivateGlobalPrefix = ".L";
+
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::None;
+  UseParensForSymbolVariant = true;
+
+  UseIntegratedAssembler = false;
+  DwarfRegNumForCFI = true;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
new file mode 100644
index 0000000..51cfa0ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -0,0 +1,51 @@
+//===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARMTARGETASMINFO_H
+#define LLVM_ARMTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+
+  class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    void anchor() override;
+  public:
+    explicit ARMMCAsmInfoDarwin(StringRef TT);
+  };
+
+  class ARMELFMCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit ARMELFMCAsmInfo(StringRef TT);
+
+    void setUseIntegratedAssembler(bool Value) override;
+  };
+
+  class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+    void anchor() override;
+  public:
+    explicit ARMCOFFMCAsmInfoMicrosoft();
+  };
+
+  class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
+    void anchor() override;
+  public:
+    explicit ARMCOFFMCAsmInfoGNU();
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
new file mode 100644
index 0000000..b8ee555
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -0,0 +1,1675 @@
+//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
+
+namespace {
+class ARMMCCodeEmitter : public MCCodeEmitter {
+  ARMMCCodeEmitter(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCContext &CTX;
+  bool IsLittleEndian;
+
+public:
+  ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle)
+    : MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) {
+  }
+
+  ~ARMMCCodeEmitter() {}
+
+  bool isThumb(const MCSubtargetInfo &STI) const {
+    return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+  }
+  bool isThumb2(const MCSubtargetInfo &STI) const {
+    return isThumb(STI) && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0;
+  }
+  bool isTargetMachO(const MCSubtargetInfo &STI) const {
+    Triple TT(STI.getTargetTriple());
+    return TT.isOSBinFormatMachO();
+  }
+
+  unsigned getMachineSoImmOpValue(unsigned SoImm) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
+  /// the specified operand. This is used for operands with :lower16: and
+  /// :upper16: prefixes.
+  uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
+                              unsigned &Reg, unsigned &Imm,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
+  /// BL branch target.
+  uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+  /// BLX branch target.
+  uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+  /// immediate Thumb2 direct branch target.
+  uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+  uint32_t getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
+  /// ADR label target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+
+  /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
+  uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const;
+
+  /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+  /// operand.
+  uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+
+  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
+  /// operand as needed by load/store instructions.
+  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getLdStmModeOpValue - Return encoding for load/store multiple mode.
+  uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const {
+    ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
+    switch (Mode) {
+    default: llvm_unreachable("Unknown addressing sub-mode!");
+    case ARM_AM::da: return 0;
+    case ARM_AM::ia: return 1;
+    case ARM_AM::db: return 2;
+    case ARM_AM::ib: return 3;
+    }
+  }
+  /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+  ///
+  unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
+    switch (ShOpc) {
+    case ARM_AM::no_shift:
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::asr: return 2;
+    case ARM_AM::ror:
+    case ARM_AM::rrx: return 3;
+    }
+    llvm_unreachable("Invalid ShiftOpc!");
+  }
+
+  /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
+  uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
+  uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
+  uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
+  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getCCOutOpValue - Return encoding of the 's' bit.
+  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
+    // '1' respectively.
+    return MI.getOperand(Op).getReg() == ARM::CPSR;
+  }
+
+  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+
+    const MCOperand &MO = MI.getOperand(Op);
+
+    // We expect MO to be an immediate or an expression,
+    // if it is an immediate - that's fine, just encode the value.
+    // Otherwise - create a Fixup.
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      // In instruction code this value always encoded as lowest 12 bits,
+      // so we don't have to perform any specific adjustments.
+      // Due to requirements of relocatable records we have to use FK_Data_4.
+      // See ARMELFObjectWriter::ExplicitRelSym and
+      //     ARMELFObjectWriter::GetRelocTypeInner for more details.
+      MCFixupKind Kind = MCFixupKind(FK_Data_4);
+      Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+      return 0;
+    }
+
+    unsigned SoImm = MO.getImm();
+    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+    assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+    // Encode rotate_imm.
+    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+      << ARMII::SoRotImmShift;
+
+    // Encode immed_8.
+    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+    return Binary;
+  }
+
+  /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    unsigned Encoded =  ARM_AM::getT2SOImmVal(SoImm);
+    assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
+    return Encoded;
+  }
+
+  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+  unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+
+  /// getSORegOpValue - Return an encoded so_reg shifted register value.
+  unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const;
+  unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const;
+  unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const {
+    return 64 - MI.getOperand(Op).getImm();
+  }
+
+  unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                      unsigned EncodedValue,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                          unsigned EncodedValue,
+                                          const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+                                    unsigned EncodedValue,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
+                                   unsigned EncodedValue,
+                                   const MCSubtargetInfo &STI) const;
+
+  unsigned VFPThumb2PostEncoder(const MCInst &MI,
+                                unsigned EncodedValue,
+                                const MCSubtargetInfo &STI) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+      EmitByte((Val >> Shift) & 0xff, OS);
+    }
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, Ctx, true);
+}
+
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, Ctx, false);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved
+    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+    // set to 1111.
+    unsigned Bit24 = EncodedValue & 0x01000000;
+    unsigned Bit28 = Bit24 << 4;
+    EncodedValue &= 0xEFFFFFFF;
+    EncodedValue |= Bit28;
+    EncodedValue |= 0x0F000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0xF0FFFFFF;
+    EncodedValue |= 0x09000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0x00FFFFFF;
+    EncodedValue |= 0xEE000000;
+  }
+
+  return EncodedValue;
+}
+
+/// Post-process encoded NEON v8 instructions, and rewrite them to Thumb2 form
+/// if we are in Thumb2.
+unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue |= 0xC000000; // Set bits 27-26
+  }
+
+  return EncodedValue;
+}
+
+/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
+/// them to their Thumb2 form if we are currently in Thumb2 mode.
+unsigned ARMMCCodeEmitter::
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue,
+                     const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0x0FFFFFFF;
+    EncodedValue |= 0xE0000000;
+  }
+  return EncodedValue;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
+
+    // Q registers are encoded as 2x their register number.
+    switch (Reg) {
+    default:
+      return RegNo;
+    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
+    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
+    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
+    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
+      return 2 * RegNo;
+    }
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+                     .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+}
+
+/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
+bool ARMMCCodeEmitter::
+EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
+                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+
+  Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  int32_t SImm = MO1.getImm();
+  bool isAdd = true;
+
+  // Special value for #-0
+  if (SImm == INT32_MIN) {
+    SImm = 0;
+    isAdd = false;
+  }
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (SImm < 0) {
+    SImm = -SImm;
+    isAdd = false;
+  }
+
+  Imm = SImm;
+  return isAdd;
+}
+
+/// getBranchTargetOpValue - Helper function to get the branch target operand,
+/// which is either an immediate or requires a fixup.
+static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() && "Unexpected branch target type!");
+  const MCExpr *Expr = MO.getExpr();
+  MCFixupKind Kind = MCFixupKind(FixupKind);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+// Thumb BL and BLX use a strange offset encoding where bits 22 and 21 are
+// determined by negating them and XOR'ing them with bit 23.
+static int32_t encodeThumbBLOffset(int32_t offset) {
+  offset >>= 1;
+  uint32_t S  = (offset & 0x800000) >> 23;
+  uint32_t J1 = (offset & 0x400000) >> 22;
+  uint32_t J2 = (offset & 0x200000) >> 21;
+  J1 = (~J1 & 0x1);
+  J2 = (~J2 & 0x1);
+  J1 ^= S;
+  J2 ^= S;
+
+  offset &= ~0x600000;
+  offset |= J1 << 22;
+  offset |= J2 << 21;
+
+  return offset;
+}
+
+/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
+                                    Fixups, STI);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+/// BLX branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
+                                    Fixups, STI);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
+                                    Fixups, STI);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
+                                    Fixups, STI);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups, STI);
+  return (MO.getImm() >> 1);
+}
+
+/// Return true if this branch has a non-always predication
+static bool HasConditionalBranch(const MCInst &MI) {
+  int NumOp = MI.getNumOperands();
+  if (NumOp >= 2) {
+    for (int i = 0; i < NumOp-1; ++i) {
+      const MCOperand &MCOp1 = MI.getOperand(i);
+      const MCOperand &MCOp2 = MI.getOperand(i + 1);
+      if (MCOp1.isImm() && MCOp2.isReg() &&
+          (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
+        if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // FIXME: This really, really shouldn't use TargetMachine. We don't want
+  // coupling between MC and TM anywhere we can help it.
+  if (isThumb2(STI))
+    return
+      ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups, STI);
+  return getARMBranchTargetOpValue(MI, OpIdx, Fixups, STI);
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx,
+                                      ARM::fixup_arm_condbranch, Fixups, STI);
+    return ::getBranchTargetOpValue(MI, OpIdx,
+                                    ARM::fixup_arm_uncondbranch, Fixups, STI);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx, 
+                                      ARM::fixup_arm_condbl, Fixups, STI);
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups, STI);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups, STI);
+
+  return MO.getImm() >> 1;
+}
+
+/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+/// immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  unsigned Val = 0;
+  const MCOperand MO = MI.getOperand(OpIdx);
+    
+  if(MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups, STI);
+  else 
+    Val = MO.getImm() >> 1;
+
+  bool I  = (Val & 0x800000);
+  bool J1 = (Val & 0x400000);
+  bool J2 = (Val & 0x200000);
+  if (I ^ J1)
+    Val &= ~0x400000;
+  else
+    Val |= 0x400000;
+
+  if (I ^ J2)
+    Val &= ~0x200000;
+  else
+    Val |= 0x200000;
+
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit shifted-immediate
+/// ADR label target.
+uint32_t ARMMCCodeEmitter::
+getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+                                    Fixups, STI);
+  int64_t offset = MO.getImm();
+  uint32_t Val = 0x2000;
+
+  int SoImmVal;
+  if (offset == INT32_MIN) {
+    Val = 0x1000;
+    SoImmVal = 0;
+  } else if (offset < 0) {
+    Val = 0x1000;
+    offset *= -1;
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x2000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  } else {
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x1000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  }
+
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  Val |= SoImmVal;
+  return Val;
+}
+
+/// getT2AdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+                                    Fixups, STI);
+  int32_t Val = MO.getImm();
+  if (Val == INT32_MIN)
+    Val = 0x1000;
+  else if (Val < 0) {
+    Val *= -1;
+    Val |= 0x1000;
+  }
+  return Val;
+}
+
+/// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+                                    Fixups, STI);
+  return MO.getImm();
+}
+
+/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &,
+                              const MCSubtargetInfo &STI) const {
+  // [Rn, Rm]
+  //   {5-3} = Rm
+  //   {2-0} = Rn
+  const MCOperand &MO1 = MI.getOperand(OpIdx);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
+  return (Rm << 3) | Rn;
+}
+
+/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  // {17-13} = reg
+  // {12}    = (U)nsigned (add == '1', sub == '0')
+  // {11-0}  = imm12
+  unsigned Reg, Imm12;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm12 = 0;
+
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      isAdd = false ; // 'U' bit is set as part of the fixup.
+
+      MCFixupKind Kind;
+      if (isThumb2(STI))
+        Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+      else
+        Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+      Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+      ++MCNumCPRelocations;
+    } else {
+      Reg = ARM::PC;
+      int32_t Offset = MO.getImm();
+      if (Offset == INT32_MIN) {
+        Offset = 0;
+        isAdd = false;
+      } else if (Offset < 0) {
+        Offset *= -1;
+        isAdd = false;
+      }
+      Imm12 = Offset;
+    }
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI);
+
+  uint32_t Binary = Imm12 & 0xfff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 12);
+  Binary |= (Reg << 13);
+  return Binary;
+}
+
+/// getT2Imm8s4OpValue - Return encoding info for
+/// '+/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  int32_t Imm8 = MI.getOperand(OpIdx).getImm();
+  bool isAdd = Imm8 >= 0;
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (Imm8 < 0)
+    Imm8 = -(uint32_t)Imm8;
+
+  // Scaled by 4.
+  Imm8 /= 4;
+
+  uint32_t Binary = Imm8 & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  return Binary;
+}
+
+/// getT2AddrModeImm8s4OpValue - Return encoding info for
+/// 'reg +/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+  uint32_t Binary = (Imm8 >> 2) & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for
+/// 'reg + imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  // {11-8} = reg
+  // {7-0}  = imm8
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Imm8 = MO1.getImm();
+  return (Reg << 8) | Imm8;
+}
+
+uint32_t
+ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  // {20-16} = imm{15-12}
+  // {11-0}  = imm{11-0}
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    // Hi / lo 16 bits already extracted during earlier passes.
+    return static_cast<unsigned>(MO.getImm());
+
+  // Handle :upper16: and :lower16: assembly prefixes.
+  const MCExpr *E = MO.getExpr();
+  MCFixupKind Kind;
+  if (E->getKind() == MCExpr::Target) {
+    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    E = ARM16Expr->getSubExpr();
+
+    if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
+      const int64_t Value = MCE->getValue();
+      if (Value > UINT32_MAX)
+        report_fatal_error("constant value truncated (limited to 32-bit)");
+
+      switch (ARM16Expr->getKind()) {
+      case ARMMCExpr::VK_ARM_HI16:
+        return (int32_t(Value) & 0xffff0000) >> 16;
+      case ARMMCExpr::VK_ARM_LO16:
+        return (int32_t(Value) & 0x0000ffff);
+      default: llvm_unreachable("Unsupported ARMFixup");
+      }
+    }
+
+    switch (ARM16Expr->getKind()) {
+    default: llvm_unreachable("Unsupported ARMFixup");
+    case ARMMCExpr::VK_ARM_HI16:
+      Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movt_hi16
+                                       : ARM::fixup_arm_movt_hi16);
+      break;
+    case ARMMCExpr::VK_ARM_LO16:
+      Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
+                                       : ARM::fixup_arm_movw_lo16);
+      break;
+    }
+
+    Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
+    return 0;
+  }
+  // If the expression doesn't have :upper16: or :lower16: on it,
+  // it's just a plain immediate expression, previously those evaluated to
+  // the lower 16 bits of the expression regardless of whether
+  // we have a movt or a movw, but that led to misleadingly results.
+  // This is now disallowed in the the AsmParser in validateInstruction()
+  // so this should never happen.
+  llvm_unreachable("expression without :upper16: or :lower16:");
+}
+
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
+  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
+  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
+  unsigned SBits = getShiftOp(ShOp);
+
+  // While "lsr #32" and "asr #32" exist, they are encoded with a 0 in the shift
+  // amount. However, it would be an easy mistake to make so check here.
+  assert((ShImm & ~0x1f) == 0 && "Out of range shift amount");
+
+  // {16-13} = Rn
+  // {12}    = isAdd
+  // {11-0}  = shifter
+  //  {3-0}  = Rm
+  //  {4}    = 0
+  //  {6-5}  = type
+  //  {11-7} = imm
+  uint32_t Binary = Rm;
+  Binary |= Rn << 13;
+  Binary |= SBits << 5;
+  Binary |= ShImm << 7;
+  if (isAdd)
+    Binary |= 1 << 12;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  // {17-14}  Rn
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups, STI);
+  Binary |= Rn << 14;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add;
+  bool isReg = MO.getReg() != 0;
+  uint32_t Binary = ARM_AM::getAM2Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12
+  if (isReg) {
+    ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
+    Binary <<= 7;                    // Shift amount is bits [11:7]
+    Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
+    Binary |= CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); // Rm is bits [3:0]
+  }
+  return Binary | (isAdd << 12) | (isReg << 13);
+}
+
+uint32_t ARMMCCodeEmitter::
+getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // {4}      isAdd
+  // {3-0}    Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  bool isAdd = MO1.getImm() != 0;
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()) | (isAdd << 4);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  // {9}      1 == imm8, 0 == Rm
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  return Imm8 | (isAdd << 8) | (isImm << 9);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+
+  // If The first operand isn't a register, we have a label reference.
+  if (!MO.isReg()) {
+    unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10_unscaled);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+    return (Rn << 9) | (1 << 13);
+  }
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Imm = MO2.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO1.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
+}
+
+/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  // [SP, #imm]
+  //   {7-0} = imm8
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
+         "Unexpected base register!");
+
+  // The immediate is already shifted for the implicit zeroes, so no change
+  // here.
+  return MO1.getImm() & 0xff;
+}
+
+/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // [Rn, #imm]
+  //   {7-3} = imm5
+  //   {2-0} = Rn
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Imm5 = MO1.getImm();
+  return ((Imm5 & 0x1f) << 3) | Rn;
+}
+
+/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups, STI);
+  return (MO.getImm() >> 2);
+}
+
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false; // 'U' bit is handled as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind;
+    if (isThumb2(STI))
+      Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else {
+    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+  }
+
+  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is Rs, the amount to shift by, and the third specifies
+  // the type of the shift.
+  //
+  // {3-0} = Rm.
+  // {4}   = 1
+  // {6-5} = type
+  // {11-8} = Rs
+  // {7}    = 0
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    }
+  }
+
+  Binary |= SBits << 4;
+
+  // Encode the shift operation Rs.
+  // Encode Rs bit[11:8].
+  assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+  return Binary | (CTX.getRegisterInfo()->getEncodingValue(Rs) << ARMII::RegRsShift);
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  // RRX - 110 and bit[11:8] clear.
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::ror: SBits = 0x6; break;
+  case ARM_AM::rrx:
+    Binary |= 0x60;
+    return Binary;
+  }
+
+  // Encode shift_imm bit[11:7].
+  Binary |= SBits << 4;
+  unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm());
+  assert(Offset < 32 && "Offset must be in range 0-31!");
+  return Binary | (Offset << 7);
+}
+
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+  const MCOperand &MO3 = MI.getOperand(OpNum+2);
+
+  // Encoded as [Rn, Rm, imm].
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  Value <<= 4;
+  Value |= CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
+  Value <<= 2;
+  Value |= MO3.getImm();
+
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+
+  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // to represent the (inverse of the) sign bit.
+  Value <<= 9;
+  int32_t tmp = (int32_t)MO2.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 4096; // Set the ADD bit
+  Value |= tmp & 4095;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::rrx: // FALLTHROUGH
+  case ARM_AM::ror: SBits = 0x6; break;
+  }
+
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7;
+}
+
+unsigned ARMMCCodeEmitter::
+getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const {
+  // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+  // msb of the mask.
+  const MCOperand &MO = MI.getOperand(Op);
+  uint32_t v = ~MO.getImm();
+  uint32_t lsb = countTrailingZeros(v);
+  uint32_t msb = (32 - countLeadingZeros (v)) - 1;
+  assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+  return lsb | (msb << 5);
+}
+
+unsigned ARMMCCodeEmitter::
+getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // VLDM/VSTM:
+  //   {12-8} = Vd
+  //   {7-0}  = Number of registers
+  //
+  // LDM/STM:
+  //   {15-0}  = Bitfield of GPRs.
+  unsigned Reg = MI.getOperand(Op).getReg();
+  bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
+  bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
+
+  unsigned Binary = 0;
+
+  if (SPRRegs || DPRRegs) {
+    // VLDM/VSTM
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
+    unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
+    Binary |= (RegNo & 0x1f) << 8;
+    if (SPRRegs)
+      Binary |= NumRegs;
+    else
+      Binary |= NumRegs * 2;
+  } else {
+    for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
+      unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(MI.getOperand(I).getReg());
+      Binary |= 1 << RegNo;
+    }
+  }
+
+  return Binary;
+}
+
+/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along
+/// with the alignment operand.
+unsigned ARMMCCodeEmitter::
+getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x02; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
+/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 8:
+  case 16:
+  case 32: // Default '0' value for invalid alignments of 8, 16, 32 bytes.
+  case 2: Align = 0x00; break;
+  case 4: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+
+/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
+/// alignment operand for use in VLD-dup instructions.  This is the same as
+/// getAddrMode6AddressOpValue except for the alignment encoding, which is
+/// different for VLD4-dup.
+unsigned ARMMCCodeEmitter::
+getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+unsigned ARMMCCodeEmitter::
+getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.getReg() == 0) return 0x0D;
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  return 64 - MI.getOperand(Op).getImm();
+}
+
+void ARMMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  // Pseudo instructions don't get encoded.
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+  if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo)
+    return;
+
+  int Size;
+  if (Desc.getSize() == 2 || Desc.getSize() == 4)
+    Size = Desc.getSize();
+  else
+    llvm_unreachable("Unexpected instruction size!");
+
+  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  // Thumb 32-bit wide instructions need to emit the high order halfword
+  // first.
+  if (isThumb(STI) && Size == 4) {
+    EmitConstant(Binary >> 16, 2, OS);
+    EmitConstant(Binary & 0xffff, 2, OS);
+  } else
+    EmitConstant(Binary, Size, OS);
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+#include "ARMGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
new file mode 100644
index 0000000..e545e3c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -0,0 +1,46 @@
+//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "armmcexpr"
+
+const ARMMCExpr*
+ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                       MCContext &Ctx) {
+  return new (Ctx) ARMMCExpr(Kind, Expr);
+}
+
+void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_ARM_HI16: OS << ":upper16:"; break;
+  case VK_ARM_LO16: OS << ":lower16:"; break;
+  }
+
+  const MCExpr *Expr = getSubExpr();
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  Expr->print(OS);
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+bool
+ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout) const {
+  return false;
+}
+
+void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
new file mode 100644
index 0000000..c5c0b10
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -0,0 +1,76 @@
+//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMCEXPR_H
+#define ARMMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class ARMMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_ARM_None,
+    VK_ARM_HI16,  // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
+    VK_ARM_LO16   // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+    : Kind(_Kind), Expr(_Expr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                      MCContext &Ctx);
+
+  static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_HI16, Expr, Ctx);
+  }
+
+  static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_LO16, Expr, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const override;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  const MCSection *FindAssociatedSection() const override {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  // There are no TLS ARMMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
new file mode 100644
index 0000000..6a3ec8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -0,0 +1,447 @@
+//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInfo.h"
+#include "ARMMCAsmInfo.h"
+#include "ARMMCTargetDesc.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "ARMGenRegisterInfo.inc"
+
+static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV7Ops &&
+      (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) &&
+      (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) &&
+      // Checks for the deprecated CP15ISB encoding:
+      // mcr p15, #0, rX, c7, c5, #4
+      (MI.getOperand(3).isImm() && MI.getOperand(3).getImm() == 7)) {
+    if ((MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 4)) {
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 5) {
+        Info = "deprecated since v7, use 'isb'";
+        return true;
+      }
+
+      // Checks for the deprecated CP15DSB encoding:
+      // mcr p15, #0, rX, c7, c10, #4
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10) {
+        Info = "deprecated since v7, use 'dsb'";
+        return true;
+      }
+    }
+    // Checks for the deprecated CP15DMB encoding:
+    // mcr p15, #0, rX, c7, c10, #5
+    if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10 &&
+        (MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 5)) {
+      Info = "deprecated since v7, use 'dmb'";
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV8Ops &&
+      MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) {
+    Info = "applying IT instruction to more than one subsequent instruction is deprecated";
+    return true;
+  }
+
+  return false;
+}
+
+#define GET_INSTRINFO_MC_DESC
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "ARMGenSubtargetInfo.inc"
+
+
+std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
+  Triple triple(TT);
+
+  bool isThumb = triple.getArch() == Triple::thumb ||
+                 triple.getArch() == Triple::thumbeb;
+
+  bool NoCPU = CPU == "generic" || CPU.empty();
+  std::string ARMArchFeature;
+  switch (triple.getSubArch()) {
+  case Triple::ARMSubArch_v8:
+    if (NoCPU)
+      // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2,
+      //      FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone,
+      //      FeatureT2XtPk, FeatureCrypto, FeatureCRC
+      ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,"
+                       "+trustzone,+t2xtpk,+crypto,+crc";
+    else
+      // Use CPU to figure out the exact features
+      ARMArchFeature = "+v8";
+    break;
+  case Triple::ARMSubArch_v7m:
+    isThumb = true;
+    if (NoCPU)
+      // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
+      ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v7em:
+    if (NoCPU)
+      // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
+      //       FeatureT2XtPk, FeatureMClass
+      ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v7s:
+    if (NoCPU)
+      // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS
+      //      Swift
+      ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v7:
+    // v7 CPUs have lots of different feature sets. If no CPU is specified,
+    // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
+    // the "minimum" feature set and use CPU string to figure out the exact
+    // features.
+    if (NoCPU)
+      // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+      ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v6t2:
+    ARMArchFeature = "+v6t2";
+    break;
+  case Triple::ARMSubArch_v6m:
+    isThumb = true;
+    if (NoCPU)
+      // v6m: FeatureNoARM, FeatureMClass
+      ARMArchFeature = "+v6m,+noarm,+mclass";
+    else
+      ARMArchFeature = "+v6";
+    break;
+  case Triple::ARMSubArch_v6:
+    ARMArchFeature = "+v6";
+    break;
+  case Triple::ARMSubArch_v5te:
+    ARMArchFeature = "+v5te";
+    break;
+  case Triple::ARMSubArch_v5:
+    ARMArchFeature = "+v5t";
+    break;
+  case Triple::ARMSubArch_v4t:
+    ARMArchFeature = "+v4t";
+    break;
+  case Triple::NoSubArch:
+    break;
+  }
+
+  if (isThumb) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+thumb-mode";
+    else
+      ARMArchFeature += ",+thumb-mode";
+  }
+
+  if (triple.isOSNaCl()) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+nacl-trap";
+    else
+      ARMArchFeature += ",+nacl-trap";
+  }
+
+  return ARMArchFeature;
+}
+
+MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                  StringRef FS) {
+  std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitARMMCSubtargetInfo(X, TT, CPU, ArchFS);
+  return X;
+}
+
+static MCInstrInfo *createARMMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitARMMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC);
+  return X;
+}
+
+static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+  Triple TheTriple(TT);
+
+  MCAsmInfo *MAI;
+  switch (TheTriple.getOS()) {
+  case llvm::Triple::Darwin:
+  case llvm::Triple::IOS:
+  case llvm::Triple::MacOSX:
+    MAI = new ARMMCAsmInfoDarwin(TT);
+    break;
+  case llvm::Triple::Win32:
+    switch (TheTriple.getEnvironment()) {
+    case llvm::Triple::Itanium:
+      MAI = new ARMCOFFMCAsmInfoGNU();
+      break;
+    case llvm::Triple::MSVC:
+      MAI = new ARMCOFFMCAsmInfoMicrosoft();
+      break;
+    default:
+      llvm_unreachable("invalid environment");
+    }
+    break;
+  default:
+    if (TheTriple.isOSBinFormatMachO())
+      MAI = new ARMMCAsmInfoDarwin(TT);
+    else
+      MAI = new ARMELFMCAsmInfo(TT);
+    break;
+  }
+
+  unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
+  MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default) {
+    Triple TheTriple(TT);
+    // Default relocation model on Darwin is PIC, not DynamicNoPIC.
+    RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
+  }
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  switch (TheTriple.getObjectFormat()) {
+  default: llvm_unreachable("unsupported object format");
+  case Triple::MachO: {
+    MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, false);
+    new ARMTargetStreamer(*S);
+    return S;
+  }
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
+    return createARMWinCOFFStreamer(Ctx, MAB, *Emitter, OS);
+  case Triple::ELF:
+    return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack,
+                                TheTriple.getArch() == Triple::thumb);
+  }
+}
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI,
+                                             const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new ARMInstPrinter(MAI, MII, MRI, STI);
+  return nullptr;
+}
+
+static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT,
+                                                   MCContext &Ctx) {
+  Triple TheTriple(TT);
+  if (TheTriple.isOSBinFormatMachO())
+    return createARMMachORelocationInfo(Ctx);
+  // Default to the stock relocation info.
+  return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
+namespace {
+
+class ARMMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  bool isUnconditionalBranch(const MCInst &Inst) const override {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return true;
+    return MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  bool isConditionalBranch(const MCInst &Inst) const override {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return false;
+    return MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const override {
+    // We only handle PCRel branches for now.
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+      return false;
+
+    int64_t Imm = Inst.getOperand(0).getImm();
+    // FIXME: This is not right for thumb.
+    Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+    return true;
+  }
+};
+
+}
+
+static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new ARMMCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheARMLETarget, createARMMCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheARMBETarget, createARMMCAsmInfo);
+  RegisterMCAsmInfoFn A(TheThumbLETarget, createARMMCAsmInfo);
+  RegisterMCAsmInfoFn B(TheThumbBETarget, createARMMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheARMLETarget, createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheARMBETarget, createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheThumbLETarget, createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheThumbBETarget, createARMMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheARMLETarget, createARMMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheARMBETarget, createARMMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheThumbLETarget, createARMMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheThumbBETarget, createARMMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheARMLETarget, createARMMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheARMBETarget, createARMMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheThumbLETarget, createARMMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheThumbBETarget, createARMMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheARMLETarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheARMBETarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheThumbLETarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheThumbBETarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheARMLETarget,
+                                          createARMMCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheARMBETarget,
+                                          createARMMCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheThumbLETarget,
+                                          createARMMCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheThumbBETarget,
+                                          createARMMCInstrAnalysis);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheARMLETarget,
+                                        createARMLEMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheARMBETarget,
+                                        createARMBEMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheThumbLETarget,
+                                        createARMLEMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheThumbBETarget,
+                                        createARMBEMCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheARMLETarget, createARMLEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheARMBETarget, createARMBEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheThumbLETarget,
+                                       createThumbLEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheThumbBETarget,
+                                       createThumbBEAsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheARMLETarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheARMBETarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheThumbLETarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheThumbBETarget, createMCStreamer);
+
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheARMLETarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheARMBETarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer);
+
+  // Register the null streamer.
+  TargetRegistry::RegisterNullStreamer(TheARMLETarget, createARMNullStreamer);
+  TargetRegistry::RegisterNullStreamer(TheARMBETarget, createARMNullStreamer);
+  TargetRegistry::RegisterNullStreamer(TheThumbLETarget, createARMNullStreamer);
+  TargetRegistry::RegisterNullStreamer(TheThumbBETarget, createARMNullStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheARMBETarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbLETarget,
+                                        createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbBETarget,
+                                        createARMMCInstPrinter);
+
+  // Register the MC relocation info.
+  TargetRegistry::RegisterMCRelocationInfo(TheARMLETarget,
+                                           createARMMCRelocationInfo);
+  TargetRegistry::RegisterMCRelocationInfo(TheARMBETarget,
+                                           createARMMCRelocationInfo);
+  TargetRegistry::RegisterMCRelocationInfo(TheThumbLETarget,
+                                           createARMMCRelocationInfo);
+  TargetRegistry::RegisterMCRelocationInfo(TheThumbBETarget,
+                                           createARMMCRelocationInfo);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
new file mode 100644
index 0000000..5326e56
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -0,0 +1,119 @@
+//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMCTARGETDESC_H
+#define ARMMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstPrinter;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCStreamer;
+class MCRelocationInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheARMLETarget, TheThumbLETarget;
+extern Target TheARMBETarget, TheThumbBETarget;
+
+namespace ARM_MC {
+  std::string ParseARMTriple(StringRef TT, StringRef CPU);
+
+  /// createARMMCSubtargetInfo - Create a ARM MCSubtargetInfo instance.
+  /// This is exposed so Asm parser, etc. do not need to go through
+  /// TargetRegistry.
+  MCSubtargetInfo *createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                            StringRef FS);
+}
+
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                bool isVerboseAsm, bool useDwarfDirectory,
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst);
+
+MCStreamer *createARMNullStreamer(MCContext &Ctx);
+
+MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        const MCSubtargetInfo &STI,
+                                        MCContext &Ctx);
+
+MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        const MCSubtargetInfo &STI,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU,
+                                  bool IsLittleEndian);
+
+MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
+
+MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
+
+MCAsmBackend *createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                      StringRef TT, StringRef CPU);
+
+MCAsmBackend *createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                      StringRef TT, StringRef CPU);
+
+/// createARMWinCOFFStreamer - Construct a PE/COFF machine code streamer which
+/// will generate a PE/COFF object file.
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     MCCodeEmitter &Emitter, raw_ostream &OS);
+
+/// createARMELFObjectWriter - Construct an ELF Mach-O object writer.
+MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
+                                         uint8_t OSABI,
+                                         bool IsLittleEndian);
+
+/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
+                                          bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// createARMWinCOFFObjectWriter - Construct an ARM PE/COFF object writer.
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+
+/// createARMMachORelocationInfo - Construct ARM Mach-O relocation info.
+MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
+} // End llvm namespace
+
+// Defines symbolic names for ARM registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARMGenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARMGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
new file mode 100644
index 0000000..d4b00e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -0,0 +1,43 @@
+//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "ARMMCExpr.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRelocationInfo.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+class ARMMachORelocationInfo : public MCRelocationInfo {
+public:
+  ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr,
+                                             unsigned VariantKind) override {
+    switch(VariantKind) {
+    case LLVMDisassembler_VariantKind_ARM_HI16:
+      return ARMMCExpr::CreateUpper16(SubExpr, Ctx);
+    case LLVMDisassembler_VariantKind_ARM_LO16:
+      return ARMMCExpr::CreateLower16(SubExpr, Ctx);
+    default:
+      return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
+                                                            VariantKind);
+    }
+  }
+};
+} // End unnamed namespace
+
+/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
+  return new ARMMachORelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
new file mode 100644
index 0000000..186776a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -0,0 +1,493 @@
+//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachOSymbolFlags.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+class ARMMachObjectWriter : public MCMachObjectTargetWriter {
+  void RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                    const MCAssembler &Asm,
+                                    const MCAsmLayout &Layout,
+                                    const MCFragment *Fragment,
+                                    const MCFixup &Fixup,
+                                    MCValue Target,
+                                    unsigned Type,
+                                    unsigned Log2Size,
+                                    uint64_t &FixedValue);
+  void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+                                        const MCAssembler &Asm,
+                                        const MCAsmLayout &Layout,
+                                        const MCFragment *Fragment,
+                                        const MCFixup &Fixup, MCValue Target,
+                                        uint64_t &FixedValue);
+
+  bool requiresExternRelocation(MachObjectWriter *Writer,
+                                const MCAssembler &Asm,
+                                const MCFragment &Fragment,
+                                unsigned RelocType, const MCSymbolData *SD,
+                                uint64_t FixedValue);
+
+public:
+  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/true) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue) override;
+};
+}
+
+static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
+                              unsigned &Log2Size) {
+  RelocType = unsigned(MachO::ARM_RELOC_VANILLA);
+  Log2Size = ~0U;
+
+  switch (Kind) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    return true;
+
+    // These fixups are expected to always be resolvable at assembly time and
+    // have no relocations supported.
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+    return false;
+
+    // Handle 24-bit branch kinds.
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    RelocType = unsigned(MachO::ARM_RELOC_BR24);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+    // Handle Thumb branches.
+  case ARM::fixup_arm_thumb_br:
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
+    Log2Size = llvm::Log2_32(2);
+    return true;
+
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+  // For movw/movt r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  case ARM::fixup_arm_movt_hi16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 1;
+    return true;
+  case ARM::fixup_t2_movt_hi16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 3;
+    return true;
+
+  case ARM::fixup_arm_movw_lo16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 0;
+    return true;
+  case ARM::fixup_t2_movw_lo16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 2;
+    return true;
+  }
+}
+
+void ARMMachObjectWriter::
+RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = MachO::ARM_RELOC_HALF;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                       "symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint32_t Value2 = 0;
+  uint64_t SecAddr =
+    Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         "symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    Type = MachO::ARM_RELOC_HALF_SECTDIFF;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field:
+  //
+  // For these two r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  // the other half of the relocated expression is in the following pair
+  // relocation entry in the low 16 bits of r_address field.
+  unsigned ThumbBit = 0;
+  unsigned MovtBit = 0;
+  switch ((unsigned)Fixup.getKind()) {
+  default: break;
+  case ARM::fixup_arm_movt_hi16:
+    MovtBit = 1;
+    // The thumb bit shouldn't be set in the 'other-half' bit of the
+    // relocation, but it will be set in FixedValue if the base symbol
+    // is a thumb function. Clear it out here.
+    if (Asm.isThumbFunc(A))
+      FixedValue &= 0xfffffffe;
+    break;
+  case ARM::fixup_t2_movt_hi16:
+    if (Asm.isThumbFunc(A))
+      FixedValue &= 0xfffffffe;
+    MovtBit = 1;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16:
+    ThumbBit = 1;
+    break;
+  }
+
+  if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
+    uint32_t OtherHalf = MovtBit
+      ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((OtherHalf             <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (MovtBit               << 28) |
+                   (ThumbBit              << 29) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (MovtBit     << 28) |
+                 (ThumbBit    << 29) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Type,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                       "symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols");
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         "symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    Type = MachO::ARM_RELOC_SECTDIFF;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::ARM_RELOC_SECTDIFF ||
+      Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) {
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                     <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (Log2Size              << 28) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
+                                                   const MCAssembler &Asm,
+                                                   const MCFragment &Fragment,
+                                                   unsigned RelocType,
+                                                   const MCSymbolData *SD,
+                                                   uint64_t FixedValue) {
+  // Most cases can be identified purely from the symbol.
+  if (Writer->doesSymbolRequireExternRelocation(SD))
+    return true;
+  int64_t Value = (int64_t)FixedValue;  // The displacement is signed.
+  int64_t Range;
+  switch (RelocType) {
+  default:
+    return false;
+  case MachO::ARM_RELOC_BR24:
+    // PC pre-adjustment of 8 for these instructions.
+    Value -= 8;
+    // ARM BL/BLX has a 25-bit offset.
+    Range = 0x1ffffff;
+    break;
+  case MachO::ARM_THUMB_RELOC_BR22:
+    // PC pre-adjustment of 4 for these instructions.
+    Value -= 4;
+    // Thumb BL/BLX has a 24-bit offset.
+    Range = 0xffffff;
+  }
+  // BL/BLX also use external relocations when an internal relocation
+  // would result in the target being out of range. This gives the linker
+  // enough information to generate a branch island.
+  const MCSectionData &SymSD = Asm.getSectionData(
+    SD->getSymbol().getSection());
+  Value += Writer->getSectionAddress(&SymSD);
+  Value -= Writer->getSectionAddress(Fragment.getParent());
+  // If the resultant value would be out of range for an internal relocation,
+  // use an external instead.
+  if (Value > Range || Value < -(Range + 1))
+    return true;
+  return false;
+}
+
+void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
+                                           const MCAssembler &Asm,
+                                           const MCAsmLayout &Layout,
+                                           const MCFragment *Fragment,
+                                           const MCFixup &Fixup,
+                                           MCValue Target,
+                                           uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size;
+  unsigned RelocType = MachO::ARM_RELOC_VANILLA;
+  if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size))
+    // If we failed to get fixup kind info, it's because there's no legal
+    // relocation type for the fixup kind. This happens when it's a fixup that's
+    // expected to always be resolvable at assembly time and not have any
+    // relocations needed.
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                                "unsupported relocation on symbol");
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry.  Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    if (RelocType == MachO::ARM_RELOC_HALF)
+      return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
+                                              Fixup, Target, FixedValue);
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, RelocType, Log2Size,
+                                        FixedValue);
+  }
+
+  // Get the symbol data, if any.
+  const MCSymbolData *SD = nullptr;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // FIXME: For other platforms, we need to use scattered relocations for
+  // internal relocations with offsets.  If this is an internal relocation with
+  // an offset, it also needs a scattered relocation entry.
+  //
+  // Is this right for ARM?
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
+    Offset += 1 << Log2Size;
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, RelocType, Log2Size,
+                                        FixedValue);
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME!
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+            Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, SD,
+                                 FixedValue)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD = Asm.getSectionData(
+        SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    // The type is determined by the fixup kind.
+    Type = RelocType;
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
+
+  // Even when it's not a scattered relocation, movw/movt always uses
+  // a PAIR relocation.
+  if (Type == MachO::ARM_RELOC_HALF) {
+    // The other-half value only gets populated for the movt and movw
+    // relocation entries.
+    uint32_t Value = 0;
+    switch ((unsigned)Fixup.getKind()) {
+    default: break;
+    case ARM::fixup_arm_movw_lo16:
+    case ARM::fixup_t2_movw_lo16:
+      Value = (FixedValue >> 16) & 0xffff;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+    case ARM::fixup_t2_movt_hi16:
+      Value = FixedValue & 0xffff;
+      break;
+    }
+    MachO::any_relocation_info MREPair;
+    MREPair.r_word0 = Value;
+    MREPair.r_word1 = ((0xffffff              <<  0) |
+                       (Log2Size              << 25) |
+                       (MachO::ARM_RELOC_PAIR << 28));
+
+    Writer->addRelocation(Fragment->getParent(), MREPair);
+  }
+
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS,
+                                                bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
new file mode 100644
index 0000000..8acd7af
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -0,0 +1,73 @@
+//===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+//
+// ARMTargetStreamer Implemenation
+//
+ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+ARMTargetStreamer::~ARMTargetStreamer() {}
+
+// The constant pool handling is shared by all ARMTargetStreamer
+// implementations.
+const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
+  return ConstantPools->addEntry(Streamer, Expr, 4);
+}
+
+void ARMTargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+// The remaining callbacks should be handled separately by each
+// streamer.
+void ARMTargetStreamer::emitFnStart() {}
+void ARMTargetStreamer::emitFnEnd() {}
+void ARMTargetStreamer::emitCantUnwind() {}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {}
+void ARMTargetStreamer::emitHandlerData() {}
+void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                  int64_t Offset) {}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {}
+void ARMTargetStreamer::emitPad(int64_t Offset) {}
+void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                    bool isVector) {}
+void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
+void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
+                                          StringRef String) {}
+void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
+                                             unsigned IntValue,
+                                             StringRef StringValue) {}
+void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {}
+void ARMTargetStreamer::finishAttributeSection() {}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
+void
+ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
+
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
new file mode 100644
index 0000000..593fe34
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -0,0 +1,226 @@
+//===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the unwind opcode assmebler for ARM exception handling
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMUnwindOpAsm.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+
+namespace {
+  /// UnwindOpcodeStreamer - The simple wrapper over SmallVector to emit bytes
+  /// with MSB to LSB per uint32_t ordering.  For example, the first byte will
+  /// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
+  /// 7, 6, 5, 4, 11, 10, 9, 8, and so on.
+  class UnwindOpcodeStreamer {
+  private:
+    SmallVectorImpl<uint8_t> &Vec;
+    size_t Pos;
+
+  public:
+    UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
+    }
+
+    /// Emit the byte in MSB to LSB per uint32_t order.
+    inline void EmitByte(uint8_t elem) {
+      Vec[Pos] = elem;
+      Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
+    }
+
+    /// Emit the size prefix.
+    inline void EmitSize(size_t Size) {
+      size_t SizeInWords = (Size + 3) / 4;
+      assert(SizeInWords <= 0x100u &&
+             "Only 256 additional words are allowed for unwind opcodes");
+      EmitByte(static_cast<uint8_t>(SizeInWords - 1));
+    }
+
+    /// Emit the personality index prefix.
+    inline void EmitPersonalityIndex(unsigned PI) {
+      assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+             "Invalid personality prefix");
+      EmitByte(ARM::EHABI::EHT_COMPACT | PI);
+    }
+
+    /// Fill the rest of bytes with FINISH opcode.
+    inline void FillFinishOpcode() {
+      while (Pos < Vec.size())
+        EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
+    }
+  };
+}
+
+void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
+  if (RegSave == 0u)
+    return;
+
+  // One byte opcode to save register r14 and r11-r4
+  if (RegSave & (1u << 4)) {
+    // The one byte opcode will always save r4, thus we can't use the one byte
+    // opcode when r4 is not in .save directive.
+
+    // Compute the consecutive registers from r4 to r11.
+    uint32_t Range = 0;
+    uint32_t Mask = (1u << 4);
+    for (uint32_t Bit = (1u << 5); Bit < (1u << 12); Bit <<= 1) {
+      if ((RegSave & Bit) == 0u)
+        break;
+      ++Range;
+      Mask |= Bit;
+    }
+
+    // Emit this opcode when the mask covers every registers.
+    uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask);
+    if (UnmaskedReg == 0u) {
+      // Pop r[4 : (4 + n)]
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+      RegSave &= 0x000fu;
+    } else if (UnmaskedReg == (1u << 14)) {
+      // Pop r[14] + r[4 : (4 + n)]
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+      RegSave &= 0x000fu;
+    }
+  }
+
+  // Two bytes opcode to save register r15-r4
+  if ((RegSave & 0xfff0u) != 0)
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
+
+  // Opcode to save register r3-r0
+  if ((RegSave & 0x000fu) != 0)
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
+}
+
+/// Emit unwind opcodes for .vsave directives
+void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
+  size_t i = 32;
+
+  while (i > 16) {
+    uint32_t Bit = 1u << (i - 1);
+    if ((VFPRegSave & Bit) == 0u) {
+      --i;
+      continue;
+    }
+
+    uint32_t Range = 0;
+
+    --i;
+    Bit >>= 1;
+
+    while (i > 16 && (VFPRegSave & Bit)) {
+      --i;
+      ++Range;
+      Bit >>= 1;
+    }
+
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 |
+              ((i - 16) << 4) | Range);
+  }
+
+  while (i > 0) {
+    uint32_t Bit = 1u << (i - 1);
+    if ((VFPRegSave & Bit) == 0u) {
+      --i;
+      continue;
+    }
+
+    uint32_t Range = 0;
+
+    --i;
+    Bit >>= 1;
+
+    while (i > 0 && (VFPRegSave & Bit)) {
+      --i;
+      ++Range;
+      Bit >>= 1;
+    }
+
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) |
+              Range);
+  }
+}
+
+/// Emit unwind opcodes to copy address from source register to $sp.
+void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
+  EmitInt8(ARM::EHABI::UNWIND_OPCODE_SET_VSP | Reg);
+}
+
+/// Emit unwind opcodes to add $sp with an offset.
+void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
+  if (Offset > 0x200) {
+    uint8_t Buff[16];
+    Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
+    size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
+    EmitBytes(Buff, ULEBSize + 1);
+  } else if (Offset > 0) {
+    if (Offset > 0x100) {
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
+      Offset -= 0x100;
+    }
+    EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP |
+             static_cast<uint8_t>((Offset - 4) >> 2));
+  } else if (Offset < 0) {
+    while (Offset < -0x100) {
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP | 0x3fu);
+      Offset += 0x100;
+    }
+    EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP |
+             static_cast<uint8_t>(((-Offset) - 4) >> 2));
+  }
+}
+
+void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
+                                     SmallVectorImpl<uint8_t> &Result) {
+
+  UnwindOpcodeStreamer OpStreamer(Result);
+
+  if (HasPersonality) {
+    // User-specifed personality routine: [ SIZE , OP1 , OP2 , ... ]
+    PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
+    size_t TotalSize = Ops.size() + 1;
+    size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+    Result.resize(RoundUpSize);
+    OpStreamer.EmitSize(RoundUpSize);
+  } else {
+    // If no personalityindex is specified, select ane
+    if (PersonalityIndex == ARM::EHABI::NUM_PERSONALITY_INDEX)
+      PersonalityIndex = (Ops.size() <= 3) ? ARM::EHABI::AEABI_UNWIND_CPP_PR0
+                                           : ARM::EHABI::AEABI_UNWIND_CPP_PR1;
+    if (PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0) {
+      // __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
+      assert(Ops.size() <= 3 && "too many opcodes for __aeabi_unwind_cpp_pr0");
+      Result.resize(4);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+    } else {
+      // __aeabi_unwind_cpp_pr{1,2}: [ {0x81,0x82} , SIZE , OP1 , OP2 , ... ]
+      size_t TotalSize = Ops.size() + 2;
+      size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+      Result.resize(RoundUpSize);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+      OpStreamer.EmitSize(RoundUpSize);
+    }
+  }
+
+  // Copy the unwind opcodes
+  for (size_t i = OpBegins.size() - 1; i > 0; --i)
+    for (size_t j = OpBegins[i - 1], end = OpBegins[i]; j < end; ++j)
+      OpStreamer.EmitByte(Ops[j]);
+
+  // Emit the padding finish opcodes if the size is not multiple of 4.
+  OpStreamer.FillFinishOpcode();
+
+  // Reset the assembler state
+  Reset();
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
new file mode 100644
index 0000000..cd58759
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -0,0 +1,93 @@
+//===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the unwind opcode assmebler for ARM exception handling
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_UNWIND_OP_ASM_H
+#define ARM_UNWIND_OP_ASM_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCSymbol;
+
+class UnwindOpcodeAssembler {
+private:
+  llvm::SmallVector<uint8_t, 32> Ops;
+  llvm::SmallVector<unsigned, 8> OpBegins;
+  bool HasPersonality;
+
+public:
+  UnwindOpcodeAssembler()
+      : HasPersonality(0) {
+    OpBegins.push_back(0);
+  }
+
+  /// Reset the unwind opcode assembler.
+  void Reset() {
+    Ops.clear();
+    OpBegins.clear();
+    OpBegins.push_back(0);
+    HasPersonality = 0;
+  }
+
+  /// Set the personality
+  void setPersonality(const MCSymbol *Per) {
+    HasPersonality = 1;
+  }
+
+  /// Emit unwind opcodes for .save directives
+  void EmitRegSave(uint32_t RegSave);
+
+  /// Emit unwind opcodes for .vsave directives
+  void EmitVFPRegSave(uint32_t VFPRegSave);
+
+  /// Emit unwind opcodes to copy address from source register to $sp.
+  void EmitSetSP(uint16_t Reg);
+
+  /// Emit unwind opcodes to add $sp with an offset.
+  void EmitSPOffset(int64_t Offset);
+
+  /// Emit unwind raw opcodes
+  void EmitRaw(const SmallVectorImpl<uint8_t> &Opcodes) {
+    Ops.insert(Ops.end(), Opcodes.begin(), Opcodes.end());
+    OpBegins.push_back(OpBegins.back() + Opcodes.size());
+  }
+
+  /// Finalize the unwind opcode sequence for EmitBytes()
+  void Finalize(unsigned &PersonalityIndex,
+                SmallVectorImpl<uint8_t> &Result);
+
+private:
+  void EmitInt8(unsigned Opcode) {
+    Ops.push_back(Opcode & 0xff);
+    OpBegins.push_back(OpBegins.back() + 1);
+  }
+
+  void EmitInt16(unsigned Opcode) {
+    Ops.push_back((Opcode >> 8) & 0xff);
+    Ops.push_back(Opcode & 0xff);
+    OpBegins.push_back(OpBegins.back() + 2);
+  }
+
+  void EmitBytes(const uint8_t *Opcode, size_t Size) {
+    Ops.insert(Ops.end(), Opcode, Opcode + Size);
+    OpBegins.push_back(OpBegins.back() + Size);
+  }
+};
+
+} // namespace llvm
+
+#endif // ARM_UNWIND_OP_ASM_H
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..d31f1f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -0,0 +1,82 @@
+//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+  ARMWinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) {
+    assert(!Is64Bit && "AArch64 support not yet implemented");
+  }
+  virtual ~ARMWinCOFFObjectWriter() { }
+
+  unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsCrossSection) const override;
+
+  bool recordRelocation(const MCFixup &) const override;
+};
+
+unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsCrossSection) const {
+  assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT &&
+         "AArch64 support not yet implemented");
+
+  MCSymbolRefExpr::VariantKind Modifier =
+    Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  switch (static_cast<unsigned>(Fixup.getKind())) {
+  default: llvm_unreachable("unsupported relocation type");
+  case FK_Data_4:
+    switch (Modifier) {
+    case MCSymbolRefExpr::VK_COFF_IMGREL32:
+      return COFF::IMAGE_REL_ARM_ADDR32NB;
+    case MCSymbolRefExpr::VK_SECREL:
+      return COFF::IMAGE_REL_ARM_SECREL;
+    default:
+      return COFF::IMAGE_REL_ARM_ADDR32;
+    }
+  case FK_SecRel_2:
+    return COFF::IMAGE_REL_ARM_SECTION;
+  case FK_SecRel_4:
+    return COFF::IMAGE_REL_ARM_SECREL;
+  case ARM::fixup_t2_condbranch:
+    return COFF::IMAGE_REL_ARM_BRANCH20T;
+  case ARM::fixup_t2_uncondbranch:
+    return COFF::IMAGE_REL_ARM_BRANCH24T;
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    return COFF::IMAGE_REL_ARM_BLX23T;
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+    return COFF::IMAGE_REL_ARM_MOV32T;
+  }
+}
+
+bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
+  return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16;
+}
+}
+
+namespace llvm {
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(MOTW, OS);
+}
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
new file mode 100644
index 0000000..b344ced
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -0,0 +1,46 @@
+//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+  ARMWinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+                     raw_ostream &OS)
+    : MCWinCOFFStreamer(C, AB, CE, OS) { }
+
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
+  void EmitThumbFunc(MCSymbol *Symbol) override;
+};
+
+void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  default: llvm_unreachable("not implemented");
+  case MCAF_SyntaxUnified:
+  case MCAF_Code16:
+    break;
+  }
+}
+
+void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+  getAssembler().setIsThumbFunc(Symbol);
+}
+}
+
+namespace llvm {
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     MCCodeEmitter &Emitter, raw_ostream &OS) {
+  return new ARMWinCOFFStreamer(Context, MAB, Emitter, OS);
+}
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 0000000..f6d24e9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,397 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiple and add / sub instructions) when special VMLx hazards are detected.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mlx-expansion"
+
+static cl::opt<bool>
+ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
+
+STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
+
+namespace {
+  struct MLxExpansion : public MachineFunctionPass {
+    static char ID;
+    MLxExpansion() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "ARM MLA / MLS expansion pass";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool isLikeA9;
+    bool isSwift;
+    unsigned MIIdx;
+    MachineInstr* LastMIs[4];
+    SmallPtrSet<MachineInstr*, 4> IgnoreStall;
+
+    void clearStack();
+    void pushStack(MachineInstr *MI);
+    MachineInstr *getAccDefMI(MachineInstr *MI) const;
+    unsigned getDefReg(MachineInstr *MI) const;
+    bool hasLoopHazard(MachineInstr *MI) const;
+    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool FindMLxHazard(MachineInstr *MI);
+    void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                unsigned MulOpc, unsigned AddSubOpc,
+                                bool NegAcc, bool HasLane);
+    bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
+  };
+  char MLxExpansion::ID = 0;
+}
+
+void MLxExpansion::clearStack() {
+  std::fill(LastMIs, LastMIs + 4, nullptr);
+  MIIdx = 0;
+}
+
+void MLxExpansion::pushStack(MachineInstr *MI) {
+  LastMIs[MIIdx] = MI;
+  if (++MIIdx == 4)
+    MIIdx = 0;
+}
+
+MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
+  // Look past COPY and INSERT_SUBREG instructions to find the
+  // real definition MI. This is important for _sfp instructions.
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return nullptr;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+    if (DefMI->getParent() != MBB)
+      break;
+    if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+    break;
+  }
+  return DefMI;
+}
+
+unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return Reg;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *UseMI = &*MRI->use_instr_nodbg_begin(Reg);
+  if (UseMI->getParent() != MBB)
+    return Reg;
+
+  while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+        !MRI->hasOneNonDBGUse(Reg))
+      return Reg;
+    UseMI = &*MRI->use_instr_nodbg_begin(Reg);
+    if (UseMI->getParent() != MBB)
+      return Reg;
+  }
+
+  return Reg;
+}
+
+/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
+/// a single-MBB loop.
+bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return false;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+outer_continue:
+    if (DefMI->getParent() != MBB)
+      break;
+
+    if (DefMI->isPHI()) {
+      for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
+        if (DefMI->getOperand(i + 1).getMBB() == MBB) {
+          unsigned SrcReg = DefMI->getOperand(i).getReg();
+          if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+            DefMI = MRI->getVRegDef(SrcReg);
+            goto outer_continue;
+          }
+        }
+      }
+    } else if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+
+    break;
+  }
+
+  return DefMI == MI;
+}
+
+bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+  // FIXME: Detect integer instructions properly.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(Reg, TRI);
+  return false;
+}
+
+static bool isFpMulInstruction(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::VMULS:
+  case ARM::VMULfd:
+  case ARM::VMULfq:
+  case ARM::VMULD:
+  case ARM::VMULslfd:
+  case ARM::VMULslfq:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
+  if (NumExpand >= ExpandLimit)
+    return false;
+
+  if (ForceExapnd)
+    return true;
+
+  MachineInstr *DefMI = getAccDefMI(MI);
+  if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
+    // r0 = vmla
+    // r3 = vmla r0, r1, r2
+    // takes 16 - 17 cycles
+    //
+    // r0 = vmla
+    // r4 = vmul r1, r2
+    // r3 = vadd r0, r4
+    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    IgnoreStall.insert(DefMI);
+    return true;
+  }
+
+  // On Swift, we mostly care about hazards from multiplication instructions
+  // writing the accumulator and the pipelining of loop iterations by out-of-
+  // order execution. 
+  if (isSwift)
+    return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
+  if (IgnoreStall.count(MI))
+    return false;
+
+  // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
+  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+  // preserves the in-order retirement of the instructions.
+  // Look at the next few instructions, if *most* of them can cause hazards,
+  // then the scheduler can't *fix* this, we'd better break up the VMLA.
+  unsigned Limit1 = isLikeA9 ? 1 : 4;
+  unsigned Limit2 = isLikeA9 ? 1 : 4;
+  for (unsigned i = 1; i <= 4; ++i) {
+    int Idx = ((int)MIIdx - i + 4) % 4;
+    MachineInstr *NextMI = LastMIs[Idx];
+    if (!NextMI)
+      continue;
+
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+      if (i <= Limit1)
+        return true;
+    }
+
+    // Look for VMLx RAW hazard.
+    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void
+MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                     unsigned MulOpc, unsigned AddSubOpc,
+                                     bool NegAcc, bool HasLane) {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  bool DstDead = MI->getOperand(0).isDead();
+  unsigned AccReg = MI->getOperand(1).getReg();
+  unsigned Src1Reg = MI->getOperand(2).getReg();
+  unsigned Src2Reg = MI->getOperand(3).getReg();
+  bool Src1Kill = MI->getOperand(2).isKill();
+  bool Src2Kill = MI->getOperand(3).isKill();
+  unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
+  unsigned NextOp = HasLane ? 5 : 4;
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
+  unsigned PredReg = MI->getOperand(++NextOp).getReg();
+
+  const MCInstrDesc &MCID1 = TII->get(MulOpc);
+  const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  unsigned TmpReg = MRI->createVirtualRegister(
+                      TII->getRegClass(MCID1, 0, TRI, MF));
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
+    .addReg(Src1Reg, getKillRegState(Src1Kill))
+    .addReg(Src2Reg, getKillRegState(Src2Kill));
+  if (HasLane)
+    MIB.addImm(LaneImm);
+  MIB.addImm(Pred).addReg(PredReg);
+
+  MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2)
+    .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
+
+  if (NegAcc) {
+    bool AccKill = MRI->hasOneNonDBGUse(AccReg);
+    MIB.addReg(TmpReg, getKillRegState(true))
+       .addReg(AccReg, getKillRegState(AccKill));
+  } else {
+    MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
+  }
+  MIB.addImm(Pred).addReg(PredReg);
+
+  DEBUG({
+      dbgs() << "Expanding: " << *MI;
+      dbgs() << "  to:\n";
+      MachineBasicBlock::iterator MII = MI;
+      MII = std::prev(MII);
+      MachineInstr &MI2 = *MII;
+      MII = std::prev(MII);
+      MachineInstr &MI1 = *MII;
+      dbgs() << "    " << MI1;
+      dbgs() << "    " << MI2;
+   });
+
+  MI->eraseFromParent();
+  ++NumExpand;
+}
+
+bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  clearStack();
+  IgnoreStall.clear();
+
+  unsigned Skip = 0;
+  MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
+  while (MII != E) {
+    MachineInstr *MI = &*MII;
+
+    if (MI->isPosition() || MI->isImplicitDef() || MI->isCopy()) {
+      ++MII;
+      continue;
+    }
+
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MI->isBarrier()) {
+      clearStack();
+      Skip = 0;
+      ++MII;
+      continue;
+    }
+
+    unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+    if (Domain == ARMII::DomainGeneral) {
+      if (++Skip == 2)
+        // Assume dual issues of non-VFP / NEON instructions.
+        pushStack(nullptr);
+    } else {
+      Skip = 0;
+
+      unsigned MulOpc, AddSubOpc;
+      bool NegAcc, HasLane;
+      if (!TII->isFpMLxInstruction(MCID.getOpcode(),
+                                   MulOpc, AddSubOpc, NegAcc, HasLane) ||
+          !FindMLxHazard(MI))
+        pushStack(MI);
+      else {
+        ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
+        E = MBB.rend(); // May have changed if MI was the 1st instruction.
+        Changed = true;
+        continue;
+      }
+    }
+
+    ++MII;
+  }
+
+  return Changed;
+}
+
+bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
+  TRI = Fn.getTarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  isLikeA9 = STI->isLikeA9() || STI->isSwift();
+  isSwift = STI->isSwift();
+
+  bool Modified = false;
+  for (MachineBasicBlock &MBB : Fn)
+    Modified |= ExpandFPMLxInstructions(MBB);
+
+  return Modified;
+}
+
+FunctionPass *llvm::createMLxExpansionPass() {
+  return new MLxExpansion();
+}
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 0000000..e464671
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,28 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheARMLETarget,   llvm::TheARMBETarget;
+Target llvm::TheThumbLETarget, llvm::TheThumbBETarget;
+
+extern "C" void LLVMInitializeARMTargetInfo() { 
+  RegisterTarget<Triple::arm, /*HasJIT=*/true>
+    X(TheARMLETarget, "arm", "ARM");
+  RegisterTarget<Triple::armeb, /*HasJIT=*/true>
+    Y(TheARMBETarget, "armeb", "ARM (big endian)");
+
+  RegisterTarget<Triple::thumb, /*HasJIT=*/true>
+    A(TheThumbLETarget, "thumb", "Thumb");
+  RegisterTarget<Triple::thumbeb, /*HasJIT=*/true>
+    B(TheThumbBETarget, "thumbeb", "Thumb (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
new file mode 100644
index 0000000..baa97a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -0,0 +1,489 @@
+//===-- Thumb1FrameLowering.cpp - Thumb1 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb1 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1FrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti)
+    : ARMFrameLowering(sti) {}
+
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static void
+emitSPUpdate(MachineBasicBlock &MBB,
+             MachineBasicBlock::iterator &MBBI,
+             const TargetInstrInfo &TII, DebugLoc dl,
+             const Thumb1RegisterInfo &MRI,
+             int NumBytes, unsigned MIFlags = MachineInstr::NoFlags)  {
+  emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, MIFlags);
+}
+
+
+void Thumb1FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+      } else {
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
+  unsigned NumBytes = MFI->getStackSize();
+  assert(NumBytes >= ArgRegsSaveSize &&
+         "ArgRegsSaveSize is included in NumBytes");
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned BasePtr = RegInfo->getBaseRegister();
+  int CFAOffset = 0;
+
+  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+  NumBytes = (NumBytes + 3) & ~3;
+  MFI->setStackSize(NumBytes);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (ArgRegsSaveSize) {
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+                 MachineInstr::FrameSetup);
+    CFAOffset -= ArgRegsSaveSize;
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0) {
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
+                   MachineInstr::FrameSetup);
+      CFAOffset -= NumBytes - ArgRegsSaveSize;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (STI.isTargetMachO()) {
+        GPRCS2Size += 4;
+        break;
+      }
+      // fallthrough
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      GPRCS1Size += 4;
+      break;
+    default:
+      DPRCSSize += 8;
+    }
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+    if (MBBI != MBB.end())
+      dl = MBBI->getDebugLoc();
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - ArgRegsSaveSize - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  bool HasFP = hasFP(MF);
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  NumBytes = DPRCSOffset;
+
+  int FramePtrOffsetInBlock = 0;
+  unsigned adjustedGPRCS1Size = GPRCS1Size;
+  if (tryFoldSPUpdateIntoPushPop(STI, MF, std::prev(MBBI), NumBytes)) {
+    FramePtrOffsetInBlock = NumBytes;
+    adjustedGPRCS1Size += NumBytes;
+    NumBytes = 0;
+  }
+
+  if (adjustedGPRCS1Size) {
+    CFAOffset -= adjustedGPRCS1Size;
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+  for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+         E = CSI.end(); I != E; ++I) {
+    unsigned Reg = I->getReg();
+    int FI = I->getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12:
+      if (STI.isTargetMachO())
+        break;
+      // fallthough
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+      break;
+    }
+  }
+
+
+  // Adjust FP so it point to the stack slot that contains the previous FP.
+  if (HasFP) {
+    FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI)
+		                     + GPRCS1Size + ArgRegsSaveSize;
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+      .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
+      .setMIFlags(MachineInstr::FrameSetup));
+    if(FramePtrOffsetInBlock) {
+      CFAOffset += FramePtrOffsetInBlock;
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else {
+      unsigned CFIIndex =
+          MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+    if (NumBytes > 508)
+      // If offset is > 508 then sp cannot be adjusted in a single instruction,
+      // try restoring from fp instead.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                 MachineInstr::FrameSetup);
+    if (!HasFP) {
+      CFAOffset -= NumBytes;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  if (STI.isTargetELF() && HasFP)
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // Thumb1 does not currently support dynamic stack realignment.  Report a
+  // fatal error rather then silently generate bad code.
+  if (RegInfo->needsStackRealignment(MF))
+      report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF))
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+                   .addReg(ARM::SP));
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+  if (MI->getOpcode() == ARM::tLDRspi &&
+      MI->getOperand(1).isFI() &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
+    return true;
+  else if (MI->getOpcode() == ARM::tPOP) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  return false;
+}
+
+void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert((MBBI->getOpcode() == ARM::tBX_RET ||
+          MBBI->getOpcode() == ARM::tPOP_RET) &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
+  int NumBytes = (int)MFI->getStackSize();
+  assert((unsigned)NumBytes >= ArgRegsSaveSize &&
+         "ArgRegsSaveSize is included in NumBytes");
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
+      if (!isCSRestore(MBBI, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize() +
+                 ArgRegsSaveSize);
+
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on frame pointer only if the stack frame extends beyond
+      // frame pointer stack slot, the target is ELF and the function has FP, or
+      // the target uses var sized objects.
+      if (NumBytes) {
+        assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+               "No scratch register to restore SP from FP!");
+        emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                  TII, *RegInfo);
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                               ARM::SP)
+          .addReg(ARM::R4));
+      } else
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                               ARM::SP)
+          .addReg(FramePtr));
+    } else {
+      if (MBBI->getOpcode() == ARM::tBX_RET &&
+          &MBB.front() != MBBI &&
+          std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+        MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
+        if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes))
+          emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+      } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
+        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+    }
+  }
+
+  if (ArgRegsSaveSize) {
+    // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+    // to LR, and we can't pop the value directly to the PC since
+    // we need to update the SP after popping the value. Therefore, we
+    // pop the old LR into R3 as a temporary.
+
+    // Get the last instruction, tBX_RET
+    MBBI = MBB.getLastNonDebugInstr();
+    assert (MBBI->getOpcode() == ARM::tBX_RET);
+    // Epilogue for vararg functions: pop LR to R3 and branch off it.
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+      .addReg(ARM::R3, RegState::Define);
+
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
+      .addReg(ARM::R3, RegState::Kill);
+    AddDefaultPred(MIB);
+    MIB.copyImplicitOps(&*MBBI);
+    // erase the old tBX_RET instruction
+    MBB.erase(MBBI);
+  }
+}
+
+bool Thumb1FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
+  AddDefaultPred(MIB);
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    bool isKill = true;
+
+    // Add the callee-saved register as live-in unless it's LR and
+    // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress
+    // then it's already added to the function and entry block live-in sets.
+    if (Reg == ARM::LR) {
+      MachineFunction &MF = *MBB.getParent();
+      if (MF.getFrameInfo()->isReturnAddressTaken() &&
+          MF.getRegInfo().isLiveIn(Reg))
+        isKill = false;
+    }
+
+    if (isKill)
+      MBB.addLiveIn(Reg);
+
+    MIB.addReg(Reg, getKillRegState(isKill));
+  }
+  MIB.setMIFlags(MachineInstr::FrameSetup);
+  return true;
+}
+
+bool Thumb1FrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  bool isVarArg = AFI->getArgRegsSaveSize() > 0;
+  DebugLoc DL = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
+  AddDefaultPred(MIB);
+
+  bool NumRegs = false;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (Reg == ARM::LR) {
+      // Special epilogue for vararg functions. See emitEpilogue
+      if (isVarArg)
+        continue;
+      Reg = ARM::PC;
+      (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+      MIB.copyImplicitOps(&*MI);
+      MI = MBB.erase(MI);
+    }
+    MIB.addReg(Reg, getDefRegState(true));
+    NumRegs = true;
+  }
+
+  // It's illegal to emit pop instruction without operands.
+  if (NumRegs)
+    MBB.insert(MI, &*MIB);
+  else
+    MF.DeleteMachineInstr(MIB);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
new file mode 100644
index 0000000..a227f8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -0,0 +1,52 @@
+//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM_THUMB1FRAMELOWERING_H
+#define LLVM_ARM_THUMB1FRAMELOWERING_H
+
+#include "ARMFrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class Thumb1FrameLowering : public ARMFrameLowering {
+public:
+  explicit Thumb1FrameLowering(const ARMSubtarget &sti);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
new file mode 100644
index 0000000..68cbb5c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -0,0 +1,103 @@
+//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1InstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(STI) {
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(ARM::tMOVr);
+  NopInst.addOperand(MCOperand::CreateReg(ARM::R8));
+  NopInst.addOperand(MCOperand::CreateReg(ARM::R8));
+  NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+  NopInst.addOperand(MCOperand::CreateReg(0));
+}
+
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  return 0;
+}
+
+void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc)));
+  assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
+         "Thumb1 can only copy GPR registers");
+}
+
+void Thumb1InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+           isARMLowRegister(SrcReg))) && "Unknown regclass!");
+
+  if (RC == &ARM::tGPRRegClass ||
+      (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+       isARMLowRegister(SrcReg))) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              MachineMemOperand::MOStore,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
+
+void Thumb1InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+           isARMLowRegister(DestReg))) && "Unknown regclass!");
+
+  if (RC == &ARM::tGPRRegClass ||
+      (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+       isARMLowRegister(DestReg))) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              MachineMemOperand::MOLoad,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
new file mode 100644
index 0000000..c5845b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -0,0 +1,60 @@
+//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB1INSTRUCTIONINFO_H
+#define THUMB1INSTRUCTIONINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class Thumb1InstrInfo : public ARMBaseInstrInfo {
+  Thumb1RegisterInfo RI;
+public:
+  explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const Thumb1RegisterInfo &getRegisterInfo() const override { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+};
+}
+
+#endif // THUMB1INSTRUCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
new file mode 100644
index 0000000..f907b14
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -0,0 +1,677 @@
+//===-- Thumb1RegisterInfo.cpp - Thumb-1 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1RegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+extern cl::opt<bool> ReuseFrameIndexVals;
+}
+
+using namespace llvm;
+
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(sti) {
+}
+
+const TargetRegisterClass*
+Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                         const {
+  if (ARM::tGPRRegClass.hasSubClassEq(RC))
+    return &ARM::tGPRRegClass;
+  return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC);
+}
+
+const TargetRegisterClass *
+Thumb1RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &ARM::tGPRRegClass;
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void
+Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      DebugLoc dl,
+                                      unsigned DestReg, unsigned SubIdx,
+                                      int Val,
+                                      ARMCC::CondCodes Pred, unsigned PredReg,
+                                      unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C = ConstantInt::get(
+          Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg)
+    .setMIFlags(MIFlags);
+}
+
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
+/// in a register using mov / mvn sequences or load the immediate from a
+/// constpool entry.
+static
+void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              DebugLoc dl,
+                              unsigned DestReg, unsigned BaseReg,
+                              int NumBytes, bool CanChangeCC,
+                              const TargetInstrInfo &TII,
+                              const ARMBaseRegisterInfo& MRI,
+                              unsigned MIFlags = MachineInstr::NoFlags) {
+    MachineFunction &MF = *MBB.getParent();
+    bool isHigh = !isARMLowRegister(DestReg) ||
+                  (BaseReg != 0 && !isARMLowRegister(BaseReg));
+    bool isSub = false;
+    // Subtract doesn't have high register version. Load the negative value
+    // if either base or dest register is a high register. Also, if do not
+    // issue sub as part of the sequence if condition register is to be
+    // preserved.
+    if (NumBytes < 0 && !isHigh && CanChangeCC) {
+      isSub = true;
+      NumBytes = -NumBytes;
+    }
+    unsigned LdReg = DestReg;
+    if (DestReg == ARM::SP) {
+      assert(BaseReg == ARM::SP && "Unexpected!");
+      LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+    }
+
+    if (NumBytes <= 255 && NumBytes >= 0)
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes).setMIFlags(MIFlags);
+    else if (NumBytes < 0 && NumBytes >= -255) {
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes).setMIFlags(MIFlags);
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+        .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags);
+    } else
+      MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes,
+                            ARMCC::AL, 0, MIFlags);
+
+    // Emit add / sub.
+    int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+    if (Opc != ARM::tADDhirr)
+      MIB = AddDefaultT1CC(MIB);
+    if (DestReg == ARM::SP || isSub)
+      MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+    else
+      MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+    AddDefaultPred(MIB);
+}
+
+/// calcNumMI - Returns the number of instructions required to materialize
+/// the specific add / sub r, c instruction.
+static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
+                          unsigned NumBits, unsigned Scale) {
+  unsigned NumMIs = 0;
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+
+  if (Opc == ARM::tADDrSPi) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    NumMIs++;
+    NumBits = 8;
+    Scale = 1;  // Followed by a number of tADDi8.
+    Chunk = ((1 << NumBits) - 1) * Scale;
+  }
+
+  NumMIs += Bytes / Chunk;
+  if ((Bytes % Chunk) != 0)
+    NumMIs++;
+  if (ExtraOpc)
+    NumMIs++;
+  return NumMIs;
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code.
+void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     DebugLoc dl,
+                                     unsigned DestReg, unsigned BaseReg,
+                                     int NumBytes, const TargetInstrInfo &TII,
+                                     const ARMBaseRegisterInfo& MRI,
+                                     unsigned MIFlags) {
+  bool isSub = NumBytes < 0;
+  unsigned Bytes = (unsigned)NumBytes;
+  if (isSub) Bytes = -NumBytes;
+  bool isMul4 = (Bytes & 3) == 0;
+  bool isTwoAddr = false;
+  bool DstNotEqBase = false;
+  unsigned NumBits = 1;
+  unsigned Scale = 1;
+  int Opc = 0;
+  int ExtraOpc = 0;
+  bool NeedCC = false;
+
+  if (DestReg == BaseReg && BaseReg == ARM::SP) {
+    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+    NumBits = 7;
+    Scale = 4;
+    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    isTwoAddr = true;
+  } else if (!isSub && BaseReg == ARM::SP) {
+    // r1 = add sp, 403
+    // =>
+    // r1 = add sp, 100 * 4
+    // r1 = add r1, 3
+    if (!isMul4) {
+      Bytes &= ~3;
+      ExtraOpc = ARM::tADDi3;
+    }
+    NumBits = 8;
+    Scale = 4;
+    Opc = ARM::tADDrSPi;
+  } else {
+    // sp = sub sp, c
+    // r1 = sub sp, c
+    // r8 = sub sp, c
+    if (DestReg != BaseReg)
+      DstNotEqBase = true;
+    NumBits = 8;
+    if (DestReg == ARM::SP) {
+      Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+      assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+      NumBits = 7;
+      Scale = 4;
+    } else {
+      Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+      NumBits = 8;
+      NeedCC = true;
+    }
+    isTwoAddr = true;
+  }
+
+  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+  unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
+  if (NumMIs > Threshold) {
+    // This will expand into too many instructions. Load the immediate from a
+    // constpool entry.
+    emitThumbRegPlusImmInReg(MBB, MBBI, dl,
+                             DestReg, BaseReg, NumBytes, true,
+                             TII, MRI, MIFlags);
+    return;
+  }
+
+  if (DstNotEqBase) {
+    if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
+      // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
+      unsigned Chunk = (1 << 3) - 1;
+      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+      Bytes -= ThisVal;
+      const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
+      const MachineInstrBuilder MIB =
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)
+                         .setMIFlags(MIFlags));
+      AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
+    } else {
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+        .addReg(BaseReg, RegState::Kill))
+        .setMIFlags(MIFlags);
+    }
+    BaseReg = DestReg;
+  }
+
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+  while (Bytes) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    ThisVal /= Scale;
+    // Build the new tADD / tSUB.
+    if (isTwoAddr) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+      if (NeedCC)
+        MIB = AddDefaultT1CC(MIB);
+      MIB.addReg(DestReg).addImm(ThisVal);
+      MIB = AddDefaultPred(MIB);
+      MIB.setMIFlags(MIFlags);
+    } else {
+      bool isKill = BaseReg != ARM::SP;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+      if (NeedCC)
+        MIB = AddDefaultT1CC(MIB);
+      MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+      MIB = AddDefaultPred(MIB);
+      MIB.setMIFlags(MIFlags);
+
+      BaseReg = DestReg;
+      if (Opc == ARM::tADDrSPi) {
+        // r4 = add sp, imm
+        // r4 = add r4, imm
+        // ...
+        NumBits = 8;
+        Scale = 1;
+        Chunk = ((1 << NumBits) - 1) * Scale;
+        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+        NeedCC = isTwoAddr = true;
+      }
+    }
+  }
+
+  if (ExtraOpc) {
+    const MCInstrDesc &MCID = TII.get(ExtraOpc);
+    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
+                   .addReg(DestReg, RegState::Kill)
+                   .addImm(((unsigned)NumBytes) & 3)
+                   .setMIFlags(MIFlags));
+  }
+}
+
+/// emitThumbConstant - Emit a series of instructions to materialize a
+/// constant.
+static void emitThumbConstant(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned DestReg, int Imm,
+                              const TargetInstrInfo &TII,
+                              const Thumb1RegisterInfo& MRI,
+                              DebugLoc dl) {
+  bool isSub = Imm < 0;
+  if (isSub) Imm = -Imm;
+
+  int Chunk = (1 << 8) - 1;
+  int ThisVal = (Imm > Chunk) ? Chunk : Imm;
+  Imm -= ThisVal;
+  AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8),
+                                        DestReg))
+                 .addImm(ThisVal));
+  if (Imm > 0)
+    emitThumbRegPlusImmediate(MBB, MBBI, dl, DestReg, DestReg, Imm, TII, MRI);
+  if (isSub) {
+    const MCInstrDesc &MCID = TII.get(ARM::tRSB);
+    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
+                   .addReg(DestReg, RegState::Kill));
+  }
+}
+
+static void removeOperands(MachineInstr &MI, unsigned i) {
+  unsigned Op = i;
+  for (unsigned e = MI.getNumOperands(); i != e; ++i)
+    MI.RemoveOperand(Op);
+}
+
+/// convertToNonSPOpcode - Change the opcode to the non-SP version, because
+/// we're replacing the frame index with a non-SP register.
+static unsigned convertToNonSPOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::tLDRspi:
+    return ARM::tLDRi;
+
+  case ARM::tSTRspi:
+    return ARM::tSTRi;
+  }
+
+  return Opcode;
+}
+
+bool Thumb1RegisterInfo::
+rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+                  unsigned FrameReg, int &Offset,
+                  const ARMBaseInstrInfo &TII) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+
+  if (Opcode == ARM::tADDrSPi) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    // Can't use tADDrSPi if it's based off the frame pointer.
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (FrameReg != ARM::SP) {
+      Opcode = ARM::tADDi3;
+      NumBits = 3;
+    } else {
+      NumBits = 8;
+      Scale = 4;
+      assert((Offset & 3) == 0 &&
+             "Thumb add/sub sp, #imm immediate must be multiple of 4!");
+    }
+
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      // Remove offset
+      MI.RemoveOperand(FrameRegIdx+1);
+      return true;
+    }
+
+    // Common case: small offset, fits into instruction.
+    unsigned Mask = (1 << NumBits) - 1;
+    if (((Offset / Scale) & ~Mask) == 0) {
+      // Replace the FrameIndex with sp / fp
+      if (Opcode == ARM::tADDi3) {
+        MI.setDesc(TII.get(Opcode));
+        removeOperands(MI, FrameRegIdx);
+        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
+                       .addImm(Offset / Scale));
+      } else {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset / Scale);
+      }
+      return true;
+    }
+
+    unsigned DestReg = MI.getOperand(0).getReg();
+    unsigned Bytes = (Offset > 0) ? Offset : -Offset;
+    unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
+    // MI would expand into a large number of instructions. Don't try to
+    // simplify the immediate.
+    if (NumMIs > 2) {
+      emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+                                *this);
+      MBB.erase(II);
+      return true;
+    }
+
+    if (Offset > 0) {
+      // Translate r0 = add sp, imm to
+      // r0 = add sp, 255*4
+      // r0 = add r0, (imm - 255*4)
+      if (Opcode == ARM::tADDi3) {
+        MI.setDesc(TII.get(Opcode));
+        removeOperands(MI, FrameRegIdx);
+        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
+      } else {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Mask);
+      }
+      Offset = (Offset - Mask * Scale);
+      MachineBasicBlock::iterator NII = std::next(II);
+      emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII,
+                                *this);
+    } else {
+      // Translate r0 = add sp, -imm to
+      // r0 = -imm (this is then translated into a series of instructions)
+      // r0 = add r0, sp
+      emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
+
+      MI.setDesc(TII.get(ARM::tADDhirr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true);
+      MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false);
+    }
+    return true;
+  } else {
+    if (AddrMode != ARMII::AddrModeT1_s)
+      llvm_unreachable("Unsupported addressing mode!");
+
+    unsigned ImmIdx = FrameRegIdx + 1;
+    int InstrOffs = MI.getOperand(ImmIdx).getImm();
+    unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+    unsigned Scale = 4;
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!");
+
+    // Common case: small offset, fits into instruction.
+    MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with the frame register (e.g., sp).
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      ImmOp.ChangeToImmediate(ImmedOffset);
+
+      // If we're using a register where sp was stored, convert the instruction
+      // to the non-SP version.
+      unsigned NewOpc = convertToNonSPOpcode(Opcode);
+      if (NewOpc != Opcode && FrameReg != ARM::SP)
+        MI.setDesc(TII.get(NewOpc));
+
+      return true;
+    }
+
+    NumBits = 5;
+    Mask = (1 << NumBits) - 1;
+
+    // If this is a thumb spill / restore, we will be using a constpool load to
+    // materialize the offset.
+    if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+      ImmOp.ChangeToImmediate(0);
+    } else {
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask * Scale);
+    }
+  }
+
+  return Offset == 0;
+}
+
+void Thumb1RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                           int64_t Offset) const {
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(
+      MI.getParent()->getParent()->getTarget().getInstrInfo());
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+/// saveScavengerRegister - Spill the register so it can be used by the
+/// register scavenger. Return true.
+bool
+Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator &UseMI,
+                                          const TargetRegisterClass *RC,
+                                          unsigned Reg) const {
+  // Thumb1 can't use the emergency spill slot on the stack because
+  // ldr/str immediate offsets must be positive, and if we're referencing
+  // off the frame pointer (if, for example, there are alloca() calls in
+  // the function, the offset will be negative. Use R12 instead since that's
+  // a call clobbered register that we know won't be used in Thumb1 mode.
+  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
+  DebugLoc DL;
+  AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+    .addReg(ARM::R12, RegState::Define)
+    .addReg(Reg, RegState::Kill));
+
+  // The UseMI is where we would like to restore the register. If there's
+  // interference with R12 before then, however, we'll need to restore it
+  // before that instead and adjust the UseMI.
+  bool done = false;
+  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+    if (II->isDebugValue())
+      continue;
+    // If this instruction affects R12, adjust our restore point.
+    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = II->getOperand(i);
+      if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;
+      if (MO.getReg() == ARM::R12) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+    }
+  }
+  // Restore the register from R12
+  AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)).
+    addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+
+  return true;
+}
+
+void
+Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  unsigned VReg = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+
+  unsigned FrameReg = ARM::SP;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MF.getFrameInfo()->getStackSize() + SPAdj;
+
+  if (MF.getFrameInfo()->hasVarSizedObjects()) {
+    assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
+           "Unexpected");
+    // There are alloca()'s in this function, must reference off the frame
+    // pointer or base pointer instead.
+    if (!hasBasePointer(MF)) {
+      FrameReg = getFrameRegister(MF);
+      Offset -= AFI->getFramePtrSpillOffset();
+    } else
+      FrameReg = BasePtr;
+  }
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated.  That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+    assert(MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  // Special handling of dbg_value instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).  ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  assert(AFI->isThumbFunction() &&
+         "This eliminateFrameIndex only supports Thumb1!");
+  if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert(Offset && "This code isn't needed if offset already handled!");
+
+  unsigned Opcode = MI.getOpcode();
+
+  // Remove predicate first.
+  int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx != -1)
+    removeOperands(MI, PIdx);
+
+  if (MI.mayLoad()) {
+    // Use the destination register to materialize sp + offset.
+    unsigned TmpReg = MI.getOperand(0).getReg();
+    bool UseRR = false;
+    if (Opcode == ARM::tLDRspi) {
+      if (FrameReg == ARM::SP)
+        emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+                                 Offset, false, TII, *this);
+      else {
+        emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
+        UseRR = true;
+      }
+    } else {
+      emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+                                *this);
+    }
+
+    MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
+    MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true);
+    if (UseRR)
+      // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+      // register. The offset is already handled in the vreg value.
+      MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+                                                     false);
+  } else if (MI.mayStore()) {
+      VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+      bool UseRR = false;
+
+      if (Opcode == ARM::tSTRspi) {
+        if (FrameReg == ARM::SP)
+          emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+                                   Offset, false, TII, *this);
+        else {
+          emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+          UseRR = true;
+        }
+      } else
+        emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+                                  *this);
+      MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
+      MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true);
+      if (UseRR)
+        // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+        // register. The offset is already handled in the vreg value.
+        MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+                                                       false);
+  } else {
+    llvm_unreachable("Unexpected opcode!");
+  }
+
+  // Add predicate back if it's needed.
+  if (MI.isPredicable())
+    AddDefaultPred(MIB);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
new file mode 100644
index 0000000..0c0abbe
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -0,0 +1,63 @@
+//===- Thumb1RegisterInfo.h - Thumb-1 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB1REGISTERINFO_H
+#define THUMB1REGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+
+struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
+public:
+  Thumb1RegisterInfo(const ARMSubtarget &STI);
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  void
+  emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                    DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
+                    ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+                    unsigned MIFlags = MachineInstr::NoFlags) const override;
+
+  // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
+  // however much remains to be handled. Return 'true' if no further
+  // work is required.
+  bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII) const;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator &UseMI,
+                             const TargetRegisterClass *RC,
+                             unsigned Reg) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+};
+}
+
+#endif // THUMB1REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 0000000..edb9ff3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,282 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "thumb2-it"
+
+STATISTIC(NumITs,        "Number of IT blocks inserted");
+STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
+
+namespace {
+  class Thumb2ITBlockPass : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+
+    bool restrictIT;
+    const Thumb2InstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    ARMFunctionInfo *AFI;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "Thumb IT blocks insertion pass";
+    }
+
+  private:
+    bool MoveCopyOutOfITBlock(MachineInstr *MI,
+                              ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                              SmallSet<unsigned, 4> &Defs,
+                              SmallSet<unsigned, 4> &Uses);
+    bool InsertITInstructions(MachineBasicBlock &MBB);
+  };
+  char Thumb2ITBlockPass::ID = 0;
+}
+
+/// TrackDefUses - Tracking what registers are being defined and used by
+/// instructions in the IT block. This also tracks "dependencies", i.e. uses
+/// in the IT block that are defined before the IT instruction.
+static void TrackDefUses(MachineInstr *MI,
+                         SmallSet<unsigned, 4> &Defs,
+                         SmallSet<unsigned, 4> &Uses,
+                         const TargetRegisterInfo *TRI) {
+  SmallVector<unsigned, 4> LocalDefs;
+  SmallVector<unsigned, 4> LocalUses;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP)
+      continue;
+    if (MO.isUse())
+      LocalUses.push_back(Reg);
+    else
+      LocalDefs.push_back(Reg);
+  }
+
+  for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+    unsigned Reg = LocalUses[i];
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
+      Uses.insert(*Subreg);
+  }
+
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Reg = LocalDefs[i];
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
+      Defs.insert(*Subreg);
+    if (Reg == ARM::CPSR)
+      continue;
+  }
+}
+
+static bool isCopy(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case ARM::MOVr:
+  case ARM::MOVr_TC:
+  case ARM::tMOVr:
+  case ARM::t2MOVr:
+    return true;
+  }
+}
+
+bool
+Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
+                                      ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                                        SmallSet<unsigned, 4> &Defs,
+                                        SmallSet<unsigned, 4> &Uses) {
+  if (!isCopy(MI))
+    return false;
+  // llvm models select's as two-address instructions. That means a copy
+  // is inserted before a t2MOVccr, etc. If the copy is scheduled in
+  // between selects we would end up creating multiple IT blocks.
+  assert(MI->getOperand(0).getSubReg() == 0 &&
+         MI->getOperand(1).getSubReg() == 0 &&
+         "Sub-register indices still around?");
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+
+  // First check if it's safe to move it.
+  if (Uses.count(DstReg) || Defs.count(SrcReg))
+    return false;
+
+  // If the CPSR is defined by this copy, then we don't want to move it. E.g.,
+  // if we have:
+  //
+  //   movs  r1, r1
+  //   rsb   r1, 0
+  //   movs  r2, r2
+  //   rsb   r2, 0
+  //
+  // we don't want this to be converted to:
+  //
+  //   movs  r1, r1
+  //   movs  r2, r2
+  //   itt   mi
+  //   rsb   r1, 0
+  //   rsb   r2, 0
+  //
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MI->hasOptionalDef() &&
+      MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR)
+    return false;
+
+  // Then peek at the next instruction to see if it's predicated on CC or OCC.
+  // If not, then there is nothing to be gained by moving the copy.
+  MachineBasicBlock::iterator I = MI; ++I;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  while (I != E && I->isDebugValue())
+    ++I;
+  if (I != E) {
+    unsigned NPredReg = 0;
+    ARMCC::CondCodes NCC = getITInstrPredicate(I, NPredReg);
+    if (NCC == CC || NCC == OCC)
+      return true;
+  }
+  return false;
+}
+
+bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  SmallSet<unsigned, 4> Defs;
+  SmallSet<unsigned, 4> Uses;
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineInstr *MI = &*MBBI;
+    DebugLoc dl = MI->getDebugLoc();
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+    if (CC == ARMCC::AL) {
+      ++MBBI;
+      continue;
+    }
+
+    Defs.clear();
+    Uses.clear();
+    TrackDefUses(MI, Defs, Uses, TRI);
+
+    // Insert an IT instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+      .addImm(CC);
+
+    // Add implicit use of ITSTATE to IT block instructions.
+    MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                             true/*isImp*/, false/*isKill*/));
+
+    MachineInstr *LastITMI = MI;
+    MachineBasicBlock::iterator InsertPos = MIB;
+    ++MBBI;
+
+    // Form IT block.
+    ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+    unsigned Mask = 0, Pos = 3;
+
+    // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it
+    // is set: skip the loop
+    if (!restrictIT) {
+      // Branches, including tricky ones like LDM_RET, need to end an IT
+      // block so check the instruction we just put in the block.
+      for (; MBBI != E && Pos &&
+             (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+        if (MBBI->isDebugValue())
+          continue;
+
+        MachineInstr *NMI = &*MBBI;
+        MI = NMI;
+
+        unsigned NPredReg = 0;
+        ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+        if (NCC == CC || NCC == OCC) {
+          Mask |= (NCC & 1) << Pos;
+          // Add implicit use of ITSTATE.
+          NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                                 true/*isImp*/, false/*isKill*/));
+          LastITMI = NMI;
+        } else {
+          if (NCC == ARMCC::AL &&
+              MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+            --MBBI;
+            MBB.remove(NMI);
+            MBB.insert(InsertPos, NMI);
+            ++NumMovedInsts;
+            continue;
+          }
+          break;
+        }
+        TrackDefUses(NMI, Defs, Uses, TRI);
+        --Pos;
+      }
+    }
+
+    // Finalize IT mask.
+    Mask |= (1 << Pos);
+    // Tag along (firstcond[0] << 4) with the mask.
+    Mask |= (CC & 1) << 4;
+    MIB.addImm(Mask);
+
+    // Last instruction in IT block kills ITSTATE.
+    LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill();
+
+    // Finalize the bundle.
+    MachineBasicBlock::instr_iterator LI = LastITMI;
+    finalizeBundle(MBB, InsertPos.getInstrIterator(), std::next(LI));
+
+    Modified = true;
+    ++NumITs;
+  }
+
+  return Modified;
+}
+
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+  restrictIT = TM.getSubtarget<ARMSubtarget>().restrictIT();
+
+  if (!AFI->isThumbFunction())
+    return false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) {
+    MachineBasicBlock &MBB = *MFI;
+    ++MFI;
+    Modified |= InsertITInstructions(MBB);
+  }
+
+  if (Modified)
+    AFI->setHasITBlocks(true);
+
+  return Modified;
+}
+
+/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
+/// insertion pass.
+FunctionPass *llvm::createThumb2ITBlockPass() {
+  return new Thumb2ITBlockPass();
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
new file mode 100644
index 0000000..a9df006
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -0,0 +1,630 @@
+//===-- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2InstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
+           cl::desc("Use old-style Thumb2 if-conversion heuristics"),
+           cl::init(false));
+
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(STI) {
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(ARM::tHINT);
+  NopInst.addOperand(MCOperand::CreateImm(0));
+  NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+  NopInst.addOperand(MCOperand::CreateReg(0));
+}
+
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  // FIXME
+  return 0;
+}
+
+void
+Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                                         MachineBasicBlock *NewDest) const {
+  MachineBasicBlock *MBB = Tail->getParent();
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  if (!AFI->hasITBlocks()) {
+    TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+    return;
+  }
+
+  // If the first instruction of Tail is predicated, we may have to update
+  // the IT instruction.
+  unsigned PredReg = 0;
+  ARMCC::CondCodes CC = getInstrPredicate(Tail, PredReg);
+  MachineBasicBlock::iterator MBBI = Tail;
+  if (CC != ARMCC::AL)
+    // Expecting at least the t2IT instruction before it.
+    --MBBI;
+
+  // Actually replace the tail.
+  TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+
+  // Fix up IT.
+  if (CC != ARMCC::AL) {
+    MachineBasicBlock::iterator E = MBB->begin();
+    unsigned Count = 4; // At most 4 instructions in an IT block.
+    while (Count && MBBI != E) {
+      if (MBBI->isDebugValue()) {
+        --MBBI;
+        continue;
+      }
+      if (MBBI->getOpcode() == ARM::t2IT) {
+        unsigned Mask = MBBI->getOperand(1).getImm();
+        if (Count == 4)
+          MBBI->eraseFromParent();
+        else {
+          unsigned MaskOn = 1 << Count;
+          unsigned MaskOff = ~(MaskOn - 1);
+          MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn);
+        }
+        return;
+      }
+      --MBBI;
+      --Count;
+    }
+
+    // Ctrl flow can reach here if branch folding is run before IT block
+    // formation pass.
+  }
+}
+
+bool
+Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI) const {
+  while (MBBI->isDebugValue()) {
+    ++MBBI;
+    if (MBBI == MBB.end())
+      return false;
+  }
+
+  unsigned PredReg = 0;
+  return getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
+}
+
+void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  // Handle SPR, DPR, and QPR copies.
+  if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
+    return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
+
+  AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc)));
+}
+
+void Thumb2InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+
+  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+    // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
+    // gsub_0, but needs an extra constraint for gsub_1 (which could be sp
+    // otherwise).
+    MachineRegisterInfo *MRI = &MF.getRegInfo();
+    MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
+    AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+    AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+    MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+    AddDefaultPred(MIB);
+    return;
+  }
+
+  ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI);
+}
+
+void Thumb2InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+    // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
+    // gsub_0, but needs an extra constraint for gsub_1 (which could be sp
+    // otherwise).
+    MachineRegisterInfo *MRI = &MF.getRegInfo();
+    MRI->constrainRegClass(DestReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
+    AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+    AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+    MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+    AddDefaultPred(MIB);
+
+    if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+      MIB.addReg(DestReg, RegState::ImplicitDefine);
+    return;
+  }
+
+  ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
+}
+
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg, int NumBytes,
+                               ARMCC::CondCodes Pred, unsigned PredReg,
+                               const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+    return;
+  }
+
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  // If profitable, use a movw or movt to materialize the offset.
+  // FIXME: Use the scavenger to grab a scratch register.
+  if (DestReg != ARM::SP && DestReg != BaseReg &&
+      NumBytes >= 4096 &&
+      ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+    bool Fits = false;
+    if (NumBytes < 65536) {
+      // Use a movw to materialize the 16-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+        .addImm(NumBytes)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    } else if ((NumBytes & 0xffff) == 0) {
+      // Use a movt to materialize the 32-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+        .addReg(DestReg)
+        .addImm(NumBytes >> 16)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    }
+
+    if (Fits) {
+      if (isSub) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+          .addReg(BaseReg, RegState::Kill)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+          .addReg(DestReg, RegState::Kill)
+          .addReg(BaseReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      }
+      return;
+    }
+  }
+
+  while (NumBytes) {
+    unsigned ThisVal = NumBytes;
+    unsigned Opc = 0;
+    if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+      // mov sp, rn. Note t2MOVr cannot be used.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg)
+        .addReg(BaseReg).setMIFlags(MIFlags));
+      BaseReg = ARM::SP;
+      continue;
+    }
+
+    bool HasCCOut = true;
+    if (BaseReg == ARM::SP) {
+      // sub sp, sp, #imm7
+      if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+        assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+        Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+          .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
+        NumBytes = 0;
+        continue;
+      }
+
+      // sub rd, sp, so_imm
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = countLeadingZeros(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    } else {
+      assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else if (ThisVal < 4096) {
+        Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+        HasCCOut = false;
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = countLeadingZeros(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    }
+
+    // Build the new ADD / SUB.
+    MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+                     .addReg(BaseReg, RegState::Kill)
+                     .addImm(ThisVal)).setMIFlags(MIFlags);
+    if (HasCCOut)
+      AddDefaultCC(MIB);
+
+    BaseReg = DestReg;
+  }
+}
+
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi12:   return ARM::t2LDRi8;
+  case ARM::t2LDRHi12:  return ARM::t2LDRHi8;
+  case ARM::t2LDRBi12:  return ARM::t2LDRBi8;
+  case ARM::t2LDRSHi12: return ARM::t2LDRSHi8;
+  case ARM::t2LDRSBi12: return ARM::t2LDRSBi8;
+  case ARM::t2STRi12:   return ARM::t2STRi8;
+  case ARM::t2STRBi12:  return ARM::t2STRBi8;
+  case ARM::t2STRHi12:  return ARM::t2STRHi8;
+  case ARM::t2PLDi12:   return ARM::t2PLDi8;
+
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi8:   return ARM::t2LDRi12;
+  case ARM::t2LDRHi8:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBi8:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHi8: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBi8: return ARM::t2LDRSBi12;
+  case ARM::t2STRi8:   return ARM::t2STRi12;
+  case ARM::t2STRBi8:  return ARM::t2STRBi12;
+  case ARM::t2STRHi8:  return ARM::t2STRHi12;
+  case ARM::t2PLDi8:   return ARM::t2PLDi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRs:   return ARM::t2LDRi12;
+  case ARM::t2LDRHs:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBs:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHs: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBs: return ARM::t2LDRSBi12;
+  case ARM::t2STRs:   return ARM::t2STRi12;
+  case ARM::t2STRBs:  return ARM::t2STRBi12;
+  case ARM::t2STRHs:  return ARM::t2STRHi12;
+  case ARM::t2PLDs:   return ARM::t2PLDi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                               unsigned FrameReg, int &Offset,
+                               const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrModeT2_i12.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
+
+  if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      // Remove offset and remaining explicit predicate operands.
+      do MI.RemoveOperand(FrameRegIdx+1);
+      while (MI.getNumOperands() > FrameRegIdx+1);
+      MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+      AddDefaultPred(MIB);
+      return true;
+    }
+
+    bool HasCCOut = Opcode != ARM::t2ADDri12;
+
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::t2SUBri));
+    } else {
+      MI.setDesc(TII.get(ARM::t2ADDri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      // Add cc_out operand if the original instruction did not have one.
+      if (!HasCCOut)
+        MI.addOperand(MachineOperand::CreateReg(0, false));
+      Offset = 0;
+      return true;
+    }
+    // Another common case: imm12.
+    if (Offset < 4096 &&
+        (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) {
+      unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+      MI.setDesc(TII.get(NewOpc));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      // Remove the cc_out operand.
+      if (HasCCOut)
+        MI.RemoveOperand(MI.getNumOperands()-1);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, extract 8 adjacent bits from the immediate into this
+    // t2ADDri/t2SUBri.
+    unsigned RotAmt = countLeadingZeros<unsigned>(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+    // Add cc_out operand if the original instruction did not have one.
+    if (!HasCCOut)
+      MI.addOperand(MachineOperand::CreateReg(0, false));
+
+  } else {
+
+    // AddrMode4 and AddrMode6 cannot handle any offset.
+    if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+      return false;
+
+    // AddrModeT2_so cannot handle any offset. If there is no offset
+    // register then we change to an immediate version.
+    unsigned NewOpc = Opcode;
+    if (AddrMode == ARMII::AddrModeT2_so) {
+      unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+      if (OffsetReg != 0) {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        return Offset == 0;
+      }
+
+      MI.RemoveOperand(FrameRegIdx+1);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+      NewOpc = immediateOffsetOpcode(Opcode);
+      AddrMode = ARMII::AddrModeT2_i12;
+    }
+
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+      // i8 supports only negative, and i12 supports only positive, so
+      // based on Offset sign convert Opcode to the appropriate
+      // instruction
+      Offset += MI.getOperand(FrameRegIdx+1).getImm();
+      if (Offset < 0) {
+        NewOpc = negativeOffsetOpcode(Opcode);
+        NumBits = 8;
+        isSub = true;
+        Offset = -Offset;
+      } else {
+        NewOpc = positiveOffsetOpcode(Opcode);
+        NumBits = 12;
+      }
+    } else if (AddrMode == ARMII::AddrMode5) {
+      // VFP address mode.
+      const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+      int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+      if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      Offset += InstrOffs * 4;
+      assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+      if (Offset < 0) {
+        Offset = -Offset;
+        isSub = true;
+      }
+    } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
+      Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+      NumBits = 8;
+      // MCInst operand has already scaled value.
+      Scale = 1;
+      if (Offset < 0) {
+        isSub = true;
+        Offset = -Offset;
+      }
+    } else {
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    if (NewOpc != Opcode)
+      MI.setDesc(TII.get(NewOpc));
+
+    MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+    // Attempt to fold address computation
+    // Common case: small offset, fits into instruction.
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with fp/sp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode5)
+          // FIXME: Not consistent.
+          ImmedOffset |= 1 << NumBits;
+        else
+          ImmedOffset = -ImmedOffset;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, offset doesn't fit. Pull in what we can to simplify
+    ImmedOffset = ImmedOffset & Mask;
+    if (isSub) {
+      if (AddrMode == ARMII::AddrMode5)
+        // FIXME: Not consistent.
+        ImmedOffset |= 1 << NumBits;
+      else {
+        ImmedOffset = -ImmedOffset;
+        if (ImmedOffset == 0)
+          // Change the opcode back if the encoded offset is zero.
+          MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+      }
+    }
+    ImmOp.ChangeToImmediate(ImmedOffset);
+    Offset &= ~(Mask*Scale);
+  }
+
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
+
+ARMCC::CondCodes
+llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+    return ARMCC::AL;
+  return getInstrPredicate(MI, PredReg);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
new file mode 100644
index 0000000..34d45d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -0,0 +1,74 @@
+//===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB2INSTRUCTIONINFO_H
+#define THUMB2INSTRUCTIONINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "Thumb2RegisterInfo.h"
+
+namespace llvm {
+class ARMSubtarget;
+class ScheduleHazardRecognizer;
+
+class Thumb2InstrInfo : public ARMBaseInstrInfo {
+  Thumb2RegisterInfo RI;
+public:
+  explicit Thumb2InstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                               MachineBasicBlock *NewDest) const override;
+
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const Thumb2RegisterInfo &getRegisterInfo() const override { return RI; }
+};
+
+/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
+/// to llvm::getInstrPredicate except it returns AL for conditional branch
+/// instructions which are "predicated", but are not in IT blocks.
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+
+}
+
+#endif // THUMB2INSTRUCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
new file mode 100644
index 0000000..782d81f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -0,0 +1,53 @@
+//===-- Thumb2RegisterInfo.cpp - Thumb-2 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2RegisterInfo.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(sti) {
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void
+Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      DebugLoc dl,
+                                      unsigned DestReg, unsigned SubIdx,
+                                      int Val,
+                                      ARMCC::CondCodes Pred, unsigned PredReg,
+                                      unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C = ConstantInt::get(
+           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0)
+    .setMIFlags(MIFlags);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h
new file mode 100644
index 0000000..8a33e6c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -0,0 +1,38 @@
+//===- Thumb2RegisterInfo.h - Thumb-2 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB2REGISTERINFO_H
+#define THUMB2REGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+
+namespace llvm {
+
+class ARMSubtarget;
+
+struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
+public:
+  Thumb2RegisterInfo(const ARMSubtarget &STI);
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  void
+  emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                    DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
+                    ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+                    unsigned MIFlags = MachineInstr::NoFlags) const override;
+};
+}
+
+#endif // THUMB2REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 0000000..09debe7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,1033 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"        // To access Function attributes
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "t2-reduce-size"
+
+STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts,    "Number of 32-bit load / store reduced to 16-bit ones");
+
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+                                cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+                                     cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+                                     cl::init(-1), cl::Hidden);
+
+namespace {
+  /// ReduceTable - A static table with information on mapping from wide
+  /// opcodes to narrow
+  struct ReduceEntry {
+    uint16_t WideOpc;      // Wide opcode
+    uint16_t NarrowOpc1;   // Narrow opcode to transform to
+    uint16_t NarrowOpc2;   // Narrow opcode when it's two-address
+    uint8_t  Imm1Limit;    // Limit of immediate field (bits)
+    uint8_t  Imm2Limit;    // Limit of immediate field when it's two-address
+    unsigned LowRegs1 : 1; // Only possible if low-registers are used
+    unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+    unsigned PredCC1  : 2; // 0 - If predicated, cc is on and vice versa.
+                           // 1 - No cc field.
+                           // 2 - Always set CPSR.
+    unsigned PredCC2  : 2;
+    unsigned PartFlag : 1; // 16-bit instruction does partial flag update
+    unsigned Special  : 1; // Needs to be dealt with specially
+    unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift)
+  };
+
+  static const ReduceEntry ReduceTable[] = {
+  // Wide,        Narrow1,      Narrow2,     imm1,imm2, lo1, lo2, P/C,PF,S,AM
+  { ARM::t2ADCrr, 0,            ARM::tADC,     0,   0,   0,   1,  0,0, 0,0,0 },
+  { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,   1,   1,  0,0, 0,1,0 },
+  { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,   1,   0,  0,1, 0,0,0 },
+  { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,   1,   1,  2,2, 0,1,0 },
+  { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,   1,   0,  2,0, 0,1,0 },
+  { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2ASRri, ARM::tASRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2ASRrr, 0,            ARM::tASRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,   0,   1,  0,0, 1,0,0 },
+  //FIXME: Disable CMN, as CCodes are backwards from compare expectations
+  //{ ARM::t2CMNrr, ARM::tCMN,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMNzrr, ARM::tCMNz,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,   0,   0,  2,0, 0,1,0 },
+  { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  // FIXME: adr.n immediate offset must be multiple of 4.
+  //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,   0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2LSLri, ARM::tLSLri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,0,0 },
+  { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,1,0 },
+  // FIXME: Do we need the 16-bit 'S' variant?
+  { ARM::t2MOVr,ARM::tMOVr,     0,             0,   0,   0,   0,  1,0, 0,0,0 },
+  { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,   1,   0,  0,0, 0,0,0 },
+  { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2REV,   ARM::tREV,    0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2REV16, ARM::tREV16,  0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2REVSH, ARM::tREVSH,  0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2RORrr, 0,            ARM::tROR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2RSBri, ARM::tRSB,    0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2RSBSri,ARM::tRSB,    0,             0,   0,   1,   0,  2,0, 0,1,0 },
+  { ARM::t2SBCrr, 0,            ARM::tSBC,     0,   0,   0,   1,  0,0, 0,0,0 },
+  { ARM::t2SUBri, ARM::tSUBi3,  ARM::tSUBi8,   3,   8,   1,   1,  0,0, 0,0,0 },
+  { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,   1,   0,  0,0, 0,0,0 },
+  { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,   1,   1,  2,2, 0,0,0 },
+  { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2SXTB,  ARM::tSXTB,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2SXTH,  ARM::tSXTH,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2UXTB,  ARM::tUXTB,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2UXTH,  ARM::tUXTH,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+
+  // FIXME: Clean this up after splitting each Thumb load / store opcode
+  // into multiple ones.
+  { ARM::t2LDRi12,ARM::tLDRi,   ARM::tLDRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRs,  ARM::tLDRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRBi12,ARM::tLDRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRBs, ARM::tLDRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRHi12,ARM::tLDRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+
+  { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0,   0,   1,   1,  1,1, 0,1,0 },
+  // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
+  { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0,       0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2STMDB_UPD, 0,        ARM::tPUSH,    0,   0,   1,   1,  1,1, 0,1,0 }
+  };
+
+  class Thumb2SizeReduce : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2SizeReduce();
+
+    const Thumb2InstrInfo *TII;
+    const ARMSubtarget *STI;
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "Thumb2 instruction size reduction pass";
+    }
+
+  private:
+    /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+    DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+    bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);
+
+    bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                         bool is2Addr, ARMCC::CondCodes Pred,
+                         bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+    bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                         const ReduceEntry &Entry);
+
+    bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);
+
+    /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+    /// instruction.
+    bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR,
+                       bool IsSelfLoop);
+
+    /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+    /// non-two-address instruction.
+    bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                        const ReduceEntry &Entry, bool LiveCPSR,
+                        bool IsSelfLoop);
+
+    /// ReduceMI - Attempt to reduce MI, return true on success.
+    bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                  bool LiveCPSR, bool IsSelfLoop);
+
+    /// ReduceMBB - Reduce width of instructions in the specified basic block.
+    bool ReduceMBB(MachineBasicBlock &MBB);
+
+    bool OptimizeSize;
+    bool MinimizeSize;
+
+    // Last instruction to define CPSR in the current block.
+    MachineInstr *CPSRDef;
+    // Was CPSR last defined by a high latency instruction?
+    // When CPSRDef is null, this refers to CPSR defs in predecessors.
+    bool HighLatencyCPSR;
+
+    struct MBBInfo {
+      // The flags leaving this block have high latency.
+      bool HighLatencyCPSR;
+      // Has this block been visited yet?
+      bool Visited;
+
+      MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+    };
+
+    SmallVector<MBBInfo, 8> BlockInfo;
+  };
+  char Thumb2SizeReduce::ID = 0;
+}
+
+Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) {
+  OptimizeSize = MinimizeSize = false;
+  for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
+    unsigned FromOpc = ReduceTable[i].WideOpc;
+    if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
+      assert(false && "Duplicated entries?");
+  }
+}
+
+static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
+  for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
+    if (*Regs == ARM::CPSR)
+      return true;
+  return false;
+}
+
+// Check for a likely high-latency flag def.
+static bool isHighLatencyCPSR(MachineInstr *Def) {
+  switch(Def->getOpcode()) {
+  case ARM::FMSTAT:
+  case ARM::tMUL:
+    return true;
+  }
+  return false;
+}
+
+/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
+/// the 's' 16-bit instruction partially update CPSR. Abort the
+/// transformation to avoid adding false dependency on last CPSR setting
+/// instruction which hurts the ability for out-of-order execution engine
+/// to do register renaming magic.
+/// This function checks if there is a read-of-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If there
+/// is, then there is no harm done since the instruction cannot be retired
+/// before the CPSR setting instruction anyway.
+/// Note, we are not doing full dependency analysis here for the sake of compile
+/// time. We're not looking for cases like:
+/// r0 = muls ...
+/// r1 = add.w r0, ...
+/// ...
+///    = mul.w r1
+/// In this case it would have been ok to narrow the mul.w to muls since there
+/// are indirect RAW dependency between the muls and the mul.w
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
+  // Disable the check for -Oz (aka OptimizeForSizeHarder).
+  if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  if (!CPSRDef)
+    // If this BB loops back to itself, conservatively avoid narrowing the
+    // first instruction that does partial flag update.
+    return HighLatencyCPSR || FirstInSelfLoop;
+
+  SmallSet<unsigned, 2> Defs;
+  for (const MachineOperand &MO : CPSRDef->operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    Defs.insert(Reg);
+  }
+
+  for (const MachineOperand &MO : Use->operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Defs.count(Reg))
+      return false;
+  }
+
+  // If the current CPSR has high latency, try to avoid the false dependency.
+  if (HighLatencyCPSR)
+    return true;
+
+  // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+  // of them, so always shrink them when CPSR doesn't have high latency.
+  if (Use->getOpcode() == ARM::t2MOVi ||
+      Use->getOpcode() == ARM::t2MOVi16)
+    return false;
+
+  // No read-after-write dependency. The narrowing will add false dependency.
+  return true;
+}
+
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                                  bool is2Addr, ARMCC::CondCodes Pred,
+                                  bool LiveCPSR, bool &HasCC, bool &CCDead) {
+  if ((is2Addr  && Entry.PredCC2 == 0) ||
+      (!is2Addr && Entry.PredCC1 == 0)) {
+    if (Pred == ARMCC::AL) {
+      // Not predicated, must set CPSR.
+      if (!HasCC) {
+        // Original instruction was not setting CPSR, but CPSR is not
+        // currently live anyway. It's ok to set it. The CPSR def is
+        // dead though.
+        if (!LiveCPSR) {
+          HasCC = true;
+          CCDead = true;
+          return true;
+        }
+        return false;
+      }
+    } else {
+      // Predicated, must not set CPSR.
+      if (HasCC)
+        return false;
+    }
+  } else if ((is2Addr  && Entry.PredCC2 == 2) ||
+             (!is2Addr && Entry.PredCC1 == 2)) {
+    /// Old opcode has an optional def of CPSR.
+    if (HasCC)
+      return true;
+    // If old opcode does not implicitly define CPSR, then it's not ok since
+    // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP.
+    if (!HasImplicitCPSRDef(MI->getDesc()))
+      return false;
+    HasCC = true;
+  } else {
+    // 16-bit instruction does not set CPSR.
+    if (HasCC)
+      return false;
+  }
+
+  return true;
+}
+
+static bool VerifyLowRegs(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA     ||
+                 Opc == ARM::t2LDMDB     || Opc == ARM::t2LDMIA_UPD ||
+                 Opc == ARM::t2LDMDB_UPD);
+  bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD);
+  bool isSPOk = isPCOk || isLROk;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    if (isPCOk && Reg == ARM::PC)
+      continue;
+    if (isLROk && Reg == ARM::LR)
+      continue;
+    if (Reg == ARM::SP) {
+      if (isSPOk)
+        continue;
+      if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12))
+        // Special case for these ldr / str with sp as base register.
+        continue;
+    }
+    if (!isARMLowRegister(Reg))
+      return false;
+  }
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const ReduceEntry &Entry) {
+  if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+    return false;
+
+  unsigned Scale = 1;
+  bool HasImmOffset = false;
+  bool HasShift = false;
+  bool HasOffReg = true;
+  bool isLdStMul = false;
+  unsigned Opc = Entry.NarrowOpc1;
+  unsigned OpNum = 3; // First 'rest' of operands.
+  uint8_t  ImmLimit = Entry.Imm1Limit;
+
+  switch (Entry.WideOpc) {
+  default:
+    llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+  case ARM::t2LDRi12:
+  case ARM::t2STRi12:
+    if (MI->getOperand(1).getReg() == ARM::SP) {
+      Opc = Entry.NarrowOpc2;
+      ImmLimit = Entry.Imm2Limit;
+      HasOffReg = false;
+    }
+
+    Scale = 4;
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRBi12:
+  case ARM::t2STRBi12:
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRHi12:
+  case ARM::t2STRHi12:
+    Scale = 2;
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRs:
+  case ARM::t2LDRBs:
+  case ARM::t2LDRHs:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRSHs:
+  case ARM::t2STRs:
+  case ARM::t2STRBs:
+  case ARM::t2STRHs:
+    HasShift = true;
+    OpNum = 4;
+    break;
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB: {
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA)
+      return false;
+
+    // For the non-writeback version (this one), the base register must be
+    // one of the registers being loaded.
+    bool isOK = false;
+    for (unsigned i = 4; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg) {
+        isOK = true;
+        break;
+      }
+    }
+
+    if (!isOK)
+      return false;
+
+    OpNum = 0;
+    isLdStMul = true;
+    break;
+  }
+  case ARM::t2LDMIA_RET: {
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg != ARM::SP)
+      return false;
+    Opc = Entry.NarrowOpc2; // tPOP_RET
+    OpNum = 2;
+    isLdStMul = true;
+    break;
+  }
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    OpNum = 0;
+
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg == ARM::SP &&
+        (Entry.WideOpc == ARM::t2LDMIA_UPD ||
+         Entry.WideOpc == ARM::t2STMDB_UPD)) {
+      Opc = Entry.NarrowOpc2; // tPOP or tPUSH
+      OpNum = 2;
+    } else if (!isARMLowRegister(BaseReg) ||
+               (Entry.WideOpc != ARM::t2LDMIA_UPD &&
+                Entry.WideOpc != ARM::t2STMIA_UPD)) {
+      return false;
+    }
+
+    isLdStMul = true;
+    break;
+  }
+  }
+
+  unsigned OffsetReg = 0;
+  bool OffsetKill = false;
+  if (HasShift) {
+    OffsetReg  = MI->getOperand(2).getReg();
+    OffsetKill = MI->getOperand(2).isKill();
+
+    if (MI->getOperand(3).getImm())
+      // Thumb1 addressing mode doesn't support shift.
+      return false;
+  }
+
+  unsigned OffsetImm = 0;
+  if (HasImmOffset) {
+    OffsetImm = MI->getOperand(2).getImm();
+    unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;
+
+    if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset)
+      // Make sure the immediate field fits.
+      return false;
+  }
+
+  // Add the 16-bit load / store instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
+  if (!isLdStMul) {
+    MIB.addOperand(MI->getOperand(0));
+    MIB.addOperand(MI->getOperand(1));
+
+    if (HasImmOffset)
+      MIB.addImm(OffsetImm / Scale);
+
+    assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+    if (HasOffReg)
+      MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+  }
+
+  // Transfer the rest of operands.
+  for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++NumLdSts;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::t2ADDri) {
+    // If the source register is SP, try to reduce to tADDrSPi, otherwise
+    // it's a normal reduce.
+    if (MI->getOperand(1).getReg() != ARM::SP) {
+      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+        return true;
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    }
+    // Try to reduce to tADDrSPi.
+    unsigned Imm = MI->getOperand(2).getImm();
+    // The immediate must be in range, the destination register must be a low
+    // reg, the predicate must be "always" and the condition flags must not
+    // be being set.
+    if (Imm & 3 || Imm > 1020)
+      return false;
+    if (!isARMLowRegister(MI->getOperand(0).getReg()))
+      return false;
+    if (MI->getOperand(3).getImm() != ARMCC::AL)
+      return false;
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MCID.hasOptionalDef() &&
+        MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
+      return false;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::tADDrSPi))
+      .addOperand(MI->getOperand(0))
+      .addOperand(MI->getOperand(1))
+      .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+    AddDefaultPred(MIB);
+
+    // Transfer MI flags.
+    MIB.setMIFlags(MI->getFlags());
+
+    DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " <<*MIB);
+
+    MBB.erase_instr(MI);
+    ++NumNarrows;
+    return true;
+  }
+
+  if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+    return false;
+
+  if (MI->mayLoad() || MI->mayStore())
+    return ReduceLoadStore(MBB, MI, Entry);
+
+  switch (Opc) {
+  default: break;
+  case ARM::t2ADDSri:
+  case ARM::t2ADDSrr: {
+    unsigned PredReg = 0;
+    if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+      switch (Opc) {
+      default: break;
+      case ARM::t2ADDSri: {
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+          return true;
+        // fallthrough
+      }
+      case ARM::t2ADDSrr:
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+      }
+    }
+    break;
+  }
+  case ARM::t2RSBri:
+  case ARM::t2RSBSri:
+  case ARM::t2SXTB:
+  case ARM::t2SXTH:
+  case ARM::t2UXTB:
+  case ARM::t2UXTH:
+    if (MI->getOperand(2).getImm() == 0)
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    break;
+  case ARM::t2MOVi16:
+    // Can convert only 'pure' immediate operands, not immediates obtained as
+    // globals' addresses.
+    if (MI->getOperand(1).isImm())
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    break;
+  case ARM::t2CMPrr: {
+    // Try to reduce to the lo-reg only version first. Why there are two
+    // versions of the instruction is a mystery.
+    // It would be nice to just have two entries in the master table that
+    // are prioritized, but the table assumes a unique entry for each
+    // source insn opcode. So for now, we hack a local entry record to use.
+    static const ReduceEntry NarrowEntry =
+      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
+      return true;
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+  }
+  }
+  return false;
+}
+
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+
+  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+    return false;
+
+  if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs &&
+      STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing / minimizing for size.
+    return false;
+
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  // t2MUL is "special". The tied source operand is second, not first.
+  if (MI->getOpcode() == ARM::t2MUL) {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    // Early exit if the regs aren't all low regs.
+    if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
+        || !isARMLowRegister(Reg2))
+      return false;
+    if (Reg0 != Reg2) {
+      // If the other operand also isn't the same as the destination, we
+      // can't reduce.
+      if (Reg1 != Reg0)
+        return false;
+      // Try to commute the operands to make it a 2-address instruction.
+      MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+      if (!CommutedMI)
+        return false;
+    }
+  } else if (Reg0 != Reg1) {
+    // Try to commute the operands to make it a 2-address instruction.
+    unsigned CommOpIdx1, CommOpIdx2;
+    if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
+        CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0)
+      return false;
+    MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+    if (!CommutedMI)
+      return false;
+  }
+  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+    return false;
+  if (Entry.Imm2Limit) {
+    unsigned Imm = MI->getOperand(2).getImm();
+    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+    if (Imm > Limit)
+      return false;
+  } else {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+      return false;
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.hasOptionalDef()) {
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    if (SkipPred && MCID.OpInfo[i].isPredicate())
+      continue;
+    MIB.addOperand(MI->getOperand(i));
+  }
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++Num2Addrs;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                                 const ReduceEntry &Entry,
+                                 bool LiveCPSR, bool IsSelfLoop) {
+  if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+    return false;
+
+  if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs &&
+      STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing / minimizing for size.
+    return false;
+
+  unsigned Limit = ~0U;
+  if (Entry.Imm1Limit)
+    Limit = (1 << Entry.Imm1Limit) - 1;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
+    if (MCID.OpInfo[i].isPredicate())
+      continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      unsigned Reg = MO.getReg();
+      if (!Reg || Reg == ARM::CPSR)
+        continue;
+      if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+        return false;
+    } else if (MO.isImm() &&
+               !MCID.OpInfo[i].isPredicate()) {
+      if (((unsigned)MO.getImm()) > Limit)
+        return false;
+    }
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  if (MCID.hasOptionalDef()) {
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    if ((MCID.getOpcode() == ARM::t2RSBSri ||
+         MCID.getOpcode() == ARM::t2RSBri ||
+         MCID.getOpcode() == ARM::t2SXTB ||
+         MCID.getOpcode() == ARM::t2SXTH ||
+         MCID.getOpcode() == ARM::t2UXTB ||
+         MCID.getOpcode() == ARM::t2UXTH) && i == 2)
+      // Skip the zero immediate operand, it's now implicit.
+      continue;
+    bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
+    if (SkipPred && isPred)
+        continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+      // Skip implicit def of CPSR. Either it's modeled as an optional
+      // def now or it's already an implicit def on the new instruction.
+      continue;
+    MIB.addOperand(MO);
+  }
+  if (!MCID.isPredicable() && NewMCID.isPredicable())
+    AddDefaultPred(MIB);
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++NumNarrows;
+  return true;
+}
+
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
+  bool HasDef = false;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+
+    DefCPSR = true;
+    if (!MO.isDead())
+      HasDef = true;
+  }
+
+  return HasDef || LiveCPSR;
+}
+
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+    if (MO.isKill()) {
+      LiveCPSR = false;
+      break;
+    }
+  }
+
+  return LiveCPSR;
+}
+
+bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  unsigned Opcode = MI->getOpcode();
+  DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+  if (OPI == ReduceOpcodeMap.end())
+    return false;
+  const ReduceEntry &Entry = ReduceTable[OPI->second];
+
+  // Don't attempt normal reductions on "special" cases for now.
+  if (Entry.Special)
+    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+
+  // Try to transform to a 16-bit two-address instruction.
+  if (Entry.NarrowOpc2 &&
+      ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+    return true;
+
+  // Try to transform to a 16-bit non-two-address instruction.
+  if (Entry.NarrowOpc1 &&
+      ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+    return true;
+
+  return false;
+}
+
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  // Yes, CPSR could be livein.
+  bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
+  MachineInstr *BundleMI = nullptr;
+
+  CPSRDef = nullptr;
+  HighLatencyCPSR = false;
+
+  // Check predecessors for the latest CPSRDef.
+  for (auto *Pred : MBB.predecessors()) {
+    const MBBInfo &PInfo = BlockInfo[Pred->getNumber()];
+    if (!PInfo.Visited) {
+      // Since blocks are visited in RPO, this must be a back-edge.
+      continue;
+    }
+    if (PInfo.HighLatencyCPSR) {
+      HighLatencyCPSR = true;
+      break;
+    }
+  }
+
+  // If this BB loops back to itself, conservatively avoid narrowing the
+  // first instruction that does partial flag update.
+  bool IsSelfLoop = MBB.isSuccessor(&MBB);
+  MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
+  MachineBasicBlock::instr_iterator NextMII;
+  for (; MII != E; MII = NextMII) {
+    NextMII = std::next(MII);
+
+    MachineInstr *MI = &*MII;
+    if (MI->isBundle()) {
+      BundleMI = MI;
+      continue;
+    }
+    if (MI->isDebugValue())
+      continue;
+
+    LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+    // Does NextMII belong to the same bundle as MI?
+    bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();
+
+    if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
+      Modified = true;
+      MachineBasicBlock::instr_iterator I = std::prev(NextMII);
+      MI = &*I;
+      // Removing and reinserting the first instruction in a bundle will break
+      // up the bundle. Fix the bundling if it was broken.
+      if (NextInSameBundle && !NextMII->isBundledWithPred())
+        NextMII->bundleWithPred();
+    }
+
+    if (!NextInSameBundle && MI->isInsideBundle()) {
+      // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
+      // marker is only on the BUNDLE instruction. Process the BUNDLE
+      // instruction as we finish with the bundled instruction to work around
+      // the inconsistency.
+      if (BundleMI->killsRegister(ARM::CPSR))
+        LiveCPSR = false;
+      MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
+      if (MO && !MO->isDead())
+        LiveCPSR = true;
+      MO = BundleMI->findRegisterUseOperand(ARM::CPSR);
+      if (MO && !MO->isKill())
+        LiveCPSR = true;
+    }
+
+    bool DefCPSR = false;
+    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
+    if (MI->isCall()) {
+      // Calls don't really set CPSR.
+      CPSRDef = nullptr;
+      HighLatencyCPSR = false;
+      IsSelfLoop = false;
+    } else if (DefCPSR) {
+      // This is the last CPSR defining instruction.
+      CPSRDef = MI;
+      HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
+      IsSelfLoop = false;
+    }
+  }
+
+  MBBInfo &Info = BlockInfo[MBB.getNumber()];
+  Info.HighLatencyCPSR = HighLatencyCPSR;
+  Info.Visited = true;
+  return Modified;
+}
+
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+  STI = &TM.getSubtarget<ARMSubtarget>();
+
+  // Optimizing / minimizing size?
+  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+  OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::OptimizeForSize);
+  MinimizeSize =
+      FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+
+  BlockInfo.clear();
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  // Visit blocks in reverse post-order so LastCPSRDef is known for all
+  // predecessors.
+  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+  bool Modified = false;
+  for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
+       I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+    Modified |= ReduceMBB(**I);
+  return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass.
+FunctionPass *llvm::createThumb2SizeReductionPass() {
+  return new Thumb2SizeReduce();
+}
diff --git a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp
new file mode 100644
index 0000000..f610fbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp
@@ -0,0 +1,2157 @@
+//===-- CPPBackend.cpp - Library for converting LLVM code to C++ code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the writing of the LLVM IR as a set of C++ calls to the
+// LLVM IR interface. The input module is assumed to be verified.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cctype>
+#include <cstdio>
+#include <map>
+#include <set>
+using namespace llvm;
+
+static cl::opt<std::string>
+FuncName("cppfname", cl::desc("Specify the name of the generated function"),
+         cl::value_desc("function name"));
+
+enum WhatToGenerate {
+  GenProgram,
+  GenModule,
+  GenContents,
+  GenFunction,
+  GenFunctions,
+  GenInline,
+  GenVariable,
+  GenType
+};
+
+static cl::opt<WhatToGenerate> GenerationType("cppgen", cl::Optional,
+  cl::desc("Choose what kind of output to generate"),
+  cl::init(GenProgram),
+  cl::values(
+    clEnumValN(GenProgram,  "program",   "Generate a complete program"),
+    clEnumValN(GenModule,   "module",    "Generate a module definition"),
+    clEnumValN(GenContents, "contents",  "Generate contents of a module"),
+    clEnumValN(GenFunction, "function",  "Generate a function definition"),
+    clEnumValN(GenFunctions,"functions", "Generate all function definitions"),
+    clEnumValN(GenInline,   "inline",    "Generate an inline function"),
+    clEnumValN(GenVariable, "variable",  "Generate a variable definition"),
+    clEnumValN(GenType,     "type",      "Generate a type definition"),
+    clEnumValEnd
+  )
+);
+
+static cl::opt<std::string> NameToGenerate("cppfor", cl::Optional,
+  cl::desc("Specify the name of the thing to generate"),
+  cl::init("!bad!"));
+
+extern "C" void LLVMInitializeCppBackendTarget() {
+  // Register the target.
+  RegisterTargetMachine<CPPTargetMachine> X(TheCppBackendTarget);
+}
+
+namespace {
+  typedef std::vector<Type*> TypeList;
+  typedef std::map<Type*,std::string> TypeMap;
+  typedef std::map<const Value*,std::string> ValueMap;
+  typedef std::set<std::string> NameSet;
+  typedef std::set<Type*> TypeSet;
+  typedef std::set<const Value*> ValueSet;
+  typedef std::map<const Value*,std::string> ForwardRefMap;
+
+  /// CppWriter - This class is the main chunk of code that converts an LLVM
+  /// module to a C++ translation unit.
+  class CppWriter : public ModulePass {
+    formatted_raw_ostream &Out;
+    const Module *TheModule;
+    uint64_t uniqueNum;
+    TypeMap TypeNames;
+    ValueMap ValueNames;
+    NameSet UsedNames;
+    TypeSet DefinedTypes;
+    ValueSet DefinedValues;
+    ForwardRefMap ForwardRefs;
+    bool is_inline;
+    unsigned indent_level;
+
+  public:
+    static char ID;
+    explicit CppWriter(formatted_raw_ostream &o) :
+      ModulePass(ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){}
+
+    const char *getPassName() const override { return "C++ backend"; }
+
+    bool runOnModule(Module &M) override;
+
+    void printProgram(const std::string& fname, const std::string& modName );
+    void printModule(const std::string& fname, const std::string& modName );
+    void printContents(const std::string& fname, const std::string& modName );
+    void printFunction(const std::string& fname, const std::string& funcName );
+    void printFunctions();
+    void printInline(const std::string& fname, const std::string& funcName );
+    void printVariable(const std::string& fname, const std::string& varName );
+    void printType(const std::string& fname, const std::string& typeName );
+
+    void error(const std::string& msg);
+
+    
+    formatted_raw_ostream& nl(formatted_raw_ostream &Out, int delta = 0);
+    inline void in() { indent_level++; }
+    inline void out() { if (indent_level >0) indent_level--; }
+    
+  private:
+    void printLinkageType(GlobalValue::LinkageTypes LT);
+    void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
+    void printDLLStorageClassType(GlobalValue::DLLStorageClassTypes DSCType);
+    void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM);
+    void printCallingConv(CallingConv::ID cc);
+    void printEscapedString(const std::string& str);
+    void printCFP(const ConstantFP* CFP);
+
+    std::string getCppName(Type* val);
+    inline void printCppName(Type* val);
+
+    std::string getCppName(const Value* val);
+    inline void printCppName(const Value* val);
+
+    void printAttributes(const AttributeSet &PAL, const std::string &name);
+    void printType(Type* Ty);
+    void printTypes(const Module* M);
+
+    void printConstant(const Constant *CPV);
+    void printConstants(const Module* M);
+
+    void printVariableUses(const GlobalVariable *GV);
+    void printVariableHead(const GlobalVariable *GV);
+    void printVariableBody(const GlobalVariable *GV);
+
+    void printFunctionUses(const Function *F);
+    void printFunctionHead(const Function *F);
+    void printFunctionBody(const Function *F);
+    void printInstruction(const Instruction *I, const std::string& bbname);
+    std::string getOpName(const Value*);
+
+    void printModuleBody();
+  };
+} // end anonymous namespace.
+
+formatted_raw_ostream &CppWriter::nl(formatted_raw_ostream &Out, int delta) {
+  Out << '\n';
+  if (delta >= 0 || indent_level >= unsigned(-delta))
+    indent_level += delta;
+  Out.indent(indent_level);
+  return Out;
+}
+
+static inline void sanitize(std::string &str) {
+  for (size_t i = 0; i < str.length(); ++i)
+    if (!isalnum(str[i]) && str[i] != '_')
+      str[i] = '_';
+}
+
+static std::string getTypePrefix(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:     return "void_";
+  case Type::IntegerTyID:
+    return "int" + utostr(cast<IntegerType>(Ty)->getBitWidth()) + "_";
+  case Type::FloatTyID:    return "float_";
+  case Type::DoubleTyID:   return "double_";
+  case Type::LabelTyID:    return "label_";
+  case Type::FunctionTyID: return "func_";
+  case Type::StructTyID:   return "struct_";
+  case Type::ArrayTyID:    return "array_";
+  case Type::PointerTyID:  return "ptr_";
+  case Type::VectorTyID:   return "packed_";
+  default:                 return "other_";
+  }
+}
+
+void CppWriter::error(const std::string& msg) {
+  report_fatal_error(msg);
+}
+
+static inline std::string ftostr(const APFloat& V) {
+  std::string Buf;
+  if (&V.getSemantics() == &APFloat::IEEEdouble) {
+    raw_string_ostream(Buf) << V.convertToDouble();
+    return Buf;
+  } else if (&V.getSemantics() == &APFloat::IEEEsingle) {
+    raw_string_ostream(Buf) << (double)V.convertToFloat();
+    return Buf;
+  }
+  return "<unknown format in ftostr>"; // error
+}
+
+// printCFP - Print a floating point constant .. very carefully :)
+// This makes sure that conversion to/from floating yields the same binary
+// result so that we don't lose precision.
+void CppWriter::printCFP(const ConstantFP *CFP) {
+  bool ignored;
+  APFloat APF = APFloat(CFP->getValueAPF());  // copy
+  if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
+    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+  Out << "ConstantFP::get(mod->getContext(), ";
+  Out << "APFloat(";
+#if HAVE_PRINTF_A
+  char Buffer[100];
+  sprintf(Buffer, "%A", APF.convertToDouble());
+  if ((!strncmp(Buffer, "0x", 2) ||
+       !strncmp(Buffer, "-0x", 3) ||
+       !strncmp(Buffer, "+0x", 3)) &&
+      APF.bitwiseIsEqual(APFloat(atof(Buffer)))) {
+    if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+      Out << "BitsToDouble(" << Buffer << ")";
+    else
+      Out << "BitsToFloat((float)" << Buffer << ")";
+    Out << ")";
+  } else {
+#endif
+    std::string StrVal = ftostr(CFP->getValueAPF());
+
+    while (StrVal[0] == ' ')
+      StrVal.erase(StrVal.begin());
+
+    // Check to make sure that the stringized number is not some string like
+    // "Inf" or NaN.  Check that the string matches the "[-+]?[0-9]" regex.
+    if (((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+         ((StrVal[0] == '-' || StrVal[0] == '+') &&
+          (StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
+        (CFP->isExactlyValue(atof(StrVal.c_str())))) {
+      if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+        Out <<  StrVal;
+      else
+        Out << StrVal << "f";
+    } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+      Out << "BitsToDouble(0x"
+          << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue())
+          << "ULL) /* " << StrVal << " */";
+    else
+      Out << "BitsToFloat(0x"
+          << utohexstr((uint32_t)CFP->getValueAPF().
+                                      bitcastToAPInt().getZExtValue())
+          << "U) /* " << StrVal << " */";
+    Out << ")";
+#if HAVE_PRINTF_A
+  }
+#endif
+  Out << ")";
+}
+
+void CppWriter::printCallingConv(CallingConv::ID cc){
+  // Print the calling convention.
+  switch (cc) {
+  case CallingConv::C:     Out << "CallingConv::C"; break;
+  case CallingConv::Fast:  Out << "CallingConv::Fast"; break;
+  case CallingConv::Cold:  Out << "CallingConv::Cold"; break;
+  case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break;
+  default:                 Out << cc; break;
+  }
+}
+
+void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
+  switch (LT) {
+  case GlobalValue::InternalLinkage:
+    Out << "GlobalValue::InternalLinkage"; break;
+  case GlobalValue::PrivateLinkage:
+    Out << "GlobalValue::PrivateLinkage"; break;
+  case GlobalValue::AvailableExternallyLinkage:
+    Out << "GlobalValue::AvailableExternallyLinkage "; break;
+  case GlobalValue::LinkOnceAnyLinkage:
+    Out << "GlobalValue::LinkOnceAnyLinkage "; break;
+  case GlobalValue::LinkOnceODRLinkage:
+    Out << "GlobalValue::LinkOnceODRLinkage "; break;
+  case GlobalValue::WeakAnyLinkage:
+    Out << "GlobalValue::WeakAnyLinkage"; break;
+  case GlobalValue::WeakODRLinkage:
+    Out << "GlobalValue::WeakODRLinkage"; break;
+  case GlobalValue::AppendingLinkage:
+    Out << "GlobalValue::AppendingLinkage"; break;
+  case GlobalValue::ExternalLinkage:
+    Out << "GlobalValue::ExternalLinkage"; break;
+  case GlobalValue::ExternalWeakLinkage:
+    Out << "GlobalValue::ExternalWeakLinkage"; break;
+  case GlobalValue::CommonLinkage:
+    Out << "GlobalValue::CommonLinkage"; break;
+  }
+}
+
+void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
+  switch (VisType) {
+  case GlobalValue::DefaultVisibility:
+    Out << "GlobalValue::DefaultVisibility";
+    break;
+  case GlobalValue::HiddenVisibility:
+    Out << "GlobalValue::HiddenVisibility";
+    break;
+  case GlobalValue::ProtectedVisibility:
+    Out << "GlobalValue::ProtectedVisibility";
+    break;
+  }
+}
+
+void CppWriter::printDLLStorageClassType(
+                                    GlobalValue::DLLStorageClassTypes DSCType) {
+  switch (DSCType) {
+  case GlobalValue::DefaultStorageClass:
+    Out << "GlobalValue::DefaultStorageClass";
+    break;
+  case GlobalValue::DLLImportStorageClass:
+    Out << "GlobalValue::DLLImportStorageClass";
+    break;
+  case GlobalValue::DLLExportStorageClass:
+    Out << "GlobalValue::DLLExportStorageClass";
+    break;
+  }
+}
+
+void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) {
+  switch (TLM) {
+    case GlobalVariable::NotThreadLocal:
+      Out << "GlobalVariable::NotThreadLocal";
+      break;
+    case GlobalVariable::GeneralDynamicTLSModel:
+      Out << "GlobalVariable::GeneralDynamicTLSModel";
+      break;
+    case GlobalVariable::LocalDynamicTLSModel:
+      Out << "GlobalVariable::LocalDynamicTLSModel";
+      break;
+    case GlobalVariable::InitialExecTLSModel:
+      Out << "GlobalVariable::InitialExecTLSModel";
+      break;
+    case GlobalVariable::LocalExecTLSModel:
+      Out << "GlobalVariable::LocalExecTLSModel";
+      break;
+  }
+}
+
+// printEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+void CppWriter::printEscapedString(const std::string &Str) {
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    unsigned char C = Str[i];
+    if (isprint(C) && C != '"' && C != '\\') {
+      Out << C;
+    } else {
+      Out << "\\x"
+          << (char) ((C/16  < 10) ? ( C/16 +'0') : ( C/16 -10+'A'))
+          << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+    }
+  }
+}
+
+std::string CppWriter::getCppName(Type* Ty) {
+  switch (Ty->getTypeID()) {
+  default:
+    break;
+  case Type::VoidTyID:
+    return "Type::getVoidTy(mod->getContext())";
+  case Type::IntegerTyID: {
+    unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+    return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")";
+  }
+  case Type::X86_FP80TyID:
+    return "Type::getX86_FP80Ty(mod->getContext())";
+  case Type::FloatTyID:
+    return "Type::getFloatTy(mod->getContext())";
+  case Type::DoubleTyID:
+    return "Type::getDoubleTy(mod->getContext())";
+  case Type::LabelTyID:
+    return "Type::getLabelTy(mod->getContext())";
+  case Type::X86_MMXTyID:
+    return "Type::getX86_MMXTy(mod->getContext())";
+  }
+
+  // Now, see if we've seen the type before and return that
+  TypeMap::iterator I = TypeNames.find(Ty);
+  if (I != TypeNames.end())
+    return I->second;
+
+  // Okay, let's build a new name for this type. Start with a prefix
+  const char* prefix = nullptr;
+  switch (Ty->getTypeID()) {
+  case Type::FunctionTyID:    prefix = "FuncTy_"; break;
+  case Type::StructTyID:      prefix = "StructTy_"; break;
+  case Type::ArrayTyID:       prefix = "ArrayTy_"; break;
+  case Type::PointerTyID:     prefix = "PointerTy_"; break;
+  case Type::VectorTyID:      prefix = "VectorTy_"; break;
+  default:                    prefix = "OtherTy_"; break; // prevent breakage
+  }
+
+  // See if the type has a name in the symboltable and build accordingly
+  std::string name;
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (STy->hasName())
+      name = STy->getName();
+  
+  if (name.empty())
+    name = utostr(uniqueNum++);
+  
+  name = std::string(prefix) + name;
+  sanitize(name);
+
+  // Save the name
+  return TypeNames[Ty] = name;
+}
+
+void CppWriter::printCppName(Type* Ty) {
+  printEscapedString(getCppName(Ty));
+}
+
+std::string CppWriter::getCppName(const Value* val) {
+  std::string name;
+  ValueMap::iterator I = ValueNames.find(val);
+  if (I != ValueNames.end() && I->first == val)
+    return  I->second;
+
+  if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) {
+    name = std::string("gvar_") +
+      getTypePrefix(GV->getType()->getElementType());
+  } else if (isa<Function>(val)) {
+    name = std::string("func_");
+  } else if (const Constant* C = dyn_cast<Constant>(val)) {
+    name = std::string("const_") + getTypePrefix(C->getType());
+  } else if (const Argument* Arg = dyn_cast<Argument>(val)) {
+    if (is_inline) {
+      unsigned argNum = std::distance(Arg->getParent()->arg_begin(),
+                                      Function::const_arg_iterator(Arg)) + 1;
+      name = std::string("arg_") + utostr(argNum);
+      NameSet::iterator NI = UsedNames.find(name);
+      if (NI != UsedNames.end())
+        name += std::string("_") + utostr(uniqueNum++);
+      UsedNames.insert(name);
+      return ValueNames[val] = name;
+    } else {
+      name = getTypePrefix(val->getType());
+    }
+  } else {
+    name = getTypePrefix(val->getType());
+  }
+  if (val->hasName())
+    name += val->getName();
+  else
+    name += utostr(uniqueNum++);
+  sanitize(name);
+  NameSet::iterator NI = UsedNames.find(name);
+  if (NI != UsedNames.end())
+    name += std::string("_") + utostr(uniqueNum++);
+  UsedNames.insert(name);
+  return ValueNames[val] = name;
+}
+
+void CppWriter::printCppName(const Value* val) {
+  printEscapedString(getCppName(val));
+}
+
+void CppWriter::printAttributes(const AttributeSet &PAL,
+                                const std::string &name) {
+  Out << "AttributeSet " << name << "_PAL;";
+  nl(Out);
+  if (!PAL.isEmpty()) {
+    Out << '{'; in(); nl(Out);
+    Out << "SmallVector<AttributeSet, 4> Attrs;"; nl(Out);
+    Out << "AttributeSet PAS;"; in(); nl(Out);
+    for (unsigned i = 0; i < PAL.getNumSlots(); ++i) {
+      unsigned index = PAL.getSlotIndex(i);
+      AttrBuilder attrs(PAL.getSlotAttributes(i), index);
+      Out << "{"; in(); nl(Out);
+      Out << "AttrBuilder B;"; nl(Out);
+
+#define HANDLE_ATTR(X)                                                  \
+      if (attrs.contains(Attribute::X)) {                               \
+        Out << "B.addAttribute(Attribute::" #X ");"; nl(Out);           \
+        attrs.removeAttribute(Attribute::X);                            \
+      }
+
+      HANDLE_ATTR(SExt);
+      HANDLE_ATTR(ZExt);
+      HANDLE_ATTR(NoReturn);
+      HANDLE_ATTR(InReg);
+      HANDLE_ATTR(StructRet);
+      HANDLE_ATTR(NoUnwind);
+      HANDLE_ATTR(NoAlias);
+      HANDLE_ATTR(ByVal);
+      HANDLE_ATTR(InAlloca);
+      HANDLE_ATTR(Nest);
+      HANDLE_ATTR(ReadNone);
+      HANDLE_ATTR(ReadOnly);
+      HANDLE_ATTR(NoInline);
+      HANDLE_ATTR(AlwaysInline);
+      HANDLE_ATTR(OptimizeNone);
+      HANDLE_ATTR(OptimizeForSize);
+      HANDLE_ATTR(StackProtect);
+      HANDLE_ATTR(StackProtectReq);
+      HANDLE_ATTR(StackProtectStrong);
+      HANDLE_ATTR(NoCapture);
+      HANDLE_ATTR(NoRedZone);
+      HANDLE_ATTR(NoImplicitFloat);
+      HANDLE_ATTR(Naked);
+      HANDLE_ATTR(InlineHint);
+      HANDLE_ATTR(ReturnsTwice);
+      HANDLE_ATTR(UWTable);
+      HANDLE_ATTR(NonLazyBind);
+      HANDLE_ATTR(MinSize);
+#undef HANDLE_ATTR
+
+      if (attrs.contains(Attribute::StackAlignment)) {
+        Out << "B.addStackAlignmentAttr(" << attrs.getStackAlignment()<<')';
+        nl(Out);
+        attrs.removeAttribute(Attribute::StackAlignment);
+      }
+
+      Out << "PAS = AttributeSet::get(mod->getContext(), ";
+      if (index == ~0U)
+        Out << "~0U,";
+      else
+        Out << index << "U,";
+      Out << " B);"; out(); nl(Out);
+      Out << "}"; out(); nl(Out);
+      nl(Out);
+      Out << "Attrs.push_back(PAS);"; nl(Out);
+    }
+    Out << name << "_PAL = AttributeSet::get(mod->getContext(), Attrs);";
+    nl(Out);
+    out(); nl(Out);
+    Out << '}'; nl(Out);
+  }
+}
+
+void CppWriter::printType(Type* Ty) {
+  // We don't print definitions for primitive types
+  if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() ||
+      Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy())
+    return;
+
+  // If we already defined this type, we don't need to define it again.
+  if (DefinedTypes.find(Ty) != DefinedTypes.end())
+    return;
+
+  // Everything below needs the name for the type so get it now.
+  std::string typeName(getCppName(Ty));
+
+  // Print the type definition
+  switch (Ty->getTypeID()) {
+  case Type::FunctionTyID:  {
+    FunctionType* FT = cast<FunctionType>(Ty);
+    Out << "std::vector<Type*>" << typeName << "_args;";
+    nl(Out);
+    FunctionType::param_iterator PI = FT->param_begin();
+    FunctionType::param_iterator PE = FT->param_end();
+    for (; PI != PE; ++PI) {
+      Type* argTy = static_cast<Type*>(*PI);
+      printType(argTy);
+      std::string argName(getCppName(argTy));
+      Out << typeName << "_args.push_back(" << argName;
+      Out << ");";
+      nl(Out);
+    }
+    printType(FT->getReturnType());
+    std::string retTypeName(getCppName(FT->getReturnType()));
+    Out << "FunctionType* " << typeName << " = FunctionType::get(";
+    in(); nl(Out) << "/*Result=*/" << retTypeName;
+    Out << ",";
+    nl(Out) << "/*Params=*/" << typeName << "_args,";
+    nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");";
+    out();
+    nl(Out);
+    break;
+  }
+  case Type::StructTyID: {
+    StructType* ST = cast<StructType>(Ty);
+    if (!ST->isLiteral()) {
+      Out << "StructType *" << typeName << " = mod->getTypeByName(\"";
+      printEscapedString(ST->getName());
+      Out << "\");";
+      nl(Out);
+      Out << "if (!" << typeName << ") {";
+      nl(Out);
+      Out << typeName << " = ";
+      Out << "StructType::create(mod->getContext(), \"";
+      printEscapedString(ST->getName());
+      Out << "\");";
+      nl(Out);
+      Out << "}";
+      nl(Out);
+      // Indicate that this type is now defined.
+      DefinedTypes.insert(Ty);
+    }
+
+    Out << "std::vector<Type*>" << typeName << "_fields;";
+    nl(Out);
+    StructType::element_iterator EI = ST->element_begin();
+    StructType::element_iterator EE = ST->element_end();
+    for (; EI != EE; ++EI) {
+      Type* fieldTy = static_cast<Type*>(*EI);
+      printType(fieldTy);
+      std::string fieldName(getCppName(fieldTy));
+      Out << typeName << "_fields.push_back(" << fieldName;
+      Out << ");";
+      nl(Out);
+    }
+
+    if (ST->isLiteral()) {
+      Out << "StructType *" << typeName << " = ";
+      Out << "StructType::get(" << "mod->getContext(), ";
+    } else {
+      Out << "if (" << typeName << "->isOpaque()) {";
+      nl(Out);
+      Out << typeName << "->setBody(";
+    }
+
+    Out << typeName << "_fields, /*isPacked=*/"
+        << (ST->isPacked() ? "true" : "false") << ");";
+    nl(Out);
+    if (!ST->isLiteral()) {
+      Out << "}";
+      nl(Out);
+    }
+    break;
+  }
+  case Type::ArrayTyID: {
+    ArrayType* AT = cast<ArrayType>(Ty);
+    Type* ET = AT->getElementType();
+    printType(ET);
+    if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
+      std::string elemName(getCppName(ET));
+      Out << "ArrayType* " << typeName << " = ArrayType::get("
+          << elemName
+          << ", " << utostr(AT->getNumElements()) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Type::PointerTyID: {
+    PointerType* PT = cast<PointerType>(Ty);
+    Type* ET = PT->getElementType();
+    printType(ET);
+    if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
+      std::string elemName(getCppName(ET));
+      Out << "PointerType* " << typeName << " = PointerType::get("
+          << elemName
+          << ", " << utostr(PT->getAddressSpace()) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Type::VectorTyID: {
+    VectorType* PT = cast<VectorType>(Ty);
+    Type* ET = PT->getElementType();
+    printType(ET);
+    if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
+      std::string elemName(getCppName(ET));
+      Out << "VectorType* " << typeName << " = VectorType::get("
+          << elemName
+          << ", " << utostr(PT->getNumElements()) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  default:
+    error("Invalid TypeID");
+  }
+
+  // Indicate that this type is now defined.
+  DefinedTypes.insert(Ty);
+
+  // Finally, separate the type definition from other with a newline.
+  nl(Out);
+}
+
+void CppWriter::printTypes(const Module* M) {
+  // Add all of the global variables to the value table.
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    if (I->hasInitializer())
+      printType(I->getInitializer()->getType());
+    printType(I->getType());
+  }
+
+  // Add all the functions to the table
+  for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+       FI != FE; ++FI) {
+    printType(FI->getReturnType());
+    printType(FI->getFunctionType());
+    // Add all the function arguments
+    for (Function::const_arg_iterator AI = FI->arg_begin(),
+           AE = FI->arg_end(); AI != AE; ++AI) {
+      printType(AI->getType());
+    }
+
+    // Add all of the basic blocks and instructions
+    for (Function::const_iterator BB = FI->begin(),
+           E = FI->end(); BB != E; ++BB) {
+      printType(BB->getType());
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+           ++I) {
+        printType(I->getType());
+        for (unsigned i = 0; i < I->getNumOperands(); ++i)
+          printType(I->getOperand(i)->getType());
+      }
+    }
+  }
+}
+
+
+// printConstant - Print out a constant pool entry...
+void CppWriter::printConstant(const Constant *CV) {
+  // First, if the constant is actually a GlobalValue (variable or function)
+  // or its already in the constant list then we've printed it already and we
+  // can just return.
+  if (isa<GlobalValue>(CV) || ValueNames.find(CV) != ValueNames.end())
+    return;
+
+  std::string constName(getCppName(CV));
+  std::string typeName(getCppName(CV->getType()));
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+    std::string constValue = CI->getValue().toString(10, true);
+    Out << "ConstantInt* " << constName
+        << " = ConstantInt::get(mod->getContext(), APInt("
+        << cast<IntegerType>(CI->getType())->getBitWidth()
+        << ", StringRef(\"" <<  constValue << "\"), 10));";
+  } else if (isa<ConstantAggregateZero>(CV)) {
+    Out << "ConstantAggregateZero* " << constName
+        << " = ConstantAggregateZero::get(" << typeName << ");";
+  } else if (isa<ConstantPointerNull>(CV)) {
+    Out << "ConstantPointerNull* " << constName
+        << " = ConstantPointerNull::get(" << typeName << ");";
+  } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+    Out << "ConstantFP* " << constName << " = ";
+    printCFP(CFP);
+    Out << ";";
+  } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
+    Out << "std::vector<Constant*> " << constName << "_elems;";
+    nl(Out);
+    unsigned N = CA->getNumOperands();
+    for (unsigned i = 0; i < N; ++i) {
+      printConstant(CA->getOperand(i)); // recurse to print operands
+      Out << constName << "_elems.push_back("
+          << getCppName(CA->getOperand(i)) << ");";
+      nl(Out);
+    }
+    Out << "Constant* " << constName << " = ConstantArray::get("
+        << typeName << ", " << constName << "_elems);";
+  } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
+    Out << "std::vector<Constant*> " << constName << "_fields;";
+    nl(Out);
+    unsigned N = CS->getNumOperands();
+    for (unsigned i = 0; i < N; i++) {
+      printConstant(CS->getOperand(i));
+      Out << constName << "_fields.push_back("
+          << getCppName(CS->getOperand(i)) << ");";
+      nl(Out);
+    }
+    Out << "Constant* " << constName << " = ConstantStruct::get("
+        << typeName << ", " << constName << "_fields);";
+  } else if (const ConstantVector *CVec = dyn_cast<ConstantVector>(CV)) {
+    Out << "std::vector<Constant*> " << constName << "_elems;";
+    nl(Out);
+    unsigned N = CVec->getNumOperands();
+    for (unsigned i = 0; i < N; ++i) {
+      printConstant(CVec->getOperand(i));
+      Out << constName << "_elems.push_back("
+          << getCppName(CVec->getOperand(i)) << ");";
+      nl(Out);
+    }
+    Out << "Constant* " << constName << " = ConstantVector::get("
+        << typeName << ", " << constName << "_elems);";
+  } else if (isa<UndefValue>(CV)) {
+    Out << "UndefValue* " << constName << " = UndefValue::get("
+        << typeName << ");";
+  } else if (const ConstantDataSequential *CDS =
+               dyn_cast<ConstantDataSequential>(CV)) {
+    if (CDS->isString()) {
+      Out << "Constant *" << constName <<
+      " = ConstantDataArray::getString(mod->getContext(), \"";
+      StringRef Str = CDS->getAsString();
+      bool nullTerminate = false;
+      if (Str.back() == 0) {
+        Str = Str.drop_back();
+        nullTerminate = true;
+      }
+      printEscapedString(Str);
+      // Determine if we want null termination or not.
+      if (nullTerminate)
+        Out << "\", true);";
+      else
+        Out << "\", false);";// No null terminator
+    } else {
+      // TODO: Could generate more efficient code generating CDS calls instead.
+      Out << "std::vector<Constant*> " << constName << "_elems;";
+      nl(Out);
+      for (unsigned i = 0; i != CDS->getNumElements(); ++i) {
+        Constant *Elt = CDS->getElementAsConstant(i);
+        printConstant(Elt);
+        Out << constName << "_elems.push_back(" << getCppName(Elt) << ");";
+        nl(Out);
+      }
+      Out << "Constant* " << constName;
+      
+      if (isa<ArrayType>(CDS->getType()))
+        Out << " = ConstantArray::get(";
+      else
+        Out << " = ConstantVector::get(";
+      Out << typeName << ", " << constName << "_elems);";
+    }
+  } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+    if (CE->getOpcode() == Instruction::GetElementPtr) {
+      Out << "std::vector<Constant*> " << constName << "_indices;";
+      nl(Out);
+      printConstant(CE->getOperand(0));
+      for (unsigned i = 1; i < CE->getNumOperands(); ++i ) {
+        printConstant(CE->getOperand(i));
+        Out << constName << "_indices.push_back("
+            << getCppName(CE->getOperand(i)) << ");";
+        nl(Out);
+      }
+      Out << "Constant* " << constName
+          << " = ConstantExpr::getGetElementPtr("
+          << getCppName(CE->getOperand(0)) << ", "
+          << constName << "_indices);";
+    } else if (CE->isCast()) {
+      printConstant(CE->getOperand(0));
+      Out << "Constant* " << constName << " = ConstantExpr::getCast(";
+      switch (CE->getOpcode()) {
+      default: llvm_unreachable("Invalid cast opcode");
+      case Instruction::Trunc: Out << "Instruction::Trunc"; break;
+      case Instruction::ZExt:  Out << "Instruction::ZExt"; break;
+      case Instruction::SExt:  Out << "Instruction::SExt"; break;
+      case Instruction::FPTrunc:  Out << "Instruction::FPTrunc"; break;
+      case Instruction::FPExt:  Out << "Instruction::FPExt"; break;
+      case Instruction::FPToUI:  Out << "Instruction::FPToUI"; break;
+      case Instruction::FPToSI:  Out << "Instruction::FPToSI"; break;
+      case Instruction::UIToFP:  Out << "Instruction::UIToFP"; break;
+      case Instruction::SIToFP:  Out << "Instruction::SIToFP"; break;
+      case Instruction::PtrToInt:  Out << "Instruction::PtrToInt"; break;
+      case Instruction::IntToPtr:  Out << "Instruction::IntToPtr"; break;
+      case Instruction::BitCast:  Out << "Instruction::BitCast"; break;
+      }
+      Out << ", " << getCppName(CE->getOperand(0)) << ", "
+          << getCppName(CE->getType()) << ");";
+    } else {
+      unsigned N = CE->getNumOperands();
+      for (unsigned i = 0; i < N; ++i ) {
+        printConstant(CE->getOperand(i));
+      }
+      Out << "Constant* " << constName << " = ConstantExpr::";
+      switch (CE->getOpcode()) {
+      case Instruction::Add:    Out << "getAdd(";  break;
+      case Instruction::FAdd:   Out << "getFAdd(";  break;
+      case Instruction::Sub:    Out << "getSub("; break;
+      case Instruction::FSub:   Out << "getFSub("; break;
+      case Instruction::Mul:    Out << "getMul("; break;
+      case Instruction::FMul:   Out << "getFMul("; break;
+      case Instruction::UDiv:   Out << "getUDiv("; break;
+      case Instruction::SDiv:   Out << "getSDiv("; break;
+      case Instruction::FDiv:   Out << "getFDiv("; break;
+      case Instruction::URem:   Out << "getURem("; break;
+      case Instruction::SRem:   Out << "getSRem("; break;
+      case Instruction::FRem:   Out << "getFRem("; break;
+      case Instruction::And:    Out << "getAnd("; break;
+      case Instruction::Or:     Out << "getOr("; break;
+      case Instruction::Xor:    Out << "getXor("; break;
+      case Instruction::ICmp:
+        Out << "getICmp(ICmpInst::ICMP_";
+        switch (CE->getPredicate()) {
+        case ICmpInst::ICMP_EQ:  Out << "EQ"; break;
+        case ICmpInst::ICMP_NE:  Out << "NE"; break;
+        case ICmpInst::ICMP_SLT: Out << "SLT"; break;
+        case ICmpInst::ICMP_ULT: Out << "ULT"; break;
+        case ICmpInst::ICMP_SGT: Out << "SGT"; break;
+        case ICmpInst::ICMP_UGT: Out << "UGT"; break;
+        case ICmpInst::ICMP_SLE: Out << "SLE"; break;
+        case ICmpInst::ICMP_ULE: Out << "ULE"; break;
+        case ICmpInst::ICMP_SGE: Out << "SGE"; break;
+        case ICmpInst::ICMP_UGE: Out << "UGE"; break;
+        default: error("Invalid ICmp Predicate");
+        }
+        break;
+      case Instruction::FCmp:
+        Out << "getFCmp(FCmpInst::FCMP_";
+        switch (CE->getPredicate()) {
+        case FCmpInst::FCMP_FALSE: Out << "FALSE"; break;
+        case FCmpInst::FCMP_ORD:   Out << "ORD"; break;
+        case FCmpInst::FCMP_UNO:   Out << "UNO"; break;
+        case FCmpInst::FCMP_OEQ:   Out << "OEQ"; break;
+        case FCmpInst::FCMP_UEQ:   Out << "UEQ"; break;
+        case FCmpInst::FCMP_ONE:   Out << "ONE"; break;
+        case FCmpInst::FCMP_UNE:   Out << "UNE"; break;
+        case FCmpInst::FCMP_OLT:   Out << "OLT"; break;
+        case FCmpInst::FCMP_ULT:   Out << "ULT"; break;
+        case FCmpInst::FCMP_OGT:   Out << "OGT"; break;
+        case FCmpInst::FCMP_UGT:   Out << "UGT"; break;
+        case FCmpInst::FCMP_OLE:   Out << "OLE"; break;
+        case FCmpInst::FCMP_ULE:   Out << "ULE"; break;
+        case FCmpInst::FCMP_OGE:   Out << "OGE"; break;
+        case FCmpInst::FCMP_UGE:   Out << "UGE"; break;
+        case FCmpInst::FCMP_TRUE:  Out << "TRUE"; break;
+        default: error("Invalid FCmp Predicate");
+        }
+        break;
+      case Instruction::Shl:     Out << "getShl("; break;
+      case Instruction::LShr:    Out << "getLShr("; break;
+      case Instruction::AShr:    Out << "getAShr("; break;
+      case Instruction::Select:  Out << "getSelect("; break;
+      case Instruction::ExtractElement: Out << "getExtractElement("; break;
+      case Instruction::InsertElement:  Out << "getInsertElement("; break;
+      case Instruction::ShuffleVector:  Out << "getShuffleVector("; break;
+      default:
+        error("Invalid constant expression");
+        break;
+      }
+      Out << getCppName(CE->getOperand(0));
+      for (unsigned i = 1; i < CE->getNumOperands(); ++i)
+        Out << ", " << getCppName(CE->getOperand(i));
+      Out << ");";
+    }
+  } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
+    Out << "Constant* " << constName << " = ";
+    Out << "BlockAddress::get(" << getOpName(BA->getBasicBlock()) << ");";
+  } else {
+    error("Bad Constant");
+    Out << "Constant* " << constName << " = 0; ";
+  }
+  nl(Out);
+}
+
+void CppWriter::printConstants(const Module* M) {
+  // Traverse all the global variables looking for constant initializers
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I)
+    if (I->hasInitializer())
+      printConstant(I->getInitializer());
+
+  // Traverse the LLVM functions looking for constants
+  for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+       FI != FE; ++FI) {
+    // Add all of the basic blocks and instructions
+    for (Function::const_iterator BB = FI->begin(),
+           E = FI->end(); BB != E; ++BB) {
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+           ++I) {
+        for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+          if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) {
+            printConstant(C);
+          }
+        }
+      }
+    }
+  }
+}
+
+void CppWriter::printVariableUses(const GlobalVariable *GV) {
+  nl(Out) << "// Type Definitions";
+  nl(Out);
+  printType(GV->getType());
+  if (GV->hasInitializer()) {
+    const Constant *Init = GV->getInitializer();
+    printType(Init->getType());
+    if (const Function *F = dyn_cast<Function>(Init)) {
+      nl(Out)<< "/ Function Declarations"; nl(Out);
+      printFunctionHead(F);
+    } else if (const GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
+      nl(Out) << "// Global Variable Declarations"; nl(Out);
+      printVariableHead(gv);
+      
+      nl(Out) << "// Global Variable Definitions"; nl(Out);
+      printVariableBody(gv);
+    } else  {
+      nl(Out) << "// Constant Definitions"; nl(Out);
+      printConstant(Init);
+    }
+  }
+}
+
+void CppWriter::printVariableHead(const GlobalVariable *GV) {
+  nl(Out) << "GlobalVariable* " << getCppName(GV);
+  if (is_inline) {
+    Out << " = mod->getGlobalVariable(mod->getContext(), ";
+    printEscapedString(GV->getName());
+    Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)";
+    nl(Out) << "if (!" << getCppName(GV) << ") {";
+    in(); nl(Out) << getCppName(GV);
+  }
+  Out << " = new GlobalVariable(/*Module=*/*mod, ";
+  nl(Out) << "/*Type=*/";
+  printCppName(GV->getType()->getElementType());
+  Out << ",";
+  nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false");
+  Out << ",";
+  nl(Out) << "/*Linkage=*/";
+  printLinkageType(GV->getLinkage());
+  Out << ",";
+  nl(Out) << "/*Initializer=*/0, ";
+  if (GV->hasInitializer()) {
+    Out << "// has initializer, specified below";
+  }
+  nl(Out) << "/*Name=*/\"";
+  printEscapedString(GV->getName());
+  Out << "\");";
+  nl(Out);
+
+  if (GV->hasSection()) {
+    printCppName(GV);
+    Out << "->setSection(\"";
+    printEscapedString(GV->getSection());
+    Out << "\");";
+    nl(Out);
+  }
+  if (GV->getAlignment()) {
+    printCppName(GV);
+    Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");";
+    nl(Out);
+  }
+  if (GV->getVisibility() != GlobalValue::DefaultVisibility) {
+    printCppName(GV);
+    Out << "->setVisibility(";
+    printVisibilityType(GV->getVisibility());
+    Out << ");";
+    nl(Out);
+  }
+  if (GV->getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
+    printCppName(GV);
+    Out << "->setDLLStorageClass(";
+    printDLLStorageClassType(GV->getDLLStorageClass());
+    Out << ");";
+    nl(Out);
+  }
+  if (GV->isThreadLocal()) {
+    printCppName(GV);
+    Out << "->setThreadLocalMode(";
+    printThreadLocalMode(GV->getThreadLocalMode());
+    Out << ");";
+    nl(Out);
+  }
+  if (is_inline) {
+    out(); Out << "}"; nl(Out);
+  }
+}
+
+void CppWriter::printVariableBody(const GlobalVariable *GV) {
+  if (GV->hasInitializer()) {
+    printCppName(GV);
+    Out << "->setInitializer(";
+    Out << getCppName(GV->getInitializer()) << ");";
+    nl(Out);
+  }
+}
+
+std::string CppWriter::getOpName(const Value* V) {
+  if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end())
+    return getCppName(V);
+
+  // See if its alread in the map of forward references, if so just return the
+  // name we already set up for it
+  ForwardRefMap::const_iterator I = ForwardRefs.find(V);
+  if (I != ForwardRefs.end())
+    return I->second;
+
+  // This is a new forward reference. Generate a unique name for it
+  std::string result(std::string("fwdref_") + utostr(uniqueNum++));
+
+  // Yes, this is a hack. An Argument is the smallest instantiable value that
+  // we can make as a placeholder for the real value. We'll replace these
+  // Argument instances later.
+  Out << "Argument* " << result << " = new Argument("
+      << getCppName(V->getType()) << ");";
+  nl(Out);
+  ForwardRefs[V] = result;
+  return result;
+}
+
+static StringRef ConvertAtomicOrdering(AtomicOrdering Ordering) {
+  switch (Ordering) {
+    case NotAtomic: return "NotAtomic";
+    case Unordered: return "Unordered";
+    case Monotonic: return "Monotonic";
+    case Acquire: return "Acquire";
+    case Release: return "Release";
+    case AcquireRelease: return "AcquireRelease";
+    case SequentiallyConsistent: return "SequentiallyConsistent";
+  }
+  llvm_unreachable("Unknown ordering");
+}
+
+static StringRef ConvertAtomicSynchScope(SynchronizationScope SynchScope) {
+  switch (SynchScope) {
+    case SingleThread: return "SingleThread";
+    case CrossThread: return "CrossThread";
+  }
+  llvm_unreachable("Unknown synch scope");
+}
+
+// printInstruction - This member is called for each Instruction in a function.
+void CppWriter::printInstruction(const Instruction *I,
+                                 const std::string& bbname) {
+  std::string iName(getCppName(I));
+
+  // Before we emit this instruction, we need to take care of generating any
+  // forward references. So, we get the names of all the operands in advance
+  const unsigned Ops(I->getNumOperands());
+  std::string* opNames = new std::string[Ops];
+  for (unsigned i = 0; i < Ops; i++)
+    opNames[i] = getOpName(I->getOperand(i));
+
+  switch (I->getOpcode()) {
+  default:
+    error("Invalid instruction");
+    break;
+
+  case Instruction::Ret: {
+    const ReturnInst* ret =  cast<ReturnInst>(I);
+    Out << "ReturnInst::Create(mod->getContext(), "
+        << (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
+    break;
+  }
+  case Instruction::Br: {
+    const BranchInst* br = cast<BranchInst>(I);
+    Out << "BranchInst::Create(" ;
+    if (br->getNumOperands() == 3) {
+      Out << opNames[2] << ", "
+          << opNames[1] << ", "
+          << opNames[0] << ", ";
+
+    } else if (br->getNumOperands() == 1) {
+      Out << opNames[0] << ", ";
+    } else {
+      error("Branch with 2 operands?");
+    }
+    Out << bbname << ");";
+    break;
+  }
+  case Instruction::Switch: {
+    const SwitchInst *SI = cast<SwitchInst>(I);
+    Out << "SwitchInst* " << iName << " = SwitchInst::Create("
+        << getOpName(SI->getCondition()) << ", "
+        << getOpName(SI->getDefaultDest()) << ", "
+        << SI->getNumCases() << ", " << bbname << ");";
+    nl(Out);
+    for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      const ConstantInt* CaseVal = i.getCaseValue();
+      const BasicBlock *BB = i.getCaseSuccessor();
+      Out << iName << "->addCase("
+          << getOpName(CaseVal) << ", "
+          << getOpName(BB) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Instruction::IndirectBr: {
+    const IndirectBrInst *IBI = cast<IndirectBrInst>(I);
+    Out << "IndirectBrInst *" << iName << " = IndirectBrInst::Create("
+        << opNames[0] << ", " << IBI->getNumDestinations() << ");";
+    nl(Out);
+    for (unsigned i = 1; i != IBI->getNumOperands(); ++i) {
+      Out << iName << "->addDestination(" << opNames[i] << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Instruction::Resume: {
+    Out << "ResumeInst::Create(" << opNames[0] << ", " << bbname << ");";
+    break;
+  }
+  case Instruction::Invoke: {
+    const InvokeInst* inv = cast<InvokeInst>(I);
+    Out << "std::vector<Value*> " << iName << "_params;";
+    nl(Out);
+    for (unsigned i = 0; i < inv->getNumArgOperands(); ++i) {
+      Out << iName << "_params.push_back("
+          << getOpName(inv->getArgOperand(i)) << ");";
+      nl(Out);
+    }
+    // FIXME: This shouldn't use magic numbers -3, -2, and -1.
+    Out << "InvokeInst *" << iName << " = InvokeInst::Create("
+        << getOpName(inv->getCalledValue()) << ", "
+        << getOpName(inv->getNormalDest()) << ", "
+        << getOpName(inv->getUnwindDest()) << ", "
+        << iName << "_params, \"";
+    printEscapedString(inv->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCallingConv(";
+    printCallingConv(inv->getCallingConv());
+    Out << ");";
+    printAttributes(inv->getAttributes(), iName);
+    Out << iName << "->setAttributes(" << iName << "_PAL);";
+    nl(Out);
+    break;
+  }
+  case Instruction::Unreachable: {
+    Out << "new UnreachableInst("
+        << "mod->getContext(), "
+        << bbname << ");";
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:{
+    Out << "BinaryOperator* " << iName << " = BinaryOperator::Create(";
+    switch (I->getOpcode()) {
+    case Instruction::Add: Out << "Instruction::Add"; break;
+    case Instruction::FAdd: Out << "Instruction::FAdd"; break;
+    case Instruction::Sub: Out << "Instruction::Sub"; break;
+    case Instruction::FSub: Out << "Instruction::FSub"; break;
+    case Instruction::Mul: Out << "Instruction::Mul"; break;
+    case Instruction::FMul: Out << "Instruction::FMul"; break;
+    case Instruction::UDiv:Out << "Instruction::UDiv"; break;
+    case Instruction::SDiv:Out << "Instruction::SDiv"; break;
+    case Instruction::FDiv:Out << "Instruction::FDiv"; break;
+    case Instruction::URem:Out << "Instruction::URem"; break;
+    case Instruction::SRem:Out << "Instruction::SRem"; break;
+    case Instruction::FRem:Out << "Instruction::FRem"; break;
+    case Instruction::And: Out << "Instruction::And"; break;
+    case Instruction::Or:  Out << "Instruction::Or";  break;
+    case Instruction::Xor: Out << "Instruction::Xor"; break;
+    case Instruction::Shl: Out << "Instruction::Shl"; break;
+    case Instruction::LShr:Out << "Instruction::LShr"; break;
+    case Instruction::AShr:Out << "Instruction::AShr"; break;
+    default: Out << "Instruction::BadOpCode"; break;
+    }
+    Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+    printEscapedString(I->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::FCmp: {
+    Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", ";
+    switch (cast<FCmpInst>(I)->getPredicate()) {
+    case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break;
+    case FCmpInst::FCMP_OEQ  : Out << "FCmpInst::FCMP_OEQ"; break;
+    case FCmpInst::FCMP_OGT  : Out << "FCmpInst::FCMP_OGT"; break;
+    case FCmpInst::FCMP_OGE  : Out << "FCmpInst::FCMP_OGE"; break;
+    case FCmpInst::FCMP_OLT  : Out << "FCmpInst::FCMP_OLT"; break;
+    case FCmpInst::FCMP_OLE  : Out << "FCmpInst::FCMP_OLE"; break;
+    case FCmpInst::FCMP_ONE  : Out << "FCmpInst::FCMP_ONE"; break;
+    case FCmpInst::FCMP_ORD  : Out << "FCmpInst::FCMP_ORD"; break;
+    case FCmpInst::FCMP_UNO  : Out << "FCmpInst::FCMP_UNO"; break;
+    case FCmpInst::FCMP_UEQ  : Out << "FCmpInst::FCMP_UEQ"; break;
+    case FCmpInst::FCMP_UGT  : Out << "FCmpInst::FCMP_UGT"; break;
+    case FCmpInst::FCMP_UGE  : Out << "FCmpInst::FCMP_UGE"; break;
+    case FCmpInst::FCMP_ULT  : Out << "FCmpInst::FCMP_ULT"; break;
+    case FCmpInst::FCMP_ULE  : Out << "FCmpInst::FCMP_ULE"; break;
+    case FCmpInst::FCMP_UNE  : Out << "FCmpInst::FCMP_UNE"; break;
+    case FCmpInst::FCMP_TRUE : Out << "FCmpInst::FCMP_TRUE"; break;
+    default: Out << "FCmpInst::BAD_ICMP_PREDICATE"; break;
+    }
+    Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+    printEscapedString(I->getName());
+    Out << "\");";
+    break;
+  }
+  case Instruction::ICmp: {
+    Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", ";
+    switch (cast<ICmpInst>(I)->getPredicate()) {
+    case ICmpInst::ICMP_EQ:  Out << "ICmpInst::ICMP_EQ";  break;
+    case ICmpInst::ICMP_NE:  Out << "ICmpInst::ICMP_NE";  break;
+    case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break;
+    case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break;
+    case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break;
+    case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break;
+    case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break;
+    case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break;
+    case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break;
+    case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break;
+    default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break;
+    }
+    Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+    printEscapedString(I->getName());
+    Out << "\");";
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst* allocaI = cast<AllocaInst>(I);
+    Out << "AllocaInst* " << iName << " = new AllocaInst("
+        << getCppName(allocaI->getAllocatedType()) << ", ";
+    if (allocaI->isArrayAllocation())
+      Out << opNames[0] << ", ";
+    Out << "\"";
+    printEscapedString(allocaI->getName());
+    Out << "\", " << bbname << ");";
+    if (allocaI->getAlignment())
+      nl(Out) << iName << "->setAlignment("
+          << allocaI->getAlignment() << ");";
+    break;
+  }
+  case Instruction::Load: {
+    const LoadInst* load = cast<LoadInst>(I);
+    Out << "LoadInst* " << iName << " = new LoadInst("
+        << opNames[0] << ", \"";
+    printEscapedString(load->getName());
+    Out << "\", " << (load->isVolatile() ? "true" : "false" )
+        << ", " << bbname << ");";
+    if (load->getAlignment())
+      nl(Out) << iName << "->setAlignment("
+              << load->getAlignment() << ");";
+    if (load->isAtomic()) {
+      StringRef Ordering = ConvertAtomicOrdering(load->getOrdering());
+      StringRef CrossThread = ConvertAtomicSynchScope(load->getSynchScope());
+      nl(Out) << iName << "->setAtomic("
+              << Ordering << ", " << CrossThread << ");";
+    }
+    break;
+  }
+  case Instruction::Store: {
+    const StoreInst* store = cast<StoreInst>(I);
+    Out << "StoreInst* " << iName << " = new StoreInst("
+        << opNames[0] << ", "
+        << opNames[1] << ", "
+        << (store->isVolatile() ? "true" : "false")
+        << ", " << bbname << ");";
+    if (store->getAlignment())
+      nl(Out) << iName << "->setAlignment("
+              << store->getAlignment() << ");";
+    if (store->isAtomic()) {
+      StringRef Ordering = ConvertAtomicOrdering(store->getOrdering());
+      StringRef CrossThread = ConvertAtomicSynchScope(store->getSynchScope());
+      nl(Out) << iName << "->setAtomic("
+              << Ordering << ", " << CrossThread << ");";
+    }
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    const GetElementPtrInst* gep = cast<GetElementPtrInst>(I);
+    if (gep->getNumOperands() <= 2) {
+      Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
+          << opNames[0];
+      if (gep->getNumOperands() == 2)
+        Out << ", " << opNames[1];
+    } else {
+      Out << "std::vector<Value*> " << iName << "_indices;";
+      nl(Out);
+      for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
+        Out << iName << "_indices.push_back("
+            << opNames[i] << ");";
+        nl(Out);
+      }
+      Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
+          << opNames[0] << ", " << iName << "_indices";
+    }
+    Out << ", \"";
+    printEscapedString(gep->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::PHI: {
+    const PHINode* phi = cast<PHINode>(I);
+
+    Out << "PHINode* " << iName << " = PHINode::Create("
+        << getCppName(phi->getType()) << ", "
+        << phi->getNumIncomingValues() << ", \"";
+    printEscapedString(phi->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out);
+    for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
+      Out << iName << "->addIncoming("
+          << opNames[PHINode::getOperandNumForIncomingValue(i)] << ", "
+          << getOpName(phi->getIncomingBlock(i)) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast: {
+    const CastInst* cst = cast<CastInst>(I);
+    Out << "CastInst* " << iName << " = new ";
+    switch (I->getOpcode()) {
+    case Instruction::Trunc:    Out << "TruncInst"; break;
+    case Instruction::ZExt:     Out << "ZExtInst"; break;
+    case Instruction::SExt:     Out << "SExtInst"; break;
+    case Instruction::FPTrunc:  Out << "FPTruncInst"; break;
+    case Instruction::FPExt:    Out << "FPExtInst"; break;
+    case Instruction::FPToUI:   Out << "FPToUIInst"; break;
+    case Instruction::FPToSI:   Out << "FPToSIInst"; break;
+    case Instruction::UIToFP:   Out << "UIToFPInst"; break;
+    case Instruction::SIToFP:   Out << "SIToFPInst"; break;
+    case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
+    case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
+    case Instruction::BitCast:  Out << "BitCastInst"; break;
+    default: llvm_unreachable("Unreachable");
+    }
+    Out << "(" << opNames[0] << ", "
+        << getCppName(cst->getType()) << ", \"";
+    printEscapedString(cst->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::Call: {
+    const CallInst* call = cast<CallInst>(I);
+    if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) {
+      Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get("
+          << getCppName(ila->getFunctionType()) << ", \""
+          << ila->getAsmString() << "\", \""
+          << ila->getConstraintString() << "\","
+          << (ila->hasSideEffects() ? "true" : "false") << ");";
+      nl(Out);
+    }
+    if (call->getNumArgOperands() > 1) {
+      Out << "std::vector<Value*> " << iName << "_params;";
+      nl(Out);
+      for (unsigned i = 0; i < call->getNumArgOperands(); ++i) {
+        Out << iName << "_params.push_back(" << opNames[i] << ");";
+        nl(Out);
+      }
+      Out << "CallInst* " << iName << " = CallInst::Create("
+          << opNames[call->getNumArgOperands()] << ", "
+          << iName << "_params, \"";
+    } else if (call->getNumArgOperands() == 1) {
+      Out << "CallInst* " << iName << " = CallInst::Create("
+          << opNames[call->getNumArgOperands()] << ", " << opNames[0] << ", \"";
+    } else {
+      Out << "CallInst* " << iName << " = CallInst::Create("
+          << opNames[call->getNumArgOperands()] << ", \"";
+    }
+    printEscapedString(call->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCallingConv(";
+    printCallingConv(call->getCallingConv());
+    Out << ");";
+    nl(Out) << iName << "->setTailCall("
+        << (call->isTailCall() ? "true" : "false");
+    Out << ");";
+    nl(Out);
+    printAttributes(call->getAttributes(), iName);
+    Out << iName << "->setAttributes(" << iName << "_PAL);";
+    nl(Out);
+    break;
+  }
+  case Instruction::Select: {
+    const SelectInst* sel = cast<SelectInst>(I);
+    Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create(";
+    Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+    printEscapedString(sel->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::UserOp1:
+    /// FALL THROUGH
+  case Instruction::UserOp2: {
+    /// FIXME: What should be done here?
+    break;
+  }
+  case Instruction::VAArg: {
+    const VAArgInst* va = cast<VAArgInst>(I);
+    Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst("
+        << opNames[0] << ", " << getCppName(va->getType()) << ", \"";
+    printEscapedString(va->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::ExtractElement: {
+    const ExtractElementInst* eei = cast<ExtractElementInst>(I);
+    Out << "ExtractElementInst* " << getCppName(eei)
+        << " = new ExtractElementInst(" << opNames[0]
+        << ", " << opNames[1] << ", \"";
+    printEscapedString(eei->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::InsertElement: {
+    const InsertElementInst* iei = cast<InsertElementInst>(I);
+    Out << "InsertElementInst* " << getCppName(iei)
+        << " = InsertElementInst::Create(" << opNames[0]
+        << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+    printEscapedString(iei->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::ShuffleVector: {
+    const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I);
+    Out << "ShuffleVectorInst* " << getCppName(svi)
+        << " = new ShuffleVectorInst(" << opNames[0]
+        << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+    printEscapedString(svi->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::ExtractValue: {
+    const ExtractValueInst *evi = cast<ExtractValueInst>(I);
+    Out << "std::vector<unsigned> " << iName << "_indices;";
+    nl(Out);
+    for (unsigned i = 0; i < evi->getNumIndices(); ++i) {
+      Out << iName << "_indices.push_back("
+          << evi->idx_begin()[i] << ");";
+      nl(Out);
+    }
+    Out << "ExtractValueInst* " << getCppName(evi)
+        << " = ExtractValueInst::Create(" << opNames[0]
+        << ", "
+        << iName << "_indices, \"";
+    printEscapedString(evi->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::InsertValue: {
+    const InsertValueInst *ivi = cast<InsertValueInst>(I);
+    Out << "std::vector<unsigned> " << iName << "_indices;";
+    nl(Out);
+    for (unsigned i = 0; i < ivi->getNumIndices(); ++i) {
+      Out << iName << "_indices.push_back("
+          << ivi->idx_begin()[i] << ");";
+      nl(Out);
+    }
+    Out << "InsertValueInst* " << getCppName(ivi)
+        << " = InsertValueInst::Create(" << opNames[0]
+        << ", " << opNames[1] << ", "
+        << iName << "_indices, \"";
+    printEscapedString(ivi->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::Fence: {
+    const FenceInst *fi = cast<FenceInst>(I);
+    StringRef Ordering = ConvertAtomicOrdering(fi->getOrdering());
+    StringRef CrossThread = ConvertAtomicSynchScope(fi->getSynchScope());
+    Out << "FenceInst* " << iName
+        << " = new FenceInst(mod->getContext(), "
+        << Ordering << ", " << CrossThread << ", " << bbname
+        << ");";
+    break;
+  }
+  case Instruction::AtomicCmpXchg: {
+    const AtomicCmpXchgInst *cxi = cast<AtomicCmpXchgInst>(I);
+    StringRef SuccessOrdering =
+        ConvertAtomicOrdering(cxi->getSuccessOrdering());
+    StringRef FailureOrdering =
+        ConvertAtomicOrdering(cxi->getFailureOrdering());
+    StringRef CrossThread = ConvertAtomicSynchScope(cxi->getSynchScope());
+    Out << "AtomicCmpXchgInst* " << iName
+        << " = new AtomicCmpXchgInst("
+        << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", "
+        << SuccessOrdering << ", " << FailureOrdering << ", "
+        << CrossThread << ", " << bbname
+        << ");";
+    nl(Out) << iName << "->setName(\"";
+    printEscapedString(cxi->getName());
+    Out << "\");";
+    nl(Out) << iName << "->setVolatile("
+            << (cxi->isVolatile() ? "true" : "false") << ");";
+    nl(Out) << iName << "->setWeak("
+            << (cxi->isWeak() ? "true" : "false") << ");";
+    break;
+  }
+  case Instruction::AtomicRMW: {
+    const AtomicRMWInst *rmwi = cast<AtomicRMWInst>(I);
+    StringRef Ordering = ConvertAtomicOrdering(rmwi->getOrdering());
+    StringRef CrossThread = ConvertAtomicSynchScope(rmwi->getSynchScope());
+    StringRef Operation;
+    switch (rmwi->getOperation()) {
+      case AtomicRMWInst::Xchg: Operation = "AtomicRMWInst::Xchg"; break;
+      case AtomicRMWInst::Add:  Operation = "AtomicRMWInst::Add"; break;
+      case AtomicRMWInst::Sub:  Operation = "AtomicRMWInst::Sub"; break;
+      case AtomicRMWInst::And:  Operation = "AtomicRMWInst::And"; break;
+      case AtomicRMWInst::Nand: Operation = "AtomicRMWInst::Nand"; break;
+      case AtomicRMWInst::Or:   Operation = "AtomicRMWInst::Or"; break;
+      case AtomicRMWInst::Xor:  Operation = "AtomicRMWInst::Xor"; break;
+      case AtomicRMWInst::Max:  Operation = "AtomicRMWInst::Max"; break;
+      case AtomicRMWInst::Min:  Operation = "AtomicRMWInst::Min"; break;
+      case AtomicRMWInst::UMax: Operation = "AtomicRMWInst::UMax"; break;
+      case AtomicRMWInst::UMin: Operation = "AtomicRMWInst::UMin"; break;
+      case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Bad atomic operation");
+    }
+    Out << "AtomicRMWInst* " << iName
+        << " = new AtomicRMWInst("
+        << Operation << ", "
+        << opNames[0] << ", " << opNames[1] << ", "
+        << Ordering << ", " << CrossThread << ", " << bbname
+        << ");";
+    nl(Out) << iName << "->setName(\"";
+    printEscapedString(rmwi->getName());
+    Out << "\");";
+    nl(Out) << iName << "->setVolatile("
+            << (rmwi->isVolatile() ? "true" : "false") << ");";
+    break;
+  }
+  case Instruction::LandingPad: {
+    const LandingPadInst *lpi = cast<LandingPadInst>(I);
+    Out << "LandingPadInst* " << iName << " = LandingPadInst::Create(";
+    printCppName(lpi->getType());
+    Out << ", " << opNames[0] << ", " << lpi->getNumClauses() << ", \"";
+    printEscapedString(lpi->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCleanup("
+            << (lpi->isCleanup() ? "true" : "false")
+            << ");";
+    for (unsigned i = 0, e = lpi->getNumClauses(); i != e; ++i)
+      nl(Out) << iName << "->addClause(" << opNames[i+1] << ");";
+    break;
+  }
+  }
+  DefinedValues.insert(I);
+  nl(Out);
+  delete [] opNames;
+}
+
+// Print out the types, constants and declarations needed by one function
+void CppWriter::printFunctionUses(const Function* F) {
+  nl(Out) << "// Type Definitions"; nl(Out);
+  if (!is_inline) {
+    // Print the function's return type
+    printType(F->getReturnType());
+
+    // Print the function's function type
+    printType(F->getFunctionType());
+
+    // Print the types of each of the function's arguments
+    for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+         AI != AE; ++AI) {
+      printType(AI->getType());
+    }
+  }
+
+  // Print type definitions for every type referenced by an instruction and
+  // make a note of any global values or constants that are referenced
+  SmallPtrSet<GlobalValue*,64> gvs;
+  SmallPtrSet<Constant*,64> consts;
+  for (Function::const_iterator BB = F->begin(), BE = F->end();
+       BB != BE; ++BB){
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      // Print the type of the instruction itself
+      printType(I->getType());
+
+      // Print the type of each of the instruction's operands
+      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+        Value* operand = I->getOperand(i);
+        printType(operand->getType());
+
+        // If the operand references a GVal or Constant, make a note of it
+        if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
+          gvs.insert(GV);
+          if (GenerationType != GenFunction)
+            if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+              if (GVar->hasInitializer())
+                consts.insert(GVar->getInitializer());
+        } else if (Constant* C = dyn_cast<Constant>(operand)) {
+          consts.insert(C);
+          for (unsigned j = 0; j < C->getNumOperands(); ++j) {
+            // If the operand references a GVal or Constant, make a note of it
+            Value* operand = C->getOperand(j);
+            printType(operand->getType());
+            if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
+              gvs.insert(GV);
+              if (GenerationType != GenFunction)
+                if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+                  if (GVar->hasInitializer())
+                    consts.insert(GVar->getInitializer());
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Print the function declarations for any functions encountered
+  nl(Out) << "// Function Declarations"; nl(Out);
+  for (auto *GV : gvs) {
+    if (Function *Fun = dyn_cast<Function>(GV)) {
+      if (!is_inline || Fun != F)
+        printFunctionHead(Fun);
+    }
+  }
+
+  // Print the global variable declarations for any variables encountered
+  nl(Out) << "// Global Variable Declarations"; nl(Out);
+  for (auto *GV : gvs) {
+    if (GlobalVariable *F = dyn_cast<GlobalVariable>(GV))
+      printVariableHead(F);
+  }
+
+  // Print the constants found
+  nl(Out) << "// Constant Definitions"; nl(Out);
+  for (const auto *C : consts) {
+    printConstant(C);
+  }
+
+  // Process the global variables definitions now that all the constants have
+  // been emitted. These definitions just couple the gvars with their constant
+  // initializers.
+  if (GenerationType != GenFunction) {
+    nl(Out) << "// Global Variable Definitions"; nl(Out);
+    for (const auto &GV : gvs) {
+      if (GlobalVariable *Var = dyn_cast<GlobalVariable>(GV))
+        printVariableBody(Var);
+    }
+  }
+}
+
+void CppWriter::printFunctionHead(const Function* F) {
+  nl(Out) << "Function* " << getCppName(F);
+  Out << " = mod->getFunction(\"";
+  printEscapedString(F->getName());
+  Out << "\");";
+  nl(Out) << "if (!" << getCppName(F) << ") {";
+  nl(Out) << getCppName(F);
+
+  Out<< " = Function::Create(";
+  nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
+  nl(Out) << "/*Linkage=*/";
+  printLinkageType(F->getLinkage());
+  Out << ",";
+  nl(Out) << "/*Name=*/\"";
+  printEscapedString(F->getName());
+  Out << "\", mod); " << (F->isDeclaration()? "// (external, no body)" : "");
+  nl(Out,-1);
+  printCppName(F);
+  Out << "->setCallingConv(";
+  printCallingConv(F->getCallingConv());
+  Out << ");";
+  nl(Out);
+  if (F->hasSection()) {
+    printCppName(F);
+    Out << "->setSection(\"" << F->getSection() << "\");";
+    nl(Out);
+  }
+  if (F->getAlignment()) {
+    printCppName(F);
+    Out << "->setAlignment(" << F->getAlignment() << ");";
+    nl(Out);
+  }
+  if (F->getVisibility() != GlobalValue::DefaultVisibility) {
+    printCppName(F);
+    Out << "->setVisibility(";
+    printVisibilityType(F->getVisibility());
+    Out << ");";
+    nl(Out);
+  }
+  if (F->getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
+    printCppName(F);
+    Out << "->setDLLStorageClass(";
+    printDLLStorageClassType(F->getDLLStorageClass());
+    Out << ");";
+    nl(Out);
+  }
+  if (F->hasGC()) {
+    printCppName(F);
+    Out << "->setGC(\"" << F->getGC() << "\");";
+    nl(Out);
+  }
+  Out << "}";
+  nl(Out);
+  printAttributes(F->getAttributes(), getCppName(F));
+  printCppName(F);
+  Out << "->setAttributes(" << getCppName(F) << "_PAL);";
+  nl(Out);
+}
+
+void CppWriter::printFunctionBody(const Function *F) {
+  if (F->isDeclaration())
+    return; // external functions have no bodies.
+
+  // Clear the DefinedValues and ForwardRefs maps because we can't have
+  // cross-function forward refs
+  ForwardRefs.clear();
+  DefinedValues.clear();
+
+  // Create all the argument values
+  if (!is_inline) {
+    if (!F->arg_empty()) {
+      Out << "Function::arg_iterator args = " << getCppName(F)
+          << "->arg_begin();";
+      nl(Out);
+    }
+    for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+         AI != AE; ++AI) {
+      Out << "Value* " << getCppName(AI) << " = args++;";
+      nl(Out);
+      if (AI->hasName()) {
+        Out << getCppName(AI) << "->setName(\"";
+        printEscapedString(AI->getName());
+        Out << "\");";
+        nl(Out);
+      }
+    }
+  }
+
+  // Create all the basic blocks
+  nl(Out);
+  for (Function::const_iterator BI = F->begin(), BE = F->end();
+       BI != BE; ++BI) {
+    std::string bbname(getCppName(BI));
+    Out << "BasicBlock* " << bbname <<
+           " = BasicBlock::Create(mod->getContext(), \"";
+    if (BI->hasName())
+      printEscapedString(BI->getName());
+    Out << "\"," << getCppName(BI->getParent()) << ",0);";
+    nl(Out);
+  }
+
+  // Output all of its basic blocks... for the function
+  for (Function::const_iterator BI = F->begin(), BE = F->end();
+       BI != BE; ++BI) {
+    std::string bbname(getCppName(BI));
+    nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+    nl(Out);
+
+    // Output all of the instructions in the basic block...
+    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
+         I != E; ++I) {
+      printInstruction(I,bbname);
+    }
+  }
+
+  // Loop over the ForwardRefs and resolve them now that all instructions
+  // are generated.
+  if (!ForwardRefs.empty()) {
+    nl(Out) << "// Resolve Forward References";
+    nl(Out);
+  }
+
+  while (!ForwardRefs.empty()) {
+    ForwardRefMap::iterator I = ForwardRefs.begin();
+    Out << I->second << "->replaceAllUsesWith("
+        << getCppName(I->first) << "); delete " << I->second << ";";
+    nl(Out);
+    ForwardRefs.erase(I);
+  }
+}
+
+void CppWriter::printInline(const std::string& fname,
+                            const std::string& func) {
+  const Function* F = TheModule->getFunction(func);
+  if (!F) {
+    error(std::string("Function '") + func + "' not found in input module");
+    return;
+  }
+  if (F->isDeclaration()) {
+    error(std::string("Function '") + func + "' is external!");
+    return;
+  }
+  nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *"
+          << getCppName(F);
+  unsigned arg_count = 1;
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI) {
+    Out << ", Value* arg_" << arg_count++;
+  }
+  Out << ") {";
+  nl(Out);
+  is_inline = true;
+  printFunctionUses(F);
+  printFunctionBody(F);
+  is_inline = false;
+  Out << "return " << getCppName(F->begin()) << ";";
+  nl(Out) << "}";
+  nl(Out);
+}
+
+void CppWriter::printModuleBody() {
+  // Print out all the type definitions
+  nl(Out) << "// Type Definitions"; nl(Out);
+  printTypes(TheModule);
+
+  // Functions can call each other and global variables can reference them so
+  // define all the functions first before emitting their function bodies.
+  nl(Out) << "// Function Declarations"; nl(Out);
+  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+       I != E; ++I)
+    printFunctionHead(I);
+
+  // Process the global variables declarations. We can't initialze them until
+  // after the constants are printed so just print a header for each global
+  nl(Out) << "// Global Variable Declarations\n"; nl(Out);
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    printVariableHead(I);
+  }
+
+  // Print out all the constants definitions. Constants don't recurse except
+  // through GlobalValues. All GlobalValues have been declared at this point
+  // so we can proceed to generate the constants.
+  nl(Out) << "// Constant Definitions"; nl(Out);
+  printConstants(TheModule);
+
+  // Process the global variables definitions now that all the constants have
+  // been emitted. These definitions just couple the gvars with their constant
+  // initializers.
+  nl(Out) << "// Global Variable Definitions"; nl(Out);
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    printVariableBody(I);
+  }
+
+  // Finally, we can safely put out all of the function bodies.
+  nl(Out) << "// Function Definitions"; nl(Out);
+  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+       I != E; ++I) {
+    if (!I->isDeclaration()) {
+      nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+              << ")";
+      nl(Out) << "{";
+      nl(Out,1);
+      printFunctionBody(I);
+      nl(Out,-1) << "}";
+      nl(Out);
+    }
+  }
+}
+
+void CppWriter::printProgram(const std::string& fname,
+                             const std::string& mName) {
+  Out << "#include <llvm/Pass.h>\n";
+  Out << "#include <llvm/PassManager.h>\n";
+
+  Out << "#include <llvm/ADT/SmallVector.h>\n";
+  Out << "#include <llvm/Analysis/Verifier.h>\n";
+  Out << "#include <llvm/IR/BasicBlock.h>\n";
+  Out << "#include <llvm/IR/CallingConv.h>\n";
+  Out << "#include <llvm/IR/Constants.h>\n";
+  Out << "#include <llvm/IR/DerivedTypes.h>\n";
+  Out << "#include <llvm/IR/Function.h>\n";
+  Out << "#include <llvm/IR/GlobalVariable.h>\n";
+  Out << "#include <llvm/IR/IRPrintingPasses.h>\n";
+  Out << "#include <llvm/IR/InlineAsm.h>\n";
+  Out << "#include <llvm/IR/Instructions.h>\n";
+  Out << "#include <llvm/IR/LLVMContext.h>\n";
+  Out << "#include <llvm/IR/Module.h>\n";
+  Out << "#include <llvm/Support/FormattedStream.h>\n";
+  Out << "#include <llvm/Support/MathExtras.h>\n";
+  Out << "#include <algorithm>\n";
+  Out << "using namespace llvm;\n\n";
+  Out << "Module* " << fname << "();\n\n";
+  Out << "int main(int argc, char**argv) {\n";
+  Out << "  Module* Mod = " << fname << "();\n";
+  Out << "  verifyModule(*Mod, PrintMessageAction);\n";
+  Out << "  PassManager PM;\n";
+  Out << "  PM.add(createPrintModulePass(&outs()));\n";
+  Out << "  PM.run(*Mod);\n";
+  Out << "  return 0;\n";
+  Out << "}\n\n";
+  printModule(fname,mName);
+}
+
+void CppWriter::printModule(const std::string& fname,
+                            const std::string& mName) {
+  nl(Out) << "Module* " << fname << "() {";
+  nl(Out,1) << "// Module Construction";
+  nl(Out) << "Module* mod = new Module(\"";
+  printEscapedString(mName);
+  Out << "\", getGlobalContext());";
+  if (!TheModule->getTargetTriple().empty()) {
+    nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");";
+  }
+  if (!TheModule->getTargetTriple().empty()) {
+    nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple()
+            << "\");";
+  }
+
+  if (!TheModule->getModuleInlineAsm().empty()) {
+    nl(Out) << "mod->setModuleInlineAsm(\"";
+    printEscapedString(TheModule->getModuleInlineAsm());
+    Out << "\");";
+  }
+  nl(Out);
+
+  printModuleBody();
+  nl(Out) << "return mod;";
+  nl(Out,-1) << "}";
+  nl(Out);
+}
+
+void CppWriter::printContents(const std::string& fname,
+                              const std::string& mName) {
+  Out << "\nModule* " << fname << "(Module *mod) {\n";
+  Out << "\nmod->setModuleIdentifier(\"";
+  printEscapedString(mName);
+  Out << "\");\n";
+  printModuleBody();
+  Out << "\nreturn mod;\n";
+  Out << "\n}\n";
+}
+
+void CppWriter::printFunction(const std::string& fname,
+                              const std::string& funcName) {
+  const Function* F = TheModule->getFunction(funcName);
+  if (!F) {
+    error(std::string("Function '") + funcName + "' not found in input module");
+    return;
+  }
+  Out << "\nFunction* " << fname << "(Module *mod) {\n";
+  printFunctionUses(F);
+  printFunctionHead(F);
+  printFunctionBody(F);
+  Out << "return " << getCppName(F) << ";\n";
+  Out << "}\n";
+}
+
+void CppWriter::printFunctions() {
+  const Module::FunctionListType &funcs = TheModule->getFunctionList();
+  Module::const_iterator I  = funcs.begin();
+  Module::const_iterator IE = funcs.end();
+
+  for (; I != IE; ++I) {
+    const Function &func = *I;
+    if (!func.isDeclaration()) {
+      std::string name("define_");
+      name += func.getName();
+      printFunction(name, func.getName());
+    }
+  }
+}
+
+void CppWriter::printVariable(const std::string& fname,
+                              const std::string& varName) {
+  const GlobalVariable* GV = TheModule->getNamedGlobal(varName);
+
+  if (!GV) {
+    error(std::string("Variable '") + varName + "' not found in input module");
+    return;
+  }
+  Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n";
+  printVariableUses(GV);
+  printVariableHead(GV);
+  printVariableBody(GV);
+  Out << "return " << getCppName(GV) << ";\n";
+  Out << "}\n";
+}
+
+void CppWriter::printType(const std::string &fname,
+                          const std::string &typeName) {
+  Type* Ty = TheModule->getTypeByName(typeName);
+  if (!Ty) {
+    error(std::string("Type '") + typeName + "' not found in input module");
+    return;
+  }
+  Out << "\nType* " << fname << "(Module *mod) {\n";
+  printType(Ty);
+  Out << "return " << getCppName(Ty) << ";\n";
+  Out << "}\n";
+}
+
+bool CppWriter::runOnModule(Module &M) {
+  TheModule = &M;
+
+  // Emit a header
+  Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n";
+
+  // Get the name of the function we're supposed to generate
+  std::string fname = FuncName.getValue();
+
+  // Get the name of the thing we are to generate
+  std::string tgtname = NameToGenerate.getValue();
+  if (GenerationType == GenModule ||
+      GenerationType == GenContents ||
+      GenerationType == GenProgram ||
+      GenerationType == GenFunctions) {
+    if (tgtname == "!bad!") {
+      if (M.getModuleIdentifier() == "-")
+        tgtname = "<stdin>";
+      else
+        tgtname = M.getModuleIdentifier();
+    }
+  } else if (tgtname == "!bad!")
+    error("You must use the -for option with -gen-{function,variable,type}");
+
+  switch (WhatToGenerate(GenerationType)) {
+   case GenProgram:
+    if (fname.empty())
+      fname = "makeLLVMModule";
+    printProgram(fname,tgtname);
+    break;
+   case GenModule:
+    if (fname.empty())
+      fname = "makeLLVMModule";
+    printModule(fname,tgtname);
+    break;
+   case GenContents:
+    if (fname.empty())
+      fname = "makeLLVMModuleContents";
+    printContents(fname,tgtname);
+    break;
+   case GenFunction:
+    if (fname.empty())
+      fname = "makeLLVMFunction";
+    printFunction(fname,tgtname);
+    break;
+   case GenFunctions:
+    printFunctions();
+    break;
+   case GenInline:
+    if (fname.empty())
+      fname = "makeLLVMInline";
+    printInline(fname,tgtname);
+    break;
+   case GenVariable:
+    if (fname.empty())
+      fname = "makeLLVMVariable";
+    printVariable(fname,tgtname);
+    break;
+   case GenType:
+    if (fname.empty())
+      fname = "makeLLVMType";
+    printType(fname,tgtname);
+    break;
+  }
+
+  return false;
+}
+
+char CppWriter::ID = 0;
+
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool CPPTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+                                           formatted_raw_ostream &o,
+                                           CodeGenFileType FileType,
+                                           bool DisableVerify,
+                                           AnalysisID StartAfter,
+                                           AnalysisID StopAfter) {
+  if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+  PM.add(new CppWriter(o));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h b/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h
new file mode 100644
index 0000000..673ade7
--- /dev/null
+++ b/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h
@@ -0,0 +1,44 @@
+//===-- CPPTargetMachine.h - TargetMachine for the C++ backend --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C++ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPPTARGETMACHINE_H
+#define CPPTARGETMACHINE_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+
+struct CPPTargetMachine : public TargetMachine {
+  CPPTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL)
+    : TargetMachine(T, TT, CPU, FS, Options) {}
+
+  bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out,
+                           CodeGenFileType FileType, bool DisableVerify,
+                           AnalysisID StartAfter,
+                           AnalysisID StopAfter) override;
+
+  const DataLayout *getDataLayout() const override { return nullptr; }
+};
+
+extern Target TheCppBackendTarget;
+
+} // End llvm namespace
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/contrib/llvm/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
new file mode 100644
index 0000000..096dc73
--- /dev/null
+++ b/contrib/llvm/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -0,0 +1,29 @@
+//===-- CppBackendTargetInfo.cpp - CppBackend Target Implementation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCppBackendTarget;
+
+static bool CppBackend_TripleMatchQuality(Triple::ArchType Arch) {
+  // This backend doesn't correspond to any architecture. It must be explicitly
+  // selected with -march.
+  return false;
+}
+
+extern "C" void LLVMInitializeCppBackendTargetInfo() { 
+  TargetRegistry::RegisterTarget(TheCppBackendTarget, "cpp",    
+                                  "C++ backend",
+                                  &CppBackend_TripleMatchQuality);
+}
+
+extern "C" void LLVMInitializeCppBackendTargetMC() {}
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
new file mode 100644
index 0000000..5467ee3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
@@ -0,0 +1,82 @@
+//=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Hexagon back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_Hexagon_H
+#define TARGET_Hexagon_H
+
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class ModulePass;
+  class TargetMachine;
+  class MachineInstr;
+  class HexagonMCInst;
+  class HexagonAsmPrinter;
+  class HexagonTargetMachine;
+  class raw_ostream;
+
+  FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel);
+  FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM);
+  FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM);
+  FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonCFGOptimizer(const HexagonTargetMachine &TM);
+
+  FunctionPass *createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonSplitConst32AndConst64(
+                      const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonExpandPredSpillCode(
+                      const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonHardwareLoops();
+  FunctionPass *createHexagonPeephole();
+  FunctionPass *createHexagonFixupHwLoops();
+  FunctionPass *createHexagonNewValueJump();
+  FunctionPass *createHexagonCopyToCombine();
+  FunctionPass *createHexagonPacketizer();
+  FunctionPass *createHexagonNewValueJump();
+
+/* TODO: object output.
+  MCCodeEmitter *createHexagonMCCodeEmitter(const Target &,
+                                            const TargetMachine &TM,
+                                            MCContext &Ctx);
+*/
+/* TODO: assembler input.
+  TargetAsmBackend *createHexagonAsmBackend(const Target &,
+                                                  const std::string &);
+*/
+  void HexagonLowerToMC(const MachineInstr *MI, HexagonMCInst &MCI,
+                        HexagonAsmPrinter &AP);
+} // end namespace llvm;
+
+#define Hexagon_POINTER_SIZE 4
+
+#define Hexagon_PointerSize (Hexagon_POINTER_SIZE)
+#define Hexagon_PointerSize_Bits (Hexagon_POINTER_SIZE * 8)
+#define Hexagon_WordSize Hexagon_PointerSize
+#define Hexagon_WordSize_Bits Hexagon_PointerSize_Bits
+
+// allocframe saves LR and FP on stack before allocating
+// a new stack frame. This takes 8 bytes.
+#define HEXAGON_LRFP_SIZE 8
+
+// Normal instruction size (in bytes).
+#define HEXAGON_INSTR_SIZE 4
+
+// Maximum number of words and instructions in a packet.
+#define HEXAGON_PACKET_SIZE 4
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
new file mode 100644
index 0000000..5f4a6c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
@@ -0,0 +1,213 @@
+//===-- Hexagon.td - Describe the Hexagon Target Machine --*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Hexagon Subtarget features.
+//===----------------------------------------------------------------------===//
+
+// Hexagon Archtectures
+def ArchV2       : SubtargetFeature<"v2", "HexagonArchVersion", "V2",
+                                    "Hexagon v2">;
+def ArchV3       : SubtargetFeature<"v3", "HexagonArchVersion", "V3",
+                                    "Hexagon v3">;
+def ArchV4       : SubtargetFeature<"v4", "HexagonArchVersion", "V4",
+                                    "Hexagon v4">;
+def ArchV5       : SubtargetFeature<"v5", "HexagonArchVersion", "V5",
+                                    "Hexagon v5">;
+
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasV2T                      : Predicate<"Subtarget.hasV2TOps()">;
+def HasV2TOnly                  : Predicate<"Subtarget.hasV2TOpsOnly()">;
+def NoV2T                       : Predicate<"!Subtarget.hasV2TOps()">;
+def HasV3T                      : Predicate<"Subtarget.hasV3TOps()">;
+def HasV3TOnly                  : Predicate<"Subtarget.hasV3TOpsOnly()">;
+def NoV3T                       : Predicate<"!Subtarget.hasV3TOps()">;
+def HasV4T                      : Predicate<"Subtarget.hasV4TOps()">;
+def NoV4T                       : Predicate<"!Subtarget.hasV4TOps()">;
+def HasV5T                      : Predicate<"Subtarget.hasV5TOps()">;
+def NoV5T                       : Predicate<"!Subtarget.hasV5TOps()">;
+def UseMEMOP                    : Predicate<"Subtarget.useMemOps()">;
+def IEEERndNearV5T              : Predicate<"Subtarget.modeIEEERndNear()">;
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// PredRel - Filter class used to relate non-predicated instructions with their
+// predicated forms.
+class PredRel;
+// PredNewRel - Filter class used to relate predicated instructions with their
+// predicate-new forms.
+class PredNewRel: PredRel;
+// ImmRegRel - Filter class used to relate instructions having reg-reg form
+// with their reg-imm counterparts.
+class ImmRegRel;
+// NewValueRel - Filter class used to relate regular store instructions with
+// their new-value store form.
+class NewValueRel: PredNewRel;
+// NewValueRel - Filter class used to relate load/store instructions having
+// different addressing modes with each other.
+class AddrModeRel: NewValueRel;
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate non-predicate instructions with their
+// predicated formats - true and false.
+//
+
+def getPredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["PredSense"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = [""];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["true"], ["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-true instructions with their
+// predicate-false forms
+//
+def getFalsePredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"];
+  let ColFields = ["PredSense"];
+  let KeyCol = ["true"];
+  let ValueCols = [["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-false instructions with their
+// predicate-true forms
+//
+def getTruePredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"];
+  let ColFields = ["PredSense"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicated instructions with their .new
+// format.
+//
+def getPredNewOpcode : InstrMapping {
+  let FilterClass = "PredNewRel";
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore", "isBrTaken"];
+  let ColFields = ["PNewValue"];
+  let KeyCol = [""];
+  let ValueCols = [["new"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate .new predicated instructions with their old
+// format.
+//
+def getPredOldOpcode : InstrMapping {
+  let FilterClass = "PredNewRel";
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+  let ColFields = ["PNewValue"];
+  let KeyCol = ["new"];
+  let ValueCols = [[""]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate store instructions with their new-value
+// format.
+//
+def getNewValueOpcode : InstrMapping {
+  let FilterClass = "NewValueRel";
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["NValueST"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate new-value store instructions with their old
+// format.
+//
+def getNonNVStore : InstrMapping {
+  let FilterClass = "NewValueRel";
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["NValueST"];
+  let KeyCol = ["true"];
+  let ValueCols = [["false"]];
+}
+
+def getBasedWithImmOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+                   "isMEMri", "isFloat"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["Absolute"];
+  let ValueCols = [["BaseImmOffset"]];
+}
+
+def getBaseWithRegOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["BaseImmOffset"];
+  let ValueCols = [["BaseRegOffset"]];
+}
+
+def getRegForm : InstrMapping {
+  let FilterClass = "ImmRegRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["InputType"];
+  let KeyCol = ["imm"];
+  let ValueCols = [["reg"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+include "HexagonSchedule.td"
+include "HexagonRegisterInfo.td"
+include "HexagonCallingConv.td"
+include "HexagonInstrInfo.td"
+include "HexagonIntrinsics.td"
+include "HexagonIntrinsicsDerived.td"
+
+def HexagonInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Hexagon processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, SchedMachineModel Model,
+           list<SubtargetFeature> Features>
+ : ProcessorModel<Name, Model, Features>;
+
+def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>;
+def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Hexagon : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = HexagonInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
new file mode 100644
index 0000000..2e011bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -0,0 +1,236 @@
+//===-- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Hexagon assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "InstPrinter/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+static cl::opt<bool> AlignCalls(
+         "hexagon-align-calls", cl::Hidden, cl::init(true),
+          cl::desc("Insert falign after call instruction for Hexagon target"));
+
+void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+
+  switch (MO.getType()) {
+  default: llvm_unreachable ("<unknown operand type>");
+  case MachineOperand::MO_Register:
+    O << HexagonInstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    // Computing the address of a global symbol, not calling it.
+    O << *getSymbol(MO.getGlobal());
+    printOffset(MO.getOffset(), O);
+    return;
+  }
+}
+
+//
+// isBlockOnlyReachableByFallthrough - We need to override this since the
+// default AsmPrinter does not print labels for any basic block that
+// is only reachable by a fall through. That works for all cases except
+// for the case in which the basic block is reachable by a fall through but
+// through an indirect from a jump table. In this case, the jump table
+// will contain a label not defined by AsmPrinter.
+//
+bool HexagonAsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  if (MBB->hasAddressTaken()) {
+    return false;
+  }
+  return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
+}
+
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode,
+                                        raw_ostream &OS) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+    case 'c': // Don't print "$" before a global var name or constant.
+      // Hexagon never has a prefix.
+      printOperand(MI, OpNo, OS);
+      return false;
+    case 'L': // Write second word of DImode reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isReg() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isReg())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        OS << "i";
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNo, OS);
+  return false;
+}
+
+bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo, unsigned AsmVariant,
+                                            const char *ExtraCode,
+                                            raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &Base  = MI->getOperand(OpNo);
+  const MachineOperand &Offset = MI->getOperand(OpNo+1);
+
+  if (Base.isReg())
+    printOperand(MI, OpNo, O);
+  else
+    llvm_unreachable("Unimplemented");
+
+  if (Offset.isImm()) {
+    if (Offset.getImm())
+      O << " + #" << Offset.getImm();
+  }
+  else
+    llvm_unreachable("Unimplemented");
+
+  return false;
+}
+
+
+/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to
+/// the current output stream.
+///
+void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (MI->isBundle()) {
+    std::vector<const MachineInstr*> BundleMIs;
+
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator MII = MI;
+    ++MII;
+    unsigned int IgnoreCount = 0;
+    while (MII != MBB->end() && MII->isInsideBundle()) {
+      const MachineInstr *MInst = MII;
+      if (MInst->getOpcode() == TargetOpcode::DBG_VALUE ||
+          MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+          IgnoreCount++;
+          ++MII;
+          continue;
+      }
+      //BundleMIs.push_back(&*MII);
+      BundleMIs.push_back(MInst);
+      ++MII;
+    }
+    unsigned Size = BundleMIs.size();
+    assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!");
+    for (unsigned Index = 0; Index < Size; Index++) {
+      HexagonMCInst MCI;
+      MCI.setPacketStart(Index == 0);
+      MCI.setPacketEnd(Index == (Size-1));
+
+      HexagonLowerToMC(BundleMIs[Index], MCI, *this);
+      EmitToStreamer(OutStreamer, MCI);
+    }
+  }
+  else {
+    HexagonMCInst MCI;
+    if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+      MCI.setPacketStart(true);
+      MCI.setPacketEnd(true);
+    }
+    HexagonLowerToMC(MI, MCI, *this);
+    EmitToStreamer(OutStreamer, MCI);
+  }
+
+  return;
+}
+
+static MCInstPrinter *createHexagonMCInstPrinter(const Target &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return(new HexagonInstPrinter(MAI, MII, MRI));
+  else
+   return nullptr;
+}
+
+extern "C" void LLVMInitializeHexagonAsmPrinter() {
+  RegisterAsmPrinter<HexagonAsmPrinter> X(TheHexagonTarget);
+
+  TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget,
+                                        createHexagonMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
new file mode 100644
index 0000000..7fe8c57
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -0,0 +1,55 @@
+//===-- HexagonAsmPrinter.h - Print machine code to an Hexagon .s file ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hexagon Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONASMPRINTER_H
+#define HEXAGONASMPRINTER_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  class HexagonAsmPrinter : public AsmPrinter {
+    const HexagonSubtarget *Subtarget;
+
+  public:
+    explicit HexagonAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {
+      Subtarget = &TM.getSubtarget<HexagonSubtarget>();
+    }
+
+    const char *getPassName() const override {
+      return "Hexagon Assembly Printer";
+    }
+
+    bool isBlockOnlyReachableByFallthrough(
+                                   const MachineBasicBlock *MBB) const override;
+
+    void EmitInstruction(const MachineInstr *MI) override;
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &OS) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &OS) override;
+
+    static const char *getRegisterName(unsigned RegNo);
+  };
+
+} // end of llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
new file mode 100644
index 0000000..de340e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -0,0 +1,253 @@
+//===-- HexagonCFGOptimizer.cpp - CFG optimizations -----------------------===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon_cfg"
+
+namespace llvm {
+  void initializeHexagonCFGOptimizerPass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCFGOptimizer : public MachineFunctionPass {
+
+private:
+  const HexagonTargetMachine& QTM;
+  const HexagonSubtarget &QST;
+
+  void InvertAndChangeJumpTarget(MachineInstr*, MachineBasicBlock*);
+
+ public:
+  static char ID;
+  HexagonCFGOptimizer(const HexagonTargetMachine& TM)
+    : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+    initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "Hexagon CFG Optimizer";
+  }
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+
+char HexagonCFGOptimizer::ID = 0;
+
+static bool IsConditionalBranch(int Opc) {
+  return (Opc == Hexagon::JMP_t) || (Opc == Hexagon::JMP_f)
+    || (Opc == Hexagon::JMP_tnew_t) || (Opc == Hexagon::JMP_fnew_t);
+}
+
+
+static bool IsUnconditionalJump(int Opc) {
+  return (Opc == Hexagon::JMP);
+}
+
+
+void
+HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI,
+                                               MachineBasicBlock* NewTarget) {
+  const HexagonInstrInfo *QII = QTM.getInstrInfo();
+  int NewOpcode = 0;
+  switch(MI->getOpcode()) {
+  case Hexagon::JMP_t:
+    NewOpcode = Hexagon::JMP_f;
+    break;
+
+  case Hexagon::JMP_f:
+    NewOpcode = Hexagon::JMP_t;
+    break;
+
+  case Hexagon::JMP_tnew_t:
+    NewOpcode = Hexagon::JMP_fnew_t;
+    break;
+
+  case Hexagon::JMP_fnew_t:
+    NewOpcode = Hexagon::JMP_tnew_t;
+    break;
+
+  default:
+    llvm_unreachable("Cannot handle this case");
+  }
+
+  MI->setDesc(QII->get(NewOpcode));
+  MI->getOperand(1).setMBB(NewTarget);
+}
+
+
+bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+
+    // Traverse the basic block.
+    MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
+    if (MII != MBB->end()) {
+      MachineInstr *MI = MII;
+      int Opc = MI->getOpcode();
+      if (IsConditionalBranch(Opc)) {
+
+        //
+        // (Case 1) Transform the code if the following condition occurs:
+        //   BB1: if (p0) jump BB3
+        //   ...falls-through to BB2 ...
+        //   BB2: jump BB4
+        //   ...next block in layout is BB3...
+        //   BB3: ...
+        //
+        //  Transform this to:
+        //  BB1: if (!p0) jump BB4
+        //  Remove BB2
+        //  BB3: ...
+        //
+        // (Case 2) A variation occurs when BB3 contains a JMP to BB4:
+        //   BB1: if (p0) jump BB3
+        //   ...falls-through to BB2 ...
+        //   BB2: jump BB4
+        //   ...other basic blocks ...
+        //   BB4:
+        //   ...not a fall-thru
+        //   BB3: ...
+        //     jump BB4
+        //
+        // Transform this to:
+        //   BB1: if (!p0) jump BB4
+        //   Remove BB2
+        //   BB3: ...
+        //   BB4: ...
+        //
+        unsigned NumSuccs = MBB->succ_size();
+        MachineBasicBlock::succ_iterator SI = MBB->succ_begin();
+        MachineBasicBlock* FirstSucc = *SI;
+        MachineBasicBlock* SecondSucc = *(++SI);
+        MachineBasicBlock* LayoutSucc = nullptr;
+        MachineBasicBlock* JumpAroundTarget = nullptr;
+
+        if (MBB->isLayoutSuccessor(FirstSucc)) {
+          LayoutSucc = FirstSucc;
+          JumpAroundTarget = SecondSucc;
+        } else if (MBB->isLayoutSuccessor(SecondSucc)) {
+          LayoutSucc = SecondSucc;
+          JumpAroundTarget = FirstSucc;
+        } else {
+          // Odd case...cannot handle.
+        }
+
+        // The target of the unconditional branch must be JumpAroundTarget.
+        // TODO: If not, we should not invert the unconditional branch.
+        MachineBasicBlock* CondBranchTarget = nullptr;
+        if ((MI->getOpcode() == Hexagon::JMP_t) ||
+            (MI->getOpcode() == Hexagon::JMP_f)) {
+          CondBranchTarget = MI->getOperand(1).getMBB();
+        }
+
+        if (!LayoutSucc || (CondBranchTarget != JumpAroundTarget)) {
+          continue;
+        }
+
+        if ((NumSuccs == 2) && LayoutSucc && (LayoutSucc->pred_size() == 1)) {
+
+          // Ensure that BB2 has one instruction -- an unconditional jump.
+          if ((LayoutSucc->size() == 1) &&
+              IsUnconditionalJump(LayoutSucc->front().getOpcode())) {
+            MachineBasicBlock* UncondTarget =
+              LayoutSucc->front().getOperand(0).getMBB();
+            // Check if the layout successor of BB2 is BB3.
+            bool case1 = LayoutSucc->isLayoutSuccessor(JumpAroundTarget);
+            bool case2 = JumpAroundTarget->isSuccessor(UncondTarget) &&
+              JumpAroundTarget->size() >= 1 &&
+              IsUnconditionalJump(JumpAroundTarget->back().getOpcode()) &&
+              JumpAroundTarget->pred_size() == 1 &&
+              JumpAroundTarget->succ_size() == 1;
+
+            if (case1 || case2) {
+              InvertAndChangeJumpTarget(MI, UncondTarget);
+              MBB->removeSuccessor(JumpAroundTarget);
+              MBB->addSuccessor(UncondTarget);
+
+              // Remove the unconditional branch in LayoutSucc.
+              LayoutSucc->erase(LayoutSucc->begin());
+              LayoutSucc->removeSuccessor(UncondTarget);
+              LayoutSucc->addSuccessor(JumpAroundTarget);
+
+              // This code performs the conversion for case 2, which moves
+              // the block to the fall-thru case (BB3 in the code above).
+              if (case2 && !case1) {
+                JumpAroundTarget->moveAfter(LayoutSucc);
+                // only move a block if it doesn't have a fall-thru. otherwise
+                // the CFG will be incorrect.
+                if (!UncondTarget->canFallThrough()) {
+                  UncondTarget->moveAfter(JumpAroundTarget);
+                }
+              }
+
+              //
+              // Correct live-in information. Is used by post-RA scheduler
+              // The live-in to LayoutSucc is now all values live-in to
+              // JumpAroundTarget.
+              //
+              std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(),
+                                               LayoutSucc->livein_end());
+              std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(),
+                                              JumpAroundTarget->livein_end());
+              for (unsigned i = 0; i < OrigLiveIn.size(); ++i) {
+                LayoutSucc->removeLiveIn(OrigLiveIn[i]);
+              }
+              for (unsigned i = 0; i < NewLiveIn.size(); ++i) {
+                LayoutSucc->addLiveIn(NewLiveIn[i]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+static void initializePassOnce(PassRegistry &Registry) {
+  PassInfo *PI = new PassInfo("Hexagon CFG Optimizer", "hexagon-cfg",
+                              &HexagonCFGOptimizer::ID, nullptr, false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonCFGOptimizerPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+FunctionPass *llvm::createHexagonCFGOptimizer(const HexagonTargetMachine &TM) {
+  return new HexagonCFGOptimizer(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
new file mode 100644
index 0000000..e61b2a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -0,0 +1,35 @@
+//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Hexagon architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Hexagon 32-bit C return-value convention.
+def RetCC_Hexagon32 : CallingConv<[
+  CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+  CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+// Hexagon 32-bit C Calling convention.
+def CC_Hexagon32 : CallingConv<[
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+  CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.cpp
new file mode 100644
index 0000000..f5f958c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.cpp
@@ -0,0 +1,204 @@
+//===-- llvm/CallingConvLower.cpp - Calling Convention lowering -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Hexagon_CCState class, used for lowering and
+// implementing calling conventions. Adapted from the machine independent
+// version of the class (CCState) but this handles calls to varargs functions
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonCallingConvLower.h"
+#include "Hexagon.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg,
+                                 const TargetMachine &tm,
+                                 SmallVectorImpl<CCValAssign> &locs,
+                                 LLVMContext &c)
+  : CallingConv(CC), IsVarArg(isVarArg), TM(tm), Locs(locs), Context(c) {
+  // No stack is used.
+  StackOffset = 0;
+
+  UsedRegs.resize((TM.getRegisterInfo()->getNumRegs()+31)/32);
+}
+
+// HandleByVal - Allocate a stack slot large enough to pass an argument by
+// value. The size and alignment information of the argument is encoded in its
+// parameter attribute.
+void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT,
+                                EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                int MinSize, int MinAlign,
+                                ISD::ArgFlagsTy ArgFlags) {
+  unsigned Align = ArgFlags.getByValAlign();
+  unsigned Size  = ArgFlags.getByValSize();
+  if (MinSize > (int)Size)
+    Size = MinSize;
+  if (MinAlign > (int)Align)
+    Align = MinAlign;
+  unsigned Offset = AllocateStack(Size, Align);
+
+  addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset,
+                             LocVT.getSimpleVT(), LocInfo));
+}
+
+/// MarkAllocated - Mark a register and all of its aliases as allocated.
+void Hexagon_CCState::MarkAllocated(unsigned Reg) {
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI/32] |= 1 << (*AI&31);
+}
+
+/// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
+/// incorporating info about the formals into this state.
+void
+Hexagon_CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                        Hexagon_CCAssignFn Fn,
+                                        unsigned SretValueInRegs) {
+  unsigned NumArgs = Ins.size();
+  unsigned i = 0;
+
+  // If the function returns a small struct in registers, skip
+  // over the first (dummy) argument.
+  if (SretValueInRegs != 0) {
+    ++i;
+  }
+
+
+  for (; i != NumArgs; ++i) {
+    EVT ArgVT = Ins[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, 0, 0, false)) {
+      dbgs() << "Formal argument #" << i << " has unhandled type "
+             << ArgVT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
+/// incorporating info about the result values into this state.
+void
+Hexagon_CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               Hexagon_CCAssignFn Fn,
+                               unsigned SretValueInRegs) {
+
+  // For Hexagon, Return small structures in registers.
+  if (SretValueInRegs != 0) {
+    if (SretValueInRegs <= 32) {
+      unsigned Reg = Hexagon::R0;
+      addLoc(CCValAssign::getReg(0, MVT::i32, Reg, MVT::i32,
+                                 CCValAssign::Full));
+      return;
+    }
+    if (SretValueInRegs <= 64) {
+      unsigned Reg = Hexagon::D0;
+      addLoc(CCValAssign::getReg(0, MVT::i64, Reg, MVT::i64,
+                                 CCValAssign::Full));
+      return;
+    }
+  }
+
+
+  // Determine which register each value should be copied into.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    EVT VT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this, -1, -1, false)){
+      dbgs() << "Return operand #" << i << " has unhandled type "
+           << VT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+
+/// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
+/// about the passed values into this state.
+void
+Hexagon_CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg>
+                                     &Outs,
+                                     Hexagon_CCAssignFn Fn,
+                                     int NonVarArgsParams,
+                                     unsigned SretValueSize) {
+  unsigned NumOps = Outs.size();
+
+  unsigned i = 0;
+  // If the called function returns a small struct in registers, skip
+  // the first actual parameter. We do not want to pass a pointer to
+  // the stack location.
+  if (SretValueSize != 0) {
+    ++i;
+  }
+
+  for (; i != NumOps; ++i) {
+    EVT ArgVT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this,
+           NonVarArgsParams, i+1, false)) {
+      dbgs() << "Call operand #" << i << " has unhandled type "
+           << ArgVT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeCallOperands - Same as above except it takes vectors of types
+/// and argument flags.
+void
+Hexagon_CCState::AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
+                                     SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                                     Hexagon_CCAssignFn Fn) {
+  unsigned NumOps = ArgVTs.size();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    EVT ArgVT = ArgVTs[i];
+    ISD::ArgFlagsTy ArgFlags = Flags[i];
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, -1, -1,
+           false)) {
+      dbgs() << "Call operand #" << i << " has unhandled type "
+           << ArgVT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
+/// incorporating info about the passed values into this state.
+void
+Hexagon_CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   Hexagon_CCAssignFn Fn,
+                                   unsigned SretValueInRegs) {
+
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    EVT VT = Ins[i].VT;
+    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+      if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this, -1, -1, false)) {
+        dbgs() << "Call result #" << i << " has unhandled type "
+               << VT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeCallResult - Same as above except it's specialized for calls which
+/// produce a single value.
+void Hexagon_CCState::AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn) {
+  if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this, -1, -1,
+         false)) {
+    dbgs() << "Call result has unhandled type "
+         << VT.getEVTString() << "\n";
+    abort();
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.h b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.h
new file mode 100644
index 0000000..70b8b64
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.h
@@ -0,0 +1,187 @@
+//===-- HexagonCallingConvLower.h - Calling Conventions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon_CCState class, used for lowering
+// and implementing calling conventions. Adapted from the target independent
+// version but this handles calls to varargs functions
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H
+#define LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+//
+// Need to handle varargs.
+//
+namespace llvm {
+  class TargetRegisterInfo;
+  class TargetMachine;
+  class Hexagon_CCState;
+  class SDNode;
+  struct EVT;
+
+/// Hexagon_CCAssignFn - This function assigns a location for Val, updating
+/// State to reflect the change.
+typedef bool Hexagon_CCAssignFn(unsigned ValNo, EVT ValVT,
+                              EVT LocVT, CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, Hexagon_CCState &State,
+                              int NonVarArgsParams,
+                              int CurrentParam,
+                              bool ForceMem);
+
+
+/// CCState - This class holds information needed while lowering arguments and
+/// return values.  It captures which registers are already assigned and which
+/// stack slots are used.  It provides accessors to allocate these values.
+class Hexagon_CCState {
+  CallingConv::ID CallingConv;
+  bool IsVarArg;
+  const TargetMachine &TM;
+  SmallVectorImpl<CCValAssign> &Locs;
+  LLVMContext &Context;
+
+  unsigned StackOffset;
+  SmallVector<uint32_t, 16> UsedRegs;
+public:
+  Hexagon_CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM,
+                  SmallVectorImpl<CCValAssign> &locs, LLVMContext &c);
+
+  void addLoc(const CCValAssign &V) {
+    Locs.push_back(V);
+  }
+
+  LLVMContext &getContext() const { return Context; }
+  const TargetMachine &getTarget() const { return TM; }
+  unsigned getCallingConv() const { return CallingConv; }
+  bool isVarArg() const { return IsVarArg; }
+
+  unsigned getNextStackOffset() const { return StackOffset; }
+
+  /// isAllocated - Return true if the specified register (or an alias) is
+  /// allocated.
+  bool isAllocated(unsigned Reg) const {
+    return UsedRegs[Reg/32] & (1 << (Reg&31));
+  }
+
+  /// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
+  /// incorporating info about the formals into this state.
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
+
+  /// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
+  /// incorporating info about the result values into this state.
+  void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
+
+  /// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
+  /// about the passed values into this state.
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           Hexagon_CCAssignFn Fn, int NonVarArgsParams,
+                           unsigned SretValueSize);
+
+  /// AnalyzeCallOperands - Same as above except it takes vectors of types
+  /// and argument flags.
+  void AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
+                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                           Hexagon_CCAssignFn Fn);
+
+  /// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
+  /// incorporating info about the passed values into this state.
+  void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                         Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
+
+  /// AnalyzeCallResult - Same as above except it's specialized for calls which
+  /// produce a single value.
+  void AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn);
+
+  /// getFirstUnallocated - Return the first unallocated register in the set, or
+  /// NumRegs if they are all allocated.
+  unsigned getFirstUnallocated(const unsigned *Regs, unsigned NumRegs) const {
+    for (unsigned i = 0; i != NumRegs; ++i)
+      if (!isAllocated(Regs[i]))
+        return i;
+    return NumRegs;
+  }
+
+  /// AllocateReg - Attempt to allocate one register.  If it is not available,
+  /// return zero.  Otherwise, return the register, marking it and any aliases
+  /// as allocated.
+  unsigned AllocateReg(unsigned Reg) {
+    if (isAllocated(Reg)) return 0;
+    MarkAllocated(Reg);
+    return Reg;
+  }
+
+  /// Version of AllocateReg with extra register to be shadowed.
+  unsigned AllocateReg(unsigned Reg, unsigned ShadowReg) {
+    if (isAllocated(Reg)) return 0;
+    MarkAllocated(Reg);
+    MarkAllocated(ShadowReg);
+    return Reg;
+  }
+
+  /// AllocateReg - Attempt to allocate one of the specified registers.  If none
+  /// are available, return zero.  Otherwise, return the first one available,
+  /// marking it and any aliases as allocated.
+  unsigned AllocateReg(const unsigned *Regs, unsigned NumRegs) {
+    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
+    if (FirstUnalloc == NumRegs)
+      return 0;    // Didn't find the reg.
+
+    // Mark the register and any aliases as allocated.
+    unsigned Reg = Regs[FirstUnalloc];
+    MarkAllocated(Reg);
+    return Reg;
+  }
+
+  /// Version of AllocateReg with list of registers to be shadowed.
+  unsigned AllocateReg(const unsigned *Regs, const unsigned *ShadowRegs,
+                       unsigned NumRegs) {
+    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
+    if (FirstUnalloc == NumRegs)
+      return 0;    // Didn't find the reg.
+
+    // Mark the register and any aliases as allocated.
+    unsigned Reg = Regs[FirstUnalloc], ShadowReg = ShadowRegs[FirstUnalloc];
+    MarkAllocated(Reg);
+    MarkAllocated(ShadowReg);
+    return Reg;
+  }
+
+  /// AllocateStack - Allocate a chunk of stack space with the specified size
+  /// and alignment.
+  unsigned AllocateStack(unsigned Size, unsigned Align) {
+    assert(Align && ((Align-1) & Align) == 0); // Align is power of 2.
+    StackOffset = ((StackOffset + Align-1) & ~(Align-1));
+    unsigned Result = StackOffset;
+    StackOffset += Size;
+    return Result;
+  }
+
+  // HandleByVal - Allocate a stack slot large enough to pass an argument by
+  // value. The size and alignment information of the argument is encoded in its
+  // parameter attribute.
+  void HandleByVal(unsigned ValNo, EVT ValVT,
+                   EVT LocVT, CCValAssign::LocInfo LocInfo,
+                   int MinSize, int MinAlign, ISD::ArgFlagsTy ArgFlags);
+
+private:
+  /// MarkAllocated - Mark a register and all of its aliases as allocated.
+  void MarkAllocated(unsigned Reg);
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
new file mode 100644
index 0000000..aeff680
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -0,0 +1,676 @@
+//===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces transfer instructions by combine instructions.
+// We walk along a basic block and look for two combinable instructions and try
+// to move them together. If we can move them next to each other we do so and
+// replace them with a combine instruction.
+//===----------------------------------------------------------------------===//
+#include "llvm/PassSupport.h"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-copy-combine"
+
+static
+cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
+                                 cl::Hidden, cl::ZeroOrMore,
+                                 cl::init(false),
+                                 cl::desc("Disable merging into combines"));
+static
+cl::opt<unsigned>
+MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store",
+                   cl::Hidden, cl::init(4),
+                   cl::desc("Maximum distance between a tfr feeding a store we "
+                            "consider the store still to be newifiable"));
+
+namespace llvm {
+  void initializeHexagonCopyToCombinePass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCopyToCombine : public MachineFunctionPass  {
+  const HexagonInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  bool ShouldCombineAggressively;
+
+  DenseSet<MachineInstr *> PotentiallyNewifiableTFR;
+public:
+  static char ID;
+
+  HexagonCopyToCombine() : MachineFunctionPass(ID) {
+    initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const char *getPassName() const override {
+    return "Hexagon Copy-To-Combine Pass";
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+  MachineInstr *findPairable(MachineInstr *I1, bool &DoInsertAtI1);
+
+  void findPotentialNewifiableTFRs(MachineBasicBlock &);
+
+  void combine(MachineInstr *I1, MachineInstr *I2,
+               MachineBasicBlock::iterator &MI, bool DoInsertAtI1);
+
+  bool isSafeToMoveTogether(MachineInstr *I1, MachineInstr *I2,
+                            unsigned I1DestReg, unsigned I2DestReg,
+                            bool &DoInsertAtI1);
+
+  void emitCombineRR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineRI(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineIR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineII(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+};
+
+} // End anonymous namespace.
+
+char HexagonCopyToCombine::ID = 0;
+
+INITIALIZE_PASS(HexagonCopyToCombine, "hexagon-copy-combine",
+                "Hexagon Copy-To-Combine Pass", false, false)
+
+static bool isCombinableInstType(MachineInstr *MI,
+                                 const HexagonInstrInfo *TII,
+                                 bool ShouldCombineAggressively) {
+  switch(MI->getOpcode()) {
+  case Hexagon::TFR: {
+    // A COPY instruction can be combined if its arguments are IntRegs (32bit).
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isReg());
+
+    unsigned DestReg = MI->getOperand(0).getReg();
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    return Hexagon::IntRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg);
+  }
+
+  case Hexagon::TFRI: {
+    // A transfer-immediate can be combined if its argument is a signed 8bit
+    // value.
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+    unsigned DestReg = MI->getOperand(0).getReg();
+
+    // Only combine constant extended TFRI if we are in aggressive mode.
+    return Hexagon::IntRegsRegClass.contains(DestReg) &&
+      (ShouldCombineAggressively || isInt<8>(MI->getOperand(1).getImm()));
+  }
+
+  case Hexagon::TFRI_V4: {
+    if (!ShouldCombineAggressively)
+      return false;
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isGlobal());
+
+    // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a
+    // workaround for an ABI bug that prevents GOT relocations on combine
+    // instructions
+    if (MI->getOperand(1).getTargetFlags() != HexagonII::MO_NO_FLAG)
+      return false;
+
+    unsigned DestReg = MI->getOperand(0).getReg();
+    return Hexagon::IntRegsRegClass.contains(DestReg);
+  }
+
+  default:
+    break;
+  }
+
+  return false;
+}
+
+static bool isGreaterThan8BitTFRI(MachineInstr *I) {
+  return I->getOpcode() == Hexagon::TFRI &&
+    !isInt<8>(I->getOperand(1).getImm());
+}
+static bool isGreaterThan6BitTFRI(MachineInstr *I) {
+  return I->getOpcode() == Hexagon::TFRI &&
+    !isUInt<6>(I->getOperand(1).getImm());
+}
+
+/// areCombinableOperations - Returns true if the two instruction can be merge
+/// into a combine (ignoring register constraints).
+static bool areCombinableOperations(const TargetRegisterInfo *TRI,
+                                    MachineInstr *HighRegInst,
+                                    MachineInstr *LowRegInst) {
+  assert((HighRegInst->getOpcode() == Hexagon::TFR ||
+          HighRegInst->getOpcode() == Hexagon::TFRI ||
+          HighRegInst->getOpcode() == Hexagon::TFRI_V4) &&
+         (LowRegInst->getOpcode() == Hexagon::TFR ||
+          LowRegInst->getOpcode() == Hexagon::TFRI ||
+          LowRegInst->getOpcode() == Hexagon::TFRI_V4) &&
+         "Assume individual instructions are of a combinable type");
+
+  const HexagonRegisterInfo *QRI =
+    static_cast<const HexagonRegisterInfo *>(TRI);
+
+  // V4 added some combine variations (mixed immediate and register source
+  // operands), if we are on < V4 we can only combine 2 register-to-register
+  // moves and 2 immediate-to-register moves. We also don't have
+  // constant-extenders.
+  if (!QRI->Subtarget.hasV4TOps())
+    return HighRegInst->getOpcode() == LowRegInst->getOpcode() &&
+      !isGreaterThan8BitTFRI(HighRegInst) &&
+      !isGreaterThan6BitTFRI(LowRegInst);
+
+  // There is no combine of two constant extended values.
+  if ((HighRegInst->getOpcode() == Hexagon::TFRI_V4 ||
+       isGreaterThan8BitTFRI(HighRegInst)) &&
+      (LowRegInst->getOpcode() == Hexagon::TFRI_V4 ||
+       isGreaterThan6BitTFRI(LowRegInst)))
+    return false;
+
+  return true;
+}
+
+static bool isEvenReg(unsigned Reg) {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         Hexagon::IntRegsRegClass.contains(Reg));
+  return (Reg - Hexagon::R0) % 2 == 0;
+}
+
+static void removeKillInfo(MachineInstr *MI, unsigned RegNotKilled) {
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    MachineOperand &Op = MI->getOperand(I);
+    if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill())
+      continue;
+    Op.setIsKill(false);
+  }
+}
+
+/// isUnsafeToMoveAcross - Returns true if it is unsafe to move a copy
+/// instruction from \p UseReg to \p DestReg over the instruction \p I.
+static bool isUnsafeToMoveAcross(MachineInstr *I, unsigned UseReg,
+                                  unsigned DestReg,
+                                  const TargetRegisterInfo *TRI) {
+  return (UseReg && (I->modifiesRegister(UseReg, TRI))) ||
+          I->modifiesRegister(DestReg, TRI) ||
+          I->readsRegister(DestReg, TRI) ||
+          I->hasUnmodeledSideEffects() ||
+          I->isInlineAsm() || I->isDebugValue();
+}
+
+/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
+/// that the two instructions can be paired in a combine.
+bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
+                                                MachineInstr *I2,
+                                                unsigned I1DestReg,
+                                                unsigned I2DestReg,
+                                                bool &DoInsertAtI1) {
+
+  bool IsImmUseReg = I2->getOperand(1).isImm() || I2->getOperand(1).isGlobal();
+  unsigned I2UseReg = IsImmUseReg ? 0 : I2->getOperand(1).getReg();
+
+  // It is not safe to move I1 and I2 into one combine if I2 has a true
+  // dependence on I1.
+  if (I2UseReg && I1->modifiesRegister(I2UseReg, TRI))
+    return false;
+
+  bool isSafe = true;
+
+  // First try to move I2 towards I1.
+  {
+    // A reverse_iterator instantiated like below starts before I2, and I1
+    // respectively.
+    // Look at instructions I in between I2 and (excluding) I1.
+    MachineBasicBlock::reverse_iterator I(I2),
+      End = --(MachineBasicBlock::reverse_iterator(I1));
+    // At 03 we got better results (dhrystone!) by being more conservative.
+    if (!ShouldCombineAggressively)
+      End = MachineBasicBlock::reverse_iterator(I1);
+    // If I2 kills its operand and we move I2 over an instruction that also
+    // uses I2's use reg we need to modify that (first) instruction to now kill
+    // this reg.
+    unsigned KilledOperand = 0;
+    if (I2->killsRegister(I2UseReg))
+      KilledOperand = I2UseReg;
+    MachineInstr *KillingInstr = nullptr;
+
+    for (; I != End; ++I) {
+      // If the intervening instruction I:
+      //   * modifies I2's use reg
+      //   * modifies I2's def reg
+      //   * reads I2's def reg
+      //   * or has unmodelled side effects
+      // we can't move I2 across it.
+      if (isUnsafeToMoveAcross(&*I, I2UseReg, I2DestReg, TRI)) {
+        isSafe = false;
+        break;
+      }
+
+      // Update first use of the killed operand.
+      if (!KillingInstr && KilledOperand &&
+          I->readsRegister(KilledOperand, TRI))
+        KillingInstr = &*I;
+    }
+    if (isSafe) {
+      // Update the intermediate instruction to with the kill flag.
+      if (KillingInstr) {
+        bool Added = KillingInstr->addRegisterKilled(KilledOperand, TRI, true);
+        (void)Added; // suppress compiler warning
+        assert(Added && "Must successfully update kill flag");
+        removeKillInfo(I2, KilledOperand);
+      }
+      DoInsertAtI1 = true;
+      return true;
+    }
+  }
+
+  // Try to move I1 towards I2.
+  {
+    // Look at instructions I in between I1 and (excluding) I2.
+    MachineBasicBlock::iterator I(I1), End(I2);
+    // At O3 we got better results (dhrystone) by being more conservative here.
+    if (!ShouldCombineAggressively)
+      End = std::next(MachineBasicBlock::iterator(I2));
+    IsImmUseReg = I1->getOperand(1).isImm() || I1->getOperand(1).isGlobal();
+    unsigned I1UseReg = IsImmUseReg ? 0 : I1->getOperand(1).getReg();
+    // Track killed operands. If we move across an instruction that kills our
+    // operand, we need to update the kill information on the moved I1. It kills
+    // the operand now.
+    MachineInstr *KillingInstr = nullptr;
+    unsigned KilledOperand = 0;
+
+    while(++I != End) {
+      // If the intervening instruction I:
+      //   * modifies I1's use reg
+      //   * modifies I1's def reg
+      //   * reads I1's def reg
+      //   * or has unmodelled side effects
+      //   We introduce this special case because llvm has no api to remove a
+      //   kill flag for a register (a removeRegisterKilled() analogous to
+      //   addRegisterKilled) that handles aliased register correctly.
+      //   * or has a killed aliased register use of I1's use reg
+      //           %D4<def> = TFRI64 16
+      //           %R6<def> = TFR %R9
+      //           %R8<def> = KILL %R8, %D4<imp-use,kill>
+      //      If we want to move R6 = across the KILL instruction we would have
+      //      to remove the %D4<imp-use,kill> operand. For now, we are
+      //      conservative and disallow the move.
+      // we can't move I1 across it.
+      if (isUnsafeToMoveAcross(I, I1UseReg, I1DestReg, TRI) ||
+          // Check for an aliased register kill. Bail out if we see one.
+          (!I->killsRegister(I1UseReg) && I->killsRegister(I1UseReg, TRI)))
+        return false;
+
+      // Check for an exact kill (registers match).
+      if (I1UseReg && I->killsRegister(I1UseReg)) {
+        assert(!KillingInstr && "Should only see one killing instruction");
+        KilledOperand = I1UseReg;
+        KillingInstr = &*I;
+      }
+    }
+    if (KillingInstr) {
+      removeKillInfo(KillingInstr, KilledOperand);
+      // Update I1 to set the kill flag. This flag will later be picked up by
+      // the new COMBINE instruction.
+      bool Added = I1->addRegisterKilled(KilledOperand, TRI);
+      (void)Added; // suppress compiler warning
+      assert(Added && "Must successfully update kill flag");
+    }
+    DoInsertAtI1 = false;
+  }
+
+  return true;
+}
+
+/// findPotentialNewifiableTFRs - Finds tranfers that feed stores that could be
+/// newified. (A use of a 64 bit register define can not be newified)
+void
+HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
+  DenseMap<unsigned, MachineInstr *> LastDef;
+  for (MachineBasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+    MachineInstr *MI = I;
+    // Mark TFRs that feed a potential new value store as such.
+    if(TII->mayBeNewStore(MI)) {
+      // Look for uses of TFR instructions.
+      for (unsigned OpdIdx = 0, OpdE = MI->getNumOperands(); OpdIdx != OpdE;
+           ++OpdIdx) {
+        MachineOperand &Op = MI->getOperand(OpdIdx);
+
+        // Skip over anything except register uses.
+        if (!Op.isReg() || !Op.isUse() || !Op.getReg())
+          continue;
+
+        // Look for the defining instruction.
+        unsigned Reg = Op.getReg();
+        MachineInstr *DefInst = LastDef[Reg];
+        if (!DefInst)
+          continue;
+        if (!isCombinableInstType(DefInst, TII, ShouldCombineAggressively))
+          continue;
+
+        // Only close newifiable stores should influence the decision.
+        MachineBasicBlock::iterator It(DefInst);
+        unsigned NumInstsToDef = 0;
+        while (&*It++ != MI)
+          ++NumInstsToDef;
+
+        if (NumInstsToDef > MaxNumOfInstsBetweenNewValueStoreAndTFR)
+          continue;
+
+        PotentiallyNewifiableTFR.insert(DefInst);
+      }
+      // Skip to next instruction.
+      continue;
+    }
+
+    // Put instructions that last defined integer or double registers into the
+    // map.
+    for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+      MachineOperand &Op = MI->getOperand(I);
+      if (!Op.isReg() || !Op.isDef() || !Op.getReg())
+        continue;
+      unsigned Reg = Op.getReg();
+      if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+          LastDef[*SubRegs] = MI;
+        }
+      } else if (Hexagon::IntRegsRegClass.contains(Reg))
+        LastDef[Reg] = MI;
+    }
+  }
+}
+
+bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+
+  if (IsCombinesDisabled) return false;
+
+  bool HasChanged = false;
+
+  // Get target info.
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+
+  // Combine aggressively (for code size)
+  ShouldCombineAggressively =
+    MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
+
+  // Traverse basic blocks.
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    PotentiallyNewifiableTFR.clear();
+    findPotentialNewifiableTFRs(*BI);
+
+    // Traverse instructions in basic block.
+    for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end();
+        MI != End;) {
+      MachineInstr *I1 = MI++;
+      // Don't combine a TFR whose user could be newified (instructions that
+      // define double registers can not be newified - Programmer's Ref Manual
+      // 5.4.2 New-value stores).
+      if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I1))
+        continue;
+
+      // Ignore instructions that are not combinable.
+      if (!isCombinableInstType(I1, TII, ShouldCombineAggressively))
+        continue;
+
+      // Find a second instruction that can be merged into a combine
+      // instruction.
+      bool DoInsertAtI1 = false;
+      MachineInstr *I2 = findPairable(I1, DoInsertAtI1);
+      if (I2) {
+        HasChanged = true;
+        combine(I1, I2, MI, DoInsertAtI1);
+      }
+    }
+  }
+
+  return HasChanged;
+}
+
+/// findPairable - Returns an instruction that can be merged with \p I1 into a
+/// COMBINE instruction or 0 if no such instruction can be found. Returns true
+/// in \p DoInsertAtI1 if the combine must be inserted at instruction \p I1
+/// false if the combine must be inserted at the returned instruction.
+MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
+                                                 bool &DoInsertAtI1) {
+  MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
+  unsigned I1DestReg = I1->getOperand(0).getReg();
+
+  for (MachineBasicBlock::iterator End = I1->getParent()->end(); I2 != End;
+       ++I2) {
+    // Bail out early if we see a second definition of I1DestReg.
+    if (I2->modifiesRegister(I1DestReg, TRI))
+      break;
+
+    // Ignore non-combinable instructions.
+    if (!isCombinableInstType(I2, TII, ShouldCombineAggressively))
+      continue;
+
+    // Don't combine a TFR whose user could be newified.
+    if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I2))
+      continue;
+
+    unsigned I2DestReg = I2->getOperand(0).getReg();
+
+    // Check that registers are adjacent and that the first destination register
+    // is even.
+    bool IsI1LowReg = (I2DestReg - I1DestReg) == 1;
+    bool IsI2LowReg = (I1DestReg - I2DestReg) == 1;
+    unsigned FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
+    if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
+      continue;
+
+    // Check that the two instructions are combinable. V4 allows more
+    // instructions to be merged into a combine.
+    // The order matters because in a TFRI we might can encode a int8 as the
+    // hi reg operand but only a uint6 as the low reg operand.
+    if ((IsI2LowReg && !areCombinableOperations(TRI, I1, I2)) ||
+        (IsI1LowReg && !areCombinableOperations(TRI, I2, I1)))
+      break;
+
+    if (isSafeToMoveTogether(I1, I2, I1DestReg, I2DestReg,
+                             DoInsertAtI1))
+      return I2;
+
+    // Not safe. Stop searching.
+    break;
+  }
+  return nullptr;
+}
+
+void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
+                                   MachineBasicBlock::iterator &MI,
+                                   bool DoInsertAtI1) {
+  // We are going to delete I2. If MI points to I2 advance it to the next
+  // instruction.
+  if ((MachineInstr *)MI == I2) ++MI;
+
+  // Figure out whether I1 or I2 goes into the lowreg part.
+  unsigned I1DestReg = I1->getOperand(0).getReg();
+  unsigned I2DestReg = I2->getOperand(0).getReg();
+  bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
+  unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
+
+  // Get the double word register.
+  unsigned DoubleRegDest =
+    TRI->getMatchingSuperReg(LoRegDef, Hexagon::subreg_loreg,
+                             &Hexagon::DoubleRegsRegClass);
+  assert(DoubleRegDest != 0 && "Expect a valid register");
+
+
+  // Setup source operands.
+  MachineOperand &LoOperand = IsI1Loreg ? I1->getOperand(1) :
+    I2->getOperand(1);
+  MachineOperand &HiOperand = IsI1Loreg ? I2->getOperand(1) :
+    I1->getOperand(1);
+
+  // Figure out which source is a register and which a constant.
+  bool IsHiReg = HiOperand.isReg();
+  bool IsLoReg = LoOperand.isReg();
+
+  MachineBasicBlock::iterator InsertPt(DoInsertAtI1 ? I1 : I2);
+  // Emit combine.
+  if (IsHiReg && IsLoReg)
+    emitCombineRR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsHiReg)
+    emitCombineRI(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsLoReg)
+    emitCombineIR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else
+    emitCombineII(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+
+  I1->eraseFromParent();
+  I2->eraseFromParent();
+}
+
+void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle  globals.
+  if (HiOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+      .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+  if (LoOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Handle constant extended immediates.
+  if (!isInt<8>(HiOperand.getImm())) {
+    assert(isInt<8>(LoOperand.getImm()));
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+
+  if (!isUInt<6>(LoOperand.getImm())) {
+    assert(isInt<8>(HiOperand.getImm()));
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine #HiImm, #LoImm
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+    .addImm(HiOperand.getImm())
+    .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned LoReg = LoOperand.getReg();
+  unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle global.
+  if (HiOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+      .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags())
+      .addReg(LoReg, LoRegKillFlag);
+    return;
+  }
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine #HiImm, LoReg
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+    .addImm(HiOperand.getImm())
+    .addReg(LoReg, LoRegKillFlag);
+}
+
+void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+  unsigned HiReg = HiOperand.getReg();
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle global.
+  if (LoOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+      .addReg(HiReg, HiRegKillFlag)
+      .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine HiReg, #LoImm
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+    .addReg(HiReg, HiRegKillFlag)
+    .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+  unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+  unsigned LoReg = LoOperand.getReg();
+  unsigned HiReg = HiOperand.getReg();
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine HiReg, LoReg
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rr), DoubleDestReg)
+    .addReg(HiReg, HiRegKillFlag)
+    .addReg(LoReg, LoRegKillFlag);
+}
+
+FunctionPass *llvm::createHexagonCopyToCombine() {
+  return new HexagonCopyToCombine();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
new file mode 100644
index 0000000..3dafe80
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -0,0 +1,201 @@
+//===-- HexagonExpandPredSpillCode.cpp - Expand Predicate Spill Code ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The Hexagon processor has no instructions that load or store predicate
+// registers directly.  So, when these registers must be spilled a general
+// purpose register must be found and the value copied to/from it from/to
+// the predicate register.  This code currently does not use the register
+// scavenger mechanism available in the allocator.  There are two registers
+// reserved to allow spilling/restoring predicate registers.  One is used to
+// hold the predicate value.  The other is used when stack frame offsets are
+// too large.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+
+namespace llvm {
+  void initializeHexagonExpandPredSpillCodePass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonExpandPredSpillCode : public MachineFunctionPass {
+    const HexagonTargetMachine& QTM;
+    const HexagonSubtarget &QST;
+
+ public:
+    static char ID;
+    HexagonExpandPredSpillCode(const HexagonTargetMachine& TM) :
+      MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+      PassRegistry &Registry = *PassRegistry::getPassRegistry();
+      initializeHexagonExpandPredSpillCodePass(Registry);
+    }
+
+    const char *getPassName() const override {
+      return "Hexagon Expand Predicate Spill Code";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+
+char HexagonExpandPredSpillCode::ID = 0;
+
+
+bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
+
+  const HexagonInstrInfo *TII = QTM.getInstrInfo();
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    // Traverse the basic block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+         ++MII) {
+      MachineInstr *MI = MII;
+      int Opc = MI->getOpcode();
+      if (Opc == Hexagon::STriw_pred) {
+        // STriw_pred [R30], ofst, SrcReg;
+        unsigned FP = MI->getOperand(0).getReg();
+        assert(FP == QTM.getRegisterInfo()->getFrameRegister() &&
+               "Not a Frame Pointer, Nor a Spill Slot");
+        assert(MI->getOperand(1).isImm() && "Not an offset");
+        int Offset = MI->getOperand(1).getImm();
+        int SrcReg = MI->getOperand(2).getReg();
+        assert(Hexagon::PredRegsRegClass.contains(SrcReg) &&
+               "Not a predicate register");
+        if (!TII->isValidOffset(Hexagon::STriw_indexed, Offset)) {
+          if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::CONST32_Int_Real),
+                      HEXAGON_RESERVED_REG_1).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr),
+                    HEXAGON_RESERVED_REG_1)
+              .addReg(FP).addReg(HEXAGON_RESERVED_REG_1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+                      HEXAGON_RESERVED_REG_2).addReg(SrcReg);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::STriw_indexed))
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0).addReg(HEXAGON_RESERVED_REG_2);
+          } else {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri),
+                      HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+                      HEXAGON_RESERVED_REG_2).addReg(SrcReg);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                          TII->get(Hexagon::STriw_indexed))
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0)
+              .addReg(HEXAGON_RESERVED_REG_2);
+          }
+        } else {
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+                    HEXAGON_RESERVED_REG_2).addReg(SrcReg);
+          BuildMI(*MBB, MII, MI->getDebugLoc(),
+                        TII->get(Hexagon::STriw_indexed)).
+                    addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2);
+        }
+        MII = MBB->erase(MI);
+        --MII;
+      } else if (Opc == Hexagon::LDriw_pred) {
+        // DstReg = LDriw_pred [R30], ofst.
+        int DstReg = MI->getOperand(0).getReg();
+        assert(Hexagon::PredRegsRegClass.contains(DstReg) &&
+               "Not a predicate register");
+        unsigned FP = MI->getOperand(1).getReg();
+        assert(FP == QTM.getRegisterInfo()->getFrameRegister() &&
+               "Not a Frame Pointer, Nor a Spill Slot");
+        assert(MI->getOperand(2).isImm() && "Not an offset");
+        int Offset = MI->getOperand(2).getImm();
+        if (!TII->isValidOffset(Hexagon::LDriw, Offset)) {
+          if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::CONST32_Int_Real),
+                      HEXAGON_RESERVED_REG_1).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr),
+                    HEXAGON_RESERVED_REG_1)
+              .addReg(FP)
+              .addReg(HEXAGON_RESERVED_REG_1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+                      HEXAGON_RESERVED_REG_2)
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+                      DstReg).addReg(HEXAGON_RESERVED_REG_2);
+          } else {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri),
+                      HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+                      HEXAGON_RESERVED_REG_2)
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+                      DstReg).addReg(HEXAGON_RESERVED_REG_2);
+          }
+        } else {
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+                    HEXAGON_RESERVED_REG_2).addReg(FP).addImm(Offset);
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+                    DstReg).addReg(HEXAGON_RESERVED_REG_2);
+        }
+        MII = MBB->erase(MI);
+        --MII;
+      }
+    }
+  }
+
+  return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "Hexagon Expand Predicate Spill Code";
+  PassInfo *PI = new PassInfo(Name, "hexagon-spill-pred",
+                              &HexagonExpandPredSpillCode::ID,
+                              nullptr, false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonExpandPredSpillCodePass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+FunctionPass*
+llvm::createHexagonExpandPredSpillCode(const HexagonTargetMachine &TM) {
+  return new HexagonExpandPredSpillCode(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
new file mode 100644
index 0000000..d41939a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -0,0 +1,185 @@
+//===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// The loop start address in the LOOPn instruction is encoded as a distance
+// from the LOOPn instruction itself.  If the start address is too far from
+// the LOOPn instruction, the loop needs to be set up manually, i.e. via
+// direct transfers to SAn and LCn.
+// This pass will identify and convert such LOOPn instructions to a proper
+// form.
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/ADT/DenseMap.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeHexagonFixupHwLoopsPass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonFixupHwLoops : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonFixupHwLoops() : MachineFunctionPass(ID) {
+      initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "Hexagon Hardware Loop Fixup";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// \brief Maximum distance between the loop instr and the basic block.
+    /// Just an estimate.
+    static const unsigned MAX_LOOP_DISTANCE = 200;
+
+    /// \brief Check the offset between each loop instruction and
+    /// the loop basic block to determine if we can use the LOOP instruction
+    /// or if we need to set the LC/SA registers explicitly.
+    bool fixupLoopInstrs(MachineFunction &MF);
+
+    /// \brief Add the instruction to set the LC and SA registers explicitly.
+    void convertLoopInstr(MachineFunction &MF,
+                          MachineBasicBlock::iterator &MII,
+                          RegScavenger &RS);
+
+  };
+
+  char HexagonFixupHwLoops::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonFixupHwLoops, "hwloopsfixup",
+                "Hexagon Hardware Loops Fixup", false, false)
+
+FunctionPass *llvm::createHexagonFixupHwLoops() {
+  return new HexagonFixupHwLoops();
+}
+
+
+/// \brief Returns true if the instruction is a hardware loop instruction.
+static bool isHardwareLoop(const MachineInstr *MI) {
+  return MI->getOpcode() == Hexagon::LOOP0_r ||
+         MI->getOpcode() == Hexagon::LOOP0_i;
+}
+
+
+bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = fixupLoopInstrs(MF);
+  return Changed;
+}
+
+
+/// \brief For Hexagon, if the loop label is to far from the
+/// loop instruction then we need to set the LC0 and SA0 registers
+/// explicitly instead of using LOOP(start,count).  This function
+/// checks the distance, and generates register assignments if needed.
+///
+/// This function makes two passes over the basic blocks.  The first
+/// pass computes the offset of the basic block from the start.
+/// The second pass checks all the loop instructions.
+bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
+
+  // Offset of the current instruction from the start.
+  unsigned InstOffset = 0;
+  // Map for each basic block to it's first instruction.
+  DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset;
+
+  // First pass - compute the offset of each basic block.
+  for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end();
+       MBB != MBBe; ++MBB) {
+    BlockToInstOffset[MBB] = InstOffset;
+    InstOffset += (MBB->size() * 4);
+  }
+
+  // Second pass - check each loop instruction to see if it needs to
+  // be converted.
+  InstOffset = 0;
+  bool Changed = false;
+  RegScavenger RS;
+
+  // Loop over all the basic blocks.
+  for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end();
+       MBB != MBBe; ++MBB) {
+    InstOffset = BlockToInstOffset[MBB];
+    RS.enterBasicBlock(MBB);
+
+    // Loop over all the instructions.
+    MachineBasicBlock::iterator MIE = MBB->end();
+    MachineBasicBlock::iterator MII = MBB->begin();
+    while (MII != MIE) {
+      if (isHardwareLoop(MII)) {
+        RS.forward(MII);
+        assert(MII->getOperand(0).isMBB() &&
+               "Expect a basic block as loop operand");
+        int Sub = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()];
+        unsigned Dist = Sub > 0 ? Sub : -Sub;
+        if (Dist > MAX_LOOP_DISTANCE) {
+          // Convert to explicity setting LC0 and SA0.
+          convertLoopInstr(MF, MII, RS);
+          MII = MBB->erase(MII);
+          Changed = true;
+        } else {
+          ++MII;
+        }
+      } else {
+        ++MII;
+      }
+      InstOffset += 4;
+    }
+  }
+
+  return Changed;
+}
+
+
+/// \brief convert a loop instruction to a sequence of instructions that
+/// set the LC0 and SA0 register explicitly.
+void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF,
+                                           MachineBasicBlock::iterator &MII,
+                                           RegScavenger &RS) {
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  MachineBasicBlock *MBB = MII->getParent();
+  DebugLoc DL = MII->getDebugLoc();
+  unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0);
+
+  // First, set the LC0 with the trip count.
+  if (MII->getOperand(1).isReg()) {
+    // Trip count is a register
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0)
+      .addReg(MII->getOperand(1).getReg());
+  } else {
+    // Trip count is an immediate.
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFRI), Scratch)
+      .addImm(MII->getOperand(1).getImm());
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0)
+      .addReg(Scratch);
+  }
+  // Then, set the SA0 with the loop start address.
+  BuildMI(*MBB, MII, DL, TII->get(Hexagon::CONST32_Label), Scratch)
+    .addMBB(MII->getOperand(0).getMBB());
+  BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::SA0)
+    .addReg(Scratch);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
new file mode 100644
index 0000000..21df12f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -0,0 +1,346 @@
+//===-- HexagonFrameLowering.cpp - Define frame lowering ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonFrameLowering.h"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableDeallocRet(
+                       "disable-hexagon-dealloc-ret",
+                       cl::Hidden,
+                       cl::desc("Disable Dealloc Return for Hexagon target"));
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void HexagonFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned FrameSize = MFI->getStackSize();
+
+  // Get the alignments provided by the target.
+  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = RoundUpToAlignment(maxCallFrameSize, TargetAlign);
+
+  // Update maximum call frame size.
+  MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = RoundUpToAlignment(FrameSize, TargetAlign);
+
+  // Update frame info.
+  MFI->setStackSize(FrameSize);
+}
+
+
+void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  const HexagonRegisterInfo *QRI =
+    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  determineFrameLayout(MF);
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  int NumBytes = (int) MFI->getStackSize();
+
+  // LLVM expects allocframe not to be the first instruction in the
+  // basic block.
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+
+  //
+  // ALLOCA adjust regs.  Iterate over ADJDYNALLOC nodes and change the offset.
+  //
+  HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+  const std::vector<MachineInstr*>& AdjustRegs =
+    FuncInfo->getAllocaAdjustInsts();
+  for (std::vector<MachineInstr*>::const_iterator i = AdjustRegs.begin(),
+         e = AdjustRegs.end();
+       i != e; ++i) {
+    MachineInstr* MI = *i;
+    assert((MI->getOpcode() == Hexagon::ADJDYNALLOC) &&
+           "Expected adjust alloca node");
+
+    MachineOperand& MO = MI->getOperand(2);
+    assert(MO.isImm() && "Expected immediate");
+    MO.setImm(MFI->getMaxCallFrameSize());
+  }
+
+  //
+  // Only insert ALLOCFRAME if we need to.
+  //
+  if (hasFP(MF)) {
+    // Check for overflow.
+    // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
+    const int ALLOCFRAME_MAX = 16384;
+    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+    if (NumBytes >= ALLOCFRAME_MAX) {
+      // Emit allocframe(#0).
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(0);
+
+      // Subtract offset from frame pointer.
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::CONST32_Int_Real),
+                                      HEXAGON_RESERVED_REG_1).addImm(NumBytes);
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::SUB_rr),
+                                      QRI->getStackRegister()).
+                                      addReg(QRI->getStackRegister()).
+                                      addReg(HEXAGON_RESERVED_REG_1);
+    } else {
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(NumBytes);
+    }
+  }
+}
+// Returns true if MBB has a machine instructions that indicates a tail call
+// in the block.
+bool HexagonFrameLowering::hasTailCall(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  return RetOpcode == Hexagon::TCRETURNtg || RetOpcode == Hexagon::TCRETURNtext;
+}
+
+void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = std::prev(MBB.end());
+  DebugLoc dl = MBBI->getDebugLoc();
+  //
+  // Only insert deallocframe if we need to.  Also at -O0.  See comment
+  // in emitPrologue above.
+  //
+  if (hasFP(MF) || MF.getTarget().getOptLevel() == CodeGenOpt::None) {
+    MachineBasicBlock::iterator MBBI = std::prev(MBB.end());
+    MachineBasicBlock::iterator MBBI_end = MBB.end();
+
+    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+    // Handle EH_RETURN.
+    if (MBBI->getOpcode() == Hexagon::EH_RETURN_JMPR) {
+      assert(MBBI->getOperand(0).isReg() && "Offset should be in register!");
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::ADD_rr),
+              Hexagon::R29).addReg(Hexagon::R29).addReg(Hexagon::R28);
+      return;
+    }
+    // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
+    // versions.
+    if (MF.getTarget().getSubtarget<HexagonSubtarget>().hasV4TOps() &&
+        MBBI->getOpcode() == Hexagon::JMPret && !DisableDeallocRet) {
+      // Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
+      // instruction if we encounter it.
+      MachineBasicBlock::iterator BeforeJMPR =
+        MBB.begin() == MBBI ? MBBI : std::prev(MBBI);
+      if (BeforeJMPR != MBBI &&
+          BeforeJMPR->getOpcode() == Hexagon::RESTORE_DEALLOC_RET_JMP_V4) {
+        // Remove the JMPR node.
+        MBB.erase(MBBI);
+        return;
+      }
+
+      // Add dealloc_return.
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4));
+      // Transfer the function live-out registers.
+      MIB->copyImplicitOps(*MBB.getParent(), &*MBBI);
+      // Remove the JUMPR node.
+      MBB.erase(MBBI);
+    } else { // Add deallocframe for V2 and V3, and V4 tail calls.
+      // Check for RESTORE_DEALLOC_BEFORE_TAILCALL_V4. We don't need an extra
+      // DEALLOCFRAME instruction after it.
+      MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+      MachineBasicBlock::iterator I =
+        Term == MBB.begin() ?  MBB.end() : std::prev(Term);
+      if (I != MBB.end() &&
+          I->getOpcode() == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4)
+        return;
+
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
+    }
+  }
+}
+
+bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+  return (MFI->hasCalls() || (MFI->getStackSize() > 0) ||
+          FuncInfo->hasClobberLR() );
+}
+
+static inline
+unsigned uniqueSuperReg(unsigned Reg, const TargetRegisterInfo *TRI) {
+  MCSuperRegIterator SRI(Reg, TRI);
+  assert(SRI.isValid() && "Expected a superreg");
+  unsigned SuperReg = *SRI;
+  ++SRI;
+  assert(!SRI.isValid() && "Expected exactly one superreg");
+  return SuperReg;
+}
+
+bool
+HexagonFrameLowering::spillCalleeSavedRegisters(
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  if (CSI.empty()) {
+    return false;
+  }
+
+  // We can only schedule double loads if we spill contiguous callee-saved regs
+  // For instance, we cannot scheduled double-word loads if we spill r24,
+  // r26, and r27.
+  // Hexagon_TODO: We can try to double-word align odd registers for -O2 and
+  // above.
+  bool ContiguousRegs = true;
+
+  for (unsigned i = 0; i < CSI.size(); ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    //
+    // Check if we can use a double-word store.
+    //
+    unsigned SuperReg = uniqueSuperReg(Reg, TRI);
+    bool CanUseDblStore = false;
+    const TargetRegisterClass* SuperRegClass = nullptr;
+
+    if (ContiguousRegs && (i < CSI.size()-1)) {
+      unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
+      SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg);
+      CanUseDblStore = (SuperRegNext == SuperReg);
+    }
+
+
+    if (CanUseDblStore) {
+      TII.storeRegToStackSlot(MBB, MI, SuperReg, true,
+                              CSI[i+1].getFrameIdx(), SuperRegClass, TRI);
+      MBB.addLiveIn(SuperReg);
+      ++i;
+    } else {
+      // Cannot use a double-word store.
+      ContiguousRegs = false;
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), RC,
+                              TRI);
+      MBB.addLiveIn(Reg);
+    }
+  }
+  return true;
+}
+
+
+bool HexagonFrameLowering::restoreCalleeSavedRegisters(
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  if (CSI.empty()) {
+    return false;
+  }
+
+  // We can only schedule double loads if we spill contiguous callee-saved regs
+  // For instance, we cannot scheduled double-word loads if we spill r24,
+  // r26, and r27.
+  // Hexagon_TODO: We can try to double-word align odd registers for -O2 and
+  // above.
+  bool ContiguousRegs = true;
+
+  for (unsigned i = 0; i < CSI.size(); ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    //
+    // Check if we can use a double-word load.
+    //
+    unsigned SuperReg = uniqueSuperReg(Reg, TRI);
+    const TargetRegisterClass* SuperRegClass = nullptr;
+    bool CanUseDblLoad = false;
+    if (ContiguousRegs && (i < CSI.size()-1)) {
+      unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
+      SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg);
+      CanUseDblLoad = (SuperRegNext == SuperReg);
+    }
+
+
+    if (CanUseDblLoad) {
+      TII.loadRegFromStackSlot(MBB, MI, SuperReg, CSI[i+1].getFrameIdx(),
+                               SuperRegClass, TRI);
+      MBB.addLiveIn(SuperReg);
+      ++i;
+    } else {
+      // Cannot use a double-word load.
+      ContiguousRegs = false;
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+      MBB.addLiveIn(Reg);
+    }
+  }
+  return true;
+}
+
+void HexagonFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+
+  if (MI.getOpcode() == Hexagon::ADJCALLSTACKDOWN) {
+    // Hexagon_TODO: add code
+  } else if (MI.getOpcode() == Hexagon::ADJCALLSTACKUP) {
+    // Hexagon_TODO: add code
+  } else {
+    llvm_unreachable("Cannot handle this call frame pseudo instruction");
+  }
+  MBB.erase(I);
+}
+
+int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                              int FI) const {
+  return MF.getFrameInfo()->getObjectOffset(FI);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
new file mode 100644
index 0000000..2d4b0b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -0,0 +1,51 @@
+//=- HexagonFrameLowering.h - Define frame lowering for Hexagon --*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGON_FRAMEINFO_H
+#define HEXAGON_FRAMEINFO_H
+
+#include "Hexagon.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class HexagonFrameLowering : public TargetFrameLowering {
+private:
+  void determineFrameLayout(MachineFunction &MF) const;
+
+public:
+  explicit HexagonFrameLowering() : TargetFrameLowering(StackGrowsDown, 8, 0) {}
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool
+  restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const std::vector<CalleeSavedInfo> &CSI,
+                              const TargetRegisterInfo *TRI) const override;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasTailCall(MachineBasicBlock &MBB) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
new file mode 100644
index 0000000..7f76421
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -0,0 +1,1548 @@
+//===-- HexagonHardwareLoops.cpp - Identify and generate hardware loops ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the Hexagon hardware
+// loop instruction.  The hardware loop can perform loop branches with a
+// zero-cycle overhead.
+//
+// The pattern that defines the induction variable can changed depending on
+// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for hardware loops:
+//  - Countable loops (w/ ind. var for a trip count)
+//  - Assumes loops are normalized by IndVarSimplify
+//  - Try inner-most loops first
+//  - No nested hardware loops.
+//  - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hwloops"
+
+#ifndef NDEBUG
+static cl::opt<int> HWLoopLimit("max-hwloop", cl::Hidden, cl::init(-1));
+#endif
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace llvm {
+  void initializeHexagonHardwareLoopsPass(PassRegistry&);
+}
+
+namespace {
+  class CountValue;
+  struct HexagonHardwareLoops : public MachineFunctionPass {
+    MachineLoopInfo            *MLI;
+    MachineRegisterInfo        *MRI;
+    MachineDominatorTree       *MDT;
+    const HexagonTargetMachine *TM;
+    const HexagonInstrInfo     *TII;
+    const HexagonRegisterInfo  *TRI;
+#ifndef NDEBUG
+    static int Counter;
+#endif
+
+  public:
+    static char ID;
+
+    HexagonHardwareLoops() : MachineFunctionPass(ID) {
+      initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override { return "Hexagon Hardware Loops"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// Kinds of comparisons in the compare instructions.
+    struct Comparison {
+      enum Kind {
+        EQ  = 0x01,
+        NE  = 0x02,
+        L   = 0x04, // Less-than property.
+        G   = 0x08, // Greater-than property.
+        U   = 0x40, // Unsigned property.
+        LTs = L,
+        LEs = L | EQ,
+        GTs = G,
+        GEs = G | EQ,
+        LTu = L      | U,
+        LEu = L | EQ | U,
+        GTu = G      | U,
+        GEu = G | EQ | U
+      };
+
+      static Kind getSwappedComparison(Kind Cmp) {
+        assert ((!((Cmp & L) && (Cmp & G))) && "Malformed comparison operator");
+        if ((Cmp & L) || (Cmp & G))
+          return (Kind)(Cmp ^ (L|G));
+        return Cmp;
+      }
+    };
+
+    /// \brief Find the register that contains the loop controlling
+    /// induction variable.
+    /// If successful, it will return true and set the \p Reg, \p IVBump
+    /// and \p IVOp arguments.  Otherwise it will return false.
+    /// The returned induction register is the register R that follows the
+    /// following induction pattern:
+    /// loop:
+    ///   R = phi ..., [ R.next, LatchBlock ]
+    ///   R.next = R + #bump
+    ///   if (R.next < #N) goto loop
+    /// IVBump is the immediate value added to R, and IVOp is the instruction
+    /// "R.next = R + #bump".
+    bool findInductionRegister(MachineLoop *L, unsigned &Reg,
+                               int64_t &IVBump, MachineInstr *&IVOp) const;
+
+    /// \brief Analyze the statements in a loop to determine if the loop
+    /// has a computable trip count and, if so, return a value that represents
+    /// the trip count expression.
+    CountValue *getLoopTripCount(MachineLoop *L,
+                                 SmallVectorImpl<MachineInstr *> &OldInsts);
+
+    /// \brief Return the expression that represents the number of times
+    /// a loop iterates.  The function takes the operands that represent the
+    /// loop start value, loop end value, and induction value.  Based upon
+    /// these operands, the function attempts to compute the trip count.
+    /// If the trip count is not directly available (as an immediate value,
+    /// or a register), the function will attempt to insert computation of it
+    /// to the loop's preheader.
+    CountValue *computeCount(MachineLoop *Loop,
+                             const MachineOperand *Start,
+                             const MachineOperand *End,
+                             unsigned IVReg,
+                             int64_t IVBump,
+                             Comparison::Kind Cmp) const;
+
+    /// \brief Return true if the instruction is not valid within a hardware
+    /// loop.
+    bool isInvalidLoopOperation(const MachineInstr *MI) const;
+
+    /// \brief Return true if the loop contains an instruction that inhibits
+    /// using the hardware loop.
+    bool containsInvalidInstruction(MachineLoop *L) const;
+
+    /// \brief Given a loop, check if we can convert it to a hardware loop.
+    /// If so, then perform the conversion and return true.
+    bool convertToHardwareLoop(MachineLoop *L);
+
+    /// \brief Return true if the instruction is now dead.
+    bool isDead(const MachineInstr *MI,
+                SmallVectorImpl<MachineInstr *> &DeadPhis) const;
+
+    /// \brief Remove the instruction if it is now dead.
+    void removeIfDead(MachineInstr *MI);
+
+    /// \brief Make sure that the "bump" instruction executes before the
+    /// compare.  We need that for the IV fixup, so that the compare
+    /// instruction would not use a bumped value that has not yet been
+    /// defined.  If the instructions are out of order, try to reorder them.
+    bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI);
+
+    /// \brief Get the instruction that loads an immediate value into \p R,
+    /// or 0 if such an instruction does not exist.
+    MachineInstr *defWithImmediate(unsigned R);
+
+    /// \brief Get the immediate value referenced to by \p MO, either for
+    /// immediate operands, or for register operands, where the register
+    /// was defined with an immediate value.
+    int64_t getImmediate(MachineOperand &MO);
+
+    /// \brief Reset the given machine operand to now refer to a new immediate
+    /// value.  Assumes that the operand was already referencing an immediate
+    /// value, either directly, or via a register.
+    void setImmediate(MachineOperand &MO, int64_t Val);
+
+    /// \brief Fix the data flow of the induction varible.
+    /// The desired flow is: phi ---> bump -+-> comparison-in-latch.
+    ///                                     |
+    ///                                     +-> back to phi
+    /// where "bump" is the increment of the induction variable:
+    ///   iv = iv + #const.
+    /// Due to some prior code transformations, the actual flow may look
+    /// like this:
+    ///   phi -+-> bump ---> back to phi
+    ///        |
+    ///        +-> comparison-in-latch (against upper_bound-bump),
+    /// i.e. the comparison that controls the loop execution may be using
+    /// the value of the induction variable from before the increment.
+    ///
+    /// Return true if the loop's flow is the desired one (i.e. it's
+    /// either been fixed, or no fixing was necessary).
+    /// Otherwise, return false.  This can happen if the induction variable
+    /// couldn't be identified, or if the value in the latch's comparison
+    /// cannot be adjusted to reflect the post-bump value.
+    bool fixupInductionVariable(MachineLoop *L);
+
+    /// \brief Given a loop, if it does not have a preheader, create one.
+    /// Return the block that is the preheader.
+    MachineBasicBlock *createPreheaderForLoop(MachineLoop *L);
+  };
+
+  char HexagonHardwareLoops::ID = 0;
+#ifndef NDEBUG
+  int HexagonHardwareLoops::Counter = 0;
+#endif
+
+  /// \brief Abstraction for a trip count of a loop. A smaller vesrsion
+  /// of the MachineOperand class without the concerns of changing the
+  /// operand representation.
+  class CountValue {
+  public:
+    enum CountValueType {
+      CV_Register,
+      CV_Immediate
+    };
+  private:
+    CountValueType Kind;
+    union Values {
+      struct {
+        unsigned Reg;
+        unsigned Sub;
+      } R;
+      unsigned ImmVal;
+    } Contents;
+
+  public:
+    explicit CountValue(CountValueType t, unsigned v, unsigned u = 0) {
+      Kind = t;
+      if (Kind == CV_Register) {
+        Contents.R.Reg = v;
+        Contents.R.Sub = u;
+      } else {
+        Contents.ImmVal = v;
+      }
+    }
+    bool isReg() const { return Kind == CV_Register; }
+    bool isImm() const { return Kind == CV_Immediate; }
+
+    unsigned getReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.R.Reg;
+    }
+    unsigned getSubReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.R.Sub;
+    }
+    unsigned getImm() const {
+      assert(isImm() && "Wrong CountValue accessor");
+      return Contents.ImmVal;
+    }
+
+    void print(raw_ostream &OS, const TargetMachine *TM = nullptr) const {
+      const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr;
+      if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); }
+      if (isImm()) { OS << Contents.ImmVal; }
+    }
+  };
+} // end anonymous namespace
+
+
+INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops",
+                      "Hexagon Hardware Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
+                    "Hexagon Hardware Loops", false, false)
+
+
+/// \brief Returns true if the instruction is a hardware loop instruction.
+static bool isHardwareLoop(const MachineInstr *MI) {
+  return MI->getOpcode() == Hexagon::LOOP0_r ||
+    MI->getOpcode() == Hexagon::LOOP0_i;
+}
+
+FunctionPass *llvm::createHexagonHardwareLoops() {
+  return new HexagonHardwareLoops();
+}
+
+
+bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+
+  bool Changed = false;
+
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  TM  = static_cast<const HexagonTargetMachine*>(&MF.getTarget());
+  TII = static_cast<const HexagonInstrInfo*>(TM->getInstrInfo());
+  TRI = static_cast<const HexagonRegisterInfo*>(TM->getRegisterInfo());
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I) {
+    MachineLoop *L = *I;
+    if (!L->getParentLoop())
+      Changed |= convertToHardwareLoop(L);
+  }
+
+  return Changed;
+}
+
+
+bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
+                                                 unsigned &Reg,
+                                                 int64_t &IVBump,
+                                                 MachineInstr *&IVOp
+                                                 ) const {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  if (!Header || !Preheader || !Latch)
+    return false;
+
+  // This pair represents an induction register together with an immediate
+  // value that will be added to it in each loop iteration.
+  typedef std::pair<unsigned,int64_t> RegisterBump;
+
+  // Mapping:  R.next -> (R, bump), where R, R.next and bump are derived
+  // from an induction operation
+  //   R.next = R + bump
+  // where bump is an immediate value.
+  typedef std::map<unsigned,RegisterBump> InductionMap;
+
+  InductionMap IndMap;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *Phi = &*I;
+
+    // Have a PHI instruction.  Get the operand that corresponds to the
+    // latch block, and see if is a result of an addition of form "reg+imm",
+    // where the "reg" is defined by the PHI node we are looking at.
+    for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+      if (Phi->getOperand(i+1).getMBB() != Latch)
+        continue;
+
+      unsigned PhiOpReg = Phi->getOperand(i).getReg();
+      MachineInstr *DI = MRI->getVRegDef(PhiOpReg);
+      unsigned UpdOpc = DI->getOpcode();
+      bool isAdd = (UpdOpc == Hexagon::ADD_ri);
+
+      if (isAdd) {
+        // If the register operand to the add is the PHI we're
+        // looking at, this meets the induction pattern.
+        unsigned IndReg = DI->getOperand(1).getReg();
+        if (MRI->getVRegDef(IndReg) == Phi) {
+          unsigned UpdReg = DI->getOperand(0).getReg();
+          int64_t V = DI->getOperand(2).getImm();
+          IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+        }
+      }
+    }  // for (i)
+  }  // for (instr)
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return false;
+
+  unsigned CSz = Cond.size();
+  assert (CSz == 1 || CSz == 2);
+  unsigned PredR = Cond[CSz-1].getReg();
+
+  MachineInstr *PredI = MRI->getVRegDef(PredR);
+  if (!PredI->isCompare())
+    return false;
+
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int CmpImm = 0, CmpMask = 0;
+  bool CmpAnalyzed = TII->analyzeCompare(PredI, CmpReg1, CmpReg2,
+                                         CmpMask, CmpImm);
+  // Fail if the compare was not analyzed, or it's not comparing a register
+  // with an immediate value.  Not checking the mask here, since we handle
+  // the individual compare opcodes (including CMPb) later on.
+  if (!CmpAnalyzed)
+    return false;
+
+  // Exactly one of the input registers to the comparison should be among
+  // the induction registers.
+  InductionMap::iterator IndMapEnd = IndMap.end();
+  InductionMap::iterator F = IndMapEnd;
+  if (CmpReg1 != 0) {
+    InductionMap::iterator F1 = IndMap.find(CmpReg1);
+    if (F1 != IndMapEnd)
+      F = F1;
+  }
+  if (CmpReg2 != 0) {
+    InductionMap::iterator F2 = IndMap.find(CmpReg2);
+    if (F2 != IndMapEnd) {
+      if (F != IndMapEnd)
+        return false;
+      F = F2;
+    }
+  }
+  if (F == IndMapEnd)
+    return false;
+
+  Reg = F->second.first;
+  IVBump = F->second.second;
+  IVOp = MRI->getVRegDef(F->first);
+  return true;
+}
+
+
+/// \brief Analyze the statements in a loop to determine if the loop has
+/// a computable trip count and, if so, return a value that represents
+/// the trip count expression.
+///
+/// This function iterates over the phi nodes in the loop to check for
+/// induction variable patterns that are used in the calculation for
+/// the number of time the loop is executed.
+CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
+                                    SmallVectorImpl<MachineInstr *> &OldInsts) {
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+  assert(PI != TopMBB->pred_end() &&
+         "Loop must have more than one incoming edge!");
+  MachineBasicBlock *Backedge = *PI++;
+  if (PI == TopMBB->pred_end())  // dead loop?
+    return nullptr;
+  MachineBasicBlock *Incoming = *PI++;
+  if (PI != TopMBB->pred_end())  // multiple backedges?
+    return nullptr;
+
+  // Make sure there is one incoming and one backedge and determine which
+  // is which.
+  if (L->contains(Incoming)) {
+    if (L->contains(Backedge))
+      return nullptr;
+    std::swap(Incoming, Backedge);
+  } else if (!L->contains(Backedge))
+    return nullptr;
+
+  // Look for the cmp instruction to determine if we can get a useful trip
+  // count.  The trip count can be either a register or an immediate.  The
+  // location of the value depends upon the type (reg or imm).
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  if (!Latch)
+    return nullptr;
+
+  unsigned IVReg = 0;
+  int64_t IVBump = 0;
+  MachineInstr *IVOp;
+  bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp);
+  if (!FoundIV)
+    return nullptr;
+
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+
+  MachineOperand *InitialValue = nullptr;
+  MachineInstr *IV_Phi = MRI->getVRegDef(IVReg);
+  for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) {
+    MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB();
+    if (MBB == Preheader)
+      InitialValue = &IV_Phi->getOperand(i);
+    else if (MBB == Latch)
+      IVReg = IV_Phi->getOperand(i).getReg();  // Want IV reg after bump.
+  }
+  if (!InitialValue)
+    return nullptr;
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return nullptr;
+
+  MachineBasicBlock *Header = L->getHeader();
+  // TB must be non-null.  If FB is also non-null, one of them must be
+  // the header.  Otherwise, branch to TB could be exiting the loop, and
+  // the fall through can go to the header.
+  assert (TB && "Latch block without a branch?");
+  assert ((!FB || TB == Header || FB == Header) && "Branches not to header?");
+  if (!TB || (FB && TB != Header && FB != Header))
+    return nullptr;
+
+  // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch
+  // to put imm(0), followed by P in the vector Cond.
+  // If TB is not the header, it means that the "not-taken" path must lead
+  // to the header.
+  bool Negated = (Cond.size() > 1) ^ (TB != Header);
+  unsigned PredReg = Cond[Cond.size()-1].getReg();
+  MachineInstr *CondI = MRI->getVRegDef(PredReg);
+  unsigned CondOpc = CondI->getOpcode();
+
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int Mask = 0, ImmValue = 0;
+  bool AnalyzedCmp = TII->analyzeCompare(CondI, CmpReg1, CmpReg2,
+                                         Mask, ImmValue);
+  if (!AnalyzedCmp)
+    return nullptr;
+
+  // The comparison operator type determines how we compute the loop
+  // trip count.
+  OldInsts.push_back(CondI);
+  OldInsts.push_back(IVOp);
+
+  // Sadly, the following code gets information based on the position
+  // of the operands in the compare instruction.  This has to be done
+  // this way, because the comparisons check for a specific relationship
+  // between the operands (e.g. is-less-than), rather than to find out
+  // what relationship the operands are in (as on PPC).
+  Comparison::Kind Cmp;
+  bool isSwapped = false;
+  const MachineOperand &Op1 = CondI->getOperand(1);
+  const MachineOperand &Op2 = CondI->getOperand(2);
+  const MachineOperand *EndValue = nullptr;
+
+  if (Op1.isReg()) {
+    if (Op2.isImm() || Op1.getReg() == IVReg)
+      EndValue = &Op2;
+    else {
+      EndValue = &Op1;
+      isSwapped = true;
+    }
+  }
+
+  if (!EndValue)
+    return nullptr;
+
+  switch (CondOpc) {
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPEQrr:
+      Cmp = !Negated ? Comparison::EQ : Comparison::NE;
+      break;
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGTUrr:
+      Cmp = !Negated ? Comparison::GTu : Comparison::LEu;
+      break;
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPGTrr:
+      Cmp = !Negated ? Comparison::GTs : Comparison::LEs;
+      break;
+    // Very limited support for byte/halfword compares.
+    case Hexagon::CMPbEQri_V4:
+    case Hexagon::CMPhEQri_V4: {
+      if (IVBump != 1)
+        return nullptr;
+
+      int64_t InitV, EndV;
+      // Since the comparisons are "ri", the EndValue should be an
+      // immediate.  Check it just in case.
+      assert(EndValue->isImm() && "Unrecognized latch comparison");
+      EndV = EndValue->getImm();
+      // Allow InitialValue to be a register defined with an immediate.
+      if (InitialValue->isReg()) {
+        if (!defWithImmediate(InitialValue->getReg()))
+          return nullptr;
+        InitV = getImmediate(*InitialValue);
+      } else {
+        assert(InitialValue->isImm());
+        InitV = InitialValue->getImm();
+      }
+      if (InitV >= EndV)
+        return nullptr;
+      if (CondOpc == Hexagon::CMPbEQri_V4) {
+        if (!isInt<8>(InitV) || !isInt<8>(EndV))
+          return nullptr;
+      } else {  // Hexagon::CMPhEQri_V4
+        if (!isInt<16>(InitV) || !isInt<16>(EndV))
+          return nullptr;
+      }
+      Cmp = !Negated ? Comparison::EQ : Comparison::NE;
+      break;
+    }
+    default:
+      return nullptr;
+  }
+
+  if (isSwapped)
+   Cmp = Comparison::getSwappedComparison(Cmp);
+
+  if (InitialValue->isReg()) {
+    unsigned R = InitialValue->getReg();
+    MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+    if (!MDT->properlyDominates(DefBB, Header))
+      return nullptr;
+    OldInsts.push_back(MRI->getVRegDef(R));
+  }
+  if (EndValue->isReg()) {
+    unsigned R = EndValue->getReg();
+    MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+    if (!MDT->properlyDominates(DefBB, Header))
+      return nullptr;
+  }
+
+  return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
+}
+
+/// \brief Helper function that returns the expression that represents the
+/// number of times a loop iterates.  The function takes the operands that
+/// represent the loop start value, loop end value, and induction value.
+/// Based upon these operands, the function attempts to compute the trip count.
+CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
+                                               const MachineOperand *Start,
+                                               const MachineOperand *End,
+                                               unsigned IVReg,
+                                               int64_t IVBump,
+                                               Comparison::Kind Cmp) const {
+  // Cannot handle comparison EQ, i.e. while (A == B).
+  if (Cmp == Comparison::EQ)
+    return nullptr;
+
+  // Check if either the start or end values are an assignment of an immediate.
+  // If so, use the immediate value rather than the register.
+  if (Start->isReg()) {
+    const MachineInstr *StartValInstr = MRI->getVRegDef(Start->getReg());
+    if (StartValInstr && StartValInstr->getOpcode() == Hexagon::TFRI)
+      Start = &StartValInstr->getOperand(1);
+  }
+  if (End->isReg()) {
+    const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
+    if (EndValInstr && EndValInstr->getOpcode() == Hexagon::TFRI)
+      End = &EndValInstr->getOperand(1);
+  }
+
+  assert (Start->isReg() || Start->isImm());
+  assert (End->isReg() || End->isImm());
+
+  bool CmpLess =     Cmp & Comparison::L;
+  bool CmpGreater =  Cmp & Comparison::G;
+  bool CmpHasEqual = Cmp & Comparison::EQ;
+
+  // Avoid certain wrap-arounds.  This doesn't detect all wrap-arounds.
+  // If loop executes while iv is "less" with the iv value going down, then
+  // the iv must wrap.
+  if (CmpLess && IVBump < 0)
+    return nullptr;
+  // If loop executes while iv is "greater" with the iv value going up, then
+  // the iv must wrap.
+  if (CmpGreater && IVBump > 0)
+    return nullptr;
+
+  if (Start->isImm() && End->isImm()) {
+    // Both, start and end are immediates.
+    int64_t StartV = Start->getImm();
+    int64_t EndV = End->getImm();
+    int64_t Dist = EndV - StartV;
+    if (Dist == 0)
+      return nullptr;
+
+    bool Exact = (Dist % IVBump) == 0;
+
+    if (Cmp == Comparison::NE) {
+      if (!Exact)
+        return nullptr;
+      if ((Dist < 0) ^ (IVBump < 0))
+        return nullptr;
+    }
+
+    // For comparisons that include the final value (i.e. include equality
+    // with the final value), we need to increase the distance by 1.
+    if (CmpHasEqual)
+      Dist = Dist > 0 ? Dist+1 : Dist-1;
+
+    // assert (CmpLess => Dist > 0);
+    assert ((!CmpLess || Dist > 0) && "Loop should never iterate!");
+    // assert (CmpGreater => Dist < 0);
+    assert ((!CmpGreater || Dist < 0) && "Loop should never iterate!");
+
+    // "Normalized" distance, i.e. with the bump set to +-1.
+    int64_t Dist1 = (IVBump > 0) ? (Dist +  (IVBump-1)) /   IVBump
+                               :  (-Dist + (-IVBump-1)) / (-IVBump);
+    assert (Dist1 > 0 && "Fishy thing.  Both operands have the same sign.");
+
+    uint64_t Count = Dist1;
+
+    if (Count > 0xFFFFFFFFULL)
+      return nullptr;
+
+    return new CountValue(CountValue::CV_Immediate, Count);
+  }
+
+  // A general case: Start and End are some values, but the actual
+  // iteration count may not be available.  If it is not, insert
+  // a computation of it into the preheader.
+
+  // If the induction variable bump is not a power of 2, quit.
+  // Othwerise we'd need a general integer division.
+  if (!isPowerOf2_64(abs64(IVBump)))
+    return nullptr;
+
+  MachineBasicBlock *PH = Loop->getLoopPreheader();
+  assert (PH && "Should have a preheader by now");
+  MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator();
+  DebugLoc DL = (InsertPos != PH->end()) ? InsertPos->getDebugLoc()
+                                         : DebugLoc();
+
+  // If Start is an immediate and End is a register, the trip count
+  // will be "reg - imm".  Hexagon's "subtract immediate" instruction
+  // is actually "reg + -imm".
+
+  // If the loop IV is going downwards, i.e. if the bump is negative,
+  // then the iteration count (computed as End-Start) will need to be
+  // negated.  To avoid the negation, just swap Start and End.
+  if (IVBump < 0) {
+    std::swap(Start, End);
+    IVBump = -IVBump;
+  }
+  // Cmp may now have a wrong direction, e.g.  LEs may now be GEs.
+  // Signedness, and "including equality" are preserved.
+
+  bool RegToImm = Start->isReg() && End->isImm(); // for (reg..imm)
+  bool RegToReg = Start->isReg() && End->isReg(); // for (reg..reg)
+
+  int64_t StartV = 0, EndV = 0;
+  if (Start->isImm())
+    StartV = Start->getImm();
+  if (End->isImm())
+    EndV = End->getImm();
+
+  int64_t AdjV = 0;
+  // To compute the iteration count, we would need this computation:
+  //   Count = (End - Start + (IVBump-1)) / IVBump
+  // or, when CmpHasEqual:
+  //   Count = (End - Start + (IVBump-1)+1) / IVBump
+  // The "IVBump-1" part is the adjustment (AdjV).  We can avoid
+  // generating an instruction specifically to add it if we can adjust
+  // the immediate values for Start or End.
+
+  if (CmpHasEqual) {
+    // Need to add 1 to the total iteration count.
+    if (Start->isImm())
+      StartV--;
+    else if (End->isImm())
+      EndV++;
+    else
+      AdjV += 1;
+  }
+
+  if (Cmp != Comparison::NE) {
+    if (Start->isImm())
+      StartV -= (IVBump-1);
+    else if (End->isImm())
+      EndV += (IVBump-1);
+    else
+      AdjV += (IVBump-1);
+  }
+
+  unsigned R = 0, SR = 0;
+  if (Start->isReg()) {
+    R = Start->getReg();
+    SR = Start->getSubReg();
+  } else {
+    R = End->getReg();
+    SR = End->getSubReg();
+  }
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  // Hardware loops cannot handle 64-bit registers.  If it's a double
+  // register, it has to have a subregister.
+  if (!SR && RC == &Hexagon::DoubleRegsRegClass)
+    return nullptr;
+  const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+
+  // Compute DistR (register with the distance between Start and End).
+  unsigned DistR, DistSR;
+
+  // Avoid special case, where the start value is an imm(0).
+  if (Start->isImm() && StartV == 0) {
+    DistR = End->getReg();
+    DistSR = End->getSubReg();
+  } else {
+    const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::SUB_rr) :
+                              (RegToImm ? TII->get(Hexagon::SUB_ri) :
+                                          TII->get(Hexagon::ADD_ri));
+    unsigned SubR = MRI->createVirtualRegister(IntRC);
+    MachineInstrBuilder SubIB =
+      BuildMI(*PH, InsertPos, DL, SubD, SubR);
+
+    if (RegToReg) {
+      SubIB.addReg(End->getReg(), 0, End->getSubReg())
+           .addReg(Start->getReg(), 0, Start->getSubReg());
+    } else if (RegToImm) {
+      SubIB.addImm(EndV)
+           .addReg(Start->getReg(), 0, Start->getSubReg());
+    } else { // ImmToReg
+      SubIB.addReg(End->getReg(), 0, End->getSubReg())
+           .addImm(-StartV);
+    }
+    DistR = SubR;
+    DistSR = 0;
+  }
+
+  // From DistR, compute AdjR (register with the adjusted distance).
+  unsigned AdjR, AdjSR;
+
+  if (AdjV == 0) {
+    AdjR = DistR;
+    AdjSR = DistSR;
+  } else {
+    // Generate CountR = ADD DistR, AdjVal
+    unsigned AddR = MRI->createVirtualRegister(IntRC);
+    const MCInstrDesc &AddD = TII->get(Hexagon::ADD_ri);
+    BuildMI(*PH, InsertPos, DL, AddD, AddR)
+      .addReg(DistR, 0, DistSR)
+      .addImm(AdjV);
+
+    AdjR = AddR;
+    AdjSR = 0;
+  }
+
+  // From AdjR, compute CountR (register with the final count).
+  unsigned CountR, CountSR;
+
+  if (IVBump == 1) {
+    CountR = AdjR;
+    CountSR = AdjSR;
+  } else {
+    // The IV bump is a power of two. Log_2(IV bump) is the shift amount.
+    unsigned Shift = Log2_32(IVBump);
+
+    // Generate NormR = LSR DistR, Shift.
+    unsigned LsrR = MRI->createVirtualRegister(IntRC);
+    const MCInstrDesc &LsrD = TII->get(Hexagon::LSR_ri);
+    BuildMI(*PH, InsertPos, DL, LsrD, LsrR)
+      .addReg(AdjR, 0, AdjSR)
+      .addImm(Shift);
+
+    CountR = LsrR;
+    CountSR = 0;
+  }
+
+  return new CountValue(CountValue::CV_Register, CountR, CountSR);
+}
+
+
+/// \brief Return true if the operation is invalid within hardware loop.
+bool HexagonHardwareLoops::isInvalidLoopOperation(
+      const MachineInstr *MI) const {
+
+  // call is not allowed because the callee may use a hardware loop
+  if (MI->getDesc().isCall())
+    return true;
+
+  // do not allow nested hardware loops
+  if (isHardwareLoop(MI))
+    return true;
+
+  // check if the instruction defines a hardware loop register
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned R = MO.getReg();
+    if (R == Hexagon::LC0 || R == Hexagon::LC1 ||
+        R == Hexagon::SA0 || R == Hexagon::SA1)
+      return true;
+  }
+  return false;
+}
+
+
+/// \brief - Return true if the loop contains an instruction that inhibits
+/// the use of the hardware loop function.
+bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const {
+  const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    for (MachineBasicBlock::iterator
+           MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
+      const MachineInstr *MI = &*MII;
+      if (isInvalidLoopOperation(MI))
+        return true;
+    }
+  }
+  return false;
+}
+
+
+/// \brief Returns true if the instruction is dead.  This was essentially
+/// copied from DeadMachineInstructionElim::isDead, but with special cases
+/// for inline asm, physical registers and instructions with side effects
+/// removed.
+bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
+                              SmallVectorImpl<MachineInstr *> &DeadPhis) const {
+  // Examine each operand.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (MRI->use_nodbg_empty(Reg))
+      continue;
+
+    typedef MachineRegisterInfo::use_nodbg_iterator use_nodbg_iterator;
+
+    // This instruction has users, but if the only user is the phi node for the
+    // parent block, and the only use of that phi node is this instruction, then
+    // this instruction is dead: both it (and the phi node) can be removed.
+    use_nodbg_iterator I = MRI->use_nodbg_begin(Reg);
+    use_nodbg_iterator End = MRI->use_nodbg_end();
+    if (std::next(I) != End || !I->getParent()->isPHI())
+      return false;
+
+    MachineInstr *OnePhi = I->getParent();
+    for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
+      const MachineOperand &OPO = OnePhi->getOperand(j);
+      if (!OPO.isReg() || !OPO.isDef())
+        continue;
+
+      unsigned OPReg = OPO.getReg();
+      use_nodbg_iterator nextJ;
+      for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg);
+           J != End; J = nextJ) {
+        nextJ = std::next(J);
+        MachineOperand &Use = *J;
+        MachineInstr *UseMI = Use.getParent();
+
+        // If the phi node has a user that is not MI, bail...
+        if (MI != UseMI)
+          return false;
+      }
+    }
+    DeadPhis.push_back(OnePhi);
+  }
+
+  // If there are no defs with uses, the instruction is dead.
+  return true;
+}
+
+void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
+  // This procedure was essentially copied from DeadMachineInstructionElim.
+
+  SmallVector<MachineInstr*, 1> DeadPhis;
+  if (isDead(MI, DeadPhis)) {
+    DEBUG(dbgs() << "HW looping will remove: " << *MI);
+
+    // It is possible that some DBG_VALUE instructions refer to this
+    // instruction.  Examine each def operand for such references;
+    // if found, mark the DBG_VALUE as undef (but don't delete it).
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      MachineRegisterInfo::use_iterator nextI;
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+           E = MRI->use_end(); I != E; I = nextI) {
+        nextI = std::next(I);  // I is invalidated by the setReg
+        MachineOperand &Use = *I;
+        MachineInstr *UseMI = I->getParent();
+        if (UseMI == MI)
+          continue;
+        if (Use.isDebug())
+          UseMI->getOperand(0).setReg(0U);
+        // This may also be a "instr -> phi -> instr" case which can
+        // be removed too.
+      }
+    }
+
+    MI->eraseFromParent();
+    for (unsigned i = 0; i < DeadPhis.size(); ++i)
+      DeadPhis[i]->eraseFromParent();
+  }
+}
+
+/// \brief Check if the loop is a candidate for converting to a hardware
+/// loop.  If so, then perform the transformation.
+///
+/// This function works on innermost loops first.  A loop can be converted
+/// if it is a counting loop; either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation of the loop
+/// in llvm.
+bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
+  // This is just for sanity.
+  assert(L->getHeader() && "Loop without a header?");
+
+  bool Changed = false;
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    Changed |= convertToHardwareLoop(*I);
+
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (Changed)
+    return Changed;
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = HWLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= HWLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L))
+    return false;
+
+  // Is the induction variable bump feeding the latch condition?
+  if (!fixupInductionVariable(L))
+    return false;
+
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate hw loop if the loop has more than one exit.
+  if (!LastMBB)
+    return false;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI == LastMBB->end())
+    return false;
+
+  // Ensure the loop has a preheader: the loop instruction will be
+  // placed there.
+  bool NewPreheader = false;
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    Preheader = createPreheaderForLoop(L);
+    if (!Preheader)
+      return false;
+    NewPreheader = true;
+  }
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  SmallVector<MachineInstr*, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  CountValue *TripCount = getLoopTripCount(L, OldInsts);
+  if (!TripCount)
+    return false;
+
+  // Is the trip count available in the preheader?
+  if (TripCount->isReg()) {
+    // There will be a use of the register inserted into the preheader,
+    // so make sure that the register is actually defined at that point.
+    MachineInstr *TCDef = MRI->getVRegDef(TripCount->getReg());
+    MachineBasicBlock *BBDef = TCDef->getParent();
+    if (!NewPreheader) {
+      if (!MDT->dominates(BBDef, Preheader))
+        return false;
+    } else {
+      // If we have just created a preheader, the dominator tree won't be
+      // aware of it.  Check if the definition of the register dominates
+      // the header, but is not the header itself.
+      if (!MDT->properlyDominates(BBDef, L->getHeader()))
+        return false;
+    }
+  }
+
+  // Determine the loop start.
+  MachineBasicBlock *LoopStart = L->getTopBlock();
+  if (L->getLoopLatch() != LastMBB) {
+    // When the exit and latch are not the same, use the latch block as the
+    // start.
+    // The loop start address is used only after the 1st iteration, and the
+    // loop latch may contains instrs. that need to be executed after the
+    // first iteration.
+    LoopStart = L->getLoopLatch();
+    // Make sure the latch is a successor of the exit, otherwise it won't work.
+    if (!LastMBB->isSuccessor(LoopStart))
+      return false;
+  }
+
+  // Convert the loop to a hardware loop.
+  DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
+  DebugLoc DL;
+  if (InsertPos != Preheader->end())
+    DL = InsertPos->getDebugLoc();
+
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
+    unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+    BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg)
+      .addReg(TripCount->getReg(), 0, TripCount->getSubReg());
+    // Add the Loop instruction to the beginning of the loop.
+    BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r))
+      .addMBB(LoopStart)
+      .addReg(CountReg);
+  } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+    // Add the Loop immediate instruction to the beginning of the loop,
+    // if the immediate fits in the instructions.  Otherwise, we need to
+    // create a new virtual register.
+    int64_t CountImm = TripCount->getImm();
+    if (!TII->isValidOffset(Hexagon::LOOP0_i, CountImm)) {
+      unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::TFRI), CountReg)
+        .addImm(CountImm);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r))
+        .addMBB(LoopStart).addReg(CountReg);
+    } else
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_i))
+        .addMBB(LoopStart).addImm(CountImm);
+  }
+
+  // Make sure the loop start always has a reference in the CFG.  We need
+  // to create a BlockAddress operand to get this mechanism to work both the
+  // MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object.
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with an endloop instruction.
+  DebugLoc LastIDL = LastI->getDebugLoc();
+  BuildMI(*LastMBB, LastI, LastIDL,
+          TII->get(Hexagon::ENDLOOP0)).addMBB(LoopStart);
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  if (LastI->getOpcode() == Hexagon::JMP_t ||
+      LastI->getOpcode() == Hexagon::JMP_f) {
+    // Delete one and change/add an uncond. branch to out of the loop.
+    MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB();
+    LastI = LastMBB->erase(LastI);
+    if (!L->contains(BranchTarget)) {
+      if (LastI != LastMBB->end())
+        LastI = LastMBB->erase(LastI);
+      SmallVector<MachineOperand, 0> Cond;
+      TII->InsertBranch(*LastMBB, BranchTarget, nullptr, Cond, LastIDL);
+    }
+  } else {
+    // Conditional branch to loop start; just delete it.
+    LastMBB->erase(LastI);
+  }
+  delete TripCount;
+
+  // The induction operation and the comparison may now be
+  // unneeded. If these are unneeded, then remove them.
+  for (unsigned i = 0; i < OldInsts.size(); ++i)
+    removeIfDead(OldInsts[i]);
+
+  ++NumHWLoops;
+  return true;
+}
+
+
+bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
+                                            MachineInstr *CmpI) {
+  assert (BumpI != CmpI && "Bump and compare in the same instruction?");
+
+  MachineBasicBlock *BB = BumpI->getParent();
+  if (CmpI->getParent() != BB)
+    return false;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  // Check if things are in order to begin with.
+  for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I)
+    if (&*I == CmpI)
+      return true;
+
+  // Out of order.
+  unsigned PredR = CmpI->getOperand(0).getReg();
+  bool FoundBump = false;
+  instr_iterator CmpIt = CmpI, NextIt = std::next(CmpIt);
+  for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) {
+    MachineInstr *In = &*I;
+    for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) {
+      MachineOperand &MO = In->getOperand(i);
+      if (MO.isReg() && MO.isUse()) {
+        if (MO.getReg() == PredR)  // Found an intervening use of PredR.
+          return false;
+      }
+    }
+
+    if (In == BumpI) {
+      instr_iterator After = BumpI;
+      instr_iterator From = CmpI;
+      BB->splice(std::next(After), BB, From);
+      FoundBump = true;
+      break;
+    }
+  }
+  assert (FoundBump && "Cannot determine instruction order");
+  return FoundBump;
+}
+
+
+MachineInstr *HexagonHardwareLoops::defWithImmediate(unsigned R) {
+  MachineInstr *DI = MRI->getVRegDef(R);
+  unsigned DOpc = DI->getOpcode();
+  switch (DOpc) {
+    case Hexagon::TFRI:
+    case Hexagon::TFRI64:
+    case Hexagon::CONST32_Int_Real:
+    case Hexagon::CONST64_Int_Real:
+      return DI;
+  }
+  return nullptr;
+}
+
+
+int64_t HexagonHardwareLoops::getImmediate(MachineOperand &MO) {
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isReg());
+  unsigned R = MO.getReg();
+  MachineInstr *DI = defWithImmediate(R);
+  assert(DI && "Need an immediate operand");
+  // All currently supported "define-with-immediate" instructions have the
+  // actual immediate value in the operand(1).
+  int64_t v = DI->getOperand(1).getImm();
+  return v;
+}
+
+
+void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) {
+  if (MO.isImm()) {
+    MO.setImm(Val);
+    return;
+  }
+
+  assert(MO.isReg());
+  unsigned R = MO.getReg();
+  MachineInstr *DI = defWithImmediate(R);
+  if (MRI->hasOneNonDBGUse(R)) {
+    // If R has only one use, then just change its defining instruction to
+    // the new immediate value.
+    DI->getOperand(1).setImm(Val);
+    return;
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  unsigned NewR = MRI->createVirtualRegister(RC);
+  MachineBasicBlock &B = *DI->getParent();
+  DebugLoc DL = DI->getDebugLoc();
+  BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR)
+    .addImm(Val);
+  MO.setReg(NewR);
+}
+
+
+bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+
+  if (!Header || !Preheader || !Latch)
+    return false;
+
+  // These data structures follow the same concept as the corresponding
+  // ones in findInductionRegister (where some comments are).
+  typedef std::pair<unsigned,int64_t> RegisterBump;
+  typedef std::pair<unsigned,RegisterBump> RegisterInduction;
+  typedef std::set<RegisterInduction> RegisterInductionSet;
+
+  // Register candidates for induction variables, with their associated bumps.
+  RegisterInductionSet IndRegs;
+
+  // Look for induction patterns:
+  //   vreg1 = PHI ..., [ latch, vreg2 ]
+  //   vreg2 = ADD vreg1, imm
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *Phi = &*I;
+
+    // Have a PHI instruction.
+    for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+      if (Phi->getOperand(i+1).getMBB() != Latch)
+        continue;
+
+      unsigned PhiReg = Phi->getOperand(i).getReg();
+      MachineInstr *DI = MRI->getVRegDef(PhiReg);
+      unsigned UpdOpc = DI->getOpcode();
+      bool isAdd = (UpdOpc == Hexagon::ADD_ri);
+
+      if (isAdd) {
+        // If the register operand to the add/sub is the PHI we are looking
+        // at, this meets the induction pattern.
+        unsigned IndReg = DI->getOperand(1).getReg();
+        if (MRI->getVRegDef(IndReg) == Phi) {
+          unsigned UpdReg = DI->getOperand(0).getReg();
+          int64_t V = DI->getOperand(2).getImm();
+          IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+        }
+      }
+    }  // for (i)
+  }  // for (instr)
+
+  if (IndRegs.empty())
+    return false;
+
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  SmallVector<MachineOperand,2> Cond;
+  // AnalyzeBranch returns true if it fails to analyze branch.
+  bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return false;
+
+  // Check if the latch branch is unconditional.
+  if (Cond.empty())
+    return false;
+
+  if (TB != Header && FB != Header)
+    // The latch does not go back to the header.  Not a latch we know and love.
+    return false;
+
+  // Expecting a predicate register as a condition.  It won't be a hardware
+  // predicate register at this point yet, just a vreg.
+  // HexagonInstrInfo::AnalyzeBranch for negated branches inserts imm(0)
+  // into Cond, followed by the predicate register.  For non-negated branches
+  // it's just the register.
+  unsigned CSz = Cond.size();
+  if (CSz != 1 && CSz != 2)
+    return false;
+
+  unsigned P = Cond[CSz-1].getReg();
+  MachineInstr *PredDef = MRI->getVRegDef(P);
+
+  if (!PredDef->isCompare())
+    return false;
+
+  SmallSet<unsigned,2> CmpRegs;
+  MachineOperand *CmpImmOp = nullptr;
+
+  // Go over all operands to the compare and look for immediate and register
+  // operands.  Assume that if the compare has a single register use and a
+  // single immediate operand, then the register is being compared with the
+  // immediate value.
+  for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+    MachineOperand &MO = PredDef->getOperand(i);
+    if (MO.isReg()) {
+      // Skip all implicit references.  In one case there was:
+      //   %vreg140<def> = FCMPUGT32_rr %vreg138, %vreg139, %USR<imp-use>
+      if (MO.isImplicit())
+        continue;
+      if (MO.isUse()) {
+        unsigned R = MO.getReg();
+        if (!defWithImmediate(R)) {
+          CmpRegs.insert(MO.getReg());
+          continue;
+        }
+        // Consider the register to be the "immediate" operand.
+        if (CmpImmOp)
+          return false;
+        CmpImmOp = &MO;
+      }
+    } else if (MO.isImm()) {
+      if (CmpImmOp)    // A second immediate argument?  Confusing.  Bail out.
+        return false;
+      CmpImmOp = &MO;
+    }
+  }
+
+  if (CmpRegs.empty())
+    return false;
+
+  // Check if the compared register follows the order we want.  Fix if needed.
+  for (RegisterInductionSet::iterator I = IndRegs.begin(), E = IndRegs.end();
+       I != E; ++I) {
+    // This is a success.  If the register used in the comparison is one that
+    // we have identified as a bumped (updated) induction register, there is
+    // nothing to do.
+    if (CmpRegs.count(I->first))
+      return true;
+
+    // Otherwise, if the register being compared comes out of a PHI node,
+    // and has been recognized as following the induction pattern, and is
+    // compared against an immediate, we can fix it.
+    const RegisterBump &RB = I->second;
+    if (CmpRegs.count(RB.first)) {
+      if (!CmpImmOp)
+        return false;
+
+      int64_t CmpImm = getImmediate(*CmpImmOp);
+      int64_t V = RB.second;
+      if (V > 0 && CmpImm+V < CmpImm)  // Overflow (64-bit).
+        return false;
+      if (V < 0 && CmpImm+V > CmpImm)  // Overflow (64-bit).
+        return false;
+      CmpImm += V;
+      // Some forms of cmp-immediate allow u9 and s10.  Assume the worst case
+      // scenario, i.e. an 8-bit value.
+      if (CmpImmOp->isImm() && !isInt<8>(CmpImm))
+        return false;
+
+      // Make sure that the compare happens after the bump.  Otherwise,
+      // after the fixup, the compare would use a yet-undefined register.
+      MachineInstr *BumpI = MRI->getVRegDef(I->first);
+      bool Order = orderBumpCompare(BumpI, PredDef);
+      if (!Order)
+        return false;
+
+      // Finally, fix the compare instruction.
+      setImmediate(*CmpImmOp, CmpImm);
+      for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+        MachineOperand &MO = PredDef->getOperand(i);
+        if (MO.isReg() && MO.getReg() == RB.first) {
+          MO.setReg(I->first);
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+
+/// \brief Create a preheader for a given loop.
+MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
+      MachineLoop *L) {
+  if (MachineBasicBlock *TmpPH = L->getLoopPreheader())
+    return TmpPH;
+
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  MachineFunction *MF = Header->getParent();
+  DebugLoc DL;
+
+  if (!Latch || Header->hasAddressTaken())
+    return nullptr;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+
+  // Verify that all existing predecessors have analyzable branches
+  // (or no branches at all).
+  typedef std::vector<MachineBasicBlock*> MBBVector;
+  MBBVector Preds(Header->pred_begin(), Header->pred_end());
+  SmallVector<MachineOperand,2> Tmp1;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+
+  if (TII->AnalyzeBranch(*Latch, TB, FB, Tmp1, false))
+    return nullptr;
+
+  for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+    MachineBasicBlock *PB = *I;
+    if (PB != Latch) {
+      bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false);
+      if (NotAnalyzed)
+        return nullptr;
+    }
+  }
+
+  MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock();
+  MF->insert(Header, NewPH);
+
+  if (Header->pred_size() > 2) {
+    // Ensure that the header has only two predecessors: the preheader and
+    // the loop latch.  Any additional predecessors of the header should
+    // join at the newly created preheader.  Inspect all PHI nodes from the
+    // header and create appropriate corresponding PHI nodes in the preheader.
+
+    for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+         I != E && I->isPHI(); ++I) {
+      MachineInstr *PN = &*I;
+
+      const MCInstrDesc &PD = TII->get(TargetOpcode::PHI);
+      MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL);
+      NewPH->insert(NewPH->end(), NewPN);
+
+      unsigned PR = PN->getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(PR);
+      unsigned NewPR = MRI->createVirtualRegister(RC);
+      NewPN->addOperand(MachineOperand::CreateReg(NewPR, true));
+
+      // Copy all non-latch operands of a header's PHI node to the newly
+      // created PHI node in the preheader.
+      for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+        unsigned PredR = PN->getOperand(i).getReg();
+        MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+        if (PredB == Latch)
+          continue;
+
+        NewPN->addOperand(MachineOperand::CreateReg(PredR, false));
+        NewPN->addOperand(MachineOperand::CreateMBB(PredB));
+      }
+
+      // Remove copied operands from the old PHI node and add the value
+      // coming from the preheader's PHI.
+      for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+        MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+        if (PredB != Latch) {
+          PN->RemoveOperand(i+1);
+          PN->RemoveOperand(i);
+        }
+      }
+      PN->addOperand(MachineOperand::CreateReg(NewPR, false));
+      PN->addOperand(MachineOperand::CreateMBB(NewPH));
+    }
+
+  } else {
+    assert(Header->pred_size() == 2);
+
+    // The header has only two predecessors, but the non-latch predecessor
+    // is not a preheader (e.g. it has other successors, etc.)
+    // In such a case we don't need any extra PHI nodes in the new preheader,
+    // all we need is to adjust existing PHIs in the header to now refer to
+    // the new preheader.
+    for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+         I != E && I->isPHI(); ++I) {
+      MachineInstr *PN = &*I;
+      for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+        MachineOperand &MO = PN->getOperand(i+1);
+        if (MO.getMBB() != Latch)
+          MO.setMBB(NewPH);
+      }
+    }
+  }
+
+  // "Reroute" the CFG edges to link in the new preheader.
+  // If any of the predecessors falls through to the header, insert a branch
+  // to the new preheader in that place.
+  SmallVector<MachineOperand,1> Tmp2;
+  SmallVector<MachineOperand,1> EmptyCond;
+
+  TB = FB = nullptr;
+
+  for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+    MachineBasicBlock *PB = *I;
+    if (PB != Latch) {
+      Tmp2.clear();
+      bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp2, false);
+      (void)NotAnalyzed; // suppress compiler warning
+      assert (!NotAnalyzed && "Should be analyzable!");
+      if (TB != Header && (Tmp2.empty() || FB != Header))
+        TII->InsertBranch(*PB, NewPH, nullptr, EmptyCond, DL);
+      PB->ReplaceUsesOfBlockWith(Header, NewPH);
+    }
+  }
+
+  // It can happen that the latch block will fall through into the header.
+  // Insert an unconditional branch to the header.
+  TB = FB = nullptr;
+  bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false);
+  (void)LatchNotAnalyzed; // suppress compiler warning
+  assert (!LatchNotAnalyzed && "Should be analyzable!");
+  if (!TB && !FB)
+    TII->InsertBranch(*Latch, Header, nullptr, EmptyCond, DL);
+
+  // Finally, the branch from the preheader to the header.
+  TII->InsertBranch(*NewPH, Header, nullptr, EmptyCond, DL);
+  NewPH->addSuccessor(Header);
+
+  return NewPH;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
new file mode 100644
index 0000000..dabe650
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -0,0 +1,1685 @@
+//===-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-isel"
+
+static
+cl::opt<unsigned>
+MaxNumOfUsesForConstExtenders("ga-max-num-uses-for-constant-extenders",
+  cl::Hidden, cl::init(2),
+  cl::desc("Maximum number of uses of a global address such that we still us a"
+           "constant extended instruction"));
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+  void initializeHexagonDAGToDAGISelPass(PassRegistry&);
+}
+
+//===--------------------------------------------------------------------===//
+/// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class HexagonDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the Hexagon Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const HexagonSubtarget &Subtarget;
+
+  // Keep a reference to HexagonTargetMachine.
+  const HexagonTargetMachine& TM;
+  DenseMap<const GlobalValue *, unsigned> GlobalAddressUseCountMap;
+public:
+  explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine,
+                               CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(targetmachine, OptLevel),
+      Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()),
+      TM(targetmachine) {
+    initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
+  }
+  bool hasNumUsesBelowThresGA(SDNode *N) const;
+
+  SDNode *Select(SDNode *N) override;
+
+  // Complex Pattern Selectors.
+  inline bool foldGlobalAddress(SDValue &N, SDValue &R);
+  inline bool foldGlobalAddressGP(SDValue &N, SDValue &R);
+  bool foldGlobalAddressImpl(SDValue &N, SDValue &R, bool ShouldLookForGP);
+  bool SelectADDRri(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriS11_0(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriS11_1(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriS11_2(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectMEMriS11_2(SDValue& Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRriS11_3(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRrr(SDValue &Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRriU6_0(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2);
+
+  const char *getPassName() const override {
+    return "Hexagon DAG->DAG Pattern Instruction Selection";
+  }
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+  bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset);
+
+  SDNode *SelectLoad(SDNode *N);
+  SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl);
+  SDNode *SelectIndexedLoad(LoadSDNode *LD, SDLoc dl);
+  SDNode *SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode,
+                                        SDLoc dl);
+  SDNode *SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode,
+                                        SDLoc dl);
+  SDNode *SelectBaseOffsetStore(StoreSDNode *ST, SDLoc dl);
+  SDNode *SelectIndexedStore(StoreSDNode *ST, SDLoc dl);
+  SDNode *SelectStore(SDNode *N);
+  SDNode *SelectSHL(SDNode *N);
+  SDNode *SelectSelect(SDNode *N);
+  SDNode *SelectTruncate(SDNode *N);
+  SDNode *SelectMul(SDNode *N);
+  SDNode *SelectZeroExtend(SDNode *N);
+  SDNode *SelectIntrinsicWOChain(SDNode *N);
+  SDNode *SelectIntrinsicWChain(SDNode *N);
+  SDNode *SelectConstant(SDNode *N);
+  SDNode *SelectConstantFP(SDNode *N);
+  SDNode *SelectAdd(SDNode *N);
+  bool isConstExtProfitable(SDNode *N) const;
+
+// XformMskToBitPosU5Imm - Returns the bit position which
+// the single bit 32 bit mask represents.
+// Used in Clr and Set bit immediate memops.
+SDValue XformMskToBitPosU5Imm(uint32_t Imm) {
+  int32_t bitPos;
+  bitPos = Log2_32(Imm);
+  assert(bitPos >= 0 && bitPos < 32 &&
+         "Constant out of range for 32 BitPos Memops");
+  return CurDAG->getTargetConstant(bitPos, MVT::i32);
+}
+
+// XformMskToBitPosU4Imm - Returns the bit position which the single bit 16 bit
+// mask represents. Used in Clr and Set bit immediate memops.
+SDValue XformMskToBitPosU4Imm(uint16_t Imm) {
+  return XformMskToBitPosU5Imm(Imm);
+}
+
+// XformMskToBitPosU3Imm - Returns the bit position which the single bit 8 bit
+// mask represents. Used in Clr and Set bit immediate memops.
+SDValue XformMskToBitPosU3Imm(uint8_t Imm) {
+  return XformMskToBitPosU5Imm(Imm);
+}
+
+// Return true if there is exactly one bit set in V, i.e., if V is one of the
+// following integers: 2^0, 2^1, ..., 2^31.
+bool ImmIsSingleBit(uint32_t v) const {
+  uint32_t c = CountPopulation_64(v);
+  // Only return true if we counted 1 bit.
+  return c == 1;
+}
+
+// XformM5ToU5Imm - Return a target constant with the specified value, of type
+// i32 where the negative literal is transformed into a positive literal for
+// use in -= memops.
+inline SDValue XformM5ToU5Imm(signed Imm) {
+   assert( (Imm >= -31 && Imm <= -1)  && "Constant out of range for Memops");
+   return CurDAG->getTargetConstant( - Imm, MVT::i32);
+}
+
+
+// XformU7ToU7M1Imm - Return a target constant decremented by 1, in range
+// [1..128], used in cmpb.gtu instructions.
+inline SDValue XformU7ToU7M1Imm(signed Imm) {
+  assert((Imm >= 1 && Imm <= 128) && "Constant out of range for cmpb op");
+  return CurDAG->getTargetConstant(Imm - 1, MVT::i8);
+}
+
+// XformS8ToS8M1Imm - Return a target constant decremented by 1.
+inline SDValue XformSToSM1Imm(signed Imm) {
+  return CurDAG->getTargetConstant(Imm - 1, MVT::i32);
+}
+
+// XformU8ToU8M1Imm - Return a target constant decremented by 1.
+inline SDValue XformUToUM1Imm(unsigned Imm) {
+  assert((Imm >= 1) && "Cannot decrement unsigned int less than 1");
+  return CurDAG->getTargetConstant(Imm - 1, MVT::i32);
+}
+
+// Include the pieces autogenerated from the target description.
+#include "HexagonGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+
+/// createHexagonISelDag - This pass converts a legalized DAG into a
+/// Hexagon-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new HexagonDAGToDAGISel(TM, OptLevel);
+}
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection";
+  PassInfo *PI = new PassInfo(Name, "hexagon-isel",
+                              &SelectionDAGISel::ID, nullptr, false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+
+static bool IsS11_0_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<11>(v);
+}
+
+
+static bool IsS11_1_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,1>(v);
+}
+
+
+static bool IsS11_2_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,2>(v);
+}
+
+
+static bool IsS11_3_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,3>(v);
+}
+
+
+static bool IsU6_0_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // u6 predicate - True if the immediate fits in a 6-bit unsigned extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}
+
+
+static bool IsU6_1_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // u6 predicate - True if the immediate fits in a 6-bit unsigned extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,1>(v);
+}
+
+
+static bool IsU6_2_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // u6 predicate - True if the immediate fits in a 6-bit unsigned extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,2>(v);
+}
+
+
+// Intrinsics that return a a predicate.
+static unsigned doesIntrinsicReturnPredicate(unsigned ID)
+{
+  switch (ID) {
+    default:
+      return 0;
+    case Intrinsic::hexagon_C2_cmpeq:
+    case Intrinsic::hexagon_C2_cmpgt:
+    case Intrinsic::hexagon_C2_cmpgtu:
+    case Intrinsic::hexagon_C2_cmpgtup:
+    case Intrinsic::hexagon_C2_cmpgtp:
+    case Intrinsic::hexagon_C2_cmpeqp:
+    case Intrinsic::hexagon_C2_bitsset:
+    case Intrinsic::hexagon_C2_bitsclr:
+    case Intrinsic::hexagon_C2_cmpeqi:
+    case Intrinsic::hexagon_C2_cmpgti:
+    case Intrinsic::hexagon_C2_cmpgtui:
+    case Intrinsic::hexagon_C2_cmpgei:
+    case Intrinsic::hexagon_C2_cmpgeui:
+    case Intrinsic::hexagon_C2_cmplt:
+    case Intrinsic::hexagon_C2_cmpltu:
+    case Intrinsic::hexagon_C2_bitsclri:
+    case Intrinsic::hexagon_C2_and:
+    case Intrinsic::hexagon_C2_or:
+    case Intrinsic::hexagon_C2_xor:
+    case Intrinsic::hexagon_C2_andn:
+    case Intrinsic::hexagon_C2_not:
+    case Intrinsic::hexagon_C2_orn:
+    case Intrinsic::hexagon_C2_pxfer_map:
+    case Intrinsic::hexagon_C2_any8:
+    case Intrinsic::hexagon_C2_all8:
+    case Intrinsic::hexagon_A2_vcmpbeq:
+    case Intrinsic::hexagon_A2_vcmpbgtu:
+    case Intrinsic::hexagon_A2_vcmpheq:
+    case Intrinsic::hexagon_A2_vcmphgt:
+    case Intrinsic::hexagon_A2_vcmphgtu:
+    case Intrinsic::hexagon_A2_vcmpweq:
+    case Intrinsic::hexagon_A2_vcmpwgt:
+    case Intrinsic::hexagon_A2_vcmpwgtu:
+    case Intrinsic::hexagon_C2_tfrrp:
+    case Intrinsic::hexagon_S2_tstbit_i:
+    case Intrinsic::hexagon_S2_tstbit_r:
+      return 1;
+  }
+}
+
+
+// Intrinsics that have predicate operands.
+static unsigned doesIntrinsicContainPredicate(unsigned ID)
+{
+  switch (ID) {
+    default:
+      return 0;
+    case Intrinsic::hexagon_C2_tfrpr:
+      return Hexagon::TFR_RsPd;
+    case Intrinsic::hexagon_C2_and:
+      return Hexagon::AND_pp;
+    case Intrinsic::hexagon_C2_xor:
+      return Hexagon::XOR_pp;
+    case Intrinsic::hexagon_C2_or:
+      return Hexagon::OR_pp;
+    case Intrinsic::hexagon_C2_not:
+      return Hexagon::NOT_p;
+    case Intrinsic::hexagon_C2_any8:
+      return Hexagon::ANY_pp;
+    case Intrinsic::hexagon_C2_all8:
+      return Hexagon::ALL_pp;
+    case Intrinsic::hexagon_C2_vitpack:
+      return Hexagon::VITPACK_pp;
+    case Intrinsic::hexagon_C2_mask:
+      return Hexagon::MASK_p;
+    case Intrinsic::hexagon_C2_mux:
+      return Hexagon::MUX_rr;
+
+      // Mapping hexagon_C2_muxir to MUX_pri.  This is pretty weird - but
+      // that's how it's mapped in q6protos.h.
+    case Intrinsic::hexagon_C2_muxir:
+      return Hexagon::MUX_ri;
+
+      // Mapping hexagon_C2_muxri to MUX_pir.  This is pretty weird - but
+      // that's how it's mapped in q6protos.h.
+    case Intrinsic::hexagon_C2_muxri:
+      return Hexagon::MUX_ir;
+
+    case Intrinsic::hexagon_C2_muxii:
+      return Hexagon::MUX_ii;
+    case Intrinsic::hexagon_C2_vmux:
+      return Hexagon::VMUX_prr64;
+    case Intrinsic::hexagon_S2_valignrb:
+      return Hexagon::VALIGN_rrp;
+    case Intrinsic::hexagon_S2_vsplicerb:
+      return Hexagon::VSPLICE_rrp;
+  }
+}
+
+
+static bool OffsetFitsS11(EVT MemType, int64_t Offset) {
+  if (MemType == MVT::i64 && isShiftedInt<11,3>(Offset)) {
+    return true;
+  }
+  if (MemType == MVT::i32 && isShiftedInt<11,2>(Offset)) {
+    return true;
+  }
+  if (MemType == MVT::i16 && isShiftedInt<11,1>(Offset)) {
+    return true;
+  }
+  if (MemType == MVT::i8 && isInt<11>(Offset)) {
+    return true;
+  }
+  return false;
+}
+
+
+//
+// Try to lower loads of GlobalAdresses into base+offset loads.  Custom
+// lowering for GlobalAddress nodes has already turned it into a
+// CONST32.
+//
+SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl) {
+  SDValue Chain = LD->getChain();
+  SDNode* Const32 = LD->getBasePtr().getNode();
+  unsigned Opcode = 0;
+
+  if (Const32->getOpcode() == HexagonISD::CONST32 &&
+      ISD::isNormalLoad(LD)) {
+    SDValue Base = Const32->getOperand(0);
+    EVT LoadedVT = LD->getMemoryVT();
+    int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
+    if (Offset != 0 && OffsetFitsS11(LoadedVT, Offset)) {
+      MVT PointerTy = getTargetLowering()->getPointerTy();
+      const GlobalValue* GV =
+        cast<GlobalAddressSDNode>(Base)->getGlobal();
+      SDValue TargAddr =
+        CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0);
+      SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set,
+                                               dl, PointerTy,
+                                               TargAddr);
+      // Figure out base + offset opcode
+      if (LoadedVT == MVT::i64) Opcode = Hexagon::LDrid_indexed;
+      else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed;
+      else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed;
+      else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed;
+      else llvm_unreachable("unknown memory type");
+
+      // Build indexed load.
+      SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy);
+      SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
+                                              LD->getValueType(0),
+                                              MVT::Other,
+                                              SDValue(NewBase,0),
+                                              TargetConstOff,
+                                              Chain);
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LD->getMemOperand();
+      cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+      ReplaceUses(LD, Result);
+      return Result;
+    }
+  }
+
+  return SelectCode(LD);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
+                                                           unsigned Opcode,
+                                                           SDLoc dl)
+{
+  SDValue Chain = LD->getChain();
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  SDNode *OffsetNode = Offset.getNode();
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  SDValue N1 = LD->getOperand(1);
+  SDValue CPTmpN1_0;
+  SDValue CPTmpN1_1;
+
+  if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
+      N1.getNode()->getValueType(0) == MVT::i32) {
+    const HexagonInstrInfo *TII =
+      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+    if (TII->isValidAutoIncImm(LoadedVT, Val)) {
+      SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32);
+      SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
+                                                MVT::Other, Base, TargetConst,
+                                                Chain);
+      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl, MVT::i64,
+                                                SDValue(Result_1, 0));
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LD->getMemOperand();
+      cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+      const SDValue Froms[] = { SDValue(LD, 0),
+                                SDValue(LD, 1),
+                                SDValue(LD, 2)
+      };
+      const SDValue Tos[]   = { SDValue(Result_2, 0),
+                                SDValue(Result_1, 1),
+                                SDValue(Result_1, 2)
+      };
+      ReplaceUses(Froms, Tos, 3);
+      return Result_2;
+    }
+    SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                              MVT::Other, Base, TargetConst0,
+                                              Chain);
+    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl,
+                                                MVT::i64, SDValue(Result_1, 0));
+    SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl,
+                                              MVT::i32, Base, TargetConstVal,
+                                                SDValue(Result_1, 1));
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result_2, 0),
+                              SDValue(Result_3, 0),
+                              SDValue(Result_1, 1)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result_2;
+  }
+  return SelectCode(LD);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
+                                                           unsigned Opcode,
+                                                           SDLoc dl)
+{
+  SDValue Chain = LD->getChain();
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  SDNode *OffsetNode = Offset.getNode();
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  SDValue N1 = LD->getOperand(1);
+  SDValue CPTmpN1_0;
+  SDValue CPTmpN1_1;
+
+  if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
+      N1.getNode()->getValueType(0) == MVT::i32) {
+    const HexagonInstrInfo *TII =
+      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+    if (TII->isValidAutoIncImm(LoadedVT, Val)) {
+      SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+      SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                                MVT::i32, MVT::Other, Base,
+                                                TargetConstVal, Chain);
+      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+                                                TargetConst0);
+      SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+                                                MVT::i64, MVT::Other,
+                                                SDValue(Result_2,0),
+                                                SDValue(Result_1,0));
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LD->getMemOperand();
+      cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+      const SDValue Froms[] = { SDValue(LD, 0),
+                                SDValue(LD, 1),
+                                SDValue(LD, 2)
+      };
+      const SDValue Tos[]   = { SDValue(Result_3, 0),
+                                SDValue(Result_1, 1),
+                                SDValue(Result_1, 2)
+      };
+      ReplaceUses(Froms, Tos, 3);
+      return Result_3;
+    }
+
+    // Generate an indirect load.
+    SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                              MVT::Other,
+                                              Base, TargetConst0, Chain);
+    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+                                              TargetConst0);
+    SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+                                              MVT::i64, MVT::Other,
+                                              SDValue(Result_2,0),
+                                              SDValue(Result_1,0));
+    // Add offset to base.
+    SDNode* Result_4 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+                                              Base, TargetConstVal,
+                                              SDValue(Result_1, 1));
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result_3, 0), // Load value.
+                              SDValue(Result_4, 0), // New address.
+                              SDValue(Result_1, 1)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result_3;
+  }
+
+  return SelectCode(LD);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  SDNode *OffsetNode = Offset.getNode();
+  // Get the constant value.
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  EVT LoadedVT = LD->getMemoryVT();
+  unsigned Opcode = 0;
+
+  // Check for zero ext loads.
+  bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD);
+
+  // Figure out the opcode.
+  const HexagonInstrInfo *TII =
+    static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+  if (LoadedVT == MVT::i64) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = Hexagon::POST_LDrid;
+    else
+      Opcode = Hexagon::LDrid;
+  } else if (LoadedVT == MVT::i32) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = Hexagon::POST_LDriw;
+    else
+      Opcode = Hexagon::LDriw;
+  } else if (LoadedVT == MVT::i16) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = zextval ? Hexagon::POST_LDriuh : Hexagon::POST_LDrih;
+    else
+      Opcode = zextval ? Hexagon::LDriuh : Hexagon::LDrih;
+  } else if (LoadedVT == MVT::i8) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = zextval ? Hexagon::POST_LDriub : Hexagon::POST_LDrib;
+    else
+      Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib;
+  } else
+    llvm_unreachable("unknown memory type");
+
+  // For zero ext i64 loads, we need to add combine instructions.
+  if (LD->getValueType(0) == MVT::i64 &&
+      LD->getExtensionType() == ISD::ZEXTLOAD) {
+    return SelectIndexedLoadZeroExtend64(LD, Opcode, dl);
+  }
+  if (LD->getValueType(0) == MVT::i64 &&
+             LD->getExtensionType() == ISD::SEXTLOAD) {
+    // Handle sign ext i64 loads.
+    return SelectIndexedLoadSignExtend64(LD, Opcode, dl);
+  }
+  if (TII->isValidAutoIncImm(LoadedVT, Val)) {
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
+                                            LD->getValueType(0),
+                                            MVT::i32, MVT::Other, Base,
+                                            TargetConstVal, Chain);
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result, 0),
+                              SDValue(Result, 1),
+                              SDValue(Result, 2)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result;
+  } else {
+    SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl,
+                                              LD->getValueType(0),
+                                              MVT::Other, Base, TargetConst0,
+                                              Chain);
+    SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+                                              Base, TargetConstVal,
+                                              SDValue(Result_1, 1));
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result_1, 0),
+                              SDValue(Result_2, 0),
+                              SDValue(Result_1, 1)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result_1;
+  }
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
+  SDNode *result;
+  SDLoc dl(N);
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+
+  // Handle indexed loads.
+  if (AM != ISD::UNINDEXED) {
+    result = SelectIndexedLoad(LD, dl);
+  } else {
+    result = SelectBaseOffsetLoad(LD, dl);
+  }
+
+  return result;
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
+  SDValue Chain = ST->getChain();
+  SDValue Base = ST->getBasePtr();
+  SDValue Offset = ST->getOffset();
+  SDValue Value = ST->getValue();
+  SDNode *OffsetNode = Offset.getNode();
+  // Get the constant value.
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  EVT StoredVT = ST->getMemoryVT();
+
+  // Offset value must be within representable range
+  // and must have correct alignment properties.
+  const HexagonInstrInfo *TII =
+    static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+  if (TII->isValidAutoIncImm(StoredVT, Val)) {
+    SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
+                     Chain};
+    unsigned Opcode = 0;
+
+    // Figure out the post inc version of opcode.
+    if (StoredVT == MVT::i64) Opcode = Hexagon::POST_STdri;
+    else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri;
+    else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri;
+    else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri;
+    else llvm_unreachable("unknown memory type");
+
+    // Build post increment store.
+    SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                            MVT::Other, Ops);
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = ST->getMemOperand();
+    cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+
+    ReplaceUses(ST, Result);
+    ReplaceUses(SDValue(ST,1), SDValue(Result,1));
+    return Result;
+  }
+
+  // Note: Order of operands matches the def of instruction:
+  // def STrid : STInst<(outs), (ins MEMri:$addr, DoubleRegs:$src1), ...
+  // and it differs for POST_ST* for instance.
+  SDValue Ops[] = { Base, CurDAG->getTargetConstant(0, MVT::i32), Value,
+                    Chain};
+  unsigned Opcode = 0;
+
+  // Figure out the opcode.
+  if (StoredVT == MVT::i64) Opcode = Hexagon::STrid;
+  else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
+  else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih;
+  else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib;
+  else llvm_unreachable("unknown memory type");
+
+  // Build regular store.
+  SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+  SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  // Build splitted incriment instruction.
+  SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+                                            Base,
+                                            TargetConstVal,
+                                            SDValue(Result_1, 0));
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = ST->getMemOperand();
+  cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(ST,0), SDValue(Result_2,0));
+  ReplaceUses(SDValue(ST,1), SDValue(Result_1,0));
+  return Result_2;
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
+                                                   SDLoc dl) {
+  SDValue Chain = ST->getChain();
+  SDNode* Const32 = ST->getBasePtr().getNode();
+  SDValue Value = ST->getValue();
+  unsigned Opcode = 0;
+
+  // Try to lower stores of GlobalAdresses into indexed stores.  Custom
+  // lowering for GlobalAddress nodes has already turned it into a
+  // CONST32.  Avoid truncating stores for the moment.  Post-inc stores
+  // do the same.  Don't think there's a reason for it, so will file a
+  // bug to fix.
+  if ((Const32->getOpcode() == HexagonISD::CONST32) &&
+      !(Value.getValueType() == MVT::i64 && ST->isTruncatingStore())) {
+    SDValue Base = Const32->getOperand(0);
+    if (Base.getOpcode() == ISD::TargetGlobalAddress) {
+      EVT StoredVT = ST->getMemoryVT();
+      int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
+      if (Offset != 0 && OffsetFitsS11(StoredVT, Offset)) {
+        MVT PointerTy = getTargetLowering()->getPointerTy();
+        const GlobalValue* GV =
+          cast<GlobalAddressSDNode>(Base)->getGlobal();
+        SDValue TargAddr =
+          CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0);
+        SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set,
+                                                 dl, PointerTy,
+                                                 TargAddr);
+
+        // Figure out base + offset opcode
+        if (StoredVT == MVT::i64) Opcode = Hexagon::STrid_indexed;
+        else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
+        else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed;
+        else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed;
+        else llvm_unreachable("unknown memory type");
+
+        SDValue Ops[] = {SDValue(NewBase,0),
+                         CurDAG->getTargetConstant(Offset,PointerTy),
+                         Value, Chain};
+        // build indexed store
+        SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
+                                                MVT::Other, Ops);
+        MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+        MemOp[0] = ST->getMemOperand();
+        cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+        ReplaceUses(ST, Result);
+        return Result;
+      }
+    }
+  }
+
+  return SelectCode(ST);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
+  SDLoc dl(N);
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  ISD::MemIndexedMode AM = ST->getAddressingMode();
+
+  // Handle indexed stores.
+  if (AM != ISD::UNINDEXED) {
+    return SelectIndexedStore(ST, dl);
+  }
+
+  return SelectBaseOffsetStore(ST, dl);
+}
+
+SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
+  SDLoc dl(N);
+
+  //
+  // %conv.i = sext i32 %tmp1 to i64
+  // %conv2.i = sext i32 %add to i64
+  // %mul.i = mul nsw i64 %conv2.i, %conv.i
+  //
+  //   --- match with the following ---
+  //
+  // %mul.i = mpy (%tmp1, %add)
+  //
+
+  if (N->getValueType(0) == MVT::i64) {
+    // Shifting a i64 signed multiply.
+    SDValue MulOp0 = N->getOperand(0);
+    SDValue MulOp1 = N->getOperand(1);
+
+    SDValue OP0;
+    SDValue OP1;
+
+    // Handle sign_extend and sextload.
+    if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
+      SDValue Sext0 = MulOp0.getOperand(0);
+      if (Sext0.getNode()->getValueType(0) != MVT::i32) {
+        return SelectCode(N);
+      }
+
+      OP0 = Sext0;
+    } else if (MulOp0.getOpcode() == ISD::LOAD) {
+      LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
+      if (LD->getMemoryVT() != MVT::i32 ||
+          LD->getExtensionType() != ISD::SEXTLOAD ||
+          LD->getAddressingMode() != ISD::UNINDEXED) {
+        return SelectCode(N);
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+      OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                            MVT::Other,
+                                            LD->getBasePtr(), TargetConst0,
+                                            Chain), 0);
+    } else {
+      return SelectCode(N);
+    }
+
+    // Same goes for the second operand.
+    if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
+      SDValue Sext1 = MulOp1.getOperand(0);
+      if (Sext1.getNode()->getValueType(0) != MVT::i32) {
+        return SelectCode(N);
+      }
+
+      OP1 = Sext1;
+    } else if (MulOp1.getOpcode() == ISD::LOAD) {
+      LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
+      if (LD->getMemoryVT() != MVT::i32 ||
+          LD->getExtensionType() != ISD::SEXTLOAD ||
+          LD->getAddressingMode() != ISD::UNINDEXED) {
+        return SelectCode(N);
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+      OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                            MVT::Other,
+                                            LD->getBasePtr(), TargetConst0,
+                                            Chain), 0);
+    } else {
+      return SelectCode(N);
+    }
+
+    // Generate a mpy instruction.
+    SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY64, dl, MVT::i64,
+                                            OP0, OP1);
+    ReplaceUses(N, Result);
+    return Result;
+  }
+
+  return SelectCode(N);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
+  SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() == ISD::SETCC) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+      SDValue N000 = N00.getOperand(0);
+      SDValue N001 = N00.getOperand(1);
+      if (cast<VTSDNode>(N001)->getVT() == MVT::i16) {
+        SDValue N01 = N0.getOperand(1);
+        SDValue N02 = N0.getOperand(2);
+
+        // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2,
+        // i16:Other),IntRegs:i32:$src1, SETLT:Other),IntRegs:i32:$src1,
+        // IntRegs:i32:$src2)
+        // Emits: (MAXh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2)
+        // Pattern complexity = 9  cost = 1  size = 0.
+        if (cast<CondCodeSDNode>(N02)->get() == ISD::SETLT) {
+          SDValue N1 = N->getOperand(1);
+          if (N01 == N1) {
+            SDValue N2 = N->getOperand(2);
+            if (N000 == N2 &&
+                N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 &&
+                N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) {
+              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl,
+                                                        MVT::i32, N000);
+              SDNode *Result = CurDAG->getMachineNode(Hexagon::MAXw_rr, dl,
+                                                      MVT::i32,
+                                                      SDValue(SextNode, 0),
+                                                      N1);
+              ReplaceUses(N, Result);
+              return Result;
+            }
+          }
+        }
+
+        // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2,
+        // i16:Other), IntRegs:i32:$src1, SETGT:Other), IntRegs:i32:$src1,
+        // IntRegs:i32:$src2)
+        // Emits: (MINh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2)
+        // Pattern complexity = 9  cost = 1  size = 0.
+        if (cast<CondCodeSDNode>(N02)->get() == ISD::SETGT) {
+          SDValue N1 = N->getOperand(1);
+          if (N01 == N1) {
+            SDValue N2 = N->getOperand(2);
+            if (N000 == N2 &&
+                N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 &&
+                N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) {
+              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl,
+                                                        MVT::i32, N000);
+              SDNode *Result = CurDAG->getMachineNode(Hexagon::MINw_rr, dl,
+                                                      MVT::i32,
+                                                      SDValue(SextNode, 0),
+                                                      N1);
+              ReplaceUses(N, Result);
+              return Result;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return SelectCode(N);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Shift = N->getOperand(0);
+
+  //
+  // %conv.i = sext i32 %tmp1 to i64
+  // %conv2.i = sext i32 %add to i64
+  // %mul.i = mul nsw i64 %conv2.i, %conv.i
+  // %shr5.i = lshr i64 %mul.i, 32
+  // %conv3.i = trunc i64 %shr5.i to i32
+  //
+  //   --- match with the following ---
+  //
+  // %conv3.i = mpy (%tmp1, %add)
+  //
+  // Trunc to i32.
+  if (N->getValueType(0) == MVT::i32) {
+    // Trunc from i64.
+    if (Shift.getNode()->getValueType(0) == MVT::i64) {
+      // Trunc child is logical shift right.
+      if (Shift.getOpcode() != ISD::SRL) {
+        return SelectCode(N);
+      }
+
+      SDValue ShiftOp0 = Shift.getOperand(0);
+      SDValue ShiftOp1 = Shift.getOperand(1);
+
+      // Shift by const 32
+      if (ShiftOp1.getOpcode() != ISD::Constant) {
+        return SelectCode(N);
+      }
+
+      int32_t ShiftConst =
+        cast<ConstantSDNode>(ShiftOp1.getNode())->getSExtValue();
+      if (ShiftConst != 32) {
+        return SelectCode(N);
+      }
+
+      // Shifting a i64 signed multiply
+      SDValue Mul = ShiftOp0;
+      if (Mul.getOpcode() != ISD::MUL) {
+        return SelectCode(N);
+      }
+
+      SDValue MulOp0 = Mul.getOperand(0);
+      SDValue MulOp1 = Mul.getOperand(1);
+
+      SDValue OP0;
+      SDValue OP1;
+
+      // Handle sign_extend and sextload
+      if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
+        SDValue Sext0 = MulOp0.getOperand(0);
+        if (Sext0.getNode()->getValueType(0) != MVT::i32) {
+          return SelectCode(N);
+        }
+
+        OP0 = Sext0;
+      } else if (MulOp0.getOpcode() == ISD::LOAD) {
+        LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
+        if (LD->getMemoryVT() != MVT::i32 ||
+            LD->getExtensionType() != ISD::SEXTLOAD ||
+            LD->getAddressingMode() != ISD::UNINDEXED) {
+          return SelectCode(N);
+        }
+
+        SDValue Chain = LD->getChain();
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+        OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                              MVT::Other,
+                                              LD->getBasePtr(),
+                                              TargetConst0, Chain), 0);
+      } else {
+        return SelectCode(N);
+      }
+
+      // Same goes for the second operand.
+      if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
+        SDValue Sext1 = MulOp1.getOperand(0);
+        if (Sext1.getNode()->getValueType(0) != MVT::i32)
+          return SelectCode(N);
+
+        OP1 = Sext1;
+      } else if (MulOp1.getOpcode() == ISD::LOAD) {
+        LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
+        if (LD->getMemoryVT() != MVT::i32 ||
+            LD->getExtensionType() != ISD::SEXTLOAD ||
+            LD->getAddressingMode() != ISD::UNINDEXED) {
+          return SelectCode(N);
+        }
+
+        SDValue Chain = LD->getChain();
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+        OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                              MVT::Other,
+                                              LD->getBasePtr(),
+                                              TargetConst0, Chain), 0);
+      } else {
+        return SelectCode(N);
+      }
+
+      // Generate a mpy instruction.
+      SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY, dl, MVT::i32,
+                                              OP0, OP1);
+      ReplaceUses(N, Result);
+      return Result;
+    }
+  }
+
+  return SelectCode(N);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
+  SDLoc dl(N);
+  if (N->getValueType(0) == MVT::i32) {
+    SDValue Shl_0 = N->getOperand(0);
+    SDValue Shl_1 = N->getOperand(1);
+    // RHS is const.
+    if (Shl_1.getOpcode() == ISD::Constant) {
+      if (Shl_0.getOpcode() == ISD::MUL) {
+        SDValue Mul_0 = Shl_0.getOperand(0); // Val
+        SDValue Mul_1 = Shl_0.getOperand(1); // Const
+        // RHS of mul is const.
+        if (Mul_1.getOpcode() == ISD::Constant) {
+          int32_t ShlConst =
+            cast<ConstantSDNode>(Shl_1.getNode())->getSExtValue();
+          int32_t MulConst =
+            cast<ConstantSDNode>(Mul_1.getNode())->getSExtValue();
+          int32_t ValConst = MulConst << ShlConst;
+          SDValue Val = CurDAG->getTargetConstant(ValConst,
+                                                  MVT::i32);
+          if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val.getNode()))
+            if (isInt<9>(CN->getSExtValue())) {
+              SDNode* Result =
+                CurDAG->getMachineNode(Hexagon::MPYI_ri, dl,
+                                       MVT::i32, Mul_0, Val);
+              ReplaceUses(N, Result);
+              return Result;
+            }
+
+        }
+      } else if (Shl_0.getOpcode() == ISD::SUB) {
+        SDValue Sub_0 = Shl_0.getOperand(0); // Const 0
+        SDValue Sub_1 = Shl_0.getOperand(1); // Val
+        if (Sub_0.getOpcode() == ISD::Constant) {
+          int32_t SubConst =
+            cast<ConstantSDNode>(Sub_0.getNode())->getSExtValue();
+          if (SubConst == 0) {
+            if (Sub_1.getOpcode() == ISD::SHL) {
+              SDValue Shl2_0 = Sub_1.getOperand(0); // Val
+              SDValue Shl2_1 = Sub_1.getOperand(1); // Const
+              if (Shl2_1.getOpcode() == ISD::Constant) {
+                int32_t ShlConst =
+                  cast<ConstantSDNode>(Shl_1.getNode())->getSExtValue();
+                int32_t Shl2Const =
+                  cast<ConstantSDNode>(Shl2_1.getNode())->getSExtValue();
+                int32_t ValConst = 1 << (ShlConst+Shl2Const);
+                SDValue Val = CurDAG->getTargetConstant(-ValConst, MVT::i32);
+                if (ConstantSDNode *CN =
+                    dyn_cast<ConstantSDNode>(Val.getNode()))
+                  if (isInt<9>(CN->getSExtValue())) {
+                    SDNode* Result =
+                      CurDAG->getMachineNode(Hexagon::MPYI_ri, dl, MVT::i32,
+                                             Shl2_0, Val);
+                    ReplaceUses(N, Result);
+                    return Result;
+                  }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return SelectCode(N);
+}
+
+
+//
+// If there is an zero_extend followed an intrinsic in DAG (this means - the
+// result of the intrinsic is predicate); convert the zero_extend to
+// transfer instruction.
+//
+// Zero extend -> transfer is lowered here. Otherwise, zero_extend will be
+// converted into a MUX as predicate registers defined as 1 bit in the
+// compiler. Architecture defines them as 8-bit registers.
+// We want to preserve all the lower 8-bits and, not just 1 LSB bit.
+//
+SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
+  SDLoc dl(N);
+  SDNode *IsIntrinsic = N->getOperand(0).getNode();
+  if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
+    unsigned ID =
+      cast<ConstantSDNode>(IsIntrinsic->getOperand(0))->getZExtValue();
+    if (doesIntrinsicReturnPredicate(ID)) {
+      // Now we need to differentiate target data types.
+      if (N->getValueType(0) == MVT::i64) {
+        // Convert the zero_extend to Rs = Pd followed by COMBINE_rr(0,Rs).
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+        SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl,
+                                                  MVT::i32,
+                                                  SDValue(IsIntrinsic, 0));
+        SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl,
+                                                  MVT::i32,
+                                                  TargetConst0);
+        SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+                                                  MVT::i64, MVT::Other,
+                                                  SDValue(Result_2, 0),
+                                                  SDValue(Result_1, 0));
+        ReplaceUses(N, Result_3);
+        return Result_3;
+      }
+      if (N->getValueType(0) == MVT::i32) {
+        // Convert the zero_extend to Rs = Pd
+        SDNode* RsPd = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl,
+                                              MVT::i32,
+                                              SDValue(IsIntrinsic, 0));
+        ReplaceUses(N, RsPd);
+        return RsPd;
+      }
+      llvm_unreachable("Unexpected value type");
+    }
+  }
+  return SelectCode(N);
+}
+
+
+//
+// Checking for intrinsics which have predicate registers as operand(s)
+// and lowering to the actual intrinsic.
+//
+SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
+  SDLoc dl(N);
+  unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  unsigned IntrinsicWithPred = doesIntrinsicContainPredicate(ID);
+
+  // We are concerned with only those intrinsics that have predicate registers
+  // as at least one of the operands.
+  if (IntrinsicWithPred) {
+    SmallVector<SDValue, 8> Ops;
+    const HexagonInstrInfo *TII =
+      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+    const MCInstrDesc &MCID = TII->get(IntrinsicWithPred);
+    const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+    // Iterate over all the operands of the intrinsics.
+    // For PredRegs, do the transfer.
+    // For Double/Int Regs, just preserve the value
+    // For immediates, lower it.
+    for (unsigned i = 1; i < N->getNumOperands(); ++i) {
+      SDNode *Arg = N->getOperand(i).getNode();
+      const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI, *MF);
+
+      if (RC == &Hexagon::IntRegsRegClass ||
+          RC == &Hexagon::DoubleRegsRegClass) {
+        Ops.push_back(SDValue(Arg, 0));
+      } else if (RC == &Hexagon::PredRegsRegClass) {
+        // Do the transfer.
+        SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
+                                              SDValue(Arg, 0));
+        Ops.push_back(SDValue(PdRs,0));
+      } else if (!RC && (dyn_cast<ConstantSDNode>(Arg) != nullptr)) {
+        // This is immediate operand. Lower it here making sure that we DO have
+        // const SDNode for immediate value.
+        int32_t Val = cast<ConstantSDNode>(Arg)->getSExtValue();
+        SDValue SDVal = CurDAG->getTargetConstant(Val, MVT::i32);
+        Ops.push_back(SDVal);
+      } else {
+        llvm_unreachable("Unimplemented");
+      }
+    }
+    EVT ReturnValueVT = N->getValueType(0);
+    SDNode *Result = CurDAG->getMachineNode(IntrinsicWithPred, dl,
+                                            ReturnValueVT, Ops);
+    ReplaceUses(N, Result);
+    return Result;
+  }
+  return SelectCode(N);
+}
+
+//
+// Map floating point constant values.
+//
+SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
+  SDLoc dl(N);
+  ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+  APFloat APF = CN->getValueAPF();
+  if (N->getValueType(0) == MVT::f32) {
+    return CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32,
+              CurDAG->getTargetConstantFP(APF.convertToFloat(), MVT::f32));
+  }
+  else if (N->getValueType(0) == MVT::f64) {
+    return CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64,
+              CurDAG->getTargetConstantFP(APF.convertToDouble(), MVT::f64));
+  }
+
+  return SelectCode(N);
+}
+
+
+//
+// Map predicate true (encoded as -1 in LLVM) to a XOR.
+//
+SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
+  SDLoc dl(N);
+  if (N->getValueType(0) == MVT::i1) {
+    SDNode* Result;
+    int32_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    if (Val == -1) {
+      // Create the IntReg = 1 node.
+      SDNode* IntRegTFR =
+        CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+                               CurDAG->getTargetConstant(0, MVT::i32));
+
+      // Pd = IntReg
+      SDNode* Pd = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
+                                          SDValue(IntRegTFR, 0));
+
+      // not(Pd)
+      SDNode* NotPd = CurDAG->getMachineNode(Hexagon::NOT_p, dl, MVT::i1,
+                                             SDValue(Pd, 0));
+
+      // xor(not(Pd))
+      Result = CurDAG->getMachineNode(Hexagon::XOR_pp, dl, MVT::i1,
+                                      SDValue(Pd, 0), SDValue(NotPd, 0));
+
+      // We have just built:
+      // Rs = Pd
+      // Pd = xor(not(Pd), Pd)
+
+      ReplaceUses(N, Result);
+      return Result;
+    }
+  }
+
+  return SelectCode(N);
+}
+
+
+//
+// Map add followed by a asr -> asr +=.
+//
+SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
+  SDLoc dl(N);
+  if (N->getValueType(0) != MVT::i32) {
+    return SelectCode(N);
+  }
+  // Identify nodes of the form: add(asr(...)).
+  SDNode* Src1 = N->getOperand(0).getNode();
+  if (Src1->getOpcode() != ISD::SRA || !Src1->hasOneUse()
+      || Src1->getValueType(0) != MVT::i32) {
+    return SelectCode(N);
+  }
+
+  // Build Rd = Rd' + asr(Rs, Rt). The machine constraints will ensure that
+  // Rd and Rd' are assigned to the same register
+  SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_ADD_rr, dl, MVT::i32,
+                                          N->getOperand(1),
+                                          Src1->getOperand(0),
+                                          Src1->getOperand(1));
+  ReplaceUses(N, Result);
+
+  return Result;
+}
+
+
+SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return nullptr;   // Already selected.
+  }
+
+
+  switch (N->getOpcode()) {
+  case ISD::Constant:
+    return SelectConstant(N);
+
+  case ISD::ConstantFP:
+    return SelectConstantFP(N);
+
+  case ISD::ADD:
+    return SelectAdd(N);
+
+  case ISD::SHL:
+    return SelectSHL(N);
+
+  case ISD::LOAD:
+    return SelectLoad(N);
+
+  case ISD::STORE:
+    return SelectStore(N);
+
+  case ISD::SELECT:
+    return SelectSelect(N);
+
+  case ISD::TRUNCATE:
+    return SelectTruncate(N);
+
+  case ISD::MUL:
+    return SelectMul(N);
+
+  case ISD::ZERO_EXTEND:
+    return SelectZeroExtend(N);
+
+  case ISD::INTRINSIC_WO_CHAIN:
+    return SelectIntrinsicWOChain(N);
+  }
+
+  return SelectCode(N);
+}
+
+
+//
+// Hexagon_TODO: Five functions for ADDRri?! Surely there must be a better way
+// to define these instructions.
+//
+bool HexagonDAGToDAGISel::SelectADDRri(SDValue& Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_0(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_0_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_0_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_1(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_1_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_1_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_2(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_2_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_2_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriU6_0(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsU6_0_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsU6_0_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriU6_1(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsU6_1_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsU6_1_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriU6_2(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsU6_2_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsU6_2_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectMEMriS11_2(SDValue& Addr, SDValue &Base,
+                                           SDValue &Offset) {
+
+  if (Addr.getOpcode() != ISD::ADD) {
+    return(SelectADDRriS11_2(Addr, Base, Offset));
+  }
+
+  return SelectADDRriS11_2(Addr, Base, Offset);
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_3(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_3_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_3_Offset(Offset.getNode()));
+}
+
+bool HexagonDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1,
+                                       SDValue &R2) {
+  if (Addr.getOpcode() == ISD::FrameIndex) return false;
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if (isInt<13>(CN->getSExtValue()))
+        return false;  // Let the reg+imm pattern catch this!
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    return true;
+  }
+
+  R1 = Addr;
+
+  return true;
+}
+
+
+// Handle generic address case. It is accessed from inlined asm =m constraints,
+// which could have any kind of pointer.
+bool HexagonDAGToDAGISel::SelectAddr(SDNode *Op, SDValue Addr,
+                                          SDValue &Base, SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+bool HexagonDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+
+  switch (ConstraintCode) {
+  case 'o':   // Offsetable.
+  case 'v':   // Not offsetable.
+  default: return true;
+  case 'm':   // Memory.
+    if (!SelectAddr(Op.getNode(), Op, Op0, Op1))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const {
+  unsigned UseCount = 0;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    UseCount++;
+  }
+
+  return (UseCount <= 1);
+
+}
+
+//===--------------------------------------------------------------------===//
+// Return 'true' if use count of the global address is below threshold.
+//===--------------------------------------------------------------------===//
+bool HexagonDAGToDAGISel::hasNumUsesBelowThresGA(SDNode *N) const {
+  assert(N->getOpcode() == ISD::TargetGlobalAddress &&
+         "Expecting a target global address");
+
+  // Always try to fold the address.
+  if (TM.getOptLevel() == CodeGenOpt::Aggressive)
+    return true;
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+  DenseMap<const GlobalValue *, unsigned>::const_iterator GI =
+    GlobalAddressUseCountMap.find(GA->getGlobal());
+
+  if (GI == GlobalAddressUseCountMap.end())
+    return false;
+
+  return GI->second <= MaxNumOfUsesForConstExtenders;
+}
+
+//===--------------------------------------------------------------------===//
+// Return true if the non-GP-relative global address can be folded.
+//===--------------------------------------------------------------------===//
+inline bool HexagonDAGToDAGISel::foldGlobalAddress(SDValue &N, SDValue &R) {
+  return foldGlobalAddressImpl(N, R, false);
+}
+
+//===--------------------------------------------------------------------===//
+// Return true if the GP-relative global address can be folded.
+//===--------------------------------------------------------------------===//
+inline bool HexagonDAGToDAGISel::foldGlobalAddressGP(SDValue &N, SDValue &R) {
+  return foldGlobalAddressImpl(N, R, true);
+}
+
+//===--------------------------------------------------------------------===//
+// Fold offset of the global address if number of uses are below threshold.
+//===--------------------------------------------------------------------===//
+bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R,
+                                                bool ShouldLookForGP) {
+  if (N.getOpcode() == ISD::ADD) {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    if ((ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32_GP)) ||
+        (!ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32))) {
+      ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1);
+      GlobalAddressSDNode *GA =
+        dyn_cast<GlobalAddressSDNode>(N0.getOperand(0));
+
+      if (Const && GA &&
+          (GA->getOpcode() == ISD::TargetGlobalAddress)) {
+        if ((N0.getOpcode() == HexagonISD::CONST32) &&
+                !hasNumUsesBelowThresGA(GA))
+            return false;
+        R = CurDAG->getTargetGlobalAddress(GA->getGlobal(),
+                                          SDLoc(Const),
+                                          N.getValueType(),
+                                          GA->getOffset() +
+                                          (uint64_t)Const->getSExtValue());
+        return true;
+      }
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
new file mode 100644
index 0000000..a460ea4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -0,0 +1,1708 @@
+//===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Hexagon uses to lower LLVM code
+// into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonISelLowering.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-lowering"
+
+static cl::opt<bool>
+EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
+               cl::desc("Control jump table emission on Hexagon target"));
+
+namespace {
+class HexagonCCState : public CCState {
+  int NumNamedVarArgParams;
+
+public:
+  HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                 const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
+                 LLVMContext &C, int NumNamedVarArgParams)
+      : CCState(CC, isVarArg, MF, TM, locs, C),
+        NumNamedVarArgParams(NumNamedVarArgParams) {}
+
+  int getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
+};
+}
+
+// Implement calling convention for Hexagon.
+static bool
+CC_Hexagon(unsigned ValNo, MVT ValVT,
+           MVT LocVT, CCValAssign::LocInfo LocInfo,
+           ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon32(unsigned ValNo, MVT ValVT,
+             MVT LocVT, CCValAssign::LocInfo LocInfo,
+             ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon64(unsigned ValNo, MVT ValVT,
+             MVT LocVT, CCValAssign::LocInfo LocInfo,
+             ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+              MVT LocVT, CCValAssign::LocInfo LocInfo,
+              ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
+            MVT LocVT, CCValAssign::LocInfo LocInfo,
+            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  HexagonCCState &HState = static_cast<HexagonCCState &>(State);
+
+  // NumNamedVarArgParams can not be zero for a VarArg function.
+  assert((HState.getNumNamedVarArgParams() > 0) &&
+         "NumNamedVarArgParams is not bigger than zero.");
+
+  if ((int)ValNo < HState.getNumNamedVarArgParams()) {
+    // Deal with named arguments.
+    return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+  }
+
+  // Deal with un-named arguments.
+  unsigned ofst;
+  if (ArgFlags.isByVal()) {
+    // If pass-by-value, the size allocated on stack is decided
+    // by ArgFlags.getByValSize(), not by the size of LocVT.
+    assert ((ArgFlags.getByValSize() > 8) &&
+            "ByValSize must be bigger than 8 bytes");
+    ofst = State.AllocateStack(ArgFlags.getByValSize(), 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    ofst = State.AllocateStack(4, 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    ofst = State.AllocateStack(8, 8);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+    return false;
+  }
+  llvm_unreachable(nullptr);
+}
+
+
+static bool
+CC_Hexagon (unsigned ValNo, MVT ValVT,
+            MVT LocVT, CCValAssign::LocInfo LocInfo,
+            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  if (ArgFlags.isByVal()) {
+    // Passed on stack.
+    assert ((ArgFlags.getByValSize() > 8) &&
+            "ByValSize must be bigger than 8 bytes");
+    unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  return true;  // CC didn't match.
+}
+
+
+static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
+                         MVT LocVT, CCValAssign::LocInfo LocInfo,
+                         ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  static const MCPhysReg RegList[] = {
+    Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+    Hexagon::R5
+  };
+  if (unsigned Reg = State.AllocateReg(RegList, 6)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
+                         MVT LocVT, CCValAssign::LocInfo LocInfo,
+                         ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  static const MCPhysReg RegList1[] = {
+    Hexagon::D1, Hexagon::D2
+  };
+  static const MCPhysReg RegList2[] = {
+    Hexagon::R1, Hexagon::R3
+  };
+  if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+                          MVT LocVT, CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+
+  if (LocVT == MVT::i1 ||
+      LocVT == MVT::i8 ||
+      LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+    return false;
+  }
+
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+    return false;
+  }
+
+  return true;  // CC didn't match.
+}
+
+static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::R0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  }
+
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  }
+
+  unsigned Offset = State.AllocateStack(8, 8);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+SDValue
+HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
+const {
+  return SDValue();
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size".  Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.  Sometimes what we are copying is the end of a
+/// larger object, the part that does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          SDLoc dl) {
+
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*isVolatile=*/false, /*AlwaysInline=*/false,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+
+// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
+// passed by value, the function prototype is modified to return void and
+// the value is stored in memory pointed by a pointer passed by caller.
+SDValue
+HexagonTargetLowering::LowerReturn(SDValue Chain,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   SDLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze return values of ISD::RET
+  CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns a SDNode with the same number of values as the
+/// ISD::CALL.
+SDValue
+HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                       CallingConv::ID CallConv, bool isVarArg,
+                                       const
+                                       SmallVectorImpl<ISD::InputArg> &Ins,
+                                       SDLoc dl, SelectionDAG &DAG,
+                                       SmallVectorImpl<SDValue> &InVals,
+                                       const SmallVectorImpl<SDValue> &OutVals,
+                                       SDValue Callee) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl,
+                               RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+/// LowerCall - Functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+
+  // Check for varargs.
+  int NumNamedVarArgParams = -1;
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
+  {
+    const Function* CalleeFn = nullptr;
+    Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32);
+    if ((CalleeFn = dyn_cast<Function>(GA->getGlobal())))
+    {
+      // If a function has zero args and is a vararg function, that's
+      // disallowed so it must be an undeclared function.  Do not assume
+      // varargs if the callee is undefined.
+      if (CalleeFn->isVarArg() &&
+          CalleeFn->getFunctionType()->getNumParams() != 0) {
+        NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams();
+      }
+    }
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  HexagonCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                        getTargetMachine(), ArgLocs, *DAG.getContext(),
+                        NumNamedVarArgParams);
+
+  if (NumNamedVarArgParams > 0)
+    CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+  else
+    CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
+
+
+  if(isTailCall) {
+    bool StructAttrFlag =
+      DAG.getMachineFunction().getFunction()->hasStructRetAttr();
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                                                   isVarArg, IsStructRet,
+                                                   StructAttrFlag,
+                                                   Outs, OutVals, Ins, DAG);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i){
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isMemLoc()) {
+        isTailCall = false;
+        break;
+      }
+    }
+    if (isTailCall) {
+      DEBUG(dbgs () << "Eligible for Tail Call\n");
+    } else {
+      DEBUG(dbgs () <<
+            "Argument must be passed on stack. Not eligible for Tail Call\n");
+    }
+  }
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+      DAG.getTarget().getRegisterInfo());
+  SDValue StackPtr =
+      DAG.getCopyFromReg(Chain, dl, QRI->getStackRegister(), getPointerTy());
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default:
+        // Loc info must be one of Full, SExt, ZExt, or AExt.
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    if (VA.isMemLoc()) {
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      SDValue PtrOff = DAG.getConstant(LocMemOffset, StackPtr.getValueType());
+      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+
+      if (Flags.isByVal()) {
+        // The argument is a struct passed by value. According to LLVM, "Arg"
+        // is is pointer.
+        MemOpChains.push_back(CreateCopyOfByValArgument(Arg, PtrOff, Chain,
+                                                        Flags, DAG, dl));
+      } else {
+        // The argument is not passed by value. "Arg" is a buildin type. It is
+        // not a pointer.
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),false, false,
+                                           0));
+      }
+      continue;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector.
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    }
+  }
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+  }
+
+  if (!isTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
+                                                        getPointerTy(), true),
+                                 dl);
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  if (!isTailCall) {
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    //
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag =SDValue();
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (flag_aligned_memcpy) {
+    const char *MemcpyName =
+      "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes";
+    Callee =
+      DAG.getTargetExternalSymbol(MemcpyName, getPointerTy());
+    flag_aligned_memcpy = false;
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy());
+  } else if (ExternalSymbolSDNode *S =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+  }
+
+  if (InFlag.getNode()) {
+    Ops.push_back(InFlag);
+  }
+
+  if (isTailCall)
+    return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
+
+  Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, OutVals, Callee);
+}
+
+static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                   bool isSEXTLoad, SDValue &Base,
+                                   SDValue &Offset, bool &isInc,
+                                   SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD)
+  return false;
+
+  if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    // Ensure that Offset is a constant.
+    return (isa<ConstantSDNode>(Offset));
+  }
+
+  return false;
+}
+
+// TODO: Put this function along with the other isS* functions in
+// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the
+// functions defined in HexagonOperands.td.
+static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) {
+  ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS4 predicate - True if the immediate fits in a 4-bit sign extended.
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  int64_t m = 0;
+  if (ShiftAmount > 0) {
+    m = v % ShiftAmount;
+    v = v >> ShiftAmount;
+  }
+  return (v <= 7) && (v >= -8) && (m == 0);
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                       SDValue &Base,
+                                                       SDValue &Offset,
+                                                       ISD::MemIndexedMode &AM,
+                                                       SelectionDAG &DAG) const
+{
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+    if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore()) {
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  bool isInc = false;
+  bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  // ShiftAmount = number of left-shifted bits in the Hexagon instruction.
+  int ShiftAmount = VT.getSizeInBits() / 16;
+  if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) {
+    AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+    return true;
+  }
+
+  return false;
+}
+
+SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  MachineFunction &MF = DAG.getMachineFunction();
+  HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+  switch (Node->getOpcode()) {
+    case ISD::INLINEASM: {
+      unsigned NumOps = Node->getNumOperands();
+      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+        --NumOps;  // Ignore the flag operand.
+
+      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+        if (FuncInfo->hasClobberLR())
+          break;
+        unsigned Flags =
+          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+        ++i;  // Skip the ID value.
+
+        switch (InlineAsm::getKind(Flags)) {
+        default: llvm_unreachable("Bad flags!");
+          case InlineAsm::Kind_RegDef:
+          case InlineAsm::Kind_RegUse:
+          case InlineAsm::Kind_Imm:
+          case InlineAsm::Kind_Clobber:
+          case InlineAsm::Kind_Mem: {
+            for (; NumVals; --NumVals, ++i) {}
+            break;
+          }
+          case InlineAsm::Kind_RegDefEarlyClobber: {
+            for (; NumVals; --NumVals, ++i) {
+              unsigned Reg =
+                cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+
+              // Check it to be lr
+              const HexagonRegisterInfo *QRI =
+                  static_cast<const HexagonRegisterInfo *>(
+                      DAG.getTarget().getRegisterInfo());
+              if (Reg == QRI->getRARegister()) {
+                FuncInfo->setHasClobberLR(true);
+                break;
+              }
+            }
+            break;
+          }
+        }
+      }
+    }
+  } // Node->getOpcode
+  return Op;
+}
+
+
+//
+// Taken from the XCore backend.
+//
+SDValue HexagonTargetLowering::
+LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc dl(Op);
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  unsigned JTI = JT->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+
+  // Mark all jump table targets as address taken.
+  const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs;
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    MBB->setHasAddressTaken();
+    // This line is needed to set the hasAddressTaken flag on the BasicBlock
+    // object.
+    BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock()));
+  }
+
+  SDValue JumpTableBase = DAG.getNode(HexagonISD::WrapperJT, dl,
+                                      getPointerTy(), TargetJT);
+  SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
+                                   DAG.getConstant(2, MVT::i32));
+  SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase,
+                                  ShiftIndex);
+  SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress,
+                                   MachinePointerInfo(), false, false, false,
+                                   0);
+  return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget);
+}
+
+
+SDValue
+HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+
+  // Get a reference to the stack pointer.
+  SDValue StackPointer = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
+
+  // Subtract the dynamic size from the actual stack size to
+  // obtain the new stack size.
+  SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, StackPointer, Size);
+
+  //
+  // For Hexagon, the outgoing memory arguments area should be on top of the
+  // alloca area on the stack i.e., the outgoing memory arguments should be
+  // at a lower address than the alloca area. Move the alloca area down the
+  // stack by adding back the space reserved for outgoing arguments to SP
+  // here.
+  //
+  // We do not know what the size of the outgoing args is at this point.
+  // So, we add a pseudo instruction ADJDYNALLOC that will adjust the
+  // stack pointer. We patch this instruction with the correct, known
+  // offset in emitPrologue().
+  //
+  // Use a placeholder immediate (zero) for now. This will be patched up
+  // by emitPrologue().
+  SDValue ArgAdjust = DAG.getNode(HexagonISD::ADJDYNALLOC, dl,
+                                  MVT::i32,
+                                  Sub,
+                                  DAG.getConstant(0, MVT::i32));
+
+  // The Sub result contains the new stack start address, so it
+  // must be placed in the stack pointer register.
+  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+      DAG.getTarget().getRegisterInfo());
+  SDValue CopyChain = DAG.getCopyToReg(Chain, dl, QRI->getStackRegister(), Sub);
+
+  SDValue Ops[2] = { ArgAdjust, CopyChain };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+HexagonTargetLowering::LowerFormalArguments(SDValue Chain,
+                                            CallingConv::ID CallConv,
+                                            bool isVarArg,
+                                            const
+                                            SmallVectorImpl<ISD::InputArg> &Ins,
+                                            SDLoc dl, SelectionDAG &DAG,
+                                            SmallVectorImpl<SDValue> &InVals)
+const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+
+  // For LLVM, in the case when returning a struct by value (>8byte),
+  // the first argument is a pointer that points to the location on caller's
+  // stack where the return value will be stored. For Hexagon, the location on
+  // caller's stack is passed only when the struct size is smaller than (and
+  // equal to) 8 bytes. If not, no address will be passed into callee and
+  // callee return the result direclty through R0/R1.
+
+  SmallVector<SDValue, 4> MemOps;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    unsigned ObjSize;
+    unsigned StackLocation;
+    int FI;
+
+    if (   (VA.isRegLoc() && !Flags.isByVal())
+        || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
+      // Arguments passed in registers
+      // 1. int, long long, ptr args that get allocated in register.
+      // 2. Large struct that gets an register to put its address in.
+      EVT RegVT = VA.getLocVT();
+      if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
+          RegVT == MVT::i32 || RegVT == MVT::f32) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else if (RegVT == MVT::i64) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else {
+        assert (0);
+      }
+    } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
+      assert (0 && "ByValSize must be bigger than 8 bytes");
+    } else {
+      // Sanity check.
+      assert(VA.isMemLoc());
+
+      if (Flags.isByVal()) {
+        // If it's a byval parameter, then we need to compute the
+        // "real" size, not the size of the pointer.
+        ObjSize = Flags.getByValSize();
+      } else {
+        ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
+      }
+
+      StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+      // Create the frame index object for this incoming parameter...
+      FI = MFI->CreateFixedObject(ObjSize, StackLocation, true);
+
+      // Create the SelectionDAG nodes cordl, responding to a load
+      // from this parameter.
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+
+      if (Flags.isByVal()) {
+        // If it's a pass-by-value aggregate, then do not dereference the stack
+        // location. Instead, we should generate a reference to the stack
+        // location.
+        InVals.push_back(FIN);
+      } else {
+        InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                                     MachinePointerInfo(), false, false,
+                                     false, 0));
+      }
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  if (isVarArg) {
+    // This will point to the next argument passed via stack.
+    int FrameIndex = MFI->CreateFixedObject(Hexagon_PointerSize,
+                                            HEXAGON_LRFP_SIZE +
+                                            CCInfo.getNextStackOffset(),
+                                            true);
+    FuncInfo->setVarArgsFrameIndex(FrameIndex);
+  }
+
+  return Chain;
+}
+
+SDValue
+HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  // VASTART stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  MachineFunction &MF = DAG.getMachineFunction();
+  HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
+  SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr,
+                      Op.getOperand(1), MachinePointerInfo(SV), false,
+                      false, 0);
+}
+
+SDValue
+HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  EVT ValTy = Op.getValueType();
+  SDLoc dl(Op);
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry())
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy,
+                                    CP->getAlignment());
+  else
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy,
+                                    CP->getAlignment());
+  return DAG.getNode(HexagonISD::CONST32, dl, ValTy, Res);
+}
+
+SDValue
+HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue
+HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const HexagonRegisterInfo *TRI =
+      static_cast<const HexagonRegisterInfo *>(DAG.getTarget().getRegisterInfo());
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         TRI->getFrameRegister(), VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op,
+                                                 SelectionDAG& DAG) const {
+  SDLoc dl(Op);
+  return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+
+SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDValue Result;
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  SDLoc dl(Op);
+  Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+
+  const HexagonTargetObjectFile &TLOF =
+      static_cast<const HexagonTargetObjectFile &>(getObjFileLowering());
+  if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+    return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result);
+  }
+
+  return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result);
+}
+
+SDValue
+HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue BA_SD =  DAG.getTargetBlockAddress(BA, MVT::i32);
+  SDLoc dl(Op);
+  return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
+    : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
+      TM(targetmachine) {
+
+  const HexagonSubtarget &Subtarget = TM.getSubtarget<HexagonSubtarget>();
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+
+  if (Subtarget.hasV5TOps()) {
+    addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+    addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
+  }
+
+  addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
+
+  computeRegisterProperties();
+
+  // Align loop entry
+  setPrefLoopAlignment(4);
+
+  // Limits for inline expansion of memcpy/memmove
+  MaxStoresPerMemcpy = 6;
+  MaxStoresPerMemmove = 6;
+
+  //
+  // Library calls for unsupported operations
+  //
+
+  setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
+  setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
+
+  setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
+  setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
+
+  setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
+  setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
+
+  setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
+  setOperationAction(ISD::SDIV, MVT::i32, Expand);
+  setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3");
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+
+  setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3");
+  setOperationAction(ISD::SDIV, MVT::i64, Expand);
+  setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3");
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+
+  setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3");
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+
+  setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3");
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+
+  setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3");
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+  setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3");
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
+  setOperationAction(ISD::FDIV, MVT::f32, Expand);
+
+  setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
+  setOperationAction(ISD::FDIV, MVT::f64, Expand);
+
+  setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+
+  if (Subtarget.hasV5TOps()) {
+    // Hexagon V5 Support.
+    setOperationAction(ISD::FADD, MVT::f32, Legal);
+    setOperationAction(ISD::FADD, MVT::f64, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal);
+    setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal);
+
+    setCondCodeAction(ISD::SETOGE, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETOGE, MVT::f64, Legal);
+    setCondCodeAction(ISD::SETUGE, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETUGE, MVT::f64, Legal);
+
+    setCondCodeAction(ISD::SETOGT, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETOGT, MVT::f64, Legal);
+    setCondCodeAction(ISD::SETUGT, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETUGT, MVT::f64, Legal);
+
+    setCondCodeAction(ISD::SETOLE, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETOLE, MVT::f64, Legal);
+    setCondCodeAction(ISD::SETOLT, MVT::f32, Legal);
+    setCondCodeAction(ISD::SETOLT, MVT::f64, Legal);
+
+    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+    setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+
+    setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+
+    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+
+    setOperationAction(ISD::FABS, MVT::f32, Legal);
+    setOperationAction(ISD::FABS, MVT::f64, Expand);
+
+    setOperationAction(ISD::FNEG, MVT::f32, Legal);
+    setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  } else {
+
+    // Expand fp<->uint.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
+    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
+
+    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
+    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
+
+    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
+    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
+
+    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
+    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
+
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
+
+    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
+    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
+
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
+
+    setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+    setOperationAction(ISD::FADD, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
+    setOperationAction(ISD::FADD, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
+    setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
+    setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
+    setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
+    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
+    setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
+    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
+    setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
+    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
+    setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
+    setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
+    setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
+    setCondCodeAction(ISD::SETOLT, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
+    setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+    setOperationAction(ISD::FMUL, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
+    setOperationAction(ISD::MUL, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
+    setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
+
+    setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+    setOperationAction(ISD::SUB, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+    setOperationAction(ISD::SUB, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
+    setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
+    setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
+    setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
+    setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+
+    setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
+    setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+
+    setOperationAction(ISD::FABS, MVT::f32, Expand);
+    setOperationAction(ISD::FABS, MVT::f64, Expand);
+    setOperationAction(ISD::FNEG, MVT::f32, Expand);
+    setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  }
+
+  setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+
+  setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal);
+
+  setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
+  setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal);
+
+  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
+  // Turn FP extload into load/fextend.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  // Hexagon has a i1 sign extending load.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // Custom legalize GlobalAddress nodes into CONST32.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i8, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+  // Truncate action?
+  setOperationAction(ISD::TRUNCATE, MVT::i64, Expand);
+
+  // Hexagon doesn't have sext_inreg, replace them with shl/sra.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // Hexagon has no REM or DIVREM operations.
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  // Lower SELECT_CC to SETCC and SELECT.
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
+  if (Subtarget.hasV5TOps()) {
+
+    // We need to make the operation type of SELECT node to be Custom,
+    // such that we don't go into the infinite loop of
+    // select ->  setcc -> select_cc -> select loop.
+    setOperationAction(ISD::SELECT, MVT::f32, Custom);
+    setOperationAction(ISD::SELECT, MVT::f64, Custom);
+
+    setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+    setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+  } else {
+
+    // Hexagon has no select or setcc: expand to SELECT_CC.
+    setOperationAction(ISD::SELECT, MVT::f32, Expand);
+    setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  }
+
+  if (EmitJumpTables) {
+    setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+  } else {
+    setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  }
+  // Increase jump tables cutover to 5, was 4.
+  setMinimumJumpTableEntries(5);
+
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+  // In V4, we have double word add/sub with carry. The problem with
+  // modelling this instruction is that it produces 2 results - Rdd and Px.
+  // To model update of Px, we will have to use Defs[p0..p3] which will
+  // cause any predicate live range to spill. So, we pretend we dont't
+  // have these instructions.
+  setOperationAction(ISD::ADDE, MVT::i8, Expand);
+  setOperationAction(ISD::ADDE, MVT::i16, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+  setOperationAction(ISD::SUBE, MVT::i8, Expand);
+  setOperationAction(ISD::SUBE, MVT::i16, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i64, Expand);
+  setOperationAction(ISD::ADDC, MVT::i8, Expand);
+  setOperationAction(ISD::ADDC, MVT::i16, Expand);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::SUBC, MVT::i8, Expand);
+  setOperationAction(ISD::SUBC, MVT::i16, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+  if (Subtarget.isSubtargetV2()) {
+    setExceptionPointerRegister(Hexagon::R20);
+    setExceptionSelectorRegister(Hexagon::R21);
+  } else {
+    setExceptionPointerRegister(Hexagon::R0);
+    setExceptionSelectorRegister(Hexagon::R1);
+  }
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
+
+  setMinFunctionAlignment(2);
+
+  // Needed for DYNAMIC_STACKALLOC expansion.
+  const HexagonRegisterInfo *QRI =
+      static_cast<const HexagonRegisterInfo *>(TM.getRegisterInfo());
+  setStackPointerRegisterToSaveRestore(QRI->getStackRegister());
+  setSchedulingPreference(Sched::VLIW);
+}
+
+const char*
+HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+    default: return nullptr;
+    case HexagonISD::CONST32:     return "HexagonISD::CONST32";
+    case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
+    case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real";
+    case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC";
+    case HexagonISD::CMPICC:      return "HexagonISD::CMPICC";
+    case HexagonISD::CMPFCC:      return "HexagonISD::CMPFCC";
+    case HexagonISD::BRICC:       return "HexagonISD::BRICC";
+    case HexagonISD::BRFCC:       return "HexagonISD::BRFCC";
+    case HexagonISD::SELECT_ICC:  return "HexagonISD::SELECT_ICC";
+    case HexagonISD::SELECT_FCC:  return "HexagonISD::SELECT_FCC";
+    case HexagonISD::Hi:          return "HexagonISD::Hi";
+    case HexagonISD::Lo:          return "HexagonISD::Lo";
+    case HexagonISD::FTOI:        return "HexagonISD::FTOI";
+    case HexagonISD::ITOF:        return "HexagonISD::ITOF";
+    case HexagonISD::CALL:        return "HexagonISD::CALL";
+    case HexagonISD::RET_FLAG:    return "HexagonISD::RET_FLAG";
+    case HexagonISD::BR_JT:       return "HexagonISD::BR_JT";
+    case HexagonISD::TC_RETURN:   return "HexagonISD::TC_RETURN";
+  case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
+  }
+}
+
+bool
+HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  EVT MTy1 = EVT::getEVT(Ty1);
+  EVT MTy2 = EVT::getEVT(Ty2);
+  if (!MTy1.isSimple() || !MTy2.isSimple()) {
+    return false;
+  }
+  return ((MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32));
+}
+
+bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isSimple() || !VT2.isSimple()) {
+    return false;
+  }
+  return ((VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32));
+}
+
+bool
+HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  // Assuming the caller does not have either a signext or zeroext modifier, and
+  // only one value is accepted, any reasonable truncation is allowed.
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  // FIXME: in principle up to 64-bit could be made safe, but it would be very
+  // fragile at the moment: any support for multiple value returns would be
+  // liable to disallow tail calls involving i64 -> iN truncation in many cases.
+  return Ty1->getPrimitiveSizeInBits() <= 32;
+}
+
+SDValue
+HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  // Mark function as containing a call to EH_RETURN.
+  HexagonMachineFunctionInfo *FuncInfo =
+    DAG.getMachineFunction().getInfo<HexagonMachineFunctionInfo>();
+  FuncInfo->setHasEHReturn();
+
+  unsigned OffsetReg = Hexagon::R28;
+
+  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                  DAG.getRegister(Hexagon::R30, getPointerTy()),
+                                  DAG.getIntPtrConstant(4));
+  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
+                       false, false, 0);
+  Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset);
+
+  // Not needed we already use it as explict input to EH_RETURN.
+  // MF.getRegInfo().addLiveOut(OffsetReg);
+
+  return DAG.getNode(HexagonISD::EH_RETURN, dl, MVT::Other, Chain);
+}
+
+SDValue
+HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+    default: llvm_unreachable("Should not custom lower this!");
+    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+    case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+      // Frame & Return address.  Currently unimplemented.
+    case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+    case ISD::GlobalTLSAddress:
+                          llvm_unreachable("TLS not implemented for Hexagon.");
+    case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
+    case ISD::GlobalAddress:      return LowerGLOBALADDRESS(Op, DAG);
+    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+    case ISD::VASTART:            return LowerVASTART(Op, DAG);
+    case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
+
+    case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    case ISD::SELECT:             return Op;
+    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+    case ISD::INLINEASM:          return LowerINLINEASM(Op, DAG);
+
+  }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                           Hexagon Scheduler Hooks
+//===----------------------------------------------------------------------===//
+MachineBasicBlock *
+HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB)
+const {
+  switch (MI->getOpcode()) {
+    case Hexagon::ADJDYNALLOC: {
+      MachineFunction *MF = BB->getParent();
+      HexagonMachineFunctionInfo *FuncInfo =
+        MF->getInfo<HexagonMachineFunctionInfo>();
+      FuncInfo->addAllocaAdjustInst(MI);
+      return BB;
+    }
+    default: llvm_unreachable("Unexpected instr type to insert");
+  } // switch
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass*>
+HexagonTargetLowering::getRegForInlineAsmConstraint(const
+                                                    std::string &Constraint,
+                                                    MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':   // R0-R31
+       switch (VT.SimpleTy) {
+       default:
+         llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+       case MVT::i32:
+       case MVT::i16:
+       case MVT::i8:
+       case MVT::f32:
+         return std::make_pair(0U, &Hexagon::IntRegsRegClass);
+       case MVT::i64:
+       case MVT::f64:
+         return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
+      }
+    default:
+      llvm_unreachable("Unknown asm register class");
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  return TM.getSubtarget<HexagonSubtarget>().hasV5TOps();
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented by
+/// AM is legal for this target, for a load/store of the specified type.
+bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                  Type *Ty) const {
+  // Allows a signed-extended 11-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1) {
+    return false;
+  }
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV) {
+    return false;
+  }
+
+  int Scale = AM.Scale;
+  if (Scale < 0) Scale = -Scale;
+  switch (Scale) {
+  case 0:  // No scale reg, "r+i", "r", or just "i".
+    break;
+  default: // No scaled addressing mode.
+    return false;
+  }
+  return true;
+}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return Imm >= -512 && Imm <= 511;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
+                                 SDValue Callee,
+                                 CallingConv::ID CalleeCC,
+                                 bool isVarArg,
+                                 bool isCalleeStructRet,
+                                 bool isCallerStructRet,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // ***************************************************************************
+  //  Look for obvious safe cases to perform tail call optimization that do not
+  //  require ABI changes.
+  // ***************************************************************************
+
+  // If this is a tail call via a function pointer, then don't do it!
+  if (!(dyn_cast<GlobalAddressSDNode>(Callee))
+      && !(dyn_cast<ExternalSymbolSDNode>(Callee))) {
+    return false;
+  }
+
+  // Do not optimize if the calling conventions do not match.
+  if (!CCMatch)
+    return false;
+
+  // Do not tail call optimize vararg calls.
+  if (isVarArg)
+    return false;
+
+  // Also avoid tail call optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // In addition to the cases above, we also disable Tail Call Optimization if
+  // the calling convention code that at least one outgoing argument needs to
+  // go on the stack. We cannot check that here because at this point that
+  // information is not available.
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
new file mode 100644
index 0000000..ec16cc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -0,0 +1,177 @@
+//===-- HexagonISelLowering.h - Hexagon DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Hexagon uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef Hexagon_ISELLOWERING_H
+#define Hexagon_ISELLOWERING_H
+
+#include "Hexagon.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace HexagonISD {
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      CONST32,
+      CONST32_GP,  // For marking data present in GP.
+      CONST32_Int_Real,
+      FCONST32,
+      SETCC,
+      ADJDYNALLOC,
+      ARGEXTEND,
+
+      CMPICC,      // Compare two GPR operands, set icc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags.
+      SELECT_FCC,  // Select between two values using the current FCC flags.
+
+      Hi, Lo,      // Hi/Lo operations, typically on a global address.
+
+      FTOI,        // FP to Int within a FP register.
+      ITOF,        // Int to FP within a FP register.
+
+      CALL,        // A call instruction.
+      RET_FLAG,    // Return with a flag operand.
+      BR_JT,       // Jump table.
+      BARRIER,     // Memory barrier.
+      WrapperJT,
+      WrapperCP,
+      WrapperCombineII,
+      WrapperCombineRR,
+      WrapperCombineRI_V4,
+      WrapperCombineIR_V4,
+      WrapperPackhl,
+      WrapperSplatB,
+      WrapperSplatH,
+      WrapperShuffEB,
+      WrapperShuffEH,
+      WrapperShuffOB,
+      WrapperShuffOH,
+      TC_RETURN,
+      EH_RETURN
+    };
+  }
+
+  class HexagonTargetLowering : public TargetLowering {
+    int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+
+    bool CanReturnSmallStruct(const Function* CalleeFn,
+                              unsigned& RetSize) const;
+
+  public:
+    const TargetMachine &TM;
+    explicit HexagonTargetLowering(const TargetMachine &targetmachine);
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool
+    IsEligibleForTailCallOptimization(SDValue Callee,
+                                      CallingConv::ID CalleeCC,
+                                      bool isVarArg,
+                                      bool isCalleeStructRet,
+                                      bool isCallerStructRet,
+                                      const
+                                      SmallVectorImpl<ISD::OutputArg> &Outs,
+                                      const SmallVectorImpl<SDValue> &OutVals,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SelectionDAG& DAG) const;
+
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    const char *getTargetNodeName(unsigned Opcode) const override;
+    SDValue  LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFormalArguments(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 SDLoc dl, SelectionDAG &DAG,
+                                 SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            SDLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            const SmallVectorImpl<SDValue> &OutVals,
+                            SDValue Callee) const;
+
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        SDLoc dl, SelectionDAG &DAG) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *BB) const override;
+
+    SDValue  LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue  LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    EVT getSetCCResultType(LLVMContext &C, EVT VT) const override {
+      if (!VT.isVector())
+        return MVT::i1;
+      else
+        return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+    }
+
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                    SDValue &Base, SDValue &Offset,
+                                    ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 MVT VT) const override;
+
+    // Intrinsics
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    /// The type may be VoidTy, in which case only return true if the addressing
+    /// mode is legal for a load/store of any legal type.
+    /// TODO: Handle pre/postinc as well.
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+  };
+} // end namespace llvm
+
+#endif    // Hexagon_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
new file mode 100644
index 0000000..1057343
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -0,0 +1,401 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//                         Hexagon Instruction Flags +
+//
+//                    *** Must match HexagonBaseInfo.h ***
+//===----------------------------------------------------------------------===//
+
+class IType<bits<5> t> {
+  bits<5> Value = t;
+}
+def TypePSEUDO : IType<0>;
+def TypeALU32  : IType<1>;
+def TypeCR     : IType<2>;
+def TypeJR     : IType<3>;
+def TypeJ      : IType<4>;
+def TypeLD     : IType<5>;
+def TypeST     : IType<6>;
+def TypeSYSTEM : IType<7>;
+def TypeXTYPE  : IType<8>;
+def TypeENDLOOP: IType<31>;
+
+// Maintain list of valid subtargets for each instruction.
+class SubTarget<bits<4> value> {
+  bits<4> Value = value;
+}
+
+def HasV2SubT     : SubTarget<0xf>;
+def HasV2SubTOnly : SubTarget<0x1>;
+def NoV2SubT      : SubTarget<0x0>;
+def HasV3SubT     : SubTarget<0xe>;
+def HasV3SubTOnly : SubTarget<0x2>;
+def NoV3SubT      : SubTarget<0x1>;
+def HasV4SubT     : SubTarget<0xc>;
+def NoV4SubT      : SubTarget<0x3>;
+def HasV5SubT     : SubTarget<0x8>;
+def NoV5SubT      : SubTarget<0x7>;
+
+// Addressing modes for load/store instructions
+class AddrModeType<bits<3> value> {
+  bits<3> Value = value;
+}
+
+def NoAddrMode     : AddrModeType<0>;  // No addressing mode
+def Absolute       : AddrModeType<1>;  // Absolute addressing mode
+def AbsoluteSet    : AddrModeType<2>;  // Absolute set addressing mode
+def BaseImmOffset  : AddrModeType<3>;  // Indirect with offset
+def BaseLongOffset : AddrModeType<4>;  // Indirect with long offset
+def BaseRegOffset  : AddrModeType<5>;  // Indirect with register offset
+def PostInc        : AddrModeType<6>;  // Post increment addressing mode
+
+class MemAccessSize<bits<3> value> {
+  bits<3> Value = value;
+}
+
+def NoMemAccess      : MemAccessSize<0>;// Not a memory acces instruction.
+def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
+def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
+def WordAccess       : MemAccessSize<3>;// Word access instruction (memw).
+def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
+
+
+//===----------------------------------------------------------------------===//
+//                         Instruction Class Declaration +
+//===----------------------------------------------------------------------===//
+
+class OpcodeHexagon {
+  field bits<32> Inst = ?; // Default to an invalid insn.
+  bits<4> IClass = 0; // ICLASS
+  bits<2> IParse = 0; // Parse bits.
+
+  let Inst{31-28} = IClass;
+  let Inst{15-14} = IParse;
+
+  bits<1> zero = 0;
+}
+
+class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
+                  string cstr, InstrItinClass itin, IType type>
+  : Instruction, OpcodeHexagon {
+  let Namespace = "Hexagon";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+  let Constraints = cstr;
+  let Itinerary = itin;
+  let Size = 4;
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+  // Instruction type according to the ISA.
+  IType Type = type;
+  let TSFlags{4-0} = Type.Value;
+
+  // Solo instructions, i.e., those that cannot be in a packet with others.
+  bits<1> isSolo = 0;
+  let TSFlags{5} = isSolo;
+  // Packed only with A or X-type instructions.
+  bits<1> isSoloAX = 0;
+  let TSFlags{6} = isSoloAX;
+  // Only A-type instruction in first slot or nothing.
+  bits<1> isSoloAin1 = 0;
+  let TSFlags{7} = isSoloAin1;
+
+  // Predicated instructions.
+  bits<1> isPredicated = 0;
+  let TSFlags{8} = isPredicated;
+  bits<1> isPredicatedFalse = 0;
+  let TSFlags{9} = isPredicatedFalse;
+  bits<1> isPredicatedNew = 0;
+  let TSFlags{10} = isPredicatedNew;
+  bits<1> isPredicateLate = 0;
+  let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
+
+  // New-value insn helper fields.
+  bits<1> isNewValue = 0;
+  let TSFlags{12} = isNewValue; // New-value consumer insn.
+  bits<1> hasNewValue = 0;
+  let TSFlags{13} = hasNewValue; // New-value producer insn.
+  bits<3> opNewValue = 0;
+  let TSFlags{16-14} = opNewValue; // New-value produced operand.
+  bits<1> isNVStorable = 0;
+  let TSFlags{17} = isNVStorable; // Store that can become new-value store.
+  bits<1> isNVStore = 0;
+  let TSFlags{18} = isNVStore; // New-value store insn.
+  bits<1> isCVLoadable = 0;
+  let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+  bits<1> isCVLoad = 0;
+  let TSFlags{20} = isCVLoad; // Cur-value load insn.
+
+  // Immediate extender helper fields.
+  bits<1> isExtendable = 0;
+  let TSFlags{21} = isExtendable; // Insn may be extended.
+  bits<1> isExtended = 0;
+  let TSFlags{22} = isExtended; // Insn must be extended.
+  bits<3> opExtendable = 0;
+  let TSFlags{25-23} = opExtendable; // Which operand may be extended.
+  bits<1> isExtentSigned = 0;
+  let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
+  bits<5> opExtentBits = 0;
+  let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending.
+  bits<2> opExtentAlign = 0;
+  let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
+
+  // If an instruction is valid on a subtarget (v2-v5), set the corresponding
+  // bit from validSubTargets. v2 is the least significant bit.
+  // By default, instruction is valid on all subtargets.
+  SubTarget validSubTargets = HasV2SubT;
+  let TSFlags{37-34} = validSubTargets.Value;
+
+  // Addressing mode for load/store instructions.
+  AddrModeType addrMode = NoAddrMode;
+  let TSFlags{42-40} = addrMode.Value;
+
+  // Memory access size for mem access instructions (load/store)
+  MemAccessSize accessSize = NoMemAccess;
+  let TSFlags{45-43} = accessSize.Value;
+
+  bits<1> isTaken = 0;
+  let TSFlags {47} = isTaken; // Branch prediction.
+
+  bits<1> isFP = 0;
+  let TSFlags {48} = isFP; // Floating-point.
+
+  // Fields used for relation models.
+  string BaseOpcode = "";
+  string CextOpcode = "";
+  string PredSense = "";
+  string PNewValue = "";
+  string NValueST  = "";    // Set to "true" for new-value stores.
+  string InputType = "";    // Input is "imm" or "reg" type.
+  string isMEMri = "false"; // Set to "true" for load/store with MEMri operand.
+  string isFloat = "false"; // Set to "true" for the floating-point load/store.
+  string isBrTaken = ""; // Set to "true"/"false" for jump instructions
+
+  let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
+                                    "");
+  let PNewValue = !if(isPredicatedNew, "new", "");
+  let NValueST = !if(isNVStore, "true", "false");
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+}
+
+//===----------------------------------------------------------------------===//
+//                         Instruction Classes Definitions +
+//===----------------------------------------------------------------------===//
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
+
+let mayLoad = 1 in
+class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                  string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1 in
+class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4    can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+let mayStore = 1 in
+class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
+class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayStore = 1 in
+class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4    can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : STInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// SYSTEM Instruction Class in V4 can take SLOT0 only
+// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
+class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "",  InstrItinClass itin = ST_tc_3stall_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>;
+
+// ALU32 Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>;
+
+// ALU64 Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
+class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
+class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+  : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = M_tc_2_SLOT23>
+    : MInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23>
+  : SInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// J Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>;
+
+// JR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>;
+
+// CR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr="">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>;
+
+//===----------------------------------------------------------------------===//
+//                         Instruction Classes Definitions -
+//===----------------------------------------------------------------------===//
+
+
+//
+// ALU32 patterns
+//.
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+
+//
+// ALU64 patterns.
+//
+class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Post increment ST Instruction.
+class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayStore = 1 in
+class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+// Post increment LD Instruction.
+class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1 in
+class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
new file mode 100644
index 0000000..d92f97b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -0,0 +1,67 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+//                         Hexagon Instruction Flags
+//
+//                        *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeMEMOP  : IType<9>;
+def TypeNV     : IType<10>;
+def TypePREFIX : IType<30>;
+
+//----------------------------------------------------------------------------//
+//                         Instruction Classes Definitions
+//----------------------------------------------------------------------------//
+
+//
+// NV type instructions.
+//
+class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>;
+
+class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Definition of Post increment new value store.
+class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Post increment ST Instruction.
+let mayStore = 1 in
+class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// New-value conditional branch.
+class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1, mayStore = 1 in
+class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeMEMOP>;
+
+class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : MEMInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+let isCodeGenOnly = 1 in
+class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
+  : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
+                TypePREFIX>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
new file mode 100644
index 0000000..1c95e06
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -0,0 +1,1858 @@
+//===-- HexagonInstrInfo.cpp - Hexagon Instruction Information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "HexagonGenInstrInfo.inc"
+#include "HexagonGenDFAPacketizer.inc"
+
+///
+/// Constants for Hexagon instructions.
+///
+const int Hexagon_MEMW_OFFSET_MAX = 4095;
+const int Hexagon_MEMW_OFFSET_MIN = -4096;
+const int Hexagon_MEMD_OFFSET_MAX = 8191;
+const int Hexagon_MEMD_OFFSET_MIN = -8192;
+const int Hexagon_MEMH_OFFSET_MAX = 2047;
+const int Hexagon_MEMH_OFFSET_MIN = -2048;
+const int Hexagon_MEMB_OFFSET_MAX = 1023;
+const int Hexagon_MEMB_OFFSET_MIN = -1024;
+const int Hexagon_ADDI_OFFSET_MAX = 32767;
+const int Hexagon_ADDI_OFFSET_MIN = -32768;
+const int Hexagon_MEMD_AUTOINC_MAX = 56;
+const int Hexagon_MEMD_AUTOINC_MIN = -64;
+const int Hexagon_MEMW_AUTOINC_MAX = 28;
+const int Hexagon_MEMW_AUTOINC_MIN = -32;
+const int Hexagon_MEMH_AUTOINC_MAX = 14;
+const int Hexagon_MEMH_AUTOINC_MIN = -16;
+const int Hexagon_MEMB_AUTOINC_MAX = 7;
+const int Hexagon_MEMB_AUTOINC_MIN = -8;
+
+// Pin the vtable to this file.
+void HexagonInstrInfo::anchor() {}
+
+HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
+  : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
+    RI(ST), Subtarget(ST) {
+}
+
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+
+
+  switch (MI->getOpcode()) {
+  default: break;
+  case Hexagon::LDriw:
+  case Hexagon::LDrid:
+  case Hexagon::LDrih:
+  case Hexagon::LDrib:
+  case Hexagon::LDriub:
+    if (MI->getOperand(2).isFI() &&
+        MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case Hexagon::STriw:
+  case Hexagon::STrid:
+  case Hexagon::STrih:
+  case Hexagon::STrib:
+    if (MI->getOperand(2).isFI() &&
+        MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+unsigned
+HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL) const{
+
+    int BOpc   = Hexagon::JMP;
+    int BccOpc = Hexagon::JMP_t;
+
+    assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+    int regPos = 0;
+    // Check if ReverseBranchCondition has asked to reverse this branch
+    // If we want to reverse the branch an odd number of times, we want
+    // JMP_f.
+    if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) {
+      BccOpc = Hexagon::JMP_f;
+      regPos = 1;
+    }
+
+    if (!FBB) {
+      if (Cond.empty()) {
+        // Due to a bug in TailMerging/CFG Optimization, we need to add a
+        // special case handling of a predicated jump followed by an
+        // unconditional jump. If not, Tail Merging and CFG Optimization go
+        // into an infinite loop.
+        MachineBasicBlock *NewTBB, *NewFBB;
+        SmallVector<MachineOperand, 4> Cond;
+        MachineInstr *Term = MBB.getFirstTerminator();
+        if (isPredicated(Term) && !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond,
+                                                 false)) {
+          MachineBasicBlock *NextBB =
+            std::next(MachineFunction::iterator(&MBB));
+          if (NewTBB == NextBB) {
+            ReverseBranchCondition(Cond);
+            RemoveBranch(MBB);
+            return InsertBranch(MBB, TBB, nullptr, Cond, DL);
+          }
+        }
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+      } else {
+        BuildMI(&MBB, DL,
+                get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB);
+      }
+      return 1;
+    }
+
+    BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB);
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+
+    return 2;
+}
+
+
+bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  TBB = nullptr;
+  FBB = nullptr;
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::instr_iterator I = MBB.instr_end();
+  if (I == MBB.instr_begin())
+    return false;
+
+  // A basic block may looks like this:
+  //
+  //  [   insn
+  //     EH_LABEL
+  //      insn
+  //      insn
+  //      insn
+  //     EH_LABEL
+  //      insn     ]
+  //
+  // It has two succs but does not have a terminator
+  // Don't know how to handle it.
+  do {
+    --I;
+    if (I->isEHLabel())
+      return true;
+  } while (I != MBB.instr_begin());
+
+  I = MBB.instr_end();
+  --I;
+
+  while (I->isDebugValue()) {
+    if (I == MBB.instr_begin())
+      return false;
+    --I;
+  }
+
+  // Delete the JMP if it's equivalent to a fall-through.
+  if (AllowModify && I->getOpcode() == Hexagon::JMP &&
+      MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+    DEBUG(dbgs()<< "\nErasing the jump to successor block\n";);
+    I->eraseFromParent();
+    I = MBB.instr_end();
+    if (I == MBB.instr_begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  MachineInstr *SecondLastInst = nullptr;
+  // Find one more terminator if present.
+  do {
+    if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) {
+      if (!SecondLastInst)
+        SecondLastInst = I;
+      else
+        // This is a third branch.
+        return true;
+    }
+    if (I == MBB.instr_begin())
+      break;
+    --I;
+  } while(I);
+
+  int LastOpcode = LastInst->getOpcode();
+
+  bool LastOpcodeHasJMP_c = PredOpcodeHasJMP_c(LastOpcode);
+  bool LastOpcodeHasNot = PredOpcodeHasNot(LastOpcode);
+
+  // If there is only one terminator instruction, process it.
+  if (LastInst && !SecondLastInst) {
+    if (LastOpcode == Hexagon::JMP) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (LastOpcode == Hexagon::ENDLOOP0) {
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    if (LastOpcodeHasJMP_c) {
+      TBB = LastInst->getOperand(1).getMBB();
+      if (LastOpcodeHasNot) {
+        Cond.push_back(MachineOperand::CreateImm(0));
+      }
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  int SecLastOpcode = SecondLastInst->getOpcode();
+
+  bool SecLastOpcodeHasJMP_c = PredOpcodeHasJMP_c(SecLastOpcode);
+  bool SecLastOpcodeHasNot = PredOpcodeHasNot(SecLastOpcode);
+  if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::JMP)) {
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    if (SecLastOpcodeHasNot)
+      Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two Hexagon:JMPs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecLastOpcode == Hexagon::JMP && LastOpcode == Hexagon::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // If the block ends with an ENDLOOP, and JMP, handle it.
+  if (SecLastOpcode == Hexagon::ENDLOOP0 &&
+      LastOpcode == Hexagon::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+
+unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  int BOpc   = Hexagon::JMP;
+  int BccOpc = Hexagon::JMP_t;
+  int BccOpcNot = Hexagon::JMP_f;
+
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc &&
+      I->getOpcode() != BccOpcNot)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != BccOpc && I->getOpcode() != BccOpcNot)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+
+/// \brief For a comparison instruction, return the source registers in
+/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                      unsigned &SrcReg, unsigned &SrcReg2,
+                                      int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  // Set mask and the first source register.
+  switch (Opc) {
+    case Hexagon::CMPEHexagon4rr:
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPEQrr:
+    case Hexagon::CMPGT64rr:
+    case Hexagon::CMPGTU64rr:
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGTUrr:
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPGTrr:
+      SrcReg = MI->getOperand(1).getReg();
+      Mask = ~0;
+      break;
+    case Hexagon::CMPbEQri_V4:
+    case Hexagon::CMPbEQrr_sbsb_V4:
+    case Hexagon::CMPbEQrr_ubub_V4:
+    case Hexagon::CMPbGTUri_V4:
+    case Hexagon::CMPbGTUrr_V4:
+    case Hexagon::CMPbGTrr_V4:
+      SrcReg = MI->getOperand(1).getReg();
+      Mask = 0xFF;
+      break;
+    case Hexagon::CMPhEQri_V4:
+    case Hexagon::CMPhEQrr_shl_V4:
+    case Hexagon::CMPhEQrr_xor_V4:
+    case Hexagon::CMPhGTUri_V4:
+    case Hexagon::CMPhGTUrr_V4:
+    case Hexagon::CMPhGTrr_shl_V4:
+      SrcReg = MI->getOperand(1).getReg();
+      Mask = 0xFFFF;
+      break;
+  }
+
+  // Set the value/second source register.
+  switch (Opc) {
+    case Hexagon::CMPEHexagon4rr:
+    case Hexagon::CMPEQrr:
+    case Hexagon::CMPGT64rr:
+    case Hexagon::CMPGTU64rr:
+    case Hexagon::CMPGTUrr:
+    case Hexagon::CMPGTrr:
+    case Hexagon::CMPbEQrr_sbsb_V4:
+    case Hexagon::CMPbEQrr_ubub_V4:
+    case Hexagon::CMPbGTUrr_V4:
+    case Hexagon::CMPbGTrr_V4:
+    case Hexagon::CMPhEQrr_shl_V4:
+    case Hexagon::CMPhEQrr_xor_V4:
+    case Hexagon::CMPhGTUrr_V4:
+    case Hexagon::CMPhGTrr_shl_V4:
+      SrcReg2 = MI->getOperand(2).getReg();
+      return true;
+
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPbEQri_V4:
+    case Hexagon::CMPbGTUri_V4:
+    case Hexagon::CMPhEQri_V4:
+    case Hexagon::CMPhGTUri_V4:
+      SrcReg2 = 0;
+      Value = MI->getOperand(2).getImm();
+      return true;
+  }
+
+  return false;
+}
+
+
+void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I, DebugLoc DL,
+                                 unsigned DestReg, unsigned SrcReg,
+                                 bool KillSrc) const {
+  if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR), DestReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR64), DestReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) {
+    // Map Pd = Ps to Pd = or(Ps, Ps).
+    BuildMI(MBB, I, DL, get(Hexagon::OR_pp),
+            DestReg).addReg(SrcReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::DoubleRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    // We can have an overlap between single and double reg: r1:0 = r0.
+    if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) {
+        // r1:0 = r0
+        BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg,
+                Hexagon::subreg_hireg))).addImm(0);
+    } else {
+        // r1:0 = r1 or no overlap.
+        BuildMI(MBB, I, DL, get(Hexagon::TFR), (RI.getSubReg(DestReg,
+                Hexagon::subreg_loreg))).addReg(SrcReg);
+        BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg,
+                Hexagon::subreg_hireg))).addImm(0);
+    }
+    return;
+  }
+  if (Hexagon::CRRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
+      Hexagon::IntRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR_RsPd), DestReg).
+      addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+      Hexagon::PredRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR_PdRs), DestReg).
+      addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  llvm_unreachable("Unimplemented");
+}
+
+
+void HexagonInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(
+                      MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                      MachineMemOperand::MOStore,
+                      MFI.getObjectSize(FI),
+                      Align);
+
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STriw))
+          .addFrameIndex(FI).addImm(0)
+          .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STrid))
+          .addFrameIndex(FI).addImm(0)
+          .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STriw_pred))
+          .addFrameIndex(FI).addImm(0)
+          .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  } else {
+    llvm_unreachable("Unimplemented");
+  }
+}
+
+
+void HexagonInstrInfo::storeRegToAddr(
+                                 MachineFunction &MF, unsigned SrcReg,
+                                 bool isKill,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const
+{
+  llvm_unreachable("Unimplemented");
+}
+
+
+void HexagonInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(
+                      MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                      MachineMemOperand::MOLoad,
+                      MFI.getObjectSize(FI),
+                      Align);
+  if (RC == &Hexagon::IntRegsRegClass) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg)
+          .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (RC == &Hexagon::DoubleRegsRegClass) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg)
+          .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (RC == &Hexagon::PredRegsRegClass) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
+          .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else {
+    llvm_unreachable("Can't store this register to stack slot");
+  }
+}
+
+
+void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                        SmallVectorImpl<MachineOperand> &Addr,
+                                        const TargetRegisterClass *RC,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  llvm_unreachable("Unimplemented");
+}
+
+
+MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                    MachineInstr* MI,
+                                          const SmallVectorImpl<unsigned> &Ops,
+                                                    int FI) const {
+  // Hexagon_TODO: Implement.
+  return nullptr;
+}
+
+unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
+
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *TRC;
+  if (VT == MVT::i1) {
+    TRC = &Hexagon::PredRegsRegClass;
+  } else if (VT == MVT::i32 || VT == MVT::f32) {
+    TRC = &Hexagon::IntRegsRegClass;
+  } else if (VT == MVT::i64 || VT == MVT::f64) {
+    TRC = &Hexagon::DoubleRegsRegClass;
+  } else {
+    llvm_unreachable("Cannot handle this register class");
+  }
+
+  unsigned NewReg = RegInfo.createVirtualRegister(TRC);
+  return NewReg;
+}
+
+bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const MCInstrDesc &MID = MI->getDesc();
+  const uint64_t F = MID.TSFlags;
+  if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+    return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  switch(MI->getOpcode()) {
+    // TFR_FI Remains a special case.
+    case Hexagon::TFR_FI:
+      return true;
+    default:
+      return false;
+  }
+  return  false;
+}
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI->getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set.
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+       E = MI->operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+      return true;
+  }
+  return  false;
+}
+
+bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const {
+  return MI->getDesc().isBranch();
+}
+
+bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
+  if (isNewValueJump(MI))
+    return true;
+
+  if (isNewValueStore(MI))
+    return true;
+
+  return false;
+}
+
+bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
+  return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
+}
+
+bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
+  bool isPred = MI->getDesc().isPredicable();
+
+  if (!isPred)
+    return false;
+
+  const int Opc = MI->getOpcode();
+
+  switch(Opc) {
+  case Hexagon::TFRI:
+    return isInt<12>(MI->getOperand(1).getImm());
+
+  case Hexagon::STrid:
+  case Hexagon::STrid_indexed:
+    return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
+
+  case Hexagon::STriw:
+  case Hexagon::STriw_indexed:
+  case Hexagon::STriw_nv_V4:
+    return isShiftedUInt<6,2>(MI->getOperand(1).getImm());
+
+  case Hexagon::STrih:
+  case Hexagon::STrih_indexed:
+  case Hexagon::STrih_nv_V4:
+    return isShiftedUInt<6,1>(MI->getOperand(1).getImm());
+
+  case Hexagon::STrib:
+  case Hexagon::STrib_indexed:
+  case Hexagon::STrib_nv_V4:
+    return isUInt<6>(MI->getOperand(1).getImm());
+
+  case Hexagon::LDrid:
+  case Hexagon::LDrid_indexed:
+    return isShiftedUInt<6,3>(MI->getOperand(2).getImm());
+
+  case Hexagon::LDriw:
+  case Hexagon::LDriw_indexed:
+    return isShiftedUInt<6,2>(MI->getOperand(2).getImm());
+
+  case Hexagon::LDrih:
+  case Hexagon::LDriuh:
+  case Hexagon::LDrih_indexed:
+  case Hexagon::LDriuh_indexed:
+    return isShiftedUInt<6,1>(MI->getOperand(2).getImm());
+
+  case Hexagon::LDrib:
+  case Hexagon::LDriub:
+  case Hexagon::LDrib_indexed:
+  case Hexagon::LDriub_indexed:
+    return isUInt<6>(MI->getOperand(2).getImm());
+
+  case Hexagon::POST_LDrid:
+    return isShiftedInt<4,3>(MI->getOperand(3).getImm());
+
+  case Hexagon::POST_LDriw:
+    return isShiftedInt<4,2>(MI->getOperand(3).getImm());
+
+  case Hexagon::POST_LDrih:
+  case Hexagon::POST_LDriuh:
+    return isShiftedInt<4,1>(MI->getOperand(3).getImm());
+
+  case Hexagon::POST_LDrib:
+  case Hexagon::POST_LDriub:
+    return isInt<4>(MI->getOperand(3).getImm());
+
+  case Hexagon::STrib_imm_V4:
+  case Hexagon::STrih_imm_V4:
+  case Hexagon::STriw_imm_V4:
+    return (isUInt<6>(MI->getOperand(1).getImm()) &&
+            isInt<6>(MI->getOperand(2).getImm()));
+
+  case Hexagon::ADD_ri:
+    return isInt<8>(MI->getOperand(2).getImm());
+
+  case Hexagon::ASLH:
+  case Hexagon::ASRH:
+  case Hexagon::SXTB:
+  case Hexagon::SXTH:
+  case Hexagon::ZXTB:
+  case Hexagon::ZXTH:
+    return Subtarget.hasV4TOps();
+  }
+
+  return true;
+}
+
+// This function performs the following inversiones:
+//
+//  cPt    ---> cNotPt
+//  cNotPt ---> cPt
+//
+unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
+  int InvPredOpcode;
+  InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
+                                        : Hexagon::getTruePredOpcode(Opc);
+  if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
+    return InvPredOpcode;
+
+  switch(Opc) {
+    default: llvm_unreachable("Unexpected predicated instruction");
+    case Hexagon::COMBINE_rr_cPt:
+      return Hexagon::COMBINE_rr_cNotPt;
+    case Hexagon::COMBINE_rr_cNotPt:
+      return Hexagon::COMBINE_rr_cPt;
+
+      // Dealloc_return.
+    case Hexagon::DEALLOC_RET_cPt_V4:
+      return Hexagon::DEALLOC_RET_cNotPt_V4;
+    case Hexagon::DEALLOC_RET_cNotPt_V4:
+      return Hexagon::DEALLOC_RET_cPt_V4;
+  }
+}
+
+// New Value Store instructions.
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+}
+
+bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+
+  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+}
+
+int HexagonInstrInfo::
+getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
+  enum Hexagon::PredSense inPredSense;
+  inPredSense = invertPredicate ? Hexagon::PredSense_false :
+                                  Hexagon::PredSense_true;
+  int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
+  if (CondOpcode >= 0) // Valid Conditional opcode/instruction
+    return CondOpcode;
+
+  // This switch case will be removed once all the instructions have been
+  // modified to use relation maps.
+  switch(Opc) {
+  case Hexagon::TFRI_f:
+    return !invertPredicate ? Hexagon::TFRI_cPt_f :
+                              Hexagon::TFRI_cNotPt_f;
+  case Hexagon::COMBINE_rr:
+    return !invertPredicate ? Hexagon::COMBINE_rr_cPt :
+                              Hexagon::COMBINE_rr_cNotPt;
+
+  // Word.
+  case Hexagon::STriw_f:
+    return !invertPredicate ? Hexagon::STriw_cPt :
+                              Hexagon::STriw_cNotPt;
+  case Hexagon::STriw_indexed_f:
+    return !invertPredicate ? Hexagon::STriw_indexed_cPt :
+                              Hexagon::STriw_indexed_cNotPt;
+
+  // DEALLOC_RETURN.
+  case Hexagon::DEALLOC_RET_V4:
+    return !invertPredicate ? Hexagon::DEALLOC_RET_cPt_V4 :
+                              Hexagon::DEALLOC_RET_cNotPt_V4;
+  }
+  llvm_unreachable("Unexpected predicable instruction");
+}
+
+
+bool HexagonInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Cond) const {
+  int Opc = MI->getOpcode();
+  assert (isPredicable(MI) && "Expected predicable instruction");
+  bool invertJump = (!Cond.empty() && Cond[0].isImm() &&
+                     (Cond[0].getImm() == 0));
+
+  // This will change MI's opcode to its predicate version.
+  // However, its operand list is still the old one, i.e. the
+  // non-predicate one.
+  MI->setDesc(get(getMatchingCondBranchOpcode(Opc, invertJump)));
+
+  int oper = -1;
+  unsigned int GAIdx = 0;
+
+  // Indicates whether the current MI has a GlobalAddress operand
+  bool hasGAOpnd = false;
+  std::vector<MachineOperand> tmpOpnds;
+
+  // Indicates whether we need to shift operands to right.
+  bool needShift = true;
+
+  // The predicate is ALWAYS the FIRST input operand !!!
+  if (MI->getNumOperands() == 0) {
+    // The non-predicate version of MI does not take any operands,
+    // i.e. no outs and no ins. In this condition, the predicate
+    // operand will be directly placed at Operands[0]. No operand
+    // shift is needed.
+    // Example: BARRIER
+    needShift = false;
+    oper = -1;
+  }
+  else if (   MI->getOperand(MI->getNumOperands()-1).isReg()
+           && MI->getOperand(MI->getNumOperands()-1).isDef()
+           && !MI->getOperand(MI->getNumOperands()-1).isImplicit()) {
+    // The non-predicate version of MI does not have any input operands.
+    // In this condition, we extend the length of Operands[] by one and
+    // copy the original last operand to the newly allocated slot.
+    // At this moment, it is just a place holder. Later, we will put
+    // predicate operand directly into it. No operand shift is needed.
+    // Example: r0=BARRIER (this is a faked insn used here for illustration)
+    MI->addOperand(MI->getOperand(MI->getNumOperands()-1));
+    needShift = false;
+    oper = MI->getNumOperands() - 2;
+  }
+  else {
+    // We need to right shift all input operands by one. Duplicate the
+    // last operand into the newly allocated slot.
+    MI->addOperand(MI->getOperand(MI->getNumOperands()-1));
+  }
+
+  if (needShift)
+  {
+    // Operands[ MI->getNumOperands() - 2 ] has been copied into
+    // Operands[ MI->getNumOperands() - 1 ], so we start from
+    // Operands[ MI->getNumOperands() - 3 ].
+    // oper is a signed int.
+    // It is ok if "MI->getNumOperands()-3" is -3, -2, or -1.
+    for (oper = MI->getNumOperands() - 3; oper >= 0; --oper)
+    {
+      MachineOperand &MO = MI->getOperand(oper);
+
+      // Opnd[0] Opnd[1] Opnd[2] Opnd[3] Opnd[4]   Opnd[5]   Opnd[6]   Opnd[7]
+      // <Def0>  <Def1>  <Use0>  <Use1>  <ImpDef0> <ImpDef1> <ImpUse0> <ImpUse1>
+      //               /\~
+      //              /||\~
+      //               ||
+      //        Predicate Operand here
+      if (MO.isReg() && !MO.isUse() && !MO.isImplicit()) {
+        break;
+      }
+      if (MO.isReg()) {
+        MI->getOperand(oper+1).ChangeToRegister(MO.getReg(), MO.isDef(),
+                                                MO.isImplicit(), MO.isKill(),
+                                                MO.isDead(), MO.isUndef(),
+                                                MO.isDebug());
+      }
+      else if (MO.isImm()) {
+        MI->getOperand(oper+1).ChangeToImmediate(MO.getImm());
+      }
+      else if (MO.isGlobal()) {
+        // MI can not have more than one GlobalAddress operand.
+        assert(hasGAOpnd == false && "MI can only have one GlobalAddress opnd");
+
+        // There is no member function called "ChangeToGlobalAddress" in the
+        // MachineOperand class (not like "ChangeToRegister" and
+        // "ChangeToImmediate"). So we have to remove them from Operands[] list
+        // first, and then add them back after we have inserted the predicate
+        // operand. tmpOpnds[] is to remember these operands before we remove
+        // them.
+        tmpOpnds.push_back(MO);
+
+        // Operands[oper] is a GlobalAddress operand;
+        // Operands[oper+1] has been copied into Operands[oper+2];
+        hasGAOpnd = true;
+        GAIdx = oper;
+        continue;
+      }
+      else {
+        assert(false && "Unexpected operand type");
+      }
+    }
+  }
+
+  int regPos = invertJump ? 1 : 0;
+  MachineOperand PredMO = Cond[regPos];
+
+  // [oper] now points to the last explicit Def. Predicate operand must be
+  // located at [oper+1]. See diagram above.
+  // This assumes that the predicate is always the first operand,
+  // i.e. Operands[0+numResults], in the set of inputs
+  // It is better to have an assert here to check this. But I don't know how
+  // to write this assert because findFirstPredOperandIdx() would return -1
+  if (oper < -1) oper = -1;
+
+  MI->getOperand(oper+1).ChangeToRegister(PredMO.getReg(), PredMO.isDef(),
+                                          PredMO.isImplicit(), false,
+                                          PredMO.isDead(), PredMO.isUndef(),
+                                          PredMO.isDebug());
+
+  MachineRegisterInfo &RegInfo = MI->getParent()->getParent()->getRegInfo();
+  RegInfo.clearKillFlags(PredMO.getReg());
+
+  if (hasGAOpnd)
+  {
+    unsigned int i;
+
+    // Operands[GAIdx] is the original GlobalAddress operand, which is
+    // already copied into tmpOpnds[0].
+    // Operands[GAIdx] now stores a copy of Operands[GAIdx-1]
+    // Operands[GAIdx+1] has already been copied into Operands[GAIdx+2],
+    // so we start from [GAIdx+2]
+    for (i = GAIdx + 2; i < MI->getNumOperands(); ++i)
+      tmpOpnds.push_back(MI->getOperand(i));
+
+    // Remove all operands in range [ (GAIdx+1) ... (MI->getNumOperands()-1) ]
+    // It is very important that we always remove from the end of Operands[]
+    // MI->getNumOperands() is at least 2 if program goes to here.
+    for (i = MI->getNumOperands() - 1; i > GAIdx; --i)
+      MI->RemoveOperand(i);
+
+    for (i = 0; i < tmpOpnds.size(); ++i)
+      MI->addOperand(tmpOpnds[i]);
+  }
+
+  return true;
+}
+
+
+bool
+HexagonInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles,
+                    unsigned ExtraPredCycles,
+                    const BranchProbability &Probability) const {
+  return true;
+}
+
+
+bool
+HexagonInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned NumTCycles,
+                    unsigned ExtraTCycles,
+                    MachineBasicBlock &FMBB,
+                    unsigned NumFCycles,
+                    unsigned ExtraFCycles,
+                    const BranchProbability &Probability) const {
+  return true;
+}
+
+// Returns true if an instruction is predicated irrespective of the predicate
+// sense. For example, all of the following will return true.
+// if (p0) R1 = add(R2, R3)
+// if (!p0) R1 = add(R2, R3)
+// if (p0.new) R1 = add(R2, R3)
+// if (!p0.new) R1 = add(R2, R3)
+bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+}
+
+bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+
+  return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+}
+
+bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  assert(isPredicated(MI));
+  return (!((F >> HexagonII::PredicatedFalsePos) &
+            HexagonII::PredicatedFalseMask));
+}
+
+bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+
+  // Make sure that the instruction is predicated.
+  assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+  return (!((F >> HexagonII::PredicatedFalsePos) &
+            HexagonII::PredicatedFalseMask));
+}
+
+bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  assert(isPredicated(MI));
+  return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+}
+
+bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+
+  assert(isPredicated(Opcode));
+  return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+}
+
+// Returns true, if a ST insn can be promoted to a new-value store.
+bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::mayNVStorePos) &
+           HexagonII::mayNVStoreMask &
+           QRI.Subtarget.hasV4TOps());
+}
+
+bool
+HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                   std::vector<MachineOperand> &Pred) const {
+  for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
+    MachineOperand MO = MI->getOperand(oper);
+    if (MO.isReg() && MO.isDef()) {
+      const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg());
+      if (RC == &Hexagon::PredRegsRegClass) {
+        Pred.push_back(MO);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+bool
+HexagonInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  // TODO: Fix this
+  return false;
+}
+
+
+//
+// We indicate that we want to reverse the branch by
+// inserting a 0 at the beginning of the Cond vector.
+//
+bool HexagonInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) {
+    Cond.erase(Cond.begin());
+  } else {
+    Cond.insert(Cond.begin(), MachineOperand::CreateImm(0));
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::
+isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs,
+                          const BranchProbability &Probability) const {
+  return (NumInstrs <= 4);
+}
+
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default: return false;
+  case Hexagon::DEALLOC_RET_V4 :
+  case Hexagon::DEALLOC_RET_cPt_V4 :
+  case Hexagon::DEALLOC_RET_cNotPt_V4 :
+  case Hexagon::DEALLOC_RET_cdnPnt_V4 :
+  case Hexagon::DEALLOC_RET_cNotdnPnt_V4 :
+  case Hexagon::DEALLOC_RET_cdnPt_V4 :
+  case Hexagon::DEALLOC_RET_cNotdnPt_V4 :
+   return true;
+  }
+}
+
+
+bool HexagonInstrInfo::
+isValidOffset(const int Opcode, const int Offset) const {
+  // This function is to check whether the "Offset" is in the correct range of
+  // the given "Opcode". If "Offset" is not in the correct range, "ADD_ri" is
+  // inserted to calculate the final address. Due to this reason, the function
+  // assumes that the "Offset" has correct alignment.
+  // We used to assert if the offset was not properly aligned, however,
+  // there are cases where a misaligned pointer recast can cause this
+  // problem, and we need to allow for it. The front end warns of such
+  // misaligns with respect to load size.
+
+  switch(Opcode) {
+
+  case Hexagon::LDriw:
+  case Hexagon::LDriw_indexed:
+  case Hexagon::LDriw_f:
+  case Hexagon::STriw_indexed:
+  case Hexagon::STriw:
+  case Hexagon::STriw_f:
+    return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMW_OFFSET_MAX);
+
+  case Hexagon::LDrid:
+  case Hexagon::LDrid_indexed:
+  case Hexagon::LDrid_f:
+  case Hexagon::STrid:
+  case Hexagon::STrid_indexed:
+  case Hexagon::STrid_f:
+    return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMD_OFFSET_MAX);
+
+  case Hexagon::LDrih:
+  case Hexagon::LDriuh:
+  case Hexagon::STrih:
+    return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMH_OFFSET_MAX);
+
+  case Hexagon::LDrib:
+  case Hexagon::STrib:
+  case Hexagon::LDriub:
+    return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMB_OFFSET_MAX);
+
+  case Hexagon::ADD_ri:
+  case Hexagon::TFR_FI:
+    return (Offset >= Hexagon_ADDI_OFFSET_MIN) &&
+      (Offset <= Hexagon_ADDI_OFFSET_MAX);
+
+  case Hexagon::MemOPw_ADDi_V4 :
+  case Hexagon::MemOPw_SUBi_V4 :
+  case Hexagon::MemOPw_ADDr_V4 :
+  case Hexagon::MemOPw_SUBr_V4 :
+  case Hexagon::MemOPw_ANDr_V4 :
+  case Hexagon::MemOPw_ORr_V4 :
+    return (0 <= Offset && Offset <= 255);
+
+  case Hexagon::MemOPh_ADDi_V4 :
+  case Hexagon::MemOPh_SUBi_V4 :
+  case Hexagon::MemOPh_ADDr_V4 :
+  case Hexagon::MemOPh_SUBr_V4 :
+  case Hexagon::MemOPh_ANDr_V4 :
+  case Hexagon::MemOPh_ORr_V4 :
+    return (0 <= Offset && Offset <= 127);
+
+  case Hexagon::MemOPb_ADDi_V4 :
+  case Hexagon::MemOPb_SUBi_V4 :
+  case Hexagon::MemOPb_ADDr_V4 :
+  case Hexagon::MemOPb_SUBr_V4 :
+  case Hexagon::MemOPb_ANDr_V4 :
+  case Hexagon::MemOPb_ORr_V4 :
+    return (0 <= Offset && Offset <= 63);
+
+  // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of
+  // any size. Later pass knows how to handle it.
+  case Hexagon::STriw_pred:
+  case Hexagon::LDriw_pred:
+    return true;
+
+  case Hexagon::LOOP0_i:
+    return isUInt<10>(Offset);
+
+  // INLINEASM is very special.
+  case Hexagon::INLINEASM:
+    return true;
+  }
+
+  llvm_unreachable("No offset range is defined for this opcode. "
+                   "Please define it in the above switch statement!");
+}
+
+
+//
+// Check if the Offset is a valid auto-inc imm by Load/Store Type.
+//
+bool HexagonInstrInfo::
+isValidAutoIncImm(const EVT VT, const int Offset) const {
+
+  if (VT == MVT::i64) {
+      return (Offset >= Hexagon_MEMD_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMD_AUTOINC_MAX &&
+              (Offset & 0x7) == 0);
+  }
+  if (VT == MVT::i32) {
+      return (Offset >= Hexagon_MEMW_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMW_AUTOINC_MAX &&
+              (Offset & 0x3) == 0);
+  }
+  if (VT == MVT::i16) {
+      return (Offset >= Hexagon_MEMH_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMH_AUTOINC_MAX &&
+              (Offset & 0x1) == 0);
+  }
+  if (VT == MVT::i8) {
+      return (Offset >= Hexagon_MEMB_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMB_AUTOINC_MAX);
+  }
+  llvm_unreachable("Not an auto-inc opc!");
+}
+
+
+bool HexagonInstrInfo::
+isMemOp(const MachineInstr *MI) const {
+//  return MI->getDesc().mayLoad() && MI->getDesc().mayStore();
+
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::MemOPw_ADDi_V4 :
+    case Hexagon::MemOPw_SUBi_V4 :
+    case Hexagon::MemOPw_ADDr_V4 :
+    case Hexagon::MemOPw_SUBr_V4 :
+    case Hexagon::MemOPw_ANDr_V4 :
+    case Hexagon::MemOPw_ORr_V4 :
+    case Hexagon::MemOPh_ADDi_V4 :
+    case Hexagon::MemOPh_SUBi_V4 :
+    case Hexagon::MemOPh_ADDr_V4 :
+    case Hexagon::MemOPh_SUBr_V4 :
+    case Hexagon::MemOPh_ANDr_V4 :
+    case Hexagon::MemOPh_ORr_V4 :
+    case Hexagon::MemOPb_ADDi_V4 :
+    case Hexagon::MemOPb_SUBi_V4 :
+    case Hexagon::MemOPb_ADDr_V4 :
+    case Hexagon::MemOPb_SUBr_V4 :
+    case Hexagon::MemOPb_ANDr_V4 :
+    case Hexagon::MemOPb_ORr_V4 :
+    case Hexagon::MemOPb_SETBITi_V4:
+    case Hexagon::MemOPh_SETBITi_V4:
+    case Hexagon::MemOPw_SETBITi_V4:
+    case Hexagon::MemOPb_CLRBITi_V4:
+    case Hexagon::MemOPh_CLRBITi_V4:
+    case Hexagon::MemOPw_CLRBITi_V4:
+    return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::
+isSpillPredRegOp(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    case Hexagon::STriw_pred :
+    case Hexagon::LDriw_pred :
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    case Hexagon::CMPEQrr:
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPGTrr:
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPGTUrr:
+    case Hexagon::CMPGTUri:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::
+isConditionalTransfer (const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    case Hexagon::TFR_cPt:
+    case Hexagon::TFR_cNotPt:
+    case Hexagon::TFRI_cPt:
+    case Hexagon::TFRI_cNotPt:
+    case Hexagon::TFR_cdnPt:
+    case Hexagon::TFR_cdnNotPt:
+    case Hexagon::TFRI_cdnPt:
+    case Hexagon::TFRI_cdnNotPt:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::ADD_ri_cPt:
+    case Hexagon::ADD_ri_cNotPt:
+    case Hexagon::ADD_rr_cPt:
+    case Hexagon::ADD_rr_cNotPt:
+    case Hexagon::XOR_rr_cPt:
+    case Hexagon::XOR_rr_cNotPt:
+    case Hexagon::AND_rr_cPt:
+    case Hexagon::AND_rr_cNotPt:
+    case Hexagon::OR_rr_cPt:
+    case Hexagon::OR_rr_cNotPt:
+    case Hexagon::SUB_rr_cPt:
+    case Hexagon::SUB_rr_cNotPt:
+    case Hexagon::COMBINE_rr_cPt:
+    case Hexagon::COMBINE_rr_cNotPt:
+      return true;
+    case Hexagon::ASLH_cPt_V4:
+    case Hexagon::ASLH_cNotPt_V4:
+    case Hexagon::ASRH_cPt_V4:
+    case Hexagon::ASRH_cNotPt_V4:
+    case Hexagon::SXTB_cPt_V4:
+    case Hexagon::SXTB_cNotPt_V4:
+    case Hexagon::SXTH_cPt_V4:
+    case Hexagon::SXTH_cNotPt_V4:
+    case Hexagon::ZXTB_cPt_V4:
+    case Hexagon::ZXTB_cNotPt_V4:
+    case Hexagon::ZXTH_cPt_V4:
+    case Hexagon::ZXTH_cNotPt_V4:
+      return QRI.Subtarget.hasV4TOps();
+  }
+}
+
+bool HexagonInstrInfo::
+isConditionalLoad (const MachineInstr* MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::LDrid_cPt :
+    case Hexagon::LDrid_cNotPt :
+    case Hexagon::LDrid_indexed_cPt :
+    case Hexagon::LDrid_indexed_cNotPt :
+    case Hexagon::LDriw_cPt :
+    case Hexagon::LDriw_cNotPt :
+    case Hexagon::LDriw_indexed_cPt :
+    case Hexagon::LDriw_indexed_cNotPt :
+    case Hexagon::LDrih_cPt :
+    case Hexagon::LDrih_cNotPt :
+    case Hexagon::LDrih_indexed_cPt :
+    case Hexagon::LDrih_indexed_cNotPt :
+    case Hexagon::LDrib_cPt :
+    case Hexagon::LDrib_cNotPt :
+    case Hexagon::LDrib_indexed_cPt :
+    case Hexagon::LDrib_indexed_cNotPt :
+    case Hexagon::LDriuh_cPt :
+    case Hexagon::LDriuh_cNotPt :
+    case Hexagon::LDriuh_indexed_cPt :
+    case Hexagon::LDriuh_indexed_cNotPt :
+    case Hexagon::LDriub_cPt :
+    case Hexagon::LDriub_cNotPt :
+    case Hexagon::LDriub_indexed_cPt :
+    case Hexagon::LDriub_indexed_cNotPt :
+      return true;
+    case Hexagon::POST_LDrid_cPt :
+    case Hexagon::POST_LDrid_cNotPt :
+    case Hexagon::POST_LDriw_cPt :
+    case Hexagon::POST_LDriw_cNotPt :
+    case Hexagon::POST_LDrih_cPt :
+    case Hexagon::POST_LDrih_cNotPt :
+    case Hexagon::POST_LDrib_cPt :
+    case Hexagon::POST_LDrib_cNotPt :
+    case Hexagon::POST_LDriuh_cPt :
+    case Hexagon::POST_LDriuh_cNotPt :
+    case Hexagon::POST_LDriub_cPt :
+    case Hexagon::POST_LDriub_cNotPt :
+      return QRI.Subtarget.hasV4TOps();
+    case Hexagon::LDrid_indexed_shl_cPt_V4 :
+    case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDrib_indexed_shl_cPt_V4 :
+    case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDriub_indexed_shl_cPt_V4 :
+    case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDrih_indexed_shl_cPt_V4 :
+    case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDriuh_indexed_shl_cPt_V4 :
+    case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDriw_indexed_shl_cPt_V4 :
+    case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
+      return QRI.Subtarget.hasV4TOps();
+  }
+}
+
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
+//
+//               p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+//                ^           ^
+//               /             \ (not OK. it will cause new-value store to be
+//              /               X conditional on p0.new while R2 producer is
+//             /                 \ on p0)
+//            /                   \.
+//     p.new store                 p.old NV store
+// [if(p0.new)memw(R0+#0)=R2]    [if(p0)memw(R0+#0)=R2.new]
+//            ^                  ^
+//             \                /
+//              \              /
+//               \            /
+//                 p.old store
+//             [if (p0)memw(R0+#0)=R2]
+//
+// The above diagram shows the steps involoved in the conversion of a predicated
+// store instruction to its .new predicated new-value form.
+//
+// The following set of instructions further explains the scenario where
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+//   2) p0 = cmp.eq(r3, #0) }
+//
+//   3) if (p0) memb(r1+#0) = r0  --> this instruction can't be grouped with
+// the first two instructions because in instr 1, r0 is conditional on old value
+// of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which
+// is not valid for new-value stores.
+bool HexagonInstrInfo::
+isConditionalStore (const MachineInstr* MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::STrib_imm_cPt_V4 :
+    case Hexagon::STrib_imm_cNotPt_V4 :
+    case Hexagon::STrib_indexed_shl_cPt_V4 :
+    case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+    case Hexagon::STrib_cPt :
+    case Hexagon::STrib_cNotPt :
+    case Hexagon::POST_STbri_cPt :
+    case Hexagon::POST_STbri_cNotPt :
+    case Hexagon::STrid_indexed_cPt :
+    case Hexagon::STrid_indexed_cNotPt :
+    case Hexagon::STrid_indexed_shl_cPt_V4 :
+    case Hexagon::POST_STdri_cPt :
+    case Hexagon::POST_STdri_cNotPt :
+    case Hexagon::STrih_cPt :
+    case Hexagon::STrih_cNotPt :
+    case Hexagon::STrih_indexed_cPt :
+    case Hexagon::STrih_indexed_cNotPt :
+    case Hexagon::STrih_imm_cPt_V4 :
+    case Hexagon::STrih_imm_cNotPt_V4 :
+    case Hexagon::STrih_indexed_shl_cPt_V4 :
+    case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+    case Hexagon::POST_SThri_cPt :
+    case Hexagon::POST_SThri_cNotPt :
+    case Hexagon::STriw_cPt :
+    case Hexagon::STriw_cNotPt :
+    case Hexagon::STriw_indexed_cPt :
+    case Hexagon::STriw_indexed_cNotPt :
+    case Hexagon::STriw_imm_cPt_V4 :
+    case Hexagon::STriw_imm_cNotPt_V4 :
+    case Hexagon::STriw_indexed_shl_cPt_V4 :
+    case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+    case Hexagon::POST_STwri_cPt :
+    case Hexagon::POST_STwri_cNotPt :
+      return QRI.Subtarget.hasV4TOps();
+
+    // V4 global address store before promoting to dot new.
+    case Hexagon::STd_GP_cPt_V4 :
+    case Hexagon::STd_GP_cNotPt_V4 :
+    case Hexagon::STb_GP_cPt_V4 :
+    case Hexagon::STb_GP_cNotPt_V4 :
+    case Hexagon::STh_GP_cPt_V4 :
+    case Hexagon::STh_GP_cNotPt_V4 :
+    case Hexagon::STw_GP_cPt_V4 :
+    case Hexagon::STw_GP_cNotPt_V4 :
+      return QRI.Subtarget.hasV4TOps();
+
+    // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+    // from the "Conditional Store" list. Because a predicated new value store
+    // would NOT be promoted to a double dot new store. See diagram below:
+    // This function returns yes for those stores that are predicated but not
+    // yet promoted to predicate dot new instructions.
+    //
+    //                          +---------------------+
+    //                    /-----| if (p0) memw(..)=r0 |---------\~
+    //                   ||     +---------------------+         ||
+    //          promote  ||       /\       /\                   ||  promote
+    //                   ||      /||\     /||\                  ||
+    //                  \||/    demote     ||                  \||/
+    //                   \/       ||       ||                   \/
+    //       +-------------------------+   ||   +-------------------------+
+    //       | if (p0.new) memw(..)=r0 |   ||   | if (p0) memw(..)=r0.new |
+    //       +-------------------------+   ||   +-------------------------+
+    //                        ||           ||         ||
+    //                        ||         demote      \||/
+    //                      promote        ||         \/ NOT possible
+    //                        ||           ||         /\~
+    //                       \||/          ||        /||\~
+    //                        \/           ||         ||
+    //                      +-----------------------------+
+    //                      | if (p0.new) memw(..)=r0.new |
+    //                      +-----------------------------+
+    //                           Double Dot New Store
+    //
+  }
+}
+
+
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
+  if (isNewValue(MI) && isBranch(MI))
+    return true;
+  return false;
+}
+
+bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
+  return (getAddrMode(MI) == HexagonII::PostInc);
+}
+
+bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+}
+
+// Returns true, if any one of the operands is a dot new
+// insn, whether it is predicated dot new or register dot new.
+bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
+  return (isNewValueInst(MI) ||
+     (isPredicated(MI) && isPredicatedNew(MI)));
+}
+
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new  --->
+// 2) if (p0) memw(R0+#0)= R1.new      -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1      --->
+//
+
+int HexagonInstrInfo::GetDotOldOp(const int opc) const {
+  int NewOp = opc;
+  if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
+    NewOp = Hexagon::getPredOldOpcode(NewOp);
+    assert(NewOp >= 0 &&
+           "Couldn't change predicate new instruction to its old form.");
+  }
+
+  if (isNewValueStore(NewOp)) { // Convert into non-new-value format
+    NewOp = Hexagon::getNonNVStore(NewOp);
+    assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
+  }
+  return NewOp;
+}
+
+// Return the new value instruction for a given store.
+int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
+  int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode());
+  if (NVOpcode >= 0) // Valid new-value store instruction.
+    return NVOpcode;
+
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown .new type");
+  // store new value byte
+  case Hexagon::STrib_shl_V4:
+    return Hexagon::STrib_shl_nv_V4;
+
+  case Hexagon::STrih_shl_V4:
+    return Hexagon::STrih_shl_nv_V4;
+
+  case Hexagon::STriw_f:
+    return Hexagon::STriw_nv_V4;
+
+  case Hexagon::STriw_indexed_f:
+    return Hexagon::STriw_indexed_nv_V4;
+
+  case Hexagon::STriw_shl_V4:
+    return Hexagon::STriw_shl_nv_V4;
+
+  }
+  return 0;
+}
+
+// Return .new predicate version for an instruction.
+int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI,
+                                      const MachineBranchProbabilityInfo
+                                      *MBPI) const {
+
+  int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
+  if (NewOpcode >= 0) // Valid predicate new instruction
+    return NewOpcode;
+
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown .new type");
+  // Condtional Jumps
+  case Hexagon::JMP_t:
+  case Hexagon::JMP_f:
+    return getDotNewPredJumpOp(MI, MBPI);
+
+  case Hexagon::JMPR_t:
+    return Hexagon::JMPR_tnew_tV3;
+
+  case Hexagon::JMPR_f:
+    return Hexagon::JMPR_fnew_tV3;
+
+  case Hexagon::JMPret_t:
+    return Hexagon::JMPret_tnew_tV3;
+
+  case Hexagon::JMPret_f:
+    return Hexagon::JMPret_fnew_tV3;
+
+
+  // Conditional combine
+  case Hexagon::COMBINE_rr_cPt :
+    return Hexagon::COMBINE_rr_cdnPt;
+  case Hexagon::COMBINE_rr_cNotPt :
+    return Hexagon::COMBINE_rr_cdnNotPt;
+  }
+}
+
+
+unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask);
+}
+
+/// immediateExtend - Changes the instruction in place to one using an immediate
+/// extender.
+void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const {
+  assert((isExtendable(MI)||isConstExtended(MI)) &&
+                               "Instruction must be extendable");
+  // Find which operand is extendable.
+  short ExtOpNum = getCExtOpNum(MI);
+  MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // This needs to be something we understand.
+  assert((MO.isMBB() || MO.isImm()) &&
+         "Branch with unknown extendable field type");
+  // Mark given operand as extended.
+  MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+}
+
+DFAPacketizer *HexagonInstrInfo::
+CreateTargetScheduleState(const TargetMachine *TM,
+                           const ScheduleDAG *DAG) const {
+  const InstrItineraryData *II = TM->getInstrItineraryData();
+  return TM->getSubtarget<HexagonGenSubtargetInfo>().createDFAPacketizer(II);
+}
+
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->getDesc().isTerminator() || MI->isPosition() || MI->isInlineAsm())
+    return true;
+
+  return false;
+}
+
+bool HexagonInstrInfo::isConstExtended(MachineInstr *MI) const {
+
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable = (F >> HexagonII::ExtendablePos)
+                          & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO
+  // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isGlobal() || MO.isSymbol())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+// Returns the opcode to use when converting MI, which is a conditional jump,
+// into a conditional instruction which uses the .new value of the predicate.
+// We also use branch probabilities to add a hint to the jump.
+int
+HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI,
+                                  const
+                                  MachineBranchProbabilityInfo *MBPI) const {
+
+  // We assume that block can have at most two successors.
+  bool taken = false;
+  MachineBasicBlock *Src = MI->getParent();
+  MachineOperand *BrTarget = &MI->getOperand(1);
+  MachineBasicBlock *Dst = BrTarget->getMBB();
+
+  const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
+  if (Prediction >= BranchProbability(1,2))
+    taken = true;
+
+  switch (MI->getOpcode()) {
+  case Hexagon::JMP_t:
+    return taken ? Hexagon::JMP_tnew_t : Hexagon::JMP_tnew_nt;
+  case Hexagon::JMP_f:
+    return taken ? Hexagon::JMP_fnew_t : Hexagon::JMP_fnew_nt;
+
+  default:
+    llvm_unreachable("Unexpected jump instruction.");
+  }
+}
+// Returns true if a particular operand is extendable for an instruction.
+bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI,
+                                         unsigned short OperandNum) const {
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+          == OperandNum;
+}
+
+// Returns Operand Index for the constant extended instruction.
+unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+// Returns the min value that doesn't need to be extended.
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1 << (bits - 1);
+  else
+    return 0;
+}
+
+// Returns the max value that doesn't need to be extended.
+int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1 << (bits - 1));
+  else
+    return ~(-1 << bits);
+}
+
+// Returns true if an instruction can be converted into a non-extended
+// equivalent instruction.
+bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const {
+
+  short NonExtOpcode;
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  if (Hexagon::getRegForm(MI->getOpcode()) >= 0)
+    return true;
+
+  if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+    // Check addressing mode and retrieve non-ext equivalent instruction.
+
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      // Load/store with absolute addressing mode can be converted into
+      // base+offset mode.
+      NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode());
+      break;
+    case HexagonII::BaseImmOffset :
+      // Load/store with base+offset addressing mode can be converted into
+      // base+register offset addressing mode. However left shift operand should
+      // be set to 0.
+      NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode());
+      break;
+    default:
+      return false;
+    }
+    if (NonExtOpcode < 0)
+      return false;
+    return true;
+  }
+  return false;
+}
+
+// Returns opcode of the non-extended equivalent instruction.
+short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
+
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode());
+    if (NonExtOpcode >= 0)
+      return NonExtOpcode;
+
+  if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+    // Check addressing mode and retrieve non-ext equivalent instruction.
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      return Hexagon::getBasedWithImmOffset(MI->getOpcode());
+    case HexagonII::BaseImmOffset :
+      return Hexagon::getBaseWithRegOffset(MI->getOpcode());
+    default:
+      return -1;
+    }
+  }
+  return -1;
+}
+
+bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const {
+  return (Opcode == Hexagon::JMP_t) ||
+         (Opcode == Hexagon::JMP_f) ||
+         (Opcode == Hexagon::JMP_tnew_t) ||
+         (Opcode == Hexagon::JMP_fnew_t) ||
+         (Opcode == Hexagon::JMP_tnew_nt) ||
+         (Opcode == Hexagon::JMP_fnew_nt);
+}
+
+bool HexagonInstrInfo::PredOpcodeHasNot(Opcode_t Opcode) const {
+  return (Opcode == Hexagon::JMP_f) ||
+         (Opcode == Hexagon::JMP_fnew_t) ||
+         (Opcode == Hexagon::JMP_fnew_nt);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
new file mode 100644
index 0000000..6b032c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -0,0 +1,222 @@
+//===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonINSTRUCTIONINFO_H
+#define HexagonINSTRUCTIONINFO_H
+
+#include "HexagonRegisterInfo.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "HexagonGenInstrInfo.inc"
+
+namespace llvm {
+
+struct EVT;
+
+class HexagonInstrInfo : public HexagonGenInstrInfo {
+  virtual void anchor();
+  const HexagonRegisterInfo RI;
+  const HexagonSubtarget &Subtarget;
+  typedef unsigned Opcode_t;
+
+public:
+  explicit HexagonInstrInfo(HexagonSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                         MachineBasicBlock *&FBB,
+                         SmallVectorImpl<MachineOperand> &Cond,
+                         bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  bool analyzeCompare(const MachineInstr *MI,
+                      unsigned &SrcReg, unsigned &SrcReg2,
+                      int &Mask, int &Value) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                      SmallVectorImpl<MachineOperand> &Addr,
+                      const TargetRegisterClass *RC,
+                      SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                       SmallVectorImpl<MachineOperand> &Addr,
+                       const TargetRegisterClass *RC,
+                       SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const override;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr* LoadMI) const override {
+    return nullptr;
+  }
+
+  unsigned createVR(MachineFunction* MF, MVT VT) const;
+
+  bool isBranch(const MachineInstr *MI) const;
+  bool isPredicable(MachineInstr *MI) const override;
+  bool PredicateInstruction(MachineInstr *MI,
+                    const SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumTCycles, unsigned ExtraTCycles,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumFCycles, unsigned ExtraFCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isPredicated(const MachineInstr *MI) const override;
+  bool isPredicated(unsigned Opcode) const;
+  bool isPredicatedTrue(const MachineInstr *MI) const;
+  bool isPredicatedTrue(unsigned Opcode) const;
+  bool isPredicatedNew(const MachineInstr *MI) const;
+  bool isPredicatedNew(unsigned Opcode) const;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const override;
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           const BranchProbability &Probability) const override;
+
+  DFAPacketizer*
+  CreateTargetScheduleState(const TargetMachine *TM,
+                            const ScheduleDAG *DAG) const override;
+
+  bool isSchedulingBoundary(const MachineInstr *MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+  bool isValidOffset(const int Opcode, const int Offset) const;
+  bool isValidAutoIncImm(const EVT VT, const int Offset) const;
+  bool isMemOp(const MachineInstr *MI) const;
+  bool isSpillPredRegOp(const MachineInstr *MI) const;
+  bool isU6_3Immediate(const int value) const;
+  bool isU6_2Immediate(const int value) const;
+  bool isU6_1Immediate(const int value) const;
+  bool isU6_0Immediate(const int value) const;
+  bool isS4_3Immediate(const int value) const;
+  bool isS4_2Immediate(const int value) const;
+  bool isS4_1Immediate(const int value) const;
+  bool isS4_0Immediate(const int value) const;
+  bool isS12_Immediate(const int value) const;
+  bool isU6_Immediate(const int value) const;
+  bool isS8_Immediate(const int value) const;
+  bool isS6_Immediate(const int value) const;
+
+  bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const;
+  bool isConditionalTransfer(const MachineInstr* MI) const;
+  bool isConditionalALU32 (const MachineInstr* MI) const;
+  bool isConditionalLoad (const MachineInstr* MI) const;
+  bool isConditionalStore(const MachineInstr* MI) const;
+  bool isNewValueInst(const MachineInstr* MI) const;
+  bool isNewValue(const MachineInstr* MI) const;
+  bool isDotNewInst(const MachineInstr* MI) const;
+  int GetDotOldOp(const int opc) const;
+  int GetDotNewOp(const MachineInstr* MI) const;
+  int GetDotNewPredOp(MachineInstr *MI,
+                      const MachineBranchProbabilityInfo
+                      *MBPI) const;
+  bool mayBeNewStore(const MachineInstr* MI) const;
+  bool isDeallocRet(const MachineInstr *MI) const;
+  unsigned getInvertedPredicatedOpcode(const int Opc) const;
+  bool isExtendable(const MachineInstr* MI) const;
+  bool isExtended(const MachineInstr* MI) const;
+  bool isPostIncrement(const MachineInstr* MI) const;
+  bool isNewValueStore(const MachineInstr* MI) const;
+  bool isNewValueStore(unsigned Opcode) const;
+  bool isNewValueJump(const MachineInstr* MI) const;
+  bool isNewValueJumpCandidate(const MachineInstr *MI) const;
+
+
+  void immediateExtend(MachineInstr *MI) const;
+  bool isConstExtended(MachineInstr *MI) const;
+  int getDotNewPredJumpOp(MachineInstr *MI,
+                      const MachineBranchProbabilityInfo *MBPI) const;
+  unsigned getAddrMode(const MachineInstr* MI) const;
+  bool isOperandExtended(const MachineInstr *MI,
+                         unsigned short OperandNum) const;
+  unsigned short getCExtOpNum(const MachineInstr *MI) const;
+  int getMinValue(const MachineInstr *MI) const;
+  int getMaxValue(const MachineInstr *MI) const;
+  bool NonExtEquivalentExists (const MachineInstr *MI) const;
+  short getNonExtOpcode(const MachineInstr *MI) const;
+  bool PredOpcodeHasJMP_c(Opcode_t Opcode) const;
+  bool PredOpcodeHasNot(Opcode_t Opcode) const;
+
+private:
+  int getMatchingCondBranchOpcode(int Opc, bool sense) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
new file mode 100644
index 0000000..4dcf101
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -0,0 +1,2853 @@
+//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormats.td"
+include "HexagonOperands.td"
+
+//===----------------------------------------------------------------------===//
+
+// Multi-class for logical operators.
+multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> {
+  def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$b),
+                                                   (i32 IntRegs:$c)))]>;
+  def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")),
+                 [(set (i32 IntRegs:$dst), (OpNode s10Imm:$b,
+                                                   (i32 IntRegs:$c)))]>;
+}
+
+// Multi-class for compare ops.
+let isCompare = 1 in {
+multiclass CMP64_rr<string OpcStr, PatFrag OpNode> {
+  def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set (i1 PredRegs:$dst),
+                       (OpNode (i64 DoubleRegs:$b), (i64 DoubleRegs:$c)))]>;
+}
+
+multiclass CMP32_rr_ri_s10<string OpcStr, string CextOp, PatFrag OpNode> {
+  let CextOpcode = CextOp in {
+    let InputType = "reg" in
+    def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+
+    let isExtendable = 1, opExtendable = 2, isExtentSigned = 1,
+    opExtentBits = 10, InputType = "imm" in
+    def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Ext:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), s10ExtPred:$c))]>;
+  }
+}
+
+multiclass CMP32_rr_ri_u9<string OpcStr, string CextOp, PatFrag OpNode> {
+  let CextOpcode = CextOp in {
+    let InputType = "reg" in
+    def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+
+    let isExtendable = 1, opExtendable = 2, isExtentSigned = 0,
+    opExtentBits = 9, InputType = "imm" in
+    def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Ext:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), u9ExtPred:$c))]>;
+  }
+}
+
+multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> {
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8 in
+  def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Ext:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                 [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b),
+                                                   s8ExtPred:$c))]>;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU (Instructions with register-register form)
+//===----------------------------------------------------------------------===//
+def SDTHexagonI64I32I32 : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+
+def HexagonWrapperCombineII :
+  SDNode<"HexagonISD::WrapperCombineII", SDTHexagonI64I32I32>;
+
+def HexagonWrapperCombineRR :
+  SDNode<"HexagonISD::WrapperCombineRR", SDTHexagonI64I32I32>;
+
+multiclass ALU32_Pbase<string mnemonic, RegisterClass RC, bit isNot,
+                       bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : ALU32_rr<(outs RC:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs: $src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+            ") $dst = ")#mnemonic#"($src2, $src3)",
+            []>;
+}
+
+multiclass ALU32_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ALU32_Pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ALU32_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let InputType = "reg" in
+multiclass ALU32_base<string mnemonic, string CextOp, SDNode OpNode> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_rr in {
+    let isPredicable = 1 in
+    def NAME : ALU32_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = "#mnemonic#"($src1, $src2)",
+            [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+                                              (i32 IntRegs:$src2)))]>;
+
+    let neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ALU32_Pred<mnemonic, IntRegs, 0>;
+      defm NotPt : ALU32_Pred<mnemonic, IntRegs, 1>;
+    }
+  }
+}
+
+let isCommutable = 1 in {
+  defm ADD_rr : ALU32_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
+  defm AND_rr : ALU32_base<"and", "AND", and>, ImmRegRel, PredNewRel;
+  defm XOR_rr : ALU32_base<"xor", "XOR", xor>, ImmRegRel, PredNewRel;
+  defm OR_rr  : ALU32_base<"or", "OR", or>, ImmRegRel, PredNewRel;
+}
+
+defm SUB_rr : ALU32_base<"sub", "SUB", sub>, ImmRegRel, PredNewRel;
+
+// Combines the two integer registers SRC1 and SRC2 into a double register.
+let isPredicable = 1 in
+class T_Combine : ALU32_rr<(outs DoubleRegs:$dst),
+                           (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = combine($src1, $src2)",
+            [(set (i64 DoubleRegs:$dst),
+              (i64 (HexagonWrapperCombineRR (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2))))]>;
+
+multiclass Combine_base {
+  let BaseOpcode = "combine" in {
+    def NAME : T_Combine;
+    let neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ALU32_Pred<"combine", DoubleRegs, 0>;
+      defm NotPt : ALU32_Pred<"combine", DoubleRegs, 1>;
+    }
+  }
+}
+
+defm COMBINE_rr : Combine_base, PredNewRel;
+
+// Combines the two immediates SRC1 and SRC2 into a double register.
+class COMBINE_imm<Operand imm1, Operand imm2, PatLeaf pat1, PatLeaf pat2> :
+  ALU32_ii<(outs DoubleRegs:$dst), (ins imm1:$src1, imm2:$src2),
+  "$dst = combine(#$src1, #$src2)",
+  [(set (i64 DoubleRegs:$dst),
+        (i64 (HexagonWrapperCombineII (i32 pat1:$src1), (i32 pat2:$src2))))]>;
+
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 8 in
+def COMBINE_Ii : COMBINE_imm<s8Ext, s8Imm, s8ExtPred, s8ImmPred>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU (ADD with register-immediate form)
+//===----------------------------------------------------------------------===//
+multiclass ALU32ri_Pbase<string mnemonic, bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : ALU32_ri<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, s8Ext: $src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+            ") $dst = ")#mnemonic#"($src2, #$src3)",
+            []>;
+}
+
+multiclass ALU32ri_Pred<string mnemonic, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ALU32ri_Pbase<mnemonic, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ALU32ri_Pbase<mnemonic, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, InputType = "imm" in
+multiclass ALU32ri_base<string mnemonic, string CextOp, SDNode OpNode> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_ri in {
+    let opExtendable = 2, isExtentSigned = 1, opExtentBits = 16,
+    isPredicable = 1 in
+    def NAME : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s16Ext:$src2),
+            "$dst = "#mnemonic#"($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+                                              (s16ExtPred:$src2)))]>;
+
+    let opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+    neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ALU32ri_Pred<mnemonic, 0>;
+      defm NotPt : ALU32ri_Pred<mnemonic, 1>;
+    }
+  }
+}
+
+defm ADD_ri : ALU32ri_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
+CextOpcode = "OR", InputType = "imm" in
+def OR_ri : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s10Ext:$src2),
+            "$dst = or($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+                                          s10ExtPred:$src2))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
+InputType = "imm", CextOpcode = "AND" in
+def AND_ri : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s10Ext:$src2),
+            "$dst = and($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+                                           s10ExtPred:$src2))]>, ImmRegRel;
+
+// Nop.
+let neverHasSideEffects = 1 in
+def NOP : ALU32_rr<(outs), (ins),
+          "nop",
+          []>;
+
+// Rd32=sub(#s10,Rs32)
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 10,
+CextOpcode = "SUB", InputType = "imm" in
+def SUB_ri : ALU32_ri<(outs IntRegs:$dst),
+            (ins s10Ext:$src1, IntRegs:$src2),
+            "$dst = sub(#$src1, $src2)",
+            [(set IntRegs:$dst, (sub s10ExtPred:$src1, IntRegs:$src2))]>,
+            ImmRegRel;
+
+// Rd = not(Rs) gets mapped to Rd=sub(#-1, Rs).
+def : Pat<(not (i32 IntRegs:$src1)),
+          (SUB_ri -1, (i32 IntRegs:$src1))>;
+
+// Rd = neg(Rs) gets mapped to Rd=sub(#0, Rs).
+// Pattern definition for 'neg' was not necessary.
+
+multiclass TFR_Pred<bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    def _c#NAME : ALU32_rr<(outs IntRegs:$dst),
+                           (ins PredRegs:$src1, IntRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = $src2",
+            []>;
+    // Predicate new
+    let isPredicatedNew = 1 in
+    def _cdn#NAME : ALU32_rr<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, IntRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = $src2",
+            []>;
+  }
+}
+
+let InputType = "reg", neverHasSideEffects = 1 in
+multiclass TFR_base<string CextOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let isPredicable = 1 in
+    def NAME : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
+            "$dst = $src1",
+            []>;
+
+    let  isPredicated = 1 in {
+      defm Pt : TFR_Pred<0>;
+      defm NotPt : TFR_Pred<1>;
+    }
+  }
+}
+
+class T_TFR64_Pred<bit PredNot, bit isPredNew>
+            : ALU32_rr<(outs DoubleRegs:$dst),
+                       (ins PredRegs:$src1, DoubleRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#
+            !if(isPredNew, ".new) ", ") ")#"$dst = $src2", []>
+{
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+
+    let IClass = 0b1111;
+    let Inst{27-24} = 0b1101;
+    let Inst{13} = isPredNew;
+    let Inst{7} = PredNot;
+    let Inst{4-0} = dst;
+    let Inst{6-5} = src1;
+    let Inst{20-17} = src2{4-1};
+    let Inst{16} = 0b1;
+    let Inst{12-9} = src2{4-1};
+    let Inst{8} = 0b0;
+}
+
+multiclass TFR64_Pred<bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    def _c#NAME : T_TFR64_Pred<PredNot, 0>;
+
+    let isPredicatedNew = 1 in
+    def _cdn#NAME : T_TFR64_Pred<PredNot, 1>; // Predicate new
+  }
+}
+
+let neverHasSideEffects = 1 in
+multiclass TFR64_base<string BaseName> {
+  let BaseOpcode = BaseName in {
+    let isPredicable = 1 in
+    def NAME : ALU32Inst <(outs DoubleRegs:$dst),
+                          (ins DoubleRegs:$src1),
+                          "$dst = $src1" > {
+        bits<5> dst;
+        bits<5> src1;
+
+        let IClass = 0b1111;
+        let Inst{27-23} = 0b01010;
+        let Inst{4-0} = dst;
+        let Inst{20-17} = src1{4-1};
+        let Inst{16} = 0b1;
+        let Inst{12-9} = src1{4-1};
+        let Inst{8} = 0b0;
+    }
+
+    let  isPredicated = 1 in {
+      defm Pt : TFR64_Pred<0>;
+      defm NotPt : TFR64_Pred<1>;
+    }
+  }
+}
+
+multiclass TFRI_Pred<bit PredNot> {
+  let isMoveImm = 1, isPredicatedFalse = PredNot in {
+    def _c#NAME : ALU32_ri<(outs IntRegs:$dst),
+                           (ins PredRegs:$src1, s12Ext:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = #$src2",
+            []>;
+
+    // Predicate new
+    let isPredicatedNew = 1 in
+    def _cdn#NAME : ALU32_rr<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, s12Ext:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = #$src2",
+            []>;
+  }
+}
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1 in
+multiclass TFRI_base<string CextOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#I in {
+    let isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16,
+    isMoveImm = 1, isPredicable = 1, isReMaterializable = 1 in
+    def NAME : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
+            "$dst = #$src1",
+            [(set (i32 IntRegs:$dst), s16ExtPred:$src1)]>;
+
+    let opExtendable = 2,  opExtentBits = 12, neverHasSideEffects = 1,
+    isPredicated = 1 in {
+      defm Pt    : TFRI_Pred<0>;
+      defm NotPt : TFRI_Pred<1>;
+    }
+  }
+}
+
+defm TFRI : TFRI_base<"TFR">, ImmRegRel, PredNewRel;
+defm TFR : TFR_base<"TFR">, ImmRegRel, PredNewRel;
+defm TFR64 : TFR64_base<"TFR64">, PredNewRel;
+
+// Transfer control register.
+let neverHasSideEffects = 1 in
+def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
+           "$dst = $src1",
+           []>;
+//===----------------------------------------------------------------------===//
+// ALU32/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in
+def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst),
+            (ins s8Imm:$src1, s8Imm:$src2),
+            "$dst = combine(#$src1, #$src2)",
+            []>;
+
+// Mux.
+def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+                                                   DoubleRegs:$src2,
+                                                   DoubleRegs:$src3),
+            "$dst = vmux($src1, $src2, $src3)",
+            []>;
+
+let CextOpcode = "MUX", InputType = "reg" in
+def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                            IntRegs:$src2, IntRegs:$src3),
+             "$dst = mux($src1, $src2, $src3)",
+             [(set (i32 IntRegs:$dst),
+                   (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+                                (i32 IntRegs:$src3))))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
+CextOpcode = "MUX", InputType = "imm" in
+def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Ext:$src2,
+                                                IntRegs:$src3),
+             "$dst = mux($src1, #$src2, $src3)",
+             [(set (i32 IntRegs:$dst),
+                   (i32 (select (i1 PredRegs:$src1), s8ExtPred:$src2,
+                                (i32 IntRegs:$src3))))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+CextOpcode = "MUX", InputType = "imm" in
+def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2,
+                                                s8Ext:$src3),
+             "$dst = mux($src1, $src2, #$src3)",
+             [(set (i32 IntRegs:$dst),
+                   (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+                                 s8ExtPred:$src3)))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8 in
+def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Ext:$src2,
+                                                s8Imm:$src3),
+             "$dst = mux($src1, #$src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
+                                                    s8ExtPred:$src2,
+                                                    s8ImmPred:$src3)))]>;
+
+// ALU32 - aslh, asrh, sxtb, sxth, zxtb, zxth
+multiclass ALU32_2op_Pbase<string mnemonic, bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : ALU32Inst<(outs IntRegs:$dst),
+                       (ins PredRegs:$src1, IntRegs:$src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+            ") $dst = ")#mnemonic#"($src2)">,
+            Requires<[HasV4T]>;
+}
+
+multiclass ALU32_2op_Pred<string mnemonic, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ALU32_2op_Pbase<mnemonic, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ALU32_2op_Pbase<mnemonic, PredNot, 1>;
+  }
+}
+
+multiclass ALU32_2op_base<string mnemonic> {
+  let BaseOpcode = mnemonic in {
+    let isPredicable = 1, neverHasSideEffects = 1 in
+    def NAME : ALU32Inst<(outs IntRegs:$dst),
+                         (ins IntRegs:$src1),
+            "$dst = "#mnemonic#"($src1)">;
+
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT, isPredicated = 1,
+    neverHasSideEffects = 1 in {
+      defm Pt_V4    : ALU32_2op_Pred<mnemonic, 0>;
+      defm NotPt_V4 : ALU32_2op_Pred<mnemonic, 1>;
+    }
+  }
+}
+
+defm ASLH : ALU32_2op_base<"aslh">, PredNewRel;
+defm ASRH : ALU32_2op_base<"asrh">, PredNewRel;
+defm SXTB : ALU32_2op_base<"sxtb">, PredNewRel;
+defm SXTH : ALU32_2op_base<"sxth">,  PredNewRel;
+defm ZXTB : ALU32_2op_base<"zxtb">, PredNewRel;
+defm ZXTH : ALU32_2op_base<"zxth">,  PredNewRel;
+
+def : Pat <(shl (i32 IntRegs:$src1), (i32 16)),
+           (ASLH IntRegs:$src1)>;
+
+def : Pat <(sra (i32 IntRegs:$src1), (i32 16)),
+           (ASRH IntRegs:$src1)>;
+
+def : Pat <(sext_inreg (i32 IntRegs:$src1), i8),
+           (SXTB IntRegs:$src1)>;
+
+def : Pat <(sext_inreg (i32 IntRegs:$src1), i16),
+           (SXTH IntRegs:$src1)>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED +
+//===----------------------------------------------------------------------===//
+
+// Compare.
+defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", "CMPGTU", setugt>, ImmRegRel;
+defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", "CMPGT", setgt>, ImmRegRel;
+defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", "CMPEQ", seteq>, ImmRegRel;
+
+// SDNode for converting immediate C to C-1.
+def DEC_CONST_SIGNED : SDNodeXForm<imm, [{
+   // Return the byte immediate const-1 as an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformSToSM1Imm(imm);
+}]>;
+
+// SDNode for converting immediate C to C-1.
+def DEC_CONST_UNSIGNED : SDNodeXForm<imm, [{
+   // Return the byte immediate const-1 as an SDNode.
+   uint32_t imm = N->getZExtValue();
+   return XformUToUM1Imm(imm);
+}]>;
+
+def CTLZ_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+    "$dst = cl0($src1)",
+    [(set (i32 IntRegs:$dst), (ctlz (i32 IntRegs:$src1)))]>;
+
+def CTTZ_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+    "$dst = ct0($src1)",
+    [(set (i32 IntRegs:$dst), (cttz (i32 IntRegs:$src1)))]>;
+
+def CTLZ64_rr : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+    "$dst = cl0($src1)",
+    [(set (i32 IntRegs:$dst), (i32 (trunc (ctlz (i64 DoubleRegs:$src1)))))]>;
+
+def CTTZ64_rr : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+    "$dst = ct0($src1)",
+    [(set (i32 IntRegs:$dst), (i32 (trunc (cttz (i64 DoubleRegs:$src1)))))]>;
+
+def TSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+    "$dst = tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (setne (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>;
+
+def TSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+    "$dst = tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (setne (and (shl 1, (u5ImmPred:$src2)), (i32 IntRegs:$src1)), 0))]>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+// Add.
+def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = add($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (add (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+// Add halfword.
+
+// Compare.
+defm CMPEHexagon4 : CMP64_rr<"cmp.eq", seteq>;
+defm CMPGT64 : CMP64_rr<"cmp.gt", setgt>;
+defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>;
+
+// Logical operations.
+def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = and($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = or($src1, $src2)",
+              [(set (i64 DoubleRegs:$dst), (or (i64 DoubleRegs:$src1),
+                                               (i64 DoubleRegs:$src2)))]>;
+
+def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = xor($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+// Maximum.
+def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = max($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setlt (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MAXUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = maxu($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setult (i32 IntRegs:$src2),
+                                             (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MAXd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = max($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+def MAXUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+              "$dst = maxu($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setult (i64 DoubleRegs:$src2),
+                                             (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+// Minimum.
+def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = min($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setgt (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MINUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = minu($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setugt (i32 IntRegs:$src2),
+                                             (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MINd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = min($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+def MINUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+              "$dst = minu($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setugt (i64 DoubleRegs:$src2),
+                                             (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+// Subtract.
+def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = sub($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+// Subtract halfword.
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/BIT +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/PERM +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+// Logical reductions on predicates.
+
+// Looping instructions.
+
+// Pipelined looping instructions.
+
+// Logical operations on predicates.
+def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
+             "$dst = and($src1, $src2)",
+             [(set (i1 PredRegs:$dst), (and (i1 PredRegs:$src1),
+                                            (i1 PredRegs:$src2)))]>;
+
+let neverHasSideEffects = 1 in
+def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1,
+                                                 PredRegs:$src2),
+                "$dst = and($src1, !$src2)",
+                []>;
+
+def ANY_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
+             "$dst = any8($src1)",
+             []>;
+
+def ALL_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
+             "$dst = all8($src1)",
+             []>;
+
+def VITPACK_pp : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                                 PredRegs:$src2),
+             "$dst = vitpack($src1, $src2)",
+             []>;
+
+def VALIGN_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2,
+                                                    PredRegs:$src3),
+             "$dst = valignb($src1, $src2, $src3)",
+             []>;
+
+def VSPLICE_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2,
+                                                     PredRegs:$src3),
+             "$dst = vspliceb($src1, $src2, $src3)",
+             []>;
+
+def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1),
+             "$dst = mask($src1)",
+             []>;
+
+def NOT_p : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
+             "$dst = not($src1)",
+             [(set (i1 PredRegs:$dst), (not (i1 PredRegs:$src1)))]>;
+
+def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
+            "$dst = or($src1, $src2)",
+            [(set (i1 PredRegs:$dst), (or (i1 PredRegs:$src1),
+                                          (i1 PredRegs:$src2)))]>;
+
+def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
+             "$dst = xor($src1, $src2)",
+             [(set (i1 PredRegs:$dst), (xor (i1 PredRegs:$src1),
+                                            (i1 PredRegs:$src2)))]>;
+
+
+// User control register transfer.
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
+def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
+                               [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone,
+                      [SDNPHasChain]>;
+
+def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>;
+
+let InputType = "imm", isBarrier = 1, isPredicable = 1,
+Defs = [PC], isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+opExtentBits = 24 in
+class T_JMP <dag InsDag, list<dag> JumpList = []>
+            : JInst<(outs), InsDag,
+            "jump $dst" , JumpList> {
+    bits<24> dst;
+
+    let IClass = 0b0101;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24-16} = dst{23-15};
+    let Inst{13-1} = dst{14-2};
+}
+
+let InputType = "imm", isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
+Defs = [PC], isPredicated = 1, opExtentBits = 17 in
+class T_JMP_c <bit PredNot, bit isPredNew, bit isTak>:
+            JInst<(outs ), (ins PredRegs:$src, brtarget:$dst),
+            !if(PredNot, "if (!$src", "if ($src")#
+            !if(isPredNew, ".new) ", ") ")#"jump"#
+            !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
+
+    let isTaken = isTak;
+    let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = isPredNew;
+    bits<2> src;
+    bits<17> dst;
+
+    let IClass = 0b0101;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{21} = PredNot;
+    let Inst{12} = !if(isPredNew, isTak, zero);
+    let Inst{11} = isPredNew;
+    let Inst{9-8} = src;
+    let Inst{23-22} = dst{16-15};
+    let Inst{20-16} = dst{14-10};
+    let Inst{13} = dst{9};
+    let Inst{7-1} = dst{8-2};
+  }
+
+let isBarrier = 1, Defs = [PC], isPredicable = 1, InputType = "reg" in
+class T_JMPr<dag InsDag = (ins IntRegs:$dst)>
+            : JRInst<(outs ), InsDag,
+            "jumpr $dst" ,
+            []> {
+    bits<5> dst;
+
+    let IClass = 0b0101;
+    let Inst{27-21} = 0b0010100;
+    let Inst{20-16} = dst;
+}
+
+let Defs = [PC], isPredicated = 1, InputType = "reg" in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>:
+            JRInst <(outs ), (ins PredRegs:$src, IntRegs:$dst),
+            !if(PredNot, "if (!$src", "if ($src")#
+            !if(isPredNew, ".new) ", ") ")#"jumpr"#
+            !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
+
+    let isTaken = isTak;
+    let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = isPredNew;
+    bits<2> src;
+    bits<5> dst;
+
+    let IClass = 0b0101;
+
+    let Inst{27-22} = 0b001101;
+    let Inst{21} = PredNot;
+    let Inst{20-16} = dst;
+    let Inst{12} = !if(isPredNew, isTak, zero);
+    let Inst{11} = isPredNew;
+    let Inst{9-8} = src;
+    let Predicates = !if(isPredNew, [HasV3T], [HasV2T]);
+    let validSubTargets = !if(isPredNew, HasV3SubT, HasV2SubT);
+}
+
+multiclass JMP_Pred<bit PredNot> {
+  def _#NAME : T_JMP_c<PredNot, 0, 0>;
+  // Predicate new
+  def _#NAME#new_t  : T_JMP_c<PredNot, 1, 1>; // taken
+  def _#NAME#new_nt : T_JMP_c<PredNot, 1, 0>; // not taken
+}
+
+multiclass JMP_base<string BaseOp> {
+  let BaseOpcode = BaseOp in {
+    def NAME : T_JMP<(ins brtarget:$dst), [(br bb:$dst)]>;
+    defm t : JMP_Pred<0>;
+    defm f : JMP_Pred<1>;
+  }
+}
+
+multiclass JMPR_Pred<bit PredNot> {
+  def NAME: T_JMPr_c<PredNot, 0, 0>;
+  // Predicate new
+  def NAME#new_tV3  : T_JMPr_c<PredNot, 1, 1>; // taken
+  def NAME#new_ntV3 : T_JMPr_c<PredNot, 1, 0>; // not taken
+}
+
+multiclass JMPR_base<string BaseOp> {
+  let BaseOpcode = BaseOp in {
+    def NAME : T_JMPr;
+    defm _t : JMPR_Pred<0>;
+    defm _f : JMPR_Pred<1>;
+  }
+}
+
+let isTerminator = 1, neverHasSideEffects = 1 in {
+let isBranch = 1 in
+defm JMP : JMP_base<"JMP">, PredNewRel;
+
+let isBranch = 1, isIndirectBranch = 1 in
+defm JMPR : JMPR_base<"JMPr">, PredNewRel;
+
+let isReturn = 1, isCodeGenOnly = 1 in
+defm JMPret : JMPR_base<"JMPret">, PredNewRel;
+}
+
+def : Pat<(retflag),
+          (JMPret (i32 R31))>;
+
+def : Pat <(brcond (i1 PredRegs:$src1), bb:$offset),
+      (JMP_t (i1 PredRegs:$src1), bb:$offset)>;
+
+// A return through builtin_eh_return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, neverHasSideEffects = 1,
+isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr;
+
+def : Pat<(eh_return),
+          (EH_RETURN_JMPR (i32 R31))>;
+
+def : Pat<(HexagonBR_JT (i32 IntRegs:$dst)),
+          (JMPR (i32 IntRegs:$dst))>;
+
+def : Pat<(brind (i32 IntRegs:$dst)),
+          (JMPR (i32 IntRegs:$dst))>;
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+///
+// Load -- MEMri operand
+multiclass LD_MEMri_Pbase<string mnemonic, RegisterClass RC,
+                          bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : LDInst2<(outs RC:$dst),
+                       (ins PredRegs:$src1, MEMri:$addr),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($addr)",
+            []>;
+}
+
+multiclass LD_MEMri_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : LD_MEMri_Pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : LD_MEMri_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, neverHasSideEffects = 1 in
+multiclass LD_MEMri<string mnemonic, string CextOp, RegisterClass RC,
+                    bits<5> ImmBits, bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let opExtendable = 2, isExtentSigned = 1, opExtentBits = ImmBits,
+        isPredicable = 1 in
+      def NAME : LDInst2<(outs RC:$dst), (ins MEMri:$addr),
+                   "$dst = "#mnemonic#"($addr)",
+                   []>;
+
+    let opExtendable = 3, isExtentSigned = 0, opExtentBits = PredImmBits,
+        isPredicated = 1 in {
+      defm Pt : LD_MEMri_Pred<mnemonic, RC, 0 >;
+      defm NotPt : LD_MEMri_Pred<mnemonic, RC, 1 >;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, isMEMri = "true" in {
+  let accessSize = ByteAccess in {
+    defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
+    defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
+ }
+
+  let accessSize = HalfWordAccess in {
+    defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
+    defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
+ }
+
+  let accessSize = WordAccess in
+    defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
+}
+
+def : Pat < (i32 (sextloadi8 ADDRriS11_0:$addr)),
+            (LDrib ADDRriS11_0:$addr) >;
+
+def : Pat < (i32 (zextloadi8 ADDRriS11_0:$addr)),
+            (LDriub ADDRriS11_0:$addr) >;
+
+def : Pat < (i32 (sextloadi16 ADDRriS11_1:$addr)),
+            (LDrih ADDRriS11_1:$addr) >;
+
+def : Pat < (i32 (zextloadi16 ADDRriS11_1:$addr)),
+            (LDriuh ADDRriS11_1:$addr) >;
+
+def : Pat < (i32 (load ADDRriS11_2:$addr)),
+            (LDriw ADDRriS11_2:$addr) >;
+
+def : Pat < (i64 (load ADDRriS11_3:$addr)),
+            (LDrid ADDRriS11_3:$addr) >;
+
+
+// Load - Base with Immediate offset addressing mode
+multiclass LD_Idxd_Pbase<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : LDInst2<(outs RC:$dst),
+                     (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($src2+#$src3)",
+            []>;
+}
+
+multiclass LD_Idxd_Pred<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : LD_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : LD_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, neverHasSideEffects = 1 in
+multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
+                   bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let opExtendable = 2, isExtentSigned = 1, opExtentBits = ImmBits,
+        isPredicable = 1, AddedComplexity = 20 in
+      def NAME : LDInst2<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
+                   "$dst = "#mnemonic#"($src1+#$offset)",
+                   []>;
+
+    let opExtendable = 3, isExtentSigned = 0, opExtentBits = PredImmBits,
+        isPredicated = 1 in {
+      defm Pt : LD_Idxd_Pred<mnemonic, RC, predImmOp, 0 >;
+      defm NotPt : LD_Idxd_Pred<mnemonic, RC, predImmOp, 1 >;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset in {
+  let accessSize = ByteAccess in {
+    defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
+                                  11, 6>, AddrModeRel;
+    defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
+                                   11, 6>, AddrModeRel;
+  }
+  let accessSize = HalfWordAccess in {
+    defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
+                                 12, 7>, AddrModeRel;
+    defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
+                                  12, 7>, AddrModeRel;
+  }
+  let accessSize = WordAccess in
+    defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
+                                 13, 8>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
+                                 14, 9>, AddrModeRel;
+}
+
+let AddedComplexity = 20 in {
+def : Pat < (i32 (sextloadi8 (add IntRegs:$src1, s11_0ExtPred:$offset))),
+            (LDrib_indexed IntRegs:$src1, s11_0ExtPred:$offset) >;
+
+def : Pat < (i32 (zextloadi8 (add IntRegs:$src1, s11_0ExtPred:$offset))),
+            (LDriub_indexed IntRegs:$src1, s11_0ExtPred:$offset) >;
+
+def : Pat < (i32 (sextloadi16 (add IntRegs:$src1, s11_1ExtPred:$offset))),
+            (LDrih_indexed IntRegs:$src1, s11_1ExtPred:$offset) >;
+
+def : Pat < (i32 (zextloadi16 (add IntRegs:$src1, s11_1ExtPred:$offset))),
+            (LDriuh_indexed IntRegs:$src1, s11_1ExtPred:$offset) >;
+
+def : Pat < (i32 (load (add IntRegs:$src1, s11_2ExtPred:$offset))),
+            (LDriw_indexed IntRegs:$src1, s11_2ExtPred:$offset) >;
+
+def : Pat < (i64 (load (add IntRegs:$src1, s11_3ExtPred:$offset))),
+            (LDrid_indexed IntRegs:$src1, s11_3ExtPred:$offset) >;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment load
+//===----------------------------------------------------------------------===//
+
+multiclass LD_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
+                            bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : LDInst2PI<(outs RC:$dst, IntRegs:$dst2),
+                       (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
+            [],
+            "$src2 = $dst2">;
+}
+
+multiclass LD_PostInc_Pred<string mnemonic, RegisterClass RC,
+                           Operand ImmOp, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : LD_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
+    // Predicate new
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
+    defm _cdn#NAME#_V4 : LD_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
+  }
+}
+
+multiclass LD_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp> {
+
+  let BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def NAME : LDInst2PI<(outs RC:$dst, IntRegs:$dst2),
+                         (ins IntRegs:$src1, ImmOp:$offset),
+                 "$dst = "#mnemonic#"($src1++#$offset)",
+                 [],
+                 "$src1 = $dst2">;
+
+    let isPredicated = 1 in {
+      defm Pt : LD_PostInc_Pred<mnemonic, RC, ImmOp, 0 >;
+      defm NotPt : LD_PostInc_Pred<mnemonic, RC, ImmOp, 1 >;
+    }
+  }
+}
+
+let hasCtrlDep = 1, neverHasSideEffects = 1, addrMode = PostInc in {
+  defm POST_LDrib : LD_PostInc<"memb", "LDrib", IntRegs, s4_0Imm>,
+                    PredNewRel;
+  defm POST_LDriub : LD_PostInc<"memub", "LDriub", IntRegs, s4_0Imm>,
+                    PredNewRel;
+  defm POST_LDrih : LD_PostInc<"memh", "LDrih", IntRegs, s4_1Imm>,
+                    PredNewRel;
+  defm POST_LDriuh : LD_PostInc<"memuh", "LDriuh", IntRegs, s4_1Imm>,
+                    PredNewRel;
+  defm POST_LDriw : LD_PostInc<"memw", "LDriw", IntRegs, s4_2Imm>,
+                    PredNewRel;
+  defm POST_LDrid : LD_PostInc<"memd", "LDrid", DoubleRegs, s4_3Imm>,
+                    PredNewRel;
+}
+
+def : Pat< (i32 (extloadi1 ADDRriS11_0:$addr)),
+           (i32 (LDrib ADDRriS11_0:$addr)) >;
+
+// Load byte any-extend.
+def : Pat < (i32 (extloadi8 ADDRriS11_0:$addr)),
+            (i32 (LDrib ADDRriS11_0:$addr)) >;
+
+// Indexed load byte any-extend.
+let AddedComplexity = 20 in
+def : Pat < (i32 (extloadi8 (add IntRegs:$src1, s11_0ImmPred:$offset))),
+            (i32 (LDrib_indexed IntRegs:$src1, s11_0ImmPred:$offset)) >;
+
+def : Pat < (i32 (extloadi16 ADDRriS11_1:$addr)),
+            (i32 (LDrih ADDRriS11_1:$addr))>;
+
+let AddedComplexity = 20 in
+def : Pat < (i32 (extloadi16 (add IntRegs:$src1, s11_1ImmPred:$offset))),
+            (i32 (LDrih_indexed IntRegs:$src1, s11_1ImmPred:$offset)) >;
+
+let AddedComplexity = 10 in
+def : Pat < (i32 (zextloadi1 ADDRriS11_0:$addr)),
+            (i32 (LDriub ADDRriS11_0:$addr))>;
+
+let AddedComplexity = 20 in
+def : Pat < (i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))),
+            (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>;
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+isPseudo = 1, Defs = [R10,R11,D5], neverHasSideEffects = 1 in
+def LDriw_pred : LDInst2<(outs PredRegs:$dst),
+            (ins MEMri:$addr),
+            "Error; should not emit",
+            []>;
+
+// Deallocate stack frame.
+let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in {
+  def DEALLOCFRAME : LDInst2<(outs), (ins),
+                     "deallocframe",
+                     []>;
+}
+
+// Load and unpack bytes to halfwords.
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH +
+//===----------------------------------------------------------------------===//
+// Multiply and use lower result.
+// Rd=+mpyi(Rs,#u8)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 8 in
+def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Ext:$src2),
+              "$dst =+ mpyi($src1, #$src2)",
+              [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                             u8ExtPred:$src2))]>;
+
+// Rd=-mpyi(Rs,#u8)
+def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
+              "$dst =- mpyi($src1, #$src2)",
+              [(set (i32 IntRegs:$dst), (ineg (mul (i32 IntRegs:$src1),
+                                                   u8ImmPred:$src2)))]>;
+
+// Rd=mpyi(Rs,#m9)
+// s9 is NOT the same as m9 - but it works.. so far.
+// Assembler maps to either Rd=+mpyi(Rs,#u8 or Rd=-mpyi(Rs,#u8)
+// depending on the value of m9. See Arch Spec.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
+CextOpcode = "MPYI", InputType = "imm" in
+def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Ext:$src2),
+              "$dst = mpyi($src1, #$src2)",
+              [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                             s9ExtPred:$src2))]>, ImmRegRel;
+
+// Rd=mpyi(Rs,Rt)
+let CextOpcode = "MPYI", InputType = "reg" in
+def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+           "$dst = mpyi($src1, $src2)",
+           [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                          (i32 IntRegs:$src2)))]>, ImmRegRel;
+
+// Rx+=mpyi(Rs,#u8)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8,
+CextOpcode = "MPYI_acc", InputType = "imm" in
+def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3),
+            "$dst += mpyi($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), u8ExtPred:$src3),
+                       (i32 IntRegs:$src1)))],
+            "$src1 = $dst">, ImmRegRel;
+
+// Rx+=mpyi(Rs,Rt)
+let CextOpcode = "MPYI_acc", InputType = "reg" in
+def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst += mpyi($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                       (i32 IntRegs:$src1)))],
+            "$src1 = $dst">, ImmRegRel;
+
+// Rx-=mpyi(Rs,#u8)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8 in
+def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3),
+            "$dst -= mpyi($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+                                                 u8ExtPred:$src3)))],
+            "$src1 = $dst">;
+
+// Multiply and use upper result.
+// Rd=mpy(Rs,Rt.H):<<1:rnd:sat
+// Rd=mpy(Rs,Rt.L):<<1:rnd:sat
+// Rd=mpy(Rs,Rt)
+def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          "$dst = mpy($src1, $src2)",
+          [(set (i32 IntRegs:$dst), (mulhs (i32 IntRegs:$src1),
+                                           (i32 IntRegs:$src2)))]>;
+
+// Rd=mpy(Rs,Rt):rnd
+// Rd=mpyu(Rs,Rt)
+def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+           "$dst = mpyu($src1, $src2)",
+           [(set (i32 IntRegs:$dst), (mulhu (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+// Multiply and use full result.
+// Rdd=mpyu(Rs,Rt)
+def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = mpyu($src1, $src2)",
+             [(set (i64 DoubleRegs:$dst),
+                   (mul (i64 (anyext (i32 IntRegs:$src1))),
+                        (i64 (anyext (i32 IntRegs:$src2)))))]>;
+
+// Rdd=mpy(Rs,Rt)
+def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = mpy($src1, $src2)",
+             [(set (i64 DoubleRegs:$dst),
+                   (mul (i64 (sext (i32 IntRegs:$src1))),
+                        (i64 (sext (i32 IntRegs:$src2)))))]>;
+
+// Multiply and accumulate, use full result.
+// Rxx[+-]=mpy(Rs,Rt)
+// Rxx+=mpy(Rs,Rt)
+def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst += mpy($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+            (add (mul (i64 (sext (i32 IntRegs:$src2))),
+                      (i64 (sext (i32 IntRegs:$src3)))),
+                 (i64 DoubleRegs:$src1)))],
+            "$src1 = $dst">;
+
+// Rxx-=mpy(Rs,Rt)
+def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst -= mpy($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (sub (i64 DoubleRegs:$src1),
+                       (mul (i64 (sext (i32 IntRegs:$src2))),
+                            (i64 (sext (i32 IntRegs:$src3))))))],
+            "$src1 = $dst">;
+
+// Rxx[+-]=mpyu(Rs,Rt)
+// Rxx+=mpyu(Rs,Rt)
+def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                            IntRegs:$src2, IntRegs:$src3),
+             "$dst += mpyu($src2, $src3)",
+             [(set (i64 DoubleRegs:$dst),
+                   (add (mul (i64 (anyext (i32 IntRegs:$src2))),
+                             (i64 (anyext (i32 IntRegs:$src3)))),
+                        (i64 DoubleRegs:$src1)))], "$src1 = $dst">;
+
+// Rxx-=mpyu(Rs,Rt)
+def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst -= mpyu($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (sub (i64 DoubleRegs:$src1),
+                       (mul (i64 (anyext (i32 IntRegs:$src2))),
+                            (i64 (anyext (i32 IntRegs:$src3))))))],
+            "$src1 = $dst">;
+
+
+let InputType = "reg", CextOpcode = "ADD_acc" in
+def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, IntRegs:$src3),
+             "$dst += add($src2, $src3)",
+             [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3)),
+                                            (i32 IntRegs:$src1)))],
+             "$src1 = $dst">, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+InputType = "imm", CextOpcode = "ADD_acc" in
+def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, s8Ext:$src3),
+             "$dst += add($src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
+                                                 s8_16ExtPred:$src3),
+                                            (i32 IntRegs:$src1)))],
+             "$src1 = $dst">, ImmRegRel;
+
+let CextOpcode = "SUB_acc", InputType = "reg" in
+def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, IntRegs:$src3),
+             "$dst -= add($src2, $src3)",
+             [(set (i32 IntRegs:$dst),
+                   (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
+                                                  (i32 IntRegs:$src3))))],
+             "$src1 = $dst">, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+CextOpcode = "SUB_acc", InputType = "imm" in
+def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, s8Ext:$src3),
+             "$dst -= add($src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1),
+                                            (add (i32 IntRegs:$src2),
+                                                 s8_16ExtPred:$src3)))],
+             "$src1 = $dst">, ImmRegRel;
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VB +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VB -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VH  +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VH  -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+// Store doubleword.
+
+//===----------------------------------------------------------------------===//
+// Post increment store
+//===----------------------------------------------------------------------===//
+
+multiclass ST_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
+                            bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : STInst2PI<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2++#$offset) = $src3",
+            [],
+            "$src2 = $dst">;
+}
+
+multiclass ST_PostInc_Pred<string mnemonic, RegisterClass RC,
+                           Operand ImmOp, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
+    // Predicate new
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
+    defm _cdn#NAME#_V4 : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
+  }
+}
+
+let hasCtrlDep = 1, isNVStorable = 1, neverHasSideEffects = 1 in
+multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp> {
+
+  let hasCtrlDep = 1, BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def NAME : STInst2PI<(outs IntRegs:$dst),
+                (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+                mnemonic#"($src1++#$offset) = $src2",
+                [],
+                "$src1 = $dst">;
+
+    let isPredicated = 1 in {
+      defm Pt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 0 >;
+      defm NotPt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 1 >;
+    }
+  }
+}
+
+defm POST_STbri: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
+defm POST_SThri: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
+defm POST_STwri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
+
+let isNVStorable = 0 in
+defm POST_STdri: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm>, AddrModeRel;
+
+def : Pat<(post_truncsti8 (i32 IntRegs:$src1), IntRegs:$src2,
+                           s4_3ImmPred:$offset),
+          (POST_STbri IntRegs:$src2, s4_0ImmPred:$offset, IntRegs:$src1)>;
+
+def : Pat<(post_truncsti16 (i32 IntRegs:$src1), IntRegs:$src2,
+                            s4_3ImmPred:$offset),
+          (POST_SThri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>;
+
+def : Pat<(post_store (i32 IntRegs:$src1), IntRegs:$src2, s4_2ImmPred:$offset),
+          (POST_STwri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>;
+
+def : Pat<(post_store (i64 DoubleRegs:$src1), IntRegs:$src2,
+                       s4_3ImmPred:$offset),
+          (POST_STdri IntRegs:$src2, s4_3ImmPred:$offset, DoubleRegs:$src1)>;
+
+//===----------------------------------------------------------------------===//
+// multiclass for the store instructions with MEMri operand.
+//===----------------------------------------------------------------------===//
+multiclass ST_MEMri_Pbase<string mnemonic, RegisterClass RC, bit isNot,
+                          bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, MEMri:$addr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($addr) = $src2",
+            []>;
+}
+
+multiclass ST_MEMri_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_MEMri_Pbase<mnemonic, RC, PredNot, 0>;
+
+    // Predicate new
+    let validSubTargets = HasV4SubT, Predicates = [HasV4T] in
+    defm _cdn#NAME#_V4 : ST_MEMri_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isNVStorable = 1, neverHasSideEffects = 1 in
+multiclass ST_MEMri<string mnemonic, string CextOp, RegisterClass RC,
+                    bits<5> ImmBits, bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+         isPredicable = 1 in
+    def NAME : STInst2<(outs),
+            (ins MEMri:$addr, RC:$src),
+            mnemonic#"($addr) = $src",
+            []>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
+        isPredicated = 1 in {
+      defm Pt : ST_MEMri_Pred<mnemonic, RC, 0>;
+      defm NotPt : ST_MEMri_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, isMEMri = "true" in {
+  let accessSize = ByteAccess in
+    defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess, isNVStorable = 0 in
+    defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
+}
+
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr),
+          (STrib ADDRriS11_0:$addr, (i32 IntRegs:$src1))>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), ADDRriS11_1:$addr),
+          (STrih ADDRriS11_1:$addr, (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i32 IntRegs:$src1), ADDRriS11_2:$addr),
+          (STriw ADDRriS11_2:$addr, (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i64 DoubleRegs:$src1), ADDRriS11_3:$addr),
+          (STrid ADDRriS11_3:$addr, (i64 DoubleRegs:$src1))>;
+
+
+//===----------------------------------------------------------------------===//
+// multiclass for the store instructions with base+immediate offset
+// addressing mode
+//===----------------------------------------------------------------------===//
+multiclass ST_Idxd_Pbase<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC: $src4),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+#$src3) = $src4",
+            []>;
+}
+
+multiclass ST_Idxd_Pred<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit PredNot> {
+  let isPredicatedFalse = PredNot, isPredicated = 1 in {
+    defm _c#NAME : ST_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 0>;
+
+    // Predicate new
+    let validSubTargets = HasV4SubT, Predicates = [HasV4T] in
+    defm _cdn#NAME#_V4 : ST_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isNVStorable = 1, neverHasSideEffects = 1 in
+multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
+                   bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+         isPredicable = 1 in
+    def NAME : STInst2<(outs),
+            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+            mnemonic#"($src1+#$src2) = $src3",
+            []>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits in {
+      defm Pt : ST_Idxd_Pred<mnemonic, RC, predImmOp, 0>;
+      defm NotPt : ST_Idxd_Pred<mnemonic, RC, predImmOp, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, InputType = "reg" in {
+  let accessSize = ByteAccess in
+    defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
+                                  u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
+                                  u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
+
+  let accessSize = WordAccess in
+    defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
+                                  u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
+
+  let accessSize = DoubleWordAccess, isNVStorable = 0 in
+    defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+                                  u6_3Ext, 14, 9>, AddrModeRel;
+}
+
+let AddedComplexity = 10 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), (add IntRegs:$src2,
+                                                  s11_0ExtPred:$offset)),
+          (STrib_indexed IntRegs:$src2, s11_0ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), (add IntRegs:$src2,
+                                                   s11_1ExtPred:$offset)),
+          (STrih_indexed IntRegs:$src2, s11_1ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i32 IntRegs:$src1), (add IntRegs:$src2,
+                                           s11_2ExtPred:$offset)),
+          (STriw_indexed IntRegs:$src2, s11_2ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i64 DoubleRegs:$src1), (add IntRegs:$src2,
+                                              s11_3ExtPred:$offset)),
+          (STrid_indexed IntRegs:$src2, s11_3ImmPred:$offset,
+                         (i64 DoubleRegs:$src1))>;
+}
+
+// memh(Rx++#s4:1)=Rt.H
+
+// Store word.
+// Store predicate.
+let Defs = [R10,R11,D5], neverHasSideEffects = 1 in
+def STriw_pred : STInst2<(outs),
+            (ins MEMri:$addr, PredRegs:$src1),
+            "Error; should not emit",
+            []>;
+
+// Allocate stack frame.
+let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in {
+  def ALLOCFRAME : STInst2<(outs),
+             (ins i32imm:$amt),
+             "allocframe(#$amt)",
+             []>;
+}
+//===----------------------------------------------------------------------===//
+// ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/ALU +
+//===----------------------------------------------------------------------===//
+// Logical NOT.
+def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+               "$dst = not($src1)",
+               [(set (i64 DoubleRegs:$dst), (not (i64 DoubleRegs:$src1)))]>;
+
+
+// Sign extend word to doubleword.
+def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+           "$dst = sxtw($src1)",
+           [(set (i64 DoubleRegs:$dst), (sext (i32 IntRegs:$src1)))]>;
+//===----------------------------------------------------------------------===//
+// STYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT +
+//===----------------------------------------------------------------------===//
+// clrbit.
+def CLRBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = clrbit($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+                                           (not
+                                              (shl 1, u5ImmPred:$src2))))]>;
+
+def CLRBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = clrbit($src1, #$src2)",
+            []>;
+
+// Map from r0 = and(r1, 2147483647) to r0 = clrbit(r1, #31).
+def : Pat <(and (i32 IntRegs:$src1), 2147483647),
+      (CLRBIT_31 (i32 IntRegs:$src1), 31)>;
+
+// setbit.
+def SETBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = setbit($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+                                          (shl 1, u5ImmPred:$src2)))]>;
+
+// Map from r0 = or(r1, -2147483648) to r0 = setbit(r1, #31).
+def SETBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = setbit($src1, #$src2)",
+            []>;
+
+def : Pat <(or (i32 IntRegs:$src1), -2147483648),
+      (SETBIT_31 (i32 IntRegs:$src1), 31)>;
+
+// togglebit.
+def TOGBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = setbit($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1),
+                                          (shl 1, u5ImmPred:$src2)))]>;
+
+// Map from r0 = xor(r1, -2147483648) to r0 = togglebit(r1, #31).
+def TOGBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = togglebit($src1, #$src2)",
+            []>;
+
+def : Pat <(xor (i32 IntRegs:$src1), -2147483648),
+      (TOGBIT_31 (i32 IntRegs:$src1), 31)>;
+
+// Predicate transfer.
+let neverHasSideEffects = 1 in
+def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1),
+               "$dst = $src1  /* Should almost never emit this. */",
+               []>;
+
+def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1),
+               "$dst = $src1  /* Should almost never emit this. */",
+               [(set (i1 PredRegs:$dst), (trunc (i32 IntRegs:$src1)))]>;
+//===----------------------------------------------------------------------===//
+// STYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+// Shift by immediate.
+def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+             "$dst = asr($src1, #$src2)",
+             [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
+                                            u5ImmPred:$src2))]>;
+
+def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+              "$dst = asr($src1, #$src2)",
+              [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
+                                                u6ImmPred:$src2))]>;
+
+def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          "$dst = asl($src1, #$src2)",
+          [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+                                         u5ImmPred:$src2))]>;
+
+def ASLd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+              "$dst = asl($src1, #$src2)",
+              [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+                                                u6ImmPred:$src2))]>;
+
+def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+             "$dst = lsr($src1, #$src2)",
+             [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
+                                            u5ImmPred:$src2))]>;
+
+def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+              "$dst = lsr($src1, #$src2)",
+              [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
+                                                u6ImmPred:$src2))]>;
+
+// Shift by immediate and add.
+let AddedComplexity = 100 in
+def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                             u3Imm:$src3),
+             "$dst = addasl($src1, $src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
+                                       (shl (i32 IntRegs:$src2),
+                                            u3ImmPred:$src3)))]>;
+
+// Shift by register.
+def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = asl($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = asr($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def LSL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = lsl($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = lsr($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def ASLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+           "$dst = asl($src1, $src2)",
+           [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+                                             (i32 IntRegs:$src2)))]>;
+
+def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+           "$dst = lsl($src1, $src2)",
+           [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+                                             (i32 IntRegs:$src2)))]>;
+
+def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                 IntRegs:$src2),
+              "$dst = asr($src1, $src2)",
+              [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
+                                                (i32 IntRegs:$src2)))]>;
+
+def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                 IntRegs:$src2),
+              "$dst = lsr($src1, $src2)",
+              [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
+                                                (i32 IntRegs:$src2)))]>;
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VH +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VW +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VW -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/USER +
+//===----------------------------------------------------------------------===//
+def SDHexagonBARRIER: SDTypeProfile<0, 0, []>;
+def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER,
+                           [SDNPHasChain]>;
+
+let hasSideEffects = 1, isSolo = 1 in
+def BARRIER : SYSInst<(outs), (ins),
+                     "barrier",
+                     [(HexagonBARRIER)]>;
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER -
+//===----------------------------------------------------------------------===//
+
+// TFRI64 - assembly mapped.
+let isReMaterializable = 1 in
+def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1),
+             "$dst = #$src1",
+             [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>;
+
+// Pseudo instruction to encode a set of conditional transfers.
+// This instruction is used instead of a mux and trades-off codesize
+// for performance. We conduct this transformation optimistically in
+// the hope that these instructions get promoted to dot-new transfers.
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                                        IntRegs:$src2,
+                                                        IntRegs:$src3),
+                     "Error; should not emit",
+                     [(set (i32 IntRegs:$dst),
+                           (i32 (select (i1 PredRegs:$src1),
+                                        (i32 IntRegs:$src2),
+                                        (i32 IntRegs:$src3))))]>;
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3),
+            "Error; should not emit",
+            [(set (i32 IntRegs:$dst),
+             (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+                          s12ImmPred:$src3)))]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3),
+            "Error; should not emit",
+            [(set (i32 IntRegs:$dst),
+             (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
+                          (i32 IntRegs:$src3))))]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst),
+                              (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3),
+                     "Error; should not emit",
+                     [(set (i32 IntRegs:$dst),
+                           (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
+                                        s12ImmPred:$src3)))]>;
+
+// Generate frameindex addresses.
+let isReMaterializable = 1 in
+def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1),
+             "$dst = add($src1)",
+             [(set (i32 IntRegs:$dst), ADDRri:$src1)]>;
+
+//
+// CR - Type.
+//
+let neverHasSideEffects = 1, Defs = [SA0, LC0] in {
+def LOOP0_i : CRInst<(outs), (ins brtarget:$offset, u10Imm:$src2),
+                      "loop0($offset, #$src2)",
+                      []>;
+}
+
+let neverHasSideEffects = 1, Defs = [SA0, LC0] in {
+def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2),
+                      "loop0($offset, $src2)",
+                      []>;
+}
+
+let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1,
+    Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
+                       ":endloop0",
+                       []>;
+}
+
+// Support for generating global address.
+// Taken from X86InstrInfo.td.
+def SDTHexagonCONST32 : SDTypeProfile<1, 1, [
+                                            SDTCisVT<0, i32>,
+                                            SDTCisVT<1, i32>,
+                                            SDTCisPtrTy<0>]>;
+def HexagonCONST32 : SDNode<"HexagonISD::CONST32",     SDTHexagonCONST32>;
+def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP",     SDTHexagonCONST32>;
+
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
+                  "$dst.l = #LO($global)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HI : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
+                  "$dst.h = #HI($global)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
+                  "$dst.l = #LO($imm_value)",
+                  []>;
+
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
+                  "$dst.h = #HI($imm_value)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+                  "$dst.l = #LO($jt)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+                  "$dst.h = #HI($jt)",
+                  []>;
+
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+                  "$dst.l = #LO($label)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1 , neverHasSideEffects = 1 in
+def HI_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+                  "$dst.h = #HI($label)",
+                  []>;
+
+// This pattern is incorrect. When we add small data, we should change
+// this pattern to use memw(#foo).
+// This is for sdata.
+let isMoveImm = 1 in
+def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+              "$dst = CONST32(#$global)",
+              [(set (i32 IntRegs:$dst),
+                    (load (HexagonCONST32 tglobaltlsaddr:$global)))]>;
+
+// This is for non-sdata.
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
+                  "$dst = CONST32(#$global)",
+                  [(set (i32 IntRegs:$dst),
+                        (HexagonCONST32 tglobaladdr:$global))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_set_jt : LDInst2<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+                     "$dst = CONST32(#$jt)",
+                     [(set (i32 IntRegs:$dst),
+                           (HexagonCONST32 tjumptable:$jt))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
+                    "$dst = CONST32(#$global)",
+                    [(set (i32 IntRegs:$dst),
+                          (HexagonCONST32_GP tglobaladdr:$global))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global),
+                       "$dst = CONST32(#$global)",
+                       [(set (i32 IntRegs:$dst), imm:$global) ]>;
+
+// Map BlockAddress lowering to CONST32_Int_Real
+def : Pat<(HexagonCONST32_GP tblockaddress:$addr),
+          (CONST32_Int_Real tblockaddress:$addr)>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label),
+                    "$dst = CONST32($label)",
+                    [(set (i32 IntRegs:$dst), (HexagonCONST32 bbl:$label))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST64_Int_Real : LDInst2<(outs DoubleRegs:$dst), (ins i64imm:$global),
+                       "$dst = CONST64(#$global)",
+                       [(set (i64 DoubleRegs:$dst), imm:$global) ]>;
+
+def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins),
+                  "$dst = xor($dst, $dst)",
+                  [(set (i1 PredRegs:$dst), 0)]>;
+
+def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+       "$dst = mpy($src1, $src2)",
+       [(set (i32 IntRegs:$dst),
+             (trunc (i64 (srl (i64 (mul (i64 (sext (i32 IntRegs:$src1))),
+                                        (i64 (sext (i32 IntRegs:$src2))))),
+                              (i32 32)))))]>;
+
+// Pseudo instructions.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+
+def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                    [SDNPHasChain, SDNPOutGlue]>;
+
+def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def call : SDNode<"HexagonISD::CALL", SDT_SPCall,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+// For tailcalls a HexagonTCRet SDNode has 3 SDNode Properties - a chain,
+// Optional Flag and Variable Arguments.
+// Its 1 Operand has pointer type.
+def HexagonTCRet    : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
+                     [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29] in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                        "Should never be emitted",
+                        [(callseq_start timm:$amt)]>;
+}
+
+let Defs = [R29, R30, R31], Uses = [R29] in {
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                      "Should never be emitted",
+                      [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+// Call subroutine.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+          R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALL : JInst<(outs), (ins calltarget:$dst),
+             "call $dst", []>;
+}
+
+// Call subroutine from register.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+          R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALLR : JRInst<(outs), (ins IntRegs:$dst),
+              "callr $dst",
+              []>;
+ }
+
+
+// Indirect tail-call.
+let isCodeGenOnly = 1, isCall = 1, isReturn = 1  in
+def TCRETURNR : T_JMPr;
+
+// Direct tail-calls.
+let isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+isTerminator = 1, isCodeGenOnly = 1 in {
+  def TCRETURNtg   : T_JMP<(ins calltarget:$dst)>;
+  def TCRETURNtext : T_JMP<(ins calltarget:$dst)>;
+}
+
+// Map call instruction.
+def : Pat<(call (i32 IntRegs:$dst)),
+      (CALLR (i32 IntRegs:$dst))>, Requires<[HasV2TOnly]>;
+def : Pat<(call tglobaladdr:$dst),
+      (CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>;
+def : Pat<(call texternalsym:$dst),
+      (CALL texternalsym:$dst)>, Requires<[HasV2TOnly]>;
+//Tail calls.
+def : Pat<(HexagonTCRet tglobaladdr:$dst),
+      (TCRETURNtg tglobaladdr:$dst)>;
+def : Pat<(HexagonTCRet texternalsym:$dst),
+      (TCRETURNtext texternalsym:$dst)>;
+def : Pat<(HexagonTCRet (i32 IntRegs:$dst)),
+      (TCRETURNR (i32 IntRegs:$dst))>;
+
+// Atomic load and store support
+// 8 bit atomic load
+def : Pat<(atomic_load_8 ADDRriS11_0:$src1),
+          (i32 (LDriub ADDRriS11_0:$src1))>;
+
+def : Pat<(atomic_load_8 (add (i32 IntRegs:$src1), s11_0ImmPred:$offset)),
+          (i32 (LDriub_indexed (i32 IntRegs:$src1), s11_0ImmPred:$offset))>;
+
+// 16 bit atomic load
+def : Pat<(atomic_load_16 ADDRriS11_1:$src1),
+          (i32 (LDriuh ADDRriS11_1:$src1))>;
+
+def : Pat<(atomic_load_16 (add (i32 IntRegs:$src1), s11_1ImmPred:$offset)),
+          (i32 (LDriuh_indexed (i32 IntRegs:$src1), s11_1ImmPred:$offset))>;
+
+def : Pat<(atomic_load_32 ADDRriS11_2:$src1),
+          (i32 (LDriw ADDRriS11_2:$src1))>;
+
+def : Pat<(atomic_load_32 (add (i32 IntRegs:$src1), s11_2ImmPred:$offset)),
+          (i32 (LDriw_indexed (i32 IntRegs:$src1), s11_2ImmPred:$offset))>;
+
+// 64 bit atomic load
+def : Pat<(atomic_load_64 ADDRriS11_3:$src1),
+          (i64 (LDrid ADDRriS11_3:$src1))>;
+
+def : Pat<(atomic_load_64 (add (i32 IntRegs:$src1), s11_3ImmPred:$offset)),
+          (i64 (LDrid_indexed (i32 IntRegs:$src1), s11_3ImmPred:$offset))>;
+
+
+def : Pat<(atomic_store_8 ADDRriS11_0:$src2, (i32 IntRegs:$src1)),
+          (STrib ADDRriS11_0:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_8 (add (i32 IntRegs:$src2), s11_0ImmPred:$offset),
+                          (i32 IntRegs:$src1)),
+          (STrib_indexed (i32 IntRegs:$src2), s11_0ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+
+def : Pat<(atomic_store_16 ADDRriS11_1:$src2, (i32 IntRegs:$src1)),
+          (STrih ADDRriS11_1:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_16 (i32 IntRegs:$src1),
+                          (add (i32 IntRegs:$src2), s11_1ImmPred:$offset)),
+          (STrih_indexed (i32 IntRegs:$src2), s11_1ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_32 ADDRriS11_2:$src2, (i32 IntRegs:$src1)),
+          (STriw ADDRriS11_2:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_32 (add (i32 IntRegs:$src2), s11_2ImmPred:$offset),
+                           (i32 IntRegs:$src1)),
+          (STriw_indexed (i32 IntRegs:$src2), s11_2ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+
+
+
+def : Pat<(atomic_store_64 ADDRriS11_3:$src2, (i64 DoubleRegs:$src1)),
+          (STrid ADDRriS11_3:$src2, (i64 DoubleRegs:$src1))>;
+
+def : Pat<(atomic_store_64 (add (i32 IntRegs:$src2), s11_3ImmPred:$offset),
+                           (i64 DoubleRegs:$src1)),
+          (STrid_indexed (i32 IntRegs:$src2), s11_3ImmPred:$offset,
+                         (i64 DoubleRegs:$src1))>;
+
+// Map from r0 = and(r1, 65535) to r0 = zxth(r1)
+def : Pat <(and (i32 IntRegs:$src1), 65535),
+      (ZXTH (i32 IntRegs:$src1))>;
+
+// Map from r0 = and(r1, 255) to r0 = zxtb(r1).
+def : Pat <(and (i32 IntRegs:$src1), 255),
+      (ZXTB (i32 IntRegs:$src1))>;
+
+// Map Add(p1, true) to p1 = not(p1).
+//     Add(p1, false) should never be produced,
+//     if it does, it got to be mapped to NOOP.
+def : Pat <(add (i1 PredRegs:$src1), -1),
+      (NOT_p (i1 PredRegs:$src1))>;
+
+// Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) =>
+//   p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1).
+// cmp.lt(r0, r1) -> cmp.gt(r1, r0)
+def : Pat <(select (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                   (i32 IntRegs:$src3),
+                   (i32 IntRegs:$src4)),
+      (i32 (TFR_condset_rr (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)),
+                           (i32 IntRegs:$src4), (i32 IntRegs:$src3)))>,
+      Requires<[HasV2TOnly]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def : Pat <(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ImmPred:$src3),
+      (i32 (TFR_condset_ii (i1 PredRegs:$src1), s8ImmPred:$src3,
+                           s8ImmPred:$src2))>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = TFR_condset_ri(p0, r1, #i)
+def : Pat <(select (not (i1 PredRegs:$src1)), s12ImmPred:$src2,
+                   (i32 IntRegs:$src3)),
+      (i32 (TFR_condset_ri (i1 PredRegs:$src1), (i32 IntRegs:$src3),
+                           s12ImmPred:$src2))>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = TFR_condset_ir(p0, #i, r1)
+def : Pat <(select (not (i1 PredRegs:$src1)), IntRegs:$src2, s12ImmPred:$src3),
+      (i32 (TFR_condset_ir (i1 PredRegs:$src1), s12ImmPred:$src3,
+                           (i32 IntRegs:$src2)))>;
+
+// Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
+def : Pat <(brcond (not (i1 PredRegs:$src1)), bb:$offset),
+      (JMP_f (i1 PredRegs:$src1), bb:$offset)>;
+
+// Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2).
+def : Pat <(and (i1 PredRegs:$src1), (not (i1 PredRegs:$src2))),
+      (i1 (AND_pnotp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
+
+
+let AddedComplexity = 100 in
+def : Pat <(i64 (zextloadi1 (HexagonCONST32 tglobaladdr:$global))),
+      (i64 (COMBINE_rr (TFRI 0),
+                       (LDriub_indexed (CONST32_set tglobaladdr:$global), 0)))>,
+      Requires<[NoV4T]>;
+
+// Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned.
+let AddedComplexity = 10 in
+def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)),
+      (i32 (AND_rr (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo).
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)),
+      (i64 (SXTW (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg))))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)).
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)),
+      (i64 (SXTW (i32 (SXTH (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                 subreg_loreg))))))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)).
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)),
+      (i64 (SXTW (i32 (SXTB (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                 subreg_loreg))))))>;
+
+// We want to prevent emitting pnot's as much as possible.
+// Map brcond with an unsupported setcc to a JMP_f.
+def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                        bb:$offset),
+      (JMP_f (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+                bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)),
+                        bb:$offset),
+      (JMP_f (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset),
+      (JMP_f (i1 PredRegs:$src1), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset),
+      (JMP_t (i1 PredRegs:$src1), bb:$offset)>;
+
+// cmp.lt(Rs, Imm) -> !cmp.ge(Rs, Imm) -> !cmp.gt(Rs, Imm-1)
+def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)),
+                        bb:$offset),
+      (JMP_f (CMPGTri (i32 IntRegs:$src1),
+                (DEC_CONST_SIGNED s8ImmPred:$src2)), bb:$offset)>;
+
+// cmp.lt(r0, r1) -> cmp.gt(r1, r0)
+def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                        bb:$offset),
+      (JMP_t (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+                   bb:$offset),
+      (JMP_f (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)),
+                   bb:$offset)>;
+
+def : Pat <(brcond (i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                        bb:$offset),
+      (JMP_f (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+                bb:$offset)>;
+
+def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+                   bb:$offset),
+      (JMP_f (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+                bb:$offset)>;
+
+// Map from a 64-bit select to an emulated 64-bit mux.
+// Hexagon does not support 64-bit MUXes; so emulate with combines.
+def : Pat <(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2),
+                   (i64 DoubleRegs:$src3)),
+      (i64 (COMBINE_rr (i32 (MUX_rr (i1 PredRegs:$src1),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                         subreg_hireg)),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
+                                                         subreg_hireg)))),
+                       (i32 (MUX_rr (i1 PredRegs:$src1),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                         subreg_loreg)),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
+                                                         subreg_loreg))))))>;
+
+// Map from a 1-bit select to logical ops.
+// From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3).
+def : Pat <(select (i1 PredRegs:$src1), (i1 PredRegs:$src2),
+                   (i1 PredRegs:$src3)),
+      (OR_pp (AND_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)),
+             (AND_pp (NOT_p (i1 PredRegs:$src1)), (i1 PredRegs:$src3)))>;
+
+// Map Pd = load(addr) -> Rs = load(addr); Pd = Rs.
+def : Pat<(i1 (load ADDRriS11_2:$addr)),
+      (i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>;
+
+// Map for truncating from 64 immediates to 32 bit immediates.
+def : Pat<(i32 (trunc (i64 DoubleRegs:$src))),
+      (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg))>;
+
+// Map for truncating from i64 immediates to i1 bit immediates.
+def :  Pat<(i1 (trunc (i64 DoubleRegs:$src))),
+       (i1 (TFR_PdRs (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                          subreg_loreg))))>;
+
+// Map memb(Rs) = Rdd -> memb(Rs) = Rt.
+def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+
+// Map memh(Rs) = Rdd -> memh(Rs) = Rt.
+def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+// Map memw(Rs) = Rdd -> memw(Rs) = Rt
+def : Pat<(truncstorei32 (i64  DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+
+// Map memw(Rs) = Rdd -> memw(Rs) = Rt.
+def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+
+// Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0.
+def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
+      (STrib ADDRriS11_2:$addr, (TFRI 1))>;
+
+
+// Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0.
+def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
+      (STrib ADDRriS11_2:$addr, (TFRI 1))>;
+
+// Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt.
+def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr),
+      (STrib ADDRriS11_2:$addr, (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0)) )>;
+
+// Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs).
+// Hexagon_TODO: We can probably use combine but that will cost 2 instructions.
+// Better way to do this?
+def : Pat<(i64 (anyext (i32 IntRegs:$src1))),
+      (i64 (SXTW (i32 IntRegs:$src1)))>;
+
+// Map cmple -> cmpgt.
+// rs <= rt -> !(rs > rt).
+def : Pat<(i1 (setle (i32 IntRegs:$src1), s10ExtPred:$src2)),
+      (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), s10ExtPred:$src2)))>;
+
+// rs <= rt -> !(rs > rt).
+def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (CMPGTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
+
+// Rss <= Rtt -> !(Rss > Rtt).
+def : Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (CMPGT64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
+
+// Map cmpne -> cmpeq.
+// Hexagon_TODO: We should improve on this.
+// rs != rt -> !(rs == rt).
+def : Pat <(i1 (setne (i32 IntRegs:$src1), s10ExtPred:$src2)),
+      (i1 (NOT_p(i1 (CMPEQri (i32 IntRegs:$src1), s10ExtPred:$src2))))>;
+
+// Map cmpne(Rs) -> !cmpeqe(Rs).
+// rs != rt -> !(rs == rt).
+def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>;
+
+// Convert setne back to xor for hexagon since we compute w/ pred registers.
+def : Pat <(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))),
+      (i1 (XOR_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
+
+// Map cmpne(Rss) -> !cmpew(Rss).
+// rs != rt -> !(rs == rt).
+def : Pat <(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPEHexagon4rr (i64 DoubleRegs:$src1),
+                                     (i64 DoubleRegs:$src2)))))>;
+
+// Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>;
+
+// cmpge(Rs, Imm) -> cmpgt(Rs, Imm-1)
+def : Pat <(i1 (setge (i32 IntRegs:$src1), s8ExtPred:$src2)),
+      (i1 (CMPGTri (i32 IntRegs:$src1), (DEC_CONST_SIGNED s8ExtPred:$src2)))>;
+
+// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
+// rss >= rtt -> !(rtt > rss).
+def : Pat <(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPGT64rr (i64 DoubleRegs:$src2),
+                                (i64 DoubleRegs:$src1)))))>;
+
+// Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm).
+// !cmpge(Rs, Imm) -> !cmpgt(Rs, Imm-1).
+// rs < rt -> !(rs >= rt).
+def : Pat <(i1 (setlt (i32 IntRegs:$src1), s8ExtPred:$src2)),
+      (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), (DEC_CONST_SIGNED s8ExtPred:$src2))))>;
+
+// Map cmplt(Rs, Rt) -> cmpgt(Rt, Rs).
+// rs < rt -> rt > rs.
+// We can let assembler map it, or we can do in the compiler itself.
+def : Pat <(i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
+
+// Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss).
+// rss < rtt -> (rtt > rss).
+def : Pat <(i1 (setlt (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (CMPGT64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
+
+// Map from cmpltu(Rs, Rd) -> cmpgtu(Rd, Rs)
+// rs < rt -> rt > rs.
+// We can let assembler map it, or we can do in the compiler itself.
+def : Pat <(i1 (setult (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
+
+// Map from cmpltu(Rss, Rdd) -> cmpgtu(Rdd, Rss).
+// rs < rt -> rt > rs.
+def : Pat <(i1 (setult (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
+
+// Generate cmpgeu(Rs, #0) -> cmpeq(Rs, Rs)
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), 0)),
+      (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src1)))>;
+
+// Generate cmpgeu(Rs, #u8) -> cmpgtu(Rs, #u8 -1)
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), u8ExtPred:$src2)),
+      (i1 (CMPGTUri (i32 IntRegs:$src1), (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>;
+
+// Generate cmpgtu(Rs, #u9)
+def : Pat <(i1 (setugt (i32 IntRegs:$src1), u9ExtPred:$src2)),
+      (i1 (CMPGTUri (i32 IntRegs:$src1), u9ExtPred:$src2))>;
+
+// Map from Rs >= Rt -> !(Rt > Rs).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1))))>;
+
+// Map from Rs >= Rt -> !(Rt > Rs).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1))))>;
+
+// Map from cmpleu(Rs, Rt) -> !cmpgtu(Rs, Rt).
+// Map from (Rs <= Rt) -> !(Rs > Rt).
+def : Pat <(i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
+
+// Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1).
+// Map from (Rs <= Rt) -> !(Rs > Rt).
+def : Pat <(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
+
+// Sign extends.
+// i1 -> i32
+def : Pat <(i32 (sext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))>;
+
+// i1 -> i64
+def : Pat <(i64 (sext (i1 PredRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI -1), (MUX_ii (i1 PredRegs:$src1), -1, 0)))>;
+
+// Convert sign-extended load back to load and sign extend.
+// i8 -> i64
+def:  Pat <(i64 (sextloadi8 ADDRriS11_0:$src1)),
+      (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>;
+
+// Convert any-extended load back to load and sign extend.
+// i8 -> i64
+def:  Pat <(i64 (extloadi8 ADDRriS11_0:$src1)),
+      (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>;
+
+// Convert sign-extended load back to load and sign extend.
+// i16 -> i64
+def:  Pat <(i64 (sextloadi16 ADDRriS11_1:$src1)),
+      (i64 (SXTW (LDrih ADDRriS11_1:$src1)))>;
+
+// Convert sign-extended load back to load and sign extend.
+// i32 -> i64
+def:  Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)),
+      (i64 (SXTW (LDriw ADDRriS11_2:$src1)))>;
+
+
+// Zero extends.
+// i1 -> i32
+def : Pat <(i32 (zext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+
+// i1 -> i64
+def : Pat <(i64 (zext (i1 PredRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>,
+      Requires<[NoV4T]>;
+
+// i32 -> i64
+def : Pat <(i64 (zext (i32 IntRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>,
+      Requires<[NoV4T]>;
+
+// i8 -> i64
+def:  Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// i1 -> i64
+def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// i16 -> i64
+def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriuh_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// i32 -> i64
+def:  Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 100 in
+def:  Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 10 in
+def:  Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)),
+      (i32 (LDriw ADDRriS11_0:$src1))>;
+
+// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
+def : Pat <(i32 (zext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+
+// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
+def : Pat <(i32 (anyext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+
+// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
+def : Pat <(i64 (anyext (i1 PredRegs:$src1))),
+      (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))))>;
+
+
+let AddedComplexity = 100 in
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zextloadi32 (i32 (add IntRegs:$src2,
+                                         s11_2ExtPred:$offset2)))))),
+        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        (LDriw_indexed IntRegs:$src2,
+                                       s11_2ExtPred:$offset2)))>;
+
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zextloadi32 ADDRriS11_2:$srcLow)))),
+        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        (LDriw ADDRriS11_2:$srcLow)))>;
+
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zext (i32 IntRegs:$srcLow))))),
+        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        IntRegs:$srcLow))>;
+
+let AddedComplexity = 100 in
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zextloadi32 (i32 (add IntRegs:$src2,
+                                         s11_2ExtPred:$offset2)))))),
+        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        (LDriw_indexed IntRegs:$src2,
+                                       s11_2ExtPred:$offset2)))>;
+
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zextloadi32 ADDRriS11_2:$srcLow)))),
+        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        (LDriw ADDRriS11_2:$srcLow)))>;
+
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zext (i32 IntRegs:$srcLow))))),
+        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        IntRegs:$srcLow))>;
+
+// Any extended 64-bit load.
+// anyext i32 -> i64
+def:  Pat <(i64 (extloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[NoV4T]>;
+
+// When there is an offset we should prefer the pattern below over the pattern above.
+// The complexity of the above is 13 (gleaned from HexagonGenDAGIsel.inc)
+// So this complexity below is comfortably higher to allow for choosing the below.
+// If this is not done then we generate addresses such as
+// ********************************************
+//        r1 = add (r0, #4)
+//        r1 = memw(r1 + #0)
+//  instead of
+//        r1 = memw(r0 + #4)
+// ********************************************
+let AddedComplexity = 100 in
+def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// anyext i16 -> i64.
+def:  Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDrih_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs).
+def : Pat<(i64 (zext (i32 IntRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>,
+      Requires<[NoV4T]>;
+
+// Multiply 64-bit unsigned and use upper result.
+def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+      (i64
+       (MPYU64_acc
+        (i64
+         (COMBINE_rr
+          (TFRI 0),
+           (i32
+            (EXTRACT_SUBREG
+             (i64
+              (LSRd_ri
+               (i64
+                (MPYU64_acc
+                 (i64
+                  (MPYU64_acc
+                   (i64
+                    (COMBINE_rr (TFRI 0),
+                     (i32
+                      (EXTRACT_SUBREG
+                       (i64
+                        (LSRd_ri
+                         (i64
+                          (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                       subreg_loreg)),
+                                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                       subreg_loreg)))), 32)),
+                       subreg_loreg)))),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
+                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
+               32)), subreg_loreg)))),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
+
+// Multiply 64-bit signed and use upper result.
+def : Pat <(mulhs (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+      (i64
+       (MPY64_acc
+        (i64
+         (COMBINE_rr (TFRI 0),
+          (i32
+           (EXTRACT_SUBREG
+            (i64
+             (LSRd_ri
+              (i64
+               (MPY64_acc
+                (i64
+                 (MPY64_acc
+                  (i64
+                   (COMBINE_rr (TFRI 0),
+                    (i32
+                     (EXTRACT_SUBREG
+                      (i64
+                       (LSRd_ri
+                        (i64
+                         (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                      subreg_loreg)),
+                                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                      subreg_loreg)))), 32)),
+                      subreg_loreg)))),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
+                (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+                (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
+              32)), subreg_loreg)))),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
+
+// Hexagon specific ISD nodes.
+//def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>;
+def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2,
+                                  [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC",
+                                  SDTHexagonADJDYNALLOC>;
+// Needed to tag these instructions for stack layout.
+let usesCustomInserter = 1 in
+def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1,
+                                                     s16Imm:$src2),
+                  "$dst = add($src1, #$src2)",
+                  [(set (i32 IntRegs:$dst),
+                        (Hexagon_ADJDYNALLOC (i32 IntRegs:$src1),
+                                             s16ImmPred:$src2))]>;
+
+def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
+def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>;
+def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1),
+                "$dst = $src1",
+                [(set (i32 IntRegs:$dst),
+                      (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>;
+
+let AddedComplexity = 100 in
+def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)),
+      (COPY (i32 IntRegs:$src1))>;
+
+def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>;
+
+def : Pat<(HexagonWrapperJT tjumptable:$dst),
+          (i32 (CONST32_set_jt tjumptable:$dst))>;
+
+// XTYPE/SHIFT
+
+// Multi-class for logical operators :
+// Shift by immediate/register and accumulate/logical
+multiclass xtype_imm<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
+  def _ri : SInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
+            [(set (i32 IntRegs:$dst),
+                  (OpNode2 (i32 IntRegs:$src1),
+                           (OpNode1 (i32 IntRegs:$src2),
+                                    u5ImmPred:$src3)))],
+            "$src1 = $dst">;
+
+  def d_ri : SInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, u6Imm:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
+            [(set (i64 DoubleRegs:$dst), (OpNode2 (i64 DoubleRegs:$src1),
+                          (OpNode1 (i64 DoubleRegs:$src2), u6ImmPred:$src3)))],
+            "$src1 = $dst">;
+}
+
+// Multi-class for logical operators :
+// Shift by register and accumulate/logical (32/64 bits)
+multiclass xtype_reg<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
+  def _rr : SInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
+            [(set (i32 IntRegs:$dst),
+                  (OpNode2 (i32 IntRegs:$src1),
+                           (OpNode1 (i32 IntRegs:$src2),
+                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">;
+
+  def d_rr : SInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
+            [(set (i64 DoubleRegs:$dst),
+                  (OpNode2 (i64 DoubleRegs:$src1),
+                           (OpNode1 (i64 DoubleRegs:$src2),
+                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">;
+
+}
+
+multiclass basic_xtype_imm<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+  defm _ADD : xtype_imm< !strconcat("+= ", OpcStr), OpNode, add>;
+  defm _SUB : xtype_imm< !strconcat("-= ", OpcStr), OpNode, sub>;
+  defm _AND : xtype_imm< !strconcat("&= ", OpcStr), OpNode, and>;
+  defm _OR  : xtype_imm< !strconcat("|= ", OpcStr), OpNode, or>;
+}
+
+multiclass basic_xtype_reg<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+  defm _ADD : xtype_reg< !strconcat("+= ", OpcStr), OpNode, add>;
+  defm _SUB : xtype_reg< !strconcat("-= ", OpcStr), OpNode, sub>;
+  defm _AND : xtype_reg< !strconcat("&= ", OpcStr), OpNode, and>;
+  defm _OR  : xtype_reg< !strconcat("|= ", OpcStr), OpNode, or>;
+}
+
+multiclass xtype_xor_imm<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+  defm _XOR : xtype_imm< !strconcat("^= ", OpcStr), OpNode, xor>;
+}
+
+defm ASL : basic_xtype_imm<"asl", shl>, basic_xtype_reg<"asl", shl>,
+           xtype_xor_imm<"asl", shl>;
+
+defm LSR : basic_xtype_imm<"lsr", srl>, basic_xtype_reg<"lsr", srl>,
+           xtype_xor_imm<"lsr", srl>;
+
+defm ASR : basic_xtype_imm<"asr", sra>, basic_xtype_reg<"asr", sra>;
+defm LSL : basic_xtype_reg<"lsl", shl>;
+
+// Change the sign of the immediate for Rd=-mpyi(Rs,#u8)
+def : Pat <(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)),
+      (i32 (MPYI_rin (i32 IntRegs:$src1), u8ImmPred:$src2))>;
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV3.td"
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV5.td"
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
new file mode 100644
index 0000000..7e75554
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -0,0 +1,107 @@
+//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+def callv3 : SDNode<"HexagonISD::CALLv3", SDT_SPCall,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+def callv3nr : SDNode<"HexagonISD::CALLv3nr", SDT_SPCall,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+//===----------------------------------------------------------------------===//
+// J +
+//===----------------------------------------------------------------------===//
+// Call subroutine.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
+                P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALLv3 : JInst<(outs), (ins calltarget:$dst),
+             "call $dst", []>, Requires<[HasV3T]>;
+}
+
+//===----------------------------------------------------------------------===//
+// J -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+// Call subroutine from register.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
+                P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst),
+              "callr $dst",
+              []>, Requires<[HasV3TOnly]>;
+ }
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 200 in
+def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = max($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>,
+Requires<[HasV3T]>;
+
+let AddedComplexity = 200 in
+def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = min($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>,
+Requires<[HasV3T]>;
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+
+
+
+//def : Pat <(brcond (i1 (seteq (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setle (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setge (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setgt (i32 IntRegs:$src1), -1)), bb:$offset),
+//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+
+// Map call instruction
+def : Pat<(call (i32 IntRegs:$dst)),
+      (CALLRv3 (i32 IntRegs:$dst))>, Requires<[HasV3T]>;
+def : Pat<(call tglobaladdr:$dst),
+      (CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>;
+def : Pat<(call texternalsym:$dst),
+      (CALLv3 texternalsym:$dst)>, Requires<[HasV3T]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
new file mode 100644
index 0000000..db5b7ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -0,0 +1,3397 @@
+//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in
+class T_Immext<dag ins> :
+  EXTENDERInst<(outs), ins, "immext(#$imm)", []>,
+  Requires<[HasV4T]>;
+
+def IMMEXT_b : T_Immext<(ins brtarget:$imm)>;
+def IMMEXT_c : T_Immext<(ins calltarget:$imm)>;
+def IMMEXT_g : T_Immext<(ins globaladdress:$imm)>;
+def IMMEXT_i : T_Immext<(ins u26_6Imm:$imm)>;
+
+// Fold (add (CONST32 tglobaladdr:$addr) <offset>) into a global address.
+def FoldGlobalAddr : ComplexPattern<i32, 1, "foldGlobalAddress", [], []>;
+
+// Fold (add (CONST32_GP tglobaladdr:$addr) <offset>) into a global address.
+def FoldGlobalAddrGP : ComplexPattern<i32, 1, "foldGlobalAddressGP", [], []>;
+
+def NumUsesBelowThresCONST32 : PatFrag<(ops node:$addr),
+                                       (HexagonCONST32 node:$addr), [{
+  return hasNumUsesBelowThresGA(N->getOperand(0).getNode());
+}]>;
+
+// Hexagon V4 Architecture spec defines 8 instruction classes:
+// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the
+// compiler)
+
+// LD Instructions:
+// ========================================
+// Loads (8/16/32/64 bit)
+// Deallocframe
+
+// ST Instructions:
+// ========================================
+// Stores (8/16/32/64 bit)
+// Allocframe
+
+// ALU32 Instructions:
+// ========================================
+// Arithmetic / Logical (32 bit)
+// Vector Halfword
+
+// XTYPE Instructions (32/64 bit):
+// ========================================
+// Arithmetic, Logical, Bit Manipulation
+// Multiply (Integer, Fractional, Complex)
+// Permute / Vector Permute Operations
+// Predicate Operations
+// Shift / Shift with Add/Sub/Logical
+// Vector Byte ALU
+// Vector Halfword (ALU, Shift, Multiply)
+// Vector Word (ALU, Shift)
+
+// J Instructions:
+// ========================================
+// Jump/Call PC-relative
+
+// JR Instructions:
+// ========================================
+// Jump/Call Register
+
+// MEMOP Instructions:
+// ========================================
+// Operation on memory (8/16/32 bit)
+
+// NV Instructions:
+// ========================================
+// New-value Jumps
+// New-value Stores
+
+// CR Instructions:
+// ========================================
+// Control-Register Transfers
+// Hardware Loop Setup
+// Predicate Logicals & Reductions
+
+// SYSTEM Instructions (not implemented in the compiler):
+// ========================================
+// Prefetch
+// Cache Maintenance
+// Bus Operations
+
+
+//===----------------------------------------------------------------------===//
+// ALU32 +
+//===----------------------------------------------------------------------===//
+// Generate frame index addresses.
+let neverHasSideEffects = 1, isReMaterializable = 1,
+isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in
+def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s32Imm:$offset),
+            "$dst = add($src1, ##$offset)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=cmp.eq(Rs,#s8)
+let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2,
+isExtentSigned = 1, opExtentBits = 8 in
+def V4_A4_rcmpeqi : ALU32_ri<(outs IntRegs:$Rd),
+                    (ins IntRegs:$Rs, s8Ext:$s8),
+                    "$Rd = cmp.eq($Rs, #$s8)",
+                    [(set (i32 IntRegs:$Rd),
+                          (i32 (zext (i1 (seteq (i32 IntRegs:$Rs),
+                                                s8ExtPred:$s8)))))]>,
+                    Requires<[HasV4T]>;
+
+// Preserve the TSTBIT generation
+def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, (i32 IntRegs:$src2))),
+                                           (i32 IntRegs:$src1))), 0)))),
+      (i32 (MUX_ii (i1 (TSTBIT_rr (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                   1, 0))>;
+
+// Interfered with tstbit generation, above pattern preserves, see : tstbit.ll
+// Rd=cmp.ne(Rs,#s8)
+let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2,
+isExtentSigned = 1, opExtentBits = 8 in
+def V4_A4_rcmpneqi : ALU32_ri<(outs IntRegs:$Rd),
+                     (ins IntRegs:$Rs, s8Ext:$s8),
+                     "$Rd = !cmp.eq($Rs, #$s8)",
+                     [(set (i32 IntRegs:$Rd),
+                           (i32 (zext (i1 (setne (i32 IntRegs:$Rs),
+                                                 s8ExtPred:$s8)))))]>,
+                     Requires<[HasV4T]>;
+
+// Rd=cmp.eq(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def V4_A4_rcmpeq : ALU32_ri<(outs IntRegs:$Rd),
+                   (ins IntRegs:$Rs, IntRegs:$Rt),
+                   "$Rd = cmp.eq($Rs, $Rt)",
+                   [(set (i32 IntRegs:$Rd),
+                         (i32 (zext (i1 (seteq (i32 IntRegs:$Rs),
+                                               IntRegs:$Rt)))))]>,
+                   Requires<[HasV4T]>;
+
+// Rd=cmp.ne(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def V4_A4_rcmpneq : ALU32_ri<(outs IntRegs:$Rd),
+                    (ins IntRegs:$Rs, IntRegs:$Rt),
+                    "$Rd = !cmp.eq($Rs, $Rt)",
+                    [(set (i32 IntRegs:$Rd),
+                          (i32 (zext (i1 (setne (i32 IntRegs:$Rs),
+                                               IntRegs:$Rt)))))]>,
+                    Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// ALU32 -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+// Combine
+// Rdd=combine(Rs, #s8)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
+    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+def COMBINE_rI_V4 : ALU32_ri<(outs DoubleRegs:$dst),
+            (ins IntRegs:$src1, s8Ext:$src2),
+            "$dst = combine($src1, #$src2)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rdd=combine(#s8, Rs)
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
+    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+def COMBINE_Ir_V4 : ALU32_ir<(outs DoubleRegs:$dst),
+            (ins s8Ext:$src1, IntRegs:$src2),
+            "$dst = combine(#$src1, $src2)",
+            []>,
+            Requires<[HasV4T]>;
+
+def HexagonWrapperCombineRI_V4 :
+  SDNode<"HexagonISD::WrapperCombineRI_V4", SDTHexagonI64I32I32>;
+def HexagonWrapperCombineIR_V4 :
+  SDNode<"HexagonISD::WrapperCombineIR_V4", SDTHexagonI64I32I32>;
+
+def : Pat <(HexagonWrapperCombineRI_V4 IntRegs:$r, s8ExtPred:$i),
+           (COMBINE_rI_V4 IntRegs:$r, s8ExtPred:$i)>,
+          Requires<[HasV4T]>;
+
+def : Pat <(HexagonWrapperCombineIR_V4 s8ExtPred:$i, IntRegs:$r),
+           (COMBINE_Ir_V4 s8ExtPred:$i, IntRegs:$r)>,
+          Requires<[HasV4T]>;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 6,
+    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst),
+            (ins s8Imm:$src1, u6Ext:$src2),
+            "$dst = combine(#$src1, #$src2)",
+            []>,
+            Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for load instructions with Absolute set addressing mode.
+//===----------------------------------------------------------------------===//
+let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1,
+validSubTargets = HasV4SubT, addrMode = AbsoluteSet in
+class T_LD_abs_set<string mnemonic, RegisterClass RC>:
+            LDInst2<(outs RC:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = "#mnemonic#"($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+def LDrid_abs_set_V4  : T_LD_abs_set <"memd", DoubleRegs>;
+def LDrib_abs_set_V4  : T_LD_abs_set <"memb", IntRegs>;
+def LDriub_abs_set_V4 : T_LD_abs_set <"memub", IntRegs>;
+def LDrih_abs_set_V4  : T_LD_abs_set <"memh", IntRegs>;
+def LDriw_abs_set_V4  : T_LD_abs_set <"memw", IntRegs>;
+def LDriuh_abs_set_V4 : T_LD_abs_set <"memuh", IntRegs>;
+
+
+// multiclass for load instructions with base + register offset
+// addressing mode
+multiclass ld_idxd_shl_pbase<string mnemonic, RegisterClass RC, bit isNot,
+                             bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : LDInst2<(outs RC:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$offset)",
+            []>, Requires<[HasV4T]>;
+}
+
+multiclass ld_idxd_shl_pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ld_idxd_shl_pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ld_idxd_shl_pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let neverHasSideEffects  = 1 in
+multiclass ld_idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    let isPredicable = 1 in
+    def NAME#_V4 : LDInst2<(outs RC:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
+            "$dst = "#mnemonic#"($src1+$src2<<#$offset)",
+            []>, Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt_V4 : ld_idxd_shl_pred<mnemonic, RC, 0 >;
+      defm NotPt_V4 : ld_idxd_shl_pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseRegOffset in {
+  let accessSize = ByteAccess in {
+    defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>,
+                                        AddrModeRel;
+    defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>,
+                                        AddrModeRel;
+  }
+  let accessSize = HalfWordAccess in {
+    defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
+    defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>,
+                             AddrModeRel;
+  }
+  let accessSize = WordAccess in
+     defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>,
+                             AddrModeRel;
+}
+
+// 'def pats' for load instructions with base + register offset and non-zero
+// immediate value. Immediate value is used to left-shift the second
+// register operand.
+let AddedComplexity = 40 in {
+def : Pat <(i32 (sextloadi8 (add IntRegs:$src1,
+                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDrib_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi8 (add IntRegs:$src1,
+                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi8 (add IntRegs:$src1,
+                                (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (sextloadi16 (add IntRegs:$src1,
+                                  (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDrih_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi16 (add IntRegs:$src1,
+                                  (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi16 (add IntRegs:$src1,
+                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (load (add IntRegs:$src1,
+                           (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriw_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i64 (load (add IntRegs:$src1,
+                           (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDrid_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+}
+
+
+// 'def pats' for load instruction base + register offset and
+// zero immediate value.
+let AddedComplexity = 10 in {
+def : Pat <(i64 (load (add IntRegs:$src1, IntRegs:$src2))),
+           (LDrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (sextloadi8 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi8 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi8 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (sextloadi16 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi16 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi16 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (load (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+}
+
+// zext i1->i64
+def : Pat <(i64 (zext (i1 PredRegs:$src1))),
+      (i64 (COMBINE_Ir_V4 0, (MUX_ii (i1 PredRegs:$src1), 1, 0)))>,
+      Requires<[HasV4T]>;
+
+// zext i32->i64
+def : Pat <(i64 (zext (i32 IntRegs:$src1))),
+      (i64 (COMBINE_Ir_V4 0, (i32 IntRegs:$src1)))>,
+      Requires<[HasV4T]>;
+// zext i8->i64
+def:  Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// zext i1->i64
+def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// zext i16->i64
+def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh ADDRriS11_1:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// anyext i16->i64
+def:  Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDrih ADDRriS11_2:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDrih_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// zext i32->i64
+def:  Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 100 in
+def:  Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// anyext i32->i64
+def:  Pat <(i64 (extloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 100 in
+def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+// Template class for store instructions with Absolute set addressing mode.
+//===----------------------------------------------------------------------===//
+let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT,
+addrMode = AbsoluteSet in
+class T_ST_abs_set<string mnemonic, RegisterClass RC>:
+            STInst2<(outs IntRegs:$dst1),
+            (ins RC:$src1, u0AlwaysExt:$src2),
+            mnemonic#"($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+def STrid_abs_set_V4 : T_ST_abs_set <"memd", DoubleRegs>;
+def STrib_abs_set_V4 : T_ST_abs_set <"memb", IntRegs>;
+def STrih_abs_set_V4 : T_ST_abs_set <"memh", IntRegs>;
+def STriw_abs_set_V4 : T_ST_abs_set <"memw", IntRegs>;
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + register offset addressing
+// mode
+//===----------------------------------------------------------------------===//
+multiclass ST_Idxd_shl_Pbase<string mnemonic, RegisterClass RC, bit isNot,
+                             bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
+                 RC:$src5),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+$src3<<#$src4) = $src5",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Idxd_shl_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_Idxd_shl_Pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Idxd_shl_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isNVStorable = 1 in
+multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    let isPredicable = 1 in
+    def NAME#_V4 : STInst2<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, RC:$src4),
+            mnemonic#"($src1+$src2<<#$src3) = $src4",
+            []>,
+            Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt_V4 : ST_Idxd_shl_Pred<mnemonic, RC, 0 >;
+      defm NotPt_V4 : ST_Idxd_shl_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+// multiclass for new-value store instructions with base + register offset
+// addressing mode.
+multiclass ST_Idxd_shl_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot,
+                             bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
+                 RC:$src5),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+$src3<<#$src4) = $src5.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Idxd_shl_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_Idxd_shl_Pbase_nv<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Idxd_shl_Pbase_nv<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_Idxd_shl_nv<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    let isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, RC:$src4),
+            mnemonic#"($src1+$src2<<#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt : ST_Idxd_shl_Pred_nv<mnemonic, RC, 0 >;
+      defm NotPt : ST_Idxd_shl_Pred_nv<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseRegOffset, neverHasSideEffects = 1,
+validSubTargets = HasV4SubT in {
+  let accessSize = ByteAccess in
+    defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
+                            ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
+                            ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
+                            ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+
+  let isNVStorable = 0, accessSize = DoubleWordAccess in
+    defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity = 10 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src4),
+                       (add IntRegs:$src1, (shl IntRegs:$src2,
+                                                u2ImmPred:$src3))),
+          (STrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, IntRegs:$src4)>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src4),
+                        (add IntRegs:$src1, (shl IntRegs:$src2,
+                                                 u2ImmPred:$src3))),
+          (STrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, IntRegs:$src4)>;
+
+def : Pat<(store (i32 IntRegs:$src4),
+                 (add IntRegs:$src1, (shl IntRegs:$src2, u2ImmPred:$src3))),
+          (STriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, IntRegs:$src4)>;
+
+def : Pat<(store (i64 DoubleRegs:$src4),
+                (add IntRegs:$src1, (shl IntRegs:$src2, u2ImmPred:$src3))),
+          (STrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, DoubleRegs:$src4)>;
+}
+
+let isExtended = 1, opExtendable = 2 in
+class T_ST_LongOff <string mnemonic, PatFrag stOp, RegisterClass RC, ValueType VT> :
+            STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, RC:$src4),
+            mnemonic#"($src1<<#$src2+##$src3) = $src4",
+            [(stOp (VT RC:$src4),
+                    (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+                         u0AlwaysExtPred:$src3))]>,
+            Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 2, mayStore = 1, isNVStore = 1 in
+class T_ST_LongOff_nv <string mnemonic> :
+            NVInst_V4<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            mnemonic#"($src1<<#$src2+##$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+
+multiclass ST_LongOff <string mnemonic, string BaseOp, PatFrag stOp> {
+  let  BaseOpcode = BaseOp#"_shl" in {
+    let isNVStorable = 1 in
+    def NAME#_V4 : T_ST_LongOff<mnemonic, stOp, IntRegs, i32>;
+
+    def NAME#_nv_V4 : T_ST_LongOff_nv<mnemonic>;
+  }
+}
+
+let AddedComplexity = 10, validSubTargets = HasV4SubT in {
+  def STrid_shl_V4 : T_ST_LongOff<"memd", store, DoubleRegs, i64>;
+  defm STrib_shl   : ST_LongOff <"memb", "STrib", truncstorei8>, NewValueRel;
+  defm STrih_shl   : ST_LongOff <"memh", "Strih", truncstorei16>, NewValueRel;
+  defm STriw_shl   : ST_LongOff <"memw", "STriw", store>, NewValueRel;
+}
+
+let AddedComplexity = 40 in
+multiclass T_ST_LOff_Pats <InstHexagon I, RegisterClass RC, ValueType VT,
+                           PatFrag stOp> {
+ def : Pat<(stOp (VT RC:$src4),
+           (add (shl IntRegs:$src1, u2ImmPred:$src2),
+               (NumUsesBelowThresCONST32 tglobaladdr:$src3))),
+           (I IntRegs:$src1, u2ImmPred:$src2, tglobaladdr:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+           (add IntRegs:$src1,
+               (NumUsesBelowThresCONST32 tglobaladdr:$src3))),
+           (I IntRegs:$src1, 0, tglobaladdr:$src3, RC:$src4)>;
+}
+
+defm : T_ST_LOff_Pats<STrid_shl_V4, DoubleRegs, i64, store>;
+defm : T_ST_LOff_Pats<STriw_shl_V4, IntRegs, i32, store>;
+defm : T_ST_LOff_Pats<STrib_shl_V4, IntRegs, i32, truncstorei8>;
+defm : T_ST_LOff_Pats<STrih_shl_V4, IntRegs, i32, truncstorei16>;
+
+// memd(Rx++#s4:3)=Rtt
+// memd(Rx++#s4:3:circ(Mu))=Rtt
+// memd(Rx++I:circ(Mu))=Rtt
+// memd(Rx++Mu)=Rtt
+// memd(Rx++Mu:brev)=Rtt
+// memd(gp+#u16:3)=Rtt
+
+// Store doubleword conditionally.
+// if ([!]Pv[.new]) memd(#u6)=Rtt
+// TODO: needs to be implemented.
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + immediate offset
+// addressing mode and immediate stored value.
+// mem[bhw](Rx++#s4:3)=#s8
+// if ([!]Pv[.new]) mem[bhw](Rx++#s4:3)=#s6
+//===----------------------------------------------------------------------===//
+multiclass ST_Imm_Pbase<string mnemonic, Operand OffsetOp, bit isNot,
+                        bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, OffsetOp:$src3, s6Ext:$src4),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+#$src3) = #$src4",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Imm_Pred<string mnemonic, Operand OffsetOp, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_Imm_Pbase<mnemonic, OffsetOp, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Imm_Pbase<mnemonic, OffsetOp, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isExtentSigned = 1, neverHasSideEffects = 1 in
+multiclass ST_Imm<string mnemonic, string CextOp, Operand OffsetOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
+    let opExtendable = 2, opExtentBits = 8, isPredicable = 1 in
+    def NAME#_V4 : STInst2<(outs),
+            (ins IntRegs:$src1, OffsetOp:$src2, s8Ext:$src3),
+            mnemonic#"($src1+#$src2) = #$src3",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 3, opExtentBits = 6, isPredicated = 1 in {
+      defm Pt_V4 : ST_Imm_Pred<mnemonic, OffsetOp, 0>;
+      defm NotPt_V4 : ST_Imm_Pred<mnemonic, OffsetOp, 1 >;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, InputType = "imm",
+validSubTargets = HasV4SubT in {
+  let accessSize = ByteAccess in
+    defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
+
+  let accessSize = WordAccess in
+    defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity = 10 in {
+def: Pat<(truncstorei8 s8ExtPred:$src3, (add IntRegs:$src1, u6_0ImmPred:$src2)),
+            (STrib_imm_V4 IntRegs:$src1, u6_0ImmPred:$src2, s8ExtPred:$src3)>;
+
+def: Pat<(truncstorei16 s8ExtPred:$src3, (add IntRegs:$src1,
+                                              u6_1ImmPred:$src2)),
+            (STrih_imm_V4 IntRegs:$src1, u6_1ImmPred:$src2, s8ExtPred:$src3)>;
+
+def: Pat<(store s8ExtPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2)),
+            (STriw_imm_V4 IntRegs:$src1, u6_2ImmPred:$src2, s8ExtPred:$src3)>;
+}
+
+let AddedComplexity = 6 in
+def : Pat <(truncstorei8 s8ExtPred:$src2, (i32 IntRegs:$src1)),
+           (STrib_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
+           Requires<[HasV4T]>;
+
+// memb(Rx++#s4:0:circ(Mu))=Rt
+// memb(Rx++I:circ(Mu))=Rt
+// memb(Rx++Mu)=Rt
+// memb(Rx++Mu:brev)=Rt
+// memb(gp+#u16:0)=Rt
+
+
+// Store halfword.
+// TODO: needs to be implemented
+// memh(Re=#U6)=Rt.H
+// memh(Rs+#s11:1)=Rt.H
+let AddedComplexity = 6 in
+def : Pat <(truncstorei16 s8ExtPred:$src2, (i32 IntRegs:$src1)),
+           (STrih_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
+           Requires<[HasV4T]>;
+
+// memh(Rs+Ru<<#u2)=Rt.H
+// TODO: needs to be implemented.
+
+// memh(Ru<<#u2+#U6)=Rt.H
+// memh(Rx++#s4:1:circ(Mu))=Rt.H
+// memh(Rx++#s4:1:circ(Mu))=Rt
+// memh(Rx++I:circ(Mu))=Rt.H
+// memh(Rx++I:circ(Mu))=Rt
+// memh(Rx++Mu)=Rt.H
+// memh(Rx++Mu)=Rt
+// memh(Rx++Mu:brev)=Rt.H
+// memh(Rx++Mu:brev)=Rt
+// memh(gp+#u16:1)=Rt
+// if ([!]Pv[.new]) memh(#u6)=Rt.H
+// if ([!]Pv[.new]) memh(#u6)=Rt
+
+
+// if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt.H
+// TODO: needs to be implemented.
+
+// if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt.H
+// TODO: Needs to be implemented.
+
+// Store word.
+// memw(Re=#U6)=Rt
+// TODO: Needs to be implemented.
+
+// Store predicate:
+let neverHasSideEffects = 1 in
+def STriw_pred_V4 : STInst2<(outs),
+            (ins MEMri:$addr, PredRegs:$src1),
+            "Error; should not emit",
+            []>,
+            Requires<[HasV4T]>;
+
+let AddedComplexity = 6 in
+def : Pat <(store s8ExtPred:$src2, (i32 IntRegs:$src1)),
+           (STriw_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
+           Requires<[HasV4T]>;
+
+// memw(Rx++#s4:2)=Rt
+// memw(Rx++#s4:2:circ(Mu))=Rt
+// memw(Rx++I:circ(Mu))=Rt
+// memw(Rx++Mu)=Rt
+// memw(Rx++Mu:brev)=Rt
+
+//===----------------------------------------------------------------------===
+// ST -
+//===----------------------------------------------------------------------===
+
+
+//===----------------------------------------------------------------------===//
+// NV/ST +
+//===----------------------------------------------------------------------===//
+
+// multiclass for new-value store instructions with base + immediate offset.
+//
+multiclass ST_Idxd_Pbase_nv<string mnemonic, RegisterClass RC,
+                            Operand predImmOp, bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC: $src4),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Idxd_Pred_nv<string mnemonic, RegisterClass RC, Operand predImmOp,
+                           bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_Idxd_Pbase_nv<mnemonic, RC, predImmOp, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Idxd_Pbase_nv<mnemonic, RC, predImmOp, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1, neverHasSideEffects = 1, isExtendable = 1 in
+multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
+                   bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+    isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+            mnemonic#"($src1+#$src2) = $src3.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
+    isPredicated = 1 in {
+      defm Pt : ST_Idxd_Pred_nv<mnemonic, RC, predImmOp, 0>;
+      defm NotPt : ST_Idxd_Pred_nv<mnemonic, RC, predImmOp, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, validSubTargets = HasV4SubT in {
+  let accessSize = ByteAccess in
+    defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+                                   u6_0Ext, 11, 6>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+                                   u6_1Ext, 12, 7>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+                                   u6_2Ext, 13, 8>, AddrModeRel;
+}
+
+// multiclass for new-value store instructions with base + immediate offset.
+// and MEMri operand.
+multiclass ST_MEMri_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot,
+                          bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, MEMri:$addr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($addr) = $src2.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_MEMri_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 0>;
+
+    // Predicate new
+    defm _cdn#NAME : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1, isExtendable = 1, neverHasSideEffects = 1 in
+multiclass ST_MEMri_nv<string mnemonic, string CextOp, RegisterClass RC,
+                    bits<5> ImmBits, bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+         isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins MEMri:$addr, RC:$src),
+            mnemonic#"($addr) = $src.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
+        neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ST_MEMri_Pred_nv<mnemonic, RC, 0>;
+      defm NotPt : ST_MEMri_Pred_nv<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, isMEMri = "true", validSubTargets = HasV4SubT,
+mayStore = 1 in {
+  let accessSize = ByteAccess in
+    defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment store
+// mem[bhwd](Rx++#s4:[0123])=Nt.new
+//===----------------------------------------------------------------------===//
+
+multiclass ST_PostInc_Pbase_nv<string mnemonic, RegisterClass RC, Operand ImmOp,
+                            bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2++#$offset) = $src3.new",
+            [],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_PostInc_Pred_nv<string mnemonic, RegisterClass RC,
+                           Operand ImmOp, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 0>;
+    // Predicate new
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
+    defm _cdn#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 1>;
+  }
+}
+
+let hasCtrlDep = 1, isNVStore = 1, neverHasSideEffects = 1 in
+multiclass ST_PostInc_nv<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp> {
+
+  let BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
+                (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+                mnemonic#"($src1++#$offset) = $src2.new",
+                [],
+                "$src1 = $dst">,
+                Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 0 >;
+      defm NotPt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 1 >;
+    }
+  }
+}
+
+let addrMode = PostInc, validSubTargets = HasV4SubT in {
+defm POST_STbri: ST_PostInc_nv <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
+defm POST_SThri: ST_PostInc_nv <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
+defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
+}
+
+// memb(Rx++#s4:0:circ(Mu))=Nt.new
+// memb(Rx++I:circ(Mu))=Nt.new
+// memb(Rx++Mu)=Nt.new
+// memb(Rx++Mu:brev)=Nt.new
+// memh(Rx++#s4:1:circ(Mu))=Nt.new
+// memh(Rx++I:circ(Mu))=Nt.new
+// memh(Rx++Mu)=Nt.new
+// memh(Rx++Mu:brev)=Nt.new
+
+// memw(Rx++#s4:2:circ(Mu))=Nt.new
+// memw(Rx++I:circ(Mu))=Nt.new
+// memw(Rx++Mu)=Nt.new
+// memw(Rx++Mu:brev)=Nt.new
+
+//===----------------------------------------------------------------------===//
+// NV/ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NV/J +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps with the register
+// operands.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
+class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
+                      bit isNegCond, bit isTak>
+  : NVInst_V4<(outs),
+    (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+    "if ("#!if(isNegCond, "!","")#mnemonic#
+    "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
+    "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
+    #!if(isTak, "t","nt")#" $offset",
+    []>, Requires<[HasV4T]> {
+
+      bits<5> src1;
+      bits<5> src2;
+      bits<3> Ns;    // New-Value Operand
+      bits<5> RegOp; // Non-New-Value Operand
+      bits<11> offset;
+
+      let isTaken = isTak;
+      let isBrTaken = !if(isTaken, "true", "false");
+      let isPredicatedFalse = isNegCond;
+
+      let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
+      let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
+
+      let IClass = 0b0010;
+      let Inst{26} = 0b0;
+      let Inst{25-23} = majOp;
+      let Inst{22} = isNegCond;
+      let Inst{18-16} = Ns;
+      let Inst{13} = isTak;
+      let Inst{12-8} = RegOp;
+      let Inst{21-20} = offset{10-9};
+      let Inst{7-1} = offset{8-2};
+}
+
+
+multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
+                       bit isNegCond> {
+  // Branch not taken:
+  def _nt_V4: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
+  // Branch taken:
+  def _t_V4: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
+}
+
+// NvOpNum = 0 -> First Operand is a new-value Register
+// NvOpNum = 1 -> Second Operand is a new-value Register
+
+multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
+                       bit NvOpNum> {
+  let BaseOpcode = BaseOp#_NVJ in {
+    defm _t_Jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
+    defm _f_Jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
+  }
+}
+
+// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+  Defs = [PC], neverHasSideEffects = 1, validSubTargets = HasV4SubT in {
+  defm CMPEQrr  : NVJrr_base<"cmp.eq",  "CMPEQ",  0b000, 0>, PredRel;
+  defm CMPGTrr  : NVJrr_base<"cmp.gt",  "CMPGT",  0b001, 0>, PredRel;
+  defm CMPGTUrr : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
+  defm CMPLTrr  : NVJrr_base<"cmp.gt",  "CMPLT",  0b011, 1>, PredRel;
+  defm CMPLTUrr : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps instruction
+// with a register and an unsigned immediate (U5) operand.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
+class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
+                         bit isTak>
+  : NVInst_V4<(outs),
+    (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
+    "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
+    #!if(isTak, "t","nt")#" $offset",
+    []>, Requires<[HasV4T]> {
+
+      let isTaken = isTak;
+      let isPredicatedFalse = isNegCond;
+      let isBrTaken = !if(isTaken, "true", "false");
+
+      bits<3> src1;
+      bits<5> src2;
+      bits<11> offset;
+
+      let IClass = 0b0010;
+      let Inst{26} = 0b1;
+      let Inst{25-23} = majOp;
+      let Inst{22} = isNegCond;
+      let Inst{18-16} = src1;
+      let Inst{13} = isTak;
+      let Inst{12-8} = src2;
+      let Inst{21-20} = offset{10-9};
+      let Inst{7-1} = offset{8-2};
+}
+
+multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
+  // Branch not taken:
+  def _nt_V4: NVJri_template<mnemonic, majOp, isNegCond, 0>;
+  // Branch taken:
+  def _t_V4: NVJri_template<mnemonic, majOp, isNegCond, 1>;
+}
+
+multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
+  let BaseOpcode = BaseOp#_NVJri in {
+    defm _t_Jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
+    defm _f_Jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
+  }
+}
+
+// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+  Defs = [PC], neverHasSideEffects = 1, validSubTargets = HasV4SubT in {
+  defm CMPEQri  : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
+  defm CMPGTri  : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
+  defm CMPGTUri : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps instruction
+// with a register and an hardcoded 0/-1 immediate value.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 11 in
+class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
+                            bit isNegCond, bit isTak>
+  : NVInst_V4<(outs),
+    (ins IntRegs:$src1, brtarget:$offset),
+    "if ("#!if(isNegCond, "!","")#mnemonic
+    #"($src1.new, #"#ImmVal#")) jump:"
+    #!if(isTak, "t","nt")#" $offset",
+    []>, Requires<[HasV4T]> {
+
+      let isTaken = isTak;
+      let isPredicatedFalse = isNegCond;
+      let isBrTaken = !if(isTaken, "true", "false");
+
+      bits<3> src1;
+      bits<11> offset;
+      let IClass = 0b0010;
+      let Inst{26} = 0b1;
+      let Inst{25-23} = majOp;
+      let Inst{22} = isNegCond;
+      let Inst{18-16} = src1;
+      let Inst{13} = isTak;
+      let Inst{21-20} = offset{10-9};
+      let Inst{7-1} = offset{8-2};
+}
+
+multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
+                             bit isNegCond> {
+  // Branch not taken:
+  def _nt_V4: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
+  // Branch taken:
+  def _t_V4: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
+}
+
+multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
+                             string ImmVal> {
+  let BaseOpcode = BaseOp#_NVJ_ConstImm in {
+  defm _t_Jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True cond
+  defm _f_Jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False Cond
+  }
+}
+
+// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
+// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
+  Defs = [PC], neverHasSideEffects = 1 in {
+  defm TSTBIT0  : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
+  defm CMPEQn1  : NVJ_ConstImm_base<"cmp.eq", "CMPEQ",  0b100, "-1">, PredRel;
+  defm CMPGTn1  : NVJ_ConstImm_base<"cmp.gt", "CMPGT",  0b101, "-1">, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU +
+//===----------------------------------------------------------------------===//
+
+//  Add and accumulate.
+//  Rd=add(Rs,add(Ru,#s6))
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst),
+          (ins IntRegs:$src1, IntRegs:$src2, s6Ext:$src3),
+          "$dst = add($src1, add($src2, #$src3))",
+          [(set (i32 IntRegs:$dst),
+           (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
+                                          s6_16ExtPred:$src3)))]>,
+          Requires<[HasV4T]>;
+
+//  Rd=add(Rs,sub(#s6,Ru))
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst),
+          (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3),
+          "$dst = add($src1, sub(#$src2, $src3))",
+          [(set (i32 IntRegs:$dst),
+           (add (i32 IntRegs:$src1), (sub s6_10ExtPred:$src2,
+                                          (i32 IntRegs:$src3))))]>,
+          Requires<[HasV4T]>;
+
+// Generates the same instruction as ADDr_SUBri_V4 but matches different
+// pattern.
+//  Rd=add(Rs,sub(#s6,Ru))
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst),
+          (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3),
+          "$dst = add($src1, sub(#$src2, $src3))",
+          [(set (i32 IntRegs:$dst),
+                (sub (add (i32 IntRegs:$src1), s6_10ExtPred:$src2),
+                     (i32 IntRegs:$src3)))]>,
+          Requires<[HasV4T]>;
+
+
+//  Add or subtract doublewords with carry.
+//TODO:
+//  Rdd=add(Rss,Rtt,Px):carry
+//TODO:
+//  Rdd=sub(Rss,Rtt,Px):carry
+
+
+//  Logical doublewords.
+//  Rdd=and(Rtt,~Rss)
+let validSubTargets = HasV4SubT in
+def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          "$dst = and($src1, ~$src2)",
+          [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
+                                      (not (i64 DoubleRegs:$src2))))]>,
+          Requires<[HasV4T]>;
+
+//  Rdd=or(Rtt,~Rss)
+let validSubTargets = HasV4SubT in
+def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          "$dst = or($src1, ~$src2)",
+          [(set (i64 DoubleRegs:$dst),
+           (or (i64 DoubleRegs:$src1), (not (i64 DoubleRegs:$src2))))]>,
+          Requires<[HasV4T]>;
+
+
+//  Logical-logical doublewords.
+//  Rxx^=xor(Rss,Rtt)
+let validSubTargets = HasV4SubT in
+def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+          "$dst ^= xor($src2, $src3)",
+          [(set (i64 DoubleRegs:$dst),
+           (xor (i64 DoubleRegs:$src1), (xor (i64 DoubleRegs:$src2),
+                                             (i64 DoubleRegs:$src3))))],
+          "$src1 = $dst">,
+          Requires<[HasV4T]>;
+
+
+// Logical-logical words.
+// Rx=or(Ru,and(Rx,#s10))
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
+validSubTargets = HasV4SubT in
+def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
+            "$dst = or($src1, and($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                s10ExtPred:$src3)))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=and(Rs,Rt)
+// Rx&=and(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= and($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,Rt)
+let validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "reg" in
+def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= and($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx^=and(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= and($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+             (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=and(Rs,~Rt)
+// Rx&=and(Rs,~Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= and($src2, ~$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                 (not (i32 IntRegs:$src3)))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,~Rt)
+let validSubTargets = HasV4SubT in
+def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= and($src2, ~$src3)",
+            [(set (i32 IntRegs:$dst),
+             (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                           (not (i32 IntRegs:$src3)))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=and(Rs,~Rt)
+let validSubTargets = HasV4SubT in
+def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= and($src2, ~$src3)",
+            [(set (i32 IntRegs:$dst),
+             (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                            (not (i32 IntRegs:$src3)))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=or(Rs,Rt)
+// Rx&=or(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= or($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+                                                (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=or(Rs,Rt)
+let validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "reg" in
+def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= or($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+                                               (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx^=or(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= or($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+             (xor (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+                                           (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=xor(Rs,Rt)
+// Rx&=xor(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= xor($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=xor(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= xor($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=xor(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= xor($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+             (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,#s10)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
+validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "imm" in
+def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
+            "$dst |= and($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                s10ExtPred:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx|=or(Rs,#s10)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
+validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "imm" in
+def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
+            "$dst |= or($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                s10ExtPred:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+
+//    Modulo wrap
+//        Rd=modwrap(Rs,Rt)
+//    Round
+//        Rd=cround(Rs,#u5)
+//        Rd=cround(Rs,Rt)
+//        Rd=round(Rs,#u5)[:sat]
+//        Rd=round(Rs,Rt)[:sat]
+//    Vector reduce add unsigned halfwords
+//        Rd=vraddh(Rss,Rtt)
+//    Vector add bytes
+//        Rdd=vaddb(Rss,Rtt)
+//    Vector conditional negate
+//        Rdd=vcnegh(Rss,Rt)
+//        Rxx+=vrcnegh(Rss,Rt)
+//    Vector maximum bytes
+//        Rdd=vmaxb(Rtt,Rss)
+//    Vector reduce maximum halfwords
+//        Rxx=vrmaxh(Rss,Ru)
+//        Rxx=vrmaxuh(Rss,Ru)
+//    Vector reduce maximum words
+//        Rxx=vrmaxuw(Rss,Ru)
+//        Rxx=vrmaxw(Rss,Ru)
+//    Vector minimum bytes
+//        Rdd=vminb(Rtt,Rss)
+//    Vector reduce minimum halfwords
+//        Rxx=vrminh(Rss,Ru)
+//        Rxx=vrminuh(Rss,Ru)
+//    Vector reduce minimum words
+//        Rxx=vrminuw(Rss,Ru)
+//        Rxx=vrminw(Rss,Ru)
+//    Vector subtract bytes
+//        Rdd=vsubb(Rss,Rtt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY +
+//===----------------------------------------------------------------------===//
+
+// Multiply and user lower result.
+// Rd=add(#u6,mpyi(Rs,#U6))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst),
+            (ins u6Ext:$src1, IntRegs:$src2, u6Imm:$src3),
+            "$dst = add(#$src1, mpyi($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
+                       u6ExtPred:$src1))]>,
+            Requires<[HasV4T]>;
+
+// Rd=add(##,mpyi(Rs,#U6))
+def : Pat <(add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
+                     (HexagonCONST32 tglobaladdr:$src1)),
+           (i32 (ADDi_MPYri_V4 tglobaladdr:$src1, IntRegs:$src2,
+                               u6ImmPred:$src3))>;
+
+// Rd=add(#u6,mpyi(Rs,Rt))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6,
+validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in
+def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst),
+            (ins u6Ext:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst = add(#$src1, mpyi($src2, $src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                       u6ExtPred:$src1))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rd=add(##,mpyi(Rs,Rt))
+def : Pat <(add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                     (HexagonCONST32 tglobaladdr:$src1)),
+           (i32 (ADDi_MPYrr_V4 tglobaladdr:$src1, IntRegs:$src2,
+                               IntRegs:$src3))>;
+
+// Rd=add(Ru,mpyi(#u6:2,Rs))
+let validSubTargets = HasV4SubT in
+def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+            "$dst = add($src1, mpyi(#$src2, $src3))",
+            [(set (i32 IntRegs:$dst),
+             (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3),
+                                            u6_2ImmPred:$src2)))]>,
+            Requires<[HasV4T]>;
+
+// Rd=add(Ru,mpyi(Rs,#u6))
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 6,
+validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in
+def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u6Ext:$src3),
+            "$dst = add($src1, mpyi($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+                                                 u6ExtPred:$src3)))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx=add(Ru,mpyi(Rx,Rs))
+let validSubTargets = HasV4SubT, InputType = "reg", CextOpcode = "ADD_MPY" in
+def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst = add($src1, mpyi($src2, $src3))",
+            [(set (i32 IntRegs:$dst),
+             (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src3))))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+// Rxx^=pmpyw(Rs,Rt)
+
+// Vector reduce multiply word by signed half (32x16)
+// Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+// Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+// Rxx+=vrmpyweh(Rss,Rtt)[:<<1]
+// Rxx+=vrmpywoh(Rss,Rtt)[:<<1]
+
+// Multiply and use upper result
+// Rd=mpy(Rs,Rt.H):<<1:sat
+// Rd=mpy(Rs,Rt.L):<<1:sat
+// Rd=mpy(Rs,Rt):<<1
+// Rd=mpy(Rs,Rt):<<1:sat
+// Rd=mpysu(Rs,Rt)
+// Rx+=mpy(Rs,Rt):<<1:sat
+// Rx-=mpy(Rs,Rt):<<1:sat
+
+// Vector multiply bytes
+// Rdd=vmpybsu(Rs,Rt)
+// Rdd=vmpybu(Rs,Rt)
+// Rxx+=vmpybsu(Rs,Rt)
+// Rxx+=vmpybu(Rs,Rt)
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+// Rxx^=vpmpyh(Rs,Rt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+
+// Shift by immediate and accumulate.
+// Rx=add(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = add(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx=add(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = add(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx=sub(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = sub(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx=sub(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = sub(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+
+//Shift by immediate and logical.
+//Rx=and(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = and(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rx=and(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = and(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rx=or(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+AddedComplexity = 30, validSubTargets = HasV4SubT in
+def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = or(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                      u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rx=or(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+AddedComplexity = 30, validSubTargets = HasV4SubT in
+def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = or(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                      u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+
+//Shift by register.
+//Rd=lsl(#s6,Rt)
+let validSubTargets = HasV4SubT in {
+def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2),
+            "$dst = lsl(#$src1, $src2)",
+            [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1,
+                                           (i32 IntRegs:$src2)))]>,
+            Requires<[HasV4T]>;
+
+
+//Shift by register and logical.
+//Rxx^=asl(Rss,Rt)
+def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= asl($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (xor (i64 DoubleRegs:$src1), (shl (i64 DoubleRegs:$src2),
+                                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rxx^=asr(Rss,Rt)
+def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= asr($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (xor (i64 DoubleRegs:$src1), (sra (i64 DoubleRegs:$src2),
+                                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rxx^=lsl(Rss,Rt)
+def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= lsl($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
+                                              (shl (i64 DoubleRegs:$src2),
+                                                   (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rxx^=lsr(Rss,Rt)
+def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= lsr($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (xor (i64 DoubleRegs:$src1), (srl (i64 DoubleRegs:$src2),
+                                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+}
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MEMOP: Word, Half, Byte
+//===----------------------------------------------------------------------===//
+
+def MEMOPIMM : SDNodeXForm<imm, [{
+  // Call the transformation function XformM5ToU5Imm to get the negative
+  // immediate's positive counterpart.
+  int32_t imm = N->getSExtValue();
+  return XformM5ToU5Imm(imm);
+}]>;
+
+def MEMOPIMM_HALF : SDNodeXForm<imm, [{
+  // -1 .. -31 represented as 65535..65515
+  // assigning to a short restores our desired signed value.
+  // Call the transformation function XformM5ToU5Imm to get the negative
+  // immediate's positive counterpart.
+  int16_t imm = N->getSExtValue();
+  return XformM5ToU5Imm(imm);
+}]>;
+
+def MEMOPIMM_BYTE : SDNodeXForm<imm, [{
+  // -1 .. -31 represented as 255..235
+  // assigning to a char restores our desired signed value.
+  // Call the transformation function XformM5ToU5Imm to get the negative
+  // immediate's positive counterpart.
+  int8_t imm = N->getSExtValue();
+  return XformM5ToU5Imm(imm);
+}]>;
+
+def SETMEMIMM : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-31].
+   // As an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformMskToBitPosU5Imm(imm);
+}]>;
+
+def CLRMEMIMM : SDNodeXForm<imm, [{
+   // Return the bit position we will clear [0-31].
+   // As an SDNode.
+   // we bit negate the value first
+   int32_t imm = ~(N->getSExtValue());
+   return XformMskToBitPosU5Imm(imm);
+}]>;
+
+def SETMEMIMM_SHORT : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-15].
+   // As an SDNode.
+   int16_t imm = N->getSExtValue();
+   return XformMskToBitPosU4Imm(imm);
+}]>;
+
+def CLRMEMIMM_SHORT : SDNodeXForm<imm, [{
+   // Return the bit position we will clear [0-15].
+   // As an SDNode.
+   // we bit negate the value first
+   int16_t imm = ~(N->getSExtValue());
+   return XformMskToBitPosU4Imm(imm);
+}]>;
+
+def SETMEMIMM_BYTE : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-7].
+   // As an SDNode.
+   int8_t imm =  N->getSExtValue();
+   return XformMskToBitPosU3Imm(imm);
+}]>;
+
+def CLRMEMIMM_BYTE : SDNodeXForm<imm, [{
+   // Return the bit position we will clear [0-7].
+   // As an SDNode.
+   // we bit negate the value first
+   int8_t imm = ~(N->getSExtValue());
+   return XformMskToBitPosU3Imm(imm);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the register value.
+//===----------------------------------------------------------------------===//
+class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
+                     string memOp, bits<2> memOpBits> :
+      MEMInst_V4<(outs),
+                 (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
+                 opc#"($base+#$offset)"#memOp#"$delta",
+                 []>,
+                 Requires<[HasV4T, UseMEMOP]> {
+
+    bits<5> base;
+    bits<5> delta;
+    bits<32> offset;
+    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+                     !if (!eq(opcBits, 0b01), offset{6-1},
+                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
+
+    let IClass = 0b0011;
+    let Inst{27-24} = 0b1110;
+    let Inst{22-21} = opcBits;
+    let Inst{20-16} = base;
+    let Inst{13} = 0b0;
+    let Inst{12-7} = offsetBits;
+    let Inst{6-5} = memOpBits;
+    let Inst{4-0} = delta;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the immediate value.
+//===----------------------------------------------------------------------===//
+class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
+                     string memOp, bits<2> memOpBits> :
+      MEMInst_V4 <(outs),
+                  (ins IntRegs:$base, ImmOp:$offset, u5Imm:$delta),
+                  opc#"($base+#$offset)"#memOp#"#$delta"
+                  #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
+                  []>,
+                  Requires<[HasV4T, UseMEMOP]> {
+
+    bits<5> base;
+    bits<5> delta;
+    bits<32> offset;
+    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+                     !if (!eq(opcBits, 0b01), offset{6-1},
+                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
+
+    let IClass = 0b0011;
+    let Inst{27-24} = 0b1111;
+    let Inst{22-21} = opcBits;
+    let Inst{20-16} = base;
+    let Inst{13} = 0b0;
+    let Inst{12-7} = offsetBits;
+    let Inst{6-5} = memOpBits;
+    let Inst{4-0} = delta;
+}
+
+// multiclass to define MemOp instructions with register operand.
+multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
+  def _ADD#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
+  def _SUB#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
+  def _AND#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
+  def _OR#NAME#_V4  : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
+}
+
+// multiclass to define MemOp instructions with immediate Operand.
+multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
+  def _ADD#NAME#_V4 : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
+  def _SUB#NAME#_V4 : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
+  def _CLRBIT#NAME#_V4 : MemOp_ri_base<opc, opcBits, ImmOp, " =clrbit(", 0b10>;
+  def _SETBIT#NAME#_V4 : MemOp_ri_base<opc, opcBits, ImmOp, " =setbit(", 0b11>;
+}
+
+multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
+  defm r : MemOp_rr <opc, opcBits, ImmOp>;
+  defm i : MemOp_ri <opc, opcBits, ImmOp>;
+}
+
+// Define MemOp instructions.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0,
+validSubTargets =HasV4SubT in {
+  let opExtentBits = 6, accessSize = ByteAccess in
+  defm MemOPb : MemOp_base <"memb", 0b00, u6_0Ext>;
+
+  let opExtentBits = 7, accessSize = HalfWordAccess in
+  defm MemOPh : MemOp_base <"memh", 0b01, u6_1Ext>;
+
+  let opExtentBits = 8, accessSize = WordAccess in
+  defm MemOPw : MemOp_base <"memw", 0b10, u6_2Ext>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for ALU operations on the memory
+// Here value used for the ALU operation is an immediate value.
+// mem[bh](Rs+#0) += #U5
+// mem[bh](Rs+#u6) += #U5
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred,
+                          InstHexagon MI, SDNode OpNode> {
+  let AddedComplexity = 180 in
+  def : Pat < (stOp (OpNode (ldOp IntRegs:$addr), u5ImmPred:$addend),
+                    IntRegs:$addr),
+              (MI IntRegs:$addr, #0, u5ImmPred:$addend )>;
+
+  let AddedComplexity = 190 in
+  def : Pat <(stOp (OpNode (ldOp (add IntRegs:$base, ExtPred:$offset)),
+                     u5ImmPred:$addend),
+             (add IntRegs:$base, ExtPred:$offset)),
+       (MI IntRegs:$base, ExtPred:$offset, u5ImmPred:$addend)>;
+}
+
+multiclass MemOpi_u5ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred,
+                          InstHexagon addMI, InstHexagon subMI> {
+  defm : MemOpi_u5Pats<ldOp, stOp, ExtPred, addMI, add>;
+  defm : MemOpi_u5Pats<ldOp, stOp, ExtPred, subMI, sub>;
+}
+
+multiclass MemOpi_u5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Half Word
+  defm : MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred,
+                         MemOPh_ADDi_V4, MemOPh_SUBi_V4>;
+  // Byte
+  defm : MemOpi_u5ALUOp <ldOpByte, truncstorei8, u6ExtPred,
+                         MemOPb_ADDi_V4, MemOPb_SUBi_V4>;
+}
+
+let Predicates = [HasV4T, UseMEMOP] in {
+  defm : MemOpi_u5ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOpi_u5ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOpi_u5ExtType<extloadi8,  extloadi16>;  // any extend
+
+  // Word
+  defm : MemOpi_u5ALUOp <load, store, u6_2ExtPred, MemOPw_ADDi_V4,
+                         MemOPw_SUBi_V4>;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass to define 'Def Pats' for ALU operations on the memory.
+// Here value used for the ALU operation is a negative value.
+// mem[bh](Rs+#0) += #m5
+// mem[bh](Rs+#u6) += #m5
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
+                          PatLeaf immPred, ComplexPattern addrPred,
+                          SDNodeXForm xformFunc, InstHexagon MI> {
+  let AddedComplexity = 190 in
+  def : Pat <(stOp (add (ldOp IntRegs:$addr), immPred:$subend),
+                   IntRegs:$addr),
+             (MI IntRegs:$addr, #0, (xformFunc immPred:$subend) )>;
+
+  let AddedComplexity = 195 in
+  def : Pat<(stOp (add (ldOp (add IntRegs:$base, extPred:$offset)),
+                       immPred:$subend),
+                  (add IntRegs:$base, extPred:$offset)),
+            (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$subend))>;
+}
+
+multiclass MemOpi_m5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Half Word
+  defm : MemOpi_m5Pats <ldOpHalf, truncstorei16, u6_1ExtPred, m5HImmPred,
+                        ADDRriU6_1, MEMOPIMM_HALF, MemOPh_SUBi_V4>;
+  // Byte
+  defm : MemOpi_m5Pats <ldOpByte, truncstorei8, u6ExtPred, m5BImmPred,
+                        ADDRriU6_0, MEMOPIMM_BYTE, MemOPb_SUBi_V4>;
+}
+
+let Predicates = [HasV4T, UseMEMOP] in {
+  defm : MemOpi_m5ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOpi_m5ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOpi_m5ExtType<extloadi8,  extloadi16>;  // any extend
+
+  // Word
+  defm : MemOpi_m5Pats <load, store, u6_2ExtPred, m5ImmPred,
+                          ADDRriU6_2, MEMOPIMM, MemOPw_SUBi_V4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'def Pats' for bit operations on the memory.
+// mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
+// mem[bhw](Rs+#u6) = [clrbit|setbit](#U5)
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpi_bitPats <PatFrag ldOp, PatFrag stOp, PatLeaf immPred,
+                     PatLeaf extPred, ComplexPattern addrPred,
+                     SDNodeXForm xformFunc, InstHexagon MI, SDNode OpNode> {
+
+  // mem[bhw](Rs+#u6:[012]) = [clrbit|setbit](#U5)
+  let AddedComplexity = 250 in
+  def : Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
+                          immPred:$bitend),
+                  (add IntRegs:$base, extPred:$offset)),
+            (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$bitend))>;
+
+  // mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
+  let AddedComplexity = 225 in
+  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
+                           immPred:$bitend),
+                   (addrPred (i32 IntRegs:$addr), extPred:$offset)),
+             (MI IntRegs:$addr, extPred:$offset, (xformFunc immPred:$bitend))>;
+}
+
+multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Byte - clrbit
+  defm : MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u6ExtPred,
+                       ADDRriU6_0, CLRMEMIMM_BYTE, MemOPb_CLRBITi_V4, and>;
+  // Byte - setbit
+  defm : MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred,  u6ExtPred,
+                       ADDRriU6_0, SETMEMIMM_BYTE, MemOPb_SETBITi_V4, or>;
+  // Half Word - clrbit
+  defm : MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u6_1ExtPred,
+                       ADDRriU6_1, CLRMEMIMM_SHORT, MemOPh_CLRBITi_V4, and>;
+  // Half Word - setbit
+  defm : MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u6_1ExtPred,
+                       ADDRriU6_1, SETMEMIMM_SHORT, MemOPh_SETBITi_V4, or>;
+}
+
+let Predicates = [HasV4T, UseMEMOP] in {
+  // mem[bh](Rs+#0) = [clrbit|setbit](#U5)
+  // mem[bh](Rs+#u6:[01]) = [clrbit|setbit](#U5)
+  defm : MemOpi_bitExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOpi_bitExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOpi_bitExtType<extloadi8,  extloadi16>;  // any extend
+
+  // memw(Rs+#0) = [clrbit|setbit](#U5)
+  // memw(Rs+#u6:2) = [clrbit|setbit](#U5)
+  defm : MemOpi_bitPats<load, store, Clr5ImmPred, u6_2ExtPred, ADDRriU6_2,
+                       CLRMEMIMM, MemOPw_CLRBITi_V4, and>;
+  defm : MemOpi_bitPats<load, store, Set5ImmPred, u6_2ExtPred, ADDRriU6_2,
+                       SETMEMIMM, MemOPw_SETBITi_V4, or>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'def Pats' for ALU operations on the memory
+// where addend is a register.
+// mem[bhw](Rs+#0) [+-&|]= Rt
+// mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpr_Pats <PatFrag ldOp, PatFrag stOp, ComplexPattern addrPred,
+                     PatLeaf extPred, InstHexagon MI, SDNode OpNode> {
+  let AddedComplexity = 141 in
+  // mem[bhw](Rs+#0) [+-&|]= Rt
+  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
+                           (i32 IntRegs:$addend)),
+                   (addrPred (i32 IntRegs:$addr), extPred:$offset)),
+             (MI IntRegs:$addr, extPred:$offset, (i32 IntRegs:$addend) )>;
+
+  // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
+  let AddedComplexity = 150 in
+  def : Pat <(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
+                           (i32 IntRegs:$orend)),
+                   (add IntRegs:$base, extPred:$offset)),
+             (MI IntRegs:$base, extPred:$offset, (i32 IntRegs:$orend) )>;
+}
+
+multiclass MemOPr_ALUOp<PatFrag ldOp, PatFrag stOp,
+                        ComplexPattern addrPred, PatLeaf extPred,
+                        InstHexagon addMI, InstHexagon subMI,
+                        InstHexagon andMI, InstHexagon orMI > {
+
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, addMI, add>;
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, subMI, sub>;
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, andMI, and>;
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, orMI,  or>;
+}
+
+multiclass MemOPr_ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Half Word
+  defm : MemOPr_ALUOp <ldOpHalf, truncstorei16, ADDRriU6_1, u6_1ExtPred,
+                       MemOPh_ADDr_V4, MemOPh_SUBr_V4,
+                       MemOPh_ANDr_V4, MemOPh_ORr_V4>;
+  // Byte
+  defm : MemOPr_ALUOp <ldOpByte, truncstorei8, ADDRriU6_0, u6ExtPred,
+                       MemOPb_ADDr_V4, MemOPb_SUBr_V4,
+                       MemOPb_ANDr_V4, MemOPb_ORr_V4>;
+}
+
+// Define 'def Pats' for MemOps with register addend.
+let Predicates = [HasV4T, UseMEMOP] in {
+  // Byte, Half Word
+  defm : MemOPr_ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOPr_ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOPr_ExtType<extloadi8,  extloadi16>;  // any extend
+  // Word
+  defm : MemOPr_ALUOp <load, store, ADDRriU6_2, u6_2ExtPred, MemOPw_ADDr_V4,
+                       MemOPw_SUBr_V4, MemOPw_ANDr_V4, MemOPw_ORr_V4 >;
+}
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED +
+//===----------------------------------------------------------------------===//
+
+// Hexagon V4 only supports these flavors of byte/half compare instructions:
+// EQ/GT/GTU. Other flavors like GE/GEU/LT/LTU/LE/LEU are not supported by
+// hardware. However, compiler can still implement these patterns through
+// appropriate patterns combinations based on current implemented patterns.
+// The implemented patterns are: EQ/GT/GTU.
+// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
+
+// Following instruction is not being extended as it results into the
+// incorrect code for negative numbers.
+// Pd=cmpb.eq(Rs,#u8)
+
+// p=!cmp.eq(r1,r2)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotEQ_rr : ALU32_rr<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, IntRegs:$src2),
+      "$dst = !cmp.eq($src1, $src2)",
+      [(set (i1 PredRegs:$dst),
+            (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2)))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.eq(r1,#s10)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotEQ_ri : ALU32_ri<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, s10Ext:$src2),
+      "$dst = !cmp.eq($src1, #$src2)",
+      [(set (i1 PredRegs:$dst),
+            (setne (i32 IntRegs:$src1), s10ImmPred:$src2))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gt(r1,r2)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGT_rr : ALU32_rr<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, IntRegs:$src2),
+      "$dst = !cmp.gt($src1, $src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setgt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gt(r1,#s10)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGT_ri : ALU32_ri<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, s10Ext:$src2),
+      "$dst = !cmp.gt($src1, #$src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setgt (i32 IntRegs:$src1), s10ImmPred:$src2)))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gtu(r1,r2)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGTU_rr : ALU32_rr<(outs PredRegs:$dst),
+                            (ins IntRegs:$src1, IntRegs:$src2),
+      "$dst = !cmp.gtu($src1, $src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setugt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gtu(r1,#u9)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGTU_ri : ALU32_ri<(outs PredRegs:$dst),
+                            (ins IntRegs:$src1, u9Ext:$src2),
+      "$dst = !cmp.gtu($src1, #$src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)))]>,
+      Requires<[HasV4T]>;
+
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbEQri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, u8Imm:$src2),
+            "$dst = cmpb.eq($src1, #$src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+
+def : Pat <(brcond (i1 (setne (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2)),
+                       bb:$offset),
+      (JMP_f (CMPbEQri_V4 (i32 IntRegs:$src1), u8ImmPred:$src2),
+                bb:$offset)>,
+      Requires<[HasV4T]>;
+
+// Pd=cmpb.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (and (xor (i32 IntRegs:$src1),
+                                   (i32 IntRegs:$src2)), 255), 0))]>,
+            Requires<[HasV4T]>;
+
+// Pd=cmpb.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (shl (i32 IntRegs:$src1), (i32 24)),
+                         (shl (i32 IntRegs:$src2), (i32 24))))]>,
+            Requires<[HasV4T]>;
+
+// Pd=cmpb.gt(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.gt($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setgt (shl (i32 IntRegs:$src1), (i32 24)),
+                         (shl (i32 IntRegs:$src2), (i32 24))))]>,
+            Requires<[HasV4T]>;
+
+// Pd=cmpb.gtu(Rs,#u7)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7,
+isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU", InputType = "imm" in
+def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, u7Ext:$src2),
+            "$dst = cmpb.gtu($src1, #$src2)",
+            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
+                                              u7ExtPred:$src2))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// SDNode for converting immediate C to C-1.
+def DEC_CONST_BYTE : SDNodeXForm<imm, [{
+   // Return the byte immediate const-1 as an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformU7ToU7M1Imm(imm);
+}]>;
+
+// For the sequence
+//   zext( seteq ( and(Rs, 255), u8))
+// Generate
+//   Pd=cmpb.eq(Rs, #u8)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (seteq (i32 (and (i32 IntRegs:$Rs), 255)),
+                                           u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs),
+                                                 (u8ExtPred:$u8))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setne ( and(Rs, 255), u8))
+// Generate
+//   Pd=cmpb.eq(Rs, #u8)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 IntRegs:$Rs), 255)),
+                                           u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs),
+                                                 (u8ExtPred:$u8))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( seteq (Rs, and(Rt, 255)))
+// Generate
+//   Pd=cmpb.eq(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (seteq (i32 IntRegs:$Rt),
+                                 (i32 (and (i32 IntRegs:$Rs), 255)))))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs),
+                                                      (i32 IntRegs:$Rt))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setne (Rs, and(Rt, 255)))
+// Generate
+//   Pd=cmpb.eq(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setne (i32 IntRegs:$Rt),
+                                 (i32 (and (i32 IntRegs:$Rs), 255)))))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs),
+                                                      (i32 IntRegs:$Rt))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setugt ( and(Rs, 255), u8))
+// Generate
+//   Pd=cmpb.gtu(Rs, #u8)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 255)),
+                                            u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs),
+                                                  (u8ExtPred:$u8))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setugt ( and(Rs, 254), u8))
+// Generate
+//   Pd=cmpb.gtu(Rs, #u8)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 254)),
+                                            u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs),
+                                                  (u8ExtPred:$u8))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setult ( Rs, Rt))
+// Generate
+//   Pd=cmp.ltu(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setult (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt),
+                                              (i32 IntRegs:$Rs))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setlt ( Rs, Rt))
+// Generate
+//   Pd=cmp.lt(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setlt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt),
+                                             (i32 IntRegs:$Rs))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setugt ( Rs, Rt))
+// Generate
+//   Pd=cmp.gtu(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (setugt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs),
+                                              (i32 IntRegs:$Rt))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// This pattern interefers with coremark performance, not implementing at this
+// time.
+// For the sequence
+//   zext( setgt ( Rs, Rt))
+// Generate
+//   Pd=cmp.gt(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+
+// For the sequence
+//   zext( setuge ( Rs, Rt))
+// Generate
+//   Pd=cmp.ltu(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setuge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt),
+                                              (i32 IntRegs:$Rs))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setge ( Rs, Rt))
+// Generate
+//   Pd=cmp.lt(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt),
+                                             (i32 IntRegs:$Rs))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setule ( Rs, Rt))
+// Generate
+//   Pd=cmp.gtu(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setule (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs),
+                                              (i32 IntRegs:$Rt))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setle ( Rs, Rt))
+// Generate
+//   Pd=cmp.gt(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setle (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rs),
+                                             (i32 IntRegs:$Rt))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setult ( and(Rs, 255), u8))
+// Use the isdigit transformation below
+
+// Generate code of the form 'mux_ii(cmpbgtu(Rdd, C-1),0,1)'
+// for C code of the form r = ((c>='0') & (c<='9')) ? 1 : 0;.
+// The isdigit transformation relies on two 'clever' aspects:
+// 1) The data type is unsigned which allows us to eliminate a zero test after
+//    biasing the expression by 48. We are depending on the representation of
+//    the unsigned types, and semantics.
+// 2) The front end has converted <= 9 into < 10 on entry to LLVM
+//
+// For the C code:
+//   retval = ((c>='0') & (c<='9')) ? 1 : 0;
+// The code is transformed upstream of llvm into
+//   retval = (c-48) < 10 ? 1 : 0;
+let AddedComplexity = 139 in
+def : Pat <(i32 (zext (i1 (setult (i32 (and (i32 IntRegs:$src1), 255)),
+                                  u7StrictPosImmPred:$src2)))),
+  (i32 (MUX_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$src1),
+                                 (DEC_CONST_BYTE u7StrictPosImmPred:$src2))),
+                   0, 1))>,
+                   Requires<[HasV4T]>;
+
+// Pd=cmpb.gtu(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU",
+InputType = "reg" in
+def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.gtu($src1, $src2)",
+            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
+                                             (and (i32 IntRegs:$src2), 255)))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Following instruction is not being extended as it results into the incorrect
+// code for negative numbers.
+
+// Signed half compare(.eq) ri.
+// Pd=cmph.eq(Rs,#s8)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhEQri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, s8Imm:$src2),
+            "$dst = cmph.eq($src1, #$src2)",
+            [(set (i1 PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 65535),
+                                             s8ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+
+// Signed half compare(.eq) rr.
+// Case 1: xor + and, then compare:
+//   r0=xor(r0,r1)
+//   r0=and(r0,#0xffff)
+//   p0=cmp.eq(r0,#0)
+// Pd=cmph.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst), (seteq (and (xor (i32 IntRegs:$src1),
+                                                       (i32 IntRegs:$src2)),
+                                                  65535), 0))]>,
+            Requires<[HasV4T]>;
+
+// Signed half compare(.eq) rr.
+// Case 2: shift left 16 bits then compare:
+//   r0=asl(r0,16)
+//   r1=asl(r1,16)
+//   p0=cmp.eq(r0,r1)
+// Pd=cmph.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (shl (i32 IntRegs:$src1), (i32 16)),
+                         (shl (i32 IntRegs:$src2), (i32 16))))]>,
+            Requires<[HasV4T]>;
+
+/* Incorrect Pattern -- immediate should be right shifted before being
+used in the cmph.gt instruction.
+// Signed half compare(.gt) ri.
+// Pd=cmph.gt(Rs,#s8)
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
+isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhGTri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, s8Ext:$src2),
+            "$dst = cmph.gt($src1, #$src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+                         s8ExtPred:$src2))]>,
+            Requires<[HasV4T]>;
+*/
+
+// Signed half compare(.gt) rr.
+// Pd=cmph.gt(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.gt($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+                         (shl (i32 IntRegs:$src2), (i32 16))))]>,
+            Requires<[HasV4T]>;
+
+// Unsigned half compare rr (.gtu).
+// Pd=cmph.gtu(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU",
+InputType = "reg" in
+def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.gtu($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setugt (and (i32 IntRegs:$src1), 65535),
+                          (and (i32 IntRegs:$src2), 65535)))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Unsigned half compare ri (.gtu).
+// Pd=cmph.gtu(Rs,#u7)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7,
+isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU",
+InputType = "imm" in
+def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, u7Ext:$src2),
+            "$dst = cmph.gtu($src1, #$src2)",
+            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535),
+                                              u7ExtPred:$src2))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+let validSubTargets = HasV4SubT in
+def NTSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+    "$dst = !tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (seteq (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>,
+    Requires<[HasV4T]>;
+
+let validSubTargets = HasV4SubT in
+def NTSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+    "$dst = !tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (seteq (and (shl 1, u5ImmPred:$src2), (i32 IntRegs:$src1)), 0))]>,
+    Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//Deallocate frame and return.
+//    dealloc_return
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
+  Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_V4 : LD0Inst<(outs), (ins),
+            "dealloc_return",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// Restore registers and dealloc return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+  Defs = [R29, R30, R31, PC] in {
+let validSubTargets = HasV4SubT in
+  def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs),
+                                   (ins calltarget:$dst),
+             "jump $dst",
+             []>,
+             Requires<[HasV4T]>;
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, isBarrier = 1,
+  Defs = [R29, R30, R31, PC] in {
+let validSubTargets = HasV4SubT in
+  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs),
+                                           (ins calltarget:$dst),
+             "call $dst",
+             []>,
+             Requires<[HasV4T]>;
+}
+
+// Save registers function call.
+let isCall = 1, isBarrier = 1,
+  Uses = [R29, R31] in {
+  def SAVE_REGISTERS_CALL_V4 : JInst<(outs),
+                               (ins calltarget:$dst),
+             "call $dst // Save_calle_saved_registers",
+             []>,
+             Requires<[HasV4T]>;
+}
+
+//    if (Ps) dealloc_return
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cPt_V4 : LD0Inst<(outs),
+                           (ins PredRegs:$src1),
+            "if ($src1) dealloc_return",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (!Ps) dealloc_return
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cNotPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
+            "if (!$src1) dealloc_return",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (Ps.new) dealloc_return:nt
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
+            "if ($src1.new) dealloc_return:nt",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (!Ps.new) dealloc_return:nt
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cNotdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
+            "if (!$src1.new) dealloc_return:nt",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (Ps.new) dealloc_return:t
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
+            "if ($src1.new) dealloc_return:t",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// if (!Ps.new) dealloc_return:nt
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cNotdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
+            "if (!$src1.new) dealloc_return:t",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// Load/Store with absolute addressing mode
+// memw(#u6)=Rt
+
+multiclass ST_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot,
+                           bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME#_V4 : STInst2<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$absaddr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"(##$absaddr) = $src2",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isNVStorable = 1, isExtended = 1, neverHasSideEffects = 1 in
+multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def NAME#_V4 : STInst2<(outs),
+            (ins u0AlwaysExt:$absaddr, RC:$src),
+            mnemonic#"(##$absaddr) = $src",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 1, isPredicated = 1 in {
+      defm Pt : ST_Abs_Pred<mnemonic, RC, 0>;
+      defm NotPt : ST_Abs_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+multiclass ST_Abs_Predbase_nv<string mnemonic, RegisterClass RC, bit isNot,
+                           bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$absaddr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"(##$absaddr) = $src2.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Abs_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1, isExtended = 1, neverHasSideEffects = 1 in
+multiclass ST_Abs_nv<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins u0AlwaysExt:$absaddr, RC:$src),
+            mnemonic#"(##$absaddr) = $src.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 1, isPredicated = 1 in {
+      defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>;
+      defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = Absolute in {
+  let accessSize = ByteAccess in
+    defm STrib_abs : ST_Abs<"memb", "STrib", IntRegs>,
+                     ST_Abs_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_abs : ST_Abs<"memh", "STrih", IntRegs>,
+                     ST_Abs_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw_abs : ST_Abs<"memw", "STriw", IntRegs>,
+                     ST_Abs_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess, isNVStorable = 0 in
+    defm STrid_abs : ST_Abs<"memd", "STrid", DoubleRegs>, AddrModeRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity = 30 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+                        (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrib_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+                          (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrih_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+
+def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STriw_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+
+def : Pat<(store (i64 DoubleRegs:$src1),
+                 (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrid_abs_V4 tglobaladdr: $absaddr, DoubleRegs: $src1)>;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with GP-relative addressing mode.
+// mem[bhwd](#global)=Rt
+// if ([!]Pv[.new]) mem[bhwd](##global) = Rt
+//===----------------------------------------------------------------------===//
+let mayStore = 1, isNVStorable = 1 in
+multiclass ST_GP<string mnemonic, string BaseOp, RegisterClass RC> {
+  let BaseOpcode = BaseOp, isPredicable = 1 in
+  def NAME#_V4 : STInst2<(outs),
+          (ins globaladdress:$global, RC:$src),
+          mnemonic#"(#$global) = $src",
+          []>;
+
+  // When GP-relative instructions are predicated, their addressing mode is
+  // changed to absolute and they are always constant extended.
+  let BaseOpcode = BaseOp, isExtended = 1, opExtendable = 1,
+  isPredicated = 1 in {
+    defm Pt : ST_Abs_Pred <mnemonic, RC, 0>;
+    defm NotPt : ST_Abs_Pred <mnemonic, RC, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_GP_nv<string mnemonic, string BaseOp, RegisterClass RC> {
+  let BaseOpcode = BaseOp, isPredicable = 1 in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+          (ins u0AlwaysExt:$global, RC:$src),
+          mnemonic#"(#$global) = $src.new",
+          []>,
+          Requires<[HasV4T]>;
+
+  // When GP-relative instructions are predicated, their addressing mode is
+  // changed to absolute and they are always constant extended.
+  let BaseOpcode = BaseOp, isExtended = 1, opExtendable = 1,
+  isPredicated = 1 in {
+    defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>;
+    defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>;
+  }
+}
+
+let validSubTargets = HasV4SubT, neverHasSideEffects = 1 in {
+  let isNVStorable = 0 in
+  defm STd_GP : ST_GP <"memd", "STd_GP", DoubleRegs>, PredNewRel;
+
+  defm STb_GP : ST_GP<"memb",  "STb_GP", IntRegs>,
+                ST_GP_nv<"memb", "STb_GP", IntRegs>, NewValueRel;
+  defm STh_GP : ST_GP<"memh",  "STh_GP", IntRegs>,
+                ST_GP_nv<"memh", "STh_GP", IntRegs>, NewValueRel;
+  defm STw_GP : ST_GP<"memw",  "STw_GP", IntRegs>,
+                ST_GP_nv<"memw", "STw_GP", IntRegs>, NewValueRel;
+}
+
+// 64 bit atomic store
+def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global),
+                            (i64 DoubleRegs:$src1)),
+           (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
+           Requires<[HasV4T]>;
+
+// Map from store(globaladdress) -> memd(#foo)
+let AddedComplexity = 100 in
+def : Pat <(store (i64 DoubleRegs:$src1),
+                  (HexagonCONST32_GP tglobaladdr:$global)),
+           (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>;
+
+// 8 bit atomic store
+def : Pat < (atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global),
+                            (i32 IntRegs:$src1)),
+            (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from store(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+          (HexagonCONST32_GP tglobaladdr:$global)),
+          (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
+//       to "r0 = 1; memw(#foo) = r0"
+let AddedComplexity = 100 in
+def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
+          (STb_GP_V4 tglobaladdr:$global, (TFRI 1))>;
+
+def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global),
+                           (i32 IntRegs:$src1)),
+          (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from store(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+                         (HexagonCONST32_GP tglobaladdr:$global)),
+          (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// 32 bit atomic store
+def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global),
+                           (i32 IntRegs:$src1)),
+          (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from store(globaladdress) -> memw(#foo)
+let AddedComplexity = 100 in
+def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)),
+          (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass for the load instructions with absolute addressing mode.
+//===----------------------------------------------------------------------===//
+multiclass LD_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot,
+                           bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : LDInst2<(outs RC:$dst),
+            (ins PredRegs:$src1, u0AlwaysExt:$absaddr),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"(##$absaddr)",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isExtended = 1, neverHasSideEffects = 1 in
+multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let  opExtendable = 1, isPredicable = 1 in
+    def NAME#_V4 : LDInst2<(outs RC:$dst),
+            (ins u0AlwaysExt:$absaddr),
+            "$dst = "#mnemonic#"(##$absaddr)",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 2, isPredicated = 1 in {
+      defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>;
+      defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = Absolute in {
+  let accessSize = ByteAccess in {
+    defm LDrib_abs  : LD_Abs<"memb", "LDrib", IntRegs>, AddrModeRel;
+    defm LDriub_abs : LD_Abs<"memub", "LDriub", IntRegs>, AddrModeRel;
+  }
+  let accessSize = HalfWordAccess in {
+    defm LDrih_abs  : LD_Abs<"memh", "LDrih", IntRegs>, AddrModeRel;
+    defm LDriuh_abs : LD_Abs<"memuh", "LDriuh", IntRegs>, AddrModeRel;
+  }
+  let accessSize = WordAccess in
+    defm LDriw_abs  : LD_Abs<"memw", "LDriw", IntRegs>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid_abs : LD_Abs<"memd",  "LDrid", DoubleRegs>, AddrModeRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity  = 30 in {
+def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriw_abs_V4 tglobaladdr: $absaddr)>;
+
+def : Pat<(i32 (sextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDrib_abs_V4 tglobaladdr:$absaddr)>;
+
+def : Pat<(i32 (zextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriub_abs_V4 tglobaladdr:$absaddr)>;
+
+def : Pat<(i32 (sextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDrih_abs_V4 tglobaladdr:$absaddr)>;
+
+def : Pat<(i32 (zextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriuh_abs_V4 tglobaladdr:$absaddr)>;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with GP-relative addressing mode.
+// Rx=mem[bhwd](##global)
+// if ([!]Pv[.new]) Rx=mem[bhwd](##global)
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+multiclass LD_GP<string mnemonic, string BaseOp, RegisterClass RC> {
+  let BaseOpcode = BaseOp in {
+    let isPredicable = 1 in
+    def NAME#_V4 : LDInst2<(outs RC:$dst),
+            (ins globaladdress:$global),
+            "$dst = "#mnemonic#"(#$global)",
+            []>;
+
+    let isExtended = 1, opExtendable = 2, isPredicated = 1 in {
+      defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>;
+      defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+defm LDd_GP  : LD_GP<"memd",  "LDd_GP",  DoubleRegs>, PredNewRel;
+defm LDb_GP  : LD_GP<"memb",  "LDb_GP",  IntRegs>, PredNewRel;
+defm LDub_GP : LD_GP<"memub", "LDub_GP", IntRegs>, PredNewRel;
+defm LDh_GP  : LD_GP<"memh",  "LDh_GP",  IntRegs>, PredNewRel;
+defm LDuh_GP : LD_GP<"memuh", "LDuh_GP", IntRegs>, PredNewRel;
+defm LDw_GP  : LD_GP<"memw",  "LDw_GP",  IntRegs>, PredNewRel;
+
+def : Pat <(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i64 (LDd_GP_V4 tglobaladdr:$global))>;
+
+def : Pat <(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i32 (LDw_GP_V4 tglobaladdr:$global))>;
+
+def : Pat <(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i32 (LDuh_GP_V4 tglobaladdr:$global))>;
+
+def : Pat <(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memw(#foo + 0)
+let AddedComplexity = 100 in
+def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i64 (LDd_GP_V4 tglobaladdr:$global))>;
+
+// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
+let AddedComplexity = 100 in
+def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i1 (TFR_PdRs (i32 (LDb_GP_V4 tglobaladdr:$global))))>;
+
+// When the Interprocedural Global Variable optimizer realizes that a certain
+// global variable takes only two constant values, it shrinks the global to
+// a boolean. Catch those loads here in the following 3 patterns.
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memub(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDh_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDh_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memuh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDuh_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memw(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDw_GP_V4 tglobaladdr:$global))>;
+
+
+// Transfer global address into a register
+let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1,
+isAsCheapAsAMove = 1, isReMaterializable = 1, validSubTargets = HasV4SubT in
+def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
+           "$dst = #$src1",
+           [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>,
+           Requires<[HasV4T]>;
+
+// Transfer a block address into a register
+def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
+          (TFRI_V4 tblockaddress:$src1)>,
+          Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 2, AddedComplexity=50,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
+def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                           (ins PredRegs:$src1, s16Ext:$src2),
+           "if($src1) $dst = #$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
+def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                              (ins PredRegs:$src1, s16Ext:$src2),
+           "if(!$src1) $dst = #$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 2, AddedComplexity=50,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
+def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, s16Ext:$src2),
+           "if($src1.new) $dst = #$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
+def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                                (ins PredRegs:$src1, s16Ext:$src2),
+           "if(!$src1.new) $dst = #$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let AddedComplexity = 50, Predicates = [HasV4T] in
+def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
+           (TFRI_V4 tglobaladdr:$src1)>,
+           Requires<[HasV4T]>;
+
+
+// Load - Indirect with long offset: These instructions take global address
+// as an operand
+let isExtended = 1, opExtendable = 3, AddedComplexity = 40,
+validSubTargets = HasV4SubT in
+def LDrid_ind_lo_V4 : LDInst<(outs DoubleRegs:$dst),
+            (ins IntRegs:$src1, u2Imm:$src2, globaladdressExt:$offset),
+            "$dst=memd($src1<<#$src2+##$offset)",
+            [(set (i64 DoubleRegs:$dst),
+                  (load (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                        (HexagonCONST32 tglobaladdr:$offset))))]>,
+            Requires<[HasV4T]>;
+
+let AddedComplexity = 40 in
+multiclass LD_indirect_lo<string OpcStr, PatFrag OpNode> {
+let isExtended = 1, opExtendable = 3, validSubTargets = HasV4SubT in
+  def _lo_V4 : LDInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, u2Imm:$src2, globaladdressExt:$offset),
+            !strconcat("$dst = ",
+            !strconcat(OpcStr, "($src1<<#$src2+##$offset)")),
+            [(set IntRegs:$dst,
+                  (i32 (OpNode (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                          (HexagonCONST32 tglobaladdr:$offset)))))]>,
+            Requires<[HasV4T]>;
+}
+
+defm LDrib_ind : LD_indirect_lo<"memb", sextloadi8>;
+defm LDriub_ind : LD_indirect_lo<"memub", zextloadi8>;
+defm LDriub_ind_anyext : LD_indirect_lo<"memub", extloadi8>;
+defm LDrih_ind : LD_indirect_lo<"memh", sextloadi16>;
+defm LDriuh_ind : LD_indirect_lo<"memuh", zextloadi16>;
+defm LDriuh_ind_anyext : LD_indirect_lo<"memuh", extloadi16>;
+defm LDriw_ind : LD_indirect_lo<"memw", load>;
+
+let AddedComplexity = 40 in
+def : Pat <(i32 (sextloadi8 (add IntRegs:$src1,
+                                 (NumUsesBelowThresCONST32 tglobaladdr:$offset)))),
+           (i32 (LDrib_ind_lo_V4 IntRegs:$src1, 0, tglobaladdr:$offset))>,
+           Requires<[HasV4T]>;
+
+let AddedComplexity = 40 in
+def : Pat <(i32 (zextloadi8 (add IntRegs:$src1,
+                                 (NumUsesBelowThresCONST32 tglobaladdr:$offset)))),
+           (i32 (LDriub_ind_lo_V4 IntRegs:$src1, 0, tglobaladdr:$offset))>,
+           Requires<[HasV4T]>;
+
+let Predicates = [HasV4T], AddedComplexity  = 30 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
+          (STrib_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
+          (STrih_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+
+def : Pat<(store (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
+          (STriw_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+}
+
+let Predicates = [HasV4T], AddedComplexity  = 30 in {
+def : Pat<(i32 (load u0AlwaysExtPred:$src)),
+          (LDriw_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (sextloadi8 u0AlwaysExtPred:$src)),
+          (LDrib_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (zextloadi8 u0AlwaysExtPred:$src)),
+          (LDriub_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (sextloadi16 u0AlwaysExtPred:$src)),
+          (LDrih_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (zextloadi16 u0AlwaysExtPred:$src)),
+          (LDriuh_abs_V4 u0AlwaysExtPred:$src)>;
+}
+
+// Indexed store word - global address.
+// memw(Rs+#u6:2)=#S8
+let AddedComplexity = 10 in
+def STriw_offset_ext_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u6_2Imm:$src2, globaladdress:$src3),
+            "memw($src1+#$src2) = ##$src3",
+            [(store (HexagonCONST32 tglobaladdr:$src3),
+                    (add IntRegs:$src1, u6_2ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+
+def : Pat<(i64 (ctlz (i64 DoubleRegs:$src1))),
+          (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTLZ64_rr DoubleRegs:$src1))))>,
+          Requires<[HasV4T]>;
+
+def : Pat<(i64 (cttz (i64 DoubleRegs:$src1))),
+          (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTTZ64_rr DoubleRegs:$src1))))>,
+          Requires<[HasV4T]>;
+
+
+// i8 -> i64 loads
+// We need a complexity of 120 here to override preceding handling of
+// zextloadi8.
+let Predicates = [HasV4T], AddedComplexity = 120 in {
+def:  Pat <(i64 (extloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 tglobaladdr:$addr)))>;
+
+def:  Pat <(i64 (zextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 tglobaladdr:$addr)))>;
+
+def:  Pat <(i64 (sextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (SXTW (LDrib_abs_V4 tglobaladdr:$addr)))>;
+
+def:  Pat <(i64 (extloadi8 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+
+def:  Pat <(i64 (zextloadi8 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 FoldGlobalAddr:$addr)))>;
+
+def:  Pat <(i64 (sextloadi8 FoldGlobalAddr:$addr)),
+      (i64 (SXTW (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+}
+// i16 -> i64 loads
+// We need a complexity of 120 here to override preceding handling of
+// zextloadi16.
+let AddedComplexity = 120 in {
+def:  Pat <(i64 (extloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (SXTW (LDrih_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (extloadi16 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi16 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi16 FoldGlobalAddr:$addr)),
+      (i64 (SXTW (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+}
+// i32->i64 loads
+// We need a complexity of 120 here to override preceding handling of
+// zextloadi32.
+let AddedComplexity = 120 in {
+def:  Pat <(i64 (extloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (SXTW (LDriw_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (extloadi32 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi32 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi32 FoldGlobalAddr:$addr)),
+      (i64 (SXTW (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+}
+
+// Indexed store double word - global address.
+// memw(Rs+#u6:2)=#S8
+let AddedComplexity = 10 in
+def STrih_offset_ext_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u6_1Imm:$src2, globaladdress:$src3),
+            "memh($src1+#$src2) = ##$src3",
+            [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3),
+                    (add IntRegs:$src1, u6_1ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+// Map from store(globaladdress + x) -> memd(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(store (i64 DoubleRegs:$src1),
+                 FoldGlobalAddrGP:$addr),
+          (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>,
+          Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_64 FoldGlobalAddrGP:$addr,
+                           (i64 DoubleRegs:$src1)),
+          (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>,
+          Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
+          (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_8 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
+          (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
+          (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_16 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
+          (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memw(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(store (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
+          (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_32 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
+          (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memd(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i64 (load FoldGlobalAddrGP:$addr)),
+          (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_64 FoldGlobalAddrGP:$addr),
+          (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (extloadi8 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (sextloadi8 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+//let AddedComplexity = 100 in
+let AddedComplexity = 100 in
+def : Pat<(i32 (extloadi16 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (sextloadi16 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memuh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (zextloadi16 FoldGlobalAddrGP:$addr)),
+          (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_16 FoldGlobalAddrGP:$addr),
+          (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memub(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (zextloadi8 FoldGlobalAddrGP:$addr)),
+          (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_8 FoldGlobalAddrGP:$addr),
+          (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memw(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (load FoldGlobalAddrGP:$addr)),
+          (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr),
+          (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
new file mode 100644
index 0000000..9da6074
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -0,0 +1,633 @@
+def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [
+                                            SDTCisVT<0, f32>,
+                                            SDTCisPtrTy<1>]>;
+def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32",     SDTHexagonFCONST32>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def FCONST32_nsdata : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+              "$dst = CONST32(#$global)",
+              [(set (f32 IntRegs:$dst),
+              (HexagonFCONST32 tglobaladdr:$global))]>,
+               Requires<[HasV5T]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST64_Float_Real : LDInst<(outs DoubleRegs:$dst), (ins f64imm:$src1),
+                       "$dst = CONST64(#$src1)",
+                       [(set DoubleRegs:$dst, fpimm:$src1)]>,
+          Requires<[HasV5T]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
+                       "$dst = CONST32(#$src1)",
+                       [(set IntRegs:$dst, fpimm:$src1)]>,
+          Requires<[HasV5T]>;
+
+// Transfer immediate float.
+// Only works with single precision fp value.
+// For double precision, use CONST64_float_real, as 64bit transfer
+// can only hold 40-bit values - 32 from const ext + 8 bit immediate.
+// Make sure that complexity is more than the CONST32 pattern in
+// HexagonInstrInfo.td patterns.
+let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1,
+isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT,
+isCodeGenOnly = 1 in
+def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1),
+           "$dst = #$src1",
+           [(set IntRegs:$dst, fpimm:$src1)]>,
+          Requires<[HasV5T]>;
+
+let isExtended = 1, opExtendable = 2, isPredicated = 1,
+neverHasSideEffects = 1, validSubTargets = HasV5SubT in
+def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
+                          (ins PredRegs:$src1, f32Ext:$src2),
+           "if ($src1) $dst = #$src2",
+           []>,
+          Requires<[HasV5T]>;
+
+let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1,
+neverHasSideEffects = 1, validSubTargets = HasV5SubT in
+def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, f32Ext:$src2),
+           "if (!$src1) $dst =#$src2",
+           []>,
+          Requires<[HasV5T]>;
+
+// Convert single precision to double precision and vice-versa.
+def CONVERT_sf2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+                "$dst = convert_sf2df($src)",
+                [(set DoubleRegs:$dst, (fextend IntRegs:$src))]>,
+          Requires<[HasV5T]>;
+
+def CONVERT_df2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+                "$dst = convert_df2sf($src)",
+                [(set IntRegs:$dst, (fround DoubleRegs:$src))]>,
+          Requires<[HasV5T]>;
+
+
+// Load.
+def LDrid_f : LDInst<(outs DoubleRegs:$dst),
+            (ins MEMri:$addr),
+            "$dst = memd($addr)",
+            [(set DoubleRegs:$dst, (f64 (load ADDRriS11_3:$addr)))]>,
+          Requires<[HasV5T]>;
+
+
+let AddedComplexity = 20 in
+def LDrid_indexed_f : LDInst<(outs DoubleRegs:$dst),
+            (ins IntRegs:$src1, s11_3Imm:$offset),
+            "$dst = memd($src1+#$offset)",
+            [(set DoubleRegs:$dst, (f64 (load (add IntRegs:$src1,
+                                              s11_3ImmPred:$offset))))]>,
+          Requires<[HasV5T]>;
+
+def LDriw_f : LDInst<(outs IntRegs:$dst),
+            (ins MEMri:$addr), "$dst = memw($addr)",
+            [(set IntRegs:$dst, (f32 (load ADDRriS11_2:$addr)))]>,
+          Requires<[HasV5T]>;
+
+
+let AddedComplexity = 20 in
+def LDriw_indexed_f : LDInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s11_2Imm:$offset),
+            "$dst = memw($src1+#$offset)",
+            [(set IntRegs:$dst, (f32 (load (add IntRegs:$src1,
+                                           s11_2ImmPred:$offset))))]>,
+          Requires<[HasV5T]>;
+
+// Store.
+def STriw_f : STInst<(outs),
+            (ins MEMri:$addr, IntRegs:$src1),
+            "memw($addr) = $src1",
+            [(store (f32 IntRegs:$src1), ADDRriS11_2:$addr)]>,
+          Requires<[HasV5T]>;
+
+let AddedComplexity = 10 in
+def STriw_indexed_f : STInst<(outs),
+            (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3),
+            "memw($src1+#$src2) = $src3",
+            [(store (f32 IntRegs:$src3),
+                (add IntRegs:$src1, s11_2ImmPred:$src2))]>,
+          Requires<[HasV5T]>;
+
+def STrid_f : STInst<(outs),
+            (ins MEMri:$addr, DoubleRegs:$src1),
+            "memd($addr) = $src1",
+            [(store (f64 DoubleRegs:$src1), ADDRriS11_2:$addr)]>,
+          Requires<[HasV5T]>;
+
+// Indexed store double word.
+let AddedComplexity = 10 in
+def STrid_indexed_f : STInst<(outs),
+            (ins IntRegs:$src1, s11_3Imm:$src2,  DoubleRegs:$src3),
+            "memd($src1+#$src2) = $src3",
+            [(store (f64 DoubleRegs:$src3),
+                                (add IntRegs:$src1, s11_3ImmPred:$src2))]>,
+          Requires<[HasV5T]>;
+
+
+// Add
+let isCommutable = 1 in
+def fADD_rr : ALU64_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = sfadd($src1, $src2)",
+            [(set IntRegs:$dst, (fadd IntRegs:$src1, IntRegs:$src2))]>,
+          Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = dfadd($src1, $src2)",
+               [(set DoubleRegs:$dst, (fadd DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>,
+          Requires<[HasV5T]>;
+
+def fSUB_rr : ALU64_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = sfsub($src1, $src2)",
+            [(set IntRegs:$dst, (fsub IntRegs:$src1, IntRegs:$src2))]>,
+          Requires<[HasV5T]>;
+
+def fSUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = dfsub($src1, $src2)",
+               [(set DoubleRegs:$dst, (fsub DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>,
+               Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fMUL_rr : ALU64_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = sfmpy($src1, $src2)",
+            [(set IntRegs:$dst, (fmul IntRegs:$src1, IntRegs:$src2))]>,
+            Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fMUL64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = dfmpy($src1, $src2)",
+               [(set DoubleRegs:$dst, (fmul DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>,
+               Requires<[HasV5T]>;
+
+// Compare.
+let isCompare = 1 in {
+multiclass FCMP64_rr<string OpcStr, PatFrag OpNode> {
+  def _rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set PredRegs:$dst,
+                        (OpNode (f64 DoubleRegs:$b), (f64 DoubleRegs:$c)))]>,
+                 Requires<[HasV5T]>;
+}
+
+multiclass FCMP32_rr<string OpcStr, PatFrag OpNode> {
+  def _rr : ALU64_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set PredRegs:$dst,
+                        (OpNode (f32 IntRegs:$b), (f32 IntRegs:$c)))]>,
+                 Requires<[HasV5T]>;
+}
+}
+
+defm FCMPOEQ64 : FCMP64_rr<"dfcmp.eq", setoeq>;
+defm FCMPUEQ64 : FCMP64_rr<"dfcmp.eq", setueq>;
+defm FCMPOGT64 : FCMP64_rr<"dfcmp.gt", setogt>;
+defm FCMPUGT64 : FCMP64_rr<"dfcmp.gt", setugt>;
+defm FCMPOGE64 : FCMP64_rr<"dfcmp.ge", setoge>;
+defm FCMPUGE64 : FCMP64_rr<"dfcmp.ge", setuge>;
+
+defm FCMPOEQ32 : FCMP32_rr<"sfcmp.eq", setoeq>;
+defm FCMPUEQ32 : FCMP32_rr<"sfcmp.eq", setueq>;
+defm FCMPOGT32 : FCMP32_rr<"sfcmp.gt", setogt>;
+defm FCMPUGT32 : FCMP32_rr<"sfcmp.gt", setugt>;
+defm FCMPOGE32 : FCMP32_rr<"sfcmp.ge", setoge>;
+defm FCMPUGE32 : FCMP32_rr<"sfcmp.ge", setuge>;
+
+// olt.
+def : Pat <(i1 (setolt (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPOGT32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPOGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                        (f64 DoubleRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+// gt.
+def : Pat <(i1 (setugt (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT64_rr (f64 DoubleRegs:$src1),
+                        (f64 (CONST64_Float_Real fpimm:$src2))))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setugt (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT32_rr (f32 IntRegs:$src1), (f32 (TFRI_f fpimm:$src2))))>,
+      Requires<[HasV5T]>;
+
+// ult.
+def : Pat <(i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                        (f64 DoubleRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+// le.
+// rs <= rt -> rt >= rs.
+def : Pat<(i1 (setole (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPOGE32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setole (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+
+// Rss <= Rtt -> Rtt >= Rss.
+def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPOGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                                DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+// rs <= rt -> rt >= rs.
+def : Pat<(i1 (setule (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPUGE32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setule (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+// Rss <= Rtt -> Rtt >= Rss.
+def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPUGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                                DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+// ne.
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1,
+                              (f64 (CONST64_Float_Real fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1,  (f32 (TFRI_f fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1,
+                              (f64 (CONST64_Float_Real fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+// Convert Integer to Floating Point.
+def CONVERT_d2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_d2sf($src)",
+              [(set (f32 IntRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_ud2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_ud2sf($src)",
+              [(set (f32 IntRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_uw2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_uw2sf($src)",
+              [(set (f32 IntRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_w2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_w2sf($src)",
+              [(set (f32 IntRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_d2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_d2df($src)",
+              [(set (f64 DoubleRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_ud2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_ud2df($src)",
+              [(set (f64 DoubleRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_uw2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_uw2df($src)",
+              [(set (f64 DoubleRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_w2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_w2df($src)",
+              [(set (f64 DoubleRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+// Convert Floating Point to Integer - default.
+def CONVERT_df2uw : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2uw($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_df2w : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2w($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2uw : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2uw($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2w : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2w($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_df2d : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2d($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_df2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2ud($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2d : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2d($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2ud($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+// Convert Floating Point to Integer: non-chopped.
+let AddedComplexity = 20 in
+def CONVERT_df2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2uw($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2w($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2uw($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2w($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2d($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2ud($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2d($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2ud($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+
+
+// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
+def : Pat <(i32 (bitconvert (f32 IntRegs:$src))),
+           (i32 (TFR IntRegs:$src))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(f32 (bitconvert (i32 IntRegs:$src))),
+           (f32 (TFR IntRegs:$src))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(i64 (bitconvert (f64 DoubleRegs:$src))),
+           (i64 (TFR64 DoubleRegs:$src))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(f64 (bitconvert (i64 DoubleRegs:$src))),
+           (f64 (TFR64 DoubleRegs:$src))>,
+          Requires<[HasV5T]>;
+
+// Floating point fused multiply-add.
+def FMADD_dp : ALU64_acc<(outs DoubleRegs:$dst),
+                  (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+              "$dst += dfmpy($src2, $src3)",
+              [(set (f64 DoubleRegs:$dst),
+                  (fma DoubleRegs:$src2, DoubleRegs:$src3, DoubleRegs:$src1))],
+                  "$src1 = $dst">,
+              Requires<[HasV5T]>;
+
+def FMADD_sp : ALU64_acc<(outs IntRegs:$dst),
+                  (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+              "$dst += sfmpy($src2, $src3)",
+              [(set (f32 IntRegs:$dst),
+                  (fma IntRegs:$src2, IntRegs:$src3, IntRegs:$src1))],
+                  "$src1 = $dst">,
+              Requires<[HasV5T]>;
+
+
+// Floating point max/min.
+let AddedComplexity = 100 in
+def FMAX_dp : ALU64_rr<(outs DoubleRegs:$dst),
+                  (ins DoubleRegs:$src1, DoubleRegs:$src2),
+              "$dst = dfmax($src1, $src2)",
+              [(set DoubleRegs:$dst, (f64 (select (i1 (setolt DoubleRegs:$src2,
+                                                        DoubleRegs:$src1)),
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMAX_sp : ALU64_rr<(outs IntRegs:$dst),
+                  (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = sfmax($src1, $src2)",
+              [(set IntRegs:$dst, (f32 (select (i1 (setolt IntRegs:$src2,
+                                                        IntRegs:$src1)),
+                                             IntRegs:$src1,
+                                             IntRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMIN_dp : ALU64_rr<(outs DoubleRegs:$dst),
+                  (ins DoubleRegs:$src1, DoubleRegs:$src2),
+              "$dst = dfmin($src1, $src2)",
+              [(set DoubleRegs:$dst, (f64 (select (i1 (setogt DoubleRegs:$src2,
+                                                        DoubleRegs:$src1)),
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMIN_sp : ALU64_rr<(outs IntRegs:$dst),
+                  (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = sfmin($src1, $src2)",
+              [(set IntRegs:$dst, (f32 (select (i1 (setogt IntRegs:$src2,
+                                                        IntRegs:$src1)),
+                                             IntRegs:$src1,
+                                             IntRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+// Pseudo instruction to encode a set of conditional transfers.
+// This instruction is used instead of a mux and trades-off codesize
+// for performance. We conduct this transformation optimistically in
+// the hope that these instructions get promoted to dot-new transfers.
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr_f : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                                        IntRegs:$src2,
+                                                        IntRegs:$src3),
+                     "Error; should not emit",
+                     [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
+                                                 IntRegs:$src2,
+                                                 IntRegs:$src3)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr64_f : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+                                                        DoubleRegs:$src2,
+                                                        DoubleRegs:$src3),
+                     "Error; should not emit",
+                     [(set DoubleRegs:$dst, (f64 (select PredRegs:$src1,
+                                                 DoubleRegs:$src2,
+                                                 DoubleRegs:$src3)))]>,
+               Requires<[HasV5T]>;
+
+
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ri_f : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, f32imm:$src3),
+            "Error; should not emit",
+            [(set IntRegs:$dst,
+             (f32 (select PredRegs:$src1, IntRegs:$src2, fpimm:$src3)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ir_f : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, f32imm:$src2, IntRegs:$src3),
+            "Error; should not emit",
+            [(set IntRegs:$dst,
+             (f32 (select PredRegs:$src1, fpimm:$src2, IntRegs:$src3)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ii_f : ALU32_rr<(outs IntRegs:$dst),
+                              (ins PredRegs:$src1, f32imm:$src2, f32imm:$src3),
+                     "Error; should not emit",
+                     [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
+                                                 fpimm:$src2,
+                                                 fpimm:$src3)))]>,
+               Requires<[HasV5T]>;
+
+
+def : Pat <(select (i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+                   (f32 IntRegs:$src3),
+                   (f32 IntRegs:$src4)),
+    (TFR_condset_rr_f (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1), IntRegs:$src4,
+                      IntRegs:$src3)>, Requires<[HasV5T]>;
+
+def : Pat <(select (i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+                   (f64 DoubleRegs:$src3),
+                   (f64 DoubleRegs:$src4)),
+      (TFR_condset_rr64_f (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1),
+                DoubleRegs:$src4, DoubleRegs:$src3)>, Requires<[HasV5T]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, fpimm:$src3),
+      (TFR_condset_ii_f PredRegs:$src1, fpimm:$src3, fpimm:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = TFR_condset_ri(p0, r1, #i)
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, IntRegs:$src3),
+      (TFR_condset_ri_f PredRegs:$src1, IntRegs:$src3, fpimm:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = TFR_condset_ir(p0, #i, r1)
+def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, fpimm:$src3),
+      (TFR_condset_ir_f PredRegs:$src1, fpimm:$src3, IntRegs:$src2)>;
+
+def : Pat <(i32 (fp_to_sint (f64 DoubleRegs:$src1))),
+          (i32 (EXTRACT_SUBREG (i64 (CONVERT_df2d (f64 DoubleRegs:$src1))), subreg_loreg))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fabs (f32 IntRegs:$src1)),
+           (CLRBIT_31 (f32 IntRegs:$src1), 31)>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fneg (f32 IntRegs:$src1)),
+           (TOGBIT_31 (f32 IntRegs:$src1), 31)>,
+          Requires<[HasV5T]>;
+
+/*
+def : Pat <(fabs (f64 DoubleRegs:$src1)),
+          (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fabs (f64 DoubleRegs:$src1)),
+          (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
+          Requires<[HasV5T]>;
+          */
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
new file mode 100644
index 0000000..99f59d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -0,0 +1,3503 @@
+//===-- HexagonIntrinsics.td - Instruction intrinsics ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V2 Architecture
+// Application-Level Specification
+// 80-V9418-8 Rev. B
+// March 4, 2008
+//===----------------------------------------------------------------------===//
+
+//
+// ALU 32 types.
+//
+
+class qi_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_ALU32_sis10<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_ALU32_siu8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_ALU32_siu9<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_qisisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                      IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_ALU32_qis8si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2,
+                                       IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_ALU32_qisis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                       s8Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_ALU32_qis8s8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2, s8Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>;
+
+class si_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU32_sisi_sat<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU32_sisi_rnd<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU32_sis16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_sis10<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_s10si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins s10Imm:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")),
+             [(set IntRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>;
+
+class si_lo_ALU32_siu16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2),
+             !strconcat("$dst.l = ", !strconcat(opc , "#$src2")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_hi_ALU32_siu16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2),
+             !strconcat("$dst.h = ", !strconcat(opc , "#$src2")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_s16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins s16Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1")),
+             [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class di_ALU32_s8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class di_ALU64_di<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "$src")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class si_ALU32_si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_ALU32_si_tfr<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "$src")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+//
+// ALU 64 types.
+//
+
+class si_ALU64_si_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_ALU64_didi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_ALU64_sidi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_ALU64_didi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_qididi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2,
+                                          DoubleRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2,
+                                           DoubleRegs:$src3))]>;
+
+class di_ALU64_sisi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_ALU64_didi_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_rnd<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_crnd<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_rnd_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_crnd_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class qi_ALU64_didi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_ALU64_sisi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_sat_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.H, $src2.H):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.L, $src2.H):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.H, $src2.L):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.L, $src2.L):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+//
+// SInst classes.
+//
+
+class qi_SInst_qi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
+
+class qi_SInst_qi_pxfer<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "$src")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
+
+class qi_SInst_qiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_qiqi_neg<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, !$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class di_SInst_di_sat<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class si_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src)")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class si_SInst_di_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class di_SInst_disi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class di_SInst_didi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+          !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+          [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class si_SInst_sisiu3<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u3Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                     imm:$src3))]>;
+
+class si_SInst_diu5<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_disi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class si_SInst_sidi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_SInst_disisi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2,
+                                       IntRegs:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class di_SInst_sisi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_siu5<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_SInst_siu6<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_SInst_sisi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_si_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class di_SInst_qi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src)")),
+          [(set DoubleRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_qi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "$src")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_qiqi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "$src")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_sisi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_SInst_diu6<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_siu5<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_SInst_siu5_rnd<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_SInst_siu5u5<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2, u5Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>;
+
+class si_SInst_sisisi_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisisi_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didisi_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_SInst_didisi_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+          !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                        DoubleRegs:$src1, IntRegs:$src2))],
+          "$dst2 = $dst">;
+
+class si_SInst_sisiu5u5<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2, u5Imm:$src3),
+              !strconcat("$dst = ", !strconcat(opc ,
+                                               "($src1, #$src2, #$src3)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2, imm:$src3))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisidi<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        DoubleRegs:$src2),
+              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         DoubleRegs:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didiu6u6<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2, u6Imm:$src3),
+              !strconcat("$dst = ", !strconcat(opc ,
+                                               "($src1, #$src2, #$src3)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                            imm:$src2, imm:$src3))],
+              "$dst2 = $dst">;
+
+class di_SInst_dididi<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                            DoubleRegs:$src1,
+                                            DoubleRegs:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_diu6u6<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2,
+                                       u6Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2,
+                                        imm:$src3))]>;
+
+class di_SInst_didiqi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                       IntRegs:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class di_SInst_didiu3<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                       u3Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
+                                        imm:$src3))]>;
+
+class di_SInst_didisi_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+          !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        IntRegs:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didisi_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+          !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        IntRegs:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didiu6_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2),
+          !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        imm:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didiu6_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2),
+          !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        imm:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didiu6_xor<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2),
+          !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        imm:$src2))],
+          "$dst2 = $dst">;
+
+class si_SInst_sisisi_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisisi_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+
+class si_SInst_sisiu5_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_xor<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didiu6_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u5Imm:$src2),
+              !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                            DoubleRegs:$src1, imm:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didiu6_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u5Imm:$src2),
+              !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                            imm:$src2))],
+              "$dst2 = $dst">;
+
+
+//
+// MInst classes.
+//
+
+class di_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_hh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.L):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_hl<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.L):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.H):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_lh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.H):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.L):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_ll<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.L):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_disisi_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2*):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc ,
+                                               "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_s1_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc ,
+                                               "($src1, $src2*):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_s1_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc ,
+                                               "($src1, $src2*):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_s8s8<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins s8Imm:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>;
+
+class si_MInst_sis9<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_MInst_sisi<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_lh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hl<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_ll<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+
+class si_MInst_sisi_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_up<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didi<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_didi_conj<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_sisi_s1_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2*):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_didi_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_didi_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):rnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class si_SInst_sisi_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_SInst_didi_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_SInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_l_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2.L):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_h_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2.H):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2*):rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_s1_rnd_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2*):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisisi_xacc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3),
+             !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3),
+             !strconcat("$dst += ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3),
+             !strconcat("$dst -= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisis8_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        s8Imm:$src3),
+             !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        imm:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisis8_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        s8Imm:$src3),
+             !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        imm:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisiu4u5<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u4Imm:$src2, u5Imm:$src3),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1, #$src2, #$src3)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          imm:$src2, imm:$src3))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisiu8_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        u8Imm:$src3),
+               !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                          imm:$src3))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisiu8_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        u8Imm:$src3),
+               !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                          imm:$src3))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*):sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_s1_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didi_s1_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class si_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_MInst_didi_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc ,
+                                     "($src1.H, $src2.H):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_dididi_acc_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+                                           DoubleRegs:$src1, DoubleRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):rnd:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+                                           DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+
+class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+                                           DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:rnd:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.H, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.H, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disi_s1_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didisi_acc_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ",
+                        !strconcat(opc , "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_didi<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+//
+// LDInst classes.
+//
+let mayLoad = 1, neverHasSideEffects = 1 in
+class di_LDInstPI_diu4<string opc, Intrinsic IntID>
+  : LDInstPI<(outs IntRegs:$dst, DoubleRegs:$dst2),
+           (ins IntRegs:$src1, IntRegs:$src2, CRRegs:$src3, s4Imm:$offset),
+           "$dst2 = memd($src1++#$offset:circ($src3))",
+           [],
+           "$src1 = $dst">;
+
+/********************************************************************
+*            ALU32/ALU                                              *
+*********************************************************************/
+
+// ALU32 / ALU / Add.
+def HEXAGON_A2_add:
+  si_ALU32_sisi                   <"add",      int_hexagon_A2_add>;
+def HEXAGON_A2_addi:
+  si_ALU32_sis16                  <"add",      int_hexagon_A2_addi>;
+
+// ALU32 / ALU / Logical operations.
+def HEXAGON_A2_and:
+  si_ALU32_sisi                   <"and",      int_hexagon_A2_and>;
+def HEXAGON_A2_andir:
+  si_ALU32_sis10                  <"and",      int_hexagon_A2_andir>;
+def HEXAGON_A2_not:
+  si_ALU32_si                     <"not",      int_hexagon_A2_not>;
+def HEXAGON_A2_or:
+  si_ALU32_sisi                   <"or",       int_hexagon_A2_or>;
+def HEXAGON_A2_orir:
+  si_ALU32_sis10                  <"or",       int_hexagon_A2_orir>;
+def HEXAGON_A2_xor:
+  si_ALU32_sisi                   <"xor",      int_hexagon_A2_xor>;
+
+// ALU32 / ALU / Negate.
+def HEXAGON_A2_neg:
+  si_ALU32_si                     <"neg",      int_hexagon_A2_neg>;
+
+// ALU32 / ALU / Subtract.
+def HEXAGON_A2_sub:
+  si_ALU32_sisi                   <"sub",      int_hexagon_A2_sub>;
+def HEXAGON_A2_subri:
+  si_ALU32_s10si                  <"sub",      int_hexagon_A2_subri>;
+
+// ALU32 / ALU / Transfer Immediate.
+def HEXAGON_A2_tfril:
+  si_lo_ALU32_siu16               <"",         int_hexagon_A2_tfril>;
+def HEXAGON_A2_tfrih:
+  si_hi_ALU32_siu16               <"",         int_hexagon_A2_tfrih>;
+def HEXAGON_A2_tfrsi:
+  si_ALU32_s16                    <"",         int_hexagon_A2_tfrsi>;
+def HEXAGON_A2_tfrpi:
+  di_ALU32_s8                     <"",         int_hexagon_A2_tfrpi>;
+
+// ALU32 / ALU / Transfer Register.
+def HEXAGON_A2_tfr:
+  si_ALU32_si_tfr                  <"",        int_hexagon_A2_tfr>;
+
+/********************************************************************
+*            ALU32/PERM                                             *
+*********************************************************************/
+
+// ALU32 / PERM / Combine.
+def HEXAGON_A2_combinew:
+  di_ALU32_sisi                   <"combine",  int_hexagon_A2_combinew>;
+def HEXAGON_A2_combine_hh:
+  si_MInst_sisi_hh                <"combine",  int_hexagon_A2_combine_hh>;
+def HEXAGON_A2_combine_lh:
+  si_MInst_sisi_lh                <"combine",  int_hexagon_A2_combine_lh>;
+def HEXAGON_A2_combine_hl:
+  si_MInst_sisi_hl                <"combine",  int_hexagon_A2_combine_hl>;
+def HEXAGON_A2_combine_ll:
+  si_MInst_sisi_ll                <"combine",  int_hexagon_A2_combine_ll>;
+def HEXAGON_A2_combineii:
+  di_MInst_s8s8                   <"combine",  int_hexagon_A2_combineii>;
+
+// ALU32 / PERM / Mux.
+def HEXAGON_C2_mux:
+  si_ALU32_qisisi                 <"mux",      int_hexagon_C2_mux>;
+def HEXAGON_C2_muxri:
+  si_ALU32_qis8si                 <"mux",      int_hexagon_C2_muxri>;
+def HEXAGON_C2_muxir:
+  si_ALU32_qisis8                 <"mux",      int_hexagon_C2_muxir>;
+def HEXAGON_C2_muxii:
+  si_ALU32_qis8s8                 <"mux",      int_hexagon_C2_muxii>;
+
+// ALU32 / PERM / Shift halfword.
+def HEXAGON_A2_aslh:
+  si_ALU32_si                     <"aslh",     int_hexagon_A2_aslh>;
+def HEXAGON_A2_asrh:
+  si_ALU32_si                     <"asrh",     int_hexagon_A2_asrh>;
+def SI_to_SXTHI_asrh:
+  si_ALU32_si                     <"asrh",     int_hexagon_SI_to_SXTHI_asrh>;
+
+// ALU32 / PERM / Sign/zero extend.
+def HEXAGON_A2_sxth:
+  si_ALU32_si                     <"sxth",     int_hexagon_A2_sxth>;
+def HEXAGON_A2_sxtb:
+  si_ALU32_si                     <"sxtb",     int_hexagon_A2_sxtb>;
+def HEXAGON_A2_zxth:
+  si_ALU32_si                     <"zxth",     int_hexagon_A2_zxth>;
+def HEXAGON_A2_zxtb:
+  si_ALU32_si                     <"zxtb",     int_hexagon_A2_zxtb>;
+
+/********************************************************************
+*            ALU32/PRED                                             *
+*********************************************************************/
+
+// ALU32 / PRED / Compare.
+def HEXAGON_C2_cmpeq:
+  qi_ALU32_sisi                   <"cmp.eq",   int_hexagon_C2_cmpeq>;
+def HEXAGON_C2_cmpeqi:
+  qi_ALU32_sis10                  <"cmp.eq",   int_hexagon_C2_cmpeqi>;
+def HEXAGON_C2_cmpgei:
+  qi_ALU32_sis8                   <"cmp.ge",   int_hexagon_C2_cmpgei>;
+def HEXAGON_C2_cmpgeui:
+  qi_ALU32_siu8                   <"cmp.geu",  int_hexagon_C2_cmpgeui>;
+def HEXAGON_C2_cmpgt:
+  qi_ALU32_sisi                   <"cmp.gt",   int_hexagon_C2_cmpgt>;
+def HEXAGON_C2_cmpgti:
+  qi_ALU32_sis10                  <"cmp.gt",   int_hexagon_C2_cmpgti>;
+def HEXAGON_C2_cmpgtu:
+  qi_ALU32_sisi                   <"cmp.gtu",  int_hexagon_C2_cmpgtu>;
+def HEXAGON_C2_cmpgtui:
+  qi_ALU32_siu9                   <"cmp.gtu",  int_hexagon_C2_cmpgtui>;
+def HEXAGON_C2_cmplt:
+  qi_ALU32_sisi                   <"cmp.lt",   int_hexagon_C2_cmplt>;
+def HEXAGON_C2_cmpltu:
+  qi_ALU32_sisi                   <"cmp.ltu",  int_hexagon_C2_cmpltu>;
+
+/********************************************************************
+*            ALU32/VH                                               *
+*********************************************************************/
+
+// ALU32 / VH / Vector add halfwords.
+// Rd32=vadd[u]h(Rs32,Rt32:sat]
+def HEXAGON_A2_svaddh:
+  si_ALU32_sisi                   <"vaddh",    int_hexagon_A2_svaddh>;
+def HEXAGON_A2_svaddhs:
+  si_ALU32_sisi_sat               <"vaddh",    int_hexagon_A2_svaddhs>;
+def HEXAGON_A2_svadduhs:
+  si_ALU32_sisi_sat               <"vadduh",   int_hexagon_A2_svadduhs>;
+
+// ALU32 / VH / Vector average halfwords.
+def HEXAGON_A2_svavgh:
+  si_ALU32_sisi                   <"vavgh",    int_hexagon_A2_svavgh>;
+def HEXAGON_A2_svavghs:
+  si_ALU32_sisi_rnd               <"vavgh",    int_hexagon_A2_svavghs>;
+def HEXAGON_A2_svnavgh:
+  si_ALU32_sisi                   <"vnavgh",   int_hexagon_A2_svnavgh>;
+
+// ALU32 / VH / Vector subtract halfwords.
+def HEXAGON_A2_svsubh:
+  si_ALU32_sisi                   <"vsubh",    int_hexagon_A2_svsubh>;
+def HEXAGON_A2_svsubhs:
+  si_ALU32_sisi_sat               <"vsubh",    int_hexagon_A2_svsubhs>;
+def HEXAGON_A2_svsubuhs:
+  si_ALU32_sisi_sat               <"vsubuh",   int_hexagon_A2_svsubuhs>;
+
+/********************************************************************
+*            ALU64/ALU                                              *
+*********************************************************************/
+
+// ALU64 / ALU / Add.
+def HEXAGON_A2_addp:
+  di_ALU64_didi                   <"add",      int_hexagon_A2_addp>;
+def HEXAGON_A2_addsat:
+  si_ALU64_sisi_sat               <"add",      int_hexagon_A2_addsat>;
+
+// ALU64 / ALU / Add halfword.
+// Even though the definition says hl, it should be lh -
+//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits.
+def HEXAGON_A2_addh_l16_hl:
+  si_ALU64_sisi_l16_lh            <"add",      int_hexagon_A2_addh_l16_hl>;
+def HEXAGON_A2_addh_l16_ll:
+  si_ALU64_sisi_l16_ll            <"add",      int_hexagon_A2_addh_l16_ll>;
+
+def HEXAGON_A2_addh_l16_sat_hl:
+  si_ALU64_sisi_l16_sat_lh        <"add",      int_hexagon_A2_addh_l16_sat_hl>;
+def HEXAGON_A2_addh_l16_sat_ll:
+  si_ALU64_sisi_l16_sat_ll        <"add",      int_hexagon_A2_addh_l16_sat_ll>;
+
+def HEXAGON_A2_addh_h16_hh:
+  si_ALU64_sisi_h16_hh            <"add",      int_hexagon_A2_addh_h16_hh>;
+def HEXAGON_A2_addh_h16_hl:
+  si_ALU64_sisi_h16_hl            <"add",      int_hexagon_A2_addh_h16_hl>;
+def HEXAGON_A2_addh_h16_lh:
+  si_ALU64_sisi_h16_lh            <"add",      int_hexagon_A2_addh_h16_lh>;
+def HEXAGON_A2_addh_h16_ll:
+  si_ALU64_sisi_h16_ll            <"add",      int_hexagon_A2_addh_h16_ll>;
+
+def HEXAGON_A2_addh_h16_sat_hh:
+  si_ALU64_sisi_h16_sat_hh        <"add",      int_hexagon_A2_addh_h16_sat_hh>;
+def HEXAGON_A2_addh_h16_sat_hl:
+  si_ALU64_sisi_h16_sat_hl        <"add",      int_hexagon_A2_addh_h16_sat_hl>;
+def HEXAGON_A2_addh_h16_sat_lh:
+  si_ALU64_sisi_h16_sat_lh        <"add",      int_hexagon_A2_addh_h16_sat_lh>;
+def HEXAGON_A2_addh_h16_sat_ll:
+  si_ALU64_sisi_h16_sat_ll        <"add",      int_hexagon_A2_addh_h16_sat_ll>;
+
+// ALU64 / ALU / Compare.
+def HEXAGON_C2_cmpeqp:
+  qi_ALU64_didi                   <"cmp.eq",   int_hexagon_C2_cmpeqp>;
+def HEXAGON_C2_cmpgtp:
+  qi_ALU64_didi                   <"cmp.gt",   int_hexagon_C2_cmpgtp>;
+def HEXAGON_C2_cmpgtup:
+  qi_ALU64_didi                   <"cmp.gtu",  int_hexagon_C2_cmpgtup>;
+
+// ALU64 / ALU / Logical operations.
+def HEXAGON_A2_andp:
+  di_ALU64_didi                   <"and",      int_hexagon_A2_andp>;
+def HEXAGON_A2_orp:
+  di_ALU64_didi                   <"or",       int_hexagon_A2_orp>;
+def HEXAGON_A2_xorp:
+  di_ALU64_didi                   <"xor",      int_hexagon_A2_xorp>;
+
+// ALU64 / ALU / Maximum.
+def HEXAGON_A2_max:
+  si_ALU64_sisi                   <"max",      int_hexagon_A2_max>;
+def HEXAGON_A2_maxu:
+  si_ALU64_sisi                   <"maxu",     int_hexagon_A2_maxu>;
+
+// ALU64 / ALU / Minimum.
+def HEXAGON_A2_min:
+  si_ALU64_sisi                   <"min",      int_hexagon_A2_min>;
+def HEXAGON_A2_minu:
+  si_ALU64_sisi                   <"minu",     int_hexagon_A2_minu>;
+
+// ALU64 / ALU / Subtract.
+def HEXAGON_A2_subp:
+  di_ALU64_didi                   <"sub",      int_hexagon_A2_subp>;
+def HEXAGON_A2_subsat:
+  si_ALU64_sisi_sat               <"sub",      int_hexagon_A2_subsat>;
+
+// ALU64 / ALU / Subtract halfword.
+// Even though the definition says hl, it should be lh -
+//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits.
+def HEXAGON_A2_subh_l16_hl:
+  si_ALU64_sisi_l16_lh            <"sub",      int_hexagon_A2_subh_l16_hl>;
+def HEXAGON_A2_subh_l16_ll:
+  si_ALU64_sisi_l16_ll            <"sub",      int_hexagon_A2_subh_l16_ll>;
+
+def HEXAGON_A2_subh_l16_sat_hl:
+  si_ALU64_sisi_l16_sat_lh        <"sub",      int_hexagon_A2_subh_l16_sat_hl>;
+def HEXAGON_A2_subh_l16_sat_ll:
+  si_ALU64_sisi_l16_sat_ll        <"sub",      int_hexagon_A2_subh_l16_sat_ll>;
+
+def HEXAGON_A2_subh_h16_hh:
+  si_ALU64_sisi_h16_hh            <"sub",      int_hexagon_A2_subh_h16_hh>;
+def HEXAGON_A2_subh_h16_hl:
+  si_ALU64_sisi_h16_hl            <"sub",      int_hexagon_A2_subh_h16_hl>;
+def HEXAGON_A2_subh_h16_lh:
+  si_ALU64_sisi_h16_lh            <"sub",      int_hexagon_A2_subh_h16_lh>;
+def HEXAGON_A2_subh_h16_ll:
+  si_ALU64_sisi_h16_ll            <"sub",      int_hexagon_A2_subh_h16_ll>;
+
+def HEXAGON_A2_subh_h16_sat_hh:
+  si_ALU64_sisi_h16_sat_hh        <"sub",      int_hexagon_A2_subh_h16_sat_hh>;
+def HEXAGON_A2_subh_h16_sat_hl:
+  si_ALU64_sisi_h16_sat_hl        <"sub",      int_hexagon_A2_subh_h16_sat_hl>;
+def HEXAGON_A2_subh_h16_sat_lh:
+  si_ALU64_sisi_h16_sat_lh        <"sub",      int_hexagon_A2_subh_h16_sat_lh>;
+def HEXAGON_A2_subh_h16_sat_ll:
+  si_ALU64_sisi_h16_sat_ll        <"sub",      int_hexagon_A2_subh_h16_sat_ll>;
+
+// ALU64 / ALU / Transfer register.
+def HEXAGON_A2_tfrp:
+  di_ALU64_di                     <"",         int_hexagon_A2_tfrp>;
+
+/********************************************************************
+*            ALU64/BIT                                              *
+*********************************************************************/
+
+// ALU64 / BIT / Masked parity.
+def HEXAGON_S2_parityp:
+  si_ALU64_didi                   <"parity",   int_hexagon_S2_parityp>;
+
+/********************************************************************
+*            ALU64/PERM                                             *
+*********************************************************************/
+
+// ALU64 / PERM / Vector pack high and low halfwords.
+def HEXAGON_S2_packhl:
+  di_ALU64_sisi                   <"packhl",   int_hexagon_S2_packhl>;
+
+/********************************************************************
+*            ALU64/VB                                               *
+*********************************************************************/
+
+// ALU64 / VB / Vector add unsigned bytes.
+def HEXAGON_A2_vaddub:
+  di_ALU64_didi                   <"vaddub",   int_hexagon_A2_vaddub>;
+def HEXAGON_A2_vaddubs:
+  di_ALU64_didi_sat               <"vaddub",   int_hexagon_A2_vaddubs>;
+
+// ALU64 / VB / Vector average unsigned bytes.
+def HEXAGON_A2_vavgub:
+  di_ALU64_didi                   <"vavgub",   int_hexagon_A2_vavgub>;
+def HEXAGON_A2_vavgubr:
+  di_ALU64_didi_rnd               <"vavgub",   int_hexagon_A2_vavgubr>;
+
+// ALU64 / VB / Vector compare unsigned bytes.
+def HEXAGON_A2_vcmpbeq:
+  qi_ALU64_didi                   <"vcmpb.eq", int_hexagon_A2_vcmpbeq>;
+def HEXAGON_A2_vcmpbgtu:
+  qi_ALU64_didi                   <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>;
+
+// ALU64 / VB / Vector maximum/minimum unsigned bytes.
+def HEXAGON_A2_vmaxub:
+  di_ALU64_didi                   <"vmaxub",   int_hexagon_A2_vmaxub>;
+def HEXAGON_A2_vminub:
+  di_ALU64_didi                   <"vminub",   int_hexagon_A2_vminub>;
+
+// ALU64 / VB / Vector subtract unsigned bytes.
+def HEXAGON_A2_vsubub:
+  di_ALU64_didi                   <"vsubub",   int_hexagon_A2_vsubub>;
+def HEXAGON_A2_vsububs:
+  di_ALU64_didi_sat               <"vsubub",   int_hexagon_A2_vsububs>;
+
+// ALU64 / VB / Vector mux.
+def HEXAGON_C2_vmux:
+  di_ALU64_qididi                 <"vmux",     int_hexagon_C2_vmux>;
+
+
+/********************************************************************
+*            ALU64/VH                                               *
+*********************************************************************/
+
+// ALU64 / VH / Vector add halfwords.
+// Rdd64=vadd[u]h(Rss64,Rtt64:sat]
+def HEXAGON_A2_vaddh:
+  di_ALU64_didi                   <"vaddh",    int_hexagon_A2_vaddh>;
+def HEXAGON_A2_vaddhs:
+  di_ALU64_didi_sat               <"vaddh",    int_hexagon_A2_vaddhs>;
+def HEXAGON_A2_vadduhs:
+  di_ALU64_didi_sat               <"vadduh",   int_hexagon_A2_vadduhs>;
+
+// ALU64 / VH / Vector average halfwords.
+// Rdd64=v[n]avg[u]h(Rss64,Rtt64:rnd/:crnd][:sat]
+def HEXAGON_A2_vavgh:
+  di_ALU64_didi                   <"vavgh",    int_hexagon_A2_vavgh>;
+def HEXAGON_A2_vavghcr:
+  di_ALU64_didi_crnd              <"vavgh",    int_hexagon_A2_vavghcr>;
+def HEXAGON_A2_vavghr:
+  di_ALU64_didi_rnd               <"vavgh",    int_hexagon_A2_vavghr>;
+def HEXAGON_A2_vavguh:
+  di_ALU64_didi                   <"vavguh",   int_hexagon_A2_vavguh>;
+def HEXAGON_A2_vavguhr:
+  di_ALU64_didi_rnd               <"vavguh",   int_hexagon_A2_vavguhr>;
+def HEXAGON_A2_vnavgh:
+  di_ALU64_didi                   <"vnavgh",   int_hexagon_A2_vnavgh>;
+def HEXAGON_A2_vnavghcr:
+  di_ALU64_didi_crnd_sat          <"vnavgh",   int_hexagon_A2_vnavghcr>;
+def HEXAGON_A2_vnavghr:
+  di_ALU64_didi_rnd_sat           <"vnavgh",   int_hexagon_A2_vnavghr>;
+
+// ALU64 / VH / Vector compare halfwords.
+def HEXAGON_A2_vcmpheq:
+  qi_ALU64_didi                   <"vcmph.eq", int_hexagon_A2_vcmpheq>;
+def HEXAGON_A2_vcmphgt:
+  qi_ALU64_didi                   <"vcmph.gt", int_hexagon_A2_vcmphgt>;
+def HEXAGON_A2_vcmphgtu:
+  qi_ALU64_didi                   <"vcmph.gtu",int_hexagon_A2_vcmphgtu>;
+
+// ALU64 / VH / Vector maximum halfwords.
+def HEXAGON_A2_vmaxh:
+  di_ALU64_didi                   <"vmaxh",    int_hexagon_A2_vmaxh>;
+def HEXAGON_A2_vmaxuh:
+  di_ALU64_didi                   <"vmaxuh",   int_hexagon_A2_vmaxuh>;
+
+// ALU64 / VH / Vector minimum halfwords.
+def HEXAGON_A2_vminh:
+  di_ALU64_didi                   <"vminh",    int_hexagon_A2_vminh>;
+def HEXAGON_A2_vminuh:
+  di_ALU64_didi                   <"vminuh",   int_hexagon_A2_vminuh>;
+
+// ALU64 / VH / Vector subtract halfwords.
+def HEXAGON_A2_vsubh:
+  di_ALU64_didi                   <"vsubh",    int_hexagon_A2_vsubh>;
+def HEXAGON_A2_vsubhs:
+  di_ALU64_didi_sat               <"vsubh",    int_hexagon_A2_vsubhs>;
+def HEXAGON_A2_vsubuhs:
+  di_ALU64_didi_sat               <"vsubuh",   int_hexagon_A2_vsubuhs>;
+
+
+/********************************************************************
+*            ALU64/VW                                               *
+*********************************************************************/
+
+// ALU64 / VW / Vector add words.
+// Rdd32=vaddw(Rss32,Rtt32)[:sat]
+def HEXAGON_A2_vaddw:
+  di_ALU64_didi                   <"vaddw",    int_hexagon_A2_vaddw>;
+def HEXAGON_A2_vaddws:
+  di_ALU64_didi_sat               <"vaddw",   int_hexagon_A2_vaddws>;
+
+// ALU64 / VW / Vector average words.
+def HEXAGON_A2_vavguw:
+  di_ALU64_didi                   <"vavguw",   int_hexagon_A2_vavguw>;
+def HEXAGON_A2_vavguwr:
+  di_ALU64_didi_rnd               <"vavguw",   int_hexagon_A2_vavguwr>;
+def HEXAGON_A2_vavgw:
+  di_ALU64_didi                   <"vavgw",    int_hexagon_A2_vavgw>;
+def HEXAGON_A2_vavgwcr:
+  di_ALU64_didi_crnd              <"vavgw",    int_hexagon_A2_vavgwcr>;
+def HEXAGON_A2_vavgwr:
+  di_ALU64_didi_rnd               <"vavgw",    int_hexagon_A2_vavgwr>;
+def HEXAGON_A2_vnavgw:
+  di_ALU64_didi                   <"vnavgw",   int_hexagon_A2_vnavgw>;
+def HEXAGON_A2_vnavgwcr:
+  di_ALU64_didi_crnd_sat          <"vnavgw",   int_hexagon_A2_vnavgwcr>;
+def HEXAGON_A2_vnavgwr:
+  di_ALU64_didi_rnd_sat           <"vnavgw",   int_hexagon_A2_vnavgwr>;
+
+// ALU64 / VW / Vector compare words.
+def HEXAGON_A2_vcmpweq:
+  qi_ALU64_didi                   <"vcmpw.eq", int_hexagon_A2_vcmpweq>;
+def HEXAGON_A2_vcmpwgt:
+  qi_ALU64_didi                   <"vcmpw.gt", int_hexagon_A2_vcmpwgt>;
+def HEXAGON_A2_vcmpwgtu:
+  qi_ALU64_didi                   <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>;
+
+// ALU64 / VW / Vector maximum words.
+def HEXAGON_A2_vmaxw:
+  di_ALU64_didi                   <"vmaxw",    int_hexagon_A2_vmaxw>;
+def HEXAGON_A2_vmaxuw:
+  di_ALU64_didi                   <"vmaxuw",   int_hexagon_A2_vmaxuw>;
+
+// ALU64 / VW / Vector minimum words.
+def HEXAGON_A2_vminw:
+  di_ALU64_didi                   <"vminw",    int_hexagon_A2_vminw>;
+def HEXAGON_A2_vminuw:
+  di_ALU64_didi                   <"vminuw",   int_hexagon_A2_vminuw>;
+
+// ALU64 / VW / Vector subtract words.
+def HEXAGON_A2_vsubw:
+  di_ALU64_didi                   <"vsubw",    int_hexagon_A2_vsubw>;
+def HEXAGON_A2_vsubws:
+  di_ALU64_didi_sat               <"vsubw",    int_hexagon_A2_vsubws>;
+
+
+/********************************************************************
+*            CR                                                     *
+*********************************************************************/
+
+// CR / Logical reductions on predicates.
+def HEXAGON_C2_all8:
+  qi_SInst_qi                     <"all8",     int_hexagon_C2_all8>;
+def HEXAGON_C2_any8:
+  qi_SInst_qi                     <"any8",     int_hexagon_C2_any8>;
+
+// CR / Logical operations on predicates.
+def HEXAGON_C2_pxfer_map:
+  qi_SInst_qi_pxfer               <"",         int_hexagon_C2_pxfer_map>;
+def HEXAGON_C2_and:
+  qi_SInst_qiqi                   <"and",      int_hexagon_C2_and>;
+def HEXAGON_C2_andn:
+  qi_SInst_qiqi_neg               <"and",      int_hexagon_C2_andn>;
+def HEXAGON_C2_not:
+  qi_SInst_qi                     <"not",      int_hexagon_C2_not>;
+def HEXAGON_C2_or:
+  qi_SInst_qiqi                   <"or",       int_hexagon_C2_or>;
+def HEXAGON_C2_orn:
+  qi_SInst_qiqi_neg               <"or",       int_hexagon_C2_orn>;
+def HEXAGON_C2_xor:
+  qi_SInst_qiqi                   <"xor",      int_hexagon_C2_xor>;
+
+
+/********************************************************************
+*            MTYPE/ALU                                              *
+*********************************************************************/
+
+// MTYPE / ALU / Add and accumulate.
+def HEXAGON_M2_acci:
+  si_MInst_sisisi_acc             <"add",      int_hexagon_M2_acci>;
+def HEXAGON_M2_accii:
+  si_MInst_sisis8_acc             <"add",      int_hexagon_M2_accii>;
+def HEXAGON_M2_nacci:
+  si_MInst_sisisi_nac             <"add",      int_hexagon_M2_nacci>;
+def HEXAGON_M2_naccii:
+  si_MInst_sisis8_nac             <"add",      int_hexagon_M2_naccii>;
+
+// MTYPE / ALU / Subtract and accumulate.
+def HEXAGON_M2_subacc:
+  si_MInst_sisisi_acc             <"sub",      int_hexagon_M2_subacc>;
+
+// MTYPE / ALU / Vector absolute difference.
+def HEXAGON_M2_vabsdiffh:
+  di_MInst_didi                   <"vabsdiffh",int_hexagon_M2_vabsdiffh>;
+def HEXAGON_M2_vabsdiffw:
+  di_MInst_didi                   <"vabsdiffw",int_hexagon_M2_vabsdiffw>;
+
+// MTYPE / ALU / XOR and xor with destination.
+def HEXAGON_M2_xor_xacc:
+  si_MInst_sisisi_xacc            <"xor",      int_hexagon_M2_xor_xacc>;
+
+
+/********************************************************************
+*            MTYPE/COMPLEX                                          *
+*********************************************************************/
+
+// MTYPE / COMPLEX / Complex multiply.
+// Rdd[-+]=cmpy(Rs, Rt:<<1]:sat
+def HEXAGON_M2_cmpys_s1:
+  di_MInst_sisi_s1_sat            <"cmpy",     int_hexagon_M2_cmpys_s1>;
+def HEXAGON_M2_cmpys_s0:
+  di_MInst_sisi_sat               <"cmpy",     int_hexagon_M2_cmpys_s0>;
+def HEXAGON_M2_cmpysc_s1:
+  di_MInst_sisi_s1_sat_conj       <"cmpy",     int_hexagon_M2_cmpysc_s1>;
+def HEXAGON_M2_cmpysc_s0:
+  di_MInst_sisi_sat_conj          <"cmpy",     int_hexagon_M2_cmpysc_s0>;
+
+def HEXAGON_M2_cmacs_s1:
+  di_MInst_disisi_acc_s1_sat      <"cmpy",     int_hexagon_M2_cmacs_s1>;
+def HEXAGON_M2_cmacs_s0:
+  di_MInst_disisi_acc_sat         <"cmpy",     int_hexagon_M2_cmacs_s0>;
+def HEXAGON_M2_cmacsc_s1:
+  di_MInst_disisi_acc_s1_sat_conj <"cmpy",     int_hexagon_M2_cmacsc_s1>;
+def HEXAGON_M2_cmacsc_s0:
+  di_MInst_disisi_acc_sat_conj    <"cmpy",     int_hexagon_M2_cmacsc_s0>;
+
+def HEXAGON_M2_cnacs_s1:
+  di_MInst_disisi_nac_s1_sat      <"cmpy",     int_hexagon_M2_cnacs_s1>;
+def HEXAGON_M2_cnacs_s0:
+  di_MInst_disisi_nac_sat         <"cmpy",     int_hexagon_M2_cnacs_s0>;
+def HEXAGON_M2_cnacsc_s1:
+  di_MInst_disisi_nac_s1_sat_conj <"cmpy",     int_hexagon_M2_cnacsc_s1>;
+def HEXAGON_M2_cnacsc_s0:
+  di_MInst_disisi_nac_sat_conj    <"cmpy",     int_hexagon_M2_cnacsc_s0>;
+
+// MTYPE / COMPLEX / Complex multiply real or imaginary.
+def HEXAGON_M2_cmpyr_s0:
+  di_MInst_sisi                   <"cmpyr",    int_hexagon_M2_cmpyr_s0>;
+def HEXAGON_M2_cmacr_s0:
+  di_MInst_disisi_acc             <"cmpyr",    int_hexagon_M2_cmacr_s0>;
+
+def HEXAGON_M2_cmpyi_s0:
+  di_MInst_sisi                   <"cmpyi",    int_hexagon_M2_cmpyi_s0>;
+def HEXAGON_M2_cmaci_s0:
+  di_MInst_disisi_acc             <"cmpyi",    int_hexagon_M2_cmaci_s0>;
+
+// MTYPE / COMPLEX / Complex multiply with round and pack.
+// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat
+def HEXAGON_M2_cmpyrs_s0:
+  si_MInst_sisi_rnd_sat           <"cmpy",     int_hexagon_M2_cmpyrs_s0>;
+def HEXAGON_M2_cmpyrs_s1:
+  si_MInst_sisi_s1_rnd_sat        <"cmpy",     int_hexagon_M2_cmpyrs_s1>;
+
+def HEXAGON_M2_cmpyrsc_s0:
+  si_MInst_sisi_rnd_sat_conj      <"cmpy",     int_hexagon_M2_cmpyrsc_s0>;
+def HEXAGON_M2_cmpyrsc_s1:
+  si_MInst_sisi_s1_rnd_sat_conj   <"cmpy",     int_hexagon_M2_cmpyrsc_s1>;
+
+//MTYPE / COMPLEX / Vector complex multiply real or imaginary.
+def HEXAGON_M2_vcmpy_s0_sat_i:
+  di_MInst_didi_sat               <"vcmpyi",   int_hexagon_M2_vcmpy_s0_sat_i>;
+def HEXAGON_M2_vcmpy_s1_sat_i:
+  di_MInst_didi_s1_sat            <"vcmpyi",   int_hexagon_M2_vcmpy_s1_sat_i>;
+
+def HEXAGON_M2_vcmpy_s0_sat_r:
+  di_MInst_didi_sat               <"vcmpyr",   int_hexagon_M2_vcmpy_s0_sat_r>;
+def HEXAGON_M2_vcmpy_s1_sat_r:
+  di_MInst_didi_s1_sat            <"vcmpyr",   int_hexagon_M2_vcmpy_s1_sat_r>;
+
+def HEXAGON_M2_vcmac_s0_sat_i:
+  di_MInst_dididi_acc_sat         <"vcmpyi",   int_hexagon_M2_vcmac_s0_sat_i>;
+def HEXAGON_M2_vcmac_s0_sat_r:
+  di_MInst_dididi_acc_sat         <"vcmpyr",   int_hexagon_M2_vcmac_s0_sat_r>;
+
+//MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
+def HEXAGON_M2_vrcmpyi_s0:
+  di_MInst_didi                   <"vrcmpyi",  int_hexagon_M2_vrcmpyi_s0>;
+def HEXAGON_M2_vrcmpyr_s0:
+  di_MInst_didi                   <"vrcmpyr",  int_hexagon_M2_vrcmpyr_s0>;
+
+def HEXAGON_M2_vrcmpyi_s0c:
+  di_MInst_didi_conj              <"vrcmpyi",  int_hexagon_M2_vrcmpyi_s0c>;
+def HEXAGON_M2_vrcmpyr_s0c:
+  di_MInst_didi_conj              <"vrcmpyr",  int_hexagon_M2_vrcmpyr_s0c>;
+
+def HEXAGON_M2_vrcmaci_s0:
+  di_MInst_dididi_acc             <"vrcmpyi",  int_hexagon_M2_vrcmaci_s0>;
+def HEXAGON_M2_vrcmacr_s0:
+  di_MInst_dididi_acc             <"vrcmpyr",  int_hexagon_M2_vrcmacr_s0>;
+
+def HEXAGON_M2_vrcmaci_s0c:
+  di_MInst_dididi_acc_conj        <"vrcmpyi",  int_hexagon_M2_vrcmaci_s0c>;
+def HEXAGON_M2_vrcmacr_s0c:
+  di_MInst_dididi_acc_conj        <"vrcmpyr",  int_hexagon_M2_vrcmacr_s0c>;
+
+
+/********************************************************************
+*            MTYPE/MPYH                                             *
+*********************************************************************/
+
+// MTYPE / MPYH / Multiply and use lower result.
+//def HEXAGON_M2_mpysmi:
+//FIXME: Hexagon_M2_mpysmi should really by of the type si_MInst_sim9,
+// not si_MInst_sis9 - but for now, we will use s9.
+// def Hexagon_M2_mpysmi:
+//  si_MInst_sim9                   <"mpyi",     int_hexagon_M2_mpysmi>;
+def Hexagon_M2_mpysmi:
+  si_MInst_sis9                   <"mpyi",     int_hexagon_M2_mpysmi>;
+def HEXAGON_M2_mpyi:
+  si_MInst_sisi                   <"mpyi",     int_hexagon_M2_mpyi>;
+def HEXAGON_M2_mpyui:
+  si_MInst_sisi                   <"mpyui",    int_hexagon_M2_mpyui>;
+def HEXAGON_M2_macsip:
+  si_MInst_sisiu8_acc             <"mpyi",     int_hexagon_M2_macsip>;
+def HEXAGON_M2_maci:
+  si_MInst_sisisi_acc             <"mpyi",     int_hexagon_M2_maci>;
+def HEXAGON_M2_macsin:
+  si_MInst_sisiu8_nac             <"mpyi",     int_hexagon_M2_macsin>;
+
+// MTYPE / MPYH / Multiply word by half (32x16).
+//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat]
+//Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat]
+def HEXAGON_M2_mmpyl_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpyweh",  int_hexagon_M2_mmpyl_rs1>;
+def HEXAGON_M2_mmpyl_s1:
+  di_MInst_didi_s1_sat            <"vmpyweh",  int_hexagon_M2_mmpyl_s1>;
+def HEXAGON_M2_mmpyl_rs0:
+  di_MInst_didi_rnd_sat           <"vmpyweh",  int_hexagon_M2_mmpyl_rs0>;
+def HEXAGON_M2_mmpyl_s0:
+  di_MInst_didi_sat               <"vmpyweh",  int_hexagon_M2_mmpyl_s0>;
+def HEXAGON_M2_mmpyh_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpywoh",  int_hexagon_M2_mmpyh_rs1>;
+def HEXAGON_M2_mmpyh_s1:
+  di_MInst_didi_s1_sat            <"vmpywoh",  int_hexagon_M2_mmpyh_s1>;
+def HEXAGON_M2_mmpyh_rs0:
+  di_MInst_didi_rnd_sat           <"vmpywoh",  int_hexagon_M2_mmpyh_rs0>;
+def HEXAGON_M2_mmpyh_s0:
+  di_MInst_didi_sat               <"vmpywoh",  int_hexagon_M2_mmpyh_s0>;
+def HEXAGON_M2_mmacls_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpyweh",  int_hexagon_M2_mmacls_rs1>;
+def HEXAGON_M2_mmacls_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpyweh",  int_hexagon_M2_mmacls_s1>;
+def HEXAGON_M2_mmacls_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpyweh",  int_hexagon_M2_mmacls_rs0>;
+def HEXAGON_M2_mmacls_s0:
+  di_MInst_dididi_acc_sat         <"vmpyweh",  int_hexagon_M2_mmacls_s0>;
+def HEXAGON_M2_mmachs_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpywoh",  int_hexagon_M2_mmachs_rs1>;
+def HEXAGON_M2_mmachs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpywoh",  int_hexagon_M2_mmachs_s1>;
+def HEXAGON_M2_mmachs_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpywoh",  int_hexagon_M2_mmachs_rs0>;
+def HEXAGON_M2_mmachs_s0:
+  di_MInst_dididi_acc_sat         <"vmpywoh",  int_hexagon_M2_mmachs_s0>;
+
+// MTYPE / MPYH / Multiply word by unsigned half (32x16).
+//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat]
+//Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat]
+def HEXAGON_M2_mmpyul_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>;
+def HEXAGON_M2_mmpyul_s1:
+  di_MInst_didi_s1_sat            <"vmpyweuh", int_hexagon_M2_mmpyul_s1>;
+def HEXAGON_M2_mmpyul_rs0:
+  di_MInst_didi_rnd_sat           <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>;
+def HEXAGON_M2_mmpyul_s0:
+  di_MInst_didi_sat               <"vmpyweuh", int_hexagon_M2_mmpyul_s0>;
+def HEXAGON_M2_mmpyuh_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>;
+def HEXAGON_M2_mmpyuh_s1:
+  di_MInst_didi_s1_sat            <"vmpywouh", int_hexagon_M2_mmpyuh_s1>;
+def HEXAGON_M2_mmpyuh_rs0:
+  di_MInst_didi_rnd_sat           <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>;
+def HEXAGON_M2_mmpyuh_s0:
+  di_MInst_didi_sat               <"vmpywouh", int_hexagon_M2_mmpyuh_s0>;
+def HEXAGON_M2_mmaculs_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>;
+def HEXAGON_M2_mmaculs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpyweuh", int_hexagon_M2_mmaculs_s1>;
+def HEXAGON_M2_mmaculs_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>;
+def HEXAGON_M2_mmaculs_s0:
+  di_MInst_dididi_acc_sat         <"vmpyweuh", int_hexagon_M2_mmaculs_s0>;
+def HEXAGON_M2_mmacuhs_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>;
+def HEXAGON_M2_mmacuhs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpywouh", int_hexagon_M2_mmacuhs_s1>;
+def HEXAGON_M2_mmacuhs_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>;
+def HEXAGON_M2_mmacuhs_s0:
+  di_MInst_dididi_acc_sat         <"vmpywouh", int_hexagon_M2_mmacuhs_s0>;
+
+// MTYPE / MPYH / Multiply and use upper result.
+def HEXAGON_M2_hmmpyh_rs1:
+  si_MInst_sisi_h_s1_rnd_sat      <"mpy",      int_hexagon_M2_hmmpyh_rs1>;
+def HEXAGON_M2_hmmpyl_rs1:
+  si_MInst_sisi_l_s1_rnd_sat      <"mpy",      int_hexagon_M2_hmmpyl_rs1>;
+def HEXAGON_M2_mpy_up:
+  si_MInst_sisi                   <"mpy",      int_hexagon_M2_mpy_up>;
+def HEXAGON_M2_dpmpyss_rnd_s0:
+  si_MInst_sisi_rnd               <"mpy",      int_hexagon_M2_dpmpyss_rnd_s0>;
+def HEXAGON_M2_mpyu_up:
+  si_MInst_sisi                   <"mpyu",     int_hexagon_M2_mpyu_up>;
+
+// MTYPE / MPYH / Multiply and use full result.
+def HEXAGON_M2_dpmpyuu_s0:
+  di_MInst_sisi                   <"mpyu",     int_hexagon_M2_dpmpyuu_s0>;
+def HEXAGON_M2_dpmpyuu_acc_s0:
+  di_MInst_disisi_acc             <"mpyu",     int_hexagon_M2_dpmpyuu_acc_s0>;
+def HEXAGON_M2_dpmpyuu_nac_s0:
+  di_MInst_disisi_nac             <"mpyu",     int_hexagon_M2_dpmpyuu_nac_s0>;
+def HEXAGON_M2_dpmpyss_s0:
+  di_MInst_sisi                   <"mpy",      int_hexagon_M2_dpmpyss_s0>;
+def HEXAGON_M2_dpmpyss_acc_s0:
+  di_MInst_disisi_acc             <"mpy",      int_hexagon_M2_dpmpyss_acc_s0>;
+def HEXAGON_M2_dpmpyss_nac_s0:
+  di_MInst_disisi_nac             <"mpy",      int_hexagon_M2_dpmpyss_nac_s0>;
+
+
+/********************************************************************
+*            MTYPE/MPYS                                             *
+*********************************************************************/
+
+// MTYPE / MPYS / Scalar 16x16 multiply signed.
+//Rd=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]|
+//          [:<<0[:rnd|:sat|:rnd:sat]|:<<1[:rnd|:sat|:rnd:sat]]]
+def HEXAGON_M2_mpy_hh_s0:
+  si_MInst_sisi_hh                <"mpy",     int_hexagon_M2_mpy_hh_s0>;
+def HEXAGON_M2_mpy_hh_s1:
+  si_MInst_sisi_hh_s1             <"mpy",     int_hexagon_M2_mpy_hh_s1>;
+def HEXAGON_M2_mpy_rnd_hh_s1:
+  si_MInst_sisi_rnd_hh_s1         <"mpy",     int_hexagon_M2_mpy_rnd_hh_s1>;
+def HEXAGON_M2_mpy_sat_rnd_hh_s1:
+  si_MInst_sisi_sat_rnd_hh_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_hh_s1>;
+def HEXAGON_M2_mpy_sat_hh_s1:
+  si_MInst_sisi_sat_hh_s1         <"mpy",     int_hexagon_M2_mpy_sat_hh_s1>;
+def HEXAGON_M2_mpy_rnd_hh_s0:
+  si_MInst_sisi_rnd_hh            <"mpy",     int_hexagon_M2_mpy_rnd_hh_s0>;
+def HEXAGON_M2_mpy_sat_rnd_hh_s0:
+  si_MInst_sisi_sat_rnd_hh        <"mpy",     int_hexagon_M2_mpy_sat_rnd_hh_s0>;
+def HEXAGON_M2_mpy_sat_hh_s0:
+  si_MInst_sisi_sat_hh            <"mpy",     int_hexagon_M2_mpy_sat_hh_s0>;
+
+def HEXAGON_M2_mpy_hl_s0:
+  si_MInst_sisi_hl                <"mpy",     int_hexagon_M2_mpy_hl_s0>;
+def HEXAGON_M2_mpy_hl_s1:
+  si_MInst_sisi_hl_s1             <"mpy",     int_hexagon_M2_mpy_hl_s1>;
+def HEXAGON_M2_mpy_rnd_hl_s1:
+  si_MInst_sisi_rnd_hl_s1         <"mpy",     int_hexagon_M2_mpy_rnd_hl_s1>;
+def HEXAGON_M2_mpy_sat_rnd_hl_s1:
+  si_MInst_sisi_sat_rnd_hl_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_hl_s1>;
+def HEXAGON_M2_mpy_sat_hl_s1:
+  si_MInst_sisi_sat_hl_s1         <"mpy",     int_hexagon_M2_mpy_sat_hl_s1>;
+def HEXAGON_M2_mpy_rnd_hl_s0:
+  si_MInst_sisi_rnd_hl            <"mpy",     int_hexagon_M2_mpy_rnd_hl_s0>;
+def HEXAGON_M2_mpy_sat_rnd_hl_s0:
+  si_MInst_sisi_sat_rnd_hl        <"mpy",     int_hexagon_M2_mpy_sat_rnd_hl_s0>;
+def HEXAGON_M2_mpy_sat_hl_s0:
+  si_MInst_sisi_sat_hl            <"mpy",     int_hexagon_M2_mpy_sat_hl_s0>;
+
+def HEXAGON_M2_mpy_lh_s0:
+  si_MInst_sisi_lh                <"mpy",     int_hexagon_M2_mpy_lh_s0>;
+def HEXAGON_M2_mpy_lh_s1:
+  si_MInst_sisi_lh_s1             <"mpy",     int_hexagon_M2_mpy_lh_s1>;
+def HEXAGON_M2_mpy_rnd_lh_s1:
+  si_MInst_sisi_rnd_lh_s1         <"mpy",     int_hexagon_M2_mpy_rnd_lh_s1>;
+def HEXAGON_M2_mpy_sat_rnd_lh_s1:
+  si_MInst_sisi_sat_rnd_lh_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_lh_s1>;
+def HEXAGON_M2_mpy_sat_lh_s1:
+  si_MInst_sisi_sat_lh_s1         <"mpy",     int_hexagon_M2_mpy_sat_lh_s1>;
+def HEXAGON_M2_mpy_rnd_lh_s0:
+  si_MInst_sisi_rnd_lh            <"mpy",     int_hexagon_M2_mpy_rnd_lh_s0>;
+def HEXAGON_M2_mpy_sat_rnd_lh_s0:
+  si_MInst_sisi_sat_rnd_lh        <"mpy",     int_hexagon_M2_mpy_sat_rnd_lh_s0>;
+def HEXAGON_M2_mpy_sat_lh_s0:
+  si_MInst_sisi_sat_lh            <"mpy",     int_hexagon_M2_mpy_sat_lh_s0>;
+
+def HEXAGON_M2_mpy_ll_s0:
+  si_MInst_sisi_ll                <"mpy",     int_hexagon_M2_mpy_ll_s0>;
+def HEXAGON_M2_mpy_ll_s1:
+  si_MInst_sisi_ll_s1             <"mpy",     int_hexagon_M2_mpy_ll_s1>;
+def HEXAGON_M2_mpy_rnd_ll_s1:
+  si_MInst_sisi_rnd_ll_s1         <"mpy",     int_hexagon_M2_mpy_rnd_ll_s1>;
+def HEXAGON_M2_mpy_sat_rnd_ll_s1:
+  si_MInst_sisi_sat_rnd_ll_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_ll_s1>;
+def HEXAGON_M2_mpy_sat_ll_s1:
+  si_MInst_sisi_sat_ll_s1         <"mpy",     int_hexagon_M2_mpy_sat_ll_s1>;
+def HEXAGON_M2_mpy_rnd_ll_s0:
+  si_MInst_sisi_rnd_ll            <"mpy",     int_hexagon_M2_mpy_rnd_ll_s0>;
+def HEXAGON_M2_mpy_sat_rnd_ll_s0:
+  si_MInst_sisi_sat_rnd_ll        <"mpy",     int_hexagon_M2_mpy_sat_rnd_ll_s0>;
+def HEXAGON_M2_mpy_sat_ll_s0:
+  si_MInst_sisi_sat_ll            <"mpy",     int_hexagon_M2_mpy_sat_ll_s0>;
+
+//Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]]
+def HEXAGON_M2_mpyd_hh_s0:
+  di_MInst_sisi_hh                <"mpy",     int_hexagon_M2_mpyd_hh_s0>;
+def HEXAGON_M2_mpyd_hh_s1:
+  di_MInst_sisi_hh_s1             <"mpy",     int_hexagon_M2_mpyd_hh_s1>;
+def HEXAGON_M2_mpyd_rnd_hh_s1:
+  di_MInst_sisi_rnd_hh_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_hh_s1>;
+def HEXAGON_M2_mpyd_rnd_hh_s0:
+  di_MInst_sisi_rnd_hh            <"mpy",     int_hexagon_M2_mpyd_rnd_hh_s0>;
+
+def HEXAGON_M2_mpyd_hl_s0:
+  di_MInst_sisi_hl                <"mpy",     int_hexagon_M2_mpyd_hl_s0>;
+def HEXAGON_M2_mpyd_hl_s1:
+  di_MInst_sisi_hl_s1             <"mpy",     int_hexagon_M2_mpyd_hl_s1>;
+def HEXAGON_M2_mpyd_rnd_hl_s1:
+  di_MInst_sisi_rnd_hl_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_hl_s1>;
+def HEXAGON_M2_mpyd_rnd_hl_s0:
+  di_MInst_sisi_rnd_hl            <"mpy",     int_hexagon_M2_mpyd_rnd_hl_s0>;
+
+def HEXAGON_M2_mpyd_lh_s0:
+  di_MInst_sisi_lh                <"mpy",     int_hexagon_M2_mpyd_lh_s0>;
+def HEXAGON_M2_mpyd_lh_s1:
+  di_MInst_sisi_lh_s1             <"mpy",     int_hexagon_M2_mpyd_lh_s1>;
+def HEXAGON_M2_mpyd_rnd_lh_s1:
+  di_MInst_sisi_rnd_lh_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_lh_s1>;
+def HEXAGON_M2_mpyd_rnd_lh_s0:
+  di_MInst_sisi_rnd_lh            <"mpy",     int_hexagon_M2_mpyd_rnd_lh_s0>;
+
+def HEXAGON_M2_mpyd_ll_s0:
+  di_MInst_sisi_ll                <"mpy",     int_hexagon_M2_mpyd_ll_s0>;
+def HEXAGON_M2_mpyd_ll_s1:
+  di_MInst_sisi_ll_s1             <"mpy",     int_hexagon_M2_mpyd_ll_s1>;
+def HEXAGON_M2_mpyd_rnd_ll_s1:
+  di_MInst_sisi_rnd_ll_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_ll_s1>;
+def HEXAGON_M2_mpyd_rnd_ll_s0:
+  di_MInst_sisi_rnd_ll            <"mpy",     int_hexagon_M2_mpyd_rnd_ll_s0>;
+
+//Rx+=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]]
+def HEXAGON_M2_mpy_acc_hh_s0:
+  si_MInst_sisisi_acc_hh            <"mpy",  int_hexagon_M2_mpy_acc_hh_s0>;
+def HEXAGON_M2_mpy_acc_hh_s1:
+  si_MInst_sisisi_acc_hh_s1         <"mpy",  int_hexagon_M2_mpy_acc_hh_s1>;
+def HEXAGON_M2_mpy_acc_sat_hh_s1:
+  si_MInst_sisisi_acc_sat_hh_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_hh_s1>;
+def HEXAGON_M2_mpy_acc_sat_hh_s0:
+  si_MInst_sisisi_acc_sat_hh        <"mpy",  int_hexagon_M2_mpy_acc_sat_hh_s0>;
+
+def HEXAGON_M2_mpy_acc_hl_s0:
+  si_MInst_sisisi_acc_hl            <"mpy",  int_hexagon_M2_mpy_acc_hl_s0>;
+def HEXAGON_M2_mpy_acc_hl_s1:
+  si_MInst_sisisi_acc_hl_s1         <"mpy",  int_hexagon_M2_mpy_acc_hl_s1>;
+def HEXAGON_M2_mpy_acc_sat_hl_s1:
+  si_MInst_sisisi_acc_sat_hl_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_hl_s1>;
+def HEXAGON_M2_mpy_acc_sat_hl_s0:
+  si_MInst_sisisi_acc_sat_hl        <"mpy",  int_hexagon_M2_mpy_acc_sat_hl_s0>;
+
+def HEXAGON_M2_mpy_acc_lh_s0:
+  si_MInst_sisisi_acc_lh            <"mpy",  int_hexagon_M2_mpy_acc_lh_s0>;
+def HEXAGON_M2_mpy_acc_lh_s1:
+  si_MInst_sisisi_acc_lh_s1         <"mpy",  int_hexagon_M2_mpy_acc_lh_s1>;
+def HEXAGON_M2_mpy_acc_sat_lh_s1:
+  si_MInst_sisisi_acc_sat_lh_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_lh_s1>;
+def HEXAGON_M2_mpy_acc_sat_lh_s0:
+  si_MInst_sisisi_acc_sat_lh        <"mpy",  int_hexagon_M2_mpy_acc_sat_lh_s0>;
+
+def HEXAGON_M2_mpy_acc_ll_s0:
+  si_MInst_sisisi_acc_ll            <"mpy",  int_hexagon_M2_mpy_acc_ll_s0>;
+def HEXAGON_M2_mpy_acc_ll_s1:
+  si_MInst_sisisi_acc_ll_s1         <"mpy",  int_hexagon_M2_mpy_acc_ll_s1>;
+def HEXAGON_M2_mpy_acc_sat_ll_s1:
+  si_MInst_sisisi_acc_sat_ll_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_ll_s1>;
+def HEXAGON_M2_mpy_acc_sat_ll_s0:
+  si_MInst_sisisi_acc_sat_ll        <"mpy",  int_hexagon_M2_mpy_acc_sat_ll_s0>;
+
+//Rx-=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]]
+def HEXAGON_M2_mpy_nac_hh_s0:
+  si_MInst_sisisi_nac_hh            <"mpy",  int_hexagon_M2_mpy_nac_hh_s0>;
+def HEXAGON_M2_mpy_nac_hh_s1:
+  si_MInst_sisisi_nac_hh_s1         <"mpy",  int_hexagon_M2_mpy_nac_hh_s1>;
+def HEXAGON_M2_mpy_nac_sat_hh_s1:
+  si_MInst_sisisi_nac_sat_hh_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_hh_s1>;
+def HEXAGON_M2_mpy_nac_sat_hh_s0:
+  si_MInst_sisisi_nac_sat_hh        <"mpy",  int_hexagon_M2_mpy_nac_sat_hh_s0>;
+
+def HEXAGON_M2_mpy_nac_hl_s0:
+  si_MInst_sisisi_nac_hl            <"mpy",  int_hexagon_M2_mpy_nac_hl_s0>;
+def HEXAGON_M2_mpy_nac_hl_s1:
+  si_MInst_sisisi_nac_hl_s1         <"mpy",  int_hexagon_M2_mpy_nac_hl_s1>;
+def HEXAGON_M2_mpy_nac_sat_hl_s1:
+  si_MInst_sisisi_nac_sat_hl_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_hl_s1>;
+def HEXAGON_M2_mpy_nac_sat_hl_s0:
+  si_MInst_sisisi_nac_sat_hl        <"mpy",  int_hexagon_M2_mpy_nac_sat_hl_s0>;
+
+def HEXAGON_M2_mpy_nac_lh_s0:
+  si_MInst_sisisi_nac_lh            <"mpy",  int_hexagon_M2_mpy_nac_lh_s0>;
+def HEXAGON_M2_mpy_nac_lh_s1:
+  si_MInst_sisisi_nac_lh_s1         <"mpy",  int_hexagon_M2_mpy_nac_lh_s1>;
+def HEXAGON_M2_mpy_nac_sat_lh_s1:
+  si_MInst_sisisi_nac_sat_lh_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_lh_s1>;
+def HEXAGON_M2_mpy_nac_sat_lh_s0:
+  si_MInst_sisisi_nac_sat_lh        <"mpy",  int_hexagon_M2_mpy_nac_sat_lh_s0>;
+
+def HEXAGON_M2_mpy_nac_ll_s0:
+  si_MInst_sisisi_nac_ll            <"mpy",  int_hexagon_M2_mpy_nac_ll_s0>;
+def HEXAGON_M2_mpy_nac_ll_s1:
+  si_MInst_sisisi_nac_ll_s1         <"mpy",  int_hexagon_M2_mpy_nac_ll_s1>;
+def HEXAGON_M2_mpy_nac_sat_ll_s1:
+  si_MInst_sisisi_nac_sat_ll_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_ll_s1>;
+def HEXAGON_M2_mpy_nac_sat_ll_s0:
+  si_MInst_sisisi_nac_sat_ll        <"mpy",  int_hexagon_M2_mpy_nac_sat_ll_s0>;
+
+//Rx+=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]
+def HEXAGON_M2_mpyd_acc_hh_s0:
+  di_MInst_disisi_acc_hh          <"mpy",    int_hexagon_M2_mpyd_acc_hh_s0>;
+def HEXAGON_M2_mpyd_acc_hh_s1:
+  di_MInst_disisi_acc_hh_s1       <"mpy",    int_hexagon_M2_mpyd_acc_hh_s1>;
+
+def HEXAGON_M2_mpyd_acc_hl_s0:
+  di_MInst_disisi_acc_hl          <"mpy",    int_hexagon_M2_mpyd_acc_hl_s0>;
+def HEXAGON_M2_mpyd_acc_hl_s1:
+  di_MInst_disisi_acc_hl_s1       <"mpy",    int_hexagon_M2_mpyd_acc_hl_s1>;
+
+def HEXAGON_M2_mpyd_acc_lh_s0:
+  di_MInst_disisi_acc_lh          <"mpy",    int_hexagon_M2_mpyd_acc_lh_s0>;
+def HEXAGON_M2_mpyd_acc_lh_s1:
+  di_MInst_disisi_acc_lh_s1       <"mpy",    int_hexagon_M2_mpyd_acc_lh_s1>;
+
+def HEXAGON_M2_mpyd_acc_ll_s0:
+  di_MInst_disisi_acc_ll          <"mpy",    int_hexagon_M2_mpyd_acc_ll_s0>;
+def HEXAGON_M2_mpyd_acc_ll_s1:
+  di_MInst_disisi_acc_ll_s1       <"mpy",    int_hexagon_M2_mpyd_acc_ll_s1>;
+
+//Rx-=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]
+def HEXAGON_M2_mpyd_nac_hh_s0:
+  di_MInst_disisi_nac_hh          <"mpy",    int_hexagon_M2_mpyd_nac_hh_s0>;
+def HEXAGON_M2_mpyd_nac_hh_s1:
+  di_MInst_disisi_nac_hh_s1       <"mpy",    int_hexagon_M2_mpyd_nac_hh_s1>;
+
+def HEXAGON_M2_mpyd_nac_hl_s0:
+  di_MInst_disisi_nac_hl          <"mpy",    int_hexagon_M2_mpyd_nac_hl_s0>;
+def HEXAGON_M2_mpyd_nac_hl_s1:
+  di_MInst_disisi_nac_hl_s1       <"mpy",    int_hexagon_M2_mpyd_nac_hl_s1>;
+
+def HEXAGON_M2_mpyd_nac_lh_s0:
+  di_MInst_disisi_nac_lh          <"mpy",    int_hexagon_M2_mpyd_nac_lh_s0>;
+def HEXAGON_M2_mpyd_nac_lh_s1:
+  di_MInst_disisi_nac_lh_s1       <"mpy",    int_hexagon_M2_mpyd_nac_lh_s1>;
+
+def HEXAGON_M2_mpyd_nac_ll_s0:
+  di_MInst_disisi_nac_ll          <"mpy",    int_hexagon_M2_mpyd_nac_ll_s0>;
+def HEXAGON_M2_mpyd_nac_ll_s1:
+  di_MInst_disisi_nac_ll_s1       <"mpy",    int_hexagon_M2_mpyd_nac_ll_s1>;
+
+// MTYPE / MPYS / Scalar 16x16 multiply unsigned.
+//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyu_hh_s0:
+  si_MInst_sisi_hh                <"mpyu",    int_hexagon_M2_mpyu_hh_s0>;
+def HEXAGON_M2_mpyu_hh_s1:
+  si_MInst_sisi_hh_s1             <"mpyu",    int_hexagon_M2_mpyu_hh_s1>;
+def HEXAGON_M2_mpyu_hl_s0:
+  si_MInst_sisi_hl                <"mpyu",    int_hexagon_M2_mpyu_hl_s0>;
+def HEXAGON_M2_mpyu_hl_s1:
+  si_MInst_sisi_hl_s1             <"mpyu",    int_hexagon_M2_mpyu_hl_s1>;
+def HEXAGON_M2_mpyu_lh_s0:
+  si_MInst_sisi_lh                <"mpyu",    int_hexagon_M2_mpyu_lh_s0>;
+def HEXAGON_M2_mpyu_lh_s1:
+  si_MInst_sisi_lh_s1             <"mpyu",    int_hexagon_M2_mpyu_lh_s1>;
+def HEXAGON_M2_mpyu_ll_s0:
+  si_MInst_sisi_ll                <"mpyu",    int_hexagon_M2_mpyu_ll_s0>;
+def HEXAGON_M2_mpyu_ll_s1:
+  si_MInst_sisi_ll_s1             <"mpyu",    int_hexagon_M2_mpyu_ll_s1>;
+
+//Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyud_hh_s0:
+  di_MInst_sisi_hh                <"mpyu",    int_hexagon_M2_mpyud_hh_s0>;
+def HEXAGON_M2_mpyud_hh_s1:
+  di_MInst_sisi_hh_s1             <"mpyu",    int_hexagon_M2_mpyud_hh_s1>;
+def HEXAGON_M2_mpyud_hl_s0:
+  di_MInst_sisi_hl                <"mpyu",    int_hexagon_M2_mpyud_hl_s0>;
+def HEXAGON_M2_mpyud_hl_s1:
+  di_MInst_sisi_hl_s1             <"mpyu",    int_hexagon_M2_mpyud_hl_s1>;
+def HEXAGON_M2_mpyud_lh_s0:
+  di_MInst_sisi_lh                <"mpyu",    int_hexagon_M2_mpyud_lh_s0>;
+def HEXAGON_M2_mpyud_lh_s1:
+  di_MInst_sisi_lh_s1             <"mpyu",    int_hexagon_M2_mpyud_lh_s1>;
+def HEXAGON_M2_mpyud_ll_s0:
+  di_MInst_sisi_ll                <"mpyu",    int_hexagon_M2_mpyud_ll_s0>;
+def HEXAGON_M2_mpyud_ll_s1:
+  di_MInst_sisi_ll_s1             <"mpyu",    int_hexagon_M2_mpyud_ll_s1>;
+
+//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyu_acc_hh_s0:
+  si_MInst_sisisi_acc_hh            <"mpyu",    int_hexagon_M2_mpyu_acc_hh_s0>;
+def HEXAGON_M2_mpyu_acc_hh_s1:
+  si_MInst_sisisi_acc_hh_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_hh_s1>;
+def HEXAGON_M2_mpyu_acc_hl_s0:
+  si_MInst_sisisi_acc_hl            <"mpyu",    int_hexagon_M2_mpyu_acc_hl_s0>;
+def HEXAGON_M2_mpyu_acc_hl_s1:
+  si_MInst_sisisi_acc_hl_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_hl_s1>;
+def HEXAGON_M2_mpyu_acc_lh_s0:
+  si_MInst_sisisi_acc_lh            <"mpyu",    int_hexagon_M2_mpyu_acc_lh_s0>;
+def HEXAGON_M2_mpyu_acc_lh_s1:
+  si_MInst_sisisi_acc_lh_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_lh_s1>;
+def HEXAGON_M2_mpyu_acc_ll_s0:
+  si_MInst_sisisi_acc_ll            <"mpyu",    int_hexagon_M2_mpyu_acc_ll_s0>;
+def HEXAGON_M2_mpyu_acc_ll_s1:
+  si_MInst_sisisi_acc_ll_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_ll_s1>;
+
+//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyu_nac_hh_s0:
+  si_MInst_sisisi_nac_hh            <"mpyu",    int_hexagon_M2_mpyu_nac_hh_s0>;
+def HEXAGON_M2_mpyu_nac_hh_s1:
+  si_MInst_sisisi_nac_hh_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_hh_s1>;
+def HEXAGON_M2_mpyu_nac_hl_s0:
+  si_MInst_sisisi_nac_hl            <"mpyu",    int_hexagon_M2_mpyu_nac_hl_s0>;
+def HEXAGON_M2_mpyu_nac_hl_s1:
+  si_MInst_sisisi_nac_hl_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_hl_s1>;
+def HEXAGON_M2_mpyu_nac_lh_s0:
+  si_MInst_sisisi_nac_lh            <"mpyu",    int_hexagon_M2_mpyu_nac_lh_s0>;
+def HEXAGON_M2_mpyu_nac_lh_s1:
+  si_MInst_sisisi_nac_lh_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_lh_s1>;
+def HEXAGON_M2_mpyu_nac_ll_s0:
+  si_MInst_sisisi_nac_ll            <"mpyu",    int_hexagon_M2_mpyu_nac_ll_s0>;
+def HEXAGON_M2_mpyu_nac_ll_s1:
+  si_MInst_sisisi_nac_ll_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_ll_s1>;
+
+//Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyud_acc_hh_s0:
+  di_MInst_disisi_acc_hh            <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>;
+def HEXAGON_M2_mpyud_acc_hh_s1:
+  di_MInst_disisi_acc_hh_s1         <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>;
+def HEXAGON_M2_mpyud_acc_hl_s0:
+  di_MInst_disisi_acc_hl            <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>;
+def HEXAGON_M2_mpyud_acc_hl_s1:
+  di_MInst_disisi_acc_hl_s1         <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>;
+def HEXAGON_M2_mpyud_acc_lh_s0:
+  di_MInst_disisi_acc_lh            <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>;
+def HEXAGON_M2_mpyud_acc_lh_s1:
+  di_MInst_disisi_acc_lh_s1         <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>;
+def HEXAGON_M2_mpyud_acc_ll_s0:
+  di_MInst_disisi_acc_ll            <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>;
+def HEXAGON_M2_mpyud_acc_ll_s1:
+  di_MInst_disisi_acc_ll_s1         <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>;
+
+//Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyud_nac_hh_s0:
+  di_MInst_disisi_nac_hh            <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>;
+def HEXAGON_M2_mpyud_nac_hh_s1:
+  di_MInst_disisi_nac_hh_s1         <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>;
+def HEXAGON_M2_mpyud_nac_hl_s0:
+  di_MInst_disisi_nac_hl            <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>;
+def HEXAGON_M2_mpyud_nac_hl_s1:
+  di_MInst_disisi_nac_hl_s1         <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>;
+def HEXAGON_M2_mpyud_nac_lh_s0:
+  di_MInst_disisi_nac_lh            <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>;
+def HEXAGON_M2_mpyud_nac_lh_s1:
+  di_MInst_disisi_nac_lh_s1         <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>;
+def HEXAGON_M2_mpyud_nac_ll_s0:
+  di_MInst_disisi_nac_ll            <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>;
+def HEXAGON_M2_mpyud_nac_ll_s1:
+  di_MInst_disisi_nac_ll_s1         <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>;
+
+
+/********************************************************************
+*            MTYPE/VB                                               *
+*********************************************************************/
+
+// MTYPE / VB / Vector reduce add unsigned bytes.
+def HEXAGON_A2_vraddub:
+  di_MInst_didi                   <"vraddub", int_hexagon_A2_vraddub>;
+def HEXAGON_A2_vraddub_acc:
+  di_MInst_dididi_acc             <"vraddub", int_hexagon_A2_vraddub_acc>;
+
+// MTYPE / VB / Vector sum of absolute differences unsigned bytes.
+def HEXAGON_A2_vrsadub:
+  di_MInst_didi                   <"vrsadub", int_hexagon_A2_vrsadub>;
+def HEXAGON_A2_vrsadub_acc:
+  di_MInst_dididi_acc             <"vrsadub", int_hexagon_A2_vrsadub_acc>;
+
+/********************************************************************
+*            MTYPE/VH                                               *
+*********************************************************************/
+
+// MTYPE / VH / Vector dual multiply.
+def HEXAGON_M2_vdmpys_s1:
+  di_MInst_didi_s1_sat            <"vdmpy",   int_hexagon_M2_vdmpys_s1>;
+def HEXAGON_M2_vdmpys_s0:
+  di_MInst_didi_sat               <"vdmpy",   int_hexagon_M2_vdmpys_s0>;
+def HEXAGON_M2_vdmacs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vdmpy",   int_hexagon_M2_vdmacs_s1>;
+def HEXAGON_M2_vdmacs_s0:
+  di_MInst_dididi_acc_sat         <"vdmpy",   int_hexagon_M2_vdmacs_s0>;
+
+// MTYPE / VH / Vector dual multiply with round and pack.
+def HEXAGON_M2_vdmpyrs_s0:
+  si_MInst_didi_rnd_sat           <"vdmpy",   int_hexagon_M2_vdmpyrs_s0>;
+def HEXAGON_M2_vdmpyrs_s1:
+  si_MInst_didi_s1_rnd_sat        <"vdmpy",   int_hexagon_M2_vdmpyrs_s1>;
+
+// MTYPE / VH / Vector multiply even halfwords.
+def HEXAGON_M2_vmpy2es_s1:
+  di_MInst_didi_s1_sat            <"vmpyeh",  int_hexagon_M2_vmpy2es_s1>;
+def HEXAGON_M2_vmpy2es_s0:
+  di_MInst_didi_sat               <"vmpyeh",  int_hexagon_M2_vmpy2es_s0>;
+def HEXAGON_M2_vmac2es:
+  di_MInst_dididi_acc             <"vmpyeh",  int_hexagon_M2_vmac2es>;
+def HEXAGON_M2_vmac2es_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpyeh",  int_hexagon_M2_vmac2es_s1>;
+def HEXAGON_M2_vmac2es_s0:
+  di_MInst_dididi_acc_sat         <"vmpyeh",  int_hexagon_M2_vmac2es_s0>;
+
+// MTYPE / VH / Vector multiply halfwords.
+def HEXAGON_M2_vmpy2s_s0:
+  di_MInst_sisi_sat               <"vmpyh",   int_hexagon_M2_vmpy2s_s0>;
+def HEXAGON_M2_vmpy2s_s1:
+  di_MInst_sisi_s1_sat            <"vmpyh",   int_hexagon_M2_vmpy2s_s1>;
+def HEXAGON_M2_vmac2:
+  di_MInst_disisi_acc             <"vmpyh",   int_hexagon_M2_vmac2>;
+def HEXAGON_M2_vmac2s_s0:
+  di_MInst_disisi_acc_sat         <"vmpyh",   int_hexagon_M2_vmac2s_s0>;
+def HEXAGON_M2_vmac2s_s1:
+  di_MInst_disisi_acc_s1_sat      <"vmpyh",   int_hexagon_M2_vmac2s_s1>;
+
+// MTYPE / VH / Vector multiply halfwords with round and pack.
+def HEXAGON_M2_vmpy2s_s0pack:
+  si_MInst_sisi_rnd_sat           <"vmpyh",   int_hexagon_M2_vmpy2s_s0pack>;
+def HEXAGON_M2_vmpy2s_s1pack:
+  si_MInst_sisi_s1_rnd_sat        <"vmpyh",   int_hexagon_M2_vmpy2s_s1pack>;
+
+// MTYPE / VH / Vector reduce multiply halfwords.
+// Rxx32+=vrmpyh(Rss32,Rtt32)
+def HEXAGON_M2_vrmpy_s0:
+  di_MInst_didi                   <"vrmpyh",  int_hexagon_M2_vrmpy_s0>;
+def HEXAGON_M2_vrmac_s0:
+  di_MInst_dididi_acc             <"vrmpyh",  int_hexagon_M2_vrmac_s0>;
+
+
+/********************************************************************
+*            STYPE/ALU                                              *
+*********************************************************************/
+
+// STYPE / ALU / Absolute value.
+def HEXAGON_A2_abs:
+  si_SInst_si                     <"abs",     int_hexagon_A2_abs>;
+def HEXAGON_A2_absp:
+  di_SInst_di                     <"abs",     int_hexagon_A2_absp>;
+def HEXAGON_A2_abssat:
+  si_SInst_si_sat                 <"abs",     int_hexagon_A2_abssat>;
+
+// STYPE / ALU / Negate.
+def HEXAGON_A2_negp:
+  di_SInst_di                     <"neg",     int_hexagon_A2_negp>;
+def HEXAGON_A2_negsat:
+  si_SInst_si_sat                 <"neg",     int_hexagon_A2_negsat>;
+
+// STYPE / ALU / Logical Not.
+def HEXAGON_A2_notp:
+  di_SInst_di                     <"not",     int_hexagon_A2_notp>;
+
+// STYPE / ALU / Sign extend word to doubleword.
+def HEXAGON_A2_sxtw:
+  di_SInst_si                     <"sxtw",     int_hexagon_A2_sxtw>;
+
+
+/********************************************************************
+*            STYPE/BIT                                              *
+*********************************************************************/
+
+// STYPE / BIT / Count leading.
+def HEXAGON_S2_cl0:
+  si_SInst_si                     <"cl0",     int_hexagon_S2_cl0>;
+def HEXAGON_S2_cl0p:
+  si_SInst_di                     <"cl0",     int_hexagon_S2_cl0p>;
+def HEXAGON_S2_cl1:
+  si_SInst_si                     <"cl1",     int_hexagon_S2_cl1>;
+def HEXAGON_S2_cl1p:
+  si_SInst_di                     <"cl1",     int_hexagon_S2_cl1p>;
+def HEXAGON_S2_clb:
+  si_SInst_si                     <"clb",     int_hexagon_S2_clb>;
+def HEXAGON_S2_clbp:
+  si_SInst_di                     <"clb",     int_hexagon_S2_clbp>;
+def HEXAGON_S2_clbnorm:
+  si_SInst_si                     <"normamt", int_hexagon_S2_clbnorm>;
+
+// STYPE / BIT / Count trailing.
+def HEXAGON_S2_ct0:
+  si_SInst_si                     <"ct0",     int_hexagon_S2_ct0>;
+def HEXAGON_S2_ct1:
+  si_SInst_si                     <"ct1",     int_hexagon_S2_ct1>;
+
+// STYPE / BIT / Compare bit mask.
+def Hexagon_C2_bitsclr:
+  qi_SInst_sisi                   <"bitsclr", int_hexagon_C2_bitsclr>;
+def Hexagon_C2_bitsclri:
+  qi_SInst_siu6                   <"bitsclr", int_hexagon_C2_bitsclri>;
+def Hexagon_C2_bitsset:
+  qi_SInst_sisi                   <"bitsset", int_hexagon_C2_bitsset>;
+
+// STYPE / BIT / Extract unsigned.
+// Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm])
+def HEXAGON_S2_extractu:
+  si_SInst_siu5u5                 <"extractu",int_hexagon_S2_extractu>;
+def HEXAGON_S2_extractu_rp:
+  si_SInst_sidi                   <"extractu",int_hexagon_S2_extractu_rp>;
+def HEXAGON_S2_extractup:
+  di_SInst_diu6u6                 <"extractu",int_hexagon_S2_extractup>;
+def HEXAGON_S2_extractup_rp:
+  di_SInst_didi                   <"extractu",int_hexagon_S2_extractup_rp>;
+
+// STYPE / BIT / Insert bitfield.
+def Hexagon_S2_insert:
+  si_SInst_sisiu5u5               <"insert",  int_hexagon_S2_insert>;
+def Hexagon_S2_insert_rp:
+  si_SInst_sisidi                 <"insert",  int_hexagon_S2_insert_rp>;
+def Hexagon_S2_insertp:
+  di_SInst_didiu6u6               <"insert",  int_hexagon_S2_insertp>;
+def Hexagon_S2_insertp_rp:
+  di_SInst_dididi                 <"insert",  int_hexagon_S2_insertp_rp>;
+
+// STYPE / BIT / Innterleave/deinterleave.
+def Hexagon_S2_interleave:
+  di_SInst_di                     <"interleave", int_hexagon_S2_interleave>;
+def Hexagon_S2_deinterleave:
+  di_SInst_di                     <"deinterleave", int_hexagon_S2_deinterleave>;
+
+// STYPE / BIT / Linear feedback-shift Iteration.
+def Hexagon_S2_lfsp:
+  di_SInst_didi                   <"lfs",     int_hexagon_S2_lfsp>;
+
+// STYPE / BIT / Bit reverse.
+def Hexagon_S2_brev:
+  si_SInst_si                     <"brev",    int_hexagon_S2_brev>;
+
+// STYPE / BIT / Set/Clear/Toggle Bit.
+def HEXAGON_S2_setbit_i:
+  si_SInst_siu5                   <"setbit",  int_hexagon_S2_setbit_i>;
+def HEXAGON_S2_togglebit_i:
+  si_SInst_siu5                   <"togglebit", int_hexagon_S2_togglebit_i>;
+def HEXAGON_S2_clrbit_i:
+  si_SInst_siu5                   <"clrbit",  int_hexagon_S2_clrbit_i>;
+def HEXAGON_S2_setbit_r:
+  si_SInst_sisi                   <"setbit",  int_hexagon_S2_setbit_r>;
+def HEXAGON_S2_togglebit_r:
+  si_SInst_sisi                   <"togglebit", int_hexagon_S2_togglebit_r>;
+def HEXAGON_S2_clrbit_r:
+  si_SInst_sisi                   <"clrbit",  int_hexagon_S2_clrbit_r>;
+
+// STYPE / BIT / Test Bit.
+def HEXAGON_S2_tstbit_i:
+  qi_SInst_siu5                   <"tstbit",  int_hexagon_S2_tstbit_i>;
+def HEXAGON_S2_tstbit_r:
+  qi_SInst_sisi                   <"tstbit",  int_hexagon_S2_tstbit_r>;
+
+
+/********************************************************************
+*            STYPE/COMPLEX                                          *
+*********************************************************************/
+
+// STYPE / COMPLEX / Vector Complex conjugate.
+def HEXAGON_A2_vconj:
+  di_SInst_di_sat                 <"vconj",   int_hexagon_A2_vconj>;
+
+// STYPE / COMPLEX / Vector Complex rotate.
+def HEXAGON_S2_vcrotate:
+  di_SInst_disi                   <"vcrotate",int_hexagon_S2_vcrotate>;
+
+
+/********************************************************************
+*            STYPE/PERM                                             *
+*********************************************************************/
+
+// STYPE / PERM / Saturate.
+def HEXAGON_A2_sat:
+  si_SInst_di                     <"sat",     int_hexagon_A2_sat>;
+def HEXAGON_A2_satb:
+  si_SInst_si                     <"satb",    int_hexagon_A2_satb>;
+def HEXAGON_A2_sath:
+  si_SInst_si                     <"sath",    int_hexagon_A2_sath>;
+def HEXAGON_A2_satub:
+  si_SInst_si                     <"satub",   int_hexagon_A2_satub>;
+def HEXAGON_A2_satuh:
+  si_SInst_si                     <"satuh",   int_hexagon_A2_satuh>;
+
+// STYPE / PERM / Swizzle bytes.
+def HEXAGON_A2_swiz:
+  si_SInst_si                     <"swiz",    int_hexagon_A2_swiz>;
+
+// STYPE / PERM / Vector align.
+// Need custom lowering
+def HEXAGON_S2_valignib:
+  di_SInst_didiu3                 <"valignb", int_hexagon_S2_valignib>;
+def HEXAGON_S2_valignrb:
+  di_SInst_didiqi                 <"valignb", int_hexagon_S2_valignrb>;
+
+// STYPE / PERM / Vector round and pack.
+def HEXAGON_S2_vrndpackwh:
+  si_SInst_di                     <"vrndwh",  int_hexagon_S2_vrndpackwh>;
+def HEXAGON_S2_vrndpackwhs:
+  si_SInst_di_sat                 <"vrndwh",  int_hexagon_S2_vrndpackwhs>;
+
+// STYPE / PERM / Vector saturate and pack.
+def HEXAGON_S2_svsathb:
+  si_SInst_si                     <"vsathb",  int_hexagon_S2_svsathb>;
+def HEXAGON_S2_vsathb:
+  si_SInst_di                     <"vsathb",  int_hexagon_S2_vsathb>;
+def HEXAGON_S2_svsathub:
+  si_SInst_si                     <"vsathub", int_hexagon_S2_svsathub>;
+def HEXAGON_S2_vsathub:
+  si_SInst_di                     <"vsathub", int_hexagon_S2_vsathub>;
+def HEXAGON_S2_vsatwh:
+  si_SInst_di                     <"vsatwh",  int_hexagon_S2_vsatwh>;
+def HEXAGON_S2_vsatwuh:
+  si_SInst_di                     <"vsatwuh", int_hexagon_S2_vsatwuh>;
+
+// STYPE / PERM / Vector saturate without pack.
+def HEXAGON_S2_vsathb_nopack:
+  di_SInst_di                     <"vsathb",  int_hexagon_S2_vsathb_nopack>;
+def HEXAGON_S2_vsathub_nopack:
+  di_SInst_di                     <"vsathub", int_hexagon_S2_vsathub_nopack>;
+def HEXAGON_S2_vsatwh_nopack:
+  di_SInst_di                     <"vsatwh",  int_hexagon_S2_vsatwh_nopack>;
+def HEXAGON_S2_vsatwuh_nopack:
+  di_SInst_di                     <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>;
+
+// STYPE / PERM / Vector shuffle.
+def HEXAGON_S2_shuffeb:
+  di_SInst_didi                   <"shuffeb", int_hexagon_S2_shuffeb>;
+def HEXAGON_S2_shuffeh:
+  di_SInst_didi                   <"shuffeh", int_hexagon_S2_shuffeh>;
+def HEXAGON_S2_shuffob:
+  di_SInst_didi                   <"shuffob", int_hexagon_S2_shuffob>;
+def HEXAGON_S2_shuffoh:
+  di_SInst_didi                   <"shuffoh", int_hexagon_S2_shuffoh>;
+
+// STYPE / PERM / Vector splat bytes.
+def HEXAGON_S2_vsplatrb:
+  si_SInst_si                     <"vsplatb", int_hexagon_S2_vsplatrb>;
+
+// STYPE / PERM / Vector splat halfwords.
+def HEXAGON_S2_vsplatrh:
+  di_SInst_si                     <"vsplath", int_hexagon_S2_vsplatrh>;
+
+// STYPE / PERM / Vector splice.
+def Hexagon_S2_vsplicerb:
+  di_SInst_didiqi                 <"vspliceb",int_hexagon_S2_vsplicerb>;
+def Hexagon_S2_vspliceib:
+  di_SInst_didiu3                 <"vspliceb",int_hexagon_S2_vspliceib>;
+
+// STYPE / PERM / Sign extend.
+def HEXAGON_S2_vsxtbh:
+  di_SInst_si                     <"vsxtbh",  int_hexagon_S2_vsxtbh>;
+def HEXAGON_S2_vsxthw:
+  di_SInst_si                     <"vsxthw",  int_hexagon_S2_vsxthw>;
+
+// STYPE / PERM / Truncate.
+def HEXAGON_S2_vtrunehb:
+  si_SInst_di                     <"vtrunehb",int_hexagon_S2_vtrunehb>;
+def HEXAGON_S2_vtrunohb:
+  si_SInst_di                     <"vtrunohb",int_hexagon_S2_vtrunohb>;
+def HEXAGON_S2_vtrunewh:
+  di_SInst_didi                   <"vtrunewh",int_hexagon_S2_vtrunewh>;
+def HEXAGON_S2_vtrunowh:
+  di_SInst_didi                   <"vtrunowh",int_hexagon_S2_vtrunowh>;
+
+// STYPE / PERM / Zero extend.
+def HEXAGON_S2_vzxtbh:
+  di_SInst_si                     <"vzxtbh",  int_hexagon_S2_vzxtbh>;
+def HEXAGON_S2_vzxthw:
+  di_SInst_si                     <"vzxthw",  int_hexagon_S2_vzxthw>;
+
+
+/********************************************************************
+*            STYPE/PRED                                             *
+*********************************************************************/
+
+// STYPE / PRED / Mask generate from predicate.
+def HEXAGON_C2_mask:
+  di_SInst_qi                     <"mask",   int_hexagon_C2_mask>;
+
+// STYPE / PRED / Predicate transfer.
+def HEXAGON_C2_tfrpr:
+  si_SInst_qi                     <"",       int_hexagon_C2_tfrpr>;
+def HEXAGON_C2_tfrrp:
+  qi_SInst_si                     <"",       int_hexagon_C2_tfrrp>;
+
+// STYPE / PRED / Viterbi pack even and odd predicate bits.
+def HEXAGON_C2_vitpack:
+  si_SInst_qiqi                   <"vitpack",int_hexagon_C2_vitpack>;
+
+
+/********************************************************************
+*            STYPE/SHIFT                                            *
+*********************************************************************/
+
+// STYPE / SHIFT / Shift by immediate.
+def HEXAGON_S2_asl_i_r:
+  si_SInst_siu5                   <"asl",     int_hexagon_S2_asl_i_r>;
+def HEXAGON_S2_asr_i_r:
+  si_SInst_siu5                   <"asr",     int_hexagon_S2_asr_i_r>;
+def HEXAGON_S2_lsr_i_r:
+  si_SInst_siu5                   <"lsr",     int_hexagon_S2_lsr_i_r>;
+def HEXAGON_S2_asl_i_p:
+  di_SInst_diu6                   <"asl",     int_hexagon_S2_asl_i_p>;
+def HEXAGON_S2_asr_i_p:
+  di_SInst_diu6                   <"asr",     int_hexagon_S2_asr_i_p>;
+def HEXAGON_S2_lsr_i_p:
+  di_SInst_diu6                   <"lsr",     int_hexagon_S2_lsr_i_p>;
+
+// STYPE / SHIFT / Shift by immediate and accumulate.
+def HEXAGON_S2_asl_i_r_acc:
+  si_SInst_sisiu5_acc             <"asl",     int_hexagon_S2_asl_i_r_acc>;
+def HEXAGON_S2_asr_i_r_acc:
+  si_SInst_sisiu5_acc             <"asr",     int_hexagon_S2_asr_i_r_acc>;
+def HEXAGON_S2_lsr_i_r_acc:
+  si_SInst_sisiu5_acc             <"lsr",     int_hexagon_S2_lsr_i_r_acc>;
+def HEXAGON_S2_asl_i_r_nac:
+  si_SInst_sisiu5_nac             <"asl",     int_hexagon_S2_asl_i_r_nac>;
+def HEXAGON_S2_asr_i_r_nac:
+  si_SInst_sisiu5_nac             <"asr",     int_hexagon_S2_asr_i_r_nac>;
+def HEXAGON_S2_lsr_i_r_nac:
+  si_SInst_sisiu5_nac             <"lsr",     int_hexagon_S2_lsr_i_r_nac>;
+def HEXAGON_S2_asl_i_p_acc:
+  di_SInst_didiu6_acc             <"asl",     int_hexagon_S2_asl_i_p_acc>;
+def HEXAGON_S2_asr_i_p_acc:
+  di_SInst_didiu6_acc             <"asr",     int_hexagon_S2_asr_i_p_acc>;
+def HEXAGON_S2_lsr_i_p_acc:
+  di_SInst_didiu6_acc             <"lsr",     int_hexagon_S2_lsr_i_p_acc>;
+def HEXAGON_S2_asl_i_p_nac:
+  di_SInst_didiu6_nac             <"asl",     int_hexagon_S2_asl_i_p_nac>;
+def HEXAGON_S2_asr_i_p_nac:
+  di_SInst_didiu6_nac             <"asr",     int_hexagon_S2_asr_i_p_nac>;
+def HEXAGON_S2_lsr_i_p_nac:
+  di_SInst_didiu6_nac             <"lsr",     int_hexagon_S2_lsr_i_p_nac>;
+
+// STYPE / SHIFT / Shift by immediate and add.
+def HEXAGON_S2_addasl_rrri:
+  si_SInst_sisiu3                 <"addasl",  int_hexagon_S2_addasl_rrri>;
+
+// STYPE / SHIFT / Shift by immediate and logical.
+def HEXAGON_S2_asl_i_r_and:
+  si_SInst_sisiu5_and             <"asl",     int_hexagon_S2_asl_i_r_and>;
+def HEXAGON_S2_asr_i_r_and:
+  si_SInst_sisiu5_and             <"asr",     int_hexagon_S2_asr_i_r_and>;
+def HEXAGON_S2_lsr_i_r_and:
+  si_SInst_sisiu5_and             <"lsr",     int_hexagon_S2_lsr_i_r_and>;
+
+def HEXAGON_S2_asl_i_r_xacc:
+  si_SInst_sisiu5_xor             <"asl",     int_hexagon_S2_asl_i_r_xacc>;
+def HEXAGON_S2_lsr_i_r_xacc:
+  si_SInst_sisiu5_xor             <"lsr",     int_hexagon_S2_lsr_i_r_xacc>;
+
+def HEXAGON_S2_asl_i_r_or:
+  si_SInst_sisiu5_or              <"asl",     int_hexagon_S2_asl_i_r_or>;
+def HEXAGON_S2_asr_i_r_or:
+  si_SInst_sisiu5_or              <"asr",     int_hexagon_S2_asr_i_r_or>;
+def HEXAGON_S2_lsr_i_r_or:
+  si_SInst_sisiu5_or              <"lsr",     int_hexagon_S2_lsr_i_r_or>;
+
+def HEXAGON_S2_asl_i_p_and:
+  di_SInst_didiu6_and             <"asl",     int_hexagon_S2_asl_i_p_and>;
+def HEXAGON_S2_asr_i_p_and:
+  di_SInst_didiu6_and             <"asr",     int_hexagon_S2_asr_i_p_and>;
+def HEXAGON_S2_lsr_i_p_and:
+  di_SInst_didiu6_and             <"lsr",     int_hexagon_S2_lsr_i_p_and>;
+
+def HEXAGON_S2_asl_i_p_xacc:
+  di_SInst_didiu6_xor             <"asl",     int_hexagon_S2_asl_i_p_xacc>;
+def HEXAGON_S2_lsr_i_p_xacc:
+  di_SInst_didiu6_xor             <"lsr",     int_hexagon_S2_lsr_i_p_xacc>;
+
+def HEXAGON_S2_asl_i_p_or:
+  di_SInst_didiu6_or              <"asl",     int_hexagon_S2_asl_i_p_or>;
+def HEXAGON_S2_asr_i_p_or:
+  di_SInst_didiu6_or              <"asr",     int_hexagon_S2_asr_i_p_or>;
+def HEXAGON_S2_lsr_i_p_or:
+  di_SInst_didiu6_or              <"lsr",     int_hexagon_S2_lsr_i_p_or>;
+
+// STYPE / SHIFT / Shift right by immediate with rounding.
+def HEXAGON_S2_asr_i_r_rnd:
+  si_SInst_siu5_rnd               <"asr",     int_hexagon_S2_asr_i_r_rnd>;
+def HEXAGON_S2_asr_i_r_rnd_goodsyntax:
+  si_SInst_siu5              <"asrrnd",  int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
+
+// STYPE / SHIFT / Shift left by immediate with saturation.
+def HEXAGON_S2_asl_i_r_sat:
+  si_SInst_sisi_sat               <"asl",     int_hexagon_S2_asl_i_r_sat>;
+
+// STYPE / SHIFT / Shift by register.
+def HEXAGON_S2_asl_r_r:
+  si_SInst_sisi                   <"asl",     int_hexagon_S2_asl_r_r>;
+def HEXAGON_S2_asr_r_r:
+  si_SInst_sisi                   <"asr",     int_hexagon_S2_asr_r_r>;
+def HEXAGON_S2_lsl_r_r:
+  si_SInst_sisi                   <"lsl",     int_hexagon_S2_lsl_r_r>;
+def HEXAGON_S2_lsr_r_r:
+  si_SInst_sisi                   <"lsr",     int_hexagon_S2_lsr_r_r>;
+def HEXAGON_S2_asl_r_p:
+  di_SInst_disi                   <"asl",     int_hexagon_S2_asl_r_p>;
+def HEXAGON_S2_asr_r_p:
+  di_SInst_disi                   <"asr",     int_hexagon_S2_asr_r_p>;
+def HEXAGON_S2_lsl_r_p:
+  di_SInst_disi                   <"lsl",     int_hexagon_S2_lsl_r_p>;
+def HEXAGON_S2_lsr_r_p:
+  di_SInst_disi                   <"lsr",     int_hexagon_S2_lsr_r_p>;
+
+// STYPE / SHIFT / Shift by register and accumulate.
+def HEXAGON_S2_asl_r_r_acc:
+  si_SInst_sisisi_acc             <"asl",     int_hexagon_S2_asl_r_r_acc>;
+def HEXAGON_S2_asr_r_r_acc:
+  si_SInst_sisisi_acc             <"asr",     int_hexagon_S2_asr_r_r_acc>;
+def HEXAGON_S2_lsl_r_r_acc:
+  si_SInst_sisisi_acc             <"lsl",     int_hexagon_S2_lsl_r_r_acc>;
+def HEXAGON_S2_lsr_r_r_acc:
+  si_SInst_sisisi_acc             <"lsr",     int_hexagon_S2_lsr_r_r_acc>;
+def HEXAGON_S2_asl_r_p_acc:
+  di_SInst_didisi_acc             <"asl",     int_hexagon_S2_asl_r_p_acc>;
+def HEXAGON_S2_asr_r_p_acc:
+  di_SInst_didisi_acc             <"asr",     int_hexagon_S2_asr_r_p_acc>;
+def HEXAGON_S2_lsl_r_p_acc:
+  di_SInst_didisi_acc             <"lsl",     int_hexagon_S2_lsl_r_p_acc>;
+def HEXAGON_S2_lsr_r_p_acc:
+  di_SInst_didisi_acc             <"lsr",     int_hexagon_S2_lsr_r_p_acc>;
+
+def HEXAGON_S2_asl_r_r_nac:
+  si_SInst_sisisi_nac             <"asl",     int_hexagon_S2_asl_r_r_nac>;
+def HEXAGON_S2_asr_r_r_nac:
+  si_SInst_sisisi_nac             <"asr",     int_hexagon_S2_asr_r_r_nac>;
+def HEXAGON_S2_lsl_r_r_nac:
+  si_SInst_sisisi_nac             <"lsl",     int_hexagon_S2_lsl_r_r_nac>;
+def HEXAGON_S2_lsr_r_r_nac:
+  si_SInst_sisisi_nac             <"lsr",     int_hexagon_S2_lsr_r_r_nac>;
+def HEXAGON_S2_asl_r_p_nac:
+  di_SInst_didisi_nac             <"asl",     int_hexagon_S2_asl_r_p_nac>;
+def HEXAGON_S2_asr_r_p_nac:
+  di_SInst_didisi_nac             <"asr",     int_hexagon_S2_asr_r_p_nac>;
+def HEXAGON_S2_lsl_r_p_nac:
+  di_SInst_didisi_nac             <"lsl",     int_hexagon_S2_lsl_r_p_nac>;
+def HEXAGON_S2_lsr_r_p_nac:
+  di_SInst_didisi_nac             <"lsr",     int_hexagon_S2_lsr_r_p_nac>;
+
+// STYPE / SHIFT / Shift by register and logical.
+def HEXAGON_S2_asl_r_r_and:
+  si_SInst_sisisi_and             <"asl",     int_hexagon_S2_asl_r_r_and>;
+def HEXAGON_S2_asr_r_r_and:
+  si_SInst_sisisi_and             <"asr",     int_hexagon_S2_asr_r_r_and>;
+def HEXAGON_S2_lsl_r_r_and:
+  si_SInst_sisisi_and             <"lsl",     int_hexagon_S2_lsl_r_r_and>;
+def HEXAGON_S2_lsr_r_r_and:
+  si_SInst_sisisi_and             <"lsr",     int_hexagon_S2_lsr_r_r_and>;
+
+def HEXAGON_S2_asl_r_r_or:
+  si_SInst_sisisi_or              <"asl",     int_hexagon_S2_asl_r_r_or>;
+def HEXAGON_S2_asr_r_r_or:
+  si_SInst_sisisi_or              <"asr",     int_hexagon_S2_asr_r_r_or>;
+def HEXAGON_S2_lsl_r_r_or:
+  si_SInst_sisisi_or              <"lsl",     int_hexagon_S2_lsl_r_r_or>;
+def HEXAGON_S2_lsr_r_r_or:
+  si_SInst_sisisi_or              <"lsr",     int_hexagon_S2_lsr_r_r_or>;
+
+def HEXAGON_S2_asl_r_p_and:
+  di_SInst_didisi_and             <"asl",     int_hexagon_S2_asl_r_p_and>;
+def HEXAGON_S2_asr_r_p_and:
+  di_SInst_didisi_and             <"asr",     int_hexagon_S2_asr_r_p_and>;
+def HEXAGON_S2_lsl_r_p_and:
+  di_SInst_didisi_and             <"lsl",     int_hexagon_S2_lsl_r_p_and>;
+def HEXAGON_S2_lsr_r_p_and:
+  di_SInst_didisi_and             <"lsr",     int_hexagon_S2_lsr_r_p_and>;
+
+def HEXAGON_S2_asl_r_p_or:
+  di_SInst_didisi_or              <"asl",     int_hexagon_S2_asl_r_p_or>;
+def HEXAGON_S2_asr_r_p_or:
+  di_SInst_didisi_or              <"asr",     int_hexagon_S2_asr_r_p_or>;
+def HEXAGON_S2_lsl_r_p_or:
+  di_SInst_didisi_or              <"lsl",     int_hexagon_S2_lsl_r_p_or>;
+def HEXAGON_S2_lsr_r_p_or:
+  di_SInst_didisi_or              <"lsr",     int_hexagon_S2_lsr_r_p_or>;
+
+// STYPE / SHIFT / Shift by register with saturation.
+def HEXAGON_S2_asl_r_r_sat:
+  si_SInst_sisi_sat               <"asl",     int_hexagon_S2_asl_r_r_sat>;
+def HEXAGON_S2_asr_r_r_sat:
+  si_SInst_sisi_sat               <"asr",     int_hexagon_S2_asr_r_r_sat>;
+
+// STYPE / SHIFT / Table Index.
+def Hexagon_S2_tableidxb_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>;
+def Hexagon_S2_tableidxd_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>;
+def Hexagon_S2_tableidxh_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>;
+def Hexagon_S2_tableidxw_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>;
+
+
+/********************************************************************
+*            STYPE/VH                                               *
+*********************************************************************/
+
+// STYPE / VH / Vector absolute value halfwords.
+// Rdd64=vabsh(Rss64)
+def HEXAGON_A2_vabsh:
+  di_SInst_di                     <"vabsh",   int_hexagon_A2_vabsh>;
+def HEXAGON_A2_vabshsat:
+  di_SInst_di_sat                 <"vabsh",   int_hexagon_A2_vabshsat>;
+
+// STYPE / VH / Vector shift halfwords by immediate.
+// Rdd64=v[asl/asr/lsr]h(Rss64,Rt32)
+def HEXAGON_S2_asl_i_vh:
+  di_SInst_disi                   <"vaslh",   int_hexagon_S2_asl_i_vh>;
+def HEXAGON_S2_asr_i_vh:
+  di_SInst_disi                   <"vasrh",   int_hexagon_S2_asr_i_vh>;
+def HEXAGON_S2_lsr_i_vh:
+  di_SInst_disi                   <"vlsrh",   int_hexagon_S2_lsr_i_vh>;
+
+// STYPE / VH / Vector shift halfwords by register.
+// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32)
+def HEXAGON_S2_asl_r_vh:
+  di_SInst_disi                   <"vaslh",   int_hexagon_S2_asl_r_vh>;
+def HEXAGON_S2_asr_r_vh:
+  di_SInst_disi                   <"vasrh",   int_hexagon_S2_asr_r_vh>;
+def HEXAGON_S2_lsl_r_vh:
+  di_SInst_disi                   <"vlslh",   int_hexagon_S2_lsl_r_vh>;
+def HEXAGON_S2_lsr_r_vh:
+  di_SInst_disi                   <"vlsrh",   int_hexagon_S2_lsr_r_vh>;
+
+
+/********************************************************************
+*            STYPE/VW                                               *
+*********************************************************************/
+
+// STYPE / VW / Vector absolute value words.
+def HEXAGON_A2_vabsw:
+  di_SInst_di                     <"vabsw",   int_hexagon_A2_vabsw>;
+def HEXAGON_A2_vabswsat:
+  di_SInst_di_sat                 <"vabsw",   int_hexagon_A2_vabswsat>;
+
+// STYPE / VW / Vector shift words by immediate.
+// Rdd64=v[asl/vsl]w(Rss64,Rt32)
+def HEXAGON_S2_asl_i_vw:
+  di_SInst_disi                   <"vaslw",   int_hexagon_S2_asl_i_vw>;
+def HEXAGON_S2_asr_i_vw:
+  di_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_i_vw>;
+def HEXAGON_S2_lsr_i_vw:
+  di_SInst_disi                   <"vlsrw",   int_hexagon_S2_lsr_i_vw>;
+
+// STYPE / VW / Vector shift words by register.
+// Rdd64=v[asl/vsl]w(Rss64,Rt32)
+def HEXAGON_S2_asl_r_vw:
+  di_SInst_disi                   <"vaslw",   int_hexagon_S2_asl_r_vw>;
+def HEXAGON_S2_asr_r_vw:
+  di_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_r_vw>;
+def HEXAGON_S2_lsl_r_vw:
+  di_SInst_disi                   <"vlslw",   int_hexagon_S2_lsl_r_vw>;
+def HEXAGON_S2_lsr_r_vw:
+  di_SInst_disi                   <"vlsrw",   int_hexagon_S2_lsr_r_vw>;
+
+// STYPE / VW / Vector shift words with truncate and pack.
+def HEXAGON_S2_asr_r_svw_trun:
+  si_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_r_svw_trun>;
+def HEXAGON_S2_asr_i_svw_trun:
+  si_SInst_diu5                   <"vasrw",   int_hexagon_S2_asr_i_svw_trun>;
+
+// LD / Circular loads.
+def HEXAGON_circ_ldd:
+  di_LDInstPI_diu4                <"circ_ldd", int_hexagon_circ_ldd>;
+
+include "HexagonIntrinsicsV3.td"
+include "HexagonIntrinsicsV4.td"
+include "HexagonIntrinsicsV5.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
new file mode 100644
index 0000000..2788101
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
@@ -0,0 +1,39 @@
+//===-- HexagonIntrinsicsDerived.td - Derived intrinsics ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Multiply 64-bit and use lower result
+//
+// Optimized with intrinisics accumulates
+//
+def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
+      (i64
+       (COMBINE_rr
+        (HEXAGON_M2_maci
+         (HEXAGON_M2_maci
+          (i32
+           (EXTRACT_SUBREG
+            (i64
+             (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                          subreg_loreg)),
+                     (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                          subreg_loreg)))),
+            subreg_hireg)),
+          (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+          (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))),
+         (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)),
+         (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg))),
+        (i32
+         (EXTRACT_SUBREG
+          (i64
+           (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+                   (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                        subreg_loreg)))), subreg_loreg))))>;
+
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
new file mode 100644
index 0000000..2a54e62
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
@@ -0,0 +1,50 @@
+//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+
+// MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
+def Hexagon_M2_vrcmpys_s1:
+  di_MInst_disi_s1_sat            <"vrcmpys",  int_hexagon_M2_vrcmpys_s1>;
+def Hexagon_M2_vrcmpys_acc_s1:
+  di_MInst_didisi_acc_s1_sat      <"vrcmpys",  int_hexagon_M2_vrcmpys_acc_s1>;
+def Hexagon_M2_vrcmpys_s1rp:
+  si_MInst_disi_s1_rnd_sat        <"vrcmpys",  int_hexagon_M2_vrcmpys_s1rp>;
+
+
+
+
+/********************************************************************
+*            MTYPE/VB                                               *
+*********************************************************************/
+
+// MTYPE / VB / Vector reduce add unsigned bytes.
+def Hexagon_M2_vradduh:
+  si_MInst_didi                   <"vradduh",  int_hexagon_M2_vradduh>;
+
+
+/********************************************************************
+*            ALU64/ALU                                              *
+*********************************************************************/
+
+// ALU64 / ALU / Add.
+def Hexagon_A2_addsp:
+  di_ALU64_sidi                   <"add",      int_hexagon_A2_addsp>;
+def Hexagon_A2_addpsat:
+  di_ALU64_didi                   <"add",      int_hexagon_A2_addpsat>;
+
+def Hexagon_A2_maxp:
+  di_ALU64_didi                   <"max",      int_hexagon_A2_maxp>;
+def Hexagon_A2_maxup:
+  di_ALU64_didi                   <"maxu",     int_hexagon_A2_maxup>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
new file mode 100644
index 0000000..dd28ebb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
@@ -0,0 +1,369 @@
+//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V4 Architecture Extensions
+// Application-Level Specification
+// 80-V9418-12 Rev. A
+// June 15, 2010
+
+
+//
+// ALU 32 types.
+//
+
+class si_ALU32_sisi_not<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_ALU32_s8si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>;
+
+class di_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_neg_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_neg_ALU32_sis10<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_neg_ALU32_siu9<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_neg_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_neg_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+
+//
+// SInst Classes.
+//
+class qi_neg_SInst_qiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_qi_andqiqi_neg<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, and($src2, !$src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class qi_SInst_qi_andqiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, and($src2, $src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class qi_SInst_qi_orqiqi_neg<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, or($src2, !$src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class qi_SInst_qi_orqiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, or($src2, $src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class si_SInst_si_addsis6<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, add($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_SInst_si_subs6si<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, sub(#$src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2,
+                                        IntRegs:$src3))]>;
+
+class di_ALU64_didi_neg<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_MInst_dididi_xacc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst ^= ", !strconcat(opc , "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_and<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst &= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_MInst_sisisi_andn<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst &= ", !strconcat(opc , "($src2, ~$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_SInst_sisis10_andi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s10Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, and($src2, #$src3))")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_MInst_sisisi_xor<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_MInst_sisisi_xorn<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst ^= ", !strconcat(opc , "($src2, ~$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_SInst_sisis10_or<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, s10Imm:$src3),
+             !strconcat("$dst |= ", !strconcat(opc , "($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_MInst_sisisi_or<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst |= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_MInst_sisisi_orn<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst |= ", !strconcat(opc , "($src2, ~$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_SInst_siu5_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+
+/********************************************************************
+*            ALU32/ALU                                              *
+*********************************************************************/
+
+// ALU32 / ALU / Logical Operations.
+def Hexagon_A4_orn  : si_ALU32_sisi_not <"or",  int_hexagon_A4_orn>;
+def Hexagon_A4_andn : si_ALU32_sisi_not <"and", int_hexagon_A4_andn>;
+
+
+/********************************************************************
+*            ALU32/PERM                                             *
+*********************************************************************/
+
+// ALU32 / PERM / Combine Words Into Doublewords.
+def Hexagon_A4_combineir : di_ALU32_s8si  <"combine", int_hexagon_A4_combineir>;
+def Hexagon_A4_combineri : di_ALU32_sis8  <"combine", int_hexagon_A4_combineri>;
+
+
+/********************************************************************
+*            ALU32/PRED                                             *
+*********************************************************************/
+
+// ALU32 / PRED / Conditional Shift Halfword.
+// ALU32 / PRED / Conditional Sign Extend.
+// ALU32 / PRED / Conditional Zero Extend.
+// ALU32 / PRED / Compare.
+def Hexagon_C4_cmpneq  : qi_neg_ALU32_sisi  <"cmp.eq", int_hexagon_C4_cmpneq>;
+def Hexagon_C4_cmpneqi : qi_neg_ALU32_sis10 <"cmp.eq", int_hexagon_C4_cmpneqi>;
+def Hexagon_C4_cmplte  : qi_neg_ALU32_sisi  <"cmp.gt", int_hexagon_C4_cmplte>;
+def Hexagon_C4_cmpltei : qi_neg_ALU32_sis10 <"cmp.gt", int_hexagon_C4_cmpltei>;
+def Hexagon_C4_cmplteu : qi_neg_ALU32_sisi  <"cmp.gtu",int_hexagon_C4_cmplteu>;
+def Hexagon_C4_cmplteui: qi_neg_ALU32_siu9  <"cmp.gtu",int_hexagon_C4_cmplteui>;
+
+// ALU32 / PRED / cmpare To General Register.
+def Hexagon_A4_rcmpneq : si_neg_ALU32_sisi <"cmp.eq", int_hexagon_A4_rcmpneq>;
+def Hexagon_A4_rcmpneqi: si_neg_ALU32_sis8 <"cmp.eq", int_hexagon_A4_rcmpneqi>;
+def Hexagon_A4_rcmpeq  : si_ALU32_sisi     <"cmp.eq", int_hexagon_A4_rcmpeq>;
+def Hexagon_A4_rcmpeqi : si_ALU32_sis8     <"cmp.eq", int_hexagon_A4_rcmpeqi>;
+
+
+/********************************************************************
+*            CR                                                     *
+*********************************************************************/
+
+// CR / Corner Detection Acceleration.
+def Hexagon_C4_fastcorner9:
+  qi_SInst_qiqi<"fastcorner9", int_hexagon_C4_fastcorner9>;
+def Hexagon_C4_fastcorner9_not:
+  qi_neg_SInst_qiqi<"fastcorner9",int_hexagon_C4_fastcorner9_not>;
+
+// CR / Logical Operations On Predicates.
+def Hexagon_C4_and_andn:
+  qi_SInst_qi_andqiqi_neg         <"and",      int_hexagon_C4_and_andn>;
+def Hexagon_C4_and_and:
+  qi_SInst_qi_andqiqi             <"and",      int_hexagon_C4_and_and>;
+def Hexagon_C4_and_orn:
+  qi_SInst_qi_orqiqi_neg          <"and",      int_hexagon_C4_and_orn>;
+def Hexagon_C4_and_or:
+  qi_SInst_qi_orqiqi              <"and",      int_hexagon_C4_and_or>;
+def Hexagon_C4_or_andn:
+  qi_SInst_qi_andqiqi_neg         <"or",       int_hexagon_C4_or_andn>;
+def Hexagon_C4_or_and:
+  qi_SInst_qi_andqiqi             <"or",       int_hexagon_C4_or_and>;
+def Hexagon_C4_or_orn:
+  qi_SInst_qi_orqiqi_neg          <"or",       int_hexagon_C4_or_orn>;
+def Hexagon_C4_or_or:
+  qi_SInst_qi_orqiqi              <"or",       int_hexagon_C4_or_or>;
+
+
+/********************************************************************
+*            XTYPE/ALU                                              *
+*********************************************************************/
+
+// XTYPE / ALU / Add And Accumulate.
+def Hexagon_S4_addaddi:
+  si_SInst_si_addsis6             <"add",      int_hexagon_S4_addaddi>;
+def Hexagon_S4_subaddi:
+  si_SInst_si_subs6si             <"add",      int_hexagon_S4_subaddi>;
+
+// XTYPE / ALU / Logical Doublewords.
+def Hexagon_S4_andnp:
+  di_ALU64_didi_neg               <"and",      int_hexagon_A4_andnp>;
+def Hexagon_S4_ornp:
+  di_ALU64_didi_neg               <"or",       int_hexagon_A4_ornp>;
+
+// XTYPE / ALU / Logical-logical Doublewords.
+def Hexagon_M4_xor_xacc:
+  di_MInst_dididi_xacc            <"xor",      int_hexagon_M4_xor_xacc>;
+
+// XTYPE / ALU / Logical-logical Words.
+def HEXAGON_M4_and_and:
+  si_MInst_sisisi_and             <"and",      int_hexagon_M4_and_and>;
+def HEXAGON_M4_and_or:
+  si_MInst_sisisi_and             <"or",       int_hexagon_M4_and_or>;
+def HEXAGON_M4_and_xor:
+  si_MInst_sisisi_and             <"xor",      int_hexagon_M4_and_xor>;
+def HEXAGON_M4_and_andn:
+  si_MInst_sisisi_andn            <"and",      int_hexagon_M4_and_andn>;
+def HEXAGON_M4_xor_and:
+  si_MInst_sisisi_xor             <"and",      int_hexagon_M4_xor_and>;
+def HEXAGON_M4_xor_or:
+  si_MInst_sisisi_xor             <"or",       int_hexagon_M4_xor_or>;
+def HEXAGON_M4_xor_andn:
+  si_MInst_sisisi_xorn            <"and",      int_hexagon_M4_xor_andn>;
+def HEXAGON_M4_or_and:
+  si_MInst_sisisi_or              <"and",      int_hexagon_M4_or_and>;
+def HEXAGON_M4_or_or:
+  si_MInst_sisisi_or              <"or",       int_hexagon_M4_or_or>;
+def HEXAGON_M4_or_xor:
+  si_MInst_sisisi_or              <"xor",      int_hexagon_M4_or_xor>;
+def HEXAGON_M4_or_andn:
+  si_MInst_sisisi_orn             <"and",      int_hexagon_M4_or_andn>;
+def HEXAGON_S4_or_andix:
+  si_SInst_sisis10_andi           <"or",       int_hexagon_S4_or_andix>;
+def HEXAGON_S4_or_andi:
+  si_SInst_sisis10_or             <"and",      int_hexagon_S4_or_andi>;
+def HEXAGON_S4_or_ori:
+  si_SInst_sisis10_or             <"or",       int_hexagon_S4_or_ori>;
+
+// XTYPE / ALU / Modulo wrap.
+def HEXAGON_A4_modwrapu:
+  si_ALU64_sisi                   <"modwrap",  int_hexagon_A4_modwrapu>;
+
+// XTYPE / ALU / Round.
+def HEXAGON_A4_cround_ri:
+  si_SInst_siu5                   <"cround",   int_hexagon_A4_cround_ri>;
+def HEXAGON_A4_cround_rr:
+  si_SInst_sisi                   <"cround",   int_hexagon_A4_cround_rr>;
+def HEXAGON_A4_round_ri:
+  si_SInst_siu5                   <"round",    int_hexagon_A4_round_ri>;
+def HEXAGON_A4_round_rr:
+  si_SInst_sisi                   <"round",    int_hexagon_A4_round_rr>;
+def HEXAGON_A4_round_ri_sat:
+  si_SInst_siu5_sat               <"round",    int_hexagon_A4_round_ri_sat>;
+def HEXAGON_A4_round_rr_sat:
+  si_SInst_sisi_sat               <"round",    int_hexagon_A4_round_rr_sat>;
+
+// XTYPE / ALU / Vector reduce add unsigned halfwords.
+// XTYPE / ALU / Vector add bytes.
+// XTYPE / ALU / Vector conditional negate.
+// XTYPE / ALU / Vector maximum bytes.
+// XTYPE / ALU / Vector reduce maximum halfwords.
+// XTYPE / ALU / Vector reduce maximum words.
+// XTYPE / ALU / Vector minimum bytes.
+// XTYPE / ALU / Vector reduce minimum halfwords.
+// XTYPE / ALU / Vector reduce minimum words.
+// XTYPE / ALU / Vector subtract bytes.
+
+
+/********************************************************************
+*            XTYPE/BIT                                              *
+*********************************************************************/
+
+// XTYPE / BIT / Count leading.
+// XTYPE / BIT / Count trailing.
+// XTYPE / BIT / Extract bitfield.
+// XTYPE / BIT / Masked parity.
+// XTYPE / BIT / Bit reverse.
+// XTYPE / BIT / Split bitfield.
+
+
+/********************************************************************
+*            XTYPE/COMPLEX                                          *
+*********************************************************************/
+
+// XTYPE / COMPLEX / Complex add/sub halfwords.
+// XTYPE / COMPLEX / Complex add/sub words.
+// XTYPE / COMPLEX / Complex multiply 32x16.
+// XTYPE / COMPLEX / Vector reduce complex rotate.
+
+
+/********************************************************************
+*            XTYPE/MPY                                              *
+*********************************************************************/
+
+// XTYPE / COMPLEX / Complex add/sub halfwords.
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
new file mode 100644
index 0000000..1d44b52
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -0,0 +1,395 @@
+class sf_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class si_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class sf_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class sf_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class sf_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class si_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class df_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class di_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class df_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class df_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class di_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+
+class df_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class sf_MInst_sfsf<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class df_MInst_dfdf<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+           !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+           [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class qi_ALU64_dfdf<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+           !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+           [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class qi_ALU64_dfu5<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs PredRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
+           !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+           [(set PredRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+
+class sf_MInst_sfsfsf_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+
+class sf_MInst_sfsfsfsi_sc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2, IntRegs:$src3),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2, $src3):scale")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2, IntRegs:$src3))],
+               "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_acc_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_nac_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+
+class df_MInst_dfdfdfsi_sc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        DoubleRegs:$src2, IntRegs:$src3),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2, $src3):scale")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        DoubleRegs:$src2, IntRegs:$src3))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_acc_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_nac_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class qi_SInst_sfsf<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_sfu5<string opc, Intrinsic IntID>
+  : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class sf_ALU64_u10_pos<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
+             [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class sf_ALU64_u10_neg<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
+             [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class df_ALU64_u10_pos<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class df_ALU64_u10_neg<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class di_MInst_diu6<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class di_MInst_diu4_rnd<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_MInst_diu4_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd:sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_diu4_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+
+def HEXAGON_C4_fastcorner9:
+    qi_SInst_qiqi       <"fastcorner9", int_hexagon_C4_fastcorner9>;
+def HEXAGON_C4_fastcorner9_not:
+    qi_SInst_qiqi <"!fastcorner9", int_hexagon_C4_fastcorner9_not>;
+def HEXAGON_M5_vrmpybuu:
+    di_MInst_didi <"vrmpybu", int_hexagon_M5_vrmpybuu>;
+def HEXAGON_M5_vrmacbuu:
+    di_MInst_dididi_acc <"vrmpybu", int_hexagon_M5_vrmacbuu>;
+def HEXAGON_M5_vrmpybsu:
+    di_MInst_didi <"vrmpybsu", int_hexagon_M5_vrmpybsu>;
+def HEXAGON_M5_vrmacbsu:
+    di_MInst_dididi_acc <"vrmpybsu", int_hexagon_M5_vrmacbsu>;
+def HEXAGON_M5_vmpybuu:
+    di_MInst_sisi <"vmpybu", int_hexagon_M5_vmpybuu>;
+def HEXAGON_M5_vmpybsu:
+    di_MInst_sisi <"vmpybsu", int_hexagon_M5_vmpybsu>;
+def HEXAGON_M5_vmacbuu:
+    di_MInst_disisi_acc <"vmpybu", int_hexagon_M5_vmacbuu>;
+def HEXAGON_M5_vmacbsu:
+    di_MInst_disisi_acc <"vmpybsu", int_hexagon_M5_vmacbsu>;
+def HEXAGON_M5_vdmpybsu:
+    di_MInst_didi_sat <"vdmpybsu", int_hexagon_M5_vdmpybsu>;
+def HEXAGON_M5_vdmacbsu:
+    di_MInst_dididi_acc_sat <"vdmpybsu", int_hexagon_M5_vdmacbsu>;
+def HEXAGON_A5_vaddhubs:
+    si_SInst_didi_sat <"vaddhub", int_hexagon_A5_vaddhubs>;
+def HEXAGON_S5_popcountp:
+    si_SInst_di <"popcount", int_hexagon_S5_popcountp>;
+def HEXAGON_S5_asrhub_rnd_sat_goodsyntax:
+    si_MInst_diu4_rnd_sat <"vasrhub", int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
+def HEXAGON_S5_asrhub_sat:
+    si_SInst_diu4_sat <"vasrhub", int_hexagon_S5_asrhub_sat>;
+def HEXAGON_S5_vasrhrnd_goodsyntax:
+    di_MInst_diu4_rnd <"vasrh", int_hexagon_S5_vasrhrnd_goodsyntax>;
+def HEXAGON_S2_asr_i_p_rnd:
+    di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p_rnd>;
+def HEXAGON_S2_asr_i_p_rnd_goodsyntax:
+    di_MInst_diu6 <"asrrnd", int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
+def HEXAGON_F2_sfadd:
+    sf_MInst_sfsf <"sfadd", int_hexagon_F2_sfadd>;
+def HEXAGON_F2_sfsub:
+    sf_MInst_sfsf <"sfsub", int_hexagon_F2_sfsub>;
+def HEXAGON_F2_sfmpy:
+    sf_MInst_sfsf <"sfmpy", int_hexagon_F2_sfmpy>;
+def HEXAGON_F2_sffma:
+    sf_MInst_sfsfsf_acc <"sfmpy", int_hexagon_F2_sffma>;
+def HEXAGON_F2_sffma_sc:
+    sf_MInst_sfsfsfsi_sc <"sfmpy", int_hexagon_F2_sffma_sc>;
+def HEXAGON_F2_sffms:
+    sf_MInst_sfsfsf_nac <"sfmpy", int_hexagon_F2_sffms>;
+def HEXAGON_F2_sffma_lib:
+    sf_MInst_sfsfsf_acc_lib <"sfmpy", int_hexagon_F2_sffma_lib>;
+def HEXAGON_F2_sffms_lib:
+    sf_MInst_sfsfsf_nac_lib <"sfmpy", int_hexagon_F2_sffms_lib>;
+def HEXAGON_F2_sfcmpeq:
+    qi_SInst_sfsf <"sfcmp.eq", int_hexagon_F2_sfcmpeq>;
+def HEXAGON_F2_sfcmpgt:
+    qi_SInst_sfsf <"sfcmp.gt", int_hexagon_F2_sfcmpgt>;
+def HEXAGON_F2_sfcmpge:
+    qi_SInst_sfsf <"sfcmp.ge", int_hexagon_F2_sfcmpge>;
+def HEXAGON_F2_sfcmpuo:
+    qi_SInst_sfsf <"sfcmp.uo", int_hexagon_F2_sfcmpuo>;
+def HEXAGON_F2_sfmax:
+    sf_MInst_sfsf <"sfmax", int_hexagon_F2_sfmax>;
+def HEXAGON_F2_sfmin:
+    sf_MInst_sfsf <"sfmin", int_hexagon_F2_sfmin>;
+def HEXAGON_F2_sfclass:
+    qi_SInst_sfu5 <"sfclass", int_hexagon_F2_sfclass>;
+def HEXAGON_F2_sfimm_p:
+    sf_ALU64_u10_pos <"sfmake", int_hexagon_F2_sfimm_p>;
+def HEXAGON_F2_sfimm_n:
+    sf_ALU64_u10_neg <"sfmake", int_hexagon_F2_sfimm_n>;
+def HEXAGON_F2_sffixupn:
+    sf_MInst_sfsf <"sffixupn", int_hexagon_F2_sffixupn>;
+def HEXAGON_F2_sffixupd:
+    sf_MInst_sfsf <"sffixupd", int_hexagon_F2_sffixupd>;
+def HEXAGON_F2_sffixupr:
+    sf_SInst_sf <"sffixupr", int_hexagon_F2_sffixupr>;
+def HEXAGON_F2_dfadd:
+    df_MInst_dfdf <"dfadd", int_hexagon_F2_dfadd>;
+def HEXAGON_F2_dfsub:
+    df_MInst_dfdf <"dfsub", int_hexagon_F2_dfsub>;
+def HEXAGON_F2_dfmpy:
+    df_MInst_dfdf <"dfmpy", int_hexagon_F2_dfmpy>;
+def HEXAGON_F2_dffma:
+    df_MInst_dfdfdf_acc <"dfmpy", int_hexagon_F2_dffma>;
+def HEXAGON_F2_dffms:
+    df_MInst_dfdfdf_nac <"dfmpy", int_hexagon_F2_dffms>;
+def HEXAGON_F2_dffma_lib:
+    df_MInst_dfdfdf_acc_lib <"dfmpy", int_hexagon_F2_dffma_lib>;
+def HEXAGON_F2_dffms_lib:
+    df_MInst_dfdfdf_nac_lib <"dfmpy", int_hexagon_F2_dffms_lib>;
+def HEXAGON_F2_dffma_sc:
+    df_MInst_dfdfdfsi_sc <"dfmpy", int_hexagon_F2_dffma_sc>;
+def HEXAGON_F2_dfmax:
+    df_MInst_dfdf <"dfmax", int_hexagon_F2_dfmax>;
+def HEXAGON_F2_dfmin:
+    df_MInst_dfdf <"dfmin", int_hexagon_F2_dfmin>;
+def HEXAGON_F2_dfcmpeq:
+    qi_ALU64_dfdf <"dfcmp.eq", int_hexagon_F2_dfcmpeq>;
+def HEXAGON_F2_dfcmpgt:
+    qi_ALU64_dfdf <"dfcmp.gt", int_hexagon_F2_dfcmpgt>;
+def HEXAGON_F2_dfcmpge:
+    qi_ALU64_dfdf <"dfcmp.ge", int_hexagon_F2_dfcmpge>;
+def HEXAGON_F2_dfcmpuo:
+    qi_ALU64_dfdf <"dfcmp.uo", int_hexagon_F2_dfcmpuo>;
+def HEXAGON_F2_dfclass:
+    qi_ALU64_dfu5 <"dfclass", int_hexagon_F2_dfclass>;
+def HEXAGON_F2_dfimm_p:
+    df_ALU64_u10_pos <"dfmake", int_hexagon_F2_dfimm_p>;
+def HEXAGON_F2_dfimm_n:
+    df_ALU64_u10_neg <"dfmake", int_hexagon_F2_dfimm_n>;
+def HEXAGON_F2_dffixupn:
+    df_MInst_dfdf <"dffixupn", int_hexagon_F2_dffixupn>;
+def HEXAGON_F2_dffixupd:
+    df_MInst_dfdf <"dffixupd", int_hexagon_F2_dffixupd>;
+def HEXAGON_F2_dffixupr:
+    df_SInst_df <"dffixupr", int_hexagon_F2_dffixupr>;
+def HEXAGON_F2_conv_sf2df:
+    df_SInst_sf <"convert_sf2df", int_hexagon_F2_conv_sf2df>;
+def HEXAGON_F2_conv_df2sf:
+    sf_SInst_df <"convert_df2sf", int_hexagon_F2_conv_df2sf>;
+def HEXAGON_F2_conv_uw2sf:
+    sf_SInst_si <"convert_uw2sf", int_hexagon_F2_conv_uw2sf>;
+def HEXAGON_F2_conv_uw2df:
+    df_SInst_si <"convert_uw2df", int_hexagon_F2_conv_uw2df>;
+def HEXAGON_F2_conv_w2sf:
+    sf_SInst_si <"convert_w2sf", int_hexagon_F2_conv_w2sf>;
+def HEXAGON_F2_conv_w2df:
+    df_SInst_si <"convert_w2df", int_hexagon_F2_conv_w2df>;
+def HEXAGON_F2_conv_ud2sf:
+    sf_SInst_di <"convert_ud2sf", int_hexagon_F2_conv_ud2sf>;
+def HEXAGON_F2_conv_ud2df:
+    df_SInst_di <"convert_ud2df", int_hexagon_F2_conv_ud2df>;
+def HEXAGON_F2_conv_d2sf:
+    sf_SInst_di <"convert_d2sf", int_hexagon_F2_conv_d2sf>;
+def HEXAGON_F2_conv_d2df:
+    df_SInst_di <"convert_d2df", int_hexagon_F2_conv_d2df>;
+def HEXAGON_F2_conv_sf2uw:
+    si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw>;
+def HEXAGON_F2_conv_sf2w:
+    si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w>;
+def HEXAGON_F2_conv_sf2ud:
+    di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud>;
+def HEXAGON_F2_conv_sf2d:
+    di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d>;
+def HEXAGON_F2_conv_df2uw:
+    si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw>;
+def HEXAGON_F2_conv_df2w:
+    si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w>;
+def HEXAGON_F2_conv_df2ud:
+    di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud>;
+def HEXAGON_F2_conv_df2d:
+    di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d>;
+def HEXAGON_F2_conv_sf2uw_chop:
+    si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw_chop>;
+def HEXAGON_F2_conv_sf2w_chop:
+    si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w_chop>;
+def HEXAGON_F2_conv_sf2ud_chop:
+    di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud_chop>;
+def HEXAGON_F2_conv_sf2d_chop:
+    di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d_chop>;
+def HEXAGON_F2_conv_df2uw_chop:
+    si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw_chop>;
+def HEXAGON_F2_conv_df2w_chop:
+    si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w_chop>;
+def HEXAGON_F2_conv_df2ud_chop:
+    di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud_chop>;
+def HEXAGON_F2_conv_df2d_chop:
+    di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d_chop>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
new file mode 100644
index 0000000..5e4346d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -0,0 +1,95 @@
+//===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Hexagon MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol,
+                              HexagonAsmPrinter& Printer) {
+  MCContext &MC = Printer.OutContext;
+  const MCExpr *ME;
+
+  ME = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, MC);
+
+  if (!MO.isJTI() && MO.getOffset())
+    ME = MCBinaryExpr::CreateAdd(ME, MCConstantExpr::Create(MO.getOffset(), MC),
+                                 MC);
+
+  return (MCOperand::CreateExpr(ME));
+}
+
+// Create an MCInst from a MachineInstr
+void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI,
+                            HexagonAsmPrinter& AP) {
+  MCI.setOpcode(MI->getOpcode());
+  MCI.setDesc(MI->getDesc());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCO;
+
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCO = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_FPImmediate: {
+      APFloat Val = MO.getFPImm()->getValueAPF();
+      // FP immediates are used only when setting GPRs, so they may be dealt
+      // with like regular immediates from this point on.
+      MCO = MCOperand::CreateImm(*Val.bitcastToAPInt().getRawData());
+      break;
+    }
+    case MachineOperand::MO_Immediate:
+      MCO = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCO = MCOperand::CreateExpr
+              (MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(),
+               AP.OutContext));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP);
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
+                         AP);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP);
+      break;
+    }
+
+    MCI.addOperand(MCO);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
new file mode 100644
index 0000000..9579c8b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void HexagonMachineFunctionInfo::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
new file mode 100644
index 0000000..d799bdb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -0,0 +1,80 @@
+//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonMACHINEFUNCTIONINFO_H
+#define HexagonMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
+
+namespace llvm {
+
+  namespace Hexagon {
+    const unsigned int StartPacket = 0x1;
+    const unsigned int EndPacket = 0x2;
+  }
+
+
+/// Hexagon target-specific information for each MachineFunction.
+class HexagonMachineFunctionInfo : public MachineFunctionInfo {
+  // SRetReturnReg - Some subtargets require that sret lowering includes
+  // returning the value of the returned struct in a register. This field
+  // holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+  std::vector<MachineInstr*> AllocaAdjustInsts;
+  int VarArgsFrameIndex;
+  bool HasClobberLR;
+  bool HasEHReturn;
+  std::map<const MachineInstr*, unsigned> PacketInfo;
+  virtual void anchor();
+
+public:
+  HexagonMachineFunctionInfo() : SRetReturnReg(0), HasClobberLR(0),
+    HasEHReturn(false) {}
+
+  HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0),
+                                                    HasClobberLR(0),
+                                                    HasEHReturn(false) {}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  void addAllocaAdjustInst(MachineInstr* MI) {
+    AllocaAdjustInsts.push_back(MI);
+  }
+  const std::vector<MachineInstr*>& getAllocaAdjustInsts() {
+    return AllocaAdjustInsts;
+  }
+
+  void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; }
+  int getVarArgsFrameIndex() { return VarArgsFrameIndex; }
+
+  void setStartPacket(MachineInstr* MI) {
+    PacketInfo[MI] |= Hexagon::StartPacket;
+  }
+  void setEndPacket(MachineInstr* MI)   {
+    PacketInfo[MI] |= Hexagon::EndPacket;
+  }
+  bool isStartPacket(const MachineInstr* MI) const {
+    return (PacketInfo.count(MI) &&
+            (PacketInfo.find(MI)->second & Hexagon::StartPacket));
+  }
+  bool isEndPacket(const MachineInstr* MI) const {
+    return (PacketInfo.count(MI) &&
+            (PacketInfo.find(MI)->second & Hexagon::EndPacket));
+  }
+  void setHasClobberLR(bool v) { HasClobberLR = v;  }
+  bool hasClobberLR() const { return HasClobberLR; }
+
+  bool hasEHReturn() const { return HasEHReturn; };
+  void setHasEHReturn(bool H = true) { HasEHReturn = H; };
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 0000000..6fcaa20
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,694 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineScheduler.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+/// Platform-specific modifications to DAG.
+void VLIWMachineScheduler::postprocessDAG() {
+  SUnit* LastSequentialCall = nullptr;
+  // Currently we only catch the situation when compare gets scheduled
+  // before preceding call.
+  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+    // Remember the call.
+    if (SUnits[su].getInstr()->isCall())
+      LastSequentialCall = &(SUnits[su]);
+    // Look for a compare that defines a predicate.
+    else if (SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+      SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+  }
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (statefull), it is more like
+/// another heuristic. Many corner cases are figured
+/// empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+  if (!SU || !SU->getInstr())
+    return false;
+
+  // First see if the pipeline could receive this instruction
+  // in the current cycle.
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    if (!ResourcesModel->canReserveResources(SU->getInstr()))
+      return false;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+
+  // Now see if there are no other dependencies to instructions already
+  // in the packet.
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    if (Packet[i]->Succs.size() == 0)
+      continue;
+    for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+         E = Packet[i]->Succs.end(); I != E; ++I) {
+      // Since we do not add pseudos to packets, might as well
+      // ignore order dependencies.
+      if (I->isCtrl())
+        continue;
+
+      if (I->getSUnit() == SU)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Keep track of available resources.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+  bool startNewCycle = false;
+  // Artificially reset state.
+  if (!SU) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    return false;
+  }
+  // If this SU does not fit in the packet
+  // start a new one.
+  if (!isResourceAvailable(SU)) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    ResourcesModel->reserveResources(SU->getInstr());
+    break;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+  Packet.push_back(SU);
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    DEBUG(dbgs() << "\t[" << i << "] SU(");
+    DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+    DEBUG(Packet[i]->getInstr()->dump());
+  }
+#endif
+
+  // If packet is now full, reset the state so in the next cycle
+  // we start fresh.
+  if (Packet.size() >= SchedModel->getIssueWidth()) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  return startNewCycle;
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+  DEBUG(dbgs()
+        << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+        << " " << BB->getName()
+        << " in_func " << BB->getParent()->getFunction()->getName()
+        << " at loop depth "  << MLI.getLoopDepth(BB)
+        << " \n");
+
+  buildDAGWithRegPressure();
+
+  // Postprocess the DAG to add platform-specific artificial dependencies.
+  postprocessDAG();
+
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+
+  // Initialize the strategy before modifying the DAG.
+  SchedImpl->initialize(this);
+
+  // To view Height/Depth correctly, they should be accessed at least once.
+  //
+  // FIXME: SUnit::dumpAll always recompute depth and height now. The max
+  // depth/height could be computed directly from the roots and leaves.
+  DEBUG(unsigned maxH = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getHeight() > maxH)
+            maxH = SUnits[su].getHeight();
+        dbgs() << "Max Height " << maxH << "\n";);
+  DEBUG(unsigned maxD = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getDepth() > maxD)
+            maxD = SUnits[su].getDepth();
+        dbgs() << "Max Depth " << maxD << "\n";);
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  initQueues(TopRoots, BotRoots);
+
+  bool IsTopNode = false;
+  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+    if (!checkSchedLimit())
+      break;
+
+    scheduleMI(SU, IsTopNode);
+
+    updateQueues(SU, IsTopNode);
+
+    // Notify the scheduling strategy after updating the DAG.
+    SchedImpl->schedNode(SU, IsTopNode);
+  }
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+}
+
+void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
+  DAG = static_cast<VLIWMachineScheduler*>(dag);
+  SchedModel = DAG->getSchedModel();
+
+  Top.init(DAG, SchedModel);
+  Bot.init(DAG, SchedModel);
+
+  // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+  // are disabled, then these HazardRecs will be disabled.
+  const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
+  const TargetMachine &TM = DAG->MF.getTarget();
+  delete Top.HazardRec;
+  delete Bot.HazardRec;
+  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  delete Top.ResourceModel;
+  delete Bot.ResourceModel;
+  Top.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+  Bot.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+
+  assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+    unsigned MinLatency = I->getLatency();
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+      SU->TopReadyCycle = PredReadyCycle + MinLatency;
+  }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned MinLatency = I->getLatency();
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+      SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+  if (IssueCount + uops > SchedModel->getIssueWidth())
+    return true;
+
+  return false;
+}
+
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+
+    Pending.push(SU);
+  else
+    Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
+  unsigned Width = SchedModel->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  } else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+
+  DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+        << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
+  bool startNewCycle = false;
+
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+
+  // Update DFA model.
+  startNewCycle = ResourceModel->reserveResources(SU);
+
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+  if (startNewCycle) {
+    DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+  else
+    DEBUG(dbgs() << "*** IssueCount " << IssueCount
+          << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes in to the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    ResourceModel->reserveResources(nullptr);
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return nullptr;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+                                             const ReadyQueue &Q,
+                                             SUnit *SU, PressureChange P) {
+  dbgs() << Label << " " << Q.getName() << " ";
+  if (P.isValid())
+    dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+           << P.getUnitInc() << " ";
+  else
+    dbgs() << "     ";
+  SU->dump(DAG);
+}
+#endif
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = nullptr;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return nullptr;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+  return OnlyAvailablePred;
+}
+
+/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+  SUnit *OnlyAvailableSucc = nullptr;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    SUnit &Succ = *I->getSUnit();
+    if (!Succ.isScheduled) {
+      // We found an available, but not scheduled, successor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
+        return nullptr;
+      OnlyAvailableSucc = &Succ;
+    }
+  }
+  return OnlyAvailableSucc;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 50;
+static const unsigned ScaleTwo = 10;
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+                                            SchedCandidate &Candidate,
+                                            RegPressureDelta &Delta,
+                                            bool verbose) {
+  // Initial trivial priority.
+  int ResCount = 1;
+
+  // Do not waste time on a node that is already scheduled.
+  if (!SU || SU->isScheduled)
+    return ResCount;
+
+  // Forced priority is high.
+  if (SU->isScheduleHigh)
+    ResCount += PriorityOne;
+
+  // Critical path first.
+  if (Q.getID() == TopQID) {
+    ResCount += (SU->getHeight() * ScaleTwo);
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Top.ResourceModel->isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+  } else {
+    ResCount += (SU->getDepth() * ScaleTwo);
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Bot.ResourceModel->isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+  }
+
+  unsigned NumNodesBlocking = 0;
+  if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node.
+    // Count the number of nodes that
+    // this node is the sole unscheduled node for.
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I)
+      if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+        ++NumNodesBlocking;
+  } else {
+    // How many unscheduled predecessors block this node?
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I)
+      if (getSingleUnscheduledSucc(I->getSUnit()) == SU)
+        ++NumNodesBlocking;
+  }
+  ResCount += (NumNodesBlocking * ScaleTwo);
+
+  // Factor in reg pressure as a heuristic.
+  ResCount -= (Delta.Excess.getUnitInc()*PriorityTwo);
+  ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityTwo);
+
+  DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
+
+  return ResCount;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(Q.dump());
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // BestSU remains NULL if no top candidates beat the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+    // Initialize the candidate if needed.
+    if (!Candidate.SU) {
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = NodeOrder;
+      continue;
+    }
+
+    // Best cost.
+    if (CurrentCost > Candidate.SCost) {
+      DEBUG(traceCandidate("CCAND", Q, *I));
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = BestCost;
+      continue;
+    }
+
+    // Fall through to original instruction order.
+    // Only consider node order if Candidate was chosen from this Q.
+    if (FoundCandidate == NoCand)
+      continue;
+  }
+  return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    IsTopNode = true;
+    return SU;
+  }
+  SchedCandidate BotCand;
+  // Prefer bottom scheduling when heuristics are silent.
+  CandResult BotResult = pickNodeFromQueue(Bot.Available,
+                                           DAG->getBotRPTracker(), BotCand);
+  assert(BotResult != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that provides the least increase in
+  // Excess pressure, we can immediately schedule from that Q.
+  //
+  // RegionCriticalPSets summarizes the pressure within the scheduled region and
+  // affects picking from either Q. If scheduling in one direction must
+  // increase pressure for one of the excess PSets, then schedule in that
+  // direction first to provide more freedom in the other direction.
+  if (BotResult == SingleExcess || BotResult == SingleCritical) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  // Check if the top Q has a better candidate.
+  SchedCandidate TopCand;
+  CandResult TopResult = pickNodeFromQueue(Top.Available,
+                                           DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (BotResult == SingleMax) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  if (TopCand.SCost > BotCand.SCost) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate in node order.
+  IsTopNode = false;
+  return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+  SUnit *SU;
+  if (llvm::ForceTopDown) {
+    SU = Top.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+        pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
+  } else if (llvm::ForceBottomUp) {
+    SU = Bot.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+        pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      SU = BotCand.SU;
+    }
+    IsTopNode = false;
+  } else {
+    SU = pickNodeBidrectional(IsTopNode);
+  }
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+        << " Scheduling Instruction in cycle "
+        << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        SU->dump(DAG));
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update it's state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  } else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 0000000..8c41086
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,244 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler.      ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONASMPRINTER_H
+#define HEXAGONASMPRINTER_H
+
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+class VLIWResourceModel {
+  /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per say, but assumes
+  /// definition of DFA by a target.
+  DFAPacketizer *ResourcesModel;
+
+  const TargetSchedModel *SchedModel;
+
+  /// Local packet/bundle model. Purely
+  /// internal to the MI schedulre at the time.
+  std::vector<SUnit*> Packet;
+
+  /// Total packets created.
+  unsigned TotalPackets;
+
+public:
+VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
+    SchedModel(SM), TotalPackets(0) {
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, nullptr);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(SchedModel->getIssueWidth());
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  ~VLIWResourceModel() {
+    delete ResourcesModel;
+  }
+
+  void resetPacketState() {
+    Packet.clear();
+  }
+
+  void resetDFA() {
+    ResourcesModel->clearResources();
+  }
+
+  void reset() {
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  bool isResourceAvailable(SUnit *SU);
+  bool reserveResources(SUnit *SU);
+  unsigned getTotalPackets() const { return TotalPackets; }
+};
+
+/// Extend the standard ScheduleDAGMI to provide more context and override the
+/// top-level schedule() driver.
+class VLIWMachineScheduler : public ScheduleDAGMILive {
+public:
+  VLIWMachineScheduler(MachineSchedContext *C,
+                       std::unique_ptr<MachineSchedStrategy> S)
+      : ScheduleDAGMILive(C, std::move(S)) {}
+
+  /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+  /// time to do some work.
+  virtual void schedule() override;
+  /// Perform platform-specific DAG postprocessing.
+  void postprocessDAG();
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+  /// Store the state used by ConvergingVLIWScheduler heuristics, required
+  ///  for the lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    // Best scheduling cost.
+    int SCost;
+
+    SchedCandidate(): SU(nullptr), SCost(0) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+    BestCost};
+
+  /// Each Scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction at has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct VLIWSchedBoundary {
+    VLIWMachineScheduler *DAG;
+    const TargetSchedModel *SchedModel;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+    VLIWResourceModel *ResourceModel;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    VLIWSchedBoundary(unsigned ID, const Twine &Name):
+      DAG(nullptr), SchedModel(nullptr), Available(ID, Name+".A"),
+      Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(nullptr), ResourceModel(nullptr),
+      CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~VLIWSchedBoundary() {
+      delete ResourceModel;
+      delete HazardRec;
+    }
+
+    void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
+      DAG = dag;
+      SchedModel = smodel;
+    }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingVLIWScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
+  VLIWMachineScheduler *DAG;
+  const TargetSchedModel *SchedModel;
+
+  // State of the top and bottom scheduled instruction boundaries.
+  VLIWSchedBoundary Top;
+  VLIWSchedBoundary Bot;
+
+public:
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingVLIWScheduler()
+    : DAG(nullptr), SchedModel(nullptr), Top(TopQID, "TopQ"),
+      Bot(BotQID, "BotQ") {}
+
+  virtual void initialize(ScheduleDAGMI *dag) override;
+
+  virtual SUnit *pickNode(bool &IsTopNode) override;
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode) override;
+
+  virtual void releaseTopNode(SUnit *SU) override;
+
+  virtual void releaseBottomNode(SUnit *SU) override;
+
+  unsigned ReportPackets() {
+    return Top.ResourceModel->getTotalPackets() +
+           Bot.ResourceModel->getTotalPackets();
+  }
+
+protected:
+  SUnit *pickNodeBidrectional(bool &IsTopNode);
+
+  int SchedulingCost(ReadyQueue &Q,
+                     SUnit *SU, SchedCandidate &Candidate,
+                     RegPressureDelta &Delta, bool verbose);
+
+  CandResult pickNodeFromQueue(ReadyQueue &Q,
+                               const RegPressureTracker &RPTracker,
+                               SchedCandidate &Candidate);
+#ifndef NDEBUG
+  void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+                      PressureChange P = PressureChange());
+#endif
+};
+
+} // namespace
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
new file mode 100644
index 0000000..b7c03a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -0,0 +1,656 @@
+//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements NewValueJump pass in Hexagon.
+// Ideally, we should merge this as a Peephole pass prior to register
+// allocation, but because we have a spill in between the feeder and new value
+// jump instructions, we are forced to write after register allocation.
+// Having said that, we should re-attempt to pull this earlier at some point
+// in future.
+
+// The basic approach looks for sequence of predicated jump, compare instruciton
+// that genereates the predicate and, the feeder to the predicate. Once it finds
+// all, it collapses compare and jump instruction into a new valu jump
+// intstructions.
+//
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/PassSupport.h"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <map>
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-nvj"
+
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+  "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+    cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable New Value Jumps"));
+
+namespace llvm {
+  void initializeHexagonNewValueJumpPass(PassRegistry&);
+}
+
+
+namespace {
+  struct HexagonNewValueJump : public MachineFunctionPass {
+    const HexagonInstrInfo    *QII;
+    const HexagonRegisterInfo *QRI;
+
+  public:
+    static char ID;
+
+    HexagonNewValueJump() : MachineFunctionPass(ID) {
+      initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const override {
+      return "Hexagon NewValueJump";
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  private:
+    /// \brief A handle to the branch probability pass.
+    const MachineBranchProbabilityInfo *MBPI;
+
+  };
+
+} // end of anonymous namespace
+
+char HexagonNewValueJump::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonNewValueJump, "hexagon-nvj",
+                      "Hexagon NewValueJump", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(HexagonNewValueJump, "hexagon-nvj",
+                    "Hexagon NewValueJump", false, false)
+
+
+// We have identified this II could be feeder to NVJ,
+// verify that it can be.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+                                      const TargetRegisterInfo *TRI,
+                                      MachineBasicBlock::iterator II,
+                                      MachineBasicBlock::iterator end,
+                                      MachineBasicBlock::iterator skip,
+                                      MachineFunction &MF) {
+
+  // Predicated instruction can not be feeder to NVJ.
+  if (QII->isPredicated(II))
+    return false;
+
+  // Bail out if feederReg is a paired register (double regs in
+  // our case). One would think that we can check to see if a given
+  // register cmpReg1 or cmpReg2 is a sub register of feederReg
+  // using -- if (QRI->isSubRegister(feederReg, cmpReg1) logic
+  // before the callsite of this function
+  // But we can not as it comes in the following fashion.
+  //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+  //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+  //    %P0<def> = CMPEQri %R0<kill>, 0
+  // Hence, we need to check if it's a KILL instruction.
+  if (II->getOpcode() == TargetOpcode::KILL)
+    return false;
+
+
+  // Make sure there there is no 'def' or 'use' of any of the uses of
+  // feeder insn between it's definition, this MI and jump, jmpInst
+  // skipping compare, cmpInst.
+  // Here's the example.
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    r4=memub(r3+r21<<#0)
+  //    if (p0.new) jump:t .LBB29_45
+  // Without this check, it will be converted into
+  //    r4=memub(r3+r21<<#0)
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    if (p0.new) jump:t .LBB29_45
+  // and result WAR hazards if converted to New Value Jump.
+
+  for (unsigned i = 0; i < II->getNumOperands(); ++i) {
+    if (II->getOperand(i).isReg() &&
+        (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
+      MachineBasicBlock::iterator localII = II;
+      ++localII;
+      unsigned Reg = II->getOperand(i).getReg();
+      for (MachineBasicBlock::iterator localBegin = localII;
+                        localBegin != end; ++localBegin) {
+        if (localBegin == skip ) continue;
+        // Check for Subregisters too.
+        if (localBegin->modifiesRegister(Reg, TRI) ||
+            localBegin->readsRegister(Reg, TRI))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// These are the common checks that need to performed
+// to determine if
+// 1. compare instruction can be moved before jump.
+// 2. feeder to the compare instruction can be moved before jump.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+                          MachineBasicBlock::iterator MII) {
+
+  // If store in path, bail out.
+  if (MII->getDesc().mayStore())
+    return false;
+
+  // if call in path, bail out.
+  if (MII->getOpcode() == Hexagon::CALLv3)
+    return false;
+
+  // if NVJ is running prior to RA, do the following checks.
+  if (!afterRA) {
+    // The following Target Opcode instructions are spurious
+    // to new value jump. If they are in the path, bail out.
+    // KILL sets kill flag on the opcode. It also sets up a
+    // single register, out of pair.
+    //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+    //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+    //    %P0<def> = CMPEQri %R0<kill>, 0
+    // PHI can be anything after RA.
+    // COPY can remateriaze things in between feeder, compare and nvj.
+    if (MII->getOpcode() == TargetOpcode::KILL ||
+        MII->getOpcode() == TargetOpcode::PHI  ||
+        MII->getOpcode() == TargetOpcode::COPY)
+      return false;
+
+    // The following pseudo Hexagon instructions sets "use" and "def"
+    // of registers by individual passes in the backend. At this time,
+    // we don't know the scope of usage and definitions of these
+    // instructions.
+    if (MII->getOpcode() == Hexagon::TFR_condset_rr ||
+        MII->getOpcode() == Hexagon::TFR_condset_ii ||
+        MII->getOpcode() == Hexagon::TFR_condset_ri ||
+        MII->getOpcode() == Hexagon::TFR_condset_ir ||
+        MII->getOpcode() == Hexagon::LDriw_pred     ||
+        MII->getOpcode() == Hexagon::STriw_pred)
+      return false;
+  }
+
+  return true;
+}
+
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+                                     const TargetRegisterInfo *TRI,
+                                     MachineBasicBlock::iterator II,
+                                     unsigned pReg,
+                                     bool secondReg,
+                                     bool optLocation,
+                                     MachineBasicBlock::iterator end,
+                                     MachineFunction &MF) {
+
+  MachineInstr *MI = II;
+
+  // If the second operand of the compare is an imm, make sure it's in the
+  // range specified by the arch.
+  if (!secondReg) {
+    int64_t v = MI->getOperand(2).getImm();
+
+    if (!(isUInt<5>(v) ||
+         ((MI->getOpcode() == Hexagon::CMPEQri ||
+           MI->getOpcode() == Hexagon::CMPGTri) &&
+          (v == -1))))
+      return false;
+  }
+
+  unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
+  cmpReg1 = MI->getOperand(1).getReg();
+
+  if (secondReg) {
+    cmpOp2 = MI->getOperand(2).getReg();
+
+    // Make sure that that second register is not from COPY
+    // At machine code level, we don't need this, but if we decide
+    // to move new value jump prior to RA, we would be needing this.
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+      MachineInstr *def = MRI.getVRegDef(cmpOp2);
+      if (def->getOpcode() == TargetOpcode::COPY)
+        return false;
+    }
+  }
+
+  // Walk the instructions after the compare (predicate def) to the jump,
+  // and satisfy the following conditions.
+  ++II ;
+  for (MachineBasicBlock::iterator localII = II; localII != end;
+       ++localII) {
+
+    // Check 1.
+    // If "common" checks fail, bail out.
+    if (!commonChecksToProhibitNewValueJump(optLocation, localII))
+      return false;
+
+    // Check 2.
+    // If there is a def or use of predicate (result of compare), bail out.
+    if (localII->modifiesRegister(pReg, TRI) ||
+        localII->readsRegister(pReg, TRI))
+      return false;
+
+    // Check 3.
+    // If there is a def of any of the use of the compare (operands of compare),
+    // bail out.
+    // Eg.
+    //    p0 = cmp.eq(r2, r0)
+    //    r2 = r4
+    //    if (p0.new) jump:t .LBB28_3
+    if (localII->modifiesRegister(cmpReg1, TRI) ||
+        (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
+      return false;
+  }
+  return true;
+}
+
+// Given a compare operator, return a matching New Value Jump
+// compare operator. Make sure that MI here is included in
+// HexagonInstrInfo.cpp::isNewValueJumpCandidate
+static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
+                                      bool secondRegNewified,
+                                      MachineBasicBlock *jmpTarget,
+                                      const MachineBranchProbabilityInfo
+                                      *MBPI) {
+  bool taken = false;
+  MachineBasicBlock *Src = MI->getParent();
+  const BranchProbability Prediction =
+    MBPI->getEdgeProbability(Src, jmpTarget);
+
+  if (Prediction >= BranchProbability(1,2))
+    taken = true;
+
+  switch (MI->getOpcode()) {
+    case Hexagon::CMPEQrr:
+      return taken ? Hexagon::CMPEQrr_t_Jumpnv_t_V4
+                   : Hexagon::CMPEQrr_t_Jumpnv_nt_V4;
+
+    case Hexagon::CMPEQri: {
+      if (reg >= 0)
+        return taken ? Hexagon::CMPEQri_t_Jumpnv_t_V4
+                     : Hexagon::CMPEQri_t_Jumpnv_nt_V4;
+      else
+        return taken ? Hexagon::CMPEQn1_t_Jumpnv_t_V4
+                     : Hexagon::CMPEQn1_t_Jumpnv_nt_V4;
+    }
+
+    case Hexagon::CMPGTrr: {
+      if (secondRegNewified)
+        return taken ? Hexagon::CMPLTrr_t_Jumpnv_t_V4
+                     : Hexagon::CMPLTrr_t_Jumpnv_nt_V4;
+      else
+        return taken ? Hexagon::CMPGTrr_t_Jumpnv_t_V4
+                     : Hexagon::CMPGTrr_t_Jumpnv_nt_V4;
+    }
+
+    case Hexagon::CMPGTri: {
+      if (reg >= 0)
+        return taken ? Hexagon::CMPGTri_t_Jumpnv_t_V4
+                     : Hexagon::CMPGTri_t_Jumpnv_nt_V4;
+      else
+        return taken ? Hexagon::CMPGTn1_t_Jumpnv_t_V4
+                     : Hexagon::CMPGTn1_t_Jumpnv_nt_V4;
+    }
+
+    case Hexagon::CMPGTUrr: {
+      if (secondRegNewified)
+        return taken ? Hexagon::CMPLTUrr_t_Jumpnv_t_V4
+                     : Hexagon::CMPLTUrr_t_Jumpnv_nt_V4;
+      else
+        return taken ? Hexagon::CMPGTUrr_t_Jumpnv_t_V4
+                     : Hexagon::CMPGTUrr_t_Jumpnv_nt_V4;
+    }
+
+    case Hexagon::CMPGTUri:
+      return taken ? Hexagon::CMPGTUri_t_Jumpnv_t_V4
+                   : Hexagon::CMPGTUri_t_Jumpnv_nt_V4;
+
+    default:
+       llvm_unreachable("Could not find matching New Value Jump instruction.");
+  }
+  // return *some value* to avoid compiler warning
+  return 0;
+}
+
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+               << "********** Function: "
+               << MF.getName() << "\n");
+
+#if 0
+  // for now disable this, if we move NewValueJump before register
+  // allocation we need this information.
+  LiveVariables &LVs = getAnalysis<LiveVariables>();
+#endif
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+  QRI =
+    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+  if (!QRI->Subtarget.hasV4TOps() ||
+      DisableNewValueJumps) {
+    return false;
+  }
+
+  int nvjCount = DbgNVJCount;
+  int nvjGenerated = 0;
+
+  // Loop through all the bb's of the function
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+        MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+
+    DEBUG(dbgs() << "** dumping bb ** "
+                 << MBB->getNumber() << "\n");
+    DEBUG(MBB->dump());
+    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+    bool foundJump    = false;
+    bool foundCompare = false;
+    bool invertPredicate = false;
+    unsigned predReg = 0; // predicate reg of the jump.
+    unsigned cmpReg1 = 0;
+    int cmpOp2 = 0;
+    bool MO1IsKill = false;
+    bool MO2IsKill = false;
+    MachineBasicBlock::iterator jmpPos;
+    MachineBasicBlock::iterator cmpPos;
+    MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
+    MachineBasicBlock *jmpTarget = nullptr;
+    bool afterRA = false;
+    bool isSecondOpReg = false;
+    bool isSecondOpNewified = false;
+    // Traverse the basic block - bottom up
+    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+             MII != E;) {
+      MachineInstr *MI = --MII;
+      if (MI->isDebugValue()) {
+        continue;
+      }
+
+      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+        break;
+
+      DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
+
+      if (!foundJump &&
+         (MI->getOpcode() == Hexagon::JMP_t ||
+          MI->getOpcode() == Hexagon::JMP_f ||
+          MI->getOpcode() == Hexagon::JMP_tnew_t ||
+          MI->getOpcode() == Hexagon::JMP_tnew_nt ||
+          MI->getOpcode() == Hexagon::JMP_fnew_t ||
+          MI->getOpcode() == Hexagon::JMP_fnew_nt)) {
+        // This is where you would insert your compare and
+        // instr that feeds compare
+        jmpPos = MII;
+        jmpInstr = MI;
+        predReg = MI->getOperand(0).getReg();
+        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+        // If ifconverter had not messed up with the kill flags of the
+        // operands, the following check on the kill flag would suffice.
+        // if(!jmpInstr->getOperand(0).isKill()) break;
+
+        // This predicate register is live out out of BB
+        // this would only work if we can actually use Live
+        // variable analysis on phy regs - but LLVM does not
+        // provide LV analysis on phys regs.
+        //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+        // Get all the successors of this block - which will always
+        // be 2. Check if the predicate register is live in in those
+        // successor. If yes, we can not delete the predicate -
+        // I am doing this only because LLVM does not provide LiveOut
+        // at the BB level.
+        bool predLive = false;
+        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+                            SIE = MBB->succ_end(); SI != SIE; ++SI) {
+          MachineBasicBlock* succMBB = *SI;
+         if (succMBB->isLiveIn(predReg)) {
+            predLive = true;
+          }
+        }
+        if (predLive)
+          break;
+
+        jmpTarget = MI->getOperand(1).getMBB();
+        foundJump = true;
+        if (MI->getOpcode() == Hexagon::JMP_f ||
+            MI->getOpcode() == Hexagon::JMP_fnew_t ||
+            MI->getOpcode() == Hexagon::JMP_fnew_nt) {
+          invertPredicate = true;
+        }
+        continue;
+      }
+
+      // No new value jump if there is a barrier. A barrier has to be in its
+      // own packet. A barrier has zero operands. We conservatively bail out
+      // here if we see any instruction with zero operands.
+      if (foundJump && MI->getNumOperands() == 0)
+        break;
+
+      if (foundJump &&
+         !foundCompare &&
+          MI->getOperand(0).isReg() &&
+          MI->getOperand(0).getReg() == predReg) {
+
+        // Not all compares can be new value compare. Arch Spec: 7.6.1.1
+        if (QII->isNewValueJumpCandidate(MI)) {
+
+          assert((MI->getDesc().isCompare()) &&
+              "Only compare instruction can be collapsed into New Value Jump");
+          isSecondOpReg = MI->getOperand(2).isReg();
+
+          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+                                        afterRA, jmpPos, MF))
+            break;
+
+          cmpInstr = MI;
+          cmpPos = MII;
+          foundCompare = true;
+
+          // We need cmpReg1 and cmpOp2(imm or reg) while building
+          // new value jump instruction.
+          cmpReg1 = MI->getOperand(1).getReg();
+          if (MI->getOperand(1).isKill())
+            MO1IsKill = true;
+
+          if (isSecondOpReg) {
+            cmpOp2 = MI->getOperand(2).getReg();
+            if (MI->getOperand(2).isKill())
+              MO2IsKill = true;
+          } else
+            cmpOp2 = MI->getOperand(2).getImm();
+          continue;
+        }
+      }
+
+      if (foundCompare && foundJump) {
+
+        // If "common" checks fail, bail out on this BB.
+        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+          break;
+
+        bool foundFeeder = false;
+        MachineBasicBlock::iterator feederPos = MII;
+        if (MI->getOperand(0).isReg() &&
+            MI->getOperand(0).isDef() &&
+           (MI->getOperand(0).getReg() == cmpReg1 ||
+            (isSecondOpReg &&
+             MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {
+
+          unsigned feederReg = MI->getOperand(0).getReg();
+
+          // First try to see if we can get the feeder from the first operand
+          // of the compare. If we can not, and if secondOpReg is true
+          // (second operand of the compare is also register), try that one.
+          // TODO: Try to come up with some heuristic to figure out which
+          // feeder would benefit.
+
+          if (feederReg == cmpReg1) {
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+              if (!isSecondOpReg)
+                break;
+              else
+                continue;
+            } else
+              foundFeeder = true;
+          }
+
+          if (!foundFeeder &&
+               isSecondOpReg &&
+               feederReg == (unsigned) cmpOp2)
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+              break;
+
+          if (isSecondOpReg) {
+            // In case of CMPLT, or CMPLTU, or EQ with the second register
+            // to newify, swap the operands.
+            if (cmpInstr->getOpcode() == Hexagon::CMPEQrr &&
+                                     feederReg == (unsigned) cmpOp2) {
+              unsigned tmp = cmpReg1;
+              bool tmpIsKill = MO1IsKill;
+              cmpReg1 = cmpOp2;
+              MO1IsKill = MO2IsKill;
+              cmpOp2 = tmp;
+              MO2IsKill = tmpIsKill;
+            }
+
+            // Now we have swapped the operands, all we need to check is,
+            // if the second operand (after swap) is the feeder.
+            // And if it is, make a note.
+            if (feederReg == (unsigned)cmpOp2)
+              isSecondOpNewified = true;
+          }
+
+          // Now that we are moving feeder close the jump,
+          // make sure we are respecting the kill values of
+          // the operands of the feeder.
+
+          bool updatedIsKill = false;
+          for (unsigned i = 0; i < MI->getNumOperands(); i++) {
+            MachineOperand &MO = MI->getOperand(i);
+            if (MO.isReg() && MO.isUse()) {
+              unsigned feederReg = MO.getReg();
+              for (MachineBasicBlock::iterator localII = feederPos,
+                   end = jmpPos; localII != end; localII++) {
+                MachineInstr *localMI = localII;
+                for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
+                  MachineOperand &localMO = localMI->getOperand(j);
+                  if (localMO.isReg() && localMO.isUse() &&
+                      localMO.isKill() && feederReg == localMO.getReg()) {
+                    // We found that there is kill of a use register
+                    // Set up a kill flag on the register
+                    localMO.setIsKill(false);
+                    MO.setIsKill();
+                    updatedIsKill = true;
+                    break;
+                  }
+                }
+                if (updatedIsKill) break;
+              }
+            }
+            if (updatedIsKill) break;
+          }
+
+          MBB->splice(jmpPos, MI->getParent(), MI);
+          MBB->splice(jmpPos, MI->getParent(), cmpInstr);
+          DebugLoc dl = MI->getDebugLoc();
+          MachineInstr *NewMI;
+
+           assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
+                      "This compare is not a New Value Jump candidate.");
+          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+                                               isSecondOpNewified,
+                                               jmpTarget, MBPI);
+          if (invertPredicate)
+            opc = QII->getInvertedPredicatedOpcode(opc);
+
+          if (isSecondOpReg)
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addReg(cmpOp2, getKillRegState(MO2IsKill))
+                                    .addMBB(jmpTarget);
+
+          else if ((cmpInstr->getOpcode() == Hexagon::CMPEQri ||
+                    cmpInstr->getOpcode() == Hexagon::CMPGTri) &&
+                    cmpOp2 == -1 )
+            // Corresponding new-value compare jump instructions don't have the
+            // operand for -1 immediate value.
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addMBB(jmpTarget);
+
+          else
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addImm(cmpOp2)
+                                    .addMBB(jmpTarget);
+
+          assert(NewMI && "New Value Jump Instruction Not created!");
+          (void)NewMI;
+          if (cmpInstr->getOperand(0).isReg() &&
+              cmpInstr->getOperand(0).isKill())
+            cmpInstr->getOperand(0).setIsKill(false);
+          if (cmpInstr->getOperand(1).isReg() &&
+              cmpInstr->getOperand(1).isKill())
+            cmpInstr->getOperand(1).setIsKill(false);
+          cmpInstr->eraseFromParent();
+          jmpInstr->eraseFromParent();
+          ++nvjGenerated;
+          ++NumNVJGenerated;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+
+}
+
+FunctionPass *llvm::createHexagonNewValueJump() {
+  return new HexagonNewValueJump();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
new file mode 100644
index 0000000..c79d78f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -0,0 +1,858 @@
+//===- HexagonOperands.td - Hexagon immediate processing -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illnois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Immediate operands.
+
+let PrintMethod = "printImmOperand" in {
+  // f32Ext type is used to identify constant extended floating point immediates.
+  def f32Ext : Operand<f32>;
+  def s32Imm : Operand<i32>;
+  def s26_6Imm : Operand<i32>;
+  def s16Imm : Operand<i32>;
+  def s12Imm : Operand<i32>;
+  def s11Imm : Operand<i32>;
+  def s11_0Imm : Operand<i32>;
+  def s11_1Imm : Operand<i32>;
+  def s11_2Imm : Operand<i32>;
+  def s11_3Imm : Operand<i32>;
+  def s10Imm : Operand<i32>;
+  def s9Imm : Operand<i32>;
+  def m9Imm : Operand<i32>;
+  def s8Imm : Operand<i32>;
+  def s8Imm64 : Operand<i64>;
+  def s6Imm : Operand<i32>;
+  def s4Imm : Operand<i32>;
+  def s4_0Imm : Operand<i32>;
+  def s4_1Imm : Operand<i32>;
+  def s4_2Imm : Operand<i32>;
+  def s4_3Imm : Operand<i32>;
+  def u64Imm : Operand<i64>;
+  def u32Imm : Operand<i32>;
+  def u26_6Imm : Operand<i32>;
+  def u16Imm : Operand<i32>;
+  def u16_0Imm : Operand<i32>;
+  def u16_1Imm : Operand<i32>;
+  def u16_2Imm : Operand<i32>;
+  def u11_3Imm : Operand<i32>;
+  def u10Imm : Operand<i32>;
+  def u9Imm : Operand<i32>;
+  def u8Imm : Operand<i32>;
+  def u7Imm : Operand<i32>;
+  def u6Imm : Operand<i32>;
+  def u6_0Imm : Operand<i32>;
+  def u6_1Imm : Operand<i32>;
+  def u6_2Imm : Operand<i32>;
+  def u6_3Imm : Operand<i32>;
+  def u5Imm : Operand<i32>;
+  def u4Imm : Operand<i32>;
+  def u3Imm : Operand<i32>;
+  def u2Imm : Operand<i32>;
+  def u1Imm : Operand<i32>;
+  def n8Imm : Operand<i32>;
+  def m6Imm : Operand<i32>;
+}
+
+let PrintMethod = "printNOneImmOperand" in
+def nOneImm : Operand<i32>;
+
+//
+// Immediate predicates
+//
+def s32ImmPred  : PatLeaf<(i32 imm), [{
+  // s32ImmPred predicate - True if the immediate fits in a 32-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<32>(v);
+}]>;
+
+def s32_24ImmPred  : PatLeaf<(i32 imm), [{
+  // s32_24ImmPred predicate - True if the immediate fits in a 32-bit sign
+  // extended field that is a multiple of 0x1000000.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<32,24>(v);
+}]>;
+
+def s32_16s8ImmPred  : PatLeaf<(i32 imm), [{
+  // s32_16s8ImmPred predicate - True if the immediate fits in a 32-bit sign
+  // extended field that is a multiple of 0x10000.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<24,16>(v);
+}]>;
+
+def s26_6ImmPred  : PatLeaf<(i32 imm), [{
+  // s26_6ImmPred predicate - True if the immediate fits in a 32-bit
+  // sign extended field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<26,6>(v);
+}]>;
+
+
+def s16ImmPred  : PatLeaf<(i32 imm), [{
+  // s16ImmPred predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<16>(v);
+}]>;
+
+
+def s13ImmPred  : PatLeaf<(i32 imm), [{
+  // s13ImmPred predicate - True if the immediate fits in a 13-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<13>(v);
+}]>;
+
+
+def s12ImmPred  : PatLeaf<(i32 imm), [{
+  // s12ImmPred predicate - True if the immediate fits in a 12-bit
+  // sign extended field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<12>(v);
+}]>;
+
+def s11_0ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_0ImmPred predicate - True if the immediate fits in a 11-bit
+  // sign extended field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<11>(v);
+}]>;
+
+
+def s11_1ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_1ImmPred predicate - True if the immediate fits in a 12-bit
+  // sign extended field and is a multiple of 2.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,1>(v);
+}]>;
+
+
+def s11_2ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_2ImmPred predicate - True if the immediate fits in a 13-bit
+  // sign extended field and is a multiple of 4.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,2>(v);
+}]>;
+
+
+def s11_3ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_3ImmPred predicate - True if the immediate fits in a 14-bit
+  // sign extended field and is a multiple of 8.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,3>(v);
+}]>;
+
+
+def s10ImmPred  : PatLeaf<(i32 imm), [{
+  // s10ImmPred predicate - True if the immediate fits in a 10-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<10>(v);
+}]>;
+
+
+def s9ImmPred  : PatLeaf<(i32 imm), [{
+  // s9ImmPred predicate - True if the immediate fits in a 9-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<9>(v);
+}]>;
+
+def m9ImmPred  : PatLeaf<(i32 imm), [{
+  // m9ImmPred predicate - True if the immediate fits in a 9-bit magnitude
+  // field. The range of m9 is -255 to 255.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<9>(v) && (v != -256);
+}]>;
+
+def s8ImmPred  : PatLeaf<(i32 imm), [{
+  // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<8>(v);
+}]>;
+
+
+def s8Imm64Pred  : PatLeaf<(i64 imm), [{
+  // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<8>(v);
+}]>;
+
+
+def s6ImmPred  : PatLeaf<(i32 imm), [{
+  // s6ImmPred predicate - True if the immediate fits in a 6-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<6>(v);
+}]>;
+
+
+def s4_0ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_0ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<4>(v);
+}]>;
+
+
+def s4_1ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_1ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field of 2.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,1>(v);
+}]>;
+
+
+def s4_2ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_2ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field that is a multiple of 4.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,2>(v);
+}]>;
+
+
+def s4_3ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_3ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field that is a multiple of 8.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,3>(v);
+}]>;
+
+
+def u64ImmPred  : PatLeaf<(i64 imm), [{
+  // Adding "N ||" to suppress gcc unused warning.
+  return (N || true);
+}]>;
+
+def u32ImmPred  : PatLeaf<(i32 imm), [{
+  // u32ImmPred predicate - True if the immediate fits in a 32-bit field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<32>(v);
+}]>;
+
+def u26_6ImmPred  : PatLeaf<(i32 imm), [{
+  // u26_6ImmPred - True if the immediate fits in a 32-bit field and
+  // is a multiple of 64.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<26,6>(v);
+}]>;
+
+def u16ImmPred  : PatLeaf<(i32 imm), [{
+  // u16ImmPred predicate - True if the immediate fits in a 16-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<16>(v);
+}]>;
+
+def u16_s8ImmPred  : PatLeaf<(i32 imm), [{
+  // u16_s8ImmPred predicate - True if the immediate fits in a 16-bit sign
+  // extended s8 field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<16,8>(v);
+}]>;
+
+def u9ImmPred  : PatLeaf<(i32 imm), [{
+  // u9ImmPred predicate - True if the immediate fits in a 9-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<9>(v);
+}]>;
+
+
+def u8ImmPred  : PatLeaf<(i32 imm), [{
+  // u8ImmPred predicate - True if the immediate fits in a 8-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<8>(v);
+}]>;
+
+def u7StrictPosImmPred : ImmLeaf<i32, [{
+  // u7StrictPosImmPred predicate - True if the immediate fits in an 7-bit
+  // unsigned field and is strictly greater than 0.
+  return isUInt<7>(Imm) && Imm > 0;
+}]>;
+
+def u7ImmPred  : PatLeaf<(i32 imm), [{
+  // u7ImmPred predicate - True if the immediate fits in a 7-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<7>(v);
+}]>;
+
+
+def u6ImmPred  : PatLeaf<(i32 imm), [{
+  // u6ImmPred predicate - True if the immediate fits in a 6-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}]>;
+
+def u6_0ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_0ImmPred predicate - True if the immediate fits in a 6-bit unsigned
+  // field. Same as u6ImmPred.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}]>;
+
+def u6_1ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_1ImmPred predicate - True if the immediate fits in a 7-bit unsigned
+  // field that is 1 bit alinged - multiple of 2.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,1>(v);
+}]>;
+
+def u6_2ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_2ImmPred predicate - True if the immediate fits in a 8-bit unsigned
+  // field that is 2 bits alinged - multiple of 4.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,2>(v);
+}]>;
+
+def u6_3ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_3ImmPred predicate - True if the immediate fits in a 9-bit unsigned
+  // field that is 3 bits alinged - multiple of 8.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,3>(v);
+}]>;
+
+def u5ImmPred  : PatLeaf<(i32 imm), [{
+  // u5ImmPred predicate - True if the immediate fits in a 5-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<5>(v);
+}]>;
+
+
+def u3ImmPred  : PatLeaf<(i32 imm), [{
+  // u3ImmPred predicate - True if the immediate fits in a 3-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<3>(v);
+}]>;
+
+
+def u2ImmPred  : PatLeaf<(i32 imm), [{
+  // u2ImmPred predicate - True if the immediate fits in a 2-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<2>(v);
+}]>;
+
+
+def u1ImmPred  : PatLeaf<(i1 imm), [{
+  // u1ImmPred predicate - True if the immediate fits in a 1-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<1>(v);
+}]>;
+
+def m5BImmPred  : PatLeaf<(i32 imm), [{
+  // m5BImmPred predicate - True if the (char) number is in range -1 .. -31
+  // and will fit in a 5 bit field when made positive, for use in memops.
+  // this is specific to the zero extending of a negative by CombineInstr
+  int8_t v = (int8_t)N->getSExtValue();
+  return (-31 <= v && v <= -1);
+}]>;
+
+def m5HImmPred  : PatLeaf<(i32 imm), [{
+  // m5HImmPred predicate - True if the (short) number is in range -1 .. -31
+  // and will fit in a 5 bit field when made positive, for use in memops.
+  // this is specific to the zero extending of a negative by CombineInstr
+  int16_t v = (int16_t)N->getSExtValue();
+  return (-31 <= v && v <= -1);
+}]>;
+
+def m5ImmPred  : PatLeaf<(i32 imm), [{
+  // m5ImmPred predicate - True if the number is in range -1 .. -31
+  // and will fit in a 5 bit field when made positive, for use in memops.
+  int64_t v = (int64_t)N->getSExtValue();
+  return (-31 <= v && v <= -1);
+}]>;
+
+//InN means negative integers in [-(2^N - 1), 0]
+def n8ImmPred  : PatLeaf<(i32 imm), [{
+  // n8ImmPred predicate - True if the immediate fits in a 8-bit signed
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return (-255 <= v && v <= 0);
+}]>;
+
+def nOneImmPred  : PatLeaf<(i32 imm), [{
+  // nOneImmPred predicate - True if the immediate is -1.
+  int64_t v = (int64_t)N->getSExtValue();
+  return (-1 == v);
+}]>;
+
+def Set5ImmPred : PatLeaf<(i32 imm), [{
+  // Set5ImmPred predicate - True if the number is in the series of values.
+  // [ 2^0, 2^1, ... 2^31 ]
+  // For use in setbit immediate.
+  uint32_t v = (int32_t)N->getSExtValue();
+  // Constrain to 32 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def Clr5ImmPred : PatLeaf<(i32 imm), [{
+  // Clr5ImmPred predicate - True if the number is in the series of
+  // bit negated values.
+  // [ 2^0, 2^1, ... 2^31 ]
+  // For use in clrbit immediate.
+  // Note: we are bit NOTing the value.
+  uint32_t v = ~ (int32_t)N->getSExtValue();
+  // Constrain to 32 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def SetClr5ImmPred : PatLeaf<(i32 imm), [{
+  // SetClr5ImmPred predicate - True if the immediate is in range 0..31.
+  int32_t v = (int32_t)N->getSExtValue();
+  return (v >= 0 && v <= 31);
+}]>;
+
+def Set4ImmPred : PatLeaf<(i32 imm), [{
+  // Set4ImmPred predicate - True if the number is in the series of values:
+  // [ 2^0, 2^1, ... 2^15 ].
+  // For use in setbit immediate.
+  uint16_t v = (int16_t)N->getSExtValue();
+  // Constrain to 16 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def Clr4ImmPred : PatLeaf<(i32 imm), [{
+  // Clr4ImmPred predicate - True if the number is in the series of
+  // bit negated values:
+  // [ 2^0, 2^1, ... 2^15 ].
+  // For use in setbit and clrbit immediate.
+  uint16_t v = ~ (int16_t)N->getSExtValue();
+  // Constrain to 16 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def SetClr4ImmPred : PatLeaf<(i32 imm), [{
+  // SetClr4ImmPred predicate - True if the immediate is in the range 0..15.
+  int16_t v = (int16_t)N->getSExtValue();
+  return (v >= 0 && v <= 15);
+}]>;
+
+def Set3ImmPred : PatLeaf<(i32 imm), [{
+  // Set3ImmPred predicate - True if the number is in the series of values:
+  // [ 2^0, 2^1, ... 2^7 ].
+  // For use in setbit immediate.
+  uint8_t v = (int8_t)N->getSExtValue();
+  // Constrain to 8 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def Clr3ImmPred : PatLeaf<(i32 imm), [{
+  // Clr3ImmPred predicate - True if the number is in the series of
+  // bit negated values:
+  // [ 2^0, 2^1, ... 2^7 ].
+  // For use in setbit and clrbit immediate.
+  uint8_t v = ~ (int8_t)N->getSExtValue();
+  // Constrain to 8 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def SetClr3ImmPred : PatLeaf<(i32 imm), [{
+  // SetClr3ImmPred predicate - True if the immediate is in the range  0..7.
+  int8_t v = (int8_t)N->getSExtValue();
+  return (v >= 0 && v <= 7);
+}]>;
+
+
+// Extendable immediate operands.
+
+let PrintMethod = "printExtOperand" in {
+  def s16Ext : Operand<i32>;
+  def s12Ext : Operand<i32>;
+  def s10Ext : Operand<i32>;
+  def s9Ext : Operand<i32>;
+  def s8Ext : Operand<i32>;
+  def s6Ext : Operand<i32>;
+  def s11_0Ext : Operand<i32>;
+  def s11_1Ext : Operand<i32>;
+  def s11_2Ext : Operand<i32>;
+  def s11_3Ext : Operand<i32>;
+  def u6Ext : Operand<i32>;
+  def u7Ext : Operand<i32>;
+  def u8Ext : Operand<i32>;
+  def u9Ext : Operand<i32>;
+  def u10Ext : Operand<i32>;
+  def u6_0Ext : Operand<i32>;
+  def u6_1Ext : Operand<i32>;
+  def u6_2Ext : Operand<i32>;
+  def u6_3Ext : Operand<i32>;
+}
+
+let PrintMethod = "printImmOperand" in
+def u0AlwaysExt : Operand<i32>;
+
+// Predicates for constant extendable operands
+def s16ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 16-bit sign extended field.
+    return isInt<16>(v);
+  else {
+    if (isInt<16>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s10ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 10-bit sign extended field.
+    return isInt<10>(v);
+  else {
+    if (isInt<10>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s9ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 9-bit sign extended field.
+    return isInt<9>(v);
+  else {
+    if (isInt<9>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s8ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 8-bit sign extended field.
+    return isInt<8>(v);
+  else {
+    if (isInt<8>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s8_16ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate fits in a 8-bit sign extended field.
+    return isInt<8>(v);
+  else {
+    if (isInt<8>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can't fit in a 16-bit signed field. This is required to avoid
+    // unnecessary constant extenders.
+    return isConstExtProfitable(Node) && !isInt<16>(v);
+  }
+}]>;
+
+def s6ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 6-bit sign extended field.
+    return isInt<6>(v);
+  else {
+    if (isInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s6_16ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate fits in a 6-bit sign extended field.
+    return isInt<6>(v);
+  else {
+    if (isInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can't fit in a 16-bit signed field. This is required to avoid
+    // unnecessary constant extenders.
+    return isConstExtProfitable(Node) && !isInt<16>(v);
+  }
+}]>;
+
+def s6_10ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 6-bit sign extended field.
+    return isInt<6>(v);
+  else {
+    if (isInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can't fit in a 10-bit signed field. This is required to avoid
+    // unnecessary constant extenders.
+    return isConstExtProfitable(Node) && !isInt<10>(v);
+  }
+}]>;
+
+def s11_0ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 11-bit sign extended field.
+    return isShiftedInt<11,0>(v);
+  else {
+    if (isInt<11>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s11_1ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 12-bit sign extended field and
+    // is 2 byte aligned.
+    return isShiftedInt<11,1>(v);
+  else {
+    if (isInt<12>(v))
+      return isShiftedInt<11,1>(v);
+
+    // Return true if extending this immediate is profitable and the low 1 bit
+    // is zero (2-byte aligned).
+    return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 2) == 0);
+  }
+}]>;
+
+def s11_2ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 13-bit sign extended field and
+    // is 4-byte aligned.
+    return isShiftedInt<11,2>(v);
+  else {
+    if (isInt<13>(v))
+      return isShiftedInt<11,2>(v);
+
+    // Return true if extending this immediate is profitable and the low 2-bits
+    // are zero (4-byte aligned).
+    return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 4) == 0);
+  }
+}]>;
+
+def s11_3ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 14-bit sign extended field and
+    // is 8-byte aligned.
+    return isShiftedInt<11,3>(v);
+  else {
+    if (isInt<14>(v))
+     return isShiftedInt<11,3>(v);
+
+    // Return true if extending this immediate is profitable and the low 3-bits
+    // are zero (8-byte aligned).
+    return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 8) == 0);
+  }
+}]>;
+
+def u0AlwaysExtPred : PatLeaf<(i32 imm), [{
+  // Predicate for an unsigned 32-bit value that always needs to be extended.
+  if (Subtarget.hasV4TOps()) {
+    if (isConstExtProfitable(Node)) {
+      int64_t v = (int64_t)N->getSExtValue();
+      return isUInt<32>(v);
+    }
+  }
+  return false;
+}]>;
+
+def u6ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 6-bit unsigned field.
+    return isUInt<6>(v);
+  else {
+    if (isUInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u7ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 7-bit unsigned field.
+    return isUInt<7>(v);
+  else {
+    if (isUInt<7>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u8ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 8-bit unsigned field.
+    return isUInt<8>(v);
+  else {
+    if (isUInt<8>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u9ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 9-bit unsigned field.
+    return isUInt<9>(v);
+  else {
+    if (isUInt<9>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u6_1ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 7-bit unsigned field and
+    // is 2-byte aligned.
+    return isShiftedUInt<6,1>(v);
+  else {
+    if (isUInt<7>(v))
+      return isShiftedUInt<6,1>(v);
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 2) == 0);
+  }
+}]>;
+
+def u6_2ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 8-bit unsigned field and
+    // is 4-byte aligned.
+    return isShiftedUInt<6,2>(v);
+  else {
+    if (isUInt<8>(v))
+      return isShiftedUInt<6,2>(v);
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 4) == 0);
+  }
+}]>;
+
+def u6_3ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 9-bit unsigned field and
+    // is 8-byte aligned.
+    return isShiftedUInt<6,3>(v);
+  else {
+    if (isUInt<9>(v))
+      return isShiftedUInt<6,3>(v);
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 8) == 0);
+  }
+}]>;
+
+// Addressing modes.
+
+def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>;
+def ADDRriS11_0 : ComplexPattern<i32, 2, "SelectADDRriS11_0", [frameindex], []>;
+def ADDRriS11_1 : ComplexPattern<i32, 2, "SelectADDRriS11_1", [frameindex], []>;
+def ADDRriS11_2 : ComplexPattern<i32, 2, "SelectADDRriS11_2", [frameindex], []>;
+def ADDRriS11_3 : ComplexPattern<i32, 2, "SelectADDRriS11_3", [frameindex], []>;
+def ADDRriU6_0 : ComplexPattern<i32, 2, "SelectADDRriU6_0", [frameindex], []>;
+def ADDRriU6_1 : ComplexPattern<i32, 2, "SelectADDRriU6_1", [frameindex], []>;
+def ADDRriU6_2 : ComplexPattern<i32, 2, "SelectADDRriU6_2", [frameindex], []>;
+
+// Address operands.
+
+def MEMrr : Operand<i32> {
+  let PrintMethod = "printMEMrrOperand";
+  let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMEMriOperand";
+  let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+
+def MEMri_s11_2 : Operand<i32>,
+  ComplexPattern<i32, 2, "SelectMEMriS11_2", []> {
+  let PrintMethod = "printMEMriOperand";
+  let MIOperandInfo = (ops IntRegs, s11Imm);
+}
+
+def FrameIndex : Operand<i32> {
+  let PrintMethod = "printFrameIndexOperand";
+  let MIOperandInfo = (ops IntRegs, s11Imm);
+}
+
+let PrintMethod = "printGlobalOperand" in {
+  def globaladdress : Operand<i32>;
+  def globaladdressExt : Operand<i32>;
+}
+
+let PrintMethod = "printJumpTable" in
+def jumptablebase : Operand<i32>;
+
+def brtarget : Operand<OtherVT>;
+def brtargetExt : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+def bblabel : Operand<i32>;
+def bbl   : SDNode<"ISD::BasicBlock", SDTPtrLeaf   , [], "BasicBlockSDNode">;
+
+def symbolHi32 : Operand<i32> {
+  let PrintMethod = "printSymbolHi";
+}
+def symbolLo32 : Operand<i32> {
+  let PrintMethod = "printSymbolLo";
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
new file mode 100644
index 0000000..48b6159
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -0,0 +1,347 @@
+//===-- HexagonPeephole.cpp - Hexagon Peephole Optimiztions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This peephole pass optimizes in the following cases.
+// 1. Optimizes redundant sign extends for the following case
+//    Transform the following pattern
+//    %vreg170<def> = SXTW %vreg166
+//    ...
+//    %vreg176<def> = COPY %vreg170:subreg_loreg
+//
+//    Into
+//    %vreg176<def> = COPY vreg166
+//
+//  2. Optimizes redundant negation of predicates.
+//     %vreg15<def> = CMPGTrr %vreg6, %vreg2
+//     ...
+//     %vreg16<def> = NOT_p %vreg15<kill>
+//     ...
+//     JMP_c %vreg16<kill>, <BB#1>, %PC<imp-def,dead>
+//
+//     Into
+//     %vreg15<def> = CMPGTrr %vreg6, %vreg2;
+//     ...
+//     JMP_cNot %vreg15<kill>, <BB#1>, %PC<imp-def,dead>;
+//
+// Note: The peephole pass makes the instrucstions like
+// %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
+// redundant and relies on some form of dead removal instructions, like
+// DCE or DIE to actually eliminate them.
+
+
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-peephole"
+
+static cl::opt<bool> DisableHexagonPeephole("disable-hexagon-peephole",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Peephole Optimization"));
+
+static cl::opt<bool> DisablePNotP("disable-hexagon-pnotp",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Optimization of PNotP"));
+
+static cl::opt<bool> DisableOptSZExt("disable-hexagon-optszext",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Optimization of Sign/Zero Extends"));
+
+static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Optimization of extensions to i64."));
+
+namespace llvm {
+  void initializeHexagonPeepholePass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonPeephole : public MachineFunctionPass {
+    const HexagonInstrInfo    *QII;
+    const HexagonRegisterInfo *QRI;
+    const MachineRegisterInfo *MRI;
+
+  public:
+    static char ID;
+    HexagonPeephole() : MachineFunctionPass(ID) {
+      initializeHexagonPeepholePass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "Hexagon optimize redundant zero and size extends";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src);
+  };
+}
+
+char HexagonPeephole::ID = 0;
+
+INITIALIZE_PASS(HexagonPeephole, "hexagon-peephole", "Hexagon Peephole",
+                false, false)
+
+bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
+  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().
+                                        getInstrInfo());
+  QRI = static_cast<const HexagonRegisterInfo *>(MF.getTarget().
+                                       getRegisterInfo());
+  MRI = &MF.getRegInfo();
+
+  DenseMap<unsigned, unsigned> PeepholeMap;
+  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
+
+  if (DisableHexagonPeephole) return false;
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    PeepholeMap.clear();
+    PeepholeDoubleRegsMap.clear();
+
+    // Traverse the basic block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+                                     ++MII) {
+      MachineInstr *MI = MII;
+      // Look for sign extends:
+      // %vreg170<def> = SXTW %vreg166
+      if (!DisableOptSZExt && MI->getOpcode() == Hexagon::SXTW) {
+        assert (MI->getNumOperands() == 2);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src  = MI->getOperand(1);
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = SXTW %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for  %vreg170<def> = COMBINE_ir_V4 (0, %vreg169)
+      // %vreg170:DoublRegs, %vreg169:IntRegs
+      if (!DisableOptExtTo64 &&
+          MI->getOpcode () == Hexagon::COMBINE_Ir_V4) {
+        assert (MI->getNumOperands() == 3);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src1 = MI->getOperand(1);
+        MachineOperand &Src2 = MI->getOperand(2);
+        if (Src1.getImm() != 0)
+          continue;
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src2.getReg();
+        PeepholeMap[DstReg] = SrcReg;
+      }
+
+      // Look for this sequence below
+      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+      // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
+      // and convert into
+      // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
+      if (MI->getOpcode() == Hexagon::LSRd_ri) {
+        assert(MI->getNumOperands() == 3);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src1 = MI->getOperand(1);
+        MachineOperand &Src2 = MI->getOperand(2);
+        if (Src2.getImm() != 32)
+          continue;
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src1.getReg();
+        PeepholeDoubleRegsMap[DstReg] =
+          std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/);
+      }
+
+      // Look for P=NOT(P).
+      if (!DisablePNotP &&
+          (MI->getOpcode() == Hexagon::NOT_p)) {
+        assert (MI->getNumOperands() == 2);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src  = MI->getOperand(1);
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = NOT_xx %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for copy:
+      // %vreg176<def> = COPY %vreg170:subreg_loreg
+      if (!DisableOptSZExt && MI->isCopy()) {
+        assert (MI->getNumOperands() == 2);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src  = MI->getOperand(1);
+
+        // Make sure we are copying the lower 32 bits.
+        if (Src.getSubReg() != Hexagon::subreg_loreg)
+          continue;
+
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Try to find in the map.
+          if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
+            // Change the 1st operand.
+            MI->RemoveOperand(1);
+            MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+          } else  {
+            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+              PeepholeDoubleRegsMap.find(SrcReg);
+            if (DI != PeepholeDoubleRegsMap.end()) {
+              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+              MI->RemoveOperand(1);
+              MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
+                                                       false /*isDef*/,
+                                                       false /*isImp*/,
+                                                       false /*isKill*/,
+                                                       false /*isDead*/,
+                                                       false /*isUndef*/,
+                                                       false /*isEarlyClobber*/,
+                                                       PeepholeSrc.second));
+            }
+          }
+        }
+      }
+
+      // Look for Predicated instructions.
+      if (!DisablePNotP) {
+        bool Done = false;
+        if (QII->isPredicated(MI)) {
+          MachineOperand &Op0 = MI->getOperand(0);
+          unsigned Reg0 = Op0.getReg();
+          const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
+          if (RC0->getID() == Hexagon::PredRegsRegClassID) {
+            // Handle instructions that have a prediate register in op0
+            // (most cases of predicable instructions).
+            if (TargetRegisterInfo::isVirtualRegister(Reg0)) {
+              // Try to find in the map.
+              if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
+                // Change the 1st operand and, flip the opcode.
+                MI->getOperand(0).setReg(PeepholeSrc);
+                int NewOp = QII->getInvertedPredicatedOpcode(MI->getOpcode());
+                MI->setDesc(QII->get(NewOp));
+                Done = true;
+              }
+            }
+          }
+        }
+
+        if (!Done) {
+          // Handle special instructions.
+          unsigned Op = MI->getOpcode();
+          unsigned NewOp = 0;
+          unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.
+
+          switch (Op) {
+            case Hexagon::TFR_condset_rr:
+            case Hexagon::TFR_condset_ii:
+            case Hexagon::MUX_ii:
+            case Hexagon::MUX_rr:
+              NewOp = Op;
+              break;
+            case Hexagon::TFR_condset_ri:
+              NewOp = Hexagon::TFR_condset_ir;
+              break;
+            case Hexagon::TFR_condset_ir:
+              NewOp = Hexagon::TFR_condset_ri;
+              break;
+            case Hexagon::MUX_ri:
+              NewOp = Hexagon::MUX_ir;
+              break;
+            case Hexagon::MUX_ir:
+              NewOp = Hexagon::MUX_ri;
+              break;
+          }
+          if (NewOp) {
+            unsigned PSrc = MI->getOperand(PR).getReg();
+            if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
+              MI->getOperand(PR).setReg(POrig);
+              MI->setDesc(QII->get(NewOp));
+              // Swap operands S1 and S2.
+              MachineOperand Op1 = MI->getOperand(S1);
+              MachineOperand Op2 = MI->getOperand(S2);
+              ChangeOpInto(MI->getOperand(S1), Op2);
+              ChangeOpInto(MI->getOperand(S2), Op1);
+            }
+          } // if (NewOp)
+        } // if (!Done)
+
+      } // if (!DisablePNotP)
+
+    } // Instruction
+  } // Basic Block
+  return true;
+}
+
+void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
+  assert (&Dst != &Src && "Cannot duplicate into itself");
+  switch (Dst.getType()) {
+    case MachineOperand::MO_Register:
+      if (Src.isReg()) {
+        Dst.setReg(Src.getReg());
+      } else if (Src.isImm()) {
+        Dst.ChangeToImmediate(Src.getImm());
+      } else {
+        llvm_unreachable("Unexpected src operand type");
+      }
+      break;
+
+    case MachineOperand::MO_Immediate:
+      if (Src.isImm()) {
+        Dst.setImm(Src.getImm());
+      } else if (Src.isReg()) {
+        Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
+                             Src.isKill(), Src.isDead(), Src.isUndef(),
+                             Src.isDebug());
+      } else {
+        llvm_unreachable("Unexpected src operand type");
+      }
+      break;
+
+    default:
+      llvm_unreachable("Unexpected dst operand type");
+      break;
+  }
+}
+
+FunctionPass *llvm::createHexagonPeephole() {
+  return new HexagonPeephole();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
new file mode 100644
index 0000000..fb466d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -0,0 +1,298 @@
+//===-- HexagonRegisterInfo.cpp - Hexagon Register Information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+
+HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st)
+  : HexagonGenRegisterInfo(Hexagon::R31),
+    Subtarget(st) {
+}
+
+const MCPhysReg *
+HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg CalleeSavedRegsV2[] = {
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+  static const MCPhysReg CalleeSavedRegsV3[] = {
+    Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
+    Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+
+  switch(Subtarget.getHexagonArchVersion()) {
+  case HexagonSubtarget::V1:
+    break;
+  case HexagonSubtarget::V2:
+    return CalleeSavedRegsV2;
+  case HexagonSubtarget::V3:
+  case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
+    return CalleeSavedRegsV3;
+  }
+  llvm_unreachable("Callee saved registers requested for unknown architecture "
+                   "version");
+}
+
+BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
+  const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(HEXAGON_RESERVED_REG_1);
+  Reserved.set(HEXAGON_RESERVED_REG_2);
+  Reserved.set(Hexagon::R29);
+  Reserved.set(Hexagon::R30);
+  Reserved.set(Hexagon::R31);
+  Reserved.set(Hexagon::D14);
+  Reserved.set(Hexagon::D15);
+  Reserved.set(Hexagon::LC0);
+  Reserved.set(Hexagon::LC1);
+  Reserved.set(Hexagon::SA0);
+  Reserved.set(Hexagon::SA1);
+  return Reserved;
+}
+
+
+const TargetRegisterClass* const*
+HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClassesV2[] = {
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    };
+  static const TargetRegisterClass * const CalleeSavedRegClassesV3[] = {
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+  };
+
+  switch(Subtarget.getHexagonArchVersion()) {
+  case HexagonSubtarget::V1:
+    break;
+  case HexagonSubtarget::V2:
+    return CalleeSavedRegClassesV2;
+  case HexagonSubtarget::V3:
+  case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
+    return CalleeSavedRegClassesV3;
+  }
+  llvm_unreachable("Callee saved register classes requested for unknown "
+                   "architecture version");
+}
+
+void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  //
+  // Hexagon_TODO: Do we need to enforce this for Hexagon?
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Addressable stack objects are accessed using neg. offsets from %fp.
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const HexagonInstrInfo &TII =
+    *static_cast<const HexagonInstrInfo*>(MF.getTarget().getInstrInfo());
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  unsigned FrameReg = getFrameRegister(MF);
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasFP(MF)) {
+    // We will not reserve space on the stack for the lr and fp registers.
+    Offset -= 2 * Hexagon_WordSize;
+  }
+
+  const unsigned FrameSize = MFI.getStackSize();
+
+  if (!MFI.hasVarSizedObjects() &&
+      TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset)) &&
+      !TII.isSpillPredRegOp(&MI)) {
+    // Replace frame index with a stack pointer reference.
+    MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(), false,
+                                                 false, true);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(FrameSize+Offset);
+  } else {
+    // Replace frame index with a frame pointer reference.
+    if (!TII.isValidOffset(MI.getOpcode(), Offset)) {
+
+      // If the offset overflows, then correct it.
+      //
+      // For loads, we do not need a reserved register
+      // r0 = memw(r30 + #10000) to:
+      //
+      // r0 = add(r30, #10000)
+      // r0 = memw(r0)
+      if ( (MI.getOpcode() == Hexagon::LDriw)  ||
+           (MI.getOpcode() == Hexagon::LDrid)   ||
+           (MI.getOpcode() == Hexagon::LDrih)   ||
+           (MI.getOpcode() == Hexagon::LDriuh)  ||
+           (MI.getOpcode() == Hexagon::LDrib)   ||
+           (MI.getOpcode() == Hexagon::LDriub)  ||
+           (MI.getOpcode() == Hexagon::LDriw_f) ||
+           (MI.getOpcode() == Hexagon::LDrid_f)) {
+        unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ?
+          getSubReg(MI.getOperand(0).getReg(), Hexagon::subreg_loreg) :
+          MI.getOperand(0).getReg();
+
+        // Check if offset can fit in addi.
+        if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_rr),
+                  dstReg).addReg(FrameReg).addReg(dstReg);
+        } else {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_ri),
+                  dstReg).addReg(FrameReg).addImm(Offset);
+        }
+
+        MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true);
+        MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+      } else if ((MI.getOpcode() == Hexagon::STriw_indexed) ||
+                 (MI.getOpcode() == Hexagon::STriw) ||
+                 (MI.getOpcode() == Hexagon::STrid) ||
+                 (MI.getOpcode() == Hexagon::STrih) ||
+                 (MI.getOpcode() == Hexagon::STrib) ||
+                 (MI.getOpcode() == Hexagon::STrid_f) ||
+                 (MI.getOpcode() == Hexagon::STriw_f)) {
+        // For stores, we need a reserved register. Change
+        // memw(r30 + #10000) = r0 to:
+        //
+        // rs = add(r30, #10000);
+        // memw(rs) = r0
+        unsigned resReg = HEXAGON_RESERVED_REG_1;
+
+        // Check if offset can fit in addi.
+        if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset);
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_rr),
+                  resReg).addReg(FrameReg).addReg(resReg);
+        } else {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_ri),
+                  resReg).addReg(FrameReg).addImm(Offset);
+        }
+        MI.getOperand(FIOperandNum).ChangeToRegister(resReg, false, false,true);
+        MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+      } else if (TII.isMemOp(&MI)) {
+        // use the constant extender if the instruction provides it
+        // and we are V4TOps.
+        if (Subtarget.hasV4TOps()) {
+          if (TII.isConstExtended(&MI)) {
+            MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+            TII.immediateExtend(&MI);
+          } else {
+            llvm_unreachable("Need to implement for memops");
+          }
+        } else {
+          // Only V3 and older instructions here.
+          unsigned ResReg = HEXAGON_RESERVED_REG_1;
+          if (!MFI.hasVarSizedObjects() &&
+              TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset))) {
+            MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(),
+                                                         false, false, false);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(FrameSize+Offset);
+          } else if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                    TII.get(Hexagon::CONST32_Int_Real), ResReg).addImm(Offset);
+            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                    TII.get(Hexagon::ADD_rr), ResReg).addReg(FrameReg).
+              addReg(ResReg);
+            MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
+                                                         true);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+          } else {
+            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                    TII.get(Hexagon::ADD_ri), ResReg).addReg(FrameReg).
+              addImm(Offset);
+            MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
+                                                         true);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+          }
+        }
+      } else {
+        unsigned dstReg = MI.getOperand(0).getReg();
+        BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
+        BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                TII.get(Hexagon::ADD_rr),
+                dstReg).addReg(FrameReg).addReg(dstReg);
+        // Can we delete MI??? r2 = add (r2, #0).
+        MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true);
+        MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+      }
+    } else {
+      // If the offset is small enough to fit in the immediate field, directly
+      // encode it.
+      MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+    }
+  }
+
+}
+
+unsigned HexagonRegisterInfo::getRARegister() const {
+  return Hexagon::R31;
+}
+
+unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
+                                               &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (TFI->hasFP(MF)) {
+    return Hexagon::R30;
+  }
+
+  return Hexagon::R29;
+}
+
+unsigned HexagonRegisterInfo::getFrameRegister() const {
+  return Hexagon::R30;
+}
+
+unsigned HexagonRegisterInfo::getStackRegister() const {
+  return Hexagon::R29;
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
new file mode 100644
index 0000000..648b4af
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -0,0 +1,86 @@
+//==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonREGISTERINFO_H
+#define HexagonREGISTERINFO_H
+
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "HexagonGenRegisterInfo.inc"
+
+//
+//  We try not to hard code the reserved registers in our code,
+//  so the following two macros were defined. However, there
+//  are still a few places that R11 and R10 are hard wired.
+//  See below. If, in the future, we decided to change the reserved
+//  register. Don't forget changing the following places.
+//
+//  1. the "Defs" set of STriw_pred in HexagonInstrInfo.td
+//  2. the "Defs" set of LDri_pred in HexagonInstrInfo.td
+//  3. the definition of "IntRegs" in HexagonRegisterInfo.td
+//  4. the definition of "DoubleRegs" in HexagonRegisterInfo.td
+//
+#define HEXAGON_RESERVED_REG_1 Hexagon::R10
+#define HEXAGON_RESERVED_REG_2 Hexagon::R11
+
+namespace llvm {
+
+class HexagonSubtarget;
+class HexagonInstrInfo;
+class Type;
+
+struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
+  HexagonSubtarget &Subtarget;
+
+  HexagonRegisterInfo(HexagonSubtarget &st);
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction *MF = nullptr) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  /// determineFrameLayout - Determine the size of the frame and maximum call
+  /// frame size.
+  void determineFrameLayout(MachineFunction &MF) const;
+
+  /// requiresRegisterScavenging - returns true since we may need scavenging for
+  /// a temporary register when generating hardware loop instructions.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getFrameRegister() const;
+  unsigned getStackRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
new file mode 100644
index 0000000..8ea1b7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -0,0 +1,167 @@
+//===-- HexagonRegisterInfo.td - Hexagon Register defs -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Hexagon register file.
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Hexagon" in {
+
+  class HexagonReg<string n> : Register<n> {
+    field bits<5> Num;
+  }
+
+  class HexagonDoubleReg<string n, list<Register> subregs> :
+        RegisterWithSubRegs<n, subregs> {
+    field bits<5> Num;
+  }
+
+  // Registers are identified with 5-bit ID numbers.
+  // Ri - 32-bit integer registers.
+  class Ri<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+  // Rf - 32-bit floating-point registers.
+  class Rf<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+
+  // Rd - 64-bit registers.
+  class Rd<bits<5> num, string n, list<Register> subregs> :
+        HexagonDoubleReg<n, subregs> {
+    let Num = num;
+    let SubRegs = subregs;
+  }
+
+  // Rp - predicate registers
+  class Rp<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+  // Rc - control registers
+  class Rc<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+  // Rj - aliased integer registers
+  class Rj<string n, Ri R>: HexagonReg<n> {
+    let Num = R.Num;
+    let Aliases = [R];
+  }
+
+  def subreg_loreg  : SubRegIndex<32>;
+  def subreg_hireg  : SubRegIndex<32, 32>;
+
+  // Integer registers.
+  def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+  def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+  def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+  def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+  def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+  def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+  def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+  def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+  def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+  def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+  def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+  def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+  def R12 : Ri<12, "r12">, DwarfRegNum<[12]>;
+  def R13 : Ri<13, "r13">, DwarfRegNum<[13]>;
+  def R14 : Ri<14, "r14">, DwarfRegNum<[14]>;
+  def R15 : Ri<15, "r15">, DwarfRegNum<[15]>;
+  def R16 : Ri<16, "r16">, DwarfRegNum<[16]>;
+  def R17 : Ri<17, "r17">, DwarfRegNum<[17]>;
+  def R18 : Ri<18, "r18">, DwarfRegNum<[18]>;
+  def R19 : Ri<19, "r19">, DwarfRegNum<[19]>;
+  def R20 : Ri<20, "r20">, DwarfRegNum<[20]>;
+  def R21 : Ri<21, "r21">, DwarfRegNum<[21]>;
+  def R22 : Ri<22, "r22">, DwarfRegNum<[22]>;
+  def R23 : Ri<23, "r23">, DwarfRegNum<[23]>;
+  def R24 : Ri<24, "r24">, DwarfRegNum<[24]>;
+  def R25 : Ri<25, "r25">, DwarfRegNum<[25]>;
+  def R26 : Ri<26, "r26">, DwarfRegNum<[26]>;
+  def R27 : Ri<27, "r27">, DwarfRegNum<[27]>;
+  def R28 : Ri<28, "r28">, DwarfRegNum<[28]>;
+  def R29 : Ri<29, "r29">, DwarfRegNum<[29]>;
+  def R30 : Ri<30, "r30">, DwarfRegNum<[30]>;
+  def R31 : Ri<31, "r31">, DwarfRegNum<[31]>;
+
+  def SP : Rj<"sp", R29>, DwarfRegNum<[29]>;
+  def FP : Rj<"fp", R30>, DwarfRegNum<[30]>;
+  def LR : Rj<"lr", R31>, DwarfRegNum<[31]>;
+
+  // Aliases of the R* registers used to hold 64-bit int values (doubles).
+  let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in {
+  def D0  : Rd< 0,  "r1:0",  [R0,  R1]>, DwarfRegNum<[32]>;
+  def D1  : Rd< 2,  "r3:2",  [R2,  R3]>, DwarfRegNum<[34]>;
+  def D2  : Rd< 4,  "r5:4",  [R4,  R5]>, DwarfRegNum<[36]>;
+  def D3  : Rd< 6,  "r7:6",  [R6,  R7]>, DwarfRegNum<[38]>;
+  def D4  : Rd< 8,  "r9:8",  [R8,  R9]>, DwarfRegNum<[40]>;
+  def D5  : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>;
+  def D6  : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>;
+  def D7  : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>;
+  def D8  : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>;
+  def D9  : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>;
+  def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>;
+  def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>;
+  def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>;
+  def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>;
+  def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>;
+  def D15 : Rd<30, "r31:30", [R30, R31]>, DwarfRegNum<[62]>;
+  }
+
+  // Predicate registers.
+  def P0 : Rp<0, "p0">, DwarfRegNum<[63]>;
+  def P1 : Rp<1, "p1">, DwarfRegNum<[64]>;
+  def P2 : Rp<2, "p2">, DwarfRegNum<[65]>;
+  def P3 : Rp<3, "p3">, DwarfRegNum<[66]>;
+
+  // Control registers.
+  def SA0 : Rc<0, "sa0">, DwarfRegNum<[67]>;
+  def LC0 : Rc<1, "lc0">, DwarfRegNum<[68]>;
+
+  def SA1 : Rc<2, "sa1">, DwarfRegNum<[69]>;
+  def LC1 : Rc<3, "lc1">, DwarfRegNum<[70]>;
+
+  def M0 : Rc<6, "m0">, DwarfRegNum<[71]>;
+  def M1 : Rc<7, "m1">, DwarfRegNum<[72]>;
+
+  def PC : Rc<9,  "pc">, DwarfRegNum<[32]>; // is the Dwarf number correct?
+  def GP : Rc<11, "gp">, DwarfRegNum<[33]>; // is the Dwarf number correct?
+}
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"Hexagon", [i32,f32], 32,
+                            (add (sequence "R%u", 0, 9),
+                                 (sequence "R%u", 12, 28),
+                                 R10, R11, R29, R30, R31)> {
+}
+
+def DoubleRegs : RegisterClass<"Hexagon", [i64,f64], 64,
+                               (add (sequence "D%u", 0, 4),
+                                    (sequence "D%u", 6, 13), D5, D14, D15)>;
+
+
+def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))>
+{
+  let Size = 32;
+}
+
+def CRRegs : RegisterClass<"Hexagon", [i32], 32,
+                           (add (sequence "LC%u", 0, 1),
+                                (sequence "SA%u", 0, 1),
+                                (sequence "M%u", 0, 1), PC, GP)> {
+  let Size = 32;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
new file mode 100644
index 0000000..2b459a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -0,0 +1,89 @@
+//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends //
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass that removes sign extends for function parameters. These parameters
+// are already sign extended by the caller per Hexagon's ABI
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeHexagonRemoveExtendArgsPass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonRemoveExtendArgs : public FunctionPass {
+  public:
+    static char ID;
+    HexagonRemoveExtendArgs() : FunctionPass(ID) {
+      initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnFunction(Function &F) override;
+
+    const char *getPassName() const override {
+      return "Remove sign extends";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineFunctionAnalysis>();
+      AU.addPreserved<MachineFunctionAnalysis>();
+      AU.addPreserved("stack-protector");
+      FunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char HexagonRemoveExtendArgs::ID = 0;
+
+INITIALIZE_PASS(HexagonRemoveExtendArgs, "reargs",
+                "Remove Sign and Zero Extends for Args", false, false)
+
+bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
+  unsigned Idx = 1;
+  for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
+       ++AI, ++Idx) {
+    if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
+      Argument* Arg = AI;
+      if (!isa<PointerType>(Arg->getType())) {
+        for (auto UI = Arg->user_begin(); UI != Arg->user_end();) {
+          if (isa<SExtInst>(*UI)) {
+            Instruction* I = cast<Instruction>(*UI);
+            SExtInst* SI = new SExtInst(Arg, I->getType());
+            assert (EVT::getEVT(SI->getType()) ==
+                    (EVT::getEVT(I->getType())));
+            ++UI;
+            I->replaceAllUsesWith(SI);
+            Instruction* First = F.getEntryBlock().begin();
+            SI->insertBefore(First);
+            I->eraseFromParent();
+          } else {
+            ++UI;
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+
+
+FunctionPass*
+llvm::createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM) {
+  return new HexagonRemoveExtendArgs();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
new file mode 100644
index 0000000..528cafc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -0,0 +1,18 @@
+//===- HexagonSchedule.td - Hexagon Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Machine Info -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
new file mode 100644
index 0000000..a7d2d47
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -0,0 +1,203 @@
+//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
+// This file describes that machine information.
+
+//
+//    |===========|==================================================|
+//    | PIPELINE  |              Instruction Classes                 |
+//    |===========|==================================================|
+//    | SLOT0     |  LD       ST    ALU32     MEMOP     NV    SYSTEM |
+//    |-----------|--------------------------------------------------|
+//    | SLOT1     |  LD       ST    ALU32                            |
+//    |-----------|--------------------------------------------------|
+//    | SLOT2     |  XTYPE          ALU32     J         JR           |
+//    |-----------|--------------------------------------------------|
+//    | SLOT3     |  XTYPE          ALU32     J         CR           |
+//    |===========|==================================================|
+
+// Functional Units.
+def SLOT0       : FuncUnit;
+def SLOT1       : FuncUnit;
+def SLOT2       : FuncUnit;
+def SLOT3       : FuncUnit;
+// Endloop is a pseudo instruction that is encoded with 2 bits in a packet
+// rather than taking an execution slot. This special unit is needed
+// to schedule an ENDLOOP with 4 other instructions.
+def SLOT_ENDLOOP: FuncUnit;
+
+// Itinerary classes.
+def PSEUDO      : InstrItinClass;
+def PSEUDOM   : InstrItinClass;
+// ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4.
+def DUPLEX      : InstrItinClass;
+def PREFIX      : InstrItinClass;
+def COMPOUND    : InstrItinClass;
+
+def ALU32_2op_tc_1_SLOT0123  : InstrItinClass;
+def ALU32_2op_tc_2early_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_2early_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_1_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_2_SLOT0123  : InstrItinClass;
+def ALU32_ADDI_tc_1_SLOT0123 : InstrItinClass;
+def ALU64_tc_1_SLOT23        : InstrItinClass;
+def ALU64_tc_1or2_SLOT23     : InstrItinClass;
+def ALU64_tc_2_SLOT23        : InstrItinClass;
+def ALU64_tc_2early_SLOT23   : InstrItinClass;
+def ALU64_tc_3x_SLOT23       : InstrItinClass;
+def CR_tc_2_SLOT3            : InstrItinClass;
+def CR_tc_2early_SLOT23      : InstrItinClass;
+def CR_tc_2early_SLOT3       : InstrItinClass;
+def CR_tc_3x_SLOT23          : InstrItinClass;
+def CR_tc_3x_SLOT3           : InstrItinClass;
+def J_tc_2early_SLOT23       : InstrItinClass;
+def J_tc_2early_SLOT2        : InstrItinClass;
+def LD_tc_ld_SLOT01          : InstrItinClass;
+def LD_tc_ld_SLOT0           : InstrItinClass;
+def LD_tc_3or4stall_SLOT0    : InstrItinClass;
+def M_tc_1_SLOT23            : InstrItinClass;
+def M_tc_1or2_SLOT23         : InstrItinClass;
+def M_tc_2_SLOT23            : InstrItinClass;
+def M_tc_3_SLOT23            : InstrItinClass;
+def M_tc_3x_SLOT23           : InstrItinClass;
+def M_tc_3or4x_SLOT23        : InstrItinClass;
+def ST_tc_st_SLOT01          : InstrItinClass;
+def ST_tc_st_SLOT0           : InstrItinClass;
+def ST_tc_ld_SLOT0           : InstrItinClass;
+def ST_tc_3stall_SLOT0       : InstrItinClass;
+def S_2op_tc_1_SLOT23        : InstrItinClass;
+def S_2op_tc_2_SLOT23        : InstrItinClass;
+def S_2op_tc_2early_SLOT23   : InstrItinClass;
+def S_2op_tc_3or4x_SLOT23    : InstrItinClass;
+def S_3op_tc_1_SLOT23        : InstrItinClass;
+def S_3op_tc_1or2_SLOT23     : InstrItinClass;
+def S_3op_tc_2_SLOT23        : InstrItinClass;
+def S_3op_tc_2early_SLOT23   : InstrItinClass;
+def S_3op_tc_3_SLOT23        : InstrItinClass;
+def S_3op_tc_3x_SLOT23       : InstrItinClass;
+def NCJ_tc_3or4stall_SLOT0   : InstrItinClass;
+def V2LDST_tc_ld_SLOT01      : InstrItinClass;
+def V2LDST_tc_st_SLOT0       : InstrItinClass;
+def V2LDST_tc_st_SLOT01      : InstrItinClass;
+def V4LDST_tc_ld_SLOT01      : InstrItinClass;
+def V4LDST_tc_st_SLOT0       : InstrItinClass;
+def V4LDST_tc_st_SLOT01      : InstrItinClass;
+def J_tc_2early_SLOT0123     : InstrItinClass;
+def EXTENDER_tc_1_SLOT0123   : InstrItinClass;
+
+
+def HexagonItinerariesV4 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123  ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123   ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123   ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123  ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_1or2_SLOT23   , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_3x_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // CR -> System
+        InstrItinData<CR_tc_2_SLOT3          , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<CR_tc_2early_SLOT3     , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT3         , [InstrStage<1, [SLOT3]>]>,
+
+        // Jump (conditional/unconditional/return etc)
+        // CR
+        InstrItinData<CR_tc_2early_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT23        , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // J
+        InstrItinData<J_tc_2early_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // JR
+        InstrItinData<J_tc_2early_SLOT2      , [InstrStage<1, [SLOT2]>]>,
+
+        //Load
+        InstrItinData<LD_tc_ld_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<LD_tc_3or4stall_SLOT0  , [InstrStage<1, [SLOT0]>]>,
+
+        // M
+        InstrItinData<M_tc_1_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_1or2_SLOT23       , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_2_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3x_SLOT23         , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3or4x_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // Store
+        // ST
+        InstrItinData<ST_tc_st_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        // ST0
+        InstrItinData<ST_tc_st_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<ST_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+
+        // S
+        InstrItinData<S_2op_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_3or4x_SLOT23  , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_1or2_SLOT23   , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3x_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // SYS
+        InstrItinData<ST_tc_3stall_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+
+        // New Value Compare Jump
+        InstrItinData<NCJ_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+        // Mem ops - MEM_V4
+        InstrItinData<V2LDST_tc_st_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V2LDST_tc_ld_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V2LDST_tc_st_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+        InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+
+        // ENDLOOP
+        InstrItinData<J_tc_2early_SLOT0123   , [InstrStage<1, [SLOT_ENDLOOP]>]>,
+
+        // Extender/PREFIX
+        InstrItinData<EXTENDER_tc_1_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>]>
+      ]>;
+
+def HexagonModelV4 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV4;
+  let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V4 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectCCInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonSelectCCInfo.td
new file mode 100644
index 0000000..d8feb89
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectCCInfo.td
@@ -0,0 +1,121 @@
+//===-- HexagoSelectCCInfo.td - Selectcc mappings ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+//
+// selectcc mappings.
+//
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETEQ)),
+      (i32 (MUX_rr (i1 (CMPEQrr IntRegs:$lhs, IntRegs:$rhs)),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETNE)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPEQrr IntRegs:$lhs, IntRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETGT)),
+      (i32 (MUX_rr (i1 (CMPGTrr IntRegs:$lhs, IntRegs:$rhs)),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETUGT)),
+      (i32 (MUX_rr (i1 (CMPGTUrr IntRegs:$lhs, IntRegs:$rhs)),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETULT)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTUrr IntRegs:$lhs,
+                                         (ADD_ri IntRegs:$rhs, -1)))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETLT)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTrr IntRegs:$lhs,
+                                        (ADD_ri IntRegs:$rhs, -1)))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETLE)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTrr IntRegs:$lhs, IntRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETULE)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTUrr IntRegs:$lhs, IntRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+//
+// selectcc mappings for greater-equal-to Rs => greater-than Rs-1.
+//
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETGE)),
+      (i32 (MUX_rr (i1 (CMPGTrr IntRegs:$lhs, (ADD_ri IntRegs:$rhs, -1))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETUGE)),
+      (i32 (MUX_rr (i1 (CMPGTUrr IntRegs:$lhs, (ADD_ri IntRegs:$rhs, -1))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+
+//
+// selectcc mappings for predicate comparisons.
+//
+// Convert Rd = selectcc(p0, p1, true_val, false_val, SETEQ) into:
+//  pt = not(p1 xor p2)
+//  Rd = mux(pt, true_val, false_val)
+// and similarly for SETNE
+//
+def : Pat <(i32 (selectcc PredRegs:$lhs, PredRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETNE)),
+      (i32 (MUX_rr (i1 (XOR_pp PredRegs:$lhs, PredRegs:$rhs)), IntRegs:$tval,
+                   IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc PredRegs:$lhs, PredRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETEQ)),
+      (i32 (MUX_rr (i1 (NOT_p (XOR_pp PredRegs:$lhs, PredRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+//
+// selectcc mappings for 64-bit operands are messy. Hexagon does not have a
+// MUX64 o, use this:
+// selectcc(Rss, Rdd, tval, fval, cond) ->
+//   combine(mux(cmp_cond(Rss, Rdd), tval.hi, fval.hi),
+//           mux(cmp_cond(Rss, Rdd), tval.lo, fval.lo))
+
+// setgt-64.
+def : Pat<(i64 (selectcc DoubleRegs:$lhs, DoubleRegs:$rhs, DoubleRegs:$tval,
+                         DoubleRegs:$fval, SETGT)),
+      (COMBINE_rr (MUX_rr (CMPGT64rr DoubleRegs:$lhs, DoubleRegs:$rhs),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_hireg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_hireg)),
+                   (MUX_rr (CMPGT64rr DoubleRegs:$lhs, DoubleRegs:$rhs),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_loreg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_loreg)))>;
+
+
+// setlt-64 -> setgt-64.
+def : Pat<(i64 (selectcc DoubleRegs:$lhs, DoubleRegs:$rhs, DoubleRegs:$tval,
+                         DoubleRegs:$fval, SETLT)),
+      (COMBINE_rr (MUX_rr (CMPGT64rr DoubleRegs:$lhs,
+                                     (ADD64_rr DoubleRegs:$rhs, (TFRI64 -1))),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_hireg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_hireg)),
+                   (MUX_rr (CMPGT64rr DoubleRegs:$lhs,
+                                      (ADD64_rr DoubleRegs:$rhs, (TFRI64 -1))),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_loreg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_loreg)))>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
new file mode 100644
index 0000000..b5db997
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -0,0 +1,45 @@
+//===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the HexagonSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-selectiondag-info"
+
+bool llvm::flag_aligned_memcpy;
+
+HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
+}
+
+SDValue
+HexagonSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+                        SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+                        bool isVolatile, bool AlwaysInline,
+                        MachinePointerInfo DstPtrInfo,
+                        MachinePointerInfo SrcPtrInfo) const {
+  flag_aligned_memcpy = false;
+  if ((Align & 0x3) == 0) {
+    ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+    if (ConstantSize) {
+      uint64_t SizeVal = ConstantSize->getZExtValue();
+      if ((SizeVal > 32) && ((SizeVal % 8) == 0))
+        flag_aligned_memcpy = true;
+    }
+  }
+
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
new file mode 100644
index 0000000..b40b303
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -0,0 +1,37 @@
+//===-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Hexagon subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonSELECTIONDAGINFO_H
+#define HexagonSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit HexagonSelectionDAGInfo(const DataLayout &DL);
+  ~HexagonSelectionDAGInfo();
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
new file mode 100644
index 0000000..247207f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -0,0 +1,180 @@
+//=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When the compiler is invoked with no small data, for instance, with the -G0
+// command line option, then all CONST32_* opcodes should be broken down into
+// appropriate LO and HI instructions. This splitting is done by this pass.
+// The only reason this is not done in the DAG lowering itself is that there
+// is no simple way of getting the register allocator to allot the same hard
+// register to the result of LO and HI instructions. This pass is always
+// scheduled after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xfer"
+
+namespace {
+
+class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
+  const HexagonTargetMachine &QTM;
+
+ public:
+    static char ID;
+    HexagonSplitConst32AndConst64(const HexagonTargetMachine &TM)
+        : MachineFunctionPass(ID), QTM(TM) {}
+
+    const char *getPassName() const override {
+      return "Hexagon Split Const32s and Const64s";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+
+char HexagonSplitConst32AndConst64::ID = 0;
+
+
+bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
+
+  const HexagonTargetObjectFile &TLOF =
+      (const HexagonTargetObjectFile &)
+      QTM.getTargetLowering()->getObjFileLowering();
+  if (TLOF.IsSmallDataEnabled())
+    return true;
+
+  const TargetInstrInfo *TII = QTM.getInstrInfo();
+
+  // Loop over all of the basic blocks
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    // Traverse the basic block
+    MachineBasicBlock::iterator MII = MBB->begin();
+    MachineBasicBlock::iterator MIE = MBB->end ();
+    while (MII != MIE) {
+      MachineInstr *MI = MII;
+      int Opc = MI->getOpcode();
+      if (Opc == Hexagon::CONST32_set) {
+        int DestReg = MI->getOperand(0).getReg();
+        MachineOperand &Symbol = MI->getOperand (1);
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LO), DestReg).addOperand(Symbol);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HI), DestReg).addOperand(Symbol);
+        // MBB->erase returns the iterator to the next instruction, which is the
+        // one we want to process next
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST32_set_jt) {
+        int DestReg = MI->getOperand(0).getReg();
+        MachineOperand &Symbol = MI->getOperand (1);
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LO_jt), DestReg).addOperand(Symbol);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HI_jt), DestReg).addOperand(Symbol);
+        // MBB->erase returns the iterator to the next instruction, which is the
+        // one we want to process next
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST32_Label) {
+        int DestReg = MI->getOperand(0).getReg();
+        MachineOperand &Symbol = MI->getOperand (1);
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LO_label), DestReg).addOperand(Symbol);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HI_label), DestReg).addOperand(Symbol);
+        // MBB->erase returns the iterator to the next instruction, which is the
+        // one we want to process next
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST32_Int_Real) {
+        int DestReg = MI->getOperand(0).getReg();
+        int64_t ImmValue = MI->getOperand(1).getImm ();
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LOi), DestReg).addImm(ImmValue);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HIi), DestReg).addImm(ImmValue);
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST64_Int_Real) {
+        int DestReg = MI->getOperand(0).getReg();
+        int64_t ImmValue = MI->getOperand(1).getImm ();
+        unsigned DestLo =
+          QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_loreg);
+        unsigned DestHi =
+          QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_hireg);
+
+        int32_t LowWord = (ImmValue & 0xFFFFFFFF);
+        int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
+
+        // Lower Registers Lower Half
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LOi), DestLo).addImm(LowWord);
+        // Lower Registers Higher Half
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HIi), DestLo).addImm(LowWord);
+        // Higher Registers Lower Half
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LOi), DestHi).addImm(HighWord);
+        // Higher Registers Higher Half.
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HIi), DestHi).addImm(HighWord);
+        MII = MBB->erase (MI);
+        continue;
+       }
+      ++MII;
+    }
+  }
+
+  return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *
+llvm::createHexagonSplitConst32AndConst64(const HexagonTargetMachine &TM) {
+  return new HexagonSplitConst32AndConst64(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
new file mode 100644
index 0000000..9601090
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -0,0 +1,237 @@
+//===-- HexagonSplitTFRCondSets.cpp - split TFR condsets into xfers -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+//===----------------------------------------------------------------------===//
+// This pass tries to provide opportunities for better optimization of muxes.
+// The default code generated for something like: flag = (a == b) ? 1 : 3;
+// would be:
+//
+//   {p0 = cmp.eq(r0,r1)}
+//   {r3 = mux(p0,#1,#3)}
+//
+// This requires two packets.  If we use .new predicated immediate transfers,
+// then we can do this in a single packet, e.g.:
+//
+//   {p0 = cmp.eq(r0,r1)
+//    if (p0.new) r3 = #1
+//    if (!p0.new) r3 = #3}
+//
+// Note that the conditional assignments are not generated in .new form here.
+// We assume opptimisically that they will be formed later.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xfer"
+
+namespace llvm {
+  void initializeHexagonSplitTFRCondSetsPass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonSplitTFRCondSets : public MachineFunctionPass {
+    const HexagonTargetMachine &QTM;
+    const HexagonSubtarget &QST;
+
+ public:
+    static char ID;
+    HexagonSplitTFRCondSets(const HexagonTargetMachine& TM) :
+      MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+      initializeHexagonSplitTFRCondSetsPass(*PassRegistry::getPassRegistry());
+    }
+
+    const char *getPassName() const override {
+      return "Hexagon Split TFRCondSets";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+
+char HexagonSplitTFRCondSets::ID = 0;
+
+
+bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
+
+  const TargetInstrInfo *TII = QTM.getInstrInfo();
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    // Traverse the basic block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+         ++MII) {
+      MachineInstr *MI = MII;
+      int Opc1, Opc2;
+      switch(MI->getOpcode()) {
+        case Hexagon::TFR_condset_rr:
+        case Hexagon::TFR_condset_rr_f:
+        case Hexagon::TFR_condset_rr64_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_rr ||
+              MI->getOpcode() == Hexagon::TFR_condset_rr_f) {
+            Opc1 = Hexagon::TFR_cPt;
+            Opc2 = Hexagon::TFR_cNotPt;
+          }
+          else if (MI->getOpcode() == Hexagon::TFR_condset_rr64_f) {
+            Opc1 = Hexagon::TFR64_cPt;
+            Opc2 = Hexagon::TFR64_cNotPt;
+          }
+
+          // Minor optimization: do not emit the predicated copy if the source
+          // and the destination is the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc1),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc2),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ri:
+        case Hexagon::TFR_condset_ri_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+
+          //  Do not emit the predicated copy if the source and the destination
+          // is the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFR_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (MI->getOpcode() ==  Hexagon::TFR_condset_ri ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(3).getImm());
+          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ri_f ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(3).getFPImm());
+          }
+
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ir:
+        case Hexagon::TFR_condset_ir_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() ==  Hexagon::TFR_condset_ir ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(2).getImm());
+          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ir_f ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(2).getFPImm());
+          }
+
+          // Do not emit the predicated copy if the source and
+          // the destination is the same register.
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFR_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ii:
+        case Hexagon::TFR_condset_ii_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(1).getReg();
+
+          if (MI->getOpcode() ==  Hexagon::TFR_condset_ii ) {
+            int Immed1 = MI->getOperand(2).getImm();
+            int Immed2 = MI->getOperand(3).getImm();
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed2);
+          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ii_f ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
+                    addReg(SrcReg1).
+                    addFPImm(MI->getOperand(2).getFPImm());
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+                    addReg(SrcReg1).
+                    addFPImm(MI->getOperand(3).getFPImm());
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "Hexagon Split TFRCondSets";
+  PassInfo *PI = new PassInfo(Name, "hexagon-split-tfr",
+                              &HexagonSplitTFRCondSets::ID, nullptr, false,
+                              false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonSplitTFRCondSetsPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+FunctionPass*
+llvm::createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM) {
+  return new HexagonSplitTFRCondSets(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
new file mode 100644
index 0000000..657893f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -0,0 +1,99 @@
+//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonSubtarget.h"
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+static cl::opt<bool>
+EnableV3("enable-hexagon-v3", cl::Hidden,
+         cl::desc("Enable Hexagon V3 instructions."));
+
+static cl::opt<bool>
+EnableMemOps(
+    "enable-hexagon-memops",
+    cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
+    cl::desc(
+      "Generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool>
+DisableMemOps(
+    "disable-hexagon-memops",
+    cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
+    cl::desc(
+      "Do not generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool>
+EnableIEEERndNear(
+    "enable-hexagon-ieee-rnd-near",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Generate non-chopped conversion from fp to int."));
+
+HexagonSubtarget &
+HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  // If the programmer has not specified a Hexagon version, default to -mv4.
+  if (CPUString.empty())
+    CPUString = "hexagonv4";
+
+  if (CPUString == "hexagonv2") {
+    HexagonArchVersion = V2;
+  } else if (CPUString == "hexagonv3") {
+    EnableV3 = true;
+    HexagonArchVersion = V3;
+  } else if (CPUString == "hexagonv4") {
+    HexagonArchVersion = V4;
+  } else if (CPUString == "hexagonv5") {
+    HexagonArchVersion = V5;
+  } else {
+    llvm_unreachable("Unrecognized Hexagon processor version");
+  }
+
+  ParseSubtargetFeatures(CPUString, FS);
+  return *this;
+}
+
+HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
+                                   const TargetMachine &TM)
+    : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU.str()),
+      DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+      TSInfo(DL), FrameLowering() {
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  // UseMemOps on by default unless disabled explicitly
+  if (DisableMemOps)
+    UseMemOps = false;
+  else if (EnableMemOps)
+    UseMemOps = true;
+  else
+    UseMemOps = false;
+
+  if (EnableIEEERndNear)
+    ModeIEEERndNear = true;
+  else
+    ModeIEEERndNear = false;
+}
+
+// Pin the vtable to this file.
+void HexagonSubtarget::anchor() {}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
new file mode 100644
index 0000000..b184e62
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -0,0 +1,105 @@
+//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef Hexagon_SUBTARGET_H
+#define Hexagon_SUBTARGET_H
+
+#include "HexagonFrameLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonISelLowering.h"
+#include "HexagonSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "HexagonGenSubtargetInfo.inc"
+
+#define Hexagon_SMALL_DATA_THRESHOLD 8
+#define Hexagon_SLOTS 4
+
+namespace llvm {
+
+class HexagonSubtarget : public HexagonGenSubtargetInfo {
+  virtual void anchor();
+
+  bool UseMemOps;
+  bool ModeIEEERndNear;
+
+public:
+  enum HexagonArchEnum {
+    V1, V2, V3, V4, V5
+  };
+
+  HexagonArchEnum HexagonArchVersion;
+private:
+  std::string CPUString;
+  const DataLayout DL;       // Calculates type size & alignment.
+  HexagonInstrInfo InstrInfo;
+  HexagonTargetLowering TLInfo;
+  HexagonSelectionDAGInfo TSInfo;
+  HexagonFrameLowering FrameLowering;
+  InstrItineraryData InstrItins;
+
+public:
+  HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
+                   const TargetMachine &TM);
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  const HexagonInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const HexagonRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const HexagonTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const HexagonFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  const HexagonSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+
+  HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
+                                                    StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool hasV2TOps () const { return HexagonArchVersion >= V2; }
+  bool hasV2TOpsOnly () const { return HexagonArchVersion == V2; }
+  bool hasV3TOps () const { return HexagonArchVersion >= V3; }
+  bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; }
+  bool hasV4TOps () const { return HexagonArchVersion >= V4; }
+  bool hasV4TOpsOnly () const { return HexagonArchVersion == V4; }
+  bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; }
+  bool hasV5TOps () const { return HexagonArchVersion >= V5; }
+  bool hasV5TOpsOnly () const { return HexagonArchVersion == V5; }
+  bool modeIEEERndNear () const { return ModeIEEERndNear; }
+
+  bool isSubtargetV2() const { return HexagonArchVersion == V2;}
+  const std::string &getCPUString () const { return CPUString; }
+
+  // Threshold for small data section
+  unsigned getSmallDataThreshold() const {
+    return Hexagon_SMALL_DATA_THRESHOLD;
+  }
+  const HexagonArchEnum &getHexagonArchVersion() const {
+    return  HexagonArchVersion;
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
new file mode 100644
index 0000000..7831410
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -0,0 +1,178 @@
+//===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Hexagon target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
+      cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
+
+static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+      cl::Hidden, cl::ZeroOrMore, cl::init(false),
+      cl::desc("Disable Hexagon MI Scheduling"));
+
+static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
+      cl::Hidden, cl::ZeroOrMore, cl::init(false),
+      cl::desc("Disable Hexagon CFG Optimization"));
+
+
+/// HexagonTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library.  In particular, it seems that it is not possible to get
+/// things to work on Win32 without this.  Though it is unused, do not
+/// remove it.
+extern "C" int HexagonTargetMachineModule;
+int HexagonTargetMachineModule = 0;
+
+extern "C" void LLVMInitializeHexagonTarget() {
+  // Register the target.
+  RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget);
+}
+
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+  return new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+                    createVLIWMachineSched);
+
+/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
+///
+
+/// Hexagon_TODO: Do I need an aggregate alignment?
+///
+HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this) {
+    initAsmInfo();
+}
+
+namespace {
+/// Hexagon Code Generator Pass Configuration Options.
+class HexagonPassConfig : public TargetPassConfig {
+public:
+  HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+    // FIXME: Rather than calling enablePass(&MachineSchedulerID) below, define
+    // HexagonSubtarget::enableMachineScheduler() { return true; }.
+    // That will bypass the SelectionDAG VLIW scheduler, which is probably just
+    // hurting compile time and will be removed eventually anyway.
+    if (DisableHexagonMISched)
+      disablePass(&MachineSchedulerID);
+    else
+      enablePass(&MachineSchedulerID);
+  }
+
+  HexagonTargetMachine &getHexagonTargetMachine() const {
+    return getTM<HexagonTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    return createVLIWMachineSched(C);
+  }
+
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new HexagonPassConfig(this, PM);
+}
+
+bool HexagonPassConfig::addInstSelector() {
+  HexagonTargetMachine &TM = getHexagonTargetMachine();
+  bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+
+  if (!NoOpt)
+    addPass(createHexagonRemoveExtendArgs(TM));
+
+  addPass(createHexagonISelDag(TM, getOptLevel()));
+
+  if (!NoOpt) {
+    addPass(createHexagonPeephole());
+    printAndVerify("After hexagon peephole pass");
+  }
+
+  return false;
+}
+
+bool HexagonPassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None)
+    if (!DisableHardwareLoops)
+      addPass(createHexagonHardwareLoops());
+  return false;
+}
+
+bool HexagonPassConfig::addPostRegAlloc() {
+  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+  if (getOptLevel() != CodeGenOpt::None)
+    if (!DisableHexagonCFGOpt)
+      addPass(createHexagonCFGOptimizer(TM));
+  return false;
+}
+
+bool HexagonPassConfig::addPreSched2() {
+  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+
+  addPass(createHexagonCopyToCombine());
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&IfConverterID);
+  addPass(createHexagonSplitConst32AndConst64(TM));
+  printAndVerify("After hexagon split const32/64 pass");
+  return true;
+}
+
+bool HexagonPassConfig::addPreEmitPass() {
+  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+  bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+
+  if (!NoOpt)
+    addPass(createHexagonNewValueJump());
+
+  // Expand Spill code for predicate registers.
+  addPass(createHexagonExpandPredSpillCode(TM));
+
+  // Split up TFRcondsets into conditional transfers.
+  addPass(createHexagonSplitTFRCondSets(TM));
+
+  // Create Packets.
+  if (!NoOpt) {
+    if (!DisableHardwareLoops)
+      addPass(createHexagonFixupHwLoops());
+    addPass(createHexagonPacketizer());
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
new file mode 100644
index 0000000..d88178e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -0,0 +1,67 @@
+//=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonTARGETMACHINE_H
+#define HexagonTARGETMACHINE_H
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class Module;
+
+class HexagonTargetMachine : public LLVMTargetMachine {
+  HexagonSubtarget Subtarget;
+
+public:
+  HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+
+  const HexagonInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const HexagonSubtarget *getSubtargetImpl() const override {
+    return &Subtarget;
+  }
+  const HexagonRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const InstrItineraryData* getInstrItineraryData() const override {
+    return &getSubtargetImpl()->getInstrItineraryData();
+  }
+  const HexagonTargetLowering* getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const HexagonFrameLowering* getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  static unsigned getModuleMatchQuality(const Module &M);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+};
+
+extern bool flag_aligned_memcpy;
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
new file mode 100644
index 0000000..c97526e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -0,0 +1,101 @@
+//===-- HexagonTargetObjectFile.cpp - Hexagon asm properties --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetObjectFile.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+static cl::opt<int> SmallDataThreshold("hexagon-small-data-threshold",
+                                cl::init(8), cl::Hidden,
+                cl::desc("The maximum size of an object in the sdata section"));
+
+void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
+                                         const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+
+  SmallDataSection =
+    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                               ELF::SHF_WRITE | ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+  SmallBSSSection =
+    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                               ELF::SHF_WRITE | ELF::SHF_ALLOC,
+                               SectionKind::getBSS());
+}
+
+// sdata/sbss support taken largely from the MIPS Backend.
+static bool IsInSmallSection(uint64_t Size) {
+  return Size > 0 && Size <= (uint64_t)SmallDataThreshold;
+}
+
+bool HexagonTargetObjectFile::IsSmallDataEnabled () const {
+  return SmallDataThreshold > 0;
+}
+
+/// IsGlobalInSmallSection - Return true if this global value should be
+/// placed into small data/bss section.
+bool HexagonTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
+                                                const TargetMachine &TM) const {
+  // If the primary definition of this global value is outside the current
+  // translation unit or the global value is available for inspection but not
+  // emission, then do nothing.
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return false;
+
+  // Otherwise, Check if GV should be in sdata/sbss, when normally it would end
+  // up in getKindForGlobal(GV, TM).
+  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global value should be
+/// placed into small data/bss section.
+bool HexagonTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                       SectionKind Kind) const {
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+  if (!GVA)
+    return false;
+
+  if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) {
+    Type *Ty = GV->getType()->getElementType();
+    return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+  }
+
+  return false;
+}
+
+const MCSection *
+HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+                                                SectionKind Kind, Mangler &Mang,
+                                                const TargetMachine &TM) const {
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
new file mode 100644
index 0000000..1bd1272
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonTARGETOBJECTFILE_H
+#define HexagonTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+  class HexagonTargetObjectFile : public TargetLoweringObjectFileELF {
+    const MCSectionELF *SmallDataSection;
+    const MCSectionELF *SmallBSSSection;
+  public:
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    /// IsGlobalInSmallSection - Return true if this global address should be
+    /// placed into small data/bss section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM,
+                                SectionKind Kind) const;
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM) const;
+
+    bool IsSmallDataEnabled () const;
+    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                        SectionKind Kind, Mangler &Mang,
+                                        const TargetMachine &TM) const override;
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 0000000..87ce960
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,1428 @@
+//===----- HexagonPacketizer.cpp - vliw packetizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using DFA. The packetizer works on
+// machine basic blocks. For each instruction I in BB, the packetizer consults
+// the DFA to see if machine resources are available to execute I. If so, the
+// packetizer checks if I depends on any instruction J in the current packet.
+// If no dependency is found, I is added to current packet and machine resource
+// is marked as taken. If any dependency is found, a target API call is made to
+// prune the dependence.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
+      cl::ZeroOrMore, cl::Hidden, cl::init(true),
+      cl::desc("Allow non-solo packetization of volatile memory references"));
+
+namespace llvm {
+  void initializeHexagonPacketizerPass(PassRegistry&);
+}
+
+
+namespace {
+  class HexagonPacketizer : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    HexagonPacketizer() : MachineFunctionPass(ID) {
+      initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const override {
+      return "Hexagon Packetizer";
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+  };
+  char HexagonPacketizer::ID = 0;
+
+  class HexagonPacketizerList : public VLIWPacketizerList {
+
+  private:
+
+    // Has the instruction been promoted to a dot-new instruction.
+    bool PromotedToDotNew;
+
+    // Has the instruction been glued to allocframe.
+    bool GlueAllocframeStore;
+
+    // Has the feeder instruction been glued to new value jump.
+    bool GlueToNewValueJump;
+
+    // Check if there is a dependence between some instruction already in this
+    // packet and this instruction.
+    bool Dependence;
+
+    // Only check for dependence if there are resources available to
+    // schedule this instruction.
+    bool FoundSequentialDependence;
+
+    /// \brief A handle to the branch probability pass.
+   const MachineBranchProbabilityInfo *MBPI;
+
+   // Track MIs with ignored dependece.
+   std::vector<MachineInstr*> IgnoreDepMIs;
+
+  public:
+    // Ctor.
+    HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                          MachineDominatorTree &MDT,
+                          const MachineBranchProbabilityInfo *MBPI);
+
+    // initPacketizerState - initialize some internal flags.
+    void initPacketizerState() override;
+
+    // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+    bool ignorePseudoInstruction(MachineInstr *MI,
+                                 MachineBasicBlock *MBB) override;
+
+    // isSoloInstruction - return true if instruction MI can not be packetized
+    // with any other instruction, which means that MI itself is a packet.
+    bool isSoloInstruction(MachineInstr *MI) override;
+
+    // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+    // together.
+    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
+
+    // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+    // and SUJ.
+    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
+
+    MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override;
+  private:
+    bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
+    bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
+                         MachineBasicBlock::iterator &MII,
+                         const TargetRegisterClass* RC);
+    bool CanPromoteToDotNew(MachineInstr* MI, SUnit* PacketSU,
+                            unsigned DepReg,
+                            std::map <MachineInstr*, SUnit*> MIToSUnit,
+                            MachineBasicBlock::iterator &MII,
+                            const TargetRegisterClass* RC);
+    bool CanPromoteToNewValue(MachineInstr* MI, SUnit* PacketSU,
+                              unsigned DepReg,
+                              std::map <MachineInstr*, SUnit*> MIToSUnit,
+                              MachineBasicBlock::iterator &MII);
+    bool CanPromoteToNewValueStore(MachineInstr* MI, MachineInstr* PacketMI,
+                                   unsigned DepReg,
+                                   std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool DemoteToDotOld(MachineInstr* MI);
+    bool ArePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2,
+                    std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool RestrictingDepExistInPacket(MachineInstr*,
+                    unsigned, std::map <MachineInstr*, SUnit*>);
+    bool isNewifiable(MachineInstr* MI);
+    bool isCondInst(MachineInstr* MI);
+    bool tryAllocateResourcesForConstExt(MachineInstr* MI);
+    bool canReserveResourcesForConstExt(MachineInstr *MI);
+    void reserveResourcesForConstExt(MachineInstr* MI);
+    bool isNewValueInst(MachineInstr* MI);
+  };
+}
+
+INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer",
+                    false, false)
+
+
+// HexagonPacketizerList Ctor.
+HexagonPacketizerList::HexagonPacketizerList(
+  MachineFunction &MF, MachineLoopInfo &MLI,MachineDominatorTree &MDT,
+  const MachineBranchProbabilityInfo *MBPI)
+  : VLIWPacketizerList(MF, MLI, MDT, true){
+  this->MBPI = MBPI;
+}
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  const MachineBranchProbabilityInfo *MBPI =
+    &getAnalysis<MachineBranchProbabilityInfo>();
+  // Instantiate the packetizer.
+  HexagonPacketizerList Packetizer(Fn, MLI, MDT, MBPI);
+
+  // DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions
+  // These instructions confuse the dependence analysis. Consider:
+  // D0 = ...   (Insn 0)
+  // R0 = KILL R0, D0 (Insn 1)
+  // R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization
+  //
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock::iterator End = MBB->end();
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != End) {
+      if (MI->isKill()) {
+        MachineBasicBlock::iterator DeleteMI = MI;
+        ++MI;
+        MBB->erase(DeleteMI);
+        End = MBB->end();
+        continue;
+      }
+      ++MI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    // Find scheduling regions and schedule / packetize each region.
+    unsigned RemainingCount = MBB->size();
+    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+        RegionEnd != MBB->begin();) {
+      // The next region starts above the previous region. Look backward in the
+      // instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for(;I != MBB->begin(); --I, --RemainingCount) {
+        if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
+          break;
+      }
+      I = MBB->begin();
+
+      // Skip empty scheduling regions.
+      if (I == RegionEnd) {
+        RegionEnd = std::prev(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (I == std::prev(RegionEnd)) {
+        RegionEnd = std::prev(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+      RegionEnd = I;
+    }
+  }
+
+  return true;
+}
+
+
+static bool IsIndirectCall(MachineInstr* MI) {
+  return ((MI->getOpcode() == Hexagon::CALLR) ||
+          (MI->getOpcode() == Hexagon::CALLRv3));
+}
+
+// Reserve resources for constant extender. Trigure an assertion if
+// reservation fail.
+void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+                                                  MI->getDebugLoc());
+
+  if (ResourceTracker->canReserveResources(PseudoMI)) {
+    ResourceTracker->reserveResources(PseudoMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+  } else {
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    llvm_unreachable("can not reserve resources for constant extender.");
+  }
+  return;
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  assert((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
+         "Should only be called for constant extended instructions");
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+                                                  MI->getDebugLoc());
+  bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
+  MF->DeleteMachineInstr(PseudoMI);
+  return CanReserve;
+}
+
+// Allocate resources (i.e. 4 bytes) for constant extender. If succeed, return
+// true, otherwise, return false.
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+                                                  MI->getDebugLoc());
+
+  if (ResourceTracker->canReserveResources(PseudoMI)) {
+    ResourceTracker->reserveResources(PseudoMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    return true;
+  } else {
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    return false;
+  }
+}
+
+
+bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI,
+                                          SDep::Kind DepType,
+                                          unsigned DepReg) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  const HexagonRegisterInfo* QRI =
+              (const HexagonRegisterInfo *) TM.getRegisterInfo();
+
+  // Check for lr dependence
+  if (DepReg == QRI->getRARegister()) {
+    return true;
+  }
+
+  if (QII->isDeallocRet(MI)) {
+    if (DepReg == QRI->getFrameRegister() ||
+        DepReg == QRI->getStackRegister())
+      return true;
+  }
+
+  // Check if this is a predicate dependence
+  const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg);
+  if (RC == &Hexagon::PredRegsRegClass) {
+    return true;
+  }
+
+  //
+  // Lastly check for an operand used in an indirect call
+  // If we had an attribute for checking if an instruction is an indirect call,
+  // then we could have avoided this relatively brittle implementation of
+  // IsIndirectCall()
+  //
+  // Assumes that the first operand of the CALLr is the function address
+  //
+  if (IsIndirectCall(MI) && (DepType == SDep::Data)) {
+    MachineOperand MO = MI->getOperand(0);
+    if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool IsRegDependence(const SDep::Kind DepType) {
+  return (DepType == SDep::Data || DepType == SDep::Anti ||
+          DepType == SDep::Output);
+}
+
+static bool IsDirectJump(MachineInstr* MI) {
+  return (MI->getOpcode() == Hexagon::JMP);
+}
+
+static bool IsSchedBarrier(MachineInstr* MI) {
+  switch (MI->getOpcode()) {
+  case Hexagon::BARRIER:
+    return true;
+  }
+  return false;
+}
+
+static bool IsControlFlow(MachineInstr* MI) {
+  return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
+}
+
+static bool IsLoopN(MachineInstr *MI) {
+  return (MI->getOpcode() == Hexagon::LOOP0_i ||
+          MI->getOpcode() == Hexagon::LOOP0_r);
+}
+
+/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a
+/// callee-saved register.
+static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
+                                     const TargetRegisterInfo *TRI) {
+  for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
+    unsigned CalleeSavedReg = *CSR;
+    if (MI->modifiesRegister(CalleeSavedReg, TRI))
+      return true;
+  }
+  return false;
+}
+
+// Returns true if an instruction can be promoted to .new predicate
+// or new-value store.
+bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  if ( isCondInst(MI) || QII->mayBeNewStore(MI))
+    return true;
+  else
+    return false;
+}
+
+bool HexagonPacketizerList::isCondInst (MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  const MCInstrDesc& TID = MI->getDesc();
+                                    // bug 5670: until that is fixed,
+                                    // this portion is disabled.
+  if (   TID.isConditionalBranch()  // && !IsRegisterJump(MI)) ||
+      || QII->isConditionalTransfer(MI)
+      || QII->isConditionalALU32(MI)
+      || QII->isConditionalLoad(MI)
+      || QII->isConditionalStore(MI)) {
+    return true;
+  }
+  return false;
+}
+
+
+// Promote an instructiont to its .new form.
+// At this time, we have already made a call to CanPromoteToDotNew
+// and made sure that it can *indeed* be promoted.
+bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI,
+                        SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+                        const TargetRegisterClass* RC) {
+
+  assert (DepType == SDep::Data);
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  int NewOpcode;
+  if (RC == &Hexagon::PredRegsRegClass)
+    NewOpcode = QII->GetDotNewPredOp(MI, MBPI);
+  else
+    NewOpcode = QII->GetDotNewOp(MI);
+  MI->setDesc(QII->get(NewOpcode));
+
+  return true;
+}
+
+bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  int NewOpcode = QII->GetDotOldOp(MI->getOpcode());
+  MI->setDesc(QII->get(NewOpcode));
+  return true;
+}
+
+enum PredicateKind {
+  PK_False,
+  PK_True,
+  PK_Unknown
+};
+
+/// Returns true if an instruction is predicated on p0 and false if it's
+/// predicated on !p0.
+static PredicateKind getPredicateSense(MachineInstr* MI,
+                                       const HexagonInstrInfo *QII) {
+  if (!QII->isPredicated(MI))
+    return PK_Unknown;
+
+  if (QII->isPredicatedTrue(MI))
+    return PK_True;
+
+  return PK_False;
+}
+
+static MachineOperand& GetPostIncrementOperand(MachineInstr *MI,
+                                               const HexagonInstrInfo *QII) {
+  assert(QII->isPostIncrement(MI) && "Not a post increment operation.");
+#ifndef NDEBUG
+  // Post Increment means duplicates. Use dense map to find duplicates in the
+  // list. Caution: Densemap initializes with the minimum of 64 buckets,
+  // whereas there are at most 5 operands in the post increment.
+  DenseMap<unsigned,  unsigned> DefRegsSet;
+  for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
+    if (MI->getOperand(opNum).isReg() &&
+        MI->getOperand(opNum).isDef()) {
+      DefRegsSet[MI->getOperand(opNum).getReg()] = 1;
+    }
+
+  for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
+    if (MI->getOperand(opNum).isReg() &&
+        MI->getOperand(opNum).isUse()) {
+      if (DefRegsSet[MI->getOperand(opNum).getReg()]) {
+        return MI->getOperand(opNum);
+      }
+    }
+#else
+  if (MI->getDesc().mayLoad()) {
+    // The 2nd operand is always the post increment operand in load.
+    assert(MI->getOperand(1).isReg() &&
+                "Post increment operand has be to a register.");
+    return (MI->getOperand(1));
+  }
+  if (MI->getDesc().mayStore()) {
+    // The 1st operand is always the post increment operand in store.
+    assert(MI->getOperand(0).isReg() &&
+                "Post increment operand has be to a register.");
+    return (MI->getOperand(0));
+  }
+#endif
+  // we should never come here.
+  llvm_unreachable("mayLoad or mayStore not set for Post Increment operation");
+}
+
+// get the value being stored
+static MachineOperand& GetStoreValueOperand(MachineInstr *MI) {
+  // value being stored is always the last operand.
+  return (MI->getOperand(MI->getNumOperands()-1));
+}
+
+// can be new value store?
+// Following restrictions are to be respected in convert a store into
+// a new value store.
+// 1. If an instruction uses auto-increment, its address register cannot
+//    be a new-value register. Arch Spec 5.4.2.1
+// 2. If an instruction uses absolute-set addressing mode,
+//    its address register cannot be a new-value register.
+//    Arch Spec 5.4.2.1.TODO: This is not enabled as
+//    as absolute-set address mode patters are not implemented.
+// 3. If an instruction produces a 64-bit result, its registers cannot be used
+//    as new-value registers. Arch Spec 5.4.2.2.
+// 4. If the instruction that sets a new-value register is conditional, then
+//    the instruction that uses the new-value register must also be conditional,
+//    and both must always have their predicates evaluate identically.
+//    Arch Spec 5.4.2.3.
+// 5. There is an implied restriction of a packet can not have another store,
+//    if there is a  new value store in the packet. Corollary, if there is
+//    already a store in a packet, there can not be a new value store.
+//    Arch Spec: 3.4.4.2
+bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
+                MachineInstr *PacketMI, unsigned DepReg,
+                std::map <MachineInstr*, SUnit*> MIToSUnit) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Make sure we are looking at the store, that can be promoted.
+  if (!QII->mayBeNewStore(MI))
+    return false;
+
+  // Make sure there is dependency and can be new value'ed
+  if (GetStoreValueOperand(MI).isReg() &&
+      GetStoreValueOperand(MI).getReg() != DepReg)
+    return false;
+
+  const HexagonRegisterInfo* QRI =
+                            (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const MCInstrDesc& MCID = PacketMI->getDesc();
+  // first operand is always the result
+
+  const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF);
+
+  // if there is already an store in the packet, no can do new value store
+  // Arch Spec 3.4.4.2.
+  for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
+         VE = CurrentPacketMIs.end();
+       (VI != VE); ++VI) {
+    SUnit* PacketSU = MIToSUnit[*VI];
+    if (PacketSU->getInstr()->getDesc().mayStore() ||
+        // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME,
+        // then we don't need this
+        PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+        PacketSU->getInstr()->getOpcode() == Hexagon::DEALLOCFRAME)
+      return false;
+  }
+
+  if (PacketRC == &Hexagon::DoubleRegsRegClass) {
+    // new value store constraint: double regs can not feed into new value store
+    // arch spec section: 5.4.2.2
+    return false;
+  }
+
+  // Make sure it's NOT the post increment register that we are going to
+  // new value.
+  if (QII->isPostIncrement(MI) &&
+      MI->getDesc().mayStore() &&
+      GetPostIncrementOperand(MI, QII).getReg() == DepReg) {
+    return false;
+  }
+
+  if (QII->isPostIncrement(PacketMI) &&
+      PacketMI->getDesc().mayLoad() &&
+      GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) {
+    // if source is post_inc, or absolute-set addressing,
+    // it can not feed into new value store
+    //  r3 = memw(r2++#4)
+    //  memw(r30 + #-1404) = r2.new -> can not be new value store
+    // arch spec section: 5.4.2.1
+    return false;
+  }
+
+  // If the source that feeds the store is predicated, new value store must
+  // also be predicated.
+  if (QII->isPredicated(PacketMI)) {
+    if (!QII->isPredicated(MI))
+      return false;
+
+    // Check to make sure that they both will have their predicates
+    // evaluate identically
+    unsigned predRegNumSrc = 0;
+    unsigned predRegNumDst = 0;
+    const TargetRegisterClass* predRegClass = nullptr;
+
+    // Get predicate register used in the source instruction
+    for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
+      if ( PacketMI->getOperand(opNum).isReg())
+      predRegNumSrc = PacketMI->getOperand(opNum).getReg();
+      predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc);
+      if (predRegClass == &Hexagon::PredRegsRegClass) {
+        break;
+      }
+    }
+    assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
+        ("predicate register not found in a predicated PacketMI instruction"));
+
+    // Get predicate register used in new-value store instruction
+    for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+      if ( MI->getOperand(opNum).isReg())
+      predRegNumDst = MI->getOperand(opNum).getReg();
+      predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst);
+      if (predRegClass == &Hexagon::PredRegsRegClass) {
+        break;
+      }
+    }
+    assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
+            ("predicate register not found in a predicated MI instruction"));
+
+    // New-value register producer and user (store) need to satisfy these
+    // constraints:
+    // 1) Both instructions should be predicated on the same register.
+    // 2) If producer of the new-value register is .new predicated then store
+    // should also be .new predicated and if producer is not .new predicated
+    // then store should not be .new predicated.
+    // 3) Both new-value register producer and user should have same predicate
+    // sense, i.e, either both should be negated or both should be none negated.
+
+    if (( predRegNumDst != predRegNumSrc) ||
+          QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI)  ||
+          getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) {
+      return false;
+    }
+  }
+
+  // Make sure that other than the new-value register no other store instruction
+  // register has been modified in the same packet. Predicate registers can be
+  // modified by they should not be modified between the producer and the store
+  // instruction as it will make them both conditional on different values.
+  // We already know this to be true for all the instructions before and
+  // including PacketMI. Howerver, we need to perform the check for the
+  // remaining instructions in the packet.
+
+  std::vector<MachineInstr*>::iterator VI;
+  std::vector<MachineInstr*>::iterator VE;
+  unsigned StartCheck = 0;
+
+  for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end();
+      (VI != VE); ++VI) {
+    SUnit* TempSU = MIToSUnit[*VI];
+    MachineInstr* TempMI = TempSU->getInstr();
+
+    // Following condition is true for all the instructions until PacketMI is
+    // reached (StartCheck is set to 0 before the for loop).
+    // StartCheck flag is 1 for all the instructions after PacketMI.
+    if (TempMI != PacketMI && !StartCheck) // start processing only after
+      continue;                            // encountering PacketMI
+
+    StartCheck = 1;
+    if (TempMI == PacketMI) // We don't want to check PacketMI for dependence
+      continue;
+
+    for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+      if (MI->getOperand(opNum).isReg() &&
+          TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(),
+                                               QRI))
+        return false;
+    }
+  }
+
+  // Make sure that for non-POST_INC stores:
+  // 1. The only use of reg is DepReg and no other registers.
+  //    This handles V4 base+index registers.
+  //    The following store can not be dot new.
+  //    Eg.   r0 = add(r0, #3)a
+  //          memw(r1+r0<<#2) = r0
+  if (!QII->isPostIncrement(MI) &&
+      GetStoreValueOperand(MI).isReg() &&
+      GetStoreValueOperand(MI).getReg() == DepReg) {
+    for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) {
+      if (MI->getOperand(opNum).isReg() &&
+          MI->getOperand(opNum).getReg() == DepReg) {
+        return false;
+      }
+    }
+    // 2. If data definition is because of implicit definition of the register,
+    //    do not newify the store. Eg.
+    //    %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
+    //    STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
+    for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
+      if (PacketMI->getOperand(opNum).isReg() &&
+          PacketMI->getOperand(opNum).getReg() == DepReg &&
+          PacketMI->getOperand(opNum).isDef() &&
+          PacketMI->getOperand(opNum).isImplicit()) {
+        return false;
+      }
+    }
+  }
+
+  // Can be dot new store.
+  return true;
+}
+
+// can this MI to promoted to either
+// new value store or new value jump
+bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI,
+                SUnit *PacketSU, unsigned DepReg,
+                std::map <MachineInstr*, SUnit*> MIToSUnit,
+                MachineBasicBlock::iterator &MII)
+{
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  const HexagonRegisterInfo* QRI =
+                            (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  if (!QRI->Subtarget.hasV4TOps() ||
+      !QII->mayBeNewStore(MI))
+    return false;
+
+  MachineInstr *PacketMI = PacketSU->getInstr();
+
+  // Check to see the store can be new value'ed.
+  if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit))
+    return true;
+
+  // Check to see the compare/jump can be new value'ed.
+  // This is done as a pass on its own. Don't need to check it here.
+  return false;
+}
+
+// Check to see if an instruction can be dot new
+// There are three kinds.
+// 1. dot new on predicate - V2/V3/V4
+// 2. dot new on stores NV/ST - V4
+// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
+                              SUnit *PacketSU, unsigned DepReg,
+                              std::map <MachineInstr*, SUnit*> MIToSUnit,
+                              MachineBasicBlock::iterator &MII,
+                              const TargetRegisterClass* RC )
+{
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Already a dot new instruction.
+  if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI))
+    return false;
+
+  if (!isNewifiable(MI))
+    return false;
+
+  // predicate .new
+  if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI))
+      return true;
+  else if (RC != &Hexagon::PredRegsRegClass &&
+      !QII->mayBeNewStore(MI)) // MI is not a new-value store
+    return false;
+  else {
+    // Create a dot new machine instruction to see if resources can be
+    // allocated. If not, bail out now.
+    int NewOpcode = QII->GetDotNewOp(MI);
+    const MCInstrDesc &desc = QII->get(NewOpcode);
+    DebugLoc dl;
+    MachineInstr *NewMI =
+                    MI->getParent()->getParent()->CreateMachineInstr(desc, dl);
+    bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(NewMI);
+
+    if (!ResourcesAvailable)
+      return false;
+
+    // new value store only
+    // new new value jump generated as a passes
+    if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Go through the packet instructions and search for anti dependency
+// between them and DepReg from MI
+// Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+//   b) %P0<def> = OR_pp %P3<kill>, %P0<kill>
+//   c) %P3<def> = TFR_PdRs %R23
+//   d) %R1<def> = TFRI_cdnPt %P3, 4
+//  }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form
+// Anti Dep between c) and b) is irrelevant for this case
+bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
+      unsigned DepReg,
+      std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  SUnit* PacketSUDep = MIToSUnit[MI];
+
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // We only care for dependencies to predicated instructions
+    if(!QII->isPredicated(*VIN)) continue;
+
+    // Scheduling Unit for current insn in the packet
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // Look at dependencies between current members of the packet
+    // and predicate defining instruction MI.
+    // Make sure that dependency is on the exact register
+    // we care about.
+    if (PacketSU->isSucc(PacketSUDep)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) &&
+            (PacketSU->Succs[i].getKind() == SDep::Anti) &&
+            (PacketSU->Succs[i].getReg() == DepReg)) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+
+/// Gets the predicate register of a predicated instruction.
+static unsigned getPredicatedRegister(MachineInstr *MI,
+                                      const HexagonInstrInfo *QII) {
+  /// We use the following rule: The first predicate register that is a use is
+  /// the predicate register of a predicated instruction.
+
+  assert(QII->isPredicated(MI) && "Must be predicated instruction");
+
+  for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+       OE = MI->operands_end(); OI != OE; ++OI) {
+    MachineOperand &Op = *OI;
+    if (Op.isReg() && Op.getReg() && Op.isUse() &&
+        Hexagon::PredRegsRegClass.contains(Op.getReg()))
+      return Op.getReg();
+  }
+
+  llvm_unreachable("Unknown instruction operand layout");
+
+  return 0;
+}
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements
+bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
+     MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  // If we don't know the predicate sense of the instructions bail out early, we
+  // need it later.
+  if (getPredicateSense(MI1, QII) == PK_Unknown ||
+      getPredicateSense(MI2, QII) == PK_Unknown)
+    return false;
+
+  // Scheduling unit for candidate
+  SUnit* SU = MIToSUnit[MI1];
+
+  // One corner case deals with the following scenario:
+  // Trying to add
+  // a) %R24<def> = TFR_cPt %P0, %R25
+  // to this packet:
+  //
+  // {
+  //   b) %R25<def> = TFR_cNotPt %P0, %R24
+  //   c) %P0<def> = CMPEQri %R26, 1
+  // }
+  //
+  // On general check a) and b) are complements, but
+  // presence of c) will convert a) to .new form, and
+  // then it is not a complement
+  // We attempt to detect it by analyzing  existing
+  // dependencies in the packet
+
+  // Analyze relationships between all existing members of the packet.
+  // Look for Anti dependecy on the same predicate reg
+  // as used in the candidate
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // Scheduling Unit for current insn in the packet
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // If this instruction in the packet is succeeded by the candidate...
+    if (PacketSU->isSucc(SU)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        // The corner case exist when there is true data
+        // dependency between candidate and one of current
+        // packet members, this dep is on predicate reg, and
+        // there already exist anti dep on the same pred in
+        // the packet.
+        if (PacketSU->Succs[i].getSUnit() == SU &&
+            PacketSU->Succs[i].getKind() == SDep::Data &&
+            Hexagon::PredRegsRegClass.contains(
+              PacketSU->Succs[i].getReg()) &&
+            // Here I know that *VIN is predicate setting instruction
+            // with true data dep to candidate on the register
+            // we care about - c) in the above example.
+            // Now I need to see if there is an anti dependency
+            // from c) to any other instruction in the
+            // same packet on the pred reg of interest
+            RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(),
+                                        MIToSUnit)) {
+           return false;
+        }
+      }
+    }
+  }
+
+  // If the above case does not apply, check regular
+  // complement condition.
+  // Check that the predicate register is the same and
+  // that the predicate sense is different
+  // We also need to differentiate .old vs. .new:
+  // !p0 is not complimentary to p0.new
+  unsigned PReg1 = getPredicatedRegister(MI1, QII);
+  unsigned PReg2 = getPredicatedRegister(MI2, QII);
+  return ((PReg1 == PReg2) &&
+          Hexagon::PredRegsRegClass.contains(PReg1) &&
+          Hexagon::PredRegsRegClass.contains(PReg2) &&
+          (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) &&
+          (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2)));
+}
+
+// initPacketizerState - Initialize packetizer flags
+void HexagonPacketizerList::initPacketizerState() {
+
+  Dependence = false;
+  PromotedToDotNew = false;
+  GlueToNewValueJump = false;
+  GlueAllocframeStore = false;
+  FoundSequentialDependence = false;
+
+  return;
+}
+
+// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
+                                                    MachineBasicBlock *MBB) {
+  if (MI->isDebugValue())
+    return true;
+
+  // We must print out inline assembly
+  if (MI->isInlineAsm())
+    return false;
+
+  // We check if MI has any functional units mapped to it.
+  // If it doesn't, we ignore the instruction.
+  const MCInstrDesc& TID = MI->getDesc();
+  unsigned SchedClass = TID.getSchedClass();
+  const InstrStage* IS =
+                    ResourceTracker->getInstrItins()->beginStage(SchedClass);
+  unsigned FuncUnits = IS->getUnits();
+  return !FuncUnits;
+}
+
+// isSoloInstruction: - Returns true for instructions that must be
+// scheduled in their own packet.
+bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) {
+
+  if (MI->isInlineAsm())
+    return true;
+
+  if (MI->isEHLabel())
+    return true;
+
+  // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
+  // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
+  // They must not be grouped with other instructions in a packet.
+  if (IsSchedBarrier(MI))
+    return true;
+
+  return false;
+}
+
+// isLegalToPacketizeTogether:
+// SUI is the current instruction that is out side of the current packet.
+// SUJ is the current instruction inside the current packet against which that
+// SUI will be packetized.
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");
+
+  const MCInstrDesc &MCIDI = I->getDesc();
+  const MCInstrDesc &MCIDJ = J->getDesc();
+
+  MachineBasicBlock::iterator II = I;
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+  const HexagonRegisterInfo* QRI =
+                      (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  // Inline asm cannot go in the packet.
+  if (I->getOpcode() == Hexagon::INLINEASM)
+    llvm_unreachable("Should not meet inline asm here!");
+
+  if (isSoloInstruction(I))
+    llvm_unreachable("Should not meet solo instr here!");
+
+  // A save callee-save register function call can only be in a packet
+  // with instructions that don't write to the callee-save registers.
+  if ((QII->isSaveCalleeSavedRegsCall(I) &&
+       DoesModifyCalleeSavedReg(J, QRI)) ||
+      (QII->isSaveCalleeSavedRegsCall(J) &&
+       DoesModifyCalleeSavedReg(I, QRI))) {
+    Dependence = true;
+    return false;
+  }
+
+  // Two control flow instructions cannot go in the same packet.
+  if (IsControlFlow(I) && IsControlFlow(J)) {
+    Dependence = true;
+    return false;
+  }
+
+  // A LoopN instruction cannot appear in the same packet as a jump or call.
+  if (IsLoopN(I) &&
+     (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) {
+    Dependence = true;
+    return false;
+  }
+  if (IsLoopN(J) &&
+     (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) {
+    Dependence = true;
+    return false;
+  }
+
+  // dealloc_return cannot appear in the same packet as a conditional or
+  // unconditional jump.
+  if (QII->isDeallocRet(I) &&
+     (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) {
+    Dependence = true;
+    return false;
+  }
+
+
+  // V4 allows dual store. But does not allow second store, if the
+  // first store is not in SLOT0. New value store, new value jump,
+  // dealloc_return and memop always take SLOT0.
+  // Arch spec 3.4.4.2
+  if (QRI->Subtarget.hasV4TOps()) {
+    if (MCIDI.mayStore() && MCIDJ.mayStore() &&
+       (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) {
+      Dependence = true;
+      return false;
+    }
+
+    if ((QII->isMemOp(J) && MCIDI.mayStore())
+        || (MCIDJ.mayStore() && QII->isMemOp(I))
+        || (QII->isMemOp(J) && QII->isMemOp(I))) {
+      Dependence = true;
+      return false;
+    }
+
+    //if dealloc_return
+    if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
+      Dependence = true;
+      return false;
+    }
+
+    // If an instruction feeds new value jump, glue it.
+    MachineBasicBlock::iterator NextMII = I;
+    ++NextMII;
+    if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
+      MachineInstr *NextMI = NextMII;
+
+      bool secondRegMatch = false;
+      bool maintainNewValueJump = false;
+
+      if (NextMI->getOperand(1).isReg() &&
+          I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+        secondRegMatch = true;
+        maintainNewValueJump = true;
+      }
+
+      if (!secondRegMatch &&
+           I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
+        maintainNewValueJump = true;
+      }
+
+      for (std::vector<MachineInstr*>::iterator
+            VI = CurrentPacketMIs.begin(),
+             VE = CurrentPacketMIs.end();
+           (VI != VE && maintainNewValueJump); ++VI) {
+        SUnit* PacketSU = MIToSUnit[*VI];
+
+        // NVJ can not be part of the dual jump - Arch Spec: section 7.8
+        if (PacketSU->getInstr()->getDesc().isCall()) {
+          Dependence = true;
+          break;
+        }
+        // Validate
+        // 1. Packet does not have a store in it.
+        // 2. If the first operand of the nvj is newified, and the second
+        //    operand is also a reg, it (second reg) is not defined in
+        //    the same packet.
+        // 3. If the second operand of the nvj is newified, (which means
+        //    first operand is also a reg), first reg is not defined in
+        //    the same packet.
+        if (PacketSU->getInstr()->getDesc().mayStore()               ||
+            PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+            // Check #2.
+            (!secondRegMatch && NextMI->getOperand(1).isReg() &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(1).getReg(), QRI)) ||
+            // Check #3.
+            (secondRegMatch &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(0).getReg(), QRI))) {
+          Dependence = true;
+          break;
+        }
+      }
+      if (!Dependence)
+        GlueToNewValueJump = true;
+      else
+        return false;
+    }
+  }
+
+  if (SUJ->isSucc(SUI)) {
+    for (unsigned i = 0;
+         (i < SUJ->Succs.size()) && !FoundSequentialDependence;
+         ++i) {
+
+      if (SUJ->Succs[i].getSUnit() != SUI) {
+        continue;
+      }
+
+      SDep::Kind DepType = SUJ->Succs[i].getKind();
+
+      // For direct calls:
+      // Ignore register dependences for call instructions for
+      // packetization purposes except for those due to r31 and
+      // predicate registers.
+      //
+      // For indirect calls:
+      // Same as direct calls + check for true dependences to the register
+      // used in the indirect call.
+      //
+      // We completely ignore Order dependences for call instructions
+      //
+      // For returns:
+      // Ignore register dependences for return instructions like jumpr,
+      // dealloc return unless we have dependencies on the explicit uses
+      // of the registers used by jumpr (like r31) or dealloc return
+      // (like r29 or r30).
+      //
+      // TODO: Currently, jumpr is handling only return of r31. So, the
+      // following logic (specificaly IsCallDependent) is working fine.
+      // We need to enable jumpr for register other than r31 and then,
+      // we need to rework the last part, where it handles indirect call
+      // of that (IsCallDependent) function. Bug 6216 is opened for this.
+      //
+      unsigned DepReg = 0;
+      const TargetRegisterClass* RC = nullptr;
+      if (DepType == SDep::Data) {
+        DepReg = SUJ->Succs[i].getReg();
+        RC = QRI->getMinimalPhysRegClass(DepReg);
+      }
+      if ((MCIDI.isCall() || MCIDI.isReturn()) &&
+          (!IsRegDependence(DepType) ||
+            !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
+        /* do nothing */
+      }
+
+      // For instructions that can be promoted to dot-new, try to promote.
+      else if ((DepType == SDep::Data) &&
+               CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
+               PromoteToDotNew(I, DepType, II, RC)) {
+        PromotedToDotNew = true;
+        /* do nothing */
+      }
+
+      else if ((DepType == SDep::Data) &&
+               (QII->isNewValueJump(I))) {
+        /* do nothing */
+      }
+
+      // For predicated instructions, if the predicates are complements
+      // then there can be no dependence.
+      else if (QII->isPredicated(I) &&
+               QII->isPredicated(J) &&
+          ArePredicatesComplements(I, J, MIToSUnit)) {
+        /* do nothing */
+
+      }
+      else if (IsDirectJump(I) &&
+               !MCIDJ.isBranch() &&
+               !MCIDJ.isCall() &&
+               (DepType == SDep::Order)) {
+        // Ignore Order dependences between unconditional direct branches
+        // and non-control-flow instructions
+        /* do nothing */
+      }
+      else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
+               (DepType != SDep::Output)) {
+        // Ignore all dependences for jumps except for true and output
+        // dependences
+        /* do nothing */
+      }
+
+      // Ignore output dependences due to superregs. We can
+      // write to two different subregisters of R1:0 for instance
+      // in the same cycle
+      //
+
+      //
+      // Let the
+      // If neither I nor J defines DepReg, then this is a
+      // superfluous output dependence. The dependence must be of the
+      // form:
+      //  R0 = ...
+      //  R1 = ...
+      // and there is an output dependence between the two instructions
+      // with
+      // DepReg = D0
+      // We want to ignore these dependences.
+      // Ideally, the dependence constructor should annotate such
+      // dependences. We can then avoid this relatively expensive check.
+      //
+      else if (DepType == SDep::Output) {
+        // DepReg is the register that's responsible for the dependence.
+        unsigned DepReg = SUJ->Succs[i].getReg();
+
+        // Check if I and J really defines DepReg.
+        if (I->definesRegister(DepReg) ||
+            J->definesRegister(DepReg)) {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // We ignore Order dependences for
+      // 1. Two loads unless they are volatile.
+      // 2. Two stores in V4 unless they are volatile.
+      else if ((DepType == SDep::Order) &&
+               !I->hasOrderedMemoryRef() &&
+               !J->hasOrderedMemoryRef()) {
+        if (QRI->Subtarget.hasV4TOps() &&
+            // hexagonv4 allows dual store.
+            MCIDI.mayStore() && MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        // store followed by store-- not OK on V2
+        // store followed by load -- not OK on all (OK if addresses
+        // are not aliased)
+        // load followed by store -- OK on all
+        // load followed by load  -- OK on all
+        else if ( !MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        else {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // For V4, special case ALLOCFRAME. Even though there is dependency
+      // between ALLOCAFRAME and subsequent store, allow it to be
+      // packetized in a same packet. This implies that the store is using
+      // caller's SP. Hense, offset needs to be updated accordingly.
+      else if (DepType == SDep::Data
+               && QRI->Subtarget.hasV4TOps()
+               && J->getOpcode() == Hexagon::ALLOCFRAME
+               && (I->getOpcode() == Hexagon::STrid
+                   || I->getOpcode() == Hexagon::STriw
+                   || I->getOpcode() == Hexagon::STrib)
+               && I->getOperand(0).getReg() == QRI->getStackRegister()
+               && QII->isValidOffset(I->getOpcode(),
+                                     I->getOperand(1).getImm() -
+                                     (FrameSize + HEXAGON_LRFP_SIZE)))
+      {
+        GlueAllocframeStore = true;
+        // Since this store is to be glued with allocframe in the same
+        // packet, it will use SP of the previous stack frame, i.e
+        // caller's SP. Therefore, we need to recalculate offset according
+        // to this change.
+        I->getOperand(1).setImm(I->getOperand(1).getImm() -
+                                        (FrameSize + HEXAGON_LRFP_SIZE));
+      }
+
+      //
+      // Skip over anti-dependences. Two instructions that are
+      // anti-dependent can share a packet
+      //
+      else if (DepType != SDep::Anti) {
+        FoundSequentialDependence = true;
+        break;
+      }
+    }
+
+    if (FoundSequentialDependence) {
+      Dependence = true;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// isLegalToPruneDependencies
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+  if (Dependence) {
+
+    // Check if the instruction was promoted to a dot-new. If so, demote it
+    // back into a dot-old.
+    if (PromotedToDotNew) {
+      DemoteToDotOld(I);
+    }
+
+    // Check if the instruction (must be a store) was glued with an Allocframe
+    // instruction. If so, restore its offset to its original value, i.e. use
+    // curent SP instead of caller's SP.
+    if (GlueAllocframeStore) {
+      I->getOperand(1).setImm(I->getOperand(1).getImm() +
+                                             FrameSize + HEXAGON_LRFP_SIZE);
+    }
+
+    return false;
+  }
+  return true;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+
+    MachineBasicBlock::iterator MII = MI;
+    MachineBasicBlock *MBB = MI->getParent();
+
+    const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+    if (GlueToNewValueJump) {
+
+      ++MII;
+      MachineInstr *nvjMI = MII;
+      assert(ResourceTracker->canReserveResources(MI));
+      ResourceTracker->reserveResources(MI);
+      if ((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
+          !tryAllocateResourcesForConstExt(MI)) {
+        endPacket(MBB, MI);
+        ResourceTracker->reserveResources(MI);
+        assert(canReserveResourcesForConstExt(MI) &&
+               "Ensure that there is a slot");
+        reserveResourcesForConstExt(MI);
+        // Reserve resources for new value jump constant extender.
+        assert(canReserveResourcesForConstExt(MI) &&
+               "Ensure that there is a slot");
+        reserveResourcesForConstExt(nvjMI);
+        assert(ResourceTracker->canReserveResources(nvjMI) &&
+               "Ensure that there is a slot");
+
+      } else if (   // Extended instruction takes two slots in the packet.
+        // Try reserve and allocate 4-byte in the current packet first.
+        (QII->isExtended(nvjMI)
+            && (!tryAllocateResourcesForConstExt(nvjMI)
+                || !ResourceTracker->canReserveResources(nvjMI)))
+        || // For non-extended instruction, no need to allocate extra 4 bytes.
+        (!QII->isExtended(nvjMI) &&
+              !ResourceTracker->canReserveResources(nvjMI)))
+      {
+        endPacket(MBB, MI);
+        // A new and empty packet starts.
+        // We are sure that the resources requirements can be satisfied.
+        // Therefore, do not need to call "canReserveResources" anymore.
+        ResourceTracker->reserveResources(MI);
+        if (QII->isExtended(nvjMI))
+          reserveResourcesForConstExt(nvjMI);
+      }
+      // Here, we are sure that "reserveResources" would succeed.
+      ResourceTracker->reserveResources(nvjMI);
+      CurrentPacketMIs.push_back(MI);
+      CurrentPacketMIs.push_back(nvjMI);
+    } else {
+      if (   (QII->isExtended(MI) || QII->isConstExtended(MI))
+          && (   !tryAllocateResourcesForConstExt(MI)
+              || !ResourceTracker->canReserveResources(MI)))
+      {
+        endPacket(MBB, MI);
+        // Check if the instruction was promoted to a dot-new. If so, demote it
+        // back into a dot-old
+        if (PromotedToDotNew) {
+          DemoteToDotOld(MI);
+        }
+        reserveResourcesForConstExt(MI);
+      }
+      // In case that "MI" is not an extended insn,
+      // the resource availability has already been checked.
+      ResourceTracker->reserveResources(MI);
+      CurrentPacketMIs.push_back(MI);
+    }
+    return MII;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+  return new HexagonPacketizer();
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/contrib/llvm/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
new file mode 100644
index 0000000..668ca98
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
@@ -0,0 +1,141 @@
+//===-- HexagonVarargsCallingConvention.h - Calling Conventions -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the functions that assign locations to outgoing function
+// arguments. Adapted from the target independent version but this handles
+// calls to varargs functions
+//
+//===----------------------------------------------------------------------===//
+//
+
+
+
+
+static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
+                                    EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                    ISD::ArgFlagsTy ArgFlags,
+                                    Hexagon_CCState &State,
+                                    int NonVarArgsParams,
+                                    int CurrentParam,
+                                    bool ForceMem);
+
+
+static bool CC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
+                                 EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                 ISD::ArgFlagsTy ArgFlags,
+                                 Hexagon_CCState &State,
+                                 int NonVarArgsParams,
+                                 int CurrentParam,
+                                 bool ForceMem) {
+  unsigned ByValSize = 0;
+  if (ArgFlags.isByVal() &&
+      ((ByValSize = ArgFlags.getByValSize()) >
+       (MVT(MVT::i64).getSizeInBits() / 8))) {
+    ForceMem = true;
+  }
+
+
+  // Only assign registers for named (non-varargs) arguments
+  if ( !ForceMem && ((NonVarArgsParams == -1) || (CurrentParam <=
+                                                  NonVarArgsParams))) {
+
+    if (LocVT == MVT::i32 ||
+        LocVT == MVT::i16 ||
+        LocVT == MVT::i8 ||
+        LocVT == MVT::f32) {
+      static const unsigned RegList1[] = {
+        Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+        Hexagon::R5
+      };
+      if (unsigned Reg = State.AllocateReg(RegList1, 6)) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                         LocVT.getSimpleVT(), LocInfo));
+        return false;
+      }
+    }
+
+    if (LocVT == MVT::i64 ||
+        LocVT == MVT::f64) {
+      static const unsigned RegList2[] = {
+        Hexagon::D0, Hexagon::D1, Hexagon::D2
+      };
+      if (unsigned Reg = State.AllocateReg(RegList2, 3)) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                         LocVT.getSimpleVT(), LocInfo));
+        return false;
+      }
+    }
+  }
+
+  const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
+  unsigned Alignment =
+    State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
+  unsigned Size =
+    State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
+
+  // If it's passed by value, then we need the size of the aggregate not of
+  // the pointer.
+  if (ArgFlags.isByVal()) {
+    Size = ByValSize;
+
+    // Hexagon_TODO: Get the alignment of the contained type here.
+    Alignment = 8;
+  }
+
+  unsigned Offset3 = State.AllocateStack(Size, Alignment);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
+                                   LocVT.getSimpleVT(), LocInfo));
+  return false;
+}
+
+
+static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
+                                    EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                    ISD::ArgFlagsTy ArgFlags,
+                                    Hexagon_CCState &State,
+                                    int NonVarArgsParams,
+                                    int CurrentParam,
+                                    bool ForceMem) {
+
+  if (LocVT == MVT::i32 ||
+      LocVT == MVT::f32) {
+    static const unsigned RegList1[] = {
+      Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+      Hexagon::R5
+    };
+    if (unsigned Reg = State.AllocateReg(RegList1, 6)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                       LocVT.getSimpleVT(), LocInfo));
+      return false;
+    }
+  }
+
+  if (LocVT == MVT::i64 ||
+      LocVT == MVT::f64) {
+    static const unsigned RegList2[] = {
+      Hexagon::D0, Hexagon::D1, Hexagon::D2
+    };
+    if (unsigned Reg = State.AllocateReg(RegList2, 3)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                       LocVT.getSimpleVT(), LocInfo));
+      return false;
+    }
+  }
+
+  const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
+  unsigned Alignment =
+    State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
+  unsigned Size =
+    State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
+
+  unsigned Offset3 = State.AllocateStack(Size, Alignment);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
+                                   LocVT.getSimpleVT(), LocInfo));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
new file mode 100644
index 0000000..9942a60
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -0,0 +1,204 @@
+//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonAsmPrinter.h"
+#include "Hexagon.h"
+#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#include "HexagonGenAsmWriter.inc"
+
+const char HexagonInstPrinter::PacketPadding = '\t';
+
+StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return MII.getName(Opcode);
+}
+
+StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
+  return getRegisterName(RegNo);
+}
+
+void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  printInst((const HexagonMCInst*)(MI), O, Annot);
+}
+
+void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  const char startPacket = '{',
+             endPacket = '}';
+  // TODO: add outer HW loop when it's supported too.
+  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+    // Ending a harware loop is different from ending an regular packet.
+    assert(MI->isPacketEnd() && "Loop-end must also end the packet");
+
+    if (MI->isPacketStart()) {
+      // There must be a packet to end a loop.
+      // FIXME: when shuffling is always run, this shouldn't be needed.
+      HexagonMCInst Nop;
+      StringRef NoAnnot;
+
+      Nop.setOpcode (Hexagon::NOP);
+      Nop.setPacketStart (MI->isPacketStart());
+      printInst (&Nop, O, NoAnnot);
+    }
+
+    // Close the packet.
+    if (MI->isPacketEnd())
+      O << PacketPadding << endPacket;
+
+    printInstruction(MI, O);
+  }
+  else {
+    // Prefix the insn opening the packet.
+    if (MI->isPacketStart())
+      O << PacketPadding << startPacket << '\n';
+
+    printInstruction(MI, O);
+
+    // Suffix the insn closing the packet.
+    if (MI->isPacketEnd())
+      // Suffix the packet in a new line always, since the GNU assembler has
+      // issues with a closing brace on the same line as CONST{32,64}.
+      O << '\n' << PacketPadding << endPacket;
+  }
+
+  printAnnotation(O, Annot);
+}
+
+void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  if (MO.isReg()) {
+    O << getRegisterName(MO.getReg());
+  } else if(MO.isExpr()) {
+    O << *MO.getExpr();
+  } else if(MO.isImm()) {
+    printImmOperand(MI, OpNo, O);
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  if(MO.isExpr()) {
+    O << *MO.getExpr();
+  } else if(MO.isImm()) {
+    O << MI->getOperand(OpNo).getImm();
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  const HexagonMCInst *HMCI = static_cast<const HexagonMCInst*>(MI);
+  if (HMCI->isConstExtended())
+    O << "#";
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI,
+                                    unsigned OpNo, raw_ostream &O) const {
+  O << MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  O << -MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  O << -1;
+}
+
+void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  const MCOperand& MO0 = MI->getOperand(OpNo);
+  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+
+  O << getRegisterName(MO0.getReg());
+  O << " + #" << MO1.getImm();
+}
+
+void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) const {
+  const MCOperand& MO0 = MI->getOperand(OpNo);
+  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+
+  O << getRegisterName(MO0.getReg()) << ", #" << MO1.getImm();
+}
+
+void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print $+8, an eight byte displacement from the PC.
+  llvm_unreachable("Unknown branch operand.");
+}
+
+void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                               raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, bool hi) const {
+  assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
+
+  O << '#' << (hi ? "HI" : "LO") << "(#";
+  printOperand(MI, OpNo, O);
+  O << ')';
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
new file mode 100644
index 0000000..09e3f88
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
@@ -0,0 +1,87 @@
+//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONINSTPRINTER_H
+#define HEXAGONINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+namespace llvm {
+  class HexagonMCInst;
+
+  class HexagonInstPrinter : public MCInstPrinter {
+  public:
+    explicit HexagonInstPrinter(const MCAsmInfo &MAI,
+                                const MCInstrInfo &MII,
+                                const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
+
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+    void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
+    virtual StringRef getOpcodeName(unsigned Opcode) const;
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    StringRef getRegName(unsigned RegNo) const;
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) const;
+    void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) const;
+    void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+
+    void printConstantPool(const MCInst *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+
+    void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
+      { printSymbol(MI, OpNo, O, true); }
+    void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
+      { printSymbol(MI, OpNo, O, false); }
+
+    const MCInstrInfo &getMII() const {
+      return MII;
+    }
+
+  protected:
+    void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi)
+           const;
+
+    static const char PacketPadding;
+
+  private:
+    const MCInstrInfo &MII;
+
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
new file mode 100644
index 0000000..f8be77c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -0,0 +1,196 @@
+//===-- HexagonBaseInfo.h - Top level definitions for Hexagon --*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Hexagon target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONBASEINFO_H
+#define HEXAGONBASEINFO_H
+
+#include "HexagonMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// HexagonII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace HexagonII {
+  // *** The code below must match HexagonInstrFormat*.td *** //
+
+  // Insn types.
+  // *** Must match HexagonInstrFormat*.td ***
+  enum Type {
+    TypePSEUDO  = 0,
+    TypeALU32   = 1,
+    TypeCR      = 2,
+    TypeJR      = 3,
+    TypeJ       = 4,
+    TypeLD      = 5,
+    TypeST      = 6,
+    TypeSYSTEM  = 7,
+    TypeXTYPE   = 8,
+    TypeMEMOP   = 9,
+    TypeNV      = 10,
+    TypePREFIX  = 30, // Such as extenders.
+    TypeENDLOOP = 31  // Such as end of a HW loop.
+  };
+
+  enum SubTarget {
+    HasV2SubT     = 0xf,
+    HasV2SubTOnly = 0x1,
+    NoV2SubT      = 0x0,
+    HasV3SubT     = 0xe,
+    HasV3SubTOnly = 0x2,
+    NoV3SubT      = 0x1,
+    HasV4SubT     = 0xc,
+    NoV4SubT      = 0x3,
+    HasV5SubT     = 0x8,
+    NoV5SubT      = 0x7
+  };
+
+  enum AddrMode {
+    NoAddrMode     = 0,  // No addressing mode
+    Absolute       = 1,  // Absolute addressing mode
+    AbsoluteSet    = 2,  // Absolute set addressing mode
+    BaseImmOffset  = 3,  // Indirect with offset
+    BaseLongOffset = 4,  // Indirect with long offset
+    BaseRegOffset  = 5,  // Indirect with register offset
+    PostInc        = 6   // Post increment addressing mode
+  };
+
+  enum MemAccessSize {
+    NoMemAccess = 0,            // Not a memory acces instruction.
+    ByteAccess = 1,             // Byte access instruction (memb).
+    HalfWordAccess = 2,         // Half word access instruction (memh).
+    WordAccess = 3,             // Word access instruction (memw).
+    DoubleWordAccess = 4        // Double word access instruction (memd)
+  };
+
+  // MCInstrDesc TSFlags
+  // *** Must match HexagonInstrFormat*.td ***
+  enum {
+    // This 5-bit field describes the insn type.
+    TypePos  = 0,
+    TypeMask = 0x1f,
+
+    // Solo instructions.
+    SoloPos  = 5,
+    SoloMask = 0x1,
+    // Packed only with A or X-type instructions.
+    SoloAXPos  = 6,
+    SoloAXMask = 0x1,
+    // Only A-type instruction in first slot or nothing.
+    SoloAin1Pos  = 7,
+    SoloAin1Mask = 0x1,
+
+    // Predicated instructions.
+    PredicatedPos  = 8,
+    PredicatedMask = 0x1,
+    PredicatedFalsePos  = 9,
+    PredicatedFalseMask = 0x1,
+    PredicatedNewPos  = 10,
+    PredicatedNewMask = 0x1,
+    PredicateLatePos  = 11,
+    PredicateLateMask = 0x1,
+
+    // New-Value consumer instructions.
+    NewValuePos  = 12,
+    NewValueMask = 0x1,
+    // New-Value producer instructions.
+    hasNewValuePos  = 13,
+    hasNewValueMask = 0x1,
+    // Which operand consumes or produces a new value.
+    NewValueOpPos  = 14,
+    NewValueOpMask = 0x7,
+    // Stores that can become new-value stores.
+    mayNVStorePos  = 17,
+    mayNVStoreMask = 0x1,
+    // New-value store instructions.
+    NVStorePos  = 18,
+    NVStoreMask = 0x1,
+    // Loads that can become current-value loads.
+    mayCVLoadPos  = 19,
+    mayCVLoadMask = 0x1,
+    // Current-value load instructions.
+    CVLoadPos  = 20,
+    CVLoadMask = 0x1,
+
+    // Extendable insns.
+    ExtendablePos  = 21,
+    ExtendableMask = 0x1,
+    // Insns must be extended.
+    ExtendedPos  = 22,
+    ExtendedMask = 0x1,
+    // Which operand may be extended.
+    ExtendableOpPos  = 23,
+    ExtendableOpMask = 0x7,
+    // Signed or unsigned range.
+    ExtentSignedPos  = 26,
+    ExtentSignedMask = 0x1,
+    // Number of bits of range before extending operand.
+    ExtentBitsPos  = 27,
+    ExtentBitsMask = 0x1f,
+    // Alignment power-of-two before extending operand.
+    ExtentAlignPos  = 32,
+    ExtentAlignMask = 0x3,
+
+    // Valid subtargets
+    validSubTargetPos  = 34,
+    validSubTargetMask = 0xf,
+
+    // Addressing mode for load/store instructions.
+    AddrModePos  = 40,
+    AddrModeMask = 0x7,
+    // Access size for load/store instructions.
+    MemAccessSizePos = 43,
+    MemAccesSizeMask = 0x7,
+
+    // Branch predicted taken.
+    TakenPos = 47,
+    TakenMask = 0x1,
+
+    // Floating-point instructions.
+    FPPos  = 48,
+    FPMask = 0x1
+  };
+
+  // *** The code above must match HexagonInstrFormat*.td *** //
+
+  // Hexagon specific MO operand flag mask.
+  enum HexagonMOTargetFlagVal {
+    //===------------------------------------------------------------------===//
+    // Hexagon Specific MachineOperand flags.
+    MO_NO_FLAG,
+
+    HMOTF_ConstExtended = 1,
+
+    /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
+    /// Used for computing a global address for PIC compilations
+    MO_PCREL,
+
+    /// MO_GOT - Indicates a GOT-relative relocation
+    MO_GOT,
+
+    // Low or high part of a symbol.
+    MO_LO16, MO_HI16,
+
+    // Offset from the base of the SDA.
+    MO_GPREL
+  };
+
+} // End namespace HexagonII.
+
+} // End namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
new file mode 100644
index 0000000..141e514
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -0,0 +1,38 @@
+//===-- HexagonMCAsmInfo.cpp - Hexagon asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCAsmInfo.h"
+
+using namespace llvm;
+
+// Pin the vtable to this file.
+void HexagonMCAsmInfo::anchor() {}
+
+HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = nullptr;  // .xword is only supported by V9.
+  ZeroDirective = "\t.skip\t";
+  CommentString = "//";
+  HasLEB128 = true;
+
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+  InlineAsmStart = "# InlineAsm Start";
+  InlineAsmEnd = "# InlineAsm End";
+  ZeroDirective = "\t.space\t";
+  AscizDirective = "\t.string\t";
+
+  SupportsDebugInformation = true;
+  UsesELFSectionDirectiveForBSS  = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
new file mode 100644
index 0000000..953d804
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -0,0 +1,29 @@
+//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the HexagonMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonMCASMINFO_H
+#define HexagonMCASMINFO_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+  class HexagonMCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit HexagonMCAsmInfo(StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
new file mode 100644
index 0000000..9260b4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
@@ -0,0 +1,175 @@
+//===- HexagonMCInst.cpp - Hexagon sub-class of MCInst --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInst to allow some Hexagon VLIW annotations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+
+using namespace llvm;
+
+// Return the slots used by the insn.
+unsigned HexagonMCInst::getUnits(const HexagonTargetMachine* TM) const {
+  const HexagonInstrInfo* QII = TM->getInstrInfo();
+  const InstrItineraryData* II = TM->getInstrItineraryData();
+  const InstrStage*
+    IS = II->beginStage(QII->get(this->getOpcode()).getSchedClass());
+
+  return (IS->getUnits());
+}
+
+// Return the Hexagon ISA class for the insn.
+unsigned HexagonMCInst::getType() const {
+  const uint64_t F = MCID->TSFlags;
+
+  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+}
+
+// Return whether the insn is an actual insn.
+bool HexagonMCInst::isCanon() const {
+  return (!MCID->isPseudo() &&
+          !isPrefix() &&
+          getType() != HexagonII::TypeENDLOOP);
+}
+
+// Return whether the insn is a prefix.
+bool HexagonMCInst::isPrefix() const {
+  return (getType() == HexagonII::TypePREFIX);
+}
+
+// Return whether the insn is solo, i.e., cannot be in a packet.
+bool HexagonMCInst::isSolo() const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
+}
+
+// Return whether the insn is a new-value consumer.
+bool HexagonMCInst::isNewValue() const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+}
+
+// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInst::hasNewValue() const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
+}
+
+// Return the operand that consumes or produces a new value.
+const MCOperand& HexagonMCInst::getNewValue() const {
+  const uint64_t F = MCID->TSFlags;
+  const unsigned O = (F >> HexagonII::NewValueOpPos) &
+                     HexagonII::NewValueOpMask;
+  const MCOperand& MCO = getOperand(O);
+
+  assert ((isNewValue() || hasNewValue()) && MCO.isReg());
+  return (MCO);
+}
+
+// Return whether the instruction needs to be constant extended.
+// 1) Always return true if the instruction has 'isExtended' flag set.
+//
+// isExtendable:
+// 2) For immediate extended operands, return true only if the value is
+//    out-of-range.
+// 3) For global address, always return true.
+
+bool HexagonMCInst::isConstExtended(void) const {
+  if (isExtended())
+    return true;
+
+  if (!isExtendable())
+    return false;
+
+  short ExtOpNum = getCExtOpNum();
+  int MinValue   = getMinValue();
+  int MaxValue   = getMaxValue();
+  const MCOperand& MO = getOperand(ExtOpNum);
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isExpr())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int ImmValue = MO.getImm();
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+// Return whether the instruction must be always extended.
+bool HexagonMCInst::isExtended(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+}
+
+// Return true if the instruction may be extended based on the operand value.
+bool HexagonMCInst::isExtendable(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+}
+
+// Return number of bits in the constant extended operand.
+unsigned HexagonMCInst::getBitCount(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
+}
+
+// Return constant extended operand number.
+unsigned short HexagonMCInst::getCExtOpNum(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+// Return whether the operand can be constant extended.
+bool HexagonMCInst::isOperandExtended(const unsigned short OperandNum) const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+          == OperandNum;
+}
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInst::getMinValue(void) const {
+  const uint64_t F = MCID->TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1 << (bits - 1);
+  else
+    return 0;
+}
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInst::getMaxValue(void) const {
+  const uint64_t F = MCID->TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1 << (bits - 1));
+  else
+    return ~(-1 << bits);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
new file mode 100644
index 0000000..3c52d456
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
@@ -0,0 +1,100 @@
+//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInst to allow some VLIW annotations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCINST_H
+#define HEXAGONMCINST_H
+
+#include "HexagonTargetMachine.h"
+#include "llvm/MC/MCInst.h"
+
+namespace llvm {
+  class MCOperand;
+
+  class HexagonMCInst: public MCInst {
+    // MCID is set during instruction lowering.
+    // It is needed in order to access TSFlags for
+    // use in checking MC instruction properties.
+    const MCInstrDesc *MCID;
+
+    // Packet start and end markers
+    unsigned packetStart: 1, packetEnd: 1;
+
+  public:
+    explicit HexagonMCInst():
+      MCInst(), MCID(nullptr), packetStart(0), packetEnd(0) {};
+    HexagonMCInst(const MCInstrDesc& mcid):
+      MCInst(), MCID(&mcid), packetStart(0), packetEnd(0) {};
+
+    bool isPacketStart() const { return (packetStart); };
+    bool isPacketEnd() const { return (packetEnd); };
+    void setPacketStart(bool Y) { packetStart = Y; };
+    void setPacketEnd(bool Y) { packetEnd = Y; };
+    void resetPacket() { setPacketStart(false); setPacketEnd(false); };
+
+    // Return the slots used by the insn.
+    unsigned getUnits(const HexagonTargetMachine* TM) const;
+
+    // Return the Hexagon ISA class for the insn.
+    unsigned getType() const;
+
+    void setDesc(const MCInstrDesc& mcid) { MCID = &mcid; };
+    const MCInstrDesc& getDesc(void) const { return *MCID; };
+
+    // Return whether the insn is an actual insn.
+    bool isCanon() const;
+
+    // Return whether the insn is a prefix.
+    bool isPrefix() const;
+
+    // Return whether the insn is solo, i.e., cannot be in a packet.
+    bool isSolo() const;
+
+    // Return whether the instruction needs to be constant extended.
+    bool isConstExtended() const;
+
+    // Return constant extended operand number.
+    unsigned short getCExtOpNum(void) const;
+
+    // Return whether the insn is a new-value consumer.
+    bool isNewValue() const;
+
+    // Return whether the instruction is a legal new-value producer.
+    bool hasNewValue() const;
+
+    // Return the operand that consumes or produces a new value.
+    const MCOperand& getNewValue() const;
+
+    // Return number of bits in the constant extended operand.
+    unsigned getBitCount(void) const;
+
+  private:
+    // Return whether the instruction must be always extended.
+    bool isExtended() const;
+
+    // Return true if the insn may be extended based on the operand value.
+    bool isExtendable() const;
+
+    // Return true if the operand can be constant extended.
+    bool isOperandExtended(const unsigned short OperandNum) const;
+
+    // Return the min value that a constant extendable operand can have
+    // without being extended.
+    int getMinValue() const;
+
+    // Return the max value that a constant extendable operand can have
+    // without being extended.
+    int getMaxValue() const;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
new file mode 100644
index 0000000..581674d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -0,0 +1,98 @@
+//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCTargetDesc.h"
+#include "HexagonMCAsmInfo.h"
+#include "InstPrinter/HexagonInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "HexagonGenRegisterInfo.inc"
+
+static MCInstrInfo *createHexagonMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitHexagonMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitHexagonMCRegisterInfo(X, Hexagon::R0);
+  return X;
+}
+
+static MCSubtargetInfo *createHexagonMCSubtargetInfo(StringRef TT,
+                                                     StringRef CPU,
+                                                     StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitHexagonMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
+                                         StringRef TT) {
+  MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
+
+  // VirtualFP = (R30 + #0).
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+      nullptr, Hexagon::R30, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  // For the time being, use static relocations, since there's really no
+  // support for PIC yet.
+  X->InitMCCodeGenInfo(Reloc::Static, CM, OL);
+  return X;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeHexagonTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheHexagonTarget, createHexagonMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheHexagonTarget,
+                                        createHexagonMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheHexagonTarget, createHexagonMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheHexagonTarget,
+                                    createHexagonMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheHexagonTarget,
+                                          createHexagonMCSubtargetInfo);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
new file mode 100644
index 0000000..2238b1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -0,0 +1,39 @@
+//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCTARGETDESC_H
+#define HEXAGONMCTARGETDESC_H
+
+namespace llvm {
+class MCSubtargetInfo;
+class Target;
+
+extern Target TheHexagonTarget;
+
+} // End llvm namespace
+
+// Define symbolic names for Hexagon registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "HexagonGenRegisterInfo.inc"
+
+// Defines symbolic names for the Hexagon instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "HexagonGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
new file mode 100644
index 0000000..40f6c8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheHexagonTarget;
+
+extern "C" void LLVMInitializeHexagonTargetInfo() {
+  RegisterTarget<Triple::hexagon, /*HasJIT=*/false>  X(TheHexagonTarget, "hexagon", "Hexagon");
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
new file mode 100644
index 0000000..acf1214
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -0,0 +1,115 @@
+//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstPrinter.h"
+#include "MSP430.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "MSP430GenAsmWriter.inc"
+
+void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    O << *Op.getExpr();
+  }
+}
+
+void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, const char *Modifier) {
+  assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << '#' << *Op.getExpr();
+  }
+}
+
+void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O,
+                                           const char *Modifier) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Disp = MI->getOperand(OpNo+1);
+
+  // Print displacement first
+
+  // If the global address expression is a part of displacement field with a
+  // register base, we should not emit any prefix symbol here, e.g.
+  //   mov.w &foo, r1
+  // vs
+  //   mov.w glb(r1), r2
+  // Otherwise (!) msp430-as will silently miscompile the output :(
+  if (!Base.getReg())
+    O << '&';
+
+  if (Disp.isExpr())
+    O << *Disp.getExpr();
+  else {
+    assert(Disp.isImm() && "Expected immediate in displacement field");
+    O << Disp.getImm();
+  }
+
+  // Print register base field
+  if (Base.getReg())
+    O << '(' << getRegisterName(Base.getReg()) << ')';
+}
+
+void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned CC = MI->getOperand(OpNo).getImm();
+
+  switch (CC) {
+  default:
+   llvm_unreachable("Unsupported CC code");
+  case MSP430CC::COND_E:
+   O << "eq";
+   break;
+  case MSP430CC::COND_NE:
+   O << "ne";
+   break;
+  case MSP430CC::COND_HS:
+   O << "hs";
+   break;
+  case MSP430CC::COND_LO:
+   O << "lo";
+   break;
+  case MSP430CC::COND_GE:
+   O << "ge";
+   break;
+  case MSP430CC::COND_L:
+   O << 'l';
+   break;
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
new file mode 100644
index 0000000..5afbd20
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -0,0 +1,44 @@
+//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430INSTPRINTER_H
+#define MSP430INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+
+  class MSP430InstPrinter : public MCInstPrinter {
+  public:
+    MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+
+    // Autogenerated by tblgen.
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                      const char *Modifier = nullptr);
+    void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+    void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                            const char *Modifier = nullptr);
+    void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
new file mode 100644
index 0000000..df1aa1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -0,0 +1,27 @@
+//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCAsmInfo.h"
+#include "llvm/ADT/StringRef.h"
+using namespace llvm;
+
+void MSP430MCAsmInfo::anchor() { }
+
+MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
+  PointerSize = CalleeSaveStackSlotSize = 2;
+
+  CommentString = ";";
+
+  AlignmentIsInBytes = false;
+  UsesELFSectionDirectiveForBSS = true;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
new file mode 100644
index 0000000..ef805bb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- MSP430MCAsmInfo.h - MSP430 asm properties --------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430TARGETASMINFO_H
+#define MSP430TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+  class StringRef;
+
+  class MSP430MCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit MSP430MCAsmInfo(StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
new file mode 100644
index 0000000..72adb45
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -0,0 +1,94 @@
+//===-- MSP430MCTargetDesc.cpp - MSP430 Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCTargetDesc.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430MCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MSP430GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+static MCInstrInfo *createMSP430MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMSP430MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMSP430MCRegisterInfo(X, MSP430::PCW);
+  return X;
+}
+
+static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                    StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitMSP430MCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                CodeModel::Model CM,
+                                                CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new MSP430InstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+extern "C" void LLVMInitializeMSP430TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<MSP430MCAsmInfo> X(TheMSP430Target);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMSP430Target,
+                                        createMSP430MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMSP430Target,
+                                    createMSP430MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMSP430Target,
+                                          createMSP430MCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
+                                        createMSP430MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
new file mode 100644
index 0000000..7f3505c
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -0,0 +1,36 @@
+//===-- MSP430MCTargetDesc.h - MSP430 Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430MCTARGETDESC_H
+#define MSP430MCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheMSP430Target;
+
+} // End llvm namespace
+
+// Defines symbolic names for MSP430 registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "MSP430GenRegisterInfo.inc"
+
+// Defines symbolic names for the MSP430 instructions.
+#define GET_INSTRINFO_ENUM
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MSP430GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.h b/contrib/llvm/lib/Target/MSP430/MSP430.h
new file mode 100644
index 0000000..4574ce5
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.h
@@ -0,0 +1,47 @@
+//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MSP430 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_H
+#define LLVM_TARGET_MSP430_H
+
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace MSP430CC {
+  // MSP430 specific condition code.
+  enum CondCodes {
+    COND_E  = 0,  // aka COND_Z
+    COND_NE = 1,  // aka COND_NZ
+    COND_HS = 2,  // aka COND_C
+    COND_LO = 3,  // aka COND_NC
+    COND_GE = 4,
+    COND_L  = 5,
+
+    COND_INVALID = -1
+  };
+}
+
+namespace llvm {
+  class MSP430TargetMachine;
+  class FunctionPass;
+  class formatted_raw_ostream;
+
+  FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
+                                    CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createMSP430BranchSelectionPass();
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.td b/contrib/llvm/lib/Target/MSP430/MSP430.td
new file mode 100644
index 0000000..dfea669
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.td
@@ -0,0 +1,60 @@
+//===-- MSP430.td - Describe the MSP430 Target Machine -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MSP430 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features. 
+//===----------------------------------------------------------------------===//
+def FeatureX
+ : SubtargetFeature<"ext", "ExtendedInsts", "true",
+                    "Enable MSP430-X extensions">;
+
+//===----------------------------------------------------------------------===//
+// MSP430 supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrInfo.td"
+
+def MSP430InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def MSP430 : Target {
+  let InstructionSet = MSP430InstrInfo;
+}
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
new file mode 100644
index 0000000..22a973e
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -0,0 +1,161 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MCInstLower.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+  class MSP430AsmPrinter : public AsmPrinter {
+  public:
+    MSP430AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {}
+
+    const char *getPassName() const override {
+      return "MSP430 Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int OpNum,
+                      raw_ostream &O, const char* Modifier = nullptr);
+    void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                            raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI,
+                               unsigned OpNo, unsigned AsmVariant,
+                               const char *ExtraCode, raw_ostream &O) override;
+    void EmitInstruction(const MachineInstr *MI) override;
+  };
+} // end of anonymous namespace
+
+
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                    raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Not implemented yet!");
+  case MachineOperand::MO_Register:
+    O << MSP430InstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << '#';
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    uint64_t Offset = MO.getOffset();
+
+    // If the global address expression is a part of displacement field with a
+    // register base, we should not emit any prefix symbol here, e.g.
+    //   mov.w &foo, r1
+    // vs
+    //   mov.w glb(r1), r2
+    // Otherwise (!) msp430-as will silently miscompile the output :(
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << (isMemOp ? '&' : '#');
+    if (Offset)
+      O << '(' << Offset << '+';
+
+    O << *getSymbol(MO.getGlobal());
+
+    if (Offset)
+      O << ')';
+
+    return;
+  }
+  }
+}
+
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                                          raw_ostream &O) {
+  const MachineOperand &Base = MI->getOperand(OpNum);
+  const MachineOperand &Disp = MI->getOperand(OpNum+1);
+
+  // Print displacement first
+
+  // Imm here is in fact global address - print extra modifier.
+  if (Disp.isImm() && !Base.getReg())
+    O << '&';
+  printOperand(MI, OpNum+1, O, "nohash");
+
+  // Print register base field
+  if (Base.getReg()) {
+    O << '(';
+    printOperand(MI, OpNum, O);
+    O << ')';
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo, unsigned AsmVariant,
+                                             const char *ExtraCode,
+                                             raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    return true; // Unknown modifier.
+  }
+  printSrcMemOperand(MI, OpNo, O);
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MSP430MCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMSP430AsmPrinter() {
+  RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
new file mode 100644
index 0000000..a96930a
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -0,0 +1,181 @@
+//===-- MSP430BranchSelector.cpp - Emit long conditional branches ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 10 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-branch-select"
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+  struct MSP430BSel : public MachineFunctionPass {
+    static char ID;
+    MSP430BSel() : MachineFunctionPass(ID) {}
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "MSP430 Branch Selector";
+    }
+  };
+  char MSP430BSel::ID = 0;
+}
+
+/// createMSP430BranchSelectionPass - returns an instance of the Branch
+/// Selection Pass
+///
+FunctionPass *llvm::createMSP430BranchSelectionPass() {
+  return new MSP430BSel();
+}
+
+bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
+  const MSP430InstrInfo *TII =
+             static_cast<const MSP430InstrInfo*>(Fn.getTarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += TII->GetInstSizeInBytes(MBBI);
+
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 9)) {
+    BlockSizes.clear();
+    return false;
+  }
+
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+6
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        if ((I->getOpcode() != MSP430::JCC || I->getOperand(0).isImm()) &&
+            I->getOpcode() != MSP430::JMP) {
+          MBBStartOffset += TII->GetInstSizeInBytes(I);
+          continue;
+        }
+
+        // Determine the offset from the current branch to the destination
+        // block.
+        MachineBasicBlock *Dest = I->getOperand(0).getMBB();
+
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<10>(BranchSize)) {
+          MBBStartOffset += 2;
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        unsigned NewSize;
+        MachineInstr *OldBranch = I;
+        DebugLoc dl = OldBranch->getDebugLoc();
+
+        if (I->getOpcode() == MSP430::JMP) {
+          NewSize = 4;
+        } else {
+          // The BCC operands are:
+          // 0. MSP430 branch predicate
+          // 1. Target MBB
+          SmallVector<MachineOperand, 1> Cond;
+          Cond.push_back(I->getOperand(1));
+
+          // Jump over the uncond branch inst (i.e. $+6) on opposite condition.
+          TII->ReverseBranchCondition(Cond);
+          BuildMI(MBB, I, dl, TII->get(MSP430::JCC))
+            .addImm(4).addOperand(Cond[0]);
+
+          NewSize = 6;
+        }
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(MSP430::Bi)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+
+        // Remember that this instruction is NewSize bytes, increase the size of the
+        // block by NewSize-2, remember to iterate.
+        BlockSizes[MBB.getNumber()] += NewSize-2;
+        MBBStartOffset += NewSize;
+
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+
+  BlockSizes.clear();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
new file mode 100644
index 0000000..8a69d1e
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
@@ -0,0 +1,37 @@
+//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MSP430 architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MSP430 Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_MSP430 : CallingConv<[
+  // i8 are returned in registers R15B, R14B, R13B, R12B
+  CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+
+  // i16 are returned in registers R15, R14, R13, R12
+  CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_MSP430_AssignStack : CallingConv<[
+  // Pass by value if the byval attribute is given
+  CCIfByVal<CCPassByVal<2, 2>>,
+
+  // Promote i8 arguments to i16.
+  CCIfType<[i8], CCPromoteToType<i16>>,
+
+  // Integer values get stored in stack slots that are 2 bytes in
+  // size and 2-byte aligned.
+  CCIfType<[i16], CCAssignToStack<2, 2>>
+]>;
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
new file mode 100644
index 0000000..82c8b29
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -0,0 +1,297 @@
+//===-- MSP430FrameLowering.cpp - MSP430 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          MF.getFrameInfo()->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+bool MSP430FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void MSP430FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI->getStackSize();
+
+  uint64_t NumBytes = 0;
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - MSP430FI->getCalleeSavedFrameSize();
+
+    // Get the offset of the stack slot for the EBP register... which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI->setOffsetAdjustment(-NumBytes);
+
+    // Save FPW into the appropriate stack slot...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(MSP430::FPW, RegState::Kill);
+
+    // Update FPW with the new base value...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FPW)
+      .addReg(MSP430::SPW);
+
+    // Mark the FramePtr as live-in in every block except the entry.
+    for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
+         I != E; ++I)
+      I->addLiveIn(MSP430::FPW);
+
+  } else
+    NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
+
+  // Skip the callee-saved push instructions.
+  while (MBBI != MBB.end() && (MBBI->getOpcode() == MSP430::PUSH16r))
+    ++MBBI;
+
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  if (NumBytes) { // adjust stack pointer: SPW -= numbytes
+    // If there is an SUB16ri of SPW immediately before this instruction, merge
+    // the two.
+    //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+    // If there is an ADD16ri or SUB16ri of SPW immediately after this
+    // instruction, merge the two instructions.
+    // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);
+
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  switch (RetOpcode) {
+  case MSP430::RET:
+  case MSP430::RETI: break;  // These are ok
+  default:
+    llvm_unreachable("Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  unsigned CSSize = MSP430FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - CSSize;
+
+    // pop FPW.
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FPW);
+  } else
+    NumBytes = StackSize - CSSize;
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = std::prev(MBBI);
+    unsigned Opc = PI->getOpcode();
+    if (Opc != MSP430::POP16r && !PI->isTerminator())
+      break;
+    --MBBI;
+  }
+
+  DL = MBBI->getDebugLoc();
+
+  // If there is an ADD16ri or SUB16ri of SPW immediately before this
+  // instruction, merge the two instructions.
+  //if (NumBytes || MFI->hasVarSizedObjects())
+  //  mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  if (MFI->hasVarSizedObjects()) {
+    BuildMI(MBB, MBBI, DL,
+            TII.get(MSP430::MOV16rr), MSP430::SPW).addReg(MSP430::FPW);
+    if (CSSize) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL,
+                TII.get(MSP430::SUB16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(CSSize);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  } else {
+    // adjust stack pointer back: SPW += numbytes
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+// FIXME: Can we eleminate these in favour of generic code?
+bool
+MSP430FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>();
+  MFI->setCalleeSavedFrameSize(CSI.size() * 2);
+
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(Reg, RegState::Kill);
+  }
+  return true;
+}
+
+bool
+MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), CSI[i].getReg());
+
+  return true;
+}
+
+void MSP430FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const MSP430InstrInfo &TII =
+    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+  unsigned StackAlign = getStackAlignment();
+
+  if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub SPW, <amt>' and the
+    // adjcallstackdown instruction into 'add SPW, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+
+      MachineInstr *New = nullptr;
+      if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, Old->getDebugLoc(),
+                      TII.get(MSP430::SUB16ri), MSP430::SPW)
+          .addReg(MSP430::SPW).addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == TII.getCallFrameDestroyOpcode());
+        // factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old->getOperand(1).getImm();
+        Amount -= CalleeAmt;
+        if (Amount)
+          New = BuildMI(MF, Old->getDebugLoc(),
+                        TII.get(MSP430::ADD16ri), MSP430::SPW)
+            .addReg(MSP430::SPW).addImm(Amount);
+      }
+
+      if (New) {
+        // The SRW implicit def is dead.
+        New->getOperand(3).setIsDead();
+
+        // Replace the pseudo instruction with a new instruction...
+        MBB.insert(I, New);
+      }
+    }
+  } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      MachineInstr *Old = I;
+      MachineInstr *New =
+        BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri),
+                MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt);
+      // The SRW implicit def is dead.
+      New->getOperand(3).setIsDead();
+
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+void
+MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                         RegScavenger *) const {
+  // Create a frame entry for the FPW register that must be saved.
+  if (hasFP(MF)) {
+    int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
+    (void)FrameIdx;
+    assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+           "Slot for FPW register must be last in order to be found!");
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
new file mode 100644
index 0000000..fadfeed
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
@@ -0,0 +1,54 @@
+//==- MSP430FrameLowering.h - Define frame lowering for MSP430 --*- C++ -*--==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_FRAMEINFO_H
+#define MSP430_FRAMEINFO_H
+
+#include "MSP430.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class MSP430FrameLowering : public TargetFrameLowering {
+protected:
+
+public:
+  explicit MSP430FrameLowering()
+      : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {}
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
new file mode 100644
index 0000000..a9b9035
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -0,0 +1,493 @@
+//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-isel"
+
+namespace {
+  struct MSP430ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    int16_t Disp;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    MSP430ISelAddressMode()
+      : BaseType(RegBase), Disp(0), GV(nullptr), CP(nullptr),
+        BlockAddr(nullptr), ES(nullptr), JT(-1), Align(0) {
+    }
+
+    bool hasSymbolicDisplacement() const {
+      return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1;
+    }
+
+    void dump() {
+      errs() << "MSP430ISelAddressMode " << this << '\n';
+      if (BaseType == RegBase && Base.Reg.getNode() != nullptr) {
+        errs() << "Base.Reg ";
+        Base.Reg.getNode()->dump();
+      } else if (BaseType == FrameIndexBase) {
+        errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+      }
+      errs() << " Disp " << Disp << '\n';
+      if (GV) {
+        errs() << "GV ";
+        GV->dump();
+      } else if (CP) {
+        errs() << " CP ";
+        CP->dump();
+        errs() << " Align" << Align << '\n';
+      } else if (ES) {
+        errs() << "ES ";
+        errs() << ES << '\n';
+      } else if (JT != -1)
+        errs() << " JT" << JT << " Align" << Align << '\n';
+    }
+  };
+}
+
+/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class MSP430DAGToDAGISel : public SelectionDAGISel {
+    const MSP430TargetLowering &Lowering;
+    const MSP430Subtarget &Subtarget;
+
+  public:
+    MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
+        Lowering(*TM.getTargetLowering()),
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    const char *getPassName() const override {
+      return "MSP430 DAG->DAG Pattern Instruction Selection";
+    }
+
+    bool MatchAddress(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM);
+
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override;
+
+    // Include the pieces autogenerated from the target description.
+  #include "MSP430GenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDNode *N) override;
+    SDNode *SelectIndexedLoad(SDNode *Op);
+    SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
+                               unsigned Opc8, unsigned Opc16);
+
+    bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Disp);
+  };
+}  // end anonymous namespace
+
+/// createMSP430ISelDag - This pass converts a legalized DAG into a
+/// MSP430-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createMSP430ISelDag(MSP430TargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new MSP430DAGToDAGISel(TM, OptLevel);
+}
+
+
+/// MatchWrapper - Try to match MSP430ISD::Wrapper node into an addressing mode.
+/// These wrap things that will resolve down into a symbol reference.  If no
+/// match is possible, this returns true, otherwise it returns false.
+bool MSP430DAGToDAGISel::MatchWrapper(SDValue N, MSP430ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+    AM.GV = G->getGlobal();
+    AM.Disp += G->getOffset();
+    //AM.SymbolFlags = G->getTargetFlags();
+  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+    AM.CP = CP->getConstVal();
+    AM.Align = CP->getAlignment();
+    AM.Disp += CP->getOffset();
+    //AM.SymbolFlags = CP->getTargetFlags();
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+    AM.ES = S->getSymbol();
+    //AM.SymbolFlags = S->getTargetFlags();
+  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+    AM.JT = J->getIndex();
+    //AM.SymbolFlags = J->getTargetFlags();
+  } else {
+    AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+    //AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+  }
+  return false;
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != MSP430ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+    // If so, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = MSP430ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
+  DEBUG(errs() << "MatchAddress: "; AM.dump());
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    AM.Disp += Val;
+    return false;
+  }
+
+  case MSP430ISD::Wrapper:
+    if (!MatchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == MSP430ISelAddressMode::RegBase
+        && AM.Base.Reg.getNode() == nullptr) {
+      AM.BaseType = MSP430ISelAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::ADD: {
+    MSP430ISelAddressMode Backup = AM;
+    if (!MatchAddress(N.getNode()->getOperand(0), AM) &&
+        !MatchAddress(N.getNode()->getOperand(1), AM))
+      return false;
+    AM = Backup;
+    if (!MatchAddress(N.getNode()->getOperand(1), AM) &&
+        !MatchAddress(N.getNode()->getOperand(0), AM))
+      return false;
+    AM = Backup;
+
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      MSP430ISelAddressMode Backup = AM;
+      uint64_t Offset = CN->getSExtValue();
+      // Start with the LHS as an addr mode.
+      if (!MatchAddress(N.getOperand(0), AM) &&
+          // Address could not have picked a GV address for the displacement.
+          AM.GV == nullptr &&
+          // Check to see if the LHS & C is zero.
+          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+        AM.Disp += Offset;
+        return false;
+      }
+      AM = Backup;
+    }
+    break;
+  }
+
+  return MatchAddressBase(N, AM);
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
+                                    SDValue &Base, SDValue &Disp) {
+  MSP430ISelAddressMode AM;
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  Base  = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ?
+    CurDAG->getTargetFrameIndex(AM.Base.FrameIndex,
+                                getTargetLowering()->getPointerTy()) :
+    AM.Base.Reg;
+
+  if (AM.GV)
+    Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N),
+                                          MVT::i16, AM.Disp,
+                                          0/*AM.SymbolFlags*/);
+  else if (AM.CP)
+    Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16,
+                                         AM.Align, AM.Disp, 0/*AM.SymbolFlags*/);
+  else if (AM.ES)
+    Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.JT != -1)
+    Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.BlockAddr)
+    Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, 0,
+                                         0/*AM.SymbolFlags*/);
+  else
+    Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i16);
+
+  return true;
+}
+
+bool MSP430DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(Op, Op0, Op1))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+static bool isValidIndexedLoad(const LoadSDNode *LD) {
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    // Sanity check
+    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1)
+      return false;
+
+    break;
+  case MVT::i16:
+    // Sanity check
+    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2)
+      return false;
+
+    break;
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (!isValidIndexedLoad(LD))
+    return nullptr;
+
+  MVT VT = LD->getMemoryVT().getSimpleVT();
+
+  unsigned Opcode = 0;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    Opcode = MSP430::MOV8rm_POST;
+    break;
+  case MVT::i16:
+    Opcode = MSP430::MOV16rm_POST;
+    break;
+  default:
+    return nullptr;
+  }
+
+   return CurDAG->getMachineNode(Opcode, SDLoc(N),
+                                 VT, MVT::i16, MVT::Other,
+                                 LD->getBasePtr(), LD->getChain());
+}
+
+SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
+                                               SDValue N1, SDValue N2,
+                                               unsigned Opc8, unsigned Opc16) {
+  if (N1.getOpcode() == ISD::LOAD &&
+      N1.hasOneUse() &&
+      IsLegalToFold(N1, Op, Op, OptLevel)) {
+    LoadSDNode *LD = cast<LoadSDNode>(N1);
+    if (!isValidIndexedLoad(LD))
+      return nullptr;
+
+    MVT VT = LD->getMemoryVT().getSimpleVT();
+    unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N1)->getMemOperand();
+    SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() };
+    SDNode *ResNode =
+      CurDAG->SelectNodeTo(Op, Opc, VT, MVT::i16, MVT::Other, Ops0);
+    cast<MachineSDNode>(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1);
+    // Transfer chain.
+    ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
+    // Transfer writeback.
+    ReplaceUses(SDValue(N1.getNode(), 1), SDValue(ResNode, 1));
+    return ResNode;
+  }
+
+  return nullptr;
+}
+
+
+SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
+  SDLoc dl(Node);
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== ";
+          Node->dump(CurDAG);
+          errs() << "\n");
+    Node->setNodeId(-1);
+    return nullptr;
+  }
+
+  // Few custom selection stuff.
+  switch (Node->getOpcode()) {
+  default: break;
+  case ISD::FrameIndex: {
+    assert(Node->getValueType(0) == MVT::i16);
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+    if (Node->hasOneUse())
+      return CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16,
+                                  TFI, CurDAG->getTargetConstant(0, MVT::i16));
+    return CurDAG->getMachineNode(MSP430::ADD16ri, dl, MVT::i16,
+                                  TFI, CurDAG->getTargetConstant(0, MVT::i16));
+  }
+  case ISD::LOAD:
+    if (SDNode *ResNode = SelectIndexedLoad(Node))
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  case ISD::ADD:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::SUB:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::AND:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::OR:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::XOR:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == nullptr || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  return ResNode;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
new file mode 100644
index 0000000..3d3ee92
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -0,0 +1,1371 @@
+//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation  ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430ISelLowering.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-lower"
+
+typedef enum {
+  NoHWMult,
+  HWMultIntr,
+  HWMultNoIntr
+} HWMultUseMode;
+
+static cl::opt<HWMultUseMode>
+HWMultMode("msp430-hwmult-mode", cl::Hidden,
+           cl::desc("Hardware multiplier use mode"),
+           cl::init(HWMultNoIntr),
+           cl::values(
+             clEnumValN(NoHWMult, "no",
+                "Do not use hardware multiplier"),
+             clEnumValN(HWMultIntr, "interrupts",
+                "Assume hardware multiplier can be used inside interrupts"),
+             clEnumValN(HWMultNoIntr, "use",
+                "Assume hardware multiplier cannot be used inside interrupts"),
+             clEnumValEnd));
+
+MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM)
+    : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8,  &MSP430::GR8RegClass);
+  addRegisterClass(MVT::i16, &MSP430::GR16RegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Provide all sorts of operation actions
+
+  // Division is expensive
+  setIntDivIsCheap(false);
+
+  setStackPointerRegisterToSaveRestore(MSP430::SPW);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // We have post-incremented loads / stores.
+  setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8,  Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+  // We don't have any truncstores
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+  setOperationAction(ISD::SRA,              MVT::i8,    Custom);
+  setOperationAction(ISD::SHL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRA,              MVT::i16,   Custom);
+  setOperationAction(ISD::SHL,              MVT::i16,   Custom);
+  setOperationAction(ISD::SRL,              MVT::i16,   Custom);
+  setOperationAction(ISD::ROTL,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTR,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTL,             MVT::i16,   Expand);
+  setOperationAction(ISD::ROTR,             MVT::i16,   Expand);
+  setOperationAction(ISD::GlobalAddress,    MVT::i16,   Custom);
+  setOperationAction(ISD::ExternalSymbol,   MVT::i16,   Custom);
+  setOperationAction(ISD::BlockAddress,     MVT::i16,   Custom);
+  setOperationAction(ISD::BR_JT,            MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,            MVT::i8,    Custom);
+  setOperationAction(ISD::BR_CC,            MVT::i16,   Custom);
+  setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
+  setOperationAction(ISD::SETCC,            MVT::i8,    Custom);
+  setOperationAction(ISD::SETCC,            MVT::i16,   Custom);
+  setOperationAction(ISD::SELECT,           MVT::i8,    Expand);
+  setOperationAction(ISD::SELECT,           MVT::i16,   Expand);
+  setOperationAction(ISD::SELECT_CC,        MVT::i8,    Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::i16,   Custom);
+  setOperationAction(ISD::SIGN_EXTEND,      MVT::i16,   Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+  setOperationAction(ISD::CTTZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::i16,   Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::i8,    Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::i16,   Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i8,    Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i16,   Expand);
+
+  setOperationAction(ISD::SHL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SHL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,   Expand);
+
+  // FIXME: Implement efficiently multiplication by a constant
+  setOperationAction(ISD::MUL,              MVT::i8,    Expand);
+  setOperationAction(ISD::MULHS,            MVT::i8,    Expand);
+  setOperationAction(ISD::MULHU,            MVT::i8,    Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::MUL,              MVT::i16,   Expand);
+  setOperationAction(ISD::MULHS,            MVT::i16,   Expand);
+  setOperationAction(ISD::MULHU,            MVT::i16,   Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i16,   Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::UDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::UREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::SREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::UREM,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::SREM,             MVT::i16,   Expand);
+
+  // varargs support
+  setOperationAction(ISD::VASTART,          MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,            MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,            MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,           MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable,        MVT::i16,   Custom);
+
+  // Libcalls names.
+  if (HWMultMode == HWMultIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
+  } else if (HWMultMode == HWMultNoIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+  }
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
+}
+
+SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::SHL: // FALLTHROUGH
+  case ISD::SRL:
+  case ISD::SRA:              return LowerShifts(Op, DAG);
+  case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:     return LowerBlockAddress(Op, DAG);
+  case ISD::ExternalSymbol:   return LowerExternalSymbol(Op, DAG);
+  case ISD::SETCC:            return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:            return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND:      return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
+  case ISD::VASTART:          return LowerVASTART(Op, DAG);
+  case ISD::JumpTable:        return LowerJumpTable(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                       MSP430 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+TargetLowering::ConstraintType
+MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return C_RegisterClass;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+MSP430TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'r':   // GENERAL_REGS
+      if (VT == MVT::i8)
+        return std::make_pair(0U, &MSP430::GR8RegClass);
+
+      return std::make_pair(0U, &MSP430::GR16RegClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MSP430GenCallingConv.inc"
+
+/// For each argument in a function store the number of pieces it is composed
+/// of.
+template<typename ArgT>
+static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
+                              SmallVectorImpl<unsigned> &Out) {
+  unsigned CurrentArgIndex = ~0U;
+  for (unsigned i = 0, e = Args.size(); i != e; i++) {
+    if (CurrentArgIndex == Args[i].OrigArgIndex) {
+      Out.back()++;
+    } else {
+      Out.push_back(1);
+      CurrentArgIndex++;
+    }
+  }
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeCallOperands(Outs, CC_MSP430_AssignStack);
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeFormalArguments(Ins, CC_MSP430_AssignStack);
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of splitted arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+template<typename ArgT>
+static void AnalyzeArguments(CCState &State,
+                             SmallVectorImpl<CCValAssign> &ArgLocs,
+                             const SmallVectorImpl<ArgT> &Args) {
+  static const MCPhysReg RegList[] = {
+    MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W
+  };
+  static const unsigned NbRegs = array_lengthof(RegList);
+
+  if (State.isVarArg()) {
+    AnalyzeVarArgs(State, Args);
+    return;
+  }
+
+  SmallVector<unsigned, 4> ArgsParts;
+  ParseFunctionArgs(Args, ArgsParts);
+
+  unsigned RegsLeft = NbRegs;
+  bool UseStack = false;
+  unsigned ValNo = 0;
+
+  for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
+    MVT ArgVT = Args[ValNo].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+    MVT LocVT = ArgVT;
+    CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+    // Promote i8 to i16
+    if (LocVT == MVT::i8) {
+      LocVT = MVT::i16;
+      if (ArgFlags.isSExt())
+          LocInfo = CCValAssign::SExt;
+      else if (ArgFlags.isZExt())
+          LocInfo = CCValAssign::ZExt;
+      else
+          LocInfo = CCValAssign::AExt;
+    }
+
+    // Handle byval arguments
+    if (ArgFlags.isByVal()) {
+      State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+      continue;
+    }
+
+    unsigned Parts = ArgsParts[i];
+
+    if (!UseStack && Parts <= RegsLeft) {
+      unsigned FirstVal = ValNo;
+      for (unsigned j = 0; j < Parts; j++) {
+        unsigned Reg = State.AllocateReg(RegList, NbRegs);
+        State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+        RegsLeft--;
+      }
+
+      // Reverse the order of the pieces to agree with the "big endian" format
+      // required in the calling convention ABI.
+      SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
+      std::reverse(B, B + Parts);
+    } else {
+      UseStack = true;
+      for (unsigned j = 0; j < Parts; j++)
+        CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+    }
+  }
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeCallResult(Ins, RetCC_MSP430);
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeReturn(Outs, RetCC_MSP430);
+}
+
+template<typename ArgT>
+static void AnalyzeReturnValues(CCState &State,
+                                SmallVectorImpl<CCValAssign> &RVLocs,
+                                const SmallVectorImpl<ArgT> &Args) {
+  AnalyzeRetResult(State, Args);
+
+  // Reverse splitted return values to get the "big endian" format required
+  // to agree with the calling convention ABI.
+  std::reverse(RVLocs.begin(), RVLocs.end());
+}
+
+SDValue
+MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
+                                           CallingConv::ID CallConv,
+                                           bool isVarArg,
+                                           const SmallVectorImpl<ISD::InputArg>
+                                             &Ins,
+                                           SDLoc dl,
+                                           SelectionDAG &DAG,
+                                           SmallVectorImpl<SDValue> &InVals)
+                                             const {
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    if (Ins.empty())
+      return Chain;
+    report_fatal_error("ISRs cannot have arguments");
+  }
+}
+
+SDValue
+MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // MSP430 target does not yet support tail call optimization.
+  isTailCall = false;
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+  case CallingConv::C:
+    return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                          Outs, OutVals, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    report_fatal_error("ISRs cannot be called directly");
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments places on the stack.
+// FIXME: struct return stuff
+SDValue
+MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
+                                        CallingConv::ID CallConv,
+                                        bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        SDLoc dl,
+                                        SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  AnalyzeArguments(CCInfo, ArgLocs, Ins);
+
+  // Create frame index for the start of the first vararg value
+  if (isVarArg) {
+    unsigned Offset = CCInfo.getNextStackOffset();
+    FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, Offset, true));
+  }
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+          llvm_unreachable(nullptr);
+        }
+      case MVT::i16:
+        unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+        // If this is an 8-bit value, it is really passed promoted to 16
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      // Sanity check
+      assert(VA.isMemLoc());
+
+      SDValue InVal;
+      ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+      if (Flags.isByVal()) {
+        int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+                                        VA.getLocMemOffset(), true);
+        InVal = DAG.getFrameIndex(FI, getPointerTy());
+      } else {
+        // Load the argument to a virtual register
+        unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+        if (ObjSize > 2) {
+            errs() << "LowerFormalArguments Unhandled argument type: "
+                << EVT(VA.getLocVT()).getEVTString()
+                << "\n";
+        }
+        // Create the frame index object for this incoming parameter...
+        int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+        // Create the SelectionDAG nodes corresponding to a load
+        //from this parameter
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
+        InVal = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                            MachinePointerInfo::getFixedStack(FI),
+                            false, false, false, 0);
+      }
+
+      InVals.push_back(InVal);
+    }
+  }
+
+  return Chain;
+}
+
+SDValue
+MSP430TargetLowering::LowerReturn(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<SDValue> &OutVals,
+                                  SDLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // ISRs cannot return any value.
+  if (CallConv == CallingConv::MSP430_INTR && !Outs.empty())
+    report_fatal_error("ISRs cannot return any value");
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analize return values.
+  AnalyzeReturnValues(CCInfo, RVLocs, Outs);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
+                  MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(Opc, dl, MVT::Other, RetOps);
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+// TODO: sret.
+SDValue
+MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                     CallingConv::ID CallConv, bool isVarArg,
+                                     bool isTailCall,
+                                     const SmallVectorImpl<ISD::OutputArg>
+                                       &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     SDLoc dl, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &InVals) const {
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  AnalyzeArguments(CCInfo, ArgLocs, Outs);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain ,DAG.getConstant(NumBytes,
+                                                      getPointerTy(), true),
+                               dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy());
+
+      SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   StackPtr,
+                                   DAG.getIntPtrConstant(VA.getLocMemOffset()));
+
+      SDValue MemOp;
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+      if (Flags.isByVal()) {
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i16);
+        MemOp = DAG.getMemcpy(Chain, dl, PtrOff, Arg, SizeNode,
+                              Flags.getByValAlign(),
+                              /*isVolatile*/false,
+                              /*AlwaysInline=*/true,
+                              MachinePointerInfo(),
+                              MachinePointerInfo());
+      } else {
+        MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(),
+                             false, false, 0);
+      }
+
+      MemOpChains.push_back(MemOp);
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emitted instructions must be stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i16);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+                         DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SDLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  AnalyzeReturnValues(CCInfo, RVLocs, Ins);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  SDNode* N = Op.getNode();
+  EVT VT = Op.getValueType();
+  SDLoc dl(N);
+
+  // Expand non-constant shifts to loops:
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    switch (Opc) {
+    default: llvm_unreachable("Invalid shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(MSP430ISD::SHL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(MSP430ISD::SRA, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(MSP430ISD::SRL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    }
+
+  uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+  // Expand the stuff into sequence of shifts.
+  // FIXME: for some shift amounts this might be done better!
+  // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
+  SDValue Victim = N->getOperand(0);
+
+  if (Opc == ISD::SRL && ShiftAmount) {
+    // Emit a special goodness here:
+    // srl A, 1 => clrc; rrc A
+    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    ShiftAmount -= 1;
+  }
+
+  while (ShiftAmount--)
+    Victim = DAG.getNode((Opc == ISD::SHL ? MSP430ISD::RLA : MSP430ISD::RRA),
+                         dl, VT, Victim);
+
+  return Victim;
+}
+
+SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+  // Create the TargetGlobalAddress node, folding in the constant offset.
+  SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
+                                              getPointerTy(), Offset);
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op),
+                     getPointerTy(), Result);
+}
+
+SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+}
+
+SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+}
+
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
+                       ISD::CondCode CC,
+                       SDLoc dl, SelectionDAG &DAG) {
+  // FIXME: Handle bittests someday
+  assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
+
+  // FIXME: Handle jump negative someday
+  MSP430CC::CondCodes TCC = MSP430CC::COND_INVALID;
+  switch (CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:
+    TCC = MSP430CC::COND_E;     // aka COND_Z
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETNE:
+    TCC = MSP430CC::COND_NE;    // aka COND_NZ
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETULE:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETUGE:
+    // Turn lhs u>= rhs with lhs constant into rhs u< lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_LO;
+      break;
+    }
+    TCC = MSP430CC::COND_HS;    // aka COND_C
+    break;
+  case ISD::SETUGT:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETULT:
+    // Turn lhs u< rhs with lhs constant into rhs u>= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_HS;
+      break;
+    }
+    TCC = MSP430CC::COND_LO;    // aka COND_NC
+    break;
+  case ISD::SETLE:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETGE:
+    // Turn lhs >= rhs with lhs constant into rhs < lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_L;
+      break;
+    }
+    TCC = MSP430CC::COND_GE;
+    break;
+  case ISD::SETGT:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETLT:
+    // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_GE;
+      break;
+    }
+    TCC = MSP430CC::COND_L;
+    break;
+  }
+
+  TargetCC = DAG.getConstant(TCC, MVT::i8);
+  return DAG.getNode(MSP430ISD::CMP, dl, MVT::Glue, LHS, RHS);
+}
+
+
+SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS   = Op.getOperand(2);
+  SDValue RHS   = Op.getOperand(3);
+  SDValue Dest  = Op.getOperand(4);
+  SDLoc dl  (Op);
+
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
+                     Chain, Dest, TargetCC, Flag);
+}
+
+SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS   = Op.getOperand(0);
+  SDValue RHS   = Op.getOperand(1);
+  SDLoc dl  (Op);
+
+  // If we are doing an AND and testing against zero, then the CMP
+  // will not be generated.  The AND (or BIT) will generate the condition codes,
+  // but they are different from CMP.
+  // FIXME: since we're doing a post-processing, use a pseudoinstr here, so
+  // lowering & isel wouldn't diverge.
+  bool andCC = false;
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    if (RHSC->isNullValue() && LHS.hasOneUse() &&
+        (LHS.getOpcode() == ISD::AND ||
+         (LHS.getOpcode() == ISD::TRUNCATE &&
+          LHS.getOperand(0).getOpcode() == ISD::AND))) {
+      andCC = true;
+    }
+  }
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  // Get the condition codes directly from the status register, if its easy.
+  // Otherwise a branch will be generated.  Note that the AND and BIT
+  // instructions generate different flags than CMP, the carry bit can be used
+  // for NE/EQ.
+  bool Invert = false;
+  bool Shift = false;
+  bool Convert = true;
+  switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
+   default:
+    Convert = false;
+    break;
+   case MSP430CC::COND_HS:
+     // Res = SRW & 1, no processing is required
+     break;
+   case MSP430CC::COND_LO:
+     // Res = ~(SRW & 1)
+     Invert = true;
+     break;
+   case MSP430CC::COND_NE:
+     if (andCC) {
+       // C = ~Z, thus Res = SRW & 1, no processing is required
+     } else {
+       // Res = ~((SRW >> 1) & 1)
+       Shift = true;
+       Invert = true;
+     }
+     break;
+   case MSP430CC::COND_E:
+     Shift = true;
+     // C = ~Z for AND instruction, thus we can put Res = ~(SRW & 1), however,
+     // Res = (SRW >> 1) & 1 is 1 word shorter.
+     break;
+  }
+  EVT VT = Op.getValueType();
+  SDValue One  = DAG.getConstant(1, VT);
+  if (Convert) {
+    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SRW,
+                                    MVT::i16, Flag);
+    if (Shift)
+      // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
+      SR = DAG.getNode(ISD::SRA, dl, MVT::i16, SR, One);
+    SR = DAG.getNode(ISD::AND, dl, MVT::i16, SR, One);
+    if (Invert)
+      SR = DAG.getNode(ISD::XOR, dl, MVT::i16, SR, One);
+    return SR;
+  } else {
+    SDValue Zero = DAG.getConstant(0, VT);
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(One);
+    Ops.push_back(Zero);
+    Ops.push_back(TargetCC);
+    Ops.push_back(Flag);
+    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+  }
+}
+
+SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue LHS    = Op.getOperand(0);
+  SDValue RHS    = Op.getOperand(1);
+  SDValue TrueV  = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc dl   (Op);
+
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(TrueV);
+  Ops.push_back(FalseV);
+  Ops.push_back(TargetCC);
+  Ops.push_back(Flag);
+
+  return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+}
+
+SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Val = Op.getOperand(0);
+  EVT VT      = Op.getValueType();
+  SDLoc dl(Op);
+
+  assert(VT == MVT::i16 && "Only support i16 for now!");
+
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT,
+                     DAG.getNode(ISD::ANY_EXTEND, dl, VT, Val),
+                     DAG.getValueType(Val.getValueType()));
+}
+
+SDValue
+MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    uint64_t SlotSize = getDataLayout()->getPointerSize();
+    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+                                                           true);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+        DAG.getConstant(getDataLayout()->getPointerSize(), MVT::i16);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         MSP430::FPW, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+  // Frame index of first vararg argument
+  SDValue FrameIndex = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                         getPointerTy());
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Create a store of the frame index to the location operand
+  return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex,
+                      Op.getOperand(1), MachinePointerInfo(SV),
+                      false, false, 0);
+}
+
+SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op,
+                                             SelectionDAG &DAG) const {
+    JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+    SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+    return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT),
+                       getPointerTy(), Result);
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                      SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+  if (VT != MVT::i8 && VT != MVT::i16)
+    return false;
+
+  if (Op->getOpcode() != ISD::ADD)
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    uint64_t RHSC = RHS->getZExtValue();
+    if ((VT == MVT::i16 && RHSC != 2) ||
+        (VT == MVT::i8 && RHSC != 1))
+      return false;
+
+    Base = Op->getOperand(0);
+    Offset = DAG.getConstant(RHSC, VT);
+    AM = ISD::POST_INC;
+    return true;
+  }
+
+  return false;
+}
+
+
+const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  case MSP430ISD::RET_FLAG:           return "MSP430ISD::RET_FLAG";
+  case MSP430ISD::RETI_FLAG:          return "MSP430ISD::RETI_FLAG";
+  case MSP430ISD::RRA:                return "MSP430ISD::RRA";
+  case MSP430ISD::RLA:                return "MSP430ISD::RLA";
+  case MSP430ISD::RRC:                return "MSP430ISD::RRC";
+  case MSP430ISD::CALL:               return "MSP430ISD::CALL";
+  case MSP430ISD::Wrapper:            return "MSP430ISD::Wrapper";
+  case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
+  case MSP430ISD::CMP:                return "MSP430ISD::CMP";
+  case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
+  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
+  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
+  }
+}
+
+bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
+                                          Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  return (Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits());
+}
+
+bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+
+  return (VT1.getSizeInBits() > VT2.getSizeInBits());
+}
+
+bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
+}
+
+bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  return 0 && VT1 == MVT::i8 && VT2 == MVT::i16;
+}
+
+bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  return isZExtFree(Val.getValueType(), VT2);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock*
+MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const {
+  MachineFunction *F = BB->getParent();
+  MachineRegisterInfo &RI = F->getRegInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+
+  unsigned Opc;
+  const TargetRegisterClass * RC;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Invalid shift opcode!");
+  case MSP430::Shl8:
+   Opc = MSP430::SHL8r1;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Shl16:
+   Opc = MSP430::SHL16r1;
+   RC = &MSP430::GR16RegClass;
+   break;
+  case MSP430::Sra8:
+   Opc = MSP430::SAR8r1;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Sra16:
+   Opc = MSP430::SAR16r1;
+   RC = &MSP430::GR16RegClass;
+   break;
+  case MSP430::Srl8:
+   Opc = MSP430::SAR8r1c;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Srl16:
+   Opc = MSP430::SAR16r1c;
+   RC = &MSP430::GR16RegClass;
+   break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  // Create loop block
+  MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *RemBB  = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, LoopBB);
+  F->insert(I, RemBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the block containing instructions after shift.
+  RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+                BB->end());
+  RemBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+  BB->addSuccessor(LoopBB);
+  BB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
+  unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
+  unsigned ShiftReg = RI.createVirtualRegister(RC);
+  unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+  unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  // BB:
+  // cmp 0, N
+  // je RemBB
+  BuildMI(BB, dl, TII.get(MSP430::CMP8ri))
+    .addReg(ShiftAmtSrcReg).addImm(0);
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(RemBB)
+    .addImm(MSP430CC::COND_E);
+
+  // LoopBB:
+  // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+  // ShiftAmt = phi [%N, BB],      [%ShiftAmt2, LoopBB]
+  // ShiftReg2 = shift ShiftReg
+  // ShiftAmt2 = ShiftAmt - 1;
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
+    .addReg(ShiftAmtSrcReg).addMBB(BB)
+    .addReg(ShiftAmtReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+    .addReg(ShiftReg);
+  BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
+    .addReg(ShiftAmtReg).addImm(1);
+  BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
+    .addMBB(LoopBB)
+    .addImm(MSP430CC::COND_NE);
+
+  // RemBB:
+  // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
+  BuildMI(*RemBB, RemBB->begin(), dl, TII.get(MSP430::PHI), DstReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return RemBB;
+}
+
+MachineBasicBlock*
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+    return EmitShiftInstr(MI, BB);
+
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   jCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(I, copy0MBB);
+  F->insert(I, copy1MBB);
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  copy1MBB->splice(copy1MBB->begin(), BB,
+                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(copy1MBB);
+
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(copy1MBB)
+    .addImm(MI->getOperand(3).getImm());
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to copy1MBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(copy1MBB);
+
+  //  copy1MBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = copy1MBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
new file mode 100644
index 0000000..3e2f344
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
@@ -0,0 +1,173 @@
+//===-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MSP430 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_ISELLOWERING_H
+#define LLVM_TARGET_MSP430_ISELLOWERING_H
+
+#include "MSP430.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace MSP430ISD {
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// Return with a flag operand. Operand 0 is the chain operand.
+      RET_FLAG,
+
+      /// Same as RET_FLAG, but used for returning from ISRs.
+      RETI_FLAG,
+
+      /// Y = R{R,L}A X, rotate right (left) arithmetically
+      RRA, RLA,
+
+      /// Y = RRC X, rotate right via carry
+      RRC,
+
+      /// CALL - These operations represent an abstract call
+      /// instruction, which includes a bunch of information.
+      CALL,
+
+      /// Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+      /// and TargetGlobalAddress.
+      Wrapper,
+
+      /// CMP - Compare instruction.
+      CMP,
+
+      /// SetCC - Operand 0 is condition code, and operand 1 is the flag
+      /// operand produced by a CMP instruction.
+      SETCC,
+
+      /// MSP430 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// instruction.
+      BR_CC,
+
+      /// SELECT_CC - Operand 0 and operand 1 are selection variable, operand 3
+      /// is condition code and operand 4 is flag operand.
+      SELECT_CC,
+
+      /// SHL, SRA, SRL - Non-constant shifts.
+      SHL, SRA, SRL
+    };
+  }
+
+  class MSP430TargetLowering : public TargetLowering {
+  public:
+    explicit MSP430TargetLowering(const TargetMachine &TM);
+
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    TargetLowering::ConstraintType
+    getConstraintType(const std::string &Constraint) const override;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 MVT VT) const override;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of type
+    /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
+    /// register R15W to i8 by referencing its sub-register R15B.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    /// isZExtFree - Return true if any actual instruction that defines a value
+    /// of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in unknown
+    /// ways, such as incoming arguments, or copies from unknown virtual
+    /// registers. Also, if isTruncateFree(Ty2, Ty1) is true, this does not
+    /// necessarily apply to truncate instructions. e.g. on msp430, all
+    /// instructions that define 8-bit values implicit zero-extend the result
+    /// out to 16 bits.
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB) const override;
+    MachineBasicBlock* EmitShiftInstr(MachineInstr *MI,
+                                      MachineBasicBlock *BB) const;
+
+  private:
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SDLoc dl,
+                              SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            SDLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        SDLoc dl, SelectionDAG &DAG) const override;
+
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                    SDValue &Base,
+                                    SDValue &Offset,
+                                    ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
+  };
+} // namespace llvm
+
+#endif // LLVM_TARGET_MSP430_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
new file mode 100644
index 0000000..a9e87da
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
@@ -0,0 +1,211 @@
+//===-- MSP430InstrFormats.td - MSP430 Instruction Formats -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MSP430 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def PseudoFrm   : Format<0>;
+def SingleOpFrm : Format<1>;
+def DoubleOpFrm : Format<2>;
+def CondJumpFrm : Format<3>;
+
+class SourceMode<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def SrcReg      : SourceMode<0>;
+def SrcMem      : SourceMode<1>;
+def SrcIndReg   : SourceMode<2>;
+def SrcPostInc  : SourceMode<3>;
+def SrcImm      : SourceMode<3>;
+
+class DestMode<bit val> {
+  bit Value = val;
+}
+
+def DstReg      : DestMode<0>;
+def DstMem      : DestMode<1>;
+
+class SizeVal<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def SizeUnknown : SizeVal<0>; // Unknown / unset size
+def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
+def Size2Bytes  : SizeVal<2>;
+def Size4Bytes  : SizeVal<3>;
+def Size6Bytes  : SizeVal<4>;
+
+// Generic MSP430 Format
+class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
+                 string asmstr> : Instruction {
+  field bits<16> Inst;
+
+  let Namespace = "MSP430";
+
+  dag OutOperandList = outs;
+  dag InOperandList  = ins;
+
+  Format Form = f;
+  SizeVal Sz = sz;
+
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
+  let TSFlags{1-0} = Form.Value;
+  let TSFlags{4-2} = Sz.Value;
+
+  let AsmString   = asmstr;
+}
+
+// FIXME: Create different classes for different addressing modes.
+
+// MSP430 Double Operand (Format I) Instructions
+class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+  let Pattern = pattern;
+
+  DestMode ad = dest;
+  SourceMode as = src;
+  
+  let Inst{12-15} = opcode;
+  let Inst{7}     = ad.Value;
+  let Inst{6}     = bw;
+  let Inst{4-5}   = as.Value;
+}
+
+// 8 bit IForm instructions
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class I8rr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I8ri<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8rm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mi<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I8mm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IForm instructions
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class I16rr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I16ri<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16rm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mi<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I16mm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Single Operand (Format II) Instructions
+class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  SourceMode as = src;
+
+  let Inst{7-15} = opcode;
+  let Inst{6}    = bw;
+  let Inst{4-5}  = as.Value;
+}
+
+// 8 bit IIForm instructions
+class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class II8r<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II8m<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II8i<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IIForm instructions
+class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+               dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class II16r<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II16m<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II16i<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Conditional Jumps Instructions
+class CJForm<bits<3> opcode, bits<3> cond,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  let Inst{13-15} = opcode;
+  let Inst{10-12} = cond;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+  let Pattern = pattern;
+  let Inst{15-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
new file mode 100644
index 0000000..ccb6c09
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -0,0 +1,329 @@
+//===-- MSP430InstrInfo.cpp - MSP430 Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstrInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "MSP430GenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void MSP430InstrInfo::anchor() {}
+
+MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
+  : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+    RI() {}
+
+void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill, int FrameIdx,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           const TargetRegisterInfo *TRI) const{
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
+      .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
+      .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  unsigned Opc;
+  if (MSP430::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = MSP430::MOV16rr;
+  else if (MSP430::GR8RegClass.contains(DestReg, SrcReg))
+    Opc = MSP430::MOV8rr;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, I, DL, get(Opc), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != MSP430::JMP &&
+        I->getOpcode() != MSP430::JCC &&
+        I->getOpcode() != MSP430::Br &&
+        I->getOpcode() != MSP430::Bm)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool MSP430InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+  MSP430CC::CondCodes CC = static_cast<MSP430CC::CondCodes>(Cond[0].getImm());
+
+  switch (CC) {
+  default: llvm_unreachable("Invalid branch condition!");
+  case MSP430CC::COND_E:
+    CC = MSP430CC::COND_NE;
+    break;
+  case MSP430CC::COND_NE:
+    CC = MSP430CC::COND_E;
+    break;
+  case MSP430CC::COND_L:
+    CC = MSP430CC::COND_GE;
+    break;
+  case MSP430CC::COND_GE:
+    CC = MSP430CC::COND_L;
+    break;
+  case MSP430CC::COND_HS:
+    CC = MSP430CC::COND_LO;
+    break;
+  case MSP430CC::COND_LO:
+    CC = MSP430CC::COND_HS;
+    break;
+  }
+
+  Cond[0].setImm(CC);
+  return false;
+}
+
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+  if (!MI->isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Cannot handle indirect branches.
+    if (I->getOpcode() == MSP430::Br ||
+        I->getOpcode() == MSP430::Bm)
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == MSP430::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    assert(I->getOpcode() == MSP430::JCC && "Invalid conditional branch");
+    MSP430CC::CondCodes BranchCode =
+      static_cast<MSP430CC::CondCodes>(I->getOperand(1).getImm());
+    if (BranchCode == MSP430CC::COND_INVALID)
+      return true;  // Can't handle weird stuff.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    MSP430CC::CondCodes OldBranchCode = (MSP430CC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    return true;
+  }
+
+  return false;
+}
+
+unsigned
+MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                              MachineBasicBlock *FBB,
+                              const SmallVectorImpl<MachineOperand> &Cond,
+                              DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "MSP430 branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  BuildMI(&MBB, DL, get(MSP430::JCC)).addMBB(TBB).addImm(Cond[0].getImm());
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+
+  switch (Desc.TSFlags & MSP430II::SizeMask) {
+  default:
+    switch (Desc.getOpcode()) {
+    default: llvm_unreachable("Unknown instruction size!");
+    case TargetOpcode::CFI_INSTRUCTION:
+    case TargetOpcode::EH_LABEL:
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+    case TargetOpcode::DBG_VALUE:
+      return 0;
+    case TargetOpcode::INLINEASM: {
+      const MachineFunction *MF = MI->getParent()->getParent();
+      const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+      return TII.getInlineAsmLength(MI->getOperand(0).getSymbolName(),
+                                    *MF->getTarget().getMCAsmInfo());
+    }
+    }
+  case MSP430II::SizeSpecial:
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unknown instruction size!");
+    case MSP430::SAR8r1c:
+    case MSP430::SAR16r1c:
+      return 4;
+    }
+  case MSP430II::Size2Bytes:
+    return 2;
+  case MSP430II::Size4Bytes:
+    return 4;
+  case MSP430II::Size6Bytes:
+    return 6;
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
new file mode 100644
index 0000000..e6baaef
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -0,0 +1,93 @@
+//===-- MSP430InstrInfo.h - MSP430 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430INSTRINFO_H
+#define LLVM_TARGET_MSP430INSTRINFO_H
+
+#include "MSP430RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MSP430GenInstrInfo.inc"
+
+namespace llvm {
+
+class MSP430Subtarget;
+
+/// MSP430II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MSP430II {
+  enum {
+    SizeShift   = 2,
+    SizeMask    = 7 << SizeShift,
+
+    SizeUnknown = 0 << SizeShift,
+    SizeSpecial = 1 << SizeShift,
+    Size2Bytes  = 2 << SizeShift,
+    Size4Bytes  = 3 << SizeShift,
+    Size6Bytes  = 4 << SizeShift
+  };
+}
+
+class MSP430InstrInfo : public MSP430GenInstrInfo {
+  const MSP430RegisterInfo RI;
+  virtual void anchor();
+public:
+  explicit MSP430InstrInfo(MSP430Subtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill,
+                           int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  // Branch folding goodness
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const override;
+  bool AnalyzeBranch(MachineBasicBlock &MBB,
+                     MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
new file mode 100644
index 0000000..50e3fda
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
@@ -0,0 +1,1211 @@
+//===-- MSP430InstrInfo.td - MSP430 Instruction defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the MSP430 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+def SDT_MSP430Call         : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_MSP430Wrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                  SDTCisPtrTy<0>]>;
+def SDT_MSP430Cmp          : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+                                                  SDTCisVT<1, i8>]>;
+def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+                                                  SDTCisSameAs<1, 2>, 
+                                                  SDTCisVT<3, i8>]>;
+def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                                  SDTCisI8<2>]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def MSP430retflag  : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
+def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
+def MSP430rrc     : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+
+def MSP430call    : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
+                     [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
+                        [SDNPHasChain, SDNPOutGlue]>;
+def MSP430callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_MSP430CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
+def MSP430cmp     : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutGlue]>;
+def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
+                            [SDNPHasChain, SDNPInGlue]>;
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
+                            [SDNPInGlue]>;
+def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
+def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
+def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+def memsrc : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+def memdst : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// Short jump targets have OtherVT type and are printed as pcrel imm values.
+def jmptarget : Operand<OtherVT> {
+  let PrintMethod = "printPCRelImmOperand";
+}
+
+// Operand for printing out a condition code.
+def cc : Operand<i8> {
+  let PrintMethod = "printCCOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Complex Pattern Definitions.
+//===----------------------------------------------------------------------===//
+
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SRW.
+let Defs = [SPW, SRW], Uses = [SPW] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+                              "#ADJCALLSTACKDOWN",
+                              [(MSP430callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
+                              "#ADJCALLSTACKUP",
+                              [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
+                        "# Select8 PSEUDO",
+                        [(set GR8:$dst,
+                          (MSP430selectcc GR8:$src, GR8:$src2, imm:$cc))]>;
+  def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR16:$src2, i8imm:$cc),
+                        "# Select16 PSEUDO",
+                        [(set GR16:$dst,
+                          (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
+  let Defs = [SRW] in {
+  def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Shl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+  def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Shl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+  def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Sra8 PSEUDO",
+                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+  def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Sra16 PSEUDO",
+                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+  def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Srl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+  def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Srl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
+
+  }
+}
+
+let neverHasSideEffects = 1 in
+def NOP : Pseudo<(outs), (ins), "nop", []>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def RET  : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                     (outs), (ins), "ret",  [(MSP430retflag)]>;
+  def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+
+// FIXME: expand opcode & cond field for branches!
+
+// Direct branch
+let isBarrier = 1 in {
+  // Short branch
+  def JMP : CJForm<0, 0, (outs), (ins jmptarget:$dst),
+                   "jmp\t$dst",
+                   [(br bb:$dst)]>;
+  let isIndirectBranch = 1 in {
+    // Long branches
+    def Bi  : I16ri<0, (outs), (ins i16imm:$brdst),
+                    "br\t$brdst",
+                    [(brind tblockaddress:$brdst)]>;
+    def Br  : I16rr<0, (outs), (ins GR16:$brdst),
+                    "br\t$brdst",
+                    [(brind GR16:$brdst)]>;
+    def Bm  : I16rm<0, (outs), (ins memsrc:$brdst),
+                    "br\t$brdst",
+                    [(brind (load addr:$brdst))]>;
+  }
+}
+
+// Conditional branches
+let Uses = [SRW] in
+  def JCC : CJForm<0, 0,
+                   (outs), (ins jmptarget:$dst, cc:$cc),
+                   "j$cc\t$dst",
+                   [(MSP430brcc bb:$dst, imm:$cc)]>;
+} // isBranch, isTerminator
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. SPW is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [R12W, R13W, R14W, R15W, SRW],
+      Uses = [SPW] in {
+    def CALLi     : II16i<0x0,
+                          (outs), (ins i16imm:$dst),
+                          "call\t$dst", [(MSP430call imm:$dst)]>;
+    def CALLr     : II16r<0x0,
+                          (outs), (ins GR16:$dst),
+                          "call\t$dst", [(MSP430call GR16:$dst)]>;
+    def CALLm     : II16m<0x0,
+                          (outs), (ins memsrc:$dst),
+                          "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
+  }
+
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+let Defs = [SPW], Uses = [SPW], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                       (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH16r  : II16r<0x0,
+                     (outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// FIXME: Provide proper encoding!
+let neverHasSideEffects = 1 in {
+def MOV8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   []>;
+def MOV16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    []>;
+}
+
+// FIXME: Provide proper encoding!
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, imm:$src)]>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+def MOV8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, (load addr:$src))]>;
+}
+
+def MOVZX16rr8 : I8rr<0x0,
+                      (outs GR16:$dst), (ins GR8:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zext GR8:$src))]>;
+def MOVZX16rm8 : I8rm<0x0,
+                      (outs GR16:$dst), (ins memsrc:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in {
+def MOV8rm_POST  : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
+                         "mov.b\t{@$base+, $dst}", []>;
+def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
+                           "mov.w\t{@$base+, $dst}", []>;
+}
+
+// Any instruction that defines a 8-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 8-bit operation will zero-extend
+// up to 16 bits.
+def def8 : PatLeaf<(i8 GR8:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 8-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i16 (zext def8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+def MOV8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 imm:$src), addr:$dst)]>;
+
+def MOV8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store GR16:$src, addr:$dst)]>;
+
+def MOV8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 (load addr:$src)), addr:$dst)]>;
+def MOV16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 (load addr:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let Constraints = "$src = $dst" in {
+
+let Defs = [SRW] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
+
+def ADD8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def ADD16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def ADD8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def ADD16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "add.b\t{@$base+, $dst}", []>;
+def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                          "add.w\t{@$base+, $dst}", []>;
+}
+
+
+def ADD8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def ADD16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+let Constraints = "" in {
+def ADD8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADD8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADD8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), 
+                                  (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+
+let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
+def ADC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def ADC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+} // isCommutable
+
+def ADC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def ADC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def ADC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def ADC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let Constraints = "" in {
+def ADC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), 
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mm : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.w\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
+def AND8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def AND16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def AND8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def AND16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def AND8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def AND16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "and.b\t{@$base+, $dst}", []>;
+def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                           "and.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def AND8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def AND8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def AND8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
+def OR8rr  : I8rr<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
+def OR16rr : I16rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
+}
+
+def OR8ri  : I8ri<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
+def OR16ri : I16ri<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
+
+def OR8rm  : I8rm<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
+def OR16rm : I16rm<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                        (outs GR8:$dst, GR16:$base_wb),
+                        (ins GR8:$src, GR16:$base),
+                        "bis.b\t{@$base+, $dst}", []>;
+def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src, GR16:$base),
+                          "bis.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def OR8mr  : I8mr<0x0,
+                  (outs), (ins memdst:$dst, GR8:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+def OR16mr : I16mr<0x0,
+                   (outs), (ins memdst:$dst, GR16:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
+
+def OR8mi  : I8mi<0x0, 
+                  (outs), (ins memdst:$dst, i8imm:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def OR16mi : I16mi<0x0,
+                   (outs), (ins memdst:$dst, i16imm:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
+
+def OR8mm  : I8mm<0x0,
+                  (outs), (ins memdst:$dst, memsrc:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (i8 (load addr:$dst)),
+                              (i8 (load addr:$src))), addr:$dst)]>;
+def OR16mm : I16mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (i16 (load addr:$dst)),
+                               (i16 (load addr:$src))), addr:$dst)]>;
+}
+
+// bic does not modify condition codes
+def BIC8rr :  I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "bic.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
+def BIC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
+
+def BIC8rm :  I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "bic.b\t{$src2, $dst}",
+                    [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
+def BIC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
+
+let Constraints = "" in {
+def BIC8mr :  I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
+def BIC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
+
+def BIC8mm :  I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst),
+                                (not (i8 (load addr:$src)))), addr:$dst)]>;
+def BIC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst),
+                                 (not (i16 (load addr:$src)))), addr:$dst)]>;
+}
+
+let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
+def XOR8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def XOR16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def XOR8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def XOR16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def XOR8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def XOR16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "xor.b\t{@$base+, $dst}", []>;
+def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                           "xor.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def XOR8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def XOR8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def XOR8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+
+def SUB8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def SUB16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+
+def SUB8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def SUB16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def SUB8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def SUB16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "sub.b\t{@$base+, $dst}", []>;
+def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src, GR16:$base),
+                          "sub.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def SUB8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def SUB8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def SUB8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+def SBC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def SBC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+
+def SBC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def SBC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def SBC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def SBC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let Constraints = "" in {
+def SBC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "subc.b\t{$src, $dst}",
+                  [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
+                   (implicit SRW)]>;
+def SBC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def SBC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def SBC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def SBC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst),
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def SBC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst),
+                            (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+// FIXME: memory variant!
+def SAR8r1  : II8r<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rra.b\t$dst",
+                   [(set GR8:$dst, (MSP430rra GR8:$src)),
+                    (implicit SRW)]>;
+def SAR16r1 : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rra.w\t$dst",
+                    [(set GR16:$dst, (MSP430rra GR16:$src)),
+                     (implicit SRW)]>;
+
+def SHL8r1  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rla.b\t$dst",
+                   [(set GR8:$dst, (MSP430rla GR8:$src)),
+                    (implicit SRW)]>;
+def SHL16r1 : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rla.w\t$dst",
+                    [(set GR16:$dst, (MSP430rla GR16:$src)),
+                     (implicit SRW)]>;
+
+def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+                      "clrc\n\t"
+                      "rrc.b\t$dst",
+                      [(set GR8:$dst, (MSP430rrc GR8:$src)),
+                       (implicit SRW)]>;
+def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+                      "clrc\n\t"
+                      "rrc.w\t$dst",
+                      [(set GR16:$dst, (MSP430rrc GR16:$src)),
+                       (implicit SRW)]>;
+
+// FIXME: Memory sext's ?
+def SEXT16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "sxt\t$dst",
+                    [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+                     (implicit SRW)]>;
+
+} // Defs = [SRW]
+
+def ZEXT16r : I8rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+
+// FIXME: Memory bitswaps?
+def SWPB16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "swpb\t$dst",
+                    [(set GR16:$dst, (bswap GR16:$src))]>;
+
+} // Constraints = "$src = $dst"
+
+// Integer comparisons
+let Defs = [SRW] in {
+def CMP8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src, GR8:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SRW)]>;
+def CMP16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SRW)]>;
+
+def CMP8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src, i8imm:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SRW)]>;
+def CMP16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SRW)]>;
+
+def CMP8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src, i8imm:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp (load addr:$src),
+                               (i8 imm:$src2)), (implicit SRW)]>;
+def CMP16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                     [(MSP430cmp (load addr:$src),
+                                 (i16 imm:$src2)), (implicit SRW)]>;
+
+def CMP8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src, memsrc:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, (load addr:$src2)), 
+                    (implicit SRW)]>;
+def CMP16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src, memsrc:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, (load addr:$src2)),
+                     (implicit SRW)]>;
+
+def CMP8mr  : I8mr<0x0,
+                   (outs), (ins memsrc:$src, GR8:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp (load addr:$src), GR8:$src2),
+                    (implicit SRW)]>;
+def CMP16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp (load addr:$src), GR16:$src2), 
+                     (implicit SRW)]>;
+
+
+// BIT TESTS, just sets condition codes
+// Note that the C condition is set differently than when using CMP.
+let isCommutable = 1 in {
+def BIT8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src, GR8:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
+                    (implicit SRW)]>;
+def BIT16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
+                     (implicit SRW)]>;
+}
+def BIT8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src, i8imm:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
+                    (implicit SRW)]>;
+def BIT16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
+                     (implicit SRW)]>;
+
+def BIT8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src, memdst:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src,  (load addr:$src2)), 0),
+                    (implicit SRW)]>;
+def BIT16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src, memdst:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src,  (load addr:$src2)), 0),
+                     (implicit SRW)]>;
+
+def BIT8mr  : I8mr<0x0,
+                  (outs), (ins memsrc:$src, GR8:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
+                   (implicit SRW)]>;
+def BIT16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
+                     (implicit SRW)]>;
+
+def BIT8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src, i8imm:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
+                    (implicit SRW)]>;
+def BIT16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
+                     (implicit SRW)]>;
+
+def BIT8mm  : I8mm<0x0,
+                   (outs), (ins memsrc:$src, memsrc:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su (i8 (load addr:$src)),
+                                       (load addr:$src2)),
+                                 0),
+                      (implicit SRW)]>;
+def BIT16mm : I16mm<0x0,
+                    (outs), (ins memsrc:$src, memsrc:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (i16 (load addr:$src)),
+                                        (load addr:$src2)),
+                                 0),
+                     (implicit SRW)]>;
+} // Defs = [SRW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+
+// extload
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+
+// anyext
+def : Pat<(i16 (anyext GR8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+// truncs
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, subreg_8bit)>;
+
+// GlobalAddress, ExternalSymbol
+def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
+def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
+def : Pat<(i16 (MSP430Wrapper tblockaddress:$dst)), (MOV16ri tblockaddress:$dst)>;
+
+def : Pat<(add GR16:$src, (MSP430Wrapper tglobaladdr :$src2)),
+          (ADD16ri GR16:$src, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper texternalsym:$src2)),
+          (ADD16ri GR16:$src, texternalsym:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper tblockaddress:$src2)),
+          (ADD16ri GR16:$src, tblockaddress:$src2)>;
+
+def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper texternalsym:$src)), addr:$dst),
+          (MOV16mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tblockaddress:$src)>;
+
+// calls
+def : Pat<(MSP430call (i16 tglobaladdr:$dst)),
+          (CALLi tglobaladdr:$dst)>;
+def : Pat<(MSP430call (i16 texternalsym:$dst)),
+          (CALLi texternalsym:$dst)>;
+
+// add and sub always produce carry
+def : Pat<(addc GR16:$src, GR16:$src2),
+          (ADD16rr GR16:$src, GR16:$src2)>;
+def : Pat<(addc GR16:$src, (load addr:$src2)),
+          (ADD16rm GR16:$src, addr:$src2)>;
+def : Pat<(addc GR16:$src, imm:$src2),
+          (ADD16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
+          (ADD16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (ADD16mm addr:$dst, addr:$src)>;
+
+def : Pat<(addc GR8:$src, GR8:$src2),
+          (ADD8rr GR8:$src, GR8:$src2)>;
+def : Pat<(addc GR8:$src, (load addr:$src2)),
+          (ADD8rm GR8:$src, addr:$src2)>;
+def : Pat<(addc GR8:$src, imm:$src2),
+          (ADD8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
+          (ADD8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (ADD8mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR16:$src, GR16:$src2),
+          (SUB16rr GR16:$src, GR16:$src2)>;
+def : Pat<(subc GR16:$src, (load addr:$src2)),
+          (SUB16rm GR16:$src, addr:$src2)>;
+def : Pat<(subc GR16:$src, imm:$src2),
+          (SUB16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
+          (SUB16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (SUB16mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR8:$src, GR8:$src2),
+          (SUB8rr GR8:$src, GR8:$src2)>;
+def : Pat<(subc GR8:$src, (load addr:$src2)),
+          (SUB8rm GR8:$src, addr:$src2)>;
+def : Pat<(subc GR8:$src, imm:$src2),
+          (SUB8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
+          (SUB8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (SUB8mm addr:$dst, addr:$src)>;
+
+// peephole patterns
+def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
+def : Pat<(MSP430cmp (trunc (and_su GR16:$src, GR16:$src2)), 0),
+          (BIT8rr (EXTRACT_SUBREG GR16:$src, subreg_8bit),
+                  (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
new file mode 100644
index 0000000..05352a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -0,0 +1,157 @@
+//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MSP430 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCInstLower.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+MCSymbol *MSP430MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  const DataLayout *DL = Printer.TM.getDataLayout();
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  const DataLayout *DL = Printer.TM.getDataLayout();
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetBlockAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+MCOperand MSP430MCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                         MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
new file mode 100644
index 0000000..794aa56
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
@@ -0,0 +1,47 @@
+//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_MCINSTLOWER_H
+#define MSP430_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+
+  /// MSP430MCInstLower - This class is used to lower an MachineInstr
+  /// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+public:
+  MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
new file mode 100644
index 0000000..0f75399
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- MSP430MachineFuctionInfo.cpp - MSP430 machine function info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void MSP430MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
new file mode 100644
index 0000000..d1697f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -0,0 +1,54 @@
+//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MSP430-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430MACHINEFUNCTIONINFO_H
+#define MSP430MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MSP430MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private MSP430 target-specific information for each MachineFunction.
+class MSP430MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+public:
+  MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
+
+  explicit MSP430MachineFunctionInfo(MachineFunction &MF)
+    : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex;}
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
new file mode 100644
index 0000000..691bcee
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -0,0 +1,162 @@
+//===-- MSP430RegisterInfo.cpp - MSP430 Register Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430RegisterInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+// FIXME: Provide proper call frame setup / destroy opcodes.
+MSP430RegisterInfo::MSP430RegisterInfo()
+  : MSP430GenRegisterInfo(MSP430::PCW) {}
+
+const MCPhysReg*
+MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
+  const Function* F = MF->getFunction();
+  static const MCPhysReg CalleeSavedRegs[] = {
+    MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
+    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsFP[] = {
+    MSP430::R5W, MSP430::R6W, MSP430::R7W,
+    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsIntr[] = {
+    MSP430::FPW,  MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsIntrFP[] = {
+    MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
+
+  if (TFI->hasFP(*MF))
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntrFP : CalleeSavedRegsFP);
+  else
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntr : CalleeSavedRegs);
+
+}
+
+BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // Mark 4 special registers with subregisters as reserved.
+  Reserved.set(MSP430::PCB);
+  Reserved.set(MSP430::SPB);
+  Reserved.set(MSP430::SRB);
+  Reserved.set(MSP430::CGB);
+  Reserved.set(MSP430::PCW);
+  Reserved.set(MSP430::SPW);
+  Reserved.set(MSP430::SRW);
+  Reserved.set(MSP430::CGW);
+
+  // Mark frame pointer as reserved if needed.
+  if (TFI->hasFP(MF)) {
+    Reserved.set(MSP430::FPB);
+    Reserved.set(MSP430::FPW);
+  }
+
+  return Reserved;
+}
+
+const TargetRegisterClass *
+MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &MSP430::GR16RegClass;
+}
+
+void
+MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW);
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  // Skip the saved PC
+  Offset += 2;
+
+  if (!TFI->hasFP(MF))
+    Offset += MF.getFrameInfo()->getStackSize();
+  else
+    Offset += 2; // Skip the saved FPW
+
+  // Fold imm into offset
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  if (MI.getOpcode() == MSP430::ADD16ri) {
+    // This is actually "load effective address" of the stack slot
+    // instruction. We have only two-address instructions, thus we need to
+    // expand it into mov + add
+    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+    MI.setDesc(TII.get(MSP430::MOV16rr));
+    MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+    if (Offset == 0)
+      return;
+
+    // We need to materialize the offset via add instruction.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if (Offset < 0)
+      BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+        .addReg(DstReg).addImm(-Offset);
+    else
+      BuildMI(MBB, std::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+        .addReg(DstReg).addImm(Offset);
+
+    return;
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  return TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
new file mode 100644
index 0000000..cb01961
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -0,0 +1,47 @@
+//===-- MSP430RegisterInfo.h - MSP430 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430REGISTERINFO_H
+#define LLVM_TARGET_MSP430REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MSP430GenRegisterInfo.inc"
+
+namespace llvm {
+
+struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
+public:
+  MSP430RegisterInfo();
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  const TargetRegisterClass*
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
new file mode 100644
index 0000000..4010781
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -0,0 +1,81 @@
+//===-- MSP430RegisterInfo.td - MSP430 Register defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MSP430 register file
+//===----------------------------------------------------------------------===//
+
+class MSP430Reg<bits<4> num, string n> : Register<n> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs> 
+  : RegisterWithSubRegs<n, subregs> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+def PCB  : MSP430Reg<0,  "r0">;
+def SPB  : MSP430Reg<1,  "r1">;
+def SRB  : MSP430Reg<2,  "r2">;
+def CGB  : MSP430Reg<3,  "r3">;
+def FPB  : MSP430Reg<4,  "r4">;
+def R5B  : MSP430Reg<5,  "r5">;
+def R6B  : MSP430Reg<6,  "r6">;
+def R7B  : MSP430Reg<7,  "r7">;
+def R8B  : MSP430Reg<8,  "r8">;
+def R9B  : MSP430Reg<9,  "r9">;
+def R10B : MSP430Reg<10, "r10">;
+def R11B : MSP430Reg<11, "r11">;
+def R12B : MSP430Reg<12, "r12">;
+def R13B : MSP430Reg<13, "r13">;
+def R14B : MSP430Reg<14, "r14">;
+def R15B : MSP430Reg<15, "r15">;
+
+def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
+
+let SubRegIndices = [subreg_8bit] in {
+def PCW  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
+def SPW  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
+def SRW  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
+def CGW  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
+def FPW  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def R5W  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
+def R6W  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
+def R7W  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
+def R8W  : MSP430RegWithSubregs<8,  "r8",  [R8B]>;
+def R9W  : MSP430RegWithSubregs<9,  "r9",  [R9B]>;
+def R10W : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11W : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12W : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13W : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14W : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>;
+}
+
+def GR8 : RegisterClass<"MSP430", [i8], 8,
+   // Volatile registers
+  (add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+   // Frame pointer, sometimes allocable
+   FPB,
+   // Volatile, but not allocable
+   PCB, SPB, SRB, CGB)>;
+
+def GR16 : RegisterClass<"MSP430", [i16], 16,
+   // Volatile registers
+  (add R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
+   // Frame pointer, sometimes allocable
+   FPW,
+   // Volatile, but not allocable
+   PCW, SPW, SRW, CGW)>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
new file mode 100644
index 0000000..3897ef6
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MSP430SelectionDAGInfo.cpp - MSP430 SelectionDAG Info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-selectiondag-info"
+
+MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h
new file mode 100644
index 0000000..cb04adc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- MSP430SelectionDAGInfo.h - MSP430 SelectionDAG Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MSP430 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430SELECTIONDAGINFO_H
+#define MSP430SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class MSP430TargetMachine;
+
+class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit MSP430SelectionDAGInfo(const DataLayout &DL);
+  ~MSP430SelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
new file mode 100644
index 0000000..dbddc52
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -0,0 +1,39 @@
+//===-- MSP430Subtarget.cpp - MSP430 Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430Subtarget.h"
+#include "MSP430.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MSP430GenSubtargetInfo.inc"
+
+void MSP430Subtarget::anchor() { }
+
+MSP430Subtarget &MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  ParseSubtargetFeatures("generic", FS);
+  return *this;
+}
+
+MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &CPU,
+                                 const std::string &FS, const TargetMachine &TM)
+    : MSP430GenSubtargetInfo(TT, CPU, FS),
+      // FIXME: Check DataLayout string.
+      DL("e-m:e-p:16:16-i32:16:32-n8:16"), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+      TSInfo(DL) {}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
new file mode 100644
index 0000000..0152ad1
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
@@ -0,0 +1,65 @@
+//===-- MSP430Subtarget.h - Define Subtarget for the MSP430 ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_SUBTARGET_H
+#define LLVM_TARGET_MSP430_SUBTARGET_H
+
+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430SelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MSP430GenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MSP430Subtarget : public MSP430GenSubtargetInfo {
+  virtual void anchor();
+  bool ExtendedInsts;
+  const DataLayout DL; // Calculates type size & alignment
+  MSP430FrameLowering FrameLowering;
+  MSP430InstrInfo InstrInfo;
+  MSP430TargetLowering TLInfo;
+  MSP430SelectionDAGInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  MSP430Subtarget(const std::string &TT, const std::string &CPU,
+                  const std::string &FS, const TargetMachine &TM);
+
+  MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const MSP430TargetLowering *getTargetLowering() const { return &TLInfo; }
+  const MSP430SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+};
+} // End llvm namespace
+
+#endif  // LLVM_TARGET_MSP430_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
new file mode 100644
index 0000000..5ca36f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -0,0 +1,67 @@
+//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430TargetMachine.h"
+#include "MSP430.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMSP430Target() {
+  // Register the target.
+  RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
+}
+
+MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Reloc::Model RM, CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+
+namespace {
+/// MSP430 Code Generator Pass Configuration Options.
+class MSP430PassConfig : public TargetPassConfig {
+public:
+  MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  MSP430TargetMachine &getMSP430TargetMachine() const {
+    return getTM<MSP430TargetMachine>();
+  }
+
+  bool addInstSelector() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MSP430PassConfig(this, PM);
+}
+
+bool MSP430PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool MSP430PassConfig::addPreEmitPass() {
+  // Must run branch selection immediately preceding the asm printer.
+  addPass(createMSP430BranchSelectionPass());
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
new file mode 100644
index 0000000..efa8403
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
@@ -0,0 +1,61 @@
+//===-- MSP430TargetMachine.h - Define TargetMachine for MSP430 -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H
+#define LLVM_TARGET_MSP430_TARGETMACHINE_H
+
+#include "MSP430Subtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// MSP430TargetMachine
+///
+class MSP430TargetMachine : public LLVMTargetMachine {
+  MSP430Subtarget        Subtarget;
+
+public:
+  MSP430TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const MSP430InstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const MSP430Subtarget *getSubtargetImpl() const override {
+    return &Subtarget;
+  }
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const MSP430TargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+}; // MSP430TargetMachine.
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430_TARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
new file mode 100644
index 0000000..0d71d04
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMSP430Target;
+
+extern "C" void LLVMInitializeMSP430TargetInfo() { 
+  RegisterTarget<Triple::msp430> 
+    X(TheMSP430Target, "msp430", "MSP430 [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
new file mode 100644
index 0000000..53b30f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -0,0 +1,3020 @@
+//===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-asm-parser"
+
+namespace llvm {
+class MCInstrInfo;
+}
+
+namespace {
+class MipsAssemblerOptions {
+public:
+  MipsAssemblerOptions() : aTReg(1), reorder(true), macro(true) {}
+
+  unsigned getATRegNum() { return aTReg; }
+  bool setATReg(unsigned Reg);
+
+  bool isReorder() { return reorder; }
+  void setReorder() { reorder = true; }
+  void setNoreorder() { reorder = false; }
+
+  bool isMacro() { return macro; }
+  void setMacro() { macro = true; }
+  void setNomacro() { macro = false; }
+
+private:
+  unsigned aTReg;
+  bool reorder;
+  bool macro;
+};
+}
+
+namespace {
+class MipsAsmParser : public MCTargetAsmParser {
+  MipsTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+    return static_cast<MipsTargetStreamer &>(TS);
+  }
+
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  MipsAssemblerOptions Options;
+
+#define GET_ASSEMBLER_HEADER
+#include "MipsGenAsmMatcher.inc"
+
+  unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  /// Parse a register as used in CFI directives
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool ParseParenSuffix(StringRef Name, OperandVector &Operands);
+
+  bool ParseBracketSuffix(StringRef Name, OperandVector &Operands);
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  MatchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+                                    StringRef Identifier, SMLoc S);
+
+  MipsAsmParser::OperandMatchResultTy
+  MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S);
+
+  MipsAsmParser::OperandMatchResultTy ParseAnyRegister(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy ParseImm(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy ParseJumpTarget(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy ParseLSAImm(OperandVector &Operands);
+
+  bool searchSymbolAlias(OperandVector &Operands);
+
+  bool ParseOperand(OperandVector &, StringRef Mnemonic);
+
+  bool needsExpansion(MCInst &Inst);
+
+  // Expands assembly pseudo instructions.
+  // Returns false on success, true otherwise.
+  bool expandInstruction(MCInst &Inst, SMLoc IDLoc,
+                         SmallVectorImpl<MCInst> &Instructions);
+
+  bool expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+                     SmallVectorImpl<MCInst> &Instructions);
+
+  bool expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+                            SmallVectorImpl<MCInst> &Instructions);
+
+  bool expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+                            SmallVectorImpl<MCInst> &Instructions);
+
+  void expandMemInst(MCInst &Inst, SMLoc IDLoc,
+                     SmallVectorImpl<MCInst> &Instructions, bool isLoad,
+                     bool isImmOpnd);
+  bool reportParseError(Twine ErrorMsg);
+  bool reportParseError(SMLoc Loc, Twine ErrorMsg);
+
+  bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
+  bool parseRelocOperand(const MCExpr *&Res);
+
+  const MCExpr *evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
+
+  bool isEvaluated(const MCExpr *Expr);
+  bool parseSetFeature(uint64_t Feature);
+  bool parseDirectiveCPLoad(SMLoc Loc);
+  bool parseDirectiveCPSetup();
+  bool parseDirectiveNaN();
+  bool parseDirectiveSet();
+  bool parseDirectiveOption();
+
+  bool parseSetAtDirective();
+  bool parseSetNoAtDirective();
+  bool parseSetMacroDirective();
+  bool parseSetNoMacroDirective();
+  bool parseSetReorderDirective();
+  bool parseSetNoReorderDirective();
+  bool parseSetNoMips16Directive();
+  bool parseSetFpDirective();
+
+  bool parseSetAssignment();
+
+  bool parseDataDirective(unsigned Size, SMLoc L);
+  bool parseDirectiveGpWord();
+  bool parseDirectiveGpDWord();
+  bool parseDirectiveModule();
+  bool parseDirectiveModuleFP();
+  bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+                       StringRef Directive);
+
+  MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
+
+  bool eatComma(StringRef ErrorStr);
+
+  int matchCPURegisterName(StringRef Symbol);
+
+  int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
+
+  int matchFPURegisterName(StringRef Name);
+
+  int matchFCCRegisterName(StringRef Name);
+
+  int matchACRegisterName(StringRef Name);
+
+  int matchMSA128RegisterName(StringRef Name);
+
+  int matchMSA128CtrlRegisterName(StringRef Name);
+
+  unsigned getReg(int RC, int RegNo);
+
+  unsigned getGPR(int RegNo);
+
+  int getATReg(SMLoc Loc);
+
+  bool processInstruction(MCInst &Inst, SMLoc IDLoc,
+                          SmallVectorImpl<MCInst> &Instructions);
+
+  // Helper function that checks if the value of a vector index is within the
+  // boundaries of accepted values for each RegisterKind
+  // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
+  bool validateMSAIndex(int Val, int RegKind);
+
+  void setFeatureBits(unsigned Feature, StringRef FeatureString) {
+    if (!(STI.getFeatureBits() & Feature)) {
+      setAvailableFeatures(
+          ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+    }
+  }
+
+  void clearFeatureBits(unsigned Feature, StringRef FeatureString) {
+    if (STI.getFeatureBits() & Feature) {
+      setAvailableFeatures(
+          ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+    }
+  }
+
+public:
+  enum MipsMatchResultTy {
+    Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "MipsGenAsmMatcher.inc"
+#undef GET_OPERAND_DIAGNOSTIC_TYPES
+
+  };
+
+  MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), Parser(parser) {
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+    getTargetStreamer().updateABIInfo(*this);
+
+    // Assert exactly one ABI was chosen.
+    assert((((STI.getFeatureBits() & Mips::FeatureO32) != 0) +
+            ((STI.getFeatureBits() & Mips::FeatureEABI) != 0) +
+            ((STI.getFeatureBits() & Mips::FeatureN32) != 0) +
+            ((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
+
+    if (!isABI_O32() && !useOddSPReg() != 0)
+      report_fatal_error("-mno-odd-spreg requires the O32 ABI");
+  }
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  /// True if all of $fcc0 - $fcc7 exist for the current ISA.
+  bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
+
+  bool isGP64bit() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; }
+  bool isFP64bit() const { return STI.getFeatureBits() & Mips::FeatureFP64Bit; }
+  bool isABI_N32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
+  bool isABI_N64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
+  bool isABI_O32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
+  bool isABI_FPXX() const { return STI.getFeatureBits() & Mips::FeatureFPXX; }
+
+  bool useOddSPReg() const {
+    return !(STI.getFeatureBits() & Mips::FeatureNoOddSPReg);
+  }
+
+  bool inMicroMipsMode() const {
+    return STI.getFeatureBits() & Mips::FeatureMicroMips;
+  }
+  bool hasMips1() const { return STI.getFeatureBits() & Mips::FeatureMips1; }
+  bool hasMips2() const { return STI.getFeatureBits() & Mips::FeatureMips2; }
+  bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; }
+  bool hasMips4() const { return STI.getFeatureBits() & Mips::FeatureMips4; }
+  bool hasMips5() const { return STI.getFeatureBits() & Mips::FeatureMips5; }
+  bool hasMips32() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips32);
+  }
+  bool hasMips64() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64);
+  }
+  bool hasMips32r2() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips32r2);
+  }
+  bool hasMips64r2() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64r2);
+  }
+  bool hasMips32r6() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips32r6);
+  }
+  bool hasMips64r6() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64r6);
+  }
+  bool hasDSP() const { return (STI.getFeatureBits() & Mips::FeatureDSP); }
+  bool hasDSPR2() const { return (STI.getFeatureBits() & Mips::FeatureDSPR2); }
+  bool hasMSA() const { return (STI.getFeatureBits() & Mips::FeatureMSA); }
+
+  bool inMips16Mode() const {
+    return STI.getFeatureBits() & Mips::FeatureMips16;
+  }
+  // TODO: see how can we get this info.
+  bool abiUsesSoftFloat() const { return false; }
+
+  /// Warn if RegNo is the current assembler temporary.
+  void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc);
+};
+}
+
+namespace {
+
+/// MipsOperand - Instances of this class represent a parsed Mips machine
+/// instruction.
+class MipsOperand : public MCParsedAsmOperand {
+public:
+  /// Broad categories of register classes
+  /// The exact class is finalized by the render method.
+  enum RegKind {
+    RegKind_GPR = 1,      /// GPR32 and GPR64 (depending on isGP64bit())
+    RegKind_FGR = 2,      /// FGR32, FGR64, AFGR64 (depending on context and
+                          /// isFP64bit())
+    RegKind_FCC = 4,      /// FCC
+    RegKind_MSA128 = 8,   /// MSA128[BHWD] (makes no difference which)
+    RegKind_MSACtrl = 16, /// MSA control registers
+    RegKind_COP2 = 32,    /// COP2
+    RegKind_ACC = 64,     /// HI32DSP, LO32DSP, and ACC64DSP (depending on
+                          /// context).
+    RegKind_CCR = 128,    /// CCR
+    RegKind_HWRegs = 256, /// HWRegs
+    RegKind_COP3 = 512,   /// COP3
+
+    /// Potentially any (e.g. $1)
+    RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 |
+                      RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC |
+                      RegKind_CCR | RegKind_HWRegs | RegKind_COP3
+  };
+
+private:
+  enum KindTy {
+    k_Immediate,     /// An immediate (possibly involving symbol references)
+    k_Memory,        /// Base + Offset Memory Address
+    k_PhysRegister,  /// A physical register from the Mips namespace
+    k_RegisterIndex, /// A register index in one or more RegKind.
+    k_Token          /// A simple token
+  } Kind;
+
+public:
+  MipsOperand(KindTy K, MipsAsmParser &Parser)
+      : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+
+private:
+  /// For diagnostics, and checking the assembler temporary
+  MipsAsmParser &AsmParser;
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct PhysRegOp {
+    unsigned Num; /// Register Number
+  };
+
+  struct RegIdxOp {
+    unsigned Index; /// Index into the register class
+    RegKind Kind;   /// Bitfield of the kinds it could possibly be
+    const MCRegisterInfo *RegInfo;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    MipsOperand *Base;
+    const MCExpr *Off;
+  };
+
+  union {
+    struct Token Tok;
+    struct PhysRegOp PhysReg;
+    struct RegIdxOp RegIdx;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  SMLoc StartLoc, EndLoc;
+
+  /// Internal constructor for register kinds
+  static std::unique_ptr<MipsOperand> CreateReg(unsigned Index, RegKind RegKind,
+                                                const MCRegisterInfo *RegInfo,
+                                                SMLoc S, SMLoc E,
+                                                MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser);
+    Op->RegIdx.Index = Index;
+    Op->RegIdx.RegInfo = RegInfo;
+    Op->RegIdx.Kind = RegKind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+public:
+  /// Coerce the register to GPR32 and return the real register for the current
+  /// target.
+  unsigned getGPR32Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+    AsmParser.WarnIfAssemblerTemporary(RegIdx.Index, StartLoc);
+    unsigned ClassID = Mips::GPR32RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to GPR64 and return the real register for the current
+  /// target.
+  unsigned getGPR64Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+    unsigned ClassID = Mips::GPR64RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+private:
+  /// Coerce the register to AFGR64 and return the real register for the current
+  /// target.
+  unsigned getAFGR64Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    if (RegIdx.Index % 2 != 0)
+      AsmParser.Warning(StartLoc, "Float register should be even.");
+    return RegIdx.RegInfo->getRegClass(Mips::AFGR64RegClassID)
+        .getRegister(RegIdx.Index / 2);
+  }
+
+  /// Coerce the register to FGR64 and return the real register for the current
+  /// target.
+  unsigned getFGR64Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FGR64RegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to FGR32 and return the real register for the current
+  /// target.
+  unsigned getFGR32Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FGR32RegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to FGRH32 and return the real register for the current
+  /// target.
+  unsigned getFGRH32Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FGRH32RegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to FCC and return the real register for the current
+  /// target.
+  unsigned getFCCReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FCC) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FCCRegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to MSA128 and return the real register for the current
+  /// target.
+  unsigned getMSA128Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_MSA128) && "Invalid access!");
+    // It doesn't matter which of the MSA128[BHWD] classes we use. They are all
+    // identical
+    unsigned ClassID = Mips::MSA128BRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to MSACtrl and return the real register for the
+  /// current target.
+  unsigned getMSACtrlReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_MSACtrl) && "Invalid access!");
+    unsigned ClassID = Mips::MSACtrlRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to COP2 and return the real register for the
+  /// current target.
+  unsigned getCOP2Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_COP2) && "Invalid access!");
+    unsigned ClassID = Mips::COP2RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to COP3 and return the real register for the
+  /// current target.
+  unsigned getCOP3Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!");
+    unsigned ClassID = Mips::COP3RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to ACC64DSP and return the real register for the
+  /// current target.
+  unsigned getACC64DSPReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+    unsigned ClassID = Mips::ACC64DSPRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to HI32DSP and return the real register for the
+  /// current target.
+  unsigned getHI32DSPReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+    unsigned ClassID = Mips::HI32DSPRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to LO32DSP and return the real register for the
+  /// current target.
+  unsigned getLO32DSPReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+    unsigned ClassID = Mips::LO32DSPRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to CCR and return the real register for the
+  /// current target.
+  unsigned getCCRReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_CCR) && "Invalid access!");
+    unsigned ClassID = Mips::CCRRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to HWRegs and return the real register for the
+  /// current target.
+  unsigned getHWRegsReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_HWRegs) && "Invalid access!");
+    unsigned ClassID = Mips::HWRegsRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+public:
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("Use a custom parser instead");
+  }
+
+  /// Render the operand to an MCInst as a GPR32
+  /// Asserts if the wrong number of operands are requested, or the operand
+  /// is not a k_RegisterIndex compatible with RegKind_GPR
+  void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getGPR32Reg()));
+  }
+
+  /// Render the operand to an MCInst as a GPR64
+  /// Asserts if the wrong number of operands are requested, or the operand
+  /// is not a k_RegisterIndex compatible with RegKind_GPR
+  void addGPR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getGPR64Reg()));
+  }
+
+  void addAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getAFGR64Reg()));
+  }
+
+  void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getFGR64Reg()));
+  }
+
+  void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getFGR32Reg()));
+    // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
+    if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
+      AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
+                                "registers");
+  }
+
+  void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getFGRH32Reg()));
+  }
+
+  void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getFCCReg()));
+  }
+
+  void addMSA128AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMSA128Reg()));
+  }
+
+  void addMSACtrlAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMSACtrlReg()));
+  }
+
+  void addCOP2AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getCOP2Reg()));
+  }
+
+  void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getCOP3Reg()));
+  }
+
+  void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getACC64DSPReg()));
+  }
+
+  void addHI32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getHI32DSPReg()));
+  }
+
+  void addLO32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getLO32DSPReg()));
+  }
+
+  void addCCRAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getCCRReg()));
+  }
+
+  void addHWRegsAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getHWRegsReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()->getGPR32Reg()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst, Expr);
+  }
+
+  bool isReg() const override {
+    // As a special case until we sort out the definition of div/divu, pretend
+    // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
+    if (isGPRAsmReg() && RegIdx.Index == 0)
+      return true;
+
+    return Kind == k_PhysRegister;
+  }
+  bool isRegIdx() const { return Kind == k_RegisterIndex; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isConstantImm() const {
+    return isImm() && dyn_cast<MCConstantExpr>(getImm());
+  }
+  bool isToken() const override {
+    // Note: It's not possible to pretend that other operand kinds are tokens.
+    // The matcher emitter checks tokens first.
+    return Kind == k_Token;
+  }
+  bool isMem() const override { return Kind == k_Memory; }
+  bool isConstantMemOff() const {
+    return isMem() && dyn_cast<MCConstantExpr>(getMemOff());
+  }
+  template <unsigned Bits> bool isMemWithSimmOffset() const {
+    return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff());
+  }
+  bool isInvNum() const { return Kind == k_Immediate; }
+  bool isLSAImm() const {
+    if (!isConstantImm())
+      return false;
+    int64_t Val = getConstantImm();
+    return 1 <= Val && Val <= 4;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    // As a special case until we sort out the definition of div/divu, pretend
+    // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
+    if (Kind == k_RegisterIndex && RegIdx.Index == 0 &&
+        RegIdx.Kind & RegKind_GPR)
+      return getGPR32Reg(); // FIXME: GPR64 too
+
+    assert(Kind == k_PhysRegister && "Invalid access!");
+    return PhysReg.Num;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  int64_t getConstantImm() const {
+    const MCExpr *Val = getImm();
+    return static_cast<const MCConstantExpr *>(Val)->getValue();
+  }
+
+  MipsOperand *getMemBase() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Base;
+  }
+
+  const MCExpr *getMemOff() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Off;
+  }
+
+  int64_t getConstantMemOff() const {
+    return static_cast<const MCConstantExpr *>(getMemOff())->getValue();
+  }
+
+  static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
+                                                  MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_Token, Parser);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  /// Create a numeric register (e.g. $1). The exact register remains
+  /// unresolved until an instruction successfully matches
+  static std::unique_ptr<MipsOperand>
+  CreateNumericReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+                   SMLoc E, MipsAsmParser &Parser) {
+    DEBUG(dbgs() << "CreateNumericReg(" << Index << ", ...)\n");
+    return CreateReg(Index, RegKind_Numeric, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely a GPR.
+  /// This is typically only used for named registers such as $gp.
+  static std::unique_ptr<MipsOperand>
+  CreateGPRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+               MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_GPR, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely a FGR.
+  /// This is typically only used for named registers such as $f0.
+  static std::unique_ptr<MipsOperand>
+  CreateFGRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+               MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_FGR, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an FCC.
+  /// This is typically only used for named registers such as $fcc0.
+  static std::unique_ptr<MipsOperand>
+  CreateFCCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+               MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_FCC, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an ACC.
+  /// This is typically only used for named registers such as $ac0.
+  static std::unique_ptr<MipsOperand>
+  CreateACCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+               MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_ACC, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an MSA128.
+  /// This is typically only used for named registers such as $w0.
+  static std::unique_ptr<MipsOperand>
+  CreateMSA128Reg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+                  SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_MSA128, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an MSACtrl.
+  /// This is typically only used for named registers such as $msaaccess.
+  static std::unique_ptr<MipsOperand>
+  CreateMSACtrlReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+                   SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_MSACtrl, RegInfo, S, E, Parser);
+  }
+
+  static std::unique_ptr<MipsOperand>
+  CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_Immediate, Parser);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<MipsOperand>
+  CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
+            SMLoc E, MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_Memory, Parser);
+    Op->Mem.Base = Base.release();
+    Op->Mem.Off = Off;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  bool isGPRAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
+  }
+  bool isFGRAsmReg() const {
+    // AFGR64 is $0-$15 but we handle this in getAFGR64()
+    return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
+  }
+  bool isHWRegsAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
+  }
+  bool isCCRAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
+  }
+  bool isFCCAsmReg() const {
+    if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC))
+      return false;
+    if (!AsmParser.hasEightFccRegisters())
+      return RegIdx.Index == 0;
+    return RegIdx.Index <= 7;
+  }
+  bool isACCAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
+  }
+  bool isCOP2AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
+  }
+  bool isCOP3AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31;
+  }
+  bool isMSA128AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
+  }
+  bool isMSACtrlAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  virtual ~MipsOperand() {
+    switch (Kind) {
+    case k_Immediate:
+      break;
+    case k_Memory:
+      delete Mem.Base;
+      break;
+    case k_PhysRegister:
+    case k_RegisterIndex:
+    case k_Token:
+      break;
+    }
+  }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case k_Immediate:
+      OS << "Imm<";
+      Imm.Val->print(OS);
+      OS << ">";
+      break;
+    case k_Memory:
+      OS << "Mem<";
+      Mem.Base->print(OS);
+      OS << ", ";
+      Mem.Off->print(OS);
+      OS << ">";
+      break;
+    case k_PhysRegister:
+      OS << "PhysReg<" << PhysReg.Num << ">";
+      break;
+    case k_RegisterIndex:
+      OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ">";
+      break;
+    case k_Token:
+      OS << Tok.Data;
+      break;
+    }
+  }
+}; // class MipsOperand
+} // namespace
+
+namespace llvm {
+extern const MCInstrDesc MipsInsts[];
+}
+static const MCInstrDesc &getInstDesc(unsigned Opcode) {
+  return MipsInsts[Opcode];
+}
+
+bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+                                       SmallVectorImpl<MCInst> &Instructions) {
+  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+
+  Inst.setLoc(IDLoc);
+
+  if (MCID.isBranch() || MCID.isCall()) {
+    const unsigned Opcode = Inst.getOpcode();
+    MCOperand Offset;
+
+    switch (Opcode) {
+    default:
+      break;
+    case Mips::BEQ:
+    case Mips::BNE:
+    case Mips::BEQ_MM:
+    case Mips::BNE_MM:
+      assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+      Offset = Inst.getOperand(2);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(),
+                            1LL << (inMicroMipsMode() ? 1 : 2)))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    case Mips::BGEZ:
+    case Mips::BGTZ:
+    case Mips::BLEZ:
+    case Mips::BLTZ:
+    case Mips::BGEZAL:
+    case Mips::BLTZAL:
+    case Mips::BC1F:
+    case Mips::BC1T:
+    case Mips::BGEZ_MM:
+    case Mips::BGTZ_MM:
+    case Mips::BLEZ_MM:
+    case Mips::BLTZ_MM:
+    case Mips::BGEZAL_MM:
+    case Mips::BLTZAL_MM:
+    case Mips::BC1F_MM:
+    case Mips::BC1T_MM:
+      assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+      Offset = Inst.getOperand(1);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(),
+                            1LL << (inMicroMipsMode() ? 1 : 2)))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    }
+  }
+
+  // SSNOP is deprecated on MIPS32r6/MIPS64r6
+  // We still accept it but it is a normal nop.
+  if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) {
+    std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+    Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a "
+                                                      "nop instruction");
+  }
+
+  if (MCID.hasDelaySlot() && Options.isReorder()) {
+    // If this instruction has a delay slot and .set reorder is active,
+    // emit a NOP after it.
+    Instructions.push_back(Inst);
+    MCInst NopInst;
+    NopInst.setOpcode(Mips::SLL);
+    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    NopInst.addOperand(MCOperand::CreateImm(0));
+    Instructions.push_back(NopInst);
+    return false;
+  }
+
+  if (MCID.mayLoad() || MCID.mayStore()) {
+    // Check the offset of memory operand, if it is a symbol
+    // reference or immediate we may have to expand instructions.
+    for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
+      const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+          (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+        MCOperand &Op = Inst.getOperand(i);
+        if (Op.isImm()) {
+          int MemOffset = Op.getImm();
+          if (MemOffset < -32768 || MemOffset > 32767) {
+            // Offset can't exceed 16bit value.
+            expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), true);
+            return false;
+          }
+        } else if (Op.isExpr()) {
+          const MCExpr *Expr = Op.getExpr();
+          if (Expr->getKind() == MCExpr::SymbolRef) {
+            const MCSymbolRefExpr *SR =
+                static_cast<const MCSymbolRefExpr *>(Expr);
+            if (SR->getKind() == MCSymbolRefExpr::VK_None) {
+              // Expand symbol.
+              expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
+              return false;
+            }
+          } else if (!isEvaluated(Expr)) {
+            expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
+            return false;
+          }
+        }
+      }
+    } // for
+  }   // if load/store
+
+  if (needsExpansion(Inst))
+    return expandInstruction(Inst, IDLoc, Instructions);
+  else
+    Instructions.push_back(Inst);
+
+  return false;
+}
+
+bool MipsAsmParser::needsExpansion(MCInst &Inst) {
+
+  switch (Inst.getOpcode()) {
+  case Mips::LoadImm32Reg:
+  case Mips::LoadAddr32Imm:
+  case Mips::LoadAddr32Reg:
+  case Mips::LoadImm64Reg:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
+                                      SmallVectorImpl<MCInst> &Instructions) {
+  switch (Inst.getOpcode()) {
+  default:
+    assert(0 && "unimplemented expansion");
+    return true;
+  case Mips::LoadImm32Reg:
+    return expandLoadImm(Inst, IDLoc, Instructions);
+  case Mips::LoadImm64Reg:
+    if (!isGP64bit()) {
+      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      return true;
+    }
+    return expandLoadImm(Inst, IDLoc, Instructions);
+  case Mips::LoadAddr32Imm:
+    return expandLoadAddressImm(Inst, IDLoc, Instructions);
+  case Mips::LoadAddr32Reg:
+    return expandLoadAddressReg(Inst, IDLoc, Instructions);
+  }
+}
+
+namespace {
+template <int Shift, bool PerformShift>
+void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc,
+                   SmallVectorImpl<MCInst> &Instructions) {
+  MCInst tmpInst;
+  if (PerformShift) {
+    tmpInst.setOpcode(Mips::DSLL);
+    tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+    tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+    tmpInst.addOperand(MCOperand::CreateImm(16));
+    tmpInst.setLoc(IDLoc);
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+  }
+  tmpInst.setOpcode(Mips::ORi);
+  tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+  tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+  tmpInst.addOperand(
+      MCOperand::CreateImm(((Value & (0xffffLL << Shift)) >> Shift)));
+  tmpInst.setLoc(IDLoc);
+  Instructions.push_back(tmpInst);
+}
+}
+
+bool MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+                                  SmallVectorImpl<MCInst> &Instructions) {
+  MCInst tmpInst;
+  const MCOperand &ImmOp = Inst.getOperand(1);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &RegOp = Inst.getOperand(0);
+  assert(RegOp.isReg() && "expected register operand kind");
+
+  int64_t ImmValue = ImmOp.getImm();
+  tmpInst.setLoc(IDLoc);
+  // FIXME: gas has a special case for values that are 000...1111, which
+  // becomes a li -1 and then a dsrl
+  if (0 <= ImmValue && ImmValue <= 65535) {
+    // For 0 <= j <= 65535.
+    // li d,j => ori d,$zero,j
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else if (ImmValue < 0 && ImmValue >= -32768) {
+    // For -32768 <= j < 0.
+    // li d,j => addiu d,$zero,j
+    tmpInst.setOpcode(Mips::ADDiu);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else if ((ImmValue & 0xffffffff) == ImmValue) {
+    // For any value of j that is representable as a 32-bit integer, create
+    // a sequence of:
+    // li d,j => lui d,hi16(j)
+    //           ori d,d,lo16(j)
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+    Instructions.push_back(tmpInst);
+    createShiftOr<0, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+  } else if ((ImmValue & (0xffffLL << 48)) == 0) {
+    if (!isGP64bit()) {
+      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      return true;
+    }
+
+    //            <-------  lo32 ------>
+    // <-------  hi32 ------>
+    // <- hi16 ->             <- lo16 ->
+    //  _________________________________
+    // |          |          |          |
+    // | 16-bytes | 16-bytes | 16-bytes |
+    // |__________|__________|__________|
+    //
+    // For any value of j that is representable as a 48-bit integer, create
+    // a sequence of:
+    // li d,j => lui d,hi16(j)
+    //           ori d,d,hi16(lo32(j))
+    //           dsll d,d,16
+    //           ori d,d,lo16(lo32(j))
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(
+        MCOperand::CreateImm((ImmValue & (0xffffLL << 32)) >> 32));
+    Instructions.push_back(tmpInst);
+    createShiftOr<16, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+    createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+  } else {
+    if (!isGP64bit()) {
+      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      return true;
+    }
+
+    // <-------  hi32 ------> <-------  lo32 ------>
+    // <- hi16 ->                        <- lo16 ->
+    //  ___________________________________________
+    // |          |          |          |          |
+    // | 16-bytes | 16-bytes | 16-bytes | 16-bytes |
+    // |__________|__________|__________|__________|
+    //
+    // For any value of j that isn't representable as a 48-bit integer.
+    // li d,j => lui d,hi16(j)
+    //           ori d,d,lo16(hi32(j))
+    //           dsll d,d,16
+    //           ori d,d,hi16(lo32(j))
+    //           dsll d,d,16
+    //           ori d,d,lo16(lo32(j))
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(
+        MCOperand::CreateImm((ImmValue & (0xffffLL << 48)) >> 48));
+    Instructions.push_back(tmpInst);
+    createShiftOr<32, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+    createShiftOr<16, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+    createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+  }
+  return false;
+}
+
+bool
+MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
+  MCInst tmpInst;
+  const MCOperand &ImmOp = Inst.getOperand(2);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &SrcRegOp = Inst.getOperand(1);
+  assert(SrcRegOp.isReg() && "expected register operand kind");
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+  int ImmValue = ImmOp.getImm();
+  if (-32768 <= ImmValue && ImmValue <= 65535) {
+    // For -32768 <= j <= 65535.
+    // la d,j(s) => addiu d,s,j
+    tmpInst.setOpcode(Mips::ADDiu);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else {
+    // For any other value of j that is representable as a 32-bit integer.
+    // la d,j(s) => lui d,hi16(j)
+    //              ori d,d,lo16(j)
+    //              addu d,d,s
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ADDu);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
+    Instructions.push_back(tmpInst);
+  }
+  return false;
+}
+
+bool
+MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
+  MCInst tmpInst;
+  const MCOperand &ImmOp = Inst.getOperand(1);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &RegOp = Inst.getOperand(0);
+  assert(RegOp.isReg() && "expected register operand kind");
+  int ImmValue = ImmOp.getImm();
+  if (-32768 <= ImmValue && ImmValue <= 65535) {
+    // For -32768 <= j <= 65535.
+    // la d,j => addiu d,$zero,j
+    tmpInst.setOpcode(Mips::ADDiu);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else {
+    // For any other value of j that is representable as a 32-bit integer.
+    // la d,j => lui d,hi16(j)
+    //           ori d,d,lo16(j)
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+    Instructions.push_back(tmpInst);
+  }
+  return false;
+}
+
+void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
+                                  SmallVectorImpl<MCInst> &Instructions,
+                                  bool isLoad, bool isImmOpnd) {
+  const MCSymbolRefExpr *SR;
+  MCInst TempInst;
+  unsigned ImmOffset, HiOffset, LoOffset;
+  const MCExpr *ExprOffset;
+  unsigned TmpRegNum;
+  // 1st operand is either the source or destination register.
+  assert(Inst.getOperand(0).isReg() && "expected register operand kind");
+  unsigned RegOpNum = Inst.getOperand(0).getReg();
+  // 2nd operand is the base register.
+  assert(Inst.getOperand(1).isReg() && "expected register operand kind");
+  unsigned BaseRegNum = Inst.getOperand(1).getReg();
+  // 3rd operand is either an immediate or expression.
+  if (isImmOpnd) {
+    assert(Inst.getOperand(2).isImm() && "expected immediate operand kind");
+    ImmOffset = Inst.getOperand(2).getImm();
+    LoOffset = ImmOffset & 0x0000ffff;
+    HiOffset = (ImmOffset & 0xffff0000) >> 16;
+    // If msb of LoOffset is 1(negative number) we must increment HiOffset.
+    if (LoOffset & 0x8000)
+      HiOffset++;
+  } else
+    ExprOffset = Inst.getOperand(2).getExpr();
+  // All instructions will have the same location.
+  TempInst.setLoc(IDLoc);
+  // These are some of the types of expansions we perform here:
+  // 1) lw $8, sym        => lui $8, %hi(sym)
+  //                         lw $8, %lo(sym)($8)
+  // 2) lw $8, offset($9) => lui $8, %hi(offset)
+  //                         add $8, $8, $9
+  //                         lw $8, %lo(offset)($9)
+  // 3) lw $8, offset($8) => lui $at, %hi(offset)
+  //                         add $at, $at, $8
+  //                         lw $8, %lo(offset)($at)
+  // 4) sw $8, sym        => lui $at, %hi(sym)
+  //                         sw $8, %lo(sym)($at)
+  // 5) sw $8, offset($8) => lui $at, %hi(offset)
+  //                         add $at, $at, $8
+  //                         sw $8, %lo(offset)($at)
+  // 6) ldc1 $f0, sym     => lui $at, %hi(sym)
+  //                         ldc1 $f0, %lo(sym)($at)
+  //
+  // For load instructions we can use the destination register as a temporary
+  // if base and dst are different (examples 1 and 2) and if the base register
+  // is general purpose otherwise we must use $at (example 6) and error if it's
+  // not available. For stores we must use $at (examples 4 and 5) because we
+  // must not clobber the source register setting up the offset.
+  const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
+  int16_t RegClassOp0 = Desc.OpInfo[0].RegClass;
+  unsigned RegClassIDOp0 =
+      getContext().getRegisterInfo()->getRegClass(RegClassOp0).getID();
+  bool IsGPR = (RegClassIDOp0 == Mips::GPR32RegClassID) ||
+               (RegClassIDOp0 == Mips::GPR64RegClassID);
+  if (isLoad && IsGPR && (BaseRegNum != RegOpNum))
+    TmpRegNum = RegOpNum;
+  else {
+    int AT = getATReg(IDLoc);
+    // At this point we need AT to perform the expansions and we exit if it is
+    // not available.
+    if (!AT)
+      return;
+    TmpRegNum = getReg(
+        (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, AT);
+  }
+
+  TempInst.setOpcode(Mips::LUi);
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  if (isImmOpnd)
+    TempInst.addOperand(MCOperand::CreateImm(HiOffset));
+  else {
+    if (ExprOffset->getKind() == MCExpr::SymbolRef) {
+      SR = static_cast<const MCSymbolRefExpr *>(ExprOffset);
+      const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+          SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_HI,
+          getContext());
+      TempInst.addOperand(MCOperand::CreateExpr(HiExpr));
+    } else {
+      const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi");
+      TempInst.addOperand(MCOperand::CreateExpr(HiExpr));
+    }
+  }
+  // Add the instruction to the list.
+  Instructions.push_back(TempInst);
+  // Prepare TempInst for next instruction.
+  TempInst.clear();
+  // Add temp register to base.
+  TempInst.setOpcode(Mips::ADDu);
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  TempInst.addOperand(MCOperand::CreateReg(BaseRegNum));
+  Instructions.push_back(TempInst);
+  TempInst.clear();
+  // And finally, create original instruction with low part
+  // of offset and new base.
+  TempInst.setOpcode(Inst.getOpcode());
+  TempInst.addOperand(MCOperand::CreateReg(RegOpNum));
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  if (isImmOpnd)
+    TempInst.addOperand(MCOperand::CreateImm(LoOffset));
+  else {
+    if (ExprOffset->getKind() == MCExpr::SymbolRef) {
+      const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+          SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_LO,
+          getContext());
+      TempInst.addOperand(MCOperand::CreateExpr(LoExpr));
+    } else {
+      const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo");
+      TempInst.addOperand(MCOperand::CreateExpr(LoExpr));
+    }
+  }
+  Instructions.push_back(TempInst);
+  TempInst.clear();
+}
+
+unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  // As described by the Mips32r2 spec, the registers Rd and Rs for
+  // jalr.hb must be different.
+  unsigned Opcode = Inst.getOpcode();
+
+  if (Opcode == Mips::JALR_HB &&
+      (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()))
+    return Match_RequiresDifferentSrcAndDst;
+
+  return Match_Success;
+}
+
+bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                            OperandVector &Operands,
+                                            MCStreamer &Out,
+                                            unsigned &ErrorInfo,
+                                            bool MatchingInlineAsm) {
+
+  MCInst Inst;
+  SmallVector<MCInst, 8> Instructions;
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  switch (MatchResult) {
+  default:
+    break;
+  case Match_Success: {
+    if (processInstruction(Inst, IDLoc, Instructions))
+      return true;
+    for (unsigned i = 0; i < Instructions.size(); i++)
+      Out.EmitInstruction(Instructions[i], STI);
+    return false;
+  }
+  case Match_MissingFeature:
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    return true;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+  case Match_RequiresDifferentSrcAndDst:
+    return Error(IDLoc, "source and destination must be different");
+  }
+  return true;
+}
+
+void MipsAsmParser::WarnIfAssemblerTemporary(int RegIndex, SMLoc Loc) {
+  if ((RegIndex != 0) && ((int)Options.getATRegNum() == RegIndex)) {
+    if (RegIndex == 1)
+      Warning(Loc, "Used $at without \".set noat\"");
+    else
+      Warning(Loc, Twine("Used $") + Twine(RegIndex) + " with \".set at=$" +
+                       Twine(RegIndex) + "\"");
+  }
+}
+
+int MipsAsmParser::matchCPURegisterName(StringRef Name) {
+  int CC;
+
+  CC = StringSwitch<unsigned>(Name)
+           .Case("zero", 0)
+           .Case("at", 1)
+           .Case("a0", 4)
+           .Case("a1", 5)
+           .Case("a2", 6)
+           .Case("a3", 7)
+           .Case("v0", 2)
+           .Case("v1", 3)
+           .Case("s0", 16)
+           .Case("s1", 17)
+           .Case("s2", 18)
+           .Case("s3", 19)
+           .Case("s4", 20)
+           .Case("s5", 21)
+           .Case("s6", 22)
+           .Case("s7", 23)
+           .Case("k0", 26)
+           .Case("k1", 27)
+           .Case("gp", 28)
+           .Case("sp", 29)
+           .Case("fp", 30)
+           .Case("s8", 30)
+           .Case("ra", 31)
+           .Case("t0", 8)
+           .Case("t1", 9)
+           .Case("t2", 10)
+           .Case("t3", 11)
+           .Case("t4", 12)
+           .Case("t5", 13)
+           .Case("t6", 14)
+           .Case("t7", 15)
+           .Case("t8", 24)
+           .Case("t9", 25)
+           .Default(-1);
+
+  if (isABI_N32() || isABI_N64()) {
+    // Although SGI documentation just cuts out t0-t3 for n32/n64,
+    // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
+    // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
+    if (8 <= CC && CC <= 11)
+      CC += 4;
+
+    if (CC == -1)
+      CC = StringSwitch<unsigned>(Name)
+               .Case("a4", 8)
+               .Case("a5", 9)
+               .Case("a6", 10)
+               .Case("a7", 11)
+               .Case("kt0", 26)
+               .Case("kt1", 27)
+               .Default(-1);
+  }
+
+  return CC;
+}
+
+int MipsAsmParser::matchFPURegisterName(StringRef Name) {
+
+  if (Name[0] == 'f') {
+    StringRef NumString = Name.substr(1);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;     // This is not an integer.
+    if (IntVal > 31) // Maximum index for fpu register.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
+
+int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
+
+  if (Name.startswith("fcc")) {
+    StringRef NumString = Name.substr(3);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 7) // There are only 8 fcc registers.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
+
+int MipsAsmParser::matchACRegisterName(StringRef Name) {
+
+  if (Name.startswith("ac")) {
+    StringRef NumString = Name.substr(2);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 3) // There are only 3 acc registers.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
+
+int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
+  unsigned IntVal;
+
+  if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal))
+    return -1;
+
+  if (IntVal > 31)
+    return -1;
+
+  return IntVal;
+}
+
+int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
+  int CC;
+
+  CC = StringSwitch<unsigned>(Name)
+           .Case("msair", 0)
+           .Case("msacsr", 1)
+           .Case("msaaccess", 2)
+           .Case("msasave", 3)
+           .Case("msamodify", 4)
+           .Case("msarequest", 5)
+           .Case("msamap", 6)
+           .Case("msaunmap", 7)
+           .Default(-1);
+
+  return CC;
+}
+
+bool MipsAssemblerOptions::setATReg(unsigned Reg) {
+  if (Reg > 31)
+    return false;
+
+  aTReg = Reg;
+  return true;
+}
+
+int MipsAsmParser::getATReg(SMLoc Loc) {
+  int AT = Options.getATRegNum();
+  if (AT == 0)
+    reportParseError(Loc,
+                     "Pseudo instruction requires $at, which is not available");
+  return AT;
+}
+
+unsigned MipsAsmParser::getReg(int RC, int RegNo) {
+  return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
+}
+
+unsigned MipsAsmParser::getGPR(int RegNo) {
+  return getReg(isGP64bit() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID,
+                RegNo);
+}
+
+int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
+  if (RegNum >
+      getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs() - 1)
+    return -1;
+
+  return getReg(RegClass, RegNum);
+}
+
+bool MipsAsmParser::ParseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  DEBUG(dbgs() << "ParseOperand\n");
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  DEBUG(dbgs() << ".. Generic Parser\n");
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Dollar: {
+    // Parse the register.
+    SMLoc S = Parser.getTok().getLoc();
+
+    // Almost all registers have been parsed by custom parsers. There is only
+    // one exception to this. $zero (and it's alias $0) will reach this point
+    // for div, divu, and similar instructions because it is not an operand
+    // to the instruction definition but an explicit register. Special case
+    // this situation for now.
+    if (ParseAnyRegister(Operands) != MatchOperand_NoMatch)
+      return false;
+
+    // Maybe it is a symbol reference.
+    StringRef Identifier;
+    if (Parser.parseIdentifier(Identifier))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+    // Otherwise create a symbol reference.
+    const MCExpr *Res =
+        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+    Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this));
+    return false;
+  }
+  // Else drop to expression parsing.
+  case AsmToken::LParen:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+  case AsmToken::Integer:
+  case AsmToken::Tilde:
+  case AsmToken::String: {
+    DEBUG(dbgs() << ".. generic integer\n");
+    OperandMatchResultTy ResTy = ParseImm(Operands);
+    return ResTy != MatchOperand_Success;
+  }
+  case AsmToken::Percent: {
+    // It is a symbol reference or constant expression.
+    const MCExpr *IdVal;
+    SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
+    if (parseRelocOperand(IdVal))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
+    return false;
+  } // case AsmToken::Percent
+  } // switch(getLexer().getKind())
+  return true;
+}
+
+const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
+                                               StringRef RelocStr) {
+  const MCExpr *Res;
+  // Check the type of the expression.
+  if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Expr)) {
+    // It's a constant, evaluate reloc value.
+    int16_t Val;
+    switch (getVariantKind(RelocStr)) {
+    case MCSymbolRefExpr::VK_Mips_ABS_LO:
+      // Get the 1st 16-bits.
+      Val = MCE->getValue() & 0xffff;
+      break;
+    case MCSymbolRefExpr::VK_Mips_ABS_HI:
+      // Get the 2nd 16-bits. Also add 1 if bit 15 is 1, to compensate for low
+      // 16 bits being negative.
+      Val = ((MCE->getValue() + 0x8000) >> 16) & 0xffff;
+      break;
+    case MCSymbolRefExpr::VK_Mips_HIGHER:
+      // Get the 3rd 16-bits.
+      Val = ((MCE->getValue() + 0x80008000LL) >> 32) & 0xffff;
+      break;
+    case MCSymbolRefExpr::VK_Mips_HIGHEST:
+      // Get the 4th 16-bits.
+      Val = ((MCE->getValue() + 0x800080008000LL) >> 48) & 0xffff;
+      break;
+    default:
+      report_fatal_error("Unsupported reloc value!");
+    }
+    return MCConstantExpr::Create(Val, getContext());
+  }
+
+  if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(Expr)) {
+    // It's a symbol, create a symbolic expression from the symbol.
+    StringRef Symbol = MSRE->getSymbol().getName();
+    MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr);
+    Res = MCSymbolRefExpr::Create(Symbol, VK, getContext());
+    return Res;
+  }
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+    MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr);
+
+    // Try to create target expression.
+    if (MipsMCExpr::isSupportedBinaryExpr(VK, BE))
+      return MipsMCExpr::Create(VK, Expr, getContext());
+
+    const MCExpr *LExp = evaluateRelocExpr(BE->getLHS(), RelocStr);
+    const MCExpr *RExp = evaluateRelocExpr(BE->getRHS(), RelocStr);
+    Res = MCBinaryExpr::Create(BE->getOpcode(), LExp, RExp, getContext());
+    return Res;
+  }
+
+  if (const MCUnaryExpr *UN = dyn_cast<MCUnaryExpr>(Expr)) {
+    const MCExpr *UnExp = evaluateRelocExpr(UN->getSubExpr(), RelocStr);
+    Res = MCUnaryExpr::Create(UN->getOpcode(), UnExp, getContext());
+    return Res;
+  }
+  // Just return the original expression.
+  return Expr;
+}
+
+bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
+
+  switch (Expr->getKind()) {
+  case MCExpr::Constant:
+    return true;
+  case MCExpr::SymbolRef:
+    return (cast<MCSymbolRefExpr>(Expr)->getKind() != MCSymbolRefExpr::VK_None);
+  case MCExpr::Binary:
+    if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+      if (!isEvaluated(BE->getLHS()))
+        return false;
+      return isEvaluated(BE->getRHS());
+    }
+  case MCExpr::Unary:
+    return isEvaluated(cast<MCUnaryExpr>(Expr)->getSubExpr());
+  case MCExpr::Target:
+    return true;
+  }
+  return false;
+}
+
+bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
+  Parser.Lex();                          // Eat the % token.
+  const AsmToken &Tok = Parser.getTok(); // Get next token, operation.
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+
+  std::string Str = Tok.getIdentifier().str();
+
+  Parser.Lex(); // Eat the identifier.
+  // Now make an expression from the rest of the operand.
+  const MCExpr *IdVal;
+  SMLoc EndLoc;
+
+  if (getLexer().getKind() == AsmToken::LParen) {
+    while (1) {
+      Parser.Lex(); // Eat the '(' token.
+      if (getLexer().getKind() == AsmToken::Percent) {
+        Parser.Lex(); // Eat the % token.
+        const AsmToken &nextTok = Parser.getTok();
+        if (nextTok.isNot(AsmToken::Identifier))
+          return true;
+        Str += "(%";
+        Str += nextTok.getIdentifier();
+        Parser.Lex(); // Eat the identifier.
+        if (getLexer().getKind() != AsmToken::LParen)
+          return true;
+      } else
+        break;
+    }
+    if (getParser().parseParenExpression(IdVal, EndLoc))
+      return true;
+
+    while (getLexer().getKind() == AsmToken::RParen)
+      Parser.Lex(); // Eat the ')' token.
+
+  } else
+    return true; // Parenthesis must follow the relocation operand.
+
+  Res = evaluateRelocExpr(IdVal, Str);
+  return false;
+}
+
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                  SMLoc &EndLoc) {
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
+  OperandMatchResultTy ResTy = ParseAnyRegister(Operands);
+  if (ResTy == MatchOperand_Success) {
+    assert(Operands.size() == 1);
+    MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front());
+    StartLoc = Operand.getStartLoc();
+    EndLoc = Operand.getEndLoc();
+
+    // AFAIK, we only support numeric registers and named GPR's in CFI
+    // directives.
+    // Don't worry about eating tokens before failing. Using an unrecognised
+    // register is a parse error.
+    if (Operand.isGPRAsmReg()) {
+      // Resolve to GPR32 or GPR64 appropriately.
+      RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
+    }
+
+    return (RegNo == (unsigned)-1);
+  }
+
+  assert(Operands.size() == 0);
+  return (RegNo == (unsigned)-1);
+}
+
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
+  SMLoc S;
+  bool Result = true;
+
+  while (getLexer().getKind() == AsmToken::LParen)
+    Parser.Lex();
+
+  switch (getLexer().getKind()) {
+  default:
+    return true;
+  case AsmToken::Identifier:
+  case AsmToken::LParen:
+  case AsmToken::Integer:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+    if (isParenExpr)
+      Result = getParser().parseParenExpression(Res, S);
+    else
+      Result = (getParser().parseExpression(Res));
+    while (getLexer().getKind() == AsmToken::RParen)
+      Parser.Lex();
+    break;
+  case AsmToken::Percent:
+    Result = parseRelocOperand(Res);
+  }
+  return Result;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseMemOperand(OperandVector &Operands) {
+  DEBUG(dbgs() << "parseMemOperand\n");
+  const MCExpr *IdVal = nullptr;
+  SMLoc S;
+  bool isParenExpr = false;
+  MipsAsmParser::OperandMatchResultTy Res = MatchOperand_NoMatch;
+  // First operand is the offset.
+  S = Parser.getTok().getLoc();
+
+  if (getLexer().getKind() == AsmToken::LParen) {
+    Parser.Lex();
+    isParenExpr = true;
+  }
+
+  if (getLexer().getKind() != AsmToken::Dollar) {
+    if (parseMemOffset(IdVal, isParenExpr))
+      return MatchOperand_ParseFail;
+
+    const AsmToken &Tok = Parser.getTok(); // Get the next token.
+    if (Tok.isNot(AsmToken::LParen)) {
+      MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]);
+      if (Mnemonic.getToken() == "la") {
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+        Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
+        return MatchOperand_Success;
+      }
+      if (Tok.is(AsmToken::EndOfStatement)) {
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+        // Zero register assumed, add a memory operand with ZERO as its base.
+        // "Base" will be managed by k_Memory.
+        auto Base = MipsOperand::CreateGPRReg(0, getContext().getRegisterInfo(),
+                                              S, E, *this);
+        Operands.push_back(
+            MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this));
+        return MatchOperand_Success;
+      }
+      Error(Parser.getTok().getLoc(), "'(' expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Parser.Lex(); // Eat the '(' token.
+  }
+
+  Res = ParseAnyRegister(Operands);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  if (Parser.getTok().isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "')' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  Parser.Lex(); // Eat the ')' token.
+
+  if (!IdVal)
+    IdVal = MCConstantExpr::Create(0, getContext());
+
+  // Replace the register operand with the memory operand.
+  std::unique_ptr<MipsOperand> op(
+      static_cast<MipsOperand *>(Operands.back().release()));
+  // Remove the register from the operands.
+  // "op" will be managed by k_Memory.
+  Operands.pop_back();
+  // Add the memory operand.
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(IdVal)) {
+    int64_t Imm;
+    if (IdVal->EvaluateAsAbsolute(Imm))
+      IdVal = MCConstantExpr::Create(Imm, getContext());
+    else if (BE->getLHS()->getKind() != MCExpr::SymbolRef)
+      IdVal = MCBinaryExpr::Create(BE->getOpcode(), BE->getRHS(), BE->getLHS(),
+                                   getContext());
+  }
+
+  Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this));
+  return MatchOperand_Success;
+}
+
+bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
+
+  MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
+  if (Sym) {
+    SMLoc S = Parser.getTok().getLoc();
+    const MCExpr *Expr;
+    if (Sym->isVariable())
+      Expr = Sym->getVariableValue();
+    else
+      return false;
+    if (Expr->getKind() == MCExpr::SymbolRef) {
+      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+      const StringRef DefSymbol = Ref->getSymbol().getName();
+      if (DefSymbol.startswith("$")) {
+        OperandMatchResultTy ResTy =
+            MatchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S);
+        if (ResTy == MatchOperand_Success) {
+          Parser.Lex();
+          return true;
+        } else if (ResTy == MatchOperand_ParseFail)
+          llvm_unreachable("Should never ParseFail");
+        return false;
+      }
+    } else if (Expr->getKind() == MCExpr::Constant) {
+      Parser.Lex();
+      const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
+      Operands.push_back(
+          MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this));
+      return true;
+    }
+  }
+  return false;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::MatchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+                                                 StringRef Identifier,
+                                                 SMLoc S) {
+  int Index = matchCPURegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::CreateGPRReg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchFPURegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::CreateFGRReg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchFCCRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::CreateFCCReg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchACRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::CreateACCReg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchMSA128RegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::CreateMSA128Reg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchMSA128CtrlRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::CreateMSACtrlReg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+  auto Token = Parser.getLexer().peekTok(false);
+
+  if (Token.is(AsmToken::Identifier)) {
+    DEBUG(dbgs() << ".. identifier\n");
+    StringRef Identifier = Token.getIdentifier();
+    OperandMatchResultTy ResTy =
+        MatchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
+    return ResTy;
+  } else if (Token.is(AsmToken::Integer)) {
+    DEBUG(dbgs() << ".. integer\n");
+    Operands.push_back(MipsOperand::CreateNumericReg(
+        Token.getIntVal(), getContext().getRegisterInfo(), S, Token.getLoc(),
+        *this));
+    return MatchOperand_Success;
+  }
+
+  DEBUG(dbgs() << Parser.getTok().getKind() << "\n");
+
+  return MatchOperand_NoMatch;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::ParseAnyRegister(OperandVector &Operands) {
+  DEBUG(dbgs() << "ParseAnyRegister\n");
+
+  auto Token = Parser.getTok();
+
+  SMLoc S = Token.getLoc();
+
+  if (Token.isNot(AsmToken::Dollar)) {
+    DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
+    if (Token.is(AsmToken::Identifier)) {
+      if (searchSymbolAlias(Operands))
+        return MatchOperand_Success;
+    }
+    DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
+    return MatchOperand_NoMatch;
+  }
+  DEBUG(dbgs() << ".. $\n");
+
+  OperandMatchResultTy ResTy = MatchAnyRegisterWithoutDollar(Operands, S);
+  if (ResTy == MatchOperand_Success) {
+    Parser.Lex(); // $
+    Parser.Lex(); // identifier
+  }
+  return ResTy;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::ParseImm(OperandVector &Operands) {
+  switch (getLexer().getKind()) {
+  default:
+    return MatchOperand_NoMatch;
+  case AsmToken::LParen:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+  case AsmToken::Integer:
+  case AsmToken::Tilde:
+  case AsmToken::String:
+    break;
+  }
+
+  const MCExpr *IdVal;
+  SMLoc S = Parser.getTok().getLoc();
+  if (getParser().parseExpression(IdVal))
+    return MatchOperand_ParseFail;
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::ParseJumpTarget(OperandVector &Operands) {
+  DEBUG(dbgs() << "ParseJumpTarget\n");
+
+  SMLoc S = getLexer().getLoc();
+
+  // Integers and expressions are acceptable
+  OperandMatchResultTy ResTy = ParseImm(Operands);
+  if (ResTy != MatchOperand_NoMatch)
+    return ResTy;
+
+  // Registers are a valid target and have priority over symbols.
+  ResTy = ParseAnyRegister(Operands);
+  if (ResTy != MatchOperand_NoMatch)
+    return ResTy;
+
+  const MCExpr *Expr = nullptr;
+  if (Parser.parseExpression(Expr)) {
+    // We have no way of knowing if a symbol was consumed so we must ParseFail
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(
+      MipsOperand::CreateImm(Expr, S, getLexer().getLoc(), *this));
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseInvNum(OperandVector &Operands) {
+  const MCExpr *IdVal;
+  // If the first token is '$' we may have register operand.
+  if (Parser.getTok().is(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+  SMLoc S = Parser.getTok().getLoc();
+  if (getParser().parseExpression(IdVal))
+    return MatchOperand_ParseFail;
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
+  assert(MCE && "Unexpected MCExpr type.");
+  int64_t Val = MCE->getValue();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(MipsOperand::CreateImm(
+      MCConstantExpr::Create(0 - Val, getContext()), S, E, *this));
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::ParseLSAImm(OperandVector &Operands) {
+  switch (getLexer().getKind()) {
+  default:
+    return MatchOperand_NoMatch;
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+    break;
+  }
+
+  const MCExpr *Expr;
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
+
+  int64_t Val;
+  if (!Expr->EvaluateAsAbsolute(Val)) {
+    Error(S, "expected immediate value");
+    return MatchOperand_ParseFail;
+  }
+
+  // The LSA instruction allows a 2-bit unsigned immediate. For this reason
+  // and because the CPU always adds one to the immediate field, the allowed
+  // range becomes 1..4. We'll only check the range here and will deal
+  // with the addition/subtraction when actually decoding/encoding
+  // the instruction.
+  if (Val < 1 || Val > 4) {
+    Error(S, "immediate not in range (1..4)");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      MipsOperand::CreateImm(Expr, S, Parser.getTok().getLoc(), *this));
+  return MatchOperand_Success;
+}
+
+MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
+
+  MCSymbolRefExpr::VariantKind VK =
+      StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+          .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
+          .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
+          .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
+          .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
+          .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
+          .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
+          .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
+          .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+          .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+          .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
+          .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
+          .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
+          .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
+          .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+          .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
+          .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+          .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+          .Case("got_hi", MCSymbolRefExpr::VK_Mips_GOT_HI16)
+          .Case("got_lo", MCSymbolRefExpr::VK_Mips_GOT_LO16)
+          .Case("call_hi", MCSymbolRefExpr::VK_Mips_CALL_HI16)
+          .Case("call_lo", MCSymbolRefExpr::VK_Mips_CALL_LO16)
+          .Case("higher", MCSymbolRefExpr::VK_Mips_HIGHER)
+          .Case("highest", MCSymbolRefExpr::VK_Mips_HIGHEST)
+          .Case("pcrel_hi", MCSymbolRefExpr::VK_Mips_PCREL_HI16)
+          .Case("pcrel_lo", MCSymbolRefExpr::VK_Mips_PCREL_LO16)
+          .Default(MCSymbolRefExpr::VK_None);
+
+  assert(VK != MCSymbolRefExpr::VK_None);
+
+  return VK;
+}
+
+/// Sometimes (i.e. load/stores) the operand may be followed immediately by
+/// either this.
+/// ::= '(', register, ')'
+/// handle it before we iterate so we don't get tripped up by the lack of
+/// a comma.
+bool MipsAsmParser::ParseParenSuffix(StringRef Name, OperandVector &Operands) {
+  if (getLexer().is(AsmToken::LParen)) {
+    Operands.push_back(
+        MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
+    Parser.Lex();
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+    if (Parser.getTok().isNot(AsmToken::RParen)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token, expected ')'");
+    }
+    Operands.push_back(
+        MipsOperand::CreateToken(")", getLexer().getLoc(), *this));
+    Parser.Lex();
+  }
+  return false;
+}
+
+/// Sometimes (i.e. in MSA) the operand may be followed immediately by
+/// either one of these.
+/// ::= '[', register, ']'
+/// ::= '[', integer, ']'
+/// handle it before we iterate so we don't get tripped up by the lack of
+/// a comma.
+bool MipsAsmParser::ParseBracketSuffix(StringRef Name,
+                                       OperandVector &Operands) {
+  if (getLexer().is(AsmToken::LBrac)) {
+    Operands.push_back(
+        MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
+    Parser.Lex();
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token, expected ']'");
+    }
+    Operands.push_back(
+        MipsOperand::CreateToken("]", getLexer().getLoc(), *this));
+    Parser.Lex();
+  }
+  return false;
+}
+
+bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                     SMLoc NameLoc, OperandVector &Operands) {
+  DEBUG(dbgs() << "ParseInstruction\n");
+  // We have reached first instruction, module directive after
+  // this is forbidden.
+  getTargetStreamer().setCanHaveModuleDir(false);
+  // Check if we have valid mnemonic
+  if (!mnemonicIsValid(Name, 0)) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "Unknown instruction");
+  }
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this));
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+    if (getLexer().is(AsmToken::LBrac) && ParseBracketSuffix(Name, Operands))
+      return true;
+    // AFAIK, parenthesis suffixes are never on the first operand
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+      // Parse and remember the operand.
+      if (ParseOperand(Operands, Name)) {
+        SMLoc Loc = getLexer().getLoc();
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "unexpected token in argument list");
+      }
+      // Parse bracket and parenthesis suffixes before we iterate
+      if (getLexer().is(AsmToken::LBrac)) {
+        if (ParseBracketSuffix(Name, Operands))
+          return true;
+      } else if (getLexer().is(AsmToken::LParen) &&
+                 ParseParenSuffix(Name, Operands))
+        return true;
+    }
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
+  SMLoc Loc = getLexer().getLoc();
+  Parser.eatToEndOfStatement();
+  return Error(Loc, ErrorMsg);
+}
+
+bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
+  return Error(Loc, ErrorMsg);
+}
+
+bool MipsAsmParser::parseSetNoAtDirective() {
+  // Line should look like: ".set noat".
+  // set at reg to 0.
+  Options.setATReg(0);
+  // eat noat
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetAtDirective() {
+  // Line can be .set at - defaults to $1
+  // or .set at=$reg
+  int AtRegNo;
+  getParser().Lex();
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Options.setATReg(1);
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  } else if (getLexer().is(AsmToken::Equal)) {
+    getParser().Lex(); // Eat the '='.
+    if (getLexer().isNot(AsmToken::Dollar)) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+    Parser.Lex(); // Eat the '$'.
+    const AsmToken &Reg = Parser.getTok();
+    if (Reg.is(AsmToken::Identifier)) {
+      AtRegNo = matchCPURegisterName(Reg.getIdentifier());
+    } else if (Reg.is(AsmToken::Integer)) {
+      AtRegNo = Reg.getIntVal();
+    } else {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+
+    if (AtRegNo < 0 || AtRegNo > 31) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+
+    if (!Options.setATReg(AtRegNo)) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+    getParser().Lex(); // Eat the register.
+
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  } else {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+}
+
+bool MipsAsmParser::parseSetReorderDirective() {
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Options.setReorder();
+  getTargetStreamer().emitDirectiveSetReorder();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoReorderDirective() {
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Options.setNoreorder();
+  getTargetStreamer().emitDirectiveSetNoReorder();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetMacroDirective() {
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Options.setMacro();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMacroDirective() {
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("`noreorder' must be set before `nomacro'");
+    return false;
+  }
+  if (Options.isReorder()) {
+    reportParseError("`noreorder' must be set before `nomacro'");
+    return false;
+  }
+  Options.setNomacro();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMips16Directive() {
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  // For now do nothing.
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetFpDirective() {
+  MipsABIFlagsSection::FpABIKind FpAbiVal;
+  // Line can be: .set fp=32
+  //              .set fp=xx
+  //              .set fp=64
+  Parser.Lex(); // Eat fp token
+  AsmToken Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Parser.Lex(); // Eat '=' token.
+  Tok = Parser.getTok();
+
+  if (!parseFpABIValue(FpAbiVal, ".set"))
+    return false;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  getTargetStreamer().emitDirectiveSetFp(FpAbiVal);
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetAssignment() {
+  StringRef Name;
+  const MCExpr *Value;
+
+  if (Parser.parseIdentifier(Name))
+    reportParseError("expected identifier after .set");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return reportParseError("unexpected token in .set directive");
+  Lex(); // Eat comma
+
+  if (Parser.parseExpression(Value))
+    return reportParseError("expected valid expression after comma");
+
+  // Check if the Name already exists as a symbol.
+  MCSymbol *Sym = getContext().LookupSymbol(Name);
+  if (Sym)
+    return reportParseError("symbol already defined");
+  Sym = getContext().GetOrCreateSymbol(Name);
+  Sym->setVariableValue(Value);
+
+  return false;
+}
+
+bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token in .set directive");
+
+  switch (Feature) {
+  default:
+    llvm_unreachable("Unimplemented feature");
+  case Mips::FeatureDSP:
+    setFeatureBits(Mips::FeatureDSP, "dsp");
+    getTargetStreamer().emitDirectiveSetDsp();
+    break;
+  case Mips::FeatureMicroMips:
+    getTargetStreamer().emitDirectiveSetMicroMips();
+    break;
+  case Mips::FeatureMips16:
+    getTargetStreamer().emitDirectiveSetMips16();
+    break;
+  case Mips::FeatureMips32r2:
+    setFeatureBits(Mips::FeatureMips32r2, "mips32r2");
+    getTargetStreamer().emitDirectiveSetMips32R2();
+    break;
+  case Mips::FeatureMips64:
+    setFeatureBits(Mips::FeatureMips64, "mips64");
+    getTargetStreamer().emitDirectiveSetMips64();
+    break;
+  case Mips::FeatureMips64r2:
+    setFeatureBits(Mips::FeatureMips64r2, "mips64r2");
+    getTargetStreamer().emitDirectiveSetMips64R2();
+    break;
+  }
+  return false;
+}
+
+bool MipsAsmParser::eatComma(StringRef ErrorStr) {
+  if (getLexer().isNot(AsmToken::Comma)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, ErrorStr);
+  }
+
+  Parser.Lex(); // Eat the comma.
+  return true;
+}
+
+bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) {
+  if (Options.isReorder())
+    Warning(Loc, ".cpload in reorder section");
+
+  // FIXME: Warn if cpload is used in Mips16 mode.
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
+  OperandMatchResultTy ResTy = ParseAnyRegister(Reg);
+  if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+    reportParseError("expected register containing function address");
+    return false;
+  }
+
+  MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
+  if (!RegOpnd.isGPRAsmReg()) {
+    reportParseError(RegOpnd.getStartLoc(), "invalid register");
+    return false;
+  }
+
+  getTargetStreamer().emitDirectiveCpload(RegOpnd.getGPR32Reg());
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveCPSetup() {
+  unsigned FuncReg;
+  unsigned Save;
+  bool SaveIsReg = true;
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+  OperandMatchResultTy ResTy = ParseAnyRegister(TmpReg);
+  if (ResTy == MatchOperand_NoMatch) {
+    reportParseError("expected register containing function address");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  MipsOperand &FuncRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+  if (!FuncRegOpnd.isGPRAsmReg()) {
+    reportParseError(FuncRegOpnd.getStartLoc(), "invalid register");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  FuncReg = FuncRegOpnd.getGPR32Reg();
+  TmpReg.clear();
+
+  if (!eatComma("expected comma parsing directive"))
+    return true;
+
+  ResTy = ParseAnyRegister(TmpReg);
+  if (ResTy == MatchOperand_NoMatch) {
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Integer)) {
+      Save = Tok.getIntVal();
+      SaveIsReg = false;
+      Parser.Lex();
+    } else {
+      reportParseError("expected save register or stack offset");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+  } else {
+    MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!SaveOpnd.isGPRAsmReg()) {
+      reportParseError(SaveOpnd.getStartLoc(), "invalid register");
+      Parser.eatToEndOfStatement();
+      return false;
+    }
+    Save = SaveOpnd.getGPR32Reg();
+  }
+
+  if (!eatComma("expected comma parsing directive"))
+    return true;
+
+  StringRef Name;
+  if (Parser.parseIdentifier(Name))
+    reportParseError("expected identifier");
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, *Sym, SaveIsReg);
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveNaN() {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    const AsmToken &Tok = Parser.getTok();
+
+    if (Tok.getString() == "2008") {
+      Parser.Lex();
+      getTargetStreamer().emitDirectiveNaN2008();
+      return false;
+    } else if (Tok.getString() == "legacy") {
+      Parser.Lex();
+      getTargetStreamer().emitDirectiveNaNLegacy();
+      return false;
+    }
+  }
+  // If we don't recognize the option passed to the .nan
+  // directive (e.g. no option or unknown option), emit an error.
+  reportParseError("invalid option in .nan directive");
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveSet() {
+
+  // Get the next token.
+  const AsmToken &Tok = Parser.getTok();
+
+  if (Tok.getString() == "noat") {
+    return parseSetNoAtDirective();
+  } else if (Tok.getString() == "at") {
+    return parseSetAtDirective();
+  } else if (Tok.getString() == "fp") {
+    return parseSetFpDirective();
+  } else if (Tok.getString() == "reorder") {
+    return parseSetReorderDirective();
+  } else if (Tok.getString() == "noreorder") {
+    return parseSetNoReorderDirective();
+  } else if (Tok.getString() == "macro") {
+    return parseSetMacroDirective();
+  } else if (Tok.getString() == "nomacro") {
+    return parseSetNoMacroDirective();
+  } else if (Tok.getString() == "mips16") {
+    return parseSetFeature(Mips::FeatureMips16);
+  } else if (Tok.getString() == "nomips16") {
+    return parseSetNoMips16Directive();
+  } else if (Tok.getString() == "nomicromips") {
+    getTargetStreamer().emitDirectiveSetNoMicroMips();
+    Parser.eatToEndOfStatement();
+    return false;
+  } else if (Tok.getString() == "micromips") {
+    return parseSetFeature(Mips::FeatureMicroMips);
+  } else if (Tok.getString() == "mips32r2") {
+    return parseSetFeature(Mips::FeatureMips32r2);
+  } else if (Tok.getString() == "mips64") {
+    return parseSetFeature(Mips::FeatureMips64);
+  } else if (Tok.getString() == "mips64r2") {
+    return parseSetFeature(Mips::FeatureMips64r2);
+  } else if (Tok.getString() == "dsp") {
+    return parseSetFeature(Mips::FeatureDSP);
+  } else {
+    // It is just an identifier, look for an assignment.
+    parseSetAssignment();
+    return false;
+  }
+
+  return true;
+}
+
+/// parseDataDirective
+///  ::= .word [ expression (, expression)* ]
+bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveGpWord
+///  ::= .gpword local_sym
+bool MipsAsmParser::parseDirectiveGpWord() {
+  const MCExpr *Value;
+  // EmitGPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitGPRel32Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(), "unexpected token in directive");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveGpDWord
+///  ::= .gpdword local_sym
+bool MipsAsmParser::parseDirectiveGpDWord() {
+  const MCExpr *Value;
+  // EmitGPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitGPRel64Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(), "unexpected token in directive");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveOption() {
+  // Get the option token.
+  AsmToken Tok = Parser.getTok();
+  // At the moment only identifiers are supported.
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in .option directive");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  StringRef Option = Tok.getIdentifier();
+
+  if (Option == "pic0") {
+    getTargetStreamer().emitDirectiveOptionPic0();
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+      Error(Parser.getTok().getLoc(),
+            "unexpected token in .option pic0 directive");
+      Parser.eatToEndOfStatement();
+    }
+    return false;
+  }
+
+  if (Option == "pic2") {
+    getTargetStreamer().emitDirectiveOptionPic2();
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+      Error(Parser.getTok().getLoc(),
+            "unexpected token in .option pic2 directive");
+      Parser.eatToEndOfStatement();
+    }
+    return false;
+  }
+
+  // Unknown option.
+  Warning(Parser.getTok().getLoc(), "unknown option in .option directive");
+  Parser.eatToEndOfStatement();
+  return false;
+}
+
+/// parseDirectiveModule
+///  ::= .module oddspreg
+///  ::= .module nooddspreg
+///  ::= .module fp=value
+bool MipsAsmParser::parseDirectiveModule() {
+  MCAsmLexer &Lexer = getLexer();
+  SMLoc L = Lexer.getLoc();
+
+  if (!getTargetStreamer().getCanHaveModuleDir()) {
+    // TODO : get a better message.
+    reportParseError(".module directive must appear before any code");
+    return false;
+  }
+
+  if (Lexer.is(AsmToken::Identifier)) {
+    StringRef Option = Parser.getTok().getString();
+    Parser.Lex();
+
+    if (Option == "oddspreg") {
+      getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
+      clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+
+      if (getLexer().isNot(AsmToken::EndOfStatement)) {
+        reportParseError("Expected end of statement");
+        return false;
+      }
+
+      return false;
+    } else if (Option == "nooddspreg") {
+      if (!isABI_O32()) {
+        Error(L, "'.module nooddspreg' requires the O32 ABI");
+        return false;
+      }
+
+      getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
+      setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+
+      if (getLexer().isNot(AsmToken::EndOfStatement)) {
+        reportParseError("Expected end of statement");
+        return false;
+      }
+
+      return false;
+    } else if (Option == "fp") {
+      return parseDirectiveModuleFP();
+    }
+
+    return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
+  }
+
+  return false;
+}
+
+/// parseDirectiveModuleFP
+///  ::= =32
+///  ::= =xx
+///  ::= =64
+bool MipsAsmParser::parseDirectiveModuleFP() {
+  MCAsmLexer &Lexer = getLexer();
+
+  if (Lexer.isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Parser.Lex(); // Eat '=' token.
+
+  MipsABIFlagsSection::FpABIKind FpABI;
+  if (!parseFpABIValue(FpABI, ".module"))
+    return false;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+
+  // Emit appropriate flags.
+  getTargetStreamer().emitDirectiveModuleFP(FpABI, isABI_O32());
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+                                    StringRef Directive) {
+  MCAsmLexer &Lexer = getLexer();
+
+  if (Lexer.is(AsmToken::Identifier)) {
+    StringRef Value = Parser.getTok().getString();
+    Parser.Lex();
+
+    if (Value != "xx") {
+      reportParseError("unsupported value, expected 'xx', '32' or '64'");
+      return false;
+    }
+
+    if (!isABI_O32()) {
+      reportParseError("'" + Directive + " fp=xx' requires the O32 ABI");
+      return false;
+    }
+
+    FpABI = MipsABIFlagsSection::FpABIKind::XX;
+    return true;
+  }
+
+  if (Lexer.is(AsmToken::Integer)) {
+    unsigned Value = Parser.getTok().getIntVal();
+    Parser.Lex();
+
+    if (Value != 32 && Value != 64) {
+      reportParseError("unsupported value, expected 'xx', '32' or '64'");
+      return false;
+    }
+
+    if (Value == 32) {
+      if (!isABI_O32()) {
+        reportParseError("'" + Directive + " fp=32' requires the O32 ABI");
+        return false;
+      }
+
+      FpABI = MipsABIFlagsSection::FpABIKind::S32;
+    } else
+      FpABI = MipsABIFlagsSection::FpABIKind::S64;
+
+    return true;
+  }
+
+  return false;
+}
+
+bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getString();
+
+  if (IDVal == ".cpload")
+    return parseDirectiveCPLoad(DirectiveID.getLoc());
+  if (IDVal == ".dword") {
+    parseDataDirective(8, DirectiveID.getLoc());
+    return false;
+  }
+
+  if (IDVal == ".ent") {
+    // Ignore this directive for now.
+    Parser.Lex();
+    return false;
+  }
+
+  if (IDVal == ".end") {
+    // Ignore this directive for now.
+    Parser.Lex();
+    return false;
+  }
+
+  if (IDVal == ".frame") {
+    // Ignore this directive for now.
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".set") {
+    return parseDirectiveSet();
+  }
+
+  if (IDVal == ".fmask") {
+    // Ignore this directive for now.
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".mask") {
+    // Ignore this directive for now.
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".nan")
+    return parseDirectiveNaN();
+
+  if (IDVal == ".gpword") {
+    parseDirectiveGpWord();
+    return false;
+  }
+
+  if (IDVal == ".gpdword") {
+    parseDirectiveGpDWord();
+    return false;
+  }
+
+  if (IDVal == ".word") {
+    parseDataDirective(4, DirectiveID.getLoc());
+    return false;
+  }
+
+  if (IDVal == ".option")
+    return parseDirectiveOption();
+
+  if (IDVal == ".abicalls") {
+    getTargetStreamer().emitDirectiveAbiCalls();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+      Error(Parser.getTok().getLoc(), "unexpected token in directive");
+      // Clear line
+      Parser.eatToEndOfStatement();
+    }
+    return false;
+  }
+
+  if (IDVal == ".cpsetup")
+    return parseDirectiveCPSetup();
+
+  if (IDVal == ".module")
+    return parseDirectiveModule();
+
+  return true;
+}
+
+extern "C" void LLVMInitializeMipsAsmParser() {
+  RegisterMCAsmParser<MipsAsmParser> X(TheMipsTarget);
+  RegisterMCAsmParser<MipsAsmParser> Y(TheMipselTarget);
+  RegisterMCAsmParser<MipsAsmParser> A(TheMips64Target);
+  RegisterMCAsmParser<MipsAsmParser> B(TheMips64elTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
new file mode 100644
index 0000000..f35a8de
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -0,0 +1,1328 @@
+//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Mips Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// MipsDisassemblerBase - a disasembler class for Mips.
+class MipsDisassemblerBase : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  MipsDisassemblerBase(const MCSubtargetInfo &STI, MCContext &Ctx,
+                       bool bigEndian) :
+    MCDisassembler(STI, Ctx),
+    IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {}
+
+  virtual ~MipsDisassemblerBase() {}
+
+  bool isN64() const { return IsN64; }
+
+private:
+  bool IsN64;
+protected:
+  bool isBigEndian;
+};
+
+/// MipsDisassembler - a disasembler class for Mips32.
+class MipsDisassembler : public MipsDisassemblerBase {
+  bool IsMicroMips;
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian)
+      : MipsDisassemblerBase(STI, Ctx, bigEndian) {
+    IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+  }
+
+  bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; }
+  bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; }
+  bool hasMips32r6() const {
+    return STI.getFeatureBits() & Mips::FeatureMips32r6;
+  }
+
+  bool isGP64() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; }
+
+  bool hasCOP3() const {
+    // Only present in MIPS-I and MIPS-II
+    return !hasMips32() && !hasMips3();
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+};
+
+
+/// Mips64Disassembler - a disasembler class for Mips64.
+class Mips64Disassembler : public MipsDisassemblerBase {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  Mips64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                     bool bigEndian) :
+    MipsDisassemblerBase(STI, Ctx, bigEndian) {}
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+};
+
+} // end anonymous namespace
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+                                       unsigned Offset,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+// DecodeJumpTargetMM - Decode microMIPS jump target, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder);
+
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeSimm16(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder);
+
+// Decode the immediate field of an LSA instruction which
+// is off by one.
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder);
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodeExtSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
+/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
+/// handle.
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                      const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                      const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+namespace llvm {
+extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
+              TheMips64elTarget;
+}
+
+static MCDisassembler *createMipsDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, true);
+}
+
+static MCDisassembler *createMipselDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, false);
+}
+
+static MCDisassembler *createMips64Disassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new Mips64Disassembler(STI, Ctx, true);
+}
+
+static MCDisassembler *createMips64elDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new Mips64Disassembler(STI, Ctx, false);
+}
+
+extern "C" void LLVMInitializeMipsDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheMipsTarget,
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheMipselTarget,
+                                         createMipselDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheMips64Target,
+                                         createMips64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheMips64elTarget,
+                                         createMips64elDisassembler);
+}
+
+#include "MipsGenDisassemblerTables.inc"
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D);
+  const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
+  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder) {
+  typedef DecodeStatus (*DecodeFN)(MCInst &, unsigned, uint64_t, const void *);
+  // The size of the n field depends on the element size
+  // The register class also depends on this.
+  InsnType tmp = fieldFromInstruction(insn, 17, 5);
+  unsigned NSize = 0;
+  DecodeFN RegDecoder = nullptr;
+  if ((tmp & 0x18) == 0x00) { // INSVE_B
+    NSize = 4;
+    RegDecoder = DecodeMSA128BRegisterClass;
+  } else if ((tmp & 0x1c) == 0x10) { // INSVE_H
+    NSize = 3;
+    RegDecoder = DecodeMSA128HRegisterClass;
+  } else if ((tmp & 0x1e) == 0x18) { // INSVE_W
+    NSize = 2;
+    RegDecoder = DecodeMSA128WRegisterClass;
+  } else if ((tmp & 0x1f) == 0x1c) { // INSVE_D
+    NSize = 1;
+    RegDecoder = DecodeMSA128DRegisterClass;
+  } else
+    llvm_unreachable("Invalid encoding");
+
+  assert(NSize != 0 && RegDecoder != nullptr);
+
+  // $wd
+  tmp = fieldFromInstruction(insn, 6, 5);
+  if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  // $wd_in
+  if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  // $n
+  tmp = fieldFromInstruction(insn, 16, NSize);
+  MI.addOperand(MCOperand::CreateImm(tmp));
+  // $ws
+  tmp = fieldFromInstruction(insn, 11, 5);
+  if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  // $n2
+  MI.addOperand(MCOperand::CreateImm(0));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the ADDI instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b001000 sssss ttttt iiiiiiiiiiiiiiii
+  //      BOVC if rs >= rt
+  //      BEQZALC if rs == 0 && rt != 0
+  //      BEQC if rs < rt && rs != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BOVC);
+    HasRs = true;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BEQC);
+    HasRs = true;
+  } else
+    MI.setOpcode(Mips::BEQZALC);
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the ADDI instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b011000 sssss ttttt iiiiiiiiiiiiiiii
+  //      BNVC if rs >= rt
+  //      BNEZALC if rs == 0 && rt != 0
+  //      BNEC if rs < rt && rs != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BNVC);
+    HasRs = true;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BNEC);
+    HasRs = true;
+  } else
+    MI.setOpcode(Mips::BNEZALC);
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BLEZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b010110 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid if rs == 0
+  //      BLEZC   if rs == 0  && rt != 0
+  //      BGEZC   if rs == rt && rt != 0
+  //      BGEC    if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BLEZC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BGEZC);
+  else {
+    HasRs = true;
+    MI.setOpcode(Mips::BGEC);
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BGTZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b010111 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid if rs == 0
+  //      BGTZC   if rs == 0  && rt != 0
+  //      BLTZC   if rs == rt && rt != 0
+  //      BLTC    if rs != rt && rs != 0  && rt != 0
+
+  bool HasRs = false;
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BGTZC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BLTZC);
+  else {
+    MI.setOpcode(Mips::BLTC);
+    HasRs = true;
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                              Rs)));
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BGTZ instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b000111 sssss ttttt iiiiiiiiiiiiiiii
+  //      BGTZ    if rt == 0
+  //      BGTZALC if rs == 0 && rt != 0
+  //      BLTZALC if rs != 0 && rs == rt
+  //      BLTUC   if rs != 0 && rs != rt
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+  bool HasRt = false;
+
+  if (Rt == 0) {
+    MI.setOpcode(Mips::BGTZ);
+    HasRs = true;
+  } else if (Rs == 0) {
+    MI.setOpcode(Mips::BGTZALC);
+    HasRt = true;
+  } else if (Rs == Rt) {
+    MI.setOpcode(Mips::BLTZALC);
+    HasRs = true;
+  } else {
+    MI.setOpcode(Mips::BLTUC);
+    HasRs = true;
+    HasRt = true;
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  if (HasRt)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BLEZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b000110 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid   if rs == 0
+  //      BLEZALC   if rs == 0  && rt != 0
+  //      BGEZALC   if rs == rt && rt != 0
+  //      BGEUC     if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BLEZALC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BGEZALC);
+  else {
+    HasRs = true;
+    MI.setOpcode(Mips::BGEUC);
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+  /// readInstruction - read four bytes from the MemoryObject
+  /// and return 32 bit word sorted according to the given endianess
+static DecodeStatus readInstruction32(const MemoryObject &region,
+                                      uint64_t address,
+                                      uint64_t &size,
+                                      uint32_t &insn,
+                                      bool isBigEndian,
+                                      bool IsMicroMips) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 4 Bytes of data.
+  if (region.readBytes(address, 4, Bytes) == -1) {
+    size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  if (isBigEndian) {
+    // Encoded as a big-endian 32-bit word in the stream.
+    insn = (Bytes[3] <<  0) |
+           (Bytes[2] <<  8) |
+           (Bytes[1] << 16) |
+           (Bytes[0] << 24);
+  }
+  else {
+    // Encoded as a small-endian 32-bit word in the stream.
+    // Little-endian byte ordering:
+    //   mips32r2:   4 | 3 | 2 | 1
+    //   microMIPS:  2 | 1 | 4 | 3
+    if (IsMicroMips) {
+      insn = (Bytes[2] <<  0) |
+             (Bytes[3] <<  8) |
+             (Bytes[0] << 16) |
+             (Bytes[1] << 24);
+    } else {
+      insn = (Bytes[0] <<  0) |
+             (Bytes[1] <<  8) |
+             (Bytes[2] << 16) |
+             (Bytes[3] << 24);
+    }
+  }
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus
+MipsDisassembler::getInstruction(MCInst &instr,
+                                 uint64_t &Size,
+                                 const MemoryObject &Region,
+                                 uint64_t Address,
+                                 raw_ostream &vStream,
+                                 raw_ostream &cStream) const {
+  uint32_t Insn;
+
+  DecodeStatus Result = readInstruction32(Region, Address, Size,
+                                          Insn, isBigEndian, IsMicroMips);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  if (IsMicroMips) {
+    DEBUG(dbgs() << "Trying MicroMips32 table (32-bit opcodes):\n");
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+    return MCDisassembler::Fail;
+  }
+
+  if (hasCOP3()) {
+    DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+    Result =
+        decodeInstruction(DecoderTableCOP3_32, instr, Insn, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasMips32r6() && isGP64()) {
+    DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasMips32r6()) {
+    DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+DecodeStatus
+Mips64Disassembler::getInstruction(MCInst &instr,
+                                   uint64_t &Size,
+                                   const MemoryObject &Region,
+                                   uint64_t Address,
+                                   raw_ostream &vStream,
+                                   raw_ostream &cStream) const {
+  uint32_t Insn;
+
+  DecodeStatus Result = readInstruction32(Region, Address, Size,
+                                          Insn, isBigEndian, false);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTableMips6432, instr, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+  // If we fail to decode in Mips64 decoder space we can try in Mips32
+  Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+
+  return MCDisassembler::Fail;
+
+}
+
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (static_cast<const MipsDisassembler *>(Decoder)->isN64())
+    return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder);
+
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  if(Inst.getOpcode() == Mips::SC){
+    Inst.addOperand(MCOperand::CreateReg(Reg));
+  }
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
+  unsigned Reg = fieldFromInstruction(Insn, 6, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+
+  // The immediate field of an LD/ST instruction is scaled which means it must
+  // be multiplied (when decoding) by the size (in bytes) of the instructions'
+  // data format.
+  // .b - 1 byte
+  // .h - 2 bytes
+  // .w - 4 bytes
+  // .d - 8 bytes
+  switch(Inst.getOpcode())
+  {
+  default:
+    assert (0 && "Unexpected instruction");
+    return MCDisassembler::Fail;
+    break;
+  case Mips::LD_B:
+  case Mips::ST_B:
+    Inst.addOperand(MCOperand::CreateImm(Offset));
+    break;
+  case Mips::LD_H:
+  case Mips::ST_H:
+    Inst.addOperand(MCOperand::CreateImm(Offset << 1));
+    break;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+    break;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    Inst.addOperand(MCOperand::CreateImm(Offset << 3));
+    break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<12>(Insn & 0x0fff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  if (Inst.getOpcode() == Mips::SC_MM)
+    Inst.addOperand(MCOperand::CreateReg(Reg));
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMem(MCInst &Inst,
+                               unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff);
+  unsigned Rt = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  if(Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6){
+    Inst.addOperand(MCOperand::CreateReg(Rt));
+  }
+
+  Inst.addOperand(MCOperand::CreateReg(Rt));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // Currently only hardware register 29 is supported.
+  if (RegNo != 29)
+    return  MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(Mips::HWR29));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 30 || RegNo %2)
+    return MCDisassembler::Fail;
+
+  ;
+  unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+                                       unsigned Offset,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  int32_t BranchOffset = (SignExtend32<16>(Offset) << 2) + 4;
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
+  Inst.addOperand(MCOperand::CreateImm(JumpOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<21>(Offset) << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<26>(Offset) << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<16>(Offset) << 1;
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 1;
+  Inst.addOperand(MCOperand::CreateImm(JumpOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm16(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Insn)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder) {
+  // We add one to the immediate field as it was encoded as 'imm - 1'.
+  Inst.addOperand(MCOperand::CreateImm(Insn + 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  // First we need to grab the pos(lsb) from MCInst.
+  int Pos = Inst.getOperand(2).getImm();
+  int Size = (int) Insn - Pos + 1;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeExtSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  int Size = (int) Insn  + 1;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<18>(Insn) << 3));
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
new file mode 100644
index 0000000..8c79751
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -0,0 +1,326 @@
+//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstPrinter.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MipsInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "MipsGenAsmWriter.inc"
+
+template<unsigned R>
+static bool isReg(const MCInst &MI, unsigned OpNo) {
+  assert(MI.getOperand(OpNo).isReg() && "Register operand expected.");
+  return MI.getOperand(OpNo).getReg() == R;
+}
+
+const char* Mips::MipsFCCToString(Mips::CondCode CC) {
+  switch (CC) {
+  case FCOND_F:
+  case FCOND_T:   return "f";
+  case FCOND_UN:
+  case FCOND_OR:  return "un";
+  case FCOND_OEQ:
+  case FCOND_UNE: return "eq";
+  case FCOND_UEQ:
+  case FCOND_ONE: return "ueq";
+  case FCOND_OLT:
+  case FCOND_UGE: return "olt";
+  case FCOND_ULT:
+  case FCOND_OGE: return "ult";
+  case FCOND_OLE:
+  case FCOND_UGT: return "ole";
+  case FCOND_ULE:
+  case FCOND_OGT: return "ule";
+  case FCOND_SF:
+  case FCOND_ST:  return "sf";
+  case FCOND_NGLE:
+  case FCOND_GLE: return "ngle";
+  case FCOND_SEQ:
+  case FCOND_SNE: return "seq";
+  case FCOND_NGL:
+  case FCOND_GL:  return "ngl";
+  case FCOND_LT:
+  case FCOND_NLT: return "lt";
+  case FCOND_NGE:
+  case FCOND_GE:  return "nge";
+  case FCOND_LE:
+  case FCOND_NLE: return "le";
+  case FCOND_NGT:
+  case FCOND_GT:  return "ngt";
+  }
+  llvm_unreachable("Impossible condition code!");
+}
+
+void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << '$' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot) {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\t.set\tpush\n";
+    O << "\t.set\tmips32r2\n";
+    break;
+  case Mips::Save16:
+    O << "\tsave\t";
+    printSaveRestore(MI, O);
+    O << " # 16 bit inst\n";
+    return;
+  case Mips::SaveX16:
+    O << "\tsave\t";
+    printSaveRestore(MI, O);
+    O << "\n";
+    return;
+  case Mips::Restore16:
+    O << "\trestore\t";
+    printSaveRestore(MI, O);
+    O << " # 16 bit inst\n";
+    return;
+  case Mips::RestoreX16:
+    O << "\trestore\t";
+    printSaveRestore(MI, O);
+    O << "\n";
+    return;
+  }
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, O) && !printAlias(*MI, O))
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\n\t.set\tpop";
+  }
+}
+
+static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
+  int Offset = 0;
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
+    assert(SRE && CE && "Binary expression must be sym+const.");
+    Offset = CE->getValue();
+  } else if (const MipsMCExpr *ME = dyn_cast<MipsMCExpr>(Expr)) {
+    ME->print(OS);
+    return;
+  } else if (!(SRE = dyn_cast<MCSymbolRefExpr>(Expr)))
+    assert(false && "Unexpected MCExpr type.");
+
+  MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
+
+  switch (Kind) {
+  default:                                 llvm_unreachable("Invalid kind!");
+  case MCSymbolRefExpr::VK_None:           break;
+  case MCSymbolRefExpr::VK_Mips_GPREL:     OS << "%gp_rel("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_CALL:  OS << "%call16("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT16:     OS << "%got(";    break;
+  case MCSymbolRefExpr::VK_Mips_GOT:       OS << "%got(";    break;
+  case MCSymbolRefExpr::VK_Mips_ABS_HI:    OS << "%hi(";     break;
+  case MCSymbolRefExpr::VK_Mips_ABS_LO:    OS << "%lo(";     break;
+  case MCSymbolRefExpr::VK_Mips_TLSGD:     OS << "%tlsgd(";  break;
+  case MCSymbolRefExpr::VK_Mips_TLSLDM:    OS << "%tlsldm(";  break;
+  case MCSymbolRefExpr::VK_Mips_DTPREL_HI: OS << "%dtprel_hi(";  break;
+  case MCSymbolRefExpr::VK_Mips_DTPREL_LO: OS << "%dtprel_lo(";  break;
+  case MCSymbolRefExpr::VK_Mips_GOTTPREL:  OS << "%gottprel("; break;
+  case MCSymbolRefExpr::VK_Mips_TPREL_HI:  OS << "%tprel_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_TPREL_LO:  OS << "%tprel_lo("; break;
+  case MCSymbolRefExpr::VK_Mips_GPOFF_HI:  OS << "%hi(%neg(%gp_rel("; break;
+  case MCSymbolRefExpr::VK_Mips_GPOFF_LO:  OS << "%lo(%neg(%gp_rel("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_DISP:  OS << "%got_disp("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_PAGE:  OS << "%got_page("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_OFST:  OS << "%got_ofst("; break;
+  case MCSymbolRefExpr::VK_Mips_HIGHER:    OS << "%higher("; break;
+  case MCSymbolRefExpr::VK_Mips_HIGHEST:   OS << "%highest("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_HI16:  OS << "%got_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_LO16:  OS << "%got_lo("; break;
+  case MCSymbolRefExpr::VK_Mips_CALL_HI16: OS << "%call_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_CALL_LO16: OS << "%call_lo("; break;
+  case MCSymbolRefExpr::VK_Mips_PCREL_HI16: OS << "%pcrel_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_PCREL_LO16: OS << "%pcrel_lo("; break;
+  }
+
+  OS << SRE->getSymbol();
+
+  if (Offset) {
+    if (Offset > 0)
+      OS << '+';
+    OS << Offset;
+  }
+
+  if ((Kind == MCSymbolRefExpr::VK_Mips_GPOFF_HI) ||
+      (Kind == MCSymbolRefExpr::VK_Mips_GPOFF_LO))
+    OS << ")))";
+  else if (Kind != MCSymbolRefExpr::VK_None)
+    OS << ')';
+}
+
+void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  printExpr(Op.getExpr(), O);
+}
+
+void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MipsInstPrinter::printUnsignedImm8(const MCInst *MI, int opNum,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)(unsigned char)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MipsInstPrinter::
+printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsInstPrinter::
+printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) {
+  // when using stack locations for not load/store instructions
+  // print the same way as all normal 3 operand instructions.
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+  return;
+}
+
+void MipsInstPrinter::
+printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand& MO = MI->getOperand(opNum);
+  O << MipsFCCToString((Mips::CondCode)MO.getImm());
+}
+
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+  llvm_unreachable("TODO");
+}
+
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo, raw_ostream &OS) {
+  OS << "\t" << Str << "\t";
+  printOperand(&MI, OpNo, OS);
+  return true;
+}
+
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo0, unsigned OpNo1,
+                                 raw_ostream &OS) {
+  printAlias(Str, MI, OpNo0, OS);
+  OS << ", ";
+  printOperand(&MI, OpNo1, OS);
+  return true;
+}
+
+bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
+  switch (MI.getOpcode()) {
+  case Mips::BEQ:
+    // beq $zero, $zero, $L2 => b $L2
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    return (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1) &&
+            printAlias("b", MI, 2, OS)) ||
+           (isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS));
+  case Mips::BEQ64:
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    return isReg<Mips::ZERO_64>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
+  case Mips::BNE:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    return isReg<Mips::ZERO>(MI, 1) && printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BNE64:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    return isReg<Mips::ZERO_64>(MI, 1) && printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BGEZAL:
+    // bgezal $zero, $L1 => bal $L1
+    return isReg<Mips::ZERO>(MI, 0) && printAlias("bal", MI, 1, OS);
+  case Mips::BC1T:
+    // bc1t $fcc0, $L1 => bc1t $L1
+    return isReg<Mips::FCC0>(MI, 0) && printAlias("bc1t", MI, 1, OS);
+  case Mips::BC1F:
+    // bc1f $fcc0, $L1 => bc1f $L1
+    return isReg<Mips::FCC0>(MI, 0) && printAlias("bc1f", MI, 1, OS);
+  case Mips::JALR:
+    // jalr $ra, $r1 => jalr $r1
+    return isReg<Mips::RA>(MI, 0) && printAlias("jalr", MI, 1, OS);
+  case Mips::JALR64:
+    // jalr $ra, $r1 => jalr $r1
+    return isReg<Mips::RA_64>(MI, 0) && printAlias("jalr", MI, 1, OS);
+  case Mips::NOR:
+  case Mips::NOR_MM:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    return isReg<Mips::ZERO>(MI, 2) && printAlias("not", MI, 0, 1, OS);
+  case Mips::NOR64:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    return isReg<Mips::ZERO_64>(MI, 2) && printAlias("not", MI, 0, 1, OS);
+  case Mips::OR:
+    // or $r0, $r1, $zero => move $r0, $r1
+    return isReg<Mips::ZERO>(MI, 2) && printAlias("move", MI, 0, 1, OS);
+  default: return false;
+  }
+}
+
+void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != 0) O << ", ";
+    if (MI->getOperand(i).isReg())
+      printRegName(O, MI->getOperand(i).getReg());
+    else
+      printUnsignedImm(MI, i, O);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
new file mode 100644
index 0000000..550a0f10
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -0,0 +1,113 @@
+//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTPRINTER_H
+#define MIPSINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+// These enumeration declarations were originally in MipsInstrInfo.h but
+// had to be moved here to avoid circular dependencies between
+// LLVMMipsCodeGen and LLVMMipsAsmPrinter.
+namespace Mips {
+// Mips Branch Codes
+enum FPBranchCode {
+  BRANCH_F,
+  BRANCH_T,
+  BRANCH_FL,
+  BRANCH_TL,
+  BRANCH_INVALID
+};
+
+// Mips Condition Codes
+enum CondCode {
+  // To be used with float branch True
+  FCOND_F,
+  FCOND_UN,
+  FCOND_OEQ,
+  FCOND_UEQ,
+  FCOND_OLT,
+  FCOND_ULT,
+  FCOND_OLE,
+  FCOND_ULE,
+  FCOND_SF,
+  FCOND_NGLE,
+  FCOND_SEQ,
+  FCOND_NGL,
+  FCOND_LT,
+  FCOND_NGE,
+  FCOND_LE,
+  FCOND_NGT,
+
+  // To be used with float branch False
+  // This conditions have the same mnemonic as the
+  // above ones, but are used with a branch False;
+  FCOND_T,
+  FCOND_OR,
+  FCOND_UNE,
+  FCOND_ONE,
+  FCOND_UGE,
+  FCOND_OGE,
+  FCOND_UGT,
+  FCOND_OGT,
+  FCOND_ST,
+  FCOND_GLE,
+  FCOND_SNE,
+  FCOND_GL,
+  FCOND_NLT,
+  FCOND_GE,
+  FCOND_NLE,
+  FCOND_GT
+};
+
+const char *MipsFCCToString(Mips::CondCode CC);
+} // end namespace Mips
+
+class TargetMachine;
+
+class MipsInstPrinter : public MCInstPrinter {
+public:
+  MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+
+private:
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUnsignedImm(const MCInst *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm8(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
+
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
+                  raw_ostream &OS);
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0,
+                  unsigned OpNo1, raw_ostream &OS);
+  bool printAlias(const MCInst &MI, raw_ostream &OS);
+  void printSaveRestore(const MCInst *MI, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
new file mode 100644
index 0000000..5b0f950
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -0,0 +1,66 @@
+//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIFlagsSection.h"
+
+using namespace llvm;
+
+uint8_t MipsABIFlagsSection::getFpABIValue() {
+  switch (FpABI) {
+  case FpABIKind::ANY:
+    return Val_GNU_MIPS_ABI_FP_ANY;
+  case FpABIKind::XX:
+    return Val_GNU_MIPS_ABI_FP_XX;
+  case FpABIKind::S32:
+    return Val_GNU_MIPS_ABI_FP_DOUBLE;
+  case FpABIKind::S64:
+    if (Is32BitABI)
+      return OddSPReg ? Val_GNU_MIPS_ABI_FP_64 : Val_GNU_MIPS_ABI_FP_64A;
+    return Val_GNU_MIPS_ABI_FP_DOUBLE;
+  }
+
+  llvm_unreachable("unexpected fp abi value");
+}
+
+StringRef MipsABIFlagsSection::getFpABIString(FpABIKind Value) {
+  switch (Value) {
+  case FpABIKind::XX:
+    return "xx";
+  case FpABIKind::S32:
+    return "32";
+  case FpABIKind::S64:
+    return "64";
+  default:
+    llvm_unreachable("unsupported fp abi value");
+  }
+}
+
+uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
+  if (FpABI == FpABIKind::XX)
+    return (uint8_t)AFL_REG_32;
+  return (uint8_t)CPR1Size;
+}
+
+namespace llvm {
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
+  // Write out a Elf_Internal_ABIFlags_v0 struct
+  OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2);         // version
+  OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1);        // isa_level
+  OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1);     // isa_rev
+  OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1);         // gpr_size
+  OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1);        // cpr1_size
+  OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1);        // cpr2_size
+  OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1);           // fp_abi
+  OS.EmitIntValue(ABIFlagsSection.getISAExtensionSetValue(), 4); // isa_ext
+  OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4);          // ases
+  OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4);          // flags1
+  OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4);          // flags2
+  return OS;
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
new file mode 100644
index 0000000..ea5bc12
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -0,0 +1,238 @@
+//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSABIFLAGSSECTION_H
+#define MIPSABIFLAGSSECTION_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class MCStreamer;
+
+struct MipsABIFlagsSection {
+  // Values for the xxx_size bytes of an ABI flags structure.
+  enum AFL_REG {
+    AFL_REG_NONE = 0x00, // No registers.
+    AFL_REG_32 = 0x01,   // 32-bit registers.
+    AFL_REG_64 = 0x02,   // 64-bit registers.
+    AFL_REG_128 = 0x03   // 128-bit registers.
+  };
+
+  // Masks for the ases word of an ABI flags structure.
+  enum AFL_ASE {
+    AFL_ASE_DSP = 0x00000001,       // DSP ASE.
+    AFL_ASE_DSPR2 = 0x00000002,     // DSP R2 ASE.
+    AFL_ASE_EVA = 0x00000004,       // Enhanced VA Scheme.
+    AFL_ASE_MCU = 0x00000008,       // MCU (MicroController) ASE.
+    AFL_ASE_MDMX = 0x00000010,      // MDMX ASE.
+    AFL_ASE_MIPS3D = 0x00000020,    // MIPS-3D ASE.
+    AFL_ASE_MT = 0x00000040,        // MT ASE.
+    AFL_ASE_SMARTMIPS = 0x00000080, // SmartMIPS ASE.
+    AFL_ASE_VIRT = 0x00000100,      // VZ ASE.
+    AFL_ASE_MSA = 0x00000200,       // MSA ASE.
+    AFL_ASE_MIPS16 = 0x00000400,    // MIPS16 ASE.
+    AFL_ASE_MICROMIPS = 0x00000800, // MICROMIPS ASE.
+    AFL_ASE_XPA = 0x00001000        // XPA ASE.
+  };
+
+  // Values for the isa_ext word of an ABI flags structure.
+  enum AFL_EXT {
+    AFL_EXT_XLR = 1,          // RMI Xlr instruction.
+    AFL_EXT_OCTEON2 = 2,      // Cavium Networks Octeon2.
+    AFL_EXT_OCTEONP = 3,      // Cavium Networks OcteonP.
+    AFL_EXT_LOONGSON_3A = 4,  // Loongson 3A.
+    AFL_EXT_OCTEON = 5,       // Cavium Networks Octeon.
+    AFL_EXT_5900 = 6,         // MIPS R5900 instruction.
+    AFL_EXT_4650 = 7,         // MIPS R4650 instruction.
+    AFL_EXT_4010 = 8,         // LSI R4010 instruction.
+    AFL_EXT_4100 = 9,         // NEC VR4100 instruction.
+    AFL_EXT_3900 = 10,        // Toshiba R3900 instruction.
+    AFL_EXT_10000 = 11,       // MIPS R10000 instruction.
+    AFL_EXT_SB1 = 12,         // Broadcom SB-1 instruction.
+    AFL_EXT_4111 = 13,        // NEC VR4111/VR4181 instruction.
+    AFL_EXT_4120 = 14,        // NEC VR4120 instruction.
+    AFL_EXT_5400 = 15,        // NEC VR5400 instruction.
+    AFL_EXT_5500 = 16,        // NEC VR5500 instruction.
+    AFL_EXT_LOONGSON_2E = 17, // ST Microelectronics Loongson 2E.
+    AFL_EXT_LOONGSON_2F = 18  // ST Microelectronics Loongson 2F.
+  };
+
+  // Values for the fp_abi word of an ABI flags structure.
+  enum Val_GNU_MIPS_ABI {
+    Val_GNU_MIPS_ABI_FP_ANY = 0,
+    Val_GNU_MIPS_ABI_FP_DOUBLE = 1,
+    Val_GNU_MIPS_ABI_FP_XX = 5,
+    Val_GNU_MIPS_ABI_FP_64 = 6,
+    Val_GNU_MIPS_ABI_FP_64A = 7
+  };
+
+  enum AFL_FLAGS1 {
+    AFL_FLAGS1_ODDSPREG = 1
+  };
+
+  // Internal representation of the values used in .module fp=value
+  enum class FpABIKind { ANY, XX, S32, S64 };
+
+  // Version of flags structure.
+  uint16_t Version;
+  // The level of the ISA: 1-5, 32, 64.
+  uint8_t ISALevel;
+  // The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
+  uint8_t ISARevision;
+  // The size of general purpose registers.
+  AFL_REG GPRSize;
+  // The size of co-processor 1 registers.
+  AFL_REG CPR1Size;
+  // The size of co-processor 2 registers.
+  AFL_REG CPR2Size;
+  // Processor-specific extension.
+  uint32_t ISAExtensionSet;
+  // Mask of ASEs used.
+  uint32_t ASESet;
+
+  bool OddSPReg;
+
+  bool Is32BitABI;
+
+protected:
+  // The floating-point ABI.
+  FpABIKind FpABI;
+
+public:
+  MipsABIFlagsSection()
+      : Version(0), ISALevel(0), ISARevision(0), GPRSize(AFL_REG_NONE),
+        CPR1Size(AFL_REG_NONE), CPR2Size(AFL_REG_NONE), ISAExtensionSet(0),
+        ASESet(0), OddSPReg(false), Is32BitABI(false), FpABI(FpABIKind::ANY) {}
+
+  uint16_t getVersionValue() { return (uint16_t)Version; }
+  uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
+  uint8_t getISARevisionValue() { return (uint8_t)ISARevision; }
+  uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; }
+  uint8_t getCPR1SizeValue();
+  uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; }
+  uint8_t getFpABIValue();
+  uint32_t getISAExtensionSetValue() { return (uint32_t)ISAExtensionSet; }
+  uint32_t getASESetValue() { return (uint32_t)ASESet; }
+
+  uint32_t getFlags1Value() {
+    uint32_t Value = 0;
+
+    if (OddSPReg)
+      Value |= (uint32_t)AFL_FLAGS1_ODDSPREG;
+
+    return Value;
+  }
+
+  uint32_t getFlags2Value() { return 0; }
+
+  FpABIKind getFpABI() { return FpABI; }
+  void setFpABI(FpABIKind Value, bool IsABI32Bit) {
+    FpABI = Value;
+    Is32BitABI = IsABI32Bit;
+  }
+  StringRef getFpABIString(FpABIKind Value);
+
+  template <class PredicateLibrary>
+  void setISALevelAndRevisionFromPredicates(const PredicateLibrary &P) {
+    if (P.hasMips64()) {
+      ISALevel = 64;
+      if (P.hasMips64r6())
+        ISARevision = 6;
+      else if (P.hasMips64r2())
+        ISARevision = 2;
+      else
+        ISARevision = 1;
+    } else if (P.hasMips32()) {
+      ISALevel = 32;
+      if (P.hasMips32r6())
+        ISARevision = 6;
+      else if (P.hasMips32r2())
+        ISARevision = 2;
+      else
+        ISARevision = 1;
+    } else {
+      ISARevision = 0;
+      if (P.hasMips5())
+        ISALevel = 5;
+      else if (P.hasMips4())
+        ISALevel = 4;
+      else if (P.hasMips3())
+        ISALevel = 3;
+      else if (P.hasMips2())
+        ISALevel = 2;
+      else if (P.hasMips1())
+        ISALevel = 1;
+      else
+        llvm_unreachable("Unknown ISA level!");
+    }
+  }
+
+  template <class PredicateLibrary>
+  void setGPRSizeFromPredicates(const PredicateLibrary &P) {
+    GPRSize = P.isGP64bit() ? AFL_REG_64 : AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setCPR1SizeFromPredicates(const PredicateLibrary &P) {
+    if (P.abiUsesSoftFloat())
+      CPR1Size = AFL_REG_NONE;
+    else if (P.hasMSA())
+      CPR1Size = AFL_REG_128;
+    else
+      CPR1Size = P.isFP64bit() ? AFL_REG_64 : AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setASESetFromPredicates(const PredicateLibrary &P) {
+    ASESet = 0;
+    if (P.hasDSP())
+      ASESet |= AFL_ASE_DSP;
+    if (P.hasDSPR2())
+      ASESet |= AFL_ASE_DSPR2;
+    if (P.hasMSA())
+      ASESet |= AFL_ASE_MSA;
+    if (P.inMicroMipsMode())
+      ASESet |= AFL_ASE_MICROMIPS;
+    if (P.inMips16Mode())
+      ASESet |= AFL_ASE_MIPS16;
+  }
+
+  template <class PredicateLibrary>
+  void setFpAbiFromPredicates(const PredicateLibrary &P) {
+    Is32BitABI = P.isABI_O32();
+
+    FpABI = FpABIKind::ANY;
+    if (P.isABI_N32() || P.isABI_N64())
+      FpABI = FpABIKind::S64;
+    else if (P.isABI_O32()) {
+      if (P.isABI_FPXX())
+        FpABI = FpABIKind::XX;
+      else if (P.isFP64bit())
+        FpABI = FpABIKind::S64;
+      else
+        FpABI = FpABIKind::S32;
+    }
+  }
+
+  template <class PredicateLibrary>
+  void setAllFromPredicates(const PredicateLibrary &P) {
+    setISALevelAndRevisionFromPredicates(P);
+    setGPRSizeFromPredicates(P);
+    setCPR1SizeFromPredicates(P);
+    setASESetFromPredicates(P);
+    setFpAbiFromPredicates(P);
+    OddSPReg = P.useOddSPReg();
+  }
+};
+
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
new file mode 100644
index 0000000..d8e6128
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -0,0 +1,425 @@
+//===-- MipsAsmBackend.cpp - Mips Asm Backend  ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsAsmBackend.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Prepare value for the target space for it
+static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx = nullptr) {
+
+  unsigned Kind = Fixup.getKind();
+
+  // Add/subtract and shift
+  switch (Kind) {
+  default:
+    return 0;
+  case FK_Data_2:
+  case FK_GPRel_4:
+  case FK_Data_4:
+  case FK_Data_8:
+  case Mips::fixup_Mips_LO16:
+  case Mips::fixup_Mips_GPREL16:
+  case Mips::fixup_Mips_GPOFF_HI:
+  case Mips::fixup_Mips_GPOFF_LO:
+  case Mips::fixup_Mips_GOT_PAGE:
+  case Mips::fixup_Mips_GOT_OFST:
+  case Mips::fixup_Mips_GOT_DISP:
+  case Mips::fixup_Mips_GOT_LO16:
+  case Mips::fixup_Mips_CALL_LO16:
+  case Mips::fixup_MICROMIPS_LO16:
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+  case Mips::fixup_MIPS_PCLO16:
+    break;
+  case Mips::fixup_Mips_PC16:
+    // So far we are only using this type for branches.
+    // For branches we start 1 instruction after the branch
+    // so the displacement will be one instruction size less.
+    Value -= 4;
+    // The displacement is then divided by 4 to give us an 18 bit
+    // address range. Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 16-bit signed immediate.
+    if (!isIntN(16, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
+    break;
+  case Mips::fixup_MIPS_PC19_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 19-bit signed immediate.
+    if (!isIntN(19, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC19 fixup");
+    break;
+  case Mips::fixup_Mips_26:
+    // So far we are only using this type for jumps.
+    // The displacement is then divided by 4 to give us an 28 bit
+    // address range.
+    Value >>= 2;
+    break;
+  case Mips::fixup_Mips_HI16:
+  case Mips::fixup_Mips_GOT_Local:
+  case Mips::fixup_Mips_GOT_HI16:
+  case Mips::fixup_Mips_CALL_HI16:
+  case Mips::fixup_MICROMIPS_HI16:
+  case Mips::fixup_MIPS_PCHI16:
+    // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
+    Value = ((Value + 0x8000) >> 16) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHER:
+    // Get the 3rd 16-bits.
+    Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHEST:
+    // Get the 4th 16-bits.
+    Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
+    break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Value >>= 1;
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 16-bit signed immediate.
+    if (!isIntN(16, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
+    break;
+  case Mips::fixup_MIPS_PC18_S3:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 8;
+    // We now check if Value can be encoded as a 18-bit signed immediate.
+    if (!isIntN(18, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC18 fixup");
+    break;
+  case Mips::fixup_MIPS_PC21_S2:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 21-bit signed immediate.
+    if (!isIntN(21, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC21 fixup");
+    break;
+  case Mips::fixup_MIPS_PC26_S2:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 26-bit signed immediate.
+    if (!isIntN(26, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC26 fixup");
+    break;
+  }
+
+  return Value;
+}
+
+MCObjectWriter *MipsAsmBackend::createObjectWriter(raw_ostream &OS) const {
+  return createMipsELFObjectWriter(OS,
+    MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
+}
+
+// Little-endian fixup data byte ordering:
+//   mips32r2:   a | b | x | x
+//   microMIPS:  x | x | a | b
+
+static bool needsMMLEByteOrder(unsigned Kind) {
+  return Kind >= Mips::fixup_MICROMIPS_26_S1 &&
+         Kind < Mips::LastTargetFixupKind;
+}
+
+// Calculate index for microMIPS specific little endian byte order
+static unsigned calculateMMLEIndex(unsigned i) {
+  assert(i <= 3 && "Index out of range!");
+
+  return (1 - i / 2) * 2 + i % 2;
+}
+
+/// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
+/// data fragment, at the offset specified by the fixup and following the
+/// fixup kind as appropriate.
+void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                unsigned DataSize, uint64_t Value,
+                                bool IsPCRel) const {
+  MCFixupKind Kind = Fixup.getKind();
+  Value = adjustFixupValue(Fixup, Value);
+
+  if (!Value)
+    return; // Doesn't change encoding.
+
+  // Where do we start in the object
+  unsigned Offset = Fixup.getOffset();
+  // Number of bytes we need to fixup
+  unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+  // Used to point to big endian bytes
+  unsigned FullSize;
+
+  switch ((unsigned)Kind) {
+  case FK_Data_2:
+  case Mips::fixup_Mips_16:
+    FullSize = 2;
+    break;
+  case FK_Data_8:
+  case Mips::fixup_Mips_64:
+    FullSize = 8;
+    break;
+  case FK_Data_4:
+  default:
+    FullSize = 4;
+    break;
+  }
+
+  // Grab current value, if any, from bits.
+  uint64_t CurVal = 0;
+
+  bool microMipsLEByteOrder = needsMMLEByteOrder((unsigned) Kind);
+
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
+                                                    : i)
+                            : (FullSize - 1 - i);
+    CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
+  }
+
+  uint64_t Mask = ((uint64_t)(-1) >>
+                    (64 - getFixupKindInfo(Kind).TargetSize));
+  CurVal |= Value & Mask;
+
+  // Write out the fixed up bytes back to the code/data bits.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
+                                                    : i)
+                            : (FullSize - 1 - i);
+    Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
+  }
+}
+
+const MCFixupKindInfo &MipsAsmBackend::
+getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
+    // This table *must* be in same the order of fixup_* kinds in
+    // MipsFixupKinds.h.
+    //
+    // name                    offset  bits  flags
+    { "fixup_Mips_16",           0,     16,   0 },
+    { "fixup_Mips_32",           0,     32,   0 },
+    { "fixup_Mips_REL32",        0,     32,   0 },
+    { "fixup_Mips_26",           0,     26,   0 },
+    { "fixup_Mips_HI16",         0,     16,   0 },
+    { "fixup_Mips_LO16",         0,     16,   0 },
+    { "fixup_Mips_GPREL16",      0,     16,   0 },
+    { "fixup_Mips_LITERAL",      0,     16,   0 },
+    { "fixup_Mips_GOT_Global",   0,     16,   0 },
+    { "fixup_Mips_GOT_Local",    0,     16,   0 },
+    { "fixup_Mips_PC16",         0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_CALL16",       0,     16,   0 },
+    { "fixup_Mips_GPREL32",      0,     32,   0 },
+    { "fixup_Mips_SHIFT5",       6,      5,   0 },
+    { "fixup_Mips_SHIFT6",       6,      5,   0 },
+    { "fixup_Mips_64",           0,     64,   0 },
+    { "fixup_Mips_TLSGD",        0,     16,   0 },
+    { "fixup_Mips_GOTTPREL",     0,     16,   0 },
+    { "fixup_Mips_TPREL_HI",     0,     16,   0 },
+    { "fixup_Mips_TPREL_LO",     0,     16,   0 },
+    { "fixup_Mips_TLSLDM",       0,     16,   0 },
+    { "fixup_Mips_DTPREL_HI",    0,     16,   0 },
+    { "fixup_Mips_DTPREL_LO",    0,     16,   0 },
+    { "fixup_Mips_Branch_PCRel", 0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_GPOFF_HI",     0,     16,   0 },
+    { "fixup_Mips_GPOFF_LO",     0,     16,   0 },
+    { "fixup_Mips_GOT_PAGE",     0,     16,   0 },
+    { "fixup_Mips_GOT_OFST",     0,     16,   0 },
+    { "fixup_Mips_GOT_DISP",     0,     16,   0 },
+    { "fixup_Mips_HIGHER",       0,     16,   0 },
+    { "fixup_Mips_HIGHEST",      0,     16,   0 },
+    { "fixup_Mips_GOT_HI16",     0,     16,   0 },
+    { "fixup_Mips_GOT_LO16",     0,     16,   0 },
+    { "fixup_Mips_CALL_HI16",    0,     16,   0 },
+    { "fixup_Mips_CALL_LO16",    0,     16,   0 },
+    { "fixup_Mips_PC18_S3",      0,     18,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC19_S2",      0,     19,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC21_S2",      0,     21,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC26_S2",      0,     26,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCHI16",       0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCLO16",       0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_26_S1",   0,     26,   0 },
+    { "fixup_MICROMIPS_HI16",    0,     16,   0 },
+    { "fixup_MICROMIPS_LO16",    0,     16,   0 },
+    { "fixup_MICROMIPS_GOT16",   0,     16,   0 },
+    { "fixup_MICROMIPS_PC16_S1", 0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_CALL16",  0,     16,   0 },
+    { "fixup_MICROMIPS_GOT_DISP",        0,     16,   0 },
+    { "fixup_MICROMIPS_GOT_PAGE",        0,     16,   0 },
+    { "fixup_MICROMIPS_GOT_OFST",        0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_GD",          0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_LDM",         0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_HI16",  0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_LO16",  0,     16,   0 }
+  };
+
+  const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = {
+    // This table *must* be in same the order of fixup_* kinds in
+    // MipsFixupKinds.h.
+    //
+    // name                    offset  bits  flags
+    { "fixup_Mips_16",          16,     16,   0 },
+    { "fixup_Mips_32",           0,     32,   0 },
+    { "fixup_Mips_REL32",        0,     32,   0 },
+    { "fixup_Mips_26",           6,     26,   0 },
+    { "fixup_Mips_HI16",        16,     16,   0 },
+    { "fixup_Mips_LO16",        16,     16,   0 },
+    { "fixup_Mips_GPREL16",     16,     16,   0 },
+    { "fixup_Mips_LITERAL",     16,     16,   0 },
+    { "fixup_Mips_GOT_Global",  16,     16,   0 },
+    { "fixup_Mips_GOT_Local",   16,     16,   0 },
+    { "fixup_Mips_PC16",        16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_CALL16",      16,     16,   0 },
+    { "fixup_Mips_GPREL32",      0,     32,   0 },
+    { "fixup_Mips_SHIFT5",      21,      5,   0 },
+    { "fixup_Mips_SHIFT6",      21,      5,   0 },
+    { "fixup_Mips_64",           0,     64,   0 },
+    { "fixup_Mips_TLSGD",       16,     16,   0 },
+    { "fixup_Mips_GOTTPREL",    16,     16,   0 },
+    { "fixup_Mips_TPREL_HI",    16,     16,   0 },
+    { "fixup_Mips_TPREL_LO",    16,     16,   0 },
+    { "fixup_Mips_TLSLDM",      16,     16,   0 },
+    { "fixup_Mips_DTPREL_HI",   16,     16,   0 },
+    { "fixup_Mips_DTPREL_LO",   16,     16,   0 },
+    { "fixup_Mips_Branch_PCRel",16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_GPOFF_HI",    16,     16,   0 },
+    { "fixup_Mips_GPOFF_LO",    16,     16,   0 },
+    { "fixup_Mips_GOT_PAGE",    16,     16,   0 },
+    { "fixup_Mips_GOT_OFST",    16,     16,   0 },
+    { "fixup_Mips_GOT_DISP",    16,     16,   0 },
+    { "fixup_Mips_HIGHER",      16,     16,   0 },
+    { "fixup_Mips_HIGHEST",     16,     16,   0 },
+    { "fixup_Mips_GOT_HI16",    16,     16,   0 },
+    { "fixup_Mips_GOT_LO16",    16,     16,   0 },
+    { "fixup_Mips_CALL_HI16",   16,     16,   0 },
+    { "fixup_Mips_CALL_LO16",   16,     16,   0 },
+    { "fixup_Mips_PC18_S3",     14,     18,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC19_S2",     13,     19,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC21_S2",     11,     21,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC26_S2",      6,     26,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCHI16",      16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCLO16",      16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_26_S1",   6,     26,   0 },
+    { "fixup_MICROMIPS_HI16",   16,     16,   0 },
+    { "fixup_MICROMIPS_LO16",   16,     16,   0 },
+    { "fixup_MICROMIPS_GOT16",  16,     16,   0 },
+    { "fixup_MICROMIPS_PC16_S1",16,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_CALL16", 16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_DISP",        16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_PAGE",        16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_OFST",        16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_GD",          16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_LDM",         16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_HI16", 16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_LO16", 16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_HI16",  16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_LO16",  16,     16,   0 }
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+          "Invalid kind!");
+
+  if (IsLittle)
+    return LittleEndianInfos[Kind - FirstTargetFixupKind];
+  return BigEndianInfos[Kind - FirstTargetFixupKind];
+}
+
+/// WriteNopData - Write an (optimal) nop sequence of Count bytes
+/// to the given output. If the target cannot generate such a sequence,
+/// it should return an error.
+///
+/// \return - True on success.
+bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // Check for a less than instruction size number of bytes
+  // FIXME: 16 bit instructions are not handled yet here.
+  // We shouldn't be using a hard coded number for instruction size.
+  if (Count % 4) return false;
+
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->Write32(0);
+  return true;
+}
+
+/// processFixupValue - Target hook to process the literal value of a fixup
+/// if necessary.
+void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                       const MCAsmLayout &Layout,
+                                       const MCFixup &Fixup,
+                                       const MCFragment *DF,
+                                       const MCValue &Target,
+                                       uint64_t &Value,
+                                       bool &IsResolved) {
+  // At this point we'll ignore the value returned by adjustFixupValue as
+  // we are only checking if the fixup can be applied correctly. We have
+  // access to MCContext from here which allows us to report a fatal error
+  // with *possibly* a source code location.
+  (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
+// MCAsmBackend
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/true, /*Is64Bit*/false);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/false, /*Is64Bit*/false);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/true, /*Is64Bit*/true);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/false, /*Is64Bit*/true);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
new file mode 100644
index 0000000..d5c3dbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -0,0 +1,93 @@
+//===-- MipsAsmBackend.h - Mips Asm Backend  ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef MIPSASMBACKEND_H
+#define MIPSASMBACKEND_H
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/Triple.h"
+
+namespace llvm {
+
+class MCAssembler;
+struct MCFixupKindInfo;
+class Target;
+class MCObjectWriter;
+
+class MipsAsmBackend : public MCAsmBackend {
+  Triple::OSType OSType;
+  bool IsLittle; // Big or little endian
+  bool Is64Bit;  // 32 or 64 bit words
+
+public:
+  MipsAsmBackend(const Target &T, Triple::OSType _OSType, bool _isLittle,
+                 bool _is64Bit)
+      : MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle),
+        Is64Bit(_is64Bit) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+  unsigned getNumFixupKinds() const override {
+    return Mips::NumTargetFixupKinds;
+  }
+
+  /// @name Target Relaxation Interfaces
+  /// @{
+
+  /// MayNeedRelaxation - Check whether the given instruction may need
+  /// relaxation.
+  ///
+  /// \param Inst - The instruction to test.
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    return false;
+  }
+
+  /// fixupNeedsRelaxation - Target specific predicate for whether a given
+  /// fixup requires the associated instruction to be relaxed.
+   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                             const MCRelaxableFragment *DF,
+                             const MCAsmLayout &Layout) const override {
+    // FIXME.
+    llvm_unreachable("RelaxInstruction() unimplemented");
+    return false;
+  }
+
+  /// RelaxInstruction - Relax the instruction in the given fragment
+  /// to the next wider instruction.
+  ///
+  /// \param Inst - The instruction to relax, which may be the same
+  /// as the output.
+  /// \param [out] Res On return, the relaxed instruction.
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+
+  /// @}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+}; // class MipsAsmBackend
+
+} // namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
new file mode 100644
index 0000000..d2323dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -0,0 +1,125 @@
+//===-- MipsBaseInfo.h - Top level definitions for MIPS MC ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Mips target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MIPSBASEINFO_H
+#define MIPSBASEINFO_H
+
+#include "MipsFixupKinds.h"
+#include "MipsMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MipsII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MipsII {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // Mips Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT16 - Represents the offset into the global offset table at which
+    /// the address the relocation entry symbol resides during execution.
+    MO_GOT16,
+    MO_GOT,
+
+    /// MO_GOT_CALL - Represents the offset into the global offset table at
+    /// which the address of a call site relocation entry symbol resides
+    /// during execution. This is different from the above since this flag
+    /// can only be present in call instructions.
+    MO_GOT_CALL,
+
+    /// MO_GPREL - Represents the offset from the current gp value to be used
+    /// for the relocatable object file being produced.
+    MO_GPREL,
+
+    /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+    /// address.
+    MO_ABS_HI,
+    MO_ABS_LO,
+
+    /// MO_TLSGD - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (General
+    // Dynamic TLS).
+    MO_TLSGD,
+
+    /// MO_TLSLDM - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (Local
+    // Dynamic TLS).
+    MO_TLSLDM,
+    MO_DTPREL_HI,
+    MO_DTPREL_LO,
+
+    /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
+    // Exec TLS).
+    MO_GOTTPREL,
+
+    /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
+    // the thread pointer (Local Exec TLS).
+    MO_TPREL_HI,
+    MO_TPREL_LO,
+
+    // N32/64 Flags.
+    MO_GPOFF_HI,
+    MO_GPOFF_LO,
+    MO_GOT_DISP,
+    MO_GOT_PAGE,
+    MO_GOT_OFST,
+
+    /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a
+    /// 64-bit symbol address.
+    MO_HIGHER,
+    MO_HIGHEST,
+
+    /// MO_GOT_HI16/LO16, MO_CALL_HI16/LO16 - Relocations used for large GOTs.
+    MO_GOT_HI16,
+    MO_GOT_LO16,
+    MO_CALL_HI16,
+    MO_CALL_LO16
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for
+    // Mips instructions.
+    //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo   = 0,
+
+    /// FrmR - This form is for instructions of the format R.
+    FrmR  = 1,
+    /// FrmI - This form is for instructions of the format I.
+    FrmI  = 2,
+    /// FrmJ - This form is for instructions of the format J.
+    FrmJ  = 3,
+    /// FrmFR - This form is for instructions of the format FR.
+    FrmFR = 4,
+    /// FrmFI - This form is for instructions of the format FI.
+    FrmFI = 5,
+    /// FrmOther - This form is for instructions that have no specific format.
+    FrmOther = 6,
+
+    FormMask = 15
+  };
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
new file mode 100644
index 0000000..4ea7846
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -0,0 +1,263 @@
+//===-- MipsELFObjectWriter.cpp - Mips ELF Writer -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <list>
+
+using namespace llvm;
+
+namespace {
+  class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+                        bool _isN64, bool IsLittleEndian);
+
+    virtual ~MipsELFObjectWriter();
+
+    unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsPCRel) const override;
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
+  };
+}
+
+MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+                                         bool _isN64, bool IsLittleEndian)
+  : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
+                            /*HasRelocationAddend*/ (_isN64) ? true : false,
+                            /*IsN64*/ _isN64) {}
+
+MipsELFObjectWriter::~MipsELFObjectWriter() {}
+
+unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
+  // determine the type of the relocation
+  unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+  unsigned Kind = (unsigned)Fixup.getKind();
+
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_Data_4:
+    Type = ELF::R_MIPS_32;
+    break;
+  case FK_Data_8:
+    Type = ELF::R_MIPS_64;
+    break;
+  case FK_GPRel_4:
+    if (isN64()) {
+      Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type);
+      Type = setRType2((unsigned)ELF::R_MIPS_64, Type);
+      Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type);
+    }
+    else
+      Type = ELF::R_MIPS_GPREL32;
+    break;
+  case Mips::fixup_Mips_GPREL16:
+    Type = ELF::R_MIPS_GPREL16;
+    break;
+  case Mips::fixup_Mips_26:
+    Type = ELF::R_MIPS_26;
+    break;
+  case Mips::fixup_Mips_CALL16:
+    Type = ELF::R_MIPS_CALL16;
+    break;
+  case Mips::fixup_Mips_GOT_Global:
+  case Mips::fixup_Mips_GOT_Local:
+    Type = ELF::R_MIPS_GOT16;
+    break;
+  case Mips::fixup_Mips_HI16:
+    Type = ELF::R_MIPS_HI16;
+    break;
+  case Mips::fixup_Mips_LO16:
+    Type = ELF::R_MIPS_LO16;
+    break;
+  case Mips::fixup_Mips_TLSGD:
+    Type = ELF::R_MIPS_TLS_GD;
+    break;
+  case Mips::fixup_Mips_GOTTPREL:
+    Type = ELF::R_MIPS_TLS_GOTTPREL;
+    break;
+  case Mips::fixup_Mips_TPREL_HI:
+    Type = ELF::R_MIPS_TLS_TPREL_HI16;
+    break;
+  case Mips::fixup_Mips_TPREL_LO:
+    Type = ELF::R_MIPS_TLS_TPREL_LO16;
+    break;
+  case Mips::fixup_Mips_TLSLDM:
+    Type = ELF::R_MIPS_TLS_LDM;
+    break;
+  case Mips::fixup_Mips_DTPREL_HI:
+    Type = ELF::R_MIPS_TLS_DTPREL_HI16;
+    break;
+  case Mips::fixup_Mips_DTPREL_LO:
+    Type = ELF::R_MIPS_TLS_DTPREL_LO16;
+    break;
+  case Mips::fixup_Mips_Branch_PCRel:
+  case Mips::fixup_Mips_PC16:
+    Type = ELF::R_MIPS_PC16;
+    break;
+  case Mips::fixup_Mips_GOT_PAGE:
+    Type = ELF::R_MIPS_GOT_PAGE;
+    break;
+  case Mips::fixup_Mips_GOT_OFST:
+    Type = ELF::R_MIPS_GOT_OFST;
+    break;
+  case Mips::fixup_Mips_GOT_DISP:
+    Type = ELF::R_MIPS_GOT_DISP;
+    break;
+  case Mips::fixup_Mips_GPOFF_HI:
+    Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+    Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+    Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
+    break;
+  case Mips::fixup_Mips_GPOFF_LO:
+    Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+    Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+    Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
+    break;
+  case Mips::fixup_Mips_HIGHER:
+    Type = ELF::R_MIPS_HIGHER;
+    break;
+  case Mips::fixup_Mips_HIGHEST:
+    Type = ELF::R_MIPS_HIGHEST;
+    break;
+  case Mips::fixup_Mips_GOT_HI16:
+    Type = ELF::R_MIPS_GOT_HI16;
+    break;
+  case Mips::fixup_Mips_GOT_LO16:
+    Type = ELF::R_MIPS_GOT_LO16;
+    break;
+  case Mips::fixup_Mips_CALL_HI16:
+    Type = ELF::R_MIPS_CALL_HI16;
+    break;
+  case Mips::fixup_Mips_CALL_LO16:
+    Type = ELF::R_MIPS_CALL_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Type = ELF::R_MICROMIPS_26_S1;
+    break;
+  case Mips::fixup_MICROMIPS_HI16:
+    Type = ELF::R_MICROMIPS_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_LO16:
+    Type = ELF::R_MICROMIPS_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_GOT16:
+    Type = ELF::R_MICROMIPS_GOT16;
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Type = ELF::R_MICROMIPS_PC16_S1;
+    break;
+  case Mips::fixup_MICROMIPS_CALL16:
+    Type = ELF::R_MICROMIPS_CALL16;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+    Type = ELF::R_MICROMIPS_GOT_DISP;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+    Type = ELF::R_MICROMIPS_GOT_PAGE;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+    Type = ELF::R_MICROMIPS_GOT_OFST;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_GD:
+    Type = ELF::R_MICROMIPS_TLS_GD;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_LDM:
+    Type = ELF::R_MICROMIPS_TLS_LDM;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+    Type = ELF::R_MICROMIPS_TLS_DTPREL_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+    Type = ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+    Type = ELF::R_MICROMIPS_TLS_TPREL_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+    Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
+    break;
+  case Mips::fixup_MIPS_PC19_S2:
+    Type = ELF::R_MIPS_PC19_S2;
+    break;
+  case Mips::fixup_MIPS_PC18_S3:
+    Type = ELF::R_MIPS_PC18_S3;
+    break;
+  case Mips::fixup_MIPS_PC21_S2:
+    Type = ELF::R_MIPS_PC21_S2;
+    break;
+  case Mips::fixup_MIPS_PC26_S2:
+    Type = ELF::R_MIPS_PC26_S2;
+    break;
+  case Mips::fixup_MIPS_PCHI16:
+    Type = ELF::R_MIPS_PCHI16;
+    break;
+  case Mips::fixup_MIPS_PCLO16:
+    Type = ELF::R_MIPS_PCLO16;
+    break;
+  }
+  return Type;
+}
+
+bool
+MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                             unsigned Type) const {
+  // FIXME: This is extremelly conservative. This really needs to use a
+  // whitelist with a clear explanation for why each realocation needs to
+  // point to the symbol, not to the section.
+  switch (Type) {
+  default:
+    return true;
+
+  case ELF::R_MIPS_GOT16:
+  case ELF::R_MIPS16_GOT16:
+  case ELF::R_MICROMIPS_GOT16:
+    llvm_unreachable("Should have been handled already");
+
+  // These relocations might be paired with another relocation. The pairing is
+  // done by the static linker by matching the symbol. Since we only see one
+  // relocation at a time, we have to force them to relocate with a symbol to
+  // avoid ending up with a pair where one points to a section and another
+  // points to a symbol.
+  case ELF::R_MIPS_HI16:
+  case ELF::R_MIPS16_HI16:
+  case ELF::R_MICROMIPS_HI16:
+  case ELF::R_MIPS_LO16:
+  case ELF::R_MIPS16_LO16:
+  case ELF::R_MICROMIPS_LO16:
+    return true;
+
+  case ELF::R_MIPS_26:
+  case ELF::R_MIPS_32:
+  case ELF::R_MIPS_64:
+  case ELF::R_MIPS_GPREL16:
+    return false;
+  }
+}
+
+MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS,
+                                                uint8_t OSABI,
+                                                bool IsLittleEndian,
+                                                bool Is64Bit) {
+  MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI,
+                                                (Is64Bit) ? true : false,
+                                                IsLittleEndian);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
new file mode 100644
index 0000000..803ab85
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -0,0 +1,43 @@
+//===-------- MipsELFStreamer.cpp - ELF Object Output ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+
+void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
+                                      const MCSubtargetInfo &STI) {
+  MCELFStreamer::EmitInstruction(Inst, STI);
+
+  MCContext &Context = getContext();
+  const MCRegisterInfo *MCRegInfo = Context.getRegisterInfo();
+
+  for (unsigned OpIndex = 0; OpIndex < Inst.getNumOperands(); ++OpIndex) {
+    const MCOperand &Op = Inst.getOperand(OpIndex);
+
+    if (!Op.isReg())
+      continue;
+
+    unsigned Reg = Op.getReg();
+    RegInfoRecord->SetPhysRegUsed(Reg, MCRegInfo);
+  }
+}
+
+void MipsELFStreamer::EmitMipsOptionRecords() {
+  for (const auto &I : MipsOptionRecords)
+    I->EmitMipsOptionRecord();
+}
+
+namespace llvm {
+MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_ostream &OS, MCCodeEmitter *Emitter,
+                                     const MCSubtargetInfo &STI, bool RelaxAll,
+                                     bool NoExecStack) {
+  return new MipsELFStreamer(Context, MAB, OS, Emitter, STI);
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
new file mode 100644
index 0000000..58863be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -0,0 +1,58 @@
+//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSELFSTREAMER_H
+#define MIPSELFSTREAMER_H
+
+#include "MipsOptionRecord.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include <memory>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class MipsELFStreamer : public MCELFStreamer {
+  SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords;
+  MipsRegInfoRecord *RegInfoRecord;
+
+public:
+  MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
+                  MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
+      : MCELFStreamer(Context, MAB, OS, Emitter) {
+
+    RegInfoRecord = new MipsRegInfoRecord(this, Context, STI);
+    MipsOptionRecords.push_back(
+        std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
+  }
+
+  /// Overriding this function allows us to add arbitrary behaviour before the
+  /// \p Inst is actually emitted. For example, we can inspect the operands and
+  /// gather sufficient information that allows us to reason about the register
+  /// usage for the translation unit.
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+  /// Emits all the option records stored up until the point it's called.
+  void EmitMipsOptionRecords();
+};
+
+MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_ostream &OS, MCCodeEmitter *Emitter,
+                                     const MCSubtargetInfo &STI, bool RelaxAll,
+                                     bool NoExecStack);
+} // namespace llvm.
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
new file mode 100644
index 0000000..05080f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -0,0 +1,202 @@
+//===-- MipsFixupKinds.h - Mips Specific Fixup Entries ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MIPS_MIPSFIXUPKINDS_H
+#define LLVM_MIPS_MIPSFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Mips {
+  // Although most of the current fixup types reflect a unique relocation
+  // one can have multiple fixup types for a given relocation and thus need
+  // to be uniquely named.
+  //
+  // This table *must* be in the save order of
+  // MCFixupKindInfo Infos[Mips::NumTargetFixupKinds]
+  // in MipsAsmBackend.cpp.
+  //
+  enum Fixups {
+    // Branch fixups resulting in R_MIPS_16.
+    fixup_Mips_16 = FirstTargetFixupKind,
+
+    // Pure 32 bit data fixup resulting in - R_MIPS_32.
+    fixup_Mips_32,
+
+    // Full 32 bit data relative data fixup resulting in - R_MIPS_REL32.
+    fixup_Mips_REL32,
+
+    // Jump 26 bit fixup resulting in - R_MIPS_26.
+    fixup_Mips_26,
+
+    // Pure upper 16 bit fixup resulting in - R_MIPS_HI16.
+    fixup_Mips_HI16,
+
+    // Pure lower 16 bit fixup resulting in - R_MIPS_LO16.
+    fixup_Mips_LO16,
+
+    // 16 bit fixup for GP offest resulting in - R_MIPS_GPREL16.
+    fixup_Mips_GPREL16,
+
+    // 16 bit literal fixup resulting in - R_MIPS_LITERAL.
+    fixup_Mips_LITERAL,
+
+    // Global symbol fixup resulting in - R_MIPS_GOT16.
+    fixup_Mips_GOT_Global,
+
+    // Local symbol fixup resulting in - R_MIPS_GOT16.
+    fixup_Mips_GOT_Local,
+
+    // PC relative branch fixup resulting in - R_MIPS_PC16.
+    fixup_Mips_PC16,
+
+    // resulting in - R_MIPS_CALL16.
+    fixup_Mips_CALL16,
+
+    // resulting in - R_MIPS_GPREL32.
+    fixup_Mips_GPREL32,
+
+    // resulting in - R_MIPS_SHIFT5.
+    fixup_Mips_SHIFT5,
+
+    // resulting in - R_MIPS_SHIFT6.
+    fixup_Mips_SHIFT6,
+
+    // Pure 64 bit data fixup resulting in - R_MIPS_64.
+    fixup_Mips_64,
+
+    // resulting in - R_MIPS_TLS_GD.
+    fixup_Mips_TLSGD,
+
+    // resulting in - R_MIPS_TLS_GOTTPREL.
+    fixup_Mips_GOTTPREL,
+
+    // resulting in - R_MIPS_TLS_TPREL_HI16.
+    fixup_Mips_TPREL_HI,
+
+    // resulting in - R_MIPS_TLS_TPREL_LO16.
+    fixup_Mips_TPREL_LO,
+
+    // resulting in - R_MIPS_TLS_LDM.
+    fixup_Mips_TLSLDM,
+
+    // resulting in - R_MIPS_TLS_DTPREL_HI16.
+    fixup_Mips_DTPREL_HI,
+
+    // resulting in - R_MIPS_TLS_DTPREL_LO16.
+    fixup_Mips_DTPREL_LO,
+
+    // PC relative branch fixup resulting in - R_MIPS_PC16
+    fixup_Mips_Branch_PCRel,
+
+    // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+    fixup_Mips_GPOFF_HI,
+
+    // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+    fixup_Mips_GPOFF_LO,
+
+    // resulting in - R_MIPS_PAGE
+    fixup_Mips_GOT_PAGE,
+
+    // resulting in - R_MIPS_GOT_OFST
+    fixup_Mips_GOT_OFST,
+
+    // resulting in - R_MIPS_GOT_DISP
+    fixup_Mips_GOT_DISP,
+
+    // resulting in - R_MIPS_GOT_HIGHER
+    fixup_Mips_HIGHER,
+
+    // resulting in - R_MIPS_HIGHEST
+    fixup_Mips_HIGHEST,
+
+    // resulting in - R_MIPS_GOT_HI16
+    fixup_Mips_GOT_HI16,
+
+    // resulting in - R_MIPS_GOT_LO16
+    fixup_Mips_GOT_LO16,
+
+    // resulting in - R_MIPS_CALL_HI16
+    fixup_Mips_CALL_HI16,
+
+    // resulting in - R_MIPS_CALL_LO16
+    fixup_Mips_CALL_LO16,
+
+    // resulting in - R_MIPS_PC18_S3
+    fixup_MIPS_PC18_S3,
+
+    // resulting in - R_MIPS_PC19_S2
+    fixup_MIPS_PC19_S2,
+
+    // resulting in - R_MIPS_PC21_S2
+    fixup_MIPS_PC21_S2,
+
+    // resulting in - R_MIPS_PC26_S2
+    fixup_MIPS_PC26_S2,
+
+    // resulting in - R_MIPS_PCHI16
+    fixup_MIPS_PCHI16,
+
+    // resulting in - R_MIPS_PCLO16
+    fixup_MIPS_PCLO16,
+
+    // resulting in - R_MICROMIPS_26_S1
+    fixup_MICROMIPS_26_S1,
+
+    // resulting in - R_MICROMIPS_HI16
+    fixup_MICROMIPS_HI16,
+
+    // resulting in - R_MICROMIPS_LO16
+    fixup_MICROMIPS_LO16,
+
+    // resulting in - R_MICROMIPS_GOT16
+    fixup_MICROMIPS_GOT16,
+
+    // resulting in - R_MICROMIPS_PC16_S1
+    fixup_MICROMIPS_PC16_S1,
+
+    // resulting in - R_MICROMIPS_CALL16
+    fixup_MICROMIPS_CALL16,
+
+    // resulting in - R_MICROMIPS_GOT_DISP
+    fixup_MICROMIPS_GOT_DISP,
+
+    // resulting in - R_MICROMIPS_GOT_PAGE
+    fixup_MICROMIPS_GOT_PAGE,
+
+    // resulting in - R_MICROMIPS_GOT_OFST
+    fixup_MICROMIPS_GOT_OFST,
+
+    // resulting in - R_MICROMIPS_TLS_GD
+    fixup_MICROMIPS_TLS_GD,
+
+    // resulting in - R_MICROMIPS_TLS_LDM
+    fixup_MICROMIPS_TLS_LDM,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_HI16
+    fixup_MICROMIPS_TLS_DTPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_LO16
+    fixup_MICROMIPS_TLS_DTPREL_LO16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_HI16
+    fixup_MICROMIPS_TLS_TPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_LO16
+    fixup_MICROMIPS_TLS_TPREL_LO16,
+
+    // Marker
+    LastTargetFixupKind,
+    NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+  };
+} // namespace Mips
+} // namespace llvm
+
+
+#endif // LLVM_MIPS_MIPSFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
new file mode 100644
index 0000000..e415412
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -0,0 +1,46 @@
+//===-- MipsMCAsmInfo.cpp - Mips Asm Properties ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void MipsMCAsmInfo::anchor() { }
+
+MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  if ((TheTriple.getArch() == Triple::mips) ||
+      (TheTriple.getArch() == Triple::mips64))
+    IsLittleEndian = false;
+
+  if ((TheTriple.getArch() == Triple::mips64el) ||
+      (TheTriple.getArch() == Triple::mips64)) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  AlignmentIsInBytes          = false;
+  Data16bitsDirective         = "\t.2byte\t";
+  Data32bitsDirective         = "\t.4byte\t";
+  Data64bitsDirective         = "\t.8byte\t";
+  PrivateGlobalPrefix         = "$";
+  CommentString               = "#";
+  ZeroDirective               = "\t.space\t";
+  GPRel32Directive            = "\t.gpword\t";
+  GPRel64Directive            = "\t.gpdword\t";
+  UseAssignmentForEHBegin = true;
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  HasLEB128 = true;
+  DwarfRegNumForCFI = true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
new file mode 100644
index 0000000..37ba0c4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- MipsMCAsmInfo.h - Mips Asm Info ------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETASMINFO_H
+#define MIPSTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+  class StringRef;
+
+  class MipsMCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit MipsMCAsmInfo(StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
new file mode 100644
index 0000000..43fc521
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -0,0 +1,662 @@
+//===-- MipsMCCodeEmitter.cpp - Convert Mips Code to Machine Code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MipsMCCodeEmitter.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+#define GET_INSTRMAP_INFO
+#include "MipsGenInstrInfo.inc"
+#undef GET_INSTRMAP_INFO
+
+namespace llvm {
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx) {
+  return new MipsMCCodeEmitter(MCII, Ctx, false);
+}
+
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx) {
+  return new MipsMCCodeEmitter(MCII, Ctx, true);
+}
+} // End of namespace llvm.
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+static void LowerLargeShift(MCInst& Inst) {
+
+  assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+  assert(Inst.getOperand(2).isImm());
+
+  int64_t Shift = Inst.getOperand(2).getImm();
+  if (Shift <= 31)
+    return; // Do nothing
+  Shift -= 32;
+
+  // saminus32
+  Inst.getOperand(2).setImm(Shift);
+
+  switch (Inst.getOpcode()) {
+  default:
+    // Calling function is not synchronized
+    llvm_unreachable("Unexpected shift instruction");
+  case Mips::DSLL:
+    Inst.setOpcode(Mips::DSLL32);
+    return;
+  case Mips::DSRL:
+    Inst.setOpcode(Mips::DSRL32);
+    return;
+  case Mips::DSRA:
+    Inst.setOpcode(Mips::DSRA32);
+    return;
+  case Mips::DROTR:
+    Inst.setOpcode(Mips::DROTR32);
+    return;
+  }
+}
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+static void LowerDextDins(MCInst& InstIn) {
+  int Opcode = InstIn.getOpcode();
+
+  if (Opcode == Mips::DEXT)
+    assert(InstIn.getNumOperands() == 4 &&
+           "Invalid no. of machine operands for DEXT!");
+  else // Only DEXT and DINS are possible
+    assert(InstIn.getNumOperands() == 5 &&
+           "Invalid no. of machine operands for DINS!");
+
+  assert(InstIn.getOperand(2).isImm());
+  int64_t pos = InstIn.getOperand(2).getImm();
+  assert(InstIn.getOperand(3).isImm());
+  int64_t size = InstIn.getOperand(3).getImm();
+
+  if (size <= 32) {
+    if (pos < 32)  // DEXT/DINS, do nothing
+      return;
+    // DEXTU/DINSU
+    InstIn.getOperand(2).setImm(pos - 32);
+    InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+    return;
+  }
+  // DEXTM/DINSM
+  assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+  InstIn.getOperand(3).setImm(size - 32);
+  InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+  return;
+}
+
+bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
+  return STI.getFeatureBits() & Mips::FeatureMicroMips;
+}
+
+void MipsMCCodeEmitter::EmitByte(unsigned char C, raw_ostream &OS) const {
+  OS << (char)C;
+}
+
+void MipsMCCodeEmitter::EmitInstruction(uint64_t Val, unsigned Size,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &OS) const {
+  // Output the instruction encoding in little endian byte order.
+  // Little-endian byte ordering:
+  //   mips32r2:   4 | 3 | 2 | 1
+  //   microMIPS:  2 | 1 | 4 | 3
+  if (IsLittleEndian && Size == 4 && isMicroMips(STI)) {
+    EmitInstruction(Val >> 16, 2, STI, OS);
+    EmitInstruction(Val, 2, STI, OS);
+  } else {
+    for (unsigned i = 0; i < Size; ++i) {
+      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+      EmitByte((Val >> Shift) & 0xff, OS);
+    }
+  }
+}
+
+/// EncodeInstruction - Emit the instruction.
+/// Size the instruction with Desc.getSize().
+void MipsMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const
+{
+
+  // Non-pseudo instructions that get changed for direct object
+  // only based on operand values.
+  // If this list of instructions get much longer we will move
+  // the check to a function call. Until then, this is more efficient.
+  MCInst TmpInst = MI;
+  switch (MI.getOpcode()) {
+  // If shift amount is >= 32 it the inst needs to be lowered further
+  case Mips::DSLL:
+  case Mips::DSRL:
+  case Mips::DSRA:
+  case Mips::DROTR:
+    LowerLargeShift(TmpInst);
+    break;
+    // Double extract instruction is chosen by pos and size operands
+  case Mips::DEXT:
+  case Mips::DINS:
+    LowerDextDins(TmpInst);
+  }
+
+  unsigned long N = Fixups.size();
+  uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+
+  // Check for unimplemented opcodes.
+  // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
+  // so we have to special check for them.
+  unsigned Opcode = TmpInst.getOpcode();
+  if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && !Binary)
+    llvm_unreachable("unimplemented opcode in EncodeInstruction()");
+
+  if (STI.getFeatureBits() & Mips::FeatureMicroMips) {
+    int NewOpcode = Mips::Std2MicroMips (Opcode, Mips::Arch_micromips);
+    if (NewOpcode != -1) {
+      if (Fixups.size() > N)
+        Fixups.pop_back();
+      Opcode = NewOpcode;
+      TmpInst.setOpcode (NewOpcode);
+      Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+    }
+  }
+
+  const MCInstrDesc &Desc = MCII.get(TmpInst.getOpcode());
+
+  // Get byte count of instruction
+  unsigned Size = Desc.getSize();
+  if (!Size)
+    llvm_unreachable("Desc.getSize() returns 0");
+
+  EmitInstruction(Binary, Size, STI, OS);
+}
+
+/// getBranchTargetOpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValue expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_Mips_PC16)));
+  return 0;
+}
+
+/// getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMM expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                   MCFixupKind(Mips::
+                               fixup_MICROMIPS_PC16_S1)));
+  return 0;
+}
+
+/// getBranchTarget21OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget21OpValue expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC21_S2)));
+  return 0;
+}
+
+/// getBranchTarget26OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget26OpValue expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC26_S2)));
+  return 0;
+}
+
+/// getJumpOffset16OpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isImm()) return MO.getImm();
+
+  assert(MO.isExpr() &&
+         "getJumpOffset16OpValue expects only expressions or an immediate");
+
+   // TODO: Push fixup.
+   return 0;
+}
+
+/// getJumpTargetOpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm()>>2;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValue expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_Mips_26)));
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValueMM expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_26_S1)));
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups,
+               const MCSubtargetInfo &STI) const {
+  int64_t Res;
+
+  if (Expr->EvaluateAsAbsolute(Res))
+    return Res;
+
+  MCExpr::ExprKind Kind = Expr->getKind();
+  if (Kind == MCExpr::Constant) {
+    return cast<MCConstantExpr>(Expr)->getValue();
+  }
+
+  if (Kind == MCExpr::Binary) {
+    unsigned Res = getExprOpValue(cast<MCBinaryExpr>(Expr)->getLHS(), Fixups, STI);
+    Res += getExprOpValue(cast<MCBinaryExpr>(Expr)->getRHS(), Fixups, STI);
+    return Res;
+  }
+
+  if (Kind == MCExpr::Target) {
+    const MipsMCExpr *MipsExpr = cast<MipsMCExpr>(Expr);
+
+    Mips::Fixups FixupKind = Mips::Fixups(0);
+    switch (MipsExpr->getKind()) {
+    default: llvm_unreachable("Unsupported fixup kind for target expression!");
+    case MipsMCExpr::VK_Mips_HIGHEST:
+      FixupKind = Mips::fixup_Mips_HIGHEST;
+      break;
+    case MipsMCExpr::VK_Mips_HIGHER:
+      FixupKind = Mips::fixup_Mips_HIGHER;
+      break;
+    case MipsMCExpr::VK_Mips_HI:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+                                   : Mips::fixup_Mips_HI16;
+      break;
+    case MipsMCExpr::VK_Mips_LO:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+                                   : Mips::fixup_Mips_LO16;
+      break;
+    }
+    Fixups.push_back(MCFixup::Create(0, MipsExpr, MCFixupKind(FixupKind)));
+    return 0;
+  }
+
+  if (Kind == MCExpr::SymbolRef) {
+    Mips::Fixups FixupKind = Mips::Fixups(0);
+
+    switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+    default: llvm_unreachable("Unknown fixup kind!");
+      break;
+    case MCSymbolRefExpr::VK_Mips_GPOFF_HI :
+      FixupKind = Mips::fixup_Mips_GPOFF_HI;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GPOFF_LO :
+      FixupKind = Mips::fixup_Mips_GPOFF_LO;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
+                              : Mips::fixup_Mips_GOT_PAGE;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT_OFST :
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
+                              : Mips::fixup_Mips_GOT_OFST;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT_DISP :
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
+                              : Mips::fixup_Mips_GOT_DISP;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GPREL:
+      FixupKind = Mips::fixup_Mips_GPREL16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT_CALL:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
+                              : Mips::fixup_Mips_CALL16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT16:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+                              : Mips::fixup_Mips_GOT_Global;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+                              : Mips::fixup_Mips_GOT_Local;
+      break;
+    case MCSymbolRefExpr::VK_Mips_ABS_HI:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+                              : Mips::fixup_Mips_HI16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_ABS_LO:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+                              : Mips::fixup_Mips_LO16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_TLSGD:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
+                              : Mips::fixup_Mips_TLSGD;
+      break;
+    case MCSymbolRefExpr::VK_Mips_TLSLDM:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
+                              : Mips::fixup_Mips_TLSLDM;
+      break;
+    case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+                              : Mips::fixup_Mips_DTPREL_HI;
+      break;
+    case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+                              : Mips::fixup_Mips_DTPREL_LO;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOTTPREL:
+      FixupKind = Mips::fixup_Mips_GOTTPREL;
+      break;
+    case MCSymbolRefExpr::VK_Mips_TPREL_HI:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+                              : Mips::fixup_Mips_TPREL_HI;
+      break;
+    case MCSymbolRefExpr::VK_Mips_TPREL_LO:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+                              : Mips::fixup_Mips_TPREL_LO;
+      break;
+    case MCSymbolRefExpr::VK_Mips_HIGHER:
+      FixupKind = Mips::fixup_Mips_HIGHER;
+      break;
+    case MCSymbolRefExpr::VK_Mips_HIGHEST:
+      FixupKind = Mips::fixup_Mips_HIGHEST;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT_HI16:
+      FixupKind = Mips::fixup_Mips_GOT_HI16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_GOT_LO16:
+      FixupKind = Mips::fixup_Mips_GOT_LO16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_CALL_HI16:
+      FixupKind = Mips::fixup_Mips_CALL_HI16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_CALL_LO16:
+      FixupKind = Mips::fixup_Mips_CALL_LO16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_PCREL_HI16:
+      FixupKind = Mips::fixup_MIPS_PCHI16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_PCREL_LO16:
+      FixupKind = Mips::fixup_MIPS_PCLO16;
+      break;
+    } // switch
+
+    Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind)));
+    return 0;
+  }
+  return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
+    return RegNo;
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+        .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+  // MO must be an Expr.
+  assert(MO.isExpr());
+  return getExprOpValue(MO.getExpr(),Fixups, STI);
+}
+
+/// getMSAMemEncoding - Return binary encoding of memory operand for LD/ST
+/// instructions.
+unsigned
+MipsMCCodeEmitter::getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  // The immediate field of an LD/ST instruction is scaled which means it must
+  // be divided (when encoding) by the size (in bytes) of the instructions'
+  // data format.
+  // .b - 1 byte
+  // .h - 2 bytes
+  // .w - 4 bytes
+  // .d - 8 bytes
+  switch(MI.getOpcode())
+  {
+  default:
+    assert (0 && "Unexpected instruction");
+    break;
+  case Mips::LD_B:
+  case Mips::ST_B:
+    // We don't need to scale the offset in this case
+    break;
+  case Mips::LD_H:
+  case Mips::ST_H:
+    OffBits >>= 1;
+    break;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    OffBits >>= 2;
+    break;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    OffBits >>= 3;
+    break;
+  }
+
+  return (OffBits & 0xFFFF) | RegBits;
+}
+
+/// getMemEncoding - Return binary encoding of memory related operand.
+/// If the offset operand requires relocation, record the relocation.
+unsigned
+MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return (OffBits & 0xFFFF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return (OffBits & 0x0FFF) | RegBits;
+}
+
+unsigned
+MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+  return SizeEncoding - 1;
+}
+
+// FIXME: should be called getMSBEncoding
+//
+unsigned
+MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo-1).isImm());
+  assert(MI.getOperand(OpNo).isImm());
+  unsigned Position = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups, STI);
+  unsigned Size = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+
+  return Position + Size - 1;
+}
+
+unsigned
+MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  // The immediate is encoded as 'immediate - 1'.
+  return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 2'.
+    unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+    assert((Res & 3) == 0);
+    return Res >> 2;
+  }
+
+  assert(MO.isExpr() &&
+         "getSimm19Lsl2Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC19_S2)));
+  return 0;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 3'.
+    unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+    assert((Res & 7) == 0);
+    return Res >> 3;
+  }
+
+  assert(MO.isExpr() &&
+         "getSimm18Lsl2Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC18_S3)));
+  return 0;
+}
+
+#include "MipsGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
new file mode 100644
index 0000000..304167f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -0,0 +1,154 @@
+//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef MIPS_MC_CODE_EMITTER_H
+#define MIPS_MC_CODE_EMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/DataTypes.h"
+
+using namespace llvm;
+
+namespace llvm {
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCInstrInfo;
+class MCFixup;
+class MCOperand;
+class MCSubtargetInfo;
+class raw_ostream;
+
+class MipsMCCodeEmitter : public MCCodeEmitter {
+  MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+  bool IsLittleEndian;
+
+  bool isMicroMips(const MCSubtargetInfo &STI) const;
+
+public:
+  MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
+      : MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
+
+  ~MipsMCCodeEmitter() {}
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const;
+
+  void EmitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+                       raw_ostream &OS) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // getBranchJumpOpValue - Return binary encoding of the jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  // getBranchJumpOpValueMM - Return binary encoding of the microMIPS jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValue - Return binary encoding of the branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget21OpValue - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget26OpValue - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  // getJumpOffset16OpValue - Return binary encoding of the jump
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  // getMachineOpValue - Return binary encoding of operand. If the machin
+  // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  // getLSAImmEncoding - Return binary encoding of LSA immediate.
+  unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const;
+
+}; // class MipsMCCodeEmitter
+} // namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
new file mode 100644
index 0000000..5bba3e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -0,0 +1,89 @@
+//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mipsmcexpr"
+
+bool MipsMCExpr::isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
+                                       const MCBinaryExpr *BE) {
+  switch (VK) {
+  case MCSymbolRefExpr::VK_Mips_ABS_LO:
+  case MCSymbolRefExpr::VK_Mips_ABS_HI:
+  case MCSymbolRefExpr::VK_Mips_HIGHER:
+  case MCSymbolRefExpr::VK_Mips_HIGHEST:
+    break;
+  default:
+    return false;
+  }
+
+  // We support expressions of the form "(sym1 binop1 sym2) binop2 const",
+  // where "binop2 const" is optional.
+  if (isa<MCBinaryExpr>(BE->getLHS())) {
+    if (!isa<MCConstantExpr>(BE->getRHS()))
+      return false;
+    BE = cast<MCBinaryExpr>(BE->getLHS());
+  }
+  return (isa<MCSymbolRefExpr>(BE->getLHS())
+          && isa<MCSymbolRefExpr>(BE->getRHS()));
+}
+
+const MipsMCExpr*
+MipsMCExpr::Create(MCSymbolRefExpr::VariantKind VK, const MCExpr *Expr,
+                   MCContext &Ctx) {
+  VariantKind Kind;
+  switch (VK) {
+  case MCSymbolRefExpr::VK_Mips_ABS_LO:
+    Kind = VK_Mips_LO;
+    break;
+  case MCSymbolRefExpr::VK_Mips_ABS_HI:
+    Kind = VK_Mips_HI;
+    break;
+  case MCSymbolRefExpr::VK_Mips_HIGHER:
+    Kind = VK_Mips_HIGHER;
+    break;
+  case MCSymbolRefExpr::VK_Mips_HIGHEST:
+    Kind = VK_Mips_HIGHEST;
+    break;
+  default:
+    llvm_unreachable("Invalid kind!");
+  }
+
+  return new (Ctx) MipsMCExpr(Kind, Expr);
+}
+
+void MipsMCExpr::PrintImpl(raw_ostream &OS) const {
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_Mips_LO: OS << "%lo"; break;
+  case VK_Mips_HI: OS << "%hi"; break;
+  case VK_Mips_HIGHER: OS << "%higher"; break;
+  case VK_Mips_HIGHEST: OS << "%highest"; break;
+  }
+
+  OS << '(';
+  Expr->print(OS);
+  OS << ')';
+}
+
+bool
+MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                      const MCAsmLayout *Layout) const {
+  return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+}
+
+void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
new file mode 100644
index 0000000..f193dc9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -0,0 +1,66 @@
+//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCEXPR_H
+#define MIPSMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class MipsMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_Mips_None,
+    VK_Mips_LO,
+    VK_Mips_HI,
+    VK_Mips_HIGHER,
+    VK_Mips_HIGHEST
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit MipsMCExpr(VariantKind Kind, const MCExpr *Expr)
+    : Kind(Kind), Expr(Expr) {}
+
+public:
+  static bool isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
+                                    const MCBinaryExpr *BE);
+
+  static const MipsMCExpr *Create(MCSymbolRefExpr::VariantKind VK,
+                                  const MCExpr *Expr, MCContext &Ctx);
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  void PrintImpl(raw_ostream &OS) const override;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  const MCSection *FindAssociatedSection() const override {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  // There are no TLS MipsMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
new file mode 100644
index 0000000..01d5363
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -0,0 +1,33 @@
+//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCNACL_H
+#define MIPSMCNACL_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+// Log2 of the NaCl MIPS sandbox's instruction bundle size.
+static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u;
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+                                  bool *IsStore = nullptr);
+bool baseRegNeedsLoadStoreMask(unsigned Reg);
+
+// This function creates an MCELFStreamer for Mips NaCl.
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                         raw_ostream &OS,
+                                         MCCodeEmitter *Emitter,
+                                         const MCSubtargetInfo &STI,
+                                         bool RelaxAll, bool NoExecStack);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
new file mode 100644
index 0000000..d2b929b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -0,0 +1,231 @@
+//===-- MipsMCTargetDesc.cpp - Mips Target Descriptions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCAsmInfo.h"
+#include "MipsMCNaCl.h"
+#include "MipsMCTargetDesc.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MipsGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MipsGenRegisterInfo.inc"
+
+/// Select the Mips CPU for the given triple and cpu name.
+/// FIXME: Merge with the copy in MipsSubtarget.cpp
+static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
+  if (CPU.empty() || CPU == "generic") {
+    Triple TheTriple(TT);
+    if (TheTriple.getArch() == Triple::mips ||
+        TheTriple.getArch() == Triple::mipsel)
+      CPU = "mips32";
+    else
+      CPU = "mips64";
+  }
+  return CPU;
+}
+
+static MCInstrInfo *createMipsMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMipsMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMipsMCRegisterInfo(X, Mips::RA);
+  return X;
+}
+
+static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                  StringRef FS) {
+  CPU = selectMipsCPU(TT, CPU);
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitMipsMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+  MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
+
+  unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                              CodeModel::Model CM,
+                                              CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (CM == CodeModel::JITDefault)
+    RM = Reloc::Static;
+  else if (RM == Reloc::Default)
+    RM = Reloc::PIC_;
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
+                                              unsigned SyntaxVariant,
+                                              const MCAsmInfo &MAI,
+                                              const MCInstrInfo &MII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI) {
+  return new MipsInstPrinter(MAI, MII, MRI);
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Context, MCAsmBackend &MAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI,
+                                    bool RelaxAll, bool NoExecStack) {
+  MCStreamer *S;
+  if (!Triple(TT).isOSNaCl())
+    S = createMipsELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll,
+                              NoExecStack);
+  else
+    S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll,
+                                  NoExecStack);
+  new MipsTargetELFStreamer(*S, STI);
+  return S;
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useDwarfDirectory,
+                    MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                    MCAsmBackend *TAB, bool ShowInst) {
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+  new MipsTargetAsmStreamer(*S, OS);
+  return S;
+}
+
+static MCStreamer *createMipsNullStreamer(MCContext &Ctx) {
+  MCStreamer *S = llvm::createNullStreamer(Ctx);
+  new MipsTargetStreamer(*S);
+  return S;
+}
+
+extern "C" void LLVMInitializeMipsTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheMipselTarget, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn A(TheMips64Target, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn B(TheMips64elTarget, createMipsMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMipsTarget,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMipselTarget,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMips64Target,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMips64elTarget,
+                                        createMipsMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMipselTarget, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMips64Target, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMips64elTarget,
+                                      createMipsMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMipsTarget, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMipselTarget, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMips64Target, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMips64elTarget,
+                                    createMipsMCRegisterInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheMipsTarget,
+                                        createMipsMCCodeEmitterEB);
+  TargetRegistry::RegisterMCCodeEmitter(TheMipselTarget,
+                                        createMipsMCCodeEmitterEL);
+  TargetRegistry::RegisterMCCodeEmitter(TheMips64Target,
+                                        createMipsMCCodeEmitterEB);
+  TargetRegistry::RegisterMCCodeEmitter(TheMips64elTarget,
+                                        createMipsMCCodeEmitterEL);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheMipsTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMipselTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMips64Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget,
+                                           createMCStreamer);
+
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheMipsTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMipselTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer);
+
+  TargetRegistry::RegisterNullStreamer(TheMipsTarget, createMipsNullStreamer);
+  TargetRegistry::RegisterNullStreamer(TheMipselTarget, createMipsNullStreamer);
+  TargetRegistry::RegisterNullStreamer(TheMips64Target, createMipsNullStreamer);
+  TargetRegistry::RegisterNullStreamer(TheMips64elTarget,
+                                       createMipsNullStreamer);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
+                                       createMipsAsmBackendEB32);
+  TargetRegistry::RegisterMCAsmBackend(TheMipselTarget,
+                                       createMipsAsmBackendEL32);
+  TargetRegistry::RegisterMCAsmBackend(TheMips64Target,
+                                       createMipsAsmBackendEB64);
+  TargetRegistry::RegisterMCAsmBackend(TheMips64elTarget,
+                                       createMipsAsmBackendEL64);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMipsTarget,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMipselTarget,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMips64Target,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMips64elTarget,
+                                          createMipsMCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMipsTarget,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMipselTarget,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMips64Target,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMips64elTarget,
+                                        createMipsMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
new file mode 100644
index 0000000..161d1ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -0,0 +1,76 @@
+//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCTARGETDESC_H
+#define MIPSMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheMipsTarget;
+extern Target TheMipselTarget;
+extern Target TheMips64Target;
+extern Target TheMips64elTarget;
+
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx);
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx);
+
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T,
+                                       const MCRegisterInfo &MRI, StringRef TT,
+                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T,
+                                       const MCRegisterInfo &MRI, StringRef TT,
+                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T,
+                                       const MCRegisterInfo &MRI, StringRef TT,
+                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T,
+                                       const MCRegisterInfo &MRI, StringRef TT,
+                                       StringRef CPU);
+
+MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
+                                          uint8_t OSABI,
+                                          bool IsLittleEndian,
+                                          bool Is64Bit);
+} // End llvm namespace
+
+// Defines symbolic names for Mips registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "MipsGenRegisterInfo.inc"
+
+// Defines symbolic names for the Mips instructions.
+#define GET_INSTRINFO_ENUM
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MipsGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
new file mode 100644
index 0000000..6cde8f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -0,0 +1,272 @@
+//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements MCELFStreamer for Mips NaCl.  It emits .o object files
+// as required by NaCl's SFI sandbox.  It inserts address-masking instructions
+// before dangerous control-flow and memory access instructions.  It inserts
+// address-masking instructions after instructions that change the stack
+// pointer.  It ensures that the mask and the dangerous instruction are always
+// emitted in the same bundle.  It aligns call + branch delay to the bundle end,
+// so that return address is always aligned to the start of next bundle.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCNaCl.h"
+#include "llvm/MC/MCELFStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-mc-nacl"
+
+namespace {
+
+const unsigned IndirectBranchMaskReg = Mips::T6;
+const unsigned LoadStoreStackMaskReg = Mips::T7;
+
+/// Extend the generic MCELFStreamer class so that it can mask dangerous
+/// instructions.
+
+class MipsNaClELFStreamer : public MipsELFStreamer {
+public:
+  MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                      MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
+    : MipsELFStreamer(Context, TAB, OS, Emitter, STI), PendingCall(false) {}
+
+  ~MipsNaClELFStreamer() {}
+
+private:
+  // Whether we started the sandboxing sequence for calls.  Calls are bundled
+  // with branch delays and aligned to the bundle end.
+  bool PendingCall;
+
+  bool isIndirectJump(const MCInst &MI) {
+    if (MI.getOpcode() == Mips::JALR) {
+      // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead.
+      // JALR is an indirect branch if the link register is $0.
+      assert(MI.getOperand(0).isReg());
+      return MI.getOperand(0).getReg() == Mips::ZERO;
+    }
+    return MI.getOpcode() == Mips::JR;
+  }
+
+  bool isStackPointerFirstOperand(const MCInst &MI) {
+    return (MI.getNumOperands() > 0 && MI.getOperand(0).isReg()
+            && MI.getOperand(0).getReg() == Mips::SP);
+  }
+
+  bool isCall(const MCInst &MI, bool *IsIndirectCall) {
+    unsigned Opcode = MI.getOpcode();
+
+    *IsIndirectCall = false;
+
+    switch (Opcode) {
+    default:
+      return false;
+
+    case Mips::JAL:
+    case Mips::BAL:
+    case Mips::BAL_BR:
+    case Mips::BLTZAL:
+    case Mips::BGEZAL:
+      return true;
+
+    case Mips::JALR:
+      // JALR is only a call if the link register is not $0. Otherwise it's an
+      // indirect branch.
+      assert(MI.getOperand(0).isReg());
+      if (MI.getOperand(0).getReg() == Mips::ZERO)
+        return false;
+
+      *IsIndirectCall = true;
+      return true;
+    }
+  }
+
+  void emitMask(unsigned AddrReg, unsigned MaskReg,
+                const MCSubtargetInfo &STI) {
+    MCInst MaskInst;
+    MaskInst.setOpcode(Mips::AND);
+    MaskInst.addOperand(MCOperand::CreateReg(AddrReg));
+    MaskInst.addOperand(MCOperand::CreateReg(AddrReg));
+    MaskInst.addOperand(MCOperand::CreateReg(MaskReg));
+    MipsELFStreamer::EmitInstruction(MaskInst, STI);
+  }
+
+  // Sandbox indirect branch or return instruction by inserting mask operation
+  // before it.
+  void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
+    unsigned AddrReg = MI.getOperand(0).getReg();
+
+    EmitBundleLock(false);
+    emitMask(AddrReg, IndirectBranchMaskReg, STI);
+    MipsELFStreamer::EmitInstruction(MI, STI);
+    EmitBundleUnlock();
+  }
+
+  // Sandbox memory access or SP change.  Insert mask operation before and/or
+  // after the instruction.
+  void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx,
+                                   const MCSubtargetInfo &STI, bool MaskBefore,
+                                   bool MaskAfter) {
+    EmitBundleLock(false);
+    if (MaskBefore) {
+      // Sandbox memory access.
+      unsigned BaseReg = MI.getOperand(AddrIdx).getReg();
+      emitMask(BaseReg, LoadStoreStackMaskReg, STI);
+    }
+    MipsELFStreamer::EmitInstruction(MI, STI);
+    if (MaskAfter) {
+      // Sandbox SP change.
+      unsigned SPReg = MI.getOperand(0).getReg();
+      assert((Mips::SP == SPReg) && "Unexpected stack-pointer register.");
+      emitMask(SPReg, LoadStoreStackMaskReg, STI);
+    }
+    EmitBundleUnlock();
+  }
+
+public:
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer.  We override it to mask dangerous instructions.
+  void EmitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
+    // Sandbox indirect jumps.
+    if (isIndirectJump(Inst)) {
+      if (PendingCall)
+        report_fatal_error("Dangerous instruction in branch delay slot!");
+      sandboxIndirectJump(Inst, STI);
+      return;
+    }
+
+    // Sandbox loads, stores and SP changes.
+    unsigned AddrIdx;
+    bool IsStore;
+    bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx,
+                                                    &IsStore);
+    bool IsSPFirstOperand = isStackPointerFirstOperand(Inst);
+    if (IsMemAccess || IsSPFirstOperand) {
+      bool MaskBefore = (IsMemAccess
+                         && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx)
+                                                          .getReg()));
+      bool MaskAfter = IsSPFirstOperand && !IsStore;
+      if (MaskBefore || MaskAfter) {
+        if (PendingCall)
+          report_fatal_error("Dangerous instruction in branch delay slot!");
+        sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter);
+        return;
+      }
+      // fallthrough
+    }
+
+    // Sandbox calls by aligning call and branch delay to the bundle end.
+    // For indirect calls, emit the mask before the call.
+    bool IsIndirectCall;
+    if (isCall(Inst, &IsIndirectCall)) {
+      if (PendingCall)
+        report_fatal_error("Dangerous instruction in branch delay slot!");
+
+      // Start the sandboxing sequence by emitting call.
+      EmitBundleLock(true);
+      if (IsIndirectCall) {
+        unsigned TargetReg = Inst.getOperand(1).getReg();
+        emitMask(TargetReg, IndirectBranchMaskReg, STI);
+      }
+      MipsELFStreamer::EmitInstruction(Inst, STI);
+      PendingCall = true;
+      return;
+    }
+    if (PendingCall) {
+      // Finish the sandboxing sequence by emitting branch delay.
+      MipsELFStreamer::EmitInstruction(Inst, STI);
+      EmitBundleUnlock();
+      PendingCall = false;
+      return;
+    }
+
+    // None of the sandboxing applies, just emit the instruction.
+    MipsELFStreamer::EmitInstruction(Inst, STI);
+  }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+                                  bool *IsStore) {
+  if (IsStore)
+    *IsStore = false;
+
+  switch (Opcode) {
+  default:
+    return false;
+
+  // Load instructions with base address register in position 1.
+  case Mips::LB:
+  case Mips::LBu:
+  case Mips::LH:
+  case Mips::LHu:
+  case Mips::LW:
+  case Mips::LWC1:
+  case Mips::LDC1:
+  case Mips::LL:
+  case Mips::LL_R6:
+  case Mips::LWL:
+  case Mips::LWR:
+    *AddrIdx = 1;
+    return true;
+
+  // Store instructions with base address register in position 1.
+  case Mips::SB:
+  case Mips::SH:
+  case Mips::SW:
+  case Mips::SWC1:
+  case Mips::SDC1:
+  case Mips::SWL:
+  case Mips::SWR:
+    *AddrIdx = 1;
+    if (IsStore)
+      *IsStore = true;
+    return true;
+
+  // Store instructions with base address register in position 2.
+  case Mips::SC:
+  case Mips::SC_R6:
+    *AddrIdx = 2;
+    if (IsStore)
+      *IsStore = true;
+    return true;
+  }
+}
+
+bool baseRegNeedsLoadStoreMask(unsigned Reg) {
+  // The contents of SP and thread pointer register do not require masking.
+  return Reg != Mips::SP && Reg != Mips::T8;
+}
+
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                         raw_ostream &OS,
+                                         MCCodeEmitter *Emitter,
+                                         const MCSubtargetInfo &STI,
+                                         bool RelaxAll, bool NoExecStack) {
+  MipsNaClELFStreamer *S = new MipsNaClELFStreamer(Context, TAB, OS, Emitter,
+                                                   STI);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  if (NoExecStack)
+    S->getAssembler().setNoExecStack(true);
+
+  // Set bundle-alignment as required by the NaCl ABI for the target.
+  S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
+
+  return S;
+}
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
new file mode 100644
index 0000000..0ef2208
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -0,0 +1,92 @@
+//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsOptionRecord.h"
+#include "MipsELFStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+void MipsRegInfoRecord::EmitMipsOptionRecord() {
+  MCAssembler &MCA = Streamer->getAssembler();
+  Triple T(STI.getTargetTriple());
+  uint64_t Features = STI.getFeatureBits();
+
+  Streamer->PushSection();
+
+  // We need to distinguish between N64 and the rest because at the moment
+  // we don't emit .Mips.options for other ELFs other than N64.
+  // Since .reginfo has the same information as .Mips.options (ODK_REGINFO),
+  // we can use the same abstraction (MipsRegInfoRecord class) to handle both.
+  if (Features & Mips::FeatureN64) {
+    // The EntrySize value of 1 seems strange since the records are neither
+    // 1-byte long nor fixed length but it matches the value GAS emits.
+    const MCSectionELF *Sec =
+        Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
+                              ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP,
+                              SectionKind::getMetadata(), 1, "");
+    MCA.getOrCreateSectionData(*Sec).setAlignment(8);
+    Streamer->SwitchSection(Sec);
+
+    Streamer->EmitIntValue(1, 1);  // kind
+    Streamer->EmitIntValue(40, 1); // size
+    Streamer->EmitIntValue(0, 2);  // section
+    Streamer->EmitIntValue(0, 4);  // info
+    Streamer->EmitIntValue(ri_gprmask, 4);
+    Streamer->EmitIntValue(0, 4); // pad
+    Streamer->EmitIntValue(ri_cprmask[0], 4);
+    Streamer->EmitIntValue(ri_cprmask[1], 4);
+    Streamer->EmitIntValue(ri_cprmask[2], 4);
+    Streamer->EmitIntValue(ri_cprmask[3], 4);
+    Streamer->EmitIntValue(ri_gp_value, 8);
+  } else {
+    const MCSectionELF *Sec =
+        Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC,
+                              SectionKind::getMetadata(), 24, "");
+    MCA.getOrCreateSectionData(*Sec)
+        .setAlignment(Features & Mips::FeatureN32 ? 8 : 4);
+    Streamer->SwitchSection(Sec);
+
+    Streamer->EmitIntValue(ri_gprmask, 4);
+    Streamer->EmitIntValue(ri_cprmask[0], 4);
+    Streamer->EmitIntValue(ri_cprmask[1], 4);
+    Streamer->EmitIntValue(ri_cprmask[2], 4);
+    Streamer->EmitIntValue(ri_cprmask[3], 4);
+    assert((ri_gp_value & 0xffffffff) == ri_gp_value);
+    Streamer->EmitIntValue(ri_gp_value, 4);
+  }
+
+  Streamer->PopSection();
+}
+
+void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg,
+                                       const MCRegisterInfo *MCRegInfo) {
+  unsigned Value = 0;
+
+  for (MCSubRegIterator SubRegIt(Reg, MCRegInfo, true); SubRegIt.isValid();
+       ++SubRegIt) {
+    unsigned CurrentSubReg = *SubRegIt;
+
+    unsigned EncVal = MCRegInfo->getEncodingValue(CurrentSubReg);
+    Value |= 1 << EncVal;
+
+    if (GPR32RegClass->contains(CurrentSubReg) ||
+        GPR64RegClass->contains(CurrentSubReg))
+      ri_gprmask |= Value;
+    else if (FGR32RegClass->contains(CurrentSubReg) ||
+             FGR64RegClass->contains(CurrentSubReg) ||
+             AFGR64RegClass->contains(CurrentSubReg) ||
+             MSA128BRegClass->contains(CurrentSubReg))
+      ri_cprmask[1] |= Value;
+    else if (COP2RegClass->contains(CurrentSubReg))
+      ri_cprmask[2] |= Value;
+    else if (COP3RegClass->contains(CurrentSubReg))
+      ri_cprmask[3] |= Value;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
new file mode 100644
index 0000000..4a178e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -0,0 +1,642 @@
+//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCTargetDesc.h"
+#include "MipsTargetObjectFile.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), canHaveModuleDirective(true) {}
+void MipsTargetStreamer::emitDirectiveSetMicroMips() {}
+void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {}
+void MipsTargetStreamer::emitDirectiveSetMips16() {}
+void MipsTargetStreamer::emitDirectiveSetNoMips16() {}
+void MipsTargetStreamer::emitDirectiveSetReorder() {}
+void MipsTargetStreamer::emitDirectiveSetNoReorder() {}
+void MipsTargetStreamer::emitDirectiveSetMacro() {}
+void MipsTargetStreamer::emitDirectiveSetNoMacro() {}
+void MipsTargetStreamer::emitDirectiveSetAt() {}
+void MipsTargetStreamer::emitDirectiveSetNoAt() {}
+void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {}
+void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {}
+void MipsTargetStreamer::emitDirectiveAbiCalls() {}
+void MipsTargetStreamer::emitDirectiveNaN2008() {}
+void MipsTargetStreamer::emitDirectiveNaNLegacy() {}
+void MipsTargetStreamer::emitDirectiveOptionPic0() {}
+void MipsTargetStreamer::emitDirectiveOptionPic2() {}
+void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                   unsigned ReturnReg) {}
+void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {}
+void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) {
+}
+void MipsTargetStreamer::emitDirectiveSetMips32R2() {}
+void MipsTargetStreamer::emitDirectiveSetMips64() {}
+void MipsTargetStreamer::emitDirectiveSetMips64R2() {}
+void MipsTargetStreamer::emitDirectiveSetDsp() {}
+void MipsTargetStreamer::emitDirectiveCpload(unsigned RegNo) {}
+void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                                              const MCSymbol &Sym, bool IsReg) {
+}
+void MipsTargetStreamer::emitDirectiveModuleOddSPReg(bool Enabled,
+                                                     bool IsO32ABI) {
+  if (!Enabled && !IsO32ABI)
+    report_fatal_error("+nooddspreg is only valid for O32");
+}
+
+MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
+                                             formatted_raw_ostream &OS)
+    : MipsTargetStreamer(S), OS(OS) {}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() {
+  OS << "\t.set\tmicromips\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() {
+  OS << "\t.set\tnomicromips\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips16() {
+  OS << "\t.set\tmips16\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() {
+  OS << "\t.set\tnomips16\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetReorder() {
+  OS << "\t.set\treorder\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
+  OS << "\t.set\tnoreorder\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMacro() {
+  OS << "\t.set\tmacro\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() {
+  OS << "\t.set\tnomacro\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetAt() {
+  OS << "\t.set\tat\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
+  OS << "\t.set\tnoat\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) {
+  OS << "\t.end\t" << Name << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+  OS << "\t.ent\t" << Symbol.getName() << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveAbiCalls() { OS << "\t.abicalls\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveNaN2008() { OS << "\t.nan\t2008\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveNaNLegacy() {
+  OS << "\t.nan\tlegacy\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveOptionPic0() {
+  OS << "\t.option\tpic0\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveOptionPic2() {
+  OS << "\t.option\tpic2\n";
+}
+
+void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                      unsigned ReturnReg) {
+  OS << "\t.frame\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(StackReg)).lower() << ","
+     << StackSize << ",$"
+     << StringRef(MipsInstPrinter::getRegisterName(ReturnReg)).lower() << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
+  OS << "\t.set\tmips32r2\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64() {
+  OS << "\t.set\tmips64\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
+  OS << "\t.set\tmips64r2\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetDsp() {
+  OS << "\t.set\tdsp\n";
+  setCanHaveModuleDir(false);
+}
+// Print a 32 bit hex number with all numbers.
+static void printHex32(unsigned Value, raw_ostream &OS) {
+  OS << "0x";
+  for (int i = 7; i >= 0; i--)
+    OS.write_hex((Value & (0xF << (i * 4))) >> (i * 4));
+}
+
+void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask,
+                                     int CPUTopSavedRegOff) {
+  OS << "\t.mask \t";
+  printHex32(CPUBitmask, OS);
+  OS << ',' << CPUTopSavedRegOff << '\n';
+}
+
+void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
+                                      int FPUTopSavedRegOff) {
+  OS << "\t.fmask\t";
+  printHex32(FPUBitmask, OS);
+  OS << "," << FPUTopSavedRegOff << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) {
+  OS << "\t.cpload\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  OS << "\t.cpsetup\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << ", ";
+
+  if (IsReg)
+    OS << "$"
+       << StringRef(MipsInstPrinter::getRegisterName(RegOrOffset)).lower();
+  else
+    OS << RegOrOffset;
+
+  OS << ", ";
+
+  OS << Sym.getName() << "\n";
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleFP(
+    MipsABIFlagsSection::FpABIKind Value, bool Is32BitABI) {
+  MipsTargetStreamer::emitDirectiveModuleFP(Value, Is32BitABI);
+
+  StringRef ModuleValue;
+  OS << "\t.module\tfp=";
+  OS << ABIFlagsSection.getFpABIString(Value) << "\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetFp(
+    MipsABIFlagsSection::FpABIKind Value) {
+  StringRef ModuleValue;
+  OS << "\t.set\tfp=";
+  OS << ABIFlagsSection.getFpABIString(Value) << "\n";
+}
+
+void MipsTargetAsmStreamer::emitMipsAbiFlags() {
+  // No action required for text output.
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg(bool Enabled,
+                                                        bool IsO32ABI) {
+  MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI);
+
+  OS << "\t.module\t" << (Enabled ? "" : "no") << "oddspreg\n";
+}
+
+// This part is for ELF object output.
+MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
+                                             const MCSubtargetInfo &STI)
+    : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  uint64_t Features = STI.getFeatureBits();
+  Triple T(STI.getTargetTriple());
+  Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
+            ? true
+            : false;
+
+  // Update e_header flags
+  unsigned EFlags = 0;
+
+  // Architecture
+  if (Features & Mips::FeatureMips64r6)
+    EFlags |= ELF::EF_MIPS_ARCH_64R6;
+  else if (Features & Mips::FeatureMips64r2)
+    EFlags |= ELF::EF_MIPS_ARCH_64R2;
+  else if (Features & Mips::FeatureMips64)
+    EFlags |= ELF::EF_MIPS_ARCH_64;
+  else if (Features & Mips::FeatureMips5)
+    EFlags |= ELF::EF_MIPS_ARCH_5;
+  else if (Features & Mips::FeatureMips4)
+    EFlags |= ELF::EF_MIPS_ARCH_4;
+  else if (Features & Mips::FeatureMips3)
+    EFlags |= ELF::EF_MIPS_ARCH_3;
+  else if (Features & Mips::FeatureMips32r6)
+    EFlags |= ELF::EF_MIPS_ARCH_32R6;
+  else if (Features & Mips::FeatureMips32r2)
+    EFlags |= ELF::EF_MIPS_ARCH_32R2;
+  else if (Features & Mips::FeatureMips32)
+    EFlags |= ELF::EF_MIPS_ARCH_32;
+  else if (Features & Mips::FeatureMips2)
+    EFlags |= ELF::EF_MIPS_ARCH_2;
+  else
+    EFlags |= ELF::EF_MIPS_ARCH_1;
+
+  // ABI
+  // N64 does not require any ABI bits.
+  if (Features & Mips::FeatureO32)
+    EFlags |= ELF::EF_MIPS_ABI_O32;
+  else if (Features & Mips::FeatureN32)
+    EFlags |= ELF::EF_MIPS_ABI2;
+
+  if (Features & Mips::FeatureGP64Bit) {
+    if (Features & Mips::FeatureO32)
+      EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
+  } else if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64)
+    EFlags |= ELF::EF_MIPS_32BITMODE;
+
+  // Other options.
+  if (Features & Mips::FeatureNaN2008)
+    EFlags |= ELF::EF_MIPS_NAN2008;
+
+  // -mabicalls and -mplt are not implemented but we should act as if they were
+  // given.
+  EFlags |= ELF::EF_MIPS_CPIC;
+  if (Features & Mips::FeatureN64)
+    EFlags |= ELF::EF_MIPS_PIC;
+
+  MCA.setELFHeaderEFlags(EFlags);
+}
+
+void MipsTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+  if (!isMicroMipsEnabled())
+    return;
+  MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Symbol);
+  uint8_t Type = MCELF::GetType(Data);
+  if (Type != ELF::STT_FUNC)
+    return;
+
+  // The "other" values are stored in the last 6 bits of the second byte
+  // The traditional defines for STO values assume the full byte and thus
+  // the shift to pack it.
+  MCELF::setOther(Data, ELF::STO_MIPS_MICROMIPS >> 2);
+}
+
+void MipsTargetELFStreamer::finish() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
+
+  // .bss, .text and .data are always at least 16-byte aligned.
+  MCSectionData &TextSectionData =
+      MCA.getOrCreateSectionData(*OFI.getTextSection());
+  MCSectionData &DataSectionData =
+      MCA.getOrCreateSectionData(*OFI.getDataSection());
+  MCSectionData &BSSSectionData =
+      MCA.getOrCreateSectionData(*OFI.getBSSSection());
+
+  TextSectionData.setAlignment(std::max(16u, TextSectionData.getAlignment()));
+  DataSectionData.setAlignment(std::max(16u, DataSectionData.getAlignment()));
+  BSSSectionData.setAlignment(std::max(16u, BSSSectionData.getAlignment()));
+
+  // Emit all the option records.
+  // At the moment we are only emitting .Mips.options (ODK_REGINFO) and
+  // .reginfo.
+  MipsELFStreamer &MEF = static_cast<MipsELFStreamer &>(Streamer);
+  MEF.EmitMipsOptionRecords();
+
+  emitMipsAbiFlags();
+}
+
+void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol,
+                                           const MCExpr *Value) {
+  // If on rhs is micromips symbol then mark Symbol as microMips.
+  if (Value->getKind() != MCExpr::SymbolRef)
+    return;
+  const MCSymbol &RhsSym =
+      static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
+  MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym);
+  uint8_t Type = MCELF::GetType(Data);
+  if ((Type != ELF::STT_FUNC) ||
+      !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2)))
+    return;
+
+  MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol);
+  // The "other" values are stored in the last 6 bits of the second byte.
+  // The traditional defines for STO values assume the full byte and thus
+  // the shift to pack it.
+  MCELF::setOther(SymbolData, ELF::STO_MIPS_MICROMIPS >> 2);
+}
+
+MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
+  MicroMipsEnabled = true;
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_MICROMIPS;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
+  MicroMipsEnabled = false;
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips16() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
+  MCA.setELFHeaderEFlags(Flags);
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoMips16() {
+  // FIXME: implement.
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetReorder() {
+  // FIXME: implement.
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_NOREORDER;
+  MCA.setELFHeaderEFlags(Flags);
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMacro() {
+  // FIXME: implement.
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoMacro() {
+  // FIXME: implement.
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetAt() {
+  // FIXME: implement.
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoAt() {
+  // FIXME: implement.
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
+  // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+  // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveNaN2008() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_NAN2008;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveNaNLegacy() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags &= ~ELF::EF_MIPS_NAN2008;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  // This option overrides other PIC options like -KPIC.
+  Pic = false;
+  Flags &= ~ELF::EF_MIPS_PIC;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveOptionPic2() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Pic = true;
+  // NOTE: We are following the GAS behaviour here which means the directive
+  // 'pic2' also sets the CPIC bit in the ELF header. This is different from
+  // what is stated in the SYSV ABI which consider the bits EF_MIPS_PIC and
+  // EF_MIPS_CPIC to be mutually exclusive.
+  Flags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                      unsigned ReturnReg) {
+  // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitMask(unsigned CPUBitmask,
+                                     int CPUTopSavedRegOff) {
+  // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
+                                      int FPUTopSavedRegOff) {
+  // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips32R2() {
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips64() {
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips64R2() {
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetDsp() {
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) {
+  // .cpload $reg
+  // This directive expands to:
+  // lui   $gp, %hi(_gp_disp)
+  // addui $gp, $gp, %lo(_gp_disp)
+  // addu  $gp, $gp, $reg
+  // when support for position independent code is enabled.
+  if (!Pic || (isN32() || isN64()))
+    return;
+
+  // There's a GNU extension controlled by -mno-shared that allows
+  // locally-binding symbols to be accessed using absolute addresses.
+  // This is currently not supported. When supported -mno-shared makes
+  // .cpload expand to:
+  //   lui     $gp, %hi(__gnu_local_gp)
+  //   addiu   $gp, $gp, %lo(__gnu_local_gp)
+
+  StringRef SymName("_gp_disp");
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCSymbol *GP_Disp = MCA.getContext().GetOrCreateSymbol(SymName);
+  MCA.getOrCreateSymbolData(*GP_Disp);
+
+  MCInst TmpInst;
+  TmpInst.setOpcode(Mips::LUi);
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  const MCSymbolRefExpr *HiSym = MCSymbolRefExpr::Create(
+      "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_HI, MCA.getContext());
+  TmpInst.addOperand(MCOperand::CreateExpr(HiSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  TmpInst.setOpcode(Mips::ADDiu);
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  const MCSymbolRefExpr *LoSym = MCSymbolRefExpr::Create(
+      "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_LO, MCA.getContext());
+  TmpInst.addOperand(MCOperand::CreateExpr(LoSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  TmpInst.setOpcode(Mips::ADDu);
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::CreateReg(RegNo));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  // Only N32 and N64 emit anything for .cpsetup iff PIC is set.
+  if (!Pic || !(isN32() || isN64()))
+    return;
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCInst Inst;
+
+  // Either store the old $gp in a register or on the stack
+  if (IsReg) {
+    // move $save, $gpreg
+    Inst.setOpcode(Mips::DADDu);
+    Inst.addOperand(MCOperand::CreateReg(RegOrOffset));
+    Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+    Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+  } else {
+    // sd $gpreg, offset($sp)
+    Inst.setOpcode(Mips::SD);
+    Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+    Inst.addOperand(MCOperand::CreateReg(Mips::SP));
+    Inst.addOperand(MCOperand::CreateImm(RegOrOffset));
+  }
+  getStreamer().EmitInstruction(Inst, STI);
+  Inst.clear();
+
+  const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+      Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext());
+  const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+      Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext());
+  // lui $gp, %hi(%neg(%gp_rel(funcSym)))
+  Inst.setOpcode(Mips::LUi);
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateExpr(HiExpr));
+  getStreamer().EmitInstruction(Inst, STI);
+  Inst.clear();
+
+  // addiu  $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
+  Inst.setOpcode(Mips::ADDiu);
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateExpr(LoExpr));
+  getStreamer().EmitInstruction(Inst, STI);
+  Inst.clear();
+
+  // daddu  $gp, $gp, $funcreg
+  Inst.setOpcode(Mips::DADDu);
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateReg(RegNo));
+  getStreamer().EmitInstruction(Inst, STI);
+
+  setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitMipsAbiFlags() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCContext &Context = MCA.getContext();
+  MCStreamer &OS = getStreamer();
+  const MCSectionELF *Sec =
+      Context.getELFSection(".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS,
+                            ELF::SHF_ALLOC, SectionKind::getMetadata(), 24, "");
+  MCSectionData &ABIShndxSD = MCA.getOrCreateSectionData(*Sec);
+  ABIShndxSD.setAlignment(8);
+  OS.SwitchSection(Sec);
+
+  OS << ABIFlagsSection;
+}
+
+void MipsTargetELFStreamer::emitDirectiveModuleOddSPReg(bool Enabled,
+                                                        bool IsO32ABI) {
+  MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI);
+
+  ABIFlagsSection.OddSPReg = Enabled;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MSA.txt b/contrib/llvm/lib/Target/Mips/MSA.txt
new file mode 100644
index 0000000..113375f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MSA.txt
@@ -0,0 +1,83 @@
+Code Generation Notes for MSA
+=============================
+
+Intrinsics are lowered to SelectionDAG nodes where possible in order to enable
+optimisation, reduce the size of the ISel matcher, and reduce repetition in
+the implementation. In a small number of cases, this can cause different
+(semantically equivalent) instructions to be used in place of the requested
+instruction, even when no optimisation has taken place.
+
+Instructions
+============
+
+This section describes any quirks of instruction selection for MSA. For
+example, two instructions might be equally valid for some given IR and one is
+chosen in preference to the other.
+
+bclri.b:
+        It is not possible to emit bclri.b since andi.b covers exactly the
+        same cases. andi.b should use fractionally less power than bclri.b in
+        most hardware implementations so it is used in preference to bclri.b.
+
+vshf.w:
+        It is not possible to emit vshf.w when the shuffle description is
+        constant since shf.w covers exactly the same cases. shf.w is used
+        instead. It is also impossible for the shuffle description to be
+        unknown at compile-time due to the definition of shufflevector in
+        LLVM IR.
+
+vshf.[bhwd]
+        When the shuffle description describes a splat operation, splat.[bhwd]
+        instructions will be selected instead of vshf.[bhwd]. Unlike the ilv*,
+        and pck* instructions, this is matched from MipsISD::VSHF instead of
+        a special-case MipsISD node.
+
+ilvl.d, pckev.d:
+        It is not possible to emit ilvl.d, or pckev.d since ilvev.d covers the
+        same shuffle. ilvev.d will be emitted instead.
+
+ilvr.d, ilvod.d, pckod.d:
+        It is not possible to emit ilvr.d, or pckod.d since ilvod.d covers the
+        same shuffle. ilvod.d will be emitted instead.
+
+splat.[bhwd]
+        The intrinsic will work as expected. However, unlike other intrinsics
+        it lowers directly to MipsISD::VSHF instead of using common IR.
+
+splati.w:
+        It is not possible to emit splati.w since shf.w covers the same cases.
+        shf.w will be emitted instead.
+
+copy_s.w:
+        On MIPS32, the copy_u.d intrinsic will emit this instruction instead of
+        copy_u.w. This is semantically equivalent since the general-purpose
+        register file is 32-bits wide.
+
+binsri.[bhwd],  binsli.[bhwd]:
+        These two operations are equivalent to each other with the operands
+        swapped and condition inverted. The compiler may use either one as
+        appropriate.
+        Furthermore, the compiler may use bsel.[bhwd] for some masks that do
+        not survive the legalization process (this is a bug and will be fixed).
+
+bmnz.v, bmz.v, bsel.v:
+        These three operations differ only in the operand that is tied to the
+        result and the order of the operands.
+        It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
+        the same operation and will be emitted instead.
+        In future, the compiler may choose between these three instructions
+        according to register allocation.
+        These three operations can be very confusing so here is a mapping
+        between the instructions and the vselect node in one place:
+                bmz.v  wd, ws, wt/i8 -> (vselect wt/i8, wd, ws)
+                bmnz.v wd, ws, wt/i8 -> (vselect wt/i8, ws, wd)
+                bsel.v wd, ws, wt/i8 -> (vselect wd, wt/i8, ws)
+
+bmnzi.b, bmzi.b:
+        Like their non-immediate counterparts, bmnzi.v and bmzi.v are the same
+        operation with the operands swapped. bmnzi.v will (currently) be emitted
+        for both cases.
+
+bseli.v:
+        Unlike the non-immediate versions, bseli.v is distinguishable from
+        bmnzi.b and bmzi.b and can be emitted.
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
new file mode 100644
index 0000000..b93017a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -0,0 +1,148 @@
+let isCodeGenOnly = 1, Predicates = [InMicroMips] in {
+def FADD_S_MM : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+                ADDS_FM_MM<0, 0x30>;
+def FDIV_S_MM : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+                ADDS_FM_MM<0, 0xf0>;
+def FMUL_S_MM : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+                ADDS_FM_MM<0, 0xb0>;
+def FSUB_S_MM : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+                ADDS_FM_MM<0, 0x70>;
+
+def FADD_MM  : MMRel, ADDS_FT<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>,
+               ADDS_FM_MM<1, 0x30>;
+def FDIV_MM  : MMRel, ADDS_FT<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>,
+               ADDS_FM_MM<1, 0xf0>;
+def FMUL_MM  : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
+               ADDS_FM_MM<1, 0xb0>;
+def FSUB_MM  : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
+               ADDS_FM_MM<1, 0x70>;
+
+def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM_MM<0x27>;
+def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>,
+              LW_FM_MM<0x26>;
+def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM_MM<0x2f>;
+def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>,
+              LW_FM_MM<0x2e>;
+def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
+               LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
+               SWXC1_FM_MM<0x88>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
+               LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2_NOT_32R6_64R6;
+def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
+               SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2_NOT_32R6_64R6;
+
+def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
+                  CEQS_FM_MM<0>;
+def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>,
+                  CEQS_FM_MM<1>;
+
+def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>,
+              BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>,
+              BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6;
+
+def CEIL_W_S_MM  : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+                   ROUND_W_FM_MM<0, 0x6c>;
+def CVT_W_S_MM   : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+                   ROUND_W_FM_MM<0, 0x24>;
+def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+                   ROUND_W_FM_MM<0, 0x2c>;
+def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+                   ROUND_W_FM_MM<0, 0xec>;
+def TRUNC_W_S_MM : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+                   ROUND_W_FM_MM<0, 0xac>;
+def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+                                fsqrt>, ROUND_W_FM_MM<0, 0x28>;
+
+def CEIL_W_MM  : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
+                 ROUND_W_FM_MM<1, 0x6c>;
+def CVT_W_MM   : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                 ROUND_W_FM_MM<1, 0x24>;
+def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
+                 ROUND_W_FM_MM<1, 0x2c>;
+def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
+                 ROUND_W_FM_MM<1, 0xec>;
+def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
+                 ROUND_W_FM_MM<1, 0xac>;
+
+def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
+                              fsqrt>, ROUND_W_FM_MM<1, 0x28>;
+
+def CVT_L_S_MM   : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+                   ROUND_W_FM_MM<0, 0x4>, INSN_MIPS3_32R2;
+def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+                   ROUND_W_FM_MM<1, 0x4>, INSN_MIPS3_32R2;
+
+def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+                ABS_FM_MM<0, 0xd>;
+def FMOV_S_MM : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+                ABS_FM_MM<0, 0x1>;
+def FNEG_S_MM : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+                ABS_FM_MM<0, 0x2d>;
+def CVT_D_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                 ABS_FM_MM<0, 0x4d>;
+def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                   ABS_FM_MM<1, 0x4d>;
+def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                   ABS_FM_MM<0, 0x6d>;
+def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+                 ABS_FM_MM<1, 0x6d>;
+
+def FABS_MM : MMRel, ABSS_FT<"abs.d", AFGR64Opnd, AFGR64Opnd, II_ABS, fabs>,
+              ABS_FM_MM<1, 0xd>;
+def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
+              ABS_FM_MM<1, 0x2d>;
+
+def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+                  ABS_FM_MM<1, 0x1>, AdditionalRequires<[NotFP64bit]>;
+
+def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
+                                     II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>;
+def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
+                                     II_MOVN_S>, CMov_I_F_FM_MM<0x38, 0>;
+def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+                                       II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>;
+def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+                                       II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>;
+
+def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
+                                   MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>;
+def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
+                                   MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>;
+def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+                                     MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 1>;
+def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+                                     MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>;
+
+def CFC1_MM : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>,
+              MFC1_FM_MM<0x40>;
+def CTC1_MM : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>,
+              MFC1_FM_MM<0x60>;
+def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
+                             II_MFC1, bitconvert>, MFC1_FM_MM<0x80>;
+def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
+                             II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
+def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
+               MFC1_FM_MM<3>, ISA_MIPS32R2;
+def MTHC1_MM : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
+               MFC1_FM_MM<7>, ISA_MIPS32R2;
+
+def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+                MADDS_FM_MM<0x1>;
+def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+                MADDS_FM_MM<0x21>;
+def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+                 MADDS_FM_MM<0x2>;
+def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+                 MADDS_FM_MM<0x22>;
+
+def MADD_D32_MM  : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+                   MADDS_FM_MM<0x9>;
+def MSUB_D32_MM  : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+                   MADDS_FM_MM<0x29>;
+def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+                   MADDS_FM_MM<0xa>;
+def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+                   MADDS_FM_MM<0x2a>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
new file mode 100644
index 0000000..15b951d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -0,0 +1,623 @@
+//===----------------------------------------------------------------------===//
+// MicroMIPS Base Classes
+//===----------------------------------------------------------------------===//
+
+//
+// Base class for MicroMips instructions.
+// This class does not depend on the instruction size.
+//
+class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
+                        InstrItinClass itin, Format f> : Instruction
+{
+  let Namespace = "Mips";
+  let DecoderNamespace = "MicroMips";
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  let Predicates = [InMicroMips];
+
+  Format Form = f;
+}
+
+//
+// Base class for MicroMIPS 16-bit instructions.
+//
+class MicroMipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+               InstrItinClass itin, Format f> :
+  MicroMipsInstBase<outs, ins, asmstr, pattern, itin, f>
+{
+  let Size = 2;
+  field bits<16> Inst;
+  field bits<16> SoftFail = 0;
+  bits<6> Opcode = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 16-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+class MOVE_FM_MM16<bits<6> funct> {
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = funct;
+  let Inst{9-5}   = rd;
+  let Inst{4-0}   = rs;
+}
+
+class JALR_FM_MM16<bits<5> op> {
+  bits<5> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = op;
+  let Inst{4-0}   = rs;
+}
+
+class MFHILO_FM_MM16<bits<5> funct> {
+  bits<5> rd;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = funct;
+  let Inst{4-0}   = rd;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 32-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+class MMArch {
+  string Arch = "micromips";
+  list<dag> Pattern = [];
+}
+
+class ADD_FM_MM<bits<6> op, bits<10> funct> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = funct;
+}
+
+class ADDI_FM_MM<bits<6> op> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class SLTI_FM_MM<bits<6> op> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class LUI_FM_MM : MMArch {
+  bits<5> rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = 0xd;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class MULT_FM_MM<bits<10> funct> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SRA_FM_MM<bits<10> funct, bit rotate> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> shamt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = shamt;
+  let Inst{10}    = rotate;
+  let Inst{9-0}   = funct;
+}
+
+class SRLV_FM_MM<bits<10> funct, bit rotate> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = rotate;
+  let Inst{9-0}   = funct;
+}
+
+class LW_FM_MM<bits<6> op> : MMArch {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-0}  = addr{15-0};
+}
+
+class LWL_FM_MM<bits<4> funct> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class CMov_F_I_FM_MM<bits<7> func> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-13} = fcc;
+  let Inst{12-6}  = func;
+  let Inst{5-0}   = 0x3b;
+}
+
+class MTLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class MFLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rd;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class CLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SEB_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class EXT_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class J_FM_MM<bits<6> op> : MMArch {
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+class JR_FM_MM<bits<8> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0x0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class JALR_FM_MM<bits<10> funct> {
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BEQ_FM_MM<bits<6> op> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZAL_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class SYNC_FM_MM : MMArch {
+  bits<5> stype;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x0;
+  let Inst{20-16} = stype;
+  let Inst{15-6}  = 0x1ad;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BRK_FM_MM : MMArch {
+  bits<10> code_1;
+  bits<10> code_2;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_1;
+  let Inst{15-6}  = code_2;
+  let Inst{5-0}   = 0x07;
+}
+
+class SYS_FM_MM : MMArch {
+  bits<10> code_;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_;
+  let Inst{15-6}  = 0x22d;
+  let Inst{5-0}   = 0x3c;
+}
+
+class WAIT_FM_MM {
+  bits<10> code_;
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-16} = code_;
+  let Inst{15-6}  = 0x24d;
+  let Inst{5-0}   = 0x3c;
+}
+
+class ER_FM_MM<bits<10> funct> : MMArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-16} = 0x00;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class EI_FM_MM<bits<10> funct> : MMArch {
+  bits<32> Inst;
+  bits<5> rt;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class TEQ_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<4> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = code_;
+  let Inst{11-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class TEQI_FM_MM<bits<5> funct> : MMArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class LL_FM_MM<bits<4> funct> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10}    = 0;
+  let Inst{9-8}   = fmt;
+  let Inst{7-0}   = funct;
+
+  list<dag> Pattern = [];
+}
+
+class LWXC1_FM_MM<bits<9> funct> : MMArch {
+  bits<5> fd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = fd;
+  let Inst{10-9}  = 0x0;
+  let Inst{8-0}   = funct;
+}
+
+class SWXC1_FM_MM<bits<9> funct> : MMArch {
+  bits<5> fs;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = fs;
+  let Inst{10-9}  = 0x0;
+  let Inst{8-0}   = funct;
+}
+
+class CEQS_FM_MM<bits<2> fmt> : MMArch {
+  bits<5> fs;
+  bits<5> ft;
+  bits<4> cond;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-13} = 0x0;  // cc
+  let Inst{12}    = 0;
+  let Inst{11-10} = fmt;
+  let Inst{9-6}   = cond;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BC1F_FM_MM<bits<5> tf> : MMArch {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = tf;
+  let Inst{20-18} = 0x0; // cc
+  let Inst{17-16} = 0x0;
+  let Inst{15-0}  = offset;
+}
+
+class ROUND_W_FM_MM<bits<1> fmt, bits<8> funct> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = fd;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14}    = fmt;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3b;
+}
+
+class ABS_FM_MM<bits<2> fmt, bits<7> funct> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = fd;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14-13} = fmt;
+  let Inst{12-6}  = funct;
+  let Inst{5-0}   = 0x3b;
+}
+
+class CMov_F_F_FM_MM<bits<9> func, bits<2> fmt> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = fd;
+  let Inst{20-16} = fs;
+  let Inst{15-13} = 0x0; //cc
+  let Inst{12-11} = 0x0;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = func;
+}
+
+class CMov_I_F_FM_MM<bits<8> funct, bits<2> fmt> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{9-8}   = fmt;
+  let Inst{7-0}   = funct;
+}
+
+class MFC1_FM_MM<bits<8> funct> : MMArch {
+  bits<5> rt;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = fs;
+  let Inst{15-14} = 0x0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3b;
+}
+
+class MADDS_FM_MM<bits<6> funct>: MMArch {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+  bits<5> fr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-6}  = fr;
+  let Inst{5-0}   = funct;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
new file mode 100644
index 0000000..87a3a3e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -0,0 +1,306 @@
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
+
+def simm12 : Operand<i32> {
+  let DecoderMethod = "DecodeSimm12";
+}
+
+def mem_mm_12 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR32, simm12);
+  let EncoderMethod = "getMemEncodingMMImm12";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def jmptarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def calltarget_mm : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def brtarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMM";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTargetMM";
+}
+
+let canFoldAsLoad = 1 in
+class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                      Operand MemOpnd> :
+  InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addrimm12:$addr, RO:$src))],
+         NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                       Operand MemOpnd>:
+  InstSE<(outs), (ins RO:$rt, MemOpnd:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addrimm12:$addr)], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+}
+
+class LLBaseMM<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayLoad = 1;
+}
+
+class SCBaseMM<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayStore = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+             InstrItinClass Itin = NoItinerary> :
+  InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addrimm12:$addr))], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> :
+      MicroMipsInst16<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"),
+  [], II_MFHI_MFLO, FrmR> {
+  let Uses = [UseReg];
+  let hasSideEffects = 0;
+}
+
+class MoveMM16<string opstr, RegisterOperand RO, bit isComm = 0,
+               InstrItinClass Itin = NoItinerary> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs),
+                  !strconcat(opstr, "\t$rd, $rs"), [], Itin, FrmR> {
+  let isCommutable = isComm;
+  let isReMaterializable = 1;
+}
+
+// 16-bit Jump and Link (Call)
+class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+           [(MipsJmpLink RO:$rs)], IIBranch, FrmR> {
+  let isCall = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA];
+}
+
+def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
+def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
+def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>;
+
+class WaitMM<string opstr> :
+  InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
+         NoItinerary, FrmOther, opstr>;
+
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+  /// Arithmetic Instructions (ALU Immediate)
+  def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd>,
+                 ADDI_FM_MM<0xc>;
+  def ADDi_MM  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>,
+                 ADDI_FM_MM<0x4>;
+  def SLTi_MM  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
+                 SLTI_FM_MM<0x24>;
+  def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
+                 SLTI_FM_MM<0x2c>;
+  def ANDi_MM  : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd>,
+                 ADDI_FM_MM<0x34>;
+  def ORi_MM   : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd>,
+                 ADDI_FM_MM<0x14>;
+  def XORi_MM  : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd>,
+                 ADDI_FM_MM<0x1c>;
+  def LUi_MM   : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM_MM;
+
+  def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>,
+                     LW_FM_MM<0xc>;
+
+  /// Arithmetic Instructions (3-Operand, R-Type)
+  def ADDu_MM  : MMRel, ArithLogicR<"addu", GPR32Opnd>, ADD_FM_MM<0, 0x150>;
+  def SUBu_MM  : MMRel, ArithLogicR<"subu", GPR32Opnd>, ADD_FM_MM<0, 0x1d0>;
+  def MUL_MM   : MMRel, ArithLogicR<"mul", GPR32Opnd>, ADD_FM_MM<0, 0x210>;
+  def ADD_MM   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM_MM<0, 0x110>;
+  def SUB_MM   : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM_MM<0, 0x190>;
+  def SLT_MM   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
+  def SLTu_MM  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
+                 ADD_FM_MM<0, 0x390>;
+  def AND_MM   : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
+                 ADD_FM_MM<0, 0x250>;
+  def OR_MM    : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
+                 ADD_FM_MM<0, 0x290>;
+  def XOR_MM   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
+                 ADD_FM_MM<0, 0x310>;
+  def NOR_MM   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
+  def MULT_MM  : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x22c>;
+  def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x26c>;
+  def SDIV_MM  : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ac>;
+  def UDIV_MM  : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ec>;
+
+  /// Shift Instructions
+  def SLL_MM   : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>,
+                 SRA_FM_MM<0, 0>;
+  def SRL_MM   : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL>,
+                 SRA_FM_MM<0x40, 0>;
+  def SRA_MM   : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA>,
+                 SRA_FM_MM<0x80, 0>;
+  def SLLV_MM  : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV>,
+                 SRLV_FM_MM<0x10, 0>;
+  def SRLV_MM  : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV>,
+                 SRLV_FM_MM<0x50, 0>;
+  def SRAV_MM  : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
+                 SRLV_FM_MM<0x90, 0>;
+  def ROTR_MM  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
+                 SRA_FM_MM<0xc0, 0>;
+  def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
+                 SRLV_FM_MM<0xd0, 0>;
+
+  /// Load and Store Instructions - aligned
+  let DecoderMethod = "DecodeMemMMImm16" in {
+    def LB_MM  : Load<"lb", GPR32Opnd>, MMRel, LW_FM_MM<0x7>;
+    def LBu_MM : Load<"lbu", GPR32Opnd>, MMRel, LW_FM_MM<0x5>;
+    def LH_MM  : Load<"lh", GPR32Opnd>, MMRel, LW_FM_MM<0xf>;
+    def LHu_MM : Load<"lhu", GPR32Opnd>, MMRel, LW_FM_MM<0xd>;
+    def LW_MM  : Load<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
+    def SB_MM  : Store<"sb", GPR32Opnd>, MMRel, LW_FM_MM<0x6>;
+    def SH_MM  : Store<"sh", GPR32Opnd>, MMRel, LW_FM_MM<0xe>;
+    def SW_MM  : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
+  }
+
+  def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>;
+
+  /// Load and Store Instructions - unaligned
+  def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x0>;
+  def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x1>;
+  def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x8>;
+  def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x9>;
+
+  /// Move Conditional
+  def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x58>;
+  def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x18>;
+  def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT>,
+                  CMov_F_I_FM_MM<0x25>;
+  def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF>,
+                  CMov_F_I_FM_MM<0x5>;
+
+  /// Move to/from HI/LO
+  def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
+                MTLO_FM_MM<0x0b5>;
+  def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
+                MTLO_FM_MM<0x0f5>;
+  def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x035>;
+  def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x075>;
+
+  /// Multiply Add/Sub Instructions
+  def MADD_MM  : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>;
+  def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>;
+  def MSUB_MM  : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>;
+  def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
+
+  /// Count Leading
+  def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>,
+               ISA_MIPS32;
+  def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>,
+               ISA_MIPS32;
+
+  /// Sign Ext In Register Instructions.
+  def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+               SEB_FM_MM<0x0ac>, ISA_MIPS32R2;
+  def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+               SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
+
+  /// Word Swap Bytes Within Halfwords
+  def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>,
+                ISA_MIPS32R2;
+
+  def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>,
+               EXT_FM_MM<0x2c>;
+  def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>,
+               EXT_FM_MM<0x0c>;
+
+  /// Jump Instructions
+  let DecoderMethod = "DecodeJumpTargetMM" in {
+    def J_MM        : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+                      J_FM_MM<0x35>;
+    def JAL_MM      : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+  }
+  def JR_MM   : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
+  def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+
+  /// Branch Instructions
+  def BEQ_MM  : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
+                BEQ_FM_MM<0x25>;
+  def BNE_MM  : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
+                BEQ_FM_MM<0x2d>;
+  def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
+                BGEZ_FM_MM<0x2>;
+  def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
+                BGEZ_FM_MM<0x6>;
+  def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
+                BGEZ_FM_MM<0x4>;
+  def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
+                BGEZ_FM_MM<0x0>;
+  def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x03>;
+  def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x01>;
+
+  /// Control Instructions
+  def SYNC_MM    : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
+  def BREAK_MM   : MMRel, BRK_FT<"break">, BRK_FM_MM;
+  def SYSCALL_MM : MMRel, SYS_FT<"syscall">, SYS_FM_MM;
+  def WAIT_MM    : WaitMM<"wait">, WAIT_FM_MM;
+  def ERET_MM    : MMRel, ER_FT<"eret">, ER_FM_MM<0x3cd>;
+  def DERET_MM   : MMRel, ER_FT<"deret">, ER_FM_MM<0x38d>;
+  def EI_MM      : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM_MM<0x15d>,
+                   ISA_MIPS32R2;
+  def DI_MM      : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM_MM<0x11d>,
+                   ISA_MIPS32R2;
+
+  /// Trap Instructions
+  def TEQ_MM  : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
+  def TGE_MM  : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM_MM<0x08>;
+  def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM_MM<0x10>;
+  def TLT_MM  : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM_MM<0x20>;
+  def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM_MM<0x28>;
+  def TNE_MM  : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM_MM<0x30>;
+
+  def TEQI_MM  : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM_MM<0x0e>;
+  def TGEI_MM  : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM_MM<0x09>;
+  def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM_MM<0x0b>;
+  def TLTI_MM  : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM_MM<0x08>;
+  def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM_MM<0x0a>;
+  def TNEI_MM  : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM_MM<0x0c>;
+
+  /// Load-linked, Store-conditional
+  def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
+  def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMips instruction aliases
+//===----------------------------------------------------------------------===//
+
+let Predicates = [InMicroMips] in {
+  def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h
new file mode 100644
index 0000000..d512d65
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.h
@@ -0,0 +1,34 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_MIPS_H
+#define TARGET_MIPS_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MipsTargetMachine;
+  class FunctionPass;
+
+  FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+  FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+                                             JITCodeEmitter &JCE);
+  FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
new file mode 100644
index 0000000..dd3bc9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -0,0 +1,194 @@
+//===-- Mips.td - Describe the Mips Target Machine ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the Mips target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+// The overall idea of the PredicateControl class is to chop the Predicates list
+// into subsets that are usually overridden independently. This allows
+// subclasses to partially override the predicates of their superclasses without
+// having to re-add all the existing predicates.
+class PredicateControl {
+  // Predicates for the encoding scheme in use such as HasStdEnc
+  list<Predicate> EncodingPredicates = [];
+  // Predicates for the GPR size such as IsGP64bit
+  list<Predicate> GPRPredicates = [];
+  // Predicates for the FGR size and layout such as IsFP64bit
+  list<Predicate> FGRPredicates = [];
+  // Predicates for the instruction group membership such as ISA's and ASE's
+  list<Predicate> InsnPredicates = [];
+  // Predicates for anything else
+  list<Predicate> AdditionalPredicates = [];
+  list<Predicate> Predicates = !listconcat(EncodingPredicates,
+                                           GPRPredicates,
+                                           FGRPredicates,
+                                           InsnPredicates,
+                                           AdditionalPredicates);
+}
+
+// Like Requires<> but for the AdditionalPredicates list
+class AdditionalRequires<list<Predicate> preds> {
+  list<Predicate> AdditionalPredicates = preds;
+}
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+
+def MipsInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Mips Subtarget features                                                    //
+//===----------------------------------------------------------------------===//
+
+def FeatureGP64Bit     : SubtargetFeature<"gp64", "IsGP64bit", "true",
+                                "General Purpose Registers are 64-bit wide.">;
+def FeatureFP64Bit     : SubtargetFeature<"fp64", "IsFP64bit", "true",
+                                "Support 64-bit FP registers.">;
+def FeatureFPXX        : SubtargetFeature<"fpxx", "IsFPXX", "true",
+                                "Support for FPXX.">;
+def FeatureNaN2008     : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
+                                "IEEE 754-2008 NaN encoding.">;
+def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
+                                "true", "Only supports single precision float">;
+def FeatureO32         : SubtargetFeature<"o32", "MipsABI", "O32",
+                                "Enable o32 ABI">;
+def FeatureN32         : SubtargetFeature<"n32", "MipsABI", "N32",
+                                "Enable n32 ABI">;
+def FeatureN64         : SubtargetFeature<"n64", "MipsABI", "N64",
+                                "Enable n64 ABI">;
+def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
+                                "Enable eabi ABI">;
+def FeatureNoOddSPReg  : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
+                              "Disable odd numbered single-precision "
+                              "registers">;
+def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU",
+                                "true", "Enable vector FPU instructions.">;
+def FeatureMips1       : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
+                                "Mips I ISA Support [highly experimental]">;
+def FeatureMips2       : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
+                                "Mips II ISA Support [highly experimental]",
+                                [FeatureMips1]>;
+def FeatureMips3_32    : SubtargetFeature<"mips3_32", "HasMips3_32", "true",
+                                "Subset of MIPS-III that is also in MIPS32 "
+                                "[highly experimental]">;
+def FeatureMips3_32r2  : SubtargetFeature<"mips3_32r2", "HasMips3_32r2", "true",
+                                "Subset of MIPS-III that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips3       : SubtargetFeature<"mips3", "MipsArchVersion", "Mips3",
+                                "MIPS III ISA Support [highly experimental]",
+                                [FeatureMips2, FeatureMips3_32,
+                                 FeatureMips3_32r2, FeatureGP64Bit,
+                                 FeatureFP64Bit]>;
+def FeatureMips4_32    : SubtargetFeature<"mips4_32", "HasMips4_32", "true",
+                                "Subset of MIPS-IV that is also in MIPS32 "
+                                "[highly experimental]">;
+def FeatureMips4_32r2  : SubtargetFeature<"mips4_32r2", "HasMips4_32r2", "true",
+                                "Subset of MIPS-IV that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips4       : SubtargetFeature<"mips4", "MipsArchVersion",
+                                "Mips4", "MIPS IV ISA Support",
+                                [FeatureMips3, FeatureMips4_32,
+                                 FeatureMips4_32r2]>;
+def FeatureMips5_32r2  : SubtargetFeature<"mips5_32r2", "HasMips5_32r2", "true",
+                                "Subset of MIPS-V that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips5       : SubtargetFeature<"mips5", "MipsArchVersion", "Mips5",
+                                "MIPS V ISA Support [highly experimental]",
+                                [FeatureMips4, FeatureMips5_32r2]>;
+def FeatureMips32      : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
+                                "Mips32 ISA Support",
+                                [FeatureMips2, FeatureMips3_32,
+                                 FeatureMips4_32]>;
+def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
+                                "Mips32r2", "Mips32r2 ISA Support",
+                                [FeatureMips3_32r2, FeatureMips4_32r2,
+                                 FeatureMips5_32r2, FeatureMips32]>;
+def FeatureMips32r6    : SubtargetFeature<"mips32r6", "MipsArchVersion",
+                                "Mips32r6",
+                                "Mips32r6 ISA Support [experimental]",
+                                [FeatureMips32r2, FeatureFP64Bit,
+                                 FeatureNaN2008]>;
+def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
+                                "Mips64", "Mips64 ISA Support",
+                                [FeatureMips5, FeatureMips32]>;
+def FeatureMips64r2    : SubtargetFeature<"mips64r2", "MipsArchVersion",
+                                "Mips64r2", "Mips64r2 ISA Support",
+                                [FeatureMips64, FeatureMips32r2]>;
+def FeatureMips64r6    : SubtargetFeature<"mips64r6", "MipsArchVersion",
+                                "Mips64r6",
+                                "Mips64r6 ISA Support [experimental]",
+                                [FeatureMips32r6, FeatureMips64r2,
+                                 FeatureNaN2008]>;
+
+def FeatureMips16  : SubtargetFeature<"mips16", "InMips16Mode", "true",
+                                      "Mips16 mode">;
+
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
+def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
+                                    "Mips DSP-R2 ASE", [FeatureDSP]>;
+
+def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+
+def FeatureMicroMips  : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
+                                         "microMips mode">;
+
+def FeatureCnMips     : SubtargetFeature<"cnmips", "HasCnMips",
+                                "true", "Octeon cnMIPS Support",
+                                [FeatureMips64r2]>;
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, MipsGenericItineraries, Features>;
+
+def : Proc<"mips1", [FeatureMips1, FeatureO32]>;
+def : Proc<"mips2", [FeatureMips2, FeatureO32]>;
+def : Proc<"mips32", [FeatureMips32, FeatureO32]>;
+def : Proc<"mips32r2", [FeatureMips32r2, FeatureO32]>;
+def : Proc<"mips32r6", [FeatureMips32r6, FeatureO32]>;
+
+def : Proc<"mips3", [FeatureMips3, FeatureN64]>;
+def : Proc<"mips4", [FeatureMips4, FeatureN64]>;
+def : Proc<"mips5", [FeatureMips5, FeatureN64]>;
+def : Proc<"mips64", [FeatureMips64, FeatureN64]>;
+def : Proc<"mips64r2", [FeatureMips64r2, FeatureN64]>;
+def : Proc<"mips64r6", [FeatureMips64r6, FeatureN64]>;
+def : Proc<"mips16", [FeatureMips16, FeatureO32]>;
+def : Proc<"octeon", [FeatureMips64r2, FeatureN64, FeatureCnMips]>;
+
+def MipsAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+  let MnemonicContainsDot = 1;
+}
+
+def MipsAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "$";
+}
+
+def Mips : Target {
+  let InstructionSet = MipsInstrInfo;
+  let AssemblyParsers = [MipsAsmParser];
+  let AssemblyParserVariants = [MipsAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
new file mode 100644
index 0000000..93706c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -0,0 +1,190 @@
+//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16FrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
+void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  uint64_t StackSize = MFI->getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  MachineLocation DstML, SrcML;
+
+  // Adjust stack.
+  TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  unsigned CFIIndex = MMI.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  if (CSI.size()) {
+    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+         E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+      unsigned DReg = MRI->getDwarfRegNum(Reg, true);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, DReg, Offset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
+      .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup);
+
+}
+
+void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
+                                 MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (!StackSize)
+    return;
+
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::Move32R16), Mips::SP)
+      .addReg(Mips::S0);
+
+  // Adjust stack.
+  // assumes stacksize multiple of 8
+  TII.restoreFrame(Mips::SP, StackSize, MBB, MBBI);
+}
+
+bool Mips16FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = MF->begin();
+
+  //
+  // Registers RA, S0,S1 are the callee saved registers and they
+  // will be saved with the "save" instruction
+  // during emitPrologue
+  //
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::LowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
+      && MF->getFrameInfo()->isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+  }
+
+  return true;
+}
+
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                       const std::vector<CalleeSavedInfo> &CSI,
+                                       const TargetRegisterInfo *TRI) const {
+  //
+  // Registers RA,S0,S1 are the callee saved registers and they will be restored
+  // with the restore instruction during emitEpilogue.
+  // We need to override this virtual function, otherwise llvm will try and
+  // restore the registers on it's on from the stack.
+  //
+
+  return true;
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
+void Mips16FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    int64_t Amount = I->getOperand(0).getImm();
+
+    if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+      Amount = -Amount;
+
+    const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+
+    TII.adjustStackPtr(Mips::SP, Amount, MBB, I);
+  }
+
+  MBB.erase(I);
+}
+
+bool
+Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Reserve call frame if the size of the maximum call frame fits into 15-bit
+  // immediate field and there are no variable sized objects on the stack.
+  return isInt<15>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects();
+}
+
+void Mips16FrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  const Mips16InstrInfo &TII =
+    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RI = TII.getRegisterInfo();
+  const BitVector Reserved = RI.getReservedRegs(MF);
+  bool SaveS2 = Reserved[Mips::S2];
+  if (SaveS2)
+    MF.getRegInfo().setPhysRegUsed(Mips::S2);
+  if (hasFP(MF))
+    MF.getRegInfo().setPhysRegUsed(Mips::S0);
+}
+
+const MipsFrameLowering *
+llvm::createMips16FrameLowering(const MipsSubtarget &ST) {
+  return new Mips16FrameLowering(ST);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
new file mode 100644
index 0000000..1fb7eda
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
@@ -0,0 +1,51 @@
+//===-- Mips16FrameLowering.h - Mips16 frame lowering  ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16_FRAMEINFO_H
+#define MIPS16_FRAMEINFO_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+class Mips16FrameLowering : public MipsFrameLowering {
+public:
+  explicit Mips16FrameLowering(const MipsSubtarget &STI);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
new file mode 100644
index 0000000..14055d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -0,0 +1,542 @@
+//===---- Mips16HardFloat.cpp for Mips16 Hard Float               --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass needed for Mips16 Hard Float
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16HardFloat.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+
+#define DEBUG_TYPE "mips16-hard-float"
+
+static void inlineAsmOut
+  (LLVMContext &C, StringRef AsmString, BasicBlock *BB ) {
+  std::vector<llvm::Type *> AsmArgTypes;
+  std::vector<llvm::Value*> AsmArgs;
+  llvm::FunctionType *AsmFTy =
+    llvm::FunctionType::get(Type::getVoidTy(C),
+                            AsmArgTypes, false);
+  llvm::InlineAsm *IA =
+    llvm::InlineAsm::get(AsmFTy, AsmString, "", true,
+                         /* IsAlignStack */ false,
+                         llvm::InlineAsm::AD_ATT);
+  CallInst::Create(IA, AsmArgs, "", BB);
+}
+
+namespace {
+
+class InlineAsmHelper {
+  LLVMContext &C;
+  BasicBlock *BB;
+public:
+  InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) :
+    C(C_), BB(BB_) {
+  }
+
+  void Out(StringRef AsmString) {
+    inlineAsmOut(C, AsmString, BB);
+  }
+
+};
+}
+//
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant {
+  FRet, DRet, CFRet, CDRet, NoFPRet
+};
+
+//
+// Determine which FP return type this function has
+//
+static FPReturnVariant whichFPReturnVariant(Type *T) {
+  switch (T->getTypeID()) {
+  case Type::FloatTyID:
+    return FRet;
+  case Type::DoubleTyID:
+    return DRet;
+  case Type::StructTyID:
+    if (T->getStructNumElements() != 2)
+      break;
+    if ((T->getContainedType(0)->isFloatTy()) &&
+        (T->getContainedType(1)->isFloatTy()))
+      return CFRet;
+    if ((T->getContainedType(0)->isDoubleTy()) &&
+        (T->getContainedType(1)->isDoubleTy()))
+      return CDRet;
+    break;
+  default:
+    break;
+  }
+  return NoFPRet;
+}
+
+//
+// Parameter type that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant {
+  FSig, FFSig, FDSig,
+  DSig, DDSig, DFSig, NoSig
+};
+
+// which floating point parameter signature variant we are dealing with
+//
+typedef Type::TypeID TypeID;
+const Type::TypeID FloatTyID = Type::FloatTyID;
+const Type::TypeID DoubleTyID = Type::DoubleTyID;
+
+static FPParamVariant whichFPParamVariantNeeded(Function &F) {
+  switch (F.arg_size()) {
+  case 0:
+    return NoSig;
+  case 1:{
+    TypeID ArgTypeID = F.getFunctionType()->getParamType(0)->getTypeID();
+    switch (ArgTypeID) {
+    case FloatTyID:
+      return FSig;
+    case DoubleTyID:
+      return DSig;
+    default:
+      return NoSig;
+    }
+  }
+  default: {
+    TypeID ArgTypeID0 = F.getFunctionType()->getParamType(0)->getTypeID();
+    TypeID ArgTypeID1 = F.getFunctionType()->getParamType(1)->getTypeID();
+    switch(ArgTypeID0) {
+    case FloatTyID: {
+      switch (ArgTypeID1) {
+      case FloatTyID:
+        return FFSig;
+      case DoubleTyID:
+        return FDSig;
+      default:
+        return FSig;
+      }
+    }
+    case DoubleTyID: {
+      switch (ArgTypeID1) {
+      case FloatTyID:
+        return DFSig;
+      case DoubleTyID:
+        return DDSig;
+      default:
+        return DSig;
+      }
+    }
+    default:
+      return NoSig;
+    }
+  }
+  }
+  llvm_unreachable("can't get here");
+}
+
+// Figure out if we need float point based on the function parameters.
+// We need to move variables in and/or out of floating point
+// registers because of the ABI
+//
+static bool needsFPStubFromParams(Function &F) {
+  if (F.arg_size() >=1) {
+    Type *ArgType = F.getFunctionType()->getParamType(0);
+    switch (ArgType->getTypeID()) {
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+        return true;
+      default:
+        break;
+    }
+  }
+  return false;
+}
+
+static bool needsFPReturnHelper(Function &F) {
+  Type* RetType = F.getReturnType();
+  return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
+static bool needsFPReturnHelper(const FunctionType &FT) {
+  Type* RetType = FT.getReturnType();
+  return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
+static bool needsFPHelperFromSig(Function &F) {
+  return needsFPStubFromParams(F) || needsFPReturnHelper(F);
+}
+
+//
+// We swap between FP and Integer registers to allow Mips16 and Mips32 to
+// interoperate
+//
+
+static void swapFPIntParams
+  (FPParamVariant PV, Module *M, InlineAsmHelper &IAH,
+   bool LE, bool ToFP) {
+  //LLVMContext &Context = M->getContext();
+  std::string MI = ToFP? "mtc1 ": "mfc1 ";
+  switch (PV) {
+  case FSig:
+    IAH.Out(MI + "$$4,$$f12");
+    break;
+  case FFSig:
+    IAH.Out(MI +"$$4,$$f12");
+    IAH.Out(MI + "$$5,$$f14");
+    break;
+  case FDSig:
+    IAH.Out(MI + "$$4,$$f12");
+    if (LE) {
+      IAH.Out(MI + "$$6,$$f14");
+      IAH.Out(MI + "$$7,$$f15");
+    } else {
+      IAH.Out(MI + "$$7,$$f14");
+      IAH.Out(MI + "$$6,$$f15");
+    }
+    break;
+  case DSig:
+    if (LE) {
+      IAH.Out(MI + "$$4,$$f12");
+      IAH.Out(MI + "$$5,$$f13");
+    } else {
+      IAH.Out(MI + "$$5,$$f12");
+      IAH.Out(MI + "$$4,$$f13");
+    }
+    break;
+  case DDSig:
+    if (LE) {
+      IAH.Out(MI + "$$4,$$f12");
+      IAH.Out(MI + "$$5,$$f13");
+      IAH.Out(MI + "$$6,$$f14");
+      IAH.Out(MI + "$$7,$$f15");
+    } else {
+      IAH.Out(MI + "$$5,$$f12");
+      IAH.Out(MI + "$$4,$$f13");
+      IAH.Out(MI + "$$7,$$f14");
+      IAH.Out(MI + "$$6,$$f15");
+    }
+    break;
+  case DFSig:
+    if (LE) {
+      IAH.Out(MI + "$$4,$$f12");
+      IAH.Out(MI + "$$5,$$f13");
+    } else {
+      IAH.Out(MI + "$$5,$$f12");
+      IAH.Out(MI + "$$4,$$f13");
+    }
+    IAH.Out(MI + "$$6,$$f14");
+    break;
+  case NoSig:
+    return;
+  }
+}
+//
+// Make sure that we know we already need a stub for this function.
+// Having called needsFPHelperFromSig
+//
+static void assureFPCallStub(Function &F, Module *M,
+                             const MipsSubtarget &Subtarget) {
+  // for now we only need them for static relocation
+  if (Subtarget.getRelocationModel() == Reloc::PIC_)
+    return;
+  LLVMContext &Context = M->getContext();
+  bool LE = Subtarget.isLittle();
+  std::string Name = F.getName();
+  std::string SectionName = ".mips16.call.fp." + Name;
+  std::string StubName = "__call_stub_fp_" + Name;
+  //
+  // see if we already have the stub
+  //
+  Function *FStub = M->getFunction(StubName);
+  if (FStub && !FStub->isDeclaration()) return;
+  FStub = Function::Create(F.getFunctionType(),
+                           Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub");
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+  InlineAsmHelper IAH(Context, BB);
+  IAH.Out(".set reorder");
+  FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType());
+  FPParamVariant PV = whichFPParamVariantNeeded(F);
+  swapFPIntParams(PV, M, IAH, LE, true);
+  if (RV != NoFPRet) {
+    IAH.Out("move $$18, $$31");
+    IAH.Out("jal " + Name);
+  } else {
+    IAH.Out("lui  $$25,%hi(" + Name + ")");
+    IAH.Out("addiu  $$25,$$25,%lo(" + Name + ")" );
+  }
+  switch (RV) {
+  case FRet:
+    IAH.Out("mfc1 $$2,$$f0");
+    break;
+  case DRet:
+    if (LE) {
+      IAH.Out("mfc1 $$2,$$f0");
+      IAH.Out("mfc1 $$3,$$f1");
+    } else {
+      IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$2,$$f1");
+    }
+    break;
+  case CFRet:
+    if (LE) {
+    IAH.Out("mfc1 $$2,$$f0");
+    IAH.Out("mfc1 $$3,$$f2");
+    } else {
+      IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$3,$$f2");
+    }
+    break;
+  case CDRet:
+    if (LE) {
+      IAH.Out("mfc1 $$4,$$f2");
+      IAH.Out("mfc1 $$5,$$f3");
+      IAH.Out("mfc1 $$2,$$f0");
+      IAH.Out("mfc1 $$3,$$f1");
+
+    } else {
+      IAH.Out("mfc1 $$5,$$f2");
+      IAH.Out("mfc1 $$4,$$f3");
+      IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$2,$$f1");
+    }
+    break;
+  case NoFPRet:
+    break;
+  }
+  if (RV != NoFPRet)
+    IAH.Out("jr $$18");
+  else
+    IAH.Out("jr $$25");
+  new UnreachableInst(Context, BB);
+}
+
+//
+// Functions that are llvm intrinsics and don't need helpers.
+//
+static const char *IntrinsicInline[] =
+  {"fabs",
+   "fabsf",
+   "llvm.ceil.f32", "llvm.ceil.f64",
+   "llvm.copysign.f32", "llvm.copysign.f64",
+   "llvm.cos.f32", "llvm.cos.f64",
+   "llvm.exp.f32", "llvm.exp.f64",
+   "llvm.exp2.f32", "llvm.exp2.f64",
+   "llvm.fabs.f32", "llvm.fabs.f64",
+   "llvm.floor.f32", "llvm.floor.f64",
+   "llvm.fma.f32", "llvm.fma.f64",
+   "llvm.log.f32", "llvm.log.f64",
+   "llvm.log10.f32", "llvm.log10.f64",
+   "llvm.nearbyint.f32", "llvm.nearbyint.f64",
+   "llvm.pow.f32", "llvm.pow.f64",
+   "llvm.powi.f32", "llvm.powi.f64",
+   "llvm.rint.f32", "llvm.rint.f64",
+   "llvm.round.f32", "llvm.round.f64",
+   "llvm.sin.f32", "llvm.sin.f64",
+   "llvm.sqrt.f32", "llvm.sqrt.f64",
+   "llvm.trunc.f32", "llvm.trunc.f64",
+  };
+
+static bool isIntrinsicInline(Function *F) {
+  return std::binary_search(std::begin(IntrinsicInline),
+                            std::end(IntrinsicInline), F->getName());
+}
+//
+// Returns of float, double and complex need to be handled with a helper
+// function.
+//
+static bool fixupFPReturnAndCall
+  (Function &F, Module *M,  const MipsSubtarget &Subtarget) {
+  bool Modified = false;
+  LLVMContext &C = M->getContext();
+  Type *MyVoid = Type::getVoidTy(C);
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      Instruction &Inst = *I;
+      if (const ReturnInst *RI = dyn_cast<ReturnInst>(I)) {
+        Value *RVal = RI->getReturnValue();
+        if (!RVal) continue;
+        //
+        // If there is a return value and it needs a helper function,
+        // figure out which one and add a call before the actual
+        // return to this helper. The purpose of the helper is to move
+        // floating point values from their soft float return mapping to
+        // where they would have been mapped to in floating point registers.
+        //
+        Type *T = RVal->getType();
+        FPReturnVariant RV = whichFPReturnVariant(T);
+        if (RV == NoFPRet) continue;
+        static const char* Helper[NoFPRet] =
+          {"__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc",
+           "__mips16_ret_dc"};
+        const char *Name = Helper[RV];
+        AttributeSet A;
+        Value *Params[] = {RVal};
+        Modified = true;
+        //
+        // These helper functions have a different calling ABI so
+        // this __Mips16RetHelper indicates that so that later
+        // during call setup, the proper call lowering to the helper
+        // functions will take place.
+        //
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           "__Mips16RetHelper");
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           Attribute::ReadNone);
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           Attribute::NoInline);
+        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, NULL));
+        CallInst::Create(F, Params, "", &Inst );
+      } else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+          const Value* V = CI->getCalledValue();
+          const Type* T = nullptr;
+          if (V) T = V->getType();
+          const PointerType *PFT=nullptr;
+          if (T) PFT = dyn_cast<PointerType>(T);
+          const FunctionType *FT=nullptr;
+          if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType());
+          Function *F_ =  CI->getCalledFunction();
+          if (FT && needsFPReturnHelper(*FT) &&
+              !(F_ && isIntrinsicInline(F_))) {
+            Modified=true;
+            F.addFnAttr("saveS2");
+          }
+          if (F_ && !isIntrinsicInline(F_)) {
+          // pic mode calls are handled by already defined
+          // helper functions
+            if (needsFPReturnHelper(*F_)) {
+              Modified=true;
+              F.addFnAttr("saveS2");
+            }
+            if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
+              if (needsFPHelperFromSig(*F_)) {
+                assureFPCallStub(*F_, M, Subtarget);
+                Modified=true;
+              }
+            }
+          }
+      }
+    }
+  return Modified;
+}
+
+static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
+                  const MipsSubtarget &Subtarget ) {
+  bool PicMode = Subtarget.getRelocationModel() == Reloc::PIC_;
+  bool LE = Subtarget.isLittle();
+  LLVMContext &Context = M->getContext();
+  std::string Name = F->getName();
+  std::string SectionName = ".mips16.fn." + Name;
+  std::string StubName = "__fn_stub_" + Name;
+  std::string LocalName = "$$__fn_local_" + Name;
+  Function *FStub = Function::Create
+    (F->getFunctionType(),
+     Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub");
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+  InlineAsmHelper IAH(Context, BB);
+  IAH.Out(" .set  macro");
+  if (PicMode) {
+    IAH.Out(".set noreorder");
+    IAH.Out(".cpload  $$25");
+    IAH.Out(".set reorder");
+    IAH.Out(".reloc 0,R_MIPS_NONE," + Name);
+    IAH.Out("la $$25," + LocalName);
+  }
+  else {
+    IAH.Out(".set reorder");
+    IAH.Out("la $$25," + Name);
+  }
+  swapFPIntParams(PV, M, IAH, LE, false);
+  IAH.Out("jr $$25");
+  IAH.Out(LocalName + " = " + Name);
+  new UnreachableInst(FStub->getContext(), BB);
+}
+
+//
+// remove the use-soft-float attribute
+//
+static void removeUseSoftFloat(Function &F) {
+  AttributeSet A;
+  DEBUG(errs() << "removing -use-soft-float\n");
+  A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+                     "use-soft-float", "false");
+  F.removeAttributes(AttributeSet::FunctionIndex, A);
+  if (F.hasFnAttribute("use-soft-float")) {
+    DEBUG(errs() << "still has -use-soft-float\n");
+  }
+  F.addAttributes(AttributeSet::FunctionIndex, A);
+}
+
+namespace llvm {
+
+//
+// This pass only makes sense when the underlying chip has floating point but
+// we are compiling as mips16.
+// For all mips16 functions (that are not stubs we have already generated), or
+// declared via attributes as nomips16, we must:
+//    1) fixup all returns of float, double, single and double complex
+//       by calling a helper function before the actual return.
+//    2) generate helper functions (stubs) that can be called by mips32
+//       functions that will move parameters passed normally passed in
+//       floating point
+//       registers the soft float equivalents.
+//    3) in the case of static relocation, generate helper functions so that
+//       mips16 functions can call extern functions of unknown type (mips16 or
+//       mips32).
+//    4) TBD. For pic, calls to extern functions of unknown type are handled by
+//       predefined helper functions in libc but this work is currently done
+//       during call lowering but it should be moved here in the future.
+//
+bool Mips16HardFloat::runOnModule(Module &M) {
+  DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+  bool Modified = false;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->hasFnAttribute("nomips16") &&
+        F->hasFnAttribute("use-soft-float")) {
+      removeUseSoftFloat(*F);
+      continue;
+    }
+    if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
+        F->hasFnAttribute("nomips16")) continue;
+    Modified |= fixupFPReturnAndCall(*F, &M, Subtarget);
+    FPParamVariant V = whichFPParamVariantNeeded(*F);
+    if (V != NoSig) {
+      Modified = true;
+      createFPFnStub(F, &M, V, Subtarget);
+    }
+  }
+  return Modified;
+}
+
+char Mips16HardFloat::ID = 0;
+
+}
+
+ModulePass *llvm::createMips16HardFloat(MipsTargetMachine &TM) {
+  return new Mips16HardFloat(TM);
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.h b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.h
new file mode 100644
index 0000000..826887e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.h
@@ -0,0 +1,54 @@
+//===---- Mips16HardFloat.h for Mips16 Hard Float                  --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a phase which implements part of the floating point
+// interoperability between Mips16 and Mips32 code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+
+#ifndef MIPS16HARDFLOAT_H
+#define MIPS16HARDFLOAT_H
+
+using namespace llvm;
+
+namespace llvm {
+
+class Mips16HardFloat : public ModulePass {
+
+public:
+  static char ID;
+
+  Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID),
+    TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {
+  }
+
+  const char *getPassName() const override {
+    return "MIPS16 Hard Float Pass";
+  }
+
+  bool runOnModule(Module &M) override;
+
+protected:
+  /// Keep a pointer to the MipsSubtarget around so that we can make the right
+  /// decision when generating code for different targets.
+  const TargetMachine &TM;
+  const MipsSubtarget &Subtarget;
+
+};
+
+ModulePass *createMips16HardFloat(MipsTargetMachine &TM);
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp
new file mode 100644
index 0000000..2eb6e5d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.cpp for Mips16 Hard Float              -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of Mips16HardFloatInfo
+// namespace.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16HardFloatInfo.h"
+#include <string.h>
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+const FuncNameSignature PredefinedFuncs[] = {
+  { "__floatdidf", { NoSig, DRet } },
+  { "__floatdisf", { NoSig, FRet } },
+  { "__floatundidf", { NoSig, DRet } },
+  { "__fixsfdi", { FSig, NoFPRet } },
+  { "__fixunsdfsi", { DSig, NoFPRet } },
+  { "__fixunsdfdi", { DSig, NoFPRet } },
+  { "__fixdfdi", { DSig, NoFPRet } },
+  { "__fixunssfsi", { FSig, NoFPRet } },
+  { "__fixunssfdi", { FSig, NoFPRet } },
+  { "__floatundisf", { NoSig, FRet } },
+  { nullptr, { NoSig, NoFPRet } }
+};
+
+// just do a search for now. there are very few of these special cases.
+//
+extern FuncSignature const *findFuncSignature(const char *name) {
+  const char *name_;
+  int i = 0;
+  while (PredefinedFuncs[i].Name) {
+    name_ = PredefinedFuncs[i].Name;
+    if (strcmp(name, name_) == 0)
+      return &PredefinedFuncs[i].Signature;
+    i++;
+  }
+  return nullptr;
+}
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h
new file mode 100644
index 0000000..02444d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.h for Mips16 Hard Float              --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some data structures relevant to the implementation of
+// Mips16 hard float.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16HARDFLOATINFO_H
+#define MIPS16HARDFLOATINFO_H
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant { FRet, DRet, CFRet, CDRet, NoFPRet };
+
+//
+// Parameter type that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant { FSig, FFSig, FDSig, DSig, DDSig, DFSig, NoSig };
+
+struct FuncSignature {
+  FPParamVariant ParamSig;
+  FPReturnVariant RetSig;
+};
+
+struct FuncNameSignature {
+  const char *Name;
+  FuncSignature Signature;
+};
+
+extern const FuncNameSignature PredefinedFuncs[];
+
+extern FuncSignature const *findFuncSignature(const char *name);
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
new file mode 100644
index 0000000..7b05842
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -0,0 +1,320 @@
+//===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16ISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  if (!Subtarget->inMips16Mode())
+    return false;
+  return MipsDAGToDAGISel::runOnMachineFunction(MF);
+}
+/// Select multiply instructions.
+std::pair<SDNode*, SDNode*>
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, SDLoc DL, EVT Ty,
+                               bool HasLo, bool HasHi) {
+  SDNode *Lo = nullptr, *Hi = nullptr;
+  SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+                                       N->getOperand(1));
+  SDValue InFlag = SDValue(Mul, 0);
+
+  if (HasLo) {
+    unsigned Opcode = Mips::Mflo16;
+    Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+    InFlag = SDValue(Lo, 1);
+  }
+  if (HasHi) {
+    unsigned Opcode = Mips::Mfhi16;
+    Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+  }
+  return std::make_pair(Lo, Hi);
+}
+
+void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC =
+    (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+  V2 = RegInfo.createVirtualRegister(RC);
+
+  BuildMI(MBB, I, DL, TII.get(Mips::GotPrologue16), V0).
+    addReg(V1, RegState::Define).
+    addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI).
+    addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+
+  BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
+  BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
+    .addReg(V1).addReg(V2);
+}
+
+// Insert instructions to initialize the Mips16 SP Alias register in the
+// first MBB of the function.
+//
+void Mips16DAGToDAGISel::initMips16SPAliasReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->mips16SPAliasRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
+
+  BuildMI(MBB, I, DL, TII.get(Mips::MoveR3216), Mips16SPAliasReg)
+    .addReg(Mips::SP);
+}
+
+void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+  initMips16SPAliasReg(MF);
+}
+
+/// getMips16SPAliasReg - Output the instructions required to put the
+/// SP into a Mips16 accessible aliased register.
+SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
+  unsigned Mips16SPAliasReg =
+    MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
+  return CurDAG->getRegister(Mips16SPAliasReg,
+                             getTargetLowering()->getPointerTy());
+}
+
+void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
+  SDValue AliasFPReg = CurDAG->getRegister(Mips::S0,
+                                           getTargetLowering()->getPointerTy());
+  if (Parent) {
+    switch (Parent->getOpcode()) {
+      case ISD::LOAD: {
+        LoadSDNode *SD = dyn_cast<LoadSDNode>(Parent);
+        switch (SD->getMemoryVT().getSizeInBits()) {
+        case 8:
+        case 16:
+          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+            AliasFPReg: getMips16SPAliasReg();
+          return;
+        }
+        break;
+      }
+      case ISD::STORE: {
+        StoreSDNode *SD = dyn_cast<StoreSDNode>(Parent);
+        switch (SD->getMemoryVT().getSizeInBits()) {
+        case 8:
+        case 16:
+          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+            AliasFPReg: getMips16SPAliasReg();
+          return;
+        }
+        break;
+      }
+    }
+  }
+  AliasReg = CurDAG->getRegister(Mips::SP, getTargetLowering()->getPointerTy());
+  return;
+
+}
+
+bool Mips16DAGToDAGISel::selectAddr16(
+  SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset,
+  SDValue &Alias) {
+  EVT ValTy = Addr.getValueType();
+
+  Alias = CurDAG->getTargetConstant(0, ValTy);
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
+    getMips16SPRefReg(Parent, Alias);
+    return true;
+  }
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0))) {
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+        getMips16SPRefReg(Parent, Alias);
+      }
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+
+    // If an indexed floating point load/store can be emitted, return false.
+    const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
+
+    if (LS) {
+      if (LS->getMemoryVT() == MVT::f32 && Subtarget->hasMips4_32r2())
+        return false;
+      if (LS->getMemoryVT() == MVT::f64 && Subtarget->hasMips4_32r2())
+        return false;
+    }
+  }
+  Base   = Addr;
+  Offset = CurDAG->getTargetConstant(0, ValTy);
+  return true;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  SDLoc DL(Node);
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  EVT NodeTy = Node->getValueType(0);
+  unsigned MultOpc;
+
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE:
+  case ISD::ADDE: {
+    SDValue InFlag = Node->getOperand(2), CmpLHS;
+    unsigned Opc = InFlag.getOpcode(); (void)Opc;
+    assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+            (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+           "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+    unsigned MOp;
+    if (Opcode == ISD::ADDE) {
+      CmpLHS = InFlag.getValue(0);
+      MOp = Mips::AdduRxRyRz16;
+    } else {
+      CmpLHS = InFlag.getOperand(0);
+      MOp = Mips::SubuRxRyRz16;
+    }
+
+    SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+
+    EVT VT = LHS.getValueType();
+
+    unsigned Sltu_op = Mips::SltuRxRyRz16;
+    SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops);
+    unsigned Addu_op = Mips::AdduRxRyRz16;
+    SDNode *AddCarry = CurDAG->getMachineNode(Addu_op, DL, VT,
+                                              SDValue(Carry,0), RHS);
+
+    SDNode *Result = CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
+                                          SDValue(AddCarry,0));
+    return std::make_pair(true, Result);
+  }
+
+  /// Mul with two results
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    std::pair<SDNode*, SDNode*> LoHi = selectMULT(Node, MultOpc, DL, NodeTy,
+                                                  true, true);
+    if (!SDValue(Node, 0).use_empty())
+      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+    if (!SDValue(Node, 1).use_empty())
+      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+    return std::make_pair(true, nullptr);
+  }
+
+  case ISD::MULHS:
+  case ISD::MULHU: {
+    MultOpc = (Opcode == ISD::MULHU ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    SDNode *Result = selectMULT(Node, MultOpc, DL, NodeTy, false, true).second;
+    return std::make_pair(true, Result);
+  }
+  }
+
+  return std::make_pair(false, nullptr);
+}
+
+FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM) {
+  return new Mips16DAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
new file mode 100644
index 0000000..e653b39
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -0,0 +1,53 @@
+//===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16ISELDAGTODAG_H
+#define MIPS16ISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class Mips16DAGToDAGISel : public MipsDAGToDAGISel {
+public:
+  explicit Mips16DAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+
+private:
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc DL,
+                                         EVT Ty, bool HasLo, bool HasHi);
+
+  SDValue getMips16SPAliasReg();
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg);
+
+  bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                    SDValue &Offset, SDValue &Alias) override;
+
+  std::pair<bool, SDNode*> selectNode(SDNode *Node) override;
+
+  void processFunctionAfterISel(MachineFunction &MF) override;
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+
+  void initMips16SPAliasReg(MachineFunction &MF);
+};
+
+FunctionPass *createMips16ISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
new file mode 100644
index 0000000..587925d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -0,0 +1,792 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+#include "Mips16ISelLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-lower"
+
+static cl::opt<bool> DontExpandCondPseudos16(
+  "mips16-dont-expand-cond-pseudo",
+  cl::init(false),
+  cl::desc("Dont expand conditional move related "
+           "pseudos for Mips 16"),
+  cl::Hidden);
+
+namespace {
+struct Mips16Libcall {
+  RTLIB::Libcall Libcall;
+  const char *Name;
+
+  bool operator<(const Mips16Libcall &RHS) const {
+    return std::strcmp(Name, RHS.Name) < 0;
+  }
+};
+
+struct Mips16IntrinsicHelperType{
+  const char* Name;
+  const char* Helper;
+
+  bool operator<(const Mips16IntrinsicHelperType &RHS) const {
+    return std::strcmp(Name, RHS.Name) < 0;
+  }
+  bool operator==(const Mips16IntrinsicHelperType &RHS) const {
+    return std::strcmp(Name, RHS.Name) == 0;
+  }
+};
+}
+
+// Libcalls for which no helper is generated. Sorted by name for binary search.
+static const Mips16Libcall HardFloatLibCalls[] = {
+  { RTLIB::ADD_F64, "__mips16_adddf3" },
+  { RTLIB::ADD_F32, "__mips16_addsf3" },
+  { RTLIB::DIV_F64, "__mips16_divdf3" },
+  { RTLIB::DIV_F32, "__mips16_divsf3" },
+  { RTLIB::OEQ_F64, "__mips16_eqdf2" },
+  { RTLIB::OEQ_F32, "__mips16_eqsf2" },
+  { RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2" },
+  { RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi" },
+  { RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi" },
+  { RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf" },
+  { RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf" },
+  { RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf" },
+  { RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf" },
+  { RTLIB::OGE_F64, "__mips16_gedf2" },
+  { RTLIB::OGE_F32, "__mips16_gesf2" },
+  { RTLIB::OGT_F64, "__mips16_gtdf2" },
+  { RTLIB::OGT_F32, "__mips16_gtsf2" },
+  { RTLIB::OLE_F64, "__mips16_ledf2" },
+  { RTLIB::OLE_F32, "__mips16_lesf2" },
+  { RTLIB::OLT_F64, "__mips16_ltdf2" },
+  { RTLIB::OLT_F32, "__mips16_ltsf2" },
+  { RTLIB::MUL_F64, "__mips16_muldf3" },
+  { RTLIB::MUL_F32, "__mips16_mulsf3" },
+  { RTLIB::UNE_F64, "__mips16_nedf2" },
+  { RTLIB::UNE_F32, "__mips16_nesf2" },
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_dc" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_df" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sc" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sf" }, // No associated libcall.
+  { RTLIB::SUB_F64, "__mips16_subdf3" },
+  { RTLIB::SUB_F32, "__mips16_subsf3" },
+  { RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2" },
+  { RTLIB::UO_F64, "__mips16_unorddf2" },
+  { RTLIB::UO_F32, "__mips16_unordsf2" }
+};
+
+static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
+  {"__fixunsdfsi", "__mips16_call_stub_2" },
+  {"ceil",  "__mips16_call_stub_df_2"},
+  {"ceilf", "__mips16_call_stub_sf_1"},
+  {"copysign",  "__mips16_call_stub_df_10"},
+  {"copysignf", "__mips16_call_stub_sf_5"},
+  {"cos",  "__mips16_call_stub_df_2"},
+  {"cosf", "__mips16_call_stub_sf_1"},
+  {"exp2",  "__mips16_call_stub_df_2"},
+  {"exp2f", "__mips16_call_stub_sf_1"},
+  {"floor",  "__mips16_call_stub_df_2"},
+  {"floorf", "__mips16_call_stub_sf_1"},
+  {"log2",  "__mips16_call_stub_df_2"},
+  {"log2f", "__mips16_call_stub_sf_1"},
+  {"nearbyint",  "__mips16_call_stub_df_2"},
+  {"nearbyintf", "__mips16_call_stub_sf_1"},
+  {"rint",  "__mips16_call_stub_df_2"},
+  {"rintf", "__mips16_call_stub_sf_1"},
+  {"sin",  "__mips16_call_stub_df_2"},
+  {"sinf", "__mips16_call_stub_sf_1"},
+  {"sqrt",  "__mips16_call_stub_df_2"},
+  {"sqrtf", "__mips16_call_stub_sf_1"},
+  {"trunc",  "__mips16_call_stub_df_2"},
+  {"truncf", "__mips16_call_stub_sf_1"},
+};
+
+Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
+
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
+
+  if (!TM.Options.UseSoftFloat)
+    setMips16HardFloatLibCalls();
+
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_SWAP,        MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,     MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MIN,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MAX,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMIN,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMAX,   MVT::i32,   Expand);
+
+  setOperationAction(ISD::ROTR, MVT::i32,  Expand);
+  setOperationAction(ISD::ROTR, MVT::i64,  Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  computeRegisterProperties();
+}
+
+const MipsTargetLowering *
+llvm::createMips16TargetLowering(MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  return new Mips16TargetLowering(TM, STI);
+}
+
+bool
+Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                    unsigned,
+                                                    bool *Fast) const {
+  return false;
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::SelBeqZ:
+    return emitSel16(Mips::BeqzRxImm16, MI, BB);
+  case Mips::SelBneZ:
+    return emitSel16(Mips::BnezRxImm16, MI, BB);
+  case Mips::SelTBteqZCmpi:
+    return emitSeliT16(Mips::Bteqz16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSlti:
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSltiu:
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBtneZCmpi:
+    return emitSeliT16(Mips::Btnez16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSlti:
+    return emitSeliT16(Mips::Btnez16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSltiu:
+    return emitSeliT16(Mips::Btnez16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBteqZCmp:
+    return emitSelT16(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBteqZSlt:
+    return emitSelT16(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBteqZSltu:
+    return emitSelT16(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
+  case Mips::SelTBtneZCmp:
+    return emitSelT16(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBtneZSlt:
+    return emitSelT16(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBtneZSltu:
+    return emitSelT16(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BteqzT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
+  case Mips::BteqzT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BtnezT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BtnezT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
+  case Mips::BtnezT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Bteqz16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+  case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Bteqz16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+  case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::Bteqz16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+  case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Btnez16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+  case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Btnez16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+  case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::Btnez16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+    break;
+  case Mips::SltCCRxRy16:
+    return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
+    break;
+  case Mips::SltiCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SltiuCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SltuCCRxRy16:
+    return emitFEXT_CCRX16_ins
+      (Mips::SltuRxRy16, MI, BB);
+  }
+}
+
+bool Mips16TargetLowering::
+isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                  unsigned NextStackOffset,
+                                  const MipsFunctionInfo& FI) const {
+  // No tail call optimization for mips16.
+  return false;
+}
+
+void Mips16TargetLowering::setMips16HardFloatLibCalls() {
+  for (unsigned I = 0; I != array_lengthof(HardFloatLibCalls); ++I) {
+    assert((I == 0 || HardFloatLibCalls[I - 1] < HardFloatLibCalls[I]) &&
+           "Array not sorted!");
+    if (HardFloatLibCalls[I].Libcall != RTLIB::UNKNOWN_LIBCALL)
+      setLibcallName(HardFloatLibCalls[I].Libcall, HardFloatLibCalls[I].Name);
+  }
+
+  setLibcallName(RTLIB::O_F64, "__mips16_unorddf2");
+  setLibcallName(RTLIB::O_F32, "__mips16_unordsf2");
+}
+
+//
+// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
+// cleaner way to do all of this but it will have to wait until the traditional
+// gcc mechanism is completed.
+//
+// For Pic, in order for Mips16 code to call Mips32 code which according the abi
+// have either arguments or returned values placed in floating point registers,
+// we use a set of helper functions. (This includes functions which return type
+//  complex which on Mips are returned in a pair of floating point registers).
+//
+// This is an encoding that we inherited from gcc.
+// In Mips traditional O32, N32 ABI, floating point numbers are passed in
+// floating point argument registers 1,2 only when the first and optionally
+// the second arguments are float (sf) or double (df).
+// For Mips16 we are only concerned with the situations where floating point
+// arguments are being passed in floating point registers by the ABI, because
+// Mips16 mode code cannot execute floating point instructions to load those
+// values and hence helper functions are needed.
+// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df)
+// the helper function suffixs for these are:
+//                        0,  1,    5,        9,         2,   6,        10
+// this suffix can then be calculated as follows:
+// for a given argument Arg:
+//     Arg1x, Arg2x = 1 :  Arg is sf
+//                    2 :  Arg is df
+//                    0:   Arg is neither sf or df
+// So this stub is the string for number Arg1x + Arg2x*4.
+// However not all numbers between 0 and 10 are possible, we check anyway and
+// assert if the impossible exists.
+//
+
+unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber
+  (ArgListTy &Args) const {
+  unsigned int resultNum = 0;
+  if (Args.size() >= 1) {
+    Type *t = Args[0].Ty;
+    if (t->isFloatTy()) {
+      resultNum = 1;
+    }
+    else if (t->isDoubleTy()) {
+      resultNum = 2;
+    }
+  }
+  if (resultNum) {
+    if (Args.size() >=2) {
+      Type *t = Args[1].Ty;
+      if (t->isFloatTy()) {
+        resultNum += 4;
+      }
+      else if (t->isDoubleTy()) {
+        resultNum += 8;
+      }
+    }
+  }
+  return resultNum;
+}
+
+//
+// prefixs are attached to stub numbers depending on the return type .
+// return type: float  sf_
+//              double df_
+//              single complex sc_
+//              double complext dc_
+//              others  NO PREFIX
+//
+//
+// The full name of a helper function is__mips16_call_stub +
+//    return type dependent prefix + stub number
+//
+//
+// This is something that probably should be in a different source file and
+// perhaps done differently but my main purpose is to not waste runtime
+// on something that we can enumerate in the source. Another possibility is
+// to have a python script to generate these mapping tables. This will do
+// for now. There are a whole series of helper function mapping arrays, one
+// for each return type class as outlined above. There there are 11 possible
+//  entries. Ones with 0 are ones which should never be selected
+//
+// All the arrays are similar except for ones which return neither
+// sf, df, sc, dc, in which only care about ones which have sf or df as a
+// first parameter.
+//
+#define P_ "__mips16_call_stub_"
+#define MAX_STUB_NUMBER 10
+#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10"
+#define T P "0" , T1
+#define P P_
+static char const * vMips16Helper[MAX_STUB_NUMBER+1] =
+  {nullptr, T1 };
+#undef P
+#define P P_ "sf_"
+static char const * sfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "df_"
+static char const * dfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "sc_"
+static char const * scMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "dc_"
+static char const * dcMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#undef P_
+
+
+const char* Mips16TargetLowering::
+  getMips16HelperFunction
+    (Type* RetTy, ArgListTy &Args, bool &needHelper) const {
+  const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args);
+#ifndef NDEBUG
+  const unsigned int maxStubNum = 10;
+  assert(stubNum <= maxStubNum);
+  const bool validStubNum[maxStubNum+1] =
+    {true, true, true, false, false, true, true, false, false, true, true};
+  assert(validStubNum[stubNum]);
+#endif
+  const char *result;
+  if (RetTy->isFloatTy()) {
+    result = sfMips16Helper[stubNum];
+  }
+  else if (RetTy ->isDoubleTy()) {
+    result = dfMips16Helper[stubNum];
+  }
+  else if (RetTy->isStructTy()) {
+    // check if it's complex
+    if (RetTy->getNumContainedTypes() == 2) {
+      if ((RetTy->getContainedType(0)->isFloatTy()) &&
+          (RetTy->getContainedType(1)->isFloatTy())) {
+        result = scMips16Helper[stubNum];
+      }
+      else if ((RetTy->getContainedType(0)->isDoubleTy()) &&
+               (RetTy->getContainedType(1)->isDoubleTy())) {
+        result = dcMips16Helper[stubNum];
+      }
+      else {
+        llvm_unreachable("Uncovered condition");
+      }
+    }
+    else {
+      llvm_unreachable("Uncovered condition");
+    }
+  }
+  else {
+    if (stubNum == 0) {
+      needHelper = false;
+      return "";
+    }
+    result = vMips16Helper[stubNum];
+  }
+  needHelper = true;
+  return result;
+}
+
+void Mips16TargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  SelectionDAG &DAG = CLI.DAG;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+  const char* Mips16HelperFunction = nullptr;
+  bool NeedMips16Helper = false;
+
+  if (Subtarget.inMips16HardFloat()) {
+    //
+    // currently we don't have symbols tagged with the mips16 or mips32
+    // qualifier so we will assume that we don't know what kind it is.
+    // and generate the helper
+    //
+    bool LookupHelper = true;
+    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+      Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, S->getSymbol() };
+
+      if (std::binary_search(std::begin(HardFloatLibCalls),
+                             std::end(HardFloatLibCalls), Find))
+        LookupHelper = false;
+      else {
+        const char *Symbol = S->getSymbol();
+        Mips16IntrinsicHelperType IntrinsicFind = { Symbol, "" };
+        const Mips16HardFloatInfo::FuncSignature *Signature =
+            Mips16HardFloatInfo::findFuncSignature(Symbol);
+        if (!IsPICCall && (Signature && (FuncInfo->StubsNeeded.find(Symbol) ==
+                                         FuncInfo->StubsNeeded.end()))) {
+          FuncInfo->StubsNeeded[Symbol] = Signature;
+          //
+          // S2 is normally saved if the stub is for a function which
+          // returns a float or double value and is not otherwise. This is
+          // because more work is required after the function the stub
+          // is calling completes, and so the stub cannot directly return
+          // and the stub has no stack space to store the return address so
+          // S2 is used for that purpose.
+          // In order to take advantage of not saving S2, we need to also
+          // optimize the call in the stub and this requires some further
+          // functionality in MipsAsmPrinter which we don't have yet.
+          // So for now we always save S2. The optimization will be done
+          // in a follow-on patch.
+          //
+          if (1 || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet))
+            FuncInfo->setSaveS2();
+        }
+        // one more look at list of intrinsics
+        const Mips16IntrinsicHelperType *Helper =
+            std::lower_bound(std::begin(Mips16IntrinsicHelper),
+                             std::end(Mips16IntrinsicHelper), IntrinsicFind);
+        if (Helper != std::end(Mips16IntrinsicHelper) &&
+            *Helper == IntrinsicFind) {
+          Mips16HelperFunction = Helper->Helper;
+          NeedMips16Helper = true;
+          LookupHelper = false;
+        }
+
+      }
+    } else if (GlobalAddressSDNode *G =
+                   dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+      Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL,
+                             G->getGlobal()->getName().data() };
+
+      if (std::binary_search(std::begin(HardFloatLibCalls),
+                             std::end(HardFloatLibCalls), Find))
+        LookupHelper = false;
+    }
+    if (LookupHelper)
+      Mips16HelperFunction =
+        getMips16HelperFunction(CLI.RetTy, CLI.getArgs(), NeedMips16Helper);
+  }
+
+  SDValue JumpTarget = Callee;
+
+  // T9 should contain the address of the callee function if
+  // -reloction-model=pic or it is an indirect call.
+  if (IsPICCall || !GlobalOrExternal) {
+    unsigned V0Reg = Mips::V0;
+    if (NeedMips16Helper) {
+      RegsToPass.push_front(std::make_pair(V0Reg, Callee));
+      JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
+      ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
+      JumpTarget = getAddrGlobal(S, JumpTarget.getValueType(), DAG,
+                                 MipsII::MO_GOT, Chain,
+                                 FuncInfo->callPtrInfo(S->getSymbol()));
+    } else
+      RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
+  }
+
+  Ops.push_back(JumpTarget);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, CLI, Callee, Chain);
+}
+
+MachineBasicBlock *Mips16TargetLowering::
+emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc)).addReg(MI->getOperand(3).getReg())
+    .addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitSelT16
+  (unsigned Opc1, unsigned Opc2,
+   MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
+    .addReg(MI->getOperand(4).getReg());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitSeliT16
+  (unsigned Opc1, unsigned Opc2,
+   MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
+    .addImm(MI->getOperand(4).getImm());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock
+  *Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                             MachineInstr *MI,
+                                             MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned regX = MI->getOperand(0).getReg();
+  unsigned regY = MI->getOperand(1).getReg();
+  MachineBasicBlock *target = MI->getOperand(2).getMBB();
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX)
+    .addReg(regY);
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
+  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
+  MachineInstr *MI,  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned regX = MI->getOperand(0).getReg();
+  int64_t imm = MI->getOperand(1).getImm();
+  MachineBasicBlock *target = MI->getOperand(2).getMBB();
+  unsigned CmpOpc;
+  if (isUInt<8>(imm))
+    CmpOpc = CmpiOpc;
+  else if ((!ImmSigned && isUInt<16>(imm)) ||
+           (ImmSigned && isInt<16>(imm)))
+    CmpOpc = CmpiXOpc;
+  else
+    llvm_unreachable("immediate field not usable");
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX)
+    .addImm(imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+static unsigned Mips16WhichOp8uOr16simm
+  (unsigned shortOp, unsigned longOp, int64_t Imm) {
+  if (isUInt<8>(Imm))
+    return shortOp;
+  else if (isInt<16>(Imm))
+    return longOp;
+  else
+    llvm_unreachable("immediate field not usable");
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRX16_ins(
+  unsigned SltOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned CC = MI->getOperand(0).getReg();
+  unsigned regX = MI->getOperand(1).getReg();
+  unsigned regY = MI->getOperand(2).getReg();
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(SltOpc)).addReg(regX).addReg(
+      regY);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRXI16_ins(
+  unsigned SltiOpc, unsigned SltiXOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB )const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned CC = MI->getOperand(0).getReg();
+  unsigned regX = MI->getOperand(1).getReg();
+  int64_t Imm = MI->getOperand(2).getImm();
+  unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(SltOpc)).addReg(regX).addImm(Imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
new file mode 100644
index 0000000..e7e4d7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
@@ -0,0 +1,81 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16ISELLOWERING_H
+#define MIPS16ISELLOWERING_H
+
+#include "MipsISelLowering.h"
+
+namespace llvm {
+  class Mips16TargetLowering : public MipsTargetLowering  {
+  public:
+    explicit Mips16TargetLowering(MipsTargetMachine &TM,
+                                  const MipsSubtarget &STI);
+
+    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                       bool *Fast) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *MBB) const override;
+
+  private:
+    bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                     unsigned NextStackOffset,
+                                     const MipsFunctionInfo& FI) const override;
+
+    void setMips16HardFloatLibCalls();
+
+    unsigned int
+      getMips16HelperFunctionStubNumber(ArgListTy &Args) const;
+
+    const char *getMips16HelperFunction
+      (Type* RetTy, ArgListTy &Args, bool &needHelper) const;
+
+    void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const override;
+
+    MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI,
+                                 MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSeliT16(unsigned Opc1, unsigned Opc2,
+                                   MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSelT16(unsigned Opc1, unsigned Opc2,
+                                  MachineInstr *MI,
+                                  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                           MachineInstr *MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I8I16_ins(
+      unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
+      MachineInstr *MI,  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRX16_ins(
+      unsigned SltOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRXI16_ins(
+      unsigned SltiOpc, unsigned SltiXOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB )const;
+  };
+}
+
+#endif // Mips16ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
new file mode 100644
index 0000000..da3a1f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
@@ -0,0 +1,640 @@
+//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  funct or f      Function field
+//
+//  immediate       4-,5-,8- or 11-bit immediate, branch displacement, or
+//  or imm          address displacement
+//
+//  op              5-bit major operation code
+//
+//  rx              3-bit source or destination register
+//
+//  ry              3-bit source or destination register
+//
+//  rz              3-bit source or destination register
+//
+//  sa              3- or 5-bit shift amount
+//
+//===----------------------------------------------------------------------===//
+
+
+// Base class for Mips 16 Format
+// This class does not depend on the instruction size
+//
+class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern,
+                      InstrItinClass itin>: Instruction
+{
+
+  let Namespace = "Mips";
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  let Predicates = [InMips16Mode];
+}
+
+//
+// Generic Mips 16 Format
+//
+class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+                 InstrItinClass itin>:
+  MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+  field bits<16> Inst;
+  bits<5> Opcode = 0;
+
+  // Top 5 bits are the 'opcode' field
+  let Inst{15-11} = Opcode;
+
+  let Size=2;
+  field bits<16> SoftFail = 0;
+}
+
+//
+// For 32 bit extended instruction forms.
+//
+class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
+                    InstrItinClass itin>:
+  MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+  field bits<32> Inst;
+
+  let Size=4;
+  field bits<32> SoftFail = 0;
+}
+
+class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern,
+                        InstrItinClass itin>:
+  MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+  let Inst{31-27} = 0b11110;
+}
+
+
+
+// Mips Pseudo Instructions Format
+class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>:
+  MipsInst16<outs, ins, asmstr, pattern, IIPseudo> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|imm11|>
+//===----------------------------------------------------------------------===//
+
+class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+           InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<11> imm11;
+
+  let Opcode = op;
+
+  let Inst{10-0}  = imm11;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RI instruction class in Mips : <|opcode|rx|imm8|>
+//===----------------------------------------------------------------------===//
+
+class FRI16<bits<5> op, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<8>   imm8;
+
+  let Opcode = op;
+
+  let Inst{10-8} = rx;
+  let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RR instruction class in Mips : <|opcode|rx|ry|funct|>
+//===----------------------------------------------------------------------===//
+
+class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<5>  funct;
+
+  let Opcode = 0b11101;
+  let funct  = _funct;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0}   = funct;
+}
+
+class FRRBreak16<dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<6>  Code;
+  bits<5>  funct;
+
+  let Opcode = 0b11101;
+  let funct  = 0b00101;
+
+  let Inst{10-5} = Code;
+  let Inst{4-0}   = funct;
+}
+
+//
+// For conversion functions.
+//
+class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins,
+               string asmstr, list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  subfunct;
+  bits<5>  funct;
+
+  let Opcode = 0b11101; // RR
+  let funct  = _funct;
+  let subfunct = _subfunct;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = subfunct;
+  let Inst{4-0}   = funct;
+}
+
+//
+// just used for breakpoint (hardware and software) instructions.
+//
+class FC16<bits<5> _funct, dag outs, dag ins, string asmstr,
+           list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<6>  _code;  // code is a keyword in tablegen
+  bits<5>  funct;
+
+  let Opcode = 0b11101; // RR
+  let funct  = _funct;
+
+  let Inst{10-5} = _code;
+  let Inst{4-0}   = funct;
+}
+
+//
+// J(AL)R(C) subformat
+//
+class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<1>  nd;
+  bits<1>  l;
+  bits<1>  ra;
+
+  let nd = _nd;
+  let l = _l;
+  let ra = r_a;
+
+  let Opcode = 0b11101;
+
+  let Inst{10-8} = rx;
+  let Inst{7} = nd;
+  let Inst{6} = l;
+  let Inst{5} = ra;
+  let Inst{4-0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|>
+//===----------------------------------------------------------------------===//
+
+class FRRI16<bits<5> op, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<5>  imm5;
+
+  let Opcode = op;
+
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0}   = imm5;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|>
+//===----------------------------------------------------------------------===//
+
+class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<3>  rz;
+  bits<2>  f;
+
+  let Opcode = 0b11100;
+  let f  = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = rz;
+  let Inst{1-0}   = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|>
+//===----------------------------------------------------------------------===//
+
+class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<1>  f;
+  bits<4>  imm4;
+
+  let Opcode = 0b01000;
+  let  f = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4} = f;
+  let Inst{3-0}   = imm4;
+}
+
+//===----------------------------------------------------------------------===//
+// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|>
+//===----------------------------------------------------------------------===//
+
+class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<3>  sa;
+  bits<2>  f;
+
+  let Opcode = 0b00110;
+  let f  = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = sa;
+  let Inst{1-0}   = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8 instruction class in Mips : <|opcode|funct|imm8>
+//===----------------------------------------------------------------------===//
+
+class FI816<bits<3> _func, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  func;
+  bits<8>   imm8;
+
+  let Opcode = 0b01100;
+  let func  = _func;
+
+  let Inst{10-8} = func;
+  let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOVR3216<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+
+  bits<4> ry;
+  bits<4> r32;
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b111;
+  let Inst{7-4} = ry;
+  let Inst{3-0} = r32;
+
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOV32R16<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+
+  bits<3>  func;
+  bits<5> r32;
+  bits<3> rz;
+
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b101;
+  let Inst{7-5} = r32{2-0};
+  let Inst{4-3} = r32{4-3};
+  let Inst{2-0} = rz;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_SVRS instruction class in Mips :
+//    <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<1> s;
+  bits<1> ra = 0;
+  bits<1> s0 = 0;
+  bits<1> s1 = 0;
+  bits<4> framesize = 0;
+
+  let s =_s;
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b100;
+  let Inst{7} = s;
+  let Inst{6} = ra;
+  let Inst{5} = s0;
+  let Inst{4} = s1;
+  let Inst{3-0} = framesize;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format JAL instruction class in Mips16 :
+//    <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+  bits<1> X;
+  bits<26> imm26;
+
+
+  let X = _X;
+
+  let Inst{31-27} = 0b00011;
+  let Inst{26} = X;
+  let Inst{25-21} = imm26{20-16};
+  let Inst{20-16} = imm26{25-21};
+  let Inst{15-0}  = imm26{15-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I instruction class in Mips16 :
+//     <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> eop;
+
+  let eop = _eop;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = eop;
+  let Inst{10-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format ASMACRO instruction class in Mips16 :
+//    <EXTEND|select|p4|p3|RRR|p2|p1|p0>
+//===----------------------------------------------------------------------===//
+
+class FASMACRO16<dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<3> select;
+  bits<3> p4;
+  bits<5> p3;
+  bits<5> RRR = 0b11100;
+  bits<3> p2;
+  bits<3> p1;
+  bits<5> p0;
+
+
+  let Inst{26-24} = select;
+  let Inst{23-21} = p4;
+  let Inst{20-16} = p3;
+  let Inst{15-11} = RRR;
+  let Inst{10-8} = p2;
+  let Inst{7-5} = p1;
+  let Inst{4-0} = p0;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RI instruction class in Mips16 :
+//    <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> op;
+  bits<3> rx;
+
+  let op = _op;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = op;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI instruction class in Mips16 :
+//     <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<5> op;
+  bits<16> imm16;
+  bits<3> rx;
+  bits<3> ry;
+
+  let op=_op;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = op;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI-A instruction class in Mips16 :
+//    <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<15> imm15;
+  bits<3> rx;
+  bits<3> ry;
+  bits<1> f;
+
+  let f = _f;
+
+  let Inst{26-20} = imm15{10-4};
+  let Inst{19-16} = imm15{14-11};
+  let Inst{15-11} = 0b01000;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4} = f;
+  let Inst{3-0} = imm15{3-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-SHIFT instruction class in Mips16 :
+//    <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f>
+//===----------------------------------------------------------------------===//
+
+class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<6> sa6;
+  bits<3> rx;
+  bits<3> ry;
+  bits<2> f;
+
+  let f = _f;
+
+  let Inst{26-22} = sa6{4-0};
+  let Inst{21} = sa6{5};
+  let Inst{20-16} = 0;
+  let Inst{15-11} = 0b00110;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = 0;
+  let Inst{1-0} = f;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8 instruction class in Mips16 :
+//    <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> I8;
+  bits<3> funct;
+
+  let funct = _funct;
+  let I8 = 0b0110;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = I8;
+  let Inst{10-8} = funct;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8_SVRS instruction class in Mips16 :
+//    <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr,
+                     list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<3> xsregs =0;
+  bits<8> framesize =0;
+  bits<3> aregs =0;
+  bits<5> I8 = 0b01100;
+  bits<3> SVRS = 0b100;
+  bits<1> s;
+  bits<1> ra = 0;
+  bits<1> s0 = 0;
+  bits<1> s1 = 0;
+
+  let s= s_;
+
+  let Inst{26-24} = xsregs;
+  let Inst{23-20} = framesize{7-4};
+  let Inst{19} = 0;
+  let Inst{18-16} = aregs;
+  let Inst{15-11} = I8;
+  let Inst{10-8} = SVRS;
+  let Inst{7} = s;
+  let Inst{6} = ra;
+  let Inst{5} = s0;
+  let Inst{4} = s1;
+  let Inst{3-0} = framesize{3-0};
+
+
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
new file mode 100644
index 0000000..4dd9af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -0,0 +1,519 @@
+
+//===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+#include "Mips16InstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-instrinfo"
+
+Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI)
+    : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {}
+
+const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  return 0;
+}
+
+void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  unsigned Opc = 0;
+
+  if (Mips::CPU16RegsRegClass.contains(DestReg) &&
+      Mips::GPR32RegClass.contains(SrcReg))
+    Opc = Mips::MoveR3216;
+  else if (Mips::GPR32RegClass.contains(DestReg) &&
+           Mips::CPU16RegsRegClass.contains(SrcReg))
+    Opc = Mips::Move32R16;
+  else if ((SrcReg == Mips::HI0) &&
+           (Mips::CPU16RegsRegClass.contains(DestReg)))
+    Opc = Mips::Mfhi16, SrcReg = 0;
+
+  else if ((SrcReg == Mips::LO0) &&
+           (Mips::CPU16RegsRegClass.contains(DestReg)))
+    Opc = Mips::Mflo16, SrcReg = 0;
+
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned SrcReg, bool isKill, int FI,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI,
+                                      int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+  unsigned Opc = 0;
+  if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+    Opc = Mips::SwRxSpImmX16;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)).
+      addFrameIndex(FI).addImm(Offset)
+      .addMemOperand(MMO);
+}
+
+void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI,
+                                       int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  unsigned Opc = 0;
+
+  if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+    Opc = Mips::LwRxSpImmX16;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
+    .addMemOperand(MMO);
+}
+
+bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock &MBB = *MI->getParent();
+  switch(MI->getDesc().getOpcode()) {
+  default:
+    return false;
+  case Mips::RetRA16:
+    ExpandRetRA16(MBB, MI, Mips::JrcRa16);
+    break;
+  }
+
+  MBB.erase(MI);
+  return true;
+}
+
+/// GetOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:  llvm_unreachable("Illegal opcode!");
+  case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
+  case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+  case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
+  case Mips::BnezRxImm16: return Mips::BeqzRxImm16;
+  case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
+  case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
+  case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+  case Mips::Btnez16: return Mips::Bteqz16;
+  case Mips::BtnezX16: return Mips::BteqzX16;
+  case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
+  case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
+  case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+  case Mips::Bteqz16: return Mips::Btnez16;
+  case Mips::BteqzX16: return Mips::BtnezX16;
+  case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
+  case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
+  case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16;
+  case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16;
+  case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16;
+  case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16;
+  }
+  assert(false && "Implement this function.");
+  return 0;
+}
+
+static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
+                               const std::vector<CalleeSavedInfo> &CSI,
+                               unsigned Flags = 0) {
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::LowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[e-i-1].getReg();
+    switch (Reg) {
+    case Mips::RA:
+    case Mips::S0:
+    case Mips::S1:
+      MIB.addReg(Reg, Flags);
+      break;
+    case Mips::S2:
+      break;
+    default:
+      llvm_unreachable("unexpected mips16 callee saved register");
+
+    }
+  }
+}
+// Adjust SP by FrameSize bytes. Save RA, S0, S1
+void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo *MFI    = MF.getFrameInfo();
+  const BitVector Reserved = RI.getReservedRegs(MF);
+  bool SaveS2 = Reserved[Mips::S2];
+  MachineInstrBuilder MIB;
+  unsigned Opc = ((FrameSize <= 128) && !SaveS2)? Mips::Save16:Mips::SaveX16;
+  MIB = BuildMI(MBB, I, DL, get(Opc));
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  addSaveRestoreRegs(MIB, CSI);
+  if (SaveS2)
+    MIB.addReg(Mips::S2);
+  if (isUInt<11>(FrameSize))
+    MIB.addImm(FrameSize);
+  else {
+    int Base = 2040; // should create template function like isUInt that
+                     // returns largest possible n bit unsigned integer
+    int64_t Remainder = FrameSize - Base;
+    MIB.addImm(Base);
+    if (isInt<16>(-Remainder))
+      BuildAddiuSpImm(MBB, I, -Remainder);
+    else
+      adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1);
+  }
+}
+
+// Adjust SP by FrameSize bytes. Restore RA, S0, S1
+void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize,
+                                   MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  MachineFrameInfo *MFI    = MF->getFrameInfo();
+  const BitVector Reserved = RI.getReservedRegs(*MF);
+  bool SaveS2 = Reserved[Mips::S2];
+  MachineInstrBuilder MIB;
+  unsigned Opc = ((FrameSize <= 128) && !SaveS2)?
+    Mips::Restore16:Mips::RestoreX16;
+
+  if (!isUInt<11>(FrameSize)) {
+    unsigned Base = 2040;
+    int64_t Remainder = FrameSize - Base;
+    FrameSize = Base; // should create template function like isUInt that
+                     // returns largest possible n bit unsigned integer
+
+    if (isInt<16>(Remainder))
+      BuildAddiuSpImm(MBB, I, Remainder);
+    else
+      adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1);
+  }
+  MIB = BuildMI(MBB, I, DL, get(Opc));
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  addSaveRestoreRegs(MIB, CSI, RegState::Define);
+  if (SaveS2)
+    MIB.addReg(Mips::S2, RegState::Define);
+  MIB.addImm(FrameSize);
+}
+
+// Adjust SP by Amount bytes where bytes can be up to 32bit number.
+// This can only be called at times that we know that there is at least one free
+// register.
+// This is clearly safe at prologue and epilogue.
+//
+void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned Reg1, unsigned Reg2) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  //
+  // li reg1, constant
+  // move reg2, sp
+  // add reg1, reg1, reg2
+  // move sp, reg1
+  //
+  //
+  MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwConstant32), Reg1);
+  MIB1.addImm(Amount).addImm(-1);
+  MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::MoveR3216), Reg2);
+  MIB2.addReg(Mips::SP, RegState::Kill);
+  MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::AdduRxRyRz16), Reg1);
+  MIB3.addReg(Reg1);
+  MIB3.addReg(Reg2, RegState::Kill);
+  MachineInstrBuilder MIB4 = BuildMI(MBB, I, DL, get(Mips::Move32R16),
+                                                     Mips::SP);
+  MIB4.addReg(Reg1, RegState::Kill);
+}
+
+void Mips16InstrInfo::adjustStackPtrBigUnrestricted(
+    unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+   assert(false && "adjust stack pointer amount exceeded");
+}
+
+/// Adjust SP by Amount bytes.
+void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  if (isInt<16>(Amount))  // need to change to addiu sp, ....and isInt<16>
+    BuildAddiuSpImm(MBB, I, Amount);
+  else
+    adjustStackPtrBigUnrestricted(SP, Amount, MBB, I);
+}
+
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator II,
+                                        DebugLoc DL, unsigned &NewImm) const {
+  //
+  // given original instruction is:
+  // Instr rx, T[offset] where offset is too big.
+  //
+  // lo = offset & 0xFFFF
+  // hi = ((offset >> 16) + (lo >> 15)) & 0xFFFF;
+  //
+  // let T = temporary register
+  // li T, hi
+  // shl T, 16
+  // add T, Rx, T
+  //
+  RegScavenger rs;
+  int32_t lo = Imm & 0xFFFF;
+  NewImm = lo;
+  int Reg =0;
+  int SpReg = 0;
+
+  rs.enterBasicBlock(&MBB);
+  rs.forward(II);
+  //
+  // We need to know which registers can be used, in the case where there
+  // are not enough free registers. We exclude all registers that
+  // are used in the instruction that we are helping.
+  //  // Consider all allocatable registers in the register class initially
+  BitVector Candidates =
+      RI.getAllocatableSet
+      (*II->getParent()->getParent(), &Mips::CPU16RegsRegClass);
+  // Exclude all the registers being used by the instruction.
+  for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = II->getOperand(i);
+    if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      Candidates.reset(MO.getReg());
+  }
+
+  // If the same register was used and defined in an instruction, then
+  // it will not be in the list of candidates.
+  //
+  // we need to analyze the instruction that we are helping.
+  // we need to know if it defines register x but register x is not
+  // present as an operand of the instruction. this tells
+  // whether the register is live before the instruction. if it's not
+  // then we don't need to save it in case there are no free registers.
+  int DefReg = 0;
+  for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = II->getOperand(i);
+    if (MO.isReg() && MO.isDef()) {
+      DefReg = MO.getReg();
+      break;
+    }
+  }
+
+  BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass);
+  Available &= Candidates;
+  //
+  // we use T0 for the first register, if we need to save something away.
+  // we use T1 for the second register, if we need to save something away.
+  //
+  unsigned FirstRegSaved =0, SecondRegSaved=0;
+  unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0;
+
+  Reg = Available.find_first();
+
+  if (Reg == -1) {
+    Reg = Candidates.find_first();
+    Candidates.reset(Reg);
+    if (DefReg != Reg) {
+      FirstRegSaved = Reg;
+      FirstRegSavedTo = Mips::T0;
+      copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true);
+    }
+  }
+  else
+    Available.reset(Reg);
+  BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm).addImm(-1);
+  NewImm = 0;
+  if (FrameReg == Mips::SP) {
+    SpReg = Available.find_first();
+    if (SpReg == -1) {
+      SpReg = Candidates.find_first();
+      // Candidates.reset(SpReg); // not really needed
+      if (DefReg!= SpReg) {
+        SecondRegSaved = SpReg;
+        SecondRegSavedTo = Mips::T1;
+      }
+      if (SecondRegSaved)
+        copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true);
+    }
+   else
+     Available.reset(SpReg);
+    copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false);
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(SpReg, RegState::Kill)
+      .addReg(Reg);
+  }
+  else
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(FrameReg)
+      .addReg(Reg, RegState::Kill);
+  if (FirstRegSaved || SecondRegSaved) {
+    II = std::next(II);
+    if (FirstRegSaved)
+      copyPhysReg(MBB, II, DL, FirstRegSaved, FirstRegSavedTo, true);
+    if (SecondRegSaved)
+      copyPhysReg(MBB, II, DL, SecondRegSaved, SecondRegSavedTo, true);
+  }
+  return Reg;
+}
+
+unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
+  return (Opc == Mips::BeqzRxImmX16   || Opc == Mips::BimmX16  ||
+          Opc == Mips::Bimm16  ||
+          Opc == Mips::Bteqz16        || Opc == Mips::Btnez16 ||
+          Opc == Mips::BeqzRxImm16    || Opc == Mips::BnezRxImm16   ||
+          Opc == Mips::BnezRxImmX16   || Opc == Mips::BteqzX16 ||
+          Opc == Mips::BteqzT8CmpX16  || Opc == Mips::BteqzT8CmpiX16 ||
+          Opc == Mips::BteqzT8SltX16  || Opc == Mips::BteqzT8SltuX16  ||
+          Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 ||
+          Opc == Mips::BtnezX16       || Opc == Mips::BtnezT8CmpX16 ||
+          Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 ||
+          Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 ||
+          Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0;
+}
+
+void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned Opc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+}
+
+const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const {
+  if (validSpImm8(Imm))
+    return get(Mips::AddiuSpImm16);
+  else
+    return get(Mips::AddiuSpImmX16);
+}
+
+void Mips16InstrInfo::BuildAddiuSpImm
+  (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm);
+}
+
+const MipsInstrInfo *llvm::createMips16InstrInfo(const MipsSubtarget &STI) {
+  return new Mips16InstrInfo(STI);
+}
+
+bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
+                                     int64_t Amount) {
+  switch (Opcode) {
+  case Mips::LbRxRyOffMemX16:
+  case Mips::LbuRxRyOffMemX16:
+  case Mips::LhRxRyOffMemX16:
+  case Mips::LhuRxRyOffMemX16:
+  case Mips::SbRxRyOffMemX16:
+  case Mips::ShRxRyOffMemX16:
+  case Mips::LwRxRyOffMemX16:
+  case Mips::SwRxRyOffMemX16:
+  case Mips::SwRxSpImmX16:
+  case Mips::LwRxSpImmX16:
+    return isInt<16>(Amount);
+  case Mips::AddiuRxRyOffMemX16:
+    if ((Reg == Mips::PC) || (Reg == Mips::SP))
+      return isInt<16>(Amount);
+    return isInt<15>(Amount);
+  }
+  llvm_unreachable("unexpected Opcode in validImmediate");
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// We implement the special case of the .space directive taking only an
+/// integer argument, which is the size in bytes. This is used for creating
+/// inline code spacing for testing purposes using inline assembly.
+///
+unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      if (strncmp(Str, ".space", 6)==0) {
+        char *EStr; int Sz;
+        Sz = strtol(Str+6, &EStr, 10);
+        while (isspace(*EStr)) ++EStr;
+        if (*EStr=='\0') {
+          DEBUG(dbgs() << "parsed .space " << Sz << '\n');
+          return Sz;
+        }
+      }
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+
+  return Length;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
new file mode 100644
index 0000000..a004c56
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -0,0 +1,128 @@
+//===-- Mips16InstrInfo.h - Mips16 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16INSTRUCTIONINFO_H
+#define MIPS16INSTRUCTIONINFO_H
+
+#include "Mips16RegisterInfo.h"
+#include "MipsInstrInfo.h"
+
+namespace llvm {
+
+class Mips16InstrInfo : public MipsInstrInfo {
+  const Mips16RegisterInfo RI;
+
+public:
+  explicit Mips16InstrInfo(const MipsSubtarget &STI);
+
+  const MipsRegisterInfo &getRegisterInfo() const override;
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStack(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MBBI,
+                       unsigned SrcReg, bool isKill, int FrameIndex,
+                       const TargetRegisterClass *RC,
+                       const TargetRegisterInfo *TRI,
+                       int64_t Offset) const override;
+
+  void loadRegFromStack(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        unsigned DestReg, int FrameIndex,
+                        const TargetRegisterClass *RC,
+                        const TargetRegisterInfo *TRI,
+                        int64_t Offset) const override;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+  unsigned getOppositeBranchOpc(unsigned Opc) const override;
+
+  // Adjust SP by FrameSize bytes. Save RA, S0, S1
+  void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+                 MachineBasicBlock::iterator I) const;
+
+  // Adjust SP by FrameSize bytes. Restore RA, S0, S1
+  void restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+
+  /// Adjust SP by Amount bytes.
+  void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+  /// Emit a series of instructions to load an immediate.
+  // This is to adjust some FrameReg. We return the new register to be used
+  // in place of FrameReg and the adjusted immediate field (&NewImm)
+  //
+  unsigned loadImmediate(unsigned FrameReg,
+                         int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, DebugLoc DL,
+                         unsigned &NewImm) const;
+
+  static bool validImmediate(unsigned Opcode, unsigned Reg, int64_t Amount);
+
+  static bool validSpImm8(int offset) {
+    return ((offset & 7) == 0) && isInt<11>(offset);
+  }
+
+  //
+  // build the proper one based on the Imm field
+  //
+
+  const MCInstrDesc& AddiuSpImm(int64_t Imm) const;
+
+  void BuildAddiuSpImm
+    (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
+
+  unsigned getInlineAsmLength(const char *Str,
+                              const MCAsmInfo &MAI) const override;
+private:
+  unsigned getAnalyzableBrOpc(unsigned Opc) const override;
+
+  void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   unsigned Opc) const;
+
+  // Adjust SP by Amount bytes where bytes can be up to 32bit number.
+  void adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         unsigned Reg1, unsigned Reg2) const;
+
+  // Adjust SP by Amount bytes where bytes can be up to 32bit number.
+  void adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
new file mode 100644
index 0000000..5e4eebb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -0,0 +1,1917 @@
+//===- Mips16InstrInfo.td - Target Description for Mips16  -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips16 instructions.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Mips Address
+//
+def addr16 :
+  ComplexPattern<iPTR, 3, "selectAddr16", [frameindex], [SDNPWantParent]>;
+
+//
+// Address operand
+def mem16 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops CPU16Regs, simm16, CPU16RegsPlusSP);
+  let EncoderMethod = "getMemEncoding";
+}
+
+def mem16_ea : Operand<i32> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
+  let EncoderMethod = "getMemEncoding";
+}
+
+//
+// I-type instruction format
+//
+// this is only used by bimm. the actual assembly value is a 12 bit signed
+// number
+//
+class FI16_ins<bits<5> op, string asmstr, InstrItinClass itin>:
+  FI16<op, (outs), (ins brtarget:$imm16),
+            !strconcat(asmstr, "\t$imm16 # 16 bit inst"), [], itin>;
+
+//
+//
+// I8 instruction format
+//
+
+class FI816_ins_base<bits<3> _func, string asmstr,
+                     string asmstr2, InstrItinClass itin>:
+  FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+        [], itin>;
+
+class FI816_ins<bits<3> _func, string asmstr,
+                InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$imm  # 16 bit inst", itin>;
+ 
+class FI816_SP_ins<bits<3> _func, string asmstr,
+                   InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
+
+//
+// RI instruction format
+//
+
+
+class FRI16_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class FRI16_TCP_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin>;
+            
+class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16R_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16R_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class F2RI16_ins<bits<5> _op, string asmstr,
+                     InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+        !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+class FRI16_B_ins<bits<5> _op, string asmstr,
+                  InstrItinClass itin>:
+  FRI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
+        !strconcat(asmstr, "\t$rx, $imm  # 16 bit inst"), [], itin>;
+//
+// Compare a register and immediate and place result in CC
+// Implicit use of T8
+//
+// EXT-CCRR Instruction format
+//
+class FEXT_CCRXI16_ins<string asmstr>:
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
+               !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+// JAL and JALX instruction format
+//
+class FJAL16_ins<bits<1> _X, string asmstr,
+                 InstrItinClass itin>:
+  FJAL16<_X, (outs), (ins simm20:$imm),
+         !strconcat(asmstr, "\t$imm\n\tnop"),[],
+         itin>  {
+  let isCodeGenOnly=1;
+  let Size=6;
+}
+
+class FJALB16_ins<bits<1> _X, string asmstr,
+                 InstrItinClass itin>:
+  FJAL16<_X, (outs), (ins simm20:$imm),
+         !strconcat(asmstr, "\t$imm\t# branch\n\tnop"),[],
+         itin>  {
+  let isCodeGenOnly=1;
+  let Size=6;
+}
+
+//
+// EXT-I instruction format
+//
+class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> :
+  FEXT_I16<eop, (outs), (ins brtarget:$imm16),
+           !strconcat(asmstr, "\t$imm16"),[], itin>;
+
+//
+// EXT-I8 instruction format
+//
+
+class FEXT_I816_ins_base<bits<3> _func, string asmstr,
+                         string asmstr2, InstrItinClass itin>:
+  FEXT_I816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+            [], itin>;
+
+class FEXT_I816_ins<bits<3> _func, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>;
+
+class FEXT_I816_SP_ins<bits<3> _func, string asmstr,
+                       InstrItinClass itin>:
+      FEXT_I816_ins_base<_func, asmstr, "\t$$sp, $imm", itin>;
+
+//
+// Assembler formats in alphabetical order.
+// Natural and pseudos are mixed together.
+//
+// Compare two registers and place result in CC
+// Implicit use of T8
+//
+// CC-RR Instruction format
+//
+class FCCRR16_ins<string asmstr> :
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+//
+// EXT-RI instruction format
+//
+
+class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2,
+                         InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+                  !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16R_ins_base<bits<5> _op, string asmstr, string asmstr2,
+                         InstrItinClass itin>:
+  FEXT_RI16<_op, (outs ), (ins CPU16Regs:$rx, simm16:$imm),
+                  !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16R_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_RI16R_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>:
+  FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>;
+
+class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
+                      InstrItinClass itin>:
+  FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_RI16_TCP_ins<bits<5> _op, string asmstr,
+                        InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_2RI16_ins<bits<5> _op, string asmstr,
+                     InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+
+// this has an explicit sp argument that we ignore to work around a problem
+// in the compiler
+class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr,
+                                InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPUSPReg:$ry, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
+
+class FEXT_RI16_SP_Store_explicit_ins<bits<5> _op, string asmstr,
+                                InstrItinClass itin>:
+  FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, CPUSPReg:$ry, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
+
+//
+// EXT-RRI instruction format
+//
+
+class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
+                         InstrItinClass itin>:
+  FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins  MemOpnd:$addr),
+             !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
+                          InstrItinClass itin>:
+  FEXT_RRI16<op, (outs ), (ins  CPU16Regs:$ry, MemOpnd:$addr),
+             !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+//
+// EXT-RRI-A instruction format
+//
+
+class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
+                           InstrItinClass itin>:
+  FEXT_RRI_A16<op, (outs CPU16Regs:$ry), (ins  MemOpnd:$addr),
+               !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+// EXT-SHIFT instruction format
+//
+class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
+  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa),
+               !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+
+//
+// EXT-T8I8
+//
+class FEXT_T8I816_ins<string asmstr, string asmstr2>:
+  MipsPseudo16<(outs),
+               (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm),
+               !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t",
+               !strconcat(asmstr, "\t$imm"))),[]> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+//
+// EXT-T8I8I
+//
+class FEXT_T8I8I16_ins<string asmstr, string asmstr2>:
+  MipsPseudo16<(outs),
+               (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ),
+               !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t",
+               !strconcat(asmstr, "\t$targ"))), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+//
+
+
+//
+// I8_MOVR32 instruction format (used only by the MOVR32 instructio
+//
+class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
+       FI8_MOVR3216<(outs CPU16Regs:$rz), (ins GPR32:$r32),
+       !strconcat(asmstr,  "\t$rz, $r32"), [], itin>;
+
+//
+// I8_MOV32R instruction format (used only by MOV32R instruction)
+//
+
+class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
+  FI8_MOV32R16<(outs GPR32:$r32), (ins CPU16Regs:$rz),
+               !strconcat(asmstr,  "\t$r32, $rz"), [], itin>;
+
+//
+// This are pseudo formats for multiply
+// This first one can be changed to non-pseudo now.
+//
+// MULT
+//
+class FMULT16_ins<string asmstr, InstrItinClass itin> :
+  MipsPseudo16<(outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry"), []>;
+
+//
+// MULT-LO
+//
+class FMULT16_LO_ins<string asmstr, InstrItinClass itin> :
+  MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmflo\t$rz"), []> {
+  let isCodeGenOnly=1;
+}
+
+//
+// RR-type instruction format
+//
+
+class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRBreakNull16_ins<string asmstr, InstrItinClass itin> :
+  FRRBreak16<(outs), (ins), asmstr, [], itin> {
+  let Code=0;
+}
+
+class FRR16R_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs), (ins  CPU16Regs:$rx, CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRTR16_ins<string asmstr> :
+  MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), []> ;
+
+//
+// maybe refactor but need a $zero as a dummy first parameter
+//
+class FRR16_div_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs ), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$$zero, $rx, $ry"), [], itin> ;
+
+class FUnaryRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> ;
+
+
+class FRR16_M_ins<bits<5> f, string asmstr,
+                  InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins),
+        !strconcat(asmstr, "\t$rx"), [], itin>;
+
+class FRxRxRy16_ins<bits<5> f, string asmstr,
+                    InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+            !strconcat(asmstr, "\t$rz, $ry"),
+            [], itin> {
+  let Constraints = "$rx = $rz";
+}
+
+let rx=0 in
+class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
+                              string asmstr, InstrItinClass itin>:
+  FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"),
+              [], itin> ;
+
+
+class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
+                      string asmstr, InstrItinClass itin>:
+  FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+              !strconcat(asmstr, "\t $rx"), [], itin> ;
+
+class FRR_SF16_ins
+  <bits<5> _funct, bits<3> _subfunc,
+    string asmstr, InstrItinClass itin>:
+  FRR_SF16<_funct, _subfunc, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_),
+           !strconcat(asmstr, "\t $rx"),
+           [], itin> {
+  let Constraints = "$rx_ = $rx";
+  }
+//
+// RRR-type instruction format
+//
+
+class FRRR16_ins<bits<2> _f, string asmstr,  InstrItinClass itin> :
+  FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+         !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
+
+//
+// These Sel patterns support the generation of conditional move
+// pseudo instructions.
+//
+// The nomenclature uses the components making up the pseudo and may
+// be a bit counter intuitive when compared with the end result we seek.
+// For example using a bqez in the example directly below results in the
+// conditional move being done if the tested register is not zero.
+// I considered in easier to check by keeping the pseudo consistent with
+// it's components but it could have been done differently.
+//
+// The simplest case is when can test and operand directly and do the
+// conditional move based on a simple mips16 conditional
+//  branch instruction.
+// for example:
+// if $op == beqz or bnez:
+//
+// $op1 $rt, .+4
+// move $rd, $rs
+//
+// if $op == beqz, then if $rt != 0, then the conditional assignment
+// $rd = $rs is done.
+
+// if $op == bnez, then if $rt == 0, then the conditional assignment
+// $rd = $rs is done.
+//
+// So this pseudo class only has one operand, i.e. op
+//
+class Sel<string op>:
+  MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+               CPU16Regs:$rt),
+               !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), []> {
+  //let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// The next two instruction classes allow for an operand which tests
+// two operands and returns a value in register T8 and
+//then does a conditional branch based on the value of T8
+//
+
+// op2 can be cmpi or slti/sltiu
+// op1 can bteqz or btnez
+// the operands for op2 are a register and a signed constant
+//
+// $op2 $t, $imm  ;test register t and branch conditionally
+// $op1 .+4       ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SeliT<string op1, string op2>:
+  MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+                                       CPU16Regs:$rl, simm16:$imm),
+               !strconcat(op2,
+               !strconcat("\t$rl, $imm\n\t",
+               !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+  let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// op2 can be cmp or slt/sltu
+// op1 can be bteqz or btnez
+// the operands for op2 are two registers
+// op1 is a conditional branch
+//
+//
+// $op2 $rl, $rr  ;test registers rl,rr
+// $op1 .+4       ;op2 is a conditional branch
+// move $rd, $rs
+//
+//
+class SelT<string op1, string op2>:
+  MipsPseudo16<(outs CPU16Regs:$rd_),
+               (ins CPU16Regs:$rd, CPU16Regs:$rs,
+                CPU16Regs:$rl, CPU16Regs:$rr),
+               !strconcat(op2,
+               !strconcat("\t$rl, $rr\n\t",
+               !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+  let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// 32 bit constant
+//
+def imm32: Operand<i32>;
+
+def Constant32:
+  MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>;
+
+def LwConstant32:
+  MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm, imm32:$constid),
+    "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
+
+
+//
+// Some general instruction class info
+//
+//
+
+class ArithLogic16Defs<bit isCom=0> {
+  bits<5> shamt = 0;
+  bit isCommutable = isCom;
+  bit isReMaterializable = 1;
+  bit neverHasSideEffects = 1;
+}
+
+class branch16 {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit isBarrier = 1;
+}
+
+class cbranch16 {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+}
+
+class MayLoad {
+  bit mayLoad = 1;
+}
+
+class MayStore {
+  bit mayStore = 1;
+}
+//
+
+
+// Format: ADDIU rx, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, Extended)
+// To add a constant to a 32-bit integer.
+//
+def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>;
+
+def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>,
+  ArithLogic16Defs<0> {
+  let AddedComplexity = 5;
+}
+def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>,
+  ArithLogic16Defs<0> {
+  let isCodeGenOnly = 1;
+}
+
+def AddiuRxRyOffMemX16:
+  FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>;
+
+//
+
+// Format: ADDIU rx, pc, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended)
+// To add a constant to the program counter.
+//
+def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>;
+
+//
+// Format: ADDIU sp, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, SP-Relative, Extended)
+// To add a constant to the stack pointer.
+//
+def AddiuSpImm16
+  : FI816_SP_ins<0b011, "addiu", IIAlu> {
+  let Defs = [SP];
+  let Uses = [SP];
+  let AddedComplexity = 5;
+}
+
+def AddiuSpImmX16
+  : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> {
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+//
+// Format: ADDU rz, rx, ry MIPS16e
+// Purpose: Add Unsigned Word (3-Operand)
+// To add 32-bit integers.
+//
+
+def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: AND rx, ry MIPS16e
+// Purpose: AND
+// To do a bitwise logical AND.
+
+def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+
+//
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch (Extended)
+// To do an unconditional PC-relative branch.
+//
+
+def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16;
+
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch
+// To do an unconditional PC-relative branch.
+//
+def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+
+
+//
+//Format: BREAK immediate
+// Purpose: Breakpoint
+// To cause a Breakpoint exception.
+
+def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>; 
+//
+// Format: BTEQZ offset MIPS16e
+// Purpose: Branch on T Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BteqzT8CmpX16: FEXT_T8I816_ins<"bteqz", "cmp">, cbranch16;
+
+def BteqzT8CmpiX16: FEXT_T8I8I16_ins<"bteqz", "cmpi">,
+  cbranch16;
+
+def BteqzT8SltX16: FEXT_T8I816_ins<"bteqz", "slt">, cbranch16;
+
+def BteqzT8SltuX16: FEXT_T8I816_ins<"bteqz", "sltu">, cbranch16;
+
+def BteqzT8SltiX16: FEXT_T8I8I16_ins<"bteqz", "slti">, cbranch16;
+
+def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
+  cbranch16;
+
+//
+// Format: BTNEZ offset MIPS16e
+// Purpose: Branch on T Not Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+
+def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 {
+  let Uses = [T8];
+}
+
+def BtnezT8CmpX16: FEXT_T8I816_ins<"btnez", "cmp">, cbranch16;
+
+def BtnezT8CmpiX16: FEXT_T8I8I16_ins<"btnez", "cmpi">, cbranch16;
+
+def BtnezT8SltX16: FEXT_T8I816_ins<"btnez", "slt">, cbranch16;
+
+def BtnezT8SltuX16: FEXT_T8I816_ins<"btnez", "sltu">, cbranch16;
+
+def BtnezT8SltiX16: FEXT_T8I8I16_ins<"btnez", "slti">, cbranch16;
+
+def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">,
+  cbranch16;
+
+//
+// Format: CMP rx, ry MIPS16e
+// Purpose: Compare
+// To compare the contents of two GPRs.
+//
+def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate (Extended)
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
+  let Defs = [T8];
+}
+
+
+//
+// Format: DIV rx, ry MIPS16e
+// Purpose: Divide Word
+// To divide 32-bit signed integers.
+//
+def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: DIVU rx, ry MIPS16e
+// Purpose: Divide Unsigned Word
+// To divide 32-bit unsigned integers.
+//
+def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
+  let Defs = [HI0, LO0];
+}
+//
+// Format: JAL target MIPS16e
+// Purpose: Jump and Link
+// To execute a procedure call within the current 256 MB-aligned
+// region and preserve the current ISA.
+//
+
+def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
+  let hasDelaySlot = 0;  // not true, but we add the nop for now
+  let isCall=1;
+  let Defs = [RA];
+}
+
+def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 {
+  let hasDelaySlot = 0;  // not true, but we add the nop for now
+  let isBranch=1;
+  let Defs = [RA];
+}
+
+//
+// Format: JR ra MIPS16e
+// Purpose: Jump Register Through Register ra
+// To execute a branch to the instruction address in the return
+// address register.
+//
+
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let hasDelaySlot = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+//
+// Format: LB ry, offset(rx) MIPS16e
+// Purpose: Load Byte (Extended)
+// To load a byte from memory as a signed value.
+//
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, II_LB>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LBU ry, offset(rx) MIPS16e
+// Purpose: Load Byte Unsigned (Extended)
+// To load a byte from memory as a unsigned value.
+//
+def LbuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, II_LBU>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LH ry, offset(rx) MIPS16e
+// Purpose: Load Halfword signed (Extended)
+// To load a halfword from memory as a signed value.
+//
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, II_LH>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LHU ry, offset(rx) MIPS16e
+// Purpose: Load Halfword unsigned (Extended)
+// To load a halfword from memory as an unsigned value.
+//
+def LhuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, II_LHU>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate
+// To load a constant into a GPR.
+//
+def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>;
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate (Extended)
+// To load a constant into a GPR.
+//
+def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
+
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LW ry, offset(rx) MIPS16e
+// Purpose: Load Word (Extended)
+// To load a word from memory as a signed value.
+//
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, II_LW>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+// Format: LW rx, offset(sp) MIPS16e
+// Purpose: Load Word (SP-Relative, Extended)
+// To load an SP-relative word from memory as a signed value.
+//
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", II_LW>, MayLoad{
+  let Uses = [SP];
+}
+
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
+
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
+//
+// Format: MOVE r32, rz MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+
+//
+// Format: MOVE ry, r32 MIPS16e
+//Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
+
+//
+// Format: MFHI rx MIPS16e
+// Purpose: Move From HI Register
+// To copy the special purpose HI register to a GPR.
+//
+def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
+  let Uses = [HI0];
+  let neverHasSideEffects = 1;
+}
+
+//
+// Format: MFLO rx MIPS16e
+// Purpose: Move From LO Register
+// To copy the special purpose LO register to a GPR.
+//
+def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
+  let Uses = [LO0];
+  let neverHasSideEffects = 1;
+}
+
+//
+// Pseudo Instruction for mult
+//
+def MultRxRy16:  FMULT16_ins<"mult",  IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI0, LO0];
+}
+
+def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: MULT rx, ry MIPS16e
+// Purpose: Multiply Word
+// To multiply 32-bit signed integers.
+//
+def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: MULTU rx, ry MIPS16e
+// Purpose: Multiply Unsigned Word
+// To multiply 32-bit unsigned integers.
+//
+def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: NEG rx, ry MIPS16e
+// Purpose: Negate
+// To negate an integer value.
+//
+def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>;
+
+//
+// Format: NOT rx, ry MIPS16e
+// Purpose: Not
+// To complement an integer value
+//
+def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>;
+
+//
+// Format: OR rx, ry MIPS16e
+// Purpose: Or
+// To do a bitwise logical OR.
+//
+def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize}
+// (All args are optional) MIPS16e
+// Purpose: Restore Registers and Deallocate Stack Frame
+// To deallocate a stack frame before exit from a subroutine,
+// restoring return address and static registers, and adjusting
+// stack
+//
+
+def Restore16:
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_RESTORE >, MayLoad {
+  let isCodeGenOnly = 1;
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+
+def RestoreX16:
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_RESTORE >, MayLoad {
+  let isCodeGenOnly = 1;
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+//
+// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
+// MIPS16e
+// Purpose: Save Registers and Set Up Stack Frame
+// To set up a stack frame on entry to a subroutine,
+// saving return address and static registers, and adjusting stack
+//
+def Save16: 
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_SAVE >, MayStore {
+  let isCodeGenOnly = 1;
+  let Uses = [SP];
+  let Defs = [SP];
+}
+
+def SaveX16:
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_SAVE >, MayStore {
+  let isCodeGenOnly = 1;
+  let Uses = [SP];
+  let Defs = [SP];
+}
+//
+// Format: SB ry, offset(rx) MIPS16e
+// Purpose: Store Byte (Extended)
+// To store a byte to memory.
+//
+def SbRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, II_SB>, MayStore;
+
+//
+// Format: SEB rx MIPS16e
+// Purpose: Sign-Extend Byte
+// Sign-extend least significant byte in register rx.
+//
+def SebRx16
+  : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>;
+
+//
+// Format: SEH rx MIPS16e
+// Purpose: Sign-Extend Halfword
+// Sign-extend least significant word in register rx.
+//
+def SehRx16
+  : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>;
+
+//
+// The Sel(T) instructions are pseudos
+// T means that they use T8 implicitly.
+//
+//
+// Format: SelBeqZ rd, rs, rt
+// Purpose: if rt==0, do nothing
+//          else rs = rt
+//
+def SelBeqZ: Sel<"beqz">;
+
+//
+// Format:  SelTBteqZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZCmp: SelT<"bteqz", "cmp">;
+
+//
+// Format:  SelTBteqZCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZCmpi: SeliT<"bteqz", "cmpi">;
+
+//
+// Format:  SelTBteqZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSlt: SelT<"bteqz", "slt">;
+
+//
+// Format:  SelTBteqZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSlti: SeliT<"bteqz", "slti">;
+
+//
+// Format:  SelTBteqZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSltu: SelT<"bteqz", "sltu">;
+
+//
+// Format:  SelTBteqZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSltiu: SeliT<"bteqz", "sltiu">;
+
+//
+// Format: SelBnez rd, rs, rt
+// Purpose: if rt!=0, do nothing
+//          else rs = rt
+//
+def SelBneZ: Sel<"bnez">;
+
+//
+// Format:  SelTBtneZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+//          If b!=0 then do nothing.
+//          if b0=0 then rd = rs
+//
+def SelTBtneZCmp: SelT<"btnez", "cmp">;
+
+//
+// Format:  SelTBtnezCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZCmpi: SeliT<"btnez", "cmpi">;
+
+//
+// Format:  SelTBtneZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSlt: SelT<"btnez", "slt">;
+
+//
+// Format:  SelTBtneZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSlti: SeliT<"btnez", "slti">;
+
+//
+// Format:  SelTBtneZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSltu: SelT<"btnez", "sltu">;
+
+//
+// Format:  SelTBtneZSltiu rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSltiu: SeliT<"btnez", "sltiu">;
+//
+//
+// Format: SH ry, offset(rx) MIPS16e
+// Purpose: Store Halfword (Extended)
+// To store a halfword to memory.
+//
+def ShRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, II_SH>, MayStore;
+
+//
+// Format: SLL rx, ry, sa MIPS16e
+// Purpose: Shift Word Left Logical (Extended)
+// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
+//
+def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
+
+//
+// Format: SLLV ry, rx MIPS16e
+// Purpose: Shift Word Left Logical Variable
+// To execute a left-shift of a word by a variable number of bits.
+//
+def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>;
+
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> {
+  let Defs = [T8];
+}
+
+def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">;
+
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> {
+  let Defs = [T8];
+}
+//
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">;
+
+//
+// Format: SLT rx, ry MIPS16e
+// Purpose: Set on Less Than
+// To record the result of a less-than comparison.
+//
+def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{
+  let Defs = [T8];
+}
+
+def SltCCRxRy16: FCCRR16_ins<"slt">;
+
+// Format: SLTU rx, ry MIPS16e
+// Purpose: Set on Less Than Unsigned
+// To record the result of an unsigned less-than comparison.
+//
+def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{
+  let Defs = [T8];
+}
+
+def SltuRxRyRz16: FRRTR16_ins<"sltu"> {
+  let isCodeGenOnly=1;
+  let Defs = [T8];
+}
+
+
+def SltuCCRxRy16: FCCRR16_ins<"sltu">;
+//
+// Format: SRAV ry, rx MIPS16e
+// Purpose: Shift Word Right Arithmetic Variable
+// To execute an arithmetic right-shift of a word by a variable
+// number of bits.
+//
+def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
+
+
+//
+// Format: SRA rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Arithmetic (Extended)
+// To execute an arithmetic right-shift of a word by a fixed
+// number of bits-1 to 8 bits.
+//
+def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
+
+
+//
+// Format: SRLV ry, rx MIPS16e
+// Purpose: Shift Word Right Logical Variable
+// To execute a logical right-shift of a word by a variable
+// number of bits.
+//
+def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
+
+
+//
+// Format: SRL rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Logical (Extended)
+// To execute a logical right-shift of a word by a fixed
+// number of bits-1 to 31 bits.
+//
+def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
+
+//
+// Format: SUBU rz, rx, ry MIPS16e
+// Purpose: Subtract Unsigned Word
+// To subtract 32-bit integers
+//
+def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>;
+
+//
+// Format: SW ry, offset(rx) MIPS16e
+// Purpose: Store Word (Extended)
+// To store a word to memory.
+//
+def SwRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, II_SW>, MayStore;
+
+//
+// Format: SW rx, offset(sp) MIPS16e
+// Purpose: Store Word rx (SP-Relative)
+// To store an SP-relative word to memory.
+//
+def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins
+  <0b11010, "sw", II_SW>, MayStore;
+
+//
+//
+// Format: XOR rx, ry MIPS16e
+// Purpose: Xor
+// To do a bitwise logical XOR.
+//
+def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>;
+
+class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> {
+  let Predicates = [InMips16Mode];
+}
+
+// Unary Arith/Logic
+//
+class ArithLogicU_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r),
+            (I CPU16Regs:$r)>;
+
+def: ArithLogicU_pat<not, NotRxRy16>;
+def: ArithLogicU_pat<ineg, NegRxRy16>;
+
+class ArithLogic16_pat<SDNode OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$l, CPU16Regs:$r),
+            (I CPU16Regs:$l, CPU16Regs:$r)>;
+
+def: ArithLogic16_pat<add, AdduRxRyRz16>;
+def: ArithLogic16_pat<and, AndRxRxRy16>;
+def: ArithLogic16_pat<mul, MultRxRyRz16>;
+def: ArithLogic16_pat<or, OrRxRxRy16>;
+def: ArithLogic16_pat<sub, SubuRxRyRz16>;
+def: ArithLogic16_pat<xor, XorRxRxRy16>;
+
+// Arithmetic and logical instructions with 2 register operands.
+
+class ArithLogicI16_pat<SDNode OpNode, PatFrag imm_type, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$in, imm_type:$imm),
+            (I CPU16Regs:$in, imm_type:$imm)>;
+
+def: ArithLogicI16_pat<add, immSExt8, AddiuRxRxImm16>;
+def: ArithLogicI16_pat<add, immSExt16, AddiuRxRxImmX16>;
+def: ArithLogicI16_pat<shl, immZExt5, SllX16>;
+def: ArithLogicI16_pat<srl, immZExt5, SrlX16>;
+def: ArithLogicI16_pat<sra, immZExt5, SraX16>;
+
+class shift_rotate_reg16_pat<SDNode OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r, CPU16Regs:$ra),
+            (I CPU16Regs:$r, CPU16Regs:$ra)>;
+
+def: shift_rotate_reg16_pat<shl, SllvRxRy16>;
+def: shift_rotate_reg16_pat<sra, SravRxRy16>;
+def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
+
+class LoadM16_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode addr16:$addr), (I addr16:$addr)>;
+
+def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16>;
+def: LoadM16_pat<load, LwRxRyOffMemX16>;
+
+class StoreM16_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r, addr16:$addr),
+            (I CPU16Regs:$r, addr16:$addr)>;
+
+def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>;
+def: StoreM16_pat<store, SwRxRyOffMemX16>;
+
+// Unconditional branch
+class UncondBranch16_pat<SDNode OpNode, Instruction I>:
+  Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> {
+    let Predicates = [InMips16Mode];
+  }
+
+def : Mips16Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+                (Jal16 tglobaladdr:$dst)>;
+
+def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+                (Jal16 texternalsym:$dst)>;
+
+// Indirect branch
+def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> {
+  // Ensure that the addition of MIPS32r6/MIPS64r6 support does not change
+  // MIPS16's behaviour.
+  let AddedComplexity = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=0 in
+def JumpLinkReg16:
+  FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
+              "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch> {
+  let Defs = [RA];
+}
+
+// Mips16 pseudos
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
+  hasExtraSrcRegAllocReq = 1 in
+def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>;
+
+
+// setcc patterns
+
+class SetCC_R16<PatFrag cond_op, Instruction I>:
+  Mips16Pat<(cond_op CPU16Regs:$rx, CPU16Regs:$ry),
+            (I CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+class SetCC_I16<PatFrag cond_op, PatLeaf imm_type, Instruction I>:
+  Mips16Pat<(cond_op CPU16Regs:$rx, imm_type:$imm16),
+            (I CPU16Regs:$rx, imm_type:$imm16)>;
+
+
+def: Mips16Pat<(i32  addr16:$addr),
+               (AddiuRxRyOffMemX16  addr16:$addr)>;
+
+
+// Large (>16 bit) immediate loads
+def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
+
+// Carry MipsPatterns
+def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
+                (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
+                (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc  CPU16Regs:$src, immSExt16:$imm),
+                (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
+
+//
+// Some branch conditional patterns are not generated by llvm at this time.
+// Some are for seemingly arbitrary reasons not used: i.e. with signed number
+// comparison they are used and for unsigned a different pattern is used.
+// I am pushing upstream from the full mips16 port and it seemed that I needed
+// these earlier and the mips32 port has these but now I cannot create test
+// cases that use these patterns. While I sort this all out I will leave these
+// extra patterns commented out and if I can be sure they are really not used,
+// I will delete the code. I don't want to check the code in uncommented without
+// a valid test case. In some cases, the compiler is generating patterns with
+// setcc instead and earlier I had implemented setcc first so may have masked
+// the problem. The setcc variants are suboptimal for mips16 so I may wantto
+// figure out how to enable the brcond patterns or else possibly new
+// combinations of of brcond and setcc.
+//
+//
+// bcond-seteq
+//
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+   (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$targ16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
+   (BeqzRxImm16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+// bcond-setgt (do we need to have this pair of setlt, setgt??)
+//
+def: Mips16Pat
+  <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+  >;
+
+//
+// bcond-setge
+//
+def: Mips16Pat
+  <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+//
+// never called because compiler transforms a >= k to a > (k-1)
+def: Mips16Pat
+  <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+   (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$imm16)
+  >;
+
+//
+// bcond-setlt
+//
+def: Mips16Pat
+  <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+   (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$imm16)
+  >;
+
+//
+// bcond-setle
+//
+def: Mips16Pat
+  <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+  >;
+
+//
+// bcond-setne
+//
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+   (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$targ16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+// This needs to be there but I forget which code will generate it
+//
+def: Mips16Pat
+  <(brcond CPU16Regs:$rx, bb:$targ16),
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+
+//
+// bcond-setugt
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+//  >;
+
+//
+// bcond-setuge
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+//  >;
+
+
+//
+// bcond-setult
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+//  >;
+
+def: UncondBranch16_pat<br, Bimm16>;
+
+// Small immediates
+def: Mips16Pat<(i32 immSExt16:$in),
+               (AddiuRxRxImmX16 (Move32R16 ZERO), immSExt16:$in)>;
+
+def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
+
+//
+// MipsDivRem
+//
+def: Mips16Pat
+  <(MipsDivRem16 CPU16Regs:$rx, CPU16Regs:$ry),
+   (DivRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//
+// MipsDivRemU
+//
+def: Mips16Pat
+  <(MipsDivRemU16 CPU16Regs:$rx, CPU16Regs:$ry),
+   (DivuRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//  signed a,b
+//  x = (a>=b)?x:y
+//
+//  if !(a < b) x = y
+//
+def : Mips16Pat<(select (i32 (setge CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, CPU16Regs:$b)>;
+
+//  signed a,b
+//  x = (a>b)?x:y
+//
+//  if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setgt CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+// unsigned a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y;
+//
+def : Mips16Pat<
+  (select (i32 (setuge CPU16Regs:$a, CPU16Regs:$b)),
+   CPU16Regs:$x, CPU16Regs:$y),
+  (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+   CPU16Regs:$a, CPU16Regs:$b)>;
+
+//  unsigned a,b
+//  x = (a>b)?x:y
+//
+//  if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setugt CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZSltu CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+// signed
+// x = (a >= k)?x:y
+// due to an llvm optimization, i don't think that this will ever
+// be used. This is transformed into x = (a > k-1)?x:y
+//
+//
+
+//def : Mips16Pat<
+//  (select (i32 (setge CPU16Regs:$lhs, immSExt16:$rhs)),
+//   CPU16Regs:$T, CPU16Regs:$F),
+//  (SelTBteqZSlti CPU16Regs:$T, CPU16Regs:$F,
+//   CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+//def : Mips16Pat<
+//  (select (i32 (setuge CPU16Regs:$lhs, immSExt16:$rhs)),
+//   CPU16Regs:$T, CPU16Regs:$F),
+//  (SelTBteqZSltiu CPU16Regs:$T, CPU16Regs:$F,
+//   CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+// signed
+// x = (a < k)?x:y
+//
+// if !(a < k) x = y;
+//
+def : Mips16Pat<
+  (select (i32 (setlt CPU16Regs:$a, immSExt16:$b)),
+   CPU16Regs:$x, CPU16Regs:$y),
+  (SelTBtneZSlti CPU16Regs:$x, CPU16Regs:$y,
+   CPU16Regs:$a, immSExt16:$b)>;
+
+
+//
+//
+// signed
+// x = (a <= b)? x : y
+//
+// if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setle CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// unnsigned
+// x = (a <= b)? x : y
+//
+// if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setule CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == b)? x : y
+//
+// if (a != b) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZCmp CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == 0)? x : y
+//
+// if (a != 0) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, 0)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelBeqZ CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a == k)? x : y
+//
+// if (a != k) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, immZExt16:$k)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZCmpi CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, immZExt16:$k)>;
+
+
+//
+// signed/unsigned
+// x = (a != b)? x : y
+//
+// if (a == b) x = y
+//
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZCmp CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a != 0)? x : y
+//
+// if (a == 0) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, 0)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a)>;
+
+// signed/unsigned
+// x = (a)? x : y
+//
+// if (!a) x = y
+//
+def : Mips16Pat<(select  CPU16Regs:$a,
+                 CPU16Regs:$x, CPU16Regs:$y),
+      (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+       CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a != k)? x : y
+//
+// if (a == k) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, immZExt16:$k)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZCmpi CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, immZExt16:$k)>;
+
+//
+// When writing C code to test setxx these patterns,
+// some will be transformed into
+// other things. So we test using C code but using -O3 and -O0
+//
+// seteq
+//
+def : Mips16Pat
+  <(seteq CPU16Regs:$lhs,CPU16Regs:$rhs),
+   (SltiuCCRxImmX16 (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs), 1)>;
+
+def : Mips16Pat
+  <(seteq CPU16Regs:$lhs, 0),
+   (SltiuCCRxImmX16 CPU16Regs:$lhs, 1)>;
+
+
+//
+// setge
+//
+
+def: Mips16Pat
+  <(setge CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (LiRxImmX16 1))>;
+
+//
+// For constants, llvm transforms this to:
+// x > (k -1) and then reverses the operands to use setlt. So this pattern
+// is not used now by the compiler. (Presumably checking that k-1 does not
+// overflow). The compiler never uses this at a the current time, due to
+// other optimizations.
+//
+//def: Mips16Pat
+//  <(setge CPU16Regs:$lhs, immSExt16:$rhs),
+//   (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, immSExt16:$rhs),
+//   (LiRxImmX16 1))>;
+
+// This catches the x >= -32768 case by transforming it to  x > -32769
+//
+def: Mips16Pat
+  <(setgt CPU16Regs:$lhs, -32769),
+   (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, -32768),
+   (LiRxImmX16 1))>;
+
+//
+// setgt
+//
+//
+
+def: Mips16Pat
+  <(setgt CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setle
+//
+def: Mips16Pat
+  <(setle CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImm16 1))>;
+
+//
+// setlt
+//
+def: SetCC_R16<setlt, SltCCRxRy16>;
+
+def: SetCC_I16<setlt, immSExt16, SltiCCRxImmX16>;
+
+//
+// setne
+//
+def : Mips16Pat
+  <(setne CPU16Regs:$lhs,CPU16Regs:$rhs),
+   (SltuCCRxRy16 (LiRxImmX16 0),
+   (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs))>;
+
+
+//
+// setuge
+//
+def: Mips16Pat
+  <(setuge CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (LiRxImmX16 1))>;
+
+// this pattern will never be used because the compiler will transform
+// x >= k to x > (k - 1) and then use SLT
+//
+//def: Mips16Pat
+//  <(setuge CPU16Regs:$lhs, immZExt16:$rhs),
+//   (XorRxRxRy16 (SltiuCCRxImmX16 CPU16Regs:$lhs, immZExt16:$rhs),
+//   (LiRxImmX16 1))>;
+
+//
+// setugt
+//
+def: Mips16Pat
+  <(setugt CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setule
+//
+def: Mips16Pat
+  <(setule CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>;
+
+//
+// setult
+//
+def: SetCC_R16<setult, SltuCCRxRy16>;
+
+def: SetCC_I16<setult, immSExt16, SltiuCCRxImmX16>;
+
+def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
+               (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
+
+// hi/lo relocs
+def : Mips16Pat<(MipsHi tblockaddress:$in),
+                (SllX16 (LiRxImmX16 tblockaddress:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaladdr:$in),
+                (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
+def : Mips16Pat<(MipsHi tjumptable:$in),
+                (SllX16 (LiRxImmX16 tjumptable:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
+                (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
+def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+
+// wrapper_pic
+class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+  Mips16Pat<(MipsWrapper RC:$gp, node:$in),
+            (ADDiuOp RC:$gp, node:$in)>;
+
+
+def : Wrapper16Pat<tglobaladdr, AddiuRxRxImmX16, CPU16Regs>;
+def : Wrapper16Pat<tglobaltlsaddr, AddiuRxRxImmX16, CPU16Regs>;
+
+def : Mips16Pat<(i32 (extloadi8   addr16:$src)),
+                (LbuRxRyOffMemX16  addr16:$src)>;
+def : Mips16Pat<(i32 (extloadi16  addr16:$src)),
+                (LhuRxRyOffMemX16  addr16:$src)>;
+
+def: Mips16Pat<(trap), (Break16)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i8),
+                (SebRx16 CPU16Regs:$val)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i16),
+                (SehRx16 CPU16Regs:$val)>;
+
+def GotPrologue16:   
+  MipsPseudo16<
+    (outs CPU16Regs:$rh, CPU16Regs:$rl),
+    (ins simm16:$immHi, simm16:$immLo),
+    "li\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  // let PrintMethod = "printCPInstOperand";
+}
+
+// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+// the function.  The first operand is the ID# for this instruction, the second
+// is the index into the MachineConstantPool that this is, the third is the
+// size in bytes of this constant pool entry.
+//
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                      i32imm:$size), "foo", []>;
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
new file mode 100644
index 0000000..dbee774
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -0,0 +1,153 @@
+//===-- Mips16RegisterInfo.cpp - MIPS16 Register Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16RegisterInfo.h"
+#include "Mips.h"
+#include "Mips16InstrInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-registerinfo"
+
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST)
+  : MipsRegisterInfo(ST) {}
+
+bool Mips16RegisterInfo::requiresRegisterScavenging
+  (const MachineFunction &MF) const {
+  return false;
+}
+bool Mips16RegisterInfo::requiresFrameIndexScavenging
+  (const MachineFunction &MF) const {
+  return false;
+}
+
+bool Mips16RegisterInfo::useFPForScavengingIndex
+  (const MachineFunction &MF) const {
+  return false;
+}
+
+bool Mips16RegisterInfo::saveScavengerRegister
+  (MachineBasicBlock &MBB,
+   MachineBasicBlock::iterator I,
+   MachineBasicBlock::iterator &UseMI,
+   const TargetRegisterClass *RC,
+   unsigned Reg) const {
+  DebugLoc DL;
+  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
+  TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
+  TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
+  return true;
+}
+
+const TargetRegisterClass *
+Mips16RegisterInfo::intRegClass(unsigned Size) const {
+  assert(Size == 4);
+  return &Mips::CPU16RegsRegClass;
+}
+
+void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+                                     unsigned OpNo, int FrameIndex,
+                                     uint64_t StackSize,
+                                     int64_t SPOffset) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  // The following stack frame objects are always
+  // referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  // Everything else is referenced relative to whatever register
+  // getFrameRegister() returns.
+  unsigned FrameReg;
+
+  if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
+    FrameReg = Mips::SP;
+  else {
+    const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+    if (TFI->hasFP(MF)) {
+      FrameReg = Mips::S0;
+    }
+    else {
+      if ((MI.getNumOperands()> OpNo+2) && MI.getOperand(OpNo+2).isReg())
+        FrameReg = MI.getOperand(OpNo+2).getReg();
+      else
+        FrameReg = Mips::SP;
+    }
+  }
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object
+  //   is one of the
+  //   following: an outgoing argument, pointer to a dynamically allocated
+  //   stack space or a $gp restore location,
+  // - If the frame object is any of the following,
+  //   its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.
+  int64_t Offset;
+  bool IsKill = false;
+  Offset = SPOffset + (int64_t)StackSize;
+  Offset += MI.getOperand(OpNo + 1).getImm();
+
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  if (!MI.isDebugValue() &&
+      !Mips16InstrInfo::validImmediate(MI.getOpcode(), FrameReg, Offset)) {
+    MachineBasicBlock &MBB = *MI.getParent();
+    DebugLoc DL = II->getDebugLoc();
+    unsigned NewImm;
+    const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo*>(
+        MBB.getParent()->getTarget().getInstrInfo());
+    FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
+    Offset = SignExtend64<16>(NewImm);
+    IsKill = true;
+  }
+  MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+  MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
new file mode 100644
index 0000000..f59f1a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
@@ -0,0 +1,48 @@
+//===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16REGISTERINFO_H
+#define MIPS16REGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class Mips16InstrInfo;
+
+class Mips16RegisterInfo : public MipsRegisterInfo {
+public:
+  Mips16RegisterInfo(const MipsSubtarget &Subtarget);
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     MachineBasicBlock::iterator &UseMI,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Reg) const override;
+
+  const TargetRegisterClass *intRegClass(unsigned Size) const override;
+
+private:
+  void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                   int FrameIndex, uint64_t StackSize,
+                   int64_t SPOffset) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
new file mode 100644
index 0000000..e4ec96a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -0,0 +1,543 @@
+//=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+                   PredicateControl {
+  let DecoderNamespace = "Mips32r6_64r6";
+  let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+class OPGROUP<bits<6> Val> {
+  bits<6> Value = Val;
+}
+def OPGROUP_COP1     : OPGROUP<0b010001>;
+def OPGROUP_COP2     : OPGROUP<0b010010>;
+def OPGROUP_ADDI     : OPGROUP<0b001000>;
+def OPGROUP_AUI      : OPGROUP<0b001111>;
+def OPGROUP_BLEZ     : OPGROUP<0b000110>;
+def OPGROUP_BGTZ     : OPGROUP<0b000111>;
+def OPGROUP_BLEZL    : OPGROUP<0b010110>;
+def OPGROUP_BGTZL    : OPGROUP<0b010111>;
+def OPGROUP_DADDI    : OPGROUP<0b011000>;
+def OPGROUP_DAUI     : OPGROUP<0b011101>;
+def OPGROUP_PCREL    : OPGROUP<0b111011>;
+def OPGROUP_REGIMM   : OPGROUP<0b000001>;
+def OPGROUP_SPECIAL  : OPGROUP<0b000000>;
+// The spec occasionally names this value LL, LLD, SC, or SCD.
+def OPGROUP_SPECIAL3 : OPGROUP<0b011111>;
+// The spec names this constant LWC2, LDC2, SWC2, and SDC2 in different places.
+def OPGROUP_COP2LDST : OPGROUP<0b010010>;
+
+class OPCODE2<bits<2> Val> {
+  bits<2> Value = Val;
+}
+def OPCODE2_ADDIUPC : OPCODE2<0b00>;
+def OPCODE2_LWPC    : OPCODE2<0b01>;
+def OPCODE2_LWUPC   : OPCODE2<0b10>;
+
+class OPCODE3<bits<3> Val> {
+  bits<3> Value = Val;
+}
+def OPCODE3_LDPC : OPCODE3<0b110>;
+
+class OPCODE5<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def OPCODE5_ALUIPC : OPCODE5<0b11111>;
+def OPCODE5_AUIPC  : OPCODE5<0b11110>;
+def OPCODE5_DAHI : OPCODE5<0b00110>;
+def OPCODE5_DATI : OPCODE5<0b11110>;
+def OPCODE5_BC1EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
+def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
+def OPCODE5_BGEZAL : OPCODE5<0b10001>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE5_LDC2   : OPCODE5<0b01110>;
+def OPCODE5_LWC2   : OPCODE5<0b01010>;
+def OPCODE5_SDC2   : OPCODE5<0b01111>;
+def OPCODE5_SWC2   : OPCODE5<0b01011>;
+
+class OPCODE6<bits<6> Val> {
+  bits<6> Value = Val;
+}
+def OPCODE6_ALIGN    : OPCODE6<0b100000>;
+def OPCODE6_DALIGN   : OPCODE6<0b100100>;
+def OPCODE6_BITSWAP  : OPCODE6<0b100000>;
+def OPCODE6_DBITSWAP : OPCODE6<0b100100>;
+def OPCODE6_JALR     : OPCODE6<0b001001>;
+def OPCODE6_CACHE    : OPCODE6<0b100101>;
+def OPCODE6_PREF     : OPCODE6<0b110101>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE6_LL       : OPCODE6<0b110110>;
+def OPCODE6_LLD      : OPCODE6<0b110111>;
+def OPCODE6_SC       : OPCODE6<0b100110>;
+def OPCODE6_SCD      : OPCODE6<0b100111>;
+def OPCODE6_CLO      : OPCODE6<0b010001>;
+def OPCODE6_CLZ      : OPCODE6<0b010000>;
+def OPCODE6_DCLO     : OPCODE6<0b010011>;
+def OPCODE6_DCLZ     : OPCODE6<0b010010>;
+def OPCODE6_LSA      : OPCODE6<0b000101>;
+def OPCODE6_DLSA     : OPCODE6<0b010101>;
+def OPCODE6_SDBBP    : OPCODE6<0b001110>;
+
+class FIELD_FMT<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_FMT_S : FIELD_FMT<0b10000>;
+def FIELD_FMT_D : FIELD_FMT<0b10001>;
+
+class FIELD_CMP_COND<bits<5> Val> {
+  bits<5> Value = Val;
+}
+// Note: The CMP_COND_FMT names differ from the C_COND_FMT names.
+def FIELD_CMP_COND_AF   : FIELD_CMP_COND<0b00000>;
+def FIELD_CMP_COND_UN   : FIELD_CMP_COND<0b00001>;
+def FIELD_CMP_COND_EQ   : FIELD_CMP_COND<0b00010>;
+def FIELD_CMP_COND_UEQ  : FIELD_CMP_COND<0b00011>;
+def FIELD_CMP_COND_LT   : FIELD_CMP_COND<0b00100>;
+def FIELD_CMP_COND_ULT  : FIELD_CMP_COND<0b00101>;
+def FIELD_CMP_COND_LE   : FIELD_CMP_COND<0b00110>;
+def FIELD_CMP_COND_ULE  : FIELD_CMP_COND<0b00111>;
+def FIELD_CMP_COND_SAF  : FIELD_CMP_COND<0b01000>;
+def FIELD_CMP_COND_SUN  : FIELD_CMP_COND<0b01001>;
+def FIELD_CMP_COND_SEQ  : FIELD_CMP_COND<0b01010>;
+def FIELD_CMP_COND_SUEQ : FIELD_CMP_COND<0b01011>;
+def FIELD_CMP_COND_SLT  : FIELD_CMP_COND<0b01100>;
+def FIELD_CMP_COND_SULT : FIELD_CMP_COND<0b01101>;
+def FIELD_CMP_COND_SLE  : FIELD_CMP_COND<0b01110>;
+def FIELD_CMP_COND_SULE : FIELD_CMP_COND<0b01111>;
+
+class FIELD_CMP_FORMAT<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_CMP_FORMAT_S : FIELD_CMP_FORMAT<0b10100>;
+def FIELD_CMP_FORMAT_D : FIELD_CMP_FORMAT<0b10101>;
+
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings are ambiguous except by comparing field values.
+
+class DecodeDisambiguates<string Name> {
+  string DecoderMethod = !strconcat("Decode", Name);
+}
+
+class DecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> {
+  string DecoderNamespace = "Mips32r6_64r6_Ambiguous";
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class AUI_FM : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_AUI.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = imm;
+}
+
+class DAUI_FM : AUI_FM {
+  let Inst{31-26} = OPGROUP_DAUI.Value;
+}
+
+class BAL_FM : MipsR6Inst {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = OPCODE5_BGEZAL.Value;
+  let Inst{15-0} = offset;
+}
+
+class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class COP1_3R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = funct;
+}
+
+class COP1_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> ft;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-0} = offset;
+}
+
+class COP2_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> ct;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP2.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = ct;
+  let Inst{15-0} = offset;
+}
+
+class PCREL16_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = Operation.Value;
+  let Inst{15-0} = imm;
+}
+
+class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<19> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-19} = Operation.Value;
+  let Inst{18-0} = imm;
+}
+
+class PCREL18_FM<OPCODE3 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<18> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-18} = Operation.Value;
+  let Inst{17-0} = imm;
+}
+
+class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0b00000;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_MEM_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = base;
+  let Inst{20-16} = hint;
+  let Inst{15-7}  = offset;
+  let Inst{6}     = 0;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0b00001;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = mulop;
+  let Inst{5-0}   = funct;
+}
+
+class SPECIAL_SDBBP_FM : MipsR6Inst {
+  bits<20> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-6}  = code_;
+  let Inst{5-0}   = OPCODE6_SDBBP.Value;
+}
+
+// This class is ambiguous with other branches:
+//   BEQC/BNEC require that rs > rt
+class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+//   BLEZC/BGEZC/BEQZALC/BNEZALC/BGTZALC require that rs == 0 && rt != 0
+// The '1R_RT' in the name means 1 register in the rt field.
+class CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+//   BLTZC/BGTZC/BLTZALC/BGEZALC require that rs == rt && rt != 0
+// The '1R_BOTH' in the name means 1 register in both the rs and rt fields.
+class CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+class CMP_BRANCH_OFF21_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rs; // rs != 0
+  bits<21> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rs;
+  let Inst{20-0} = offset;
+}
+
+class JMP_IDX_COMPACT_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = 0b000000;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+class BRANCH_OFF26_FM<bits<6> funct> : MipsR6Inst {
+  bits<32> Inst;
+  bits<26> offset;
+
+  let Inst{31-26} = funct;
+  let Inst{25-0} = offset;
+}
+
+class SPECIAL3_ALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8}  = 0b010;
+  let Inst{7-6}   = bp;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_DALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<3> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = 0b01;
+  let Inst{8-6}   = bp;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_LL_SC_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = base;
+  let Inst{20-16} = rt;
+  let Inst{15-7} = offset;
+  let Inst{5-0} = Operation.Value;
+
+  string DecoderMethod = "DecodeSpecial3LlSc";
+}
+
+class SPECIAL_LSA_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> imm2;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8}  = 0b000;
+  let Inst{7-6}   = imm2;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class REGIMM_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = Operation.Value;
+  let Inst{15-0} = imm;
+}
+
+class COP1_CMP_CONDN_FM<FIELD_CMP_FORMAT Format,
+                        FIELD_CMP_COND Cond> : MipsR6Inst {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = Cond.Value;
+}
+
+class JR_HB_R6_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = 0;
+  let Inst{10} = 1;
+  let Inst{9-6} = 0;
+  let Inst{5-0} = Operation.Value;
+}
+
+class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<11> offset = addr{10-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP2LDST.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = base;
+  let Inst{10-0}  = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
new file mode 100644
index 0000000..6d6735b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -0,0 +1,824 @@
+//=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+include "Mips32r6InstrFormats.td"
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: jr -> jalr
+// Reencoded: jr.hb -> jalr.hb
+
+def brtarget21 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget21OpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget21";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget26 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget26OpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget26";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def jmpoffset16 : Operand<OtherVT> {
+  let EncoderMethod = "getJumpOffset16OpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def calloffset16 : Operand<iPTR> {
+  let EncoderMethod = "getJumpOffset16OpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class ADDIUPC_ENC : PCREL19_FM<OPCODE2_ADDIUPC>;
+class ALIGN_ENC  : SPECIAL3_ALIGN_FM<OPCODE6_ALIGN>;
+class ALUIPC_ENC : PCREL16_FM<OPCODE5_ALUIPC>;
+class AUI_ENC    : AUI_FM;
+class AUIPC_ENC  : PCREL16_FM<OPCODE5_AUIPC>;
+
+class BAL_ENC   : BAL_FM;
+class BALC_ENC  : BRANCH_OFF26_FM<0b111010>;
+class BC_ENC    : BRANCH_OFF26_FM<0b110010>;
+class BEQC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+                  DecodeDisambiguates<"AddiGroupBranch">;
+class BEQZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_ADDI>,
+                    DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BNEC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+                  DecodeDisambiguates<"DaddiGroupBranch">;
+class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_DADDI>,
+                    DecodeDisambiguatedBy<"DaddiGroupBranch">;
+
+class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZL>,
+                  DecodeDisambiguates<"BgtzlGroupBranch">;
+class BGEC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BGEUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZ>,
+                  DecodeDisambiguatedBy<"BlezGroupBranch">;
+class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguates<"BlezlGroupBranch">;
+class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZ>,
+                    DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLTC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZL>,
+                 DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+class BLTUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZ>,
+                  DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZ>,
+                    DecodeDisambiguates<"BgtzGroupBranch">;
+class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZL>,
+                  DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+
+class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>;
+class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>,
+                    DecodeDisambiguates<"BlezGroupBranch">;
+class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>;
+
+class BC1EQZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1EQZ>;
+class BC1NEZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1NEZ>;
+class BC2EQZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2EQZ>;
+class BC2NEZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2NEZ>;
+
+class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>;
+class JIC_ENC   : JMP_IDX_COMPACT_FM<0b110110>;
+class JR_HB_R6_ENC : JR_HB_R6_FM<OPCODE6_JALR>;
+class BITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_BITSWAP>;
+class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>,
+                    DecodeDisambiguatedBy<"BlezGroupBranch">;
+class BNVC_ENC   : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+                   DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BOVC_ENC   : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+                   DecodeDisambiguatedBy<"AddiGroupBranch">;
+class DIV_ENC    : SPECIAL_3R_FM<0b00010, 0b011010>;
+class DIVU_ENC   : SPECIAL_3R_FM<0b00010, 0b011011>;
+class MOD_ENC    : SPECIAL_3R_FM<0b00011, 0b011010>;
+class MODU_ENC   : SPECIAL_3R_FM<0b00011, 0b011011>;
+class MUH_ENC    : SPECIAL_3R_FM<0b00011, 0b011000>;
+class MUHU_ENC   : SPECIAL_3R_FM<0b00011, 0b011001>;
+class MUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011000>;
+class MULU_ENC   : SPECIAL_3R_FM<0b00010, 0b011001>;
+
+class MADDF_S_ENC  : COP1_3R_FM<0b011000, FIELD_FMT_S>;
+class MADDF_D_ENC  : COP1_3R_FM<0b011000, FIELD_FMT_D>;
+class MSUBF_S_ENC  : COP1_3R_FM<0b011001, FIELD_FMT_S>;
+class MSUBF_D_ENC  : COP1_3R_FM<0b011001, FIELD_FMT_D>;
+
+class SEL_D_ENC  : COP1_3R_FM<0b010000, FIELD_FMT_D>;
+class SEL_S_ENC  : COP1_3R_FM<0b010000, FIELD_FMT_S>;
+
+class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>;
+class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
+
+class LWPC_ENC   : PCREL19_FM<OPCODE2_LWPC>;
+class LWUPC_ENC  : PCREL19_FM<OPCODE2_LWUPC>;
+
+class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
+class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
+
+class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
+class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+
+class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
+class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
+class SELNEZ_S_ENC : COP1_3R_FM<0b010111, FIELD_FMT_S>;
+class SELNEZ_D_ENC : COP1_3R_FM<0b010111, FIELD_FMT_D>;
+
+class RINT_S_ENC : COP1_2R_FM<0b011010, FIELD_FMT_S>;
+class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>;
+class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>;
+class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>;
+
+class CACHE_ENC : SPECIAL3_MEM_FM<OPCODE6_CACHE>;
+class PREF_ENC : SPECIAL3_MEM_FM<OPCODE6_PREF>;
+
+class LDC2_R6_ENC : COP2LDST_FM<OPCODE5_LDC2>;
+class LWC2_R6_ENC : COP2LDST_FM<OPCODE5_LWC2>;
+class SDC2_R6_ENC : COP2LDST_FM<OPCODE5_SDC2>;
+class SWC2_R6_ENC : COP2LDST_FM<OPCODE5_SWC2>;
+
+class LSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_LSA>;
+
+class LL_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LL>;
+class SC_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SC>;
+
+class CLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLO>;
+class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>;
+
+class SDBBP_R6_ENC : SPECIAL_SDBBP_FM;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Multiclasses
+//
+//===----------------------------------------------------------------------===//
+
+class CMP_CONDN_DESC_BASE<string CondStr, string Typestr,
+                          RegisterOperand FGROpnd,
+                          SDPatternOperator Op = null_frag> {
+  dag OutOperandList = (outs FGRCCOpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))];
+}
+
+multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
+                     RegisterOperand FGROpnd>{
+  def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
+                    CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>,
+                    ISA_MIPS32R6;
+  def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
+                     CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>,
+                     ISA_MIPS32R6;
+  def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
+                     CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>,
+                     ISA_MIPS32R6;
+  def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>,
+                      CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>,
+                      ISA_MIPS32R6;
+  def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+                     CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>,
+                     ISA_MIPS32R6;
+  def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>,
+                      CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>,
+                      ISA_MIPS32R6;
+  def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+                     CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>,
+                     ISA_MIPS32R6;
+  def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>,
+                      CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>,
+                      ISA_MIPS32R6;
+  def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>,
+                      CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>,
+                      CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>,
+                      CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>,
+                       CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>,
+                       ISA_MIPS32R6;
+  def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>,
+                      CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>,
+                       CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>,
+                       ISA_MIPS32R6;
+  def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>,
+                      CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>,
+                       CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>,
+                       ISA_MIPS32R6;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins ImmOpnd:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+  list<dag> Pattern = [];
+}
+
+class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2>;
+class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2>;
+class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2>;
+
+class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+  list<dag> Pattern = [];
+}
+
+class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2>;
+
+class ALUIPC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+  list<dag> Pattern = [];
+}
+
+class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd>;
+class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd>;
+
+class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+  list<dag> Pattern = [];
+}
+
+class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd>;
+
+class BRANCH_DESC_BASE {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+}
+
+class BC_DESC_BASE<string instr_asm, DAGOperand opnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  bit isBarrier = 1;
+}
+
+class CMP_BC_DESC_BASE<string instr_asm, DAGOperand opnd,
+                       RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+  list<Register> Defs = [AT];
+}
+
+class CMP_CBR_EQNE_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+                               RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rs, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $offset");
+  list<Register> Defs = [AT];
+}
+
+class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+                             RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
+  list<Register> Defs = [AT];
+}
+
+class BAL_DESC : BC_DESC_BASE<"bal", brtarget> {
+  bit isCall = 1;
+  bit hasDelaySlot = 1;
+  list<Register> Defs = [RA];
+}
+
+class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class BC_DESC : BC_DESC_BASE<"bc", brtarget26>;
+class BGEC_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR32Opnd>;
+class BGEUC_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR32Opnd>;
+class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>;
+class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>;
+
+class BLTC_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR32Opnd>;
+class BLTUC_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR32Opnd>;
+
+class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>;
+class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>;
+
+class BLEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR32Opnd>;
+class BGTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>;
+
+class BEQZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR32Opnd>;
+class BNEZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR32Opnd>;
+
+class COP1_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins FGR64Opnd:$ft, brtarget:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = instr_asm;
+  bit hasDelaySlot = 1;
+}
+
+class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">;
+class BC1NEZ_DESC : COP1_BCCZ_DESC_BASE<"bc1nez $ft, $offset">;
+
+class COP2_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins COP2Opnd:$ct, brtarget:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = instr_asm;
+  bit hasDelaySlot = 1;
+}
+
+class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">;
+class BC2NEZ_DESC : COP2_BCCZ_DESC_BASE<"bc2nez $ct, $offset">;
+
+class BOVC_DESC   : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>;
+class BNVC_DESC   : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>;
+
+class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
+                                RegisterOperand GPROpnd> {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  string AsmString = !strconcat(opstr, "\t$rt, $offset");
+  list<dag> Pattern = [];
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+  string DecoderMethod = "DecodeSimm16";
+}
+
+class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+                                             GPR32Opnd> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> {
+  bit isBarrier = 1;
+  list<Register> Defs = [AT];
+}
+
+class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+  bit isBranch = 1;
+  bit isIndirectBranch = 1;
+  bit hasDelaySlot = 1;
+  bit isTerminator=1;
+  bit isBarrier=1;
+}
+
+class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [];
+}
+
+class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>;
+
+class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       SDPatternOperator Op=null_frag> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+
+  // This instruction doesn't trap division by zero itself. We must insert
+  // teq instructions as well.
+  bit usesCustomInserter = 1;
+}
+
+class DIV_DESC  : DIVMOD_DESC_BASE<"div", GPR32Opnd, sdiv>;
+class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, udiv>;
+class MOD_DESC  : DIVMOD_DESC_BASE<"mod", GPR32Opnd, srem>;
+class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, urem>;
+
+class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       SDPatternOperator Op=null_frag> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+}
+
+class MUH_DESC    : MUL_R6_DESC_BASE<"muh", GPR32Opnd, mulhs>;
+class MUHU_DESC   : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, mulhu>;
+class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, mul>;
+class MULU_DESC   : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>;
+
+class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGRCCOpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [(set FGROpnd:$fd, (select FGRCCOpnd:$fd_in,
+                                                 FGROpnd:$ft,
+                                                 FGROpnd:$fs))];
+  string Constraints = "$fd_in = $fd";
+}
+
+class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> {
+  // We must insert a SUBREG_TO_REG around $fd_in
+  bit usesCustomInserter = 1;
+}
+class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>;
+
+class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [];
+}
+
+class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>;
+class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>;
+
+class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+  string Constraints = "$fd_in = $fd";
+}
+
+class MADDF_S_DESC  : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>;
+class MADDF_D_DESC  : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>;
+class MSUBF_S_DESC  : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>;
+class MSUBF_D_DESC  : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>;
+
+class MAX_MIN_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+}
+
+class MAX_S_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>;
+class MAX_D_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>;
+class MIN_S_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>;
+class MIN_D_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>;
+
+class MAXA_S_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>;
+class MAXA_D_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>;
+class MINA_S_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>;
+class MINA_D_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>;
+
+class SELEQNEZ_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+}
+
+class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>;
+class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>;
+class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
+class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
+
+class CLASS_RINT_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs");
+  list<dag> Pattern = [];
+}
+
+class RINT_S_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>;
+class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>;
+class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>;
+class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>;
+
+class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
+                      RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+  string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+  list<dag> Pattern = [];
+}
+
+class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>;
+class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd>;
+
+class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
+  dag OutOperandList = (outs COPOpnd:$rt);
+  dag InOperandList = (ins mem_simm11:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayLoad = 1;
+}
+
+class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>;
+class LWC2_R6_DESC : COP2LD_DESC_BASE<"lwc2", COP2Opnd>;
+
+class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins COPOpnd:$rt, mem_simm11:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayStore = 1;
+}
+
+class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>;
+class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd>;
+
+class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       Operand ImmOpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2");
+  list<dag> Pattern = [];
+}
+
+class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>;
+
+class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayLoad = 1;
+}
+
+class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd>;
+
+class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$dst);
+  dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayStore = 1;
+  string Constraints = "$rt = $dst";
+}
+
+class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd>;
+
+class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+}
+
+class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> :
+    CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> {
+  list<dag> Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))];
+}
+
+class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> :
+    CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> {
+  list<dag> Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))];
+}
+
+class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd>;
+class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd>;
+
+class SDBBP_R6_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm20:$code_);
+  string AsmString = "sdbbp\t$code_";
+  list<dag> Pattern = [];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+def ADDIUPC : ADDIUPC_ENC, ADDIUPC_DESC, ISA_MIPS32R6;
+def ALIGN : ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6;
+def ALUIPC : ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6;
+def AUI : AUI_ENC, AUI_DESC, ISA_MIPS32R6;
+def AUIPC : AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6;
+def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6;
+def BALC : BALC_ENC, BALC_DESC, ISA_MIPS32R6;
+def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6;
+def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6;
+def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6;
+def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6;
+def BC : BC_ENC, BC_DESC, ISA_MIPS32R6;
+def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
+def BEQZALC : BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6;
+def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
+def BGEC : BGEC_ENC, BGEC_DESC, ISA_MIPS32R6;
+def BGEUC : BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6;
+def BGEZALC : BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6;
+def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6;
+def BGTZALC : BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6;
+def BGTZC : BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6;
+def BITSWAP : BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6;
+def BLEZALC : BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6;
+def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6;
+def BLTC : BLTC_ENC, BLTC_DESC, ISA_MIPS32R6;
+def BLTUC : BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6;
+def BLTZALC : BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6;
+def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6;
+def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
+def BNEZALC : BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6;
+def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
+def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
+def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+def CACHE_R6 : CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
+def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6;
+def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6;
+def CLO_R6 : CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
+def CLZ_R6 : CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
+defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>;
+defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd>;
+def DIV : DIV_ENC, DIV_DESC, ISA_MIPS32R6;
+def DIVU : DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
+def JIALC : JIALC_ENC, JIALC_DESC, ISA_MIPS32R6;
+def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6;
+def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6;
+def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6;
+def LL_R6 : LL_R6_ENC, LL_R6_DESC, ISA_MIPS32R6;
+def LSA_R6 : LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6;
+def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
+def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
+def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
+def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6;
+def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6;
+def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6;
+def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6;
+def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6;
+def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6;
+def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6;
+def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6;
+def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6;
+def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6;
+def MOD : MOD_ENC, MOD_DESC, ISA_MIPS32R6;
+def MODU : MODU_ENC, MODU_DESC, ISA_MIPS32R6;
+def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6;
+def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6;
+def MUH    : MUH_ENC, MUH_DESC, ISA_MIPS32R6;
+def MUHU   : MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
+def MUL_R6 : MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
+def MULU   : MULU_ENC, MULU_DESC, ISA_MIPS32R6;
+def NAL; // BAL with rd=0
+def PREF_R6 : PREF_ENC, PREF_DESC, ISA_MIPS32R6;
+def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6;
+def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6;
+def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6;
+def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
+def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
+def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
+def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6;
+def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6;
+def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
+def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6;
+def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6;
+def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6;
+def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6;
+def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// f32 comparisons supported via another comparison
+def : MipsPat<(setone f32:$lhs, f32:$rhs),
+              (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seto f32:$lhs, f32:$rhs),
+              (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(setune f32:$lhs, f32:$rhs),
+              (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setne f32:$lhs, f32:$rhs),
+              (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+
+// f64 comparisons supported via another comparison
+def : MipsPat<(setone f64:$lhs, f64:$rhs),
+              (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seto f64:$lhs, f64:$rhs),
+              (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(setune f64:$lhs, f64:$rhs),
+              (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setne f64:$lhs, f64:$rhs),
+              (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+
+// i32 selects
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f),
+              (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
+              (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
+                      i32:$f),
+              (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))),
+                  (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)),
+                      i32:$t, i32:$f),
+              (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))),
+                  (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>,
+              ISA_MIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, immz),
+              (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz),
+              (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz),
+              (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f),
+              (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f),
+              (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f),
+              (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
new file mode 100644
index 0000000..f0b6814
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -0,0 +1,512 @@
+//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Unsigned Operand
+def uimm16_64      : Operand<i64> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// Signed Operand
+def simm10_64 : Operand<i64>;
+
+def imm64: Operand<i64>;
+
+// Transformation Function - get Imm - 32.
+def Subtract32 : SDNodeXForm<imm, [{
+  return getImm(N, (unsigned)N->getZExtValue() - 32);
+}]>;
+
+// shamt must fit in 6 bits.
+def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
+
+// Node immediate fits as 10-bit sign extended on target immediate.
+// e.g. seqi, snei
+def immSExt10_64 : PatLeaf<(i64 imm),
+                           [{ return isInt<10>(N->getSExtValue()); }]>;
+
+def immZExt16_64 : PatLeaf<(i64 imm),
+                           [{ return isInt<16>(N->getZExtValue()); }]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I64  : Atomic2Ops<atomic_load_add_64, GPR64>;
+  def ATOMIC_LOAD_SUB_I64  : Atomic2Ops<atomic_load_sub_64, GPR64>;
+  def ATOMIC_LOAD_AND_I64  : Atomic2Ops<atomic_load_and_64, GPR64>;
+  def ATOMIC_LOAD_OR_I64   : Atomic2Ops<atomic_load_or_64, GPR64>;
+  def ATOMIC_LOAD_XOR_I64  : Atomic2Ops<atomic_load_xor_64, GPR64>;
+  def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
+  def ATOMIC_SWAP_I64      : Atomic2Ops<atomic_swap_64, GPR64>;
+  def ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
+}
+
+/// Pseudo instructions for loading and storing accumulator registers.
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def LOAD_ACC128  : Load<"", ACC128>;
+  def STORE_ACC128 : Store<"", ACC128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+let DecoderNamespace = "Mips64" in {
+/// Arithmetic Instructions (ALU Immediate)
+def DADDi   : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>,
+              ISA_MIPS3_NOT_32R6_64R6;
+def DADDiu  : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU,
+                          immSExt16, add>,
+              ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
+
+let isCodeGenOnly = 1 in {
+def SLTi64  : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
+              SLTI_FM<0xa>;
+def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
+              SLTI_FM<0xb>;
+def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, II_AND, immZExt16, and>,
+             ADDI_FM<0xc>;
+def ORi64   : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
+              ADDI_FM<0xd>;
+def XORi64  : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
+              ADDI_FM<0xe>;
+def LUi64   : LoadUpper<"lui", GPR64Opnd, uimm16_64>, LUI_FM;
+}
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def DADD   : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>,
+             ISA_MIPS3;
+def DADDu  : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>, ADD_FM<0, 0x2d>,
+             ISA_MIPS3;
+def DSUBu  : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, ADD_FM<0, 0x2f>,
+             ISA_MIPS3;
+def DSUB   : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>, ADD_FM<0, 0x2e>,
+             ISA_MIPS3;
+
+let isCodeGenOnly = 1 in {
+def SLT64  : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
+def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
+def AND64  : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>;
+def OR64   : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>;
+def XOR64  : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>;
+def NOR64  : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
+}
+
+/// Shift Instructions
+def DSLL   : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, shl, immZExt6>,
+             SRA_FM<0x38, 0>, ISA_MIPS3;
+def DSRL   : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, srl, immZExt6>,
+             SRA_FM<0x3a, 0>, ISA_MIPS3;
+def DSRA   : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, sra, immZExt6>,
+             SRA_FM<0x3b, 0>, ISA_MIPS3;
+def DSLLV  : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
+             SRLV_FM<0x14, 0>, ISA_MIPS3;
+def DSRLV  : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
+             SRLV_FM<0x16, 0>, ISA_MIPS3;
+def DSRAV  : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
+             SRLV_FM<0x17, 0>, ISA_MIPS3;
+def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd, II_DSLL32>,
+             SRA_FM<0x3c, 0>, ISA_MIPS3;
+def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd, II_DSRL32>,
+             SRA_FM<0x3e, 0>, ISA_MIPS3;
+def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd, II_DSRA32>,
+             SRA_FM<0x3f, 0>, ISA_MIPS3;
+
+// Rotate Instructions
+def DROTR  : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr,
+                              immZExt6>,
+             SRA_FM<0x3a, 1>, ISA_MIPS64R2;
+def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV, rotr>,
+             SRLV_FM<0x16, 1>, ISA_MIPS64R2;
+def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>,
+              SRA_FM<0x3e, 1>, ISA_MIPS64R2;
+
+/// Load and Store Instructions
+///  aligned
+let isCodeGenOnly = 1 in {
+def LB64  : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>;
+def LH64  : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>;
+def LW64  : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>;
+def SB64  : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>;
+def SH64  : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
+def SW64  : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
+}
+
+def LWu   : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>, ISA_MIPS3;
+def LD    : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>, ISA_MIPS3;
+def SD    : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>, ISA_MIPS3;
+
+/// load/store left/right
+let isCodeGenOnly = 1 in {
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
+}
+
+def LDL   : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def LDR   : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def SDL   : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def SDR   : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
+            ISA_MIPS3_NOT_32R6_64R6;
+
+/// Load-linked, Store-conditional
+def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3_NOT_32R6_64R6;
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
+
+/// Jump and Branch Instructions
+let isCodeGenOnly = 1 in {
+  def JR64   : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
+  def BEQ64  : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+  def BNE64  : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+  def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+  def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+  def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+  def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+  def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+  def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+  def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
+}
+
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
+def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
+
+/// Multiply and Divide Instructions.
+def DMULT  : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
+             MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6;
+def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
+             MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDMULT  : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
+                                 II_DMULT>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
+                                 II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6;
+def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
+            MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6;
+def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
+            MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
+                                II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
+                                II_DDIVU, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>,
+                   ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>,
+                   ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>, ISA_MIPS3_NOT_32R6_64R6;
+
+/// Sign Ext In Register Instructions.
+def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>,
+            ISA_MIPS32R2;
+def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
+            ISA_MIPS32R2;
+}
+
+/// Count Leading
+def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64_NOT_64R6;
+def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64_NOT_64R6;
+
+/// Double Word Swap Bytes/HalfWords
+def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2;
+def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>, ISA_MIPS64R2;
+
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
+
+let isCodeGenOnly = 1 in
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
+
+def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>;
+def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>;
+def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>;
+
+def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>;
+def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>;
+def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
+
+let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
+  def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
+                     "dsll\t$rd, $rt, 32", [], II_DSLL>;
+  def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
+                    "sll\t$rd, $rt, 0", [], II_SLL>;
+  def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
+                    "sll\t$rd, $rt, 0", [], II_SLL>;
+}
+
+// We need the following pseudo instruction to avoid offset calculation for
+// long branches.  See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
+// where %PART may be %hi or %lo, depending on the relocation kind
+// that $tgt is annotated with.
+def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Cavium Octeon cmMIPS instructions
+let EncodingPredicates = []<Predicate>, // FIXME: The lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [HasCnMips] in {
+
+class Count1s<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctpop RO:$rs))], II_POP, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class ExtsCins<string opstr, SDPatternOperator Op = null_frag>:
+  InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+         !strconcat(opstr, " $rt, $rs, $pos, $lenm1"),
+         [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+         NoItinerary, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+class SetCC64_R<string opstr, PatFrag cond_op> :
+  InstSE<(outs GPR64Opnd:$rd), (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set GPR64Opnd:$rd, (cond_op GPR64Opnd:$rs, GPR64Opnd:$rt))],
+         II_SEQ_SNE, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class SetCC64_I<string opstr, PatFrag cond_op>:
+  InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, simm10_64:$imm10),
+         !strconcat(opstr, "\t$rt, $rs, $imm10"),
+         [(set GPR64Opnd:$rt, (cond_op GPR64Opnd:$rs, immSExt10_64:$imm10))],
+         II_SEQI_SNEI, FrmI, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+// Unsigned Byte Add
+let Pattern = [(set GPR64Opnd:$rd,
+                    (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in
+def BADDu  : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
+                              ADD_FM<0x1c, 0x28>;
+
+// Multiply Doubleword to GPR
+let Defs = [HI0, LO0, P0, P1, P2] in
+def DMUL  : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
+                              ADD_FM<0x1c, 0x03>;
+
+// Extract a signed bit field /+32
+def EXTS  : ExtsCins<"exts">, EXTS_FM<0x3a>;
+def EXTS32: ExtsCins<"exts32">, EXTS_FM<0x3b>;
+
+// Clear and insert a bit field /+32
+def CINS  : ExtsCins<"cins">, EXTS_FM<0x32>;
+def CINS32: ExtsCins<"cins32">, EXTS_FM<0x33>;
+
+// Move to multiplier/product register
+def MTM0   : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>;
+def MTM1   : MoveToLOHI<"mtm1", GPR64Opnd, [MPL1, P0, P1, P2]>, MTMR_FM<0x0c>;
+def MTM2   : MoveToLOHI<"mtm2", GPR64Opnd, [MPL2, P0, P1, P2]>, MTMR_FM<0x0d>;
+def MTP0   : MoveToLOHI<"mtp0", GPR64Opnd, [P0]>, MTMR_FM<0x09>;
+def MTP1   : MoveToLOHI<"mtp1", GPR64Opnd, [P1]>, MTMR_FM<0x0a>;
+def MTP2   : MoveToLOHI<"mtp2", GPR64Opnd, [P2]>, MTMR_FM<0x0b>;
+
+// Count Ones in a Word/Doubleword
+def POP   : Count1s<"pop", GPR32Opnd>, POP_FM<0x2c>;
+def DPOP  : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>;
+
+// Set on equal/not equal
+def SEQ   : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>;
+def SEQi  : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>;
+def SNE   : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>;
+def SNEi  : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>;
+
+// 192-bit x 64-bit Unsigned Multiply and Add
+let Defs = [P0, P1, P2] in
+def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>,
+                                  ADD_FM<0x1c, 0x11>;
+
+// 64-bit Unsigned Multiply and Add Move
+let Defs = [MPL0, P0, P1, P2] in
+def VMM0  : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>,
+                                ADD_FM<0x1c, 0x10>;
+
+// 64-bit Unsigned Multiply and Add
+let Defs = [MPL1, MPL2, P0, P1, P2] in
+def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>,
+                                 ADD_FM<0x1c, 0x0f>;
+
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// extended loads
+def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
+
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+              (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
+
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+              (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+              (DADDiu GPR64:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+              (DADDiu GPR64:$hi, tjumptable:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+              (DADDiu GPR64:$hi, tconstpool:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+def : WrapperPat<tconstpool, DADDiu, GPR64>;
+def : WrapperPat<texternalsym, DADDiu, GPR64>;
+def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+def : WrapperPat<tjumptable, DADDiu, GPR64>;
+def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+
+defm : BrcondPats<GPR64, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+                  ZERO_64>;
+
+def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
+              (BLEZ64 i64:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
+              (BGEZ64 i64:$lhs, bb:$dst)>;
+
+// setcc patterns
+defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>;
+defm : SetlePats<GPR64, SLT64, SLTu64>;
+defm : SetgtPats<GPR64, SLT64, SLTu64>;
+defm : SetgePats<GPR64, SLT64, SLTu64>;
+defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
+
+// truncate
+def : MipsPat<(i32 (trunc GPR64:$src)),
+              (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
+
+// 32-to-64-bit extension
+def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
+def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+
+// Sign extend in register
+def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
+              (SLL64_64 GPR64:$src)>;
+
+// bswap MipsPattern
+def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"move $dst, $src",
+                    (DADDu GPR64Opnd:$dst,  GPR64Opnd:$src, ZERO_64), 1>,
+      GPR_64;
+def : MipsInstAlias<"daddu $rs, $rt, $imm",
+                    (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                    0>;
+def : MipsInstAlias<"dadd $rs, $rt, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"daddu $rs, $imm",
+                    (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+                    0>;
+def : MipsInstAlias<"dadd $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"add $rs, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+                    0>;
+def : MipsInstAlias<"addu $rs, $imm",
+                    (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+                    0>;
+def : MipsInstAlias<"dsll $rd, $rt, $rs",
+                    (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+                    (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
+                            InvertedImOperand64:$imm), 0>;
+def : MipsInstAlias<"dsubi $rs, $rt, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsubi $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $rt, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsubu $rs, $imm",
+                    (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
+                            InvertedImOperand64:$imm),
+                    0>;
+def : MipsInstAlias<"dsra $rd, $rt, $rs",
+                    (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+                    (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+
+class LoadImm64< string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64),
+                     !strconcat(instr_asm, "\t$rt, $imm64")> ;
+def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>;
+
+/// Move between CPU and coprocessor registers
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
+def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>, ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>, ISA_MIPS3;
+def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>, ISA_MIPS3;
+}
+
+// Two operand (implicit 0 selector) versions:
+def : MipsInstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
new file mode 100644
index 0000000..6b546e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -0,0 +1,217 @@
+//=- Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: dclo, dclz
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DALIGN_ENC  : SPECIAL3_DALIGN_FM<OPCODE6_DALIGN>;
+class DAUI_ENC    : DAUI_FM;
+class DAHI_ENC    : REGIMM_FM<OPCODE5_DAHI>;
+class DATI_ENC    : REGIMM_FM<OPCODE5_DATI>;
+class DBITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_DBITSWAP>;
+class DCLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLO>;
+class DCLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLZ>;
+class DDIV_ENC    : SPECIAL_3R_FM<0b00010, 0b011110>;
+class DDIVU_ENC   : SPECIAL_3R_FM<0b00010, 0b011111>;
+class DLSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_DLSA>;
+class DMOD_ENC    : SPECIAL_3R_FM<0b00011, 0b011110>;
+class DMODU_ENC   : SPECIAL_3R_FM<0b00011, 0b011111>;
+class DMUH_ENC    : SPECIAL_3R_FM<0b00011, 0b011100>;
+class DMUHU_ENC   : SPECIAL_3R_FM<0b00011, 0b011101>;
+class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>;
+class DMULU_ENC   : SPECIAL_3R_FM<0b00010, 0b011101>;
+class LDPC_ENC    : PCREL18_FM<OPCODE3_LDPC>;
+class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
+class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class AHI_ATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+  string Constraints = "$rs = $rt";
+}
+
+class DALIGN_DESC  : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>;
+class DAHI_DESC    : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>;
+class DATI_DESC    : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>;
+class DAUI_DESC    : AUI_DESC_BASE<"daui", GPR64Opnd>;
+class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>;
+class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>;
+class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>;
+class DDIV_DESC    : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>;
+class DDIVU_DESC   : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>;
+class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>;
+class DMOD_DESC    : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>;
+class DMODU_DESC   : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>;
+class DMUH_DESC    : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>;
+class DMUHU_DESC   : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, mulhu>;
+class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, mul>;
+class DMULU_DESC   : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>;
+class LDPC_DESC    : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3>;
+class LLD_R6_DESC   : LL_R6_DESC_BASE<"lld", GPR64Opnd>;
+class SCD_R6_DESC   : SC_R6_DESC_BASE<"scd", GPR64Opnd>;
+class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
+class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6;
+def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
+def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
+def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
+def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
+def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
+def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
+def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
+def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
+def DLSA_R6 : DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6;
+def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
+def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
+def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
+def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
+def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
+def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
+def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
+def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS32R6;
+def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6;
+let DecoderNamespace = "Mips32r6_64r6_GP64" in {
+  def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
+  def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// i64 selects
+def : MipsPat<(select i64:$cond, i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, i64:$cond),
+                    (SELEQZ64 i64:$f, i64:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, i64:$cond),
+                    (SELNEZ64 i64:$f, i64:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, i64:$cond),
+                    (SELEQZ64 i64:$f, i64:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+                    (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+                    (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+              ISA_MIPS64R6;
+def : MipsPat<
+  (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+  (OR64 (SELEQZ64 i64:$t,
+                  (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)),
+        (SELNEZ64 i64:$f,
+                  (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)))>,
+  ISA_MIPS64R6;
+def : MipsPat<
+  (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+  (OR64 (SELEQZ64 i64:$t,
+                  (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)),
+        (SELNEZ64 i64:$f,
+                  (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)))>,
+  ISA_MIPS64R6;
+
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, immz),
+              (SELNEZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, immz),
+              (SELEQZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), immz, i64:$f),
+              (SELEQZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), immz, i64:$f),
+              (SELNEZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+
+// i64 selects from an i32 comparison
+// One complicating factor here is that bits 32-63 of an i32 are undefined.
+// FIXME: Ideally, setcc would always produce an i64 on MIPS64 targets.
+//        This would allow us to remove the sign-extensions here.
+def : MipsPat<(select i32:$cond, i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))),
+                    (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))),
+                    (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))))>,
+              ISA_MIPS64R6;
+
+def : MipsPat<(select i32:$cond, i64:$t, immz),
+              (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, immz),
+              (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, immz),
+              (SELEQZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select i32:$cond, immz, i64:$f),
+              (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f),
+              (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
+              (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
new file mode 100644
index 0000000..31a9b7d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -0,0 +1,153 @@
+//===-- MipsAnalyzeImmediate.cpp - Analyze Immediates ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsAnalyzeImmediate.h"
+#include "Mips.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+MipsAnalyzeImmediate::Inst::Inst(unsigned O, unsigned I) : Opc(O), ImmOpnd(I) {}
+
+// Add I to the instruction sequences.
+void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) {
+  // Add an instruction seqeunce consisting of just I.
+  if (SeqLs.empty()) {
+    SeqLs.push_back(InstSeq(1, I));
+    return;
+  }
+
+  for (InstSeqLs::iterator Iter = SeqLs.begin(); Iter != SeqLs.end(); ++Iter)
+    Iter->push_back(I);
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize,
+                                             InstSeqLs &SeqLs) {
+  GetInstSeqLs((Imm + 0x8000ULL) & 0xffffffffffff0000ULL, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ADDiu, Imm & 0xffffULL));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  GetInstSeqLs(Imm & 0xffffffffffff0000ULL, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ORi, Imm & 0xffffULL));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  unsigned Shamt = countTrailingZeros(Imm);
+  GetInstSeqLs(Imm >> Shamt, RemSize - Shamt, SeqLs);
+  AddInstr(SeqLs, Inst(SLL, Shamt));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLs(uint64_t Imm, unsigned RemSize,
+                                        InstSeqLs &SeqLs) {
+  uint64_t MaskedImm = Imm & (0xffffffffffffffffULL >> (64 - Size));
+
+  // Do nothing if Imm is 0.
+  if (!MaskedImm)
+    return;
+
+  // A single ADDiu will do if RemSize <= 16.
+  if (RemSize <= 16) {
+    AddInstr(SeqLs, Inst(ADDiu, MaskedImm));
+    return;
+  }
+
+  // Shift if the lower 16-bit is cleared.
+  if (!(Imm & 0xffff)) {
+    GetInstSeqLsSLL(Imm, RemSize, SeqLs);
+    return;
+  }
+
+  GetInstSeqLsADDiu(Imm, RemSize, SeqLs);
+
+  // If bit 15 is cleared, it doesn't make a difference whether the last
+  // instruction is an ADDiu or ORi. In that case, do not call GetInstSeqLsORi.
+  if (Imm & 0x8000) {
+    InstSeqLs SeqLsORi;
+    GetInstSeqLsORi(Imm, RemSize, SeqLsORi);
+    SeqLs.insert(SeqLs.end(), SeqLsORi.begin(), SeqLsORi.end());
+  }
+}
+
+// Replace a ADDiu & SLL pair with a LUi.
+// e.g. the following two instructions
+//  ADDiu 0x0111
+//  SLL 18
+// are replaced with
+//  LUi 0x444
+void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) {
+  // Check if the first two instructions are ADDiu and SLL and the shift amount
+  // is at least 16.
+  if ((Seq.size() < 2) || (Seq[0].Opc != ADDiu) ||
+      (Seq[1].Opc != SLL) || (Seq[1].ImmOpnd < 16))
+    return;
+
+  // Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit.
+  int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd);
+  int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16);
+
+  if (!isInt<16>(ShiftedImm))
+    return;
+
+  // Replace the first instruction and erase the second.
+  Seq[0].Opc = LUi;
+  Seq[0].ImmOpnd = (unsigned)(ShiftedImm & 0xffff);
+  Seq.erase(Seq.begin() + 1);
+}
+
+void MipsAnalyzeImmediate::GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts) {
+  InstSeqLs::iterator ShortestSeq = SeqLs.end();
+  // The length of an instruction sequence is at most 7.
+  unsigned ShortestLength = 8;
+
+  for (InstSeqLs::iterator S = SeqLs.begin(); S != SeqLs.end(); ++S) {
+    ReplaceADDiuSLLWithLUi(*S);
+    assert(S->size() <= 7);
+
+    if (S->size() < ShortestLength) {
+      ShortestSeq = S;
+      ShortestLength = S->size();
+    }
+  }
+
+  Insts.clear();
+  Insts.append(ShortestSeq->begin(), ShortestSeq->end());
+}
+
+const MipsAnalyzeImmediate::InstSeq
+&MipsAnalyzeImmediate::Analyze(uint64_t Imm, unsigned Size,
+                               bool LastInstrIsADDiu) {
+  this->Size = Size;
+
+  if (Size == 32) {
+    ADDiu = Mips::ADDiu;
+    ORi = Mips::ORi;
+    SLL = Mips::SLL;
+    LUi = Mips::LUi;
+  } else {
+    ADDiu = Mips::DADDiu;
+    ORi = Mips::ORi64;
+    SLL = Mips::DSLL;
+    LUi = Mips::LUi64;
+  }
+
+  InstSeqLs SeqLs;
+
+  // Get the list of instruction sequences.
+  if (LastInstrIsADDiu | !Imm)
+    GetInstSeqLsADDiu(Imm, Size, SeqLs);
+  else
+    GetInstSeqLs(Imm, Size, SeqLs);
+
+  // Set Insts to the shortest instruction sequence.
+  GetShortestSeq(SeqLs, Insts);
+
+  return Insts;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
new file mode 100644
index 0000000..cc09034
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -0,0 +1,63 @@
+//===-- MipsAnalyzeImmediate.h - Analyze Immediates ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MIPS_ANALYZE_IMMEDIATE_H
+#define MIPS_ANALYZE_IMMEDIATE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+  class MipsAnalyzeImmediate {
+  public:
+    struct Inst {
+      unsigned Opc, ImmOpnd;
+      Inst(unsigned Opc, unsigned ImmOpnd);
+    };
+    typedef SmallVector<Inst, 7 > InstSeq;
+
+    /// Analyze - Get an instruction sequence to load immediate Imm. The last
+    /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
+    /// true;
+    const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
+  private:
+    typedef SmallVector<InstSeq, 5> InstSeqLs;
+
+    /// AddInstr - Add I to all instruction sequences in SeqLs.
+    void AddInstr(InstSeqLs &SeqLs, const Inst &I);
+
+    /// GetInstSeqLsADDiu - Get instruction sequences which end with an ADDiu to
+    /// load immediate Imm
+    void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLsORi - Get instrutcion sequences which end with an ORi to
+    /// load immediate Imm
+    void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLsSLL - Get instruction sequences which end with a SLL to
+    /// load immediate Imm
+    void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLs - Get instruction sequences to load immediate Imm.
+    void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
+    void ReplaceADDiuSLLWithLUi(InstSeq &Seq);
+
+    /// GetShortestSeq - Find the shortest instruction sequence in SeqLs and
+    /// return it in Insts.
+    void GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts);
+
+    unsigned Size;
+    unsigned ADDiu, ORi, SLL, LUi;
+    InstSeq Insts;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
new file mode 100644
index 0000000..7f21d68
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -0,0 +1,1036 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM Assembly Printer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "MipsMCInstLower.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-asm-printer"
+
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
+  return static_cast<MipsTargetStreamer &>(*OutStreamer.getTargetStreamer());
+}
+
+bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+
+  // Initialize TargetLoweringObjectFile.
+  const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+      .Initialize(OutContext, TM);
+
+  MipsFI = MF.getInfo<MipsFunctionInfo>();
+  if (Subtarget->inMips16Mode())
+    for (std::map<
+             const char *,
+             const llvm::Mips16HardFloatInfo::FuncSignature *>::const_iterator
+             it = MipsFI->StubsNeeded.begin();
+         it != MipsFI->StubsNeeded.end(); ++it) {
+      const char *Symbol = it->first;
+      const llvm::Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+      if (StubsNeeded.find(Symbol) == StubsNeeded.end())
+        StubsNeeded[Symbol] = Signature;
+    }
+  MCP = MF.getConstantPool();
+
+  // In NaCl, all indirect jump targets must be aligned to bundle size.
+  if (Subtarget->isTargetNaCl())
+    NaClAlignIndirectJumpTargets(MF);
+
+  AsmPrinter::runOnMachineFunction(MF);
+  return true;
+}
+
+bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
+  MCOp = MCInstLowering.LowerOperand(MO);
+  return MCOp.isValid();
+}
+
+#include "MipsGenMCPseudoLowering.inc"
+
+// Lower PseudoReturn/PseudoIndirectBranch/PseudoIndirectBranch64 to JR, JR_MM,
+// JALR, or JALR64 as appropriate for the target
+void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+                                              const MachineInstr *MI) {
+  bool HasLinkReg = false;
+  MCInst TmpInst0;
+
+  if (Subtarget->hasMips64r6()) {
+    // MIPS64r6 should use (JALR64 ZERO_64, $rs)
+    TmpInst0.setOpcode(Mips::JALR64);
+    HasLinkReg = true;
+  } else if (Subtarget->hasMips32r6()) {
+    // MIPS32r6 should use (JALR ZERO, $rs)
+    TmpInst0.setOpcode(Mips::JALR);
+    HasLinkReg = true;
+  } else if (Subtarget->inMicroMipsMode())
+    // microMIPS should use (JR_MM $rs)
+    TmpInst0.setOpcode(Mips::JR_MM);
+  else {
+    // Everything else should use (JR $rs)
+    TmpInst0.setOpcode(Mips::JR);
+  }
+
+  MCOperand MCOp;
+
+  if (HasLinkReg) {
+    unsigned ZeroReg = Subtarget->isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+    TmpInst0.addOperand(MCOperand::CreateReg(ZeroReg));
+  }
+
+  lowerOperand(MI->getOperand(0), MCOp);
+  TmpInst0.addOperand(MCOp);
+
+  EmitToStreamer(OutStreamer, TmpInst0);
+}
+
+void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MipsTargetStreamer &TS = getTargetStreamer();
+  TS.setCanHaveModuleDir(false);
+
+  if (MI->isDebugValue()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    PrintDebugValueComment(MI, OS);
+    return;
+  }
+
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+  if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+    // CONSTPOOL_ENTRY - This instruction represents a floating
+    //constant pool in the function.  The first operand is the ID#
+    // for this instruction, the second is the index into the
+    // MachineConstantPool that this is, the third is the size in
+    // bytes of this constant pool entry.
+    // The required alignment is specified on the basic block holding this MI.
+    //
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer.EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    return;
+  }
+
+
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+  do {
+    // Do any auto-generated pseudo lowerings.
+    if (emitPseudoExpansionLowering(OutStreamer, &*I))
+      continue;
+
+    if (I->getOpcode() == Mips::PseudoReturn ||
+        I->getOpcode() == Mips::PseudoReturn64 ||
+        I->getOpcode() == Mips::PseudoIndirectBranch ||
+        I->getOpcode() == Mips::PseudoIndirectBranch64) {
+      emitPseudoIndirectBranch(OutStreamer, &*I);
+      continue;
+    }
+
+    // The inMips16Mode() test is not permanent.
+    // Some instructions are marked as pseudo right now which
+    // would make the test fail for the wrong reason but
+    // that will be fixed soon. We need this here because we are
+    // removing another test for this situation downstream in the
+    // callchain.
+    //
+    if (I->isPseudo() && !Subtarget->inMips16Mode()
+        && !isLongBranchPseudo(I->getOpcode()))
+      llvm_unreachable("Pseudo opcode found in EmitInstruction()");
+
+    MCInst TmpInst0;
+    MCInstLowering.Lower(I, TmpInst0);
+    EmitToStreamer(OutStreamer, TmpInst0);
+  } while ((++I != E) && I->isInsideBundle()); // Delay slot check
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  Mips Asm Directives
+//
+//  -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+//  Describe the stack frame.
+//
+//  -- Mask directives "(f)mask  bitmask, offset"
+//  Tells the assembler which registers are saved and where.
+//  bitmask - contain a little endian bitset indicating which registers are
+//            saved on function prologue (e.g. with a 0x80000000 mask, the
+//            assembler knows the register 31 (RA) is saved at prologue.
+//  offset  - the position before stack pointer subtraction indicating where
+//            the first saved register on prologue is located. (e.g. with a
+//
+//  Consider the following function prologue:
+//
+//    .frame  $fp,48,$ra
+//    .mask   0xc0000000,-8
+//       addiu $sp, $sp, -48
+//       sw $ra, 40($sp)
+//       sw $fp, 36($sp)
+//
+//    With a 0xc0000000 mask, the assembler knows the register 31 (RA) and
+//    30 (FP) are saved at prologue. As the save order on prologue is from
+//    left to right, RA is saved first. A -8 offset means that after the
+//    stack pointer subtration, the first register in the mask (RA) will be
+//    saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mask directives
+//===----------------------------------------------------------------------===//
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+void MipsAsmPrinter::printSavedRegsBitmask() {
+  // CPU and FPU Saved Registers Bitmasks
+  unsigned CPUBitmask = 0, FPUBitmask = 0;
+  int CPUTopSavedRegOff, FPUTopSavedRegOff;
+
+  // Set the CPU and FPU Bitmasks
+  const MachineFrameInfo *MFI = MF->getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  // size of stack area to which FP callee-saved regs are saved.
+  unsigned CPURegSize = Mips::GPR32RegClass.getSize();
+  unsigned FGR32RegSize = Mips::FGR32RegClass.getSize();
+  unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize();
+  bool HasAFGR64Reg = false;
+  unsigned CSFPRegsSize = 0;
+  unsigned i, e = CSI.size();
+
+  // Set FPU Bitmask.
+  for (i = 0; i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (Mips::GPR32RegClass.contains(Reg))
+      break;
+
+    unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
+    if (Mips::AFGR64RegClass.contains(Reg)) {
+      FPUBitmask |= (3 << RegNum);
+      CSFPRegsSize += AFGR64RegSize;
+      HasAFGR64Reg = true;
+      continue;
+    }
+
+    FPUBitmask |= (1 << RegNum);
+    CSFPRegsSize += FGR32RegSize;
+  }
+
+  // Set CPU Bitmask.
+  for (; i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
+    CPUBitmask |= (1 << RegNum);
+  }
+
+  // FP Regs are saved right below where the virtual frame pointer points to.
+  FPUTopSavedRegOff = FPUBitmask ?
+    (HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize) : 0;
+
+  // CPU Regs are saved below FP Regs.
+  CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
+
+  MipsTargetStreamer &TS = getTargetStreamer();
+  // Print CPUBitmask
+  TS.emitMask(CPUBitmask, CPUTopSavedRegOff);
+
+  // Print FPUBitmask
+  TS.emitFMask(FPUBitmask, FPUTopSavedRegOff);
+}
+
+//===----------------------------------------------------------------------===//
+// Frame and Set directives
+//===----------------------------------------------------------------------===//
+
+/// Frame Directive
+void MipsAsmPrinter::emitFrameDirective() {
+  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+
+  unsigned stackReg  = RI.getFrameRegister(*MF);
+  unsigned returnReg = RI.getRARegister();
+  unsigned stackSize = MF->getFrameInfo()->getStackSize();
+
+  getTargetStreamer().emitFrame(stackReg, stackSize, returnReg);
+}
+
+/// Emit Set directives.
+const char *MipsAsmPrinter::getCurrentABIString() const {
+  switch (Subtarget->getTargetABI()) {
+  case MipsSubtarget::O32:  return "abi32";
+  case MipsSubtarget::N32:  return "abiN32";
+  case MipsSubtarget::N64:  return "abi64";
+  case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
+  default: llvm_unreachable("Unknown Mips ABI");
+  }
+}
+
+void MipsAsmPrinter::EmitFunctionEntryLabel() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // NaCl sandboxing requires that indirect call instructions are masked.
+  // This means that function entry points should be bundle-aligned.
+  if (Subtarget->isTargetNaCl())
+    EmitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
+
+  if (Subtarget->inMicroMipsMode())
+    TS.emitDirectiveSetMicroMips();
+  else
+    TS.emitDirectiveSetNoMicroMips();
+
+  if (Subtarget->inMips16Mode())
+    TS.emitDirectiveSetMips16();
+  else
+    TS.emitDirectiveSetNoMips16();
+
+  TS.emitDirectiveEnt(*CurrentFnSym);
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+/// EmitFunctionBodyStart - Targets can override this to emit stuff before
+/// the first basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyStart() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  MCInstLowering.Initialize(&MF->getContext());
+
+  bool IsNakedFunction =
+    MF->getFunction()->
+      getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                   Attribute::Naked);
+  if (!IsNakedFunction)
+    emitFrameDirective();
+
+  if (!IsNakedFunction)
+    printSavedRegsBitmask();
+
+  if (!Subtarget->inMips16Mode()) {
+    TS.emitDirectiveSetNoReorder();
+    TS.emitDirectiveSetNoMacro();
+    TS.emitDirectiveSetNoAt();
+  }
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyEnd() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // There are instruction for this macros, but they must
+  // always be at the function end, and we can't emit and
+  // break with BB logic.
+  if (!Subtarget->inMips16Mode()) {
+    TS.emitDirectiveSetAt();
+    TS.emitDirectiveSetMacro();
+    TS.emitDirectiveSetReorder();
+  }
+  TS.emitDirectiveEnd(CurrentFnSym->getName());
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+/// isBlockOnlyReachableByFallthough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                       MBB) const {
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *MBB->pred_begin();
+
+  // If the predecessor is a switch statement, assume a jump table
+  // implementation, so it is not a fall through.
+  if (const BasicBlock *bb = Pred->getBasicBlock())
+    if (isa<SwitchInst>(bb->getTerminator()))
+      return false;
+
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it.
+  if (MBB->isLandingPad() || MBB->pred_empty())
+    return false;
+
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+
+  if (PI2 != MBB->pred_end())
+    return false;
+
+  // The predecessor has to be immediately before this block.
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+
+  // If the block is completely empty, then it definitely does fall through.
+  if (Pred->empty())
+    return true;
+
+  // Otherwise, check the last instruction.
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator()) ;
+
+  return !I->isBarrier();
+}
+
+// Print out an operand for an inline asm expression.
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                     unsigned AsmVariant,const char *ExtraCode,
+                                     raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNum);
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O);
+    case 'X': // hex const int
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << "0x" << StringRef(utohexstr(MO.getImm())).lower();
+      return false;
+    case 'x': // hex const int (low 16 bits)
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << "0x" << StringRef(utohexstr(MO.getImm() & 0xffff)).lower();
+      return false;
+    case 'd': // decimal const int
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm();
+      return false;
+    case 'm': // decimal const int minus 1
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm() - 1;
+      return false;
+    case 'z': {
+      // $0 if zero, regular printing otherwise
+      if (MO.getType() != MachineOperand::MO_Immediate)
+        return true;
+      int64_t Val = MO.getImm();
+      if (Val)
+        O << Val;
+      else
+        O << "$0";
+      return false;
+    }
+    case 'D': // Second part of a double word register operand
+    case 'L': // Low order register of a double word register operand
+    case 'M': // High order register of a double word register operand
+    {
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      // Number of registers represented by this operand. We are looking
+      // for 2 for 32 bit mode and 1 for 64 bit mode.
+      if (NumVals != 2) {
+        if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) {
+          unsigned Reg = MO.getReg();
+          O << '$' << MipsInstPrinter::getRegisterName(Reg);
+          return false;
+        }
+        return true;
+      }
+
+      unsigned RegOp = OpNum;
+      if (!Subtarget->isGP64bit()){
+        // Endianess reverses which register holds the high or low value
+        // between M and L.
+        switch(ExtraCode[0]) {
+        case 'M':
+          RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum;
+          break;
+        case 'L':
+          RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1;
+          break;
+        case 'D': // Always the second part
+          RegOp = OpNum + 1;
+        }
+        if (RegOp >= MI->getNumOperands())
+          return true;
+        const MachineOperand &MO = MI->getOperand(RegOp);
+        if (!MO.isReg())
+          return true;
+        unsigned Reg = MO.getReg();
+        O << '$' << MipsInstPrinter::getRegisterName(Reg);
+        return false;
+      }
+    }
+    case 'w':
+      // Print MSA registers for the 'f' constraint
+      // In LLVM, the 'w' modifier doesn't need to do anything.
+      // We can just call printOperand as normal.
+      break;
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                           unsigned OpNum, unsigned AsmVariant,
+                                           const char *ExtraCode,
+                                           raw_ostream &O) {
+  int Offset = 0;
+  // Currently we are expecting either no ExtraCode or 'D'
+  if (ExtraCode) {
+    if (ExtraCode[0] == 'D')
+      Offset = 4;
+    else
+      return true; // Unknown modifier.
+  }
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << Offset << "($" << MipsInstPrinter::getRegisterName(MO.getReg()) << ")";
+
+  return false;
+}
+
+void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                  raw_ostream &O) {
+  const DataLayout *DL = TM.getDataLayout();
+  const MachineOperand &MO = MI->getOperand(opNum);
+  bool closeP = false;
+
+  if (MO.getTargetFlags())
+    closeP = true;
+
+  switch(MO.getTargetFlags()) {
+  case MipsII::MO_GPREL:    O << "%gp_rel("; break;
+  case MipsII::MO_GOT_CALL: O << "%call16("; break;
+  case MipsII::MO_GOT:      O << "%got(";    break;
+  case MipsII::MO_ABS_HI:   O << "%hi(";     break;
+  case MipsII::MO_ABS_LO:   O << "%lo(";     break;
+  case MipsII::MO_TLSGD:    O << "%tlsgd(";  break;
+  case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
+  case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
+  case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
+  case MipsII::MO_GPOFF_HI: O << "%hi(%neg(%gp_rel("; break;
+  case MipsII::MO_GPOFF_LO: O << "%lo(%neg(%gp_rel("; break;
+  case MipsII::MO_GOT_DISP: O << "%got_disp("; break;
+  case MipsII::MO_GOT_PAGE: O << "%got_page("; break;
+  case MipsII::MO_GOT_OFST: O << "%got_ofst("; break;
+  }
+
+  switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      O << '$'
+        << StringRef(MipsInstPrinter::getRegisterName(MO.getReg())).lower();
+      break;
+
+    case MachineOperand::MO_Immediate:
+      O << MO.getImm();
+      break;
+
+    case MachineOperand::MO_MachineBasicBlock:
+      O << *MO.getMBB()->getSymbol();
+      return;
+
+    case MachineOperand::MO_GlobalAddress:
+      O << *getSymbol(MO.getGlobal());
+      break;
+
+    case MachineOperand::MO_BlockAddress: {
+      MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+      O << BA->getName();
+      break;
+    }
+
+    case MachineOperand::MO_ConstantPoolIndex:
+      O << DL->getPrivateGlobalPrefix() << "CPI"
+        << getFunctionNumber() << "_" << MO.getIndex();
+      if (MO.getOffset())
+        O << "+" << MO.getOffset();
+      break;
+
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  if (closeP) O << ")";
+}
+
+void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MipsAsmPrinter::printUnsignedImm8(const MachineInstr *MI, int opNum,
+                                       raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)(unsigned char)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MipsAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsAsmPrinter::
+printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  // when using stack locations for not load/store instructions
+  // print the same way as all normal 3 operand instructions.
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+  return;
+}
+
+void MipsAsmPrinter::
+printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
+}
+
+void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  // TODO: Need to add -mabicalls and -mno-abicalls flags.
+  // Currently we assume that -mabicalls is the default.
+  bool IsABICalls = true;
+  if (IsABICalls) {
+    getTargetStreamer().emitDirectiveAbiCalls();
+    Reloc::Model RM = TM.getRelocationModel();
+    // FIXME: This condition should be a lot more complicated that it is here.
+    //        Ideally it should test for properties of the ABI and not the ABI
+    //        itself.
+    //        For the moment, I'm only correcting enough to make MIPS-IV work.
+    if (RM == Reloc::Static && !Subtarget->isABI_N64())
+      getTargetStreamer().emitDirectiveOptionPic0();
+  }
+
+  // Tell the assembler which ABI we are using
+  std::string SectionName = std::string(".mdebug.") + getCurrentABIString();
+  OutStreamer.SwitchSection(OutContext.getELFSection(
+      SectionName, ELF::SHT_PROGBITS, 0, SectionKind::getDataRel()));
+
+  // NaN: At the moment we only support:
+  // 1. .nan legacy (default)
+  // 2. .nan 2008
+  Subtarget->isNaN2008() ? getTargetStreamer().emitDirectiveNaN2008()
+    : getTargetStreamer().emitDirectiveNaNLegacy();
+
+  // TODO: handle O64 ABI
+
+  if (Subtarget->isABI_EABI()) {
+    if (Subtarget->isGP32bit())
+      OutStreamer.SwitchSection(
+          OutContext.getELFSection(".gcc_compiled_long32", ELF::SHT_PROGBITS, 0,
+                                   SectionKind::getDataRel()));
+    else
+      OutStreamer.SwitchSection(
+          OutContext.getELFSection(".gcc_compiled_long64", ELF::SHT_PROGBITS, 0,
+                                   SectionKind::getDataRel()));
+  }
+
+  getTargetStreamer().updateABIInfo(*Subtarget);
+
+  // We should always emit a '.module fp=...' but binutils 2.24 does not accept
+  // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
+  // -mfp64) and omit it otherwise.
+  if (Subtarget->isABI_O32() && (Subtarget->isABI_FPXX() ||
+                                 Subtarget->isFP64bit()))
+    getTargetStreamer().emitDirectiveModuleFP();
+
+  // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
+  // accept it. We therefore emit it when it contradicts the default or an
+  // option has changed the default (i.e. FPXX) and omit it otherwise.
+  if (Subtarget->isABI_O32() && (!Subtarget->useOddSPReg() ||
+                                 Subtarget->isABI_FPXX()))
+    getTargetStreamer().emitDirectiveModuleOddSPReg(Subtarget->useOddSPReg(),
+                                                    Subtarget->isABI_O32());
+}
+
+void MipsAsmPrinter::EmitJal(MCSymbol *Symbol) {
+  MCInst I;
+  I.setOpcode(Mips::JAL);
+  I.addOperand(
+      MCOperand::CreateExpr(MCSymbolRefExpr::Create(Symbol, OutContext)));
+  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
+
+void MipsAsmPrinter::EmitInstrReg(unsigned Opcode, unsigned Reg) {
+  MCInst I;
+  I.setOpcode(Opcode);
+  I.addOperand(MCOperand::CreateReg(Reg));
+  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
+
+void MipsAsmPrinter::EmitInstrRegReg(unsigned Opcode, unsigned Reg1,
+                                     unsigned Reg2) {
+  MCInst I;
+  //
+  // Because of the current td files for Mips32, the operands for MTC1
+  // appear backwards from their normal assembly order. It's not a trivial
+  // change to fix this in the td file so we adjust for it here.
+  //
+  if (Opcode == Mips::MTC1) {
+    unsigned Temp = Reg1;
+    Reg1 = Reg2;
+    Reg2 = Temp;
+  }
+  I.setOpcode(Opcode);
+  I.addOperand(MCOperand::CreateReg(Reg1));
+  I.addOperand(MCOperand::CreateReg(Reg2));
+  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
+
+void MipsAsmPrinter::EmitInstrRegRegReg(unsigned Opcode, unsigned Reg1,
+                                        unsigned Reg2, unsigned Reg3) {
+  MCInst I;
+  I.setOpcode(Opcode);
+  I.addOperand(MCOperand::CreateReg(Reg1));
+  I.addOperand(MCOperand::CreateReg(Reg2));
+  I.addOperand(MCOperand::CreateReg(Reg3));
+  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
+
+void MipsAsmPrinter::EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1,
+                                      unsigned Reg2, unsigned FPReg1,
+                                      unsigned FPReg2, bool LE) {
+  if (!LE) {
+    unsigned temp = Reg1;
+    Reg1 = Reg2;
+    Reg2 = temp;
+  }
+  EmitInstrRegReg(MovOpc, Reg1, FPReg1);
+  EmitInstrRegReg(MovOpc, Reg2, FPReg2);
+}
+
+void MipsAsmPrinter::EmitSwapFPIntParams(Mips16HardFloatInfo::FPParamVariant PV,
+                                         bool LE, bool ToFP) {
+  using namespace Mips16HardFloatInfo;
+  unsigned MovOpc = ToFP ? Mips::MTC1 : Mips::MFC1;
+  switch (PV) {
+  case FSig:
+    EmitInstrRegReg(MovOpc, Mips::A0, Mips::F12);
+    break;
+  case FFSig:
+    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F14, LE);
+    break;
+  case FDSig:
+    EmitInstrRegReg(MovOpc, Mips::A0, Mips::F12);
+    EmitMovFPIntPair(MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    break;
+  case DSig:
+    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    break;
+  case DDSig:
+    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitMovFPIntPair(MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    break;
+  case DFSig:
+    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitInstrRegReg(MovOpc, Mips::A2, Mips::F14);
+    break;
+  case NoSig:
+    return;
+  }
+}
+
+void
+MipsAsmPrinter::EmitSwapFPIntRetval(Mips16HardFloatInfo::FPReturnVariant RV,
+                                    bool LE) {
+  using namespace Mips16HardFloatInfo;
+  unsigned MovOpc = Mips::MFC1;
+  switch (RV) {
+  case FRet:
+    EmitInstrRegReg(MovOpc, Mips::V0, Mips::F0);
+    break;
+  case DRet:
+    EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    break;
+  case CFRet:
+    EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    break;
+  case CDRet:
+    EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F2, Mips::F3, LE);
+    break;
+  case NoFPRet:
+    break;
+  }
+}
+
+void MipsAsmPrinter::EmitFPCallStub(
+    const char *Symbol, const Mips16HardFloatInfo::FuncSignature *Signature) {
+  MCSymbol *MSymbol = OutContext.GetOrCreateSymbol(StringRef(Symbol));
+  using namespace Mips16HardFloatInfo;
+  bool LE = Subtarget->isLittle();
+  //
+  // .global xxxx
+  //
+  OutStreamer.EmitSymbolAttribute(MSymbol, MCSA_Global);
+  const char *RetType;
+  //
+  // make the comment field identifying the return and parameter
+  // types of the floating point stub
+  // # Stub function to call rettype xxxx (params)
+  //
+  switch (Signature->RetSig) {
+  case FRet:
+    RetType = "float";
+    break;
+  case DRet:
+    RetType = "double";
+    break;
+  case CFRet:
+    RetType = "complex";
+    break;
+  case CDRet:
+    RetType = "double complex";
+    break;
+  case NoFPRet:
+    RetType = "";
+    break;
+  }
+  const char *Parms;
+  switch (Signature->ParamSig) {
+  case FSig:
+    Parms = "float";
+    break;
+  case FFSig:
+    Parms = "float, float";
+    break;
+  case FDSig:
+    Parms = "float, double";
+    break;
+  case DSig:
+    Parms = "double";
+    break;
+  case DDSig:
+    Parms = "double, double";
+    break;
+  case DFSig:
+    Parms = "double, float";
+    break;
+  case NoSig:
+    Parms = "";
+    break;
+  }
+  OutStreamer.AddComment("\t# Stub function to call " + Twine(RetType) + " " +
+                         Twine(Symbol) + " (" + Twine(Parms) + ")");
+  //
+  // probably not necessary but we save and restore the current section state
+  //
+  OutStreamer.PushSection();
+  //
+  // .section mips16.call.fpxxxx,"ax",@progbits
+  //
+  const MCSectionELF *M = OutContext.getELFSection(
+      ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_EXECINSTR, SectionKind::getText());
+  OutStreamer.SwitchSection(M, nullptr);
+  //
+  // .align 2
+  //
+  OutStreamer.EmitValueToAlignment(4);
+  MipsTargetStreamer &TS = getTargetStreamer();
+  //
+  // .set nomips16
+  // .set nomicromips
+  //
+  TS.emitDirectiveSetNoMips16();
+  TS.emitDirectiveSetNoMicroMips();
+  //
+  // .ent __call_stub_fp_xxxx
+  // .type  __call_stub_fp_xxxx,@function
+  //  __call_stub_fp_xxxx:
+  //
+  std::string x = "__call_stub_fp_" + std::string(Symbol);
+  MCSymbol *Stub = OutContext.GetOrCreateSymbol(StringRef(x));
+  TS.emitDirectiveEnt(*Stub);
+  MCSymbol *MType =
+      OutContext.GetOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
+  OutStreamer.EmitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
+  OutStreamer.EmitLabel(Stub);
+  //
+  // we just handle non pic for now. these function will not be
+  // called otherwise. when the full stub generation is moved here
+  // we need to deal with pic.
+  //
+  if (Subtarget->getRelocationModel() == Reloc::PIC_)
+    llvm_unreachable("should not be here if we are compiling pic");
+  TS.emitDirectiveSetReorder();
+  //
+  // We need to add a MipsMCExpr class to MCTargetDesc to fully implement
+  // stubs without raw text but this current patch is for compiler generated
+  // functions and they all return some value.
+  // The calling sequence for non pic is different in that case and we need
+  // to implement %lo and %hi in order to handle the case of no return value
+  // See the corresponding method in Mips16HardFloat for details.
+  //
+  // mov the return address to S2.
+  // we have no stack space to store it and we are about to make another call.
+  // We need to make sure that the enclosing function knows to save S2
+  // This should have already been handled.
+  //
+  // Mov $18, $31
+
+  EmitInstrRegRegReg(Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO);
+
+  EmitSwapFPIntParams(Signature->ParamSig, LE, true);
+
+  // Jal xxxx
+  //
+  EmitJal(MSymbol);
+
+  // fix return values
+  EmitSwapFPIntRetval(Signature->RetSig, LE);
+  //
+  // do the return
+  // if (Signature->RetSig == NoFPRet)
+  //  llvm_unreachable("should not be any stubs here with no return value");
+  // else
+  EmitInstrReg(Mips::JR, Mips::S2);
+
+  MCSymbol *Tmp = OutContext.CreateTempSymbol();
+  OutStreamer.EmitLabel(Tmp);
+  const MCSymbolRefExpr *E = MCSymbolRefExpr::Create(Stub, OutContext);
+  const MCSymbolRefExpr *T = MCSymbolRefExpr::Create(Tmp, OutContext);
+  const MCExpr *T_min_E = MCBinaryExpr::CreateSub(T, E, OutContext);
+  OutStreamer.EmitELFSize(Stub, T_min_E);
+  TS.emitDirectiveEnd(x);
+  OutStreamer.PopSection();
+}
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  // Emit needed stubs
+  //
+  for (std::map<
+           const char *,
+           const llvm::Mips16HardFloatInfo::FuncSignature *>::const_iterator
+           it = StubsNeeded.begin();
+       it != StubsNeeded.end(); ++it) {
+    const char *Symbol = it->first;
+    const llvm::Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+    EmitFPCallStub(Symbol, Signature);
+  }
+  // return to the text section
+  OutStreamer.SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
+}
+
+void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                           raw_ostream &OS) {
+  // TODO: implement
+}
+
+// Align all targets of indirect branches on bundle size.  Used only if target
+// is NaCl.
+void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
+  // Align all blocks that are jumped to through jump table.
+  if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) {
+    const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables();
+    for (unsigned I = 0; I < JT.size(); ++I) {
+      const std::vector<MachineBasicBlock*> &MBBs = JT[I].MBBs;
+
+      for (unsigned J = 0; J < MBBs.size(); ++J)
+        MBBs[J]->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+    }
+  }
+
+  // If basic block address is taken, block can be target of indirect branch.
+  for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+                                 MBB != E; ++MBB) {
+    if (MBB->hasAddressTaken())
+      MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+  }
+}
+
+bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
+  return (Opcode == Mips::LONG_BRANCH_LUi
+          || Opcode == Mips::LONG_BRANCH_ADDiu
+          || Opcode == Mips::LONG_BRANCH_DADDiu);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMipsAsmPrinter() {
+  RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
+  RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget);
+  RegisterAsmPrinter<MipsAsmPrinter> A(TheMips64Target);
+  RegisterAsmPrinter<MipsAsmPrinter> B(TheMips64elTarget);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
new file mode 100644
index 0000000..abbd39b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -0,0 +1,144 @@
+//===-- MipsAsmPrinter.h - Mips LLVM Assembly Printer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Mips Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSASMPRINTER_H
+#define MIPSASMPRINTER_H
+
+#include "Mips16HardFloatInfo.h"
+#include "MipsMCInstLower.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineInstr;
+class MachineBasicBlock;
+class MipsTargetStreamer;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+  MipsTargetStreamer &getTargetStreamer();
+
+  void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+
+private:
+  // tblgen'erated function.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch,
+  // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate
+  // for the target.
+  void emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+                                const MachineInstr *MI);
+
+  // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+
+  std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+  StubsNeeded;
+
+  void EmitJal(MCSymbol *Symbol);
+
+  void EmitInstrReg(unsigned Opcode, unsigned Reg);
+
+  void EmitInstrRegReg(unsigned Opcode, unsigned Reg1, unsigned Reg2);
+
+  void EmitInstrRegRegReg(unsigned Opcode, unsigned Reg1, unsigned Reg2,
+                          unsigned Reg3);
+
+  void EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1, unsigned Reg2,
+                        unsigned FPReg1, unsigned FPReg2, bool LE);
+
+  void EmitSwapFPIntParams(Mips16HardFloatInfo::FPParamVariant, bool LE,
+                           bool ToFP);
+
+  void EmitSwapFPIntRetval(Mips16HardFloatInfo::FPReturnVariant, bool LE);
+
+  void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *);
+
+  void NaClAlignIndirectJumpTargets(MachineFunction &MF);
+
+  bool isLongBranchPseudo(int Opcode) const;
+
+public:
+
+  const MipsSubtarget *Subtarget;
+  const MipsFunctionInfo *MipsFI;
+  MipsMCInstLower MCInstLowering;
+
+  // We initialize the subtarget here and in runOnMachineFunction
+  // since there are certain target specific flags (ABI) that could
+  // reside on the TargetMachine, but are on the subtarget currently
+  // and we need them for the beginning of file output before we've
+  // seen a single function.
+  explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false),
+        Subtarget(&TM.getSubtarget<MipsSubtarget>()), MCInstLowering(*this) {}
+
+  const char *getPassName() const override {
+    return "Mips Assembly Printer";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void EmitConstantPool() override {
+    bool UsingConstantPools =
+      (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
+    if (!UsingConstantPools)
+      AsmPrinter::EmitConstantPool();
+    // we emit constant pools customly!
+  }
+
+  void EmitInstruction(const MachineInstr *MI) override;
+  void printSavedRegsBitmask();
+  void emitFrameDirective();
+  const char *getCurrentABIString() const;
+  void EmitFunctionEntryLabel() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  bool isBlockOnlyReachableByFallthrough(
+                                   const MachineBasicBlock* MBB) const override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const char *Modifier = nullptr);
+  void EmitStartOfAsmFile(Module &M) override;
+  void EmitEndOfAsmFile(Module &M) override;
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 0000000..b1cd3c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,269 @@
+//===-- MipsCallingConv.td - Calling Conventions for Mips --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>:
+  CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Mips O32 Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Only the return rules are defined here for O32. The rules for argument
+// passing are defined in MipsISelLowering.cpp.
+def RetCC_MipsO32 : CallingConv<[
+  // i32 are returned in registers V0, V1, A0, A1
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 arguments are returned in D0_64 and D2_64 in FP64bit mode or
+  // in D0 and D1 in FP32bit mode.
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D2_64]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()", CCAssignToReg<[D0, D1]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips N32/64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsN : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToRegWithShadow<[A0, A1, A2, A3,
+                                           T0, T1, T2, T3],
+                                          [F12, F13, F14, F15,
+                                           F16, F17, F18, F19]>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64],
+                                          [D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64]>>,
+
+  // f32 arguments are passed in single precision FP registers.
+  CCIfType<[f32], CCAssignToRegWithShadow<[F12, F13, F14, F15,
+                                           F16, F17, F18, F19],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // f64 arguments are passed in double precision FP registers.
+  CCIfType<[f64], CCAssignToRegWithShadow<[D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 8>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// N32/64 variable arguments.
+// All arguments are passed in integer registers.
+def CC_MipsN_VarArg : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  CCIfType<[i32, f32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  CCIfType<[i64, f64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64,
+                                      T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 8>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_MipsN : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // i64 are returned in registers V0_64, V1_64
+  CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 are returned in registers D0, D2
+  CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
+]>;
+
+// In soft-mode, register A0_64, instead of V1_64, is used to return a long
+// double value.
+def RetCC_F128Soft : CallingConv<[
+  CCIfType<[i64], CCAssignToReg<[V0_64, A0_64]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips EABI Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsEABI : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  // Single fp arguments are passed in pairs within 32-bit mode
+  CCIfType<[f32], CCIfSubtarget<"isSingleFloat()",
+                  CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>,
+
+  CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()",
+                  CCAssignToReg<[F12, F14, F16, F18]>>>,
+
+  // The first 4 double fp arguments are passed in single fp registers.
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()",
+                  CCAssignToReg<[D6, D7, D8, D9]>>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Integer values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToStack<8, 8>>>
+]>;
+
+def RetCC_MipsEABI : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // f32 are returned in registers F0, F1
+  CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
+
+  // f64 are returned in register D0
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips FastCC Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_MipsO32_FastCC : CallingConv<[
+  // f64 arguments are passed in double-precision floating pointer registers.
+  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()",
+                                CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7,
+                                               D8, D9]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()",
+                                CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
+                                               D4_64, D5_64, D6_64, D7_64,
+                                               D8_64, D9_64, D10_64, D11_64,
+                                               D12_64, D13_64, D14_64, D15_64,
+                                               D16_64, D17_64, D18_64,
+                                               D19_64]>>>,
+
+  // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_MipsN_FastCC : CallingConv<[
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64,
+                                 T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+                                 T8_64, V1_64]>>,
+
+  // f64 arguments are passed in double-precision floating pointer registers.
+  CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64,
+                                 D6_64, D7_64, D8_64, D9_64, D10_64, D11_64,
+                                 D12_64, D13_64, D14_64, D15_64, D16_64, D17_64,
+                                 D18_64, D19_64]>>,
+
+  // Stack parameter slots for i64 and f64 are 64-bit doublewords and
+  // 8-byte aligned.
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_Mips_FastCC : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers. All scratch registers,
+  // except for AT, V0 and T9, are available to be used as argument registers.
+  CCIfType<[i32], CCIfSubtarget<"isNotTargetNaCl()",
+      CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>,
+
+  // In NaCl, T6, T7 and T8 are reserved and not available as argument
+  // registers for fastcc.  T6 contains the mask for sandboxing control flow
+  // (indirect jumps and calls).  T7 contains the mask for sandboxing memory
+  // accesses (loads and stores).  T8 contains the thread pointer.
+  CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()",
+      CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>,
+
+  // f32 arguments are passed in single-precision floating pointer registers.
+  CCIfType<[f32], CCIfSubtarget<"useOddSPReg()",
+      CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13,
+                     F14, F15, F16, F17, F18, F19]>>>,
+
+  // Don't use odd numbered single-precision registers for -mno-odd-spreg.
+  CCIfType<[f32], CCIfSubtarget<"noOddSPReg()",
+      CCAssignToReg<[F0, F2, F4, F6, F8, F10, F12, F14, F16, F18]>>>,
+
+  // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>,
+  CCDelegateTo<CC_MipsN_FastCC>
+]>;
+
+//==
+
+def CC_Mips16RetHelper : CallingConv<[
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def RetCC_Mips : CallingConv<[
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
+  CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
+  CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
+  CCDelegateTo<RetCC_MipsO32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
+                                               (sequence "S%u", 7, 0))>;
+
+def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+                                        (sequence "S%u", 7, 0))> {
+  let OtherPreserved = (add (decimate (sequence "F%u", 30, 20), 2));
+}
+
+def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+                                   (sequence "S%u", 7, 0))>;
+
+def CSR_O32_FP64 :
+  CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP,
+                       (sequence "S%u", 7, 0))>;
+
+def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64,
+                                   D30_64, RA_64, FP_64, GP_64,
+                                   (sequence "S%u_64", 7, 0))>;
+
+def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
+                                   GP_64, (sequence "S%u_64", 7, 0))>;
+
+def CSR_Mips16RetHelper :
+  CalleeSavedRegs<(add V0, V1, FP,
+                   (sequence "A%u", 3, 0), (sequence "S%u", 7, 0),
+                   (sequence "D%u", 15, 10))>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MipsCodeEmitter.cpp
new file mode 100644
index 0000000..794c718
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -0,0 +1,481 @@
+//===-- Mips/MipsCodeEmitter.cpp - Convert Mips Code to Machine Code ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Mips machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class MipsCodeEmitter : public MachineFunctionPass {
+  MipsJITInfo *JTI;
+  const MipsInstrInfo *II;
+  const DataLayout *TD;
+  const MipsSubtarget *Subtarget;
+  TargetMachine &TM;
+  JITCodeEmitter &MCE;
+  const std::vector<MachineConstantPoolEntry> *MCPEs;
+  const std::vector<MachineJumpTableEntry> *MJTEs;
+  bool IsPIC;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfo> ();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+public:
+  MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+    : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr),
+      TM(tm), MCE(mce), MCPEs(nullptr), MJTEs(nullptr),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "Mips Machine Code Emitter";
+  }
+
+  /// getBinaryCodeForInstr - This function, generated by the
+  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+  /// machine instructions.
+  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+  void emitInstruction(MachineBasicBlock::instr_iterator MI,
+                       MachineBasicBlock &MBB);
+
+private:
+
+  void emitWord(unsigned Word);
+
+  /// Routines that handle operands which add machine relocations which are
+  /// fixed up by the relocation stage.
+  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                         bool MayNeedFarStub) const;
+  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+  void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
+  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MachineInstr &MI,
+                             const MachineOperand &MO) const;
+
+  unsigned getRelocation(const MachineInstr &MI,
+                         const MachineOperand &MO) const;
+
+  unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getJumpTargetOpValueMM(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getBranchTargetOpValueMM(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+
+  unsigned getBranchTarget21OpValue(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+  unsigned getBranchTarget26OpValue(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+  unsigned getJumpOffset16OpValue(const MachineInstr &MI, unsigned OpNo) const;
+
+  unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getMSAMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSimm18Lsl3Encoding(const MachineInstr &MI, unsigned OpNo) const;
+
+  /// Expand pseudo instructions with accumulator register operands.
+  void expandACCInstr(MachineBasicBlock::instr_iterator MI,
+                      MachineBasicBlock &MBB, unsigned Opc) const;
+
+  void expandPseudoIndirectBranch(MachineBasicBlock::instr_iterator MI,
+                                  MachineBasicBlock &MBB) const;
+
+  /// \brief Expand pseudo instruction. Return true if MI was expanded.
+  bool expandPseudos(MachineBasicBlock::instr_iterator &MI,
+                     MachineBasicBlock &MBB) const;
+};
+}
+
+char MipsCodeEmitter::ID = 0;
+
+bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  MipsTargetMachine &Target = static_cast<MipsTargetMachine &>(
+                                const_cast<TargetMachine &>(MF.getTarget()));
+
+  JTI = Target.getJITInfo();
+  II = Target.getInstrInfo();
+  TD = Target.getDataLayout();
+  Subtarget = &TM.getSubtarget<MipsSubtarget> ();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  MJTEs = nullptr;
+  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+  JTI->Initialize(MF, IsPIC, Subtarget->isLittle());
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+        << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+        MBB != E; ++MBB){
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+           E = MBB->instr_end(); I != E;)
+        emitInstruction(*I++, *MBB);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+unsigned MipsCodeEmitter::getRelocation(const MachineInstr &MI,
+                                        const MachineOperand &MO) const {
+  // NOTE: This relocations are for static.
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+  uint64_t Form = TSFlags & MipsII::FormMask;
+  if (Form == MipsII::FrmJ)
+    return Mips::reloc_mips_26;
+  if ((Form == MipsII::FrmI || Form == MipsII::FrmFI)
+       && MI.isBranch())
+    return Mips::reloc_mips_pc16;
+  if (Form == MipsII::FrmI && MI.getOpcode() == Mips::LUi)
+    return Mips::reloc_mips_hi;
+  return Mips::reloc_mips_lo;
+}
+
+unsigned MipsCodeEmitter::getJumpTargetOpValue(const MachineInstr &MI,
+                                               unsigned OpNo) const {
+  MachineOperand MO = MI.getOperand(OpNo);
+  if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unexpected jump target operand kind.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getJumpTargetOpValueMM(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTarget21OpValue(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTarget26OpValue(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getJumpOffset16OpValue(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  MachineOperand MO = MI.getOperand(OpNo);
+  emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getMemEncoding(const MachineInstr &MI,
+                                         unsigned OpNo) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo)) << 16;
+  return (getMachineOpValue(MI, MI.getOperand(OpNo+1)) & 0xFFFF) | RegBits;
+}
+
+unsigned MipsCodeEmitter::getMemEncodingMMImm12(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getMSAMemEncoding(const MachineInstr &MI,
+                                            unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  // size is encoded as size-1.
+  return getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
+}
+
+unsigned MipsCodeEmitter::getSizeInsEncoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  // size is encoded as pos+size-1.
+  return getMachineOpValue(MI, MI.getOperand(OpNo-1)) +
+         getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
+}
+
+unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
+                                            unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getSimm18Lsl3Encoding(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                            const MachineOperand &MO) const {
+  if (MO.isReg())
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isCPI())
+    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isJTI())
+    emitJumpTableAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+
+void MipsCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                                        bool MayNeedFarStub) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             const_cast<GlobalValue *>(GV), 0,
+                                             MayNeedFarStub));
+}
+
+void MipsCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, 0, 0));
+}
+
+void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, false));
+}
+
+void MipsCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTIndex, 0, false));
+}
+
+void MipsCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                            unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB));
+}
+
+void MipsCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
+                                      MachineBasicBlock &MBB) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
+
+  // Expand pseudo instruction. Skip if MI was not expanded.
+  if (((MI->getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo) &&
+      !expandPseudos(MI, MBB))
+    return;
+
+  MCE.processDebugLoc(MI->getDebugLoc(), true);
+
+  emitWord(getBinaryCodeForInstr(*MI));
+  ++NumEmitted;  // Keep track of the # of mi's emitted
+
+  MCE.processDebugLoc(MI->getDebugLoc(), false);
+}
+
+void MipsCodeEmitter::emitWord(unsigned Word) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Word) << "\n");
+  if (Subtarget->isLittle())
+    MCE.emitWordLE(Word);
+  else
+    MCE.emitWordBE(Word);
+}
+
+void MipsCodeEmitter::expandACCInstr(MachineBasicBlock::instr_iterator MI,
+                                     MachineBasicBlock &MBB,
+                                     unsigned Opc) const {
+  // Expand "pseudomult $ac0, $t0, $t1" to "mult $t0, $t1".
+  BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Opc))
+    .addReg(MI->getOperand(1).getReg()).addReg(MI->getOperand(2).getReg());
+}
+
+void MipsCodeEmitter::expandPseudoIndirectBranch(
+    MachineBasicBlock::instr_iterator MI, MachineBasicBlock &MBB) const {
+  // This logic is duplicated from MipsAsmPrinter::emitPseudoIndirectBranch()
+  bool HasLinkReg = false;
+  unsigned Opcode = 0;
+
+  if (Subtarget->hasMips64r6()) {
+    // MIPS64r6 should use (JALR64 ZERO_64, $rs)
+    Opcode = Mips::JALR64;
+    HasLinkReg = true;
+  } else if (Subtarget->hasMips32r6()) {
+    // MIPS32r6 should use (JALR ZERO, $rs)
+    Opcode = Mips::JALR;
+    HasLinkReg = true;
+  } else if (Subtarget->inMicroMipsMode())
+    // microMIPS should use (JR_MM $rs)
+    Opcode = Mips::JR_MM;
+  else {
+    // Everything else should use (JR $rs)
+    Opcode = Mips::JR;
+  }
+
+  auto MIB = BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Opcode));
+
+  if (HasLinkReg) {
+    unsigned ZeroReg = Subtarget->isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+    MIB.addReg(ZeroReg);
+  }
+
+  MIB.addReg(MI->getOperand(0).getReg());
+}
+
+bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI,
+                                    MachineBasicBlock &MBB) const {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unhandled pseudo");
+    return false;
+  case Mips::NOP:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO)
+      .addReg(Mips::ZERO).addImm(0);
+    break;
+  case Mips::B:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BEQ)).addReg(Mips::ZERO)
+      .addReg(Mips::ZERO).addOperand(MI->getOperand(0));
+    break;
+  case Mips::TRAP:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BREAK)).addImm(0)
+      .addImm(0);
+    break;
+  case Mips::JALRPseudo:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA)
+      .addReg(MI->getOperand(0).getReg());
+    break;
+  case Mips::PseudoMULT:
+    expandACCInstr(MI, MBB, Mips::MULT);
+    break;
+  case Mips::PseudoMULTu:
+    expandACCInstr(MI, MBB, Mips::MULTu);
+    break;
+  case Mips::PseudoSDIV:
+    expandACCInstr(MI, MBB, Mips::SDIV);
+    break;
+  case Mips::PseudoUDIV:
+    expandACCInstr(MI, MBB, Mips::UDIV);
+    break;
+  case Mips::PseudoMADD:
+    expandACCInstr(MI, MBB, Mips::MADD);
+    break;
+  case Mips::PseudoMADDU:
+    expandACCInstr(MI, MBB, Mips::MADDU);
+    break;
+  case Mips::PseudoMSUB:
+    expandACCInstr(MI, MBB, Mips::MSUB);
+    break;
+  case Mips::PseudoMSUBU:
+    expandACCInstr(MI, MBB, Mips::MSUBU);
+    break;
+  case Mips::PseudoReturn:
+  case Mips::PseudoReturn64:
+  case Mips::PseudoIndirectBranch:
+  case Mips::PseudoIndirectBranch64:
+      expandPseudoIndirectBranch(MI, MBB);
+      break;
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+      // Do nothing
+      return false;
+  }
+
+  (MI--)->eraseFromBundle();
+  return true;
+}
+
+/// createMipsJITCodeEmitterPass - Return a pass that emits the collected Mips
+/// code to the specified MCE object.
+FunctionPass *llvm::createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+                                                 JITCodeEmitter &JCE) {
+  return new MipsCodeEmitter(TM, JCE);
+}
+
+#include "MipsGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Mips/MipsCondMov.td b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
new file mode 100644
index 0000000..690f626
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
@@ -0,0 +1,265 @@
+//===-- MipsCondMov.td - Describe Mips Conditional Moves --*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the Conditional Moves implementation.
+//
+//===----------------------------------------------------------------------===//
+
+// Conditional moves:
+// These instructions are expanded in
+// MipsISelLowering::EmitInstrWithCustomInserter if target does not have
+// conditional move instructions.
+// cond:int, data:int
+class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
+                  InstrItinClass Itin> :
+  InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR, opstr> {
+  let Constraints = "$F = $rd";
+}
+
+// cond:int, data:float
+class CMov_I_F_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
+                  InstrItinClass Itin> :
+  InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR, opstr> {
+  let Constraints = "$F = $fd";
+}
+
+// cond:float, data:int
+class CMov_F_I_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$rd), (ins RC:$rs, FCCRegsOpnd:$fcc, RC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $fcc"),
+         [(set RC:$rd, (OpNode RC:$rs, FCCRegsOpnd:$fcc, RC:$F))],
+         Itin, FrmFR, opstr> {
+  let Constraints = "$F = $rd";
+}
+
+// cond:float, data:float
+class CMov_F_F_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fs, FCCRegsOpnd:$fcc, RC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $fcc"),
+         [(set RC:$fd, (OpNode RC:$fs, FCCRegsOpnd:$fcc, RC:$F))],
+         Itin, FrmFR, opstr> {
+  let Constraints = "$F = $fd";
+}
+
+// select patterns
+multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction SLTOp,
+                     Instruction SLTuOp, Instruction SLTiOp,
+                     Instruction SLTiuOp> {
+  def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
+  def : MipsPat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setgt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, (Plus1 imm:$rhs)), DRC:$F)>;
+  def : MipsPat<(select (i32 (setugt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lhs, (Plus1 imm:$rhs)),
+                          DRC:$F)>;
+}
+
+multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction XOROp> {
+  def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction XORiOp> {
+  def : MipsPat<
+            (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F),
+            (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>;
+}
+
+multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
+                    Instruction XOROp> {
+  def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>;
+  def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+// Instantiation of instructions.
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
+               ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+  def MOVZ_I_I64   : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
+                     ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVZ_I64_I   : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
+                     ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
+                     ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+}
+
+def MOVN_I_I       : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+  def MOVN_I_I64   : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVN_I64_I   : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+}
+
+def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
+               CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
+                 CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6,
+                 AdditionalRequires<[HasMips64]>;
+
+def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
+               CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
+                 CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6,
+                 AdditionalRequires<[IsGP64bit]>;
+
+def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+                                    II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+                 INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+                                    II_MOVN_D>, CMov_I_F_FM<19, 17>,
+                 INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
+                   CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
+                   CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  let isCodeGenOnly = 1 in {
+    def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
+                       CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+    def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
+                       CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  }
+}
+
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+             CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
+               CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6,
+               AdditionalRequires<[IsGP64bit]>;
+
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+             CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
+               CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6,
+               AdditionalRequires<[IsGP64bit]>;
+
+def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
+             CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
+def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
+             CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+                                  MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
+               INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+                                  MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
+               INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
+                 CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
+                 CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+}
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_32;
+
+defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_64;
+defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_64;
+defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_64;
diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
new file mode 100644
index 0000000..80bf573
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -0,0 +1,1719 @@
+//===-- MipsConstantIslandPass.cpp - Emit Pc Relative loads----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This pass is used to make Pc relative loads of constants.
+// For now, only Mips16 will use this. 
+//
+// Loading constants inline is expensive on Mips16 and it's in general better
+// to place the constant nearby in code space and then it can be loaded with a
+// simple 16 bit load instruction.
+//
+// The constants can be not just numbers but addresses of functions and labels.
+// This can be particularly helpful in static relocation mode for embedded
+// non-linux targets.
+//
+//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-constant-islands"
+
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
+          cl::desc("Align constant islands in code"));
+
+
+// Rather than do make check tests with huge amounts of code, we force
+// the test to use this amount.
+//
+static cl::opt<int> ConstantIslandsSmallOffset(
+  "mips-constant-islands-small-offset",
+  cl::init(0),
+  cl::desc("Make small offsets be this amount for testing purposes"),
+  cl::Hidden);
+
+//
+// For testing purposes we tell it to not use relaxed load forms so that it
+// will split blocks.
+//
+static cl::opt<bool> NoLoadRelaxation(
+  "mips-constant-islands-no-load-relaxation",
+  cl::init(false),
+  cl::desc("Don't relax loads to long loads - for testing purposes"),
+  cl::Hidden);
+
+static unsigned int branchTargetOperand(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+  case Mips::Bteqz16:
+  case Mips::BteqzX16:
+  case Mips::Btnez16:
+  case Mips::BtnezX16:
+  case Mips::JalB16:
+    return 0;
+  case Mips::BeqzRxImm16:
+  case Mips::BeqzRxImmX16:
+  case Mips::BnezRxImm16:
+  case Mips::BnezRxImmX16:
+    return 1;
+  }
+  llvm_unreachable("Unknown branch type");
+}
+
+static bool isUnconditionalBranch(unsigned int Opcode) {
+  switch (Opcode) {
+  default: return false;
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+  case Mips::JalB16:
+    return true;
+  }
+}
+
+static unsigned int longformBranchOpcode(unsigned int Opcode) {
+  switch (Opcode) {
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+    return Mips::BimmX16;
+  case Mips::Bteqz16:
+  case Mips::BteqzX16:
+    return Mips::BteqzX16;
+  case Mips::Btnez16:
+  case Mips::BtnezX16:
+    return Mips::BtnezX16;
+  case Mips::JalB16:
+    return Mips::JalB16;
+  case Mips::BeqzRxImm16:
+  case Mips::BeqzRxImmX16:
+    return Mips::BeqzRxImmX16;
+  case Mips::BnezRxImm16:
+  case Mips::BnezRxImmX16:
+    return Mips::BnezRxImmX16;
+  }
+  llvm_unreachable("Unknown branch type");
+}
+
+//
+// FIXME: need to go through this whole constant islands port and check the math
+// for branch ranges and clean this up and make some functions to calculate things
+// that are done many times identically.
+// Need to refactor some of the code to call this routine.
+//
+static unsigned int branchMaxOffsets(unsigned int Opcode) {
+  unsigned Bits, Scale;
+  switch (Opcode) {
+    case Mips::Bimm16:
+      Bits = 11;
+      Scale = 2;
+      break;
+    case Mips::BimmX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::BeqzRxImm16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BeqzRxImmX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::BnezRxImm16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BnezRxImmX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::Bteqz16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BteqzX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::Btnez16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BtnezX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    default:
+      llvm_unreachable("Unknown branch type");
+  }
+  unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+  return MaxOffs;
+}
+
+namespace {
+
+
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  /// MipsConstantIslands - Due to limited PC-relative displacements, Mips
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+
+  class MipsConstantIslands : public MachineFunctionPass {
+
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      // FIXME: ignore LogAlign for this patch
+      //
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        return PO;
+      }
+
+      BasicBlockInfo() : Offset(0), Size(0) {}
+
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+      unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
+                                // with different displacements
+      unsigned LongFormOpcode;
+    public:
+      bool NegOk;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg,
+             unsigned longformmaxdisp, unsigned longformopcode)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp),
+          LongFormMaxDisp(longformmaxdisp), LongFormOpcode(longformopcode),
+          NegOk(neg){
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      unsigned getMaxDisp() const {
+        unsigned xMaxDisp = ConstantIslandsSmallOffset?
+                            ConstantIslandsSmallOffset: MaxDisp;
+        return xMaxDisp;
+      }
+      void setMaxDisp(unsigned val) {
+        MaxDisp = val;
+      }
+      unsigned getLongFormMaxDisp() const {
+        return LongFormMaxDisp;
+      }
+      unsigned getLongFormOpcode() const {
+          return LongFormOpcode;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+  /// CPEntry - One per constant pool entry, keeping the machine instruction
+  /// pointer, the constpool index, and the number of CPUser's which
+  /// reference this entry.
+  struct CPEntry {
+    MachineInstr *CPEMI;
+    unsigned CPI;
+    unsigned RefCount;
+    CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+      : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+  };
+
+  /// CPEntries - Keep track of all of the constant pool entry machine
+  /// instructions. For each original constpool index (i.e. those that
+  /// existed upon entry to this pass), it keeps a vector of entries.
+  /// Original elements are cloned as we go along; the clones are
+  /// put in the vector of the original element, but have distinct CPIs.
+  std::vector<std::vector<CPEntry> > CPEntries;
+
+  /// ImmBranch - One per immediate branch, keeping the machine instruction
+  /// pointer, conditional or unconditional, the max displacement,
+  /// and (if isCond is true) the corresponding unconditional branch
+  /// opcode.
+  struct ImmBranch {
+    MachineInstr *MI;
+    unsigned MaxDisp : 31;
+    bool isCond : 1;
+    int UncondBr;
+    ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+      : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+  };
+
+  /// ImmBranches - Keep track of all the immediate branch instructions.
+  ///
+  std::vector<ImmBranch> ImmBranches;
+
+  /// HasFarJump - True if any far jump instruction has been emitted during
+  /// the branch fix up pass.
+  bool HasFarJump;
+
+  const TargetMachine &TM;
+  bool IsPIC;
+  unsigned ABI;
+  const MipsSubtarget *STI;
+  const Mips16InstrInfo *TII;
+  MipsFunctionInfo *MFI;
+  MachineFunction *MF;
+  MachineConstantPool *MCP;
+
+  unsigned PICLabelUId;
+  bool PrescannedForConstants;
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
+  public:
+    static char ID;
+    MipsConstantIslands(TargetMachine &tm)
+        : MachineFunctionPass(ID), TM(tm),
+          IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+          ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()), STI(nullptr),
+          MF(nullptr), MCP(nullptr), PrescannedForConstants(false) {}
+
+    const char *getPassName() const override {
+      return "Mips Constant Islands";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F) override;
+
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr *CPEMI);
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    int findLongFormInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+
+    void prescanForConstants();
+
+  private:
+
+  };
+
+  char MipsConstantIslands::ID = 0;
+} // end of anonymous namespace
+
+bool MipsConstantIslands::isOffsetInRange
+  (unsigned UserOffset, unsigned TrialOffset,
+   const CPUser &U) {
+  return isOffsetInRange(UserOffset, TrialOffset,
+                         U.getMaxDisp(), U.NegOk);
+}
+/// print block size and offset information - debugging
+void MipsConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
+  return new MipsConstantIslands(tm);
+}
+
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
+  // The intention is for this to be a mips16 only pass for now
+  // FIXME:
+  MF = &mf;
+  MCP = mf.getConstantPool();
+  STI = &mf.getTarget().getSubtarget<MipsSubtarget>();
+  DEBUG(dbgs() << "constant island machine function " << "\n");
+  if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
+    return false;
+  }
+  TII = (const Mips16InstrInfo*)MF->getTarget().getInstrInfo();
+  MFI = MF->getInfo<MipsFunctionInfo>();
+  DEBUG(dbgs() << "constant island processing " << "\n");
+  //
+  // will need to make predermination if there is any constants we need to
+  // put in constant islands. TBD.
+  //
+  if (!PrescannedForConstants) prescanForConstants();
+
+  HasFarJump = false;
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  bool MadeChange = false;
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  (void)NoBRIters;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void
+MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  if (std::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = std::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+MipsConstantIslands::CPEntry
+*MipsConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return nullptr;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  assert(CPEMI && CPEMI->getOpcode() == Mips::CONSTPOOL_ENTRY);
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  if (!AlignConstantIslands)
+    return 2;
+
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void MipsConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(I);
+
+
+  // Compute block offsets.
+  adjustBBOffsetsAfter(MF->begin());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (I->isDebugValue())
+        continue;
+
+      int Opc = I->getOpcode();
+      if (I->isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other branches for now
+        case Mips::Bimm16:
+          Bits = 11;
+          Scale = 2;
+          isCond = false;
+          break;
+        case Mips::BimmX16:
+          Bits = 16;
+          Scale = 2;
+          isCond = false;
+          break;
+        case Mips::BeqzRxImm16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BeqzRxImmX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BnezRxImm16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BnezRxImmX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::Bteqz16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BteqzX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::Btnez16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BtnezX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        }
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == Mips::CONSTPOOL_ENTRY)
+        continue;
+
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isCPI()) {
+
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          unsigned LongFormBits = 0;
+          unsigned LongFormScale = 0;
+          unsigned LongFormOpcode = 0;
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+          case Mips::LwRxPcTcp16:
+            Bits = 8;
+            Scale = 4;
+            LongFormOpcode = Mips::LwRxPcTcpX16;
+            LongFormBits = 14;
+            LongFormScale = 1;
+            break;
+          case Mips::LwRxPcTcpX16:
+            Bits = 14;
+            Scale = 1;
+            NegOk = true;
+            break;
+          }
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk,
+                                   LongFormMaxOffs, LongFormOpcode));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+
+        }
+
+    }
+  }
+
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I)
+    BBI.Size += TII->GetInstSizeInBytes(I);
+
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void MipsConstantIslands::updateForInsertedWaterBlock
+  (MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
+  return getOffsetOf(U.MI);
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
+  (MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(Mips::Bimm16)).addMBB(NewBB);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(std::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water;
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth;
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool MipsConstantIslands::isCPEntryInRange
+  (MachineInstr *MI, unsigned UserOffset,
+   MachineInstr *CPEMI, unsigned MaxDisp,
+   bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == Mips::Bimm16)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif
+
+void MipsConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned Offset = BBInfo[i - 1].Offset + BBInfo[i - 1].Size;
+    BBInfo[i].Offset = Offset;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = nullptr;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == nullptr)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// This version checks if the longer form of the instruction can be used to
+/// to satisfy things.
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findLongFormInRangeCPEntry
+  (CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
+                       U.getLongFormMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    UserMI->setDesc(TII->get(U.getLongFormOpcode()));
+    U.setMaxDisp(U.getLongFormMaxDisp());
+    return 2;  // instruction is longer length now
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == nullptr)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
+                         U.getLongFormMaxDisp(), U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case Mips::Bimm16:
+    return ((1<<10)-1)*2;
+  case Mips::BimmX16:
+    return ((1<<16)-1)*2;
+  default:
+    break;
+  }
+  return ((1<<16)-1)*2;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  
+/// To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                      water_iterator &WaterIter) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      // Keep looking unless it is perfect.
+      if (BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = 2;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = std::next(MachineFunction::iterator(UserMBB));
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = Mips::Bimm16;
+      BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += Delta;
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - 8;
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  //MachineInstr *LastIT = 0;
+  for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+  }
+
+  --MI;
+  NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = std::next(MachineFunction::iterator(WaterBB));
+
+  } else {
+    // No water found.
+    // we first see if a longer form of the instrucion could have reached
+    // the constant. in that case we won't bother to split
+    if (!NoLoadRelaxation) {
+      result = findLongFormInRangeCPEntry(U, UserOffset);
+      if (result != 0) return true;
+    }
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = createPICLabelUId();
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland)));
+
+
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool MipsConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = nullptr;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool MipsConstantIslands::isBBInRange
+  (MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
+
+unsigned PCAdj = 4;
+
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  unsigned TargetOperand = branchTargetOperand(MI);
+  MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+  // Use BL to implement far jump.
+  unsigned BimmX16MaxDisp = ((1 << 16)-1) * 2;
+  if (isBBInRange(MI, DestBB, BimmX16MaxDisp)) {
+    Br.MaxDisp = BimmX16MaxDisp;
+    MI->setDesc(TII->get(Mips::BimmX16));
+  }
+  else {
+    // need to give the math a more careful look here
+    // this is really a segment address and not
+    // a PC relative address. FIXME. But I think that
+    // just reducing the bits by 1 as I've done is correct.
+    // The basic block we are branching too much be longword aligned.
+    // we know that RA is saved because we always save it right now.
+    // this requirement will be relaxed later but we also have an alternate
+    // way to implement this that I will implement that does not need jal.
+    // We should have a way to back out this alignment restriction if we "can" later.
+    // but it is not harmful.
+    //
+    DestBB->setAlignment(2);
+    Br.MaxDisp = ((1<<24)-1) * 2;
+    MI->setDesc(TII->get(Mips::JalB16));
+  }
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  unsigned TargetOperand = branchTargetOperand(MI);
+  MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+  unsigned Opcode = MI->getOpcode();
+  unsigned LongFormOpcode = longformBranchOpcode(Opcode);
+  unsigned LongFormMaxOff = branchMaxOffsets(LongFormOpcode);
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, LongFormMaxOff)) {
+    Br.MaxDisp = LongFormMaxOff;
+    MI->setDesc(TII->get(LongFormOpcode));
+    return true;
+  }
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // bteqz L1
+  // =>
+  // bnez L2
+  // b   L1
+  // L2:
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+  unsigned OppositeBranchOpcode = TII->getOppositeBranchOpc(Opcode);
+ 
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+        isUnconditionalBranch(BMI->getOpcode())) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beqz L1
+      // b   L2
+      // =>
+      // bnez L2
+      // b   L1
+      unsigned BMITargetOperand = branchTargetOperand(BMI);
+      MachineBasicBlock *NewDest = 
+        BMI->getOperand(BMITargetOperand).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        MI->setDesc(TII->get(OppositeBranchOpcode));
+        BMI->getOperand(BMITargetOperand).setMBB(DestBB);
+        MI->getOperand(TargetOperand).setMBB(NewDest);
+        return true;
+      }
+    }
+  }
+
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  if (MI->getNumExplicitOperands() == 2) {
+    BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+           .addReg(MI->getOperand(0).getReg())
+           .addMBB(NextBB);
+  } else {
+    BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+           .addMBB(NextBB);
+  }
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+
+void MipsConstantIslands::prescanForConstants() {
+  unsigned J = 0;
+  (void)J;
+  for (MachineFunction::iterator B =
+         MF->begin(), E = MF->end(); B != E; ++B) {
+    for (MachineBasicBlock::instr_iterator I =
+        B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
+      switch(I->getDesc().getOpcode()) {
+        case Mips::LwConstant32: {
+          PrescannedForConstants = true;
+          DEBUG(dbgs() << "constant island constant " << *I << "\n");
+          J = I->getNumOperands();
+          DEBUG(dbgs() << "num operands " << J  << "\n");
+          MachineOperand& Literal = I->getOperand(1);
+          if (Literal.isImm()) {
+            int64_t V = Literal.getImm();
+            DEBUG(dbgs() << "literal " << V  << "\n");
+            Type *Int32Ty =
+              Type::getInt32Ty(MF->getFunction()->getContext());
+            const Constant *C = ConstantInt::get(Int32Ty, V);
+            unsigned index = MCP->getConstantPoolIndex(C, 4);
+            I->getOperand(2).ChangeToImmediate(index);
+            DEBUG(dbgs() << "constant island constant " << *I << "\n");
+            I->setDesc(TII->get(Mips::LwRxPcTcp16));
+            I->RemoveOperand(1);
+            I->RemoveOperand(1);
+            I->addOperand(MachineOperand::CreateCPI(index, 0));
+            I->addOperand(MachineOperand::CreateImm(4));
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
new file mode 100644
index 0000000..b5d52ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -0,0 +1,337 @@
+//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasDSP : Predicate<"Subtarget->hasDSP()">,
+             AssemblerPredicate<"FeatureDSP">;
+def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">,
+               AssemblerPredicate<"FeatureDSPR2">;
+
+// Fields.
+class Field6<bits<6> val> {
+  bits<6> V = val;
+}
+
+def SPECIAL3_OPCODE : Field6<0b011111>;
+def REGIMM_OPCODE : Field6<0b000001>;
+
+class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+  let Predicates = [HasDSP];
+}
+
+class PseudoDSP<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasDSP];
+}
+
+// ADDU.QB sub-class format.
+class ADDU_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010000;
+}
+
+class RADDU_W_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010000;
+}
+
+// CMPU.EQ.QB sub-class format.
+class CMP_EQ_QB_R2_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+class CMP_EQ_QB_R3_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+class PRECR_SRA_PH_W_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = sa;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+// ABSQ_S.PH sub-class format.
+class ABSQ_S_PH_R2_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010010;
+}
+
+
+class REPL_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<10> imm;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-16} = imm;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010010;
+}
+
+// SHLL.QB sub-class format.
+class SHLL_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs_sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs_sa;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010011;
+}
+
+// LX sub-class format.
+class LX_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> base;
+  bits<5> index;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0} = 0b001010;
+}
+
+// ADDUH.QB sub-class format.
+class ADDUH_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b011000;
+}
+
+// APPEND sub-class format.
+class APPEND_FMT<bits<5> op> : DSPInst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = sa;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b110001;
+}
+
+// DPA.W.PH sub-class format.
+class DPA_W_PH_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6}  = op;
+  let Inst{5-0} = 0b110000;
+}
+
+// MULT sub-class format.
+class MULT_FMT<bits<6> opcode, bits<6> funct> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = opcode;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6}  = 0;
+  let Inst{5-0} = funct;
+}
+
+// MFHI sub-class format.
+class MFHI_FMT<bits<6> funct> : DSPInst {
+  bits<5> rd;
+  bits<2> ac;
+
+  let Inst{31-26} = 0;
+  let Inst{25-23} = 0;
+  let Inst{22-21} = ac;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = funct;
+}
+
+// MTHI sub-class format.
+class MTHI_FMT<bits<6> funct> : DSPInst {
+  bits<5> rs;
+  bits<2> ac;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = funct;
+}
+
+// EXTR.W sub-class format (type 1).
+class EXTR_W_TY1_FMT<bits<5> op> : DSPInst {
+  bits<5> rt;
+  bits<2> ac;
+  bits<5> shift_rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = shift_rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+// SHILO sub-class format.
+class SHILO_R1_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<6> shift;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-20} = shift;
+  let Inst{19-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class SHILO_R2_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class RDDSP_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<10> mask;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-16} = mask;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class WRDSP_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<10> mask;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-11} = mask;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class BPOSGE32_FMT<bits<5> op> : DSPInst {
+  bits<16> offset;
+
+  let Opcode = REGIMM_OPCODE.V;
+
+  let Inst{25-21} = 0;
+  let Inst{20-16} = op;
+  let Inst{15-0} = offset;
+}
+
+// INSV sub-class format.
+class INSV_FMT<bits<6> op> : DSPInst {
+  bits<5> rt;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6} = 0;
+  let Inst{5-0} = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
new file mode 100644
index 0000000..d268384
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -0,0 +1,1417 @@
+//===- MipsDSPInstrInfo.td - DSP ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips DSP ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// ImmLeaf
+def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+
+// Mips-specific dsp nodes
+def SDT_MipsExtr : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
+                                        SDTCisVT<2, untyped>]>;
+def SDT_MipsShilo : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<1, i32>]>;
+def SDT_MipsDPA : SDTypeProfile<1, 3, [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+                                       SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsSHIFT_DSP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
+
+class MipsDSPBase<string Opc, SDTypeProfile Prof> :
+  SDNode<!strconcat("MipsISD::", Opc), Prof>;
+
+class MipsDSPSideEffectBase<string Opc, SDTypeProfile Prof> :
+  SDNode<!strconcat("MipsISD::", Opc), Prof, [SDNPHasChain, SDNPSideEffect]>;
+
+def MipsEXTP : MipsDSPSideEffectBase<"EXTP", SDT_MipsExtr>;
+def MipsEXTPDP : MipsDSPSideEffectBase<"EXTPDP", SDT_MipsExtr>;
+def MipsEXTR_S_H : MipsDSPSideEffectBase<"EXTR_S_H", SDT_MipsExtr>;
+def MipsEXTR_W : MipsDSPSideEffectBase<"EXTR_W", SDT_MipsExtr>;
+def MipsEXTR_R_W : MipsDSPSideEffectBase<"EXTR_R_W", SDT_MipsExtr>;
+def MipsEXTR_RS_W : MipsDSPSideEffectBase<"EXTR_RS_W", SDT_MipsExtr>;
+
+def MipsSHILO : MipsDSPBase<"SHILO", SDT_MipsShilo>;
+def MipsMTHLIP : MipsDSPSideEffectBase<"MTHLIP", SDT_MipsShilo>;
+
+def MipsMULSAQ_S_W_PH : MipsDSPSideEffectBase<"MULSAQ_S_W_PH", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHL : MipsDSPSideEffectBase<"MAQ_S_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHR : MipsDSPSideEffectBase<"MAQ_S_W_PHR", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHL : MipsDSPSideEffectBase<"MAQ_SA_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHR : MipsDSPSideEffectBase<"MAQ_SA_W_PHR", SDT_MipsDPA>;
+
+def MipsDPAU_H_QBL : MipsDSPBase<"DPAU_H_QBL", SDT_MipsDPA>;
+def MipsDPAU_H_QBR : MipsDSPBase<"DPAU_H_QBR", SDT_MipsDPA>;
+def MipsDPSU_H_QBL : MipsDSPBase<"DPSU_H_QBL", SDT_MipsDPA>;
+def MipsDPSU_H_QBR : MipsDSPBase<"DPSU_H_QBR", SDT_MipsDPA>;
+def MipsDPAQ_S_W_PH : MipsDSPSideEffectBase<"DPAQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQ_S_W_PH : MipsDSPSideEffectBase<"DPSQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQ_SA_L_W : MipsDSPSideEffectBase<"DPAQ_SA_L_W", SDT_MipsDPA>;
+def MipsDPSQ_SA_L_W : MipsDSPSideEffectBase<"DPSQ_SA_L_W", SDT_MipsDPA>;
+
+def MipsDPA_W_PH : MipsDSPBase<"DPA_W_PH", SDT_MipsDPA>;
+def MipsDPS_W_PH : MipsDSPBase<"DPS_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_S_W_PH : MipsDSPSideEffectBase<"DPAQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_SA_W_PH : MipsDSPSideEffectBase<"DPAQX_SA_W_PH", SDT_MipsDPA>;
+def MipsDPAX_W_PH : MipsDSPBase<"DPAX_W_PH", SDT_MipsDPA>;
+def MipsDPSX_W_PH : MipsDSPBase<"DPSX_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_S_W_PH : MipsDSPSideEffectBase<"DPSQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_SA_W_PH : MipsDSPSideEffectBase<"DPSQX_SA_W_PH", SDT_MipsDPA>;
+def MipsMULSA_W_PH : MipsDSPBase<"MULSA_W_PH", SDT_MipsDPA>;
+
+def MipsMULT : MipsDSPBase<"MULT", SDT_MipsDPA>;
+def MipsMULTU : MipsDSPBase<"MULTU", SDT_MipsDPA>;
+def MipsMADD_DSP : MipsDSPBase<"MADD_DSP", SDT_MipsDPA>;
+def MipsMADDU_DSP : MipsDSPBase<"MADDU_DSP", SDT_MipsDPA>;
+def MipsMSUB_DSP : MipsDSPBase<"MSUB_DSP", SDT_MipsDPA>;
+def MipsMSUBU_DSP : MipsDSPBase<"MSUBU_DSP", SDT_MipsDPA>;
+def MipsSHLL_DSP : MipsDSPBase<"SHLL_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSHRA_DSP : MipsDSPBase<"SHRA_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSHRL_DSP : MipsDSPBase<"SHRL_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSETCC_DSP : MipsDSPBase<"SETCC_DSP", SDTSetCC>;
+def MipsSELECT_CC_DSP : MipsDSPBase<"SELECT_CC_DSP", SDTSelectCC>;
+
+// Flags.
+class Uses<list<Register> Regs> {
+  list<Register> Uses = Regs;
+}
+
+class Defs<list<Register> Regs> {
+  list<Register> Defs = Regs;
+}
+
+// Instruction encoding.
+class ADDU_QB_ENC : ADDU_QB_FMT<0b00000>;
+class ADDU_S_QB_ENC : ADDU_QB_FMT<0b00100>;
+class SUBU_QB_ENC : ADDU_QB_FMT<0b00001>;
+class SUBU_S_QB_ENC : ADDU_QB_FMT<0b00101>;
+class ADDQ_PH_ENC : ADDU_QB_FMT<0b01010>;
+class ADDQ_S_PH_ENC : ADDU_QB_FMT<0b01110>;
+class SUBQ_PH_ENC : ADDU_QB_FMT<0b01011>;
+class SUBQ_S_PH_ENC : ADDU_QB_FMT<0b01111>;
+class ADDQ_S_W_ENC : ADDU_QB_FMT<0b10110>;
+class SUBQ_S_W_ENC : ADDU_QB_FMT<0b10111>;
+class ADDSC_ENC : ADDU_QB_FMT<0b10000>;
+class ADDWC_ENC : ADDU_QB_FMT<0b10001>;
+class MODSUB_ENC : ADDU_QB_FMT<0b10010>;
+class RADDU_W_QB_ENC : RADDU_W_QB_FMT<0b10100>;
+class ABSQ_S_PH_ENC : ABSQ_S_PH_R2_FMT<0b01001>;
+class ABSQ_S_W_ENC : ABSQ_S_PH_R2_FMT<0b10001>;
+class PRECRQ_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01100>;
+class PRECRQ_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10100>;
+class PRECRQ_RS_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10101>;
+class PRECRQU_S_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01111>;
+class PRECEQ_W_PHL_ENC : ABSQ_S_PH_R2_FMT<0b01100>;
+class PRECEQ_W_PHR_ENC : ABSQ_S_PH_R2_FMT<0b01101>;
+class PRECEQU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b00100>;
+class PRECEQU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b00101>;
+class PRECEQU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b00110>;
+class PRECEQU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b00111>;
+class PRECEU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b11100>;
+class PRECEU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b11101>;
+class PRECEU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b11110>;
+class PRECEU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b11111>;
+class SHLL_QB_ENC : SHLL_QB_FMT<0b00000>;
+class SHLLV_QB_ENC : SHLL_QB_FMT<0b00010>;
+class SHRL_QB_ENC : SHLL_QB_FMT<0b00001>;
+class SHRLV_QB_ENC : SHLL_QB_FMT<0b00011>;
+class SHLL_PH_ENC : SHLL_QB_FMT<0b01000>;
+class SHLLV_PH_ENC : SHLL_QB_FMT<0b01010>;
+class SHLL_S_PH_ENC : SHLL_QB_FMT<0b01100>;
+class SHLLV_S_PH_ENC : SHLL_QB_FMT<0b01110>;
+class SHRA_PH_ENC : SHLL_QB_FMT<0b01001>;
+class SHRAV_PH_ENC : SHLL_QB_FMT<0b01011>;
+class SHRA_R_PH_ENC : SHLL_QB_FMT<0b01101>;
+class SHRAV_R_PH_ENC : SHLL_QB_FMT<0b01111>;
+class SHLL_S_W_ENC : SHLL_QB_FMT<0b10100>;
+class SHLLV_S_W_ENC : SHLL_QB_FMT<0b10110>;
+class SHRA_R_W_ENC : SHLL_QB_FMT<0b10101>;
+class SHRAV_R_W_ENC : SHLL_QB_FMT<0b10111>;
+class MULEU_S_PH_QBL_ENC : ADDU_QB_FMT<0b00110>;
+class MULEU_S_PH_QBR_ENC : ADDU_QB_FMT<0b00111>;
+class MULEQ_S_W_PHL_ENC : ADDU_QB_FMT<0b11100>;
+class MULEQ_S_W_PHR_ENC : ADDU_QB_FMT<0b11101>;
+class MULQ_RS_PH_ENC : ADDU_QB_FMT<0b11111>;
+class MULSAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00110>;
+class MAQ_S_W_PHL_ENC : DPA_W_PH_FMT<0b10100>;
+class MAQ_S_W_PHR_ENC : DPA_W_PH_FMT<0b10110>;
+class MAQ_SA_W_PHL_ENC : DPA_W_PH_FMT<0b10000>;
+class MAQ_SA_W_PHR_ENC : DPA_W_PH_FMT<0b10010>;
+class MFHI_ENC : MFHI_FMT<0b010000>;
+class MFLO_ENC : MFHI_FMT<0b010010>;
+class MTHI_ENC : MTHI_FMT<0b010001>;
+class MTLO_ENC : MTHI_FMT<0b010011>;
+class DPAU_H_QBL_ENC : DPA_W_PH_FMT<0b00011>;
+class DPAU_H_QBR_ENC : DPA_W_PH_FMT<0b00111>;
+class DPSU_H_QBL_ENC : DPA_W_PH_FMT<0b01011>;
+class DPSU_H_QBR_ENC : DPA_W_PH_FMT<0b01111>;
+class DPAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00100>;
+class DPSQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00101>;
+class DPAQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01100>;
+class DPSQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01101>;
+class MULT_DSP_ENC : MULT_FMT<0b000000, 0b011000>;
+class MULTU_DSP_ENC : MULT_FMT<0b000000, 0b011001>;
+class MADD_DSP_ENC : MULT_FMT<0b011100, 0b000000>;
+class MADDU_DSP_ENC : MULT_FMT<0b011100, 0b000001>;
+class MSUB_DSP_ENC : MULT_FMT<0b011100, 0b000100>;
+class MSUBU_DSP_ENC : MULT_FMT<0b011100, 0b000101>;
+class CMPU_EQ_QB_ENC : CMP_EQ_QB_R2_FMT<0b00000>;
+class CMPU_LT_QB_ENC : CMP_EQ_QB_R2_FMT<0b00001>;
+class CMPU_LE_QB_ENC : CMP_EQ_QB_R2_FMT<0b00010>;
+class CMPGU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b00100>;
+class CMPGU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b00101>;
+class CMPGU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b00110>;
+class CMP_EQ_PH_ENC : CMP_EQ_QB_R2_FMT<0b01000>;
+class CMP_LT_PH_ENC : CMP_EQ_QB_R2_FMT<0b01001>;
+class CMP_LE_PH_ENC : CMP_EQ_QB_R2_FMT<0b01010>;
+class BITREV_ENC : ABSQ_S_PH_R2_FMT<0b11011>;
+class PACKRL_PH_ENC : CMP_EQ_QB_R3_FMT<0b01110>;
+class REPL_QB_ENC : REPL_FMT<0b00010>;
+class REPL_PH_ENC : REPL_FMT<0b01010>;
+class REPLV_QB_ENC : ABSQ_S_PH_R2_FMT<0b00011>;
+class REPLV_PH_ENC : ABSQ_S_PH_R2_FMT<0b01011>;
+class PICK_QB_ENC : CMP_EQ_QB_R3_FMT<0b00011>;
+class PICK_PH_ENC : CMP_EQ_QB_R3_FMT<0b01011>;
+class LWX_ENC : LX_FMT<0b00000>;
+class LHX_ENC : LX_FMT<0b00100>;
+class LBUX_ENC : LX_FMT<0b00110>;
+class BPOSGE32_ENC : BPOSGE32_FMT<0b11100>;
+class INSV_ENC : INSV_FMT<0b001100>;
+
+class EXTP_ENC : EXTR_W_TY1_FMT<0b00010>;
+class EXTPV_ENC : EXTR_W_TY1_FMT<0b00011>;
+class EXTPDP_ENC : EXTR_W_TY1_FMT<0b01010>;
+class EXTPDPV_ENC : EXTR_W_TY1_FMT<0b01011>;
+class EXTR_W_ENC : EXTR_W_TY1_FMT<0b00000>;
+class EXTRV_W_ENC : EXTR_W_TY1_FMT<0b00001>;
+class EXTR_R_W_ENC : EXTR_W_TY1_FMT<0b00100>;
+class EXTRV_R_W_ENC : EXTR_W_TY1_FMT<0b00101>;
+class EXTR_RS_W_ENC : EXTR_W_TY1_FMT<0b00110>;
+class EXTRV_RS_W_ENC : EXTR_W_TY1_FMT<0b00111>;
+class EXTR_S_H_ENC : EXTR_W_TY1_FMT<0b01110>;
+class EXTRV_S_H_ENC : EXTR_W_TY1_FMT<0b01111>;
+class SHILO_ENC : SHILO_R1_FMT<0b11010>;
+class SHILOV_ENC : SHILO_R2_FMT<0b11011>;
+class MTHLIP_ENC : SHILO_R2_FMT<0b11111>;
+
+class RDDSP_ENC : RDDSP_FMT<0b10010>;
+class WRDSP_ENC : WRDSP_FMT<0b10011>;
+class ADDU_PH_ENC : ADDU_QB_FMT<0b01000>;
+class ADDU_S_PH_ENC : ADDU_QB_FMT<0b01100>;
+class SUBU_PH_ENC : ADDU_QB_FMT<0b01001>;
+class SUBU_S_PH_ENC : ADDU_QB_FMT<0b01101>;
+class CMPGDU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b11000>;
+class CMPGDU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b11001>;
+class CMPGDU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b11010>;
+class ABSQ_S_QB_ENC : ABSQ_S_PH_R2_FMT<0b00001>;
+class ADDUH_QB_ENC : ADDUH_QB_FMT<0b00000>;
+class ADDUH_R_QB_ENC : ADDUH_QB_FMT<0b00010>;
+class SUBUH_QB_ENC : ADDUH_QB_FMT<0b00001>;
+class SUBUH_R_QB_ENC : ADDUH_QB_FMT<0b00011>;
+class ADDQH_PH_ENC : ADDUH_QB_FMT<0b01000>;
+class ADDQH_R_PH_ENC : ADDUH_QB_FMT<0b01010>;
+class SUBQH_PH_ENC : ADDUH_QB_FMT<0b01001>;
+class SUBQH_R_PH_ENC : ADDUH_QB_FMT<0b01011>;
+class ADDQH_W_ENC : ADDUH_QB_FMT<0b10000>;
+class ADDQH_R_W_ENC : ADDUH_QB_FMT<0b10010>;
+class SUBQH_W_ENC : ADDUH_QB_FMT<0b10001>;
+class SUBQH_R_W_ENC : ADDUH_QB_FMT<0b10011>;
+class MUL_PH_ENC : ADDUH_QB_FMT<0b01100>;
+class MUL_S_PH_ENC : ADDUH_QB_FMT<0b01110>;
+class MULQ_S_W_ENC : ADDUH_QB_FMT<0b10110>;
+class MULQ_RS_W_ENC : ADDUH_QB_FMT<0b10111>;
+class MULQ_S_PH_ENC : ADDU_QB_FMT<0b11110>;
+class DPA_W_PH_ENC : DPA_W_PH_FMT<0b00000>;
+class DPS_W_PH_ENC : DPA_W_PH_FMT<0b00001>;
+class DPAQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11000>;
+class DPAQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11010>;
+class DPAX_W_PH_ENC : DPA_W_PH_FMT<0b01000>;
+class DPSX_W_PH_ENC : DPA_W_PH_FMT<0b01001>;
+class DPSQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11001>;
+class DPSQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11011>;
+class MULSA_W_PH_ENC : DPA_W_PH_FMT<0b00010>;
+class PRECR_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01101>;
+class PRECR_SRA_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11110>;
+class PRECR_SRA_R_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11111>;
+class SHRA_QB_ENC : SHLL_QB_FMT<0b00100>;
+class SHRAV_QB_ENC : SHLL_QB_FMT<0b00110>;
+class SHRA_R_QB_ENC : SHLL_QB_FMT<0b00101>;
+class SHRAV_R_QB_ENC : SHLL_QB_FMT<0b00111>;
+class SHRL_PH_ENC : SHLL_QB_FMT<0b11001>;
+class SHRLV_PH_ENC : SHLL_QB_FMT<0b11011>;
+class APPEND_ENC : APPEND_FMT<0b00000>;
+class BALIGN_ENC : APPEND_FMT<0b10000>;
+class PREPEND_ENC : APPEND_FMT<0b00001>;
+
+// Instruction desc.
+class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        InstrItinClass itin, RegisterOperand ROD,
+                        RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           InstrItinClass itin, RegisterOperand ROD,
+                           RegisterOperand ROS = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
+  InstrItinClass Itinerary = itin;
+}
+
+class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterOperand ROS,
+                             RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
+  list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)];
+  InstrItinClass Itinerary = itin;
+}
+
+class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                               InstrItinClass itin, RegisterOperand ROT,
+                               RegisterOperand ROS = ROT> {
+  dag OutOperandList = (outs ROT:$rt);
+  dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$src = $rt";
+}
+
+class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     ImmLeaf immPat, InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins uimm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
+  list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
+  InstrItinClass Itinerary = itin;
+}
+
+class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList =  (ins RO:$rt, GPR32Opnd:$rs_sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
+  InstrItinClass Itinerary = itin;
+}
+
+class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           SDPatternOperator ImmPat, InstrItinClass itin,
+                           RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins RO:$rt, uimm16:$rs_sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
+  InstrItinClass Itinerary = itin;
+  bit hasSideEffects = 1;
+}
+
+class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins PtrRC:$base, PtrRC:$index);
+  string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
+  InstrItinClass Itinerary = itin;
+  bit mayLoad = 1;
+}
+
+class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         InstrItinClass itin, RegisterOperand ROD,
+                         RegisterOperand ROS = ROD,  RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SDPatternOperator ImmOp, InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern =  [(set GPR32Opnd:$rt,
+                        (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$src = $rt";
+}
+
+class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+  InstrItinClass Itinerary = itin;
+}
+
+class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+  InstrItinClass Itinerary = itin;
+}
+
+class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+}
+
+class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+}
+
+class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+}
+
+class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins uimm16:$mask);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
+  InstrItinClass Itinerary = itin;
+}
+
+class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
+  list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
+  InstrItinClass Itinerary = itin;
+}
+
+class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+}
+
+class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
+  InstrItinClass Itinerary = itin;
+  bit isCommutable = 1;
+}
+
+class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$acin = $ac";
+}
+
+class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins RO:$ac);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
+  dag OutOperandList = (outs RO:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  InstrItinClass Itinerary = itin;
+}
+
+class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
+  MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
+  bit usesCustomInserter = 1;
+}
+
+class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins brtarget:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  InstrItinClass Itinerary = itin;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+}
+
+class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$src, GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$src = $rt";
+}
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+//===----------------------------------------------------------------------===//
+
+// Addition/subtraction
+class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
+                     Defs<[DSPOutFlag20]>;
+
+class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>,
+                     Defs<[DSPOutFlag20]>;
+
+class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
+                     Defs<[DSPOutFlag20]>;
+
+class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>,
+                     Defs<[DSPOutFlag20]>;
+
+class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
+                      IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
+                      Defs<[DSPOutFlag20]>;
+
+class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
+                                     GPR32Opnd, GPR32Opnd>, IsCommutable,
+                   Defs<[DSPCarry]>;
+
+class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
+                                     GPR32Opnd, GPR32Opnd>,
+                   IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>;
+
+class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
+                                      GPR32Opnd, GPR32Opnd>;
+
+class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>;
+
+// Absolute value
+class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
+                                             NoItinerary, GPR32Opnd>,
+                      Defs<[DSPOutFlag20]>;
+
+// Precision reduce/expand
+class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
+                                                 int_mips_precrq_qb_ph,
+                                                 NoItinerary, DSPROpnd, DSPROpnd>;
+
+class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
+                                                int_mips_precrq_ph_w,
+                                                NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
+                                                   int_mips_precrq_rs_ph_w,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>,
+                            Defs<[DSPOutFlag22]>;
+
+class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
+                                                    int_mips_precrqu_s_qb_ph,
+                                                    NoItinerary, DSPROpnd,
+                                                    DSPROpnd>,
+                             Defs<[DSPOutFlag22]>;
+
+class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
+                                                 int_mips_preceq_w_phl,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
+                                                 int_mips_preceq_w_phr,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
+                                                   int_mips_precequ_ph_qbl,
+                                                   NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
+                                                   int_mips_precequ_ph_qbr,
+                                                   NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
+                                                    int_mips_precequ_ph_qbla,
+                                                    NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
+                                                    int_mips_precequ_ph_qbra,
+                                                    NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
+                                                  int_mips_preceu_ph_qbl,
+                                                  NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
+                                                  int_mips_preceu_ph_qbr,
+                                                  NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
+                                                   int_mips_preceu_ph_qbla,
+                                                   NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
+                                                   int_mips_preceu_ph_qbra,
+                                                   NoItinerary, DSPROpnd>;
+
+// Shift
+class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
+                                          NoItinerary, DSPROpnd>,
+                     Defs<[DSPOutFlag22]>;
+
+class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
+                                           NoItinerary, DSPROpnd>,
+                      Defs<[DSPOutFlag22]>;
+
+class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
+                                          NoItinerary, DSPROpnd>;
+
+class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
+                                           NoItinerary, DSPROpnd>;
+
+class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
+                                          NoItinerary, DSPROpnd>,
+                     Defs<[DSPOutFlag22]>;
+
+class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
+                                           NoItinerary, DSPROpnd>,
+                      Defs<[DSPOutFlag22]>;
+
+class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
+                                            immZExt4, NoItinerary, DSPROpnd>,
+                       Defs<[DSPOutFlag22]>;
+
+class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
+                                             NoItinerary, DSPROpnd>,
+                        Defs<[DSPOutFlag22]>;
+
+class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
+                                          NoItinerary, DSPROpnd>;
+
+class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
+                                           NoItinerary, DSPROpnd>;
+
+class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
+                                            immZExt4, NoItinerary, DSPROpnd>;
+
+class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
+                                             NoItinerary, DSPROpnd>;
+
+class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
+                                           immZExt5, NoItinerary, GPR32Opnd>,
+                      Defs<[DSPOutFlag22]>;
+
+class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
+                                            NoItinerary, GPR32Opnd>,
+                       Defs<[DSPOutFlag22]>;
+
+class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
+                                           immZExt5, NoItinerary, GPR32Opnd>;
+
+class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
+                                            NoItinerary, GPR32Opnd>;
+
+// Multiplication
+class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
+                                              int_mips_muleu_s_ph_qbl,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
+                            Defs<[DSPOutFlag21]>;
+
+class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
+                                              int_mips_muleu_s_ph_qbr,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
+                            Defs<[DSPOutFlag21]>;
+
+class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
+                                             int_mips_muleq_s_w_phl,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
+                           IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
+                                             int_mips_muleq_s_w_phr,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
+                           IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
+                                          NoItinerary, DSPROpnd, DSPROpnd>,
+                        IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
+                                              MipsMULSAQ_S_W_PH>,
+                           Defs<[DSPOutFlag16_19]>;
+
+class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl", MipsMAQ_S_W_PHL>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr", MipsMAQ_S_W_PHR>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl", MipsMAQ_SA_W_PHL>,
+                          Defs<[DSPOutFlag16_19]>;
+
+class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
+                          Defs<[DSPOutFlag16_19]>;
+
+// Move from/to hi/lo.
+class MFHI_DESC : MFHI_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, NoItinerary>;
+class MFLO_DESC : MFHI_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, NoItinerary>;
+class MTHI_DESC : MTHI_DESC_BASE<"mthi", HI32DSPOpnd, NoItinerary>;
+class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LO32DSPOpnd, NoItinerary>;
+
+// Dot product with accumulate/subtract
+class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
+
+class DPAU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbr", MipsDPAU_H_QBR>;
+
+class DPSU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbl", MipsDPSU_H_QBL>;
+
+class DPSU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbr", MipsDPSU_H_QBR>;
+
+class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph", MipsDPAQ_S_W_PH>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph", MipsDPSQ_S_W_PH>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w", MipsDPAQ_SA_L_W>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w", MipsDPSQ_SA_L_W>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class MULT_DSP_DESC  : MULT_DESC_BASE<"mult", MipsMult, NoItinerary>;
+class MULTU_DSP_DESC : MULT_DESC_BASE<"multu", MipsMultu, NoItinerary>;
+class MADD_DSP_DESC  : MADD_DESC_BASE<"madd", MipsMAdd, NoItinerary>;
+class MADDU_DSP_DESC : MADD_DESC_BASE<"maddu", MipsMAddu, NoItinerary>;
+class MSUB_DSP_DESC  : MADD_DESC_BASE<"msub", MipsMSub, NoItinerary>;
+class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
+
+// Comparison
+class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
+                                               int_mips_cmpu_eq_qb, NoItinerary,
+                                               DSPROpnd>,
+                        IsCommutable, Defs<[DSPCCond]>;
+
+class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
+                                               int_mips_cmpu_lt_qb, NoItinerary,
+                                               DSPROpnd>, Defs<[DSPCCond]>;
+
+class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
+                                               int_mips_cmpu_le_qb, NoItinerary,
+                                               DSPROpnd>, Defs<[DSPCCond]>;
+
+class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
+                                                int_mips_cmpgu_eq_qb,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>,
+                         IsCommutable;
+
+class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
+                                                int_mips_cmpgu_lt_qb,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
+                                                int_mips_cmpgu_le_qb,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
+                                              NoItinerary, DSPROpnd>,
+                       IsCommutable, Defs<[DSPCCond]>;
+
+class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPCCond]>;
+
+class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPCCond]>;
+
+// Misc
+class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+                                           NoItinerary, GPR32Opnd>;
+
+class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
+                                              NoItinerary, DSPROpnd, DSPROpnd>;
+
+class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
+                                    NoItinerary, DSPROpnd>;
+
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
+                                    NoItinerary, DSPROpnd>;
+
+class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
+                     Uses<[DSPCCond]>;
+
+class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
+                     Uses<[DSPCCond]>;
+
+class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
+
+class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>;
+
+class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>;
+
+class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", NoItinerary>;
+
+// Extr
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+                  Uses<[DSPPos]>, Defs<[DSPEFI]>;
+
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+                   Uses<[DSPPos]>, Defs<[DSPEFI]>;
+
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+                    Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+
+class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
+                                             NoItinerary>,
+                     Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+                    Defs<[DSPOutFlag23]>;
+
+class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
+                                             NoItinerary>, Defs<[DSPOutFlag23]>;
+
+class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
+                                              NoItinerary>,
+                      Defs<[DSPOutFlag23]>;
+
+class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
+                                               NoItinerary>,
+                       Defs<[DSPOutFlag23]>;
+
+class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
+                                               NoItinerary>,
+                       Defs<[DSPOutFlag23]>;
+
+class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
+                                                NoItinerary>,
+                        Defs<[DSPOutFlag23]>;
+
+class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
+                                              NoItinerary>,
+                      Defs<[DSPOutFlag23]>;
+
+class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
+                                               NoItinerary>,
+                       Defs<[DSPOutFlag23]>;
+
+class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo", MipsSHILO>;
+
+class SHILOV_DESC : SHILO_R2_DESC_BASE<"shilov", MipsSHILO>;
+
+class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip", MipsMTHLIP>, Defs<[DSPPos]>;
+
+class RDDSP_DESC : RDDSP_DESC_BASE<"rddsp", int_mips_rddsp, NoItinerary>;
+
+class WRDSP_DESC : WRDSP_DESC_BASE<"wrdsp", int_mips_wrdsp, NoItinerary>;
+
+class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
+                  Uses<[DSPPos, DSPSCount]>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+// Addition/subtraction
+class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
+                     Defs<[DSPOutFlag20]>;
+
+class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
+                                       DSPROpnd, DSPROpnd>,
+                     Defs<[DSPOutFlag20]>;
+
+class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
+                                         NoItinerary, DSPROpnd>, IsCommutable;
+
+class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
+                                           NoItinerary, DSPROpnd>, IsCommutable;
+
+class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
+                                         NoItinerary, DSPROpnd>;
+
+class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
+                                           NoItinerary, DSPROpnd>;
+
+class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
+                                         NoItinerary, DSPROpnd>, IsCommutable;
+
+class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
+                                           NoItinerary, DSPROpnd>, IsCommutable;
+
+class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
+                                         NoItinerary, DSPROpnd>;
+
+class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
+                                           NoItinerary, DSPROpnd>;
+
+class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
+                                        NoItinerary, GPR32Opnd>, IsCommutable;
+
+class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
+                                          NoItinerary, GPR32Opnd>, IsCommutable;
+
+class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
+                                        NoItinerary, GPR32Opnd>;
+
+class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
+                                          NoItinerary, GPR32Opnd>;
+
+// Comparison
+class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
+                                                 int_mips_cmpgdu_eq_qb,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
+                          IsCommutable, Defs<[DSPCCond]>;
+
+class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
+                                                 int_mips_cmpgdu_lt_qb,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
+                          Defs<[DSPCCond]>;
+
+class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
+                                                 int_mips_cmpgdu_le_qb,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
+                          Defs<[DSPCCond]>;
+
+// Absolute
+class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+// Multiplication
+class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
+                                       DSPROpnd>, IsCommutable,
+                    Defs<[DSPOutFlag21]>;
+
+class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
+                                         NoItinerary, DSPROpnd>, IsCommutable,
+                      Defs<[DSPOutFlag21]>;
+
+class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
+                                         NoItinerary, GPR32Opnd>, IsCommutable,
+                      Defs<[DSPOutFlag21]>;
+
+class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
+                                          NoItinerary, GPR32Opnd>, IsCommutable,
+                       Defs<[DSPOutFlag21]>;
+
+class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag21]>;
+
+// Dot product with accumulate/subtract
+class DPA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpa.w.ph", MipsDPA_W_PH>;
+
+class DPS_W_PH_DESC : DPA_W_PH_DESC_BASE<"dps.w.ph", MipsDPS_W_PH>;
+
+class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph", MipsDPAQX_S_W_PH>,
+                          Defs<[DSPOutFlag16_19]>;
+
+class DPAQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_sa.w.ph",
+                                              MipsDPAQX_SA_W_PH>,
+                           Defs<[DSPOutFlag16_19]>;
+
+class DPAX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpax.w.ph", MipsDPAX_W_PH>;
+
+class DPSX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsx.w.ph", MipsDPSX_W_PH>;
+
+class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph", MipsDPSQX_S_W_PH>,
+                          Defs<[DSPOutFlag16_19]>;
+
+class DPSQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_sa.w.ph",
+                                              MipsDPSQX_SA_W_PH>,
+                           Defs<[DSPOutFlag16_19]>;
+
+class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
+
+// Precision reduce/expand
+class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
+                                                int_mips_precr_qb_ph,
+                                                NoItinerary, DSPROpnd, DSPROpnd>;
+
+class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
+                                                     int_mips_precr_sra_ph_w,
+                                                     NoItinerary, DSPROpnd,
+                                                     GPR32Opnd>;
+
+class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
+                                                      int_mips_precr_sra_r_ph_w,
+                                                       NoItinerary, DSPROpnd,
+                                                       GPR32Opnd>;
+
+// Shift
+class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
+                                          NoItinerary, DSPROpnd>;
+
+class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
+                                           NoItinerary, DSPROpnd>;
+
+class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
+                                            immZExt3, NoItinerary, DSPROpnd>;
+
+class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
+                                             NoItinerary, DSPROpnd>;
+
+class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
+                                          NoItinerary, DSPROpnd>;
+
+class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
+                                           NoItinerary, DSPROpnd>;
+
+// Misc
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
+                                     NoItinerary>;
+
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2,
+                                     NoItinerary>;
+
+class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5,
+                                      NoItinerary>;
+
+// Pseudos.
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
+                                                NoItinerary>, Uses<[DSPPos]>;
+
+// Instruction defs.
+// MIPS DSP Rev 1
+def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC;
+def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC;
+def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC;
+def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC;
+def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
+def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
+def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC;
+def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC;
+def ADDSC : ADDSC_ENC, ADDSC_DESC;
+def ADDWC : ADDWC_ENC, ADDWC_DESC;
+def MODSUB : MODSUB_ENC, MODSUB_DESC;
+def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
+def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC;
+def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
+def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
+def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
+def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
+def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
+def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
+def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
+def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
+def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
+def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
+def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC;
+def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC;
+def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC;
+def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC;
+def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC;
+def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC;
+def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC;
+def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
+def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC;
+def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC;
+def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC;
+def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
+def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC;
+def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC;
+def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC;
+def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC;
+def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
+def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
+def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
+def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
+def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP : MFHI_ENC, MFHI_DESC;
+def MFLO_DSP : MFLO_ENC, MFLO_DESC;
+def MTHI_DSP : MTHI_ENC, MTHI_DESC;
+def MTLO_DSP : MTLO_ENC, MTLO_DESC;
+def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
+def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
+def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
+def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
+def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
+def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
+def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC;
+def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC;
+def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC;
+def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC;
+def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC;
+def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
+def CMPGU_EQ_QB : CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB : CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB : CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
+def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC;
+def BITREV : BITREV_ENC, BITREV_DESC;
+def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC;
+def REPL_QB : REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC;
+def PICK_QB : PICK_QB_ENC, PICK_QB_DESC;
+def PICK_PH : PICK_PH_ENC, PICK_PH_DESC;
+def LWX : LWX_ENC, LWX_DESC;
+def LHX : LHX_ENC, LHX_DESC;
+def LBUX : LBUX_ENC, LBUX_DESC;
+def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC;
+def INSV : INSV_ENC, INSV_DESC;
+def EXTP : EXTP_ENC, EXTP_DESC;
+def EXTPV : EXTPV_ENC, EXTPV_DESC;
+def EXTPDP : EXTPDP_ENC, EXTPDP_DESC;
+def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC;
+def EXTR_W : EXTR_W_ENC, EXTR_W_DESC;
+def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC;
+def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC;
+def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC;
+def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC;
+def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
+def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC;
+def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC;
+def SHILO : SHILO_ENC, SHILO_DESC;
+def SHILOV : SHILOV_ENC, SHILOV_DESC;
+def MTHLIP : MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : RDDSP_ENC, RDDSP_DESC;
+def WRDSP : WRDSP_ENC, WRDSP_DESC;
+
+// MIPS DSP Rev 2
+let Predicates = [HasDSPR2] in {
+
+def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC;
+def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC;
+def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC;
+def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC;
+def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC;
+def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC;
+def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC;
+def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC;
+def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC;
+def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC;
+def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC;
+def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC;
+def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC;
+def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC;
+def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC;
+def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC;
+def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC;
+def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC;
+def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC;
+def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC;
+def MUL_PH : MUL_PH_ENC, MUL_PH_DESC;
+def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC;
+def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC;
+def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC;
+def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC;
+def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC;
+def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC;
+def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC;
+def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC;
+def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC;
+def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC;
+def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC;
+def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC;
+def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC;
+def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC;
+def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC;
+def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC;
+def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC;
+def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC;
+def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC;
+def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC;
+def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC;
+def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC;
+def APPEND : APPEND_ENC, APPEND_DESC;
+def BALIGN : BALIGN_ENC, BALIGN_DESC;
+def PREPEND : PREPEND_ENC, PREPEND_DESC;
+
+}
+
+// Pseudos.
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  // Pseudo instructions for loading and storing accumulator registers.
+  def LOAD_ACC64DSP  : Load<"", ACC64DSPOpnd>;
+  def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
+
+  // Pseudos for loading and storing ccond field of DSP control register.
+  def LOAD_CCOND_DSP  : Load<"load_ccond_dsp", DSPCC>;
+  def STORE_CCOND_DSP : Store<"store_ccond_dsp", DSPCC>;
+}
+
+// Pseudo CMP and PICK instructions.
+class PseudoCMP<Instruction RealInst> :
+  PseudoDSP<(outs DSPCC:$cmp), (ins DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rs, DSPROpnd:$rt)>, NeverHasSideEffects;
+
+class PseudoPICK<Instruction RealInst> :
+  PseudoDSP<(outs DSPROpnd:$rd), (ins DSPCC:$cmp, DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rd, DSPROpnd:$rs, DSPROpnd:$rt)>,
+  NeverHasSideEffects;
+
+def PseudoCMP_EQ_PH : PseudoCMP<CMP_EQ_PH>;
+def PseudoCMP_LT_PH : PseudoCMP<CMP_LT_PH>;
+def PseudoCMP_LE_PH : PseudoCMP<CMP_LE_PH>;
+def PseudoCMPU_EQ_QB : PseudoCMP<CMPU_EQ_QB>;
+def PseudoCMPU_LT_QB : PseudoCMP<CMPU_LT_QB>;
+def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
+
+def PseudoPICK_PH : PseudoPICK<PICK_PH>;
+def PseudoPICK_QB : PseudoPICK<PICK_QB>;
+
+def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+
+// Patterns.
+class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
+  Pat<pattern, result>, Requires<[pred]>;
+
+class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
+                    RegisterClass SrcRC> :
+   DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
+          (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
+
+def : BitconvertPat<i32, v2i16, GPR32, DSPR>;
+def : BitconvertPat<i32, v4i8, GPR32, DSPR>;
+def : BitconvertPat<v2i16, i32, DSPR, GPR32>;
+def : BitconvertPat<v4i8, i32, DSPR, GPR32>;
+
+def : DSPPat<(v2i16 (load addr:$a)),
+             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(v4i8 (load addr:$a)),
+             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(store (v2i16 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+
+// Binary operations.
+class DSPBinPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
+                Predicate Pred = HasDSP> :
+  DSPPat<(Node ValTy:$a, ValTy:$b), (Inst ValTy:$a, ValTy:$b), Pred>;
+
+def : DSPBinPat<ADDQ_PH, v2i16, int_mips_addq_ph>;
+def : DSPBinPat<ADDQ_PH, v2i16, add>;
+def : DSPBinPat<SUBQ_PH, v2i16, int_mips_subq_ph>;
+def : DSPBinPat<SUBQ_PH, v2i16, sub>;
+def : DSPBinPat<MUL_PH, v2i16, int_mips_mul_ph, HasDSPR2>;
+def : DSPBinPat<MUL_PH, v2i16, mul, HasDSPR2>;
+def : DSPBinPat<ADDU_QB, v4i8, int_mips_addu_qb>;
+def : DSPBinPat<ADDU_QB, v4i8, add>;
+def : DSPBinPat<SUBU_QB, v4i8, int_mips_subu_qb>;
+def : DSPBinPat<SUBU_QB, v4i8, sub>;
+def : DSPBinPat<ADDSC, i32, int_mips_addsc>;
+def : DSPBinPat<ADDSC, i32, addc>;
+def : DSPBinPat<ADDWC, i32, int_mips_addwc>;
+def : DSPBinPat<ADDWC, i32, adde>;
+
+// Shift immediate patterns.
+class DSPShiftPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
+                  SDPatternOperator Imm, Predicate Pred = HasDSP> :
+  DSPPat<(Node ValTy:$a, Imm:$shamt), (Inst ValTy:$a, Imm:$shamt), Pred>;
+
+def : DSPShiftPat<SHLL_PH, v2i16, MipsSHLL_DSP, imm>;
+def : DSPShiftPat<SHRA_PH, v2i16, MipsSHRA_DSP, imm>;
+def : DSPShiftPat<SHRL_PH, v2i16, MipsSHRL_DSP, imm, HasDSPR2>;
+def : DSPShiftPat<SHLL_PH, v2i16, int_mips_shll_ph, immZExt4>;
+def : DSPShiftPat<SHRA_PH, v2i16, int_mips_shra_ph, immZExt4>;
+def : DSPShiftPat<SHRL_PH, v2i16, int_mips_shrl_ph, immZExt4, HasDSPR2>;
+def : DSPShiftPat<SHLL_QB, v4i8, MipsSHLL_DSP, imm>;
+def : DSPShiftPat<SHRA_QB, v4i8, MipsSHRA_DSP, imm, HasDSPR2>;
+def : DSPShiftPat<SHRL_QB, v4i8, MipsSHRL_DSP, imm>;
+def : DSPShiftPat<SHLL_QB, v4i8, int_mips_shll_qb, immZExt3>;
+def : DSPShiftPat<SHRA_QB, v4i8, int_mips_shra_qb, immZExt3, HasDSPR2>;
+def : DSPShiftPat<SHRL_QB, v4i8, int_mips_shrl_qb, immZExt3>;
+
+// SETCC/SELECT_CC patterns.
+class DSPSetCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                  CondCode CC> :
+  DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR)),
+                      (ValTy ZERO)))>;
+
+class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                     CondCode CC> :
+  DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
+                      (ValTy ZERO),
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR))))>;
+
+class DSPSelectCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                     CondCode CC> :
+  DSPPat<(ValTy (MipsSELECT_CC_DSP ValTy:$a, ValTy:$b, ValTy:$c, ValTy:$d, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)), $c, $d))>;
+
+class DSPSelectCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                        CondCode CC> :
+  DSPPat<(ValTy (MipsSELECT_CC_DSP ValTy:$a, ValTy:$b, ValTy:$c, ValTy:$d, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)), $d, $c))>;
+
+def : DSPSetCCPat<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETEQ>;
+def : DSPSetCCPat<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETLT>;
+def : DSPSetCCPat<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETLE>;
+def : DSPSetCCPatInv<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETNE>;
+def : DSPSetCCPatInv<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETGE>;
+def : DSPSetCCPatInv<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETGT>;
+def : DSPSetCCPat<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETEQ>;
+def : DSPSetCCPat<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETULT>;
+def : DSPSetCCPat<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETULE>;
+def : DSPSetCCPatInv<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETNE>;
+def : DSPSetCCPatInv<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETUGE>;
+def : DSPSetCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
+
+def : DSPSelectCCPat<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETEQ>;
+def : DSPSelectCCPat<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETLT>;
+def : DSPSelectCCPat<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETLE>;
+def : DSPSelectCCPatInv<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETNE>;
+def : DSPSelectCCPatInv<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETGE>;
+def : DSPSelectCCPatInv<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETGT>;
+def : DSPSelectCCPat<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETEQ>;
+def : DSPSelectCCPat<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETULT>;
+def : DSPSelectCCPat<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETULE>;
+def : DSPSelectCCPatInv<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETNE>;
+def : DSPSelectCCPatInv<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETUGE>;
+def : DSPSelectCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
+
+// Extr patterns.
+class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
+  DSPPat<(i32 (OpNode GPR32:$rs, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, GPR32:$rs)>;
+
+class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
+  DSPPat<(i32 (OpNode immZExt5:$shift, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, immZExt5:$shift)>;
+
+def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTPDP, EXTPDP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTPDP, EXTPDPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_W, EXTR_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_W, EXTRV_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_R_W, EXTR_R_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_R_W, EXTRV_R_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_RS_W, EXTR_RS_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
+
+// Indexed load patterns.
+class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
+  DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
+         (Instr i32:$base, i32:$index)>;
+
+let AddedComplexity = 20 in {
+  def : IndexedLoadPat<zextloadi8, LBUX>;
+  def : IndexedLoadPat<sextloadi16, LHX>;
+  def : IndexedLoadPat<load, LWX>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
new file mode 100644
index 0000000..bcfbc12
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -0,0 +1,738 @@
+//===-- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
+                       " are not NOP.");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-mips-delay-filler",
+  cl::init(false),
+  cl::desc("Fill all delay slots with NOPs."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableForwardSearch(
+  "disable-mips-df-forward-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search forward."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableSuccBBSearch(
+  "disable-mips-df-succbb-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search successor basic blocks."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableBackwardSearch(
+  "disable-mips-df-backward-search",
+  cl::init(false),
+  cl::desc("Disallow MIPS delay filler to search backward."),
+  cl::Hidden);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+  typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
+
+  class RegDefsUses {
+  public:
+    RegDefsUses(TargetMachine &TM);
+    void init(const MachineInstr &MI);
+
+    /// This function sets all caller-saved registers in Defs.
+    void setCallerSaved(const MachineInstr &MI);
+
+    /// This function sets all unallocatable registers in Defs.
+    void setUnallocatableRegs(const MachineFunction &MF);
+
+    /// Set bits in Uses corresponding to MBB's live-out registers except for
+    /// the registers that are live-in to SuccBB.
+    void addLiveOut(const MachineBasicBlock &MBB,
+                    const MachineBasicBlock &SuccBB);
+
+    bool update(const MachineInstr &MI, unsigned Begin, unsigned End);
+
+  private:
+    bool checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses, unsigned Reg,
+                          bool IsDef) const;
+
+    /// Returns true if Reg or its alias is in RegSet.
+    bool isRegInSet(const BitVector &RegSet, unsigned Reg) const;
+
+    const TargetRegisterInfo &TRI;
+    BitVector Defs, Uses;
+  };
+
+  /// Base class for inspecting loads and stores.
+  class InspectMemInstr {
+  public:
+    InspectMemInstr(bool ForbidMemInstr_)
+      : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
+        SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+
+    /// Return true if MI cannot be moved to delay slot.
+    bool hasHazard(const MachineInstr &MI);
+
+    virtual ~InspectMemInstr() {}
+
+  protected:
+    /// Flags indicating whether loads or stores have been seen.
+    bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+
+    /// Memory instructions are not allowed to move to delay slot if this flag
+    /// is true.
+    bool ForbidMemInstr;
+
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI) = 0;
+  };
+
+  /// This subclass rejects any memory instructions.
+  class NoMemInstr : public InspectMemInstr {
+  public:
+    NoMemInstr() : InspectMemInstr(true) {}
+  private:
+    bool hasHazard_(const MachineInstr &MI) override { return true; }
+  };
+
+  /// This subclass accepts loads from stacks and constant loads.
+  class LoadFromStackOrConst : public InspectMemInstr {
+  public:
+    LoadFromStackOrConst() : InspectMemInstr(false) {}
+  private:
+    bool hasHazard_(const MachineInstr &MI) override;
+  };
+
+  /// This subclass uses memory dependence information to determine whether a
+  /// memory instruction can be moved to a delay slot.
+  class MemDefsUses : public InspectMemInstr {
+  public:
+    MemDefsUses(const MachineFrameInfo *MFI);
+
+  private:
+    typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
+    bool hasHazard_(const MachineInstr &MI) override;
+
+    /// Update Defs and Uses. Return true if there exist dependences that
+    /// disqualify the delay slot candidate between V and values in Uses and
+    /// Defs.
+    bool updateDefsUses(ValueType V, bool MayStore);
+
+    /// Get the list of underlying objects of MI's memory operand.
+    bool getUnderlyingObjects(const MachineInstr &MI,
+                              SmallVectorImpl<ValueType> &Objects) const;
+
+    const MachineFrameInfo *MFI;
+    SmallPtrSet<ValueType, 4> Uses, Defs;
+
+    /// Flags indicating whether loads or stores with no underlying objects have
+    /// been seen.
+    bool SeenNoObjLoad, SeenNoObjStore;
+  };
+
+  class Filler : public MachineFunctionPass {
+  public:
+    Filler(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm) { }
+
+    const char *getPassName() const override {
+      return "Mips Delay Slot Filler";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F) override {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+
+      // This pass invalidates liveness information when it reorders
+      // instructions to fill delay slot. Without this, -verify-machineinstrs
+      // will fail.
+      if (Changed)
+        F.getRegInfo().invalidateLiveness();
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+    /// This function checks if it is valid to move Candidate to the delay slot
+    /// and returns true if it isn't. It also updates memory and register
+    /// dependence information.
+    bool delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                        InspectMemInstr &IM) const;
+
+    /// This function searches range [Begin, End) for an instruction that can be
+    /// moved to the delay slot. Returns true on success.
+    template<typename IterTy>
+    bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                     RegDefsUses &RegDU, InspectMemInstr &IM,
+                     IterTy &Filler) const;
+
+    /// This function searches in the backward direction for an instruction that
+    /// can be moved to the delay slot. Returns true on success.
+    bool searchBackward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches MBB in the forward direction for an instruction
+    /// that can be moved to the delay slot. Returns true on success.
+    bool searchForward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches one of MBB's successor blocks for an instruction
+    /// that can be moved to the delay slot and inserts clones of the
+    /// instruction into the successor's predecessor blocks.
+    bool searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// Pick a successor block of MBB. Return NULL if MBB doesn't have a
+    /// successor block that is not a landing pad.
+    MachineBasicBlock *selectSuccBB(MachineBasicBlock &B) const;
+
+    /// This function analyzes MBB and returns an instruction with an unoccupied
+    /// slot that branches to Dst.
+    std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+    getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const;
+
+    /// Examine Pred and see if it is possible to insert an instruction into
+    /// one of its branches delay slot or its end.
+    bool examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                     RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                     BB2BrMap &BrMap) const;
+
+    bool terminateSearch(const MachineInstr &Candidate) const;
+
+    TargetMachine &TM;
+
+    static char ID;
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+static bool hasUnoccupiedSlot(const MachineInstr *MI) {
+  return MI->hasDelaySlot() && !MI->isBundledWithSucc();
+}
+
+/// This function inserts clones of Filler into predecessor blocks.
+static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
+  MachineFunction *MF = Filler->getParent()->getParent();
+
+  for (BB2BrMap::const_iterator I = BrMap.begin(); I != BrMap.end(); ++I) {
+    if (I->second) {
+      MIBundleBuilder(I->second).append(MF->CloneMachineInstr(&*Filler));
+      ++UsefulSlots;
+    } else {
+      I->first->insert(I->first->end(), MF->CloneMachineInstr(&*Filler));
+    }
+  }
+}
+
+/// This function adds registers Filler defines to MBB's live-in register list.
+static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
+  for (unsigned I = 0, E = Filler->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = Filler->getOperand(I);
+    unsigned R;
+
+    if (!MO.isReg() || !MO.isDef() || !(R = MO.getReg()))
+      continue;
+
+#ifndef NDEBUG
+    const MachineFunction &MF = *MBB.getParent();
+    assert(MF.getTarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
+           "Shouldn't move an instruction with unallocatable registers across "
+           "basic block boundaries.");
+#endif
+
+    if (!MBB.isLiveIn(R))
+      MBB.addLiveIn(R);
+  }
+}
+
+RegDefsUses::RegDefsUses(TargetMachine &TM)
+  : TRI(*TM.getRegisterInfo()), Defs(TRI.getNumRegs(), false),
+    Uses(TRI.getNumRegs(), false) {}
+
+void RegDefsUses::init(const MachineInstr &MI) {
+  // Add all register operands which are explicit and non-variadic.
+  update(MI, 0, MI.getDesc().getNumOperands());
+
+  // If MI is a call, add RA to Defs to prevent users of RA from going into
+  // delay slot.
+  if (MI.isCall())
+    Defs.set(Mips::RA);
+
+  // Add all implicit register operands of branch instructions except
+  // register AT.
+  if (MI.isBranch()) {
+    update(MI, MI.getDesc().getNumOperands(), MI.getNumOperands());
+    Defs.reset(Mips::AT);
+  }
+}
+
+void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
+  assert(MI.isCall());
+
+  // If MI is a call, add all caller-saved registers to Defs.
+  BitVector CallerSavedRegs(TRI.getNumRegs(), true);
+
+  CallerSavedRegs.reset(Mips::ZERO);
+  CallerSavedRegs.reset(Mips::ZERO_64);
+
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(); *R; ++R)
+    for (MCRegAliasIterator AI(*R, &TRI, true); AI.isValid(); ++AI)
+      CallerSavedRegs.reset(*AI);
+
+  Defs |= CallerSavedRegs;
+}
+
+void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
+  BitVector AllocSet = TRI.getAllocatableSet(MF);
+
+  for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R))
+    for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
+      AllocSet.set(*AI);
+
+  AllocSet.set(Mips::ZERO);
+  AllocSet.set(Mips::ZERO_64);
+
+  Defs |= AllocSet.flip();
+}
+
+void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
+                             const MachineBasicBlock &SuccBB) {
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+       SE = MBB.succ_end(); SI != SE; ++SI)
+    if (*SI != &SuccBB)
+      for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
+           LE = (*SI)->livein_end(); LI != LE; ++LI)
+        Uses.set(*LI);
+}
+
+bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) {
+  BitVector NewDefs(TRI.getNumRegs()), NewUses(TRI.getNumRegs());
+  bool HasHazard = false;
+
+  for (unsigned I = Begin; I != End; ++I) {
+    const MachineOperand &MO = MI.getOperand(I);
+
+    if (MO.isReg() && MO.getReg())
+      HasHazard |= checkRegDefsUses(NewDefs, NewUses, MO.getReg(), MO.isDef());
+  }
+
+  Defs |= NewDefs;
+  Uses |= NewUses;
+
+  return HasHazard;
+}
+
+bool RegDefsUses::checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses,
+                                   unsigned Reg, bool IsDef) const {
+  if (IsDef) {
+    NewDefs.set(Reg);
+    // check whether Reg has already been defined or used.
+    return (isRegInSet(Defs, Reg) || isRegInSet(Uses, Reg));
+  }
+
+  NewUses.set(Reg);
+  // check whether Reg has already been defined.
+  return isRegInSet(Defs, Reg);
+}
+
+bool RegDefsUses::isRegInSet(const BitVector &RegSet, unsigned Reg) const {
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    if (RegSet.test(*AI))
+      return true;
+  return false;
+}
+
+bool InspectMemInstr::hasHazard(const MachineInstr &MI) {
+  if (!MI.mayStore() && !MI.mayLoad())
+    return false;
+
+  if (ForbidMemInstr)
+    return true;
+
+  OrigSeenLoad = SeenLoad;
+  OrigSeenStore = SeenStore;
+  SeenLoad |= MI.mayLoad();
+  SeenStore |= MI.mayStore();
+
+  // If MI is an ordered or volatile memory reference, disallow moving
+  // subsequent loads and stores to delay slot.
+  if (MI.hasOrderedMemoryRef() && (OrigSeenLoad || OrigSeenStore)) {
+    ForbidMemInstr = true;
+    return true;
+  }
+
+  return hasHazard_(MI);
+}
+
+bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
+  if (MI.mayStore())
+    return true;
+
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getPseudoValue())
+    return true;
+
+  if (const PseudoSourceValue *PSV =
+      (*MI.memoperands_begin())->getPseudoValue()) {
+    if (isa<FixedStackPseudoSourceValue>(PSV))
+      return false;
+    return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack();
+  }
+
+  return true;
+}
+
+MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_)
+  : InspectMemInstr(false), MFI(MFI_), SeenNoObjLoad(false),
+    SeenNoObjStore(false) {}
+
+bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
+  bool HasHazard = false;
+  SmallVector<ValueType, 4> Objs;
+
+  // Check underlying object list.
+  if (getUnderlyingObjects(MI, Objs)) {
+    for (SmallVectorImpl<ValueType>::const_iterator I = Objs.begin();
+         I != Objs.end(); ++I)
+      HasHazard |= updateDefsUses(*I, MI.mayStore());
+
+    return HasHazard;
+  }
+
+  // No underlying objects found.
+  HasHazard = MI.mayStore() && (OrigSeenLoad || OrigSeenStore);
+  HasHazard |= MI.mayLoad() || OrigSeenStore;
+
+  SeenNoObjLoad |= MI.mayLoad();
+  SeenNoObjStore |= MI.mayStore();
+
+  return HasHazard;
+}
+
+bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) {
+  if (MayStore)
+    return !Defs.insert(V) || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad;
+
+  Uses.insert(V);
+  return Defs.count(V) || SeenNoObjStore;
+}
+
+bool MemDefsUses::
+getUnderlyingObjects(const MachineInstr &MI,
+                     SmallVectorImpl<ValueType> &Objects) const {
+  if (!MI.hasOneMemOperand() ||
+      (!(*MI.memoperands_begin())->getValue() &&
+       !(*MI.memoperands_begin())->getPseudoValue()))
+    return false;
+
+  if (const PseudoSourceValue *PSV =
+      (*MI.memoperands_begin())->getPseudoValue()) {
+    if (!PSV->isAliased(MFI))
+      return false;
+    Objects.push_back(PSV);
+    return true;
+  }
+
+  const Value *V = (*MI.memoperands_begin())->getValue();
+
+  SmallVector<Value *, 4> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(V), Objs);
+
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end();
+       I != E; ++I) {
+    if (!isIdentifiedObject(V))
+      return false;
+
+    Objects.push_back(*I);
+  }
+
+  return true;
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
+    if (!hasUnoccupiedSlot(&*I))
+      continue;
+
+    ++FilledSlots;
+    Changed = true;
+
+    // Delay slot filling is disabled at -O0.
+    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+      if (searchBackward(MBB, I))
+        continue;
+
+      if (I->isTerminator()) {
+        if (searchSuccBBs(MBB, I))
+          continue;
+      } else if (searchForward(MBB, I)) {
+        continue;
+      }
+    }
+
+    // Bundle the NOP to the instruction with the delay slot.
+    const MipsInstrInfo *TII =
+      static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+    BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+    MIBundleBuilder(MBB, I, std::next(I, 2));
+  }
+
+  return Changed;
+}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+  return new Filler(tm);
+}
+
+template<typename IterTy>
+bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                         RegDefsUses &RegDU, InspectMemInstr& IM,
+                         IterTy &Filler) const {
+  for (IterTy I = Begin; I != End; ++I) {
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+    if (terminateSearch(*I))
+      break;
+
+    assert((!I->isCall() && !I->isReturn() && !I->isBranch()) &&
+           "Cannot put calls, returns or branches in delay slot.");
+
+    if (delayHasHazard(*I, RegDU, IM))
+      continue;
+
+    if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+      // In NaCl, instructions that must be masked are forbidden in delay slots.
+      // We only check for loads, stores and SP changes.  Calls, returns and
+      // branches are not checked because non-NaCl targets never put them in
+      // delay slots.
+      unsigned AddrIdx;
+      if ((isBasePlusOffsetMemoryAccess(I->getOpcode(), &AddrIdx)
+           && baseRegNeedsLoadStoreMask(I->getOperand(AddrIdx).getReg()))
+          || I->modifiesRegister(Mips::SP, TM.getRegisterInfo()))
+        continue;
+    }
+
+    Filler = I;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableBackwardSearch)
+    return false;
+
+  RegDefsUses RegDU(TM);
+  MemDefsUses MemDU(MBB.getParent()->getFrameInfo());
+  ReverseIter Filler;
+
+  RegDU.init(*Slot);
+
+  if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler))
+    return false;
+
+  MBB.splice(std::next(Slot), &MBB, std::next(Filler).base());
+  MIBundleBuilder(MBB, Slot, std::next(Slot, 2));
+  ++UsefulSlots;
+  return true;
+}
+
+bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+  // Can handle only calls.
+  if (DisableForwardSearch || !Slot->isCall())
+    return false;
+
+  RegDefsUses RegDU(TM);
+  NoMemInstr NM;
+  Iter Filler;
+
+  RegDU.setCallerSaved(*Slot);
+
+  if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Filler))
+    return false;
+
+  MBB.splice(std::next(Slot), &MBB, Filler);
+  MIBundleBuilder(MBB, Slot, std::next(Slot, 2));
+  ++UsefulSlots;
+  return true;
+}
+
+bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableSuccBBSearch)
+    return false;
+
+  MachineBasicBlock *SuccBB = selectSuccBB(MBB);
+
+  if (!SuccBB)
+    return false;
+
+  RegDefsUses RegDU(TM);
+  bool HasMultipleSuccs = false;
+  BB2BrMap BrMap;
+  std::unique_ptr<InspectMemInstr> IM;
+  Iter Filler;
+
+  // Iterate over SuccBB's predecessor list.
+  for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
+       PE = SuccBB->pred_end(); PI != PE; ++PI)
+    if (!examinePred(**PI, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
+      return false;
+
+  // Do not allow moving instructions which have unallocatable register operands
+  // across basic block boundaries.
+  RegDU.setUnallocatableRegs(*MBB.getParent());
+
+  // Only allow moving loads from stack or constants if any of the SuccBB's
+  // predecessors have multiple successors.
+  if (HasMultipleSuccs) {
+    IM.reset(new LoadFromStackOrConst());
+  } else {
+    const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo();
+    IM.reset(new MemDefsUses(MFI));
+  }
+
+  if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Filler))
+    return false;
+
+  insertDelayFiller(Filler, BrMap);
+  addLiveInRegs(Filler, *SuccBB);
+  Filler->eraseFromParent();
+
+  return true;
+}
+
+MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+  if (B.succ_empty())
+    return nullptr;
+
+  // Select the successor with the larget edge weight.
+  auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
+  MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(),
+                                           [&](const MachineBasicBlock *Dst0,
+                                               const MachineBasicBlock *Dst1) {
+    return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1);
+  });
+  return S->isLandingPad() ? nullptr : S;
+}
+
+std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+  MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  SmallVector<MachineOperand, 2> Cond;
+
+  MipsInstrInfo::BranchType R =
+    TII->AnalyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
+
+  if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
+    return std::make_pair(R, nullptr);
+
+  if (R != MipsInstrInfo::BT_CondUncond) {
+    if (!hasUnoccupiedSlot(BranchInstrs[0]))
+      return std::make_pair(MipsInstrInfo::BT_None, nullptr);
+
+    assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
+
+    return std::make_pair(R, BranchInstrs[0]);
+  }
+
+  assert((TrueBB == &Dst) || (FalseBB == &Dst));
+
+  // Examine the conditional branch. See if its slot is occupied.
+  if (hasUnoccupiedSlot(BranchInstrs[0]))
+    return std::make_pair(MipsInstrInfo::BT_Cond, BranchInstrs[0]);
+
+  // If that fails, try the unconditional branch.
+  if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
+    return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
+
+  return std::make_pair(MipsInstrInfo::BT_None, nullptr);
+}
+
+bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                         RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                         BB2BrMap &BrMap) const {
+  std::pair<MipsInstrInfo::BranchType, MachineInstr *> P =
+    getBranch(Pred, Succ);
+
+  // Return if either getBranch wasn't able to analyze the branches or there
+  // were no branches with unoccupied slots.
+  if (P.first == MipsInstrInfo::BT_None)
+    return false;
+
+  if ((P.first != MipsInstrInfo::BT_Uncond) &&
+      (P.first != MipsInstrInfo::BT_NoBranch)) {
+    HasMultipleSuccs = true;
+    RegDU.addLiveOut(Pred, Succ);
+  }
+
+  BrMap[&Pred] = P.second;
+  return true;
+}
+
+bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                            InspectMemInstr &IM) const {
+  bool HasHazard = (Candidate.isImplicitDef() || Candidate.isKill());
+
+  HasHazard |= IM.hasHazard(Candidate);
+  HasHazard |= RegDU.update(Candidate, 0, Candidate.getNumOperands());
+
+  return HasHazard;
+}
+
+bool Filler::terminateSearch(const MachineInstr &Candidate) const {
+  return (Candidate.isTerminator() || Candidate.isCall() ||
+          Candidate.isPosition() || Candidate.isInlineAsm() ||
+          Candidate.hasUnmodeledSideEffects());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
new file mode 100644
index 0000000..617801b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -0,0 +1,400 @@
+//===-- MipsastISel.cpp - Mips FastISel implementation
+//---------------------===//
+
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+// All possible address modes.
+typedef struct Address {
+  enum { RegBase, FrameIndexBase } BaseType;
+
+  union {
+    unsigned Reg;
+    int FI;
+  } Base;
+
+  int64_t Offset;
+
+  // Innocuous defaults for our address.
+  Address() : BaseType(RegBase), Offset(0) { Base.Reg = 0; }
+} Address;
+
+class MipsFastISel final : public FastISel {
+
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  Module &M;
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  const MipsSubtarget *Subtarget;
+  MipsFunctionInfo *MFI;
+
+  // Convenience variables to avoid some queries.
+  LLVMContext *Context;
+
+  bool TargetSupported;
+
+public:
+  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo),
+        M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+        TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()),
+        TLI(*TM.getTargetLowering()),
+        Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
+    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
+    Context = &funcInfo.Fn->getContext();
+    TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
+                       (Subtarget->hasMips32r2() && (Subtarget->isABI_O32())));
+  }
+
+  bool TargetSelectInstruction(const Instruction *I) override;
+  unsigned TargetMaterializeConstant(const Constant *C) override;
+
+  bool ComputeAddress(const Value *Obj, Address &Addr);
+
+private:
+  bool EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                unsigned Alignment = 0);
+  bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                 unsigned Alignment = 0);
+  bool SelectLoad(const Instruction *I);
+  bool SelectRet(const Instruction *I);
+  bool SelectStore(const Instruction *I);
+
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isLoadTypeLegal(Type *Ty, MVT &VT);
+
+  unsigned MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned MaterializeGV(const GlobalValue *GV, MVT VT);
+  unsigned MaterializeInt(const Constant *C, MVT VT);
+  unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+
+  // for some reason, this default is not generated by tablegen
+  // so we explicitly generate it here.
+  //
+  unsigned FastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill, uint64_t imm1,
+                             uint64_t imm2, unsigned Op3, bool Op3IsKill) {
+    return 0;
+  }
+
+  MachineInstrBuilder EmitInst(unsigned Opc) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+  }
+
+  MachineInstrBuilder EmitInst(unsigned Opc, unsigned DstReg) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                   DstReg);
+  }
+
+  MachineInstrBuilder EmitInstStore(unsigned Opc, unsigned SrcReg,
+                                    unsigned MemReg, int64_t MemOffset) {
+    return EmitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
+  }
+
+  MachineInstrBuilder EmitInstLoad(unsigned Opc, unsigned DstReg,
+                                      unsigned MemReg, int64_t MemOffset) {
+    return EmitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
+  }
+
+#include "MipsGenFastISel.inc"
+};
+
+bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple())
+    return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT))
+    return true;
+  // We will extend this in a later patch:
+  //   If this is a type than can be sign or zero-extended to a basic operation
+  //   go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16)
+    return true;
+  return false;
+}
+
+bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+  // This construct looks a big awkward but it is how other ports handle this
+  // and as this function is more fully completed, these cases which
+  // return false will have additional code in them.
+  //
+  if (isa<Instruction>(Obj))
+    return false;
+  else if (isa<ConstantExpr>(Obj))
+    return false;
+  Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+bool MipsFastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                            unsigned Alignment) {
+  //
+  // more cases will be handled here in following patches.
+  //
+  unsigned Opc;
+  switch (VT.SimpleTy) {
+  case MVT::i32: {
+    ResultReg = createResultReg(&Mips::GPR32RegClass);
+    Opc = Mips::LW;
+    break;
+  }
+  case MVT::i16: {
+    ResultReg = createResultReg(&Mips::GPR32RegClass);
+    Opc = Mips::LHu;
+    break;
+  }
+  case MVT::i8: {
+    ResultReg = createResultReg(&Mips::GPR32RegClass);
+    Opc = Mips::LBu;
+    break;
+  }
+  case MVT::f32: {
+    ResultReg = createResultReg(&Mips::FGR32RegClass);
+    Opc = Mips::LWC1;
+    break;
+  }
+  case MVT::f64: {
+    ResultReg = createResultReg(&Mips::AFGR64RegClass);
+    Opc = Mips::LDC1;
+    break;
+  }
+  default:
+    return false;
+  }
+  EmitInstLoad(Opc, ResultReg, Addr.Base.Reg, Addr.Offset);
+  return true;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return MaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return MaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return MaterializeInt(C, VT);
+
+  return 0;
+}
+
+bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                             unsigned Alignment) {
+  //
+  // more cases will be handled here in following patches.
+  //
+  unsigned Opc;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    Opc = Mips::SB;
+    break;
+  case MVT::i16:
+    Opc = Mips::SH;
+    break;
+  case MVT::i32:
+    Opc = Mips::SW;
+    break;
+  case MVT::f32:
+    Opc = Mips::SWC1;
+    break;
+  case MVT::f64:
+    Opc = Mips::SDC1;
+    break;
+  default:
+    return false;
+  }
+  EmitInstStore(Opc, SrcReg, Addr.Base.Reg, Addr.Offset);
+  return true;
+}
+
+bool MipsFastISel::SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg;
+  if (!EmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool MipsFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!EmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+bool MipsFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+  if (Ret->getNumOperands() > 0) {
+    return false;
+  }
+  EmitInst(Mips::RetRA);
+  return true;
+}
+
+bool MipsFastISel::TargetSelectInstruction(const Instruction *I) {
+  if (!TargetSupported)
+    return false;
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Load:
+    return SelectLoad(I);
+  case Instruction::Store:
+    return SelectStore(I);
+  case Instruction::Ret:
+    return SelectRet(I);
+  }
+  return false;
+}
+}
+
+unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+  if (VT == MVT::f32) {
+    const TargetRegisterClass *RC = &Mips::FGR32RegClass;
+    unsigned DestReg = createResultReg(RC);
+    unsigned TempReg = Materialize32BitInt(Imm, &Mips::GPR32RegClass);
+    EmitInst(Mips::MTC1, DestReg).addReg(TempReg);
+    return DestReg;
+  } else if (VT == MVT::f64) {
+    const TargetRegisterClass *RC = &Mips::AFGR64RegClass;
+    unsigned DestReg = createResultReg(RC);
+    unsigned TempReg1 = Materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass);
+    unsigned TempReg2 =
+        Materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass);
+    EmitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1);
+    return DestReg;
+  }
+  return 0;
+}
+
+unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  unsigned DestReg = createResultReg(RC);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  // TLS not supported at this time.
+  if (IsThreadLocal)
+    return 0;
+  EmitInst(Mips::LW, DestReg).addReg(MFI->getGlobalBaseReg()).addGlobalAddress(
+      GV, 0, MipsII::MO_GOT);
+  return DestReg;
+}
+unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) {
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  int64_t Imm;
+  if (CI->isNegative())
+    Imm = CI->getSExtValue();
+  else
+    Imm = CI->getZExtValue();
+  return Materialize32BitInt(Imm, RC);
+}
+
+unsigned MipsFastISel::Materialize32BitInt(int64_t Imm,
+                                           const TargetRegisterClass *RC) {
+  unsigned ResultReg = createResultReg(RC);
+
+  if (isInt<16>(Imm)) {
+    unsigned Opc = Mips::ADDiu;
+    EmitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm);
+    return ResultReg;
+  } else if (isUInt<16>(Imm)) {
+    EmitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm);
+    return ResultReg;
+  }
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+  if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    EmitInst(Mips::LUi, TmpReg).addImm(Hi);
+    EmitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo);
+  } else {
+    EmitInst(Mips::LUi, ResultReg).addImm(Hi);
+  }
+  return ResultReg;
+}
+
+namespace llvm {
+FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
+                               const TargetLibraryInfo *libInfo) {
+  return new MipsFastISel(funcInfo, libInfo);
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
new file mode 100644
index 0000000..61afe17
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@@ -0,0 +1,133 @@
+//===-- MipsFrameLowering.cpp - Mips Frame Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated decrementing the stack pointer on
+// the first instruction of a function prologue. Once decremented,
+// all stack references are done thought a positive offset
+// from the stack/frame pointer, so the stack is considering
+// to grow up! Otherwise terrible hacks would have to be made
+// to get this stack ABI compliant :)
+//
+//  The stack frame required by the ABI (after call):
+//  Offset
+//
+//  0                 ----------
+//  4                 Args to pass
+//  .                 saved $GP  (used in PIC)
+//  .                 Alloca allocations
+//  .                 Local Area
+//  .                 CPU "Callee Saved" Registers
+//  .                 saved FP
+//  .                 saved RA
+//  .                 FPU "Callee Saved" Registers
+//  StackSize         -----------
+//
+// Offset - offset from sp after stack allocation on function prologue
+//
+// The sp is the stack pointer subtracted/added from the stack size
+// at the Prologue/Epilogue
+//
+// References to the previous stack (to obtain arguments) are done
+// with offsets that exceeds the stack size: (stacksize+(4*(num_arg-1))
+//
+// Examples:
+// - reference to the actual stack frame
+//   for any local area var there is smt like : FI >= 0, StackOffset: 4
+//     sw REGX, 4(SP)
+//
+// - reference to previous stack frame
+//   suppose there's a load to the 5th arguments : FI < 0, StackOffset: 16.
+//   The emitted instruction will be something like:
+//     lw REGX, 16+StackSize(SP)
+//
+// Since the total stack size is unknown on LowerFormalArguments, all
+// stack references (ObjectOffset) created to reference the function
+// arguments, are negative numbers. This way, on eliminateFrameIndex it's
+// possible to detect those references and the offsets are adjusted to
+// their real location.
+//
+//===----------------------------------------------------------------------===//
+
+const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) {
+  if (ST.inMips16Mode())
+    return llvm::createMips16FrameLowering(ST);
+
+  return llvm::createMipsSEFrameLowering(ST);
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+      MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+
+  int64_t Offset = 0;
+
+  // Iterate over fixed sized objects.
+  for (int I = MFI->getObjectIndexBegin(); I != 0; ++I)
+    Offset = std::max(Offset, -MFI->getObjectOffset(I));
+
+  // Conservatively assume all callee-saved registers will be saved.
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
+    unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
+    Offset = RoundUpToAlignment(Offset + Size, Size);
+  }
+
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Check that MaxAlign is not zero if there is a stack object that is not a
+  // callee-saved spill.
+  assert(!MFI->getObjectIndexEnd() || MaxAlign);
+
+  // Iterate over other objects.
+  for (unsigned I = 0, E = MFI->getObjectIndexEnd(); I != E; ++I)
+    Offset = RoundUpToAlignment(Offset + MFI->getObjectSize(I), MaxAlign);
+
+  // Call frame.
+  if (MFI->adjustsStack() && hasReservedCallFrame(MF))
+    Offset = RoundUpToAlignment(Offset + MFI->getMaxCallFrameSize(),
+                                std::max(MaxAlign, getStackAlignment()));
+
+  return RoundUpToAlignment(Offset, getStackAlignment());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
new file mode 100644
index 0000000..9d59309
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
@@ -0,0 +1,45 @@
+//===-- MipsFrameLowering.h - Define frame lowering for Mips ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_FRAMEINFO_H
+#define MIPS_FRAMEINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class MipsSubtarget;
+
+class MipsFrameLowering : public TargetFrameLowering {
+protected:
+  const MipsSubtarget &STI;
+
+public:
+  explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment)
+    : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {}
+
+  static const MipsFrameLowering *create(const MipsSubtarget &ST);
+
+  bool hasFP(const MachineFunction &MF) const override;
+
+protected:
+  uint64_t estimateStackSize(const MachineFunction &MF) const;
+};
+
+/// Create MipsFrameLowering objects.
+const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST);
+const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 0000000..0bdabf3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,241 @@
+//===-- MipsISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+
+bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
+
+  processFunctionAfterISel(MF);
+
+  return Ret;
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
+  return CurDAG->getRegister(GlobalBaseReg,
+                             getTargetLowering()->getPointerTy()).getNode();
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                     SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                                    SDValue &Offset, SDValue &Alias) {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return nullptr;
+  }
+
+  // See if subclasses can handle this node.
+  std::pair<bool, SDNode*> Ret = selectNode(Node);
+
+  if (Ret.first)
+    return Ret.second;
+
+  switch(Opcode) {
+  default: break;
+
+  // Get target GOT address.
+  case ISD::GLOBAL_OFFSET_TABLE:
+    return getGlobalBaseReg();
+
+#ifndef NDEBUG
+  case ISD::LOAD:
+  case ISD::STORE:
+    assert((Subtarget->systemSupportsUnalignedAccess() ||
+            cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+            cast<MemSDNode>(Node)->getAlignment()) &&
+           "Unexpected unaligned loads/stores.");
+    break;
+#endif
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == nullptr || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+  return ResNode;
+}
+
+bool MipsDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16ISelDag(TM);
+
+  return llvm::createMipsSEISelDag(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
new file mode 100644
index 0000000..52f4c0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -0,0 +1,135 @@
+//===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSISELDAGTODAG_H
+#define MIPSISELDAGTODAG_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class MipsDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit MipsDAGToDAGISel(MipsTargetMachine &TM)
+      : SelectionDAGISel(TM), Subtarget(nullptr) {}
+
+  // Pass Name
+  const char *getPassName() const override {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+protected:
+  SDNode *getGlobalBaseReg();
+
+  /// Keep a pointer to the MipsSubtarget around so that we can make the right
+  /// decision when generating code for different targets.
+  const MipsSubtarget *Subtarget;
+
+private:
+  // Include the pieces autogenerated from the target description.
+  #include "MipsGenDAGISel.inc"
+
+  // Complex Pattern.
+  /// (reg + imm).
+  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  // Complex Pattern.
+  /// (reg + reg).
+  virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  /// Fall back on this function if all else fails.
+  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  /// Match integer address pattern.
+  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+                             SDValue &Offset) const;
+
+  virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const;
+
+  /// Match addr+simm10 and addr
+  virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                            SDValue &Offset, SDValue &Alias);
+
+  /// \brief Select constant vector splats.
+  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
+  SDNode *Select(SDNode *N) override;
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
+
+  // getImm - Return a target constant with the specified value.
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+    return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
+  }
+
+  virtual void processFunctionAfterISel(MachineFunction &MF) = 0;
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+};
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 0000000..40dc8e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,3715 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsISelLowering.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool>
+LargeGOT("mxgot", cl::Hidden,
+         cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
+
+static cl::opt<bool>
+NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
+               cl::desc("MIPS: Don't trap on integer division by zero."),
+               cl::init(false));
+
+cl::opt<bool>
+EnableMipsFastISel("mips-fast-isel", cl::Hidden,
+  cl::desc("Allow mips-fast-isel to be used"),
+  cl::init(false));
+
+static const MCPhysReg O32IntRegs[4] = {
+  Mips::A0, Mips::A1, Mips::A2, Mips::A3
+};
+
+static const MCPhysReg Mips64IntRegs[8] = {
+  Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+  Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64
+};
+
+static const MCPhysReg Mips64DPRegs[8] = {
+  Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
+  Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
+};
+
+// If I is a shifted mask, set the size (Size) and the first bit of the
+// mask (Pos), and return true.
+// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
+static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+  if (!isShiftedMask_64(I))
+    return false;
+
+  Size = CountPopulation_64(I);
+  Pos = countTrailingZeros(I);
+  return true;
+}
+
+SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
+  MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
+  return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
+}
+
+SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+                                   N->getOffset(), Flag);
+}
+
+const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  case MipsISD::JmpLink:           return "MipsISD::JmpLink";
+  case MipsISD::TailCall:          return "MipsISD::TailCall";
+  case MipsISD::Hi:                return "MipsISD::Hi";
+  case MipsISD::Lo:                return "MipsISD::Lo";
+  case MipsISD::GPRel:             return "MipsISD::GPRel";
+  case MipsISD::ThreadPointer:     return "MipsISD::ThreadPointer";
+  case MipsISD::Ret:               return "MipsISD::Ret";
+  case MipsISD::EH_RETURN:         return "MipsISD::EH_RETURN";
+  case MipsISD::FPBrcond:          return "MipsISD::FPBrcond";
+  case MipsISD::FPCmp:             return "MipsISD::FPCmp";
+  case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
+  case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
+  case MipsISD::TruncIntFP:        return "MipsISD::TruncIntFP";
+  case MipsISD::MFHI:              return "MipsISD::MFHI";
+  case MipsISD::MFLO:              return "MipsISD::MFLO";
+  case MipsISD::MTLOHI:            return "MipsISD::MTLOHI";
+  case MipsISD::Mult:              return "MipsISD::Mult";
+  case MipsISD::Multu:             return "MipsISD::Multu";
+  case MipsISD::MAdd:              return "MipsISD::MAdd";
+  case MipsISD::MAddu:             return "MipsISD::MAddu";
+  case MipsISD::MSub:              return "MipsISD::MSub";
+  case MipsISD::MSubu:             return "MipsISD::MSubu";
+  case MipsISD::DivRem:            return "MipsISD::DivRem";
+  case MipsISD::DivRemU:           return "MipsISD::DivRemU";
+  case MipsISD::DivRem16:          return "MipsISD::DivRem16";
+  case MipsISD::DivRemU16:         return "MipsISD::DivRemU16";
+  case MipsISD::BuildPairF64:      return "MipsISD::BuildPairF64";
+  case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
+  case MipsISD::Wrapper:           return "MipsISD::Wrapper";
+  case MipsISD::Sync:              return "MipsISD::Sync";
+  case MipsISD::Ext:               return "MipsISD::Ext";
+  case MipsISD::Ins:               return "MipsISD::Ins";
+  case MipsISD::LWL:               return "MipsISD::LWL";
+  case MipsISD::LWR:               return "MipsISD::LWR";
+  case MipsISD::SWL:               return "MipsISD::SWL";
+  case MipsISD::SWR:               return "MipsISD::SWR";
+  case MipsISD::LDL:               return "MipsISD::LDL";
+  case MipsISD::LDR:               return "MipsISD::LDR";
+  case MipsISD::SDL:               return "MipsISD::SDL";
+  case MipsISD::SDR:               return "MipsISD::SDR";
+  case MipsISD::EXTP:              return "MipsISD::EXTP";
+  case MipsISD::EXTPDP:            return "MipsISD::EXTPDP";
+  case MipsISD::EXTR_S_H:          return "MipsISD::EXTR_S_H";
+  case MipsISD::EXTR_W:            return "MipsISD::EXTR_W";
+  case MipsISD::EXTR_R_W:          return "MipsISD::EXTR_R_W";
+  case MipsISD::EXTR_RS_W:         return "MipsISD::EXTR_RS_W";
+  case MipsISD::SHILO:             return "MipsISD::SHILO";
+  case MipsISD::MTHLIP:            return "MipsISD::MTHLIP";
+  case MipsISD::MULT:              return "MipsISD::MULT";
+  case MipsISD::MULTU:             return "MipsISD::MULTU";
+  case MipsISD::MADD_DSP:          return "MipsISD::MADD_DSP";
+  case MipsISD::MADDU_DSP:         return "MipsISD::MADDU_DSP";
+  case MipsISD::MSUB_DSP:          return "MipsISD::MSUB_DSP";
+  case MipsISD::MSUBU_DSP:         return "MipsISD::MSUBU_DSP";
+  case MipsISD::SHLL_DSP:          return "MipsISD::SHLL_DSP";
+  case MipsISD::SHRA_DSP:          return "MipsISD::SHRA_DSP";
+  case MipsISD::SHRL_DSP:          return "MipsISD::SHRL_DSP";
+  case MipsISD::SETCC_DSP:         return "MipsISD::SETCC_DSP";
+  case MipsISD::SELECT_CC_DSP:     return "MipsISD::SELECT_CC_DSP";
+  case MipsISD::VALL_ZERO:         return "MipsISD::VALL_ZERO";
+  case MipsISD::VANY_ZERO:         return "MipsISD::VANY_ZERO";
+  case MipsISD::VALL_NONZERO:      return "MipsISD::VALL_NONZERO";
+  case MipsISD::VANY_NONZERO:      return "MipsISD::VANY_NONZERO";
+  case MipsISD::VCEQ:              return "MipsISD::VCEQ";
+  case MipsISD::VCLE_S:            return "MipsISD::VCLE_S";
+  case MipsISD::VCLE_U:            return "MipsISD::VCLE_U";
+  case MipsISD::VCLT_S:            return "MipsISD::VCLT_S";
+  case MipsISD::VCLT_U:            return "MipsISD::VCLT_U";
+  case MipsISD::VSMAX:             return "MipsISD::VSMAX";
+  case MipsISD::VSMIN:             return "MipsISD::VSMIN";
+  case MipsISD::VUMAX:             return "MipsISD::VUMAX";
+  case MipsISD::VUMIN:             return "MipsISD::VUMIN";
+  case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
+  case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
+  case MipsISD::VNOR:              return "MipsISD::VNOR";
+  case MipsISD::VSHF:              return "MipsISD::VSHF";
+  case MipsISD::SHF:               return "MipsISD::SHF";
+  case MipsISD::ILVEV:             return "MipsISD::ILVEV";
+  case MipsISD::ILVOD:             return "MipsISD::ILVOD";
+  case MipsISD::ILVL:              return "MipsISD::ILVL";
+  case MipsISD::ILVR:              return "MipsISD::ILVR";
+  case MipsISD::PCKEV:             return "MipsISD::PCKEV";
+  case MipsISD::PCKOD:             return "MipsISD::PCKOD";
+  case MipsISD::INSVE:             return "MipsISD::INSVE";
+  default:                         return nullptr;
+  }
+}
+
+MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM,
+                                       const MipsSubtarget &STI)
+    : TargetLowering(TM, new MipsTargetObjectFile()), Subtarget(STI) {
+  // Mips does not have i1 type, so use i32 for
+  // setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+  // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA
+  // does. Integer booleans still use 0 and 1.
+  if (Subtarget.hasMips32r6())
+    setBooleanContents(ZeroOrOneBooleanContent,
+                       ZeroOrNegativeOneBooleanContent);
+
+  // Load extented operations for i1 types must be promoted
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+
+  // MIPS doesn't have extending float->double load/store
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // Used by legalize types to correctly generate the setcc result.
+  // Without this, every float setcc comes with a AND/OR with the result,
+  // we don't want this, since the fpcmp result goes to a flag register,
+  // which is used implicitly by brcond and select operations.
+  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+  // Mips Custom Operations
+  setOperationAction(ISD::BR_JT,              MVT::Other, Custom);
+  setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress,       MVT::i32,   Custom);
+  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
+  setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,       MVT::i32,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::f32,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::f64,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::i32,   Custom);
+  setOperationAction(ISD::SELECT_CC,          MVT::f32,   Custom);
+  setOperationAction(ISD::SELECT_CC,          MVT::f64,   Custom);
+  setOperationAction(ISD::SETCC,              MVT::f32,   Custom);
+  setOperationAction(ISD::SETCC,              MVT::f64,   Custom);
+  setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::FCOPYSIGN,          MVT::f32,   Custom);
+  setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
+  setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
+
+  if (Subtarget.isGP64bit()) {
+    setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
+    setOperationAction(ISD::BlockAddress,       MVT::i64,   Custom);
+    setOperationAction(ISD::GlobalTLSAddress,   MVT::i64,   Custom);
+    setOperationAction(ISD::JumpTable,          MVT::i64,   Custom);
+    setOperationAction(ISD::ConstantPool,       MVT::i64,   Custom);
+    setOperationAction(ISD::SELECT,             MVT::i64,   Custom);
+    setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
+    setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
+  }
+
+  if (!Subtarget.isGP64bit()) {
+    setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Custom);
+    setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Custom);
+    setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Custom);
+  }
+
+  setOperationAction(ISD::ADD,                MVT::i32,   Custom);
+  if (Subtarget.isGP64bit())
+    setOperationAction(ISD::ADD,                MVT::i64,   Custom);
+
+  setOperationAction(ISD::SDIV, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIV, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Operations not directly supported by Mips.
+  setOperationAction(ISD::BR_CC,             MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i64,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::i32,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::i64,   Expand);
+  setOperationAction(ISD::UINT_TO_FP,        MVT::i32,   Expand);
+  setOperationAction(ISD::UINT_TO_FP,        MVT::i64,   Expand);
+  setOperationAction(ISD::FP_TO_UINT,        MVT::i32,   Expand);
+  setOperationAction(ISD::FP_TO_UINT,        MVT::i64,   Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,    Expand);
+  if (Subtarget.hasCnMips()) {
+    setOperationAction(ISD::CTPOP,           MVT::i32,   Legal);
+    setOperationAction(ISD::CTPOP,           MVT::i64,   Legal);
+  } else {
+    setOperationAction(ISD::CTPOP,           MVT::i32,   Expand);
+    setOperationAction(ISD::CTPOP,           MVT::i64,   Expand);
+  }
+  setOperationAction(ISD::CTTZ,              MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ,              MVT::i64,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::i64,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,   MVT::i32,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,   MVT::i64,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i32,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i64,   Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,  Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64,  Expand);
+
+  if (!Subtarget.hasMips32r2())
+    setOperationAction(ISD::ROTR, MVT::i32,   Expand);
+
+  if (!Subtarget.hasMips64r2())
+    setOperationAction(ISD::ROTR, MVT::i64,   Expand);
+
+  setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
+  setOperationAction(ISD::FSIN,              MVT::f64,   Expand);
+  setOperationAction(ISD::FCOS,              MVT::f32,   Expand);
+  setOperationAction(ISD::FCOS,              MVT::f64,   Expand);
+  setOperationAction(ISD::FSINCOS,           MVT::f32,   Expand);
+  setOperationAction(ISD::FSINCOS,           MVT::f64,   Expand);
+  setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f64,   Expand);
+  setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
+  setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
+  setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
+  setOperationAction(ISD::FEXP,              MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f64,   Expand);
+  setOperationAction(ISD::FREM,              MVT::f32,   Expand);
+  setOperationAction(ISD::FREM,              MVT::f64,   Expand);
+
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+  setOperationAction(ISD::VAARG,             MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,            MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,             MVT::Other, Expand);
+
+  // Use the default for now
+  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
+
+  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i32,    Expand);
+  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i64,    Expand);
+  setOperationAction(ISD::ATOMIC_STORE,      MVT::i32,    Expand);
+  setOperationAction(ISD::ATOMIC_STORE,      MVT::i64,    Expand);
+
+  setInsertFencesForAtomic(true);
+
+  if (!Subtarget.hasMips32r2()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  }
+
+  // MIPS16 lacks MIPS32's clz and clo instructions.
+  if (!Subtarget.hasMips32() || Subtarget.inMips16Mode())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+  if (!Subtarget.hasMips64())
+    setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+
+  if (!Subtarget.hasMips32r2())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  if (!Subtarget.hasMips64r2())
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  if (Subtarget.isGP64bit()) {
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
+    setTruncStoreAction(MVT::i64, MVT::i32, Custom);
+  }
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  setTargetDAGCombine(ISD::SDIVREM);
+  setTargetDAGCombine(ISD::UDIVREM);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::ADD);
+
+  setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2);
+
+  setStackPointerRegisterToSaveRestore(Subtarget.isABI_N64() ? Mips::SP_64
+                                                             : Mips::SP);
+
+  setExceptionPointerRegister(Subtarget.isABI_N64() ? Mips::A0_64 : Mips::A0);
+  setExceptionSelectorRegister(Subtarget.isABI_N64() ? Mips::A1_64 : Mips::A1);
+
+  MaxStoresPerMemcpy = 16;
+
+  isMicroMips = Subtarget.inMicroMipsMode();
+}
+
+const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM,
+                                                     const MipsSubtarget &STI) {
+  if (STI.inMips16Mode())
+    return llvm::createMips16TargetLowering(TM, STI);
+
+  return llvm::createMipsSETargetLowering(TM, STI);
+}
+
+// Create a fast isel object.
+FastISel *
+MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  if (!EnableMipsFastISel)
+    return TargetLowering::createFastISel(funcInfo, libInfo);
+  return Mips::createFastISel(funcInfo, libInfo);
+}
+
+EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+  unsigned LO = (Ty == MVT::i32) ? Mips::LO0 : Mips::LO0_64;
+  unsigned HI = (Ty == MVT::i32) ? Mips::HI0 : Mips::HI0_64;
+  unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
+                                                  MipsISD::DivRemU16;
+  SDLoc DL(N);
+
+  SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
+                               N->getOperand(0), N->getOperand(1));
+  SDValue InChain = DAG.getEntryNode();
+  SDValue InGlue = DivRem;
+
+  // insert MFLO
+  if (N->hasAnyUseOfValue(0)) {
+    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, DL, LO, Ty,
+                                            InGlue);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
+    InChain = CopyFromLo.getValue(1);
+    InGlue = CopyFromLo.getValue(2);
+  }
+
+  // insert MFHI
+  if (N->hasAnyUseOfValue(1)) {
+    SDValue CopyFromHi = DAG.getCopyFromReg(InChain, DL,
+                                            HI, Ty, InGlue);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
+  }
+
+  return SDValue();
+}
+
+static Mips::CondCode condCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return Mips::FCOND_OEQ;
+  case ISD::SETUNE: return Mips::FCOND_UNE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return Mips::FCOND_OLT;
+  case ISD::SETGT:
+  case ISD::SETOGT: return Mips::FCOND_OGT;
+  case ISD::SETLE:
+  case ISD::SETOLE: return Mips::FCOND_OLE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return Mips::FCOND_OGE;
+  case ISD::SETULT: return Mips::FCOND_ULT;
+  case ISD::SETULE: return Mips::FCOND_ULE;
+  case ISD::SETUGT: return Mips::FCOND_UGT;
+  case ISD::SETUGE: return Mips::FCOND_UGE;
+  case ISD::SETUO:  return Mips::FCOND_UN;
+  case ISD::SETO:   return Mips::FCOND_OR;
+  case ISD::SETNE:
+  case ISD::SETONE: return Mips::FCOND_ONE;
+  case ISD::SETUEQ: return Mips::FCOND_UEQ;
+  }
+}
+
+
+/// This function returns true if the floating point conditional branches and
+/// conditional moves which use condition code CC should be inverted.
+static bool invertFPCondCodeUser(Mips::CondCode CC) {
+  if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
+    return false;
+
+  assert((CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) &&
+         "Illegal Condition Code");
+
+  return true;
+}
+
+// Creates and returns an FPCmp node from a setcc node.
+// Returns Op if setcc is not a floating point comparison.
+static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
+  // must be a SETCC node
+  if (Op.getOpcode() != ISD::SETCC)
+    return Op;
+
+  SDValue LHS = Op.getOperand(0);
+
+  if (!LHS.getValueType().isFloatingPoint())
+    return Op;
+
+  SDValue RHS = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
+  // node if necessary.
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  return DAG.getNode(MipsISD::FPCmp, DL, MVT::Glue, LHS, RHS,
+                     DAG.getConstant(condCodeToFCC(CC), MVT::i32));
+}
+
+// Creates and returns a CMovFPT/F node.
+static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
+                            SDValue False, SDLoc DL) {
+  ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2));
+  bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
+  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
+
+  return DAG.getNode((invert ? MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL,
+                     True.getValueType(), True, FCC0, False, Cond);
+}
+
+static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue SetCC = N->getOperand(0);
+
+  if ((SetCC.getOpcode() != ISD::SETCC) ||
+      !SetCC.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue False = N->getOperand(2);
+  EVT FalseTy = False.getValueType();
+
+  if (!FalseTy.isInteger())
+    return SDValue();
+
+  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(False);
+
+  // If the RHS (False) is 0, we swap the order of the operands
+  // of ISD::SELECT (obviously also inverting the condition) so that we can
+  // take advantage of conditional moves using the $0 register.
+  // Example:
+  //   return (a != 0) ? x : 0;
+  //     load $reg, x
+  //     movz $reg, $0, a
+  if (!FalseC)
+    return SDValue();
+
+  const SDLoc DL(N);
+
+  if (!FalseC->getZExtValue()) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+    SDValue True = N->getOperand(1);
+
+    SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+                         SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+
+    return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
+  }
+
+  // If both operands are integer constants there's a possibility that we
+  // can do some interesting optimizations.
+  SDValue True = N->getOperand(1);
+  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(True);
+
+  if (!TrueC || !True.getValueType().isInteger())
+    return SDValue();
+
+  // We'll also ignore MVT::i64 operands as this optimizations proves
+  // to be ineffective because of the required sign extensions as the result
+  // of a SETCC operator is always MVT::i32 for non-vector types.
+  if (True.getValueType() == MVT::i64)
+    return SDValue();
+
+  int64_t Diff = TrueC->getSExtValue() - FalseC->getSExtValue();
+
+  // 1)  (a < x) ? y : y-1
+  //  slti $reg1, a, x
+  //  addiu $reg2, $reg1, y-1
+  if (Diff == 1)
+    return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, False);
+
+  // 2)  (a < x) ? y-1 : y
+  //  slti $reg1, a, x
+  //  xor $reg1, $reg1, 1
+  //  addiu $reg2, $reg1, y-1
+  if (Diff == -1) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+    SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+                         SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+    return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True);
+  }
+
+  // Couldn't optimize.
+  return SDValue();
+}
+
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  // Pattern match EXT.
+  //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
+  //  => ext $dst, $src, size, pos
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
+    return SDValue();
+
+  SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
+  unsigned ShiftRightOpc = ShiftRight.getOpcode();
+
+  // Op's first operand must be a shift right.
+  if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL)
+    return SDValue();
+
+  // The second operand of the shift must be an immediate.
+  ConstantSDNode *CN;
+  if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
+    return SDValue();
+
+  uint64_t Pos = CN->getZExtValue();
+  uint64_t SMPos, SMSize;
+
+  // Op's second operand must be a shifted mask.
+  if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+    return SDValue();
+
+  // Return if the shifted mask does not start at bit 0 or the sum of its size
+  // and Pos exceeds the word's size.
+  EVT ValTy = N->getValueType(0);
+  if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
+    return SDValue();
+
+  return DAG.getNode(MipsISD::Ext, SDLoc(N), ValTy,
+                     ShiftRight.getOperand(0), DAG.getConstant(Pos, MVT::i32),
+                     DAG.getConstant(SMSize, MVT::i32));
+}
+
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget &Subtarget) {
+  // Pattern match INS.
+  //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
+  //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1
+  //  => ins $dst, $src, size, pos, $src1
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
+    return SDValue();
+
+  SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
+  uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
+  ConstantSDNode *CN;
+
+  // See if Op's first operand matches (and $src1 , mask0).
+  if (And0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
+      !isShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
+    return SDValue();
+
+  // See if Op's second operand matches (and (shl $src, pos), mask1).
+  if (And1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+    return SDValue();
+
+  // The shift masks must have the same position and size.
+  if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+    return SDValue();
+
+  SDValue Shl = And1.getOperand(0);
+  if (Shl.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+    return SDValue();
+
+  unsigned Shamt = CN->getZExtValue();
+
+  // Return if the shift amount and the first bit position of mask are not the
+  // same.
+  EVT ValTy = N->getValueType(0);
+  if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+    return SDValue();
+
+  return DAG.getNode(MipsISD::Ins, SDLoc(N), ValTy, Shl.getOperand(0),
+                     DAG.getConstant(SMPos0, MVT::i32),
+                     DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0));
+}
+
+static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue Add = N->getOperand(1);
+
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue Lo = Add.getOperand(1);
+
+  if ((Lo.getOpcode() != MipsISD::Lo) ||
+      (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable))
+    return SDValue();
+
+  EVT ValTy = N->getValueType(0);
+  SDLoc DL(N);
+
+  SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
+                             Add.getOperand(0));
+  return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo);
+}
+
+SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
+  const {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Opc = N->getOpcode();
+
+  switch (Opc) {
+  default: break;
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    return performDivRemCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:
+    return performSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    return performANDCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, DAG, DCI, Subtarget);
+  case ISD::ADD:
+    return performADDCombine(N, DAG, DCI, Subtarget);
+  }
+
+  return SDValue();
+}
+
+void
+MipsTargetLowering::LowerOperationWrapper(SDNode *N,
+                                          SmallVectorImpl<SDValue> &Results,
+                                          SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+  for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+    Results.push_back(Res.getValue(I));
+}
+
+void
+MipsTargetLowering::ReplaceNodeResults(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG) const {
+  return LowerOperationWrapper(N, Results, DAG);
+}
+
+SDValue MipsTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const
+{
+  switch (Op.getOpcode())
+  {
+  case ISD::BR_JT:              return lowerBR_JT(Op, DAG);
+  case ISD::BRCOND:             return lowerBRCOND(Op, DAG);
+  case ISD::ConstantPool:       return lowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return lowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return lowerBlockAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return lowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return lowerJumpTable(Op, DAG);
+  case ISD::SELECT:             return lowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:          return lowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:              return lowerSETCC(Op, DAG);
+  case ISD::VASTART:            return lowerVASTART(Op, DAG);
+  case ISD::FCOPYSIGN:          return lowerFCOPYSIGN(Op, DAG);
+  case ISD::FRAMEADDR:          return lowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:         return lowerRETURNADDR(Op, DAG);
+  case ISD::EH_RETURN:          return lowerEH_RETURN(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return lowerATOMIC_FENCE(Op, DAG);
+  case ISD::SHL_PARTS:          return lowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:          return lowerShiftRightParts(Op, DAG, true);
+  case ISD::SRL_PARTS:          return lowerShiftRightParts(Op, DAG, false);
+  case ISD::LOAD:               return lowerLOAD(Op, DAG);
+  case ISD::STORE:              return lowerSTORE(Op, DAG);
+  case ISD::ADD:                return lowerADD(Op, DAG);
+  case ISD::FP_TO_SINT:         return lowerFP_TO_SINT(Op, DAG);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// addLiveIn - This helper function adds the specified physical register to the
+// MachineFunction as a live in value.  It also creates a corresponding
+// virtual register for it.
+static unsigned
+addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
+{
+  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+  MF.getRegInfo().addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+static MachineBasicBlock *insertDivByZeroTrap(MachineInstr *MI,
+                                              MachineBasicBlock &MBB,
+                                              const TargetInstrInfo &TII,
+                                              bool Is64Bit) {
+  if (NoZeroDivCheck)
+    return &MBB;
+
+  // Insert instruction "teq $divisor_reg, $zero, 7".
+  MachineBasicBlock::iterator I(MI);
+  MachineInstrBuilder MIB;
+  MachineOperand &Divisor = MI->getOperand(2);
+  MIB = BuildMI(MBB, std::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
+    .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+    .addReg(Mips::ZERO).addImm(7);
+
+  // Use the 32-bit sub-register if this is a 64-bit division.
+  if (Is64Bit)
+    MIB->getOperand(0).setSubReg(Mips::sub_32);
+
+  // Clear Divisor's kill flag.
+  Divisor.setIsKill(false);
+
+  // We would normally delete the original instruction here but in this case
+  // we only needed to inject an additional instruction rather than replace it.
+
+  return &MBB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instr type to insert");
+  case Mips::ATOMIC_LOAD_ADD_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
+
+  case Mips::ATOMIC_LOAD_AND_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::AND64);
+
+  case Mips::ATOMIC_LOAD_OR_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::OR64);
+
+  case Mips::ATOMIC_LOAD_XOR_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
+
+  case Mips::ATOMIC_LOAD_NAND_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I32:
+    return emitAtomicBinary(MI, BB, 4, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I64:
+    return emitAtomicBinary(MI, BB, 8, 0, true);
+
+  case Mips::ATOMIC_LOAD_SUB_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+
+  case Mips::ATOMIC_SWAP_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, 0);
+  case Mips::ATOMIC_SWAP_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, 0);
+  case Mips::ATOMIC_SWAP_I32:
+    return emitAtomicBinary(MI, BB, 4, 0);
+  case Mips::ATOMIC_SWAP_I64:
+    return emitAtomicBinary(MI, BB, 8, 0);
+
+  case Mips::ATOMIC_CMP_SWAP_I8:
+    return emitAtomicCmpSwapPartword(MI, BB, 1);
+  case Mips::ATOMIC_CMP_SWAP_I16:
+    return emitAtomicCmpSwapPartword(MI, BB, 2);
+  case Mips::ATOMIC_CMP_SWAP_I32:
+    return emitAtomicCmpSwap(MI, BB, 4);
+  case Mips::ATOMIC_CMP_SWAP_I64:
+    return emitAtomicCmpSwap(MI, BB, 8);
+  case Mips::PseudoSDIV:
+  case Mips::PseudoUDIV:
+  case Mips::DIV:
+  case Mips::DIVU:
+  case Mips::MOD:
+  case Mips::MODU:
+    return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
+                               false);
+  case Mips::PseudoDSDIV:
+  case Mips::PseudoDUDIV:
+  case Mips::DDIV:
+  case Mips::DDIVU:
+  case Mips::DMOD:
+  case Mips::DMODU:
+    return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
+                               true);
+  case Mips::SEL_D:
+    return emitSEL_D(MI, BB);
+  }
+}
+
+// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
+// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                     unsigned Size, unsigned BinOpcode,
+                                     bool Nand) const {
+  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LL, SC, AND, NOR, ZERO, BEQ;
+
+  if (Size == 4) {
+    if (isMicroMips) {
+      LL = Mips::LL_MM;
+      SC = Mips::SC_MM;
+    } else {
+      LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL;
+      SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC;
+    }
+    AND = Mips::AND;
+    NOR = Mips::NOR;
+    ZERO = Mips::ZERO;
+    BEQ = Mips::BEQ;
+  } else {
+    LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+    SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+    AND = Mips::AND64;
+    NOR = Mips::NOR64;
+    ZERO = Mips::ZERO_64;
+    BEQ = Mips::BEQ64;
+  }
+
+  unsigned OldVal = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(exitMBB);
+
+  //  loopMBB:
+  //    ll oldval, 0(ptr)
+  //    <binop> storeval, oldval, incr
+  //    sc success, storeval, 0(ptr)
+  //    beq success, $0, loopMBB
+  BB = loopMBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+  if (Nand) {
+    //  and andres, oldval, incr
+    //  nor storeval, $0, andres
+    BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
+    BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
+  } else if (BinOpcode) {
+    //  <binop> storeval, oldval, incr
+    BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
+  } else {
+    StoreVal = Incr;
+  }
+  BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
+
+  MI->eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
+    MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
+    unsigned SrcReg) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Subtarget.hasMips32r2() && Size == 1) {
+    BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  if (Subtarget.hasMips32r2() && Size == 2) {
+    BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  unsigned ScrReg = RegInfo.createVirtualRegister(RC);
+
+  assert(Size < 32);
+  int64_t ShiftImm = 32 - (Size * 8);
+
+  BuildMI(BB, DL, TII->get(Mips::SLL), ScrReg).addReg(SrcReg).addImm(ShiftImm);
+  BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm);
+
+  return BB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
+    MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+    bool Nand) const {
+  assert((Size == 1 || Size == 2) &&
+         "Unsupported size for EmitAtomicBinaryPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned NewVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, sinkMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  //  thisMBB:
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
+  //    nor     mask2,$0,mask
+  //    sll     incr2,incr,shiftamt
+
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
+    .addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  if (Subtarget.isLittle()) {
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  } else {
+    unsigned Off = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+  }
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+    .addReg(MaskUpper).addReg(ShiftAmt);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
+
+  // atomic.load.binop
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   binop   binopres,oldval,incr2
+  //   and     newval,binopres,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  // atomic.swap
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   and     newval,incr2,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  BB = loopMBB;
+  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
+  if (Nand) {
+    //  and andres, oldval, incr2
+    //  nor binopres, $0, andres
+    //  and newval, binopres, mask
+    BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
+      .addReg(Mips::ZERO).addReg(AndRes);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else if (BinOpcode) {
+    //  <binop> binopres, oldval, incr2
+    //  and newval, binopres, mask
+    BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else { // atomic.swap
+    //  and newval, incr2, mask
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
+  }
+
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal0).addReg(NewVal);
+  BuildMI(BB, DL, TII->get(Mips::SC), Success)
+    .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
+    .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
+
+  //  sinkMBB:
+  //    and     maskedoldval1,oldval,mask
+  //    srl     srlres,maskedoldval1,shiftamt
+  //    sign_extend dest,srlres
+  BB = sinkMBB;
+
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+      .addReg(MaskedOldVal1).addReg(ShiftAmt);
+  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+
+  MI->eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
+                                                          MachineBasicBlock *BB,
+                                                          unsigned Size) const {
+  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LL, SC, ZERO, BNE, BEQ;
+
+  if (Size == 4) {
+    LL = isMicroMips ? Mips::LL_MM : Mips::LL;
+    SC = isMicroMips ? Mips::SC_MM : Mips::SC;
+    ZERO = Mips::ZERO;
+    BNE = Mips::BNE;
+    BEQ = Mips::BEQ;
+  } else {
+    LL = Mips::LLD;
+    SC = Mips::SCD;
+    ZERO = Mips::ZERO_64;
+    BNE = Mips::BNE64;
+    BEQ = Mips::BEQ64;
+  }
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned OldVal  = MI->getOperand(2).getReg();
+  unsigned NewVal  = MI->getOperand(3).getReg();
+
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(exitMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(exitMBB);
+
+  // loop1MBB:
+  //   ll dest, 0(ptr)
+  //   bne dest, oldval, exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BNE))
+    .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
+
+  // loop2MBB:
+  //   sc success, newval, 0(ptr)
+  //   beq success, $0, loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, DL, TII->get(SC), Success)
+    .addReg(NewVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ))
+    .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
+
+  MI->eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned Size) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicCmpSwapPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned CmpVal  = MI->getOperand(2).getReg();
+  unsigned NewVal  = MI->getOperand(3).getReg();
+
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, sinkMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(sinkMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  // FIXME: computation of newval2 can be moved to loop2MBB.
+  //  thisMBB:
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
+  //    nor     mask2,$0,mask
+  //    andi    maskedcmpval,cmpval,255
+  //    sll     shiftedcmpval,maskedcmpval,shiftamt
+  //    andi    maskednewval,newval,255
+  //    sll     shiftednewval,maskednewval,shiftamt
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
+    .addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  if (Subtarget.isLittle()) {
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  } else {
+    unsigned Off = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+  }
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+    .addReg(MaskUpper).addReg(ShiftAmt);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
+    .addReg(CmpVal).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
+    .addReg(MaskedCmpVal).addReg(ShiftAmt);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
+    .addReg(NewVal).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
+    .addReg(MaskedNewVal).addReg(ShiftAmt);
+
+  //  loop1MBB:
+  //    ll      oldval,0(alginedaddr)
+  //    and     maskedoldval0,oldval,mask
+  //    bne     maskedoldval0,shiftedcmpval,sinkMBB
+  BB = loop1MBB;
+  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::BNE))
+    .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
+
+  //  loop2MBB:
+  //    and     maskedoldval1,oldval,mask2
+  //    or      storeval,maskedoldval1,shiftednewval
+  //    sc      success,storeval,0(alignedaddr)
+  //    beq     success,$0,loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
+  BuildMI(BB, DL, TII->get(Mips::SC), Success)
+      .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
+      .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
+
+  //  sinkMBB:
+  //    srl     srlres,maskedoldval0,shiftamt
+  //    sign_extend dest,srlres
+  BB = sinkMBB;
+
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+      .addReg(MaskedOldVal0).addReg(ShiftAmt);
+  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock::iterator II(MI);
+
+  unsigned Fc = MI->getOperand(1).getReg();
+  const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID);
+
+  unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass);
+
+  BuildMI(*BB, II, DL, TII->get(Mips::SUBREG_TO_REG), Fc2)
+      .addImm(0)
+      .addReg(Fc)
+      .addImm(Mips::sub_lo);
+
+  // We don't erase the original instruction, we just replace the condition
+  // register with the 64-bit super-register.
+  MI->getOperand(1).setReg(Fc2);
+
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc DL(Op);
+  EVT PTy = getPointerTy();
+  unsigned EntrySize =
+    DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout());
+
+  Index = DAG.getNode(ISD::MUL, DL, PTy, Index,
+                      DAG.getConstant(EntrySize, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table);
+
+  EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+  Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
+                        MachinePointerInfo::getJumpTable(), MemVT, false, false,
+                        0);
+  Chain = Addr.getValue(1);
+
+  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ||
+      Subtarget.isABI_N64()) {
+    // For PIC, the sequence is:
+    // BRIND(load(Jumptable + index) + RelocBase)
+    // RelocBase can be JumpTable, GOT or some sort of global base.
+    Addr = DAG.getNode(ISD::ADD, DL, PTy, Addr,
+                       getPICJumpTableRelocBase(Table, DAG));
+  }
+
+  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, Addr);
+}
+
+SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  // The first operand is the chain, the second is the condition, the third is
+  // the block to branch to if the condition is true.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dest = Op.getOperand(2);
+  SDLoc DL(Op);
+
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+  SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
+
+  // Return if flag is not set by a floating point comparison.
+  if (CondRes.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  SDValue CCNode  = CondRes.getOperand(2);
+  Mips::CondCode CC =
+    (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
+  unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T;
+  SDValue BrCode = DAG.getConstant(Opc, MVT::i32);
+  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
+  return DAG.getNode(MipsISD::FPBrcond, DL, Op.getValueType(), Chain, BrCode,
+                     FCC0, Dest, CondRes);
+}
+
+SDValue MipsTargetLowering::
+lowerSELECT(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+  SDValue Cond = createFPCmp(DAG, Op.getOperand(0));
+
+  // Return if flag is not set by a floating point comparison.
+  if (Cond.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  return createCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
+                      SDLoc(Op));
+}
+
+SDValue MipsTargetLowering::
+lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc DL(Op);
+  EVT Ty = Op.getOperand(0).getValueType();
+  SDValue Cond = DAG.getNode(ISD::SETCC, DL,
+                             getSetCCResultType(*DAG.getContext(), Ty),
+                             Op.getOperand(0), Op.getOperand(1),
+                             Op.getOperand(4));
+
+  return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2),
+                     Op.getOperand(3));
+}
+
+SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+  SDValue Cond = createFPCmp(DAG, Op);
+
+  assert(Cond.getOpcode() == MipsISD::FPCmp &&
+         "Floating point operand expected.");
+
+  SDValue True  = DAG.getConstant(1, MVT::i32);
+  SDValue False = DAG.getConstant(0, MVT::i32);
+
+  return createCMovFP(DAG, Cond, True, False, SDLoc(Op));
+}
+
+SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // FIXME there isn't actually debug info here
+  SDLoc DL(Op);
+  EVT Ty = Op.getValueType();
+  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = N->getGlobal();
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+      !Subtarget.isABI_N64()) {
+    const MipsTargetObjectFile &TLOF =
+      (const MipsTargetObjectFile&)getObjFileLowering();
+
+    // %gp_rel relocation
+    if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+      SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+                                              MipsII::MO_GPREL);
+      SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, DL,
+                                      DAG.getVTList(MVT::i32), GA);
+      SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
+      return DAG.getNode(ISD::ADD, DL, MVT::i32, GPReg, GPRelNode);
+    }
+
+    // %hi/%lo relocation
+    return getAddrNonPIC(N, Ty, DAG);
+  }
+
+  if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
+    return getAddrLocal(N, Ty, DAG,
+                        Subtarget.isABI_N32() || Subtarget.isABI_N64());
+
+  if (LargeGOT)
+    return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
+                                 MipsII::MO_GOT_LO16, DAG.getEntryNode(),
+                                 MachinePointerInfo::getGOT());
+
+  return getAddrGlobal(N, Ty, DAG,
+                       (Subtarget.isABI_N32() || Subtarget.isABI_N64())
+                           ? MipsII::MO_GOT_DISP
+                           : MipsII::MO_GOT16,
+                       DAG.getEntryNode(), MachinePointerInfo::getGOT());
+}
+
+SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+      !Subtarget.isABI_N64())
+    return getAddrNonPIC(N, Ty, DAG);
+
+  return getAddrLocal(N, Ty, DAG,
+                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
+}
+
+SDValue MipsTargetLowering::
+lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  // If the relocation model is PIC, use the General Dynamic TLS Model or
+  // Local Dynamic TLS model, otherwise use the Initial Exec or
+  // Local Exec TLS Model.
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    // General Dynamic and Local Dynamic TLS Model.
+    unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
+                                                      : MipsII::MO_TLSGD;
+
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
+    SDValue Argument = DAG.getNode(MipsISD::Wrapper, DL, PtrVT,
+                                   getGlobalReg(DAG, PtrVT), TGA);
+    unsigned PtrSize = PtrVT.getSizeInBits();
+    IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);
+
+    SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);
+
+    ArgListTy Args;
+    ArgListEntry Entry;
+    Entry.Node = Argument;
+    Entry.Ty = PtrTy;
+    Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
+      .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args), 0);
+    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+    SDValue Ret = CallResult.first;
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_DTPREL_HI);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_DTPREL_LO);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
+    return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
+  }
+
+  SDValue Offset;
+  if (model == TLSModel::InitialExec) {
+    // Initial Exec TLS Model
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                             MipsII::MO_GOTTPREL);
+    TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
+                      TGA);
+    Offset = DAG.getLoad(PtrVT, DL,
+                         DAG.getEntryNode(), TGA, MachinePointerInfo(),
+                         false, false, false, 0);
+  } else {
+    // Local Exec TLS Model
+    assert(model == TLSModel::LocalExec);
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_TPREL_HI);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_TPREL_LO);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+  }
+
+  SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue MipsTargetLowering::
+lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+{
+  JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+      !Subtarget.isABI_N64())
+    return getAddrNonPIC(N, Ty, DAG);
+
+  return getAddrLocal(N, Ty, DAG,
+                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
+}
+
+SDValue MipsTargetLowering::
+lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  // gp_rel relocation
+  // FIXME: we should reference the constant pool using small data sections,
+  // but the asm printer currently doesn't support this feature without
+  // hacking it. This feature should come soon so we can uncomment the
+  // stuff below.
+  //if (IsInSmallSection(C->getType())) {
+  //  SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
+  //  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
+  //  ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+      !Subtarget.isABI_N64())
+    return getAddrNonPIC(N, Ty, DAG);
+
+  return getAddrLocal(N, Ty, DAG,
+                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
+}
+
+SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+
+  SDLoc DL(Op);
+  SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 getPointerTy());
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
+  EVT TyX = Op.getOperand(0).getValueType();
+  EVT TyY = Op.getOperand(1).getValueType();
+  SDValue Const1 = DAG.getConstant(1, MVT::i32);
+  SDValue Const31 = DAG.getConstant(31, MVT::i32);
+  SDLoc DL(Op);
+  SDValue Res;
+
+  // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
+  // to i32.
+  SDValue X = (TyX == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+                Const1);
+  SDValue Y = (TyY == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(1)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
+                Const1);
+
+  if (HasExtractInsert) {
+    // ext  E, Y, 31, 1  ; extract bit31 of Y
+    // ins  X, E, 31, 1  ; insert extracted bit at bit31 of X
+    SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
+    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, E, Const31, Const1, X);
+  } else {
+    // sll SllX, X, 1
+    // srl SrlX, SllX, 1
+    // srl SrlY, Y, 31
+    // sll SllY, SrlX, 31
+    // or  Or, SrlX, SllY
+    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
+    SDValue SrlX = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
+    SDValue SrlY = DAG.getNode(ISD::SRL, DL, MVT::i32, Y, Const31);
+    SDValue SllY = DAG.getNode(ISD::SHL, DL, MVT::i32, SrlY, Const31);
+    Res = DAG.getNode(ISD::OR, DL, MVT::i32, SrlX, SllY);
+  }
+
+  if (TyX == MVT::f32)
+    return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Res);
+
+  SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                             Op.getOperand(0), DAG.getConstant(0, MVT::i32));
+  return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
+}
+
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
+  unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
+  unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
+  EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
+  SDValue Const1 = DAG.getConstant(1, MVT::i32);
+  SDLoc DL(Op);
+
+  // Bitcast to integer nodes.
+  SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
+  SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
+
+  if (HasExtractInsert) {
+    // ext  E, Y, width(Y) - 1, 1  ; extract bit width(Y)-1 of Y
+    // ins  X, E, width(X) - 1, 1  ; insert extracted bit at bit width(X)-1 of X
+    SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
+                            DAG.getConstant(WidthY - 1, MVT::i32), Const1);
+
+    if (WidthX > WidthY)
+      E = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, E);
+    else if (WidthY > WidthX)
+      E = DAG.getNode(ISD::TRUNCATE, DL, TyX, E);
+
+    SDValue I = DAG.getNode(MipsISD::Ins, DL, TyX, E,
+                            DAG.getConstant(WidthX - 1, MVT::i32), Const1, X);
+    return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), I);
+  }
+
+  // (d)sll SllX, X, 1
+  // (d)srl SrlX, SllX, 1
+  // (d)srl SrlY, Y, width(Y)-1
+  // (d)sll SllY, SrlX, width(Y)-1
+  // or     Or, SrlX, SllY
+  SDValue SllX = DAG.getNode(ISD::SHL, DL, TyX, X, Const1);
+  SDValue SrlX = DAG.getNode(ISD::SRL, DL, TyX, SllX, Const1);
+  SDValue SrlY = DAG.getNode(ISD::SRL, DL, TyY, Y,
+                             DAG.getConstant(WidthY - 1, MVT::i32));
+
+  if (WidthX > WidthY)
+    SrlY = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, SrlY);
+  else if (WidthY > WidthX)
+    SrlY = DAG.getNode(ISD::TRUNCATE, DL, TyX, SrlY);
+
+  SDValue SllY = DAG.getNode(ISD::SHL, DL, TyX, SrlY,
+                             DAG.getConstant(WidthX - 1, MVT::i32));
+  SDValue Or = DAG.getNode(ISD::OR, DL, TyX, SrlX, SllY);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Or);
+}
+
+SDValue
+MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  if (Subtarget.isGP64bit())
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget.hasExtractInsert());
+
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert());
+}
+
+SDValue MipsTargetLowering::
+lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Frame address can only be determined for current frame.");
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), DL,
+                         Subtarget.isABI_N64() ? Mips::FP_64 : Mips::FP, VT);
+  return FrameAddr;
+}
+
+SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Return address can be determined only for current frame.");
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MVT VT = Op.getSimpleValueType();
+  unsigned RA = Subtarget.isABI_N64() ? Mips::RA_64 : Mips::RA;
+  MFI->setReturnAddressIsTaken(true);
+
+  // Return RA, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT);
+}
+
+// An EH_RETURN is the result of lowering llvm.eh.return which in turn is
+// generated from __builtin_eh_return (offset, handler)
+// The effect of this is to adjust the stack pointer by "offset"
+// and then branch to "handler".
+SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+                                                                     const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setCallsEhReturn();
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc DL(Op);
+  EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32;
+
+  // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
+  // EH_RETURN nodes, so that instructions are emitted back-to-back.
+  unsigned OffsetReg = Subtarget.isABI_N64() ? Mips::V1_64 : Mips::V1;
+  unsigned AddrReg = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0;
+  Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
+  Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
+  return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
+                     DAG.getRegister(OffsetReg, Ty),
+                     DAG.getRegister(AddrReg, getPointerTy()),
+                     Chain.getValue(1));
+}
+
+SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // FIXME: Need pseudo-fence for 'singlethread' fences
+  // FIXME: Set SType for weaker fences where supported/appropriate.
+  unsigned SType = 0;
+  SDLoc DL(Op);
+  return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(SType, MVT::i32));
+}
+
+SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+  SDValue Shamt = Op.getOperand(2);
+
+  // if shamt < 32:
+  //  lo = (shl lo, shamt)
+  //  hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
+  // else:
+  //  lo = 0
+  //  hi = (shl lo, shamt[4:0])
+  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                            DAG.getConstant(-1, MVT::i32));
+  SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo,
+                                      DAG.getConstant(1, MVT::i32));
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, ShiftRight1Lo,
+                                     Not);
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Shamt);
+  SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+                             DAG.getConstant(0x20, MVT::i32));
+  Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
+                   DAG.getConstant(0, MVT::i32), ShiftLeftLo);
+  Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+                                                 bool IsSRA) const {
+  SDLoc DL(Op);
+  SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+  SDValue Shamt = Op.getOperand(2);
+
+  // if shamt < 32:
+  //  lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
+  //  if isSRA:
+  //    hi = (sra hi, shamt)
+  //  else:
+  //    hi = (srl hi, shamt)
+  // else:
+  //  if isSRA:
+  //   lo = (sra hi, shamt[4:0])
+  //   hi = (sra hi, 31)
+  //  else:
+  //   lo = (srl hi, shamt[4:0])
+  //   hi = 0
+  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                            DAG.getConstant(-1, MVT::i32));
+  SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+                                     DAG.getConstant(1, MVT::i32));
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, ShiftLeft1Hi, Not);
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+                                     Hi, Shamt);
+  SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+                             DAG.getConstant(0x20, MVT::i32));
+  SDValue Shift31 = DAG.getNode(ISD::SRA, DL, MVT::i32, Hi,
+                                DAG.getConstant(31, MVT::i32));
+  Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftRightHi, Or);
+  Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
+                   IsSRA ? Shift31 : DAG.getConstant(0, MVT::i32),
+                   ShiftRightHi);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
+                            SDValue Chain, SDValue Src, unsigned Offset) {
+  SDValue Ptr = LD->getBasePtr();
+  EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
+  EVT BasePtrVT = Ptr.getValueType();
+  SDLoc DL(LD);
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+
+  if (Offset)
+    Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+                      DAG.getConstant(Offset, BasePtrVT));
+
+  SDValue Ops[] = { Chain, Ptr, Src };
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
+                                 LD->getMemOperand());
+}
+
+// Expand an unaligned 32 or 64-bit integer load node.
+SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  EVT MemVT = LD->getMemoryVT();
+
+  if (Subtarget.systemSupportsUnalignedAccess())
+    return Op;
+
+  // Return if load is aligned or if MemVT is neither i32 nor i64.
+  if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+      ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+    return SDValue();
+
+  bool IsLittle = Subtarget.isLittle();
+  EVT VT = Op.getValueType();
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);
+
+  assert((VT == MVT::i32) || (VT == MVT::i64));
+
+  // Expand
+  //  (set dst, (i64 (load baseptr)))
+  // to
+  //  (set tmp, (ldl (add baseptr, 7), undef))
+  //  (set dst, (ldr baseptr, tmp))
+  if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) {
+    SDValue LDL = createLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef,
+                               IsLittle ? 7 : 0);
+    return createLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL,
+                        IsLittle ? 0 : 7);
+  }
+
+  SDValue LWL = createLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef,
+                             IsLittle ? 3 : 0);
+  SDValue LWR = createLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL,
+                             IsLittle ? 0 : 3);
+
+  // Expand
+  //  (set dst, (i32 (load baseptr))) or
+  //  (set dst, (i64 (sextload baseptr))) or
+  //  (set dst, (i64 (extload baseptr)))
+  // to
+  //  (set tmp, (lwl (add baseptr, 3), undef))
+  //  (set dst, (lwr baseptr, tmp))
+  if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) ||
+      (ExtType == ISD::EXTLOAD))
+    return LWR;
+
+  assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD));
+
+  // Expand
+  //  (set dst, (i64 (zextload baseptr)))
+  // to
+  //  (set tmp0, (lwl (add baseptr, 3), undef))
+  //  (set tmp1, (lwr baseptr, tmp0))
+  //  (set tmp2, (shl tmp1, 32))
+  //  (set dst, (srl tmp2, 32))
+  SDLoc DL(LD);
+  SDValue Const32 = DAG.getConstant(32, MVT::i32);
+  SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
+  SDValue Ops[] = { SRL, LWR.getValue(1) };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
+                             SDValue Chain, unsigned Offset) {
+  SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
+  EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
+  SDLoc DL(SD);
+  SDVTList VTList = DAG.getVTList(MVT::Other);
+
+  if (Offset)
+    Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+                      DAG.getConstant(Offset, BasePtrVT));
+
+  SDValue Ops[] = { Chain, Value, Ptr };
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
+                                 SD->getMemOperand());
+}
+
+// Expand an unaligned 32 or 64-bit integer store node.
+static SDValue lowerUnalignedIntStore(StoreSDNode *SD, SelectionDAG &DAG,
+                                      bool IsLittle) {
+  SDValue Value = SD->getValue(), Chain = SD->getChain();
+  EVT VT = Value.getValueType();
+
+  // Expand
+  //  (store val, baseptr) or
+  //  (truncstore val, baseptr)
+  // to
+  //  (swl val, (add baseptr, 3))
+  //  (swr val, baseptr)
+  if ((VT == MVT::i32) || SD->isTruncatingStore()) {
+    SDValue SWL = createStoreLR(MipsISD::SWL, DAG, SD, Chain,
+                                IsLittle ? 3 : 0);
+    return createStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3);
+  }
+
+  assert(VT == MVT::i64);
+
+  // Expand
+  //  (store val, baseptr)
+  // to
+  //  (sdl val, (add baseptr, 7))
+  //  (sdr val, baseptr)
+  SDValue SDL = createStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0);
+  return createStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
+}
+
+// Lower (store (fp_to_sint $fp) $ptr) to (store (TruncIntFP $fp), $ptr).
+static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
+  SDValue Val = SD->getValue();
+
+  if (Val.getOpcode() != ISD::FP_TO_SINT)
+    return SDValue();
+
+  EVT FPTy = EVT::getFloatingPointVT(Val.getValueSizeInBits());
+  SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy,
+                           Val.getOperand(0));
+
+  return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(),
+                      SD->getPointerInfo(), SD->isVolatile(),
+                      SD->isNonTemporal(), SD->getAlignment());
+}
+
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode *SD = cast<StoreSDNode>(Op);
+  EVT MemVT = SD->getMemoryVT();
+
+  // Lower unaligned integer stores.
+  if (!Subtarget.systemSupportsUnalignedAccess() &&
+      (SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
+      ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
+    return lowerUnalignedIntStore(SD, DAG, Subtarget.isLittle());
+
+  return lowerFP_TO_SINT_STORE(SD, DAG);
+}
+
+SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
+      || cast<ConstantSDNode>
+        (Op->getOperand(0).getOperand(0))->getZExtValue() != 0
+      || Op->getOperand(1).getOpcode() != ISD::FRAME_TO_ARGS_OFFSET)
+    return SDValue();
+
+  // The pattern
+  //   (add (frameaddr 0), (frame_to_args_offset))
+  // results from lowering llvm.eh.dwarf.cfa intrinsic. Transform it to
+  //   (add FrameObject, 0)
+  // where FrameObject is a fixed StackObject with offset 0 which points to
+  // the old stack pointer.
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ValTy = Op->getValueType(0);
+  int FI = MFI->CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
+  SDValue InArgsAddr = DAG.getFrameIndex(FI, ValTy);
+  return DAG.getNode(ISD::ADD, SDLoc(Op), ValTy, InArgsAddr,
+                     DAG.getConstant(0, ValTy));
+}
+
+SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
+  SDValue Trunc = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Op), FPTy,
+                              Op.getOperand(0));
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Implement a generic logic using tblgen that can support this.
+// Mips O32 ABI rules:
+// ---
+// i32 - Passed in A0, A1, A2, A3 and stack
+// f32 - Only passed in f32 registers if no int reg has been used yet to hold
+//       an argument. Otherwise, passed in A1, A2, A3 and stack.
+// f64 - Only passed in two aliased f32 registers if no int reg has been used
+//       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
+//       not used, it must be shadowed. If only A3 is avaiable, shadow it and
+//       go to stack.
+//
+//  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
+//===----------------------------------------------------------------------===//
+
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                       CCState &State, const MCPhysReg *F64Regs) {
+
+  static const unsigned IntRegsSize = 4, FloatRegsSize = 2;
+
+  static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+  static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
+
+  // Do not process byval args here.
+  if (ArgFlags.isByVal())
+    return true;
+
+  // Promote i8 and i16
+  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  unsigned Reg;
+
+  // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
+  // is true: function is vararg, argument is 3rd or higher, there is previous
+  // argument which is not f32 or f64.
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1
+      || State.getFirstUnallocated(F32Regs, FloatRegsSize) != ValNo;
+  unsigned OrigAlign = ArgFlags.getOrigAlign();
+  bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    // If this is the first part of an i64 arg,
+    // the allocated register must be either A0 or A2.
+    if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+    // Allocate int register and shadow next int register. If first
+    // available register is Mips::A1 or Mips::A3, shadow it too.
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    if (Reg == Mips::A1 || Reg == Mips::A3)
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
+    // we are guaranteed to find an available float register
+    if (ValVT == MVT::f32) {
+      Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+      // Shadow int register
+      State.AllocateReg(IntRegs, IntRegsSize);
+    } else {
+      Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+      // Shadow int registers
+      unsigned Reg2 = State.AllocateReg(IntRegs, IntRegsSize);
+      if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
+        State.AllocateReg(IntRegs, IntRegsSize);
+      State.AllocateReg(IntRegs, IntRegsSize);
+    }
+  } else
+    llvm_unreachable("Cannot handle this ValVT.");
+
+  if (!Reg) {
+    unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() >> 3,
+                                          OrigAlign);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  } else
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+  return false;
+}
+
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+// Return next O32 integer argument register.
+static unsigned getNextIntArgReg(unsigned Reg) {
+  assert((Reg == Mips::A0) || (Reg == Mips::A2));
+  return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
+}
+
+SDValue
+MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
+                                   SDValue Chain, SDValue Arg, SDLoc DL,
+                                   bool IsTailCall, SelectionDAG &DAG) const {
+  if (!IsTailCall) {
+    SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+                                 DAG.getIntPtrConstant(Offset));
+    return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false,
+                        false, 0);
+  }
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
+  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+  return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
+                      /*isVolatile=*/ true, false, 0);
+}
+
+void MipsTargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  // Insert node "GP copy globalreg" before call to function.
+  //
+  // R_MIPS_CALL* operators (emitted when non-internal functions are called
+  // in PIC mode) allow symbols to be resolved via lazy binding.
+  // The lazy binding stub requires GP to point to the GOT.
+  if (IsPICCall && !InternalLinkage) {
+    unsigned GPReg = Subtarget.isABI_N64() ? Mips::GP_64 : Mips::GP;
+    EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32;
+    RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first,
+                                 RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first,
+                                      RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  if (Subtarget.inMips16HardFloat()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F && F->hasFnAttribute("__Mips16RetHelper")) {
+        Mask = MipsRegisterInfo::getMips16RetHelperMask();
+      }
+    }
+  }
+  Ops.push_back(CLI.DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+}
+
+/// LowerCall - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                              SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc DL                              = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &IsTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool IsVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+  bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  MipsCC::SpecialCallingConvType SpecialCallingConv =
+    getSpecialCallingConv(Callee);
+  MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(),
+                    CCInfo, SpecialCallingConv);
+
+  MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
+                                 Subtarget.abiUsesSoftFloat(),
+                                 Callee.getNode(), CLI.getArgs());
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NextStackOffset = CCInfo.getNextStackOffset();
+
+  // Check if it's really possible to do a tail call.
+  if (IsTailCall)
+    IsTailCall =
+      isEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
+                                        *MF.getInfo<MipsFunctionInfo>());
+
+  if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
+  if (IsTailCall)
+    ++NumTailCalls;
+
+  // Chain is the output chain of the last Load/Store or CopyToReg node.
+  // ByValChain is the output chain of the last Memcpy node created for copying
+  // byval arguments to the stack.
+  unsigned StackAlignment = TFL->getStackAlignment();
+  NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
+  SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
+
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+
+  SDValue StackPtr = DAG.getCopyFromReg(
+      Chain, DL, Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP,
+      getPointerTy());
+
+  // With EABI is it possible to have 16 args on registers.
+  std::deque< std::pair<unsigned, SDValue> > RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    SDValue Arg = OutVals[i];
+    CCValAssign &VA = ArgLocs[i];
+    MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // ByVal Arg.
+    if (Flags.isByVal()) {
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValArg != MipsCCInfo.byval_end());
+      assert(!IsTailCall &&
+             "Do not tail-call optimize if there is a byval argument.");
+      passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+                   MipsCCInfo, *ByValArg, Flags, Subtarget.isLittle());
+      ++ByValArg;
+      continue;
+    }
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      if (VA.isRegLoc()) {
+        if ((ValVT == MVT::f32 && LocVT == MVT::i32) ||
+            (ValVT == MVT::f64 && LocVT == MVT::i64) ||
+            (ValVT == MVT::i64 && LocVT == MVT::f64))
+          Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+        else if (ValVT == MVT::f64 && LocVT == MVT::i32) {
+          SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                                   Arg, DAG.getConstant(0, MVT::i32));
+          SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                                   Arg, DAG.getConstant(1, MVT::i32));
+          if (!Subtarget.isLittle())
+            std::swap(Lo, Hi);
+          unsigned LocRegLo = VA.getLocReg();
+          unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+          RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
+          RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
+          continue;
+        }
+      }
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
+      break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    // Register can't get to this point...
+    assert(VA.isMemLoc());
+
+    // emit ISD::STORE whichs stores the
+    // parameter value to a stack Location
+    MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
+                                         Chain, Arg, DL, IsTailCall, DAG));
+  }
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool IsPICCall =
+      (Subtarget.isABI_N64() || IsPIC); // true if calls are translated to
+                                         // jalr $25
+  bool GlobalOrExternal = false, InternalLinkage = false;
+  SDValue CalleeLo;
+  EVT Ty = Callee.getValueType();
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (IsPICCall) {
+      const GlobalValue *Val = G->getGlobal();
+      InternalLinkage = Val->hasInternalLinkage();
+
+      if (InternalLinkage)
+        Callee = getAddrLocal(G, Ty, DAG,
+                              Subtarget.isABI_N32() || Subtarget.isABI_N64());
+      else if (LargeGOT)
+        Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
+                                       MipsII::MO_CALL_LO16, Chain,
+                                       FuncInfo->callPtrInfo(Val));
+      else
+        Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                               FuncInfo->callPtrInfo(Val));
+    } else
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
+                                          MipsII::MO_NO_FLAG);
+    GlobalOrExternal = true;
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+
+    if (!Subtarget.isABI_N64() && !IsPIC) // !N64 && static
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
+                                            MipsII::MO_NO_FLAG);
+    else if (LargeGOT)
+      Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
+                                     MipsII::MO_CALL_LO16, Chain,
+                                     FuncInfo->callPtrInfo(Sym));
+    else // N64 || PIC
+      Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                             FuncInfo->callPtrInfo(Sym));
+
+    GlobalOrExternal = true;
+  }
+
+  SmallVector<SDValue, 8> Ops(1, Chain);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+              CLI, Callee, Chain);
+
+  if (IsTailCall)
+    return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
+
+  Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
+  SDValue InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
+                             DAG.getIntPtrConstant(0, true), InFlag, DL);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg,
+                         Ins, DL, DAG, InVals, CLI.Callee.getNode(), CLI.RetTy);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                    CallingConv::ID CallConv, bool IsVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc DL, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    const SDNode *CallNode,
+                                    const Type *RetTy) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(),
+                    CCInfo);
+
+  MipsCCInfo.analyzeCallResult(Ins, Subtarget.abiUsesSoftFloat(),
+                               CallNode, RetTy);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, RVLocs[i].getLocReg(),
+                                     RVLocs[i].getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getValVT(), Val);
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+/// LowerFormalArguments - transform physical registers into virtual registers
+/// and generate load operations for arguments places on the stack.
+SDValue
+MipsTargetLowering::LowerFormalArguments(SDValue Chain,
+                                         CallingConv::ID CallConv,
+                                         bool IsVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                         SDLoc DL, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setVarArgsFrameIndex(0);
+
+  // Used with vargs to acumulate store chains.
+  std::vector<SDValue> OutChains;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(),
+                    CCInfo);
+  Function::const_arg_iterator FuncArg =
+    DAG.getMachineFunction().getFunction()->arg_begin();
+  bool UseSoftFloat = Subtarget.abiUsesSoftFloat();
+
+  MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
+  MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
+                           MipsCCInfo.hasByValArg());
+
+  unsigned CurArgIdx = 0;
+  MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    std::advance(FuncArg, Ins[i].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[i].OrigArgIndex;
+    EVT ValVT = VA.getValVT();
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    bool IsRegLoc = VA.isRegLoc();
+
+    if (Flags.isByVal()) {
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValArg != MipsCCInfo.byval_end());
+      copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
+                    MipsCCInfo, *ByValArg);
+      ++ByValArg;
+      continue;
+    }
+
+    // Arguments stored on registers
+    if (IsRegLoc) {
+      MVT RegVT = VA.getLocVT();
+      unsigned ArgReg = VA.getLocReg();
+      const TargetRegisterClass *RC = getRegClassFor(RegVT);
+
+      // Transform the arguments stored on
+      // physical registers into virtual ones
+      unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it has been passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      if (VA.getLocInfo() != CCValAssign::Full) {
+        unsigned Opcode = 0;
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          Opcode = ISD::AssertSext;
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          Opcode = ISD::AssertZext;
+        if (Opcode)
+          ArgValue = DAG.getNode(Opcode, DL, RegVT, ArgValue,
+                                 DAG.getValueType(ValVT));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, DL, ValVT, ArgValue);
+      }
+
+      // Handle floating point arguments passed in integer registers and
+      // long double arguments passed in floating point registers.
+      if ((RegVT == MVT::i32 && ValVT == MVT::f32) ||
+          (RegVT == MVT::i64 && ValVT == MVT::f64) ||
+          (RegVT == MVT::f64 && ValVT == MVT::i64))
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
+      else if (Subtarget.isABI_O32() && RegVT == MVT::i32 &&
+               ValVT == MVT::f64) {
+        unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
+                                  getNextIntArgReg(ArgReg), RC);
+        SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
+        if (!Subtarget.isLittle())
+          std::swap(ArgValue, ArgValue2);
+        ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
+                               ArgValue, ArgValue2);
+      }
+
+      InVals.push_back(ArgValue);
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+
+      // The stack pointer offset is relative to the caller stack frame.
+      int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+                                      VA.getLocMemOffset(), true);
+
+      // Create load nodes to retrieve arguments from the stack
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue Load = DAG.getLoad(ValVT, DL, Chain, FIN,
+                                 MachinePointerInfo::getFixedStack(FI),
+                                 false, false, false, 0);
+      InVals.push_back(Load);
+      OutChains.push_back(Load.getValue(1));
+    }
+  }
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // The mips ABIs for returning structs by value requires that we copy
+    // the sret argument into $v0 for the return. Save the argument into
+    // a virtual register so that we can access it from the return points.
+    if (Ins[i].Flags.isSRet()) {
+      unsigned Reg = MipsFI->getSRetReturnReg();
+      if (!Reg) {
+        Reg = MF.getRegInfo().createVirtualRegister(
+            getRegClassFor(Subtarget.isABI_N64() ? MVT::i64 : MVT::i32));
+        MipsFI->setSRetReturnReg(Reg);
+      }
+      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+      break;
+    }
+  }
+
+  if (IsVarArg)
+    writeVarArgRegs(OutChains, MipsCCInfo, Chain, DL, DAG);
+
+  // All stores are grouped in one node to allow the matching between
+  // the size of Ins and InVals. This only happens when on varg functions
+  if (!OutChains.empty()) {
+    OutChains.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool
+MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                   MachineFunction &MF, bool IsVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_Mips);
+}
+
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain,
+                                CallingConv::ID CallConv, bool IsVarArg,
+                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                const SmallVectorImpl<SDValue> &OutVals,
+                                SDLoc DL, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
+                 *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, Subtarget.isABI_O32(), Subtarget.isFP64bit(),
+                    CCInfo);
+
+  // Analyze return values.
+  MipsCCInfo.analyzeReturn(Outs, Subtarget.abiUsesSoftFloat(),
+                           MF.getFunction()->getReturnType());
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue Val = OutVals[i];
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getLocVT(), Val);
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // The mips ABIs for returning structs by value requires that we copy
+  // the sret argument into $v0 for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into $v0.
+  if (MF.getFunction()->hasStructRetAttr()) {
+    MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+    unsigned Reg = MipsFI->getSRetReturnReg();
+
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in the entry block");
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+    unsigned V0 = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0;
+
+    Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(V0, getPointerTy()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  // Return on Mips is always a "jr $ra"
+  return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//                           Mips Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MipsTargetLowering::ConstraintType MipsTargetLowering::
+getConstraintType(const std::string &Constraint) const
+{
+  // Mips specific constraints
+  // GCC config/mips/constraints.md
+  //
+  // 'd' : An address register. Equivalent to r
+  //       unless generating MIPS16 code.
+  // 'y' : Equivalent to r; retained for
+  //       backwards compatibility.
+  // 'c' : A register suitable for use in an indirect
+  //       jump. This will always be $25 for -mabicalls.
+  // 'l' : The lo register. 1 word storage.
+  // 'x' : The hilo register pair. Double word storage.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      default : break;
+      case 'd':
+      case 'y':
+      case 'f':
+      case 'c':
+      case 'l':
+      case 'x':
+        return C_RegisterClass;
+      case 'R':
+        return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+MipsTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'd':
+  case 'y':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f': // FPU or MSA register
+    if (Subtarget.hasMSA() && type->isVectorTy() &&
+        cast<VectorType>(type)->getBitWidth() == 128)
+      weight = CW_Register;
+    else if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  case 'c': // $25 for indirect jumps
+  case 'l': // lo register
+  case 'x': // hilo register pair
+    if (type->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+  case 'O': // signed 15 bit immediate (+- 16383)
+  case 'P': // immediate in the range of 65535 to 1 (inclusive)
+    if (isa<ConstantInt>(CallOperandVal))
+      weight = CW_Constant;
+    break;
+  case 'R':
+    weight = CW_Memory;
+    break;
+  }
+  return weight;
+}
+
+/// This is a helper function to parse a physical register string and split it
+/// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag
+/// that is returned indicates whether parsing was successful. The second flag
+/// is true if the numeric part exists.
+static std::pair<bool, bool>
+parsePhysicalReg(const StringRef &C, std::string &Prefix,
+                 unsigned long long &Reg) {
+  if (C.front() != '{' || C.back() != '}')
+    return std::make_pair(false, false);
+
+  // Search for the first numeric character.
+  StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
+  I = std::find_if(B, E, std::ptr_fun(isdigit));
+
+  Prefix.assign(B, I - B);
+
+  // The second flag is set to false if no numeric characters were found.
+  if (I == E)
+    return std::make_pair(true, false);
+
+  // Parse the numeric characters.
+  return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
+                        true);
+}
+
+std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
+parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterClass *RC;
+  std::string Prefix;
+  unsigned long long Reg;
+
+  std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
+
+  if (!R.first)
+    return std::make_pair(0U, nullptr);
+
+  if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
+    // No numeric characters follow "hi" or "lo".
+    if (R.second)
+      return std::make_pair(0U, nullptr);
+
+    RC = TRI->getRegClass(Prefix == "hi" ?
+                          Mips::HI32RegClassID : Mips::LO32RegClassID);
+    return std::make_pair(*(RC->begin()), RC);
+  } else if (Prefix.compare(0, 4, "$msa") == 0) {
+    // Parse $msa(ir|csr|access|save|modify|request|map|unmap)
+
+    // No numeric characters follow the name.
+    if (R.second)
+      return std::make_pair(0U, nullptr);
+
+    Reg = StringSwitch<unsigned long long>(Prefix)
+              .Case("$msair", Mips::MSAIR)
+              .Case("$msacsr", Mips::MSACSR)
+              .Case("$msaaccess", Mips::MSAAccess)
+              .Case("$msasave", Mips::MSASave)
+              .Case("$msamodify", Mips::MSAModify)
+              .Case("$msarequest", Mips::MSARequest)
+              .Case("$msamap", Mips::MSAMap)
+              .Case("$msaunmap", Mips::MSAUnmap)
+              .Default(0);
+
+    if (!Reg)
+      return std::make_pair(0U, nullptr);
+
+    RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
+    return std::make_pair(Reg, RC);
+  }
+
+  if (!R.second)
+    return std::make_pair(0U, nullptr);
+
+  if (Prefix == "$f") { // Parse $f0-$f31.
+    // If the size of FP registers is 64-bit or Reg is an even number, select
+    // the 64-bit register class. Otherwise, select the 32-bit register class.
+    if (VT == MVT::Other)
+      VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+
+    RC = getRegClassFor(VT);
+
+    if (RC == &Mips::AFGR64RegClass) {
+      assert(Reg % 2 == 0);
+      Reg >>= 1;
+    }
+  } else if (Prefix == "$fcc") // Parse $fcc0-$fcc7.
+    RC = TRI->getRegClass(Mips::FCCRegClassID);
+  else if (Prefix == "$w") { // Parse $w0-$w31.
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
+  } else { // Parse $0-$31.
+    assert(Prefix == "$");
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
+  }
+
+  assert(Reg < RC->getNumRegs());
+  return std::make_pair(*(RC->begin() + Reg), RC);
+}
+
+/// Given a register class constraint, like 'r', if this corresponds directly
+/// to an LLVM register class, return a register of 0 and the register class
+/// pointer.
+std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
+{
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
+    case 'y': // Same as 'r'. Exists for compatibility.
+    case 'r':
+      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+        if (Subtarget.inMips16Mode())
+          return std::make_pair(0U, &Mips::CPU16RegsRegClass);
+        return std::make_pair(0U, &Mips::GPR32RegClass);
+      }
+      if (VT == MVT::i64 && !Subtarget.isGP64bit())
+        return std::make_pair(0U, &Mips::GPR32RegClass);
+      if (VT == MVT::i64 && Subtarget.isGP64bit())
+        return std::make_pair(0U, &Mips::GPR64RegClass);
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
+    case 'f': // FPU or MSA register
+      if (VT == MVT::v16i8)
+        return std::make_pair(0U, &Mips::MSA128BRegClass);
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
+        return std::make_pair(0U, &Mips::MSA128HRegClass);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return std::make_pair(0U, &Mips::MSA128WRegClass);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return std::make_pair(0U, &Mips::MSA128DRegClass);
+      else if (VT == MVT::f32)
+        return std::make_pair(0U, &Mips::FGR32RegClass);
+      else if ((VT == MVT::f64) && (!Subtarget.isSingleFloat())) {
+        if (Subtarget.isFP64bit())
+          return std::make_pair(0U, &Mips::FGR64RegClass);
+        return std::make_pair(0U, &Mips::AFGR64RegClass);
+      }
+      break;
+    case 'c': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass);
+      assert(VT == MVT::i64 && "Unexpected type.");
+      return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
+    case 'l': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
+      return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
+    case 'x': // register suitable for indirect jump
+      // Fixme: Not triggering the use of both hi and low
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
+    }
+  }
+
+  std::pair<unsigned, const TargetRegisterClass *> R;
+  R = parseRegForInlineAsmConstraint(Constraint, VT);
+
+  if (R.second)
+    return R;
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break; // This will fall through to the generic implementation
+  case 'I': // Signed 16 bit constant
+    // If this fails, the parent routine will give an error
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if (isInt<16>(Val)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'J': // integer zero
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getZExtValue();
+      if (Val == 0) {
+        Result = DAG.getTargetConstant(0, Type);
+        break;
+      }
+    }
+    return;
+  case 'K': // unsigned 16 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      uint64_t Val = (uint64_t)C->getZExtValue();
+      if (isUInt<16>(Val)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)){
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((Val >= -65535) && (Val <= -1)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'O': // signed 15 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((isInt<15>(Val))) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((Val <= 65535) && (Val >= 1)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                               Type *Ty) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AM.Scale) {
+  case 0: // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (!AM.HasBaseReg) // allow "r+i".
+      break;
+    return false; // disallow "r+r" or "r+r+i".
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+bool
+MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Mips target isn't yet aware of offsets.
+  return false;
+}
+
+EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                            unsigned SrcAlign,
+                                            bool IsMemset, bool ZeroMemset,
+                                            bool MemcpyStrSrc,
+                                            MachineFunction &MF) const {
+  if (Subtarget.hasMips64())
+    return MVT::i64;
+
+  return MVT::i32;
+}
+
+bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return false;
+  if (Imm.isNegZero())
+    return false;
+  return Imm.isZero();
+}
+
+unsigned MipsTargetLowering::getJumpTableEncoding() const {
+  if (Subtarget.isABI_N64())
+    return MachineJumpTableInfo::EK_GPRel64BlockAddress;
+
+  return TargetLowering::getJumpTableEncoding();
+}
+
+/// This function returns true if CallSym is a long double emulation routine.
+static bool isF128SoftLibCall(const char *CallSym) {
+  const char *const LibCalls[] =
+    {"__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", "__extendsftf2",
+     "__fixtfdi", "__fixtfsi", "__fixtfti", "__fixunstfdi", "__fixunstfsi",
+     "__fixunstfti", "__floatditf", "__floatsitf", "__floattitf",
+     "__floatunditf", "__floatunsitf", "__floatuntitf", "__getf2", "__gttf2",
+     "__letf2", "__lttf2", "__multf3", "__netf2", "__powitf2", "__subtf3",
+     "__trunctfdf2", "__trunctfsf2", "__unordtf2",
+     "ceill", "copysignl", "cosl", "exp2l", "expl", "floorl", "fmal", "fmodl",
+     "log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
+     "truncl"};
+
+  const char *const *End = LibCalls + array_lengthof(LibCalls);
+
+  // Check that LibCalls is sorted alphabetically.
+  MipsTargetLowering::LTStr Comp;
+
+#ifndef NDEBUG
+  for (const char *const *I = LibCalls; I < End - 1; ++I)
+    assert(Comp(*I, *(I + 1)));
+#endif
+
+  return std::binary_search(LibCalls, End, CallSym, Comp);
+}
+
+/// This function returns true if Ty is fp128 or i128 which was originally a
+/// fp128.
+static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
+  if (Ty->isFP128Ty())
+    return true;
+
+  const ExternalSymbolSDNode *ES =
+    dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
+
+  // If the Ty is i128 and the function being called is a long double emulation
+  // routine, then the original type is f128.
+  return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
+}
+
+MipsTargetLowering::MipsCC::SpecialCallingConvType
+  MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const {
+  MipsCC::SpecialCallingConvType SpecialCallingConv =
+    MipsCC::NoSpecialCallingConv;
+  if (Subtarget.inMips16HardFloat()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F && F->hasFnAttribute("__Mips16RetHelper")) {
+        SpecialCallingConv = MipsCC::Mips16RetHelperConv;
+      }
+    }
+  }
+  return SpecialCallingConv;
+}
+
+MipsTargetLowering::MipsCC::MipsCC(
+  CallingConv::ID CC, bool IsO32_, bool IsFP64_, CCState &Info,
+  MipsCC::SpecialCallingConvType SpecialCallingConv_)
+  : CCInfo(Info), CallConv(CC), IsO32(IsO32_), IsFP64(IsFP64_),
+    SpecialCallingConv(SpecialCallingConv_){
+  // Pre-allocate reserved argument area.
+  CCInfo.AllocateStack(reservedArgArea(), 1);
+}
+
+
+void MipsTargetLowering::MipsCC::
+analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
+                    bool IsVarArg, bool IsSoftFloat, const SDNode *CallNode,
+                    std::vector<ArgListEntry> &FuncArgs) {
+  assert((CallConv != CallingConv::Fast || !IsVarArg) &&
+         "CallingConv::Fast shouldn't be used for vararg functions.");
+
+  unsigned NumOpnds = Args.size();
+  llvm::CCAssignFn *FixedFn = fixedArgFn(), *VarFn = varArgFn();
+
+  for (unsigned I = 0; I != NumOpnds; ++I) {
+    MVT ArgVT = Args[I].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+    bool R;
+
+    if (ArgFlags.isByVal()) {
+      handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
+      continue;
+    }
+
+    if (IsVarArg && !Args[I].IsFixed)
+      R = VarFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+    else {
+      MVT RegVT = getRegVT(ArgVT, FuncArgs[Args[I].OrigArgIndex].Ty, CallNode,
+                           IsSoftFloat);
+      R = FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo);
+    }
+
+    if (R) {
+#ifndef NDEBUG
+      dbgs() << "Call operand #" << I << " has unhandled type "
+             << EVT(ArgVT).getEVTString();
+#endif
+      llvm_unreachable(nullptr);
+    }
+  }
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args,
+                       bool IsSoftFloat, Function::const_arg_iterator FuncArg) {
+  unsigned NumArgs = Args.size();
+  llvm::CCAssignFn *FixedFn = fixedArgFn();
+  unsigned CurArgIdx = 0;
+
+  for (unsigned I = 0; I != NumArgs; ++I) {
+    MVT ArgVT = Args[I].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+    std::advance(FuncArg, Args[I].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Args[I].OrigArgIndex;
+
+    if (ArgFlags.isByVal()) {
+      handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
+      continue;
+    }
+
+    MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), nullptr, IsSoftFloat);
+
+    if (!FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo))
+      continue;
+
+#ifndef NDEBUG
+    dbgs() << "Formal Arg #" << I << " has unhandled type "
+           << EVT(ArgVT).getEVTString();
+#endif
+    llvm_unreachable(nullptr);
+  }
+}
+
+template<typename Ty>
+void MipsTargetLowering::MipsCC::
+analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
+              const SDNode *CallNode, const Type *RetTy) const {
+  CCAssignFn *Fn;
+
+  if (IsSoftFloat && originalTypeIsF128(RetTy, CallNode))
+    Fn = RetCC_F128Soft;
+  else
+    Fn = RetCC_Mips;
+
+  for (unsigned I = 0, E = RetVals.size(); I < E; ++I) {
+    MVT VT = RetVals[I].VT;
+    ISD::ArgFlagsTy Flags = RetVals[I].Flags;
+    MVT RegVT = this->getRegVT(VT, RetTy, CallNode, IsSoftFloat);
+
+    if (Fn(I, VT, RegVT, CCValAssign::Full, Flags, this->CCInfo)) {
+#ifndef NDEBUG
+      dbgs() << "Call result #" << I << " has unhandled type "
+             << EVT(VT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(nullptr);
+    }
+  }
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsSoftFloat,
+                  const SDNode *CallNode, const Type *RetTy) const {
+  analyzeReturn(Ins, IsSoftFloat, CallNode, RetTy);
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
+              const Type *RetTy) const {
+  analyzeReturn(Outs, IsSoftFloat, nullptr, RetTy);
+}
+
+void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
+                                                MVT LocVT,
+                                                CCValAssign::LocInfo LocInfo,
+                                                ISD::ArgFlagsTy ArgFlags) {
+  assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
+
+  struct ByValArgInfo ByVal;
+  unsigned RegSize = regSize();
+  unsigned ByValSize = RoundUpToAlignment(ArgFlags.getByValSize(), RegSize);
+  unsigned Align = std::min(std::max(ArgFlags.getByValAlign(), RegSize),
+                            RegSize * 2);
+
+  if (useRegsForByval())
+    allocateRegs(ByVal, ByValSize, Align);
+
+  // Allocate space on caller's stack.
+  ByVal.Address = CCInfo.AllocateStack(ByValSize - RegSize * ByVal.NumRegs,
+                                       Align);
+  CCInfo.addLoc(CCValAssign::getMem(ValNo, ValVT, ByVal.Address, LocVT,
+                                    LocInfo));
+  ByValArgs.push_back(ByVal);
+}
+
+unsigned MipsTargetLowering::MipsCC::numIntArgRegs() const {
+  return IsO32 ? array_lengthof(O32IntRegs) : array_lengthof(Mips64IntRegs);
+}
+
+unsigned MipsTargetLowering::MipsCC::reservedArgArea() const {
+  return (IsO32 && (CallConv != CallingConv::Fast)) ? 16 : 0;
+}
+
+const MCPhysReg *MipsTargetLowering::MipsCC::intArgRegs() const {
+  return IsO32 ? O32IntRegs : Mips64IntRegs;
+}
+
+llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
+  if (CallConv == CallingConv::Fast)
+    return CC_Mips_FastCC;
+
+  if (SpecialCallingConv == Mips16RetHelperConv)
+    return CC_Mips16RetHelper;
+  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN;
+}
+
+llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
+  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg;
+}
+
+const MCPhysReg *MipsTargetLowering::MipsCC::shadowRegs() const {
+  return IsO32 ? O32IntRegs : Mips64DPRegs;
+}
+
+void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal,
+                                              unsigned ByValSize,
+                                              unsigned Align) {
+  unsigned RegSize = regSize(), NumIntArgRegs = numIntArgRegs();
+  const MCPhysReg *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs();
+  assert(!(ByValSize % RegSize) && !(Align % RegSize) &&
+         "Byval argument's size and alignment should be a multiple of"
+         "RegSize.");
+
+  ByVal.FirstIdx = CCInfo.getFirstUnallocated(IntArgRegs, NumIntArgRegs);
+
+  // If Align > RegSize, the first arg register must be even.
+  if ((Align > RegSize) && (ByVal.FirstIdx % 2)) {
+    CCInfo.AllocateReg(IntArgRegs[ByVal.FirstIdx], ShadowRegs[ByVal.FirstIdx]);
+    ++ByVal.FirstIdx;
+  }
+
+  // Mark the registers allocated.
+  for (unsigned I = ByVal.FirstIdx; ByValSize && (I < NumIntArgRegs);
+       ByValSize -= RegSize, ++I, ++ByVal.NumRegs)
+    CCInfo.AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+}
+
+MVT MipsTargetLowering::MipsCC::getRegVT(MVT VT, const Type *OrigTy,
+                                         const SDNode *CallNode,
+                                         bool IsSoftFloat) const {
+  if (IsSoftFloat || IsO32)
+    return VT;
+
+  // Check if the original type was fp128.
+  if (originalTypeIsF128(OrigTy, CallNode)) {
+    assert(VT == MVT::i64);
+    return MVT::f64;
+  }
+
+  return VT;
+}
+
+void MipsTargetLowering::
+copyByValRegs(SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains,
+              SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+              SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+              const MipsCC &CC, const ByValArgInfo &ByVal) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned RegAreaSize = ByVal.NumRegs * CC.regSize();
+  unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
+  int FrameObjOffset;
+
+  if (RegAreaSize)
+    FrameObjOffset = (int)CC.reservedArgArea() -
+      (int)((CC.numIntArgRegs() - ByVal.FirstIdx) * CC.regSize());
+  else
+    FrameObjOffset = ByVal.Address;
+
+  // Create frame object.
+  EVT PtrTy = getPointerTy();
+  int FI = MFI->CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+  SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+  InVals.push_back(FIN);
+
+  if (!ByVal.NumRegs)
+    return;
+
+  // Copy arg registers.
+  MVT RegTy = MVT::getIntegerVT(CC.regSize() * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+
+  for (unsigned I = 0; I < ByVal.NumRegs; ++I) {
+    unsigned ArgReg = CC.intArgRegs()[ByVal.FirstIdx + I];
+    unsigned VReg = addLiveIn(MF, ArgReg, RC);
+    unsigned Offset = I * CC.regSize();
+    SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
+                                   DAG.getConstant(Offset, PtrTy));
+    SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
+                                 StorePtr, MachinePointerInfo(FuncArg, Offset),
+                                 false, false, 0);
+    OutChains.push_back(Store);
+  }
+}
+
+// Copy byVal arg to registers and stack.
+void MipsTargetLowering::
+passByValArg(SDValue Chain, SDLoc DL,
+             std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+             SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+             MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+             const MipsCC &CC, const ByValArgInfo &ByVal,
+             const ISD::ArgFlagsTy &Flags, bool isLittle) const {
+  unsigned ByValSizeInBytes = Flags.getByValSize();
+  unsigned OffsetInBytes = 0; // From beginning of struct
+  unsigned RegSizeInBytes = CC.regSize();
+  unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
+  EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+
+  if (ByVal.NumRegs) {
+    const MCPhysReg *ArgRegs = CC.intArgRegs();
+    bool LeftoverBytes = (ByVal.NumRegs * RegSizeInBytes > ByValSizeInBytes);
+    unsigned I = 0;
+
+    // Copy words to registers.
+    for (; I < ByVal.NumRegs - LeftoverBytes;
+         ++I, OffsetInBytes += RegSizeInBytes) {
+      SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                    DAG.getConstant(OffsetInBytes, PtrTy));
+      SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
+                                    MachinePointerInfo(), false, false, false,
+                                    Alignment);
+      MemOpChains.push_back(LoadVal.getValue(1));
+      unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
+    }
+
+    // Return if the struct has been fully copied.
+    if (ByValSizeInBytes == OffsetInBytes)
+      return;
+
+    // Copy the remainder of the byval argument with sub-word loads and shifts.
+    if (LeftoverBytes) {
+      assert((ByValSizeInBytes > OffsetInBytes) &&
+             (ByValSizeInBytes < OffsetInBytes + RegSizeInBytes) &&
+             "Size of the remainder should be smaller than RegSizeInBytes.");
+      SDValue Val;
+
+      for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
+           OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
+        unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;
+
+        if (RemainingSizeInBytes < LoadSizeInBytes)
+          continue;
+
+        // Load subword.
+        SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                      DAG.getConstant(OffsetInBytes, PtrTy));
+        SDValue LoadVal = DAG.getExtLoad(
+            ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
+            MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, Alignment);
+        MemOpChains.push_back(LoadVal.getValue(1));
+
+        // Shift the loaded value.
+        unsigned Shamt;
+
+        if (isLittle)
+          Shamt = TotalBytesLoaded * 8;
+        else
+          Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;
+
+        SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
+                                    DAG.getConstant(Shamt, MVT::i32));
+
+        if (Val.getNode())
+          Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
+        else
+          Val = Shift;
+
+        OffsetInBytes += LoadSizeInBytes;
+        TotalBytesLoaded += LoadSizeInBytes;
+        Alignment = std::min(Alignment, LoadSizeInBytes);
+      }
+
+      unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, Val));
+      return;
+    }
+  }
+
+  // Copy remainder of byval arg to it with memcpy.
+  unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes;
+  SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                            DAG.getConstant(OffsetInBytes, PtrTy));
+  SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
+                            DAG.getIntPtrConstant(ByVal.Address));
+  Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy),
+                        Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
+                        MachinePointerInfo(), MachinePointerInfo());
+  MemOpChains.push_back(Chain);
+}
+
+void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+                                         const MipsCC &CC, SDValue Chain,
+                                         SDLoc DL, SelectionDAG &DAG) const {
+  unsigned NumRegs = CC.numIntArgRegs();
+  const MCPhysReg *ArgRegs = CC.intArgRegs();
+  const CCState &CCInfo = CC.getCCInfo();
+  unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs);
+  unsigned RegSize = CC.regSize();
+  MVT RegTy = MVT::getIntegerVT(RegSize * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  // Offset of the first variable argument from stack pointer.
+  int VaArgOffset;
+
+  if (NumRegs == Idx)
+    VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
+  else
+    VaArgOffset = (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+
+  // Record the frame index of the first variable argument
+  // which is a value necessary to VASTART.
+  int FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+  MipsFI->setVarArgsFrameIndex(FI);
+
+  // Copy the integer registers that have not been used for argument passing
+  // to the argument register save area. For O32, the save area is allocated
+  // in the caller's stack frame, while for N32/64, it is allocated in the
+  // callee's stack frame.
+  for (unsigned I = Idx; I < NumRegs; ++I, VaArgOffset += RegSize) {
+    unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
+    SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
+    FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+    SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
+                                 MachinePointerInfo(), false, false, 0);
+    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
+        (Value *)nullptr);
+    OutChains.push_back(Store);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 0000000..10e4e0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,627 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsISELLOWERING_H
+#define MipsISELLOWERING_H
+
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetLowering.h"
+#include <deque>
+#include <string>
+
+namespace llvm {
+  namespace MipsISD {
+    enum NodeType {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Jump and link (call)
+      JmpLink,
+
+      // Tail call
+      TailCall,
+
+      // Get the Higher 16 bits from a 32-bit immediate
+      // No relation with Mips Hi register
+      Hi,
+
+      // Get the Lower 16 bits from a 32-bit immediate
+      // No relation with Mips Lo register
+      Lo,
+
+      // Handle gp_rel (small data/bss sections) relocation.
+      GPRel,
+
+      // Thread Pointer
+      ThreadPointer,
+
+      // Floating Point Branch Conditional
+      FPBrcond,
+
+      // Floating Point Compare
+      FPCmp,
+
+      // Floating Point Conditional Moves
+      CMovFP_T,
+      CMovFP_F,
+
+      // FP-to-int truncation node.
+      TruncIntFP,
+
+      // Return
+      Ret,
+
+      EH_RETURN,
+
+      // Node used to extract integer from accumulator.
+      MFHI,
+      MFLO,
+
+      // Node used to insert integers to accumulator.
+      MTLOHI,
+
+      // Mult nodes.
+      Mult,
+      Multu,
+
+      // MAdd/Sub nodes
+      MAdd,
+      MAddu,
+      MSub,
+      MSubu,
+
+      // DivRem(u)
+      DivRem,
+      DivRemU,
+      DivRem16,
+      DivRemU16,
+
+      BuildPairF64,
+      ExtractElementF64,
+
+      Wrapper,
+
+      DynAlloc,
+
+      Sync,
+
+      Ext,
+      Ins,
+
+      // EXTR.W instrinsic nodes.
+      EXTP,
+      EXTPDP,
+      EXTR_S_H,
+      EXTR_W,
+      EXTR_R_W,
+      EXTR_RS_W,
+      SHILO,
+      MTHLIP,
+
+      // DPA.W intrinsic nodes.
+      MULSAQ_S_W_PH,
+      MAQ_S_W_PHL,
+      MAQ_S_W_PHR,
+      MAQ_SA_W_PHL,
+      MAQ_SA_W_PHR,
+      DPAU_H_QBL,
+      DPAU_H_QBR,
+      DPSU_H_QBL,
+      DPSU_H_QBR,
+      DPAQ_S_W_PH,
+      DPSQ_S_W_PH,
+      DPAQ_SA_L_W,
+      DPSQ_SA_L_W,
+      DPA_W_PH,
+      DPS_W_PH,
+      DPAQX_S_W_PH,
+      DPAQX_SA_W_PH,
+      DPAX_W_PH,
+      DPSX_W_PH,
+      DPSQX_S_W_PH,
+      DPSQX_SA_W_PH,
+      MULSA_W_PH,
+
+      MULT,
+      MULTU,
+      MADD_DSP,
+      MADDU_DSP,
+      MSUB_DSP,
+      MSUBU_DSP,
+
+      // DSP shift nodes.
+      SHLL_DSP,
+      SHRA_DSP,
+      SHRL_DSP,
+
+      // DSP setcc and select_cc nodes.
+      SETCC_DSP,
+      SELECT_CC_DSP,
+
+      // Vector comparisons.
+      // These take a vector and return a boolean.
+      VALL_ZERO,
+      VANY_ZERO,
+      VALL_NONZERO,
+      VANY_NONZERO,
+
+      // These take a vector and return a vector bitmask.
+      VCEQ,
+      VCLE_S,
+      VCLE_U,
+      VCLT_S,
+      VCLT_U,
+
+      // Element-wise vector max/min.
+      VSMAX,
+      VSMIN,
+      VUMAX,
+      VUMIN,
+
+      // Vector Shuffle with mask as an operand
+      VSHF,  // Generic shuffle
+      SHF,   // 4-element set shuffle.
+      ILVEV, // Interleave even elements
+      ILVOD, // Interleave odd elements
+      ILVL,  // Interleave left elements
+      ILVR,  // Interleave right elements
+      PCKEV, // Pack even elements
+      PCKOD, // Pack odd elements
+
+      // Vector Lane Copy
+      INSVE, // Copy element from one vector to another
+
+      // Combined (XOR (OR $a, $b), -1)
+      VNOR,
+
+      // Extended vector element extraction
+      VEXTRACT_SEXT_ELT,
+      VEXTRACT_ZEXT_ELT,
+
+      // Load/Store Left/Right nodes.
+      LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      LWR,
+      SWL,
+      SWR,
+      LDL,
+      LDR,
+      SDL,
+      SDR
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class MipsFunctionInfo;
+  class MipsSubtarget;
+
+  class MipsTargetLowering : public TargetLowering  {
+    bool isMicroMips;
+  public:
+    explicit MipsTargetLowering(MipsTargetMachine &TM,
+                                const MipsSubtarget &STI);
+
+    static const MipsTargetLowering *create(MipsTargetMachine &TM,
+                                            const MipsSubtarget &STI);
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
+
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+    void LowerOperationWrapper(SDNode *N,
+                               SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) const override;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    /// getSetCCResultType - get the ISD::SETCC result ValueType
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *MBB) const override;
+
+    struct LTStr {
+      bool operator()(const char *S1, const char *S2) const {
+        return strcmp(S1, S2) < 0;
+      }
+    };
+
+  protected:
+    SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
+
+    // This method creates the following nodes, which are necessary for
+    // computing a local symbol's address:
+    //
+    // (add (load (wrapper $gp, %got(sym)), %lo(sym))
+    template <class NodeTy>
+    SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                         bool IsN32OrN64) const {
+      SDLoc DL(N);
+      unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+      SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, GOTFlag));
+      SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+                                 MachinePointerInfo::getGOT(), false, false,
+                                 false, 0);
+      unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+      SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
+                               getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address:
+    //
+    // (load (wrapper $gp, %got(sym)))
+    template<class NodeTy>
+    SDValue getAddrGlobal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag, SDValue Chain,
+                          const MachinePointerInfo &PtrInfo) const {
+      SDLoc DL(N);
+      SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, Flag));
+      return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address in large-GOT mode:
+    //
+    // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
+    template<class NodeTy>
+    SDValue getAddrGlobalLargeGOT(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                                  unsigned HiFlag, unsigned LoFlag,
+                                  SDValue Chain,
+                                  const MachinePointerInfo &PtrInfo) const {
+      SDLoc DL(N);
+      SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty,
+                               getTargetNode(N, Ty, DAG, HiFlag));
+      Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+      SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+                                    getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo, false, false, false,
+                         0);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a symbol's address in non-PIC mode:
+    //
+    // (add %hi(sym), %lo(sym))
+    template<class NodeTy>
+    SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
+      SDLoc DL(N);
+      SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+      SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+      return DAG.getNode(ISD::ADD, DL, Ty,
+                         DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+                         DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+    }
+
+    /// This function fills Ops, which is the list of operands that will later
+    /// be used when a function call node is created. It also generates
+    /// copyToReg nodes to set up argument registers.
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+
+    /// ByValArgInfo - Byval argument information.
+    struct ByValArgInfo {
+      unsigned FirstIdx; // Index of the first register used.
+      unsigned NumRegs;  // Number of registers used for this argument.
+      unsigned Address;  // Offset of the stack area used to pass this argument.
+
+      ByValArgInfo() : FirstIdx(0), NumRegs(0), Address(0) {}
+    };
+
+    /// MipsCC - This class provides methods used to analyze formal and call
+    /// arguments and inquire about calling convention information.
+    class MipsCC {
+    public:
+      enum SpecialCallingConvType {
+        Mips16RetHelperConv, NoSpecialCallingConv
+      };
+
+      MipsCC(CallingConv::ID CallConv, bool IsO32, bool IsFP64, CCState &Info,
+             SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
+
+
+      void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               bool IsVarArg, bool IsSoftFloat,
+                               const SDNode *CallNode,
+                               std::vector<ArgListEntry> &FuncArgs);
+      void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  bool IsSoftFloat,
+                                  Function::const_arg_iterator FuncArg);
+
+      void analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                             bool IsSoftFloat, const SDNode *CallNode,
+                             const Type *RetTy) const;
+
+      void analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         bool IsSoftFloat, const Type *RetTy) const;
+
+      const CCState &getCCInfo() const { return CCInfo; }
+
+      /// hasByValArg - Returns true if function has byval arguments.
+      bool hasByValArg() const { return !ByValArgs.empty(); }
+
+      /// regSize - Size (in number of bits) of integer registers.
+      unsigned regSize() const { return IsO32 ? 4 : 8; }
+
+      /// numIntArgRegs - Number of integer registers available for calls.
+      unsigned numIntArgRegs() const;
+
+      /// reservedArgArea - The size of the area the caller reserves for
+      /// register arguments. This is 16-byte if ABI is O32.
+      unsigned reservedArgArea() const;
+
+      /// Return pointer to array of integer argument registers.
+      const MCPhysReg *intArgRegs() const;
+
+      typedef SmallVectorImpl<ByValArgInfo>::const_iterator byval_iterator;
+      byval_iterator byval_begin() const { return ByValArgs.begin(); }
+      byval_iterator byval_end() const { return ByValArgs.end(); }
+
+    private:
+      void handleByValArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                          CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags);
+
+      /// useRegsForByval - Returns true if the calling convention allows the
+      /// use of registers to pass byval arguments.
+      bool useRegsForByval() const { return CallConv != CallingConv::Fast; }
+
+      /// Return the function that analyzes fixed argument list functions.
+      llvm::CCAssignFn *fixedArgFn() const;
+
+      /// Return the function that analyzes variable argument list functions.
+      llvm::CCAssignFn *varArgFn() const;
+
+      const MCPhysReg *shadowRegs() const;
+
+      void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize,
+                        unsigned Align);
+
+      /// Return the type of the register which is used to pass an argument or
+      /// return a value. This function returns f64 if the argument is an i64
+      /// value which has been generated as a result of softening an f128 value.
+      /// Otherwise, it just returns VT.
+      MVT getRegVT(MVT VT, const Type *OrigTy, const SDNode *CallNode,
+                   bool IsSoftFloat) const;
+
+      template<typename Ty>
+      void analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
+                         const SDNode *CallNode, const Type *RetTy) const;
+
+      CCState &CCInfo;
+      CallingConv::ID CallConv;
+      bool IsO32, IsFP64;
+      SpecialCallingConvType SpecialCallingConv;
+      SmallVector<ByValArgInfo, 2> ByValArgs;
+    };
+  protected:
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
+    // Subtarget Info
+    const MipsSubtarget &Subtarget;
+
+  private:
+    // Create a TargetGlobalAddress node.
+    SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetExternalSymbol node.
+    SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetBlockAddress node.
+    SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetJumpTable node.
+    SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetConstantPool node.
+    SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    MipsCC::SpecialCallingConvType getSpecialCallingConv(SDValue Callee) const;
+    // Lower Operand helpers
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            SDLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            const SDNode *CallNode, const Type *RetTy) const;
+
+    // Lower Operand specifics
+    SDValue lowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
+                                 bool IsSRA) const;
+    SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+    /// isEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization.
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const = 0;
+
+    /// copyByValArg - Copy argument registers which were used to pass a byval
+    /// argument to the stack. Create a stack frame object for the byval
+    /// argument.
+    void copyByValRegs(SDValue Chain, SDLoc DL,
+                       std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+                       const ISD::ArgFlagsTy &Flags,
+                       SmallVectorImpl<SDValue> &InVals,
+                       const Argument *FuncArg,
+                       const MipsCC &CC, const ByValArgInfo &ByVal) const;
+
+    /// passByValArg - Pass a byval argument in registers or on stack.
+    void passByValArg(SDValue Chain, SDLoc DL,
+                      std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                      SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+                      MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+                      const MipsCC &CC, const ByValArgInfo &ByVal,
+                      const ISD::ArgFlagsTy &Flags, bool isLittle) const;
+
+    /// writeVarArgRegs - Write variable function arguments passed in registers
+    /// to the stack. Also create a stack frame object for the first variable
+    /// argument.
+    void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
+                         SDValue Chain, SDLoc DL, SelectionDAG &DAG) const;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
+                           SDValue Arg, SDLoc DL, bool IsTailCall,
+                           SelectionDAG &DAG) const;
+
+    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const override;
+
+    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        SDLoc dl, SelectionDAG &DAG) const override;
+
+    // Inline asm support
+    ConstraintType
+      getConstraintType(const std::string &Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const override;
+
+    /// This function parses registers that appear in inline-asm constraints.
+    /// It returns pair (0, 0) on failure.
+    std::pair<unsigned, const TargetRegisterClass *>
+    parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+              getRegForInlineAsmConstraint(const std::string &Constraint,
+                                           MVT VT) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                            unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset,
+                            bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    unsigned getJumpTableEncoding() const override;
+
+    /// Emit a sign-extension using sll/sra, seb, or seh appropriately.
+    MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr *MI,
+                                                MachineBasicBlock *BB,
+                                                unsigned Size, unsigned DstReg,
+                                                unsigned SrcRec) const;
+
+    MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                    unsigned Size, unsigned BinOpcode, bool Nand = false) const;
+    MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI,
+                    MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+                    bool Nand = false) const;
+    MachineBasicBlock *emitAtomicCmpSwap(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
+    MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
+    MachineBasicBlock *emitSEL_D(MachineInstr *MI, MachineBasicBlock *BB) const;
+  };
+
+  /// Create MipsTargetLowering objects.
+  const MipsTargetLowering *
+  createMips16TargetLowering(MipsTargetMachine &TM, const MipsSubtarget &STI);
+  const MipsTargetLowering *
+  createMipsSETargetLowering(MipsTargetMachine &TM, const MipsSubtarget &STI);
+
+  namespace Mips {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif // MipsISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
new file mode 100644
index 0000000..29d8e30
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -0,0 +1,655 @@
+//===-- MipsInstrFPU.td - Mips FPU Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Mips FPU instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+// ------------------------
+// * 64bit fp:
+//    - 32 64-bit registers (default mode)
+//    - 16 even 32-bit registers (32-bit compatible mode) for
+//      single and double access.
+// * 32bit fp:
+//    - 16 even 32-bit registers - single and double (aliased)
+//    - 32 32-bit registers (within single-only mode)
+//===----------------------------------------------------------------------===//
+
+// Floating Point Compare and Branch
+def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisInt<0>,
+                                            SDTCisVT<1, i32>,
+                                            SDTCisVT<2, OtherVT>]>;
+def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDT_MipsCMovFP : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, i32>,
+                                          SDTCisSameAs<1, 3>]>;
+def SDT_MipsTruncIntFP : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+                                                SDTCisVT<1, i32>,
+                                                SDTCisSameAs<1, 2>]>;
+def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                     SDTCisVT<1, f64>,
+                                                     SDTCisVT<2, i32>]>;
+
+def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
+def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
+                          [SDNPHasChain, SDNPOptInGlue]>;
+def MipsTruncIntFP : SDNode<"MipsISD::TruncIntFP", SDT_MipsTruncIntFP>;
+def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
+def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
+                                   SDT_MipsExtractElementF64>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
+  def condcode : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+def IsFP64bit        : Predicate<"Subtarget->isFP64bit()">,
+                       AssemblerPredicate<"FeatureFP64Bit">;
+def NotFP64bit       : Predicate<"!Subtarget->isFP64bit()">,
+                       AssemblerPredicate<"!FeatureFP64Bit">;
+def IsSingleFloat    : Predicate<"Subtarget->isSingleFloat()">,
+                       AssemblerPredicate<"FeatureSingleFloat">;
+def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">,
+                       AssemblerPredicate<"!FeatureSingleFloat">;
+
+//===----------------------------------------------------------------------===//
+// Mips FGR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class FGR_32 { list<Predicate> FGRPredicates = [NotFP64bit]; }
+class FGR_64 { list<Predicate> FGRPredicates = [IsFP64bit]; }
+
+//===----------------------------------------------------------------------===//
+
+// FP immediate patterns.
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimm0neg : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//
+// A set of multiclasses is used to address the register usage.
+//
+// S32 - single precision in 16 32bit even fp registers
+//       single precision in 32 32bit fp registers in SingleOnly mode
+// S64 - single precision in 32 64bit fp registers (In64BitMode)
+// D32 - double precision in 16 32bit even fp registers
+// D64 - double precision in 32 64bit fp registers (In64BitMode)
+//
+// Only S32 and D32 are supported right now.
+//===----------------------------------------------------------------------===//
+
+class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
+              SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fs, $ft"),
+         [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR, opstr> {
+  let isCommutable = IsComm;
+}
+
+multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
+                  SDPatternOperator OpNode = null_frag> {
+  def _D32 : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
+             AdditionalRequires<[NotFP64bit]>;
+  def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin,
+                     IsComm, OpNode>,
+             AdditionalRequires<[IsFP64bit]> {
+    string DecoderNamespace = "Mips64";
+  }
+}
+
+class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"),
+         [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>,
+  NeverHasSideEffects;
+
+multiclass ABSS_M<string opstr, InstrItinClass Itin,
+                  SDPatternOperator OpNode= null_frag> {
+  def _D32 : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
+             AdditionalRequires<[NotFP64bit]>;
+  def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
+             AdditionalRequires<[IsFP64bit]> {
+    string DecoderNamespace = "Mips64";
+  }
+}
+
+multiclass ROUND_M<string opstr, InstrItinClass Itin> {
+  def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
+             AdditionalRequires<[NotFP64bit]>;
+  def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>,
+             AdditionalRequires<[IsFP64bit]> {
+    let DecoderNamespace = "Mips64";
+  }
+}
+
+class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>;
+
+class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>;
+
+class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+                 InstrItinClass Itin> :
+  InstSE<(outs DstRC:$fs), (ins DstRC:$fs_in, SrcRC:$rt),
+         !strconcat(opstr, "\t$rt, $fs"), [], Itin, FrmFR, opstr> {
+  // $fs_in is part of a white lie to work around a widespread bug in the FPU
+  // implementation. See expandBuildPairF64 for details.
+  let Constraints = "$fs = $fs_in";
+}
+
+class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+            SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem";
+  let mayLoad = 1;
+}
+
+class SW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+            SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem";
+  let mayStore = 1;
+}
+
+class MADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+               SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+         [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin,
+         FrmFR, opstr>;
+
+class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+                SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+         [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
+         Itin, FrmFR, opstr>;
+
+class LWXC1_FT<string opstr, RegisterOperand DRC,
+               InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
+         !strconcat(opstr, "\t$fd, ${index}(${base})"),
+         [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin,
+         FrmFI, opstr> {
+  let AddedComplexity = 20;
+}
+
+class SWXC1_FT<string opstr, RegisterOperand DRC,
+               InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
+         !strconcat(opstr, "\t$fs, ${index}(${base})"),
+         [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin,
+         FrmFI, opstr> {
+  let AddedComplexity = 20;
+}
+
+class BC1F_FT<string opstr, DAGOperand opnd, InstrItinClass Itin,
+              SDPatternOperator Op = null_frag>  :
+  InstSE<(outs), (ins FCCRegsOpnd:$fcc, opnd:$offset),
+         !strconcat(opstr, "\t$fcc, $offset"),
+         [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin,
+         FrmFI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
+              SDPatternOperator OpNode = null_frag>  :
+  InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond),
+         !strconcat("c.$cond.", typestr, "\t$fs, $ft"),
+         [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR,
+         !strconcat("c.$cond.", typestr)> {
+  let Defs = [FCC0];
+  let isCodeGenOnly = 1;
+}
+
+class C_COND_FT<string CondStr, string Typestr, RegisterOperand RC,
+                InstrItinClass itin>  :
+   InstSE<(outs), (ins RC:$fs, RC:$ft),
+          !strconcat("c.", CondStr, ".", Typestr, "\t$fs, $ft"), [], itin,
+          FrmFR>;
+
+multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
+                    InstrItinClass itin> {
+  def C_F_#NAME : C_COND_FT<"f", TypeStr, RC, itin>, C_COND_FM<fmt, 0>;
+  def C_UN_#NAME : C_COND_FT<"un", TypeStr, RC, itin>, C_COND_FM<fmt, 1>;
+  def C_EQ_#NAME : C_COND_FT<"eq", TypeStr, RC, itin>, C_COND_FM<fmt, 2>;
+  def C_UEQ_#NAME : C_COND_FT<"ueq", TypeStr, RC, itin>, C_COND_FM<fmt, 3>;
+  def C_OLT_#NAME : C_COND_FT<"olt", TypeStr, RC, itin>, C_COND_FM<fmt, 4>;
+  def C_ULT_#NAME : C_COND_FT<"ult", TypeStr, RC, itin>, C_COND_FM<fmt, 5>;
+  def C_OLE_#NAME : C_COND_FT<"ole", TypeStr, RC, itin>, C_COND_FM<fmt, 6>;
+  def C_ULE_#NAME : C_COND_FT<"ule", TypeStr, RC, itin>, C_COND_FM<fmt, 7>;
+  def C_SF_#NAME : C_COND_FT<"sf", TypeStr, RC, itin>, C_COND_FM<fmt, 8>;
+  def C_NGLE_#NAME : C_COND_FT<"ngle", TypeStr, RC, itin>, C_COND_FM<fmt, 9>;
+  def C_SEQ_#NAME : C_COND_FT<"seq", TypeStr, RC, itin>, C_COND_FM<fmt, 10>;
+  def C_NGL_#NAME : C_COND_FT<"ngl", TypeStr, RC, itin>, C_COND_FM<fmt, 11>;
+  def C_LT_#NAME : C_COND_FT<"lt", TypeStr, RC, itin>, C_COND_FM<fmt, 12>;
+  def C_NGE_#NAME : C_COND_FT<"nge", TypeStr, RC, itin>, C_COND_FM<fmt, 13>;
+  def C_LE_#NAME : C_COND_FT<"le", TypeStr, RC, itin>, C_COND_FM<fmt, 14>;
+  def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM<fmt, 15>;
+}
+
+defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>, ISA_MIPS1_NOT_32R6_64R6;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
+           AdditionalRequires<[NotFP64bit]>;
+let DecoderNamespace = "Mips64" in
+defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
+           AdditionalRequires<[IsFP64bit]>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+//===----------------------------------------------------------------------===//
+def ROUND_W_S  : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+                 ABSS_FM<0xc, 16>, ISA_MIPS2;
+def TRUNC_W_S  : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+                 ABSS_FM<0xd, 16>, ISA_MIPS2;
+def CEIL_W_S   : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+                 ABSS_FM<0xe, 16>, ISA_MIPS2;
+def FLOOR_W_S  : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+                 ABSS_FM<0xf, 16>, ISA_MIPS2;
+def CVT_W_S    : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+                 ABSS_FM<0x24, 16>;
+
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
+defm CEIL_W  : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
+defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
+defm CVT_W   : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
+
+let DecoderNamespace = "Mips64" in {
+  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
+                  ABSS_FM<0x8, 16>, FGR_64;
+  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
+                    ABSS_FM<0x8, 17>, FGR_64;
+  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
+                  ABSS_FM<0x9, 16>, FGR_64;
+  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
+                    ABSS_FM<0x9, 17>, FGR_64;
+  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
+                  ABSS_FM<0xa, 16>, FGR_64;
+  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
+                   ABSS_FM<0xa, 17>, FGR_64;
+  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
+                  ABSS_FM<0xb, 16>, FGR_64;
+  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
+                    ABSS_FM<0xb, 17>, FGR_64;
+}
+
+def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+              ABSS_FM<0x20, 20>;
+def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+              ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
+def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+               ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
+
+def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                ABSS_FM<0x20, 17>, FGR_32;
+def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x21, 20>, FGR_32;
+def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x21, 16>, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+  def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+                  ABSS_FM<0x20, 17>, FGR_64;
+  def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
+                  ABSS_FM<0x20, 21>, FGR_64;
+  def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+                  ABSS_FM<0x21, 20>, FGR_64;
+  def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+                  ABSS_FM<0x21, 16>, FGR_64;
+  def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
+                  ABSS_FM<0x21, 21>, FGR_64;
+}
+
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, II_CVT>;
+  def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, II_CVT>;
+  def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+  def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, II_CVT>;
+  def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+}
+
+def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+             ABSS_FM<0x5, 16>;
+def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+             ABSS_FM<0x7, 16>;
+defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
+
+def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>,
+              ABSS_FM<0x4, 16>, ISA_MIPS2;
+defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+
+// The odd-numbered registers are only referenced when doing loads,
+// stores, and moves between floating-point and integer registers.
+// When defining instructions, we reference all 32-bit registers,
+// regardless of register aliasing.
+
+/// Move Control Registers From/To CPU Registers
+def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>;
+def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>;
+def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
+                          bitconvert>, MFC1_FM<0>;
+def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
+                          bitconvert>, MFC1_FM<4>;
+def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+                MFC1_FM<3>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
+def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+                MFC1_FM<3>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> {
+  let DecoderNamespace = "Mips64";
+}
+def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+                MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
+def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+                MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> {
+  let DecoderNamespace = "Mips64";
+}
+def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
+            bitconvert>, MFC1_FM<1>, ISA_MIPS3;
+def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
+            bitconvert>, MFC1_FM<5>, ISA_MIPS3;
+
+def FMOV_S   : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+               ABSS_FM<0x6, 16>;
+def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+               ABSS_FM<0x6, 17>, AdditionalRequires<[NotFP64bit]>;
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
+               ABSS_FM<0x6, 17>, AdditionalRequires<[IsFP64bit]> {
+                 let DecoderNamespace = "Mips64";
+}
+
+/// Floating Point Memory Instructions
+def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>;
+def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>;
+
+let DecoderNamespace = "Mips64" in {
+  def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>, ISA_MIPS2,
+               FGR_64;
+  def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>, ISA_MIPS2,
+               FGR_64;
+}
+
+def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>,
+           ISA_MIPS2, FGR_32;
+def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>,
+           ISA_MIPS2, FGR_32;
+
+// Cop2 Memory Instructions
+// FIXME: These aren't really FPU instructions and as such don't belong in this
+//        file
+def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>,
+           ISA_MIPS2_NOT_32R6_64R6;
+
+// Cop3 Memory Instructions
+// FIXME: These aren't really FPU instructions and as such don't belong in this
+//        file
+let DecoderNamespace = "COP3_" in {
+  def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
+  def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
+  def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>,
+             ISA_MIPS2;
+  def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>,
+             ISA_MIPS2;
+}
+
+// Indexed loads and stores.
+// Base register + offset register addressing mode (indicated by "x" in the
+// instruction mnemonic) is disallowed under NaCl.
+let AdditionalPredicates = [IsNotNaCl] in {
+  def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6;
+  def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in {
+  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace="Mips64" in {
+  def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+                INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+  def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+                INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+// Load/store doubleword indexed unaligned.
+let AdditionalPredicates = [IsNotNaCl] in {
+  def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+              INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
+  def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+              INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace="Mips64" in {
+  def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+                INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
+  def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+                INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+/// Floating-point Aritmetic
+def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+             ADDS_FM<0x00, 16>;
+defm FADD :  ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>;
+def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+             ADDS_FM<0x03, 16>;
+defm FDIV :  ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>;
+def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+             ADDS_FM<0x02, 16>;
+defm FMUL :  ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>;
+def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+             ADDS_FM<0x01, 16>;
+defm FSUB :  ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
+
+def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+             MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+             MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NoNaNsFPMath] in {
+  def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+                MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+  def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+                MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+}
+
+def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+               MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+               MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+
+let AdditionalPredicates = [NoNaNsFPMath] in {
+  def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+                  MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+  def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+                  MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let isCodeGenOnly=1 in {
+  def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
+                 MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+  def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
+                 MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+}
+
+let AdditionalPredicates = [NoNaNsFPMath],
+    isCodeGenOnly=1 in {
+  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
+                  MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
+                  MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Branch Codes
+//===----------------------------------------------------------------------===//
+// Mips branch codes. These correspond to condcode in MipsInstrInfo.h.
+// They must be kept in synch.
+def MIPS_BRANCH_F  : PatLeaf<(i32 0)>;
+def MIPS_BRANCH_T  : PatLeaf<(i32 1)>;
+
+def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>,
+           BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>,
+           BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Flag Conditions
+//===----------------------------------------------------------------------===//
+// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h.
+// They must be kept in synch.
+def MIPS_FCOND_F    : PatLeaf<(i32 0)>;
+def MIPS_FCOND_UN   : PatLeaf<(i32 1)>;
+def MIPS_FCOND_OEQ  : PatLeaf<(i32 2)>;
+def MIPS_FCOND_UEQ  : PatLeaf<(i32 3)>;
+def MIPS_FCOND_OLT  : PatLeaf<(i32 4)>;
+def MIPS_FCOND_ULT  : PatLeaf<(i32 5)>;
+def MIPS_FCOND_OLE  : PatLeaf<(i32 6)>;
+def MIPS_FCOND_ULE  : PatLeaf<(i32 7)>;
+def MIPS_FCOND_SF   : PatLeaf<(i32 8)>;
+def MIPS_FCOND_NGLE : PatLeaf<(i32 9)>;
+def MIPS_FCOND_SEQ  : PatLeaf<(i32 10)>;
+def MIPS_FCOND_NGL  : PatLeaf<(i32 11)>;
+def MIPS_FCOND_LT   : PatLeaf<(i32 12)>;
+def MIPS_FCOND_NGE  : PatLeaf<(i32 13)>;
+def MIPS_FCOND_LE   : PatLeaf<(i32 14)>;
+def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
+
+/// Floating Point Compare
+def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>,
+               ISA_MIPS1_NOT_32R6_64R6;
+def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+               ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[NotFP64bit]>;
+let DecoderNamespace = "Mips64" in
+def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+               ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[IsFP64bit]>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Pseudo-Instructions
+//===----------------------------------------------------------------------===//
+
+// This pseudo instr gets expanded into 2 mtc1 instrs after register
+// allocation.
+class BuildPairF64Base<RegisterOperand RO> :
+  PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
+           [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+
+def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>,
+                   AdditionalRequires<[NotFP64bit]>;
+def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>,
+                      AdditionalRequires<[IsFP64bit]>;
+
+// This pseudo instr gets expanded into 2 mfc1 instrs after register
+// allocation.
+// if n is 0, lower part of src is extracted.
+// if n is 1, higher part of src is extracted.
+class ExtractElementF64Base<RegisterOperand RO> :
+  PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
+           [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>;
+
+def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>,
+                        AdditionalRequires<[NotFP64bit]>;
+def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>,
+                           AdditionalRequires<[IsFP64bit]>;
+
+//===----------------------------------------------------------------------===//
+// InstAliases.
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>,
+      ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>,
+      ISA_MIPS1_NOT_32R6_64R6;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+
+def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_S_W GPR32Opnd:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_W_S FGR32Opnd:$src)>;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
+def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+              (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
+              (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+              (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+
+def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
+def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64;
+def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
+              (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>, FGR_64;
+def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
+              (PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_L_S FGR32Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(f32 (fround FGR64Opnd:$src)),
+              (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+              (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
+
+// Patterns for loads/stores with a reg+imm operand.
+let AddedComplexity = 40 in {
+  def : LoadRegImmPat<LWC1, f32, load>;
+  def : StoreRegImmPat<SWC1, f32>;
+
+  def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
+  def : StoreRegImmPat<SDC164, f64>, FGR_64;
+
+  def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
+  def : StoreRegImmPat<SDC1, f64>, FGR_32;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 0000000..6a01ae5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,906 @@
+//===-- MipsInstrFormats.td - Mips Instruction Formats -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  rs      - src reg.
+//  rt      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  rd      - dst reg, only used on 3 regs instr.
+//  shamt   - only used on shift instructions, contains the shift amount.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<4> val> {
+  bits<4> Value = val;
+}
+
+def Pseudo    : Format<0>;
+def FrmR      : Format<1>;
+def FrmI      : Format<2>;
+def FrmJ      : Format<3>;
+def FrmFR     : Format<4>;
+def FrmFI     : Format<5>;
+def FrmOther  : Format<6>; // Instruction w/ a custom format
+
+class MMRel;
+
+def Std2MicroMips : InstrMapping {
+  let FilterClass = "MMRel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["Arch"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = ["se"];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["se"], ["micromips"]];
+}
+
+class StdArch {
+  string Arch = "se";
+}
+
+// Generic Mips Format
+class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+               InstrItinClass itin, Format f>: Instruction
+{
+  field bits<32> Inst;
+  Format Form = f;
+
+  let Namespace = "Mips";
+
+  let Size = 4;
+
+  bits<6> Opcode = 0;
+
+  // Top 6 bits are the 'opcode' field
+  let Inst{31-26} = Opcode;
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  //
+  // Attributes specific to Mips instructions...
+  //
+  bits<4> FormBits = Form.Value;
+
+  // TSFlags layout should be kept in sync with MipsInstrInfo.h.
+  let TSFlags{3-0}   = FormBits;
+
+  let DecoderNamespace = "Mips";
+
+  field bits<32> SoftFail = 0;
+}
+
+// Mips32/64 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
+             InstrItinClass itin, Format f, string opstr = ""> :
+  MipsInst<outs, ins, asmstr, pattern, itin, f>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+  string BaseOpcode = opstr;
+  string Arch;
+}
+
+// Mips Pseudo Instructions Format
+class MipsPseudo<dag outs, dag ins, list<dag> pattern,
+                 InstrItinClass itin = IIPseudo> :
+  MipsInst<outs, ins, "", pattern, itin, Pseudo> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// Mips32/64 Pseudo Instruction Format
+class PseudoSE<dag outs, dag ins, list<dag> pattern,
+               InstrItinClass itin = IIPseudo> :
+  MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
+  MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
+  let isPseudo = 1;
+  let Pattern = [];
+}
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
+         list<dag> pattern, InstrItinClass itin>:
+  InstSE<outs, ins, asmstr, pattern, itin, FrmR>
+{
+  bits<5>  rd;
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<5>  shamt;
+  bits<6>  funct;
+
+  let Opcode = op;
+  let funct  = _funct;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+  bits<5>  rt;
+  bits<5>  rs;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op> : StdArch
+{
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+//===----------------------------------------------------------------------===//
+// MFC instruction class in Mips : <|op|mf|rt|rd|0000000|sel|>
+//===----------------------------------------------------------------------===//
+class MFC3OP_FM<bits<6> op, bits<5> mfmt>
+{
+  bits<5> rt;
+  bits<5> rd;
+  bits<3> sel;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = mfmt;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-3}  = 0;
+  let Inst{2-0}   = sel;
+}
+
+class ADD_FM<bits<6> op, bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class ADDI_FM<bits<6> op> : StdArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class SRA_FM<bits<6> funct, bit rotate> : StdArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> shamt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-22} = 0;
+  let Inst{21}    = rotate;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+class SRLV_FM<bits<6> funct, bit rotate> : StdArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-7}  = 0;
+  let Inst{6}     = rotate;
+  let Inst{5-0}   = funct;
+}
+
+class BEQ_FM<bits<6> op> : StdArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = offset;
+}
+
+class SLTI_FM<bits<6> op> : StdArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class MFLO_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class MTLO_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEB_FM<bits<5> funct, bits<6> funct2> : StdArch {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = funct;
+  let Inst{5-0}   = funct2;
+}
+
+class CLO_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+  let rt = rd;
+}
+
+class LUI_FM : StdArch {
+  bits<5> rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0xf;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class JALR_FM {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = 9;
+}
+
+class BGEZAL_FM<bits<5> funct> : StdArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = offset;
+}
+
+class SYNC_FM : StdArch {
+  bits<5> stype;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{10-6}  = stype;
+  let Inst{5-0}   = 0xf;
+}
+
+class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
+  bits<5>  rs;
+  bits<5>  rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class EXT_FM<bits<6> funct> : StdArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class RDHWR_FM {
+  bits<5> rt;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = 0x3b;
+}
+
+class TEQ_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<10> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = code_;
+  let Inst{5-0}   = funct;
+}
+
+class TEQI_FM<bits<5> funct> : StdArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16}   = funct;
+  let Inst{15-0}  = imm16;
+}
+
+class WAIT_FM : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-6}  = 0;
+  let Inst{5-0}   = 0x20;
+}
+
+class EXTS_FM<bits<6> funct> : StdArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> lenm1;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = lenm1;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class MTMR_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class POP_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEQ_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEQI_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<10> imm10;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = imm10;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  System calls format <op|code_|funct>
+//===----------------------------------------------------------------------===//
+
+class SYS_FM<bits<6> funct> : StdArch
+{
+  bits<20> code_;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-6} = code_;
+  let Inst{5-0}  = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Break instruction format <op|code_1|funct>
+//===----------------------------------------------------------------------===//
+
+class BRK_FM<bits<6> funct> : StdArch
+{
+  bits<10> code_1;
+  bits<10> code_2;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_1;
+  let Inst{15-6}  = code_2;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Exception return format <Cop0|1|0|funct>
+//===----------------------------------------------------------------------===//
+
+class ER_FM<bits<6> funct> : StdArch
+{
+  bits<32> Inst;
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
+//===----------------------------------------------------------------------===//
+
+class EI_FM<bits<1> sc> : StdArch
+{
+  bits<32> Inst;
+  bits<5> rt;
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = 0xb;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0xc;
+  let Inst{10-6}  = 0;
+  let Inst{5}     = sc;
+  let Inst{4-0}   = 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  FLOATING POINT INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  fs      - src reg.
+//  ft      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  fd      - dst reg, only used on 3 regs instr.
+//  fmt     - double or single precision.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
+  InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
+{
+  bits<5>  ft;
+  bits<5>  base;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = base;
+  let Inst{20-16} = ft;
+  let Inst{15-0}  = imm16;
+}
+
+class ADDS_FM<bits<6> funct, bits<5> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class ABSS_FM<bits<6> funct, bits<5> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class MFC1_FM<bits<5> funct> : StdArch {
+  bits<5> rt;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = fs;
+  let Inst{10-0}  = 0;
+}
+
+class LW_FM<bits<6> op> : StdArch {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = addr{20-16};
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = addr{15-0};
+}
+
+class MADDS_FM<bits<3> funct, bits<3> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fr;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = fr;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-3}   = funct;
+  let Inst{2-0}   = fmt;
+}
+
+class LWXC1_FM<bits<6> funct> : StdArch {
+  bits<5> fd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = 0;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class SWXC1_FM<bits<6> funct> : StdArch {
+  bits<5> fs;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class BC1F_FM<bit nd, bit tf> : StdArch {
+  bits<3>  fcc;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = 0x8;
+  let Inst{20-18} = fcc;
+  let Inst{17} = nd;
+  let Inst{16} = tf;
+  let Inst{15-0} = offset;
+}
+
+class CEQS_FM<bits<5> fmt> : StdArch {
+  bits<5> fs;
+  bits<5> ft;
+  bits<4> cond;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-8} = 0; // cc
+  let Inst{7-4} = 0x3;
+  let Inst{3-0} = cond;
+}
+
+class C_COND_FM<bits<5> fmt, bits<4> c> : CEQS_FM<fmt> {
+  let cond = c;
+}
+
+class CMov_I_F_FM<bits<6> funct, bits<5> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = funct;
+}
+
+class CMov_F_I_FM<bit tf> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-18} = fcc;
+  let Inst{17} = 0;
+  let Inst{16} = tf;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = 1;
+}
+
+class CMov_F_F_FM<bits<5> fmt, bit tf> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-18} = fcc;
+  let Inst{17} = 0;
+  let Inst{16} = tf;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = 0x11;
+}
+
+class BARRIER_FM<bits<5> op> : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0; // SPECIAL
+  let Inst{25-21} = 0;
+  let Inst{20-16} = 0; // rt = 0
+  let Inst{15-11} = 0; // rd = 0
+  let Inst{10-6} = op; // Operation
+  let Inst{5-0} = 0;   // SLL
+}
+
+class SDBBP_FM : StdArch {
+  bits<20> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011100; // SPECIAL2
+  let Inst{25-6} = code_;
+  let Inst{5-0} = 0b111111;   // SDBBP
+}
+
+class JR_HB_FM<bits<6> op> : StdArch{
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0; // SPECIAL
+  let Inst{25-21} = rs;
+  let Inst{20-11} = 0;
+  let Inst{10} = 1;
+  let Inst{9-6} = 0;
+  let Inst{5-0} = op;
+}
+
+class JALR_HB_FM<bits<6> op> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0; // SPECIAL
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10} = 1;
+  let Inst{9-6} = 0;
+  let Inst{5-0} = op;
+}
+
+class COP0_TLB_FM<bits<6> op> : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10; // COP0
+  let Inst{25} = 1;       // CO
+  let Inst{24-6} = 0;
+  let Inst{5-0} = op;     // Operation
+}
+
+class CACHEOP_FM<bits<6> op> : StdArch {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = base;
+  let Inst{20-16} = hint;
+  let Inst{15-0}  = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 0000000..dcc0e24
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,289 @@
+//===-- MipsInstrInfo.cpp - Mips Instruction Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "MipsGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void MipsInstrInfo::anchor() {}
+
+MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr)
+    : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+      Subtarget(STI), UncondBrOpc(UncondBr) {}
+
+const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) {
+  if (STI.inMips16Mode())
+    return llvm::createMips16InstrInfo(STI);
+
+  return llvm::createMipsSEInstrInfo(STI);
+}
+
+bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// insertNoop - If data hazard condition is found insert the target nop
+/// instruction.
+void MipsInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
+{
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(Mips::NOP));
+}
+
+MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
+                                                unsigned Flag) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flag,
+                                 MFI.getObjectSize(FI), Align);
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+                                  MachineBasicBlock *&BB,
+                                  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(getAnalyzableBrOpc(Opc) && "Not an analyzable branch");
+  int NumOp = Inst->getNumExplicitOperands();
+
+  // for both int and fp branches, the last explicit operand is the
+  // MBB.
+  BB = Inst->getOperand(NumOp-1).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Opc));
+
+  for (int i=0; i<NumOp-1; i++)
+    Cond.push_back(Inst->getOperand(i));
+}
+
+bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const {
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  BranchType BT = AnalyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
+
+  return (BT == BT_None) || (BT == BT_Indirect);
+}
+
+void
+MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           DebugLoc DL,
+                           const SmallVectorImpl<MachineOperand> &Cond) const {
+  unsigned Opc = Cond[0].getImm();
+  const MCInstrDesc &MCID = get(Opc);
+  MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
+
+  for (unsigned i = 1; i < Cond.size(); ++i) {
+    if (Cond[i].isReg())
+      MIB.addReg(Cond[i].getReg());
+    else if (Cond[i].isImm())
+      MIB.addImm(Cond[i].getImm());
+    else
+       assert(true && "Cannot copy operand");
+  }
+  MIB.addMBB(TBB);
+}
+
+unsigned MipsInstrInfo::InsertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  // # of condition operands:
+  //  Unconditional branches: 0
+  //  Floating point branches: 1 (opc)
+  //  Int BranchZero: 2 (opc, reg)
+  //  Int Branch: 3 (opc, reg0, reg1)
+  assert((Cond.size() <= 3) &&
+         "# of Mips branch conditions must be <= 3!");
+
+  // Two-way Conditional branch.
+  if (FBB) {
+    BuildCondBr(MBB, TBB, DL, Cond);
+    BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(FBB);
+    return 2;
+  }
+
+  // One way branch.
+  // Unconditional branch.
+  if (Cond.empty())
+    BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(TBB);
+  else // Conditional branch.
+    BuildCondBr(MBB, TBB, DL, Cond);
+  return 1;
+}
+
+unsigned MipsInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+  MachineBasicBlock::reverse_iterator FirstBr;
+  unsigned removed;
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  FirstBr = I;
+
+  // Up to 2 branches are removed.
+  // Note that indirect branches are not removed.
+  for (removed = 0; I != REnd && removed < 2; ++I, ++removed)
+    if (!getAnalyzableBrOpc(I->getOpcode()))
+      break;
+
+  MBB.erase(I.base(), FirstBr.base());
+
+  return removed;
+}
+
+/// ReverseBranchCondition - Return the inverse opcode of the
+/// specified Branch instruction.
+bool MipsInstrInfo::ReverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert( (Cond.size() && Cond.size() <= 3) &&
+          "Invalid Mips branch condition!");
+  Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm()));
+  return false;
+}
+
+MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond, bool AllowModify,
+    SmallVectorImpl<MachineInstr *> &BranchInstrs) const {
+
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  if (I == REnd || !isUnpredicatedTerminator(&*I)) {
+    // This block ends with no branches (it just falls through to its succ).
+    // Leave TBB/FBB null.
+    TBB = FBB = nullptr;
+    return BT_NoBranch;
+  }
+
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+  BranchInstrs.push_back(LastInst);
+
+  // Not an analyzable branch (e.g., indirect jump).
+  if (!getAnalyzableBrOpc(LastOpc))
+    return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
+
+  // Get the second to last instruction in the block.
+  unsigned SecondLastOpc = 0;
+  MachineInstr *SecondLastInst = nullptr;
+
+  if (++I != REnd) {
+    SecondLastInst = &*I;
+    SecondLastOpc = getAnalyzableBrOpc(SecondLastInst->getOpcode());
+
+    // Not an analyzable branch (must be an indirect jump).
+    if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
+      return BT_None;
+  }
+
+  // If there is only one terminator instruction, process it.
+  if (!SecondLastOpc) {
+    // Unconditional branch.
+    if (LastOpc == UncondBrOpc) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return BT_Uncond;
+    }
+
+    // Conditional branch
+    AnalyzeCondBr(LastInst, LastOpc, TBB, Cond);
+    return BT_Cond;
+  }
+
+  // If we reached here, there are two branches.
+  // If there are three terminators, we don't know what sort of block this is.
+  if (++I != REnd && isUnpredicatedTerminator(&*I))
+    return BT_None;
+
+  BranchInstrs.insert(BranchInstrs.begin(), SecondLastInst);
+
+  // If second to last instruction is an unconditional branch,
+  // analyze it and remove the last instruction.
+  if (SecondLastOpc == UncondBrOpc) {
+    // Return if the last instruction cannot be removed.
+    if (!AllowModify)
+      return BT_None;
+
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    LastInst->eraseFromParent();
+    BranchInstrs.pop_back();
+    return BT_Uncond;
+  }
+
+  // Conditional branch followed by an unconditional branch.
+  // The last one must be unconditional.
+  if (LastOpc != UncondBrOpc)
+    return BT_None;
+
+  AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond);
+  FBB = LastInst->getOperand(0).getMBB();
+
+  return BT_CondUncond;
+}
+
+/// Return the number of bytes of code the specified instruction may be.
+unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MI->getDesc().getSize();
+  case  TargetOpcode::INLINEASM: {       // Inline Asm: Variable size.
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  case Mips::CONSTPOOL_ENTRY:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI->getOperand(2).getImm();
+  }
+}
+
+MachineInstrBuilder
+MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
+                                  MachineBasicBlock::iterator I) const {
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));
+
+  for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J)
+    MIB.addOperand(I->getOperand(J));
+
+  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
+  return MIB;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 0000000..bdf2fd3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,148 @@
+//===-- MipsInstrInfo.h - Mips Instruction Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+// FIXME: We need to override TargetInstrInfo::getInlineAsmLength method in
+// order for MipsLongBranch pass to work correctly when the code has inline
+// assembly.  The returned value doesn't have to be the asm instruction's exact
+// size in bytes; MipsLongBranch only expects it to be the correct upper bound.
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTRUCTIONINFO_H
+#define MIPSINSTRUCTIONINFO_H
+
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MipsGenInstrInfo.inc"
+
+namespace llvm {
+
+class MipsInstrInfo : public MipsGenInstrInfo {
+  virtual void anchor();
+protected:
+  const MipsSubtarget &Subtarget;
+  unsigned UncondBrOpc;
+
+public:
+  enum BranchType {
+    BT_None,       // Couldn't analyze branch.
+    BT_NoBranch,   // No branches found.
+    BT_Uncond,     // One unconditional branch.
+    BT_Cond,       // One conditional branch.
+    BT_CondUncond, // A conditional branch followed by an unconditional branch.
+    BT_Indirect    // One indirct branch.
+  };
+
+  explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc);
+
+  static const MipsInstrInfo *create(MipsSubtarget &STI);
+
+  /// Branch Analysis
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  BranchType AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                           MachineBasicBlock *&FBB,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           bool AllowModify,
+                           SmallVectorImpl<MachineInstr*> &BranchInstrs) const;
+
+  /// Insert nop instruction when hazard condition is found
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
+
+  virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
+
+  /// Return the number of bytes of code the specified instruction may be.
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override {
+    storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0);
+  }
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override {
+    loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0);
+  }
+
+  virtual void storeRegToStack(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               unsigned SrcReg, bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               int64_t Offset) const = 0;
+
+  virtual void loadRegFromStack(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                unsigned DestReg, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI,
+                                int64_t Offset) const = 0;
+
+  /// Create an instruction which has the same operands and memory operands
+  /// as MI but has a new opcode.
+  MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
+                                         MachineBasicBlock::iterator I) const;
+
+protected:
+  bool isZeroImm(const MachineOperand &op) const;
+
+  MachineMemOperand *GetMemOperand(MachineBasicBlock &MBB, int FI,
+                                   unsigned Flag) const;
+
+private:
+  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const = 0;
+
+  void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+                     MachineBasicBlock *&BB,
+                     SmallVectorImpl<MachineOperand> &Cond) const;
+
+  void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL,
+                   const SmallVectorImpl<MachineOperand>& Cond) const;
+};
+
+/// Create MipsInstrInfo objects.
+const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI);
+const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 0000000..8e9472c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,1763 @@
+//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsJmpLink      : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+                                                SDTCisSameAs<1, 2>,
+                                                SDTCisSameAs<3, 4>,
+                                                SDTCisInt<4>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
+def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                      SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
+                                    SDTCisSameAs<1, 2>]>;
+def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
+                                     [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+                                      SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsDivRem16 : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+
+def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_Sync             : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_Ext : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>]>;
+def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
+                                   SDTCisSameAs<0, 4>]>;
+
+def SDTMipsLoadLR  : SDTypeProfile<1, 2,
+                                   [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                    SDTCisSameAs<0, 2>]>;
+
+// Call
+def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
+                         [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                          SDNPVariadic]>;
+
+// Tail call
+def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// Hi and Lo nodes are used to handle global addresses. Used on
+// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
+// static model. (nothing to do with Mips Registers Hi and Lo)
+def MipsHi    : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo    : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+
+// TlsGd node is used to handle General Dynamic TLS
+def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
+
+// TprelHi and TprelLo nodes are used to handle Local Exec TLS
+def MipsTprelHi    : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
+def MipsTprelLo    : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+
+// Thread pointer
+def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
+
+// Return
+def MipsRet : SDNode<"MipsISD::Ret", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
+                           [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
+                           [SDNPHasChain, SDNPSideEffect,
+                            SDNPOptInGlue, SDNPOutGlue]>;
+
+// Nodes used to extract LO/HI registers.
+def MipsMFHI : SDNode<"MipsISD::MFHI", SDT_MFLOHI>;
+def MipsMFLO : SDNode<"MipsISD::MFLO", SDT_MFLOHI>;
+
+// Node used to insert 32-bit integers to LOHI register pair.
+def MipsMTLOHI : SDNode<"MipsISD::MTLOHI", SDT_MTLOHI>;
+
+// Mult nodes.
+def MipsMult  : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
+def MipsMultu : SDNode<"MipsISD::Multu", SDT_MipsMultDiv>;
+
+// MAdd*/MSub* nodes
+def MipsMAdd  : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub>;
+def MipsMAddu : SDNode<"MipsISD::MAddu", SDT_MipsMAddMSub>;
+def MipsMSub  : SDNode<"MipsISD::MSub", SDT_MipsMAddMSub>;
+def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub>;
+
+// DivRem(u) nodes
+def MipsDivRem    : SDNode<"MipsISD::DivRem", SDT_MipsMultDiv>;
+def MipsDivRemU   : SDNode<"MipsISD::DivRemU", SDT_MipsMultDiv>;
+def MipsDivRem16  : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16,
+                           [SDNPOutGlue]>;
+def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
+                           [SDNPOutGlue]>;
+
+// Target constant nodes that are not part of any isel patterns and remain
+// unchanged can cause instructions with illegal operands to be emitted.
+// Wrapper node patterns give the instruction selector a chance to replace
+// target constant nodes that would otherwise remain unchanged with ADDiu
+// nodes. Without these wrapper node patterns, the following conditional move
+// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// compiled:
+//  movn  %got(d)($gp), %got(c)($gp), $4
+// This instruction is illegal since movn can take only register operands.
+
+def MipsWrapper    : SDNode<"MipsISD::Wrapper", SDTIntBinOp>;
+
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
+
+def MipsExt :  SDNode<"MipsISD::Ext", SDT_Ext>;
+def MipsIns :  SDNode<"MipsISD::Ins", SDT_Ins>;
+
+def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSWL : SDNode<"MipsISD::SWL", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSWR : SDNode<"MipsISD::SWR", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSDL : SDNode<"MipsISD::SDL", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasMips2     :    Predicate<"Subtarget->hasMips2()">,
+                      AssemblerPredicate<"FeatureMips2">;
+def HasMips3_32  :    Predicate<"Subtarget->hasMips3_32()">,
+                      AssemblerPredicate<"FeatureMips3_32">;
+def HasMips3_32r2 :   Predicate<"Subtarget->hasMips3_32r2()">,
+                      AssemblerPredicate<"FeatureMips3_32r2">;
+def HasMips3     :    Predicate<"Subtarget->hasMips3()">,
+                      AssemblerPredicate<"FeatureMips3">;
+def HasMips4_32  :    Predicate<"Subtarget->hasMips4_32()">,
+                      AssemblerPredicate<"FeatureMips4_32">;
+def HasMips4_32r2 :   Predicate<"Subtarget->hasMips4_32r2()">,
+                      AssemblerPredicate<"FeatureMips4_32r2">;
+def HasMips5_32r2 :   Predicate<"Subtarget->hasMips5_32r2()">,
+                      AssemblerPredicate<"FeatureMips5_32r2">;
+def HasMips32    :    Predicate<"Subtarget->hasMips32()">,
+                      AssemblerPredicate<"FeatureMips32">;
+def HasMips32r2  :    Predicate<"Subtarget->hasMips32r2()">,
+                      AssemblerPredicate<"FeatureMips32r2">;
+def HasMips32r6  :    Predicate<"Subtarget->hasMips32r6()">,
+                      AssemblerPredicate<"FeatureMips32r6">;
+def NotMips32r6  :    Predicate<"!Subtarget->hasMips32r6()">,
+                      AssemblerPredicate<"!FeatureMips32r6">;
+def IsGP64bit    :    Predicate<"Subtarget->isGP64bit()">,
+                      AssemblerPredicate<"FeatureGP64Bit">;
+def IsGP32bit    :    Predicate<"!Subtarget->isGP64bit()">,
+                      AssemblerPredicate<"!FeatureGP64Bit">;
+def HasMips64    :    Predicate<"Subtarget->hasMips64()">,
+                      AssemblerPredicate<"FeatureMips64">;
+def HasMips64r2  :    Predicate<"Subtarget->hasMips64r2()">,
+                      AssemblerPredicate<"FeatureMips64r2">;
+def HasMips64r6  :    Predicate<"Subtarget->hasMips64r6()">,
+                      AssemblerPredicate<"FeatureMips64r6">;
+def NotMips64r6  :    Predicate<"!Subtarget->hasMips64r6()">,
+                      AssemblerPredicate<"!FeatureMips64r6">;
+def IsN64       :     Predicate<"Subtarget->isABI_N64()">,
+                      AssemblerPredicate<"FeatureN64">;
+def InMips16Mode :    Predicate<"Subtarget->inMips16Mode()">,
+                      AssemblerPredicate<"FeatureMips16">;
+def HasCnMips    :    Predicate<"Subtarget->hasCnMips()">,
+                      AssemblerPredicate<"FeatureCnMips">;
+def RelocStatic :     Predicate<"TM.getRelocationModel() == Reloc::Static">,
+                      AssemblerPredicate<"FeatureMips32">;
+def RelocPIC    :     Predicate<"TM.getRelocationModel() == Reloc::PIC_">,
+                      AssemblerPredicate<"FeatureMips32">;
+def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">;
+def HasStdEnc :       Predicate<"Subtarget->hasStandardEncoding()">,
+                      AssemblerPredicate<"!FeatureMips16">;
+def NotDSP :          Predicate<"!Subtarget->hasDSP()">;
+def InMicroMips    :  Predicate<"Subtarget->inMicroMipsMode()">,
+                      AssemblerPredicate<"FeatureMicroMips">;
+def NotInMicroMips :  Predicate<"!Subtarget->inMicroMipsMode()">,
+                      AssemblerPredicate<"!FeatureMicroMips">;
+def IsLE           :  Predicate<"Subtarget->isLittle()">;
+def IsBE           :  Predicate<"!Subtarget->isLittle()">;
+def IsNotNaCl    :    Predicate<"!Subtarget->isTargetNaCl()">;
+
+//===----------------------------------------------------------------------===//
+// Mips GPR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class GPR_32 { list<Predicate> GPRPredicates = [IsGP32bit]; }
+class GPR_64 { list<Predicate> GPRPredicates = [IsGP64bit]; }
+
+//===----------------------------------------------------------------------===//
+// Mips ISA/ASE membership and instruction group membership adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+// FIXME: I'd prefer to use additive predicates to build the instruction sets
+//        but we are short on assembler feature bits at the moment. Using a
+//        subtractive predicate will hopefully keep us under the 32 predicate
+//        limit long enough to develop an alternative way to handle P1||P2
+//        predicates.
+class ISA_MIPS1_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS2    { list<Predicate> InsnPredicates = [HasMips2]; }
+class ISA_MIPS2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS3    { list<Predicate> InsnPredicates = [HasMips3]; }
+class ISA_MIPS3_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32   { list<Predicate> InsnPredicates = [HasMips32]; }
+class ISA_MIPS32_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
+class ISA_MIPS32R2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS64   { list<Predicate> InsnPredicates = [HasMips64]; }
+class ISA_MIPS64_NOT_64R6 {
+  list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
+}
+class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
+class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
+class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+
+// The portions of MIPS-III that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS3_32_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
+
+// The portions of MIPS-IV that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32R2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-V that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS5_32R2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
+}
+
+//===----------------------------------------------------------------------===//
+
+class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+}
+
+class MipsInstAlias<string Asm, dag Result, bit Emit = 0b1> :
+  InstAlias<Asm, Result, Emit>, PredicateControl;
+
+class IsCommutable {
+  bit isCommutable = 1;
+}
+
+class IsBranch {
+  bit isBranch = 1;
+}
+
+class IsReturn {
+  bit isReturn = 1;
+}
+
+class IsCall {
+  bit isCall = 1;
+}
+
+class IsTailCall {
+  bit isCall = 1;
+  bit isTerminator = 1;
+  bit isReturn = 1;
+  bit isBarrier = 1;
+  bit hasExtraSrcRegAllocReq = 1;
+  bit isCodeGenOnly = 1;
+}
+
+class IsAsCheapAsAMove {
+  bit isAsCheapAsAMove = 1;
+}
+
+class NeverHasSideEffects {
+  bit neverHasSideEffects = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+def MipsJumpTargetAsmOperand : AsmOperandClass {
+  let Name = "JumpTarget";
+  let ParserMethod = "ParseJumpTarget";
+  let PredicateMethod = "isImm";
+  let RenderMethod = "addImmOperands";
+}
+
+// Instruction operand types
+def jmptarget   : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def brtarget    : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def calltarget  : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm9 : Operand<i32>;
+def simm10 : Operand<i32>;
+def simm11 : Operand<i32>;
+
+def simm16      : Operand<i32> {
+  let DecoderMethod= "DecodeSimm16";
+}
+
+def simm19_lsl2 : Operand<i32> {
+  let EncoderMethod = "getSimm19Lsl2Encoding";
+  let DecoderMethod = "DecodeSimm19Lsl2";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm18_lsl3 : Operand<i32> {
+  let EncoderMethod = "getSimm18Lsl3Encoding";
+  let DecoderMethod = "DecodeSimm18Lsl3";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm20      : Operand<i32> {
+}
+
+def uimm20      : Operand<i32> {
+}
+
+def uimm10      : Operand<i32> {
+}
+
+def simm16_64   : Operand<i64> {
+  let DecoderMethod = "DecodeSimm16";
+}
+
+// Zero
+def uimmz       : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// Unsigned Operand
+def uimm2 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm3 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm5       : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm6 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm16      : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def pcrel16      : Operand<i32> {
+}
+
+def MipsMemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+  let ParserMethod = "parseMemOperand";
+}
+
+def MipsMemSimm11AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm11";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<11>";
+  //let DiagnosticType = "Simm11";
+}
+
+def MipsInvertedImmoperand : AsmOperandClass {
+  let Name = "InvNum";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseInvNum";
+}
+
+def InvertedImOperand : Operand<i32> {
+  let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+def InvertedImOperand64 : Operand<i64> {
+  let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+class mem_generic : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// Address operand
+def mem : mem_generic;
+
+// MSA specific address operand
+def mem_msa : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm10);
+  let EncoderMethod = "getMSAMemEncoding";
+}
+
+def mem_simm9 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm9);
+  let EncoderMethod = "getMemEncoding";
+}
+
+def mem_simm11 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm11);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm11AsmOperand;
+}
+
+def mem_ea : Operand<iPTR> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def PtrRC : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc);
+  let DecoderMethod = "DecodePtrRegisterClass";
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+// size operand of ext instruction
+def size_ext : Operand<i32> {
+  let EncoderMethod = "getSizeExtEncoding";
+  let DecoderMethod = "DecodeExtSize";
+}
+
+// size operand of ins instruction
+def size_ins : Operand<i32> {
+  let EncoderMethod = "getSizeInsEncoding";
+  let DecoderMethod = "DecodeInsSize";
+}
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+  return getImm(N, N->getZExtValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+  return getImm(N, (N->getZExtValue() >> 16) & 0xFFFF);
+}]>;
+
+// Plus 1.
+def Plus1 : SDNodeXForm<imm, [{ return getImm(N, N->getSExtValue() + 1); }]>;
+
+// Node immediate is zero (e.g. insve.d)
+def immz : PatLeaf<(imm), [{ return N->getSExtValue() == 0; }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt8  : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16  : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 15-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt15  : PatLeaf<(imm), [{ return isInt<15>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are caught.
+// e.g. addiu, sltiu
+def immZExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (uint32_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+  else
+    return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// Immediate can be loaded with LUi (32-bit int with lower 16-bit cleared).
+def immLow16Zero : PatLeaf<(imm), [{
+  int64_t Val = N->getSExtValue();
+  return isInt<32>(Val) && !(Val & 0xffff);
+}]>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
+
+// True if (N + 1) fits in 16-bit field.
+def immSExt16Plus1 : PatLeaf<(imm), [{
+  return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
+}]>;
+
+// Mips Address Mode! SDNode frameindex could possibily be a match
+// since load and store instructions from stack used it.
+def addr :
+  ComplexPattern<iPTR, 2, "selectIntAddr", [frameindex]>;
+
+def addrRegImm :
+  ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
+
+def addrRegReg :
+  ComplexPattern<iPTR, 2, "selectAddrRegReg", [frameindex]>;
+
+def addrDefault :
+  ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
+
+def addrimm10 : ComplexPattern<iPTR, 2, "selectIntAddrMSA", [frameindex]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic and logical instructions with 3 register operands.
+class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0,
+                  InstrItinClass Itin = NoItinerary,
+                  SDPatternOperator OpNode = null_frag>:
+  InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR, opstr> {
+  let isCommutable = isComm;
+  let isReMaterializable = 1;
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
+                  InstrItinClass Itin = NoItinerary,
+                  SDPatternOperator imm_type = null_frag,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RO:$rt), (ins RO:$rs, Od:$imm16),
+         !strconcat(opstr, "\t$rt, $rs, $imm16"),
+         [(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))],
+         Itin, FrmI, opstr> {
+  let isReMaterializable = 1;
+  let TwoOperandAliasConstraint = "$rs = $rt";
+}
+
+// Arithmetic Multiply ADD/SUB
+class MArithR<string opstr, InstrItinClass itin, bit isComm = 0> :
+  InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+         !strconcat(opstr, "\t$rs, $rt"), [], itin, FrmR, opstr> {
+  let Defs = [HI0, LO0];
+  let Uses = [HI0, LO0];
+  let isCommutable = isComm;
+}
+
+//  Logical
+class LogicNOR<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set RO:$rd, (not (or RO:$rs, RO:$rt)))], II_NOR, FrmR, opstr> {
+  let isCommutable = 1;
+}
+
+// Shifts
+class shift_rotate_imm<string opstr, Operand ImmOpnd,
+                       RegisterOperand RO, InstrItinClass itin,
+                       SDPatternOperator OpNode = null_frag,
+                       SDPatternOperator PF = null_frag> :
+  InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
+         !strconcat(opstr, "\t$rd, $rt, $shamt"),
+         [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rd";
+}
+
+class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
+                       SDPatternOperator OpNode = null_frag>:
+  InstSE<(outs RO:$rd), (ins RO:$rt, GPR32Opnd:$rs),
+         !strconcat(opstr, "\t$rd, $rt, $rs"),
+         [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR,
+         opstr>;
+
+// Load Upper Imediate
+class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
+  InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
+         [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove {
+  let neverHasSideEffects = 1;
+  let isReMaterializable = 1;
+}
+
+// Memory Load/Store
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+           InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMem";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+}
+
+// Load/Store Left/Right
+let canFoldAsLoad = 1 in
+class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+                    InstrItinClass Itin> :
+  InstSE<(outs RO:$rt), (ins mem:$addr, RO:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+                     InstrItinClass Itin> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
+  let DecoderMethod = "DecodeMem";
+}
+
+// Conditional Branch
+class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
+              RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $rt, $offset"),
+         [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch,
+         FrmI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
+                  RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $offset"),
+         [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch,
+         FrmI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+// SetCC
+class SetCC_R<string opstr, PatFrag cond_op, RegisterOperand RO> :
+  InstSE<(outs GPR32Opnd:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set GPR32Opnd:$rd, (cond_op RO:$rs, RO:$rt))],
+         II_SLT_SLTU, FrmR, opstr>;
+
+class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
+              RegisterOperand RO>:
+  InstSE<(outs GPR32Opnd:$rt), (ins RO:$rs, Od:$imm16),
+         !strconcat(opstr, "\t$rt, $rs, $imm16"),
+         [(set GPR32Opnd:$rt, (cond_op RO:$rs, imm_type:$imm16))],
+         II_SLTI_SLTIU, FrmI, opstr>;
+
+// Jump
+class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
+             SDPatternOperator targetoperator, string bopstr> :
+  InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+         [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> {
+  let isTerminator=1;
+  let isBarrier=1;
+  let hasDelaySlot = 1;
+  let DecoderMethod = "DecodeJumpTarget";
+  let Defs = [AT];
+}
+
+// Unconditional branch
+class UncondBranch<Instruction BEQInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>,
+  PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let AdditionalPredicates = [RelocPIC];
+  let Defs = [AT];
+}
+
+// Base class for indirect branch and return instruction classes.
+let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
+class JumpFR<string opstr, RegisterOperand RO,
+             SDPatternOperator operator = null_frag>:
+  InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch,
+         FrmR, opstr>;
+
+// Indirect branch
+class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1, Defs = [RA] in {
+  class JumpLink<string opstr, DAGOperand opnd> :
+    InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+           [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> {
+    let DecoderMethod = "DecodeJumpTarget";
+  }
+
+  class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
+                          Register RetReg, RegisterOperand ResRO = RO>:
+    PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], IIBranch>,
+    PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
+
+  class JumpLinkReg<string opstr, RegisterOperand RO>:
+    InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+           [], IIBranch, FrmR>;
+
+  class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
+    InstSE<(outs), (ins RO:$rs, opnd:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
+
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
+    hasExtraSrcRegAllocReq = 1, Defs = [AT] in {
+  class TailCall<Instruction JumpInst> :
+    PseudoSE<(outs), (ins calltarget:$target), [], IIBranch>,
+    PseudoInstExpansion<(JumpInst jmptarget:$target)>;
+
+  class TailCallReg<RegisterOperand RO, Instruction JRInst,
+                    RegisterOperand ResRO = RO> :
+    PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], IIBranch>,
+    PseudoInstExpansion<(JRInst ResRO:$rs)>;
+}
+
+class BAL_BR_Pseudo<Instruction RealInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>,
+  PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA];
+}
+
+// Syscall
+class SYS_FT<string opstr> :
+  InstSE<(outs), (ins uimm20:$code_),
+         !strconcat(opstr, "\t$code_"), [], NoItinerary, FrmI, opstr>;
+// Break
+class BRK_FT<string opstr> :
+  InstSE<(outs), (ins uimm10:$code_1, uimm10:$code_2),
+         !strconcat(opstr, "\t$code_1, $code_2"), [], NoItinerary,
+         FrmOther, opstr>;
+
+// (D)Eret
+class ER_FT<string opstr> :
+  InstSE<(outs), (ins),
+         opstr, [], NoItinerary, FrmOther, opstr>;
+
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins),
+         !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther, opstr>;
+
+// Wait
+class WAIT_FT<string opstr> :
+  InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther, opstr>;
+
+// Sync
+let hasSideEffects = 1 in
+class SYNC_FT<string opstr> :
+  InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
+         NoItinerary, FrmOther, opstr>;
+
+let hasSideEffects = 1 in
+class TEQ_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
+         !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary,
+         FrmI, opstr>;
+
+class TEQI_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, uimm16:$imm16),
+         !strconcat(opstr, "\t$rs, $imm16"), [], NoItinerary, FrmOther, opstr>;
+// Mul, Div
+class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
+           list<Register> DefRegs> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$rs, $rt"), [],
+         itin, FrmR, opstr> {
+  let isCommutable = 1;
+  let Defs = DefRegs;
+  let neverHasSideEffects = 1;
+}
+
+// Pseudo multiply/divide instruction with explicit accumulator register
+// operands.
+class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
+                    SDPatternOperator OpNode, InstrItinClass Itin,
+                    bit IsComm = 1, bit HasSideEffects = 0,
+                    bit UsesCustomInserter = 0> :
+  PseudoSE<(outs R0:$ac), (ins R1:$rs, R1:$rt),
+           [(set R0:$ac, (OpNode R1:$rs, R1:$rt))], Itin>,
+  PseudoInstExpansion<(RealInst R1:$rs, R1:$rt)> {
+  let isCommutable = IsComm;
+  let hasSideEffects = HasSideEffects;
+  let usesCustomInserter = UsesCustomInserter;
+}
+
+// Pseudo multiply add/sub instruction with explicit accumulator register
+// operands.
+class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode,
+                    InstrItinClass itin>
+  : PseudoSE<(outs ACC64:$ac),
+             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
+             [(set ACC64:$ac,
+              (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
+             itin>,
+    PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
+  string Constraints = "$acin = $ac";
+}
+
+class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
+          list<Register> DefRegs> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
+         [], itin, FrmR, opstr> {
+  let Defs = DefRegs;
+}
+
+// Move from Hi/Lo
+class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
+  : PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
+             [(set DstRC:$rd, (OpNode SrcRC:$hilo))], II_MFHI_MFLO>;
+
+class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
+  InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], II_MFHI_MFLO,
+         FrmR, opstr> {
+  let Uses = [UseReg];
+  let neverHasSideEffects = 1;
+}
+
+class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
+  : PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
+             [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))],
+             II_MTHI_MTLO>;
+
+class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
+  InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], II_MTHI_MTLO,
+  FrmR, opstr> {
+  let Defs = DefRegs;
+  let neverHasSideEffects = 1;
+}
+
+class EffectiveAddress<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, addr:$addr)], NoItinerary, FrmI,
+         !strconcat(opstr, "_lea")> {
+  let isCodeGenOnly = 1;
+  let DecoderMethod = "DecodeMem";
+}
+
+// Count Leading Ones/Zeros in Word
+class CountLeading0<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>;
+
+class CountLeading1<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>;
+
+// Sign Extend in Register.
+class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
+                   InstrItinClass itin> :
+  InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
+         [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>;
+
+// Subword Swap
+class SubwordSwap<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
+         NoItinerary, FrmR, opstr> {
+  let neverHasSideEffects = 1;
+}
+
+// Read Hardware
+class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
+  InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
+         II_RDHWR, FrmR>;
+
+// Ext and Ins
+class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size),
+         !strconcat(opstr, " $rt, $rs, $pos, $size"),
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], NoItinerary,
+         FrmR, opstr>, ISA_MIPS32R2;
+
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src),
+         !strconcat(opstr, " $rt, $rs, $pos, $size"),
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
+         NoItinerary, FrmR, opstr>, ISA_MIPS32R2 {
+  let Constraints = "$src = $rt";
+}
+
+// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
+class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+
+// Atomic Compare & Swap.
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+
+class LLBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let mayLoad = 1;
+}
+
+class SCBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class MFC3OP<string asmstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt, RO:$rd, uimm16:$sel), (ins),
+         !strconcat(asmstr, "\t$rt, $rd, $sel"), [], NoItinerary, FrmFR>;
+
+class TrapBase<Instruction RealInst>
+  : PseudoSE<(outs), (ins), [(trap)], NoItinerary>,
+    PseudoInstExpansion<(RealInst 0, 0)> {
+  let isBarrier = 1;
+  let isTerminator = 1;
+  let isCodeGenOnly = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// Return RA.
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in
+def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
+                                  [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                                  [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I8   : Atomic2Ops<atomic_load_add_8, GPR32>;
+  def ATOMIC_LOAD_ADD_I16  : Atomic2Ops<atomic_load_add_16, GPR32>;
+  def ATOMIC_LOAD_ADD_I32  : Atomic2Ops<atomic_load_add_32, GPR32>;
+  def ATOMIC_LOAD_SUB_I8   : Atomic2Ops<atomic_load_sub_8, GPR32>;
+  def ATOMIC_LOAD_SUB_I16  : Atomic2Ops<atomic_load_sub_16, GPR32>;
+  def ATOMIC_LOAD_SUB_I32  : Atomic2Ops<atomic_load_sub_32, GPR32>;
+  def ATOMIC_LOAD_AND_I8   : Atomic2Ops<atomic_load_and_8, GPR32>;
+  def ATOMIC_LOAD_AND_I16  : Atomic2Ops<atomic_load_and_16, GPR32>;
+  def ATOMIC_LOAD_AND_I32  : Atomic2Ops<atomic_load_and_32, GPR32>;
+  def ATOMIC_LOAD_OR_I8    : Atomic2Ops<atomic_load_or_8, GPR32>;
+  def ATOMIC_LOAD_OR_I16   : Atomic2Ops<atomic_load_or_16, GPR32>;
+  def ATOMIC_LOAD_OR_I32   : Atomic2Ops<atomic_load_or_32, GPR32>;
+  def ATOMIC_LOAD_XOR_I8   : Atomic2Ops<atomic_load_xor_8, GPR32>;
+  def ATOMIC_LOAD_XOR_I16  : Atomic2Ops<atomic_load_xor_16, GPR32>;
+  def ATOMIC_LOAD_XOR_I32  : Atomic2Ops<atomic_load_xor_32, GPR32>;
+  def ATOMIC_LOAD_NAND_I8  : Atomic2Ops<atomic_load_nand_8, GPR32>;
+  def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
+  def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
+
+  def ATOMIC_SWAP_I8       : Atomic2Ops<atomic_swap_8, GPR32>;
+  def ATOMIC_SWAP_I16      : Atomic2Ops<atomic_swap_16, GPR32>;
+  def ATOMIC_SWAP_I32      : Atomic2Ops<atomic_swap_32, GPR32>;
+
+  def ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
+  def ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
+  def ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
+}
+
+/// Pseudo instructions for loading and storing accumulator registers.
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def LOAD_ACC64  : Load<"", ACC64>;
+  def STORE_ACC64 : Store<"", ACC64>;
+}
+
+// We need these two pseudo instructions to avoid offset calculation for long
+// branches.  See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: lui $dst, %hi($tgt - $baltgt)
+def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Expands to: addiu $dst, $src, %lo($tgt - $baltgt)
+def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MipsI Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16,
+                               add>,
+            ADDI_FM<0x9>, IsAsCheapAsAMove;
+def ADDi  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>,
+            ISA_MIPS1_NOT_32R6_64R6;
+def SLTi  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
+            SLTI_FM<0xa>;
+def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
+            SLTI_FM<0xb>;
+def ANDi  : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16,
+                               and>,
+            ADDI_FM<0xc>;
+def ORi   : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
+                               or>,
+            ADDI_FM<0xd>;
+def XORi  : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16,
+                               xor>,
+            ADDI_FM<0xe>;
+def LUi   : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def ADDu  : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
+            ADD_FM<0, 0x21>;
+def SUBu  : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
+            ADD_FM<0, 0x23>;
+let Defs = [HI0, LO0] in
+def MUL   : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+            ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
+def ADD   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
+def SUB   : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
+def SLT   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
+def SLTu  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+def AND   : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
+            ADD_FM<0, 0x24>;
+def OR    : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
+            ADD_FM<0, 0x25>;
+def XOR   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
+            ADD_FM<0, 0x26>;
+def NOR   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
+
+/// Shift Instructions
+def SLL  : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+                                   immZExt5>, SRA_FM<0, 0>;
+def SRL  : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+                                   immZExt5>, SRA_FM<2, 0>;
+def SRA  : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+                                   immZExt5>, SRA_FM<3, 0>;
+def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+           SRLV_FM<4, 0>;
+def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+           SRLV_FM<6, 0>;
+def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+           SRLV_FM<7, 0>;
+
+// Rotate Instructions
+def ROTR  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
+                                    immZExt5>,
+            SRA_FM<2, 1>, ISA_MIPS32R2;
+def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
+            SRLV_FM<6, 1>, ISA_MIPS32R2;
+
+/// Load and Store Instructions
+///  aligned
+def LB  : Load<"lb", GPR32Opnd, sextloadi8, II_LB>, MMRel, LW_FM<0x20>;
+def LBu : Load<"lbu", GPR32Opnd, zextloadi8, II_LBU, addrDefault>, MMRel,
+          LW_FM<0x24>;
+def LH  : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel,
+          LW_FM<0x21>;
+def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>;
+def LW  : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
+          LW_FM<0x23>;
+def SB  : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>;
+def SH  : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
+def SW  : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
+
+/// load/store left/right
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
+          ISA_MIPS1_NOT_32R6_64R6;
+}
+
+def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
+def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
+def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
+def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
+def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>;
+def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>;
+def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>;
+
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>,
+           ISA_MIPS2_NOT_32R6_64R6;
+
+def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>;
+def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>;
+def TRAP : TrapBase<BREAK>;
+def SDBBP : SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
+
+def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32;
+def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32;
+
+def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2;
+def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2;
+
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def WAIT : WAIT_FT<"wait">, WAIT_FM;
+
+/// Load-linked, Store-conditional
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2_NOT_32R6_64R6;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2_NOT_32R6_64R6;
+}
+
+/// Jump and Branch Instructions
+def J       : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
+              AdditionalRequires<[RelocStatic]>, IsBranch;
+def JR      : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
+def BEQ     : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BNE     : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BGEZ    : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
+              BGEZ_FM<1, 1>;
+def BGTZ    : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
+              BGEZ_FM<7, 0>;
+def BLEZ    : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
+              BGEZ_FM<6, 0>;
+def BLTZ    : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
+              BGEZ_FM<1, 0>;
+def B       : UncondBranch<BEQ>;
+
+def JAL  : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+  def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+}
+
+// FIXME: JALX really requires either MIPS16 or microMIPS in addition to MIPS32.
+def JALX  : JumpLink<"jalx", calltarget>, FJ<0x1D>, ISA_MIPS32_NOT_32R6_64R6;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+             ISA_MIPS1_NOT_32R6_64R6;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+             ISA_MIPS1_NOT_32R6_64R6;
+def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
+def TAILCALL : TailCall<J>;
+def TAILCALL_R : TailCallReg<GPR32Opnd, JR>;
+
+// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
+// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
+class PseudoIndirectBranchBase<RegisterOperand RO> :
+    MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> {
+  let isTerminator=1;
+  let isBarrier=1;
+  let hasDelaySlot = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
+
+// Return instructions are matched as a RetRA instruction, then ar expanded
+// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
+// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the
+// ISA.
+class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
+                                                        [], IIBranch> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let isReturn = 1;
+  let isCodeGenOnly = 1;
+  let hasCtrlDep = 1;
+  let hasExtraSrcRegAllocReq = 1;
+}
+
+def PseudoReturn : PseudoReturnBase<GPR32Opnd>;
+
+// Exception handling related node and instructions.
+// The conversion sequence is:
+// ISD::EH_RETURN -> MipsISD::EH_RETURN ->
+// MIPSeh_return -> (stack change + indirect branch)
+//
+// MIPSeh_return takes the place of regular return instruction
+// but takes two arguments (V1, V0) which are used for storing
+// the offset and return address respectively.
+def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+
+def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
+                                [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
+  def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
+                                                GPR64:$dst),
+                                [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
+}
+
+/// Multiply and Divide Instructions.
+def MULT  : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
+def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
+def SDIV  : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
+def UDIV  : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
+
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
+           ISA_MIPS1_NOT_32R6_64R6;
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
+           ISA_MIPS1_NOT_32R6_64R6;
+}
+
+/// Sign Ext In Register Instructions.
+def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+          SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
+def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+          SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
+
+/// Count Leading
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>,
+          ISA_MIPS32_NOT_32R6_64R6;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>,
+          ISA_MIPS32_NOT_32R6_64R6;
+
+/// Word Swap Bytes Within Halfwords
+def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2;
+
+/// No operation.
+def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
+
+// FrameIndexes are legalized when they are operands from load/store
+// instructions. The same not happens for stack address copies, so an
+// add op with mem ComplexPattern is used and the stack address copy
+// can be matched. It's similar to Sparc LEA_ADDRi
+def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
+
+// MADD*/MSUB*
+def MADD  : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
+            ISA_MIPS32_NOT_32R6_64R6;
+def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
+            ISA_MIPS32_NOT_32R6_64R6;
+def MSUB  : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
+            ISA_MIPS32_NOT_32R6_64R6;
+def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
+            ISA_MIPS32_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotDSP] in {
+def PseudoMULT  : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>,
+                  ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>,
+                  ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMADD  : MAddSubPseudo<MADD, MipsMAdd, II_MADD>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUB  : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+}
+
+def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
+                               0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
+                               0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+
+def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+
+def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
+def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
+
+/// Move Control Registers From/To CPU Registers
+def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>, ISA_MIPS32;
+def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>, ISA_MIPS32;
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
+
+class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
+                                      FrmOther>;
+def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>;
+def EHB : Barrier<"ehb">, BARRIER_FM<3>;
+def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
+
+// JR_HB and JALR_HB are defined here using the new style naming
+// scheme because some of this code is shared with Mips32r6InstrInfo.td
+// and because of that it doesn't follow the naming convention of the
+// rest of the file. To avoid a mixture of old vs new style, the new
+// style was chosen.
+class JR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rs");
+  list<dag> Pattern = [];
+}
+
+class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+  list<dag> Pattern = [];
+}
+
+class JR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>,
+                   JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+  let isBranch=1;
+  let isIndirectBranch=1;
+  let hasDelaySlot=1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+class JALR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>,
+                     JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> {
+  let isIndirectBranch=1;
+  let hasDelaySlot=1;
+}
+
+class JR_HB_ENC : JR_HB_FM<8>;
+class JALR_HB_ENC : JALR_HB_FM<9>;
+
+def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
+def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
+
+class TLB<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
+                                      FrmOther>;
+def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>;
+def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>;
+def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>;
+def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>;
+
+class CacheOp<string instr_asm, Operand MemOpnd, RegisterOperand GPROpnd> :
+    InstSE<(outs), (ins  MemOpnd:$addr, uimm5:$hint),
+           !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther>;
+
+def CACHE : CacheOp<"cache", mem, GPR32Opnd>, CACHEOP_FM<0b101111>,
+            INSN_MIPS3_32_NOT_32R6_64R6;
+def PREF :  CacheOp<"pref", mem, GPR32Opnd>, CACHEOP_FM<0b110011>,
+            INSN_MIPS3_32_NOT_32R6_64R6;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"move $dst, $src",
+                    (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>,
+      GPR_32 {
+  let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"addu $rs, $rt, $imm",
+                    (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"add $rs, $rt, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"and $rs, $rt, $imm",
+                    (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"and $rs, $imm",
+                    (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
+def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
+let Predicates = [NotInMicroMips] in {
+def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+}
+def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
+def : MipsInstAlias<"not $rt, $rs",
+                    (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+def : MipsInstAlias<"neg $rt, $rs",
+                    (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"negu $rt",
+                    (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 0>;
+def : MipsInstAlias<"negu $rt, $rs",
+                    (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"slt $rs, $rt, $imm",
+                    (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"sltu $rt, $rs, $imm",
+                    (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm16:$imm), 0>;
+def : MipsInstAlias<"xor $rs, $rt, $imm",
+                    (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+def : MipsInstAlias<"or $rs, $rt, $imm",
+                    (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+def : MipsInstAlias<"or $rs, $imm",
+                    (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"bnez $rs,$offset",
+                    (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"beqz $rs,$offset",
+                    (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
+    
+def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
+def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
+def : MipsInstAlias<"ei", (EI ZERO), 1>;
+def : MipsInstAlias<"di", (DI ZERO), 1>;
+
+def  : MipsInstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0),
+                     1>;
+def  : MipsInstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0),
+                     1>;
+def  : MipsInstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"sll $rd, $rt, $rs",
+                     (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"sub, $rd, $rs, $imm",
+                    (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
+                          InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"sub $rs, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, InvertedImOperand:$imm),
+                    0>;
+def : MipsInstAlias<"subu, $rd, $rs, $imm",
+                    (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs,
+                           InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"subu $rs, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs,
+                                             InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"sra $rd, $rt, $rs",
+                    (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"srl $rd, $rt, $rs",
+                    (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
+def : MipsInstAlias<"sync",
+                    (SYNC 0), 1>, ISA_MIPS2;
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+                     !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadImm32Reg : LoadImm32<"li", uimm5, GPR32Opnd>;
+
+class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
+                     !strconcat(instr_asm, "\t$rt, $addr")> ;
+def LoadAddr32Reg : LoadAddress<"la", mem, GPR32Opnd>;
+
+class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+                     !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadAddr32Imm : LoadAddressImm<"la", uimm5, GPR32Opnd>;
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Load/store pattern templates.
+class LoadRegImmPat<Instruction LoadInst, ValueType ValTy, PatFrag Node> :
+  MipsPat<(ValTy (Node addrRegImm:$a)), (LoadInst addrRegImm:$a)>;
+
+class StoreRegImmPat<Instruction StoreInst, ValueType ValTy> :
+  MipsPat<(store ValTy:$v, addrRegImm:$a), (StoreInst ValTy:$v, addrRegImm:$a)>;
+
+// Small immediates
+def : MipsPat<(i32 immSExt16:$in),
+              (ADDiu ZERO, imm:$in)>;
+def : MipsPat<(i32 immZExt16:$in),
+              (ORi ZERO, imm:$in)>;
+def : MipsPat<(i32 immLow16Zero:$in),
+              (LUi (HI16 imm:$in))>;
+
+// Arbitrary immediates
+def : MipsPat<(i32 imm:$imm),
+          (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Carry MipsPatterns
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+              (SUBu GPR32:$lhs, GPR32:$rhs)>;
+let AdditionalPredicates = [NotDSP] in {
+  def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
+                (ADDu GPR32:$lhs, GPR32:$rhs)>;
+  def : MipsPat<(addc  GPR32:$src, immSExt16:$imm),
+                (ADDiu GPR32:$src, imm:$imm)>;
+}
+
+// SYNC
+def : MipsPat<(MipsSync (i32 immz)),
+              (SYNC 0)>, ISA_MIPS2;
+
+// Call
+def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+              (JAL tglobaladdr:$dst)>;
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+              (JAL texternalsym:$dst)>;
+//def : MipsPat<(MipsJmpLink GPR32:$dst),
+//              (JALR GPR32:$dst)>;
+
+// Tail call
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+              (TAILCALL tglobaladdr:$dst)>;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+              (TAILCALL texternalsym:$dst)>;
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi texternalsym:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
+
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaladdr:$lo)),
+              (ADDiu GPR32:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tblockaddress:$lo)),
+              (ADDiu GPR32:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tjumptable:$lo)),
+              (ADDiu GPR32:$hi, tjumptable:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tconstpool:$lo)),
+              (ADDiu GPR32:$hi, tconstpool:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (ADDiu GPR32:$hi, tglobaltlsaddr:$lo)>;
+
+// gp_rel relocs
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+              (ADDiu GPR32:$gp, tglobaladdr:$in)>;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+              (ADDiu GPR32:$gp, tconstpool:$in)>;
+
+// wrapper_pic
+class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+      MipsPat<(MipsWrapper RC:$gp, node:$in),
+              (ADDiuOp RC:$gp, node:$in)>;
+
+def : WrapperPat<tglobaladdr, ADDiu, GPR32>;
+def : WrapperPat<tconstpool, ADDiu, GPR32>;
+def : WrapperPat<texternalsym, ADDiu, GPR32>;
+def : WrapperPat<tblockaddress, ADDiu, GPR32>;
+def : WrapperPat<tjumptable, ADDiu, GPR32>;
+def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>;
+
+// Mips does not have "not", so we expand our way
+def : MipsPat<(not GPR32:$in),
+              (NOR GPR32Opnd:$in, ZERO)>;
+
+// extended loads
+def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
+def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
+def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+
+// peepholes
+def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+
+// brcond patterns
+multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
+                      Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp,
+                      Instruction SLTiuOp, Register ZEROReg> {
+def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+              (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+              (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+
+def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+              (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+              (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+              (BEQ (SLTiOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setugt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+              (BEQ (SLTiuOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+
+def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+
+def : MipsPat<(brcond RC:$cond, bb:$dst),
+              (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+}
+
+defm : BrcondPats<GPR32, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+              (BLEZ i32:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+              (BGEZ i32:$lhs, bb:$dst)>;
+
+// setcc patterns
+multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
+                     Instruction SLTuOp, Register ZEROReg> {
+  def : MipsPat<(seteq RC:$lhs, 0),
+                (SLTiuOp RC:$lhs, 1)>;
+  def : MipsPat<(setne RC:$lhs, 0),
+                (SLTuOp ZEROReg, RC:$lhs)>;
+  def : MipsPat<(seteq RC:$lhs, RC:$rhs),
+                (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+  def : MipsPat<(setne RC:$lhs, RC:$rhs),
+                (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+}
+
+multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setle RC:$lhs, RC:$rhs),
+                (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>;
+  def : MipsPat<(setule RC:$lhs, RC:$rhs),
+                (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+}
+
+multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setgt RC:$lhs, RC:$rhs),
+                (SLTOp RC:$rhs, RC:$lhs)>;
+  def : MipsPat<(setugt RC:$lhs, RC:$rhs),
+                (SLTuOp RC:$rhs, RC:$lhs)>;
+}
+
+multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setge RC:$lhs, RC:$rhs),
+                (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>;
+  def : MipsPat<(setuge RC:$lhs, RC:$rhs),
+                (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+}
+
+multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp,
+                        Instruction SLTiuOp> {
+  def : MipsPat<(setge RC:$lhs, immSExt16:$rhs),
+                (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+  def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs),
+                (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+}
+
+defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>;
+defm : SetlePats<GPR32, SLT, SLTu>;
+defm : SetgtPats<GPR32, SLT, SLTu>;
+defm : SetgePats<GPR32, SLT, SLTu>;
+defm : SetgeImmPats<GPR32, SLTi, SLTiu>;
+
+// bswap pattern
+def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
+
+// Load halfword/word patterns.
+let AddedComplexity = 40 in {
+  def : LoadRegImmPat<LBu, i32, zextloadi8>;
+  def : LoadRegImmPat<LH, i32, sextloadi16>;
+  def : LoadRegImmPat<LW, i32, load>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFPU.td"
+include "Mips64InstrInfo.td"
+include "MipsCondMov.td"
+
+include "Mips32r6InstrInfo.td"
+include "Mips64r6InstrInfo.td"
+
+//
+// Mips16
+
+include "Mips16InstrFormats.td"
+include "Mips16InstrInfo.td"
+
+// DSP
+include "MipsDSPInstrFormats.td"
+include "MipsDSPInstrInfo.td"
+
+// MSA
+include "MipsMSAInstrFormats.td"
+include "MipsMSAInstrInfo.td"
+
+// Micromips
+include "MicroMipsInstrFormats.td"
+include "MicroMipsInstrInfo.td"
+include "MicroMipsInstrFPU.td"
diff --git a/contrib/llvm/lib/Target/Mips/MipsJITInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsJITInfo.cpp
new file mode 100644
index 0000000..2072488
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsJITInfo.cpp
@@ -0,0 +1,286 @@
+//===-- MipsJITInfo.cpp - Implement the Mips JIT Interface ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Mips target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsJITInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+
+void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)Old;
+  const unsigned NopInstr = 0x0;
+
+  // If the functions are in the same memory segment, insert PC-region branch.
+  if ((NewAddr & 0xF0000000) == ((OldAddr + 4) & 0xF0000000)) {
+    unsigned *OldInstruction = (unsigned *)Old;
+    *OldInstruction = 0x08000000;
+    unsigned JTargetAddr = NewAddr & 0x0FFFFFFC;
+
+    JTargetAddr >>= 2;
+    *OldInstruction |= JTargetAddr;
+
+    // Insert a NOP.
+    OldInstruction++;
+    *OldInstruction = NopInstr;
+
+    sys::Memory::InvalidateInstructionCache(Old, 2 * 4);
+  } else {
+    // We need to clear hint bits from the instruction, in case it is 'jr ra'.
+    const unsigned HintMask = 0xFFFFF83F, ReturnSequence = 0x03e00008;
+    unsigned* CurrentInstr = (unsigned*)Old;
+    unsigned CurrInstrHintClear = (*CurrentInstr) & HintMask;
+    unsigned* NextInstr = CurrentInstr + 1;
+    unsigned NextInstrHintClear = (*NextInstr) & HintMask;
+
+    // Do absolute jump if there are 2 or more instructions before return from
+    // the old function.
+    if ((CurrInstrHintClear != ReturnSequence) &&
+        (NextInstrHintClear != ReturnSequence)) {
+      const unsigned LuiT0Instr = 0x3c080000, AddiuT0Instr = 0x25080000;
+      const unsigned JrT0Instr = 0x01000008;
+      // lui  t0,  high 16 bit of the NewAddr
+      (*(CurrentInstr++)) = LuiT0Instr | ((NewAddr & 0xffff0000) >> 16);
+      // addiu  t0, t0, low 16 bit of the NewAddr
+      (*(CurrentInstr++)) = AddiuT0Instr | (NewAddr & 0x0000ffff);
+      // jr t0
+      (*(CurrentInstr++)) = JrT0Instr;
+      (*CurrentInstr) = NopInstr;
+
+      sys::Memory::InvalidateInstructionCache(Old, 4 * 4);
+    } else {
+      // Unsupported case
+      report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
+    }
+  }
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring. This code saves registers, calls
+// MipsCompilationCallbackC and restores registers.
+extern "C" {
+#if defined (__mips__)
+void MipsCompilationCallback();
+
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl " ASMPREFIX "MipsCompilationCallback\n"
+    ASMPREFIX "MipsCompilationCallback:\n"
+    ".ent " ASMPREFIX "MipsCompilationCallback\n"
+    ".frame  $sp, 32, $ra\n"
+    ".set  noreorder\n"
+    ".cpload $t9\n"
+
+    "addiu $sp, $sp, -64\n"
+    ".cprestore 16\n"
+
+    // Save argument registers a0, a1, a2, a3, f12, f14 since they may contain
+    // stuff for the real target function right now. We have to act as if this
+    // whole compilation callback doesn't exist as far as the caller is
+    // concerned. We also need to save the ra register since it contains the
+    // original return address, and t8 register since it contains the address
+    // of the end of function stub.
+    "sw $a0, 20($sp)\n"
+    "sw $a1, 24($sp)\n"
+    "sw $a2, 28($sp)\n"
+    "sw $a3, 32($sp)\n"
+    "sw $ra, 36($sp)\n"
+    "sw $t8, 40($sp)\n"
+    "sdc1 $f12, 48($sp)\n"
+    "sdc1 $f14, 56($sp)\n"
+
+    // t8 points at the end of function stub. Pass the beginning of the stub
+    // to the MipsCompilationCallbackC.
+    "addiu $a0, $t8, -16\n"
+    "jal " ASMPREFIX "MipsCompilationCallbackC\n"
+    "nop\n"
+
+    // Restore registers.
+    "lw $a0, 20($sp)\n"
+    "lw $a1, 24($sp)\n"
+    "lw $a2, 28($sp)\n"
+    "lw $a3, 32($sp)\n"
+    "lw $ra, 36($sp)\n"
+    "lw $t8, 40($sp)\n"
+    "ldc1 $f12, 48($sp)\n"
+    "ldc1 $f14, 56($sp)\n"
+    "addiu $sp, $sp, 64\n"
+
+    // Jump to the (newly modified) stub to invoke the real function.
+    "addiu $t8, $t8, -16\n"
+    "jr $t8\n"
+    "nop\n"
+
+    ".set  reorder\n"
+    ".end " ASMPREFIX "MipsCompilationCallback\n"
+      );
+#else  // host != Mips
+  void MipsCompilationCallback() {
+    llvm_unreachable(
+      "Cannot call MipsCompilationCallback() on a non-Mips arch!");
+  }
+#endif
+}
+
+/// MipsCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void MipsCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+  // Rewrite the function stub so that we don't end up here every time we
+  // execute the call. We're replacing the first four instructions of the
+  // stub with code that jumps to the compiled function:
+  //   lui $t9, %hi(NewVal)
+  //   addiu $t9, $t9, %lo(NewVal)
+  //   jr $t9
+  //   nop
+
+  int Hi = ((unsigned)NewVal & 0xffff0000) >> 16;
+  if ((NewVal & 0x8000) != 0)
+    Hi++;
+  int Lo = (int)(NewVal & 0xffff);
+
+  *(intptr_t *)(StubAddr) = 0xf << 26 | 25 << 16 | Hi;
+  *(intptr_t *)(StubAddr + 4) = 9 << 26 | 25 << 21 | 25 << 16 | Lo;
+  *(intptr_t *)(StubAddr + 8) = 25 << 21 | 8;
+  *(intptr_t *)(StubAddr + 12) = 0;
+
+  sys::Memory::InvalidateInstructionCache((void*) StubAddr, 16);
+}
+
+TargetJITInfo::LazyResolverFn MipsJITInfo::getLazyResolverFunction(
+    JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return MipsCompilationCallback;
+}
+
+TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() {
+  // The stub contains 4 4-byte instructions, aligned at 4 bytes. See
+  // emitFunctionStub for details.
+  StubLayout Result = { 4*4, 4 };
+  return Result;
+}
+
+void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn,
+                                    JITCodeEmitter &JCE) {
+  JCE.emitAlignment(4);
+  void *Addr = (void*) (JCE.getCurrentPCValue());
+  if (!sys::Memory::setRangeWritable(Addr, 16))
+    llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+  intptr_t EmittedAddr;
+  if (Fn != (void*)(intptr_t)MipsCompilationCallback)
+    EmittedAddr = (intptr_t)Fn;
+  else
+    EmittedAddr = (intptr_t)MipsCompilationCallback;
+
+
+  int Hi = ((unsigned)EmittedAddr & 0xffff0000) >> 16;
+  if ((EmittedAddr & 0x8000) != 0)
+    Hi++;
+  int Lo = (int)(EmittedAddr & 0xffff);
+
+  // lui $t9, %hi(EmittedAddr)
+  // addiu $t9, $t9, %lo(EmittedAddr)
+  // jalr $t8, $t9
+  // nop
+  if (IsLittleEndian) {
+    JCE.emitWordLE(0xf << 26 | 25 << 16 | Hi);
+    JCE.emitWordLE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
+    JCE.emitWordLE(25 << 21 | 24 << 11 | 9);
+    JCE.emitWordLE(0);
+  } else {
+    JCE.emitWordBE(0xf << 26 | 25 << 16 | Hi);
+    JCE.emitWordBE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
+    JCE.emitWordBE(25 << 21 | 24 << 11 | 9);
+    JCE.emitWordBE(0);
+  }
+
+  sys::Memory::InvalidateInstructionCache(Addr, 16);
+  if (!sys::Memory::setRangeExecutable(Addr, 16))
+    llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+  return Addr;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void MipsJITInfo::relocate(void *Function, MachineRelocation *MR,
+                           unsigned NumRelocs, unsigned char *GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+
+    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+    switch ((Mips::RelocationType) MR->getRelocationType()) {
+    case Mips::reloc_mips_pc16:
+      ResultPtr = (((ResultPtr - (intptr_t) RelocPos) - 4) >> 2) & 0xffff;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_26:
+      ResultPtr = (ResultPtr & 0x0fffffff) >> 2;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_hi:
+      ResultPtr = ResultPtr >> 16;
+      if ((((intptr_t) (MR->getResultPointer()) & 0xffff) >> 15) == 1) {
+        ResultPtr += 1;
+      }
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_lo: {
+      // Addend is needed for unaligned load/store instructions, where offset
+      // for the second load/store in the expanded instruction sequence must
+      // be modified by +1 or +3. Otherwise, Addend is 0.
+      int Addend = *((unsigned*) RelocPos) & 0xffff;
+      ResultPtr = (ResultPtr + Addend) & 0xffff;
+      *((unsigned*) RelocPos) &= 0xffff0000;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+    }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsJITInfo.h b/contrib/llvm/lib/Target/Mips/MipsJITInfo.h
new file mode 100644
index 0000000..c9dfd83
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsJITInfo.h
@@ -0,0 +1,71 @@
+//===- MipsJITInfo.h - Mips Implementation of the JIT Interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSJITINFO_H
+#define MIPSJITINFO_H
+
+#include "MipsMachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class MipsTargetMachine;
+
+class MipsJITInfo : public TargetJITInfo {
+
+  bool IsPIC;
+  bool IsLittleEndian;
+
+  public:
+    explicit MipsJITInfo() :
+      IsPIC(false), IsLittleEndian(true) {}
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    void replaceMachineCodeForFunction(void *Old, void *New) override;
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on Mips.
+    StubLayout getStubLayout() override;
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    void *emitFunctionStub(const Function *F, void *Fn,
+                           JITCodeEmitter &JCE) override;
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    void relocate(void *Function, MachineRelocation *MR,
+                  unsigned NumRelocs, unsigned char *GOTBase) override;
+
+    /// Initialize - Initialize internal stage for the function being JITted.
+    void Initialize(const MachineFunction &MF, bool isPIC,
+                    bool isLittleEndian) {
+      IsPIC = isPIC;
+      IsLittleEndian = isLittleEndian;
+    }
+
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
new file mode 100644
index 0000000..27110b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
@@ -0,0 +1,519 @@
+//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands a branch or jump instruction into a long branch if its
+// offset is too large to fit into its immediate field.
+//
+// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-long-branch"
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+static cl::opt<bool> SkipLongBranch(
+  "skip-mips-long-branch",
+  cl::init(false),
+  cl::desc("MIPS: Skip long branch pass."),
+  cl::Hidden);
+
+static cl::opt<bool> ForceLongBranch(
+  "force-mips-long-branch",
+  cl::init(false),
+  cl::desc("MIPS: Expand all branches to long format."),
+  cl::Hidden);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  struct MBBInfo {
+    uint64_t Size, Address;
+    bool HasLongBranch;
+    MachineInstr *Br;
+
+    MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {}
+  };
+
+  class MipsLongBranch : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    MipsLongBranch(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 :
+            (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl() ? 9 : 10))) {}
+
+    const char *getPassName() const override {
+      return "Mips Long Branch";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F) override;
+
+  private:
+    void splitMBB(MachineBasicBlock *MBB);
+    void initMBBInfo();
+    int64_t computeOffset(const MachineInstr *Br);
+    void replaceBranch(MachineBasicBlock &MBB, Iter Br, DebugLoc DL,
+                       MachineBasicBlock *MBBOpnd);
+    void expandToLongBranch(MBBInfo &Info);
+
+    const TargetMachine &TM;
+    MachineFunction *MF;
+    SmallVector<MBBInfo, 16> MBBInfos;
+    bool IsPIC;
+    unsigned ABI;
+    unsigned LongBranchSeqSize;
+  };
+
+  char MipsLongBranch::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+  return new MipsLongBranch(tm);
+}
+
+/// Iterate over list of Br's operands and search for a MachineBasicBlock
+/// operand.
+static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
+  for (unsigned I = 0, E = Br.getDesc().getNumOperands(); I < E; ++I) {
+    const MachineOperand &MO = Br.getOperand(I);
+
+    if (MO.isMBB())
+      return MO.getMBB();
+  }
+
+  assert(false && "This instruction does not have an MBB operand.");
+  return nullptr;
+}
+
+// Traverse the list of instructions backwards until a non-debug instruction is
+// found or it reaches E.
+static ReverseIter getNonDebugInstr(ReverseIter B, ReverseIter E) {
+  for (; B != E; ++B)
+    if (!B->isDebugValue())
+      return B;
+
+  return E;
+}
+
+// Split MBB if it has two direct jumps/branches.
+void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+  ReverseIter End = MBB->rend();
+  ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End);
+
+  // Return if MBB has no branch instructions.
+  if ((LastBr == End) ||
+      (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch()))
+    return;
+
+  ReverseIter FirstBr = getNonDebugInstr(std::next(LastBr), End);
+
+  // MBB has only one branch instruction if FirstBr is not a branch
+  // instruction.
+  if ((FirstBr == End) ||
+      (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch()))
+    return;
+
+  assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found.");
+
+  // Create a new MBB. Move instructions in MBB to the newly created MBB.
+  MachineBasicBlock *NewMBB =
+    MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+
+  // Insert NewMBB and fix control flow.
+  MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
+  NewMBB->transferSuccessors(MBB);
+  NewMBB->removeSuccessor(Tgt);
+  MBB->addSuccessor(NewMBB);
+  MBB->addSuccessor(Tgt);
+  MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+
+  NewMBB->splice(NewMBB->end(), MBB, (++LastBr).base(), MBB->end());
+}
+
+// Fill MBBInfos.
+void MipsLongBranch::initMBBInfo() {
+  // Split the MBBs if they have two branches. Each basic block should have at
+  // most one branch after this loop is executed.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;)
+    splitMBB(I++);
+
+  MF->RenumberBlocks();
+  MBBInfos.clear();
+  MBBInfos.resize(MF->size());
+
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+  for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+
+    // Compute size of MBB.
+    for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
+         MI != MBB->instr_end(); ++MI)
+      MBBInfos[I].Size += TII->GetInstSizeInBytes(&*MI);
+
+    // Search for MBB's branch instruction.
+    ReverseIter End = MBB->rend();
+    ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End);
+
+    if ((Br != End) && !Br->isIndirectBranch() &&
+        (Br->isConditionalBranch() ||
+         (Br->isUnconditionalBranch() &&
+          TM.getRelocationModel() == Reloc::PIC_)))
+      MBBInfos[I].Br = (++Br).base();
+  }
+}
+
+// Compute offset of branch in number of bytes.
+int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+  int64_t Offset = 0;
+  int ThisMBB = Br->getParent()->getNumber();
+  int TargetMBB = getTargetMBB(*Br)->getNumber();
+
+  // Compute offset of a forward branch.
+  if (ThisMBB < TargetMBB) {
+    for (int N = ThisMBB + 1; N < TargetMBB; ++N)
+      Offset += MBBInfos[N].Size;
+
+    return Offset + 4;
+  }
+
+  // Compute offset of a backward branch.
+  for (int N = ThisMBB; N >= TargetMBB; --N)
+    Offset += MBBInfos[N].Size;
+
+  return -Offset + 4;
+}
+
+// Replace Br with a branch which has the opposite condition code and a
+// MachineBasicBlock operand MBBOpnd.
+void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+                                   DebugLoc DL, MachineBasicBlock *MBBOpnd) {
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+  unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
+  const MCInstrDesc &NewDesc = TII->get(NewOpc);
+
+  MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc);
+
+  for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) {
+    MachineOperand &MO = Br->getOperand(I);
+
+    if (!MO.isReg()) {
+      assert(MO.isMBB() && "MBB operand expected.");
+      break;
+    }
+
+    MIB.addReg(MO.getReg());
+  }
+
+  MIB.addMBB(MBBOpnd);
+
+  // Bundle the instruction in the delay slot to the newly created branch
+  // and erase the original branch.
+  assert(Br->isBundledWithSucc());
+  MachineBasicBlock::instr_iterator II(Br);
+  MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
+  Br->eraseFromParent();
+}
+
+// Expand branch instructions to long branches.
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+  MachineBasicBlock::iterator Pos;
+  MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
+  DebugLoc DL = I.Br->getDebugLoc();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
+  MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
+
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+
+  MF->insert(FallThroughMBB, LongBrMBB);
+  MBB->removeSuccessor(TgtMBB);
+  MBB->addSuccessor(LongBrMBB);
+
+  if (IsPIC) {
+    MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
+    MF->insert(FallThroughMBB, BalTgtMBB);
+    LongBrMBB->addSuccessor(BalTgtMBB);
+    BalTgtMBB->addSuccessor(TgtMBB);
+
+    // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
+    // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
+    // pseudo-instruction wrapping BGEZAL).
+
+    const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+    unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
+
+    if (ABI != MipsSubtarget::N64) {
+      // $longbr:
+      //  addiu $sp, $sp, -8
+      //  sw $ra, 0($sp)
+      //  lui $at, %hi($tgt - $baltgt)
+      //  bal $baltgt
+      //  addiu $at, $at, %lo($tgt - $baltgt)
+      // $baltgt:
+      //  addu $at, $ra, $at
+      //  lw $ra, 0($sp)
+      //  jr $at
+      //  addiu $sp, $sp, 8
+      // $fallthrough:
+      //
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+        .addReg(Mips::SP).addImm(-8);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+
+      // LUi and ADDiu instructions create 32-bit offset of the target basic
+      // block from the target of BAL instruction.  We cannot use immediate
+      // value for this offset because it cannot be determined accurately when
+      // the program has inline assembly statements.  We therefore use the
+      // relocation expressions %hi($tgt-$baltgt) and %lo($tgt-$baltgt) which
+      // are resolved during the fixup, so the values will always be correct.
+      //
+      // Since we cannot create %hi($tgt-$baltgt) and %lo($tgt-$baltgt)
+      // expressions at this point (it is possible only at the MC layer),
+      // we replace LUi and ADDiu with pseudo instructions
+      // LONG_BRANCH_LUi and LONG_BRANCH_ADDiu, and add both basic
+      // blocks as operands to these instructions.  When lowering these pseudo
+      // instructions to LUi and ADDiu in the MC layer, we will create
+      // %hi($tgt-$baltgt) and %lo($tgt-$baltgt) expressions and add them as
+      // operands to lowered instructions.
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
+        .addMBB(TgtMBB).addMBB(BalTgtMBB);
+      MIBundleBuilder(*LongBrMBB, Pos)
+          .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+          .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+                      .addReg(Mips::AT)
+                      .addMBB(TgtMBB)
+                      .addMBB(BalTgtMBB));
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+        .addReg(Mips::RA).addReg(Mips::AT);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+
+      if (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+        MIBundleBuilder(*BalTgtMBB, Pos)
+          .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+          .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
+                  .addReg(Mips::SP).addImm(8));
+      } else {
+        // In NaCl, modifying the sp is not allowed in branch delay slot.
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+          .addReg(Mips::SP).addImm(8);
+
+        MIBundleBuilder(*BalTgtMBB, Pos)
+          .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+          .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+        // Bundle-align the target of indirect branch JR.
+        TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+      }
+    } else {
+      // $longbr:
+      //  daddiu $sp, $sp, -16
+      //  sd $ra, 0($sp)
+      //  daddiu $at, $zero, %hi($tgt - $baltgt)
+      //  dsll $at, $at, 16
+      //  bal $baltgt
+      //  daddiu $at, $at, %lo($tgt - $baltgt)
+      // $baltgt:
+      //  daddu $at, $ra, $at
+      //  ld $ra, 0($sp)
+      //  jr64 $at
+      //  daddiu $sp, $sp, 16
+      // $fallthrough:
+      //
+
+      // We assume the branch is within-function, and that offset is within
+      // +/- 2GB.  High 32 bits will therefore always be zero.
+
+      // Note that this will work even if the offset is negative, because
+      // of the +1 modification that's added in that case.  For example, if the
+      // offset is -1MB (0xFFFFFFFFFFF00000), the computation for %higher is
+      //
+      // 0xFFFFFFFFFFF00000 + 0x80008000 = 0x000000007FF08000
+      //
+      // and the bits [47:32] are zero.  For %highest
+      //
+      // 0xFFFFFFFFFFF00000 + 0x800080008000 = 0x000080007FF08000
+      //
+      // and the bits [63:48] are zero.
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+        .addReg(Mips::SP_64).addImm(-16);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+              Mips::AT_64).addReg(Mips::ZERO_64)
+                          .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(16);
+
+      MIBundleBuilder(*LongBrMBB, Pos)
+          .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+          .append(
+              BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+                  .addReg(Mips::AT_64)
+                  .addMBB(TgtMBB, MipsII::MO_ABS_LO)
+                  .addMBB(BalTgtMBB));
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+        .addReg(Mips::RA_64).addReg(Mips::AT_64);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+
+      MIBundleBuilder(*BalTgtMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64))
+        .append(BuildMI(*MF, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+                .addReg(Mips::SP_64).addImm(16));
+    }
+
+    assert(LongBrMBB->size() + BalTgtMBB->size() == LongBranchSeqSize);
+  } else {
+    // $longbr:
+    //  j $tgt
+    //  nop
+    // $fallthrough:
+    //
+    Pos = LongBrMBB->begin();
+    LongBrMBB->addSuccessor(TgtMBB);
+    MIBundleBuilder(*LongBrMBB, Pos)
+      .append(BuildMI(*MF, DL, TII->get(Mips::J)).addMBB(TgtMBB))
+      .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+    assert(LongBrMBB->size() == LongBranchSeqSize);
+  }
+
+  if (I.Br->isUnconditionalBranch()) {
+    // Change branch destination.
+    assert(I.Br->getDesc().getNumOperands() == 1);
+    I.Br->RemoveOperand(0);
+    I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB));
+  } else
+    // Change branch destination and reverse condition.
+    replaceBranch(*MBB, I.Br, DL, FallThroughMBB);
+}
+
+static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
+  MachineBasicBlock &MBB = F.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL = MBB.findDebugLoc(MBB.begin());
+  BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0)
+    .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  MBB.removeLiveIn(Mips::V0);
+}
+
+bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+
+  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  if (STI.inMips16Mode() || !STI.enableLongBranchPass())
+    return false;
+  if ((TM.getRelocationModel() == Reloc::PIC_) &&
+      TM.getSubtarget<MipsSubtarget>().isABI_O32() &&
+      F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+    emitGPDisp(F, TII);
+
+  if (SkipLongBranch)
+    return true;
+
+  MF = &F;
+  initMBBInfo();
+
+  SmallVectorImpl<MBBInfo>::iterator I, E = MBBInfos.end();
+  bool EverMadeChange = false, MadeChange = true;
+
+  while (MadeChange) {
+    MadeChange = false;
+
+    for (I = MBBInfos.begin(); I != E; ++I) {
+      // Skip if this MBB doesn't have a branch or the branch has already been
+      // converted to a long branch.
+      if (!I->Br || I->HasLongBranch)
+        continue;
+
+      int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+      int64_t Offset = computeOffset(I->Br) / ShVal;
+
+      if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+        // The offset calculation does not include sandboxing instructions
+        // that will be added later in the MC layer.  Since at this point we
+        // don't know the exact amount of code that "sandboxing" will add, we
+        // conservatively estimate that code will not grow more than 100%.
+        Offset *= 2;
+      }
+
+      // Check if offset fits into 16-bit immediate field of branches.
+      if (!ForceLongBranch && isInt<16>(Offset))
+        continue;
+
+      I->HasLongBranch = true;
+      I->Size += LongBranchSeqSize * 4;
+      ++LongBranches;
+      EverMadeChange = MadeChange = true;
+    }
+  }
+
+  if (!EverMadeChange)
+    return true;
+
+  // Compute basic block addresses.
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    uint64_t Address = 0;
+
+    for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+      I->Address = Address;
+  }
+
+  // Do the expansion.
+  for (I = MBBInfos.begin(); I != E; ++I)
+    if (I->HasLongBranch)
+      expandToLongBranch(*I);
+
+  MF->RenumberBlocks();
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
new file mode 100644
index 0000000..821392e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -0,0 +1,233 @@
+//===-- MipsMCInstLower.cpp - Convert Mips MachineInstr to MCInst ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsMCInstLower.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
+  : AsmPrinter(asmprinter) {}
+
+void MipsMCInstLower::Initialize(MCContext *C) {
+  Ctx = C;
+}
+
+MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                              MachineOperandType MOTy,
+                                              unsigned Offset) const {
+  MCSymbolRefExpr::VariantKind Kind;
+  const MCSymbol *Symbol;
+
+  switch(MO.getTargetFlags()) {
+  default:                   llvm_unreachable("Invalid target flag!");
+  case MipsII::MO_NO_FLAG:   Kind = MCSymbolRefExpr::VK_None; break;
+  case MipsII::MO_GPREL:     Kind = MCSymbolRefExpr::VK_Mips_GPREL; break;
+  case MipsII::MO_GOT_CALL:  Kind = MCSymbolRefExpr::VK_Mips_GOT_CALL; break;
+  case MipsII::MO_GOT16:     Kind = MCSymbolRefExpr::VK_Mips_GOT16; break;
+  case MipsII::MO_GOT:       Kind = MCSymbolRefExpr::VK_Mips_GOT; break;
+  case MipsII::MO_ABS_HI:    Kind = MCSymbolRefExpr::VK_Mips_ABS_HI; break;
+  case MipsII::MO_ABS_LO:    Kind = MCSymbolRefExpr::VK_Mips_ABS_LO; break;
+  case MipsII::MO_TLSGD:     Kind = MCSymbolRefExpr::VK_Mips_TLSGD; break;
+  case MipsII::MO_TLSLDM:    Kind = MCSymbolRefExpr::VK_Mips_TLSLDM; break;
+  case MipsII::MO_DTPREL_HI: Kind = MCSymbolRefExpr::VK_Mips_DTPREL_HI; break;
+  case MipsII::MO_DTPREL_LO: Kind = MCSymbolRefExpr::VK_Mips_DTPREL_LO; break;
+  case MipsII::MO_GOTTPREL:  Kind = MCSymbolRefExpr::VK_Mips_GOTTPREL; break;
+  case MipsII::MO_TPREL_HI:  Kind = MCSymbolRefExpr::VK_Mips_TPREL_HI; break;
+  case MipsII::MO_TPREL_LO:  Kind = MCSymbolRefExpr::VK_Mips_TPREL_LO; break;
+  case MipsII::MO_GPOFF_HI:  Kind = MCSymbolRefExpr::VK_Mips_GPOFF_HI; break;
+  case MipsII::MO_GPOFF_LO:  Kind = MCSymbolRefExpr::VK_Mips_GPOFF_LO; break;
+  case MipsII::MO_GOT_DISP:  Kind = MCSymbolRefExpr::VK_Mips_GOT_DISP; break;
+  case MipsII::MO_GOT_PAGE:  Kind = MCSymbolRefExpr::VK_Mips_GOT_PAGE; break;
+  case MipsII::MO_GOT_OFST:  Kind = MCSymbolRefExpr::VK_Mips_GOT_OFST; break;
+  case MipsII::MO_HIGHER:    Kind = MCSymbolRefExpr::VK_Mips_HIGHER; break;
+  case MipsII::MO_HIGHEST:   Kind = MCSymbolRefExpr::VK_Mips_HIGHEST; break;
+  case MipsII::MO_GOT_HI16:  Kind = MCSymbolRefExpr::VK_Mips_GOT_HI16; break;
+  case MipsII::MO_GOT_LO16:  Kind = MCSymbolRefExpr::VK_Mips_GOT_LO16; break;
+  case MipsII::MO_CALL_HI16: Kind = MCSymbolRefExpr::VK_Mips_CALL_HI16; break;
+  case MipsII::MO_CALL_LO16: Kind = MCSymbolRefExpr::VK_Mips_CALL_LO16; break;
+  }
+
+  switch (MOTy) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    Offset += MO.getOffset();
+    break;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx);
+
+  if (!Offset)
+    return MCOperand::CreateExpr(MCSym);
+
+  // Assume offset is never negative.
+  assert(Offset > 0);
+
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::Create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+  return MCOperand::CreateExpr(Add);
+}
+
+/*
+static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand &Opnd0,
+                         const MCOperand &Opnd1,
+                         const MCOperand &Opnd2 = MCOperand()) {
+  Inst.setOpcode(Opc);
+  Inst.addOperand(Opnd0);
+  Inst.addOperand(Opnd1);
+  if (Opnd2.isValid())
+    Inst.addOperand(Opnd2);
+}
+*/
+
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
+                                        unsigned offset) const {
+  MachineOperandType MOTy = MO.getType();
+
+  switch (MOTy) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit()) break;
+    return MCOperand::CreateReg(MO.getReg());
+  case MachineOperand::MO_Immediate:
+    return MCOperand::CreateImm(MO.getImm() + offset);
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return LowerSymbolOperand(MO, MOTy, offset);
+  case MachineOperand::MO_RegisterMask:
+    break;
+ }
+
+  return MCOperand();
+}
+
+MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
+                                     MachineBasicBlock *BB2,
+                                     MCSymbolRefExpr::VariantKind Kind) const {
+  const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::Create(BB1->getSymbol(), *Ctx);
+  const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::Create(BB2->getSymbol(), *Ctx);
+  const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Sym1, Sym2, *Ctx);
+
+  return MCOperand::CreateExpr(MipsMCExpr::Create(Kind, Sub, *Ctx));
+}
+
+void MipsMCInstLower::
+lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(Mips::LUi);
+
+  // Lower register operand.
+  OutMI.addOperand(LowerOperand(MI->getOperand(0)));
+
+  // Create %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
+                             MI->getOperand(2).getMBB(),
+                             MCSymbolRefExpr::VK_Mips_ABS_HI));
+}
+
+void MipsMCInstLower::
+lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
+                     MCSymbolRefExpr::VariantKind Kind) const {
+  OutMI.setOpcode(Opcode);
+
+  // Lower two register operands.
+  for (unsigned I = 0, E = 2; I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    OutMI.addOperand(LowerOperand(MO));
+  }
+
+  // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
+                             MI->getOperand(3).getMBB(), Kind));
+}
+
+bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
+                                      MCInst &OutMI) const {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case Mips::LONG_BRANCH_LUi:
+    lowerLongBranchLUi(MI, OutMI);
+    return true;
+  case Mips::LONG_BRANCH_ADDiu:
+    lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu,
+                         MCSymbolRefExpr::VK_Mips_ABS_LO);
+    return true;
+  case Mips::LONG_BRANCH_DADDiu:
+    unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
+    if (TargetFlags == MipsII::MO_ABS_HI)
+      lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
+                           MCSymbolRefExpr::VK_Mips_ABS_HI);
+    else if (TargetFlags == MipsII::MO_ABS_LO)
+      lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
+                           MCSymbolRefExpr::VK_Mips_ABS_LO);
+    else
+      report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
+    return true;
+  }
+}
+
+void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  if (lowerLongBranch(MI, OutMI))
+    return;
+
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MO);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
new file mode 100644
index 0000000..269190f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -0,0 +1,50 @@
+//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCINSTLOWER_H
+#define MIPSMCINSTLOWER_H
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MachineInstr;
+  class MachineFunction;
+  class MipsAsmPrinter;
+
+/// MipsMCInstLower - This class is used to lower an MachineInstr into an
+//                    MCInst.
+class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
+  typedef MachineOperand::MachineOperandType MachineOperandType;
+  MCContext *Ctx;
+  MipsAsmPrinter &AsmPrinter;
+public:
+  MipsMCInstLower(MipsAsmPrinter &asmprinter);
+  void Initialize(MCContext *C);
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+  MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+  MCOperand LowerSymbolOperand(const MachineOperand &MO,
+                               MachineOperandType MOTy, unsigned Offset) const;
+  MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                      MCSymbolRefExpr::VariantKind Kind) const;
+  void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
+  void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
+                            int Opcode,
+                            MCSymbolRefExpr::VariantKind Kind) const;
+  bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
new file mode 100644
index 0000000..bff2d0f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -0,0 +1,465 @@
+//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasMSA : Predicate<"Subtarget->hasMSA()">,
+             AssemblerPredicate<"FeatureMSA">;
+
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+  let Predicates = [HasMSA];
+  let Inst{31-26} = 0b011110;
+}
+
+class MSA64Inst : MSAInst {
+  let Predicates = [HasMSA, HasMips64];
+}
+
+class MSACBranch : MSAInst {
+  let Inst{31-26} = 0b010001;
+}
+
+class MSASpecial : MSAInst {
+  let Inst{31-26} = 0b000000;
+}
+
+class MSA64Special : MSA64Inst {
+  let Inst{31-26} = 0b000000;
+}
+
+class MSAPseudo<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasMSA];
+}
+
+class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<3> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-19} = 0b1110;
+  let Inst{18-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_H_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<4> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-20} = 0b110;
+  let Inst{19-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_W_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<5> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = 0b10;
+  let Inst{20-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_D_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<6> m;
+
+  let Inst{25-23} = major;
+  let Inst{22} = 0b0;
+  let Inst{21-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSA64Inst {
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2RF_FMT<bits<9> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-17} = major;
+  let Inst{16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3RF_FMT<bits<4> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_INDEX_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CFCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rd;
+  bits<5> cs;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = cs;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CTCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> cd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = cd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> imm;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = imm;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I8_FMT<bits<2> major, bits<6> minor>: MSAInst {
+  bits<8> u8;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-24} = major;
+  let Inst{23-16} = u8;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I10_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<10> s10;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-11} = s10;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_MI10_FMT<bits<2> df, bits<4> minor>: MSAInst {
+  bits<21> addr;
+  bits<5> wd;
+
+  let Inst{25-16} = addr{9-0};
+  let Inst{15-11} = addr{20-16};
+  let Inst{10-6} = wd;
+  let Inst{5-2} = minor;
+  let Inst{1-0} = df;
+}
+
+class MSA_VEC_FMT<bits<5> major, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_CBRANCH_FMT<bits<3> major, bits<2> df>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class MSA_CBRANCH_V_FMT<bits<5> major>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<2> sa;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8} = 0b000;
+  let Inst{7-6} = sa;
+  let Inst{5-0} = minor;
+}
+
+class SPECIAL_DLSA_FMT<bits<6> minor>: MSA64Special {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<2> sa;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8} = 0b000;
+  let Inst{7-6} = sa;
+  let Inst{5-0} = minor;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
new file mode 100644
index 0000000..285bb14
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -0,0 +1,3807 @@
+//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips MSA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDT_VSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                      SDTCisInt<1>,
+                                      SDTCisSameAs<1, 2>,
+                                      SDTCisVT<3, OtherVT>]>;
+def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                       SDTCisFP<1>,
+                                       SDTCisSameAs<1, 2>,
+                                       SDTCisVT<3, OtherVT>]>;
+def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>,
+                                    SDTCisInt<1>, SDTCisVec<1>,
+                                    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
+def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_INSVE : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                     SDTCisVT<2, i32>, SDTCisSameAs<0, 3>,
+                                     SDTCisVT<4, i32>]>;
+
+def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
+def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
+def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
+def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
+def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
+                      [SDNPCommutative, SDNPAssociative]>;
+def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
+def MipsILVEV : SDNode<"MipsISD::ILVEV", SDT_ILV>;
+def MipsILVOD : SDNode<"MipsISD::ILVOD", SDT_ILV>;
+def MipsILVL  : SDNode<"MipsISD::ILVL",  SDT_ILV>;
+def MipsILVR  : SDNode<"MipsISD::ILVR",  SDT_ILV>;
+def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
+def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+def MipsINSVE : SDNode<"MipsISD::INSVE", SDT_INSVE>;
+
+def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
+def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
+
+def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+
+// Operands
+
+// The immediate of an LSA instruction needs special handling
+// as the encoded value should be subtracted by one.
+def uimm2LSAAsmOperand : AsmOperandClass {
+  let Name = "LSAImm";
+  let ParserMethod = "ParseLSAImm";
+  let RenderMethod = "addImmOperands";
+}
+
+def LSAImm : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+  let EncoderMethod = "getLSAImmEncoding";
+  let DecoderMethod = "DecodeLSAImm";
+  let ParserMatchClass = uimm2LSAAsmOperand;
+}
+
+def uimm4 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm8 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def simm5 : Operand<i32>;
+
+def vsplat_uimm1 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm2 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm3 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm4 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm5 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm6 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm8 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_simm5 : Operand<vAny>;
+
+def vsplat_simm10 : Operand<vAny>;
+
+def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
+
+// Pattern fragments
+def vextract_sext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i8)>;
+def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i16)>;
+def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i32)>;
+def vextract_sext_i64 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i64)>;
+
+def vextract_zext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i8)>;
+def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i16)>;
+def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i32)>;
+def vextract_zext_i64 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i64)>;
+
+def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v2i64 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v2i64 (vector_insert node:$vec, node:$val, node:$idx))>;
+
+def insve_v16i8 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v16i8 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v8i16 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v8i16 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v4i32 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v4i32 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v2i64 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v2i64 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+
+class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vfsetcc (OpTy node:$lhs), (OpTy node:$rhs), CC))>;
+
+// ISD::SETFALSE cannot occur
+def vfsetoeq_v4f32 : vfsetcc_type<v4i32, v4f32, SETOEQ>;
+def vfsetoeq_v2f64 : vfsetcc_type<v2i64, v2f64, SETOEQ>;
+def vfsetoge_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGE>;
+def vfsetoge_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGE>;
+def vfsetogt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGT>;
+def vfsetogt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGT>;
+def vfsetole_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLE>;
+def vfsetole_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLE>;
+def vfsetolt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLT>;
+def vfsetolt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLT>;
+def vfsetone_v4f32 : vfsetcc_type<v4i32, v4f32, SETONE>;
+def vfsetone_v2f64 : vfsetcc_type<v2i64, v2f64, SETONE>;
+def vfsetord_v4f32 : vfsetcc_type<v4i32, v4f32, SETO>;
+def vfsetord_v2f64 : vfsetcc_type<v2i64, v2f64, SETO>;
+def vfsetun_v4f32  : vfsetcc_type<v4i32, v4f32, SETUO>;
+def vfsetun_v2f64  : vfsetcc_type<v2i64, v2f64, SETUO>;
+def vfsetueq_v4f32 : vfsetcc_type<v4i32, v4f32, SETUEQ>;
+def vfsetueq_v2f64 : vfsetcc_type<v2i64, v2f64, SETUEQ>;
+def vfsetuge_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGE>;
+def vfsetuge_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGE>;
+def vfsetugt_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGT>;
+def vfsetugt_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGT>;
+def vfsetule_v4f32 : vfsetcc_type<v4i32, v4f32, SETULE>;
+def vfsetule_v2f64 : vfsetcc_type<v2i64, v2f64, SETULE>;
+def vfsetult_v4f32 : vfsetcc_type<v4i32, v4f32, SETULT>;
+def vfsetult_v2f64 : vfsetcc_type<v2i64, v2f64, SETULT>;
+def vfsetune_v4f32 : vfsetcc_type<v4i32, v4f32, SETUNE>;
+def vfsetune_v2f64 : vfsetcc_type<v2i64, v2f64, SETUNE>;
+// ISD::SETTRUE cannot occur
+// ISD::SETFALSE2 cannot occur
+// ISD::SETTRUE2 cannot occur
+
+class vsetcc_type<ValueType ResTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vsetcc node:$lhs, node:$rhs, CC))>;
+
+def vseteq_v16i8  : vsetcc_type<v16i8, SETEQ>;
+def vseteq_v8i16  : vsetcc_type<v8i16, SETEQ>;
+def vseteq_v4i32  : vsetcc_type<v4i32, SETEQ>;
+def vseteq_v2i64  : vsetcc_type<v2i64, SETEQ>;
+def vsetle_v16i8  : vsetcc_type<v16i8, SETLE>;
+def vsetle_v8i16  : vsetcc_type<v8i16, SETLE>;
+def vsetle_v4i32  : vsetcc_type<v4i32, SETLE>;
+def vsetle_v2i64  : vsetcc_type<v2i64, SETLE>;
+def vsetlt_v16i8  : vsetcc_type<v16i8, SETLT>;
+def vsetlt_v8i16  : vsetcc_type<v8i16, SETLT>;
+def vsetlt_v4i32  : vsetcc_type<v4i32, SETLT>;
+def vsetlt_v2i64  : vsetcc_type<v2i64, SETLT>;
+def vsetule_v16i8 : vsetcc_type<v16i8, SETULE>;
+def vsetule_v8i16 : vsetcc_type<v8i16, SETULE>;
+def vsetule_v4i32 : vsetcc_type<v4i32, SETULE>;
+def vsetule_v2i64 : vsetcc_type<v2i64, SETULE>;
+def vsetult_v16i8 : vsetcc_type<v16i8, SETULT>;
+def vsetult_v8i16 : vsetcc_type<v8i16, SETULT>;
+def vsetult_v4i32 : vsetcc_type<v4i32, SETULT>;
+def vsetult_v2i64 : vsetcc_type<v2i64, SETULT>;
+
+def vsplati8  : PatFrag<(ops node:$e0),
+                        (v16i8 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati16 : PatFrag<(ops node:$e0),
+                        (v8i16 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati32 : PatFrag<(ops node:$e0),
+                        (v4i32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati64 : PatFrag<(ops node:$e0),
+                        (v2i64 (build_vector node:$e0, node:$e0))>;
+def vsplatf32 : PatFrag<(ops node:$e0),
+                        (v4f32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplatf64 : PatFrag<(ops node:$e0),
+                        (v2f64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati8_elt  : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati8 node:$i), node:$v, node:$v)>;
+def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati16 node:$i), node:$v, node:$v)>;
+def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
+def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+
+class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
+                   SDNodeXForm xform = NOOP_SDNodeXForm>
+  : PatLeaf<frag, pred, xform> {
+  Operand OpClass = opclass;
+}
+
+class SplatComplexPattern<Operand opclass, ValueType ty, int numops, string fn,
+                          list<SDNode> roots = [],
+                          list<SDNodeProperty> props = []> :
+  ComplexPattern<ty, numops, fn, roots, props> {
+  Operand OpClass = opclass;
+}
+
+def vsplati8_uimm3 : SplatComplexPattern<vsplat_uimm3, v16i8, 1,
+                                         "selectVSplatUimm3",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm4 : SplatComplexPattern<vsplat_uimm4, v16i8, 1,
+                                         "selectVSplatUimm4",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm5 : SplatComplexPattern<vsplat_uimm5, v16i8, 1,
+                                         "selectVSplatUimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm8 : SplatComplexPattern<vsplat_uimm8, v16i8, 1,
+                                         "selectVSplatUimm8",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_simm5 : SplatComplexPattern<vsplat_simm5, v16i8, 1,
+                                         "selectVSplatSimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati16_uimm3 : SplatComplexPattern<vsplat_uimm3, v8i16, 1,
+                                          "selectVSplatUimm3",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm4 : SplatComplexPattern<vsplat_uimm4, v8i16, 1,
+                                          "selectVSplatUimm4",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm5 : SplatComplexPattern<vsplat_uimm5, v8i16, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_simm5 : SplatComplexPattern<vsplat_simm5, v8i16, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm2 : SplatComplexPattern<vsplat_uimm2, v4i32, 1,
+                                          "selectVSplatUimm2",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm5 : SplatComplexPattern<vsplat_uimm5, v4i32, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_simm5 : SplatComplexPattern<vsplat_simm5, v4i32, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm1 : SplatComplexPattern<vsplat_uimm1, v2i64, 1,
+                                          "selectVSplatUimm1",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm5 : SplatComplexPattern<vsplat_uimm5, v2i64, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm6 : SplatComplexPattern<vsplat_uimm6, v2i64, 1,
+                                          "selectVSplatUimm6",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_simm5 : SplatComplexPattern<vsplat_simm5, v2i64, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is an exact
+// power of 2
+def vsplat_uimm_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmPow2",
+                                      [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is the bitwise
+// inverse of an exact power of 2
+def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of left-most bits set.
+def vsplat_maskl_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+                                            "selectVSplatMaskL",
+                                            [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of right-most bits set.
+def vsplat_maskr_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+                                            "selectVSplatMaskR",
+                                            [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that equals 1
+// FIXME: These should be a ComplexPattern but we can't use them because the
+//        ISel generator requires the uses to have a name, but providing a name
+//        causes other errors ("used in pattern but not operand list")
+def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat (N, Imm) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+  APInt Imm;
+  SDNode *BV = N->getOperand(0).getNode();
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat (BV, Imm) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_d : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+                                               node:$wt),
+                                          (bitconvert (v4i32 immAllOnesV))))>;
+
+def vbneg_b : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_h : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_w : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_d : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                          node:$wt))>;
+
+def vbset_b : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_h : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_w : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_d : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                         node:$wt))>;
+
+def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                  (fsub node:$wd, (fmul node:$ws, node:$wt))>;
+
+def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (add node:$wd, (mul node:$ws, node:$wt))>;
+
+def mulsub : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (sub node:$wd, (mul node:$ws, node:$wt))>;
+
+def mul_fexp2 : PatFrag<(ops node:$ws, node:$wt),
+                        (fmul node:$ws, (fexp2 node:$wt))>;
+
+// Immediates
+def immSExt5 : ImmLeaf<i32, [{return isInt<5>(Imm);}]>;
+def immSExt10: ImmLeaf<i32, [{return isInt<10>(Imm);}]>;
+
+// Instruction encoding.
+class ADD_A_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010000>;
+class ADD_A_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010000>;
+class ADD_A_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010000>;
+class ADD_A_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010000>;
+
+class ADDS_A_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010000>;
+class ADDS_A_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010000>;
+class ADDS_A_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010000>;
+class ADDS_A_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010000>;
+
+class ADDS_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010000>;
+class ADDS_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010000>;
+class ADDS_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010000>;
+class ADDS_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010000>;
+
+class ADDS_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010000>;
+class ADDS_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010000>;
+class ADDS_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010000>;
+class ADDS_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010000>;
+
+class ADDV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001110>;
+class ADDV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001110>;
+class ADDV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001110>;
+class ADDV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001110>;
+
+class ADDVI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000110>;
+class ADDVI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000110>;
+class ADDVI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000110>;
+class ADDVI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000110>;
+
+class AND_V_ENC : MSA_VEC_FMT<0b00000, 0b011110>;
+
+class ANDI_B_ENC : MSA_I8_FMT<0b00, 0b000000>;
+
+class ASUB_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010001>;
+class ASUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010001>;
+class ASUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010001>;
+class ASUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010001>;
+
+class ASUB_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010001>;
+class ASUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010001>;
+class ASUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010001>;
+class ASUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010001>;
+
+class AVE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010000>;
+class AVE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010000>;
+class AVE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010000>;
+class AVE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010000>;
+
+class AVE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010000>;
+class AVE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010000>;
+class AVE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010000>;
+class AVE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010000>;
+
+class AVER_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010000>;
+class AVER_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010000>;
+class AVER_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010000>;
+class AVER_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010000>;
+
+class AVER_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010000>;
+class AVER_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010000>;
+class AVER_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010000>;
+class AVER_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010000>;
+
+class BCLR_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001101>;
+class BCLR_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001101>;
+class BCLR_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001101>;
+class BCLR_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001101>;
+
+class BCLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001001>;
+class BCLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001001>;
+class BCLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001001>;
+class BCLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001001>;
+
+class BINSL_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001101>;
+class BINSL_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001101>;
+class BINSL_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001101>;
+class BINSL_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001101>;
+
+class BINSLI_B_ENC : MSA_BIT_B_FMT<0b110, 0b001001>;
+class BINSLI_H_ENC : MSA_BIT_H_FMT<0b110, 0b001001>;
+class BINSLI_W_ENC : MSA_BIT_W_FMT<0b110, 0b001001>;
+class BINSLI_D_ENC : MSA_BIT_D_FMT<0b110, 0b001001>;
+
+class BINSR_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001101>;
+class BINSR_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001101>;
+class BINSR_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001101>;
+class BINSR_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001101>;
+
+class BINSRI_B_ENC : MSA_BIT_B_FMT<0b111, 0b001001>;
+class BINSRI_H_ENC : MSA_BIT_H_FMT<0b111, 0b001001>;
+class BINSRI_W_ENC : MSA_BIT_W_FMT<0b111, 0b001001>;
+class BINSRI_D_ENC : MSA_BIT_D_FMT<0b111, 0b001001>;
+
+class BMNZ_V_ENC : MSA_VEC_FMT<0b00100, 0b011110>;
+
+class BMNZI_B_ENC : MSA_I8_FMT<0b00, 0b000001>;
+
+class BMZ_V_ENC : MSA_VEC_FMT<0b00101, 0b011110>;
+
+class BMZI_B_ENC : MSA_I8_FMT<0b01, 0b000001>;
+
+class BNEG_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001101>;
+class BNEG_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001101>;
+class BNEG_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001101>;
+class BNEG_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001101>;
+
+class BNEGI_B_ENC : MSA_BIT_B_FMT<0b101, 0b001001>;
+class BNEGI_H_ENC : MSA_BIT_H_FMT<0b101, 0b001001>;
+class BNEGI_W_ENC : MSA_BIT_W_FMT<0b101, 0b001001>;
+class BNEGI_D_ENC : MSA_BIT_D_FMT<0b101, 0b001001>;
+
+class BNZ_B_ENC : MSA_CBRANCH_FMT<0b111, 0b00>;
+class BNZ_H_ENC : MSA_CBRANCH_FMT<0b111, 0b01>;
+class BNZ_W_ENC : MSA_CBRANCH_FMT<0b111, 0b10>;
+class BNZ_D_ENC : MSA_CBRANCH_FMT<0b111, 0b11>;
+
+class BNZ_V_ENC : MSA_CBRANCH_V_FMT<0b01111>;
+
+class BSEL_V_ENC : MSA_VEC_FMT<0b00110, 0b011110>;
+
+class BSELI_B_ENC : MSA_I8_FMT<0b10, 0b000001>;
+
+class BSET_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001101>;
+class BSET_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001101>;
+class BSET_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001101>;
+class BSET_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001101>;
+
+class BSETI_B_ENC : MSA_BIT_B_FMT<0b100, 0b001001>;
+class BSETI_H_ENC : MSA_BIT_H_FMT<0b100, 0b001001>;
+class BSETI_W_ENC : MSA_BIT_W_FMT<0b100, 0b001001>;
+class BSETI_D_ENC : MSA_BIT_D_FMT<0b100, 0b001001>;
+
+class BZ_B_ENC : MSA_CBRANCH_FMT<0b110, 0b00>;
+class BZ_H_ENC : MSA_CBRANCH_FMT<0b110, 0b01>;
+class BZ_W_ENC : MSA_CBRANCH_FMT<0b110, 0b10>;
+class BZ_D_ENC : MSA_CBRANCH_FMT<0b110, 0b11>;
+
+class BZ_V_ENC : MSA_CBRANCH_V_FMT<0b01011>;
+
+class CEQ_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001111>;
+class CEQ_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001111>;
+class CEQ_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001111>;
+class CEQ_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001111>;
+
+class CEQI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000111>;
+class CEQI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000111>;
+class CEQI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000111>;
+class CEQI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000111>;
+
+class CFCMSA_ENC : MSA_ELM_CFCMSA_FMT<0b0001111110, 0b011001>;
+
+class CLE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001111>;
+class CLE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001111>;
+class CLE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001111>;
+class CLE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001111>;
+
+class CLE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001111>;
+class CLE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001111>;
+class CLE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001111>;
+class CLE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001111>;
+
+class CLEI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000111>;
+class CLEI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000111>;
+class CLEI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000111>;
+class CLEI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000111>;
+
+class CLEI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000111>;
+class CLEI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000111>;
+class CLEI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000111>;
+class CLEI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000111>;
+
+class CLT_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001111>;
+class CLT_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001111>;
+class CLT_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001111>;
+class CLT_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001111>;
+
+class CLT_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001111>;
+class CLT_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001111>;
+class CLT_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001111>;
+class CLT_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001111>;
+
+class CLTI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000111>;
+class CLTI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000111>;
+class CLTI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000111>;
+class CLTI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000111>;
+
+class CLTI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000111>;
+class CLTI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000111>;
+class CLTI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000111>;
+class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
+
+class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>;
+class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
+class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>;
+
+class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
+class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
+class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+class COPY_U_D_ENC : MSA_ELM_COPY_D_FMT<0b0011, 0b011001>;
+
+class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
+
+class DIV_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010010>;
+class DIV_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010010>;
+class DIV_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010010>;
+class DIV_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010010>;
+
+class DIV_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010010>;
+class DIV_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010010>;
+class DIV_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010010>;
+class DIV_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010010>;
+
+class DOTP_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010011>;
+class DOTP_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010011>;
+class DOTP_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010011>;
+
+class DOTP_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010011>;
+class DOTP_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010011>;
+class DOTP_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010011>;
+
+class DPADD_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010011>;
+class DPADD_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010011>;
+class DPADD_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010011>;
+
+class DPADD_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010011>;
+class DPADD_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010011>;
+class DPADD_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010011>;
+
+class DPSUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010011>;
+class DPSUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010011>;
+class DPSUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010011>;
+
+class DPSUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010011>;
+class DPSUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010011>;
+class DPSUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010011>;
+
+class FADD_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011011>;
+class FADD_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011011>;
+
+class FCAF_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011010>;
+class FCAF_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011010>;
+
+class FCEQ_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011010>;
+class FCEQ_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011010>;
+
+class FCLASS_W_ENC : MSA_2RF_FMT<0b110010000, 0b0, 0b011110>;
+class FCLASS_D_ENC : MSA_2RF_FMT<0b110010000, 0b1, 0b011110>;
+
+class FCLE_W_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011010>;
+class FCLE_D_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011010>;
+
+class FCLT_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011010>;
+class FCLT_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011010>;
+
+class FCNE_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011100>;
+class FCNE_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011100>;
+
+class FCOR_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011100>;
+class FCOR_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011100>;
+
+class FCUEQ_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011010>;
+class FCUEQ_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011010>;
+
+class FCULE_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011010>;
+class FCULE_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011010>;
+
+class FCULT_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011010>;
+class FCULT_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011010>;
+
+class FCUN_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011010>;
+class FCUN_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011010>;
+
+class FCUNE_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011100>;
+class FCUNE_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011100>;
+
+class FDIV_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011011>;
+class FDIV_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011011>;
+
+class FEXDO_H_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011011>;
+class FEXDO_W_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011011>;
+
+class FEXP2_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011011>;
+class FEXP2_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011011>;
+
+class FEXUPL_W_ENC : MSA_2RF_FMT<0b110011000, 0b0, 0b011110>;
+class FEXUPL_D_ENC : MSA_2RF_FMT<0b110011000, 0b1, 0b011110>;
+
+class FEXUPR_W_ENC : MSA_2RF_FMT<0b110011001, 0b0, 0b011110>;
+class FEXUPR_D_ENC : MSA_2RF_FMT<0b110011001, 0b1, 0b011110>;
+
+class FFINT_S_W_ENC : MSA_2RF_FMT<0b110011110, 0b0, 0b011110>;
+class FFINT_S_D_ENC : MSA_2RF_FMT<0b110011110, 0b1, 0b011110>;
+
+class FFINT_U_W_ENC : MSA_2RF_FMT<0b110011111, 0b0, 0b011110>;
+class FFINT_U_D_ENC : MSA_2RF_FMT<0b110011111, 0b1, 0b011110>;
+
+class FFQL_W_ENC : MSA_2RF_FMT<0b110011010, 0b0, 0b011110>;
+class FFQL_D_ENC : MSA_2RF_FMT<0b110011010, 0b1, 0b011110>;
+
+class FFQR_W_ENC : MSA_2RF_FMT<0b110011011, 0b0, 0b011110>;
+class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
+
+class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
+class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
+class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+class FILL_D_ENC : MSA_2R_FILL_D_FMT<0b11000000, 0b11, 0b011110>;
+
+class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
+class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
+
+class FMADD_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011011>;
+class FMADD_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011011>;
+
+class FMAX_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011011>;
+class FMAX_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011011>;
+
+class FMAX_A_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011011>;
+class FMAX_A_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011011>;
+
+class FMIN_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011011>;
+class FMIN_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011011>;
+
+class FMIN_A_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011011>;
+class FMIN_A_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011011>;
+
+class FMSUB_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011011>;
+class FMSUB_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011011>;
+
+class FMUL_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011011>;
+class FMUL_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011011>;
+
+class FRINT_W_ENC : MSA_2RF_FMT<0b110010110, 0b0, 0b011110>;
+class FRINT_D_ENC : MSA_2RF_FMT<0b110010110, 0b1, 0b011110>;
+
+class FRCP_W_ENC : MSA_2RF_FMT<0b110010101, 0b0, 0b011110>;
+class FRCP_D_ENC : MSA_2RF_FMT<0b110010101, 0b1, 0b011110>;
+
+class FRSQRT_W_ENC : MSA_2RF_FMT<0b110010100, 0b0, 0b011110>;
+class FRSQRT_D_ENC : MSA_2RF_FMT<0b110010100, 0b1, 0b011110>;
+
+class FSAF_W_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011010>;
+class FSAF_D_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011010>;
+
+class FSEQ_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011010>;
+class FSEQ_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011010>;
+
+class FSLE_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011010>;
+class FSLE_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011010>;
+
+class FSLT_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011010>;
+class FSLT_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011010>;
+
+class FSNE_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011100>;
+class FSNE_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011100>;
+
+class FSOR_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011100>;
+class FSOR_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011100>;
+
+class FSQRT_W_ENC : MSA_2RF_FMT<0b110010011, 0b0, 0b011110>;
+class FSQRT_D_ENC : MSA_2RF_FMT<0b110010011, 0b1, 0b011110>;
+
+class FSUB_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011011>;
+class FSUB_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011011>;
+
+class FSUEQ_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011010>;
+class FSUEQ_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011010>;
+
+class FSULE_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011010>;
+class FSULE_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011010>;
+
+class FSULT_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011010>;
+class FSULT_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011010>;
+
+class FSUN_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011010>;
+class FSUN_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011010>;
+
+class FSUNE_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011100>;
+class FSUNE_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011100>;
+
+class FTINT_S_W_ENC : MSA_2RF_FMT<0b110011100, 0b0, 0b011110>;
+class FTINT_S_D_ENC : MSA_2RF_FMT<0b110011100, 0b1, 0b011110>;
+
+class FTINT_U_W_ENC : MSA_2RF_FMT<0b110011101, 0b0, 0b011110>;
+class FTINT_U_D_ENC : MSA_2RF_FMT<0b110011101, 0b1, 0b011110>;
+
+class FTQ_H_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011011>;
+class FTQ_W_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011011>;
+
+class FTRUNC_S_W_ENC : MSA_2RF_FMT<0b110010001, 0b0, 0b011110>;
+class FTRUNC_S_D_ENC : MSA_2RF_FMT<0b110010001, 0b1, 0b011110>;
+
+class FTRUNC_U_W_ENC : MSA_2RF_FMT<0b110010010, 0b0, 0b011110>;
+class FTRUNC_U_D_ENC : MSA_2RF_FMT<0b110010010, 0b1, 0b011110>;
+
+class HADD_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010101>;
+class HADD_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010101>;
+class HADD_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010101>;
+
+class HADD_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010101>;
+class HADD_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010101>;
+class HADD_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010101>;
+
+class HSUB_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010101>;
+class HSUB_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010101>;
+class HSUB_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010101>;
+
+class HSUB_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010101>;
+class HSUB_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010101>;
+class HSUB_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010101>;
+
+class ILVEV_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010100>;
+class ILVEV_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010100>;
+class ILVEV_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010100>;
+class ILVEV_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010100>;
+
+class ILVL_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010100>;
+class ILVL_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010100>;
+class ILVL_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010100>;
+class ILVL_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010100>;
+
+class ILVOD_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010100>;
+class ILVOD_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010100>;
+class ILVOD_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010100>;
+class ILVOD_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010100>;
+
+class ILVR_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010100>;
+class ILVR_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010100>;
+class ILVR_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010100>;
+class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
+
+class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
+class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
+class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+class INSERT_D_ENC : MSA_ELM_INSERT_D_FMT<0b0100, 0b011001>;
+
+class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
+class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
+class INSVE_W_ENC : MSA_ELM_W_FMT<0b0101, 0b011001>;
+class INSVE_D_ENC : MSA_ELM_D_FMT<0b0101, 0b011001>;
+
+class LD_B_ENC   : MSA_MI10_FMT<0b00, 0b1000>;
+class LD_H_ENC   : MSA_MI10_FMT<0b01, 0b1000>;
+class LD_W_ENC   : MSA_MI10_FMT<0b10, 0b1000>;
+class LD_D_ENC   : MSA_MI10_FMT<0b11, 0b1000>;
+
+class LDI_B_ENC  : MSA_I10_FMT<0b110, 0b00, 0b000111>;
+class LDI_H_ENC  : MSA_I10_FMT<0b110, 0b01, 0b000111>;
+class LDI_W_ENC  : MSA_I10_FMT<0b110, 0b10, 0b000111>;
+class LDI_D_ENC  : MSA_I10_FMT<0b110, 0b11, 0b000111>;
+
+class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+class DLSA_ENC : SPECIAL_DLSA_FMT<0b010101>;
+
+class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
+class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
+
+class MADDR_Q_H_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011100>;
+class MADDR_Q_W_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011100>;
+
+class MADDV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010010>;
+class MADDV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010010>;
+class MADDV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010010>;
+class MADDV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010010>;
+
+class MAX_A_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001110>;
+class MAX_A_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001110>;
+class MAX_A_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001110>;
+class MAX_A_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001110>;
+
+class MAX_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001110>;
+class MAX_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001110>;
+class MAX_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001110>;
+class MAX_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001110>;
+
+class MAX_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001110>;
+class MAX_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001110>;
+class MAX_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001110>;
+class MAX_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001110>;
+
+class MAXI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000110>;
+class MAXI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000110>;
+class MAXI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000110>;
+class MAXI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000110>;
+
+class MAXI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000110>;
+class MAXI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000110>;
+class MAXI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000110>;
+class MAXI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000110>;
+
+class MIN_A_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001110>;
+class MIN_A_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001110>;
+class MIN_A_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001110>;
+class MIN_A_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001110>;
+
+class MIN_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001110>;
+class MIN_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001110>;
+class MIN_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001110>;
+class MIN_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001110>;
+
+class MIN_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001110>;
+class MIN_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001110>;
+class MIN_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001110>;
+class MIN_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001110>;
+
+class MINI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000110>;
+class MINI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000110>;
+class MINI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000110>;
+class MINI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000110>;
+
+class MINI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000110>;
+class MINI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000110>;
+class MINI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000110>;
+class MINI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000110>;
+
+class MOD_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010010>;
+class MOD_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010010>;
+class MOD_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010010>;
+class MOD_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010010>;
+
+class MOD_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010010>;
+class MOD_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010010>;
+class MOD_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010010>;
+class MOD_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010010>;
+
+class MOVE_V_ENC : MSA_ELM_FMT<0b0010111110, 0b011001>;
+
+class MSUB_Q_H_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011100>;
+class MSUB_Q_W_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011100>;
+
+class MSUBR_Q_H_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011100>;
+class MSUBR_Q_W_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011100>;
+
+class MSUBV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010010>;
+class MSUBV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010010>;
+class MSUBV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010010>;
+class MSUBV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010010>;
+
+class MUL_Q_H_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011100>;
+class MUL_Q_W_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011100>;
+
+class MULR_Q_H_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011100>;
+class MULR_Q_W_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011100>;
+
+class MULV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010010>;
+class MULV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010010>;
+class MULV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010010>;
+class MULV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010010>;
+
+class NLOC_B_ENC : MSA_2R_FMT<0b11000010, 0b00, 0b011110>;
+class NLOC_H_ENC : MSA_2R_FMT<0b11000010, 0b01, 0b011110>;
+class NLOC_W_ENC : MSA_2R_FMT<0b11000010, 0b10, 0b011110>;
+class NLOC_D_ENC : MSA_2R_FMT<0b11000010, 0b11, 0b011110>;
+
+class NLZC_B_ENC : MSA_2R_FMT<0b11000011, 0b00, 0b011110>;
+class NLZC_H_ENC : MSA_2R_FMT<0b11000011, 0b01, 0b011110>;
+class NLZC_W_ENC : MSA_2R_FMT<0b11000011, 0b10, 0b011110>;
+class NLZC_D_ENC : MSA_2R_FMT<0b11000011, 0b11, 0b011110>;
+
+class NOR_V_ENC : MSA_VEC_FMT<0b00010, 0b011110>;
+
+class NORI_B_ENC : MSA_I8_FMT<0b10, 0b000000>;
+
+class OR_V_ENC : MSA_VEC_FMT<0b00001, 0b011110>;
+
+class ORI_B_ENC  : MSA_I8_FMT<0b01, 0b000000>;
+
+class PCKEV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010100>;
+class PCKEV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010100>;
+class PCKEV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010100>;
+class PCKEV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010100>;
+
+class PCKOD_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010100>;
+class PCKOD_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010100>;
+class PCKOD_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010100>;
+class PCKOD_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010100>;
+
+class PCNT_B_ENC : MSA_2R_FMT<0b11000001, 0b00, 0b011110>;
+class PCNT_H_ENC : MSA_2R_FMT<0b11000001, 0b01, 0b011110>;
+class PCNT_W_ENC : MSA_2R_FMT<0b11000001, 0b10, 0b011110>;
+class PCNT_D_ENC : MSA_2R_FMT<0b11000001, 0b11, 0b011110>;
+
+class SAT_S_B_ENC : MSA_BIT_B_FMT<0b000, 0b001010>;
+class SAT_S_H_ENC : MSA_BIT_H_FMT<0b000, 0b001010>;
+class SAT_S_W_ENC : MSA_BIT_W_FMT<0b000, 0b001010>;
+class SAT_S_D_ENC : MSA_BIT_D_FMT<0b000, 0b001010>;
+
+class SAT_U_B_ENC : MSA_BIT_B_FMT<0b001, 0b001010>;
+class SAT_U_H_ENC : MSA_BIT_H_FMT<0b001, 0b001010>;
+class SAT_U_W_ENC : MSA_BIT_W_FMT<0b001, 0b001010>;
+class SAT_U_D_ENC : MSA_BIT_D_FMT<0b001, 0b001010>;
+
+class SHF_B_ENC  : MSA_I8_FMT<0b00, 0b000010>;
+class SHF_H_ENC  : MSA_I8_FMT<0b01, 0b000010>;
+class SHF_W_ENC  : MSA_I8_FMT<0b10, 0b000010>;
+
+class SLD_B_ENC : MSA_3R_INDEX_FMT<0b000, 0b00, 0b010100>;
+class SLD_H_ENC : MSA_3R_INDEX_FMT<0b000, 0b01, 0b010100>;
+class SLD_W_ENC : MSA_3R_INDEX_FMT<0b000, 0b10, 0b010100>;
+class SLD_D_ENC : MSA_3R_INDEX_FMT<0b000, 0b11, 0b010100>;
+
+class SLDI_B_ENC : MSA_ELM_B_FMT<0b0000, 0b011001>;
+class SLDI_H_ENC : MSA_ELM_H_FMT<0b0000, 0b011001>;
+class SLDI_W_ENC : MSA_ELM_W_FMT<0b0000, 0b011001>;
+class SLDI_D_ENC : MSA_ELM_D_FMT<0b0000, 0b011001>;
+
+class SLL_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001101>;
+class SLL_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001101>;
+class SLL_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001101>;
+class SLL_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001101>;
+
+class SLLI_B_ENC : MSA_BIT_B_FMT<0b000, 0b001001>;
+class SLLI_H_ENC : MSA_BIT_H_FMT<0b000, 0b001001>;
+class SLLI_W_ENC : MSA_BIT_W_FMT<0b000, 0b001001>;
+class SLLI_D_ENC : MSA_BIT_D_FMT<0b000, 0b001001>;
+
+class SPLAT_B_ENC : MSA_3R_INDEX_FMT<0b001, 0b00, 0b010100>;
+class SPLAT_H_ENC : MSA_3R_INDEX_FMT<0b001, 0b01, 0b010100>;
+class SPLAT_W_ENC : MSA_3R_INDEX_FMT<0b001, 0b10, 0b010100>;
+class SPLAT_D_ENC : MSA_3R_INDEX_FMT<0b001, 0b11, 0b010100>;
+
+class SPLATI_B_ENC : MSA_ELM_B_FMT<0b0001, 0b011001>;
+class SPLATI_H_ENC : MSA_ELM_H_FMT<0b0001, 0b011001>;
+class SPLATI_W_ENC : MSA_ELM_W_FMT<0b0001, 0b011001>;
+class SPLATI_D_ENC : MSA_ELM_D_FMT<0b0001, 0b011001>;
+
+class SRA_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001101>;
+class SRA_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001101>;
+class SRA_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001101>;
+class SRA_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001101>;
+
+class SRAI_B_ENC : MSA_BIT_B_FMT<0b001, 0b001001>;
+class SRAI_H_ENC : MSA_BIT_H_FMT<0b001, 0b001001>;
+class SRAI_W_ENC : MSA_BIT_W_FMT<0b001, 0b001001>;
+class SRAI_D_ENC : MSA_BIT_D_FMT<0b001, 0b001001>;
+
+class SRAR_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010101>;
+class SRAR_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010101>;
+class SRAR_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010101>;
+class SRAR_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010101>;
+
+class SRARI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001010>;
+class SRARI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001010>;
+class SRARI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001010>;
+class SRARI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001010>;
+
+class SRL_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001101>;
+class SRL_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001101>;
+class SRL_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001101>;
+class SRL_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001101>;
+
+class SRLI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001001>;
+class SRLI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001001>;
+class SRLI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001001>;
+class SRLI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001001>;
+
+class SRLR_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010101>;
+class SRLR_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010101>;
+class SRLR_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010101>;
+class SRLR_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010101>;
+
+class SRLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001010>;
+class SRLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001010>;
+class SRLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001010>;
+class SRLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001010>;
+
+class ST_B_ENC   : MSA_MI10_FMT<0b00, 0b1001>;
+class ST_H_ENC   : MSA_MI10_FMT<0b01, 0b1001>;
+class ST_W_ENC   : MSA_MI10_FMT<0b10, 0b1001>;
+class ST_D_ENC   : MSA_MI10_FMT<0b11, 0b1001>;
+
+class SUBS_S_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010001>;
+class SUBS_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010001>;
+class SUBS_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010001>;
+class SUBS_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010001>;
+
+class SUBS_U_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010001>;
+class SUBS_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010001>;
+class SUBS_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010001>;
+class SUBS_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010001>;
+
+class SUBSUS_U_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010001>;
+class SUBSUS_U_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010001>;
+class SUBSUS_U_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010001>;
+class SUBSUS_U_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010001>;
+
+class SUBSUU_S_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010001>;
+class SUBSUU_S_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010001>;
+class SUBSUU_S_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010001>;
+class SUBSUU_S_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010001>;
+
+class SUBV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001110>;
+class SUBV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001110>;
+class SUBV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001110>;
+class SUBV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001110>;
+
+class SUBVI_B_ENC : MSA_I5_FMT<0b001, 0b00, 0b000110>;
+class SUBVI_H_ENC : MSA_I5_FMT<0b001, 0b01, 0b000110>;
+class SUBVI_W_ENC : MSA_I5_FMT<0b001, 0b10, 0b000110>;
+class SUBVI_D_ENC : MSA_I5_FMT<0b001, 0b11, 0b000110>;
+
+class VSHF_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010101>;
+class VSHF_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010101>;
+class VSHF_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010101>;
+class VSHF_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010101>;
+
+class XOR_V_ENC : MSA_VEC_FMT<0b00011, 0b011110>;
+
+class XORI_B_ENC : MSA_I8_FMT<0b11, 0b000000>;
+
+// Instruction desc.
+class MSA_BIT_B_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_H_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
+                               ComplexPattern Mask, RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, vsplat_uimm8:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  // Note that binsxi and vselect treat the condition operand the opposite
+  // way to each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$ws),
+                                               ROWS:$wd_in))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
+                               RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskl_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
+                               RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskr_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         ValueType VecTy, RegisterOperand ROD,
+                         RegisterOperand ROWS,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), immZExt4:$n))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, uimm4:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              immZExt4:$n))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+                           RegisterClass RCD, RegisterClass RCWS> :
+      MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
+                [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
+  bit usesCustomInserter = 1;
+}
+
+class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $imm");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$imm))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$u8))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                           RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins vsplat_simm10:$s10);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $s10");
+  // LDI is matched using custom matching code in MipsSEISelDAGToDAG.cpp
+  list<dag> Pattern = [];
+  bit hasSideEffects = 0;
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
+                            SDPatternOperator OpNode, RegisterOperand ROWD,
+                            RegisterOperand ROS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (VT (OpNode ROS:$rs)))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
+                              RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+      MSAPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+                [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+  let usesCustomInserter = 1;
+}
+
+class MSA_2RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       RegisterOperand ROWT = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_BINSX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             RegisterOperand ROWT = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, GPR32Opnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32Opnd:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_VSHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF ROWD:$wd_in, ROWS:$ws,
+                                                ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, GPR32Opnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              GPR32Opnd:$rt))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> :
+  MSA_3R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_3RF_4RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> :
+  MSA_3R_4R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = IIBranch;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+  list<Register> Defs = [AT];
+}
+
+class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROS,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, uimm6:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              ROS:$rs,
+                                              immZExt6:$n))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                             RegisterOperand ROWD, RegisterOperand ROFS> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6:$n, ROFS:$fs),
+                [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                        immZExt6:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                                  RegisterOperand ROWD, RegisterOperand ROFS> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, GPR32Opnd:$n, ROFS:$fs),
+                [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                        GPR32Opnd:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              immZExt6:$n,
+                                              ROWS:$ws,
+                                              immz:$n2))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_VEC_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SPLAT_DESC_BASE<string instr_asm, SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD,
+                              RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF SplatImm:$n, ROWS:$ws,
+                                                ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
+                [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+
+class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
+                     IsCommutable;
+class ADD_A_H_DESC : MSA_3R_DESC_BASE<"add_a.h", int_mips_add_a_h, MSA128HOpnd>,
+                     IsCommutable;
+class ADD_A_W_DESC : MSA_3R_DESC_BASE<"add_a.w", int_mips_add_a_w, MSA128WOpnd>,
+                     IsCommutable;
+class ADD_A_D_DESC : MSA_3R_DESC_BASE<"add_a.d", int_mips_add_a_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class ADDS_A_B_DESC : MSA_3R_DESC_BASE<"adds_a.b", int_mips_adds_a_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_A_H_DESC : MSA_3R_DESC_BASE<"adds_a.h", int_mips_adds_a_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_A_W_DESC : MSA_3R_DESC_BASE<"adds_a.w", int_mips_adds_a_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_A_D_DESC : MSA_3R_DESC_BASE<"adds_a.d", int_mips_adds_a_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_S_B_DESC : MSA_3R_DESC_BASE<"adds_s.b", int_mips_adds_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_S_H_DESC : MSA_3R_DESC_BASE<"adds_s.h", int_mips_adds_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_S_W_DESC : MSA_3R_DESC_BASE<"adds_s.w", int_mips_adds_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_S_D_DESC : MSA_3R_DESC_BASE<"adds_s.d", int_mips_adds_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_U_B_DESC : MSA_3R_DESC_BASE<"adds_u.b", int_mips_adds_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_U_H_DESC : MSA_3R_DESC_BASE<"adds_u.h", int_mips_adds_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_U_W_DESC : MSA_3R_DESC_BASE<"adds_u.w", int_mips_adds_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_U_D_DESC : MSA_3R_DESC_BASE<"adds_u.d", int_mips_adds_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDV_B_DESC : MSA_3R_DESC_BASE<"addv.b", add, MSA128BOpnd>, IsCommutable;
+class ADDV_H_DESC : MSA_3R_DESC_BASE<"addv.h", add, MSA128HOpnd>, IsCommutable;
+class ADDV_W_DESC : MSA_3R_DESC_BASE<"addv.w", add, MSA128WOpnd>, IsCommutable;
+class ADDV_D_DESC : MSA_3R_DESC_BASE<"addv.d", add, MSA128DOpnd>, IsCommutable;
+
+class ADDVI_B_DESC : MSA_I5_DESC_BASE<"addvi.b", add, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class ADDVI_H_DESC : MSA_I5_DESC_BASE<"addvi.h", add, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class ADDVI_W_DESC : MSA_I5_DESC_BASE<"addvi.w", add, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class ADDVI_D_DESC : MSA_I5_DESC_BASE<"addvi.d", add, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class AND_V_DESC : MSA_VEC_DESC_BASE<"and.v", and, MSA128BOpnd>;
+class AND_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128HOpnd>;
+class AND_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128WOpnd>;
+class AND_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128DOpnd>;
+
+class ANDI_B_DESC : MSA_I8_DESC_BASE<"andi.b", and, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class ASUB_S_B_DESC : MSA_3R_DESC_BASE<"asub_s.b", int_mips_asub_s_b,
+                                       MSA128BOpnd>;
+class ASUB_S_H_DESC : MSA_3R_DESC_BASE<"asub_s.h", int_mips_asub_s_h,
+                                       MSA128HOpnd>;
+class ASUB_S_W_DESC : MSA_3R_DESC_BASE<"asub_s.w", int_mips_asub_s_w,
+                                       MSA128WOpnd>;
+class ASUB_S_D_DESC : MSA_3R_DESC_BASE<"asub_s.d", int_mips_asub_s_d,
+                                       MSA128DOpnd>;
+
+class ASUB_U_B_DESC : MSA_3R_DESC_BASE<"asub_u.b", int_mips_asub_u_b,
+                                       MSA128BOpnd>;
+class ASUB_U_H_DESC : MSA_3R_DESC_BASE<"asub_u.h", int_mips_asub_u_h,
+                                       MSA128HOpnd>;
+class ASUB_U_W_DESC : MSA_3R_DESC_BASE<"asub_u.w", int_mips_asub_u_w,
+                                       MSA128WOpnd>;
+class ASUB_U_D_DESC : MSA_3R_DESC_BASE<"asub_u.d", int_mips_asub_u_d,
+                                       MSA128DOpnd>;
+
+class AVE_S_B_DESC : MSA_3R_DESC_BASE<"ave_s.b", int_mips_ave_s_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_S_H_DESC : MSA_3R_DESC_BASE<"ave_s.h", int_mips_ave_s_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_S_W_DESC : MSA_3R_DESC_BASE<"ave_s.w", int_mips_ave_s_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_S_D_DESC : MSA_3R_DESC_BASE<"ave_s.d", int_mips_ave_s_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVE_U_B_DESC : MSA_3R_DESC_BASE<"ave_u.b", int_mips_ave_u_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_U_H_DESC : MSA_3R_DESC_BASE<"ave_u.h", int_mips_ave_u_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_U_W_DESC : MSA_3R_DESC_BASE<"ave_u.w", int_mips_ave_u_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_U_D_DESC : MSA_3R_DESC_BASE<"ave_u.d", int_mips_ave_u_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVER_S_B_DESC : MSA_3R_DESC_BASE<"aver_s.b", int_mips_aver_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_S_H_DESC : MSA_3R_DESC_BASE<"aver_s.h", int_mips_aver_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_S_W_DESC : MSA_3R_DESC_BASE<"aver_s.w", int_mips_aver_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_S_D_DESC : MSA_3R_DESC_BASE<"aver_s.d", int_mips_aver_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class AVER_U_B_DESC : MSA_3R_DESC_BASE<"aver_u.b", int_mips_aver_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_U_H_DESC : MSA_3R_DESC_BASE<"aver_u.h", int_mips_aver_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_U_W_DESC : MSA_3R_DESC_BASE<"aver_u.w", int_mips_aver_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_U_D_DESC : MSA_3R_DESC_BASE<"aver_u.d", int_mips_aver_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class BCLR_B_DESC : MSA_3R_DESC_BASE<"bclr.b", vbclr_b, MSA128BOpnd>;
+class BCLR_H_DESC : MSA_3R_DESC_BASE<"bclr.h", vbclr_h, MSA128HOpnd>;
+class BCLR_W_DESC : MSA_3R_DESC_BASE<"bclr.w", vbclr_w, MSA128WOpnd>;
+class BCLR_D_DESC : MSA_3R_DESC_BASE<"bclr.d", vbclr_d, MSA128DOpnd>;
+
+class BCLRI_B_DESC : MSA_BIT_B_DESC_BASE<"bclri.b", and, vsplat_uimm_inv_pow2,
+                                         MSA128BOpnd>;
+class BCLRI_H_DESC : MSA_BIT_H_DESC_BASE<"bclri.h", and, vsplat_uimm_inv_pow2,
+                                         MSA128HOpnd>;
+class BCLRI_W_DESC : MSA_BIT_W_DESC_BASE<"bclri.w", and, vsplat_uimm_inv_pow2,
+                                         MSA128WOpnd>;
+class BCLRI_D_DESC : MSA_BIT_D_DESC_BASE<"bclri.d", and, vsplat_uimm_inv_pow2,
+                                         MSA128DOpnd>;
+
+class BINSL_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.b", int_mips_binsl_b,
+                                            MSA128BOpnd>;
+class BINSL_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.h", int_mips_binsl_h,
+                                            MSA128HOpnd>;
+class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
+                                            MSA128WOpnd>;
+class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
+                                            MSA128DOpnd>;
+
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, MSA128DOpnd>;
+
+class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
+                                            MSA128BOpnd>;
+class BINSR_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.h", int_mips_binsr_h,
+                                            MSA128HOpnd>;
+class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
+                                            MSA128WOpnd>;
+class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
+                                            MSA128DOpnd>;
+
+class BINSRI_B_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, MSA128BOpnd>;
+class BINSRI_H_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, MSA128HOpnd>;
+class BINSRI_W_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, MSA128WOpnd>;
+class BINSRI_D_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, MSA128DOpnd>;
+
+class BMNZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmnz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMNZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmnzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BNEG_B_DESC : MSA_3R_DESC_BASE<"bneg.b", vbneg_b, MSA128BOpnd>;
+class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
+class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
+class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
+
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2,
+                                         MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2,
+                                         MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2,
+                                         MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2,
+                                         MSA128DOpnd>;
+
+class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
+class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
+class BNZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bnz.w", MSA128WOpnd>;
+class BNZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bnz.d", MSA128DOpnd>;
+
+class BNZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bnz.v", MSA128BOpnd>;
+
+class BSEL_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bsel.v\t$wd, $ws, $wt";
+  // Note that vselect and BSEL_V treat the condition operand the opposite way
+  // from each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  list<dag> Pattern = [(set MSA128BOpnd:$wd,
+                        (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$wt,
+                                                     MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSELI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bseli.b\t$wd, $ws, $u8";
+  // Note that vselect and BSEL_V treat the condition operand the opposite way
+  // from each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
+                                                      vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSET_B_DESC : MSA_3R_DESC_BASE<"bset.b", vbset_b, MSA128BOpnd>;
+class BSET_H_DESC : MSA_3R_DESC_BASE<"bset.h", vbset_h, MSA128HOpnd>;
+class BSET_W_DESC : MSA_3R_DESC_BASE<"bset.w", vbset_w, MSA128WOpnd>;
+class BSET_D_DESC : MSA_3R_DESC_BASE<"bset.d", vbset_d, MSA128DOpnd>;
+
+class BSETI_B_DESC : MSA_BIT_B_DESC_BASE<"bseti.b", or, vsplat_uimm_pow2,
+                                         MSA128BOpnd>;
+class BSETI_H_DESC : MSA_BIT_H_DESC_BASE<"bseti.h", or, vsplat_uimm_pow2,
+                                         MSA128HOpnd>;
+class BSETI_W_DESC : MSA_BIT_W_DESC_BASE<"bseti.w", or, vsplat_uimm_pow2,
+                                         MSA128WOpnd>;
+class BSETI_D_DESC : MSA_BIT_D_DESC_BASE<"bseti.d", or, vsplat_uimm_pow2,
+                                         MSA128DOpnd>;
+
+class BZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bz.b", MSA128BOpnd>;
+class BZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bz.h", MSA128HOpnd>;
+class BZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bz.w", MSA128WOpnd>;
+class BZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bz.d", MSA128DOpnd>;
+
+class BZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bz.v", MSA128BOpnd>;
+
+class CEQ_B_DESC : MSA_3R_DESC_BASE<"ceq.b", vseteq_v16i8, MSA128BOpnd>,
+                   IsCommutable;
+class CEQ_H_DESC : MSA_3R_DESC_BASE<"ceq.h", vseteq_v8i16, MSA128HOpnd>,
+                   IsCommutable;
+class CEQ_W_DESC : MSA_3R_DESC_BASE<"ceq.w", vseteq_v4i32, MSA128WOpnd>,
+                   IsCommutable;
+class CEQ_D_DESC : MSA_3R_DESC_BASE<"ceq.d", vseteq_v2i64, MSA128DOpnd>,
+                   IsCommutable;
+
+class CEQI_B_DESC : MSA_I5_DESC_BASE<"ceqi.b", vseteq_v16i8, vsplati8_simm5,
+                                     MSA128BOpnd>;
+class CEQI_H_DESC : MSA_I5_DESC_BASE<"ceqi.h", vseteq_v8i16, vsplati16_simm5,
+                                     MSA128HOpnd>;
+class CEQI_W_DESC : MSA_I5_DESC_BASE<"ceqi.w", vseteq_v4i32, vsplati32_simm5,
+                                     MSA128WOpnd>;
+class CEQI_D_DESC : MSA_I5_DESC_BASE<"ceqi.d", vseteq_v2i64, vsplati64_simm5,
+                                     MSA128DOpnd>;
+
+class CFCMSA_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins MSA128CROpnd:$cs);
+  string AsmString = "cfcmsa\t$rd, $cs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
+class CLE_S_H_DESC : MSA_3R_DESC_BASE<"cle_s.h", vsetle_v8i16, MSA128HOpnd>;
+class CLE_S_W_DESC : MSA_3R_DESC_BASE<"cle_s.w", vsetle_v4i32, MSA128WOpnd>;
+class CLE_S_D_DESC : MSA_3R_DESC_BASE<"cle_s.d", vsetle_v2i64, MSA128DOpnd>;
+
+class CLE_U_B_DESC : MSA_3R_DESC_BASE<"cle_u.b", vsetule_v16i8, MSA128BOpnd>;
+class CLE_U_H_DESC : MSA_3R_DESC_BASE<"cle_u.h", vsetule_v8i16, MSA128HOpnd>;
+class CLE_U_W_DESC : MSA_3R_DESC_BASE<"cle_u.w", vsetule_v4i32, MSA128WOpnd>;
+class CLE_U_D_DESC : MSA_3R_DESC_BASE<"cle_u.d", vsetule_v2i64, MSA128DOpnd>;
+
+class CLEI_S_B_DESC : MSA_I5_DESC_BASE<"clei_s.b", vsetle_v16i8,
+                                       vsplati8_simm5,  MSA128BOpnd>;
+class CLEI_S_H_DESC : MSA_I5_DESC_BASE<"clei_s.h", vsetle_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLEI_S_W_DESC : MSA_I5_DESC_BASE<"clei_s.w", vsetle_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLEI_S_D_DESC : MSA_I5_DESC_BASE<"clei_s.d", vsetle_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLEI_U_B_DESC : MSA_I5_DESC_BASE<"clei_u.b", vsetule_v16i8,
+                                       vsplati8_uimm5,  MSA128BOpnd>;
+class CLEI_U_H_DESC : MSA_I5_DESC_BASE<"clei_u.h", vsetule_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLEI_U_W_DESC : MSA_I5_DESC_BASE<"clei_u.w", vsetule_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLEI_U_D_DESC : MSA_I5_DESC_BASE<"clei_u.d", vsetule_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class CLT_S_B_DESC : MSA_3R_DESC_BASE<"clt_s.b", vsetlt_v16i8, MSA128BOpnd>;
+class CLT_S_H_DESC : MSA_3R_DESC_BASE<"clt_s.h", vsetlt_v8i16, MSA128HOpnd>;
+class CLT_S_W_DESC : MSA_3R_DESC_BASE<"clt_s.w", vsetlt_v4i32, MSA128WOpnd>;
+class CLT_S_D_DESC : MSA_3R_DESC_BASE<"clt_s.d", vsetlt_v2i64, MSA128DOpnd>;
+
+class CLT_U_B_DESC : MSA_3R_DESC_BASE<"clt_u.b", vsetult_v16i8, MSA128BOpnd>;
+class CLT_U_H_DESC : MSA_3R_DESC_BASE<"clt_u.h", vsetult_v8i16, MSA128HOpnd>;
+class CLT_U_W_DESC : MSA_3R_DESC_BASE<"clt_u.w", vsetult_v4i32, MSA128WOpnd>;
+class CLT_U_D_DESC : MSA_3R_DESC_BASE<"clt_u.d", vsetult_v2i64, MSA128DOpnd>;
+
+class CLTI_S_B_DESC : MSA_I5_DESC_BASE<"clti_s.b", vsetlt_v16i8,
+                                       vsplati8_simm5, MSA128BOpnd>;
+class CLTI_S_H_DESC : MSA_I5_DESC_BASE<"clti_s.h", vsetlt_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLTI_S_W_DESC : MSA_I5_DESC_BASE<"clti_s.w", vsetlt_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLTI_S_D_DESC : MSA_I5_DESC_BASE<"clti_s.d", vsetlt_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLTI_U_B_DESC : MSA_I5_DESC_BASE<"clti_u.b", vsetult_v16i8,
+                                       vsplati8_uimm5, MSA128BOpnd>;
+class CLTI_U_H_DESC : MSA_I5_DESC_BASE<"clti_u.h", vsetult_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLTI_U_W_DESC : MSA_I5_DESC_BASE<"clti_u.w", vsetult_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8,  v16i8,
+                                         GPR32Opnd, MSA128BOpnd>;
+class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
+                                         GPR32Opnd, MSA128HOpnd>;
+class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
+                                         GPR32Opnd, MSA128WOpnd>;
+class COPY_S_D_DESC : MSA_COPY_DESC_BASE<"copy_s.d", vextract_sext_i64, v2i64,
+                                         GPR64Opnd, MSA128DOpnd>;
+
+class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8,  v16i8,
+                                         GPR32Opnd, MSA128BOpnd>;
+class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
+                                         GPR32Opnd, MSA128HOpnd>;
+class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
+                                         GPR32Opnd, MSA128WOpnd>;
+class COPY_U_D_DESC : MSA_COPY_DESC_BASE<"copy_u.d", vextract_zext_i64, v2i64,
+                                         GPR64Opnd, MSA128DOpnd>;
+
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
+                                                 MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
+                                                 MSA128D>;
+
+class CTCMSA_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MSA128CROpnd:$cd, GPR32Opnd:$rs);
+  string AsmString = "ctcmsa\t$cd, $rs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
+class DIV_S_H_DESC : MSA_3R_DESC_BASE<"div_s.h", sdiv, MSA128HOpnd>;
+class DIV_S_W_DESC : MSA_3R_DESC_BASE<"div_s.w", sdiv, MSA128WOpnd>;
+class DIV_S_D_DESC : MSA_3R_DESC_BASE<"div_s.d", sdiv, MSA128DOpnd>;
+
+class DIV_U_B_DESC : MSA_3R_DESC_BASE<"div_u.b", udiv, MSA128BOpnd>;
+class DIV_U_H_DESC : MSA_3R_DESC_BASE<"div_u.h", udiv, MSA128HOpnd>;
+class DIV_U_W_DESC : MSA_3R_DESC_BASE<"div_u.w", udiv, MSA128WOpnd>;
+class DIV_U_D_DESC : MSA_3R_DESC_BASE<"div_u.d", udiv, MSA128DOpnd>;
+
+class DOTP_S_H_DESC : MSA_3R_DESC_BASE<"dotp_s.h", int_mips_dotp_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_S_W_DESC : MSA_3R_DESC_BASE<"dotp_s.w", int_mips_dotp_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_S_D_DESC : MSA_3R_DESC_BASE<"dotp_s.d", int_mips_dotp_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DOTP_U_H_DESC : MSA_3R_DESC_BASE<"dotp_u.h", int_mips_dotp_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_U_W_DESC : MSA_3R_DESC_BASE<"dotp_u.w", int_mips_dotp_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_U_D_DESC : MSA_3R_DESC_BASE<"dotp_u.d", int_mips_dotp_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DPADD_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.h", int_mips_dpadd_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.w", int_mips_dpadd_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.d", int_mips_dpadd_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPADD_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.h", int_mips_dpadd_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.w", int_mips_dpadd_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.d", int_mips_dpadd_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPSUB_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.h", int_mips_dpsub_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.w", int_mips_dpsub_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.d", int_mips_dpsub_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class DPSUB_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.h", int_mips_dpsub_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.w", int_mips_dpsub_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.d", int_mips_dpsub_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class FADD_W_DESC : MSA_3RF_DESC_BASE<"fadd.w", fadd, MSA128WOpnd>,
+                    IsCommutable;
+class FADD_D_DESC : MSA_3RF_DESC_BASE<"fadd.d", fadd, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCAF_W_DESC : MSA_3RF_DESC_BASE<"fcaf.w", int_mips_fcaf_w, MSA128WOpnd>,
+                    IsCommutable;
+class FCAF_D_DESC : MSA_3RF_DESC_BASE<"fcaf.d", int_mips_fcaf_d, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCEQ_W_DESC : MSA_3RF_DESC_BASE<"fceq.w", vfsetoeq_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCEQ_D_DESC : MSA_3RF_DESC_BASE<"fceq.d", vfsetoeq_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCLASS_W_DESC : MSA_2RF_DESC_BASE<"fclass.w", int_mips_fclass_w,
+                                        MSA128WOpnd>;
+class FCLASS_D_DESC : MSA_2RF_DESC_BASE<"fclass.d", int_mips_fclass_d,
+                                        MSA128DOpnd>;
+
+class FCLE_W_DESC : MSA_3RF_DESC_BASE<"fcle.w", vfsetole_v4f32, MSA128WOpnd>;
+class FCLE_D_DESC : MSA_3RF_DESC_BASE<"fcle.d", vfsetole_v2f64, MSA128DOpnd>;
+
+class FCLT_W_DESC : MSA_3RF_DESC_BASE<"fclt.w", vfsetolt_v4f32, MSA128WOpnd>;
+class FCLT_D_DESC : MSA_3RF_DESC_BASE<"fclt.d", vfsetolt_v2f64, MSA128DOpnd>;
+
+class FCNE_W_DESC : MSA_3RF_DESC_BASE<"fcne.w", vfsetone_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCNE_D_DESC : MSA_3RF_DESC_BASE<"fcne.d", vfsetone_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCOR_W_DESC : MSA_3RF_DESC_BASE<"fcor.w", vfsetord_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCOR_D_DESC : MSA_3RF_DESC_BASE<"fcor.d", vfsetord_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUEQ_W_DESC : MSA_3RF_DESC_BASE<"fcueq.w", vfsetueq_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUEQ_D_DESC : MSA_3RF_DESC_BASE<"fcueq.d", vfsetueq_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULE_W_DESC : MSA_3RF_DESC_BASE<"fcule.w", vfsetule_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULE_D_DESC : MSA_3RF_DESC_BASE<"fcule.d", vfsetule_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULT_W_DESC : MSA_3RF_DESC_BASE<"fcult.w", vfsetult_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULT_D_DESC : MSA_3RF_DESC_BASE<"fcult.d", vfsetult_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCUN_W_DESC : MSA_3RF_DESC_BASE<"fcun.w", vfsetun_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCUN_D_DESC : MSA_3RF_DESC_BASE<"fcun.d", vfsetun_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUNE_W_DESC : MSA_3RF_DESC_BASE<"fcune.w", vfsetune_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUNE_D_DESC : MSA_3RF_DESC_BASE<"fcune.d", vfsetune_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FDIV_W_DESC : MSA_3RF_DESC_BASE<"fdiv.w", fdiv, MSA128WOpnd>;
+class FDIV_D_DESC : MSA_3RF_DESC_BASE<"fdiv.d", fdiv, MSA128DOpnd>;
+
+class FEXDO_H_DESC : MSA_3RF_DESC_BASE<"fexdo.h", int_mips_fexdo_h,
+                                       MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
+                                       MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// The fexp2.df instruction multiplies the first operand by 2 to the power of
+// the second operand. We therefore need a pseudo-insn in order to invent the
+// 1.0 when we only need to match ISD::FEXP2.
+class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
+class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
+let usesCustomInserter = 1 in {
+  class FEXP2_W_1_PSEUDO_DESC :
+      MSAPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+                [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+  class FEXP2_D_1_PSEUDO_DESC :
+      MSAPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+                [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+}
+
+class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPL_D_DESC : MSA_2RF_DESC_BASE<"fexupl.d", int_mips_fexupl_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FEXUPR_W_DESC : MSA_2RF_DESC_BASE<"fexupr.w", int_mips_fexupr_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPR_D_DESC : MSA_2RF_DESC_BASE<"fexupr.d", int_mips_fexupr_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FFINT_S_W_DESC : MSA_2RF_DESC_BASE<"ffint_s.w", sint_to_fp, MSA128WOpnd>;
+class FFINT_S_D_DESC : MSA_2RF_DESC_BASE<"ffint_s.d", sint_to_fp, MSA128DOpnd>;
+
+class FFINT_U_W_DESC : MSA_2RF_DESC_BASE<"ffint_u.w", uint_to_fp, MSA128WOpnd>;
+class FFINT_U_D_DESC : MSA_2RF_DESC_BASE<"ffint_u.d", uint_to_fp, MSA128DOpnd>;
+
+class FFQL_W_DESC : MSA_2RF_DESC_BASE<"ffql.w", int_mips_ffql_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQL_D_DESC : MSA_2RF_DESC_BASE<"ffql.d", int_mips_ffql_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FFQR_W_DESC : MSA_2RF_DESC_BASE<"ffqr.w", int_mips_ffqr_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQR_D_DESC : MSA_2RF_DESC_BASE<"ffqr.d", int_mips_ffqr_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FILL_B_DESC : MSA_2R_FILL_DESC_BASE<"fill.b", v16i8, vsplati8,
+                                          MSA128BOpnd, GPR32Opnd>;
+class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
+                                          MSA128HOpnd, GPR32Opnd>;
+class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
+                                          MSA128WOpnd, GPR32Opnd>;
+class FILL_D_DESC : MSA_2R_FILL_DESC_BASE<"fill.d", v2i64, vsplati64,
+                                          MSA128DOpnd, GPR64Opnd>;
+
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
+                                                    FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
+                                                    FGR64>;
+
+class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
+class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
+
+class FMADD_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.w", fma, MSA128WOpnd>;
+class FMADD_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.d", fma, MSA128DOpnd>;
+
+class FMAX_W_DESC : MSA_3RF_DESC_BASE<"fmax.w", int_mips_fmax_w, MSA128WOpnd>;
+class FMAX_D_DESC : MSA_3RF_DESC_BASE<"fmax.d", int_mips_fmax_d, MSA128DOpnd>;
+
+class FMAX_A_W_DESC : MSA_3RF_DESC_BASE<"fmax_a.w", int_mips_fmax_a_w,
+                                        MSA128WOpnd>;
+class FMAX_A_D_DESC : MSA_3RF_DESC_BASE<"fmax_a.d", int_mips_fmax_a_d,
+                                        MSA128DOpnd>;
+
+class FMIN_W_DESC : MSA_3RF_DESC_BASE<"fmin.w", int_mips_fmin_w, MSA128WOpnd>;
+class FMIN_D_DESC : MSA_3RF_DESC_BASE<"fmin.d", int_mips_fmin_d, MSA128DOpnd>;
+
+class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
+                                        MSA128WOpnd>;
+class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
+                                        MSA128DOpnd>;
+
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+
+class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
+class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
+
+class FRINT_W_DESC : MSA_2RF_DESC_BASE<"frint.w", frint, MSA128WOpnd>;
+class FRINT_D_DESC : MSA_2RF_DESC_BASE<"frint.d", frint, MSA128DOpnd>;
+
+class FRCP_W_DESC : MSA_2RF_DESC_BASE<"frcp.w", int_mips_frcp_w, MSA128WOpnd>;
+class FRCP_D_DESC : MSA_2RF_DESC_BASE<"frcp.d", int_mips_frcp_d, MSA128DOpnd>;
+
+class FRSQRT_W_DESC : MSA_2RF_DESC_BASE<"frsqrt.w", int_mips_frsqrt_w,
+                                        MSA128WOpnd>;
+class FRSQRT_D_DESC : MSA_2RF_DESC_BASE<"frsqrt.d", int_mips_frsqrt_d,
+                                        MSA128DOpnd>;
+
+class FSAF_W_DESC : MSA_3RF_DESC_BASE<"fsaf.w", int_mips_fsaf_w, MSA128WOpnd>;
+class FSAF_D_DESC : MSA_3RF_DESC_BASE<"fsaf.d", int_mips_fsaf_d, MSA128DOpnd>;
+
+class FSEQ_W_DESC : MSA_3RF_DESC_BASE<"fseq.w", int_mips_fseq_w, MSA128WOpnd>;
+class FSEQ_D_DESC : MSA_3RF_DESC_BASE<"fseq.d", int_mips_fseq_d, MSA128DOpnd>;
+
+class FSLE_W_DESC : MSA_3RF_DESC_BASE<"fsle.w", int_mips_fsle_w, MSA128WOpnd>;
+class FSLE_D_DESC : MSA_3RF_DESC_BASE<"fsle.d", int_mips_fsle_d, MSA128DOpnd>;
+
+class FSLT_W_DESC : MSA_3RF_DESC_BASE<"fslt.w", int_mips_fslt_w, MSA128WOpnd>;
+class FSLT_D_DESC : MSA_3RF_DESC_BASE<"fslt.d", int_mips_fslt_d, MSA128DOpnd>;
+
+class FSNE_W_DESC : MSA_3RF_DESC_BASE<"fsne.w", int_mips_fsne_w, MSA128WOpnd>;
+class FSNE_D_DESC : MSA_3RF_DESC_BASE<"fsne.d", int_mips_fsne_d, MSA128DOpnd>;
+
+class FSOR_W_DESC : MSA_3RF_DESC_BASE<"fsor.w", int_mips_fsor_w, MSA128WOpnd>;
+class FSOR_D_DESC : MSA_3RF_DESC_BASE<"fsor.d", int_mips_fsor_d, MSA128DOpnd>;
+
+class FSQRT_W_DESC : MSA_2RF_DESC_BASE<"fsqrt.w", fsqrt, MSA128WOpnd>;
+class FSQRT_D_DESC : MSA_2RF_DESC_BASE<"fsqrt.d", fsqrt, MSA128DOpnd>;
+
+class FSUB_W_DESC : MSA_3RF_DESC_BASE<"fsub.w", fsub, MSA128WOpnd>;
+class FSUB_D_DESC : MSA_3RF_DESC_BASE<"fsub.d", fsub, MSA128DOpnd>;
+
+class FSUEQ_W_DESC : MSA_3RF_DESC_BASE<"fsueq.w", int_mips_fsueq_w,
+                                       MSA128WOpnd>;
+class FSUEQ_D_DESC : MSA_3RF_DESC_BASE<"fsueq.d", int_mips_fsueq_d,
+                                       MSA128DOpnd>;
+
+class FSULE_W_DESC : MSA_3RF_DESC_BASE<"fsule.w", int_mips_fsule_w,
+                                       MSA128WOpnd>;
+class FSULE_D_DESC : MSA_3RF_DESC_BASE<"fsule.d", int_mips_fsule_d,
+                                       MSA128DOpnd>;
+
+class FSULT_W_DESC : MSA_3RF_DESC_BASE<"fsult.w", int_mips_fsult_w,
+                                       MSA128WOpnd>;
+class FSULT_D_DESC : MSA_3RF_DESC_BASE<"fsult.d", int_mips_fsult_d,
+                                       MSA128DOpnd>;
+
+class FSUN_W_DESC : MSA_3RF_DESC_BASE<"fsun.w", int_mips_fsun_w,
+                                      MSA128WOpnd>;
+class FSUN_D_DESC : MSA_3RF_DESC_BASE<"fsun.d", int_mips_fsun_d,
+                                      MSA128DOpnd>;
+
+class FSUNE_W_DESC : MSA_3RF_DESC_BASE<"fsune.w", int_mips_fsune_w,
+                                       MSA128WOpnd>;
+class FSUNE_D_DESC : MSA_3RF_DESC_BASE<"fsune.d", int_mips_fsune_d,
+                                       MSA128DOpnd>;
+
+class FTINT_S_W_DESC : MSA_2RF_DESC_BASE<"ftint_s.w", int_mips_ftint_s_w,
+                                         MSA128WOpnd>;
+class FTINT_S_D_DESC : MSA_2RF_DESC_BASE<"ftint_s.d", int_mips_ftint_s_d,
+                                         MSA128DOpnd>;
+
+class FTINT_U_W_DESC : MSA_2RF_DESC_BASE<"ftint_u.w", int_mips_ftint_u_w,
+                                         MSA128WOpnd>;
+class FTINT_U_D_DESC : MSA_2RF_DESC_BASE<"ftint_u.d", int_mips_ftint_u_d,
+                                         MSA128DOpnd>;
+
+class FTQ_H_DESC : MSA_3RF_DESC_BASE<"ftq.h", int_mips_ftq_h,
+                                     MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FTQ_W_DESC : MSA_3RF_DESC_BASE<"ftq.w", int_mips_ftq_w,
+                                     MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+class FTRUNC_S_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.w", fp_to_sint,
+                                          MSA128WOpnd>;
+class FTRUNC_S_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.d", fp_to_sint,
+                                          MSA128DOpnd>;
+
+class FTRUNC_U_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.w", fp_to_uint,
+                                          MSA128WOpnd>;
+class FTRUNC_U_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.d", fp_to_uint,
+                                          MSA128DOpnd>;
+
+class HADD_S_H_DESC : MSA_3R_DESC_BASE<"hadd_s.h", int_mips_hadd_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_S_W_DESC : MSA_3R_DESC_BASE<"hadd_s.w", int_mips_hadd_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_S_D_DESC : MSA_3R_DESC_BASE<"hadd_s.d", int_mips_hadd_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HADD_U_H_DESC : MSA_3R_DESC_BASE<"hadd_u.h", int_mips_hadd_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_U_W_DESC : MSA_3R_DESC_BASE<"hadd_u.w", int_mips_hadd_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_U_D_DESC : MSA_3R_DESC_BASE<"hadd_u.d", int_mips_hadd_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_S_H_DESC : MSA_3R_DESC_BASE<"hsub_s.h", int_mips_hsub_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_S_W_DESC : MSA_3R_DESC_BASE<"hsub_s.w", int_mips_hsub_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_S_D_DESC : MSA_3R_DESC_BASE<"hsub_s.d", int_mips_hsub_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_U_H_DESC : MSA_3R_DESC_BASE<"hsub_u.h", int_mips_hsub_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_U_W_DESC : MSA_3R_DESC_BASE<"hsub_u.w", int_mips_hsub_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_U_D_DESC : MSA_3R_DESC_BASE<"hsub_u.d", int_mips_hsub_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class ILVEV_B_DESC : MSA_3R_DESC_BASE<"ilvev.b", MipsILVEV, MSA128BOpnd>;
+class ILVEV_H_DESC : MSA_3R_DESC_BASE<"ilvev.h", MipsILVEV, MSA128HOpnd>;
+class ILVEV_W_DESC : MSA_3R_DESC_BASE<"ilvev.w", MipsILVEV, MSA128WOpnd>;
+class ILVEV_D_DESC : MSA_3R_DESC_BASE<"ilvev.d", MipsILVEV, MSA128DOpnd>;
+
+class ILVL_B_DESC : MSA_3R_DESC_BASE<"ilvl.b", MipsILVL, MSA128BOpnd>;
+class ILVL_H_DESC : MSA_3R_DESC_BASE<"ilvl.h", MipsILVL, MSA128HOpnd>;
+class ILVL_W_DESC : MSA_3R_DESC_BASE<"ilvl.w", MipsILVL, MSA128WOpnd>;
+class ILVL_D_DESC : MSA_3R_DESC_BASE<"ilvl.d", MipsILVL, MSA128DOpnd>;
+
+class ILVOD_B_DESC : MSA_3R_DESC_BASE<"ilvod.b", MipsILVOD, MSA128BOpnd>;
+class ILVOD_H_DESC : MSA_3R_DESC_BASE<"ilvod.h", MipsILVOD, MSA128HOpnd>;
+class ILVOD_W_DESC : MSA_3R_DESC_BASE<"ilvod.w", MipsILVOD, MSA128WOpnd>;
+class ILVOD_D_DESC : MSA_3R_DESC_BASE<"ilvod.d", MipsILVOD, MSA128DOpnd>;
+
+class ILVR_B_DESC : MSA_3R_DESC_BASE<"ilvr.b", MipsILVR, MSA128BOpnd>;
+class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
+class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
+class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
+
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8,
+                                           MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16,
+                                           MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
+                                           MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64,
+                                           MSA128DOpnd, GPR64Opnd>;
+
+class INSERT_B_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd>;
+
+class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+                                                     MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+                                                     MSA128DOpnd, FGR64Opnd>;
+
+class INSERT_FW_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd>;
+
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8,
+                                         MSA128BOpnd>;
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16,
+                                         MSA128HOpnd>;
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32,
+                                         MSA128WOpnd>;
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64,
+                                         MSA128DOpnd>;
+
+class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd = mem_msa, ComplexPattern Addr = addrimm10,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(set ROWD:$wd, (TyNode (OpNode Addr:$addr)))];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd>;
+
+class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
+class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
+class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
+class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+
+class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
+                    RegisterOperand RORS = RORD, RegisterOperand RORT = RORD,
+                    InstrItinClass itin = NoItinerary > {
+  dag OutOperandList = (outs RORD:$rd);
+  dag InOperandList = (ins RORS:$rs, RORT:$rt, LSAImm:$sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa");
+  list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt,
+                                                (shl RORS:$rs,
+                                                     immZExt2Lsa:$sa)))];
+  InstrItinClass Itinerary = itin;
+}
+
+class LSA_DESC : LSA_DESC_BASE<"lsa", GPR32Opnd>;
+class DLSA_DESC : LSA_DESC_BASE<"dlsa", GPR64Opnd>;
+
+class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
+                                            MSA128HOpnd>;
+class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
+                                            MSA128WOpnd>;
+
+class MADDR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.h", int_mips_maddr_q_h,
+                                             MSA128HOpnd>;
+class MADDR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.w", int_mips_maddr_q_w,
+                                             MSA128WOpnd>;
+
+class MADDV_B_DESC : MSA_3R_4R_DESC_BASE<"maddv.b", muladd, MSA128BOpnd>;
+class MADDV_H_DESC : MSA_3R_4R_DESC_BASE<"maddv.h", muladd, MSA128HOpnd>;
+class MADDV_W_DESC : MSA_3R_4R_DESC_BASE<"maddv.w", muladd, MSA128WOpnd>;
+class MADDV_D_DESC : MSA_3R_4R_DESC_BASE<"maddv.d", muladd, MSA128DOpnd>;
+
+class MAX_A_B_DESC : MSA_3R_DESC_BASE<"max_a.b", int_mips_max_a_b, MSA128BOpnd>;
+class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
+class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
+class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
+
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
+class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
+class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
+class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
+
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
+class MOD_S_H_DESC : MSA_3R_DESC_BASE<"mod_s.h", srem, MSA128HOpnd>;
+class MOD_S_W_DESC : MSA_3R_DESC_BASE<"mod_s.w", srem, MSA128WOpnd>;
+class MOD_S_D_DESC : MSA_3R_DESC_BASE<"mod_s.d", srem, MSA128DOpnd>;
+
+class MOD_U_B_DESC : MSA_3R_DESC_BASE<"mod_u.b", urem, MSA128BOpnd>;
+class MOD_U_H_DESC : MSA_3R_DESC_BASE<"mod_u.h", urem, MSA128HOpnd>;
+class MOD_U_W_DESC : MSA_3R_DESC_BASE<"mod_u.w", urem, MSA128WOpnd>;
+class MOD_U_D_DESC : MSA_3R_DESC_BASE<"mod_u.d", urem, MSA128DOpnd>;
+
+class MOVE_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$ws);
+  string AsmString = "move.v\t$wd, $ws";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
+                                            MSA128HOpnd>;
+class MSUB_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.w", int_mips_msub_q_w,
+                                            MSA128WOpnd>;
+
+class MSUBR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.h", int_mips_msubr_q_h,
+                                             MSA128HOpnd>;
+class MSUBR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.w", int_mips_msubr_q_w,
+                                             MSA128WOpnd>;
+
+class MSUBV_B_DESC : MSA_3R_4R_DESC_BASE<"msubv.b", mulsub, MSA128BOpnd>;
+class MSUBV_H_DESC : MSA_3R_4R_DESC_BASE<"msubv.h", mulsub, MSA128HOpnd>;
+class MSUBV_W_DESC : MSA_3R_4R_DESC_BASE<"msubv.w", mulsub, MSA128WOpnd>;
+class MSUBV_D_DESC : MSA_3R_4R_DESC_BASE<"msubv.d", mulsub, MSA128DOpnd>;
+
+class MUL_Q_H_DESC : MSA_3RF_DESC_BASE<"mul_q.h", int_mips_mul_q_h,
+                                       MSA128HOpnd>;
+class MUL_Q_W_DESC : MSA_3RF_DESC_BASE<"mul_q.w", int_mips_mul_q_w,
+                                       MSA128WOpnd>;
+
+class MULR_Q_H_DESC : MSA_3RF_DESC_BASE<"mulr_q.h", int_mips_mulr_q_h,
+                                        MSA128HOpnd>;
+class MULR_Q_W_DESC : MSA_3RF_DESC_BASE<"mulr_q.w", int_mips_mulr_q_w,
+                                        MSA128WOpnd>;
+
+class MULV_B_DESC : MSA_3R_DESC_BASE<"mulv.b", mul, MSA128BOpnd>;
+class MULV_H_DESC : MSA_3R_DESC_BASE<"mulv.h", mul, MSA128HOpnd>;
+class MULV_W_DESC : MSA_3R_DESC_BASE<"mulv.w", mul, MSA128WOpnd>;
+class MULV_D_DESC : MSA_3R_DESC_BASE<"mulv.d", mul, MSA128DOpnd>;
+
+class NLOC_B_DESC : MSA_2R_DESC_BASE<"nloc.b", int_mips_nloc_b, MSA128BOpnd>;
+class NLOC_H_DESC : MSA_2R_DESC_BASE<"nloc.h", int_mips_nloc_h, MSA128HOpnd>;
+class NLOC_W_DESC : MSA_2R_DESC_BASE<"nloc.w", int_mips_nloc_w, MSA128WOpnd>;
+class NLOC_D_DESC : MSA_2R_DESC_BASE<"nloc.d", int_mips_nloc_d, MSA128DOpnd>;
+
+class NLZC_B_DESC : MSA_2R_DESC_BASE<"nlzc.b", ctlz, MSA128BOpnd>;
+class NLZC_H_DESC : MSA_2R_DESC_BASE<"nlzc.h", ctlz, MSA128HOpnd>;
+class NLZC_W_DESC : MSA_2R_DESC_BASE<"nlzc.w", ctlz, MSA128WOpnd>;
+class NLZC_D_DESC : MSA_2R_DESC_BASE<"nlzc.d", ctlz, MSA128DOpnd>;
+
+class NOR_V_DESC : MSA_VEC_DESC_BASE<"nor.v", MipsVNOR, MSA128BOpnd>;
+class NOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128HOpnd>;
+class NOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128WOpnd>;
+class NOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128DOpnd>;
+
+class NORI_B_DESC : MSA_I8_DESC_BASE<"nori.b", MipsVNOR, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class OR_V_DESC : MSA_VEC_DESC_BASE<"or.v", or, MSA128BOpnd>;
+class OR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128HOpnd>;
+class OR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128WOpnd>;
+class OR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128DOpnd>;
+
+class ORI_B_DESC : MSA_I8_DESC_BASE<"ori.b", or, vsplati8_uimm8, MSA128BOpnd>;
+
+class PCKEV_B_DESC : MSA_3R_DESC_BASE<"pckev.b", MipsPCKEV, MSA128BOpnd>;
+class PCKEV_H_DESC : MSA_3R_DESC_BASE<"pckev.h", MipsPCKEV, MSA128HOpnd>;
+class PCKEV_W_DESC : MSA_3R_DESC_BASE<"pckev.w", MipsPCKEV, MSA128WOpnd>;
+class PCKEV_D_DESC : MSA_3R_DESC_BASE<"pckev.d", MipsPCKEV, MSA128DOpnd>;
+
+class PCKOD_B_DESC : MSA_3R_DESC_BASE<"pckod.b", MipsPCKOD, MSA128BOpnd>;
+class PCKOD_H_DESC : MSA_3R_DESC_BASE<"pckod.h", MipsPCKOD, MSA128HOpnd>;
+class PCKOD_W_DESC : MSA_3R_DESC_BASE<"pckod.w", MipsPCKOD, MSA128WOpnd>;
+class PCKOD_D_DESC : MSA_3R_DESC_BASE<"pckod.d", MipsPCKOD, MSA128DOpnd>;
+
+class PCNT_B_DESC : MSA_2R_DESC_BASE<"pcnt.b", ctpop, MSA128BOpnd>;
+class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
+class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
+class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
+
+class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b,
+                                           MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h,
+                                           MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w,
+                                           MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d,
+                                           MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b,
+                                           MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h,
+                                           MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w,
+                                           MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d,
+                                           MSA128DOpnd>;
+
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128WOpnd>;
+
+class SLD_B_DESC : MSA_3R_SLD_DESC_BASE<"sld.b", int_mips_sld_b, MSA128BOpnd>;
+class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
+class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
+class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
+
+class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b,
+                                          MSA128BOpnd>;
+class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h,
+                                          MSA128HOpnd>;
+class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w,
+                                          MSA128WOpnd>;
+class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d,
+                                          MSA128DOpnd>;
+
+class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
+class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
+class SLL_W_DESC : MSA_3R_DESC_BASE<"sll.w", shl, MSA128WOpnd>;
+class SLL_D_DESC : MSA_3R_DESC_BASE<"sll.d", shl, MSA128DOpnd>;
+
+class SLLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.b", shl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SLLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.h", shl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SLLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.w", shl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SLLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.d", shl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SPLAT_B_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.b", vsplati8_elt,
+                                            MSA128BOpnd>;
+class SPLAT_H_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.h", vsplati16_elt,
+                                            MSA128HOpnd>;
+class SPLAT_W_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.w", vsplati32_elt,
+                                            MSA128WOpnd>;
+class SPLAT_D_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.d", vsplati64_elt,
+                                            MSA128DOpnd>;
+
+class SPLATI_B_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.b", vsplati8_uimm4,
+                                              MSA128BOpnd>;
+class SPLATI_H_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.h", vsplati16_uimm3,
+                                              MSA128HOpnd>;
+class SPLATI_W_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.w", vsplati32_uimm2,
+                                              MSA128WOpnd>;
+class SPLATI_D_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.d", vsplati64_uimm1,
+                                              MSA128DOpnd>;
+
+class SRA_B_DESC : MSA_3R_DESC_BASE<"sra.b", sra, MSA128BOpnd>;
+class SRA_H_DESC : MSA_3R_DESC_BASE<"sra.h", sra, MSA128HOpnd>;
+class SRA_W_DESC : MSA_3R_DESC_BASE<"sra.w", sra, MSA128WOpnd>;
+class SRA_D_DESC : MSA_3R_DESC_BASE<"sra.d", sra, MSA128DOpnd>;
+
+class SRAI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.b", sra, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRAI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.h", sra, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRAI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.w", sra, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRAI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.d", sra, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRAR_B_DESC : MSA_3R_DESC_BASE<"srar.b", int_mips_srar_b, MSA128BOpnd>;
+class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
+class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
+class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
+
+class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b,
+                                           MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h,
+                                           MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w,
+                                           MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d,
+                                           MSA128DOpnd>;
+
+class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
+class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
+class SRL_W_DESC : MSA_3R_DESC_BASE<"srl.w", srl, MSA128WOpnd>;
+class SRL_D_DESC : MSA_3R_DESC_BASE<"srl.d", srl, MSA128DOpnd>;
+
+class SRLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.b", srl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.h", srl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.w", srl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.d", srl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRLR_B_DESC : MSA_3R_DESC_BASE<"srlr.b", int_mips_srlr_b, MSA128BOpnd>;
+class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
+class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
+class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
+
+class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b,
+                                           MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h,
+                                           MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w,
+                                           MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d,
+                                           MSA128DOpnd>;
+
+class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd = mem_msa, ComplexPattern Addr = addrimm10,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(OpNode (TyNode ROWD:$wd), Addr:$addr)];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd>;
+
+class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
+                                       MSA128BOpnd>;
+class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
+                                       MSA128HOpnd>;
+class SUBS_S_W_DESC : MSA_3R_DESC_BASE<"subs_s.w", int_mips_subs_s_w,
+                                       MSA128WOpnd>;
+class SUBS_S_D_DESC : MSA_3R_DESC_BASE<"subs_s.d", int_mips_subs_s_d,
+                                       MSA128DOpnd>;
+
+class SUBS_U_B_DESC : MSA_3R_DESC_BASE<"subs_u.b", int_mips_subs_u_b,
+                                       MSA128BOpnd>;
+class SUBS_U_H_DESC : MSA_3R_DESC_BASE<"subs_u.h", int_mips_subs_u_h,
+                                       MSA128HOpnd>;
+class SUBS_U_W_DESC : MSA_3R_DESC_BASE<"subs_u.w", int_mips_subs_u_w,
+                                       MSA128WOpnd>;
+class SUBS_U_D_DESC : MSA_3R_DESC_BASE<"subs_u.d", int_mips_subs_u_d,
+                                       MSA128DOpnd>;
+
+class SUBSUS_U_B_DESC : MSA_3R_DESC_BASE<"subsus_u.b", int_mips_subsus_u_b,
+                                         MSA128BOpnd>;
+class SUBSUS_U_H_DESC : MSA_3R_DESC_BASE<"subsus_u.h", int_mips_subsus_u_h,
+                                         MSA128HOpnd>;
+class SUBSUS_U_W_DESC : MSA_3R_DESC_BASE<"subsus_u.w", int_mips_subsus_u_w,
+                                         MSA128WOpnd>;
+class SUBSUS_U_D_DESC : MSA_3R_DESC_BASE<"subsus_u.d", int_mips_subsus_u_d,
+                                         MSA128DOpnd>;
+
+class SUBSUU_S_B_DESC : MSA_3R_DESC_BASE<"subsuu_s.b", int_mips_subsuu_s_b,
+                                         MSA128BOpnd>;
+class SUBSUU_S_H_DESC : MSA_3R_DESC_BASE<"subsuu_s.h", int_mips_subsuu_s_h,
+                                         MSA128HOpnd>;
+class SUBSUU_S_W_DESC : MSA_3R_DESC_BASE<"subsuu_s.w", int_mips_subsuu_s_w,
+                                         MSA128WOpnd>;
+class SUBSUU_S_D_DESC : MSA_3R_DESC_BASE<"subsuu_s.d", int_mips_subsuu_s_d,
+                                         MSA128DOpnd>;
+
+class SUBV_B_DESC : MSA_3R_DESC_BASE<"subv.b", sub, MSA128BOpnd>;
+class SUBV_H_DESC : MSA_3R_DESC_BASE<"subv.h", sub, MSA128HOpnd>;
+class SUBV_W_DESC : MSA_3R_DESC_BASE<"subv.w", sub, MSA128WOpnd>;
+class SUBV_D_DESC : MSA_3R_DESC_BASE<"subv.d", sub, MSA128DOpnd>;
+
+class SUBVI_B_DESC : MSA_I5_DESC_BASE<"subvi.b", sub, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class SUBVI_H_DESC : MSA_I5_DESC_BASE<"subvi.h", sub, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128BOpnd>;
+class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128HOpnd>;
+class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128WOpnd>;
+class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128DOpnd>;
+
+class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128BOpnd>;
+class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128HOpnd>;
+class XOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128WOpnd>;
+class XOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128DOpnd>;
+
+class XORI_B_DESC : MSA_I8_DESC_BASE<"xori.b", xor, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+// Instruction defs.
+def ADD_A_B : ADD_A_B_ENC, ADD_A_B_DESC;
+def ADD_A_H : ADD_A_H_ENC, ADD_A_H_DESC;
+def ADD_A_W : ADD_A_W_ENC, ADD_A_W_DESC;
+def ADD_A_D : ADD_A_D_ENC, ADD_A_D_DESC;
+
+def ADDS_A_B : ADDS_A_B_ENC, ADDS_A_B_DESC;
+def ADDS_A_H : ADDS_A_H_ENC, ADDS_A_H_DESC;
+def ADDS_A_W : ADDS_A_W_ENC, ADDS_A_W_DESC;
+def ADDS_A_D : ADDS_A_D_ENC, ADDS_A_D_DESC;
+
+def ADDS_S_B : ADDS_S_B_ENC, ADDS_S_B_DESC;
+def ADDS_S_H : ADDS_S_H_ENC, ADDS_S_H_DESC;
+def ADDS_S_W : ADDS_S_W_ENC, ADDS_S_W_DESC;
+def ADDS_S_D : ADDS_S_D_ENC, ADDS_S_D_DESC;
+
+def ADDS_U_B : ADDS_U_B_ENC, ADDS_U_B_DESC;
+def ADDS_U_H : ADDS_U_H_ENC, ADDS_U_H_DESC;
+def ADDS_U_W : ADDS_U_W_ENC, ADDS_U_W_DESC;
+def ADDS_U_D : ADDS_U_D_ENC, ADDS_U_D_DESC;
+
+def ADDV_B : ADDV_B_ENC, ADDV_B_DESC;
+def ADDV_H : ADDV_H_ENC, ADDV_H_DESC;
+def ADDV_W : ADDV_W_ENC, ADDV_W_DESC;
+def ADDV_D : ADDV_D_ENC, ADDV_D_DESC;
+
+def ADDVI_B : ADDVI_B_ENC, ADDVI_B_DESC;
+def ADDVI_H : ADDVI_H_ENC, ADDVI_H_DESC;
+def ADDVI_W : ADDVI_W_ENC, ADDVI_W_DESC;
+def ADDVI_D : ADDVI_D_ENC, ADDVI_D_DESC;
+
+def AND_V : AND_V_ENC, AND_V_DESC;
+def AND_V_H_PSEUDO : AND_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_W_PSEUDO : AND_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_D_PSEUDO : AND_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def ANDI_B : ANDI_B_ENC, ANDI_B_DESC;
+
+def ASUB_S_B : ASUB_S_B_ENC, ASUB_S_B_DESC;
+def ASUB_S_H : ASUB_S_H_ENC, ASUB_S_H_DESC;
+def ASUB_S_W : ASUB_S_W_ENC, ASUB_S_W_DESC;
+def ASUB_S_D : ASUB_S_D_ENC, ASUB_S_D_DESC;
+
+def ASUB_U_B : ASUB_U_B_ENC, ASUB_U_B_DESC;
+def ASUB_U_H : ASUB_U_H_ENC, ASUB_U_H_DESC;
+def ASUB_U_W : ASUB_U_W_ENC, ASUB_U_W_DESC;
+def ASUB_U_D : ASUB_U_D_ENC, ASUB_U_D_DESC;
+
+def AVE_S_B : AVE_S_B_ENC, AVE_S_B_DESC;
+def AVE_S_H : AVE_S_H_ENC, AVE_S_H_DESC;
+def AVE_S_W : AVE_S_W_ENC, AVE_S_W_DESC;
+def AVE_S_D : AVE_S_D_ENC, AVE_S_D_DESC;
+
+def AVE_U_B : AVE_U_B_ENC, AVE_U_B_DESC;
+def AVE_U_H : AVE_U_H_ENC, AVE_U_H_DESC;
+def AVE_U_W : AVE_U_W_ENC, AVE_U_W_DESC;
+def AVE_U_D : AVE_U_D_ENC, AVE_U_D_DESC;
+
+def AVER_S_B : AVER_S_B_ENC, AVER_S_B_DESC;
+def AVER_S_H : AVER_S_H_ENC, AVER_S_H_DESC;
+def AVER_S_W : AVER_S_W_ENC, AVER_S_W_DESC;
+def AVER_S_D : AVER_S_D_ENC, AVER_S_D_DESC;
+
+def AVER_U_B : AVER_U_B_ENC, AVER_U_B_DESC;
+def AVER_U_H : AVER_U_H_ENC, AVER_U_H_DESC;
+def AVER_U_W : AVER_U_W_ENC, AVER_U_W_DESC;
+def AVER_U_D : AVER_U_D_ENC, AVER_U_D_DESC;
+
+def BCLR_B : BCLR_B_ENC, BCLR_B_DESC;
+def BCLR_H : BCLR_H_ENC, BCLR_H_DESC;
+def BCLR_W : BCLR_W_ENC, BCLR_W_DESC;
+def BCLR_D : BCLR_D_ENC, BCLR_D_DESC;
+
+def BCLRI_B : BCLRI_B_ENC, BCLRI_B_DESC;
+def BCLRI_H : BCLRI_H_ENC, BCLRI_H_DESC;
+def BCLRI_W : BCLRI_W_ENC, BCLRI_W_DESC;
+def BCLRI_D : BCLRI_D_ENC, BCLRI_D_DESC;
+
+def BINSL_B : BINSL_B_ENC, BINSL_B_DESC;
+def BINSL_H : BINSL_H_ENC, BINSL_H_DESC;
+def BINSL_W : BINSL_W_ENC, BINSL_W_DESC;
+def BINSL_D : BINSL_D_ENC, BINSL_D_DESC;
+
+def BINSLI_B : BINSLI_B_ENC, BINSLI_B_DESC;
+def BINSLI_H : BINSLI_H_ENC, BINSLI_H_DESC;
+def BINSLI_W : BINSLI_W_ENC, BINSLI_W_DESC;
+def BINSLI_D : BINSLI_D_ENC, BINSLI_D_DESC;
+
+def BINSR_B : BINSR_B_ENC, BINSR_B_DESC;
+def BINSR_H : BINSR_H_ENC, BINSR_H_DESC;
+def BINSR_W : BINSR_W_ENC, BINSR_W_DESC;
+def BINSR_D : BINSR_D_ENC, BINSR_D_DESC;
+
+def BINSRI_B : BINSRI_B_ENC, BINSRI_B_DESC;
+def BINSRI_H : BINSRI_H_ENC, BINSRI_H_DESC;
+def BINSRI_W : BINSRI_W_ENC, BINSRI_W_DESC;
+def BINSRI_D : BINSRI_D_ENC, BINSRI_D_DESC;
+
+def BMNZ_V : BMNZ_V_ENC, BMNZ_V_DESC;
+
+def BMNZI_B : BMNZI_B_ENC, BMNZI_B_DESC;
+
+def BMZ_V : BMZ_V_ENC, BMZ_V_DESC;
+
+def BMZI_B : BMZI_B_ENC, BMZI_B_DESC;
+
+def BNEG_B : BNEG_B_ENC, BNEG_B_DESC;
+def BNEG_H : BNEG_H_ENC, BNEG_H_DESC;
+def BNEG_W : BNEG_W_ENC, BNEG_W_DESC;
+def BNEG_D : BNEG_D_ENC, BNEG_D_DESC;
+
+def BNEGI_B : BNEGI_B_ENC, BNEGI_B_DESC;
+def BNEGI_H : BNEGI_H_ENC, BNEGI_H_DESC;
+def BNEGI_W : BNEGI_W_ENC, BNEGI_W_DESC;
+def BNEGI_D : BNEGI_D_ENC, BNEGI_D_DESC;
+
+def BNZ_B : BNZ_B_ENC, BNZ_B_DESC;
+def BNZ_H : BNZ_H_ENC, BNZ_H_DESC;
+def BNZ_W : BNZ_W_ENC, BNZ_W_DESC;
+def BNZ_D : BNZ_D_ENC, BNZ_D_DESC;
+
+def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
+
+def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
+
+class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
+  MSAPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+            [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$wt, RO:$ws)))]>,
+  // Note that vselect and BSEL_V treat the condition operand the opposite way
+  // from each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
+                              MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
+  let Constraints = "$wd_in = $wd";
+}
+
+def BSEL_H_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128HOpnd, v8i16>;
+def BSEL_W_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4i32>;
+def BSEL_D_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2i64>;
+def BSEL_FW_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4f32>;
+def BSEL_FD_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2f64>;
+
+def BSELI_B : BSELI_B_ENC, BSELI_B_DESC;
+
+def BSET_B : BSET_B_ENC, BSET_B_DESC;
+def BSET_H : BSET_H_ENC, BSET_H_DESC;
+def BSET_W : BSET_W_ENC, BSET_W_DESC;
+def BSET_D : BSET_D_ENC, BSET_D_DESC;
+
+def BSETI_B : BSETI_B_ENC, BSETI_B_DESC;
+def BSETI_H : BSETI_H_ENC, BSETI_H_DESC;
+def BSETI_W : BSETI_W_ENC, BSETI_W_DESC;
+def BSETI_D : BSETI_D_ENC, BSETI_D_DESC;
+
+def BZ_B : BZ_B_ENC, BZ_B_DESC;
+def BZ_H : BZ_H_ENC, BZ_H_DESC;
+def BZ_W : BZ_W_ENC, BZ_W_DESC;
+def BZ_D : BZ_D_ENC, BZ_D_DESC;
+
+def BZ_V : BZ_V_ENC, BZ_V_DESC;
+
+def CEQ_B : CEQ_B_ENC, CEQ_B_DESC;
+def CEQ_H : CEQ_H_ENC, CEQ_H_DESC;
+def CEQ_W : CEQ_W_ENC, CEQ_W_DESC;
+def CEQ_D : CEQ_D_ENC, CEQ_D_DESC;
+
+def CEQI_B : CEQI_B_ENC, CEQI_B_DESC;
+def CEQI_H : CEQI_H_ENC, CEQI_H_DESC;
+def CEQI_W : CEQI_W_ENC, CEQI_W_DESC;
+def CEQI_D : CEQI_D_ENC, CEQI_D_DESC;
+
+def CFCMSA : CFCMSA_ENC, CFCMSA_DESC;
+
+def CLE_S_B : CLE_S_B_ENC, CLE_S_B_DESC;
+def CLE_S_H : CLE_S_H_ENC, CLE_S_H_DESC;
+def CLE_S_W : CLE_S_W_ENC, CLE_S_W_DESC;
+def CLE_S_D : CLE_S_D_ENC, CLE_S_D_DESC;
+
+def CLE_U_B : CLE_U_B_ENC, CLE_U_B_DESC;
+def CLE_U_H : CLE_U_H_ENC, CLE_U_H_DESC;
+def CLE_U_W : CLE_U_W_ENC, CLE_U_W_DESC;
+def CLE_U_D : CLE_U_D_ENC, CLE_U_D_DESC;
+
+def CLEI_S_B : CLEI_S_B_ENC, CLEI_S_B_DESC;
+def CLEI_S_H : CLEI_S_H_ENC, CLEI_S_H_DESC;
+def CLEI_S_W : CLEI_S_W_ENC, CLEI_S_W_DESC;
+def CLEI_S_D : CLEI_S_D_ENC, CLEI_S_D_DESC;
+
+def CLEI_U_B : CLEI_U_B_ENC, CLEI_U_B_DESC;
+def CLEI_U_H : CLEI_U_H_ENC, CLEI_U_H_DESC;
+def CLEI_U_W : CLEI_U_W_ENC, CLEI_U_W_DESC;
+def CLEI_U_D : CLEI_U_D_ENC, CLEI_U_D_DESC;
+
+def CLT_S_B : CLT_S_B_ENC, CLT_S_B_DESC;
+def CLT_S_H : CLT_S_H_ENC, CLT_S_H_DESC;
+def CLT_S_W : CLT_S_W_ENC, CLT_S_W_DESC;
+def CLT_S_D : CLT_S_D_ENC, CLT_S_D_DESC;
+
+def CLT_U_B : CLT_U_B_ENC, CLT_U_B_DESC;
+def CLT_U_H : CLT_U_H_ENC, CLT_U_H_DESC;
+def CLT_U_W : CLT_U_W_ENC, CLT_U_W_DESC;
+def CLT_U_D : CLT_U_D_ENC, CLT_U_D_DESC;
+
+def CLTI_S_B : CLTI_S_B_ENC, CLTI_S_B_DESC;
+def CLTI_S_H : CLTI_S_H_ENC, CLTI_S_H_DESC;
+def CLTI_S_W : CLTI_S_W_ENC, CLTI_S_W_DESC;
+def CLTI_S_D : CLTI_S_D_ENC, CLTI_S_D_DESC;
+
+def CLTI_U_B : CLTI_U_B_ENC, CLTI_U_B_DESC;
+def CLTI_U_H : CLTI_U_H_ENC, CLTI_U_H_DESC;
+def CLTI_U_W : CLTI_U_W_ENC, CLTI_U_W_DESC;
+def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
+
+def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
+def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
+def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC;
+
+def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
+def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
+def COPY_U_D : COPY_U_D_ENC, COPY_U_D_DESC;
+
+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
+
+def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
+
+def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
+def DIV_S_H : DIV_S_H_ENC, DIV_S_H_DESC;
+def DIV_S_W : DIV_S_W_ENC, DIV_S_W_DESC;
+def DIV_S_D : DIV_S_D_ENC, DIV_S_D_DESC;
+
+def DIV_U_B : DIV_U_B_ENC, DIV_U_B_DESC;
+def DIV_U_H : DIV_U_H_ENC, DIV_U_H_DESC;
+def DIV_U_W : DIV_U_W_ENC, DIV_U_W_DESC;
+def DIV_U_D : DIV_U_D_ENC, DIV_U_D_DESC;
+
+def DOTP_S_H : DOTP_S_H_ENC, DOTP_S_H_DESC;
+def DOTP_S_W : DOTP_S_W_ENC, DOTP_S_W_DESC;
+def DOTP_S_D : DOTP_S_D_ENC, DOTP_S_D_DESC;
+
+def DOTP_U_H : DOTP_U_H_ENC, DOTP_U_H_DESC;
+def DOTP_U_W : DOTP_U_W_ENC, DOTP_U_W_DESC;
+def DOTP_U_D : DOTP_U_D_ENC, DOTP_U_D_DESC;
+
+def DPADD_S_H : DPADD_S_H_ENC, DPADD_S_H_DESC;
+def DPADD_S_W : DPADD_S_W_ENC, DPADD_S_W_DESC;
+def DPADD_S_D : DPADD_S_D_ENC, DPADD_S_D_DESC;
+
+def DPADD_U_H : DPADD_U_H_ENC, DPADD_U_H_DESC;
+def DPADD_U_W : DPADD_U_W_ENC, DPADD_U_W_DESC;
+def DPADD_U_D : DPADD_U_D_ENC, DPADD_U_D_DESC;
+
+def DPSUB_S_H : DPSUB_S_H_ENC, DPSUB_S_H_DESC;
+def DPSUB_S_W : DPSUB_S_W_ENC, DPSUB_S_W_DESC;
+def DPSUB_S_D : DPSUB_S_D_ENC, DPSUB_S_D_DESC;
+
+def DPSUB_U_H : DPSUB_U_H_ENC, DPSUB_U_H_DESC;
+def DPSUB_U_W : DPSUB_U_W_ENC, DPSUB_U_W_DESC;
+def DPSUB_U_D : DPSUB_U_D_ENC, DPSUB_U_D_DESC;
+
+def FADD_W : FADD_W_ENC, FADD_W_DESC;
+def FADD_D : FADD_D_ENC, FADD_D_DESC;
+
+def FCAF_W : FCAF_W_ENC, FCAF_W_DESC;
+def FCAF_D : FCAF_D_ENC, FCAF_D_DESC;
+
+def FCEQ_W : FCEQ_W_ENC, FCEQ_W_DESC;
+def FCEQ_D : FCEQ_D_ENC, FCEQ_D_DESC;
+
+def FCLE_W : FCLE_W_ENC, FCLE_W_DESC;
+def FCLE_D : FCLE_D_ENC, FCLE_D_DESC;
+
+def FCLT_W : FCLT_W_ENC, FCLT_W_DESC;
+def FCLT_D : FCLT_D_ENC, FCLT_D_DESC;
+
+def FCLASS_W : FCLASS_W_ENC, FCLASS_W_DESC;
+def FCLASS_D : FCLASS_D_ENC, FCLASS_D_DESC;
+
+def FCNE_W : FCNE_W_ENC, FCNE_W_DESC;
+def FCNE_D : FCNE_D_ENC, FCNE_D_DESC;
+
+def FCOR_W : FCOR_W_ENC, FCOR_W_DESC;
+def FCOR_D : FCOR_D_ENC, FCOR_D_DESC;
+
+def FCUEQ_W : FCUEQ_W_ENC, FCUEQ_W_DESC;
+def FCUEQ_D : FCUEQ_D_ENC, FCUEQ_D_DESC;
+
+def FCULE_W : FCULE_W_ENC, FCULE_W_DESC;
+def FCULE_D : FCULE_D_ENC, FCULE_D_DESC;
+
+def FCULT_W : FCULT_W_ENC, FCULT_W_DESC;
+def FCULT_D : FCULT_D_ENC, FCULT_D_DESC;
+
+def FCUN_W : FCUN_W_ENC, FCUN_W_DESC;
+def FCUN_D : FCUN_D_ENC, FCUN_D_DESC;
+
+def FCUNE_W : FCUNE_W_ENC, FCUNE_W_DESC;
+def FCUNE_D : FCUNE_D_ENC, FCUNE_D_DESC;
+
+def FDIV_W : FDIV_W_ENC, FDIV_W_DESC;
+def FDIV_D : FDIV_D_ENC, FDIV_D_DESC;
+
+def FEXDO_H : FEXDO_H_ENC, FEXDO_H_DESC;
+def FEXDO_W : FEXDO_W_ENC, FEXDO_W_DESC;
+
+def FEXP2_W : FEXP2_W_ENC, FEXP2_W_DESC;
+def FEXP2_D : FEXP2_D_ENC, FEXP2_D_DESC;
+def FEXP2_W_1_PSEUDO : FEXP2_W_1_PSEUDO_DESC;
+def FEXP2_D_1_PSEUDO : FEXP2_D_1_PSEUDO_DESC;
+
+def FEXUPL_W : FEXUPL_W_ENC, FEXUPL_W_DESC;
+def FEXUPL_D : FEXUPL_D_ENC, FEXUPL_D_DESC;
+
+def FEXUPR_W : FEXUPR_W_ENC, FEXUPR_W_DESC;
+def FEXUPR_D : FEXUPR_D_ENC, FEXUPR_D_DESC;
+
+def FFINT_S_W : FFINT_S_W_ENC, FFINT_S_W_DESC;
+def FFINT_S_D : FFINT_S_D_ENC, FFINT_S_D_DESC;
+
+def FFINT_U_W : FFINT_U_W_ENC, FFINT_U_W_DESC;
+def FFINT_U_D : FFINT_U_D_ENC, FFINT_U_D_DESC;
+
+def FFQL_W : FFQL_W_ENC, FFQL_W_DESC;
+def FFQL_D : FFQL_D_ENC, FFQL_D_DESC;
+
+def FFQR_W : FFQR_W_ENC, FFQR_W_DESC;
+def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
+
+def FILL_B : FILL_B_ENC, FILL_B_DESC;
+def FILL_H : FILL_H_ENC, FILL_H_DESC;
+def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_D : FILL_D_ENC, FILL_D_DESC;
+def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
+def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
+
+def FLOG2_W : FLOG2_W_ENC, FLOG2_W_DESC;
+def FLOG2_D : FLOG2_D_ENC, FLOG2_D_DESC;
+
+def FMADD_W : FMADD_W_ENC, FMADD_W_DESC;
+def FMADD_D : FMADD_D_ENC, FMADD_D_DESC;
+
+def FMAX_W : FMAX_W_ENC, FMAX_W_DESC;
+def FMAX_D : FMAX_D_ENC, FMAX_D_DESC;
+
+def FMAX_A_W : FMAX_A_W_ENC, FMAX_A_W_DESC;
+def FMAX_A_D : FMAX_A_D_ENC, FMAX_A_D_DESC;
+
+def FMIN_W : FMIN_W_ENC, FMIN_W_DESC;
+def FMIN_D : FMIN_D_ENC, FMIN_D_DESC;
+
+def FMIN_A_W : FMIN_A_W_ENC, FMIN_A_W_DESC;
+def FMIN_A_D : FMIN_A_D_ENC, FMIN_A_D_DESC;
+
+def FMSUB_W : FMSUB_W_ENC, FMSUB_W_DESC;
+def FMSUB_D : FMSUB_D_ENC, FMSUB_D_DESC;
+
+def FMUL_W : FMUL_W_ENC, FMUL_W_DESC;
+def FMUL_D : FMUL_D_ENC, FMUL_D_DESC;
+
+def FRINT_W : FRINT_W_ENC, FRINT_W_DESC;
+def FRINT_D : FRINT_D_ENC, FRINT_D_DESC;
+
+def FRCP_W : FRCP_W_ENC, FRCP_W_DESC;
+def FRCP_D : FRCP_D_ENC, FRCP_D_DESC;
+
+def FRSQRT_W : FRSQRT_W_ENC, FRSQRT_W_DESC;
+def FRSQRT_D : FRSQRT_D_ENC, FRSQRT_D_DESC;
+
+def FSAF_W : FSAF_W_ENC, FSAF_W_DESC;
+def FSAF_D : FSAF_D_ENC, FSAF_D_DESC;
+
+def FSEQ_W : FSEQ_W_ENC, FSEQ_W_DESC;
+def FSEQ_D : FSEQ_D_ENC, FSEQ_D_DESC;
+
+def FSLE_W : FSLE_W_ENC, FSLE_W_DESC;
+def FSLE_D : FSLE_D_ENC, FSLE_D_DESC;
+
+def FSLT_W : FSLT_W_ENC, FSLT_W_DESC;
+def FSLT_D : FSLT_D_ENC, FSLT_D_DESC;
+
+def FSNE_W : FSNE_W_ENC, FSNE_W_DESC;
+def FSNE_D : FSNE_D_ENC, FSNE_D_DESC;
+
+def FSOR_W : FSOR_W_ENC, FSOR_W_DESC;
+def FSOR_D : FSOR_D_ENC, FSOR_D_DESC;
+
+def FSQRT_W : FSQRT_W_ENC, FSQRT_W_DESC;
+def FSQRT_D : FSQRT_D_ENC, FSQRT_D_DESC;
+
+def FSUB_W : FSUB_W_ENC, FSUB_W_DESC;
+def FSUB_D : FSUB_D_ENC, FSUB_D_DESC;
+
+def FSUEQ_W : FSUEQ_W_ENC, FSUEQ_W_DESC;
+def FSUEQ_D : FSUEQ_D_ENC, FSUEQ_D_DESC;
+
+def FSULE_W : FSULE_W_ENC, FSULE_W_DESC;
+def FSULE_D : FSULE_D_ENC, FSULE_D_DESC;
+
+def FSULT_W : FSULT_W_ENC, FSULT_W_DESC;
+def FSULT_D : FSULT_D_ENC, FSULT_D_DESC;
+
+def FSUN_W : FSUN_W_ENC, FSUN_W_DESC;
+def FSUN_D : FSUN_D_ENC, FSUN_D_DESC;
+
+def FSUNE_W : FSUNE_W_ENC, FSUNE_W_DESC;
+def FSUNE_D : FSUNE_D_ENC, FSUNE_D_DESC;
+
+def FTINT_S_W : FTINT_S_W_ENC, FTINT_S_W_DESC;
+def FTINT_S_D : FTINT_S_D_ENC, FTINT_S_D_DESC;
+
+def FTINT_U_W : FTINT_U_W_ENC, FTINT_U_W_DESC;
+def FTINT_U_D : FTINT_U_D_ENC, FTINT_U_D_DESC;
+
+def FTQ_H : FTQ_H_ENC, FTQ_H_DESC;
+def FTQ_W : FTQ_W_ENC, FTQ_W_DESC;
+
+def FTRUNC_S_W : FTRUNC_S_W_ENC, FTRUNC_S_W_DESC;
+def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
+
+def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
+def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+
+def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
+def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
+def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
+
+def HADD_U_H : HADD_U_H_ENC, HADD_U_H_DESC;
+def HADD_U_W : HADD_U_W_ENC, HADD_U_W_DESC;
+def HADD_U_D : HADD_U_D_ENC, HADD_U_D_DESC;
+
+def HSUB_S_H : HSUB_S_H_ENC, HSUB_S_H_DESC;
+def HSUB_S_W : HSUB_S_W_ENC, HSUB_S_W_DESC;
+def HSUB_S_D : HSUB_S_D_ENC, HSUB_S_D_DESC;
+
+def HSUB_U_H : HSUB_U_H_ENC, HSUB_U_H_DESC;
+def HSUB_U_W : HSUB_U_W_ENC, HSUB_U_W_DESC;
+def HSUB_U_D : HSUB_U_D_ENC, HSUB_U_D_DESC;
+
+def ILVEV_B : ILVEV_B_ENC, ILVEV_B_DESC;
+def ILVEV_H : ILVEV_H_ENC, ILVEV_H_DESC;
+def ILVEV_W : ILVEV_W_ENC, ILVEV_W_DESC;
+def ILVEV_D : ILVEV_D_ENC, ILVEV_D_DESC;
+
+def ILVL_B : ILVL_B_ENC, ILVL_B_DESC;
+def ILVL_H : ILVL_H_ENC, ILVL_H_DESC;
+def ILVL_W : ILVL_W_ENC, ILVL_W_DESC;
+def ILVL_D : ILVL_D_ENC, ILVL_D_DESC;
+
+def ILVOD_B : ILVOD_B_ENC, ILVOD_B_DESC;
+def ILVOD_H : ILVOD_H_ENC, ILVOD_H_DESC;
+def ILVOD_W : ILVOD_W_ENC, ILVOD_W_DESC;
+def ILVOD_D : ILVOD_D_ENC, ILVOD_D_DESC;
+
+def ILVR_B : ILVR_B_ENC, ILVR_B_DESC;
+def ILVR_H : ILVR_H_ENC, ILVR_H_DESC;
+def ILVR_W : ILVR_W_ENC, ILVR_W_DESC;
+def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
+
+def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
+def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
+def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+def INSERT_D : INSERT_D_ENC, INSERT_D_DESC;
+
+// INSERT_FW_PSEUDO defined after INSVE_W
+// INSERT_FD_PSEUDO defined after INSVE_D
+
+// There is a fourth operand that is not present in the encoding. Use a
+// custom decoder to get a chance to add it.
+let DecoderMethod = "DecodeINSVE_DF" in {
+  def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+  def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+  def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+  def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+}
+
+def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
+def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
+
+def INSERT_B_VIDX_PSEUDO : INSERT_B_VIDX_PSEUDO_DESC;
+def INSERT_H_VIDX_PSEUDO : INSERT_H_VIDX_PSEUDO_DESC;
+def INSERT_W_VIDX_PSEUDO : INSERT_W_VIDX_PSEUDO_DESC;
+def INSERT_D_VIDX_PSEUDO : INSERT_D_VIDX_PSEUDO_DESC;
+def INSERT_FW_VIDX_PSEUDO : INSERT_FW_VIDX_PSEUDO_DESC;
+def INSERT_FD_VIDX_PSEUDO : INSERT_FD_VIDX_PSEUDO_DESC;
+
+def LD_B: LD_B_ENC, LD_B_DESC;
+def LD_H: LD_H_ENC, LD_H_DESC;
+def LD_W: LD_W_ENC, LD_W_DESC;
+def LD_D: LD_D_ENC, LD_D_DESC;
+
+def LDI_B : LDI_B_ENC, LDI_B_DESC;
+def LDI_H : LDI_H_ENC, LDI_H_DESC;
+def LDI_W : LDI_W_ENC, LDI_W_DESC;
+def LDI_D : LDI_D_ENC, LDI_D_DESC;
+
+def LSA : LSA_ENC, LSA_DESC;
+def DLSA : DLSA_ENC, DLSA_DESC;
+
+def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
+def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
+
+def MADDR_Q_H : MADDR_Q_H_ENC, MADDR_Q_H_DESC;
+def MADDR_Q_W : MADDR_Q_W_ENC, MADDR_Q_W_DESC;
+
+def MADDV_B : MADDV_B_ENC, MADDV_B_DESC;
+def MADDV_H : MADDV_H_ENC, MADDV_H_DESC;
+def MADDV_W : MADDV_W_ENC, MADDV_W_DESC;
+def MADDV_D : MADDV_D_ENC, MADDV_D_DESC;
+
+def MAX_A_B : MAX_A_B_ENC, MAX_A_B_DESC;
+def MAX_A_H : MAX_A_H_ENC, MAX_A_H_DESC;
+def MAX_A_W : MAX_A_W_ENC, MAX_A_W_DESC;
+def MAX_A_D : MAX_A_D_ENC, MAX_A_D_DESC;
+
+def MAX_S_B : MAX_S_B_ENC, MAX_S_B_DESC;
+def MAX_S_H : MAX_S_H_ENC, MAX_S_H_DESC;
+def MAX_S_W : MAX_S_W_ENC, MAX_S_W_DESC;
+def MAX_S_D : MAX_S_D_ENC, MAX_S_D_DESC;
+
+def MAX_U_B : MAX_U_B_ENC, MAX_U_B_DESC;
+def MAX_U_H : MAX_U_H_ENC, MAX_U_H_DESC;
+def MAX_U_W : MAX_U_W_ENC, MAX_U_W_DESC;
+def MAX_U_D : MAX_U_D_ENC, MAX_U_D_DESC;
+
+def MAXI_S_B : MAXI_S_B_ENC, MAXI_S_B_DESC;
+def MAXI_S_H : MAXI_S_H_ENC, MAXI_S_H_DESC;
+def MAXI_S_W : MAXI_S_W_ENC, MAXI_S_W_DESC;
+def MAXI_S_D : MAXI_S_D_ENC, MAXI_S_D_DESC;
+
+def MAXI_U_B : MAXI_U_B_ENC, MAXI_U_B_DESC;
+def MAXI_U_H : MAXI_U_H_ENC, MAXI_U_H_DESC;
+def MAXI_U_W : MAXI_U_W_ENC, MAXI_U_W_DESC;
+def MAXI_U_D : MAXI_U_D_ENC, MAXI_U_D_DESC;
+
+def MIN_A_B : MIN_A_B_ENC, MIN_A_B_DESC;
+def MIN_A_H : MIN_A_H_ENC, MIN_A_H_DESC;
+def MIN_A_W : MIN_A_W_ENC, MIN_A_W_DESC;
+def MIN_A_D : MIN_A_D_ENC, MIN_A_D_DESC;
+
+def MIN_S_B : MIN_S_B_ENC, MIN_S_B_DESC;
+def MIN_S_H : MIN_S_H_ENC, MIN_S_H_DESC;
+def MIN_S_W : MIN_S_W_ENC, MIN_S_W_DESC;
+def MIN_S_D : MIN_S_D_ENC, MIN_S_D_DESC;
+
+def MIN_U_B : MIN_U_B_ENC, MIN_U_B_DESC;
+def MIN_U_H : MIN_U_H_ENC, MIN_U_H_DESC;
+def MIN_U_W : MIN_U_W_ENC, MIN_U_W_DESC;
+def MIN_U_D : MIN_U_D_ENC, MIN_U_D_DESC;
+
+def MINI_S_B : MINI_S_B_ENC, MINI_S_B_DESC;
+def MINI_S_H : MINI_S_H_ENC, MINI_S_H_DESC;
+def MINI_S_W : MINI_S_W_ENC, MINI_S_W_DESC;
+def MINI_S_D : MINI_S_D_ENC, MINI_S_D_DESC;
+
+def MINI_U_B : MINI_U_B_ENC, MINI_U_B_DESC;
+def MINI_U_H : MINI_U_H_ENC, MINI_U_H_DESC;
+def MINI_U_W : MINI_U_W_ENC, MINI_U_W_DESC;
+def MINI_U_D : MINI_U_D_ENC, MINI_U_D_DESC;
+
+def MOD_S_B : MOD_S_B_ENC, MOD_S_B_DESC;
+def MOD_S_H : MOD_S_H_ENC, MOD_S_H_DESC;
+def MOD_S_W : MOD_S_W_ENC, MOD_S_W_DESC;
+def MOD_S_D : MOD_S_D_ENC, MOD_S_D_DESC;
+
+def MOD_U_B : MOD_U_B_ENC, MOD_U_B_DESC;
+def MOD_U_H : MOD_U_H_ENC, MOD_U_H_DESC;
+def MOD_U_W : MOD_U_W_ENC, MOD_U_W_DESC;
+def MOD_U_D : MOD_U_D_ENC, MOD_U_D_DESC;
+
+def MOVE_V : MOVE_V_ENC, MOVE_V_DESC;
+
+def MSUB_Q_H : MSUB_Q_H_ENC, MSUB_Q_H_DESC;
+def MSUB_Q_W : MSUB_Q_W_ENC, MSUB_Q_W_DESC;
+
+def MSUBR_Q_H : MSUBR_Q_H_ENC, MSUBR_Q_H_DESC;
+def MSUBR_Q_W : MSUBR_Q_W_ENC, MSUBR_Q_W_DESC;
+
+def MSUBV_B : MSUBV_B_ENC, MSUBV_B_DESC;
+def MSUBV_H : MSUBV_H_ENC, MSUBV_H_DESC;
+def MSUBV_W : MSUBV_W_ENC, MSUBV_W_DESC;
+def MSUBV_D : MSUBV_D_ENC, MSUBV_D_DESC;
+
+def MUL_Q_H : MUL_Q_H_ENC, MUL_Q_H_DESC;
+def MUL_Q_W : MUL_Q_W_ENC, MUL_Q_W_DESC;
+
+def MULR_Q_H : MULR_Q_H_ENC, MULR_Q_H_DESC;
+def MULR_Q_W : MULR_Q_W_ENC, MULR_Q_W_DESC;
+
+def MULV_B : MULV_B_ENC, MULV_B_DESC;
+def MULV_H : MULV_H_ENC, MULV_H_DESC;
+def MULV_W : MULV_W_ENC, MULV_W_DESC;
+def MULV_D : MULV_D_ENC, MULV_D_DESC;
+
+def NLOC_B : NLOC_B_ENC, NLOC_B_DESC;
+def NLOC_H : NLOC_H_ENC, NLOC_H_DESC;
+def NLOC_W : NLOC_W_ENC, NLOC_W_DESC;
+def NLOC_D : NLOC_D_ENC, NLOC_D_DESC;
+
+def NLZC_B : NLZC_B_ENC, NLZC_B_DESC;
+def NLZC_H : NLZC_H_ENC, NLZC_H_DESC;
+def NLZC_W : NLZC_W_ENC, NLZC_W_DESC;
+def NLZC_D : NLZC_D_ENC, NLZC_D_DESC;
+
+def NOR_V : NOR_V_ENC, NOR_V_DESC;
+def NOR_V_H_PSEUDO : NOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_W_PSEUDO : NOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_D_PSEUDO : NOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def NORI_B : NORI_B_ENC, NORI_B_DESC;
+
+def OR_V : OR_V_ENC, OR_V_DESC;
+def OR_V_H_PSEUDO : OR_V_H_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_W_PSEUDO : OR_V_W_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_D_PSEUDO : OR_V_D_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+
+def ORI_B : ORI_B_ENC, ORI_B_DESC;
+
+def PCKEV_B : PCKEV_B_ENC, PCKEV_B_DESC;
+def PCKEV_H : PCKEV_H_ENC, PCKEV_H_DESC;
+def PCKEV_W : PCKEV_W_ENC, PCKEV_W_DESC;
+def PCKEV_D : PCKEV_D_ENC, PCKEV_D_DESC;
+
+def PCKOD_B : PCKOD_B_ENC, PCKOD_B_DESC;
+def PCKOD_H : PCKOD_H_ENC, PCKOD_H_DESC;
+def PCKOD_W : PCKOD_W_ENC, PCKOD_W_DESC;
+def PCKOD_D : PCKOD_D_ENC, PCKOD_D_DESC;
+
+def PCNT_B : PCNT_B_ENC, PCNT_B_DESC;
+def PCNT_H : PCNT_H_ENC, PCNT_H_DESC;
+def PCNT_W : PCNT_W_ENC, PCNT_W_DESC;
+def PCNT_D : PCNT_D_ENC, PCNT_D_DESC;
+
+def SAT_S_B : SAT_S_B_ENC, SAT_S_B_DESC;
+def SAT_S_H : SAT_S_H_ENC, SAT_S_H_DESC;
+def SAT_S_W : SAT_S_W_ENC, SAT_S_W_DESC;
+def SAT_S_D : SAT_S_D_ENC, SAT_S_D_DESC;
+
+def SAT_U_B : SAT_U_B_ENC, SAT_U_B_DESC;
+def SAT_U_H : SAT_U_H_ENC, SAT_U_H_DESC;
+def SAT_U_W : SAT_U_W_ENC, SAT_U_W_DESC;
+def SAT_U_D : SAT_U_D_ENC, SAT_U_D_DESC;
+
+def SHF_B : SHF_B_ENC, SHF_B_DESC;
+def SHF_H : SHF_H_ENC, SHF_H_DESC;
+def SHF_W : SHF_W_ENC, SHF_W_DESC;
+
+def SLD_B : SLD_B_ENC, SLD_B_DESC;
+def SLD_H : SLD_H_ENC, SLD_H_DESC;
+def SLD_W : SLD_W_ENC, SLD_W_DESC;
+def SLD_D : SLD_D_ENC, SLD_D_DESC;
+
+def SLDI_B : SLDI_B_ENC, SLDI_B_DESC;
+def SLDI_H : SLDI_H_ENC, SLDI_H_DESC;
+def SLDI_W : SLDI_W_ENC, SLDI_W_DESC;
+def SLDI_D : SLDI_D_ENC, SLDI_D_DESC;
+
+def SLL_B : SLL_B_ENC, SLL_B_DESC;
+def SLL_H : SLL_H_ENC, SLL_H_DESC;
+def SLL_W : SLL_W_ENC, SLL_W_DESC;
+def SLL_D : SLL_D_ENC, SLL_D_DESC;
+
+def SLLI_B : SLLI_B_ENC, SLLI_B_DESC;
+def SLLI_H : SLLI_H_ENC, SLLI_H_DESC;
+def SLLI_W : SLLI_W_ENC, SLLI_W_DESC;
+def SLLI_D : SLLI_D_ENC, SLLI_D_DESC;
+
+def SPLAT_B : SPLAT_B_ENC, SPLAT_B_DESC;
+def SPLAT_H : SPLAT_H_ENC, SPLAT_H_DESC;
+def SPLAT_W : SPLAT_W_ENC, SPLAT_W_DESC;
+def SPLAT_D : SPLAT_D_ENC, SPLAT_D_DESC;
+
+def SPLATI_B : SPLATI_B_ENC, SPLATI_B_DESC;
+def SPLATI_H : SPLATI_H_ENC, SPLATI_H_DESC;
+def SPLATI_W : SPLATI_W_ENC, SPLATI_W_DESC;
+def SPLATI_D : SPLATI_D_ENC, SPLATI_D_DESC;
+
+def SRA_B : SRA_B_ENC, SRA_B_DESC;
+def SRA_H : SRA_H_ENC, SRA_H_DESC;
+def SRA_W : SRA_W_ENC, SRA_W_DESC;
+def SRA_D : SRA_D_ENC, SRA_D_DESC;
+
+def SRAI_B : SRAI_B_ENC, SRAI_B_DESC;
+def SRAI_H : SRAI_H_ENC, SRAI_H_DESC;
+def SRAI_W : SRAI_W_ENC, SRAI_W_DESC;
+def SRAI_D : SRAI_D_ENC, SRAI_D_DESC;
+
+def SRAR_B : SRAR_B_ENC, SRAR_B_DESC;
+def SRAR_H : SRAR_H_ENC, SRAR_H_DESC;
+def SRAR_W : SRAR_W_ENC, SRAR_W_DESC;
+def SRAR_D : SRAR_D_ENC, SRAR_D_DESC;
+
+def SRARI_B : SRARI_B_ENC, SRARI_B_DESC;
+def SRARI_H : SRARI_H_ENC, SRARI_H_DESC;
+def SRARI_W : SRARI_W_ENC, SRARI_W_DESC;
+def SRARI_D : SRARI_D_ENC, SRARI_D_DESC;
+
+def SRL_B : SRL_B_ENC, SRL_B_DESC;
+def SRL_H : SRL_H_ENC, SRL_H_DESC;
+def SRL_W : SRL_W_ENC, SRL_W_DESC;
+def SRL_D : SRL_D_ENC, SRL_D_DESC;
+
+def SRLI_B : SRLI_B_ENC, SRLI_B_DESC;
+def SRLI_H : SRLI_H_ENC, SRLI_H_DESC;
+def SRLI_W : SRLI_W_ENC, SRLI_W_DESC;
+def SRLI_D : SRLI_D_ENC, SRLI_D_DESC;
+
+def SRLR_B : SRLR_B_ENC, SRLR_B_DESC;
+def SRLR_H : SRLR_H_ENC, SRLR_H_DESC;
+def SRLR_W : SRLR_W_ENC, SRLR_W_DESC;
+def SRLR_D : SRLR_D_ENC, SRLR_D_DESC;
+
+def SRLRI_B : SRLRI_B_ENC, SRLRI_B_DESC;
+def SRLRI_H : SRLRI_H_ENC, SRLRI_H_DESC;
+def SRLRI_W : SRLRI_W_ENC, SRLRI_W_DESC;
+def SRLRI_D : SRLRI_D_ENC, SRLRI_D_DESC;
+
+def ST_B: ST_B_ENC, ST_B_DESC;
+def ST_H: ST_H_ENC, ST_H_DESC;
+def ST_W: ST_W_ENC, ST_W_DESC;
+def ST_D: ST_D_ENC, ST_D_DESC;
+
+def SUBS_S_B : SUBS_S_B_ENC, SUBS_S_B_DESC;
+def SUBS_S_H : SUBS_S_H_ENC, SUBS_S_H_DESC;
+def SUBS_S_W : SUBS_S_W_ENC, SUBS_S_W_DESC;
+def SUBS_S_D : SUBS_S_D_ENC, SUBS_S_D_DESC;
+
+def SUBS_U_B : SUBS_U_B_ENC, SUBS_U_B_DESC;
+def SUBS_U_H : SUBS_U_H_ENC, SUBS_U_H_DESC;
+def SUBS_U_W : SUBS_U_W_ENC, SUBS_U_W_DESC;
+def SUBS_U_D : SUBS_U_D_ENC, SUBS_U_D_DESC;
+
+def SUBSUS_U_B : SUBSUS_U_B_ENC, SUBSUS_U_B_DESC;
+def SUBSUS_U_H : SUBSUS_U_H_ENC, SUBSUS_U_H_DESC;
+def SUBSUS_U_W : SUBSUS_U_W_ENC, SUBSUS_U_W_DESC;
+def SUBSUS_U_D : SUBSUS_U_D_ENC, SUBSUS_U_D_DESC;
+
+def SUBSUU_S_B : SUBSUU_S_B_ENC, SUBSUU_S_B_DESC;
+def SUBSUU_S_H : SUBSUU_S_H_ENC, SUBSUU_S_H_DESC;
+def SUBSUU_S_W : SUBSUU_S_W_ENC, SUBSUU_S_W_DESC;
+def SUBSUU_S_D : SUBSUU_S_D_ENC, SUBSUU_S_D_DESC;
+
+def SUBV_B : SUBV_B_ENC, SUBV_B_DESC;
+def SUBV_H : SUBV_H_ENC, SUBV_H_DESC;
+def SUBV_W : SUBV_W_ENC, SUBV_W_DESC;
+def SUBV_D : SUBV_D_ENC, SUBV_D_DESC;
+
+def SUBVI_B : SUBVI_B_ENC, SUBVI_B_DESC;
+def SUBVI_H : SUBVI_H_ENC, SUBVI_H_DESC;
+def SUBVI_W : SUBVI_W_ENC, SUBVI_W_DESC;
+def SUBVI_D : SUBVI_D_ENC, SUBVI_D_DESC;
+
+def VSHF_B : VSHF_B_ENC, VSHF_B_DESC;
+def VSHF_H : VSHF_H_ENC, VSHF_H_DESC;
+def VSHF_W : VSHF_W_ENC, VSHF_W_DESC;
+def VSHF_D : VSHF_D_ENC, VSHF_D_DESC;
+
+def XOR_V : XOR_V_ENC, XOR_V_DESC;
+def XOR_V_H_PSEUDO : XOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_W_PSEUDO : XOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_D_PSEUDO : XOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def XORI_B : XORI_B_ENC, XORI_B_DESC;
+
+// Patterns.
+class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
+  Pat<pattern, result>, Requires<pred>;
+
+def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
+             (COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
+
+def : MSAPat<(v8f16 (load addrimm10:$addr)), (LD_H addrimm10:$addr)>;
+def : MSAPat<(v4f32 (load addrimm10:$addr)), (LD_W addrimm10:$addr)>;
+def : MSAPat<(v2f64 (load addrimm10:$addr)), (LD_D addrimm10:$addr)>;
+
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrimm10:$addr),
+                   (ST_H MSA128H:$ws, addrimm10:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrimm10:$addr),
+                   (ST_W MSA128W:$ws, addrimm10:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrimm10:$addr),
+                   (ST_D MSA128D:$ws, addrimm10:$addr)>;
+
+class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
+                                RegisterOperand ROWS = ROWD,
+                                InstrItinClass itin = NoItinerary> :
+  MSAPseudo<(outs ROWD:$wd),
+            (ins ROWS:$ws),
+            [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+  InstrItinClass Itinerary = itin;
+}
+def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
+             PseudoInstExpansion<(FMAX_A_W MSA128WOpnd:$wd, MSA128WOpnd:$ws,
+                                           MSA128WOpnd:$ws)>;
+def FABS_D : MSA_FABS_PSEUDO_DESC_BASE<MSA128DOpnd>,
+             PseudoInstExpansion<(FMAX_A_D MSA128DOpnd:$wd, MSA128DOpnd:$ws,
+                                           MSA128DOpnd:$ws)>;
+
+class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
+                       RegisterClass DstRC, list<Predicate> preds = [HasMSA]> :
+   MSAPat<(DstVT (bitconvert SrcVT:$src)),
+          (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
+
+// These are endian-independent because the element size doesnt change
+def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
+def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
+def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
+def : MSABitconvertPat<v8f16, v8i16, MSA128H>;
+def : MSABitconvertPat<v4f32, v4i32, MSA128W>;
+def : MSABitconvertPat<v2f64, v2i64, MSA128D>;
+
+// Little endian bitcasts are always no-ops
+def : MSABitconvertPat<v16i8, v8i16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4i32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2i64, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v8f16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4f32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2f64, MSA128B, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v8i16, v16i8, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4i32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2i64, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4f32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2f64, MSA128H, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4i32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2i64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4f32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2f64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+// Big endian bitcasts expand to shuffle instructions.
+// This is because bitcast is defined to be a store/load sequence and the
+// vector store/load instructions are mixed-endian with respect to the vector
+// as a whole (little endian with respect to element order, but big endian
+// elements).
+
+class MSABitconvertReverseQuartersPat<ValueType DstVT, ValueType SrcVT,
+                                      RegisterClass DstRC, MSAInst Insn,
+                                      RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHalvesPat<ValueType DstVT, ValueType SrcVT,
+                                    RegisterClass DstRC, MSAInst Insn,
+                                    RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseBInHPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS
+           (SHF_W
+             (COPY_TO_REGCLASS
+               (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27),
+               MSA128W), 177),
+           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseHInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseWInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_W, MSA128W>;
+
+def : MSABitconvertReverseBInHPat<v8i16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInHPat<v8f16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInWPat<v4i32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInWPat<v4f32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInDPat<v2i64, v16i8, MSA128D>;
+def : MSABitconvertReverseBInDPat<v2f64, v16i8, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8i16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8i16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8i16, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8f16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8f16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8f16, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4i32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4i32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4i32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4i32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4i32, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4f32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4f32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4f32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4f32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4f32, MSA128D>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2i64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2i64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2i64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2i64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2i64, MSA128W>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2f64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2f64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2f64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2f64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
+
+// Pseudos used to implement BNZ.df, and BZ.df
+
+class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
+                                   RegisterClass RCWS,
+                                   InstrItinClass itin = NoItinerary> :
+  MipsPseudo<(outs GPR32:$dst),
+             (ins RCWS:$ws),
+             [(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
+  bit usesCustomInserter = 1;
+}
+
+def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
+                                                MSA128H, NoItinerary>;
+def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
+                                                MSA128W, NoItinerary>;
+def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
+                                                MSA128D, NoItinerary>;
+def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
+                                               MSA128B, NoItinerary>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
+                                               MSA128H, NoItinerary>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
+                                               MSA128W, NoItinerary>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
+                                               MSA128D, NoItinerary>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
+                                               MSA128B, NoItinerary>;
+
+// Vector extraction with variable index
+def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)),
+             (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_sext_i16 v8i16:$ws, i32:$idx)),
+             (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_sext_i32 v4i32:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+                                                             i32:$idx),
+                                                    sub_lo)),
+                               GPR32)>;
+def : MSAPat<(i64 (vextract_sext_i64 v2i64:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+                                                             i32:$idx),
+                                                    sub_64)),
+                               GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(i32 (vextract_zext_i8 v16i8:$ws, i32:$idx)),
+             (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_zext_i16 v8i16:$ws, i32:$idx)),
+             (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_zext_i32 v4i32:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+                                                             i32:$idx),
+                                                    sub_lo)),
+                               GPR32)>;
+def : MSAPat<(i64 (vextract_zext_i64 v2i64:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+                                                             i32:$idx),
+                                                    sub_64)),
+                               GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(f32 (vector_extract v4f32:$ws, i32:$idx)),
+             (f32 (EXTRACT_SUBREG (SPLAT_W v4f32:$ws,
+                                           i32:$idx),
+                                  sub_lo))>;
+def : MSAPat<(f64 (vector_extract v2f64:$ws, i32:$idx)),
+             (f64 (EXTRACT_SUBREG (SPLAT_D v2f64:$ws,
+                                           i32:$idx),
+                                  sub_64))>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
new file mode 100644
index 0000000..bc896be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -0,0 +1,148 @@
+//===-- MipsMachineFunctionInfo.cpp - Private data used for Mips ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMachineFunction.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
+                 cl::desc("Always use $gp as the global base register."));
+
+// class MipsCallEntry.
+MipsCallEntry::MipsCallEntry(const StringRef &N) {
+#ifndef NDEBUG
+  Name = N;
+  Val = nullptr;
+#endif
+}
+
+MipsCallEntry::MipsCallEntry(const GlobalValue *V) {
+#ifndef NDEBUG
+  Val = V;
+#endif
+}
+
+bool MipsCallEntry::isConstant(const MachineFrameInfo *) const {
+  return false;
+}
+
+bool MipsCallEntry::isAliased(const MachineFrameInfo *) const {
+  return false;
+}
+
+bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const {
+  return false;
+}
+
+void MipsCallEntry::printCustom(raw_ostream &O) const {
+  O << "MipsCallEntry: ";
+#ifndef NDEBUG
+  if (Val)
+    O << Val->getName();
+  else
+    O << Name;
+#endif
+}
+
+MipsFunctionInfo::~MipsFunctionInfo() {
+  for (StringMap<const MipsCallEntry *>::iterator
+       I = ExternalCallEntries.begin(), E = ExternalCallEntries.end(); I != E;
+       ++I)
+    delete I->getValue();
+
+  for (const auto &Entry : GlobalCallEntries)
+    delete Entry.second;
+}
+
+bool MipsFunctionInfo::globalBaseRegSet() const {
+  return GlobalBaseReg;
+}
+
+unsigned MipsFunctionInfo::getGlobalBaseReg() {
+  // Return if it has already been initialized.
+  if (GlobalBaseReg)
+    return GlobalBaseReg;
+
+  const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
+
+  const TargetRegisterClass *RC;
+  if (ST.inMips16Mode())
+    RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  else
+    RC = ST.isABI_N64() ?
+      (const TargetRegisterClass*)&Mips::GPR64RegClass :
+      (const TargetRegisterClass*)&Mips::GPR32RegClass;
+  return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
+}
+
+bool MipsFunctionInfo::mips16SPAliasRegSet() const {
+  return Mips16SPAliasReg;
+}
+unsigned MipsFunctionInfo::getMips16SPAliasReg() {
+  // Return if it has already been initialized.
+  if (Mips16SPAliasReg)
+    return Mips16SPAliasReg;
+
+  const TargetRegisterClass *RC;
+  RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  return Mips16SPAliasReg = MF.getRegInfo().createVirtualRegister(RC);
+}
+
+void MipsFunctionInfo::createEhDataRegsFI() {
+  for (int I = 0; I < 4; ++I) {
+    const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
+    const TargetRegisterClass *RC = ST.isABI_N64() ?
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+    EhDataRegFI[I] = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+        RC->getAlignment(), false);
+  }
+}
+
+bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
+  return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1]
+                        || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
+}
+
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const StringRef &Name) {
+  const MipsCallEntry *&E = ExternalCallEntries[Name];
+
+  if (!E)
+    E = new MipsCallEntry(Name);
+
+  return MachinePointerInfo(E);
+}
+
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) {
+  const MipsCallEntry *&E = GlobalCallEntries[Val];
+
+  if (!E)
+    E = new MipsCallEntry(Val);
+
+  return MachinePointerInfo(E);
+}
+
+int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
+  if (MoveF64ViaSpillFI == -1) {
+    MoveF64ViaSpillFI = MF.getFrameInfo()->CreateStackObject(
+        RC->getSize(), RC->getAlignment(), false);
+  }
+  return MoveF64ViaSpillFI;
+}
+
+void MipsFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 0000000..61260e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,153 @@
+//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_MACHINE_FUNCTION_INFO_H
+#define MIPS_MACHINE_FUNCTION_INFO_H
+
+#include "Mips16HardFloatInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include <map>
+#include <string>
+#include <utility>
+
+namespace llvm {
+
+/// \brief A class derived from PseudoSourceValue that represents a GOT entry
+/// resolved by lazy-binding.
+class MipsCallEntry : public PseudoSourceValue {
+public:
+  explicit MipsCallEntry(const StringRef &N);
+  explicit MipsCallEntry(const GlobalValue *V);
+  bool isConstant(const MachineFrameInfo *) const override;
+  bool isAliased(const MachineFrameInfo *) const override;
+  bool mayAlias(const MachineFrameInfo *) const override;
+
+private:
+  void printCustom(raw_ostream &O) const override;
+#ifndef NDEBUG
+  std::string Name;
+  const GlobalValue *Val;
+#endif
+};
+
+/// MipsFunctionInfo - This class is derived from MachineFunction private
+/// Mips target-specific information for each MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+public:
+  MipsFunctionInfo(MachineFunction &MF)
+      : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
+        VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false),
+        MoveF64ViaSpillFI(-1) {}
+
+  ~MipsFunctionInfo();
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  bool globalBaseRegSet() const;
+  unsigned getGlobalBaseReg();
+
+  bool mips16SPAliasRegSet() const;
+  unsigned getMips16SPAliasReg();
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasByvalArg() const { return HasByvalArg; }
+  void setFormalArgInfo(unsigned Size, bool HasByval) {
+    IncomingArgSize = Size;
+    HasByvalArg = HasByval;
+  }
+
+  unsigned getIncomingArgSize() const { return IncomingArgSize; }
+
+  bool callsEhReturn() const { return CallsEhReturn; }
+  void setCallsEhReturn() { CallsEhReturn = true; }
+
+  void createEhDataRegsFI();
+  int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
+  bool isEhDataRegFI(int FI) const;
+
+  /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+  /// representing a GOT entry for an external function.
+  MachinePointerInfo callPtrInfo(const StringRef &Name);
+
+  /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+  /// representing a GOT entry for a global function.
+  MachinePointerInfo callPtrInfo(const GlobalValue *Val);
+
+  void setSaveS2() { SaveS2 = true; }
+  bool hasSaveS2() const { return SaveS2; }
+
+  int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
+
+  std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+  StubsNeeded;
+
+private:
+  virtual void anchor();
+
+  MachineFunction& MF;
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
+  /// use as an alias for SP for use in load/store of halfword/byte from/to
+  /// the stack
+  unsigned Mips16SPAliasReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// True if function has a byval argument.
+  bool HasByvalArg;
+
+  /// Size of incoming argument area.
+  unsigned IncomingArgSize;
+
+  /// CallsEhReturn - Whether the function calls llvm.eh.return.
+  bool CallsEhReturn;
+
+  /// Frame objects for spilling eh data registers.
+  int EhDataRegFI[4];
+
+  // saveS2
+  bool SaveS2;
+
+  /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
+  /// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
+  int MoveF64ViaSpillFI;
+
+  /// MipsCallEntry maps.
+  StringMap<const MipsCallEntry *> ExternalCallEntries;
+  ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries;
+};
+
+} // end of namespace llvm
+
+#endif // MIPS_MACHINE_FUNCTION_INFO_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
new file mode 100644
index 0000000..b011e8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+// Instruction Selector Subtarget Control
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// This file defines a pass used to change the subtarget for the
+// Mips Instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsISelDAGToDAG.h"
+#include "MipsModuleISelDAGToDAG.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mips-isel"
+
+namespace llvm {
+
+bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+  TM.resetSubtarget(&MF);
+  return false;
+}
+
+char MipsModuleDAGToDAGISel::ID = 0;
+
+}
+
+
+llvm::FunctionPass *llvm::createMipsModuleISelDag(MipsTargetMachine &TM) {
+  return new MipsModuleDAGToDAGISel(TM);
+}
+
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.h
new file mode 100644
index 0000000..f7a0310
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.h
@@ -0,0 +1,58 @@
+//===---- MipsModuleISelDAGToDAG.h -  Change Subtarget             --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass used to change the subtarget for the
+// Mips Instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMODULEISELDAGTODAG_H
+#define MIPSMODULEISELDAGTODAG_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsModuleDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class MipsModuleDAGToDAGISel : public MachineFunctionPass {
+public:
+
+  static char ID;
+
+  explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_)
+      : MachineFunctionPass(ID), TM(TM_) {}
+
+  // Pass Name
+  const char *getPassName() const override {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+protected:
+  MipsTargetMachine &TM;
+};
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *createMipsModuleISelDag(MipsTargetMachine &TM);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
new file mode 100644
index 0000000..c234049
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -0,0 +1,301 @@
+//===--------- MipsOptimizePICCall.cpp - Optimize PIC Calls ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates unnecessary instructions that set up $gp and replace
+// instructions that load target function addresses with copy instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "optimize-mips-pic-call"
+
+static cl::opt<bool> LoadTargetFromGOT("mips-load-target-from-got",
+                                       cl::init(true),
+                                       cl::desc("Load target address from GOT"),
+                                       cl::Hidden);
+
+static cl::opt<bool> EraseGPOpnd("mips-erase-gp-opnd",
+                                 cl::init(true), cl::desc("Erase GP Operand"),
+                                 cl::Hidden);
+
+namespace {
+typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
+typedef std::pair<unsigned, unsigned> CntRegP;
+typedef RecyclingAllocator<BumpPtrAllocator,
+                           ScopedHashTableVal<ValueType, CntRegP> >
+AllocatorTy;
+typedef ScopedHashTable<ValueType, CntRegP, DenseMapInfo<ValueType>,
+                        AllocatorTy> ScopedHTType;
+
+class MBBInfo {
+public:
+  MBBInfo(MachineDomTreeNode *N);
+  const MachineDomTreeNode *getNode() const;
+  bool isVisited() const;
+  void preVisit(ScopedHTType &ScopedHT);
+  void postVisit();
+
+private:
+  MachineDomTreeNode *Node;
+  ScopedHTType::ScopeTy *HTScope;
+};
+
+class OptimizePICCall : public MachineFunctionPass {
+public:
+  OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
+
+  const char *getPassName() const override { return "Mips OptimizePICCall"; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  /// \brief Visit MBB.
+  bool visitNode(MBBInfo &MBBI);
+
+  /// \brief Test if MI jumps to a function via a register.
+  ///
+  /// Also, return the virtual register containing the target function's address
+  /// and the underlying object in Reg and Val respectively, if the function's
+  /// address can be resolved lazily.
+  bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+                         ValueType &Val) const;
+
+  /// \brief Return the number of instructions that dominate the current
+  /// instruction and load the function address from object Entry.
+  unsigned getCount(ValueType Entry);
+
+  /// \brief Return the destination virtual register of the last instruction
+  /// that loads from object Entry.
+  unsigned getReg(ValueType Entry);
+
+  /// \brief Update ScopedHT.
+  void incCntAndSetReg(ValueType Entry, unsigned Reg);
+
+  ScopedHTType ScopedHT;
+  static char ID;
+};
+
+char OptimizePICCall::ID = 0;
+} // end of anonymous namespace
+
+/// Return the first MachineOperand of MI if it is a used virtual register.
+static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
+  if (MI.getNumOperands() == 0)
+    return nullptr;
+
+  MachineOperand &MO = MI.getOperand(0);
+
+  if (!MO.isReg() || !MO.isUse() ||
+      !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    return nullptr;
+
+  return &MO;
+}
+
+/// Return type of register Reg.
+static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) {
+  const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
+  assert(RC->vt_end() - RC->vt_begin() == 1);
+  return *RC->vt_begin();
+}
+
+/// Do the following transformation:
+///
+/// jalr $vreg
+/// =>
+/// copy $t9, $vreg
+/// jalr $t9
+static void setCallTargetReg(MachineBasicBlock *MBB,
+                             MachineBasicBlock::iterator I) {
+  MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned SrcReg = I->getOperand(0).getReg();
+  unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? Mips::T9 : Mips::T9_64;
+  BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg)
+      .addReg(SrcReg);
+  I->getOperand(0).setReg(DstReg);
+}
+
+/// Search MI's operands for register GP and erase it.
+static void eraseGPOpnd(MachineInstr &MI) {
+  if (!EraseGPOpnd)
+    return;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MVT::SimpleValueType Ty = getRegTy(MI.getOperand(0).getReg(), MF);
+  unsigned Reg = Ty == MVT::i32 ? Mips::GP : Mips::GP_64;
+
+  for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+    MachineOperand &MO = MI.getOperand(I);
+    if (MO.isReg() && MO.getReg() == Reg) {
+      MI.RemoveOperand(I);
+      return;
+    }
+  }
+
+  llvm_unreachable(nullptr);
+}
+
+MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(nullptr) {}
+
+const MachineDomTreeNode *MBBInfo::getNode() const { return Node; }
+
+bool MBBInfo::isVisited() const { return HTScope; }
+
+void MBBInfo::preVisit(ScopedHTType &ScopedHT) {
+  HTScope = new ScopedHTType::ScopeTy(ScopedHT);
+}
+
+void MBBInfo::postVisit() {
+  delete HTScope;
+}
+
+// OptimizePICCall methods.
+bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
+  if (F.getTarget().getSubtarget<MipsSubtarget>().inMips16Mode())
+    return false;
+
+  // Do a pre-order traversal of the dominator tree.
+  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+  bool Changed = false;
+
+  SmallVector<MBBInfo, 8> WorkList(1, MBBInfo(MDT->getRootNode()));
+
+  while (!WorkList.empty()) {
+    MBBInfo &MBBI = WorkList.back();
+
+    // If this MBB has already been visited, destroy the scope for the MBB and
+    // pop it from the work list.
+    if (MBBI.isVisited()) {
+      MBBI.postVisit();
+      WorkList.pop_back();
+      continue;
+    }
+
+    // Visit the MBB and add its children to the work list.
+    MBBI.preVisit(ScopedHT);
+    Changed |= visitNode(MBBI);
+    const MachineDomTreeNode *Node = MBBI.getNode();
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    WorkList.append(Children.begin(), Children.end());
+  }
+
+  return Changed;
+}
+
+bool OptimizePICCall::visitNode(MBBInfo &MBBI) {
+  bool Changed = false;
+  MachineBasicBlock *MBB = MBBI.getNode()->getBlock();
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I) {
+    unsigned Reg;
+    ValueType Entry;
+
+    // Skip instructions that are not call instructions via registers.
+    if (!isCallViaRegister(*I, Reg, Entry))
+      continue;
+
+    Changed = true;
+    unsigned N = getCount(Entry);
+
+    if (N != 0) {
+      // If a function has been called more than twice, we do not have to emit a
+      // load instruction to get the function address from the GOT, but can
+      // instead reuse the address that has been loaded before.
+      if (N >= 2 && !LoadTargetFromGOT)
+        getCallTargetRegOpnd(*I)->setReg(getReg(Entry));
+
+      // Erase the $gp operand if this isn't the first time a function has
+      // been called. $gp needs to be set up only if the function call can go
+      // through a lazy binding stub.
+      eraseGPOpnd(*I);
+    }
+
+    if (Entry)
+      incCntAndSetReg(Entry, Reg);
+
+    setCallTargetReg(MBB, I);
+  }
+
+  return Changed;
+}
+
+bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+                                        ValueType &Val) const {
+  if (!MI.isCall())
+    return false;
+
+  MachineOperand *MO = getCallTargetRegOpnd(MI);
+
+  // Return if MI is not a function call via a register.
+  if (!MO)
+    return false;
+
+  // Get the instruction that loads the function address from the GOT.
+  Reg = MO->getReg();
+  Val = (Value*)nullptr;
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineInstr *DefMI = MRI.getVRegDef(Reg);
+
+  assert(DefMI);
+
+  // See if DefMI is an instruction that loads from a GOT entry that holds the
+  // address of a lazy binding stub.
+  if (!DefMI->mayLoad() || DefMI->getNumOperands() < 3)
+    return true;
+
+  unsigned Flags = DefMI->getOperand(2).getTargetFlags();
+
+  if (Flags != MipsII::MO_GOT_CALL && Flags != MipsII::MO_CALL_LO16)
+    return true;
+
+  // Return the underlying object for the GOT entry in Val.
+  assert(DefMI->hasOneMemOperand());
+  Val = (*DefMI->memoperands_begin())->getValue();
+  if (!Val)
+    Val = (*DefMI->memoperands_begin())->getPseudoValue();
+  return true;
+}
+
+unsigned OptimizePICCall::getCount(ValueType Entry) {
+  return ScopedHT.lookup(Entry).first;
+}
+
+unsigned OptimizePICCall::getReg(ValueType Entry) {
+  unsigned Reg = ScopedHT.lookup(Entry).second;
+  assert(Reg);
+  return Reg;
+}
+
+void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) {
+  CntRegP P = ScopedHT.lookup(Entry);
+  ScopedHT.insert(Entry, std::make_pair(P.first + 1, Reg));
+}
+
+/// Return an OptimizeCall object.
+FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) {
+  return new OptimizePICCall(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
new file mode 100644
index 0000000..c0abce3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
@@ -0,0 +1,80 @@
+//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MipsOptionRecord - Abstraction for storing arbitrary information in
+// ELF files. Arbitrary information (e.g. register usage) can be stored in Mips
+// specific ELF sections like .Mips.options. Specific records should subclass
+// MipsOptionRecord and provide an implementation to EmitMipsOptionRecord which
+// basically just dumps the information into an ELF section. More information
+// about .Mips.option can be found in the SysV ABI and the 64-bit ELF Object
+// specification.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSOPTIONRECORD_H
+#define MIPSOPTIONRECORD_H
+
+#include "MipsMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+class MipsELFStreamer;
+class MCSubtargetInfo;
+}
+
+class MipsOptionRecord {
+public:
+  virtual ~MipsOptionRecord(){};
+  virtual void EmitMipsOptionRecord() = 0;
+};
+
+class MipsRegInfoRecord : public MipsOptionRecord {
+public:
+  MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context,
+                    const MCSubtargetInfo &STI)
+      : Streamer(S), Context(Context), STI(STI) {
+    ri_gprmask = 0;
+    ri_cprmask[0] = ri_cprmask[1] = ri_cprmask[2] = ri_cprmask[3] = 0;
+    ri_gp_value = 0;
+
+    const MCRegisterInfo *TRI = Context.getRegisterInfo();
+    GPR32RegClass = &(TRI->getRegClass(Mips::GPR32RegClassID));
+    GPR64RegClass = &(TRI->getRegClass(Mips::GPR64RegClassID));
+    FGR32RegClass = &(TRI->getRegClass(Mips::FGR32RegClassID));
+    FGR64RegClass = &(TRI->getRegClass(Mips::FGR64RegClassID));
+    AFGR64RegClass = &(TRI->getRegClass(Mips::AFGR64RegClassID));
+    MSA128BRegClass = &(TRI->getRegClass(Mips::MSA128BRegClassID));
+    COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID));
+    COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID));
+  }
+  ~MipsRegInfoRecord() {}
+
+  void EmitMipsOptionRecord();
+  void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
+
+private:
+  MipsELFStreamer *Streamer;
+  MCContext &Context;
+  const MCSubtargetInfo &STI;
+  const MCRegisterClass *GPR32RegClass;
+  const MCRegisterClass *GPR64RegClass;
+  const MCRegisterClass *FGR32RegClass;
+  const MCRegisterClass *FGR64RegClass;
+  const MCRegisterClass *AFGR64RegClass;
+  const MCRegisterClass *MSA128BRegClass;
+  const MCRegisterClass *COP2RegClass;
+  const MCRegisterClass *COP3RegClass;
+  uint32_t ri_gprmask;
+  uint32_t ri_cprmask[4];
+  int64_t ri_gp_value;
+};
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
new file mode 100644
index 0000000..7aae964
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
@@ -0,0 +1,147 @@
+//===---- MipsOs16.cpp for Mips Option -Os16                       --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an optimization phase for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsOs16.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mips-os16"
+
+
+static cl::opt<std::string> Mips32FunctionMask(
+  "mips32-function-mask",
+  cl::init(""),
+  cl::desc("Force function to be mips32"),
+  cl::Hidden);
+
+namespace {
+
+  // Figure out if we need float point based on the function signature.
+  // We need to move variables in and/or out of floating point
+  // registers because of the ABI
+  //
+  bool needsFPFromSig(Function &F) {
+    Type* RetType = F.getReturnType();
+    switch (RetType->getTypeID()) {
+    case Type::FloatTyID:
+    case Type::DoubleTyID:
+      return true;
+    default:
+      ;
+    }
+    if (F.arg_size() >=1) {
+      Argument &Arg = F.getArgumentList().front();
+      switch (Arg.getType()->getTypeID()) {
+        case Type::FloatTyID:
+        case Type::DoubleTyID:
+          return true;
+        default:
+          ;
+      }
+    }
+    return false;
+  }
+
+  // Figure out if the function will need floating point operations
+  //
+  bool needsFP(Function &F) {
+    if (needsFPFromSig(F))
+      return true;
+    for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+        const Instruction &Inst = *I;
+        switch (Inst.getOpcode()) {
+        case Instruction::FAdd:
+        case Instruction::FSub:
+        case Instruction::FMul:
+        case Instruction::FDiv:
+        case Instruction::FRem:
+        case Instruction::FPToUI:
+        case Instruction::FPToSI:
+        case Instruction::UIToFP:
+        case Instruction::SIToFP:
+        case Instruction::FPTrunc:
+        case Instruction::FPExt:
+        case Instruction::FCmp:
+          return true;
+        default:
+          ;
+        }
+        if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+          DEBUG(dbgs() << "Working on call" << "\n");
+          Function &F_ =  *CI->getCalledFunction();
+          if (needsFPFromSig(F_))
+            return true;
+        }
+      }
+    return false;
+  }
+}
+namespace llvm {
+
+
+bool MipsOs16::runOnModule(Module &M) {
+  bool usingMask = Mips32FunctionMask.length() > 0;
+  bool doneUsingMask = false; // this will make it stop repeating
+  DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+  if (usingMask)
+    DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+  unsigned int functionIndex = 0;
+  bool modified = false;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    DEBUG(dbgs() << "Working on " << F->getName() << "\n");
+    if (usingMask) {
+      if (!doneUsingMask) {
+        if (functionIndex == Mips32FunctionMask.length())
+          functionIndex = 0;
+        switch (Mips32FunctionMask[functionIndex]) {
+        case '1':
+          DEBUG(dbgs() << "mask forced mips32: " << F->getName() << "\n");
+          F->addFnAttr("nomips16");
+          break;
+        case '.':
+          doneUsingMask = true;
+          break;
+        default:
+          break;
+        }
+        functionIndex++;
+      }
+    }
+    else {
+      if (needsFP(*F)) {
+        DEBUG(dbgs() << "os16 forced mips32: " << F->getName() << "\n");
+        F->addFnAttr("nomips16");
+      }
+      else {
+        DEBUG(dbgs() << "os16 forced mips16: " << F->getName() << "\n");
+        F->addFnAttr("mips16");
+      }
+    }
+  }
+  return modified;
+}
+
+char MipsOs16::ID = 0;
+
+}
+
+ModulePass *llvm::createMipsOs16(MipsTargetMachine &TM) {
+  return new MipsOs16;
+}
+
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.h b/contrib/llvm/lib/Target/Mips/MipsOs16.h
new file mode 100644
index 0000000..55e5a81
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOs16.h
@@ -0,0 +1,49 @@
+//===---- MipsOs16.h for Mips Option -Os16                         --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an optimization phase for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+
+
+#ifndef MIPSOS16_H
+#define MIPSOS16_H
+
+using namespace llvm;
+
+namespace llvm {
+
+class MipsOs16 : public ModulePass {
+
+public:
+  static char ID;
+
+  MipsOs16() : ModulePass(ID) {
+
+  }
+
+  const char *getPassName() const override {
+    return "MIPS Os16 Optimization";
+  }
+
+  bool runOnModule(Module &M) override;
+
+};
+
+ModulePass *createMipsOs16(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 0000000..084449b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,263 @@
+//===-- MipsRegisterInfo.cpp - MIPS Register Information -== --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsRegisterInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MipsGenRegisterInfo.inc"
+
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
+  : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {}
+
+unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
+
+const TargetRegisterClass *
+MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                     unsigned Kind) const {
+  return Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+}
+
+unsigned
+MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                      MachineFunction &MF) const {
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case Mips::GPR32RegClassID:
+  case Mips::GPR64RegClassID:
+  case Mips::DSPRRegClassID: {
+    const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+    return 28 - TFI->hasFP(MF);
+  }
+  case Mips::FGR32RegClassID:
+    return 32;
+  case Mips::AFGR64RegClassID:
+    return 16;
+  case Mips::FGR64RegClassID:
+    return 32;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+/// Mips Callee Saved Registers
+const MCPhysReg *
+MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  if (Subtarget.isSingleFloat())
+    return CSR_SingleFloatOnly_SaveList;
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_SaveList;
+
+  if (Subtarget.isABI_N32())
+    return CSR_N32_SaveList;
+
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_SaveList;
+
+  if (Subtarget.isFPXX())
+    return CSR_O32_FPXX_SaveList;
+
+  return CSR_O32_SaveList;
+}
+
+const uint32_t*
+MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
+  if (Subtarget.isSingleFloat())
+    return CSR_SingleFloatOnly_RegMask;
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_RegMask;
+
+  if (Subtarget.isABI_N32())
+    return CSR_N32_RegMask;
+
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_RegMask;
+
+  if (Subtarget.isFPXX())
+    return CSR_O32_FPXX_RegMask;
+
+  return CSR_O32_RegMask;
+}
+
+const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
+  return CSR_Mips16RetHelper_RegMask;
+}
+
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  static const MCPhysReg ReservedGPR32[] = {
+    Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
+  };
+
+  static const MCPhysReg ReservedGPR64[] = {
+    Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
+  };
+
+  BitVector Reserved(getNumRegs());
+  typedef TargetRegisterClass::const_iterator RegIter;
+
+  for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
+    Reserved.set(ReservedGPR32[I]);
+
+  // Reserve registers for the NaCl sandbox.
+  if (Subtarget.isTargetNaCl()) {
+    Reserved.set(Mips::T6);   // Reserved for control flow mask.
+    Reserved.set(Mips::T7);   // Reserved for memory access mask.
+    Reserved.set(Mips::T8);   // Reserved for thread pointer.
+  }
+
+  for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
+    Reserved.set(ReservedGPR64[I]);
+
+  if (Subtarget.isFP64bit()) {
+    // Reserve all registers in AFGR64.
+    for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+         EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
+      Reserved.set(*Reg);
+  } else {
+    // Reserve all registers in FGR64.
+    for (RegIter Reg = Mips::FGR64RegClass.begin(),
+         EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
+      Reserved.set(*Reg);
+  }
+  // Reserve FP if this function should have a dedicated frame pointer register.
+  if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
+    if (Subtarget.inMips16Mode())
+      Reserved.set(Mips::S0);
+    else {
+      Reserved.set(Mips::FP);
+      Reserved.set(Mips::FP_64);
+    }
+  }
+
+  // Reserve hardware registers.
+  Reserved.set(Mips::HWR29);
+
+  // Reserve DSP control register.
+  Reserved.set(Mips::DSPPos);
+  Reserved.set(Mips::DSPSCount);
+  Reserved.set(Mips::DSPCarry);
+  Reserved.set(Mips::DSPEFI);
+  Reserved.set(Mips::DSPOutFlag);
+
+  // Reserve MSA control registers.
+  Reserved.set(Mips::MSAIR);
+  Reserved.set(Mips::MSACSR);
+  Reserved.set(Mips::MSAAccess);
+  Reserved.set(Mips::MSASave);
+  Reserved.set(Mips::MSAModify);
+  Reserved.set(Mips::MSARequest);
+  Reserved.set(Mips::MSAMap);
+  Reserved.set(Mips::MSAUnmap);
+
+  // Reserve RA if in mips16 mode.
+  if (Subtarget.inMips16Mode()) {
+    const MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+    Reserved.set(Mips::RA);
+    Reserved.set(Mips::RA_64);
+    Reserved.set(Mips::T0);
+    Reserved.set(Mips::T1);
+    if (MF.getFunction()->hasFnAttribute("saveS2") || MipsFI->hasSaveS2())
+      Reserved.set(Mips::S2);
+  }
+
+  // Reserve GP if small section is used.
+  if (Subtarget.useSmallSection()) {
+    Reserved.set(Mips::GP);
+    Reserved.set(Mips::GP_64);
+  }
+
+  if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) {
+    for (const auto &Reg : Mips::OddSPRegClass)
+      Reserved.set(Reg);
+  }
+
+  return Reserved;
+}
+
+bool
+MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+// FrameIndex represent objects inside a abstract stack.
+// We must replace FrameIndex with an stack/frame pointer
+// direct reference.
+void MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                    unsigned FIOperandNum, RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
+        errs() << "<--------->\n" << MI);
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  uint64_t stackSize = MF.getFrameInfo()->getStackSize();
+  int64_t spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+               << "spOffset   : " << spOffset << "\n"
+               << "stackSize  : " << stackSize << "\n");
+
+  eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
+}
+
+unsigned MipsRegisterInfo::
+getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  bool IsN64 = Subtarget.isABI_N64();
+
+  if (Subtarget.inMips16Mode())
+    return TFI->hasFP(MF) ? Mips::S0 : Mips::SP;
+  else
+    return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
+                            (IsN64 ? Mips::SP_64 : Mips::SP);
+
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 0000000..b34496f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,83 @@
+//===-- MipsRegisterInfo.h - Mips Register Information Impl -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSREGISTERINFO_H
+#define MIPSREGISTERINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MipsGenRegisterInfo.inc"
+
+namespace llvm {
+class MipsSubtarget;
+class Type;
+
+class MipsRegisterInfo : public MipsGenRegisterInfo {
+protected:
+  const MipsSubtarget &Subtarget;
+
+public:
+  MipsRegisterInfo(const MipsSubtarget &Subtarget);
+
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// Mips::RA, return the number that it corresponds to (e.g. 31).
+  static unsigned getRegisterNumbering(unsigned RegEnum);
+
+  /// Get PIC indirect call register
+  static unsigned getPICCallReg();
+
+  /// Adjust the Mips stack frame.
+  void adjustMipsStackFrame(MachineFunction &MF) const;
+
+  /// Code Generation virtual methods...
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+  static const uint32_t *getMips16RetHelperMask();
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  /// Stack Frame Processing Methods
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = nullptr) const;
+
+  /// Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  /// \brief Return GPR register class.
+  virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
+
+private:
+  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                           int FrameIndex, uint64_t StackSize,
+                           int64_t SPOffset) const = 0;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 0000000..74dfa4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,581 @@
+//===-- MipsRegisterInfo.td - Mips Register defs -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+let Namespace = "Mips" in {
+def sub_32     : SubRegIndex<32>;
+def sub_64     : SubRegIndex<64>;
+def sub_lo     : SubRegIndex<32>;
+def sub_hi     : SubRegIndex<32, 32>;
+def sub_dsp16_19 : SubRegIndex<4, 16>;
+def sub_dsp20    : SubRegIndex<1, 20>;
+def sub_dsp21    : SubRegIndex<1, 21>;
+def sub_dsp22    : SubRegIndex<1, 22>;
+def sub_dsp23    : SubRegIndex<1, 23>;
+}
+
+class Unallocatable {
+  bit isAllocatable = 0;
+}
+
+// We have banks of 32 registers each.
+class MipsReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "Mips";
+}
+
+class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>
+  : RegisterWithSubRegs<n, subregs> {
+  let HWEncoding = Enc;
+  let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit CPU Registers
+class Mips64GPRReg<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_32];
+}
+
+// Mips 32-bit FPU Registers
+class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit (aliased) FPU Registers
+class AFPR<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+class AFPR64<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips 128-bit (aliased) MSA Registers
+class AFPR128<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_64];
+}
+
+// Accumulator Registers
+class ACCReg<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips Hardware Registers
+class HWR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Mips" in {
+  // General Purpose Registers
+  def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>;
+  def AT   : MipsGPRReg< 1, "1">,    DwarfRegNum<[1]>;
+  def V0   : MipsGPRReg< 2, "2">,    DwarfRegNum<[2]>;
+  def V1   : MipsGPRReg< 3, "3">,    DwarfRegNum<[3]>;
+  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[4]>;
+  def A1   : MipsGPRReg< 5, "5">,    DwarfRegNum<[5]>;
+  def A2   : MipsGPRReg< 6, "6">,    DwarfRegNum<[6]>;
+  def A3   : MipsGPRReg< 7, "7">,    DwarfRegNum<[7]>;
+  def T0   : MipsGPRReg< 8, "8">,    DwarfRegNum<[8]>;
+  def T1   : MipsGPRReg< 9, "9">,    DwarfRegNum<[9]>;
+  def T2   : MipsGPRReg< 10, "10">,  DwarfRegNum<[10]>;
+  def T3   : MipsGPRReg< 11, "11">,  DwarfRegNum<[11]>;
+  def T4   : MipsGPRReg< 12, "12">,  DwarfRegNum<[12]>;
+  def T5   : MipsGPRReg< 13, "13">,  DwarfRegNum<[13]>;
+  def T6   : MipsGPRReg< 14, "14">,  DwarfRegNum<[14]>;
+  def T7   : MipsGPRReg< 15, "15">,  DwarfRegNum<[15]>;
+  def S0   : MipsGPRReg< 16, "16">,  DwarfRegNum<[16]>;
+  def S1   : MipsGPRReg< 17, "17">,  DwarfRegNum<[17]>;
+  def S2   : MipsGPRReg< 18, "18">,  DwarfRegNum<[18]>;
+  def S3   : MipsGPRReg< 19, "19">,  DwarfRegNum<[19]>;
+  def S4   : MipsGPRReg< 20, "20">,  DwarfRegNum<[20]>;
+  def S5   : MipsGPRReg< 21, "21">,  DwarfRegNum<[21]>;
+  def S6   : MipsGPRReg< 22, "22">,  DwarfRegNum<[22]>;
+  def S7   : MipsGPRReg< 23, "23">,  DwarfRegNum<[23]>;
+  def T8   : MipsGPRReg< 24, "24">,  DwarfRegNum<[24]>;
+  def T9   : MipsGPRReg< 25, "25">,  DwarfRegNum<[25]>;
+  def K0   : MipsGPRReg< 26, "26">,  DwarfRegNum<[26]>;
+  def K1   : MipsGPRReg< 27, "27">,  DwarfRegNum<[27]>;
+  def GP   : MipsGPRReg< 28, "gp">,  DwarfRegNum<[28]>;
+  def SP   : MipsGPRReg< 29, "sp">,  DwarfRegNum<[29]>;
+  def FP   : MipsGPRReg< 30, "fp">,  DwarfRegNum<[30]>;
+  def RA   : MipsGPRReg< 31, "ra">,  DwarfRegNum<[31]>;
+
+  // General Purpose 64-bit Registers
+  def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+  def AT_64   : Mips64GPRReg< 1, "1",    [AT]>, DwarfRegNum<[1]>;
+  def V0_64   : Mips64GPRReg< 2, "2",    [V0]>, DwarfRegNum<[2]>;
+  def V1_64   : Mips64GPRReg< 3, "3",    [V1]>, DwarfRegNum<[3]>;
+  def A0_64   : Mips64GPRReg< 4, "4",    [A0]>, DwarfRegNum<[4]>;
+  def A1_64   : Mips64GPRReg< 5, "5",    [A1]>, DwarfRegNum<[5]>;
+  def A2_64   : Mips64GPRReg< 6, "6",    [A2]>, DwarfRegNum<[6]>;
+  def A3_64   : Mips64GPRReg< 7, "7",    [A3]>, DwarfRegNum<[7]>;
+  def T0_64   : Mips64GPRReg< 8, "8",    [T0]>, DwarfRegNum<[8]>;
+  def T1_64   : Mips64GPRReg< 9, "9",    [T1]>, DwarfRegNum<[9]>;
+  def T2_64   : Mips64GPRReg< 10, "10",  [T2]>, DwarfRegNum<[10]>;
+  def T3_64   : Mips64GPRReg< 11, "11",  [T3]>, DwarfRegNum<[11]>;
+  def T4_64   : Mips64GPRReg< 12, "12",  [T4]>, DwarfRegNum<[12]>;
+  def T5_64   : Mips64GPRReg< 13, "13",  [T5]>, DwarfRegNum<[13]>;
+  def T6_64   : Mips64GPRReg< 14, "14",  [T6]>, DwarfRegNum<[14]>;
+  def T7_64   : Mips64GPRReg< 15, "15",  [T7]>, DwarfRegNum<[15]>;
+  def S0_64   : Mips64GPRReg< 16, "16",  [S0]>, DwarfRegNum<[16]>;
+  def S1_64   : Mips64GPRReg< 17, "17",  [S1]>, DwarfRegNum<[17]>;
+  def S2_64   : Mips64GPRReg< 18, "18",  [S2]>, DwarfRegNum<[18]>;
+  def S3_64   : Mips64GPRReg< 19, "19",  [S3]>, DwarfRegNum<[19]>;
+  def S4_64   : Mips64GPRReg< 20, "20",  [S4]>, DwarfRegNum<[20]>;
+  def S5_64   : Mips64GPRReg< 21, "21",  [S5]>, DwarfRegNum<[21]>;
+  def S6_64   : Mips64GPRReg< 22, "22",  [S6]>, DwarfRegNum<[22]>;
+  def S7_64   : Mips64GPRReg< 23, "23",  [S7]>, DwarfRegNum<[23]>;
+  def T8_64   : Mips64GPRReg< 24, "24",  [T8]>, DwarfRegNum<[24]>;
+  def T9_64   : Mips64GPRReg< 25, "25",  [T9]>, DwarfRegNum<[25]>;
+  def K0_64   : Mips64GPRReg< 26, "26",  [K0]>, DwarfRegNum<[26]>;
+  def K1_64   : Mips64GPRReg< 27, "27",  [K1]>, DwarfRegNum<[27]>;
+  def GP_64   : Mips64GPRReg< 28, "gp",  [GP]>, DwarfRegNum<[28]>;
+  def SP_64   : Mips64GPRReg< 29, "sp",  [SP]>, DwarfRegNum<[29]>;
+  def FP_64   : Mips64GPRReg< 30, "fp",  [FP]>, DwarfRegNum<[30]>;
+  def RA_64   : Mips64GPRReg< 31, "ra",  [RA]>, DwarfRegNum<[31]>;
+
+  /// Mips Single point precision FPU Registers
+  foreach I = 0-31 in
+  def F#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+  // Higher half of 64-bit FP registers.
+  foreach I = 0-31 in
+  def F_HI#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+  /// Mips Double point precision FPU Registers (aliased
+  /// with the single precision to hold 64 bit values)
+  foreach I = 0-15 in
+  def D#I : AFPR<!shl(I, 1), "f"#!shl(I, 1),
+                 [!cast<FPR>("F"#!shl(I, 1)),
+                  !cast<FPR>("F"#!add(!shl(I, 1), 1))]>;
+
+  /// Mips Double point precision FPU Registers in MFP64 mode.
+  foreach I = 0-31 in
+  def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I), !cast<FPR>("F_HI"#I)]>,
+                DwarfRegNum<[!add(I, 32)]>;
+
+  /// Mips MSA registers
+  /// MSA and FPU cannot both be present unless the FPU has 64-bit registers
+  foreach I = 0-31 in
+  def W#I : AFPR128<I, "w"#I, [!cast<AFPR64>("D"#I#"_64")]>,
+            DwarfRegNum<[!add(I, 32)]>;
+
+  // Hi/Lo registers
+  def HI0 : MipsReg<0, "ac0">, DwarfRegNum<[64]>;
+  def HI1 : MipsReg<1, "ac1">, DwarfRegNum<[176]>;
+  def HI2 : MipsReg<2, "ac2">, DwarfRegNum<[178]>;
+  def HI3 : MipsReg<3, "ac3">, DwarfRegNum<[180]>;
+  def LO0 : MipsReg<0, "ac0">, DwarfRegNum<[65]>;
+  def LO1 : MipsReg<1, "ac1">, DwarfRegNum<[177]>;
+  def LO2 : MipsReg<2, "ac2">, DwarfRegNum<[179]>;
+  def LO3 : MipsReg<3, "ac3">, DwarfRegNum<[181]>;
+
+  let SubRegIndices = [sub_32] in {
+  def HI0_64  : RegisterWithSubRegs<"hi", [HI0]>;
+  def LO0_64  : RegisterWithSubRegs<"lo", [LO0]>;
+  }
+
+  // FP control registers.
+  foreach I = 0-31 in
+  def FCR#I : MipsReg<#I, ""#I>;
+
+  // FP condition code registers.
+  foreach I = 0-7 in
+  def FCC#I : MipsReg<#I, "fcc"#I>;
+
+  // COP2 registers.
+  foreach I = 0-31 in
+  def COP2#I : MipsReg<#I, ""#I>;
+
+  // COP3 registers.
+  foreach I = 0-31 in
+  def COP3#I : MipsReg<#I, ""#I>;
+
+  // PC register
+  def PC : Register<"pc">;
+
+  // Hardware register $29
+  foreach I = 0-31 in
+  def HWR#I : MipsReg<#I, ""#I>;
+
+  // Accum registers
+  foreach I = 0-3 in
+  def AC#I : ACCReg<#I, "ac"#I,
+                    [!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
+
+  def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
+
+  // DSP-ASE control register fields.
+  def DSPPos : Register<"">;
+  def DSPSCount : Register<"">;
+  def DSPCarry : Register<"">;
+  def DSPEFI : Register<"">;
+  def DSPOutFlag16_19 : Register<"">;
+  def DSPOutFlag20 : Register<"">;
+  def DSPOutFlag21 : Register<"">;
+  def DSPOutFlag22 : Register<"">;
+  def DSPOutFlag23 : Register<"">;
+  def DSPCCond : Register<"">;
+
+  let SubRegIndices = [sub_dsp16_19, sub_dsp20, sub_dsp21, sub_dsp22,
+                       sub_dsp23] in
+  def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
+                                            DSPOutFlag21, DSPOutFlag22,
+                                            DSPOutFlag23]>;
+
+  // MSA-ASE control registers.
+  def MSAIR      : MipsReg<0, "0">;
+  def MSACSR     : MipsReg<1, "1">;
+  def MSAAccess  : MipsReg<2, "2">;
+  def MSASave    : MipsReg<3, "3">;
+  def MSAModify  : MipsReg<4, "4">;
+  def MSARequest : MipsReg<5, "5">;
+  def MSAMap     : MipsReg<6, "6">;
+  def MSAUnmap   : MipsReg<7, "7">;
+
+  // Octeon multiplier and product registers
+  def MPL0 : MipsReg<0, "mpl0">;
+  def MPL1 : MipsReg<1, "mpl1">;
+  def MPL2 : MipsReg<2, "mpl2">;
+  def P0 : MipsReg<0, "p0">;
+  def P1 : MipsReg<1, "p1">;
+  def P2 : MipsReg<2, "p2">;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+class GPR32Class<list<ValueType> regTypes> :
+  RegisterClass<"Mips", regTypes, 32, (add
+  // Reserved
+  ZERO, AT,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Not preserved across procedure calls
+  T0, T1, T2, T3, T4, T5, T6, T7,
+  // Callee save
+  S0, S1, S2, S3, S4, S5, S6, S7,
+  // Not preserved across procedure calls
+  T8, T9,
+  // Reserved
+  K0, K1, GP, SP, FP, RA)>;
+
+def GPR32 : GPR32Class<[i32]>;
+def DSPR  : GPR32Class<[v4i8, v2i16]>;
+
+def GPR64 : RegisterClass<"Mips", [i64], 64, (add
+// Reserved
+  ZERO_64, AT_64,
+  // Return Values and Arguments
+  V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
+  // Not preserved across procedure calls
+  T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+  // Callee save
+  S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+  // Not preserved across procedure calls
+  T8_64, T9_64,
+  // Reserved
+  K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>;
+
+def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1)>;
+
+def CPU16RegsPlusSP : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1,
+  SP)>;
+
+def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>, Unallocatable;
+
+def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+
+// 64bit fp:
+// * FGR64  - 32 64-bit registers
+// * AFGR64 - 16 32-bit even registers (32-bit FP Mode)
+//
+// 32bit fp:
+// * FGR32 - 16 32-bit even registers
+// * FGR32 - 32 32-bit registers (single float only mode)
+def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
+             Unallocatable;
+
+def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
+  // Return Values and Arguments
+  D0, D1,
+  // Not preserved across procedure calls
+  D2, D3, D4, D5,
+  // Return Values and Arguments
+  D6, D7,
+  // Not preserved across procedure calls
+  D8, D9,
+  // Callee save
+  D10, D11, D12, D13, D14, D15)>;
+
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
+
+// Used to reserve odd registers when given -mattr=+nooddspreg
+// FIXME: Remove double precision registers from this set.
+def OddSP : RegisterClass<"Mips", [f32], 32,
+                          (add (decimate (sequence "F%u", 1, 31), 2),
+                               (decimate (sequence "F_HI%u", 1, 31), 2),
+                               (decimate (sequence "D%u", 1, 15), 2),
+                               (decimate (sequence "D%u_64", 1, 31), 2))>,
+            Unallocatable;
+
+// FP control registers.
+def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
+          Unallocatable;
+
+// FP condition code registers.
+def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
+          Unallocatable;
+
+// MIPS32r6/MIPS64r6 store FPU condition codes in normal FGR registers.
+// This class allows us to represent this in codegen patterns.
+def FGRCC : RegisterClass<"Mips", [i32], 32, (sequence "F%u", 0, 31)>;
+
+def MSA128B: RegisterClass<"Mips", [v16i8], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
+                           (sequence "W%u", 0, 31)>;
+
+def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
+  MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
+
+// Hi/Lo Registers
+def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
+def HI32 : RegisterClass<"Mips", [i32], 32, (add HI0)>;
+def LO32DSP : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+def HI32DSP : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
+def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
+
+// Hardware registers
+def HWRegs : RegisterClass<"Mips", [i32], 32, (sequence "HWR%u", 0, 31)>,
+             Unallocatable;
+
+// Accumulator Registers
+def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+  let Size = 64;
+}
+
+def ACC128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+  let Size = 128;
+}
+
+def ACC64DSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+  let Size = 64;
+}
+
+def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
+
+// Coprocessor 2 registers.
+def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
+           Unallocatable;
+
+// Coprocessor 3 registers.
+def COP3 : RegisterClass<"Mips", [i32], 32, (sequence "COP3%u", 0, 31)>,
+           Unallocatable;
+
+// Octeon multiplier and product registers
+def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>,
+                 Unallocatable;
+def OCTEON_P : RegisterClass<"Mips", [i64], 64, (add P0, P1, P2)>,
+               Unallocatable;
+
+// Register Operands.
+
+class MipsAsmRegOperand : AsmOperandClass {
+  let ParserMethod = "ParseAnyRegister";
+}
+
+def GPR64AsmOperand : MipsAsmRegOperand {
+  let Name = "GPR64AsmReg";
+  let PredicateMethod = "isGPRAsmReg";
+}
+
+def GPR32AsmOperand : MipsAsmRegOperand {
+  let Name = "GPR32AsmReg";
+  let PredicateMethod = "isGPRAsmReg";
+}
+
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "ACC64DSPAsmReg";
+  let PredicateMethod = "isACCAsmReg";
+}
+
+def HI32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "HI32DSPAsmReg";
+  let PredicateMethod = "isACCAsmReg";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "LO32DSPAsmReg";
+  let PredicateMethod = "isACCAsmReg";
+}
+
+def CCRAsmOperand : MipsAsmRegOperand {
+  let Name = "CCRAsmReg";
+}
+
+def AFGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "AFGR64AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "FGR64AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGR32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGR32AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGRH32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGRH32AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FCCRegsAsmOperand : MipsAsmRegOperand {
+  let Name = "FCCAsmReg";
+}
+
+def MSA128AsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128AsmReg";
+}
+
+def MSACtrlAsmOperand : MipsAsmRegOperand {
+  let Name = "MSACtrlAsmReg";
+}
+
+def GPR32Opnd : RegisterOperand<GPR32> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+def GPR64Opnd : RegisterOperand<GPR64> {
+  let ParserMatchClass = GPR64AsmOperand;
+}
+
+def DSPROpnd : RegisterOperand<DSPR> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+def CCROpnd : RegisterOperand<CCR> {
+  let ParserMatchClass = CCRAsmOperand;
+}
+
+def HWRegsAsmOperand : MipsAsmRegOperand {
+  let Name = "HWRegsAsmReg";
+}
+
+def COP2AsmOperand : MipsAsmRegOperand {
+  let Name = "COP2AsmReg";
+}
+
+def COP3AsmOperand : MipsAsmRegOperand {
+  let Name = "COP3AsmReg";
+}
+
+def HWRegsOpnd : RegisterOperand<HWRegs> {
+  let ParserMatchClass = HWRegsAsmOperand;
+}
+
+def AFGR64Opnd : RegisterOperand<AFGR64> {
+  let ParserMatchClass = AFGR64AsmOperand;
+}
+
+def FGR64Opnd : RegisterOperand<FGR64> {
+  let ParserMatchClass = FGR64AsmOperand;
+}
+
+def FGR32Opnd : RegisterOperand<FGR32> {
+  let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRCCOpnd : RegisterOperand<FGRCC> {
+  // The assembler doesn't use register classes so we can re-use
+  // FGR32AsmOperand.
+  let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRH32Opnd : RegisterOperand<FGRH32> {
+  let ParserMatchClass = FGRH32AsmOperand;
+}
+
+def FCCRegsOpnd : RegisterOperand<FCC> {
+  let ParserMatchClass = FCCRegsAsmOperand;
+}
+
+def LO32DSPOpnd : RegisterOperand<LO32DSP> {
+  let ParserMatchClass = LO32DSPAsmOperand;
+}
+
+def HI32DSPOpnd : RegisterOperand<HI32DSP> {
+  let ParserMatchClass = HI32DSPAsmOperand;
+}
+
+def ACC64DSPOpnd : RegisterOperand<ACC64DSP> {
+  let ParserMatchClass = ACC64DSPAsmOperand;
+}
+
+def COP2Opnd : RegisterOperand<COP2> {
+  let ParserMatchClass = COP2AsmOperand;
+}
+
+def COP3Opnd : RegisterOperand<COP3> {
+  let ParserMatchClass = COP3AsmOperand;
+}
+
+def MSA128BOpnd : RegisterOperand<MSA128B> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128HOpnd : RegisterOperand<MSA128H> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128WOpnd : RegisterOperand<MSA128W> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128DOpnd : RegisterOperand<MSA128D> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128CROpnd : RegisterOperand<MSACtrl> {
+  let ParserMatchClass = MSACtrlAsmOperand;
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsRelocations.h b/contrib/llvm/lib/Target/Mips/MipsRelocations.h
new file mode 100644
index 0000000..0787ed3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRelocations.h
@@ -0,0 +1,41 @@
+//===-- MipsRelocations.h - Mips Code Relocations ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Mips target-specific relocation types
+// (for relocation-model=static).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSRELOCATIONS_H_
+#define MIPSRELOCATIONS_H_
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace Mips{
+    enum RelocationType {
+      // reloc_mips_pc16 - pc relative relocation for branches. The lower 18
+      // bits of the difference between the branch target and the branch
+      // instruction, shifted right by 2.
+      reloc_mips_pc16 = 1,
+
+      // reloc_mips_hi - upper 16 bits of the address (modified by +1 if the
+      // lower 16 bits of the address is negative).
+      reloc_mips_hi = 2,
+
+      // reloc_mips_lo - lower 16 bits of the address.
+      reloc_mips_lo = 3,
+
+      // reloc_mips_26 - lower 28 bits of the address, shifted right by 2.
+      reloc_mips_26 = 4
+    };
+  }
+}
+
+#endif /* MIPSRELOCATIONS_H_ */
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
new file mode 100644
index 0000000..d0a17cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -0,0 +1,705 @@
+//===-- MipsSEFrameLowering.cpp - Mips32/64 Frame Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+typedef MachineBasicBlock::iterator Iter;
+
+static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
+  if (Mips::ACC64RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI,
+                          (unsigned)Mips::PseudoMFLO);
+
+  if (Mips::ACC64DSPRegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::MFHI_DSP, (unsigned)Mips::MFLO_DSP);
+
+  if (Mips::ACC128RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI64,
+                          (unsigned)Mips::PseudoMFLO64);
+
+  return std::make_pair(0, 0);
+}
+
+/// Helper class to expand pseudos.
+class ExpandPseudo {
+public:
+  ExpandPseudo(MachineFunction &MF);
+  bool expand();
+
+private:
+  bool expandInstr(MachineBasicBlock &MBB, Iter I);
+  void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
+  void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
+  void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                      unsigned MFLoOpc, unsigned RegSize);
+  bool expandCopy(MachineBasicBlock &MBB, Iter I);
+  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                     unsigned MFLoOpc);
+  bool expandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, bool FP64) const;
+  bool expandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, bool FP64) const;
+
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+};
+}
+
+ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
+  : MF(MF_), MRI(MF.getRegInfo()) {}
+
+bool ExpandPseudo::expand() {
+  bool Expanded = false;
+
+  for (MachineFunction::iterator BB = MF.begin(), BBEnd = MF.end();
+       BB != BBEnd; ++BB)
+    for (Iter I = BB->begin(), End = BB->end(); I != End;)
+      Expanded |= expandInstr(*BB, I++);
+
+  return Expanded;
+}
+
+bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
+  switch(I->getOpcode()) {
+  case Mips::LOAD_CCOND_DSP:
+    expandLoadCCond(MBB, I);
+    break;
+  case Mips::STORE_CCOND_DSP:
+    expandStoreCCond(MBB, I);
+    break;
+  case Mips::LOAD_ACC64:
+  case Mips::LOAD_ACC64DSP:
+    expandLoadACC(MBB, I, 4);
+    break;
+  case Mips::LOAD_ACC128:
+    expandLoadACC(MBB, I, 8);
+    break;
+  case Mips::STORE_ACC64:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI, Mips::PseudoMFLO, 4);
+    break;
+  case Mips::STORE_ACC64DSP:
+    expandStoreACC(MBB, I, Mips::MFHI_DSP, Mips::MFLO_DSP, 4);
+    break;
+  case Mips::STORE_ACC128:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
+    break;
+  case Mips::BuildPairF64:
+    if (expandBuildPairF64(MBB, I, false))
+      MBB.erase(I);
+    return false;
+  case Mips::BuildPairF64_64:
+    if (expandBuildPairF64(MBB, I, true))
+      MBB.erase(I);
+    return false;
+  case Mips::ExtractElementF64:
+    if (expandExtractElementF64(MBB, I, false))
+      MBB.erase(I);
+    return false;
+  case Mips::ExtractElementF64_64:
+    if (expandExtractElementF64(MBB, I, true))
+      MBB.erase(I);
+    return false;
+  case TargetOpcode::COPY:
+    if (!expandCopy(MBB, I))
+      return false;
+    break;
+  default:
+    return false;
+  }
+
+  MBB.erase(I);
+  return true;
+}
+
+void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
+  //  load $vr, FI
+  //  copy ccond, $vr
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(4);
+  unsigned VR = MRI.createVirtualRegister(RC);
+  unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+
+  TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst)
+    .addReg(VR, RegState::Kill);
+}
+
+void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
+  //  copy $vr, ccond
+  //  store $vr, FI
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(4);
+  unsigned VR = MRI.createVirtualRegister(RC);
+  unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+
+  BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR)
+    .addReg(Src, getKillRegState(I->getOperand(0).isKill()));
+  TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0);
+}
+
+void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned RegSize) {
+  //  load $vr0, FI
+  //  copy lo, $vr0
+  //  load $vr1, FI + 4
+  //  copy hi, $vr1
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+  unsigned Lo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+  unsigned Hi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+
+  TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill);
+  TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize);
+  BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill);
+}
+
+void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
+                                  unsigned MFHiOpc, unsigned MFLoOpc,
+                                  unsigned RegSize) {
+  //  mflo $vr0, src
+  //  store $vr0, FI
+  //  mfhi $vr1, src
+  //  store $vr1, FI + 4
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+  unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
+  DebugLoc DL = I->getDebugLoc();
+
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
+  TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
+  TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
+}
+
+bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
+  unsigned Src = I->getOperand(1).getReg();
+  std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src);
+
+  if (!Opcodes.first)
+    return false;
+
+  return expandCopyACC(MBB, I, Opcodes.first, Opcodes.second);
+}
+
+bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned MFHiOpc, unsigned MFLoOpc) {
+  //  mflo $vr0, src
+  //  copy dst_lo, $vr0
+  //  mfhi $vr1, src
+  //  copy dst_hi, $vr1
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+  unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
+  const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
+  unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+  unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstLo)
+    .addReg(VR0, RegState::Kill);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstHi)
+    .addReg(VR1, RegState::Kill);
+  return true;
+}
+
+/// This method expands the same instruction that MipsSEInstrInfo::
+/// expandBuildPairF64 does, for the case when ABI is fpxx and mthc1 is not
+/// available and the case where the ABI is FP64A. It is implemented here
+/// because frame indexes are eliminated before MipsSEInstrInfo::
+/// expandBuildPairF64 is called.
+bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      bool FP64) const {
+  // For fpxx and when mthc1 is not available, use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32-bits is
+  // transferred with mtc1 which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation so for now we use a spill/reload sequence for all
+  // double-precision values in regardless of being an odd/even register.
+
+  const TargetMachine &TM = MF.getTarget();
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    const MipsSEInstrInfo &TII =
+      *static_cast<const MipsSEInstrInfo*>(TM.getInstrInfo());
+    const MipsRegisterInfo &TRI =
+      *static_cast<const MipsRegisterInfo*>(TM.getRegisterInfo());
+
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned LoReg = I->getOperand(1).getReg();
+    unsigned HiReg = I->getOperand(2).getReg();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mthc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+    const TargetRegisterClass *RC2 =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
+    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, &TRI,
+                        0);
+    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, &TRI,
+                        4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, 0);
+    return true;
+  }
+
+  return false;
+}
+
+/// This method expands the same instruction that MipsSEInstrInfo::
+/// expandExtractElementF64 does, for the case when ABI is fpxx and mfhc1 is not
+/// available and the case where the ABI is FP64A. It is implemented here
+/// because frame indexes are eliminated before MipsSEInstrInfo::
+/// expandExtractElementF64 is called.
+bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           bool FP64) const {
+  // For fpxx and when mfhc1 is not available, use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmfc1 is available doesn't need to be handled here
+  // because it never creates a ExtractElementF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32-bits is
+  // transferred with mfc1 which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation so for now we use a spill/reload sequence for all
+  // double-precision values in regardless of being an odd/even register.
+
+  const TargetMachine &TM = MF.getTarget();
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    const MipsSEInstrInfo &TII =
+        *static_cast<const MipsSEInstrInfo *>(TM.getInstrInfo());
+    const MipsRegisterInfo &TRI =
+        *static_cast<const MipsRegisterInfo *>(TM.getRegisterInfo());
+
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned SrcReg = I->getOperand(1).getReg();
+    unsigned N = I->getOperand(2).getImm();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mfhc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+    const TargetRegisterClass *RC2 = &Mips::GPR32RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
+    TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC, &TRI,
+                        0);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, N * 4);
+    return true;
+  }
+
+  return false;
+}
+
+MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
+unsigned MipsSEFrameLowering::ehDataReg(unsigned I) const {
+  static const unsigned EhDataReg[] = {
+    Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+  static const unsigned EhDataReg64[] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64
+  };
+
+  return STI.isABI_N64() ? EhDataReg64[I] : EhDataReg[I];
+}
+
+void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB   = MF.front();
+  MachineFrameInfo *MFI    = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+  unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+
+  // First, compute final stack size.
+  uint64_t StackSize = MFI->getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  MachineLocation DstML, SrcML;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  unsigned CFIIndex = MMI.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  if (CSI.size()) {
+    // Find the instruction past the last instruction that saves a callee-saved
+    // register to the stack.
+    for (unsigned i = 0; i < CSI.size(); ++i)
+      ++MBBI;
+
+    // Iterate over list of callee-saved registers and emit .cfi_offset
+    // directives.
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+           E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+
+      // If Reg is a double precision register, emit two cfa_offsets,
+      // one for each of the paired single precision registers.
+      if (Mips::AFGR64RegClass.contains(Reg)) {
+        unsigned Reg0 =
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_lo), true);
+        unsigned Reg1 =
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_hi), true);
+
+        if (!STI.isLittle())
+          std::swap(Reg0, Reg1);
+
+        unsigned CFIIndex = MMI.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+
+        CFIIndex = MMI.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      } else if (Mips::FGR64RegClass.contains(Reg)) {
+        unsigned Reg0 = MRI->getDwarfRegNum(Reg, true);
+        unsigned Reg1 = MRI->getDwarfRegNum(Reg, true) + 1;
+
+        if (!STI.isLittle())
+          std::swap(Reg0, Reg1);
+
+        unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+
+        CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      } else {
+        // Reg is either in GPR32 or FGR32.
+        unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+    }
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    const TargetRegisterClass *RC = STI.isABI_N64() ?
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+    // Insert instructions that spill eh data registers.
+    for (int I = 0; I < 4; ++I) {
+      if (!MBB.isLiveIn(ehDataReg(I)))
+        MBB.addLiveIn(ehDataReg(I));
+      TII.storeRegToStackSlot(MBB, MBBI, ehDataReg(I), false,
+                              MipsFI->getEhDataRegFI(I), RC, &RegInfo);
+    }
+
+    // Emit .cfi_offset directives for eh data registers.
+    for (int I = 0; I < 4; ++I) {
+      int64_t Offset = MFI->getObjectOffset(MipsFI->getEhDataRegFI(I));
+      unsigned Reg = MRI->getDwarfRegNum(ehDataReg(I), true);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  // if framepointer enabled, set it to point to the stack pointer.
+  if (hasFP(MF)) {
+    // Insert instruction "move $fp, $sp" at this location.
+    BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    // emit ".cfi_def_cfa_register $fp"
+    unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+        nullptr, MRI->getDwarfRegNum(FP, true)));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+}
+
+void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  DebugLoc dl = MBBI->getDebugLoc();
+  unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+  unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+
+  // if framepointer enabled, restore the stack pointer.
+  if (hasFP(MF)) {
+    // Find the first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+
+    for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instruction "move $sp, $fp" at this location.
+    BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO);
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    const TargetRegisterClass *RC = STI.isABI_N64() ?
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+    // Find first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+    for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instructions that restore eh data registers.
+    for (int J = 0; J < 4; ++J) {
+      TII.loadRegFromStackSlot(MBB, I, ehDataReg(J), MipsFI->getEhDataRegFI(J),
+                               RC, &RegInfo);
+    }
+  }
+
+  // Get the number of bytes from FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (!StackSize)
+    return;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, StackSize, MBB, MBBI);
+}
+
+bool MipsSEFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = MF->begin();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::LowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
+        && MF->getFrameInfo()->isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+
+    // Insert the spill to the stack frame.
+    bool IsKill = !IsRAAndRetAddrIsTaken;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+                            CSI[i].getFrameIdx(), RC, TRI);
+  }
+
+  return true;
+}
+
+bool
+MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Reserve call frame if the size of the maximum call frame fits into 16-bit
+  // immediate field and there are no variable sized objects on the stack.
+  // Make sure the second register scavenger spill slot can be accessed with one
+  // instruction.
+  return isInt<16>(MFI->getMaxCallFrameSize() + getStackAlignment()) &&
+    !MFI->hasVarSizedObjects();
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
+void MipsSEFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  if (!hasReservedCallFrame(MF)) {
+    int64_t Amount = I->getOperand(0).getImm();
+
+    if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+      Amount = -Amount;
+
+    unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+    TII.adjustStackPtr(SP, Amount, MBB, I);
+  }
+
+  MBB.erase(I);
+}
+
+void MipsSEFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+
+  // Mark $fp as used if function has dedicated frame pointer.
+  if (hasFP(MF))
+    MRI.setPhysRegUsed(FP);
+
+  // Create spill slots for eh data registers if function calls eh_return.
+  if (MipsFI->callsEhReturn())
+    MipsFI->createEhDataRegsFI();
+
+  // Expand pseudo instructions which load, store or copy accumulators.
+  // Add an emergency spill slot if a pseudo was expanded.
+  if (ExpandPseudo(MF).expand()) {
+    // The spill slot should be half the size of the accumulator. If target is
+    // mips64, it should be 64-bit, otherwise it should be 32-bt.
+    const TargetRegisterClass *RC = STI.hasMips64() ?
+      &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+    int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+                                                  RC->getAlignment(), false);
+    RS->addScavengingFrameIndex(FI);
+  }
+
+  // Set scavenging frame index if necessary.
+  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
+    estimateStackSize(MF);
+
+  if (isInt<16>(MaxSPOffset))
+    return;
+
+  const TargetRegisterClass *RC = STI.isABI_N64() ?
+    &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+                                                RC->getAlignment(), false);
+  RS->addScavengingFrameIndex(FI);
+}
+
+const MipsFrameLowering *
+llvm::createMipsSEFrameLowering(const MipsSubtarget &ST) {
+  return new MipsSEFrameLowering(ST);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
new file mode 100644
index 0000000..e832848
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -0,0 +1,48 @@
+//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSE_FRAMEINFO_H
+#define MIPSSE_FRAMEINFO_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+
+class MipsSEFrameLowering : public MipsFrameLowering {
+public:
+  explicit MipsSEFrameLowering(const MipsSubtarget &STI);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
+  unsigned ehDataReg(unsigned I) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
new file mode 100644
index 0000000..47e1931
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -0,0 +1,878 @@
+//===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  if (Subtarget->inMips16Mode())
+    return false;
+  return MipsDAGToDAGISel::runOnMachineFunction(MF);
+}
+
+void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+                                               MachineFunction &MF) {
+  MachineInstrBuilder MIB(MF, &MI);
+  unsigned Mask = MI.getOperand(1).getImm();
+  unsigned Flag = IsDef ? RegState::ImplicitDefine : RegState::Implicit;
+
+  if (Mask & 1)
+    MIB.addReg(Mips::DSPPos, Flag);
+
+  if (Mask & 2)
+    MIB.addReg(Mips::DSPSCount, Flag);
+
+  if (Mask & 4)
+    MIB.addReg(Mips::DSPCarry, Flag);
+
+  if (Mask & 8)
+    MIB.addReg(Mips::DSPOutFlag, Flag);
+
+  if (Mask & 16)
+    MIB.addReg(Mips::DSPCCond, Flag);
+
+  if (Mask & 32)
+    MIB.addReg(Mips::DSPEFI, Flag);
+}
+
+unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+  switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
+  default:
+    llvm_unreachable("Could not map int to register");
+  case 0: return Mips::MSAIR;
+  case 1: return Mips::MSACSR;
+  case 2: return Mips::MSAAccess;
+  case 3: return Mips::MSASave;
+  case 4: return Mips::MSAModify;
+  case 5: return Mips::MSARequest;
+  case 6: return Mips::MSAMap;
+  case 7: return Mips::MSAUnmap;
+  }
+}
+
+bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
+                                                const MachineInstr& MI) {
+  unsigned DstReg = 0, ZeroReg = 0;
+
+  // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
+  if ((MI.getOpcode() == Mips::ADDiu) &&
+      (MI.getOperand(1).getReg() == Mips::ZERO) &&
+      (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO;
+  } else if ((MI.getOpcode() == Mips::DADDiu) &&
+             (MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+             (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO_64;
+  }
+
+  if (!DstReg)
+    return false;
+
+  // Replace uses with ZeroReg.
+  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+       E = MRI->use_end(); U != E;) {
+    MachineOperand &MO = *U;
+    unsigned OpNo = U.getOperandNo();
+    MachineInstr *MI = MO.getParent();
+    ++U;
+
+    // Do not replace if it is a phi's operand or is tied to def operand.
+    if (MI->isPHI() || MI->isRegTiedToDefOperand(OpNo) || MI->isPseudo())
+      continue;
+
+    MO.setReg(ZeroReg);
+  }
+
+  return true;
+}
+
+void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC;
+
+  if (Subtarget->isABI_N64())
+    RC = (const TargetRegisterClass*)&Mips::GPR64RegClass;
+  else
+    RC = (const TargetRegisterClass*)&Mips::GPR32RegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+
+  if (Subtarget->isABI_N64()) {
+    MF.getRegInfo().addLiveIn(Mips::T9_64);
+    MBB.addLiveIn(Mips::T9_64);
+
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // daddu $v1, $v0, $t9
+    // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+      .addReg(Mips::T9_64);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  if (MF.getTarget().getRelocationModel() == Reloc::Static) {
+    // Set global register to __gnu_local_gp.
+    //
+    // lui   $v0, %hi(__gnu_local_gp)
+    // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
+    return;
+  }
+
+  MF.getRegInfo().addLiveIn(Mips::T9);
+  MBB.addLiveIn(Mips::T9);
+
+  if (Subtarget->isABI_N32()) {
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // addu $v1, $v0, $t9
+    // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  assert(Subtarget->isABI_O32());
+
+  // For O32 ABI, the following instruction sequence is emitted to initialize
+  // the global base register:
+  //
+  //  0. lui   $2, %hi(_gp_disp)
+  //  1. addiu $2, $2, %lo(_gp_disp)
+  //  2. addu  $globalbasereg, $2, $t9
+  //
+  // We emit only the last instruction here.
+  //
+  // GNU linker requires that the first two instructions appear at the beginning
+  // of a function and no instructions be inserted before or between them.
+  // The two instructions are emitted during lowering to MC layer in order to
+  // avoid any reordering.
+  //
+  // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
+  // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
+  // reads it.
+  MF.getRegInfo().addLiveIn(Mips::V0);
+  MBB.addLiveIn(Mips::V0);
+  BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+    .addReg(Mips::V0).addReg(Mips::T9);
+}
+
+void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
+       ++MFI)
+    for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+      if (I->getOpcode() == Mips::RDDSP)
+        addDSPCtrlRegOperands(false, *I, MF);
+      else if (I->getOpcode() == Mips::WRDSP)
+        addDSPCtrlRegOperands(true, *I, MF);
+      else
+        replaceUsesWithZeroReg(MRI, *I);
+    }
+}
+
+SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+                                           SDValue CmpLHS, SDLoc DL,
+                                           SDNode *Node) const {
+  unsigned Opc = InFlag.getOpcode(); (void)Opc;
+
+  assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+          (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+         "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+  SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+  SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, DL, VT, Ops);
+  SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, DL, VT,
+                                            SDValue(Carry, 0), RHS);
+  return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
+                              SDValue(AddCarry, 0));
+}
+
+/// Match frameindex
+bool MipsSEDAGToDAGISel::selectAddrFrameIndex(SDValue Addr, SDValue &Base,
+                                              SDValue &Offset) const {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    EVT ValTy = Addr.getValueType();
+
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
+    return true;
+  }
+  return false;
+}
+
+/// Match frameindex+offset and frameindex|offset
+bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base,
+                                                    SDValue &Offset,
+                                                    unsigned OffsetBits) const {
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isIntN(OffsetBits, CN->getSExtValue())) {
+      EVT ValTy = Addr.getValueType();
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  // if Address is FI, get the TargetFrameIndex.
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+
+  // Addresses of the form FI+const or FI|const
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16))
+    return true;
+
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, Addr.getValueType());
+  return true;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  return selectAddrRegImm(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectAddrRegImm10(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10))
+    return true;
+
+  return false;
+}
+
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 12))
+    return true;
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  return selectAddrRegImm12(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  if (selectAddrRegImm10(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrDefault(Addr, Base, Offset))
+    return true;
+
+  return false;
+}
+
+// Select constant vector splats.
+//
+// Returns true and sets Imm if:
+// * MSA is enabled
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+  if (!Subtarget->hasMSA())
+    return false;
+
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
+
+  if (!Node)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                             HasAnyUndefs, 8,
+                             !Subtarget->isLittle()))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Select constant vector splats.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value fits in an integer with the specified signed-ness and
+//   width.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+//
+// It's worth noting that this function is not used as part of the selection
+// of ldi.[bhwd] since it does not permit using the wrong-typed ldi.[bhwd]
+// instruction to achieve the desired bit pattern. ldi.[bhwd] is selected in
+// MipsSEDAGToDAGISel::selectNode.
+bool MipsSEDAGToDAGISel::
+selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                   unsigned ImmBitSize) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat (N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
+        (!Signed && ImmValue.isIntN(ImmBitSize))) {
+      Imm = CurDAG->getTargetConstant(ImmValue, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 1);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 2);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 3);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 4);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 5);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 6);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 8);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, true, 5);
+}
+
+// Select constant vector splats whose value is a power of 2.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a power of two.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat (N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = ImmValue.exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of left-most bits set (e.g. 0b11...1100...00).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of left-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero from the bitwise
+    // inverse of ImmValue, and test that the inverse of this is the same
+    // as the original value.
+    if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
+
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of right-most bits set (e.g. 0b00...0011...11).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of right-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero, and test that the
+    // result is the same as the original value
+    if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
+                                                 SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = (~ImmValue).exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  SDLoc DL(Node);
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  SDNode *Result;
+
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE: {
+    SDValue InFlag = Node->getOperand(2);
+    Result = selectAddESubE(Mips::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
+    return std::make_pair(true, Result);
+  }
+
+  case ISD::ADDE: {
+    if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
+      break;
+    SDValue InFlag = Node->getOperand(2);
+    Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
+    return std::make_pair(true, Result);
+  }
+
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
+    if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
+      if (Subtarget->isGP64bit()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO_64, MVT::i64);
+        Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+      } else if (Subtarget->isFP64bit()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
+                                        Zero, Zero);
+      } else {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        Result = CurDAG->getMachineNode(Mips::BuildPairF64, DL, MVT::f64, Zero,
+                                        Zero);
+      }
+
+      return std::make_pair(true, Result);
+    }
+    break;
+  }
+
+  case ISD::Constant: {
+    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
+    unsigned Size = CN->getValueSizeInBits(0);
+
+    if (Size == 32)
+      break;
+
+    MipsAnalyzeImmediate AnalyzeImm;
+    int64_t Imm = CN->getSExtValue();
+
+    const MipsAnalyzeImmediate::InstSeq &Seq =
+      AnalyzeImm.Analyze(Imm, Size, false);
+
+    MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+    SDLoc DL(CN);
+    SDNode *RegOpnd;
+    SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                                MVT::i64);
+
+    // The first instruction can be a LUi which is different from other
+    // instructions (ADDiu, ORI and SLL) in that it does not have a register
+    // operand.
+    if (Inst->Opc == Mips::LUi64)
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
+    else
+      RegOpnd =
+        CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                               CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
+                               ImmOpnd);
+
+    // The remaining instructions in the sequence are handled here.
+    for (++Inst; Inst != Seq.end(); ++Inst) {
+      ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                          MVT::i64);
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                                       SDValue(RegOpnd, 0), ImmOpnd);
+    }
+
+    return std::make_pair(true, RegOpnd);
+  }
+
+  case ISD::INTRINSIC_W_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_cfcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx = Node->getOperand(2);
+      SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
+                                           getMSACtrlReg(RegIdx), MVT::i32);
+      return std::make_pair(true, Reg.getNode());
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_move_v:
+      // Like an assignment but will always produce a move.v even if
+      // unnecessary.
+      return std::make_pair(true,
+                            CurDAG->getMachineNode(Mips::MOVE_V, DL,
+                                                   Node->getValueType(0),
+                                                   Node->getOperand(1)));
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_VOID: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_ctcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx  = Node->getOperand(2);
+      SDValue Value   = Node->getOperand(3);
+      SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
+                                              getMSACtrlReg(RegIdx), Value);
+      return std::make_pair(true, ChainOut.getNode());
+    }
+    }
+    break;
+  }
+
+  case MipsISD::ThreadPointer: {
+    EVT PtrVT = getTargetLowering()->getPointerTy();
+    unsigned RdhwrOpc, DestReg;
+
+    if (PtrVT == MVT::i32) {
+      RdhwrOpc = Mips::RDHWR;
+      DestReg = Mips::V1;
+    } else {
+      RdhwrOpc = Mips::RDHWR64;
+      DestReg = Mips::V1_64;
+    }
+
+    SDNode *Rdhwr =
+      CurDAG->getMachineNode(RdhwrOpc, SDLoc(Node),
+                             Node->getValueType(0),
+                             CurDAG->getRegister(Mips::HWR29, MVT::i32));
+    SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
+                                         SDValue(Rdhwr, 0));
+    SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
+    ReplaceUses(SDValue(Node, 0), ResNode);
+    return std::make_pair(true, ResNode.getNode());
+  }
+
+  case ISD::BUILD_VECTOR: {
+    // Select appropriate ldi.[bhwd] instructions for constant splats of
+    // 128-bit when MSA is enabled. Fixup any register class mismatches that
+    // occur as a result.
+    //
+    // This allows the compiler to use a wider range of immediates than would
+    // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
+    // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
+    // 0x01010101 } without using a constant pool. This would be sub-optimal
+    // when // 'ldi.b wd, 1' is capable of producing that bit-pattern in the
+    // same set/ of registers. Similarly, ldi.h isn't capable of producing {
+    // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    unsigned LdiOp;
+    EVT ResVecTy = BVN->getValueType(0);
+    EVT ViaVecTy;
+
+    if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector())
+      return std::make_pair(false, nullptr);
+
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, 8,
+                              !Subtarget->isLittle()))
+      return std::make_pair(false, nullptr);
+
+    switch (SplatBitSize) {
+    default:
+      return std::make_pair(false, nullptr);
+    case 8:
+      LdiOp = Mips::LDI_B;
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      LdiOp = Mips::LDI_H;
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      LdiOp = Mips::LDI_W;
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      LdiOp = Mips::LDI_D;
+      ViaVecTy = MVT::v2i64;
+      break;
+    }
+
+    if (!SplatValue.isSignedIntN(10))
+      return std::make_pair(false, nullptr);
+
+    SDValue Imm = CurDAG->getTargetConstant(SplatValue,
+                                            ViaVecTy.getVectorElementType());
+
+    SDNode *Res = CurDAG->getMachineNode(LdiOp, SDLoc(Node), ViaVecTy, Imm);
+
+    if (ResVecTy != ViaVecTy) {
+      // If LdiOp is writing to a different register class to ResVecTy, then
+      // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
+      // since the source and destination register sets contain the same
+      // registers.
+      const TargetLowering *TLI = getTargetLowering();
+      MVT ResVecTySimple = ResVecTy.getSimpleVT();
+      const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
+      Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, SDLoc(Node),
+                                   ResVecTy, SDValue(Res, 0),
+                                   CurDAG->getTargetConstant(RC->getID(),
+                                                             MVT::i32));
+    }
+
+    return std::make_pair(true, Res);
+  }
+
+  }
+
+  return std::make_pair(false, nullptr);
+}
+
+FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) {
+  return new MipsSEDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
new file mode 100644
index 0000000..57328d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -0,0 +1,117 @@
+//===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEISELDAGTODAG_H
+#define MIPSSEISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
+
+public:
+  explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+
+private:
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+                             MachineFunction &MF);
+
+  unsigned getMSACtrlReg(const SDValue RegIdx) const;
+
+  bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
+
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc dl,
+                                         EVT Ty, bool HasLo, bool HasHi);
+
+  SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+                         SDLoc DL, SDNode *Node) const;
+
+  bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
+  bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
+                                  unsigned OffsetBits) const;
+
+  bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
+
+  bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
+
+  bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const override;
+
+  bool selectIntAddr(SDValue Addr, SDValue &Base,
+                     SDValue &Offset) const override;
+
+  bool selectAddrRegImm10(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
+
+  bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
+
+  bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                       SDValue &Offset) const override;
+
+  bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
+
+  /// \brief Select constant vector splats.
+  bool selectVSplat(SDNode *N, APInt &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a given integer.
+  bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                                  unsigned ImmBitSize) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
+
+  std::pair<bool, SDNode*> selectNode(SDNode *Node) override;
+
+  void processFunctionAfterISel(MachineFunction &MF) override;
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+};
+
+FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
new file mode 100644
index 0000000..8173615
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -0,0 +1,3236 @@
+//===-- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsSEISelLowering.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+static cl::opt<bool>
+EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
+                    cl::desc("MIPS: Enable tail calls."), cl::init(false));
+
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+                                   cl::desc("Expand double precision loads and "
+                                            "stores to their single precision "
+                                            "counterparts"));
+
+MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
+
+  if (Subtarget.isGP64bit())
+    addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
+
+  if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
+    // Expand all truncating stores and extending loads.
+    unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+    unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+
+    for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
+      for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
+        setTruncStoreAction((MVT::SimpleValueType)VT0,
+                            (MVT::SimpleValueType)VT1, Expand);
+
+      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+    }
+  }
+
+  if (Subtarget.hasDSP()) {
+    MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+    for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+      addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
+
+      // Expand all builtin opcodes.
+      for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+        setOperationAction(Opc, VecTys[i], Expand);
+
+      setOperationAction(ISD::ADD, VecTys[i], Legal);
+      setOperationAction(ISD::SUB, VecTys[i], Legal);
+      setOperationAction(ISD::LOAD, VecTys[i], Legal);
+      setOperationAction(ISD::STORE, VecTys[i], Legal);
+      setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+    }
+
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SETCC);
+    setTargetDAGCombine(ISD::VSELECT);
+  }
+
+  if (Subtarget.hasDSPR2())
+    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+
+  if (Subtarget.hasMSA()) {
+    addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
+    addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
+    addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
+    addMSAIntType(MVT::v2i64, &Mips::MSA128DRegClass);
+    addMSAFloatType(MVT::v8f16, &Mips::MSA128HRegClass);
+    addMSAFloatType(MVT::v4f32, &Mips::MSA128WRegClass);
+    addMSAFloatType(MVT::v2f64, &Mips::MSA128DRegClass);
+
+    setTargetDAGCombine(ISD::AND);
+    setTargetDAGCombine(ISD::OR);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::VSELECT);
+    setTargetDAGCombine(ISD::XOR);
+  }
+
+  if (!Subtarget.abiUsesSoftFloat()) {
+    addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+
+    // When dealing with single precision only, use libcalls
+    if (!Subtarget.isSingleFloat()) {
+      if (Subtarget.isFP64bit())
+        addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
+      else
+        addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
+    }
+  }
+
+  setOperationAction(ISD::SMUL_LOHI,          MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI,          MVT::i32, Custom);
+  setOperationAction(ISD::MULHS,              MVT::i32, Custom);
+  setOperationAction(ISD::MULHU,              MVT::i32, Custom);
+
+  if (Subtarget.hasCnMips())
+    setOperationAction(ISD::MUL,              MVT::i64, Legal);
+  else if (Subtarget.isGP64bit())
+    setOperationAction(ISD::MUL,              MVT::i64, Custom);
+
+  if (Subtarget.isGP64bit()) {
+    setOperationAction(ISD::MULHS,            MVT::i64, Custom);
+    setOperationAction(ISD::MULHU,            MVT::i64, Custom);
+  }
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN,  MVT::i64, Custom);
+
+  setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
+  setOperationAction(ISD::LOAD,               MVT::i32, Custom);
+  setOperationAction(ISD::STORE,              MVT::i32, Custom);
+
+  setTargetDAGCombine(ISD::ADDE);
+  setTargetDAGCombine(ISD::SUBE);
+  setTargetDAGCombine(ISD::MUL);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+  if (NoDPLoadStore) {
+    setOperationAction(ISD::LOAD, MVT::f64, Custom);
+    setOperationAction(ISD::STORE, MVT::f64, Custom);
+  }
+
+  if (Subtarget.hasMips32r6()) {
+    // MIPS32r6 replaces the accumulator-based multiplies with a three register
+    // instruction
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::MUL, MVT::i32, Legal);
+    setOperationAction(ISD::MULHS, MVT::i32, Legal);
+    setOperationAction(ISD::MULHU, MVT::i32, Legal);
+
+    // MIPS32r6 replaces the accumulator-based division/remainder with separate
+    // three register division and remainder instructions.
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::SDIV, MVT::i32, Legal);
+    setOperationAction(ISD::UDIV, MVT::i32, Legal);
+    setOperationAction(ISD::SREM, MVT::i32, Legal);
+    setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+    // MIPS32r6 replaces conditional moves with an equivalent that removes the
+    // need for three GPR read ports.
+    setOperationAction(ISD::SETCC, MVT::i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::i32, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+
+    setOperationAction(ISD::SETCC, MVT::f32, Legal);
+    setOperationAction(ISD::SELECT, MVT::f32, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+
+    assert(Subtarget.isFP64bit() && "FR=1 is required for MIPS32r6");
+    setOperationAction(ISD::SETCC, MVT::f64, Legal);
+    setOperationAction(ISD::SELECT, MVT::f64, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+    setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+
+    // Floating point > and >= are supported via < and <=
+    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+
+    setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  }
+
+  if (Subtarget.hasMips64r6()) {
+    // MIPS64r6 replaces the accumulator-based multiplies with a three register
+    // instruction
+    setOperationAction(ISD::MUL, MVT::i64, Legal);
+    setOperationAction(ISD::MULHS, MVT::i64, Legal);
+    setOperationAction(ISD::MULHU, MVT::i64, Legal);
+
+    // MIPS32r6 replaces the accumulator-based division/remainder with separate
+    // three register division and remainder instructions.
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIV, MVT::i64, Legal);
+    setOperationAction(ISD::UDIV, MVT::i64, Legal);
+    setOperationAction(ISD::SREM, MVT::i64, Legal);
+    setOperationAction(ISD::UREM, MVT::i64, Legal);
+
+    // MIPS64r6 replaces conditional moves with an equivalent that removes the
+    // need for three GPR read ports.
+    setOperationAction(ISD::SETCC, MVT::i64, Legal);
+    setOperationAction(ISD::SELECT, MVT::i64, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  }
+
+  computeRegisterProperties();
+}
+
+const MipsTargetLowering *
+llvm::createMipsSETargetLowering(MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  return new MipsSETargetLowering(TM, STI);
+}
+
+const TargetRegisterClass *
+MipsSETargetLowering::getRepRegClassFor(MVT VT) const {
+  if (VT == MVT::Untyped)
+    return Subtarget.hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass;
+
+  return TargetLowering::getRepRegClassFor(VT);
+}
+
+// Enable MSA support for the given integer type and Register class.
+void MipsSETargetLowering::
+addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  setOperationAction(ISD::ADD, Ty, Legal);
+  setOperationAction(ISD::AND, Ty, Legal);
+  setOperationAction(ISD::CTLZ, Ty, Legal);
+  setOperationAction(ISD::CTPOP, Ty, Legal);
+  setOperationAction(ISD::MUL, Ty, Legal);
+  setOperationAction(ISD::OR, Ty, Legal);
+  setOperationAction(ISD::SDIV, Ty, Legal);
+  setOperationAction(ISD::SREM, Ty, Legal);
+  setOperationAction(ISD::SHL, Ty, Legal);
+  setOperationAction(ISD::SRA, Ty, Legal);
+  setOperationAction(ISD::SRL, Ty, Legal);
+  setOperationAction(ISD::SUB, Ty, Legal);
+  setOperationAction(ISD::UDIV, Ty, Legal);
+  setOperationAction(ISD::UREM, Ty, Legal);
+  setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
+  setOperationAction(ISD::VSELECT, Ty, Legal);
+  setOperationAction(ISD::XOR, Ty, Legal);
+
+  if (Ty == MVT::v4i32 || Ty == MVT::v2i64) {
+    setOperationAction(ISD::FP_TO_SINT, Ty, Legal);
+    setOperationAction(ISD::FP_TO_UINT, Ty, Legal);
+    setOperationAction(ISD::SINT_TO_FP, Ty, Legal);
+    setOperationAction(ISD::UINT_TO_FP, Ty, Legal);
+  }
+
+  setOperationAction(ISD::SETCC, Ty, Legal);
+  setCondCodeAction(ISD::SETNE, Ty, Expand);
+  setCondCodeAction(ISD::SETGE, Ty, Expand);
+  setCondCodeAction(ISD::SETGT, Ty, Expand);
+  setCondCodeAction(ISD::SETUGE, Ty, Expand);
+  setCondCodeAction(ISD::SETUGT, Ty, Expand);
+}
+
+// Enable MSA support for the given floating-point type and Register class.
+void MipsSETargetLowering::
+addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  if (Ty != MVT::v8f16) {
+    setOperationAction(ISD::FABS,  Ty, Legal);
+    setOperationAction(ISD::FADD,  Ty, Legal);
+    setOperationAction(ISD::FDIV,  Ty, Legal);
+    setOperationAction(ISD::FEXP2, Ty, Legal);
+    setOperationAction(ISD::FLOG2, Ty, Legal);
+    setOperationAction(ISD::FMA,   Ty, Legal);
+    setOperationAction(ISD::FMUL,  Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FSQRT, Ty, Legal);
+    setOperationAction(ISD::FSUB,  Ty, Legal);
+    setOperationAction(ISD::VSELECT, Ty, Legal);
+
+    setOperationAction(ISD::SETCC, Ty, Legal);
+    setCondCodeAction(ISD::SETOGE, Ty, Expand);
+    setCondCodeAction(ISD::SETOGT, Ty, Expand);
+    setCondCodeAction(ISD::SETUGE, Ty, Expand);
+    setCondCodeAction(ISD::SETUGT, Ty, Expand);
+    setCondCodeAction(ISD::SETGE,  Ty, Expand);
+    setCondCodeAction(ISD::SETGT,  Ty, Expand);
+  }
+}
+
+bool
+MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                    unsigned,
+                                                    bool *Fast) const {
+  MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+
+  if (Subtarget.systemSupportsUnalignedAccess()) {
+    // MIPS32r6/MIPS64r6 is required to support unaligned access. It's
+    // implementation defined whether this is handled by hardware, software, or
+    // a hybrid of the two but it's expected that most implementations will
+    // handle the majority of cases in hardware.
+    if (Fast)
+      *Fast = true;
+    return true;
+  }
+
+  switch (SVT) {
+  case MVT::i64:
+  case MVT::i32:
+    if (Fast)
+      *Fast = true;
+    return true;
+  default:
+    return false;
+  }
+}
+
+SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch(Op.getOpcode()) {
+  case ISD::LOAD:  return lowerLOAD(Op, DAG);
+  case ISD::STORE: return lowerSTORE(Op, DAG);
+  case ISD::SMUL_LOHI: return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
+  case ISD::UMUL_LOHI: return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
+  case ISD::MULHS:     return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
+  case ISD::MULHU:     return lowerMulDiv(Op, MipsISD::Multu, false, true, DAG);
+  case ISD::MUL:       return lowerMulDiv(Op, MipsISD::Mult, true, false, DAG);
+  case ISD::SDIVREM:   return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
+  case ISD::UDIVREM:   return lowerMulDiv(Op, MipsISD::DivRemU, true, true,
+                                          DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:     return lowerINTRINSIC_VOID(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:       return lowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, DAG);
+  }
+
+  return MipsTargetLowering::LowerOperation(Op, DAG);
+}
+
+// selectMADD -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (addc multLo, Lo0), (adde multHi, Hi0),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
+  // ADDENode's second operand must be a flag output of an ADDC node in order
+  // for the matching to be successful.
+  SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
+
+  if (ADDCNode->getOpcode() != ISD::ADDC)
+    return false;
+
+  SDValue MultHi = ADDENode->getOperand(0);
+  SDValue MultLo = ADDCNode->getOperand(0);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo amd MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MADD only if ADDENode and ADDCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than ADDENode or ADDCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MADD instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  SDLoc DL(ADDENode);
+
+  // Initialize accumulator.
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+                                  ADDCNode->getOperand(1),
+                                  ADDENode->getOperand(1));
+
+  // create MipsMAdd(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
+
+  SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of adde and addc here
+  if (!SDValue(ADDCNode, 0).use_empty()) {
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
+  }
+  if (!SDValue(ADDENode, 0).use_empty()) {
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+// selectMSUB -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (addc Lo0, multLo), (sube Hi0, multHi),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
+  // SUBENode's second operand must be a flag output of an SUBC node in order
+  // for the matching to be successful.
+  SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
+
+  if (SUBCNode->getOpcode() != ISD::SUBC)
+    return false;
+
+  SDValue MultHi = SUBENode->getOperand(1);
+  SDValue MultLo = SUBCNode->getOperand(1);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo amd MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than SUBENode or SUBCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MSUB instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  SDLoc DL(SUBENode);
+
+  // Initialize accumulator.
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+                                  SUBCNode->getOperand(0),
+                                  SUBENode->getOperand(0));
+
+  // create MipsSub(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
+
+  SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of sube and subc here
+  if (!SDValue(SUBCNode, 0).use_empty()) {
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
+  }
+  if (!SDValue(SUBENode, 0).use_empty()) {
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+      N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to zero extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::AND.
+// - Removes redundant zero extensions performed by an ISD::AND.
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned Op0Opcode = Op0->getOpcode();
+
+  // (and (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d)
+  // where $d + 1 == 2^n and n == 32
+  // or    $d + 1 == 2^n and n <= 32 and ZExt
+  // -> (MipsVExtractZExt $a, $b, $c)
+  if (Op0Opcode == MipsISD::VEXTRACT_SEXT_ELT ||
+      Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT) {
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Op1);
+
+    if (!Mask)
+      return SDValue();
+
+    int32_t Log2IfPositive = (Mask->getAPIntValue() + 1).exactLogBase2();
+
+    if (Log2IfPositive <= 0)
+      return SDValue(); // Mask+1 is not a power of 2
+
+    SDValue Op0Op2 = Op0->getOperand(2);
+    EVT ExtendTy = cast<VTSDNode>(Op0Op2)->getVT();
+    unsigned ExtendTySize = ExtendTy.getSizeInBits();
+    unsigned Log2 = Log2IfPositive;
+
+    if ((Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT && Log2 >= ExtendTySize) ||
+        Log2 == ExtendTySize) {
+      SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
+      DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT,
+                      Op0->getVTList(),
+                      makeArrayRef(Ops, Op0->getNumOperands()));
+      return Op0;
+    }
+  }
+
+  return SDValue();
+}
+
+// Determine if the specified node is a constant vector splat.
+//
+// Returns true and sets Imm if:
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+//
+// This function is quite similar to MipsSEDAGToDAGISel::selectVSplat. The
+// differences are that it assumes the MSA has already been checked and the
+// arbitrary requirement for a maximum of 32-bit integers isn't applied (and
+// must not be in order for binsri.d to be selectable).
+static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N.getNode());
+
+  if (!Node)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, !IsLittleEndian))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Test whether the given node is an all-ones build_vector.
+static bool isVectorAllOnes(SDValue N) {
+  // Look through bitcasts. Endianness doesn't matter because we are looking
+  // for an all-ones value.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+  if (!BVN)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  // Endianness doesn't matter in this context because we are looking for
+  // an all-ones value.
+  if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
+    return SplatValue.isAllOnesValue();
+
+  return false;
+}
+
+// Test whether N is the bitwise inverse of OfNode.
+static bool isBitwiseInverse(SDValue N, SDValue OfNode) {
+  if (N->getOpcode() != ISD::XOR)
+    return false;
+
+  if (isVectorAllOnes(N->getOperand(0)))
+    return N->getOperand(1) == OfNode;
+
+  if (isVectorAllOnes(N->getOperand(1)))
+    return N->getOperand(0) == OfNode;
+
+  return false;
+}
+
+// Perform combines where ISD::OR is the root node.
+//
+// Performs the following transformations:
+// - (or (and $a, $mask), (and $b, $inv_mask)) => (vselect $mask, $a, $b)
+//   where $inv_mask is the bitwise inverse of $mask and the 'or' has a 128-bit
+//   vector type.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+
+  if (!Ty.is128BitVector())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (Op0->getOpcode() == ISD::AND && Op1->getOpcode() == ISD::AND) {
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+    SDValue Op1Op0 = Op1->getOperand(0);
+    SDValue Op1Op1 = Op1->getOperand(1);
+    bool IsLittleEndian = !Subtarget.isLittle();
+
+    SDValue IfSet, IfClr, Cond;
+    bool IsConstantMask = false;
+    APInt Mask, InvMask;
+
+    // If Op0Op0 is an appropriate mask, try to find it's inverse in either
+    // Op1Op0, or Op1Op1. Keep track of the Cond, IfSet, and IfClr nodes, while
+    // looking.
+    // IfClr will be set if we find a valid match.
+    if (isVSplat(Op0Op0, Mask, IsLittleEndian)) {
+      Cond = Op0Op0;
+      IfSet = Op0Op1;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, and Op0Op1 is an appropriate mask, try the same
+    // thing again using this mask.
+    // IfClr will be set if we find a valid match.
+    if (!IfClr.getNode() && isVSplat(Op0Op1, Mask, IsLittleEndian)) {
+      Cond = Op0Op1;
+      IfSet = Op0Op0;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, try looking for a non-constant match.
+    // IfClr will be set if we find a valid match amongst the eight
+    // possibilities.
+    if (!IfClr.getNode()) {
+      if (isBitwiseInverse(Op0Op0, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op0Op0, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op0;
+      }
+    }
+
+    // At this point, IfClr will be set if we have a valid match.
+    if (!IfClr.getNode())
+      return SDValue();
+
+    assert(Cond.getNode() && IfSet.getNode());
+
+    // Fold degenerate cases.
+    if (IsConstantMask) {
+      if (Mask.isAllOnesValue())
+        return IfSet;
+      else if (Mask == 0)
+        return IfClr;
+    }
+
+    // Transform the DAG into an equivalent VSELECT.
+    return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfSet, IfClr);
+  }
+
+  return SDValue();
+}
+
+static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 &&
+      selectMSUB(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+static SDValue genConstMult(SDValue X, uint64_t C, SDLoc DL, EVT VT,
+                            EVT ShiftTy, SelectionDAG &DAG) {
+  // Clear the upper (64 - VT.sizeInBits) bits.
+  C &= ((uint64_t)-1) >> (64 - VT.getSizeInBits());
+
+  // Return 0.
+  if (C == 0)
+    return DAG.getConstant(0, VT);
+
+  // Return x.
+  if (C == 1)
+    return X;
+
+  // If c is power of 2, return (shl x, log2(c)).
+  if (isPowerOf2_64(C))
+    return DAG.getNode(ISD::SHL, DL, VT, X,
+                       DAG.getConstant(Log2_64(C), ShiftTy));
+
+  unsigned Log2Ceil = Log2_64_Ceil(C);
+  uint64_t Floor = 1LL << Log2_64(C);
+  uint64_t Ceil = Log2Ceil == 64 ? 0LL : 1LL << Log2Ceil;
+
+  // If |c - floor_c| <= |c - ceil_c|,
+  // where floor_c = pow(2, floor(log2(c))) and ceil_c = pow(2, ceil(log2(c))),
+  // return (add constMult(x, floor_c), constMult(x, c - floor_c)).
+  if (C - Floor <= Ceil - C) {
+    SDValue Op0 = genConstMult(X, Floor, DL, VT, ShiftTy, DAG);
+    SDValue Op1 = genConstMult(X, C - Floor, DL, VT, ShiftTy, DAG);
+    return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+  }
+
+  // If |c - floor_c| > |c - ceil_c|,
+  // return (sub constMult(x, ceil_c), constMult(x, ceil_c - c)).
+  SDValue Op0 = genConstMult(X, Ceil, DL, VT, ShiftTy, DAG);
+  SDValue Op1 = genConstMult(X, Ceil - C, DL, VT, ShiftTy, DAG);
+  return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+}
+
+static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
+                                 const TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSETargetLowering *TL) {
+  EVT VT = N->getValueType(0);
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (!VT.isVector())
+      return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N),
+                          VT, TL->getScalarShiftAmountTy(VT), DAG);
+
+  return SDValue(N, 0);
+}
+
+static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
+                                      SelectionDAG &DAG,
+                                      const MipsSubtarget &Subtarget) {
+  // See if this is a vector splat immediate node.
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  unsigned EltSize = Ty.getVectorElementType().getSizeInBits();
+  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+
+  if (!Subtarget.hasDSP())
+    return SDValue();
+
+  if (!BV ||
+      !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                           EltSize, !Subtarget.isLittle()) ||
+      (SplatBitSize != EltSize) ||
+      (SplatValue.getZExtValue() >= EltSize))
+    return SDValue();
+
+  return DAG.getNode(Opc, SDLoc(N), Ty, N->getOperand(0),
+                     DAG.getConstant(SplatValue.getZExtValue(), MVT::i32));
+}
+
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHLL_DSP, N, Ty, DAG, Subtarget);
+}
+
+// Fold sign-extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT for MSA and fold
+// constant splats into MipsISD::SHRA_DSP for DSPr2.
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to sign extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::SRA and ISD::SHL nodes.
+// - Removes redundant sign extensions performed by an ISD::SRA and ISD::SHL
+//   sequence.
+//
+// See performDSPShiftCombine for more information about the transformation
+// used for DSPr2.
+static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (Subtarget.hasMSA()) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+
+    // (sra (shl (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d), imm:$d)
+    // where $d + sizeof($c) == 32
+    // or    $d + sizeof($c) <= 32 and SExt
+    // -> (MipsVExtractSExt $a, $b, $c)
+    if (Op0->getOpcode() == ISD::SHL && Op1 == Op0->getOperand(1)) {
+      SDValue Op0Op0 = Op0->getOperand(0);
+      ConstantSDNode *ShAmount = dyn_cast<ConstantSDNode>(Op1);
+
+      if (!ShAmount)
+        return SDValue();
+
+      if (Op0Op0->getOpcode() != MipsISD::VEXTRACT_SEXT_ELT &&
+          Op0Op0->getOpcode() != MipsISD::VEXTRACT_ZEXT_ELT)
+        return SDValue();
+
+      EVT ExtendTy = cast<VTSDNode>(Op0Op0->getOperand(2))->getVT();
+      unsigned TotalBits = ShAmount->getZExtValue() + ExtendTy.getSizeInBits();
+
+      if (TotalBits == 32 ||
+          (Op0Op0->getOpcode() == MipsISD::VEXTRACT_SEXT_ELT &&
+           TotalBits <= 32)) {
+        SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
+                          Op0Op0->getOperand(2) };
+        DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT,
+                        Op0Op0->getVTList(),
+                        makeArrayRef(Ops, Op0Op0->getNumOperands()));
+        return Op0Op0;
+      }
+    }
+  }
+
+  if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget.hasDSPR2()))
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHRA_DSP, N, Ty, DAG, Subtarget);
+}
+
+
+static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (((Ty != MVT::v2i16) || !Subtarget.hasDSPR2()) && (Ty != MVT::v4i8))
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHRL_DSP, N, Ty, DAG, Subtarget);
+}
+
+static bool isLegalDSPCondCode(EVT Ty, ISD::CondCode CC) {
+  bool IsV216 = (Ty == MVT::v2i16);
+
+  switch (CC) {
+  case ISD::SETEQ:
+  case ISD::SETNE:  return true;
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETGT:
+  case ISD::SETGE:  return IsV216;
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE: return !IsV216;
+  default:          return false;
+  }
+}
+
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+
+  if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
+    return SDValue();
+
+  if (!isLegalDSPCondCode(Ty, cast<CondCodeSDNode>(N->getOperand(2))->get()))
+    return SDValue();
+
+  return DAG.getNode(MipsISD::SETCC_DSP, SDLoc(N), Ty, N->getOperand(0),
+                     N->getOperand(1), N->getOperand(2));
+}
+
+static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+
+  if (Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
+    // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
+    // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
+    // legalizer.
+    SDValue Op0 = N->getOperand(0);
+
+    if (Op0->getOpcode() != ISD::SETCC)
+      return SDValue();
+
+    ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
+    bool Signed;
+
+    if (CondCode == ISD::SETLT  || CondCode == ISD::SETLE)
+      Signed = true;
+    else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
+      Signed = false;
+    else
+      return SDValue();
+
+    SDValue Op1 = N->getOperand(1);
+    SDValue Op2 = N->getOperand(2);
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+
+    if (Op1 == Op0Op0 && Op2 == Op0Op1)
+      return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
+                         Ty, Op1, Op2);
+    else if (Op1 == Op0Op1 && Op2 == Op0Op0)
+      return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
+                         Ty, Op1, Op2);
+  } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+    SDValue SetCC = N->getOperand(0);
+
+    if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
+      return SDValue();
+
+    return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
+                       SetCC.getOperand(0), SetCC.getOperand(1),
+                       N->getOperand(1), N->getOperand(2), SetCC.getOperand(2));
+  }
+
+  return SDValue();
+}
+
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (Subtarget.hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (xor (or $a, $b), (build_vector allones))
+    //   (xor (or $a, $b), (bitcast (build_vector allones)))
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    SDValue NotOp;
+
+    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+      NotOp = Op1;
+    else if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+      NotOp = Op0;
+    else
+      return SDValue();
+
+    if (NotOp->getOpcode() == ISD::OR)
+      return DAG.getNode(MipsISD::VNOR, SDLoc(N), Ty, NotOp->getOperand(0),
+                         NotOp->getOperand(1));
+  }
+
+  return SDValue();
+}
+
+SDValue
+MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Val;
+
+  switch (N->getOpcode()) {
+  case ISD::ADDE:
+    return performADDECombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    Val = performANDCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::OR:
+    Val = performORCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::SUBE:
+    return performSUBECombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMULCombine(N, DAG, DCI, this);
+  case ISD::SHL:
+    return performSHLCombine(N, DAG, DCI, Subtarget);
+  case ISD::SRA:
+    return performSRACombine(N, DAG, DCI, Subtarget);
+  case ISD::SRL:
+    return performSRLCombine(N, DAG, DCI, Subtarget);
+  case ISD::VSELECT:
+    return performVSELECTCombine(N, DAG);
+  case ISD::XOR:
+    Val = performXORCombine(N, DAG, Subtarget);
+    break;
+  case ISD::SETCC:
+    Val = performSETCCCombine(N, DAG);
+    break;
+  }
+
+  if (Val.getNode()) {
+    DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+          N->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n=> \n";
+          Val.getNode()->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n");
+    return Val;
+  }
+
+  return MipsTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+MachineBasicBlock *
+MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::BPOSGE32_PSEUDO:
+    return emitBPOSGE32(MI, BB);
+  case Mips::SNZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_B);
+  case Mips::SNZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_H);
+  case Mips::SNZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_W);
+  case Mips::SNZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_D);
+  case Mips::SNZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_V);
+  case Mips::SZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_B);
+  case Mips::SZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_H);
+  case Mips::SZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_W);
+  case Mips::SZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
+  case Mips::SZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+  case Mips::COPY_FW_PSEUDO:
+    return emitCOPY_FW(MI, BB);
+  case Mips::COPY_FD_PSEUDO:
+    return emitCOPY_FD(MI, BB);
+  case Mips::INSERT_FW_PSEUDO:
+    return emitINSERT_FW(MI, BB);
+  case Mips::INSERT_FD_PSEUDO:
+    return emitINSERT_FD(MI, BB);
+  case Mips::INSERT_B_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 1, false);
+  case Mips::INSERT_H_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 2, false);
+  case Mips::INSERT_W_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, false);
+  case Mips::INSERT_D_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, false);
+  case Mips::INSERT_FW_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, true);
+  case Mips::INSERT_FD_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, true);
+  case Mips::FILL_FW_PSEUDO:
+    return emitFILL_FW(MI, BB);
+  case Mips::FILL_FD_PSEUDO:
+    return emitFILL_FD(MI, BB);
+  case Mips::FEXP2_W_1_PSEUDO:
+    return emitFEXP2_W_1(MI, BB);
+  case Mips::FEXP2_D_1_PSEUDO:
+    return emitFEXP2_D_1(MI, BB);
+  }
+}
+
+bool MipsSETargetLowering::
+isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                  unsigned NextStackOffset,
+                                  const MipsFunctionInfo& FI) const {
+  if (!EnableMipsTailCalls)
+    return false;
+
+  // Return false if either the callee or caller has a byval argument.
+  if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
+    return false;
+
+  // Return true if the callee's argument area is no larger than the
+  // caller's.
+  return NextStackOffset <= FI.getIncomingArgSize();
+}
+
+void MipsSETargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  Ops.push_back(Callee);
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, CLI, Callee, Chain);
+}
+
+SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode &Nd = *cast<LoadSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerLOAD(Op, DAG);
+
+  // Replace a double precision load with two i32 loads and a buildpair64.
+  SDLoc DL(Op);
+  SDValue Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+
+  // i32 load from lower address.
+  SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
+                           MachinePointerInfo(), Nd.isVolatile(),
+                           Nd.isNonTemporal(), Nd.isInvariant(),
+                           Nd.getAlignment());
+
+  // i32 load from higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+  SDValue Hi = DAG.getLoad(MVT::i32, DL, Lo.getValue(1), Ptr,
+                           MachinePointerInfo(), Nd.isVolatile(),
+                           Nd.isNonTemporal(), Nd.isInvariant(),
+                           std::min(Nd.getAlignment(), 4U));
+
+  if (!Subtarget.isLittle())
+    std::swap(Lo, Hi);
+
+  SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+  SDValue Ops[2] = {BP, Hi.getValue(1)};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode &Nd = *cast<StoreSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerSTORE(Op, DAG);
+
+  // Replace a double precision store with two extractelement64s and i32 stores.
+  SDLoc DL(Op);
+  SDValue Val = Nd.getValue(), Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+  SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(1, MVT::i32));
+
+  if (!Subtarget.isLittle())
+    std::swap(Lo, Hi);
+
+  // i32 store to lower address.
+  Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(),
+                       Nd.isVolatile(), Nd.isNonTemporal(), Nd.getAlignment(),
+                       Nd.getTBAAInfo());
+
+  // i32 store to higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+  return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
+                      Nd.isVolatile(), Nd.isNonTemporal(),
+                      std::min(Nd.getAlignment(), 4U), Nd.getTBAAInfo());
+}
+
+SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
+                                          bool HasLo, bool HasHi,
+                                          SelectionDAG &DAG) const {
+  // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
+  assert(!Subtarget.hasMips32r6());
+
+  EVT Ty = Op.getOperand(0).getValueType();
+  SDLoc DL(Op);
+  SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
+                             Op.getOperand(0), Op.getOperand(1));
+  SDValue Lo, Hi;
+
+  if (HasLo)
+    Lo = DAG.getNode(MipsISD::MFLO, DL, Ty, Mult);
+  if (HasHi)
+    Hi = DAG.getNode(MipsISD::MFHI, DL, Ty, Mult);
+
+  if (!HasLo || !HasHi)
+    return HasLo ? Lo : Hi;
+
+  SDValue Vals[] = { Lo, Hi };
+  return DAG.getMergeValues(Vals, DL);
+}
+
+
+static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
+  SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
+                             DAG.getConstant(0, MVT::i32));
+  SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
+                             DAG.getConstant(1, MVT::i32));
+  return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, InLo, InHi);
+}
+
+static SDValue extractLOHI(SDValue Op, SDLoc DL, SelectionDAG &DAG) {
+  SDValue Lo = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
+  SDValue Hi = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+}
+
+// This function expands mips intrinsic nodes which have 64-bit input operands
+// or output values.
+//
+// out64 = intrinsic-node in64
+// =>
+// lo = copy (extract-element (in64, 0))
+// hi = copy (extract-element (in64, 1))
+// mips-specific-node
+// v0 = copy lo
+// v1 = copy hi
+// out64 = merge-values (v0, v1)
+//
+static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
+  SmallVector<SDValue, 3> Ops;
+  unsigned OpNo = 0;
+
+  // See if Op has a chain input.
+  if (HasChainIn)
+    Ops.push_back(Op->getOperand(OpNo++));
+
+  // The next operand is the intrinsic opcode.
+  assert(Op->getOperand(OpNo).getOpcode() == ISD::TargetConstant);
+
+  // See if the next operand has type i64.
+  SDValue Opnd = Op->getOperand(++OpNo), In64;
+
+  if (Opnd.getValueType() == MVT::i64)
+    In64 = initAccumulator(Opnd, DL, DAG);
+  else
+    Ops.push_back(Opnd);
+
+  // Push the remaining operands.
+  for (++OpNo ; OpNo < Op->getNumOperands(); ++OpNo)
+    Ops.push_back(Op->getOperand(OpNo));
+
+  // Add In64 to the end of the list.
+  if (In64.getNode())
+    Ops.push_back(In64);
+
+  // Scan output.
+  SmallVector<EVT, 2> ResTys;
+
+  for (SDNode::value_iterator I = Op->value_begin(), E = Op->value_end();
+       I != E; ++I)
+    ResTys.push_back((*I == MVT::i64) ? MVT::Untyped : *I);
+
+  // Create node.
+  SDValue Val = DAG.getNode(Opc, DL, ResTys, Ops);
+  SDValue Out = (ResTys[0] == MVT::Untyped) ? extractLOHI(Val, DL, DAG) : Val;
+
+  if (!HasChainIn)
+    return Out;
+
+  assert(Val->getValueType(1) == MVT::Other);
+  SDValue Vals[] = { Out, SDValue(Val.getNode(), 1) };
+  return DAG.getMergeValues(Vals, DL);
+}
+
+// Lower an MSA copy intrinsic into the specified SelectionDAG node
+static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  SDValue Vec = Op->getOperand(1);
+  SDValue Idx = Op->getOperand(2);
+  EVT ResTy = Op->getValueType(0);
+  EVT EltTy = Vec->getValueType(0).getVectorElementType();
+
+  SDValue Result = DAG.getNode(Opc, DL, ResTy, Vec, Idx,
+                               DAG.getValueType(EltTy));
+
+  return Result;
+}
+
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+  EVT ResVecTy = Op->getValueType(0);
+  EVT ViaVecTy = ResVecTy;
+  SDLoc DL(Op);
+
+  // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+  // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
+  // lanes.
+  SDValue LaneA;
+  SDValue LaneB = Op->getOperand(2);
+
+  if (ResVecTy == MVT::v2i64) {
+    LaneA = DAG.getConstant(0, MVT::i32);
+    ViaVecTy = MVT::v4i32;
+  } else
+    LaneA = LaneB;
+
+  SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+                      LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy,
+                       makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+
+  if (ViaVecTy != ResVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+  return DAG.getConstant(Op->getConstantOperandVal(ImmOp), Op->getValueType(0));
+}
+
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+                                   bool BigEndian, SelectionDAG &DAG) {
+  EVT ViaVecTy = VecTy;
+  SDValue SplatValueA = SplatValue;
+  SDValue SplatValueB = SplatValue;
+  SDLoc DL(SplatValue);
+
+  if (VecTy == MVT::v2i64) {
+    // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+    ViaVecTy = MVT::v4i32;
+
+    SplatValueA = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+    SplatValueB = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+                              DAG.getConstant(32, MVT::i32));
+    SplatValueB = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValueB);
+  }
+
+  // We currently hold the parts in little endian order. Swap them if
+  // necessary.
+  if (BigEndian)
+    std::swap(SplatValueA, SplatValueB);
+
+  SDValue Ops[16] = { SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy,
+                       makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+
+  if (VecTy != ViaVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
+                                        unsigned Opc, SDValue Imm,
+                                        bool BigEndian) {
+  EVT VecTy = Op->getValueType(0);
+  SDValue Exp2Imm;
+  SDLoc DL(Op);
+
+  // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+  // here for now.
+  if (VecTy == MVT::v2i64) {
+    if (ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(Imm)) {
+      APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
+
+      SDValue BitImmHiOp = DAG.getConstant(BitImm.lshr(32).trunc(32), MVT::i32);
+      SDValue BitImmLoOp = DAG.getConstant(BitImm.trunc(32), MVT::i32);
+
+      if (BigEndian)
+        std::swap(BitImmLoOp, BitImmHiOp);
+
+      Exp2Imm =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, BitImmLoOp,
+                                  BitImmHiOp, BitImmLoOp, BitImmHiOp));
+    }
+  }
+
+  if (!Exp2Imm.getNode()) {
+    // We couldnt constant fold, do a vector shift instead
+
+    // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+    // only values 0-63 are valid.
+    if (VecTy == MVT::v2i64)
+      Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+    Exp2Imm = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+
+    Exp2Imm =
+        DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, VecTy), Exp2Imm);
+  }
+
+  return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
+}
+
+static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  SDValue One = DAG.getConstant(1, ResTy);
+  SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1),
+                     DAG.getNOT(DL, Bit, ResTy));
+}
+
+static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  APInt BitImm = APInt(ResTy.getVectorElementType().getSizeInBits(), 1)
+                 << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+  SDValue BitMask = DAG.getConstant(~BitImm, ResTy);
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_shilo:
+    return lowerDSPIntr(Op, DAG, MipsISD::SHILO);
+  case Intrinsic::mips_dpau_h_qbl:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL);
+  case Intrinsic::mips_dpau_h_qbr:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR);
+  case Intrinsic::mips_dpsu_h_qbl:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL);
+  case Intrinsic::mips_dpsu_h_qbr:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR);
+  case Intrinsic::mips_dpa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH);
+  case Intrinsic::mips_dps_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH);
+  case Intrinsic::mips_dpax_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH);
+  case Intrinsic::mips_dpsx_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH);
+  case Intrinsic::mips_mulsa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH);
+  case Intrinsic::mips_mult:
+    return lowerDSPIntr(Op, DAG, MipsISD::Mult);
+  case Intrinsic::mips_multu:
+    return lowerDSPIntr(Op, DAG, MipsISD::Multu);
+  case Intrinsic::mips_madd:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAdd);
+  case Intrinsic::mips_maddu:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAddu);
+  case Intrinsic::mips_msub:
+    return lowerDSPIntr(Op, DAG, MipsISD::MSub);
+  case Intrinsic::mips_msubu:
+    return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+  case Intrinsic::mips_addv_b:
+  case Intrinsic::mips_addv_h:
+  case Intrinsic::mips_addv_w:
+  case Intrinsic::mips_addv_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_addvi_b:
+  case Intrinsic::mips_addvi_h:
+  case Intrinsic::mips_addvi_w:
+  case Intrinsic::mips_addvi_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_and_v:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_andi_b:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_bclr_b:
+  case Intrinsic::mips_bclr_h:
+  case Intrinsic::mips_bclr_w:
+  case Intrinsic::mips_bclr_d:
+    return lowerMSABitClear(Op, DAG);
+  case Intrinsic::mips_bclri_b:
+  case Intrinsic::mips_bclri_h:
+  case Intrinsic::mips_bclri_w:
+  case Intrinsic::mips_bclri_d:
+    return lowerMSABitClearImm(Op, DAG);
+  case Intrinsic::mips_binsli_b:
+  case Intrinsic::mips_binsli_h:
+  case Intrinsic::mips_binsli_w:
+  case Intrinsic::mips_binsli_d: {
+    // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear)
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
+                                       Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, VecTy, true), Op->getOperand(2),
+                       Op->getOperand(1));
+  }
+  case Intrinsic::mips_binsri_b:
+  case Intrinsic::mips_binsri_h:
+  case Intrinsic::mips_binsri_w:
+  case Intrinsic::mips_binsri_d: {
+    // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear)
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
+                                      Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, VecTy, true), Op->getOperand(2),
+                       Op->getOperand(1));
+  }
+  case Intrinsic::mips_bmnz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_bmnzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(2),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bmz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_bmzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bneg_b:
+  case Intrinsic::mips_bneg_h:
+  case Intrinsic::mips_bneg_w:
+  case Intrinsic::mips_bneg_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, VecTy);
+
+    return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bnegi_b:
+  case Intrinsic::mips_bnegi_h:
+  case Intrinsic::mips_bnegi_w:
+  case Intrinsic::mips_bnegi_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
+                                    !Subtarget.isLittle());
+  case Intrinsic::mips_bnz_b:
+  case Intrinsic::mips_bnz_h:
+  case Intrinsic::mips_bnz_w:
+  case Intrinsic::mips_bnz_d:
+    return DAG.getNode(MipsISD::VALL_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bnz_v:
+    return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bsel_v:
+    // bsel_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(3),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bseli_b:
+    // bseli_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 3, DAG),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bset_b:
+  case Intrinsic::mips_bset_h:
+  case Intrinsic::mips_bset_w:
+  case Intrinsic::mips_bset_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, VecTy);
+
+    return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bseti_b:
+  case Intrinsic::mips_bseti_h:
+  case Intrinsic::mips_bseti_w:
+  case Intrinsic::mips_bseti_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
+                                    !Subtarget.isLittle());
+  case Intrinsic::mips_bz_b:
+  case Intrinsic::mips_bz_h:
+  case Intrinsic::mips_bz_w:
+  case Intrinsic::mips_bz_d:
+    return DAG.getNode(MipsISD::VALL_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bz_v:
+    return DAG.getNode(MipsISD::VANY_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ceq_b:
+  case Intrinsic::mips_ceq_h:
+  case Intrinsic::mips_ceq_w:
+  case Intrinsic::mips_ceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETEQ);
+  case Intrinsic::mips_ceqi_b:
+  case Intrinsic::mips_ceqi_h:
+  case Intrinsic::mips_ceqi_w:
+  case Intrinsic::mips_ceqi_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+  case Intrinsic::mips_cle_s_b:
+  case Intrinsic::mips_cle_s_h:
+  case Intrinsic::mips_cle_s_w:
+  case Intrinsic::mips_cle_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLE);
+  case Intrinsic::mips_clei_s_b:
+  case Intrinsic::mips_clei_s_h:
+  case Intrinsic::mips_clei_s_w:
+  case Intrinsic::mips_clei_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+  case Intrinsic::mips_cle_u_b:
+  case Intrinsic::mips_cle_u_h:
+  case Intrinsic::mips_cle_u_w:
+  case Intrinsic::mips_cle_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_clei_u_b:
+  case Intrinsic::mips_clei_u_h:
+  case Intrinsic::mips_clei_u_w:
+  case Intrinsic::mips_clei_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULE);
+  case Intrinsic::mips_clt_s_b:
+  case Intrinsic::mips_clt_s_h:
+  case Intrinsic::mips_clt_s_w:
+  case Intrinsic::mips_clt_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLT);
+  case Intrinsic::mips_clti_s_b:
+  case Intrinsic::mips_clti_s_h:
+  case Intrinsic::mips_clti_s_w:
+  case Intrinsic::mips_clti_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+  case Intrinsic::mips_clt_u_b:
+  case Intrinsic::mips_clt_u_h:
+  case Intrinsic::mips_clt_u_w:
+  case Intrinsic::mips_clt_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_clti_u_b:
+  case Intrinsic::mips_clti_u_h:
+  case Intrinsic::mips_clti_u_w:
+  case Intrinsic::mips_clti_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULT);
+  case Intrinsic::mips_copy_s_b:
+  case Intrinsic::mips_copy_s_h:
+  case Intrinsic::mips_copy_s_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+  case Intrinsic::mips_copy_s_d:
+    if (Subtarget.hasMips64())
+      // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64.
+      return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+    else {
+      // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+      // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                         Op->getValueType(0), Op->getOperand(1),
+                         Op->getOperand(2));
+    }
+  case Intrinsic::mips_copy_u_b:
+  case Intrinsic::mips_copy_u_h:
+  case Intrinsic::mips_copy_u_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+  case Intrinsic::mips_copy_u_d:
+    if (Subtarget.hasMips64())
+      // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64.
+      return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+    else {
+      // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+      // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+      // Note: When i64 is illegal, this results in copy_s.w instructions
+      // instead of copy_u.w instructions. This makes no difference to the
+      // behaviour since i64 is only illegal when the register file is 32-bit.
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                         Op->getValueType(0), Op->getOperand(1),
+                         Op->getOperand(2));
+    }
+  case Intrinsic::mips_div_s_b:
+  case Intrinsic::mips_div_s_h:
+  case Intrinsic::mips_div_s_w:
+  case Intrinsic::mips_div_s_d:
+    return DAG.getNode(ISD::SDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_div_u_b:
+  case Intrinsic::mips_div_u_h:
+  case Intrinsic::mips_div_u_w:
+  case Intrinsic::mips_div_u_d:
+    return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fadd_w:
+  case Intrinsic::mips_fadd_d:
+    return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
+  case Intrinsic::mips_fceq_w:
+  case Intrinsic::mips_fceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOEQ);
+  case Intrinsic::mips_fcle_w:
+  case Intrinsic::mips_fcle_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLE);
+  case Intrinsic::mips_fclt_w:
+  case Intrinsic::mips_fclt_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLT);
+  case Intrinsic::mips_fcne_w:
+  case Intrinsic::mips_fcne_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETONE);
+  case Intrinsic::mips_fcor_w:
+  case Intrinsic::mips_fcor_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETO);
+  case Intrinsic::mips_fcueq_w:
+  case Intrinsic::mips_fcueq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUEQ);
+  case Intrinsic::mips_fcule_w:
+  case Intrinsic::mips_fcule_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_fcult_w:
+  case Intrinsic::mips_fcult_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_fcun_w:
+  case Intrinsic::mips_fcun_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUO);
+  case Intrinsic::mips_fcune_w:
+  case Intrinsic::mips_fcune_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUNE);
+  case Intrinsic::mips_fdiv_w:
+  case Intrinsic::mips_fdiv_d:
+    return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ffint_u_w:
+  case Intrinsic::mips_ffint_u_d:
+    return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ffint_s_w:
+  case Intrinsic::mips_ffint_s_d:
+    return DAG.getNode(ISD::SINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_fill_b:
+  case Intrinsic::mips_fill_h:
+  case Intrinsic::mips_fill_w:
+  case Intrinsic::mips_fill_d: {
+    SmallVector<SDValue, 16> Ops;
+    EVT ResTy = Op->getValueType(0);
+
+    for (unsigned i = 0; i < ResTy.getVectorNumElements(); ++i)
+      Ops.push_back(Op->getOperand(1));
+
+    // If ResTy is v2i64 then the type legalizer will break this node down into
+    // an equivalent v4i32.
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, Ops);
+  }
+  case Intrinsic::mips_fexp2_w:
+  case Intrinsic::mips_fexp2_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(
+        ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
+        DAG.getNode(ISD::FEXP2, SDLoc(Op), ResTy, Op->getOperand(2)));
+  }
+  case Intrinsic::mips_flog2_w:
+  case Intrinsic::mips_flog2_d:
+    return DAG.getNode(ISD::FLOG2, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fmadd_w:
+  case Intrinsic::mips_fmadd_d:
+    return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_fmul_w:
+  case Intrinsic::mips_fmul_d:
+    return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fmsub_w:
+  case Intrinsic::mips_fmsub_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_frint_w:
+  case Intrinsic::mips_frint_d:
+    return DAG.getNode(ISD::FRINT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsqrt_w:
+  case Intrinsic::mips_fsqrt_d:
+    return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsub_w:
+  case Intrinsic::mips_fsub_d:
+    return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ftrunc_u_w:
+  case Intrinsic::mips_ftrunc_u_d:
+    return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ftrunc_s_w:
+  case Intrinsic::mips_ftrunc_s_d:
+    return DAG.getNode(ISD::FP_TO_SINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ilvev_b:
+  case Intrinsic::mips_ilvev_h:
+  case Intrinsic::mips_ilvev_w:
+  case Intrinsic::mips_ilvev_d:
+    return DAG.getNode(MipsISD::ILVEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvl_b:
+  case Intrinsic::mips_ilvl_h:
+  case Intrinsic::mips_ilvl_w:
+  case Intrinsic::mips_ilvl_d:
+    return DAG.getNode(MipsISD::ILVL, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvod_b:
+  case Intrinsic::mips_ilvod_h:
+  case Intrinsic::mips_ilvod_w:
+  case Intrinsic::mips_ilvod_d:
+    return DAG.getNode(MipsISD::ILVOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvr_b:
+  case Intrinsic::mips_ilvr_h:
+  case Intrinsic::mips_ilvr_w:
+  case Intrinsic::mips_ilvr_d:
+    return DAG.getNode(MipsISD::ILVR, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_insert_b:
+  case Intrinsic::mips_insert_h:
+  case Intrinsic::mips_insert_w:
+  case Intrinsic::mips_insert_d:
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+  case Intrinsic::mips_insve_b:
+  case Intrinsic::mips_insve_h:
+  case Intrinsic::mips_insve_w:
+  case Intrinsic::mips_insve_d:
+    return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3),
+                       DAG.getConstant(0, MVT::i32));
+  case Intrinsic::mips_ldi_b:
+  case Intrinsic::mips_ldi_h:
+  case Intrinsic::mips_ldi_w:
+  case Intrinsic::mips_ldi_d:
+    return lowerMSASplatImm(Op, 1, DAG);
+  case Intrinsic::mips_lsa:
+  case Intrinsic::mips_dlsa: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_maddv_b:
+  case Intrinsic::mips_maddv_h:
+  case Intrinsic::mips_maddv_w:
+  case Intrinsic::mips_maddv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_max_s_b:
+  case Intrinsic::mips_max_s_h:
+  case Intrinsic::mips_max_s_w:
+  case Intrinsic::mips_max_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_max_u_b:
+  case Intrinsic::mips_max_u_h:
+  case Intrinsic::mips_max_u_w:
+  case Intrinsic::mips_max_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_maxi_s_b:
+  case Intrinsic::mips_maxi_s_h:
+  case Intrinsic::mips_maxi_s_w:
+  case Intrinsic::mips_maxi_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_maxi_u_b:
+  case Intrinsic::mips_maxi_u_h:
+  case Intrinsic::mips_maxi_u_w:
+  case Intrinsic::mips_maxi_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_min_s_b:
+  case Intrinsic::mips_min_s_h:
+  case Intrinsic::mips_min_s_w:
+  case Intrinsic::mips_min_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_min_u_b:
+  case Intrinsic::mips_min_u_h:
+  case Intrinsic::mips_min_u_w:
+  case Intrinsic::mips_min_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_mini_s_b:
+  case Intrinsic::mips_mini_s_h:
+  case Intrinsic::mips_mini_s_w:
+  case Intrinsic::mips_mini_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mini_u_b:
+  case Intrinsic::mips_mini_u_h:
+  case Intrinsic::mips_mini_u_w:
+  case Intrinsic::mips_mini_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mod_s_b:
+  case Intrinsic::mips_mod_s_h:
+  case Intrinsic::mips_mod_s_w:
+  case Intrinsic::mips_mod_s_d:
+    return DAG.getNode(ISD::SREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mod_u_b:
+  case Intrinsic::mips_mod_u_h:
+  case Intrinsic::mips_mod_u_w:
+  case Intrinsic::mips_mod_u_d:
+    return DAG.getNode(ISD::UREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mulv_b:
+  case Intrinsic::mips_mulv_h:
+  case Intrinsic::mips_mulv_w:
+  case Intrinsic::mips_mulv_d:
+    return DAG.getNode(ISD::MUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_msubv_b:
+  case Intrinsic::mips_msubv_h:
+  case Intrinsic::mips_msubv_w:
+  case Intrinsic::mips_msubv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::SUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_nlzc_b:
+  case Intrinsic::mips_nlzc_h:
+  case Intrinsic::mips_nlzc_w:
+  case Intrinsic::mips_nlzc_d:
+    return DAG.getNode(ISD::CTLZ, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_nor_v: {
+    SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                              Op->getOperand(1), Op->getOperand(2));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_nori_b: {
+    SDValue Res =  DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                               Op->getOperand(1),
+                               lowerMSASplatImm(Op, 2, DAG));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_or_v:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ori_b:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_pckev_b:
+  case Intrinsic::mips_pckev_h:
+  case Intrinsic::mips_pckev_w:
+  case Intrinsic::mips_pckev_d:
+    return DAG.getNode(MipsISD::PCKEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pckod_b:
+  case Intrinsic::mips_pckod_h:
+  case Intrinsic::mips_pckod_w:
+  case Intrinsic::mips_pckod_d:
+    return DAG.getNode(MipsISD::PCKOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pcnt_b:
+  case Intrinsic::mips_pcnt_h:
+  case Intrinsic::mips_pcnt_w:
+  case Intrinsic::mips_pcnt_d:
+    return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_shf_b:
+  case Intrinsic::mips_shf_h:
+  case Intrinsic::mips_shf_w:
+    return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_sll_b:
+  case Intrinsic::mips_sll_h:
+  case Intrinsic::mips_sll_w:
+  case Intrinsic::mips_sll_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_slli_b:
+  case Intrinsic::mips_slli_h:
+  case Intrinsic::mips_slli_w:
+  case Intrinsic::mips_slli_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_splat_b:
+  case Intrinsic::mips_splat_h:
+  case Intrinsic::mips_splat_w:
+  case Intrinsic::mips_splat_d:
+    // We can't lower via VECTOR_SHUFFLE because it requires constant shuffle
+    // masks, nor can we lower via BUILD_VECTOR & EXTRACT_VECTOR_ELT because
+    // EXTRACT_VECTOR_ELT can't extract i64's on MIPS32.
+    // Instead we lower to MipsISD::VSHF and match from there.
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_splati_b:
+  case Intrinsic::mips_splati_h:
+  case Intrinsic::mips_splati_w:
+  case Intrinsic::mips_splati_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_sra_b:
+  case Intrinsic::mips_sra_h:
+  case Intrinsic::mips_sra_w:
+  case Intrinsic::mips_sra_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srai_b:
+  case Intrinsic::mips_srai_h:
+  case Intrinsic::mips_srai_w:
+  case Intrinsic::mips_srai_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_srl_b:
+  case Intrinsic::mips_srl_h:
+  case Intrinsic::mips_srl_w:
+  case Intrinsic::mips_srl_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srli_b:
+  case Intrinsic::mips_srli_h:
+  case Intrinsic::mips_srli_w:
+  case Intrinsic::mips_srli_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_subv_b:
+  case Intrinsic::mips_subv_h:
+  case Intrinsic::mips_subv_w:
+  case Intrinsic::mips_subv_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_subvi_b:
+  case Intrinsic::mips_subvi_h:
+  case Intrinsic::mips_subvi_w:
+  case Intrinsic::mips_subvi_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_vshf_b:
+  case Intrinsic::mips_vshf_h:
+  case Intrinsic::mips_vshf_w:
+  case Intrinsic::mips_vshf_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_xor_v:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_xori_b:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  }
+}
+
+static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Address = Op->getOperand(2);
+  SDValue Offset  = Op->getOperand(3);
+  EVT ResTy = Op->getValueType(0);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), false,
+                     false, false, 16);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_extp:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTP);
+  case Intrinsic::mips_extpdp:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTPDP);
+  case Intrinsic::mips_extr_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_W);
+  case Intrinsic::mips_extr_r_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_R_W);
+  case Intrinsic::mips_extr_rs_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_RS_W);
+  case Intrinsic::mips_extr_s_h:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_S_H);
+  case Intrinsic::mips_mthlip:
+    return lowerDSPIntr(Op, DAG, MipsISD::MTHLIP);
+  case Intrinsic::mips_mulsaq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSAQ_S_W_PH);
+  case Intrinsic::mips_maq_s_w_phl:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHL);
+  case Intrinsic::mips_maq_s_w_phr:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHR);
+  case Intrinsic::mips_maq_sa_w_phl:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHL);
+  case Intrinsic::mips_maq_sa_w_phr:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHR);
+  case Intrinsic::mips_dpaq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_S_W_PH);
+  case Intrinsic::mips_dpsq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_S_W_PH);
+  case Intrinsic::mips_dpaq_sa_l_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_SA_L_W);
+  case Intrinsic::mips_dpsq_sa_l_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_SA_L_W);
+  case Intrinsic::mips_dpaqx_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_S_W_PH);
+  case Intrinsic::mips_dpaqx_sa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_SA_W_PH);
+  case Intrinsic::mips_dpsqx_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH);
+  case Intrinsic::mips_dpsqx_sa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH);
+  case Intrinsic::mips_ld_b:
+  case Intrinsic::mips_ld_h:
+  case Intrinsic::mips_ld_w:
+  case Intrinsic::mips_ld_d:
+   return lowerMSALoadIntr(Op, DAG, Intr);
+  }
+}
+
+static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Value   = Op->getOperand(2);
+  SDValue Address = Op->getOperand(3);
+  SDValue Offset  = Op->getOperand(4);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), false,
+                      false, 16);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_st_b:
+  case Intrinsic::mips_st_h:
+  case Intrinsic::mips_st_w:
+  case Intrinsic::mips_st_d:
+    return lowerMSAStoreIntr(Op, DAG, Intr);
+  }
+}
+
+/// \brief Check if the given BuildVectorSDNode is a splat.
+/// This method currently relies on DAG nodes being reused when equivalent,
+/// so it's possible for this to return false even when isConstantSplat returns
+/// true.
+static bool isSplatVector(const BuildVectorSDNode *N) {
+  unsigned int nOps = N->getNumOperands();
+  assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector");
+
+  SDValue Operand0 = N->getOperand(0);
+
+  for (unsigned int i = 1; i < nOps; ++i) {
+    if (N->getOperand(i) != Operand0)
+      return false;
+  }
+
+  return true;
+}
+
+// Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
+//
+// The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We
+// choose to sign-extend but we could have equally chosen zero-extend. The
+// DAGCombiner will fold any sign/zero extension of the ISD::EXTRACT_VECTOR_ELT
+// result into this node later (possibly changing it to a zero-extend in the
+// process).
+SDValue MipsSETargetLowering::
+lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDValue Op0 = Op->getOperand(0);
+  EVT VecTy = Op0->getValueType(0);
+
+  if (!VecTy.is128BitVector())
+    return SDValue();
+
+  if (ResTy.isInteger()) {
+    SDValue Op1 = Op->getOperand(1);
+    EVT EltTy = VecTy.getVectorElementType();
+    return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
+                       DAG.getValueType(EltTy));
+  }
+
+  return Op;
+}
+
+static bool isConstantOrUndef(const SDValue Op) {
+  if (Op->getOpcode() == ISD::UNDEF)
+    return true;
+  if (dyn_cast<ConstantSDNode>(Op))
+    return true;
+  if (dyn_cast<ConstantFPSDNode>(Op))
+    return true;
+  return false;
+}
+
+static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+  for (unsigned i = 0; i < Op->getNumOperands(); ++i)
+    if (isConstantOrUndef(Op->getOperand(i)))
+      return true;
+  return false;
+}
+
+// Lowers ISD::BUILD_VECTOR into appropriate SelectionDAG nodes for the
+// backend.
+//
+// Lowers according to the following rules:
+// - Constant splats are legal as-is as long as the SplatBitSize is a power of
+//   2 less than or equal to 64 and the value fits into a signed 10-bit
+//   immediate
+// - Constant splats are lowered to bitconverted BUILD_VECTORs if SplatBitSize
+//   is a power of 2 less than or equal to 64 and the value does not fit into a
+//   signed 10-bit immediate
+// - Non-constant splats are legal as-is.
+// - Non-constant non-splats are lowered to sequences of INSERT_VECTOR_ELT.
+// - All others are illegal and must be expanded.
+SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Subtarget.hasMSA() || !ResTy.is128BitVector())
+    return SDValue();
+
+  if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                            HasAnyUndefs, 8,
+                            !Subtarget.isLittle()) && SplatBitSize <= 64) {
+    // We can only cope with 8, 16, 32, or 64-bit elements
+    if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 &&
+        SplatBitSize != 64)
+      return SDValue();
+
+    // If the value fits into a simm10 then we can use ldi.[bhwd]
+    // However, if it isn't an integer type we will have to bitcast from an
+    // integer type first. Also, if there are any undefs, we must lower them
+    // to defined values first.
+    if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+      return Op;
+
+    EVT ViaVecTy;
+
+    switch (SplatBitSize) {
+    default:
+      return SDValue();
+    case 8:
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      // There's no fill.d to fall back on for 64-bit values
+      return SDValue();
+    }
+
+    // SelectionDAG::getConstant will promote SplatValue appropriately.
+    SDValue Result = DAG.getConstant(SplatValue, ViaVecTy);
+
+    // Bitcast to the type we originally wanted
+    if (ViaVecTy != ResTy)
+      Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
+
+    return Result;
+  } else if (isSplatVector(Node))
+    return Op;
+  else if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+    // Use INSERT_VECTOR_ELT operations rather than expand to stores.
+    // The resulting code is the same length as the expansion, but it doesn't
+    // use memory operations
+    EVT ResTy = Node->getValueType(0);
+
+    assert(ResTy.isVector());
+
+    unsigned NumElts = ResTy.getVectorNumElements();
+    SDValue Vector = DAG.getUNDEF(ResTy);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
+                           Node->getOperand(i),
+                           DAG.getConstant(i, MVT::i32));
+    }
+    return Vector;
+  }
+
+  return SDValue();
+}
+
+// Lower VECTOR_SHUFFLE into SHF (if possible).
+//
+// SHF splits the vector into blocks of four elements, then shuffles these
+// elements according to a <4 x i2> constant (encoded as an integer immediate).
+//
+// It is therefore possible to lower into SHF when the mask takes the form:
+//   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+// When undef's appear they are treated as if they were whatever value is
+// necessary in order to fit the above form.
+//
+// For example:
+//   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+//                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+//                                 i32 7, i32 6, i32 5, i32 4>
+// is lowered to:
+//   (SHF_H $w0, $w1, 27)
+// where the 27 comes from:
+//   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+                                       SmallVector<int, 16> Indices,
+                                       SelectionDAG &DAG) {
+  int SHFIndices[4] = { -1, -1, -1, -1 };
+
+  if (Indices.size() < 4)
+    return SDValue();
+
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Indices.size(); j += 4) {
+      int Idx = Indices[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SHFIndices[i] == -1)
+        SHFIndices[i] = Idx;
+
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      if (!(Idx == -1 || Idx == SHFIndices[i]))
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(32, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SHFIndices[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  return DAG.getNode(MipsISD::SHF, SDLoc(Op), ResTy,
+                     DAG.getConstant(Imm, MVT::i32), Op->getOperand(0));
+}
+
+// Lower VECTOR_SHUFFLE into ILVEV (if possible).
+//
+// ILVEV interleaves the even elements from each vector.
+//
+// It is possible to lower into ILVEV when the mask takes the form:
+//   <0, n, 2, n+2, 4, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 0;
+  int WtIdx = ResTy.getVectorNumElements();
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx += 2;
+    WtIdx += 2;
+  }
+
+  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVOD (if possible).
+//
+// ILVOD interleaves the odd elements from each vector.
+//
+// It is possible to lower into ILVOD when the mask takes the form:
+//   <1, n+1, 3, n+3, 5, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 1;
+  int WtIdx = ResTy.getVectorNumElements() + 1;
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx += 2;
+    WtIdx += 2;
+  }
+
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
+//
+// ILVL interleaves consecutive elements from the left half of each vector.
+//
+// It is possible to lower into ILVL when the mask takes the form:
+//   <0, n, 1, n+1, 2, n+2, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 0;
+  int WtIdx = ResTy.getVectorNumElements();
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx ++;
+    WtIdx ++;
+  }
+
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
+//
+// ILVR interleaves consecutive elements from the right half of each vector.
+//
+// It is possible to lower into ILVR when the mask takes the form:
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+// where n is the number of elements in the vector and x is half n.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  unsigned NumElts = ResTy.getVectorNumElements();
+  int WsIdx = NumElts / 2;
+  int WtIdx = NumElts + NumElts / 2;
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx ++;
+    WtIdx ++;
+  }
+
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKEV (if possible).
+//
+// PCKEV copies the even elements of each vector into the result vector.
+//
+// It is possible to lower into PCKEV when the mask takes the form:
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int Idx = 0;
+
+  for (unsigned i = 0; i < Indices.size(); ++i) {
+    if (Indices[i] != -1 && Indices[i] != Idx)
+      return SDValue();
+    Idx += 2;
+  }
+
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKOD (if possible).
+//
+// PCKOD copies the odd elements of each vector into the result vector.
+//
+// It is possible to lower into PCKOD when the mask takes the form:
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int Idx = 1;
+
+  for (unsigned i = 0; i < Indices.size(); ++i) {
+    if (Indices[i] != -1 && Indices[i] != Idx)
+      return SDValue();
+    Idx += 2;
+  }
+
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into VSHF.
+//
+// This mostly consists of converting the shuffle indices in Indices into a
+// BUILD_VECTOR and adding it as an operand to the resulting VSHF. There is
+// also code to eliminate unused operands of the VECTOR_SHUFFLE. For example,
+// if the type is v8i16 and all the indices are less than 8 then the second
+// operand is unused and can be replaced with anything. We choose to replace it
+// with the used operand since this reduces the number of instructions overall.
+static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  SmallVector<SDValue, 16> Ops;
+  SDValue Op0;
+  SDValue Op1;
+  EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger();
+  EVT MaskEltTy = MaskVecTy.getVectorElementType();
+  bool Using1stVec = false;
+  bool Using2ndVec = false;
+  SDLoc DL(Op);
+  int ResTyNumElts = ResTy.getVectorNumElements();
+
+  for (int i = 0; i < ResTyNumElts; ++i) {
+    // Idx == -1 means UNDEF
+    int Idx = Indices[i];
+
+    if (0 <= Idx && Idx < ResTyNumElts)
+      Using1stVec = true;
+    if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2)
+      Using2ndVec = true;
+  }
+
+  for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end();
+       ++I)
+    Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy));
+
+  SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, Ops);
+
+  if (Using1stVec && Using2ndVec) {
+    Op0 = Op->getOperand(0);
+    Op1 = Op->getOperand(1);
+  } else if (Using1stVec)
+    Op0 = Op1 = Op->getOperand(0);
+  else if (Using2ndVec)
+    Op0 = Op1 = Op->getOperand(1);
+  else
+    llvm_unreachable("shuffle vector mask references neither vector operand?");
+
+  // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion.
+  // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+  // VSHF concatenates the vectors in a bitwise fashion:
+  // <0b00, 0b01> + <0b10, 0b11> ->
+  // 0b0100       + 0b1110       -> 0b01001110
+  //                                <0b10, 0b11, 0b00, 0b01>
+  // We must therefore swap the operands to get the correct result.
+  return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op1, Op0);
+}
+
+// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the
+// indices in the shuffle.
+SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *Node = cast<ShuffleVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+
+  if (!ResTy.is128BitVector())
+    return SDValue();
+
+  int ResTyNumElts = ResTy.getVectorNumElements();
+  SmallVector<int, 16> Indices;
+
+  for (int i = 0; i < ResTyNumElts; ++i)
+    Indices.push_back(Node->getMaskElt(i));
+
+  SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+}
+
+MachineBasicBlock * MipsSETargetLowering::
+emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
+  // $bb:
+  //  bposge32_pseudo $vr0
+  //  =>
+  // $bb:
+  //  bposge32 $tbb
+  // $fbb:
+  //  li $vr2, 0
+  //  b $sink
+  // $tbb:
+  //  li $vr1, 1
+  // $sink:
+  //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bposge32 instruction to $BB.
+  BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned VR2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned VR1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(VR2).addMBB(FBB).addReg(VR1).addMBB(TBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return Sink;
+}
+
+MachineBasicBlock * MipsSETargetLowering::
+emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
+                     unsigned BranchOp) const{
+  // $bb:
+  //  vany_nonzero $rd, $ws
+  //  =>
+  // $bb:
+  //  bnz.b $ws, $tbb
+  //  b $fbb
+  // $fbb:
+  //  li $rd1, 0
+  //  b $sink
+  // $tbb:
+  //  li $rd2, 1
+  // $sink:
+  //  $rd = phi($rd1, $fbb, $rd2, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bnz.b instruction to $BB.
+  BuildMI(BB, DL, TII->get(BranchOp))
+    .addReg(MI->getOperand(1).getReg())
+    .addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned RD1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned RD2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(RD1).addMBB(FBB).addReg(RD2).addMBB(TBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return Sink;
+}
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// copy_u_w $rt, $ws, $n
+// mtc1     $rt, $fd
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Fd = MI->getOperand(0).getReg();
+  unsigned Ws = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws, $n
+// copy $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode which is the only supported mode in MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
+  assert(Subtarget.isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  unsigned Fd  = MI->getOperand(0).getReg();
+  unsigned Ws  = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm() * 2;
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FW pseudo instruction.
+//
+// insert_fw_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_lo, $fs
+// insve_w $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Wd_in = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  unsigned Fs = MI->getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt)
+      .addImm(0);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FD pseudo instruction.
+//
+// insert_fd_pseudo $wd, $fs, n
+// =>
+// subreg_to_reg $wt:sub_64, $fs
+// insve_d $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  assert(Subtarget.isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Wd_in = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  unsigned Fs = MI->getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt)
+      .addImm(0);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction.
+//
+// For integer:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $rs)
+// =>
+// (SLL $lanetmp1, $lane, <log2size)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSERT_[BHWD], $wdtmp2, $wdtmp1, 0, $rs)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2,  $lanetmp2)
+//
+// For floating point:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $fs)
+// =>
+// (SUBREG_TO_REG $wt, $fs, <subreg>)
+// (SLL $lanetmp1, $lane, <log2size)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSVE_[WD], $wdtmp2, 0, $wdtmp1, 0)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2,  $lanetmp2)
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned EltSizeInBytes,
+                                         bool IsFP) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned SrcVecReg = MI->getOperand(1).getReg();
+  unsigned LaneReg = MI->getOperand(2).getReg();
+  unsigned SrcValReg = MI->getOperand(3).getReg();
+
+  const TargetRegisterClass *VecRC = nullptr;
+  const TargetRegisterClass *GPRRC =
+      Subtarget.isGP64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  unsigned EltLog2Size;
+  unsigned InsertOp = 0;
+  unsigned InsveOp = 0;
+  switch (EltSizeInBytes) {
+  default:
+    llvm_unreachable("Unexpected size");
+  case 1:
+    EltLog2Size = 0;
+    InsertOp = Mips::INSERT_B;
+    InsveOp = Mips::INSVE_B;
+    VecRC = &Mips::MSA128BRegClass;
+    break;
+  case 2:
+    EltLog2Size = 1;
+    InsertOp = Mips::INSERT_H;
+    InsveOp = Mips::INSVE_H;
+    VecRC = &Mips::MSA128HRegClass;
+    break;
+  case 4:
+    EltLog2Size = 2;
+    InsertOp = Mips::INSERT_W;
+    InsveOp = Mips::INSVE_W;
+    VecRC = &Mips::MSA128WRegClass;
+    break;
+  case 8:
+    EltLog2Size = 3;
+    InsertOp = Mips::INSERT_D;
+    InsveOp = Mips::INSVE_D;
+    VecRC = &Mips::MSA128DRegClass;
+    break;
+  }
+
+  if (IsFP) {
+    unsigned Wt = RegInfo.createVirtualRegister(VecRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(EltSizeInBytes == 8 ? Mips::sub_64 : Mips::sub_lo);
+    SrcValReg = Wt;
+  }
+
+  // Convert the lane index into a byte index
+  if (EltSizeInBytes != 1) {
+    unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SLL), LaneTmp1)
+        .addReg(LaneReg)
+        .addImm(EltLog2Size);
+    LaneReg = LaneTmp1;
+  }
+
+  // Rotate bytes around so that the desired lane is element zero
+  unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
+      .addReg(SrcVecReg)
+      .addReg(SrcVecReg)
+      .addReg(LaneReg);
+
+  unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
+  if (IsFP) {
+    // Use insve.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(0);
+  } else {
+    // Use insert.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsertOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addReg(SrcValReg)
+        .addImm(0);
+  }
+
+  // Rotate elements the rest of the way for a full rotation.
+  // sld.df inteprets $rt modulo the number of columns so we only need to negate
+  // the lane index to do this.
+  unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUB), LaneTmp2)
+      .addReg(Mips::ZERO)
+      .addReg(LaneReg);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd)
+      .addReg(WdTmp2)
+      .addReg(WdTmp2)
+      .addReg(LaneTmp2);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FW pseudo instruction.
+//
+// fill_fw_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_lo, $wt1, $fs
+// splati.w $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Fs = MI->getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FD pseudo instruction.
+//
+// fill_fd_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_64, $wt1, $fs
+// splati.d $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
+  assert(Subtarget.isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Fs = MI->getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_W_1 pseudo instructions.
+//
+// fexp2_w_1_pseudo $wd, $wt
+// =>
+// ldi.w $ws, 1
+// fexp2.w $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI->getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI->getOperand(1).getReg());
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_D_1 pseudo instructions.
+//
+// fexp2_d_1_pseudo $wd, $wt
+// =>
+// ldi.d $ws, 1
+// fexp2.d $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI->getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI->getOperand(1).getReg());
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
new file mode 100644
index 0000000..00d8683
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -0,0 +1,116 @@
+//===-- MipsSEISelLowering.h - MipsSE DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEISELLOWERING_H
+#define MIPSSEISELLOWERING_H
+
+#include "MipsISelLowering.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+  class MipsSETargetLowering : public MipsTargetLowering  {
+  public:
+    explicit MipsSETargetLowering(MipsTargetMachine &TM,
+                                  const MipsSubtarget &STI);
+
+    /// \brief Enable MSA support for the given integer type and Register
+    /// class.
+    void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
+    /// \brief Enable MSA support for the given floating-point type and
+    /// Register class.
+    void addMSAFloatType(MVT::SimpleValueType Ty,
+                         const TargetRegisterClass *RC);
+
+    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+                                       bool *Fast = nullptr) const override;
+
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *MBB) const override;
+
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override {
+      return false;
+    }
+
+    const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
+
+  private:
+    bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                     unsigned NextStackOffset,
+                                     const MipsFunctionInfo& FI) const override;
+
+    void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const override;
+
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
+                        SelectionDAG &DAG) const;
+
+    SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+    /// depending on the indices in the shuffle.
+    SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+
+    MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const;
+    MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            unsigned BranchOp) const;
+    /// \brief Emit the COPY_FW pseudo instruction
+    MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the COPY_FD pseudo instruction
+    MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FW pseudo instruction
+    MachineBasicBlock *emitINSERT_FW(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FD pseudo instruction
+    MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+    MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned EltSizeInBytes,
+                                          bool IsFP) const;
+    /// \brief Emit the FILL_FW pseudo instruction
+    MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FD pseudo instruction
+    MachineBasicBlock *emitFILL_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_W_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_W_1(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_D_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_D_1(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+  };
+}
+
+#endif // MipsSEISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
new file mode 100644
index 0000000..69cb74c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -0,0 +1,637 @@
+//===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
+    : MipsInstrInfo(STI, STI.getRelocationModel() == Reloc::PIC_ ? Mips::B
+                                                                 : Mips::J),
+      RI(STI), IsN64(STI.isABI_N64()) {}
+
+const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsSEInstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == Mips::LW)   || (Opc == Mips::LD)   ||
+      (Opc == Mips::LWC1) || (Opc == Mips::LDC1) || (Opc == Mips::LDC164)) {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsSEInstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == Mips::SW)   || (Opc == Mips::SD)   ||
+      (Opc == Mips::SWC1) || (Opc == Mips::SDC1) || (Opc == Mips::SDC164)) {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  unsigned Opc = 0, ZeroReg = 0;
+  bool isMicroMips = Subtarget.inMicroMipsMode();
+
+  if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg.
+    if (Mips::GPR32RegClass.contains(SrcReg)) {
+      if (isMicroMips)
+        Opc = Mips::MOVE16_MM;
+      else
+        Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
+    } else if (Mips::CCRRegClass.contains(SrcReg))
+      Opc = Mips::CFC1;
+    else if (Mips::FGR32RegClass.contains(SrcReg))
+      Opc = Mips::MFC1;
+    else if (Mips::HI32RegClass.contains(SrcReg)) {
+      Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+      SrcReg = 0;
+    } else if (Mips::LO32RegClass.contains(SrcReg)) {
+      Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+      SrcReg = 0;
+    } else if (Mips::HI32DSPRegClass.contains(SrcReg))
+      Opc = Mips::MFHI_DSP;
+    else if (Mips::LO32DSPRegClass.contains(SrcReg))
+      Opc = Mips::MFLO_DSP;
+    else if (Mips::DSPCCRegClass.contains(SrcReg)) {
+      BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
+        .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      return;
+    }
+    else if (Mips::MSACtrlRegClass.contains(SrcReg))
+      Opc = Mips::CFCMSA;
+  }
+  else if (Mips::GPR32RegClass.contains(SrcReg)) { // Copy from CPU Reg.
+    if (Mips::CCRRegClass.contains(DestReg))
+      Opc = Mips::CTC1;
+    else if (Mips::FGR32RegClass.contains(DestReg))
+      Opc = Mips::MTC1;
+    else if (Mips::HI32RegClass.contains(DestReg))
+      Opc = Mips::MTHI, DestReg = 0;
+    else if (Mips::LO32RegClass.contains(DestReg))
+      Opc = Mips::MTLO, DestReg = 0;
+    else if (Mips::HI32DSPRegClass.contains(DestReg))
+      Opc = Mips::MTHI_DSP;
+    else if (Mips::LO32DSPRegClass.contains(DestReg))
+      Opc = Mips::MTLO_DSP;
+    else if (Mips::DSPCCRegClass.contains(DestReg)) {
+      BuildMI(MBB, I, DL, get(Mips::WRDSP))
+        .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1 << 4)
+        .addReg(DestReg, RegState::ImplicitDefine);
+      return;
+    }
+    else if (Mips::MSACtrlRegClass.contains(DestReg))
+      Opc = Mips::CTCMSA;
+  }
+  else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_S;
+  else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D32;
+  else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D64;
+  else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+    if (Mips::GPR64RegClass.contains(SrcReg))
+      Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
+    else if (Mips::HI64RegClass.contains(SrcReg))
+      Opc = Mips::MFHI64, SrcReg = 0;
+    else if (Mips::LO64RegClass.contains(SrcReg))
+      Opc = Mips::MFLO64, SrcReg = 0;
+    else if (Mips::FGR64RegClass.contains(SrcReg))
+      Opc = Mips::DMFC1;
+  }
+  else if (Mips::GPR64RegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+    if (Mips::HI64RegClass.contains(DestReg))
+      Opc = Mips::MTHI64, DestReg = 0;
+    else if (Mips::LO64RegClass.contains(DestReg))
+      Opc = Mips::MTLO64, DestReg = 0;
+    else if (Mips::FGR64RegClass.contains(DestReg))
+      Opc = Mips::DMTC1;
+  }
+  else if (Mips::MSA128BRegClass.contains(DestReg)) { // Copy to MSA reg
+    if (Mips::MSA128BRegClass.contains(SrcReg))
+      Opc = Mips::MOVE_V;
+  }
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+
+  if (ZeroReg)
+    MIB.addReg(ZeroReg);
+}
+
+void MipsSEInstrInfo::
+storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                unsigned SrcReg, bool isKill, int FI,
+                const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
+                int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+
+  unsigned Opc = 0;
+
+  if (Mips::GPR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SW;
+  else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC128;
+  else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_CCOND_DSP;
+  else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SWC1;
+  else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SDC1;
+  else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::ST_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::ST_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::ST_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::ST_D;
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+}
+
+void MipsSEInstrInfo::
+loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                 unsigned DestReg, int FI, const TargetRegisterClass *RC,
+                 const TargetRegisterInfo *TRI, int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  unsigned Opc = 0;
+
+  if (Mips::GPR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LW;
+  else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC128;
+  else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_CCOND_DSP;
+  else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LWC1;
+  else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LDC1;
+  else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::LD_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::LD_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::LD_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::LD_D;
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
+    .addMemOperand(MMO);
+}
+
+bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock &MBB = *MI->getParent();
+  bool isMicroMips = Subtarget.inMicroMipsMode();
+  unsigned Opc;
+
+  switch(MI->getDesc().getOpcode()) {
+  default:
+    return false;
+  case Mips::RetRA:
+    expandRetRA(MBB, MI);
+    break;
+  case Mips::PseudoMFHI:
+    Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+    expandPseudoMFHiLo(MBB, MI, Opc);
+    break;
+  case Mips::PseudoMFLO:
+    Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+    expandPseudoMFHiLo(MBB, MI, Opc);
+    break;
+  case Mips::PseudoMFHI64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
+    break;
+  case Mips::PseudoMFLO64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFLO64);
+    break;
+  case Mips::PseudoMTLOHI:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO, Mips::MTHI, false);
+    break;
+  case Mips::PseudoMTLOHI64:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO64, Mips::MTHI64, false);
+    break;
+  case Mips::PseudoMTLOHI_DSP:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
+    break;
+  case Mips::PseudoCVT_S_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
+    break;
+  case Mips::PseudoCVT_D32_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+    break;
+  case Mips::PseudoCVT_S_L:
+    expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
+    break;
+  case Mips::PseudoCVT_D64_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+    break;
+  case Mips::PseudoCVT_D64_L:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
+    break;
+  case Mips::BuildPairF64:
+    expandBuildPairF64(MBB, MI, false);
+    break;
+  case Mips::BuildPairF64_64:
+    expandBuildPairF64(MBB, MI, true);
+    break;
+  case Mips::ExtractElementF64:
+    expandExtractElementF64(MBB, MI, false);
+    break;
+  case Mips::ExtractElementF64_64:
+    expandExtractElementF64(MBB, MI, true);
+    break;
+  case Mips::MIPSeh_return32:
+  case Mips::MIPSeh_return64:
+    expandEhReturn(MBB, MI);
+    break;
+  }
+
+  MBB.erase(MI);
+  return true;
+}
+
+/// getOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:           llvm_unreachable("Illegal opcode!");
+  case Mips::BEQ:    return Mips::BNE;
+  case Mips::BNE:    return Mips::BEQ;
+  case Mips::BGTZ:   return Mips::BLEZ;
+  case Mips::BGEZ:   return Mips::BLTZ;
+  case Mips::BLTZ:   return Mips::BGEZ;
+  case Mips::BLEZ:   return Mips::BGTZ;
+  case Mips::BEQ64:  return Mips::BNE64;
+  case Mips::BNE64:  return Mips::BEQ64;
+  case Mips::BGTZ64: return Mips::BLEZ64;
+  case Mips::BGEZ64: return Mips::BLTZ64;
+  case Mips::BLTZ64: return Mips::BGEZ64;
+  case Mips::BLEZ64: return Mips::BGTZ64;
+  case Mips::BC1T:   return Mips::BC1F;
+  case Mips::BC1F:   return Mips::BC1T;
+  }
+}
+
+/// Adjust SP by Amount bytes.
+void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  const MipsSubtarget &STI = Subtarget;
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+  unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+
+  if (isInt<16>(Amount))// addi sp, sp, amount
+    BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
+  else { // Expand immediate that doesn't fit in 16-bit.
+    unsigned Reg = loadImmediate(Amount, MBB, I, DL, nullptr);
+    BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg, RegState::Kill);
+  }
+}
+
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned
+MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator II, DebugLoc DL,
+                               unsigned *NewImm) const {
+  MipsAnalyzeImmediate AnalyzeImm;
+  const MipsSubtarget &STI = Subtarget;
+  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+  unsigned Size = STI.isABI_N64() ? 64 : 32;
+  unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+  unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  const TargetRegisterClass *RC = STI.isABI_N64() ?
+    &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  bool LastInstrIsADDiu = NewImm;
+
+  const MipsAnalyzeImmediate::InstSeq &Seq =
+    AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+  MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+  assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+  // The first instruction can be a LUi, which is different from other
+  // instructions (ADDiu, ORI and SLL) in that it does not have a register
+  // operand.
+  unsigned Reg = RegInfo.createVirtualRegister(RC);
+
+  if (Inst->Opc == LUi)
+    BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd));
+  else
+    BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(ZEROReg)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  // Build the remaining instructions in Seq.
+  for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+    BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(Reg, RegState::Kill)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  if (LastInstrIsADDiu)
+    *NewImm = Inst->ImmOpnd;
+
+  return Reg;
+}
+
+unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
+  return (Opc == Mips::BEQ    || Opc == Mips::BNE    || Opc == Mips::BGTZ   ||
+          Opc == Mips::BGEZ   || Opc == Mips::BLTZ   || Opc == Mips::BLEZ   ||
+          Opc == Mips::BEQ64  || Opc == Mips::BNE64  || Opc == Mips::BGTZ64 ||
+          Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
+          Opc == Mips::BC1T   || Opc == Mips::BC1F   || Opc == Mips::B      ||
+          Opc == Mips::J) ?
+         Opc : 0;
+}
+
+void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const {
+  if (Subtarget.isGP64bit())
+    BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
+        .addReg(Mips::RA_64);
+  else
+    BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
+}
+
+std::pair<bool, bool>
+MipsSEInstrInfo::compareOpndSize(unsigned Opc,
+                                 const MachineFunction &MF) const {
+  const MCInstrDesc &Desc = get(Opc);
+  assert(Desc.NumOperands == 2 && "Unary instruction expected.");
+  const MipsRegisterInfo *RI = &getRegisterInfo();
+  unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize();
+  unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize();
+
+  return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
+}
+
+void MipsSEInstrInfo::expandPseudoMFHiLo(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned NewOpc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(NewOpc), I->getOperand(0).getReg());
+}
+
+void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned LoOpc,
+                                         unsigned HiOpc,
+                                         bool HasExplicitDef) const {
+  // Expand
+  //  lo_hi pseudomtlohi $gpr0, $gpr1
+  // to these two instructions:
+  //  mtlo $gpr0
+  //  mthi $gpr1
+
+  DebugLoc DL = I->getDebugLoc();
+  const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2);
+  MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc));
+  MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc));
+  LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill()));
+  HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill()));
+
+  // Add lo/hi registers if the mtlo/hi instructions created have explicit
+  // def registers.
+  if (HasExplicitDef) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+    unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
+    LoInst.addReg(DstLo, RegState::Define);
+    HiInst.addReg(DstHi, RegState::Define);
+  }
+}
+
+void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned CvtOpc, unsigned MovOpc,
+                                     bool IsI64) const {
+  const MCInstrDesc &CvtDesc = get(CvtOpc), &MovDesc = get(MovOpc);
+  const MachineOperand &Dst = I->getOperand(0), &Src = I->getOperand(1);
+  unsigned DstReg = Dst.getReg(), SrcReg = Src.getReg(), TmpReg = DstReg;
+  unsigned KillSrc =  getKillRegState(Src.isKill());
+  DebugLoc DL = I->getDebugLoc();
+  bool DstIsLarger, SrcIsLarger;
+
+  std::tie(DstIsLarger, SrcIsLarger) =
+      compareOpndSize(CvtOpc, *MBB.getParent());
+
+  if (DstIsLarger)
+    TmpReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+
+  if (SrcIsLarger)
+    DstReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+
+  BuildMI(MBB, I, DL, MovDesc, TmpReg).addReg(SrcReg, KillSrc);
+  BuildMI(MBB, I, DL, CvtDesc, DstReg).addReg(TmpReg, RegState::Kill);
+}
+
+void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              bool FP64) const {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+  unsigned N = I->getOperand(2).getImm();
+  DebugLoc dl = I->getDebugLoc();
+
+  assert(N < 2 && "Invalid immediate");
+  unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
+  unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  if (SubIdx == Mips::sub_hi && Subtarget.hasMTHC1()) {
+    // FIXME: Strictly speaking MFHC1 only reads the top 32-bits however, we
+    //        claim to read the whole 64-bits as part of a white lie used to
+    //        temporarily work around a widespread bug in the -mfp64 support.
+    //        The problem is that none of the 32-bit fpu ops mention the fact
+    //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+    //        requires a major overhaul of the FPU implementation which can't
+    //        be done right now due to time constraints.
+    //        MFHC1 is one of two instructions that are affected since they are
+    //        the only instructions that don't read the lower 32-bits.
+    //        We therefore pretend that it reads the bottom 32-bits to
+    //        artificially create a dependency and prevent the scheduler
+    //        changing the behaviour of the code.
+    BuildMI(MBB, I, dl, get(FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32), DstReg)
+        .addReg(SrcReg);
+  } else
+    BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
+}
+
+void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         bool FP64) const {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+  const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
+  DebugLoc dl = I->getDebugLoc();
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+
+  // When mthc1 is available, use:
+  //   mtc1 Lo, $fp
+  //   mthc1 Hi, $fp
+  //
+  // Otherwise, for O32 FPXX ABI:
+  //   spill + reload via ldc1
+  // This case is handled by the frame lowering code.
+  //
+  // Otherwise, for FP32:
+  //   mtc1 Lo, $fp
+  //   mtc1 Hi, $fp + 1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
+    .addReg(LoReg);
+
+  if (Subtarget.hasMTHC1()) {
+    // FIXME: The .addReg(DstReg) is a white lie used to temporarily work
+    //        around a widespread bug in the -mfp64 support.
+    //        The problem is that none of the 32-bit fpu ops mention the fact
+    //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+    //        requires a major overhaul of the FPU implementation which can't
+    //        be done right now due to time constraints.
+    //        MTHC1 is one of two instructions that are affected since they are
+    //        the only instructions that don't read the lower 32-bits.
+    //        We therefore pretend that it reads the bottom 32-bits to
+    //        artificially create a dependency and prevent the scheduler
+    //        changing the behaviour of the code.
+    BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg)
+        .addReg(DstReg)
+        .addReg(HiReg);
+  } else if (Subtarget.isABI_FPXX())
+    llvm_unreachable("BuildPairF64 not expanded in frame lowering code!");
+  else
+    BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
+      .addReg(HiReg);
+}
+
+void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  // This pseudo instruction is generated as part of the lowering of
+  // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and
+  // indirect jump to TargetReg
+  unsigned ADDU = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+  unsigned SP = Subtarget.isGP64bit() ? Mips::SP_64 : Mips::SP;
+  unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA;
+  unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9;
+  unsigned ZERO = Subtarget.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned OffsetReg = I->getOperand(0).getReg();
+  unsigned TargetReg = I->getOperand(1).getReg();
+
+  // addu $ra, $v0, $zero
+  // addu $sp, $sp, $v1
+  // jr   $ra (via RetRA)
+  const TargetMachine &TM = MBB.getParent()->getTarget();
+  if (TM.getRelocationModel() == Reloc::PIC_)
+    BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), T9)
+        .addReg(TargetReg)
+        .addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), RA)
+      .addReg(TargetReg)
+      .addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP)
+      .addReg(SP)
+      .addReg(OffsetReg);
+  expandRetRA(MBB, I);
+}
+
+const MipsInstrInfo *llvm::createMipsSEInstrInfo(const MipsSubtarget &STI) {
+  return new MipsSEInstrInfo(STI);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
new file mode 100644
index 0000000..9576fef
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -0,0 +1,119 @@
+//===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEINSTRUCTIONINFO_H
+#define MIPSSEINSTRUCTIONINFO_H
+
+#include "MipsInstrInfo.h"
+#include "MipsSERegisterInfo.h"
+
+namespace llvm {
+
+class MipsSEInstrInfo : public MipsInstrInfo {
+  const MipsSERegisterInfo RI;
+  bool IsN64;
+
+public:
+  explicit MipsSEInstrInfo(const MipsSubtarget &STI);
+
+  const MipsRegisterInfo &getRegisterInfo() const override;
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStack(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MI,
+                       unsigned SrcReg, bool isKill, int FrameIndex,
+                       const TargetRegisterClass *RC,
+                       const TargetRegisterInfo *TRI,
+                       int64_t Offset) const override;
+
+  void loadRegFromStack(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MI,
+                        unsigned DestReg, int FrameIndex,
+                        const TargetRegisterClass *RC,
+                        const TargetRegisterInfo *TRI,
+                        int64_t Offset) const override;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+  unsigned getOppositeBranchOpc(unsigned Opc) const override;
+
+  /// Adjust SP by Amount bytes.
+  void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+  /// Emit a series of instructions to load an immediate. If NewImm is a
+  /// non-NULL parameter, the last instruction is not emitted, but instead
+  /// its immediate operand is returned in NewImm.
+  unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, DebugLoc DL,
+                         unsigned *NewImm) const;
+
+private:
+  unsigned getAnalyzableBrOpc(unsigned Opc) const override;
+
+  void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+
+  std::pair<bool, bool> compareOpndSize(unsigned Opc,
+                                        const MachineFunction &MF) const;
+
+  void expandPseudoMFHiLo(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned NewOpc) const;
+
+  void expandPseudoMTLoHi(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned LoOpc, unsigned HiOpc,
+                          bool HasExplicitDef) const;
+
+  /// Expand pseudo Int-to-FP conversion instructions.
+  ///
+  /// For example, the following pseudo instruction
+  ///  PseudoCVT_D32_W D2, A5
+  /// gets expanded into these two instructions:
+  ///  MTC1 F4, A5
+  ///  CVT_D32_W D2, F4
+  ///
+  /// We do this expansion post-RA to avoid inserting a floating point copy
+  /// instruction between MTC1 and CVT_D32_W.
+  void expandCvtFPInt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                      unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
+
+  void expandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, bool FP64) const;
+  void expandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, bool FP64) const;
+  void expandEhReturn(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
new file mode 100644
index 0000000..0af1a6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -0,0 +1,204 @@
+//===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information -== -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSERegisterInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
+MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST)
+  : MipsRegisterInfo(ST) {}
+
+bool MipsSERegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool MipsSERegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+const TargetRegisterClass *
+MipsSERegisterInfo::intRegClass(unsigned Size) const {
+  if (Size == 4)
+    return &Mips::GPR32RegClass;
+
+  assert(Size == 8);
+  return &Mips::GPR64RegClass;
+}
+
+/// Get the size of the offset supported by the given load/store.
+/// The result includes the effects of any scale factors applied to the
+/// instruction immediate.
+static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode) {
+  switch (Opcode) {
+  case Mips::LD_B:
+  case Mips::ST_B:
+    return 10;
+  case Mips::LD_H:
+  case Mips::ST_H:
+    return 10 + 1 /* scale factor */;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    return 10 + 2 /* scale factor */;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    return 10 + 3 /* scale factor */;
+  default:
+    return 16;
+  }
+}
+
+/// Get the scale factor applied to the immediate in the given load/store.
+static inline unsigned getLoadStoreOffsetAlign(const unsigned Opcode) {
+  switch (Opcode) {
+  case Mips::LD_H:
+  case Mips::ST_H:
+    return 2;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    return 4;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    return 8;
+  default:
+    return 1;
+  }
+}
+
+void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+                                     unsigned OpNo, int FrameIndex,
+                                     uint64_t StackSize,
+                                     int64_t SPOffset) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex);
+
+  // The following stack frame objects are always referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  //  4. Locations for eh data registers.
+  // Everything else is referenced relative to whatever register
+  // getFrameRegister() returns.
+  unsigned FrameReg;
+
+  if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI)
+    FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  else
+    FrameReg = getFrameRegister(MF);
+
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object is one of the
+  //   following: an outgoing argument, pointer to a dynamically allocated
+  //   stack space or a $gp restore location,
+  // - If the frame object is any of the following, its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.
+  bool IsKill = false;
+  int64_t Offset;
+
+  Offset = SPOffset + (int64_t)StackSize;
+  Offset += MI.getOperand(OpNo + 1).getImm();
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  if (!MI.isDebugValue()) {
+    // Make sure Offset fits within the field available.
+    // For MSA instructions, this is a 10-bit signed immediate (scaled by
+    // element size), otherwise it is a 16-bit signed immediate.
+    unsigned OffsetBitSize = getLoadStoreOffsetSizeInBits(MI.getOpcode());
+    unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
+
+    if (OffsetBitSize < 16 && isInt<16>(Offset) &&
+        (!isIntN(OffsetBitSize, Offset) ||
+         OffsetToAlignment(Offset, OffsetAlign) != 0)) {
+      // If we have an offset that needs to fit into a signed n-bit immediate
+      // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned ADDiu = Subtarget.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+      const TargetRegisterClass *RC =
+          Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+      MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+      unsigned Reg = RegInfo.createVirtualRegister(RC);
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+               MBB.getParent()->getTarget().getInstrInfo());
+      BuildMI(MBB, II, DL, TII.get(ADDiu), Reg).addReg(FrameReg).addImm(Offset);
+
+      FrameReg = Reg;
+      Offset = 0;
+      IsKill = true;
+    } else if (!isInt<16>(Offset)) {
+      // Otherwise split the offset into 16-bit pieces and add it in multiple
+      // instructions.
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+      unsigned NewImm = 0;
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+               MBB.getParent()->getTarget().getInstrInfo());
+      unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
+                                       OffsetBitSize == 16 ? &NewImm : nullptr);
+      BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
+        .addReg(Reg, RegState::Kill);
+
+      FrameReg = Reg;
+      Offset = SignExtend64<16>(NewImm);
+      IsKill = true;
+    }
+  }
+
+  MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+  MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
new file mode 100644
index 0000000..f2f3a7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
@@ -0,0 +1,41 @@
+//===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEREGISTERINFO_H
+#define MIPSSEREGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class MipsSEInstrInfo;
+
+class MipsSERegisterInfo : public MipsRegisterInfo {
+public:
+  MipsSERegisterInfo(const MipsSubtarget &Subtarget);
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *intRegClass(unsigned Size) const override;
+
+private:
+  void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                   int FrameIndex, uint64_t StackSize,
+                   int64_t SPOffset) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
new file mode 100644
index 0000000..ea98199
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
@@ -0,0 +1,311 @@
+//===-- MipsSchedule.td - Mips Scheduling Definitions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across Mips chips sets. Based on GCC/Mips backend files.
+//===----------------------------------------------------------------------===//
+def ALU     : FuncUnit;
+def IMULDIV : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Mips
+//===----------------------------------------------------------------------===//
+def IIAlu              : InstrItinClass;
+def IIBranch           : InstrItinClass;
+def IIPseudo           : InstrItinClass;
+
+def II_ABS              : InstrItinClass;
+def II_ADDI             : InstrItinClass;
+def II_ADDIU            : InstrItinClass;
+def II_ADDU             : InstrItinClass;
+def II_ADD_D            : InstrItinClass;
+def II_ADD_S            : InstrItinClass;
+def II_AND              : InstrItinClass;
+def II_ANDI             : InstrItinClass;
+def II_BADDU            : InstrItinClass;
+def II_CEIL             : InstrItinClass;
+def II_CFC1             : InstrItinClass;
+def II_CLO              : InstrItinClass;
+def II_CLZ              : InstrItinClass;
+def II_CTC1             : InstrItinClass;
+def II_CVT              : InstrItinClass;
+def II_C_CC_D           : InstrItinClass; // Any c.<cc>.d instruction
+def II_C_CC_S           : InstrItinClass; // Any c.<cc>.s instruction
+def II_DADDIU           : InstrItinClass;
+def II_DADDU            : InstrItinClass;
+def II_DADD             : InstrItinClass;
+def II_DDIV             : InstrItinClass;
+def II_DDIVU            : InstrItinClass;
+def II_DIV              : InstrItinClass;
+def II_DIVU             : InstrItinClass;
+def II_DIV_D            : InstrItinClass;
+def II_DIV_S            : InstrItinClass;
+def II_DMFC1            : InstrItinClass;
+def II_DMTC1            : InstrItinClass;
+def II_DMUL             : InstrItinClass;
+def II_DMULT            : InstrItinClass;
+def II_DMULTU           : InstrItinClass;
+def II_DROTR            : InstrItinClass;
+def II_DROTR32          : InstrItinClass;
+def II_DROTRV           : InstrItinClass;
+def II_DSLL             : InstrItinClass;
+def II_DSLL32           : InstrItinClass;
+def II_DSLLV            : InstrItinClass;
+def II_DSRA             : InstrItinClass;
+def II_DSRA32           : InstrItinClass;
+def II_DSRAV            : InstrItinClass;
+def II_DSRL             : InstrItinClass;
+def II_DSRL32           : InstrItinClass;
+def II_DSRLV            : InstrItinClass;
+def II_DSUBU            : InstrItinClass;
+def II_DSUB             : InstrItinClass;
+def II_FLOOR            : InstrItinClass;
+def II_LB               : InstrItinClass;
+def II_LBU              : InstrItinClass;
+def II_LD               : InstrItinClass;
+def II_LDC1             : InstrItinClass;
+def II_LDL              : InstrItinClass;
+def II_LDR              : InstrItinClass;
+def II_LDXC1            : InstrItinClass;
+def II_LH               : InstrItinClass;
+def II_LHU              : InstrItinClass;
+def II_LUI              : InstrItinClass;
+def II_LUXC1            : InstrItinClass;
+def II_LW               : InstrItinClass;
+def II_LWC1             : InstrItinClass;
+def II_LWL              : InstrItinClass;
+def II_LWR              : InstrItinClass;
+def II_LWU              : InstrItinClass;
+def II_LWXC1            : InstrItinClass;
+def II_MADD             : InstrItinClass;
+def II_MADDU            : InstrItinClass;
+def II_MADD_D           : InstrItinClass;
+def II_MADD_S           : InstrItinClass;
+def II_MFC1             : InstrItinClass;
+def II_MFHC1            : InstrItinClass;
+def II_MFHI_MFLO        : InstrItinClass; // mfhi and mflo
+def II_MOVF             : InstrItinClass;
+def II_MOVF_D           : InstrItinClass;
+def II_MOVF_S           : InstrItinClass;
+def II_MOVN             : InstrItinClass;
+def II_MOVN_D           : InstrItinClass;
+def II_MOVN_S           : InstrItinClass;
+def II_MOVT             : InstrItinClass;
+def II_MOVT_D           : InstrItinClass;
+def II_MOVT_S           : InstrItinClass;
+def II_MOVZ             : InstrItinClass;
+def II_MOVZ_D           : InstrItinClass;
+def II_MOVZ_S           : InstrItinClass;
+def II_MOV_D            : InstrItinClass;
+def II_MOV_S            : InstrItinClass;
+def II_MSUB             : InstrItinClass;
+def II_MSUBU            : InstrItinClass;
+def II_MSUB_D           : InstrItinClass;
+def II_MSUB_S           : InstrItinClass;
+def II_MTC1             : InstrItinClass;
+def II_MTHC1            : InstrItinClass;
+def II_MTHI_MTLO        : InstrItinClass; // mthi and mtlo
+def II_MUL              : InstrItinClass;
+def II_MULT             : InstrItinClass;
+def II_MULTU            : InstrItinClass;
+def II_MUL_D            : InstrItinClass;
+def II_MUL_S            : InstrItinClass;
+def II_NEG              : InstrItinClass;
+def II_NMADD_D          : InstrItinClass;
+def II_NMADD_S          : InstrItinClass;
+def II_NMSUB_D          : InstrItinClass;
+def II_NMSUB_S          : InstrItinClass;
+def II_NOR              : InstrItinClass;
+def II_OR               : InstrItinClass;
+def II_ORI              : InstrItinClass;
+def II_POP              : InstrItinClass;
+def II_RDHWR            : InstrItinClass;
+def II_RESTORE          : InstrItinClass;
+def II_ROTR             : InstrItinClass;
+def II_ROTRV            : InstrItinClass;
+def II_ROUND            : InstrItinClass;
+def II_SAVE             : InstrItinClass;
+def II_SB               : InstrItinClass;
+def II_SD               : InstrItinClass;
+def II_SDC1             : InstrItinClass;
+def II_SDL              : InstrItinClass;
+def II_SDR              : InstrItinClass;
+def II_SDXC1            : InstrItinClass;
+def II_SEB              : InstrItinClass;
+def II_SEH              : InstrItinClass;
+def II_SEQ_SNE          : InstrItinClass; // seq and sne
+def II_SEQI_SNEI        : InstrItinClass; // seqi and snei
+def II_SH               : InstrItinClass;
+def II_SLL              : InstrItinClass;
+def II_SLLV             : InstrItinClass;
+def II_SLTI_SLTIU       : InstrItinClass; // slti and sltiu
+def II_SLT_SLTU         : InstrItinClass; // slt and sltu
+def II_SQRT_D           : InstrItinClass;
+def II_SQRT_S           : InstrItinClass;
+def II_SRA              : InstrItinClass;
+def II_SRAV             : InstrItinClass;
+def II_SRL              : InstrItinClass;
+def II_SRLV             : InstrItinClass;
+def II_SUBU             : InstrItinClass;
+def II_SUB_D            : InstrItinClass;
+def II_SUB_S            : InstrItinClass;
+def II_SUXC1            : InstrItinClass;
+def II_SW               : InstrItinClass;
+def II_SWC1             : InstrItinClass;
+def II_SWL              : InstrItinClass;
+def II_SWR              : InstrItinClass;
+def II_SWXC1            : InstrItinClass;
+def II_TRUNC            : InstrItinClass;
+def II_XOR              : InstrItinClass;
+def II_XORI             : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Mips Generic instruction itineraries.
+//===----------------------------------------------------------------------===//
+def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
+  InstrItinData<IIAlu              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDIU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDU            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_AND             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BADDU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SLL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRA             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ROTR            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SLLV            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRAV            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRLV            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ROTRV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CLO             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CLZ             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADDIU          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADDU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADD            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSLL            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRL            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRA            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSLLV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRLV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRAV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSUBU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSUB            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DROTR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DROTRV          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LUI             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVF            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVN            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVN_S          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVN_D          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVT            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVZ            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_NOR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_OR              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_POP             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_RDHWR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SUBU            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_XOR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ANDI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ORI             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_XORI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LB              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LBU             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LH              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LHU             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LW              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWL             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWR             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LD              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDL             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDR             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_RESTORE         , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_SB              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SH              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SW              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SD              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SAVE            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SEQ_SNE         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SEQI_SNEI       , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIBranch           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DMUL            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMULT           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMULTU          , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MADD            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MADDU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MFHI_MFLO       , [InstrStage<1,  [IMULDIV]>]>,
+  InstrItinData<II_MSUB            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MSUBU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MTHI_MTLO       , [InstrStage<1,  [IMULDIV]>]>,
+  InstrItinData<II_MUL             , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MULT            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MULTU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MSUB            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MSUBU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DIV             , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_DIVU            , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_DDIV            , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_DDIVU           , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_CEIL            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CVT             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ABS             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_FLOOR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_NEG             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ROUND           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TRUNC           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOV_D           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOV_S           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CFC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CTC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVF_D          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVF_S          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVT_D          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVT_S          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVZ_D          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVZ_S          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_C_CC_S          , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_C_CC_D          , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_ADD_D           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_ADD_S           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_SUB_D           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_SUB_S           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MUL_S           , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MADD_S          , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MSUB_S          , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_NMADD_S         , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_NMSUB_S         , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MUL_D           , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_MADD_D          , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_MSUB_D          , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_NMADD_D         , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_NMSUB_D         , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_DIV_S           , [InstrStage<23, [ALU]>]>,
+  InstrItinData<II_DIV_D           , [InstrStage<36, [ALU]>]>,
+  InstrItinData<II_SQRT_S          , [InstrStage<54, [ALU]>]>,
+  InstrItinData<II_SQRT_D          , [InstrStage<12, [ALU]>]>,
+  InstrItinData<II_LDC1            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWC1            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDXC1           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWXC1           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LUXC1           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_SDC1            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWC1            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDXC1           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWXC1           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SUXC1           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DMFC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_DMTC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFHC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTHC1           , [InstrStage<2,  [ALU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.cpp
new file mode 100644
index 0000000..edd8f67
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MipsSelectionDAGInfo.cpp - Mips SelectionDAG Info -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-selectiondag-info"
+
+MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+MipsSelectionDAGInfo::~MipsSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h
new file mode 100644
index 0000000..2b3d527
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- MipsSelectionDAGInfo.h - Mips SelectionDAG Info ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Mips subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSELECTIONDAGINFO_H
+#define MIPSSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class MipsTargetMachine;
+
+class MipsSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit MipsSelectionDAGInfo(const DataLayout &DL);
+  ~MipsSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 0000000..5bf875d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,219 @@
+//===-- MipsSubtarget.cpp - Mips Subtarget Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMachineFunction.h"
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MipsGenSubtargetInfo.inc"
+
+// FIXME: Maybe this should be on by default when Mips16 is specified
+//
+static cl::opt<bool> Mixed16_32(
+  "mips-mixed-16-32",
+  cl::init(false),
+  cl::desc("Allow for a mixture of Mips16 "
+           "and Mips32 code in a single source file"),
+  cl::Hidden);
+
+static cl::opt<bool> Mips_Os16(
+  "mips-os16",
+  cl::init(false),
+  cl::desc("Compile all functions that don' use "
+           "floating point as Mips 16"),
+  cl::Hidden);
+
+static cl::opt<bool>
+Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                cl::desc("MIPS: mips16 hard float enable."),
+                cl::init(false));
+
+static cl::opt<bool>
+Mips16ConstantIslands(
+  "mips16-constant-islands", cl::NotHidden,
+  cl::desc("MIPS: mips16 constant islands enable."),
+  cl::init(true));
+
+/// Select the Mips CPU for the given triple and cpu name.
+/// FIXME: Merge with the copy in MipsMCTargetDesc.cpp
+static StringRef selectMipsCPU(Triple TT, StringRef CPU) {
+  if (CPU.empty() || CPU == "generic") {
+    if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
+      CPU = "mips32";
+    else
+      CPU = "mips64";
+  }
+  return CPU;
+}
+
+void MipsSubtarget::anchor() { }
+
+static std::string computeDataLayout(const MipsSubtarget &ST) {
+  std::string Ret = "";
+
+  // There are both little and big endian mips.
+  if (ST.isLittle())
+    Ret += "e";
+  else
+    Ret += "E";
+
+  Ret += "-m:m";
+
+  // Pointers are 32 bit on some ABIs.
+  if (!ST.isABI_N64())
+    Ret += "-p:32:32";
+
+  // 8 and 16 bit integers only need no have natural alignment, but try to
+  // align them to 32 bits. 64 bit integers have natural alignment.
+  Ret += "-i8:8:32-i16:16:32-i64:64";
+
+  // 32 bit registers are always available and the stack is at least 64 bit
+  // aligned. On N64 64 bit registers are also available and the stack is
+  // 128 bit aligned.
+  if (ST.isABI_N64() || ST.isABI_N32())
+    Ret += "-n32:64-S128";
+  else
+    Ret += "-n32-S64";
+
+  return Ret;
+}
+
+MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
+                             const std::string &FS, bool little,
+                             MipsTargetMachine *_TM)
+    : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32),
+      MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false),
+      IsFPXX(false), IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false),
+      IsGP64bit(false), HasVFPU(false), HasCnMips(false), IsLinux(true),
+      HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
+      HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
+      InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+      HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+      HasMSA(false), TM(_TM), TargetTriple(TT),
+      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS, TM))),
+      TSInfo(DL), JITInfo(), InstrInfo(MipsInstrInfo::create(*this)),
+      FrameLowering(MipsFrameLowering::create(*this)),
+      TLInfo(MipsTargetLowering::create(*TM, *this)) {
+
+  PreviousInMips16Mode = InMips16Mode;
+
+  // Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and
+  // MIPS-V. They have not been tested and currently exist for the integrated
+  // assembler only.
+  if (MipsArchVersion == Mips1)
+    report_fatal_error("Code generation for MIPS-I is not implemented", false);
+  if (MipsArchVersion == Mips2)
+    report_fatal_error("Code generation for MIPS-II is not implemented", false);
+  if (MipsArchVersion == Mips3)
+    report_fatal_error("Code generation for MIPS-III is not implemented",
+                       false);
+  if (MipsArchVersion == Mips5)
+    report_fatal_error("Code generation for MIPS-V is not implemented", false);
+
+  // Assert exactly one ABI was chosen.
+  assert(MipsABI != UnknownABI);
+  assert((((getFeatureBits() & Mips::FeatureO32) != 0) +
+          ((getFeatureBits() & Mips::FeatureEABI) != 0) +
+          ((getFeatureBits() & Mips::FeatureN32) != 0) +
+          ((getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
+
+  // Check if Architecture and ABI are compatible.
+  assert(((!isGP64bit() && (isABI_O32() || isABI_EABI())) ||
+          (isGP64bit() && (isABI_N32() || isABI_N64()))) &&
+         "Invalid  Arch & ABI pair.");
+
+  if (hasMSA() && !isFP64bit())
+    report_fatal_error("MSA requires a 64-bit FPU register file (FR=1 mode). "
+                       "See -mattr=+fp64.",
+                       false);
+
+  if (!isABI_O32() && !useOddSPReg())
+    report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false);
+
+  if (IsFPXX && (isABI_N32() || isABI_N64()))
+    report_fatal_error("FPXX is not permitted for the N32/N64 ABI's.", false);
+
+  if (hasMips32r6()) {
+    StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+
+    assert(isFP64bit());
+    assert(isNaN2008());
+    if (hasDSP())
+      report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
+  }
+
+  // Is the target system Linux ?
+  if (TT.find("linux") == std::string::npos)
+    IsLinux = false;
+
+  // Set UseSmallSection.
+  // TODO: Investigate the IsLinux check. I suspect it's really checking for
+  //       bare-metal.
+  UseSmallSection = !IsLinux && (TM->getRelocationModel() == Reloc::Static);
+}
+
+/// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool MipsSubtarget::enablePostMachineScheduler() const { return true; }
+
+void MipsSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(isGP64bit() ?
+                            &Mips::GPR64RegClass : &Mips::GPR32RegClass);
+}
+
+CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const {
+  return CodeGenOpt::Aggressive;
+}
+
+MipsSubtarget &
+MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                               const TargetMachine *TM) {
+  std::string CPUName = selectMipsCPU(TargetTriple, CPU);
+  
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  if (InMips16Mode && !TM->Options.UseSoftFloat)
+    InMips16HardFloat = true;
+
+  return *this;
+}
+
+bool MipsSubtarget::abiUsesSoftFloat() const {
+  return TM->Options.UseSoftFloat && !InMips16HardFloat;
+}
+
+bool MipsSubtarget::useConstantIslands() {
+  DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+  return Mips16ConstantIslands;
+}
+
+Reloc::Model MipsSubtarget::getRelocationModel() const {
+  return TM->getRelocationModel();
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 0000000..f326462
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,290 @@
+//===-- MipsSubtarget.h - Define Subtarget for the Mips ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSUBTARGET_H
+#define MIPSSUBTARGET_H
+
+#include "MipsFrameLowering.h"
+#include "MipsISelLowering.h"
+#include "MipsInstrInfo.h"
+#include "MipsJITInfo.h"
+#include "MipsSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MipsGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MipsTargetMachine;
+
+class MipsSubtarget : public MipsGenSubtargetInfo {
+  virtual void anchor();
+
+public:
+  // NOTE: O64 will not be supported.
+  enum MipsABIEnum {
+    UnknownABI, O32, N32, N64, EABI
+  };
+
+protected:
+  enum MipsArchEnum {
+    Mips1, Mips2, Mips32, Mips32r2, Mips32r6, Mips3, Mips4, Mips5, Mips64,
+    Mips64r2, Mips64r6
+  };
+
+  // Mips architecture version
+  MipsArchEnum MipsArchVersion;
+
+  // Mips supported ABIs
+  MipsABIEnum MipsABI;
+
+  // IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  // IsSingleFloat - The target only supports single precision float
+  // point operations. This enable the target to use all 32 32-bit
+  // floating point registers instead of only using even ones.
+  bool IsSingleFloat;
+
+  // IsFPXX - MIPS O32 modeless ABI.
+  bool IsFPXX;
+
+  // IsFP64bit - The target processor has 64-bit floating point registers.
+  bool IsFP64bit;
+
+  /// Are odd single-precision registers permitted?
+  /// This corresponds to -modd-spreg and -mno-odd-spreg
+  bool UseOddSPReg;
+
+  // IsNan2008 - IEEE 754-2008 NaN encoding.
+  bool IsNaN2008bit;
+
+  // IsFP64bit - General-purpose registers are 64 bits wide
+  bool IsGP64bit;
+
+  // HasVFPU - Processor has a vector floating point unit.
+  bool HasVFPU;
+
+  // CPU supports cnMIPS (Cavium Networks Octeon CPU).
+  bool HasCnMips;
+
+  // isLinux - Target system is Linux. Is false we consider ELFOS for now.
+  bool IsLinux;
+
+  // UseSmallSection - Small section is used.
+  bool UseSmallSection;
+
+  /// Features related to the presence of specific instructions.
+
+  // HasMips3_32 - The subset of MIPS-III instructions added to MIPS32
+  bool HasMips3_32;
+
+  // HasMips3_32r2 - The subset of MIPS-III instructions added to MIPS32r2
+  bool HasMips3_32r2;
+
+  // HasMips4_32 - Has the subset of MIPS-IV present in MIPS32
+  bool HasMips4_32;
+
+  // HasMips4_32r2 - Has the subset of MIPS-IV present in MIPS32r2
+  bool HasMips4_32r2;
+
+  // HasMips5_32r2 - Has the subset of MIPS-V present in MIPS32r2
+  bool HasMips5_32r2;
+
+  // InMips16 -- can process Mips16 instructions
+  bool InMips16Mode;
+
+  // Mips16 hard float
+  bool InMips16HardFloat;
+
+  // PreviousInMips16 -- the function we just processed was in Mips 16 Mode
+  bool PreviousInMips16Mode;
+
+  // InMicroMips -- can process MicroMips instructions
+  bool InMicroMipsMode;
+
+  // HasDSP, HasDSPR2 -- supports DSP ASE.
+  bool HasDSP, HasDSPR2;
+
+  // Allow mixed Mips16 and Mips32 in one source file
+  bool AllowMixed16_32;
+
+  // Optimize for space by compiling all functions as Mips 16 unless
+  // it needs floating point. Functions needing floating point are
+  // compiled as Mips32
+  bool Os16;
+
+  // HasMSA -- supports MSA ASE.
+  bool HasMSA;
+
+  InstrItineraryData InstrItins;
+
+  // We can override the determination of whether we are in mips16 mode
+  // as from the command line
+  enum {NoOverride, Mips16Override, NoMips16Override} OverrideMode;
+
+  MipsTargetMachine *TM;
+
+  Triple TargetTriple;
+
+  const DataLayout DL; // Calculates type size & alignment
+  const MipsSelectionDAGInfo TSInfo;
+  MipsJITInfo JITInfo;
+  std::unique_ptr<const MipsInstrInfo> InstrInfo;
+  std::unique_ptr<const MipsFrameLowering> FrameLowering;
+  std::unique_ptr<const MipsTargetLowering> TLInfo;
+
+public:
+  /// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostMachineScheduler() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+  CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override;
+
+  /// Only O32 and EABI supported right now.
+  bool isABI_EABI() const { return MipsABI == EABI; }
+  bool isABI_N64() const { return MipsABI == N64; }
+  bool isABI_N32() const { return MipsABI == N32; }
+  bool isABI_O32() const { return MipsABI == O32; }
+  bool isABI_FPXX() const { return isABI_O32() && IsFPXX; }
+  unsigned getTargetABI() const { return MipsABI; }
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  MipsSubtarget(const std::string &TT, const std::string &CPU,
+                const std::string &FS, bool little, MipsTargetMachine *TM);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool hasMips1() const { return MipsArchVersion >= Mips1; }
+  bool hasMips2() const { return MipsArchVersion >= Mips2; }
+  bool hasMips3() const { return MipsArchVersion >= Mips3; }
+  bool hasMips4() const { return MipsArchVersion >= Mips4; }
+  bool hasMips5() const { return MipsArchVersion >= Mips5; }
+  bool hasMips4_32() const { return HasMips4_32; }
+  bool hasMips4_32r2() const { return HasMips4_32r2; }
+  bool hasMips32() const {
+    return MipsArchVersion >= Mips32 && MipsArchVersion != Mips3 &&
+           MipsArchVersion != Mips4 && MipsArchVersion != Mips5;
+  }
+  bool hasMips32r2() const {
+    return MipsArchVersion == Mips32r2 || MipsArchVersion == Mips32r6 ||
+           MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6;
+  }
+  bool hasMips32r6() const {
+    return MipsArchVersion == Mips32r6 || MipsArchVersion == Mips64r6;
+  }
+  bool hasMips64() const { return MipsArchVersion >= Mips64; }
+  bool hasMips64r2() const {
+    return MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6;
+  }
+  bool hasMips64r6() const { return MipsArchVersion == Mips64r6; }
+
+  bool hasCnMips() const { return HasCnMips; }
+
+  bool isLittle() const { return IsLittle; }
+  bool isFPXX() const { return IsFPXX; }
+  bool isFP64bit() const { return IsFP64bit; }
+  bool useOddSPReg() const { return UseOddSPReg; }
+  bool noOddSPReg() const { return !UseOddSPReg; }
+  bool isNaN2008() const { return IsNaN2008bit; }
+  bool isNotFP64bit() const { return !IsFP64bit; }
+  bool isGP64bit() const { return IsGP64bit; }
+  bool isGP32bit() const { return !IsGP64bit; }
+  bool isSingleFloat() const { return IsSingleFloat; }
+  bool isNotSingleFloat() const { return !IsSingleFloat; }
+  bool hasVFPU() const { return HasVFPU; }
+  bool inMips16Mode() const { return InMips16Mode; }
+  bool inMips16ModeDefault() const {
+    return InMips16Mode;
+  }
+  // Hard float for mips16 means essentially to compile as soft float
+  // but to use a runtime library for soft float that is written with
+  // native mips32 floating point instructions (those runtime routines
+  // run in mips32 hard float mode).
+  bool inMips16HardFloat() const {
+    return inMips16Mode() && InMips16HardFloat;
+  }
+  bool inMicroMipsMode() const { return InMicroMipsMode; }
+  bool hasDSP() const { return HasDSP; }
+  bool hasDSPR2() const { return HasDSPR2; }
+  bool hasMSA() const { return HasMSA; }
+  bool isLinux() const { return IsLinux; }
+  bool useSmallSection() const { return UseSmallSection; }
+
+  bool hasStandardEncoding() const { return !inMips16Mode(); }
+
+  bool abiUsesSoftFloat() const;
+
+  bool enableLongBranchPass() const {
+    return hasStandardEncoding() || allowMixed16_32();
+  }
+
+  /// Features related to the presence of specific instructions.
+  bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
+  bool hasMTHC1() const { return hasMips32r2(); }
+
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  bool allowMixed16_32() const { return inMips16ModeDefault() |
+                                        AllowMixed16_32;}
+
+  bool os16() const { return Os16;};
+
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isNotTargetNaCl() const { return !TargetTriple.isOSNaCl(); }
+
+  // for now constant islands are on for the whole compilation unit but we only
+  // really use them if in addition we are in mips16 mode
+  static bool useConstantIslands();
+
+  unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
+
+  // Grab relocation model
+  Reloc::Model getRelocationModel() const;
+
+  MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                                 const TargetMachine *TM);
+
+  /// Does the system support unaligned memory access.
+  ///
+  /// MIPS32r6/MIPS64r6 require full unaligned access support but does not
+  /// specify which component of the system provides it. Hardware, software, and
+  /// hybrid implementations are all valid.
+  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
+
+  // Set helper classes
+  void setHelperClassesMips16();
+  void setHelperClassesMipsSE();
+
+  MipsJITInfo *getJITInfo() { return &JITInfo; }
+  const MipsSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const MipsInstrInfo *getInstrInfo() const { return InstrInfo.get(); }
+  const TargetFrameLowering *getFrameLowering() const {
+    return FrameLowering.get();
+  }
+  const MipsRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo->getRegisterInfo();
+  }
+  const MipsTargetLowering *getTargetLowering() const { return TLInfo.get(); }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 0000000..bb1870e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,198 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "Mips.h"
+#include "Mips16FrameLowering.h"
+#include "Mips16HardFloat.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "Mips16ISelLowering.h"
+#include "Mips16InstrInfo.h"
+#include "MipsFrameLowering.h"
+#include "MipsInstrInfo.h"
+#include "MipsModuleISelDAGToDAG.h"
+#include "MipsOs16.h"
+#include "MipsSEFrameLowering.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "MipsSEISelLowering.h"
+#include "MipsSEInstrInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips"
+
+extern "C" void LLVMInitializeMipsTarget() {
+  // Register the target.
+  RegisterTargetMachine<MipsebTargetMachine> X(TheMipsTarget);
+  RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
+  RegisterTargetMachine<MipsebTargetMachine> A(TheMips64Target);
+  RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget);
+}
+
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer, using StackGrowsUp enables
+// an easier handling.
+// Using CodeModel::Large enables different CALL behavior.
+MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT,
+                                     StringRef CPU, StringRef FS,
+                                     const TargetOptions &Options,
+                                     Reloc::Model RM, CodeModel::Model CM,
+                                     CodeGenOpt::Level OL, bool isLittle)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, this),
+      NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
+                        isLittle, this),
+      Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
+                      isLittle, this) {
+  Subtarget = &DefaultSubtarget;
+  initAsmInfo();
+}
+
+void MipsebTargetMachine::anchor() { }
+
+MipsebTargetMachine::
+MipsebTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS, const TargetOptions &Options,
+                    Reloc::Model RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL)
+  : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void MipselTargetMachine::anchor() { }
+
+MipselTargetMachine::
+MipselTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS, const TargetOptions &Options,
+                    Reloc::Model RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL)
+  : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
+  DEBUG(dbgs() << "resetSubtarget\n");
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  bool Mips16Attr = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, "mips16");
+  bool NoMips16Attr =
+      FnAttrs.hasAttribute(AttributeSet::FunctionIndex, "nomips16");
+  assert(!(Mips16Attr && NoMips16Attr) &&
+         "mips16 and nomips16 specified on the same function");
+  if (Mips16Attr)
+    Subtarget = &Mips16Subtarget;
+  else if (NoMips16Attr)
+    Subtarget = &NoMips16Subtarget;
+  else
+    Subtarget = &DefaultSubtarget;
+  return;
+}
+
+namespace {
+/// Mips Code Generator Pass Configuration Options.
+class MipsPassConfig : public TargetPassConfig {
+public:
+  MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+    // The current implementation of long branch pass requires a scratch
+    // register ($at) to be available before branch instructions. Tail merging
+    // can break this requirement, so disable it when long branch pass is
+    // enabled.
+    EnableTailMerge = !getMipsSubtarget().enableLongBranchPass();
+  }
+
+  MipsTargetMachine &getMipsTargetMachine() const {
+    return getTM<MipsTargetMachine>();
+  }
+
+  const MipsSubtarget &getMipsSubtarget() const {
+    return *getMipsTargetMachine().getSubtargetImpl();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addMachineSSAOptimization() override;
+  bool addPreEmitPass() override;
+
+  bool addPreRegAlloc() override;
+
+};
+} // namespace
+
+TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MipsPassConfig(this, PM);
+}
+
+void MipsPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+  if (getMipsSubtarget().os16())
+    addPass(createMipsOs16(getMipsTargetMachine()));
+  if (getMipsSubtarget().inMips16HardFloat())
+    addPass(createMips16HardFloat(getMipsTargetMachine()));
+  addPass(createPartiallyInlineLibCallsPass());
+}
+// Install an instruction selector pass using
+// the ISelDag to gen Mips code.
+bool MipsPassConfig::addInstSelector() {
+  addPass(createMipsModuleISelDag(getMipsTargetMachine()));
+  addPass(createMips16ISelDag(getMipsTargetMachine()));
+  addPass(createMipsSEISelDag(getMipsTargetMachine()));
+  return false;
+}
+
+void MipsPassConfig::addMachineSSAOptimization() {
+  addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+  TargetPassConfig::addMachineSSAOptimization();
+}
+
+bool MipsPassConfig::addPreRegAlloc() {
+  if (getOptLevel() == CodeGenOpt::None) {
+    addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+    return true;
+  }
+  else
+    return false;
+}
+
+void MipsTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  if (Subtarget->allowMixed16_32()) {
+    DEBUG(errs() << "No ");
+    //FIXME: The Basic Target Transform Info
+    // pass needs to become a function pass instead of
+    // being an immutable pass and then this method as it exists now
+    // would be unnecessary.
+    PM.add(createNoTargetTransformInfoPass());
+  } else
+    LLVMTargetMachine::addAnalysisPasses(PM);
+  DEBUG(errs() << "Target Transform Info Pass Added\n");
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted. return true if -print-machineinstrs should
+// print out the code after the passes.
+bool MipsPassConfig::addPreEmitPass() {
+  MipsTargetMachine &TM = getMipsTargetMachine();
+  addPass(createMipsDelaySlotFillerPass(TM));
+  addPass(createMipsLongBranchPass(TM));
+  addPass(createMipsConstantIslandPass(TM));
+  return true;
+}
+
+bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                       JITCodeEmitter &JCE) {
+  // Machine code emitter pass for Mips.
+  PM.add(createMipsJITCodeEmitterPass(*this, JCE));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 0000000..bcf411f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,105 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETMACHINE_H
+#define MIPSTARGETMACHINE_H
+
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class MipsRegisterInfo;
+
+class MipsTargetMachine : public LLVMTargetMachine {
+  MipsSubtarget *Subtarget;
+  MipsSubtarget DefaultSubtarget;
+  MipsSubtarget NoMips16Subtarget;
+  MipsSubtarget Mips16Subtarget;
+
+public:
+  MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                    const TargetOptions &Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
+
+  virtual ~MipsTargetMachine() {}
+
+  void addAnalysisPasses(PassManagerBase &PM) override;
+
+  const MipsInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const MipsSubtarget *getSubtargetImpl() const override {
+    if (Subtarget)
+      return Subtarget;
+    return &DefaultSubtarget;
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return Subtarget->inMips16Mode()
+               ? nullptr
+               : &getSubtargetImpl()->getInstrItineraryData();
+  }
+  MipsJITInfo *getJITInfo() override {
+    return Subtarget->getJITInfo();
+  }
+  const MipsRegisterInfo *getRegisterInfo()  const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const MipsTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const MipsSelectionDAGInfo* getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  /// \brief Reset the subtarget for the Mips target.
+  void resetSubtarget(MachineFunction *MF);
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+};
+
+/// MipsebTargetMachine - Mips32/64 big endian target machine.
+///
+class MipsebTargetMachine : public MipsTargetMachine {
+  virtual void anchor();
+public:
+  MipsebTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+};
+
+/// MipselTargetMachine - Mips32/64 little endian target machine.
+///
+class MipselTargetMachine : public MipsTargetMachine {
+  virtual void anchor();
+public:
+  MipselTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
new file mode 100644
index 0000000..13f9408
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -0,0 +1,103 @@
+//===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetObjectFile.h"
+#include "MipsSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+            cl::desc("Small data and bss section threshold size (default=8)"),
+            cl::init(8));
+
+void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+
+  SmallDataSection =
+    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+
+  SmallBSSSection =
+    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                               SectionKind::getBSS());
+}
+
+// A address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section must be addressed using
+// gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+  return Size > 0 && Size <= SSThreshold;
+}
+
+bool MipsTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
+                                                const TargetMachine &TM) const {
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return false;
+
+  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global address should be
+/// placed into small data/bss section.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                       SectionKind Kind) const {
+
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+  // Return if small section is not available.
+  if (!Subtarget.useSmallSection())
+    return false;
+
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+  if (!GVA)
+    return false;
+
+  // We can only do this for datarel or BSS objects for now.
+  if (!Kind.isBSS() && !Kind.isDataRel())
+    return false;
+
+  // If this is a internal constant string, there is a special
+  // section for it, but not in small data/bss.
+  if (Kind.isMergeable1ByteCString())
+    return false;
+
+  Type *Ty = GV->getType()->getElementType();
+  return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+}
+
+
+
+const MCSection *MipsTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler &Mang, const TargetMachine &TM) const {
+  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
+  // sections?
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
new file mode 100644
index 0000000..2bf5a75
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -0,0 +1,38 @@
+//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+#define LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+  class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
+    const MCSection *SmallDataSection;
+    const MCSection *SmallBSSSection;
+  public:
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+
+    /// IsGlobalInSmallSection - Return true if this global address should be
+    /// placed into small data/bss section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM, SectionKind Kind)const;
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM) const;
+
+    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                        SectionKind Kind, Mangler &Mang,
+                                        const TargetMachine &TM) const override;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
new file mode 100644
index 0000000..99f7d4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -0,0 +1,201 @@
+//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETSTREAMER_H
+#define MIPSTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+
+namespace llvm {
+
+struct MipsABIFlagsSection;
+
+class MipsTargetStreamer : public MCTargetStreamer {
+public:
+  MipsTargetStreamer(MCStreamer &S);
+  virtual void emitDirectiveSetMicroMips();
+  virtual void emitDirectiveSetNoMicroMips();
+  virtual void emitDirectiveSetMips16();
+  virtual void emitDirectiveSetNoMips16();
+
+  virtual void emitDirectiveSetReorder();
+  virtual void emitDirectiveSetNoReorder();
+  virtual void emitDirectiveSetMacro();
+  virtual void emitDirectiveSetNoMacro();
+  virtual void emitDirectiveSetAt();
+  virtual void emitDirectiveSetNoAt();
+  virtual void emitDirectiveEnd(StringRef Name);
+
+  virtual void emitDirectiveEnt(const MCSymbol &Symbol);
+  virtual void emitDirectiveAbiCalls();
+  virtual void emitDirectiveNaN2008();
+  virtual void emitDirectiveNaNLegacy();
+  virtual void emitDirectiveOptionPic0();
+  virtual void emitDirectiveOptionPic2();
+  virtual void emitFrame(unsigned StackReg, unsigned StackSize,
+                         unsigned ReturnReg);
+  virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
+  virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
+
+  virtual void emitDirectiveSetMips32R2();
+  virtual void emitDirectiveSetMips64();
+  virtual void emitDirectiveSetMips64R2();
+  virtual void emitDirectiveSetDsp();
+
+  // PIC support
+  virtual void emitDirectiveCpload(unsigned RegNo);
+  virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                                    const MCSymbol &Sym, bool IsReg);
+
+  /// Emit a '.module fp=value' directive using the given values.
+  /// Updates the .MIPS.abiflags section
+  virtual void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value,
+                                     bool Is32BitABI) {
+    ABIFlagsSection.setFpABI(Value, Is32BitABI);
+  }
+
+  /// Emit a '.module fp=value' directive using the current values of the
+  /// .MIPS.abiflags section.
+  void emitDirectiveModuleFP() {
+    emitDirectiveModuleFP(ABIFlagsSection.getFpABI(),
+                          ABIFlagsSection.Is32BitABI);
+  }
+
+  virtual void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI);
+  virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value){};
+  virtual void emitMipsAbiFlags(){};
+  void setCanHaveModuleDir(bool Can) { canHaveModuleDirective = Can; }
+  bool getCanHaveModuleDir() { return canHaveModuleDirective; }
+
+  // This method enables template classes to set internal abi flags
+  // structure values.
+  template <class PredicateLibrary>
+  void updateABIInfo(const PredicateLibrary &P) {
+    ABIFlagsSection.setAllFromPredicates(P);
+  }
+
+  MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; }
+
+protected:
+  MipsABIFlagsSection ABIFlagsSection;
+
+private:
+  bool canHaveModuleDirective;
+};
+
+// This part is for ascii assembly output
+class MipsTargetAsmStreamer : public MipsTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  void emitDirectiveSetMicroMips() override;
+  void emitDirectiveSetNoMicroMips() override;
+  void emitDirectiveSetMips16() override;
+  void emitDirectiveSetNoMips16() override;
+
+  void emitDirectiveSetReorder() override;
+  void emitDirectiveSetNoReorder() override;
+  void emitDirectiveSetMacro() override;
+  void emitDirectiveSetNoMacro() override;
+  void emitDirectiveSetAt() override;
+  void emitDirectiveSetNoAt() override;
+  void emitDirectiveEnd(StringRef Name) override;
+
+  void emitDirectiveEnt(const MCSymbol &Symbol) override;
+  void emitDirectiveAbiCalls() override;
+  void emitDirectiveNaN2008() override;
+  void emitDirectiveNaNLegacy() override;
+  void emitDirectiveOptionPic0() override;
+  void emitDirectiveOptionPic2() override;
+  void emitFrame(unsigned StackReg, unsigned StackSize,
+                 unsigned ReturnReg) override;
+  void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+  void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+  void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips64() override;
+  void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetDsp() override;
+
+  // PIC support
+  virtual void emitDirectiveCpload(unsigned RegNo);
+  void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                            const MCSymbol &Sym, bool IsReg) override;
+
+  // ABI Flags
+  void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value,
+                             bool Is32BitABI) override;
+  void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override;
+  void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
+  void emitMipsAbiFlags() override;
+};
+
+// This part is for ELF object output
+class MipsTargetELFStreamer : public MipsTargetStreamer {
+  bool MicroMipsEnabled;
+  const MCSubtargetInfo &STI;
+  bool Pic;
+
+public:
+  bool isMicroMipsEnabled() const { return MicroMipsEnabled; }
+  MCELFStreamer &getStreamer();
+  MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+  void emitLabel(MCSymbol *Symbol) override;
+  void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+  void finish() override;
+
+  void emitDirectiveSetMicroMips() override;
+  void emitDirectiveSetNoMicroMips() override;
+  void emitDirectiveSetMips16() override;
+  void emitDirectiveSetNoMips16() override;
+
+  void emitDirectiveSetReorder() override;
+  void emitDirectiveSetNoReorder() override;
+  void emitDirectiveSetMacro() override;
+  void emitDirectiveSetNoMacro() override;
+  void emitDirectiveSetAt() override;
+  void emitDirectiveSetNoAt() override;
+  void emitDirectiveEnd(StringRef Name) override;
+
+  void emitDirectiveEnt(const MCSymbol &Symbol) override;
+  void emitDirectiveAbiCalls() override;
+  void emitDirectiveNaN2008() override;
+  void emitDirectiveNaNLegacy() override;
+  void emitDirectiveOptionPic0() override;
+  void emitDirectiveOptionPic2() override;
+  void emitFrame(unsigned StackReg, unsigned StackSize,
+                 unsigned ReturnReg) override;
+  void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+  void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+  void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips64() override;
+  void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetDsp() override;
+
+  // PIC support
+  virtual void emitDirectiveCpload(unsigned RegNo);
+  void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                            const MCSymbol &Sym, bool IsReg) override;
+
+  // ABI Flags
+  void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override;
+  void emitMipsAbiFlags() override;
+
+protected:
+  bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
+  bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
+  bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
+};
+}
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
new file mode 100644
index 0000000..3615c14
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -0,0 +1,31 @@
+//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMipsTarget, llvm::TheMipselTarget;
+Target llvm::TheMips64Target, llvm::TheMips64elTarget;
+
+extern "C" void LLVMInitializeMipsTargetInfo() {
+  RegisterTarget<Triple::mips,
+        /*HasJIT=*/true> X(TheMipsTarget, "mips", "Mips");
+
+  RegisterTarget<Triple::mipsel,
+        /*HasJIT=*/true> Y(TheMipselTarget, "mipsel", "Mipsel");
+
+  RegisterTarget<Triple::mips64,
+        /*HasJIT=*/false> A(TheMips64Target, "mips64", "Mips64 [experimental]");
+
+  RegisterTarget<Triple::mips64el,
+        /*HasJIT=*/false> B(TheMips64elTarget,
+                            "mips64el", "Mips64el [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
new file mode 100644
index 0000000..80b2f62
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -0,0 +1,290 @@
+//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Print MCInst instructions to .ptx format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "NVPTXGenAsmWriter.inc"
+
+
+NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                                   const MCRegisterInfo &MRI,
+                                   const MCSubtargetInfo &STI)
+  : MCInstPrinter(MAI, MII, MRI) {
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // Decode the virtual register
+  // Must be kept in sync with NVPTXAsmPrinter::encodeVirtualRegister
+  unsigned RCId = (RegNo >> 28);
+  switch (RCId) {
+  default: report_fatal_error("Bad virtual register encoding");
+  case 0:
+    // This is actually a physical register, so defer to the autogenerated
+    // register printer
+    OS << getRegisterName(RegNo);
+    return;
+  case 1:
+    OS << "%p";
+    break;
+  case 2:
+    OS << "%rs";
+    break;
+  case 3:
+    OS << "%r";
+    break;
+  case 4:
+    OS << "%rd";
+    break;
+  case 5:
+    OS << "%f";
+    break;
+  case 6:
+    OS << "%fd";
+    break;
+  }
+
+  unsigned VReg = RegNo & 0x0FFFFFFF;
+  OS << VReg;
+}
+
+void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annot) {
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+}
+
+void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
+  } else {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "sat") == 0) {
+    // SAT flag
+    if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
+      O << ".sat";
+  } else if (strcmp(Modifier, "base") == 0) {
+    // Default operand
+    switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCvtMode::NONE:
+      break;
+    case NVPTX::PTXCvtMode::RNI:
+      O << ".rni";
+      break;
+    case NVPTX::PTXCvtMode::RZI:
+      O << ".rzi";
+      break;
+    case NVPTX::PTXCvtMode::RMI:
+      O << ".rmi";
+      break;
+    case NVPTX::PTXCvtMode::RPI:
+      O << ".rpi";
+      break;
+    case NVPTX::PTXCvtMode::RN:
+      O << ".rn";
+      break;
+    case NVPTX::PTXCvtMode::RZ:
+      O << ".rz";
+      break;
+    case NVPTX::PTXCvtMode::RM:
+      O << ".rm";
+      break;
+    case NVPTX::PTXCvtMode::RP:
+      O << ".rp";
+      break;
+    }
+  } else {
+    llvm_unreachable("Invalid conversion modifier");
+  }
+}
+
+void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "base") == 0) {
+    switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCmpMode::EQ:
+      O << ".eq";
+      break;
+    case NVPTX::PTXCmpMode::NE:
+      O << ".ne";
+      break;
+    case NVPTX::PTXCmpMode::LT:
+      O << ".lt";
+      break;
+    case NVPTX::PTXCmpMode::LE:
+      O << ".le";
+      break;
+    case NVPTX::PTXCmpMode::GT:
+      O << ".gt";
+      break;
+    case NVPTX::PTXCmpMode::GE:
+      O << ".ge";
+      break;
+    case NVPTX::PTXCmpMode::LO:
+      O << ".lo";
+      break;
+    case NVPTX::PTXCmpMode::LS:
+      O << ".ls";
+      break;
+    case NVPTX::PTXCmpMode::HI:
+      O << ".hi";
+      break;
+    case NVPTX::PTXCmpMode::HS:
+      O << ".hs";
+      break;
+    case NVPTX::PTXCmpMode::EQU:
+      O << ".equ";
+      break;
+    case NVPTX::PTXCmpMode::NEU:
+      O << ".neu";
+      break;
+    case NVPTX::PTXCmpMode::LTU:
+      O << ".ltu";
+      break;
+    case NVPTX::PTXCmpMode::LEU:
+      O << ".leu";
+      break;
+    case NVPTX::PTXCmpMode::GTU:
+      O << ".gtu";
+      break;
+    case NVPTX::PTXCmpMode::GEU:
+      O << ".geu";
+      break;
+    case NVPTX::PTXCmpMode::NUM:
+      O << ".num";
+      break;
+    case NVPTX::PTXCmpMode::NotANumber:
+      O << ".nan";
+      break;
+    }
+  } else {
+    llvm_unreachable("Empty Modifier");
+  }
+}
+
+void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
+                                     raw_ostream &O, const char *Modifier) {
+  if (Modifier) {
+    const MCOperand &MO = MI->getOperand(OpNum);
+    int Imm = (int) MO.getImm();
+    if (!strcmp(Modifier, "volatile")) {
+      if (Imm)
+        O << ".volatile";
+    } else if (!strcmp(Modifier, "addsp")) {
+      switch (Imm) {
+      case NVPTX::PTXLdStInstCode::GLOBAL:
+        O << ".global";
+        break;
+      case NVPTX::PTXLdStInstCode::SHARED:
+        O << ".shared";
+        break;
+      case NVPTX::PTXLdStInstCode::LOCAL:
+        O << ".local";
+        break;
+      case NVPTX::PTXLdStInstCode::PARAM:
+        O << ".param";
+        break;
+      case NVPTX::PTXLdStInstCode::CONSTANT:
+        O << ".const";
+        break;
+      case NVPTX::PTXLdStInstCode::GENERIC:
+        break;
+      default:
+        llvm_unreachable("Wrong Address Space");
+      }
+    } else if (!strcmp(Modifier, "sign")) {
+      if (Imm == NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else
+        O << "f";
+    } else if (!strcmp(Modifier, "vec")) {
+      if (Imm == NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm == NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    } else
+      llvm_unreachable("Unknown Modifier");
+  } else
+    llvm_unreachable("Empty Modifier");
+}
+
+void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  printOperand(MI, OpNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, OpNum + 1, O);
+  } else {
+    if (MI->getOperand(OpNum + 1).isImm() &&
+        MI->getOperand(OpNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, OpNum + 1, O);
+  }
+}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+  const MCExpr *Expr = Op.getExpr();
+  const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+  O << Sym.getName();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
new file mode 100644
index 0000000..1fb3c57
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -0,0 +1,53 @@
+//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an NVPTX MCInst to .ptx file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_INST_PRINTER_H
+#define NVPTX_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCOperand;
+class MCSubtargetInfo;
+
+class NVPTXInstPrinter : public MCInstPrinter {
+public:
+  NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  // End
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printLdStCode(const MCInst *MI, int OpNum,
+                     raw_ostream &O, const char *Modifier = nullptr);
+  void printMemOperand(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = nullptr);
+  void printProtoIdent(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = nullptr);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
new file mode 100644
index 0000000..16ec19c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -0,0 +1,100 @@
+//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the NVPTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXBASEINFO_H
+#define NVPTXBASEINFO_H
+
+namespace llvm {
+
+enum AddressSpace {
+  ADDRESS_SPACE_GENERIC = 0,
+  ADDRESS_SPACE_GLOBAL = 1,
+  ADDRESS_SPACE_SHARED = 3,
+  ADDRESS_SPACE_CONST = 4,
+  ADDRESS_SPACE_LOCAL = 5,
+
+  // NVVM Internal
+  ADDRESS_SPACE_PARAM = 101
+};
+
+enum PropertyAnnotation {
+  PROPERTY_MAXNTID_X = 0,
+  PROPERTY_MAXNTID_Y,
+  PROPERTY_MAXNTID_Z,
+  PROPERTY_REQNTID_X,
+  PROPERTY_REQNTID_Y,
+  PROPERTY_REQNTID_Z,
+  PROPERTY_MINNCTAPERSM,
+  PROPERTY_ISTEXTURE,
+  PROPERTY_ISSURFACE,
+  PROPERTY_ISSAMPLER,
+  PROPERTY_ISREADONLY_IMAGE_PARAM,
+  PROPERTY_ISWRITEONLY_IMAGE_PARAM,
+  PROPERTY_ISREADWRITE_IMAGE_PARAM,
+  PROPERTY_ISKERNEL_FUNCTION,
+  PROPERTY_ALIGN,
+  PROPERTY_MANAGED,
+
+  // last property
+  PROPERTY_LAST
+};
+
+const unsigned AnnotationNameLen = 9; // length of each annotation name
+const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
+  "maxntidx",                         // PROPERTY_MAXNTID_X
+  "maxntidy",                         // PROPERTY_MAXNTID_Y
+  "maxntidz",                         // PROPERTY_MAXNTID_Z
+  "reqntidx",                         // PROPERTY_REQNTID_X
+  "reqntidy",                         // PROPERTY_REQNTID_Y
+  "reqntidz",                         // PROPERTY_REQNTID_Z
+  "minctasm",                         // PROPERTY_MINNCTAPERSM
+  "texture",                          // PROPERTY_ISTEXTURE
+  "surface",                          // PROPERTY_ISSURFACE
+  "sampler",                          // PROPERTY_ISSAMPLER
+  "rdoimage",                         // PROPERTY_ISREADONLY_IMAGE_PARAM
+  "wroimage",                         // PROPERTY_ISWRITEONLY_IMAGE_PARAM
+  "rdwrimage",                        // PROPERTY_ISREADWRITE_IMAGE_PARAM
+  "kernel",                           // PROPERTY_ISKERNEL_FUNCTION
+  "align",                            // PROPERTY_ALIGN
+  "managed",                          // PROPERTY_MANAGED
+
+              // last property
+  "proplast", // PROPERTY_LAST
+};
+
+// name of named metadata used for global annotations
+#if defined(__GNUC__)
+// As this is declared to be static but some of the .cpp files that
+// include NVVM.h do not use this array, gcc gives a warning when
+// compiling those .cpp files, hence __attribute__((unused)).
+__attribute__((unused))
+#endif
+    static const char *NamedMDForAnnotations = "nvvm.annotations";
+
+namespace NVPTXII {
+enum {
+  // These must be kept in sync with TSFlags in NVPTXInstrFormats.td
+  IsTexFlag = 0x80,
+  IsSuldMask = 0x300,
+  IsSuldShift = 8,
+  IsSustFlag = 0x400,
+  IsSurfTexQueryFlag = 0x800,
+  IsTexModeUnifiedFlag = 0x1000
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
new file mode 100644
index 0000000..366341a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -0,0 +1,56 @@
+//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the NVPTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// -debug-compile - Command line option to inform opt and llc passes to
+// compile for debugging
+static cl::opt<bool> CompileForDebugging("debug-compile",
+                                         cl::desc("Compile for debugging"),
+                                         cl::Hidden, cl::init(false));
+
+void NVPTXMCAsmInfo::anchor() {}
+
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
+  Triple TheTriple(TT);
+  if (TheTriple.getArch() == Triple::nvptx64) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  CommentString = "//";
+
+  HasSetDirective = false;
+
+  HasSingleParameterDotFile = false;
+
+  InlineAsmStart = " inline asm";
+  InlineAsmEnd = " inline asm";
+
+  SupportsDebugInformation = CompileForDebugging;
+  HasDotTypeDotSizeDirective = false;
+
+  Data8bitsDirective = " .b8 ";
+  Data16bitsDirective = " .b16 ";
+  Data32bitsDirective = " .b32 ";
+  Data64bitsDirective = " .b64 ";
+  ZeroDirective = " .b8";
+  AsciiDirective = " .b8";
+  AscizDirective = " .b8";
+
+  // @TODO: Can we just disable this?
+  GlobalDirective = "\t// .globl\t";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
new file mode 100644
index 0000000..7d1633f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVPTXMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_MCASM_INFO_H
+#define NVPTX_MCASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+class StringRef;
+
+class NVPTXMCAsmInfo : public MCAsmInfo {
+  virtual void anchor();
+public:
+  explicit NVPTXMCAsmInfo(const StringRef &TT);
+};
+} // namespace llvm
+
+#endif // NVPTX_MCASM_INFO_H
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
new file mode 100644
index 0000000..158ca90
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -0,0 +1,105 @@
+//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCTargetDesc.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "NVPTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+static MCInstrInfo *createNVPTXMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitNVPTXMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  // PTX does not have a return address register.
+  InitNVPTXMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *
+createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitNVPTXMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createNVPTXMCCodeGenInfo(
+    StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new NVPTXInstPrinter(MAI, MII, MRI, STI);
+  return nullptr;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32);
+  RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32,
+                                        createNVPTXMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64,
+                                        createNVPTXMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32,
+                                    createNVPTXMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64,
+                                    createNVPTXMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32,
+                                          createNVPTXMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64,
+                                          createNVPTXMCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget32,
+                                        createNVPTXMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget64,
+                                        createNVPTXMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
new file mode 100644
index 0000000..af95c76
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -0,0 +1,36 @@
+//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXMCTARGETDESC_H
+#define NVPTXMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+} // End llvm namespace
+
+// Defines symbolic names for PTX registers.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the PTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "NVPTXGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
new file mode 100644
index 0000000..f9fb059
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
@@ -0,0 +1,48 @@
+//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The strings allocated from a managed string pool are owned by the string
+// pool and will be deleted together with the managed string pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MANAGED_STRING_H
+#define LLVM_SUPPORT_MANAGED_STRING_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// ManagedStringPool - The strings allocated from a managed string pool are
+/// owned by the string pool and will be deleted together with the managed
+/// string pool.
+class ManagedStringPool {
+  SmallVector<std::string *, 8> Pool;
+
+public:
+  ManagedStringPool() {}
+  ~ManagedStringPool() {
+    SmallVectorImpl<std::string *>::iterator Current = Pool.begin();
+    while (Current != Pool.end()) {
+      delete *Current;
+      Current++;
+    }
+  }
+
+  std::string *getManagedString(const char *S) {
+    std::string *Str = new std::string(S);
+    Pool.push_back(Str);
+    return Str;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
new file mode 100644
index 0000000..e74c808
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -0,0 +1,194 @@
+//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM NVPTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_H
+#define LLVM_TARGET_NVPTX_H
+
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iosfwd>
+
+namespace llvm {
+class NVPTXTargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+class formatted_raw_ostream;
+
+namespace NVPTXCC {
+enum CondCodes {
+  EQ,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE
+};
+}
+
+inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
+  switch (CC) {
+  case NVPTXCC::NE:
+    return "ne";
+  case NVPTXCC::EQ:
+    return "eq";
+  case NVPTXCC::LT:
+    return "lt";
+  case NVPTXCC::LE:
+    return "le";
+  case NVPTXCC::GT:
+    return "gt";
+  case NVPTXCC::GE:
+    return "ge";
+  }
+  llvm_unreachable("Unknown condition code");
+}
+
+FunctionPass *
+createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
+ModulePass *createNVPTXAssignValidGlobalNamesPass();
+ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
+ModulePass *createNVVMReflectPass();
+ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
+FunctionPass *createNVPTXImageOptimizerPass();
+
+bool isImageOrSamplerVal(const Value *, const Module *);
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+namespace NVPTX {
+enum DrvInterface {
+  NVCL,
+  CUDA
+};
+
+// A field inside TSFlags needs a shift and a mask. The usage is
+// always as follows :
+// ((TSFlags & fieldMask) >> fieldShift)
+// The enum keeps the mask, the shift, and all valid values of the
+// field in one place.
+enum VecInstType {
+  VecInstTypeShift = 0,
+  VecInstTypeMask = 0xF,
+
+  VecNOP = 0,
+  VecLoad = 1,
+  VecStore = 2,
+  VecBuild = 3,
+  VecShuffle = 4,
+  VecExtract = 5,
+  VecInsert = 6,
+  VecDest = 7,
+  VecOther = 15
+};
+
+enum SimpleMove {
+  SimpleMoveMask = 0x10,
+  SimpleMoveShift = 4
+};
+enum LoadStore {
+  isLoadMask = 0x20,
+  isLoadShift = 5,
+  isStoreMask = 0x40,
+  isStoreShift = 6
+};
+
+namespace PTXLdStInstCode {
+enum AddressSpace {
+  GENERIC = 0,
+  GLOBAL = 1,
+  CONSTANT = 2,
+  SHARED = 3,
+  PARAM = 4,
+  LOCAL = 5
+};
+enum FromType {
+  Unsigned = 0,
+  Signed,
+  Float
+};
+enum VecType {
+  Scalar = 1,
+  V2 = 2,
+  V4 = 4
+};
+}
+
+/// PTXCvtMode - Conversion code enumeration
+namespace PTXCvtMode {
+enum CvtMode {
+  NONE = 0,
+  RNI,
+  RZI,
+  RMI,
+  RPI,
+  RN,
+  RZ,
+  RM,
+  RP,
+
+  BASE_MASK = 0x0F,
+  FTZ_FLAG = 0x10,
+  SAT_FLAG = 0x20
+};
+}
+
+/// PTXCmpMode - Comparison mode enumeration
+namespace PTXCmpMode {
+enum CmpMode {
+  EQ = 0,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE,
+  LO,
+  LS,
+  HI,
+  HS,
+  EQU,
+  NEU,
+  LTU,
+  LEU,
+  GTU,
+  GEU,
+  NUM,
+  // NAN is a MACRO
+  NotANumber,
+
+  BASE_MASK = 0xFF,
+  FTZ_FLAG = 0x100
+};
+}
+}
+} // end namespace llvm;
+
+// Defines symbolic names for NVPTX registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the NVPTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
new file mode 100644
index 0000000..93fabf6
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
@@ -0,0 +1,69 @@
+//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the NVPTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of explicit feature table.
+// - Need at least one feature to avoid generating zero sized array by
+//   TableGen in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+
+// SM Versions
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+                            "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+                            "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+                            "Target SM 3.0">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+                            "Target SM 3.5">;
+def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50",
+                            "Target SM 5.0">;
+
+// PTX Versions
+def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
+                             "Use PTX version 3.0">;
+def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
+                             "Use PTX version 3.1">;
+def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
+                             "Use PTX version 3.2">;
+def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40",
+                             "Use PTX version 4.0">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_50", [SM50]>;
+
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+  let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 0000000..1f37696
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,46 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAllocaHoisting.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+  bool functionModified = false;
+  Function::iterator I = function.begin();
+  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+
+  for (Function::iterator E = function.end(); I != E; ++I) {
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+        allocaInst->moveBefore(firstTerminatorInst);
+        functionModified = true;
+      }
+    }
+  }
+
+  return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 1;
+static RegisterPass<NVPTXAllocaHoisting>
+X("alloca-hoisting", "Hoisting alloca instructions in non-entry "
+                     "blocks to the entry block");
+
+FunctionPass *createAllocaHoisting() { return new NVPTXAllocaHoisting(); }
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 0000000..5b61068
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,50 @@
+//===-- AllocaHoisting.h - Hosist allocas to the entry block ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_ALLOCA_HOISTING_H_
+#define NVPTX_ALLOCA_HOISTING_H_
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class FunctionPass;
+class Function;
+
+// Hoisting the alloca instructions in the non-entry blocks to the entry
+// block.
+class NVPTXAllocaHoisting : public FunctionPass {
+public:
+  static char ID; // Pass ID
+  NVPTXAllocaHoisting() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DataLayoutPass>();
+    AU.addPreserved("stack-protector");
+    AU.addPreserved<MachineFunctionAnalysis>();
+  }
+
+  const char *getPassName() const override {
+    return "NVPTX specific alloca hoisting";
+  }
+
+  bool runOnFunction(Function &function) override;
+};
+
+extern FunctionPass *createAllocaHoisting();
+
+} // end namespace llvm
+
+#endif // NVPTX_ALLOCA_HOISTING_H_
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
new file mode 100644
index 0000000..187b88c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -0,0 +1,2292 @@
+//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAsmPrinter.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXMCExpr.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "cl_common_defines.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <sstream>
+using namespace llvm;
+
+#define DEPOTNAME "__local_depot"
+
+static cl::opt<bool>
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
+                cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
+                cl::init(true));
+
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
+              cl::desc("NVPTX Specific: Emit source line in ptx file"),
+              cl::init(false));
+
+namespace {
+/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
+/// depends.
+void DiscoverDependentGlobals(const Value *V,
+                              DenseSet<const GlobalVariable *> &Globals) {
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    Globals.insert(GV);
+  else {
+    if (const User *U = dyn_cast<User>(V)) {
+      for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
+        DiscoverDependentGlobals(U->getOperand(i), Globals);
+      }
+    }
+  }
+}
+
+/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
+/// instances to be emitted, but only after any dependents have been added
+/// first.
+void VisitGlobalVariableForEmission(
+    const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
+    DenseSet<const GlobalVariable *> &Visited,
+    DenseSet<const GlobalVariable *> &Visiting) {
+  // Have we already visited this one?
+  if (Visited.count(GV))
+    return;
+
+  // Do we have a circular dependency?
+  if (Visiting.count(GV))
+    report_fatal_error("Circular dependency found in global variable set");
+
+  // Start visiting this global
+  Visiting.insert(GV);
+
+  // Make sure we visit all dependents first
+  DenseSet<const GlobalVariable *> Others;
+  for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
+    DiscoverDependentGlobals(GV->getOperand(i), Others);
+
+  for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(),
+                                                  E = Others.end();
+       I != E; ++I)
+    VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
+
+  // Now we can visit ourself
+  Order.push_back(GV);
+  Visited.insert(GV);
+  Visiting.erase(GV);
+}
+}
+
+// @TODO: This is a copy from AsmPrinter.cpp.  The function is static, so we
+// cannot just link to the existing version.
+/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
+///
+using namespace nvptx;
+const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
+  MCContext &Ctx = AP.OutContext;
+
+  if (CV->isNullValue() || isa<UndefValue>(CV))
+    return MCConstantExpr::Create(0, Ctx);
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+    return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
+
+  if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+  if (!CE)
+    llvm_unreachable("Unknown constant value to lower!");
+
+  switch (CE->getOpcode()) {
+  default:
+    // If the code isn't optimized, there may be outstanding folding
+    // opportunities. Attempt to fold the expression using DataLayout as a
+    // last resort before giving up.
+    if (Constant *C = ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+      if (C != CE)
+        return LowerConstant(C, AP);
+
+    // Otherwise report the problem to the user.
+    {
+      std::string S;
+      raw_string_ostream OS(S);
+      OS << "Unsupported expression in static initializer: ";
+      CE->printAsOperand(OS, /*PrintType=*/ false,
+                         !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+      report_fatal_error(OS.str());
+    }
+  case Instruction::AddrSpaceCast: {
+    // Strip any addrspace(1)->addrspace(0) addrspace casts. These will be
+    // handled by the generic() logic in the MCExpr printer
+    PointerType *DstTy            = cast<PointerType>(CE->getType());
+    PointerType *SrcTy            = cast<PointerType>(CE->getOperand(0)->getType());
+    if (SrcTy->getAddressSpace() == 1 && DstTy->getAddressSpace() == 0) {
+      return LowerConstant(cast<const Constant>(CE->getOperand(0)), AP);
+    }
+    std::string S;
+    raw_string_ostream OS(S);
+    OS << "Unsupported expression in static initializer: ";
+    CE->printAsOperand(OS, /*PrintType=*/ false,
+                       !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+    report_fatal_error(OS.str());
+  }
+  case Instruction::GetElementPtr: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Generate a symbolic expression for the byte address
+    APInt OffsetAI(TD.getPointerSizeInBits(), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+
+    const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
+    if (!OffsetAI)
+      return Base;
+
+    int64_t Offset = OffsetAI.getSExtValue();
+    return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+                                   Ctx);
+  }
+
+  case Instruction::Trunc:
+    // We emit the value and depend on the assembler to truncate the generated
+    // expression properly.  This is important for differences between
+    // blockaddress labels.  Since the two labels are in the same function, it
+    // is reasonable to treat their delta as a 32-bit value.
+  // FALL THROUGH.
+  case Instruction::BitCast:
+    return LowerConstant(CE->getOperand(0), AP);
+
+  case Instruction::IntToPtr: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Handle casts to pointers by changing them into casts to the appropriate
+    // integer type.  This promotes constant folding and simplifies this code.
+    Constant *Op = CE->getOperand(0);
+    Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+                                      false /*ZExt*/);
+    return LowerConstant(Op, AP);
+  }
+
+  case Instruction::PtrToInt: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Support only foldable casts to/from pointers that can be eliminated by
+    // changing the pointer to the appropriately sized integer type.
+    Constant *Op = CE->getOperand(0);
+    Type *Ty = CE->getType();
+
+    const MCExpr *OpExpr = LowerConstant(Op, AP);
+
+    // We can emit the pointer value into this slot if the slot is an
+    // integer slot equal to the size of the pointer.
+    if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+      return OpExpr;
+
+    // Otherwise the pointer is smaller than the resultant integer, mask off
+    // the high bits so we are sure to get a proper truncation if the input is
+    // a constant expr.
+    unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+    const MCExpr *MaskExpr =
+        MCConstantExpr::Create(~0ULL >> (64 - InBits), Ctx);
+    return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+  }
+
+    // The MC library also has a right-shift operator, but it isn't consistently
+  // signed or unsigned between different targets.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::Shl:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
+    const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
+    switch (CE->getOpcode()) {
+    default:
+      llvm_unreachable("Unknown binary operator constant cast expr");
+    case Instruction::Add:
+      return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+    case Instruction::Sub:
+      return MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+    case Instruction::Mul:
+      return MCBinaryExpr::CreateMul(LHS, RHS, Ctx);
+    case Instruction::SDiv:
+      return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx);
+    case Instruction::SRem:
+      return MCBinaryExpr::CreateMod(LHS, RHS, Ctx);
+    case Instruction::Shl:
+      return MCBinaryExpr::CreateShl(LHS, RHS, Ctx);
+    case Instruction::And:
+      return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx);
+    case Instruction::Or:
+      return MCBinaryExpr::CreateOr(LHS, RHS, Ctx);
+    case Instruction::Xor:
+      return MCBinaryExpr::CreateXor(LHS, RHS, Ctx);
+    }
+  }
+  }
+}
+
+void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
+  if (!EmitLineNumbers)
+    return;
+  if (ignoreLoc(MI))
+    return;
+
+  DebugLoc curLoc = MI.getDebugLoc();
+
+  if (prevDebugLoc.isUnknown() && curLoc.isUnknown())
+    return;
+
+  if (prevDebugLoc == curLoc)
+    return;
+
+  prevDebugLoc = curLoc;
+
+  if (curLoc.isUnknown())
+    return;
+
+  const MachineFunction *MF = MI.getParent()->getParent();
+  //const TargetMachine &TM = MF->getTarget();
+
+  const LLVMContext &ctx = MF->getFunction()->getContext();
+  DIScope Scope(curLoc.getScope(ctx));
+
+  assert((!Scope || Scope.isScope()) &&
+    "Scope of a DebugLoc should be null or a DIScope.");
+  if (!Scope)
+     return;
+
+  StringRef fileName(Scope.getFilename());
+  StringRef dirName(Scope.getDirectory());
+  SmallString<128> FullPathName = dirName;
+  if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
+    sys::path::append(FullPathName, fileName);
+    fileName = FullPathName.str();
+  }
+
+  if (filenameMap.find(fileName.str()) == filenameMap.end())
+    return;
+
+  // Emit the line from the source file.
+  if (InterleaveSrc)
+    this->emitSrcInText(fileName.str(), curLoc.getLine());
+
+  std::stringstream temp;
+  temp << "\t.loc " << filenameMap[fileName.str()] << " " << curLoc.getLine()
+       << " " << curLoc.getCol();
+  OutStreamer.EmitRawText(Twine(temp.str().c_str()));
+}
+
+void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SmallString<128> Str;
+  raw_svector_ostream OS(Str);
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+    emitLineNumberAsDotLoc(*MI);
+
+  MCInst Inst;
+  lowerToMCInst(MI, Inst);
+  EmitToStreamer(OutStreamer, Inst);
+}
+
+// Handle symbol backtracking for targets that do not support image handles
+bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
+                                           unsigned OpNo, MCOperand &MCOp) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    if (OpNo == 4 && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+    if (OpNo == 5 && MO.isImm() && !(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    unsigned VecSize =
+      1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    if (OpNo == VecSize && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+    // This is a surface store, so operand 0 is a surfref
+    if (OpNo == 0 && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+    // This is a query, so operand 1 is a surfref/texref
+    if (OpNo == 1 && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+
+  return false;
+}
+
+void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
+  // Ewwww
+  TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget());
+  NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM);
+  const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+  const char *Sym = MFI->getImageHandleSymbol(Index);
+  std::string *SymNamePtr =
+    nvTM.getManagedStrPool()->getManagedString(Sym);
+  MCOp = GetSymbolRef(OutContext.GetOrCreateSymbol(
+    StringRef(SymNamePtr->c_str())));
+}
+
+void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+  OutMI.setOpcode(MI->getOpcode());
+  const NVPTXSubtarget &ST = TM.getSubtarget<NVPTXSubtarget>();
+
+  // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+  if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+    const MachineOperand &MO = MI->getOperand(0);
+    OutMI.addOperand(GetSymbolRef(
+      OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName()))));
+    return;
+  }
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (!ST.hasImageHandles()) {
+      if (lowerImageHandleOperand(MI, i, MCOp)) {
+        OutMI.addOperand(MCOp);
+        continue;
+      }
+    }
+
+    if (lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
+
+bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                   MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    MCOp = MCOperand::CreateReg(encodeVirtualRegister(MO.getReg()));
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+        MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    const ConstantFP *Cnt = MO.getFPImm();
+    APFloat Val = Cnt->getValueAPF();
+
+    switch (Cnt->getType()->getTypeID()) {
+    default: report_fatal_error("Unsupported FP type"); break;
+    case Type::FloatTyID:
+      MCOp = MCOperand::CreateExpr(
+        NVPTXFloatMCExpr::CreateConstantFPSingle(Val, OutContext));
+      break;
+    case Type::DoubleTyID:
+      MCOp = MCOperand::CreateExpr(
+        NVPTXFloatMCExpr::CreateConstantFPDouble(Val, OutContext));
+      break;
+    }
+    break;
+  }
+  }
+  return true;
+}
+
+unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+    DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC];
+    unsigned RegNum = RegMap[Reg];
+
+    // Encode the register class in the upper 4 bits
+    // Must be kept in sync with NVPTXInstPrinter::printRegName
+    unsigned Ret = 0;
+    if (RC == &NVPTX::Int1RegsRegClass) {
+      Ret = (1 << 28);
+    } else if (RC == &NVPTX::Int16RegsRegClass) {
+      Ret = (2 << 28);
+    } else if (RC == &NVPTX::Int32RegsRegClass) {
+      Ret = (3 << 28);
+    } else if (RC == &NVPTX::Int64RegsRegClass) {
+      Ret = (4 << 28);
+    } else if (RC == &NVPTX::Float32RegsRegClass) {
+      Ret = (5 << 28);
+    } else if (RC == &NVPTX::Float64RegsRegClass) {
+      Ret = (6 << 28);
+    } else {
+      report_fatal_error("Bad register class");
+    }
+
+    // Insert the vreg number
+    Ret |= (RegNum & 0x0FFFFFFF);
+    return Ret;
+  } else {
+    // Some special-use registers are actually physical registers.
+    // Encode this as the register class ID of 0 and the real register ID.
+    return Reg & 0x0FFFFFFF;
+  }
+}
+
+MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
+  const MCExpr *Expr;
+  Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                 OutContext);
+  return MCOperand::CreateExpr(Expr);
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
+  const DataLayout *TD = TM.getDataLayout();
+  const TargetLowering *TLI = TM.getTargetLowering();
+
+  Type *Ty = F->getReturnType();
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return;
+
+  O << " (";
+
+  if (isABI) {
+    if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
+      unsigned size = 0;
+      if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(Ty->isFloatingPointTy() && "Floating point type expected here");
+        size = Ty->getPrimitiveSizeInBits();
+      }
+
+      O << ".param .b" << size << " func_retval0";
+    } else if (isa<PointerType>(Ty)) {
+      O << ".param .b" << TLI->getPointerTy().getSizeInBits()
+        << " func_retval0";
+    } else {
+      if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
+        unsigned totalsz = TD->getTypeAllocSize(Ty);
+        unsigned retAlignment = 0;
+        if (!llvm::getAlign(*F, 0, retAlignment))
+          retAlignment = TD->getABITypeAlignment(Ty);
+        O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
+          << "]";
+      } else
+        assert(false && "Unknown return type");
+    }
+  } else {
+    SmallVector<EVT, 16> vtparts;
+    ComputeValueVTs(*TLI, Ty, vtparts);
+    unsigned idx = 0;
+    for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+      unsigned elems = 1;
+      EVT elemtype = vtparts[i];
+      if (vtparts[i].isVector()) {
+        elems = vtparts[i].getVectorNumElements();
+        elemtype = vtparts[i].getVectorElementType();
+      }
+
+      for (unsigned j = 0, je = elems; j != je; ++j) {
+        unsigned sz = elemtype.getSizeInBits();
+        if (elemtype.isInteger() && (sz < 32))
+          sz = 32;
+        O << ".reg .b" << sz << " func_retval" << idx;
+        if (j < je - 1)
+          O << ", ";
+        ++idx;
+      }
+      if (i < e - 1)
+        O << ", ";
+    }
+  }
+  O << ") ";
+  return;
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
+                                        raw_ostream &O) {
+  const Function *F = MF.getFunction();
+  printReturnValStr(F, O);
+}
+
+void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  if (!GlobalsEmitted) {
+    emitGlobals(*MF->getFunction()->getParent());
+    GlobalsEmitted = true;
+  }
+  
+  // Set up
+  MRI = &MF->getRegInfo();
+  F = MF->getFunction();
+  emitLinkageDirective(F, O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else {
+    O << ".func ";
+    printReturnValStr(*MF, O);
+  }
+
+  O << *CurrentFnSym;
+
+  emitFunctionParamList(*MF, O);
+
+  if (llvm::isKernelFunction(*F))
+    emitKernelFunctionDirectives(*F, O);
+
+  OutStreamer.EmitRawText(O.str());
+
+  prevDebugLoc = DebugLoc();
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+  VRegMapping.clear();
+  OutStreamer.EmitRawText(StringRef("{\n"));
+  setAndEmitFunctionVirtualRegisters(*MF);
+
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+  emitDemotedVars(MF->getFunction(), O);
+  OutStreamer.EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+  OutStreamer.EmitRawText(StringRef("}\n"));
+  VRegMapping.clear();
+}
+
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+  unsigned RegNo = MI->getOperand(0).getReg();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  if (TRI->isVirtualRegister(RegNo)) {
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           getVirtualRegisterName(RegNo));
+  } else {
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           TM.getRegisterInfo()->getName(RegNo));
+  }
+  OutStreamer.AddBlankLine();
+}
+
+void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
+                                                   raw_ostream &O) const {
+  // If the NVVM IR has some of reqntid* specified, then output
+  // the reqntid directive, and set the unspecified ones to 1.
+  // If none of reqntid* is specified, don't output reqntid directive.
+  unsigned reqntidx, reqntidy, reqntidz;
+  bool specified = false;
+  if (llvm::getReqNTIDx(F, reqntidx) == false)
+    reqntidx = 1;
+  else
+    specified = true;
+  if (llvm::getReqNTIDy(F, reqntidy) == false)
+    reqntidy = 1;
+  else
+    specified = true;
+  if (llvm::getReqNTIDz(F, reqntidz) == false)
+    reqntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz
+      << "\n";
+
+  // If the NVVM IR has some of maxntid* specified, then output
+  // the maxntid directive, and set the unspecified ones to 1.
+  // If none of maxntid* is specified, don't output maxntid directive.
+  unsigned maxntidx, maxntidy, maxntidz;
+  specified = false;
+  if (llvm::getMaxNTIDx(F, maxntidx) == false)
+    maxntidx = 1;
+  else
+    specified = true;
+  if (llvm::getMaxNTIDy(F, maxntidy) == false)
+    maxntidy = 1;
+  else
+    specified = true;
+  if (llvm::getMaxNTIDz(F, maxntidz) == false)
+    maxntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz
+      << "\n";
+
+  unsigned mincta;
+  if (llvm::getMinCTASm(F, mincta))
+    O << ".minnctapersm " << mincta << "\n";
+}
+
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+  std::string Name;
+  raw_string_ostream NameStr(Name);
+
+  VRegRCMap::const_iterator I = VRegMapping.find(RC);
+  assert(I != VRegMapping.end() && "Bad register class");
+  const DenseMap<unsigned, unsigned> &RegMap = I->second;
+
+  VRegMap::const_iterator VI = RegMap.find(Reg);
+  assert(VI != RegMap.end() && "Bad virtual register");
+  unsigned MappedVR = VI->second;
+
+  NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+
+  NameStr.flush();
+  return Name;
+}
+
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
+                                          raw_ostream &O) {
+  O << getVirtualRegisterName(vr);
+}
+
+void NVPTXAsmPrinter::printVecModifiedImmediate(
+    const MachineOperand &MO, const char *Modifier, raw_ostream &O) {
+  static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' };
+  int Imm = (int) MO.getImm();
+  if (0 == strcmp(Modifier, "vecelem"))
+    O << "_" << vecelem[Imm];
+  else if (0 == strcmp(Modifier, "vecv4comm1")) {
+    if ((Imm < 0) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4comm2")) {
+    if ((Imm < 4) || (Imm > 7))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 4];
+  } else if (0 == strcmp(Modifier, "vecv2comm1")) {
+    if ((Imm < 0) || (Imm > 1))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2comm2")) {
+    if ((Imm < 2) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 2];
+  } else
+    llvm_unreachable("Unknown Modifier on immediate operand");
+}
+
+
+
+void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
+
+  emitLinkageDirective(F, O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else
+    O << ".func ";
+  printReturnValStr(F, O);
+  O << *getSymbol(F) << "\n";
+  emitFunctionParamList(F, O);
+  O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C) {
+  if (!C)
+    return false;
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (GV->getName().str() == "llvm.used")
+      return false;
+    return true;
+  }
+
+  for (const User *U : C->users())
+    if (const Constant *C = dyn_cast<Constant>(U))
+      if (usedInGlobalVarDef(C))
+        return true;
+
+  return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
+  if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+    if (othergv->getName().str() == "llvm.used")
+      return true;
+  }
+
+  if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+    if (instr->getParent() && instr->getParent()->getParent()) {
+      const Function *curFunc = instr->getParent()->getParent();
+      if (oneFunc && (curFunc != oneFunc))
+        return false;
+      oneFunc = curFunc;
+      return true;
+    } else
+      return false;
+  }
+
+  if (const MDNode *md = dyn_cast<MDNode>(U))
+    if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
+                          (md->getName().str() == "llvm.dbg.sp")))
+      return true;
+
+  for (const User *UU : U->users())
+    if (usedInOneFunc(UU, oneFunc) == false)
+      return false;
+
+  return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are :
+ * 1. Is the global variable in shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced only in one function?
+ */
+static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
+  if (gv->hasInternalLinkage() == false)
+    return false;
+  const PointerType *Pty = gv->getType();
+  if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+    return false;
+
+  const Function *oneFunc = nullptr;
+
+  bool flag = usedInOneFunc(gv, oneFunc);
+  if (flag == false)
+    return false;
+  if (!oneFunc)
+    return false;
+  f = oneFunc;
+  return true;
+}
+
+static bool useFuncSeen(const Constant *C,
+                        llvm::DenseMap<const Function *, bool> &seenMap) {
+  for (const User *U : C->users()) {
+    if (const Constant *cu = dyn_cast<Constant>(U)) {
+      if (useFuncSeen(cu, seenMap))
+        return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      const BasicBlock *bb = I->getParent();
+      if (!bb)
+        continue;
+      const Function *caller = bb->getParent();
+      if (!caller)
+        continue;
+      if (seenMap.find(caller) != seenMap.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
+  llvm::DenseMap<const Function *, bool> seenMap;
+  for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+    const Function *F = FI;
+
+    if (F->isDeclaration()) {
+      if (F->use_empty())
+        continue;
+      if (F->getIntrinsicID())
+        continue;
+      emitDeclaration(F, O);
+      continue;
+    }
+    for (const User *U : F->users()) {
+      if (const Constant *C = dyn_cast<Constant>(U)) {
+        if (usedInGlobalVarDef(C)) {
+          // The use is in the initialization of a global variable
+          // that is a function pointer, so print a declaration
+          // for the original function
+          emitDeclaration(F, O);
+          break;
+        }
+        // Emit a declaration of this function if the function that
+        // uses this constant expr has already been seen.
+        if (useFuncSeen(C, seenMap)) {
+          emitDeclaration(F, O);
+          break;
+        }
+      }
+
+      if (!isa<Instruction>(U))
+        continue;
+      const Instruction *instr = cast<Instruction>(U);
+      const BasicBlock *bb = instr->getParent();
+      if (!bb)
+        continue;
+      const Function *caller = bb->getParent();
+      if (!caller)
+        continue;
+
+      // If a caller has already been seen, then the caller is
+      // appearing in the module before the callee. so print out
+      // a declaration for the callee.
+      if (seenMap.find(caller) != seenMap.end()) {
+        emitDeclaration(F, O);
+        break;
+      }
+    }
+    seenMap[F] = true;
+  }
+}
+
+void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+
+  unsigned i = 1;
+  for (DICompileUnit DIUnit : DbgFinder.compile_units()) {
+    StringRef Filename(DIUnit.getFilename());
+    StringRef Dirname(DIUnit.getDirectory());
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName.str();
+    }
+    if (filenameMap.find(Filename.str()) != filenameMap.end())
+      continue;
+    filenameMap[Filename.str()] = i;
+    OutStreamer.EmitDwarfFileDirective(i, "", Filename.str());
+    ++i;
+  }
+
+  for (DISubprogram SP : DbgFinder.subprograms()) {
+    StringRef Filename(SP.getFilename());
+    StringRef Dirname(SP.getDirectory());
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName.str();
+    }
+    if (filenameMap.find(Filename.str()) != filenameMap.end())
+      continue;
+    filenameMap[Filename.str()] = i;
+    ++i;
+  }
+}
+
+bool NVPTXAsmPrinter::doInitialization(Module &M) {
+
+  SmallString<128> Str1;
+  raw_svector_ostream OS1(Str1);
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  MMI->AnalyzeModule(M);
+
+  // We need to call the parent's one explicitly.
+  //bool Result = AsmPrinter::doInitialization(M);
+
+  // Initialize TargetLoweringObjectFile.
+  const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+      .Initialize(OutContext, TM);
+
+  Mang = new Mangler(TM.getDataLayout());
+
+  // Emit header before any dwarf directives are emitted below.
+  emitHeader(M, OS1);
+  OutStreamer.EmitRawText(OS1.str());
+
+  // Already commented out
+  //bool Result = AsmPrinter::doInitialization(M);
+
+  // Emit module-level inline asm if it exists.
+  if (!M.getModuleInlineAsm().empty()) {
+    OutStreamer.AddComment("Start of file scope inline assembly");
+    OutStreamer.AddBlankLine();
+    OutStreamer.EmitRawText(StringRef(M.getModuleInlineAsm()));
+    OutStreamer.AddBlankLine();
+    OutStreamer.AddComment("End of file scope inline assembly");
+    OutStreamer.AddBlankLine();
+  }
+
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+    recordAndEmitFilenames(M);
+
+  GlobalsEmitted = false;
+    
+  return false; // success
+}
+
+void NVPTXAsmPrinter::emitGlobals(const Module &M) {
+  SmallString<128> Str2;
+  raw_svector_ostream OS2(Str2);
+
+  emitDeclarations(M, OS2);
+
+  // As ptxas does not support forward references of globals, we need to first
+  // sort the list of module-level globals in def-use order. We visit each
+  // global variable in order, and ensure that we emit it *after* its dependent
+  // globals. We use a little extra memory maintaining both a set and a list to
+  // have fast searches while maintaining a strict ordering.
+  SmallVector<const GlobalVariable *, 8> Globals;
+  DenseSet<const GlobalVariable *> GVVisited;
+  DenseSet<const GlobalVariable *> GVVisiting;
+
+  // Visit each global variable, in order
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
+
+  assert(GVVisited.size() == M.getGlobalList().size() &&
+         "Missed a global variable");
+  assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
+
+  // Print out module-level global variables in proper order
+  for (unsigned i = 0, e = Globals.size(); i != e; ++i)
+    printModuleLevelGV(Globals[i], OS2);
+
+  OS2 << '\n';
+
+  OutStreamer.EmitRawText(OS2.str());
+}
+
+void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
+  O << "//\n";
+  O << "// Generated by LLVM NVPTX Back-End\n";
+  O << "//\n";
+  O << "\n";
+
+  unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+  O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
+
+  O << ".target ";
+  O << nvptxSubtarget.getTargetName();
+
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL)
+    O << ", texmode_independent";
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+    if (!nvptxSubtarget.hasDouble())
+      O << ", map_f64_to_f32";
+  }
+
+  if (MAI->doesSupportDebugInformation())
+    O << ", debug";
+
+  O << "\n";
+
+  O << ".address_size ";
+  if (nvptxSubtarget.is64Bit())
+    O << "64";
+  else
+    O << "32";
+  O << "\n";
+
+  O << "\n";
+}
+
+bool NVPTXAsmPrinter::doFinalization(Module &M) {
+
+  // If we did not emit any functions, then the global declarations have not
+  // yet been emitted.
+  if (!GlobalsEmitted) {
+    emitGlobals(M);
+    GlobalsEmitted = true;
+  }
+
+  // XXX Temproarily remove global variables so that doFinalization() will not
+  // emit them again (global variables are emitted at beginning).
+
+  Module::GlobalListType &global_list = M.getGlobalList();
+  int i, n = global_list.size();
+  GlobalVariable **gv_array = new GlobalVariable *[n];
+
+  // first, back-up GlobalVariable in gv_array
+  i = 0;
+  for (Module::global_iterator I = global_list.begin(), E = global_list.end();
+       I != E; ++I)
+    gv_array[i++] = &*I;
+
+  // second, empty global_list
+  while (!global_list.empty())
+    global_list.remove(global_list.begin());
+
+  // call doFinalization
+  bool ret = AsmPrinter::doFinalization(M);
+
+  // now we restore global variables
+  for (i = 0; i < n; i++)
+    global_list.insert(global_list.end(), gv_array[i]);
+
+  clearAnnotationCache(&M);
+
+  delete[] gv_array;
+  return ret;
+
+  //bool Result = AsmPrinter::doFinalization(M);
+  // Instead of calling the parents doFinalization, we may
+  // clone parents doFinalization and customize here.
+  // Currently, we if NVISA out the EmitGlobals() in
+  // parent's doFinalization, which is too intrusive.
+  //
+  // Same for the doInitialization.
+  //return Result;
+}
+
+// This function emits appropriate linkage directives for
+// functions and global variables.
+//
+// extern function declaration            -> .extern
+// extern function definition             -> .visible
+// external global variable with init     -> .visible
+// external without init                  -> .extern
+// appending                              -> not allowed, assert.
+// for any linkage other than
+// internal, private, linker_private,
+// linker_private_weak, linker_private_weak_def_auto,
+// we emit                                -> .weak.
+
+void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
+                                           raw_ostream &O) {
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+    if (V->hasExternalLinkage()) {
+      if (isa<GlobalVariable>(V)) {
+        const GlobalVariable *GVar = cast<GlobalVariable>(V);
+        if (GVar) {
+          if (GVar->hasInitializer())
+            O << ".visible ";
+          else
+            O << ".extern ";
+        }
+      } else if (V->isDeclaration())
+        O << ".extern ";
+      else
+        O << ".visible ";
+    } else if (V->hasAppendingLinkage()) {
+      std::string msg;
+      msg.append("Error: ");
+      msg.append("Symbol ");
+      if (V->hasName())
+        msg.append(V->getName().str());
+      msg.append("has unsupported appending linkage type");
+      llvm_unreachable(msg.c_str());
+    } else if (!V->hasInternalLinkage() &&
+               !V->hasPrivateLinkage()) {
+      O << ".weak ";
+    }
+  }
+}
+
+void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
+                                         raw_ostream &O,
+                                         bool processDemoted) {
+
+  // Skip meta data
+  if (GVar->hasSection()) {
+    if (GVar->getSection() == StringRef("llvm.metadata"))
+      return;
+  }
+
+  // Skip LLVM intrinsic global variables
+  if (GVar->getName().startswith("llvm.") ||
+      GVar->getName().startswith("nvvm."))
+    return;
+
+  const DataLayout *TD = TM.getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  const PointerType *PTy = GVar->getType();
+  Type *ETy = PTy->getElementType();
+
+  if (GVar->hasExternalLinkage()) {
+    if (GVar->hasInitializer())
+      O << ".visible ";
+    else
+      O << ".extern ";
+  } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+             GVar->hasAvailableExternallyLinkage() ||
+             GVar->hasCommonLinkage()) {
+    O << ".weak ";
+  }
+
+  if (llvm::isTexture(*GVar)) {
+    O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+    return;
+  }
+
+  if (llvm::isSurface(*GVar)) {
+    O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+    return;
+  }
+
+  if (GVar->isDeclaration()) {
+    // (extern) declarations, no definition or initializer
+    // Currently the only known declaration is for an automatic __local
+    // (.shared) promoted to global.
+    emitPTXGlobalVariable(GVar, O);
+    O << ";\n";
+    return;
+  }
+
+  if (llvm::isSampler(*GVar)) {
+    O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+    const Constant *Initializer = nullptr;
+    if (GVar->hasInitializer())
+      Initializer = GVar->getInitializer();
+    const ConstantInt *CI = nullptr;
+    if (Initializer)
+      CI = dyn_cast<ConstantInt>(Initializer);
+    if (CI) {
+      unsigned sample = CI->getZExtValue();
+
+      O << " = { ";
+
+      for (int i = 0,
+               addr = ((sample & __CLK_ADDRESS_MASK) >> __CLK_ADDRESS_BASE);
+           i < 3; i++) {
+        O << "addr_mode_" << i << " = ";
+        switch (addr) {
+        case 0:
+          O << "wrap";
+          break;
+        case 1:
+          O << "clamp_to_border";
+          break;
+        case 2:
+          O << "clamp_to_edge";
+          break;
+        case 3:
+          O << "wrap";
+          break;
+        case 4:
+          O << "mirror";
+          break;
+        }
+        O << ", ";
+      }
+      O << "filter_mode = ";
+      switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+      case 0:
+        O << "nearest";
+        break;
+      case 1:
+        O << "linear";
+        break;
+      case 2:
+        llvm_unreachable("Anisotropic filtering is not supported");
+      default:
+        O << "nearest";
+        break;
+      }
+      if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+        O << ", force_unnormalized_coords = 1";
+      }
+      O << " }";
+    }
+
+    O << ";\n";
+    return;
+  }
+
+  if (GVar->hasPrivateLinkage()) {
+
+    if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+      return;
+
+    // FIXME - need better way (e.g. Metadata) to avoid generating this global
+    if (!strncmp(GVar->getName().data(), "filename", 8))
+      return;
+    if (GVar->use_empty())
+      return;
+  }
+
+  const Function *demotedFunc = nullptr;
+  if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+    O << "// " << GVar->getName().str() << " has been demoted\n";
+    if (localDecls.find(demotedFunc) != localDecls.end())
+      localDecls[demotedFunc].push_back(GVar);
+    else {
+      std::vector<const GlobalVariable *> temp;
+      temp.push_back(GVar);
+      localDecls[demotedFunc] = temp;
+    }
+    return;
+  }
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+  if (isManaged(*GVar)) {
+    O << " .attribute(.managed)";
+  }
+
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isSingleValueType()) {
+    O << " .";
+    // Special case: ABI requires that we use .u8 for predicates
+    if (ETy->isIntegerTy(1))
+      O << "u8";
+    else
+      O << getPTXFundamentalTypeStr(ETy, false);
+    O << " ";
+    O << *getSymbol(GVar);
+
+    // Ptx allows variable initilization only for constant and global state
+    // spaces.
+    if (GVar->hasInitializer()) {
+      if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+          (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+        const Constant *Initializer = GVar->getInitializer();
+        // 'undef' is treated as there is no value spefied.
+        if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+          O << " = ";
+          printScalarConstant(Initializer, O);
+        }
+      } else {
+        // The frontend adds zero-initializer to variables that don't have an
+        // initial value, so skip warning for this case.
+        if (!GVar->getInitializer()->isNullValue()) {
+          std::string warnMsg = "initial value of '" + GVar->getName().str() +
+              "' is not allowed in addrspace(" +
+              llvm::utostr_32(PTy->getAddressSpace()) + ")";
+          report_fatal_error(warnMsg.c_str());
+        }
+      }
+    }
+  } else {
+    unsigned int ElementSize = 0;
+
+    // Although PTX has direct support for struct type and array type and
+    // LLVM IR is very similar to PTX, the LLVM CodeGen does not support for
+    // targets that support these high level field accesses. Structs, arrays
+    // and vectors are lowered into arrays of bytes.
+    switch (ETy->getTypeID()) {
+    case Type::StructTyID:
+    case Type::ArrayTyID:
+    case Type::VectorTyID:
+      ElementSize = TD->getTypeStoreSize(ETy);
+      // Ptx allows variable initilization only for constant and
+      // global state spaces.
+      if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+           (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+          GVar->hasInitializer()) {
+        const Constant *Initializer = GVar->getInitializer();
+        if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
+          AggBuffer aggBuffer(ElementSize, O, *this);
+          bufferAggregateConstant(Initializer, &aggBuffer);
+          if (aggBuffer.numSymbols) {
+            if (nvptxSubtarget.is64Bit()) {
+              O << " .u64 " << *getSymbol(GVar) << "[";
+              O << ElementSize / 8;
+            } else {
+              O << " .u32 " << *getSymbol(GVar) << "[";
+              O << ElementSize / 4;
+            }
+            O << "]";
+          } else {
+            O << " .b8 " << *getSymbol(GVar) << "[";
+            O << ElementSize;
+            O << "]";
+          }
+          O << " = {";
+          aggBuffer.print();
+          O << "}";
+        } else {
+          O << " .b8 " << *getSymbol(GVar);
+          if (ElementSize) {
+            O << "[";
+            O << ElementSize;
+            O << "]";
+          }
+        }
+      } else {
+        O << " .b8 " << *getSymbol(GVar);
+        if (ElementSize) {
+          O << "[";
+          O << ElementSize;
+          O << "]";
+        }
+      }
+      break;
+    default:
+      llvm_unreachable("type not supported yet");
+    }
+
+  }
+  O << ";\n";
+}
+
+void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
+  if (localDecls.find(f) == localDecls.end())
+    return;
+
+  std::vector<const GlobalVariable *> &gvars = localDecls[f];
+
+  for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
+    O << "\t// demoted variable\n\t";
+    printModuleLevelGV(gvars[i], O, true);
+  }
+}
+
+void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
+                                          raw_ostream &O) const {
+  switch (AddressSpace) {
+  case llvm::ADDRESS_SPACE_LOCAL:
+    O << "local";
+    break;
+  case llvm::ADDRESS_SPACE_GLOBAL:
+    O << "global";
+    break;
+  case llvm::ADDRESS_SPACE_CONST:
+    O << "const";
+    break;
+  case llvm::ADDRESS_SPACE_SHARED:
+    O << "shared";
+    break;
+  default:
+    report_fatal_error("Bad address space found while emitting PTX");
+    break;
+  }
+}
+
+std::string
+NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("unexpected type");
+    break;
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return "pred";
+    else if (NumBits <= 64) {
+      std::string name = "u";
+      return name + utostr(NumBits);
+    } else {
+      llvm_unreachable("Integer too large");
+      break;
+    }
+    break;
+  }
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::PointerTyID:
+    if (nvptxSubtarget.is64Bit())
+      if (useB4PTR)
+        return "b64";
+      else
+        return "u64";
+    else if (useB4PTR)
+      return "b32";
+    else
+      return "u32";
+  }
+  llvm_unreachable("unexpected type");
+  return nullptr;
+}
+
+void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
+                                            raw_ostream &O) {
+
+  const DataLayout *TD = TM.getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  const PointerType *PTy = GVar->getType();
+  Type *ETy = PTy->getElementType();
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isSingleValueType()) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy);
+    O << " ";
+    O << *getSymbol(GVar);
+    return;
+  }
+
+  int64_t ElementSize = 0;
+
+  // Although PTX has direct support for struct type and array type and LLVM IR
+  // is very similar to PTX, the LLVM CodeGen does not support for targets that
+  // support these high level field accesses. Structs and arrays are lowered
+  // into arrays of bytes.
+  switch (ETy->getTypeID()) {
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    ElementSize = TD->getTypeStoreSize(ETy);
+    O << " .b8 " << *getSymbol(GVar) << "[";
+    if (ElementSize) {
+      O << itostr(ElementSize);
+    }
+    O << "]";
+    break;
+  default:
+    llvm_unreachable("type not supported yet");
+  }
+  return;
+}
+
+static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
+  if (Ty->isSingleValueType())
+    return TD->getPrefTypeAlignment(Ty);
+
+  const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+  if (ATy)
+    return getOpenCLAlignment(TD, ATy->getElementType());
+
+  const VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (VTy) {
+    Type *ETy = VTy->getElementType();
+    unsigned int numE = VTy->getNumElements();
+    unsigned int alignE = TD->getPrefTypeAlignment(ETy);
+    if (numE == 3)
+      return 4 * alignE;
+    else
+      return numE * alignE;
+  }
+
+  const StructType *STy = dyn_cast<StructType>(Ty);
+  if (STy) {
+    unsigned int alignStruct = 1;
+    // Go through each element of the struct and find the
+    // largest alignment.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) {
+      Type *ETy = STy->getElementType(i);
+      unsigned int align = getOpenCLAlignment(TD, ETy);
+      if (align > alignStruct)
+        alignStruct = align;
+    }
+    return alignStruct;
+  }
+
+  const FunctionType *FTy = dyn_cast<FunctionType>(Ty);
+  if (FTy)
+    return TD->getPointerPrefAlignment();
+  return TD->getPrefTypeAlignment(Ty);
+}
+
+void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
+                                     int paramIndex, raw_ostream &O) {
+  if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+      (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
+    O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
+  else {
+    std::string argName = I->getName();
+    const char *p = argName.c_str();
+    while (*p) {
+      if (*p == '.')
+        O << "_";
+      else
+        O << *p;
+      p++;
+    }
+  }
+}
+
+void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
+  Function::const_arg_iterator I, E;
+  int i = 0;
+
+  if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+      (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) {
+    O << *CurrentFnSym << "_param_" << paramIndex;
+    return;
+  }
+
+  for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) {
+    if (i == paramIndex) {
+      printParamName(I, paramIndex, O);
+      return;
+    }
+  }
+  llvm_unreachable("paramIndex out of bound");
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
+  const DataLayout *TD = TM.getDataLayout();
+  const AttributeSet &PAL = F->getAttributes();
+  const TargetLowering *TLI = TM.getTargetLowering();
+  Function::const_arg_iterator I, E;
+  unsigned paramIndex = 0;
+  bool first = true;
+  bool isKernelFunc = llvm::isKernelFunction(*F);
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  MVT thePointerTy = TLI->getPointerTy();
+
+  O << "(\n";
+
+  for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
+    Type *Ty = I->getType();
+
+    if (!first)
+      O << ",\n";
+
+    first = false;
+
+    // Handle image/sampler parameters
+    if (isKernelFunction(*F)) {
+      if (isSampler(*I) || isImage(*I)) {
+        if (isImage(*I)) {
+          std::string sname = I->getName();
+          if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
+            if (nvptxSubtarget.hasImageHandles())
+              O << "\t.param .u64 .ptr .surfref ";
+            else
+              O << "\t.param .surfref ";
+            O << *CurrentFnSym << "_param_" << paramIndex;
+          }
+          else { // Default image is read_only
+            if (nvptxSubtarget.hasImageHandles())
+              O << "\t.param .u64 .ptr .texref ";
+            else
+              O << "\t.param .texref ";
+            O << *CurrentFnSym << "_param_" << paramIndex;
+          }
+        } else {
+          if (nvptxSubtarget.hasImageHandles())
+            O << "\t.param .u64 .ptr .samplerref ";
+          else
+            O << "\t.param .samplerref ";
+          O << *CurrentFnSym << "_param_" << paramIndex;
+        }
+        continue;
+      }
+    }
+
+    if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) {
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+        // Just print .param .align <a> .b8 .param[size];
+        // <a> = PAL.getparamalignment
+        // size = typeallocsize of element type
+        unsigned align = PAL.getParamAlignment(paramIndex + 1);
+        if (align == 0)
+          align = TD->getABITypeAlignment(Ty);
+
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        O << "\t.param .align " << align << " .b8 ";
+        printParamName(I, paramIndex, O);
+        O << "[" << sz << "]";
+
+        continue;
+      }
+      // Just a scalar
+      const PointerType *PTy = dyn_cast<PointerType>(Ty);
+      if (isKernelFunc) {
+        if (PTy) {
+          // Special handling for pointer arguments to kernel
+          O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
+
+          if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) {
+            Type *ETy = PTy->getElementType();
+            int addrSpace = PTy->getAddressSpace();
+            switch (addrSpace) {
+            default:
+              O << ".ptr ";
+              break;
+            case llvm::ADDRESS_SPACE_CONST:
+              O << ".ptr .const ";
+              break;
+            case llvm::ADDRESS_SPACE_SHARED:
+              O << ".ptr .shared ";
+              break;
+            case llvm::ADDRESS_SPACE_GLOBAL:
+              O << ".ptr .global ";
+              break;
+            }
+            O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " ";
+          }
+          printParamName(I, paramIndex, O);
+          continue;
+        }
+
+        // non-pointer scalar to kernel func
+        O << "\t.param .";
+        // Special case: predicate operands become .u8 types
+        if (Ty->isIntegerTy(1))
+          O << "u8";
+        else
+          O << getPTXFundamentalTypeStr(Ty);
+        O << " ";
+        printParamName(I, paramIndex, O);
+        continue;
+      }
+      // Non-kernel function, just print .param .b<size> for ABI
+      // and .reg .b<size> for non-ABI
+      unsigned sz = 0;
+      if (isa<IntegerType>(Ty)) {
+        sz = cast<IntegerType>(Ty)->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = thePointerTy.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      if (isABI)
+        O << "\t.param .b" << sz << " ";
+      else
+        O << "\t.reg .b" << sz << " ";
+      printParamName(I, paramIndex, O);
+      continue;
+    }
+
+    // param has byVal attribute. So should be a pointer
+    const PointerType *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    if (isABI || isKernelFunc) {
+      // Just print .param .align <a> .b8 .param[size];
+      // <a> = PAL.getparamalignment
+      // size = typeallocsize of element type
+      unsigned align = PAL.getParamAlignment(paramIndex + 1);
+      if (align == 0)
+        align = TD->getABITypeAlignment(ETy);
+
+      unsigned sz = TD->getTypeAllocSize(ETy);
+      O << "\t.param .align " << align << " .b8 ";
+      printParamName(I, paramIndex, O);
+      O << "[" << sz << "]";
+      continue;
+    } else {
+      // Split the ETy into constituent parts and
+      // print .param .b<size> <name> for each part.
+      // Further, if a part is vector, print the above for
+      // each vector element.
+      SmallVector<EVT, 16> vtparts;
+      ComputeValueVTs(*TLI, ETy, vtparts);
+      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+        unsigned elems = 1;
+        EVT elemtype = vtparts[i];
+        if (vtparts[i].isVector()) {
+          elems = vtparts[i].getVectorNumElements();
+          elemtype = vtparts[i].getVectorElementType();
+        }
+
+        for (unsigned j = 0, je = elems; j != je; ++j) {
+          unsigned sz = elemtype.getSizeInBits();
+          if (elemtype.isInteger() && (sz < 32))
+            sz = 32;
+          O << "\t.reg .b" << sz << " ";
+          printParamName(I, paramIndex, O);
+          if (j < je - 1)
+            O << ",\n";
+          ++paramIndex;
+        }
+        if (i < e - 1)
+          O << ",\n";
+      }
+      --paramIndex;
+      continue;
+    }
+  }
+
+  O << "\n)\n";
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF,
+                                            raw_ostream &O) {
+  const Function *F = MF.getFunction();
+  emitFunctionParamList(F, O);
+}
+
+void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
+    const MachineFunction &MF) {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  // Map the global virtual register number to a register class specific
+  // virtual register number starting from 1 with that class.
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  //unsigned numRegClasses = TRI->getNumRegClasses();
+
+  // Emit the Fake Stack Object
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int NumBytes = (int) MFI->getStackSize();
+  if (NumBytes) {
+    O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t" << DEPOTNAME
+      << getFunctionNumber() << "[" << NumBytes << "];\n";
+    if (nvptxSubtarget.is64Bit()) {
+      O << "\t.reg .b64 \t%SP;\n";
+      O << "\t.reg .b64 \t%SPL;\n";
+    } else {
+      O << "\t.reg .b32 \t%SP;\n";
+      O << "\t.reg .b32 \t%SPL;\n";
+    }
+  }
+
+  // Go through all virtual registers to establish the mapping between the
+  // global virtual
+  // register number and the per class virtual register number.
+  // We use the per class virtual register number in the ptx output.
+  unsigned int numVRs = MRI->getNumVirtRegs();
+  for (unsigned i = 0; i < numVRs; i++) {
+    unsigned int vr = TRI->index2VirtReg(i);
+    const TargetRegisterClass *RC = MRI->getRegClass(vr);
+    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+    int n = regmap.size();
+    regmap.insert(std::make_pair(vr, n + 1));
+  }
+
+  // Emit register declarations
+  // @TODO: Extract out the real register usage
+  // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n";
+
+  // Emit declaration of the virtual registers or 'physical' registers for
+  // each register class
+  for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
+    const TargetRegisterClass *RC = TRI->getRegClass(i);
+    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+    std::string rcname = getNVPTXRegClassName(RC);
+    std::string rcStr = getNVPTXRegClassStr(RC);
+    int n = regmap.size();
+
+    // Only declare those registers that may be used.
+    if (n) {
+       O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+         << ">;\n";
+    }
+  }
+
+  OutStreamer.EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
+  APFloat APF = APFloat(Fp->getValueAPF()); // make a copy
+  bool ignored;
+  unsigned int numHex;
+  const char *lead;
+
+  if (Fp->getType()->getTypeID() == Type::FloatTyID) {
+    numHex = 8;
+    lead = "0f";
+    APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &ignored);
+  } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) {
+    numHex = 16;
+    lead = "0d";
+    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+  } else
+    llvm_unreachable("unsupported fp type");
+
+  APInt API = APF.bitcastToAPInt();
+  std::string hexstr(utohexstr(API.getZExtValue()));
+  O << lead;
+  if (hexstr.length() < numHex)
+    O << std::string(numHex - hexstr.length(), '0');
+  O << utohexstr(API.getZExtValue());
+}
+
+void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+    O << CI->getValue();
+    return;
+  }
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+    printFPConstant(CFP, O);
+    return;
+  }
+  if (isa<ConstantPointerNull>(CPV)) {
+    O << "0";
+    return;
+  }
+  if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+    PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
+    bool IsNonGenericPointer = false;
+    if (PTy && PTy->getAddressSpace() != 0) {
+      IsNonGenericPointer = true;
+    }
+    if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
+      O << "generic(";
+      O << *getSymbol(GVar);
+      O << ")";
+    } else {
+      O << *getSymbol(GVar);
+    }
+    return;
+  }
+  if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+    const Value *v = Cexpr->stripPointerCasts();
+    PointerType *PTy = dyn_cast<PointerType>(Cexpr->getType());
+    bool IsNonGenericPointer = false;
+    if (PTy && PTy->getAddressSpace() != 0) {
+      IsNonGenericPointer = true;
+    }
+    if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+      if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+        O << "generic(";
+        O << *getSymbol(GVar);
+        O << ")";
+      } else {
+        O << *getSymbol(GVar);
+      }
+      return;
+    } else {
+      O << *LowerConstant(CPV, *this);
+      return;
+    }
+  }
+  llvm_unreachable("Not scalar type found in printScalarConstant()");
+}
+
+void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
+                                   AggBuffer *aggBuffer) {
+
+  const DataLayout *TD = TM.getDataLayout();
+
+  if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
+    int s = TD->getTypeAllocSize(CPV->getType());
+    if (s < Bytes)
+      s = Bytes;
+    aggBuffer->addZeros(s);
+    return;
+  }
+
+  unsigned char *ptr;
+  switch (CPV->getType()->getTypeID()) {
+
+  case Type::IntegerTyID: {
+    const Type *ETy = CPV->getType();
+    if (ETy == Type::getInt8Ty(CPV->getContext())) {
+      unsigned char c =
+          (unsigned char)(dyn_cast<ConstantInt>(CPV))->getZExtValue();
+      ptr = &c;
+      aggBuffer->addBytes(ptr, 1, Bytes);
+    } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
+      short int16 = (short)(dyn_cast<ConstantInt>(CPV))->getZExtValue();
+      ptr = (unsigned char *)&int16;
+      aggBuffer->addBytes(ptr, 2, Bytes);
+    } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
+      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        int int32 = (int)(constInt->getZExtValue());
+        ptr = (unsigned char *)&int32;
+        aggBuffer->addBytes(ptr, 4, Bytes);
+        break;
+      } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
+                ConstantFoldConstantExpression(Cexpr, TD))) {
+          int int32 = (int)(constInt->getZExtValue());
+          ptr = (unsigned char *)&int32;
+          aggBuffer->addBytes(ptr, 4, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v);
+          aggBuffer->addZeros(4);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
+      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        long long int64 = (long long)(constInt->getZExtValue());
+        ptr = (unsigned char *)&int64;
+        aggBuffer->addBytes(ptr, 8, Bytes);
+        break;
+      } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
+                ConstantFoldConstantExpression(Cexpr, TD))) {
+          long long int64 = (long long)(constInt->getZExtValue());
+          ptr = (unsigned char *)&int64;
+          aggBuffer->addBytes(ptr, 8, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v);
+          aggBuffer->addZeros(8);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else
+      llvm_unreachable("unsupported integer const type");
+    break;
+  }
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+    const Type *Ty = CFP->getType();
+    if (Ty == Type::getFloatTy(CPV->getContext())) {
+      float float32 = (float) CFP->getValueAPF().convertToFloat();
+      ptr = (unsigned char *)&float32;
+      aggBuffer->addBytes(ptr, 4, Bytes);
+    } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+      double float64 = CFP->getValueAPF().convertToDouble();
+      ptr = (unsigned char *)&float64;
+      aggBuffer->addBytes(ptr, 8, Bytes);
+    } else {
+      llvm_unreachable("unsupported fp const type");
+    }
+    break;
+  }
+  case Type::PointerTyID: {
+    if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+      aggBuffer->addSymbol(GVar);
+    } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+      const Value *v = Cexpr->stripPointerCasts();
+      aggBuffer->addSymbol(v);
+    }
+    unsigned int s = TD->getTypeAllocSize(CPV->getType());
+    aggBuffer->addZeros(s);
+    break;
+  }
+
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID: {
+    if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
+        isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
+      int ElementSize = TD->getTypeAllocSize(CPV->getType());
+      bufferAggregateConstant(CPV, aggBuffer);
+      if (Bytes > ElementSize)
+        aggBuffer->addZeros(Bytes - ElementSize);
+    } else if (isa<ConstantAggregateZero>(CPV))
+      aggBuffer->addZeros(Bytes);
+    else
+      llvm_unreachable("Unexpected Constant type");
+    break;
+  }
+
+  default:
+    llvm_unreachable("unsupported type");
+  }
+}
+
+void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
+                                              AggBuffer *aggBuffer) {
+  const DataLayout *TD = TM.getDataLayout();
+  int Bytes;
+
+  // Old constants
+  if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+    if (CPV->getNumOperands())
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i)
+        bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer);
+    return;
+  }
+
+  if (const ConstantDataSequential *CDS =
+          dyn_cast<ConstantDataSequential>(CPV)) {
+    if (CDS->getNumElements())
+      for (unsigned i = 0; i < CDS->getNumElements(); ++i)
+        bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0,
+                     aggBuffer);
+    return;
+  }
+
+  if (isa<ConstantStruct>(CPV)) {
+    if (CPV->getNumOperands()) {
+      StructType *ST = cast<StructType>(CPV->getType());
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+        if (i == (e - 1))
+          Bytes = TD->getStructLayout(ST)->getElementOffset(0) +
+                  TD->getTypeAllocSize(ST) -
+                  TD->getStructLayout(ST)->getElementOffset(i);
+        else
+          Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) -
+                  TD->getStructLayout(ST)->getElementOffset(i);
+        bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer);
+      }
+    }
+    return;
+  }
+  llvm_unreachable("unsupported constant type in printAggregateConstant()");
+}
+
+// buildTypeNameMap - Run through symbol table looking for type names.
+//
+
+bool NVPTXAsmPrinter::isImageType(const Type *Ty) {
+
+  std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty);
+
+  if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") ||
+                                  !PI->second.compare("struct._image2d_t") ||
+                                  !PI->second.compare("struct._image3d_t")))
+    return true;
+
+  return false;
+}
+
+
+bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgEndInst0:
+  case NVPTX::CallArgEndInst1:
+  case NVPTX::CallArgF32:
+  case NVPTX::CallArgF64:
+  case NVPTX::CallArgI16:
+  case NVPTX::CallArgI32:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgI64:
+  case NVPTX::CallArgParam:
+  case NVPTX::CallVoidInst:
+  case NVPTX::CallVoidInstReg:
+  case NVPTX::Callseq_End:
+  case NVPTX::CallVoidInstReg64:
+  case NVPTX::DeclareParamInst:
+  case NVPTX::DeclareRetMemInst:
+  case NVPTX::DeclareRetRegInst:
+  case NVPTX::DeclareRetScalarInst:
+  case NVPTX::DeclareScalarParamInst:
+  case NVPTX::DeclareScalarRegInst:
+  case NVPTX::StoreParamF32:
+  case NVPTX::StoreParamF64:
+  case NVPTX::StoreParamI16:
+  case NVPTX::StoreParamI32:
+  case NVPTX::StoreParamI64:
+  case NVPTX::StoreParamI8:
+  case NVPTX::StoreRetvalF32:
+  case NVPTX::StoreRetvalF64:
+  case NVPTX::StoreRetvalI16:
+  case NVPTX::StoreRetvalI32:
+  case NVPTX::StoreRetvalI64:
+  case NVPTX::StoreRetvalI8:
+  case NVPTX::LastCallArgF32:
+  case NVPTX::LastCallArgF64:
+  case NVPTX::LastCallArgI16:
+  case NVPTX::LastCallArgI32:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgI64:
+  case NVPTX::LastCallArgParam:
+  case NVPTX::LoadParamMemF32:
+  case NVPTX::LoadParamMemF64:
+  case NVPTX::LoadParamMemI16:
+  case NVPTX::LoadParamMemI32:
+  case NVPTX::LoadParamMemI64:
+  case NVPTX::LoadParamMemI8:
+  case NVPTX::PrototypeInst:
+  case NVPTX::DBG_VALUE:
+    return true;
+  }
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+    const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (MO.getReg() == NVPTX::VRDepot)
+        O << DEPOTNAME << getFunctionNumber();
+      else
+        O << NVPTXInstPrinter::getRegisterName(MO.getReg());
+    } else {
+      emitVirtualRegister(MO.getReg(), O);
+    }
+    return;
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier)
+      O << MO.getImm();
+    else if (strstr(Modifier, "vec") == Modifier)
+      printVecModifiedImmediate(MO, Modifier, O);
+    else
+      llvm_unreachable(
+          "Don't know how to handle modifier on immediate operand");
+    return;
+
+  case MachineOperand::MO_FPImmediate:
+    printFPConstant(MO.getFPImm(), O);
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+
+  default:
+    llvm_unreachable("Operand type not supported.");
+  }
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, opNum + 1, O);
+  } else {
+    if (MI->getOperand(opNum + 1).isImm() &&
+        MI->getOperand(opNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, opNum + 1, O);
+  }
+}
+
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
+  RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+  RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
+
+void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
+  std::stringstream temp;
+  LineReader *reader = this->getReader(filename.str());
+  temp << "\n//";
+  temp << filename.str();
+  temp << ":";
+  temp << line;
+  temp << " ";
+  temp << reader->readLine(line);
+  temp << "\n";
+  this->OutStreamer.EmitRawText(Twine(temp.str()));
+}
+
+LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
+  if (!reader) {
+    reader = new LineReader(filename);
+  }
+
+  if (reader->fileName() != filename) {
+    delete reader;
+    reader = new LineReader(filename);
+  }
+
+  return reader;
+}
+
+std::string LineReader::readLine(unsigned lineNum) {
+  if (lineNum < theCurLine) {
+    theCurLine = 0;
+    fstr.seekg(0, std::ios::beg);
+  }
+  while (theCurLine < lineNum) {
+    fstr.getline(buff, 500);
+    theCurLine++;
+  }
+  return buff;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXAsmPrinter() {
+  RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+  RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
new file mode 100644
index 0000000..a9f9bdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -0,0 +1,333 @@
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXASMPRINTER_H
+#define NVPTXASMPRINTER_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <fstream>
+
+// The ptx syntax and format is very different from that usually seem in a .s
+// file,
+// therefore we are not able to use the MCAsmStreamer interface here.
+//
+// We are handcrafting the output method here.
+//
+// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
+// (subclass of MCStreamer).
+
+// This is defined in AsmPrinter.cpp.
+// Used to process the constant expressions in initializers.
+namespace nvptx {
+const llvm::MCExpr *
+LowerConstant(const llvm::Constant *CV, llvm::AsmPrinter &AP);
+}
+
+namespace llvm {
+
+class LineReader {
+private:
+  unsigned theCurLine;
+  std::ifstream fstr;
+  char buff[512];
+  std::string theFileName;
+  SmallVector<unsigned, 32> lineOffset;
+public:
+  LineReader(std::string filename) {
+    theCurLine = 0;
+    fstr.open(filename.c_str());
+    theFileName = filename;
+  }
+  std::string fileName() { return theFileName; }
+  ~LineReader() { fstr.close(); }
+  std::string readLine(unsigned line);
+};
+
+class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
+
+  class AggBuffer {
+    // Used to buffer the emitted string for initializing global
+    // aggregates.
+    //
+    // Normally an aggregate (array, vector or structure) is emitted
+    // as a u8[]. However, if one element/field of the aggregate
+    // is a non-NULL address, then the aggregate is emitted as u32[]
+    // or u64[].
+    //
+    // We first layout the aggregate in 'buffer' in bytes, except for
+    // those symbol addresses. For the i-th symbol address in the
+    //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
+    // are filled with 0s. symbolPosInBuffer[i-1] records its position
+    // in 'buffer', and Symbols[i-1] records the Value*.
+    //
+    // Once we have this AggBuffer setup, we can choose how to print
+    // it out.
+  public:
+    unsigned size;         // size of the buffer in bytes
+    unsigned char *buffer; // the buffer
+    unsigned numSymbols;   // number of symbol addresses
+    SmallVector<unsigned, 4> symbolPosInBuffer;
+    SmallVector<const Value *, 4> Symbols;
+
+  private:
+    unsigned curpos;
+    raw_ostream &O;
+    NVPTXAsmPrinter &AP;
+    bool EmitGeneric;
+
+  public:
+    AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
+        : O(_O), AP(_AP) {
+      buffer = new unsigned char[_size];
+      size = _size;
+      curpos = 0;
+      numSymbols = 0;
+      EmitGeneric = AP.EmitGeneric;
+    }
+    ~AggBuffer() { delete[] buffer; }
+    unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
+      assert((curpos + Num) <= size);
+      assert((curpos + Bytes) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = Ptr[i];
+        curpos++;
+      }
+      for (int i = Num; i < Bytes; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    unsigned addZeros(int Num) {
+      assert((curpos + Num) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    void addSymbol(const Value *GVar) {
+      symbolPosInBuffer.push_back(curpos);
+      Symbols.push_back(GVar);
+      numSymbols++;
+    }
+    void print() {
+      if (numSymbols == 0) {
+        // print out in bytes
+        for (unsigned i = 0; i < size; i++) {
+          if (i)
+            O << ", ";
+          O << (unsigned int) buffer[i];
+        }
+      } else {
+        // print out in 4-bytes or 8-bytes
+        unsigned int pos = 0;
+        unsigned int nSym = 0;
+        unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+        unsigned int nBytes = 4;
+        if (AP.nvptxSubtarget.is64Bit())
+          nBytes = 8;
+        for (pos = 0; pos < size; pos += nBytes) {
+          if (pos)
+            O << ", ";
+          if (pos == nextSymbolPos) {
+            const Value *v = Symbols[nSym];
+            if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+              MCSymbol *Name = AP.getSymbol(GVar);
+              PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
+              bool IsNonGenericPointer = false;
+              if (PTy && PTy->getAddressSpace() != 0) {
+                IsNonGenericPointer = true;
+              }
+              if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+                O << "generic(";
+                O << *Name;
+                O << ")";
+              } else {
+                O << *Name;
+              }
+            } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
+              O << *nvptx::LowerConstant(Cexpr, AP);
+            } else
+              llvm_unreachable("symbol type unknown");
+            nSym++;
+            if (nSym >= numSymbols)
+              nextSymbolPos = size + 1;
+            else
+              nextSymbolPos = symbolPosInBuffer[nSym];
+          } else if (nBytes == 4)
+            O << *(unsigned int *)(buffer + pos);
+          else
+            O << *(unsigned long long *)(buffer + pos);
+        }
+      }
+    }
+  };
+
+  friend class AggBuffer;
+
+  void emitSrcInText(StringRef filename, unsigned line);
+
+private:
+  const char *getPassName() const override { return "NVPTX Assembly Printer"; }
+
+  const Function *F;
+  std::string CurrentFnName;
+
+  void EmitFunctionEntryLabel() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  void emitImplicitDef(const MachineInstr *MI) const override;
+
+  void EmitInstruction(const MachineInstr *) override;
+  void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+  MCOperand GetSymbolRef(const MCSymbol *Symbol);
+  unsigned encodeVirtualRegister(unsigned Reg);
+
+  void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {}
+
+  void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
+                                 raw_ostream &O);
+  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const char *Modifier = nullptr);
+  void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
+  void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
+                          bool = false);
+  void printParamName(int paramIndex, raw_ostream &O);
+  void printParamName(Function::const_arg_iterator I, int paramIndex,
+                      raw_ostream &O);
+  void emitGlobals(const Module &M);
+  void emitHeader(Module &M, raw_ostream &O);
+  void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
+  void emitVirtualRegister(unsigned int vr, raw_ostream &);
+  void emitFunctionExternParamList(const MachineFunction &MF);
+  void emitFunctionParamList(const Function *, raw_ostream &O);
+  void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
+  void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
+  void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize);
+  bool isImageType(const Type *Ty);
+  void printReturnValStr(const Function *, raw_ostream &O);
+  void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &) override;
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &) override;
+protected:
+  bool doInitialization(Module &M) override;
+  bool doFinalization(Module &M) override;
+
+private:
+  std::string CurrentBankselLabelInBasicBlock;
+
+  bool GlobalsEmitted;
+  
+  // This is specific per MachineFunction.
+  const MachineRegisterInfo *MRI;
+  // The contents are specific for each
+  // MachineFunction. But the size of the
+  // array is not.
+  typedef DenseMap<unsigned, unsigned> VRegMap;
+  typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+  VRegRCMap VRegMapping;
+  // cache the subtarget here.
+  const NVPTXSubtarget &nvptxSubtarget;
+  // Build the map between type name and ID based on module's type
+  // symbol table.
+  std::map<const Type *, std::string> TypeNameMap;
+
+  // List of variables demoted to a function scope.
+  std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
+
+  // To record filename to ID mapping
+  std::map<std::string, unsigned> filenameMap;
+  void recordAndEmitFilenames(Module &);
+
+  void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+  void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
+  std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
+  void printScalarConstant(const Constant *CPV, raw_ostream &O);
+  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+  void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+  void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
+
+  void printOperandProper(const MachineOperand &MO);
+
+  void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
+  void emitDeclarations(const Module &, raw_ostream &O);
+  void emitDeclaration(const Function *, raw_ostream &O);
+
+  static const char *getRegisterName(unsigned RegNo);
+  void emitDemotedVars(const Function *, raw_ostream &);
+
+  bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
+                               MCOperand &MCOp);
+  void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp);
+
+  LineReader *reader;
+  LineReader *getReader(std::string);
+
+  // Used to control the need to emit .generic() in the initializer of
+  // module scope variables.
+  // Although ptx supports the hybrid mode like the following,
+  //    .global .u32 a;
+  //    .global .u32 b;
+  //    .global .u32 addr[] = {a, generic(b)}
+  // we have difficulty representing the difference in the NVVM IR.
+  //
+  // Since the address value should always be generic in CUDA C and always
+  // be specific in OpenCL, we use this simple control here.
+  //
+  bool EmitGeneric;
+
+public:
+  NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+    CurrentBankselLabelInBasicBlock = "";
+    reader = nullptr;
+    EmitGeneric = (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA);
+  }
+
+  ~NVPTXAsmPrinter() {
+    if (!reader)
+      delete reader;
+  }
+
+  bool ignoreLoc(const MachineInstr &);
+
+  std::string getVirtualRegisterName(unsigned) const;
+
+  DebugLoc prevDebugLoc;
+  void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
new file mode 100644
index 0000000..962b123
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -0,0 +1,84 @@
+//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Clean up the names of global variables in the module to not contain symbols
+// that are invalid in PTX.
+//
+// Currently NVPTX, like other backends, relies on generic symbol name
+// sanitizing done by MC. However, the ptxas assembler is more stringent and
+// disallows some additional characters in symbol names. This pass makes sure
+// such names do not reach MC at all.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+namespace {
+/// \brief NVPTXAssignValidGlobalNames
+class NVPTXAssignValidGlobalNames : public ModulePass {
+public:
+  static char ID;
+  NVPTXAssignValidGlobalNames() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  /// \brief Clean up the name to remove symbols invalid in PTX.
+  std::string cleanUpName(StringRef Name);
+};
+}
+
+char NVPTXAssignValidGlobalNames::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names",
+                "Assign valid PTX names to globals", false, false)
+
+bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
+  for (GlobalVariable &GV : M.globals()) {
+    // We are only allowed to rename local symbols.
+    if (GV.hasLocalLinkage()) {
+      // setName doesn't do extra work if the name does not change.
+      // Note: this does not create collisions - if setName is asked to set the
+      // name to something that already exists, it adds a proper postfix to
+      // avoid collisions.
+      GV.setName(cleanUpName(GV.getName()));
+    }
+  }
+
+  return true;
+}
+
+std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
+  std::string ValidName;
+  raw_string_ostream ValidNameStream(ValidName);
+  for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+    char C = Name[I];
+    if (C == '.' || C == '@') {
+      ValidNameStream << "_$_";
+    } else {
+      ValidNameStream << C;
+    }
+  }
+
+  return ValidNameStream.str();
+}
+
+ModulePass *llvm::createNVPTXAssignValidGlobalNamesPass() {
+  return new NVPTXAssignValidGlobalNames();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
new file mode 100644
index 0000000..f3a095d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -0,0 +1,195 @@
+//===-- NVPTXFavorNonGenericAddrSpace.cpp - ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When a load/store accesses the generic address space, checks whether the
+// address is casted from a non-generic address space. If so, remove this
+// addrspacecast because accessing non-generic address spaces is typically
+// faster. Besides seeking addrspacecasts, this optimization also traces into
+// the base pointer of a GEP.
+//
+// For instance, the code below loads a float from an array allocated in
+// addrspace(3).
+//
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float]* %0, i64 0, i64 %i
+// %2 = load float* %1 ; emits ld.f32
+//
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
+// and the GEP to expose more optimization opportunities to function
+// optimizeMemoryInst. The intermediate code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = addrspacecast float addrspace(3)* %0 to float*
+// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//
+// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
+// generic pointers, and folds the load and the addrspacecast into a load from
+// the original address space. The final code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//
+// This pass may remove an addrspacecast in a different BB. Therefore, we
+// implement it as a FunctionPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// An option to disable this optimization. Enable it by default.
+static cl::opt<bool> DisableFavorNonGeneric(
+  "disable-nvptx-favor-non-generic",
+  cl::init(false),
+  cl::desc("Do not convert generic address space usage "
+           "to non-generic address space usage"),
+  cl::Hidden);
+
+namespace {
+/// \brief NVPTXFavorNonGenericAddrSpaces
+class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
+public:
+  static char ID;
+  NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  /// Optimizes load/store instructions. Idx is the index of the pointer operand
+  /// (0 for load, and 1 for store). Returns true if it changes anything.
+  bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
+  /// indices)".  This reordering exposes to optimizeMemoryInstruction more
+  /// optimization opportunities on loads and stores. Returns true if it changes
+  /// the program.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+};
+}
+
+char NVPTXFavorNonGenericAddrSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
+                "Remove unnecessary non-generic-to-generic addrspacecasts",
+                false, false)
+
+// Decides whether removing Cast is valid and beneficial. Cast can be an
+// instruction or a constant expression.
+static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+  // Returns false if not even an addrspacecast.
+  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+    return false;
+
+  Value *Src = Cast->getOperand(0);
+  PointerType *SrcTy = cast<PointerType>(Src->getType());
+  PointerType *DestTy = cast<PointerType>(Cast->getType());
+  // TODO: For now, we only handle the case where the addrspacecast only changes
+  // the address space but not the type. If the type also changes, we could
+  // still get rid of the addrspacecast by adding an extra bitcast, but we
+  // rarely see such scenarios.
+  if (SrcTy->getElementType() != DestTy->getElementType())
+    return false;
+
+  // Checks whether the addrspacecast is from a non-generic address space to the
+  // generic address space.
+  return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+          DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
+    GEPOperator *GEP) {
+  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
+  if (!Cast)
+    return false;
+
+  if (!IsEliminableAddrSpaceCast(Cast))
+    return false;
+
+  SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+  if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    // %1 = gep (addrspacecast X), indices
+    // =>
+    // %0 = gep X, indices
+    // %1 = addrspacecast %0
+    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
+                                                           Indices,
+                                                           GEP->getName(),
+                                                           GEPI);
+    NewGEPI->setIsInBounds(GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+  } else {
+    // GEP is a constant expression.
+    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+        cast<Constant>(Cast->getOperand(0)),
+        Indices,
+        GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+  }
+
+  return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
+  // GEP to expose more optimization opportunites.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
+    hoistAddrSpaceCastFromGEP(GEP);
+  }
+
+  // load/store (addrspacecast X) => load/store X if shortcutting the
+  // addrspacecast is valid and can improve performance.
+  //
+  // e.g.,
+  // %1 = addrspacecast float addrspace(3)* %0 to float*
+  // %2 = load float* %1
+  // ->
+  // %2 = load float addrspace(3)* %0
+  //
+  // Note: the addrspacecast can also be a constant expression.
+  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
+    if (IsEliminableAddrSpaceCast(Cast)) {
+      MI->setOperand(Idx, Cast->getOperand(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
+  if (DisableFavorNonGeneric)
+    return false;
+
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+      if (isa<LoadInst>(I)) {
+        // V = load P
+        Changed |= optimizeMemoryInstruction(I, 0);
+      } else if (isa<StoreInst>(I)) {
+        // store V, P
+        Changed |= optimizeMemoryInstruction(I, 1);
+      }
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
+  return new NVPTXFavorNonGenericAddrSpaces();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 0000000..8b08841
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,81 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
+      is64bit(STI.is64Bit()) {}
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+  if (MF.getFrameInfo()->hasStackObjects()) {
+    MachineBasicBlock &MBB = MF.front();
+    // Insert "mov.u32 %SP, %Depot"
+    MachineBasicBlock::iterator MBBI = MBB.begin();
+    // This instruction really occurs before first instruction
+    // in the BB, so giving it no debug location.
+    DebugLoc dl = DebugLoc();
+
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    // mov %SPL, %depot;
+    // cvta.local %SP, %SPL;
+    if (is64bit) {
+      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
+      MachineInstr *MI =
+          BuildMI(MBB, MBBI, dl,
+                  MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+                  NVPTX::VRFrame).addReg(LocalReg);
+      BuildMI(MBB, MI, dl,
+              MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
+              LocalReg).addImm(MF.getFunctionNumber());
+    } else {
+      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
+      MachineInstr *MI =
+          BuildMI(MBB, MBBI, dl,
+                  MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
+                  NVPTX::VRFrame).addReg(LocalReg);
+      BuildMI(MBB, MI, dl,
+              MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
+              LocalReg).addImm(MF.getFunctionNumber());
+    }
+  }
+}
+
+void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN,
+  // ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
new file mode 100644
index 0000000..56fb673
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -0,0 +1,38 @@
+//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_FRAMELOWERING_H
+#define NVPTX_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class NVPTXSubtarget;
+class NVPTXFrameLowering : public TargetFrameLowering {
+  bool is64bit;
+
+public:
+  explicit NVPTXFrameLowering(NVPTXSubtarget &STI);
+
+  bool hasFP(const MachineFunction &MF) const override;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
new file mode 100644
index 0000000..faa9fdb
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -0,0 +1,433 @@
+//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Convert generic global variables into either .global or .const access based
+// on the variable's "constant" qualifier.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/PassManager.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+  static char ID;
+
+  GenericToNVVM() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+private:
+  Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+                         IRBuilder<> &Builder);
+  Value *remapConstant(Module *M, Function *F, Constant *C,
+                       IRBuilder<> &Builder);
+  Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+                                                Constant *C,
+                                                IRBuilder<> &Builder);
+  Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                           IRBuilder<> &Builder);
+  void remapNamedMDNode(Module *M, NamedMDNode *N);
+  MDNode *remapMDNode(Module *M, MDNode *N);
+
+  typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+  typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+  GVMapTy GVMap;
+  ConstantToValueMapTy ConstantToValueMap;
+};
+} // end namespace
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+    GenericToNVVM, "generic-to-nvvm",
+    "Ensure that the global variables are in the global address space", false,
+    false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+  // Create a clone of each global variable that has the default address space.
+  // The clone is created with the global address space  specifier, and the pair
+  // of original global variable and its clone is placed in the GVMap for later
+  // use.
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E;) {
+    GlobalVariable *GV = I++;
+    if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+        !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+        !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
+      GlobalVariable *NewGV = new GlobalVariable(
+          M, GV->getType()->getElementType(), GV->isConstant(),
+          GV->getLinkage(),
+          GV->hasInitializer() ? GV->getInitializer() : nullptr,
+          "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+      NewGV->copyAttributesFrom(GV);
+      GVMap[GV] = NewGV;
+    }
+  }
+
+  // Return immediately, if every global variable has a specific address space
+  // specifier.
+  if (GVMap.empty()) {
+    return false;
+  }
+
+  // Walk through the instructions in function defitinions, and replace any use
+  // of original global variables in GVMap with a use of the corresponding
+  // copies in GVMap.  If necessary, promote constants to instructions.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->isDeclaration()) {
+      continue;
+    }
+    IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+    for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+         ++BBI) {
+      for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+           ++II) {
+        for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+          Value *Operand = II->getOperand(i);
+          if (isa<Constant>(Operand)) {
+            II->setOperand(
+                i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+          }
+        }
+      }
+    }
+    ConstantToValueMap.clear();
+  }
+
+  // Walk through the metadata section and update the debug information
+  // associated with the global variables in the default address space.
+  for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+                                       E = M.named_metadata_end();
+       I != E; I++) {
+    remapNamedMDNode(&M, I);
+  }
+
+  // Walk through the global variable  initializers, and replace any use of
+  // original global variables in GVMap with a use of the corresponding copies
+  // in GVMap.  The copies need to be bitcast to the original global variable
+  // types, as we cannot use cvta in global variable initializers.
+  for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+    GlobalVariable *GV = I->first;
+    GlobalVariable *NewGV = I->second;
+    ++I;
+    Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
+    // At this point, the remaining uses of GV should be found only in global
+    // variable initializers, as other uses have been already been removed
+    // while walking through the instructions in function definitions.
+    for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
+         UI != UE;)
+      (UI++)->set(BitCastNewGV);
+    std::string Name = GV->getName();
+    GV->removeDeadConstantUsers();
+    GV->eraseFromParent();
+    NewGV->setName(Name);
+  }
+  GVMap.clear();
+
+  return true;
+}
+
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+                                      GlobalVariable *GV,
+                                      IRBuilder<> &Builder) {
+  PointerType *GVType = GV->getType();
+  Value *CVTA = nullptr;
+
+  // See if the address space conversion requires the operand to be bitcast
+  // to i8 addrspace(n)* first.
+  EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+  if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+    // A bitcast to i8 addrspace(n)* on the operand is needed.
+    LLVMContext &Context = M->getContext();
+    unsigned int AddrSpace = GVType->getAddressSpace();
+    Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+    CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+    // Insert the address space conversion.
+    Type *ResultType =
+        PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(ResultType);
+    ParamTypes.push_back(DestTy);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+    // Another bitcast from i8 * to <the element type of GVType> * is
+    // required.
+    DestTy =
+        PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+    CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+  } else {
+    // A simple CVTA is enough.
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+                                          llvm::ADDRESS_SPACE_GENERIC));
+    ParamTypes.push_back(GVType);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+  }
+
+  return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+                                    IRBuilder<> &Builder) {
+  // If the constant C has been converted already in the given function  F, just
+  // return the converted value.
+  ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+  if (CTII != ConstantToValueMap.end()) {
+    return CTII->second;
+  }
+
+  Value *NewValue = C;
+  if (isa<GlobalVariable>(C)) {
+    // If the constant C is a global variable and is found in  GVMap, generate a
+    // set set of instructions that convert the clone of C with the global
+    // address space specifier to a generic pointer.
+    // The constant C cannot be used here, as it will be erased from the
+    // module eventually.  And the clone of C with the global address space
+    // specifier cannot be used here either, as it will affect the types of
+    // other instructions in the function.  Hence, this address space conversion
+    // is required.
+    GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+    if (I != GVMap.end()) {
+      NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+    }
+  } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
+             isa<ConstantStruct>(C)) {
+    // If any element in the constant vector or aggregate C is or uses a global
+    // variable in GVMap, the constant C needs to be reconstructed, using a set
+    // of instructions.
+    NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+  } else if (isa<ConstantExpr>(C)) {
+    // If any operand in the constant expression C is or uses a global variable
+    // in GVMap, the constant expression C needs to be reconstructed, using a
+    // set of instructions.
+    NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+  }
+
+  ConstantToValueMap[C] = NewValue;
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+    Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any element is or uses a global variable in  GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the elements has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the elements has been  modified, construct the equivalent
+  // vector or aggregate value with a set instructions and the converted
+  // elements.
+  Value *NewValue = UndefValue::get(C->getType());
+  if (isa<ConstantVector>(C)) {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+      NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+    }
+  } else {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      NewValue =
+          Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+    }
+  }
+
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                                        IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any operand is or uses a global variable in  GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the operands has been modified, construct the instruction with
+  // the converted operands.
+  unsigned Opcode = C->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+    // CompareConstantExpr (icmp)
+    return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+                              NewOperands[0], NewOperands[1]);
+  case Instruction::FCmp:
+    // CompareConstantExpr (fcmp)
+    assert(false && "Address space conversion should have no effect "
+                    "on float point CompareConstantExpr (fcmp)!");
+    return C;
+  case Instruction::ExtractElement:
+    // ExtractElementConstantExpr
+    return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+  case Instruction::InsertElement:
+    // InsertElementConstantExpr
+    return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ShuffleVector:
+    // ShuffleVector
+    return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ExtractValue:
+    // ExtractValueConstantExpr
+    return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+  case Instruction::InsertValue:
+    // InsertValueConstantExpr
+    return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+                                     C->getIndices());
+  case Instruction::GetElementPtr:
+    // GetElementPtrConstantExpr
+    return cast<GEPOperator>(C)->isInBounds()
+               ? Builder.CreateGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1))
+               : Builder.CreateInBoundsGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1));
+  case Instruction::Select:
+    // SelectConstantExpr
+    return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+  default:
+    // BinaryConstantExpr
+    if (Instruction::isBinaryOp(Opcode)) {
+      return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+                                 NewOperands[0], NewOperands[1]);
+    }
+    // UnaryConstantExpr
+    if (Instruction::isCast(Opcode)) {
+      return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+                                NewOperands[0], C->getType());
+    }
+    assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
+    return C;
+  }
+}
+
+void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+
+  bool OperandChanged = false;
+  SmallVector<MDNode *, 16> NewOperands;
+  unsigned NumOperands = N->getNumOperands();
+
+  // Check if any operand is or contains a global variable in  GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    MDNode *Operand = N->getOperand(i);
+    MDNode *NewOperand = remapMDNode(M, Operand);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return immediately.
+  if (!OperandChanged) {
+    return;
+  }
+
+  // Replace the old operands with the new operands.
+  N->dropAllReferences();
+  for (SmallVectorImpl<MDNode *>::iterator I = NewOperands.begin(),
+                                           E = NewOperands.end();
+       I != E; ++I) {
+    N->addOperand(*I);
+  }
+}
+
+MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) {
+
+  bool OperandChanged = false;
+  SmallVector<Value *, 8> NewOperands;
+  unsigned NumOperands = N->getNumOperands();
+
+  // Check if any operand is or contains a global variable in  GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = N->getOperand(i);
+    Value *NewOperand = Operand;
+    if (Operand) {
+      if (isa<GlobalVariable>(Operand)) {
+        GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand));
+        if (I != GVMap.end()) {
+          NewOperand = I->second;
+          if (++i < NumOperands) {
+            NewOperands.push_back(NewOperand);
+            // Address space of the global variable follows the global variable
+            // in the global variable debug info (see createGlobalVariable in
+            // lib/Analysis/DIBuilder.cpp).
+            NewOperand =
+                ConstantInt::get(Type::getInt32Ty(M->getContext()),
+                                 I->second->getType()->getAddressSpace());
+          }
+        }
+      } else if (isa<MDNode>(Operand)) {
+        NewOperand = remapMDNode(M, cast<MDNode>(Operand));
+      }
+    }
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return N as it is.
+  if (!OperandChanged) {
+    return N;
+  }
+
+  // If any of the operands has been modified, create a new MDNode with the new
+  // operands.
+  return MDNode::get(M->getContext(), makeArrayRef(NewOperands));
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
new file mode 100644
index 0000000..05205fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -0,0 +1,5085 @@
+//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelDAGToDAG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-isel"
+
+static cl::opt<int> UsePrecDivF32(
+    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+    cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+             " IEEE Compliant F32 div.rnd if avaiable."),
+    cl::init(2));
+
+static cl::opt<bool>
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
+          cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+          cl::init(true));
+
+static cl::opt<bool>
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+           cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+           cl::init(false));
+
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into a
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                       llvm::CodeGenOpt::Level OptLevel) {
+  return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                                     CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(tm, OptLevel),
+      Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
+  doMulWide = (OptLevel > 0);
+}
+
+int NVPTXDAGToDAGISel::getDivF32Level() const {
+  if (UsePrecDivF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-div32=N is used on the command-line, always honor it
+    return UsePrecDivF32;
+  } else {
+    // Otherwise, use div.approx if fast math is enabled
+    if (TM.Options.UnsafeFPMath)
+      return 0;
+    else
+      return 2;
+  }
+}
+
+bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
+  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+    return UsePrecSqrtF32;
+  } else {
+    // Otherwise, use sqrt.approx if fast math is enabled
+    if (TM.Options.UnsafeFPMath)
+      return false;
+    else
+      return true;
+  }
+}
+
+bool NVPTXDAGToDAGISel::useF32FTZ() const {
+  if (FtzEnabled.getNumOccurrences() > 0) {
+    // If nvptx-f32ftz is used on the command-line, always honor it
+    return FtzEnabled;
+  } else {
+    const Function *F = MF->getFunction();
+    // Otherwise, check for an nvptx-f32ftz attribute on the function
+    if (F->hasFnAttribute("nvptx-f32ftz"))
+      return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
+                                              "nvptx-f32ftz")
+                                              .getValueAsString() == "true");
+    else
+      return false;
+  }
+}
+
+bool NVPTXDAGToDAGISel::allowFMA() const {
+  const NVPTXTargetLowering *TL = Subtarget.getTargetLowering();
+  return TL->allowFMA(*MF, OptLevel);
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return nullptr; // Already selected.
+  }
+
+  SDNode *ResNode = nullptr;
+  switch (N->getOpcode()) {
+  case ISD::LOAD:
+    ResNode = SelectLoad(N);
+    break;
+  case ISD::STORE:
+    ResNode = SelectStore(N);
+    break;
+  case NVPTXISD::LoadV2:
+  case NVPTXISD::LoadV4:
+    ResNode = SelectLoadVector(N);
+    break;
+  case NVPTXISD::LDGV2:
+  case NVPTXISD::LDGV4:
+  case NVPTXISD::LDUV2:
+  case NVPTXISD::LDUV4:
+    ResNode = SelectLDGLDU(N);
+    break;
+  case NVPTXISD::StoreV2:
+  case NVPTXISD::StoreV4:
+    ResNode = SelectStoreVector(N);
+    break;
+  case NVPTXISD::LoadParam:
+  case NVPTXISD::LoadParamV2:
+  case NVPTXISD::LoadParamV4:
+    ResNode = SelectLoadParam(N);
+    break;
+  case NVPTXISD::StoreRetval:
+  case NVPTXISD::StoreRetvalV2:
+  case NVPTXISD::StoreRetvalV4:
+    ResNode = SelectStoreRetval(N);
+    break;
+  case NVPTXISD::StoreParam:
+  case NVPTXISD::StoreParamV2:
+  case NVPTXISD::StoreParamV4:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParamU32:
+    ResNode = SelectStoreParam(N);
+    break;
+  case ISD::INTRINSIC_WO_CHAIN:
+    ResNode = SelectIntrinsicNoChain(N);
+    break;
+  case ISD::INTRINSIC_W_CHAIN:
+    ResNode = SelectIntrinsicChain(N);
+    break;
+  case NVPTXISD::Tex1DFloatS32:
+  case NVPTXISD::Tex1DFloatFloat:
+  case NVPTXISD::Tex1DFloatFloatLevel:
+  case NVPTXISD::Tex1DFloatFloatGrad:
+  case NVPTXISD::Tex1DS32S32:
+  case NVPTXISD::Tex1DS32Float:
+  case NVPTXISD::Tex1DS32FloatLevel:
+  case NVPTXISD::Tex1DS32FloatGrad:
+  case NVPTXISD::Tex1DU32S32:
+  case NVPTXISD::Tex1DU32Float:
+  case NVPTXISD::Tex1DU32FloatLevel:
+  case NVPTXISD::Tex1DU32FloatGrad:
+  case NVPTXISD::Tex1DArrayFloatS32:
+  case NVPTXISD::Tex1DArrayFloatFloat:
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+  case NVPTXISD::Tex1DArrayS32S32:
+  case NVPTXISD::Tex1DArrayS32Float:
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+  case NVPTXISD::Tex1DArrayU32S32:
+  case NVPTXISD::Tex1DArrayU32Float:
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+  case NVPTXISD::Tex2DFloatS32:
+  case NVPTXISD::Tex2DFloatFloat:
+  case NVPTXISD::Tex2DFloatFloatLevel:
+  case NVPTXISD::Tex2DFloatFloatGrad:
+  case NVPTXISD::Tex2DS32S32:
+  case NVPTXISD::Tex2DS32Float:
+  case NVPTXISD::Tex2DS32FloatLevel:
+  case NVPTXISD::Tex2DS32FloatGrad:
+  case NVPTXISD::Tex2DU32S32:
+  case NVPTXISD::Tex2DU32Float:
+  case NVPTXISD::Tex2DU32FloatLevel:
+  case NVPTXISD::Tex2DU32FloatGrad:
+  case NVPTXISD::Tex2DArrayFloatS32:
+  case NVPTXISD::Tex2DArrayFloatFloat:
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+  case NVPTXISD::Tex2DArrayS32S32:
+  case NVPTXISD::Tex2DArrayS32Float:
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+  case NVPTXISD::Tex2DArrayU32S32:
+  case NVPTXISD::Tex2DArrayU32Float:
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+  case NVPTXISD::Tex3DFloatS32:
+  case NVPTXISD::Tex3DFloatFloat:
+  case NVPTXISD::Tex3DFloatFloatLevel:
+  case NVPTXISD::Tex3DFloatFloatGrad:
+  case NVPTXISD::Tex3DS32S32:
+  case NVPTXISD::Tex3DS32Float:
+  case NVPTXISD::Tex3DS32FloatLevel:
+  case NVPTXISD::Tex3DS32FloatGrad:
+  case NVPTXISD::Tex3DU32S32:
+  case NVPTXISD::Tex3DU32Float:
+  case NVPTXISD::Tex3DU32FloatLevel:
+  case NVPTXISD::Tex3DU32FloatGrad:
+  case NVPTXISD::TexCubeFloatFloat:
+  case NVPTXISD::TexCubeFloatFloatLevel:
+  case NVPTXISD::TexCubeS32Float:
+  case NVPTXISD::TexCubeS32FloatLevel:
+  case NVPTXISD::TexCubeU32Float:
+  case NVPTXISD::TexCubeU32FloatLevel:
+  case NVPTXISD::TexCubeArrayFloatFloat:
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+  case NVPTXISD::TexCubeArrayS32Float:
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+  case NVPTXISD::TexCubeArrayU32Float:
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+  case NVPTXISD::Tld4R2DFloatFloat:
+  case NVPTXISD::Tld4G2DFloatFloat:
+  case NVPTXISD::Tld4B2DFloatFloat:
+  case NVPTXISD::Tld4A2DFloatFloat:
+  case NVPTXISD::Tld4R2DS64Float:
+  case NVPTXISD::Tld4G2DS64Float:
+  case NVPTXISD::Tld4B2DS64Float:
+  case NVPTXISD::Tld4A2DS64Float:
+  case NVPTXISD::Tld4R2DU64Float:
+  case NVPTXISD::Tld4G2DU64Float:
+  case NVPTXISD::Tld4B2DU64Float:
+  case NVPTXISD::Tld4A2DU64Float:
+  case NVPTXISD::TexUnified1DFloatS32:
+  case NVPTXISD::TexUnified1DFloatFloat:
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+  case NVPTXISD::TexUnified1DS32S32:
+  case NVPTXISD::TexUnified1DS32Float:
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+  case NVPTXISD::TexUnified1DU32S32:
+  case NVPTXISD::TexUnified1DU32Float:
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+  case NVPTXISD::TexUnified1DArrayS32S32:
+  case NVPTXISD::TexUnified1DArrayS32Float:
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+  case NVPTXISD::TexUnified1DArrayU32S32:
+  case NVPTXISD::TexUnified1DArrayU32Float:
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+  case NVPTXISD::TexUnified2DFloatS32:
+  case NVPTXISD::TexUnified2DFloatFloat:
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+  case NVPTXISD::TexUnified2DS32S32:
+  case NVPTXISD::TexUnified2DS32Float:
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+  case NVPTXISD::TexUnified2DU32S32:
+  case NVPTXISD::TexUnified2DU32Float:
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+  case NVPTXISD::TexUnified2DArrayS32S32:
+  case NVPTXISD::TexUnified2DArrayS32Float:
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+  case NVPTXISD::TexUnified2DArrayU32S32:
+  case NVPTXISD::TexUnified2DArrayU32Float:
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+  case NVPTXISD::TexUnified3DFloatS32:
+  case NVPTXISD::TexUnified3DFloatFloat:
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+  case NVPTXISD::TexUnified3DS32S32:
+  case NVPTXISD::TexUnified3DS32Float:
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+  case NVPTXISD::TexUnified3DU32S32:
+  case NVPTXISD::TexUnified3DU32Float:
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+  case NVPTXISD::TexUnifiedCubeS32Float:
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeU32Float:
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    ResNode = SelectTextureIntrinsic(N);
+    break;
+  case NVPTXISD::Suld1DI8Clamp:
+  case NVPTXISD::Suld1DI16Clamp:
+  case NVPTXISD::Suld1DI32Clamp:
+  case NVPTXISD::Suld1DI64Clamp:
+  case NVPTXISD::Suld1DV2I8Clamp:
+  case NVPTXISD::Suld1DV2I16Clamp:
+  case NVPTXISD::Suld1DV2I32Clamp:
+  case NVPTXISD::Suld1DV2I64Clamp:
+  case NVPTXISD::Suld1DV4I8Clamp:
+  case NVPTXISD::Suld1DV4I16Clamp:
+  case NVPTXISD::Suld1DV4I32Clamp:
+  case NVPTXISD::Suld1DArrayI8Clamp:
+  case NVPTXISD::Suld1DArrayI16Clamp:
+  case NVPTXISD::Suld1DArrayI32Clamp:
+  case NVPTXISD::Suld1DArrayI64Clamp:
+  case NVPTXISD::Suld1DArrayV2I8Clamp:
+  case NVPTXISD::Suld1DArrayV2I16Clamp:
+  case NVPTXISD::Suld1DArrayV2I32Clamp:
+  case NVPTXISD::Suld1DArrayV2I64Clamp:
+  case NVPTXISD::Suld1DArrayV4I8Clamp:
+  case NVPTXISD::Suld1DArrayV4I16Clamp:
+  case NVPTXISD::Suld1DArrayV4I32Clamp:
+  case NVPTXISD::Suld2DI8Clamp:
+  case NVPTXISD::Suld2DI16Clamp:
+  case NVPTXISD::Suld2DI32Clamp:
+  case NVPTXISD::Suld2DI64Clamp:
+  case NVPTXISD::Suld2DV2I8Clamp:
+  case NVPTXISD::Suld2DV2I16Clamp:
+  case NVPTXISD::Suld2DV2I32Clamp:
+  case NVPTXISD::Suld2DV2I64Clamp:
+  case NVPTXISD::Suld2DV4I8Clamp:
+  case NVPTXISD::Suld2DV4I16Clamp:
+  case NVPTXISD::Suld2DV4I32Clamp:
+  case NVPTXISD::Suld2DArrayI8Clamp:
+  case NVPTXISD::Suld2DArrayI16Clamp:
+  case NVPTXISD::Suld2DArrayI32Clamp:
+  case NVPTXISD::Suld2DArrayI64Clamp:
+  case NVPTXISD::Suld2DArrayV2I8Clamp:
+  case NVPTXISD::Suld2DArrayV2I16Clamp:
+  case NVPTXISD::Suld2DArrayV2I32Clamp:
+  case NVPTXISD::Suld2DArrayV2I64Clamp:
+  case NVPTXISD::Suld2DArrayV4I8Clamp:
+  case NVPTXISD::Suld2DArrayV4I16Clamp:
+  case NVPTXISD::Suld2DArrayV4I32Clamp:
+  case NVPTXISD::Suld3DI8Clamp:
+  case NVPTXISD::Suld3DI16Clamp:
+  case NVPTXISD::Suld3DI32Clamp:
+  case NVPTXISD::Suld3DI64Clamp:
+  case NVPTXISD::Suld3DV2I8Clamp:
+  case NVPTXISD::Suld3DV2I16Clamp:
+  case NVPTXISD::Suld3DV2I32Clamp:
+  case NVPTXISD::Suld3DV2I64Clamp:
+  case NVPTXISD::Suld3DV4I8Clamp:
+  case NVPTXISD::Suld3DV4I16Clamp:
+  case NVPTXISD::Suld3DV4I32Clamp:
+  case NVPTXISD::Suld1DI8Trap:
+  case NVPTXISD::Suld1DI16Trap:
+  case NVPTXISD::Suld1DI32Trap:
+  case NVPTXISD::Suld1DI64Trap:
+  case NVPTXISD::Suld1DV2I8Trap:
+  case NVPTXISD::Suld1DV2I16Trap:
+  case NVPTXISD::Suld1DV2I32Trap:
+  case NVPTXISD::Suld1DV2I64Trap:
+  case NVPTXISD::Suld1DV4I8Trap:
+  case NVPTXISD::Suld1DV4I16Trap:
+  case NVPTXISD::Suld1DV4I32Trap:
+  case NVPTXISD::Suld1DArrayI8Trap:
+  case NVPTXISD::Suld1DArrayI16Trap:
+  case NVPTXISD::Suld1DArrayI32Trap:
+  case NVPTXISD::Suld1DArrayI64Trap:
+  case NVPTXISD::Suld1DArrayV2I8Trap:
+  case NVPTXISD::Suld1DArrayV2I16Trap:
+  case NVPTXISD::Suld1DArrayV2I32Trap:
+  case NVPTXISD::Suld1DArrayV2I64Trap:
+  case NVPTXISD::Suld1DArrayV4I8Trap:
+  case NVPTXISD::Suld1DArrayV4I16Trap:
+  case NVPTXISD::Suld1DArrayV4I32Trap:
+  case NVPTXISD::Suld2DI8Trap:
+  case NVPTXISD::Suld2DI16Trap:
+  case NVPTXISD::Suld2DI32Trap:
+  case NVPTXISD::Suld2DI64Trap:
+  case NVPTXISD::Suld2DV2I8Trap:
+  case NVPTXISD::Suld2DV2I16Trap:
+  case NVPTXISD::Suld2DV2I32Trap:
+  case NVPTXISD::Suld2DV2I64Trap:
+  case NVPTXISD::Suld2DV4I8Trap:
+  case NVPTXISD::Suld2DV4I16Trap:
+  case NVPTXISD::Suld2DV4I32Trap:
+  case NVPTXISD::Suld2DArrayI8Trap:
+  case NVPTXISD::Suld2DArrayI16Trap:
+  case NVPTXISD::Suld2DArrayI32Trap:
+  case NVPTXISD::Suld2DArrayI64Trap:
+  case NVPTXISD::Suld2DArrayV2I8Trap:
+  case NVPTXISD::Suld2DArrayV2I16Trap:
+  case NVPTXISD::Suld2DArrayV2I32Trap:
+  case NVPTXISD::Suld2DArrayV2I64Trap:
+  case NVPTXISD::Suld2DArrayV4I8Trap:
+  case NVPTXISD::Suld2DArrayV4I16Trap:
+  case NVPTXISD::Suld2DArrayV4I32Trap:
+  case NVPTXISD::Suld3DI8Trap:
+  case NVPTXISD::Suld3DI16Trap:
+  case NVPTXISD::Suld3DI32Trap:
+  case NVPTXISD::Suld3DI64Trap:
+  case NVPTXISD::Suld3DV2I8Trap:
+  case NVPTXISD::Suld3DV2I16Trap:
+  case NVPTXISD::Suld3DV2I32Trap:
+  case NVPTXISD::Suld3DV2I64Trap:
+  case NVPTXISD::Suld3DV4I8Trap:
+  case NVPTXISD::Suld3DV4I16Trap:
+  case NVPTXISD::Suld3DV4I32Trap:
+  case NVPTXISD::Suld1DI8Zero:
+  case NVPTXISD::Suld1DI16Zero:
+  case NVPTXISD::Suld1DI32Zero:
+  case NVPTXISD::Suld1DI64Zero:
+  case NVPTXISD::Suld1DV2I8Zero:
+  case NVPTXISD::Suld1DV2I16Zero:
+  case NVPTXISD::Suld1DV2I32Zero:
+  case NVPTXISD::Suld1DV2I64Zero:
+  case NVPTXISD::Suld1DV4I8Zero:
+  case NVPTXISD::Suld1DV4I16Zero:
+  case NVPTXISD::Suld1DV4I32Zero:
+  case NVPTXISD::Suld1DArrayI8Zero:
+  case NVPTXISD::Suld1DArrayI16Zero:
+  case NVPTXISD::Suld1DArrayI32Zero:
+  case NVPTXISD::Suld1DArrayI64Zero:
+  case NVPTXISD::Suld1DArrayV2I8Zero:
+  case NVPTXISD::Suld1DArrayV2I16Zero:
+  case NVPTXISD::Suld1DArrayV2I32Zero:
+  case NVPTXISD::Suld1DArrayV2I64Zero:
+  case NVPTXISD::Suld1DArrayV4I8Zero:
+  case NVPTXISD::Suld1DArrayV4I16Zero:
+  case NVPTXISD::Suld1DArrayV4I32Zero:
+  case NVPTXISD::Suld2DI8Zero:
+  case NVPTXISD::Suld2DI16Zero:
+  case NVPTXISD::Suld2DI32Zero:
+  case NVPTXISD::Suld2DI64Zero:
+  case NVPTXISD::Suld2DV2I8Zero:
+  case NVPTXISD::Suld2DV2I16Zero:
+  case NVPTXISD::Suld2DV2I32Zero:
+  case NVPTXISD::Suld2DV2I64Zero:
+  case NVPTXISD::Suld2DV4I8Zero:
+  case NVPTXISD::Suld2DV4I16Zero:
+  case NVPTXISD::Suld2DV4I32Zero:
+  case NVPTXISD::Suld2DArrayI8Zero:
+  case NVPTXISD::Suld2DArrayI16Zero:
+  case NVPTXISD::Suld2DArrayI32Zero:
+  case NVPTXISD::Suld2DArrayI64Zero:
+  case NVPTXISD::Suld2DArrayV2I8Zero:
+  case NVPTXISD::Suld2DArrayV2I16Zero:
+  case NVPTXISD::Suld2DArrayV2I32Zero:
+  case NVPTXISD::Suld2DArrayV2I64Zero:
+  case NVPTXISD::Suld2DArrayV4I8Zero:
+  case NVPTXISD::Suld2DArrayV4I16Zero:
+  case NVPTXISD::Suld2DArrayV4I32Zero:
+  case NVPTXISD::Suld3DI8Zero:
+  case NVPTXISD::Suld3DI16Zero:
+  case NVPTXISD::Suld3DI32Zero:
+  case NVPTXISD::Suld3DI64Zero:
+  case NVPTXISD::Suld3DV2I8Zero:
+  case NVPTXISD::Suld3DV2I16Zero:
+  case NVPTXISD::Suld3DV2I32Zero:
+  case NVPTXISD::Suld3DV2I64Zero:
+  case NVPTXISD::Suld3DV4I8Zero:
+  case NVPTXISD::Suld3DV4I16Zero:
+  case NVPTXISD::Suld3DV4I32Zero:
+    ResNode = SelectSurfaceIntrinsic(N);
+    break;
+  case ISD::AND:
+  case ISD::SRA:
+  case ISD::SRL:
+    // Try to select BFE
+    ResNode = SelectBFE(N);
+    break;
+  case ISD::ADDRSPACECAST:
+    ResNode = SelectAddrSpaceCast(N);
+    break;
+  default:
+    break;
+  }
+  if (ResNode)
+    return ResNode;
+  return SelectCode(N);
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IID) {
+  default:
+    return NULL;
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_p:
+    return SelectLDGLDU(N);
+  }
+}
+
+static unsigned int getCodeAddrSpace(MemSDNode *N,
+                                     const NVPTXSubtarget &Subtarget) {
+  const Value *Src = N->getMemOperand()->getValue();
+
+  if (!Src)
+    return NVPTX::PTXLdStInstCode::GENERIC;
+
+  if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
+    switch (PT->getAddressSpace()) {
+    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
+    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
+    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
+    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
+    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
+    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+    default: break;
+    }
+  }
+  return NVPTX::PTXLdStInstCode::GENERIC;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IID) {
+  default:
+    return nullptr;
+  case Intrinsic::nvvm_texsurf_handle_internal:
+    return SelectTexSurfHandle(N);
+  }
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+  // Op 0 is the intrinsic ID
+  SDValue Wrapper = N->getOperand(1);
+  SDValue GlobalVal = Wrapper.getOperand(0);
+  return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64,
+                                GlobalVal);
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+  SDValue Src = N->getOperand(0);
+  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
+  unsigned DstAddrSpace = CastN->getDestAddressSpace();
+
+  assert(SrcAddrSpace != DstAddrSpace &&
+         "addrspacecast must be between different address spaces");
+
+  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
+    // Specific to generic
+    unsigned Opc;
+    switch (SrcAddrSpace) {
+    default: report_fatal_error("Bad address space in addrspacecast");
+    case ADDRESS_SPACE_GLOBAL:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_global_yes_64
+                                : NVPTX::cvta_global_yes;
+      break;
+    case ADDRESS_SPACE_SHARED:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_shared_yes_64
+                                : NVPTX::cvta_shared_yes;
+      break;
+    case ADDRESS_SPACE_CONST:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_const_yes_64
+                                : NVPTX::cvta_const_yes;
+      break;
+    case ADDRESS_SPACE_LOCAL:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_local_yes_64
+                                : NVPTX::cvta_local_yes;
+      break;
+    }
+    return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+  } else {
+    // Generic to specific
+    if (SrcAddrSpace != 0)
+      report_fatal_error("Cannot cast between two non-generic address spaces");
+    unsigned Opc;
+    switch (DstAddrSpace) {
+    default: report_fatal_error("Bad address space in addrspacecast");
+    case ADDRESS_SPACE_GLOBAL:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_global_yes_64
+                                : NVPTX::cvta_to_global_yes;
+      break;
+    case ADDRESS_SPACE_SHARED:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+                                : NVPTX::cvta_to_shared_yes;
+      break;
+    case ADDRESS_SPACE_CONST:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_const_yes_64
+                                : NVPTX::cvta_to_const_yes;
+      break;
+    case ADDRESS_SPACE_LOCAL:
+      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_local_yes_64
+                                : NVPTX::cvta_to_local_yes;
+      break;
+    }
+    return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+  }
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
+  SDLoc dl(N);
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT LoadedVT = LD->getMemoryVT();
+  SDNode *NVPTXLD = nullptr;
+
+  // do not support pre/post inc/dec
+  if (LD->isIndexed())
+    return nullptr;
+
+  if (!LoadedVT.isSimple())
+    return nullptr;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool isVolatile = LD->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return nullptr;
+  }
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+  unsigned int fromType;
+  if ((LD->getExtensionType() == ISD::SEXTLOAD))
+    fromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    fromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    fromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Addr;
+  SDValue Offset, Base;
+  unsigned Opcode;
+  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
+
+  if (SelectDirectAddr(N1, Addr)) {
+    switch (TargetVT) {
+    case MVT::i8:
+      Opcode = NVPTX::LD_i8_avar;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::LD_i16_avar;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::LD_i32_avar;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::LD_i64_avar;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::LD_f32_avar;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::LD_f64_avar;
+      break;
+    default:
+      return nullptr;
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), Addr, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+                 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+    switch (TargetVT) {
+    case MVT::i8:
+      Opcode = NVPTX::LD_i8_asi;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::LD_i16_asi;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::LD_i32_asi;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::LD_i64_asi;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::LD_f32_asi;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::LD_f64_asi;
+      break;
+    default:
+      return nullptr;
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), Base, Offset, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+                 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_ari_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_ari_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_ari_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_ari_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_ari_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_ari_64;
+        break;
+      default:
+        return nullptr;
+      }
+    } else {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_ari;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_ari;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_ari;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_ari;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_ari;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_ari;
+        break;
+      default:
+        return nullptr;
+      }
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), Base, Offset, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_areg_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_areg_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_areg_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_areg_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_areg_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_areg_64;
+        break;
+      default:
+        return nullptr;
+      }
+    } else {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_areg;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_areg;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_areg;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_areg;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_areg;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_areg;
+        break;
+      default:
+        return nullptr;
+      }
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), N1, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  }
+
+  if (NVPTXLD) {
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+    cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+  }
+
+  return NVPTXLD;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  unsigned Opcode;
+  SDLoc DL(N);
+  SDNode *LD;
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT LoadedVT = MemSD->getMemoryVT();
+
+  if (!LoadedVT.isSimple())
+    return nullptr;
+
+  // Address Space Setting
+  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+  unsigned int FromType;
+  // The last operand holds the original LoadSDNode::getExtensionType() value
+  unsigned ExtensionType = cast<ConstantSDNode>(
+      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
+  if (ExtensionType == ISD::SEXTLOAD)
+    FromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    FromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    FromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  unsigned VecType;
+
+  switch (N->getOpcode()) {
+  case NVPTXISD::LoadV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    break;
+  case NVPTXISD::LoadV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    break;
+  default:
+    return nullptr;
+  }
+
+  EVT EltVT = N->getValueType(0);
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return nullptr;
+    case NVPTXISD::LoadV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v2_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v2_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v2_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LDV_i64_v2_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v2_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LDV_f64_v2_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LoadV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v4_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v4_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v4_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v4_avar;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Addr, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+    switch (N->getOpcode()) {
+    default:
+      return nullptr;
+    case NVPTXISD::LoadV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v2_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v2_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v2_asi;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LDV_i64_v2_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v2_asi;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LDV_f64_v2_asi;
+        break;
+      }
+      break;
+    case NVPTXISD::LoadV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v4_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v4_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v4_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v4_asi;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Base, Offset, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_ari_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_ari_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_ari_64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_ari_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_ari;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_ari;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_ari;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_ari;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Base, Offset, Chain };
+
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_areg_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_areg_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_areg_64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_areg_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_areg;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_areg;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_areg;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_areg;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Op1, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  }
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return LD;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1;
+  MemSDNode *Mem;
+  bool IsLDG = true;
+
+  // If this is an LDG intrinsic, the address is the third operand. Its its an
+  // LDG/LDU SD node (from custom vector handling), then its the second operand
+  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    Op1 = N->getOperand(2);
+    Mem = cast<MemIntrinsicSDNode>(N);
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IID) {
+    default:
+      return NULL;
+    case Intrinsic::nvvm_ldg_global_f:
+    case Intrinsic::nvvm_ldg_global_i:
+    case Intrinsic::nvvm_ldg_global_p:
+      IsLDG = true;
+      break;
+    case Intrinsic::nvvm_ldu_global_f:
+    case Intrinsic::nvvm_ldu_global_i:
+    case Intrinsic::nvvm_ldu_global_p:
+      IsLDG = false;
+      break;
+    }
+  } else {
+    Op1 = N->getOperand(1);
+    Mem = cast<MemSDNode>(N);
+  }
+
+  unsigned Opcode;
+  SDLoc DL(N);
+  SDNode *LD;
+  SDValue Base, Offset, Addr;
+
+  EVT EltVT = Mem->getMemoryVT();
+  if (EltVT.isVector()) {
+    EltVT = EltVT.getVectorElementType();
+  }
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return nullptr;
+    case ISD::INTRINSIC_W_CHAIN:
+      if (IsLDG) {
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
+          break;
+        }
+      } else {
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
+          break;
+        }
+      }
+      break;
+    case NVPTXISD::LDGV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LDGV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { Addr, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { Base, Offset, Chain };
+
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return nullptr;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { Op1, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  }
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = Mem->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return LD;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
+  SDLoc dl(N);
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  EVT StoreVT = ST->getMemoryVT();
+  SDNode *NVPTXST = nullptr;
+
+  // do not support pre/post inc/dec
+  if (ST->isIndexed())
+    return nullptr;
+
+  if (!StoreVT.isSimple())
+    return nullptr;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool isVolatile = ST->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = StoreVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return nullptr;
+  }
+
+  // Type Setting: toType + toTypeWidth
+  // - for integer type, always use 'u'
+  //
+  MVT ScalarVT = SimpleVT.getScalarType();
+  unsigned toTypeWidth = ScalarVT.getSizeInBits();
+  unsigned int toType;
+  if (ScalarVT.isFloatingPoint())
+    toType = NVPTX::PTXLdStInstCode::Float;
+  else
+    toType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDValue Addr;
+  SDValue Offset, Base;
+  unsigned Opcode;
+  MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+
+  if (SelectDirectAddr(N2, Addr)) {
+    switch (SourceVT) {
+    case MVT::i8:
+      Opcode = NVPTX::ST_i8_avar;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::ST_i16_avar;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::ST_i32_avar;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::ST_i64_avar;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::ST_f32_avar;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::ST_f64_avar;
+      break;
+    default:
+      return nullptr;
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), Addr, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+    switch (SourceVT) {
+    case MVT::i8:
+      Opcode = NVPTX::ST_i8_asi;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::ST_i16_asi;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::ST_i32_asi;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::ST_i64_asi;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::ST_f32_asi;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::ST_f64_asi;
+      break;
+    default:
+      return nullptr;
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), Base, Offset, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_ari_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_ari_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_ari_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_ari_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_ari_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_ari_64;
+        break;
+      default:
+        return nullptr;
+      }
+    } else {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_ari;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_ari;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_ari;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_ari;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_ari;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_ari;
+        break;
+      default:
+        return nullptr;
+      }
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), Base, Offset, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_areg_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_areg_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_areg_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_areg_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_areg_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_areg_64;
+        break;
+      default:
+        return nullptr;
+      }
+    } else {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_areg;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_areg;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_areg;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_areg;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_areg;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_areg;
+        break;
+      default:
+        return nullptr;
+      }
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), N2, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  }
+
+  if (NVPTXST) {
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+    cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+  }
+
+  return NVPTXST;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  unsigned Opcode;
+  SDLoc DL(N);
+  SDNode *ST;
+  EVT EltVT = Op1.getValueType();
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT StoreVT = MemSD->getMemoryVT();
+
+  // Address Space Setting
+  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
+    report_fatal_error("Cannot store to pointer that points to constant "
+                       "memory space");
+  }
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  // Type Setting: toType + toTypeWidth
+  // - for integer type, always use 'u'
+  assert(StoreVT.isSimple() && "Store value is not simple");
+  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
+  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
+  unsigned ToType;
+  if (ScalarVT.isFloatingPoint())
+    ToType = NVPTX::PTXLdStInstCode::Float;
+  else
+    ToType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  SmallVector<SDValue, 12> StOps;
+  SDValue N2;
+  unsigned VecType;
+
+  switch (N->getOpcode()) {
+  case NVPTXISD::StoreV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    StOps.push_back(N->getOperand(1));
+    StOps.push_back(N->getOperand(2));
+    N2 = N->getOperand(3);
+    break;
+  case NVPTXISD::StoreV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    StOps.push_back(N->getOperand(1));
+    StOps.push_back(N->getOperand(2));
+    StOps.push_back(N->getOperand(3));
+    StOps.push_back(N->getOperand(4));
+    N2 = N->getOperand(5);
+    break;
+  default:
+    return nullptr;
+  }
+
+  StOps.push_back(getI32Imm(IsVolatile));
+  StOps.push_back(getI32Imm(CodeAddrSpace));
+  StOps.push_back(getI32Imm(VecType));
+  StOps.push_back(getI32Imm(ToType));
+  StOps.push_back(getI32Imm(ToTypeWidth));
+
+  if (SelectDirectAddr(N2, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return nullptr;
+    case NVPTXISD::StoreV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v2_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v2_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v2_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::STV_i64_v2_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v2_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::STV_f64_v2_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::StoreV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v4_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v4_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v4_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v4_avar;
+        break;
+      }
+      break;
+    }
+    StOps.push_back(Addr);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+    switch (N->getOpcode()) {
+    default:
+      return nullptr;
+    case NVPTXISD::StoreV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v2_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v2_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v2_asi;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::STV_i64_v2_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v2_asi;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::STV_f64_v2_asi;
+        break;
+      }
+      break;
+    case NVPTXISD::StoreV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v4_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v4_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v4_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v4_asi;
+        break;
+      }
+      break;
+    }
+    StOps.push_back(Base);
+    StOps.push_back(Offset);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_ari_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_ari_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_ari_64;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_ari_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_ari;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_ari;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_ari;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_ari;
+          break;
+        }
+        break;
+      }
+    }
+    StOps.push_back(Base);
+    StOps.push_back(Offset);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_areg_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_areg_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_areg_64;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_areg_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return nullptr;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_areg;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_areg;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_areg;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return nullptr;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_areg;
+          break;
+        }
+        break;
+      }
+    }
+    StOps.push_back(N2);
+  }
+
+  StOps.push_back(Chain);
+
+  ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return ST;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue Offset = Node->getOperand(2);
+  SDValue Flag = Node->getOperand(3);
+  SDLoc DL(Node);
+  MemSDNode *Mem = cast<MemSDNode>(Node);
+
+  unsigned VecSize;
+  switch (Node->getOpcode()) {
+  default:
+    return nullptr;
+  case NVPTXISD::LoadParam:
+    VecSize = 1;
+    break;
+  case NVPTXISD::LoadParamV2:
+    VecSize = 2;
+    break;
+  case NVPTXISD::LoadParamV4:
+    VecSize = 4;
+    break;
+  }
+
+  EVT EltVT = Node->getValueType(0);
+  EVT MemVT = Mem->getMemoryVT();
+
+  unsigned Opc = 0;
+
+  switch (VecSize) {
+  default:
+    return nullptr;
+  case 1:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return nullptr;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemI8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemI8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemI16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemI32;
+      break;
+    case MVT::i64:
+      Opc = NVPTX::LoadParamMemI64;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemF32;
+      break;
+    case MVT::f64:
+      Opc = NVPTX::LoadParamMemF64;
+      break;
+    }
+    break;
+  case 2:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return nullptr;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemV2I8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemV2I8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemV2I16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemV2I32;
+      break;
+    case MVT::i64:
+      Opc = NVPTX::LoadParamMemV2I64;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemV2F32;
+      break;
+    case MVT::f64:
+      Opc = NVPTX::LoadParamMemV2F64;
+      break;
+    }
+    break;
+  case 4:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return nullptr;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemV4I8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemV4I8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemV4I16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemV4I32;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemV4F32;
+      break;
+    }
+    break;
+  }
+
+  SDVTList VTs;
+  if (VecSize == 1) {
+    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
+  } else if (VecSize == 2) {
+    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
+  } else {
+    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
+    VTs = CurDAG->getVTList(EVTs);
+  }
+
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Offset = N->getOperand(1);
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+
+  // How many elements do we have?
+  unsigned NumElts = 1;
+  switch (N->getOpcode()) {
+  default:
+    return nullptr;
+  case NVPTXISD::StoreRetval:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreRetvalV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreRetvalV4:
+    NumElts = 4;
+    break;
+  }
+
+  // Build vector of operands
+  SmallVector<SDValue, 6> Ops;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Ops.push_back(N->getOperand(i + 2));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+  Ops.push_back(Chain);
+
+  // Determine target opcode
+  // If we have an i1, use an 8-bit store. The lowering code in
+  // NVPTXISelLowering will have already emitted an upcast.
+  unsigned Opcode = 0;
+  switch (NumElts) {
+  default:
+    return nullptr;
+  case 1:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return nullptr;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalI8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalI8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalI16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalI32;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::StoreRetvalI64;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalF32;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::StoreRetvalF64;
+      break;
+    }
+    break;
+  case 2:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return nullptr;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalV2I8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalV2I8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalV2I16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalV2I32;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::StoreRetvalV2I64;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalV2F32;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::StoreRetvalV2F64;
+      break;
+    }
+    break;
+  case 4:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return nullptr;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalV4I8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalV4I8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalV4I16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalV4I32;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalV4F32;
+      break;
+    }
+    break;
+  }
+
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Param = N->getOperand(1);
+  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
+  SDValue Offset = N->getOperand(2);
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
+
+  // How many elements do we have?
+  unsigned NumElts = 1;
+  switch (N->getOpcode()) {
+  default:
+    return nullptr;
+  case NVPTXISD::StoreParamU32:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParam:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreParamV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreParamV4:
+    NumElts = 4;
+    break;
+  }
+
+  // Build vector of operands
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Ops.push_back(N->getOperand(i + 3));
+  Ops.push_back(CurDAG->getTargetConstant(ParamVal, MVT::i32));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  // Determine target opcode
+  // If we have an i1, use an 8-bit store. The lowering code in
+  // NVPTXISelLowering will have already emitted an upcast.
+  unsigned Opcode = 0;
+  switch (N->getOpcode()) {
+  default:
+    switch (NumElts) {
+    default:
+      return nullptr;
+    case 1:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamI16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamI32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamI64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamF32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamF64;
+        break;
+      }
+      break;
+    case 2:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV2I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV2I32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamV2I64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV2F32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamV2F64;
+        break;
+      }
+      break;
+    case 4:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return nullptr;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV4I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV4I32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV4F32;
+        break;
+      }
+      break;
+    }
+    break;
+  // Special case: if we have a sign-extend/zero-extend node, insert the
+  // conversion instruction first, and use that as the value operand to
+  // the selected StoreParam node.
+  case NVPTXISD::StoreParamU32: {
+    Opcode = NVPTX::StoreParamI32;
+    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                                MVT::i32);
+    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
+                                         MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+    break;
+  }
+  case NVPTXISD::StoreParamS32: {
+    Opcode = NVPTX::StoreParamI32;
+    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                                MVT::i32);
+    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
+                                         MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+    break;
+  }
+  }
+
+  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDNode *Ret = nullptr;
+  unsigned Opc = 0;
+  SmallVector<SDValue, 8> Ops;
+
+  switch (N->getOpcode()) {
+  default: return nullptr;
+  case NVPTXISD::Tex1DFloatS32:
+    Opc = NVPTX::TEX_1D_F32_S32;
+    break;
+  case NVPTXISD::Tex1DFloatFloat:
+    Opc = NVPTX::TEX_1D_F32_F32;
+    break;
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    Opc = NVPTX::TEX_1D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DS32S32:
+    Opc = NVPTX::TEX_1D_S32_S32;
+    break;
+  case NVPTXISD::Tex1DS32Float:
+    Opc = NVPTX::TEX_1D_S32_F32;
+    break;
+  case NVPTXISD::Tex1DS32FloatLevel:
+    Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DS32FloatGrad:
+    Opc = NVPTX::TEX_1D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DU32S32:
+    Opc = NVPTX::TEX_1D_U32_S32;
+    break;
+  case NVPTXISD::Tex1DU32Float:
+    Opc = NVPTX::TEX_1D_U32_F32;
+    break;
+  case NVPTXISD::Tex1DU32FloatLevel:
+    Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DU32FloatGrad:
+    Opc = NVPTX::TEX_1D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayFloatS32:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloat:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayS32S32:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayS32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayU32S32:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayU32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DFloatS32:
+    Opc = NVPTX::TEX_2D_F32_S32;
+    break;
+  case NVPTXISD::Tex2DFloatFloat:
+    Opc = NVPTX::TEX_2D_F32_F32;
+    break;
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    Opc = NVPTX::TEX_2D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DS32S32:
+    Opc = NVPTX::TEX_2D_S32_S32;
+    break;
+  case NVPTXISD::Tex2DS32Float:
+    Opc = NVPTX::TEX_2D_S32_F32;
+    break;
+  case NVPTXISD::Tex2DS32FloatLevel:
+    Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DS32FloatGrad:
+    Opc = NVPTX::TEX_2D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DU32S32:
+    Opc = NVPTX::TEX_2D_U32_S32;
+    break;
+  case NVPTXISD::Tex2DU32Float:
+    Opc = NVPTX::TEX_2D_U32_F32;
+    break;
+  case NVPTXISD::Tex2DU32FloatLevel:
+    Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DU32FloatGrad:
+    Opc = NVPTX::TEX_2D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayFloatS32:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloat:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayS32S32:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayS32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayU32S32:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayU32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DFloatS32:
+    Opc = NVPTX::TEX_3D_F32_S32;
+    break;
+  case NVPTXISD::Tex3DFloatFloat:
+    Opc = NVPTX::TEX_3D_F32_F32;
+    break;
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    Opc = NVPTX::TEX_3D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DS32S32:
+    Opc = NVPTX::TEX_3D_S32_S32;
+    break;
+  case NVPTXISD::Tex3DS32Float:
+    Opc = NVPTX::TEX_3D_S32_F32;
+    break;
+  case NVPTXISD::Tex3DS32FloatLevel:
+    Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DS32FloatGrad:
+    Opc = NVPTX::TEX_3D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DU32S32:
+    Opc = NVPTX::TEX_3D_U32_S32;
+    break;
+  case NVPTXISD::Tex3DU32Float:
+    Opc = NVPTX::TEX_3D_U32_F32;
+    break;
+  case NVPTXISD::Tex3DU32FloatLevel:
+    Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DU32FloatGrad:
+    Opc = NVPTX::TEX_3D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexCubeFloatFloat:
+    Opc = NVPTX::TEX_CUBE_F32_F32;
+    break;
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeS32Float:
+    Opc = NVPTX::TEX_CUBE_S32_F32;
+    break;
+  case NVPTXISD::TexCubeS32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeU32Float:
+    Opc = NVPTX::TEX_CUBE_U32_F32;
+    break;
+  case NVPTXISD::TexCubeU32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayS32Float:
+    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayU32Float:
+    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tld4R2DFloatFloat:
+    Opc = NVPTX::TLD4_R_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4G2DFloatFloat:
+    Opc = NVPTX::TLD4_G_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4B2DFloatFloat:
+    Opc = NVPTX::TLD4_B_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4A2DFloatFloat:
+    Opc = NVPTX::TLD4_A_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4R2DS64Float:
+    Opc = NVPTX::TLD4_R_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4G2DS64Float:
+    Opc = NVPTX::TLD4_G_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4B2DS64Float:
+    Opc = NVPTX::TLD4_B_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4A2DS64Float:
+    Opc = NVPTX::TLD4_A_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4R2DU64Float:
+    Opc = NVPTX::TLD4_R_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4G2DU64Float:
+    Opc = NVPTX::TLD4_G_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4B2DU64Float:
+    Opc = NVPTX::TLD4_B_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4A2DU64Float:
+    Opc = NVPTX::TLD4_A_2D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified1DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified1DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified2DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified2DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified3DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified3DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
+    break;
+  }
+
+  // Copy over operands
+  for (unsigned i = 1; i < N->getNumOperands(); ++i) {
+    Ops.push_back(N->getOperand(i));
+  }
+
+  Ops.push_back(Chain);
+  Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue TexHandle = N->getOperand(1);
+  SDNode *Ret = nullptr;
+  unsigned Opc = 0;
+  SmallVector<SDValue, 8> Ops;
+  switch (N->getOpcode()) {
+  default: return nullptr;
+  case NVPTXISD::Suld1DI8Clamp:
+    Opc = NVPTX::SULD_1D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Clamp:
+    Opc = NVPTX::SULD_1D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Clamp:
+    Opc = NVPTX::SULD_1D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Clamp:
+    Opc = NVPTX::SULD_1D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Clamp:
+    Opc = NVPTX::SULD_1D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Clamp:
+    Opc = NVPTX::SULD_1D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Clamp:
+    Opc = NVPTX::SULD_1D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Clamp:
+    Opc = NVPTX::SULD_1D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Clamp:
+    Opc = NVPTX::SULD_1D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Clamp:
+    Opc = NVPTX::SULD_1D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Clamp:
+    Opc = NVPTX::SULD_1D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Clamp:
+    Opc = NVPTX::SULD_2D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Clamp:
+    Opc = NVPTX::SULD_2D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Clamp:
+    Opc = NVPTX::SULD_2D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Clamp:
+    Opc = NVPTX::SULD_2D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Clamp:
+    Opc = NVPTX::SULD_2D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Clamp:
+    Opc = NVPTX::SULD_2D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Clamp:
+    Opc = NVPTX::SULD_2D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Clamp:
+    Opc = NVPTX::SULD_2D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Clamp:
+    Opc = NVPTX::SULD_2D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Clamp:
+    Opc = NVPTX::SULD_2D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Clamp:
+    Opc = NVPTX::SULD_2D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Clamp:
+    Opc = NVPTX::SULD_3D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Clamp:
+    Opc = NVPTX::SULD_3D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Clamp:
+    Opc = NVPTX::SULD_3D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Clamp:
+    Opc = NVPTX::SULD_3D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Clamp:
+    Opc = NVPTX::SULD_3D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Clamp:
+    Opc = NVPTX::SULD_3D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Clamp:
+    Opc = NVPTX::SULD_3D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Clamp:
+    Opc = NVPTX::SULD_3D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Clamp:
+    Opc = NVPTX::SULD_3D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Clamp:
+    Opc = NVPTX::SULD_3D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Clamp:
+    Opc = NVPTX::SULD_3D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI8Trap:
+    Opc = NVPTX::SULD_1D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Trap:
+    Opc = NVPTX::SULD_1D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Trap:
+    Opc = NVPTX::SULD_1D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Trap:
+    Opc = NVPTX::SULD_1D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Trap:
+    Opc = NVPTX::SULD_1D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Trap:
+    Opc = NVPTX::SULD_1D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Trap:
+    Opc = NVPTX::SULD_1D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Trap:
+    Opc = NVPTX::SULD_1D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Trap:
+    Opc = NVPTX::SULD_1D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Trap:
+    Opc = NVPTX::SULD_1D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Trap:
+    Opc = NVPTX::SULD_1D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Trap:
+    Opc = NVPTX::SULD_2D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Trap:
+    Opc = NVPTX::SULD_2D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Trap:
+    Opc = NVPTX::SULD_2D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Trap:
+    Opc = NVPTX::SULD_2D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Trap:
+    Opc = NVPTX::SULD_2D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Trap:
+    Opc = NVPTX::SULD_2D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Trap:
+    Opc = NVPTX::SULD_2D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Trap:
+    Opc = NVPTX::SULD_2D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Trap:
+    Opc = NVPTX::SULD_2D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Trap:
+    Opc = NVPTX::SULD_2D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Trap:
+    Opc = NVPTX::SULD_2D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Trap:
+    Opc = NVPTX::SULD_3D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Trap:
+    Opc = NVPTX::SULD_3D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Trap:
+    Opc = NVPTX::SULD_3D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Trap:
+    Opc = NVPTX::SULD_3D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Trap:
+    Opc = NVPTX::SULD_3D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Trap:
+    Opc = NVPTX::SULD_3D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Trap:
+    Opc = NVPTX::SULD_3D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Trap:
+    Opc = NVPTX::SULD_3D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Trap:
+    Opc = NVPTX::SULD_3D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Trap:
+    Opc = NVPTX::SULD_3D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Trap:
+    Opc = NVPTX::SULD_3D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI8Zero:
+    Opc = NVPTX::SULD_1D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Zero:
+    Opc = NVPTX::SULD_1D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Zero:
+    Opc = NVPTX::SULD_1D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Zero:
+    Opc = NVPTX::SULD_1D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Zero:
+    Opc = NVPTX::SULD_1D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Zero:
+    Opc = NVPTX::SULD_1D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Zero:
+    Opc = NVPTX::SULD_1D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Zero:
+    Opc = NVPTX::SULD_1D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Zero:
+    Opc = NVPTX::SULD_1D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Zero:
+    Opc = NVPTX::SULD_1D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Zero:
+    Opc = NVPTX::SULD_1D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Zero:
+    Opc = NVPTX::SULD_2D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Zero:
+    Opc = NVPTX::SULD_2D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Zero:
+    Opc = NVPTX::SULD_2D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Zero:
+    Opc = NVPTX::SULD_2D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Zero:
+    Opc = NVPTX::SULD_2D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Zero:
+    Opc = NVPTX::SULD_2D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Zero:
+    Opc = NVPTX::SULD_2D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Zero:
+    Opc = NVPTX::SULD_2D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Zero:
+    Opc = NVPTX::SULD_2D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Zero:
+    Opc = NVPTX::SULD_2D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Zero:
+    Opc = NVPTX::SULD_2D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Zero:
+    Opc = NVPTX::SULD_3D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Zero:
+    Opc = NVPTX::SULD_3D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Zero:
+    Opc = NVPTX::SULD_3D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Zero:
+    Opc = NVPTX::SULD_3D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Zero:
+    Opc = NVPTX::SULD_3D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Zero:
+    Opc = NVPTX::SULD_3D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Zero:
+    Opc = NVPTX::SULD_3D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Zero:
+    Opc = NVPTX::SULD_3D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Zero:
+    Opc = NVPTX::SULD_3D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Zero:
+    Opc = NVPTX::SULD_3D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Zero:
+    Opc = NVPTX::SULD_3D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  }
+  Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+  return Ret;
+}
+
+
+/// SelectBFE - Look for instruction sequences that can be made more efficient
+/// by using the 'bfe' (bit-field extract) PTX instruction
+SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue Len;
+  SDValue Start;
+  SDValue Val;
+  bool IsSigned = false;
+
+  if (N->getOpcode() == ISD::AND) {
+    // Canonicalize the operands
+    // We want 'and %val, %mask'
+    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+      std::swap(LHS, RHS);
+    }
+
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+    if (!Mask) {
+      // We need a constant mask on the RHS of the AND
+      return NULL;
+    }
+
+    // Extract the mask bits
+    uint64_t MaskVal = Mask->getZExtValue();
+    if (!isMask_64(MaskVal)) {
+      // We *could* handle shifted masks here, but doing so would require an
+      // 'and' operation to fix up the low-order bits so we would trade
+      // shr+and for bfe+and, which has the same throughput
+      return NULL;
+    }
+
+    // How many bits are in our mask?
+    uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+    Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+
+    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+      // We have a 'srl/and' pair, extract the effective start bit and length
+      Val = LHS.getNode()->getOperand(0);
+      Start = LHS.getNode()->getOperand(1);
+      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+      if (StartConst) {
+        uint64_t StartVal = StartConst->getZExtValue();
+        // How many "good" bits do we have left?  "good" is defined here as bits
+        // that exist in the original value, not shifted in.
+        uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
+        if (NumBits > GoodBits) {
+          // Do not handle the case where bits have been shifted in. In theory
+          // we could handle this, but the cost is likely higher than just
+          // emitting the srl/and pair.
+          return NULL;
+        }
+        Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
+      } else {
+        // Do not handle the case where the shift amount (can be zero if no srl
+        // was found) is not constant. We could handle this case, but it would
+        // require run-time logic that would be more expensive than just
+        // emitting the srl/and pair.
+        return NULL;
+      }
+    } else {
+      // Do not handle the case where the LHS of the and is not a shift. While
+      // it would be trivial to handle this case, it would just transform
+      // 'and' -> 'bfe', but 'and' has higher-throughput.
+      return NULL;
+    }
+  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+    if (LHS->getOpcode() == ISD::AND) {
+      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+      if (!ShiftCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+
+      uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+      SDValue AndLHS = LHS->getOperand(0);
+      SDValue AndRHS = LHS->getOperand(1);
+
+      // Canonicalize the AND to have the mask on the RHS
+      if (isa<ConstantSDNode>(AndLHS)) {
+        std::swap(AndLHS, AndRHS);
+      }
+
+      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+      if (!MaskCnst) {
+        // Mask must be constant
+        return NULL;
+      }
+
+      uint64_t MaskVal = MaskCnst->getZExtValue();
+      uint64_t NumZeros;
+      uint64_t NumBits;
+      if (isMask_64(MaskVal)) {
+        NumZeros = 0;
+        // The number of bits in the result bitfield will be the number of
+        // trailing ones (the AND) minus the number of bits we shift off
+        NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+      } else if (isShiftedMask_64(MaskVal)) {
+        NumZeros = countTrailingZeros(MaskVal);
+        unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+        // The number of bits in the result bitfield will be the number of
+        // trailing zeros plus the number of set bits in the mask minus the
+        // number of bits we shift off
+        NumBits = NumZeros + NumOnes - ShiftAmt;
+      } else {
+        // This is not a mask we can handle
+        return NULL;
+      }
+
+      if (ShiftAmt < NumZeros) {
+        // Handling this case would require extra logic that would make this
+        // transformation non-profitable
+        return NULL;
+      }
+
+      Val = AndLHS;
+      Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
+      Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+    } else if (LHS->getOpcode() == ISD::SHL) {
+      // Here, we have a pattern like:
+      //
+      // (sra (shl val, NN), MM)
+      // or
+      // (srl (shl val, NN), MM)
+      //
+      // If MM >= NN, we can efficiently optimize this with bfe
+      Val = LHS->getOperand(0);
+
+      SDValue ShlRHS = LHS->getOperand(1);
+      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+      if (!ShlCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+      SDValue ShrRHS = RHS;
+      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+      if (!ShrCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+      // To avoid extra codegen and be profitable, we need Outer >= Inner
+      if (OuterShiftAmt < InnerShiftAmt) {
+        return NULL;
+      }
+
+      // If the outer shift is more than the type size, we have no bitfield to
+      // extract (since we also check that the inner shift is <= the outer shift
+      // then this also implies that the inner shift is < the type size)
+      if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
+        return NULL;
+      }
+
+      Start =
+        CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
+      Len =
+        CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
+                                  OuterShiftAmt, MVT::i32);
+
+      if (N->getOpcode() == ISD::SRA) {
+        // If we have a arithmetic right shift, we need to use the signed bfe
+        // variant
+        IsSigned = true;
+      }
+    } else {
+      // No can do...
+      return NULL;
+    }
+  } else {
+    // No can do...
+    return NULL;
+  }
+
+
+  unsigned Opc;
+  // For the BFE operations we form here from "and" and "srl", always use the
+  // unsigned variants.
+  if (Val.getValueType() == MVT::i32) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S32rii;
+    } else {
+      Opc = NVPTX::BFE_U32rii;
+    }
+  } else if (Val.getValueType() == MVT::i64) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S64rii;
+    } else {
+      Opc = NVPTX::BFE_U64rii;
+    }
+  } else {
+    // We cannot handle this type
+    return NULL;
+  }
+
+  SDValue Ops[] = {
+    Val, Start, Len
+  };
+
+  SDNode *Ret =
+    CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+
+  return Ret;
+}
+
+// SelectDirectAddr - Match a direct address for DAG.
+// A direct address could be a globaladdress or externalsymbol.
+bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
+  // Return true if TGA or ES.
+  if (N.getOpcode() == ISD::TargetGlobalAddress ||
+      N.getOpcode() == ISD::TargetExternalSymbol) {
+    Address = N;
+    return true;
+  }
+  if (N.getOpcode() == NVPTXISD::Wrapper) {
+    Address = N.getOperand(0);
+    return true;
+  }
+  if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
+    if (IID == Intrinsic::nvvm_ptr_gen_to_param)
+      if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
+        return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
+  }
+  return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
+    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      SDValue base = Addr.getOperand(0);
+      if (SelectDirectAddr(base, Base)) {
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
+                                       SDValue &Base, SDValue &Offset) {
+  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri_imp(
+    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+    Offset = CurDAG->getTargetConstant(0, mvt);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false; // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
+      return false;
+    }
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (FrameIndexSDNode *FIN =
+              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+        // Constant offset from frame ref.
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+      else
+        Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+      return true;
+    }
+  }
+  return false;
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
+                                       SDValue &Base, SDValue &Offset) {
+  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
+                                                 unsigned int spN) const {
+  const Value *Src = nullptr;
+  // Even though MemIntrinsicSDNode is a subclas of MemSDNode,
+  // the classof() for MemSDNode does not include MemIntrinsicSDNode
+  // (See SelectionDAGNodes.h). So we need to check for both.
+  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
+  } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
+  }
+  if (!Src)
+    return false;
+  if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+    return (PT->getAddressSpace() == spN);
+  return false;
+}
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default:
+    return true;
+  case 'm': // memory
+    if (SelectDirectAddr(Op, Op0)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
+      return false;
+    }
+    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+    break;
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 0000000..c62fc25
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,94 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+
+  // If true, generate mul.wide from sext and mul
+  bool doMulWide;
+
+  int getDivF32Level() const;
+  bool usePrecSqrtF32() const;
+  bool useF32FTZ() const;
+  bool allowFMA() const;
+
+public:
+  explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                             CodeGenOpt::Level   OptLevel);
+
+  // Pass Name
+  const char *getPassName() const override {
+    return "NVPTX DAG->DAG Pattern Instruction Selection";
+  }
+
+  const NVPTXSubtarget &Subtarget;
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+private:
+// Include the pieces autogenerated from the target description.
+#include "NVPTXGenDAGISel.inc"
+
+  SDNode *Select(SDNode *N) override;
+  SDNode *SelectIntrinsicNoChain(SDNode *N);
+  SDNode *SelectIntrinsicChain(SDNode *N);
+  SDNode *SelectTexSurfHandle(SDNode *N);
+  SDNode *SelectLoad(SDNode *N);
+  SDNode *SelectLoadVector(SDNode *N);
+  SDNode *SelectLDGLDU(SDNode *N);
+  SDNode *SelectStore(SDNode *N);
+  SDNode *SelectStoreVector(SDNode *N);
+  SDNode *SelectLoadParam(SDNode *N);
+  SDNode *SelectStoreRetval(SDNode *N);
+  SDNode *SelectStoreParam(SDNode *N);
+  SDNode *SelectAddrSpaceCast(SDNode *N);
+  SDNode *SelectTextureIntrinsic(SDNode *N);
+  SDNode *SelectSurfaceIntrinsic(SDNode *N);
+  SDNode *SelectBFE(SDNode *N);
+        
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+  // Match direct address complex pattern.
+  bool SelectDirectAddr(SDValue N, SDValue &Address);
+
+  bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                        SDValue &Offset, MVT mvt);
+  bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                    SDValue &Offset);
+  bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                      SDValue &Offset);
+
+  bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                        SDValue &Offset, MVT mvt);
+  bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                    SDValue &Offset);
+  bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                      SDValue &Offset);
+
+  bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+
+};
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
new file mode 100644
index 0000000..d76b20a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -0,0 +1,4516 @@
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool> sched4reg(
+    "nvptx-sched4reg",
+    cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+                             " 1: do it  2: do it aggressively"),
+                    cl::init(2));
+
+static bool IsPTXVectorType(MVT VT) {
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::v2i1:
+  case MVT::v4i1:
+  case MVT::v2i8:
+  case MVT::v4i8:
+  case MVT::v2i16:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v2f32:
+  case MVT::v4f32:
+  case MVT::v2f64:
+    return true;
+  }
+}
+
+/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
+/// into their primitive components.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
+                               SmallVectorImpl<EVT> &ValueVTs,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
+                               uint64_t StartingOffset = 0) {
+  SmallVector<EVT, 16> TempVTs;
+  SmallVector<uint64_t, 16> TempOffsets;
+
+  ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
+  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+    EVT VT = TempVTs[i];
+    uint64_t Off = TempOffsets[i];
+    if (VT.isVector())
+      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
+        ValueVTs.push_back(VT.getVectorElementType());
+        if (Offsets)
+          Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+      }
+    else {
+      ValueVTs.push_back(VT);
+      if (Offsets)
+        Offsets->push_back(Off);
+    }
+  }
+}
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
+    : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
+      nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+
+  // always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather
+  // then generating calls to memset, mempcy or memmove.
+  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+  // condition branches.
+  setJumpIsExpensive(true);
+
+  // By default, use the Source scheduling
+  if (sched4reg)
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+  // Operations not directly supported by NVPTX.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
+  // For others we will expand to a SHL/SRA pair.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
+
+  if (nvptxSubtarget.hasROT64()) {
+    setOperationAction(ISD::ROTL, MVT::i64, Legal);
+    setOperationAction(ISD::ROTR, MVT::i64, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i64, Expand);
+    setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  }
+  if (nvptxSubtarget.hasROT32()) {
+    setOperationAction(ISD::ROTL, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, MVT::i32, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i32, Expand);
+    setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  }
+
+  setOperationAction(ISD::ROTL, MVT::i16, Expand);
+  setOperationAction(ISD::ROTR, MVT::i16, Expand);
+  setOperationAction(ISD::ROTL, MVT::i8, Expand);
+  setOperationAction(ISD::ROTR, MVT::i8, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  // Indirect branch is not supported.
+  // This also disables Jump Table creation.
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // We want to legalize constant related memmove and memcopy
+  // intrinsics.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  // Turn FP extload into load/fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PTX does not support load / store predicate registers
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i16, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+
+  // This is legal in NVPTX
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
+  // TRAP can be lowered to PTX trap
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
+  // Register custom handling for vector loads/stores
+  for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE;
+       ++i) {
+    MVT VT = (MVT::SimpleValueType) i;
+    if (IsPTXVectorType(VT)) {
+      setOperationAction(ISD::LOAD, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
+      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+    }
+  }
+
+  // Custom handling for i8 intrinsics
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
+  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
+  // We have some custom DAG combine patterns for these nodes
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SHL);
+
+  // Now deduce the information based on the above mentioned
+  // actions
+  computeRegisterProperties();
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return nullptr;
+  case NVPTXISD::CALL:
+    return "NVPTXISD::CALL";
+  case NVPTXISD::RET_FLAG:
+    return "NVPTXISD::RET_FLAG";
+  case NVPTXISD::Wrapper:
+    return "NVPTXISD::Wrapper";
+  case NVPTXISD::DeclareParam:
+    return "NVPTXISD::DeclareParam";
+  case NVPTXISD::DeclareScalarParam:
+    return "NVPTXISD::DeclareScalarParam";
+  case NVPTXISD::DeclareRet:
+    return "NVPTXISD::DeclareRet";
+  case NVPTXISD::DeclareRetParam:
+    return "NVPTXISD::DeclareRetParam";
+  case NVPTXISD::PrintCall:
+    return "NVPTXISD::PrintCall";
+  case NVPTXISD::LoadParam:
+    return "NVPTXISD::LoadParam";
+  case NVPTXISD::LoadParamV2:
+    return "NVPTXISD::LoadParamV2";
+  case NVPTXISD::LoadParamV4:
+    return "NVPTXISD::LoadParamV4";
+  case NVPTXISD::StoreParam:
+    return "NVPTXISD::StoreParam";
+  case NVPTXISD::StoreParamV2:
+    return "NVPTXISD::StoreParamV2";
+  case NVPTXISD::StoreParamV4:
+    return "NVPTXISD::StoreParamV4";
+  case NVPTXISD::StoreParamS32:
+    return "NVPTXISD::StoreParamS32";
+  case NVPTXISD::StoreParamU32:
+    return "NVPTXISD::StoreParamU32";
+  case NVPTXISD::CallArgBegin:
+    return "NVPTXISD::CallArgBegin";
+  case NVPTXISD::CallArg:
+    return "NVPTXISD::CallArg";
+  case NVPTXISD::LastCallArg:
+    return "NVPTXISD::LastCallArg";
+  case NVPTXISD::CallArgEnd:
+    return "NVPTXISD::CallArgEnd";
+  case NVPTXISD::CallVoid:
+    return "NVPTXISD::CallVoid";
+  case NVPTXISD::CallVal:
+    return "NVPTXISD::CallVal";
+  case NVPTXISD::CallSymbol:
+    return "NVPTXISD::CallSymbol";
+  case NVPTXISD::Prototype:
+    return "NVPTXISD::Prototype";
+  case NVPTXISD::MoveParam:
+    return "NVPTXISD::MoveParam";
+  case NVPTXISD::StoreRetval:
+    return "NVPTXISD::StoreRetval";
+  case NVPTXISD::StoreRetvalV2:
+    return "NVPTXISD::StoreRetvalV2";
+  case NVPTXISD::StoreRetvalV4:
+    return "NVPTXISD::StoreRetvalV4";
+  case NVPTXISD::PseudoUseParam:
+    return "NVPTXISD::PseudoUseParam";
+  case NVPTXISD::RETURN:
+    return "NVPTXISD::RETURN";
+  case NVPTXISD::CallSeqBegin:
+    return "NVPTXISD::CallSeqBegin";
+  case NVPTXISD::CallSeqEnd:
+    return "NVPTXISD::CallSeqEnd";
+  case NVPTXISD::CallPrototype:
+    return "NVPTXISD::CallPrototype";
+  case NVPTXISD::LoadV2:
+    return "NVPTXISD::LoadV2";
+  case NVPTXISD::LoadV4:
+    return "NVPTXISD::LoadV4";
+  case NVPTXISD::LDGV2:
+    return "NVPTXISD::LDGV2";
+  case NVPTXISD::LDGV4:
+    return "NVPTXISD::LDGV4";
+  case NVPTXISD::LDUV2:
+    return "NVPTXISD::LDUV2";
+  case NVPTXISD::LDUV4:
+    return "NVPTXISD::LDUV4";
+  case NVPTXISD::StoreV2:
+    return "NVPTXISD::StoreV2";
+  case NVPTXISD::StoreV4:
+    return "NVPTXISD::StoreV4";
+  case NVPTXISD::FUN_SHFL_CLAMP:
+    return "NVPTXISD::FUN_SHFL_CLAMP";
+  case NVPTXISD::FUN_SHFR_CLAMP:
+    return "NVPTXISD::FUN_SHFR_CLAMP";
+  case NVPTXISD::IMAD:
+    return "NVPTXISD::IMAD";
+  case NVPTXISD::MUL_WIDE_SIGNED:
+    return "NVPTXISD::MUL_WIDE_SIGNED";
+  case NVPTXISD::MUL_WIDE_UNSIGNED:
+    return "NVPTXISD::MUL_WIDE_UNSIGNED";
+  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
+  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    return "NVPTXISD::Tex1DFloatFloatLevel";
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    return "NVPTXISD::Tex1DFloatFloatGrad";
+  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
+  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
+  case NVPTXISD::Tex1DS32FloatLevel:
+    return "NVPTXISD::Tex1DS32FloatLevel";
+  case NVPTXISD::Tex1DS32FloatGrad:
+    return "NVPTXISD::Tex1DS32FloatGrad";
+  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
+  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
+  case NVPTXISD::Tex1DU32FloatLevel:
+    return "NVPTXISD::Tex1DU32FloatLevel";
+  case NVPTXISD::Tex1DU32FloatGrad:
+    return "NVPTXISD::Tex1DU32FloatGrad";
+  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
+  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
+  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    return "NVPTXISD::Tex1DArrayS32FloatLevel";
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    return "NVPTXISD::Tex1DArrayS32FloatGrad";
+  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
+  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    return "NVPTXISD::Tex1DArrayU32FloatLevel";
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    return "NVPTXISD::Tex1DArrayU32FloatGrad";
+  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
+  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    return "NVPTXISD::Tex2DFloatFloatLevel";
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    return "NVPTXISD::Tex2DFloatFloatGrad";
+  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
+  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
+  case NVPTXISD::Tex2DS32FloatLevel:
+    return "NVPTXISD::Tex2DS32FloatLevel";
+  case NVPTXISD::Tex2DS32FloatGrad:
+    return "NVPTXISD::Tex2DS32FloatGrad";
+  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
+  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
+  case NVPTXISD::Tex2DU32FloatLevel:
+    return "NVPTXISD::Tex2DU32FloatLevel";
+  case NVPTXISD::Tex2DU32FloatGrad:
+    return "NVPTXISD::Tex2DU32FloatGrad";
+  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
+  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
+  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    return "NVPTXISD::Tex2DArrayS32FloatLevel";
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    return "NVPTXISD::Tex2DArrayS32FloatGrad";
+  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
+  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    return "NVPTXISD::Tex2DArrayU32FloatLevel";
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    return "NVPTXISD::Tex2DArrayU32FloatGrad";
+  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
+  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    return "NVPTXISD::Tex3DFloatFloatLevel";
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    return "NVPTXISD::Tex3DFloatFloatGrad";
+  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
+  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
+  case NVPTXISD::Tex3DS32FloatLevel:
+    return "NVPTXISD::Tex3DS32FloatLevel";
+  case NVPTXISD::Tex3DS32FloatGrad:
+    return "NVPTXISD::Tex3DS32FloatGrad";
+  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
+  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
+  case NVPTXISD::Tex3DU32FloatLevel:
+    return "NVPTXISD::Tex3DU32FloatLevel";
+  case NVPTXISD::Tex3DU32FloatGrad:
+    return "NVPTXISD::Tex3DU32FloatGrad";
+  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    return "NVPTXISD::TexCubeFloatFloatLevel";
+  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
+  case NVPTXISD::TexCubeS32FloatLevel:
+    return "NVPTXISD::TexCubeS32FloatLevel";
+  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
+  case NVPTXISD::TexCubeU32FloatLevel:
+    return "NVPTXISD::TexCubeU32FloatLevel";
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    return "NVPTXISD::TexCubeArrayFloatFloat";
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexCubeArrayS32Float:
+    return "NVPTXISD::TexCubeArrayS32Float";
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexCubeArrayS32FloatLevel";
+  case NVPTXISD::TexCubeArrayU32Float:
+    return "NVPTXISD::TexCubeArrayU32Float";
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4R2DFloatFloat:
+    return "NVPTXISD::Tld4R2DFloatFloat";
+  case NVPTXISD::Tld4G2DFloatFloat:
+    return "NVPTXISD::Tld4G2DFloatFloat";
+  case NVPTXISD::Tld4B2DFloatFloat:
+    return "NVPTXISD::Tld4B2DFloatFloat";
+  case NVPTXISD::Tld4A2DFloatFloat:
+    return "NVPTXISD::Tld4A2DFloatFloat";
+  case NVPTXISD::Tld4R2DS64Float:
+    return "NVPTXISD::Tld4R2DS64Float";
+  case NVPTXISD::Tld4G2DS64Float:
+    return "NVPTXISD::Tld4G2DS64Float";
+  case NVPTXISD::Tld4B2DS64Float:
+    return "NVPTXISD::Tld4B2DS64Float";
+  case NVPTXISD::Tld4A2DS64Float:
+    return "NVPTXISD::Tld4A2DS64Float";
+  case NVPTXISD::Tld4R2DU64Float:
+    return "NVPTXISD::Tld4R2DU64Float";
+  case NVPTXISD::Tld4G2DU64Float:
+    return "NVPTXISD::Tld4G2DU64Float";
+  case NVPTXISD::Tld4B2DU64Float:
+    return "NVPTXISD::Tld4B2DU64Float";
+  case NVPTXISD::Tld4A2DU64Float:
+    return "NVPTXISD::Tld4A2DU64Float";
+
+  case NVPTXISD::TexUnified1DFloatS32:
+    return "NVPTXISD::TexUnified1DFloatS32";
+  case NVPTXISD::TexUnified1DFloatFloat:
+    return "NVPTXISD::TexUnified1DFloatFloat";
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DFloatFloatLevel";
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DFloatFloatGrad";
+  case NVPTXISD::TexUnified1DS32S32:
+    return "NVPTXISD::TexUnified1DS32S32";
+  case NVPTXISD::TexUnified1DS32Float:
+    return "NVPTXISD::TexUnified1DS32Float";
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    return "NVPTXISD::TexUnified1DS32FloatLevel";
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    return "NVPTXISD::TexUnified1DS32FloatGrad";
+  case NVPTXISD::TexUnified1DU32S32:
+    return "NVPTXISD::TexUnified1DU32S32";
+  case NVPTXISD::TexUnified1DU32Float:
+    return "NVPTXISD::TexUnified1DU32Float";
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    return "NVPTXISD::TexUnified1DU32FloatLevel";
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    return "NVPTXISD::TexUnified1DU32FloatGrad";
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    return "NVPTXISD::TexUnified1DArrayFloatS32";
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    return "NVPTXISD::TexUnified1DArrayFloatFloat";
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    return "NVPTXISD::TexUnified1DArrayS32S32";
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    return "NVPTXISD::TexUnified1DArrayS32Float";
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    return "NVPTXISD::TexUnified1DArrayU32S32";
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    return "NVPTXISD::TexUnified1DArrayU32Float";
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+  case NVPTXISD::TexUnified2DFloatS32:
+    return "NVPTXISD::TexUnified2DFloatS32";
+  case NVPTXISD::TexUnified2DFloatFloat:
+    return "NVPTXISD::TexUnified2DFloatFloat";
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    return "NVPTXISD::TexUnified2DFloatFloatLevel";
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    return "NVPTXISD::TexUnified2DFloatFloatGrad";
+  case NVPTXISD::TexUnified2DS32S32:
+    return "NVPTXISD::TexUnified2DS32S32";
+  case NVPTXISD::TexUnified2DS32Float:
+    return "NVPTXISD::TexUnified2DS32Float";
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    return "NVPTXISD::TexUnified2DS32FloatLevel";
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    return "NVPTXISD::TexUnified2DS32FloatGrad";
+  case NVPTXISD::TexUnified2DU32S32:
+    return "NVPTXISD::TexUnified2DU32S32";
+  case NVPTXISD::TexUnified2DU32Float:
+    return "NVPTXISD::TexUnified2DU32Float";
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    return "NVPTXISD::TexUnified2DU32FloatLevel";
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    return "NVPTXISD::TexUnified2DU32FloatGrad";
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    return "NVPTXISD::TexUnified2DArrayFloatS32";
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    return "NVPTXISD::TexUnified2DArrayFloatFloat";
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    return "NVPTXISD::TexUnified2DArrayS32S32";
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    return "NVPTXISD::TexUnified2DArrayS32Float";
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    return "NVPTXISD::TexUnified2DArrayU32S32";
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    return "NVPTXISD::TexUnified2DArrayU32Float";
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+  case NVPTXISD::TexUnified3DFloatS32:
+    return "NVPTXISD::TexUnified3DFloatS32";
+  case NVPTXISD::TexUnified3DFloatFloat:
+    return "NVPTXISD::TexUnified3DFloatFloat";
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    return "NVPTXISD::TexUnified3DFloatFloatLevel";
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    return "NVPTXISD::TexUnified3DFloatFloatGrad";
+  case NVPTXISD::TexUnified3DS32S32:
+    return "NVPTXISD::TexUnified3DS32S32";
+  case NVPTXISD::TexUnified3DS32Float:
+    return "NVPTXISD::TexUnified3DS32Float";
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    return "NVPTXISD::TexUnified3DS32FloatLevel";
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    return "NVPTXISD::TexUnified3DS32FloatGrad";
+  case NVPTXISD::TexUnified3DU32S32:
+    return "NVPTXISD::TexUnified3DU32S32";
+  case NVPTXISD::TexUnified3DU32Float:
+    return "NVPTXISD::TexUnified3DU32Float";
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    return "NVPTXISD::TexUnified3DU32FloatLevel";
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    return "NVPTXISD::TexUnified3DU32FloatGrad";
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    return "NVPTXISD::TexUnifiedCubeFloatFloat";
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    return "NVPTXISD::TexUnifiedCubeS32Float";
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    return "NVPTXISD::TexUnifiedCubeU32Float";
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    return "NVPTXISD::Tld4UnifiedR2DS64Float";
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    return "NVPTXISD::Tld4UnifiedG2DS64Float";
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    return "NVPTXISD::Tld4UnifiedB2DS64Float";
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    return "NVPTXISD::Tld4UnifiedA2DS64Float";
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    return "NVPTXISD::Tld4UnifiedR2DU64Float";
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    return "NVPTXISD::Tld4UnifiedG2DU64Float";
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    return "NVPTXISD::Tld4UnifiedB2DU64Float";
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    return "NVPTXISD::Tld4UnifiedA2DU64Float";
+
+  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
+  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
+  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
+  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
+  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
+  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
+  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
+  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
+  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
+  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
+  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
+
+  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
+  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
+  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
+  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
+  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+
+  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
+  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
+  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
+  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
+  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
+  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
+  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
+  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
+  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
+  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
+  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
+
+  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
+  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
+  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
+  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
+  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+
+  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
+  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
+  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
+  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
+  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
+  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
+  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
+  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
+  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
+  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
+  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
+
+  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
+  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
+  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
+  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
+  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
+  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
+  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
+  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
+  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
+  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
+  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
+
+  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
+  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
+  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
+  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
+  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
+  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
+  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
+  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
+  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
+  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
+  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
+
+  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
+  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
+  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
+  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
+  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
+  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
+  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
+  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
+  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
+  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
+  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
+
+  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
+  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
+  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
+  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
+  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
+  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
+  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
+  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
+  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
+  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
+  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
+
+  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
+  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
+  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
+  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
+  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
+  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
+  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
+  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
+  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
+  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
+  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
+
+  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
+  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
+  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
+  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
+  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
+  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
+  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
+  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
+  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
+  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
+  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
+
+  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
+  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
+  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
+  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
+  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
+  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
+  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
+  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
+  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
+  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
+  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
+  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
+  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
+  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
+  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
+  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
+  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
+  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
+  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
+  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
+  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
+
+  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
+  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
+  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
+  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
+  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
+  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
+  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
+  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
+  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
+  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
+  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
+  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
+  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
+  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
+  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
+  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
+  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
+  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
+  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
+  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
+  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
+  }
+}
+
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+  return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
+}
+
+std::string
+NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  unsigned retAlignment,
+                                  const ImmutableCallSite *CS) const {
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return "";
+
+  std::stringstream O;
+  O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+  if (retTy->getTypeID() == Type::VoidTyID) {
+    O << "()";
+  } else {
+    O << "(";
+    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+      unsigned size = 0;
+      if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(retTy->isFloatingPointTy() &&
+               "Floating point type expected here");
+        size = retTy->getPrimitiveSizeInBits();
+      }
+
+      O << ".param .b" << size << " _";
+    } else if (isa<PointerType>(retTy)) {
+      O << ".param .b" << getPointerTy().getSizeInBits() << " _";
+    } else {
+      if((retTy->getTypeID() == Type::StructTyID) ||
+         isa<VectorType>(retTy)) {
+        O << ".param .align "
+          << retAlignment
+          << " .b8 _["
+          << getDataLayout()->getTypeAllocSize(retTy) << "]";
+      } else {
+        assert(false && "Unknown return type");
+      }
+    }
+    O << ") ";
+  }
+  O << "_ (";
+
+  bool first = true;
+  MVT thePointerTy = getPointerTy();
+
+  unsigned OIdx = 0;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    Type *Ty = Args[i].Ty;
+    if (!first) {
+      O << ", ";
+    }
+    first = false;
+
+    if (Outs[OIdx].Flags.isByVal() == false) {
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+        unsigned align = 0;
+        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
+        const DataLayout *TD = getDataLayout();
+        // +1 because index 0 is reserved for return type alignment
+        if (!llvm::getAlign(*CallI, i + 1, align))
+          align = TD->getABITypeAlignment(Ty);
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        O << ".param .align " << align << " .b8 ";
+        O << "_";
+        O << "[" << sz << "]";
+        // update the index for Outs
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*this, Ty, vtparts);
+        if (unsigned len = vtparts.size())
+          OIdx += len - 1;
+        continue;
+      }
+       // i8 types in IR will be i16 types in SDAG
+      assert((getValueType(Ty) == Outs[OIdx].VT ||
+             (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+             "type mismatch between callee prototype and arguments");
+      // scalar type
+      unsigned sz = 0;
+      if (isa<IntegerType>(Ty)) {
+        sz = cast<IntegerType>(Ty)->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = thePointerTy.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      O << ".param .b" << sz << " ";
+      O << "_";
+      continue;
+    }
+    const PointerType *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    unsigned align = Outs[OIdx].Flags.getByValAlign();
+    unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
+    O << ".param .align " << align << " .b8 ";
+    O << "_";
+    O << "[" << sz << "]";
+  }
+  O << ");";
+  return O.str();
+}
+
+unsigned
+NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+                                          const ImmutableCallSite *CS,
+                                          Type *Ty,
+                                          unsigned Idx) const {
+  const DataLayout *TD = getDataLayout();
+  unsigned Align = 0;
+  const Value *DirectCallee = CS->getCalledFunction();
+
+  if (!DirectCallee) {
+    // We don't have a direct function symbol, but that may be because of
+    // constant cast instructions in the call.
+    const Instruction *CalleeI = CS->getInstruction();
+    assert(CalleeI && "Call target is not a function or derived value?");
+
+    // With bitcast'd call targets, the instruction will be the call
+    if (isa<CallInst>(CalleeI)) {
+      // Check if we have call alignment metadata
+      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+        return Align;
+
+      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+      // Ignore any bitcast instructions
+      while(isa<ConstantExpr>(CalleeV)) {
+        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+        if (!CE->isCast())
+          break;
+        // Look through the bitcast
+        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+      }
+
+      // We have now looked past all of the bitcasts.  Do we finally have a
+      // Function?
+      if (isa<Function>(CalleeV))
+        DirectCallee = CalleeV;
+    }
+  }
+
+  // Check for function alignment information if we found that the
+  // ultimate target is a Function
+  if (DirectCallee)
+    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+      return Align;
+
+  // Call is indirect or alignment information is not available, fall back to
+  // the ABI type alignment
+  return TD->getABITypeAlignment(Ty);
+}
+
+SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc dl = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &isTailCall = CLI.IsTailCall;
+  ArgListTy &Args = CLI.getArgs();
+  Type *retTy = CLI.RetTy;
+  ImmutableCallSite *CS = CLI.CS;
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+  const DataLayout *TD = getDataLayout();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+
+  SDValue tempChain = Chain;
+  Chain =
+      DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
+                           dl);
+  SDValue InFlag = Chain.getValue(1);
+
+  unsigned paramCount = 0;
+  // Args.size() and Outs.size() need not match.
+  // Outs.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Outs)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Outs.
+  // So a different index should be used for indexing into Outs/OutVals.
+  // See similar issue in LowerFormalArguments.
+  unsigned OIdx = 0;
+  // Declare the .params or .reg need to pass values
+  // to the function
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    EVT VT = Outs[OIdx].VT;
+    Type *Ty = Args[i].Ty;
+
+    if (Outs[OIdx].Flags.isByVal() == false) {
+      if (Ty->isAggregateType()) {
+        // aggregate
+        SmallVector<EVT, 16> vtparts;
+        SmallVector<uint64_t, 16> Offsets;
+        ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
+
+        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
+                                      DAG.getConstant(paramCount, MVT::i32),
+                                      DAG.getConstant(sz, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps);
+        InFlag = Chain.getValue(1);
+        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+          EVT elemtype = vtparts[j];
+          unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+          if (elemtype.isInteger() && (sz < 8))
+            sz = 8;
+          SDValue StVal = OutVals[OIdx];
+          if (elemtype.getSizeInBits() < 16) {
+            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+          }
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, MVT::i32),
+                                     DAG.getConstant(Offsets[j], MVT::i32),
+                                     StVal, InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                          CopyParamVTs, CopyParamOps,
+                                          elemtype, MachinePointerInfo(),
+                                          ArgAlign);
+          InFlag = Chain.getValue(1);
+          ++OIdx;
+        }
+        if (vtparts.size() > 0)
+          --OIdx;
+        ++paramCount;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(Ty);
+        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
+                                      DAG.getConstant(paramCount, MVT::i32),
+                                      DAG.getConstant(sz, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps);
+        InFlag = Chain.getValue(1);
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        EVT EltVT = ObjectVT.getVectorElementType();
+        EVT MemVT = EltVT;
+        bool NeedExtend = false;
+        if (EltVT.getSizeInBits() < 16) {
+          NeedExtend = true;
+          EltVT = MVT::i16;
+        }
+
+        // V1 store
+        if (NumElts == 1) {
+          SDValue Elt = OutVals[OIdx++];
+          if (NeedExtend)
+            Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
+
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, MVT::i32),
+                                     DAG.getConstant(0, MVT::i32), Elt,
+                                     InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                          CopyParamVTs, CopyParamOps,
+                                          MemVT, MachinePointerInfo());
+          InFlag = Chain.getValue(1);
+        } else if (NumElts == 2) {
+          SDValue Elt0 = OutVals[OIdx++];
+          SDValue Elt1 = OutVals[OIdx++];
+          if (NeedExtend) {
+            Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
+            Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+          }
+
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, MVT::i32),
+                                     DAG.getConstant(0, MVT::i32), Elt0, Elt1,
+                                     InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
+                                          CopyParamVTs, CopyParamOps,
+                                          MemVT, MachinePointerInfo());
+          InFlag = Chain.getValue(1);
+        } else {
+          unsigned curOffset = 0;
+          // V4 stores
+          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+          // the
+          // vector will be expanded to a power of 2 elements, so we know we can
+          // always round up to the next multiple of 4 when creating the vector
+          // stores.
+          // e.g.  4 elem => 1 st.v4
+          //       6 elem => 2 st.v4
+          //       8 elem => 2 st.v4
+          //      11 elem => 3 st.v4
+          unsigned VecSize = 4;
+          if (EltVT.getSizeInBits() == 64)
+            VecSize = 2;
+
+          // This is potentially only part of a vector, so assume all elements
+          // are packed together.
+          unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
+
+          for (unsigned i = 0; i < NumElts; i += VecSize) {
+            // Get values
+            SDValue StoreVal;
+            SmallVector<SDValue, 8> Ops;
+            Ops.push_back(Chain);
+            Ops.push_back(DAG.getConstant(paramCount, MVT::i32));
+            Ops.push_back(DAG.getConstant(curOffset, MVT::i32));
+
+            unsigned Opc = NVPTXISD::StoreParamV2;
+
+            StoreVal = OutVals[OIdx++];
+            if (NeedExtend)
+              StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+            Ops.push_back(StoreVal);
+
+            if (i + 1 < NumElts) {
+              StoreVal = OutVals[OIdx++];
+              if (NeedExtend)
+                StoreVal =
+                    DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+            } else {
+              StoreVal = DAG.getUNDEF(EltVT);
+            }
+            Ops.push_back(StoreVal);
+
+            if (VecSize == 4) {
+              Opc = NVPTXISD::StoreParamV4;
+              if (i + 2 < NumElts) {
+                StoreVal = OutVals[OIdx++];
+                if (NeedExtend)
+                  StoreVal =
+                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+              } else {
+                StoreVal = DAG.getUNDEF(EltVT);
+              }
+              Ops.push_back(StoreVal);
+
+              if (i + 3 < NumElts) {
+                StoreVal = OutVals[OIdx++];
+                if (NeedExtend)
+                  StoreVal =
+                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+              } else {
+                StoreVal = DAG.getUNDEF(EltVT);
+              }
+              Ops.push_back(StoreVal);
+            }
+
+            Ops.push_back(InFlag);
+
+            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
+                                            MemVT, MachinePointerInfo());
+            InFlag = Chain.getValue(1);
+            curOffset += PerStoreOffset;
+          }
+        }
+        ++paramCount;
+        --OIdx;
+        continue;
+      }
+      // Plain scalar
+      // for ABI,    declare .param .b<size> .param<n>;
+      unsigned sz = VT.getSizeInBits();
+      bool needExtend = false;
+      if (VT.isInteger()) {
+        if (sz < 16)
+          needExtend = true;
+        if (sz < 32)
+          sz = 32;
+      }
+      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareParamOps[] = { Chain,
+                                    DAG.getConstant(paramCount, MVT::i32),
+                                    DAG.getConstant(sz, MVT::i32),
+                                    DAG.getConstant(0, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                          DeclareParamOps);
+      InFlag = Chain.getValue(1);
+      SDValue OutV = OutVals[OIdx];
+      if (needExtend) {
+        // zext/sext i1 to i16
+        unsigned opc = ISD::ZERO_EXTEND;
+        if (Outs[OIdx].Flags.isSExt())
+          opc = ISD::SIGN_EXTEND;
+        OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
+      }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+                                 DAG.getConstant(0, MVT::i32), OutV, InFlag };
+
+      unsigned opcode = NVPTXISD::StoreParam;
+      if (Outs[OIdx].Flags.isZExt())
+        opcode = NVPTXISD::StoreParamU32;
+      else if (Outs[OIdx].Flags.isSExt())
+        opcode = NVPTXISD::StoreParamS32;
+      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
+                                      VT, MachinePointerInfo());
+
+      InFlag = Chain.getValue(1);
+      ++paramCount;
+      continue;
+    }
+    // struct or vector
+    SmallVector<EVT, 16> vtparts;
+    SmallVector<uint64_t, 16> Offsets;
+    const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
+    assert(PTy && "Type of a byval parameter should be pointer");
+    ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
+
+    // declare .param .align <align> .b8 .param<n>[<size>];
+    unsigned sz = Outs[OIdx].Flags.getByValSize();
+    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+    // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
+    // so we don't need to worry about natural alignment or not.
+    // See TargetLowering::LowerCallTo().
+    SDValue DeclareParamOps[] = {
+      Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32),
+      DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
+      InFlag
+    };
+    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                        DeclareParamOps);
+    InFlag = Chain.getValue(1);
+    for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+      EVT elemtype = vtparts[j];
+      int curOffset = Offsets[j];
+      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+      SDValue srcAddr =
+          DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
+                      DAG.getConstant(curOffset, getPointerTy()));
+      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                                   MachinePointerInfo(), false, false, false,
+                                   PartAlign);
+      if (elemtype.getSizeInBits() < 16) {
+        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+      }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+                                 DAG.getConstant(curOffset, MVT::i32), theVal,
+                                 InFlag };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                                      CopyParamOps, elemtype,
+                                      MachinePointerInfo());
+
+      InFlag = Chain.getValue(1);
+    }
+    ++paramCount;
+  }
+
+  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+  unsigned retAlignment = 0;
+
+  // Handle Result
+  if (Ins.size() > 0) {
+    SmallVector<EVT, 16> resvtparts;
+    ComputeValueVTs(*this, retTy, resvtparts);
+
+    // Declare
+    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
+    //  .param .b<size-in-bits> retval0
+    unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
+    if (retTy->isSingleValueType()) {
+      // Scalar needs to be at least 32bit wide
+      if (resultsz < 32)
+        resultsz = 32;
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+                                  DAG.getConstant(resultsz, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                          DeclareRetOps);
+      InFlag = Chain.getValue(1);
+    } else {
+      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain,
+                                  DAG.getConstant(retAlignment, MVT::i32),
+                                  DAG.getConstant(resultsz / 8, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+                          DeclareRetOps);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  if (!Func) {
+    // This is indirect function call case : PTX requires a prototype of the
+    // form
+    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+    // to be emitted, and the label has to used as the last arg of call
+    // instruction.
+    // The prototype is embedded in a string and put as the operand for a
+    // CallPrototype SDNode which will print out to the value of the string.
+    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
+    const char *ProtoStr =
+      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+    SDValue ProtoOps[] = {
+      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+    };
+    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+    InFlag = Chain.getValue(1);
+  }
+  // Op to just print "call"
+  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue PrintCallOps[] = {
+    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag
+  };
+  Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
+                      dl, PrintCallVTs, PrintCallOps);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the function name
+  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the param list
+  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgBeginOps[] = { Chain, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+                      CallArgBeginOps);
+  InFlag = Chain.getValue(1);
+
+  for (unsigned i = 0, e = paramCount; i != e; ++i) {
+    unsigned opcode;
+    if (i == (e - 1))
+      opcode = NVPTXISD::LastCallArg;
+    else
+      opcode = NVPTXISD::CallArg;
+    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+                             DAG.getConstant(i, MVT::i32), InFlag };
+    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+    InFlag = Chain.getValue(1);
+  }
+  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 1 : 0, MVT::i32),
+                              InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
+  InFlag = Chain.getValue(1);
+
+  if (!Func) {
+    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32),
+                               InFlag };
+    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Generate loads from param memory/moves from registers for result
+  if (Ins.size() > 0) {
+    if (retTy && retTy->isVectorTy()) {
+      EVT ObjectVT = getValueType(retTy);
+      unsigned NumElts = ObjectVT.getVectorNumElements();
+      EVT EltVT = ObjectVT.getVectorElementType();
+      assert(nvTM->getTargetLowering()->getNumRegisters(F->getContext(),
+                                                        ObjectVT) == NumElts &&
+             "Vector was not scalarized");
+      unsigned sz = EltVT.getSizeInBits();
+      bool needTruncate = sz < 8 ? true : false;
+
+      if (NumElts == 1) {
+        // Just a simple load
+        SmallVector<EVT, 4> LoadRetVTs;
+        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+          // If loading i1/i8 result, generate
+          //   load.b8 i16
+          //   if i1
+          //   trunc i16 to i1
+          LoadRetVTs.push_back(MVT::i16);
+        } else
+          LoadRetVTs.push_back(EltVT);
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+        SmallVector<SDValue, 4> LoadRetOps;
+        LoadRetOps.push_back(Chain);
+        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
+        LoadRetOps.push_back(InFlag);
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParam, dl,
+            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+        Chain = retval.getValue(1);
+        InFlag = retval.getValue(2);
+        SDValue Ret0 = retval;
+        if (needTruncate)
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
+        InVals.push_back(Ret0);
+      } else if (NumElts == 2) {
+        // LoadV2
+        SmallVector<EVT, 4> LoadRetVTs;
+        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+          // If loading i1/i8 result, generate
+          //   load.b8 i16
+          //   if i1
+          //   trunc i16 to i1
+          LoadRetVTs.push_back(MVT::i16);
+          LoadRetVTs.push_back(MVT::i16);
+        } else {
+          LoadRetVTs.push_back(EltVT);
+          LoadRetVTs.push_back(EltVT);
+        }
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+        SmallVector<SDValue, 4> LoadRetOps;
+        LoadRetOps.push_back(Chain);
+        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
+        LoadRetOps.push_back(InFlag);
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParamV2, dl,
+            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+        Chain = retval.getValue(2);
+        InFlag = retval.getValue(3);
+        SDValue Ret0 = retval.getValue(0);
+        SDValue Ret1 = retval.getValue(1);
+        if (needTruncate) {
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
+          InVals.push_back(Ret0);
+          Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
+          InVals.push_back(Ret1);
+        } else {
+          InVals.push_back(Ret0);
+          InVals.push_back(Ret1);
+        }
+      } else {
+        // Split into N LoadV4
+        unsigned Ofst = 0;
+        unsigned VecSize = 4;
+        unsigned Opc = NVPTXISD::LoadParamV4;
+        if (EltVT.getSizeInBits() == 64) {
+          VecSize = 2;
+          Opc = NVPTXISD::LoadParamV2;
+        }
+        EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+        for (unsigned i = 0; i < NumElts; i += VecSize) {
+          SmallVector<EVT, 8> LoadRetVTs;
+          if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+            // If loading i1/i8 result, generate
+            //   load.b8 i16
+            //   if i1
+            //   trunc i16 to i1
+            for (unsigned j = 0; j < VecSize; ++j)
+              LoadRetVTs.push_back(MVT::i16);
+          } else {
+            for (unsigned j = 0; j < VecSize; ++j)
+              LoadRetVTs.push_back(EltVT);
+          }
+          LoadRetVTs.push_back(MVT::Other);
+          LoadRetVTs.push_back(MVT::Glue);
+          SmallVector<SDValue, 4> LoadRetOps;
+          LoadRetOps.push_back(Chain);
+          LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+          LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32));
+          LoadRetOps.push_back(InFlag);
+          SDValue retval = DAG.getMemIntrinsicNode(
+              Opc, dl, DAG.getVTList(LoadRetVTs),
+              LoadRetOps, EltVT, MachinePointerInfo());
+          if (VecSize == 2) {
+            Chain = retval.getValue(2);
+            InFlag = retval.getValue(3);
+          } else {
+            Chain = retval.getValue(4);
+            InFlag = retval.getValue(5);
+          }
+
+          for (unsigned j = 0; j < VecSize; ++j) {
+            if (i + j >= NumElts)
+              break;
+            SDValue Elt = retval.getValue(j);
+            if (needTruncate)
+              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+            InVals.push_back(Elt);
+          }
+          Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+        }
+      }
+    } else {
+      SmallVector<EVT, 16> VTs;
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
+      assert(VTs.size() == Ins.size() && "Bad value decomposition");
+      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        unsigned sz = VTs[i].getSizeInBits();
+        unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+        bool needTruncate = sz < 8 ? true : false;
+        if (VTs[i].isInteger() && (sz < 8))
+          sz = 8;
+
+        SmallVector<EVT, 4> LoadRetVTs;
+        EVT TheLoadType = VTs[i];
+        if (retTy->isIntegerTy() &&
+            TD->getTypeAllocSizeInBits(retTy) < 32) {
+          // This is for integer types only, and specifically not for
+          // aggregates.
+          LoadRetVTs.push_back(MVT::i32);
+          TheLoadType = MVT::i32;
+        } else if (sz < 16) {
+          // If loading i1/i8 result, generate
+          //   load i8 (-> i16)
+          //   trunc i16 to i1/i8
+          LoadRetVTs.push_back(MVT::i16);
+        } else
+          LoadRetVTs.push_back(Ins[i].VT);
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+
+        SmallVector<SDValue, 4> LoadRetOps;
+        LoadRetOps.push_back(Chain);
+        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
+        LoadRetOps.push_back(InFlag);
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParam, dl,
+            DAG.getVTList(LoadRetVTs), LoadRetOps,
+            TheLoadType, MachinePointerInfo(), AlignI);
+        Chain = retval.getValue(1);
+        InFlag = retval.getValue(2);
+        SDValue Ret0 = retval.getValue(0);
+        if (needTruncate)
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
+        InVals.push_back(Ret0);
+      }
+    }
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
+                             DAG.getIntPtrConstant(uniqueCallSite + 1, true),
+                             InFlag, dl);
+  uniqueCallSite++;
+
+  // set isTailCall to false for now, until we figure out how to express
+  // tail call optimization in PTX
+  isTailCall = false;
+  return Chain;
+}
+
+// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+// (see LegalizeDAG.cpp). This is slow and uses local memory.
+// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
+SDValue
+NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    SDValue SubOp = Node->getOperand(i);
+    EVT VVT = SubOp.getNode()->getValueType(0);
+    EVT EltVT = VVT.getVectorElementType();
+    unsigned NumSubElem = VVT.getVectorNumElements();
+    for (unsigned j = 0; j < NumSubElem; ++j) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+                                DAG.getIntPtrConstant(j)));
+    }
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
+}
+
+/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+///    amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+///    amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+    // {dHi, dLo} = {aHi, aLo} >> Amt
+    //   dHi = aHi >> Amt
+    //   dLo = shf.r.clamp aLo, aHi, Amt
+
+    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+
+    // {dHi, dLo} = {aHi, aLo} >> Amt
+    // - if (Amt>=size) then
+    //      dLo = aHi >> (Amt-size)
+    //      dHi = aHi >> Amt (this is either all 0 or all 1)
+    //   else
+    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+    //      dHi = aHi >> Amt
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+///    amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+///    amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+
+  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    //   dHi = shf.l.clamp aLo, aHi, Amt
+    //   dLo = aLo << Amt
+
+    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    // - if (Amt>=size) then
+    //      dLo = aLo << Amt (all 0)
+    //      dLo = aLo << (Amt-size)
+    //   else
+    //      dLo = aLo << Amt
+    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
+SDValue
+NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::RETURNADDR:
+    return SDValue();
+  case ISD::FRAMEADDR:
+    return SDValue();
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return Op;
+  case ISD::BUILD_VECTOR:
+  case ISD::EXTRACT_SUBVECTOR:
+    return Op;
+  case ISD::CONCAT_VECTORS:
+    return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
+  case ISD::LOAD:
+    return LowerLOAD(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  default:
+    llvm_unreachable("Custom lowering not defined for operation");
+  }
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::i1)
+    return LowerLOADi1(Op, DAG);
+  else
+    return SDValue();
+}
+
+// v = ld i1* addr
+//   =>
+// v1 = ld i8* addr (-> i16)
+// v = trunc i16 to i1
+SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(Node);
+  SDLoc dl(Node);
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+  assert(Node->getValueType(0) == MVT::i1 &&
+         "Custom lowering for i1 load only");
+  SDValue newLD =
+      DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+                  LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
+                  LD->isInvariant(), LD->getAlignment());
+  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+  // The legalizer (the caller) is expecting two values from the legalized
+  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+  // in LegalizeDAG.cpp which also uses MergeValues.
+  SDValue Ops[] = { result, LD->getChain() };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  EVT ValVT = Op.getOperand(1).getValueType();
+  if (ValVT == MVT::i1)
+    return LowerSTOREi1(Op, DAG);
+  else if (ValVT.isVector())
+    return LowerSTOREVector(Op, DAG);
+  else
+    return SDValue();
+}
+
+SDValue
+NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *N = Op.getNode();
+  SDValue Val = N->getOperand(1);
+  SDLoc DL(N);
+  EVT ValVT = Val.getValueType();
+
+  if (ValVT.isVector()) {
+    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+    // legal.  We can (and should) split that into 2 stores of <2 x double> here
+    // but I'm leaving that as a TODO for now.
+    if (!ValVT.isSimple())
+      return SDValue();
+    switch (ValVT.getSimpleVT().SimpleTy) {
+    default:
+      return SDValue();
+    case MVT::v2i8:
+    case MVT::v2i16:
+    case MVT::v2i32:
+    case MVT::v2i64:
+    case MVT::v2f32:
+    case MVT::v2f64:
+    case MVT::v4i8:
+    case MVT::v4i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      // This is a "native" vector type
+      break;
+    }
+
+    MemSDNode *MemSD = cast<MemSDNode>(N);
+    const DataLayout *TD = getDataLayout();
+
+    unsigned Align = MemSD->getAlignment();
+    unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+    if (Align < PrefAlign) {
+      // This store is not sufficiently aligned, so bail out and let this vector
+      // store be scalarized.  Note that we may still be able to emit smaller
+      // vector stores.  For example, if we are storing a <4 x float> with an
+      // alignment of 8, this check will fail but the legalizer will try again
+      // with 2 x <2 x float>, which will succeed with an alignment of 8.
+      return SDValue();
+    }
+
+    unsigned Opcode = 0;
+    EVT EltVT = ValVT.getVectorElementType();
+    unsigned NumElts = ValVT.getVectorNumElements();
+
+    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+    // stored type to i16 and propagate the "real" type as the memory type.
+    bool NeedExt = false;
+    if (EltVT.getSizeInBits() < 16)
+      NeedExt = true;
+
+    switch (NumElts) {
+    default:
+      return SDValue();
+    case 2:
+      Opcode = NVPTXISD::StoreV2;
+      break;
+    case 4: {
+      Opcode = NVPTXISD::StoreV4;
+      break;
+    }
+    }
+
+    SmallVector<SDValue, 8> Ops;
+
+    // First is the chain
+    Ops.push_back(N->getOperand(0));
+
+    // Then the split values
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+                                   DAG.getIntPtrConstant(i));
+      if (NeedExt)
+        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+      Ops.push_back(ExtVal);
+    }
+
+    // Then any remaining arguments
+    for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+
+    SDValue NewSt = DAG.getMemIntrinsicNode(
+        Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+        MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+    //return DCI.CombineTo(N, NewSt, true);
+    return NewSt;
+  }
+
+  return SDValue();
+}
+
+// st i1 v, addr
+//    =>
+// v1 = zxt v to i16
+// st.u8 i16, addr
+SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  StoreSDNode *ST = cast<StoreSDNode>(Node);
+  SDValue Tmp1 = ST->getChain();
+  SDValue Tmp2 = ST->getBasePtr();
+  SDValue Tmp3 = ST->getValue();
+  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+  unsigned Alignment = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  bool isNonTemporal = ST->isNonTemporal();
+  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
+  SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
+                                     ST->getPointerInfo(), MVT::i8, isNonTemporal,
+                                     isVolatile, Alignment);
+  return Result;
+}
+
+SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
+                                        int idx, EVT v) const {
+  std::string *name = nvTM->getManagedStrPool()->getManagedString(inname);
+  std::stringstream suffix;
+  suffix << idx;
+  *name += suffix.str();
+  return DAG.getTargetExternalSymbol(name->c_str(), v);
+}
+
+SDValue
+NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+  std::string ParamSym;
+  raw_string_ostream ParamStr(ParamSym);
+
+  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
+  ParamStr.flush();
+
+  std::string *SavedStr =
+    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
+  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
+}
+
+SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
+  return getExtSymb(DAG, ".HLPPARAM", idx);
+}
+
+// Check to see if the kernel argument is image*_t or sampler_t
+
+bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
+  static const char *const specialTypes[] = { "struct._image2d_t",
+                                              "struct._image3d_t",
+                                              "struct._sampler_t" };
+
+  const Type *Ty = arg->getType();
+  const PointerType *PTy = dyn_cast<PointerType>(Ty);
+
+  if (!PTy)
+    return false;
+
+  if (!context)
+    return false;
+
+  const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
+  const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
+
+  for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
+    if (TypeName == specialTypes[i])
+      return true;
+
+  return false;
+}
+
+SDValue NVPTXTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const DataLayout *TD = getDataLayout();
+
+  const Function *F = MF.getFunction();
+  const AttributeSet &PAL = F->getAttributes();
+  const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
+
+  SDValue Root = DAG.getRoot();
+  std::vector<SDValue> OutChains;
+
+  bool isKernel = llvm::isKernelFunction(*F);
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+
+  std::vector<Type *> argTypes;
+  std::vector<const Argument *> theArgs;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I) {
+    theArgs.push_back(I);
+    argTypes.push_back(I->getType());
+  }
+  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+  // Ins.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Ins)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Ins.
+  // So a different index should be used for indexing into Ins.
+  // See similar issue in LowerCall.
+  unsigned InsIdx = 0;
+
+  int idx = 0;
+  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+    Type *Ty = argTypes[i];
+
+    // If the kernel argument is image*_t or sampler_t, convert it to
+    // a i32 constant holding the parameter position. This can later
+    // matched in the AsmPrinter to output the correct mangled name.
+    if (isImageOrSamplerVal(
+            theArgs[i],
+            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+                                     : nullptr))) {
+      assert(isKernel && "Only kernels can have image/sampler params");
+      InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
+      continue;
+    }
+
+    if (theArgs[i]->use_empty()) {
+      // argument is dead
+      if (Ty->isAggregateType()) {
+        SmallVector<EVT, 16> vtparts;
+
+        ComputePTXValueVTs(*this, Ty, vtparts);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(Ty);
+        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+        for (unsigned parti = 0; parti < NumRegs; ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (NumRegs > 0)
+          --InsIdx;
+        continue;
+      }
+      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+      continue;
+    }
+
+    // In the following cases, assign a node order of "idx+1"
+    // to newly created nodes. The SDNodes for params have to
+    // appear in the same order as their order of appearance
+    // in the original function. "idx+1" holds that order.
+    if (PAL.hasAttribute(i + 1, Attribute::ByVal) == false) {
+      if (Ty->isAggregateType()) {
+        SmallVector<EVT, 16> vtparts;
+        SmallVector<uint64_t, 16> offsets;
+
+        // NOTE: Here, we lose the ability to issue vector loads for vectors
+        // that are a part of a struct.  This should be investigated in the
+        // future.
+        ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        bool aggregateIsPacked = false;
+        if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
+          aggregateIsPacked = STy->isPacked();
+
+        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          EVT partVT = vtparts[parti];
+          Value *srcValue = Constant::getNullValue(
+              PointerType::get(partVT.getTypeForEVT(F->getContext()),
+                               llvm::ADDRESS_SPACE_PARAM));
+          SDValue srcAddr =
+              DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                          DAG.getConstant(offsets[parti], getPointerTy()));
+          unsigned partAlign =
+              aggregateIsPacked ? 1
+                                : TD->getABITypeAlignment(
+                                      partVT.getTypeForEVT(F->getContext()));
+          SDValue p;
+          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
+            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
+                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
+            p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
+                               MachinePointerInfo(srcValue), partVT, false,
+                               false, partAlign);
+          } else {
+            p = DAG.getLoad(partVT, dl, Root, srcAddr,
+                            MachinePointerInfo(srcValue), false, false, false,
+                            partAlign);
+          }
+          if (p.getNode())
+            p.getNode()->setIROrder(idx + 1);
+          InVals.push_back(p);
+          ++InsIdx;
+        }
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(Ty);
+        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
+               "Vector was not scalarized");
+        unsigned Ofst = 0;
+        EVT EltVT = ObjectVT.getVectorElementType();
+
+        // V1 load
+        // f32 = load ...
+        if (NumElts == 1) {
+          // We only have one element, so just directly load it
+          Value *SrcValue = Constant::getNullValue(PointerType::get(
+              EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+          SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                                        DAG.getConstant(Ofst, getPointerTy()));
+          SDValue P = DAG.getLoad(
+              EltVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+              false, true,
+              TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
+          if (P.getNode())
+            P.getNode()->setIROrder(idx + 1);
+
+          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+            P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
+          InVals.push_back(P);
+          Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext()));
+          ++InsIdx;
+        } else if (NumElts == 2) {
+          // V2 load
+          // f32,f32 = load ...
+          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
+          Value *SrcValue = Constant::getNullValue(PointerType::get(
+              VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+          SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                                        DAG.getConstant(Ofst, getPointerTy()));
+          SDValue P = DAG.getLoad(
+              VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+              false, true,
+              TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+          if (P.getNode())
+            P.getNode()->setIROrder(idx + 1);
+
+          SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                     DAG.getIntPtrConstant(0));
+          SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                     DAG.getIntPtrConstant(1));
+
+          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
+            Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
+            Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
+          }
+
+          InVals.push_back(Elt0);
+          InVals.push_back(Elt1);
+          Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+          InsIdx += 2;
+        } else {
+          // V4 loads
+          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+          // the
+          // vector will be expanded to a power of 2 elements, so we know we can
+          // always round up to the next multiple of 4 when creating the vector
+          // loads.
+          // e.g.  4 elem => 1 ld.v4
+          //       6 elem => 2 ld.v4
+          //       8 elem => 2 ld.v4
+          //      11 elem => 3 ld.v4
+          unsigned VecSize = 4;
+          if (EltVT.getSizeInBits() == 64) {
+            VecSize = 2;
+          }
+          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+          for (unsigned i = 0; i < NumElts; i += VecSize) {
+            Value *SrcValue = Constant::getNullValue(
+                PointerType::get(VecVT.getTypeForEVT(F->getContext()),
+                                 llvm::ADDRESS_SPACE_PARAM));
+            SDValue SrcAddr =
+                DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                            DAG.getConstant(Ofst, getPointerTy()));
+            SDValue P = DAG.getLoad(
+                VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+                false, true,
+                TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+            if (P.getNode())
+              P.getNode()->setIROrder(idx + 1);
+
+            for (unsigned j = 0; j < VecSize; ++j) {
+              if (i + j >= NumElts)
+                break;
+              SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                        DAG.getIntPtrConstant(j));
+              if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+                Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
+              InVals.push_back(Elt);
+            }
+            Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+          }
+          InsIdx += NumElts;
+        }
+
+        if (NumElts > 0)
+          --InsIdx;
+        continue;
+      }
+      // A plain scalar.
+      EVT ObjectVT = getValueType(Ty);
+      // If ABI, load from the param symbol
+      SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+      Value *srcValue = Constant::getNullValue(PointerType::get(
+          ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+      SDValue p;
+       if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
+        ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
+                                       ISD::SEXTLOAD : ISD::ZEXTLOAD;
+        p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg,
+                           MachinePointerInfo(srcValue), ObjectVT, false, false,
+        TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+      } else {
+        p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg,
+                        MachinePointerInfo(srcValue), false, false, false,
+        TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+      }
+      if (p.getNode())
+        p.getNode()->setIROrder(idx + 1);
+      InVals.push_back(p);
+      continue;
+    }
+
+    // Param has ByVal attribute
+    // Return MoveParam(param symbol).
+    // Ideally, the param symbol can be returned directly,
+    // but when SDNode builder decides to use it in a CopyToReg(),
+    // machine instruction fails because TargetExternalSymbol
+    // (not lowered) is target dependent, and CopyToReg assumes
+    // the source is lowered.
+    EVT ObjectVT = getValueType(Ty);
+    assert(ObjectVT == Ins[InsIdx].VT &&
+           "Ins type did not match function type");
+    SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+    if (p.getNode())
+      p.getNode()->setIROrder(idx + 1);
+    if (isKernel)
+      InVals.push_back(p);
+    else {
+      SDValue p2 = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
+          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
+      InVals.push_back(p2);
+    }
+  }
+
+  // Clang will check explicit VarArg and issue error if any. However, Clang
+  // will let code with
+  // implicit var arg like f() pass. See bug 617733.
+  // We treat this case as if the arg list is empty.
+  // if (F.isVarArg()) {
+  // assert(0 && "VarArg not supported yet!");
+  //}
+
+  if (!OutChains.empty())
+    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
+
+  return Chain;
+}
+
+
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 SDLoc dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+  Type *RetTy = F->getReturnType();
+  const DataLayout *TD = getDataLayout();
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+
+  if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
+    // If we have a vector type, the OutVals array will be the scalarized
+    // components and we have combine them into 1 or more vector stores.
+    unsigned NumElts = VTy->getNumElements();
+    assert(NumElts == Outs.size() && "Bad scalarization of return value");
+
+    // const_cast can be removed in later LLVM versions
+    EVT EltVT = getValueType(RetTy).getVectorElementType();
+    bool NeedExtend = false;
+    if (EltVT.getSizeInBits() < 16)
+      NeedExtend = true;
+
+    // V1 store
+    if (NumElts == 1) {
+      SDValue StoreVal = OutVals[0];
+      // We only have one element, so just directly store it
+      if (NeedExtend)
+        StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+      SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                      DAG.getVTList(MVT::Other), Ops,
+                                      EltVT, MachinePointerInfo());
+
+    } else if (NumElts == 2) {
+      // V2 store
+      SDValue StoreVal0 = OutVals[0];
+      SDValue StoreVal1 = OutVals[1];
+
+      if (NeedExtend) {
+        StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
+        StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
+      }
+
+      SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0,
+                        StoreVal1 };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
+                                      DAG.getVTList(MVT::Other), Ops,
+                                      EltVT, MachinePointerInfo());
+    } else {
+      // V4 stores
+      // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+      // vector will be expanded to a power of 2 elements, so we know we can
+      // always round up to the next multiple of 4 when creating the vector
+      // stores.
+      // e.g.  4 elem => 1 st.v4
+      //       6 elem => 2 st.v4
+      //       8 elem => 2 st.v4
+      //      11 elem => 3 st.v4
+
+      unsigned VecSize = 4;
+      if (OutVals[0].getValueType().getSizeInBits() == 64)
+        VecSize = 2;
+
+      unsigned Offset = 0;
+
+      EVT VecVT =
+          EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+      unsigned PerStoreOffset =
+          TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+
+      for (unsigned i = 0; i < NumElts; i += VecSize) {
+        // Get values
+        SDValue StoreVal;
+        SmallVector<SDValue, 8> Ops;
+        Ops.push_back(Chain);
+        Ops.push_back(DAG.getConstant(Offset, MVT::i32));
+        unsigned Opc = NVPTXISD::StoreRetvalV2;
+        EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
+
+        StoreVal = OutVals[i];
+        if (NeedExtend)
+          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+        Ops.push_back(StoreVal);
+
+        if (i + 1 < NumElts) {
+          StoreVal = OutVals[i + 1];
+          if (NeedExtend)
+            StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+        } else {
+          StoreVal = DAG.getUNDEF(ExtendedVT);
+        }
+        Ops.push_back(StoreVal);
+
+        if (VecSize == 4) {
+          Opc = NVPTXISD::StoreRetvalV4;
+          if (i + 2 < NumElts) {
+            StoreVal = OutVals[i + 2];
+            if (NeedExtend)
+              StoreVal =
+                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+          } else {
+            StoreVal = DAG.getUNDEF(ExtendedVT);
+          }
+          Ops.push_back(StoreVal);
+
+          if (i + 3 < NumElts) {
+            StoreVal = OutVals[i + 3];
+            if (NeedExtend)
+              StoreVal =
+                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+          } else {
+            StoreVal = DAG.getUNDEF(ExtendedVT);
+          }
+          Ops.push_back(StoreVal);
+        }
+
+        // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
+        Chain =
+            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
+                                    EltVT, MachinePointerInfo());
+        Offset += PerStoreOffset;
+      }
+    }
+  } else {
+    SmallVector<EVT, 16> ValVTs;
+    SmallVector<uint64_t, 16> Offsets;
+    ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
+    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
+
+    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+      SDValue theVal = OutVals[i];
+      EVT TheValType = theVal.getValueType();
+      unsigned numElems = 1;
+      if (TheValType.isVector())
+        numElems = TheValType.getVectorNumElements();
+      for (unsigned j = 0, je = numElems; j != je; ++j) {
+        SDValue TmpVal = theVal;
+        if (TheValType.isVector())
+          TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                               TheValType.getVectorElementType(), TmpVal,
+                               DAG.getIntPtrConstant(j));
+        EVT TheStoreType = ValVTs[i];
+        if (RetTy->isIntegerTy() &&
+            TD->getTypeAllocSizeInBits(RetTy) < 32) {
+          // The following zero-extension is for integer types only, and
+          // specifically not for aggregates.
+          TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
+          TheStoreType = MVT::i32;
+        }
+        else if (TmpVal.getValueType().getSizeInBits() < 16)
+          TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
+
+        SDValue Ops[] = {
+          Chain,
+          DAG.getConstant(Offsets[i], MVT::i32),
+          TmpVal };
+        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                        DAG.getVTList(MVT::Other), Ops,
+                                        TheStoreType,
+                                        MachinePointerInfo());
+      }
+    }
+  }
+
+  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+
+void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  if (Constraint.length() > 1)
+    return;
+  else
+    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// NVPTX suuport vector of legal types of any length in Intrinsics because the
+// NVPTX specific type legalizer
+// will legalize them to the PTX supported length.
+bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
+  if (isTypeLegal(VT))
+    return true;
+  if (VT.isVector()) {
+    MVT eVT = VT.getVectorElementType();
+    if (isTypeLegal(eVT))
+      return true;
+  }
+  return false;
+}
+
+static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+  switch (Intrinsic) {
+  default:
+    return 0;
+
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
+    return NVPTXISD::Tex1DFloatS32;
+  case Intrinsic::nvvm_tex_1d_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloat;
+  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4s32_s32:
+    return NVPTXISD::Tex1DS32S32;
+  case Intrinsic::nvvm_tex_1d_v4s32_f32:
+    return NVPTXISD::Tex1DS32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+    return NVPTXISD::Tex1DS32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+    return NVPTXISD::Tex1DS32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4u32_s32:
+    return NVPTXISD::Tex1DU32S32;
+  case Intrinsic::nvvm_tex_1d_v4u32_f32:
+    return NVPTXISD::Tex1DU32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+    return NVPTXISD::Tex1DU32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+    return NVPTXISD::Tex1DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+    return NVPTXISD::Tex1DArrayFloatS32;
+  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+    return NVPTXISD::Tex1DArrayS32S32;
+  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+    return NVPTXISD::Tex1DArrayU32S32;
+  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_2d_v4f32_s32:
+    return NVPTXISD::Tex2DFloatS32;
+  case Intrinsic::nvvm_tex_2d_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloat;
+  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4s32_s32:
+    return NVPTXISD::Tex2DS32S32;
+  case Intrinsic::nvvm_tex_2d_v4s32_f32:
+    return NVPTXISD::Tex2DS32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+    return NVPTXISD::Tex2DS32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+    return NVPTXISD::Tex2DS32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4u32_s32:
+    return NVPTXISD::Tex2DU32S32;
+  case Intrinsic::nvvm_tex_2d_v4u32_f32:
+    return NVPTXISD::Tex2DU32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+    return NVPTXISD::Tex2DU32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+    return NVPTXISD::Tex2DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+    return NVPTXISD::Tex2DArrayFloatS32;
+  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+    return NVPTXISD::Tex2DArrayS32S32;
+  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+    return NVPTXISD::Tex2DArrayU32S32;
+  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_3d_v4f32_s32:
+    return NVPTXISD::Tex3DFloatS32;
+  case Intrinsic::nvvm_tex_3d_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloat;
+  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4s32_s32:
+    return NVPTXISD::Tex3DS32S32;
+  case Intrinsic::nvvm_tex_3d_v4s32_f32:
+    return NVPTXISD::Tex3DS32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+    return NVPTXISD::Tex3DS32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+    return NVPTXISD::Tex3DS32FloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4u32_s32:
+    return NVPTXISD::Tex3DU32S32;
+  case Intrinsic::nvvm_tex_3d_v4u32_f32:
+    return NVPTXISD::Tex3DU32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+    return NVPTXISD::Tex3DU32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+    return NVPTXISD::Tex3DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_cube_v4f32_f32:
+    return NVPTXISD::TexCubeFloatFloat;
+  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+    return NVPTXISD::TexCubeFloatFloatLevel;
+  case Intrinsic::nvvm_tex_cube_v4s32_f32:
+    return NVPTXISD::TexCubeS32Float;
+  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+    return NVPTXISD::TexCubeS32FloatLevel;
+  case Intrinsic::nvvm_tex_cube_v4u32_f32:
+    return NVPTXISD::TexCubeU32Float;
+  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+    return NVPTXISD::TexCubeU32FloatLevel;
+
+  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+    return NVPTXISD::TexCubeArrayFloatFloat;
+  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+    return NVPTXISD::TexCubeArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+    return NVPTXISD::TexCubeArrayS32Float;
+  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+    return NVPTXISD::TexCubeArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+    return NVPTXISD::TexCubeArrayU32Float;
+  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+    return NVPTXISD::TexCubeArrayU32FloatLevel;
+
+  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+    return NVPTXISD::Tld4R2DFloatFloat;
+  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+    return NVPTXISD::Tld4G2DFloatFloat;
+  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+    return NVPTXISD::Tld4B2DFloatFloat;
+  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+    return NVPTXISD::Tld4A2DFloatFloat;
+  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+    return NVPTXISD::Tld4R2DS64Float;
+  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+    return NVPTXISD::Tld4G2DS64Float;
+  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+    return NVPTXISD::Tld4B2DS64Float;
+  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+    return NVPTXISD::Tld4A2DS64Float;
+  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+    return NVPTXISD::Tld4R2DU64Float;
+  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+    return NVPTXISD::Tld4G2DU64Float;
+  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+    return NVPTXISD::Tld4B2DU64Float;
+  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+    return NVPTXISD::Tld4A2DU64Float;
+
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+    return NVPTXISD::TexUnified1DFloatS32;
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+    return NVPTXISD::TexUnified1DS32S32;
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32Float;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+    return NVPTXISD::TexUnified1DU32S32;
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32Float;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+    return NVPTXISD::TexUnified1DArrayFloatS32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+    return NVPTXISD::TexUnified1DArrayS32S32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+    return NVPTXISD::TexUnified1DArrayU32S32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+    return NVPTXISD::TexUnified2DFloatS32;
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+    return NVPTXISD::TexUnified2DS32S32;
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32Float;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+    return NVPTXISD::TexUnified2DU32S32;
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32Float;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+    return NVPTXISD::TexUnified2DArrayFloatS32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+    return NVPTXISD::TexUnified2DArrayS32S32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+    return NVPTXISD::TexUnified2DArrayU32S32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+    return NVPTXISD::TexUnified3DFloatS32;
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+    return NVPTXISD::TexUnified3DS32S32;
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32Float;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+    return NVPTXISD::TexUnified3DU32S32;
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32Float;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeFloatFloat;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeS32Float;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeU32Float;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+
+  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedR2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedG2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedB2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedA2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedR2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedG2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedB2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedA2DU64Float;
+  }
+}
+
+static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+  switch (Intrinsic) {
+  default:
+    return 0;
+  case Intrinsic::nvvm_suld_1d_i8_clamp:
+    return NVPTXISD::Suld1DI8Clamp;
+  case Intrinsic::nvvm_suld_1d_i16_clamp:
+    return NVPTXISD::Suld1DI16Clamp;
+  case Intrinsic::nvvm_suld_1d_i32_clamp:
+    return NVPTXISD::Suld1DI32Clamp;
+  case Intrinsic::nvvm_suld_1d_i64_clamp:
+    return NVPTXISD::Suld1DI64Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+    return NVPTXISD::Suld1DV2I8Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+    return NVPTXISD::Suld1DV2I16Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+    return NVPTXISD::Suld1DV2I32Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+    return NVPTXISD::Suld1DV2I64Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+    return NVPTXISD::Suld1DV4I8Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+    return NVPTXISD::Suld1DV4I16Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+    return NVPTXISD::Suld1DV4I32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+    return NVPTXISD::Suld1DArrayI8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+    return NVPTXISD::Suld1DArrayI16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+    return NVPTXISD::Suld1DArrayI32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+    return NVPTXISD::Suld1DArrayI64Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+    return NVPTXISD::Suld1DArrayV2I8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+    return NVPTXISD::Suld1DArrayV2I16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+    return NVPTXISD::Suld1DArrayV2I32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+    return NVPTXISD::Suld1DArrayV2I64Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+    return NVPTXISD::Suld1DArrayV4I8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+    return NVPTXISD::Suld1DArrayV4I16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+    return NVPTXISD::Suld1DArrayV4I32Clamp;
+  case Intrinsic::nvvm_suld_2d_i8_clamp:
+    return NVPTXISD::Suld2DI8Clamp;
+  case Intrinsic::nvvm_suld_2d_i16_clamp:
+    return NVPTXISD::Suld2DI16Clamp;
+  case Intrinsic::nvvm_suld_2d_i32_clamp:
+    return NVPTXISD::Suld2DI32Clamp;
+  case Intrinsic::nvvm_suld_2d_i64_clamp:
+    return NVPTXISD::Suld2DI64Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+    return NVPTXISD::Suld2DV2I8Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+    return NVPTXISD::Suld2DV2I16Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+    return NVPTXISD::Suld2DV2I32Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+    return NVPTXISD::Suld2DV2I64Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+    return NVPTXISD::Suld2DV4I8Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+    return NVPTXISD::Suld2DV4I16Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+    return NVPTXISD::Suld2DV4I32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+    return NVPTXISD::Suld2DArrayI8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+    return NVPTXISD::Suld2DArrayI16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+    return NVPTXISD::Suld2DArrayI32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+    return NVPTXISD::Suld2DArrayI64Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+    return NVPTXISD::Suld2DArrayV2I8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+    return NVPTXISD::Suld2DArrayV2I16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+    return NVPTXISD::Suld2DArrayV2I32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+    return NVPTXISD::Suld2DArrayV2I64Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+    return NVPTXISD::Suld2DArrayV4I8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+    return NVPTXISD::Suld2DArrayV4I16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+    return NVPTXISD::Suld2DArrayV4I32Clamp;
+  case Intrinsic::nvvm_suld_3d_i8_clamp:
+    return NVPTXISD::Suld3DI8Clamp;
+  case Intrinsic::nvvm_suld_3d_i16_clamp:
+    return NVPTXISD::Suld3DI16Clamp;
+  case Intrinsic::nvvm_suld_3d_i32_clamp:
+    return NVPTXISD::Suld3DI32Clamp;
+  case Intrinsic::nvvm_suld_3d_i64_clamp:
+    return NVPTXISD::Suld3DI64Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+    return NVPTXISD::Suld3DV2I8Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+    return NVPTXISD::Suld3DV2I16Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+    return NVPTXISD::Suld3DV2I32Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+    return NVPTXISD::Suld3DV2I64Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+    return NVPTXISD::Suld3DV4I8Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+    return NVPTXISD::Suld3DV4I16Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+    return NVPTXISD::Suld3DV4I32Clamp;
+  case Intrinsic::nvvm_suld_1d_i8_trap:
+    return NVPTXISD::Suld1DI8Trap;
+  case Intrinsic::nvvm_suld_1d_i16_trap:
+    return NVPTXISD::Suld1DI16Trap;
+  case Intrinsic::nvvm_suld_1d_i32_trap:
+    return NVPTXISD::Suld1DI32Trap;
+  case Intrinsic::nvvm_suld_1d_i64_trap:
+    return NVPTXISD::Suld1DI64Trap;
+  case Intrinsic::nvvm_suld_1d_v2i8_trap:
+    return NVPTXISD::Suld1DV2I8Trap;
+  case Intrinsic::nvvm_suld_1d_v2i16_trap:
+    return NVPTXISD::Suld1DV2I16Trap;
+  case Intrinsic::nvvm_suld_1d_v2i32_trap:
+    return NVPTXISD::Suld1DV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_v2i64_trap:
+    return NVPTXISD::Suld1DV2I64Trap;
+  case Intrinsic::nvvm_suld_1d_v4i8_trap:
+    return NVPTXISD::Suld1DV4I8Trap;
+  case Intrinsic::nvvm_suld_1d_v4i16_trap:
+    return NVPTXISD::Suld1DV4I16Trap;
+  case Intrinsic::nvvm_suld_1d_v4i32_trap:
+    return NVPTXISD::Suld1DV4I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_i8_trap:
+    return NVPTXISD::Suld1DArrayI8Trap;
+  case Intrinsic::nvvm_suld_1d_array_i16_trap:
+    return NVPTXISD::Suld1DArrayI16Trap;
+  case Intrinsic::nvvm_suld_1d_array_i32_trap:
+    return NVPTXISD::Suld1DArrayI32Trap;
+  case Intrinsic::nvvm_suld_1d_array_i64_trap:
+    return NVPTXISD::Suld1DArrayI64Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+    return NVPTXISD::Suld1DArrayV2I8Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+    return NVPTXISD::Suld1DArrayV2I16Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+    return NVPTXISD::Suld1DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+    return NVPTXISD::Suld1DArrayV2I64Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+    return NVPTXISD::Suld1DArrayV4I8Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+    return NVPTXISD::Suld1DArrayV4I16Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+    return NVPTXISD::Suld1DArrayV4I32Trap;
+  case Intrinsic::nvvm_suld_2d_i8_trap:
+    return NVPTXISD::Suld2DI8Trap;
+  case Intrinsic::nvvm_suld_2d_i16_trap:
+    return NVPTXISD::Suld2DI16Trap;
+  case Intrinsic::nvvm_suld_2d_i32_trap:
+    return NVPTXISD::Suld2DI32Trap;
+  case Intrinsic::nvvm_suld_2d_i64_trap:
+    return NVPTXISD::Suld2DI64Trap;
+  case Intrinsic::nvvm_suld_2d_v2i8_trap:
+    return NVPTXISD::Suld2DV2I8Trap;
+  case Intrinsic::nvvm_suld_2d_v2i16_trap:
+    return NVPTXISD::Suld2DV2I16Trap;
+  case Intrinsic::nvvm_suld_2d_v2i32_trap:
+    return NVPTXISD::Suld2DV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_v2i64_trap:
+    return NVPTXISD::Suld2DV2I64Trap;
+  case Intrinsic::nvvm_suld_2d_v4i8_trap:
+    return NVPTXISD::Suld2DV4I8Trap;
+  case Intrinsic::nvvm_suld_2d_v4i16_trap:
+    return NVPTXISD::Suld2DV4I16Trap;
+  case Intrinsic::nvvm_suld_2d_v4i32_trap:
+    return NVPTXISD::Suld2DV4I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_i8_trap:
+    return NVPTXISD::Suld2DArrayI8Trap;
+  case Intrinsic::nvvm_suld_2d_array_i16_trap:
+    return NVPTXISD::Suld2DArrayI16Trap;
+  case Intrinsic::nvvm_suld_2d_array_i32_trap:
+    return NVPTXISD::Suld2DArrayI32Trap;
+  case Intrinsic::nvvm_suld_2d_array_i64_trap:
+    return NVPTXISD::Suld2DArrayI64Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+    return NVPTXISD::Suld2DArrayV2I8Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+    return NVPTXISD::Suld2DArrayV2I16Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+    return NVPTXISD::Suld2DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+    return NVPTXISD::Suld2DArrayV2I64Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+    return NVPTXISD::Suld2DArrayV4I8Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+    return NVPTXISD::Suld2DArrayV4I16Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+    return NVPTXISD::Suld2DArrayV4I32Trap;
+  case Intrinsic::nvvm_suld_3d_i8_trap:
+    return NVPTXISD::Suld3DI8Trap;
+  case Intrinsic::nvvm_suld_3d_i16_trap:
+    return NVPTXISD::Suld3DI16Trap;
+  case Intrinsic::nvvm_suld_3d_i32_trap:
+    return NVPTXISD::Suld3DI32Trap;
+  case Intrinsic::nvvm_suld_3d_i64_trap:
+    return NVPTXISD::Suld3DI64Trap;
+  case Intrinsic::nvvm_suld_3d_v2i8_trap:
+    return NVPTXISD::Suld3DV2I8Trap;
+  case Intrinsic::nvvm_suld_3d_v2i16_trap:
+    return NVPTXISD::Suld3DV2I16Trap;
+  case Intrinsic::nvvm_suld_3d_v2i32_trap:
+    return NVPTXISD::Suld3DV2I32Trap;
+  case Intrinsic::nvvm_suld_3d_v2i64_trap:
+    return NVPTXISD::Suld3DV2I64Trap;
+  case Intrinsic::nvvm_suld_3d_v4i8_trap:
+    return NVPTXISD::Suld3DV4I8Trap;
+  case Intrinsic::nvvm_suld_3d_v4i16_trap:
+    return NVPTXISD::Suld3DV4I16Trap;
+  case Intrinsic::nvvm_suld_3d_v4i32_trap:
+    return NVPTXISD::Suld3DV4I32Trap;
+  case Intrinsic::nvvm_suld_1d_i8_zero:
+    return NVPTXISD::Suld1DI8Zero;
+  case Intrinsic::nvvm_suld_1d_i16_zero:
+    return NVPTXISD::Suld1DI16Zero;
+  case Intrinsic::nvvm_suld_1d_i32_zero:
+    return NVPTXISD::Suld1DI32Zero;
+  case Intrinsic::nvvm_suld_1d_i64_zero:
+    return NVPTXISD::Suld1DI64Zero;
+  case Intrinsic::nvvm_suld_1d_v2i8_zero:
+    return NVPTXISD::Suld1DV2I8Zero;
+  case Intrinsic::nvvm_suld_1d_v2i16_zero:
+    return NVPTXISD::Suld1DV2I16Zero;
+  case Intrinsic::nvvm_suld_1d_v2i32_zero:
+    return NVPTXISD::Suld1DV2I32Zero;
+  case Intrinsic::nvvm_suld_1d_v2i64_zero:
+    return NVPTXISD::Suld1DV2I64Zero;
+  case Intrinsic::nvvm_suld_1d_v4i8_zero:
+    return NVPTXISD::Suld1DV4I8Zero;
+  case Intrinsic::nvvm_suld_1d_v4i16_zero:
+    return NVPTXISD::Suld1DV4I16Zero;
+  case Intrinsic::nvvm_suld_1d_v4i32_zero:
+    return NVPTXISD::Suld1DV4I32Zero;
+  case Intrinsic::nvvm_suld_1d_array_i8_zero:
+    return NVPTXISD::Suld1DArrayI8Zero;
+  case Intrinsic::nvvm_suld_1d_array_i16_zero:
+    return NVPTXISD::Suld1DArrayI16Zero;
+  case Intrinsic::nvvm_suld_1d_array_i32_zero:
+    return NVPTXISD::Suld1DArrayI32Zero;
+  case Intrinsic::nvvm_suld_1d_array_i64_zero:
+    return NVPTXISD::Suld1DArrayI64Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+    return NVPTXISD::Suld1DArrayV2I8Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+    return NVPTXISD::Suld1DArrayV2I16Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+    return NVPTXISD::Suld1DArrayV2I32Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+    return NVPTXISD::Suld1DArrayV2I64Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+    return NVPTXISD::Suld1DArrayV4I8Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+    return NVPTXISD::Suld1DArrayV4I16Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+    return NVPTXISD::Suld1DArrayV4I32Zero;
+  case Intrinsic::nvvm_suld_2d_i8_zero:
+    return NVPTXISD::Suld2DI8Zero;
+  case Intrinsic::nvvm_suld_2d_i16_zero:
+    return NVPTXISD::Suld2DI16Zero;
+  case Intrinsic::nvvm_suld_2d_i32_zero:
+    return NVPTXISD::Suld2DI32Zero;
+  case Intrinsic::nvvm_suld_2d_i64_zero:
+    return NVPTXISD::Suld2DI64Zero;
+  case Intrinsic::nvvm_suld_2d_v2i8_zero:
+    return NVPTXISD::Suld2DV2I8Zero;
+  case Intrinsic::nvvm_suld_2d_v2i16_zero:
+    return NVPTXISD::Suld2DV2I16Zero;
+  case Intrinsic::nvvm_suld_2d_v2i32_zero:
+    return NVPTXISD::Suld2DV2I32Zero;
+  case Intrinsic::nvvm_suld_2d_v2i64_zero:
+    return NVPTXISD::Suld2DV2I64Zero;
+  case Intrinsic::nvvm_suld_2d_v4i8_zero:
+    return NVPTXISD::Suld2DV4I8Zero;
+  case Intrinsic::nvvm_suld_2d_v4i16_zero:
+    return NVPTXISD::Suld2DV4I16Zero;
+  case Intrinsic::nvvm_suld_2d_v4i32_zero:
+    return NVPTXISD::Suld2DV4I32Zero;
+  case Intrinsic::nvvm_suld_2d_array_i8_zero:
+    return NVPTXISD::Suld2DArrayI8Zero;
+  case Intrinsic::nvvm_suld_2d_array_i16_zero:
+    return NVPTXISD::Suld2DArrayI16Zero;
+  case Intrinsic::nvvm_suld_2d_array_i32_zero:
+    return NVPTXISD::Suld2DArrayI32Zero;
+  case Intrinsic::nvvm_suld_2d_array_i64_zero:
+    return NVPTXISD::Suld2DArrayI64Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+    return NVPTXISD::Suld2DArrayV2I8Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+    return NVPTXISD::Suld2DArrayV2I16Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+    return NVPTXISD::Suld2DArrayV2I32Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+    return NVPTXISD::Suld2DArrayV2I64Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+    return NVPTXISD::Suld2DArrayV4I8Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+    return NVPTXISD::Suld2DArrayV4I16Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+    return NVPTXISD::Suld2DArrayV4I32Zero;
+  case Intrinsic::nvvm_suld_3d_i8_zero:
+    return NVPTXISD::Suld3DI8Zero;
+  case Intrinsic::nvvm_suld_3d_i16_zero:
+    return NVPTXISD::Suld3DI16Zero;
+  case Intrinsic::nvvm_suld_3d_i32_zero:
+    return NVPTXISD::Suld3DI32Zero;
+  case Intrinsic::nvvm_suld_3d_i64_zero:
+    return NVPTXISD::Suld3DI64Zero;
+  case Intrinsic::nvvm_suld_3d_v2i8_zero:
+    return NVPTXISD::Suld3DV2I8Zero;
+  case Intrinsic::nvvm_suld_3d_v2i16_zero:
+    return NVPTXISD::Suld3DV2I16Zero;
+  case Intrinsic::nvvm_suld_3d_v2i32_zero:
+    return NVPTXISD::Suld3DV2I32Zero;
+  case Intrinsic::nvvm_suld_3d_v2i64_zero:
+    return NVPTXISD::Suld3DV2I64Zero;
+  case Intrinsic::nvvm_suld_3d_v4i8_zero:
+    return NVPTXISD::Suld3DV4I8Zero;
+  case Intrinsic::nvvm_suld_3d_v4i16_zero:
+    return NVPTXISD::Suld3DV4I16Zero;
+  case Intrinsic::nvvm_suld_3d_v4i32_zero:
+    return NVPTXISD::Suld3DV4I32Zero;
+  }
+}
+
+// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+// TgtMemIntrinsic
+// because we need the information that is only available in the "Value" type
+// of destination
+// pointer. In particular, the address space information.
+bool NVPTXTargetLowering::getTgtMemIntrinsic(
+    IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  default:
+    return false;
+
+  case Intrinsic::nvvm_atomic_load_add_f32:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::f32;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = true;
+    Info.align = 0;
+    return true;
+
+  case Intrinsic::nvvm_atomic_load_inc_32:
+  case Intrinsic::nvvm_atomic_load_dec_32:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i32;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = true;
+    Info.align = 0;
+    return true;
+
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p: {
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+      Info.memVT = getValueType(I.getType());
+    else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
+      Info.memVT = getPointerTy();
+    else
+      Info.memVT = getValueType(I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+
+    // alignment is available as metadata.
+    // Grab it and set the alignment.
+    assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+    MDNode *AlignMD = I.getMetadata("align");
+    assert(AlignMD && "Must have a non-null MDNode");
+    assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+    Value *Align = AlignMD->getOperand(0);
+    int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+    Info.align = Alignment;
+
+    return true;
+  }
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_p: {
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+      Info.memVT = getValueType(I.getType());
+    else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
+      Info.memVT = getPointerTy();
+    else
+      Info.memVT = getValueType(I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+
+    // alignment is available as metadata.
+    // Grab it and set the alignment.
+    assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+    MDNode *AlignMD = I.getMetadata("align");
+    assert(AlignMD && "Must have a non-null MDNode");
+    assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+    Value *Align = AlignMD->getOperand(0);
+    int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+    Info.align = Alignment;
+
+    return true;
+  }
+
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
+  case Intrinsic::nvvm_tex_1d_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_v4f32_s32:
+  case Intrinsic::nvvm_tex_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_v4f32_s32:
+  case Intrinsic::nvvm_tex_3d_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
+    Info.opc = getOpcForTextureInstr(Intrinsic);
+    Info.memVT = MVT::v4f32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_tex_1d_v4s32_s32:
+  case Intrinsic::nvvm_tex_1d_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_v4s32_s32:
+  case Intrinsic::nvvm_tex_2d_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_v4s32_s32:
+  case Intrinsic::nvvm_tex_3d_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_v4u32_s32:
+  case Intrinsic::nvvm_tex_1d_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_v4u32_s32:
+  case Intrinsic::nvvm_tex_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_v4u32_s32:
+  case Intrinsic::nvvm_tex_3d_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
+    Info.opc = getOpcForTextureInstr(Intrinsic);
+    Info.memVT = MVT::v4i32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i8_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+  case Intrinsic::nvvm_suld_2d_i8_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+  case Intrinsic::nvvm_suld_3d_i8_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_1d_i8_trap:
+  case Intrinsic::nvvm_suld_1d_v2i8_trap:
+  case Intrinsic::nvvm_suld_1d_v4i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+  case Intrinsic::nvvm_suld_2d_i8_trap:
+  case Intrinsic::nvvm_suld_2d_v2i8_trap:
+  case Intrinsic::nvvm_suld_2d_v4i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+  case Intrinsic::nvvm_suld_3d_i8_trap:
+  case Intrinsic::nvvm_suld_3d_v2i8_trap:
+  case Intrinsic::nvvm_suld_3d_v4i8_trap:
+  case Intrinsic::nvvm_suld_1d_i8_zero:
+  case Intrinsic::nvvm_suld_1d_v2i8_zero:
+  case Intrinsic::nvvm_suld_1d_v4i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+  case Intrinsic::nvvm_suld_2d_i8_zero:
+  case Intrinsic::nvvm_suld_2d_v2i8_zero:
+  case Intrinsic::nvvm_suld_2d_v4i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+  case Intrinsic::nvvm_suld_3d_i8_zero:
+  case Intrinsic::nvvm_suld_3d_v2i8_zero:
+  case Intrinsic::nvvm_suld_3d_v4i8_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i8;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i16_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+  case Intrinsic::nvvm_suld_2d_i16_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+  case Intrinsic::nvvm_suld_3d_i16_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_1d_i16_trap:
+  case Intrinsic::nvvm_suld_1d_v2i16_trap:
+  case Intrinsic::nvvm_suld_1d_v4i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+  case Intrinsic::nvvm_suld_2d_i16_trap:
+  case Intrinsic::nvvm_suld_2d_v2i16_trap:
+  case Intrinsic::nvvm_suld_2d_v4i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+  case Intrinsic::nvvm_suld_3d_i16_trap:
+  case Intrinsic::nvvm_suld_3d_v2i16_trap:
+  case Intrinsic::nvvm_suld_3d_v4i16_trap:
+  case Intrinsic::nvvm_suld_1d_i16_zero:
+  case Intrinsic::nvvm_suld_1d_v2i16_zero:
+  case Intrinsic::nvvm_suld_1d_v4i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+  case Intrinsic::nvvm_suld_2d_i16_zero:
+  case Intrinsic::nvvm_suld_2d_v2i16_zero:
+  case Intrinsic::nvvm_suld_2d_v4i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+  case Intrinsic::nvvm_suld_3d_i16_zero:
+  case Intrinsic::nvvm_suld_3d_v2i16_zero:
+  case Intrinsic::nvvm_suld_3d_v4i16_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i16;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i32_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+  case Intrinsic::nvvm_suld_2d_i32_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+  case Intrinsic::nvvm_suld_3d_i32_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_1d_i32_trap:
+  case Intrinsic::nvvm_suld_1d_v2i32_trap:
+  case Intrinsic::nvvm_suld_1d_v4i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+  case Intrinsic::nvvm_suld_2d_i32_trap:
+  case Intrinsic::nvvm_suld_2d_v2i32_trap:
+  case Intrinsic::nvvm_suld_2d_v4i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+  case Intrinsic::nvvm_suld_3d_i32_trap:
+  case Intrinsic::nvvm_suld_3d_v2i32_trap:
+  case Intrinsic::nvvm_suld_3d_v4i32_trap:
+  case Intrinsic::nvvm_suld_1d_i32_zero:
+  case Intrinsic::nvvm_suld_1d_v2i32_zero:
+  case Intrinsic::nvvm_suld_1d_v4i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+  case Intrinsic::nvvm_suld_2d_i32_zero:
+  case Intrinsic::nvvm_suld_2d_v2i32_zero:
+  case Intrinsic::nvvm_suld_2d_v4i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+  case Intrinsic::nvvm_suld_3d_i32_zero:
+  case Intrinsic::nvvm_suld_3d_v2i32_zero:
+  case Intrinsic::nvvm_suld_3d_v4i32_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i64_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+  case Intrinsic::nvvm_suld_2d_i64_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+  case Intrinsic::nvvm_suld_3d_i64_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_1d_i64_trap:
+  case Intrinsic::nvvm_suld_1d_v2i64_trap:
+  case Intrinsic::nvvm_suld_1d_array_i64_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+  case Intrinsic::nvvm_suld_2d_i64_trap:
+  case Intrinsic::nvvm_suld_2d_v2i64_trap:
+  case Intrinsic::nvvm_suld_2d_array_i64_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+  case Intrinsic::nvvm_suld_3d_i64_trap:
+  case Intrinsic::nvvm_suld_3d_v2i64_trap:
+  case Intrinsic::nvvm_suld_1d_i64_zero:
+  case Intrinsic::nvvm_suld_1d_v2i64_zero:
+  case Intrinsic::nvvm_suld_1d_array_i64_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+  case Intrinsic::nvvm_suld_2d_i64_zero:
+  case Intrinsic::nvvm_suld_2d_v2i64_zero:
+  case Intrinsic::nvvm_suld_2d_array_i64_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+  case Intrinsic::nvvm_suld_3d_i64_zero:
+  case Intrinsic::nvvm_suld_3d_v2i64_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i64;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  }
+  return false;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+/// Used to guide target specific optimizations, like loop strength reduction
+/// (LoopStrengthReduce.cpp) and memory optimization for address mode
+/// (CodeGenPrepare.cpp)
+bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                Type *Ty) const {
+
+  // AddrMode - This represents an addressing mode of:
+  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+  //
+  // The legal address modes are
+  // - [avar]
+  // - [areg]
+  // - [areg+immoff]
+  // - [immAddr]
+
+  if (AM.BaseGV) {
+    if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
+      return false;
+    return true;
+  }
+
+  switch (AM.Scale) {
+  case 0: // "r", "r+i" or "i" is allowed
+    break;
+  case 1:
+    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
+      return false;
+    // Otherwise we have r+i.
+    break;
+  default:
+    // No scale > 1 is allowed
+    return false;
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                         NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+NVPTXTargetLowering::ConstraintType
+NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'b':
+    case 'r':
+    case 'h':
+    case 'c':
+    case 'l':
+    case 'f':
+    case 'd':
+    case '0':
+    case 'N':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                  MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'b':
+      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
+    case 'c':
+      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+    case 'h':
+      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+    case 'r':
+      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
+    case 'l':
+    case 'N':
+      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+    case 'f':
+      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
+    case 'd':
+      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
+  return 4;
+}
+
+//===----------------------------------------------------------------------===//
+//                         NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+                                   CodeGenOpt::Level OptLevel) const {
+  const Function *F = MF.getFunction();
+  const TargetOptions &TO = MF.getTarget().Options;
+
+  // Always honor command-line argument
+  if (FMAContractLevelOpt.getNumOccurrences() > 0) {
+    return FMAContractLevelOpt > 0;
+  } else if (OptLevel == 0) {
+    // Do not contract if we're not optimizing the code
+    return false;
+  } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
+    // Honor TargetOptions flags that explicitly say fusion is okay
+    return true;
+  } else if (F->hasFnAttribute("unsafe-fp-math")) {
+    // Check for unsafe-fp-math=true coming from Clang
+    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+    StringRef Val = Attr.getValueAsString();
+    if (Val == "true")
+      return true;
+  }
+
+  // We did not have a clear indication that fusion is allowed, so assume not
+  return false;
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1.  This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                             const NVPTXSubtarget &Subtarget,
+                                             CodeGenOpt::Level OptLevel) {
+  SelectionDAG  &DAG = DCI.DAG;
+  // Skip non-integer, non-scalar case
+  EVT VT=N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  // fold (add (mul a, b), c) -> (mad a, b, c)
+  //
+  if (N0.getOpcode() == ISD::MUL) {
+    assert (VT.isInteger());
+    // For integer:
+    // Since integer multiply-add costs the same as integer multiply
+    // but is more costly than integer add, do the fusion only when
+    // the mul is only used in the add.
+    if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
+        !N0.getNode()->hasOneUse())
+      return SDValue();
+
+    // Do the folding
+    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+                       N0.getOperand(0), N0.getOperand(1), N1);
+  }
+  else if (N0.getOpcode() == ISD::FMUL) {
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      NVPTXTargetLowering *TLI =
+        (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo();
+      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+        return SDValue();
+
+      // For floating point:
+      // Do the fusion only when the mul has less than 5 uses and all
+      // are add.
+      // The heuristic is that if a use is not an add, then that use
+      // cannot be fused into fma, therefore mul is still needed anyway.
+      // If there are more than 4 uses, even if they are all add, fusing
+      // them will increase register pressue.
+      //
+      int numUses = 0;
+      int nonAddCount = 0;
+      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+           UE = N0.getNode()->use_end();
+           UI != UE; ++UI) {
+        numUses++;
+        SDNode *User = *UI;
+        if (User->getOpcode() != ISD::FADD)
+          ++nonAddCount;
+      }
+      if (numUses >= 5)
+        return SDValue();
+      if (nonAddCount) {
+        int orderNo = N->getIROrder();
+        int orderNo2 = N0.getNode()->getIROrder();
+        // simple heuristics here for considering potential register
+        // pressure, the logics here is that the differnce are used
+        // to measure the distance between def and use, the longer distance
+        // more likely cause register pressure.
+        if (orderNo - orderNo2 < 500)
+          return SDValue();
+
+        // Now, check if at least one of the FMUL's operands is live beyond the node N,
+        // which guarantees that the FMA will not increase register pressure at node N.
+        bool opIsLive = false;
+        const SDNode *left = N0.getOperand(0).getNode();
+        const SDNode *right = N0.getOperand(1).getNode();
+
+        if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right))
+          opIsLive = true;
+
+        if (!opIsLive)
+          for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
+            SDNode *User = *UI;
+            int orderNo3 = User->getIROrder();
+            if (orderNo3 > orderNo) {
+              opIsLive = true;
+              break;
+            }
+          }
+
+        if (!opIsLive)
+          for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
+            SDNode *User = *UI;
+            int orderNo3 = User->getIROrder();
+            if (orderNo3 > orderNo) {
+              opIsLive = true;
+              break;
+            }
+          }
+
+        if (!opIsLive)
+          return SDValue();
+      }
+
+      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1), N1);
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const NVPTXSubtarget &Subtarget,
+                                 CodeGenOpt::Level OptLevel) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // First try with the default operand order.
+  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
+                                                 OptLevel);
+  if (Result.getNode())
+    return Result;
+
+  // If that didn't work, try again with the operands commuted.
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // The type legalizer turns a vector load of i8 values into a zextload to i16
+  // registers, optionally ANY_EXTENDs it (if target type is integer),
+  // and ANDs off the high 8 bits. Since we turn this load into a
+  // target-specific DAG node, the DAG combiner fails to eliminate these AND
+  // nodes. Do that here.
+  SDValue Val = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+
+  if (isa<ConstantSDNode>(Val)) {
+    std::swap(Val, Mask);
+  }
+
+  SDValue AExt;
+  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+  if (Val.getOpcode() == ISD::ANY_EXTEND) {
+    AExt = Val;
+    Val = Val->getOperand(0);
+  }
+
+  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+    Val = Val->getOperand(0);
+  }
+
+  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+      Val->getOpcode() == NVPTXISD::LoadV4) {
+    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+    if (!MaskCnst) {
+      // Not an AND with a constant
+      return SDValue();
+    }
+
+    uint64_t MaskVal = MaskCnst->getZExtValue();
+    if (MaskVal != 0xff) {
+      // Not an AND that chops off top 8 bits
+      return SDValue();
+    }
+
+    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+    if (!Mem) {
+      // Not a MemSDNode?!?
+      return SDValue();
+    }
+
+    EVT MemVT = Mem->getMemoryVT();
+    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+      // We only handle the i8 case
+      return SDValue();
+    }
+
+    unsigned ExtType =
+      cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+        getZExtValue();
+    if (ExtType == ISD::SEXTLOAD) {
+      // If for some reason the load is a sextload, the and is needed to zero
+      // out the high 8 bits
+      return SDValue();
+    }
+
+    bool AddTo = false;
+    if (AExt.getNode() != 0) {
+      // Re-insert the ext as a zext.
+      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+                            AExt.getValueType(), Val);
+      AddTo = true;
+    }
+
+    // If we get here, the AND is unnecessary.  Just replace it with the load
+    DCI.CombineTo(N, Val, AddTo);
+  }
+
+  return SDValue();
+}
+
+enum OperandSignedness {
+  Signed = 0,
+  Unsigned,
+  Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+                                      unsigned OptSize,
+                                      OperandSignedness &S) {
+  S = Unknown;
+
+  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() == OptSize) {
+      S = Signed;
+      return true;
+    }
+  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() == OptSize) {
+      S = Unsigned;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+                                        unsigned OptSize,
+                                        bool &IsSigned) {
+
+  OperandSignedness LHSSign;
+
+  // The LHS operand must be a demotable op
+  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+    return false;
+
+  // We should have been able to determine the signedness from the LHS
+  if (LHSSign == Unknown)
+    return false;
+
+  IsSigned = (LHSSign == Signed);
+
+  // The RHS can be a demotable op or a constant
+  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+    APInt Val = CI->getAPIntValue();
+    if (LHSSign == Unsigned) {
+      if (Val.isIntN(OptSize)) {
+        return true;
+      }
+      return false;
+    } else {
+      if (Val.isSignedIntN(OptSize)) {
+        return true;
+      }
+      return false;
+    }
+  } else {
+    OperandSignedness RHSSign;
+    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+      return false;
+
+    if (LHSSign != RHSSign)
+      return false;
+
+    return true;
+  }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT MulType = N->getValueType(0);
+  if (MulType != MVT::i32 && MulType != MVT::i64) {
+    return SDValue();
+  }
+
+  unsigned OptSize = MulType.getSizeInBits() >> 1;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Canonicalize the multiply so the constant (if any) is on the right
+  if (N->getOpcode() == ISD::MUL) {
+    if (isa<ConstantSDNode>(LHS)) {
+      std::swap(LHS, RHS);
+    }
+  }
+
+  // If we have a SHL, determine the actual multiply amount
+  if (N->getOpcode() == ISD::SHL) {
+    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+    if (!ShlRHS) {
+      return SDValue();
+    }
+
+    APInt ShiftAmt = ShlRHS->getAPIntValue();
+    unsigned BitWidth = MulType.getSizeInBits();
+    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+      RHS = DCI.DAG.getConstant(MulVal, MulType);
+    } else {
+      return SDValue();
+    }
+  }
+
+  bool Signed;
+  // Verify that our operands are demotable
+  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+    return SDValue();
+  }
+
+  EVT DemotedVT;
+  if (MulType == MVT::i32) {
+    DemotedVT = MVT::i16;
+  } else {
+    DemotedVT = MVT::i32;
+  }
+
+  // Truncate the operands to the correct size. Note that these are just for
+  // type consistency and will (likely) be eliminated in later phases.
+  SDValue TruncLHS =
+    DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
+  SDValue TruncRHS =
+    DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
+
+  unsigned Opc;
+  if (Signed) {
+    Opc = NVPTXISD::MUL_WIDE_SIGNED;
+  } else {
+    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+  }
+
+  return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
+}
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0
+    SDValue Ret = TryMULWIDECombine(N, DCI);
+    if (Ret.getNode())
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0
+    SDValue Ret = TryMULWIDECombine(N, DCI);
+    if (Ret.getNode())
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+  switch (N->getOpcode()) {
+    default: break;
+    case ISD::ADD:
+    case ISD::FADD:
+      return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
+    case ISD::MUL:
+      return PerformMULCombine(N, DCI, OptLevel);
+    case ISD::SHL:
+      return PerformSHLCombine(N, DCI, OptLevel);
+    case ISD::AND:
+      return PerformANDCombine(N, DCI);
+  }
+  return SDValue();
+}
+
+/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
+static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              const DataLayout *TD,
+                              SmallVectorImpl<SDValue> &Results) {
+  EVT ResVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  assert(ResVT.isVector() && "Vector load must have vector type");
+
+  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+  // legal.  We can (and should) split that into 2 loads of <2 x double> here
+  // but I'm leaving that as a TODO for now.
+  assert(ResVT.isSimple() && "Can only handle simple types");
+  switch (ResVT.getSimpleVT().SimpleTy) {
+  default:
+    return;
+  case MVT::v2i8:
+  case MVT::v2i16:
+  case MVT::v2i32:
+  case MVT::v2i64:
+  case MVT::v2f32:
+  case MVT::v2f64:
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    // This is a "native" vector type
+    break;
+  }
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  unsigned PrefAlign =
+    TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return;
+  }
+
+  EVT EltVT = ResVT.getVectorElementType();
+  unsigned NumElts = ResVT.getVectorNumElements();
+
+  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // loaded type to i16 and propagate the "real" type as the memory type.
+  bool NeedTrunc = false;
+  if (EltVT.getSizeInBits() < 16) {
+    EltVT = MVT::i16;
+    NeedTrunc = true;
+  }
+
+  unsigned Opcode = 0;
+  SDVTList LdResVTs;
+
+  switch (NumElts) {
+  default:
+    return;
+  case 2:
+    Opcode = NVPTXISD::LoadV2;
+    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+    break;
+  case 4: {
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+    LdResVTs = DAG.getVTList(ListVTs);
+    break;
+  }
+  }
+
+  SmallVector<SDValue, 8> OtherOps;
+
+  // Copy regular operands
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    OtherOps.push_back(N->getOperand(i));
+
+  // The select routine does not have access to the LoadSDNode instance, so
+  // pass along the extension information
+  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
+
+  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                          LD->getMemoryVT(),
+                                          LD->getMemOperand());
+
+  SmallVector<SDValue, 4> ScalarRes;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Res = NewLD.getValue(i);
+    if (NeedTrunc)
+      Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+    ScalarRes.push_back(Res);
+  }
+
+  SDValue LoadChain = NewLD.getValue(NumElts);
+
+  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+
+  Results.push_back(BuildVec);
+  Results.push_back(LoadChain);
+}
+
+static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &Results) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Intrin = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Get the intrinsic ID
+  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+  switch (IntrinNo) {
+  default:
+    return;
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p: {
+    EVT ResVT = N->getValueType(0);
+
+    if (ResVT.isVector()) {
+      // Vector LDG/LDU
+
+      unsigned NumElts = ResVT.getVectorNumElements();
+      EVT EltVT = ResVT.getVectorElementType();
+
+      // Since LDU/LDG are target nodes, we cannot rely on DAG type
+      // legalization.
+      // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+      // loaded type to i16 and propagate the "real" type as the memory type.
+      bool NeedTrunc = false;
+      if (EltVT.getSizeInBits() < 16) {
+        EltVT = MVT::i16;
+        NeedTrunc = true;
+      }
+
+      unsigned Opcode = 0;
+      SDVTList LdResVTs;
+
+      switch (NumElts) {
+      default:
+        return;
+      case 2:
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV2;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV2;
+          break;
+        }
+        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+        break;
+      case 4: {
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV4;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV4;
+          break;
+        }
+        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+        LdResVTs = DAG.getVTList(ListVTs);
+        break;
+      }
+      }
+
+      SmallVector<SDValue, 8> OtherOps;
+
+      // Copy regular operands
+
+      OtherOps.push_back(Chain); // Chain
+                                 // Skip operand 1 (intrinsic ID)
+      // Others
+      for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i)
+        OtherOps.push_back(N->getOperand(i));
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                              MemSD->getMemoryVT(),
+                                              MemSD->getMemOperand());
+
+      SmallVector<SDValue, 4> ScalarRes;
+
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Res = NewLD.getValue(i);
+        if (NeedTrunc)
+          Res =
+              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+        ScalarRes.push_back(Res);
+      }
+
+      SDValue LoadChain = NewLD.getValue(NumElts);
+
+      SDValue BuildVec =
+          DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+
+      Results.push_back(BuildVec);
+      Results.push_back(LoadChain);
+    } else {
+      // i8 LDG/LDU
+      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+             "Custom handling of non-i8 ldu/ldg?");
+
+      // Just copy all operands as-is
+      SmallVector<SDValue, 4> Ops;
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        Ops.push_back(N->getOperand(i));
+
+      // Force output to i16
+      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      // We make sure the memory type is i8, which will be used during isel
+      // to select the proper instruction.
+      SDValue NewLD =
+          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+                                  MVT::i8, MemSD->getMemOperand());
+
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                    NewLD.getValue(0)));
+      Results.push_back(NewLD.getValue(1));
+    }
+  }
+  }
+}
+
+void NVPTXTargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    report_fatal_error("Unhandled custom legalization");
+  case ISD::LOAD:
+    ReplaceLoadVector(N, DAG, getDataLayout(), Results);
+    return;
+  case ISD::INTRINSIC_W_CHAIN:
+    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+    return;
+  }
+}
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+  delete TextSection;
+  delete DataSection;
+  delete BSSSection;
+  delete ReadOnlySection;
+
+  delete StaticCtorSection;
+  delete StaticDtorSection;
+  delete LSDASection;
+  delete EHFrameSection;
+  delete DwarfAbbrevSection;
+  delete DwarfInfoSection;
+  delete DwarfLineSection;
+  delete DwarfFrameSection;
+  delete DwarfPubTypesSection;
+  delete DwarfDebugInlineSection;
+  delete DwarfStrSection;
+  delete DwarfLocSection;
+  delete DwarfARangesSection;
+  delete DwarfRangesSection;
+  delete DwarfMacroInfoSection;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
new file mode 100644
index 0000000..bef6ed9
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -0,0 +1,541 @@
+//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXISELLOWERING_H
+#define NVPTXISELLOWERING_H
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace NVPTXISD {
+enum NodeType {
+  // Start the numbering from where ISD NodeType finishes.
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  Wrapper,
+  CALL,
+  RET_FLAG,
+  LOAD_PARAM,
+  DeclareParam,
+  DeclareScalarParam,
+  DeclareRetParam,
+  DeclareRet,
+  DeclareScalarRet,
+  PrintCall,
+  PrintCallUni,
+  CallArgBegin,
+  CallArg,
+  LastCallArg,
+  CallArgEnd,
+  CallVoid,
+  CallVal,
+  CallSymbol,
+  Prototype,
+  MoveParam,
+  PseudoUseParam,
+  RETURN,
+  CallSeqBegin,
+  CallSeqEnd,
+  CallPrototype,
+  FUN_SHFL_CLAMP,
+  FUN_SHFR_CLAMP,
+  MUL_WIDE_SIGNED,
+  MUL_WIDE_UNSIGNED,
+  IMAD,
+  Dummy,
+
+  LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LoadV4,
+  LDGV2, // LDG.v2
+  LDGV4, // LDG.v4
+  LDUV2, // LDU.v2
+  LDUV4, // LDU.v4
+  StoreV2,
+  StoreV4,
+  LoadParam,
+  LoadParamV2,
+  LoadParamV4,
+  StoreParam,
+  StoreParamV2,
+  StoreParamV4,
+  StoreParamS32, // to sext and store a <32bit value, not used currently
+  StoreParamU32, // to zext and store a <32bit value, not used currently 
+  StoreRetval,
+  StoreRetvalV2,
+  StoreRetvalV4,
+
+  // Texture intrinsics
+  Tex1DFloatS32,
+  Tex1DFloatFloat,
+  Tex1DFloatFloatLevel,
+  Tex1DFloatFloatGrad,
+  Tex1DS32S32,
+  Tex1DS32Float,
+  Tex1DS32FloatLevel,
+  Tex1DS32FloatGrad,
+  Tex1DU32S32,
+  Tex1DU32Float,
+  Tex1DU32FloatLevel,
+  Tex1DU32FloatGrad,
+  Tex1DArrayFloatS32,
+  Tex1DArrayFloatFloat,
+  Tex1DArrayFloatFloatLevel,
+  Tex1DArrayFloatFloatGrad,
+  Tex1DArrayS32S32,
+  Tex1DArrayS32Float,
+  Tex1DArrayS32FloatLevel,
+  Tex1DArrayS32FloatGrad,
+  Tex1DArrayU32S32,
+  Tex1DArrayU32Float,
+  Tex1DArrayU32FloatLevel,
+  Tex1DArrayU32FloatGrad,
+  Tex2DFloatS32,
+  Tex2DFloatFloat,
+  Tex2DFloatFloatLevel,
+  Tex2DFloatFloatGrad,
+  Tex2DS32S32,
+  Tex2DS32Float,
+  Tex2DS32FloatLevel,
+  Tex2DS32FloatGrad,
+  Tex2DU32S32,
+  Tex2DU32Float,
+  Tex2DU32FloatLevel,
+  Tex2DU32FloatGrad,
+  Tex2DArrayFloatS32,
+  Tex2DArrayFloatFloat,
+  Tex2DArrayFloatFloatLevel,
+  Tex2DArrayFloatFloatGrad,
+  Tex2DArrayS32S32,
+  Tex2DArrayS32Float,
+  Tex2DArrayS32FloatLevel,
+  Tex2DArrayS32FloatGrad,
+  Tex2DArrayU32S32,
+  Tex2DArrayU32Float,
+  Tex2DArrayU32FloatLevel,
+  Tex2DArrayU32FloatGrad,
+  Tex3DFloatS32,
+  Tex3DFloatFloat,
+  Tex3DFloatFloatLevel,
+  Tex3DFloatFloatGrad,
+  Tex3DS32S32,
+  Tex3DS32Float,
+  Tex3DS32FloatLevel,
+  Tex3DS32FloatGrad,
+  Tex3DU32S32,
+  Tex3DU32Float,
+  Tex3DU32FloatLevel,
+  Tex3DU32FloatGrad,
+  TexCubeFloatFloat,
+  TexCubeFloatFloatLevel,
+  TexCubeS32Float,
+  TexCubeS32FloatLevel,
+  TexCubeU32Float,
+  TexCubeU32FloatLevel,
+  TexCubeArrayFloatFloat,
+  TexCubeArrayFloatFloatLevel,
+  TexCubeArrayS32Float,
+  TexCubeArrayS32FloatLevel,
+  TexCubeArrayU32Float,
+  TexCubeArrayU32FloatLevel,
+  Tld4R2DFloatFloat,
+  Tld4G2DFloatFloat,
+  Tld4B2DFloatFloat,
+  Tld4A2DFloatFloat,
+  Tld4R2DS64Float,
+  Tld4G2DS64Float,
+  Tld4B2DS64Float,
+  Tld4A2DS64Float,
+  Tld4R2DU64Float,
+  Tld4G2DU64Float,
+  Tld4B2DU64Float,
+  Tld4A2DU64Float,
+  TexUnified1DFloatS32,
+  TexUnified1DFloatFloat,
+  TexUnified1DFloatFloatLevel,
+  TexUnified1DFloatFloatGrad,
+  TexUnified1DS32S32,
+  TexUnified1DS32Float,
+  TexUnified1DS32FloatLevel,
+  TexUnified1DS32FloatGrad,
+  TexUnified1DU32S32,
+  TexUnified1DU32Float,
+  TexUnified1DU32FloatLevel,
+  TexUnified1DU32FloatGrad,
+  TexUnified1DArrayFloatS32,
+  TexUnified1DArrayFloatFloat,
+  TexUnified1DArrayFloatFloatLevel,
+  TexUnified1DArrayFloatFloatGrad,
+  TexUnified1DArrayS32S32,
+  TexUnified1DArrayS32Float,
+  TexUnified1DArrayS32FloatLevel,
+  TexUnified1DArrayS32FloatGrad,
+  TexUnified1DArrayU32S32,
+  TexUnified1DArrayU32Float,
+  TexUnified1DArrayU32FloatLevel,
+  TexUnified1DArrayU32FloatGrad,
+  TexUnified2DFloatS32,
+  TexUnified2DFloatFloat,
+  TexUnified2DFloatFloatLevel,
+  TexUnified2DFloatFloatGrad,
+  TexUnified2DS32S32,
+  TexUnified2DS32Float,
+  TexUnified2DS32FloatLevel,
+  TexUnified2DS32FloatGrad,
+  TexUnified2DU32S32,
+  TexUnified2DU32Float,
+  TexUnified2DU32FloatLevel,
+  TexUnified2DU32FloatGrad,
+  TexUnified2DArrayFloatS32,
+  TexUnified2DArrayFloatFloat,
+  TexUnified2DArrayFloatFloatLevel,
+  TexUnified2DArrayFloatFloatGrad,
+  TexUnified2DArrayS32S32,
+  TexUnified2DArrayS32Float,
+  TexUnified2DArrayS32FloatLevel,
+  TexUnified2DArrayS32FloatGrad,
+  TexUnified2DArrayU32S32,
+  TexUnified2DArrayU32Float,
+  TexUnified2DArrayU32FloatLevel,
+  TexUnified2DArrayU32FloatGrad,
+  TexUnified3DFloatS32,
+  TexUnified3DFloatFloat,
+  TexUnified3DFloatFloatLevel,
+  TexUnified3DFloatFloatGrad,
+  TexUnified3DS32S32,
+  TexUnified3DS32Float,
+  TexUnified3DS32FloatLevel,
+  TexUnified3DS32FloatGrad,
+  TexUnified3DU32S32,
+  TexUnified3DU32Float,
+  TexUnified3DU32FloatLevel,
+  TexUnified3DU32FloatGrad,
+  TexUnifiedCubeFloatFloat,
+  TexUnifiedCubeFloatFloatLevel,
+  TexUnifiedCubeS32Float,
+  TexUnifiedCubeS32FloatLevel,
+  TexUnifiedCubeU32Float,
+  TexUnifiedCubeU32FloatLevel,
+  TexUnifiedCubeArrayFloatFloat,
+  TexUnifiedCubeArrayFloatFloatLevel,
+  TexUnifiedCubeArrayS32Float,
+  TexUnifiedCubeArrayS32FloatLevel,
+  TexUnifiedCubeArrayU32Float,
+  TexUnifiedCubeArrayU32FloatLevel,
+  Tld4UnifiedR2DFloatFloat,
+  Tld4UnifiedG2DFloatFloat,
+  Tld4UnifiedB2DFloatFloat,
+  Tld4UnifiedA2DFloatFloat,
+  Tld4UnifiedR2DS64Float,
+  Tld4UnifiedG2DS64Float,
+  Tld4UnifiedB2DS64Float,
+  Tld4UnifiedA2DS64Float,
+  Tld4UnifiedR2DU64Float,
+  Tld4UnifiedG2DU64Float,
+  Tld4UnifiedB2DU64Float,
+  Tld4UnifiedA2DU64Float,
+
+  // Surface intrinsics
+  Suld1DI8Clamp,
+  Suld1DI16Clamp,
+  Suld1DI32Clamp,
+  Suld1DI64Clamp,
+  Suld1DV2I8Clamp,
+  Suld1DV2I16Clamp,
+  Suld1DV2I32Clamp,
+  Suld1DV2I64Clamp,
+  Suld1DV4I8Clamp,
+  Suld1DV4I16Clamp,
+  Suld1DV4I32Clamp,
+
+  Suld1DArrayI8Clamp,
+  Suld1DArrayI16Clamp,
+  Suld1DArrayI32Clamp,
+  Suld1DArrayI64Clamp,
+  Suld1DArrayV2I8Clamp,
+  Suld1DArrayV2I16Clamp,
+  Suld1DArrayV2I32Clamp,
+  Suld1DArrayV2I64Clamp,
+  Suld1DArrayV4I8Clamp,
+  Suld1DArrayV4I16Clamp,
+  Suld1DArrayV4I32Clamp,
+
+  Suld2DI8Clamp,
+  Suld2DI16Clamp,
+  Suld2DI32Clamp,
+  Suld2DI64Clamp,
+  Suld2DV2I8Clamp,
+  Suld2DV2I16Clamp,
+  Suld2DV2I32Clamp,
+  Suld2DV2I64Clamp,
+  Suld2DV4I8Clamp,
+  Suld2DV4I16Clamp,
+  Suld2DV4I32Clamp,
+
+  Suld2DArrayI8Clamp,
+  Suld2DArrayI16Clamp,
+  Suld2DArrayI32Clamp,
+  Suld2DArrayI64Clamp,
+  Suld2DArrayV2I8Clamp,
+  Suld2DArrayV2I16Clamp,
+  Suld2DArrayV2I32Clamp,
+  Suld2DArrayV2I64Clamp,
+  Suld2DArrayV4I8Clamp,
+  Suld2DArrayV4I16Clamp,
+  Suld2DArrayV4I32Clamp,
+
+  Suld3DI8Clamp,
+  Suld3DI16Clamp,
+  Suld3DI32Clamp,
+  Suld3DI64Clamp,
+  Suld3DV2I8Clamp,
+  Suld3DV2I16Clamp,
+  Suld3DV2I32Clamp,
+  Suld3DV2I64Clamp,
+  Suld3DV4I8Clamp,
+  Suld3DV4I16Clamp,
+  Suld3DV4I32Clamp,
+
+  Suld1DI8Trap,
+  Suld1DI16Trap,
+  Suld1DI32Trap,
+  Suld1DI64Trap,
+  Suld1DV2I8Trap,
+  Suld1DV2I16Trap,
+  Suld1DV2I32Trap,
+  Suld1DV2I64Trap,
+  Suld1DV4I8Trap,
+  Suld1DV4I16Trap,
+  Suld1DV4I32Trap,
+
+  Suld1DArrayI8Trap,
+  Suld1DArrayI16Trap,
+  Suld1DArrayI32Trap,
+  Suld1DArrayI64Trap,
+  Suld1DArrayV2I8Trap,
+  Suld1DArrayV2I16Trap,
+  Suld1DArrayV2I32Trap,
+  Suld1DArrayV2I64Trap,
+  Suld1DArrayV4I8Trap,
+  Suld1DArrayV4I16Trap,
+  Suld1DArrayV4I32Trap,
+
+  Suld2DI8Trap,
+  Suld2DI16Trap,
+  Suld2DI32Trap,
+  Suld2DI64Trap,
+  Suld2DV2I8Trap,
+  Suld2DV2I16Trap,
+  Suld2DV2I32Trap,
+  Suld2DV2I64Trap,
+  Suld2DV4I8Trap,
+  Suld2DV4I16Trap,
+  Suld2DV4I32Trap,
+
+  Suld2DArrayI8Trap,
+  Suld2DArrayI16Trap,
+  Suld2DArrayI32Trap,
+  Suld2DArrayI64Trap,
+  Suld2DArrayV2I8Trap,
+  Suld2DArrayV2I16Trap,
+  Suld2DArrayV2I32Trap,
+  Suld2DArrayV2I64Trap,
+  Suld2DArrayV4I8Trap,
+  Suld2DArrayV4I16Trap,
+  Suld2DArrayV4I32Trap,
+
+  Suld3DI8Trap,
+  Suld3DI16Trap,
+  Suld3DI32Trap,
+  Suld3DI64Trap,
+  Suld3DV2I8Trap,
+  Suld3DV2I16Trap,
+  Suld3DV2I32Trap,
+  Suld3DV2I64Trap,
+  Suld3DV4I8Trap,
+  Suld3DV4I16Trap,
+  Suld3DV4I32Trap,
+
+  Suld1DI8Zero,
+  Suld1DI16Zero,
+  Suld1DI32Zero,
+  Suld1DI64Zero,
+  Suld1DV2I8Zero,
+  Suld1DV2I16Zero,
+  Suld1DV2I32Zero,
+  Suld1DV2I64Zero,
+  Suld1DV4I8Zero,
+  Suld1DV4I16Zero,
+  Suld1DV4I32Zero,
+
+  Suld1DArrayI8Zero,
+  Suld1DArrayI16Zero,
+  Suld1DArrayI32Zero,
+  Suld1DArrayI64Zero,
+  Suld1DArrayV2I8Zero,
+  Suld1DArrayV2I16Zero,
+  Suld1DArrayV2I32Zero,
+  Suld1DArrayV2I64Zero,
+  Suld1DArrayV4I8Zero,
+  Suld1DArrayV4I16Zero,
+  Suld1DArrayV4I32Zero,
+
+  Suld2DI8Zero,
+  Suld2DI16Zero,
+  Suld2DI32Zero,
+  Suld2DI64Zero,
+  Suld2DV2I8Zero,
+  Suld2DV2I16Zero,
+  Suld2DV2I32Zero,
+  Suld2DV2I64Zero,
+  Suld2DV4I8Zero,
+  Suld2DV4I16Zero,
+  Suld2DV4I32Zero,
+
+  Suld2DArrayI8Zero,
+  Suld2DArrayI16Zero,
+  Suld2DArrayI32Zero,
+  Suld2DArrayI64Zero,
+  Suld2DArrayV2I8Zero,
+  Suld2DArrayV2I16Zero,
+  Suld2DArrayV2I32Zero,
+  Suld2DArrayV2I64Zero,
+  Suld2DArrayV4I8Zero,
+  Suld2DArrayV4I16Zero,
+  Suld2DArrayV4I32Zero,
+
+  Suld3DI8Zero,
+  Suld3DI16Zero,
+  Suld3DI32Zero,
+  Suld3DI64Zero,
+  Suld3DV2I8Zero,
+  Suld3DV2I16Zero,
+  Suld3DV2I32Zero,
+  Suld3DV2I64Zero,
+  Suld3DV4I8Zero,
+  Suld3DV4I16Zero,
+  Suld3DV4I32Zero
+};
+}
+
+class NVPTXSubtarget;
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+class NVPTXTargetLowering : public TargetLowering {
+public:
+  explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
+                             SelectionDAG &DAG) const;
+
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  bool isTypeSupportedInIntrinsic(MVT VT) const;
+
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          unsigned Intrinsic) const override;
+
+  /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type
+  /// Used to guide target specific optimizations, like loop strength
+  /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+  /// address mode (CodeGenPrepare.cpp)
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+  /// getFunctionAlignment - Return the Log2 alignment of this function.
+  unsigned getFunctionAlignment(const Function *F) const;
+
+  EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override {
+    if (VT.isVector())
+      return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+    return MVT::i1;
+  }
+
+  ConstraintType
+  getConstraintType(const std::string &Constraint) const override;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const std::string &Constraint,
+                               MVT VT) const override;
+
+  SDValue LowerFormalArguments(
+      SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+      const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+      SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  std::string getPrototype(Type *, const ArgListTy &,
+                           const SmallVectorImpl<ISD::OutputArg> &,
+                           unsigned retAlignment,
+                           const ImmutableCallSite *CS) const;
+
+  SDValue
+  LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+              const SmallVectorImpl<ISD::OutputArg> &Outs,
+              const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
+              SelectionDAG &DAG) const override;
+
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  NVPTXTargetMachine *nvTM;
+
+  // PTX always uses 32-bit shift amounts
+  MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+  bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
+    return true;
+  }
+
+private:
+  const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
+
+  SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx,
+                     EVT = MVT::i32) const;
+  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+  SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
+
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
+                                Type *Ty, unsigned Idx) const;
+};
+} // namespace llvm
+
+#endif // NVPTXISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
new file mode 100644
index 0000000..a98fb37
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -0,0 +1,178 @@
+//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR-level optimizations of image access code,
+// including:
+//
+// 1. Eliminate istypep intrinsics when image access qualifier is known
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXImageOptimizer : public FunctionPass {
+private:
+  static char ID;
+  SmallVector<Instruction*, 4> InstrToDelete;
+
+public:
+  NVPTXImageOptimizer();
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool replaceIsTypePSampler(Instruction &I);
+  bool replaceIsTypePSurface(Instruction &I);
+  bool replaceIsTypePTexture(Instruction &I);
+  Value *cleanupValue(Value *V);
+  void replaceWith(Instruction *From, ConstantInt *To);
+};
+}
+
+char NVPTXImageOptimizer::ID = 0;
+
+NVPTXImageOptimizer::NVPTXImageOptimizer()
+  : FunctionPass(ID) {}
+
+bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+  bool Changed = false;
+  InstrToDelete.clear();
+
+  // Look for call instructions in the function
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
+       ++BI) {
+    for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      Instruction &Instr = *I;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        Function *CalledF = CI->getCalledFunction();
+        if (CalledF && CalledF->isIntrinsic()) {
+          // This is an intrinsic function call, check if its an istypep
+          switch (CalledF->getIntrinsicID()) {
+          default: break;
+          case Intrinsic::nvvm_istypep_sampler:
+            Changed |= replaceIsTypePSampler(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_surface:
+            Changed |= replaceIsTypePSurface(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_texture:
+            Changed |= replaceIsTypePTexture(Instr);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Delete any istypep instances we replaced in the IR
+  for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
+    InstrToDelete[i]->eraseFromParent();
+
+  return Changed;
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isSampler(*TexHandle)) {
+    // This is an OpenCL sampler, so it must be a samplerref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL image, so it cannot be a samplerref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadWrite(*TexHandle) ||
+      isImageWriteOnly(*TexHandle)) {
+    // This is an OpenCL read-only/read-write image, so it must be a surfref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageReadOnly(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-only/ imageor sampler, so it cannot be
+    // a surfref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL read-only image, so it must be a texref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-write/write-only image or a sampler, so it
+    // cannot be a texref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
+  // We implement "poor man's DCE" here to make sure any code that is no longer
+  // live is actually unreachable and can be trivially eliminated by the
+  // unreachable block elimination pass.
+  for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
+       UI != UE; ++UI) {
+    if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
+      if (BI->isUnconditional()) continue;
+      BasicBlock *Dest;
+      if (To->isZero())
+        // Get false block
+        Dest = BI->getSuccessor(1);
+      else
+        // Get true block
+        Dest = BI->getSuccessor(0);
+      BranchInst::Create(Dest, BI);
+      InstrToDelete.push_back(BI);
+    }
+  }
+  From->replaceAllUsesWith(To);
+  InstrToDelete.push_back(From);
+}
+
+Value *NVPTXImageOptimizer::cleanupValue(Value *V) {
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+    return cleanupValue(EVI->getAggregateOperand());
+  }
+  return V;
+}
+
+FunctionPass *llvm::createNVPTXImageOptimizerPass() {
+  return new NVPTXImageOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
new file mode 100644
index 0000000..ffcb5d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -0,0 +1,59 @@
+//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe NVPTX instructions format
+//
+//===----------------------------------------------------------------------===//
+
+// Vector instruction type enum
+class VecInstTypeEnum<bits<4> val> {
+  bits<4> Value=val;
+}
+def VecNOP : VecInstTypeEnum<0>;
+
+// Generic NVPTX Format
+
+class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  field bits<14> Inst;
+
+  let Namespace = "NVPTX";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+
+  // TSFlagFields
+  bits<4> VecInstType = VecNOP.Value;
+  bit IsSimpleMove = 0;
+  bit IsLoad = 0;
+  bit IsStore = 0;
+
+  bit IsTex = 0;
+  bit IsSust = 0;
+  bit IsSurfTexQuery = 0;
+  bit IsTexModeUnified = 0;
+
+  // The following field is encoded as log2 of the vector size minus one,
+  // with 0 meaning the operation is not a surface instruction.  For example,
+  // if IsSuld == 2, then the instruction is a suld instruction with vector size
+  // 2**(2-1) = 2.
+  bits<2> IsSuld = 0;
+
+  let TSFlags{3-0}   = VecInstType;
+  let TSFlags{4-4}   = IsSimpleMove;
+  let TSFlags{5-5}   = IsLoad;
+  let TSFlags{6-6}   = IsStore;
+  let TSFlags{7}     = IsTex;
+  let TSFlags{9-8}   = IsSuld;
+  let TSFlags{10}    = IsSust;
+  let TSFlags{11}    = IsSurfTexQuery;
+  let TSFlags{12}    = IsTexModeUnified;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
new file mode 100644
index 0000000..b5b4fbe
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -0,0 +1,273 @@
+//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "NVPTXGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXInstrInfo::anchor() {}
+
+// FIXME: Add the subtarget support on this constructor.
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI)
+    : NVPTXGenInstrInfo(), RegInfo(STI) {}
+
+void NVPTXInstrInfo::copyPhysReg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+    unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
+  const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+  if (DestRC != SrcRC)
+    report_fatal_error("Attempted to created cross-class register copy");
+
+  if (DestRC == &NVPTX::Int32RegsRegClass)
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Int1RegsRegClass)
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Float32RegsRegClass)
+    BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Int16RegsRegClass)
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Int64RegsRegClass)
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Float64RegsRegClass)
+    BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else {
+    llvm_unreachable("Bad register copy");
+  }
+}
+
+bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                                 unsigned &DestReg) const {
+  // Look for the appropriate part of TSFlags
+  bool isMove = false;
+
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
+  isMove = (TSFlags == 1);
+
+  if (isMove) {
+    MachineOperand dest = MI.getOperand(0);
+    MachineOperand src = MI.getOperand(1);
+    assert(dest.isReg() && "dest of a movrr is not a reg");
+    assert(src.isReg() && "src of a movrr is not a reg");
+
+    SrcReg = src.getReg();
+    DestReg = dest.getReg();
+    return true;
+  }
+
+  return false;
+}
+
+bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case NVPTX::INT_PTX_SREG_NTID_X:
+  case NVPTX::INT_PTX_SREG_NTID_Y:
+  case NVPTX::INT_PTX_SREG_NTID_Z:
+  case NVPTX::INT_PTX_SREG_TID_X:
+  case NVPTX::INT_PTX_SREG_TID_Y:
+  case NVPTX::INT_PTX_SREG_TID_Z:
+  case NVPTX::INT_PTX_SREG_CTAID_X:
+  case NVPTX::INT_PTX_SREG_CTAID_Y:
+  case NVPTX::INT_PTX_SREG_CTAID_Z:
+  case NVPTX::INT_PTX_SREG_NCTAID_X:
+  case NVPTX::INT_PTX_SREG_NCTAID_Y:
+  case NVPTX::INT_PTX_SREG_NCTAID_Z:
+  case NVPTX::INT_PTX_SREG_WARPSIZE:
+    return true;
+  }
+}
+
+bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
+                                 unsigned &AddrSpace) const {
+  bool isLoad = false;
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
+  isLoad = (TSFlags == 1);
+  if (isLoad)
+    AddrSpace = getLdStCodeAddrSpace(MI);
+  return isLoad;
+}
+
+bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
+                                  unsigned &AddrSpace) const {
+  bool isStore = false;
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
+  isStore = (TSFlags == 1);
+  if (isStore)
+    AddrSpace = getLdStCodeAddrSpace(MI);
+  return isStore;
+}
+
+bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
+  unsigned addrspace = 0;
+  if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+    return false;
+  if (isLoadInstr(*MI, addrspace))
+    if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+      return false;
+  if (isStoreInstr(*MI, addrspace))
+    if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+      return false;
+  return true;
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with an conditional branch and it falls through to
+///    an successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with an conditional branch and an unconditional
+///    block, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool NVPTXInstrInfo::AnalyzeBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == NVPTX::GOTO) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastInst->getOpcode() == NVPTX::CBranch) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it.
+  if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
+      LastInst->getOpcode() == NVPTX::GOTO) {
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two NVPTX:GOTOs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
+      LastInst->getOpcode() == NVPTX::GOTO) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return 0;
+  --I;
+  if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin())
+    return 1;
+  --I;
+  if (I->getOpcode() != NVPTX::CBranch)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned NVPTXInstrInfo::InsertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "NVPTX branch conditions have two components!");
+
+  // One-way branch.
+  if (!FBB) {
+    if (Cond.empty()) // Unconditional branch
+      BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
+    else // Conditional branch
+      BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg())
+          .addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way Conditional Branch.
+  BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg()).addMBB(TBB);
+  BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
+  return 2;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
new file mode 100644
index 0000000..2ac2974
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -0,0 +1,78 @@
+//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the niversity of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXINSTRUCTIONINFO_H
+#define NVPTXINSTRUCTIONINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "NVPTXGenInstrInfo.inc"
+
+namespace llvm {
+
+class NVPTXInstrInfo : public NVPTXGenInstrInfo {
+  const NVPTXRegisterInfo RegInfo;
+  virtual void anchor();
+public:
+  explicit NVPTXInstrInfo(NVPTXSubtarget &STI);
+
+  const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+  /* The following virtual functions are used in register allocation.
+   * They are not implemented because the existing interface and the logic
+   * at the caller side do not work for the elementized vector load and store.
+   *
+   * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+   *                                  int &FrameIndex) const;
+   * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+   *                                 int &FrameIndex) const;
+   * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+   *                              MachineBasicBlock::iterator MBBI,
+   *                             unsigned SrcReg, bool isKill, int FrameIndex,
+   *                              const TargetRegisterClass *RC) const;
+   * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+   *                               MachineBasicBlock::iterator MBBI,
+   *                               unsigned DestReg, int FrameIndex,
+   *                               const TargetRegisterClass *RC) const;
+   */
+
+  void copyPhysReg(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+      unsigned DestReg, unsigned SrcReg, bool KillSrc) const override;
+  virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                           unsigned &DestReg) const;
+  bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+  bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+  bool isReadSpecialReg(MachineInstr &MI) const;
+
+  virtual bool CanTailMerge(const MachineInstr *MI) const;
+  // Branch analysis.
+  bool AnalyzeBranch(
+      MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+      SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(
+      MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+      const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override;
+  unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
+    return MI.getOperand(2).getImm();
+  }
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
new file mode 100644
index 0000000..9900b8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -0,0 +1,2741 @@
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+def NOP : NVPTXInst<(outs), (ins), "", []>;
+
+// List of vector specific properties
+def isVecLD      : VecInstTypeEnum<1>;
+def isVecST      : VecInstTypeEnum<2>;
+def isVecBuild   : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert  : VecInstTypeEnum<6>;
+def isVecDest    : VecInstTypeEnum<7>;
+def isVecOther   : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget    : Operand<OtherVT>;
+
+// CVT conversion modes
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI  : PatLeaf<(i32 0x1)>;
+def CvtRZI  : PatLeaf<(i32 0x2)>;
+def CvtRMI  : PatLeaf<(i32 0x3)>;
+def CvtRPI  : PatLeaf<(i32 0x4)>;
+def CvtRN   : PatLeaf<(i32 0x5)>;
+def CvtRZ   : PatLeaf<(i32 0x6)>;
+def CvtRM   : PatLeaf<(i32 0x7)>;
+def CvtRP   : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
+
+def CvtSAT      : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+  let PrintMethod = "printCvtMode";
+}
+
+// Compare modes
+// These must match the enum in NVPTX.h
+def CmpEQ   : PatLeaf<(i32 0)>;
+def CmpNE   : PatLeaf<(i32 1)>;
+def CmpLT   : PatLeaf<(i32 2)>;
+def CmpLE   : PatLeaf<(i32 3)>;
+def CmpGT   : PatLeaf<(i32 4)>;
+def CmpGE   : PatLeaf<(i32 5)>;
+def CmpLO   : PatLeaf<(i32 6)>;
+def CmpLS   : PatLeaf<(i32 7)>;
+def CmpHI   : PatLeaf<(i32 8)>;
+def CmpHS   : PatLeaf<(i32 9)>;
+def CmpEQU  : PatLeaf<(i32 10)>;
+def CmpNEU  : PatLeaf<(i32 11)>;
+def CmpLTU  : PatLeaf<(i32 12)>;
+def CmpLEU  : PatLeaf<(i32 13)>;
+def CmpGTU  : PatLeaf<(i32 14)>;
+def CmpGEU  : PatLeaf<(i32 15)>;
+def CmpNUM  : PatLeaf<(i32 16)>;
+def CmpNAN  : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ   : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ   : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ   : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ   : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ   : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ   : PatLeaf<(i32 0x105)>;
+def CmpLO_FTZ   : PatLeaf<(i32 0x106)>;
+def CmpLS_FTZ   : PatLeaf<(i32 0x107)>;
+def CmpHI_FTZ   : PatLeaf<(i32 0x108)>;
+def CmpHS_FTZ   : PatLeaf<(i32 0x109)>;
+def CmpEQU_FTZ  : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ  : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ  : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ  : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ  : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ  : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ  : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+  let PrintMethod = "printCmpMode";
+}
+
+def F32ConstZero : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
+    return CurDAG->getTargetConstantFP(0.0, MVT::f32);
+  }]>;
+def F32ConstOne : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
+    return CurDAG->getTargetConstantFP(1.0, MVT::f32);
+  }]>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+  Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+  Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget.hasVote()">;
+def hasDouble : Predicate<"Subtarget.hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
+def hasLDG : Predicate<"Subtarget.hasLDG()">;
+def hasLDU : Predicate<"Subtarget.hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+
+def doMulWide      : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
+
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+
+def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">;
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+multiclass I3<string OpcStr, SDNode OpNode> {
+  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                       Int64Regs:$b))]>;
+  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                       Int32Regs:$b))]>;
+  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                       Int16Regs:$b))]>;
+  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+   def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+       Int32Regs:$b),
+                      !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+   def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+multiclass F3<string OpcStr, SDNode OpNode> {
+   def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                      Requires<[allowFMA]>;
+   def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, fpimm:$b))]>,
+                      Requires<[allowFMA]>;
+   def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[allowFMA, doF32FTZ]>;
+   def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[allowFMA, doF32FTZ]>;
+   def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[allowFMA]>;
+   def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[allowFMA]>;
+}
+
+multiclass F3_rn<string OpcStr, SDNode OpNode> {
+   def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b),
+                      !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                      Requires<[noFMA]>;
+   def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b),
+                      !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA]>;
+   def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[noFMA, doF32FTZ]>;
+   def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA, doF32FTZ]>;
+   def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[noFMA]>;
+   def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA]>;
+}
+
+multiclass F2<string OpcStr, SDNode OpNode> {
+   def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+                      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+   def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                      !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+                      Requires<[doF32FTZ]>;
+   def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// General Type Conversion
+//-----------------------------------
+
+let neverHasSideEffects = 1 in {
+// Generate a cvt to the given type from all possible types.
+// Each instance takes a CvtMode immediate that defines the conversion mode to
+// use.  It can be CvtNONE to omit a conversion mode.
+multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+  def _s16 : NVPTXInst<(outs RC:$dst),
+                       (ins Int16Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".s16\t$dst, $src;"),
+                       []>;
+  def _u16 : NVPTXInst<(outs RC:$dst),
+                       (ins Int16Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".u16\t$dst, $src;"),
+                       []>;
+  def _f16 : NVPTXInst<(outs RC:$dst),
+                       (ins Int16Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".f16\t$dst, $src;"),
+                       []>;
+  def _s32 : NVPTXInst<(outs RC:$dst),
+                       (ins Int32Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".s32\t$dst, $src;"),
+                       []>;
+  def _u32 : NVPTXInst<(outs RC:$dst),
+                       (ins Int32Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".u32\t$dst, $src;"),
+                       []>;
+  def _s64 : NVPTXInst<(outs RC:$dst),
+                       (ins Int64Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".s64\t$dst, $src;"),
+                       []>;
+  def _u64 : NVPTXInst<(outs RC:$dst),
+                       (ins Int64Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".u64\t$dst, $src;"),
+                       []>;
+  def _f32 : NVPTXInst<(outs RC:$dst),
+                       (ins Float32Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".f32\t$dst, $src;"),
+                       []>;
+  def _f64 : NVPTXInst<(outs RC:$dst),
+                       (ins Float64Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".f64\t$dst, $src;"),
+                       []>;
+}
+
+// Generate a cvt to all possible types.
+defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+// This set of cvt is different from the above. The type of the source
+// and target are the same.
+//
+def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                        "cvt.s16.s8 \t$dst, $src;", []>;
+def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                        "cvt.s32.s8 \t$dst, $src;", []>;
+def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                        "cvt.s32.s16 \t$dst, $src;", []>;
+def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "cvt.s64.s8 \t$dst, $src;", []>;
+def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "cvt.s64.s16 \t$dst, $src;", []>;
+def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "cvt.s64.s32 \t$dst, $src;", []>;
+}
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+multiclass ADD_SUB_i1<SDNode OpNode> {
+   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+          "xor.pred \t$dst, $a, $b;",
+      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+          "xor.pred \t$dst, $a, $b;",
+      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+//mul.wide PTX instruction
+def SInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isSignedIntN(32))
+    return true;
+  return false;
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isIntN(32))
+    return true;
+  return false;
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isSignedIntN(16))
+    return true;
+  return false;
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isIntN(16))
+    return true;
+  return false;
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  // Check if 0 <= v < 32
+  // Only then the result from (x << v) will be i32
+  if (v.sge(0) && v.slt(32))
+    return true;
+  return false;
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  // Check if 0 <= v < 16
+  // Only then the result from (x << v) will be i16
+  if (v.sge(0) && v.slt(16))
+    return true;
+  return false;
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(32, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(16, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
+}]>;
+
+def MULWIDES64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                           "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+                           "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                           "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+                           "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                           "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                           "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+              "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                           "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                            "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+          Requires<[doMulWide]>;
+
+
+def SDTMulWide
+  : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed
+  : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned
+  : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+          Requires<[doMulWide]>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+defm SREM : I3<"rem.s", srem>;
+// The ri version will not be selected as DAGCombiner::visitSREM will lower it.
+defm UREM : I3<"rem.u", urem>;
+// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
+
+def SDTIMAD
+  : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
+                         SDTCisInt<2>, SDTCisSameAs<0, 2>,
+                         SDTCisSameAs<0, 3>]>;
+def imad
+  : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
+                      (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst,
+                         (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
+                      (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst,
+                         (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
+                      (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst,
+                        (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
+    (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst,
+                        (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst,
+                        (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst,
+                        (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst,
+                        (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst,
+                        (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst,
+                        (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst,
+                        (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst,
+                        (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst,
+                        (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                     "neg.s16 \t$dst, $src;",
+         [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                     "neg.s32 \t$dst, $src;",
+         [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                     "neg.s64 \t$dst, $src;",
+         [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+    if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
+      return false;
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==1.0f);
+}]>;
+// Constand (double)1.0
+def DoubleConst1 : PatLeaf<(fpimm), [{
+    if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
+      return false;
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==1.0);
+}]>;
+
+defm FADD : F3<"add", fadd>;
+defm FSUB : F3<"sub", fsub>;
+defm FMUL : F3<"mul", fmul>;
+
+defm FADD_rn : F3_rn<"add", fadd>;
+defm FSUB_rn : F3_rn<"sub", fsub>;
+defm FMUL_rn : F3_rn<"mul", fmul>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins f64imm:$a, Float64Regs:$b),
+                      "rcp.rn.f64 \t$dst, $b;",
+                      [(set Float64Regs:$dst,
+                        (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b),
+                      "div.rn.f64 \t$dst, $a, $b;",
+                      [(set Float64Regs:$dst,
+                        (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b),
+                      "div.rn.f64 \t$dst, $a, $b;",
+                      [(set Float64Regs:$dst,
+                        (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                      "rcp.approx.ftz.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
+                        (ins f32imm:$a, Float32Regs:$b),
+                       "rcp.approx.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.approx.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.approx.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr     : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.approx.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.approx.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+//
+def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                      "rcp.approx.ftz.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                      "rcp.approx.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.full.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.full.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.full.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.full.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                        (ins f32imm:$a, Float32Regs:$b),
+                       "rcp.rn.ftz.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                       "rcp.rn.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.rn.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.rn.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.rn.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[reqPTX20]>;
+def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.rn.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[reqPTX20]>;
+
+//
+// F32 rsqrt
+//
+
+def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
+                       "rsqrt.approx.f32 \t$dst, $b;", []>;
+
+def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
+         (RSQRTF32approx1r Float32Regs:$b)>,
+         Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
+
+multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
+   def rrr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
+                      Requires<[Pred]>;
+   def rri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+   def rir : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
+                      Requires<[Pred]>;
+   def rii : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst,
+                        (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+}
+
+multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
+   def rrr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
+                      Requires<[Pred]>;
+   def rri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+   def rir : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
+                      Requires<[Pred]>;
+   def rii : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst,
+                        (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+}
+
+defm FMA32_ftz  : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
+defm FMA32  : FPCONTRACT32<"fma.rn.f32", true>;
+defm FMA64  : FPCONTRACT64<"fma.rn.f64", true>;
+
+def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                      "sin.approx.f32 \t$dst, $src;",
+                      [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                      "cos.approx.f32 \t$dst, $src;",
+                      [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y))
+// e.g. "poor man's fmod()"
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+             Float32Regs:$y))>,
+          Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+             fpimm:$y))>,
+          Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+             Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+             fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+             Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+             fpimm:$y))>;
+
+//-----------------------------------
+// Logical Arithmetic
+//-----------------------------------
+
+multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
+  def b1rr:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+                      !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+  def b1ri:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+                      !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+  def b16rr:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        Int16Regs:$b))]>;
+  def b16ri:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+  def b32rr:  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                      !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+  def b32ri:  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def b64rr:  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                      !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        Int64Regs:$b))]>;
+  def b64ri:  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                      !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR  : LOG_FORMAT<"or", or>;
+defm AND : LOG_FORMAT<"and", and>;
+defm XOR : LOG_FORMAT<"xor", xor>;
+
+def NOT1:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+                      "not.pred \t$dst, $src;",
+                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT16:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                      "not.b16 \t$dst, $src;",
+                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32:  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                      "not.b32 \t$dst, $src;",
+                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64:  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                      "not.b64 \t$dst, $src;",
+                      [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// For shifts, the second src operand must be 32-bit value
+multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
+   def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        Int32Regs:$b))]>;
+   def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+   def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+                        (i32 imm:$b)))]>;
+   def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        Int32Regs:$b))]>;
+   def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        (i32 imm:$b)))]>;
+}
+
+defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
+
+// For shifts, the second src operand must be 32-bit value
+// Need to add cvt for the 8-bits.
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
+   def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        Int32Regs:$b))]>;
+   def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+   def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+                        (i32 imm:$b)))]>;
+   def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        Int32Regs:$b))]>;
+   def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        (i32 imm:$b)))]>;
+}
+
+defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
+defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
+
+//
+// Rotate: use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+//    =>
+//        r2 = shf.l r1, r1, n
+def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, i32imm:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+    [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+    Requires<[hasHWROT32]> ;
+
+def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, Int32Regs:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+    [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+    Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+//    =>
+//        r2 = shf.r r1, r1, n
+def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, i32imm:$amt),
+              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+    [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+    Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$src, Int32Regs:$amt),
+              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+    [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+    Requires<[hasHWROT32]>;
+
+//
+// Rotate: if ptx shf instruction is not available, then use shift+add
+//
+// 32bit
+def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b32 %lhs;\n\t",
+    !strconcat(".reg .b32 %rhs;\n\t",
+    !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t",
+    !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t",
+    !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))),
+    []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+    return CurDAG->getTargetConstant(32-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+
+def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b32 %lhs;\n\t",
+    !strconcat(".reg .b32 %rhs;\n\t",
+    !strconcat(".reg .b32 %amt2;\n\t",
+    !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+    !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+    Requires<[noHWROT32]>;
+
+def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b32 %lhs;\n\t",
+    !strconcat(".reg .b32 %rhs;\n\t",
+    !strconcat(".reg .b32 %amt2;\n\t",
+    !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+    !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+    Requires<[noHWROT32]>;
+
+// 64bit
+def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+    i32imm:$amt1, i32imm:$amt2),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b64 %lhs;\n\t",
+    !strconcat(".reg .b64 %rhs;\n\t",
+    !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t",
+    !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t",
+    !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))),
+    []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+    return CurDAG->getTargetConstant(64-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b64 %lhs;\n\t",
+    !strconcat(".reg .b64 %rhs;\n\t",
+    !strconcat(".reg .u32 %amt2;\n\t",
+    !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+    !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b64 %lhs;\n\t",
+    !strconcat(".reg .b64 %rhs;\n\t",
+    !strconcat(".reg .u32 %amt2;\n\t",
+    !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+    !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+// BFE - bit-field extract
+
+multiclass BFE<string TyStr, RegisterClass RC> {
+  // BFE supports both 32-bit and 64-bit values, but the start and length
+  // operands are always 32-bit
+  def rrr
+    : NVPTXInst<(outs RC:$d),
+                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+  def rri
+    : NVPTXInst<(outs RC:$d),
+                (ins RC:$a, Int32Regs:$b, i32imm:$c),
+                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+  def rii
+    : NVPTXInst<(outs RC:$d),
+                (ins RC:$a, i32imm:$b, i32imm:$c),
+                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+defm BFE_S32 : BFE<"s32", Int32Regs>;
+defm BFE_U32 : BFE<"u32", Int32Regs>;
+defm BFE_S64 : BFE<"s64", Int64Regs>;
+defm BFE_U64 : BFE<"u64", Int64Regs>;
+
+//-----------------------------------
+// General Comparison
+//-----------------------------------
+
+// General setp instructions
+multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+  def rr : NVPTXInst<(outs Int1Regs:$dst),
+                     (ins RC:$a, RC:$b, CmpMode:$cmp),
+            !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+                     []>;
+  def ri : NVPTXInst<(outs Int1Regs:$dst),
+                     (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+            !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+                     []>;
+  def ir : NVPTXInst<(outs Int1Regs:$dst),
+                     (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+            !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+                     []>;
+}
+
+defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
+defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+
+// General set instructions
+multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+  def rr : NVPTXInst<(outs Int32Regs:$dst),
+                     (ins RC:$a, RC:$b, CmpMode:$cmp),
+                     !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+  def ri : NVPTXInst<(outs Int32Regs:$dst),
+                     (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+                     !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+  def ir : NVPTXInst<(outs Int32Regs:$dst),
+                     (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+                     !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+}
+
+defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
+defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+
+//-----------------------------------
+// General Selection
+//-----------------------------------
+
+// General selp instructions
+multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+  def rr : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  def ri : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  def ir : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  def ii : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+}
+
+multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+                        SDNode ImmNode> {
+  def rr : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                     [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+  def ri : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                     [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+  def ir : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                     [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+  def ii : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+}
+
+defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+
+// Special select for predicate operands
+def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
+              (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
+              (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+
+//
+// Funnnel shift in clamp mode
+//
+// - SDNodes are created so they can be used in the DAG code,
+//   e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+//
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                  "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+                  [(set Int32Regs:$dst,
+                     (FUN_SHFL_CLAMP Int32Regs:$lo,
+                        Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+                             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                  "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+                  [(set Int32Regs:$dst,
+                     (FUN_SHFR_CLAMP Int32Regs:$lo,
+                        Int32Regs:$hi, Int32Regs:$amt))]>;
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
+  [SDNPWantRoot]>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
+  [SDNPWantRoot]>;
+
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops Int32Regs, i32imm);
+}
+def MEMri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops Int64Regs, i64imm);
+}
+
+def imem : Operand<iPTR> {
+    let PrintMethod = "printOperand";
+}
+
+def imemAny : Operand<iPTRAny> {
+    let PrintMethod = "printOperand";
+}
+
+def LdStCode : Operand<i32> {
+    let PrintMethod = "printLdStCode";
+}
+
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+                     "mov.u32 \t$dst, $a;",
+                     [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+                     "mov.u64 \t$dst, $a;",
+                     [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// Get pointer to local stack
+def MOV_DEPOT_ADDR
+  : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+              "mov.u32 \t$d, __local_depot$num;", []>;
+def MOV_DEPOT_ADDR_64
+  : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+              "mov.u64 \t$d, __local_depot$num;", []>;
+
+
+// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
+let IsSimpleMove=1 in {
+def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+                   "mov.pred \t$dst, $sss;", []>;
+def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+                    "mov.u16 \t$dst, $sss;", []>;
+def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                    "mov.u32 \t$dst, $sss;", []>;
+def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                    "mov.u64 \t$dst, $sss;", []>;
+
+def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                    "mov.f32 \t$dst, $src;", []>;
+def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+                    "mov.f64 \t$dst, $src;", []>;
+}
+def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+                    "mov.pred \t$dst, $src;",
+          [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                    "mov.u16 \t$dst, $src;",
+          [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                    "mov.u32 \t$dst, $src;",
+          [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+                    "mov.u64 \t$dst, $src;",
+          [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+                    "mov.f32 \t$dst, $src;",
+          [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+                    "mov.f64 \t$dst, $src;",
+          [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+                        "add.u32 \t$dst, ${addr:add};",
+                        [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+                        "add.u64 \t$dst, ${addr:add};",
+                        [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
+                       Instruction setp_16rr,
+                       Instruction setp_16ri,
+                       Instruction setp_16ir,
+                       Instruction setp_32rr,
+                       Instruction setp_32ri,
+                       Instruction setp_32ir,
+                       Instruction setp_64rr,
+                       Instruction setp_64ri,
+                       Instruction setp_64ir,
+                       Instruction set_16rr,
+                       Instruction set_16ri,
+                       Instruction set_16ir,
+                       Instruction set_32rr,
+                       Instruction set_32ri,
+                       Instruction set_32ir,
+                       Instruction set_64rr,
+                       Instruction set_64ri,
+                       Instruction set_64ir> {
+  // i16 -> pred
+  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 -> pred
+  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 -> pred
+  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+
+  // i16 -> i32
+  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 -> i32
+  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 -> i32
+  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+}
+
+multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
+  : ISET_FORMAT<OpNode, Mode,
+                SETP_s16rr, SETP_s16ri, SETP_s16ir,
+                SETP_s32rr, SETP_s32ri, SETP_s32ir,
+                SETP_s64rr, SETP_s64ri, SETP_s64ir,
+                SET_s16rr, SET_s16ri, SET_s16ir,
+                SET_s32rr, SET_s32ri, SET_s32ir,
+                SET_s64rr, SET_s64ri, SET_s64ir> {
+  // TableGen doesn't like empty multiclasses
+  def : PatLeaf<(i32 0)>;
+}
+
+multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
+  : ISET_FORMAT<OpNode, Mode,
+                SETP_u16rr, SETP_u16ri, SETP_u16ir,
+                SETP_u32rr, SETP_u32ri, SETP_u32ir,
+                SETP_u64rr, SETP_u64ri, SETP_u64ir,
+                SET_u16rr, SET_u16ri, SET_u16ir,
+                SET_u32rr, SET_u32ri, SET_u32ir,
+                SET_u64rr, SET_u64ri, SET_u64ir> {
+  // TableGen doesn't like empty multiclasses
+  def : PatLeaf<(i32 0)>;
+}
+
+defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+
+// i1 compares
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+
+
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+  // f32 -> pred
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> pred
+  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+  // f32 -> i32
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> i32
+  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+}
+
+defm FSetGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
+
+//def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+//                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+  SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
+  SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam",
+  SDTDeclareScalarParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam",
+  SDTDeclareParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet   : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+                         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2  : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+                         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4  : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+                         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam   : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+                         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+                         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg      : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg  : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd   : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid     : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal      : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
+                         []>;
+def StoreRetval  : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+                         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2  : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+                           [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4  : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+                           [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
+  SDTPseudoUseParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode   : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+                         [SDNPHasChain, SDNPSideEffect]>;
+
+class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                "\t$dst, [retval0+$b];"),
+                []>;
+
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                !strconcat(!strconcat("mov", opstr),
+                "\t$dst, retval$b;"),
+                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+                !strconcat(!strconcat("ld.param.v2", opstr),
+                "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+                      regclass:$dst4),
+                (ins i32imm:$b),
+                !strconcat(!strconcat("ld.param.v4", opstr),
+                "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), []>;
+
+class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                "\t[param$a+$b], $val;"),
+                []>;
+
+class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+                             i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param.v2", opstr),
+                "\t[param$a+$b], {{$val, $val2}};"),
+                []>;
+
+class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, regclass:$val1, regclass:$val2,
+                             regclass:$val3, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param.v4", opstr),
+                "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+                []>;
+
+class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                "\t[func_retval0+$a], $val;"),
+                []>;
+
+class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+                !strconcat(!strconcat("st.param.v2", opstr),
+                "\t[func_retval0+$a], {{$val, $val2}};"),
+                []>;
+
+class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$val, regclass:$val2, regclass:$val3,
+                     regclass:$val4, i32imm:$a),
+                !strconcat(!strconcat("st.param.v4", opstr),
+                "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+                []>;
+
+def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+"call (retval0), ",
+                                [(PrintCall (i32 1))]>;
+def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1), ",
+                                [(PrintCall (i32 2))]>;
+def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2), ",
+                                [(PrintCall (i32 3))]>;
+def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3), ",
+                                [(PrintCall (i32 4))]>;
+def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4), ",
+                                [(PrintCall (i32 5))]>;
+def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5), ",
+                                [(PrintCall (i32 6))]>;
+def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+                                [(PrintCall (i32 7))]>;
+def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call (retval0, retval1, retval2, retval3, retval4",
+           ", retval5, retval6, retval7), "),
+                                [(PrintCall (i32 8))]>;
+
+def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ",
+                                [(PrintCall (i32 0))]>;
+
+def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins),
+"call.uni (retval0), ",
+                                [(PrintCallUni (i32 1))]>;
+def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1), ",
+                                [(PrintCallUni (i32 2))]>;
+def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2), ",
+                                [(PrintCallUni (i32 3))]>;
+def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3), ",
+                                [(PrintCallUni (i32 4))]>;
+def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4), ",
+                                [(PrintCallUni (i32 5))]>;
+def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ",
+                                [(PrintCallUni (i32 6))]>;
+def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+                                [(PrintCallUni (i32 7))]>;
+def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4",
+           ", retval5, retval6, retval7), "),
+                                [(PrintCallUni (i32 8))]>;
+
+def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
+                                [(PrintCallUni (i32 0))]>;
+
+def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
+def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
+def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
+def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
+def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
+def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
+def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
+def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
+def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
+def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
+def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
+def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
+def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
+def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
+def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
+
+def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
+def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
+
+def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
+def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
+def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
+def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
+def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
+
+// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
+//def StoreParamV4I32    : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I32    : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2,
+                                               Int32Regs:$val3, Int32Regs:$val4,
+                                                i32imm:$a, i32imm:$b),
+                "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                         []>;
+
+def StoreParamV4I16    : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
+                                               Int16Regs:$val3, Int16Regs:$val4,
+                                                i32imm:$a, i32imm:$b),
+                "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                         []>;
+
+def StoreParamV4I8     : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
+                                                Int16Regs:$val3, Int16Regs:$val4,
+                                                i32imm:$a, i32imm:$b),
+                 "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                         []>;
+
+def StoreParamF32    : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64    : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
+def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
+// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
+//def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
+def StoreParamV4F32    : NVPTXInst<(outs),
+                                   (ins Float32Regs:$val, Float32Regs:$val2,
+                                        Float32Regs:$val3, Float32Regs:$val4,
+                                        i32imm:$a, i32imm:$b),
+                "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                        []>;
+
+
+def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
+def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
+def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
+def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
+def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
+def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
+def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
+def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
+def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
+def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
+
+def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
+def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
+def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
+def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
+
+def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+class CallArgInst<NVPTXRegClass regclass> :
+      NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+                [(CallArg (i32 0), regclass:$a)]>;
+
+class LastCallArgInst<NVPTXRegClass regclass> :
+      NVPTXInst<(outs), (ins regclass:$a), "$a",
+                [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64     : CallArgInst<Int64Regs>;
+def CallArgI32     : CallArgInst<Int32Regs>;
+def CallArgI16     : CallArgInst<Int16Regs>;
+
+def CallArgF64     : CallArgInst<Float64Regs>;
+def CallArgF32     : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
+                              [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
+                              [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
+                             [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
+                             [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr),
+                             "$addr, ",
+                             [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr),
+                             "$addr, ",
+                             [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+                             "$addr, ",
+                             [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val),
+                             ", prototype_$val;",
+                             [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst : NVPTXInst<(outs),
+  (ins i32imm:$align, i32imm:$size, i32imm:$num),
+         ".param .align $align .b8 retval$num[$size];",
+         [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+         ".param .b$size retval$num;",
+         [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+         ".reg .b$size retval$num;",
+         [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst : NVPTXInst<(outs),
+  (ins i32imm:$align, i32imm:$a, i32imm:$size),
+         ".param .align $align .b8 param$a[$size];",
+         [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+         ".param .b$size param$a;",
+         [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+         ".reg .b$size param$a;",
+         [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+      NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+                !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"),
+                [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                   "cvt.u16.u32\t$dst, $src;",
+                   [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+      NVPTXInst<(outs), (ins regclass:$src),
+      "// Pseudo use of $src",
+      [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+multiclass LD<NVPTXRegClass regclass> {
+  def _avar : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr];"), []>;
+  def _areg : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr];"), []>;
+  def _areg_64 : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+     !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
+                " \t$dst, [$addr];"), []>;
+  def _ari : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr+$offset];"), []>;
+  def _ari_64 : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
+               " \t$dst, [$addr+$offset];"), []>;
+  def _asi : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr+$offset];"), []>;
+}
+
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LD_i8  : LD<Int16Regs>;
+defm LD_i16 : LD<Int16Regs>;
+defm LD_i32 : LD<Int32Regs>;
+defm LD_i64 : LD<Int64Regs>;
+defm LD_f32 : LD<Float32Regs>;
+defm LD_f64 : LD<Float64Regs>;
+}
+
+multiclass ST<NVPTXRegClass regclass> {
+  def _avar : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr], $src;"), []>;
+  def _areg : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr], $src;"), []>;
+  def _areg_64 : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+     LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+  !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
+               "\t[$addr], $src;"), []>;
+  def _ari : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr+$offset], $src;"), []>;
+  def _ari_64 : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+     LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+  !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
+               "\t[$addr+$offset], $src;"), []>;
+  def _asi : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr+$offset], $src;"), []>;
+}
+
+let mayStore=1, neverHasSideEffects=1 in {
+defm ST_i8  : ST<Int16Regs>;
+defm ST_i16 : ST<Int16Regs>;
+defm ST_i32 : ST<Int32Regs>;
+defm ST_i64 : ST<Int64Regs>;
+defm ST_f32 : ST<Float32Regs>;
+defm ST_f64 : ST<Float64Regs>;
+}
+
+// The following is used only in and after vector elementizations.
+// Vector elementization happens at the machine instruction level, so the
+// following instruction
+// never appears in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+  def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+  def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+  def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+  def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+  def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+  def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+  def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+      regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+  def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+      regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+  def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                               regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+  def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+      regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+                []>;
+  def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                              regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+    []>;
+  def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+      regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+                []>;
+}
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LDV_i8  : LD_VEC<Int16Regs>;
+defm LDV_i16 : LD_VEC<Int16Regs>;
+defm LDV_i32 : LD_VEC<Int32Regs>;
+defm LDV_i64 : LD_VEC<Int64Regs>;
+defm LDV_f32 : LD_VEC<Float32Regs>;
+defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+  def _v2_avar : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+  def _v2_areg : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+  def _v2_areg_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+     LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+  def _v2_ari : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+      i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+  def _v2_ari_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+     LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+     i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+  def _v2_asi : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+      i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+  def _v4_avar : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+  def _v4_areg : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+  def _v4_areg_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+     LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+  def _v4_ari : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+    []>;
+  def _v4_ari_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+     LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+     []>;
+  def _v4_asi : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+    []>;
+}
+let mayStore=1, neverHasSideEffects=1 in {
+defm STV_i8  : ST_VEC<Int16Regs>;
+defm STV_i16 : ST_VEC<Int16Regs>;
+defm STV_i32 : ST_VEC<Int32Regs>;
+defm STV_i64 : ST_VEC<Int64Regs>;
+defm STV_f32 : ST_VEC<Float32Regs>;
+defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+
+//---- Conversion ----
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+  NVPTXRegClass regclassOut> :
+           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+           !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+     [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+
+// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
+// we cannot specify floating-point literals in isel patterns.  Therefore, we
+// use an integer selp to select either 1 or 0 and then cvt to floating-point.
+
+// sint -> f32
+def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
+          (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f32
+def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f64
+def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
+          (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f64
+def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+
+
+// f32 -> sint
+def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f32 -> uint
+def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f64 -> sint
+def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
+          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+
+// f64 -> uint
+def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+
+// sext i1
+def : Pat<(i16 (sext Int1Regs:$a)),
+          (SELP_s16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (sext Int1Regs:$a)),
+          (SELP_s32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (sext Int1Regs:$a)),
+          (SELP_s64ii -1, 0, Int1Regs:$a)>;
+
+// zext i1
+def : Pat<(i16 (zext Int1Regs:$a)),
+          (SELP_u16ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (zext Int1Regs:$a)),
+          (SELP_u32ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (zext Int1Regs:$a)),
+          (SELP_u64ii 1, 0, Int1Regs:$a)>;
+
+// anyext i1
+def : Pat<(i16 (anyext Int1Regs:$a)),
+          (SELP_u16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (anyext Int1Regs:$a)),
+          (SELP_u32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (anyext Int1Regs:$a)),
+          (SELP_u64ii -1, 0, Int1Regs:$a)>;
+
+// sext i16
+def : Pat<(i32 (sext Int16Regs:$a)),
+          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext Int16Regs:$a)),
+          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+
+// zext i16
+def : Pat<(i32 (zext Int16Regs:$a)),
+          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext Int16Regs:$a)),
+          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// anyext i16
+def : Pat<(i32 (anyext Int16Regs:$a)),
+          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext Int16Regs:$a)),
+          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// sext i32
+def : Pat<(i64 (sext Int32Regs:$a)),
+          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+
+// zext i32
+def : Pat<(i64 (zext Int32Regs:$a)),
+          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+// anyext i32
+def : Pat<(i64 (anyext Int32Regs:$a)),
+          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+
+// truncate i64
+def : Pat<(i32 (trunc Int64Regs:$a)),
+          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i16 (trunc Int64Regs:$a)),
+          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int64Regs:$a)),
+          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i32
+def : Pat<(i16 (trunc Int32Regs:$a)),
+          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int32Regs:$a)),
+          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i16
+def : Pat<(i1 (trunc Int16Regs:$a)),
+          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+
+// sext_inreg
+def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+
+
+// Select instructions with 32-bit predicates
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+
+
+// pack a set of smaller int registers to a larger int register
+def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+                          (ins Int16Regs:$s1, Int16Regs:$s2,
+                               Int16Regs:$s3, Int16Regs:$s4),
+                          "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
+                          []>;
+def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+                          (ins Int16Regs:$s1, Int16Regs:$s2),
+                          "mov.b32\t$d, {{$s1, $s2}};",
+                          []>;
+def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+                          (ins Int32Regs:$s1, Int32Regs:$s2),
+                          "mov.b64\t$d, {{$s1, $s2}};",
+                          []>;
+def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+                          (ins Float32Regs:$s1, Float32Regs:$s2),
+                          "mov.b64\t$d, {{$s1, $s2}};",
+                          []>;
+
+// unpack a larger int register to a set of smaller int registers
+def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+                                 Int16Regs:$d3, Int16Regs:$d4),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
+                          []>;
+def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+                           (ins Int32Regs:$s),
+                           "mov.b32\t{{$d1, $d2}}, $s;",
+                          []>;
+def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                          []>;
+def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+                           (ins Float64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                          []>;
+
+// Count leading zeros
+def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                       "clz.b32\t$d, $a;",
+                       []>;
+def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                       "clz.b64\t$d, $a;",
+                       []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a),
+          (CLZr32 Int32Regs:$a)>;
+def : Pat<(ctlz_zero_undef Int32Regs:$a),
+          (CLZr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctlz Int64Regs:$a),
+          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctlz_zero_undef Int64Regs:$a),
+          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
+// than 16 bits to store). We also need to subtract 16 because the
+// high-order 16 zeros were counted.
+def : Pat<(ctlz Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+def : Pat<(ctlz_zero_undef Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+
+// Population count
+def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                        "popc.b32\t$d, $a;",
+                        []>;
+def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                        "popc.b64\t$d, $a;",
+                        []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a),
+          (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctpop Int64Regs:$a),
+          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
+// than 16 bits to store)
+def : Pat<(ctpop Int16Regs:$a),
+          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE)>;
+
+// fround f64 -> f32
+def : Pat<(f32 (fround Float64Regs:$a)),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float64Regs:$a)),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+
+// fextend f32 -> f64
+def : Pat<(f64 (fextend Float32Regs:$a)),
+          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fextend Float32Regs:$a)),
+          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+
+def retflag       : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInGlue]>;
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+   let isReturn=1, isBarrier=1 in
+      def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+   let isBranch=1 in
+      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                          "@$a bra \t$target;",
+                           [(brcond Int1Regs:$a, bb:$target)]>;
+   let isBranch=1 in
+      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                          "@!$a bra \t$target;",
+                           []>;
+
+   let isBranch=1, isBarrier=1 in
+      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+                        "bra.uni \t$target;",
+                  [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target),
+          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+
+// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
+// conditional branch if
+// the target block is the next block so that the code can fall through to the
+// target block.
+// The invertion is done by 'xor condition, 1', which will be translated to
+// (setne condition, -1).
+// Since ptx supports '@!pred bra target', we should use it.
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+  (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_NVPTXCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                           SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+   def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
+                  "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : NVPTXInst<outs, ins, asmstr, pattern>;
+
+// @TODO: We use some tricks here to emit curly braces.  Can we clean this up
+// a bit without TableGen modifications?
+def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
+  "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
+                               [(callseq_start timm:$amt)]>;
+def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+  "\n\t//{{\n\t}}// Callseq End $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+
+def trapinst : NVPTXInst<(outs), (ins),
+                         "trap;",
+                         [(trap)]>;
+
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype
+  : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+  let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE
+  : NVPTXInst<(outs), (ins ProtoIdent:$ident),
+              "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following is a more efficient
+// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
+// - for sm_20, use pmpt (use vector scalar mov to get the pack and
+//   unpack). sm_20 supports native 32-bit register, but not native 16-bit
+// register.
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 0000000..14e51aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,7047 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def immFloat0 : PatLeaf<(fpimm), [{
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==0.0f);
+}]>;
+
+def immFloat1 : PatLeaf<(fpimm), [{
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==1.0f);
+}]>;
+
+def immDouble0 : PatLeaf<(fpimm), [{
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==0.0);
+}]>;
+
+def immDouble1 : PatLeaf<(fpimm), [{
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==1.0);
+}]>;
+
+
+
+//-----------------------------------
+// Synchronization Functions
+//-----------------------------------
+def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
+                  "bar.sync \t0;",
+      [(int_cuda_syncthreads)]>;
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+                  "bar.sync \t0;",
+      [(int_nvvm_barrier0)]>;
+def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+        !strconcat("}}", ""))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
+def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat(".reg .pred \t%p2; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
+      !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+        !strconcat("}}", ""))))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
+def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat(".reg .pred \t%p2; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
+      !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+        !strconcat("}}", ""))))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+
+
+//-----------------------------------
+// Explicit Memory Fence Functions
+//-----------------------------------
+class MEMBAR<string StrOp, Intrinsic IntOP> :
+              NVPTXInst<(outs), (ins),
+            StrOp, [(IntOP)]>;
+
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL  : MEMBAR<"membar.gl;",  int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+
+
+//-----------------------------------
+// Math Functions
+//-----------------------------------
+
+// Map min(1.0, max(0.0, x)) to sat(x)
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x is
+// NaN
+// max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
+// Same story for fmax, fmin.
+
+def : Pat<(int_nvvm_fmin_f immFloat1,
+            (int_nvvm_fmax_f immFloat0, Float32Regs:$a)),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f immFloat1,
+            (int_nvvm_fmax_f Float32Regs:$a, immFloat0)),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+            (int_nvvm_fmax_f immFloat0, Float32Regs:$a), immFloat1),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+            (int_nvvm_fmax_f Float32Regs:$a, immFloat0), immFloat1),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+
+def : Pat<(int_nvvm_fmin_d immDouble1,
+            (int_nvvm_fmax_d immDouble0, Float64Regs:$a)),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d immDouble1,
+            (int_nvvm_fmax_d Float64Regs:$a, immDouble0)),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+            (int_nvvm_fmax_d immDouble0, Float64Regs:$a), immDouble1),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+            (int_nvvm_fmax_d Float64Regs:$a, immDouble0), immDouble1),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+
+// We need a full string for OpcStr here because we need to deal with case like
+// INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+  NVPTXRegClass src_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+            OpcStr,
+        [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with the case
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+  NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs t_regclass:$dst),
+              (ins s0_regclass:$src0, s1_regclass:$src1),
+            OpcStr,
+        [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+  NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+  NVPTXRegClass s2_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs t_regclass:$dst),
+              (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+            OpcStr,
+        [(set t_regclass:$dst,
+          (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+  int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+  int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+  Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+  Float32Regs, Float32Regs, int_nvvm_fmax_f>;
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
+
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+  Float64Regs, Float64Regs, int_nvvm_fmin_d>;
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+  Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
+//
+// Multiplication
+//
+
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
+
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
+
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
+
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
+
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
+
+//
+// Div
+//
+
+def INT_NVVM_DIV_APPROX_FTZ_F
+  : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+    Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
+
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
+def INT_NVVM_DIV_RN_F     : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
+def INT_NVVM_DIV_RZ_F     : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
+def INT_NVVM_DIV_RM_F     : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
+def INT_NVVM_DIV_RP_F     : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
+
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
+
+//
+// Brev
+//
+
+def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_brev32>;
+def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
+  int_nvvm_brev64>;
+
+//
+// Sad
+//
+
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+  Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+  Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+
+//
+// Floor  Ceil
+//
+
+def : Pat<(int_nvvm_floor_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_floor_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(int_nvvm_ceil_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_ceil_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+//
+// Abs
+//
+
+def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_abs_i>;
+def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
+  int_nvvm_abs_ll>;
+
+def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_fabs_ftz_f>;
+def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_fabs_f>;
+
+def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_fabs_d>;
+
+//
+// Round
+//
+
+def : Pat<(int_nvvm_round_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_round_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_round_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+//
+// Trunc
+//
+
+def : Pat<(int_nvvm_trunc_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_trunc_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+//
+// Saturate
+//
+
+def : Pat<(int_nvvm_saturate_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+def : Pat<(int_nvvm_saturate_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+//
+// Exp2  Log2
+//
+
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+
+//
+// Sin  Cos
+//
+
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
+
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
+
+//
+// Fma
+//
+
+def INT_NVVM_FMA_RN_FTZ_F
+  : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>;
+def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>;
+def INT_NVVM_FMA_RZ_FTZ_F
+  : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>;
+def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>;
+def INT_NVVM_FMA_RM_FTZ_F
+  : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>;
+def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>;
+def INT_NVVM_FMA_RP_FTZ_F
+  : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>;
+def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>;
+
+def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>;
+def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>;
+def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>;
+def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>;
+
+//
+// Rcp
+//
+
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rp_d>;
+
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
+
+//
+// Sqrt
+//
+
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rn_f>;
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rz_f>;
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rm_f>;
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rp_f>;
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
+
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rp_d>;
+
+// nvvm_sqrt intrinsic
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+
+//
+// Rsqrt
+//
+
+def INT_NVVM_RSQRT_APPROX_FTZ_F
+  : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
+    int_nvvm_rsqrt_approx_ftz_f>;
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
+
+//
+// Add
+//
+
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
+
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+
+//
+// Convert
+//
+
+def : Pat<(int_nvvm_d2f_rn_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_d2f_rn Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_d2f_rz_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+def : Pat<(int_nvvm_d2f_rz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_d2f_rm_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+def : Pat<(int_nvvm_d2f_rm Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_d2f_rp_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+def : Pat<(int_nvvm_d2f_rp Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_d2i_rn Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2i_rz Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2i_rm Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2i_rp Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ui_rn Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ui_rz Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ui_rm Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ui_rp Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2d_rn Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2d_rz Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2d_rm Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2d_rp Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2d_rn Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2d_rz Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2d_rm Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2d_rp Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_f2i_rn_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rn Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2i_rz_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2i_rm_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rm Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2i_rp_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rp Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ui_rn_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rn Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ui_rz_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ui_rm_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rm Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ui_rp_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rp Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2f_rn Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2f_rz Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2f_rm Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2f_rp Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2f_rn Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2f_rz Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+
+def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
+  Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+
+def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
+                       !strconcat(".reg .b32 %temp; \n\t",
+             !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
+               "}}"))),
+             Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
+                       !strconcat(".reg .b32 %temp; \n\t",
+                         !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
+                           "}}"))),
+             Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+
+def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rn Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ll_rz_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ll_rm_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rm Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ll_rp_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rp Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ull_rn_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rn Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ull_rz_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ull_rm_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rm Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ull_rp_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rp Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ll_rn Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ll_rz Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ll_rm Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ll_rp Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ull_rn Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ull_rz Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ull_rm Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ull_rp Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_ll2f_rn Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2f_rz Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2f_rm Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2f_rp Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2f_rn Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2f_rz Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2f_rm Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2f_rp Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ll2d_rn Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2d_rz Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2d_rm Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2d_rp Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2d_rn Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2d_rz Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2d_rm Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+
+
+// FIXME: Ideally, we could use these patterns instead of the scope-creating
+// patterns, but ptxas does not like these since .s16 is not compatible with
+// .f16.  The solution is to use .bXX for all integer register types, but we
+// are not there yet.
+//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+//          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+//          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+//
+//def : Pat<(int_nvvm_h2f Int16Regs:$a),
+//          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+
+def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
+                                   !strconcat(".reg .b16 %temp;\n\t",
+           !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
+           !strconcat("mov.b16 \t$dst, %temp;\n",
+             "}}")))),
+                                   Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
+def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
+                                   !strconcat(".reg .b16 %temp;\n\t",
+           !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
+           !strconcat("mov.b16 \t$dst, %temp;\n",
+             "}}")))),
+           Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
+
+def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
+                            !strconcat(".reg .b16 %temp;\n\t",
+          !strconcat("mov.b16 \t%temp, $src0;\n\t",
+          !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
+            "}}")))),
+          Float32Regs, Int16Regs, int_nvvm_h2f>;
+
+def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
+          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
+def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
+          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
+          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
+//
+// Bitcast
+//
+
+def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_bitcast_f2i>;
+def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_bitcast_i2f>;
+
+def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_bitcast_ll2d>;
+def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_bitcast_d2ll>;
+
+//-----------------------------------
+// Atomic Functions
+//-----------------------------------
+
+class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+}]>;
+class ATOMIC_SHARED_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+}]>;
+class ATOMIC_GENERIC_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+}]>;
+
+multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, SDNode IMM, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+  Requires<[Pred]>;
+  def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+  string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+  defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, IMM, Pred>;
+  defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, IMM, Pred>;
+}
+
+// has 2 operands, neg the second one
+multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+    !strconcat("{{ \n\t",
+         !strconcat(".reg \t.s",
+         !strconcat(TypeStr,
+         !strconcat(" temp; \n\t",
+         !strconcat("neg.s",
+         !strconcat(TypeStr,
+         !strconcat(" \ttemp, $b; \n\t",
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(".u",
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], temp; \n\t",
+           !strconcat("}}", "")))))))))))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
+  string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
+  Predicate Pred> {
+ defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+   IntOp, IMMType, Pred> ;
+ defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+   IntOp, IMMType, Pred> ;
+}
+
+// has 3 operands
+multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, regclass:$b, regclass:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst,
+           (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+         Requires<[Pred]>;
+  def imm1 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, IMMType:$b, regclass:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+  Requires<[Pred]>;
+  def imm2 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, regclass:$b, IMMType:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+  Requires<[Pred]>;
+  def imm3 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+  string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+  defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, Pred>;
+  defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, Pred>;
+}
+
+// atom_add
+
+def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
+  atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
+  atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
+  atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
+  atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
+  atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
+  atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
+  atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
+  atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
+  atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+
+// atom_sub
+
+def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
+  atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
+  atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
+  atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
+  ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
+  atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
+  atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
+  atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
+  ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+
+// atom_swap
+
+def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
+  atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
+  atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
+  atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
+  atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
+  atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
+  atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_max
+
+def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+  , (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+  , (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+  ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+  ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
+  atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+  ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+  ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
+  atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+  ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
+  atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+  ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
+  atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_min
+
+def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+  ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+  ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
+  atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+  ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+  ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
+  atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+  ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
+  atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+  ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
+  atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_inc  atom_dec
+
+def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
+  atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
+  atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
+  atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
+  atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
+  atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
+  atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_and
+
+def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
+  atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
+  atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
+  atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
+  atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
+  atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
+  atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_or
+
+def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
+  atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
+  atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
+  atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
+  atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
+  atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
+  atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
+
+// atom_xor
+
+def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
+  atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
+  atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
+  atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
+  atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
+  atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
+  atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_cas
+
+def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+
+defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
+  atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
+  atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
+  atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
+  ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
+  atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
+  atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
+  atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
+  ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+
+
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
+class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
+      NVPTXInst<(outs regclassOut:$dst), (ins),
+               OpStr,
+         [(set regclassOut:$dst, (IntOp))]>;
+
+def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_tid_z>;
+
+def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ntid_z>;
+
+def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ctaid_z>;
+
+def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_nctaid_z>;
+
+def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
+  int_nvvm_read_ptx_sreg_warpsize>;
+
+
+//-----------------------------------
+// Support for ldu on sm_20 or later
+//-----------------------------------
+
+// Scalar
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+                      []>, Requires<[hasLDU]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+                        []>, Requires<[hasLDU]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+               !strconcat("ldu.global.", TyStr),
+                      []>, Requires<[hasLDU]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ldu.global.", TyStr),
+                      []>, Requires<[hasLDU]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ldu.global.", TyStr),
+                        []>, Requires<[hasLDU]>;
+}
+
+defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+
+// vector
+
+// Elementized vector ldu
+multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int32Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int64Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri64:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+}
+
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins Int32Regs:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins Int64Regs:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins MEMri:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins MEMri64:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins imemAny:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE
+  : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE
+  : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE
+  : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE
+  : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE
+  : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE
+  : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDU_G_v4i8_ELE
+  : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE
+  : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE
+  : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE
+  : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Float32Regs>;
+
+
+//-----------------------------------
+// Support for ldg on sm_35 or later 
+//-----------------------------------
+
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                      []>, Requires<[hasLDG]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                        []>, Requires<[hasLDG]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                      []>, Requires<[hasLDG]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                      []>, Requires<[hasLDG]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                        []>, Requires<[hasLDG]>;
+}
+
+defm INT_PTX_LDG_GLOBAL_i8
+  : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16
+  : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32
+  : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64
+  : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32
+  : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64
+  : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_p32
+  : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_p64
+  : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+
+// vector
+
+// Elementized vector ldg 
+multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int32Regs:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int64Regs:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri64:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
+  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins Int32Regs:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                               regclass:$dst4), (ins Int64Regs:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins MEMri:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins MEMri64:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                             regclass:$dst4), (ins imemAny:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
+defm INT_PTX_LDG_G_v2i8_ELE
+  : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE
+  : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE
+  : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE
+  : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE
+  : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE
+  : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDG_G_v4i8_ELE
+  : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE
+  : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE
+  : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE
+  : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+
+
+multiclass NG_TO_G<string Str, Intrinsic Intrin> {
+   def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+
+// @TODO: Are these actually needed?  I believe global addresses will be copied
+// to register values anyway.
+   /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+      Requires<[hasGenericLdSt]>;
+   def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+      Requires<[hasGenericLdSt]>;*/
+
+   def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+   def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+
+// @TODO: Are these actually needed?  I believe global addresses will be copied
+// to register values anyway.
+   /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
+   def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+}
+
+multiclass G_TO_NG<string Str, Intrinsic Intrin> {
+   def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+   def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+}
+
+defm cvta_local  : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
+defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
+defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+defm cvta_const  : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>;
+
+defm cvta_to_local   : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
+defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
+defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
+defm cvta_to_const  : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>;
+
+
+// nvvm.ptr.gen.to.param
+def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
+  (ins Int32Regs:$src),
+                        "mov.u32 \t$result, $src;",
+                              [(set Int32Regs:$result,
+                                (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
+def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
+  (ins Int64Regs:$src),
+                        "mov.u64 \t$result, $src;",
+                              [(set Int64Regs:$result,
+                                (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
+
+
+// nvvm.move intrinsicc
+def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+                             "mov.b16 \t$r, $s;",
+                             [(set Int16Regs:$r,
+                               (int_nvvm_move_i16 Int16Regs:$s))]>;
+def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+                             "mov.b32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                               (int_nvvm_move_i32 Int32Regs:$s))]>;
+def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+                             "mov.b64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                               (int_nvvm_move_i64 Int64Regs:$s))]>;
+def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+                             "mov.f32 \t$r, $s;",
+                             [(set Float32Regs:$r,
+                               (int_nvvm_move_float Float32Regs:$s))]>;
+def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+                             "mov.f64 \t$r, $s;",
+                             [(set Float64Regs:$r,
+                               (int_nvvm_move_double Float64Regs:$s))]>;
+def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+                             "mov.u32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                               (int_nvvm_move_ptr Int32Regs:$s))]>;
+def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+                             "mov.u64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                               (int_nvvm_move_ptr Int64Regs:$s))]>;
+
+// @TODO: Are these actually needed, or will we always just see symbols
+// copied to registers first?
+/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
+                             "mov.u32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                             (int_nvvm_move_ptr texternalsym:$s))]>;
+def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
+                             "mov.u64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                             (int_nvvm_move_ptr texternalsym:$s))]>;*/
+
+
+// MoveParam        %r1, param
+// ptr_local_to_gen %r2, %r1
+// ptr_gen_to_local %r3, %r2
+// ->
+// mov %r1, param
+
+// @TODO: Revisit this.  There is a type
+// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
+// instructions are not currently defined. However, we can use the ptr
+// variants and the asm printer will do the right thing.
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+                (MoveParam texternalsym:$src)))),
+               (nvvm_move_ptr64  texternalsym:$src)>;
+def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+                (MoveParam texternalsym:$src)))),
+               (nvvm_move_ptr32  texternalsym:$src)>;
+
+def texsurf_handles
+  : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+              "mov.u64 \t$result, $src;", []>;
+
+//-----------------------------------
+// Compiler Error Warn
+// - Just ignore them in codegen
+//-----------------------------------
+
+def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+                "// llvm.nvvm.compiler.warn()",
+                [(int_nvvm_compiler_warn Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+                "// llvm.nvvm.compiler.warn()",
+                [(int_nvvm_compiler_warn Int64Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+                "// llvm.nvvm.compiler.error()",
+                [(int_nvvm_compiler_error Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+                "// llvm.nvvm.compiler.error()",
+                [(int_nvvm_compiler_error Int64Regs:$a)]>;
+
+
+// isspacep
+
+def ISSPACEP_CONST_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.const \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>,
+    Requires<[hasPTX31]>;
+def ISSPACEP_CONST_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.const \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>,
+    Requires<[hasPTX31]>;
+def ISSPACEP_GLOBAL_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.global \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>;
+def ISSPACEP_GLOBAL_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.global \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>;
+def ISSPACEP_LOCAL_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.local \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>;
+def ISSPACEP_LOCAL_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.local \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>;
+def ISSPACEP_SHARED_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.shared \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>;
+def ISSPACEP_SHARED_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.shared \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>;
+
+
+// Special register reads
+def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+                            (ins SpecialRegs:$r),
+                            "mov.b32\t$d, $r;", []>;
+
+def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
+
+
+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$src, i32imm:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+              [(set Int32Regs:$dst,
+                 (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+              Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$src, Int32Regs:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+              [(set Int32Regs:$dst,
+                 (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+              Requires<[hasHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+          (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]> ;
+
+def GET_LO_INT64
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+              !strconcat("{{\n\t",
+              !strconcat(".reg .b32 %dummy;\n\t",
+              !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+        !strconcat("}}", "")))),
+        []> ;
+
+def GET_HI_INT64
+  : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+              !strconcat("{{\n\t",
+              !strconcat(".reg .b32 %dummy;\n\t",
+              !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+        !strconcat("}}", "")))),
+        []> ;
+
+def PACK_TWO_INT32
+  : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+              "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+          (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+                          (GET_LO_INT64 Int64Regs:$src))> ;
+
+// funnel shift, requires >= sm_32
+def SHF_L_WRAP_B32_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+              "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+    Requires<[hasHWROT32]>;
+
+def SHF_L_WRAP_B32_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+              "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+    Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+              "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+    Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+              "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+    Requires<[hasHWROT32]>;
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (PACK_TWO_INT32
+            (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+            (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+      Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (PACK_TWO_INT32
+            (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+            (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+                               (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+      Requires<[hasHWROT32]>;
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (PACK_TWO_INT32
+            (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+            (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+      Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (PACK_TWO_INT32
+            (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+            (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+                               (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+      Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+
+
+//-----------------------------------
+// Texture Intrinsics
+//-----------------------------------
+
+// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be
+// also defined in NVPTXReplaceImageHandles.cpp
+
+// texmode_independent
+let IsTex = 1, IsTexModeUnified = 0 in {
+// Texture fetch instructions using handles
+def TEX_1D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
+              "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_1D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_2D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_2D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_3D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_3D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_3D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+
+def TEX_CUBE_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_CUBE_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_CUBE_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_CUBE_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+
+def TEX_CUBE_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_CUBE_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_CUBE_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_CUBE_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+
+def TLD4_R_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_G_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_B_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_A_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_R_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_G_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_B_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_A_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_R_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_G_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_B_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_A_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+}
+
+
+// texmode_unified
+let IsTex = 1, IsTexModeUnified = 1 in {
+// Texture fetch instructions using handles
+def TEX_UNIFIED_1D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
+              "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_UNIFIED_1D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_UNIFIED_2D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_UNIFIED_2D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_UNIFIED_3D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_3D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_UNIFIED_3D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_3D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_UNIFIED_3D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_3D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+
+def TEX_UNIFIED_CUBE_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+
+def TLD4_UNIFIED_R_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_G_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_B_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_A_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_R_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_G_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_B_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_A_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_R_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_G_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_B_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_A_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+}
+
+
+
+//=== Surface load instructions
+// .clamp variant
+let IsSuld = 1 in {
+def SULD_1D_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+
+// .trap variant
+let IsSuld = 1 in {
+def SULD_1D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+// .zero variant
+let IsSuld = 1 in {
+def SULD_1D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+//-----------------------------------
+// Texture Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in {
+def TXQ_CHANNEL_ORDER
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.channel_order.b32 \t$d, [$a];",
+              []>;
+def TXQ_CHANNEL_DATA_TYPE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.channel_data_type.b32 \t$d, [$a];",
+              []>;
+def TXQ_WIDTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.width.b32 \t$d, [$a];",
+              []>;
+def TXQ_HEIGHT
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.height.b32 \t$d, [$a];",
+              []>;
+def TXQ_DEPTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.depth.b32 \t$d, [$a];",
+              []>;
+def TXQ_ARRAY_SIZE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.array_size.b32 \t$d, [$a];",
+              []>;
+def TXQ_NUM_SAMPLES
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.num_samples.b32 \t$d, [$a];",
+              []>;
+def TXQ_NUM_MIPMAP_LEVELS
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.num_mipmap_levels.b32 \t$d, [$a];",
+              []>;
+}
+
+def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
+          (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
+          (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_width Int64Regs:$a),
+          (TXQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_height Int64Regs:$a),
+          (TXQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
+          (TXQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
+          (TXQ_ARRAY_SIZE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
+          (TXQ_NUM_SAMPLES Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
+          (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>;
+
+
+//-----------------------------------
+// Surface Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in {
+def SUQ_CHANNEL_ORDER
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.channel_order.b32 \t$d, [$a];",
+              []>;
+def SUQ_CHANNEL_DATA_TYPE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.channel_data_type.b32 \t$d, [$a];",
+              []>;
+def SUQ_WIDTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.width.b32 \t$d, [$a];",
+              []>;
+def SUQ_HEIGHT
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.height.b32 \t$d, [$a];",
+              []>;
+def SUQ_DEPTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.depth.b32 \t$d, [$a];",
+              []>;
+def SUQ_ARRAY_SIZE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.array_size.b32 \t$d, [$a];",
+              []>;
+}
+
+def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
+          (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
+          (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_width Int64Regs:$a),
+          (SUQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_height Int64Regs:$a),
+          (SUQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
+          (SUQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
+          (SUQ_ARRAY_SIZE Int64Regs:$a)>;
+
+
+//===- Handle Query -------------------------------------------------------===//
+
+// TODO: These intrinsics are not yet finalized, pending PTX ISA design work
+def ISTYPEP_SAMPLER
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.samplerref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>;
+def ISTYPEP_SURFACE
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.surfref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>;
+def ISTYPEP_TEXTURE
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.texref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>;
+
+//===- Surface Stores -----------------------------------------------------===//
+
+let IsSust = 1 in {
+// Unformatted
+// .clamp variant
+def SUST_B_1D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+// .trap variant
+def SUST_B_1D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+// .zero variant
+def SUST_B_1D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+
+// Formatted
+
+def SUST_P_1D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_1D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_2D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_2D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_3D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_3D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_3D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+}
+
+// Surface store instruction patterns
+// I'm not sure why we can't just include these in the instruction definitions,
+// but TableGen complains of type errors :(
+
+// .clamp variant
+def : Pat<(int_nvvm_sust_b_1d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .trap variant
+def : Pat<(int_nvvm_sust_b_1d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .zero variant
+def : Pat<(int_nvvm_sust_b_1d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_3d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_P_3D_B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_P_3D_B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_P_3D_B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_3D_V2B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_3D_V2B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_3D_V2B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+//===-- Old PTX Back-end Intrinsics ---------------------------------------===//
+
+// These intrinsics are handled to retain compatibility with the old backend.
+
+// PTX Special Purpose Register Accessor Intrinsics
+
+class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+  : NVPTXInst<(outs Int64Regs:$d), (ins),
+              !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+              [(set Int64Regs:$d, (intop))]>;
+
+class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+  : NVPTXInst<(outs Int32Regs:$d), (ins),
+              !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+              [(set Int32Regs:$d, (intop))]>;
+
+// TODO Add read vector-version of special registers
+
+def PTX_READ_TID_X   : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+                                                     int_ptx_read_tid_x>;
+def PTX_READ_TID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+                                                     int_ptx_read_tid_y>;
+def PTX_READ_TID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+                                                     int_ptx_read_tid_z>;
+def PTX_READ_TID_W   : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+                                                     int_ptx_read_tid_w>;
+
+def PTX_READ_NTID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+                                                      int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+                                                      int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+                                                      int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+                                                      int_ptx_read_ntid_w>;
+
+def PTX_READ_LANEID  : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+                                                     int_ptx_read_laneid>;
+def PTX_READ_WARPID  : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+                                                     int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+                                                     int_ptx_read_nwarpid>;
+
+def PTX_READ_CTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+                                                       int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+                                                       int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+                                                       int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+                                                       int_ptx_read_ctaid_w>;
+
+def PTX_READ_NCTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+                                                        int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+                                                        int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+                                                        int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+                                                        int_ptx_read_nctaid_w>;
+
+def PTX_READ_SMID  : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+                                                   int_ptx_read_smid>;
+def PTX_READ_NSMID  : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+                                                    int_ptx_read_nsmid>;
+def PTX_READ_GRIDID  : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+                                                     int_ptx_read_gridid>;
+
+def PTX_READ_LANEMASK_EQ
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
+def PTX_READ_LANEMASK_LE
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
+def PTX_READ_LANEMASK_LT
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
+def PTX_READ_LANEMASK_GE
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
+def PTX_READ_LANEMASK_GT
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
+
+def PTX_READ_CLOCK
+  : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
+def PTX_READ_CLOCK64
+  : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
+
+def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
+def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
+def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
+def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
+
+// PTX Parallel Synchronization and Communication Intrinsics
+
+def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+                             [(int_ptx_bar_sync imm:$i)]>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
new file mode 100644
index 0000000..f0c3663
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -0,0 +1,205 @@
+//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when
+// the size is large or is not a compile-time constant.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+namespace llvm { FunctionPass *createLowerAggrCopies(); }
+
+char NVPTXLowerAggrCopies::ID = 0;
+
+// Lower MemTransferInst or load-store pair to loop
+static void convertTransferToLoop(
+    Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len,
+    //unsigned numLoads,
+    bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {
+  Type *indType = len->getType();
+
+  BasicBlock *origBB = splitAt->getParent();
+  BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+  BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+  origBB->getTerminator()->setSuccessor(0, loopBB);
+  IRBuilder<> builder(origBB, origBB->getTerminator());
+
+  // srcAddr and dstAddr are expected to be pointer types,
+  // so no check is made here.
+  unsigned srcAS = dyn_cast<PointerType>(srcAddr->getType())->getAddressSpace();
+  unsigned dstAS = dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
+  dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+
+  IRBuilder<> loop(loopBB);
+  // The loop index (ind) is a phi node.
+  PHINode *ind = loop.CreatePHI(indType, 0);
+  // Incoming value for ind is 0
+  ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+
+  // load from srcAddr+ind
+  Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
+  // store at dstAddr+ind
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
+
+  // The value for ind coming from backedge is (ind + 1)
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+// Lower MemSetInst to loop
+static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
+                                Value *len, Value *val, LLVMContext &Context,
+                                Function &F) {
+  BasicBlock *origBB = splitAt->getParent();
+  BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+  BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+  origBB->getTerminator()->setSuccessor(0, loopBB);
+  IRBuilder<> builder(origBB, origBB->getTerminator());
+
+  unsigned dstAS = dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointer to the type of value getting stored
+  dstAddr =
+      builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS));
+
+  IRBuilder<> loop(loopBB);
+  PHINode *ind = loop.CreatePHI(len->getType(), 0);
+  ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
+
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+  SmallVector<LoadInst *, 4> aggrLoads;
+  SmallVector<MemTransferInst *, 4> aggrMemcpys;
+  SmallVector<MemSetInst *, 4> aggrMemsets;
+
+  const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout();
+  LLVMContext &Context = F.getParent()->getContext();
+
+  //
+  // Collect all the aggrLoads, aggrMemcpys and addrMemsets.
+  //
+  //const BasicBlock *firstBB = &F.front();  // first BB in F
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    //BasicBlock *bb = BI;
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      if (LoadInst *load = dyn_cast<LoadInst>(II)) {
+
+        if (load->hasOneUse() == false)
+          continue;
+
+        if (DL->getTypeStoreSize(load->getType()) < MaxAggrCopySize)
+          continue;
+
+        User *use = load->user_back();
+        if (StoreInst *store = dyn_cast<StoreInst>(use)) {
+          if (store->getOperand(0) != load) //getValueOperand
+            continue;
+          aggrLoads.push_back(load);
+        }
+      } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {
+        Value *len = intr->getLength();
+        // If the number of elements being copied is greater
+        // than MaxAggrCopySize, lower it to a loop
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemcpys.push_back(intr);
+          }
+        } else {
+          // turn variable length memcpy/memmov into loop
+          aggrMemcpys.push_back(intr);
+        }
+      } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
+        Value *len = memsetintr->getLength();
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemsets.push_back(memsetintr);
+          }
+        } else {
+          // turn variable length memset into loop
+          aggrMemsets.push_back(memsetintr);
+        }
+      }
+    }
+  }
+  if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) &&
+      (aggrMemsets.size() == 0))
+    return false;
+
+  //
+  // Do the transformation of an aggr load/copy/set to a loop
+  //
+  for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
+    LoadInst *load = aggrLoads[i];
+    StoreInst *store = dyn_cast<StoreInst>(*load->user_begin());
+    Value *srcAddr = load->getOperand(0);
+    Value *dstAddr = store->getOperand(1);
+    unsigned numLoads = DL->getTypeStoreSize(load->getType());
+    Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
+
+    convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
+                          store->isVolatile(), Context, F);
+
+    store->eraseFromParent();
+    load->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
+    MemTransferInst *cpy = aggrMemcpys[i];
+    Value *len = cpy->getLength();
+    // llvm 2.7 version of memcpy does not have volatile
+    // operand yet. So always making it non-volatile
+    // optimistically, so that we don't see unnecessary
+    // st.volatile in ptx
+    convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
+                          false, Context, F);
+    cpy->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
+    MemSetInst *memsetinst = aggrMemsets[i];
+    Value *len = memsetinst->getLength();
+    Value *val = memsetinst->getValue();
+    convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
+                        F);
+    memsetinst->eraseFromParent();
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createLowerAggrCopies() {
+  return new NVPTXLowerAggrCopies();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
new file mode 100644
index 0000000..5ec1fc9
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -0,0 +1,48 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA specific lowering of
+// aggregate copies
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_LOWER_AGGR_COPIES_H
+#define NVPTX_LOWER_AGGR_COPIES_H
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+// actual analysis class, which is a functionpass
+struct NVPTXLowerAggrCopies : public FunctionPass {
+  static char ID;
+
+  NVPTXLowerAggrCopies() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DataLayoutPass>();
+    AU.addPreserved("stack-protector");
+    AU.addPreserved<MachineFunctionAnalysis>();
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  static const unsigned MaxAggrCopySize = 128;
+
+  const char *getPassName() const override {
+    return "Lower aggregate copies/intrinsics into loops";
+  }
+};
+
+extern FunctionPass *createLowerAggrCopies();
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
new file mode 100644
index 0000000..137248b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -0,0 +1,47 @@
+//===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCExpr.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-mcexpr"
+
+const NVPTXFloatMCExpr*
+NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+  return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
+}
+
+void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
+  bool Ignored;
+  unsigned NumHex;
+  APFloat APF = getAPFloat();
+
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_NVPTX_SINGLE_PREC_FLOAT:
+    OS << "0f";
+    NumHex = 8;
+    APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  case VK_NVPTX_DOUBLE_PREC_FLOAT:
+    OS << "0d";
+    NumHex = 16;
+    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  }
+
+  APInt API = APF.bitcastToAPInt();
+  std::string HexStr(utohexstr(API.getZExtValue()));
+  if (HexStr.length() < NumHex)
+    OS << std::string(NumHex - HexStr.length(), '0');
+  OS << utohexstr(API.getZExtValue());
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
new file mode 100644
index 0000000..5547649
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -0,0 +1,83 @@
+//===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Modeled after ARMMCExpr
+
+#ifndef NVPTXMCEXPR_H
+#define NVPTXMCEXPR_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class NVPTXFloatMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_NVPTX_None,
+    VK_NVPTX_SINGLE_PREC_FLOAT,   // FP constant in single-precision
+    VK_NVPTX_DOUBLE_PREC_FLOAT    // FP constant in double-precision
+  };
+
+private:
+  const VariantKind Kind;
+  const APFloat Flt;
+
+  explicit NVPTXFloatMCExpr(VariantKind _Kind, APFloat _Flt)
+    : Kind(_Kind), Flt(_Flt) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const NVPTXFloatMCExpr *Create(VariantKind Kind, APFloat Flt,
+                                        MCContext &Ctx);
+
+  static const NVPTXFloatMCExpr *CreateConstantFPSingle(APFloat Flt,
+                                                        MCContext &Ctx) {
+    return Create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+  }
+
+  static const NVPTXFloatMCExpr *CreateConstantFPDouble(APFloat Flt,
+                                                        MCContext &Ctx) {
+    return Create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  APFloat getAPFloat() const { return Flt; }
+
+/// @}
+
+  void PrintImpl(raw_ostream &OS) const override;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const override {
+    return false;
+  }
+  void visitUsedExpr(MCStreamer &Streamer) const override {};
+  const MCSection *FindAssociatedSection() const override {
+    return nullptr;
+  }
+
+  // There are no TLS NVPTXMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
new file mode 100644
index 0000000..67fb390
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -0,0 +1,46 @@
+//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info  --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is attached to a MachineFunction instance and tracks target-
+// dependent information
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
+private:
+  /// Stores a mapping from index to symbol name for removing image handles
+  /// on Fermi.
+  SmallVector<std::string, 8> ImageHandleList;
+
+public:
+  NVPTXMachineFunctionInfo(MachineFunction &MF) {}
+
+  /// Returns the index for the symbol \p Symbol. If the symbol was previously,
+  /// added, the same index is returned. Otherwise, the symbol is added and the
+  /// new index is returned.
+  unsigned getImageHandleSymbolIndex(const char *Symbol) {
+    // Is the symbol already present?
+    for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
+      if (ImageHandleList[i] == std::string(Symbol))
+        return i;
+    // Nope, insert it
+    ImageHandleList.push_back(Symbol);
+    return ImageHandleList.size()-1;
+  }
+
+  /// Returns the symbol name at the given index.
+  const char *getImageHandleSymbol(unsigned Idx) const {
+    assert(ImageHandleList.size() > Idx && "Bad index");
+    return ImageHandleList[Idx].c_str();
+  }
+};
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 0000000..348ab0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,227 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-prolog-epilog"
+
+namespace {
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+  static char ID;
+  NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+  return new NVPTXPrologEpilogPass();
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  bool Modified = false;
+
+  calculateFrameObjectOffsets(MF);
+
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+      MachineInstr *MI = I;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (!MI->getOperand(i).isFI())
+          continue;
+        TRI.eliminateFrameIndex(MI, 0, i, nullptr);
+        Modified = true;
+      }
+    }
+  }
+
+  // Add function prolog/epilog
+  TFI.emitPrologue(MF);
+
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (!I->empty() && I->back().isReturn())
+      TFI.emitEpilogue(MF, *I);
+  }
+
+  return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI->getObjectSize(FrameIdx);
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI->setObjectOffset(FrameIdx, Offset);
+    Offset += MFI->getObjectSize(FrameIdx);
+  }
+}
+
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0
+         && "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI->getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // NOTE: We do not have a call stack
+
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // No scavenger
+
+  // FIXME: Once this is working, then enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI->getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI->getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+            FIOffset << "]\n");
+      MFI->setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block
+    Offset += MFI->getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // No stack protector
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (MFI->isObjectPreAllocated(i) &&
+        MFI->getUseLocalStackAllocationBlock())
+      continue;
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // No scavenger
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI->getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment.  If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative to
+    // SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI->setStackSize(StackSize);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
new file mode 100644
index 0000000..358ccce
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -0,0 +1,111 @@
+//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXRegisterInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reg-info"
+
+namespace llvm {
+std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass) {
+    return ".f32";
+  }
+  if (RC == &NVPTX::Float64RegsRegClass) {
+    return ".f64";
+  } else if (RC == &NVPTX::Int64RegsRegClass) {
+    return ".s64";
+  } else if (RC == &NVPTX::Int32RegsRegClass) {
+    return ".s32";
+  } else if (RC == &NVPTX::Int16RegsRegClass) {
+    return ".s16";
+  } else if (RC == &NVPTX::Int1RegsRegClass) {
+    return ".pred";
+  } else if (RC == &NVPTX::SpecialRegsRegClass) {
+    return "!Special!";
+  } else {
+    return "INTERNAL";
+  }
+  return "";
+}
+
+std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass) {
+    return "%f";
+  }
+  if (RC == &NVPTX::Float64RegsRegClass) {
+    return "%fd";
+  } else if (RC == &NVPTX::Int64RegsRegClass) {
+    return "%rd";
+  } else if (RC == &NVPTX::Int32RegsRegClass) {
+    return "%r";
+  } else if (RC == &NVPTX::Int16RegsRegClass) {
+    return "%rs";
+  } else if (RC == &NVPTX::Int1RegsRegClass) {
+    return "%p";
+  } else if (RC == &NVPTX::SpecialRegsRegClass) {
+    return "!Special!";
+  } else {
+    return "INTERNAL";
+  }
+  return "";
+}
+}
+
+NVPTXRegisterInfo::NVPTXRegisterInfo(const NVPTXSubtarget &st)
+    : NVPTXGenRegisterInfo(0), Is64Bit(st.is64Bit()) {}
+
+#define GET_REGINFO_TARGET_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+/// NVPTX Callee Saved Registers
+const MCPhysReg *
+NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(FIOperandNum + 1).getImm();
+
+  // Using I0 as the frame pointer
+  MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return NVPTX::VRFrame;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
new file mode 100644
index 0000000..a7594be
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -0,0 +1,72 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXREGISTERINFO_H
+#define NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+
+namespace llvm {
+
+// Forward Declarations.
+class TargetInstrInfo;
+class NVPTXSubtarget;
+
+class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
+private:
+  bool Is64Bit;
+  // Hold Strings that can be free'd all together with NVPTXRegisterInfo
+  ManagedStringPool ManagedStrPool;
+
+public:
+  NVPTXRegisterInfo(const NVPTXSubtarget &st);
+
+  //------------------------------------------------------
+  // Pure virtual functions from TargetRegisterInfo
+  //------------------------------------------------------
+
+  // NVPTX callee saved registers
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  ManagedStringPool *getStrPool() const {
+    return const_cast<ManagedStringPool *>(&ManagedStrPool);
+  }
+
+  const char *getName(unsigned RegNo) const {
+    std::stringstream O;
+    O << "reg" << RegNo;
+    return getStrPool()->getManagedString(O.str().c_str())->c_str();
+  }
+
+};
+
+std::string getNVPTXRegClassName(const TargetRegisterClass *RC);
+std::string getNVPTXRegClassStr(const TargetRegisterClass *RC);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
new file mode 100644
index 0000000..efcee6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -0,0 +1,69 @@
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class NVPTXReg<string n> : Register<n> {
+  let Namespace = "NVPTX";
+}
+
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+     : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+// Special Registers used as stack pointer
+def VRFrame         : NVPTXReg<"%SP">;
+def VRFrameLocal    : NVPTXReg<"%SPL">;
+
+// Special Registers used as the stack
+def VRDepot  : NVPTXReg<"%Depot">;
+
+// We use virtual registers, but define a few physical registers here to keep
+// SDAG and the MachineInstr layers happy.
+foreach i = 0-4 in {
+  def P#i  : NVPTXReg<"%p"#i>;  // Predicate
+  def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
+  def R#i  : NVPTXReg<"%r"#i>;  // 32-bit
+  def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+  def F#i  : NVPTXReg<"%f"#i>;  // 32-bit float
+  def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
+
+  // Arguments
+  def ia#i : NVPTXReg<"%ia"#i>;
+  def la#i : NVPTXReg<"%la"#i>;
+  def fa#i : NVPTXReg<"%fa"#i>;
+  def da#i : NVPTXReg<"%da"#i>;
+}
+
+foreach i = 0-31 in {
+  def ENVREG#i : NVPTXReg<"%envreg"#i>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register classes
+//===----------------------------------------------------------------------===//
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
+
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+                                            (sequence "ENVREG%u", 0, 31))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
new file mode 100644
index 0000000..20d4e27
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -0,0 +1,189 @@
+//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// On Fermi, image handles are not supported. To work around this, we traverse
+// the machine code and replace image handles with concrete symbols. For this
+// to work reliably, inlining of all function call must be performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXReplaceImageHandles : public MachineFunctionPass {
+private:
+  static char ID;
+  DenseSet<MachineInstr *> InstrsToRemove;
+
+public:
+  NVPTXReplaceImageHandles();
+
+  bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {
+    return "NVPTX Replace Image Handles";
+  }
+private:
+  bool processInstr(MachineInstr &MI);
+  void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+  bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF,
+                          unsigned &Idx);
+};
+}
+
+char NVPTXReplaceImageHandles::ID = 0;
+
+NVPTXReplaceImageHandles::NVPTXReplaceImageHandles()
+  : MachineFunctionPass(ID) {}
+
+bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  InstrsToRemove.clear();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      Changed |= processInstr(MI);
+    }
+  }
+
+  // Now clean up any handle-access instructions
+  // This is needed in debug mode when code cleanup passes are not executed,
+  // but we need the handle access to be eliminated because they are not
+  // valid instructions when image handles are disabled.
+  for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(),
+       E = InstrsToRemove.end(); I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+  return Changed;
+}
+
+bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    MachineOperand &TexHandle = MI.getOperand(4);
+    replaceImageHandle(TexHandle, MF);
+
+    if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+      MachineOperand &SampHandle = MI.getOperand(5);
+      replaceImageHandle(SampHandle, MF);
+    }
+
+    return true;
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    unsigned VecSize =
+      1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    MachineOperand &SurfHandle = MI.getOperand(VecSize);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+    // This is a surface store, so operand 0 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(0);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+    // This is a query, so operand 1 is a surfref/texref
+    MachineOperand &Handle = MI.getOperand(1);
+
+    replaceImageHandle(Handle, MF);
+
+    return true; 
+  }
+
+  return false;
+}
+
+void NVPTXReplaceImageHandles::
+replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+  unsigned Idx;
+  if (findIndexForHandle(Op, MF, Idx)) {
+    Op.ChangeToImmediate(Idx);
+  }
+}
+
+bool NVPTXReplaceImageHandles::
+findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+
+  assert(Op.isReg() && "Handle is not in a reg?");
+
+  // Which instruction defines the handle?
+  MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg());
+
+  switch (TexHandleDef.getOpcode()) {
+  case NVPTX::LD_i64_avar: {
+    // The handle is a parameter value being loaded, replace with the
+    // parameter symbol
+    const NVPTXSubtarget &ST = MF.getTarget().getSubtarget<NVPTXSubtarget>();
+    if (ST.getDrvInterface() == NVPTX::CUDA) {
+      // For CUDA, we preserve the param loads coming from function arguments
+      return false;
+    }
+
+    assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
+    StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+    std::string ParamBaseName = MF.getName();
+    ParamBaseName += "_param_";
+    assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
+    unsigned Param = atoi(Sym.data()+ParamBaseName.size());
+    std::string NewSym;
+    raw_string_ostream NewSymStr(NewSym);
+    NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+
+    InstrsToRemove.insert(&TexHandleDef);
+    Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
+    return true;
+  }
+  case NVPTX::texsurf_handles: {
+    // The handle is a global variable, replace with the global variable name
+    assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
+    const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
+    assert(GV->hasName() && "Global sampler must be named!");
+    InstrsToRemove.insert(&TexHandleDef);
+    Idx = MFI->getImageHandleSymbolIndex(GV->getName().data());
+    return true;
+  }
+  case NVPTX::nvvm_move_i64:
+  case TargetOpcode::COPY: {
+    bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx);
+    if (Res) {
+      InstrsToRemove.insert(&TexHandleDef);
+    }
+    return Res;
+  }
+  default:
+    llvm_unreachable("Unknown instruction operating on handle");
+  }
+}
+
+MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() {
+  return new NVPTXReplaceImageHandles();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
new file mode 100644
index 0000000..aa0436b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -0,0 +1,48 @@
+//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTXSection class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_NVPTXSECTION_H
+#define LLVM_NVPTXSECTION_H
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSection.h"
+#include <vector>
+
+namespace llvm {
+/// NVPTXSection - Represents a section in PTX
+/// PTX does not have sections. We create this class in order to use
+/// the ASMPrint interface.
+///
+class NVPTXSection : public MCSection {
+  virtual void anchor();
+public:
+  NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
+  virtual ~NVPTXSection() {}
+
+  /// Override this as NVPTX has its own way of printing switching
+  /// to a section.
+  void PrintSwitchToSection(const MCAsmInfo &MAI,
+                            raw_ostream &OS,
+                            const MCExpr *Subsection) const override {}
+
+  /// Base address of PTX sections is zero.
+  bool isBaseAddressKnownZero() const override { return true; }
+  bool UseCodeAlign() const override { return false; }
+  bool isVirtualSection() const override { return false; }
+  std::string getLabelBeginName() const override { return ""; }
+  std::string getLabelEndName() const override { return ""; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
new file mode 100644
index 0000000..d5cded2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -0,0 +1,70 @@
+//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "NVPTXGenSubtargetInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
+
+static std::string computeDataLayout(bool is64Bit) {
+  std::string Ret = "e";
+
+  if (!is64Bit)
+    Ret += "-p:32:32";
+
+  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+  return Ret;
+}
+
+NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+    // Provide the default CPU if we don't have one.
+  if (CPU.empty() && FS.size())
+    llvm_unreachable("we are not using FeatureStr");
+  TargetName = CPU.empty() ? "sm_20" : CPU;
+
+  ParseSubtargetFeatures(TargetName, FS);
+
+  // Set default to PTX 3.2 (CUDA 5.5)
+  if (PTXVersion == 0) {
+    PTXVersion = 32;
+  }
+
+  return *this;
+}
+
+NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+                               const std::string &FS, const TargetMachine &TM,
+                               bool is64Bit)
+    : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
+      SmVersion(20), DL(computeDataLayout(is64Bit)),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+      TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) {
+
+  Triple T(TT);
+
+  if (T.getOS() == Triple::NVCL)
+    drvInterface = NVPTX::NVCL;
+  else
+    drvInterface = NVPTX::CUDA;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
new file mode 100644
index 0000000..4c41e4e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -0,0 +1,116 @@
+//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXSUBTARGET_H
+#define NVPTXSUBTARGET_H
+
+#include "NVPTX.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "NVPTXGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
+  virtual void anchor();
+  std::string TargetName;
+  NVPTX::DrvInterface drvInterface;
+  bool Is64Bit;
+
+  // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  unsigned PTXVersion;
+
+  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  unsigned int SmVersion;
+
+  const DataLayout DL; // Calculates type size & alignment
+  NVPTXInstrInfo InstrInfo;
+  NVPTXTargetLowering TLInfo;
+  TargetSelectionDAGInfo TSInfo;
+
+  // NVPTX does not have any call stack frame, but need a NVPTX specific
+  // FrameLowering class because TargetFrameLowering is abstract.
+  NVPTXFrameLowering FrameLowering;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified module.
+  ///
+  NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS, const TargetMachine &TM, bool is64Bit);
+
+  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const NVPTXRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+
+  bool hasBrkPt() const { return SmVersion >= 11; }
+  bool hasAtomRedG32() const { return SmVersion >= 11; }
+  bool hasAtomRedS32() const { return SmVersion >= 12; }
+  bool hasAtomRedG64() const { return SmVersion >= 12; }
+  bool hasAtomRedS64() const { return SmVersion >= 20; }
+  bool hasAtomRedGen32() const { return SmVersion >= 20; }
+  bool hasAtomRedGen64() const { return SmVersion >= 20; }
+  bool hasAtomAddF32() const { return SmVersion >= 20; }
+  bool hasVote() const { return SmVersion >= 12; }
+  bool hasDouble() const { return SmVersion >= 13; }
+  bool reqPTX20() const { return SmVersion >= 20; }
+  bool hasF32FTZ() const { return SmVersion >= 20; }
+  bool hasFMAF32() const { return SmVersion >= 20; }
+  bool hasFMAF64() const { return SmVersion >= 13; }
+  bool hasLDG() const { return SmVersion >= 32; }
+  bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
+  bool hasGenericLdSt() const { return SmVersion >= 20; }
+  inline bool hasHWROT32() const { return SmVersion >= 32; }
+  inline bool hasSWROT32() const {
+    return ((SmVersion >= 20) && (SmVersion < 32));
+  }
+  inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
+  inline bool hasROT64() const { return SmVersion >= 20; }
+
+  bool hasImageHandles() const {
+    // Enable handles for Kepler+, where CUDA supports indirect surfaces and
+    // textures
+    if (getDrvInterface() == NVPTX::CUDA)
+      return (SmVersion >= 30);
+
+    // Disabled, otherwise
+    return false;
+  }
+  bool is64Bit() const { return Is64Bit; }
+
+  unsigned int getSmVersion() const { return SmVersion; }
+  NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
+  std::string getTargetName() const { return TargetName; }
+
+  unsigned getPTXVersion() const { return PTXVersion; }
+
+  NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+
+} // End llvm namespace
+
+#endif // NVPTXSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
new file mode 100644
index 0000000..069a1b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -0,0 +1,250 @@
+//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXAllocaHoisting.h"
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVVMReflectPass(PassRegistry&);
+void initializeGenericToNVVMPass(PassRegistry&);
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+
+extern "C" void LLVMInitializeNVPTXTarget() {
+  // Register the target.
+  RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
+  RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
+
+  // FIXME: This pass is really intended to be invoked during IR optimization,
+  // but it's very NVPTX-specific.
+  initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+  initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXFavorNonGenericAddrSpacesPass(
+    *PassRegistry::getPassRegistry());
+}
+
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL, bool is64bit)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this, is64bit) {
+  initAsmInfo();
+}
+
+void NVPTXTargetMachine32::anchor() {}
+
+NVPTXTargetMachine32::NVPTXTargetMachine32(
+    const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+    CodeGenOpt::Level OL)
+    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void NVPTXTargetMachine64::anchor() {}
+
+NVPTXTargetMachine64::NVPTXTargetMachine64(
+    const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+    CodeGenOpt::Level OL)
+    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+namespace {
+class NVPTXPassConfig : public TargetPassConfig {
+public:
+  NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  NVPTXTargetMachine &getNVPTXTargetMachine() const {
+    return getTM<NVPTXTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  void addMachineSSAOptimization() override;
+
+  FunctionPass *createTargetRegisterAllocator(bool) override;
+  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+};
+} // end anonymous namespace
+
+TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+  NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
+  return PassConfig;
+}
+
+void NVPTXPassConfig::addIRPasses() {
+  // The following passes are known to not play well with virtual regs hanging
+  // around after register allocation (which in our case, is *all* registers).
+  // We explicitly disable them here.  We do, however, need some functionality
+  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+  disablePass(&PrologEpilogCodeInserterID);
+  disablePass(&MachineCopyPropagationID);
+  disablePass(&BranchFolderPassID);
+  disablePass(&TailDuplicateID);
+
+  addPass(createNVPTXImageOptimizerPass());
+  TargetPassConfig::addIRPasses();
+  addPass(createNVPTXAssignValidGlobalNamesPass());
+  addPass(createGenericToNVVMPass());
+  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  addPass(createSeparateConstOffsetFromGEPPass());
+  // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
+  // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
+  // significantly better code than EarlyCSE for some of our benchmarks.
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+  // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave
+  // some dead code.  We could remove dead code in an ad-hoc manner, but that
+  // requires manual work and might be error-prone.
+  //
+  // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts,
+  // and leave them unused.
+  //
+  // SeparateConstOffsetFromGEP rebuilds a new index from the old index, and the
+  // old index and some of its intermediate results may become unused.
+  addPass(createDeadCodeEliminationPass());
+}
+
+bool NVPTXPassConfig::addInstSelector() {
+  const NVPTXSubtarget &ST =
+    getTM<NVPTXTargetMachine>().getSubtarget<NVPTXSubtarget>();
+
+  addPass(createLowerAggrCopies());
+  addPass(createAllocaHoisting());
+  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+
+  if (!ST.hasImageHandles())
+    addPass(createNVPTXReplaceImageHandlesPass());
+
+  return false;
+}
+
+bool NVPTXPassConfig::addPreRegAlloc() { return false; }
+bool NVPTXPassConfig::addPostRegAlloc() {
+  addPass(createNVPTXPrologEpilogPass());
+  return false;
+}
+
+FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
+  return nullptr; // No reg alloc
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  assert(!RegAllocPass && "NVPTX uses no regalloc!");
+  addPass(&PHIEliminationID);
+  addPass(&TwoAddressInstructionPassID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  assert(!RegAllocPass && "NVPTX uses no regalloc!");
+
+  addPass(&ProcessImplicitDefsID);
+  addPass(&LiveVariablesID);
+  addPass(&MachineLoopInfoID);
+  addPass(&PHIEliminationID);
+
+  addPass(&TwoAddressInstructionPassID);
+  addPass(&RegisterCoalescerID);
+
+  // PreRA instruction scheduling.
+  if (addPass(&MachineSchedulerID))
+    printAndVerify("After Machine Scheduling");
+
+
+  addPass(&StackSlotColoringID);
+
+  // FIXME: Needs physical registers
+  //addPass(&PostRAMachineLICMID);
+
+  printAndVerify("After StackSlotColoring");
+}
+
+void NVPTXPassConfig::addMachineSSAOptimization() {
+  // Pre-ra tail duplication.
+  if (addPass(&EarlyTailDuplicateID))
+    printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+  // Optimize PHIs before DCE: removing dead PHI cycles may make more
+  // instructions dead.
+  addPass(&OptimizePHIsID);
+
+  // This pass merges large allocas. StackSlotColoring is a different pass
+  // which merges spill slots.
+  addPass(&StackColoringID);
+
+  // If the target requests it, assign local variables to stack slots relative
+  // to one another and simplify frame index references where possible.
+  addPass(&LocalStackSlotAllocationID);
+
+  // With optimization, dead code should already be eliminated. However
+  // there is one known exception: lowered code for arguments that are only
+  // used by tail calls, where the tail calls reuse the incoming stack
+  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+  addPass(&DeadMachineInstructionElimID);
+  printAndVerify("After codegen DCE pass");
+
+  // Allow targets to insert passes that improve instruction level parallelism,
+  // like if-conversion. Such passes will typically need dominator trees and
+  // loop info, just like LICM and CSE below.
+  if (addILPOpts())
+    printAndVerify("After ILP optimizations");
+
+  addPass(&MachineLICMID);
+  addPass(&MachineCSEID);
+
+  addPass(&MachineSinkingID);
+  printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+  addPass(&PeepholeOptimizerID);
+  printAndVerify("After codegen peephole optimization pass");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
new file mode 100644
index 0000000..a7a1c8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -0,0 +1,100 @@
+//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_TARGETMACHINE_H
+#define NVPTX_TARGETMACHINE_H
+
+#include "NVPTXSubtarget.h"
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// NVPTXTargetMachine
+///
+class NVPTXTargetMachine : public LLVMTargetMachine {
+  NVPTXSubtarget Subtarget;
+
+  // Hold Strings that can be free'd all together with NVPTXTargetMachine
+  ManagedStringPool ManagedStrPool;
+
+public:
+  NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                     const TargetOptions &Options, Reloc::Model RM,
+                     CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const NVPTXInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const NVPTXRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+
+  const NVPTXTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+
+  ManagedStringPool *getManagedStrPool() const {
+    return const_cast<ManagedStringPool *>(&ManagedStrPool);
+  }
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  // Emission of machine code through JITCodeEmitter is not supported.
+  bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
+                                  bool = true) override {
+    return true;
+  }
+
+  // Emission of machine code through MCJIT is not supported.
+  bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
+                         bool = true) override {
+    return true;
+  }
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 0000000..ba8086d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <string>
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+  NVPTXTargetObjectFile() {
+    TextSection = nullptr;
+    DataSection = nullptr;
+    BSSSection = nullptr;
+    ReadOnlySection = nullptr;
+
+    StaticCtorSection = nullptr;
+    StaticDtorSection = nullptr;
+    LSDASection = nullptr;
+    EHFrameSection = nullptr;
+    DwarfAbbrevSection = nullptr;
+    DwarfInfoSection = nullptr;
+    DwarfLineSection = nullptr;
+    DwarfFrameSection = nullptr;
+    DwarfPubTypesSection = nullptr;
+    DwarfDebugInlineSection = nullptr;
+    DwarfStrSection = nullptr;
+    DwarfLocSection = nullptr;
+    DwarfARangesSection = nullptr;
+    DwarfRangesSection = nullptr;
+    DwarfMacroInfoSection = nullptr;
+  }
+
+  virtual ~NVPTXTargetObjectFile();
+
+  void Initialize(MCContext &ctx, const TargetMachine &TM) override {
+    TargetLoweringObjectFile::Initialize(ctx, TM);
+    TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
+    DataSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
+    BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
+    ReadOnlySection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
+
+    StaticCtorSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    StaticDtorSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    LSDASection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    EHFrameSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfAbbrevSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfInfoSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfLineSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfFrameSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfPubTypesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfDebugInlineSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfStrSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfLocSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfARangesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfRangesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfMacroInfoSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+  }
+
+  const MCSection *getSectionForConstant(SectionKind Kind,
+                                         const Constant *C) const override {
+    return ReadOnlySection;
+  }
+
+  const MCSection *getExplicitSectionGlobal(const GlobalValue *GV,
+                                       SectionKind Kind, Mangler &Mang,
+                                       const TargetMachine &TM) const override {
+    return DataSection;
+  }
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 0000000..a9fd190
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,543 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/MutexGuard.h"
+
+using namespace llvm;
+
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
+
+ManagedStatic<per_module_annot_t> annotationCache;
+static sys::Mutex Lock;
+
+void llvm::clearAnnotationCache(const llvm::Module *Mod) {
+  MutexGuard Guard(Lock);
+  annotationCache->erase(Mod);
+}
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  MutexGuard Guard(Lock);
+  assert(md && "Invalid mdnode for annotation");
+  assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // start index = 1, to skip the global variable key
+  // increment = 2, to skip the value for each property-value pairs
+  for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+    // property
+    const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+    assert(prop && "Annotation property not a string");
+
+    // value
+    ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i + 1));
+    assert(Val && "Value operand not a constant int");
+
+    std::string keyname = prop->getString().str();
+    if (retval.find(keyname) != retval.end())
+      retval[keyname].push_back(Val->getZExtValue());
+    else {
+      std::vector<unsigned> tmp;
+      tmp.push_back(Val->getZExtValue());
+      retval[keyname] = tmp;
+    }
+  }
+}
+
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  MutexGuard Guard(Lock);
+  NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
+  if (!NMD)
+    return;
+  key_val_pair_t tmp;
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *elem = NMD->getOperand(i);
+
+    Value *entity = elem->getOperand(0);
+    // entity may be null due to DCE
+    if (!entity)
+      continue;
+    if (entity != gv)
+      continue;
+
+    // accumulate annotations for entity in tmp
+    cacheAnnotationFromMD(elem, tmp);
+  }
+
+  if (tmp.empty()) // no annotations for this gv
+    return;
+
+  if ((*annotationCache).find(m) != (*annotationCache).end())
+    (*annotationCache)[m][gv] = tmp;
+  else {
+    global_val_annot_t tmp1;
+    tmp1[gv] = tmp;
+    (*annotationCache)[m] = tmp1;
+  }
+}
+
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 unsigned &retval) {
+  MutexGuard Guard(Lock);
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop][0];
+  return true;
+}
+
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 std::vector<unsigned> &retval) {
+  MutexGuard Guard(Lock);
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop];
+  return true;
+}
+
+bool llvm::isTexture(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(
+            gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
+            annot)) {
+      assert((annot == 1) && "Unexpected annotation on a texture symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSurface(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(
+            gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
+            annot)) {
+      assert((annot == 1) && "Unexpected annotation on a surface symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSampler(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(
+            gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+            annot)) {
+      assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+      return true;
+    }
+  }
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(
+            func, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+            annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageReadOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageWriteOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageReadWrite(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISREADWRITE_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImage(const llvm::Value &val) {
+  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val) ||
+         llvm::isImageReadWrite(val);
+}
+
+bool llvm::isManaged(const llvm::Value &val) {
+  if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if(llvm::findOneNVVMAnnotation(gv,
+                          llvm::PropertyAnnotationNames[llvm::PROPERTY_MANAGED],
+                                   annot)) {
+      assert((annot == 1) && "Unexpected annotation on a managed symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+std::string llvm::getTextureName(const llvm::Value &val) {
+  assert(val.hasName() && "Found texture variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSurfaceName(const llvm::Value &val) {
+  assert(val.hasName() && "Found surface variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSamplerName(const llvm::Value &val) {
+  assert(val.hasName() && "Found sampler variable with no name");
+  return val.getName();
+}
+
+bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X], x));
+}
+
+bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y], y));
+}
+
+bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z], z));
+}
+
+bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X], x));
+}
+
+bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y], y));
+}
+
+bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z], z));
+}
+
+bool llvm::getMinCTASm(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM], x));
+}
+
+bool llvm::isKernelFunction(const Function &F) {
+  unsigned x = 0;
+  bool retval = llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION], x);
+  if (retval == false) {
+    // There is no NVVM metadata, check the calling convention
+    if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel)
+      return true;
+    else
+      return false;
+  }
+  return (x == 1);
+}
+
+bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
+  std::vector<unsigned> Vs;
+  bool retval = llvm::findAllNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN], Vs);
+  if (retval == false)
+    return false;
+  for (int i = 0, e = Vs.size(); i < e; i++) {
+    unsigned v = Vs[i];
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
+  if (MDNode *alignNode = I.getMetadata("callalign")) {
+    for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+      if (const ConstantInt *CI =
+              dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+        unsigned v = CI->getZExtValue();
+        if ((v >> 16) == index) {
+          align = v & 0xFFFF;
+          return true;
+        }
+        if ((v >> 16) > index) {
+          return false;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
+  if ((id == Intrinsic::nvvm_barrier0) ||
+      (id == Intrinsic::nvvm_barrier0_popc) ||
+      (id == Intrinsic::nvvm_barrier0_and) ||
+      (id == Intrinsic::nvvm_barrier0_or) ||
+      (id == Intrinsic::cuda_syncthreads))
+    return true;
+  return false;
+}
+
+// Interface for checking all memory space transfer related intrinsics
+bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
+  if (id == Intrinsic::nvvm_ptr_local_to_gen ||
+      id == Intrinsic::nvvm_ptr_shared_to_gen ||
+      id == Intrinsic::nvvm_ptr_global_to_gen ||
+      id == Intrinsic::nvvm_ptr_constant_to_gen ||
+      id == Intrinsic::nvvm_ptr_gen_to_global ||
+      id == Intrinsic::nvvm_ptr_gen_to_shared ||
+      id == Intrinsic::nvvm_ptr_gen_to_local ||
+      id == Intrinsic::nvvm_ptr_gen_to_constant ||
+      id == Intrinsic::nvvm_ptr_gen_to_param) {
+    return true;
+  }
+
+  return false;
+}
+
+// consider several special intrinsics in striping pointer casts, and
+// provide an option to ignore GEP indicies for find out the base address only
+// which could be used in simple alias disambigurate.
+const Value *
+llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) {
+  V = V->stripPointerCasts();
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (ignore_GEP_indices)
+      if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+        V = GEP->getPointerOperand()->stripPointerCasts();
+        continue;
+      }
+    break;
+  }
+  return V;
+}
+
+// consider several special intrinsics in striping pointer casts, and
+// - ignore GEP indicies for find out the base address only, and
+// - tracking PHINode
+// which could be used in simple alias disambigurate.
+const Value *
+llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
+  if (processed.find(V) != processed.end())
+    return nullptr;
+  processed.insert(V);
+
+  const Value *V2 = V->stripPointerCasts();
+  if (V2 != V && processed.find(V2) != processed.end())
+    return nullptr;
+  processed.insert(V2);
+
+  V = V2;
+
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      V = GEP->getPointerOperand()->stripPointerCasts();
+      continue;
+    } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+      if (V != V2 && processed.find(V) != processed.end())
+        return nullptr;
+      processed.insert(PN);
+      const Value *common = nullptr;
+      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+        const Value *pv = PN->getIncomingValue(i);
+        const Value *base = skipPointerTransfer(pv, processed);
+        if (base) {
+          if (!common)
+            common = base;
+          else if (common != base)
+            return PN;
+        }
+      }
+      if (!common)
+        return PN;
+      V = common;
+    }
+    break;
+  }
+  return V;
+}
+
+// The following are some useful utilities for debuggung
+
+BasicBlock *llvm::getParentBlock(Value *v) {
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent();
+
+  return nullptr;
+}
+
+Function *llvm::getParentFunction(Value *v) {
+  if (Function *F = dyn_cast<Function>(v))
+    return F;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent()->getParent();
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B->getParent();
+
+  return nullptr;
+}
+
+// Dump a block by name
+void llvm::dumpBlock(Value *v, char *blockName) {
+  Function *F = getParentFunction(v);
+  if (!F)
+    return;
+
+  for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
+    BasicBlock *B = it;
+    if (strcmp(B->getName().data(), blockName) == 0) {
+      B->dump();
+      return;
+    }
+  }
+}
+
+// Find an instruction by name
+Instruction *llvm::getInst(Value *base, char *instName) {
+  Function *F = getParentFunction(base);
+  if (!F)
+    return nullptr;
+
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    Instruction *I = &*it;
+    if (strcmp(I->getName().data(), instName) == 0) {
+      return I;
+    }
+  }
+
+  return nullptr;
+}
+
+// Dump an instruction by nane
+void llvm::dumpInst(Value *base, char *instName) {
+  Instruction *I = getInst(base, instName);
+  if (I)
+    I->dump();
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+
+    if (visited->find(I) != visited->end())
+      return;
+
+    visited->insert(I);
+
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      dumpInstRec(I->getOperand(i), visited);
+
+    I->dump();
+  }
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v) {
+  std::set<Instruction *> visited;
+
+  //BasicBlock *B = getParentBlock(v);
+
+  dumpInstRec(v, &visited);
+}
+
+// Dump the parent for Instruction, block or function
+void llvm::dumpParent(Value *v) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+    I->getParent()->dump();
+    return;
+  }
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
+    B->getParent()->dump();
+    return;
+  }
+
+  if (Function *F = dyn_cast<Function>(v)) {
+    F->getParent()->dump();
+    return;
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 0000000..446bfa1
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,96 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM specific utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXUTILITIES_H
+#define NVPTXUTILITIES_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include <cstdarg>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
+#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
+
+void clearAnnotationCache(const llvm::Module *);
+
+bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
+bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
+                           std::vector<unsigned> &);
+
+bool isTexture(const llvm::Value &);
+bool isSurface(const llvm::Value &);
+bool isSampler(const llvm::Value &);
+bool isImage(const llvm::Value &);
+bool isImageReadOnly(const llvm::Value &);
+bool isImageWriteOnly(const llvm::Value &);
+bool isImageReadWrite(const llvm::Value &);
+bool isManaged(const llvm::Value &);
+
+std::string getTextureName(const llvm::Value &);
+std::string getSurfaceName(const llvm::Value &);
+std::string getSamplerName(const llvm::Value &);
+
+bool getMaxNTIDx(const llvm::Function &, unsigned &);
+bool getMaxNTIDy(const llvm::Function &, unsigned &);
+bool getMaxNTIDz(const llvm::Function &, unsigned &);
+
+bool getReqNTIDx(const llvm::Function &, unsigned &);
+bool getReqNTIDy(const llvm::Function &, unsigned &);
+bool getReqNTIDz(const llvm::Function &, unsigned &);
+
+bool getMinCTASm(const llvm::Function &, unsigned &);
+bool isKernelFunction(const llvm::Function &);
+
+bool getAlign(const llvm::Function &, unsigned index, unsigned &);
+bool getAlign(const llvm::CallInst &, unsigned index, unsigned &);
+
+bool isBarrierIntrinsic(llvm::Intrinsic::ID);
+
+/// make_vector - Helper function which is useful for building temporary vectors
+/// to pass into type construction of CallInst ctors.  This turns a null
+/// terminated list of pointers (or other value types) into a real live vector.
+///
+template <typename T> inline std::vector<T> make_vector(T A, ...) {
+  va_list Args;
+  va_start(Args, A);
+  std::vector<T> Result;
+  Result.push_back(A);
+  while (T Val = va_arg(Args, T))
+    Result.push_back(Val);
+  va_end(Args);
+  return Result;
+}
+
+bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id);
+const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices);
+const Value *
+skipPointerTransfer(const Value *V, std::set<const Value *> &processed);
+BasicBlock *getParentBlock(Value *v);
+Function *getParentFunction(Value *v);
+void dumpBlock(Value *v, char *blockName);
+Instruction *getInst(Value *base, char *instName);
+void dumpInst(Value *base, char *instName);
+void dumpInstRec(Value *v, std::set<Instruction *> *visited);
+void dumpInstRec(Value *v);
+void dumpParent(Value *v);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
new file mode 100644
index 0000000..775df19
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
@@ -0,0 +1,1481 @@
+//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Vector Specific
+//-----------------------------------
+
+//
+// All vector instructions derive from NVPTXVecInst
+//
+
+class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+  NVPTXInst sInst=NOP>
+  : NVPTXInst<outs, ins, asmstr, pattern> {
+  NVPTXInst scalarInst=sInst;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
+// Extract v2i16
+def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+  (ins V2I16Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int16Regs:$dst, (vector_extract
+                           (v2i16 V2I16Regs:$src), imm:$c))],
+                         IMOV16rr>;
+
+// Extract v4i16
+def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+  (ins V4I16Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int16Regs:$dst, (vector_extract
+                           (v4i16 V4I16Regs:$src), imm:$c))],
+                         IMOV16rr>;
+
+// Extract v2i8
+def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+  (ins V2I8Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int8Regs:$dst, (vector_extract
+                           (v2i8 V2I8Regs:$src), imm:$c))],
+                         IMOV8rr>;
+
+// Extract v4i8
+def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+  (ins V4I8Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int8Regs:$dst, (vector_extract
+                           (v4i8 V4I8Regs:$src), imm:$c))],
+                         IMOV8rr>;
+
+// Extract v2i32
+def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+  (ins V2I32Regs:$src, i8imm:$c),
+                         "mov.u32 \t$dst, $src${c:vecelem};",
+                         [(set Int32Regs:$dst, (vector_extract
+                           (v2i32 V2I32Regs:$src), imm:$c))],
+                         IMOV32rr>;
+
+// Extract v2f32
+def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+  (ins V2F32Regs:$src, i8imm:$c),
+                         "mov.f32 \t$dst, $src${c:vecelem};",
+                         [(set Float32Regs:$dst, (vector_extract
+                           (v2f32 V2F32Regs:$src), imm:$c))],
+                         FMOV32rr>;
+
+// Extract v2i64
+def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
+  (ins V2I64Regs:$src, i8imm:$c),
+                         "mov.u64 \t$dst, $src${c:vecelem};",
+                         [(set Int64Regs:$dst, (vector_extract
+                           (v2i64 V2I64Regs:$src), imm:$c))],
+                         IMOV64rr>;
+
+// Extract v2f64
+def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
+  (ins V2F64Regs:$src, i8imm:$c),
+                         "mov.f64 \t$dst, $src${c:vecelem};",
+                         [(set Float64Regs:$dst, (vector_extract
+                           (v2f64 V2F64Regs:$src), imm:$c))],
+                         FMOV64rr>;
+
+// Extract v4i32
+def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+  (ins V4I32Regs:$src, i8imm:$c),
+                         "mov.u32 \t$dst, $src${c:vecelem};",
+                         [(set Int32Regs:$dst, (vector_extract
+                           (v4i32 V4I32Regs:$src), imm:$c))],
+                         IMOV32rr>;
+
+// Extract v4f32
+def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+  (ins V4F32Regs:$src, i8imm:$c),
+                         "mov.f32 \t$dst, $src${c:vecelem};",
+                         [(set Float32Regs:$dst, (vector_extract
+                           (v4f32 V4F32Regs:$src), imm:$c))],
+                         FMOV32rr>;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
+// Insert v2i8
+def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
+  (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
+        "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+        "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V2I8Regs:$dst,
+         (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))],
+                         IMOV8rr>;
+
+// Insert v4i8
+def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
+  (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
+                       "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V4I8Regs:$dst,
+         (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))],
+                         IMOV8rr>;
+
+// Insert v2i16
+def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
+  (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c),
+                       "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V2I16Regs:$dst,
+         (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+                         IMOV16rr>;
+
+// Insert v4i16
+def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
+  (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c),
+                       "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V4I16Regs:$dst,
+         (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+                         IMOV16rr>;
+
+// Insert v2i32
+def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
+  (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c),
+                       "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+       [(set V2I32Regs:$dst,
+         (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+                         IMOV32rr>;
+
+// Insert v2f32
+def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
+  (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c),
+                       "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+       [(set V2F32Regs:$dst,
+         (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+                         FMOV32rr>;
+
+// Insert v2i64
+def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
+  (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c),
+                       "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u64 \t$dst${c:vecelem}, $val;",
+       [(set V2I64Regs:$dst,
+         (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+                         IMOV64rr>;
+
+// Insert v2f64
+def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
+  (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c),
+                       "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f64 \t$dst${c:vecelem}, $val;",
+       [(set V2F64Regs:$dst,
+         (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+                         FMOV64rr>;
+
+// Insert v4i32
+def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
+  (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c),
+                       "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+       [(set V4I32Regs:$dst,
+         (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+                         IMOV32rr>;
+
+// Insert v4f32
+def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
+  (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c),
+                       "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+       [(set V4F32Regs:$dst,
+         (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+                         FMOV32rr>;
+}
+
+class BinOpAsmString<string c> {
+  string s = c;
+}
+
+class V4AsmStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>;
+
+class V2AsmStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>;
+
+class V4MADStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>;
+
+class V2MADStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>;
+
+class V4UnaryStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3;")>;
+
+class V2UnaryStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1;")>;
+
+class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b),
+                 asmstr.s,
+                 [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))],
+                 sInst>;
+
+class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1,
+                 NVPTXRegClass regclass2, NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b),
+                 asmstr.s,
+                 [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))],
+                 sInst>;
+
+class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a),
+                 asmstr.s,
+                 [(set regclass:$dst, (OpNode regclass:$a))], sInst>;
+
+multiclass IntBinVOp<string asmstr, SDNode OpNode,
+                     NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst
+                     i16op=NOP, NVPTXInst i8op=NOP> {
+  def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs,
+    i64op>;
+  def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs,
+    i32op>;
+  def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs,
+    i32op>;
+  def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs,
+    i16op>;
+  def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs,
+    i16op>;
+  def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs,
+    i8op>;
+  def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs,
+    i8op>;
+}
+
+multiclass FloatBinVOp<string asmstr, SDNode OpNode,
+                       NVPTXInst f64=NOP, NVPTXInst f32=NOP,
+                       NVPTXInst f32_ftz=NOP> {
+  def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode,
+    V2F64Regs, f64>;
+  def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+    V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+  def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+    V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+  def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+    V4F32Regs, f32>;
+  def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+    V2F32Regs, f32>;
+}
+
+multiclass IntUnaryVOp<string asmstr, PatFrag OpNode,
+                       NVPTXInst i64op=NOP, NVPTXInst i32op=NOP,
+                       NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> {
+  def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode,
+    V2I64Regs, i64op>;
+  def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+    V4I32Regs, i32op>;
+  def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+    V2I32Regs, i32op>;
+  def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V4I16Regs, i16op>;
+  def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V2I16Regs, i16op>;
+  def V4I8  : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V4I8Regs,   i8op>;
+  def V2I8  : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V2I8Regs,   i8op>;
+}
+
+
+// Integer Arithmetic
+let VecInstType=isVecOther.Value in {
+defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>;
+defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>;
+
+def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs,
+  ADDCCi32rr>;
+def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs,
+  ADDCCi32rr>;
+def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs,
+  SUBCCi32rr>;
+def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs,
+  SUBCCi32rr>;
+def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs,
+  ADDCCCi32rr>;
+def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs,
+  ADDCCCi32rr>;
+def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs,
+  SUBCCCi32rr>;
+def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs,
+  SUBCCCi32rr>;
+
+def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs,
+  SHLi64rr>;
+def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs,
+  SHLi32rr>;
+def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs,
+  SHLi32rr>;
+def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs,
+  SHLi16rr>;
+def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs,
+  SHLi16rr>;
+def ShiftLV2I8  : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs,  V2I32Regs,
+  SHLi8rr>;
+def ShiftLV4I8  : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs,  V4I32Regs,
+  SHLi8rr>;
+}
+
+// cvt to v*i32, helpers for shift
+class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>;
+
+class VecCVTStrHelper<string op, string dest, string src> {
+  string s=!strconcat(op, !strconcat("\t",
+           !strconcat(dest, !strconcat(", ", !strconcat(src, ";")))));
+}
+
+class Vec2CVTStr<string op> {
+  string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+           !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s));
+}
+
+class Vec4CVTStr<string op> {
+  string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+           !strconcat("\n\t",
+           !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s,
+           !strconcat("\n\t",
+           !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s,
+           !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s))))));
+}
+
+let VecInstType=isVecOther.Value in {
+def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs,
+  Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs,
+  Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>;
+}
+
+def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+let VecInstType=isVecOther.Value in {
+def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs,
+  SRAi64rr>;
+def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs,
+  SRAi32rr>;
+def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs,
+  SRAi32rr>;
+def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs,
+  SRAi16rr>;
+def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs,
+  SRAi16rr>;
+def ShiftRAV2I8  : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs,  V2I32Regs,
+  SRAi8rr>;
+def ShiftRAV4I8  : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs,  V4I32Regs,
+  SRAi8rr>;
+
+def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs,
+  SRLi64rr>;
+def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs,
+  SRLi32rr>;
+def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs,
+  SRLi32rr>;
+def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs,
+  SRLi16rr>;
+def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs,
+  SRLi16rr>;
+def ShiftRLV2I8  : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs,  V2I32Regs,
+  SRLi8rr>;
+def ShiftRLV4I8  : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs,  V4I32Regs,
+  SRLi8rr>;
+
+defm VMult   : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr,
+  MULTi8rr>;
+defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr,
+  MULTHSi16rr,
+  MULTHSi8rr>;
+defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr,
+  MULTHUi16rr,
+  MULTHUi8rr>;
+defm VSDiv   : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr,
+  SDIVi8rr>;
+defm VUDiv   : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr,
+  UDIVi8rr>;
+defm VSRem   : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr,
+  SREMi8rr>;
+defm VURem   : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr,
+  UREMi8rr>;
+}
+
+def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+multiclass VMAD<string asmstr, NVPTXRegClass regclassv4,
+  NVPTXRegClass regclassv2,
+                SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP,
+                Predicate Pred> {
+  def V4 : NVPTXVecInst<(outs regclassv4:$dst),
+    (ins regclassv4:$a, regclassv4:$b, regclassv4:$c),
+                      V4MADStr<asmstr>.s,
+                      [(set regclassv4:$dst,
+                        (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))],
+                      sop>,
+           Requires<[Pred]>;
+  def V2 : NVPTXVecInst<(outs regclassv2:$dst),
+    (ins regclassv2:$a, regclassv2:$b, regclassv2:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclassv2:$dst,
+                        (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))],
+                      sop>,
+           Requires<[Pred]>;
+}
+
+multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+  Predicate Pred> {
+  def V2 : NVPTXVecInst<(outs regclass:$dst),
+    (ins regclass:$a, regclass:$b, regclass:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclass:$dst, (add
+                        (mul regclass:$a, regclass:$b), regclass:$c))], sop>,
+           Requires<[Pred]>;
+}
+multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+  Predicate Pred> {
+  def V2 : NVPTXVecInst<(outs regclass:$dst),
+    (ins regclass:$a, regclass:$b, regclass:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclass:$dst, (fadd
+                        (fmul regclass:$a, regclass:$b), regclass:$c))], sop>,
+           Requires<[Pred]>;
+}
+
+let VecInstType=isVecOther.Value in {
+defm I8MAD  : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>;
+defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr,
+  true>;
+defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr,
+  true>;
+defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>;
+
+defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>;
+
+defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>;
+defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>;
+defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>;
+
+defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+  FMAD32_ftzrrr, doFMADF32_ftz>;
+defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+  FMA32_ftzrrr, doFMAF32_ftz>;
+defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr,
+  doFMADF32>;
+defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr,
+  doFMAF32>;
+defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>;
+}
+
+let VecInstType=isVecOther.Value in {
+def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs,
+  FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs,
+  FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs,
+  FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs,
+  FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs,
+  FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs,
+  FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>;
+def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>;
+def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>;
+}
+
+def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>;
+
+let VecInstType=isVecOther.Value in {
+def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs,
+  FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs,
+  FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>;
+def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>;
+def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>;
+
+// Logical Arithmetic
+defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>;
+defm VOr  : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>;
+defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>;
+
+defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>;
+}
+
+
+multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)),
+          (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c,  V2F32Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c),
+          (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V2FMAF32ext_ftz  : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>;
+defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>;
+defm V2FMAF32ext  : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>;
+defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>;
+
+multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)),
+          (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c,  V4F32Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c),
+          (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V4FMAF32ext_ftz  : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>;
+defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>;
+defm V4FMAF32ext  : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>;
+defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>;
+
+multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)),
+          (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c),
+          (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>;
+
+class VecModStr<string vecsize, string elem, string extra, string l="">
+{
+  string t1 = !strconcat("${c", elem);
+  string t2 = !strconcat(t1, ":vecv");
+  string t3 = !strconcat(t2, vecsize);
+  string t4 = !strconcat(t3, extra);
+  string t5 = !strconcat(t4, l);
+  string s =  !strconcat(t5, "}");
+}
+class ShuffleOneLine<string vecsize, string elem, string type>
+{
+  string t1 = VecModStr<vecsize, elem, "comm", "1">.s;
+  string t2 = !strconcat(t1, "mov.");
+  string t3 = !strconcat(t2, type);
+  string t4 = !strconcat(t3, " \t${dst}_");
+  string t5 = !strconcat(t4, elem);
+  string t6 = !strconcat(t5, ", $src1");
+  string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s);
+  string t8 = !strconcat(t7, ";\n\t");
+  string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s);
+  string t10 = !strconcat(t9, "mov.");
+  string t11 = !strconcat(t10, type);
+  string t12 = !strconcat(t11, " \t${dst}_");
+  string t13 = !strconcat(t12, elem);
+  string t14 = !strconcat(t13, ", $src2");
+  string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s);
+  string s =   !strconcat(t15, ";");
+}
+class ShuffleAsmStr2<string type>
+{
+  string t1 = ShuffleOneLine<"2", "0", type>.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string s  = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s);
+}
+class ShuffleAsmStr4<string type>
+{
+  string t1 = ShuffleOneLine<"4", "0", type>.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s);
+  string t4 = !strconcat(t3, "\n\t");
+  string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s);
+  string t6 = !strconcat(t5, "\n\t");
+  string s  = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
+}
+
+let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in {
+def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
+                       (ins  V4F32Regs:$src1, V4F32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"f32">.s),
+                       [], FMOV32rr>;
+
+def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst),
+                       (ins  V4I32Regs:$src1, V4I32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u32">.s),
+                       [], IMOV32rr>;
+
+def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst),
+                       (ins  V4I16Regs:$src1, V4I16Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u16">.s),
+                       [], IMOV16rr>;
+
+def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst),
+                       (ins  V4I8Regs:$src1, V4I8Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u16">.s),
+                       [], IMOV8rr>;
+
+def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst),
+                       (ins  V2F32Regs:$src1, V2F32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"f32">.s),
+                       [], FMOV32rr>;
+
+def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst),
+                       (ins  V2I32Regs:$src1, V2I32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u32">.s),
+                       [], IMOV32rr>;
+
+def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst),
+                       (ins  V2I8Regs:$src1, V2I8Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u16">.s),
+                       [], IMOV8rr>;
+
+def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst),
+                       (ins  V2I16Regs:$src1, V2I16Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u16">.s),
+                       [], IMOV16rr>;
+
+def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst),
+                       (ins  V2F64Regs:$src1, V2F64Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"f64">.s),
+                       [], FMOV64rr>;
+
+def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst),
+                       (ins  V2I64Regs:$src1, V2I64Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u64">.s),
+                       [], IMOV64rr>;
+}
+
+def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(0), MVT::i32);
+}]>;
+def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(1), MVT::i32);
+}]>;
+def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(2), MVT::i32);
+}]>;
+def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(3), MVT::i32);
+}]>;
+
+// The spurious call is here to silence a compiler warning about N being
+// unused.
+def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs),
+                       [{ N->getGluedNode(); return true; }]>;
+
+def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)),
+          (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)),
+          (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)),
+          (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)),
+          (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)),
+          (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)),
+          (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)),
+          (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)),
+          (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)),
+          (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)),
+          (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+  NVPTXInst si>
+                   : NVPTXVecInst<(outs vclass:$dst),
+                   (ins  sclass:$a1, sclass:$a2),
+                   !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"),
+                   [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))],
+                   si>;
+class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+  NVPTXInst si>
+                   : NVPTXVecInst<(outs vclass:$dst),
+                   (ins  sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4),
+               !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"),
+                   [(set vclass:$dst,
+                     (build_vector sclass:$a1, sclass:$a2,
+                       sclass:$a3, sclass:$a4))], si>;
+
+let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in {
+def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs,
+  FMOV32rr>;
+def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs,
+  FMOV64rr>;
+
+def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs,
+  IMOV32rr>;
+def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs,
+  IMOV64rr>;
+def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs,
+  IMOV16rr>;
+def Build_Vector2_i8  : Build_Vector2<"mov.v2.u16",  V2I8Regs,  Int8Regs,
+  IMOV8rr>;
+
+def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs,
+  FMOV32rr>;
+
+def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs,
+  IMOV32rr>;
+def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs,
+  IMOV16rr>;
+def Build_Vector4_i8  : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs,
+  IMOV8rr>;
+}
+
+class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
+                 : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src),
+                   !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
+                   [], sop>;
+
+let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1,
+  VecInstType=isVecOther.Value in {
+def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
+def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
+
+def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>;
+def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>;
+
+def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>;
+def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>;
+
+def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>;
+def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>;
+
+def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>;
+def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>;
+}
+
+// extract subvector patterns
+def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
+                        SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>;
+
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)),
+                 (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0),
+                                    (V4f32Extract V4F32Regs:$src, 1))>;
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)),
+                 (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2),
+                                    (V4f32Extract V4F32Regs:$src, 3))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)),
+                 (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0),
+                                    (V4i32Extract V4I32Regs:$src, 1))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)),
+                 (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2),
+                                    (V4i32Extract V4I32Regs:$src, 3))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)),
+                 (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0),
+                                    (V4i16Extract V4I16Regs:$src, 1))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)),
+                 (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2),
+                                    (V4i16Extract V4I16Regs:$src, 3))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)),
+                 (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0),
+                                    (V4i8Extract V4I8Regs:$src, 1))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)),
+                 (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2),
+                                    (V4i8Extract V4I8Regs:$src, 3))>;
+
+// Select instructions
+class Select_OneLine<string type, string pos> {
+  string t1 = !strconcat("selp.", type);
+  string t2 = !strconcat(t1, " \t${dst}_");
+  string t3 = !strconcat(t2, pos);
+  string t4 = !strconcat(t3, ", ${src1}_");
+  string t5 = !strconcat(t4, pos);
+  string t6 = !strconcat(t5, ", ${src2}_");
+  string t7 = !strconcat(t6, pos);
+  string s  = !strconcat(t7, ", $p;");
+}
+
+class Select_Str2<string type> {
+  string t1 = Select_OneLine<type, "0">.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string s  = !strconcat(t2, Select_OneLine<type, "1">.s);
+}
+
+class Select_Str4<string type> {
+  string t1 = Select_OneLine<type, "0">.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string t3 = !strconcat(t2, Select_OneLine<type, "1">.s);
+  string t4 = !strconcat(t3, "\n\t");
+  string t5 = !strconcat(t4, Select_OneLine<type, "2">.s);
+  string t6 = !strconcat(t5, "\n\t");
+  string s  = !strconcat(t6, Select_OneLine<type, "3">.s);
+
+}
+
+class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop>
+      : NVPTXVecInst<(outs vclass:$dst),
+                     (ins  vclass:$src1, vclass:$src2, Int1Regs:$p),
+                     asmstr,
+                     [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1,
+                       vclass:$src2))],
+                     sop>;
+
+let VecInstType=isVecOther.Value in {
+def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>;
+def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>;
+def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>;
+def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>;
+def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>;
+def V4I8_Select  : Vec_Select<V4I8Regs,  Select_Str4<"b16">.s, SELECTi8rr>;
+def V2I8_Select  : Vec_Select<V2I8Regs,  Select_Str2<"b16">.s, SELECTi8rr>;
+
+def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>;
+def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>;
+def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>;
+}
+
+// Comparison instructions
+
+// setcc convenience fragments.
+def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def vsetogt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOGT)>;
+def vsetoge : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOGE)>;
+def vsetolt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOLT)>;
+def vsetole : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOLE)>;
+def vsetone : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETONE)>;
+def vseto   : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETO)>;
+def vsetuo  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUO)>;
+def vsetueq : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def vsetugt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUGT)>;
+def vsetuge : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUGE)>;
+def vsetult : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETULT)>;
+def vsetule : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETULE)>;
+def vsetune : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUNE)>;
+def vseteq  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETEQ)>;
+def vsetgt  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETGT)>;
+def vsetge  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETGE)>;
+def vsetlt  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETLT)>;
+def vsetle  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETLE)>;
+def vsetne  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETNE)>;
+
+class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass,
+  NVPTXInst sop>
+    : NVPTXVecInst<(outs outrclass:$dst),
+                   (ins  inrclass:$a, inrclass:$b),
+                   "Unsupported",
+                   [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))],
+                   sop>;
+
+multiclass Vec_Compare_All<PatFrag op,
+                           NVPTXInst inst8,
+                           NVPTXInst inst16,
+                           NVPTXInst inst32,
+                           NVPTXInst inst64>
+{
+  def  V2I8 : Vec_Compare<op, V2I8Regs,  V2I8Regs,  inst8>;
+  def  V4I8 : Vec_Compare<op, V4I8Regs,  V4I8Regs,  inst8>;
+  def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>;
+  def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>;
+  def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>;
+  def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>;
+  def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>;
+}
+
+let VecInstType=isVecOther.Value in {
+  defm VecSGT : Vec_Compare_All<vsetgt,  ISetSGTi8rr_toi8, ISetSGTi16rr_toi16,
+    ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>;
+  defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16,
+    ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>;
+  defm VecSLT : Vec_Compare_All<vsetlt,  ISetSLTi8rr_toi8, ISetSLTi16rr_toi16,
+    ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>;
+  defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16,
+    ISetULTi32rr_toi32, ISetULTi64rr_toi64>;
+  defm VecSGE : Vec_Compare_All<vsetge,  ISetSGEi8rr_toi8, ISetSGEi16rr_toi16,
+    ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>;
+  defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16,
+    ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>;
+  defm VecSLE : Vec_Compare_All<vsetle,  ISetSLEi8rr_toi8, ISetSLEi16rr_toi16,
+    ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>;
+  defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16,
+    ISetULEi32rr_toi32, ISetULEi64rr_toi64>;
+  defm VecSEQ : Vec_Compare_All<vseteq,  ISetSEQi8rr_toi8, ISetSEQi16rr_toi16,
+    ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>;
+  defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16,
+    ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>;
+  defm VecSNE : Vec_Compare_All<vsetne,  ISetSNEi8rr_toi8, ISetSNEi16rr_toi16,
+    ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>;
+  defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16,
+    ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>;
+}
+
+multiclass FVec_Compare_All<PatFrag op,
+                            NVPTXInst instf32,
+                            NVPTXInst instf64>
+{
+  def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>;
+  def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>;
+  def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>;
+}
+
+let VecInstType=isVecOther.Value in {
+  defm FVecGT :  FVec_Compare_All<vsetogt, FSetGTf32rr_toi32,
+    FSetGTf64rr_toi64>;
+  defm FVecLT :  FVec_Compare_All<vsetolt, FSetLTf32rr_toi32,
+    FSetLTf64rr_toi64>;
+  defm FVecGE :  FVec_Compare_All<vsetoge, FSetGEf32rr_toi32,
+    FSetGEf64rr_toi64>;
+  defm FVecLE :  FVec_Compare_All<vsetole, FSetLEf32rr_toi32,
+    FSetLEf64rr_toi64>;
+  defm FVecEQ :  FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32,
+    FSetEQf64rr_toi64>;
+  defm FVecNE :  FVec_Compare_All<vsetone, FSetNEf32rr_toi32,
+    FSetNEf64rr_toi64>;
+
+  defm FVecUGT :  FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32,
+    FSetUGTf64rr_toi64>;
+  defm FVecULT :  FVec_Compare_All<vsetult, FSetULTf32rr_toi32,
+    FSetULTf64rr_toi64>;
+  defm FVecUGE :  FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32,
+    FSetUGEf64rr_toi64>;
+  defm FVecULE :  FVec_Compare_All<vsetule, FSetULEf32rr_toi32,
+    FSetULEf64rr_toi64>;
+  defm FVecUEQ :  FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32,
+    FSetUEQf64rr_toi64>;
+  defm FVecUNE :  FVec_Compare_All<vsetune, FSetUNEf32rr_toi32,
+    FSetUNEf64rr_toi64>;
+
+  defm FVecNUM :  FVec_Compare_All<vseto,  FSetNUMf32rr_toi32,
+    FSetNUMf64rr_toi64>;
+  defm FVecNAN :  FVec_Compare_All<vsetuo, FSetNANf32rr_toi32,
+    FSetNANf64rr_toi64>;
+}
+
+class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4),
+                (ins i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                  "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>;
+
+class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$d1, regclass:$d2),
+                (ins i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                  "\t{{$d1, $d2}}, [retval0+$b];"), []>;
+
+
+class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+                  i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[param$a+$b], {{$s1, $s2}};"), []>;
+
+class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+                  i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[func_retval+$a], {{$s1, $s2}};"), []>;
+
+def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
+def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
+def LoadParamScalar4I8  : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def LoadParamScalar2I64 : LoadParamScalar2Inst<Int32Regs, ".v2.b64">;
+def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
+def LoadParamScalar2I16 : LoadParamScalar2Inst<Int32Regs, ".v2.b16">;
+def LoadParamScalar2I8  : LoadParamScalar2Inst<Int32Regs, ".v2.b8">;
+
+def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
+def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
+def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreParamScalar4I8  : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreParamScalar2I8  : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreRetvalScalar4I8  : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreRetvalScalar2I8  : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
+
+class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
+      NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
+                "loadparam : $dst <- [$a, $b]",
+                [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
+                sop>;
+
+class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
+      : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                "storeparam : [$a, $b] <- $val",
+                [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
+
+class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
+  NVPTXInst sop=NOP>
+      : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
+                "storeretval : retval[$a] <- $val",
+                [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
+
+let VecInstType=isVecLD.Value in {
+def LoadParamV4I32  : LoadParamVecInst<V4I32Regs, ".v4.b32",
+  LoadParamScalar4I32>;
+def LoadParamV4I16  : LoadParamVecInst<V4I16Regs, ".v4.b16",
+  LoadParamScalar4I16>;
+def LoadParamV4I8   : LoadParamVecInst<V4I8Regs, ".v4.b8",
+  LoadParamScalar4I8>;
+
+def LoadParamV2I64  : LoadParamVecInst<V2I64Regs, ".v2.b64",
+  LoadParamScalar2I64>;
+def LoadParamV2I32  : LoadParamVecInst<V2I32Regs, ".v2.b32",
+  LoadParamScalar2I32>;
+def LoadParamV2I16  : LoadParamVecInst<V2I16Regs, ".v2.b16",
+  LoadParamScalar2I16>;
+def LoadParamV2I8   : LoadParamVecInst<V2I8Regs, ".v2.b8",
+  LoadParamScalar2I8>;
+
+def LoadParamV4F32  : LoadParamVecInst<V4F32Regs, ".v4.f32",
+  LoadParamScalar4F32>;
+def LoadParamV2F32  : LoadParamVecInst<V2F32Regs, ".v2.f32",
+  LoadParamScalar2F32>;
+def LoadParamV2F64  : LoadParamVecInst<V2F64Regs, ".v2.f64",
+  LoadParamScalar2F64>;
+}
+
+let VecInstType=isVecST.Value in {
+def StoreParamV4I32  : StoreParamVecInst<V4I32Regs, ".v4.b32",
+  StoreParamScalar4I32>;
+def StoreParamV4I16  : StoreParamVecInst<V4I16Regs, ".v4.b16",
+  StoreParamScalar4I16>;
+def StoreParamV4I8   : StoreParamVecInst<V4I8Regs, ".v4.b8",
+  StoreParamScalar4I8>;
+
+def StoreParamV2I64  : StoreParamVecInst<V2I64Regs, ".v2.b64",
+  StoreParamScalar2I64>;
+def StoreParamV2I32  : StoreParamVecInst<V2I32Regs, ".v2.b32",
+  StoreParamScalar2I32>;
+def StoreParamV2I16  : StoreParamVecInst<V2I16Regs, ".v2.b16",
+  StoreParamScalar2I16>;
+def StoreParamV2I8   : StoreParamVecInst<V2I8Regs, ".v2.b8",
+  StoreParamScalar2I8>;
+
+def StoreParamV4F32  : StoreParamVecInst<V4F32Regs, ".v4.f32",
+  StoreParamScalar4F32>;
+def StoreParamV2F32  : StoreParamVecInst<V2F32Regs, ".v2.f32",
+  StoreParamScalar2F32>;
+def StoreParamV2F64  : StoreParamVecInst<V2F64Regs, ".v2.f64",
+  StoreParamScalar2F64>;
+
+def StoreRetvalV4I32  : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
+  StoreRetvalScalar4I32>;
+def StoreRetvalV4I16  : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
+  StoreRetvalScalar4I16>;
+def StoreRetvalV4I8   : StoreRetvalVecInst<V4I8Regs,  ".v4.b8",
+  StoreRetvalScalar4I8>;
+
+def StoreRetvalV2I64  : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
+  StoreRetvalScalar2I64>;
+def StoreRetvalV2I32  : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
+  StoreRetvalScalar2I32>;
+def StoreRetvalV2I16  : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
+  StoreRetvalScalar2I16>;
+def StoreRetvalV2I8   : StoreRetvalVecInst<V2I8Regs,  ".v2.b8",
+  StoreRetvalScalar2I8>;
+
+def StoreRetvalV4F32  : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
+  StoreRetvalScalar4F32>;
+def StoreRetvalV2F32  : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
+  StoreRetvalScalar2F32>;
+def StoreRetvalV2F64  : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
+  StoreRetvalScalar2F64>;
+
+}
+
+
+// Int vector to int scalar bit convert
+// v4i8 -> i32
+def : Pat<(i32 (bitconvert V4I8Regs:$s)),
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                     (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
+// v4i16 -> i64
+def : Pat<(i64 (bitconvert V4I16Regs:$s)),
+          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+            (V4i16Extract V4I16Regs:$s,1),
+                     (V4i16Extract V4I16Regs:$s,2),
+                     (V4i16Extract V4I16Regs:$s,3))>;
+// v2i8 -> i16
+def : Pat<(i16 (bitconvert V2I8Regs:$s)),
+          (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
+// v2i16 -> i32
+def : Pat<(i32 (bitconvert V2I16Regs:$s)),
+          (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
+            (V2i16Extract V2I16Regs:$s,1))>;
+// v2i32 -> i64
+def : Pat<(i64 (bitconvert V2I32Regs:$s)),
+          (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
+            (V2i32Extract V2I32Regs:$s,1))>;
+
+// Int scalar to int vector bit convert
+let VecInstType=isVecDest.Value in {
+// i32 -> v4i8
+def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
+                                "Error!",
+                                [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
+                                I32toV4I8>;
+// i64 -> v4i16
+def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
+                                 "Error!",
+                                [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
+                                 I64toV4I16>;
+// i16 -> v2i8
+def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
+                                "Error!",
+                               [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
+                                I16toV2I8>;
+// i32 -> v2i16
+def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
+                                 "Error!",
+                                [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
+                                 I32toV2I16>;
+// i64 -> v2i32
+def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
+                                  "Error!",
+                                [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
+                                  I64toV2I32>;
+}
+
+// Int vector to int vector bit convert
+// v4i8 -> v2i16
+def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
+          (VecI32toV2I16
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                    (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> v2i32
+def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
+          (VecI64toV2I32
+       (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+                (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> v4i8
+def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
+          (VecI32toV4I8
+    (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> v4i16
+def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
+          (VecI64toV4I16
+    (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+// v2i64 -> v4i32
+def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
+          (Build_Vector4_i32
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
+// v4i32 -> v2i64
+def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
+          (Build_Vector2_i64
+      (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
+    (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
+
+// Fp scalar to fp vector convert
+// f64 -> v2f32
+let VecInstType=isVecDest.Value in {
+def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
+                                  "Error!",
+                              [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
+                                  F64toV2F32>;
+}
+
+// Fp vector to fp scalar convert
+// v2f32 -> f64
+def : Pat<(f64 (bitconvert V2F32Regs:$s)),
+     (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
+
+// Fp scalar to int vector convert
+// f32 -> v4i8
+def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
+          (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f32 -> v2i16
+def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
+          (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f64 -> v4i16
+def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
+          (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
+// f64 -> v2i32
+def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
+          (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
+
+// Int vector to fp scalar convert
+// v4i8 -> f32
+def : Pat<(f32 (bitconvert V4I8Regs:$s)),
+          (BITCONVERT_32_I2F
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                    (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> f64
+def : Pat<(f64 (bitconvert V4I16Regs:$s)),
+          (BITCONVERT_64_I2F
+       (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+                (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> f32
+def : Pat<(f32 (bitconvert V2I16Regs:$s)),
+          (BITCONVERT_32_I2F
+    (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> f64
+def : Pat<(f64 (bitconvert V2I32Regs:$s)),
+          (BITCONVERT_64_I2F
+    (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+
+// Int scalar to fp vector convert
+// i64 -> v2f32
+def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
+          (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
+
+// Fp vector to int scalar convert
+// v2f32 -> i64
+def : Pat<(i64 (bitconvert V2F32Regs:$s)),
+          (BITCONVERT_64_F2I
+    (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
+
+// Int vector to fp vector convert
+// v2i64 -> v4f32
+def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
+          (Build_Vector4_f32
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 0)), 0)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 0)), 1)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 1)), 0)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
+// v2i64 -> v2f64
+def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
+    (Build_Vector2_f64
+            (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
+            (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
+// v2i32 -> v2f32
+def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
+    (Build_Vector2_f32
+            (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
+            (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
+// v4i32 -> v2f64
+def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
+          (Build_Vector2_f64
+           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
+             (V4i32Extract V4I32Regs:$s,1))),
+           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
+             (V4i32Extract V4I32Regs:$s,3))))>;
+// v4i32 -> v4f32
+def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
+    (Build_Vector4_f32
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
+// v4i16 -> v2f32
+def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
+          (VecF64toV2F32 (BITCONVERT_64_I2F
+          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+            (V4i16Extract V4I16Regs:$s,1),
+                      (V4i16Extract V4I16Regs:$s,2),
+                      (V4i16Extract V4I16Regs:$s,3))))>;
+
+// Fp vector to int vector convert
+// v2i64 <- v4f32
+def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
+          (Build_Vector2_i64
+           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
+             (V4f32Extract V4F32Regs:$s,1))),
+           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
+             (V4f32Extract V4F32Regs:$s,3))))>;
+// v2i64 <- v2f64
+def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
+    (Build_Vector2_i64
+            (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
+            (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
+// v2i32 <- v2f32
+def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
+    (Build_Vector2_i32
+            (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
+            (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
+// v4i32 <- v2f64
+def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
+          (Build_Vector4_i32
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 0)), 0)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 0)), 1)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 1)), 0)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
+// v4i32 <- v4f32
+def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
+          (Build_Vector4_i32
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
+// v4i16 <- v2f32
+def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
+          (VecI64toV4I16 (BITCONVERT_64_F2I
+          (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
+            (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp
new file mode 100644
index 0000000..5f074b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp
@@ -0,0 +1,90 @@
+//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXutil.h"
+#include "NVPTX.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+bool isParamLoad(const MachineInstr *MI) {
+  if ((MI->getOpcode() != NVPTX::LD_i32_avar) &&
+      (MI->getOpcode() != NVPTX::LD_i64_avar))
+    return false;
+  if (MI->getOperand(2).isImm() == false)
+    return false;
+  if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM)
+    return false;
+  return true;
+}
+
+#define DATA_MASK 0x7f
+#define DIGIT_WIDTH 7
+#define MORE_BYTES 0x80
+
+static int encode_leb128(uint64_t val, int *nbytes, char *space, int splen) {
+  char *a;
+  char *end = space + splen;
+
+  a = space;
+  do {
+    unsigned char uc;
+
+    if (a >= end)
+      return 1;
+    uc = val & DATA_MASK;
+    val >>= DIGIT_WIDTH;
+    if (val != 0)
+      uc |= MORE_BYTES;
+    *a = uc;
+    a++;
+  } while (val);
+  *nbytes = a - space;
+  return 0;
+}
+
+#undef DATA_MASK
+#undef DIGIT_WIDTH
+#undef MORE_BYTES
+
+uint64_t encode_leb128(const char *str) {
+  union {
+    uint64_t x;
+    char a[8];
+  } temp64;
+
+  temp64.x = 0;
+
+  for (unsigned i = 0, e = strlen(str); i != e; ++i)
+    temp64.a[i] = str[e - 1 - i];
+
+  char encoded[16];
+  int nbytes;
+
+  int retval = encode_leb128(temp64.x, &nbytes, encoded, 16);
+
+  (void) retval;
+  assert(retval == 0 && "Encoding to leb128 failed");
+
+  assert(nbytes <= 8 &&
+         "Cannot support register names with leb128 encoding > 8 bytes");
+
+  temp64.x = 0;
+  for (int i = 0; i < nbytes; ++i)
+    temp64.a[i] = encoded[i];
+
+  return temp64.x;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h
new file mode 100644
index 0000000..d1d1171
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h
@@ -0,0 +1,25 @@
+//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_UTIL_H
+#define LLVM_TARGET_NVPTX_UTIL_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+bool isParamLoad(const MachineInstr *);
+uint64_t encode_leb128(const char *str);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
new file mode 100644
index 0000000..a8d6b95
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -0,0 +1,222 @@
+//===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces occurrences of __nvvm_reflect("string") with an
+// integer based on -nvvm-reflect-list string=<int> option given to this pass.
+// If an undefined string value is seen in a call to __nvvm_reflect("string"),
+// a default value of 0 will be used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reflect"
+
+namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
+
+namespace {
+class NVVMReflect : public ModulePass {
+private:
+  StringMap<int> VarMap;
+  typedef DenseMap<std::string, int>::iterator VarMapIter;
+
+public:
+  static char ID;
+  NVVMReflect() : ModulePass(ID) {
+    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+    VarMap.clear();
+  }
+
+  NVVMReflect(const StringMap<int> &Mapping)
+  : ModulePass(ID) {
+    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+    for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
+         I != E; ++I) {
+      VarMap[(*I).getKey()] = (*I).getValue();
+    }
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+  bool runOnModule(Module &) override;
+
+private:
+  bool handleFunction(Function *ReflectFunction);
+  void setVarMap();
+};
+}
+
+ModulePass *llvm::createNVVMReflectPass() {
+  return new NVVMReflect();
+}
+
+ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+  return new NVVMReflect(Mapping);
+}
+
+static cl::opt<bool>
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
+                   cl::desc("NVVM reflection, enabled by default"));
+
+char NVVMReflect::ID = 0;
+INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
+                "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
+                false)
+
+static cl::list<std::string>
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
+            cl::desc("A list of string=num assignments"),
+            cl::ValueRequired);
+
+/// The command line can look as follows :
+/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
+/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
+/// ReflectList vector. First, each of ReflectList[i] is 'split'
+/// using "," as the delimiter. Then each of this part is split
+/// using "=" as the delimiter.
+void NVVMReflect::setVarMap() {
+  for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
+    DEBUG(dbgs() << "Option : "  << ReflectList[i] << "\n");
+    SmallVector<StringRef, 4> NameValList;
+    StringRef(ReflectList[i]).split(NameValList, ",");
+    for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
+      SmallVector<StringRef, 2> NameValPair;
+      NameValList[j].split(NameValPair, "=");
+      assert(NameValPair.size() == 2 && "name=val expected");
+      std::stringstream ValStream(NameValPair[1]);
+      int Val;
+      ValStream >> Val;
+      assert((!(ValStream.fail())) && "integer value expected");
+      VarMap[NameValPair[0]] = Val;
+    }
+  }
+}
+
+bool NVVMReflect::handleFunction(Function *ReflectFunction) {
+  // Validate _reflect function
+  assert(ReflectFunction->isDeclaration() &&
+         "_reflect function should not have a body");
+  assert(ReflectFunction->getReturnType()->isIntegerTy() &&
+         "_reflect's return type should be integer");
+
+  std::vector<Instruction *> ToRemove;
+
+  // Go through the uses of ReflectFunction in this Function.
+  // Each of them should a CallInst with a ConstantArray argument.
+  // First validate that. If the c-string corresponding to the
+  // ConstantArray can be found successfully, see if it can be
+  // found in VarMap. If so, replace the uses of CallInst with the
+  // value found in VarMap. If not, replace the use  with value 0.
+  for (User *U : ReflectFunction->users()) {
+    assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
+    CallInst *Reflect = cast<CallInst>(U);
+
+    assert((Reflect->getNumOperands() == 2) &&
+           "Only one operand expect for _reflect function");
+    // In cuda, we will have an extra constant-to-generic conversion of
+    // the string.
+    const Value *Str = Reflect->getArgOperand(0);
+    if (isa<CallInst>(Str)) {
+      // CUDA path
+      const CallInst *ConvCall = cast<CallInst>(Str);
+      Str = ConvCall->getArgOperand(0);
+    }
+    assert(isa<ConstantExpr>(Str) &&
+           "Format of _reflect function not recognized");
+    const ConstantExpr *GEP = cast<ConstantExpr>(Str);
+
+    const Value *Sym = GEP->getOperand(0);
+    assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
+
+    const Constant *SymStr = cast<Constant>(Sym);
+
+    assert(isa<ConstantDataSequential>(SymStr->getOperand(0)) &&
+           "Format of _reflect function not recognized");
+
+    assert(cast<ConstantDataSequential>(SymStr->getOperand(0))->isCString() &&
+           "Format of _reflect function not recognized");
+
+    std::string ReflectArg =
+        cast<ConstantDataSequential>(SymStr->getOperand(0))->getAsString();
+
+    ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
+    DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
+
+    int ReflectVal = 0; // The default value is 0
+    if (VarMap.find(ReflectArg) != VarMap.end()) {
+      ReflectVal = VarMap[ReflectArg];
+    }
+    Reflect->replaceAllUsesWith(
+        ConstantInt::get(Reflect->getType(), ReflectVal));
+    ToRemove.push_back(Reflect);
+  }
+  if (ToRemove.size() == 0)
+    return false;
+
+  for (unsigned i = 0, e = ToRemove.size(); i != e; ++i)
+    ToRemove[i]->eraseFromParent();
+  return true;
+}
+
+bool NVVMReflect::runOnModule(Module &M) {
+  if (!NVVMReflectEnabled)
+    return false;
+
+  setVarMap();
+
+
+  bool Res = false;
+  std::string Name;
+  Type *Tys[1];
+  Type *I8Ty = Type::getInt8Ty(M.getContext());
+  Function *ReflectFunction;
+
+  // Check for standard overloaded versions of llvm.nvvm.reflect
+
+  for (unsigned i = 0; i != 5; ++i) {
+    Tys[0] = PointerType::get(I8Ty, i);
+    Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
+    ReflectFunction = M.getFunction(Name);
+    if(ReflectFunction != 0) {
+      Res |= handleFunction(ReflectFunction);
+    }
+  }
+
+  ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
+  // If reflect function is not used, then there will be
+  // no entry in the module.
+  if (ReflectFunction != 0)
+    Res |= handleFunction(ReflectFunction);
+
+  return Res;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
new file mode 100644
index 0000000..cc7d4dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheNVPTXTarget32;
+Target llvm::TheNVPTXTarget64;
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+  RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
+                                  "NVIDIA PTX 32-bit");
+  RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
+                                    "NVIDIA PTX 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
new file mode 100644
index 0000000..02c5a94
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
@@ -0,0 +1,122 @@
+#ifndef CL_COMMON_DEFINES_H
+#define CL_COMMON_DEFINES_H
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+
+//
+// Common defines for Image intrinsics
+// Channel order
+enum {
+  CLK_R = 0x10B0,
+  CLK_A = 0x10B1,
+  CLK_RG = 0x10B2,
+  CLK_RA = 0x10B3,
+  CLK_RGB = 0x10B4,
+  CLK_RGBA = 0x10B5,
+  CLK_BGRA = 0x10B6,
+  CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+  CLK_xRGB = 0x10B7,
+#endif
+
+  CLK_INTENSITY = 0x10B8,
+  CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+      ,
+  CLK_Rx = 0x10BA,
+  CLK_RGx = 0x10BB,
+  CLK_RGBx = 0x10BC
+#endif
+};
+
+typedef enum clk_channel_type {
+  // valid formats for float return types
+  CLK_SNORM_INT8 = 0x10D0,  // four channel RGBA unorm8
+  CLK_SNORM_INT16 = 0x10D1, // four channel RGBA unorm16
+  CLK_UNORM_INT8 = 0x10D2,  // four channel RGBA unorm8
+  CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+  CLK_HALF_FLOAT = 0x10DD,  // four channel RGBA half
+  CLK_FLOAT = 0x10DE,       // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_UNORM_SHORT_565 = 0x10D4,
+  CLK_UNORM_SHORT_555 = 0x10D5,
+  CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+  // valid only for integer return types
+  CLK_SIGNED_INT8 = 0x10D7,
+  CLK_SIGNED_INT16 = 0x10D8,
+  CLK_SIGNED_INT32 = 0x10D9,
+  CLK_UNSIGNED_INT8 = 0x10DA,
+  CLK_UNSIGNED_INT16 = 0x10DB,
+  CLK_UNSIGNED_INT32 = 0x10DC,
+
+  // CI SPI for CPU
+  __CLK_UNORM_INT8888,  // four channel ARGB unorm8
+  __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+  __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+                                        // represent any image type
+  __CLK_VALID_IMAGE_TYPE_MASK = (1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS) - 1
+} clk_channel_type;
+
+typedef enum clk_sampler_type {
+  __CLK_ADDRESS_BASE = 0,
+  CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+  __CLK_ADDRESS_MASK =
+      CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP_TO_EDGE |
+      CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+  __CLK_ADDRESS_BITS = 3, // number of bits required to
+                          // represent address info
+
+  __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
+  CLK_NORMALIZED_COORDS_FALSE = 0,
+  CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
+  __CLK_NORMALIZED_MASK =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_NORMALIZED_COORDS_TRUE,
+  __CLK_NORMALIZED_BITS = 1, // number of bits required to
+                             // represent normalization
+
+  __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS,
+  CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
+  CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
+  CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
+  __CLK_FILTER_MASK =
+      CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | CLK_FILTER_ANISOTROPIC,
+  __CLK_FILTER_BITS = 2, // number of bits required to
+                         // represent address info
+
+  __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
+  CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
+  CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
+  CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
+  __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | CLK_MIP_ANISOTROPIC,
+  __CLK_MIP_BITS = 2,
+
+  __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
+  __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
+                       __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+
+  __CLK_ANISOTROPIC_RATIO_BITS = 5,
+  __CLK_ANISOTROPIC_RATIO_MASK =
+      (int) 0x80000000 >> (__CLK_ANISOTROPIC_RATIO_BITS - 1)
+} clk_sampler_type;
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif // CL_COMMON_DEFINES_H
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
new file mode 100644
index 0000000..d7066d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -0,0 +1,1658 @@
+//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+static unsigned RRegs[32] = {
+  PPC::R0,  PPC::R1,  PPC::R2,  PPC::R3,
+  PPC::R4,  PPC::R5,  PPC::R6,  PPC::R7,
+  PPC::R8,  PPC::R9,  PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static unsigned RRegsNoR0[32] = {
+  PPC::ZERO,
+            PPC::R1,  PPC::R2,  PPC::R3,
+  PPC::R4,  PPC::R5,  PPC::R6,  PPC::R7,
+  PPC::R8,  PPC::R9,  PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static unsigned XRegs[32] = {
+  PPC::X0,  PPC::X1,  PPC::X2,  PPC::X3,
+  PPC::X4,  PPC::X5,  PPC::X6,  PPC::X7,
+  PPC::X8,  PPC::X9,  PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static unsigned XRegsNoX0[32] = {
+  PPC::ZERO8,
+            PPC::X1,  PPC::X2,  PPC::X3,
+  PPC::X4,  PPC::X5,  PPC::X6,  PPC::X7,
+  PPC::X8,  PPC::X9,  PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static unsigned FRegs[32] = {
+  PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
+  PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
+  PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+static unsigned VRegs[32] = {
+  PPC::V0,  PPC::V1,  PPC::V2,  PPC::V3,
+  PPC::V4,  PPC::V5,  PPC::V6,  PPC::V7,
+  PPC::V8,  PPC::V9,  PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static unsigned VSRegs[64] = {
+  PPC::VSL0,  PPC::VSL1,  PPC::VSL2,  PPC::VSL3,
+  PPC::VSL4,  PPC::VSL5,  PPC::VSL6,  PPC::VSL7,
+  PPC::VSL8,  PPC::VSL9,  PPC::VSL10, PPC::VSL11,
+  PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+  PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+  PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+  PPC::VSH0,  PPC::VSH1,  PPC::VSH2,  PPC::VSH3,
+  PPC::VSH4,  PPC::VSH5,  PPC::VSH6,  PPC::VSH7,
+  PPC::VSH8,  PPC::VSH9,  PPC::VSH10, PPC::VSH11,
+  PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
+  PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
+  PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
+  PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
+  PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+};
+static unsigned VSFRegs[64] = {
+  PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
+  PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
+  PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0,  PPC::VF1,  PPC::VF2,  PPC::VF3,
+  PPC::VF4,  PPC::VF5,  PPC::VF6,  PPC::VF7,
+  PPC::VF8,  PPC::VF9,  PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static unsigned CRBITRegs[32] = {
+  PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+  PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+  PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+  PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+  PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+  PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+  PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+  PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+static unsigned CRRegs[8] = {
+  PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+  PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+// Evaluate an expression containing condition register
+// or condition register field symbols.  Returns positive
+// value on success, or -1 on error.
+static int64_t
+EvaluateCRExpr(const MCExpr *E) {
+  switch (E->getKind()) {
+  case MCExpr::Target:
+    return -1;
+
+  case MCExpr::Constant: {
+    int64_t Res = cast<MCConstantExpr>(E)->getValue();
+    return Res < 0 ? -1 : Res;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    StringRef Name = SRE->getSymbol().getName();
+
+    if (Name == "lt") return 0;
+    if (Name == "gt") return 1;
+    if (Name == "eq") return 2;
+    if (Name == "so") return 3;
+    if (Name == "un") return 3;
+
+    if (Name == "cr0") return 0;
+    if (Name == "cr1") return 1;
+    if (Name == "cr2") return 2;
+    if (Name == "cr3") return 3;
+    if (Name == "cr4") return 4;
+    if (Name == "cr5") return 5;
+    if (Name == "cr6") return 6;
+    if (Name == "cr7") return 7;
+
+    return -1;
+  }
+
+  case MCExpr::Unary:
+    return -1;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    int64_t LHSVal = EvaluateCRExpr(BE->getLHS());
+    int64_t RHSVal = EvaluateCRExpr(BE->getRHS());
+    int64_t Res;
+
+    if (LHSVal < 0 || RHSVal < 0)
+      return -1;
+
+    switch (BE->getOpcode()) {
+    default: return -1;
+    case MCBinaryExpr::Add: Res = LHSVal + RHSVal; break;
+    case MCBinaryExpr::Mul: Res = LHSVal * RHSVal; break;
+    }
+
+    return Res < 0 ? -1 : Res;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+struct PPCOperand;
+
+class PPCAsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCInstrInfo &MII;
+  bool IsPPC64;
+  bool IsDarwin;
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+  bool isPPC64() const { return IsPPC64; }
+  bool isDarwin() const { return IsDarwin; }
+
+  bool MatchRegisterName(const AsmToken &Tok,
+                         unsigned &RegNo, int64_t &IntVal);
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
+                                        PPCMCExpr::VariantKind &Variant);
+  const MCExpr *FixupVariantKind(const MCExpr *E);
+  bool ParseExpression(const MCExpr *&EVal);
+  bool ParseDarwinExpression(const MCExpr *&EVal);
+
+  bool ParseOperand(OperandVector &Operands);
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+  bool ParseDirectiveTC(unsigned Size, SMLoc L);
+  bool ParseDirectiveMachine(SMLoc L);
+  bool ParseDarwinDirectiveMachine(SMLoc L);
+  bool ParseDirectiveAbiVersion(SMLoc L);
+  bool ParseDirectiveLocalEntry(SMLoc L);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  void ProcessInstruction(MCInst &Inst, const OperandVector &Ops);
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "PPCGenAsmMatcher.inc"
+
+  /// }
+
+
+public:
+  PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &_MII,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
+    // Check for 64-bit vs. 32-bit pointer mode.
+    Triple TheTriple(STI.getTargetTriple());
+    IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+               TheTriple.getArch() == Triple::ppc64le);
+    IsDarwin = TheTriple.isMacOSX();
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind,
+                                    MCContext &Ctx) override;
+};
+
+/// PPCOperand - Instances of this class represent a parsed PowerPC machine
+/// instruction.
+struct PPCOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate,
+    Expression,
+    TLSRegister
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  bool IsPPC64;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct ImmOp {
+    int64_t Val;
+  };
+
+  struct ExprOp {
+    const MCExpr *Val;
+    int64_t CRVal;     // Cached result of EvaluateCRExpr(Val)
+  };
+
+  struct TLSRegOp {
+    const MCSymbolRefExpr *Sym;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct ImmOp Imm;
+    struct ExprOp Expr;
+    struct TLSRegOp TLSReg;
+  };
+
+  PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+  PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    IsPPC64 = o.IsPPC64;
+    switch (Kind) {
+    case Token:
+      Tok = o.Tok;
+      break;
+    case Immediate:
+      Imm = o.Imm;
+      break;
+    case Expression:
+      Expr = o.Expr;
+      break;
+    case TLSRegister:
+      TLSReg = o.TLSReg;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
+  bool isPPC64() const { return IsPPC64; }
+
+  int64_t getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getExpr() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.Val;
+  }
+
+  int64_t getExprCRVal() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.CRVal;
+  }
+
+  const MCExpr *getTLSReg() const {
+    assert(Kind == TLSRegister && "Invalid access!");
+    return TLSReg.Sym;
+  }
+
+  unsigned getReg() const override {
+    assert(isRegNumber() && "Invalid access!");
+    return (unsigned) Imm.Val;
+  }
+
+  unsigned getVSReg() const {
+    assert(isVSRegNumber() && "Invalid access!");
+    return (unsigned) Imm.Val;
+  }
+
+  unsigned getCCReg() const {
+    assert(isCCRegNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBit() const {
+    assert(isCRBitNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBitMask() const {
+    assert(isCRBitMask() && "Invalid access!");
+    return 7 - countTrailingZeros<uint64_t>(Imm.Val);
+  }
+
+  bool isToken() const override { return Kind == Token; }
+  bool isImm() const override { return Kind == Immediate || Kind == Expression; }
+  bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
+  bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
+  bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
+  bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
+  bool isU16Imm() const { return Kind == Expression ||
+                                 (Kind == Immediate && isUInt<16>(getImm())); }
+  bool isS16Imm() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<16>(getImm())); }
+  bool isS16ImmX4() const { return Kind == Expression ||
+                                   (Kind == Immediate && isInt<16>(getImm()) &&
+                                    (getImm() & 3) == 0); }
+  bool isS17Imm() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<17>(getImm())); }
+  bool isTLSReg() const { return Kind == TLSRegister; }
+  bool isDirectBr() const { return Kind == Expression ||
+                                   (Kind == Immediate && isInt<26>(getImm()) &&
+                                    (getImm() & 3) == 0); }
+  bool isCondBr() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<16>(getImm()) &&
+                                  (getImm() & 3) == 0); }
+  bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+  bool isVSRegNumber() const { return Kind == Immediate && isUInt<6>(getImm()); }
+  bool isCCRegNumber() const { return (Kind == Expression
+                                       && isUInt<3>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<3>(getImm())); }
+  bool isCRBitNumber() const { return (Kind == Expression
+                                       && isUInt<5>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<5>(getImm())); }
+  bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
+                                    isPowerOf2_32(getImm()); }
+  bool isMem() const override { return false; }
+  bool isReg() const override { return false; }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("addRegOperands");
+  }
+
+  void addRegGPRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(RRegs[getReg()]));
+  }
+
+  void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(RRegsNoR0[getReg()]));
+  }
+
+  void addRegG8RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(XRegs[getReg()]));
+  }
+
+  void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(XRegsNoX0[getReg()]));
+  }
+
+  void addRegGxRCOperands(MCInst &Inst, unsigned N) const {
+    if (isPPC64())
+      addRegG8RCOperands(Inst, N);
+    else
+      addRegGPRCOperands(Inst, N);
+  }
+
+  void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const {
+    if (isPPC64())
+      addRegG8RCNoX0Operands(Inst, N);
+    else
+      addRegGPRCNoR0Operands(Inst, N);
+  }
+
+  void addRegF4RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(FRegs[getReg()]));
+  }
+
+  void addRegF8RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(FRegs[getReg()]));
+  }
+
+  void addRegVRRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VRegs[getReg()]));
+  }
+
+  void addRegVSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VSRegs[getVSReg()]));
+  }
+
+  void addRegVSFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VSFRegs[getVSReg()]));
+  }
+
+  void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getCRBit()]));
+  }
+
+  void addRegCRRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(CRRegs[getCCReg()]));
+  }
+
+  void addCRBitMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(CRRegs[getCRBitMask()]));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate)
+      Inst.addOperand(MCOperand::CreateImm(getImm()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getExpr()));
+  }
+
+  void addBranchTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate)
+      Inst.addOperand(MCOperand::CreateImm(getImm() / 4));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getExpr()));
+  }
+
+  void addTLSRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateExpr(getTLSReg()));
+  }
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
+                                                 bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) {
+    // Allocate extra memory for the string and copy it.
+    // FIXME: This is incorrect, Operands are owned by unique_ptr with a default
+    // deleter which will destroy them by simply using "delete", not correctly
+    // calling operator delete on this extra memory after calling the dtor
+    // explicitly.
+    void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
+    std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token));
+    Op->Tok.Data = (const char *)(Op.get() + 1);
+    Op->Tok.Length = Str.size();
+    std::memcpy((void *)Op->Tok.Data, Str.data(), Str.size());
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
+                                               bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
+                                                SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Expression);
+    Op->Expr.Val = Val;
+    Op->Expr.CRVal = EvaluateCRExpr(Val);
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(TLSRegister);
+    Op->TLSReg.Sym = Sym;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) {
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
+      return CreateImm(CE->getValue(), S, E, IsPPC64);
+
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
+        return CreateTLSReg(SRE, S, E, IsPPC64);
+
+    return CreateExpr(Val, S, E, IsPPC64);
+  }
+};
+
+} // end anonymous namespace.
+
+void PPCOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case Immediate:
+    OS << getImm();
+    break;
+  case Expression:
+    getExpr()->print(OS);
+    break;
+  case TLSRegister:
+    getTLSReg()->print(OS);
+    break;
+  }
+}
+
+void PPCAsmParser::ProcessInstruction(MCInst &Inst,
+                                      const OperandVector &Operands) {
+  int Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case PPC::LAx: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::LA);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBI: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDI);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIS: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDIS);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIC: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDIC);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBICo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLWI:
+  case PPC::EXTLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRWI:
+  case PPC::EXTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B + N));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSLWI:
+  case PPC::INSLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSLWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - B));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRWI:
+  case PPC::INSRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - (B + N)));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRWI:
+  case PPC::ROTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLWI:
+  case PPC::SLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SRWI:
+  case PPC::SRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRRWI:
+  case PPC::CLRRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLWI:
+  case PPC::CLRLSLWIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(B - N));
+    TmpInst.addOperand(MCOperand::CreateImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLDI:
+  case PPC::EXTLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRDI:
+  case PPC::EXTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B + N));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRDI:
+  case PPC::INSRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRDI? PPC::RLDIMI : PPC::RLDIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - (B + N)));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRDI:
+  case PPC::ROTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLDI:
+  case PPC::SLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SRDI:
+  case PPC::SRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRRDI:
+  case PPC::CLRRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLDI:
+  case PPC::CLRLSLDIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLDI? PPC::RLDIC : PPC::RLDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(B - N));
+    Inst = TmpInst;
+    break;
+  }
+  }
+}
+
+bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, unsigned &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  default: break;
+  case Match_Success:
+    // Post-process instructions (typically extended mnemonics)
+    ProcessInstruction(Inst, Operands);
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction use requires an option to be enabled");
+  case Match_MnemonicFail:
+      return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool PPCAsmParser::
+MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) {
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef Name = Tok.getString();
+
+    if (Name.equals_lower("lr")) {
+      RegNo = isPPC64()? PPC::LR8 : PPC::LR;
+      IntVal = 8;
+      return false;
+    } else if (Name.equals_lower("ctr")) {
+      RegNo = isPPC64()? PPC::CTR8 : PPC::CTR;
+      IntVal = 9;
+      return false;
+    } else if (Name.equals_lower("vrsave")) {
+      RegNo = PPC::VRSAVE;
+      IntVal = 256;
+      return false;
+    } else if (Name.startswith_lower("r") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("f") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = FRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("v") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = VRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("cr") &&
+               !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
+      RegNo = CRRegs[IntVal];
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool PPCAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  const AsmToken &Tok = Parser.getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = 0;
+  int64_t IntVal;
+
+  if (!MatchRegisterName(Tok, RegNo, IntVal)) {
+    Parser.Lex(); // Eat identifier token.
+    return false;
+  }
+
+  return Error(StartLoc, "invalid register name");
+}
+
+/// Extract \code @l/@ha \endcode modifier from expression.  Recursively scan
+/// the expression and check for VK_PPC_LO/HI/HA
+/// symbol variants.  If all symbols with modifier use the same
+/// variant, return the corresponding PPCMCExpr::VariantKind,
+/// and a modified expression using the default symbol variant.
+/// Otherwise, return NULL.
+const MCExpr *PPCAsmParser::
+ExtractModifierFromExpr(const MCExpr *E,
+                        PPCMCExpr::VariantKind &Variant) {
+  MCContext &Context = getParser().getContext();
+  Variant = PPCMCExpr::VK_PPC_None;
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return nullptr;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_PPC_LO:
+      Variant = PPCMCExpr::VK_PPC_LO;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HI:
+      Variant = PPCMCExpr::VK_PPC_HI;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HA:
+      Variant = PPCMCExpr::VK_PPC_HA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHER:
+      Variant = PPCMCExpr::VK_PPC_HIGHER;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHERA:
+      Variant = PPCMCExpr::VK_PPC_HIGHERA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHEST:
+      Variant = PPCMCExpr::VK_PPC_HIGHEST;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+      Variant = PPCMCExpr::VK_PPC_HIGHESTA;
+      break;
+    default:
+      return nullptr;
+    }
+
+    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
+    if (!Sub)
+      return nullptr;
+    return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    PPCMCExpr::VariantKind LHSVariant, RHSVariant;
+    const MCExpr *LHS = ExtractModifierFromExpr(BE->getLHS(), LHSVariant);
+    const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant);
+
+    if (!LHS && !RHS)
+      return nullptr;
+
+    if (!LHS) LHS = BE->getLHS();
+    if (!RHS) RHS = BE->getRHS();
+
+    if (LHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = RHSVariant;
+    else if (RHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = LHSVariant;
+    else if (LHSVariant == RHSVariant)
+      Variant = LHSVariant;
+    else
+      return nullptr;
+
+    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// Find all VK_TLSGD/VK_TLSLD symbol references in expression and replace
+/// them by VK_PPC_TLSGD/VK_PPC_TLSLD.  This is necessary to avoid having
+/// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT.
+/// FIXME: This is a hack.
+const MCExpr *PPCAsmParser::
+FixupVariantKind(const MCExpr *E) {
+  MCContext &Context = getParser().getContext();
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return E;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_TLSGD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSGD;
+      break;
+    case MCSymbolRefExpr::VK_TLSLD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSLD;
+      break;
+    default:
+      return E;
+    }
+    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = FixupVariantKind(UE->getSubExpr());
+    if (Sub == UE->getSubExpr())
+      return E;
+    return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    const MCExpr *LHS = FixupVariantKind(BE->getLHS());
+    const MCExpr *RHS = FixupVariantKind(BE->getRHS());
+    if (LHS == BE->getLHS() && RHS == BE->getRHS())
+      return E;
+    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// ParseExpression.  This differs from the default "parseExpression" in that
+/// it handles modifiers.
+bool PPCAsmParser::
+ParseExpression(const MCExpr *&EVal) {
+
+  if (isDarwin())
+    return ParseDarwinExpression(EVal);
+
+  // (ELF Platforms)
+  // Handle \code @l/@ha \endcode
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  EVal = FixupVariantKind(EVal);
+
+  PPCMCExpr::VariantKind Variant;
+  const MCExpr *E = ExtractModifierFromExpr(EVal, Variant);
+  if (E)
+    EVal = PPCMCExpr::Create(Variant, E, false, getParser().getContext());
+
+  return false;
+}
+
+/// ParseDarwinExpression.  (MachO Platforms)
+/// This differs from the default "parseExpression" in that it handles detection
+/// of the \code hi16(), ha16() and lo16() \endcode modifiers.  At present,
+/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
+/// syntax form so it is done here.  TODO: Determine if there is merit in arranging
+/// for this to be done at a higher level.
+bool PPCAsmParser::
+ParseDarwinExpression(const MCExpr *&EVal) {
+  PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
+  switch (getLexer().getKind()) {
+  default:
+    break;
+  case AsmToken::Identifier:
+    // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus
+    // something starting with any other char should be part of the
+    // asm syntax.  If handwritten asm includes an identifier like lo16,
+    // then all bets are off - but no-one would do that, right?
+    StringRef poss = Parser.getTok().getString();
+    if (poss.equals_lower("lo16")) {
+      Variant = PPCMCExpr::VK_PPC_LO;
+    } else if (poss.equals_lower("hi16")) {
+      Variant = PPCMCExpr::VK_PPC_HI;
+    } else if (poss.equals_lower("ha16")) {
+      Variant = PPCMCExpr::VK_PPC_HA;
+    }
+    if (Variant != PPCMCExpr::VK_PPC_None) {
+      Parser.Lex(); // Eat the xx16
+      if (getLexer().isNot(AsmToken::LParen))
+        return Error(Parser.getTok().getLoc(), "expected '('");
+      Parser.Lex(); // Eat the '('
+    }
+    break;
+  }
+
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  if (Variant != PPCMCExpr::VK_PPC_None) {
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+    Parser.Lex(); // Eat the ')'
+    EVal = PPCMCExpr::Create(Variant, EVal, false, getParser().getContext());
+  }
+  return false;
+}
+
+/// ParseOperand
+/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
+/// rNN for MachO.
+bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *EVal;
+
+  // Attempt to parse the next token as an immediate
+  switch (getLexer().getKind()) {
+  // Special handling for register names.  These are interpreted
+  // as immediates corresponding to the register number.
+  case AsmToken::Percent:
+    Parser.Lex(); // Eat the '%'.
+    unsigned RegNo;
+    int64_t IntVal;
+    if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+      Parser.Lex(); // Eat the identifier token.
+      Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+      return false;
+    }
+    return Error(S, "invalid register name");
+
+  case AsmToken::Identifier:
+    // Note that non-register-name identifiers from the compiler will begin
+    // with '_', 'L'/'l' or '"'.  Of course, handwritten asm could include
+    // identifiers like r31foo - so we fall through in the event that parsing
+    // a register name fails.
+    if (isDarwin()) {
+      unsigned RegNo;
+      int64_t IntVal;
+      if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+        Parser.Lex(); // Eat the identifier token.
+        Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+        return false;
+      }
+    }
+  // Fall-through to process non-register-name identifiers as expression.
+  // All other expressions
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+  case AsmToken::Dollar:
+  case AsmToken::Exclaim:
+  case AsmToken::Tilde:
+    if (!ParseExpression(EVal))
+      break;
+    /* fall through */
+  default:
+    return Error(S, "unknown operand");
+  }
+
+  // Push the parsed operand into the list of operands
+  Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()));
+
+  // Check whether this is a TLS call expression
+  bool TLSCall = false;
+  if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(EVal))
+    TLSCall = Ref->getSymbol().getName() == "__tls_get_addr";
+
+  if (TLSCall && getLexer().is(AsmToken::LParen)) {
+    const MCExpr *TLSSym;
+
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+    if (ParseExpression(TLSSym))
+      return Error(S, "invalid TLS call expression");
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "missing ')'");
+    E = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the ')'.
+
+    Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()));
+  }
+
+  // Otherwise, check for D-form memory operands
+  if (!TLSCall && getLexer().is(AsmToken::LParen)) {
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+
+    int64_t IntVal;
+    switch (getLexer().getKind()) {
+    case AsmToken::Percent:
+      Parser.Lex(); // Eat the '%'.
+      unsigned RegNo;
+      if (MatchRegisterName(Parser.getTok(), RegNo, IntVal))
+        return Error(S, "invalid register name");
+      Parser.Lex(); // Eat the identifier token.
+      break;
+
+    case AsmToken::Integer:
+      if (!isDarwin()) {
+        if (getParser().parseAbsoluteExpression(IntVal) ||
+          IntVal < 0 || IntVal > 31)
+        return Error(S, "invalid register number");
+      } else {
+        return Error(S, "unexpected integer value");
+      }
+      break;
+
+   case AsmToken::Identifier:
+    if (isDarwin()) {
+      unsigned RegNo;
+      if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+        Parser.Lex(); // Eat the identifier token.
+        break;
+      }
+    }
+    // Fall-through..
+
+    default:
+      return Error(S, "invalid memory operand");
+    }
+
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "missing ')'");
+    E = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the ')'.
+
+    Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+  }
+
+  return false;
+}
+
+/// Parse an instruction mnemonic followed by its operands.
+bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  // The first operand is the token for the instruction name.
+  // If the next character is a '+' or '-', we need to add it to the
+  // instruction name, to match what TableGen is doing.
+  std::string NewOpcode;
+  if (getLexer().is(AsmToken::Plus)) {
+    getLexer().Lex();
+    NewOpcode = Name;
+    NewOpcode += '+';
+    Name = NewOpcode;
+  }
+  if (getLexer().is(AsmToken::Minus)) {
+    getLexer().Lex();
+    NewOpcode = Name;
+    NewOpcode += '-';
+    Name = NewOpcode;
+  }
+  // If the instruction ends in a '.', we need to create a separate
+  // token for it, to match what TableGen is doing.
+  size_t Dot = Name.find('.');
+  StringRef Mnemonic = Name.slice(0, Dot);
+  if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+    Operands.push_back(
+        PPCOperand::CreateTokenWithStringCopy(Mnemonic, NameLoc, isPPC64()));
+  else
+    Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
+  if (Dot != StringRef::npos) {
+    SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot);
+    StringRef DotStr = Name.slice(Dot, StringRef::npos);
+    if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+      Operands.push_back(
+          PPCOperand::CreateTokenWithStringCopy(DotStr, DotLoc, isPPC64()));
+    else
+      Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
+  }
+
+  // If there are no more operands then finish
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse the first operand
+  if (ParseOperand(Operands))
+    return true;
+
+  while (getLexer().isNot(AsmToken::EndOfStatement) &&
+         getLexer().is(AsmToken::Comma)) {
+    // Consume the comma token
+    getLexer().Lex();
+
+    // Parse the next operand
+    if (ParseOperand(Operands))
+      return true;
+  }
+
+  return false;
+}
+
+/// ParseDirective parses the PPC specific directives
+bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (!isDarwin()) {
+    if (IDVal == ".word")
+      return ParseDirectiveWord(2, DirectiveID.getLoc());
+    if (IDVal == ".llong")
+      return ParseDirectiveWord(8, DirectiveID.getLoc());
+    if (IDVal == ".tc")
+      return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
+    if (IDVal == ".machine")
+      return ParseDirectiveMachine(DirectiveID.getLoc());
+    if (IDVal == ".abiversion")
+      return ParseDirectiveAbiVersion(DirectiveID.getLoc());
+    if (IDVal == ".localentry")
+      return ParseDirectiveLocalEntry(DirectiveID.getLoc());
+  } else {
+    if (IDVal == ".machine")
+      return ParseDarwinDirectiveMachine(DirectiveID.getLoc());
+  }
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return false;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveTC
+///  ::= .tc [ symbol (, expression)* ]
+bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
+  // Skip TC symbol, which is only used with XCOFF.
+  while (getLexer().isNot(AsmToken::EndOfStatement)
+         && getLexer().isNot(AsmToken::Comma))
+    Parser.Lex();
+  if (getLexer().isNot(AsmToken::Comma)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  // Align to word size.
+  getParser().getStreamer().EmitValueToAlignment(Size);
+
+  // Emit expressions.
+  return ParseDirectiveWord(Size, L);
+}
+
+/// ParseDirectiveMachine (ELF platforms)
+///  ::= .machine [ cpu | "push" | "pop" ]
+bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier) &&
+      getLexer().isNot(AsmToken::String)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  // FIXME: Right now, the parser always allows any available
+  // instruction, so the .machine directive is not useful.
+  // Implement ".machine any" (by doing nothing) for the benefit
+  // of existing assembler code.  Likewise, we can then implement
+  // ".machine push" and ".machine pop" as no-op.
+  if (CPU != "any" && CPU != "push" && CPU != "pop") {
+    Error(L, "unrecognized machine type");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitMachine(CPU);
+
+  return false;
+}
+
+/// ParseDarwinDirectiveMachine (Mach-o platforms)
+///  ::= .machine cpu-identifier
+bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier) &&
+      getLexer().isNot(AsmToken::String)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  // FIXME: this is only the 'default' set of cpu variants.
+  // However we don't act on this information at present, this is simply
+  // allowing parsing to proceed with minimal sanity checking.
+  if (CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64") {
+    Error(L, "unrecognized cpu type");
+    return false;
+  }
+
+  if (isPPC64() && (CPU == "ppc7400" || CPU == "ppc")) {
+    Error(L, "wrong cpu type specified for 64bit");
+    return false;
+  }
+  if (!isPPC64() && CPU == "ppc64") {
+    Error(L, "wrong cpu type specified for 32bit");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveAbiVersion
+///  ::= .abiversion constant-expression
+bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
+  int64_t AbiVersion;
+  if (getParser().parseAbsoluteExpression(AbiVersion)){
+    Error(L, "expected constant expression");
+    return false;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitAbiVersion(AbiVersion);
+
+  return false;
+}
+
+/// ParseDirectiveLocalEntry
+///  ::= .localentry symbol, expression
+bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name)) {
+    Error(L, "expected identifier in directive");
+    return false;
+  }
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Lex();
+
+  const MCExpr *Expr;
+  if (getParser().parseExpression(Expr)) {
+    Error(L, "expected expression");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitLocalEntry(Sym, Expr);
+
+  return false;
+}
+
+
+
+/// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmParser() {
+  RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target);
+  RegisterMCAsmParser<PPCAsmParser> B(ThePPC64Target);
+  RegisterMCAsmParser<PPCAsmParser> C(ThePPC64LETarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "PPCGenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned Kind) {
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ImmVal;
+  switch (Kind) {
+    case MCK_0: ImmVal = 0; break;
+    case MCK_1: ImmVal = 1; break;
+    case MCK_2: ImmVal = 2; break;
+    case MCK_3: ImmVal = 3; break;
+    default: return Match_InvalidOperand;
+  }
+
+  PPCOperand &Op = static_cast<PPCOperand &>(AsmOp);
+  if (Op.isImm() && Op.getImm() == ImmVal)
+    return Match_Success;
+
+  return Match_InvalidOperand;
+}
+
+const MCExpr *
+PPCAsmParser::applyModifierToExpr(const MCExpr *E,
+                                  MCSymbolRefExpr::VariantKind Variant,
+                                  MCContext &Ctx) {
+  switch (Variant) {
+  case MCSymbolRefExpr::VK_PPC_LO:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HI:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHER:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHERA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHEST:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+  default:
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/PowerPC/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..ca457df
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMPowerPCDisassembler
+  PPCDisassembler.cpp
+  )
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/contrib/llvm/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000..b0978c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/PowerPC/Disassembler/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = PowerPCDisassembler
+parent = PowerPC
+required_libraries = MC PowerPCInfo Support
+add_to_library_groups = PowerPC
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/Makefile b/contrib/llvm/lib/Target/PowerPC/Disassembler/Makefile
new file mode 100644
index 0000000..86e3b47
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===-- lib/Target/PowerPC/Disassembler/Makefile -----------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCDisassembler
+
+# Hack: we need to include 'main' PPC target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
new file mode 100644
index 0000000..a2305a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -0,0 +1,348 @@
+//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class PPCDisassembler : public MCDisassembler {
+public:
+  PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+  virtual ~PPCDisassembler() {}
+
+  // Override MCDisassembler.
+  virtual DecodeStatus getInstruction(MCInst &instr,
+                                      uint64_t &size,
+                                      const MemoryObject &region,
+                                      uint64_t address,
+                                      raw_ostream &vStream,
+                                      raw_ostream &cStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createPPCDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new PPCDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializePowerPCDisassembler() {
+  // Register the disassembler for each target.
+  TargetRegistry::RegisterMCDisassembler(ThePPC32Target,
+                                         createPPCDisassembler);
+  TargetRegistry::RegisterMCDisassembler(ThePPC64Target,
+                                         createPPCDisassembler);
+  TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget,
+                                         createPPCDisassembler);
+}
+
+// FIXME: These can be generated by TableGen from the existing register
+// encoding values!
+
+static const unsigned CRRegs[] = {
+  PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+  PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+static const unsigned CRBITRegs[] = {
+  PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+  PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+  PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+  PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+  PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+  PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+  PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+  PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+
+static const unsigned FRegs[] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+
+static const unsigned VRegs[] = {
+  PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+  PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+  PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+static const unsigned VSRegs[] = {
+  PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+  PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+  PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+  PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+  PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+  PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+  PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3,
+  PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7,
+  PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11,
+  PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
+  PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
+  PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
+  PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
+  PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+};
+
+static const unsigned VSFRegs[] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+static const unsigned GPRegs[] = {
+  PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+  PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+  PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+static const unsigned GP0Regs[] = {
+  PPC::ZERO, PPC::R1, PPC::R2, PPC::R3,
+  PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+  PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+static const unsigned G8Regs[] = {
+  PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+  PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+  PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
+template <std::size_t N>
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                        const unsigned (&Regs)[N]) {
+  assert(RegNo < N && "Invalid register number");
+  Inst.addOperand(MCOperand::CreateReg(Regs[RegNo]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, CRRegs);
+}
+
+static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, CRBITRegs);
+}
+
+static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VRegs);
+}
+
+static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VSRegs);
+}
+
+static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VSFRegs);
+}
+
+static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, GPRegs);
+}
+
+static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, GP0Regs);
+}
+
+static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, G8Regs);
+}
+
+#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
+#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address, const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::CreateImm(Imm));
+  return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address, const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  // Decode the memri field (imm, reg), which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+
+  uint64_t Base = Imm >> 16;
+  uint64_t Disp = Imm & 0xFFFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  switch (Inst.getOpcode()) {
+  default: break;
+  case PPC::LBZU:
+  case PPC::LHAU:
+  case PPC::LHZU:
+  case PPC::LWZU:
+  case PPC::LFSU:
+  case PPC::LFDU:
+    // Add the tied output operand.
+    Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+    break;
+  case PPC::STBU:
+  case PPC::STHU:
+  case PPC::STWU:
+  case PPC::STFSU:
+  case PPC::STFDU:
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(GP0Regs[Base]));
+    break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<16>(Disp)));
+  Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm,
+                                         int64_t Address, const void *Decoder) {
+  // Decode the memrix field (imm, reg), which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+
+  uint64_t Base = Imm >> 14;
+  uint64_t Disp = Imm & 0x3FFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  if (Inst.getOpcode() == PPC::LDU)
+    // Add the tied output operand.
+    Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+  else if (Inst.getOpcode() == PPC::STDU)
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(GP0Regs[Base]));
+
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<16>(Disp << 2)));
+  Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  // The cr bit encoding is 0x80 >> cr_reg_num.
+
+  unsigned Zeros = countTrailingZeros(Imm);
+  assert(Zeros < 8 && "Invalid CR bit value");
+
+  Inst.addOperand(MCOperand::CreateReg(CRRegs[7 - Zeros]));
+  return MCDisassembler::Success;
+}
+
+#include "PPCGenDisassemblerTables.inc"
+
+DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 const MemoryObject &Region,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
+  // Get the four bytes of the instruction.
+  uint8_t Bytes[4];
+  Size = 4;
+  if (Region.readBytes(Address, Size, Bytes) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // The instruction is big-endian encoded.
+  uint32_t Inst = (Bytes[0] << 24) |
+                  (Bytes[1] << 16) |
+                  (Bytes[2] <<  8) |
+                  (Bytes[3] <<  0);
+
+  return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
new file mode 100644
index 0000000..771b6f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -0,0 +1,365 @@
+//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstPrinter.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// FIXME: Once the integrated assembler supports full register names, tie this
+// to the verbose-asm setting.
+static cl::opt<bool>
+FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
+             cl::desc("Use full register names when printing assembly"));
+
+#include "PPCGenAsmWriter.inc"
+
+void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  // Check for slwi/srwi mnemonics.
+  if (MI->getOpcode() == PPC::RLWINM) {
+    unsigned char SH = MI->getOperand(2).getImm();
+    unsigned char MB = MI->getOperand(3).getImm();
+    unsigned char ME = MI->getOperand(4).getImm();
+    bool useSubstituteMnemonic = false;
+    if (SH <= 31 && MB == 0 && ME == (31-SH)) {
+      O << "\tslwi "; useSubstituteMnemonic = true;
+    }
+    if (SH <= 31 && MB == (32-SH) && ME == 31) {
+      O << "\tsrwi "; useSubstituteMnemonic = true;
+      SH = 32-SH;
+    }
+    if (useSubstituteMnemonic) {
+      printOperand(MI, 0, O);
+      O << ", ";
+      printOperand(MI, 1, O);
+      O << ", " << (unsigned int)SH;
+
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+  
+  if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
+      MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+    O << "\tmr ";
+    printOperand(MI, 0, O);
+    O << ", ";
+    printOperand(MI, 1, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+  
+  if (MI->getOpcode() == PPC::RLDICR) {
+    unsigned char SH = MI->getOperand(2).getImm();
+    unsigned char ME = MI->getOperand(3).getImm();
+    // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
+    if (63-SH == ME) {
+      O << "\tsldi ";
+      printOperand(MI, 0, O);
+      O << ", ";
+      printOperand(MI, 1, O);
+      O << ", " << (unsigned int)SH;
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+  
+  // For fast-isel, a COPY_TO_REGCLASS may survive this long.  This is
+  // used when converting a 32-bit float to a 64-bit float as part of
+  // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()),
+  // as otherwise we have problems with incorrect register classes
+  // in machine instruction verification.  For now, just avoid trying
+  // to print it as such an instruction has no effect (a 32-bit float
+  // in a register is already in 64-bit form, just with lower
+  // precision).  FIXME: Is there a better solution?
+  if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+    return;
+  
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+
+void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O, 
+                                           const char *Modifier) {
+  unsigned Code = MI->getOperand(OpNo).getImm();
+
+  if (StringRef(Modifier) == "cc") {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_LT_MINUS:
+    case PPC::PRED_LT_PLUS:
+    case PPC::PRED_LT:
+      O << "lt";
+      return;
+    case PPC::PRED_LE_MINUS:
+    case PPC::PRED_LE_PLUS:
+    case PPC::PRED_LE:
+      O << "le";
+      return;
+    case PPC::PRED_EQ_MINUS:
+    case PPC::PRED_EQ_PLUS:
+    case PPC::PRED_EQ:
+      O << "eq";
+      return;
+    case PPC::PRED_GE_MINUS:
+    case PPC::PRED_GE_PLUS:
+    case PPC::PRED_GE:
+      O << "ge";
+      return;
+    case PPC::PRED_GT_MINUS:
+    case PPC::PRED_GT_PLUS:
+    case PPC::PRED_GT:
+      O << "gt";
+      return;
+    case PPC::PRED_NE_MINUS:
+    case PPC::PRED_NE_PLUS:
+    case PPC::PRED_NE:
+      O << "ne";
+      return;
+    case PPC::PRED_UN_MINUS:
+    case PPC::PRED_UN_PLUS:
+    case PPC::PRED_UN:
+      O << "un";
+      return;
+    case PPC::PRED_NU_MINUS:
+    case PPC::PRED_NU_PLUS:
+    case PPC::PRED_NU:
+      O << "nu";
+      return;
+    case PPC::PRED_BIT_SET:
+    case PPC::PRED_BIT_UNSET:
+      llvm_unreachable("Invalid use of bit predicate code");
+    }
+    llvm_unreachable("Invalid predicate code");
+  }
+
+  if (StringRef(Modifier) == "pm") {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_LT:
+    case PPC::PRED_LE:
+    case PPC::PRED_EQ:
+    case PPC::PRED_GE:
+    case PPC::PRED_GT:
+    case PPC::PRED_NE:
+    case PPC::PRED_UN:
+    case PPC::PRED_NU:
+      return;
+    case PPC::PRED_LT_MINUS:
+    case PPC::PRED_LE_MINUS:
+    case PPC::PRED_EQ_MINUS:
+    case PPC::PRED_GE_MINUS:
+    case PPC::PRED_GT_MINUS:
+    case PPC::PRED_NE_MINUS:
+    case PPC::PRED_UN_MINUS:
+    case PPC::PRED_NU_MINUS:
+      O << "-";
+      return;
+    case PPC::PRED_LT_PLUS:
+    case PPC::PRED_LE_PLUS:
+    case PPC::PRED_EQ_PLUS:
+    case PPC::PRED_GE_PLUS:
+    case PPC::PRED_GT_PLUS:
+    case PPC::PRED_NE_PLUS:
+    case PPC::PRED_UN_PLUS:
+    case PPC::PRED_NU_PLUS:
+      O << "+";
+      return;
+    case PPC::PRED_BIT_SET:
+    case PPC::PRED_BIT_UNSET:
+      llvm_unreachable("Invalid use of bit predicate code");
+    }
+    llvm_unreachable("Invalid predicate code");
+  }
+  
+  assert(StringRef(Modifier) == "reg" &&
+         "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!");
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 3 && "Invalid u2imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  int Value = MI->getOperand(OpNo).getImm();
+  Value = SignExtend32<5>(Value);
+  O << (int)Value;
+}
+
+void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 31 && "Invalid u5imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 63 && "Invalid u6imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    O << (short)MI->getOperand(OpNo).getImm();
+  else
+    printOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    O << (unsigned short)MI->getOperand(OpNo).getImm();
+  else
+    printOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print .+8, an eight byte displacement from the PC.
+  O << ".+";
+  printAbsBranchOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  O << (int)MI->getOperand(OpNo).getImm()*4;
+}
+
+
+void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  unsigned CCReg = MI->getOperand(OpNo).getReg();
+  unsigned RegNo;
+  switch (CCReg) {
+  default: llvm_unreachable("Unknown CR register");
+  case PPC::CR0: RegNo = 0; break;
+  case PPC::CR1: RegNo = 1; break;
+  case PPC::CR2: RegNo = 2; break;
+  case PPC::CR3: RegNo = 3; break;
+  case PPC::CR4: RegNo = 4; break;
+  case PPC::CR5: RegNo = 5; break;
+  case PPC::CR6: RegNo = 6; break;
+  case PPC::CR7: RegNo = 7; break;
+  }
+  O << (0x80 >> RegNo);
+}
+
+void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printS16ImmOperand(MI, OpNo, O);
+  O << '(';
+  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo+1, O);
+  O << ')';
+}
+
+void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  // When used as the base register, r0 reads constant zero rather than
+  // the value contained in the register.  For this reason, the darwin
+  // assembler requires that we print r0 as 0 (no r) when used as the base.
+  if (MI->getOperand(OpNo).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo, O);
+  O << ", ";
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
+  // come at the _end_ of the expression.
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*Op.getExpr());
+  O << refExp.getSymbol().getName();
+  O << '(';
+  printOperand(MI, OpNo+1, O);
+  O << ')';
+  if (refExp.getKind() != MCSymbolRefExpr::VK_None)
+    O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
+}
+
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left.  Used by for linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames)
+    return RegName;
+
+  switch (RegName[0]) {
+  case 'r':
+  case 'f':
+  case 'v':
+    if (RegName[1] == 's')
+      return RegName + 2;
+    return RegName + 1;
+  case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+  
+  return RegName;
+}
+
+void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    const char *RegName = getRegisterName(Op.getReg());
+    // The linux and AIX assembler does not take register prefixes.
+    if (!isDarwinSyntax())
+      RegName = stripRegisterPrefix(RegName);
+    
+    O << RegName;
+    return;
+  }
+  
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+  
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  O << *Op.getExpr();
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
new file mode 100644
index 0000000..211a628
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -0,0 +1,63 @@
+//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCINSTPRINTER_H
+#define PPCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class PPCInstPrinter : public MCInstPrinter {
+  bool IsDarwin;
+public:
+  PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, bool isDarwin)
+    : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {}
+  
+  bool isDarwinSyntax() const {
+    return IsDarwin;
+  }
+  
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                             raw_ostream &O, const char *Modifier = nullptr);
+
+  void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
new file mode 100644
index 0000000..c54d5e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -0,0 +1,245 @@
+//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case PPC::fixup_ppc_nofixup:
+    return Value;
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+    return Value & 0xfffc;
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return Value & 0x3fffffc;
+  case PPC::fixup_ppc_half16:
+    return Value & 0xffff;
+  case PPC::fixup_ppc_half16ds:
+    return Value & 0xfffc;
+  }
+}
+
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_half16ds:
+    return 2;
+  case FK_Data_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return 4;
+  case FK_Data_8:
+    return 8;
+  case PPC::fixup_ppc_nofixup:
+    return 0;
+  }
+}
+
+namespace {
+
+class PPCAsmBackend : public MCAsmBackend {
+  const Target &TheTarget;
+  bool IsLittleEndian;
+public:
+  PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
+    IsLittleEndian(isLittle) {}
+
+  unsigned getNumFixupKinds() const override {
+    return PPC::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_br24abs",     6,      24,   0 },
+      { "fixup_ppc_brcond14abs", 16,     14,   0 },
+      { "fixup_ppc_half16",       0,     16,   0 },
+      { "fixup_ppc_half16ds",     0,     14,   0 },
+      { "fixup_ppc_nofixup",      0,      0,   0 }
+    };
+    const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        2,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    2,      14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_br24abs",     2,      24,   0 },
+      { "fixup_ppc_brcond14abs", 2,      14,   0 },
+      { "fixup_ppc_half16",      0,      16,   0 },
+      { "fixup_ppc_half16ds",    2,      14,   0 },
+      { "fixup_ppc_nofixup",     0,       0,   0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override {
+    Value = adjustFixupValue(Fixup.getKind(), Value);
+    if (!Value) return;           // Doesn't change encoding.
+
+    unsigned Offset = Fixup.getOffset();
+    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+
+    // For each byte of the fragment that the fixup touches, mask in the bits
+    // from the fixup value. The Value has been "split up" into the appropriate
+    // bitfields above.
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+      Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+    }
+  }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    switch ((PPC::Fixups)Fixup.getKind()) {
+    default: break;
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      // If the target symbol has a local entry point we must not attempt
+      // to resolve the fixup directly.  Emit a relocation and leave
+      // resolution of the final target address to the linker.
+      if (const MCSymbolRefExpr *A = Target.getSymA()) {
+        const MCSymbolData &Data = Asm.getSymbolData(A->getSymbol());
+        // The "other" values are stored in the last 6 bits of the second byte.
+        // The traditional defines for STO values assume the full byte and thus
+        // the shift to pack it.
+        unsigned Other = MCELF::getOther(Data) << 2;
+        if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
+          IsResolved = false;
+      }
+      break;
+    }
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    // FIXME.
+    return false;
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+    uint64_t NumNops = Count / 4;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write32(0x60000000);
+
+    switch (Count % 4) {
+    default: break; // No leftover bytes to write
+    case 1: OW->Write8(0); break;
+    case 2: OW->Write16(0); break;
+    case 3: OW->Write16(0); OW->Write8(0); break;
+    }
+
+    return true;
+  }
+
+  unsigned getPointerSize() const {
+    StringRef Name = TheTarget.getName();
+    if (Name == "ppc64" || Name == "ppc64le") return 8;
+    assert(Name == "ppc32" && "Unknown target name!");
+    return 4;
+  }
+
+  bool isLittleEndian() const {
+    return IsLittleEndian;
+  }
+};
+} // end anonymous namespace
+
+
+// FIXME: This should be in a separate file.
+namespace {
+  class DarwinPPCAsmBackend : public PPCAsmBackend {
+  public:
+    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
+
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+      bool is64 = getPointerSize() == 8;
+      return createPPCMachObjectWriter(
+          OS,
+          /*Is64Bit=*/is64,
+          (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
+          MachO::CPU_SUBTYPE_POWERPC_ALL);
+    }
+  };
+
+  class ELFPPCAsmBackend : public PPCAsmBackend {
+    uint8_t OSABI;
+  public:
+    ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) :
+      PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
+
+
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+      bool is64 = getPointerSize() == 8;
+      return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
+    }
+  };
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU) {
+  if (Triple(TT).isOSDarwin())
+    return new DarwinPPCAsmBackend(T);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+  bool IsLittleEndian = Triple(TT).getArch() == Triple::ppc64le;
+  return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
new file mode 100644
index 0000000..ca81317
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -0,0 +1,432 @@
+//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class PPCELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+    virtual ~PPCELFObjectWriter();
+  protected:
+    virtual unsigned getRelocTypeInner(const MCValue &Target,
+                                       const MCFixup &Fixup,
+                                       bool IsPCRel) const;
+    unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsPCRel) const override;
+
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
+  };
+}
+
+PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+  : MCELFObjectTargetWriter(Is64Bit, OSABI,
+                            Is64Bit ?  ELF::EM_PPC64 : ELF::EM_PPC,
+                            /*HasRelocationAddend*/ true) {}
+
+PPCELFObjectWriter::~PPCELFObjectWriter() {
+}
+
+static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
+                                                     const MCFixup &Fixup) {
+  const MCExpr *Expr = Fixup.getValue();
+
+  if (Expr->getKind() != MCExpr::Target)
+    return Target.getAccessVariant();
+
+  switch (cast<PPCMCExpr>(Expr)->getKind()) {
+  case PPCMCExpr::VK_PPC_None:
+    return MCSymbolRefExpr::VK_None;
+  case PPCMCExpr::VK_PPC_LO:
+    return MCSymbolRefExpr::VK_PPC_LO;
+  case PPCMCExpr::VK_PPC_HI:
+    return MCSymbolRefExpr::VK_PPC_HI;
+  case PPCMCExpr::VK_PPC_HA:
+    return MCSymbolRefExpr::VK_PPC_HA;
+  case PPCMCExpr::VK_PPC_HIGHERA:
+    return MCSymbolRefExpr::VK_PPC_HIGHERA;
+  case PPCMCExpr::VK_PPC_HIGHER:
+    return MCSymbolRefExpr::VK_PPC_HIGHER;
+  case PPCMCExpr::VK_PPC_HIGHEST:
+    return MCSymbolRefExpr::VK_PPC_HIGHEST;
+  case PPCMCExpr::VK_PPC_HIGHESTA:
+    return MCSymbolRefExpr::VK_PPC_HIGHESTA;
+  }
+  llvm_unreachable("unknown PPCMCExpr kind");
+}
+
+unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const
+{
+  MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
+
+  // determine the type of the relocation
+  unsigned Type;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented");
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_REL24;
+        break;
+      case MCSymbolRefExpr::VK_PLT:
+        Type = ELF::R_PPC_PLTREL24;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_brcond14:
+    case PPC::fixup_ppc_brcond14abs:
+      Type = ELF::R_PPC_REL14;
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_REL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC_REL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = ELF::R_PPC_REL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = ELF::R_PPC_REL16_HA;
+        break;
+      }
+      break;
+    case FK_Data_4:
+    case FK_PCRel_4:
+      Type = ELF::R_PPC_REL32;
+      break;
+    case FK_Data_8:
+    case FK_PCRel_8:
+      Type = ELF::R_PPC64_REL64;
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+    case PPC::fixup_ppc_br24abs:
+      Type = ELF::R_PPC_ADDR24;
+      break;
+    case PPC::fixup_ppc_brcond14abs:
+      Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_ADDR16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC_ADDR16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = ELF::R_PPC_ADDR16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = ELF::R_PPC_ADDR16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHER:
+        Type = ELF::R_PPC64_ADDR16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHERA:
+        Type = ELF::R_PPC64_ADDR16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHEST:
+        Type = ELF::R_PPC64_ADDR16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+        Type = ELF::R_PPC64_ADDR16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_PPC_GOT16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_LO:
+        Type = ELF::R_PPC_GOT16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_HI:
+        Type = ELF::R_PPC_GOT16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_HA:
+        Type = ELF::R_PPC_GOT16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC:
+        Type = ELF::R_PPC64_TOC16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_LO:
+        Type = ELF::R_PPC64_TOC16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_HI:
+        Type = ELF::R_PPC64_TOC16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_HA:
+        Type = ELF::R_PPC64_TOC16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL:
+        Type = ELF::R_PPC_TPREL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+        Type = ELF::R_PPC_TPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HI:
+        Type = ELF::R_PPC_TPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HA:
+        Type = ELF::R_PPC_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
+        Type = ELF::R_PPC64_TPREL16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
+        Type = ELF::R_PPC64_TPREL16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
+        Type = ELF::R_PPC64_TPREL16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
+        Type = ELF::R_PPC64_TPREL16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL:
+        Type = ELF::R_PPC64_DTPREL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
+        Type = ELF::R_PPC64_DTPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
+        Type = ELF::R_PPC64_DTPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
+        Type = ELF::R_PPC64_DTPREL16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA:
+        Type = ELF::R_PPC64_DTPREL16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST:
+        Type = ELF::R_PPC64_DTPREL16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA:
+        Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_GOT_TLSGD16;
+        else
+          Type = ELF::R_PPC_GOT_TLSGD16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO:
+        Type = ELF::R_PPC64_GOT_TLSGD16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_GOT_TLSLD16;
+        else
+          Type = ELF::R_PPC_GOT_TLSLD16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO:
+        Type = ELF::R_PPC64_GOT_TLSLD16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+        /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS.  */
+        Type = ELF::R_PPC64_GOT_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+        /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS.  */
+        Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI:
+        Type = ELF::R_PPC64_GOT_TPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+        /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS.  */
+        Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+        /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS.  */
+        Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA:
+        Type = ELF::R_PPC64_GOT_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI:
+        Type = ELF::R_PPC64_GOT_DTPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA:
+        Type = ELF::R_PPC64_GOT_DTPREL16_HA;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_half16ds:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_ADDR16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC64_ADDR16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_PPC64_GOT16_DS;
+	break;
+      case MCSymbolRefExpr::VK_PPC_GOT_LO:
+        Type = ELF::R_PPC64_GOT16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC:
+        Type = ELF::R_PPC64_TOC16_DS;
+	break;
+      case MCSymbolRefExpr::VK_PPC_TOC_LO:
+        Type = ELF::R_PPC64_TOC16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL:
+        Type = ELF::R_PPC64_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+        Type = ELF::R_PPC64_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL:
+        Type = ELF::R_PPC64_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+        Type = ELF::R_PPC64_GOT_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+        Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+        Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+        Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_nofixup:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TLSGD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLSGD;
+        else
+          Type = ELF::R_PPC_TLSGD;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TLSLD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLSLD;
+        else
+          Type = ELF::R_PPC_TLSLD;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TLS:
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLS;
+        else
+          Type = ELF::R_PPC_TLS;
+        break;
+      }
+      break;
+    case FK_Data_8:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TOCBASE:
+        Type = ELF::R_PPC64_TOC;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_ADDR64;
+	break;
+      case MCSymbolRefExpr::VK_PPC_DTPMOD:
+        Type = ELF::R_PPC64_DTPMOD64;
+	break;
+      case MCSymbolRefExpr::VK_PPC_TPREL:
+        Type = ELF::R_PPC64_TPREL64;
+	break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL:
+        Type = ELF::R_PPC64_DTPREL64;
+	break;
+      }
+      break;
+    case FK_Data_4:
+      Type = ELF::R_PPC_ADDR32;
+      break;
+    case FK_Data_2:
+      Type = ELF::R_PPC_ADDR16;
+      break;
+    }
+  }
+  return Type;
+}
+
+unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  return getRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                                 unsigned Type) const {
+  switch (Type) {
+    default:
+      return false;
+
+    case ELF::R_PPC_REL24:
+      // If the target symbol has a local entry point, we must keep the
+      // target symbol to preserve that information for the linker.
+      // The "other" values are stored in the last 6 bits of the second byte.
+      // The traditional defines for STO values assume the full byte and thus
+      // the shift to pack it.
+      unsigned Other = MCELF::getOther(SD) << 2;
+      return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0;
+  }
+}
+
+MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS,
+                                               bool Is64Bit,
+                                               bool IsLittleEndian,
+                                               uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
new file mode 100644
index 0000000..68de8c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -0,0 +1,56 @@
+//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PPC_PPCFIXUPKINDS_H
+#define LLVM_PPC_PPCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+enum Fixups {
+  // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
+  // and 'bl'.
+  fixup_ppc_br24 = FirstTargetFixupKind,
+  
+  /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14,
+  
+  /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches
+  /// like 'ba' and 'bla'.
+  fixup_ppc_br24abs,
+
+  /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14abs,
+
+  /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
+  /// or ha16(_foo) for instrs like 'li' or 'addis'.
+  fixup_ppc_half16,
+  
+  /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
+  /// implied 2 zero bits for instrs like 'std'.
+  fixup_ppc_half16ds,
+
+  /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
+  /// to __tls_get_addr for the TLS general and local dynamic models,
+  /// or inserts the thread-pointer register number.
+  fixup_ppc_nofixup,
+  
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
new file mode 100644
index 0000000..b95a2ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -0,0 +1,82 @@
+//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MCAsmInfoDarwin properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void PPCMCAsmInfoDarwin::anchor() { }
+
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = false;
+
+  CommentString = ";";
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  if (!is64Bit)
+    Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode.
+
+  AssemblerDialect = 1;           // New-Style mnemonics.
+  SupportsDebugInformation= true; // Debug information.
+
+  // The installed assembler for OSX < 10.6 lacks some directives.
+  // FIXME: this should really be a check on the assembler characteristics
+  // rather than OS version
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+    HasWeakDefCanBeHiddenDirective = false;
+
+  UseIntegratedAssembler = true;
+}
+
+void PPCLinuxMCAsmInfo::anchor() { }
+
+PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit, const Triple& T) {
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = T.getArch() == Triple::ppc64le;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "#";
+
+  // Uses '.section' before '.bss' directive
+  UsesELFSectionDirectiveForBSS = true;  
+
+  // Debug Information
+  SupportsDebugInformation = true;
+
+  DollarIsPC = true;
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+  MinInstAlignment = 4;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+    
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
+  AssemblerDialect = 1;           // New-Style mnemonics.
+
+  if (T.getOS() == llvm::Triple::FreeBSD ||
+      (T.getOS() == llvm::Triple::NetBSD && !is64Bit) ||
+      (T.getOS() == llvm::Triple::OpenBSD && !is64Bit))
+    UseIntegratedAssembler = true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
new file mode 100644
index 0000000..754330b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -0,0 +1,37 @@
+//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MCAsmInfoDarwin class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETASMINFO_H
+#define PPCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+  class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    void anchor() override;
+  public:
+    explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
+  };
+
+  class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit PPCLinuxMCAsmInfo(bool is64Bit, const Triple&);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
new file mode 100644
index 0000000..435a93f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -0,0 +1,322 @@
+//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class PPCMCCodeEmitter : public MCCodeEmitter {
+  PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+
+  const MCInstrInfo &MCII;
+  const MCContext &CTX;
+  bool IsLittleEndian;
+
+public:
+  PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool isLittle)
+    : MCII(mcii), CTX(ctx), IsLittleEndian(isLittle) {
+  }
+  
+  ~PPCMCCodeEmitter() {}
+
+  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+  unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override {
+    // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
+    // It's just a nop to keep the register classes happy, so don't
+    // generate anything.
+    unsigned Opcode = MI.getOpcode();
+    const MCInstrDesc &Desc = MCII.get(Opcode);
+    if (Opcode == TargetOpcode::COPY_TO_REGCLASS)
+      return;
+
+    uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+    // Output the constant in big/little endian byte order.
+    unsigned Size = Desc.getSize();
+    switch (Size) {
+    case 4:
+      if (IsLittleEndian) {
+        OS << (char)(Bits);
+        OS << (char)(Bits >> 8);
+        OS << (char)(Bits >> 16);
+        OS << (char)(Bits >> 24);
+      } else {
+        OS << (char)(Bits >> 24);
+        OS << (char)(Bits >> 16);
+        OS << (char)(Bits >> 8);
+        OS << (char)(Bits);
+      }
+      break;
+    case 8:
+      // If we emit a pair of instructions, the first one is
+      // always in the top 32 bits, even on little-endian.
+      if (IsLittleEndian) {
+        OS << (char)(Bits >> 32);
+        OS << (char)(Bits >> 40);
+        OS << (char)(Bits >> 48);
+        OS << (char)(Bits >> 56);
+        OS << (char)(Bits);
+        OS << (char)(Bits >> 8);
+        OS << (char)(Bits >> 16);
+        OS << (char)(Bits >> 24);
+      } else {
+        OS << (char)(Bits >> 56);
+        OS << (char)(Bits >> 48);
+        OS << (char)(Bits >> 40);
+        OS << (char)(Bits >> 32);
+        OS << (char)(Bits >> 24);
+        OS << (char)(Bits >> 16);
+        OS << (char)(Bits >> 8);
+        OS << (char)(Bits);
+      }
+      break;
+    default:
+      llvm_unreachable ("Invalid instruction size");
+    }
+    
+    ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+  }
+  
+};
+  
+} // end anonymous namespace
+  
+MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  Triple TT(STI.getTargetTriple());
+  bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
+  return new PPCMCCodeEmitter(MCII, Ctx, IsLittleEndian);
+}
+
+unsigned PPCMCCodeEmitter::
+getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_br24));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::
+getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_br24abs));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::
+getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14abs));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+  
+  // Add a fixup for the immediate field.
+  Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  // Encode (imm, reg) as a memri, which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16ds));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI);
+  
+  // Add a fixup for the TLS register, which simply provides a relocation
+  // hint to the linker that this statement is part of a relocation sequence.
+  // Return the thread-pointer register's encoding.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_nofixup));
+  Triple TT(STI.getTargetTriple());
+  bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le;
+  return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2);
+}
+
+unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  // For special TLS calls, we need two fixups; one for the branch target
+  // (__tls_get_addr), which we create via getDirectBrEncoding as usual,
+  // and one for the TLSGD or TLSLD symbol, which is emitted here.
+  const MCOperand &MO = MI.getOperand(OpNo+1);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_nofixup));
+  return getDirectBrEncoding(MI, OpNo, Fixups, STI);
+}
+
+unsigned PPCMCCodeEmitter::
+get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
+          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+
+unsigned PPCMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // The GPR operand should come through here though.
+    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
+            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
+           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+    return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  }
+  
+  assert(MO.isImm() &&
+         "Relocation required in an instruction that we cannot encode!");
+  return MO.getImm();
+}
+
+
+#include "PPCGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
new file mode 100644
index 0000000..3ac0aca
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -0,0 +1,133 @@
+//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppcmcexpr"
+
+const PPCMCExpr*
+PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                  bool isDarwin, MCContext &Ctx) {
+  return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin);
+}
+
+void PPCMCExpr::PrintImpl(raw_ostream &OS) const {
+  if (isDarwinSyntax()) {
+    switch (Kind) {
+    default: llvm_unreachable("Invalid kind!");
+    case VK_PPC_LO: OS << "lo16"; break;
+    case VK_PPC_HI: OS << "hi16"; break;
+    case VK_PPC_HA: OS << "ha16"; break;
+    }
+
+    OS << '(';
+    getSubExpr()->print(OS);
+    OS << ')';
+  } else {
+    getSubExpr()->print(OS);
+
+    switch (Kind) {
+    default: llvm_unreachable("Invalid kind!");
+    case VK_PPC_LO: OS << "@l"; break;
+    case VK_PPC_HI: OS << "@h"; break;
+    case VK_PPC_HA: OS << "@ha"; break;
+    case VK_PPC_HIGHER: OS << "@higher"; break;
+    case VK_PPC_HIGHERA: OS << "@highera"; break;
+    case VK_PPC_HIGHEST: OS << "@highest"; break;
+    case VK_PPC_HIGHESTA: OS << "@highesta"; break;
+    }
+  }
+}
+
+bool
+PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout) const {
+  MCValue Value;
+
+  if (!getSubExpr()->EvaluateAsRelocatable(Value, Layout))
+    return false;
+
+  if (Value.isAbsolute()) {
+    int64_t Result = Value.getConstant();
+    switch (Kind) {
+      default:
+        llvm_unreachable("Invalid kind!");
+      case VK_PPC_LO:
+        Result = Result & 0xffff;
+        break;
+      case VK_PPC_HI:
+        Result = (Result >> 16) & 0xffff;
+        break;
+      case VK_PPC_HA:
+        Result = ((Result + 0x8000) >> 16) & 0xffff;
+        break;
+      case VK_PPC_HIGHER:
+        Result = (Result >> 32) & 0xffff;
+        break;
+      case VK_PPC_HIGHERA:
+        Result = ((Result + 0x8000) >> 32) & 0xffff;
+        break;
+      case VK_PPC_HIGHEST:
+        Result = (Result >> 48) & 0xffff;
+        break;
+      case VK_PPC_HIGHESTA:
+        Result = ((Result + 0x8000) >> 48) & 0xffff;
+        break;
+    }
+    Res = MCValue::get(Result);
+  } else {
+    if (!Layout)
+      return false;
+
+    MCContext &Context = Layout->getAssembler().getContext();
+    const MCSymbolRefExpr *Sym = Value.getSymA();
+    MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+    if (Modifier != MCSymbolRefExpr::VK_None)
+      return false;
+    switch (Kind) {
+      default:
+        llvm_unreachable("Invalid kind!");
+      case VK_PPC_LO:
+        Modifier = MCSymbolRefExpr::VK_PPC_LO;
+        break;
+      case VK_PPC_HI:
+        Modifier = MCSymbolRefExpr::VK_PPC_HI;
+        break;
+      case VK_PPC_HA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HA;
+        break;
+      case VK_PPC_HIGHERA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA;
+        break;
+      case VK_PPC_HIGHER:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHER;
+        break;
+      case VK_PPC_HIGHEST:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHEST;
+        break;
+      case VK_PPC_HIGHESTA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA;
+        break;
+    }
+    Sym = MCSymbolRefExpr::Create(&Sym->getSymbol(), Modifier, Context);
+    Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
+  }
+
+  return true;
+}
+
+void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
new file mode 100644
index 0000000..bca4085
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -0,0 +1,96 @@
+//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCMCEXPR_H
+#define PPCMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class PPCMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_PPC_None,
+    VK_PPC_LO,
+    VK_PPC_HI,
+    VK_PPC_HA,
+    VK_PPC_HIGHER,
+    VK_PPC_HIGHERA,
+    VK_PPC_HIGHEST,
+    VK_PPC_HIGHESTA
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+  bool IsDarwin;
+
+  explicit PPCMCExpr(VariantKind _Kind, const MCExpr *_Expr,
+                     bool _IsDarwin)
+    : Kind(_Kind), Expr(_Expr), IsDarwin(_IsDarwin) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const PPCMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                 bool isDarwin, MCContext &Ctx);
+
+  static const PPCMCExpr *CreateLo(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return Create(VK_PPC_LO, Expr, isDarwin, Ctx);
+  }
+
+  static const PPCMCExpr *CreateHi(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return Create(VK_PPC_HI, Expr, isDarwin, Ctx);
+  }
+
+  static const PPCMCExpr *CreateHa(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return Create(VK_PPC_HA, Expr, isDarwin, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// isDarwinSyntax - True if expression is to be printed using Darwin syntax.
+  bool isDarwinSyntax() const { return IsDarwin; }
+
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const override;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  const MCSection *FindAssociatedSection() const override {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  // There are no TLS PPCMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
new file mode 100644
index 0000000..4c6780f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -0,0 +1,307 @@
+//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCTargetDesc.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "PPCMCAsmInfo.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "PPCGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "PPCGenRegisterInfo.inc"
+
+// Pin the vtable to this file.
+PPCTargetStreamer::~PPCTargetStreamer() {}
+PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+static MCInstrInfo *createPPCMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitPPCMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+                  TheTriple.getArch() == Triple::ppc64le);
+  unsigned Flavour = isPPC64 ? 0 : 1;
+  unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
+
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitPPCMCRegisterInfo(X, RA, Flavour, Flavour);
+  return X;
+}
+
+static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                 StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitPPCMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+                  TheTriple.getArch() == Triple::ppc64le);
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin())
+    MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple);
+  else
+    MAI = new PPCLinuxMCAsmInfo(isPPC64, TheTriple);
+
+  // Initial state of the frame pointer is R1.
+  unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  if (RM == Reloc::Default) {
+    Triple T(TT);
+    if (T.isOSDarwin())
+      RM = Reloc::DynamicNoPIC;
+    else
+      RM = Reloc::Static;
+  }
+  if (CM == CodeModel::Default) {
+    Triple T(TT);
+    if (!T.isOSDarwin() &&
+        (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le))
+      CM = CodeModel::Medium;
+  }
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+namespace {
+class PPCTargetAsmStreamer : public PPCTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+      : PPCTargetStreamer(S), OS(OS) {}
+  void emitTCEntry(const MCSymbol &S) override {
+    OS << "\t.tc ";
+    OS << S.getName();
+    OS << "[TC],";
+    OS << S.getName();
+    OS << '\n';
+  }
+  void emitMachine(StringRef CPU) override {
+    OS << "\t.machine " << CPU << '\n';
+  }
+  virtual void emitAbiVersion(int AbiVersion) override {
+    OS << "\t.abiversion " << AbiVersion << '\n';
+  }
+  virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) {
+    OS << "\t.localentry\t" << *S << ", " << *LocalOffset << '\n';
+  }
+};
+
+class PPCTargetELFStreamer : public PPCTargetStreamer {
+public:
+  PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
+  virtual void emitTCEntry(const MCSymbol &S) override {
+    // Creates a R_PPC64_TOC relocation
+    Streamer.EmitSymbolValue(&S, 8);
+  }
+  void emitMachine(StringRef CPU) override {
+    // FIXME: Is there anything to do in here or does this directive only
+    // limit the parser?
+  }
+  virtual void emitAbiVersion(int AbiVersion) override {
+    MCAssembler &MCA = getStreamer().getAssembler();
+    unsigned Flags = MCA.getELFHeaderEFlags();
+    Flags &= ~ELF::EF_PPC64_ABI;
+    Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
+    MCA.setELFHeaderEFlags(Flags);
+  }
+  virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) {
+    MCAssembler &MCA = getStreamer().getAssembler();
+    MCSymbolData &Data = getStreamer().getOrCreateSymbolData(S);
+
+    int64_t Res;
+    if (!LocalOffset->EvaluateAsAbsolute(Res, MCA))
+      report_fatal_error(".localentry expression must be absolute.");
+
+    unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res);
+    if (Res != ELF::decodePPC64LocalEntryOffset(Encoded))
+      report_fatal_error(".localentry expression cannot be encoded.");
+
+    // The "other" values are stored in the last 6 bits of the second byte.
+    // The traditional defines for STO values assume the full byte and thus
+    // the shift to pack it.
+    unsigned Other = MCELF::getOther(Data) << 2;
+    Other &= ~ELF::STO_PPC64_LOCAL_MASK;
+    Other |= Encoded;
+    MCELF::setOther(Data, Other >> 2);
+
+    // For GAS compatibility, unless we already saw a .abiversion directive,
+    // set e_flags to indicate ELFv2 ABI.
+    unsigned Flags = MCA.getELFHeaderEFlags();
+    if ((Flags & ELF::EF_PPC64_ABI) == 0)
+      MCA.setELFHeaderEFlags(Flags | 2);
+  }
+};
+
+class PPCTargetMachOStreamer : public PPCTargetStreamer {
+public:
+  PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+  void emitTCEntry(const MCSymbol &S) override {
+    llvm_unreachable("Unknown pseudo-op: .tc");
+  }
+  void emitMachine(StringRef CPU) override {
+    // FIXME: We should update the CPUType, CPUSubType in the Object file if
+    // the new values are different from the defaults.
+  }
+  virtual void emitAbiVersion(int AbiVersion) override {
+    llvm_unreachable("Unknown pseudo-op: .abiversion");
+  }
+  virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) {
+    llvm_unreachable("Unknown pseudo-op: .localentry");
+  }
+};
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  if (Triple(TT).isOSDarwin()) {
+    MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+    new PPCTargetMachOStreamer(*S);
+    return S;
+  }
+
+  MCStreamer *S =
+      createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  new PPCTargetELFStreamer(*S);
+  return S;
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useDwarfDirectory,
+                    MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                    MCAsmBackend *TAB, bool ShowInst) {
+
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+  new PPCTargetAsmStreamer(*S, OS);
+  return S;
+}
+
+static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI,
+                                             const MCSubtargetInfo &STI) {
+  bool isDarwin = Triple(STI.getTargetTriple()).isOSDarwin();
+  return new PPCInstPrinter(MAI, MII, MRI, isDarwin);
+}
+
+extern "C" void LLVMInitializePowerPCTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo);
+  RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo);  
+  RegisterMCAsmInfoFn E(ThePPC64LETarget, createPPCMCAsmInfo);  
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC64LETarget,
+                                        createPPCMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(ThePPC64LETarget,
+                                      createPPCMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(ThePPC64LETarget, createPPCMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target,
+                                          createPPCMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target,
+                                          createPPCMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(ThePPC64LETarget,
+                                          createPPCMCSubtargetInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC64LETarget,
+                                        createPPCMCCodeEmitter);
+  
+    // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(ThePPC64LETarget, createPPCAsmBackend);
+  
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC64LETarget, createMCStreamer);
+
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(ThePPC32Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePPC64Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePPC64LETarget, createMCAsmStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(ThePPC64LETarget,
+                                        createPPCMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
new file mode 100644
index 0000000..474395b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -0,0 +1,75 @@
+//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCMCTARGETDESC_H
+#define PPCMCTARGETDESC_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target ThePPC32Target;
+extern Target ThePPC64Target;
+extern Target ThePPC64LETarget;
+  
+MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
+
+/// createPPCELFObjectWriter - Construct an PPC ELF object writer.
+MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
+                                         bool Is64Bit,
+                                         bool IsLittleEndian,
+                                         uint8_t OSABI);
+/// createPPCELFObjectWriter - Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+} // End llvm namespace
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+// Defines symbolic names for PowerPC registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "PPCGenRegisterInfo.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "PPCGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
new file mode 100644
index 0000000..cff27ba
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -0,0 +1,389 @@
+//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup, MCValue Target,
+                                 unsigned Log2Size, uint64_t &FixedValue);
+
+  void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                           const MCAsmLayout &Layout,
+                           const MCFragment *Fragment, const MCFixup &Fixup,
+                           MCValue Target, uint64_t &FixedValue);
+
+public:
+  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                                 /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {
+    if (Writer->is64Bit()) {
+      report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
+    } else
+      RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+/// computes the log2 of the size of the relocation,
+/// used for relocation_info::r_length.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1:
+    return 0;
+  case FK_PCRel_2:
+  case FK_Data_2:
+    return 1;
+  case FK_PCRel_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_br24:
+  case FK_Data_4:
+    return 2;
+  case FK_PCRel_8:
+  case FK_Data_8:
+    return 3;
+  }
+  return 0;
+}
+
+/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
+/// Outline based on PPCELFObjectWriter::getRelocTypeInner().
+static unsigned getRelocType(const MCValue &Target,
+                             const MCFixupKind FixupKind, // from
+                                                          // Fixup.getKind()
+                             const bool IsPCRel) {
+  const MCSymbolRefExpr::VariantKind Modifier =
+      Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+                          : Target.getSymA()->getKind();
+  // determine the type of the relocation
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+  if (IsPCRel) { // relative to PC
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (relative)");
+    case PPC::fixup_ppc_br24:
+      Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
+      break;
+    case PPC::fixup_ppc_brcond14:
+      Type = MachO::PPC_RELOC_BR14;
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (absolute)!");
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16_SECTDIFF;
+        break;
+      }
+      break;
+    case FK_Data_4:
+      break;
+    case FK_Data_2:
+      break;
+    }
+  }
+  return Type;
+}
+
+static void makeRelocationInfo(MachO::any_relocation_info &MRE,
+                               const uint32_t FixupOffset, const uint32_t Index,
+                               const unsigned IsPCRel, const unsigned Log2Size,
+                               const unsigned IsExtern, const unsigned Type) {
+  MRE.r_word0 = FixupOffset;
+  // The bitfield offsets that work (as determined by trial-and-error)
+  // are different than what is documented in the mach-o manuals.
+  // This appears to be an endianness issue; reversing the order of the
+  // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+  // breaks x86/ARM assembly).
+  MRE.r_word1 = ((Index << 8) |    // was << 0
+                 (IsPCRel << 7) |  // was << 24
+                 (Log2Size << 5) | // was << 25
+                 (IsExtern << 4) | // was << 27
+                 (Type << 0));     // was << 28
+}
+
+static void
+makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
+                            const uint32_t Addr, const unsigned Type,
+                            const unsigned Log2Size, const unsigned IsPCRel,
+                            const uint32_t Value2) {
+  // For notes on bitfield positions and endianness, see:
+  // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
+  MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) |
+                 (IsPCRel << 30) | MachO::R_SCATTERED);
+  MRE.r_word1 = Value2;
+}
+
+/// Compute fixup offset (address).
+static uint32_t getFixupOffset(const MCAsmLayout &Layout,
+                               const MCFragment *Fragment,
+                               const MCFixup &Fixup) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  // On Mach-O, ppc_fixup_half16 relocations must refer to the
+  // start of the instruction, not the second halfword, as ELF does
+  if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+    FixupOffset &= ~uint32_t(3);
+  return FixupOffset;
+}
+
+/// \return false if falling back to using non-scattered relocation,
+/// otherwise true for normal scattered relocation.
+/// based on X86MachObjectWriter::RecordScatteredRelocation
+/// and ARMMachObjectWriter::RecordScatteredRelocation
+bool PPCMachObjectWriter::RecordScatteredRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    unsigned Log2Size, uint64_t &FixedValue) {
+  // caller already computes these, can we just pass and reuse?
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  const MCFixupKind FK = Fixup.getKind();
+  const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned Type = getRelocType(Target, FK, IsPCRel);
+
+  // Is this a local or SECTDIFF relocation entry?
+  // SECTDIFF relocation entries have symbol subtractions,
+  // and require two entries, the first for the add-symbol value,
+  // the second for the subtract-symbol value.
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr =
+      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // FIXME: is Type correct? see include/llvm/Support/MachO.h
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+  // FIXME: does FixedValue get used??
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::PPC_RELOC_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
+    // X86 had this piece, but ARM does not
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  Twine("Section too large, can't encode "
+                                        "r_address (") +
+                                      Buffer + ") into 24 bits of scattered "
+                                               "relocation entry.");
+      llvm_unreachable("fatal error returned?!");
+    }
+
+    // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
+    // see PPCMCExpr::EvaluateAsRelocatableImpl()
+    uint32_t other_half = 0;
+    switch (Type) {
+    case MachO::PPC_RELOC_LO16_SECTDIFF:
+      other_half = (FixedValue >> 16) & 0xffff;
+      // applyFixupOffset longer extracts the high part because it now assumes
+      // this was already done.
+      // It looks like this is not true for the FixedValue needed with Mach-O
+      // relocs.
+      // So we need to adjust FixedValue again here.
+      FixedValue &= 0xffff;
+      break;
+    case MachO::PPC_RELOC_HA16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue =
+          ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
+      break;
+    case MachO::PPC_RELOC_HI16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue = (FixedValue >> 16) & 0xffff;
+      break;
+    default:
+      llvm_unreachable("Invalid PPC scattered relocation type.");
+      break;
+    }
+
+    MachO::any_relocation_info MRE;
+    makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
+                                Log2Size, IsPCRel, Value2);
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff)
+      return false;
+  }
+  MachO::any_relocation_info MRE;
+  makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
+  Writer->addRelocation(Fragment->getParent(), MRE);
+  return true;
+}
+
+// see PPCELFObjectWriter for a general outline of cases
+void PPCMachObjectWriter::RecordPPCRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  const MCFixupKind FK = Fixup.getKind(); // unsigned
+  const unsigned Log2Size = getFixupKindLog2Size(FK);
+  const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB() &&
+      // Q: are branch targets ever scattered?
+      RelocType != MachO::PPC_RELOC_BR24 &&
+      RelocType != MachO::PPC_RELOC_BR14) {
+    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              Log2Size, FixedValue);
+    return;
+  }
+
+  // this doesn't seem right for RIT_PPC_BR24
+  // Get the symbol data, if any.
+  const MCSymbolData *SD = nullptr;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // See <reloc.h>.
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = RelocType;
+
+  if (Target.isAbsolute()) { // constant
+                             // SymbolNum of 0 indicates the absolute section.
+                             //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+    // the above line stolen from ARM, not sure
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD =
+          Asm.getSectionData(SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern,
+                     Type);
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(
+      new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
new file mode 100644
index 0000000..c2987b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -0,0 +1,86 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  case PPC::PRED_EQ: return PPC::PRED_NE;
+  case PPC::PRED_NE: return PPC::PRED_EQ;
+  case PPC::PRED_LT: return PPC::PRED_GE;
+  case PPC::PRED_GE: return PPC::PRED_LT;
+  case PPC::PRED_GT: return PPC::PRED_LE;
+  case PPC::PRED_LE: return PPC::PRED_GT;
+  case PPC::PRED_NU: return PPC::PRED_UN;
+  case PPC::PRED_UN: return PPC::PRED_NU;
+  case PPC::PRED_EQ_MINUS: return PPC::PRED_NE_PLUS;
+  case PPC::PRED_NE_MINUS: return PPC::PRED_EQ_PLUS;
+  case PPC::PRED_LT_MINUS: return PPC::PRED_GE_PLUS;
+  case PPC::PRED_GE_MINUS: return PPC::PRED_LT_PLUS;
+  case PPC::PRED_GT_MINUS: return PPC::PRED_LE_PLUS;
+  case PPC::PRED_LE_MINUS: return PPC::PRED_GT_PLUS;
+  case PPC::PRED_NU_MINUS: return PPC::PRED_UN_PLUS;
+  case PPC::PRED_UN_MINUS: return PPC::PRED_NU_PLUS;
+  case PPC::PRED_EQ_PLUS: return PPC::PRED_NE_MINUS;
+  case PPC::PRED_NE_PLUS: return PPC::PRED_EQ_MINUS;
+  case PPC::PRED_LT_PLUS: return PPC::PRED_GE_MINUS;
+  case PPC::PRED_GE_PLUS: return PPC::PRED_LT_MINUS;
+  case PPC::PRED_GT_PLUS: return PPC::PRED_LE_MINUS;
+  case PPC::PRED_LE_PLUS: return PPC::PRED_GT_MINUS;
+  case PPC::PRED_NU_PLUS: return PPC::PRED_UN_MINUS;
+  case PPC::PRED_UN_PLUS: return PPC::PRED_NU_MINUS;
+
+  // Simple predicates for single condition-register bits.
+  case PPC::PRED_BIT_SET:   return PPC::PRED_BIT_UNSET;
+  case PPC::PRED_BIT_UNSET: return PPC::PRED_BIT_SET;
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
+
+PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  case PPC::PRED_EQ: return PPC::PRED_EQ;
+  case PPC::PRED_NE: return PPC::PRED_NE;
+  case PPC::PRED_LT: return PPC::PRED_GT;
+  case PPC::PRED_GE: return PPC::PRED_LE;
+  case PPC::PRED_GT: return PPC::PRED_LT;
+  case PPC::PRED_LE: return PPC::PRED_GE;
+  case PPC::PRED_NU: return PPC::PRED_NU;
+  case PPC::PRED_UN: return PPC::PRED_UN;
+  case PPC::PRED_EQ_MINUS: return PPC::PRED_EQ_MINUS;
+  case PPC::PRED_NE_MINUS: return PPC::PRED_NE_MINUS;
+  case PPC::PRED_LT_MINUS: return PPC::PRED_GT_MINUS;
+  case PPC::PRED_GE_MINUS: return PPC::PRED_LE_MINUS;
+  case PPC::PRED_GT_MINUS: return PPC::PRED_LT_MINUS;
+  case PPC::PRED_LE_MINUS: return PPC::PRED_GE_MINUS;
+  case PPC::PRED_NU_MINUS: return PPC::PRED_NU_MINUS;
+  case PPC::PRED_UN_MINUS: return PPC::PRED_UN_MINUS;
+  case PPC::PRED_EQ_PLUS: return PPC::PRED_EQ_PLUS;
+  case PPC::PRED_NE_PLUS: return PPC::PRED_NE_PLUS;
+  case PPC::PRED_LT_PLUS: return PPC::PRED_GT_PLUS;
+  case PPC::PRED_GE_PLUS: return PPC::PRED_LE_PLUS;
+  case PPC::PRED_GT_PLUS: return PPC::PRED_LT_PLUS;
+  case PPC::PRED_LE_PLUS: return PPC::PRED_GE_PLUS;
+  case PPC::PRED_NU_PLUS: return PPC::PRED_NU_PLUS;
+  case PPC::PRED_UN_PLUS: return PPC::PRED_UN_PLUS;
+
+  case PPC::PRED_BIT_SET:
+  case PPC::PRED_BIT_UNSET:
+    llvm_unreachable("Invalid use of bit predicate code");
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
new file mode 100644
index 0000000..10e328a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -0,0 +1,68 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
+  enum Predicate {
+    PRED_LT       = (0 << 5) | 12,
+    PRED_LE       = (1 << 5) |  4,
+    PRED_EQ       = (2 << 5) | 12,
+    PRED_GE       = (0 << 5) |  4,
+    PRED_GT       = (1 << 5) | 12,
+    PRED_NE       = (2 << 5) |  4,
+    PRED_UN       = (3 << 5) | 12,
+    PRED_NU       = (3 << 5) |  4,
+    PRED_LT_MINUS = (0 << 5) | 14,
+    PRED_LE_MINUS = (1 << 5) |  6,
+    PRED_EQ_MINUS = (2 << 5) | 14,
+    PRED_GE_MINUS = (0 << 5) |  6,
+    PRED_GT_MINUS = (1 << 5) | 14,
+    PRED_NE_MINUS = (2 << 5) |  6,
+    PRED_UN_MINUS = (3 << 5) | 14,
+    PRED_NU_MINUS = (3 << 5) |  6,
+    PRED_LT_PLUS  = (0 << 5) | 15,
+    PRED_LE_PLUS  = (1 << 5) |  7,
+    PRED_EQ_PLUS  = (2 << 5) | 15,
+    PRED_GE_PLUS  = (0 << 5) |  7,
+    PRED_GT_PLUS  = (1 << 5) | 15,
+    PRED_NE_PLUS  = (2 << 5) |  7,
+    PRED_UN_PLUS  = (3 << 5) | 15,
+    PRED_NU_PLUS  = (3 << 5) |  7,
+
+    // When dealing with individual condition-register bits, we have simple set
+    // and unset predicates.
+    PRED_BIT_SET =   1024,
+    PRED_BIT_UNSET = 1025
+  };
+  
+  /// Invert the specified predicate.  != -> ==, < -> >=.
+  Predicate InvertPredicate(Predicate Opcode);
+
+  /// Assume the condition register is set by MI(a,b), return the predicate if
+  /// we modify the instructions such that condition register is set by MI(b,a).
+  Predicate getSwappedPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h
new file mode 100644
index 0000000..ba5fa4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,105 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_H
+#define LLVM_TARGET_POWERPC_H
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include <string>
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+  class PPCTargetMachine;
+  class PassRegistry;
+  class FunctionPass;
+  class ImmutablePass;
+  class JITCodeEmitter;
+  class MachineInstr;
+  class AsmPrinter;
+  class MCInst;
+
+  FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+#ifndef NDEBUG
+  FunctionPass *createPPCCTRLoopsVerify();
+#endif
+  FunctionPass *createPPCEarlyReturnPass();
+  FunctionPass *createPPCVSXCopyPass();
+  FunctionPass *createPPCVSXCopyCleanupPass();
+  FunctionPass *createPPCVSXFMAMutatePass();
+  FunctionPass *createPPCBranchSelectionPass();
+  FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+  FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                            JITCodeEmitter &MCE);
+  void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                    AsmPrinter &AP, bool isDarwin);
+
+  /// \brief Creates an PPC-specific Target Transformation Info pass.
+  ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
+
+  void initializePPCVSXFMAMutatePass(PassRegistry&);
+  extern char &PPCVSXFMAMutateID;
+
+  namespace PPCII {
+    
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // PPC Specific MachineOperand flags.
+    MO_NO_FLAG,
+    
+    /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$stub" or "FOO@plt" symbol.  This is
+    /// used for calls and jumps to external functions on Tiger and earlier, and
+    /// for PIC calls on Linux and ELF systems.
+    MO_PLT_OR_STUB = 1,
+    
+    /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
+    /// the function's picbase, e.g. lo16(symbol-picbase).
+    MO_PIC_FLAG = 2,
+
+    /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
+    /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
+    MO_NLP_FLAG = 4,
+    
+    /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
+    /// symbol with hidden visibility.  This causes a different kind of
+    /// non-lazy-pointer to be generated.
+    MO_NLP_HIDDEN_FLAG = 8,
+
+    /// The next are not flags but distinct values.
+    MO_ACCESS_MASK = 0xf0,
+
+    /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
+    MO_LO = 1 << 4,
+    MO_HA = 2 << 4,
+
+    MO_TPREL_LO = 4 << 4,
+    MO_TPREL_HA = 3 << 4,
+
+    /// These values identify relocations on immediates folded
+    /// into memory operations.
+    MO_DTPREL_LO = 5 << 4,
+    MO_TLSLD_LO  = 6 << 4,
+    MO_TOC_LO    = 7 << 4,
+
+    // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
+    MO_TLS       = 8 << 4
+  };
+  } // end namespace PPCII
+  
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td
new file mode 100644
index 0000000..a9842b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,344 @@
+//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+ 
+//===----------------------------------------------------------------------===//
+// CPU Directives                                                             //
+//===----------------------------------------------------------------------===//
+
+def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">;
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+def DirectiveA2  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+                                       "PPC::DIR_E500mc", "">;
+def DirectiveE5500  : SubtargetFeature<"", "DarwinDirective", 
+                                       "PPC::DIR_E5500", "">;
+def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">;
+def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">;
+def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">;
+def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
+def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
+def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
+def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
+def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">;
+
+def Feature64Bit     : SubtargetFeature<"64bit","Has64BitSupport", "true",
+                                        "Enable 64-bit instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+                              "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureCRBits    : SubtargetFeature<"crbits", "UseCRBits", "true",
+                              "Use condition-register bits individually">;
+def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
+                                        "Enable Altivec instructions">;
+def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
+                                        "Enable the MFOCRF instruction">;
+def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+                                        "Enable the fsqrt instruction">;
+def FeatureFCPSGN    : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
+                                        "Enable the fcpsgn instruction">;
+def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
+                                        "Enable the fre instruction">;
+def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
+                                        "Enable the fres instruction">;
+def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
+                                        "Enable the frsqrte instruction">;
+def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
+                                        "Enable the frsqrtes instruction">;
+def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
+                              "Assume higher precision reciprocal estimates">;
+def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+                                        "Enable the stfiwx instruction">;
+def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
+                                        "Enable the lfiwax instruction">;
+def FeatureFPRND     : SubtargetFeature<"fprnd", "HasFPRND", "true",
+                                        "Enable the fri[mnpz] instructions">;
+def FeatureFPCVT     : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
+  "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions">;
+def FeatureISEL      : SubtargetFeature<"isel","HasISEL", "true",
+                                        "Enable the isel instruction">;
+def FeaturePOPCNTD   : SubtargetFeature<"popcntd","HasPOPCNTD", "true",
+                                        "Enable the popcnt[dw] instructions">;
+def FeatureLDBRX     : SubtargetFeature<"ldbrx","HasLDBRX", "true",
+                                        "Enable the ldbrx instruction">;
+def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
+                                        "Enable Book E instructions">;
+def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
+                                        "Enable QPX instructions">;
+def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
+                                        "Enable VSX instructions",
+                                        [FeatureAltivec]>;
+
+def DeprecatedMFTB   : SubtargetFeature<"", "DeprecatedMFTB", "true",
+                                        "Treat mftb as deprecated">;
+def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
+  "Treat vector data stream cache control instructions as deprecated">;
+
+// Note: Future features to add when support is extended to more
+// recent ISA levels:
+//
+// CMPB         p6, p6x, p7        cmpb
+// DFP          p6, p6x, p7        decimal floating-point instructions
+// POPCNTB      p5 through p7      popcntb and related instructions
+// VSX          p7                 vector-scalar instruction set
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// RecFormRel - Filter class used to relate non-record-form instructions with
+// their record-form variants.
+class RecFormRel;
+
+// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX
+// FMA instruction forms with their corresponding factor-killing forms.
+class AltVSXFMARel {
+  bit IsVSXFMAAlt = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Relation Map Definitions.
+//===----------------------------------------------------------------------===//
+
+def getRecordFormOpcode : InstrMapping {
+  let FilterClass = "RecFormRel";
+  // Instructions with the same BaseName and Interpretation64Bit values
+  // form a row.
+  let RowFields = ["BaseName", "Interpretation64Bit"];
+  // Instructions with the same RC value form a column.
+  let ColFields = ["RC"];
+  // The key column are the non-record-form instructions.
+  let KeyCol = ["0"];
+  // Value columns RC=1
+  let ValueCols = [["1"]];
+}
+
+def getNonRecordFormOpcode : InstrMapping {
+  let FilterClass = "RecFormRel";
+  // Instructions with the same BaseName and Interpretation64Bit values
+  // form a row.
+  let RowFields = ["BaseName", "Interpretation64Bit"];
+  // Instructions with the same RC value form a column.
+  let ColFields = ["RC"];
+  // The key column are the record-form instructions.
+  let KeyCol = ["1"];
+  // Value columns are RC=0
+  let ValueCols = [["0"]];
+}
+
+def getAltVSXFMAOpcode : InstrMapping {
+  let FilterClass = "AltVSXFMARel";
+  // Instructions with the same BaseName and Interpretation64Bit values
+  // form a row.
+  let RowFields = ["BaseName"];
+  // Instructions with the same RC value form a column.
+  let ColFields = ["IsVSXFMAAlt"];
+  // The key column are the (default) addend-killing instructions.
+  let KeyCol = ["0"];
+  // Value columns IsVSXFMAAlt=1
+  let ValueCols = [["1"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+include "PPCInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+def : Processor<"generic", G3Itineraries, [Directive32]>;
+def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
+                                          FeatureFRES, FeatureFRSQRTE,
+                                          FeatureBookE, DeprecatedMFTB]>;
+def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
+                                          FeatureFRES, FeatureFRSQRTE,
+                                          FeatureBookE, DeprecatedMFTB]>;
+def : Processor<"601", G3Itineraries, [Directive601]>;
+def : Processor<"602", G3Itineraries, [Directive602]>;
+def : Processor<"603", G3Itineraries, [Directive603,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603e", G3Itineraries, [Directive603,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603ev", G3Itineraries, [Directive603,
+                                         FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604", G3Itineraries, [Directive604,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604e", G3Itineraries, [Directive604,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"620", G3Itineraries, [Directive620,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"750", G4Itineraries, [Directive750,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g3", G3Itineraries, [Directive750,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                            FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                           FeatureFRES, FeatureFRSQRTE]>;
+def : ProcessorModel<"970", G5Model,
+                  [Directive970, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt,
+                   FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"g5", G5Model,
+                  [Directive970, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureFRES, FeatureFRSQRTE,
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+                  [DirectiveE500mc, FeatureMFOCRF,
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
+                   DeprecatedMFTB]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+                  [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
+                   DeprecatedMFTB]>;
+def : ProcessorModel<"a2", PPCA2Model,
+                  [DirectiveA2, FeatureBookE, FeatureMFOCRF,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
+               /*, Feature64BitRegs */, DeprecatedMFTB]>;
+def : ProcessorModel<"a2q", PPCA2Model,
+                  [DirectiveA2, FeatureBookE, FeatureMFOCRF,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
+               /*, Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>;
+def : ProcessorModel<"pwr3", G5Model,
+                  [DirectivePwr3, FeatureAltivec,
+                   FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
+                   FeatureSTFIWX, Feature64Bit]>;
+def : ProcessorModel<"pwr4", G5Model,
+                  [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
+                   FeatureSTFIWX, Feature64Bit]>;
+def : ProcessorModel<"pwr5", G5Model,
+                  [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr5x", G5Model,
+                  [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr6", G5Model,
+                  [DirectivePwr6, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr6x", G5Model,
+                  [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr7", P7Model,
+                  [DirectivePwr7, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX,
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr8", P7Model /* FIXME: Update to P8Model when available */,
+                  [DirectivePwr8, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX,
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
+def : Processor<"ppc", G3Itineraries, [Directive32]>;
+def : ProcessorModel<"ppc64", G5Model,
+                  [Directive64, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
+                   FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"ppc64le", G5Model,
+                  [Directive64, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
+                   FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo {
+  let isLittleEndianEncoding = 1;
+
+  // FIXME: Unset this when no longer needed!
+  let decodePositionallyEncodedOperands = 1;
+
+  let noNamedPositionallyEncodedOperands = 1;
+}
+
+def PPCAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+def PPCAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // We do not use hard coded registers in asm strings.  However, some
+  // InstAlias definitions use immediate literals.  Set RegisterPrefix
+  // so that those are not misinterpreted as registers.
+  string RegisterPrefix = "%";
+}
+
+def PPC : Target {
+  // Information about the instructions.
+  let InstructionSet = PPCInstrInfo;
+
+  let AssemblyParsers = [PPCAsmParser];
+  let AssemblyParserVariants = [PPCAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
new file mode 100644
index 0000000..1384022
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -0,0 +1,1397 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "PPCMachineFunctionInfo.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asmprinter"
+
+namespace {
+  class PPCAsmPrinter : public AsmPrinter {
+  protected:
+    MapVector<MCSymbol*, MCSymbol*> TOC;
+    const PPCSubtarget &Subtarget;
+    uint64_t TOCLabelID;
+  public:
+    explicit PPCAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        Subtarget(TM.getSubtarget<PPCSubtarget>()), TOCLabelID(0) {}
+
+    const char *getPassName() const override {
+      return "PowerPC Assembly Printer";
+    }
+
+    MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
+
+    void EmitInstruction(const MachineInstr *MI) override;
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) override;
+  };
+
+  /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
+  class PPCLinuxAsmPrinter : public PPCAsmPrinter {
+  public:
+    explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : PPCAsmPrinter(TM, Streamer) {}
+
+    const char *getPassName() const override {
+      return "Linux PPC Assembly Printer";
+    }
+
+    bool doFinalization(Module &M) override;
+    void EmitStartOfAsmFile(Module &M) override;
+
+    void EmitFunctionEntryLabel() override;
+
+    void EmitFunctionBodyStart() override;
+    void EmitFunctionBodyEnd() override;
+  };
+
+  /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
+  /// OS X
+  class PPCDarwinAsmPrinter : public PPCAsmPrinter {
+  public:
+    explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : PPCAsmPrinter(TM, Streamer) {}
+
+    const char *getPassName() const override {
+      return "Darwin PPC Assembly Printer";
+    }
+
+    bool doFinalization(Module &M) override;
+    void EmitStartOfAsmFile(Module &M) override;
+
+    void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs);
+  };
+} // end of anonymous namespace
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left.  Used by for linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  switch (RegName[0]) {
+    case 'r':
+    case 'f':
+    case 'v':
+      if (RegName[1] == 's')
+        return RegName + 2;
+      return RegName + 1;
+    case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+  
+  return RegName;
+}
+
+void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  const DataLayout *DL = TM.getDataLayout();
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
+    // Linux assembler (Others?) does not take register mnemonics.
+    // FIXME - What about special registers used in mfspr/mtspr?
+    if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+    O << RegName;
+    return;
+  }
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    return;
+  case MachineOperand::MO_BlockAddress:
+    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    // Computing the address of a global symbol, not calling it.
+    const GlobalValue *GV = MO.getGlobal();
+    MCSymbol *SymToPrint;
+
+    // External or weakly linked global variables need non-lazily-resolved stubs
+    if (TM.getRelocationModel() != Reloc::Static &&
+        (GV->isDeclaration() || GV->isWeakForLinker())) {
+      if (!GV->hasHiddenVisibility()) {
+        SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+        MachineModuleInfoImpl::StubValueTy &StubSym = 
+          MMI->getObjFileInfo<MachineModuleInfoMachO>()
+            .getGVStubEntry(SymToPrint);
+        if (!StubSym.getPointer())
+          StubSym = MachineModuleInfoImpl::
+            StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
+      } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
+                 GV->hasAvailableExternallyLinkage()) {
+        SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+        
+        MachineModuleInfoImpl::StubValueTy &StubSym = 
+          MMI->getObjFileInfo<MachineModuleInfoMachO>().
+                    getHiddenGVStubEntry(SymToPrint);
+        if (!StubSym.getPointer())
+          StubSym = MachineModuleInfoImpl::
+            StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
+      } else {
+        SymToPrint = getSymbol(GV);
+      }
+    } else {
+      SymToPrint = getSymbol(GV);
+    }
+    
+    O << *SymToPrint;
+
+    printOffset(MO.getOffset(), O);
+    return;
+  }
+
+  default:
+    O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
+    return;
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'c': // Don't print "$" before a global var name or constant.
+      break; // PPC never has a prefix.
+    case 'L': // Write second word of DImode reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isReg() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isReg())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        O << "i";
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+// At the moment, all inline asm memory operands are a single register.
+// In any case, the output of this routine should always be just one
+// assembler operand.
+
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                          unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'y': // A memory reference for an X-form instruction
+      {
+        const char *RegName = "r0";
+        if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+        O << RegName << ", ";
+        printOperand(MI, OpNo, O);
+        return false;
+      }
+    }
+  }
+
+  assert(MI->getOperand(OpNo).isReg());
+  O << "0(";
+  printOperand(MI, OpNo, O);
+  O << ")";
+  return false;
+}
+
+
+/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
+/// exists for it.  If not, create one.  Then return a symbol that references
+/// the TOC entry.
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
+  const DataLayout *DL = TM.getDataLayout();
+  MCSymbol *&TOCEntry = TOC[Sym];
+
+  // To avoid name clash check if the name already exists.
+  while (!TOCEntry) {
+    if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) +
+                                "C" + Twine(TOCLabelID++)) == nullptr) {
+      TOCEntry = GetTempSymbol("C", TOCLabelID);
+    }
+  }
+
+  return TOCEntry;
+}
+
+
+/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
+/// the current output stream.
+///
+void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst TmpInst;
+  bool isPPC64 = Subtarget.isPPC64();
+  
+  // Lower multi-instruction pseudo operations.
+  switch (MI->getOpcode()) {
+  default: break;
+  case TargetOpcode::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+  case PPC::MovePCtoLR:
+  case PPC::MovePCtoLR8: {
+    // Transform %LR = MovePCtoLR
+    // Into this, where the label is the PIC base: 
+    //     bl L1$pb
+    // L1$pb:
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+    
+    // Emit the 'bl'.
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL)
+      // FIXME: We would like an efficient form for this, so we don't have to do
+      // a lot of extra uniquing.
+      .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+    
+    // Emit the label.
+    OutStreamer.EmitLabel(PICBase);
+    return;
+  }
+  case PPC::GetGBRO: {
+    // Get the offset from the GOT Base Register to the GOT
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+    MCSymbol *PICOffset = MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+    TmpInst.setOpcode(PPC::LWZ);
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+    const MCExpr *PB =
+      MCSymbolRefExpr::Create(MF->getPICBaseSymbol(),
+                              MCSymbolRefExpr::VK_None,
+                              OutContext);
+    const MCOperand MO = TmpInst.getOperand(1);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(MCBinaryExpr::CreateSub(Exp,
+                                                                          PB,
+                                                                          OutContext));
+    TmpInst.addOperand(MO);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::UpdateGBR: {
+    // Update the GOT Base Register to point to the GOT.  It may be possible to
+    // merge this with the PPC::GetGBRO, doing it all in one step.
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+    TmpInst.setOpcode(PPC::ADD4);
+    TmpInst.addOperand(TmpInst.getOperand(0));
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LWZtoc: {
+    // Transform %X3 = LWZtoc <ga:@min1>, %X2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LWZ, and the global address operand to be a
+    // reference to the GOT entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LWZ);
+    const MachineOperand &MO = MI->getOperand(1);
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+
+    MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_None,
+                              OutContext);
+    const MCExpr *PB =
+      MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")),
+                                                           OutContext);
+    Exp = MCBinaryExpr::CreateSub(Exp, PB, OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LDtocJTI:
+  case PPC::LDtocCPT:
+  case PPC::LDtoc: {
+    // Transform %X3 = LDtoc <ga:@min1>, %X2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LD, and the global address operand to be a
+    // reference to the TOC entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+
+    MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+      
+  case PPC::ADDIStocHA: {
+    // Transform %Xd = ADDIStocHA %X2, <ga:@sym>
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to ADDIS8.  If the global address is external, has
+    // common linkage, is a non-local function address, or is a jump table
+    // address, then generate a TOC entry and reference that.  Otherwise
+    // reference the symbol directly.
+    TmpInst.setOpcode(PPC::ADDIS8);
+    const MachineOperand &MO = MI->getOperand(2);
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) &&
+           "Invalid operand for ADDIStocHA!");
+    MCSymbol *MOSymbol = nullptr;
+    bool IsExternal = false;
+    bool IsNonLocalFunction = false;
+    bool IsCommon = false;
+    bool IsAvailExt = false;
+
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      MOSymbol = getSymbol(GV);
+      IsExternal = GV->isDeclaration();
+      IsCommon = GV->hasCommonLinkage();
+      IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
+        (GV->isDeclaration() || GV->isWeakForLinker());
+      IsAvailExt = GV->hasAvailableExternallyLinkage();
+    } else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+
+    if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt ||
+        MO.isJTI() || TM.getCodeModel() == CodeModel::Large)
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
+                              OutContext);
+    TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LDtocL: {
+    // Transform %Xd = LDtocL <ga:@sym>, %Xs
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LD.  If the global address is external, has
+    // common linkage, or is a jump table address, then reference the
+    // associated TOC entry.  Otherwise reference the symbol directly.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+    assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) &&
+           "Invalid operand for LDtocL!");
+    MCSymbol *MOSymbol = nullptr;
+
+    if (MO.isJTI())
+      MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
+    else if (MO.isCPI()) {
+      MOSymbol = GetCPISymbol(MO.getIndex());
+      if (TM.getCodeModel() == CodeModel::Large)
+        MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
+    else if (MO.isGlobal()) {
+      const GlobalValue *GValue = MO.getGlobal();
+      MOSymbol = getSymbol(GValue);
+      if (GValue->getType()->getElementType()->isFunctionTy() ||
+          GValue->isDeclaration() || GValue->hasCommonLinkage() ||
+          GValue->hasAvailableExternallyLinkage() ||
+          TM.getCodeModel() == CodeModel::Large)
+        MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::ADDItocL: {
+    // Transform %Xd = ADDItocL %Xs, <ga:@sym>
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to ADDI8.  If the global address is external, then
+    // generate a TOC entry and reference that.  Otherwise reference the
+    // symbol directly.
+    TmpInst.setOpcode(PPC::ADDI8);
+    const MachineOperand &MO = MI->getOperand(2);
+    assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
+    MCSymbol *MOSymbol = nullptr;
+    bool IsExternal = false;
+    bool IsNonLocalFunction = false;
+
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      MOSymbol = getSymbol(GV);
+      IsExternal = GV->isDeclaration();
+      IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
+        (GV->isDeclaration() || GV->isWeakForLinker());
+    } else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+
+    if (IsNonLocalFunction || IsExternal ||
+        TM.getCodeModel() == CodeModel::Large)
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+                              OutContext);
+    TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::ADDISgotTprelHA: {
+    // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTprel =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
+                              OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X2)
+                                .addExpr(SymGotTprel));
+    return;
+  }
+  case PPC::LDgotTprelL:
+  case PPC::LDgotTprelL32: {
+    // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LD.
+    TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
+    const MachineOperand &MO = MI->getOperand(1);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+
+  case PPC::PPC32PICGOT: {
+    MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    MCSymbol *GOTRef = OutContext.CreateTempSymbol();
+    MCSymbol *NextInstr = OutContext.CreateTempSymbol();
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL)
+      // FIXME: We would like an efficient form for this, so we don't have to do
+      // a lot of extra uniquing.
+      .addExpr(MCSymbolRefExpr::Create(NextInstr, OutContext)));
+    const MCExpr *OffsExpr =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol, OutContext),
+                                MCSymbolRefExpr::Create(GOTRef, OutContext),
+        OutContext);
+    OutStreamer.EmitLabel(GOTRef);
+    OutStreamer.EmitValue(OffsExpr, 4);
+    OutStreamer.EmitLabel(NextInstr);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR)
+                                .addReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LWZ)
+                                .addReg(MI->getOperand(1).getReg())
+                                .addImm(0)
+                                .addReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADD4)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(1).getReg())
+                                .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+  case PPC::PPC32GOT: {
+    MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    const MCExpr *SymGotTlsL =
+      MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO,
+                              OutContext);
+    const MCExpr *SymGotTlsHA =                               
+      MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA,
+                              OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addExpr(SymGotTlsL));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(0).getReg())
+                                .addExpr(SymGotTlsHA));
+    return;
+  }
+  case PPC::ADDIStlsgdHA: {
+    // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsGD =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
+                              OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X2)
+                                .addExpr(SymGotTlsGD));
+    return;
+  }
+  case PPC::ADDItlsgdL:
+    // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@got@tlsgd@l
+  case PPC::ADDItlsgdL32: {
+    // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@got@tlsgd
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsGD =
+      MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO :
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
+                              OutContext);
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymGotTlsGD));
+    return;
+  }
+  case PPC::GETtlsADDR:
+    // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
+    // Into:      BL8_NOP_TLS __tls_get_addr(sym@tlsgd)
+  case PPC::GETtlsADDR32: {
+    // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
+    // Into:      BL_TLS __tls_get_addr(sym@tlsgd)@PLT
+
+    StringRef Name = "__tls_get_addr";
+    MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
+    MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+
+    if (!Subtarget.isPPC64() && !Subtarget.isDarwin() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      Kind = MCSymbolRefExpr::VK_PLT;
+    const MCSymbolRefExpr *TlsRef = 
+      MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext);
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymVar =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
+                              OutContext);
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ?
+                                  PPC::BL8_NOP_TLS : PPC::BL_TLS)
+                   .addExpr(TlsRef)
+                   .addExpr(SymVar));
+    return;
+  }
+  case PPC::ADDIStlsldHA: {
+    // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsld@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsLD =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
+                              OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X2)
+                                .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::ADDItlsldL:
+    // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@got@tlsld@l
+  case PPC::ADDItlsldL32: {
+    // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@got@tlsld
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsLD =
+      MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO :
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
+                              OutContext);
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::GETtlsldADDR:
+    // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
+    // Into:      BL8_NOP_TLS __tls_get_addr(sym@tlsld)
+  case PPC::GETtlsldADDR32: {
+    // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
+    // Into:      BL_TLS __tls_get_addr(sym@tlsld)@PLT
+
+    StringRef Name = "__tls_get_addr";
+    MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
+    MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+
+    if (!Subtarget.isPPC64() && !Subtarget.isDarwin() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      Kind = MCSymbolRefExpr::VK_PLT;
+
+    const MCSymbolRefExpr *TlsRef = 
+      MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext);
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymVar =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
+                              OutContext);
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ?
+                                  PPC::BL8_NOP_TLS : PPC::BL_TLS)
+                   .addExpr(TlsRef)
+                   .addExpr(SymVar));
+    return;
+  }
+  case PPC::ADDISdtprelHA:
+    // Transform: %Xd = ADDISdtprelHA %X3, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X3, sym@dtprel@ha
+  case PPC::ADDISdtprelHA32: {
+    // Transform: %Rd = ADDISdtprelHA32 %R3, <ga:@sym>
+    // Into:      %Rd = ADDIS %R3, sym@dtprel@ha
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymDtprel =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
+                              OutContext);
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(Subtarget.isPPC64() ? PPC::X3 : PPC::R3)
+                   .addExpr(SymDtprel));
+    return;
+  }
+  case PPC::ADDIdtprelL:
+    // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@dtprel@l
+  case PPC::ADDIdtprelL32: {
+    // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@dtprel@l
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymDtprel =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
+                              OutContext);
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymDtprel));
+    return;
+  }
+  case PPC::MFOCRF:
+  case PPC::MFOCRF8:
+    if (!Subtarget.hasMFOCRF()) {
+      // Transform: %R3 = MFOCRF %CR7
+      // Into:      %R3 = MFCR   ;; cr7
+      unsigned NewOpcode =
+        MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
+      OutStreamer.AddComment(PPCInstPrinter::
+                             getRegisterName(MI->getOperand(1).getReg()));
+      EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode)
+                                  .addReg(MI->getOperand(0).getReg()));
+      return;
+    }
+    break;
+  case PPC::MTOCRF:
+  case PPC::MTOCRF8:
+    if (!Subtarget.hasMFOCRF()) {
+      // Transform: %CR7 = MTOCRF %R3
+      // Into:      MTCRF mask, %R3 ;; cr7
+      unsigned NewOpcode =
+        MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
+      unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
+                              ->getEncodingValue(MI->getOperand(0).getReg());
+      OutStreamer.AddComment(PPCInstPrinter::
+                             getRegisterName(MI->getOperand(0).getReg()));
+      EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode)
+                                  .addImm(Mask)
+                                  .addReg(MI->getOperand(1).getReg()));
+      return;
+    }
+    break;
+  case PPC::LD:
+  case PPC::STD:
+  case PPC::LWA_32:
+  case PPC::LWA: {
+    // Verify alignment is legal, so we don't create relocations
+    // that can't be supported.
+    // FIXME:  This test is currently disabled for Darwin.  The test
+    // suite shows a handful of test cases that fail this check for
+    // Darwin.  Those need to be investigated before this sanity test
+    // can be enabled for those subtargets.
+    if (!Subtarget.isDarwin()) {
+      unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+        llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+    }
+    // Now process the instruction normally.
+    break;
+  }
+  }
+
+  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+  EmitToStreamer(OutStreamer, TmpInst);
+}
+
+void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget.isELFv2ABI()) {
+    PPCTargetStreamer *TS =
+      static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
+
+    if (TS)
+      TS->emitAbiVersion(2);
+  }
+
+  if (Subtarget.isPPC64() || TM.getRelocationModel() != Reloc::PIC_)
+    return AsmPrinter::EmitStartOfAsmFile(M);
+
+  // FIXME: The use of .got2 assumes large GOT model (-fPIC), which is not
+  // optimal for some cases.  We should consider supporting small model (-fpic)
+  // as well in the future.
+  assert(TM.getCodeModel() != CodeModel::Small &&
+         "Small code model PIC is currently unsupported.");
+  OutStreamer.SwitchSection(OutContext.getELFSection(".got2",
+         ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+         SectionKind::getReadOnly()));
+
+  MCSymbol *TOCSym = OutContext.GetOrCreateSymbol(Twine(".L.TOC."));
+  MCSymbol *CurrentPos = OutContext.CreateTempSymbol();
+
+  OutStreamer.EmitLabel(CurrentPos);
+
+  // The GOT pointer points to the middle of the GOT, in order to reference the
+  // entire 64kB range.  0x8000 is the midpoint.
+  const MCExpr *tocExpr =
+    MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(CurrentPos, OutContext),
+                            MCConstantExpr::Create(0x8000, OutContext),
+                            OutContext);
+
+  OutStreamer.EmitAssignment(TOCSym, tocExpr);
+
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
+  // linux/ppc32 - Normal entry label.
+  if (!Subtarget.isPPC64() && TM.getRelocationModel() != Reloc::PIC_)
+    return AsmPrinter::EmitFunctionEntryLabel();
+
+  if (!Subtarget.isPPC64()) {
+    const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
+  	if (PPCFI->usesPICBase()) {
+      MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
+      MCSymbol *PICBase = MF->getPICBaseSymbol();
+      OutStreamer.EmitLabel(RelocSymbol);
+
+      const MCExpr *OffsExpr =
+        MCBinaryExpr::CreateSub(
+          MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")),
+                                                               OutContext),
+                                  MCSymbolRefExpr::Create(PICBase, OutContext),
+          OutContext);
+      OutStreamer.EmitValue(OffsExpr, 4);
+      OutStreamer.EmitLabel(CurrentFnSym);
+      return;
+    } else
+      return AsmPrinter::EmitFunctionEntryLabel();
+  }
+
+  // ELFv2 ABI - Normal entry label.
+  if (Subtarget.isELFv2ABI())
+    return AsmPrinter::EmitFunctionEntryLabel();
+
+  // Emit an official procedure descriptor.
+  MCSectionSubPair Current = OutStreamer.getCurrentSection();
+  const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd",
+      ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+      SectionKind::getReadOnly());
+  OutStreamer.SwitchSection(Section);
+  OutStreamer.EmitLabel(CurrentFnSym);
+  OutStreamer.EmitValueToAlignment(8);
+  MCSymbol *Symbol1 = 
+    OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
+  // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
+  // entry point.
+  OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
+			8 /*size*/);
+  MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
+  // Generates a R_PPC64_TOC relocation for TOC base insertion.
+  OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2,
+                        MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
+                        8/*size*/);
+  // Emit a null environment pointer.
+  OutStreamer.EmitIntValue(0, 8 /* size */);
+  OutStreamer.SwitchSection(Current.first, Current.second);
+
+  MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
+                          ".L." + Twine(CurrentFnSym->getName()));
+  OutStreamer.EmitLabel(RealFnSym);
+  CurrentFnSymForSize = RealFnSym;
+}
+
+
+bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
+  const DataLayout *TD = TM.getDataLayout();
+
+  bool isPPC64 = TD->getPointerSizeInBits() == 64;
+
+  PPCTargetStreamer &TS =
+      static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer());
+
+  if (!TOC.empty()) {
+    const MCSectionELF *Section;
+    
+    if (isPPC64)
+      Section = OutStreamer.getContext().getELFSection(".toc",
+        ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+        SectionKind::getReadOnly());
+	else
+      Section = OutStreamer.getContext().getELFSection(".got2",
+        ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+        SectionKind::getReadOnly());
+    OutStreamer.SwitchSection(Section);
+
+    for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
+         E = TOC.end(); I != E; ++I) {
+      OutStreamer.EmitLabel(I->second);
+      MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
+      if (isPPC64)
+        TS.emitTCEntry(*S);
+      else
+        OutStreamer.EmitSymbolValue(S, 4);
+    }
+  }
+
+  MachineModuleInfoELF &MMIELF =
+    MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+  MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+  if (!Stubs.empty()) {
+    OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      //   .long _foo
+      OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second.getPointer(),
+                                                    OutContext),
+                            isPPC64 ? 8 : 4/*size*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2.
+void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
+  // In the ELFv2 ABI, in functions that use the TOC register, we need to
+  // provide two entry points.  The ABI guarantees that when calling the
+  // local entry point, r2 is set up by the caller to contain the TOC base
+  // for this function, and when calling the global entry point, r12 is set
+  // up by the caller to hold the address of the global entry point.  We
+  // thus emit a prefix sequence along the following lines:
+  //
+  // func:
+  //         # global entry point
+  //         addis r2,r12,(.TOC.-func)@ha
+  //         addi  r2,r2,(.TOC.-func)@l
+  //         .localentry func, .-func
+  //         # local entry point, followed by function body
+  //
+  // This ensures we have r2 set up correctly while executing the function
+  // body, no matter which entry point is called.
+  if (Subtarget.isELFv2ABI()
+      // Only do all that if the function uses r2 in the first place.
+      && !MF->getRegInfo().use_empty(PPC::X2)) {
+
+    MCSymbol *GlobalEntryLabel = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(GlobalEntryLabel);
+    const MCSymbolRefExpr *GlobalEntryLabelExp =
+      MCSymbolRefExpr::Create(GlobalEntryLabel, OutContext);
+
+    MCSymbol *TOCSymbol = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
+    const MCExpr *TOCDeltaExpr =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(TOCSymbol, OutContext),
+                              GlobalEntryLabelExp, OutContext);
+
+    const MCExpr *TOCDeltaHi =
+      PPCMCExpr::CreateHa(TOCDeltaExpr, false, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
+                                .addReg(PPC::X2)
+                                .addReg(PPC::X12)
+                                .addExpr(TOCDeltaHi));
+
+    const MCExpr *TOCDeltaLo =
+      PPCMCExpr::CreateLo(TOCDeltaExpr, false, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI)
+                                .addReg(PPC::X2)
+                                .addReg(PPC::X2)
+                                .addExpr(TOCDeltaLo));
+
+    MCSymbol *LocalEntryLabel = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(LocalEntryLabel);
+    const MCSymbolRefExpr *LocalEntryLabelExp =
+       MCSymbolRefExpr::Create(LocalEntryLabel, OutContext);
+    const MCExpr *LocalOffsetExp =
+      MCBinaryExpr::CreateSub(LocalEntryLabelExp,
+                              GlobalEntryLabelExp, OutContext);
+
+    PPCTargetStreamer *TS =
+      static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
+
+    if (TS)
+      TS->emitLocalEntry(CurrentFnSym, LocalOffsetExp);
+  }
+}
+
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+  // Only the 64-bit target requires a traceback table.  For now,
+  // we only emit the word of zeroes that GDB requires to find
+  // the end of the function, and zeroes for the eight-byte
+  // mandatory fields.
+  // FIXME: We should fill in the eight-byte mandatory fields as described in
+  // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+  // currently make use of these fields).
+  if (Subtarget.isPPC64()) {
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+    OutStreamer.EmitIntValue(0, 8/*size*/);
+  }
+}
+
+void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  static const char *const CPUDirectives[] = {
+    "",
+    "ppc",
+    "ppc440",
+    "ppc601",
+    "ppc602",
+    "ppc603",
+    "ppc7400",
+    "ppc750",
+    "ppc970",
+    "ppcA2",
+    "ppce500mc",
+    "ppce5500",
+    "power3",
+    "power4",
+    "power5",
+    "power5x",
+    "power6",
+    "power6x",
+    "power7",
+    "ppc64",
+    "ppc64le"
+  };
+
+  unsigned Directive = Subtarget.getDarwinDirective();
+  if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970)
+    Directive = PPC::DIR_970;
+  if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
+    Directive = PPC::DIR_7400;
+  if (Subtarget.isPPC64() && Directive < PPC::DIR_64)
+    Directive = PPC::DIR_64;
+  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+
+  assert(Directive < array_lengthof(CPUDirectives) &&
+         "CPUDirectives[] might not be up-to-date!");
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
+  TStreamer.emitMachine(CPUDirectives[Directive]);
+
+  // Prime text sections so they are adjacent.  This reduces the likelihood a
+  // large data or debug section causes a branch to exceed 16M limit.
+  const TargetLoweringObjectFileMachO &TLOFMacho = 
+    static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    OutStreamer.SwitchSection(
+           OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                                      MachO::S_SYMBOL_STUBS |
+                                      MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      32, SectionKind::getText()));
+  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+    OutStreamer.SwitchSection(
+           OutContext.getMachOSection("__TEXT","__symbol_stub1",
+                                      MachO::S_SYMBOL_STUBS |
+                                      MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      16, SectionKind::getText()));
+  }
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+static MCSymbol *GetLazyPtr(MCSymbol *Sym, MCContext &Ctx) {
+  // Remove $stub suffix, add $lazy_ptr.
+  StringRef NoStub = Sym->getName().substr(0, Sym->getName().size()-5);
+  return Ctx.GetOrCreateSymbol(NoStub + "$lazy_ptr");
+}
+
+static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
+  // Add $tmp suffix to $stub, yielding $stub$tmp.
+  return Ctx.GetOrCreateSymbol(Sym->getName() + "$tmp");
+}
+
+void PPCDarwinAsmPrinter::
+EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
+  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+  bool isDarwin = Subtarget.isDarwin();
+  
+  const TargetLoweringObjectFileMachO &TLOFMacho = 
+    static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+
+  // .lazy_symbol_pointer
+  const MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection();
+  
+  // Output stubs for dynamically-linked functions
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    const MCSection *StubSection = 
+    OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                               MachO::S_SYMBOL_STUBS |
+                               MachO::S_ATTR_PURE_INSTRUCTIONS,
+                               32, SectionKind::getText());
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      OutStreamer.SwitchSection(StubSection);
+      EmitAlignment(4);
+      
+      MCSymbol *Stub = Stubs[i].first;
+      MCSymbol *RawSym = Stubs[i].second.getPointer();
+      MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
+      MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext);
+                                           
+      OutStreamer.EmitLabel(Stub);
+      OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+      const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext);
+      const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
+      const MCExpr *Sub =
+        MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext);
+
+      // mflr r0
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
+      // bcl 20, 31, AnonSymbol
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCLalways).addExpr(Anon));
+      OutStreamer.EmitLabel(AnonSymbol);
+      // mflr r11
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
+      // addis r11, r11, ha16(LazyPtr - AnonSymbol)
+      const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, isDarwin, OutContext);
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
+        .addReg(PPC::R11)
+        .addReg(PPC::R11)
+        .addExpr(SubHa16));
+      // mtlr r0
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
+
+      // ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
+      // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
+      const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, isDarwin, OutContext);
+      EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
+        .addReg(PPC::R12)
+        .addExpr(SubLo16).addExpr(SubLo16)
+        .addReg(PPC::R11));
+      // mtctr r12
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
+      // bctr
+      EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR));
+
+      OutStreamer.SwitchSection(LSPSection);
+      OutStreamer.EmitLabel(LazyPtr);
+      OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+      MCSymbol *DyldStubBindingHelper =
+        OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
+      if (isPPC64) {
+        // .quad dyld_stub_binding_helper
+        OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8);
+      } else {
+        // .long dyld_stub_binding_helper
+        OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4);
+      }
+    }
+    OutStreamer.AddBlankLine();
+    return;
+  }
+  
+  const MCSection *StubSection =
+    OutContext.getMachOSection("__TEXT","__symbol_stub1",
+                               MachO::S_SYMBOL_STUBS |
+                               MachO::S_ATTR_PURE_INSTRUCTIONS,
+                               16, SectionKind::getText());
+  for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+    MCSymbol *Stub = Stubs[i].first;
+    MCSymbol *RawSym = Stubs[i].second.getPointer();
+    MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
+    const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
+
+    OutStreamer.SwitchSection(StubSection);
+    EmitAlignment(4);
+    OutStreamer.EmitLabel(Stub);
+    OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+    // lis r11, ha16(LazyPtr)
+    const MCExpr *LazyPtrHa16 =
+      PPCMCExpr::CreateHa(LazyPtrExpr, isDarwin, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LIS)
+      .addReg(PPC::R11)
+      .addExpr(LazyPtrHa16));
+
+    // ldu r12, lo16(LazyPtr)(r11)
+    // lwzu r12, lo16(LazyPtr)(r11)
+    const MCExpr *LazyPtrLo16 =
+      PPCMCExpr::CreateLo(LazyPtrExpr, isDarwin, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
+      .addReg(PPC::R12)
+      .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
+      .addReg(PPC::R11));
+
+    // mtctr r12
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
+    // bctr
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR));
+
+    OutStreamer.SwitchSection(LSPSection);
+    OutStreamer.EmitLabel(LazyPtr);
+    OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+    MCSymbol *DyldStubBindingHelper =
+      OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
+    if (isPPC64) {
+      // .quad dyld_stub_binding_helper
+      OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8);
+    } else {
+      // .long dyld_stub_binding_helper
+      OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4);
+    }
+  }
+  
+  OutStreamer.AddBlankLine();
+}
+
+
+bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
+  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+
+  // Darwin/PPC always uses mach-o.
+  const TargetLoweringObjectFileMachO &TLOFMacho = 
+    static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  MachineModuleInfoMachO &MMIMacho =
+    MMI->getObjFileInfo<MachineModuleInfoMachO>();
+  
+  MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList();
+  if (!Stubs.empty())
+    EmitFunctionStubs(Stubs);
+
+  if (MAI->doesSupportExceptionHandling() && MMI) {
+    // Add the (possibly multiple) personalities to the set of global values.
+    // Only referenced functions get into the Personalities list.
+    const std::vector<const Function*> &Personalities = MMI->getPersonalities();
+    for (std::vector<const Function*>::const_iterator I = Personalities.begin(),
+         E = Personalities.end(); I != E; ++I) {
+      if (*I) {
+        MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
+        MachineModuleInfoImpl::StubValueTy &StubSym =
+          MMIMacho.getGVStubEntry(NLPSym);
+        StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
+      }
+    }
+  }
+
+  // Output stubs for dynamically-linked functions.
+  Stubs = MMIMacho.GetGVStubList();
+  
+  // Output macho stubs for external and common global variables.
+  if (!Stubs.empty()) {
+    // Switch with ".non_lazy_symbol_pointer" directive.
+    OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+    EmitAlignment(isPPC64 ? 3 : 2);
+    
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      //   .indirect_symbol _foo
+      MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+      OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+      if (MCSym.getInt())
+        // External to current translation unit.
+        OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/);
+      else
+        // Internal to current translation unit.
+        //
+        // When we place the LSDA into the TEXT section, the type info pointers
+        // need to be indirect and pc-rel. We accomplish this by using NLPs.
+        // However, sometimes the types are local to the file. So we need to
+        // fill in the value for the NLP in those cases.
+        OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+                                                      OutContext),
+                              isPPC64 ? 8 : 4/*size*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
+  Stubs = MMIMacho.GetHiddenGVStubList();
+  if (!Stubs.empty()) {
+    OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+    EmitAlignment(isPPC64 ? 3 : 2);
+    
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      //   .long _foo
+      OutStreamer.EmitValue(MCSymbolRefExpr::
+                            Create(Stubs[i].second.getPointer(),
+                                   OutContext),
+                            isPPC64 ? 8 : 4/*size*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
+  // Funny Darwin hack: This flag tells the linker that no global symbols
+  // contain code that falls through to other global symbols (e.g. the obvious
+  // implementation of multiple entry points).  If this doesn't occur, the
+  // linker can safely perform dead code stripping.  Since LLVM never generates
+  // code that does this, it is always safe to set.
+  OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
+/// for a MachineFunction to the given output stream, in a format that the
+/// Darwin assembler can deal with.
+///
+static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
+                                           MCStreamer &Streamer) {
+  const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
+
+  if (Subtarget->isDarwin())
+    return new PPCDarwinAsmPrinter(tm, Streamer);
+  return new PPCLinuxAsmPrinter(tm, Streamer);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmPrinter() { 
+  TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 0000000..ee90671
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,211 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-branch-select"
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace llvm {
+  void initializePPCBSelPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCBSel : public MachineFunctionPass {
+    static char ID;
+    PPCBSel() : MachineFunctionPass(ID) {
+      initializePPCBSelPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "PowerPC Branch Selector";
+    }
+  };
+  char PPCBSel::ID = 0;
+}
+
+INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector",
+                false, false)
+
+/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+  return new PPCBSel();
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const PPCInstrInfo *TII =
+                static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += TII->GetInstSizeInBytes(MBBI);
+    
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+  
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 15)) {
+    BlockSizes.clear();
+    return false;
+  }
+  
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+8
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+  
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        MachineBasicBlock *Dest = nullptr;
+        if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
+          Dest = I->getOperand(2).getMBB();
+        else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
+                 !I->getOperand(1).isImm())
+          Dest = I->getOperand(1).getMBB();
+        else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
+                  I->getOpcode() == PPC::BDZ8  || I->getOpcode() == PPC::BDZ) &&
+                 !I->getOperand(0).isImm())
+          Dest = I->getOperand(0).getMBB();
+
+        if (!Dest) {
+          MBBStartOffset += TII->GetInstSizeInBytes(I);
+          continue;
+        }
+        
+        // Determine the offset from the current branch to the destination
+        // block.
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+          
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<16>(BranchSize)) {
+          MBBStartOffset += 4;
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        MachineInstr *OldBranch = I;
+        DebugLoc dl = OldBranch->getDebugLoc();
+ 
+        if (I->getOpcode() == PPC::BCC) {
+          // The BCC operands are:
+          // 0. PPC branch predicate
+          // 1. CR register
+          // 2. Target MBB
+          PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+          unsigned CRReg = I->getOperand(1).getReg();
+       
+          // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+          BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+            .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+        } else if (I->getOpcode() == PPC::BC) {
+          unsigned CRBit = I->getOperand(0).getReg();
+          BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
+        } else if (I->getOpcode() == PPC::BCn) {
+          unsigned CRBit = I->getOperand(0).getReg();
+          BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+        } else {
+           llvm_unreachable("Unhandled branch type!");
+        }
+        
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+        
+        // Remember that this instruction is 8-bytes, increase the size of the
+        // block by 4, remember to iterate.
+        BlockSizes[MBB.getNumber()] += 4;
+        MBBStartOffset += 8;
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+  
+  BlockSizes.clear();
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 0000000..ec1e34d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,663 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+//
+// The pattern that defines the induction variable can changed depending on
+// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+//
+// Criteria for CTR loops:
+//  - Countable loops (w/ ind. var for a trip count)
+//  - Try inner-most loops first
+//  - No nested CTR loops.
+//  - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#ifndef NDEBUG
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#endif
+
+#include <algorithm>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ctrloops"
+
+#ifndef NDEBUG
+static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
+#endif
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace llvm {
+  void initializePPCCTRLoopsPass(PassRegistry&);
+#ifndef NDEBUG
+  void initializePPCCTRLoopsVerifyPass(PassRegistry&);
+#endif
+}
+
+namespace {
+  struct PPCCTRLoops : public FunctionPass {
+
+#ifndef NDEBUG
+    static int Counter;
+#endif
+
+  public:
+    static char ID;
+
+    PPCCTRLoops() : FunctionPass(ID), TM(nullptr) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
+    PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolution>();
+    }
+
+  private:
+    bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+    bool convertToCTRLoop(Loop *L);
+
+  private:
+    PPCTargetMachine *TM;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    const DataLayout *DL;
+    DominatorTree *DT;
+    const TargetLibraryInfo *LibInfo;
+  };
+
+  char PPCCTRLoops::ID = 0;
+#ifndef NDEBUG
+  int PPCCTRLoops::Counter = 0;
+#endif
+
+#ifndef NDEBUG
+  struct PPCCTRLoopsVerify : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    PPCCTRLoopsVerify() : MachineFunctionPass(ID) {
+      initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    MachineDominatorTree *MDT;
+  };
+
+  char PPCCTRLoopsVerify::ID = 0;
+#endif // NDEBUG
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                    false, false)
+
+FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
+  return new PPCCTRLoops(TM);
+}
+
+#ifndef NDEBUG
+INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+                      "PowerPC CTR Loops Verify", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+                    "PowerPC CTR Loops Verify", false, false)
+
+FunctionPass *llvm::createPPCCTRLoopsVerify() {
+  return new PPCCTRLoopsVerify();
+}
+#endif // NDEBUG
+
+bool PPCCTRLoops::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
+  LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+  bool MadeChange = false;
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
+       I != E; ++I) {
+    Loop *L = *I;
+    if (!L->getParentLoop())
+      MadeChange |= convertToCTRLoop(L);
+  }
+
+  return MadeChange;
+}
+
+static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
+  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+    return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+  return false;
+}
+
+bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        // Inline ASM is okay, unless it clobbers the ctr register.
+        InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+        for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+          InlineAsm::ConstraintInfo &C = CIV[i];
+          if (C.Type != InlineAsm::isInput)
+            for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+              if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+                return true;
+        }
+
+        continue;
+      }
+
+      if (!TM)
+        return true;
+      const TargetLowering *TLI = TM->getTargetLowering();
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc::Func Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc::copysign:
+          case LibFunc::copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc::copysignl:
+            return true;
+          case LibFunc::fabs:
+          case LibFunc::fabsf:
+          case LibFunc::fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc::sqrt:
+          case LibFunc::sqrtf:
+          case LibFunc::sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc::floor:
+          case LibFunc::floorf:
+          case LibFunc::floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc::nearbyint:
+          case LibFunc::nearbyintf:
+          case LibFunc::nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc::ceil:
+          case LibFunc::ceilf:
+          case LibFunc::ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc::rint:
+          case LibFunc::rintf:
+          case LibFunc::rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc::round:
+          case LibFunc::roundf:
+          case LibFunc::roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc::trunc:
+          case LibFunc::truncf:
+          case LibFunc::truncl:
+            Opcode = ISD::FTRUNC; break;
+          }
+
+          MVT VTy =
+            TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true);
+          if (VTy == MVT::Other)
+            return true;
+          
+          if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+            continue;
+          else if (VTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(TT.isArch32Bit(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (TT.isArch32Bit() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (!TM)
+        return true;
+      const TargetLowering *TLI = TM->getTargetLowering();
+
+      if (TLI->supportJumpTables() &&
+          SI->getNumCases()+1 >= (unsigned) TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
+  bool MadeChange = false;
+
+  Triple TT = Triple(L->getHeader()->getParent()->getParent()->
+                     getTargetTriple());
+  if (!TT.isArch32Bit() && !TT.isArch64Bit())
+    return MadeChange; // Unknown arch. type.
+
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    MadeChange |= convertToCTRLoop(*I);
+  }
+
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (MadeChange)
+    return MadeChange;
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = CTRLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= CTRLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(TT, *I))
+      return MadeChange;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  BasicBlock *CountedExitBlock = nullptr;
+  const SCEV *ExitCount = nullptr;
+  BranchInst *CountedExitBranch = nullptr;
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       IE = ExitingBlocks.end(); I != IE; ++I) {
+    const SCEV *EC = SE->getExitCount(L, *I);
+    DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+                    (*I)->getName() << ": " << *EC << "\n");
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L))
+      continue;
+
+    if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+      continue;
+
+    // We now have a loop-invariant count of loop iterations (which is not the
+    // constant zero) for which we know that this loop will not exit via this
+    // exisiting block.
+
+    // We need to make sure that this block will run on every loop iteration.
+    // For this to be true, we must dominate all blocks with backedges. Such
+    // blocks are in-loop predecessors to the header block.
+    bool NotAlways = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+      if (!L->contains(*PI))
+        continue;
+
+      if (!DT->dominates(*I, *PI)) {
+        NotAlways = true;
+        break;
+      }
+    }
+
+    if (NotAlways)
+      continue;
+
+    // Make sure this blocks ends with a conditional branch.
+    Instruction *TI = (*I)->getTerminator();
+    if (!TI)
+      continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional())
+        continue;
+
+      CountedExitBranch = BI;
+    } else
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    CountedExitBlock = *I;
+    ExitCount = EC;
+    break;
+  }
+
+  if (!CountedExitBlock)
+    return MadeChange;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one. If we already have a
+  // preheader, then we can use it (except if the preheader contains a use of
+  // the CTR register because some such uses might be reordered by the
+  // selection DAG after the mtctr instruction).
+  if (!Preheader || mightUseCTR(TT, Preheader))
+    Preheader = InsertPreheaderForLoop(L, this);
+  if (!Preheader)
+    return MadeChange;
+
+  DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+  // Insert the count into the preheader and replace the condition used by the
+  // selected branch.
+  MadeChange = true;
+
+  SCEVExpander SCEVE(*SE, "loopcnt");
+  LLVMContext &C = SE->getContext();
+  Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
+                                       Type::getInt32Ty(C);
+  if (!ExitCount->getType()->isPointerTy() &&
+      ExitCount->getType() != CountType)
+    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(CountType, 1)); 
+  Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType,
+                                       Preheader->getTerminator());
+
+  IRBuilder<> CountBuilder(Preheader->getTerminator());
+  Module *M = Preheader->getParent()->getParent();
+  Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
+                                               CountType);
+  CountBuilder.CreateCall(MTCTRFunc, ECValue);
+
+  IRBuilder<> CondBuilder(CountedExitBranch);
+  Value *DecFunc =
+    Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
+  Value *NewCond = CondBuilder.CreateCall(DecFunc);
+  Value *OldCond = CountedExitBranch->getCondition();
+  CountedExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(CountedExitBranch->getSuccessor(0)))
+    CountedExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+  DeleteDeadPHIs(CountedExitBlock);
+
+  ++NumCTRLoops;
+  return MadeChange;
+}
+
+#ifndef NDEBUG
+static bool clobbersCTR(const MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
+        return true;
+    } else if (MO.isRegMask()) {
+      if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+static bool verifyCTRBranch(MachineBasicBlock *MBB,
+                            MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator BI = I;
+  SmallSet<MachineBasicBlock *, 16>   Visited;
+  SmallVector<MachineBasicBlock *, 8> Preds;
+  bool CheckPreds;
+
+  if (I == MBB->begin()) {
+    Visited.insert(MBB);
+    goto queue_preds;
+  } else
+    --I;
+
+check_block:
+  Visited.insert(MBB);
+  if (I == MBB->end())
+    goto queue_preds;
+
+  CheckPreds = true;
+  for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
+    unsigned Opc = I->getOpcode();
+    if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
+      CheckPreds = false;
+      break;
+    }
+
+    if (I != BI && clobbersCTR(I)) {
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" <<
+                      MBB->getFullName() << ") instruction " << *I <<
+                      " clobbers CTR, invalidating " << "BB#" <<
+                      BI->getParent()->getNumber() << " (" <<
+                      BI->getParent()->getFullName() << ") instruction " <<
+                      *BI << "\n");
+      return false;
+    }
+
+    if (I == IE)
+      break;
+  }
+
+  if (!CheckPreds && Preds.empty())
+    return true;
+
+  if (CheckPreds) {
+queue_preds:
+    if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
+      DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" <<
+                      BI->getParent()->getNumber() << " (" <<
+                      BI->getParent()->getFullName() << ") instruction " <<
+                      *BI << "\n");
+      return false;
+    }
+
+    for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PIE = MBB->pred_end(); PI != PIE; ++PI)
+      Preds.push_back(*PI);
+  }
+
+  do {
+    MBB = Preds.pop_back_val();
+    if (!Visited.count(MBB)) {
+      I = MBB->getLastNonDebugInstr();
+      goto check_block;
+    }
+  } while (!Preds.empty());
+
+  return true;
+}
+
+bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
+  MDT = &getAnalysis<MachineDominatorTree>();
+
+  // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
+  // any other instructions that might clobber the ctr register.
+  for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
+       I != IE; ++I) {
+    MachineBasicBlock *MBB = I;
+    if (!MDT->isReachableFromEntry(MBB))
+      continue;
+
+    for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(),
+      MIIE = MBB->end(); MII != MIIE; ++MII) {
+      unsigned Opc = MII->getOpcode();
+      if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ ||
+          Opc == PPC::BDZ8  || Opc == PPC::BDZ)
+        if (!verifyCTRBranch(MBB, MII))
+          llvm_unreachable("Invalid PPC CTR loop!");
+    }
+  }
+
+  return false;
+}
+#endif // NDEBUG
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 0000000..222760a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,199 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+class CCIfNotSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("!State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+  // On PPC64, integer return values are always promoted to i64
+  CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+  CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
+
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+  // Floating point types returned as "direct" go into F1 .. F8; note that
+  // only the ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  
+  // Vector types returned as "direct" go into V2 .. V9; note that only the
+  // ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
+]>;
+
+
+// Note that we don't currently have calling conventions for 64-bit
+// PowerPC, but handle all the complexities of the ABI in the lowering
+// logic.  FIXME: See if the logic can be simplified with use of CCs.
+// This may require some extensions to current table generation.
+
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats.  All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i1],  CCPromoteToType<i64>>,
+  CCIfType<[i8],  CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
+// Simple return-value convention for 64-bit ELF PowerPC fast isel.
+// All small ints are promoted to i64.  Vector types, quadword ints,
+// and multiple register returns are "supported" to avoid compile
+// errors, but none are handled by the fast selector.
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i1],   CCPromoteToType<i64>>,
+  CCIfType<[i8],   CCPromoteToType<i64>>,
+  CCIfType<[i16],  CCPromoteToType<i64>>,
+  CCIfType<[i32],  CCPromoteToType<i64>>,
+  CCIfType<[i64],  CCAssignToReg<[X3, X4]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[f32],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC System V Release 4 32-bit ABI
+//===----------------------------------------------------------------------===//
+
+def CC_PPC32_SVR4_Common : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
+  // The ABI requires i64 to be passed in two adjacent registers with the first
+  // register having an odd register number.
+  CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
+
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+
+  // Make sure the i64 words from a long double are either both passed in
+  // registers or both passed on the stack.
+  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
+  
+  // FP values are passed in F1 - F8.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+  // Split arguments have an alignment of 8 bytes on the stack.
+  CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
+  
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  
+  // Floats are stored in double precision format, thus they have the same
+  // alignment and size as doubles.
+  CCIfType<[f32,f64], CCAssignToStack<8, 8>>,  
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
+]>;
+
+// This calling convention puts vector arguments always on the stack. It is used
+// to assign vector arguments which belong to the variable portion of the
+// parameter list of a variable argument function.
+def CC_PPC32_SVR4_VarArg : CallingConv<[
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
+// put vector arguments in vector registers before putting them on the stack.
+def CC_PPC32_SVR4 : CallingConv<[
+  // The first 12 Vector arguments are passed in AltiVec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9,
+                          VSH10, VSH11, VSH12, VSH13]>>,
+           
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;  
+
+// Helper "calling convention" to handle aggregate by value arguments.
+// Aggregate by value arguments are always placed in the local variable space
+// of the caller. This calling convention is only used to assign those stack
+// offsets in the callers stack frame.
+//
+// Still, the address of the aggregate copy in the callers stack frame is passed
+// in a GPR (or in the parameter list area if all GPRs are allocated) from the
+// caller to the callee. The location for the address argument is assigned by
+// the CC_PPC32_SVR4 calling convention.
+//
+// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are
+// not passed by value.
+ 
+def CC_PPC32_SVR4_ByVal : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  
+  CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
+]>;
+
+def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27,
+                                       V28, V29, V30, V31)>;
+
+def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
+
+def CSR_SVR432   : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
+
+def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>;
+
+def CSR_SVR464   : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+
+def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp
new file mode 100644
index 0000000..0875523
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -0,0 +1,293 @@
+//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to
+// JIT-compile bitcode to native PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+  class PPCCodeEmitter : public MachineFunctionPass {
+    TargetMachine &TM;
+    JITCodeEmitter &MCE;
+    MachineModuleInfo *MMI;
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    
+    static char ID;
+    
+    /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record
+    /// its address in the function into this pointer.
+    void *MovePCtoLROffset;
+  public:
+    
+    PPCCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+      : MachineFunctionPass(ID), TM(tm), MCE(mce) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+    
+    MachineRelocation GetRelocation(const MachineOperand &MO,
+                                    unsigned RelocID) const;
+    
+    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO) const;
+
+    unsigned get_crbitm_encoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getDirectBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getAbsDirectBrEncoding(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+    unsigned getAbsCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
+
+    unsigned getImm16Encoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getTLSCallEncoding(const MachineInstr &MI, unsigned OpNo) const;
+
+    const char *getPassName() const override {
+      return "PowerPC Machine Code Emitter";
+    }
+
+    /// runOnMachineFunction - emits the given MachineFunction to memory
+    ///
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    /// emitBasicBlock - emits the given MachineBasicBlock to memory
+    ///
+    void emitBasicBlock(MachineBasicBlock &MBB);
+  };
+}
+
+char PPCCodeEmitter::ID = 0;
+
+/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code
+/// to the specified MCE object.
+FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new PPCCodeEmitter(TM, JCE);
+}
+
+bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
+          MF.getTarget().getRelocationModel() != Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+  do {
+    MovePCtoLROffset = nullptr;
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      emitBasicBlock(*BB);
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
+    const MachineInstr &MI = *I;
+    MCE.processDebugLoc(MI.getDebugLoc(), true);
+    switch (MI.getOpcode()) {
+    default:
+      MCE.emitWordBE(getBinaryCodeForInstr(MI));
+      break;
+    case TargetOpcode::CFI_INSTRUCTION:
+      break;
+    case TargetOpcode::EH_LABEL:
+      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+      break;
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break; // pseudo opcode, no side effects
+    case PPC::MovePCtoLR:
+    case PPC::MovePCtoLR8:
+      assert(TM.getRelocationModel() == Reloc::PIC_);
+      MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
+      MCE.emitWordBE(0x48000005);   // bl 1
+      break;
+    }
+    MCE.processDebugLoc(MI.getDebugLoc(), false);
+  }
+}
+
+unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
+          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+MachineRelocation PPCCodeEmitter::GetRelocation(const MachineOperand &MO, 
+                                                unsigned RelocID) const {
+  // If in PIC mode, we need to encode the negated address of the
+  // 'movepctolr' into the unrelocated field.  After relocation, we'll have
+  // &gv-&movepctolr-4 in the imm field.  Once &movepctolr is added to the imm
+  // field, we get &gv.  This doesn't happen for branch relocations, which are
+  // always implicitly pc relative.
+  intptr_t Cst = 0;
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+    Cst = -(intptr_t)MovePCtoLROffset - 4;
+  }
+  
+  if (MO.isGlobal())
+    return MachineRelocation::getGV(MCE.getCurrentPCOffset(), RelocID,
+                                    const_cast<GlobalValue *>(MO.getGlobal()),
+                                    Cst, isa<Function>(MO.getGlobal()));
+  if (MO.isSymbol())
+    return MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                        RelocID, MO.getSymbolName(), Cst);
+  if (MO.isCPI())
+    return MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                           RelocID, MO.getIndex(), Cst);
+
+  if (MO.isMBB())
+    return MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                    RelocID, MO.getMBB());
+  
+  assert(MO.isJTI());
+  return MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                         RelocID, MO.getIndex(), Cst);
+}
+
+unsigned PPCCodeEmitter::getDirectBrEncoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
+  
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bx));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getCondBrEncoding(const MachineInstr &MI,
+                                           unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bcx));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getAbsDirectBrEncoding(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
+
+  llvm_unreachable("Absolute branch relocations unsupported on the old JIT.");
+}
+
+unsigned PPCCodeEmitter::getAbsCondBrEncoding(const MachineInstr &MI,
+                                              unsigned OpNo) const {
+  llvm_unreachable("Absolute branch relocations unsupported on the old JIT.");
+}
+
+unsigned PPCCodeEmitter::getImm16Encoding(const MachineInstr &MI,
+                                          unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
+
+  unsigned RelocID;
+  switch (MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) {
+    default: llvm_unreachable("Unsupported target operand flags!");
+    case PPCII::MO_LO: RelocID = PPC::reloc_absolute_low; break;
+    case PPCII::MO_HA: RelocID = PPC::reloc_absolute_high; break;
+  }
+
+  MCE.addRelocation(GetRelocation(MO, RelocID));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getMemRIEncoding(const MachineInstr &MI,
+                                          unsigned OpNo) const {
+  // Encode (imm, reg) as a memri, which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 16;
+  
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO) & 0xFFFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low));
+  return RegBits;
+}
+
+unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI,
+                                           unsigned OpNo) const {
+  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 14;
+  
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return ((getMachineOpValue(MI, MO) >> 2) & 0x3FFF) | RegBits;
+  
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low_ix));
+  return RegBits;
+}
+
+
+unsigned PPCCodeEmitter::getTLSRegEncoding(const MachineInstr &MI,
+                                           unsigned OpNo) const {
+  llvm_unreachable("TLS not supported on the old JIT.");
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getTLSCallEncoding(const MachineInstr &MI,
+                                            unsigned OpNo) const {
+  llvm_unreachable("TLS not supported on the old JIT.");
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) const {
+
+  if (MO.isReg()) {
+    // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // The GPR operand should come through here though.
+    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
+            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
+           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  }
+  
+  assert(MO.isImm() &&
+         "Relocation required in an instruction that we cannot encode!");
+  return MO.getImm();
+}
+
+#include "PPCGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
new file mode 100644
index 0000000..2e524d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -0,0 +1,2282 @@
+//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// PPCGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCISelLowering.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+//===----------------------------------------------------------------------===//
+//
+// TBD:
+//   FastLowerArguments: Handle simple cases.
+//   PPCMaterializeGV: Handle TLS.
+//   SelectCall: Handle function pointers.
+//   SelectCall: Handle multi-register return values.
+//   SelectCall: Optimize away nops for local calls.
+//   processCallArgs: Handle bit-converted arguments.
+//   finishCall: Handle multi-register return values.
+//   PPCComputeAddress: Handle parameter references as FrameIndex's.
+//   PPCEmitCmp: Handle immediate as operand 1.
+//   SelectCall: Handle small byval arguments.
+//   SelectIntrinsicCall: Implement.
+//   SelectSelect: Implement.
+//   Consider factoring isTypeLegal into the base class.
+//   Implement switches and jump tables.
+//
+//===----------------------------------------------------------------------===//
+using namespace llvm;
+
+#define DEBUG_TYPE "ppcfastisel"
+
+namespace {
+
+typedef struct Address {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FI;
+  } Base;
+
+  long Offset;
+
+  // Innocuous defaults for our address.
+  Address()
+   : BaseType(RegBase), Offset(0) {
+     Base.Reg = 0;
+   }
+} Address;
+
+class PPCFastISel final : public FastISel {
+
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  const PPCSubtarget *PPCSubTarget;
+  LLVMContext *Context;
+
+  public:
+    explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
+                         const TargetLibraryInfo *LibInfo)
+    : FastISel(FuncInfo, LibInfo),
+      TM(FuncInfo.MF->getTarget()),
+      TII(*TM.getInstrInfo()),
+      TLI(*TM.getTargetLowering()),
+      PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()),
+      Context(&FuncInfo.Fn->getContext()) { }
+
+  // Backend specific FastISel code.
+  private:
+    bool TargetSelectInstruction(const Instruction *I) override;
+    unsigned TargetMaterializeConstant(const Constant *C) override;
+    unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+    bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                             const LoadInst *LI) override;
+    bool FastLowerArguments() override;
+    unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
+    unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             uint64_t Imm);
+    unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            unsigned Op0, bool Op0IsKill);
+    unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             unsigned Op1, bool Op1IsKill);
+
+  // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectIToFP(const Instruction *I, bool IsSigned);
+    bool SelectFPToI(const Instruction *I, bool IsSigned);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectCall(const Instruction *I);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
+
+  // Utility routines.
+  private:
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt, unsigned DestReg);
+    bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     const TargetRegisterClass *RC, bool IsZExt = true,
+                     unsigned FP64LoadOpc = PPC::LFD);
+    bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
+    bool PPCComputeAddress(const Value *Obj, Address &Addr);
+    void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+                            unsigned &IndexReg);
+    bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                           unsigned DestReg, bool IsZExt);
+    unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
+    unsigned PPCMaterializeInt(const Constant *C, MVT VT);
+    unsigned PPCMaterialize32BitInt(int64_t Imm,
+                                    const TargetRegisterClass *RC);
+    unsigned PPCMaterialize64BitInt(int64_t Imm,
+                                    const TargetRegisterClass *RC);
+    unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
+                             unsigned SrcReg, bool IsSigned);
+    unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
+
+  // Call handling routines.
+  private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool IsVarArg);
+    CCAssignFn *usePPC32CCs(unsigned Flag);
+
+  private:
+  #include "PPCGenFastISel.inc"
+
+};
+
+} // end anonymous namespace
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings 
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+  if (Flag == 1)
+    return CC_PPC32_SVR4;
+  else if (Flag == 2)
+    return CC_PPC32_SVR4_ByVal;
+  else if (Flag == 3)
+    return CC_PPC32_SVR4_VarArg;
+  else
+    return RetCC_PPC;
+}
+
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // These are not representable with any single compare.
+    case CmpInst::FCMP_FALSE:
+    case CmpInst::FCMP_UEQ:
+    case CmpInst::FCMP_UGT:
+    case CmpInst::FCMP_UGE:
+    case CmpInst::FCMP_ULT:
+    case CmpInst::FCMP_ULE:
+    case CmpInst::FCMP_UNE:
+    case CmpInst::FCMP_TRUE:
+    default:
+      return Optional<PPC::Predicate>();
+
+    case CmpInst::FCMP_OEQ:
+    case CmpInst::ICMP_EQ:
+      return PPC::PRED_EQ;
+
+    case CmpInst::FCMP_OGT:
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_SGT:
+      return PPC::PRED_GT;
+
+    case CmpInst::FCMP_OGE:
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_SGE:
+      return PPC::PRED_GE;
+
+    case CmpInst::FCMP_OLT:
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_SLT:
+      return PPC::PRED_LT;
+
+    case CmpInst::FCMP_OLE:
+    case CmpInst::ICMP_ULE:
+    case CmpInst::ICMP_SLE:
+      return PPC::PRED_LE;
+
+    case CmpInst::FCMP_ONE:
+    case CmpInst::ICMP_NE:
+      return PPC::PRED_NE;
+
+    case CmpInst::FCMP_ORD:
+      return PPC::PRED_NU;
+
+    case CmpInst::FCMP_UNO:
+      return PPC::PRED_UN;
+  }
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel, and return its equivalent machine type in VT.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT Evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (Evt == MVT::Other || !Evt.isSimple()) return false;
+  VT = Evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+    return true;
+  }
+
+  return false;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address.  Return false if we can't handle it.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+    default:
+      break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return PPCComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      long TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+           II != IE; ++II, ++GTI) {
+        const Value *Op = *II;
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = DL.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // FIXME: References to parameters fall through to the behavior
+  // below.  They should be able to reference a frame index since
+  // they are stored to the stack, so we can get "ld rx, offset(r1)"
+  // instead of "addi ry, r1, offset / ld rx, 0(ry)".  Obj will
+  // just contain the parameter.  Try to handle this with a FI.
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0)
+    Addr.Base.Reg = getRegForValue(Obj);
+
+  // Prevent assignment of base register to X0, which is inappropriate
+  // for loads and stores alike.
+  if (Addr.Base.Reg != 0)
+    MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  return Addr.Base.Reg != 0;
+}
+
+// Fix up some addresses that can't be used directly.  For example, if
+// an offset won't fit in an instruction field, we may need to move it
+// into an index register.
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+                                     unsigned &IndexReg) {
+
+  // Check whether the offset fits in the instruction field.
+  if (!isInt<16>(Addr.Offset))
+    UseOffset = false;
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  if (!UseOffset) {
+    IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
+                             : Type::getInt64Ty(*Context));
+    const ConstantInt *Offset =
+      ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
+    IndexReg = PPCMaterializeInt(Offset, MVT::i64);
+    assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
+  }
+}
+
+// Emit a load instruction if possible, returning true if we succeeded,
+// otherwise false.  See commentary below for how the register class of
+// the load is determined. 
+bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              const TargetRegisterClass *RC,
+                              bool IsZExt, unsigned FP64LoadOpc) {
+  unsigned Opc;
+  bool UseOffset = true;
+
+  // If ResultReg is given, it determines the register class of the load.
+  // Otherwise, RC is the register class to use.  If the result of the
+  // load isn't anticipated in this block, both may be zero, in which
+  // case we must make a conservative guess.  In particular, don't assign
+  // R0 or X0 to the result register, as the result may be used in a load,
+  // store, add-immediate, or isel that won't permit this.  (Though
+  // perhaps the spill and reload of live-exit values would handle this?)
+  const TargetRegisterClass *UseRC =
+    (ResultReg ? MRI.getRegClass(ResultReg) :
+     (RC ? RC :
+      (VT == MVT::f64 ? &PPC::F8RCRegClass :
+       (VT == MVT::f32 ? &PPC::F4RCRegClass :
+        (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+         &PPC::GPRC_and_GPRC_NOR0RegClass)))));
+
+  bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
+      break;
+    case MVT::i16:
+      Opc = (IsZExt ?
+             (Is32BitInt ? PPC::LHZ : PPC::LHZ8) : 
+             (Is32BitInt ? PPC::LHA : PPC::LHA8));
+      break;
+    case MVT::i32:
+      Opc = (IsZExt ? 
+             (Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
+             (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
+      if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
+        UseOffset = false;
+      break;
+    case MVT::i64:
+      Opc = PPC::LD;
+      assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) && 
+             "64-bit load with 32-bit target??");
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::LFS;
+      break;
+    case MVT::f64:
+      Opc = FP64LoadOpc;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+  if (ResultReg == 0)
+    ResultReg = createResultReg(UseRC);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset) {
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  } else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::LBZ:    Opc = PPC::LBZX;    break;
+      case PPC::LBZ8:   Opc = PPC::LBZX8;   break;
+      case PPC::LHZ:    Opc = PPC::LHZX;    break;
+      case PPC::LHZ8:   Opc = PPC::LHZX8;   break;
+      case PPC::LHA:    Opc = PPC::LHAX;    break;
+      case PPC::LHA8:   Opc = PPC::LHAX8;   break;
+      case PPC::LWZ:    Opc = PPC::LWZX;    break;
+      case PPC::LWZ8:   Opc = PPC::LWZX8;   break;
+      case PPC::LWA:    Opc = PPC::LWAX;    break;
+      case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+      case PPC::LD:     Opc = PPC::LDX;     break;
+      case PPC::LFS:    Opc = PPC::LFSX;    break;
+      case PPC::LFD:    Opc = PPC::LFDX;    break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a load instruction.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+  // FIXME: No atomic loads are supported.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  This is necessary
+  // to constrain RA from using R0/X0 when this is not legal.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+  assert(SrcReg && "Nothing to store!");
+  unsigned Opc;
+  bool UseOffset = true;
+
+  const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+  bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+      break;
+    case MVT::i16:
+      Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+      break;
+    case MVT::i32:
+      assert(Is32BitInt && "Not GPRC for i32??");
+      Opc = PPC::STW;
+      break;
+    case MVT::i64:
+      Opc = PPC::STD;
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::STFS;
+      break;
+    case MVT::f64:
+      Opc = PPC::STFD;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg)
+        .addImm(Addr.Offset)
+        .addFrameIndex(Addr.Base.FI)
+        .addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::STB:  Opc = PPC::STBX;  break;
+      case PPC::STH : Opc = PPC::STHX;  break;
+      case PPC::STW : Opc = PPC::STWX;  break;
+      case PPC::STB8: Opc = PPC::STBX8; break;
+      case PPC::STH8: Opc = PPC::STHX8; break;
+      case PPC::STW8: Opc = PPC::STWX8; break;
+      case PPC::STD:  Opc = PPC::STDX;  break;
+      case PPC::STFS: Opc = PPC::STFSX; break;
+      case PPC::STFD: Opc = PPC::STFDX; break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // FIXME: No atomics loads are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(Op0->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!PPCEmitStore(VT, SrcReg, Addr))
+    return false;
+
+  return true;
+}
+
+// Attempt to fast-select a branch instruction.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
+    if (!OptPPCPred)
+      return false;
+
+    PPC::Predicate PPCPred = OptPPCPred.getValue();
+
+    // Take advantage of fall-through opportunities.
+    if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+      std::swap(TBB, FBB);
+      PPCPred = PPC::InvertPredicate(PPCPred);
+    }
+
+    unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+
+    if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                    CondReg))
+      return false;
+
+    BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
+      .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+    FastEmitBranch(FBB, DbgLoc);
+    FuncInfo.MBB->addSuccessor(TBB);
+    return true;
+
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    FastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  // FIXME: ARM looks for a case where the block containing the compare
+  // has been split from the block containing the branch.  If this happens,
+  // there is a vreg available containing the result of the compare.  I'm
+  // not sure we can do much, as we've lost the predicate information with
+  // the compare instruction -- we have a 4-bit CR but don't know which bit
+  // to test here.
+  return false;
+}
+
+// Attempt to emit a compare of the two source values.  Signed and unsigned
+// comparisons are supported.  Return false if we can't handle it.
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
+                             bool IsZExt, unsigned DestReg) {
+  Type *Ty = SrcValue1->getType();
+  EVT SrcEVT = TLI.getValueType(Ty, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits())
+    return false;
+
+  // See if operand 2 is an immediate encodeable in the compare.
+  // FIXME: Operands are not in canonical order at -O0, so an immediate
+  // operand in position 1 is a lost opportunity for now.  We are
+  // similar to ARM in this regard.
+  long Imm = 0;
+  bool UseImm = false;
+
+  // Only 16-bit integer constants can be represented in compares for 
+  // PowerPC.  Others will be materialized into a register.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
+    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue();
+      if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
+        UseImm = true;
+    }
+  }
+
+  unsigned CmpOpc;
+  bool NeedsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    case MVT::f32:
+      CmpOpc = PPC::FCMPUS;
+      break;
+    case MVT::f64:
+      CmpOpc = PPC::FCMPUD;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      NeedsExt = true;
+      // Intentional fall-through.
+    case MVT::i32:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
+      break;
+    case MVT::i64:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(SrcValue1);
+  if (SrcReg1 == 0)
+    return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(SrcValue2);
+    if (SrcReg2 == 0)
+      return false;
+  }
+
+  if (NeedsExt) {
+    unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
+      return false;
+    SrcReg1 = ExtReg;
+
+    if (!UseImm) {
+      unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+      if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
+        return false;
+      SrcReg2 = ExtReg;
+    }
+  }
+
+  if (!UseImm)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addReg(SrcReg2);
+  else
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addImm(Imm);
+
+  return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool PPCFastISel::SelectFPExt(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // No code is generated for a FP extend.
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // Round the result to single precision.
+  unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
+    .addReg(SrcReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+// FIXME: The code here is sloppy for the 4-byte case.  Can use a 4-byte
+// stack slot and 4-byte store/load sequence.  Or just sext the 4-byte
+// case to 8 bytes which produces tighter code but wastes stack space.
+unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
+                                     bool IsSigned) {
+
+  // If necessary, extend 32-bit int to 64-bit.
+  if (SrcVT == MVT::i32) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return 0;
+    SrcReg = TmpReg;
+  }
+
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the GPR.
+  if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
+    return 0;
+
+  // Load the integer value into an FPR.  The kind of load used depends
+  // on a number of conditions.
+  unsigned LoadOpc = PPC::LFD;
+
+  if (SrcVT == MVT::i32) {
+    if (!IsSigned) {
+      LoadOpc = PPC::LFIWZX;
+      Addr.Offset = 4;
+    } else if (PPCSubTarget->hasLFIWAX()) {
+      LoadOpc = PPC::LFIWAX;
+      Addr.Offset = 4;
+    }
+  }
+
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select an integer-to-floating-point conversion.
+bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
+  MVT DstVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::f32 && DstVT != MVT::f64)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT != MVT::i8  && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i32 && SrcVT != MVT::i64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // We can only lower an unsigned convert if we have the newer
+  // floating-point conversion operations.
+  if (!IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  // FIXME: For now we require the newer floating-point conversion operations
+  // (which are present only on P7 and A2 server models) when converting
+  // to single-precision float.  Otherwise we have to generate a lot of
+  // fiddly code to avoid double rounding.  If necessary, the fiddly code
+  // can be found in PPCTargetLowering::LowerINT_TO_FP().
+  if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  // Extend the input if necessary.
+  if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return false;
+    SrcVT = MVT::i64;
+    SrcReg = TmpReg;
+  }
+
+  // Move the integer value to an FPR.
+  unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+  if (FPReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion.
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  unsigned Opc;
+
+  if (DstVT == MVT::f32)
+    Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
+  else
+    Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+    .addReg(FPReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+                                      unsigned SrcReg, bool IsSigned) {
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  // Note that if have STFIWX available, we could use a 4-byte stack
+  // slot for i32, but this being fast-isel we'll just go with the
+  // easiest code gen possible.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the FPR.
+  if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
+    return 0;
+
+  // Reload it into a GPR.  If we want an i32, modify the address
+  // to have a 4-byte offset so we load from the right place.
+  if (VT == MVT::i32)
+    Addr.Offset = 4;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+  MVT DstVT, SrcVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32 && DstVT != MVT::i64)
+    return false;
+
+  // If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
+  if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Convert f32 to f64 if necessary.  This is just a meaningless copy
+  // to get the register class right.  COPY_TO_REGCLASS is needed since
+  // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
+  const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
+  if (InRC == &PPC::F4RCRegClass) {
+    unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
+      .addReg(SrcReg).addImm(PPC::F8RCRegClassID);
+    SrcReg = TmpReg;
+  }
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
+  unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+  unsigned Opc;
+
+  if (DstVT == MVT::i32)
+    if (IsSigned)
+      Opc = PPC::FCTIWZ;
+    else
+      Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+  else
+    Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+    .addReg(SrcReg);
+
+  // Now move the integer value from a float register to an integer register.
+  unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+  if (IntReg == 0)
+    return false;
+
+  UpdateValueMap(I, IntReg);
+  return true;
+}
+
+// Attempt to fast-select a binary integer operation that isn't already
+// handled automatically.
+bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT  = TLI.getValueType(I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  If there is no register,
+  // make a conservative choice (don't assign R0).
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     &PPC::GPRC_and_GPRC_NOR0RegClass);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
+      break;
+    case ISD::OR:
+      Opc = IsGPRC ? PPC::OR : PPC::OR8;
+      break;
+    case ISD::SUB:
+      Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
+      break;
+  }
+
+  unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // Handle case of small immediate operand.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    const APInt &CIVal = ConstInt->getValue();
+    int Imm = (int)CIVal.getSExtValue();
+    bool UseImm = true;
+    if (isInt<16>(Imm)) {
+      switch (Opc) {
+        default:
+          llvm_unreachable("Missing case!");
+        case PPC::ADD4:
+          Opc = PPC::ADDI;
+          MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+          break;
+        case PPC::ADD8:
+          Opc = PPC::ADDI8;
+          MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+          break;
+        case PPC::OR:
+          Opc = PPC::ORI;
+          break;
+        case PPC::OR8:
+          Opc = PPC::ORI8;
+          break;
+        case PPC::SUBF:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI;
+            MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+            Imm = -Imm;
+          }
+          break;
+        case PPC::SUBF8:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI8;
+            MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+            Imm = -Imm;
+          }
+          break;
+      }
+
+      if (UseImm) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                ResultReg)
+            .addReg(SrcReg1)
+            .addImm(Imm);
+        UpdateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  // Reg-reg case.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  // Reverse operands for subtract-from.
+  if (ISDOpcode == ISD::SUB)
+    std::swap(SrcReg1, SrcReg2);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+    .addReg(SrcReg1).addReg(SrcReg2);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+
+  // Reserve space for the linkage area on the stack.
+  bool isELFv2ABI = PPCSubTarget->isELFv2ABI();
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+                                                          isELFv2ABI);
+  CCInfo.AllocateStack(LinkageSize, 8);
+
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 64);
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register.
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+        llvm_unreachable("Should have bailed before getting here!");
+        break;
+      }
+    }
+
+    // Copy this argument to the appropriate register.
+    unsigned ArgReg;
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+      ArgReg = NextFPR++;
+      ++NextGPR;
+    } else
+      ArgReg = NextGPR++;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ArgReg).addReg(Arg);
+    RegArgs.push_back(ArgReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool IsVarArg) {
+  // Issue CallSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+    .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    CCValAssign &VA = RVLocs[0];
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg = 0;
+
+    if (RetVT == CopyVT) {
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+    }
+
+    assert(ResultReg && "ResultReg unset!");
+    UsedRegs.push_back(SourcePhysReg);
+    UpdateValueMap(I, ResultReg);
+  }
+}
+
+// Attempt to fast-select a call instruction.
+bool PPCFastISel::SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall())
+    return false;
+
+  // Obtain calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool IsVarArg = FTy->isVarArg();
+
+  // Not ready for varargs yet.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CS.arg_size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
+       II != IE; ++II) {
+    // FIXME: ARM does something for intrinsic calls here, check into that.
+
+    unsigned AttrIdx = II - CS.arg_begin() + 1;
+    
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
+        CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
+        CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
+      Flags.setZExt();
+
+    Type *ArgTy = (*II)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(*II);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*II);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Build direct call with NOP for TOC restore.
+  // FIXME: We can and should optimize away the NOP for local calls.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(PPC::BL8_NOP));
+  // Add callee.
+  MIB.addGlobalAddress(GV);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Direct calls in the ELFv2 ABI need the TOC register live into the call.
+  if (PPCSubTarget->isELFv2ABI())
+    MIB.addReg(PPC::X2, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
+
+  // Set all unused physregs defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+// Attempt to fast-select a return instruction.
+bool PPCFastISel::SelectRet(const Instruction *I) {
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+  CallingConv::ID CC = F.getCallingConv();
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, *Context);
+    CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
+    const Value *RV = Ret->getOperand(0);
+    
+    // FIXME: Only one output register for now.
+    if (ValLocs.size() > 1)
+      return false;
+
+    // Special case for returning a constant integer of any size.
+    // Materialize the constant as an i64 and copy it to the return
+    // register.  This avoids an unnecessary extend or truncate.
+    if (isa<ConstantInt>(*RV)) {
+      const Constant *C = cast<Constant>(RV);
+      unsigned SrcReg = PPCMaterializeInt(C, MVT::i64);
+      unsigned RetReg = ValLocs[0].getLocReg();
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
+      RetRegs.push_back(RetReg);
+
+    } else {
+      unsigned Reg = getRegForValue(RV);
+
+      if (Reg == 0)
+        return false;
+
+      // Copy the result values into the output registers.
+      for (unsigned i = 0; i < ValLocs.size(); ++i) {
+
+        CCValAssign &VA = ValLocs[i];
+        assert(VA.isRegLoc() && "Can only return in registers!");
+        RetRegs.push_back(VA.getLocReg());
+        unsigned SrcReg = Reg + VA.getValNo();
+
+        EVT RVEVT = TLI.getValueType(RV->getType());
+        if (!RVEVT.isSimple())
+          return false;
+        MVT RVVT = RVEVT.getSimpleVT();
+        MVT DestVT = VA.getLocVT();
+
+        if (RVVT != DestVT && RVVT != MVT::i8 &&
+            RVVT != MVT::i16 && RVVT != MVT::i32)
+          return false;
+      
+        if (RVVT != DestVT) {
+          switch (VA.getLocInfo()) {
+            default:
+              llvm_unreachable("Unknown loc info!");
+            case CCValAssign::Full:
+              llvm_unreachable("Full value assign but types don't match?");
+            case CCValAssign::AExt:
+            case CCValAssign::ZExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+            case CCValAssign::SExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+          }
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::COPY), RetRegs[i])
+          .addReg(SrcReg);
+      }
+    }
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(PPC::BLR));
+
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+
+  return true;
+}
+
+// Attempt to emit an integer extend of SrcReg into DestReg.  Both
+// signed and zero extensions are supported.  Return false if we
+// can't handle it.
+bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                unsigned DestReg, bool IsZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i64)
+    return false;
+  if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32)
+    return false;
+
+  // Signed extensions use EXTSB, EXTSH, EXTSW.
+  if (!IsZExt) {
+    unsigned Opc;
+    if (SrcVT == MVT::i8)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64;
+    else if (SrcVT == MVT::i16)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64;
+    else {
+      assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
+      Opc = PPC::EXTSW_32_64;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+      .addReg(SrcReg);
+
+  // Unsigned 32-bit extensions use RLWINM.
+  } else if (DestVT == MVT::i32) {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 24;
+    else {
+      assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
+      MB = 16;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM),
+            DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
+
+  // Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
+  } else {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 56;
+    else if (SrcVT == MVT::i16)
+      MB = 48;
+    else
+      MB = 32;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(PPC::RLDICL_32_64), DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select an indirect branch instruction.
+bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8))
+    .addReg(AddrReg);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
+  return true;
+}
+
+// Attempt to fast-select an integer truncate instruction.
+bool PPCFastISel::SelectTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
+    return false;
+
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // The only interesting case is when we need to switch register classes.
+  if (SrcVT == MVT::i64) {
+    unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(SrcReg, 0, PPC::sub_32);
+    SrcReg = ResultReg;
+  }
+
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select an integer extend instruction.
+bool PPCFastISel::SelectIntExt(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool IsZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(SrcTy, true);
+  DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+
+  // If we know the register class needed for the result of this
+  // instruction, use it.  Otherwise pick the register class of the
+  // correct size that does not contain X0/R0, since we don't know
+  // whether downstream uses permit that assignment.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+      &PPC::GPRC_and_GPRC_NOR0RegClass));
+  unsigned ResultReg = createResultReg(RC);
+
+  if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Attempt to fast-select an instruction that wasn't handled by
+// the table-generated machinery.
+bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*IsSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*IsSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*IsSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*IsSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::Call:
+      if (dyn_cast<IntrinsicInst>(I))
+        return false;
+      return SelectCall(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    // Here add other flavors of Instruction::XXX that automated
+    // cases don't catch.  For example, switches are terminators
+    // that aren't yet handled.
+    default:
+      break;
+  }
+  return false;
+}
+
+// Materialize a floating-point constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  // No plans to handle long double here.
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  // All FP constants are loaded from the constant pool.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  assert(Align > 0 && "Unexpectedly missing alignment information!");
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  MachineMemOperand *MMO =
+    FuncInfo.MF->getMachineMemOperand(
+      MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
+      (VT == MVT::f32) ? 4 : 8, Align);
+
+  unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+  unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
+            TmpReg)
+      .addConstantPoolIndex(Idx).addReg(PPC::X2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+      .addImm(0).addReg(TmpReg).addMemOperand(MMO);
+  } else {
+    // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+            TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
+    // But for large code model, we must generate a LDtocL followed
+    // by the LF[SD].
+    if (CModel == CodeModel::Large) {
+      unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
+              TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+        .addImm(0).addReg(TmpReg2);
+    } else 
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+        .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
+        .addReg(TmpReg)
+        .addMemOperand(MMO);
+  }
+
+  return DestReg;
+}
+
+// Materialize the address of a global value into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+  assert(VT == MVT::i64 && "Non-address!");
+  const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // Global values may be plain old object addresses, TLS object
+  // addresses, constant pool entries, or jump tables.  How we generate
+  // code for these may depend on small, medium, or large code model.
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  // FIXME: Jump tables are not yet required because fast-isel doesn't
+  // handle switches; if that changes, we need them as well.  For now,
+  // what follows assumes everything's a generic (or TLS) global address.
+
+  // FIXME: We don't yet handle the complexity of TLS.
+  if (GV->isThreadLocal())
+    return 0;
+
+  // For small code model, generate a simple TOC load.
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
+            DestReg)
+        .addGlobalAddress(GV)
+        .addReg(PPC::X2);
+  else {
+    // If the address is an externally defined symbol, a symbol with common
+    // or externally available linkage, a non-local function address, or a
+    // jump table address (not yet needed), or if we are generating code
+    // for large code model, we generate:
+    //       LDtocL(GV, ADDIStocHA(%X2, GV))
+    // Otherwise we generate:
+    //       ADDItocL(ADDIStocHA(%X2, GV), GV)
+    // Either way, start with the ADDIStocHA:
+    unsigned HighPartReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+            HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
+
+    // If/when switches are implemented, jump tables should be handled
+    // on the "if" path here.
+    if (CModel == CodeModel::Large ||
+        (GV->getType()->getElementType()->isFunctionTy() &&
+         (GV->isDeclaration() || GV->isWeakForLinker())) ||
+        GV->isDeclaration() || GV->hasCommonLinkage() ||
+        GV->hasAvailableExternallyLinkage())
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
+              DestReg).addGlobalAddress(GV).addReg(HighPartReg);
+    else
+      // Otherwise generate the ADDItocL.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL),
+              DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+  }
+
+  return DestReg;
+}
+
+// Materialize a 32-bit integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
+                                             const TargetRegisterClass *RC) {
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  unsigned ResultReg = createResultReg(RC);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  if (isInt<16>(Imm))
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg)
+      .addImm(Imm);
+  else if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
+      .addImm(Hi);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg)
+      .addReg(TmpReg).addImm(Lo);
+  } else
+    // Just Hi bits.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
+      .addImm(Hi);
+  
+  return ResultReg;
+}
+
+// Materialize a 64-bit integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
+                                             const TargetRegisterClass *RC) {
+  unsigned Remainder = 0;
+  unsigned Shift = 0;
+
+  // If the value doesn't fit in 32 bits, see if we can shift it
+  // so that it fits in 32 bits.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    if (isInt<32>(ImmSh))
+      Imm = ImmSh;
+    else {
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Handle the high-order 32 bits (if shifted) or the whole 32 bits
+  // (if not shifted).
+  unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC);
+  if (!Shift)
+    return TmpReg1;
+
+  // If upper 32 bits were not zero, we've built them and need to shift
+  // them into place.
+  unsigned TmpReg2;
+  if (Imm) {
+    TmpReg2 = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR),
+            TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift);
+  } else
+    TmpReg2 = TmpReg1;
+
+  unsigned TmpReg3, Hi, Lo;
+  if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+    TmpReg3 = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8),
+            TmpReg3).addReg(TmpReg2).addImm(Hi);
+  } else
+    TmpReg3 = TmpReg2;
+
+  if ((Lo = Remainder & 0xFFFF)) {
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8),
+            ResultReg).addReg(TmpReg3).addImm(Lo);
+    return ResultReg;
+  }
+
+  return TmpReg3;
+}
+
+
+// Materialize an integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) {
+  // If we're using CR bit registers for i1 values, handle that as a special
+  // case first.
+  if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+    const ConstantInt *CI = cast<ConstantInt>(C);
+    unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+    return ImmReg;
+  }
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
+      VT != MVT::i8 && VT != MVT::i1) 
+    return 0;
+
+  const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+                                   &PPC::GPRCRegClass);
+
+  // If the constant is in range, use a load-immediate.
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  if (isInt<16>(CI->getSExtValue())) {
+    unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
+    unsigned ImmReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
+      .addImm(CI->getSExtValue());
+    return ImmReg;
+  }
+
+  // Construct the constant piecewise.
+  int64_t Imm = CI->getZExtValue();
+
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else if (VT == MVT::i32)
+    return PPCMaterialize32BitInt(Imm, RC);
+
+  return 0;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return PPCMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return PPCMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return PPCMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// Materialize the address created by an alloca into a register, and
+// return the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(SI->second).addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+// Fold loads into extends when possible.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load.  The folding only picks up one.  Extend this
+// to check subsequent instructions for the same pattern and remove
+// them.  Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent.  Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
+bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  bool IsZExt = false;
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+
+    case PPC::RLDICL:
+    case PPC::RLDICL_32_64: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 56) ||
+          (VT == MVT::i16 && MB <= 48) ||
+          (VT == MVT::i32 && MB <= 32))
+        break;
+      return false;
+    }
+
+    case PPC::RLWINM:
+    case PPC::RLWINM8: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 24) ||
+          (VT == MVT::i16 && MB <= 16))
+        break;
+      return false;
+    }
+
+    case PPC::EXTSB:
+    case PPC::EXTSB8:
+    case PPC::EXTSB8_32_64:
+      /* There is no sign-extending load-byte instruction. */
+      return false;
+
+    case PPC::EXTSH:
+    case PPC::EXTSH8:
+    case PPC::EXTSH8_32_64: {
+      if (VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+
+    case PPC::EXTSW:
+    case PPC::EXTSW_32_64: {
+      if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(LI->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+
+  if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
+    return false;
+
+  MI->eraseFromParent();
+  return true;
+}
+
+// Attempt to lower call arguments in a faster way than done by
+// the selection DAG code.
+bool PPCFastISel::FastLowerArguments() {
+  // Defer to normal argument lowering for now.  It's reasonably
+  // efficient.  Consider doing something like ARM to handle the
+  // case where all args fit in registers, no varargs, no float
+  // or vector args.
+  return false;
+}
+
+// Handle materializing integer constants into a register.  This is not
+// automatically generated for PowerPC, so must be explicitly created here.
+unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+  
+  if (Opc != ISD::Constant)
+    return 0;
+
+  // If we're using CR bit registers for i1 values, handle that as a special
+  // case first.
+  if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+    unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+    return ImmReg;
+  }
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
+      VT != MVT::i8 && VT != MVT::i1) 
+    return 0;
+
+  const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+                                   &PPC::GPRCRegClass);
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else
+    return PPCMaterialize32BitInt(Imm, RC);
+}
+
+// Override for ADDI and ADDI8 to set the correct register class
+// on RHS operand 0.  The automatic infrastructure naively assumes
+// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
+// for these cases.  At the moment, none of the other automatically
+// generated RI instructions require special treatment.  However, once
+// SelectSelect is implemented, "isel" requires similar handling.
+//
+// Also be conservative about the output register class.  Avoid
+// assigning R0 or X0 to the output register for GPRC and G8RC
+// register classes, as any such result could be used in ADDI, etc.,
+// where those regs have another meaning.
+unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  if (MachineInstOpcode == PPC::ADDI)
+    MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
+  else if (MachineInstOpcode == PPC::ADDI8)
+    MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC,
+                                   Op0, Op0IsKill, Imm);
+}
+
+// Override for instructions with one register operand to avoid use of
+// R0/X0.  The automatic infrastructure isn't aware of the context so
+// we must be conservative.
+unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass* RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+}
+
+// Override for instructions with two register operands to avoid use
+// of R0/X0.  The automatic infrastructure isn't aware of the context
+// so we must be conservative.
+unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass* RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
+                                   Op1, Op1IsKill);
+}
+
+namespace llvm {
+  // Create the fast instruction selector for PowerPC64 ELF.
+  FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                const TargetLibraryInfo *LibInfo) {
+    const TargetMachine &TM = FuncInfo.MF->getTarget();
+
+    // Only available on 64-bit ELF for now.
+    const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
+    if (Subtarget->isPPC64() && Subtarget->isSVR4ABI())
+      return new PPCFastISel(FuncInfo, LibInfo);
+
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
new file mode 100644
index 0000000..b2577a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -0,0 +1,1604 @@
+//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PPC implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+/// VRRegNo - Map from a numbered VR register to its enum value.
+///
+static const uint16_t VRRegNo[] = {
+ PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
+ PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+                          (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0),
+      Subtarget(STI) {}
+
+// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
+const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
+    unsigned &NumEntries) const {
+  if (Subtarget.isDarwinABI()) {
+    NumEntries = 1;
+    if (Subtarget.isPPC64()) {
+      static const SpillSlot darwin64Offsets = {PPC::X31, -8};
+      return &darwin64Offsets;
+    } else {
+      static const SpillSlot darwinOffsets = {PPC::R31, -4};
+      return &darwinOffsets;
+    }
+  }
+
+  // Early exit if not using the SVR4 ABI.
+  if (!Subtarget.isSVR4ABI()) {
+    NumEntries = 0;
+    return nullptr;
+  }
+
+  // Note that the offsets here overlap, but this is fixed up in
+  // processFunctionBeforeFrameFinalized.
+
+  static const SpillSlot Offsets[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::R31, -4},
+      {PPC::R30, -8},
+      {PPC::R29, -12},
+      {PPC::R28, -16},
+      {PPC::R27, -20},
+      {PPC::R26, -24},
+      {PPC::R25, -28},
+      {PPC::R24, -32},
+      {PPC::R23, -36},
+      {PPC::R22, -40},
+      {PPC::R21, -44},
+      {PPC::R20, -48},
+      {PPC::R19, -52},
+      {PPC::R18, -56},
+      {PPC::R17, -60},
+      {PPC::R16, -64},
+      {PPC::R15, -68},
+      {PPC::R14, -72},
+
+      // CR save area offset.  We map each of the nonvolatile CR fields
+      // to the slot for CR2, which is the first of the nonvolatile CR
+      // fields to be assigned, so that we only allocate one save slot.
+      // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+      {PPC::CR2, -4},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}};
+
+  static const SpillSlot Offsets64[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::X31, -8},
+      {PPC::X30, -16},
+      {PPC::X29, -24},
+      {PPC::X28, -32},
+      {PPC::X27, -40},
+      {PPC::X26, -48},
+      {PPC::X25, -56},
+      {PPC::X24, -64},
+      {PPC::X23, -72},
+      {PPC::X22, -80},
+      {PPC::X21, -88},
+      {PPC::X20, -96},
+      {PPC::X19, -104},
+      {PPC::X18, -112},
+      {PPC::X17, -120},
+      {PPC::X16, -128},
+      {PPC::X15, -136},
+      {PPC::X14, -144},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}};
+
+  if (Subtarget.isPPC64()) {
+    NumEntries = array_lengthof(Offsets64);
+
+    return Offsets64;
+  } else {
+    NumEntries = array_lengthof(Offsets);
+
+    return Offsets;
+  }
+}
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function.  Remove all of the VRSAVE related code from the function.
+/// FIXME: The removal of the code results in a compile failure at -O0 when the
+/// function contains a function call, as the GPR containing original VRSAVE
+/// contents is spilled and reloaded around the call.  Without the prolog code,
+/// the spill instruction refers to an undefined register.  This code needs
+/// to account for all uses of that GPR.
+static void RemoveVRSaveCode(MachineInstr *MI) {
+  MachineBasicBlock *Entry = MI->getParent();
+  MachineFunction *MF = Entry->getParent();
+
+  // We know that the MTVRSAVE instruction immediately follows MI.  Remove it.
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+  MBBI->eraseFromParent();
+
+  bool RemovedAllMTVRSAVEs = true;
+  // See if we can find and remove the MTVRSAVE instruction from all of the
+  // epilog blocks.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (!I->empty() && I->back().isReturn()) {
+      bool FoundIt = false;
+      for (MBBI = I->end(); MBBI != I->begin(); ) {
+        --MBBI;
+        if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+          MBBI->eraseFromParent();  // remove it.
+          FoundIt = true;
+          break;
+        }
+      }
+      RemovedAllMTVRSAVEs &= FoundIt;
+    }
+  }
+
+  // If we found and removed all MTVRSAVE instructions, remove the read of
+  // VRSAVE as well.
+  if (RemovedAllMTVRSAVEs) {
+    MBBI = MI;
+    assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+    --MBBI;
+    assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+    MBBI->eraseFromParent();
+  }
+
+  // Finally, nuke the UPDATE_VRSAVE.
+  MI->eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector.  Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI->getParent()->getParent();
+  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned UsedRegMask = 0;
+  for (unsigned i = 0; i != 32; ++i)
+    if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
+      UsedRegMask |= 1 << (31-i);
+
+  // Live in and live out values already must be in the mask, so don't bother
+  // marking them.
+  for (MachineRegisterInfo::livein_iterator
+       I = MF->getRegInfo().livein_begin(),
+       E = MF->getRegInfo().livein_end(); I != E; ++I) {
+    unsigned RegNo = TRI->getEncodingValue(I->first);
+    if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+
+  // Live out registers appear as use operands on return instructions.
+  for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
+       UsedRegMask != 0 && BI != BE; ++BI) {
+    const MachineBasicBlock &MBB = *BI;
+    if (MBB.empty() || !MBB.back().isReturn())
+      continue;
+    const MachineInstr &Ret = MBB.back();
+    for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
+      const MachineOperand &MO = Ret.getOperand(I);
+      if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg()))
+        continue;
+      unsigned RegNo = TRI->getEncodingValue(MO.getReg());
+      UsedRegMask &= ~(1 << (31-RegNo));
+    }
+  }
+
+  // If no registers are used, turn this into a copy.
+  if (UsedRegMask == 0) {
+    // Remove all VRSAVE code.
+    RemoveVRSaveCode(MI);
+    return;
+  }
+
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask);
+  } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+  } else {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+
+    BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+      .addReg(DstReg, RegState::Kill)
+      .addImm(UsedRegMask & 0xFFFF);
+  }
+
+  // Remove the old UPDATE_VRSAVE instruction.
+  MI->eraseFromParent();
+}
+
+static bool spillsCR(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->isCRSpilled();
+}
+
+static bool spillsVRSAVE(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->isVRSAVESpilled();
+}
+
+static bool hasSpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasSpills();
+}
+
+static bool hasNonRISpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasNonRISpills();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
+                                                bool UpdateMF,
+                                                bool UseEstimate) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize =
+    UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
+
+  // Get stack alignments. The frame must be aligned to the greatest of these:
+  unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+  unsigned MaxAlign = MFI->getMaxAlignment(); // algmt required by data in frame
+  unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
+
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  // If we are a leaf function, and use up to 224 bytes of stack space,
+  // don't have a frame pointer, calls, or dynamic alloca then we do not need
+  // to adjust the stack pointer (we fit in the Red Zone).
+  // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
+  // stackless code if all local vars are reg-allocated.
+  bool DisableRedZone = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoRedZone);
+  if (!DisableRedZone &&
+      (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
+       !Subtarget.isSVR4ABI() ||                   //   allocated locals.
+        FrameSize == 0) &&
+      FrameSize <= 224 &&                          // Fits in red zone.
+      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
+      !MFI->adjustsStack() &&                      // No calls.
+      !RegInfo->hasBasePointer(MF)) { // No special alignment.
+    // No need for frame
+    if (UpdateMF)
+      MFI->setStackSize(0);
+    return 0;
+  }
+
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+  // Maximum call frame needs to be at least big enough for linkage area.
+  unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(),
+                                             Subtarget.isDarwinABI(),
+                                             Subtarget.isELFv2ABI());
+  maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+  // Update maximum call frame size.
+  if (UpdateMF)
+    MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+  // Update frame info.
+  if (UpdateMF)
+    MFI->setStackSize(FrameSize);
+
+  return FrameSize;
+}
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.
+bool PPCFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // FIXME: This is pretty much broken by design: hasFP() might be called really
+  // early, before the stack layout was calculated and thus hasFP() might return
+  // true or false here depending on the time of call.
+  return (MFI->getStackSize()) && needsFP(MF);
+}
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Naked functions have no stack frame pushed, so we don't have a frame
+  // pointer.
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::Naked))
+    return false;
+
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MFI->hasVarSizedObjects() ||
+    (MF.getTarget().Options.GuaranteedTailCallOpt &&
+     MF.getInfo<PPCFunctionInfo>()->hasFastCall());
+}
+
+void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
+  bool is31 = needsFP(MF);
+  unsigned FPReg  = is31 ? PPC::R31 : PPC::R1;
+  unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
+
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  unsigned BPReg  = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
+  unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI)
+    for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) {
+      --MBBI;
+      for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+        MachineOperand &MO = MBBI->getOperand(I);
+        if (!MO.isReg())
+          continue;
+
+        switch (MO.getReg()) {
+        case PPC::FP:
+          MO.setReg(FPReg);
+          break;
+        case PPC::FP8:
+          MO.setReg(FP8Reg);
+          break;
+        case PPC::BP:
+          MO.setReg(BPReg);
+          break;
+        case PPC::BP8:
+          MO.setReg(BP8Reg);
+          break;
+
+        }
+      }
+    }
+}
+
+void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  DebugLoc dl;
+  bool needsFrameMoves = MMI.hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
+  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get the ABI.
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  assert((isDarwinABI || isSVR4ABI) &&
+         "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+
+  // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
+  // process it.
+  if (!isSVR4ABI)
+    for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+      if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+        HandleVRSaveUpdate(MBBI, TII);
+        break;
+      }
+    }
+
+  // Move MBBI back to the beginning of the function.
+  MBBI = MBB.begin();
+
+  // Work out frame sizes.
+  unsigned FrameSize = determineFrameLayout(MF);
+  int NegFrameSize = -FrameSize;
+  if (!isInt<32>(NegFrameSize))
+    llvm_unreachable("Unhandled stack size!");
+
+  if (MFI->isFrameAddressTaken())
+    replaceFPWithRealFP(MF);
+
+  // Check if the link register (LR) must be saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+  // Do we have a frame pointer and/or base pointer for this function?
+  bool HasFP = hasFP(MF);
+  bool HasBP = RegInfo->hasBasePointer(MF);
+
+  unsigned SPReg       = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg       = RegInfo->getBaseRegister(MF);
+  unsigned FPReg       = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned LRReg       = isPPC64 ? PPC::LR8 : PPC::LR;
+  unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  //  ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
+  const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
+                                                : PPC::MFLR );
+  const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD
+                                                 : PPC::STW );
+  const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU
+                                                     : PPC::STWU );
+  const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX
+                                                        : PPC::STWUX);
+  const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8
+                                                          : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8
+                                                 : PPC::ORI );
+  const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+                                              : PPC::OR );
+  const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8
+                                                            : PPC::SUBFC);
+  const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
+                                                               : PPC::SUBFIC);
+
+  // Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
+  // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
+  // Red Zone, an asynchronous event (a form of "callee") could claim a frame &
+  // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR.
+  assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
+         "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
+
+  int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+
+  int FPOffset = 0;
+  if (HasFP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = FFI->getObjectOffset(FPIndex);
+    } else {
+      FPOffset =
+          PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    }
+  }
+
+  int BPOffset = 0;
+  if (HasBP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int BPIndex = FI->getBasePointerSaveIndex();
+      assert(BPIndex && "No Base Pointer Save Slot!");
+      BPOffset = FFI->getObjectOffset(BPIndex);
+    } else {
+      BPOffset =
+        PPCFrameLowering::getBasePointerSaveOffset(isPPC64,
+                                                   isDarwinABI,
+                                                   isPIC);
+    }
+  }
+
+  // Get stack alignments.
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  if (HasBP && MaxAlign > 1)
+    assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+           "Invalid alignment!");
+
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
+  bool isLargeFrame = !isInt<16>(NegFrameSize);
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
+
+  assert((isPPC64 || MustSaveCRs.empty()) &&
+         "Prologue CR saving supported only in 64-bit mode");
+
+  if (!MustSaveCRs.empty()) { // will only occur for PPC64
+    // FIXME: In the ELFv2 ABI, we are not required to save all CR fields.
+    // If only one or two CR fields are clobbered, it could be more
+    // efficient to use mfocrf to selectively save just those fields.
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
+  }
+
+  if (HasFP)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(FPReg)
+      .addImm(FPOffset)
+      .addReg(SPReg);
+
+  if (HasBP)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(BPReg)
+      .addImm(BPOffset)
+      .addReg(SPReg);
+
+  if (MustSaveLR)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(ScratchReg)
+      .addImm(LROffset)
+      .addReg(SPReg);
+
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+      .addReg(TempReg, getKillRegState(true))
+      .addImm(8)
+      .addReg(SPReg);
+
+  // Skip the rest if this is a leaf function & all spills fit in the Red Zone.
+  if (!FrameSize) return;
+
+  // Adjust stack pointer: r1 += NegFrameSize.
+  // If there is a preferred stack alignment, align R1 now
+
+  if (HasBP) {
+    // Save a copy of r1 as the base pointer.
+    BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
+  }
+
+  if (HasBP && MaxAlign > 1) {
+    if (isPPC64)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+        .addReg(SPReg)
+        .addImm(0)
+        .addImm(64 - Log2_32(MaxAlign));
+    else // PPC32...
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+        .addReg(SPReg)
+        .addImm(0)
+        .addImm(32 - Log2_32(MaxAlign))
+        .addImm(31);
+    if (!isLargeFrame) {
+      BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addImm(NegFrameSize);
+    } else {
+      BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
+        .addImm(NegFrameSize >> 16);
+      BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+        .addReg(TempReg, RegState::Kill)
+        .addImm(NegFrameSize & 0xFFFF);
+      BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addReg(TempReg, RegState::Kill);
+    }
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
+
+  } else if (!isLargeFrame) {
+    BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+      .addReg(SPReg)
+      .addImm(NegFrameSize)
+      .addReg(SPReg);
+
+  } else {
+    BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+      .addImm(NegFrameSize >> 16);
+    BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+      .addReg(ScratchReg, RegState::Kill)
+      .addImm(NegFrameSize & 0xFFFF);
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
+  }
+
+  // Add the "machine moves" for the instructions we generated above, but in
+  // reverse order.
+  if (needsFrameMoves) {
+    // Show update of SP.
+    assert(NegFrameSize);
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+
+    if (HasFP) {
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, FPOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    if (HasBP) {
+      unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, BPOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    if (MustSaveLR) {
+      unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, LROffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  // If there is a frame pointer, copy R1 into R31
+  if (HasFP) {
+    BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
+
+    if (needsFrameMoves) {
+      // Mark effective beginning of when frame pointer is ready.
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  if (needsFrameMoves) {
+    // Add callee saved registers to move list.
+    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+
+      // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
+      // subregisters of CR2. We just need to emit a move of CR2.
+      if (PPC::CRBITRCRegClass.contains(Reg))
+        continue;
+
+      // For SVR4, don't emit a move for the CR spill slot if we haven't
+      // spilled CRs.
+      if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+          && MustSaveCRs.empty())
+        continue;
+
+      // For 64-bit SVR4 when we have spilled CRs, the spill location
+      // is SP+8, not a frame-relative slot.
+      if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for
+        // the whole CR word.  In the ELFv2 ABI, every CR that was
+        // actually saved gets its own CFI record.
+        unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
+        unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(CRReg, true), 8));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+        continue;
+      }
+
+      int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+}
+
+void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
+                                MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI != MBB.end() && "Returning block has no terminator");
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl;
+
+  assert((RetOpcode == PPC::BLR ||
+          RetOpcode == PPC::TCRETURNri ||
+          RetOpcode == PPC::TCRETURNdi ||
+          RetOpcode == PPC::TCRETURNai ||
+          RetOpcode == PPC::TCRETURNri8 ||
+          RetOpcode == PPC::TCRETURNdi8 ||
+          RetOpcode == PPC::TCRETURNai8) &&
+         "Can only insert epilog into returning blocks");
+
+  // Get alignment info so we know how to restore the SP.
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes allocated from the FrameInfo.
+  int FrameSize = MFI->getStackSize();
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get the ABI.
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
+
+  // Check if the link register (LR) has been saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+  // Do we have a frame pointer and/or base pointer for this function?
+  bool HasFP = hasFP(MF);
+  bool HasBP = RegInfo->hasBasePointer(MF);
+
+  unsigned SPReg      = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg      = RegInfo->getBaseRegister(MF);
+  unsigned FPReg      = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
+                                                 : PPC::MTLR );
+  const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
+                                                 : PPC::LWZ );
+  const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8
+                                                           : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8
+                                                  : PPC::ORI );
+  const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8
+                                                   : PPC::ADDI );
+  const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
+                                                : PPC::ADD4 );
+
+  int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+
+  int FPOffset = 0;
+  if (HasFP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = FFI->getObjectOffset(FPIndex);
+    } else {
+      FPOffset =
+          PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    }
+  }
+
+  int BPOffset = 0;
+  if (HasBP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int BPIndex = FI->getBasePointerSaveIndex();
+      assert(BPIndex && "No Base Pointer Save Slot!");
+      BPOffset = FFI->getObjectOffset(BPIndex);
+    } else {
+      BPOffset =
+        PPCFrameLowering::getBasePointerSaveOffset(isPPC64,
+                                                   isDarwinABI,
+                                                   isPIC);
+    }
+  }
+
+  bool UsesTCRet =  RetOpcode == PPC::TCRETURNri ||
+    RetOpcode == PPC::TCRETURNdi ||
+    RetOpcode == PPC::TCRETURNai ||
+    RetOpcode == PPC::TCRETURNri8 ||
+    RetOpcode == PPC::TCRETURNdi8 ||
+    RetOpcode == PPC::TCRETURNai8;
+
+  if (UsesTCRet) {
+    int MaxTCRetDelta = FI->getTailCallSPDelta();
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+    assert(StackAdjust.isImm() && "Expecting immediate value.");
+    // Adjust stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int Delta = StackAdj - MaxTCRetDelta;
+    assert((Delta >= 0) && "Delta must be positive");
+    if (MaxTCRetDelta>0)
+      FrameSize += (StackAdj +Delta);
+    else
+      FrameSize += StackAdj;
+  }
+
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple LD/LWZ immediate offset operand.
+  bool isLargeFrame = !isInt<16>(FrameSize);
+
+  if (FrameSize) {
+    // In the prologue, the loaded (or persistent) stack pointer value is offset
+    // by the STDU/STDUX/STWU/STWUX instruction.  Add this offset back now.
+
+    // If this function contained a fastcc call and GuaranteedTailCallOpt is
+    // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+    // call which invalidates the stack pointer value in SP(0). So we use the
+    // value of R31 in this case.
+    if (FI->hasFastCall()) {
+      assert(HasFP && "Expecting a valid frame pointer.");
+      if (!isLargeFrame) {
+        BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+          .addReg(FPReg).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+          .addImm(FrameSize >> 16);
+        BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(FrameSize & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
+          .addReg(FPReg)
+          .addReg(ScratchReg);
+      }
+    } else if (!isLargeFrame && !HasBP && !MFI->hasVarSizedObjects()) {
+      BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+        .addReg(SPReg)
+        .addImm(FrameSize);
+    } else {
+      BuildMI(MBB, MBBI, dl, LoadInst, SPReg)
+        .addImm(0)
+        .addReg(SPReg);
+    }
+
+  }
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+      .addImm(LROffset)
+      .addReg(SPReg);
+
+  assert((isPPC64 || MustSaveCRs.empty()) &&
+         "Epilogue CR restoring supported only in 64-bit mode");
+
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+      .addImm(8)
+      .addReg(SPReg);
+
+  if (HasFP)
+    BuildMI(MBB, MBBI, dl, LoadInst, FPReg)
+      .addImm(FPOffset)
+      .addReg(SPReg);
+
+  if (HasBP)
+    BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
+      .addImm(BPOffset)
+      .addReg(SPReg);
+
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+        .addReg(TempReg, getKillRegState(i == e-1));
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
+
+  // Callee pop calling convention. Pop parameter/linkage area. Used for tail
+  // call optimization
+  if (MF.getTarget().Options.GuaranteedTailCallOpt && RetOpcode == PPC::BLR &&
+      MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+     PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+     unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+
+     if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
+       BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+         .addReg(SPReg).addImm(CallerAllocatedAmt);
+     } else {
+       BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+          .addImm(CallerAllocatedAmt >> 16);
+       BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(CallerAllocatedAmt & 0xFFFF);
+       BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
+          .addReg(FPReg)
+          .addReg(ScratchReg);
+     }
+  } else if (RetOpcode == PPC::TCRETURNdi) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri) {
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+  } else if (RetOpcode == PPC::TCRETURNai) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+  } else if (RetOpcode == PPC::TCRETURNdi8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+  } else if (RetOpcode == PPC::TCRETURNai8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+  }
+}
+
+/// MustSaveLR - Return true if this function requires that we save the LR
+/// register onto the stack in the prolog and restore it in the epilog of the
+/// function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+  const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>();
+
+  // We need a save/restore of LR if there is any def of LR (which is
+  // defined by calls, including the PIC setup sequence), or if there is
+  // some use of the LR stack slot (e.g. for builtin_return_address).
+  // (LR comes in 32 and 64 bit versions.)
+  MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR);
+  return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
+}
+
+void
+PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                   RegScavenger *) const {
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  //  Save and clear the LR state.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  unsigned LR = RegInfo->getRARegister();
+  FI->setMustSaveLR(MustSaveLR(MF, LR));
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.setPhysRegUnused(LR);
+
+  //  Save R31 if necessary
+  int FPSI = FI->getFramePointerSaveIndex();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI  = Subtarget.isDarwinABI();
+  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI && needsFP(MF)) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+
+  int BPSI = FI->getBasePointerSaveIndex();
+  if (!BPSI && RegInfo->hasBasePointer(MF)) {
+    int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI, isPIC);
+    // Allocate the frame index for the base pointer save area.
+    BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true);
+    // Save the result.
+    FI->setBasePointerSaveIndex(BPSI);
+  }
+
+  // Reserve stack space to move the linkage area to in case of a tail call.
+  int TCSPDelta = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (TCSPDelta = FI->getTailCallSPDelta()) < 0) {
+    MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true);
+  }
+
+  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
+  // function uses CR 2, 3, or 4.
+  if (!isPPC64 && !isDarwinABI &&
+      (MRI.isPhysRegUsed(PPC::CR2) ||
+       MRI.isPhysRegUsed(PPC::CR3) ||
+       MRI.isPhysRegUsed(PPC::CR4))) {
+    int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+    FI->setCRSpillFrameIndex(FrameIdx);
+  }
+}
+
+void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // Early exit if not using the SVR4 ABI.
+  if (!Subtarget.isSVR4ABI()) {
+    addScavengingSpillSlot(MF, RS);
+    return;
+  }
+
+  // Get callee saved register information.
+  MachineFrameInfo *FFI = MF.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo();
+
+  // Early exit if no callee saved registers are modified!
+  if (CSI.empty() && !needsFP(MF)) {
+    addScavengingSpillSlot(MF, RS);
+    return;
+  }
+
+  unsigned MinGPR = PPC::R31;
+  unsigned MinG8R = PPC::X31;
+  unsigned MinFPR = PPC::F31;
+  unsigned MinVR = PPC::V31;
+
+  bool HasGPSaveArea = false;
+  bool HasG8SaveArea = false;
+  bool HasFPSaveArea = false;
+  bool HasVRSAVESaveArea = false;
+  bool HasVRSaveArea = false;
+
+  SmallVector<CalleeSavedInfo, 18> GPRegs;
+  SmallVector<CalleeSavedInfo, 18> G8Regs;
+  SmallVector<CalleeSavedInfo, 18> FPRegs;
+  SmallVector<CalleeSavedInfo, 18> VRegs;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (PPC::GPRCRegClass.contains(Reg)) {
+      HasGPSaveArea = true;
+
+      GPRegs.push_back(CSI[i]);
+
+      if (Reg < MinGPR) {
+        MinGPR = Reg;
+      }
+    } else if (PPC::G8RCRegClass.contains(Reg)) {
+      HasG8SaveArea = true;
+
+      G8Regs.push_back(CSI[i]);
+
+      if (Reg < MinG8R) {
+        MinG8R = Reg;
+      }
+    } else if (PPC::F8RCRegClass.contains(Reg)) {
+      HasFPSaveArea = true;
+
+      FPRegs.push_back(CSI[i]);
+
+      if (Reg < MinFPR) {
+        MinFPR = Reg;
+      }
+    } else if (PPC::CRBITRCRegClass.contains(Reg) ||
+               PPC::CRRCRegClass.contains(Reg)) {
+      ; // do nothing, as we already know whether CRs are spilled
+    } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+      HasVRSAVESaveArea = true;
+    } else if (PPC::VRRCRegClass.contains(Reg)) {
+      HasVRSaveArea = true;
+
+      VRegs.push_back(CSI[i]);
+
+      if (Reg < MinVR) {
+        MinVR = Reg;
+      }
+    } else {
+      llvm_unreachable("Unknown RegisterClass!");
+    }
+  }
+
+  PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+
+  int64_t LowerBound = 0;
+
+  // Take into account stack space reserved for tail calls.
+  int TCSPDelta = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (TCSPDelta = PFI->getTailCallSPDelta()) < 0) {
+    LowerBound = TCSPDelta;
+  }
+
+  // The Floating-point register save area is right below the back chain word
+  // of the previous stack frame.
+  if (HasFPSaveArea) {
+    for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) {
+      int FI = FPRegs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+
+    LowerBound -= (31 - TRI->getEncodingValue(MinFPR) + 1) * 8;
+  }
+
+  // Check whether the frame pointer register is allocated. If so, make sure it
+  // is spilled to the correct offset.
+  if (needsFP(MF)) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getFramePointerSaveIndex();
+    assert(FI && "No Frame Pointer Save Slot!");
+
+    FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+  }
+
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  if (RegInfo->hasBasePointer(MF)) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getBasePointerSaveIndex();
+    assert(FI && "No Base Pointer Save Slot!");
+
+    FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+  }
+
+  // General register save area starts right below the Floating-point
+  // register save area.
+  if (HasGPSaveArea || HasG8SaveArea) {
+    // Move general register save area spill slots down, taking into account
+    // the size of the Floating-point register save area.
+    for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
+      int FI = GPRegs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+
+    // Move general register save area spill slots down, taking into account
+    // the size of the Floating-point register save area.
+    for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
+      int FI = G8Regs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+
+    unsigned MinReg =
+      std::min<unsigned>(TRI->getEncodingValue(MinGPR),
+                         TRI->getEncodingValue(MinG8R));
+
+    if (Subtarget.isPPC64()) {
+      LowerBound -= (31 - MinReg + 1) * 8;
+    } else {
+      LowerBound -= (31 - MinReg + 1) * 4;
+    }
+  }
+
+  // For 32-bit only, the CR save area is below the general register
+  // save area.  For 64-bit SVR4, the CR save area is addressed relative
+  // to the stack pointer and hence does not need an adjustment here.
+  // Only CR2 (the first nonvolatile spilled) has an associated frame
+  // index so that we have a single uniform save area.
+  if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
+    // Adjust the frame index of the CR spill slot.
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+
+      if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
+          // Leave Darwin logic as-is.
+          || (!Subtarget.isSVR4ABI() &&
+              (PPC::CRBITRCRegClass.contains(Reg) ||
+               PPC::CRRCRegClass.contains(Reg)))) {
+        int FI = CSI[i].getFrameIdx();
+
+        FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+      }
+    }
+
+    LowerBound -= 4; // The CR save area is always 4 bytes long.
+  }
+
+  if (HasVRSAVESaveArea) {
+    // FIXME SVR4: Is it actually possible to have multiple elements in CSI
+    //             which have the VRSAVE register class?
+    // Adjust the frame index of the VRSAVE spill slot.
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+
+      if (PPC::VRSAVERCRegClass.contains(Reg)) {
+        int FI = CSI[i].getFrameIdx();
+
+        FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+      }
+    }
+
+    LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
+  }
+
+  if (HasVRSaveArea) {
+    // Insert alignment padding, we need 16-byte alignment.
+    LowerBound = (LowerBound - 15) & ~(15);
+
+    for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
+      int FI = VRegs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+  }
+
+  addScavengingSpillSlot(MF, RS);
+}
+
+void
+PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
+                                         RegScavenger *RS) const {
+  // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
+  // a large stack, which will require scavenging a register to materialize a
+  // large offset.
+
+  // We need to have a scavenger spill slot for spills if the frame size is
+  // large. In case there is no free register for large-offset addressing,
+  // this slot is used for the necessary emergency spill. Also, we need the
+  // slot for dynamic stack allocations.
+
+  // The scavenger might be invoked if the frame offset does not fit into
+  // the 16-bit immediate. We don't know the complete frame size here
+  // because we've not yet computed callee-saved register spills or the
+  // needed alignment padding.
+  unsigned StackSize = determineFrameLayout(MF, false, true);
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MFI->hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
+      hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) {
+    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+    const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+    const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
+    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+
+    // Might we have over-aligned allocas?
+    bool HasAlVars = MFI->hasVarSizedObjects() &&
+                     MFI->getMaxAlignment() > getStackAlignment();
+
+    // These kinds of spills might need two registers.
+    if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars)
+      RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                         RC->getAlignment(),
+                                                         false));
+
+  }
+}
+
+bool
+PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo *TRI) const {
+
+  // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+  // Return false otherwise to maintain pre-existing behavior.
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+  DebugLoc DL;
+  bool CRSpilled = false;
+  MachineInstrBuilder CRMIB;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used.  If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
+    // CR2 through CR4 are the nonvolatile CR fields.
+    bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
+
+    // Add the callee-saved register as live-in; it's killed at the spill.
+    MBB.addLiveIn(Reg);
+
+    if (CRSpilled && IsCRField) {
+      CRMIB.addReg(Reg, RegState::ImplicitKill);
+      continue;
+    }
+
+    // Insert the spill to the stack frame.
+    if (IsCRField) {
+      PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+      if (Subtarget.isPPC64()) {
+        // The actual spill will happen at the start of the prologue.
+        FuncInfo->addMustSaveCR(Reg);
+      } else {
+        CRSpilled = true;
+        FuncInfo->setSpillsCR();
+
+        // 32-bit:  FP-relative.  Note that we made sure CR2-CR4 all have
+        // the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+        CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12)
+                  .addReg(Reg, RegState::ImplicitKill);
+
+        MBB.insert(MI, CRMIB);
+        MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
+                                         .addReg(PPC::R12,
+                                                 getKillRegState(true)),
+                                         CSI[i].getFrameIdx()));
+      }
+    } else {
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, true,
+                              CSI[i].getFrameIdx(), RC, TRI);
+    }
+  }
+  return true;
+}
+
+static void
+restoreCRs(bool isPPC64, bool is31,
+           bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
+           MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+           const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+  DebugLoc DL;
+  unsigned RestoreOp, MoveReg;
+
+  if (isPPC64)
+    // This is handled during epilogue generation.
+    return;
+  else {
+    // 32-bit:  FP-relative
+    MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
+                                             PPC::R12),
+                                     CSI[CSIIndex].getFrameIdx()));
+    RestoreOp = PPC::MTOCRF;
+    MoveReg = PPC::R12;
+  }
+
+  if (CR2Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
+               .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled)));
+
+  if (CR3Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3)
+               .addReg(MoveReg, getKillRegState(!CR4Spilled)));
+
+  if (CR4Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4)
+               .addReg(MoveReg, getKillRegState(true)));
+}
+
+void PPCFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      I->getOpcode() == PPC::ADJCALLSTACKUP) {
+    // Add (actually subtract) back the amount the callee popped on return.
+    if (int CalleeAmt =  I->getOperand(1).getImm()) {
+      bool is64Bit = Subtarget.isPPC64();
+      CalleeAmt *= -1;
+      unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1;
+      unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0;
+      unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI;
+      unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4;
+      unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
+      unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
+      MachineInstr *MI = I;
+      DebugLoc dl = MI->getDebugLoc();
+
+      if (isInt<16>(CalleeAmt)) {
+        BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addImm(CalleeAmt);
+      } else {
+        MachineBasicBlock::iterator MBBI = I;
+        BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CalleeAmt >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CalleeAmt & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addReg(TmpReg);
+      }
+    }
+  }
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+bool
+PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+
+  // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+  // Return false otherwise to maintain pre-existing behavior.
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+  bool CR2Spilled = false;
+  bool CR3Spilled = false;
+  bool CR4Spilled = false;
+  unsigned CSIIndex = 0;
+
+  // Initialize insertion-point logic; we will be restoring in reverse
+  // order of spill.
+  MachineBasicBlock::iterator I = MI, BeforeI = I;
+  bool AtStart = I == MBB.begin();
+
+  if (!AtStart)
+    --BeforeI;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used.  If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
+    if (Reg == PPC::CR2) {
+      CR2Spilled = true;
+      // The spill slot is associated only with CR2, which is the
+      // first nonvolatile spilled.  Save it here.
+      CSIIndex = i;
+      continue;
+    } else if (Reg == PPC::CR3) {
+      CR3Spilled = true;
+      continue;
+    } else if (Reg == PPC::CR4) {
+      CR4Spilled = true;
+      continue;
+    } else {
+      // When we first encounter a non-CR register after seeing at
+      // least one CR register, restore all spilled CRs together.
+      if ((CR2Spilled || CR3Spilled || CR4Spilled)
+          && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        bool is31 = needsFP(*MF);
+        restoreCRs(Subtarget.isPPC64(), is31,
+                   CR2Spilled, CR3Spilled, CR4Spilled,
+                   MBB, I, CSI, CSIIndex);
+        CR2Spilled = CR3Spilled = CR4Spilled = false;
+      }
+
+      // Default behavior for non-CR saves.
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+                               RC, TRI);
+      assert(I != MBB.begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+      }
+
+    // Insert in reverse order.
+    if (AtStart)
+      I = MBB.begin();
+    else {
+      I = BeforeI;
+      ++I;
+    }
+  }
+
+  // If we haven't yet spilled the CRs, do so now.
+  if (CR2Spilled || CR3Spilled || CR4Spilled) {
+    bool is31 = needsFP(*MF);
+    restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
+               MBB, I, CSI, CSIIndex);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
new file mode 100644
index 0000000..c0c7d24
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -0,0 +1,126 @@
+//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_FRAMEINFO_H
+#define POWERPC_FRAMEINFO_H
+
+#include "PPC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class PPCSubtarget;
+
+class PPCFrameLowering: public TargetFrameLowering {
+  const PPCSubtarget &Subtarget;
+
+public:
+  PPCFrameLowering(const PPCSubtarget &STI);
+
+  unsigned determineFrameLayout(MachineFunction &MF,
+                                bool UpdateMF = true,
+                                bool UseEstimate = false) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool needsFP(const MachineFunction &MF) const;
+  void replaceFPWithRealFP(MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+  void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const override { return true; }
+
+  /// getReturnSaveOffset - Return the previous frame offset to save the
+  /// return address.
+  static unsigned getReturnSaveOffset(bool isPPC64, bool isDarwinABI) {
+    if (isDarwinABI)
+      return isPPC64 ? 16 : 8;
+    // SVR4 ABI:
+    return isPPC64 ? 16 : 4;
+  }
+
+  /// getTOCSaveOffset - Return the previous frame offset to save the
+  /// TOC register -- 64-bit SVR4 ABI only.
+  static unsigned getTOCSaveOffset(bool isELFv2ABI) {
+    return isELFv2ABI ? 24 : 40;
+  }
+
+  /// getFramePointerSaveOffset - Return the previous frame offset to save the
+  /// frame pointer.
+  static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
+    // For the Darwin ABI:
+    // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
+    // for saving the frame pointer (if needed.)  While the published ABI has
+    // not used this slot since at least MacOSX 10.2, there is older code
+    // around that does use it, and that needs to continue to work.
+    if (isDarwinABI)
+      return isPPC64 ? -8U : -4U;
+
+    // SVR4 ABI: First slot in the general register save area.
+    return isPPC64 ? -8U : -4U;
+  }
+
+  /// getBasePointerSaveOffset - Return the previous frame offset to save the
+  /// base pointer.
+  static unsigned getBasePointerSaveOffset(bool isPPC64,
+                                           bool isDarwinABI,
+                                           bool isPIC) {
+    if (isDarwinABI)
+      return isPPC64 ? -16U : -8U;
+
+    // SVR4 ABI: First slot in the general register save area.
+    return isPPC64 ? -16U : isPIC ? -12U : -8U;
+  }
+
+  /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+  ///
+  static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI,
+                                 bool isELFv2ABI) {
+    if (isDarwinABI || isPPC64)
+      return (isELFv2ABI ? 4 : 6) * (isPPC64 ? 8 : 4);
+
+    // SVR4 ABI:
+    return 8;
+  }
+
+  const SpillSlot *
+  getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 0000000..d9b242c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,433 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) {
+  // FIXME: Move this.
+  if (isBCTRAfterSet(SU))
+    return true;
+
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (!MCID)
+    return false;
+
+  if (!MCID->mayLoad())
+    return false;
+
+  // SU is a load; for any predecessors in this dispatch group, that are stores,
+  // and with which we have an ordering dependency, return true.
+  for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) {
+    const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit());
+    if (!PredMCID || !PredMCID->mayStore())
+      continue;
+
+    if (!SU->Preds[i].isNormalMemory() && !SU->Preds[i].isBarrier())
+      continue;
+
+    for (unsigned j = 0, je = CurGroup.size(); j != je; ++j)
+      if (SU->Preds[i].getSUnit() == CurGroup[j])
+        return true;
+  }
+
+  return false; 
+}
+
+bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (!MCID)
+    return false;
+
+  if (!MCID->isBranch())
+    return false;
+
+  // SU is a branch; for any predecessors in this dispatch group, with which we
+  // have a data dependence and set the counter register, return true.
+  for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) {
+    const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit());
+    if (!PredMCID || PredMCID->getSchedClass() != PPC::Sched::IIC_SprMTSPR)
+      continue;
+
+    if (SU->Preds[i].isCtrl())
+      continue;
+
+    for (unsigned j = 0, je = CurGroup.size(); j != je; ++j)
+      if (SU->Preds[i].getSUnit() == CurGroup[j])
+        return true;
+  }
+
+  return false; 
+}
+
+// FIXME: Remove this when we don't need this:
+namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } }
+
+// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
+
+bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
+                                                       unsigned &NSlots) {
+  // FIXME: Indirectly, this information is contained in the itinerary, and
+  // we should derive it from there instead of separately specifying it
+  // here.
+  unsigned IIC = MCID->getSchedClass();
+  switch (IIC) {
+  default:
+    NSlots = 1;
+    break;
+  case PPC::Sched::IIC_IntDivW:
+  case PPC::Sched::IIC_IntDivD:
+  case PPC::Sched::IIC_LdStLoadUpd:
+  case PPC::Sched::IIC_LdStLDU:
+  case PPC::Sched::IIC_LdStLFDU:
+  case PPC::Sched::IIC_LdStLFDUX:
+  case PPC::Sched::IIC_LdStLHA:
+  case PPC::Sched::IIC_LdStLHAU:
+  case PPC::Sched::IIC_LdStLWA:
+  case PPC::Sched::IIC_LdStSTDU:
+  case PPC::Sched::IIC_LdStSTFDU:
+    NSlots = 2;
+    break;
+  case PPC::Sched::IIC_LdStLoadUpdX:
+  case PPC::Sched::IIC_LdStLDUX:
+  case PPC::Sched::IIC_LdStLHAUX:
+  case PPC::Sched::IIC_LdStLWARX:
+  case PPC::Sched::IIC_LdStLDARX:
+  case PPC::Sched::IIC_LdStSTDUX:
+  case PPC::Sched::IIC_LdStSTDCX:
+  case PPC::Sched::IIC_LdStSTWCX:
+  case PPC::Sched::IIC_BrMCRX: // mtcr
+  // FIXME: Add sync/isync (here and in the itinerary).
+    NSlots = 4;
+    break;
+  }
+
+  // FIXME: record-form instructions need a different itinerary class.
+  if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1)
+    NSlots = 2;
+
+  switch (IIC) {
+  default:
+    // All multi-slot instructions must come first.
+    return NSlots > 1;
+  case PPC::Sched::IIC_BrCR: // cr logicals
+  case PPC::Sched::IIC_SprMFCR:
+  case PPC::Sched::IIC_SprMFCRF:
+  case PPC::Sched::IIC_SprMTSPR:
+    return true;
+  }
+}
+
+ScheduleHazardRecognizer::HazardType
+PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  if (Stalls == 0 && isLoadAfterStore(SU))
+    return NoopHazard;
+
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  unsigned NSlots;
+  if (MCID && mustComeFirst(MCID, NSlots) && CurSlots)
+    return true;
+
+  return ScoreboardHazardRecognizer::ShouldPreferAnother(SU);
+}
+
+unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  // We only need to fill out a maximum of 5 slots here: The 6th slot could
+  // only be a second branch, and otherwise the next instruction will start a
+  // new group.
+  if (isLoadAfterStore(SU) && CurSlots < 6) {
+    unsigned Directive =
+      DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+    // If we're using a special group-terminating nop, then we need only one.
+    if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+        Directive == PPC::DIR_PWR8 )
+      return 1;
+
+    return 5 - CurSlots;
+  }
+
+  return ScoreboardHazardRecognizer::PreEmitNoops(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (MCID) {
+    if (CurSlots == 5 || (MCID->isBranch() && CurBranches == 1)) {
+      CurGroup.clear();
+      CurSlots = CurBranches = 0;
+    } else {
+      DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
+                      SU->NodeNum << "): ");
+      DEBUG(DAG->dumpNode(SU));
+
+      unsigned NSlots;
+      bool MustBeFirst = mustComeFirst(MCID, NSlots);
+
+      // If this instruction must come first, but does not, then it starts a
+      // new group.
+      if (MustBeFirst && CurSlots) {
+        CurSlots = CurBranches = 0;
+        CurGroup.clear();
+      }
+
+      CurSlots += NSlots;
+      CurGroup.push_back(SU);
+
+      if (MCID->isBranch())
+        ++CurBranches;
+    }
+  }
+
+  return ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() {
+  return ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("Bottom-up scheduling not supported");
+}
+
+void PPCDispatchGroupSBHazardRecognizer::Reset() {
+  CurGroup.clear();
+  CurSlots = CurBranches = 0;
+  return ScoreboardHazardRecognizer::Reset();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
+  unsigned Directive =
+    DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+  // If the group has now filled all of its slots, or if we're using a special
+  // group-terminating nop, the group is complete.
+  if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+      Directive == PPC::DIR_PWR8 || CurSlots == 6)  {
+    CurGroup.clear();
+    CurSlots = CurBranches = 0;
+  } else {
+    CurGroup.push_back(nullptr);
+    ++CurSlots;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor.  Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions.  The PPC970 can dispatch a peak of 4 non-branch and one
+// branch instruction per-cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, & some
+// instructions fill an entire dispatch group.  Additionally, only branches can
+// issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970.  These
+// conditions cause large performance penalties due to misprediction, recovery,
+// and replay logic that has to happen.  These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group.  To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+//   1. Modeling of microcoded instructions.
+//   2. Handling of serialized operations.
+//   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
+PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
+    : DAG(DAG) {
+  EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+  DEBUG(errs() << "=== Start of dispatch group\n");
+  NumIssued = 0;
+
+  // Structural hazard info.
+  HasCTRSet = false;
+  NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                     bool &isFirst, bool &isSingle,
+                                     bool &isCracked,
+                                     bool &isLoad, bool &isStore) {
+  const MCInstrDesc &MCID = DAG.TII->get(Opcode);
+
+  isLoad  = MCID.mayLoad();
+  isStore = MCID.mayStore();
+
+  uint64_t TSFlags = MCID.TSFlags;
+
+  isFirst   = TSFlags & PPCII::PPC970_First;
+  isSingle  = TSFlags & PPCII::PPC970_Single;
+  isCracked = TSFlags & PPCII::PPC970_Cracked;
+  return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from the previously stored pointer
+/// as indicated by StorePtr1/StorePtr2/StoreSize, return true.
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+  const Value *LoadValue) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (LoadValue == StoreValue[i] && LoadOffset == StoreOffset[i])
+      return true;
+
+    // Okay, we don't have an exact match, if this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion for example).
+    if (StoreValue[i] == LoadValue) {
+      // Okay the base pointers match, so we have [c1+r] vs [c2+r].  Check
+      // to see if the load and store actually overlap.
+      if (StoreOffset[i] < LoadOffset) {
+        if (int64_t(StoreOffset[i]+StoreSize[i]) > LoadOffset) return true;
+      } else {
+        if (int64_t(LoadOffset+LoadSize) > StoreOffset[i]) return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return hazard for any non-branch instruction that would
+/// terminate the dispatch group.  We turn NoopHazard for any
+/// instructions that wouldn't terminate the dispatch group that would cause a
+/// pipeline flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return NoHazard;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+
+  switch (InstrType) {
+  default: llvm_unreachable("Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // We can only issue a branch as the last instruction in a group.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && Opcode == PPC::BCTRL)
+    return NoopHazard;
+
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    if (isLoadOfStoredAddress(MO->getSize(),
+                              MO->getOffset(), MO->getValue()))
+      return NoopHazard;
+  }
+
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;
+
+  // Update structural hazard information.
+  if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
+
+  // Track the address stored to.
+  if (isStore && NumStores < 4 && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    StoreSize[NumStores] = MO->getSize();
+    StoreOffset[NumStores] = MO->getOffset();
+    StoreValue[NumStores] = MO->getValue();
+    ++NumStores;
+  }
+
+  if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate a d-group.
+  ++NumIssued;
+
+  // If this instruction is cracked into two ops by the decoder, remember that
+  // we issued two pieces.
+  if (isCracked)
+    ++NumIssued;
+
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+  assert(NumIssued < 5 && "Illegal dispatch group!");
+  ++NumIssued;
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::Reset() {
+  EndDispatchGroup();
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 0000000..23f76c16
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,102 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCHAZRECS_H
+#define PPCHAZRECS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+/// PPCDispatchGroupSBHazardRecognizer - This class implements a scoreboard-based
+/// hazard recognizer for PPC ooo processors with dispatch-group hazards.
+class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer {
+  const ScheduleDAG *DAG;
+  SmallVector<SUnit *, 7> CurGroup;
+  unsigned CurSlots, CurBranches;
+
+  bool isLoadAfterStore(SUnit *SU);
+  bool isBCTRAfterSet(SUnit *SU);
+  bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots);
+public:
+  PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData,
+                         const ScheduleDAG *DAG_) :
+    ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
+    CurSlots(0), CurBranches(0) {}
+
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  bool ShouldPreferAnother(SUnit* SU) override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+  void Reset() override;
+  void EmitNoop() override;
+};
+
+/// PPCHazardRecognizer970 - This class defines a finite state automata that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor.  This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group),
+/// or storing then loading from the same address within a dispatch group.
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+  const ScheduleDAG &DAG;
+
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+
+  // Various things that can cause a structural hazard.
+
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+
+  // StoredPtr - Keep track of the address of any store.  If we see a load from
+  // the same address (or one that aliases it), disallow the store.  We can have
+  // up to four stores in one dispatch group, hence we track up to 4.
+  //
+  // This is null if we haven't seen a store yet.  We keep track of both
+  // operands of the store here, since we support [r+r] and [r+i] addressing.
+  const Value *StoreValue[4];
+  int64_t StoreOffset[4];
+  uint64_t StoreSize[4];
+  unsigned NumStores;
+
+public:
+  PPCHazardRecognizer970(const ScheduleDAG &DAG);
+  virtual HazardType getHazardType(SUnit *SU, int Stalls) override;
+  virtual void EmitInstruction(SUnit *SU) override;
+  virtual void AdvanceCycle() override;
+  virtual void Reset() override;
+
+private:
+  /// EndDispatchGroup - Called when we are finishing a new dispatch group.
+  ///
+  void EndDispatchGroup();
+
+  /// GetInstrType - Classify the specified powerpc opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle,bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+
+  bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                             const Value *LoadValue) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 0000000..490f6d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,2205 @@
+//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-codegen"
+
+// FIXME: Remove this once the bug has been fixed!
+cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
+cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
+
+namespace llvm {
+  void initializePPCDAGToDAGISelPass(PassRegistry&);
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class PPCDAGToDAGISel : public SelectionDAGISel {
+    const PPCTargetMachine &TM;
+    const PPCTargetLowering *PPCLowering;
+    const PPCSubtarget *PPCSubTarget;
+    unsigned GlobalBaseReg;
+  public:
+    explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+      : SelectionDAGISel(tm), TM(tm),
+        PPCLowering(TM.getTargetLowering()),
+        PPCSubTarget(TM.getSubtargetImpl()) {
+      initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      PPCLowering = TM.getTargetLowering();
+      PPCSubTarget = TM.getSubtargetImpl();
+      SelectionDAGISel::runOnMachineFunction(MF);
+
+      if (!PPCSubTarget->isSVR4ABI())
+        InsertVRSaveCode(MF);
+
+      return true;
+    }
+
+    void PostprocessISelDAG() override;
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, PPCLowering->getPointerTy());
+    }
+
+    /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s
+    /// with any number of 0s on either side.  The 1s are allowed to wrap from
+    /// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
+    /// 0x0F0F0000 is not, since all 1s are not contiguous.
+    static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
+
+
+    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+    /// rotate and mask opcode and mask operation.
+    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
+                                unsigned &SH, unsigned &MB, unsigned &ME);
+
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDNode *N) override;
+
+    SDNode *SelectBitfieldInsert(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl);
+
+    /// SelectAddrImm - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement [r+imm].
+    bool SelectAddrImm(SDValue N, SDValue &Disp,
+                       SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
+    }
+
+    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+    /// immediate field.  Note that the operand at this point is already the
+    /// result of a prior SelectAddressRegImm call.
+    bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
+      if (N.getOpcode() == ISD::TargetConstant ||
+          N.getOpcode() == ISD::TargetGlobalAddress) {
+        Out = N;
+        return true;
+      }
+
+      return false;
+    }
+
+    /// SelectAddrIdx - Given the specified addressed, check to see if it can be
+    /// represented as an indexed [r+r] operation.  Returns false if it can
+    /// be represented by [r+imm], which are preferred.
+    bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrIdxOnly - Given the specified addressed, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrImmX4 - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement that is a multiple of 4.
+    /// Suitable for use by STD and friends.
+    bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
+    }
+
+    // Select an address into a single register.
+    bool SelectAddr(SDValue N, SDValue &Base) {
+      Base = N;
+      return true;
+    }
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.  It is always correct to compute the value into
+    /// a register.  The case of adding a (possibly relocatable) constant to a
+    /// register can be improved, but it is wrong to substitute Reg+Reg for
+    /// Reg in an asm, because the load or store opcode would have to change.
+   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override {
+      OutOps.push_back(Op);
+      return false;
+    }
+
+    void InsertVRSaveCode(MachineFunction &MF);
+
+    const char *getPassName() const override {
+      return "PowerPC DAG->DAG Pattern Instruction Selection";
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+
+private:
+    SDNode *SelectSETCC(SDNode *N);
+
+    void PeepholePPC64();
+    void PeepholeCROps();
+
+    bool AllUsersSelectZero(SDNode *N);
+    void SwapAllSelectUsers(SDNode *N);
+  };
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE.  If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
+  // Check to see if this function uses vector registers, which means we have to
+  // save and restore the VRSAVE register and update it with the regs we use.
+  //
+  // In this case, there will be virtual registers of vector type created
+  // by the scheduler.  Detect them now.
+  bool HasVectorVReg = false;
+  for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
+      HasVectorVReg = true;
+      break;
+    }
+  }
+  if (!HasVectorVReg) return;  // nothing to do.
+
+  // If we have a vector register, we want to emit code into the entry and exit
+  // blocks to save and restore the VRSAVE register.  We do this here (instead
+  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+  //
+  // 1. This (trivially) reduces the load on the register allocator, by not
+  //    having to represent the live range of the VRSAVE register.
+  // 2. This (more significantly) allows us to create a temporary virtual
+  //    register to hold the saved VRSAVE value, allowing this temporary to be
+  //    register allocated, instead of forcing it to be spilled to the stack.
+
+  // Create two vregs - one to hold the VRSAVE register that is live-in to the
+  // function and one for the value after having bits or'd into it.
+  unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+  unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  MachineBasicBlock &EntryBB = *Fn.begin();
+  DebugLoc dl;
+  // Emit the following code into the entry block:
+  // InVRSAVE = MFVRSAVE
+  // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+  // MTVRSAVE UpdatedVRSAVE
+  MachineBasicBlock::iterator IP = EntryBB.begin();  // Insert Point
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
+          UpdatedVRSAVE).addReg(InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+
+  // Find all return blocks, outputting a restore in each epilog.
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    if (!BB->empty() && BB->back().isReturn()) {
+      IP = BB->end(); --IP;
+
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.
+      MachineBasicBlock::iterator I2 = IP;
+      while (I2 != BB->begin() && (--I2)->isTerminator())
+        IP = I2;
+
+      // Emit: MTVRSAVE InVRSave
+      BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+    }
+  }
+}
+
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+  if (!GlobalBaseReg) {
+    const TargetInstrInfo &TII = *TM.getInstrInfo();
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = MF->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    DebugLoc dl;
+
+    if (PPCLowering->getPointerTy() == MVT::i32) {
+      if (PPCSubTarget->isTargetELF())
+        GlobalBaseReg = PPC::R30;
+      else
+        GlobalBaseReg =
+          RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+      if (PPCSubTarget->isTargetELF()) {
+        unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+        BuildMI(FirstMBB, MBBI, dl,
+                TII.get(PPC::GetGBRO), TempReg).addReg(GlobalBaseReg);
+        BuildMI(FirstMBB, MBBI, dl,
+                TII.get(PPC::UpdateGBR)).addReg(GlobalBaseReg).addReg(TempReg);
+        MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
+      }
+    } else {
+      GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
+    }
+  }
+  return CurDAG->getRegister(GlobalBaseReg,
+                             PPCLowering->getPointerTy()).getNode();
+}
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand.  If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc
+         && isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  if (!Val)
+    return false;
+
+  if (isShiftedMask_32(Val)) {
+    // look for the first non-zero bit
+    MB = countLeadingZeros(Val);
+    // look for the first zero bit after the run of ones
+    ME = countLeadingZeros((Val - 1) ^ Val);
+    return true;
+  } else {
+    Val = ~Val; // invert mask
+    if (isShiftedMask_32(Val)) {
+      // effectively look for the first zero bit
+      ME = countLeadingZeros(Val) - 1;
+      // effectively look for the first one bit after the run of zeros
+      MB = countLeadingZeros((Val - 1) ^ Val) + 1;
+      return true;
+    }
+  }
+  // no run present
+  return false;
+}
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
+                                      bool isShiftMask, unsigned &SH,
+                                      unsigned &MB, unsigned &ME) {
+  // Don't even go down this path for i64, since different logic will be
+  // necessary for rldicl/rldicr/rldimi.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+
+  unsigned Shift  = 32;
+  unsigned Indeterminant = ~0;  // bit mask marking indeterminant results
+  unsigned Opcode = N->getOpcode();
+  if (N->getNumOperands() != 2 ||
+      !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+    return false;
+
+  if (Opcode == ISD::SHL) {
+    // apply shift left to mask if it comes first
+    if (isShiftMask) Mask = Mask << Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu << Shift);
+  } else if (Opcode == ISD::SRL) {
+    // apply shift right to mask if it comes first
+    if (isShiftMask) Mask = Mask >> Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu >> Shift);
+    // adjust for the left rotate
+    Shift = 32 - Shift;
+  } else if (Opcode == ISD::ROTL) {
+    Indeterminant = 0;
+  } else {
+    return false;
+  }
+
+  // if the mask doesn't intersect any Indeterminant bits
+  if (Mask && !(Mask & Indeterminant)) {
+    SH = Shift & 31;
+    // make sure the mask is still a mask (wrap arounds may not be)
+    return isRunOfOnes(Mask, MB, ME);
+  }
+  return false;
+}
+
+/// SelectBitfieldInsert - turn an or of two masked values into
+/// the rotate left word immediate then mask insert (rlwimi) instruction.
+SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  APInt LKZ, LKO, RKZ, RKO;
+  CurDAG->computeKnownBits(Op0, LKZ, LKO);
+  CurDAG->computeKnownBits(Op1, RKZ, RKO);
+
+  unsigned TargetMask = LKZ.getZExtValue();
+  unsigned InsertMask = RKZ.getZExtValue();
+
+  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+    unsigned Op0Opc = Op0.getOpcode();
+    unsigned Op1Opc = Op1.getOpcode();
+    unsigned Value, SH = 0;
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to the
+    // RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+
+    unsigned MB, ME;
+    if (isRunOfOnes(InsertMask, MB, ME)) {
+      SDValue Tmp1, Tmp2;
+
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+      }
+      if (Op1Opc == ISD::AND) {
+       // The AND mask might not be a constant, and we need to make sure that
+       // if we're going to fold the masking with the insert, all bits not
+       // know to be zero in the mask are known to be one.
+        APInt MKZ, MKO;
+        CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
+        bool CanFoldMask = InsertMask == MKO.getZExtValue();
+
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          // Note that Value must be in range here (less than 32) because
+          // otherwise there would not be any bits set in InsertMask.
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+        }
+      }
+
+      SH &= 31;
+      SDValue Ops[] = { Op0, Op1, getI32Imm(SH), getI32Imm(MB),
+                          getI32Imm(ME) };
+      return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
+    }
+  }
+  return nullptr;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
+                                    ISD::CondCode CC, SDLoc dl) {
+  // Always select the LHS.
+  unsigned Opc;
+
+  if (LHS.getValueType() == MVT::i32) {
+    unsigned Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt32Immediate(RHS, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt<16>((int)Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136
+        //   cmpw cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmplwi cr0,r0,0x5678
+        //   beq cr0,L6
+        SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
+                                           getI32Imm(Imm >> 16)), 0);
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+                                              getI32Imm(Imm & 0xFFFF)), 0);
+      }
+      Opc = PPC::CMPLW;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                              getI32Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLW;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                              getI32Imm((int)SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPW;
+    }
+  } else if (LHS.getValueType() == MVT::i64) {
+    uint64_t Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt64Immediate(RHS.getNode(), Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136
+        //   cmpd cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmpldi cr0,r0,0x5678
+        //   beq cr0,L6
+        if (isUInt<32>(Imm)) {
+          SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
+                                             getI64Imm(Imm >> 16)), 0);
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+                                                getI64Imm(Imm & 0xFFFF)), 0);
+        }
+      }
+      Opc = PPC::CMPLD;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                              getI64Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLD;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                              getI64Imm(SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPD;
+    }
+  } else if (LHS.getValueType() == MVT::f32) {
+    Opc = PPC::FCMPUS;
+  } else {
+    assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+    Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+  }
+  return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+}
+
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETUEQ:
+  case ISD::SETONE:
+  case ISD::SETOLE:
+  case ISD::SETOGE:
+    llvm_unreachable("Should be lowered by legalize!");
+  default: llvm_unreachable("Unknown condition!");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return PPC::PRED_EQ;
+  case ISD::SETUNE:
+  case ISD::SETNE:  return PPC::PRED_NE;
+  case ISD::SETOLT:
+  case ISD::SETLT:  return PPC::PRED_LT;
+  case ISD::SETULE:
+  case ISD::SETLE:  return PPC::PRED_LE;
+  case ISD::SETOGT:
+  case ISD::SETGT:  return PPC::PRED_GT;
+  case ISD::SETUGE:
+  case ISD::SETGE:  return PPC::PRED_GE;
+  case ISD::SETO:   return PPC::PRED_NU;
+  case ISD::SETUO:  return PPC::PRED_UN;
+    // These two are invalid for floating point.  Assume we have int.
+  case ISD::SETULT: return PPC::PRED_LT;
+  case ISD::SETUGT: return PPC::PRED_GT;
+  }
+}
+
+/// getCRIdxForSetCC - Return the index of the condition register field
+/// associated with the SetCC condition, and whether or not the field is
+/// treated as inverted.  That is, lt = 0; ge = 0 inverted.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
+  Invert = false;
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition!");
+  case ISD::SETOLT:
+  case ISD::SETLT:  return 0;                  // Bit #0 = SETOLT
+  case ISD::SETOGT:
+  case ISD::SETGT:  return 1;                  // Bit #1 = SETOGT
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return 2;                  // Bit #2 = SETOEQ
+  case ISD::SETUO:  return 3;                  // Bit #3 = SETUO
+  case ISD::SETUGE:
+  case ISD::SETGE:  Invert = true; return 0;   // !Bit #0 = SETUGE
+  case ISD::SETULE:
+  case ISD::SETLE:  Invert = true; return 1;   // !Bit #1 = SETULE
+  case ISD::SETUNE:
+  case ISD::SETNE:  Invert = true; return 2;   // !Bit #2 = SETUNE
+  case ISD::SETO:   Invert = true; return 3;   // !Bit #3 = SETO
+  case ISD::SETUEQ:
+  case ISD::SETOGE:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+    llvm_unreachable("Invalid branch code: should be expanded by legalize");
+  // These are invalid for floating point.  Assume integer.
+  case ISD::SETULT: return 0;
+  case ISD::SETUGT: return 1;
+  }
+}
+
+// getVCmpInst: return the vector compare instruction for the specified
+// vector type and condition code. Since this is for altivec specific code,
+// only support the altivec types (v16i8, v8i16, v4i32, and v4f32).
+static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
+                                bool HasVSX, bool &Swap, bool &Negate) {
+  Swap = false;
+  Negate = false;
+
+  if (VecVT.isFloatingPoint()) {
+    /* Handle some cases by swapping input operands.  */
+    switch (CC) {
+      case ISD::SETLE: CC = ISD::SETGE; Swap = true; break;
+      case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+      case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break;
+      case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break;
+      case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+      case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break;
+      default: break;
+    }
+    /* Handle some cases by negating the result.  */
+    switch (CC) {
+      case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+      case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break;
+      case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break;
+      case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break;
+      default: break;
+    }
+    /* We have instructions implementing the remaining cases.  */
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETOEQ:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPEQDP;
+        break;
+      case ISD::SETGT:
+      case ISD::SETOGT:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPGTDP;
+        break;
+      case ISD::SETGE:
+      case ISD::SETOGE:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPGEDP;
+        break;
+      default:
+        break;
+    }
+    llvm_unreachable("Invalid floating-point vector compare condition");
+  } else {
+    /* Handle some cases by swapping input operands.  */
+    switch (CC) {
+      case ISD::SETGE: CC = ISD::SETLE; Swap = true; break;
+      case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+      case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+      case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break;
+      default: break;
+    }
+    /* Handle some cases by negating the result.  */
+    switch (CC) {
+      case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+      case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break;
+      case ISD::SETLE: CC = ISD::SETGT; Negate = true; break;
+      case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break;
+      default: break;
+    }
+    /* We have instructions implementing the remaining cases.  */
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETUEQ:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPEQUB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPEQUH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPEQUW;
+        break;
+      case ISD::SETGT:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPGTSB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPGTSH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPGTSW;
+        break;
+      case ISD::SETUGT:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPGTUB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPGTUH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPGTUW;
+        break;
+      default:
+        break;
+    }
+    llvm_unreachable("Invalid integer vector compare condition");
+  }
+}
+
+SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
+  SDLoc dl(N);
+  unsigned Imm;
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = (PtrVT == MVT::i64);
+
+  if (!PPCSubTarget->useCRBits() &&
+      isInt32Immediate(N->getOperand(1), Imm)) {
+    // We can codegen setcc op, imm very efficiently compared to a brcond.
+    // Check for those cases here.
+    // setcc op, 0
+    if (Imm == 0) {
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ: {
+        Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+        SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      }
+      case ISD::SETNE: {
+        if (isPPC64) break;
+        SDValue AD =
+          SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                         Op, getI32Imm(~0U)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op,
+                                    AD.getValue(1));
+      }
+      case ISD::SETLT: {
+        SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      }
+      case ISD::SETGT: {
+        SDValue T =
+          SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
+        T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+        SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      }
+      }
+    } else if (Imm == ~0U) {        // setcc op, -1
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ:
+        if (isPPC64) break;
+        Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                            Op, getI32Imm(1)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+                              SDValue(CurDAG->getMachineNode(PPC::LI, dl,
+                                                             MVT::i32,
+                                                             getI32Imm(0)), 0),
+                                      Op.getValue(1));
+      case ISD::SETNE: {
+        if (isPPC64) break;
+        Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+        SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                            Op, getI32Imm(~0U));
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
+                                    Op, SDValue(AD, 1));
+      }
+      case ISD::SETLT: {
+        SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
+                                                    getI32Imm(1)), 0);
+        SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
+                                                    Op), 0);
+        SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      }
+      case ISD::SETGT: {
+        SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops),
+                     0);
+        return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op,
+                                    getI32Imm(1));
+      }
+      }
+    }
+  }
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Altivec Vector compare instructions do not set any CR register by default and
+  // vector compare operations return the same type as the operands.
+  if (LHS.getValueType().isVector()) {
+    EVT VecVT = LHS.getValueType();
+    bool Swap, Negate;
+    unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
+                                        PPCSubTarget->hasVSX(), Swap, Negate);
+    if (Swap)
+      std::swap(LHS, RHS);
+
+    if (Negate) {
+      SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+      return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
+                                                              PPC::VNOR,
+                                  VecVT, VCmp, VCmp);
+    }
+
+    return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+  }
+
+  if (PPCSubTarget->useCRBits())
+    return nullptr;
+
+  bool Inv;
+  unsigned Idx = getCRIdxForSetCC(CC, Inv);
+  SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
+  SDValue IntCR;
+
+  // Force the ccreg into CR7.
+  SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+
+  SDValue InFlag(nullptr, 0);  // Null incoming flag value.
+  CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
+                               InFlag).getValue(1);
+
+  IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+                                         CCReg), 0);
+
+  SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
+                      getI32Imm(31), getI32Imm(31) };
+  if (!Inv)
+    return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+
+  // Get the specified bit.
+  SDValue Tmp =
+    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+  return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
+}
+
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return nullptr;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+
+  case ISD::Constant: {
+    if (N->getValueType(0) == MVT::i64) {
+      // Get 64 bit value.
+      int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+      // Assume no remaining bits.
+      unsigned Remainder = 0;
+      // Assume no shift required.
+      unsigned Shift = 0;
+
+      // If it can't be represented as a 32 bit value.
+      if (!isInt<32>(Imm)) {
+        Shift = countTrailingZeros<uint64_t>(Imm);
+        int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+        // If the shifted value fits 32 bits.
+        if (isInt<32>(ImmSh)) {
+          // Go with the shifted value.
+          Imm = ImmSh;
+        } else {
+          // Still stuck with a 64 bit value.
+          Remainder = Imm;
+          Shift = 32;
+          Imm >>= 32;
+        }
+      }
+
+      // Intermediate operand.
+      SDNode *Result;
+
+      // Handle first 32 bits.
+      unsigned Lo = Imm & 0xFFFF;
+      unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+      // Simple value.
+      if (isInt<16>(Imm)) {
+       // Just the Lo bits.
+        Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+      } else if (Lo) {
+        // Handle the Hi bits.
+        unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+        Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+        // And Lo bits.
+        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Lo));
+      } else {
+       // Just the Hi bits.
+        Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+      }
+
+      // If no shift, we're done.
+      if (!Shift) return Result;
+
+      // Shift for next step if the upper 32-bits were not zero.
+      if (Imm) {
+        Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                        SDValue(Result, 0),
+                                        getI32Imm(Shift),
+                                        getI32Imm(63 - Shift));
+      }
+
+      // Add in the last bits as required.
+      if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+        Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Hi));
+      }
+      if ((Lo = Remainder & 0xFFFF)) {
+        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Lo));
+      }
+
+      return Result;
+    }
+    break;
+  }
+
+  case ISD::SETCC: {
+    SDNode *SN = SelectSETCC(N);
+    if (SN)
+      return SN;
+    break;
+  }
+  case PPCISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+    unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+    if (N->hasOneUse())
+      return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI,
+                                  getSmallIPtrImm(0));
+    return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+                                  getSmallIPtrImm(0));
+  }
+
+  case PPCISD::MFOCRF: {
+    SDValue InFlag = N->getOperand(1);
+    return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+                                  N->getOperand(0), InFlag);
+  }
+
+  case ISD::SDIV: {
+    // FIXME: since this depends on the setting of the carry flag from the srawi
+    //        we should really be making notes about that for the scheduler.
+    // FIXME: It sure would be nice if we could cheaply recognize the
+    //        srl/add/sra pattern the dag combiner will generate for this as
+    //        sra/addze rather than having to handle sdiv ourselves.  oh well.
+    unsigned Imm;
+    if (isInt32Immediate(N->getOperand(1), Imm)) {
+      SDValue N0 = N->getOperand(0);
+      if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
+        SDNode *Op =
+          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+                                 N0, getI32Imm(Log2_32(Imm)));
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+                                    SDValue(Op, 0), SDValue(Op, 1));
+      } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
+        SDNode *Op =
+          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+                                 N0, getI32Imm(Log2_32(-Imm)));
+        SDValue PT =
+          SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32,
+                                         SDValue(Op, 0), SDValue(Op, 1)),
+                    0);
+        return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
+      }
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+
+  case ISD::LOAD: {
+    // Handle preincrement loads.
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT LoadedVT = LD->getMemoryVT();
+
+    // Normal loads are handled by code generated from the .td file.
+    if (LD->getAddressingMode() != ISD::PRE_INC)
+      break;
+
+    SDValue Offset = LD->getOffset();
+    if (Offset.getOpcode() == ISD::TargetConstant ||
+        Offset.getOpcode() == ISD::TargetGlobalAddress) {
+
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDU; break;
+          case MVT::f32: Opcode = PPC::LFSU; break;
+          case MVT::i32: Opcode = PPC::LWZU; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDU; break;
+          case MVT::i32: Opcode = PPC::LWZU8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU8; break;
+        }
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Offset, Base, Chain };
+      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+                                    PPCLowering->getPointerTy(),
+                                    MVT::Other, Ops);
+    } else {
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDUX; break;
+          case MVT::f32: Opcode = PPC::LFSUX; break;
+          case MVT::i32: Opcode = PPC::LWZUX; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZUX; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
+               "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDUX; break;
+          case MVT::i32: Opcode = isSExt ? PPC::LWAUX  : PPC::LWZUX8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZUX8; break;
+        }
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Base, Offset, Chain };
+      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+                                    PPCLowering->getPointerTy(),
+                                    MVT::Other, Ops);
+    }
+  }
+
+  case ISD::AND: {
+    unsigned Imm, Imm2, SH, MB, ME;
+    uint64_t Imm64;
+
+    // If this is an and of a value rotated between 0 and 31 bits and then and'd
+    // with a mask, emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
+      SDValue Val = N->getOperand(0).getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+    }
+    // If this is just a masked value where the input is not handled above, and
+    // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRunOfOnes(Imm, MB, ME) &&
+        N->getOperand(0).getOpcode() != ISD::ROTL) {
+      SDValue Val = N->getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+    }
+    // If this is a 64-bit zero-extension mask, emit rldicl.
+    if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+        isMask_64(Imm64)) {
+      SDValue Val = N->getOperand(0);
+      MB = 64 - CountTrailingOnes_64(Imm64);
+      SH = 0;
+
+      // If the operand is a logical right shift, we can fold it into this
+      // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
+      // for n <= mb. The right shift is really a left rotate followed by a
+      // mask, and this mask is a more-restrictive sub-mask of the mask implied
+      // by the shift.
+      if (Val.getOpcode() == ISD::SRL &&
+          isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+        assert(Imm < 64 && "Illegal shift amount");
+        Val = Val.getOperand(0);
+        SH = 64 - Imm;
+      }
+
+      SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB) };
+      return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+    }
+    // AND X, 0 -> 0, not "rlwinm 32".
+    if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+      ReplaceUses(SDValue(N, 0), N->getOperand(1));
+      return nullptr;
+    }
+    // ISD::OR doesn't get all the bitfield insertion fun.
+    // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        N->getOperand(0).getOpcode() == ISD::OR &&
+        isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+      unsigned MB, ME;
+      Imm = ~(Imm^Imm2);
+      if (isRunOfOnes(Imm, MB, ME)) {
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                            N->getOperand(0).getOperand(1),
+                            getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
+        return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
+      }
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::OR:
+    if (N->getValueType(0) == MVT::i32)
+      if (SDNode *I = SelectBitfieldInsert(N))
+        return I;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::SHL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SRL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  // FIXME: Remove this once the ANDI glue bug is fixed:
+  case PPCISD::ANDIo_1_EQ_BIT:
+  case PPCISD::ANDIo_1_GT_BIT: {
+    if (!ANDIGlueBug)
+      break;
+
+    EVT InVT = N->getOperand(0).getValueType();
+    assert((InVT == MVT::i64 || InVT == MVT::i32) &&
+           "Invalid input type for ANDIo_1_EQ_BIT");
+
+    unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo;
+    SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue,
+                                        N->getOperand(0),
+                                        CurDAG->getTargetConstant(1, InVT)), 0);
+    SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+    SDValue SRIdxVal =
+      CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
+                                PPC::sub_eq : PPC::sub_gt, MVT::i32);
+
+    return CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1,
+                                CR0Reg, SRIdxVal,
+                                SDValue(AndI.getNode(), 1) /* glue */);
+  }
+  case ISD::SELECT_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+    EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
+    bool isPPC64 = (PtrVT == MVT::i64);
+
+    // If this is a select of i1 operands, we'll pattern match it.
+    if (PPCSubTarget->useCRBits() &&
+        N->getOperand(0).getValueType() == MVT::i1)
+      break;
+
+    // Handle the setcc cases here.  select_cc lhs, 0, 1, 0, cc
+    if (!isPPC64)
+      if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+        if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+          if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+            if (N1C->isNullValue() && N3C->isNullValue() &&
+                N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+                // FIXME: Implement this optzn for PPC64.
+                N->getValueType(0) == MVT::i32) {
+              SDNode *Tmp =
+                CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                       N->getOperand(0), getI32Imm(~0U));
+              return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
+                                          SDValue(Tmp, 0), N->getOperand(0),
+                                          SDValue(Tmp, 1));
+            }
+
+    SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+
+    if (N->getValueType(0) == MVT::i1) {
+      // An i1 select is: (c & t) | (!c & f).
+      bool Inv;
+      unsigned Idx = getCRIdxForSetCC(CC, Inv);
+
+      unsigned SRI;
+      switch (Idx) {
+      default: llvm_unreachable("Invalid CC index");
+      case 0: SRI = PPC::sub_lt; break;
+      case 1: SRI = PPC::sub_gt; break;
+      case 2: SRI = PPC::sub_eq; break;
+      case 3: SRI = PPC::sub_un; break;
+      }
+
+      SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg);
+
+      SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1,
+                                              CCBit, CCBit), 0);
+      SDValue C =    Inv ? NotCCBit : CCBit,
+              NotC = Inv ? CCBit    : NotCCBit;
+
+      SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+                                           C, N->getOperand(2)), 0);
+      SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+                                              NotC, N->getOperand(3)), 0);
+
+      return CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
+    }
+
+    unsigned BROpc = getPredicateForSetCC(CC);
+
+    unsigned SelectCCOp;
+    if (N->getValueType(0) == MVT::i32)
+      SelectCCOp = PPC::SELECT_CC_I4;
+    else if (N->getValueType(0) == MVT::i64)
+      SelectCCOp = PPC::SELECT_CC_I8;
+    else if (N->getValueType(0) == MVT::f32)
+      SelectCCOp = PPC::SELECT_CC_F4;
+    else if (N->getValueType(0) == MVT::f64)
+      SelectCCOp = PPC::SELECT_CC_F8;
+    else
+      SelectCCOp = PPC::SELECT_CC_VRRC;
+
+    SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+                        getI32Imm(BROpc) };
+    return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
+  }
+  case ISD::VSELECT:
+    if (PPCSubTarget->hasVSX()) {
+      SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
+      return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
+    }
+
+    break;
+  case ISD::VECTOR_SHUFFLE:
+    if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
+                                  N->getValueType(0) == MVT::v2i64)) {
+      ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+      
+      SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
+              Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
+      unsigned DM[2];
+
+      for (int i = 0; i < 2; ++i)
+        if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2)
+          DM[i] = 0;
+        else
+          DM[i] = 1;
+
+      SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), MVT::i32);
+
+      if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 &&
+          Op1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          isa<LoadSDNode>(Op1.getOperand(0))) {
+        LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0));
+        SDValue Base, Offset;
+
+        if (LD->isUnindexed() &&
+            SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
+          SDValue Chain = LD->getChain();
+          SDValue Ops[] = { Base, Offset, Chain };
+          return CurDAG->SelectNodeTo(N, PPC::LXVDSX,
+                                      N->getValueType(0), Ops);
+        }
+      }
+
+      SDValue Ops[] = { Op1, Op2, DMV };
+      return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
+    }
+
+    break;
+  case PPCISD::BDNZ:
+  case PPCISD::BDZ: {
+    bool IsPPC64 = PPCSubTarget->isPPC64();
+    SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
+    return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ?
+                                   (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                                   (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
+                                MVT::Other, Ops);
+  }
+  case PPCISD::COND_BRANCH: {
+    // Op #0 is the Chain.
+    // Op #1 is the PPC::PRED_* number.
+    // Op #2 is the CR#
+    // Op #3 is the Dest MBB
+    // Op #4 is the Flag.
+    // Prevent PPC::PRED_* from being selected into LI.
+    SDValue Pred =
+      getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+    SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+      N->getOperand(0), N->getOperand(4) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+  }
+  case ISD::BR_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    unsigned PCC = getPredicateForSetCC(CC);
+
+    if (N->getOperand(2).getValueType() == MVT::i1) {
+      unsigned Opc;
+      bool Swap;
+      switch (PCC) {
+      default: llvm_unreachable("Unexpected Boolean-operand predicate");
+      case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true;  break;
+      case PPC::PRED_LE: Opc = PPC::CRORC;  Swap = true;  break;
+      case PPC::PRED_EQ: Opc = PPC::CREQV;  Swap = false; break;
+      case PPC::PRED_GE: Opc = PPC::CRORC;  Swap = false; break;
+      case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break;
+      case PPC::PRED_NE: Opc = PPC::CRXOR;  Swap = false; break;
+      }
+
+      SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
+                                             N->getOperand(Swap ? 3 : 2),
+                                             N->getOperand(Swap ? 2 : 3)), 0);
+      return CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other,
+                                  BitComp, N->getOperand(4), N->getOperand(0));
+    }
+
+    SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
+    SDValue Ops[] = { getI32Imm(PCC), CondCode,
+                        N->getOperand(4), N->getOperand(0) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+  }
+  case ISD::BRIND: {
+    // FIXME: Should custom lower this.
+    SDValue Chain = N->getOperand(0);
+    SDValue Target = N->getOperand(1);
+    unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
+    Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
+                                           Chain), 0);
+    return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
+  }
+  case PPCISD::TOC_ENTRY: {
+    if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
+      SDValue GA = N->getOperand(0);
+      return CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
+                                    N->getOperand(1));
+	}
+    assert (PPCSubTarget->isPPC64() &&
+            "Only supported for 64-bit ABI and 32-bit SVR4");
+
+    // For medium and large code model, we generate two instructions as
+    // described below.  Otherwise we allow SelectCodeCommon to handle this,
+    // selecting one of LDtoc, LDtocJTI, and LDtocCPT.
+    CodeModel::Model CModel = TM.getCodeModel();
+    if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
+      break;
+
+    // The first source operand is a TargetGlobalAddress or a TargetJumpTable.
+    // If it is an externally defined symbol, a symbol with common linkage,
+    // a non-local function address, or a jump table address, or if we are
+    // generating code for large code model, we generate:
+    //   LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
+    // Otherwise we generate:
+    //   ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
+    SDValue GA = N->getOperand(0);
+    SDValue TOCbase = N->getOperand(1);
+    SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
+                                        TOCbase, GA);
+
+    if (isa<JumpTableSDNode>(GA) || CModel == CodeModel::Large)
+      return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+                                    SDValue(Tmp, 0));
+
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
+      const GlobalValue *GValue = G->getGlobal();
+      if ((GValue->getType()->getElementType()->isFunctionTy() &&
+           (GValue->isDeclaration() || GValue->isWeakForLinker())) ||
+          GValue->isDeclaration() || GValue->hasCommonLinkage() ||
+          GValue->hasAvailableExternallyLinkage())
+        return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+                                      SDValue(Tmp, 0));
+    }
+
+    return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+                                  SDValue(Tmp, 0), GA);
+  }
+  case PPCISD::PPC32_PICGOT: {
+    // Generate a PIC-safe GOT reference.
+    assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
+      "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
+    return CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(),  MVT::i32);
+  }
+  case PPCISD::VADD_SPLAT: {
+    // This expands into one of three sequences, depending on whether
+    // the first operand is odd or even, positive or negative.
+    assert(isa<ConstantSDNode>(N->getOperand(0)) &&
+           isa<ConstantSDNode>(N->getOperand(1)) &&
+           "Invalid operand on VADD_SPLAT!");
+
+    int Elt     = N->getConstantOperandVal(0);
+    int EltSize = N->getConstantOperandVal(1);
+    unsigned Opc1, Opc2, Opc3;
+    EVT VT;
+
+    if (EltSize == 1) {
+      Opc1 = PPC::VSPLTISB;
+      Opc2 = PPC::VADDUBM;
+      Opc3 = PPC::VSUBUBM;
+      VT = MVT::v16i8;
+    } else if (EltSize == 2) {
+      Opc1 = PPC::VSPLTISH;
+      Opc2 = PPC::VADDUHM;
+      Opc3 = PPC::VSUBUHM;
+      VT = MVT::v8i16;
+    } else {
+      assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
+      Opc1 = PPC::VSPLTISW;
+      Opc2 = PPC::VADDUWM;
+      Opc3 = PPC::VSUBUWM;
+      VT = MVT::v4i32;
+    }
+
+    if ((Elt & 1) == 0) {
+      // Elt is even, in the range [-32,-18] + [16,30].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp = VSPLTIS[BHW] elt
+      //          VADDU[BHW]M tmp, tmp
+      // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+      SDValue EltVal = getI32Imm(Elt >> 1);
+      SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      SDValue TmpVal = SDValue(Tmp, 0);
+      return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+
+    } else if (Elt > 0) {
+      // Elt is odd and positive, in the range [17,31].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt-16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VSUBU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt - 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+
+    } else {
+      // Elt is odd and negative, in the range [-31,-17].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt+16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VADDU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt + 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+    }
+  }
+  }
+
+  return SelectCode(N);
+}
+
+/// PostprocessISelDAG - Perform some late peephole optimizations
+/// on the DAG representation.
+void PPCDAGToDAGISel::PostprocessISelDAG() {
+
+  // Skip peepholes at -O0.
+  if (TM.getOptLevel() == CodeGenOpt::None)
+    return;
+
+  PeepholePPC64();
+  PeepholeCROps();
+}
+
+// Check if all users of this node will become isel where the second operand
+// is the constant zero. If this is so, and if we can negate the condition,
+// then we can flip the true and false operands. This will allow the zero to
+// be folded with the isel so that we don't need to materialize a register
+// containing zero.
+bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
+  // If we're not using isel, then this does not matter.
+  if (!PPCSubTarget->hasISEL())
+    return false;
+
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (!User->isMachineOpcode())
+      return false;
+    if (User->getMachineOpcode() != PPC::SELECT_I4 &&
+        User->getMachineOpcode() != PPC::SELECT_I8)
+      return false;
+
+    SDNode *Op2 = User->getOperand(2).getNode();
+    if (!Op2->isMachineOpcode())
+      return false;
+
+    if (Op2->getMachineOpcode() != PPC::LI &&
+        Op2->getMachineOpcode() != PPC::LI8)
+      return false;
+
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0));
+    if (!C)
+      return false;
+
+    if (!C->isNullValue())
+      return false;
+  }
+
+  return true;
+}
+
+void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
+  SmallVector<SDNode *, 4> ToReplace;
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
+            User->getMachineOpcode() == PPC::SELECT_I8) &&
+           "Must have all select users");
+    ToReplace.push_back(User);
+  }
+
+  for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(),
+       UE = ToReplace.end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    SDNode *ResNode =
+      CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User),
+                             User->getValueType(0), User->getOperand(0),
+                             User->getOperand(2),
+                             User->getOperand(1));
+
+      DEBUG(dbgs() << "CR Peephole replacing:\nOld:    ");
+      DEBUG(User->dump(CurDAG));
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(ResNode->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+
+      ReplaceUses(User, ResNode);
+  }
+}
+
+void PPCDAGToDAGISel::PeepholeCROps() {
+  bool IsModified;
+  do {
+    IsModified = false;
+    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+         E = CurDAG->allnodes_end(); I != E; ++I) {
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+      if (!MachineNode || MachineNode->use_empty())
+        continue;
+      SDNode *ResNode = MachineNode;
+
+      bool Op1Set   = false, Op1Unset = false,
+           Op1Not   = false,
+           Op2Set   = false, Op2Unset = false,
+           Op2Not   = false;
+
+      unsigned Opcode = MachineNode->getMachineOpcode();
+      switch (Opcode) {
+      default: break;
+      case PPC::CRAND:
+      case PPC::CRNAND:
+      case PPC::CROR:
+      case PPC::CRXOR:
+      case PPC::CRNOR:
+      case PPC::CREQV:
+      case PPC::CRANDC:
+      case PPC::CRORC: {
+        SDValue Op = MachineNode->getOperand(1);
+        if (Op.isMachineOpcode()) {
+          if (Op.getMachineOpcode() == PPC::CRSET)
+            Op2Set = true;
+          else if (Op.getMachineOpcode() == PPC::CRUNSET)
+            Op2Unset = true;
+          else if (Op.getMachineOpcode() == PPC::CRNOR &&
+                   Op.getOperand(0) == Op.getOperand(1))
+            Op2Not = true;
+        }
+        }  // fallthrough
+      case PPC::BC:
+      case PPC::BCn:
+      case PPC::SELECT_I4:
+      case PPC::SELECT_I8:
+      case PPC::SELECT_F4:
+      case PPC::SELECT_F8:
+      case PPC::SELECT_VRRC: {
+        SDValue Op = MachineNode->getOperand(0);
+        if (Op.isMachineOpcode()) {
+          if (Op.getMachineOpcode() == PPC::CRSET)
+            Op1Set = true;
+          else if (Op.getMachineOpcode() == PPC::CRUNSET)
+            Op1Unset = true;
+          else if (Op.getMachineOpcode() == PPC::CRNOR &&
+                   Op.getOperand(0) == Op.getOperand(1))
+            Op1Not = true;
+        }
+        }
+        break;
+      }
+
+      bool SelectSwap = false;
+      switch (Opcode) {
+      default: break;
+      case PPC::CRAND:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // x & x = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Set)
+          // 1 & y = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Set)
+          // x & 1 = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset || Op2Unset)
+          // x & 0 = 0 & y = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Not)
+          // ~x & y = andc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0));
+        else if (Op2Not)
+          // x & ~y = andc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1)),
+          SelectSwap = true;
+        break;
+      case PPC::CRNAND:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // nand(x, x) -> nor(x, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Set)
+          // nand(1, y) -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Set)
+          // nand(x, 1) -> nor(x, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Unset || Op2Unset)
+          // nand(x, 0) = nand(0, y) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Not)
+          // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // nand(x, ~y) = ~x | y = orc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1)),
+          SelectSwap = true;
+        break;
+      case PPC::CROR:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // x | x = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Set || Op2Set)
+          // x | 1 = 1 | y = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Unset)
+          // 0 | y = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Unset)
+          // x | 0 = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Not)
+          // ~x | y = orc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0));
+        else if (Op2Not)
+          // x | ~y = orc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1)),
+          SelectSwap = true;
+        break;
+      case PPC::CRXOR:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // xor(x, x) = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set)
+          // xor(1, y) -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Set)
+          // xor(x, 1) -> nor(x, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Unset)
+          // xor(0, y) = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Unset)
+          // xor(x, 0) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Not)
+          // xor(~x, y) = eqv(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // xor(x, ~y) = eqv(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1)),
+          SelectSwap = true;
+        break;
+      case PPC::CRNOR:
+        if (Op1Set || Op2Set)
+          // nor(1, y) -> 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Unset)
+          // nor(0, y) = ~y -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Unset)
+          // nor(x, 0) = ~x
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Not)
+          // nor(~x, y) = andc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // nor(x, ~y) = andc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1)),
+          SelectSwap = true;
+        break;
+      case PPC::CREQV:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // eqv(x, x) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set)
+          // eqv(1, y) = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Set)
+          // eqv(x, 1) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset)
+          // eqv(0, y) = ~y -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Unset)
+          // eqv(x, 0) = ~x
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Not)
+          // eqv(~x, y) = xor(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // eqv(x, ~y) = xor(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1)),
+          SelectSwap = true;
+        break;
+      case PPC::CRANDC:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // andc(x, x) = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set)
+          // andc(1, y) = ~y
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op1Unset || Op2Set)
+          // andc(0, y) = andc(x, 1) = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op2Unset)
+          // andc(x, 0) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Not)
+          // andc(~x, y) = ~(x | y) = nor(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // andc(x, ~y) = x & y
+          ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0)),
+          SelectSwap = true;
+        break;
+      case PPC::CRORC:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // orc(x, x) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set || Op2Unset)
+          // orc(1, y) = orc(x, 0) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op2Set)
+          // orc(x, 1) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset)
+          // orc(0, y) = ~y
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op1Not)
+          // orc(~x, y) = ~(x & y) = nand(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // orc(x, ~y) = x | y
+          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0)),
+          SelectSwap = true;
+        break;
+      case PPC::SELECT_I4:
+      case PPC::SELECT_I8:
+      case PPC::SELECT_F4:
+      case PPC::SELECT_F8:
+      case PPC::SELECT_VRRC:
+        if (Op1Set)
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op1Unset)
+          ResNode = MachineNode->getOperand(2).getNode();
+        else if (Op1Not)
+          ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
+                                           SDLoc(MachineNode),
+                                           MachineNode->getValueType(0),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0),
+                                           MachineNode->getOperand(2),
+                                           MachineNode->getOperand(1));
+        break;
+      case PPC::BC:
+      case PPC::BCn:
+        if (Op1Not)
+          ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
+                                                               PPC::BC,
+                                           SDLoc(MachineNode),
+                                           MVT::Other,
+                                           MachineNode->getOperand(0).
+                                             getOperand(0),
+                                           MachineNode->getOperand(1),
+                                           MachineNode->getOperand(2));
+        // FIXME: Handle Op1Set, Op1Unset here too.
+        break;
+      }
+
+      // If we're inverting this node because it is used only by selects that
+      // we'd like to swap, then swap the selects before the node replacement.
+      if (SelectSwap)
+        SwapAllSelectUsers(MachineNode);
+
+      if (ResNode != MachineNode) {
+        DEBUG(dbgs() << "CR Peephole replacing:\nOld:    ");
+        DEBUG(MachineNode->dump(CurDAG));
+        DEBUG(dbgs() << "\nNew: ");
+        DEBUG(ResNode->dump(CurDAG));
+        DEBUG(dbgs() << "\n");
+
+        ReplaceUses(MachineNode, ResNode);
+        IsModified = true;
+      }
+    }
+    if (IsModified)
+      CurDAG->RemoveDeadNodes();
+  } while (IsModified);
+}
+
+void PPCDAGToDAGISel::PeepholePPC64() {
+  // These optimizations are currently supported only for 64-bit SVR4.
+  if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
+    return;
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = --Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    unsigned FirstOp;
+    unsigned StorageOpcode = N->getMachineOpcode();
+
+    switch (StorageOpcode) {
+    default: continue;
+
+    case PPC::LBZ:
+    case PPC::LBZ8:
+    case PPC::LD:
+    case PPC::LFD:
+    case PPC::LFS:
+    case PPC::LHA:
+    case PPC::LHA8:
+    case PPC::LHZ:
+    case PPC::LHZ8:
+    case PPC::LWA:
+    case PPC::LWZ:
+    case PPC::LWZ8:
+      FirstOp = 0;
+      break;
+
+    case PPC::STB:
+    case PPC::STB8:
+    case PPC::STD:
+    case PPC::STFD:
+    case PPC::STFS:
+    case PPC::STH:
+    case PPC::STH8:
+    case PPC::STW:
+    case PPC::STW8:
+      FirstOp = 1;
+      break;
+    }
+
+    // If this is a load or store with a zero offset, we may be able to
+    // fold an add-immediate into the memory operation.
+    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) ||
+        N->getConstantOperandVal(FirstOp) != 0)
+      continue;
+
+    SDValue Base = N->getOperand(FirstOp + 1);
+    if (!Base.isMachineOpcode())
+      continue;
+
+    unsigned Flags = 0;
+    bool ReplaceFlags = true;
+
+    // When the feeding operation is an add-immediate of some sort,
+    // determine whether we need to add relocation information to the
+    // target flags on the immediate operand when we fold it into the
+    // load instruction.
+    //
+    // For something like ADDItocL, the relocation information is
+    // inferred from the opcode; when we process it in the AsmPrinter,
+    // we add the necessary relocation there.  A load, though, can receive
+    // relocation from various flavors of ADDIxxx, so we need to carry
+    // the relocation information in the target flags.
+    switch (Base.getMachineOpcode()) {
+    default: continue;
+
+    case PPC::ADDI8:
+    case PPC::ADDI:
+      // In some cases (such as TLS) the relocation information
+      // is already in place on the operand, so copying the operand
+      // is sufficient.
+      ReplaceFlags = false;
+      // For these cases, the immediate may not be divisible by 4, in
+      // which case the fold is illegal for DS-form instructions.  (The
+      // other cases provide aligned addresses and are always safe.)
+      if ((StorageOpcode == PPC::LWA ||
+           StorageOpcode == PPC::LD  ||
+           StorageOpcode == PPC::STD) &&
+          (!isa<ConstantSDNode>(Base.getOperand(1)) ||
+           Base.getConstantOperandVal(1) % 4 != 0))
+        continue;
+      break;
+    case PPC::ADDIdtprelL:
+      Flags = PPCII::MO_DTPREL_LO;
+      break;
+    case PPC::ADDItlsldL:
+      Flags = PPCII::MO_TLSLD_LO;
+      break;
+    case PPC::ADDItocL:
+      Flags = PPCII::MO_TOC_LO;
+      break;
+    }
+
+    // We found an opportunity.  Reverse the operands from the add
+    // immediate and substitute them into the load or store.  If
+    // needed, update the target flags for the immediate operand to
+    // reflect the necessary relocation information.
+    DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase:    ");
+    DEBUG(Base->dump(CurDAG));
+    DEBUG(dbgs() << "\nN: ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    SDValue ImmOpnd = Base.getOperand(1);
+
+    // If the relocation information isn't already present on the
+    // immediate operand, add it now.
+    if (ReplaceFlags) {
+      if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+        SDLoc dl(GA);
+        const GlobalValue *GV = GA->getGlobal();
+        // We can't perform this optimization for data whose alignment
+        // is insufficient for the instruction encoding.
+        if (GV->getAlignment() < 4 &&
+            (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+             StorageOpcode == PPC::LWA)) {
+          DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+          continue;
+        }
+        ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
+      } else if (ConstantPoolSDNode *CP =
+                 dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
+        const Constant *C = CP->getConstVal();
+        ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
+                                                CP->getAlignment(),
+                                                0, Flags);
+      }
+    }
+
+    if (FirstOp == 1) // Store
+      (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
+                                       Base.getOperand(0), N->getOperand(3));
+    else // Load
+      (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
+                                       N->getOperand(2));
+
+    // The add-immediate may now be dead, in which case remove it.
+    if (Base.getNode()->use_empty())
+      CurDAG->RemoveDeadNode(Base.getNode());
+  }
+}
+
+
+/// createPPCISelDag - This pass converts a legalized DAG into a
+/// PowerPC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+  return new PPCDAGToDAGISel(TM);
+}
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection";
+  PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID,
+                              nullptr, false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 0000000..1247e86
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,9290 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPerfectShuffle.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
+cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+
+// FIXME: Remove this once the bug has been fixed!
+extern cl::opt<bool> ANDIGlueBug;
+
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file then it's going to be a linux ELF
+  // object file.
+  if (TT.isOSDarwin())
+    return new TargetLoweringObjectFileMachO();
+
+  return new PPC64LinuxTargetObjectFile();
+}
+
+PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+      Subtarget(*TM.getSubtargetImpl()) {
+  setPow2DivIsCheap();
+
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
+  // arguments are at least 4/8 bytes aligned.
+  bool isPPC64 = Subtarget.isPPC64();
+  setMinStackArgumentAlignment(isPPC64 ? 8:4);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+  addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PowerPC has pre-inc load and store's.
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+
+  if (Subtarget.useCRBits()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+    if (isPPC64 || Subtarget.hasFPCVT()) {
+      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+      AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
+                         isPPC64 ? MVT::i64 : MVT::i32);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+      AddPromotedToType (ISD::UINT_TO_FP, MVT::i1, 
+                         isPPC64 ? MVT::i64 : MVT::i32);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
+    }
+
+    // PowerPC does not support direct load / store of condition registers
+    setOperationAction(ISD::LOAD, MVT::i1, Custom);
+    setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+    // FIXME: Remove this once the ANDI glue bug is fixed:
+    if (ANDIGlueBug)
+      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
+
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+    setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+    setTruncStoreAction(MVT::i32, MVT::i1, Expand);
+    setTruncStoreAction(MVT::i16, MVT::i1, Expand);
+    setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+
+    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
+  }
+
+  // This is used in the ppcf128->int sequence.  Note it has different semantics
+  // from FP_ROUND:  that rounds to nearest, this rounds to zero.
+  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+
+  // We do not currently implement these libm ops for PowerPC.
+  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FRINT,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
+
+  // PowerPC has no SREM/UREM instructions
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+  // We don't support sin/cos/sqrt/fmod/pow
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Legal);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Legal);
+
+  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
+  // If we're enabling GP optimizations, use hardware square root
+  if (!Subtarget.hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
+    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+
+  if (!Subtarget.hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
+    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+  if (Subtarget.hasFCPSGN()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+  } else {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  }
+
+  if (Subtarget.hasFPRND()) {
+    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::f64, Legal);
+
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
+  }
+
+  // PowerPC does not have BSWAP, CTPOP or CTTZ
+  setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+
+  if (Subtarget.hasPOPCNTD()) {
+    setOperationAction(ISD::CTPOP, MVT::i32  , Legal);
+    setOperationAction(ISD::CTPOP, MVT::i64  , Legal);
+  } else {
+    setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
+    setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
+  }
+
+  // PowerPC does not have ROTR
+  setOperationAction(ISD::ROTR, MVT::i32   , Expand);
+  setOperationAction(ISD::ROTR, MVT::i64   , Expand);
+
+  if (!Subtarget.useCRBits()) {
+    // PowerPC does not have Select
+    setOperationAction(ISD::SELECT, MVT::i32, Expand);
+    setOperationAction(ISD::SELECT, MVT::i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::f32, Expand);
+    setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  }
+
+  // PowerPC wants to turn select_cc of FP into fsel when possible.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // PowerPC wants to optimize integer setcc a bit
+  if (!Subtarget.useCRBits())
+    setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+  // PowerPC does not have BRCOND which requires SetCC
+  if (!Subtarget.useCRBits())
+    setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
+
+  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // PowerPC does not have [U|S]INT_TO_FP
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+  setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+
+  // We cannot sextinreg(i1).  Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and etc.. As a result, no
+  // other SjLj exception interfaces are implemented and please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // We want to legalize GlobalAddress and ConstantPool nodes into the
+  // appropriate instructions to materialize the address.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i64, Custom);
+
+  // TRAP is legal.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+
+  if (Subtarget.isSVR4ABI()) {
+    if (isPPC64) {
+      // VAARG always uses double-word chunks, so promote anything smaller.
+      setOperationAction(ISD::VAARG, MVT::i1, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i8, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i16, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i32, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::Other, Expand);
+    } else {
+      // VAARG is custom lowered with the 32-bit SVR4 ABI.
+      setOperationAction(ISD::VAARG, MVT::Other, Custom);
+      setOperationAction(ISD::VAARG, MVT::i64, Custom);
+    }
+  } else
+    setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+  if (Subtarget.isSVR4ABI() && !isPPC64)
+    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
+    setOperationAction(ISD::VACOPY            , MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // To handle counter-based loop conditions.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+
+  // Comparisons that require checking two conditions.
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+
+  if (Subtarget.has64BitSupport()) {
+    // They also have instructions for converting between i64 and fp.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+    // This is just the low 32 bits of a (signed) fp->i64 conversion.
+    // We cannot do this with Promote because i64 is not a legal type.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
+      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  } else {
+    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  }
+
+  // With the instructions enabled under FPCVT, we can do everything.
+  if (Subtarget.hasFPCVT()) {
+    if (Subtarget.has64BitSupport()) {
+      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+    }
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  }
+
+  if (Subtarget.use64BitRegs()) {
+    // 64-bit PowerPC implementations can support i64 types directly
+    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
+    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+    // 64-bit PowerPC wants to expand i128 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+  } else {
+    // 32-bit PowerPC wants to expand i64 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  }
+
+  if (Subtarget.hasAltivec()) {
+    // First set operation action for all vector types to expand. Then we
+    // will selectively turn on ones that can be effectively codegen'd.
+    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+
+      // add/sub are legal for all supported vector VT's.
+      setOperationAction(ISD::ADD , VT, Legal);
+      setOperationAction(ISD::SUB , VT, Legal);
+
+      // We promote all shuffles to v16i8.
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
+      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
+
+      // We promote all non-typed operations to v4i32.
+      setOperationAction(ISD::AND   , VT, Promote);
+      AddPromotedToType (ISD::AND   , VT, MVT::v4i32);
+      setOperationAction(ISD::OR    , VT, Promote);
+      AddPromotedToType (ISD::OR    , VT, MVT::v4i32);
+      setOperationAction(ISD::XOR   , VT, Promote);
+      AddPromotedToType (ISD::XOR   , VT, MVT::v4i32);
+      setOperationAction(ISD::LOAD  , VT, Promote);
+      AddPromotedToType (ISD::LOAD  , VT, MVT::v4i32);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+      setOperationAction(ISD::STORE, VT, Promote);
+      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
+
+      // No other operations are legal.
+      setOperationAction(ISD::MUL , VT, Expand);
+      setOperationAction(ISD::SDIV, VT, Expand);
+      setOperationAction(ISD::SREM, VT, Expand);
+      setOperationAction(ISD::UDIV, VT, Expand);
+      setOperationAction(ISD::UREM, VT, Expand);
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+      setOperationAction(ISD::FNEG, VT, Expand);
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FLOG, VT, Expand);
+      setOperationAction(ISD::FLOG10, VT, Expand);
+      setOperationAction(ISD::FLOG2, VT, Expand);
+      setOperationAction(ISD::FEXP, VT, Expand);
+      setOperationAction(ISD::FEXP2, VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FABS, VT, Expand);
+      setOperationAction(ISD::FPOWI, VT, Expand);
+      setOperationAction(ISD::FFLOOR, VT, Expand);
+      setOperationAction(ISD::FCEIL,  VT, Expand);
+      setOperationAction(ISD::FTRUNC, VT, Expand);
+      setOperationAction(ISD::FRINT,  VT, Expand);
+      setOperationAction(ISD::FNEARBYINT, VT, Expand);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::UDIVREM, VT, Expand);
+      setOperationAction(ISD::SDIVREM, VT, Expand);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::BSWAP, VT, Expand);
+      setOperationAction(ISD::CTPOP, VT, Expand);
+      setOperationAction(ISD::CTLZ, VT, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::CTTZ, VT, Expand);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::VSELECT, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
+        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+        setTruncStoreAction(VT, InnerVT, Expand);
+      }
+      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+    }
+
+    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
+    // with merges, splats, etc.
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::AND   , MVT::v4i32, Legal);
+    setOperationAction(ISD::OR    , MVT::v4i32, Legal);
+    setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
+    setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::v4i32,
+                       Subtarget.useCRBits() ? Legal : Expand);
+    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+
+    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
+
+    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+
+    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    }
+
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    // Altivec does not contain unordered floating-point compare instructions
+    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETO,   MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
+
+    if (Subtarget.hasVSX()) {
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
+      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+
+      // Share the Altivec comparison restrictions.
+      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
+      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
+      setCondCodeAction(ISD::SETO,   MVT::v2f64, Expand);
+      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
+
+      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+      setOperationAction(ISD::STORE, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
+
+      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
+
+      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
+      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
+
+      // VSX v2i64 only supports non-arithmetic operations.
+      setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+      setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+
+      setOperationAction(ISD::SHL, MVT::v2i64, Expand);
+      setOperationAction(ISD::SRA, MVT::v2i64, Expand);
+      setOperationAction(ISD::SRL, MVT::v2i64, Expand);
+
+      setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+
+      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+      AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
+      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+      AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
+
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+
+      // Vector operation legalization checks the result type of
+      // SIGN_EXTEND_INREG, overall legalization checks the inner type.
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+
+      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
+    }
+  }
+
+  if (Subtarget.has64BitSupport()) {
+    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+  }
+
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // Altivec instructions set fields to all zeros or all ones.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  if (!isPPC64) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  if (isPPC64) {
+    setStackPointerRegisterToSaveRestore(PPC::X1);
+    setExceptionPointerRegister(PPC::X3);
+    setExceptionSelectorRegister(PPC::X4);
+  } else {
+    setStackPointerRegisterToSaveRestore(PPC::R1);
+    setExceptionPointerRegister(PPC::R3);
+    setExceptionSelectorRegister(PPC::R4);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::BR_CC);
+  if (Subtarget.useCRBits())
+    setTargetDAGCombine(ISD::BRCOND);
+  setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+
+  if (Subtarget.useCRBits()) {
+    setTargetDAGCombine(ISD::TRUNCATE);
+    setTargetDAGCombine(ISD::SETCC);
+    setTargetDAGCombine(ISD::SELECT_CC);
+  }
+
+  // Use reciprocal estimates.
+  if (TM.Options.UnsafeFPMath) {
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::FSQRT);
+  }
+
+  // Darwin long double math library functions have $LDBL128 appended.
+  if (Subtarget.isDarwin()) {
+    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+  }
+
+  // With 32 condition bits, we don't need to sink (and duplicate) compares
+  // aggressively in CodeGenPrep.
+  if (Subtarget.useCRBits())
+    setHasMultipleConditionRegisters();
+
+  setMinFunctionAlignment(2);
+  if (Subtarget.isDarwin())
+    setPrefFunctionAlignment(4);
+
+  if (isPPC64 && Subtarget.isJITCodeModel())
+    // Temporary workaround for the inability of PPC64 JIT to handle jump
+    // tables.
+    setSupportJumpTables(false);
+
+  setInsertFencesForAtomic(true);
+
+  if (Subtarget.enableMachineScheduler())
+    setSchedulingPreference(Sched::Source);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  computeRegisterProperties();
+
+  // The Freescale cores does better with aggressive inlining of memcpy and
+  // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
+    MaxStoresPerMemset = 32;
+    MaxStoresPerMemsetOptSize = 16;
+    MaxStoresPerMemcpy = 32;
+    MaxStoresPerMemcpyOptSize = 8;
+    MaxStoresPerMemmove = 32;
+    MaxStoresPerMemmoveOptSize = 8;
+
+    setPrefFunctionAlignment(4);
+  }
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+                             unsigned MaxMaxAlign) {
+  if (MaxAlign == MaxMaxAlign)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
+      MaxAlign = 32;
+    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == MaxMaxAlign)
+        break;
+    }
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
+  // Darwin passes everything on 4 byte boundary.
+  if (Subtarget.isDarwin())
+    return 4;
+
+  // 16byte and wider vectors are passed on 16byte boundary.
+  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
+  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
+    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
+  return Align;
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  case PPCISD::FSEL:            return "PPCISD::FSEL";
+  case PPCISD::FCFID:           return "PPCISD::FCFID";
+  case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
+  case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FRE:             return "PPCISD::FRE";
+  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
+  case PPCISD::STFIWX:          return "PPCISD::STFIWX";
+  case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
+  case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
+  case PPCISD::VPERM:           return "PPCISD::VPERM";
+  case PPCISD::Hi:              return "PPCISD::Hi";
+  case PPCISD::Lo:              return "PPCISD::Lo";
+  case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
+  case PPCISD::LOAD:            return "PPCISD::LOAD";
+  case PPCISD::LOAD_TOC:        return "PPCISD::LOAD_TOC";
+  case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
+  case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
+  case PPCISD::SRL:             return "PPCISD::SRL";
+  case PPCISD::SRA:             return "PPCISD::SRA";
+  case PPCISD::SHL:             return "PPCISD::SHL";
+  case PPCISD::CALL:            return "PPCISD::CALL";
+  case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
+  case PPCISD::MTCTR:           return "PPCISD::MTCTR";
+  case PPCISD::BCTRL:           return "PPCISD::BCTRL";
+  case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
+  case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
+  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
+  case PPCISD::MFOCRF:          return "PPCISD::MFOCRF";
+  case PPCISD::VCMP:            return "PPCISD::VCMP";
+  case PPCISD::VCMPo:           return "PPCISD::VCMPo";
+  case PPCISD::LBRX:            return "PPCISD::LBRX";
+  case PPCISD::STBRX:           return "PPCISD::STBRX";
+  case PPCISD::LARX:            return "PPCISD::LARX";
+  case PPCISD::STCX:            return "PPCISD::STCX";
+  case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
+  case PPCISD::BDNZ:            return "PPCISD::BDNZ";
+  case PPCISD::BDZ:             return "PPCISD::BDZ";
+  case PPCISD::MFFS:            return "PPCISD::MFFS";
+  case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
+  case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
+  case PPCISD::ADDIS_TOC_HA:    return "PPCISD::ADDIS_TOC_HA";
+  case PPCISD::LD_TOC_L:        return "PPCISD::LD_TOC_L";
+  case PPCISD::ADDI_TOC_L:      return "PPCISD::ADDI_TOC_L";
+  case PPCISD::PPC32_GOT:       return "PPCISD::PPC32_GOT";
+  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
+  case PPCISD::LD_GOT_TPREL_L:  return "PPCISD::LD_GOT_TPREL_L";
+  case PPCISD::ADD_TLS:         return "PPCISD::ADD_TLS";
+  case PPCISD::ADDIS_TLSGD_HA:  return "PPCISD::ADDIS_TLSGD_HA";
+  case PPCISD::ADDI_TLSGD_L:    return "PPCISD::ADDI_TLSGD_L";
+  case PPCISD::GET_TLS_ADDR:    return "PPCISD::GET_TLS_ADDR";
+  case PPCISD::ADDIS_TLSLD_HA:  return "PPCISD::ADDIS_TLSLD_HA";
+  case PPCISD::ADDI_TLSLD_L:    return "PPCISD::ADDI_TLSLD_L";
+  case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
+  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
+  case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
+  case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
+  case PPCISD::SC:              return "PPCISD::SC";
+  }
+}
+
+EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
+      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+        return CFP->getValueAPF().isZero();
+  }
+  return false;
+}
+
+/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(int Op, int Val) {
+  return Op < 0 || Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operantion with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  if (ShuffleKind == 0) {
+    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
+    for (unsigned i = 0; i != 8; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i),    i*2+j) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j))
+        return false;
+  }
+  return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operantion with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  if (ShuffleKind == 0) {
+    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 2;
+    for (unsigned i = 0; i != 8; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+j+1))
+        return false;
+  }
+  return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+                     unsigned LHSStart, unsigned RHSStart) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return false;
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+
+  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
+    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
+      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
+                             LHSStart+j+i*UnitSize) ||
+          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
+                             RHSStart+j+i*UnitSize))
+        return false;
+    }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two 
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2).  For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 2) // swapped
+      return isVMerge(N, UnitSize, 0, 16);
+    else
+      return false;
+  } else {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 0) // normal
+      return isVMerge(N, UnitSize, 8, 24);
+    else
+      return false;
+  }
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two 
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2).  For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 2) // swapped
+      return isVMerge(N, UnitSize, 8, 24);
+    else
+      return false;
+  } else {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 0) // normal
+      return isVMerge(N, UnitSize, 0, 16);
+    else
+      return false;
+  }
+}
+
+
+/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+/// amount, otherwise return -1.
+/// The ShuffleKind distinguishes between big-endian operations with two 
+/// different inputs (0), either-endian operations with two identical inputs
+/// (1), and little-endian operations with two different inputs (2).  For the
+/// latter, the input operands are swapped (see PPCInstrAltivec.td).
+int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                             SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return -1;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
+    /*search*/;
+
+  if (i == 16) return -1;  // all undef.
+
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
+  unsigned ShiftAmt = SVOp->getMaskElt(i);
+  if (ShiftAmt < i) return -1;
+
+  ShiftAmt -= i;
+  bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian();
+
+  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+        return -1;
+  } else if (ShuffleKind == 1) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+        return -1;
+  } else
+    return -1;
+
+  if (ShuffleKind == 2 && isLE)
+    ShiftAmt = 16 - ShiftAmt;
+
+  return ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of a single element that is suitable for input to
+/// VSPLTB/VSPLTH/VSPLTW.
+bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  unsigned ElementBase = N->getMaskElt(0);
+
+  // FIXME: Handle UNDEF elements too!
+  if (ElementBase >= 16)
+    return false;
+
+  // Check that the indices are consecutive, in the case of a multi-byte element
+  // splatted with a v16i8 mask.
+  for (unsigned i = 1; i != EltSize; ++i)
+    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
+      return false;
+
+  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+    if (N->getMaskElt(i) < 0) continue;
+    for (unsigned j = 0; j != EltSize; ++j)
+      if (N->getMaskElt(i+j) != N->getMaskElt(j))
+        return false;
+  }
+  return true;
+}
+
+/// isAllNegativeZeroVector - Returns true if all elements of build_vector
+/// are -0.0.
+bool PPC::isAllNegativeZeroVector(SDNode *N) {
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+
+  APInt APVal, APUndef;
+  unsigned BitSize;
+  bool HasAnyUndefs;
+
+  if (BV->isConstantSplat(APVal, APUndef, BitSize, HasAnyUndefs, 32, true))
+    if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+      return CFP->getValueAPF().isNegZero();
+
+  return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
+                                SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  assert(isSplatShuffleMask(SVOp, EltSize));
+  if (DAG.getTarget().getDataLayout()->isLittleEndian())
+    return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
+  else
+    return SVOp->getMaskElt(0) / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted.  The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  SDValue OpVal(nullptr, 0);
+
+  // If ByteSize of the splat is bigger than the element size of the
+  // build_vector, then we have a case where we are checking for a splat where
+  // multiple elements of the buildvector are folded together into a single
+  // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
+  unsigned EltSize = 16/N->getNumOperands();
+  if (EltSize < ByteSize) {
+    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
+    SDValue UniquedVals[4];
+    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+    // See if all of the elements in the buildvector agree across.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+      // If the element isn't a constant, bail fully out.
+      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+
+      if (!UniquedVals[i&(Multiple-1)].getNode())
+        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+        return SDValue();  // no match.
+    }
+
+    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+    // either constant or undef values that are identical for each chunk.  See
+    // if these chunks can form into a larger vspltis*.
+
+    // Check to see if all of the leading entries are either 0 or -1.  If
+    // neither, then this won't fit into the immediate field.
+    bool LeadingZero = true;
+    bool LeadingOnes = true;
+    for (unsigned i = 0; i != Multiple-1; ++i) {
+      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.
+
+      LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
+      LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+    }
+    // Finally, check the least significant entry.
+    if (LeadingZero) {
+      if (!UniquedVals[Multiple-1].getNode())
+        return DAG.getTargetConstant(0, MVT::i32);  // 0,0,0,undef
+      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+      if (Val < 16)
+        return DAG.getTargetConstant(Val, MVT::i32);  // 0,0,0,4 -> vspltisw(4)
+    }
+    if (LeadingOnes) {
+      if (!UniquedVals[Multiple-1].getNode())
+        return DAG.getTargetConstant(~0U, MVT::i32);  // -1,-1,-1,undef
+      int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
+      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
+        return DAG.getTargetConstant(Val, MVT::i32);
+    }
+
+    return SDValue();
+  }
+
+  // Check to see if this buildvec has a single non-undef value in its elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (!OpVal.getNode())
+      OpVal = N->getOperand(i);
+    else if (OpVal != N->getOperand(i))
+      return SDValue();
+  }
+
+  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.
+
+  unsigned ValSizeInBytes = EltSize;
+  uint64_t Value = 0;
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+    Value = CN->getZExtValue();
+  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+    Value = FloatToBits(CN->getValueAPF().convertToFloat());
+  }
+
+  // If the splat value is larger than the element value, then we can never do
+  // this splat.  The only case that we could fit the replicated bits into our
+  // immediate field for would be zero, and we prefer to use vxor for it.
+  if (ValSizeInBytes < ByteSize) return SDValue();
+
+  // If the element value is larger than the splat value, cut it in half and
+  // check to see if the two halves are equal.  Continue doing this until we
+  // get to ByteSize.  This allows us to handle 0x01010101 as 0x01.
+  while (ValSizeInBytes > ByteSize) {
+    ValSizeInBytes >>= 1;
+
+    // If the top half equals the bottom half, we're still ok.
+    if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) !=
+         (Value                        & ((1 << (8*ValSizeInBytes))-1)))
+      return SDValue();
+  }
+
+  // Properly sign extend the value.
+  int MaskVal = SignExtend32(Value, ByteSize * 8);
+
+  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+  if (MaskVal == 0) return SDValue();
+
+  // Finally, if this value fits in a 5 bit sext field, return it
+  if (SignExtend32<5>(MaskVal) == MaskVal)
+    return DAG.getTargetConstant(MaskVal, MVT::i32);
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// SelectAddressRegReg - Given the specified addressed, check to see if it
+/// can be represented as an indexed [r+r] operation.  Returns false if it
+/// can be more efficiently represented with [r+imm].
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
+                                            SDValue &Index,
+                                            SelectionDAG &DAG) const {
+  short imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i
+    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+      return false;    // r+i
+
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i can fold it if we can.
+
+    // If this is an or of disjoint bitfields, we can codegen this as an add
+    // (for better address arithmetic) if the LHS and RHS of the OR are provably
+    // disjoint.
+    APInt LHSKnownZero, LHSKnownOne;
+    APInt RHSKnownZero, RHSKnownOne;
+    DAG.computeKnownBits(N.getOperand(0),
+                         LHSKnownZero, LHSKnownOne);
+
+    if (LHSKnownZero.getBoolValue()) {
+      DAG.computeKnownBits(N.getOperand(1),
+                           RHSKnownZero, RHSKnownOne);
+      // If all of the bits are known zero on the LHS or RHS, the add won't
+      // carry.
+      if (~(LHSKnownZero | RHSKnownZero) == 0) {
+        Base = N.getOperand(0);
+        Index = N.getOperand(1);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// If we happen to be doing an i64 load or store into a stack slot that has
+// less than a 4-byte alignment, then the frame-index elimination may need to
+// use an indexed load or store instruction (because the offset may not be a
+// multiple of 4). The extra register needed to hold the offset comes from the
+// register scavenger, and it is possible that the scavenger will need to use
+// an emergency spill slot. As a result, we need to make sure that a spill slot
+// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
+// stack slot.
+static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
+  // FIXME: This does not handle the LWA case.
+  if (VT != MVT::i64)
+    return;
+
+  // NOTE: We'll exclude negative FIs here, which come from argument
+  // lowering, because there are no known test cases triggering this problem
+  // using packed structures (or similar). We can remove this exclusion if
+  // we find such a test case. The reason why this is so test-case driven is
+  // because this entire 'fixup' is only to prevent crashes (from the
+  // register scavenger) on not-really-valid inputs. For example, if we have:
+  //   %a = alloca i1
+  //   %b = bitcast i1* %a to i64*
+  //   store i64* a, i64 b
+  // then the store should really be marked as 'align 1', but is not. If it
+  // were marked as 'align 1' then the indexed form would have been
+  // instruction-selected initially, and the problem this 'fixup' is preventing
+  // won't happen regardless.
+  if (FrameIdx < 0)
+    return;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+  if (Align >= 4)
+    return;
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasNonRISpills();
+}
+
+/// Returns true if the address N can be represented by a base register plus
+/// a signed 16-bit displacement [r+imm], and if it is not better
+/// represented as reg+reg.  If Aligned is true, only accept displacements
+/// suitable for STD and friends, i.e. multiples of 4.
+bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
+                                            SDValue &Base,
+                                            SelectionDAG &DAG,
+                                            bool Aligned) const {
+  // FIXME dl should come from parent load or store, not from address
+  SDLoc dl(N);
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddressRegReg(N, Disp, Base, DAG))
+    return false;
+
+  if (N.getOpcode() == ISD::ADD) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) &&
+        (!Aligned || (imm & 3) == 0)) {
+      Disp = DAG.getTargetConstant(imm, N.getValueType());
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+        fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+      // Match LOAD (ADD (X, Lo(G))).
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+             && "Cannot handle constant offsets yet!");
+      Disp = N.getOperand(1).getOperand(0);  // The global address.
+      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
+             Disp.getOpcode() == ISD::TargetConstantPool ||
+             Disp.getOpcode() == ISD::TargetJumpTable);
+      Base = N.getOperand(0);
+      return true;  // [&g+r]
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) &&
+        (!Aligned || (imm & 3) == 0)) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      APInt LHSKnownZero, LHSKnownOne;
+      DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+        // If all of the bits are known zero on the LHS or RHS, the add won't
+        // carry.
+        if (FrameIndexSDNode *FI =
+              dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+        } else {
+          Base = N.getOperand(0);
+        }
+        Disp = DAG.getTargetConstant(imm, N.getValueType());
+        return true;
+      }
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.
+
+    // If this address fits entirely in a 16-bit sext immediate field, codegen
+    // this as "d, 0"
+    short Imm;
+    if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
+      Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
+      Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                             CN->getValueType(0));
+      return true;
+    }
+
+    // Handle 32-bit sext immediates with LIS + addr mode.
+    if ((CN->getValueType(0) == MVT::i32 ||
+         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
+        (!Aligned || (CN->getZExtValue() & 3) == 0)) {
+      int Addr = (int)CN->getZExtValue();
+
+      // Otherwise, break this down into an LIS + disp.
+      Disp = DAG.getTargetConstant((short)Addr, MVT::i32);
+
+      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
+      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
+      return true;
+    }
+  }
+
+  Disp = DAG.getTargetConstant(0, getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+  } else
+    Base = N;
+  return true;      // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+                                                SDValue &Index,
+                                                SelectionDAG &DAG) const {
+  // Check to see if we can easily represent this as an [r+r] address.  This
+  // will fail if it thinks that the address is more profitably represented as
+  // reg+imm, e.g. where imm = 0.
+  if (SelectAddressRegReg(N, Base, Index, DAG))
+    return true;
+
+  // If the operand is an addition, always emit this as [r+r], since this is
+  // better (for code size, and execution, as the memop does the add for free)
+  // than emitting an explicit add.
+  if (N.getOpcode() == ISD::ADD) {
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+
+  // Otherwise, do it the hard way, using R0 as the base register.
+  Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                         N.getValueType());
+  Index = N;
+  return true;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                  SDValue &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) const {
+  if (DisablePPCPreinc) return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  unsigned Alignment;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT = LD->getMemoryVT();
+    Alignment = LD->getAlignment();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+    Alignment = ST->getAlignment();
+    isLoad = false;
+  } else
+    return false;
+
+  // PowerPC doesn't have preinc load/store instructions for vectors.
+  if (VT.isVector())
+    return false;
+
+  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
+
+    // Common code will reject creating a pre-inc form if the base pointer
+    // is a frame index, or if N is a store and the base pointer is either
+    // the same as or a predecessor of the value being stored.  Check for
+    // those situations here, and try with swapped Base/Offset instead.
+    bool Swap = false;
+
+    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
+      Swap = true;
+    else if (!isLoad) {
+      SDValue Val = cast<StoreSDNode>(N)->getValue();
+      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
+        Swap = true;
+    }
+
+    if (Swap)
+      std::swap(Base, Offset);
+
+    AM = ISD::PRE_INC;
+    return true;
+  }
+
+  // LDU/STU can only handle immediates that are a multiple of 4.
+  if (VT != MVT::i64) {
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
+      return false;
+  } else {
+    // LDU/STU need an address with at least 4-byte alignment.
+    if (Alignment < 4)
+      return false;
+
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
+      return false;
+  }
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
+    // sext i32 to i64 when addr mode is r+i.
+    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
+        LD->getExtensionType() == ISD::SEXTLOAD &&
+        isa<ConstantSDNode>(Offset))
+      return false;
+  }
+
+  AM = ISD::PRE_INC;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+/// GetLabelAccessInfo - Return true if we should reference labels using a
+/// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
+static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
+                               unsigned &LoOpFlags,
+                               const GlobalValue *GV = nullptr) {
+  HiOpFlags = PPCII::MO_HA;
+  LoOpFlags = PPCII::MO_LO;
+
+  // Don't use the pic base if not in PIC relocation model.
+  bool isPIC = TM.getRelocationModel() == Reloc::PIC_;
+
+  if (isPIC) {
+    HiOpFlags |= PPCII::MO_PIC_FLAG;
+    LoOpFlags |= PPCII::MO_PIC_FLAG;
+  }
+
+  // If this is a reference to a global value that requires a non-lazy-ptr, make
+  // sure that instruction lowering adds it.
+  if (GV && TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM)) {
+    HiOpFlags |= PPCII::MO_NLP_FLAG;
+    LoOpFlags |= PPCII::MO_NLP_FLAG;
+
+    if (GV->hasHiddenVisibility()) {
+      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+    }
+  }
+
+  return isPIC;
+}
+
+static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
+                             SelectionDAG &DAG) {
+  EVT PtrVT = HiPart.getValueType();
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  SDLoc DL(HiPart);
+
+  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
+  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
+
+  // With PIC, the first instruction is actually "GR+hi(&G)".
+  if (isPIC)
+    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
+
+  // Generate non-pic code that has direct accesses to the constant pool.
+  // The address of the global is just (hi(&g)+lo(&g)).
+  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+}
+
+SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  const Constant *C = CP->getConstVal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+                                           PPCII::MO_PIC_FLAG);
+    SDLoc DL(CP);
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+  }
+
+  SDValue CPIHi =
+    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+  SDValue CPILo =
+    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+  return LowerLabelRef(CPIHi, CPILo, isPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                        PPCII::MO_PIC_FLAG);
+    SDLoc DL(GA);
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+  }
+
+  SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
+  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
+  return LowerLabelRef(JTIHi, JTILo, isPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
+  return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  // FIXME: TLS addresses currently use medium model code sequences,
+  // which is the most useful form.  Eventually support for small and
+  // large models could be added if users need it, at the cost of
+  // additional complexity.
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  SDLoc dl(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+  bool is64bit = Subtarget.isPPC64();
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+
+  if (Model == TLSModel::LocalExec) {
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL_HA);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL_LO);
+    SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+                                     is64bit ? MVT::i64 : MVT::i32);
+    SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+    return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+  }
+
+  if (Model == TLSModel::InitialExec) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                                PPCII::MO_TLS);
+    SDValue GOTPtr;
+    if (is64bit) {
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
+                           PtrVT, GOTReg, TGA);
+    } else
+      GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+    SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
+                                   PtrVT, TGA, GOTPtr);
+    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
+  }
+
+  if (Model == TLSModel::GeneralDynamic) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTPtr;
+    if (is64bit) {
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
+                                   GOTReg, TGA);
+    } else {
+      GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+    }
+    SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
+                                   GOTPtr, TGA);
+
+    // We need a chain node, and don't have one handy.  The underlying
+    // call has no side effects, so using the function entry node
+    // suffices.
+    SDValue Chain = DAG.getEntryNode();
+    Chain = DAG.getCopyToReg(Chain, dl,
+                             is64bit ? PPC::X3 : PPC::R3, GOTEntry);
+    SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3,
+                                      is64bit ? MVT::i64 : MVT::i32);
+    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl,
+                                  PtrVT, ParmReg, TGA);
+    // The return value from GET_TLS_ADDR really is in X3 already, but
+    // some hacks are needed here to tie everything together.  The extra
+    // copies dissolve during subsequent transforms.
+    Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr);
+    return DAG.getCopyFromReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, PtrVT);
+  }
+
+  if (Model == TLSModel::LocalDynamic) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTPtr;
+    if (is64bit) {
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
+                           GOTReg, TGA);
+    } else {
+      GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+    }
+    SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
+                                   GOTPtr, TGA);
+
+    // We need a chain node, and don't have one handy.  The underlying
+    // call has no side effects, so using the function entry node
+    // suffices.
+    SDValue Chain = DAG.getEntryNode();
+    Chain = DAG.getCopyToReg(Chain, dl,
+                             is64bit ? PPC::X3 : PPC::R3, GOTEntry);
+    SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3,
+                                      is64bit ? MVT::i64 : MVT::i32);
+    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl,
+                                  PtrVT, ParmReg, TGA);
+    // The return value from GET_TLSLD_ADDR really is in X3 already, but
+    // some hacks are needed here to tie everything together.  The extra
+    // copies dissolve during subsequent transforms.
+    Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr);
+    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
+                                      Chain, ParmReg, TGA);
+    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
+  }
+
+  llvm_unreachable("Unknown TLS model!");
+}
+
+SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GSDN);
+  const GlobalValue *GV = GSDN->getGlobal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
+
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+                                            GSDN->getOffset(),
+                                            PPCII::MO_PIC_FLAG);
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32));
+  }
+
+  SDValue GAHi =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
+  SDValue GALo =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
+
+  SDValue Ptr = LowerLabelRef(GAHi, GALo, isPIC, DAG);
+
+  // If the global reference is actually to a non-lazy-pointer, we have to do an
+  // extra load to get the address of the global.
+  if (MOHiFlag & PPCII::MO_NLP_FLAG)
+    Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(),
+                      false, false, false, 0);
+  return Ptr;
+}
+
+SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc dl(Op);
+
+  if (Op.getValueType() == MVT::v2i64) {
+    // When the operands themselves are v2i64 values, we need to do something
+    // special because VSX has no underlying comparison operations for these.
+    if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+      // Equality can be handled by casting to the legal type for Altivec
+      // comparisons, everything else needs to be expanded.
+      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+        return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                 DAG.getSetCC(dl, MVT::v4i32,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
+                   CC));
+      }
+
+      return SDValue();
+    }
+
+    // We handle most of these in the usual way.
+    return Op;
+  }
+
+  // If we're comparing for equality to zero, expose the fact that this is
+  // implented as a ctlz/srl pair on ppc, so that the dag combiner can
+  // fold the new nodes.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    if (C->isNullValue() && CC == ISD::SETEQ) {
+      EVT VT = Op.getOperand(0).getValueType();
+      SDValue Zext = Op.getOperand(0);
+      if (VT.bitsLT(MVT::i32)) {
+        VT = MVT::i32;
+        Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
+      }
+      unsigned Log2b = Log2_32(VT.getSizeInBits());
+      SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
+      SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
+                                DAG.getConstant(Log2b, MVT::i32));
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+    }
+    // Leave comparisons against 0 and -1 alone for now, since they're usually
+    // optimized.  FIXME: revisit this when we can custom lower all setcc
+    // optimizations.
+    if (C->isAllOnesValue() || C->isNullValue())
+      return SDValue();
+  }
+
+  // If we have an integer seteq/setne, turn it into a compare against zero
+  // by xor'ing the rhs with the lhs, which is faster than setting a
+  // condition register, reading it back out, and masking the correct bit.  The
+  // normal approach here uses sub to do this instead of xor.  Using xor exposes
+  // the result to other bit-twiddling opportunities.
+  EVT LHSVT = Op.getOperand(0).getValueType();
+  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    EVT VT = Op.getValueType();
+    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
+                                Op.getOperand(1));
+    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc dl(Node);
+
+  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
+
+  // gpr_index
+  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    VAListPtr, MachinePointerInfo(SV), MVT::i8,
+                                    false, false, 0);
+  InChain = GprIndex.getValue(1);
+
+  if (VT == MVT::i64) {
+    // Check if GprIndex is even
+    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
+                                 DAG.getConstant(1, MVT::i32));
+    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
+                                DAG.getConstant(0, MVT::i32), ISD::SETNE);
+    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
+                                          DAG.getConstant(1, MVT::i32));
+    // Align GprIndex to be even if it isn't
+    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
+                           GprIndex);
+  }
+
+  // fpr index is 1 byte after gpr
+  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                               DAG.getConstant(1, MVT::i32));
+
+  // fpr
+  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    FprPtr, MachinePointerInfo(SV), MVT::i8,
+                                    false, false, 0);
+  InChain = FprIndex.getValue(1);
+
+  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                       DAG.getConstant(8, MVT::i32));
+
+  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                        DAG.getConstant(4, MVT::i32));
+
+  // areas
+  SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
+                                     MachinePointerInfo(), false, false,
+                                     false, 0);
+  InChain = OverflowArea.getValue(1);
+
+  SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
+                                    MachinePointerInfo(), false, false,
+                                    false, 0);
+  InChain = RegSaveArea.getValue(1);
+
+  // select overflow_area if index > 8
+  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
+                            DAG.getConstant(8, MVT::i32), ISD::SETLT);
+
+  // adjustment constant gpr_index * 4/8
+  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
+                                    VT.isInteger() ? GprIndex : FprIndex,
+                                    DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                                    MVT::i32));
+
+  // OurReg = RegSaveArea + RegConstant
+  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
+                               RegConstant);
+
+  // Floating types are 32 bytes into RegSaveArea
+  if (VT.isFloatingPoint())
+    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
+                         DAG.getConstant(32, MVT::i32));
+
+  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
+  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   VT.isInteger() ? GprIndex : FprIndex,
+                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1,
+                                                   MVT::i32));
+
+  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
+                              VT.isInteger() ? VAListPtr : FprPtr,
+                              MachinePointerInfo(SV),
+                              MVT::i8, false, false, 0);
+
+  // determine if we should load from reg_save_area or overflow_area
+  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
+
+  // increase overflow_area by 4/8 if gpr/fpr > 8
+  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
+                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                          MVT::i32));
+
+  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
+                             OverflowAreaPlusN);
+
+  InChain = DAG.getTruncStore(InChain, dl, OverflowArea,
+                              OverflowAreaPtr,
+                              MachinePointerInfo(),
+                              MVT::i32, false, false, 0);
+
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG,
+                                       const PPCSubtarget &Subtarget) const {
+  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
+
+  // We have to copy the entire va_list struct:
+  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
+  return DAG.getMemcpy(Op.getOperand(0), Op,
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(12, MVT::i32), 8, false, true,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl(Op);
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = (PtrVT == MVT::i64);
+  Type *IntPtrTy =
+    DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
+                                                             *DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp; Args.push_back(Entry);
+
+  // TrampSize == (isPPC64 ? 48 : 40);
+  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40,
+                               isPPC64 ? MVT::i64 : MVT::i32);
+  Args.push_back(Entry);
+
+  Entry.Node = FPtr; Args.push_back(Entry);
+  Entry.Node = Nest; Args.push_back(Entry);
+
+  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+               std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.second;
+}
+
+SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                                        const PPCSubtarget &Subtarget) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  SDLoc dl(Op);
+
+  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                        MachinePointerInfo(SV),
+                        false, false, 0);
+  }
+
+  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
+  // We suppose the given va_list is already allocated.
+  //
+  // typedef struct {
+  //  char gpr;     /* index into the array of 8 GPRs
+  //                 * stored in the register save area
+  //                 * gpr=0 corresponds to r3,
+  //                 * gpr=1 to r4, etc.
+  //                 */
+  //  char fpr;     /* index into the array of 8 FPRs
+  //                 * stored in the register save area
+  //                 * fpr=0 corresponds to f1,
+  //                 * fpr=1 to f2, etc.
+  //                 */
+  //  char *overflow_arg_area;
+  //                /* location on stack that holds
+  //                 * the next overflow argument
+  //                 */
+  //  char *reg_save_area;
+  //               /* where r3:r10 and f1:f8 (if saved)
+  //                * are stored
+  //                */
+  // } va_list[1];
+
+
+  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), MVT::i32);
+  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), MVT::i32);
+
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
+                                            PtrVT);
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 PtrVT);
+
+  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
+  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, PtrVT);
+
+  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
+  SDValue ConstStackOffset = DAG.getConstant(StackOffset, PtrVT);
+
+  uint64_t FPROffset = 1;
+  SDValue ConstFPROffset = DAG.getConstant(FPROffset, PtrVT);
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Store first byte : number of int regs
+  SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR,
+                                         Op.getOperand(1),
+                                         MachinePointerInfo(SV),
+                                         MVT::i8, false, false, 0);
+  uint64_t nextOffset = FPROffset;
+  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
+                                  ConstFPROffset);
+
+  // Store second byte : number of float regs
+  SDValue secondStore =
+    DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
+                      MachinePointerInfo(SV, nextOffset), MVT::i8,
+                      false, false, 0);
+  nextOffset += StackOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
+
+  // Store second word : arguments given on stack
+  SDValue thirdStore =
+    DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
+                 MachinePointerInfo(SV, nextOffset),
+                 false, false, 0);
+  nextOffset += FrameOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
+
+  // Store third word : arguments given in registers
+  return DAG.getStore(thirdStore, dl, FR, nextPtr,
+                      MachinePointerInfo(SV, nextOffset),
+                      false, false, 0);
+
+}
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings 
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  return true;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                             MVT &LocVT,
+                                             CCValAssign::LocInfo &LocInfo,
+                                             ISD::ArgFlagsTy &ArgFlags,
+                                             CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // Skip one register if the first unallocated register has an even register
+  // number and there are still argument registers available which have not been
+  // allocated yet. RegNum is actually an index into ArgRegs, which means we
+  // need to skip a register if RegNum is odd.
+  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the first
+  // unallocated register has an odd register number and does not actually
+  // allocate a register for the current argument.
+  return false;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                               MVT &LocVT,
+                                               CCValAssign::LocInfo &LocInfo,
+                                               ISD::ArgFlagsTy &ArgFlags,
+                                               CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // If there is only one Floating-point register left we need to put both f64
+  // values of a split ppc_fp128 value on the stack.
+  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the two f64
+  // values a ppc_fp128 value is split into are both passed in registers or both
+  // passed on the stack and does not actually allocate a register for the
+  // current argument.
+  return false;
+}
+
+/// GetFPR - Get the set of FP registers that should be allocated for arguments,
+/// on Darwin.
+static const MCPhysReg *GetFPR() {
+  static const MCPhysReg FPR[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
+  };
+
+  return FPR;
+}
+
+/// CalculateStackSlotSize - Calculates the size reserved for this argument on
+/// the stack.
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                       unsigned PtrByteSize) {
+  unsigned ArgSize = ArgVT.getStoreSize();
+  if (Flags.isByVal())
+    ArgSize = Flags.getByValSize();
+
+  // Round up to multiples of the pointer size, except for array members,
+  // which are always packed.
+  if (!Flags.isInConsecutiveRegs())
+    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+  return ArgSize;
+}
+
+/// CalculateStackSlotAlignment - Calculates the alignment of this argument
+/// on the stack.
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+                                            ISD::ArgFlagsTy Flags,
+                                            unsigned PtrByteSize) {
+  unsigned Align = PtrByteSize;
+
+  // Altivec parameters are padded to a 16 byte boundary.
+  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+    Align = 16;
+
+  // ByVal parameters are aligned as requested.
+  if (Flags.isByVal()) {
+    unsigned BVAlign = Flags.getByValAlign();
+    if (BVAlign > PtrByteSize) {
+      if (BVAlign % PtrByteSize != 0)
+          llvm_unreachable(
+            "ByVal alignment is not a multiple of the pointer size");
+
+      Align = BVAlign;
+    }
+  }
+
+  // Array members are always packed to their original alignment.
+  if (Flags.isInConsecutiveRegs()) {
+    // If the array member was split into multiple registers, the first
+    // needs to be aligned to the size of the full type.  (Except for
+    // ppcf128, which is only aligned as its f64 components.)
+    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
+      Align = OrigVT.getStoreSize();
+    else
+      Align = ArgVT.getStoreSize();
+  }
+
+  return Align;
+}
+
+/// CalculateStackSlotUsed - Return whether this argument will use its
+/// stack slot (instead of being passed in registers).  ArgOffset,
+/// AvailableFPRs, and AvailableVRs must hold the current argument
+/// position, and will be updated to account for this argument.
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+                                   ISD::ArgFlagsTy Flags,
+                                   unsigned PtrByteSize,
+                                   unsigned LinkageSize,
+                                   unsigned ParamAreaSize,
+                                   unsigned &ArgOffset,
+                                   unsigned &AvailableFPRs,
+                                   unsigned &AvailableVRs) {
+  bool UseMemory = false;
+
+  // Respect alignment of argument on the stack.
+  unsigned Align =
+    CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+  // If there's no space left in the argument save area, we must
+  // use memory (this check also catches zero-sized arguments).
+  if (ArgOffset >= LinkageSize + ParamAreaSize)
+    UseMemory = true;
+
+  // Allocate argument on the stack.
+  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  if (Flags.isInConsecutiveRegsLast())
+    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+  // If we overran the argument save area, we must use memory
+  // (this check catches arguments passed partially in memory)
+  if (ArgOffset > LinkageSize + ParamAreaSize)
+    UseMemory = true;
+
+  // However, if the argument is actually passed in an FPR or a VR,
+  // we don't use memory after all.
+  if (!Flags.isByVal()) {
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
+      if (AvailableFPRs > 0) {
+        --AvailableFPRs;
+        return false;
+      }
+    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+      if (AvailableVRs > 0) {
+        --AvailableVRs;
+        return false;
+      }
+  }
+
+  return UseMemory;
+}
+
+/// EnsureStackAlignment - Round stack frame size up from NumBytes to
+/// ensure minimum alignment required for target.
+static unsigned EnsureStackAlignment(const TargetMachine &Target,
+                                     unsigned NumBytes) {
+  unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1;
+  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
+  return NumBytes;
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        SDLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  if (Subtarget.isSVR4ABI()) {
+    if (Subtarget.isPPC64())
+      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+    else
+      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+  } else {
+    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+                                       dl, DAG, InVals);
+  }
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_32SVR4(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      SDLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  // 32-bit SVR4 ABI Stack Frame Layout:
+  //              +-----------------------------------+
+  //        +-->  |            Back chain             |
+  //        |     +-----------------------------------+
+  //        |     | Floating-point register save area |
+  //        |     +-----------------------------------+
+  //        |     |    General register save area     |
+  //        |     +-----------------------------------+
+  //        |     |          CR save word             |
+  //        |     +-----------------------------------+
+  //        |     |         VRSAVE save word          |
+  //        |     +-----------------------------------+
+  //        |     |         Alignment padding         |
+  //        |     +-----------------------------------+
+  //        |     |     Vector register save area     |
+  //        |     +-----------------------------------+
+  //        |     |       Local variable space        |
+  //        |     +-----------------------------------+
+  //        |     |        Parameter list area        |
+  //        |     +-----------------------------------+
+  //        |     |           LR save word            |
+  //        |     +-----------------------------------+
+  // SP-->  +---  |            Back chain             |
+  //              +-----------------------------------+
+  //
+  // Specifications:
+  //   System V Application Binary Interface PowerPC Processor Supplement
+  //   AltiVec Technology Programming Interface Manual
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 4;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false);
+  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      const TargetRegisterClass *RC;
+      EVT ValVT = VA.getValVT();
+
+      switch (ValVT.getSimpleVT().SimpleTy) {
+        default:
+          llvm_unreachable("ValVT not supported by formal arguments Lowering");
+        case MVT::i1:
+        case MVT::i32:
+          RC = &PPC::GPRCRegClass;
+          break;
+        case MVT::f32:
+          RC = &PPC::F4RCRegClass;
+          break;
+        case MVT::f64:
+          if (Subtarget.hasVSX())
+            RC = &PPC::VSFRCRegClass;
+          else
+            RC = &PPC::F8RCRegClass;
+          break;
+        case MVT::v16i8:
+        case MVT::v8i16:
+        case MVT::v4i32:
+        case MVT::v4f32:
+          RC = &PPC::VRRCRegClass;
+          break;
+        case MVT::v2f64:
+        case MVT::v2i64:
+          RC = &PPC::VSHRCRegClass;
+          break;
+      }
+
+      // Transform the arguments stored in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
+                                            ValVT == MVT::i1 ? MVT::i32 : ValVT);
+
+      if (ValVT == MVT::i1)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+
+      InVals.push_back(ArgValue);
+    } else {
+      // Argument stored in memory.
+      assert(VA.isMemLoc());
+
+      unsigned ArgSize = VA.getLocVT().getStoreSize();
+      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
+                                      isImmutable);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0));
+    }
+  }
+
+  // Assign locations to all of the incoming aggregate by value arguments.
+  // Aggregates passed by value are stored in the local variable space of the
+  // caller's stack frame, right above the parameter list area.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
+
+  // Area that is at least reserved in the caller of this function.
+  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
+  MinReservedArea = std::max(MinReservedArea, LinkageSize);
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized function's reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  SmallVector<SDValue, 8> MemOps;
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    static const MCPhysReg GPArgRegs[] = {
+      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+    };
+    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
+
+    static const MCPhysReg FPArgRegs[] = {
+      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+      PPC::F8
+    };
+    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+
+    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
+                                                          NumGPArgRegs));
+    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs,
+                                                          NumFPArgRegs));
+
+    // Make room for NumGPArgRegs and NumFPArgRegs.
+    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
+                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
+
+    FuncInfo->setVarArgsStackOffset(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             CCInfo.getNextStackOffset(), true));
+
+    FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // The fixed integer arguments of a variadic function are stored to the
+    // VarArgsFrameIndex on the stack so that they may be loaded by deferencing
+    // the result of va_next.
+    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+
+    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
+    // is set.
+    // The double arguments are stored to the VarArgsFrameIndex
+    // on the stack.
+    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by eight for the next argument to store
+      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
+                                         PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
+SDValue
+PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+                                     SelectionDAG &DAG, SDValue ArgVal,
+                                     SDLoc dl) const {
+  if (Flags.isSExt())
+    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  else if (Flags.isZExt())
+    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+
+  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_64SVR4(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      SDLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 8;
+
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+                                                          isELFv2ABI);
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+
+  static const MCPhysReg *FPR = GetFPR();
+
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  static const MCPhysReg VSRH[] = {
+    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR);
+  const unsigned Num_FPR_Regs = 13;
+  const unsigned Num_VR_Regs  = array_lengthof(VR);
+
+  // Do a first pass over the arguments to determine whether the ABI
+  // guarantees that our caller has allocated the parameter save area
+  // on its stack frame.  In the ELFv1 ABI, this is always the case;
+  // in the ELFv2 ABI, it is true if this is a vararg function or if
+  // any parameter is located in a stack slot.
+
+  bool HasParameterArea = !isELFv2ABI || isVarArg;
+  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
+  unsigned NumBytes = LinkageSize;
+  unsigned AvailableFPRs = Num_FPR_Regs;
+  unsigned AvailableVRs = Num_VR_Regs;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i)
+    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
+                               PtrByteSize, LinkageSize, ParamAreaSize,
+                               NumBytes, AvailableFPRs, AvailableVRs))
+      HasParameterArea = true;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
+  SmallVector<SDValue, 8> MemOps;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    EVT OrigVT = Ins[ArgNo].ArgVT;
+    unsigned ObjSize = ObjectVT.getStoreSize();
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[ArgNo].OrigArgIndex;
+
+    /* Respect alignment of argument on the stack.  */
+    unsigned Align =
+      CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+    unsigned CurArgOffset = ArgOffset;
+
+    /* Compute GPR index associated with argument offset.  */
+    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+    GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Empty aggregate parameters do not take up registers.  Examples:
+      //   struct { } a;
+      //   union  { } b;
+      //   int c[0];
+      // etc.  However, we have to provide a place-holder in InVals, so
+      // pretend we have an 8-byte item at the current address for that
+      // purpose.
+      if (!ObjSize) {
+        int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+        InVals.push_back(FIN);
+        continue;
+      }
+
+      // Create a stack object covering all stack doublewords occupied
+      // by the argument.  If the argument is (fully or partially) on
+      // the stack, or if the argument is fully in registers but the
+      // caller has allocated the parameter save anyway, we can refer
+      // directly to the caller's stack frame.  Otherwise, create a
+      // local copy in our own frame.
+      int FI;
+      if (HasParameterArea ||
+          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
+        FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true);
+      else
+        FI = MFI->CreateStackObject(ArgSize, Align, false);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+
+      // Handle aggregates smaller than 8 bytes.
+      if (ObjSize < PtrByteSize) {
+        // The value of the object is its address, which differs from the
+        // address of the enclosing doubleword on big-endian systems.
+        SDValue Arg = FIN;
+        if (!isLittleEndian) {
+          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT);
+          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
+        }
+        InVals.push_back(Arg);
+
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store;
+
+          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
+            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
+                                      MachinePointerInfo(FuncArg),
+                                      ObjType, false, false, 0);
+          } else {
+            // For sizes that don't fit a truncating store (3, 5, 6, 7),
+            // store the whole register as-is to the parameter save area
+            // slot.
+            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(FuncArg),
+                                 false, false, 0);
+          }
+
+          MemOps.push_back(Store);
+        }
+        // Whether we copied from a register or not, advance the offset
+        // into the parameter save area by a full doubleword.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // The value of the object is its address, which is the address of
+      // its first stack doubleword.
+      InVals.push_back(FIN);
+
+      // Store whatever pieces of the object are in registers to memory.
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        if (GPR_idx == Num_GPR_Regs)
+          break;
+
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+        SDValue Addr = FIN;
+        if (j) {
+          SDValue Off = DAG.getConstant(j, PtrVT);
+          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
+        }
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
+                                     MachinePointerInfo(FuncArg, j),
+                                     false, false, 0);
+        MemOps.push_back(Store);
+        ++GPR_idx;
+      }
+      ArgOffset += ArgSize;
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i32:
+    case MVT::i64:
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly.  Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // These can be scalar arguments or elements of a float array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // float aggregates.
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ?
+                                            &PPC::VSFRCRegClass :
+                                            &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else if (GPR_idx != Num_GPR_Regs) {
+        // This can only ever happen in the presence of f32 array types,
+        // since otherwise we never run out of FPRs before running out
+        // of GPRs.
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::f32) {
+          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, MVT::i32));
+          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+        }
+
+        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
+      } else {
+        needsLoad = true;
+      }
+
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+      ArgOffset += ArgSize;
+      if (Flags.isInConsecutiveRegsLast())
+        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+    case MVT::v2f64:
+    case MVT::v2i64:
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // vector aggregates.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
+                        MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
+                        MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++VR_idx;
+      } else {
+        needsLoad = true;
+      }
+      ArgOffset += 16;
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined
+    // above that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      if (ObjSize < ArgSize && !isLittleEndian)
+        CurArgOffset += ArgSize - ObjSize;
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Area that is at least reserved in the caller of this function.
+  unsigned MinReservedArea;
+  if (HasParameterArea)
+    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+  else
+    MinReservedArea = LinkageSize;
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrByteSize, Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing the
+    // result of va_next.
+    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_Darwin(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      SDLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+                                                          false);
+  unsigned ArgOffset = LinkageSize;
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+
+  static const MCPhysReg *FPR = GetFPR();
+
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+  const unsigned Num_FPR_Regs = 13;
+  const unsigned Num_VR_Regs  = array_lengthof( VR);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  // In 32-bit non-varargs functions, the stack space for vectors is after the
+  // stack space for non-vectors.  We do not use this space unless we have
+  // too many vectors to fit in registers, something that only occurs in
+  // constructed examples:), but we have to walk the arglist to figure
+  // that out...for the pathological case, compute VecArgOffset as the
+  // start of the vector parameter area.  Computing VecArgOffset is the
+  // entire point of the following loop.
+  unsigned VecArgOffset = ArgOffset;
+  if (!isVarArg && !isPPC64) {
+    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
+         ++ArgNo) {
+      EVT ObjectVT = Ins[ArgNo].VT;
+      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+      if (Flags.isByVal()) {
+        // ObjSize is the true size, ArgSize rounded up to multiple of regs.
+        unsigned ObjSize = Flags.getByValSize();
+        unsigned ArgSize =
+                ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+        VecArgOffset += ArgSize;
+        continue;
+      }
+
+      switch(ObjectVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unhandled argument type!");
+      case MVT::i1:
+      case MVT::i32:
+      case MVT::f32:
+        VecArgOffset += 4;
+        break;
+      case MVT::i64:  // PPC64
+      case MVT::f64:
+        // FIXME: We are guaranteed to be !isPPC64 at this point.
+        // Does MVT::i64 apply?
+        VecArgOffset += 8;
+        break;
+      case MVT::v4f32:
+      case MVT::v4i32:
+      case MVT::v8i16:
+      case MVT::v16i8:
+        // Nothing to do, we're only looking at Nonvector args here.
+        break;
+      }
+    }
+  }
+  // We've found where the vector parameter area in memory is.  Skip the
+  // first 12 parameters; these don't use that memory.
+  VecArgOffset = ((VecArgOffset+15)/16)*16;
+  VecArgOffset += 12*16;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  SmallVector<SDValue, 8> MemOps;
+  unsigned nAltivecParamsAtEnd = 0;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[ArgNo].OrigArgIndex;
+
+    unsigned CurArgOffset = ArgOffset;
+
+    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+      if (isVarArg || isPPC64) {
+        MinReservedArea = ((MinReservedArea+15)/16)*16;
+        MinReservedArea += CalculateStackSlotSize(ObjectVT,
+                                                  Flags,
+                                                  PtrByteSize);
+      } else  nAltivecParamsAtEnd++;
+    } else
+      // Calculate min reserved area.
+      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+                                                Flags,
+                                                PtrByteSize);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Objects of size 1 and 2 are right justified, everything else is
+      // left justified.  This means the memory address is adjusted forwards.
+      if (ObjSize==1 || ObjSize==2) {
+        CurArgOffset = CurArgOffset + (4 - ObjSize);
+      }
+      // The value of the object is its address.
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+      if (ObjSize==1 || ObjSize==2) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
+          SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                            MachinePointerInfo(FuncArg),
+                                            ObjType, false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+        }
+
+        ArgOffset += PtrByteSize;
+
+        continue;
+      }
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        // Store whatever pieces of the object are in registers
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                       MachinePointerInfo(FuncArg, j),
+                                       false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i32:
+      if (!isPPC64) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+
+          if (ObjectVT == MVT::i1)
+            ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
+
+          ++GPR_idx;
+        } else {
+          needsLoad = true;
+          ArgSize = PtrByteSize;
+        }
+        // All int arguments reserve stack space in the Darwin ABI.
+        ArgOffset += PtrByteSize;
+        break;
+      }
+      // FALLTHROUGH
+    case MVT::i64:  // PPC64
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      // All int arguments reserve stack space in the Darwin ABI.
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // Every 4 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+          ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+      }
+
+      // All FP arguments reserve stack space in the Darwin ABI.
+      ArgOffset += isPPC64 ? 8 : ObjSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        if (!isVarArg && !isPPC64) {
+          // Vectors go after all the nonvectors.
+          CurArgOffset = VecArgOffset;
+          VecArgOffset += 16;
+        } else {
+          // Vectors are aligned.
+          ArgOffset = ((ArgOffset+15)/16)*16;
+          CurArgOffset = ArgOffset;
+          ArgOffset += 16;
+        }
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined above
+    // that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      CurArgOffset + (ArgSize - ObjSize),
+                                      isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    MinReservedArea = ((MinReservedArea+15)/16)*16;
+    MinReservedArea += 16*nAltivecParamsAtEnd;
+  }
+
+  // Area that is at least reserved in the caller of this function.
+  MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing the
+    // result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg;
+
+      if (isPPC64)
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      else
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
+/// adjusted to accommodate the arguments for the tailcall.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+
+  if (!isTailCall) return 0;
+
+  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  unsigned CallerMinReservedArea = FI->getMinReservedArea();
+  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+  // Remember only if the new adjustement is bigger.
+  if (SPDiff < FI->getTailCallSPDelta())
+    FI->setTailCallSPDelta(SPDiff);
+
+  return SPDiff;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+    // Functions containing by val parameters are not supported.
+    for (unsigned i = 0; i != Ins.size(); i++) {
+       ISD::ArgFlagsTy Flags = Ins[i].Flags;
+       if (Flags.isByVal()) return false;
+    }
+
+    // Non-PIC/GOT tail calls are supported.
+    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+      return true;
+
+    // At the moment we can only do local tail calls (in same module, hidden
+    // or protected) if we are generating PIC.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+      return G->getGlobal()->hasHiddenVisibility()
+          || G->getGlobal()->hasProtectedVisibility();
+  }
+
+  return false;
+}
+
+/// isCallCompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return nullptr;
+
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      SignExtend32<26>(Addr) != Addr)
+    return nullptr;  // Top 6 bits have to be sext of immediate.
+
+  return DAG.getConstant((int)C->getZExtValue() >> 2,
+                         DAG.getTargetLoweringInfo().getPointerTy()).getNode();
+}
+
+namespace {
+
+struct TailCallArgumentInfo {
+  SDValue Arg;
+  SDValue FrameIdxOp;
+  int       FrameIdx;
+
+  TailCallArgumentInfo() : FrameIdx(0) {}
+};
+
+}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void
+StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
+                                           SDValue Chain,
+                   const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+                   SmallVectorImpl<SDValue> &MemOpChains,
+                   SDLoc dl) {
+  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+    SDValue Arg = TailCallArgs[i].Arg;
+    SDValue FIN = TailCallArgs[i].FrameIdxOp;
+    int FI = TailCallArgs[i].FrameIdx;
+    // Store relative to framepointer.
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
+                                       MachinePointerInfo::getFixedStack(FI),
+                                       false, false, 0));
+  }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
+/// the appropriate stack slot for the tail call optimized function call.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
+                                               MachineFunction &MF,
+                                               SDValue Chain,
+                                               SDValue OldRetAddr,
+                                               SDValue OldFP,
+                                               int SPDiff,
+                                               bool isPPC64,
+                                               bool isDarwinABI,
+                                               SDLoc dl) {
+  if (SPDiff) {
+    // Calculate the new stack slot for the return address.
+    int SlotSize = isPPC64 ? 8 : 4;
+    int NewRetAddrLoc = SPDiff + PPCFrameLowering::getReturnSaveOffset(isPPC64,
+                                                                   isDarwinABI);
+    int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                          NewRetAddrLoc, true);
+    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
+    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+                         MachinePointerInfo::getFixedStack(NewRetAddr),
+                         false, false, 0);
+
+    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
+    // slot as the FP is never overwritten.
+    if (isDarwinABI) {
+      int NewFPLoc =
+        SPDiff + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
+                                                          true);
+      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
+      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
+                           MachinePointerInfo::getFixedStack(NewFPIdx),
+                           false, false, 0);
+    }
+  }
+  return Chain;
+}
+
+/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
+/// the position of the argument.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+                         SDValue Arg, int SPDiff, unsigned ArgOffset,
+                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
+  int Offset = ArgOffset + SPDiff;
+  uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
+  int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+  SDValue FIN = DAG.getFrameIndex(FI, VT);
+  TailCallArgumentInfo Info;
+  Info.Arg = Arg;
+  Info.FrameIdxOp = FIN;
+  Info.FrameIdx = FI;
+  TailCallArguments.push_back(Info);
+}
+
+/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
+/// stack slot. Returns the chain as result and the loaded frame pointers in
+/// LROpOut/FPOpout. Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+                                                        int SPDiff,
+                                                        SDValue Chain,
+                                                        SDValue &LROpOut,
+                                                        SDValue &FPOpOut,
+                                                        bool isDarwinABI,
+                                                        SDLoc dl) const {
+  if (SPDiff) {
+    // Load the LR and FP stack slot for later adjusting.
+    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
+    LROpOut = getReturnAddrFrameIndex(DAG);
+    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(),
+                          false, false, false, 0);
+    Chain = SDValue(LROpOut.getNode(), 1);
+
+    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
+    // slot as the FP is never overwritten.
+    if (isDarwinABI) {
+      FPOpOut = getFramePointerFrameIndex(DAG);
+      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(),
+                            false, false, false, 0);
+      Chain = SDValue(FPOpOut.getNode(), 1);
+    }
+  }
+  return Chain;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size".  Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.
+/// Sometimes what we are copying is the end of a larger object, the part that
+/// does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          SDLoc dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       false, false, MachinePointerInfo(),
+                       MachinePointerInfo());
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
+/// tail calls.
+static void
+LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
+                 SDValue Arg, SDValue PtrOff, int SPDiff,
+                 unsigned ArgOffset, bool isPPC64, bool isTailCall,
+                 bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
+                 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments,
+                 SDLoc dl) {
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  if (!isTailCall) {
+    if (isVector) {
+      SDValue StackPtr;
+      if (isPPC64)
+        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+      else
+        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                           DAG.getConstant(ArgOffset, PtrVT));
+    }
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                       MachinePointerInfo(), false, false, 0));
+  // Calculate and remember argument location.
+  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
+                                  TailCallArguments);
+}
+
+static
+void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+                     SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
+                     SDValue LROp, SDValue FPOp, bool isDarwinABI,
+                     SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
+  // might overwrite each other in case of tail call optimization.
+  SmallVector<SDValue, 8> MemOpChains2;
+  // Do not flag preceding copytoreg stuff together with the following stuff.
+  InFlag = SDValue();
+  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
+                                    MemOpChains2, dl);
+  if (!MemOpChains2.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+  // Store the return address to the appropriate stack slot.
+  Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
+                                        isPPC64, isDarwinABI, dl);
+
+  // Emit callseq_end just before tailcall node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+}
+
+static
+unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
+                     SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
+                     SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
+                     SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
+                     const PPCSubtarget &Subtarget) {
+
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Glue);    // Returns a flag for retval copy to use.
+
+  unsigned CallOpc = PPCISD::CALL;
+
+  bool needIndirectCall = true;
+  if (!isSVR4ABI || !isPPC64)
+    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
+      // If this is an absolute destination address, use the munged value.
+      Callee = SDValue(Dest, 0);
+      needIndirectCall = false;
+    }
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
+    // Use indirect calls for ALL functions calls in JIT mode, since the
+    // far-call stubs may be outside relocation limits for a BL instruction.
+    if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
+      unsigned OpFlags = 0;
+      if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+          (Subtarget.getTargetTriple().isMacOSX() &&
+           Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+          (G->getGlobal()->isDeclaration() ||
+           G->getGlobal()->isWeakForLinker())) ||
+          (Subtarget.isTargetELF() && !isPPC64 &&
+           !G->getGlobal()->hasLocalLinkage() &&
+           DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+        // PC-relative references to external symbols should go through $stub,
+        // unless we're building with the leopard linker or later, which
+        // automatically synthesizes these stubs.
+        OpFlags = PPCII::MO_PLT_OR_STUB;
+      }
+
+      // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+      // every direct call is) turn it into a TargetGlobalAddress /
+      // TargetExternalSymbol node so that legalize doesn't hack it.
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                          Callee.getValueType(),
+                                          0, OpFlags);
+      needIndirectCall = false;
+    }
+  }
+
+  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    unsigned char OpFlags = 0;
+
+    if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+         (Subtarget.getTargetTriple().isMacOSX() &&
+          Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) ||
+        (Subtarget.isTargetELF() && !isPPC64 &&
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)	) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = PPCII::MO_PLT_OR_STUB;
+    }
+
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
+                                         OpFlags);
+    needIndirectCall = false;
+  }
+
+  if (needIndirectCall) {
+    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
+    // to do the call, we can't use PPCISD::CALL.
+    SDValue MTCTROps[] = {Chain, Callee, InFlag};
+
+    if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
+      // Function pointers in the 64-bit SVR4 ABI do not point to the function
+      // entry point, but to the function descriptor (the function entry point
+      // address is part of the function descriptor though).
+      // The function descriptor is a three doubleword structure with the
+      // following fields: function entry point, TOC base address and
+      // environment pointer.
+      // Thus for a call through a function pointer, the following actions need
+      // to be performed:
+      //   1. Save the TOC of the caller in the TOC save area of its stack
+      //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
+      //   2. Load the address of the function entry point from the function
+      //      descriptor.
+      //   3. Load the TOC of the callee from the function descriptor into r2.
+      //   4. Load the environment pointer from the function descriptor into
+      //      r11.
+      //   5. Branch to the function entry point address.
+      //   6. On return of the callee, the TOC of the caller needs to be
+      //      restored (this is done in FinishCall()).
+      //
+      // All those operations are flagged together to ensure that no other
+      // operations can be scheduled in between. E.g. without flagging the
+      // operations together, a TOC access in the caller could be scheduled
+      // between the load of the callee TOC and the branch to the callee, which
+      // results in the TOC access going through the TOC of the callee instead
+      // of going through the TOC of the caller, which leads to incorrect code.
+
+      // Load the address of the function entry point from the function
+      // descriptor.
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue);
+      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs,
+                              makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
+      Chain = LoadFuncPtr.getValue(1);
+      InFlag = LoadFuncPtr.getValue(2);
+
+      // Load environment pointer into r11.
+      // Offset of the environment pointer within the function descriptor.
+      SDValue PtrOff = DAG.getIntPtrConstant(16);
+
+      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
+      SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr,
+                                       InFlag);
+      Chain = LoadEnvPtr.getValue(1);
+      InFlag = LoadEnvPtr.getValue(2);
+
+      SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
+                                        InFlag);
+      Chain = EnvVal.getValue(0);
+      InFlag = EnvVal.getValue(1);
+
+      // Load TOC of the callee into r2. We are using a target-specific load
+      // with r2 hard coded, because the result of a target-independent load
+      // would never go directly into r2, since r2 is a reserved register (which
+      // prevents the register allocator from allocating it), resulting in an
+      // additional register being allocated and an unnecessary move instruction
+      // being generated.
+      VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue TOCOff = DAG.getIntPtrConstant(8);
+      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
+      SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain,
+                                       AddTOC, InFlag);
+      Chain = LoadTOCPtr.getValue(0);
+      InFlag = LoadTOCPtr.getValue(1);
+
+      MTCTROps[0] = Chain;
+      MTCTROps[1] = LoadFuncPtr;
+      MTCTROps[2] = InFlag;
+    }
+
+    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
+                        makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
+    InFlag = Chain.getValue(1);
+
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Other);
+    NodeTys.push_back(MVT::Glue);
+    Ops.push_back(Chain);
+    CallOpc = PPCISD::BCTRL;
+    Callee.setNode(nullptr);
+    // Add use of X11 (holding environment pointer)
+    if (isSVR4ABI && isPPC64 && !isELFv2ABI)
+      Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
+    // Add CTR register as callee so a bctr can be emitted later.
+    if (isTailCall)
+      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
+  }
+
+  // If this is a direct call, pass the chain and the callee.
+  if (Callee.getNode()) {
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+  }
+  // If this is a tail call add stack pointer delta.
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(SPDiff, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Direct calls in the ELFv2 ABI need the TOC register live into the call.
+  if (Callee.getNode() && isELFv2ABI)
+    Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+
+  return CallOpc;
+}
+
+static
+bool isLocalCall(const SDValue &Callee)
+{
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    return !G->getGlobal()->isDeclaration() &&
+           !G->getGlobal()->isWeakForLinker();
+  return false;
+}
+
+SDValue
+PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   SDLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl,
+                                     VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+SDValue
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
+                              bool isTailCall, bool isVarArg,
+                              SelectionDAG &DAG,
+                              SmallVector<std::pair<unsigned, SDValue>, 8>
+                                &RegsToPass,
+                              SDValue InFlag, SDValue Chain,
+                              SDValue &Callee,
+                              int SPDiff, unsigned NumBytes,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SmallVectorImpl<SDValue> &InVals) const {
+
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  std::vector<EVT> NodeTys;
+  SmallVector<SDValue, 8> Ops;
+  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
+                                 isTailCall, RegsToPass, Ops, NodeTys,
+                                 Subtarget);
+
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
+  // When performing tail call optimization the callee pops its arguments off
+  // the stack. Account for this here so these bytes can be pushed back on in
+  // PPCFrameLowering::eliminateCallFramePseudoInstr.
+  int BytesCalleePops =
+    (CallConv == CallingConv::Fast &&
+     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  // Emit tail call.
+  if (isTailCall) {
+    assert(((Callee.getOpcode() == ISD::Register &&
+             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+            Callee.getOpcode() == ISD::TargetExternalSymbol ||
+            Callee.getOpcode() == ISD::TargetGlobalAddress ||
+            isa<ConstantSDNode>(Callee)) &&
+    "Expecting an global address, external symbol, absolute value or register");
+
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
+  }
+
+  // Add a NOP immediately after the branch instruction when using the 64-bit
+  // SVR4 ABI. At link time, if caller and callee are in a different module and
+  // thus have a different TOC, the call will be replaced with a call to a stub
+  // function which saves the current TOC, loads the TOC of the callee and
+  // branches to the callee. The NOP will be replaced with a load instruction
+  // which restores the TOC of the caller from the TOC save slot of the current
+  // stack frame. If caller and callee belong to the same module (and have the
+  // same TOC), the NOP will remain unchanged.
+
+  bool needsTOCRestore = false;
+  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) {
+    if (CallOpc == PPCISD::BCTRL) {
+      // This is a call through a function pointer.
+      // Restore the caller TOC from the save area into R2.
+      // See PrepareCall() for more information about calls through function
+      // pointers in the 64-bit SVR4 ABI.
+      // We are using a target-specific load with r2 hard coded, because the
+      // result of a target-independent load would never go directly into r2,
+      // since r2 is a reserved register (which prevents the register allocator
+      // from allocating it), resulting in an additional register being
+      // allocated and an unnecessary move instruction being generated.
+      needsTOCRestore = true;
+    } else if ((CallOpc == PPCISD::CALL) &&
+               (!isLocalCall(Callee) ||
+                DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+      // Otherwise insert NOP for non-local calls.
+      CallOpc = PPCISD::CALL_NOP;
+    }
+  }
+
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  if (needsTOCRestore) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
+    SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
+    SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+    Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(BytesCalleePops, true),
+                             InFlag, dl);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  if (isTailCall)
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                   Ins, DAG);
+
+  if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
+  if (Subtarget.isSVR4ABI()) {
+    if (Subtarget.isPPC64())
+      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+    else
+      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+  }
+
+  return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
+                          isTailCall, Outs, OutVals, Ins,
+                          dl, DAG, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
+  // of the 32-bit SVR4 ABI stack frame layout.
+
+  assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+  unsigned PtrByteSize = 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, parameter list area and the part of the local variable space which
+  // contains copies of aggregates which are passed by value.
+
+  // Assign locations to all of the outgoing arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false),
+                       PtrByteSize);
+
+  if (isVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Fixed vector arguments go into registers as long as registers are
+    // available. Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      bool Result;
+
+      if (Outs[i].IsFixed) {
+        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+                               CCInfo);
+      } else {
+        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
+                                      ArgFlags, CCInfo);
+      }
+
+      if (Result) {
+#ifndef NDEBUG
+        errs() << "Call operand #" << i << " has unhandled type "
+             << EVT(ArgVT).getEVTString() << "\n";
+#endif
+        llvm_unreachable(nullptr);
+      }
+    }
+  } else {
+    // All arguments are treated the same.
+    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
+  }
+
+  // Assign locations to all of the outgoing aggregate by value arguments.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
+
+  // Size of the linkage area, parameter list area and the part of the local
+  // space variable where copies of aggregates which are passed by value are
+  // stored.
+  unsigned NumBytes = CCByValInfo.getNextStackOffset();
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be moved somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  bool seenFloatArg = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, j = 0, e = ArgLocs.size();
+       i != e;
+       ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the callers stack frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1),
+                           SDLoc(MemcpyCall));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      if (Arg.getValueType() == MVT::i1)
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);
+
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
+
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+                    false, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                              SDValue CallSeqStart,
+                                              ISD::ArgFlagsTy Flags,
+                                              SelectionDAG &DAG,
+                                              SDLoc dl) const {
+  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                        CallSeqStart.getNode()->getOperand(0),
+                        Flags, DAG, dl);
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                             CallSeqStart.getNode()->getOperand(1),
+                             SDLoc(MemcpyCall));
+  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                         NewCallSeqStart.getNode());
+  return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  unsigned PtrByteSize = 8;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  On ELFv1, the linkage area is 48 bytes
+  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+                                                          isELFv2ABI);
+  unsigned NumBytes = LinkageSize;
+
+  // Add up all the space actually used.
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
+
+    /* Respect alignment of argument on the stack.  */
+    unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+    NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+    if (Flags.isInConsecutiveRegsLast())
+      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+  }
+
+  unsigned NumBytesActuallyUsed = NumBytes;
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+  // Tail call needs the stack to be aligned.
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg *FPR = GetFPR();
+
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  static const MCPhysReg VSRH[] = {
+    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+  };
+
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
+
+    /* Respect alignment of argument on the stack.  */
+    unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+    /* Compute GPR index associated with argument offset.  */
+    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+    GPR_idx = std::min(GPR_idx, NumGPRs);
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // Promote integers to 64-bit values.
+    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      // Note: Size includes alignment padding, so
+      //   struct x { short a; char b; }
+      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
+      // These are the proper values we need for right-justifying the
+      // aggregate in a parameter register.
+      unsigned Size = Flags.getByValSize();
+
+      // An empty aggregate parameter takes up no storage and no
+      // registers.
+      if (Size == 0)
+        continue;
+
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (Size==1 || Size==2 || Size==4) {
+        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT,
+                                        false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
+
+          ArgOffset += PtrByteSize;
+          continue;
+        }
+      }
+
+      if (GPR_idx == NumGPRs && Size < 8) {
+        SDValue AddPtr = PtrOff;
+        if (!isLittleEndian) {
+          SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                          PtrOff.getValueType());
+          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        }
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+
+      // FIXME: The above statement is likely due to a misunderstanding of the
+      // documents.  All arguments must be copied into the parameter area BY
+      // THE CALLEE in the event that the callee takes the address of any
+      // formal argument.  That has not yet been implemented.  However, it is
+      // reasonable to use the stack area as a staging area for the register
+      // load.
+
+      // Skip this for small aggregates, as we will use the same slot for a
+      // right-justified copy, below.
+      if (Size >= 8)
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+      // When a register is available, pass a small aggregate right-justified.
+      if (Size < 8 && GPR_idx != NumGPRs) {
+        // The easiest way to get this right-justified in a register
+        // is to copy the structure into the rightmost portion of a
+        // local variable slot, then load the whole slot into the
+        // register.
+        // FIXME: The memcpy seems to produce pretty awful code for
+        // small aggregates, particularly for packed ones.
+        // FIXME: It would be preferable to use the slot in the
+        // parameter save area instead of a new local variable.
+        SDValue AddPtr = PtrOff;
+        if (!isLittleEndian) {
+          SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        }
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+        // Load the slot into the register.
+        SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+        MemOpChains.push_back(Load.getValue(1));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
+
+        // Done with this argument.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // For aggregates larger than PtrByteSize, copy the pieces of the
+      // object that fit into registers from the parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getSimpleValueType().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i1:
+    case MVT::i32:
+    case MVT::i64:
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly.  Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64: {
+      // These can be scalar arguments or elements of a float array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // float aggregates.
+
+      // Named arguments go into FPRs first, and once they overflow, the
+      // remaining arguments go into GPRs and then the parameter save area.
+      // Unnamed arguments for vararg functions always go to GPRs and
+      // then the parameter save area.  For now, put all arguments to vararg
+      // routines always in both locations (FPR *and* GPR or stack slot).
+      bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+      // First load the argument into the next available FPR.
+      if (FPR_idx != NumFPRs)
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+      // Next, load the argument into GPR or stack slot if needed.
+      if (!NeedGPROrStack)
+        ;
+      else if (GPR_idx != NumGPRs) {
+        // In the non-vararg case, this can only ever happen in the
+        // presence of f32 array types, since otherwise we never run
+        // out of FPRs before running out of GPRs.
+        SDValue ArgVal;
+
+        // Double values are always passed in a single GPR.
+        if (Arg.getValueType() != MVT::f32) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+        // Non-array float values are extended and passed in a GPR.
+        } else if (!Flags.isInConsecutiveRegs()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+        // If we have an array of floats, we collect every odd element
+        // together with its predecessor into one GPR.
+        } else if (ArgOffset % PtrByteSize != 0) {
+          SDValue Lo, Hi;
+          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          if (!isLittleEndian)
+            std::swap(Lo, Hi);
+          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+        // The final element, if even, goes into the first half of a GPR.
+        } else if (Flags.isInConsecutiveRegsLast()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+          if (!isLittleEndian)
+            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, MVT::i32));
+
+        // Non-final even elements are skipped; they will be handled
+        // together the with subsequent argument on the next go-around.
+        } else
+          ArgVal = SDValue();
+
+        if (ArgVal.getNode())
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
+      } else {
+        // Single-precision floating-point values are mapped to the
+        // second (rightmost) word of the stack doubleword.
+        if (Arg.getValueType() == MVT::f32 &&
+            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
+          SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+        }
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      ArgOffset += (Arg.getValueType() == MVT::f32 &&
+                    Flags.isInConsecutiveRegs()) ? 4 : 8;
+      if (Flags.isInConsecutiveRegsLast())
+        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      break;
+    }
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+    case MVT::v2f64:
+    case MVT::v2i64:
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // vector aggregates.
+
+      // For a varargs call, named arguments go into VRs or on the stack as
+      // usual; unnamed arguments always go to the stack or the corresponding
+      // GPRs when within range.  For now, we always put the value in both
+      // locations (or even all three).
+      if (isVarArg) {
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+
+          unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
+                           Arg.getSimpleValueType() == MVT::v2i64) ?
+                          VSRH[VR_idx] : VR[VR_idx];
+          ++VR_idx;
+
+          RegsToPass.push_back(std::make_pair(VReg, Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params go into VRs or on the stack.
+      if (VR_idx != NumVRs) {
+        unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
+                         Arg.getSimpleValueType() == MVT::v2i64) ?
+                        VSRH[VR_idx] : VR[VR_idx];
+        ++VR_idx;
+
+        RegsToPass.push_back(std::make_pair(VReg, Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += 16;
+      break;
+    }
+  }
+
+  assert(NumBytesActuallyUsed == ArgOffset);
+  (void)NumBytesActuallyUsed;
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Check if this is an indirect call (MTCTR/BCTRL).
+  // See PrepareCall() for more information about calls through function
+  // pointers in the 64-bit SVR4 ABI.
+  if (!isTailCall &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    // Load r2 into a virtual register and store it to the TOC save area.
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+    // TOC save area offset.
+    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
+    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+                         false, false, 0);
+    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+    // This does not mean the MTCTR instruction must use R12; it's easier
+    // to model this as an extra parameter, so do that.
+    if (isELFv2ABI)
+      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+                                                          false);
+  unsigned NumBytes = LinkageSize;
+
+  // Add up all the space actually used.
+  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+  // they all go in registers, but we must reserve stack space for them for
+  // possible use by the caller.  In varargs or 64-bit calls, parameters are
+  // assigned stack space in order, with padding so Altivec parameters are
+  // 16-byte aligned.
+  unsigned nAltivecParamsAtEnd = 0;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    // Varargs Altivec parameters are padded to a 16 byte boundary.
+    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
+      if (!isVarArg && !isPPC64) {
+        // Non-varargs Altivec parameters go after all the non-Altivec
+        // parameters; handle those later so we know how much padding we need.
+        nAltivecParamsAtEnd++;
+        continue;
+      }
+      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+      NumBytes = ((NumBytes+15)/16)*16;
+    }
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  }
+
+  // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    NumBytes = ((NumBytes+15)/16)*16;
+    NumBytes += 16*nAltivecParamsAtEnd;
+  }
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+  // Tail call needs the stack to be aligned.
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg *FPR = GetFPR();
+
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR_32);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // On PPC64, promote integers to 64-bit values.
+    if (isPPC64 && Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      unsigned Size = Flags.getByValSize();
+      // Very small objects are passed right-justified.  Everything else is
+      // passed left-justified.
+      if (Size==1 || Size==2) {
+        EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT,
+                                        false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+          ArgOffset += PtrByteSize;
+        } else {
+          SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                          PtrOff.getValueType());
+          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                            CallSeqStart,
+                                                            Flags, DAG, dl);
+          ArgOffset += PtrByteSize;
+        }
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                        CallSeqStart,
+                                                        Flags, DAG, dl);
+
+      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+      // copy the pieces of the object that fit into registers from the
+      // parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getSimpleValueType().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i1:
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        if (Arg.getValueType() == MVT::i1)
+          Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
+
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+        if (isVarArg) {
+          SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                       MachinePointerInfo(), false, false, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(), false, false,
+                                       false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
+            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else {
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+          // GPRs.
+          if (GPR_idx != NumGPRs)
+            ++GPR_idx;
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+              !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
+            ++GPR_idx;
+        }
+      } else
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      if (isPPC64)
+        ArgOffset += 8;
+      else
+        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 !=0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                            DAG.getConstant(ArgOffset, PtrVT));
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else if (nAltivecParamsAtEnd==0) {
+        // We are emitting Altivec params in order.
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
+      }
+      break;
+    }
+  }
+  // If all Altivec parameters fit in registers, as they usually do,
+  // they get stack space following the non-Altivec parameters.  We
+  // don't track this here because nobody below needs it.
+  // If there are more Altivec parameters than fit in registers emit
+  // the stores here.
+  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+    unsigned j = 0;
+    // Offset is aligned; skip 1st 12 params which go in V registers.
+    ArgOffset = ((ArgOffset+15)/16)*16;
+    ArgOffset += 12*16;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      SDValue Arg = OutVals[i];
+      EVT ArgType = Outs[i].VT;
+      if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
+          ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
+        if (++j > NumVRs) {
+          SDValue PtrOff;
+          // We are emitting Altivec params in order.
+          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                           isPPC64, isTailCall, true, MemOpChains,
+                           TailCallArguments, dl);
+          ArgOffset += 16;
+        }
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // On Darwin, R12 must contain the address of an indirect callee.  This does
+  // not mean the MTCTR instruction must use R12; it's easier to model this as
+  // an extra parameter, so do that.
+  if (!isTailCall &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG))
+    RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
+                                                   PPC::R12), Callee));
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc dl, SelectionDAG &DAG) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                   const PPCSubtarget &Subtarget) const {
+  // When we pop the dynamic allocation we need to restore the SP link.
+  SDLoc dl(Op);
+
+  // Get the corect type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Construct the stack pointer operand.
+  bool isPPC64 = Subtarget.isPPC64();
+  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
+  SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+  // Get the operands for the STACKRESTORE.
+  SDValue Chain = Op.getOperand(0);
+  SDValue SaveSP = Op.getOperand(1);
+
+  // Load the old link SP.
+  SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+
+  // Restore the stack pointer.
+  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+  // Store the old link SP.
+  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo(),
+                      false, false, 0);
+}
+
+
+
+SDValue
+PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int RASI = FI->getReturnAddrSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!RASI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+    // Allocate the frame index for frame pointer save area.
+    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true);
+    // Save the result.
+    FI->setReturnAddrSaveIndex(RASI);
+  }
+  return DAG.getFrameIndex(RASI, PtrVT);
+}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64,
+                                                           isDarwinABI);
+
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+  return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                         SelectionDAG &DAG,
+                                         const PPCSubtarget &Subtarget) const {
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  // Get the corect type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Negate the size.
+  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+                                  DAG.getConstant(0, PtrVT), Size);
+  // Construct a node for the frame pointer save index.
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNALLOC node.
+  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 loads");
+
+  // First, load 8 bits into 32 bits, then truncate to 1 bit.
+
+  SDLoc dl(Op);
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  MachineMemOperand *MMO = LD->getMemOperand();
+
+  SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain,
+                                 BasePtr, MVT::i8, MMO);
+  SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
+
+  SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 stores");
+
+  // First, zero extend to 32 bits, then use a truncating store to 8 bits.
+
+  SDLoc dl(Op);
+  StoreSDNode *ST = cast<StoreSDNode>(Op);
+
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  MachineMemOperand *MMO = ST->getMemOperand();
+
+  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value);
+  return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
+}
+
+// FIXME: Remove this once the ANDI glue bug is fixed:
+SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 results");
+
+  SDLoc DL(Op);
+  return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1,
+                     Op.getOperand(0));
+}
+
+/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
+/// possible.
+SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  // Not FP? Not a fsel.
+  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
+      !Op.getOperand(2).getValueType().isFloatingPoint())
+    return Op;
+
+  // We might be able to do better than this under some circumstances, but in
+  // general, fsel-based lowering of select is a finite-math-only optimization.
+  // For more information, see section F.3 of the 2.06 ISA specification.
+  if (!DAG.getTarget().Options.NoInfsFPMath ||
+      !DAG.getTarget().Options.NoNaNsFPMath)
+    return Op;
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  EVT ResVT = Op.getValueType();
+  EVT CmpVT = Op.getOperand(0).getValueType();
+  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+  SDValue TV  = Op.getOperand(2), FV  = Op.getOperand(3);
+  SDLoc dl(Op);
+
+  // If the RHS of the comparison is a 0.0, we don't need to do the
+  // subtraction at all.
+  SDValue Sel1;
+  if (isFloatingPointZero(RHS))
+    switch (CC) {
+    default: break;       // SETUO etc aren't handled by fsel.
+    case ISD::SETNE:
+      std::swap(TV, FV);
+    case ISD::SETEQ:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+      if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
+    case ISD::SETULT:
+    case ISD::SETLT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+    case ISD::SETUGT:
+    case ISD::SETGT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
+    }
+
+  SDValue Cmp;
+  switch (CC) {
+  default: break;       // SETUO etc aren't handled by fsel.
+  case ISD::SETNE:
+    std::swap(TV, FV);
+  case ISD::SETEQ:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+    if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                       DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
+  case ISD::SETULT:
+  case ISD::SETLT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  case ISD::SETUGT:
+  case ISD::SETGT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  }
+  return Op;
+}
+
+// FIXME: Split this code up when LegalizeDAGTypes lands.
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                           SDLoc dl) const {
+  assert(Op.getOperand(0).getValueType().isFloatingPoint());
+  SDValue Src = Op.getOperand(0);
+  if (Src.getValueType() == MVT::f32)
+    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+
+  SDValue Tmp;
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
+  case MVT::i32:
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
+                        (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ :
+                                                   PPCISD::FCTIDZ),
+                      dl, MVT::f64, Src);
+    break;
+  case MVT::i64:
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ,
+                      dl, MVT::f64, Src);
+    break;
+  }
+
+  // Convert the FP value to an int value through memory.
+  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
+    (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
+  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
+  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI);
+
+  // Emit a store to the stack slot.
+  SDValue Chain;
+  if (i32Stack) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
+    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
+    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
+  } else
+    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
+                         MPI, false, false, 0);
+
+  // Result is a load from the stack slot.  If loading 4 bytes, make sure to
+  // add in a bias.
+  if (Op.getValueType() == MVT::i32 && !i32Stack) {
+    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
+                        DAG.getConstant(4, FIPtr.getValueType()));
+    MPI = MachinePointerInfo();
+  }
+
+  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
+                       DAG.getConstantFP(1.0, Op.getValueType()),
+                       DAG.getConstantFP(0.0, Op.getValueType()));
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDUS : PPCISD::FCFIDS) :
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDU : PPCISD::FCFID);
+  MVT      FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+                   MVT::f32 : MVT::f64;
+
+  if (Op.getOperand(0).getValueType() == MVT::i64) {
+    SDValue SINT = Op.getOperand(0);
+    // When converting to single-precision, we actually need to convert
+    // to double-precision first and then round to single-precision.
+    // To avoid double-rounding effects during that operation, we have
+    // to prepare the input operand.  Bits that might be truncated when
+    // converting to double-precision are replaced by a bit that won't
+    // be lost at this stage, but is below the single-precision rounding
+    // position.
+    //
+    // However, if -enable-unsafe-fp-math is in effect, accept double
+    // rounding to avoid the extra overhead.
+    if (Op.getValueType() == MVT::f32 &&
+        !Subtarget.hasFPCVT() &&
+        !DAG.getTarget().Options.UnsafeFPMath) {
+
+      // Twiddle input to make sure the low 11 bits are zero.  (If this
+      // is the case, we are guaranteed the value will fit into the 53 bit
+      // mantissa of an IEEE double-precision value without rounding.)
+      // If any of those low 11 bits were not zero originally, make sure
+      // bit 12 (value 2048) is set instead, so that the final rounding
+      // to single-precision gets the correct result.
+      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                                  SINT, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                          Round, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                          Round, DAG.getConstant(-2048, MVT::i64));
+
+      // However, we cannot use that value unconditionally: if the magnitude
+      // of the input value is small, the bit-twiddling we did above might
+      // end up visibly changing the output.  Fortunately, in that case, we
+      // don't need to twiddle bits since the original input will convert
+      // exactly to double-precision floating-point already.  Therefore,
+      // construct a conditional to use the original value if the top 11
+      // bits are all sign-bit copies, and use the rounded value computed
+      // above otherwise.
+      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+                                 SINT, DAG.getConstant(53, MVT::i32));
+      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                         Cond, DAG.getConstant(1, MVT::i64));
+      Cond = DAG.getSetCC(dl, MVT::i32,
+                          Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT);
+
+      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+    }
+
+    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
+
+    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0));
+    return FP;
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+         "Unhandled INT_TO_FP type in custom expander!");
+  // Since we only generate this in 64-bit mode, we can take advantage of
+  // 64-bit registers.  In particular, sign extend the input value into the
+  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
+  // then lfd it and fcfid it.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  SDValue Ld;
+  if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
+    int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+                                 MachinePointerInfo::getFixedStack(FrameIdx),
+                                 false, false, 0);
+
+    assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+           "Expected an i32 store");
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                              MachineMemOperand::MOLoad, 4, 4);
+    SDValue Ops[] = { Store, FIdx };
+    Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
+                                   PPCISD::LFIWZX : PPCISD::LFIWAX,
+                                 dl, DAG.getVTList(MVT::f64, MVT::Other),
+                                 Ops, MVT::i32, MMO);
+  } else {
+    assert(Subtarget.isPPC64() &&
+           "i32->FP without LFIWAX supported only on PPC64");
+
+    int FrameIdx = FrameInfo->CreateStackObject(8, 8, false);
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
+                                Op.getOperand(0));
+
+    // STD the extended value into the stack slot.
+    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx,
+                                 MachinePointerInfo::getFixedStack(FrameIdx),
+                                 false, false, 0);
+
+    // Load the value as a double.
+    Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx,
+                     MachinePointerInfo::getFixedStack(FrameIdx),
+                     false, false, false, 0);
+  }
+
+  // FCFID it and return it.
+  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
+  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0));
+  return FP;
+}
+
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  /*
+   The rounding mode is in bits 30:31 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to 0
+     10 Round to +inf
+     11 Round to -inf
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT VT = Op.getValueType();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Save FP Control Word to register
+  EVT NodeTys[] = {
+    MVT::f64,    // return register
+    MVT::Glue    // unused in this context
+  };
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
+
+  // Save FP register to stack slot
+  int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
+                               StackSlot, MachinePointerInfo(), false, false,0);
+
+  // Load FP Control Word from low 32 bits of stack slot.
+  SDValue Four = DAG.getConstant(4, PtrVT);
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(),
+                            false, false, false, 0);
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::AND, dl, MVT::i32,
+                CWD, DAG.getConstant(3, MVT::i32));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, dl, MVT::i32,
+                DAG.getNode(ISD::AND, dl, MVT::i32,
+                            DAG.getNode(ISD::XOR, dl, MVT::i32,
+                                        CWD, DAG.getConstant(3, MVT::i32)),
+                            DAG.getConstant(3, MVT::i32)),
+                DAG.getConstant(1, MVT::i32));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  SDLoc dl(Op);
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SHL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
+  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
+  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRA!");
+
+  // Expand into a bunch of logical ops, followed by a select_cc.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
+  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
+  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT),
+                                  Tmp4, Tmp6, ISD::SETLE);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+/// BuildSplatI - Build a canonical splati of Val with an element size of
+/// SplatSize.  Cast the result to VT.
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
+                             SelectionDAG &DAG, SDLoc dl) {
+  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+  static const EVT VTys[] = { // canonical VT to use for each size.
+    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+  };
+
+  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+
+  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+  if (Val == -1)
+    SplatSize = 1;
+
+  EVT CanonicalVT = VTys[SplatSize-1];
+
+  // Build a canonical splat for this value.
+  SDValue Elt = DAG.getConstant(Val, MVT::i32);
+  SmallVector<SDValue, 8> Ops;
+  Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops);
+  return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
+}
+
+/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
+                                SelectionDAG &DAG, SDLoc dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op);
+}
+
+/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
+                                SelectionDAG &DAG, SDLoc dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = LHS.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
+                                SDValue Op2, SelectionDAG &DAG,
+                                SDLoc dl, EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op0.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
+}
+
+
+/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
+/// amount.  The result has the specified value type.
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
+                             EVT VT, SelectionDAG &DAG, SDLoc dl) {
+  // Force LHS/RHS to be the right type.
+  LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
+
+  int Ops[16];
+  for (unsigned i = 0; i != 16; ++i)
+    Ops[i] = i + Amt;
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.  If we CAN select this case, and if it
+// selects to a single instruction, return Op.  Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+
+  // Check if this is a splat of a constant value.
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, true) || SplatBitSize > 32)
+    return SDValue();
+
+  unsigned SplatBits = APSplatBits.getZExtValue();
+  unsigned SplatUndef = APSplatUndef.getZExtValue();
+  unsigned SplatSize = SplatBitSize / 8;
+
+  // First, handle single instruction cases.
+
+  // All zeros?
+  if (SplatBits == 0) {
+    // Canonicalize all zero vectors to be v4i32.
+    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+      SDValue Z = DAG.getConstant(0, MVT::i32);
+      Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z);
+      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
+    }
+    return Op;
+  }
+
+  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
+                    (32-SplatBitSize));
+  if (SextVal >= -16 && SextVal <= 15)
+    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+
+
+  // Two instruction sequences.
+
+  // If this value is in the range [-32,30] and is even, use:
+  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
+  // If this value is in the range [17,31] and is odd, use:
+  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
+  // If this value is in the range [-31,-17] and is odd, use:
+  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
+  // Note the last two are three-instruction sequences.
+  if (SextVal >= -32 && SextVal <= 31) {
+    // To avoid having these optimizations undone by constant folding,
+    // we convert to a pseudo that will be expanded later into one of
+    // the above forms.
+    SDValue Elt = DAG.getConstant(SextVal, MVT::i32);
+    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
+              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
+    SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32);
+    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
+    if (VT == Op.getValueType())
+      return RetVal;
+    else
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
+  }
+
+  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
+  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
+  // for fneg/fabs.
+  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+    // Make -1 and vspltisw -1:
+    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+
+    // Make the VSLW intrinsic, computing 0x8000_0000.
+    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
+                                   OnesV, DAG, dl);
+
+    // xor by OnesV to invert it.
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+  }
+
+  // The remaining cases assume either big endian element order or
+  // a splat-size that equates to the element size of the vector
+  // to be built.  An example that doesn't work for little endian is
+  // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits
+  // and a vector element size of 16 bits.  The code below will
+  // produce the vector in big endian element order, which for little
+  // endian is {-1, 0, -1, 0, -1, 0, -1, 0}.
+
+  // For now, just avoid these optimizations in that case.
+  // FIXME: Develop correct optimizations for LE with mismatched
+  // splat and element sizes.
+
+  if (Subtarget.isLittleEndian() &&
+      SplatSize != Op.getValueType().getVectorElementType().getSizeInBits())
+    return SDValue();
+
+  // Check to see if this is a wide variety of vsplti*, binop self cases.
+  static const signed char SplatCsts[] = {
+    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+  };
+
+  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+    // cases which are ambiguous (e.g. formation of 0x8000_0000).  'vsplti -1'
+    int i = SplatCsts[idx];
+
+    // Figure out what shift amount will be used by altivec if shifted by i in
+    // this splat size.
+    unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+    // vsplti + shl self.
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+        Intrinsic::ppc_altivec_vslw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + srl self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+        Intrinsic::ppc_altivec_vsrw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + sra self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+        Intrinsic::ppc_altivec_vsraw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + rol self.
+    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+        Intrinsic::ppc_altivec_vrlw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // t = vsplti c, result = vsldoi t, t, 1
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 2
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 3
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
+    }
+  }
+
+  return SDValue();
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      SDLoc dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+
+  int ShufIdxs[16];
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown i32 permute!");
+  case OP_VMRGHW:
+    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
+    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
+    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+    break;
+  case OP_VMRGLW:
+    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+    break;
+  case OP_VSPLTISW0:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+0;
+    break;
+  case OP_VSPLTISW1:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+4;
+    break;
+  case OP_VSPLTISW2:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+8;
+    break;
+  case OP_VSPLTISW3:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+12;
+    break;
+  case OP_VSLDOI4:
+    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI8:
+    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI12:
+    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
+  }
+  EVT VT = OpLHS.getValueType();
+  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
+  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
+/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
+/// return the code it can be lowered into.  Worst case, it can always be
+/// lowered into a vperm.
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  EVT VT = Op.getValueType();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+
+  // Cases that are handled by instructions that take permute immediates
+  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+  // selected by the instruction selector.
+  if (V2.getOpcode() == ISD::UNDEF) {
+    if (PPC::isSplatShuffleMask(SVOp, 1) ||
+        PPC::isSplatShuffleMask(SVOp, 2) ||
+        PPC::isSplatShuffleMask(SVOp, 4) ||
+        PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
+        PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
+        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
+        PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
+        PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
+        PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) {
+      return Op;
+    }
+  }
+
+  // Altivec has a variety of "shuffle immediates" that take two vector inputs
+  // and produce a fixed permutation.  If any of these match, do not lower to
+  // VPERM.
+  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
+  if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+      PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
+      PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+      PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+      PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG))
+    return Op;
+
+  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
+  // perfect shuffle table to emit an optimal matching sequence.
+  ArrayRef<int> PermMask = SVOp->getMask();
+
+  unsigned PFIndexes[4];
+  bool isFourElementShuffle = true;
+  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+    unsigned EltNo = 8;   // Start out undef.
+    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
+      if (PermMask[i*4+j] < 0)
+        continue;   // Undef, ignore it.
+
+      unsigned ByteSource = PermMask[i*4+j];
+      if ((ByteSource & 3) != j) {
+        isFourElementShuffle = false;
+        break;
+      }
+
+      if (EltNo == 8) {
+        EltNo = ByteSource/4;
+      } else if (EltNo != ByteSource/4) {
+        isFourElementShuffle = false;
+        break;
+      }
+    }
+    PFIndexes[i] = EltNo;
+  }
+
+  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+  // perfect shuffle vector to determine if it is cost effective to do this as
+  // discrete instructions, or whether we should use a vperm.
+  // For now, we skip this for little endian until such time as we have a
+  // little-endian perfect shuffle table.
+  if (isFourElementShuffle && !isLittleEndian) {
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost  = (PFEntry >> 30);
+
+    // Determining when to avoid vperm is tricky.  Many things affect the cost
+    // of vperm, particularly how many times the perm mask needs to be computed.
+    // For example, if the perm mask can be hoisted out of a loop or is already
+    // used (perhaps because there are multiple permutes with the same shuffle
+    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
+    // the loop requires an extra register.
+    //
+    // As a compromise, we only emit discrete instructions if the shuffle can be
+    // generated in 3 or fewer operations.  When we have loop information
+    // available, if this block is within a loop, we should avoid using vperm
+    // for 3-operation perms and use a constant pool load instead.
+    if (Cost < 3)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+  // vector that will get spilled to the constant pool.
+  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
+  // that it is in input element units, not in bytes.  Convert now.
+
+  // For little endian, the order of the input vectors is reversed, and
+  // the permutation mask is complemented with respect to 31.  This is
+  // necessary to produce proper semantics with the big-endian-biased vperm
+  // instruction.
+  EVT EltVT = V1.getValueType().getVectorElementType();
+  unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+  SmallVector<SDValue, 16> ResultMask;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
+
+    for (unsigned j = 0; j != BytesPerElement; ++j)
+      if (isLittleEndian)
+        ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j),
+                                             MVT::i32));
+      else
+        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
+                                             MVT::i32));
+  }
+
+  SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
+                                  ResultMask);
+  if (isLittleEndian)
+    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+                       V2, V1, VPermMask);
+  else
+    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+                       V1, V2, VPermMask);
+}
+
+/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
+/// altivec comparison.  If it is, return true and fill in Opc/isDot with
+/// information about the intrinsic.
+static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
+                                  bool &isDot) {
+  unsigned IntrinsicID =
+    cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+  CompareOpc = -1;
+  isDot = false;
+  switch (IntrinsicID) {
+  default: return false;
+    // Comparison predicates.
+  case Intrinsic::ppc_altivec_vcmpbfp_p:  CompareOpc = 966; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc =   6; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc =  70; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+
+    // Normal Comparisons.
+  case Intrinsic::ppc_altivec_vcmpbfp:    CompareOpc = 966; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp:   CompareOpc = 198; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequb:   CompareOpc =   6; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequh:   CompareOpc =  70; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequw:   CompareOpc = 134; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgefp:   CompareOpc = 454; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp:   CompareOpc = 710; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb:   CompareOpc = 774; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh:   CompareOpc = 838; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw:   CompareOpc = 902; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtub:   CompareOpc = 518; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh:   CompareOpc = 582; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw:   CompareOpc = 646; isDot = 0; break;
+  }
+  return true;
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower, do it, otherwise return null.
+SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  // If this is a lowered altivec predicate compare, CompareOpc is set to the
+  // opcode number of the comparison.
+  SDLoc dl(Op);
+  int CompareOpc;
+  bool isDot;
+  if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
+    return SDValue();    // Don't custom lower most intrinsics.
+
+  // If this is a non-dot comparison, make the VCMP node and we are done.
+  if (!isDot) {
+    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
+                              Op.getOperand(1), Op.getOperand(2),
+                              DAG.getConstant(CompareOpc, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
+  }
+
+  // Create the PPCISD altivec 'dot' comparison node.
+  SDValue Ops[] = {
+    Op.getOperand(2),  // LHS
+    Op.getOperand(3),  // RHS
+    DAG.getConstant(CompareOpc, MVT::i32)
+  };
+  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
+  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+
+  // Now that we have the comparison, emit a copy from the CR to a GPR.
+  // This is flagged to the above dot comparison.
+  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
+                                DAG.getRegister(PPC::CR6, MVT::i32),
+                                CompNode.getValue(1));
+
+  // Unpack the result based on how the target uses it.
+  unsigned BitNo;   // Bit # of CR6.
+  bool InvertBit;   // Invert result?
+  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+  default:  // Can't happen, don't crash on invalid number though.
+  case 0:   // Return the value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = false;
+    break;
+  case 1:   // Return the inverted value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = true;
+    break;
+  case 2:   // Return the value of the LT bit of CR6.
+    BitNo = 2; InvertBit = false;
+    break;
+  case 3:   // Return the inverted value of the LT bit of CR6.
+    BitNo = 2; InvertBit = true;
+    break;
+  }
+
+  // Shift the bit into the low position.
+  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+                      DAG.getConstant(8-(3-BitNo), MVT::i32));
+  // Isolate the bit.
+  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+                      DAG.getConstant(1, MVT::i32));
+
+  // If we are supposed to, toggle the bit.
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+                        DAG.getConstant(1, MVT::i32));
+  return Flags;
+}
+
+SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int
+  // instructions), but for smaller types, we need to first extend up to v2i32
+  // before doing going farther.
+  if (Op.getValueType() == MVT::v2i64) {
+    EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    if (ExtVT != MVT::v2i32) {
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
+                       DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
+                                        ExtVT.getVectorElementType(), 4)));
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
+                       DAG.getValueType(MVT::v2i32));
+    }
+
+    return Op;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+  EVT PtrVT = getPointerTy();
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  // Store the input value into Value#0 of the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+                               Op.getOperand(0), FIdx, MachinePointerInfo(),
+                               false, false, 0);
+  // Load it out.
+  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  if (Op.getValueType() == MVT::v4i32) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
+    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
+
+    SDValue RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
+
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                        LHS, RHS, DAG, dl, MVT::v4i32);
+
+    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+                              Neg16, DAG, dl);
+    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG, dl);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    bool isLittleEndian = Subtarget.isLittleEndian();
+
+    // Multiply the even 8-bit parts, producing 16-bit sums.
+    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                           LHS, RHS, DAG, dl, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
+
+    // Multiply the odd 8-bit parts, producing 16-bit sums.
+    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+                                          LHS, RHS, DAG, dl, MVT::v8i16);
+    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
+
+    // Merge the results together.  Because vmuleub and vmuloub are
+    // instructions with a big-endian bias, we must reverse the
+    // element numbering and reverse the meaning of "odd" and "even"
+    // when generating little endian code.
+    int Ops[16];
+    for (unsigned i = 0; i != 8; ++i) {
+      if (isLittleEndian) {
+        Ops[i*2  ] = 2*i;
+        Ops[i*2+1] = 2*i+16;
+      } else {
+        Ops[i*2  ] = 2*i+1;
+        Ops[i*2+1] = 2*i+1+16;
+      }
+    }
+    if (isLittleEndian)
+      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
+    else
+      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+  } else {
+    llvm_unreachable("Unknown mul to lower!");
+  }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG, Subtarget);
+
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG, Subtarget);
+
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG, Subtarget);
+
+  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, Subtarget);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);
+
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
+
+  case ISD::LOAD:               return LowerLOAD(Op, DAG);
+  case ISD::STORE:              return LowerSTORE(Op, DAG);
+  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
+                                                       SDLoc(Op));
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+
+  // Lower 64-bit shifts.
+  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
+  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);
+
+  // Vector-related lowering.
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, DAG);
+
+  // For counter-based loop handling.
+  case ISD::INTRINSIC_W_CHAIN:  return SDValue();
+
+  // Frame & Return address.
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  }
+}
+
+void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  const TargetMachine &TM = getTargetMachine();
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::INTRINSIC_W_CHAIN: {
+    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
+        Intrinsic::ppc_is_decremented_ctr_nonzero)
+      break;
+
+    assert(N->getValueType(0) == MVT::i1 &&
+           "Unexpected result type for CTR decrement intrinsic");
+    EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0));
+    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
+    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
+                                 N->getOperand(1)); 
+
+    Results.push_back(NewInt);
+    Results.push_back(NewInt.getValue(1));
+    break;
+  }
+  case ISD::VAARG: {
+    if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
+        || TM.getSubtarget<PPCSubtarget>().isPPC64())
+      return;
+
+    EVT VT = N->getValueType(0);
+
+    if (VT == MVT::i64) {
+      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget);
+
+      Results.push_back(NewNode);
+      Results.push_back(NewNode.getValue(1));
+    }
+    return;
+  }
+  case ISD::FP_ROUND_INREG: {
+    assert(N->getValueType(0) == MVT::ppcf128);
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128);
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(0));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(1));
+
+    // Add the two halves of the long double in round-to-zero mode.
+    SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+
+    // We know the low half is about to be thrown away, so just use something
+    // convenient.
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+                                FPreg, FPreg));
+    return;
+  }
+  case ISD::FP_TO_SINT:
+    // LowerFP_TO_INT() can only handle f32 and f64.
+    if (N->getOperand(0).getValueType() == MVT::ppcf128)
+      return;
+    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+    return;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    bool is64bit, unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptrA = MI->getOperand(1).getReg();
+  unsigned ptrB = MI->getOperand(2).getReg();
+  unsigned incr = MI->getOperand(3).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned TmpReg = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister(
+       is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+                 (const TargetRegisterClass *) &PPC::GPRCRegClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   l[wd]arx dest, ptr
+  //   add r0, dest, incr
+  //   st[wd]cx. r0, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+    .addReg(ptrA).addReg(ptrB);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  return BB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            bool is8bit,    // operation
+                                            unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  // In 64 bit mode we have to use 64 bits for addresses, even though the
+  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
+  // registers without caring whether they're 32 or 64, but here we're
+  // doing actual arithmetic on the addresses.
+  bool is64bit = Subtarget.isPPC64();
+  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptrA = MI->getOperand(1).getReg();
+  unsigned ptrB = MI->getOperand(2).getReg();
+  unsigned incr = MI->getOperand(3).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  const TargetRegisterClass *RC =
+    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+              (const TargetRegisterClass *) &PPC::GPRCRegClass;
+  unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+  unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+  unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+  unsigned Ptr1Reg;
+  unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  // The 4-byte load must be aligned, while a char or short may be
+  // anywhere in the word.  Hence all this nasty bookkeeping code.
+  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+  //   xori shift, shift1, 24 [16]
+  //   rlwinm ptr, ptr1, 0, 0, 29
+  //   slw incr2, incr, shift
+  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+  //   slw mask, mask2, shift
+  //  loopMBB:
+  //   lwarx tmpDest, ptr
+  //   add tmp, tmpDest, incr2
+  //   andc tmp2, tmpDest, mask
+  //   and tmp3, tmp, mask
+  //   or tmp4, tmp3, tmp2
+  //   stwcx. tmp4, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  //   srw dest, tmpDest, shift
+  if (ptrA != ZeroReg) {
+    Ptr1Reg = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+      .addReg(ptrA).addReg(ptrB);
+  } else {
+    Ptr1Reg = ptrB;
+  }
+  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+      .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+      .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+  if (is64bit)
+    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+      .addReg(Ptr1Reg).addImm(0).addImm(61);
+  else
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+      .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
+      .addReg(incr).addReg(ShiftReg);
+  if (is8bit)
+    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+  else {
+    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+    BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535);
+  }
+  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+      .addReg(Mask2Reg).addReg(ShiftReg);
+
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+    .addReg(ZeroReg).addReg(PtrReg);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
+      .addReg(Incr2Reg).addReg(TmpDestReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
+    .addReg(TmpDestReg).addReg(MaskReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
+    .addReg(TmpReg).addReg(MaskReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
+    .addReg(Tmp3Reg).addReg(Tmp2Reg);
+  BuildMI(BB, dl, TII->get(PPC::STWCX))
+    .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg);
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
+    .addReg(ShiftReg);
+  return BB;
+}
+
+llvm::MachineBasicBlock*
+PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  SjLjSetup mainMBB
+  //  bl mainMBB
+  //  v_restore = 1
+  //  b sinkMBB
+  //
+  // mainMBB:
+  //  buf[LabelOffset] = LR
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Note that the structure of the jmp_buf used here is not compatible
+  // with that used by libc, and is not designed to be. Specifically, it
+  // stores only those 'reserved' registers that LLVM does not otherwise
+  // understand how to spill. Also, by convention, by the time this
+  // intrinsic is called, Clang has already stored the frame address in the
+  // first slot of the buffer and stack address in the third. Following the
+  // X86 target code, we'll store the jump address in the second slot. We also
+  // need to save the TOC pointer (R2) to handle jumps between shared
+  // libraries, and that will be stored in the fourth slot. The thread
+  // identifier (R13) is not affected.
+
+  // thisMBB:
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
+
+  // Prepare IP either in reg.
+  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
+  unsigned BufReg = MI->getOperand(1).getReg();
+
+  if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
+    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
+            .addReg(PPC::X2)
+            .addImm(TOCOffset)
+            .addReg(BufReg);
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Naked functions never have a base pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned BaseReg;
+  if (MF->getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::Naked))
+    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
+  else
+    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
+
+  MIB = BuildMI(*thisMBB, MI, DL,
+                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
+          .addReg(BaseReg)
+          .addImm(BPOffset)
+          .addReg(BufReg);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
+  const PPCRegisterInfo *TRI =
+    static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+  MIB.addRegMask(TRI->getNoPreservedMask());
+
+  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
+
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
+          .addMBB(mainMBB);
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
+
+  thisMBB->addSuccessor(mainMBB, /* weight */ 0);
+  thisMBB->addSuccessor(sinkMBB, /* weight */ 1);
+
+  // mainMBB:
+  //  mainDstReg = 0
+  MIB = BuildMI(mainMBB, DL,
+    TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
+
+  // Store IP
+  if (Subtarget.isPPC64()) {
+    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
+            .addReg(LabelReg)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
+            .addReg(LabelReg)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  }
+
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(PPC::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(thisMBB);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
+  unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
+  unsigned BP  = (PVT == MVT::i64) ? PPC::X30 :
+                  (Subtarget.isSVR4ABI() &&
+                   MF->getTarget().getRelocationModel() == Reloc::PIC_ ?
+                     PPC::R29 : PPC::R30);
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset    = 2 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
+
+  unsigned BufReg = MI->getOperand(0).getReg();
+
+  // Reload FP (the jumped-to function may not have had a
+  // frame pointer, and if so, then its r31 will be restored
+  // as necessary).
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
+            .addImm(0)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
+            .addImm(0)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload IP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload SP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
+            .addImm(SPOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
+            .addImm(SPOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload BP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload TOC
+  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
+            .addImm(TOCOffset)
+            .addReg(BufReg);
+
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Jump
+  BuildMI(*MBB, MI, DL,
+          TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
+  BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
+      MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
+    return emitEHSjLjSetJmp(MI, BB);
+  } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 ||
+             MI->getOpcode() == PPC::EH_SjLj_LongJmp64) {
+    return emitEHSjLjLongJmp(MI, BB);
+  }
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  // To "insert" these instructions we actually have to insert their
+  // control-flow patterns.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  MachineFunction *F = BB->getParent();
+
+  if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+                                 MI->getOpcode() == PPC::SELECT_CC_I8 ||
+                                 MI->getOpcode() == PPC::SELECT_I4 ||
+                                 MI->getOpcode() == PPC::SELECT_I8)) {
+    SmallVector<MachineOperand, 2> Cond;
+    if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+        MI->getOpcode() == PPC::SELECT_CC_I8)
+      Cond.push_back(MI->getOperand(4));
+    else
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+    Cond.push_back(MI->getOperand(1));
+
+    DebugLoc dl = MI->getDebugLoc();
+    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
+                      Cond, MI->getOperand(2).getReg(),
+                      MI->getOperand(3).getReg());
+  } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+             MI->getOpcode() == PPC::SELECT_CC_I8 ||
+             MI->getOpcode() == PPC::SELECT_CC_F4 ||
+             MI->getOpcode() == PPC::SELECT_CC_F8 ||
+             MI->getOpcode() == PPC::SELECT_CC_VRRC ||
+             MI->getOpcode() == PPC::SELECT_I4 ||
+             MI->getOpcode() == PPC::SELECT_I8 ||
+             MI->getOpcode() == PPC::SELECT_F4 ||
+             MI->getOpcode() == PPC::SELECT_F8 ||
+             MI->getOpcode() == PPC::SELECT_VRRC) {
+    // The incoming instruction knows the destination vreg to set, the
+    // condition code register to branch on, the true/false values to
+    // select between, and a branch opcode to use.
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB = BB;
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    DebugLoc dl = MI->getDebugLoc();
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    // Next, add the true and fallthrough blocks as its successors.
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    if (MI->getOpcode() == PPC::SELECT_I4 ||
+        MI->getOpcode() == PPC::SELECT_I8 ||
+        MI->getOpcode() == PPC::SELECT_F4 ||
+        MI->getOpcode() == PPC::SELECT_F8 ||
+        MI->getOpcode() == PPC::SELECT_VRRC) {
+      BuildMI(BB, dl, TII->get(PPC::BC))
+        .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+    } else {
+      unsigned SelectPred = MI->getOperand(4).getImm();
+      BuildMI(BB, dl, TII->get(PPC::BCC))
+        .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+    }
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(*BB, BB->begin(), dl,
+            TII->get(PPC::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+  }
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::AND8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::OR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::NAND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
+    BB = EmitAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
+    BB = EmitAtomicBinary(MI, BB, true, 0);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) {
+    bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // loop1MBB:
+    //   l[wd]arx dest, ptr
+    //   cmp[wd] dest, oldval
+    //   bne- midMBB
+    // loop2MBB:
+    //   st[wd]cx. newval, ptr
+    //   bne- loopMBB
+    //   b exitBB
+    // midMBB:
+    //   st[wd]cx. dest, ptr
+    // exitBB:
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+      .addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
+      .addReg(oldval).addReg(dest);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+      .addReg(newval).addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+      .addReg(dest).addReg(ptrA).addReg(ptrB);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+  } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+             MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+    // We must use 64-bit registers for addresses when targeting 64-bit,
+    // since we're actually doing arithmetic on them.  Other registers
+    // can be 32-bit.
+    bool is64bit = Subtarget.isPPC64();
+    bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    const TargetRegisterClass *RC =
+      is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+                (const TargetRegisterClass *) &PPC::GPRCRegClass;
+    unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+    unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+    unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+    unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+    unsigned Ptr1Reg;
+    unsigned TmpReg = RegInfo.createVirtualRegister(RC);
+    unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // The 4-byte load must be aligned, while a char or short may be
+    // anywhere in the word.  Hence all this nasty bookkeeping code.
+    //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+    //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+    //   xori shift, shift1, 24 [16]
+    //   rlwinm ptr, ptr1, 0, 0, 29
+    //   slw newval2, newval, shift
+    //   slw oldval2, oldval,shift
+    //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+    //   slw mask, mask2, shift
+    //   and newval3, newval2, mask
+    //   and oldval3, oldval2, mask
+    // loop1MBB:
+    //   lwarx tmpDest, ptr
+    //   and tmp, tmpDest, mask
+    //   cmpw tmp, oldval3
+    //   bne- midMBB
+    // loop2MBB:
+    //   andc tmp2, tmpDest, mask
+    //   or tmp4, tmp2, newval3
+    //   stwcx. tmp4, ptr
+    //   bne- loop1MBB
+    //   b exitBB
+    // midMBB:
+    //   stwcx. tmpDest, ptr
+    // exitBB:
+    //   srw dest, tmpDest, shift
+    if (ptrA != ZeroReg) {
+      Ptr1Reg = RegInfo.createVirtualRegister(RC);
+      BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+        .addReg(ptrA).addReg(ptrB);
+    } else {
+      Ptr1Reg = ptrB;
+    }
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+        .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+        .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+    if (is64bit)
+      BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(61);
+    else
+      BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
+        .addReg(newval).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
+        .addReg(oldval).addReg(ShiftReg);
+    if (is8bit)
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+    else {
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
+        .addReg(Mask3Reg).addImm(65535);
+    }
+    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+        .addReg(Mask2Reg).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
+        .addReg(NewVal2Reg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
+        .addReg(OldVal2Reg).addReg(MaskReg);
+
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+        .addReg(ZeroReg).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::AND),TmpReg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
+        .addReg(TmpReg).addReg(OldVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+        .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg)
+        .addReg(Tmp2Reg).addReg(NewVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
+        .addReg(ZeroReg).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
+      .addReg(ZeroReg).addReg(PtrReg);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg)
+      .addReg(ShiftReg);
+  } else if (MI->getOpcode() == PPC::FADDrtz) {
+    // This pseudo performs an FADD with rounding mode temporarily forced
+    // to round-to-zero.  We emit this via custom inserter since the FPSCR
+    // is not modeled at the SelectionDAG level.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src1 = MI->getOperand(1).getReg();
+    unsigned Src2 = MI->getOperand(2).getReg();
+    DebugLoc dl   = MI->getDebugLoc();
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+
+    // Save FPSCR value.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
+
+    // Set rounding mode to round-to-zero.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
+
+    // Perform addition.
+    BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
+
+    // Restore FPSCR value.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
+  } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+             MI->getOpcode() == PPC::ANDIo_1_GT_BIT ||
+             MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+             MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) {
+    unsigned Opcode = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+                       MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) ?
+                      PPC::ANDIo8 : PPC::ANDIo;
+    bool isEQ = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+                 MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8);
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
+                                                  &PPC::GPRCRegClass :
+                                                  &PPC::G8RCRegClass);
+
+    DebugLoc dl   = MI->getDebugLoc();
+    BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
+      .addReg(MI->getOperand(1).getReg()).addImm(1);
+    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
+            MI->getOperand(0).getReg())
+      .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
+  } else {
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
+                                               DAGCombinerInfo &DCI) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRE())  ||
+      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal, we need to find the zero of the function:
+    //   F(X) = A X - 1 [which has a zero at X = 1/A]
+    //     =>
+    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+    //     does not require additional intermediate precision]
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    if (VT.getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc dl(Op);
+
+    SDValue FPOne =
+      DAG.getConstantFP(1.0, VT.getScalarType());
+    if (VT.isVector()) {
+      assert(VT.getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+                          FPOne, FPOne, FPOne, FPOne);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
+    DCI.AddToWorklist(Est.getNode());
+
+    // Newton iterations: Est = Est + Est (1 - Arg * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRSQRTE())  ||
+      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal sqrt, we need to find the zero of the function:
+    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+    //     =>
+    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+    // As a result, we precompute A/2 prior to the iteration loop.
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    if (VT.getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc dl(Op);
+
+    SDValue FPThreeHalves =
+      DAG.getConstantFP(1.5, VT.getScalarType());
+    if (VT.isVector()) {
+      assert(VT.getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+                                  FPThreeHalves, FPThreeHalves,
+                                  FPThreeHalves, FPThreeHalves);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
+    DCI.AddToWorklist(Est.getNode());
+
+    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
+    // this entire sequence requires only one FP constant.
+    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
+// not enforce equality of the chain operands.
+static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  EVT VT = LS->getMemoryVT();
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LS->getBasePtr();
+  SDValue BaseLoc = Base->getBasePtr();
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+
+  // Handle X+C
+  if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
+      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
+    return true;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const GlobalValue *GV1 = nullptr;
+  const GlobalValue *GV2 = nullptr;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+// Return true is there is a nearyby consecutive load to the one provided
+// (regardless of alignment). We search up and down the chain, looking though
+// token factors and other loads (but nothing else). As a result, a true
+// results indicates that it is safe to create a new consecutive load adjacent
+// to the load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDValue Chain = LD->getChain();
+  EVT VT = LD->getMemoryVT();
+
+  SmallSet<SDNode *, 16> LoadRoots;
+  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
+  SmallSet<SDNode *, 16> Visited;
+
+  // First, search up the chain, branching to follow all token-factor operands.
+  // If we find a consecutive load, then we're done, otherwise, record all
+  // nodes just above the top-level loads and token factors.
+  while (!Queue.empty()) {
+    SDNode *ChainNext = Queue.pop_back_val();
+    if (!Visited.insert(ChainNext))
+      continue;
+
+    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+        return true;
+
+      if (!Visited.count(ChainLD->getChain().getNode()))
+        Queue.push_back(ChainLD->getChain().getNode());
+    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
+      for (const SDUse &O : ChainNext->ops())
+        if (!Visited.count(O.getNode()))
+          Queue.push_back(O.getNode());
+    } else
+      LoadRoots.insert(ChainNext);
+  }
+
+  // Second, search down the chain, starting from the top-level nodes recorded
+  // in the first phase. These top-level nodes are the nodes just above all
+  // loads and token factors. Starting with their uses, recursively look though
+  // all loads (just the chain uses) and token factors to find a consecutive
+  // load.
+  Visited.clear();
+  Queue.clear();
+
+  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
+       IE = LoadRoots.end(); I != IE; ++I) {
+    Queue.push_back(*I);
+       
+    while (!Queue.empty()) {
+      SDNode *LoadRoot = Queue.pop_back_val();
+      if (!Visited.insert(LoadRoot))
+        continue;
+
+      if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+          return true;
+
+      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
+           UE = LoadRoot->use_end(); UI != UE; ++UI)
+        if (((isa<LoadSDNode>(*UI) &&
+            cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+            UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
+          Queue.push_back(*UI);
+    }
+  }
+
+  return false;
+}
+
+SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  assert(Subtarget.useCRBits() &&
+         "Expecting to be tracking CR bits");
+  // If we're tracking CR bits, we need to be careful that we don't have:
+  //   trunc(binary-ops(zext(x), zext(y)))
+  // or
+  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
+  // such that we're unnecessarily moving things into GPRs when it would be
+  // better to keep them in CR bits.
+
+  // Note that trunc here can be an actual i1 trunc, or can be the effective
+  // truncation that comes from a setcc or select_cc.
+  if (N->getOpcode() == ISD::TRUNCATE &&
+      N->getValueType(0) != MVT::i1)
+    return SDValue();
+
+  if (N->getOperand(0).getValueType() != MVT::i32 &&
+      N->getOperand(0).getValueType() != MVT::i64)
+    return SDValue();
+
+  if (N->getOpcode() == ISD::SETCC ||
+      N->getOpcode() == ISD::SELECT_CC) {
+    // If we're looking at a comparison, then we need to make sure that the
+    // high bits (all except for the first) don't matter the result.
+    ISD::CondCode CC =
+      cast<CondCodeSDNode>(N->getOperand(
+        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
+    unsigned OpBits = N->getOperand(0).getValueSizeInBits();
+
+    if (ISD::isSignedIntSetCC(CC)) {
+      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
+          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
+        return SDValue();
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (!DAG.MaskedValueIsZero(N->getOperand(0),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
+          !DAG.MaskedValueIsZero(N->getOperand(1),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
+        return SDValue();
+    } else {
+      // This is neither a signed nor an unsigned comparison, just make sure
+      // that the high bits are equal.
+      APInt Op1Zero, Op1One;
+      APInt Op2Zero, Op2One;
+      DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
+      DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+
+      // We don't really care about what is known about the first bit (if
+      // anything), so clear it in all masks prior to comparing them.
+      Op1Zero.clearBit(0); Op1One.clearBit(0);
+      Op2Zero.clearBit(0); Op2One.clearBit(0);
+
+      if (Op1Zero != Op2Zero || Op1One != Op2One)
+        return SDValue();
+    }
+  }
+
+  // We now know that the higher-order bits are irrelevant, we just need to
+  // make sure that all of the intermediate operations are bit operations, and
+  // all inputs are extensions.
+  if (N->getOperand(0).getOpcode() != ISD::AND &&
+      N->getOperand(0).getOpcode() != ISD::OR  &&
+      N->getOperand(0).getOpcode() != ISD::XOR &&
+      N->getOperand(0).getOpcode() != ISD::SELECT &&
+      N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
+      N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
+      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
+    return SDValue();
+
+  if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
+      N->getOperand(1).getOpcode() != ISD::AND &&
+      N->getOperand(1).getOpcode() != ISD::OR  &&
+      N->getOperand(1).getOpcode() != ISD::XOR &&
+      N->getOperand(1).getOpcode() != ISD::SELECT &&
+      N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
+      N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
+      N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
+    return SDValue();
+
+  SmallVector<SDValue, 4> Inputs;
+  SmallVector<SDValue, 8> BinOps, PromOps;
+  SmallPtrSet<SDNode *, 16> Visited;
+
+  for (unsigned i = 0; i < 2; ++i) {
+    if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+          N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+          N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+        isa<ConstantSDNode>(N->getOperand(i)))
+      Inputs.push_back(N->getOperand(i));
+    else
+      BinOps.push_back(N->getOperand(i));
+
+    if (N->getOpcode() == ISD::TRUNCATE)
+      break;
+  }
+
+  // Visit all inputs, collect all binary operations (and, or, xor and
+  // select) that are all fed by extensions. 
+  while (!BinOps.empty()) {
+    SDValue BinOp = BinOps.back();
+    BinOps.pop_back();
+
+    if (!Visited.insert(BinOp.getNode()))
+      continue;
+
+    PromOps.push_back(BinOp);
+
+    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+      // The condition of the select is not promoted.
+      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+        continue;
+      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+        continue;
+
+      if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+            BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+            BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+           BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+          isa<ConstantSDNode>(BinOp.getOperand(i))) {
+        Inputs.push_back(BinOp.getOperand(i)); 
+      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::OR  ||
+                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
+                 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
+        BinOps.push_back(BinOp.getOperand(i));
+      } else {
+        // We have an input that is not an extension or another binary
+        // operation; we'll abort this transformation.
+        return SDValue();
+      }
+    }
+  }
+
+  // Make sure that this is a self-contained cluster of operations (which
+  // is not quite the same thing as saying that everything has only one
+  // use).
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+
+    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+                              UE = Inputs[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // Make sure that we're not going to promote the non-output-value
+      // operand(s) or SELECT or SELECT_CC.
+      // FIXME: Although we could sometimes handle this, and it does occur in
+      // practice that one of the condition inputs to the select is also one of
+      // the outputs, we currently can't deal with this.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == Inputs[i])
+          return SDValue();
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == Inputs[i] ||
+            User->getOperand(1) == Inputs[i])
+          return SDValue();
+      }
+    }
+  }
+
+  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+                              UE = PromOps[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // Make sure that we're not going to promote the non-output-value
+      // operand(s) or SELECT or SELECT_CC.
+      // FIXME: Although we could sometimes handle this, and it does occur in
+      // practice that one of the condition inputs to the select is also one of
+      // the outputs, we currently can't deal with this.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == PromOps[i])
+          return SDValue();
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == PromOps[i] ||
+            User->getOperand(1) == PromOps[i])
+          return SDValue();
+      }
+    }
+  }
+
+  // Replace all inputs with the extension operand.
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    // Constants may have users outside the cluster of to-be-promoted nodes,
+    // and so we need to replace those as we do the promotions.
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+    else
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); 
+  }
+
+  // Replace all operations (these are all the same, but have a different
+  // (i1) return type). DAG.getNode will validate that the types of
+  // a binary operator match, so go through the list in reverse so that
+  // we've likely promoted both operands first. Any intermediate truncations or
+  // extensions disappear.
+  while (!PromOps.empty()) {
+    SDValue PromOp = PromOps.back();
+    PromOps.pop_back();
+
+    if (PromOp.getOpcode() == ISD::TRUNCATE ||
+        PromOp.getOpcode() == ISD::SIGN_EXTEND ||
+        PromOp.getOpcode() == ISD::ZERO_EXTEND ||
+        PromOp.getOpcode() == ISD::ANY_EXTEND) {
+      if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
+          PromOp.getOperand(0).getValueType() != MVT::i1) {
+        // The operand is not yet ready (see comment below).
+        PromOps.insert(PromOps.begin(), PromOp);
+        continue;
+      }
+
+      SDValue RepValue = PromOp.getOperand(0);
+      if (isa<ConstantSDNode>(RepValue))
+        RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
+
+      DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
+      continue;
+    }
+
+    unsigned C;
+    switch (PromOp.getOpcode()) {
+    default:             C = 0; break;
+    case ISD::SELECT:    C = 1; break;
+    case ISD::SELECT_CC: C = 2; break;
+    }
+
+    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+         PromOp.getOperand(C).getValueType() != MVT::i1) ||
+        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+         PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
+      // The to-be-promoted operands of this node have not yet been
+      // promoted (this should be rare because we're going through the
+      // list backward, but if one of the operands has several users in
+      // this cluster of to-be-promoted nodes, it is possible).
+      PromOps.insert(PromOps.begin(), PromOp);
+      continue;
+    }
+
+    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+                                PromOp.getNode()->op_end());
+
+    // If there are any constant inputs, make sure they're replaced now.
+    for (unsigned i = 0; i < 2; ++i)
+      if (isa<ConstantSDNode>(Ops[C+i]))
+        Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
+
+    DAG.ReplaceAllUsesOfValueWith(PromOp,
+      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
+  }
+
+  // Now we're left with the initial truncation itself.
+  if (N->getOpcode() == ISD::TRUNCATE)
+    return N->getOperand(0);
+
+  // Otherwise, this is a comparison. The operands to be compared have just
+  // changed type (to i1), but everything else is the same.
+  return SDValue(N, 0);
+}
+
+SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  // If we're tracking CR bits, we need to be careful that we don't have:
+  //   zext(binary-ops(trunc(x), trunc(y)))
+  // or
+  //   zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
+  // such that we're unnecessarily moving things into CR bits that can more
+  // efficiently stay in GPRs. Note that if we're not certain that the high
+  // bits are set as required by the final extension, we still may need to do
+  // some masking to get the proper behavior.
+
+  // This same functionality is important on PPC64 when dealing with
+  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
+  // the return values of functions. Because it is so similar, it is handled
+  // here as well.
+
+  if (N->getValueType(0) != MVT::i32 &&
+      N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  if (!((N->getOperand(0).getValueType() == MVT::i1 &&
+        Subtarget.useCRBits()) ||
+       (N->getOperand(0).getValueType() == MVT::i32 &&
+        Subtarget.isPPC64())))
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::AND &&
+      N->getOperand(0).getOpcode() != ISD::OR  &&
+      N->getOperand(0).getOpcode() != ISD::XOR &&
+      N->getOperand(0).getOpcode() != ISD::SELECT &&
+      N->getOperand(0).getOpcode() != ISD::SELECT_CC)
+    return SDValue();
+
+  SmallVector<SDValue, 4> Inputs;
+  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
+  SmallPtrSet<SDNode *, 16> Visited;
+
+  // Visit all inputs, collect all binary operations (and, or, xor and
+  // select) that are all fed by truncations. 
+  while (!BinOps.empty()) {
+    SDValue BinOp = BinOps.back();
+    BinOps.pop_back();
+
+    if (!Visited.insert(BinOp.getNode()))
+      continue;
+
+    PromOps.push_back(BinOp);
+
+    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+      // The condition of the select is not promoted.
+      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+        continue;
+      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+        continue;
+
+      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+          isa<ConstantSDNode>(BinOp.getOperand(i))) {
+        Inputs.push_back(BinOp.getOperand(i)); 
+      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::OR  ||
+                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
+        BinOps.push_back(BinOp.getOperand(i));
+      } else {
+        // We have an input that is not a truncation or another binary
+        // operation; we'll abort this transformation.
+        return SDValue();
+      }
+    }
+  }
+
+  // Make sure that this is a self-contained cluster of operations (which
+  // is not quite the same thing as saying that everything has only one
+  // use).
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+
+    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+                              UE = Inputs[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // Make sure that we're not going to promote the non-output-value
+      // operand(s) or SELECT or SELECT_CC.
+      // FIXME: Although we could sometimes handle this, and it does occur in
+      // practice that one of the condition inputs to the select is also one of
+      // the outputs, we currently can't deal with this.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == Inputs[i])
+          return SDValue();
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == Inputs[i] ||
+            User->getOperand(1) == Inputs[i])
+          return SDValue();
+      }
+    }
+  }
+
+  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+                              UE = PromOps[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // Make sure that we're not going to promote the non-output-value
+      // operand(s) or SELECT or SELECT_CC.
+      // FIXME: Although we could sometimes handle this, and it does occur in
+      // practice that one of the condition inputs to the select is also one of
+      // the outputs, we currently can't deal with this.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == PromOps[i])
+          return SDValue();
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == PromOps[i] ||
+            User->getOperand(1) == PromOps[i])
+          return SDValue();
+      }
+    }
+  }
+
+  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
+  bool ReallyNeedsExt = false;
+  if (N->getOpcode() != ISD::ANY_EXTEND) {
+    // If all of the inputs are not already sign/zero extended, then
+    // we'll still need to do that at the end.
+    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+      if (isa<ConstantSDNode>(Inputs[i]))
+        continue;
+
+      unsigned OpBits =
+        Inputs[i].getOperand(0).getValueSizeInBits();
+      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
+
+      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
+           !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
+                                  APInt::getHighBitsSet(OpBits,
+                                                        OpBits-PromBits))) ||
+          (N->getOpcode() == ISD::SIGN_EXTEND &&
+           DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
+             (OpBits-(PromBits-1)))) {
+        ReallyNeedsExt = true;
+        break;
+      }
+    }
+  }
+
+  // Replace all inputs, either with the truncation operand, or a
+  // truncation or extension to the final output type.
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    // Constant inputs need to be replaced with the to-be-promoted nodes that
+    // use them because they might have users outside of the cluster of
+    // promoted nodes.
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+
+    SDValue InSrc = Inputs[i].getOperand(0);
+    if (Inputs[i].getValueType() == N->getValueType(0))
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
+    else if (N->getOpcode() == ISD::SIGN_EXTEND)
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+        DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
+    else if (N->getOpcode() == ISD::ZERO_EXTEND)
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+        DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
+    else
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+        DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
+  }
+
+  // Replace all operations (these are all the same, but have a different
+  // (promoted) return type). DAG.getNode will validate that the types of
+  // a binary operator match, so go through the list in reverse so that
+  // we've likely promoted both operands first.
+  while (!PromOps.empty()) {
+    SDValue PromOp = PromOps.back();
+    PromOps.pop_back();
+
+    unsigned C;
+    switch (PromOp.getOpcode()) {
+    default:             C = 0; break;
+    case ISD::SELECT:    C = 1; break;
+    case ISD::SELECT_CC: C = 2; break;
+    }
+
+    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+         PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
+        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+         PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
+      // The to-be-promoted operands of this node have not yet been
+      // promoted (this should be rare because we're going through the
+      // list backward, but if one of the operands has several users in
+      // this cluster of to-be-promoted nodes, it is possible).
+      PromOps.insert(PromOps.begin(), PromOp);
+      continue;
+    }
+
+    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+                                PromOp.getNode()->op_end());
+
+    // If this node has constant inputs, then they'll need to be promoted here.
+    for (unsigned i = 0; i < 2; ++i) {
+      if (!isa<ConstantSDNode>(Ops[C+i]))
+        continue;
+      if (Ops[C+i].getValueType() == N->getValueType(0))
+        continue;
+
+      if (N->getOpcode() == ISD::SIGN_EXTEND)
+        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+      else if (N->getOpcode() == ISD::ZERO_EXTEND)
+        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+      else
+        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+    }
+
+    DAG.ReplaceAllUsesOfValueWith(PromOp,
+      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
+  }
+
+  // Now we're left with the initial extension itself.
+  if (!ReallyNeedsExt)
+    return N->getOperand(0);
+
+  // To zero extend, just mask off everything except for the first bit (in the
+  // i1 case).
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
+                       DAG.getConstant(APInt::getLowBitsSet(
+                                         N->getValueSizeInBits(0), PromBits),
+                                       N->getValueType(0)));
+
+  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+         "Invalid extension type");
+  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0));
+  SDValue ShiftCst =
+    DAG.getConstant(N->getValueSizeInBits(0)-PromBits, ShiftAmountTy);
+  return DAG.getNode(ISD::SRA, dl, N->getValueType(0), 
+                     DAG.getNode(ISD::SHL, dl, N->getValueType(0),
+                                 N->getOperand(0), ShiftCst), ShiftCst);
+}
+
+SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  const TargetMachine &TM = getTargetMachine();
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default: break;
+  case PPCISD::SHL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue())   // 0 << V -> 0.
+        return N->getOperand(0);
+    }
+    break;
+  case PPCISD::SRL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue())   // 0 >>u V -> 0.
+        return N->getOperand(0);
+    }
+    break;
+  case PPCISD::SRA:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue() ||   //  0 >>s V -> 0.
+          C->isAllOnesValue())    // -1 >>s V -> -1.
+        return N->getOperand(0);
+    }
+    break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: 
+    return DAGCombineExtBoolTrunc(N, DCI);
+  case ISD::TRUNCATE:
+  case ISD::SETCC:
+  case ISD::SELECT_CC:
+    return DAGCombineTruncBoolExt(N, DCI);
+  case ISD::FDIV: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
+      SDValue RV =
+        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
+      if (RV.getNode()) {
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
+               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
+      SDValue RV =
+        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
+                                 DCI);
+      if (RV.getNode()) {
+        DCI.AddToWorklist(RV.getNode());
+        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
+                         N->getValueType(0), RV);
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
+               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
+      SDValue RV =
+        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
+                                 DCI);
+      if (RV.getNode()) {
+        DCI.AddToWorklist(RV.getNode());
+        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
+                         N->getValueType(0), RV,
+                         N->getOperand(1).getOperand(1));
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    }
+
+    SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
+    if (RV.getNode()) {
+      DCI.AddToWorklist(RV.getNode());
+      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                         N->getOperand(0), RV);
+    }
+
+    }
+    break;
+  case ISD::FSQRT: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
+    // reciprocal sqrt.
+    SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
+    if (RV.getNode()) {
+      DCI.AddToWorklist(RV.getNode());
+      RV = DAGCombineFastRecip(RV, DCI);
+      if (RV.getNode()) {
+        // Unfortunately, RV is now NaN if the input was exactly 0. Select out
+        // this case and force the answer to 0.
+
+        EVT VT = RV.getValueType();
+
+        SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
+        if (VT.isVector()) {
+          assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
+          Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
+        }
+
+        SDValue ZeroCmp =
+          DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
+                       N->getOperand(0), Zero, ISD::SETEQ);
+        DCI.AddToWorklist(ZeroCmp.getNode());
+        DCI.AddToWorklist(RV.getNode());
+
+        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
+                         ZeroCmp, Zero, RV);
+        return RV;
+      }
+    }
+
+    }
+    break;
+  case ISD::SINT_TO_FP:
+    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
+        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
+        // We allow the src/dst to be either f32/f64, but the intermediate
+        // type must be i64.
+        if (N->getOperand(0).getValueType() == MVT::i64 &&
+            N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
+          SDValue Val = N->getOperand(0).getOperand(0);
+          if (Val.getValueType() == MVT::f32) {
+            Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+            DCI.AddToWorklist(Val.getNode());
+          }
+
+          Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
+          DCI.AddToWorklist(Val.getNode());
+          Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
+          DCI.AddToWorklist(Val.getNode());
+          if (N->getValueType(0) == MVT::f32) {
+            Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
+                              DAG.getIntPtrConstant(0));
+            DCI.AddToWorklist(Val.getNode());
+          }
+          return Val;
+        } else if (N->getOperand(0).getValueType() == MVT::i32) {
+          // If the intermediate type is i32, we can avoid the load/store here
+          // too.
+        }
+      }
+    }
+    break;
+  case ISD::STORE:
+    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
+    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
+        !cast<StoreSDNode>(N)->isTruncatingStore() &&
+        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
+        N->getOperand(1).getValueType() == MVT::i32 &&
+        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
+      SDValue Val = N->getOperand(1).getOperand(0);
+      if (Val.getValueType() == MVT::f32) {
+        Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+        DCI.AddToWorklist(Val.getNode());
+      }
+      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
+      DCI.AddToWorklist(Val.getNode());
+
+      SDValue Ops[] = {
+        N->getOperand(0), Val, N->getOperand(2),
+        DAG.getValueType(N->getOperand(1).getValueType())
+      };
+
+      Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops,
+              cast<StoreSDNode>(N)->getMemoryVT(),
+              cast<StoreSDNode>(N)->getMemOperand());
+      DCI.AddToWorklist(Val.getNode());
+      return Val;
+    }
+
+    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
+    if (cast<StoreSDNode>(N)->isUnindexed() &&
+        N->getOperand(1).getOpcode() == ISD::BSWAP &&
+        N->getOperand(1).getNode()->hasOneUse() &&
+        (N->getOperand(1).getValueType() == MVT::i32 ||
+         N->getOperand(1).getValueType() == MVT::i16 ||
+         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
+          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
+          N->getOperand(1).getValueType() == MVT::i64))) {
+      SDValue BSwapOp = N->getOperand(1).getOperand(0);
+      // Do an any-extend to 32-bits if this is a half-word input.
+      if (BSwapOp.getValueType() == MVT::i16)
+        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+
+      SDValue Ops[] = {
+        N->getOperand(0), BSwapOp, N->getOperand(2),
+        DAG.getValueType(N->getOperand(1).getValueType())
+      };
+      return
+        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
+                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
+                                cast<StoreSDNode>(N)->getMemOperand());
+    }
+    break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT VT = LD->getValueType(0);
+    Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+    if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
+        TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+        (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+         VT == MVT::v4i32 || VT == MVT::v4f32) &&
+        LD->getAlignment() < ABIAlignment) {
+      // This is a type-legal unaligned Altivec load.
+      SDValue Chain = LD->getChain();
+      SDValue Ptr = LD->getBasePtr();
+      bool isLittleEndian = Subtarget.isLittleEndian();
+
+      // This implements the loading of unaligned vectors as described in
+      // the venerable Apple Velocity Engine overview. Specifically:
+      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+      //
+      // The general idea is to expand a sequence of one or more unaligned
+      // loads into an alignment-based permutation-control instruction (lvsl
+      // or lvsr), a series of regular vector loads (which always truncate
+      // their input address to an aligned address), and a series of
+      // permutations.  The results of these permutations are the requested
+      // loaded values.  The trick is that the last "extra" load is not taken
+      // from the address you might suspect (sizeof(vector) bytes after the
+      // last requested load), but rather sizeof(vector) - 1 bytes after the
+      // last requested vector. The point of this is to avoid a page fault if
+      // the base address happened to be aligned. This works because if the
+      // base address is aligned, then adding less than a full vector length
+      // will cause the last vector in the sequence to be (re)loaded.
+      // Otherwise, the next vector will be fetched as you might suspect was
+      // necessary.
+
+      // We might be able to reuse the permutation generation from
+      // a different base address offset from this one by an aligned amount.
+      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
+      // optimization later.
+      Intrinsic::ID Intr = (isLittleEndian ?
+                            Intrinsic::ppc_altivec_lvsr :
+                            Intrinsic::ppc_altivec_lvsl);
+      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
+
+      // Refine the alignment of the original load (a "new" load created here
+      // which was identical to the first except for the alignment would be
+      // merged with the existing node regardless).
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(LD->getPointerInfo(),
+                                LD->getMemOperand()->getFlags(),
+                                LD->getMemoryVT().getStoreSize(),
+                                ABIAlignment);
+      LD->refineAlignment(MMO);
+      SDValue BaseLoad = SDValue(LD, 0);
+
+      // Note that the value of IncOffset (which is provided to the next
+      // load's pointer info offset value, and thus used to calculate the
+      // alignment), and the value of IncValue (which is actually used to
+      // increment the pointer value) are different! This is because we
+      // require the next load to appear to be aligned, even though it
+      // is actually offset from the base pointer by a lesser amount.
+      int IncOffset = VT.getSizeInBits() / 8;
+      int IncValue = IncOffset;
+
+      // Walk (both up and down) the chain looking for another load at the real
+      // (aligned) offset (the alignment of the other load does not matter in
+      // this case). If found, then do not use the offset reduction trick, as
+      // that will prevent the loads from being later combined (as they would
+      // otherwise be duplicates).
+      if (!findConsecutiveLoad(LD, DAG))
+        --IncValue;
+
+      SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+
+      SDValue ExtraLoad =
+        DAG.getLoad(VT, dl, Chain, Ptr,
+                    LD->getPointerInfo().getWithOffset(IncOffset),
+                    LD->isVolatile(), LD->isNonTemporal(),
+                    LD->isInvariant(), ABIAlignment);
+
+      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+        BaseLoad.getValue(1), ExtraLoad.getValue(1));
+
+      if (BaseLoad.getValueType() != MVT::v4i32)
+        BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
+
+      if (ExtraLoad.getValueType() != MVT::v4i32)
+        ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
+
+      // Because vperm has a big-endian bias, we must reverse the order
+      // of the input vectors and complement the permute control vector
+      // when generating little endian code.  We have already handled the
+      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
+      // and ExtraLoad here.
+      SDValue Perm;
+      if (isLittleEndian)
+        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
+      else
+        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+
+      if (VT != MVT::v4i32)
+        Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
+
+      // Now we need to be really careful about how we update the users of the
+      // original load. We cannot just call DCI.CombineTo (or
+      // DAG.ReplaceAllUsesWith for that matter), because the load still has
+      // uses created here (the permutation for example) that need to stay.
+      SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+      while (UI != UE) {
+        SDUse &Use = UI.getUse();
+        SDNode *User = *UI;
+        // Note: BaseLoad is checked here because it might not be N, but a
+        // bitcast of N.
+        if (User == Perm.getNode() || User == BaseLoad.getNode() ||
+            User == TF.getNode() || Use.getResNo() > 1) {
+          ++UI;
+          continue;
+        }
+
+        SDValue To = Use.getResNo() ? TF : Perm;
+        ++UI;
+
+        SmallVector<SDValue, 8> Ops;
+        for (const SDUse &O : User->ops()) {
+          if (O == Use)
+            Ops.push_back(To);
+          else
+            Ops.push_back(O);
+        }
+
+        DAG.UpdateNodeOperands(User, Ops);
+      }
+
+      return SDValue(N, 0);
+    }
+    }
+    break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    bool isLittleEndian = Subtarget.isLittleEndian();
+    Intrinsic::ID Intr = (isLittleEndian ?
+                          Intrinsic::ppc_altivec_lvsr :
+                          Intrinsic::ppc_altivec_lvsl);
+    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
+        N->getOperand(1)->getOpcode() == ISD::ADD) {
+      SDValue Add = N->getOperand(1);
+
+      if (DAG.MaskedValueIsZero(Add->getOperand(1),
+            APInt::getAllOnesValue(4 /* 16 byte alignment */).zext(
+              Add.getValueType().getScalarType().getSizeInBits()))) {
+        SDNode *BasePtr = Add->getOperand(0).getNode();
+        for (SDNode::use_iterator UI = BasePtr->use_begin(),
+             UE = BasePtr->use_end(); UI != UE; ++UI) {
+          if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
+                Intr) {
+            // We've found another LVSL/LVSR, and this address is an aligned
+            // multiple of that one. The results will be the same, so use the
+            // one we've just found instead.
+
+            return SDValue(*UI, 0);
+          }
+        }
+      }
+    }
+    }
+
+    break;
+  case ISD::BSWAP:
+    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
+    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+        N->getOperand(0).hasOneUse() &&
+        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
+         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
+          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
+          N->getValueType(0) == MVT::i64))) {
+      SDValue Load = N->getOperand(0);
+      LoadSDNode *LD = cast<LoadSDNode>(Load);
+      // Create the byte-swapping load.
+      SDValue Ops[] = {
+        LD->getChain(),    // Chain
+        LD->getBasePtr(),  // Ptr
+        DAG.getValueType(N->getValueType(0)) // VT
+      };
+      SDValue BSLoad =
+        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
+                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+                                              MVT::i64 : MVT::i32, MVT::Other),
+                                Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+      // If this is an i16 load, insert the truncate.
+      SDValue ResVal = BSLoad;
+      if (N->getValueType(0) == MVT::i16)
+        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
+
+      // First, combine the bswap away.  This makes the value produced by the
+      // load dead.
+      DCI.CombineTo(N, ResVal);
+
+      // Next, combine the load away, we give it a bogus result value but a real
+      // chain result.  The result value is dead because the bswap is dead.
+      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+      // Return N so it doesn't get rechecked!
+      return SDValue(N, 0);
+    }
+
+    break;
+  case PPCISD::VCMP: {
+    // If a VCMPo node already exists with exactly the same operands as this
+    // node, use its result instead of this node (VCMPo computes both a CR6 and
+    // a normal output).
+    //
+    if (!N->getOperand(0).hasOneUse() &&
+        !N->getOperand(1).hasOneUse() &&
+        !N->getOperand(2).hasOneUse()) {
+
+      // Scan all of the users of the LHS, looking for VCMPo's that match.
+      SDNode *VCMPoNode = nullptr;
+
+      SDNode *LHSN = N->getOperand(0).getNode();
+      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
+           UI != E; ++UI)
+        if (UI->getOpcode() == PPCISD::VCMPo &&
+            UI->getOperand(1) == N->getOperand(1) &&
+            UI->getOperand(2) == N->getOperand(2) &&
+            UI->getOperand(0) == N->getOperand(0)) {
+          VCMPoNode = *UI;
+          break;
+        }
+
+      // If there is no VCMPo node, or if the flag value has a single use, don't
+      // transform this.
+      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+        break;
+
+      // Look at the (necessarily single) use of the flag value.  If it has a
+      // chain, this transformation is more complex.  Note that multiple things
+      // could use the value result, which we should ignore.
+      SDNode *FlagUser = nullptr;
+      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
+           FlagUser == nullptr; ++UI) {
+        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+        SDNode *User = *UI;
+        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
+            FlagUser = User;
+            break;
+          }
+        }
+      }
+
+      // If the user is a MFOCRF instruction, we know this is safe.
+      // Otherwise we give up for right now.
+      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
+        return SDValue(VCMPoNode, 0);
+    }
+    break;
+  }
+  case ISD::BRCOND: {
+    SDValue Cond = N->getOperand(1);
+    SDValue Target = N->getOperand(2);
+ 
+    if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero) {
+
+      // We now need to make the intrinsic dead (it cannot be instruction
+      // selected).
+      DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
+      assert(Cond.getNode()->hasOneUse() &&
+             "Counter decrement has more than one use");
+
+      return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
+                         N->getOperand(0), Target);
+    }
+  }
+  break;
+  case ISD::BR_CC: {
+    // If this is a branch on an altivec predicate comparison, lower this so
+    // that we don't have to do a MFOCRF: instead, branch directly on CR6.  This
+    // lowering is done pre-legalize, because the legalizer lowers the predicate
+    // compare down to code that is difficult to reassemble.
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+
+    // Sometimes the promoted value of the intrinsic is ANDed by some non-zero
+    // value. If so, pass-through the AND to get to the intrinsic.
+    if (LHS.getOpcode() == ISD::AND &&
+        LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+        isa<ConstantSDNode>(LHS.getOperand(1)) &&
+        !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()->
+          isZero())
+      LHS = LHS.getOperand(0);
+
+    if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+        isa<ConstantSDNode>(RHS)) {
+      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+             "Counter decrement comparison is not EQ or NE");
+
+      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+      bool isBDNZ = (CC == ISD::SETEQ && Val) ||
+                    (CC == ISD::SETNE && !Val);
+
+      // We now need to make the intrinsic dead (it cannot be instruction
+      // selected).
+      DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
+      assert(LHS.getNode()->hasOneUse() &&
+             "Counter decrement has more than one use");
+
+      return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
+                         N->getOperand(0), N->getOperand(4));
+    }
+
+    int CompareOpc;
+    bool isDot;
+
+    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+        getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
+      assert(isDot && "Can't compare against a vector result!");
+
+      // If this is a comparison against something other than 0/1, then we know
+      // that the condition is never/always true.
+      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+      if (Val != 0 && Val != 1) {
+        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
+          return N->getOperand(0);
+        // Always !=, turn it into an unconditional branch.
+        return DAG.getNode(ISD::BR, dl, MVT::Other,
+                           N->getOperand(0), N->getOperand(4));
+      }
+
+      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
+
+      // Create the PPCISD altivec 'dot' comparison node.
+      SDValue Ops[] = {
+        LHS.getOperand(2),  // LHS of compare
+        LHS.getOperand(3),  // RHS of compare
+        DAG.getConstant(CompareOpc, MVT::i32)
+      };
+      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
+      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+
+      // Unpack the result based on how the target uses it.
+      PPC::Predicate CompOpc;
+      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
+      default:  // Can't happen, don't crash on invalid number though.
+      case 0:   // Branch on the value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
+        break;
+      case 1:   // Branch on the inverted value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
+        break;
+      case 2:   // Branch on the value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
+        break;
+      case 3:   // Branch on the inverted value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
+        break;
+      }
+
+      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
+                         DAG.getConstant(CompOpc, MVT::i32),
+                         DAG.getRegister(PPC::CR6, MVT::i32),
+                         N->getOperand(4), CompNode.getValue(1));
+    }
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case PPCISD::LBRX: {
+    // lhbrx is known to have the top bits cleared out.
+    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
+      KnownZero = 0xFFFF0000;
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    default: break;
+    case Intrinsic::ppc_altivec_vcmpbfp_p:
+    case Intrinsic::ppc_altivec_vcmpeqfp_p:
+    case Intrinsic::ppc_altivec_vcmpequb_p:
+    case Intrinsic::ppc_altivec_vcmpequh_p:
+    case Intrinsic::ppc_altivec_vcmpequw_p:
+    case Intrinsic::ppc_altivec_vcmpgefp_p:
+    case Intrinsic::ppc_altivec_vcmpgtfp_p:
+    case Intrinsic::ppc_altivec_vcmpgtsb_p:
+    case Intrinsic::ppc_altivec_vcmpgtsh_p:
+    case Intrinsic::ppc_altivec_vcmpgtsw_p:
+    case Intrinsic::ppc_altivec_vcmpgtub_p:
+    case Intrinsic::ppc_altivec_vcmpgtuh_p:
+    case Intrinsic::ppc_altivec_vcmpgtuw_p:
+      KnownZero = ~1U;  // All bits but the low one are known to be zero.
+      break;
+    }
+  }
+  }
+}
+
+
+/// getConstraintType - Given a constraint, return the type of
+/// constraint it is for this target.
+PPCTargetLowering::ConstraintType
+PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    case 'Z':
+      // FIXME: While Z does indicate a memory constraint, it specifically
+      // indicates an r+r address (used in conjunction with the 'y' modifier
+      // in the replacement string). Currently, we're forcing the base
+      // register to be r0 in the asm printer (which is interpreted as zero)
+      // and forming the complete address in the second register. This is
+      // suboptimal.
+      return C_Memory;
+    }
+  } else if (Constraint == "wc") { // individual CR bits.
+    return C_RegisterClass;
+  } else if (Constraint == "wa" || Constraint == "wd" ||
+             Constraint == "wf" || Constraint == "ws") {
+    return C_RegisterClass; // VSX registers.
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+PPCTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+
+  // Look at the constraint type.
+  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
+    return CW_Register; // an individual CR bit.
+  else if ((StringRef(constraint) == "wa" ||
+            StringRef(constraint) == "wd" ||
+            StringRef(constraint) == "wf") &&
+           type->isVectorTy())
+    return CW_Register;
+  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
+    return CW_Register;
+
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'b':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f':
+    if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  case 'd':
+    if (type->isDoubleTy())
+      weight = CW_Register;
+    break;
+  case 'v':
+    if (type->isVectorTy())
+      weight = CW_Register;
+    break;
+  case 'y':
+    weight = CW_Register;
+    break;
+  case 'Z':
+    weight = CW_Memory;
+    break;
+  }
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+      if (VT == MVT::i64 && Subtarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
+      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
+    case 'r':   // R0-R31
+      if (VT == MVT::i64 && Subtarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RCRegClass);
+      return std::make_pair(0U, &PPC::GPRCRegClass);
+    case 'f':
+      if (VT == MVT::f32 || VT == MVT::i32)
+        return std::make_pair(0U, &PPC::F4RCRegClass);
+      if (VT == MVT::f64 || VT == MVT::i64)
+        return std::make_pair(0U, &PPC::F8RCRegClass);
+      break;
+    case 'v':
+      return std::make_pair(0U, &PPC::VRRCRegClass);
+    case 'y':   // crrc
+      return std::make_pair(0U, &PPC::CRRCRegClass);
+    }
+  } else if (Constraint == "wc") { // an individual CR bit.
+    return std::make_pair(0U, &PPC::CRBITRCRegClass);
+  } else if (Constraint == "wa" || Constraint == "wd" ||
+             Constraint == "wf") {
+    return std::make_pair(0U, &PPC::VSRCRegClass);
+  } else if (Constraint == "ws") {
+    return std::make_pair(0U, &PPC::VSFRCRegClass);
+  }
+
+  std::pair<unsigned, const TargetRegisterClass*> R =
+    TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
+  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
+  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
+  // register.
+  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
+  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
+  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
+      PPC::GPRCRegClass.contains(R.first)) {
+    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    return std::make_pair(TRI->getMatchingSuperReg(R.first,
+                            PPC::sub_32, &PPC::G8RCRegClass),
+                          &PPC::G8RCRegClass);
+  }
+
+  return R;
+}
+
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return; // Must be an immediate to match.
+    unsigned Value = CST->getZExtValue();
+    switch (Letter) {
+    default: llvm_unreachable("Unknown constraint letter!");
+    case 'I':  // "I" is a signed 16-bit constant.
+      if ((short)Value == (int)Value)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
+      if ((short)Value == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
+      if ((Value >> 16) == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'M':  // "M" is a constant that is greater than 31.
+      if (Value > 31)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'N':  // "N" is a positive constant that is an exact power of two.
+      if ((int)Value > 0 && isPowerOf2_32(Value))
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'O':  // "O" is the constant zero.
+      if (Value == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
+      if ((short)-Value == (int)-Value)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    }
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // Handle standard constraint letters.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  // FIXME: PPC does not allow r+i addressing modes for vectors!
+
+  // PPC allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // PPC only support r+r,
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:
+    // No other scales are supported.
+    return false;
+  }
+
+  return true;
+}
+
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  // Make sure the function does not optimize away the store of the RA to
+  // the stack.
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setLRStoreRequired();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI = Subtarget.isDarwinABI();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+
+      DAG.getConstant(PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI),
+                      isPPC64? MVT::i64 : MVT::i32);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Just load the return address off the stack.
+  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  // Naked functions never have a frame pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned FrameReg;
+  if (MF.getFunction()->getAttributes().hasAttribute(
+        AttributeSet::FunctionIndex, Attribute::Naked))
+    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
+  else
+    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
+
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
+                                         PtrVT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
+                            FrameAddr, MachinePointerInfo(), false, false,
+                            false, 0);
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI = Subtarget.isDarwinABI();
+
+  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
+      (!isPPC64 && VT != MVT::i32))
+    report_fatal_error("Invalid register global variable type");
+
+  bool is64Bit = isPPC64 && VT == MVT::i64;
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+                   .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2))
+                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
+                                  (is64Bit ? PPC::X13 : PPC::R13))
+                   .Default(0);
+
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The PowerPC target isn't yet aware of offsets.
+  return false;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  if (Subtarget.isPPC64()) {
+    return MVT::i64;
+  } else {
+    return MVT::i32;
+  }
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0 || BitSize > 64)
+    return false;
+  return true;
+}
+
+bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 == 64 && NumBits2 == 32;
+}
+
+bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 == 64 && NumBits2 == 32;
+}
+
+bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return isInt<16>(Imm) || isUInt<16>(Imm);
+}
+
+bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  return isInt<16>(Imm) || isUInt<16>(Imm);
+}
+
+bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                      unsigned,
+                                                      bool *Fast) const {
+  if (DisablePPCUnaligned)
+    return false;
+
+  // PowerPC supports unaligned memory access for simple non-vector types.
+  // Although accessing unaligned addresses is not as efficient as accessing
+  // aligned addresses, it is generally more efficient than manual expansion,
+  // and generally only traps for software emulation when crossing page
+  // boundaries.
+
+  if (!VT.isSimple())
+    return false;
+
+  if (VT.getSimpleVT().isVector()) {
+    if (Subtarget.hasVSX()) {
+      if (VT != MVT::v2f64 && VT != MVT::v2i64)
+        return false;
+    } else {
+      return false;
+    }
+  }
+
+  if (VT == MVT::ppcf128)
+    return false;
+
+  if (Fast)
+    *Fast = true;
+
+  return true;
+}
+
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool
+PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
+                     EVT VT , unsigned DefinedValues) const {
+  if (VT == MVT::v2i64)
+    return false;
+
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
+Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
+  if (DisableILPPref || Subtarget.enableMachineScheduler())
+    return TargetLowering::getSchedulingPreference(N);
+
+  return Sched::ILP;
+}
+
+// Create a fast isel object.
+FastISel *
+PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                  const TargetLibraryInfo *LibInfo) const {
+  return PPC::createFastISel(FuncInfo, LibInfo);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 0000000..74d3c65
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,717 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace PPCISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// FSEL - Traditional three-operand fsel node.
+      ///
+      FSEL,
+
+      /// FCFID - The FCFID instruction, taking an f64 operand and producing
+      /// and f64 value containing the FP representation of the integer that
+      /// was temporarily in the f64 operand.
+      FCFID,
+
+      /// Newer FCFID[US] integer-to-floating-point conversion instructions for
+      /// unsigned integers and single-precision outputs.
+      FCFIDU, FCFIDS, FCFIDUS,
+
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+      /// operand, producing an f64 value containing the integer representation
+      /// of that FP value.
+      FCTIDZ, FCTIWZ,
+
+      /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
+      /// unsigned integers.
+      FCTIDUZ, FCTIWUZ,
+
+      /// Reciprocal estimate instructions (unary FP ops).
+      FRE, FRSQRTE,
+
+      // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+      // three v4f32 operands and producing a v4f32 result.
+      VMADDFP, VNMSUBFP,
+
+      /// VPERM - The PPC VPERM Instruction.
+      ///
+      VPERM,
+
+      /// Hi/Lo - These represent the high and low 16-bit parts of a global
+      /// address respectively.  These nodes have two operands, the first of
+      /// which must be a TargetGlobalAddress, and the second of which must be a
+      /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
+      /// though these are usually folded into other nodes.
+      Hi, Lo,
+
+      TOC_ENTRY,
+
+      /// The following two target-specific nodes are used for calls through
+      /// function pointers in the 64-bit SVR4 ABI.
+
+      /// Like a regular LOAD but additionally taking/producing a flag.
+      LOAD,
+
+      /// Like LOAD (taking/producing a flag), but using r2 as hard-coded
+      /// destination.
+      LOAD_TOC,
+
+      /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an allocation on the stack.
+      DYNALLOC,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+      /// shift amounts.  These nodes are generated by the multi-precision shift
+      /// code.
+      SRL, SRA, SHL,
+
+      /// CALL - A direct function call.
+      /// CALL_NOP is a call with the special NOP which follows 64-bit
+      /// SVR4 calls.
+      CALL, CALL_NOP,
+
+      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+      /// MTCTR instruction.
+      MTCTR,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+      /// BCTRL instruction.
+      BCTRL,
+
+      /// Return with a flag operand, matched by 'blr'
+      RET_FLAG,
+
+      /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+      /// This copies the bits corresponding to the specified CRREG into the
+      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
+      MFOCRF,
+
+      // FIXME: Remove these once the ANDI glue bug is fixed:
+      /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
+      /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
+      /// implement truncation of i32 or i64 to i1.
+      ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+      /// instructions.  For lack of better number, we use the opcode number
+      /// encoding for the OPC field to identify the compare.  For example, 838
+      /// is VCMPGTSH.
+      VCMP,
+
+      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+      /// altivec VCMP*o instructions.  For lack of better number, we use the
+      /// opcode number encoding for the OPC field to identify the compare.  For
+      /// example, 838 is VCMPGTSH.
+      VCMPo,
+
+      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
+      /// condition register to branch on, OPC is the branch opcode to use (e.g.
+      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH,
+
+      /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+      /// loops.
+      BDNZ, BDZ,
+
+      /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
+      /// towards zero.  Used only as part of the long double-to-int
+      /// conversion sequence.
+      FADDRTZ,
+
+      /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
+      MFFS,
+
+      /// LARX = This corresponds to PPC l{w|d}arx instrcution: load and
+      /// reserve indexed. This is used to implement atomic operations.
+      LARX,
+
+      /// STCX = This corresponds to PPC stcx. instrcution: store conditional
+      /// indexed. This is used to implement atomic operations.
+      STCX,
+
+      /// TC_RETURN - A tail call return.
+      ///   operand #0 chain
+      ///   operand #1 callee (register or absolute)
+      ///   operand #2 stack adjustment
+      ///   operand #3 optional in flag
+      TC_RETURN,
+
+      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+      CR6SET,
+      CR6UNSET,
+
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
+      /// on PPC32.
+      PPC32_GOT,
+
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+      /// local dynamic TLS  on PPC32.
+      PPC32_PICGOT,
+
+      /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+      /// TLS model, produces an ADDIS8 instruction that adds the GOT
+      /// base to sym\@got\@tprel\@ha.
+      ADDIS_GOT_TPREL_HA,
+
+      /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
+      /// TLS model, produces a LD instruction with base register G8RReg
+      /// and offset sym\@got\@tprel\@l.  This completes the addition that
+      /// finds the offset of "sym" relative to the thread pointer.
+      LD_GOT_TPREL_L,
+
+      /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
+      /// model, produces an ADD instruction that adds the contents of
+      /// G8RReg to the thread pointer.  Symbol contains a relocation
+      /// sym\@tls which is to be replaced by the thread pointer and
+      /// identifies to the linker that the instruction is part of a
+      /// TLS sequence.
+      ADD_TLS,
+
+      /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym\@got\@tlsgd\@ha.
+      ADDIS_TLSGD_HA,
+
+      /// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@tlsgd\@l.
+      ADDI_TLSGD_L,
+
+      /// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsgd).
+      GET_TLS_ADDR,
+
+      /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym\@got\@tlsld\@ha.
+      ADDIS_TLSLD_HA,
+
+      /// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@tlsld\@l.
+      ADDI_TLSLD_L,
+
+      /// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsld).
+      GET_TLSLD_ADDR,
+
+      /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the
+      /// local-dynamic TLS model, produces an ADDIS8 instruction
+      /// that adds X3 to sym\@dtprel\@ha. The Chain operand is needed
+      /// to tie this in place following a copy to %X3 from the result
+      /// of a GET_TLSLD_ADDR.
+      ADDIS_DTPREL_HA,
+
+      /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@dtprel\@l.
+      ADDI_DTPREL_L,
+
+      /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+      /// during instruction selection to optimize a BUILD_VECTOR into
+      /// operations on splats.  This is necessary to avoid losing these
+      /// optimizations due to constant folding.
+      VADD_SPLAT,
+
+      /// CHAIN = SC CHAIN, Imm128 - System call.  The 7-bit unsigned
+      /// operand identifies the operating system entry point.
+      SC,
+
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
+      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
+      /// i32.
+      STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
+      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
+      /// or i32.
+      LBRX,
+
+      /// STFIWX - The STFIWX instruction.  The first operand is an input token
+      /// chain, then an f64 value to store, then an address to store it to.
+      STFIWX,
+
+      /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
+      /// load which sign-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWAX,
+
+      /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
+      /// load which zero-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWZX,
+
+      /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model,
+      /// produces an ADDIS8 instruction that adds the TOC base register to
+      /// sym\@toc\@ha.
+      ADDIS_TOC_HA,
+
+      /// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model,
+      /// produces a LD instruction with base register G8RReg and offset
+      /// sym\@toc\@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+      LD_TOC_L,
+
+      /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
+      /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
+      /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+      ADDI_TOC_L
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace PPC {
+    /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUHUM instruction.
+    bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                              SelectionDAG &DAG);
+
+    /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUWUM instruction.
+    bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                              SelectionDAG &DAG);
+
+    /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            unsigned ShuffleKind, SelectionDAG &DAG);
+
+    /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            unsigned ShuffleKind, SelectionDAG &DAG);
+
+    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
+    /// shift amount, otherwise return -1.
+    int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                            SelectionDAG &DAG);
+
+    /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a splat of a single element that is suitable for input to
+    /// VSPLTB/VSPLTH/VSPLTW.
+    bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+    /// isAllNegativeZeroVector - Returns true if all elements of build_vector
+    /// are -0.0.
+    bool isAllNegativeZeroVector(SDNode *N);
+
+    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+
+    /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+    /// formed by using a vspltis[bhw] instruction of the specified element
+    /// size, return the constant being splatted.  The ByteSize field indicates
+    /// the number of bytes of each element [124] -> [bhw].
+    SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+  }
+
+  class PPCSubtarget;
+  class PPCTargetLowering : public TargetLowering {
+    const PPCSubtarget &Subtarget;
+
+  public:
+    explicit PPCTargetLowering(PPCTargetMachine &TM);
+
+    /// getTargetNodeName() - This method returns the name of a target specific
+    /// DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                   SDValue &Offset,
+                                   ISD::MemIndexedMode &AM,
+                                   SelectionDAG &DAG) const override;
+
+    /// SelectAddressRegReg - Given the specified addressed, check to see if it
+    /// can be represented as an indexed [r+r] operation.  Returns false if it
+    /// can be more efficiently represented with [r+imm].
+    bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+                             SelectionDAG &DAG) const;
+
+    /// SelectAddressRegImm - Returns true if the address N can be represented
+    /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+    /// is not better represented as reg+reg.  If Aligned is true, only accept
+    /// displacements suitable for STD and friends, i.e. multiples of 4.
+    bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+                             SelectionDAG &DAG, bool Aligned) const;
+
+    /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+                                 SelectionDAG &DAG) const;
+
+    Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const override;
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+                                        MachineBasicBlock *MBB, bool is64Bit,
+                                        unsigned BinOpcode) const;
+    MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+                                                MachineBasicBlock *MBB,
+                                            bool is8bit, unsigned Opcode) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) const;
+
+    ConstraintType
+    getConstraintType(const std::string &Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const override;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT VT) const override;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area.  This is the actual
+    /// alignment, not its logarithm.
+    unsigned getByValTypeAlignment(Type *Ty) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops.
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On PPC it's free to truncate a i64 value in
+    /// register X1 to i32 by referencing its sub-register R1.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero that means it's safe to destination
+    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+    /// means there isn't a need to check it against alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                        MachineFunction &MF) const override;
+
+    /// Is unaligned memory access allowed for the given type, and is it fast
+    /// relative to software emulation.
+    bool allowsUnalignedMemoryAccesses(EVT VT,
+                                       unsigned AddrSpace,
+                                       bool *Fast = nullptr) const override;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    // Should we expand the build vector with shuffles?
+    bool
+    shouldExpandBuildVectorWithShuffles(EVT VT,
+                                        unsigned DefinedValues) const override;
+
+    /// createFastISel - This method returns a target-specific FastISel object,
+    /// or null if the target does not support "fast" instruction selection.
+    FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                             const TargetLibraryInfo *LibInfo) const override;
+
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+      Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+      // We support any array type as "consecutive" block in the parameter
+      // save area.  The element type defines the alignment requirement and
+      // whether the argument should go in GPRs, FPRs, or VRs if available.
+      //
+      // Note that clang uses this capability both to implement the ELFv2
+      // homogeneous float/vector aggregate ABI, and to avoid having to use
+      // "byval" when passing aggregates that might fully fit in registers.
+      return Ty->isArrayTy();
+    }
+
+  private:
+    SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
+    SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
+
+    bool
+    IsEligibleForTailCallOptimization(SDValue Callee,
+                                      CallingConv::ID CalleeCC,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SelectionDAG& DAG) const;
+
+    SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+                                         int SPDiff,
+                                         SDValue Chain,
+                                         SDValue &LROpOut,
+                                         SDValue &FPOpOut,
+                                         bool isDarwinABI,
+                                         SDLoc dl) const;
+
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                         const PPCSubtarget &Subtarget) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG,
+                       const PPCSubtarget &Subtarget) const;
+    SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG,
+                        const PPCSubtarget &Subtarget) const;
+    SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                const PPCSubtarget &Subtarget) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) const;
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const;
+    SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            SDLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall,
+                       bool isVarArg,
+                       SelectionDAG &DAG,
+                       SmallVector<std::pair<unsigned, SDValue>, 8>
+                         &RegsToPass,
+                       SDValue InFlag, SDValue Chain,
+                       SDValue &Callee,
+                       int SPDiff, unsigned NumBytes,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                   bool isVarArg,
+                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                   LLVMContext &Context) const override;
+
+    SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  SDLoc dl, SelectionDAG &DAG) const override;
+
+    SDValue
+      extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
+                        SDValue ArgVal, SDLoc dl) const;
+
+    SDValue
+      LowerFormalArguments_Darwin(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  SDLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerFormalArguments_64SVR4(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  SDLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerFormalArguments_32SVR4(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  SDLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                 SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+                                 SelectionDAG &DAG, SDLoc dl) const;
+
+    SDValue
+      LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv,
+                       bool isVarArg, bool isTailCall,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<SDValue> &OutVals,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       SDLoc dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv,
+                       bool isVarArg, bool isTailCall,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<SDValue> &OutVals,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       SDLoc dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+    LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+                     bool isVarArg, bool isTailCall,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     const SmallVectorImpl<SDValue> &OutVals,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     SDLoc dl, SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
+
+    CCAssignFn *useFastISelCCs(unsigned Flag) const;
+  };
+
+  namespace PPC {
+    FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                             const TargetLibraryInfo *LibInfo);
+  }
+
+  bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                  CCValAssign::LocInfo &LocInfo,
+                                  ISD::ArgFlagsTy &ArgFlags,
+                                  CCState &State);
+
+  bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State);
+
+  bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                           MVT &LocVT,
+                                           CCValAssign::LocInfo &LocInfo,
+                                           ISD::ArgFlagsTy &ArgFlags,
+                                           CCState &State);
+}
+
+#endif   // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 0000000..9ed384f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,1137 @@
+//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions.  These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def u16imm64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<16>";
+}
+def s17imm64 : Operand<i64> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def tocentry : Operand<iPTR> {
+  let MIOperandInfo = (ops i64imm:$imm);
+}
+def tlsreg : Operand<i64> {
+  let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd : Operand<i64> {}
+def tlscall : Operand<i64> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+  // Transformation function: 63 - imm
+  return getI32Imm(63 - N->getZExtValue());
+}]>;
+
+def SRL64 : SDNodeXForm<imm, [{
+  // Transformation function: 64 - imm
+  return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue()) : getI32Imm(0);
+}]>;
+
+def HI32_48 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 32));
+}]>;
+
+def HI48_64 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 48));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
+    def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                             []>,
+        Requires<[In64BitMode]>;
+    def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+                              "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+                              []>,
+        Requires<[In64BitMode]>;
+
+    def BCCTR8  : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+                               "bcctr 12, $bi, 0", IIC_BrB, []>,
+        Requires<[In64BitMode]>;
+    def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+                               "bcctr 4, $bi, 0", IIC_BrB, []>,
+        Requires<[In64BitMode]>;
+  }
+}
+
+let Defs = [LR8] in
+  def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
+                    PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let Defs = [CTR8], Uses = [CTR8] in {
+    def BDZ8  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdz $dst">;
+    def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdnz $dst">;
+  }
+
+  let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in {
+    def BDZLR8  : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+                              "bdzlr", IIC_BrB, []>;
+    def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+                              "bdnzlr", IIC_BrB, []>;
+  }
+}
+
+
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL8  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                     "bl $func", IIC_BrB, []>;  // See Pat patterns below.
+
+    def BL8_TLS  : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+                         "bl $func", IIC_BrB, []>;
+
+    def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+                     "bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>;
+  }
+  let Uses = [RM], isCodeGenOnly = 1 in {
+    def BL8_NOP  : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                             (outs), (ins calltarget:$func),
+                             "bl $func\n\tnop", IIC_BrB, []>;
+
+    def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                  (outs), (ins tlscall:$func),
+                                  "bl $func\n\tnop", IIC_BrB, []>;
+
+    def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
+                             (outs), (ins abscalltarget:$func),
+                             "bla $func\n\tnop", IIC_BrB,
+                             [(PPCcall_nop (i64 imm:$func))]>;
+  }
+  let Uses = [CTR8, RM] in {
+    def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                              "bctrl", IIC_BrB, [(PPCbctrl)]>,
+                 Requires<[In64BitMode]>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+                                 "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+                                 []>,
+          Requires<[In64BitMode]>;
+
+      def BCCTRL8  : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+                                  "bcctrl 12, $bi, 0", IIC_BrB, []>,
+          Requires<[In64BitMode]>;
+      def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+                                  "bcctrl 4, $bi, 0", IIC_BrB, []>,
+          Requires<[In64BitMode]>;
+    }
+  }
+}
+} // Interpretation64Bit
+
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let Interpretation64Bit = 1, isAsmParserOnly = 1 in
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
+def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+                     "bl $func", IIC_BrB, []>;
+
+// Calls
+def : Pat<(PPCcall (i64 tglobaladdr:$dst)),
+          (BL8 tglobaladdr:$dst)>;
+def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)),
+          (BL8_NOP tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall (i64 texternalsym:$dst)),
+          (BL8 texternalsym:$dst)>;
+def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
+          (BL8_NOP texternalsym:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
+      [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_SUB_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
+      [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_OR_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
+      [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_XOR_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
+      [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_AND_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64",
+      [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_NAND_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
+      [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
+      [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+
+    def ATOMIC_SWAP_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
+      [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+def LDARX : XForm_1<31,  84, (outs g8rc:$rD), (ins memrr:$ptr),
+                   "ldarx $rD, $ptr", IIC_LdStLDARX,
+                   [(set i64:$rD, (PPClarx xoaddr:$ptr))]>;
+
+let Defs = [CR0] in
+def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stdcx. $rS, $dst", IIC_LdStSTDCX,
+                   [(PPCstcx i64:$rS, xoaddr:$dst)]>,
+                   isDOT;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi8 :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset),
+                 "#TC_RETURNd8 $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+                 "#TC_RETURNa8 $func $offset",
+                 [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
+                 "#TC_RETURNr8 $dst $offset",
+                 []>;
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                             []>,
+    Requires<[In64BitMode]>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB8   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", IIC_BrB,
+                  []>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA8   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
+                  "ba $dst", IIC_BrB,
+                  []>;
+} // Interpretation64Bit
+
+def : Pat<(PPCtc_return (i64 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
+          (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+
+
+// 64-bit CR instructions
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let neverHasSideEffects = 1 in {
+def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
+                        "mtocrf $FXM, $ST", IIC_BrMCRX>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
+                      "mtcrf $FXM, $rS", IIC_BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
+def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
+                        "mfocrf $rT, $FXM", IIC_SprMFCRF>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
+                     "mfcr $rT", IIC_SprMFCR>,
+                     PPC970_MicroCode, PPC970_Unit_CRU;
+} // neverHasSideEffects = 1
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [CTR8] in
+  def EH_SjLj_SetJmp64  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in
+  def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(PPCeh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+let Uses = [CTR8] in {
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins),
+                           "mfctr $rT", IIC_SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+                           "mtctr $rS", IIC_SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let hasSideEffects = 1, Defs = [CTR8] in {
+let Pattern = [(int_ppc_mtctr i64:$rS)] in
+def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+                               "mtctr $rS", IIC_SprMTSPR>,
+                 PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Pattern = [(set i64:$rT, readcyclecounter)] in
+def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
+                          "mfspr $rT, 268", IIC_SprMFTB>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+// Note that encoding mftb using mfspr is now the preferred form,
+// and has been since at least ISA v2.03. The mftb instruction has
+// now been phased out. Using mfspr, however, is known not to work on
+// the POWER3.
+
+let Defs = [X1], Uses = [X1] in
+def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
+                       [(set i64:$result,
+                             (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
+
+let Defs = [LR8] in {
+def MTLR8  : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
+                           "mtlr $rS", IIC_SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR8] in {
+def MFLR8  : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
+                           "mflr $rT", IIC_SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+} // Interpretation64Bit
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let Interpretation64Bit = 1 in {
+let neverHasSideEffects = 1 in {
+let isCodeGenOnly = 1 in {
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+def LI8  : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
+                      "li $rD, $imm", IIC_IntSimple,
+                      [(set i64:$rD, imm64SExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm),
+                      "lis $rD, $imm", IIC_IntSimple,
+                      [(set i64:$rD, imm16ShiftedSExt:$imm)]>;
+}
+
+// Logical ops.
+let isCommutable = 1 in {
+defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "nand", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>;
+defm AND8 : XForm_6r<31,  28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "and", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (and i64:$rS, i64:$rB))]>;
+} // isCommutable
+defm ANDC8: XForm_6r<31,  60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "andc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
+defm OR8  : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "or", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (or i64:$rS, i64:$rB))]>;
+defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "nor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>;
+} // isCommutable
+defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "orc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
+defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "eqv", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>;
+defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "xor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (xor i64:$rS, i64:$rB))]>;
+} // let isCommutable = 1
+
+// Logical ops with immediate.
+let Defs = [CR0] in {
+def ANDIo8  : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "andi. $dst, $src1, $src2", IIC_IntGeneral,
+                      [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
+                      isDOT;
+def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                     "andis. $dst, $src1, $src2", IIC_IntGeneral,
+                    [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
+                     isDOT;
+}
+def ORI8    : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "ori $dst, $src1, $src2", IIC_IntSimple,
+                      [(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>;
+def ORIS8   : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "oris $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8   : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "xori $dst, $src1, $src2", IIC_IntSimple,
+                      [(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>;
+def XORIS8  : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "xoris $dst, $src1, $src2", IIC_IntSimple,
+                   [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
+
+let isCommutable = 1 in
+defm ADD8  : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "add", "$rT, $rA, $rB", IIC_IntSimple,
+                       [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
+// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
+// initial-exec thread-local storage model.
+def ADD8TLS  : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
+                        "add $rT, $rA, $rB", IIC_IntSimple,
+                        [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
+                     
+let isCommutable = 1 in
+defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                        "addc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i64:$rT, (addc i64:$rA, i64:$rB))]>,
+                        PPC970_DGroup_Cracked;
+
+let Defs = [CARRY] in
+def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                     "addic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
+def ADDI8  : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
+                     "addi $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm),
+                     "addis $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
+
+let Defs = [CARRY] in {
+def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                     "subfic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
+defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                        "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
+                        PPC970_DGroup_Cracked;
+}
+defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+                       [(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
+defm NEG8    : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                        "neg", "$rT, $rA", IIC_IntSimple,
+                        [(set i64:$rT, (ineg i64:$rA))]>;
+let Uses = [CARRY] in {
+let isCommutable = 1 in
+defm ADDE8   : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "adde", "$rT, $rA, $rB", IIC_IntGeneral,
+                          [(set i64:$rT, (adde i64:$rA, i64:$rB))]>;
+defm ADDME8  : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "addme", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (adde i64:$rA, -1))]>;
+defm ADDZE8  : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "addze", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (adde i64:$rA, 0))]>;
+defm SUBFE8  : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+                          [(set i64:$rT, (sube i64:$rB, i64:$rA))]>;
+defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "subfme", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (sube -1, i64:$rA))]>;
+defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "subfze", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (sube 0, i64:$rA))]>;
+}
+} // isCodeGenOnly
+
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let isAsmParserOnly = 1 in
+def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
+                        "add $rT, $rA, $rB", IIC_IntSimple, []>;
+
+let isCommutable = 1 in {
+defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
+                       [(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>;
+defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU,
+                       [(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>;
+} // isCommutable
+}
+} // Interpretation64Bit
+
+let isCompare = 1, neverHasSideEffects = 1 in {
+  def CMPD   : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
+                            "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+  def CMPLD  : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
+                            "cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+  def CMPDI  : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm),
+                           "cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64;
+  def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                           "cmpldi $dst, $src1, $src2",
+                           IIC_IntCompare>, isPPC64;
+}
+
+let neverHasSideEffects = 1 in {
+defm SLD  : XForm_6r<31,  27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+                     "sld", "$rA, $rS, $rB", IIC_IntRotateD,
+                     [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
+defm SRD  : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+                     "srd", "$rA, $rS, $rB", IIC_IntRotateD,
+                     [(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64;
+defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+                      "srad", "$rA, $rS, $rB", IIC_IntRotateD,
+                      [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in { 
+defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "extsb", "$rA, $rS", IIC_IntSimple,
+                        [(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
+defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "extsh", "$rA, $rS", IIC_IntSimple,
+                        [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
+} // Interpretation64Bit
+
+// For fast-isel:
+let isCodeGenOnly = 1 in {
+def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsb $rA, $rS", IIC_IntSimple, []>, isPPC64;
+def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsh $rA, $rS", IIC_IntSimple, []>, isPPC64;
+} // isCodeGenOnly for fast-isel
+
+defm EXTSW  : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "extsw", "$rA, $rS", IIC_IntSimple,
+                        [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
+                             "extsw", "$rA, $rS", IIC_IntSimple,
+                             [(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+
+defm SRADI  : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
+                         "sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
+                         [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "cntlzd", "$rA, $rS", IIC_IntGeneral,
+                        [(set i64:$rA, (ctlz i64:$rS))]>;
+def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
+                       "popcntd $rA, $rS", IIC_IntGeneral,
+                       [(set i64:$rA, (ctpop i64:$rS))]>;
+
+// popcntw also does a population count on the high 32 bits (storing the
+// results in the high 32-bits of the output). We'll ignore that here (which is
+// safe because we never separately use the high part of the 64-bit registers).
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
+                       "popcntw $rA, $rS", IIC_IntGeneral,
+                       [(set i32:$rA, (ctpop i32:$rS))]>;
+
+defm DIVD  : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "divd", "$rT, $rA, $rB", IIC_IntDivD,
+                       [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64,
+                       PPC970_DGroup_First, PPC970_DGroup_Cracked;
+defm DIVDU : XOForm_1r<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "divdu", "$rT, $rA, $rB", IIC_IntDivD,
+                       [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64,
+                       PPC970_DGroup_First, PPC970_DGroup_Cracked;
+let isCommutable = 1 in
+defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "mulld", "$rT, $rA, $rB", IIC_IntMulHD,
+                       [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                       "mulli $rD, $rA, $imm", IIC_IntMulLI,
+                       [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
+}
+
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
+                        (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64, RegConstraint<"$rSi = $rA">,
+                        NoEncode<"$rSi">;
+}
+
+// Rotate instructions.
+defm RLDCL  : MDSForm_1r<30, 8,
+                        (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+                        "rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
+                        []>, isPPC64;
+defm RLDCR  : MDSForm_1r<30, 9,
+                        (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+                        "rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
+                        []>, isPPC64;
+defm RLDICL : MDForm_1r<30, 0,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def RLDICL_32_64 : MDForm_1<30, 0,
+                           (outs g8rc:$rA),
+                           (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+                           "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                           []>, isPPC64;
+// End fast-isel.
+defm RLDICR : MDForm_1r<30, 1,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64;
+defm RLDIC  : MDForm_1r<30, 2,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
+                        (ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                        "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+                        []>;
+
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
+                        (ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB,
+                        u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+                        IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+                        RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+}
+
+let isSelect = 1 in
+def ISEL8   : AForm_4<31, 15,
+                     (outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond),
+                     "isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
+                     []>;
+}  // Interpretation64Bit
+}  // neverHasSideEffects = 1
+}  // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
+                  "lha $rD, $src", IIC_LdStLHA,
+                  [(set i64:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LWA  : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
+                    "lwa $rD, $src", IIC_LdStLWA,
+                    [(set i64:$rD,
+                          (aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
+                    PPC970_DGroup_Cracked;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", IIC_LdStLHA,
+                   [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
+                   "lwax $rD, $src", IIC_LdStLHA,
+                   [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+// For fast-isel:
+let isCodeGenOnly = 1, mayLoad = 1 in {
+def LWA_32  : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
+                      "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
+                      PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
+                     "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
+                     PPC970_DGroup_Cracked;
+} // end fast-isel isCodeGenOnly
+
+// Update forms.
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memri:$addr),
+                    "lhau $rD, $addr", IIC_LdStLHAU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+// NO LWAU!
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memrr:$addr),
+                    "lhaux $rD, $addr", IIC_LdStLHAUX,
+                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memrr:$addr),
+                    "lwaux $rD, $addr", IIC_LdStLHAUX,
+                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                    NoEncode<"$ea_result">, isPPC64;
+}
+}
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Zero extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src),
+                  "lbz $rD, $src", IIC_LdStLoad,
+                  [(set i64:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src),
+                  "lhz $rD, $src", IIC_LdStLoad,
+                  [(set i64:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
+                  "lwz $rD, $src", IIC_LdStLoad,
+                  [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31,  87, (outs g8rc:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31,  23, (outs g8rc:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+                   
+                   
+// Update forms.
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lbzu $rD, $addr", IIC_LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lhzu $rD, $addr", IIC_LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lwzu $rD, $addr", IIC_LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+
+def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+} // Interpretation64Bit
+
+
+// Full 8-byte loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LD   : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
+                    "ld $rD, $src", IIC_LdStLD,
+                    [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
+// The following three definitions are selected for small code model only.
+// Otherwise, we need to create two instructions to form a 32-bit offset,
+// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
+def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtoc",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocJTI",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocCPT",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
+
+let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2, Defs = [X2] in
+def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src),
+                    "ld 2, $src", IIC_LdStLD,
+                    [(PPCload_toc ixaddr:$src)]>, isPPC64;
+
+def LDX  : XForm_1<31,  21, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldx $rD, $src", IIC_LdStLD,
+                   [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1<31,  532, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldbrx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def LDU  : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+                    "ldu $rD, $addr", IIC_LdStLDU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "ldux $rD, $addr", IIC_LdStLDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">, isPPC64;
+}
+}
+
+def : Pat<(PPCload ixaddr:$src),
+          (LD ixaddr:$src)>;
+def : Pat<(PPCload xaddr:$src),
+          (LDX xaddr:$src)>;
+
+// Support for medium and large code model.
+def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                       "#ADDIStocHA",
+                       [(set i64:$rD,
+                         (PPCaddisTocHA i64:$reg, tglobaladdr:$disp))]>,
+                       isPPC64;
+def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+                   "#LDtocL",
+                   [(set i64:$rD,
+                     (PPCldTocL tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                     "#ADDItocL",
+                     [(set i64:$rD,
+                       (PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64;
+
+// Support for thread-local storage.
+def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDISgotTprelHA",
+                         [(set i64:$rD,
+                           (PPCaddisGotTprelHA i64:$reg,
+                                               tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
+                        "#LDgotTprelL",
+                        [(set i64:$rD,
+                          (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
+                 isPPC64;
+def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
+          (ADD8TLS $in, tglobaltlsaddr:$g)>;
+def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIStlsgdHA",
+                         [(set i64:$rD,
+                           (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                       "#ADDItlsgdL",
+                       [(set i64:$rD,
+                         (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                        "#GETtlsADDR",
+                        [(set i64:$rD,
+                          (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                 isPPC64;
+def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIStlsldHA",
+                         [(set i64:$rD,
+                           (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                       "#ADDItlsldL",
+                       [(set i64:$rD,
+                         (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                          "#GETtlsldADDR",
+                          [(set i64:$rD,
+                            (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                   isPPC64;
+def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                          "#ADDISdtprelHA",
+                          [(set i64:$rD,
+                            (PPCaddisDtprelHA i64:$reg,
+                                              tglobaltlsaddr:$disp))]>,
+                   isPPC64;
+def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIdtprelL",
+                         [(set i64:$rD,
+                           (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+
+let PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Truncating stores.                       
+def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
+                   "stb $rS, $src", IIC_LdStStore,
+                   [(truncstorei8 i64:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
+                   "sth $rS, $src", IIC_LdStStore,
+                   [(truncstorei16 i64:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
+                   "stw $rS, $src", IIC_LdStStore,
+                   [(truncstorei32 i64:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stbx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "sthx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stwx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+// Normal 8-byte stores.
+def STD  : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
+                    "std $rS, $dst", IIC_LdStSTD,
+                    [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX  : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stdx $rS, $dst", IIC_LdStSTD,
+                   [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stdbrx $rS, $dst", IIC_LdStStore,
+                   [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+}
+
+// Stores with Update (pre-inc).
+let PPC970_Unit = 2, mayStore = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+                   "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+                   "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+                   "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+
+def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
+                   "stdu $rS, $dst", IIC_LdStSTDU, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
+                   isPPC64;
+
+def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "stdux $rS, $dst", IIC_LdStSTDUX, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked, isPPC64;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STBU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STBUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STHUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STWUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STDUX $rS, $ptrreg, $ptroff)>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3, neverHasSideEffects = 1,
+    Uses = [RM] in {  // FPU Operations.
+defm FCFID  : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fcfid", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+defm FCTID  : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctid", "$frD, $frB", IIC_FPGeneral,
+                        []>, isPPC64;
+defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctidz", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
+
+defm FCFIDU  : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fcfidu", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
+defm FCFIDS  : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB),
+                        "fcfids", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
+defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB),
+                        "fcfidus", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
+defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctiduz", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
+defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctiwuz", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+def : Pat<(i64 (zext i32:$in)),
+          (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+                  0, 32)>;
+def : Pat<(i64 (anyext i32:$in)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>;
+def : Pat<(i32 (trunc i64:$in)),
+          (EXTRACT_SUBREG $in, sub_32)>;
+
+// Implement the 'not' operation with the NOR instruction.
+// (we could use the default xori pattern, but nor has lower latency on some
+// cores (such as the A2)).
+def i64not : OutPatFrag<(ops node:$in),
+                        (NOR8 $in, $in)>;
+def        : Pat<(not i64:$in),
+                 (i64not $in)>;
+
+// Extending loads with i64 targets.
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+          (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+          (LWZX8 xaddr:$src)>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
+// amounts.
+def : Pat<(sra i64:$rS, i32:$rB),
+          (SRAD $rS, $rB)>;
+def : Pat<(srl i64:$rS, i32:$rB),
+          (SRD $rS, $rB)>;
+def : Pat<(shl i64:$rS, i32:$rB),
+          (SLD $rS, $rB)>;
+
+// SHL/SRL
+def : Pat<(shl i64:$in, (i32 imm:$imm)),
+          (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl i64:$in, (i32 imm:$imm)),
+          (RLDICL $in, (SRL64 imm:$imm), imm:$imm)>;
+
+// ROTL
+def : Pat<(rotl i64:$in, i32:$sh),
+          (RLDCL $in, $sh, 0)>;
+def : Pat<(rotl i64:$in, (i32 imm:$imm)),
+          (RLDICL $in, imm:$imm, 0)>;
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8  tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8  tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8  tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI8  tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in),
+          (ADDIS8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in),
+          (ADDI8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS8 $in, tglobaladdr:$g)>;
+def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS8 $in, tconstpool:$g)>;
+def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS8 $in, tjumptable:$g)>;
+def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS8 $in, tblockaddress:$g)>;
+
+// Patterns to match r+r indexed loads and stores for
+// addresses without at least 4-byte alignment.
+def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
+          (LWAX xoaddr:$src)>;
+def : Pat<(i64 (unaligned4load xoaddr:$src)),
+          (LDX xoaddr:$src)>;
+def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
+          (STDX $rS, xoaddr:$dst)>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 0000000..b271b5d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,939 @@
+//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be
+// of that type.
+def vnot_ppc : PatFrag<(ops node:$in),
+                       (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
+
+def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+
+def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
+}]>;
+def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
+}]>;
+def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
+}]>;
+def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
+}]>;
+def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
+}]>;
+def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
+}]>;
+
+
+def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
+}]>;
+def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
+}]>;
+def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
+}]>;
+def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
+}]>;
+def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+
+
+def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG));
+}]>;
+def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1;
+}], VSLDOI_get_imm>;
+
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG));
+}]>;
+def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1;
+}], VSLDOI_unary_get_imm>;
+
+
+/// VSLDOI_swapped* - These fragments are provided for little-endian, where
+/// the inputs must be swapped for correct semantics.
+def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG));
+}]>;
+def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1;
+}], VSLDOI_get_imm>;
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG));
+}]>;
+def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG));
+}]>;
+def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG));
+}]>;
+def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
+}], VSPLTW_get_imm>;
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != 0;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != 0;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != 0;
+}], VSPLTISW_get_imm>;
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type.
+class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+                       [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>;
+
+// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                   ValueType InTy>
+  : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+                       [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>;
+
+// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two
+// input types and an output type.
+class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                   ValueType In1Ty, ValueType In2Ty>
+  : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+                       [(set OutTy:$vD,
+                         (IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>;
+
+// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type.
+class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+             [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>;
+
+// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType InTy>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+             [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>;
+
+// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two
+// input types and an output type.
+class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType In1Ty, ValueType In2Ty>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+             [(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>;
+
+// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type.
+class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
+             !strconcat(opc, " $vD, $vB"), IIC_VecFP,
+             [(set v4f32:$vD, (IntID v4f32:$vB))]>;
+
+// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType InTy>
+  : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
+             !strconcat(opc, " $vD, $vB"), IIC_VecFP,
+             [(set OutTy:$vD, (IntID InTy:$vB))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
+let Predicates = [HasAltivec] in {
+
+let isCodeGenOnly = 1 in {
+def DSS      : DSS_Form<822, (outs),
+                        (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
+                        "dss $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSSALL   : DSS_Form<822, (outs),
+                        (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
+                        "dssall", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DST      : DSS_Form<342, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSTT     : DSS_Form<342, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSTST    : DSS_Form<374, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSTSTT   : DSS_Form<374, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+
+def DST64    : DSS_Form<342, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                        "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSTT64   : DSS_Form<342, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                        "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSTST64  : DSS_Form<374, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                        "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+def DSTSTT64 : DSS_Form<374, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                        "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
+}
+
+def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
+                      "mfvscr $vD", IIC_LdStStore,
+                      [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; 
+def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
+                      "mtvscr $vB", IIC_LdStLoad,
+                      [(int_ppc_altivec_mtvscr v4i32:$vB)]>; 
+
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {  // Loads.
+def LVEBX: XForm_1<31,   7, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvebx $vD, $src", IIC_LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31,  39, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvehx $vD, $src", IIC_LdStLoad,
+                   [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31,  71, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvewx $vD, $src", IIC_LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX  : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvx $vD, $src", IIC_LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvxl $vD, $src", IIC_LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31,   6, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvsl $vD, $src", IIC_LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+def LVSR : XForm_1<31,  38, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvsr $vD, $src", IIC_LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+
+let PPC970_Unit = 2 in {   // Stores.
+def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvebx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvehx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvewx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
+def STVX  : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvxl $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in {  // VALU Operations.
+// VA-Form instructions.  3-input AltiVec ops.
+let isCommutable = 1 in {
+def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
+                       "vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP,
+                       [(set v4f32:$vD,
+                        (fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>;
+
+// FIXME: The fma+fneg pattern won't match because fneg is not legal.
+def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
+                       "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP,
+                       [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
+                                                  (fneg v4f32:$vB))))]>;
+
+def VMHADDSHS  : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
+def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
+                             v8i16>;
+def VMLADDUHM  : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
+} // isCommutable
+
+def VPERM      : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
+                              v4i32, v4i32, v16i8>;
+def VSEL       : VA1a_Int_Ty<42, "vsel",  int_ppc_altivec_vsel, v4i32>;
+
+// Shuffles.
+def VSLDOI  : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
+                       "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
+                       [(set v16i8:$vD, 
+                         (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+
+// VX-Form instructions.  AltiVec arithmetic ops.
+let isCommutable = 1 in {
+def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vaddfp $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
+                      
+def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vaddubm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
+def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vadduhm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>;
+def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vadduwm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
+                      
+def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
+def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>;
+def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>;
+def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>;
+def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>;
+def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>;
+def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>;
+} // isCommutable
+
+let isCommutable = 1 in
+def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                    "vand $vD, $vA, $vB", IIC_VecFP,
+                    [(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>;
+def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                     "vandc $vD, $vA, $vB", IIC_VecFP,
+                     [(set v4i32:$vD, (and v4i32:$vA,
+                                           (vnot_ppc v4i32:$vB)))]>;
+
+def VCFSX  : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vcfsx $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>;
+def VCFUX  : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vcfux $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vctsxs $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vctuxs $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>;
+
+// Defines with the UIM field set to 0 for floating-point
+// to integer (fp_to_sint/fp_to_uint) conversions and integer
+// to floating-point (sint_to_fp/uint_to_fp) conversions.
+let isCodeGenOnly = 1, VA = 0 in {
+def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB),
+                       "vcfsx $vD, $vB, 0", IIC_VecFP,
+                       [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfsx v4i32:$vB, 0))]>;
+def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vctuxs $vD, $vB, 0", IIC_VecFP,
+                        [(set v4i32:$vD,
+                               (int_ppc_altivec_vctuxs v4f32:$vB, 0))]>;
+def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB),
+                       "vcfux $vD, $vB, 0", IIC_VecFP,
+                       [(set v4f32:$vD,
+                               (int_ppc_altivec_vcfux v4i32:$vB, 0))]>;
+def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB),
+                      "vctsxs $vD, $vB, 0", IIC_VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctsxs v4f32:$vB, 0))]>;
+}
+def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP  : VX2_Int_SP<458, "vlogefp",  int_ppc_altivec_vlogefp>;
+
+let isCommutable = 1 in {
+def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>;
+def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>;
+def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>;
+def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>;
+def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>;
+def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>;
+
+def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>;
+def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>;
+def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>;
+def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>;
+def VMAXUB : VX1_Int_Ty<   2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>;
+def VMAXUH : VX1_Int_Ty<  66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>;
+def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>;
+def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>;
+def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>;
+def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>;
+def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>;
+def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>;
+def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>;
+def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>;
+} // isCommutable
+
+def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrghb $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrghh $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrghw $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrglb $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrglh $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrglw $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>;
+
+def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
+                            v4i32, v16i8, v4i32>;
+def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm,
+                            v4i32, v8i16, v4i32>;
+def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
+                            v4i32, v8i16, v4i32>;
+def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm,
+                            v4i32, v16i8, v4i32>;
+def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
+                            v4i32, v8i16, v4i32>;
+def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
+                            v4i32, v8i16, v4i32>;
+
+let isCommutable = 1 in {
+def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
+                          v8i16, v16i8>;
+def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh,
+                          v4i32, v8i16>;
+def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub,
+                          v8i16, v16i8>;
+def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh,
+                          v4i32, v8i16>;
+def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb,
+                          v8i16, v16i8>;
+def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh,
+                          v4i32, v8i16>;
+def VMULOUB : VX1_Int_Ty2<  8, "vmuloub", int_ppc_altivec_vmuloub,
+                          v8i16, v16i8>;
+def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
+                          v4i32, v8i16>;
+} // isCommutable
+                       
+def VREFP     : VX2_Int_SP<266, "vrefp",     int_ppc_altivec_vrefp>;
+def VRFIM     : VX2_Int_SP<714, "vrfim",     int_ppc_altivec_vrfim>;
+def VRFIN     : VX2_Int_SP<522, "vrfin",     int_ppc_altivec_vrfin>;
+def VRFIP     : VX2_Int_SP<650, "vrfip",     int_ppc_altivec_vrfip>;
+def VRFIZ     : VX2_Int_SP<586, "vrfiz",     int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>;
+
+def VSUBFP  : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsubfp $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsububm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsubuhm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsubuwm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
+                      
+def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
+def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>;
+def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>;
+def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>;
+def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>;
+def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>;
+
+def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
+def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
+
+def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs,
+                          v4i32, v16i8, v4i32>;
+def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
+                          v4i32, v8i16, v4i32>;
+def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
+                          v4i32, v16i8, v4i32>;
+
+def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                    "vnor $vD, $vA, $vB", IIC_VecFP,
+                    [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
+                                                   v4i32:$vB)))]>;
+let isCommutable = 1 in {
+def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vor $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>;
+def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vxor $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>;
+} // isCommutable
+
+def VRLB   : VX1_Int_Ty<   4, "vrlb", int_ppc_altivec_vrlb, v16i8>;
+def VRLH   : VX1_Int_Ty<  68, "vrlh", int_ppc_altivec_vrlh, v8i16>;
+def VRLW   : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>;
+
+def VSL    : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl,  v4i32 >;
+def VSLO   : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>;
+
+def VSLB   : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>;
+def VSLH   : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>;
+def VSLW   : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>;
+
+def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vspltb $vD, $vB, $UIMM", IIC_VecPerm,
+                      [(set v16i8:$vD,
+                        (vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vsplth $vD, $vB, $UIMM", IIC_VecPerm,
+                      [(set v16i8:$vD,
+                        (vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vspltw $vD, $vB, $UIMM", IIC_VecPerm,
+                      [(set v16i8:$vD, 
+                        (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+
+def VSR    : VX1_Int_Ty< 708, "vsr"  , int_ppc_altivec_vsr,  v4i32>;
+def VSRO   : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>;
+
+def VSRAB  : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>;
+def VSRAH  : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>;
+def VSRAW  : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>;
+def VSRB   : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>;
+def VSRH   : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>;
+def VSRW   : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>;
+
+
+def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM),
+                       "vspltisb $vD, $SIMM", IIC_VecPerm,
+                       [(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM),
+                       "vspltish $vD, $SIMM", IIC_VecPerm,
+                       [(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM),
+                       "vspltisw $vD, $SIMM", IIC_VecPerm,
+                       [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+def VPKPX   : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx,
+                          v8i16, v4i32>;
+def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
+                          v16i8, v8i16>;
+def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
+                          v16i8, v8i16>;
+def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
+                          v16i8, v4i32>;
+def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
+                          v8i16, v4i32>;
+def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vpkuhum $vD, $vA, $vB", IIC_VecFP,
+                       [(set v16i8:$vD,
+                         (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
+                          v16i8, v8i16>;
+def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vpkuwum $vD, $vA, $vB", IIC_VecFP,
+                       [(set v16i8:$vD,
+                         (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
+                          v8i16, v4i32>;
+
+// Vector Unpack.
+def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
+                          v4i32, v8i16>;
+def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb,
+                          v8i16, v16i8>;
+def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh,
+                          v4i32, v8i16>;
+def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx,
+                          v4i32, v8i16>;
+def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb,
+                          v8i16, v16i8>;
+def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
+                          v4i32, v8i16>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+              IIC_VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+              IIC_VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
+  let Defs = [CR6];
+  let RC = 1;
+}
+
+// f32 element comparisons.0
+def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
+def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+                      
+let isCodeGenOnly = 1 in {
+def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", IIC_VecFP,
+                      [(set v16i8:$vD, (v16i8 immAllZerosV))]>;
+def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", IIC_VecFP,
+                      [(set v8i16:$vD, (v8i16 immAllZerosV))]>;
+def V_SET0  : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", IIC_VecFP,
+                      [(set v4i32:$vD, (v4i32 immAllZerosV))]>;
+
+let IMM=-1 in {
+def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", IIC_VecFP,
+                      [(set v16i8:$vD, (v16i8 immAllOnesV))]>;
+def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", IIC_VecFP,
+                      [(set v8i16:$vD, (v8i16 immAllOnesV))]>;
+def V_SETALLONES  : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", IIC_VecFP,
+                      [(set v4i32:$vD, (v4i32 immAllOnesV))]>;
+}
+}
+} // VALU Operations.
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// DS* intrinsics
+def : Pat<(int_ppc_altivec_dssall), (DSSALL 1, 0, 0, 0)>;
+def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>;
+
+//  * 32-bit
+def : Pat<(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM),
+          (DST 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM),
+          (DSTT 1, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM),
+          (DSTST 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM),
+          (DSTSTT 1, imm:$STRM, $rA, $rB)>;
+
+//  * 64-bit
+def : Pat<(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM),
+          (DST64 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM),
+          (DSTT64 1, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstst i64:$rA, i32:$rB, imm:$STRM),
+          (DSTST64 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dststt i64:$rA, i32:$rB, imm:$STRM),
+          (DSTSTT64 1, imm:$STRM, $rA, $rB)>;
+
+// Loads.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store v4i32:$rS, xoaddr:$dst),
+          (STVX $rS, xoaddr:$dst)>;
+
+// Bit conversions.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef),
+        (VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>;
+def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef),
+        (VPKUWUM $vA, $vA)>;
+def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
+        (VPKUHUM $vA, $vA)>;
+
+// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
+// These fragments are matched for little-endian, where the inputs must
+// be swapped for correct semantics.
+def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB),
+        (VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>;
+def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUWUM $vB, $vA)>;
+def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUHUM $vB, $vA)>;
+
+// Match vmrg*(x,x)
+def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef),
+        (VMRGLB $vA, $vA)>;
+def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef),
+        (VMRGLH $vA, $vA)>;
+def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef),
+        (VMRGLW $vA, $vA)>;
+def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef),
+        (VMRGHB $vA, $vA)>;
+def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef),
+        (VMRGHH $vA, $vA)>;
+def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef),
+        (VMRGHW $vA, $vA)>;
+
+// Match vmrg*(y,x), i.e., swapped operands.  These fragments
+// are matched for little-endian, where the inputs must be
+// swapped for correct semantics.
+def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLB $vB, $vA)>;
+def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLH $vB, $vA)>;
+def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLW $vB, $vA)>;
+def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHB $vB, $vA)>;
+def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHH $vB, $vA)>;
+def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHW $vB, $vA)>;
+
+// Logical Operations
+def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
+
+def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)),
+          (VNOR $A, $B)>;
+def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
+          (VANDC $A, $B)>;
+
+def : Pat<(fmul v4f32:$vA, v4f32:$vB),
+          (VMADDFP $vA, $vB,
+             (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>; 
+
+// Fused multiply add and multiply sub for packed float.  These are represented
+// separately from the real instructions above, for operations that must have
+// the additional precision, such as Newton-Rhapson (used by divide, sqrt)
+def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC),
+          (VPERM $vA, $vB, $vC)>;
+
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
+// Vector shifts
+def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSLW $vA, $vB))>;
+
+def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRW $vA, $vB))>;
+
+def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRAW $vA, $vB))>;
+
+// Float to integer and integer to float conversions
+def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
+           (VCTSXS_0 $vA)>;
+def : Pat<(v4i32 (fp_to_uint v4f32:$vA)),
+           (VCTUXS_0 $vA)>;
+def : Pat<(v4f32 (sint_to_fp v4i32:$vA)),
+           (VCFSX_0 $vA)>;
+def : Pat<(v4f32 (uint_to_fp v4i32:$vA)),
+           (VCFUX_0 $vA)>;
+
+// Floating-point rounding
+def : Pat<(v4f32 (ffloor v4f32:$vA)),
+          (VRFIM $vA)>;
+def : Pat<(v4f32 (fceil v4f32:$vA)),
+          (VRFIP $vA)>;
+def : Pat<(v4f32 (ftrunc v4f32:$vA)),
+          (VRFIZ $vA)>;
+def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
+          (VRFIN $vA)>;
+
+} // end HasAltivec
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 0000000..b424d11
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRBUILDER_H
+#define POWERPC_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 0000000..1e4396c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,1300 @@
+//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+        : Instruction {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+  let Size = 4;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  let TSFlags{0}   = PPC970_First;
+  let TSFlags{1}   = PPC970_Single;
+  let TSFlags{2}   = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+
+  // Fields used for relation models.
+  string BaseName = "";
+
+  // For cases where multiple instruction definitions really represent the
+  // same underlying instruction but with one definition for 64-bit arguments
+  // and one for 32-bit arguments, this bit breaks the degeneracy between
+  // the two forms and allows TableGen to generate mapping tables.
+  bit Interpretation64Bit = 0;
+}
+
+class PPC970_DGroup_First   { bits<1> PPC970_First = 1;  }
+class PPC970_DGroup_Single  { bits<1> PPC970_Single = 1; }
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; }
+class PPC970_MicroCode;
+
+class PPC970_Unit_Pseudo   { bits<3> PPC970_Unit = 0;   }
+class PPC970_Unit_FXU      { bits<3> PPC970_Unit = 1;   }
+class PPC970_Unit_LSU      { bits<3> PPC970_Unit = 2;   }
+class PPC970_Unit_FPU      { bits<3> PPC970_Unit = 3;   }
+class PPC970_Unit_CRU      { bits<3> PPC970_Unit = 4;   }
+class PPC970_Unit_VALU     { bits<3> PPC970_Unit = 5;   }
+class PPC970_Unit_VPERM    { bits<3> PPC970_Unit = 6;   }
+class PPC970_Unit_BRU      { bits<3> PPC970_Unit = 7;   }
+
+// Two joined instructions; used to emit two adjacent instructions as one.
+// The itinerary from the first instruction is used for scheduling and
+// classification.
+class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
+         InstrItinClass itin>
+        : Instruction {
+  field bits<64> Inst;
+  field bits<64> SoftFail = 0;
+  let Size = 8;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode1;
+  let Inst{32-37} = opcode2;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  let TSFlags{0}   = PPC970_First;
+  let TSFlags{1}   = PPC970_Single;
+  let TSFlags{2}   = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+
+  // Fields used for relation models.
+  string BaseName = "";
+  bit Interpretation64Bit = 0;
+}
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  bits<14> BD;
+
+  bits<5> BI;
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+
+  let Inst{6-10}  = BIBO{4-0};
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+             string asmstr>
+  : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
+  let BIBO{4-0} = bo;
+  let BIBO{6-5} = 0;
+  let CR = 0;
+}
+
+class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<14> BD;
+
+  let Inst{6-10}  = bo;
+  let Inst{11-15} = bi;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_3<bits<6> opcode, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<14> BD;
+
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<5> BI;
+  bits<14> BD;
+
+  let Inst{6-10}  = bo;
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.3 SC-Form
+class SCForm<bits<6> opcode, bits<1> xo,
+                     dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+                     list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<7>  LEV;
+
+  let Pattern = pattern;
+
+  let Inst{20-26} = LEV;
+  let Inst{30}    = xo;
+}
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<5>  B;
+  bits<16> C;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<21> Addr;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = Addr{20-16}; // Base Reg
+  let Inst{16-31} = Addr{15-0};  // Displacement
+}
+
+class DForm_1a<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> C;
+  bits<5>  B;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+
+class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern> {
+
+  // Even though ADDICo does not really have an RC bit, provide
+  // the declaration of one here so that isDOT has something to set.
+  bit RC = 0;
+}
+
+class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> B;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = 0;
+  let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  B;
+  bits<5>  A;
+  bits<16> C;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+              
+class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin, list<dag> pattern>
+  : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let Addr = 0;
+}
+
+class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL,
+                            string asmstr, InstrItinClass itin,
+                            list<dag> pattern>
+  : DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> {
+  let A = R;
+  let B = R;
+  let C = 0; 
+}
+
+class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+            dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<21> Addr;
+
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+
+  let Inst{38-42}  = A;
+  let Inst{43-47} = Addr{20-16}; // Base Reg
+  let Inst{48-63} = Addr{15-0};  // Displacement
+}
+
+// This is used to emit BL8+NOP.
+class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+            dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         :  IForm_and_DForm_1<opcode1, aa, lk, opcode2,
+                              OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let Addr = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3>  BF;
+  bits<1>  L;
+  bits<5>  RA;
+  bits<16> I;
+
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-31} = I;
+}
+
+class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_5<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin> 
+  : DForm_5<opcode, OOL, IOL, asmstr, itin>;
+
+class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_6<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RST;
+  bits<19> DS_RA;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = DS_RA{18-14};  // Register #
+  let Inst{16-29} = DS_RA{13-0};   // Displacement.
+  let Inst{30-31} = xo;
+}
+
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+        <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+        InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RST;
+  bits<5> B;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+}
+
+class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let B = 0;
+}
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+    let Pattern = pattern;
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let B = 0;
+  let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<1> L; 
+  bits<5> RA;
+  bits<5> RB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<1> L;
+
+  let Inst{6-10} = RS;
+  let Inst{15} = L;
+  let Inst{21-30} = xo;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin>
+  : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> FRA;
+  bits<5> FRB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  let Inst{6-10}  = 31;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<2> L;
+
+  let Pattern = pattern;
+  let Inst{6-8}   = 0;
+  let Inst{9-10}  = L;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let L = 0;
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// This is used for MFFS, MTFSB0, MTFSB1.  42 is arbitrary; this series of
+// numbers presumably relates to some document, but I haven't found it.
+class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+  bits<5> FM;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FM;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+  let B = 0;
+}
+
+class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+}
+
+// XX*-Form (VSX)
+class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = XT{5};
+}
+
+class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = 0;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> CR;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}   = CR;
+  let Inst{9-15}  = 0;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = 0;
+}
+
+class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XB;
+  bits<2> D;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-13} = 0;
+  let Inst{14-15} = D;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> CR;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}   = CR;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = 0;
+}
+
+class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<2> D;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21}    = 0;
+  let Inst{22-23} = D;
+  let Inst{24-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21}    = RC;
+  let Inst{22-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<6> XC;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-25} = XC{4-0};
+  let Inst{26-27} = xo;
+  let Inst{28}    = XC{5};
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = immfield;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+
+// DSS_Form - Form X instruction, used for altivec dss* instructions.
+class DSS_Form<bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<1> T;
+  bits<2> STRM;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6}     = T;
+  let Inst{7-8}   = 0;
+  let Inst{9-10}  = STRM;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.7 XL-Form
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  bits<5> CRA;
+  bits<5> CRB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRA;
+  let Inst{16-20} = CRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRD;
+  let Inst{16-20} = CRD;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr, 
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo;
+  let Inst{31}    = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  
+  let BO = BIBO{4-0};
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+  let BH = 0;
+}
+
+class XLForm_2_br2<bits<6> opcode, bits<10> xo, bits<5> bo, bit lk,
+                   dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BH = 0;
+}
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo,  bits<5> bi, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<3> BFA;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-13} = BFA;
+  let Inst{14-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.8 XFX-Form
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+  bits<10> SPR;
+
+  let Inst{6-10}  = RT;
+  let Inst{11}    = SPR{4};
+  let Inst{12}    = SPR{3};
+  let Inst{13}    = SPR{2};
+  let Inst{14}    = SPR{1};
+  let Inst{15}    = SPR{0};
+  let Inst{16}    = SPR{9};
+  let Inst{17}    = SPR{8};
+  let Inst{18}    = SPR{7};
+  let Inst{19}    = SPR{6};
+  let Inst{20}    = SPR{5};
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                   dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+   
+  let Inst{6-10}  = RT;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8>  FXM;
+  bits<5>  rS;
+   
+  let Inst{6-10}  = rS;
+  let Inst{11}    = 0;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  ST;
+  bits<8>  FXM;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 1;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>;
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                    dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+// XFL-Form - MTFSF
+// This is probably 1.7.9, but I don't have the reference that uses this
+// numbering scheme...
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag>pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8> FM;
+  bits<5> rT;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6} = 0;
+  let Inst{7-14}  = FM;
+  let Inst{15} = 0;
+  let Inst{16-20} = rT;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// 1.7.10 XS-Form - SRADI.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RS;
+  bits<6> SH;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = A;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+// 1.7.11 XO-Form
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21}    = oe;
+  let Inst{22-30} = xo;
+  let Inst{31}    = RC;  
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, 
+               dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> {
+  let RB = 0;
+}
+
+// 1.7.12 A-Form
+class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRC;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-25} = FRC;
+  let Inst{26-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRC = 0;
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+  bits<5> COND;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = COND;
+  let Inst{26-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.13 M-Form
+class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<5> MB;
+  bits<5> ME;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = MB;
+  let Inst{26-30} = ME;
+  let Inst{31}    = RC;
+}
+
+class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// 1.7.14 MD-Form
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<6> SH;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+// E-1 VA-Form
+
+// VAForm_1 - DACB ordering.
+class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VC;
+  bits<5> VB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering.
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<5> VC;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<4> SH;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 0;
+  let Inst{22-25} = SH;
+  let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form
+class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+  let VA = VD;
+  let VB = VD;
+}
+
+
+class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> IMM;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = IMM;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form
+class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bit RC = 0;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = RC;
+  let Inst{22-31} = xo;
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+    : I<0, OOL, IOL, asmstr, NoItinerary> {
+  let isCodeGenOnly = 1;
+  let PPC64 = 0;
+  let Pattern = pattern;
+  let Inst{31-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 0000000..9bac91d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,2246 @@
+//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-instr-info"
+
+#define GET_INSTRMAP_INFO
+#define GET_INSTRINFO_CTOR_DTOR
+#include "PPCGenInstrInfo.inc"
+
+static cl::
+opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
+            cl::desc("Disable analysis for CTR loops"));
+
+static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
+cl::desc("Disable compare instruction optimization"), cl::Hidden);
+
+static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
+cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
+
+static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
+cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
+cl::Hidden);
+
+// Pin the vtable to this file.
+void PPCInstrInfo::anchor() {}
+
+PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
+    : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+      Subtarget(STI), RI(STI) {}
+
+/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+/// this target when scheduling the DAG.
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                                           const ScheduleDAG *DAG) const {
+  unsigned Directive =
+      static_cast<const PPCSubtarget *>(STI)->getDarwinDirective();
+  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+      Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
+    const InstrItineraryData *II =
+        &static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
+    return new ScoreboardHazardRecognizer(II, DAG);
+  }
+
+  return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
+}
+
+/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
+/// to use for this target when scheduling the DAG.
+ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
+  const InstrItineraryData *II,
+  const ScheduleDAG *DAG) const {
+  unsigned Directive =
+      DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+
+  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
+    return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
+
+  // Most subtargets use a PPC970 recognizer.
+  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+      Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
+    assert(DAG->TII && "No InstrInfo?");
+
+    return new PPCHazardRecognizer970(*DAG);
+  }
+
+  return new ScoreboardHazardRecognizer(II, DAG);
+}
+
+
+int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MachineInstr *DefMI, unsigned DefIdx,
+                                    const MachineInstr *UseMI,
+                                    unsigned UseIdx) const {
+  int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
+                                                   UseMI, UseIdx);
+
+  const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+  unsigned Reg = DefMO.getReg();
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  bool IsRegCR;
+  if (TRI->isVirtualRegister(Reg)) {
+    const MachineRegisterInfo *MRI =
+      &DefMI->getParent()->getParent()->getRegInfo();
+    IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
+              MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
+  } else {
+    IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
+              PPC::CRBITRCRegClass.contains(Reg);
+  }
+
+  if (UseMI->isBranch() && IsRegCR) {
+    if (Latency < 0)
+      Latency = getInstrLatency(ItinData, DefMI);
+
+    // On some cores, there is an additional delay between writing to a condition
+    // register, and using it from a branch.
+    unsigned Directive = Subtarget.getDarwinDirective();
+    switch (Directive) {
+    default: break;
+    case PPC::DIR_7400:
+    case PPC::DIR_750:
+    case PPC::DIR_970:
+    case PPC::DIR_E5500:
+    case PPC::DIR_PWR4:
+    case PPC::DIR_PWR5:
+    case PPC::DIR_PWR5X:
+    case PPC::DIR_PWR6:
+    case PPC::DIR_PWR6X:
+    case PPC::DIR_PWR7:
+    case PPC::DIR_PWR8:
+      Latency += 2;
+      break;
+    }
+  }
+
+  return Latency;
+}
+
+// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
+bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                         unsigned &SrcReg, unsigned &DstReg,
+                                         unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: return false;
+  case PPC::EXTSW:
+  case PPC::EXTSW_32_64:
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SubIdx = PPC::sub_32;
+    return true;
+  }
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                           int &FrameIndex) const {
+  // Note: This list must be kept consistent with LoadRegFromStackSlot.
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::LD:
+  case PPC::LWZ:
+  case PPC::LFS:
+  case PPC::LFD:
+  case PPC::RESTORE_CR:
+  case PPC::RESTORE_CRBIT:
+  case PPC::LVX:
+  case PPC::LXVD2X:
+  case PPC::RESTORE_VRSAVE:
+    // Check for the operands added by addFrameReference (the immediate is the
+    // offset which defaults to 0).
+    if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+        MI->getOperand(2).isFI()) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                          int &FrameIndex) const {
+  // Note: This list must be kept consistent with StoreRegToStackSlot.
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::STD:
+  case PPC::STW:
+  case PPC::STFS:
+  case PPC::STFD:
+  case PPC::SPILL_CR:
+  case PPC::SPILL_CRBIT:
+  case PPC::STVX:
+  case PPC::STXVD2X:
+  case PPC::SPILL_VRSAVE:
+    // Check for the operands added by addFrameReference (the immediate is the
+    // offset which defaults to 0).
+    if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+        MI->getOperand(2).isFI()) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+// commuteInstruction - We can commute rlwimi instructions, but only if the
+// rotate amt is zero.  We also have to munge the immediates a bit.
+MachineInstr *
+PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+
+  // Normal instructions can be commuted the obvious way.
+  if (MI->getOpcode() != PPC::RLWIMI &&
+      MI->getOpcode() != PPC::RLWIMIo &&
+      MI->getOpcode() != PPC::RLWIMI8 &&
+      MI->getOpcode() != PPC::RLWIMI8o)
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+
+  // Cannot commute if it has a non-zero rotate count.
+  if (MI->getOperand(3).getImm() != 0)
+    return nullptr;
+
+  // If we have a zero rotate count, we have:
+  //   M = mask(MB,ME)
+  //   Op0 = (Op1 & ~M) | (Op2 & M)
+  // Change this to:
+  //   M = mask((ME+1)&31, (MB-1)&31)
+  //   Op0 = (Op2 & ~M) | (Op1 & M)
+
+  // Swap op1/op2
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  unsigned Reg2 = MI->getOperand(2).getReg();
+  unsigned SubReg1 = MI->getOperand(1).getSubReg();
+  unsigned SubReg2 = MI->getOperand(2).getSubReg();
+  bool Reg1IsKill = MI->getOperand(1).isKill();
+  bool Reg2IsKill = MI->getOperand(2).isKill();
+  bool ChangeReg0 = false;
+  // If machine instrs are no longer in two-address forms, update
+  // destination register as well.
+  if (Reg0 == Reg1) {
+    // Must be two address instruction!
+    assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+           "Expecting a two-address instruction!");
+    assert(MI->getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
+    Reg2IsKill = false;
+    ChangeReg0 = true;
+  }
+
+  // Masks.
+  unsigned MB = MI->getOperand(4).getImm();
+  unsigned ME = MI->getOperand(5).getImm();
+
+  if (NewMI) {
+    // Create a new instruction.
+    unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
+    bool Reg0IsDead = MI->getOperand(0).isDead();
+    return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+      .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+      .addReg(Reg2, getKillRegState(Reg2IsKill))
+      .addReg(Reg1, getKillRegState(Reg1IsKill))
+      .addImm((ME+1) & 31)
+      .addImm((MB-1) & 31);
+  }
+
+  if (ChangeReg0) {
+    MI->getOperand(0).setReg(Reg2);
+    MI->getOperand(0).setSubReg(SubReg2);
+  }
+  MI->getOperand(2).setReg(Reg1);
+  MI->getOperand(1).setReg(Reg2);
+  MI->getOperand(2).setSubReg(SubReg1);
+  MI->getOperand(1).setSubReg(SubReg2);
+  MI->getOperand(2).setIsKill(Reg1IsKill);
+  MI->getOperand(1).setIsKill(Reg2IsKill);
+
+  // Swap the mask around.
+  MI->getOperand(4).setImm((ME+1) & 31);
+  MI->getOperand(5).setImm((MB-1) & 31);
+  return MI;
+}
+
+bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                                         unsigned &SrcOpIdx2) const {
+  // For VSX A-Type FMA instructions, it is the first two operands that can be
+  // commuted, however, because the non-encoded tied input operand is listed
+  // first, the operands to swap are actually the second and third.
+
+  int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+  if (AltOpc == -1)
+    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+
+  SrcOpIdx1 = 2;
+  SrcOpIdx2 = 3;
+  return true;
+}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI) const {
+  // This function is used for scheduling, and the nop wanted here is the type
+  // that terminates dispatch groups on the POWER cores.
+  unsigned Directive = Subtarget.getDarwinDirective();
+  unsigned Opcode;
+  switch (Directive) {
+  default:            Opcode = PPC::NOP; break;
+  case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
+  case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
+  case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */
+  }
+
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(Opcode));
+}
+
+// Branch analysis.
+// Note: If the condition register is set to CTR or CTR8 then this is a
+// BDNZ (imm == 1) or BDZ (imm == 0) branch.
+bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  bool isPPC64 = Subtarget.isPPC64();
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == PPC::B) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BCC) {
+      if (!LastInst->getOperand(2).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(2).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BC) {
+      if (!LastInst->getOperand(1).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BCn) {
+      if (!LastInst->getOperand(1).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BDNZ8 ||
+               LastInst->getOpcode() == PPC::BDNZ) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(1));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BDZ8 ||
+               LastInst->getOpcode() == PPC::BDZ) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(0));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    }
+
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with PPC::B and PPC:BCC, handle it.
+  if (SecondLastInst->getOpcode() == PPC::BCC &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(2).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB =  SecondLastInst->getOperand(2).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst->getOpcode() == PPC::BC &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(1).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst->getOpcode() == PPC::BCn &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(1).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDNZ) &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(1));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDZ) &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two PPC:Bs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == PPC::B &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8  && I->getOpcode() != PPC::BDZ)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8  && I->getOpcode() != PPC::BDZ)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const SmallVectorImpl<MachineOperand> &Cond,
+                           DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "PPC branch conditions have two components!");
+
+  bool isPPC64 = Subtarget.isPPC64();
+
+  // One-way branch.
+  if (!FBB) {
+    if (Cond.empty())   // Unconditional branch
+      BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
+    else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+      BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+                              (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                              (isPPC64 ? PPC::BDZ8  : PPC::BDZ))).addMBB(TBB);
+    else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+      BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+    else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+      BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+    else                // Conditional branch
+      BuildMI(&MBB, DL, get(PPC::BCC))
+        .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way Conditional Branch.
+  if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+    BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+                            (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                            (isPPC64 ? PPC::BDZ8  : PPC::BDZ))).addMBB(TBB);
+  else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+    BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+  else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+    BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+  else
+    BuildMI(&MBB, DL, get(PPC::BCC))
+      .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+  BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
+  return 2;
+}
+
+// Select analysis.
+bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                const SmallVectorImpl<MachineOperand> &Cond,
+                unsigned TrueReg, unsigned FalseReg,
+                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+  if (!Subtarget.hasISEL())
+    return false;
+
+  if (Cond.size() != 2)
+    return false;
+
+  // If this is really a bdnz-like condition, then it cannot be turned into a
+  // select.
+  if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // isel is for regular integer GPRs only.
+  if (!PPC::GPRCRegClass.hasSubClassEq(RC) &&
+      !PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) &&
+      !PPC::G8RCRegClass.hasSubClassEq(RC) &&
+      !PPC::G8RC_NOX0RegClass.hasSubClassEq(RC))
+    return false;
+
+  // FIXME: These numbers are for the A2, how well they work for other cores is
+  // an open question. On the A2, the isel instruction has a 2-cycle latency
+  // but single-cycle throughput. These numbers are used in combination with
+  // the MispredictPenalty setting from the active SchedMachineModel.
+  CondCycles = 1;
+  TrueCycles = 1;
+  FalseCycles = 1;
+
+  return true;
+}
+
+void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI, DebugLoc dl,
+                                unsigned DestReg,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                unsigned TrueReg, unsigned FalseReg) const {
+  assert(Cond.size() == 2 &&
+         "PPC branch conditions have two components!");
+
+  assert(Subtarget.hasISEL() &&
+         "Cannot insert select on target without ISEL support");
+
+  // Get the register classes.
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  assert(RC && "TrueReg and FalseReg must have overlapping register classes");
+
+  bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) ||
+                 PPC::G8RC_NOX0RegClass.hasSubClassEq(RC);
+  assert((Is64Bit ||
+          PPC::GPRCRegClass.hasSubClassEq(RC) ||
+          PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) &&
+         "isel is for regular integer GPRs only");
+
+  unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL;
+  unsigned SelectPred = Cond[0].getImm();
+
+  unsigned SubIdx;
+  bool SwapOps;
+  switch (SelectPred) {
+  default: llvm_unreachable("invalid predicate for isel");
+  case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break;
+  case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break;
+  case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break;
+  case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break;
+  case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break;
+  case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break;
+  case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break;
+  case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break;
+  case PPC::PRED_BIT_SET:   SubIdx = 0; SwapOps = false; break;
+  case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break;
+  }
+
+  unsigned FirstReg =  SwapOps ? FalseReg : TrueReg,
+           SecondReg = SwapOps ? TrueReg  : FalseReg;
+
+  // The first input register of isel cannot be r0. If it is a member
+  // of a register class that can be r0, then copy it first (the
+  // register allocator should eliminate the copy).
+  if (MRI.getRegClass(FirstReg)->contains(PPC::R0) ||
+      MRI.getRegClass(FirstReg)->contains(PPC::X0)) {
+    const TargetRegisterClass *FirstRC =
+      MRI.getRegClass(FirstReg)->contains(PPC::X0) ?
+        &PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass;
+    unsigned OldFirstReg = FirstReg;
+    FirstReg = MRI.createVirtualRegister(FirstRC);
+    BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg)
+      .addReg(OldFirstReg);
+  }
+
+  BuildMI(MBB, MI, dl, get(OpCode), DestReg)
+    .addReg(FirstReg).addReg(SecondReg)
+    .addReg(Cond[1].getReg(), 0, SubIdx);
+}
+
+void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // We can end up with self copies and similar things as a result of VSX copy
+  // legalization. Promote them here.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (PPC::F8RCRegClass.contains(DestReg) &&
+      PPC::VSLRCRegClass.contains(SrcReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && SrcReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    DestReg = SuperReg;
+  } else if (PPC::VRRCRegClass.contains(DestReg) &&
+             PPC::VSHRCRegClass.contains(SrcReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(DestReg, PPC::sub_128, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && SrcReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    DestReg = SuperReg;
+  } else if (PPC::F8RCRegClass.contains(SrcReg) &&
+             PPC::VSLRCRegClass.contains(DestReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && DestReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    SrcReg = SuperReg;
+  } else if (PPC::VRRCRegClass.contains(SrcReg) &&
+             PPC::VSHRCRegClass.contains(DestReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(SrcReg, PPC::sub_128, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && DestReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    SrcReg = SuperReg;
+  }
+
+  unsigned Opc;
+  if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR;
+  else if (PPC::G8RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR8;
+  else if (PPC::F4RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::FMR;
+  else if (PPC::CRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::MCRF;
+  else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::VOR;
+  else if (PPC::VSRCRegClass.contains(DestReg, SrcReg))
+    // There are two different ways this can be done:
+    //   1. xxlor : This has lower latency (on the P7), 2 cycles, but can only
+    //      issue in VSU pipeline 0.
+    //   2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but
+    //      can go to either pipeline.
+    // We'll always use xxlor here, because in practically all cases where
+    // copies are generated, they are close enough to some use that the
+    // lower-latency form is preferable.
+    Opc = PPC::XXLOR;
+  else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::XXLORf;
+  else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::CROR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  const MCInstrDesc &MCID = get(Opc);
+  if (MCID.getNumOperands() == 3)
+    BuildMI(MBB, I, DL, MCID, DestReg)
+      .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+// This function returns true if a CR spill is necessary and false otherwise.
+bool
+PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
+                                  unsigned SrcReg, bool isKill,
+                                  int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs,
+                                  bool &NonRI, bool &SpillsVRS) const{
+  // Note: If additional store instructions are added here,
+  // update isStoreToStackSlot.
+
+  DebugLoc DL;
+  if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+      PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+             PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXVD2X))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXSDX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(Subtarget.isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    SpillsVRS = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  return false;
+}
+
+void
+PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  unsigned SrcReg, bool isKill, int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  bool NonRI = false, SpillsVRS = false;
+  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs,
+                          NonRI, SpillsVRS))
+    FuncInfo->setSpillsCR();
+
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+
+  const MachineFrameInfo &MFI = *MF.getFrameInfo();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+bool
+PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   SmallVectorImpl<MachineInstr*> &NewMIs,
+                                   bool &NonRI, bool &SpillsVRS) const{
+  // Note: If additional load instructions are added here,
+  // update isLoadFromStackSlot.
+
+  if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+      PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+                                               DestReg), FrameIdx));
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+             PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
+                                       FrameIdx));
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
+                                       FrameIdx));
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
+                                       FrameIdx));
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_CR), DestReg),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_CRBIT), DestReg),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXVD2X), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXSDX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(Subtarget.isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_VRSAVE),
+                                               DestReg),
+                                       FrameIdx));
+    SpillsVRS = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  return false;
+}
+
+void
+PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  bool NonRI = false, SpillsVRS = false;
+  if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs,
+                           NonRI, SpillsVRS))
+    FuncInfo->setSpillsCR();
+
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+
+  const MachineFrameInfo &MFI = *MF.getFrameInfo();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+bool PPCInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+  if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR)
+    Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0);
+  else
+    // Leave the CR# the same, but invert the condition.
+    Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+  return false;
+}
+
+bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                             unsigned Reg, MachineRegisterInfo *MRI) const {
+  // For some instructions, it is legal to fold ZERO into the RA register field.
+  // A zero immediate should always be loaded with a single li.
+  unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc != PPC::LI && DefOpc != PPC::LI8)
+    return false;
+  if (!DefMI->getOperand(1).isImm())
+    return false;
+  if (DefMI->getOperand(1).getImm() != 0)
+    return false;
+
+  // Note that we cannot here invert the arguments of an isel in order to fold
+  // a ZERO into what is presented as the second argument. All we have here
+  // is the condition bit, and that might come from a CR-logical bit operation.
+
+  const MCInstrDesc &UseMCID = UseMI->getDesc();
+
+  // Only fold into real machine instructions.
+  if (UseMCID.isPseudo())
+    return false;
+
+  unsigned UseIdx;
+  for (UseIdx = 0; UseIdx < UseMI->getNumOperands(); ++UseIdx)
+    if (UseMI->getOperand(UseIdx).isReg() &&
+        UseMI->getOperand(UseIdx).getReg() == Reg)
+      break;
+
+  assert(UseIdx < UseMI->getNumOperands() && "Cannot find Reg in UseMI");
+  assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg");
+
+  const MCOperandInfo *UseInfo = &UseMCID.OpInfo[UseIdx];
+
+  // We can fold the zero if this register requires a GPRC_NOR0/G8RC_NOX0
+  // register (which might also be specified as a pointer class kind).
+  if (UseInfo->isLookupPtrRegClass()) {
+    if (UseInfo->RegClass /* Kind */ != 1)
+      return false;
+  } else {
+    if (UseInfo->RegClass != PPC::GPRC_NOR0RegClassID &&
+        UseInfo->RegClass != PPC::G8RC_NOX0RegClassID)
+      return false;
+  }
+
+  // Make sure this is not tied to an output register (or otherwise
+  // constrained). This is true for ST?UX registers, for example, which
+  // are tied to their output registers.
+  if (UseInfo->Constraints != 0)
+    return false;
+
+  unsigned ZeroReg;
+  if (UseInfo->isLookupPtrRegClass()) {
+    bool isPPC64 = Subtarget.isPPC64();
+    ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO;
+  } else {
+    ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ?
+              PPC::ZERO8 : PPC::ZERO;
+  }
+
+  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+  UseMI->getOperand(UseIdx).setReg(ZeroReg);
+
+  if (DeleteDef)
+    DefMI->eraseFromParent();
+
+  return true;
+}
+
+static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+       I != IE; ++I)
+    if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8))
+      return true;
+  return false;
+}
+
+// We should make sure that, if we're going to predicate both sides of a
+// condition (a diamond), that both sides don't define the counter register. We
+// can predicate counter-decrement-based branches, but while that predicates
+// the branching, it does not predicate the counter decrement. If we tried to
+// merge the triangle into one predicated block, we'd decrement the counter
+// twice.
+bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                     unsigned NumT, unsigned ExtraT,
+                     MachineBasicBlock &FMBB,
+                     unsigned NumF, unsigned ExtraF,
+                     const BranchProbability &Probability) const {
+  return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB));
+}
+
+
+bool PPCInstrInfo::isPredicated(const MachineInstr *MI) const {
+  // The predicated branches are identified by their type, not really by the
+  // explicit presence of a predicate. Furthermore, some of them can be
+  // predicated more than once. Because if conversion won't try to predicate
+  // any instruction which already claims to be predicated (by returning true
+  // here), always return false. In doing so, we let isPredicable() be the
+  // final word on whether not the instruction can be (further) predicated.
+
+  return false;
+}
+
+bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator())
+    return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+
+  return !isPredicated(MI);
+}
+
+bool PPCInstrInfo::PredicateInstruction(
+                     MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  unsigned OpC = MI->getOpcode();
+  if (OpC == PPC::BLR) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
+      bool isPPC64 = Subtarget.isPPC64();
+      MI->setDesc(get(Pred[0].getImm() ?
+                      (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) :
+                      (isPPC64 ? PPC::BDZLR8  : PPC::BDZLR)));
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MI->setDesc(get(PPC::BCLR));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MI->setDesc(get(PPC::BCLRn));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+    } else {
+      MI->setDesc(get(PPC::BCCLR));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addImm(Pred[0].getImm())
+        .addReg(Pred[1].getReg());
+    }
+
+    return true;
+  } else if (OpC == PPC::B) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
+      bool isPPC64 = Subtarget.isPPC64();
+      MI->setDesc(get(Pred[0].getImm() ?
+                      (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                      (isPPC64 ? PPC::BDZ8  : PPC::BDZ)));
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+      MI->RemoveOperand(0);
+
+      MI->setDesc(get(PPC::BC));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg())
+        .addMBB(MBB);
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+      MI->RemoveOperand(0);
+
+      MI->setDesc(get(PPC::BCn));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg())
+        .addMBB(MBB);
+    } else {
+      MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+      MI->RemoveOperand(0);
+
+      MI->setDesc(get(PPC::BCC));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addImm(Pred[0].getImm())
+        .addReg(Pred[1].getReg())
+        .addMBB(MBB);
+    }
+
+    return true;
+  } else if (OpC == PPC::BCTR  || OpC == PPC::BCTR8 ||
+             OpC == PPC::BCTRL || OpC == PPC::BCTRL8) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
+      llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
+
+    bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
+    bool isPPC64 = Subtarget.isPPC64();
+
+    if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) :
+                                (setLR ? PPC::BCCTRL  : PPC::BCCTR)));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+      return true;
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) :
+                                (setLR ? PPC::BCCTRLn  : PPC::BCCTRn)));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+      return true;
+    }
+
+    MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) :
+                              (setLR ? PPC::BCCCTRL  : PPC::BCCCTR)));
+    MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+      .addImm(Pred[0].getImm())
+      .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCInstrInfo::SubsumesPredicate(
+                     const SmallVectorImpl<MachineOperand> &Pred1,
+                     const SmallVectorImpl<MachineOperand> &Pred2) const {
+  assert(Pred1.size() == 2 && "Invalid PPC first predicate");
+  assert(Pred2.size() == 2 && "Invalid PPC second predicate");
+
+  if (Pred1[1].getReg() == PPC::CTR8 || Pred1[1].getReg() == PPC::CTR)
+    return false;
+  if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
+    return false;
+
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
+  PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
+  PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
+
+  if (P1 == P2)
+    return true;
+
+  // Does P1 subsume P2, e.g. GE subsumes GT.
+  if (P1 == PPC::PRED_LE &&
+      (P2 == PPC::PRED_LT || P2 == PPC::PRED_EQ))
+    return true;
+  if (P1 == PPC::PRED_GE &&
+      (P2 == PPC::PRED_GT || P2 == PPC::PRED_EQ))
+    return true;
+
+  return false;
+}
+
+bool PPCInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  // Note: At the present time, the contents of Pred from this function is
+  // unused by IfConversion. This implementation follows ARM by pushing the
+  // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
+  // predicate, instructions defining CTR or CTR8 are also included as
+  // predicate-defining instructions.
+
+  const TargetRegisterClass *RCs[] =
+    { &PPC::CRRCRegClass, &PPC::CRBITRCRegClass,
+      &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
+
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) {
+      const TargetRegisterClass *RC = RCs[c];
+      if (MO.isReg()) {
+        if (MO.isDef() && RC->contains(MO.getReg())) {
+          Pred.push_back(MO);
+          Found = true;
+        }
+      } else if (MO.isRegMask()) {
+        for (TargetRegisterClass::iterator I = RC->begin(),
+             IE = RC->end(); I != IE; ++I)
+          if (MO.clobbersPhysReg(*I)) {
+            Pred.push_back(MO);
+            Found = true;
+          }
+      }
+    }
+  }
+
+  return Found;
+}
+
+bool PPCInstrInfo::isPredicable(MachineInstr *MI) const {
+  unsigned OpC = MI->getOpcode();
+  switch (OpC) {
+  default:
+    return false;
+  case PPC::B:
+  case PPC::BLR:
+  case PPC::BCTR:
+  case PPC::BCTR8:
+  case PPC::BCTRL:
+  case PPC::BCTRL8:
+    return true;
+  }
+}
+
+bool PPCInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                  unsigned &SrcReg, unsigned &SrcReg2,
+                                  int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  switch (Opc) {
+  default: return false;
+  case PPC::CMPWI:
+  case PPC::CMPLWI:
+  case PPC::CMPDI:
+  case PPC::CMPLDI:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    Value = MI->getOperand(2).getImm();
+    Mask = 0xFFFF;
+    return true;
+  case PPC::CMPW:
+  case PPC::CMPLW:
+  case PPC::CMPD:
+  case PPC::CMPLD:
+  case PPC::FCMPUS:
+  case PPC::FCMPUD:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    return true;
+  }
+}
+
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
+                                        unsigned SrcReg, unsigned SrcReg2,
+                                        int Mask, int Value,
+                                        const MachineRegisterInfo *MRI) const {
+  if (DisableCmpOpt)
+    return false;
+
+  int OpC = CmpInstr->getOpcode();
+  unsigned CRReg = CmpInstr->getOperand(0).getReg();
+
+  // FP record forms set CR1 based on the execption status bits, not a
+  // comparison with zero.
+  if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
+    return false;
+
+  // The record forms set the condition register based on a signed comparison
+  // with zero (so says the ISA manual). This is not as straightforward as it
+  // seems, however, because this is always a 64-bit comparison on PPC64, even
+  // for instructions that are 32-bit in nature (like slw for example).
+  // So, on PPC32, for unsigned comparisons, we can use the record forms only
+  // for equality checks (as those don't depend on the sign). On PPC64,
+  // we are restricted to equality for unsigned 64-bit comparisons and for
+  // signed 32-bit comparisons the applicability is more restricted.
+  bool isPPC64 = Subtarget.isPPC64();
+  bool is32BitSignedCompare   = OpC ==  PPC::CMPWI || OpC == PPC::CMPW;
+  bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
+  bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+  int MIOpC = MI->getOpcode();
+
+  bool equalityOnly = false;
+  bool noSub = false;
+  if (isPPC64) {
+    if (is32BitSignedCompare) {
+      // We can perform this optimization only if MI is sign-extending.
+      if (MIOpC == PPC::SRAW  || MIOpC == PPC::SRAWo ||
+          MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo ||
+          MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo ||
+          MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo ||
+          MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) {
+        noSub = true;
+      } else
+        return false;
+    } else if (is32BitUnsignedCompare) {
+      // We can perform this optimization, equality only, if MI is
+      // zero-extending.
+      if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
+          MIOpC == PPC::SLW    || MIOpC == PPC::SLWo ||
+          MIOpC == PPC::SRW    || MIOpC == PPC::SRWo) {
+        noSub = true;
+        equalityOnly = true;
+      } else
+        return false;
+    } else
+      equalityOnly = is64BitUnsignedCompare;
+  } else
+    equalityOnly = is32BitUnsignedCompare;
+
+  if (equalityOnly) {
+    // We need to check the uses of the condition register in order to reject
+    // non-equality comparisons.
+    for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg),
+         IE = MRI->use_instr_end(); I != IE; ++I) {
+      MachineInstr *UseMI = &*I;
+      if (UseMI->getOpcode() == PPC::BCC) {
+        unsigned Pred = UseMI->getOperand(0).getImm();
+        if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE)
+          return false;
+      } else if (UseMI->getOpcode() == PPC::ISEL ||
+                 UseMI->getOpcode() == PPC::ISEL8) {
+        unsigned SubIdx = UseMI->getOperand(3).getSubReg();
+        if (SubIdx != PPC::sub_eq)
+          return false;
+      } else
+        return false;
+    }
+  }
+
+  MachineBasicBlock::iterator I = CmpInstr;
+
+  // Scan forward to find the first use of the compare.
+  for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end();
+       I != EL; ++I) {
+    bool FoundUse = false;
+    for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
+         JE = MRI->use_instr_end(); J != JE; ++J)
+      if (&*J == &*I) {
+        FoundUse = true;
+        break;
+      }
+
+    if (FoundUse)
+      break;
+  }
+
+  // There are two possible candidates which can be changed to set CR[01].
+  // One is MI, the other is a SUB instruction.
+  // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  MachineInstr *Sub = nullptr;
+  if (SrcReg2 != 0)
+    // MI is not a candidate for CMPrr.
+    MI = nullptr;
+  // FIXME: Conservatively refuse to convert an instruction which isn't in the
+  // same BB as the comparison. This is to allow the check below to avoid calls
+  // (and other explicit clobbers); instead we should really check for these
+  // more explicitly (in at least a few predecessors).
+  else if (MI->getParent() != CmpInstr->getParent() || Value != 0) {
+    // PPC does not have a record-form SUBri.
+    return false;
+  }
+
+  // Search for Sub.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  --I;
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator E = MI,
+                              B = CmpInstr->getParent()->begin();
+
+  for (; I != E && !noSub; --I) {
+    const MachineInstr &Instr = *I;
+    unsigned IOpC = Instr.getOpcode();
+
+    if (&*I != CmpInstr && (
+        Instr.modifiesRegister(PPC::CR0, TRI) ||
+        Instr.readsRegister(PPC::CR0, TRI)))
+      // This instruction modifies or uses the record condition register after
+      // the one we want to change. While we could do this transformation, it
+      // would likely not be profitable. This transformation removes one
+      // instruction, and so even forcing RA to generate one move probably
+      // makes it unprofitable.
+      return false;
+
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if ((OpC == PPC::CMPW || OpC == PPC::CMPLW ||
+         OpC == PPC::CMPD || OpC == PPC::CMPLD) &&
+        (IOpC == PPC::SUBF || IOpC == PPC::SUBF8) &&
+        ((Instr.getOperand(1).getReg() == SrcReg &&
+          Instr.getOperand(2).getReg() == SrcReg2) ||
+        (Instr.getOperand(1).getReg() == SrcReg2 &&
+         Instr.getOperand(2).getReg() == SrcReg))) {
+      Sub = &*I;
+      break;
+    }
+
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  // Return false if no candidates exist.
+  if (!MI && !Sub)
+    return false;
+
+  // The single candidate is called MI.
+  if (!MI) MI = Sub;
+
+  int NewOpC = -1;
+  MIOpC = MI->getOpcode();
+  if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8)
+    NewOpC = MIOpC;
+  else {
+    NewOpC = PPC::getRecordFormOpcode(MIOpC);
+    if (NewOpC == -1 && PPC::getNonRecordFormOpcode(MIOpC) != -1)
+      NewOpC = MIOpC;
+  }
+
+  // FIXME: On the non-embedded POWER architectures, only some of the record
+  // forms are fast, and we should use only the fast ones.
+
+  // The defining instruction has a record form (or is already a record
+  // form). It is possible, however, that we'll need to reverse the condition
+  // code of the users.
+  if (NewOpC == -1)
+    return false;
+
+  SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
+  SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
+
+  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
+  // needs to be updated to be based on SUB.  Push the condition code
+  // operands to OperandsToUpdate.  If it is safe to remove CmpInstr, the
+  // condition code of these operands will be modified.
+  bool ShouldSwap = false;
+  if (Sub) {
+    ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+      Sub->getOperand(2).getReg() == SrcReg;
+
+    // The operands to subf are the opposite of sub, so only in the fixed-point
+    // case, invert the order.
+    ShouldSwap = !ShouldSwap;
+  }
+
+  if (ShouldSwap)
+    for (MachineRegisterInfo::use_instr_iterator
+         I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
+         I != IE; ++I) {
+      MachineInstr *UseMI = &*I;
+      if (UseMI->getOpcode() == PPC::BCC) {
+        PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
+        assert((!equalityOnly ||
+                Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
+               "Invalid predicate for equality-only optimization");
+        PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+                                PPC::getSwappedPredicate(Pred)));
+      } else if (UseMI->getOpcode() == PPC::ISEL ||
+                 UseMI->getOpcode() == PPC::ISEL8) {
+        unsigned NewSubReg = UseMI->getOperand(3).getSubReg();
+        assert((!equalityOnly || NewSubReg == PPC::sub_eq) &&
+               "Invalid CR bit for equality-only optimization");
+
+        if (NewSubReg == PPC::sub_lt)
+          NewSubReg = PPC::sub_gt;
+        else if (NewSubReg == PPC::sub_gt)
+          NewSubReg = PPC::sub_lt;
+
+        SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)),
+                                                 NewSubReg));
+      } else // We need to abort on a user we don't understand.
+        return false;
+    }
+
+  // Create a new virtual register to hold the value of the CR set by the
+  // record-form instruction. If the instruction was not previously in
+  // record form, then set the kill flag on the CR.
+  CmpInstr->eraseFromParent();
+
+  MachineBasicBlock::iterator MII = MI;
+  BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(),
+          get(TargetOpcode::COPY), CRReg)
+    .addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
+
+  if (MIOpC != NewOpC) {
+    // We need to be careful here: we're replacing one instruction with
+    // another, and we need to make sure that we get all of the right
+    // implicit uses and defs. On the other hand, the caller may be holding
+    // an iterator to this instruction, and so we can't delete it (this is
+    // specifically the case if this is the instruction directly after the
+    // compare).
+
+    const MCInstrDesc &NewDesc = get(NewOpC);
+    MI->setDesc(NewDesc);
+
+    if (NewDesc.ImplicitDefs)
+      for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs();
+           *ImpDefs; ++ImpDefs)
+        if (!MI->definesRegister(*ImpDefs))
+          MI->addOperand(*MI->getParent()->getParent(),
+                         MachineOperand::CreateReg(*ImpDefs, true, true));
+    if (NewDesc.ImplicitUses)
+      for (const uint16_t *ImpUses = NewDesc.getImplicitUses();
+           *ImpUses; ++ImpUses)
+        if (!MI->readsRegister(*ImpUses))
+          MI->addOperand(*MI->getParent()->getParent(),
+                         MachineOperand::CreateReg(*ImpUses, false, true));
+  }
+
+  // Modify the condition code of operands in OperandsToUpdate.
+  // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+  // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+  for (unsigned i = 0, e = PredsToUpdate.size(); i < e; i++)
+    PredsToUpdate[i].first->setImm(PredsToUpdate[i].second);
+
+  for (unsigned i = 0, e = SubRegsToUpdate.size(); i < e; i++)
+    SubRegsToUpdate[i].first->setSubReg(SubRegsToUpdate[i].second);
+
+  return true;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == PPC::INLINEASM) {
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  } else {
+    const MCInstrDesc &Desc = get(Opcode);
+    return Desc.getSize();
+  }
+}
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace {
+  // PPCVSXFMAMutate pass - For copies between VSX registers and non-VSX registers
+  // (Altivec and scalar floating-point registers), we need to transform the
+  // copies into subregister copies with other restrictions.
+  struct PPCVSXFMAMutate : public MachineFunctionPass {
+    static char ID;
+    PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+      initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    }
+
+    LiveIntervals *LIS;
+
+    const PPCTargetMachine *TM;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+
+        // The default (A-type) VSX FMA form kills the addend (it is taken from
+        // the target register, which is then updated to reflect the result of
+        // the FMA). If the instruction, however, kills one of the registers
+        // used for the product, then we can use the M-form instruction (which
+        // will take that value from the to-be-defined register).
+
+        int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+        if (AltOpc == -1)
+          continue;
+
+        // This pass is run after register coalescing, and so we're looking for
+        // a situation like this:
+        //   ...
+        //   %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        //   ...
+        //   %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+        //                         %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+        //   ...
+        // Where we can eliminate the copy by changing from the A-type to the
+        // M-type instruction. Specifically, for this example, this means:
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        // is replaced by:
+        //   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
+        //                         %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
+        // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+        SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+        VNInfo *AddendValNo =
+          LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
+        MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+        // The addend and this instruction must be in the same block.
+
+        if (!AddendMI || AddendMI->getParent() != MI->getParent())
+          continue;
+
+        // The addend must be a full copy within the same register class.
+
+        if (!AddendMI->isFullCopy())
+          continue;
+
+        unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+          if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+              MRI.getRegClass(AddendSrcReg))
+            continue;
+        } else {
+          // If AddendSrcReg is a physical register, make sure the destination
+          // register class contains it.
+          if (!MRI.getRegClass(AddendMI->getOperand(0).getReg())
+                ->contains(AddendSrcReg))
+            continue;
+        }
+
+        // In theory, there could be other uses of the addend copy before this
+        // fma.  We could deal with this, but that would require additional
+        // logic below and I suspect it will not occur in any relevant
+        // situations.
+        bool OtherUsers = false;
+        for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
+             J != JE; --J)
+          if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
+            OtherUsers = true;
+            break;
+          }
+
+        if (OtherUsers)
+          continue;
+
+        // Find one of the product operands that is killed by this instruction.
+
+        unsigned KilledProdOp = 0, OtherProdOp = 0;
+        if (LIS->getInterval(MI->getOperand(2).getReg())
+                     .Query(FMAIdx).isKill()) {
+          KilledProdOp = 2;
+          OtherProdOp  = 3;
+        } else if (LIS->getInterval(MI->getOperand(3).getReg())
+                     .Query(FMAIdx).isKill()) {
+          KilledProdOp = 3;
+          OtherProdOp  = 2;
+        }
+
+        // If there are no killed product operands, then this transformation is
+        // likely not profitable.
+        if (!KilledProdOp)
+          continue;
+
+        // In order to replace the addend here with the source of the copy,
+        // it must still be live here.
+        if (!LIS->getInterval(AddendMI->getOperand(1).getReg()).liveAt(FMAIdx))
+          continue;
+
+        // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
+
+        unsigned AddReg = AddendMI->getOperand(1).getReg();
+        unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
+        unsigned OtherProdReg  = MI->getOperand(OtherProdOp).getReg();
+
+        unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
+        unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg();
+        unsigned OtherProdSubReg  = MI->getOperand(OtherProdOp).getSubReg();
+
+        bool AddRegKill = AddendMI->getOperand(1).isKill();
+        bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill();
+        bool OtherProdRegKill  = MI->getOperand(OtherProdOp).isKill();
+
+        bool AddRegUndef = AddendMI->getOperand(1).isUndef();
+        bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
+        bool OtherProdRegUndef  = MI->getOperand(OtherProdOp).isUndef();
+
+        unsigned OldFMAReg = MI->getOperand(0).getReg();
+
+        assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
+               "Addend copy not tied to old FMA output!");
+
+        DEBUG(dbgs() << "VSX FMA Mutation:\n    " << *MI;);
+
+        MI->getOperand(0).setReg(KilledProdReg);
+        MI->getOperand(1).setReg(KilledProdReg);
+        MI->getOperand(3).setReg(AddReg);
+        MI->getOperand(2).setReg(OtherProdReg);
+
+        MI->getOperand(0).setSubReg(KilledProdSubReg);
+        MI->getOperand(1).setSubReg(KilledProdSubReg);
+        MI->getOperand(3).setSubReg(AddSubReg);
+        MI->getOperand(2).setSubReg(OtherProdSubReg);
+
+        MI->getOperand(1).setIsKill(KilledProdRegKill);
+        MI->getOperand(3).setIsKill(AddRegKill);
+        MI->getOperand(2).setIsKill(OtherProdRegKill);
+
+        MI->getOperand(1).setIsUndef(KilledProdRegUndef);
+        MI->getOperand(3).setIsUndef(AddRegUndef);
+        MI->getOperand(2).setIsUndef(OtherProdRegUndef);
+
+        MI->setDesc(TII->get(AltOpc));
+
+        DEBUG(dbgs() << " -> " << *MI);
+
+        // The killed product operand was killed here, so we can reuse it now
+        // for the result of the fma.
+
+        LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
+        VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
+        for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
+             UI != UE;) {
+          MachineOperand &UseMO = *UI;
+          MachineInstr *UseMI = UseMO.getParent();
+          ++UI;
+
+          // Don't replace the result register of the copy we're about to erase.
+          if (UseMI == AddendMI)
+            continue;
+
+          UseMO.setReg(KilledProdReg);
+          UseMO.setSubReg(KilledProdSubReg);
+        }
+
+        // Extend the live intervals of the killed product operand to hold the
+        // fma result.
+
+        LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
+        for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
+             AI != AE; ++AI) {
+          // Don't add the segment that corresponds to the original copy.
+          if (AI->valno == AddendValNo)
+            continue;
+
+          VNInfo *NewFMAValNo =
+            NewFMAInt.getNextValue(AI->start,
+                                   LIS->getVNInfoAllocator());
+
+          NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
+                                                     NewFMAValNo));
+        }
+        DEBUG(dbgs() << "  extended: " << NewFMAInt << '\n');
+
+        FMAInt.removeValNo(FMAValNo);
+        DEBUG(dbgs() << "  trimmed:  " << FMAInt << '\n');
+
+        // Remove the (now unused) copy.
+
+        DEBUG(dbgs() << "  removing: " << *AddendMI << '\n');
+        LIS->RemoveMachineInstrFromMaps(AddendMI);
+        AddendMI->eraseFromParent();
+
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      // If we don't have VSX then go ahead and return without doing
+      // anything.
+      if (!TM->getSubtargetImpl()->hasVSX())
+        return false;
+
+      LIS = &getAnalysis<LiveIntervals>();
+
+      TII = TM->getInstrInfo();
+
+      bool Changed = false;
+
+      if (DisableVSXFMAMutate)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
+                      "PowerPC VSX FMA Mutation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
+                    "PowerPC VSX FMA Mutation", false, false)
+
+char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
+
+char PPCVSXFMAMutate::ID = 0;
+FunctionPass*
+llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace llvm {
+  void initializePPCVSXCopyPass(PassRegistry&);
+}
+
+namespace {
+  // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
+  // (Altivec and scalar floating-point registers), we need to transform the
+  // copies into subregister copies with other restrictions.
+  struct PPCVSXCopy : public MachineFunctionPass {
+    static char ID;
+    PPCVSXCopy() : MachineFunctionPass(ID) {
+      initializePPCVSXCopyPass(*PassRegistry::getPassRegistry());
+    }
+
+    const PPCTargetMachine *TM;
+    const PPCInstrInfo *TII;
+
+    bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+                      MachineRegisterInfo &MRI) {
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        return RC->hasSubClassEq(MRI.getRegClass(Reg));
+      } else if (RC->contains(Reg)) {
+        return true;
+      }
+
+      return false;
+    }
+
+    bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+    }
+
+    bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+    }
+
+    bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+    }
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+        if (!MI->isFullCopy())
+          continue;
+
+        MachineOperand &DstMO = MI->getOperand(0);
+        MachineOperand &SrcMO = MI->getOperand(1);
+
+        if ( IsVSReg(DstMO.getReg(), MRI) &&
+            !IsVSReg(SrcMO.getReg(), MRI)) {
+          // This is a copy *to* a VSX register from a non-VSX register.
+          Changed = true;
+
+          const TargetRegisterClass *SrcRC =
+            IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
+                                           &PPC::VSLRCRegClass;
+          assert((IsF8Reg(SrcMO.getReg(), MRI) ||
+                  IsVRReg(SrcMO.getReg(), MRI)) &&
+                 "Unknown source for a VSX copy");
+
+          unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
+          BuildMI(MBB, MI, MI->getDebugLoc(),
+                  TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+            .addImm(1) // add 1, not 0, because there is no implicit clearing
+                       // of the high bits.
+            .addOperand(SrcMO)
+            .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128 :
+                                                   PPC::sub_64);
+
+          // The source of the original copy is now the new virtual register.
+          SrcMO.setReg(NewVReg);
+        } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+                    IsVSReg(SrcMO.getReg(), MRI)) {
+          // This is a copy *from* a VSX register to a non-VSX register.
+          Changed = true;
+
+          const TargetRegisterClass *DstRC =
+            IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
+                                           &PPC::VSLRCRegClass;
+          assert((IsF8Reg(DstMO.getReg(), MRI) ||
+                  IsVRReg(DstMO.getReg(), MRI)) &&
+                 "Unknown destination for a VSX copy");
+
+          // Copy the VSX value into a new VSX register of the correct subclass.
+          unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+          BuildMI(MBB, MI, MI->getDebugLoc(),
+                  TII->get(TargetOpcode::COPY), NewVReg)
+            .addOperand(SrcMO);
+
+          // Transform the original copy into a subregister extraction copy.
+          SrcMO.setReg(NewVReg);
+          SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 :
+                                                         PPC::sub_64);
+        }
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      // If we don't have VSX on the subtarget, don't do anything.
+      if (!TM->getSubtargetImpl()->hasVSX())
+        return false;
+      TII = TM->getInstrInfo();
+
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
+                "PowerPC VSX Copy Legalization", false, false)
+
+char PPCVSXCopy::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-copy-cleanup"
+
+namespace llvm {
+  void initializePPCVSXCopyCleanupPass(PassRegistry&);
+}
+
+namespace {
+  // PPCVSXCopyCleanup pass - We sometimes end up generating self copies of VSX
+  // registers (mostly because the ABI code still places all values into the
+  // "traditional" floating-point and vector registers). Remove them here.
+  struct PPCVSXCopyCleanup : public MachineFunctionPass {
+    static char ID;
+    PPCVSXCopyCleanup() : MachineFunctionPass(ID) {
+      initializePPCVSXCopyCleanupPass(*PassRegistry::getPassRegistry());
+    }
+
+    const PPCTargetMachine *TM;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      SmallVector<MachineInstr *, 4> ToDelete;
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+        if (MI->getOpcode() == PPC::XXLOR &&
+            MI->getOperand(0).getReg() == MI->getOperand(1).getReg() &&
+            MI->getOperand(0).getReg() == MI->getOperand(2).getReg())
+          ToDelete.push_back(MI);
+      }
+
+      if (!ToDelete.empty())
+        Changed = true;
+
+      for (unsigned i = 0, ie = ToDelete.size(); i != ie; ++i) {
+        DEBUG(dbgs() << "Removing VSX self-copy: " << *ToDelete[i]);
+        ToDelete[i]->eraseFromParent();
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      // If we don't have VSX don't bother doing anything here.
+      if (!TM->getSubtargetImpl()->hasVSX())
+        return false;
+      TII = TM->getInstrInfo();
+
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCVSXCopyCleanup, DEBUG_TYPE,
+                "PowerPC VSX Copy Cleanup", false, false)
+
+char PPCVSXCopyCleanup::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyCleanupPass() { return new PPCVSXCopyCleanup(); }
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-early-ret"
+STATISTIC(NumBCLR, "Number of early conditional returns");
+STATISTIC(NumBLR,  "Number of early returns");
+
+namespace llvm {
+  void initializePPCEarlyReturnPass(PassRegistry&);
+}
+
+namespace {
+  // PPCEarlyReturn pass - For simple functions without epilogue code, move
+  // returns up, and create conditional returns, to avoid unnecessary
+  // branch-to-blr sequences.
+  struct PPCEarlyReturn : public MachineFunctionPass {
+    static char ID;
+    PPCEarlyReturn() : MachineFunctionPass(ID) {
+      initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry());
+    }
+
+    const PPCTargetMachine *TM;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &ReturnMBB) {
+      bool Changed = false;
+
+      MachineBasicBlock::iterator I = ReturnMBB.begin();
+      I = ReturnMBB.SkipPHIsAndLabels(I);
+
+      // The block must be essentially empty except for the blr.
+      if (I == ReturnMBB.end() || I->getOpcode() != PPC::BLR ||
+          I != ReturnMBB.getLastNonDebugInstr())
+        return Changed;
+
+      SmallVector<MachineBasicBlock*, 8> PredToRemove;
+      for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
+           PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
+        bool OtherReference = false, BlockChanged = false;
+        for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
+          if (J->getOpcode() == PPC::B) {
+            if (J->getOperand(0).getMBB() == &ReturnMBB) {
+              // This is an unconditional branch to the return. Replace the
+              // branch with a blr.
+              BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BLR));
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BCC) {
+            if (J->getOperand(2).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+                .addImm(J->getOperand(0).getImm())
+                .addReg(J->getOperand(1).getReg());
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
+            if (J->getOperand(1).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              BuildMI(**PI, J, J->getDebugLoc(),
+                      TII->get(J->getOpcode() == PPC::BC ?
+                               PPC::BCLR : PPC::BCLRn))
+                .addReg(J->getOperand(0).getReg());
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->isBranch()) {
+            if (J->isIndirectBranch()) {
+              if (ReturnMBB.hasAddressTaken())
+                OtherReference = true;
+            } else
+              for (unsigned i = 0; i < J->getNumOperands(); ++i)
+                if (J->getOperand(i).isMBB() &&
+                    J->getOperand(i).getMBB() == &ReturnMBB)
+                  OtherReference = true;
+          } else if (!J->isTerminator() && !J->isDebugValue())
+            break;
+
+          if (J == (*PI)->begin())
+            break;
+
+          --J;
+        }
+
+        if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
+          OtherReference = true;
+
+        // Predecessors are stored in a vector and can't be removed here.
+        if (!OtherReference && BlockChanged) {
+          PredToRemove.push_back(*PI);
+        }
+
+        if (BlockChanged)
+          Changed = true;
+      }
+
+      for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i)
+        PredToRemove[i]->removeSuccessor(&ReturnMBB);
+
+      if (Changed && !ReturnMBB.hasAddressTaken()) {
+        // We now might be able to merge this blr-only block into its
+        // by-layout predecessor.
+        if (ReturnMBB.pred_size() == 1 &&
+            (*ReturnMBB.pred_begin())->isLayoutSuccessor(&ReturnMBB)) {
+          // Move the blr into the preceding block.
+          MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin();
+          PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I);
+          PrevMBB.removeSuccessor(&ReturnMBB);
+        }
+
+        if (ReturnMBB.pred_empty())
+          ReturnMBB.eraseFromParent();
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      TII = TM->getInstrInfo();
+
+      bool Changed = false;
+
+      // If the function does not have at least two blocks, then there is
+      // nothing to do.
+      if (MF.size() < 2)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
+                "PowerPC Early-Return Creation", false, false)
+
+char PPCEarlyReturn::ID = 0;
+FunctionPass*
+llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 0000000..83f14c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,235 @@
+//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRUCTIONINFO_H
+#define POWERPC_INSTRUCTIONINFO_H
+
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "PPCGenInstrInfo.inc"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags.  These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
+namespace PPCII {
+enum {
+  // PPC970 Instruction Flags.  These flags describe the characteristics of the
+  // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
+  // raw machine instructions.
+
+  /// PPC970_First - This instruction starts a new dispatch group, so it will
+  /// always be the first one in the group.
+  PPC970_First = 0x1,
+
+  /// PPC970_Single - This instruction starts a new dispatch group and
+  /// terminates it, so it will be the sole instruction in the group.
+  PPC970_Single = 0x2,
+
+  /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
+  /// two dispatch pipes to be available to issue.
+  PPC970_Cracked = 0x4,
+
+  /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that
+  /// an instruction is issued to.
+  PPC970_Shift = 3,
+  PPC970_Mask = 0x07 << PPC970_Shift
+};
+enum PPC970_Unit {
+  /// These are the various PPC970 execution unit pipelines.  Each instruction
+  /// is one of these.
+  PPC970_Pseudo = 0 << PPC970_Shift,   // Pseudo instruction
+  PPC970_FXU    = 1 << PPC970_Shift,   // Fixed Point (aka Integer/ALU) Unit
+  PPC970_LSU    = 2 << PPC970_Shift,   // Load Store Unit
+  PPC970_FPU    = 3 << PPC970_Shift,   // Floating Point Unit
+  PPC970_CRU    = 4 << PPC970_Shift,   // Control Register Unit
+  PPC970_VALU   = 5 << PPC970_Shift,   // Vector ALU
+  PPC970_VPERM  = 6 << PPC970_Shift,   // Vector Permute Unit
+  PPC970_BRU    = 7 << PPC970_Shift    // Branch Unit
+};
+} // end namespace PPCII
+
+
+class PPCInstrInfo : public PPCGenInstrInfo {
+  PPCSubtarget &Subtarget;
+  const PPCRegisterInfo RI;
+
+  bool StoreRegToStackSlot(MachineFunction &MF,
+                           unsigned SrcReg, bool isKill, int FrameIdx,
+                           const TargetRegisterClass *RC,
+                           SmallVectorImpl<MachineInstr*> &NewMIs,
+                           bool &NonRI, bool &SpillsVRS) const;
+  bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            SmallVectorImpl<MachineInstr*> &NewMIs,
+                            bool &NonRI, bool &SpillsVRS) const;
+  virtual void anchor();
+public:
+  explicit PPCInstrInfo(PPCSubtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                               const ScheduleDAG *DAG) const override;
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx,
+                        const MachineInstr *UseMI,
+                        unsigned UseIdx) const override;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const override {
+    return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
+                                              UseNode, UseIdx);
+  }
+
+  bool isCoalescableExtInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SubIdx) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  // commuteInstruction - We can commute rlwimi instructions, but only if the
+  // rotate amt is zero.  We also have to munge the immediates a bit.
+  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
+
+  bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
+
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+
+
+  // Branch analysis.
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  // Select analysis.
+  bool canInsertSelect(const MachineBasicBlock&,
+                       const SmallVectorImpl<MachineOperand> &Cond,
+                       unsigned, unsigned, int&, int&, int&) const override;
+  void insertSelect(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator MI, DebugLoc DL,
+                    unsigned DstReg,
+                    const SmallVectorImpl<MachineOperand> &Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                     unsigned Reg, MachineRegisterInfo *MRI) const override;
+
+  // If conversion by predication (only supported by some branch instructions).
+  // All of the profitability checks always return true; it is always
+  // profitable to use the predicated branches.
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+                          unsigned NumCycles, unsigned ExtraPredCycles,
+                          const BranchProbability &Probability) const override {
+    return true;
+  }
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumT, unsigned ExtraT,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumF, unsigned ExtraF,
+                           const BranchProbability &Probability) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                 unsigned NumCycles,
+                                 const BranchProbability
+                                 &Probability) const override {
+    return true;
+  }
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const override {
+    return false;
+  }
+
+  // Predication support.
+  bool isPredicated(const MachineInstr *MI) const override;
+
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const override;
+
+  bool PredicateInstruction(MachineInstr *MI,
+                    const SmallVectorImpl<MachineOperand> &Pred) const override;
+
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
+
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const override;
+
+  bool isPredicable(MachineInstr *MI) const override;
+
+  // Comparison optimization.
+
+
+  bool analyzeCompare(const MachineInstr *MI,
+                      unsigned &SrcReg, unsigned &SrcReg2,
+                      int &Mask, int &Value) const override;
+
+  bool optimizeCompareInstr(MachineInstr *CmpInstr,
+                            unsigned SrcReg, unsigned SrcReg2,
+                            int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const override;
+
+  /// GetInstSize - Return the number of bytes of code the specified
+  /// instruction may be.  This returns the maximum number of bytes.
+  ///
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 0000000..636ac5d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,3424 @@
+//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+def SDT_PPCvperm   : SDTypeProfile<1, 3, [
+  SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClarx : SDTypeProfile<1, 1, [
+  SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstcx : SDTypeProfile<0, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+  SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+def tocentry32 : Operand<iPTR> {
+  let MIOperandInfo = (ops i32imm:$imm);
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
+def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+
+def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
+def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
+def PPCfcfids : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
+def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
+def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
+                       [SDNPHasChain, SDNPMayStore]>;
+def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+
+// Extract FPSCR (not modeled at the DAG level).
+def PPCmffs   : SDNode<"PPCISD::MFFS",
+                       SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;
+
+// Perform FADD in round-to-zero mode.
+def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
+
+
+def PPCfsel   : SDNode<"PPCISD::FSEL",  
+   // Type constraint for fsel.
+   SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, 
+                        SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+def PPChi       : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo       : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>;
+def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;
+
+def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
+def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
+                            [SDNPMayLoad]>;
+def PPCaddTls     : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
+def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
+def PPCaddiTlsgdL   : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
+def PPCgetTlsAddr   : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
+def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
+def PPCaddiTlsldL   : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
+def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
+def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp,
+                              [SDNPHasChain]>;
+def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
+
+def PPCvperm    : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts.  These nodes are generated by the multi-precision shift code.
+def PPCsrl        : SDNode<"PPCISD::SRL"       , SDTIntShiftOp>;
+def PPCsra        : SDNode<"PPCISD::SRA"       , SDTIntShiftOp>;
+def PPCshl        : SDNode<"PPCISD::SHL"       , SDTIntShiftOp>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_PPCCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_PPCCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall  : SDNode<"PPCISD::CALL", SDT_PPCCall,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+def PPCcall_nop  : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                           SDNPVariadic]>;
+def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
+                          [SDNPHasChain, SDNPSideEffect,
+                           SDNPInGlue, SDNPOutGlue]>;
+def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+
+def retflag       : SDNode<"PPCISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCeh_sjlj_setjmp  : SDNode<"PPCISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+
+def SDT_PPCsc     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def PPCsc         : SDNode<"PPCISD::SC", SDT_PPCsc,
+                           [SDNPHasChain, SDNPSideEffect]>;
+
+def PPCvcmp       : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+                           [SDNPHasChain, SDNPOptInGlue]>;
+
+def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
+                           [SDNPHasChain, SDNPMayLoad]>;
+def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set   : SDNode<"PPCISD::CR6SET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Instructions to support atomic operations
+def PPClarx      : SDNode<"PPCISD::LARX", SDT_PPClarx,
+                          [SDNPHasChain, SDNPMayLoad]>;
+def PPCstcx      : SDNode<"PPCISD::STCX", SDT_PPCstcx,
+                          [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to support medium and large code model
+def PPCaddisTocHA : SDNode<"PPCISD::ADDIS_TOC_HA", SDTIntBinOp, []>;
+def PPCldTocL     : SDNode<"PPCISD::LD_TOC_L", SDTIntBinOp, [SDNPMayLoad]>;
+def PPCaddiTocL   : SDNode<"PPCISD::ADDI_TOC_L", SDTIntBinOp, []>;
+
+
+// Instructions to support dynamic alloca.
+def SDTDynOp  : SDTypeProfile<1, 2, []>;
+def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+  // Transformation function: 31 - imm
+  return getI32Imm(31 - N->getZExtValue());
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+  // Transformation function: 32 - imm
+  return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue()) : getI32Imm(0);
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+  // Transformation function: get the low 16 bits.
+  return getI32Imm((unsigned short)N->getZExtValue());
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  signed int Val = N->getZExtValue();
+  return getI32Imm((Val - (signed short)Val) >> 16);
+}]>;
+def MB : SDNodeXForm<imm, [{
+  // Transformation function: get the start bit of a mask
+  unsigned mb = 0, me;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(mb);
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+  // Transformation function: get the end bit of a mask
+  unsigned mb, me = 0;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(me);
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskImm predicate - True if immediate is a run of ones.
+  unsigned mb, me;
+  if (N->getValueType(0) == MVT::i32)
+    return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  else
+    return false;
+}]>;
+
+def imm32SExt16  : Operand<i32>, ImmLeaf<i32, [{
+  // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int32_t)Imm == (short)Imm;
+}]>;
+def imm64SExt16  : Operand<i64>, ImmLeaf<i64, [{
+  // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int64_t)Imm == (short)Imm;
+}]>;
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.  Used by instructions like 'ori'.
+  return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero.  There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'xoris'.
+  return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'addis'.  Identical to 
+  // imm16ShiftedZExt in 32-bit mode.
+  if (N->getZExtValue() & 0xFFFF) return false;
+  if (N->getValueType(0) == MVT::i32)
+    return true;
+  // For 64-bit, make sure it is sext right.
+  return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
+
+def imm64ZExt32  : Operand<i64>, ImmLeaf<i64, [{
+  // imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit
+  // zero extended field.
+  return isUInt<32>(Imm);
+}]>;
+
+// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// restricted memrix (4-aligned) constants are alignment sensitive. If these
+// offsets are hidden behind TOC entries than the values of the lower-order
+// bits cannot be checked directly. As a result, we need to also incorporate
+// an alignment check into the relevant patterns.
+
+def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4store : PatFrag<(ops node:$val, node:$ptr),
+                            (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4pre_store : PatFrag<
+                          (ops node:$val, node:$base, node:$offset),
+                          (pre_store node:$val, node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+class isPPC64 { bit PPC64 = 1; }
+class isDOT   { bit RC = 1; }
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+class NoEncode<string E> {
+  string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+// In the default PowerPC assembler syntax, registers are specified simply
+// by number, so they cannot be distinguished from immediate values (without
+// looking at the opcode).  This means that the default operand matching logic
+// for the asm parser does not work, and we need to specify custom matchers.
+// Since those can only be specified with RegisterOperand classes and not
+// directly on the RegisterClass, all instructions patterns used by the asm
+// parser need to use a RegisterOperand (instead of a RegisterClass) for
+// all their register operands.
+// For this purpose, we define one RegisterOperand for each RegisterClass,
+// using the same name as the class, just in lower case.
+
+def PPCRegGPRCAsmOperand : AsmOperandClass {
+  let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
+}
+def gprc : RegisterOperand<GPRC> {
+  let ParserMatchClass = PPCRegGPRCAsmOperand;
+}
+def PPCRegG8RCAsmOperand : AsmOperandClass {
+  let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
+}
+def g8rc : RegisterOperand<G8RC> {
+  let ParserMatchClass = PPCRegG8RCAsmOperand;
+}
+def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
+  let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
+  let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
+}
+def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
+  let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
+}
+def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
+  let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
+}
+def PPCRegF8RCAsmOperand : AsmOperandClass {
+  let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
+}
+def f8rc : RegisterOperand<F8RC> {
+  let ParserMatchClass = PPCRegF8RCAsmOperand;
+}
+def PPCRegF4RCAsmOperand : AsmOperandClass {
+  let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
+}
+def f4rc : RegisterOperand<F4RC> {
+  let ParserMatchClass = PPCRegF4RCAsmOperand;
+}
+def PPCRegVRRCAsmOperand : AsmOperandClass {
+  let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
+}
+def vrrc : RegisterOperand<VRRC> {
+  let ParserMatchClass = PPCRegVRRCAsmOperand;
+}
+def PPCRegCRBITRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
+}
+def crbitrc : RegisterOperand<CRBITRC> {
+  let ParserMatchClass = PPCRegCRBITRCAsmOperand;
+}
+def PPCRegCRRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
+}
+def crrc : RegisterOperand<CRRC> {
+  let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+
+def PPCU2ImmAsmOperand : AsmOperandClass {
+  let Name = "U2Imm"; let PredicateMethod = "isU2Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u2imm   : Operand<i32> {
+  let PrintMethod = "printU2ImmOperand";
+  let ParserMatchClass = PPCU2ImmAsmOperand;
+}
+def PPCS5ImmAsmOperand : AsmOperandClass {
+  let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s5imm   : Operand<i32> {
+  let PrintMethod = "printS5ImmOperand";
+  let ParserMatchClass = PPCS5ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<5>";
+}
+def PPCU5ImmAsmOperand : AsmOperandClass {
+  let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u5imm   : Operand<i32> {
+  let PrintMethod = "printU5ImmOperand";
+  let ParserMatchClass = PPCU5ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<5>";
+}
+def PPCU6ImmAsmOperand : AsmOperandClass {
+  let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u6imm   : Operand<i32> {
+  let PrintMethod = "printU6ImmOperand";
+  let ParserMatchClass = PPCU6ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<6>";
+}
+def PPCS16ImmAsmOperand : AsmOperandClass {
+  let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s16imm  : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def PPCU16ImmAsmOperand : AsmOperandClass {
+  let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u16imm  : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<16>";
+}
+def PPCS17ImmAsmOperand : AsmOperandClass {
+  let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s17imm  : Operand<i32> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def PPCDirectBrAsmOperand : AsmOperandClass {
+  let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def directbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def absdirectbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCondBrAsmOperand : AsmOperandClass {
+  let Name = "CondBr"; let PredicateMethod = "isCondBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def condbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+}
+def abscondbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+}
+def calltarget : Operand<iPTR> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def abscalltarget : Operand<iPTR> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCRBitMaskOperand : AsmOperandClass {
+ let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
+}
+def crbitm: Operand<i8> {
+  let PrintMethod = "printcrbitm";
+  let EncoderMethod = "get_crbitm_encoding";
+  let DecoderMethod = "decodeCRBitMOperand";
+  let ParserMatchClass = PPCCRBitMaskOperand;
+}
+// Address operands
+// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
+def PPCRegGxRCNoR0Operand : AsmOperandClass {
+  let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
+  let ParserMatchClass = PPCRegGxRCNoR0Operand;
+}
+// A version of ptr_rc usable with the asm parser.
+def PPCRegGxRCOperand : AsmOperandClass {
+  let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
+  let ParserMatchClass = PPCRegGxRCOperand;
+}
+
+def PPCDispRIOperand : AsmOperandClass {
+ let Name = "DispRI"; let PredicateMethod = "isS16Imm";
+ let RenderMethod = "addImmOperands";
+}
+def dispRI : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIOperand;
+}
+def PPCDispRIXOperand : AsmOperandClass {
+ let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
+ let RenderMethod = "addImmOperands";
+}
+def dispRIX : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIXOperand;
+}
+
+def memri : Operand<iPTR> {
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIEncoding";
+  let DecoderMethod = "decodeMemRIOperands";
+}
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
+}
+def memrix : Operand<iPTR> {   // memri where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIXEncoding";
+  let DecoderMethod = "decodeMemRIXOperands";
+}
+
+// A single-register address. This is used with the SjLj
+// pseudo-instructions.
+def memr : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc:$ptrreg);
+}
+def PPCTLSRegOperand : AsmOperandClass {
+  let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
+  let RenderMethod = "addTLSRegOperands";
+}
+def tlsreg32 : Operand<i32> {
+  let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd32 : Operand<i32> {}
+def tlscall32 : Operand<i32> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+// PowerPC Predicate operand.
+def pred : Operand<OtherVT> {
+  let PrintMethod = "printPredicateOperand";
+  let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
+}
+
+// Define PowerPC specific addressing mode.
+def iaddr  : ComplexPattern<iPTR, 2, "SelectAddrImm",    [], []>;
+def xaddr  : ComplexPattern<iPTR, 2, "SelectAddrIdx",    [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4",  [], []>; // "std"
+
+// The address in a single register. This is used with the SjLj
+// pseudo-instructions.
+def addr   : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+def In32BitMode  : Predicate<"!PPCSubTarget->isPPC64()">;
+def In64BitMode  : Predicate<"PPCSubTarget->isPPC64()">;
+def IsBookE  : Predicate<"PPCSubTarget->isBookE()">;
+def IsNotBookE  : Predicate<"!PPCSubTarget->isBookE()">;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Multiclass Definitions.
+
+multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XForm_10<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XForm_10<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_11<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XForm_11<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MForm_2<opcode, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : MForm_2<opcode, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MDForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : MDForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MDSForm_1<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                        pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : MDSForm_1<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                        []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XSForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XSForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_26<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : XForm_26<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : AForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_2<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : AForm_2<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_3<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : AForm_3<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+let hasCtrlDep = 1 in {
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
+                              [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+                              [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+def UPDATE_VRSAVE    : Pseudo<(outs gprc:$rD), (ins gprc:$rS),
+                              "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+let Defs = [R1], Uses = [R1] in
+def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
+                       [(set i32:$result,
+                             (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
+                         
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1,    // Expanded after instruction selection.
+    PPC970_Single = 1 in {
+  // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
+  // because either operand might become the first operand in an isel, and
+  // that operand cannot be r0.
+  def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond,
+                              gprc_nor0:$T, gprc_nor0:$F,
+                              i32imm:$BROPC), "#SELECT_CC_I4",
+                              []>;
+  def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond,
+                              g8rc_nox0:$T, g8rc_nox0:$F,
+                              i32imm:$BROPC), "#SELECT_CC_I8",
+                              []>;
+  def SELECT_CC_F4  : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_F4",
+                              []>;
+  def SELECT_CC_F8  : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_F8",
+                              []>;
+  def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_VRRC",
+                              []>;
+
+  // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+  // register bit directly.
+  def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
+                          gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
+                          [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
+  def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
+                          g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
+                          [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
+  def SELECT_F4  : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
+                          f4rc:$T, f4rc:$F), "#SELECT_F4",
+                          [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
+  def SELECT_F8  : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
+                          f8rc:$T, f8rc:$F), "#SELECT_F8",
+                          [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+  def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+                          vrrc:$T, vrrc:$F), "#SELECT_VRRC",
+                          [(set v4i32:$dst,
+                                (select i1:$cond, v4i32:$T, v4i32:$F))]>;
+}
+
+// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
+// scavenge a register for it.
+let mayStore = 1 in {
+def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F),
+                     "#SPILL_CR", []>;
+def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
+                         "#SPILL_CRBIT", []>;
+}
+
+// RESTORE_CR - Indicate that we're restoring the CR register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in {
+def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F),
+                     "#RESTORE_CR", []>;
+def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
+                           "#RESTORE_CRBIT", []>;
+}
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isReturn = 1, Uses = [LR, RM] in
+    def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
+                           [(retflag)]>;
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
+    def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                            []>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+                               "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+                               []>;
+
+      def BCCTR :  XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+                                "bcctr 12, $bi, 0", IIC_BrB, []>;
+      def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+                                "bcctr 4, $bi, 0", IIC_BrB, []>;
+    }
+  }
+}
+
+let Defs = [LR] in
+  def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
+                   PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let isBarrier = 1 in {
+  def B   : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
+                  "b $dst", IIC_BrB,
+                  [(br bb:$dst)]>;
+  def BA  : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
+                  "ba $dst", IIC_BrB, []>;
+  }
+
+  // BCC represents an arbitrary conditional branch on a predicate.
+  // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+  // a two-value operand where a dag node expects two operands. :(
+  let isCodeGenOnly = 1 in {
+    def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+                    "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
+                    /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+    def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+                     "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
+
+    let isReturn = 1, Uses = [LR, RM] in
+    def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
+                           "b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>;
+  }
+
+  let isCodeGenOnly = 1 in {
+    let Pattern = [(brcond i1:$bi, bb:$dst)] in
+    def BC  : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+             "bc 12, $bi, $dst">;
+
+    let Pattern = [(brcond (not i1:$bi), bb:$dst)] in
+    def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+             "bc 4, $bi, $dst">;
+
+    let isReturn = 1, Uses = [LR, RM] in
+    def BCLR  : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
+                             "bclr 12, $bi, 0", IIC_BrB, []>;
+    def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
+                             "bclr 4, $bi, 0", IIC_BrB, []>;
+  }
+
+  let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
+   def BDZLR  : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+                             "bdzlr", IIC_BrB, []>;
+   def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+                             "bdnzlr", IIC_BrB, []>;
+   def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
+                             "bdzlr+", IIC_BrB, []>;
+   def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
+                             "bdnzlr+", IIC_BrB, []>;
+   def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
+                             "bdzlr-", IIC_BrB, []>;
+   def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
+                             "bdnzlr-", IIC_BrB, []>;
+  }
+
+  let Defs = [CTR], Uses = [CTR] in {
+    def BDZ  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz $dst">;
+    def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz $dst">;
+    def BDZA  : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza $dst">;
+    def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza $dst">;
+    def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz+ $dst">;
+    def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz+ $dst">;
+    def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza+ $dst">;
+    def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza+ $dst">;
+    def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz- $dst">;
+    def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz- $dst">;
+    def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza- $dst">;
+    def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza- $dst">;
+  }
+}
+
+// The unconditional BCL used by the SjLj setjmp code.
+let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in {
+  let Defs = [LR], Uses = [RM] in {
+    def BCLalways  : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
+                            "bcl 20, 31, $dst">;
+  }
+}
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                    "bl $func", IIC_BrB, []>;  // See Pat patterns below.
+    def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+                    "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;
+
+    let isCodeGenOnly = 1 in {
+      def BL_TLS  : IForm<18, 0, 1, (outs), (ins tlscall32:$func),
+                          "bl $func", IIC_BrB, []>;
+      def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
+                       "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
+      def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+                        "b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;
+
+      def BCL  : BForm_4<16, 12, 0, 1, (outs),
+                         (ins crbitrc:$bi, condbrtarget:$dst),
+                         "bcl 12, $bi, $dst">;
+      def BCLn : BForm_4<16, 4, 0, 1, (outs),
+                         (ins crbitrc:$bi, condbrtarget:$dst),
+                         "bcl 4, $bi, $dst">;
+    }
+  }
+  let Uses = [CTR, RM] in {
+    def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                             "bctrl", IIC_BrB, [(PPCbctrl)]>,
+                Requires<[In32BitMode]>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+                                "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+                                []>;
+
+      def BCCTRL  : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+                                 "bcctrl 12, $bi, 0", IIC_BrB, []>;
+      def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+                                 "bcctrl 4, $bi, 0", IIC_BrB, []>;
+    }
+  }
+  let Uses = [LR, RM] in {
+    def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
+                            "blrl", IIC_BrB, []>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
+                              "b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB,
+                              []>;
+
+      def BCLRL  : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi),
+                                "bclrl 12, $bi, 0", IIC_BrB, []>;
+      def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi),
+                                "bclrl 4, $bi, 0", IIC_BrB, []>;
+    }
+  }
+  let Defs = [CTR], Uses = [CTR, RM] in {
+    def BDZL  : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl $dst">;
+    def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl $dst">;
+    def BDZLA  : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla $dst">;
+    def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla $dst">;
+    def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl+ $dst">;
+    def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl+ $dst">;
+    def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla+ $dst">;
+    def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla+ $dst">;
+    def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl- $dst">;
+    def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl- $dst">;
+    def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla- $dst">;
+    def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla- $dst">;
+  }
+  let Defs = [CTR], Uses = [CTR, LR, RM] in {
+    def BDZLRL  : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
+                               "bdzlrl", IIC_BrB, []>;
+    def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
+                               "bdnzlrl", IIC_BrB, []>;
+    def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
+                               "bdzlrl+", IIC_BrB, []>;
+    def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
+                               "bdnzlrl+", IIC_BrB, []>;
+    def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
+                               "bdzlrl-", IIC_BrB, []>;
+    def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
+                               "bdnzlrl-", IIC_BrB, []>;
+  }
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset),
+                 "#TC_RETURNd $dst $offset",
+                 []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+                 "#TC_RETURNa $func $offset",
+                 [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
+                 "#TC_RETURNr $dst $offset",
+                 []>;
+
+
+let isCodeGenOnly = 1 in {
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM]  in
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                            []>, Requires<[In32BitMode]>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", IIC_BrB,
+                  []>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
+                  "ba $dst", IIC_BrB,
+                  []>;
+
+}
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [CTR] in
+  def EH_SjLj_SetJmp32  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In32BitMode]>;
+  let isTerminator = 1 in
+  def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(PPCeh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In32BitMode]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
+// System call.
+let PPC970_Unit = 7 in {
+  def SC     : SCForm<17, 1, (outs), (ins i32imm:$lev),
+                      "sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>;
+}
+
+// DCB* instructions.
+def DCBA   : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBF   : DCB_Form<86, 0, (outs), (ins memrr:$dst), "dcbf $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBI   : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBST  : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBT   : DCB_Form<278, 0, (outs), (ins memrr:$dst), "dcbt $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst), "dcbtst $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZ   : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZL  : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
+          (DCBT xoaddr:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
+      [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
+      [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
+      [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
+      [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8",
+      [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
+      [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_ADD_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
+      [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
+      [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
+      [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
+      [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
+      [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
+      [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_ADD_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
+      [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
+      [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
+      [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
+      [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
+      [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
+      [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
+      [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
+      [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
+      [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+
+    def ATOMIC_SWAP_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
+      [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+    def ATOMIC_SWAP_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
+      [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+    def ATOMIC_SWAP_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
+      [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+def LWARX : XForm_1<31,  20, (outs gprc:$rD), (ins memrr:$src),
+                   "lwarx $rD, $src", IIC_LdStLWARX,
+                   [(set i32:$rD, (PPClarx xoaddr:$src))]>;
+
+let Defs = [CR0] in
+def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stwcx. $rS, $dst", IIC_LdStSTWCX,
+                   [(PPCstcx i32:$rS, xoaddr:$dst)]>,
+                   isDOT;
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+def TRAP  : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
+
+def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
+                     "twi $to, $rA, $imm", IIC_IntTrapW, []>;
+def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
+                 "tw $to, $rA, $rB", IIC_IntTrapW, []>;
+def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
+                     "tdi $to, $rA, $imm", IIC_IntTrapD, []>;
+def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
+                 "td $to, $rA, $rB", IIC_IntTrapD, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads. 
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
+                  "lbz $rD, $src", IIC_LdStLoad,
+                  [(set i32:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
+                  "lha $rD, $src", IIC_LdStLHA,
+                  [(set i32:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
+                  "lhz $rD, $src", IIC_LdStLoad,
+                  [(set i32:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
+                  "lwz $rD, $src", IIC_LdStLoad,
+                  [(set i32:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
+                  "lfs $rD, $src", IIC_LdStLFD,
+                  [(set f32:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
+                  "lfd $rD, $src", IIC_LdStLFD,
+                  [(set f64:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lbzu $rD, $addr", IIC_LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lhau $rD, $addr", IIC_LdStLHAU,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lhzu $rD, $addr", IIC_LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lwzu $rD, $addr", IIC_LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                  "lfsu $rD, $addr", IIC_LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                  "lfdu $rD, $addr", IIC_LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+
+// Indexed (r+r) Loads with Update (preinc).
+def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhaux $rD, $addr", IIC_LdStLHAUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lfsux $rD, $addr", IIC_LdStLFDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lfdux $rD, $addr", IIC_LdStLFDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+
+// Indexed (r+r) Loads.
+//
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZX : XForm_1<31,  87, (outs gprc:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", IIC_LdStLHA,
+                   [(set i32:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31,  23, (outs gprc:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (load xaddr:$src))]>;
+                   
+                   
+def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
+                   "lhbrx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
+def LWBRX : XForm_1<31,  534, (outs gprc:$rD), (ins memrr:$src),
+                   "lwbrx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
+
+def LFSX   : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
+                      "lfsx $frD, $src", IIC_LdStLFD,
+                      [(set f32:$frD, (load xaddr:$src))]>;
+def LFDX   : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
+                      "lfdx $frD, $src", IIC_LdStLFD,
+                      [(set f64:$frD, (load xaddr:$src))]>;
+
+def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
+                      "lfiwax $frD, $src", IIC_LdStLFD,
+                      [(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
+def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
+                      "lfiwzx $frD, $src", IIC_LdStLFD,
+                      [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
+}
+
+// Load Multiple
+def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
+                  "lmw $rD, $src", IIC_LdStLMW, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+let PPC970_Unit = 2 in {
+def STB  : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
+                   "stb $rS, $src", IIC_LdStStore,
+                   [(truncstorei8 i32:$rS, iaddr:$src)]>;
+def STH  : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
+                   "sth $rS, $src", IIC_LdStStore,
+                   [(truncstorei16 i32:$rS, iaddr:$src)]>;
+def STW  : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
+                   "stw $rS, $src", IIC_LdStStore,
+                   [(store i32:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
+                   "stfs $rS, $dst", IIC_LdStSTFD,
+                   [(store f32:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
+                   "stfd $rS, $dst", IIC_LdStSTFD,
+                   [(store f64:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBU  : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+                    "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU  : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+                    "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU  : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+                    "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
+                    "stfsu $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
+                    "stfdu $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STBU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STHU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STWU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STFSU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STFDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+// Indexed (r+r) Stores.
+let PPC970_Unit = 2 in {
+def STBX  : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stbx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei8 i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX  : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
+                   "sthx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei16 i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX  : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stwx $rS, $dst", IIC_LdStStore,
+                   [(store i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+ 
+def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
+                   "sthbrx $rS, $dst", IIC_LdStStore,
+                   [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
+                   PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stwbrx $rS, $dst", IIC_LdStStore,
+                   [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
+                   PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
+                     "stfiwx $frS, $dst", IIC_LdStSTFD,
+                     [(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
+                     
+def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
+                     "stfsx $frS, $dst", IIC_LdStSTFD,
+                     [(store f32:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
+                     "stfdx $frS, $dst", IIC_LdStSTFD,
+                     [(store f64:$frS, xaddr:$dst)]>;
+}
+
+// Indexed (r+r) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+                    "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+                    "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+                    "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
+                    "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
+                    "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STBUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STHUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STWUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STFDUX $rS, $ptrreg, $ptroff)>;
+
+// Store Multiple
+def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
+                   "stmw $rS, $dst", IIC_LdStLMW, []>;
+
+def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
+                        "sync $L", IIC_LdStSync, []>, Requires<[IsNotBookE]>;
+
+let isCodeGenOnly = 1 in {
+  def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
+                           "msync", IIC_LdStSync, []>, Requires<[IsBookE]> {
+    let L = 0;
+  }
+}
+
+def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[IsNotBookE]>;
+def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[IsBookE]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ADDI   : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
+                     "addi $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
+let BaseName = "addic" in {
+let Defs = [CARRY] in
+def ADDIC  : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "addic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
+                     RecFormRel, PPC970_DGroup_Cracked;
+let Defs = [CARRY, CR0] in
+def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "addic. $rD, $rA, $imm", IIC_IntGeneral,
+                     []>, isDOT, RecFormRel;
+}
+def ADDIS  : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
+                     "addis $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
+let isCodeGenOnly = 1 in
+def LA     : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
+                     "la $rD, $sym($rA)", IIC_IntGeneral,
+                     [(set i32:$rD, (add i32:$rA,
+                                          (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI  : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "mulli $rD, $rA, $imm", IIC_IntMulLI,
+                     [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
+let Defs = [CARRY] in
+def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "subfic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+  def LI  : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
+                       "li $rD, $imm", IIC_IntSimple,
+                       [(set i32:$rD, imm32SExt16:$imm)]>;
+  def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
+                       "lis $rD, $imm", IIC_IntSimple,
+                       [(set i32:$rD, imm16ShiftedSExt:$imm)]>;
+}
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let Defs = [CR0] in {
+def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "andi. $dst, $src1, $src2", IIC_IntGeneral,
+                    [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
+                    isDOT;
+def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "andis. $dst, $src1, $src2", IIC_IntGeneral,
+                    [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
+                    isDOT;
+}
+def ORI   : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "ori $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
+def ORIS  : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "oris $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI  : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "xori $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "xoris $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;
+
+def NOP   : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
+                         []>;
+let isCodeGenOnly = 1 in {
+// The POWER6 and POWER7 have special group-terminating nops.
+def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
+                                        "ori 1, 1, 0", IIC_IntSimple, []>;
+def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
+                                        "ori 2, 2, 0", IIC_IntSimple, []>;
+}
+
+let isCompare = 1, neverHasSideEffects = 1 in {
+  def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
+                          "cmpwi $crD, $rA, $imm", IIC_IntCompare>;
+  def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
+                           "cmplwi $dst, $src1, $src2", IIC_IntCompare>;
+}
+}
+
+let PPC970_Unit = 1, neverHasSideEffects = 1 in {  // FXU Operations.
+let isCommutable = 1 in {
+defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "nand", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
+defm AND  : XForm_6r<31,  28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "and", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (and i32:$rS, i32:$rB))]>;
+} // isCommutable
+defm ANDC : XForm_6r<31,  60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "andc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
+defm OR   : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "or", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (or i32:$rS, i32:$rB))]>;
+defm NOR  : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "nor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
+} // isCommutable
+defm ORC  : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "orc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
+defm EQV  : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "eqv", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
+defm XOR  : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "xor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
+} // isCommutable
+defm SLW  : XForm_6r<31,  24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "slw", "$rA, $rS, $rB", IIC_IntGeneral,
+                     [(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
+defm SRW  : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "srw", "$rA, $rS, $rB", IIC_IntGeneral,
+                     [(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
+defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                      "sraw", "$rA, $rS, $rB", IIC_IntShift,
+                      [(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let neverHasSideEffects = 1 in {
+defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
+                        "srawi", "$rA, $rS, $SH", IIC_IntShift,
+                        [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
+defm CNTLZW : XForm_11r<31,  26, (outs gprc:$rA), (ins gprc:$rS),
+                        "cntlzw", "$rA, $rS", IIC_IntGeneral,
+                        [(set i32:$rA, (ctlz i32:$rS))]>;
+defm EXTSB  : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
+                        "extsb", "$rA, $rS", IIC_IntSimple,
+                        [(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
+defm EXTSH  : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
+                        "extsh", "$rA, $rS", IIC_IntSimple,
+                        [(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
+}
+let isCompare = 1, neverHasSideEffects = 1 in {
+  def CMPW   : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
+                            "cmpw $crD, $rA, $rB", IIC_IntCompare>;
+  def CMPLW  : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
+                            "cmplw $crD, $rA, $rB", IIC_IntCompare>;
+}
+}
+let PPC970_Unit = 3 in {  // FPU Operations.
+//def FCMPO  : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
+//                      "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
+let isCompare = 1, neverHasSideEffects = 1 in {
+  def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
+                        "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
+                        "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+}
+
+let Uses = [RM] in {
+  let neverHasSideEffects = 1 in {
+  defm FCTIW  : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fctiw", "$frD, $frB", IIC_FPGeneral,
+                          []>;
+  defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fctiwz", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
+
+  defm FRSP   : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
+                          "frsp", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (fround f64:$frB))]>;
+
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIND  : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frin", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (frnd f64:$frB))]>;
+  defm FRINS  : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frin", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (frnd f32:$frB))]>;
+  }
+
+  let neverHasSideEffects = 1 in {
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIPD  : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frip", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (fceil f64:$frB))]>;
+  defm FRIPS  : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frip", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (fceil f32:$frB))]>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIZD  : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "friz", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (ftrunc f64:$frB))]>;
+  defm FRIZS  : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "friz", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (ftrunc f32:$frB))]>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIMD  : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frim", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (ffloor f64:$frB))]>;
+  defm FRIMS  : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frim", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (ffloor f32:$frB))]>;
+
+  defm FSQRT  : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fsqrt", "$frD, $frB", IIC_FPSqrtD,
+                          [(set f64:$frD, (fsqrt f64:$frB))]>;
+  defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "fsqrts", "$frD, $frB", IIC_FPSqrtS,
+                          [(set f32:$frD, (fsqrt f32:$frB))]>;
+  }
+  }
+}
+
+/// Note that FMR is defined as pseudo-ops on the PPC970 because they are
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that they will fill slots (which could cause the load of a LSU reject to
+/// sneak into a d-group with a store).
+let neverHasSideEffects = 1 in
+defm FMR   : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
+                       "fmr", "$frD, $frB", IIC_FPGeneral,
+                       []>,  // (set f32:$frD, f32:$frB)
+                       PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3, neverHasSideEffects = 1 in {  // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+defm FABSS  : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fabs f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FABSD  : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fabs f64:$frB))]>;
+defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fnabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fneg (fabs f32:$frB)))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fnabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fneg (fabs f64:$frB)))]>;
+defm FNEGS  : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fneg", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fneg f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNEGD  : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fneg", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fneg f64:$frB))]>;
+
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
+// Reciprocal estimates.
+defm FRE      : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fre", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfre f64:$frB))]>;
+defm FRES     : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "fres", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (PPCfre f32:$frB))]>;
+defm FRSQRTE  : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frsqrte", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frsqrtes", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
+}
+
+// XL-Form instructions.  condition register logical ops.
+//
+let neverHasSideEffects = 1 in
+def MCRF   : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
+                      "mcrf $BF, $BFA", IIC_BrMCR>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+let isCommutable = 1 in {
+def CRAND  : XLForm_1<19, 257, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crand $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>;
+
+def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crnand $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>;
+
+def CROR   : XLForm_1<19, 449, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "cror $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>;
+
+def CRXOR  : XLForm_1<19, 193, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crxor $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>;
+
+def CRNOR  : XLForm_1<19, 33, (outs crbitrc:$CRD),
+                              (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crnor $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>;
+
+def CREQV  : XLForm_1<19, 289, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "creqv $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>;
+} // isCommutable
+
+def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crandc $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>;
+
+def CRORC  : XLForm_1<19, 417, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crorc $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;
+
+let isCodeGenOnly = 1 in {
+def CRSET  : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
+              "creqv $dst, $dst, $dst", IIC_BrCR,
+              [(set i1:$dst, 1)]>;
+
+def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
+              "crxor $dst, $dst, $dst", IIC_BrCR,
+              [(set i1:$dst, 0)]>;
+
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET  : XLForm_1_ext<19, 289, (outs), (ins),
+              "creqv 6, 6, 6", IIC_BrCR,
+              [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+              "crxor 6, 6, 6", IIC_BrCR,
+              [(PPCcr6unset)]>;
+}
+}
+
+// XFX-Form instructions.  Instructions that deal with SPRs.
+//
+
+def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
+                      "mfspr $RT, $SPR", IIC_SprMFSPR>;
+def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
+                      "mtspr $SPR, $RT", IIC_SprMTSPR>;
+
+def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
+                     "mftb $RT, $SPR", IIC_SprMFTB>, Deprecated<DeprecatedMFTB>;
+
+let Uses = [CTR] in {
+def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
+                          "mfctr $rT", IIC_SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+                          "mtctr $rS", IIC_SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
+let Pattern = [(int_ppc_mtctr i32:$rS)] in
+def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+                              "mtctr $rS", IIC_SprMTSPR>,
+                PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [LR] in {
+def MTLR  : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
+                          "mtlr $rS", IIC_SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR] in {
+def MFLR  : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
+                          "mflr $rT", IIC_SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let isCodeGenOnly = 1 in {
+  // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
+  // like a GPR on the PPC970.  As such, copies in and out have the same
+  // performance characteristics as an OR instruction.
+  def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
+                               "mtspr 256, $rS", IIC_IntGeneral>,
+                 PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
+                               "mfspr $rT, 256", IIC_IntGeneral>,
+                 PPC970_DGroup_First, PPC970_Unit_FXU;
+
+  def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
+                                (outs VRSAVERC:$reg), (ins gprc:$rS),
+                                "mtspr 256, $rS", IIC_IntGeneral>,
+                  PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT),
+                                (ins VRSAVERC:$reg),
+                                "mfspr $rT, 256", IIC_IntGeneral>,
+                  PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
+// so we'll need to scavenge a register for it.
+let mayStore = 1 in
+def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
+                     "#SPILL_VRSAVE", []>;
+
+// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in
+def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
+                     "#RESTORE_VRSAVE", []>;
+
+let neverHasSideEffects = 1 in {
+def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
+                       "mtocrf $FXM, $ST", IIC_BrMCRX>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
+                      "mtcrf $FXM, $rS", IIC_BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
+def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
+                       "mfocrf $rT, $FXM", IIC_SprMFCRF>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
+                     "mfcr $rT", IIC_SprMFCR>,
+                     PPC970_MicroCode, PPC970_Unit_CRU;
+} // neverHasSideEffects = 1
+
+// Pseudo instruction to perform FADD in round-to-zero mode.
+let usesCustomInserter = 1, Uses = [RM] in {
+  def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
+                      [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
+}
+
+// The above pseudo gets expanded to make use of the following instructions
+// to manipulate FPSCR.  Note that FPSCR is not modeled at the DAG level.
+let Uses = [RM], Defs = [RM] in { 
+  def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+                        "mtfsb0 $FM", IIC_IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+                        "mtfsb1 $FM", IIC_IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  def MTFSF  : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
+                       "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+let Uses = [RM] in {
+  def MFFS   : XForm_42<63, 583, (outs f8rc:$rT), (ins),
+                         "mffs $rT", IIC_IntMFFS,
+                         [(set f64:$rT, (PPCmffs))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+
+
+let PPC970_Unit = 1, neverHasSideEffects = 1 in {  // FXU Operations.
+// XO-Form instructions.  Arithmetic instructions that can set overflow bit
+let isCommutable = 1 in
+defm ADD4  : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "add", "$rT, $rA, $rB", IIC_IntSimple,
+                       [(set i32:$rT, (add i32:$rA, i32:$rB))]>;
+let isCodeGenOnly = 1 in
+def ADD4TLS  : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB),
+                       "add $rT, $rA, $rB", IIC_IntSimple,
+                       [(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>;
+let isCommutable = 1 in
+defm ADDC  : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "addc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
+                        PPC970_DGroup_Cracked;
+
+defm DIVW  : XOForm_1r<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "divw", "$rT, $rA, $rB", IIC_IntDivW,
+                       [(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>,
+                       PPC970_DGroup_First, PPC970_DGroup_Cracked;
+defm DIVWU : XOForm_1r<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "divwu", "$rT, $rA, $rB", IIC_IntDivW,
+                       [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>,
+                       PPC970_DGroup_First, PPC970_DGroup_Cracked;
+let isCommutable = 1 in {
+defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "mulhw", "$rT, $rA, $rB", IIC_IntMulHW,
+                       [(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
+defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
+                       [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
+defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "mullw", "$rT, $rA, $rB", IIC_IntMulHW,
+                       [(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
+} // isCommutable
+defm SUBF  : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+                       [(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
+defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
+                        PPC970_DGroup_Cracked;
+defm NEG    : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA),
+                        "neg", "$rT, $rA", IIC_IntSimple,
+                        [(set i32:$rT, (ineg i32:$rA))]>;
+let Uses = [CARRY] in {
+let isCommutable = 1 in
+defm ADDE  : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "adde", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
+defm ADDME  : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "addme", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (adde i32:$rA, -1))]>;
+defm ADDZE  : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "addze", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (adde i32:$rA, 0))]>;
+defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
+defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "subfme", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (sube -1, i32:$rA))]>;
+defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "subfze", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (sube 0, i32:$rA))]>;
+}
+}
+
+// A-Form instructions.  Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3, neverHasSideEffects = 1 in {  // FPU Operations.
+let Uses = [RM] in {
+let isCommutable = 1 in {
+  defm FMADD : AForm_1r<63, 29, 
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+  defm FMADDS : AForm_1r<59, 29,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+  defm FMSUB : AForm_1r<63, 28,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT,
+                            (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+  defm FMSUBS : AForm_1r<59, 28,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT,
+                            (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+  defm FNMADD : AForm_1r<63, 31,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT,
+                            (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+  defm FNMADDS : AForm_1r<59, 31,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT,
+                            (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+  defm FNMSUB : AForm_1r<63, 30,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+                                                 (fneg f64:$FRB))))]>;
+  defm FNMSUBS : AForm_1r<59, 30,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+                                                 (fneg f32:$FRB))))]>;
+} // isCommutable
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result.  To avoid
+// having 4 of these, force the comparison to always be an 8-byte double (code
+// should use an FMRSD if the input comparison value really wants to be a float)
+// and 4/8 byte forms for the result and operand type..
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FSELD : AForm_1r<63, 23,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
+defm FSELS : AForm_1r<63, 23,
+                      (outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
+let Uses = [RM] in {
+  let isCommutable = 1 in {
+  defm FADD  : AForm_2r<63, 21,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+                        "fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
+                        [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
+  defm FADDS : AForm_2r<59, 21,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+                        "fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
+  } // isCommutable
+  defm FDIV  : AForm_2r<63, 18,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+                        "fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
+                        [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
+  defm FDIVS : AForm_2r<59, 18,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+                        "fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
+                        [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
+  let isCommutable = 1 in {
+  defm FMUL  : AForm_3r<63, 25,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
+                        "fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
+                        [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
+  defm FMULS : AForm_3r<59, 25,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
+                        "fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
+                        [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
+  } // isCommutable
+  defm FSUB  : AForm_2r<63, 20,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+                        "fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
+                        [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
+  defm FSUBS : AForm_2r<59, 20,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+                        "fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
+  }
+}
+
+let neverHasSideEffects = 1 in {
+let PPC970_Unit = 1 in {  // FXU Operations.
+  let isSelect = 1 in
+  def ISEL  : AForm_4<31, 15,
+                     (outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
+                     "isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
+                     []>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+// M-Form instructions.  rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+defm RLWIMI : MForm_2r<20, (outs gprc:$rA),
+                       (ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB,
+                       u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+                       IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+                       RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+}
+let BaseName = "rlwinm" in {
+def RLWINM : MForm_2<21,
+                     (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+                     []>, RecFormRel;
+let Defs = [CR0] in
+def RLWINMo : MForm_2<21,
+                      (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                      "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+                      []>, isDOT, RecFormRel, PPC970_DGroup_Cracked;
+}
+defm RLWNM  : MForm_2r<23, (outs gprc:$rA),
+                       (ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME),
+                       "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+                       []>;
+}
+} // neverHasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support.  Implement in terms of LIS/ORI.
+def : Pat<(i32 imm:$imm),
+          (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Implement the 'not' operation with the NOR instruction.
+def i32not : OutPatFrag<(ops node:$in),
+                        (NOR $in, $in)>;
+def        : Pat<(not i32:$in),
+                 (i32not $in)>;
+
+// ADD an arbitrary immediate.
+def : Pat<(add i32:$in, imm:$imm),
+          (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
+// OR an arbitrary immediate.
+def : Pat<(or i32:$in, imm:$imm),
+          (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor i32:$in, imm:$imm),
+          (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC
+def : Pat<(sub imm32SExt16:$imm, i32:$in),
+          (SUBFIC $in, imm:$imm)>;
+
+// SHL/SRL
+def : Pat<(shl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>;
+
+// ROTL
+def : Pat<(rotl i32:$in, i32:$sh),
+          (RLWNM $in, $sh, 0, 31)>;
+def : Pat<(rotl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, imm:$imm, 0, 31)>;
+
+// RLWNM
+def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
+          (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
+
+// Calls
+def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
+          (BL tglobaladdr:$dst)>;
+def : Pat<(PPCcall (i32 texternalsym:$dst)),
+          (BL texternalsym:$dst)>;
+
+
+def : Pat<(PPCtc_return (i32 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
+          (TCRETURNri CTRRC:$dst, imm:$imm)>;
+
+
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in),
+          (ADDIS $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in),
+          (ADDI $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS $in, tglobaladdr:$g)>;
+def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS $in, tconstpool:$g)>;
+def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS $in, tjumptable:$g)>;
+def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS $in, tblockaddress:$g)>;
+
+// Support for thread-local storage.
+def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", 
+                [(set i32:$rD, (PPCppc32GOT))]>;
+
+// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
+// This uses two output registers, the first as the real output, the second as a
+// temporary register, used internally in code generation.
+def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", 
+                []>, NoEncode<"$rT">;
+
+def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+                           "#LDgotTprelL32",
+                           [(set i32:$rD,
+                             (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
+def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
+          (ADD4TLS $in, tglobaltlsaddr:$g)>;
+
+def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                         "#ADDItlsgdL32",
+                         [(set i32:$rD,
+                           (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
+def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                          "#GETtlsADDR32",
+                          [(set i32:$rD,
+                            (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                          "#ADDItlsldL32",
+                          [(set i32:$rD,
+                            (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                            "#GETtlsldADDR32",
+                            [(set i32:$rD,
+                              (PPCgetTlsldAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                           "#ADDIdtprelL32",
+                           [(set i32:$rD,
+                             (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                            "#ADDISdtprelHA32",
+                            [(set i32:$rD,
+                              (PPCaddisDtprelHA i32:$reg,
+                                                tglobaltlsaddr:$disp))]>;
+
+// Support for Position-independent code
+def LWZtoc: Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+                  "#LWZtoc",
+                  [(set i32:$rD,
+                     (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+// Get Global (GOT) Base Register offset, from the word immediately preceding
+// the function label.
+def GetGBRO:	Pseudo<(outs gprc:$rT), (ins gprc:$rI), "#GetGBRO", []>;
+// Update the Global(GOT) Base Register with the above offset.
+def UpdateGBR:	Pseudo<(outs gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
+
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
+def : Pat<(sra i32:$rS, i32:$rB),
+          (SRAW $rS, $rB)>;
+def : Pat<(srl i32:$rS, i32:$rB),
+          (SRW $rS, $rB)>;
+def : Pat<(shl i32:$rS, i32:$rB),
+          (SLW $rS, $rB)>;
+
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX xaddr:$src)>;
+def : Pat<(f64 (extloadf32 iaddr:$src)),
+          (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
+def : Pat<(f64 (extloadf32 xaddr:$src)),
+          (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
+
+def : Pat<(f64 (fextend f32:$src)),
+          (COPY_TO_REGCLASS $src, F8RC)>;
+
+def : Pat<(atomic_fence (imm), (imm)), (SYNC 0)>, Requires<[IsNotBookE]>;
+def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[IsBookE]>;
+
+// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign f64:$frB, f32:$frA),
+          (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
+def : Pat<(fcopysign f32:$frB, f64:$frA),
+          (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstr64Bit.td"
+include "PPCInstrVSX.td"
+
+def crnot : OutPatFrag<(ops node:$in),
+                       (CRNOR $in, $in)>;
+def       : Pat<(not i1:$in),
+                (crnot $in)>;
+
+// Patterns for arithmetic i1 operations.
+def : Pat<(add i1:$a, i1:$b),
+          (CRXOR $a, $b)>;
+def : Pat<(sub i1:$a, i1:$b),
+          (CRXOR $a, $b)>;
+def : Pat<(mul i1:$a, i1:$b),
+          (CRAND $a, $b)>;
+
+// We're sometimes asked to materialize i1 -1, which is just 1 in this case
+// (-1 is used to mean all bits set).
+def : Pat<(i1 -1), (CRSET)>;
+
+// i1 extensions, implemented in terms of isel.
+def : Pat<(i32 (zext i1:$in)),
+          (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i32 (sext i1:$in)),
+          (SELECT_I4 $in, (LI -1), (LI 0))>;
+
+def : Pat<(i64 (zext i1:$in)),
+          (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+def : Pat<(i64 (sext i1:$in)),
+          (SELECT_I8 $in, (LI8 -1), (LI8 0))>;
+
+// FIXME: We should choose either a zext or a sext based on other constants
+// already around.
+def : Pat<(i32 (anyext i1:$in)),
+          (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i64 (anyext i1:$in)),
+          (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+
+// match setcc on i1 variables.
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)),
+          (CRANDC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)),
+          (CRANDC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)),
+          (CRORC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)),
+          (CRORC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)),
+          (CREQV $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)),
+          (CRORC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)),
+          (CRORC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)),
+          (CRANDC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)),
+          (CRANDC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)),
+          (CRXOR $s1, $s2)>;
+
+// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE,
+// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for
+// floating-point types.
+
+multiclass CRNotPat<dag pattern, dag result> {
+  def : Pat<pattern, (crnot result)>;
+  def : Pat<(not pattern), result>;
+
+  // We can also fold the crnot into an extension:
+  def : Pat<(i32 (zext pattern)),
+            (SELECT_I4 result, (LI 0), (LI 1))>;
+  def : Pat<(i32 (sext pattern)),
+            (SELECT_I4 result, (LI 0), (LI -1))>;
+
+  // We can also fold the crnot into an extension:
+  def : Pat<(i64 (zext pattern)),
+            (SELECT_I8 result, (LI8 0), (LI8 1))>;
+  def : Pat<(i64 (sext pattern)),
+            (SELECT_I8 result, (LI8 0), (LI8 -1))>;
+
+  // FIXME: We should choose either a zext or a sext based on other constants
+  // already around.
+  def : Pat<(i32 (anyext pattern)),
+            (SELECT_I4 result, (LI 0), (LI 1))>;
+
+  def : Pat<(i64 (anyext pattern)),
+            (SELECT_I8 result, (LI8 0), (LI8 1))>;
+}
+
+// FIXME: Because of what seems like a bug in TableGen's type-inference code,
+// we need to write imm:$imm in the output patterns below, not just $imm, or
+// else the resulting matcher will not correctly add the immediate operand
+// (making it a register operand instead).
+
+// extended SETCC.
+multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag,
+                       OutPatFrag rfrag, OutPatFrag rfrag8> {
+  def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))),
+            (rfrag $s1)>;
+  def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))),
+            (rfrag8 $s1)>;
+  def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+  def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))),
+            (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+
+  def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))),
+            (rfrag $s1)>;
+  def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))),
+            (rfrag8 $s1)>;
+  def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+  def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))),
+            (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+}
+
+// Note that we do all inversions below with i(32|64)not, instead of using
+// (xori x, 1) because on the A2 nor has single-cycle latency while xori
+// has 2-cycle latency.
+
+defm : ExtSetCCPat<SETEQ,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (CNTLZW $in), 27, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (CNTLZD $in), 58, 63)> >;
+ 
+defm : ExtSetCCPat<SETNE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (i64not (CNTLZD $in)), 58, 63)> >;
+                 
+defm : ExtSetCCPat<SETLT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM $in, 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL $in, 1, 63)> >;
+
+defm : ExtSetCCPat<SETGE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (i32not $in), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (i64not $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (i32not $in), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (i64not $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM $in, 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL $in, 1, 63)> >;
+
+// SETCC for i32.
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+// For non-equality comparisons, the default code would materialize the
+// constant, then compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpw cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmplwi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// SETCC for i64.
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+// For non-equality comparisons, the default code would materialize the
+// constant, then compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpd cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmpldi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// SETCC for f32.
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+// SETCC for f64.
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+// match select on i1 variables:
+def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
+          (CROR (CRAND        $cond , $tval),
+                (CRAND (crnot $cond), $fval))>;
+
+// match selectcc on i1 variables:
+//   select (lhs == rhs), tval, fval is:
+//   ((lhs == rhs) & tval) | (!(lhs == rhs) & fval)
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)),
+           (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
+                 (CRAND (CRORC  $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)),
+           (CROR (CRAND (CRORC  $rhs, $lhs), $tval),
+                 (CRAND (CRANDC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)),
+           (CROR (CRAND (CREQV $lhs, $rhs), $tval),
+                 (CRAND (CRXOR $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)),
+           (CROR (CRAND (CRORC  $lhs, $rhs), $tval),
+                 (CRAND (CRANDC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)),
+           (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
+                 (CRAND (CRORC  $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)),
+           (CROR (CRAND (CREQV $lhs, $rhs), $fval),
+                 (CRAND (CRXOR $lhs, $rhs), $tval))>;
+
+// match selectcc on i1 variables with non-i1 output.
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)),
+          (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)),
+          (SELECT_I4 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)),
+          (SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)),
+          (SELECT_I4 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)),
+          (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)),
+          (SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)),
+          (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)),
+          (SELECT_I8 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)),
+          (SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)),
+          (SELECT_I8 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)),
+          (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
+          (SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+          (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+          (SELECT_F4 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+          (SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+          (SELECT_F4 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+          (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+          (SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+          (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+          (SELECT_F8 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+          (SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+          (SELECT_F8 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+          (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+          (SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
+          (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)),
+          (SELECT_VRRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)),
+          (SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)),
+          (SELECT_VRRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)),
+          (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
+          (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+let usesCustomInserter = 1 in {
+def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+                             "#ANDIo_1_EQ_BIT",
+                             [(set i1:$dst, (trunc (not i32:$in)))]>;
+def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+                             "#ANDIo_1_GT_BIT",
+                             [(set i1:$dst, (trunc i32:$in))]>;
+
+def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+                              "#ANDIo_1_EQ_BIT8",
+                              [(set i1:$dst, (trunc (not i64:$in)))]>;
+def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+                              "#ANDIo_1_GT_BIT8",
+                              [(set i1:$dst, (trunc i64:$in))]>;
+}
+
+def : Pat<(i1 (not (trunc i32:$in))),
+           (ANDIo_1_EQ_BIT $in)>;
+def : Pat<(i1 (not (trunc i64:$in))),
+           (ANDIo_1_EQ_BIT8 $in)>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instructions used for assembler/disassembler only
+//
+
+def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
+                         "isync", IIC_SprISYNC, []>;
+
+def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
+                    "icbi $src", IIC_LdStICBI, []>;
+
+def EIEIO : XForm_24_eieio<31, 854, (outs), (ins),
+                           "eieio", IIC_LdStLoad, []>;
+
+def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
+                         "wait $L", IIC_LdStLoad, []>;
+
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsr $RS, $L", IIC_SprMTMSR>;
+
+def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
+                  "mfmsr $RT", IIC_SprMFMSR, []>;
+
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsrd $RS, $L", IIC_SprMTMSRD>;
+
+def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
+                        "slbie $RB", IIC_SprSLBIE, []>;
+
+def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
+                    "slbmte $RS, $RB", IIC_SprSLBMTE, []>;
+
+def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
+                       "slbmfee $RT, $RB", IIC_SprSLBMFEE, []>;
+
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
+
+def TLBSYNC : XForm_0<31, 566, (outs), (ins),
+                        "tlbsync", IIC_SprTLBSYNC, []>;
+
+def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
+                          "tlbiel $RB", IIC_SprTLBIEL, []>;
+
+def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
+                          "tlbie $RB,$RS", IIC_SprTLBIE, []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Assembler Instruction Aliases
+//
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class PPCAsmPseudo<string asm, dag iops>
+  : Instruction {
+  let Namespace = "PPC";
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let OutOperandList = (outs);
+  let InOperandList = iops;
+  let Pattern = [];
+  let AsmString = asm;
+  let isAsmParserOnly = 1;
+  let isPseudo = 1;
+}
+
+def : InstAlias<"sc", (SC 0)>;
+
+def : InstAlias<"sync", (SYNC 0)>, Requires<[IsNotBookE]>;
+def : InstAlias<"msync", (SYNC 0)>, Requires<[IsNotBookE]>;
+def : InstAlias<"lwsync", (SYNC 1)>, Requires<[IsNotBookE]>;
+def : InstAlias<"ptesync", (SYNC 2)>, Requires<[IsNotBookE]>;
+
+def : InstAlias<"wait", (WAIT 0)>;
+def : InstAlias<"waitrsv", (WAIT 1)>;
+def : InstAlias<"waitimpl", (WAIT 2)>;
+
+def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+
+def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
+def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
+
+def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
+
+def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+
+def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
+
+def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
+
+def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
+                        (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
+                         (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
+                         (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm",
+                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+
+def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
+def : InstAlias<"mfsprg $RT, 0", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg $RT, 1", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg $RT, 2", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg $RT, 3", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mfsprg0 $RT", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg1 $RT", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg2 $RT", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg3 $RT", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mtsprg 0, $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg 1, $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg 2, $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg 3, $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtsprg0 $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg1 $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg2 $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg3 $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+
+def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
+def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+
+def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
+def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+
+def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
+def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
+def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
+def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
+
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+
+def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n",
+                        (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n",
+                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n",
+                        (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n",
+                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
+                            (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
+                             (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+
+def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+
+def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n",
+                        (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n",
+                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n",
+                        (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n",
+                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
+                            (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
+                             (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+
+def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+
+// These generic branch instruction forms are used for the assembler parser only.
+// Defs and Uses are conservative, since we don't know the BO value.
+let PPC970_Unit = 7 in {
+  let Defs = [CTR], Uses = [CTR, RM] in {
+    def gBC : BForm_3<16, 0, 0, (outs),
+                      (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+                      "bc $bo, $bi, $dst">;
+    def gBCA : BForm_3<16, 1, 0, (outs),
+                       (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+                       "bca $bo, $bi, $dst">;
+  }
+  let Defs = [LR, CTR], Uses = [CTR, RM] in {
+    def gBCL : BForm_3<16, 0, 1, (outs),
+                       (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+                       "bcl $bo, $bi, $dst">;
+    def gBCLA : BForm_3<16, 1, 1, (outs),
+                        (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+                        "bcla $bo, $bi, $dst">;
+  }
+  let Defs = [CTR], Uses = [CTR, LR, RM] in
+    def gBCLR : XLForm_2<19, 16, 0, (outs),
+                         (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                         "bclr $bo, $bi, $bh", IIC_BrB, []>;
+  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
+    def gBCLRL : XLForm_2<19, 16, 1, (outs),
+                          (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                          "bclrl $bo, $bi, $bh", IIC_BrB, []>;
+  let Defs = [CTR], Uses = [CTR, LR, RM] in
+    def gBCCTR : XLForm_2<19, 528, 0, (outs),
+                          (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                          "bcctr $bo, $bi, $bh", IIC_BrB, []>;
+  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
+    def gBCCTRL : XLForm_2<19, 528, 1, (outs),
+                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                           "bcctrl $bo, $bi, $bh", IIC_BrB, []>;
+}
+def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>;
+
+multiclass BranchSimpleMnemonic1<string name, string pm, int bo> {
+  def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>;
+  def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>;
+}
+multiclass BranchSimpleMnemonic2<string name, string pm, int bo>
+  : BranchSimpleMnemonic1<name, pm, bo> {
+  def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>;
+  def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>;
+}
+defm : BranchSimpleMnemonic2<"t", "", 12>;
+defm : BranchSimpleMnemonic2<"f", "", 4>;
+defm : BranchSimpleMnemonic2<"t", "-", 14>;
+defm : BranchSimpleMnemonic2<"f", "-", 6>;
+defm : BranchSimpleMnemonic2<"t", "+", 15>;
+defm : BranchSimpleMnemonic2<"f", "+", 7>;
+defm : BranchSimpleMnemonic1<"dnzt", "", 8>;
+defm : BranchSimpleMnemonic1<"dnzf", "", 0>;
+defm : BranchSimpleMnemonic1<"dzt", "", 10>;
+defm : BranchSimpleMnemonic1<"dzf", "", 2>;
+
+multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
+  def : InstAlias<"b"#name#pm#" $cc, $dst",
+                  (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#pm#" $dst",
+                  (BCC bibo, CR0, condbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
+                  (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"a"#pm#" $dst",
+                  (BCCA bibo, CR0, abscondbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"lr"#pm#" $cc",
+                  (BCCLR bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"lr"#pm,
+                  (BCCLR bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"ctr"#pm#" $cc",
+                  (BCCCTR bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"ctr"#pm,
+                  (BCCCTR bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
+                  (BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"l"#pm#" $dst",
+                  (BCCL bibo, CR0, condbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
+                  (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"la"#pm#" $dst",
+                  (BCCLA bibo, CR0, abscondbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"lrl"#pm#" $cc",
+                  (BCCLRL bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"lrl"#pm,
+                  (BCCLRL bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
+                  (BCCCTRL bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"ctrl"#pm,
+                  (BCCCTRL bibo, CR0)>;
+}
+multiclass BranchExtendedMnemonic<string name, int bibo> {
+  defm : BranchExtendedMnemonicPM<name, "", bibo>;
+  defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>;
+  defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>;
+}
+defm : BranchExtendedMnemonic<"lt", 12>;
+defm : BranchExtendedMnemonic<"gt", 44>;
+defm : BranchExtendedMnemonic<"eq", 76>;
+defm : BranchExtendedMnemonic<"un", 108>;
+defm : BranchExtendedMnemonic<"so", 108>;
+defm : BranchExtendedMnemonic<"ge", 4>;
+defm : BranchExtendedMnemonic<"nl", 4>;
+defm : BranchExtendedMnemonic<"le", 36>;
+defm : BranchExtendedMnemonic<"ng", 36>;
+defm : BranchExtendedMnemonic<"ne", 68>;
+defm : BranchExtendedMnemonic<"nu", 100>;
+defm : BranchExtendedMnemonic<"ns", 100>;
+
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
+def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>;
+def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
+def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+
+multiclass TrapExtendedMnemonic<string name, int to> {
+  def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>;
+  def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
+  def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>;
+  def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>;
+}
+defm : TrapExtendedMnemonic<"lt", 16>;
+defm : TrapExtendedMnemonic<"le", 20>;
+defm : TrapExtendedMnemonic<"eq", 4>;
+defm : TrapExtendedMnemonic<"ge", 12>;
+defm : TrapExtendedMnemonic<"gt", 8>;
+defm : TrapExtendedMnemonic<"nl", 12>;
+defm : TrapExtendedMnemonic<"ne", 24>;
+defm : TrapExtendedMnemonic<"ng", 20>;
+defm : TrapExtendedMnemonic<"llt", 2>;
+defm : TrapExtendedMnemonic<"lle", 6>;
+defm : TrapExtendedMnemonic<"lge", 5>;
+defm : TrapExtendedMnemonic<"lgt", 1>;
+defm : TrapExtendedMnemonic<"lnl", 5>;
+defm : TrapExtendedMnemonic<"lng", 6>;
+defm : TrapExtendedMnemonic<"u", 31>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
new file mode 100644
index 0000000..49bcc48
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -0,0 +1,816 @@
+//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the VSX extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def PPCRegVSRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsrc : RegisterOperand<VSRC> {
+  let ParserMatchClass = PPCRegVSRCAsmOperand;
+}
+
+def PPCRegVSFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsfrc : RegisterOperand<VSFRC> {
+  let ParserMatchClass = PPCRegVSFRCAsmOperand;
+}
+
+multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XX3Form_Rc<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>;
+    let Defs = [CR6] in
+    def o    : XX3Form_Rc<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT;
+  }
+}
+
+def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
+let Predicates = [HasVSX] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let neverHasSideEffects = 1 in { // VSX instructions don't have side effects.
+let Uses = [RM] in {
+
+  // Load indexed instructions
+  let mayLoad = 1, canFoldAsLoad = 1 in {
+    def LXSDX : XForm_1<31, 588,
+                        (outs vsfrc:$XT), (ins memrr:$src),
+                        "lxsdx $XT, $src", IIC_LdStLFD,
+                        [(set f64:$XT, (load xoaddr:$src))]>;
+
+    def LXVD2X : XForm_1<31, 844,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvd2x $XT, $src", IIC_LdStLFD,
+                         [(set v2f64:$XT, (load xoaddr:$src))]>;
+
+    def LXVDSX : XForm_1<31, 332,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvdsx $XT, $src", IIC_LdStLFD, []>;
+
+    def LXVW4X : XForm_1<31, 780,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvw4x $XT, $src", IIC_LdStLFD, []>;
+  }
+
+  // Store indexed instructions
+  let mayStore = 1 in {
+    def STXSDX : XX1Form<31, 716,
+                        (outs), (ins vsfrc:$XT, memrr:$dst),
+                        "stxsdx $XT, $dst", IIC_LdStSTFD,
+                        [(store f64:$XT, xoaddr:$dst)]>;
+
+    def STXVD2X : XX1Form<31, 972,
+                         (outs), (ins vsrc:$XT, memrr:$dst),
+                         "stxvd2x $XT, $dst", IIC_LdStSTFD,
+                         [(store v2f64:$XT, xoaddr:$dst)]>;
+
+    def STXVW4X : XX1Form<31, 908,
+                         (outs), (ins vsrc:$XT, memrr:$dst),
+                         "stxvw4x $XT, $dst", IIC_LdStSTFD, []>;
+  }
+
+  // Add/Mul Instructions
+  let isCommutable = 1 in {
+    def XSADDDP : XX3Form<60, 32,
+                          (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                          "xsadddp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>;
+    def XSMULDP : XX3Form<60, 48,
+                          (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                          "xsmuldp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>;
+
+    def XVADDDP : XX3Form<60, 96,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvadddp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>;
+
+    def XVADDSP : XX3Form<60, 64,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvaddsp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>;
+
+    def XVMULDP : XX3Form<60, 112,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvmuldp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>;
+
+    def XVMULSP : XX3Form<60, 80,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvmulsp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>;
+  }
+
+  // Subtract Instructions
+  def XSSUBDP : XX3Form<60, 40,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xssubdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>;
+
+  def XVSUBDP : XX3Form<60, 104,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvsubdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>;
+  def XVSUBSP : XX3Form<60, 72,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvsubsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>;
+
+  // FMA Instructions
+  let BaseName = "XSMADDADP" in {
+  let isCommutable = 1 in
+  def XSMADDADP : XX3Form<60, 33,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMADDMDP : XX3Form<60, 41,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSMSUBADP" in {
+  let isCommutable = 1 in
+  def XSMSUBADP : XX3Form<60, 49,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMSUBMDP : XX3Form<60, 57,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMADDADP" in {
+  let isCommutable = 1 in
+  def XSNMADDADP : XX3Form<60, 161,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMADDMDP : XX3Form<60, 169,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMSUBADP" in {
+  let isCommutable = 1 in
+  def XSNMSUBADP : XX3Form<60, 177,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMSUBMDP : XX3Form<60, 185,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMADDADP" in {
+  let isCommutable = 1 in
+  def XVMADDADP : XX3Form<60, 97,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMADDMDP : XX3Form<60, 105,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMADDASP" in {
+  let isCommutable = 1 in
+  def XVMADDASP : XX3Form<60, 65,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMADDMSP : XX3Form<60, 73,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMSUBADP" in {
+  let isCommutable = 1 in
+  def XVMSUBADP : XX3Form<60, 113,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMSUBMDP : XX3Form<60, 121,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMSUBASP" in {
+  let isCommutable = 1 in
+  def XVMSUBASP : XX3Form<60, 81,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMSUBMSP : XX3Form<60, 89,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMADDADP" in {
+  let isCommutable = 1 in
+  def XVNMADDADP : XX3Form<60, 225,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMADDMDP : XX3Form<60, 233,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMADDASP" in {
+  let isCommutable = 1 in
+  def XVNMADDASP : XX3Form<60, 193,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMADDMSP : XX3Form<60, 201,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMSUBADP" in {
+  let isCommutable = 1 in
+  def XVNMSUBADP : XX3Form<60, 241,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMSUBMDP : XX3Form<60, 249,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMSUBASP" in {
+  let isCommutable = 1 in
+  def XVNMSUBASP : XX3Form<60, 209,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMSUBMSP : XX3Form<60, 217,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  // Division Instructions
+  def XSDIVDP : XX3Form<60, 56,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xsdivdp $XT, $XA, $XB", IIC_FPDivD,
+                        [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>;
+  def XSSQRTDP : XX2Form<60, 75,
+                        (outs vsfrc:$XT), (ins vsfrc:$XB),
+                        "xssqrtdp $XT, $XB", IIC_FPSqrtD,
+                        [(set f64:$XT, (fsqrt f64:$XB))]>;
+
+  def XSREDP : XX2Form<60, 90,
+                        (outs vsfrc:$XT), (ins vsfrc:$XB),
+                        "xsredp $XT, $XB", IIC_VecFP,
+                        [(set f64:$XT, (PPCfre f64:$XB))]>;
+  def XSRSQRTEDP : XX2Form<60, 74,
+                           (outs vsfrc:$XT), (ins vsfrc:$XB),
+                           "xsrsqrtedp $XT, $XB", IIC_VecFP,
+                           [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+
+  def XSTDIVDP : XX3Form_1<60, 61,
+                         (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                         "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+  def XSTSQRTDP : XX2Form_1<60, 106,
+                          (outs crrc:$crD), (ins vsfrc:$XB),
+                          "xstsqrtdp $crD, $XB", IIC_FPCompare, []>;
+
+  def XVDIVDP : XX3Form<60, 120,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvdivdp $XT, $XA, $XB", IIC_FPDivD,
+                        [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>;
+  def XVDIVSP : XX3Form<60, 88,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvdivsp $XT, $XA, $XB", IIC_FPDivS,
+                        [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>;
+
+  def XVSQRTDP : XX2Form<60, 203,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvsqrtdp $XT, $XB", IIC_FPSqrtD,
+                        [(set v2f64:$XT, (fsqrt v2f64:$XB))]>;
+  def XVSQRTSP : XX2Form<60, 139,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvsqrtsp $XT, $XB", IIC_FPSqrtS,
+                        [(set v4f32:$XT, (fsqrt v4f32:$XB))]>;
+
+  def XVTDIVDP : XX3Form_1<60, 125,
+                         (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+                         "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+  def XVTDIVSP : XX3Form_1<60, 93,
+                         (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+                         "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+  def XVTSQRTDP : XX2Form_1<60, 234,
+                          (outs crrc:$crD), (ins vsrc:$XB),
+                          "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>;
+  def XVTSQRTSP : XX2Form_1<60, 170,
+                          (outs crrc:$crD), (ins vsrc:$XB),
+                          "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>;
+
+  def XVREDP : XX2Form<60, 218,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvredp $XT, $XB", IIC_VecFP,
+                        [(set v2f64:$XT, (PPCfre v2f64:$XB))]>;
+  def XVRESP : XX2Form<60, 154,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvresp $XT, $XB", IIC_VecFP,
+                        [(set v4f32:$XT, (PPCfre v4f32:$XB))]>;
+
+  def XVRSQRTEDP : XX2Form<60, 202,
+                           (outs vsrc:$XT), (ins vsrc:$XB),
+                           "xvrsqrtedp $XT, $XB", IIC_VecFP,
+                           [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>;
+  def XVRSQRTESP : XX2Form<60, 138,
+                           (outs vsrc:$XT), (ins vsrc:$XB),
+                           "xvrsqrtesp $XT, $XB", IIC_VecFP,
+                           [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>;
+
+  // Compare Instructions
+  def XSCMPODP : XX3Form_1<60, 43,
+                           (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                           "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>;
+  def XSCMPUDP : XX3Form_1<60, 35,
+                           (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                           "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+  defm XVCMPEQDP : XX3Form_Rcr<60, 99,
+                             (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                             "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+  defm XVCMPEQSP : XX3Form_Rcr<60, 67,
+                             (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                             "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+  defm XVCMPGEDP : XX3Form_Rcr<60, 115,
+                             (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                             "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+  defm XVCMPGESP : XX3Form_Rcr<60, 83,
+                             (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                             "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+  defm XVCMPGTDP : XX3Form_Rcr<60, 107,
+                             (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                             "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+  defm XVCMPGTSP : XX3Form_Rcr<60, 75,
+                             (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                             "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+
+  // Move Instructions
+  def XSABSDP : XX2Form<60, 345,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsabsdp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fabs f64:$XB))]>;
+  def XSNABSDP : XX2Form<60, 361,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsnabsdp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fneg (fabs f64:$XB)))]>;
+  def XSNEGDP : XX2Form<60, 377,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsnegdp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fneg f64:$XB))]>;
+  def XSCPSGNDP : XX3Form<60, 176,
+                      (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                      "xscpsgndp $XT, $XA, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>;
+
+  def XVABSDP : XX2Form<60, 473,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvabsdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fabs v2f64:$XB))]>;
+
+  def XVABSSP : XX2Form<60, 409,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvabssp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fabs v4f32:$XB))]>;
+
+  def XVCPSGNDP : XX3Form<60, 240,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                      "xvcpsgndp $XT, $XA, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>;
+  def XVCPSGNSP : XX3Form<60, 208,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                      "xvcpsgnsp $XT, $XA, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>;
+
+  def XVNABSDP : XX2Form<60, 489,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnabsdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>;
+  def XVNABSSP : XX2Form<60, 425,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnabssp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>;
+
+  def XVNEGDP : XX2Form<60, 505,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnegdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fneg v2f64:$XB))]>;
+  def XVNEGSP : XX2Form<60, 441,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnegsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fneg v4f32:$XB))]>;
+
+  // Conversion Instructions
+  def XSCVDPSP : XX2Form<60, 265,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpsp $XT, $XB", IIC_VecFP, []>;
+  def XSCVDPSXDS : XX2Form<60, 344,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpsxds $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctidz f64:$XB))]>;
+  def XSCVDPSXWS : XX2Form<60, 88,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpsxws $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctiwz f64:$XB))]>;
+  def XSCVDPUXDS : XX2Form<60, 328,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpuxds $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctiduz f64:$XB))]>;
+  def XSCVDPUXWS : XX2Form<60, 72,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpuxws $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctiwuz f64:$XB))]>;
+  def XSCVSPDP : XX2Form<60, 329,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvspdp $XT, $XB", IIC_VecFP, []>;
+  def XSCVSXDDP : XX2Form<60, 376,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvsxddp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfcfid f64:$XB))]>;
+  def XSCVUXDDP : XX2Form<60, 360,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvuxddp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfcfidu f64:$XB))]>;
+
+  def XVCVDPSP : XX2Form<60, 393,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpsp $XT, $XB", IIC_VecFP, []>;
+  def XVCVDPSXDS : XX2Form<60, 472,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpsxds $XT, $XB", IIC_VecFP,
+                      [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>;
+  def XVCVDPSXWS : XX2Form<60, 216,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpsxws $XT, $XB", IIC_VecFP, []>;
+  def XVCVDPUXDS : XX2Form<60, 456,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpuxds $XT, $XB", IIC_VecFP,
+                      [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>;
+  def XVCVDPUXWS : XX2Form<60, 200,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpuxws $XT, $XB", IIC_VecFP, []>;
+
+  def XVCVSPDP : XX2Form<60, 457,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspdp $XT, $XB", IIC_VecFP, []>;
+  def XVCVSPSXDS : XX2Form<60, 408,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspsxds $XT, $XB", IIC_VecFP, []>;
+  def XVCVSPSXWS : XX2Form<60, 152,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspsxws $XT, $XB", IIC_VecFP, []>;
+  def XVCVSPUXDS : XX2Form<60, 392,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspuxds $XT, $XB", IIC_VecFP, []>;
+  def XVCVSPUXWS : XX2Form<60, 136,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspuxws $XT, $XB", IIC_VecFP, []>;
+  def XVCVSXDDP : XX2Form<60, 504,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxddp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>;
+  def XVCVSXDSP : XX2Form<60, 440,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxdsp $XT, $XB", IIC_VecFP, []>;
+  def XVCVSXWDP : XX2Form<60, 248,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxwdp $XT, $XB", IIC_VecFP, []>;
+  def XVCVSXWSP : XX2Form<60, 184,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxwsp $XT, $XB", IIC_VecFP, []>;
+  def XVCVUXDDP : XX2Form<60, 488,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxddp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>;
+  def XVCVUXDSP : XX2Form<60, 424,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxdsp $XT, $XB", IIC_VecFP, []>;
+  def XVCVUXWDP : XX2Form<60, 232,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxwdp $XT, $XB", IIC_VecFP, []>;
+  def XVCVUXWSP : XX2Form<60, 168,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxwsp $XT, $XB", IIC_VecFP, []>;
+
+  // Rounding Instructions
+  def XSRDPI : XX2Form<60, 73,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpi $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (frnd f64:$XB))]>;
+  def XSRDPIC : XX2Form<60, 107,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpic $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fnearbyint f64:$XB))]>;
+  def XSRDPIM : XX2Form<60, 121,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpim $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (ffloor f64:$XB))]>;
+  def XSRDPIP : XX2Form<60, 105,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpip $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fceil f64:$XB))]>;
+  def XSRDPIZ : XX2Form<60, 89,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpiz $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (ftrunc f64:$XB))]>;
+
+  def XVRDPI : XX2Form<60, 201,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpi $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (frnd v2f64:$XB))]>;
+  def XVRDPIC : XX2Form<60, 235,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpic $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+  def XVRDPIM : XX2Form<60, 249,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpim $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (ffloor v2f64:$XB))]>;
+  def XVRDPIP : XX2Form<60, 233,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpip $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fceil v2f64:$XB))]>;
+  def XVRDPIZ : XX2Form<60, 217,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpiz $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (ftrunc v2f64:$XB))]>;
+
+  def XVRSPI : XX2Form<60, 137,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspi $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (frnd v4f32:$XB))]>;
+  def XVRSPIC : XX2Form<60, 171,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspic $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+  def XVRSPIM : XX2Form<60, 185,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspim $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (ffloor v4f32:$XB))]>;
+  def XVRSPIP : XX2Form<60, 169,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspip $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fceil v4f32:$XB))]>;
+  def XVRSPIZ : XX2Form<60, 153,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspiz $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (ftrunc v4f32:$XB))]>;
+
+  // Max/Min Instructions
+  let isCommutable = 1 in {
+  def XSMAXDP : XX3Form<60, 160,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xsmaxdp $XT, $XA, $XB", IIC_VecFP, []>;
+  def XSMINDP : XX3Form<60, 168,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xsmindp $XT, $XA, $XB", IIC_VecFP, []>;
+
+  def XVMAXDP : XX3Form<60, 224,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvmaxdp $XT, $XA, $XB", IIC_VecFP, []>;
+  def XVMINDP : XX3Form<60, 232,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvmindp $XT, $XA, $XB", IIC_VecFP, []>;
+
+  def XVMAXSP : XX3Form<60, 192,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvmaxsp $XT, $XA, $XB", IIC_VecFP, []>;
+  def XVMINSP : XX3Form<60, 200,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvminsp $XT, $XA, $XB", IIC_VecFP, []>;
+  } // isCommutable
+} // Uses = [RM]
+
+  // Logical Instructions
+  let isCommutable = 1 in
+  def XXLAND : XX3Form<60, 130,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxland $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>;
+  def XXLANDC : XX3Form<60, 138,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xxlandc $XT, $XA, $XB", IIC_VecGeneral,
+                        [(set v4i32:$XT, (and v4i32:$XA,
+                                              (vnot_ppc v4i32:$XB)))]>;
+  let isCommutable = 1 in {
+  def XXLNOR : XX3Form<60, 162,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxlnor $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA,
+                                                   v4i32:$XB)))]>;
+  def XXLOR : XX3Form<60, 146,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                      "xxlor $XT, $XA, $XB", IIC_VecGeneral,
+                      [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>;
+  let isCodeGenOnly = 1 in
+  def XXLORf: XX3Form<60, 146,
+                      (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                      "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>;
+  def XXLXOR : XX3Form<60, 154,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxlxor $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
+  } // isCommutable
+
+  // Permutation Instructions
+  def XXMRGHW : XX3Form<60, 18,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>;
+  def XXMRGLW : XX3Form<60, 50,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>;
+
+  def XXPERMDI : XX3Form_2<60, 10,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
+                       "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+  def XXSEL : XX4Form<60, 3,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
+                      "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
+
+  def XXSLDWI : XX3Form_2<60, 2,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW),
+                       "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, []>;
+  def XXSPLTW : XX2Form_2<60, 164,
+                       (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+                       "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+} // neverHasSideEffects
+} // AddedComplexity
+
+def : InstAlias<"xvmovdp $XT, $XB",
+                (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+def : InstAlias<"xvmovsp $XT, $XB",
+                (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+
+def : InstAlias<"xxspltd $XT, $XB, 0",
+                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+def : InstAlias<"xxmrghd $XT, $XA, $XB",
+                (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
+def : InstAlias<"xxmrgld $XT, $XA, $XB",
+                (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+          (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
+
+def : Pat<(f64 (vector_extract v2f64:$S, 0)),
+          (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(f64 (vector_extract v2f64:$S, 1)),
+          (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+
+// Additional fnmsub patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (XSNMSUBADP $B, $C, $A)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (XSNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B),
+          (XVNMSUBADP $B, $C, $A)>;
+def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B),
+          (XVNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+          (XVNMSUBASP $B, $C, $A)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+          (XVNMSUBASP $B, $C, $A)>;
+
+def : Pat<(v2f64 (bitconvert v4f32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v4i32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v8i16:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v16i8:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2i64 (bitconvert v4f32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v4i32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v8i16:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v16i8:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2f64 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v2i64 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+// sign extension patterns
+// To extend "in place" from v2i32 to v2i64, we have input data like:
+// | undef | i32 | undef | i32 |
+// but xvcvsxwdp expects the input in big-Endian format:
+// | i32 | undef | i32 | undef |
+// so we need to shift everything to the left by one i32 (word) before
+// the conversion.
+def : Pat<(sext_inreg v2i64:$C, v2i32),
+          (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>;
+def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
+          (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
+
+} // AddedComplexity
+} // HasVSX
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp
new file mode 100644
index 0000000..e5f113a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -0,0 +1,482 @@
+//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the 32-bit PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCJITInfo.h"
+#include "PPCRelocations.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+PPCJITInfo::PPCJITInfo(PPCSubtarget &STI)
+    : Subtarget(STI), is64Bit(STI.isPPC64()) {
+  useGOT = 0;
+}
+
+#define BUILD_ADDIS(RD,RS,IMM16) \
+  ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_ORI(RD,RS,UIMM16) \
+  ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_ORIS(RD,RS,UIMM16) \
+  ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_RLDICR(RD,RS,SH,ME) \
+  ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
+   (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
+#define BUILD_MTSPR(RS,SPR)      \
+  ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
+#define BUILD_BCCTRx(BO,BI,LINK) \
+  ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
+#define BUILD_B(TARGET, LINK) \
+  ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
+
+// Pseudo-ops
+#define BUILD_LIS(RD,IMM16)    BUILD_ADDIS(RD,0,IMM16)
+#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
+#define BUILD_MTCTR(RS)        BUILD_MTSPR(RS,9)
+#define BUILD_BCTR(LINK)       BUILD_BCCTRx(20,0,LINK)
+
+static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
+  intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
+  unsigned *AtI = (unsigned*)(intptr_t)At;
+
+  if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+    AtI[0] = BUILD_B(Offset, isCall);     // b/bl target
+  } else if (!is64Bit) {
+    AtI[0] = BUILD_LIS(12, To >> 16);     // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To);       // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_MTCTR(12);             // mtctr r12
+    AtI[3] = BUILD_BCTR(isCall);          // bctr/bctrl
+  } else {
+    AtI[0] = BUILD_LIS(12, To >> 48);      // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To >> 32);  // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_SLDI(12, 12, 32);       // sldi r12, r12, 32
+    AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
+    AtI[4] = BUILD_ORI(12, 12, To);        // ori r12, r12, lo16(address)
+    AtI[5] = BUILD_MTCTR(12);              // mtctr r12
+    AtI[6] = BUILD_BCTR(isCall);           // bctr/bctrl
+  }
+}
+
+extern "C" void PPC32CompilationCallback();
+extern "C" void PPC64CompilationCallback();
+
+// The first clause of the preprocessor directive looks wrong, but it is
+// necessary when compiling this code on non-PowerPC hosts.
+#if (!defined(__ppc__) && !defined(__powerpc__)) || defined(__powerpc64__) || defined(__ppc64__)
+void PPC32CompilationCallback() {
+  llvm_unreachable("This is not a 32bit PowerPC, you can't execute this!");
+}
+#elif !defined(__ELF__)
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC32CompilationCallback\n"
+"_PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   PowerPC32 ABI linkage    -  24 bytes
+    //                 parameters -  32 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  32 bytes
+    "mflr r0\n"
+    "stw r0,  8(r1)\n"
+    "stwu r1, -208(r1)\n"
+    // Save all int arg registers
+    "stw r10, 204(r1)\n"    "stw r9,  200(r1)\n"
+    "stw r8,  196(r1)\n"    "stw r7,  192(r1)\n"
+    "stw r6,  188(r1)\n"    "stw r5,  184(r1)\n"
+    "stw r4,  180(r1)\n"    "stw r3,  176(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 168(r1)\n"   "stfd f12, 160(r1)\n"
+    "stfd f11, 152(r1)\n"   "stfd f10, 144(r1)\n"
+    "stfd f9,  136(r1)\n"   "stfd f8,  128(r1)\n"
+    "stfd f7,  120(r1)\n"   "stfd f6,  112(r1)\n"
+    "stfd f5,  104(r1)\n"   "stfd f4,   96(r1)\n"
+    "stfd f3,   88(r1)\n"   "stfd f2,   80(r1)\n"
+    "stfd f1,   72(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   r3, r0\n"
+    "lwz  r2, 208(r1)\n" // stub's frame
+    "lwz  r4, 8(r2)\n" // stub's lr
+    "li   r5, 0\n"       // 0 == 32 bit
+    "bl _LLVMPPCCompilationCallback\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "lwz r10, 204(r1)\n"    "lwz r9,  200(r1)\n"
+    "lwz r8,  196(r1)\n"    "lwz r7,  192(r1)\n"
+    "lwz r6,  188(r1)\n"    "lwz r5,  184(r1)\n"
+    "lwz r4,  180(r1)\n"    "lwz r3,  176(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 168(r1)\n"    "lfd f12, 160(r1)\n"
+    "lfd f11, 152(r1)\n"    "lfd f10, 144(r1)\n"
+    "lfd f9,  136(r1)\n"    "lfd f8,  128(r1)\n"
+    "lfd f7,  120(r1)\n"    "lfd f6,  112(r1)\n"
+    "lfd f5,  104(r1)\n"    "lfd f4,   96(r1)\n"
+    "lfd f3,   88(r1)\n"    "lfd f2,   80(r1)\n"
+    "lfd f1,   72(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  r1, 208(r1)\n"
+    "lwz  r2, 8(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+
+#else
+// ELF PPC 32 support
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC32CompilationCallback\n"
+"PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 8 doubles f[1-8] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   8 double registers       -  64 bytes
+    //   8 int registers          -  32 bytes
+    "mflr 0\n"
+    "stw 0,  4(1)\n"
+    "stwu 1, -104(1)\n"
+    // Save all int arg registers
+    "stw 10, 100(1)\n"   "stw 9,  96(1)\n"
+    "stw 8,  92(1)\n"    "stw 7,  88(1)\n"
+    "stw 6,  84(1)\n"    "stw 5,  80(1)\n"
+    "stw 4,  76(1)\n"    "stw 3,  72(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 8,  64(1)\n"
+    "stfd 7,  56(1)\n"   "stfd 6,  48(1)\n"
+    "stfd 5,  40(1)\n"   "stfd 4,  32(1)\n"
+    "stfd 3,  24(1)\n"   "stfd 2,  16(1)\n"
+    "stfd 1,  8(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   3, 0\n"
+    "lwz  5, 104(1)\n" // stub's frame
+    "lwz  4, 4(5)\n" // stub's lr
+    "li   5, 0\n"       // 0 == 32 bit
+    "bl LLVMPPCCompilationCallback\n"
+    "mtctr 3\n"
+    // Restore all int arg registers
+    "lwz 10, 100(1)\n"   "lwz 9,  96(1)\n"
+    "lwz 8,  92(1)\n"    "lwz 7,  88(1)\n"
+    "lwz 6,  84(1)\n"    "lwz 5,  80(1)\n"
+    "lwz 4,  76(1)\n"    "lwz 3,  72(1)\n"
+    // Restore all FP arg registers
+    "lfd 8,  64(1)\n"
+    "lfd 7,  56(1)\n"    "lfd 6,  48(1)\n"
+    "lfd 5,  40(1)\n"    "lfd 4,  32(1)\n"
+    "lfd 3,  24(1)\n"    "lfd 2,  16(1)\n"
+    "lfd 1,  8(1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  1, 104(1)\n"
+    "lwz  0, 4(1)\n"
+    "mtlr 0\n"
+    "bctr\n"
+    );
+#endif
+
+#if !defined(__powerpc64__) && !defined(__ppc64__)
+void PPC64CompilationCallback() {
+  llvm_unreachable("This is not a 64bit PowerPC, you can't execute this!");
+}
+#else
+#  ifdef __ELF__
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC64CompilationCallback\n"
+#if _CALL_ELF == 2
+    ".type PPC64CompilationCallback,@function\n"
+"PPC64CompilationCallback:\n"
+#else
+    ".section \".opd\",\"aw\",@progbits\n"
+    ".align 3\n"
+"PPC64CompilationCallback:\n"
+    ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n"
+    ".size PPC64CompilationCallback,24\n"
+    ".previous\n"
+    ".align 4\n"
+    ".type PPC64CompilationCallback,@function\n"
+".L.PPC64CompilationCallback:\n"
+#endif
+#  else
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC64CompilationCallback\n"
+"_PPC64CompilationCallback:\n"
+#  endif
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // Set up a proper stack frame
+    // Layout
+    //   PowerPC64 ABI linkage    -  48 bytes
+    //                 parameters -  64 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  64 bytes
+    "mflr 0\n"
+    "std  0,  16(1)\n"
+    "stdu 1, -280(1)\n"
+    // Save all int arg registers
+    "std 10, 272(1)\n"    "std 9,  264(1)\n"
+    "std 8,  256(1)\n"    "std 7,  248(1)\n"
+    "std 6,  240(1)\n"    "std 5,  232(1)\n"
+    "std 4,  224(1)\n"    "std 3,  216(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 13, 208(1)\n"    "stfd 12, 200(1)\n"
+    "stfd 11, 192(1)\n"    "stfd 10, 184(1)\n"
+    "stfd 9,  176(1)\n"    "stfd 8,  168(1)\n"
+    "stfd 7,  160(1)\n"    "stfd 6,  152(1)\n"
+    "stfd 5,  144(1)\n"    "stfd 4,  136(1)\n"
+    "stfd 3,  128(1)\n"    "stfd 2,  120(1)\n"
+    "stfd 1,  112(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 1.
+    "mr   3, 0\n"      // return address (still in r0)
+    "ld   5, 280(1)\n" // stub's frame
+    "ld   4, 16(5)\n"  // stub's lr
+    "li   5, 1\n"      // 1 == 64 bit
+#  ifdef __ELF__
+    "bl LLVMPPCCompilationCallback\n"
+    "nop\n"
+#  else
+    "bl _LLVMPPCCompilationCallback\n"
+#  endif
+    "mtctr 3\n"
+    // Restore all int arg registers
+    "ld 10, 272(1)\n"    "ld 9,  264(1)\n"
+    "ld 8,  256(1)\n"    "ld 7,  248(1)\n"
+    "ld 6,  240(1)\n"    "ld 5,  232(1)\n"
+    "ld 4,  224(1)\n"    "ld 3,  216(1)\n"
+    // Restore all FP arg registers
+    "lfd 13, 208(1)\n"    "lfd 12, 200(1)\n"
+    "lfd 11, 192(1)\n"    "lfd 10, 184(1)\n"
+    "lfd 9,  176(1)\n"    "lfd 8,  168(1)\n"
+    "lfd 7,  160(1)\n"    "lfd 6,  152(1)\n"
+    "lfd 5,  144(1)\n"    "lfd 4,  136(1)\n"
+    "lfd 3,  128(1)\n"    "lfd 2,  120(1)\n"
+    "lfd 1,  112(1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "ld  1, 280(1)\n"
+    "ld  0, 16(1)\n"
+    "mtlr 0\n"
+    // XXX: any special TOC handling in the ELF case for JIT?
+    "bctr\n"
+    );
+#endif
+
+extern "C" {
+LLVM_LIBRARY_VISIBILITY void *
+LLVMPPCCompilationCallback(unsigned *StubCallAddrPlus4,
+                           unsigned *OrigCallAddrPlus4,
+                           bool is64Bit) {
+  // Adjust the pointer to the address of the call instruction in the stub
+  // emitted by emitFunctionStub, rather than the instruction after it.
+  unsigned *StubCallAddr = StubCallAddrPlus4 - 1;
+  unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1;
+
+  void *Target = JITCompilerFunction(StubCallAddr);
+
+  // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite
+  // it to branch directly to the destination.  If so, rewrite it so it does not
+  // need to go through the stub anymore.
+  unsigned OrigCallInst = *OrigCallAddr;
+  if ((OrigCallInst >> 26) == 18) {     // Direct call.
+    intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2;
+    
+    if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+      // Clear the original target out.
+      OrigCallInst &= (63 << 26) | 3;
+      // Fill in the new target.
+      OrigCallInst |= (Offset & ((1 << 24)-1)) << 2;
+      // Replace the call.
+      *OrigCallAddr = OrigCallInst;
+    }
+  }
+
+  // Assert that we are coming from a stub that was created with our
+  // emitFunctionStub.
+  if ((*StubCallAddr >> 26) == 18)
+    StubCallAddr -= 3;
+  else {
+  assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!");
+    StubCallAddr -= is64Bit ? 9 : 6;
+  }
+
+  // Rewrite the stub with an unconditional branch to the target, for any users
+  // who took the address of the stub.
+  EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit);
+  sys::Memory::InvalidateInstructionCache(StubCallAddr, 7*4);
+
+  // Put the address of the target function to call and the address to return to
+  // after calling the target function in a place that is easy to get on the
+  // stack after we restore all regs.
+  return Target;
+}
+}
+
+
+
+TargetJITInfo::LazyResolverFn
+PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) {
+  JITCompilerFunction = Fn;
+  return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback;
+}
+
+TargetJITInfo::StubLayout PPCJITInfo::getStubLayout() {
+  // The stub contains up to 10 4-byte instructions, aligned at 4 bytes: 3
+  // instructions to save the caller's address if this is a lazy-compilation
+  // stub, plus a 1-, 4-, or 7-instruction sequence to load an arbitrary address
+  // into a register and jump through it.
+  StubLayout Result = {10*4, 4};
+  return Result;
+}
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+defined(__APPLE__)
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#endif
+
+void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE) {
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)PPC32CompilationCallback && 
+      Fn != (void*)(intptr_t)PPC64CompilationCallback) {
+    void *Addr = (void*)JCE.getCurrentPCValue();
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    EmitBranchToAt((intptr_t)Addr, (intptr_t)Fn, false, is64Bit);
+    sys::Memory::InvalidateInstructionCache(Addr, 7*4);
+    return Addr;
+  }
+
+  void *Addr = (void*)JCE.getCurrentPCValue();
+  if (is64Bit) {
+    JCE.emitWordBE(0xf821ffb1);     // stdu r1,-80(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0xf9610060);     // std r11, 96(r1)
+  } else if (Subtarget.isDarwinABI()){
+    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0x91610028);     // stw r11, 40(r1)
+  } else {
+    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0x91610024);     // stw r11, 36(r1)
+  }
+  intptr_t BranchAddr = (intptr_t)JCE.getCurrentPCValue();
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  EmitBranchToAt(BranchAddr, (intptr_t)Fn, true, is64Bit);
+  sys::Memory::InvalidateInstructionCache(Addr, 10*4);
+  return Addr;
+}
+
+
+void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((PPC::RelocationType)MR->getRelocationType()) {
+    default: llvm_unreachable("Unknown relocation type!");
+    case PPC::reloc_pcrel_bx:
+      // PC-relative relocation for b and bl instructions.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 24)-1))  << 2;
+      break;
+    case PPC::reloc_pcrel_bcx:
+      // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other
+      // bcx instructions.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 14)-1))  << 2;
+      break;
+    case PPC::reloc_absolute_high:     // high bits of ref -> low 16 of instr
+    case PPC::reloc_absolute_low: {    // low bits of ref  -> low 16 of instr
+      ResultPtr += MR->getConstantVal();
+
+      // If this is a high-part access, get the high-part.
+      if (MR->getRelocationType() == PPC::reloc_absolute_high) {
+        // If the low part will have a carry (really a borrow) from the low
+        // 16-bits into the high 16, add a bit to borrow from.
+        if (((int)ResultPtr << 16) < 0)
+          ResultPtr += 1 << 16;
+        ResultPtr >>= 16;
+      }
+
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 65535;
+      unsigned HighBits = *RelocPos & ~65535;
+      *RelocPos = LowBits | HighBits;  // Slam into low 16-bits
+      break;
+    }
+    case PPC::reloc_absolute_low_ix: {  // low bits of ref  -> low 14 of instr
+      ResultPtr += MR->getConstantVal();
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 0xFFFC;
+      unsigned HighBits = *RelocPos & 0xFFFF0003;
+      *RelocPos = LowBits | HighBits;  // Slam into low 14-bits.
+      break;
+    }
+    }
+  }
+}
+
+void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit);
+  sys::Memory::InvalidateInstructionCache(Old, 7*4);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h
new file mode 100644
index 0000000..b6b37ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h
@@ -0,0 +1,46 @@
+//===-- PPCJITInfo.h - PowerPC impl. of the JIT interface -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_JITINFO_H
+#define POWERPC_JITINFO_H
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class PPCSubtarget;
+class PPCJITInfo : public TargetJITInfo {
+protected:
+  PPCSubtarget &Subtarget;
+  bool is64Bit;
+
+public:
+  PPCJITInfo(PPCSubtarget &STI);
+
+  StubLayout getStubLayout() override;
+  void *emitFunctionStub(const Function *F, void *Fn,
+                         JITCodeEmitter &JCE) override;
+  LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+  void relocate(void *Function, MachineRelocation *MR, unsigned NumRelocs,
+                unsigned char *GOTBase) override;
+
+  /// replaceMachineCodeForFunction - Make it so that calling the function
+  /// whose machine code is at OLD turns into a call to NEW, perhaps by
+  /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+  /// code.
+  ///
+  void replaceMachineCodeForFunction(void *Old, void *New) override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
new file mode 100644
index 0000000..6680413
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -0,0 +1,216 @@
+//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PPC MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+using namespace llvm;
+
+static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
+  return AP.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
+  const TargetMachine &TM = AP.TM;
+  Mangler *Mang = AP.Mang;
+  const DataLayout *DL = TM.getDataLayout();
+  MCContext &Ctx = AP.OutContext;
+  bool isDarwin = TM.getSubtarget<PPCSubtarget>().isDarwin();
+
+  SmallString<128> Name;
+  StringRef Suffix;
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB) {
+    if (isDarwin)
+      Suffix = "$stub";
+  } else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
+    Suffix = "$non_lazy_ptr";
+
+  if (!Suffix.empty())
+    Name += DL->getPrivateGlobalPrefix();
+
+  unsigned PrefixLen = Name.size();
+
+  if (!MO.isGlobal()) {
+    assert(MO.isSymbol() && "Isn't a symbol reference");
+    Mang->getNameWithPrefix(Name, MO.getSymbolName());
+  } else {
+    const GlobalValue *GV = MO.getGlobal();
+    TM.getNameWithPrefix(Name, GV, *Mang);
+  }
+
+  unsigned OrigLen = Name.size() - PrefixLen;
+
+  Name += Suffix;
+  MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+  StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
+
+  // If the target flags on the operand changes the name of the symbol, do that
+  // before we return the symbol.
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && isDarwin) {
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI(AP).getFnStubEntry(Sym);
+    if (StubSym.getPointer())
+      return Sym;
+    
+    if (MO.isGlobal()) {
+      StubSym =
+      MachineModuleInfoImpl::
+      StubValueTy(AP.getSymbol(MO.getGlobal()),
+                  !MO.getGlobal()->hasInternalLinkage());
+    } else {
+      StubSym =
+      MachineModuleInfoImpl::
+      StubValueTy(Ctx.GetOrCreateSymbol(OrigName), false);
+    }
+    return Sym;
+  }
+
+  // If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
+  // then add the suffix.
+  if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
+    MachineModuleInfoMachO &MachO = getMachOMMI(AP);
+    
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ? 
+         MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym);
+    
+    if (!StubSym.getPointer()) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym = MachineModuleInfoImpl::
+                   StubValueTy(AP.getSymbol(MO.getGlobal()),
+                               !MO.getGlobal()->hasInternalLinkage());
+    }
+    return Sym;
+  }
+  
+  return Sym;
+}
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              AsmPrinter &Printer, bool isDarwin) {
+  MCContext &Ctx = Printer.OutContext;
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+  unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK;
+
+  switch (access) {
+    case PPCII::MO_TPREL_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_TPREL_LO;
+      break;
+    case PPCII::MO_TPREL_HA:
+      RefKind = MCSymbolRefExpr::VK_PPC_TPREL_HA;
+      break;
+    case PPCII::MO_DTPREL_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_DTPREL_LO;
+      break;
+    case PPCII::MO_TLSLD_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO;
+      break;
+    case PPCII::MO_TOC_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO;
+      break;
+    case PPCII::MO_TLS:
+      RefKind = MCSymbolRefExpr::VK_PPC_TLS;
+      break;
+  }
+
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin)
+    RefKind = MCSymbolRefExpr::VK_PLT;
+
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx);
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+
+  // Subtract off the PIC base if required.
+  if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) {
+    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+    
+    const MCExpr *PB = MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+    Expr = MCBinaryExpr::CreateSub(Expr, PB, Ctx);
+  }
+
+  // Add ha16() / lo16() markers if required.
+  switch (access) {
+    case PPCII::MO_LO:
+      Expr = PPCMCExpr::CreateLo(Expr, isDarwin, Ctx);
+      break;
+    case PPCII::MO_HA:
+      Expr = PPCMCExpr::CreateHa(Expr, isDarwin, Ctx);
+      break;
+  }
+
+  return MCOperand::CreateExpr(Expr);
+}
+
+void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        AsmPrinter &AP, bool isDarwin) {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      assert(!MO.getSubReg() && "Subregs should be eliminated!");
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                      MO.getMBB()->getSymbol(), AP.OutContext));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP,
+                          isDarwin);
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
new file mode 100644
index 0000000..9da1b1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -0,0 +1,23 @@
+//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachineFunctionInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+void PPCFunctionInfo::anchor() { }
+
+MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
+  const DataLayout *DL = MF.getTarget().getDataLayout();
+  return MF.getContext().GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
+    Twine(MF.getFunctionNumber())+"$poff");
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 0000000..9a2cec7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,191 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHINE_FUNCTION_INFO_H
+#define PPC_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunction private
+/// PowerPC target-specific information for each MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored.  Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub.)
+  int FramePointerSaveIndex;
+  
+  /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+  ///
+  int ReturnAddrSaveIndex;
+
+  /// Frame index where the old base pointer is stored.
+  int BasePointerSaveIndex;
+
+  /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+  /// function.  This is only valid after the initial scan of the function by
+  /// PEI.
+  bool MustSaveLR;
+
+  /// Does this function have any stack spills.
+  bool HasSpills;
+
+  /// Does this function spill using instructions with only r+r (not r+i)
+  /// forms.
+  bool HasNonRISpills;
+
+  /// SpillsCR - Indicates whether CR is spilled in the current function.
+  bool SpillsCR;
+
+  /// Indicates whether VRSAVE is spilled in the current function.
+  bool SpillsVRSAVE;
+
+  /// LRStoreRequired - The bool indicates whether there is some explicit use of
+  /// the LR/LR8 stack slot that is not obvious from scanning the code.  This
+  /// requires that the code generator produce a store of LR to the stack on
+  /// entry, even though LR may otherwise apparently not be used.
+  bool LRStoreRequired;
+
+  /// MinReservedArea - This is the frame size that is at least reserved in a
+  /// potential caller (parameter+linkage area).
+  unsigned MinReservedArea;
+
+  /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+  /// amount the stack pointer is adjusted to make the frame bigger for tail
+  /// calls. Used for creating an area before the register spill area.
+  int TailCallSPDelta;
+
+  /// HasFastCall - Does this function contain a fast call. Used to determine
+  /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+  bool HasFastCall;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// VarArgsStackOffset - StackOffset for start of stack
+  /// arguments.
+  int VarArgsStackOffset;
+  /// VarArgsNumGPR - Index of the first unused integer
+  /// register for parameter passing.
+  unsigned VarArgsNumGPR;
+  /// VarArgsNumFPR - Index of the first unused double
+  /// register for parameter passing.
+  unsigned VarArgsNumFPR;
+
+  /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
+  int CRSpillFrameIndex;
+
+  /// If any of CR[2-4] need to be saved in the prologue and restored in the
+  /// epilogue then they are added to this array. This is used for the
+  /// 64-bit SVR4 ABI.
+  SmallVector<unsigned, 3> MustSaveCRs;
+
+  /// Hold onto our MachineFunction context.
+  MachineFunction &MF;
+
+  /// Whether this uses the PIC Base register or not.
+  bool UsesPICBase;
+
+public:
+  explicit PPCFunctionInfo(MachineFunction &MF) 
+    : FramePointerSaveIndex(0),
+      ReturnAddrSaveIndex(0),
+      BasePointerSaveIndex(0),
+      HasSpills(false),
+      HasNonRISpills(false),
+      SpillsCR(false),
+      SpillsVRSAVE(false),
+      LRStoreRequired(false),
+      MinReservedArea(0),
+      TailCallSPDelta(0),
+      HasFastCall(false),
+      VarArgsFrameIndex(0),
+      VarArgsStackOffset(0),
+      VarArgsNumGPR(0),
+      VarArgsNumFPR(0),
+      CRSpillFrameIndex(0),
+      MF(MF),
+      UsesPICBase(0) {}
+
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+  
+  int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
+  void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
+
+  int getBasePointerSaveIndex() const { return BasePointerSaveIndex; }
+  void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; }
+
+  unsigned getMinReservedArea() const { return MinReservedArea; }
+  void setMinReservedArea(unsigned size) { MinReservedArea = size; }
+
+  int getTailCallSPDelta() const { return TailCallSPDelta; }
+  void setTailCallSPDelta(int size) { TailCallSPDelta = size; }
+
+  /// MustSaveLR - This is set when the prolog/epilog inserter does its initial
+  /// scan of the function. It is true if the LR/LR8 register is ever explicitly
+  /// defined/clobbered in the machine function (e.g. by calls and movpctolr,
+  /// which is used in PIC generation), or if the LR stack slot is explicitly
+  /// referenced by builtin_return_address.
+  void setMustSaveLR(bool U) { MustSaveLR = U; }
+  bool mustSaveLR() const    { return MustSaveLR; }
+
+  void setHasSpills()      { HasSpills = true; }
+  bool hasSpills() const   { return HasSpills; }
+
+  void setHasNonRISpills()    { HasNonRISpills = true; }
+  bool hasNonRISpills() const { return HasNonRISpills; }
+
+  void setSpillsCR()       { SpillsCR = true; }
+  bool isCRSpilled() const { return SpillsCR; }
+
+  void setSpillsVRSAVE()       { SpillsVRSAVE = true; }
+  bool isVRSAVESpilled() const { return SpillsVRSAVE; }
+
+  void setLRStoreRequired() { LRStoreRequired = true; }
+  bool isLRStoreRequired() const { return LRStoreRequired; }
+
+  void setHasFastCall() { HasFastCall = true; }
+  bool hasFastCall() const { return HasFastCall;}
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  int getVarArgsStackOffset() const { return VarArgsStackOffset; }
+  void setVarArgsStackOffset(int Offset) { VarArgsStackOffset = Offset; }
+
+  unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; }
+  void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; }
+
+  unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
+  void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
+
+  int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
+  void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
+
+  const SmallVectorImpl<unsigned> &
+    getMustSaveCRs() const { return MustSaveCRs; }
+  void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); }
+
+  void setUsesPICBase(bool uses) { UsesPICBase = uses; }
+  bool usesPICBase() const { return UsesPICBase; }
+
+  MCSymbol *getPICOffsetSymbol() const;
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 0000000..17b836d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  202162278U,	// <0,0,0,0>: Cost 1 vspltisw0 LHS
+  1140850790U,	// <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+  2617247181U,	// <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+  2635163787U,	// <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+  1543507254U,	// <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+  2281701705U,	// <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+  2617250133U,	// <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+  2659054575U,	// <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,0,u>: Cost 1 vspltisw0 LHS
+  1141686282U,	// <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+  67944550U,	// <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+  1685241958U,	// <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2215870716U,	// <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1141727570U,	// <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  2215428562U,	// <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+  2215428589U,	// <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+  2659062768U,	// <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+  67945117U,	// <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+  2684356045U,	// <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+  2216009830U,	// <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2216009901U,	// <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+  2698290853U,	// <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+  3289751890U,	// <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+  3758098275U,	// <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+  2684356538U,	// <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+  3758098410U,	// <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+  2216010397U,	// <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2702272651U,	// <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+  2216656998U,	// <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  3844669704U,	// <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+  2216657148U,	// <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+  2684357122U,	// <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+  3732820066U,	// <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+  3778005624U,	// <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+  3374713464U,	// <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+  2216657565U,	// <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217361408U,	// <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+  1143619686U,	// <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  3291103405U,	// <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+  3827269988U,	// <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+  1143619922U,	// <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1610616118U,	// <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  3758099833U,	// <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+  3854107016U,	// <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+  1143620253U,	// <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2284396544U,	// <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+  2218025062U,	// <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+  3758100203U,	// <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+  3395966100U,	// <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+  3804549052U,	// <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+  2302314964U,	// <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+  2785821138U,	// <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  3395966428U,	// <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+  2787148260U,	// <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+  2684358997U,	// <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+  2218631270U,	// <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2684359162U,	// <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+  3758101042U,	// <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+  3732843830U,	// <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+  3758101227U,	// <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+  2684359480U,	// <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+  2724836173U,	// <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+  2725499806U,	// <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+  2726163439U,	// <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  2219311206U,	// <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+  3868557900U,	// <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+  3377400112U,	// <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+  2684360038U,	// <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+  3732852834U,	// <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+  3871507060U,	// <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+  2303658616U,	// <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+  2726163439U,	// <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,u,0>: Cost 1 vspltisw0 LHS
+  72589414U,	// <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+  1685242525U,	// <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2220073212U,	// <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1146331474U,	// <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  1610619034U,	// <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  2785821138U,	// <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  2659120119U,	// <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+  72589981U,	// <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+  2698297344U,	// <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+  1624555622U,	// <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  2758984428U,	// <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+  2635237524U,	// <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+  2693652818U,	// <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+  2281701714U,	// <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+  2698297846U,	// <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+  2659128312U,	// <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+  1624556189U,	// <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  1543585802U,	// <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+  1141728052U,	// <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1141728150U,	// <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2295644334U,	// <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+  1543589174U,	// <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+  2290999634U,	// <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+  2617332135U,	// <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+  2617332720U,	// <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+  1142171004U,	// <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+  1561509990U,	// <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+  2623308516U,	// <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+  2698298984U,	// <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+  835584U,	// <0,1,2,3>: Cost 0 copy LHS
+  1561513270U,	// <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+  2647199304U,	// <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+  2698299322U,	// <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+  1585402874U,	// <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+  835584U,	// <0,1,2,u>: Cost 0 copy LHS
+  2698299540U,	// <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+  3290399540U,	// <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+  2698299720U,	// <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+  2698299804U,	// <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+  2698299906U,	// <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+  3832726521U,	// <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+  2724842160U,	// <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+  2706926275U,	// <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+  2698300190U,	// <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+  2635268198U,	// <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+  2217362228U,	// <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+  2217362326U,	// <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+  2635270296U,	// <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+  2635271478U,	// <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+  1624558902U,	// <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2659160910U,	// <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+  2659161084U,	// <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+  1624559145U,	// <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  3832726639U,	// <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+  2714889871U,	// <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+  2302314646U,	// <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  3834717321U,	// <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+  3832726679U,	// <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+  2717544403U,	// <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+  2718208036U,	// <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+  3792613493U,	// <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+  2719535302U,	// <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+  2659172454U,	// <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+  3832726735U,	// <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+  2724844026U,	// <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+  3775361608U,	// <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+  2659175734U,	// <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+  3832726771U,	// <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+  2724844344U,	// <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+  1651102542U,	// <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+  1651766175U,	// <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+  2724844536U,	// <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+  3377397770U,	// <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+  2698302636U,	// <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+  2728162531U,	// <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+  2724844902U,	// <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+  3377398098U,	// <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+  2724845076U,	// <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+  2724845164U,	// <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+  2724845186U,	// <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+  1561559142U,	// <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+  1146331956U,	// <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1146332054U,	// <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  835584U,	// <0,1,u,3>: Cost 0 copy LHS
+  1561562422U,	// <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+  1624561818U,	// <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2220074191U,	// <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+  1585452032U,	// <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+  835584U,	// <0,1,u,u>: Cost 0 copy LHS
+  2214593997U,	// <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+  2214675999U,	// <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+  2214594152U,	// <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+  1207959654U,	// <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  3709054262U,	// <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+  3375350836U,	// <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+  2214594490U,	// <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+  3288336362U,	// <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+  1207959659U,	// <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+  2215871994U,	// <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2215470623U,	// <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1141728872U,	// <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1141728934U,	// <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2215872323U,	// <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2215872405U,	// <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1141729210U,	// <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2215430122U,	// <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1141729368U,	// <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+  3289736698U,	// <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+  3289744927U,	// <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+  2216011368U,	// <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+  2216019622U,	// <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+  3289769795U,	// <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+  3289778069U,	// <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+  2216044474U,	// <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+  3732960259U,	// <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+  2216061016U,	// <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+  2758985382U,	// <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+  2758985392U,	// <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+  3290400360U,	// <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+  2758985408U,	// <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+  2758985422U,	// <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+  2785822424U,	// <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+  3290400698U,	// <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+  2765915876U,	// <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+  2758985453U,	// <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+  3291104762U,	// <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+  2217362979U,	// <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2217363048U,	// <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+  2217363110U,	// <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+  3291105087U,	// <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+  3291105173U,	// <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+  2217363386U,	// <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+  3788639688U,	// <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+  2217363515U,	// <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+  3376054371U,	// <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+  3788639888U,	// <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+  3376055912U,	// <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+  2302312550U,	// <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3376054375U,	// <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+  3374728244U,	// <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+  3805229154U,	// <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+  3376055512U,	// <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+  2302312555U,	// <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3709100134U,	// <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+  3709100950U,	// <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+  3709102010U,	// <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+  2758985658U,	// <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+  3709103414U,	// <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+  3732992098U,	// <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+  3292374970U,	// <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+  3798594383U,	// <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+  2758985703U,	// <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+  3788641274U,	// <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+  3377398508U,	// <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+  3377398590U,	// <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+  2303656038U,	// <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  3709111606U,	// <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+  3377398836U,	// <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+  3803903447U,	// <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+  3293054954U,	// <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+  2303656043U,	// <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2220074490U,	// <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2220074527U,	// <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1146332776U,	// <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1146332838U,	// <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2220074819U,	// <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2220074901U,	// <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1146333114U,	// <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2220074986U,	// <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1146333243U,	// <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+  2629410816U,	// <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+  2753530006U,	// <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+  2629412301U,	// <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+  2214594972U,	// <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+  2758985908U,	// <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+  3733016674U,	// <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+  3777364488U,	// <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+  2281703354U,	// <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+  2758985941U,	// <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+  1141729430U,	// <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2215471334U,	// <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2215471425U,	// <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1141729692U,	// <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1141729794U,	// <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2215430738U,	// <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+  2215430776U,	// <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+  2295646138U,	// <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+  1141730078U,	// <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2758986032U,	// <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+  3709141910U,	// <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+  3289753921U,	// <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+  2770929992U,	// <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+  3289754114U,	// <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+  3362095460U,	// <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+  3832727910U,	// <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+  3365414842U,	// <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+  2771298677U,	// <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+  2216659094U,	// <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+  3290409190U,	// <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+  2703624496U,	// <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+  2216683932U,	// <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+  2216692226U,	// <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+  3733041250U,	// <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+  3832727988U,	// <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+  3374712762U,	// <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+  2216725278U,	// <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+  2217363606U,	// <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+  3291105510U,	// <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+  3291105601U,	// <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+  2217363868U,	// <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+  2217363970U,	// <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+  2758986242U,	// <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+  3727077685U,	// <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+  3364767674U,	// <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+  2217364254U,	// <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+  3832728102U,	// <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+  3405916003U,	// <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+  3376055840U,	// <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+  3376055679U,	// <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+  3376055194U,	// <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+  3859565138U,	// <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+  2727514210U,	// <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  3376056250U,	// <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+  2727514210U,	// <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  2758986360U,	// <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  3709174678U,	// <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+  3795284411U,	// <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+  3709175980U,	// <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+  3833096860U,	// <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+  3376728235U,	// <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+  3859565229U,	// <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+  2773879472U,	// <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+  2758986360U,	// <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  2303656854U,	// <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+  3807229018U,	// <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+  2727515284U,	// <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+  3377399410U,	// <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+  3377398682U,	// <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+  3801257409U,	// <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+  3377399980U,	// <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+  3375409082U,	// <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+  2731497082U,	// <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+  1146333334U,	// <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2220075238U,	// <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2220075329U,	// <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1146333596U,	// <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1146333698U,	// <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2758986566U,	// <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+  2803739472U,	// <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+  2295703482U,	// <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+  1146333982U,	// <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2214595473U,	// <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+  2693677158U,	// <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+  3839437689U,	// <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+  3709200559U,	// <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+  2693677394U,	// <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+  1140854070U,	// <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  3767419409U,	// <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+  3854109604U,	// <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+  1140854313U,	// <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+  1141689234U,	// <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2215431114U,	// <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+  2215431221U,	// <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635466928U,	// <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+  1141689552U,	// <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  67947830U,	// <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2215431545U,	// <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2659357716U,	// <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+  67948073U,	// <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+  3767420369U,	// <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+  3767420451U,	// <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+  3767420520U,	// <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+  2698323625U,	// <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+  3709218102U,	// <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+  2216013110U,	// <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767420858U,	// <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+  3774719981U,	// <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+  2216013353U,	// <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767421078U,	// <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+  3776710880U,	// <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+  3833097325U,	// <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+  3767421340U,	// <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+  3767421442U,	// <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+  2216660278U,	// <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  3833097361U,	// <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+  3780692678U,	// <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+  2216660521U,	// <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2617573416U,	// <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+  2217364450U,	// <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+  3691316771U,	// <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+  3709233331U,	// <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+  2785823952U,	// <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+  1143622966U,	// <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  3691319723U,	// <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+  3854109932U,	// <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+  1143623209U,	// <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2635497574U,	// <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+  2635498390U,	// <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+  3709240936U,	// <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+  2635499700U,	// <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+  2635500854U,	// <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+  2785824044U,	// <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+  1685245238U,	// <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659390488U,	// <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+  1685245256U,	// <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  3839438161U,	// <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+  3798610347U,	// <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+  3798610426U,	// <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+  3795956237U,	// <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+  3733138742U,	// <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+  2218634550U,	// <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+  3798610744U,	// <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+  2724868945U,	// <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+  2725532578U,	// <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+  3383371465U,	// <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+  3800601668U,	// <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+  3775386826U,	// <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+  3801928934U,	// <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+  3721202998U,	// <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+  2780368328U,	// <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  3383372686U,	// <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+  3854110170U,	// <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+  2780368328U,	// <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  1146334098U,	// <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2220076002U,	// <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+  2220076085U,	// <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635524279U,	// <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+  1146334416U,	// <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  72592694U,	// <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+  1685245481U,	// <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659415067U,	// <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+  72592937U,	// <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+  2281704337U,	// <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+  2704965734U,	// <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  3778707666U,	// <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+  3778707708U,	// <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+  2687050057U,	// <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+  2214596612U,	// <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+  2785824372U,	// <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+  3854110332U,	// <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+  2704966301U,	// <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  1567768678U,	// <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+  2312236570U,	// <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+  2215431915U,	// <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641512598U,	// <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+  1567771538U,	// <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+  1141690372U,	// <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1141690466U,	// <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2641515514U,	// <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+  1141690615U,	// <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+  3772736973U,	// <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+  3778709024U,	// <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+  3778709096U,	// <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+  3778709158U,	// <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+  3772737275U,	// <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+  3859566351U,	// <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+  3778709434U,	// <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+  3805251562U,	// <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+  3775391807U,	// <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+  2704967830U,	// <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+  3776719073U,	// <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+  3777382706U,	// <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+  3778709887U,	// <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+  2704968148U,	// <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+  3857428317U,	// <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+  3364096514U,	// <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+  3780700871U,	// <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+  2707622680U,	// <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+  2728856466U,	// <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+  3697361674U,	// <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+  3697362601U,	// <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+  3364766635U,	// <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+  2217365428U,	// <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+  2704969014U,	// <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  2785824700U,	// <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+  3364766963U,	// <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+  2704969257U,	// <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  3846148050U,	// <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+  2326203282U,	// <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+  3291746027U,	// <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+  3376054482U,	// <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+  3790655366U,	// <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+  2785824772U,	// <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+  2724876386U,	// <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+  3858903057U,	// <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+  2736820484U,	// <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+  2659467366U,	// <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+  3859566643U,	// <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+  3798618618U,	// <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+  3852857410U,	// <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+  2659470646U,	// <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+  2659471458U,	// <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  3832729696U,	// <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+  1712083042U,	// <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+  1712156779U,	// <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+  2731512826U,	// <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+  3859566717U,	// <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+  3798619284U,	// <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+  3778712803U,	// <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+  2728858936U,	// <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+  3859566753U,	// <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+  3377398135U,	// <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+  3798619686U,	// <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+  2731513468U,	// <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+  1567826022U,	// <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+  2704971566U,	// <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  2220076779U,	// <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641569942U,	// <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+  1567828889U,	// <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+  1146335236U,	// <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1146335330U,	// <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  1713410308U,	// <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+  1713484045U,	// <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+  2214596949U,	// <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+  2214678951U,	// <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+  2214597114U,	// <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+  3852857653U,	// <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+  3832729919U,	// <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+  3721293427U,	// <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+  2214597432U,	// <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+  1207962934U,	// <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  1207962935U,	// <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+  2215432481U,	// <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+  2215432615U,	// <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1141690874U,	// <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2215432754U,	// <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2215432817U,	// <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+  2215432939U,	// <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+  1141691192U,	// <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221905718U,	// <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  1221905719U,	// <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+  3852857787U,	// <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+  3289764265U,	// <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+  3289690618U,	// <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+  3862589907U,	// <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+  3733253430U,	// <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+  3733254242U,	// <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+  3777390522U,	// <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+  2785825274U,	// <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+  2785825283U,	// <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+  3777390742U,	// <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+  3863106066U,	// <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+  3777390899U,	// <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+  3290436146U,	// <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+  3779381762U,	// <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+  3779381798U,	// <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+  3733262920U,	// <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+  2300972342U,	// <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2300972343U,	// <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+  3802606482U,	// <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+  2217365931U,	// <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2217366010U,	// <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+  3291107890U,	// <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+  3291099805U,	// <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+  3777391926U,	// <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+  2217366328U,	// <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+  2291027254U,	// <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  2291027255U,	// <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+  3852858033U,	// <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+  3395964532U,	// <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+  3864507069U,	// <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+  3376056678U,	// <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+  3721334070U,	// <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+  3395964860U,	// <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+  3864802017U,	// <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+  2302315830U,	// <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  2302315831U,	// <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+  3852858108U,	// <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+  3398624745U,	// <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+  2218668538U,	// <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+  3292418610U,	// <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+  3733286198U,	// <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+  3797299889U,	// <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+  2785825592U,	// <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+  2785825602U,	// <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+  2785825611U,	// <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+  2785825614U,	// <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+  2758988632U,	// <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+  3377400084U,	// <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+  2792166248U,	// <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+  2785825654U,	// <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+  2785825664U,	// <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  3859567493U,	// <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+  2303659318U,	// <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303659319U,	// <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2785825695U,	// <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+  2220077479U,	// <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1146335738U,	// <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2792829881U,	// <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+  2785825735U,	// <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+  2785825664U,	// <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  1146336056U,	// <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221963062U,	// <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  1221963063U,	// <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+  2653593600U,	// <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+  2706309222U,	// <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  3709421498U,	// <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+  2281705978U,	// <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+  2785825816U,	// <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+  2785825826U,	// <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+  2653598037U,	// <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+  2214598252U,	// <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+  2706309789U,	// <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  1141691386U,	// <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2215433290U,	// <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+  2706310038U,	// <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+  2322190842U,	// <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+  1141691750U,	// <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2215433654U,	// <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+  2653606230U,	// <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+  1141692012U,	// <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1141692034U,	// <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  2785825940U,	// <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+  3768108576U,	// <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+  3780052584U,	// <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+  2794820780U,	// <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+  3859641528U,	// <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+  3733327970U,	// <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+  3778062266U,	// <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+  3733328944U,	// <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+  2795189465U,	// <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+  2324861026U,	// <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+  3780053233U,	// <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+  3780053296U,	// <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+  3778062725U,	// <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+  3780053506U,	// <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+  3803941469U,	// <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+  2706311800U,	// <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+  3398603586U,	// <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+  2707639066U,	// <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+  2217366522U,	// <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+  3727369110U,	// <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+  3291108500U,	// <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+  3727370872U,	// <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+  2217366886U,	// <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+  2706312502U,	// <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  3786026321U,	// <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+  2217367148U,	// <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+  2706312745U,	// <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2322223202U,	// <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+  3399946987U,	// <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+  3291780244U,	// <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+  3727378582U,	// <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+  3727379766U,	// <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+  3859568054U,	// <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+  2785826241U,	// <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+  3395965762U,	// <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+  2787153363U,	// <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+  2785826268U,	// <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+  3780055420U,	// <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+  3859568110U,	// <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+  3874534903U,	// <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+  3859641856U,	// <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+  3733360738U,	// <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+  3859568145U,	// <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+  2797770260U,	// <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+  2797843997U,	// <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+  2785826342U,	// <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+  3727393686U,	// <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+  3868563003U,	// <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+  3377397988U,	// <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+  2219349350U,	// <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+  3859568217U,	// <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+  2730202588U,	// <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+  2785826412U,	// <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+  2731529854U,	// <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+  1146336250U,	// <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2706315054U,	// <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  2653660845U,	// <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+  2322248186U,	// <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+  1146336614U,	// <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2706315418U,	// <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2653663581U,	// <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+  1146336876U,	// <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1146336898U,	// <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  202162278U,	// <0,u,0,0>: Cost 1 vspltisw0 LHS
+  1624612966U,	// <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+  2629780986U,	// <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+  1207959708U,	// <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  1544097078U,	// <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+  1140856986U,	// <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  2698355253U,	// <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+  1207962952U,	// <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  202162278U,	// <0,u,0,u>: Cost 1 vspltisw0 LHS
+  1142134483U,	// <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+  67950382U,	// <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+  1142175624U,	// <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  1142175676U,	// <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+  1142134847U,	// <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  67950746U,	// <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1142175952U,	// <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221905736U,	// <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  67950949U,	// <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+  1562026086U,	// <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+  2216015662U,	// <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2698356328U,	// <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+  835584U,	// <0,u,2,3>: Cost 0 copy LHS
+  1562029366U,	// <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+  2216016026U,	// <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  2698356666U,	// <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+  1585919033U,	// <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+  835584U,	// <0,u,2,u>: Cost 0 copy LHS
+  2758989756U,	// <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+  2216662830U,	// <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2703665461U,	// <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+  2758989782U,	// <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+  2758989796U,	// <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+  2216663194U,	// <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2706319993U,	// <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+  2300972360U,	// <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2216663397U,	// <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217367251U,	// <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+  1143625518U,	// <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2217367432U,	// <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+  2217367484U,	// <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+  1143619922U,	// <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1143625882U,	// <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2217367760U,	// <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+  2291027272U,	// <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  1143626085U,	// <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2635792486U,	// <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+  2635793302U,	// <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+  2302314646U,	// <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  2635794648U,	// <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+  2635795766U,	// <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+  2717601754U,	// <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+  1685248154U,	// <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2302315848U,	// <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  1685248172U,	// <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2759358645U,	// <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+  2218637102U,	// <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2724901370U,	// <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+  2758990032U,	// <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+  2659691830U,	// <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+  2659471458U,	// <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  2724901688U,	// <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+  1651159893U,	// <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+  1651823526U,	// <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+  2785827072U,	// <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+  2803964168U,	// <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+  2727556249U,	// <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+  2303656092U,	// <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2785827112U,	// <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+  2785827122U,	// <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+  2730210781U,	// <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+  2303659336U,	// <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303656097U,	// <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  202162278U,	// <0,u,u,0>: Cost 1 vspltisw0 LHS
+  72595246U,	// <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+  1146337160U,	// <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  835584U,	// <0,u,u,3>: Cost 0 copy LHS
+  1146337343U,	// <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  72595610U,	// <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+  1146337488U,	// <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221963080U,	// <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  835584U,	// <0,u,u,u>: Cost 0 copy LHS
+  2756853760U,	// <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+  1677803530U,	// <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+  3759497387U,	// <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+  2686419196U,	// <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+  2751766565U,	// <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+  2687746462U,	// <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+  3776086518U,	// <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+  2689073728U,	// <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+  1678319689U,	// <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+  2287091712U,	// <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+  1147568230U,	// <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+  1683112038U,	// <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  3294970108U,	// <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+  2623892790U,	// <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+  2647781007U,	// <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+  2791948430U,	// <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  3721524218U,	// <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+  1683112092U,	// <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2222112768U,	// <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+  1148371046U,	// <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  3356862524U,	// <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+  2702345894U,	// <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+  2222113106U,	// <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+  2299709908U,	// <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  3760162746U,	// <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+  3369470584U,	// <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+  1148371613U,	// <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  2686421142U,	// <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+  2283128486U,	// <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+  3296305326U,	// <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+  3760163199U,	// <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+  3760163330U,	// <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+  3779406377U,	// <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+  3865690416U,	// <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+  3366824568U,	// <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+  2707655452U,	// <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+  2734861202U,	// <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+  2756854098U,	// <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+  3830595931U,	// <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+  3296968960U,	// <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+  3830595949U,	// <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+  2686422326U,	// <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  3297378806U,	// <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+  3810594248U,	// <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+  2686422569U,	// <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2284470272U,	// <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+  2284471974U,	// <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+  3809267435U,	// <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+  3297968384U,	// <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+  2284471977U,	// <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+  3721555603U,	// <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+  3792679010U,	// <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+  3792679037U,	// <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+  2284471981U,	// <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+  3356893184U,	// <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+  2224676966U,	// <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+  3298295985U,	// <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+  3298345212U,	// <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+  2224972114U,	// <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  3808604907U,	// <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+  3799978808U,	// <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+  2726237006U,	// <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+  2224677522U,	// <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+  2726237176U,	// <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+  2285815462U,	// <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+  3805951193U,	// <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+  3807941859U,	// <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+  3799979366U,	// <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+  3803297165U,	// <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+  3799979540U,	// <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+  3799979628U,	// <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+  2731546240U,	// <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+  2284494848U,	// <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+  1683112594U,	// <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+  1683112605U,	// <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2734200772U,	// <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+  2757075629U,	// <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+  2686425242U,	// <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2791948430U,	// <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  2736855304U,	// <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+  1683112659U,	// <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1610694666U,	// <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+  1616003174U,	// <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+  2283767958U,	// <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+  3357507596U,	// <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+  2689745234U,	// <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+  3357507922U,	// <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+  3294397647U,	// <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+  3373433334U,	// <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+  1616003730U,	// <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+  1550221414U,	// <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,1,1>: Cost 1 vspltisw1 LHS
+  2287093910U,	// <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+  2287092615U,	// <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+  1550224694U,	// <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  2287092050U,	// <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+  2689746127U,	// <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+  2659800138U,	// <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+  269271142U,	// <1,1,1,u>: Cost 1 vspltisw1 LHS
+  2222113516U,	// <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+  2756854663U,	// <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+  1148371862U,	// <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689746598U,	// <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+  2618002742U,	// <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+  2299707730U,	// <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+  2689746874U,	// <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+  3361506511U,	// <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+  1148371862U,	// <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689747094U,	// <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+  2691074278U,	// <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+  3356870806U,	// <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+  2283126958U,	// <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+  2689747458U,	// <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+  3356868946U,	// <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+  3811265144U,	// <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+  3362841807U,	// <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+  2689747742U,	// <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+  2623987814U,	// <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+  2758181931U,	// <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+  2223408022U,	// <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  3697731734U,	// <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+  2283798784U,	// <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+  1616006454U,	// <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  3297379535U,	// <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+  3373466102U,	// <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+  1616006697U,	// <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2760762479U,	// <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+  2284470282U,	// <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+  2284472470U,	// <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+  3358212270U,	// <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+  2284470285U,	// <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+  1210728786U,	// <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2737524834U,	// <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+  3360867535U,	// <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+  1210728786U,	// <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  3697746022U,	// <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+  2756854991U,	// <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+  2737525242U,	// <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+  3839149281U,	// <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+  3697749302U,	// <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+  3356893522U,	// <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+  2283151537U,	// <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+  2791949566U,	// <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+  2792613127U,	// <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+  2737525754U,	// <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+  2291786386U,	// <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+  3365528292U,	// <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+  3365528455U,	// <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+  2737526118U,	// <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+  3365527890U,	// <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+  3365528377U,	// <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+  2291786959U,	// <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+  2737526402U,	// <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+  1550221414U,	// <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,u,1>: Cost 1 vspltisw1 LHS
+  1148371862U,	// <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689750972U,	// <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+  1550224694U,	// <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1616009370U,	// <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2689751248U,	// <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+  2736863497U,	// <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+  269271142U,	// <1,1,u,u>: Cost 1 vspltisw1 LHS
+  2702360576U,	// <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+  1628618854U,	// <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2685771949U,	// <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+  2283765862U,	// <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  2702360914U,	// <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+  3788046813U,	// <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+  2688426481U,	// <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+  2726249024U,	// <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1628619421U,	// <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2690417380U,	// <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+  2702361396U,	// <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+  2287093352U,	// <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+  1213349990U,	// <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  3764159522U,	// <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+  3295053672U,	// <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+  2221311930U,	// <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+  3799991593U,	// <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+  1213349995U,	// <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+  2624045158U,	// <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+  2702362144U,	// <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+  2283120232U,	// <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+  1225965670U,	// <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2624048438U,	// <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+  3356860763U,	// <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+  2222114746U,	// <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+  2299708632U,	// <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+  1225965675U,	// <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+  470597734U,	// <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544340276U,	// <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544341096U,	// <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544341916U,	// <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+  470601014U,	// <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592119300U,	// <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592119802U,	// <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592120314U,	// <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470603566U,	// <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708335471U,	// <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+  3838043908U,	// <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+  3357541992U,	// <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+  2283798630U,	// <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2726251728U,	// <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+  1628622134U,	// <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  3297077178U,	// <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+  2726251976U,	// <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1628622377U,	// <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  2714308168U,	// <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+  3297633827U,	// <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+  2284471912U,	// <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+  1210728550U,	// <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  3776106420U,	// <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+  2726252548U,	// <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+  2726252642U,	// <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+  3799994538U,	// <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+  1210728555U,	// <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720280865U,	// <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+  2702365096U,	// <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+  2726253050U,	// <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+  2283151462U,	// <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  3697823030U,	// <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+  3298715497U,	// <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+  2726253368U,	// <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+  2724926296U,	// <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+  2283151467U,	// <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652511738U,	// <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+  3371500916U,	// <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+  3365529192U,	// <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+  2291785830U,	// <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2726253926U,	// <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+  3788051845U,	// <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+  3794023894U,	// <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+  2726254119U,	// <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+  1657820802U,	// <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+  470638699U,	// <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1544381236U,	// <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544382056U,	// <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544382614U,	// <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  470641974U,	// <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1628625050U,	// <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  1592160762U,	// <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592161274U,	// <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470644526U,	// <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2769389708U,	// <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+  2685780070U,	// <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2685780142U,	// <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+  2686443775U,	// <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+  2769684656U,	// <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+  3357507940U,	// <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+  3759522294U,	// <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+  3357509562U,	// <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+  2685780637U,	// <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2287092630U,	// <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+  2221312230U,	// <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+  2691752839U,	// <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+  2287093362U,	// <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+  2287092634U,	// <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+  3360835107U,	// <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+  3759523041U,	// <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+  2287093690U,	// <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+  2287092638U,	// <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+  2222114966U,	// <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+  2222115057U,	// <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+  2630092320U,	// <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+  2685781670U,	// <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+  2222115330U,	// <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+  3373449572U,	// <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+  2222115448U,	// <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2299709370U,	// <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+  2222115614U,	// <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+  2771380607U,	// <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+  3356874468U,	// <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+  3759524168U,	// <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+  2283792796U,	// <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+  3356869530U,	// <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+  3721760428U,	// <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+  3296496248U,	// <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+  3356870586U,	// <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+  2771970503U,	// <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+  2772044240U,	// <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+  3362186135U,	// <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+  3297151280U,	// <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+  3357542002U,	// <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+  3357540626U,	// <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+  2685783350U,	// <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  3357546622U,	// <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+  3357542330U,	// <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+  2685783593U,	// <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2284471190U,	// <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+  3358213015U,	// <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+  2630116899U,	// <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+  2284471922U,	// <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+  2284471194U,	// <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+  2284471843U,	// <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+  3358218366U,	// <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+  2284472250U,	// <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+  2284471198U,	// <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+  2224752790U,	// <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+  3832736385U,	// <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+  3703866916U,	// <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+  3356894834U,	// <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+  3356894106U,	// <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+  3356894755U,	// <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+  3356899130U,	// <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+  2283153338U,	// <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2283153338U,	// <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2774035139U,	// <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+  3703874767U,	// <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+  3703875109U,	// <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+  3365529202U,	// <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+  3365528474U,	// <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+  3789387159U,	// <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+  3865692927U,	// <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+  3363538874U,	// <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+  2774625035U,	// <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+  2284495766U,	// <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+  2685785902U,	// <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2630141478U,	// <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+  2283169880U,	// <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+  2284495770U,	// <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+  2685786266U,	// <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2222115448U,	// <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2284496826U,	// <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+  2685786469U,	// <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2684461069U,	// <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+  2686451814U,	// <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  3759530159U,	// <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+  2686451968U,	// <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2684461394U,	// <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+  1701989266U,	// <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+  3776119286U,	// <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+  2689106500U,	// <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+  1702210477U,	// <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+  2221312914U,	// <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+  2691097399U,	// <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+  3760194454U,	// <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+  3766166489U,	// <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+  2334870736U,	// <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+  1147571510U,	// <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  3760194794U,	// <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+  3867315188U,	// <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+  1147571753U,	// <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2222115730U,	// <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+  2222115812U,	// <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  3760195176U,	// <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+  2702378662U,	// <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+  2323598544U,	// <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+  1148374326U,	// <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  3760195514U,	// <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+  3373451932U,	// <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+  1148374569U,	// <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2702379160U,	// <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+  3760195840U,	// <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+  3776121160U,	// <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+  3760195996U,	// <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+  2686454274U,	// <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+  3356870350U,	// <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+  3800009392U,	// <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+  3366824604U,	// <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+  2707688224U,	// <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+  2775731368U,	// <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+  3830820018U,	// <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+  3691980454U,	// <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+  3357541282U,	// <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+  2781039824U,	// <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+  2686455094U,	// <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  3357541528U,	// <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+  3810627020U,	// <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+  2686455337U,	// <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  2624217190U,	// <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+  2284470309U,	// <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+  2618246822U,	// <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+  3358212297U,	// <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+  2284470312U,	// <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+  2284470637U,	// <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+  1683115318U,	// <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3721851898U,	// <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+  1683115336U,	// <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3794039075U,	// <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+  3830820186U,	// <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+  3800011258U,	// <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+  3807973938U,	// <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+  3298716880U,	// <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+  2224680246U,	// <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  3800011576U,	// <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+  2726269774U,	// <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+  2224680489U,	// <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726269948U,	// <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+  3383444141U,	// <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+  3805983961U,	// <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+  3807974667U,	// <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+  2736887142U,	// <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+  3365528403U,	// <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+  3800012308U,	// <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+  3800012396U,	// <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+  2731579012U,	// <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+  2624241766U,	// <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+  2686457646U,	// <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  2618271398U,	// <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+  2734233544U,	// <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+  2689775679U,	// <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+  1152355638U,	// <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+  1683115561U,	// <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2736888076U,	// <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+  1683115579U,	// <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2687123456U,	// <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+  1613381734U,	// <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759538352U,	// <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+  3760865532U,	// <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+  1613381970U,	// <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2687787427U,	// <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+  2781777524U,	// <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+  3733828717U,	// <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+  1613382301U,	// <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2781040271U,	// <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+  2687124276U,	// <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+  2687124374U,	// <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+  3760866297U,	// <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+  2693096491U,	// <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+  2687124591U,	// <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+  2687124723U,	// <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+  3360834803U,	// <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+  2687124860U,	// <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+  2323598792U,	// <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+  2687125027U,	// <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+  2687125096U,	// <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+  2687125158U,	// <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+  2642185188U,	// <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+  2323598554U,	// <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+  2687125434U,	// <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+  3373450483U,	// <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+  2687125563U,	// <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+  2687125654U,	// <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+  2312990234U,	// <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+  3760867649U,	// <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+  2687125916U,	// <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+  2687126018U,	// <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+  3386731738U,	// <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+  3356871170U,	// <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+  3808643779U,	// <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+  2687126302U,	// <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+  2642198630U,	// <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+  2687126498U,	// <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+  3715941923U,	// <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+  3709970701U,	// <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+  2687126736U,	// <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+  1613385014U,	// <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2283801090U,	// <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  3733861489U,	// <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+  1613385257U,	// <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2624290918U,	// <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+  2624291676U,	// <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+  3698034211U,	// <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+  2284471211U,	// <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+  2624294198U,	// <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+  2284471132U,	// <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+  2284472834U,	// <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+  2284471539U,	// <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+  2284471216U,	// <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+  2785316900U,	// <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+  2781040691U,	// <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+  2734903802U,	// <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+  3848736834U,	// <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+  3298717620U,	// <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+  3298717700U,	// <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+  2734904120U,	// <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+  2781040738U,	// <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+  2781040747U,	// <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+  2734904314U,	// <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+  2315677210U,	// <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+  3808646292U,	// <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+  3808646371U,	// <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+  2734904678U,	// <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+  3389418714U,	// <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+  3365528656U,	// <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+  2734904940U,	// <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+  2734904962U,	// <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+  2687129299U,	// <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+  1613387566U,	// <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2687129480U,	// <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+  2687129532U,	// <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+  1661163546U,	// <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+  1613387930U,	// <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2687129808U,	// <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+  2781040900U,	// <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+  1613388133U,	// <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759546368U,	// <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+  2685804646U,	// <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2685804721U,	// <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+  3861270834U,	// <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+  3759546706U,	// <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+  2687795620U,	// <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+  2688459253U,	// <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+  2283769142U,	// <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  2685805213U,	// <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  3698073702U,	// <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+  3759547188U,	// <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+  2221314554U,	// <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+  3759547401U,	// <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+  3698076982U,	// <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+  3767510141U,	// <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+  2334872376U,	// <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+  1213353270U,	// <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  1213353271U,	// <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+  3704053862U,	// <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+  3759547961U,	// <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+  2222117370U,	// <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+  3759548070U,	// <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+  3704057142U,	// <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+  3373451057U,	// <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+  2685806522U,	// <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+  1225968950U,	// <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1225968951U,	// <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+  3759548566U,	// <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+  3842912793U,	// <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+  3759548774U,	// <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+  3759548828U,	// <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+  3759548930U,	// <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+  3809315421U,	// <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+  3386733368U,	// <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+  2283130166U,	// <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+  2283130167U,	// <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+  3704070246U,	// <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+  3862229608U,	// <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+  3704071741U,	// <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+  3721988610U,	// <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+  3704073526U,	// <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+  2685807926U,	// <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3865621141U,	// <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+  2283801910U,	// <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  2685808169U,	// <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3710050406U,	// <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+  3710051571U,	// <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+  3405989597U,	// <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+  3358214502U,	// <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+  3710053686U,	// <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+  3721998025U,	// <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+  2332250936U,	// <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+  1210731830U,	// <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210731831U,	// <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+  2791289597U,	// <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+  3698115430U,	// <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+  3698116538U,	// <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+  3356894132U,	// <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+  3698117942U,	// <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+  3722006218U,	// <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+  2781041464U,	// <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+  2283154742U,	// <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283154743U,	// <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+  1718211406U,	// <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+  2792026967U,	// <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+  2765411170U,	// <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+  3854783336U,	// <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+  2781041526U,	// <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+  3365528664U,	// <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+  2791953290U,	// <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+  2291789110U,	// <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1718801302U,	// <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+  1718875039U,	// <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+  2685810478U,	// <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2792764337U,	// <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+  3759552444U,	// <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+  2781041607U,	// <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+  2685810842U,	// <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  2689792208U,	// <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+  1210756406U,	// <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  1210756407U,	// <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+  2793280496U,	// <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+  2694439014U,	// <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  3393343912U,	// <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+  3397325306U,	// <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+  2793575444U,	// <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+  3722030797U,	// <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+  2688467446U,	// <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+  2689131079U,	// <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+  2694439570U,	// <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+  2654265354U,	// <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+  2794017866U,	// <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+  3768181639U,	// <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+  2334872058U,	// <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+  2654268726U,	// <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+  3792069797U,	// <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+  2694440143U,	// <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+  2334872386U,	// <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+  2695767409U,	// <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+  2654273638U,	// <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2222117973U,	// <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+  2299711912U,	// <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2654275734U,	// <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+  2654276918U,	// <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+  3385397675U,	// <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+  2654278056U,	// <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+  2323599627U,	// <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+  2654279470U,	// <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2795271395U,	// <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+  3768183059U,	// <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+  3728025254U,	// <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+  3768183196U,	// <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+  3768183298U,	// <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+  3792071255U,	// <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+  3780127361U,	// <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+  3847779617U,	// <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+  2795861291U,	// <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+  2795935028U,	// <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+  3728032975U,	// <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+  3839153480U,	// <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+  3397358074U,	// <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+  3854783835U,	// <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+  2694442294U,	// <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  3786100058U,	// <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+  3722065254U,	// <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+  2694442537U,	// <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654298214U,	// <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+  3854783893U,	// <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+  3710126010U,	// <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+  2332250618U,	// <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+  2654301494U,	// <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+  2284474795U,	// <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+  2718330931U,	// <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+  2332250946U,	// <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+  2719658197U,	// <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+  2332921954U,	// <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+  3768185254U,	// <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+  3710134202U,	// <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+  3710134561U,	// <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+  3710135606U,	// <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+  3864884745U,	// <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+  3854784017U,	// <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+  2791953940U,	// <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+  2792617501U,	// <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+  2797925927U,	// <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+  3365528426U,	// <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+  3728058022U,	// <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+  3365528509U,	// <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+  3854784079U,	// <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+  3722088148U,	// <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+  3728060845U,	// <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+  2781042284U,	// <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+  2798515823U,	// <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+  2654322705U,	// <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+  2694444846U,	// <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  2299711912U,	// <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2323649018U,	// <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+  2654326070U,	// <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+  2694445210U,	// <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654327214U,	// <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+  2323649346U,	// <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+  2694445413U,	// <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  1610752017U,	// <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+  1613406310U,	// <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  2685821107U,	// <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+  2283765916U,	// <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  1613406549U,	// <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+  1725880054U,	// <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+  2688475639U,	// <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+  2283769160U,	// <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  1613406877U,	// <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  1550221414U,	// <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,u,1,1>: Cost 1 vspltisw1 LHS
+  1683117870U,	// <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1213350044U,	// <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  1550224694U,	// <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1147574426U,	// <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2687149326U,	// <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+  1213353288U,	// <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  269271142U,	// <1,u,1,u>: Cost 1 vspltisw1 LHS
+  2222118611U,	// <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+  1148376878U,	// <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  1148371862U,	// <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  1225965724U,	// <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2222118975U,	// <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+  1148377242U,	// <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2687150010U,	// <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+  1225968968U,	// <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1148377445U,	// <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  471040156U,	// <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544782644U,	// <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544783464U,	// <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544784022U,	// <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471043382U,	// <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592561668U,	// <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592562170U,	// <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592562682U,	// <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  471045934U,	// <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708384629U,	// <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+  2687151101U,	// <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+  2223408022U,	// <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  2283798684U,	// <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2642422785U,	// <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+  1613409590U,	// <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2283801090U,	// <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  2283801928U,	// <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  1613409833U,	// <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2284471235U,	// <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+  2284472046U,	// <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+  2284472533U,	// <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+  1210728604U,	// <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2284471239U,	// <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+  1210728786U,	// <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  1683118234U,	// <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210731848U,	// <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210728609U,	// <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720330023U,	// <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+  2757376190U,	// <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+  2726302202U,	// <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+  2283151516U,	// <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  2224972114U,	// <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  2224683162U,	// <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726302520U,	// <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+  2283154760U,	// <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283151521U,	// <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652560896U,	// <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+  2333590225U,	// <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+  2765412628U,	// <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+  2291785884U,	// <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2781042984U,	// <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+  3365527953U,	// <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+  2791954748U,	// <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+  2291789128U,	// <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1657869960U,	// <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+  471081121U,	// <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  269271142U,	// <1,u,u,1>: Cost 1 vspltisw1 LHS
+  1544824424U,	// <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544824982U,	// <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471084342U,	// <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1613412506U,	// <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  1683118477U,	// <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210756424U,	// <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  471086894U,	// <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2226757632U,	// <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+  2226757734U,	// <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+  3826622483U,	// <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+  3843211292U,	// <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+  3300499794U,	// <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+  3356256724U,	// <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+  3825664056U,	// <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+  3762889289U,	// <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+  2226758301U,	// <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+  2227429386U,	// <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+  2227429478U,	// <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+  1691156582U,	// <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2666358997U,	// <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+  2227462482U,	// <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+  3722186464U,	// <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+  3867099278U,	// <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+  3366881912U,	// <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+  1691156636U,	// <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2228027392U,	// <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+  1154285670U,	// <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  2228027565U,	// <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+  3301769468U,	// <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+  2228027730U,	// <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+  3301769635U,	// <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+  3780806586U,	// <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+  3368880760U,	// <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+  1154286237U,	// <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+  1213440000U,	// <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1213441702U,	// <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2228535470U,	// <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+  2636515632U,	// <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+  2287182962U,	// <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+  2660405346U,	// <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+  2228535798U,	// <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660406420U,	// <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+  1213441709U,	// <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3368894464U,	// <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+  2764898642U,	// <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+  3826622811U,	// <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+  3843211620U,	// <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+  3838640493U,	// <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+  2732944694U,	// <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+  3797396857U,	// <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+  3867099528U,	// <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+  2764898705U,	// <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+  3364257792U,	// <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+  2230124646U,	// <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  3304235184U,	// <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+  3364260144U,	// <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+  3303817554U,	// <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+  3364260146U,	// <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+  3867099602U,	// <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+  3364260472U,	// <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+  2230125213U,	// <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2230796288U,	// <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+  1157054566U,	// <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  2230796465U,	// <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+  3304538364U,	// <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+  2230796626U,	// <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+  3797398205U,	// <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+  3304538614U,	// <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+  3798725471U,	// <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+  1157055133U,	// <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  3371573248U,	// <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+  2231189606U,	// <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+  3801380003U,	// <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+  3802043636U,	// <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+  3806688614U,	// <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+  3356317308U,	// <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+  3804034535U,	// <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+  3806688876U,	// <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+  2231190173U,	// <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+  1208836096U,	// <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1208837798U,	// <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  1691157149U,	// <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2636556597U,	// <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+  2282579625U,	// <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2660446306U,	// <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+  2228535798U,	// <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660447385U,	// <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+  1208837805U,	// <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3692388523U,	// <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+  2757526244U,	// <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+  2330290974U,	// <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+  3843212020U,	// <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+  3692391734U,	// <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+  3300533362U,	// <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+  3794084337U,	// <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+  3374170614U,	// <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+  2758042403U,	// <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+  2690482924U,	// <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+  2764899124U,	// <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+  2695791510U,	// <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+  3362235271U,	// <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+  3692399926U,	// <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+  3832226649U,	// <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+  3301205235U,	// <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+  3768870179U,	// <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+  2695791988U,	// <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+  2618663085U,	// <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+  2228028212U,	// <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+  2618664552U,	// <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+  2759000984U,	// <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+  2618666294U,	// <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+  2295136594U,	// <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+  3769534376U,	// <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+  2793358266U,	// <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+  2618668846U,	// <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+  2282536969U,	// <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208795146U,	// <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1213442198U,	// <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2287181998U,	// <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2618674486U,	// <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+  1208795474U,	// <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2287182001U,	// <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287183055U,	// <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208795153U,	// <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  3692421295U,	// <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+  3838641195U,	// <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+  2330323742U,	// <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+  3692423318U,	// <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+  3692424502U,	// <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+  2695793974U,	// <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3799395705U,	// <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+  3368895695U,	// <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+  2695794217U,	// <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3692429488U,	// <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+  3364257802U,	// <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+  3692431253U,	// <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+  3692431874U,	// <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+  3692432694U,	// <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+  3364258130U,	// <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+  3303875827U,	// <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+  3867100333U,	// <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+  3692435246U,	// <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+  2618695857U,	// <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+  2230797108U,	// <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+  2618697658U,	// <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+  3692439702U,	// <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+  2618699062U,	// <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+  3364929874U,	// <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+  3692442424U,	// <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+  3798733664U,	// <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+  2618701614U,	// <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+  3799397370U,	// <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+  3371573258U,	// <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+  2330351234U,	// <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  3799397658U,	// <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+  3799397734U,	// <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+  3371573586U,	// <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+  3799397870U,	// <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+  3799397956U,	// <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+  2330351234U,	// <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  2282577929U,	// <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208836106U,	// <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1208838294U,	// <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282578094U,	// <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282577933U,	// <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1208836434U,	// <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282578097U,	// <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287224015U,	// <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208836113U,	// <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  2226759117U,	// <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+  1624047718U,	// <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  2697789613U,	// <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+  2226767526U,	// <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+  2697789778U,	// <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+  3300657000U,	// <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+  2226988986U,	// <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+  3734271139U,	// <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+  1624048285U,	// <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  3831268868U,	// <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+  2293138804U,	// <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+  2697790358U,	// <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+  2293137510U,	// <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  3771532331U,	// <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+  3767551106U,	// <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+  3301173178U,	// <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+  3372853169U,	// <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+  2293137515U,	// <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+  1556938854U,	// <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2295137733U,	// <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+  336380006U,	// <2,2,2,2>: Cost 1 vspltisw2 LHS
+  1221394534U,	// <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+  1556942134U,	// <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2228029370U,	// <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+  2660545701U,	// <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+  336380006U,	// <2,2,2,u>: Cost 1 vspltisw2 LHS
+  2697791638U,	// <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+  2765489840U,	// <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+  1213441640U,	// <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135053414U,	// <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+  2697792002U,	// <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+  2330313780U,	// <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+  2287183549U,	// <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+  2660553894U,	// <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+  135053419U,	// <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2630697062U,	// <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+  3771534282U,	// <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+  2764900109U,	// <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+  2295152742U,	// <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+  2295154282U,	// <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+  1624050998U,	// <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  2229675962U,	// <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+  3368896433U,	// <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+  1624051241U,	// <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  3771534920U,	// <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+  3364258540U,	// <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+  2296489576U,	// <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+  2290516070U,	// <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  3771535284U,	// <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+  2290517044U,	// <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+  2697793634U,	// <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+  3370231729U,	// <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+  2290516075U,	// <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2230797801U,	// <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+  3304539679U,	// <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+  2764900273U,	// <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+  2764900282U,	// <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+  2230798129U,	// <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+  3304540008U,	// <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+  1157056442U,	// <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725000033U,	// <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+  1157056442U,	// <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2793359338U,	// <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+  3371574725U,	// <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+  2297833064U,	// <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+  2297831526U,	// <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  2697794918U,	// <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+  3371575053U,	// <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+  3304933297U,	// <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+  2297833393U,	// <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+  2297831531U,	// <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1556938854U,	// <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1624053550U,	// <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  336380006U,	// <2,2,u,2>: Cost 1 vspltisw2 LHS
+  135094374U,	// <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+  1556942134U,	// <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1624053914U,	// <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  1157056442U,	// <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2660594859U,	// <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+  135094379U,	// <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611448320U,	// <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537706598U,	// <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2689835181U,	// <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2689835260U,	// <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611448658U,	// <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2732966354U,	// <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2732966390U,	// <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660603052U,	// <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+  537707165U,	// <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689835748U,	// <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611449140U,	// <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611449238U,	// <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  3763577805U,	// <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+  2689836112U,	// <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689836143U,	// <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689836239U,	// <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  3366881210U,	// <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+  1616094588U,	// <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2689836493U,	// <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+  2685191711U,	// <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611449960U,	// <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611450022U,	// <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2689836822U,	// <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+  2689836904U,	// <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+  1611450298U,	// <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2295138234U,	// <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+  1611450456U,	// <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+  1213440918U,	// <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282538527U,	// <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+  1557022322U,	// <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+  1208796786U,	// <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1213440922U,	// <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282538531U,	// <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2287188094U,	// <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1213441978U,	// <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  1208796791U,	// <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+  1551056998U,	// <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+  1551057818U,	// <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+  2624800360U,	// <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+  2624800918U,	// <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+  1551060278U,	// <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+  537709878U,	// <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2732969337U,	// <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2660635824U,	// <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+  537710121U,	// <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689838664U,	// <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2732969615U,	// <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2732969707U,	// <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+  3763580721U,	// <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+  2689839028U,	// <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659228164U,	// <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659228258U,	// <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  3364259770U,	// <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+  1659228420U,	// <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2230798486U,	// <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+  2732970407U,	// <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1659228666U,	// <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2230798748U,	// <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+  2230798850U,	// <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+  2732970731U,	// <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659228984U,	// <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659229006U,	// <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1659229087U,	// <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+  1659229178U,	// <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2726999125U,	// <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+  2727662758U,	// <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+  2732971235U,	// <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+  1659229542U,	// <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2732971446U,	// <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2732971484U,	// <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+  1659229804U,	// <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659229826U,	// <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1208837014U,	// <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  537712430U,	// <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1616099205U,	// <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+  1208837746U,	// <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1208837018U,	// <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  537712794U,	// <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1616099536U,	// <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1208838074U,	// <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  537712997U,	// <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  3771547648U,	// <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+  2697805926U,	// <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+  3770884269U,	// <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+  3806716164U,	// <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+  3771547986U,	// <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+  2226761014U,	// <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3853462427U,	// <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+  3867102116U,	// <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+  2226761257U,	// <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3849186231U,	// <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+  3301207010U,	// <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+  3766240150U,	// <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+  3766240226U,	// <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+  3301207248U,	// <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+  2227432758U,	// <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+  3758941400U,	// <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+  3768894758U,	// <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+  2227433001U,	// <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+  2228030354U,	// <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+  3770885657U,	// <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+  2697807466U,	// <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+  3368880468U,	// <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+  2228030672U,	// <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+  1154288950U,	// <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  3771549617U,	// <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+  3368880796U,	// <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+  1154289193U,	// <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+  2636808294U,	// <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+  2287181861U,	// <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+  2228866102U,	// <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+  2636810580U,	// <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+  1256574160U,	// <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1213441742U,	// <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2228866430U,	// <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+  2660701368U,	// <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+  1213441745U,	// <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3704586342U,	// <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+  3782831051U,	// <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+  3704587900U,	// <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+  3368896123U,	// <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+  2793360592U,	// <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+  2697809206U,	// <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  3303198078U,	// <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+  3867102444U,	// <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+  2697809449U,	// <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  2630852710U,	// <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+  2624881572U,	// <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+  2630854269U,	// <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+  2666686677U,	// <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+  2630855990U,	// <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+  2230127926U,	// <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+  1691159862U,	// <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  3867102520U,	// <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+  1691159880U,	// <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230799250U,	// <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+  3304541130U,	// <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+  2230799417U,	// <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+  3304541323U,	// <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+  2230799568U,	// <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+  1157057846U,	// <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3304541566U,	// <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+  3798758243U,	// <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+  1157058089U,	// <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3806721018U,	// <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+  3853831590U,	// <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+  3801412775U,	// <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+  3802076408U,	// <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+  3401436368U,	// <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+  2793360840U,	// <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+  3804067307U,	// <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+  3867102682U,	// <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+  2793360867U,	// <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+  2630877286U,	// <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+  2282580144U,	// <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2630878848U,	// <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+  2636851545U,	// <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+  1256615120U,	// <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1208837838U,	// <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  1691160105U,	// <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2660742333U,	// <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+  1208837841U,	// <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3766910976U,	// <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+  2693169254U,	// <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3760939181U,	// <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+  3843214936U,	// <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+  3760939355U,	// <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+  3867102827U,	// <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+  3867102836U,	// <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+  3867102844U,	// <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+  2693169821U,	// <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3766911724U,	// <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+  3766911796U,	// <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+  2693170070U,	// <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+  3384798262U,	// <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+  2693170228U,	// <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+  3301208068U,	// <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+  3366879607U,	// <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+  3867102925U,	// <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+  2695824760U,	// <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+  2642845798U,	// <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+  2295139218U,	// <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+  2699142760U,	// <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+  3766912678U,	// <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+  2699142925U,	// <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+  2228031492U,	// <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+  2295138818U,	// <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+  3368879347U,	// <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+  2295138820U,	// <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+  2287184866U,	// <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256573842U,	// <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642855630U,	// <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+  2287182763U,	// <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287184870U,	// <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256574170U,	// <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1213442562U,	// <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287183091U,	// <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1213442564U,	// <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3716604006U,	// <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+  3716604822U,	// <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+  3766914099U,	// <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+  3368895403U,	// <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+  3716607031U,	// <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+  2693172534U,	// <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3363588610U,	// <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+  3368895731U,	// <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+  2693172777U,	// <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3704668262U,	// <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+  3704669078U,	// <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+  3704669830U,	// <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+  3364259460U,	// <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+  3704671542U,	// <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+  2793361412U,	// <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  3364258167U,	// <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+  3867103249U,	// <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+  2793361412U,	// <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  2642878566U,	// <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+  3386166810U,	// <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+  2723033594U,	// <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+  3848523842U,	// <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+  2723033713U,	// <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+  2230800388U,	// <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+  2230800482U,	// <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+  2785841252U,	// <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+  2785914989U,	// <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+  3796775930U,	// <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+  3800757335U,	// <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+  3853463689U,	// <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+  3796776218U,	// <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+  3796776294U,	// <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+  3803411867U,	// <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+  3371575081U,	// <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+  3796776516U,	// <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+  3371575083U,	// <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+  2287225826U,	// <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256614802U,	// <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642896590U,	// <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+  2287223723U,	// <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287225830U,	// <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256615130U,	// <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1208838658U,	// <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287224051U,	// <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1208838660U,	// <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3772227584U,	// <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+  2698485862U,	// <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3759620282U,	// <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+  3710675299U,	// <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+  3767583058U,	// <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+  3378153265U,	// <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+  3865186637U,	// <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+  2330291510U,	// <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+  2698486429U,	// <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3734569062U,	// <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+  3764929346U,	// <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+  3772228502U,	// <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+  3734571158U,	// <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+  3734572342U,	// <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+  3767583878U,	// <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+  3768247511U,	// <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+  2293140790U,	// <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  2293140791U,	// <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+  3704717414U,	// <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+  3395424589U,	// <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+  2228031993U,	// <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+  2698487485U,	// <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+  3704720694U,	// <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+  3773556575U,	// <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+  2698487738U,	// <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+  1221397814U,	// <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  1221397815U,	// <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+  2636955750U,	// <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+  2330314217U,	// <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2636957626U,	// <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+  2287184230U,	// <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636959030U,	// <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+  2648903448U,	// <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+  1256575800U,	// <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135056694U,	// <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135056695U,	// <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+  3710705766U,	// <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+  3698762677U,	// <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+  3710707389U,	// <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+  3710708071U,	// <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+  3710709046U,	// <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+  2698489142U,	// <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  3796782457U,	// <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+  2295156022U,	// <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  2295156023U,	// <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+  3303870753U,	// <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+  3788820134U,	// <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+  3779530520U,	// <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+  3303871026U,	// <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+  3303871117U,	// <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+  3791474666U,	// <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+  3792138299U,	// <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+  2290519350U,	// <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2290519351U,	// <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2631008358U,	// <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+  3372893673U,	// <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+  2791445264U,	// <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+  2230800968U,	// <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+  2631011638U,	// <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+  3372894001U,	// <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+  2793362232U,	// <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+  2295835958U,	// <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2295835959U,	// <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2793362254U,	// <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+  2792035160U,	// <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+  2792108897U,	// <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+  2769474408U,	// <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+  2793362294U,	// <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+  3371575089U,	// <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+  2792403845U,	// <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+  2297834806U,	// <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2297834807U,	// <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2636996710U,	// <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+  2698491694U,	// <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  2636998631U,	// <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+  2282580326U,	// <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636999990U,	// <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+  2698492058U,	// <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  1256616760U,	// <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135097654U,	// <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135097655U,	// <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+  2666864742U,	// <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+  1719620602U,	// <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+  3768254637U,	// <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+  3393417722U,	// <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+  2666868022U,	// <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+  3867104290U,	// <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+  3728667127U,	// <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+  2666869817U,	// <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+  1720136761U,	// <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+  3728670822U,	// <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+  3774227252U,	// <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+  3774227350U,	// <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+  2323001850U,	// <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  3728674102U,	// <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+  3774227567U,	// <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+  2694513880U,	// <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+  3396744002U,	// <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+  2323001850U,	// <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  2654937190U,	// <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+  3728679732U,	// <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+  2700486248U,	// <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+  2321682938U,	// <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+  2654940470U,	// <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+  3859584196U,	// <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+  2700486577U,	// <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+  2228033132U,	// <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+  2701813843U,	// <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+  1581203558U,	// <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  2654946100U,	// <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+  2637031354U,	// <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+  1256575482U,	// <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581206838U,	// <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+  2654949380U,	// <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+  1581208058U,	// <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+  1256575810U,	// <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581209390U,	// <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  3728695398U,	// <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+  3869758782U,	// <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+  3728696936U,	// <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+  3393450490U,	// <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+  3728698678U,	// <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+  2700487990U,	// <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3728699899U,	// <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+  3867104626U,	// <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+  2700488233U,	// <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3855160709U,	// <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+  3728704406U,	// <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+  3370233956U,	// <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+  2320380410U,	// <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  3728706870U,	// <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+  3867104694U,	// <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+  3792146492U,	// <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+  3394122562U,	// <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+  2320380410U,	// <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  2230801402U,	// <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+  3768258984U,	// <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+  2730349050U,	// <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  3372894575U,	// <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+  2230801766U,	// <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+  3304543670U,	// <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+  3728716285U,	// <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+  2230802028U,	// <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+  2730349050U,	// <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  2793362983U,	// <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+  3728721112U,	// <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+  3371574933U,	// <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+  2327695866U,	// <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+  3728723254U,	// <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+  3371574855U,	// <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+  2730350062U,	// <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+  2793363052U,	// <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+  2798671471U,	// <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+  1581244518U,	// <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1724929666U,	// <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+  2637072314U,	// <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+  1256616442U,	// <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581247798U,	// <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+  2700490906U,	// <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  1581249023U,	// <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+  1256616770U,	// <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581250350U,	// <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1611489280U,	// <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537747563U,	// <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685231277U,	// <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685231356U,	// <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611489618U,	// <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2226763930U,	// <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  2733007350U,	// <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660971737U,	// <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+  537748125U,	// <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689876708U,	// <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611490100U,	// <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611490198U,	// <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  2293137564U,	// <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  2689877072U,	// <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689877103U,	// <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689877199U,	// <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2293140808U,	// <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  1616135548U,	// <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  1556938854U,	// <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1154291502U,	// <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  336380006U,	// <2,u,2,2>: Cost 1 vspltisw2 LHS
+  1611490982U,	// <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  1556942134U,	// <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1154291866U,	// <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  1611491258U,	// <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1221397832U,	// <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  336380006U,	// <2,u,2,u>: Cost 1 vspltisw2 LHS
+  1611491478U,	// <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+  1213440073U,	// <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+  1213442261U,	// <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135053468U,	// <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+  1611491842U,	// <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+  1213440401U,	// <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+  1213442589U,	// <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135056712U,	// <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135053473U,	// <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1551425638U,	// <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+  1551426503U,	// <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+  2625169000U,	// <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+  2625169558U,	// <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+  1551428918U,	// <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+  537750838U,	// <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2733010297U,	// <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2295156040U,	// <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  537751081U,	// <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689879624U,	// <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2230130478U,	// <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2631149217U,	// <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+  2290516124U,	// <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2689879988U,	// <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659269124U,	// <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1691162778U,	// <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2290519368U,	// <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  1691162796U,	// <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230802131U,	// <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+  1157060398U,	// <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659269626U,	// <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2764904656U,	// <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+  2230802495U,	// <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+  1157060762U,	// <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  1659269944U,	// <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659269966U,	// <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1157060965U,	// <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659270138U,	// <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2727040090U,	// <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+  2727703723U,	// <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+  2297831580U,	// <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1659270502U,	// <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2733012406U,	// <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2730358255U,	// <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+  1659270764U,	// <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659270786U,	// <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1213481923U,	// <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+  537753390U,	// <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  336380006U,	// <2,u,u,2>: Cost 1 vspltisw2 LHS
+  135094428U,	// <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+  1213481927U,	// <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+  537753754U,	// <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1208838685U,	// <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135097672U,	// <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+  135094433U,	// <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+  1678557184U,	// <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1678557194U,	// <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2631181989U,	// <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+  2289223984U,	// <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+  2756943909U,	// <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+  3362965729U,	// <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+  3362966054U,	// <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+  2289224312U,	// <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+  1683202121U,	// <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+  1557446758U,	// <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+  2752741467U,	// <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+  604815462U,	// <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2631190676U,	// <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+  1557450038U,	// <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+  2667024388U,	// <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+  2800074894U,	// <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+  2661053667U,	// <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+  604815516U,	// <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696521165U,	// <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+  2752741549U,	// <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+  2691876456U,	// <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+  2691876518U,	// <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+  3830685895U,	// <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+  3765618536U,	// <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+  2691876794U,	// <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+  2701166596U,	// <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+  2756944108U,	// <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+  2691877014U,	// <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+  1161003110U,	// <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2691877168U,	// <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+  2691877246U,	// <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+  2691877378U,	// <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+  3765619238U,	// <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+  2691877496U,	// <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  3368962680U,	// <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+  1161003677U,	// <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2289254400U,	// <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+  1678557522U,	// <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2631214761U,	// <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+  2235580672U,	// <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  2756944237U,	// <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+  1618136374U,	// <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  3309322742U,	// <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+  3362998904U,	// <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+  1683202449U,	// <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  3765620296U,	// <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+  2752299427U,	// <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+  3789508346U,	// <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+  3403486842U,	// <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+  3765620660U,	// <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+  2733682692U,	// <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+  2800075218U,	// <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+  3873817044U,	// <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+  2800075234U,	// <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+  2752299501U,	// <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+  2236547174U,	// <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2733683194U,	// <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+  3844473352U,	// <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+  3310289234U,	// <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+  3873817114U,	// <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+  2733683512U,	// <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+  2725057384U,	// <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+  2236547741U,	// <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2297905152U,	// <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+  2297906854U,	// <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+  2727711916U,	// <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+  3371649328U,	// <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+  2733684070U,	// <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+  3734843490U,	// <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+  3798799895U,	// <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+  2733684332U,	// <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+  2297906861U,	// <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+  1557504102U,	// <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+  1678557842U,	// <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+  604816029U,	// <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2691880892U,	// <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+  1557507382U,	// <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+  1618139290U,	// <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  2691881168U,	// <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+  2661111018U,	// <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+  604816083U,	// <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2619310332U,	// <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+  2756944612U,	// <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+  2289221724U,	// <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+  2619312278U,	// <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+  2619313462U,	// <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+  2289221970U,	// <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+  2232599768U,	// <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+  3362964687U,	// <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+  2619316014U,	// <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+  2756944683U,	// <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+  1678558004U,	// <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2691883927U,	// <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+  3826631496U,	// <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+  2756944723U,	// <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+  2756944732U,	// <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  3830686561U,	// <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+  3734869228U,	// <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+  1678558004U,	// <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2696529358U,	// <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+  2756944775U,	// <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  2294548630U,	// <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+  1678558102U,	// <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+  2631273782U,	// <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+  2756944811U,	// <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  3830686644U,	// <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+  2800075706U,	// <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+  1679000515U,	// <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+  2619334911U,	// <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+  2295218186U,	// <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+  2293229718U,	// <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+  2619337116U,	// <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+  2619338038U,	// <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+  2295218514U,	// <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+  3830686729U,	// <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+  3368961231U,	// <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+  2619340590U,	// <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+  2619343104U,	// <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+  2289254410U,	// <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+  2289256598U,	// <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+  2619345410U,	// <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+  2619346230U,	// <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+  2756944976U,	// <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+  3362996401U,	// <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+  3362997455U,	// <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+  2619348782U,	// <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+  2756945007U,	// <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+  3830686840U,	// <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+  3358361750U,	// <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+  3830686857U,	// <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+  2756945047U,	// <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+  2294571346U,	// <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+  3806105698U,	// <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+  3873817774U,	// <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+  2756945079U,	// <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+  3830686912U,	// <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+  2756945103U,	// <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2236547990U,	// <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+  3826631905U,	// <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+  3830686952U,	// <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+  2756945139U,	// <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  3830686972U,	// <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+  2800076030U,	// <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+  2756945166U,	// <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+  3699081318U,	// <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+  2297905162U,	// <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+  2297907350U,	// <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+  3365675182U,	// <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+  3699084598U,	// <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+  2297905490U,	// <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+  2297905329U,	// <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  3368330447U,	// <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+  2297905169U,	// <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+  2619375876U,	// <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+  1678558004U,	// <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2289289366U,	// <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+  1679000956U,	// <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+  2619378998U,	// <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+  2756945297U,	// <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+  2297905329U,	// <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  2800076192U,	// <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+  1683203497U,	// <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+  3362964203U,	// <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+  2289222380U,	// <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+  2289222462U,	// <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+  1215479910U,	// <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3362964207U,	// <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+  2289222708U,	// <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2232600506U,	// <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+  3396142296U,	// <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+  1215479915U,	// <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3699105894U,	// <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+  3765633844U,	// <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+  2691892120U,	// <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+  2752300575U,	// <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+  3699109174U,	// <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+  3830687280U,	// <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+  3830687289U,	// <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+  3874260548U,	// <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+  2752742988U,	// <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+  2631344230U,	// <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+  2697201184U,	// <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+  1678558824U,	// <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678558834U,	// <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  2631347510U,	// <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+  3368953613U,	// <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+  2234304442U,	// <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+  3368953777U,	// <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+  1679001247U,	// <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+  1678558886U,	// <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+  2752300719U,	// <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+  2752300729U,	// <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+  1221476454U,	// <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+  1678558926U,	// <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+  2800076503U,	// <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+  2234746810U,	// <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+  2800076516U,	// <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+  1678558958U,	// <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+  3699130470U,	// <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+  3362996972U,	// <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+  2289256040U,	// <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+  1215512678U,	// <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3362998676U,	// <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+  2691894582U,	// <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2235582394U,	// <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+  3734967544U,	// <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+  1215512683U,	// <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3705110630U,	// <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+  3368313985U,	// <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+  3368314472U,	// <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+  2756945768U,	// <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+  3705113910U,	// <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+  3310061416U,	// <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+  3310135226U,	// <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+  3370305457U,	// <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+  2752743317U,	// <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+  2631376998U,	// <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+  3705119540U,	// <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+  2631378621U,	// <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+  1678559162U,	// <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2631380278U,	// <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+  3370976956U,	// <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+  2237065146U,	// <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+  3798815594U,	// <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+  1679001575U,	// <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  2800076778U,	// <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+  3371647724U,	// <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+  2297906792U,	// <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+  1224163430U,	// <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  3705130294U,	// <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+  3371648052U,	// <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+  2297906877U,	// <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+  3371648702U,	// <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+  1224163435U,	// <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1679001659U,	// <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+  2752743492U,	// <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+  1678558824U,	// <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678559320U,	// <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+  1679001699U,	// <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+  2691897498U,	// <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2237908922U,	// <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+  2800519289U,	// <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+  1679001731U,	// <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+  1215480726U,	// <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1678559382U,	// <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+  2631403200U,	// <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+  2289223282U,	// <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+  2752301232U,	// <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+  3362965027U,	// <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+  3362965352U,	// <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+  2289223610U,	// <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+  1678559445U,	// <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+  3830687964U,	// <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+  2752301286U,	// <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+  2752301297U,	// <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+  2305157532U,	// <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+  3830688000U,	// <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+  3830688009U,	// <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+  3830688019U,	// <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+  3362973626U,	// <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+  2752743719U,	// <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+  2631417958U,	// <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+  3826043193U,	// <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+  1624131186U,	// <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+  2752301384U,	// <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+  2631421238U,	// <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+  3826485602U,	// <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+  2752301414U,	// <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+  2771249519U,	// <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+  1628112984U,	// <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+  1563656294U,	// <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+  2301855911U,	// <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+  2697873730U,	// <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+  403488870U,	// <3,3,3,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  2301856239U,	// <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+  2697874067U,	// <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+  2295220154U,	// <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+  403488870U,	// <3,3,3,u>: Cost 1 vspltisw3 LHS
+  2289255318U,	// <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+  2631435162U,	// <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+  2631435972U,	// <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+  2289256050U,	// <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+  1215513498U,	// <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679002114U,	// <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+  3362998120U,	// <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+  2289256378U,	// <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+  1679002141U,	// <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+  3831130657U,	// <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+  3376277671U,	// <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+  3771617012U,	// <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+  2302536092U,	// <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+  3831130697U,	// <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+  2294572579U,	// <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+  2800519773U,	// <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+  3368314810U,	// <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+  2800519791U,	// <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+  2800077432U,	// <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+  3310291185U,	// <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+  2789165706U,	// <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+  2764982931U,	// <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+  2800077468U,	// <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+  3873819301U,	// <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+  2297235304U,	// <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+  2725081963U,	// <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+  2725745596U,	// <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+  2631458918U,	// <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+  3705201460U,	// <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+  2631460551U,	// <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+  2297906802U,	// <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+  2631462198U,	// <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+  3371648547U,	// <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+  3371648548U,	// <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+  1224165306U,	// <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1224165306U,	// <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1215480726U,	// <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1679002398U,	// <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+  1659967368U,	// <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+  403488870U,	// <3,3,u,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  1679002438U,	// <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+  2756946764U,	// <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+  1224165306U,	// <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  403488870U,	// <3,3,u,u>: Cost 1 vspltisw3 LHS
+  2691907584U,	// <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+  1618165862U,	// <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631476937U,	// <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+  2232601732U,	// <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+  2691907922U,	// <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+  1158860086U,	// <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  3306343806U,	// <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+  3366947484U,	// <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+  1618166429U,	// <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631483494U,	// <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+  2691908404U,	// <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+  1618166682U,	// <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+  3765650393U,	// <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+  2631486774U,	// <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+  2756946914U,	// <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+  3765650639U,	// <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+  3735090439U,	// <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+  1622148480U,	// <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+  3765650893U,	// <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+  3831131154U,	// <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+  2691909224U,	// <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+  2691909286U,	// <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+  2699208469U,	// <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+  2233863478U,	// <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+  2691909562U,	// <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+  2701199368U,	// <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+  2691909691U,	// <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+  2691909782U,	// <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+  3765651686U,	// <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+  2691909972U,	// <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+  2691910044U,	// <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+  2691910096U,	// <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+  1161006390U,	// <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691910300U,	// <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  3368962716U,	// <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+  1161006633U,	// <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2631508070U,	// <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+  2631508890U,	// <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+  2631509709U,	// <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+  2289256788U,	// <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+  1726336208U,	// <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+  1618169142U,	// <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  3362998858U,	// <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+  2289257116U,	// <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+  1618169385U,	// <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  1557774438U,	// <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+  2631516980U,	// <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+  1557776078U,	// <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+  2631518358U,	// <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+  1557777718U,	// <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+  2296563406U,	// <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+  604818742U,	// <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2661381387U,	// <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+  604818760U,	// <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  3705266278U,	// <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+  3831131482U,	// <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+  2733715962U,	// <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+  3844771180U,	// <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+  2800078197U,	// <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+  2236550454U,	// <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716280U,	// <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+  2725090156U,	// <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+  2236550697U,	// <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716474U,	// <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+  3371647013U,	// <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+  2727744688U,	// <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+  3371649364U,	// <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+  2733716838U,	// <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+  2297906894U,	// <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+  3371647180U,	// <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+  2733717100U,	// <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+  2297906897U,	// <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+  1557799014U,	// <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+  1618171694U,	// <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  1557800657U,	// <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+  2691913660U,	// <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+  1557802294U,	// <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+  1618172058U,	// <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  604818985U,	// <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2661405966U,	// <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+  604819003U,	// <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2643492966U,	// <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+  2756947528U,	// <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+  2331029019U,	// <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2643495062U,	// <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+  2756947554U,	// <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+  2800078443U,	// <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+  2289224194U,	// <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+  3362964723U,	// <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+  2756947590U,	// <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+  2800078479U,	// <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+  2333027218U,	// <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+  2691916699U,	// <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+  3832901294U,	// <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+  2800078519U,	// <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+  3830689467U,	// <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+  3830689481U,	// <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+  3873820365U,	// <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+  2800078551U,	// <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+  3770967487U,	// <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+  2697225763U,	// <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+  3830689523U,	// <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+  2699216590U,	// <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+  2699216662U,	// <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+  2783047439U,	// <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+  2783121176U,	// <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+  3856936737U,	// <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+  2701871194U,	// <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+  2643517542U,	// <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+  2331052946U,	// <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+  3699345010U,	// <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+  2705189276U,	// <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+  2705189359U,	// <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+  2331053274U,	// <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+  2295220738U,	// <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+  3368961267U,	// <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+  2295220740U,	// <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+  2643525734U,	// <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+  2331061138U,	// <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+  2235584280U,	// <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+  2643528194U,	// <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+  2735713498U,	// <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+  2756947892U,	// <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+  2289256962U,	// <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+  3362997491U,	// <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+  2756947919U,	// <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+  2800078803U,	// <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+  2800078812U,	// <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+  2631591639U,	// <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+  3832901616U,	// <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+  2800078843U,	// <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+  1726337028U,	// <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078862U,	// <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+  3368314099U,	// <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+  1726337028U,	// <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078884U,	// <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+  2800078899U,	// <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+  2631599832U,	// <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+  2800078914U,	// <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+  2800078924U,	// <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+  2800078935U,	// <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+  2297235970U,	// <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+  1726337122U,	// <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+  1726337131U,	// <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+  3699376230U,	// <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+  2333739922U,	// <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+  3699378106U,	// <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+  3371647915U,	// <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+  3699379510U,	// <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+  2333740250U,	// <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+  2297907714U,	// <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+  3370984691U,	// <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+  2297907716U,	// <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+  2800079046U,	// <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+  2756948176U,	// <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+  2331029019U,	// <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2800079076U,	// <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+  2800079085U,	// <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+  1726337028U,	// <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2289289730U,	// <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+  1726337284U,	// <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+  1726337293U,	// <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+  3773628416U,	// <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+  2699886694U,	// <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789167401U,	// <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+  3362965862U,	// <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+  3773628754U,	// <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+  3723284326U,	// <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+  2800079181U,	// <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+  1215483190U,	// <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1215483191U,	// <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+  3873821032U,	// <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+  3773629236U,	// <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+  2691924892U,	// <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+  3830690184U,	// <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+  3873821072U,	// <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+  3873821082U,	// <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+  3403453240U,	// <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+  2289233206U,	// <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2289233207U,	// <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2661498982U,	// <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+  3770975780U,	// <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+  2631640797U,	// <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+  3771639485U,	// <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+  2661502262U,	// <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+  2699888488U,	// <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+  2661503482U,	// <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+  1715425786U,	// <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+  1715499523U,	// <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+  3773630614U,	// <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+  3372942825U,	// <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+  2234749434U,	// <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+  3368962406U,	// <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+  2699889154U,	// <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+  3773631068U,	// <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+  2331054904U,	// <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+  1221479734U,	// <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  1221479735U,	// <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+  2235584801U,	// <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+  3717342106U,	// <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+  2789167729U,	// <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+  2235585074U,	// <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+  2235585165U,	// <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+  2699889974U,	// <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  2800079509U,	// <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+  1215515958U,	// <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1215515959U,	// <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+  3873821356U,	// <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+  3372959209U,	// <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+  3862909629U,	// <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+  3773632358U,	// <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+  3873821396U,	// <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+  3873821405U,	// <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+  3862909672U,	// <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+  2294574390U,	// <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2294574391U,	// <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2800079613U,	// <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+  3873821446U,	// <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+  2789167888U,	// <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+  3844920090U,	// <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+  2800079653U,	// <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+  3723333484U,	// <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+  1726337848U,	// <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726337858U,	// <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+  1726337867U,	// <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+  1726337870U,	// <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+  2297906665U,	// <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+  2792117090U,	// <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+  2297907558U,	// <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+  1726337910U,	// <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+  2297906993U,	// <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+  2297906832U,	// <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+  1224166710U,	// <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224166711U,	// <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1726337951U,	// <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+  2699892526U,	// <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789168049U,	// <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+  2792854460U,	// <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+  1726337991U,	// <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+  2699892890U,	// <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  1726337848U,	// <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1215548726U,	// <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  1215548727U,	// <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+  2700558336U,	// <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+  1626816614U,	// <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700558513U,	// <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+  2331030010U,	// <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+  2700558674U,	// <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+  2800079906U,	// <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+  2655588936U,	// <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+  2800079919U,	// <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+  1626817181U,	// <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  3774300899U,	// <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+  2700559156U,	// <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+  2700559254U,	// <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+  3774301148U,	// <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+  3774301227U,	// <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+  3774301295U,	// <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+  3768329441U,	// <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+  3403453250U,	// <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+  2700559740U,	// <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+  2700559849U,	// <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+  3770983973U,	// <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+  2700559976U,	// <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+  2698569415U,	// <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+  2700560177U,	// <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+  3773638505U,	// <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+  1626818490U,	// <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+  2795140307U,	// <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+  1628145756U,	// <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+  2700560534U,	// <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+  3774302438U,	// <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+  2700560742U,	// <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+  2700560796U,	// <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+  2700560898U,	// <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+  3774302821U,	// <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+  2700561079U,	// <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+  2700561091U,	// <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+  2700561182U,	// <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+  2655617126U,	// <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+  3774303178U,	// <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+  2655619002U,	// <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+  2331062778U,	// <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+  2655620406U,	// <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+  1626819894U,	// <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  2655621708U,	// <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+  2800080247U,	// <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+  1626820137U,	// <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  3774303816U,	// <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+  3873822093U,	// <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+  3774303998U,	// <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+  3862910368U,	// <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+  3774304180U,	// <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+  2800080310U,	// <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+  2800080321U,	// <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+  3873822147U,	// <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+  2800080339U,	// <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+  2800080348U,	// <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+  3873822181U,	// <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+  2789168622U,	// <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+  2700563016U,	// <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+  2800080384U,	// <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+  3862910472U,	// <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+  2700563256U,	// <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+  2800080404U,	// <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+  2793149988U,	// <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+  2637725798U,	// <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+  3371649227U,	// <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+  2637727674U,	// <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+  2297907567U,	// <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+  2637729078U,	// <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+  3371649312U,	// <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+  2655646287U,	// <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+  1726338668U,	// <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1726338668U,	// <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  2700564179U,	// <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+  1626822446U,	// <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700564357U,	// <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+  2700564412U,	// <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+  2700564543U,	// <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+  1626822810U,	// <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  1662654672U,	// <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+  1726338668U,	// <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1626823013U,	// <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  1678557184U,	// <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1679005395U,	// <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+  2289221787U,	// <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+  1215479964U,	// <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  2752747245U,	// <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+  1158863002U,	// <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  2289224221U,	// <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+  1215483208U,	// <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1679005458U,	// <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+  1558036582U,	// <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+  1678558004U,	// <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  604821294U,	// <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2752747317U,	// <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+  1558039862U,	// <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+  2756949830U,	// <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+  2800080726U,	// <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+  2289233224U,	// <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  604821348U,	// <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696586709U,	// <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+  2757392246U,	// <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+  1624172151U,	// <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+  1679005576U,	// <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+  2631789878U,	// <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+  2699904874U,	// <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+  1626826683U,	// <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+  1726338988U,	// <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+  1683208117U,	// <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+  1679005628U,	// <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+  1161008942U,	// <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2752747471U,	// <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+  403488870U,	// <3,u,3,3>: Cost 1 vspltisw3 LHS
+  1679005668U,	// <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+  1161009306U,	// <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691943104U,	// <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+  1221479752U,	// <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  403488870U,	// <3,u,3,u>: Cost 1 vspltisw3 LHS
+  2289255363U,	// <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+  1161844526U,	// <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+  2289256661U,	// <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+  1215512732U,	// <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  1215513498U,	// <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679005759U,	// <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+  2289256989U,	// <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+  1215515976U,	// <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1679005786U,	// <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+  1558069350U,	// <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+  2631811892U,	// <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+  1558071026U,	// <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+  2752747646U,	// <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+  1558072630U,	// <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+  1726337028U,	// <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  604821658U,	// <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2294574408U,	// <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  604821676U,	// <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2631819366U,	// <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+  2757392574U,	// <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+  2631821043U,	// <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+  1679005904U,	// <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  2631822646U,	// <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+  2236553370U,	// <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  1726337848U,	// <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726339309U,	// <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+  1683208445U,	// <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+  1726339328U,	// <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+  2297905225U,	// <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+  2631829236U,	// <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+  1224163484U,	// <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1726339368U,	// <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+  2297905553U,	// <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+  2297905392U,	// <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+  1224166728U,	// <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224163489U,	// <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1683208529U,	// <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+  1679006043U,	// <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+  604821861U,	// <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  403488870U,	// <3,u,u,3>: Cost 1 vspltisw3 LHS
+  1683208569U,	// <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+  1679006083U,	// <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+  604821901U,	// <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  1215548744U,	// <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  604821915U,	// <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2759016448U,	// <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+  1165115494U,	// <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+  3717531337U,	// <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+  3369675785U,	// <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+  2751791144U,	// <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+  2238857630U,	// <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+  3312591341U,	// <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+  3369676113U,	// <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+  1165116061U,	// <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+  2637824102U,	// <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+  2637824922U,	// <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+  1685274726U,	// <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637826512U,	// <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+  2637827382U,	// <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+  2661716070U,	// <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+  3729486427U,	// <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+  2661717300U,	// <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+  1685274780U,	// <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  3711574118U,	// <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+  2240200806U,	// <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  3771663992U,	// <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+  2698585801U,	// <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+  3373672105U,	// <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+  3810813795U,	// <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+  3772327866U,	// <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+  3386280568U,	// <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+  2701903966U,	// <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+  3699638374U,	// <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+  2753560832U,	// <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  3772328276U,	// <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+  3827302674U,	// <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+  3699641654U,	// <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+  3779627588U,	// <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+  3772328604U,	// <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+  3780954854U,	// <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+  2753560832U,	// <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  2725129106U,	// <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+  1167720550U,	// <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  3839172953U,	// <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+  3772329051U,	// <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+  2241462610U,	// <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+  2698587446U,	// <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  3772329297U,	// <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+  3735483703U,	// <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+  1167721117U,	// <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+  1168556032U,	// <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  94814310U,	// <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2242298029U,	// <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2637859284U,	// <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+  1168556370U,	// <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2242306530U,	// <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+  2242298358U,	// <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661750072U,	// <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+  94814877U,	// <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+  3316580362U,	// <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+  2242846822U,	// <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3798872570U,	// <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+  3796218413U,	// <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+  3834528273U,	// <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+  3798872811U,	// <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+  3316621876U,	// <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+  2725131121U,	// <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+  2242847389U,	// <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3377692672U,	// <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+  2243493990U,	// <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  3775648970U,	// <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+  3802191110U,	// <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+  3317236050U,	// <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+  3803518376U,	// <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+  3317236214U,	// <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+  3798873708U,	// <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+  2243494557U,	// <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+  1170546688U,	// <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  96804966U,	// <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+  1685275293U,	// <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637883863U,	// <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+  1170547026U,	// <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2698590362U,	// <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  2244289014U,	// <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661774651U,	// <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+  96805533U,	// <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+  2667749478U,	// <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+  2689966182U,	// <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+  2238571418U,	// <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+  3711633880U,	// <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+  2689966418U,	// <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+  3361046866U,	// <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+  3741495802U,	// <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+  3741496314U,	// <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+  2689966765U,	// <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+  3764372222U,	// <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+  2758206263U,	// <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+  2698593178U,	// <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+  3361057810U,	// <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+  3827303250U,	// <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+  2287313234U,	// <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+  3763709171U,	// <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+  3361058138U,	// <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+  2239759744U,	// <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+  2637906022U,	// <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+  2637906842U,	// <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+  3763709544U,	// <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+  1685275546U,	// <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+  2637909302U,	// <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+  3361063250U,	// <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+  3763709882U,	// <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+  3735541054U,	// <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+  1685644231U,	// <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+  2702575792U,	// <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+  3832759257U,	// <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+  3833349090U,	// <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+  3763710364U,	// <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+  2707884546U,	// <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+  3361071442U,	// <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+  3772336796U,	// <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+  3775654595U,	// <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+  2707884856U,	// <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+  2667782246U,	// <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+  2241463092U,	// <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+  2241553306U,	// <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+  3827303484U,	// <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+  2667785424U,	// <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+  2689969462U,	// <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  3763711322U,	// <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+  3867116636U,	// <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+  2689969705U,	// <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  1546273106U,	// <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+  1168556852U,	// <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1168556950U,	// <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2620016790U,	// <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+  1546276150U,	// <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+  2620018692U,	// <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+  2242299087U,	// <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667795450U,	// <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+  1546278702U,	// <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+  3781628193U,	// <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+  3832759503U,	// <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+  3316261786U,	// <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+  3781628466U,	// <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+  3827303658U,	// <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+  3361096018U,	// <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+  3788264248U,	// <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+  3788264270U,	// <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+  3832759566U,	// <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+  2726466580U,	// <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+  3377692682U,	// <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+  3377694870U,	// <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+  3802199303U,	// <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+  2731775334U,	// <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+  3377693010U,	// <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+  3365749804U,	// <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+  3788265068U,	// <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+  2731775644U,	// <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+  1546297685U,	// <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+  1170547508U,	// <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1170547606U,	// <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  1689257344U,	// <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+  1546300726U,	// <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+  2284716370U,	// <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+  2244289743U,	// <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667820026U,	// <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+  1546303278U,	// <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+  3729621094U,	// <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+  3763716198U,	// <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+  2238858856U,	// <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+  2295930982U,	// <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3763716434U,	// <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+  2238859107U,	// <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+  2238859194U,	// <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+  3312601066U,	// <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+  2295930987U,	// <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3699769446U,	// <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+  3313255971U,	// <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+  3361056360U,	// <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+  2287312998U,	// <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3788932148U,	// <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+  3313256290U,	// <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+  3838289469U,	// <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+  3369682865U,	// <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+  2287313003U,	// <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3838658133U,	// <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+  3711722394U,	// <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+  2759018088U,	// <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+  2759018098U,	// <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+  3838658168U,	// <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+  3369027341U,	// <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+  2240227258U,	// <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+  3735614791U,	// <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+  2759018143U,	// <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+  2759018150U,	// <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+  3831948975U,	// <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+  3832759993U,	// <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+  2759018180U,	// <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+  2759018185U,	// <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+  3839542998U,	// <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+  3314640826U,	// <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+  2765948648U,	// <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+  2759018222U,	// <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+  3838658295U,	// <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+  3315205667U,	// <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+  2241463912U,	// <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+  1234829414U,	// <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2241464085U,	// <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+  2241546087U,	// <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+  2241464250U,	// <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+  3741602873U,	// <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+  1234829419U,	// <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2626060390U,	// <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+  2626061364U,	// <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+  1168557672U,	// <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222230118U,	// <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  2626063670U,	// <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+  2242299752U,	// <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1168558010U,	// <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2242299882U,	// <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1222230123U,	// <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+  3711754342U,	// <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+  3711755162U,	// <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+  3838658481U,	// <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+  2759018426U,	// <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+  3838658499U,	// <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+  3735646310U,	// <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+  3316590522U,	// <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+  3798889331U,	// <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+  2759018471U,	// <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+  3874564074U,	// <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+  3800880230U,	// <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+  3371722344U,	// <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+  2303950950U,	// <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  3371722346U,	// <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+  3371722509U,	// <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+  3317237690U,	// <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+  3317237738U,	// <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+  2303950955U,	// <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2759018555U,	// <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+  2626085943U,	// <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+  1170548328U,	// <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222254694U,	// <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2759018595U,	// <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+  2244290408U,	// <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1170548666U,	// <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2769266813U,	// <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+  1222254699U,	// <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2238859414U,	// <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+  2759018646U,	// <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+  3312314708U,	// <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+  2238859676U,	// <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+  2295931802U,	// <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+  3735670886U,	// <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+  3312315036U,	// <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+  3369674682U,	// <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+  2759018709U,	// <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+  3361055638U,	// <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+  3831949542U,	// <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+  2703917978U,	// <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3361056370U,	// <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+  2295939994U,	// <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+  3361056291U,	// <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+  3378972520U,	// <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+  3361056698U,	// <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+  2703917978U,	// <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3832760624U,	// <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+  3711796122U,	// <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+  3832760641U,	// <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+  2770962764U,	// <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+  2759018836U,	// <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+  3827304802U,	// <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+  3832760678U,	// <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+  3859597679U,	// <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+  2771331449U,	// <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+  2240841878U,	// <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+  3776997635U,	// <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+  2703919444U,	// <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+  2759018908U,	// <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+  2759018918U,	// <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+  3386951446U,	// <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+  3777661596U,	// <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+  3375007674U,	// <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+  2707901242U,	// <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+  2759018960U,	// <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+  2759018970U,	// <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+  2632099605U,	// <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+  2241464732U,	// <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+  2759019000U,	// <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+  2753563138U,	// <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+  3777662316U,	// <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+  2308573114U,	// <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+  2759019032U,	// <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+  1168558230U,	// <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2242300134U,	// <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+  2632107798U,	// <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+  1168558492U,	// <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1168558594U,	// <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2295973654U,	// <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+  2242300536U,	// <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295973818U,	// <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+  1168558878U,	// <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  3832760952U,	// <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+  3711828890U,	// <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+  3316484436U,	// <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+  3711830512U,	// <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+  2759019164U,	// <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3361097251U,	// <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+  3316624045U,	// <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+  2773912244U,	// <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+  2759019164U,	// <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3377693590U,	// <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+  3365751680U,	// <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+  2727810232U,	// <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+  3377694322U,	// <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+  2303951770U,	// <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+  3741700198U,	// <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+  3377695216U,	// <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+  3375703994U,	// <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+  2731792030U,	// <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+  1170548886U,	// <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2759019294U,	// <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+  2632132377U,	// <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+  1170549148U,	// <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1170549250U,	// <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2759019334U,	// <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+  2244291192U,	// <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295998394U,	// <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+  1170549534U,	// <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  1165118354U,	// <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1637482598U,	// <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  3711854285U,	// <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+  3827305344U,	// <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+  2711224658U,	// <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+  1165118774U,	// <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3312602489U,	// <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+  3369675420U,	// <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+  1165119017U,	// <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3369682633U,	// <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+  2287313581U,	// <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+  2759019466U,	// <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+  3369683284U,	// <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+  2311204048U,	// <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+  2239319350U,	// <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+  3784967411U,	// <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+  3369683612U,	// <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+  2763000832U,	// <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+  3711869030U,	// <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+  3711869850U,	// <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+  2240203830U,	// <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+  2698618573U,	// <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+  2711226133U,	// <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+  2240204086U,	// <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2711226298U,	// <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+  3832761416U,	// <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+  2701936738U,	// <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+  2711226518U,	// <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+  3777005828U,	// <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+  3832761453U,	// <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+  2301266260U,	// <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  2705254903U,	// <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+  2240843062U,	// <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  3832761489U,	// <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+  3375008412U,	// <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+  2301266260U,	// <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  1570373734U,	// <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+  2308574089U,	// <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+  2644117096U,	// <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+  2638146039U,	// <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+  229035318U,	// <4,4,4,4>: Cost 1 vspltisw0 RHS
+  1167723830U,	// <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+  2644120058U,	// <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+  2662036827U,	// <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+  229035318U,	// <4,4,4,u>: Cost 1 vspltisw0 RHS
+  1168558994U,	// <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  2638152602U,	// <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+  2242300981U,	// <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638154232U,	// <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+  1168559322U,	// <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+  94817590U,	// <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+  1685278006U,	// <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2242309576U,	// <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  94817833U,	// <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+  3316591506U,	// <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+  3758428587U,	// <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+  2711228922U,	// <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+  3796251185U,	// <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+  2711229085U,	// <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+  2242850102U,	// <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2242850169U,	// <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+  2725163893U,	// <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+  2242850345U,	// <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2711229434U,	// <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+  3377694410U,	// <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+  3868593584U,	// <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+  3377695060U,	// <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+  2729145691U,	// <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+  2243497270U,	// <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  3871542744U,	// <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+  2303953564U,	// <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+  2243497513U,	// <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+  1170549650U,	// <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  1637488430U,	// <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  2244291637U,	// <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638178811U,	// <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+  229035318U,	// <4,4,u,4>: Cost 1 vspltisw0 RHS
+  96808246U,	// <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+  1685278249U,	// <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2244292040U,	// <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  96808489U,	// <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+  2698625024U,	// <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+  1624883302U,	// <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2638186190U,	// <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+  2638187004U,	// <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+  2687345005U,	// <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+  2238861316U,	// <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+  2662077302U,	// <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+  2662077792U,	// <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+  1624883869U,	// <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  3361057762U,	// <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+  2691326803U,	// <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+  2698625942U,	// <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+  3361055659U,	// <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+  3761087567U,	// <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+  2693981335U,	// <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+  2305231362U,	// <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  3361055987U,	// <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+  2695972234U,	// <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+  2638200934U,	// <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+  3761088035U,	// <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+  2697963133U,	// <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+  1624884942U,	// <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+  2698626838U,	// <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+  3772368744U,	// <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+  2698627002U,	// <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+  3775023122U,	// <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+  1628203107U,	// <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+  2698627222U,	// <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+  3765070057U,	// <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+  2698627404U,	// <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+  2698627484U,	// <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+  2698627580U,	// <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+  3779668553U,	// <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+  2725169844U,	// <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+  2707253995U,	// <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+  2698627870U,	// <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+  2638217318U,	// <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+  2308574098U,	// <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+  2698628150U,	// <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+  2638219776U,	// <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+  2698628314U,	// <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+  1624886582U,	// <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  2698628478U,	// <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+  2662110564U,	// <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+  1624886825U,	// <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1570455654U,	// <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+  2312564250U,	// <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+  2644199118U,	// <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+  2295974966U,	// <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+  1570458842U,	// <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+  1168568324U,	// <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+  1168568418U,	// <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2295975294U,	// <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+  1168716036U,	// <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+  1564491878U,	// <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+  2626290768U,	// <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+  2632263465U,	// <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+  1564494338U,	// <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+  1564495158U,	// <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+  2638237464U,	// <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+  2656154253U,	// <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
+  2725172218U,	// <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+  3859599489U,	// <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+  2698630320U,	// <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+  2728490251U,	// <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+  2725172576U,	// <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+  3317239812U,	// <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+  2725172760U,	// <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+  2725172844U,	// <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+  2725172866U,	// <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+  1564508262U,	// <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+  1624889134U,	// <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2698631045U,	// <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+  1564510724U,	// <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+  1564511542U,	// <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+  1624889498U,	// <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1170550882U,	// <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
+  3312595285U,	// <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+  3763748966U,	// <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+  2238861818U,	// <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+  3767730432U,	// <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+  3763749202U,	// <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+  2238862059U,	// <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+  2238862136U,	// <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+  2295934262U,	// <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  2295934263U,	// <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+  3378973999U,	// <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+  3378974648U,	// <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+  3779675034U,	// <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+  3378974002U,	// <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+  3378974003U,	// <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+  3767731352U,	// <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+  3378974734U,	// <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+  2287316278U,	// <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  2287316279U,	// <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+  3735904358U,	// <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+  3763750435U,	// <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+  3313938937U,	// <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+  3772376782U,	// <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+  3852890591U,	// <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+  3735908454U,	// <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+  3801573306U,	// <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+  2785858042U,	// <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+  2785858051U,	// <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+  3863065101U,	// <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+  3314586024U,	// <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+  3863212575U,	// <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+  3863286312U,	// <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+  3767732738U,	// <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+  3779676746U,	// <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+  3398898488U,	// <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+  2301267254U,	// <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2301267255U,	// <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+  3852890715U,	// <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+  3315208615U,	// <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+  2241466874U,	// <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+  3852890745U,	// <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+  2241467037U,	// <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+  2241549039U,	// <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+  2241467192U,	// <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+  1234832694U,	// <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  1234832695U,	// <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+  2242302241U,	// <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2242310567U,	// <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1168568826U,	// <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2242302514U,	// <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2242302605U,	// <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2242310891U,	// <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1168569144U,	// <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222233398U,	// <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  1222233399U,	// <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+  3316576545U,	// <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+  3316584871U,	// <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+  2242851322U,	// <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+  3316601394U,	// <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+  3852890916U,	// <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+  3316617963U,	// <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+  2242884408U,	// <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+  2785858370U,	// <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+  2785858379U,	// <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+  2785858382U,	// <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+  3859600215U,	// <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+  3317240314U,	// <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+  2792199020U,	// <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+  2785858422U,	// <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+  3856651132U,	// <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+  3317240632U,	// <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+  2303954230U,	// <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303954231U,	// <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2244292897U,	// <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2244293031U,	// <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1170551290U,	// <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2244293170U,	// <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2244293261U,	// <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2244293355U,	// <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1170551608U,	// <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222257974U,	// <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+  1222257975U,	// <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+  2238862330U,	// <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+  2706604134U,	// <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3312604308U,	// <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+  3768402176U,	// <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+  2238862648U,	// <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+  3859600418U,	// <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+  3729994393U,	// <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+  2238862956U,	// <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+  2706604701U,	// <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3385610338U,	// <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+  3780346676U,	// <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+  2706604954U,	// <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3385610746U,	// <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+  3385610342U,	// <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+  3385610667U,	// <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+  3768403178U,	// <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+  3385611074U,	// <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+  2706604954U,	// <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3859600532U,	// <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+  3712091034U,	// <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+  3774375528U,	// <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+  2794853552U,	// <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+  2785858744U,	// <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+  3735982182U,	// <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+  3774375875U,	// <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+  3735983476U,	// <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+  2795222237U,	// <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+  3780348054U,	// <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+  3730015130U,	// <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+  3780348244U,	// <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+  3778357673U,	// <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+  2325155942U,	// <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+  3779684939U,	// <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+  2706606748U,	// <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+  3398898498U,	// <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+  2707934014U,	// <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+  2785858868U,	// <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+  3780348874U,	// <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+  3780349000U,	// <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+  2308575738U,	// <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+  2656283856U,	// <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+  2706607414U,	// <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2656285341U,	// <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+  2241468012U,	// <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+  2706607657U,	// <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  1168569338U,	// <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2242311242U,	// <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+  2242303178U,	// <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+  2242311395U,	// <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1168569702U,	// <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2242311606U,	// <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+  2242311662U,	// <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1168569964U,	// <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1168569986U,	// <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  3316593658U,	// <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+  3316593738U,	// <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+  3316634800U,	// <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+  3386978810U,	// <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+  2785859072U,	// <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+  3736014950U,	// <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+  3316594158U,	// <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+  2797803032U,	// <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+  2797876769U,	// <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+  2243499002U,	// <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+  3718103962U,	// <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+  3317257418U,	// <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+  3377695816U,	// <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+  2243532134U,	// <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+  3317282230U,	// <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+  2730497536U,	// <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+  2243556972U,	// <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+  2243565186U,	// <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+  1170551802U,	// <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2706609966U,	// <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  2244293797U,	// <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+  2244293859U,	// <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1170552166U,	// <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2706610330U,	// <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2244294126U,	// <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1170552428U,	// <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1170552450U,	// <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  1165118354U,	// <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1624907878U,	// <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638407377U,	// <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+  2295931036U,	// <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  2687369584U,	// <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+  1165121690U,	// <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  2662298489U,	// <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+  2295934280U,	// <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  1624908445U,	// <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638413926U,	// <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+  2691351382U,	// <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+  1685280558U,	// <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2287313052U,	// <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  2299257799U,	// <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+  2694005914U,	// <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+  2305231362U,	// <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  2287316296U,	// <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  1685280612U,	// <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2638422118U,	// <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+  2240206638U,	// <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  2697987712U,	// <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+  1624909521U,	// <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+  2759391121U,	// <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+  2240207002U,	// <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2698651578U,	// <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+  2785859500U,	// <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+  1628227686U,	// <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+  2759022524U,	// <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+  2801342408U,	// <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+  2703960409U,	// <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+  2759022554U,	// <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+  2759022564U,	// <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+  2240845978U,	// <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  2706614941U,	// <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+  2301267272U,	// <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2759022596U,	// <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+  1570668646U,	// <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+  1167726382U,	// <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  2698652753U,	// <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+  1234829468U,	// <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  229035318U,	// <4,u,4,4>: Cost 1 vspltisw0 RHS
+  1624911158U,	// <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+  2698653081U,	// <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+  1234832712U,	// <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  229035318U,	// <4,u,4,u>: Cost 1 vspltisw0 RHS
+  1168561875U,	// <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+  94820142U,	// <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1168562053U,	// <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+  1222230172U,	// <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  1168562239U,	// <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+  94820506U,	// <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+  1685280922U,	// <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  1222233416U,	// <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  94820709U,	// <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+  1564713062U,	// <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+  2626511979U,	// <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+  2632484676U,	// <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+  1564715549U,	// <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+  1564716342U,	// <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+  2242853018U,	// <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2656375464U,	// <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
+  2785859840U,	// <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+  2243499822U,	// <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  2727851197U,	// <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+  2303951004U,	// <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2785859880U,	// <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+  2243500186U,	// <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  2730505729U,	// <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+  2303954248U,	// <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303951009U,	// <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  1564729446U,	// <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+  96810798U,	// <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+  1685281125U,	// <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  1222254748U,	// <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  229035318U,	// <4,u,u,4>: Cost 1 vspltisw0 RHS
+  96811162U,	// <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+  1685281165U,	// <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
+  2754232320U,	// <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+  2754232330U,	// <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+  3718194894U,	// <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+  3376385762U,	// <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+  2754232357U,	// <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+  3845816370U,	// <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+  3782353389U,	// <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+  3376386090U,	// <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+  2757402697U,	// <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+  2626543718U,	// <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+  2626544751U,	// <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+  1680490598U,	// <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3766428665U,	// <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+  2626546998U,	// <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+  2650435539U,	// <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+  3783017715U,	// <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+  3385019000U,	// <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+  1680490652U,	// <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3376398336U,	// <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+  2245877862U,	// <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  3773064808U,	// <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+  2705295054U,	// <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  3827974343U,	// <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+  3845816530U,	// <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+  3779037114U,	// <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+  3810887658U,	// <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+  2245878429U,	// <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2710603926U,	// <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+  3827974396U,	// <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+  3779037516U,	// <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+  3779037596U,	// <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+  2705295868U,	// <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+  3379726804U,	// <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+  3802925748U,	// <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+  3363138168U,	// <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+  2707950400U,	// <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+  2626568294U,	// <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+  1680490834U,	// <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  3828048219U,	// <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+  2710604932U,	// <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+  2754232685U,	// <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+  2705296694U,	// <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779038590U,	// <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+  2713259464U,	// <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1680490834U,	// <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  2311307264U,	// <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+  1174437990U,	// <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  3779038946U,	// <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+  3845816752U,	// <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+  2248180050U,	// <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+  2248180194U,	// <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+  3779039274U,	// <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+  3385051768U,	// <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+  1174438557U,	// <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2302689280U,	// <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+  1175208038U,	// <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  3787002362U,	// <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+  3376432160U,	// <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+  2248950098U,	// <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+  2248950180U,	// <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  3376433702U,	// <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+  2729186166U,	// <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+  1175208605U,	// <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2713261050U,	// <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+  3365823599U,	// <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+  3808900317U,	// <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+  3784348899U,	// <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+  2729186656U,	// <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+  3787003268U,	// <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+  3802928664U,	// <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+  3787003431U,	// <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+  2731841188U,	// <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+  2626601062U,	// <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+  1683145366U,	// <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+  1680491165U,	// <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2705295054U,	// <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  2754233005U,	// <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+  2705299610U,	// <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779041488U,	// <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+  2737150252U,	// <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+  1680491219U,	// <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2713927680U,	// <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+  1640185958U,	// <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2310607866U,	// <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  3787669756U,	// <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+  2713928018U,	// <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+  2306621778U,	// <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+  3787670006U,	// <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+  3736188301U,	// <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+  1640186525U,	// <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2650505318U,	// <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+  2754233140U,	// <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+  2311276694U,	// <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+  2311278315U,	// <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+  2758435667U,	// <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+  2754233180U,	// <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+  3385016497U,	// <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+  2311278643U,	// <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+  2758730615U,	// <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+  3700367462U,	// <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+  3830629255U,	// <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+  2713929320U,	// <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+  2754233238U,	// <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+  2759099300U,	// <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+  2754233259U,	// <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+  2713929658U,	// <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+  3872359354U,	// <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+  2754233283U,	// <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+  2713929878U,	// <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+  3363135498U,	// <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+  3363137686U,	// <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+  2713930140U,	// <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+  2713930242U,	// <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+  2289394002U,	// <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+  3787672184U,	// <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+  3787672259U,	// <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+  2713930526U,	// <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+  1634880402U,	// <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+  2760205355U,	// <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+  2760279092U,	// <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+  3787672708U,	// <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+  2713930960U,	// <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+  1640189238U,	// <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  3786345848U,	// <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+  3787009481U,	// <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+  1640189466U,	// <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+  2754233455U,	// <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+  2713931407U,	// <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+  2713931499U,	// <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+  3827975305U,	// <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+  2754233495U,	// <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+  2288746834U,	// <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+  2713931827U,	// <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+  3787673725U,	// <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+  2754233527U,	// <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+  2668462182U,	// <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+  2290746002U,	// <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+  2302691478U,	// <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+  3364488071U,	// <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+  2302689536U,	// <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+  2754233587U,	// <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+  2713932600U,	// <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+  2713932622U,	// <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+  2302689297U,	// <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+  2713932794U,	// <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+  3365822474U,	// <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+  3365824662U,	// <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+  3787674851U,	// <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+  2713933158U,	// <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+  2292080978U,	// <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+  3365823613U,	// <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+  2713933420U,	// <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+  2713933442U,	// <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+  1658771190U,	// <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+  1640191790U,	// <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2762933624U,	// <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+  2754233724U,	// <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+  2763081098U,	// <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+  1640192154U,	// <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  2713934032U,	// <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+  2713934080U,	// <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+  1640192357U,	// <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  3779051520U,	// <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+  2705309798U,	// <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  3838813637U,	// <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+  2302640230U,	// <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3765117266U,	// <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+  3381027892U,	// <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+  3842794985U,	// <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+  3408232554U,	// <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+  2302640235U,	// <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3700432998U,	// <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+  3765117785U,	// <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+  2311276136U,	// <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+  1237532774U,	// <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700436278U,	// <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+  3381036084U,	// <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+  3385018045U,	// <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+  3385017560U,	// <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+  1237532779U,	// <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700441190U,	// <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+  3700442242U,	// <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+  2754233960U,	// <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+  2754233970U,	// <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+  2765071997U,	// <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+  3834021508U,	// <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+  3842795152U,	// <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+  3376402492U,	// <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+  2754234015U,	// <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+  2754234022U,	// <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+  3827975855U,	// <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+  2644625102U,	// <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393766U,	// <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1691993806U,	// <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+  2785052375U,	// <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+  3854812897U,	// <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+  3802942187U,	// <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+  1692288754U,	// <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+  3839846139U,	// <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+  2709294052U,	// <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+  2766251789U,	// <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+  2765735702U,	// <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+  3840141087U,	// <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+  2705313078U,	// <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2712612217U,	// <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+  3787017674U,	// <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+  2765735747U,	// <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+  3834021704U,	// <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+  3834021714U,	// <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+  2311308904U,	// <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+  1237565542U,	// <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3834021744U,	// <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+  3369124916U,	// <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+  2248181690U,	// <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+  3786354825U,	// <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+  1237565547U,	// <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3700473958U,	// <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+  3700475014U,	// <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+  2296718952U,	// <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+  1228947558U,	// <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3700477238U,	// <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+  3834021836U,	// <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+  2248951738U,	// <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+  3370461105U,	// <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+  1228947563U,	// <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3786355706U,	// <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+  3783038037U,	// <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+  3365824104U,	// <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+  2292080742U,	// <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+  3842131986U,	// <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+  3371795508U,	// <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+  3786356206U,	// <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+  3786356332U,	// <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+  2292080747U,	// <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+  2754234427U,	// <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+  2705315630U,	// <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  2296735336U,	// <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+  1228963942U,	// <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  1695311971U,	// <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+  2705315994U,	// <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2769201269U,	// <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+  3370477489U,	// <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+  1695606919U,	// <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+  3827976331U,	// <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+  2754234518U,	// <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+  3706472290U,	// <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+  3700500630U,	// <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+  2754234544U,	// <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+  3376383766U,	// <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+  3769770513U,	// <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+  3376383930U,	// <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+  2754234581U,	// <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+  2311275414U,	// <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+  2305967971U,	// <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+  2692047787U,	// <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+  2311276146U,	// <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+  2311275418U,	// <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+  3765789807U,	// <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+  3765789939U,	// <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+  2311276474U,	// <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+  2696029585U,	// <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+  2311288709U,	// <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  3765790243U,	// <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+  3827976513U,	// <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+  2765736268U,	// <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+  2246248962U,	// <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+  3765790563U,	// <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+  3827976550U,	// <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+  3842795887U,	// <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+  2769054073U,	// <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+  3827976575U,	// <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+  3765790963U,	// <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+  3839478162U,	// <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+  2754234780U,	// <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+  2771708327U,	// <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+  3363137059U,	// <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+  3375081320U,	// <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+  3363137466U,	// <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+  2772003275U,	// <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+  2772077012U,	// <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+  3765791714U,	// <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+  2709965878U,	// <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+  2772298223U,	// <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+  2772371960U,	// <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+  2754234882U,	// <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+  3839478282U,	// <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+  3376416698U,	// <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+  2754234909U,	// <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+  2311308182U,	// <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+  3765792421U,	// <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+  2715938575U,	// <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+  2311308914U,	// <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+  2311308186U,	// <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+  2248182354U,	// <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+  3765792837U,	// <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+  2311309242U,	// <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+  2311308190U,	// <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+  2632777830U,	// <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3706520372U,	// <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+  2632779624U,	// <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+  2632780290U,	// <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+  2632781110U,	// <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+  2248952413U,	// <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+  2302691176U,	// <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302691258U,	// <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+  2632783662U,	// <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3365823382U,	// <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+  3706529011U,	// <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+  3706529641U,	// <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+  3365824114U,	// <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+  2774362859U,	// <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+  3365824035U,	// <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+  3383740183U,	// <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+  3363833786U,	// <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+  2774657807U,	// <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+  2632794214U,	// <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+  2754235166U,	// <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+  2632796010U,	// <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+  2632796676U,	// <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+  2632797494U,	// <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+  2754235206U,	// <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+  2302691176U,	// <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302707642U,	// <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+  2754235229U,	// <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+  3765133325U,	// <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+  2705326182U,	// <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  3718489806U,	// <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+  3718490624U,	// <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+  2709307730U,	// <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+  2302641870U,	// <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+  3376383695U,	// <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+  3384351018U,	// <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+  2705326749U,	// <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  2305971057U,	// <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+  3765134171U,	// <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+  3766461338U,	// <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+  3766461437U,	// <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+  2311277776U,	// <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+  2754235362U,	// <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+  3783050483U,	// <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+  3385019036U,	// <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+  2311276241U,	// <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+  3718504550U,	// <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+  3783050787U,	// <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+  3773097576U,	// <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+  2705327822U,	// <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  3773097767U,	// <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+  2765737014U,	// <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+  3779069882U,	// <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+  3376401052U,	// <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+  2245881370U,	// <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+  3779070102U,	// <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+  3363135525U,	// <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+  3779070284U,	// <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+  3779070364U,	// <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+  2705328640U,	// <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+  2307311310U,	// <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+  3866021012U,	// <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+  3363138204U,	// <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+  2707983172U,	// <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+  2708646805U,	// <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+  2709310438U,	// <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+  3779071030U,	// <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+  2710637704U,	// <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+  2754235600U,	// <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+  1704676570U,	// <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+  3779071358U,	// <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+  2713292236U,	// <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+  1704897781U,	// <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+  2626871398U,	// <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+  2626872471U,	// <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+  2765737230U,	// <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+  3700615318U,	// <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+  2626874678U,	// <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+  1174441270U,	// <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+  1680493878U,	// <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  3385051804U,	// <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+  1680493896U,	// <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2248952722U,	// <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+  2302692152U,	// <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  3382406107U,	// <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+  3700623874U,	// <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+  2248953040U,	// <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+  1175211318U,	// <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3376432280U,	// <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+  2729218934U,	// <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+  1175211561U,	// <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3787035642U,	// <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+  3365822501U,	// <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+  3808933085U,	// <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+  3784381707U,	// <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+  2713294182U,	// <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+  2309998286U,	// <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+  3383740111U,	// <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+  3787036239U,	// <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+  2731873960U,	// <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+  2626895974U,	// <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+  2626897050U,	// <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+  2644813518U,	// <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+  2705327822U,	// <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  2626899254U,	// <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+  1707331102U,	// <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+  1680494121U,	// <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2737183024U,	// <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+  1680494139U,	// <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2302642684U,	// <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+  1640218726U,	// <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  3376384510U,	// <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+  3376385078U,	// <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+  2754236002U,	// <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+  2717942242U,	// <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+  2244907106U,	// <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  3376385406U,	// <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+  1640219293U,	// <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2305969365U,	// <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+  1237536282U,	// <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2713961366U,	// <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+  3766469630U,	// <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+  2782326455U,	// <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+  2311277786U,	// <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+  2311277058U,	// <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+  3385017587U,	// <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+  1237536282U,	// <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  3376400892U,	// <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+  3827977963U,	// <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+  2302659070U,	// <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+  2765737726U,	// <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+  3839479558U,	// <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+  2781073167U,	// <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+  2713962426U,	// <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+  3376401790U,	// <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+  2769055531U,	// <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+  2713962646U,	// <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+  3765143786U,	// <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+  3839479621U,	// <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+  2289394603U,	// <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+  2713963010U,	// <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+  2313285150U,	// <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+  3363138050U,	// <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+  3363136755U,	// <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+  2713963294U,	// <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+  2713963410U,	// <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+  3827978127U,	// <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+  3839479704U,	// <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+  3376417846U,	// <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+  1637567706U,	// <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+  1640222006U,	// <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+  2310640998U,	// <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+  3376418174U,	// <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+  1640222238U,	// <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+  1577091174U,	// <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  2311310226U,	// <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+  2713964303U,	// <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+  2311311119U,	// <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+  1577094454U,	// <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,5,5>: Cost 1 vspltisw1 RHS
+  2311309826U,	// <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+  2311311447U,	// <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+  296144182U,	// <5,5,5,u>: Cost 1 vspltisw1 RHS
+  2248953460U,	// <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+  2326580114U,	// <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+  2713965050U,	// <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+  3700697602U,	// <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+  2785644620U,	// <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+  2781073495U,	// <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+  1228950018U,	// <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965390U,	// <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+  1228950018U,	// <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965562U,	// <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+  3383741330U,	// <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+  3718620878U,	// <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+  3365823403U,	// <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+  2713965926U,	// <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+  2717947318U,	// <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+  3365825026U,	// <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+  2292081907U,	// <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+  2713966210U,	// <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+  1577091174U,	// <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1640224558U,	// <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2713966469U,	// <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+  2713966524U,	// <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+  1577094454U,	// <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,u,5>: Cost 1 vspltisw1 RHS
+  1228950018U,	// <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713966848U,	// <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+  296144182U,	// <5,5,u,u>: Cost 1 vspltisw1 RHS
+  2705342464U,	// <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+  1631600742U,	// <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3773112493U,	// <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+  2705342720U,	// <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+  2705342802U,	// <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+  3779084708U,	// <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+  3779084790U,	// <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+  2302643510U,	// <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631601309U,	// <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3767141092U,	// <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+  2705343284U,	// <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+  2705343382U,	// <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+  3779085282U,	// <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+  2693399632U,	// <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+  3767805089U,	// <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+  2311279416U,	// <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+  1237536054U,	// <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1237536055U,	// <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+  3773113789U,	// <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+  3779085855U,	// <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+  2699372136U,	// <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+  2705344166U,	// <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+  2699372329U,	// <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+  2705344360U,	// <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+  2705344442U,	// <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+  2302659894U,	// <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2702026861U,	// <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+  2705344662U,	// <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+  3767142661U,	// <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+  3773114689U,	// <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+  2705344924U,	// <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+  1631603202U,	// <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+  3842945597U,	// <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+  3779086962U,	// <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+  2289397046U,	// <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634257734U,	// <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+  2644926566U,	// <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+  3779087306U,	// <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+  2790142577U,	// <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+  2644929026U,	// <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+  2711317723U,	// <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+  1631604022U,	// <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  2712644989U,	// <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+  2302676278U,	// <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631604265U,	// <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  3842945708U,	// <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+  3767144133U,	// <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+  2705346328U,	// <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+  3779088207U,	// <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+  2717290420U,	// <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+  2705346574U,	// <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+  2705346596U,	// <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+  1237568822U,	// <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  1237568823U,	// <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+  2650914918U,	// <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+  3364490949U,	// <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+  2248954362U,	// <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+  2302693144U,	// <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+  2650918198U,	// <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+  2650918926U,	// <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+  2302693390U,	// <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+  1228950838U,	// <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228950839U,	// <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+  497467494U,	// <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571210036U,	// <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571210856U,	// <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571211414U,	// <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497470774U,	// <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571213316U,	// <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571213818U,	// <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571214956U,	// <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+  497473326U,	// <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497475686U,	// <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631606574U,	// <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  1571219048U,	// <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571219606U,	// <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497478967U,	// <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631606938U,	// <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  1571222010U,	// <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1228967222U,	// <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497481518U,	// <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+  3768475648U,	// <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+  2694733926U,	// <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  3718711395U,	// <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+  3384349178U,	// <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+  2694734162U,	// <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+  3384347884U,	// <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+  3730658026U,	// <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+  3718714362U,	// <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+  2694734493U,	// <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2311278690U,	// <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+  2305970923U,	// <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+  3768476566U,	// <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+  2311279098U,	// <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+  2311278694U,	// <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+  3768476783U,	// <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+  2694735091U,	// <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+  2311279426U,	// <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+  2696062357U,	// <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+  3383701602U,	// <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+  3768477219U,	// <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+  3768477288U,	// <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+  2309960186U,	// <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3383701606U,	// <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+  3768477545U,	// <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+  3766486970U,	// <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+  3383702338U,	// <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+  2309960186U,	// <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3768477846U,	// <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+  3768477975U,	// <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+  3786393932U,	// <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+  3768478108U,	// <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+  2795599115U,	// <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+  3385037470U,	// <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+  3780422309U,	// <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+  3848107301U,	// <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+  2795894063U,	// <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+  2795967800U,	// <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+  3768478690U,	// <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+  3718744163U,	// <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+  3784404107U,	// <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+  2796262748U,	// <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+  2694737206U,	// <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2712653182U,	// <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+  2713316815U,	// <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+  2694737449U,	// <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2311311458U,	// <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+  3768479433U,	// <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+  3768479521U,	// <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+  2311311866U,	// <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+  2311311462U,	// <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+  2248185270U,	// <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+  2718625879U,	// <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+  2311312194U,	// <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+  2311311466U,	// <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+  2248954874U,	// <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+  3322696778U,	// <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+  2248955028U,	// <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2656963074U,	// <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+  2248955238U,	// <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+  2248955329U,	// <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+  2656965360U,	// <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+  2248955500U,	// <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+  2248955522U,	// <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+  3718766694U,	// <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+  3724739827U,	// <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+  3718768739U,	// <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+  3365826337U,	// <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+  2798253647U,	// <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+  3365826258U,	// <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+  3730715377U,	// <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+  2310665836U,	// <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+  2798548595U,	// <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+  2311336034U,	// <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+  2694739758U,	// <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2248955028U,	// <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2311336442U,	// <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+  2311336038U,	// <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+  2694740122U,	// <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2656981746U,	// <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+  2311336770U,	// <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+  2694740325U,	// <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2705358848U,	// <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+  1631617126U,	// <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2310607866U,	// <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  2302640284U,	// <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  2754238189U,	// <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+  2305296114U,	// <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+  2244907106U,	// <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  2302643528U,	// <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631617693U,	// <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2627133542U,	// <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+  1237536282U,	// <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  1680496430U,	// <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1237532828U,	// <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  2693416018U,	// <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+  2756892486U,	// <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+  2694743284U,	// <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+  1237536072U,	// <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1680496484U,	// <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2311288709U,	// <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  2245883694U,	// <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2699388520U,	// <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+  2754238344U,	// <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+  2699388715U,	// <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+  2757408666U,	// <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+  2705360826U,	// <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+  2302659912U,	// <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2754238389U,	// <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+  2754238396U,	// <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+  3827980229U,	// <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+  2644625102U,	// <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393820U,	// <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1631619588U,	// <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+  2785056749U,	// <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+  3363138077U,	// <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+  2289397064U,	// <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634274120U,	// <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+  1634937753U,	// <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+  1728272410U,	// <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+  2710006843U,	// <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+  2765740076U,	// <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+  1637592285U,	// <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+  1631620406U,	// <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  2712661375U,	// <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+  2302676296U,	// <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631620649U,	// <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  1577091174U,	// <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1174443822U,	// <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2766035058U,	// <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+  1237565596U,	// <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  1577094454U,	// <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,u,5,5>: Cost 1 vspltisw1 RHS
+  1680496794U,	// <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1237568840U,	// <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  296144182U,	// <5,u,5,u>: Cost 1 vspltisw1 RHS
+  2633146470U,	// <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+  1175213870U,	// <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2633148309U,	// <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+  1228947612U,	// <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  2633149750U,	// <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+  1175214234U,	// <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  1228950018U,	// <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  1228950856U,	// <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228947617U,	// <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  497614950U,	// <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571357492U,	// <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571358312U,	// <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571358870U,	// <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497618248U,	// <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571360772U,	// <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571361274U,	// <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571361786U,	// <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+  497620782U,	// <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497623142U,	// <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631622958U,	// <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  1680496997U,	// <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1228963996U,	// <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  497626441U,	// <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+  296144182U,	// <5,u,u,5>: Cost 1 vspltisw1 RHS
+  1680497037U,	// <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1228967240U,	// <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497628974U,	// <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+  2772451328U,	// <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+  2772451338U,	// <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+  3771146417U,	// <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+  3383095739U,	// <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+  3846193189U,	// <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+  3724832803U,	// <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+  3383095985U,	// <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+  3383096067U,	// <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+  2772451401U,	// <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+  2651095142U,	// <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+  2251612262U,	// <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+  1698709606U,	// <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2651097602U,	// <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+  2651098422U,	// <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+  2651099172U,	// <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+  2657071869U,	// <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+  3724841978U,	// <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+  1698709660U,	// <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2252292096U,	// <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+  1178550374U,	// <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3826655418U,	// <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+  3777783485U,	// <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+  2252292434U,	// <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+  3785746280U,	// <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+  2252292593U,	// <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+  3736794583U,	// <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+  1178550941U,	// <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3375153152U,	// <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+  2772451584U,	// <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+  3777784163U,	// <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+  3846193426U,	// <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+  2712005122U,	// <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+  3724857382U,	// <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+  3802335864U,	// <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+  3801672410U,	// <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+  2772451647U,	// <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+  3383123968U,	// <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+  2772451666U,	// <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+  3773803577U,	// <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+  3724864002U,	// <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+  3846193517U,	// <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+  2712005935U,	// <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+  3327009265U,	// <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+  3383126648U,	// <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+  2772451729U,	// <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+  3373178880U,	// <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+  2254266470U,	// <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+  3785748248U,	// <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+  3790393190U,	// <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+  3328000338U,	// <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+  3785748494U,	// <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+  3785748516U,	// <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+  3379153528U,	// <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+  2254267037U,	// <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+  2254897152U,	// <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+  1181155430U,	// <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  3785748923U,	// <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+  3785749042U,	// <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+  2254897490U,	// <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+  3785749169U,	// <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+  2724614962U,	// <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+  3787739982U,	// <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+  1181155997U,	// <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1235664896U,	// <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235666598U,	// <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  3712943720U,	// <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+  2639202936U,	// <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+  2639203638U,	// <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+  2309409236U,	// <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  3712946517U,	// <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+  2309409400U,	// <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235666605U,	// <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  1235673088U,	// <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235674790U,	// <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  1698710173U,	// <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2639211129U,	// <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+  2639211830U,	// <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+  2712008858U,	// <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+  2657129220U,	// <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+  2309417592U,	// <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1698710227U,	// <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  3775799296U,	// <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+  2702057574U,	// <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3373143763U,	// <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+  3695045122U,	// <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+  3775799634U,	// <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+  3383091538U,	// <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+  3368493233U,	// <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+  3362522319U,	// <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+  2702058141U,	// <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3834250027U,	// <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+  2772452148U,	// <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  3832038210U,	// <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+  3373150660U,	// <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+  3834250067U,	// <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+  3373146450U,	// <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+  3826656102U,	// <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+  3362530511U,	// <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+  2772452148U,	// <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  2669092966U,	// <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+  2252292916U,	// <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+  2252293014U,	// <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+  2772452246U,	// <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+  2669096246U,	// <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+  3846194091U,	// <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+  2702059450U,	// <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+  3870081978U,	// <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+  2702059633U,	// <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+  3775801494U,	// <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+  3777128723U,	// <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+  3775801702U,	// <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+  3775801756U,	// <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+  3775801858U,	// <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+  3375153490U,	// <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+  3826656265U,	// <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+  3775802051U,	// <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+  3775802142U,	// <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+  3846194206U,	// <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+  3846194219U,	// <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+  3846194228U,	// <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+  3846194236U,	// <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+  3846194246U,	// <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+  2760508496U,	// <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+  3368526001U,	// <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+  3870082144U,	// <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+  2760729707U,	// <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+  2714668660U,	// <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+  3834619005U,	// <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+  3834692742U,	// <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+  3846194317U,	// <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+  3834840216U,	// <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+  3834913953U,	// <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+  2719977570U,	// <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+  3367208143U,	// <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+  2719977724U,	// <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+  2669125734U,	// <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+  2254897972U,	// <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+  2254898070U,	// <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+  3775803929U,	// <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+  2669129014U,	// <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+  2322006354U,	// <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+  2725950264U,	// <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+  3793720142U,	// <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+  2254898556U,	// <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+  2627330150U,	// <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+  1235664906U,	// <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235667094U,	// <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309406894U,	// <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2627333430U,	// <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+  1235665234U,	// <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309406897U,	// <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309407222U,	// <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235664913U,	// <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  2627338342U,	// <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+  1235673098U,	// <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235675286U,	// <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2772452732U,	// <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+  2627341622U,	// <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+  1235673426U,	// <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309415089U,	// <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309415414U,	// <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235673105U,	// <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  3324683725U,	// <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+  2725290086U,	// <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+  3771162801U,	// <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+  2309349478U,	// <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3730951478U,	// <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+  3840738784U,	// <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+  3842655721U,	// <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+  3736925671U,	// <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+  2309349483U,	// <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3367840468U,	// <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+  3325355551U,	// <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+  3373147752U,	// <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+  2299404390U,	// <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  3701099830U,	// <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+  3767846054U,	// <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+  3826656825U,	// <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+  3373147838U,	// <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+  2299404395U,	// <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2657222758U,	// <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+  3771164219U,	// <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+  2766481000U,	// <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+  2772452978U,	// <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+  2657226038U,	// <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+  3790407528U,	// <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+  2252294074U,	// <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+  2252294148U,	// <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+  2772453023U,	// <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+  2772453030U,	// <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+  3834250930U,	// <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+  2765596349U,	// <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+  2301411430U,	// <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+  2772453070U,	// <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+  2765817560U,	// <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+  2252933050U,	// <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+  2796340968U,	// <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+  2766038771U,	// <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+  3725008998U,	// <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+  3368530217U,	// <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+  3840222989U,	// <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+  2309382246U,	// <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  3725012278U,	// <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+  2766481193U,	// <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+  3842656049U,	// <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+  3327010820U,	// <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+  2766702404U,	// <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+  3713073254U,	// <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+  3789082310U,	// <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+  3840665439U,	// <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+  2766997352U,	// <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+  3713076534U,	// <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+  3791736842U,	// <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+  3373180605U,	// <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+  3793064108U,	// <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+  2767366037U,	// <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+  3701137510U,	// <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+  3701138647U,	// <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+  2254898792U,	// <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+  1248264294U,	// <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  3701140790U,	// <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+  3725029435U,	// <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+  2254899130U,	// <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+  2725294981U,	// <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+  1248264299U,	// <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+  2633375846U,	// <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+  2309407468U,	// <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235666536U,	// <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161923174U,	// <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2633379126U,	// <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+  2309407796U,	// <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309408445U,	// <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309407960U,	// <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161923179U,	// <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+  2633384038U,	// <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+  2309415660U,	// <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235674728U,	// <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161931366U,	// <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+  2633387318U,	// <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+  2769135725U,	// <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+  2309416637U,	// <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309416152U,	// <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161931371U,	// <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+  3777806336U,	// <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+  2704064614U,	// <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  3765862577U,	// <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+  3843393708U,	// <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+  2250516994U,	// <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+  3725054014U,	// <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+  3383093096U,	// <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+  3368495034U,	// <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+  2704065181U,	// <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  2251622550U,	// <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  3777807156U,	// <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+  3765863348U,	// <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+  3373147762U,	// <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+  3834251525U,	// <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+  3373147683U,	// <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+  3391727545U,	// <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+  2299406266U,	// <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+  2251622550U,	// <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  2252294294U,	// <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+  3326036198U,	// <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+  3771836045U,	// <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+  2252294556U,	// <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+  2252294658U,	// <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+  3840739677U,	// <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+  2704066490U,	// <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+  3368511418U,	// <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+  2252294942U,	// <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+  3707158630U,	// <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+  3765864692U,	// <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+  2704066918U,	// <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+  2772453788U,	// <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+  2772453799U,	// <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+  3789752888U,	// <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+  3840739770U,	// <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+  2301413306U,	// <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+  2775108043U,	// <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+  2651340902U,	// <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+  3846195674U,	// <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+  3845974503U,	// <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+  2651343362U,	// <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+  2651344182U,	// <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+  1698712066U,	// <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+  3383125864U,	// <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+  3368527802U,	// <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+  1698933277U,	// <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+  3373179798U,	// <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+  3707176179U,	// <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+  2716012312U,	// <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+  3373180530U,	// <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+  2254309890U,	// <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+  3785773070U,	// <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+  3840739932U,	// <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+  2299439034U,	// <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+  2719994110U,	// <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+  2254899350U,	// <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+  3328641254U,	// <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+  2633443257U,	// <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+  2254899612U,	// <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+  2254899714U,	// <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+  3785773772U,	// <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+  2725966648U,	// <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+  2322007994U,	// <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+  2254899998U,	// <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+  1559707750U,	// <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  2633450292U,	// <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+  1559709626U,	// <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+  1235666546U,	// <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559711030U,	// <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+  2309408291U,	// <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2633454152U,	// <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+  1235666874U,	// <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559713582U,	// <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  1559715942U,	// <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  2633458484U,	// <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+  1559717819U,	// <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+  1235674738U,	// <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559719222U,	// <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+  1701366598U,	// <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+  2633462353U,	// <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+  1235675066U,	// <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559721774U,	// <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  3785777152U,	// <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+  2712035430U,	// <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3771179185U,	// <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+  3846196096U,	// <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+  3785777490U,	// <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+  2250517814U,	// <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+  3324259703U,	// <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+  3383092458U,	// <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+  2712035997U,	// <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3325356946U,	// <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+  3785777972U,	// <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+  3846196170U,	// <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+  3325365380U,	// <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+  3852168155U,	// <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+  2251615542U,	// <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+  3325357432U,	// <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+  3870084088U,	// <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+  2251615785U,	// <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+  2252295058U,	// <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+  3771180605U,	// <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+  3785778792U,	// <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+  3777816253U,	// <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+  2252295376U,	// <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+  1178553654U,	// <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  2252295545U,	// <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+  3326037448U,	// <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+  1178553897U,	// <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+  3785779350U,	// <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+  3383118648U,	// <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+  3777816935U,	// <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+  3785779612U,	// <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+  2712037890U,	// <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+  2252754230U,	// <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3784452764U,	// <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+  3801705178U,	// <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+  2252754473U,	// <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3787770770U,	// <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+  3383126840U,	// <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+  3327380534U,	// <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+  3784453265U,	// <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+  2253630672U,	// <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+  2778426587U,	// <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+  3383128789U,	// <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+  3381799580U,	// <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+  2778647798U,	// <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+  2651422822U,	// <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+  3701277928U,	// <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+  3701278650U,	// <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+  2651425282U,	// <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+  2651426102U,	// <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+  2651426892U,	// <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+  1698712886U,	// <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3725169658U,	// <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+  1698712904U,	// <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2254900114U,	// <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+  3389115192U,	// <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+  3785781727U,	// <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+  3785781810U,	// <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+  2254900432U,	// <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+  1181158710U,	// <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2254900605U,	// <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+  3787772750U,	// <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+  1181158953U,	// <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2639495270U,	// <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+  2639496090U,	// <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+  3707267011U,	// <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+  2639497884U,	// <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+  1237658832U,	// <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235666638U,	// <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  3713241753U,	// <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+  2309409436U,	// <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235666641U,	// <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  2639503462U,	// <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+  2639504282U,	// <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+  3701303226U,	// <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+  2639506077U,	// <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+  1235676368U,	// <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235674830U,	// <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  1698713129U,	// <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2309417628U,	// <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1698713147U,	// <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3775832064U,	// <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+  2702090342U,	// <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3775832241U,	// <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+  3719227906U,	// <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+  3775832402U,	// <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+  3385085146U,	// <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+  2309351938U,	// <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+  3376459134U,	// <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+  2702090909U,	// <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3719233546U,	// <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+  3775832884U,	// <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+  3775832982U,	// <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+  3846196909U,	// <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+  3719236984U,	// <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+  3856150209U,	// <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+  3834252997U,	// <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+  3870084817U,	// <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+  3769861532U,	// <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+  2645500006U,	// <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+  3719242548U,	// <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+  3775833704U,	// <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+  3775833766U,	// <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+  2645503353U,	// <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+  2252296196U,	// <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+  2702092218U,	// <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+  3719246842U,	// <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+  2702092405U,	// <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+  3775834262U,	// <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+  3777161495U,	// <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+  3775834470U,	// <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+  3775834524U,	// <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+  3775834626U,	// <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+  3385109722U,	// <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+  2309376514U,	// <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3775834819U,	// <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+  2309376514U,	// <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3719258214U,	// <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+  3385117586U,	// <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+  3327242008U,	// <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+  3719260674U,	// <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+  3719261563U,	// <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+  2702093622U,	// <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  2309384706U,	// <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+  3870085060U,	// <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+  2702093865U,	// <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  3719266406U,	// <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+  3789106889U,	// <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+  3785789208U,	// <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+  3373183950U,	// <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+  2717355964U,	// <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+  2791772164U,	// <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+  2772455438U,	// <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+  3373183549U,	// <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+  2720010496U,	// <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+  2772455460U,	// <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+  2322008978U,	// <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+  3840225335U,	// <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+  2772455490U,	// <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+  2772455500U,	// <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+  2254901252U,	// <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+  2772455520U,	// <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+  2785874024U,	// <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+  2772455532U,	// <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+  2627625062U,	// <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+  1235667858U,	// <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309409278U,	// <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309407659U,	// <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627628342U,	// <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+  1235668186U,	// <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235667458U,	// <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309407987U,	// <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235667460U,	// <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2627633254U,	// <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+  1235676050U,	// <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309417470U,	// <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309415851U,	// <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627636534U,	// <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+  1235676378U,	// <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235675650U,	// <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309416179U,	// <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235675652U,	// <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2309352751U,	// <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+  1650917478U,	// <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  2250584570U,	// <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+  3846197554U,	// <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+  2724659538U,	// <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+  3725275225U,	// <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+  2791772493U,	// <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+  2309352758U,	// <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  1650918045U,	// <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  3325358368U,	// <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+  2299406449U,	// <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+  2724660118U,	// <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+  3373148518U,	// <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+  3834253712U,	// <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+  3373147953U,	// <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+  2323297080U,	// <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+  2299407670U,	// <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2299407671U,	// <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2252296489U,	// <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+  3326038394U,	// <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+  1178554874U,	// <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724660902U,	// <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+  2252296817U,	// <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+  3840741864U,	// <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+  2252296976U,	// <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+  2785874426U,	// <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+  1178554874U,	// <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724661398U,	// <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+  3375154665U,	// <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+  3375154909U,	// <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+  2301413734U,	// <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+  2772455986U,	// <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+  3375154993U,	// <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+  2323313464U,	// <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+  2301414710U,	// <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2301414711U,	// <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2724662162U,	// <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+  3326939559U,	// <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+  2253271546U,	// <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+  3383127346U,	// <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+  2309385523U,	// <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+  1650920758U,	// <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  2724662653U,	// <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+  2309385526U,	// <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  1650921001U,	// <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  3725312102U,	// <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+  3373180393U,	// <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+  3791769368U,	// <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+  3373181286U,	// <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+  3725315382U,	// <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+  2299439221U,	// <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+  2724663394U,	// <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+  2299440438U,	// <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  2299440439U,	// <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1583808614U,	// <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2254574074U,	// <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+  2322010609U,	// <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+  1583811894U,	// <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2322010773U,	// <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+  363253046U,	// <6,6,6,6>: Cost 1 vspltisw2 RHS
+  1248267574U,	// <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+  363253046U,	// <6,6,6,u>: Cost 1 vspltisw2 RHS
+  2309410095U,	// <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+  2309408233U,	// <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+  2311402373U,	// <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+  2309409126U,	// <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  2309410099U,	// <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+  2309408561U,	// <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+  1237660472U,	// <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  161926454U,	// <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+  161926455U,	// <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+  1583808614U,	// <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1650923310U,	// <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  1178554874U,	// <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2309417318U,	// <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  1583811894U,	// <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1650923674U,	// <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  363253046U,	// <6,6,u,6>: Cost 1 vspltisw2 RHS
+  161934646U,	// <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+  161934647U,	// <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+  1638318080U,	// <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564576358U,	// <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712060077U,	// <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712060156U,	// <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638318418U,	// <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577865314U,	// <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+  2712060406U,	// <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2651608058U,	// <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+  564576925U,	// <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712060643U,	// <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638318900U,	// <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638318998U,	// <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  3766559753U,	// <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+  2712060971U,	// <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712061039U,	// <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712061135U,	// <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  3373148612U,	// <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+  1638319484U,	// <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712061373U,	// <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712061471U,	// <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638319720U,	// <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638319782U,	// <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712061709U,	// <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712061800U,	// <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1638320058U,	// <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252297836U,	// <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+  1638320187U,	// <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638320278U,	// <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712062182U,	// <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2712062256U,	// <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+  1638320540U,	// <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638320642U,	// <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712062546U,	// <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712062584U,	// <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2712062659U,	// <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+  1638320926U,	// <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638321042U,	// <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712062922U,	// <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712063029U,	// <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712063108U,	// <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638321360U,	// <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564579638U,	// <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712063357U,	// <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+  2712063439U,	// <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+  564579881U,	// <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712063560U,	// <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714054287U,	// <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712063742U,	// <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  3373181295U,	// <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+  2712063924U,	// <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638322180U,	// <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638322274U,	// <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  3373181380U,	// <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+  1640313092U,	// <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712064289U,	// <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712064423U,	// <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638322682U,	// <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712064562U,	// <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712064653U,	// <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712064747U,	// <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638323000U,	// <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638323022U,	// <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638323168U,	// <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+  1237659746U,	// <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309411158U,	// <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+  2639718330U,	// <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+  1235669498U,	// <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1237659750U,	// <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309411243U,	// <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+  1583895362U,	// <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+  1235669826U,	// <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  1235669503U,	// <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+  1638323923U,	// <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564582190U,	// <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638324101U,	// <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638324156U,	// <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638324287U,	// <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564582554U,	// <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638324432U,	// <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  1235678018U,	// <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  564582757U,	// <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  1638326272U,	// <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564584550U,	// <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712068269U,	// <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2309349532U,	// <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  1638326610U,	// <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577939051U,	// <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+  2712068598U,	// <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2309352776U,	// <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  564585117U,	// <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712068835U,	// <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638327092U,	// <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1698715438U,	// <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2299404444U,	// <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2712069163U,	// <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712069231U,	// <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712069327U,	// <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  2299407688U,	// <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  1698715492U,	// <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2712069565U,	// <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  1178556206U,	// <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  1638327912U,	// <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638327974U,	// <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712069901U,	// <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  1178556570U,	// <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  1638328250U,	// <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252298496U,	// <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+  1638328379U,	// <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638328470U,	// <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712070374U,	// <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2704107883U,	// <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+  1638328732U,	// <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638328834U,	// <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712070738U,	// <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712070776U,	// <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2301414728U,	// <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  1638329118U,	// <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638329234U,	// <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712071114U,	// <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712071221U,	// <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2309382300U,	// <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  1638329552U,	// <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564587831U,	// <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712071545U,	// <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2309385544U,	// <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  564588073U,	// <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712071752U,	// <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714062479U,	// <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712071934U,	// <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2299437212U,	// <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+  2712072116U,	// <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638330372U,	// <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1698715802U,	// <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2299440456U,	// <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1698715820U,	// <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  1583808614U,	// <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1181161262U,	// <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1638330874U,	// <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1248264348U,	// <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  1583811894U,	// <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1181161626U,	// <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  363253046U,	// <6,u,6,6>: Cost 1 vspltisw2 RHS
+  1638331214U,	// <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  363253046U,	// <6,u,6,u>: Cost 1 vspltisw2 RHS
+  1560076390U,	// <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+  1235664969U,	// <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1560078311U,	// <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+  161923228U,	// <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+  1560079670U,	// <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+  1235665297U,	// <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235667485U,	// <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  161926472U,	// <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+  161923233U,	// <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+  1560084582U,	// <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+  564590382U,	// <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1560086504U,	// <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+  161931420U,	// <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+  1560087862U,	// <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+  564590746U,	// <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+  363253046U,	// <6,u,u,6>: Cost 1 vspltisw2 RHS
+  161934664U,	// <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+  161931425U,	// <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+  1705426944U,	// <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705426954U,	// <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+  3713550266U,	// <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+  2316063892U,	// <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+  2779168805U,	// <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+  2663698530U,	// <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2657727309U,	// <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+  2316064220U,	// <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+  1705427017U,	// <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+  1583988838U,	// <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+  2779168859U,	// <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+  631685222U,	// <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2639817411U,	// <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+  1583992118U,	// <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+  2657734660U,	// <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+  1583993678U,	// <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+  2657735672U,	// <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+  631685276U,	// <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779168933U,	// <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+  2767667377U,	// <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+  2718713448U,	// <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+  2718713510U,	// <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+  3841409228U,	// <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+  3852910802U,	// <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+  2718713786U,	// <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+  3847160036U,	// <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+  2767667440U,	// <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+  2718714006U,	// <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+  2779169020U,	// <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+  3852910853U,	// <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+  2718714268U,	// <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+  2718714370U,	// <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+  2718714461U,	// <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+  2706770608U,	// <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+  3847160114U,	// <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+  2779169083U,	// <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+  2718714770U,	// <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+  1705427282U,	// <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+  3713583034U,	// <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+  3713583814U,	// <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+  2779169133U,	// <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+  1644973366U,	// <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  2657760081U,	// <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+  2259468868U,	// <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+  1705427345U,	// <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+  2718715508U,	// <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+  2260123750U,	// <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+  3792457451U,	// <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+  3852911024U,	// <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+  2718715836U,	// <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+  2718715908U,	// <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+  1644974178U,	// <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+  3792457853U,	// <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+  1646301444U,	// <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+  2720706901U,	// <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+  2779169270U,	// <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+  2718716410U,	// <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+  2722697800U,	// <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+  3852911121U,	// <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+  3852911130U,	// <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+  2718716728U,	// <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+  2718716750U,	// <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+  2779169333U,	// <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+  2718716922U,	// <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+  1187872870U,	// <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2718717076U,	// <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+  3847160408U,	// <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+  2718717286U,	// <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+  2718717377U,	// <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+  2718717404U,	// <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+  2718717478U,	// <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+  1187873437U,	// <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+  1584046182U,	// <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+  1705427602U,	// <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+  631685789U,	// <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+  2639874762U,	// <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+  1584049462U,	// <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+  1644976282U,	// <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  1584051029U,	// <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+  2718718208U,	// <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+  631685843U,	// <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+  2721374218U,	// <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+  2779169507U,	// <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+  2779169516U,	// <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+  3852911348U,	// <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+  2669743414U,	// <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+  2316058962U,	// <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+  2316059044U,	// <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+  2669745146U,	// <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+  2779169570U,	// <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+  2779169579U,	// <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+  1705427764U,	// <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169598U,	// <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+  3713632972U,	// <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+  2779169619U,	// <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+  2779169628U,	// <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+  2657809239U,	// <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+  3835290474U,	// <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+  1705427764U,	// <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169660U,	// <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+  2779169671U,	// <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+  2779169680U,	// <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+  1705427862U,	// <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+  2779169700U,	// <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+  2779169707U,	// <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+  2657817432U,	// <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+  2803057594U,	// <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+  1705427907U,	// <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+  3776538827U,	// <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+  2319400970U,	// <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+  2316085398U,	// <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+  3852911591U,	// <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+  3852911600U,	// <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+  2319401298U,	// <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+  3833668617U,	// <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+  3367265487U,	// <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+  2319400977U,	// <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+  2724031378U,	// <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+  2779169835U,	// <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+  2779169844U,	// <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+  3852911672U,	// <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+  2669776182U,	// <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+  2779169872U,	// <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+  3835290712U,	// <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+  2669778278U,	// <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+  2779169898U,	// <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+  2779169903U,	// <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+  3835585661U,	// <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+  3841410182U,	// <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+  3852911753U,	// <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+  2779169943U,	// <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+  2318754130U,	// <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+  2718724195U,	// <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+  3859178670U,	// <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+  2779169975U,	// <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+  2720715094U,	// <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+  2761549007U,	// <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+  2779170008U,	// <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+  3835438305U,	// <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+  3835512042U,	// <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+  2761843955U,	// <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+  3835659516U,	// <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+  2803057918U,	// <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+  2762065166U,	// <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+  2669797478U,	// <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+  2322087946U,	// <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+  2317448186U,	// <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+  3395829934U,	// <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+  2669800758U,	// <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+  2322088274U,	// <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+  3375923377U,	// <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+  2731996780U,	// <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+  2322087953U,	// <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+  2779170146U,	// <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+  1705427764U,	// <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779170164U,	// <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+  1705428348U,	// <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+  2779170186U,	// <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+  2763171221U,	// <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+  2657866590U,	// <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+  2803058080U,	// <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+  1705428393U,	// <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+  3713695846U,	// <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+  2779170237U,	// <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+  2779170245U,	// <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+  1242316902U,	// <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3713699126U,	// <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+  3852912096U,	// <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+  2767668713U,	// <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+  2256488426U,	// <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+  1242316907U,	// <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3852912132U,	// <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+  3852912141U,	// <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+  3852912149U,	// <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+  2779170335U,	// <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+  3852912172U,	// <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+  3840747062U,	// <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+  3841410617U,	// <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+  3795125538U,	// <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+  2779170380U,	// <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+  2779170389U,	// <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+  3852912222U,	// <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+  1705428584U,	// <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705428594U,	// <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+  2779170429U,	// <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+  3852912259U,	// <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+  2767668880U,	// <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+  3841336981U,	// <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+  1705428639U,	// <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+  1705428646U,	// <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+  2779170479U,	// <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+  2767668925U,	// <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+  1245659238U,	// <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705428686U,	// <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+  2779170519U,	// <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+  2657899362U,	// <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+  2319406574U,	// <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+  1705428718U,	// <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+  3713728614U,	// <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+  3852912388U,	// <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+  2779170573U,	// <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+  1242349670U,	// <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3713731894U,	// <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+  2779170601U,	// <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+  2767669041U,	// <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+  3389834456U,	// <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+  1242349675U,	// <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3852912456U,	// <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+  3852912466U,	// <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+  3852912475U,	// <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+  2779170664U,	// <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+  3852912496U,	// <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+  3792474116U,	// <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+  2718732388U,	// <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+  3841337228U,	// <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+  2779170709U,	// <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+  2640003174U,	// <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+  2721386920U,	// <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+  2767595441U,	// <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+  1693927354U,	// <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+  2640006454U,	// <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+  3841558476U,	// <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+  2657923941U,	// <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+  3841337310U,	// <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+  1694296039U,	// <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+  2803058666U,	// <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+  3852912632U,	// <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+  2322089576U,	// <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+  1248346214U,	// <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  3841337362U,	// <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+  3395830836U,	// <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+  2261616570U,	// <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+  3371943857U,	// <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+  1248346219U,	// <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705429051U,	// <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+  2779170884U,	// <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+  1705428584U,	// <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1695254620U,	// <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+  1705429091U,	// <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+  2779170924U,	// <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+  2767669361U,	// <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+  2803058809U,	// <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+  1695623305U,	// <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+  2779170955U,	// <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+  1705429142U,	// <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+  2634057732U,	// <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+  2779170983U,	// <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+  2779170992U,	// <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+  3852912829U,	// <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+  2657948520U,	// <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+  2316060602U,	// <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+  1705429205U,	// <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+  3852912860U,	// <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+  2779171046U,	// <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+  2779171057U,	// <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+  3852912887U,	// <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+  3852912896U,	// <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+  3852912905U,	// <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+  3835291923U,	// <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+  3841411356U,	// <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+  2779171111U,	// <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+  2779171120U,	// <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+  3852912952U,	// <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+  2779171137U,	// <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+  2779171144U,	// <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+  2779171156U,	// <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+  3852912989U,	// <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+  2767669606U,	// <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+  2767669615U,	// <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+  2779171189U,	// <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+  2779171198U,	// <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+  3852913032U,	// <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+  2704140655U,	// <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+  1705429404U,	// <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171238U,	// <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+  3852913070U,	// <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+  2657973099U,	// <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+  2767669700U,	// <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+  1705429404U,	// <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171280U,	// <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+  2779171290U,	// <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+  2634090504U,	// <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+  2779171311U,	// <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+  2779171319U,	// <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+  1705429506U,	// <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+  2722057593U,	// <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+  2316093370U,	// <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+  1705429533U,	// <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+  3852913185U,	// <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+  3795799695U,	// <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+  3852913203U,	// <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+  3852913214U,	// <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+  3852913225U,	// <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+  2779171410U,	// <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+  2718740581U,	// <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+  3841411685U,	// <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+  2720067847U,	// <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+  2773420664U,	// <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+  3847236225U,	// <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+  1648316922U,	// <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+  2773641875U,	// <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+  2773715612U,	// <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+  3847531173U,	// <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+  2722059024U,	// <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+  2767669943U,	// <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+  1652298720U,	// <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+  2767669955U,	// <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+  3841411788U,	// <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+  2767669978U,	// <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+  2722059546U,	// <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+  2767669995U,	// <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+  3852913396U,	// <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+  2722059758U,	// <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+  2302183354U,	// <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+  2767670027U,	// <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+  2774747930U,	// <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+  1705429790U,	// <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+  1660262316U,	// <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+  1705429404U,	// <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2775042878U,	// <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+  1705429830U,	// <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+  2779171660U,	// <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+  2767670101U,	// <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+  1705429853U,	// <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+  2718744576U,	// <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+  1645002854U,	// <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  3852913527U,	// <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+  3852913536U,	// <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+  2316061904U,	// <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+  1705429906U,	// <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658022257U,	// <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+  2256489928U,	// <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1707420589U,	// <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+  3852913590U,	// <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+  2718745396U,	// <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+  2779171786U,	// <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+  3852913616U,	// <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+  3852913627U,	// <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+  2779171810U,	// <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+  3792487631U,	// <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+  3394456220U,	// <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+  2779171837U,	// <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+  3852913673U,	// <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+  3852913682U,	// <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+  2718746216U,	// <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+  2718746278U,	// <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+  2779171885U,	// <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+  2779171893U,	// <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+  2718746554U,	// <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+  3847457864U,	// <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+  2779171921U,	// <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+  2718746774U,	// <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+  3852913762U,	// <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+  3852913772U,	// <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+  2718747036U,	// <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+  2718747138U,	// <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+  2779171972U,	// <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+  2706803380U,	// <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+  3847457946U,	// <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+  2781162655U,	// <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+  2718747538U,	// <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+  3852913842U,	// <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+  3852913852U,	// <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+  2316096696U,	// <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+  1705430224U,	// <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705430234U,	// <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+  2658055029U,	// <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+  2316097024U,	// <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+  1707420917U,	// <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+  1584316518U,	// <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+  2658059060U,	// <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+  2640144314U,	// <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+  2640145131U,	// <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+  1584319798U,	// <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+  2779172134U,	// <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+  631688502U,	// <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2658063354U,	// <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+  631688520U,	// <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+  3852914001U,	// <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+  3852914010U,	// <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+  2718749178U,	// <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+  2722730572U,	// <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+  2723394205U,	// <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+  2779172221U,	// <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+  2718749496U,	// <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+  2718749518U,	// <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+  2779172249U,	// <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+  2718749690U,	// <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+  3847458214U,	// <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+  2718749880U,	// <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+  3847458236U,	// <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+  2718750004U,	// <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+  1187876150U,	// <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2718750208U,	// <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+  2718750286U,	// <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+  1187876393U,	// <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+  1584341094U,	// <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+  1645008686U,	// <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  2640168890U,	// <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+  2640169710U,	// <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+  1584344374U,	// <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+  1705430554U,	// <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+  631688745U,	// <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+  2718750976U,	// <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+  631688763U,	// <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+  2646147174U,	// <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+  2779172424U,	// <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+  3852914258U,	// <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+  3852914268U,	// <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+  2779172450U,	// <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+  2316061914U,	// <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+  2316061186U,	// <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+  2646152186U,	// <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+  2779172486U,	// <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+  2781163151U,	// <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+  2321378194U,	// <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+  3852914339U,	// <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+  3852914350U,	// <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+  2781163191U,	// <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+  3852914363U,	// <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+  3835588297U,	// <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+  3835588306U,	// <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+  2781163223U,	// <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+  3852914400U,	// <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+  2781163243U,	// <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+  3852914419U,	// <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+  2779172606U,	// <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+  3780552497U,	// <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+  2781163279U,	// <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2779172632U,	// <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+  3835588385U,	// <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+  2779172650U,	// <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+  3852914481U,	// <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+  2319403922U,	// <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+  2319404409U,	// <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  3852914510U,	// <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+  3779226131U,	// <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+  2319404250U,	// <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+  2319403522U,	// <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+  3852914547U,	// <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+  2319403524U,	// <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+  2646179942U,	// <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+  2316094354U,	// <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+  3852914582U,	// <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+  3852914592U,	// <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+  2646183372U,	// <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+  2779172788U,	// <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+  2316093954U,	// <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+  2646185318U,	// <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+  2779172815U,	// <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+  2781163475U,	// <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+  2781163484U,	// <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+  3852914662U,	// <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+  3852914672U,	// <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+  2781163515U,	// <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+  1705431044U,	// <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172878U,	// <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+  3835588632U,	// <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+  1705431044U,	// <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172900U,	// <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+  2781163571U,	// <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+  3852914743U,	// <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+  2779172930U,	// <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+  2779172940U,	// <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+  2781163607U,	// <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  2779172960U,	// <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+  1705431138U,	// <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+  1705578603U,	// <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+  2646204518U,	// <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2322090898U,	// <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+  3719947880U,	// <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+  3719948438U,	// <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+  2646207951U,	// <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+  2322091226U,	// <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+  2322090498U,	// <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+  2646210156U,	// <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+  2646210350U,	// <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2779173062U,	// <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+  2779173072U,	// <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+  2319404409U,	// <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  2779173092U,	// <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+  2779173101U,	// <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+  1705431044U,	// <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779173118U,	// <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+  1705578756U,	// <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+  1707421965U,	// <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+  3852914966U,	// <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+  2779173153U,	// <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+  2256491002U,	// <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+  3852914994U,	// <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+  3852915003U,	// <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+  2316062652U,	// <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316063544U,	// <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+  1242320182U,	// <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1242320183U,	// <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+  3852915048U,	// <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+  3377866217U,	// <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+  3852915068U,	// <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+  3833672072U,	// <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+  3852915088U,	// <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+  3395122056U,	// <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+  3389813560U,	// <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+  2779173287U,	// <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+  2779320752U,	// <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+  2658181222U,	// <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+  3852915140U,	// <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+  2257973754U,	// <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+  3841413589U,	// <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+  2658184502U,	// <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+  3852915176U,	// <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+  2658186117U,	// <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+  1705431546U,	// <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+  1705579011U,	// <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+  3714015334U,	// <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+  3777243425U,	// <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+  2319405957U,	// <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+  3375229286U,	// <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+  2779173426U,	// <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+  3375228721U,	// <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+  2319405880U,	// <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+  1245662518U,	// <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1245662519U,	// <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+  3852915291U,	// <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+  3389834729U,	// <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+  2259472890U,	// <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+  3852915321U,	// <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+  3852915330U,	// <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+  2779173517U,	// <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+  2316096312U,	// <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+  1242352950U,	// <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1242352951U,	// <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+  3852915372U,	// <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+  3835294392U,	// <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+  3852915395U,	// <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+  3852915404U,	// <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+  3852915412U,	// <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+  3377899313U,	// <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+  2718765160U,	// <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+  2779173611U,	// <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+  2779321076U,	// <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+  2658213990U,	// <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+  3852915462U,	// <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+  2718765562U,	// <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+  3714042622U,	// <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+  2658217270U,	// <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+  2724074224U,	// <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+  1705431864U,	// <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705431874U,	// <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+  1705579339U,	// <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+  1705431886U,	// <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+  2779173719U,	// <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+  2779173729U,	// <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+  2779173736U,	// <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+  1705431926U,	// <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+  2779173759U,	// <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+  2779173765U,	// <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+  1248349494U,	// <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+  1705431958U,	// <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+  1705579423U,	// <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+  2779173801U,	// <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+  2779321266U,	// <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+  2779321273U,	// <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+  1705579463U,	// <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+  2779173841U,	// <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+  1705431864U,	// <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705432032U,	// <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+  1705579495U,	// <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+  1242320994U,	// <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705432058U,	// <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+  3841414146U,	// <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+  2316063226U,	// <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+  2779173908U,	// <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+  2658242658U,	// <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+  2658243468U,	// <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+  2316063554U,	// <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+  1705432121U,	// <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+  3852915777U,	// <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+  2779173962U,	// <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+  2779173973U,	// <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+  3389813242U,	// <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+  3852915813U,	// <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+  3852915821U,	// <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+  3835294839U,	// <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+  2329343596U,	// <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+  2779174027U,	// <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+  2803061908U,	// <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+  3852915869U,	// <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+  2779174053U,	// <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+  2779174060U,	// <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+  2803061944U,	// <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+  3852915905U,	// <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+  2767672522U,	// <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+  2791855315U,	// <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+  2768999644U,	// <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+  2779174115U,	// <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+  3852915948U,	// <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+  3841414394U,	// <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+  1245663738U,	// <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174155U,	// <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+  3852915988U,	// <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+  2706827959U,	// <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+  2319405890U,	// <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+  1245663738U,	// <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174200U,	// <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+  3852916030U,	// <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+  3714099130U,	// <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+  2316095994U,	// <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+  1242353766U,	// <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705432422U,	// <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+  2658276240U,	// <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+  2316096322U,	// <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+  1705432449U,	// <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+  3852916101U,	// <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+  3854906765U,	// <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+  3852916121U,	// <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+  3389846010U,	// <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+  3852916141U,	// <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+  2779174326U,	// <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+  2779174337U,	// <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+  2329376364U,	// <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+  2779321811U,	// <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+  2658287718U,	// <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+  3852916197U,	// <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+  2779174382U,	// <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+  2316112378U,	// <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+  2658290998U,	// <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+  3852916233U,	// <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+  1651004226U,	// <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+  2779174420U,	// <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+  1652331492U,	// <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+  1590526054U,	// <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+  2328728623U,	// <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+  2724746451U,	// <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+  2322092538U,	// <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+  1590529334U,	// <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+  2328728951U,	// <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+  2724746770U,	// <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+  430361910U,	// <7,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,7,u>: Cost 1 vspltisw3 RHS
+  1242320994U,	// <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705580162U,	// <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+  2779321996U,	// <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+  1245663738U,	// <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  1242353766U,	// <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705580202U,	// <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+  1662949620U,	// <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+  430361910U,	// <7,7,u,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,u,u>: Cost 1 vspltisw3 RHS
+  1705426944U,	// <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705432787U,	// <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+  2316060885U,	// <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+  1242316956U,	// <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  2779174637U,	// <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+  1182750874U,	// <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+  2316061213U,	// <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+  1242320200U,	// <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1705432850U,	// <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+  1584578662U,	// <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+  1705427764U,	// <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  631691054U,	// <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2640407307U,	// <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+  1584581942U,	// <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+  2779174726U,	// <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+  1584583574U,	// <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+  2779322201U,	// <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+  631691108U,	// <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779174763U,	// <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+  2779174774U,	// <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+  1705428584U,	// <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705432965U,	// <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+  2779174801U,	// <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+  2779174810U,	// <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+  2767673251U,	// <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+  1705580460U,	// <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+  1705433010U,	// <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+  1705433020U,	// <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+  2779174853U,	// <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+  2767673299U,	// <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+  1245659292U,	// <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705433060U,	// <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+  2779174893U,	// <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+  2706836152U,	// <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+  1245662536U,	// <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1705433092U,	// <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+  2779174925U,	// <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+  1185732398U,	// <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+  2316093653U,	// <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+  1242349724U,	// <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  1705430224U,	// <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705433151U,	// <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+  2316093981U,	// <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+  1242352968U,	// <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1705433178U,	// <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+  1584611430U,	// <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+  2781165670U,	// <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+  2640439226U,	// <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+  2640440079U,	// <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+  1584614710U,	// <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+  1705431044U,	// <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  631691418U,	// <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2779322525U,	// <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+  631691436U,	// <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+  2779175087U,	// <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+  2779175102U,	// <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+  1648357887U,	// <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+  1705433296U,	// <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+  2779175127U,	// <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+  2779175138U,	// <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+  1651012419U,	// <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+  1705580788U,	// <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+  1705433341U,	// <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+  1705580800U,	// <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+  1187878702U,	// <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2768042263U,	// <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+  1248346268U,	// <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705580840U,	// <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+  1187879066U,	// <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2779322679U,	// <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+  430361910U,	// <7,u,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,u,7,u>: Cost 1 vspltisw3 RHS
+  1705433425U,	// <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+  1705433435U,	// <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+  631691621U,	// <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+  1705433451U,	// <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+  1705433465U,	// <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+  1705433475U,	// <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+  631691661U,	// <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+  430361910U,	// <7,u,u,7>: Cost 1 vspltisw3 RHS
+  631691675U,	// <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+  202162278U,	// <u,0,0,0>: Cost 1 vspltisw0 LHS
+  1678598154U,	// <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2634500154U,	// <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+  2289596269U,	// <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+  1548815670U,	// <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+  2663698530U,	// <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2658390942U,	// <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+  2289596597U,	// <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+  202162278U,	// <u,0,0,u>: Cost 1 vspltisw0 LHS
+  1560764518U,	// <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+  115720294U,	// <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+  604856427U,	// <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2634508438U,	// <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+  1560767798U,	// <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+  2652426438U,	// <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+  1584657311U,	// <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+  2658399226U,	// <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+  604856476U,	// <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696889850U,	// <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+  1190174822U,	// <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  2692245096U,	// <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+  2692245158U,	// <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+  2263916882U,	// <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+  2299709908U,	// <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  2692245434U,	// <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+  2701535281U,	// <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+  1190175389U,	// <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+  1209237504U,	// <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1209239206U,	// <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2704189813U,	// <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+  2692245916U,	// <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+  2282981033U,	// <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2664386658U,	// <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+  2691877496U,	// <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  2664388218U,	// <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+  1209239213U,	// <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  2289623040U,	// <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+  1678598482U,	// <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2634532926U,	// <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+  2235580672U,	// <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  1143619922U,	// <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1618505014U,	// <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  2658423714U,	// <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+  2713259464U,	// <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1683243409U,	// <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  1192443904U,	// <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  118702182U,	// <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2266185901U,	// <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2640513816U,	// <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+  1192444242U,	// <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2718789636U,	// <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+  1645047915U,	// <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+  2664404604U,	// <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+  118702749U,	// <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+  2302910464U,	// <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+  1192886374U,	// <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  2718790138U,	// <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+  2722771537U,	// <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+  2266628434U,	// <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+  2248950180U,	// <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  2718790456U,	// <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+  2718790478U,	// <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+  1192886941U,	// <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1235812352U,	// <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235814054U,	// <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  2728080601U,	// <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+  2640530202U,	// <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+  2640530742U,	// <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+  2309556692U,	// <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  2730735133U,	// <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+  2309556856U,	// <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235814061U,	// <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  202162278U,	// <u,0,u,0>: Cost 1 vspltisw0 LHS
+  120365158U,	// <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+  604856989U,	// <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2692249532U,	// <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+  1560825142U,	// <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+  1618507930U,	// <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  1584714662U,	// <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+  2309565048U,	// <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  604857043U,	// <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  1611210825U,	// <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+  1616519270U,	// <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+  2287605459U,	// <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+  2640546588U,	// <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+  2622631222U,	// <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+  2289590610U,	// <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+  2664436630U,	// <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+  2664437376U,	// <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+  1616519889U,	// <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+  1548894866U,	// <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+  269271142U,	// <u,1,1,1>: Cost 1 vspltisw1 LHS
+  1189462934U,	// <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2622638230U,	// <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+  1548897590U,	// <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+  2756985692U,	// <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  2658472872U,	// <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+  2287614142U,	// <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+  269271142U,	// <u,1,1,u>: Cost 1 vspltisw1 LHS
+  1566818406U,	// <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+  2756985735U,	// <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  1148371862U,	// <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  835584U,	// <u,1,2,3>: Cost 0 copy LHS
+  1566821686U,	// <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+  2756985771U,	// <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  2690262970U,	// <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+  1590711938U,	// <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+  835584U,	// <u,1,2,u>: Cost 0 copy LHS
+  2282979337U,	// <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1209237514U,	// <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1209239702U,	// <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282979502U,	// <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282979341U,	// <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1209237842U,	// <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282979505U,	// <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287625423U,	// <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1209237521U,	// <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  1635101613U,	// <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+  2289623050U,	// <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+  2289625238U,	// <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+  2640579360U,	// <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+  2622663990U,	// <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+  1616522550U,	// <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  2664469398U,	// <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+  2664470148U,	// <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+  1616522793U,	// <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  1548927638U,	// <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+  1192444724U,	// <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1192444822U,	// <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2622670998U,	// <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+  1548930358U,	// <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+  1210728786U,	// <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2714153058U,	// <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+  2670449658U,	// <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+  1548932910U,	// <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+  2622677655U,	// <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+  2756986063U,	// <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2302912662U,	// <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+  3696421014U,	// <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+  2622680374U,	// <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+  2756986099U,	// <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  2714153784U,	// <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+  1651692438U,	// <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+  1652356071U,	// <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+  2628657254U,	// <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+  1235812362U,	// <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235814550U,	// <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309554350U,	// <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2628660534U,	// <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+  1235812690U,	// <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309554353U,	// <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309554678U,	// <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235812369U,	// <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  1548952217U,	// <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+  269271142U,	// <u,1,u,1>: Cost 1 vspltisw1 LHS
+  1209280662U,	// <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  835584U,	// <u,1,u,3>: Cost 0 copy LHS
+  1548954934U,	// <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+  1209278802U,	// <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2283020465U,	// <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  1590761096U,	// <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+  835584U,	// <u,1,u,u>: Cost 0 copy LHS
+  2702876672U,	// <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+  1629134950U,	// <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+  2289591912U,	// <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+  1215848550U,	// <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2702877010U,	// <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+  2289222708U,	// <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2779178473U,	// <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+  2726249024U,	// <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1215848555U,	// <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2690933539U,	// <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+  2628683124U,	// <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+  1189463656U,	// <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1213866086U,	// <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  2628685110U,	// <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+  2263205736U,	// <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+  1189463994U,	// <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2263205866U,	// <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1213866091U,	// <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1556938854U,	// <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2697569869U,	// <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+  336380006U,	// <u,2,2,2>: Cost 1 vspltisw2 LHS
+  1678599794U,	// <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  1556942134U,	// <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2702878650U,	// <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+  2300229831U,	// <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+  336380006U,	// <u,2,2,u>: Cost 1 vspltisw2 LHS
+  475243165U,	// <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1548985140U,	// <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1209239144U,	// <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135495782U,	// <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+  475245878U,	// <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1596764164U,	// <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1596764666U,	// <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1596765178U,	// <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135495787U,	// <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2708851630U,	// <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+  2217362979U,	// <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2289624680U,	// <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+  1215881318U,	// <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2726767824U,	// <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+  1629138230U,	// <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  2779178801U,	// <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+  2726251976U,	// <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1215881323U,	// <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2628714598U,	// <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+  2628715896U,	// <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+  1192445544U,	// <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1213898854U,	// <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2628717878U,	// <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+  2726768644U,	// <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+  1192445882U,	// <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2266187754U,	// <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1213898859U,	// <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2634694758U,	// <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+  2721460657U,	// <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+  2296940136U,	// <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+  1678600122U,	// <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2634698038U,	// <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+  3370682125U,	// <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+  1157056442U,	// <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725442455U,	// <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+  1678600167U,	// <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  1653027897U,	// <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+  2309554924U,	// <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235813992U,	// <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  162070630U,	// <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2634706230U,	// <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+  2309555252U,	// <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309555901U,	// <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309555416U,	// <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  162070635U,	// <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+  475284130U,	// <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1549026100U,	// <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  336380006U,	// <u,2,u,2>: Cost 1 vspltisw2 LHS
+  135536742U,	// <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+  475286838U,	// <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1629141146U,	// <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  1194108858U,	// <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  1596806138U,	// <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135536747U,	// <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611890688U,	// <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  538149020U,	// <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685632685U,	// <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685632764U,	// <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611891026U,	// <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2733408722U,	// <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2658612153U,	// <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+  2289592250U,	// <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+  538149533U,	// <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1189464214U,	// <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  1611891508U,	// <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611891606U,	// <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  1189464476U,	// <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1189464578U,	// <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2690278511U,	// <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2690278607U,	// <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2287609786U,	// <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+  1611892092U,	// <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2685634042U,	// <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+  2685634079U,	// <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611892328U,	// <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611892390U,	// <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2685634371U,	// <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+  2685634453U,	// <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+  1611892666U,	// <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2300225466U,	// <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+  1611892795U,	// <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+  1209238422U,	// <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282980247U,	// <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+  1561004120U,	// <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+  403488870U,	// <u,3,3,3>: Cost 1 vspltisw3 LHS
+  1209238426U,	// <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282980899U,	// <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2282985598U,	// <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1209239482U,	// <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  403488870U,	// <u,3,3,u>: Cost 1 vspltisw3 LHS
+  1555038310U,	// <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+  1555039616U,	// <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+  2628781672U,	// <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+  2289624690U,	// <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+  1555041590U,	// <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+  538152246U,	// <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2658644925U,	// <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+  2289625018U,	// <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+  538152489U,	// <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1192446102U,	// <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2733411983U,	// <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2634762330U,	// <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+  1192446364U,	// <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1192446466U,	// <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  1659670532U,	// <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659670626U,	// <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  2287642554U,	// <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+  1659670788U,	// <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2634768486U,	// <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+  2733412775U,	// <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1648390659U,	// <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+  2634770973U,	// <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+  2634771766U,	// <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+  2733413099U,	// <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659671352U,	// <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659671374U,	// <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1652372457U,	// <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+  1561034854U,	// <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  2634777396U,	// <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+  1561036892U,	// <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+  1235814002U,	// <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1561038134U,	// <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+  2309555747U,	// <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2309556072U,	// <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+  1235814330U,	// <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1561040686U,	// <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  1611896531U,	// <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+  538154798U,	// <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1611896712U,	// <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+  403488870U,	// <u,3,u,3>: Cost 1 vspltisw3 LHS
+  1611896895U,	// <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+  538155162U,	// <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1611897040U,	// <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1209280442U,	// <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  538155365U,	// <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  1165118354U,	// <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1618534502U,	// <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  2634795102U,	// <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+  2686451968U,	// <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2692276562U,	// <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+  1705438098U,	// <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658685890U,	// <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+  2256489928U,	// <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1618535069U,	// <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1189464978U,	// <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2692277044U,	// <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+  1618535367U,	// <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+  2640775992U,	// <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+  1189465296U,	// <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  115723574U,	// <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2263207289U,	// <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2664666780U,	// <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+  115723817U,	// <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+  2263919506U,	// <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+  2222115812U,	// <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  2692277864U,	// <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+  2692277926U,	// <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+  2324114640U,	// <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+  1190178102U,	// <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278202U,	// <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+  2701568053U,	// <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+  1190178345U,	// <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278422U,	// <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+  2282981552U,	// <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2704222585U,	// <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+  2692278684U,	// <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+  1257016528U,	// <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1209239246U,	// <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2691910300U,	// <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  2664683166U,	// <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+  1209239249U,	// <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  1573027942U,	// <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+  2634826695U,	// <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+  2634827874U,	// <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+  2289629073U,	// <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+  229035318U,	// <u,4,4,4>: Cost 1 vspltisw0 RHS
+  1618537782U,	// <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+  2658718662U,	// <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+  2289629401U,	// <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+  229035318U,	// <u,4,4,u>: Cost 1 vspltisw0 RHS
+  1561092198U,	// <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+  2628863370U,	// <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+  1561094243U,	// <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+  2634836118U,	// <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+  1561095478U,	// <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+  118705462U,	// <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+  604859702U,	// <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2658726906U,	// <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+  604859720U,	// <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2266631058U,	// <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+  2302692152U,	// <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  2718822906U,	// <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+  2722804309U,	// <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+  2723467942U,	// <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+  1192889654U,	// <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2718823224U,	// <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+  2718823246U,	// <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+  1192889897U,	// <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2640822374U,	// <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+  2640823194U,	// <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+  2728113373U,	// <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+  2640825150U,	// <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+  1235815632U,	// <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235814094U,	// <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  2730767905U,	// <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+  2309556892U,	// <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235814097U,	// <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  1561116774U,	// <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+  1618540334U,	// <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1561118822U,	// <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+  2692282300U,	// <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+  229035318U,	// <u,4,u,4>: Cost 1 vspltisw0 RHS
+  120368438U,	// <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+  604859945U,	// <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2309565084U,	// <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  604859963U,	// <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2690293760U,	// <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+  1616552038U,	// <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2640840434U,	// <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+  2640841536U,	// <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+  1613381970U,	// <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2316135642U,	// <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+  2289592834U,	// <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+  2664732324U,	// <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+  1616552661U,	// <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+  1573077094U,	// <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  1237536282U,	// <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2690294678U,	// <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+  2646821014U,	// <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+  1573080602U,	// <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+  1189466116U,	// <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1189466210U,	// <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2646823930U,	// <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+  1573082926U,	// <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  2640855142U,	// <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+  2697594448U,	// <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+  2690295400U,	// <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+  1625179890U,	// <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+  2699585347U,	// <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+  2781171471U,	// <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2690295738U,	// <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+  3775318070U,	// <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+  1628498055U,	// <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+  2287627234U,	// <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1257016210U,	// <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2646836942U,	// <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+  2287625131U,	// <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287627238U,	// <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1257016538U,	// <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1209240066U,	// <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287625459U,	// <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1209240068U,	// <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  2640871526U,	// <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+  2316168082U,	// <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+  2640873202U,	// <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+  2640874308U,	// <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+  1637788917U,	// <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+  1616555318U,	// <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  2287638591U,	// <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+  2664765096U,	// <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+  1616555561U,	// <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  1573109862U,	// <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+  2646852404U,	// <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+  2646853224U,	// <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+  2287646618U,	// <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+  1573113374U,	// <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+  296144182U,	// <u,5,5,5>: Cost 1 vspltisw1 RHS
+  1192448098U,	// <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2287646946U,	// <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+  296144182U,	// <u,5,5,u>: Cost 1 vspltisw1 RHS
+  1567146086U,	// <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+  2628945300U,	// <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+  2634917997U,	// <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+  1567148870U,	// <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+  1567149366U,	// <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+  2781171799U,	// <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  1228950018U,	// <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
+  2628952166U,	// <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+  1235815314U,	// <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309556734U,	// <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309555115U,	// <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2628955446U,	// <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+  1235815642U,	// <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235814914U,	// <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309555443U,	// <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235814916U,	// <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  1567162470U,	// <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+  1616557870U,	// <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2690299781U,	// <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+  1567165256U,	// <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+  1567165750U,	// <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+  296144182U,	// <u,5,u,5>: Cost 1 vspltisw1 RHS
+  1209281026U,	// <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
+  2705563648U,	// <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+  1631821926U,	// <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  2262462970U,	// <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+  2646886941U,	// <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+  2705563986U,	// <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+  2316062652U,	// <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316137272U,	// <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+  1215851830U,	// <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  1215851831U,	// <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+  2634948710U,	// <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+  2705564468U,	// <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+  1189466618U,	// <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2263208498U,	// <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2693620843U,	// <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+  2652868860U,	// <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+  1189466936U,	// <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1213869366U,	// <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  1213869367U,	// <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+  2658844774U,	// <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+  3771344465U,	// <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+  1178554874U,	// <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2698929907U,	// <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+  2699593540U,	// <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+  2700257173U,	// <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+  2705565626U,	// <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+  1226485046U,	// <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  1226485047U,	// <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+  2705565846U,	// <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+  2330756585U,	// <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2330756829U,	// <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+  2282981734U,	// <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  1631824413U,	// <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+  2652885246U,	// <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+  1257018168U,	// <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135499062U,	// <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135499063U,	// <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+  2646917222U,	// <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+  2217365931U,	// <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2790167156U,	// <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+  2646919709U,	// <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+  2711538934U,	// <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+  1631825206U,	// <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  2316170040U,	// <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+  1215884598U,	// <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  1215884599U,	// <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+  2634981478U,	// <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+  2266190247U,	// <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1192448506U,	// <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2266190386U,	// <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2634984758U,	// <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+  2652901632U,	// <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+  1192448824U,	// <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1213902134U,	// <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1213902135U,	// <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1583808614U,	// <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2718839290U,	// <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+  2670823965U,	// <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+  1583811894U,	// <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2724147961U,	// <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+  363253046U,	// <u,6,6,6>: Cost 1 vspltisw2 RHS
+  1229172022U,	// <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+  363253046U,	// <u,6,6,u>: Cost 1 vspltisw2 RHS
+  499458150U,	// <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1573200692U,	// <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1573201512U,	// <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573202070U,	// <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499461673U,	// <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1573203972U,	// <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1235817272U,	// <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  162073910U,	// <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+  162073911U,	// <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+  499466342U,	// <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631827758U,	// <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  1573209704U,	// <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573210262U,	// <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499469866U,	// <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631828122U,	// <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  363253046U,	// <u,6,u,6>: Cost 1 vspltisw2 RHS
+  135540022U,	// <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135540023U,	// <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+  1638465536U,	// <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564723814U,	// <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712207533U,	// <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712207612U,	// <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638465874U,	// <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1579192580U,	// <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+  2712207862U,	// <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2316137282U,	// <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+  564724381U,	// <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  1189467130U,	// <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  1638466356U,	// <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638466454U,	// <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  2311500282U,	// <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+  1189467494U,	// <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2712208495U,	// <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2694956302U,	// <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+  1189467756U,	// <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1638466940U,	// <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712208829U,	// <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712208927U,	// <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638467176U,	// <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638467238U,	// <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712209165U,	// <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712209256U,	// <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1627187175U,	// <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+  2324116290U,	// <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+  1628514441U,	// <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+  1638467734U,	// <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712209638U,	// <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2700929387U,	// <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+  1638467996U,	// <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638468098U,	// <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712210002U,	// <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  1585189856U,	// <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+  1257018178U,	// <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1638468382U,	// <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638468498U,	// <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712210378U,	// <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712210485U,	// <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712210564U,	// <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638468816U,	// <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564727112U,	// <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712210809U,	// <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2712210888U,	// <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+  564727337U,	// <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  1192449018U,	// <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2714201743U,	// <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712211198U,	// <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2311533050U,	// <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+  1192449382U,	// <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  1638469636U,	// <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638469730U,	// <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  1192449644U,	// <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1638469892U,	// <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712211745U,	// <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712211879U,	// <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638470138U,	// <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712212018U,	// <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712212109U,	// <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712212203U,	// <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638470456U,	// <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638470478U,	// <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638470559U,	// <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+  1235816546U,	// <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309558371U,	// <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+  2641045434U,	// <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+  1235816954U,	// <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1235816550U,	// <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309558375U,	// <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+  1585222628U,	// <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+  430361910U,	// <u,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <u,7,7,u>: Cost 1 vspltisw3 RHS
+  1638471379U,	// <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564729646U,	// <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638471557U,	// <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638471612U,	// <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638471743U,	// <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564730010U,	// <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638471888U,	// <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  430361910U,	// <u,7,u,7>: Cost 1 vspltisw3 RHS
+  564730213U,	// <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  202162278U,	// <u,u,0,0>: Cost 1 vspltisw0 LHS
+  538189985U,	// <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685673645U,	// <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  1215848604U,	// <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  1611931986U,	// <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  1579266317U,	// <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+  2289592861U,	// <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+  1215851848U,	// <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  538190493U,	// <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1549411025U,	// <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+  115726126U,	// <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+  604862254U,	// <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  1213866140U,	// <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1549413686U,	// <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+  115726490U,	// <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1585247207U,	// <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+  1213869384U,	// <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  604862308U,	// <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  1567334502U,	// <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+  1190180654U,	// <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  336380006U,	// <u,u,2,2>: Cost 1 vspltisw2 LHS
+  835584U,	// <u,u,2,3>: Cost 0 copy LHS
+  1567337782U,	// <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+  1190181018U,	// <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  1611933626U,	// <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1226485064U,	// <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  835584U,	// <u,u,2,u>: Cost 0 copy LHS
+  475685587U,	// <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1209239278U,	// <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+  1209239765U,	// <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135495836U,	// <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+  475688246U,	// <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1209239282U,	// <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+  1209240093U,	// <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135499080U,	// <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135495841U,	// <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1555406950U,	// <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+  1555408301U,	// <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+  2289625301U,	// <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+  1215881372U,	// <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  229035318U,	// <u,u,4,4>: Cost 1 vspltisw0 RHS
+  538193206U,	// <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2289625629U,	// <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+  1215884616U,	// <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  538193449U,	// <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1549443797U,	// <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+  118708014U,	// <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1561389191U,	// <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+  1213898908U,	// <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  1549446454U,	// <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+  118708378U,	// <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+  604862618U,	// <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  1213902152U,	// <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  604862636U,	// <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  1567367270U,	// <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+  1192892206U,	// <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1638478330U,	// <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1679046864U,	// <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  1567370550U,	// <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+  1192892570U,	// <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  363253046U,	// <u,u,6,6>: Cost 1 vspltisw2 RHS
+  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
+  499605606U,	// <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1235812425U,	// <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1561405577U,	// <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+  162070684U,	// <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+  499609147U,	// <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1235812753U,	// <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235814941U,	// <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  162073928U,	// <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+  162070689U,	// <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+  475726552U,	// <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  538195758U,	// <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  604862821U,	// <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  835584U,	// <u,u,u,3>: Cost 0 copy LHS
+  475729206U,	// <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  538196122U,	// <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  604862861U,	// <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
+  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 0000000..9895ee6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,1007 @@
+//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterInfo.h"
+#include "PPC.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reginfo"
+
+#define GET_REGINFO_TARGET_DESC
+#include "PPCGenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
+         cl::desc("Enable use of a base pointer for complex stack frames"));
+
+static cl::opt<bool>
+AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
+         cl::desc("Force the use of a base pointer in every function"));
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST)
+  : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
+                       ST.isPPC64() ? 0 : 1,
+                       ST.isPPC64() ? 0 : 1),
+    Subtarget(ST) {
+  ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
+  ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
+  ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
+  ImmToIdxMap[PPC::LWZ]  = PPC::LWZX;   ImmToIdxMap[PPC::LWA]  = PPC::LWAX;
+  ImmToIdxMap[PPC::LFS]  = PPC::LFSX;   ImmToIdxMap[PPC::LFD]  = PPC::LFDX;
+  ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
+  ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+  ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+  ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
+
+  // 64-bit
+  ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+  ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+  ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+  ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+  ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                       const {
+  // Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value
+  // when it checks for ZERO folding.
+  if (Kind == 1) {
+    if (Subtarget.isPPC64())
+      return &PPC::G8RC_NOX0RegClass;
+    return &PPC::GPRC_NOR0RegClass;
+  }
+
+  if (Subtarget.isPPC64())
+    return &PPC::G8RCRegClass;
+  return &PPC::GPRCRegClass;
+}
+
+const MCPhysReg*
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  if (Subtarget.isDarwinABI())
+    return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                  CSR_Darwin64_Altivec_SaveList :
+                                  CSR_Darwin64_SaveList) :
+                                 (Subtarget.hasAltivec() ?
+                                  CSR_Darwin32_Altivec_SaveList :
+                                  CSR_Darwin32_SaveList);
+
+  return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                CSR_SVR464_Altivec_SaveList :
+                                CSR_SVR464_SaveList) :
+                               (Subtarget.hasAltivec() ?
+                                CSR_SVR432_Altivec_SaveList :
+                                CSR_SVR432_SaveList);
+}
+
+const uint32_t*
+PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (Subtarget.isDarwinABI())
+    return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                  CSR_Darwin64_Altivec_RegMask :
+                                  CSR_Darwin64_RegMask) :
+                                 (Subtarget.hasAltivec() ?
+                                  CSR_Darwin32_Altivec_RegMask :
+                                  CSR_Darwin32_RegMask);
+
+  return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                CSR_SVR464_Altivec_RegMask :
+                                CSR_SVR464_RegMask) :
+                               (Subtarget.hasAltivec() ?
+                                CSR_SVR432_Altivec_RegMask :
+                                CSR_SVR432_RegMask);
+}
+
+const uint32_t*
+PPCRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const PPCFrameLowering *PPCFI =
+    static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
+
+  // The ZERO register is not really a register, but the representation of r0
+  // when used in instructions that treat r0 as the constant 0.
+  Reserved.set(PPC::ZERO);
+  Reserved.set(PPC::ZERO8);
+
+  // The FP register is also not really a register, but is the representation
+  // of the frame pointer register used by ISD::FRAMEADDR.
+  Reserved.set(PPC::FP);
+  Reserved.set(PPC::FP8);
+
+  // The BP register is also not really a register, but is the representation
+  // of the base pointer register used by setjmp.
+  Reserved.set(PPC::BP);
+  Reserved.set(PPC::BP8);
+
+  // The counter registers must be reserved so that counter-based loops can
+  // be correctly formed (and the mtctr instructions are not DCE'd).
+  Reserved.set(PPC::CTR);
+  Reserved.set(PPC::CTR8);
+
+  Reserved.set(PPC::R1);
+  Reserved.set(PPC::LR);
+  Reserved.set(PPC::LR8);
+  Reserved.set(PPC::RM);
+
+  if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
+    Reserved.set(PPC::VRSAVE);
+
+  // The SVR4 ABI reserves r2 and r13
+  if (Subtarget.isSVR4ABI()) {
+    Reserved.set(PPC::R2);  // System-reserved register
+    Reserved.set(PPC::R13); // Small Data Area pointer register
+  }
+  
+  // On PPC64, r13 is the thread pointer. Never allocate this register.
+  if (Subtarget.isPPC64()) {
+    Reserved.set(PPC::R13);
+
+    Reserved.set(PPC::X1);
+    Reserved.set(PPC::X13);
+
+    if (PPCFI->needsFP(MF))
+      Reserved.set(PPC::X31);
+
+    if (hasBasePointer(MF))
+      Reserved.set(PPC::X30);
+
+    // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+    if (Subtarget.isSVR4ABI()) {
+      Reserved.set(PPC::X2);
+    }
+  }
+
+  if (PPCFI->needsFP(MF))
+    Reserved.set(PPC::R31);
+
+  if (hasBasePointer(MF)) {
+  	if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() &&
+        MF.getTarget().getRelocationModel() == Reloc::PIC_)
+      Reserved.set(PPC::R29);
+    else
+      Reserved.set(PPC::R30);
+  }
+
+  if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() &&
+      MF.getTarget().getRelocationModel() == Reloc::PIC_)
+    Reserved.set(PPC::R30);
+
+  // Reserve Altivec registers when Altivec is unavailable.
+  if (!Subtarget.hasAltivec())
+    for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
+         IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+      Reserved.set(*I);
+
+  return Reserved;
+}
+
+unsigned
+PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                         MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const unsigned DefaultSafety = 1;
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case PPC::G8RC_NOX0RegClassID:
+  case PPC::GPRC_NOR0RegClassID: 
+  case PPC::G8RCRegClassID:
+  case PPC::GPRCRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 32 - FP - DefaultSafety;
+  }
+  case PPC::F8RCRegClassID:
+  case PPC::F4RCRegClassID:
+  case PPC::VRRCRegClassID:
+  case PPC::VFRCRegClassID:
+  case PPC::VSLRCRegClassID:
+  case PPC::VSHRCRegClassID:
+    return 32 - DefaultSafety;
+  case PPC::VSRCRegClassID:
+  case PPC::VSFRCRegClassID:
+    return 64 - DefaultSafety;
+  case PPC::CRRCRegClassID:
+    return 8 - DefaultSafety;
+  }
+}
+
+const TargetRegisterClass*
+PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)const {
+  if (Subtarget.hasVSX()) {
+    // With VSX, we can inflate various sub-register classes to the full VSX
+    // register set.
+
+    if (RC == &PPC::F8RCRegClass)
+      return &PPC::VSFRCRegClass;
+    else if (RC == &PPC::VRRCRegClass)
+      return &PPC::VSRCRegClass;
+  }
+
+  return TargetRegisterInfo::getLargestLegalSuperClass(RC);
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame.  The sequence of code with be in the general form
+///
+///   addi   R0, SP, \#frameSize ; get the address of the previous frame
+///   stwxu  R0, SP, Rnegsize   ; add and update the SP with the negated size
+///   addi   Rnew, SP, \#maxCalFrameSize ; get the top of the allocation
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  // Determine whether 64-bit pointers are used.
+  bool LP64 = Subtarget.isPPC64();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Get the maximum call stack size.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  // Get the total frame size.
+  unsigned FrameSize = MFI->getStackSize();
+  
+  // Get stack alignments.
+  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
+         "Maximum call-frame size not sufficiently aligned");
+
+  // Determine the previous frame's address.  If FrameSize can't be
+  // represented as 16 bits or we need special alignment, then we load the
+  // previous frame's address from 0(SP).  Why not do an addis of the hi? 
+  // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. 
+  // Constructing the constant and adding would take 3 instructions. 
+  // Fortunately, a frame greater than 32K is rare.
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  
+  if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+      .addReg(PPC::R31)
+      .addImm(FrameSize);
+  } else if (LP64) {
+    BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+      .addImm(0)
+      .addReg(PPC::X1);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+      .addImm(0)
+      .addReg(PPC::R1);
+  }
+
+  bool KillNegSizeReg = MI.getOperand(1).isKill();
+  unsigned NegSizeReg = MI.getOperand(1).getReg();
+
+  // Grow the stack and update the stack pointer link, then determine the
+  // address of new allocated space.
+  if (LP64) {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
+    BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::X1)
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+      .addReg(PPC::X1)
+      .addImm(maxCallFrameSize);
+  } else {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
+    BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::R1)
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+      .addReg(PPC::R1)
+      .addImm(maxCallFrameSize);
+  }
+  
+  // Discard the DYNALLOC instruction.
+  MBB.erase(II);
+}
+
+/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
+/// reserving a whole register (R0), we scrounge for one here. This generates
+/// code like this:
+///
+///   mfcr rA                  ; Move the conditional register into GPR rA.
+///   rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot.
+///   stw rA, FI               ; Store rA to the frame.
+///
+void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_CR <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  // We need to store the CR in the low 4-bits of the saved value. First, issue
+  // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+          .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+    
+  // If the saved register wasn't CR0, shift the bits left so that they are in
+  // CR0's slot.
+  if (SrcReg != PPC::CR0) {
+    unsigned Reg1 = Reg;
+    Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+    // rlwinm rA, rA, ShiftBits, 0, 31.
+    BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+      .addReg(Reg1, RegState::Kill)
+      .addImm(getEncodingValue(SrcReg) * 4)
+      .addImm(0)
+      .addImm(31);
+  }
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_CR <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_CR does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+                              Reg), FrameIndex);
+
+  // If the reloaded register isn't CR0, shift the bits right so that they are
+  // in the right CR's slot.
+  if (DestReg != PPC::CR0) {
+    unsigned Reg1 = Reg;
+    Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+    unsigned ShiftBits = getEncodingValue(DestReg)*4;
+    // rlwinm r11, r11, 32-ShiftBits, 0, 31.
+    BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+             .addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0)
+             .addImm(31);
+  }
+
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg)
+             .addReg(Reg, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+static unsigned getCRFromCRBit(unsigned SrcReg) {
+  unsigned Reg = 0;
+  if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
+      SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
+    Reg = PPC::CR0;
+  else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
+           SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
+    Reg = PPC::CR1;
+  else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
+           SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
+    Reg = PPC::CR2;
+  else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
+           SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
+    Reg = PPC::CR3;
+  else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
+           SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
+    Reg = PPC::CR4;
+  else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
+           SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
+    Reg = PPC::CR5;
+  else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
+           SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
+    Reg = PPC::CR6;
+  else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
+           SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
+    Reg = PPC::CR7;
+
+  assert(Reg != 0 && "Invalid CR bit register");
+  return Reg;
+}
+
+void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
+                                         unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_CRBIT <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL),
+          getCRFromCRBit(SrcReg))
+          .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+          .addReg(getCRFromCRBit(SrcReg));
+    
+  // If the saved register wasn't CR0LT, shift the bits left so that the bit to
+  // store is the first one. Mask all but that bit.
+  unsigned Reg1 = Reg;
+  Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+  // rlwinm rA, rA, ShiftBits, 0, 0.
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+    .addReg(Reg1, RegState::Kill)
+    .addImm(getEncodingValue(SrcReg))
+    .addImm(0).addImm(0);
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_CRBIT <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_CRBIT does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+                              Reg), FrameIndex);
+
+  BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg);
+
+  unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO)
+          .addReg(getCRFromCRBit(DestReg));
+
+  unsigned ShiftBits = getEncodingValue(DestReg);
+  // rlwimi r11, r10, 32-ShiftBits, ..., ...
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO)
+           .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill)
+           .addImm(ShiftBits ? 32-ShiftBits : 0)
+           .addImm(ShiftBits).addImm(ShiftBits);
+           
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF),
+          getCRFromCRBit(DestReg))
+            .addReg(RegO, RegState::Kill)
+	    // Make sure we have a use dependency all the way through this
+	    // sequence of instructions. We can't have the other bits in the CR
+	    // modified in between the mfocrf and the mtocrf.
+            .addReg(getCRFromCRBit(DestReg), RegState::Implicit);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+                                          unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_VRSAVE <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
+          .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+    
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
+                                         unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_VRSAVE <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_VRSAVE does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ),
+                              Reg), FrameIndex);
+
+  BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg)
+             .addReg(Reg, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+bool
+PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+				      unsigned Reg, int &FrameIdx) const {
+
+  // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+  // ABI, return true to prevent allocating an additional frame slot.
+  // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+  // is arbitrary and will be subsequently ignored.  For 32-bit, we have
+  // previously created the stack slot if needed, so return its FrameIdx.
+  if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+    if (Subtarget.isPPC64())
+      FrameIdx = 0;
+    else {
+      const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+      FrameIdx = FI->getCRSpillFrameIndex();
+    }
+    return true;
+  }
+  return false;
+}
+
+// Figure out if the offset in the instruction must be a multiple of 4.
+// This is true for instructions like "STD".
+static bool usesIXAddr(const MachineInstr &MI) {
+  unsigned OpC = MI.getOpcode();
+
+  switch (OpC) {
+  default:
+    return false;
+  case PPC::LWA:
+  case PPC::LWA_32:
+  case PPC::LD:
+  case PPC::STD:
+    return true;
+  }
+}
+
+// Return the OffsetOperandNo given the FIOperandNum (and the instruction).
+static unsigned getOffsetONFromFION(const MachineInstr &MI,
+                                    unsigned FIOperandNum) {
+  // Take into account whether it's an add or mem instruction
+  unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2;
+  if (MI.isInlineAsm())
+    OffsetOperandNo = FIOperandNum-1;
+
+  return OffsetOperandNo;
+}
+
+void
+PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, unsigned FIOperandNum,
+                                     RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+
+  // Get the frame index.
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Get the frame pointer save index.  Users of this index are primarily
+  // DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+  // Get the instruction opcode.
+  unsigned OpC = MI.getOpcode();
+  
+  // Special case for dynamic alloca.
+  if (FPSI && FrameIndex == FPSI &&
+      (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+    lowerDynamicAlloc(II);
+    return;
+  }
+
+  // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
+  if (OpC == PPC::SPILL_CR) {
+    lowerCRSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CR) {
+    lowerCRRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_CRBIT) {
+    lowerCRBitSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CRBIT) {
+    lowerCRBitRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_VRSAVE) {
+    lowerVRSAVESpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_VRSAVE) {
+    lowerVRSAVERestore(II, FrameIndex);
+    return;
+  }
+
+  // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
+  MI.getOperand(FIOperandNum).ChangeToRegister(
+    FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
+
+  // Figure out if the offset in the instruction is shifted right two bits.
+  bool isIXAddr = usesIXAddr(MI);
+
+  // If the instruction is not present in ImmToIdxMap, then it has no immediate
+  // form (and must be r+r).
+  bool noImmForm = !MI.isInlineAsm() && !ImmToIdxMap.count(OpC);
+
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI->getObjectOffset(FrameIndex);
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
+
+  // If we're not using a Frame Pointer that has been set to the value of the
+  // SP before having the stack size subtracted from it, then add the stack size
+  // to Offset to get the correct offset.
+  // Naked functions have stack size 0, although getStackSize may not reflect that
+  // because we didn't call all the pieces that compute it for naked functions.
+  if (!MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked)) {
+    if (!(hasBasePointer(MF) && FrameIndex < 0))
+      Offset += MFI->getStackSize();
+  }
+
+  // If we can, encode the offset directly into the instruction.  If this is a
+  // normal PPC "ri" instruction, any 16-bit value can be safely encoded.  If
+  // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
+  // clear can be encoded.  This is extremely uncommon, because normally you
+  // only "std" to a stack slot that is at least 4-byte aligned, but it can
+  // happen in invalid code.
+  assert(OpC != PPC::DBG_VALUE &&
+         "This should be handle in a target independent way");
+  if (!noImmForm && isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) {
+    MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // The offset doesn't fit into a single register, scavenge one to build the
+  // offset in.
+
+  bool is64Bit = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
+  unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
+           SReg = MF.getRegInfo().createVirtualRegister(RC);
+
+  // Insert a set of rA with the full offset value before the ld, st, or add
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+    .addImm(Offset >> 16);
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
+    .addReg(SRegHi, RegState::Kill)
+    .addImm(Offset);
+
+  // Convert into indexed form of the instruction:
+  // 
+  //   sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+  //   addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
+  unsigned OperandBase;
+
+  if (noImmForm)
+    OperandBase = 1;
+  else if (OpC != TargetOpcode::INLINEASM) {
+    assert(ImmToIdxMap.count(OpC) &&
+           "No indexed form of load or store available!");
+    unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+    MI.setDesc(TII.get(NewOpcode));
+    OperandBase = 1;
+  } else {
+    OperandBase = OffsetOperandNo;
+  }
+
+  unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
+  MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+  MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (!Subtarget.isPPC64())
+    return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
+  else
+    return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
+  if (!hasBasePointer(MF))
+    return getFrameRegister(MF);
+
+  if (Subtarget.isPPC64())
+    return PPC::X30;
+
+  if (Subtarget.isSVR4ABI() &&
+      MF.getTarget().getRelocationModel() == Reloc::PIC_)
+    return PPC::R29;
+
+  return PPC::R30;
+}
+
+bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  if (!EnableBasePointer)
+    return false;
+  if (AlwaysBasePointer)
+    return true;
+
+  // If we need to realign the stack, then the stack pointer can no longer
+  // serve as an offset into the caller's stack space. As a result, we need a
+  // base pointer.
+  return needsStackRealignment(MF);
+}
+
+bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+    return false;
+
+  return true;
+}
+
+bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackAlignment));
+
+  return requiresRealignment && canRealignStack(MF);
+}
+
+/// Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool PPCRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  assert(Offset < 0 && "Local offset must be negative");
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores that have
+  // an r+i form. Return false for everything else.
+  unsigned OpC = MI->getOpcode();
+  if (!ImmToIdxMap.count(OpC))
+    return false;
+
+  // Don't generate a new virtual base register just to add zero to it.
+  if ((OpC == PPC::ADDI || OpC == PPC::ADDI8) &&
+      MI->getOperand(2).getImm() == 0)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  const PPCFrameLowering *PPCFI =
+    static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
+  unsigned StackEst =
+    PPCFI->determineFrameLayout(MF, false, true);
+
+  // If we likely don't need a stack frame, then we probably don't need a
+  // virtual base register either.
+  if (!StackEst)
+    return false;
+
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += StackEst;
+
+  // The frame pointer will point to the end of the stack, so estimate the
+  // offset as the difference between the object offset and the FP location.
+  return !isFrameOffsetLegal(MI, Offset);
+}
+
+/// Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void PPCRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  unsigned ADDriOpc = Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI;
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;                  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset);
+}
+
+void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                        int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI.getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
+  unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
+  MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const MCInstrDesc &MCID = MI.getDesc();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.constrainRegClass(BaseReg,
+                        TII.getRegClass(MCID, FIOperandNum, this, MF));
+}
+
+bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                         int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI->getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
+  Offset += MI->getOperand(OffsetOperandNo).getImm();
+
+  return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
+         (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 0000000..13a35f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,113 @@
+//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_REGISTERINFO_H
+#define POWERPC32_REGISTERINFO_H
+
+#include "PPC.h"
+#include "llvm/ADT/DenseMap.h"
+
+#define GET_REGINFO_HEADER
+#include "PPCGenRegisterInfo.inc"
+
+namespace llvm {
+class PPCSubtarget;
+class TargetInstrInfo;
+class Type;
+
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+  DenseMap<unsigned, unsigned> ImmToIdxMap;
+  const PPCSubtarget &Subtarget;
+public:
+  PPCRegisterInfo(const PPCSubtarget &SubTarget);
+  
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is used for addressing modes.
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction* MF =nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override;
+  const uint32_t *getNoPreservedMask() const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  /// We require the register scavenger.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+  void lowerCRSpilling(MachineBasicBlock::iterator II,
+                       unsigned FrameIndex) const;
+  void lowerCRRestore(MachineBasicBlock::iterator II,
+                      unsigned FrameIndex) const;
+  void lowerCRBitSpilling(MachineBasicBlock::iterator II,
+                          unsigned FrameIndex) const;
+  void lowerCRBitRestore(MachineBasicBlock::iterator II,
+                         unsigned FrameIndex) const;
+  void lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+                           unsigned FrameIndex) const;
+  void lowerVRSAVERestore(MachineBasicBlock::iterator II,
+                          unsigned FrameIndex) const;
+
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+			    int &FrameIdx) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Support for virtual base registers.
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                    unsigned BaseReg, int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI,
+                          int64_t Offset) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  // Base pointer (stack realignment) support.
+  unsigned getBaseRegister(const MachineFunction &MF) const;
+  bool hasBasePointer(const MachineFunction &MF) const;
+  bool canRealignStack(const MachineFunction &MF) const;
+  bool needsStackRealignment(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 0000000..b3d145b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,314 @@
+//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "PPC" in {
+def sub_lt : SubRegIndex<1>;
+def sub_gt : SubRegIndex<1, 1>;
+def sub_eq : SubRegIndex<1, 2>;
+def sub_un : SubRegIndex<1, 3>;
+def sub_32 : SubRegIndex<32>;
+def sub_64 : SubRegIndex<64>;
+def sub_128 : SubRegIndex<128>;
+}
+
+
+class PPCReg<string n> : Register<n> {
+  let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_32];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+  let HWEncoding{9-0} = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// VF - One of the 32 64-bit floating-point subregisters of the vector
+// registers (used by VSX).
+class VF<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+  let HWEncoding{5} = 1;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<VF SubReg, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+  let HWEncoding{5} = 0;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_64];
+}
+
+// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
+// floating-point registers.
+class VSRL<FPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_64];
+}
+
+// VSRH - One of the 32 128-bit VSX registers that overlap with the vector
+// registers.
+class VSRH<VR SubReg, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+  let HWEncoding{5} = 1;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_128];
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// General-purpose registers
+foreach Index = 0-31 in {
+  def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
+}
+
+// 64-bit General-purpose registers
+foreach Index = 0-31 in {
+  def X#Index : GP8<!cast<GPR>("R"#Index), "r"#Index>,
+                    DwarfRegNum<[Index, -2]>;
+}
+
+// Floating-point registers
+foreach Index = 0-31 in {
+  def F#Index : FPR<Index, "f"#Index>,
+                DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// Floating-point vector subregisters (for VSX)
+foreach Index = 0-31 in {
+  def VF#Index : VF<Index, "vs" # !add(Index, 32)>;
+}
+
+// Vector registers
+foreach Index = 0-31 in {
+  def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
+                DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// VSX registers
+foreach Index = 0-31 in {
+  def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+                  DwarfRegAlias<!cast<FPR>("F"#Index)>;
+}
+foreach Index = 0-31 in {
+  def VSH#Index : VSRH<!cast<VR>("V"#Index), "vs" # !add(Index, 32)>,
+                  DwarfRegAlias<!cast<VR>("V"#Index)>;
+}
+
+// The reprsentation of r0 when treated as the constant 0.
+def ZERO  : GPR<0, "0">;
+def ZERO8 : GP8<ZERO, "0">;
+
+// Representations of the frame pointer used by ISD::FRAMEADDR.
+def FP   : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
+def FP8  : GP8<FP, "**FRAME POINTER**">;
+
+// Representations of the base pointer used by setjmp.
+def BP   : GPR<0 /* arbitrary */, "**BASE POINTER**">;
+def BP8  : GP8<BP, "**BASE POINTER**">;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
+
+// Condition registers
+let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
+}
+
+// The full condition-code register. This is not modeled fully, but defined
+// here primarily, for compatibility with gcc, to allow the inline asm "cc"
+// clobber specification to work.
+def CC : PPCReg<"cc">, DwarfRegAlias<CR0> {
+  let Aliases = [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7];
+}
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+
+// Carry bit.  In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1);  this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">;
+
+// FP rounding mode:  bits 30 and 31 of the FP status and control register
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs.  The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with
+// most registers, it has to be done in code; to make this work all the
+// return and call instructions are described as Uses of RM, so instructions
+// that do nothing but change RM will not get deleted.
+// Also, in the architecture it is not really a SPR; 512 is arbitrary.
+def RM: SPR<512, "**ROUNDING MODE**">;
+
+/// Register classes
+// Allocate volatiles first
+// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+                                                (sequence "R%u", 30, 13),
+                                                R31, R0, R1, FP, BP)>;
+
+def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
+                                                (sequence "X%u", 30, 14),
+                                                X31, X13, X0, X1, FP8, BP8)>;
+
+// For some instructions r0 is special (representing the value 0 instead of
+// the value in the r0 register), and we use these register subclasses to
+// prevent r0 from being allocated for use by those instructions.
+def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)>;
+def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)>;
+
+// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
+// ABI the size of the Floating-point register save area is determined by the
+// allocated non-volatile register with the lowest register number, as FP
+// register N is spilled to offset 8 * (32 - N) below the back chain word of the
+// previous stack frame. By allocating non-volatiles in reverse order we make
+// sure that the Floating-point register save area is always as small as
+// possible because there aren't any unused spill slots.
+def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
+                                                (sequence "F%u", 31, 14))>;
+def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
+                         (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
+
+// VSX register classes (the allocation order mirrors that of the corresponding
+// subregister classes).
+def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+                          (add (sequence "VSL%u", 0, 13),
+                               (sequence "VSL%u", 31, 14))>;
+def VSHRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+                          (add VSH2, VSH3, VSH4, VSH5, VSH0, VSH1, VSH6, VSH7,
+			       VSH8, VSH9, VSH10, VSH11, VSH12, VSH13, VSH14,
+                               VSH15, VSH16, VSH17, VSH18, VSH19, VSH31, VSH30,
+                               VSH29, VSH28, VSH27, VSH26, VSH25, VSH24, VSH23,
+                               VSH22, VSH21, VSH20)>;
+def VSRC  : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+                          (add VSLRC, VSHRC)>;
+
+// Register classes for the 64-bit "scalar" VSX subregisters.
+def VFRC :  RegisterClass<"PPC", [f64], 64,
+                          (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7,
+                               VF8, VF9, VF10, VF11, VF12, VF13, VF14,
+                               VF15, VF16, VF17, VF18, VF19, VF31, VF30,
+                               VF29, VF28, VF27, VF26, VF25, VF24, VF23,
+                               VF22, VF21, VF20)>;
+def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
+
+def CRBITRC : RegisterClass<"PPC", [i1], 32,
+  (add CR2LT, CR2GT, CR2EQ, CR2UN,
+       CR3LT, CR3GT, CR3EQ, CR3UN,
+       CR4LT, CR4GT, CR4EQ, CR4UN,
+       CR5LT, CR5GT, CR5EQ, CR5UN,
+       CR6LT, CR6GT, CR6EQ, CR6UN,
+       CR7LT, CR7GT, CR7EQ, CR7UN,
+       CR1LT, CR1GT, CR1EQ, CR1UN,
+       CR0LT, CR0GT, CR0EQ, CR0UN)> {
+  let Size = 32;
+}
+
+def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
+                                                CR7, CR2, CR3, CR4)>;
+
+// The CTR registers are not allocatable because they're used by the
+// decrement-and-branch instructions, and thus need to stay live across
+// multiple basic blocks.
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
+  let isAllocatable = 0;
+}
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
+  let isAllocatable = 0;
+}
+
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+  let CopyCost = -1;
+}
+
+def CCRC : RegisterClass<"PPC", [i32], 32, (add CC)> {
+  let isAllocatable = 0;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h b/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h
new file mode 100644
index 0000000..0b392f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h
@@ -0,0 +1,56 @@
+//===-- PPCRelocations.h - PPC Code Relocations -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCRELOCATIONS_H
+#define PPCRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+// Hack to rid us of a PPC pre-processor symbol which is erroneously
+// defined in a PowerPC header file (bug in Linux/PPC)
+#ifdef PPC
+#undef PPC
+#endif
+
+namespace llvm {
+  namespace PPC {
+    enum RelocationType {
+      // reloc_vanilla - A standard relocation, where the address of the
+      // relocated object completely overwrites the address of the relocation.
+      reloc_vanilla,
+    
+      // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
+      reloc_pcrel_bx,
+
+      // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
+      // and other bcx instructions.
+      reloc_pcrel_bcx,
+
+      // reloc_absolute_high - Absolute relocation, for the loadhi instruction
+      // (which is really addis).  Add the high 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_high,
+
+      // reloc_absolute_low - Absolute relocation, for the la instruction (which
+      // is really an addi).  Add the low 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_low,
+      
+      // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
+      // instruction which have two implicit zero bits.
+      reloc_absolute_low_ix
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 0000000..1221d41
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,520 @@
+//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC
+//
+def IIC_IntSimple    : InstrItinClass;
+def IIC_IntGeneral   : InstrItinClass;
+def IIC_IntCompare   : InstrItinClass;
+def IIC_IntDivD      : InstrItinClass;
+def IIC_IntDivW      : InstrItinClass;
+def IIC_IntMFFS      : InstrItinClass;
+def IIC_IntMFVSCR    : InstrItinClass;
+def IIC_IntMTFSB0    : InstrItinClass;
+def IIC_IntMTSRD     : InstrItinClass;
+def IIC_IntMulHD     : InstrItinClass;
+def IIC_IntMulHW     : InstrItinClass;
+def IIC_IntMulHWU    : InstrItinClass;
+def IIC_IntMulLI     : InstrItinClass;
+def IIC_IntRFID      : InstrItinClass;
+def IIC_IntRotateD   : InstrItinClass;
+def IIC_IntRotateDI  : InstrItinClass;
+def IIC_IntRotate    : InstrItinClass;
+def IIC_IntShift     : InstrItinClass;
+def IIC_IntTrapD     : InstrItinClass;
+def IIC_IntTrapW     : InstrItinClass;
+def IIC_BrB          : InstrItinClass;
+def IIC_BrCR         : InstrItinClass;
+def IIC_BrMCR        : InstrItinClass;
+def IIC_BrMCRX       : InstrItinClass;
+def IIC_LdStDCBA     : InstrItinClass;
+def IIC_LdStDCBF     : InstrItinClass;
+def IIC_LdStDCBI     : InstrItinClass;
+def IIC_LdStLoad     : InstrItinClass;
+def IIC_LdStLoadUpd  : InstrItinClass;
+def IIC_LdStLoadUpdX : InstrItinClass;
+def IIC_LdStStore    : InstrItinClass;
+def IIC_LdStStoreUpd : InstrItinClass;
+def IIC_LdStDSS      : InstrItinClass;
+def IIC_LdStICBI     : InstrItinClass;
+def IIC_LdStLD       : InstrItinClass;
+def IIC_LdStLDU      : InstrItinClass;
+def IIC_LdStLDUX     : InstrItinClass;
+def IIC_LdStLDARX    : InstrItinClass;
+def IIC_LdStLFD      : InstrItinClass;
+def IIC_LdStLFDU     : InstrItinClass;
+def IIC_LdStLFDUX    : InstrItinClass;
+def IIC_LdStLHA      : InstrItinClass;
+def IIC_LdStLHAU     : InstrItinClass;
+def IIC_LdStLHAUX    : InstrItinClass;
+def IIC_LdStLMW      : InstrItinClass;
+def IIC_LdStLVecX    : InstrItinClass;
+def IIC_LdStLWA      : InstrItinClass;
+def IIC_LdStLWARX    : InstrItinClass;
+def IIC_LdStSLBIA    : InstrItinClass;
+def IIC_LdStSLBIE    : InstrItinClass;
+def IIC_LdStSTD      : InstrItinClass;
+def IIC_LdStSTDCX    : InstrItinClass;
+def IIC_LdStSTDU     : InstrItinClass;
+def IIC_LdStSTDUX    : InstrItinClass;
+def IIC_LdStSTFD     : InstrItinClass;
+def IIC_LdStSTFDU    : InstrItinClass;
+def IIC_LdStSTVEBX   : InstrItinClass;
+def IIC_LdStSTWCX    : InstrItinClass;
+def IIC_LdStSync     : InstrItinClass;
+def IIC_SprISYNC     : InstrItinClass;
+def IIC_SprMFSR      : InstrItinClass;
+def IIC_SprMTMSR     : InstrItinClass;
+def IIC_SprMTSR      : InstrItinClass;
+def IIC_SprTLBSYNC   : InstrItinClass;
+def IIC_SprMFCR      : InstrItinClass;
+def IIC_SprMFCRF     : InstrItinClass;
+def IIC_SprMFMSR     : InstrItinClass;
+def IIC_SprMFSPR     : InstrItinClass;
+def IIC_SprMFTB      : InstrItinClass;
+def IIC_SprMTSPR     : InstrItinClass;
+def IIC_SprMTSRIN    : InstrItinClass;
+def IIC_SprRFI       : InstrItinClass;
+def IIC_SprSC        : InstrItinClass;
+def IIC_FPGeneral    : InstrItinClass;
+def IIC_FPAddSub     : InstrItinClass;
+def IIC_FPCompare    : InstrItinClass;
+def IIC_FPDivD       : InstrItinClass;
+def IIC_FPDivS       : InstrItinClass;
+def IIC_FPFused      : InstrItinClass;
+def IIC_FPRes        : InstrItinClass;
+def IIC_FPSqrtD      : InstrItinClass;
+def IIC_FPSqrtS      : InstrItinClass;
+def IIC_VecGeneral   : InstrItinClass;
+def IIC_VecFP        : InstrItinClass;
+def IIC_VecFPCompare : InstrItinClass;
+def IIC_VecComplex   : InstrItinClass;
+def IIC_VecPerm      : InstrItinClass;
+def IIC_VecFPRound   : InstrItinClass;
+def IIC_VecVSL       : InstrItinClass;
+def IIC_VecVSR       : InstrItinClass;
+def IIC_SprMTMSRD    : InstrItinClass;
+def IIC_SprSLIE      : InstrItinClass;
+def IIC_SprSLBIE     : InstrItinClass;
+def IIC_SprSLBMTE    : InstrItinClass;
+def IIC_SprSLBMFEE   : InstrItinClass;
+def IIC_SprSLBIA     : InstrItinClass;
+def IIC_SprTLBIEL    : InstrItinClass;
+def IIC_SprTLBIE     : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCSchedule440.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+include "PPCScheduleP7.td"
+include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction to itinerary class map - When add new opcodes to the supported
+// set, refer to the following table to determine which itinerary class the
+// opcode belongs.
+//
+//    opcode     itinerary class
+//    ======     ===============
+//    add        IIC_IntSimple
+//    addc       IIC_IntGeneral
+//    adde       IIC_IntGeneral
+//    addi       IIC_IntSimple
+//    addic      IIC_IntGeneral
+//    addic.     IIC_IntGeneral
+//    addis      IIC_IntSimple
+//    addme      IIC_IntGeneral
+//    addze      IIC_IntGeneral
+//    and        IIC_IntSimple
+//    andc       IIC_IntSimple
+//    andi.      IIC_IntGeneral
+//    andis.     IIC_IntGeneral
+//    b          IIC_BrB
+//    bc         IIC_BrB
+//    bcctr      IIC_BrB
+//    bclr       IIC_BrB
+//    cmp        IIC_IntCompare
+//    cmpi       IIC_IntCompare
+//    cmpl       IIC_IntCompare
+//    cmpli      IIC_IntCompare
+//    cntlzd     IIC_IntRotateD
+//    cntlzw     IIC_IntGeneral
+//    crand      IIC_BrCR
+//    crandc     IIC_BrCR
+//    creqv      IIC_BrCR
+//    crnand     IIC_BrCR
+//    crnor      IIC_BrCR
+//    cror       IIC_BrCR
+//    crorc      IIC_BrCR
+//    crxor      IIC_BrCR
+//    dcba       IIC_LdStDCBA
+//    dcbf       IIC_LdStDCBF
+//    dcbi       IIC_LdStDCBI
+//    dcbst      IIC_LdStDCBF
+//    dcbt       IIC_LdStLoad
+//    dcbtst     IIC_LdStLoad
+//    dcbz       IIC_LdStDCBF
+//    divd       IIC_IntDivD
+//    divdu      IIC_IntDivD
+//    divw       IIC_IntDivW
+//    divwu      IIC_IntDivW
+//    dss        IIC_LdStDSS
+//    dst        IIC_LdStDSS
+//    dstst      IIC_LdStDSS
+//    eciwx      IIC_LdStLoad
+//    ecowx      IIC_LdStLoad
+//    eieio      IIC_LdStLoad
+//    eqv        IIC_IntSimple
+//    extsb      IIC_IntSimple
+//    extsh      IIC_IntSimple
+//    extsw      IIC_IntSimple
+//    fabs       IIC_FPGeneral
+//    fadd       IIC_FPAddSub
+//    fadds      IIC_FPGeneral
+//    fcfid      IIC_FPGeneral
+//    fcmpo      IIC_FPCompare
+//    fcmpu      IIC_FPCompare
+//    fctid      IIC_FPGeneral
+//    fctidz     IIC_FPGeneral
+//    fctiw      IIC_FPGeneral
+//    fctiwz     IIC_FPGeneral
+//    fdiv       IIC_FPDivD
+//    fdivs      IIC_FPDivS
+//    fmadd      IIC_FPFused
+//    fmadds     IIC_FPGeneral
+//    fmr        IIC_FPGeneral
+//    fmsub      IIC_FPFused
+//    fmsubs     IIC_FPGeneral
+//    fmul       IIC_FPFused
+//    fmuls      IIC_FPGeneral
+//    fnabs      IIC_FPGeneral
+//    fneg       IIC_FPGeneral
+//    fnmadd     IIC_FPFused
+//    fnmadds    IIC_FPGeneral
+//    fnmsub     IIC_FPFused
+//    fnmsubs    IIC_FPGeneral
+//    fres       IIC_FPRes
+//    frsp       IIC_FPGeneral
+//    frsqrte    IIC_FPGeneral
+//    fsel       IIC_FPGeneral
+//    fsqrt      IIC_FPSqrtD
+//    fsqrts     IIC_FPSqrtS
+//    fsub       IIC_FPAddSub
+//    fsubs      IIC_FPGeneral
+//    icbi       IIC_LdStICBI
+//    isync      IIC_SprISYNC
+//    lbz        IIC_LdStLoad
+//    lbzu       IIC_LdStLoadUpd
+//    lbzux      IIC_LdStLoadUpdX
+//    lbzx       IIC_LdStLoad
+//    ld         IIC_LdStLD
+//    ldarx      IIC_LdStLDARX
+//    ldu        IIC_LdStLDU
+//    ldux       IIC_LdStLDUX
+//    ldx        IIC_LdStLD
+//    lfd        IIC_LdStLFD
+//    lfdu       IIC_LdStLFDU
+//    lfdux      IIC_LdStLFDUX
+//    lfdx       IIC_LdStLFD
+//    lfs        IIC_LdStLFD
+//    lfsu       IIC_LdStLFDU
+//    lfsux      IIC_LdStLFDUX
+//    lfsx       IIC_LdStLFD
+//    lha        IIC_LdStLHA
+//    lhau       IIC_LdStLHAU
+//    lhaux      IIC_LdStLHAUX
+//    lhax       IIC_LdStLHA
+//    lhbrx      IIC_LdStLoad
+//    lhz        IIC_LdStLoad
+//    lhzu       IIC_LdStLoadUpd
+//    lhzux      IIC_LdStLoadUpdX
+//    lhzx       IIC_LdStLoad
+//    lmw        IIC_LdStLMW
+//    lswi       IIC_LdStLMW
+//    lswx       IIC_LdStLMW
+//    lvebx      IIC_LdStLVecX
+//    lvehx      IIC_LdStLVecX
+//    lvewx      IIC_LdStLVecX
+//    lvsl       IIC_LdStLVecX
+//    lvsr       IIC_LdStLVecX
+//    lvx        IIC_LdStLVecX
+//    lvxl       IIC_LdStLVecX
+//    lwa        IIC_LdStLWA
+//    lwarx      IIC_LdStLWARX
+//    lwaux      IIC_LdStLHAUX
+//    lwax       IIC_LdStLHA
+//    lwbrx      IIC_LdStLoad
+//    lwz        IIC_LdStLoad
+//    lwzu       IIC_LdStLoadUpd
+//    lwzux      IIC_LdStLoadUpdX
+//    lwzx       IIC_LdStLoad
+//    mcrf       IIC_BrMCR
+//    mcrfs      IIC_FPGeneral
+//    mcrxr      IIC_BrMCRX
+//    mfcr       IIC_SprMFCR
+//    mffs       IIC_IntMFFS
+//    mfmsr      IIC_SprMFMSR
+//    mfspr      IIC_SprMFSPR
+//    mfsr       IIC_SprMFSR
+//    mfsrin     IIC_SprMFSR
+//    mftb       IIC_SprMFTB
+//    mfvscr     IIC_IntMFVSCR
+//    mtcrf      IIC_BrMCRX
+//    mtfsb0     IIC_IntMTFSB0
+//    mtfsb1     IIC_IntMTFSB0
+//    mtfsf      IIC_IntMTFSB0
+//    mtfsfi     IIC_IntMTFSB0
+//    mtmsr      IIC_SprMTMSR
+//    mtmsrd     IIC_LdStLD
+//    mtspr      IIC_SprMTSPR
+//    mtsr       IIC_SprMTSR
+//    mtsrd      IIC_IntMTSRD
+//    mtsrdin    IIC_IntMTSRD
+//    mtsrin     IIC_SprMTSRIN
+//    mtvscr     IIC_IntMFVSCR
+//    mulhd      IIC_IntMulHD
+//    mulhdu     IIC_IntMulHD
+//    mulhw      IIC_IntMulHW
+//    mulhwu     IIC_IntMulHWU
+//    mulld      IIC_IntMulHD
+//    mulli      IIC_IntMulLI
+//    mullw      IIC_IntMulHW
+//    nand       IIC_IntSimple
+//    neg        IIC_IntSimple
+//    nor        IIC_IntSimple
+//    or         IIC_IntSimple
+//    orc        IIC_IntSimple
+//    ori        IIC_IntSimple
+//    oris       IIC_IntSimple
+//    rfi        IIC_SprRFI
+//    rfid       IIC_IntRFID
+//    rldcl      IIC_IntRotateD
+//    rldcr      IIC_IntRotateD
+//    rldic      IIC_IntRotateDI
+//    rldicl     IIC_IntRotateDI
+//    rldicr     IIC_IntRotateDI
+//    rldimi     IIC_IntRotateDI
+//    rlwimi     IIC_IntRotate
+//    rlwinm     IIC_IntGeneral
+//    rlwnm      IIC_IntGeneral
+//    sc         IIC_SprSC
+//    slbia      IIC_LdStSLBIA
+//    slbie      IIC_LdStSLBIE
+//    sld        IIC_IntRotateD
+//    slw        IIC_IntGeneral
+//    srad       IIC_IntRotateD
+//    sradi      IIC_IntRotateDI
+//    sraw       IIC_IntShift
+//    srawi      IIC_IntShift
+//    srd        IIC_IntRotateD
+//    srw        IIC_IntGeneral
+//    stb        IIC_LdStStore
+//    stbu       IIC_LdStStoreUpd
+//    stbux      IIC_LdStStoreUpd
+//    stbx       IIC_LdStStore
+//    std        IIC_LdStSTD
+//    stdcx.     IIC_LdStSTDCX
+//    stdu       IIC_LdStSTDU
+//    stdux      IIC_LdStSTDUX
+//    stdx       IIC_LdStSTD
+//    stfd       IIC_LdStSTFD
+//    stfdu      IIC_LdStSTFDU
+//    stfdux     IIC_LdStSTFDU
+//    stfdx      IIC_LdStSTFD
+//    stfiwx     IIC_LdStSTFD
+//    stfs       IIC_LdStSTFD
+//    stfsu      IIC_LdStSTFDU
+//    stfsux     IIC_LdStSTFDU
+//    stfsx      IIC_LdStSTFD
+//    sth        IIC_LdStStore
+//    sthbrx     IIC_LdStStore
+//    sthu       IIC_LdStStoreUpd
+//    sthux      IIC_LdStStoreUpd
+//    sthx       IIC_LdStStore
+//    stmw       IIC_LdStLMW
+//    stswi      IIC_LdStLMW
+//    stswx      IIC_LdStLMW
+//    stvebx     IIC_LdStSTVEBX
+//    stvehx     IIC_LdStSTVEBX
+//    stvewx     IIC_LdStSTVEBX
+//    stvx       IIC_LdStSTVEBX
+//    stvxl      IIC_LdStSTVEBX
+//    stw        IIC_LdStStore
+//    stwbrx     IIC_LdStStore
+//    stwcx.     IIC_LdStSTWCX
+//    stwu       IIC_LdStStoreUpd
+//    stwux      IIC_LdStStoreUpd
+//    stwx       IIC_LdStStore
+//    subf       IIC_IntGeneral
+//    subfc      IIC_IntGeneral
+//    subfe      IIC_IntGeneral
+//    subfic     IIC_IntGeneral
+//    subfme     IIC_IntGeneral
+//    subfze     IIC_IntGeneral
+//    sync       IIC_LdStSync
+//    td         IIC_IntTrapD
+//    tdi        IIC_IntTrapD
+//    tlbia      IIC_LdStSLBIA
+//    tlbie      IIC_LdStDCBF
+//    tlbsync    IIC_SprTLBSYNC
+//    tw         IIC_IntTrapW
+//    twi        IIC_IntTrapW
+//    vaddcuw    IIC_VecGeneral
+//    vaddfp     IIC_VecFP
+//    vaddsbs    IIC_VecGeneral
+//    vaddshs    IIC_VecGeneral
+//    vaddsws    IIC_VecGeneral
+//    vaddubm    IIC_VecGeneral
+//    vaddubs    IIC_VecGeneral
+//    vadduhm    IIC_VecGeneral
+//    vadduhs    IIC_VecGeneral
+//    vadduwm    IIC_VecGeneral
+//    vadduws    IIC_VecGeneral
+//    vand       IIC_VecGeneral
+//    vandc      IIC_VecGeneral
+//    vavgsb     IIC_VecGeneral
+//    vavgsh     IIC_VecGeneral
+//    vavgsw     IIC_VecGeneral
+//    vavgub     IIC_VecGeneral
+//    vavguh     IIC_VecGeneral
+//    vavguw     IIC_VecGeneral
+//    vcfsx      IIC_VecFP
+//    vcfux      IIC_VecFP
+//    vcmpbfp    IIC_VecFPCompare
+//    vcmpeqfp   IIC_VecFPCompare
+//    vcmpequb   IIC_VecGeneral
+//    vcmpequh   IIC_VecGeneral
+//    vcmpequw   IIC_VecGeneral
+//    vcmpgefp   IIC_VecFPCompare
+//    vcmpgtfp   IIC_VecFPCompare
+//    vcmpgtsb   IIC_VecGeneral
+//    vcmpgtsh   IIC_VecGeneral
+//    vcmpgtsw   IIC_VecGeneral
+//    vcmpgtub   IIC_VecGeneral
+//    vcmpgtuh   IIC_VecGeneral
+//    vcmpgtuw   IIC_VecGeneral
+//    vctsxs     IIC_VecFP
+//    vctuxs     IIC_VecFP
+//    vexptefp   IIC_VecFP
+//    vlogefp    IIC_VecFP
+//    vmaddfp    IIC_VecFP
+//    vmaxfp     IIC_VecFPCompare
+//    vmaxsb     IIC_VecGeneral
+//    vmaxsh     IIC_VecGeneral
+//    vmaxsw     IIC_VecGeneral
+//    vmaxub     IIC_VecGeneral
+//    vmaxuh     IIC_VecGeneral
+//    vmaxuw     IIC_VecGeneral
+//    vmhaddshs  IIC_VecComplex
+//    vmhraddshs IIC_VecComplex
+//    vminfp     IIC_VecFPCompare
+//    vminsb     IIC_VecGeneral
+//    vminsh     IIC_VecGeneral
+//    vminsw     IIC_VecGeneral
+//    vminub     IIC_VecGeneral
+//    vminuh     IIC_VecGeneral
+//    vminuw     IIC_VecGeneral
+//    vmladduhm  IIC_VecComplex
+//    vmrghb     IIC_VecPerm
+//    vmrghh     IIC_VecPerm
+//    vmrghw     IIC_VecPerm
+//    vmrglb     IIC_VecPerm
+//    vmrglh     IIC_VecPerm
+//    vmrglw     IIC_VecPerm
+//    vmsubfp    IIC_VecFP
+//    vmsummbm   IIC_VecComplex
+//    vmsumshm   IIC_VecComplex
+//    vmsumshs   IIC_VecComplex
+//    vmsumubm   IIC_VecComplex
+//    vmsumuhm   IIC_VecComplex
+//    vmsumuhs   IIC_VecComplex
+//    vmulesb    IIC_VecComplex
+//    vmulesh    IIC_VecComplex
+//    vmuleub    IIC_VecComplex
+//    vmuleuh    IIC_VecComplex
+//    vmulosb    IIC_VecComplex
+//    vmulosh    IIC_VecComplex
+//    vmuloub    IIC_VecComplex
+//    vmulouh    IIC_VecComplex
+//    vnor       IIC_VecGeneral
+//    vor        IIC_VecGeneral
+//    vperm      IIC_VecPerm
+//    vpkpx      IIC_VecPerm
+//    vpkshss    IIC_VecPerm
+//    vpkshus    IIC_VecPerm
+//    vpkswss    IIC_VecPerm
+//    vpkswus    IIC_VecPerm
+//    vpkuhum    IIC_VecPerm
+//    vpkuhus    IIC_VecPerm
+//    vpkuwum    IIC_VecPerm
+//    vpkuwus    IIC_VecPerm
+//    vrefp      IIC_VecFPRound
+//    vrfim      IIC_VecFPRound
+//    vrfin      IIC_VecFPRound
+//    vrfip      IIC_VecFPRound
+//    vrfiz      IIC_VecFPRound
+//    vrlb       IIC_VecGeneral
+//    vrlh       IIC_VecGeneral
+//    vrlw       IIC_VecGeneral
+//    vrsqrtefp  IIC_VecFP
+//    vsel       IIC_VecGeneral
+//    vsl        IIC_VecVSL
+//    vslb       IIC_VecGeneral
+//    vsldoi     IIC_VecPerm
+//    vslh       IIC_VecGeneral
+//    vslo       IIC_VecPerm
+//    vslw       IIC_VecGeneral
+//    vspltb     IIC_VecPerm
+//    vsplth     IIC_VecPerm
+//    vspltisb   IIC_VecPerm
+//    vspltish   IIC_VecPerm
+//    vspltisw   IIC_VecPerm
+//    vspltw     IIC_VecPerm
+//    vsr        IIC_VecVSR
+//    vsrab      IIC_VecGeneral
+//    vsrah      IIC_VecGeneral
+//    vsraw      IIC_VecGeneral
+//    vsrb       IIC_VecGeneral
+//    vsrh       IIC_VecGeneral
+//    vsro       IIC_VecPerm
+//    vsrw       IIC_VecGeneral
+//    vsubcuw    IIC_VecGeneral
+//    vsubfp     IIC_VecFP
+//    vsubsbs    IIC_VecGeneral
+//    vsubshs    IIC_VecGeneral
+//    vsubsws    IIC_VecGeneral
+//    vsububm    IIC_VecGeneral
+//    vsububs    IIC_VecGeneral
+//    vsubuhm    IIC_VecGeneral
+//    vsubuhs    IIC_VecGeneral
+//    vsubuwm    IIC_VecGeneral
+//    vsubuws    IIC_VecGeneral
+//    vsum2sws   IIC_VecComplex
+//    vsum4sbs   IIC_VecComplex
+//    vsum4shs   IIC_VecComplex
+//    vsum4ubs   IIC_VecComplex
+//    vsumsws    IIC_VecComplex
+//    vupkhpx    IIC_VecPerm
+//    vupkhsb    IIC_VecPerm
+//    vupkhsh    IIC_VecPerm
+//    vupklpx    IIC_VecPerm
+//    vupklsb    IIC_VecPerm
+//    vupklsh    IIC_VecPerm
+//    vxor       IIC_VecGeneral
+//    xor        IIC_IntSimple
+//    xori       IIC_IntSimple
+//    xoris      IIC_IntSimple
+//
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
new file mode 100644
index 0000000..218fed2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
@@ -0,0 +1,599 @@
+//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// PowerPC 440x6 Embedded Processor Core User's Manual.
+// IBM (as updated in) 2010.
+
+// The basic PPC 440 does not include a floating-point unit; the pipeline
+// timings here are constructed to match the FP2 unit shipped with the
+// PPC-440- and PPC-450-based Blue Gene (L and P) supercomputers.
+// References:
+// S. Chatterjee, et al. Design and exploitation of a high-performance
+// SIMD floating-point unit for Blue Gene/L.
+// IBM J. Res. & Dev. 49 (2/3) March/May 2005.
+// also:
+// Carlos Sosa and Brant Knudson. IBM System Blue Gene Solution:
+// Blue Gene/P Application Development.
+// IBM (as updated in) 2009.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC 440/450 chip sets
+//
+def P440_DISS1  : FuncUnit; // Issue unit 1
+def P440_DISS2  : FuncUnit; // Issue unit 2
+def P440_LRACC  : FuncUnit; // Register access and dispatch for
+                            // the simple integer (J-pipe) and
+                            // load/store (L-pipe) pipelines
+def P440_IRACC  : FuncUnit; // Register access and dispatch for
+                            // the complex integer (I-pipe) pipeline
+def P440_FRACC  : FuncUnit; // Register access and dispatch for
+                            // the floating-point execution (F-pipe) pipeline
+def P440_IEXE1  : FuncUnit; // Execution stage 1 for the I pipeline
+def P440_IEXE2  : FuncUnit; // Execution stage 2 for the I pipeline
+def P440_IWB    : FuncUnit; // Write-back unit for the I pipeline
+def P440_JEXE1  : FuncUnit; // Execution stage 1 for the J pipeline
+def P440_JEXE2  : FuncUnit; // Execution stage 2 for the J pipeline
+def P440_JWB    : FuncUnit; // Write-back unit for the J pipeline
+def P440_AGEN   : FuncUnit; // Address generation for the L pipeline
+def P440_CRD    : FuncUnit; // D-cache access for the L pipeline
+def P440_LWB    : FuncUnit; // Write-back unit for the L pipeline
+def P440_FEXE1  : FuncUnit; // Execution stage 1 for the F pipeline
+def P440_FEXE2  : FuncUnit; // Execution stage 2 for the F pipeline
+def P440_FEXE3  : FuncUnit; // Execution stage 3 for the F pipeline
+def P440_FEXE4  : FuncUnit; // Execution stage 4 for the F pipeline
+def P440_FEXE5  : FuncUnit; // Execution stage 5 for the F pipeline
+def P440_FEXE6  : FuncUnit; // Execution stage 6 for the F pipeline
+def P440_FWB    : FuncUnit; // Write-back unit for the F pipeline
+
+def P440_LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used
+                                // to make sure that no lwarx/stwcx.
+                                // instructions are issued while another
+                                // lwarx/stwcx. is in the L pipe.
+
+def P440_GPR_Bypass : Bypass; // The bypass for general-purpose regs.
+def P440_FPR_Bypass : Bypass; // The bypass for floating-point regs.
+
+// Notes:
+// Instructions are held in the FRACC, LRACC and IRACC pipeline
+// stages until their source operands become ready. Exceptions:
+//  - Store instructions will hold in the AGEN stage
+//  - The integer multiply-accumulate instruction will hold in
+//    the IEXE1 stage
+//
+// For most I-pipe operations, the result is available at the end of
+// the IEXE1 stage. Operations such as multiply and divide must
+// continue to execute in IEXE2 and IWB. Divide resides in IWB for
+// 33 cycles (multiply also calculates its result in IWB). For all
+// J-pipe instructions, the result is available
+// at the end of the JEXE1 stage. Loads have a 3-cycle latency
+// (data is not available until after the LWB stage).
+//
+// The L1 cache hit latency is four cycles for floating point loads
+// and three cycles for integer loads.
+//
+// The stwcx. instruction requires both the LRACC and the IRACC
+// dispatch stages. It must be issued from DISS0.
+//
+// All lwarx/stwcx. instructions hold in LRACC if another
+// uncommitted lwarx/stwcx. is in AGEN, CRD, or LWB.
+//
+// msync (a.k.a. sync) and mbar will hold in LWB until all load/store
+// resources are empty. AGEN and CRD are held empty until the msync/mbar
+// commits.
+//
+// Most floating-point instructions, computational and move,
+// have a 5-cycle latency. Divide takes longer (30 cycles). Instructions that
+// update the CR take 2 cycles. Stores take 3 cycles and, as mentioned above,
+// loads take 4 cycles (for L1 hit).
+
+//
+// This file defines the itinerary class data for the PPC 440 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPC440Itineraries : ProcessorItineraries<
+  [P440_DISS1, P440_DISS2, P440_FRACC, P440_IRACC, P440_IEXE1, P440_IEXE2,
+   P440_IWB, P440_LRACC, P440_JEXE1, P440_JEXE2, P440_JWB, P440_AGEN, P440_CRD,
+   P440_LWB, P440_FEXE1, P440_FEXE2, P440_FEXE3, P440_FEXE4, P440_FEXE5,
+   P440_FEXE6, P440_FWB, P440_LWARX_Hold],
+  [P440_GPR_Bypass, P440_FPR_Bypass], [
+  InstrItinData<IIC_IntSimple,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivW,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<33, [P440_IWB]>],
+                                [36, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMFFS,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [3, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMTFSB0,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [3, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHW,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHWU,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulLI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotate,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntShift,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntTrapW,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [2, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrB,        [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrCR,       [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrMCR,      [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrMCRX,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBA,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBF,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoad,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 1, 1],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStore,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStICBI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFD,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFDU,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFD,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFDU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFDUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHA,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLMW,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLWARX,  [InstrStage<1, [P440_DISS1]>,
+                                 InstrStage<1, [P440_IRACC], 0>,
+                                 InstrStage<4, [P440_LWARX_Hold], 0>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTD,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDCX,  [InstrStage<1, [P440_DISS1]>,
+                                 InstrStage<1, [P440_IRACC], 0>,
+                                 InstrStage<4, [P440_LWARX_Hold], 0>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTWCX,  [InstrStage<1, [P440_DISS1]>,
+                                 InstrStage<1, [P440_IRACC], 0>,
+                                 InstrStage<4, [P440_LWARX_Hold], 0>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSync,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<3, [P440_AGEN], 1>,
+                                 InstrStage<2, [P440_CRD],  1>,
+                                 InstrStage<1, [P440_LWB]>]>,
+  InstrItinData<IIC_SprISYNC,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC], 0>,
+                                 InstrStage<1, [P440_LRACC], 0>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_FEXE1], 0>,
+                                 InstrStage<1, [P440_AGEN],  0>,
+                                 InstrStage<1, [P440_JEXE1], 0>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_FEXE2], 0>,
+                                 InstrStage<1, [P440_CRD],   0>,
+                                 InstrStage<1, [P440_JEXE2], 0>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<6, [P440_FEXE3], 0>,
+                                 InstrStage<6, [P440_LWB],   0>,
+                                 InstrStage<6, [P440_JWB],   0>,
+                                 InstrStage<6, [P440_IWB]>]>,
+  InstrItinData<IIC_SprMFSR,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [2, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTMSR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [2, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSR,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [5, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>]>,
+  InstrItinData<IIC_SprMFCR,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFMSR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [3, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFSPR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFTB,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSPR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSRIN,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprRFI,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprSC,      [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_FPGeneral,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0],
+                                [P440_FPR_Bypass,
+                                 P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPAddSub,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0],
+                                [P440_FPR_Bypass,
+                                 P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0],
+                                [P440_FPR_Bypass, P440_FPR_Bypass,
+                                 P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<25, [P440_FWB]>],
+                                [31, 0, 0],
+                                [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<13, [P440_FWB]>],
+                                [19, 0, 0],
+                                [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0, 0],
+                                [P440_FPR_Bypass,
+                                 P440_FPR_Bypass, P440_FPR_Bypass,
+                                 P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,      [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0],
+                                [P440_FPR_Bypass, P440_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// PPC440 machine model for scheduling and other instruction cost heuristics.
+
+def PPC440Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 instructions are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPC440Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
new file mode 100644
index 0000000..1447696
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
@@ -0,0 +1,169 @@
+//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// A2 Processor User's Manual.
+// IBM (as updated in) 2010.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC A2 chip sets
+//
+def A2_XU     : FuncUnit; // A2_XU pipeline
+def A2_FU     : FuncUnit; // FI pipeline
+
+//
+// This file defines the itinerary class data for the PPC A2 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPCA2Itineraries : ProcessorItineraries<
+  [A2_XU, A2_FU], [], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [A2_XU]>],
+                                 [39, 0, 0]>,
+  InstrItinData<IIC_IntDivD,     [InstrStage<1, [A2_XU]>],
+                                 [71, 0, 0]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntRotateD,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntRotateDI, [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0]>,
+  InstrItinData<IIC_IntTrapD,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLDU,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLDUX,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [A2_XU]>],
+                                 [16, 0, 0]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [A2_XU]>],
+                                 [7, 0, 0]>,
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [A2_XU]>],
+                                 [7, 9, 0, 0]>,
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [A2_XU]>],
+                                 [7, 9, 0, 0]>,
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSTD,     [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDCX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [A2_XU]>],
+                                 [6]>,
+  InstrItinData<IIC_SprISYNC,    [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [A2_XU]>],
+                                 [16, 0]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [A2_XU]>],
+                                 [4, 0]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [A2_XU]>],
+                                 [4, 0]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprRFI,      [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_SprSC,       [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [A2_FU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [A2_FU]>],
+                                 [72, 0, 0]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [A2_FU]>],
+                                 [59, 0, 0]>,
+  InstrItinData<IIC_FPSqrtD,     [InstrStage<1, [A2_FU]>],
+                                 [69, 0, 0]>,
+  InstrItinData<IIC_FPSqrtS,     [InstrStage<1, [A2_FU]>],
+                                 [65, 0, 0]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0, 0]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [A2_FU]>],
+                                 [6, 0]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// A2 machine model for scheduling and other instruction cost heuristics.
+
+def PPCA2Model : SchedMachineModel {
+  let IssueWidth = 1;  // 1 instruction is dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13;
+
+  let Itineraries = PPCA2Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 0000000..dab89e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,314 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit 
+// Power processor.
+// 
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    Some instructions can only execute in SFX0 but not SFX1.
+//    The CFX has a bypass path, allowing non-divide instructions to execute 
+//    while a divide instruction is executed.
+def E500_SFX0  : FuncUnit; // Simple unit 0
+def E500_SFX1  : FuncUnit; // Simple unit 1
+def E500_BU    : FuncUnit; // Branch unit
+def E500_CFX_DivBypass 
+               : FuncUnit; // CFX divide bypass path
+def E500_CFX_0 : FuncUnit; // CFX pipeline
+def E500_LSU_0 : FuncUnit; // LSU pipeline
+def E500_FPU_0 : FuncUnit; // FPU pipeline
+
+def E500_GPR_Bypass : Bypass;
+def E500_FPR_Bypass : Bypass;
+def E500_CR_Bypass  : Bypass;
+
+def PPCE500mcItineraries : ProcessorItineraries<
+  [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass,
+   E500_CFX_0, E500_LSU_0, E500_FPU_0],
+  [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [5, 1, 1], // Latency = 1 or 2
+                                 [E500_CR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0], 0>,
+                                  InstrStage<14, [E500_CFX_DivBypass]>],
+                                 [17, 1, 1], // Latency=4..35, Repeat= 4..35
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMFFS,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<8, [E500_FPU_0]>],
+                                 [11], // Latency = 8
+                                 [E500_FPR_Bypass]>,
+  InstrItinData<IIC_IntMTFSB0,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<8, [E500_FPU_0]>],
+                                 [11, 1, 1], // Latency = 8
+                                 [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0]>],
+                                 [7, 1, 1], // Latency = 4, Repeat rate = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0]>],
+                                 [7, 1, 1], // Latency = 4, Repeat rate = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0]>],
+                                 [7, 1, 1], // Latency = 4, Repeat rate = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_SFX0]>],
+                                 [5, 1], // Latency = 2, Repeat rate = 2
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_BU]>],
+                                 [4, 1], // Latency = 1
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_BU]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_CR_Bypass,
+                                  E500_CR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_BU]>],
+                                 [4, 1], // Latency = 1
+                                 [E500_CR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_CR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3, Repeat rate = 1
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1, 1], // Latency = 3
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1, 1], // Latency = 3
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1, 1], // Latency = 4
+                                 [E500_FPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1, 1], // Latency = 4
+                                 [E500_FPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1, 1], // Latency = 4
+                                 [E500_FPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLMW,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1], // Latency = r+3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<3, [E500_LSU_0]>],
+                                 [6, 1, 1], // Latency = 3, Repeat rate = 3
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>]>,
+  InstrItinData<IIC_SprMFSR,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_SFX0]>],
+                                 [7, 1],
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_SFX0, E500_SFX1]>],
+                                 [5, 1], // Latency = 2, Repeat rate = 4
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSR,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0]>],
+                                 [5, 1],
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprTLBSYNC,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0], 0>]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<5, [E500_SFX0]>],
+                                 [8, 1],
+                                 [E500_GPR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<5, [E500_SFX0]>],
+                                 [8, 1],
+                                 [E500_GPR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_SFX0]>],
+                                 [7, 1], // Latency = 4, Repeat rate = 4
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1], // Latency = 1, Repeat rate = 1
+                                 [E500_GPR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_SFX0]>],
+                                 [7, 1], // Latency = 4, Repeat rate = 4
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1], // Latency = 1, Repeat rate = 1
+                                 [E500_CR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSRIN,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0]>],
+                                 [4, 1],
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_FPU_0]>],
+                                 [11, 1, 1], // Latency = 8, Repeat rate = 2 
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_FPU_0]>],
+                                 [13, 1, 1], // Latency = 10, Repeat rate = 4 
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_FPU_0]>],
+                                 [11, 1, 1], // Latency = 8, Repeat rate = 2
+                                 [E500_CR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<68, [E500_FPU_0]>],
+                                 [71, 1, 1], // Latency = 68, Repeat rate = 68
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<38, [E500_FPU_0]>],
+                                 [41, 1, 1], // Latency = 38, Repeat rate = 38
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_FPU_0]>],
+                                 [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass,
+                                  E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<38, [E500_FPU_0]>],
+                                 [41, 1], // Latency = 38, Repeat rate = 38
+                                 [E500_FPR_Bypass, E500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE500mcItineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 0000000..de097d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,374 @@
+//===-- PPCScheduleE500mc.td - e5500 Scheduling Defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit 
+// Power processor.
+// 
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E5500_DIS0 : FuncUnit;
+def E5500_DIS1 : FuncUnit;
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    The CFX has a bypass path, allowing non-divide instructions to execute 
+//    while a divide instruction is being executed.
+def E5500_SFX0  : FuncUnit; // Simple unit 0
+def E5500_SFX1  : FuncUnit; // Simple unit 1
+def E5500_BU    : FuncUnit; // Branch unit
+def E5500_CFX_DivBypass 
+                : FuncUnit; // CFX divide bypass path
+def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1 
+
+def E5500_LSU_0 : FuncUnit; // LSU pipeline
+def E5500_FPU_0 : FuncUnit; // FPU pipeline
+
+def E5500_GPR_Bypass : Bypass;
+def E5500_FPR_Bypass : Bypass;
+def E5500_CR_Bypass  : Bypass;
+
+def PPCE5500Itineraries : ProcessorItineraries<
+  [E5500_DIS0, E5500_DIS1, E5500_SFX0, E5500_SFX1, E5500_BU,
+   E5500_CFX_DivBypass, E5500_CFX_0, E5500_CFX_1,
+   E5500_LSU_0, E5500_FPU_0],
+  [E5500_CR_Bypass, E5500_GPR_Bypass, E5500_FPR_Bypass], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [6, 2, 2], // Latency = 1 or 2
+                                 [E5500_CR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivD,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<26, [E5500_CFX_DivBypass]>],
+                                 [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<16, [E5500_CFX_DivBypass]>],
+                                 [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMFFS,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11], // Latency = 7, Repeat rate = 1
+                                 [E5500_FPR_Bypass]>,
+  InstrItinData<IIC_IntMTFSB0,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<7, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 7
+                                 [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IIC_IntMulHD,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<2, [E5500_CFX_1]>],
+                                 [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<1, [E5500_CFX_1]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<1, [E5500_CFX_1]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<2, [E5500_CFX_1]>],
+                                 [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotateD,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+                                 [6, 2, 2], // Latency = 2, Repeat rate = 2
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotateDI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+                                 [6, 2, 2], // Latency = 2, Repeat rate = 2
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_SFX0]>],
+                                 [6, 2], // Latency = 2, Repeat rate = 2
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_BU]>],
+                                 [5, 2], // Latency = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_BU]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_CR_Bypass,
+                                  E5500_CR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_BU]>],
+                                 [5, 2], // Latency = 1
+                                 [E5500_CR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_CR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLD,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLDARX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<3, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 3
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLDU,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLDUX,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLMW,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<4, [E5500_LSU_0]>],
+                                 [8, 2], // Latency = r+3, Repeat rate = r+3
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<3, [E5500_LSU_0]>],
+                                 [7, 2, 2], // Latency = 3, Repeat rate = 3
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTD,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDCX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_CFX_0]>],
+                                 [6, 2], // Latency = 2, Repeat rate = 4
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprTLBSYNC,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0], 0>]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<5, [E5500_CFX_0]>],
+                                 [9, 2], // Latency = 5, Repeat rate = 5
+                                 [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<5, [E5500_CFX_0]>],
+                                 [9, 2], // Latency = 5, Repeat rate = 5
+                                 [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<4, [E5500_SFX0]>],
+                                 [8, 2], // Latency = 4, Repeat rate = 4
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0]>],
+                                 [5], // Latency = 1, Repeat rate = 1
+                                 [E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<4, [E5500_CFX_0]>],
+                                 [8, 2], // Latency = 4, Repeat rate = 4
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5], // Latency = 1, Repeat rate = 1
+                                 [E5500_GPR_Bypass]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 1 
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 1 
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 1
+                                 [E5500_CR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<31, [E5500_FPU_0]>],
+                                 [39, 2, 2], // Latency = 35, Repeat rate = 31
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<16, [E5500_FPU_0]>],
+                                 [24, 2, 2], // Latency = 20, Repeat rate = 16 
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_FPU_0]>],
+                                 [12, 2], // Latency = 8, Repeat rate = 2
+                                 [E5500_FPR_Bypass, E5500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE5500Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 0000000..21efd8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,80 @@
+//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G3_BPU    : FuncUnit; // Branch unit
+def G3_SLU    : FuncUnit; // Store/load unit
+def G3_SRU    : FuncUnit; // special register unit
+def G3_IU1    : FuncUnit; // integer unit 1 (simple)
+def G3_IU2    : FuncUnit; // integer unit 2 (complex)
+def G3_FPU1   : FuncUnit; // floating point unit 1
+
+def G3Itineraries : ProcessorItineraries<
+  [G3_IU1, G3_IU2, G3_FPU1, G3_BPU, G3_SRU, G3_SLU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<19, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<3, [G3_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<6, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G3_IU1]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G3_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_LdStDCBA    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStStore   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStLMW     , [InstrStage<34, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<8, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTSRIN   , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprRFI      , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<31, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<17, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<2, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<10, [G3_FPU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 0000000..340773e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,96 @@
+//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4_BPU    : FuncUnit; // Branch unit
+def G4_SLU    : FuncUnit; // Store/load unit
+def G4_SRU    : FuncUnit; // special register unit
+def G4_IU1    : FuncUnit; // integer unit 1 (simple)
+def G4_IU2    : FuncUnit; // integer unit 2 (complex)
+def G4_FPU1   : FuncUnit; // floating point unit 1
+def G4_VPU    : FuncUnit; // vector permutation unit
+def G4_VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def G4_VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def G4_VFPU   : FuncUnit; // vector floating point unit
+
+def G4Itineraries : ProcessorItineraries<
+  [G4_IU1, G4_IU2, G4_SLU, G4_SRU, G4_BPU, G4_FPU1,
+   G4_VIU1, G4_VIU2, G4_VPU, G4_VFPU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<19, [G4_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<3, [G4_FPU1]>]>,
+  InstrItinData<IIC_IntMFVSCR   , [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<3, [G4_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G4_IU1]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<6, [G4_IU1]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G4_IU1]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G4_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStDSS     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<2, [G4_SLU]>]>, 
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<2, [G4_SLU]>]>, 
+  InstrItinData<IIC_LdStLMW     , [InstrStage<34, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<5, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<8, [G4_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<8, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTSRIN   , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprRFI      , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<31, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<17, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<10, [G4_FPU1]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<4, [G4_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<3, [G4_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<1, [G4_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<4, [G4_VFPU]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<1, [G4_VIU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 0000000..1d9f13f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,112 @@
+//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4P_BPU    : FuncUnit; // Branch unit
+def G4P_SLU    : FuncUnit; // Store/load unit
+def G4P_SRU    : FuncUnit; // special register unit
+def G4P_IU1    : FuncUnit; // integer unit 1 (simple)
+def G4P_IU2    : FuncUnit; // integer unit 2 (complex)
+def G4P_IU3    : FuncUnit; // integer unit 3 (simple)
+def G4P_IU4    : FuncUnit; // integer unit 4 (simple)
+def G4P_FPU1   : FuncUnit; // floating point unit 1
+def G4P_VPU    : FuncUnit; // vector permutation unit
+def G4P_VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def G4P_VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def G4P_VFPU   : FuncUnit; // vector floating point unit
+
+def G4PlusItineraries : ProcessorItineraries<
+  [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4, G4P_BPU, G4P_SLU, G4P_FPU1,
+   G4P_VFPU, G4P_VIU1, G4P_VIU2, G4P_VPU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<23, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_IntMFVSCR   , [InstrStage<2, [G4P_VFPU]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<2, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G4P_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStDSS     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G4P_IU2]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<4, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<4, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<4, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStLMW     , [InstrStage<37, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<35, [G4P_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<0, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_SprMFSR     , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<3, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<5, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTSRIN   , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprRFI      , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<0, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<5, [G4P_FPU1]>]>,  
+  InstrItinData<IIC_FPCompare   , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<35, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<21, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<14, [G4P_FPU1]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [G4P_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<4, [G4P_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G4P_VFPU]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<4, [G4P_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<2, [G4P_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<4, [G4P_VIU1]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<2, [G4P_VPU]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<2, [G4P_VPU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 0000000..a3b73ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,129 @@
+//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G5_BPU    : FuncUnit; // Branch unit
+def G5_SLU    : FuncUnit; // Store/load unit
+def G5_SRU    : FuncUnit; // special register unit
+def G5_IU1    : FuncUnit; // integer unit 1 (simple)
+def G5_IU2    : FuncUnit; // integer unit 2 (complex)
+def G5_FPU1   : FuncUnit; // floating point unit 1
+def G5_FPU2   : FuncUnit; // floating point unit 2
+def G5_VPU    : FuncUnit; // vector permutation unit
+def G5_VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def G5_VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def G5_VFPU   : FuncUnit; // vector floating point unit
+
+def G5Itineraries : ProcessorItineraries<
+  [G5_IU1, G5_IU2, G5_SLU, G5_BPU, G5_FPU1, G5_FPU2,
+   G5_VFPU, G5_VIU1, G5_VIU2, G5_VPU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<3, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntDivD     , [InstrStage<68, [G5_IU1]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<36, [G5_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<6, [G5_IU2]>]>,
+  InstrItinData<IIC_IntMFVSCR   , [InstrStage<1, [G5_VFPU]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_IntMulHD    , [InstrStage<7, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntRFID     , [InstrStage<1, [G5_IU2]>]>,
+  InstrItinData<IIC_IntRotateD  , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>,  
+  InstrItinData<IIC_IntRotate   , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G5_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<4, [G5_BPU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<2, [G5_BPU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<3, [G5_BPU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,  
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,  
+  InstrItinData<IIC_LdStStore   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,  
+  InstrItinData<IIC_LdStDSS     , [InstrStage<10, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<40, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<4, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<4, [G5_SLU]>]>,  
+  InstrItinData<IIC_LdStLD      , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLDU     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLDUX    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLDARX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<5, [G5_SLU]>]>,  
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<5, [G5_SLU]>]>,  
+  InstrItinData<IIC_LdStLMW     , [InstrStage<64, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSLBIA   , [InstrStage<40, [G5_SLU]>]>, // needs work
+  InstrItinData<IIC_LdStSLBIE   , [InstrStage<2, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<35, [G5_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<40, [G5_SLU]>]>, // needs work
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<2, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<2, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<3, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<10, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<8, [G5_IU2]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<1, [G5_IU2]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<8, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<2, [G5_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<8, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<5, [G5_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<3, [G5_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<8, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<2, [G5_VIU1]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<3, [G5_VPU]>]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// G5 machine model for scheduling and other instruction cost heuristics.
+
+def G5Model : SchedMachineModel {
+  let IssueWidth = 4;  // 4 (non-branch) instructions are dispatched per cycle.
+  let MinLatency = 0;  // Out-of-order dispatch.
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  let Itineraries = G5Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
new file mode 100644
index 0000000..d3e4269
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
@@ -0,0 +1,385 @@
+//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// IBM POWER7 multicore server processor
+// B. Sinharoy, et al.
+// IBM J. Res. & Dev. (55) 3. May/June 2011.
+
+// Scheduling for the P7 involves tracking two types of resources:
+//  1. The dispatch bundle slots
+//  2. The functional unit resources
+
+// Dispatch units:
+def P7_DU1    : FuncUnit;
+def P7_DU2    : FuncUnit;
+def P7_DU3    : FuncUnit;
+def P7_DU4    : FuncUnit;
+def P7_DU5    : FuncUnit;
+def P7_DU6    : FuncUnit;
+
+def P7_LS1    : FuncUnit; // Load/Store pipeline 1
+def P7_LS2    : FuncUnit; // Load/Store pipeline 2
+
+def P7_FX1    : FuncUnit; // FX pipeline 1
+def P7_FX2    : FuncUnit; // FX pipeline 2
+
+// VS pipeline 1 (vector integer ops. always here)
+def P7_VS1    : FuncUnit; // VS pipeline 1
+// VS pipeline 2 (128-bit stores and perms. here)
+def P7_VS2    : FuncUnit; // VS pipeline 2
+
+def P7_CRU    : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P7_BRU    : FuncUnit; // BR unit
+
+// Notes:
+// Each LSU pipeline can also execute FX add and logical instructions.
+// Each LSU pipeline can complete a load or store in one cycle.
+//
+// Each store is broken into two parts, AGEN goes to the LSU while a
+// "data steering" op. goes to the FXU or VSU.
+//
+// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
+// VSU loads have a three cycle load-to-use latency (so two "bubble" cycle).
+//
+// Frequent FX ops. take only one cycle and results can be used again in the
+// next cycle (there is a self-bypass). Getting results from the other FX
+// pipeline takes an additional cycle.
+//
+// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
+// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
+// Dispatch of an instruction to VS1 that uses four single prec. inputs
+// (either to a float or XC op). prevents dispatch in that cycle to VS2 of any
+// floating point instruction.
+//
+// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
+// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
+// (unlike on the POWER6).
+//
+// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
+// share the same write-back, and have a 5-cycle latency difference, so the
+// IFU/IDU will not dispatch an XS instructon 5 cycles after a vector FP
+// op. has been dispatched to VS1.
+//
+// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
+//
+// Instruction dispatch groups have (at most) four non-branch instructions, and
+// two branches. Unlike on the POWER4/5, a branch does not automatically
+// end the dispatch group, but a second branch must be the last in the group.
+
+def P7Itineraries : ProcessorItineraries<
+  [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
+   P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2,
+                                                  P7_LS1, P7_LS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  // FIXME: Add record-form itinerary data.
+  InstrItinData<IIC_IntDivW     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<36, [P7_FX1, P7_FX2]>],
+                                  [36, 1, 1]>,
+  InstrItinData<IIC_IntDivD     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<68, [P7_FX1, P7_FX2]>],
+                                  [68, 1, 1]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntRotateD  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_CRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 2, 1, 1]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLD      , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLDU     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 2, 1, 1]>,
+  InstrItinData<IIC_LdStLDUX    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 4, 1, 1]>,
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 4, 1, 1]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLWARX,    [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLDARX,    [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLMW     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_VS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_CRU]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 1]>, // mtcr
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_CRU]>],
+                                  [6, 1]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_CRU]>],
+                                  [3, 1]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_FX1]>],
+                                  [4, 1]>, // mtctr
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [8, 1, 1]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [33, 1, 1]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [27, 1, 1]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [44, 1, 1]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [32, 1, 1]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1, 1]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [7, 1, 1]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
+                                   InstrStage<1, [P7_VS2]>],
+                                  [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P7 machine model for scheduling and other instruction cost heuristics.
+
+def P7Model : SchedMachineModel {
+  let IssueWidth = 6;  // 4 (non-branch) instructions are dispatched per cycle.
+                       // Note that the dispatch bundle size is 6 (including
+                       // branches), but the total internal issue bandwidth per
+                       // cycle (from all queues) is 8.
+
+  let MinLatency = 0;  // Out-of-order dispatch.
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  let Itineraries = P7Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
new file mode 100644
index 0000000..dc16742
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
@@ -0,0 +1,22 @@
+//===-- PPCSelectionDAGInfo.cpp - PowerPC SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "powerpc-selectiondag-info"
+
+PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL)
+    : TargetSelectionDAGInfo(DL) {}
+
+PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
new file mode 100644
index 0000000..b2e7f3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- PPCSelectionDAGInfo.h - PowerPC SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCCSELECTIONDAGINFO_H
+#define POWERPCCSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class PPCTargetMachine;
+
+class PPCSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit PPCSelectionDAGInfo(const DataLayout *DL);
+  ~PPCSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 0000000..b51512d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,277 @@
+//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "PPCGenSubtargetInfo.inc"
+
+/// Return the datalayout string of a subtarget.
+static std::string getDataLayoutString(const PPCSubtarget &ST) {
+  const Triple &T = ST.getTargetTriple();
+
+  std::string Ret;
+
+  // Most PPC* platforms are big endian, PPC64LE is little endian.
+  if (ST.isLittleEndian())
+    Ret = "e";
+  else
+    Ret = "E";
+
+  Ret += DataLayout::getManglingComponent(T);
+
+  // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+  // pointers.
+  if (!ST.isPPC64() || T.getOS() == Triple::Lv2)
+    Ret += "-p:32:32";
+
+  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+  // documentation are wrong; these are correct (i.e. "what gcc does").
+  if (ST.isPPC64() || ST.isSVR4ABI())
+    Ret += "-i64:64";
+  else
+    Ret += "-f64:32:64";
+
+  // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
+  if (ST.isPPC64())
+    Ret += "-n32:64";
+  else
+    Ret += "-n32";
+
+  return Ret;
+}
+
+PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                            StringRef FS) {
+  initializeEnvironment();
+  resetSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, PPCTargetMachine &TM,
+                           bool is64Bit, CodeGenOpt::Level OptLevel)
+    : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT),
+      OptLevel(OptLevel),
+      FrameLowering(initializeSubtargetDependencies(CPU, FS)),
+      DL(getDataLayoutString(*this)), InstrInfo(*this), JITInfo(*this),
+      TLInfo(TM), TSInfo(&DL) {}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void PPCSubtarget::SetJITMode() {
+  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
+  // everything is.  This matters for PPC64, which codegens in PIC mode without
+  // stubs.
+  HasLazyResolverStubs = false;
+
+  // Calls to external functions need to use indirect calls
+  IsJITCodeModel = true;
+}
+
+void PPCSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                           "target-cpu");
+  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                          "target-features");
+  std::string CPU =
+    !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
+  std::string FS =
+    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+  if (!FS.empty()) {
+    initializeEnvironment();
+    resetSubtargetFeatures(CPU, FS);
+  }
+}
+
+void PPCSubtarget::initializeEnvironment() {
+  StackAlignment = 16;
+  DarwinDirective = PPC::DIR_NONE;
+  HasMFOCRF = false;
+  Has64BitSupport = false;
+  Use64BitRegs = false;
+  UseCRBits = false;
+  HasAltivec = false;
+  HasQPX = false;
+  HasVSX = false;
+  HasFCPSGN = false;
+  HasFSQRT = false;
+  HasFRE = false;
+  HasFRES = false;
+  HasFRSQRTE = false;
+  HasFRSQRTES = false;
+  HasRecipPrec = false;
+  HasSTFIWX = false;
+  HasLFIWAX = false;
+  HasFPRND = false;
+  HasFPCVT = false;
+  HasISEL = false;
+  HasPOPCNTD = false;
+  HasLDBRX = false;
+  IsBookE = false;
+  DeprecatedMFTB = false;
+  DeprecatedDST = false;
+  HasLazyResolverStubs = false;
+  IsJITCodeModel = false;
+}
+
+void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+#if (defined(__APPLE__) || defined(__linux__)) && \
+    (defined(__ppc__) || defined(__powerpc__))
+  if (CPUName == "generic")
+    CPUName = sys::getHostCPUName();
+#endif
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // Make sure 64-bit features are available when CPUname is generic
+  std::string FullFS = FS;
+
+  // If we are generating code for ppc64, verify that options make sense.
+  if (IsPPC64) {
+    Has64BitSupport = true;
+    // Silently force 64-bit register use on ppc64.
+    Use64BitRegs = true;
+    if (!FullFS.empty())
+      FullFS = "+64bit," + FullFS;
+    else
+      FullFS = "+64bit";
+  }
+
+  // At -O2 and above, track CR bits as individual registers.
+  if (OptLevel >= CodeGenOpt::Default) {
+    if (!FullFS.empty())
+      FullFS = "+crbits," + FullFS;
+    else
+      FullFS = "+crbits";
+  }
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FullFS);
+
+  // If the user requested use of 64-bit regs, but the cpu selected doesn't
+  // support it, ignore.
+  if (use64BitRegs() && !has64BitSupport())
+    Use64BitRegs = false;
+
+  // Set up darwin-specific properties.
+  if (isDarwin())
+    HasLazyResolverStubs = true;
+
+  // QPX requires a 32-byte aligned stack. Note that we need to do this if
+  // we're compiling for a BG/Q system regardless of whether or not QPX
+  // is enabled because external functions will assume this alignment.
+  if (hasQPX() || isBGQ())
+    StackAlignment = 32;
+
+  // Determine endianness.
+  IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
+
+  // FIXME: For now, we disable VSX in little-endian mode until endian
+  // issues in those instructions can be addressed.
+  if (IsLittleEndian)
+    HasVSX = false;
+}
+
+/// hasLazyResolverStub - Return true if accesses to the specified global have
+/// to go through a dyld lazy resolution stub.  This means that an extra load
+/// is required to get the address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV,
+                                       const TargetMachine &TM) const {
+  // We never have stubs if HasLazyResolverStubs=false or if in static mode.
+  if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
+    return false;
+  // If symbol visibility is hidden, the extra load is not needed if
+  // the symbol is definitely defined in the current translation unit.
+  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+  if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage())
+    return false;
+  return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+         GV->hasCommonLinkage() || isDecl;
+}
+
+// Embedded cores need aggressive scheduling (and some others also benefit).
+static bool needsAggressiveScheduling(unsigned Directive) {
+  switch (Directive) {
+  default: return false;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+    return true;
+  }
+}
+
+bool PPCSubtarget::enableMachineScheduler() const {
+  // Enable MI scheduling for the embedded cores.
+  // FIXME: Enable this for all cores (some additional modeling
+  // may be necessary).
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+bool PPCSubtarget::enablePostMachineScheduler() const { return true; }
+
+PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const {
+  return TargetSubtargetInfo::ANTIDEP_ALL;
+}
+
+void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(isPPC64() ?
+                            &PPC::G8RCRegClass : &PPC::GPRCRegClass);
+}
+
+void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                       MachineInstr *begin,
+                                       MachineInstr *end,
+                                       unsigned NumRegionInstrs) const {
+  if (needsAggressiveScheduling(DarwinDirective)) {
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+
+  // Spilling is generally expensive on all PPC cores, so always enable
+  // register-pressure tracking.
+  Policy.ShouldTrackPressure = true;
+}
+
+bool PPCSubtarget::useAA() const {
+  // Use AA during code generation for the embedded cores.
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 0000000..a3cedaf
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,251 @@
+//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCSUBTARGET_H
+#define POWERPCSUBTARGET_H
+
+#include "PPCFrameLowering.h"
+#include "PPCInstrInfo.h"
+#include "PPCISelLowering.h"
+#include "PPCJITInfo.h"
+#include "PPCSelectionDAGInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "PPCGenSubtargetInfo.inc"
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+class StringRef;
+
+namespace PPC {
+  // -m directive values.
+  enum {
+    DIR_NONE,
+    DIR_32,
+    DIR_440,
+    DIR_601,
+    DIR_602,
+    DIR_603,
+    DIR_7400,
+    DIR_750,
+    DIR_970,
+    DIR_A2,
+    DIR_E500mc,
+    DIR_E5500,
+    DIR_PWR3,
+    DIR_PWR4,
+    DIR_PWR5,
+    DIR_PWR5X,
+    DIR_PWR6,
+    DIR_PWR6X,
+    DIR_PWR7,
+    DIR_PWR8,
+    DIR_64
+  };
+}
+
+class GlobalValue;
+class TargetMachine;
+
+class PPCSubtarget : public PPCGenSubtargetInfo {
+protected:
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned StackAlignment;
+
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+
+  /// Which cpu directive was used.
+  unsigned DarwinDirective;
+
+  /// Used by the ISel to turn in optimizations for POWER4-derived architectures
+  bool HasMFOCRF;
+  bool Has64BitSupport;
+  bool Use64BitRegs;
+  bool UseCRBits;
+  bool IsPPC64;
+  bool HasAltivec;
+  bool HasQPX;
+  bool HasVSX;
+  bool HasFCPSGN;
+  bool HasFSQRT;
+  bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
+  bool HasRecipPrec;
+  bool HasSTFIWX;
+  bool HasLFIWAX;
+  bool HasFPRND;
+  bool HasFPCVT;
+  bool HasISEL;
+  bool HasPOPCNTD;
+  bool HasLDBRX;
+  bool IsBookE;
+  bool DeprecatedMFTB;
+  bool DeprecatedDST;
+  bool HasLazyResolverStubs;
+  bool IsJITCodeModel;
+  bool IsLittleEndian;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// OptLevel - What default optimization level we're emitting code for.
+  CodeGenOpt::Level OptLevel;
+
+  PPCFrameLowering FrameLowering;
+  const DataLayout DL;
+  PPCInstrInfo InstrInfo;
+  PPCJITInfo JITInfo;
+  PPCTargetLowering TLInfo;
+  PPCSelectionDAGInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  PPCSubtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS, PPCTargetMachine &TM, bool is64Bit,
+               CodeGenOpt::Level OptLevel);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// SetJITMode - This is called to inform the subtarget info that we are
+  /// producing code for the JIT.
+  void SetJITMode();
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return StackAlignment; }
+
+  /// getDarwinDirective - Returns the -m directive specified for the cpu.
+  ///
+  unsigned getDarwinDirective() const { return DarwinDirective; }
+
+  /// getInstrItins - Return the instruction itineraries based on subtarget
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  const PPCFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  PPCJITInfo *getJITInfo() { return &JITInfo; }
+  const PPCTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const PPCSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+
+  /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+  /// so that we can use initializer lists for subtarget initialization.
+  PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  /// \brief Reset the features for the PowerPC target.
+  void resetSubtargetFeatures(const MachineFunction *MF) override;
+private:
+  void initializeEnvironment();
+  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+
+public:
+  /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+  ///
+  bool isPPC64() const { return IsPPC64; }
+
+  /// has64BitSupport - Return true if the selected CPU supports 64-bit
+  /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+  bool has64BitSupport() const { return Has64BitSupport; }
+
+  /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+  /// registers in 32-bit mode when possible.  This can only true if
+  /// has64BitSupport() returns true.
+  bool use64BitRegs() const { return Use64BitRegs; }
+
+  /// useCRBits - Return true if we should store and manipulate i1 values in
+  /// the individual condition register bits.
+  bool useCRBits() const { return UseCRBits; }
+
+  /// hasLazyResolverStub - Return true if accesses to the specified global have
+  /// to go through a dyld lazy resolution stub.  This means that an extra load
+  /// is required to get the address of the global.
+  bool hasLazyResolverStub(const GlobalValue *GV,
+                           const TargetMachine &TM) const;
+
+  // isJITCodeModel - True if we're generating code for the JIT
+  bool isJITCodeModel() const { return IsJITCodeModel; }
+
+  // isLittleEndian - True if generating little-endian code
+  bool isLittleEndian() const { return IsLittleEndian; }
+
+  // Specific obvious features.
+  bool hasFCPSGN() const { return HasFCPSGN; }
+  bool hasFSQRT() const { return HasFSQRT; }
+  bool hasFRE() const { return HasFRE; }
+  bool hasFRES() const { return HasFRES; }
+  bool hasFRSQRTE() const { return HasFRSQRTE; }
+  bool hasFRSQRTES() const { return HasFRSQRTES; }
+  bool hasRecipPrec() const { return HasRecipPrec; }
+  bool hasSTFIWX() const { return HasSTFIWX; }
+  bool hasLFIWAX() const { return HasLFIWAX; }
+  bool hasFPRND() const { return HasFPRND; }
+  bool hasFPCVT() const { return HasFPCVT; }
+  bool hasAltivec() const { return HasAltivec; }
+  bool hasQPX() const { return HasQPX; }
+  bool hasVSX() const { return HasVSX; }
+  bool hasMFOCRF() const { return HasMFOCRF; }
+  bool hasISEL() const { return HasISEL; }
+  bool hasPOPCNTD() const { return HasPOPCNTD; }
+  bool hasLDBRX() const { return HasLDBRX; }
+  bool isBookE() const { return IsBookE; }
+  bool isDeprecatedMFTB() const { return DeprecatedMFTB; }
+  bool isDeprecatedDST() const { return DeprecatedDST; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  /// isDarwin - True if this is any darwin platform.
+  bool isDarwin() const { return TargetTriple.isMacOSX(); }
+  /// isBGQ - True if this is a BG/Q platform.
+  bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool isDarwinABI() const { return isDarwin(); }
+  bool isSVR4ABI() const { return !isDarwin(); }
+  /// FIXME: Should use a command-line option.
+  bool isELFv2ABI() const { return isPPC64() && isSVR4ABI() &&
+                                   isLittleEndian(); }
+
+  bool enableEarlyIfConversion() const override { return hasISEL(); }
+
+  // Scheduling customization.
+  bool enableMachineScheduler() const override;
+  // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostMachineScheduler() const override;
+  AntiDepBreakMode getAntiDepBreakMode() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin,
+                           MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
+  bool useAA() const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 0000000..9563b90
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,170 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::
+opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
+                        cl::desc("Disable CTR loops for PPC"));
+
+static cl::opt<bool>
+VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
+  cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
+
+extern "C" void LLVMInitializePowerPCTarget() {
+  // Register the targets
+  RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
+  RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
+  RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
+}
+
+PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                                   StringRef FS, const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL, bool is64Bit)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this, is64Bit, OL) {
+  initAsmInfo();
+}
+
+void PPC32TargetMachine::anchor() { }
+
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
+}
+
+void PPC64TargetMachine::anchor() { }
+
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU,  StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// PPC Code Generator Pass Configuration Options.
+class PPCPassConfig : public TargetPassConfig {
+public:
+  PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  PPCTargetMachine &getPPCTargetMachine() const {
+    return getTM<PPCTargetMachine>();
+  }
+
+  const PPCSubtarget &getPPCSubtarget() const {
+    return *getPPCTargetMachine().getSubtargetImpl();
+  }
+
+  bool addPreISel() override;
+  bool addILPOpts() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new PPCPassConfig(this, PM);
+}
+
+bool PPCPassConfig::addPreISel() {
+  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoops(getPPCTargetMachine()));
+
+  return false;
+}
+
+bool PPCPassConfig::addILPOpts() {
+  addPass(&EarlyIfConverterID);
+  return true;
+}
+
+bool PPCPassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createPPCISelDag(getPPCTargetMachine()));
+
+#ifndef NDEBUG
+  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoopsVerify());
+#endif
+
+  addPass(createPPCVSXCopyPass());
+  return false;
+}
+
+bool PPCPassConfig::addPreRegAlloc() {
+  initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+  insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+             &PPCVSXFMAMutateID);
+  return false;
+}
+
+bool PPCPassConfig::addPreSched2() {
+  addPass(createPPCVSXCopyCleanupPass());
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&IfConverterID);
+
+  return true;
+}
+
+bool PPCPassConfig::addPreEmitPass() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCEarlyReturnPass());
+  // Must run branch selection immediately preceding the asm printer.
+  addPass(createPPCBranchSelectionPass());
+  return false;
+}
+
+bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      JITCodeEmitter &JCE) {
+  // Inform the subtarget that we are in JIT mode.  FIXME: does this break macho
+  // writing?
+  Subtarget.SetJITMode();
+
+  // Machine code emitter pass for PowerPC.
+  PM.add(createPPCJITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
+
+void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our PPC pass. This
+  // allows the PPC pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createPPCTargetTransformInfoPass(this));
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 0000000..4c7029c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,93 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_TARGETMACHINE_H
+#define PPC_TARGETMACHINE_H
+
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+  PPCSubtarget        Subtarget;
+
+public:
+  PPCTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL, bool is64Bit);
+
+  const PPCInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const PPCFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  PPCJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
+  const PPCTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const PPCSelectionDAGInfo* getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  const PPCRegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const PPCSubtarget  *getSubtargetImpl() const override { return &Subtarget; }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &getSubtargetImpl()->getInstrItineraryData();
+  }
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool addCodeEmitter(PassManagerBase &PM,
+                      JITCodeEmitter &JCE) override;
+
+  /// \brief Register PPC analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
+};
+
+/// PPC32TargetMachine - PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+  virtual void anchor();
+public:
+  PPC32TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+/// PPC64TargetMachine - PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+  virtual void anchor();
+public:
+  PPC64TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
new file mode 100644
index 0000000..2903cc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -0,0 +1,63 @@
+//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetObjectFile.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+void
+PPC64LinuxTargetObjectFile::
+Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
+    const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+    const TargetMachine &TM) const {
+  // Here override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI
+  // when we have a constant that contains global relocations.  This is
+  // necessary because of this ABI's handling of pointers to functions in
+  // a shared library.  The address of a function is actually the address
+  // of a function descriptor, which resides in the .opd section.  Generated
+  // code uses the descriptor directly rather than going via the GOT as some
+  // other ABIs do, which means that initialized function pointers must
+  // reference the descriptor.  The linker must convert copy relocs of
+  // pointers to functions in shared libraries into dynamic relocations,
+  // because of an ordering problem with initialization of copy relocs and
+  // PLT entries.  The dynamic relocation will be initialized by the dynamic
+  // linker, so we must use DataRelROSection instead of ReadOnlySection.
+  // For more information, see the description of ELIMINATE_COPY_RELOCS in
+  // GNU ld.
+  if (Kind.isReadOnly()) {
+    const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+
+    if (GVar && GVar->isConstant() &&
+        (GVar->getInitializer()->getRelocationInfo() ==
+         Constant::GlobalRelocations))
+      Kind = SectionKind::getReadOnlyWithRel();
+  }
+
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind,
+                                                             Mang, TM);
+}
+
+const MCExpr *PPC64LinuxTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  const MCExpr *Expr =
+    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext());
+  return MCBinaryExpr::CreateAdd(Expr,
+                                 MCConstantExpr::Create(0x8000, getContext()),
+                                 getContext());
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
new file mode 100644
index 0000000..3e71bbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -0,0 +1,35 @@
+//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+#define LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  /// PPC64LinuxTargetObjectFile - This implementation is used for
+  /// 64-bit PowerPC Linux.
+  class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                        SectionKind Kind, Mangler &Mang,
+                                        const TargetMachine &TM) const override;
+
+    /// \brief Describe a TLS variable address within debug info.
+    const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+  };
+
+}  // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 0000000..73fb691
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETSTREAMER_H
+#define PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+  PPCTargetStreamer(MCStreamer &S);
+  virtual ~PPCTargetStreamer();
+  virtual void emitTCEntry(const MCSymbol &S) = 0;
+  virtual void emitMachine(StringRef CPU) = 0;
+  virtual void emitAbiVersion(int AbiVersion) = 0;
+  virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) = 0;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
new file mode 100644
index 0000000..007901b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -0,0 +1,420 @@
+//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// PPC target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppctti"
+
+static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
+cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializePPCTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
+  const PPCSubtarget *ST;
+  const PPCTargetLowering *TLI;
+
+public:
+  PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  PPCTTI(const PPCTargetMachine *TM)
+      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializePPCTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() override {
+    pushTTIStack(this);
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+
+  virtual PopcntSupportKind
+  getPopcntSupport(unsigned TyWidth) const override;
+  virtual void getUnrollingPreferences(
+    Loop *L, UnrollingPreferences &UP) const override;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  virtual unsigned getNumberOfRegisters(bool Vector) const override;
+  virtual unsigned getRegisterBitWidth(bool Vector) const override;
+  virtual unsigned getMaximumUnrollFactor() const override;
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind,
+                                          OperandValueKind) const override;
+  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                                  int Index, Type *SubTp) const override;
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const override;
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const override;
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const override;
+  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const override;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
+                   "PPC Target Transform Info", true, true, false)
+char PPCTTI::ID = 0;
+
+ImmutablePass *
+llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
+  return new PPCTTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// PPC cost model.
+//
+//===----------------------------------------------------------------------===//
+
+PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (ST->hasPOPCNTD() && TyWidth <= 64)
+    return PSK_FastHardware;
+  return PSK_Software;
+}
+
+unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  if (DisablePPCConstHoist)
+    return TargetTransformInfo::getIntImmCost(Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  if (Imm == 0)
+    return TCC_Free;
+
+  if (Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TCC_Basic;
+
+    if (isInt<32>(Imm.getSExtValue())) {
+      // A constant that can be materialized using lis.
+      if ((Imm.getZExtValue() & 0xFFFF) == 0)
+        return TCC_Basic;
+
+      return 2 * TCC_Basic;
+    }
+  }
+
+  return 4 * TCC_Basic;
+}
+
+unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                               const APInt &Imm, Type *Ty) const {
+  if (DisablePPCConstHoist)
+    return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  switch (IID) {
+  default: return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
+      return TCC_Free;
+    break;
+  }
+  return PPCTTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                               Type *Ty) const {
+  if (DisablePPCConstHoist)
+    return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  unsigned ImmIdx = ~0U;
+  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
+       ZeroFree = false;
+  switch (Opcode) {
+  default: return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::And:
+    RunFree = true; // (for the rotate-and-mask instructions)
+    // Fallthrough...
+  case Instruction::Add:
+  case Instruction::Or:
+  case Instruction::Xor:
+    ShiftedFree = true;
+    // Fallthrough...
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    ImmIdx = 1;
+    break;
+  case Instruction::ICmp:
+    UnsignedFree = true;
+    ImmIdx = 1;
+    // Fallthrough... (zero comparisons can use record-form instructions)
+  case Instruction::Select:
+    ZeroFree = true;
+    break;
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Ret:
+  case Instruction::Load:
+  case Instruction::Store:
+    break;
+  }
+
+  if (ZeroFree && Imm == 0)
+    return TCC_Free;
+
+  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TCC_Free;
+
+    if (RunFree) {
+      if (Imm.getBitWidth() <= 32 &&
+          (isShiftedMask_32(Imm.getZExtValue()) ||
+           isShiftedMask_32(~Imm.getZExtValue())))
+        return TCC_Free;
+
+
+      if (ST->isPPC64() &&
+          (isShiftedMask_64(Imm.getZExtValue()) ||
+           isShiftedMask_64(~Imm.getZExtValue())))
+        return TCC_Free;
+    }
+
+    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
+      return TCC_Free;
+
+    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
+      return TCC_Free;
+  }
+
+  return PPCTTI::getIntImmCost(Imm, Ty);
+}
+
+void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (ST->getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 is in-order with a deep pipeline, and concatenation unrolling
+    // helps expose latency-hiding opportunities to the instruction scheduler.
+    UP.Partial = UP.Runtime = true;
+  }
+}
+
+unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
+  if (Vector && !ST->hasAltivec())
+    return 0;
+  return ST->hasVSX() ? 64 : 32;
+}
+
+unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
+  if (Vector) {
+    if (ST->hasAltivec()) return 128;
+    return 0;
+  }
+
+  if (ST->isPPC64())
+    return 64;
+  return 32;
+
+}
+
+unsigned PPCTTI::getMaximumUnrollFactor() const {
+  unsigned Directive = ST->getDarwinDirective();
+  // The 440 has no SIMD support, but floating-point instructions
+  // have a 5-cycle latency, so unroll by 5x for latency hiding.
+  if (Directive == PPC::DIR_440)
+    return 5;
+
+  // The A2 has no SIMD support, but floating-point instructions
+  // have a 6-cycle latency, so unroll by 6x for latency hiding.
+  if (Directive == PPC::DIR_A2)
+    return 6;
+
+  // FIXME: For lack of any better information, do no harm...
+  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
+    return 1;
+
+  // For most things, modern systems have two execution units (and
+  // out-of-order execution).
+  return 2;
+}
+
+unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                        OperandValueKind Op1Info,
+                                        OperandValueKind Op2Info) const {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  // Fallback to the default implementation.
+  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+                                                     Op2Info);
+}
+
+unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                    unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+    // Double-precision scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+  }
+
+  // Estimated cost of a load-hit-store delay.  This was obtained
+  // experimentally as a minimum needed to prevent unprofitable
+  // vectorization for the paq8p benchmark.  It may need to be
+  // raised further if other unprofitable cases remain.
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
+
+  // Vector element insert/extract with Altivec is very expensive,
+  // because they require store and reload with the attendant
+  // processor stall for load-hit-store.  Until VSX is available,
+  // these need to be estimated as very costly.
+  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
+      ISD == ISD::INSERT_VECTOR_ELT)
+    return LHSPenalty +
+      TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  unsigned Cost =
+    TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
+
+  // PPC in general does not support unaligned loads and stores. They'll need
+  // to be decomposed based on the alignment factor.
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
+    Cost += LT.first*(SrcBytes/Alignment-1);
+
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
+
+  return Cost;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
new file mode 100644
index 0000000..5727dbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::ThePPC32Target, llvm::ThePPC64Target, llvm::ThePPC64LETarget;
+
+extern "C" void LLVMInitializePowerPCTargetInfo() { 
+  RegisterTarget<Triple::ppc, /*HasJIT=*/true>
+    X(ThePPC32Target, "ppc32", "PowerPC 32");
+
+  RegisterTarget<Triple::ppc64, /*HasJIT=*/true>
+    Y(ThePPC64Target, "ppc64", "PowerPC 64");
+
+  RegisterTarget<Triple::ppc64le, /*HasJIT=*/true>
+    Z(ThePPC64LETarget, "ppc64le", "PowerPC 64 LE");
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.h b/contrib/llvm/lib/Target/R600/AMDGPU.h
new file mode 100644
index 0000000..d7e94f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.h
@@ -0,0 +1,130 @@
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_H
+#define AMDGPU_H
+
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUInstrPrinter;
+class AMDGPUSubtarget;
+class AMDGPUTargetMachine;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
+
+// R600 Passes
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
+FunctionPass *createR600TextureIntrinsicsReplacer();
+FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600EmitClauseMarkers();
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
+FunctionPass *createR600Packetizer(TargetMachine &tm);
+FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass();
+
+// SI Passes
+FunctionPass *createSITypeRewriter();
+FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRLiveRangesPass();
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
+FunctionPass *createSIInsertWaits(TargetMachine &tm);
+
+void initializeSILowerI1CopiesPass(PassRegistry &);
+extern char &SILowerI1CopiesID;
+
+// Passes common to R600 and SI
+FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
+Pass *createAMDGPUStructurizeCFGPass();
+FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+
+/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
+ImmutablePass *
+createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
+
+void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
+extern char &SIFixSGPRLiveRangesID;
+
+
+extern Target TheAMDGPUTarget;
+
+namespace AMDGPU {
+enum TargetIndex {
+  TI_CONSTDATA_START
+};
+}
+
+#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
+
+} // End namespace llvm
+
+namespace ShaderType {
+  enum Type {
+    PIXEL = 0,
+    VERTEX = 1,
+    GEOMETRY = 2,
+    COMPUTE = 3
+  };
+}
+
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a separate piece of memory that is unique from other
+/// memory locations.
+namespace AMDGPUAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
+  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
+  FLAT_ADDRESS     = 4, ///< Address space for flat memory.
+  REGION_ADDRESS   = 5, ///< Address space for region memory.
+  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+
+  // Do not re-order the CONSTANT_BUFFER_* enums.  Several places depend on this
+  // order to be able to dynamically index a constant buffer, for example:
+  //
+  // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+  CONSTANT_BUFFER_0 = 8,
+  CONSTANT_BUFFER_1 = 9,
+  CONSTANT_BUFFER_2 = 10,
+  CONSTANT_BUFFER_3 = 11,
+  CONSTANT_BUFFER_4 = 12,
+  CONSTANT_BUFFER_5 = 13,
+  CONSTANT_BUFFER_6 = 14,
+  CONSTANT_BUFFER_7 = 15,
+  CONSTANT_BUFFER_8 = 16,
+  CONSTANT_BUFFER_9 = 17,
+  CONSTANT_BUFFER_10 = 18,
+  CONSTANT_BUFFER_11 = 19,
+  CONSTANT_BUFFER_12 = 20,
+  CONSTANT_BUFFER_13 = 21,
+  CONSTANT_BUFFER_14 = 22,
+  CONSTANT_BUFFER_15 = 23,
+  ADDRESS_NONE = 24, ///< Address space for unknown memory.
+  LAST_ADDRESS = ADDRESS_NONE
+};
+
+} // namespace AMDGPUAS
+
+#endif // AMDGPU_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.td b/contrib/llvm/lib/Target/R600/AMDGPU.td
new file mode 100644
index 0000000..5645f1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.td
@@ -0,0 +1,173 @@
+//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+// Debugging Features
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+        "DumpCode",
+        "true",
+        "Dump MachineInstrs in the CodeEmitter">;
+
+def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
+        "EnableIRStructurizer",
+        "false",
+        "Disable IR Structurizer">;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+        "EnablePromoteAlloca",
+        "true",
+        "Enable promote alloca pass">;
+
+// Target features
+
+def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
+        "EnableIfCvt",
+        "false",
+        "Disable the if conversion pass">;
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+        "FP64",
+        "true",
+        "Enable double precision operations">;
+
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+        "FP64Denormals",
+        "true",
+        "Enable double precision denormal handling",
+        [FeatureFP64]>;
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+        "FP32Denormals",
+        "true",
+        "Enable single precision denormal handling">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+        "Is64bit",
+        "true",
+        "Specify if 64-bit addressing should be used">;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+        "R600ALUInst",
+        "false",
+        "Older version of ALU instructions encoding">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+        "HasVertexCache",
+        "true",
+        "Specify use of dedicated vertex cache">;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+        "CaymanISA",
+        "true",
+        "Use Cayman ISA">;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+        "CFALUBug",
+        "true",
+        "GPU has CF_ALU bug">;
+
+class SubtargetFeatureFetchLimit <string Value> :
+                          SubtargetFeature <"fetch"#Value,
+        "TexVTXClauseSize",
+        Value,
+        "Limit the maximum number of fetches in a clause to "#Value>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+        "wavefrontsize"#Value,
+        "WavefrontSize",
+        !cast<string>(Value),
+        "The number of threads per wavefront">;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+        "localmemorysize"#Value,
+        "LocalMemorySize",
+        !cast<string>(Value),
+        "The size of local memory in bytes">;
+
+class SubtargetFeatureGeneration <string Value,
+                                  list<SubtargetFeature> Implies> :
+        SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
+                          Value#" GPU generation", Implies>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+def FeatureR600 : SubtargetFeatureGeneration<"R600",
+        [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
+
+def FeatureR700 : SubtargetFeatureGeneration<"R700",
+        [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
+
+def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
+        [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
+
+def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+        [FeatureFetchLimit16, FeatureWavefrontSize64,
+         FeatureLocalMemorySize32768]
+>;
+
+def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
+         FeatureWavefrontSize64]>;
+
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+         FeatureWavefrontSize64]>;
+//===----------------------------------------------------------------------===//
+
+def AMDGPUInstrInfo : InstrInfo {
+  let guessInstructionProperties = 1;
+}
+
+def AMDGPU : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AMDGPUInstrInfo;
+}
+
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Predicate helper class
+//===----------------------------------------------------------------------===//
+
+class PredicateControl {
+  Predicate SubtargetPredicate;
+  list<Predicate> OtherPredicates = [];
+  list<Predicate> Predicates = !listconcat([SubtargetPredicate],
+                                            OtherPredicates);
+}
+
+// Include AMDGPU TD files
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
new file mode 100644
index 0000000..73faaa1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -0,0 +1,415 @@
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer  --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
+/// code.  When passed an MCAsmStreamer it prints assembly and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUAsmPrinter.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be override to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+//
+// FIXME: It seems some instructions do not support single precision denormals
+// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
+// and sin_f32, cos_f32 on most parts).
+
+// We want to use these instructions, and using fp32 denormals also causes
+// instructions to run at the double precision rate for the device so it's
+// probably best to just report no single precision denormals.
+static uint32_t getFPMode(const MachineFunction &F) {
+  const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
+  // TODO: Is there any real use for the flush in only / flush out only modes?
+
+  uint32_t FP32Denormals =
+    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+  uint32_t FP64Denormals =
+    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
+         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
+         FP_DENORM_MODE_SP(FP32Denormals) |
+         FP_DENORM_MODE_DP(FP64Denormals);
+}
+
+static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
+                                              MCStreamer &Streamer) {
+  return new AMDGPUAsmPrinter(tm, Streamer);
+}
+
+extern "C" void LLVMInitializeR600AsmPrinter() {
+  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
+}
+
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) {
+  DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
+}
+
+void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+
+  // This label is used to mark the end of the .text section.
+  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+  OutStreamer.SwitchSection(TLOF.getTextSection());
+  MCSymbol *EndOfTextLabel =
+      OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+  OutStreamer.EmitLabel(EndOfTextLabel);
+}
+
+bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+
+  OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':'));
+
+  MCContext &Context = getObjFileLowering().getContext();
+  const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
+                                              ELF::SHT_PROGBITS, 0,
+                                              SectionKind::getReadOnly());
+  OutStreamer.SwitchSection(ConfigSection);
+
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  SIProgramInfo KernelInfo;
+  if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    getSIProgramInfo(KernelInfo, MF);
+    EmitProgramInfoSI(MF, KernelInfo);
+  } else {
+    EmitProgramInfoR600(MF);
+  }
+
+  DisasmLines.clear();
+  HexLines.clear();
+  DisasmLineMaxLen = 0;
+
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+  EmitFunctionBody();
+
+  if (isVerbose()) {
+    const MCSectionELF *CommentSection
+      = Context.getELFSection(".AMDGPU.csdata",
+                              ELF::SHT_PROGBITS, 0,
+                              SectionKind::getReadOnly());
+    OutStreamer.SwitchSection(CommentSection);
+
+    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      OutStreamer.emitRawComment(" Kernel info:", false);
+      OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+                                 false);
+      OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
+                                 false);
+      OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
+                                 false);
+      OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+                                 false);
+      OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+                                 false);
+      OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+                                 false);
+    } else {
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      OutStreamer.emitRawComment(
+        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
+    }
+  }
+
+  if (STM.dumpCode()) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    MF.dump();
+#endif
+
+    if (DisasmEnabled) {
+      OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
+                                                  ELF::SHT_NOTE, 0,
+                                                  SectionKind::getReadOnly()));
+
+      for (size_t i = 0; i < DisasmLines.size(); ++i) {
+        std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+        Comment += " ; " + HexLines[i] + "\n";
+
+        OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+        OutStreamer.EmitBytes(StringRef(Comment));
+      }
+    }
+  }
+
+  return false;
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+  unsigned MaxGPR = 0;
+  bool killPixel = false;
+  const R600RegisterInfo *RI
+    = static_cast<const R600RegisterInfo*>(TM.getRegisterInfo());
+  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::KILLGT)
+        killPixel = true;
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        const MachineOperand &MO = MI.getOperand(op_idx);
+        if (!MO.isReg())
+          continue;
+        unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
+
+        // Register with value > 127 aren't GPR
+        if (HWReg > 127)
+          continue;
+        MaxGPR = std::max(MaxGPR, HWReg);
+      }
+    }
+  }
+
+  unsigned RsrcReg;
+  if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+    // Evergreen / Northern Islands
+    switch (MFI->getShaderType()) {
+    default: // Fall through
+    case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+    case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+    case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+    case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+    }
+  } else {
+    // R600 / R700
+    switch (MFI->getShaderType()) {
+    default: // Fall through
+    case ShaderType::GEOMETRY: // Fall through
+    case ShaderType::COMPUTE:  // Fall through
+    case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+    case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+    }
+  }
+
+  OutStreamer.EmitIntValue(RsrcReg, 4);
+  OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+                           S_STACK_SIZE(MFI->StackSize), 4);
+  OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+  OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+  if (MFI->getShaderType() == ShaderType::COMPUTE) {
+    OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+    OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+  }
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+                                        const MachineFunction &MF) const {
+  uint64_t CodeSize = 0;
+  unsigned MaxSGPR = 0;
+  unsigned MaxVGPR = 0;
+  bool VCCUsed = false;
+  const SIRegisterInfo *RI
+    = static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      // TODO: CodeSize should account for multiple functions.
+      CodeSize += MI.getDesc().Size;
+
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        const MachineOperand &MO = MI.getOperand(op_idx);
+        unsigned width = 0;
+        bool isSGPR = false;
+
+        if (!MO.isReg()) {
+          continue;
+        }
+        unsigned reg = MO.getReg();
+        if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
+	    reg == AMDGPU::VCC_HI) {
+          VCCUsed = true;
+          continue;
+        }
+
+        switch (reg) {
+        default: break;
+        case AMDGPU::SCC:
+        case AMDGPU::EXEC:
+        case AMDGPU::M0:
+          continue;
+        }
+
+        if (AMDGPU::SReg_32RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 1;
+        } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 1;
+        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 3;
+        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 4;
+        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 8;
+        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 16;
+        } else {
+          llvm_unreachable("Unknown register class");
+        }
+        unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
+        unsigned maxUsed = hwReg + width - 1;
+        if (isSGPR) {
+          MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
+        } else {
+          MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
+        }
+      }
+    }
+  }
+
+  if (VCCUsed)
+    MaxSGPR += 2;
+
+  ProgInfo.NumVGPR = MaxVGPR;
+  ProgInfo.NumSGPR = MaxSGPR;
+
+  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+  // register.
+  ProgInfo.FloatMode = getFPMode(MF);
+
+  // XXX: Not quite sure what this does, but sc seems to unset this.
+  ProgInfo.IEEEMode = 0;
+
+  // Do not clamp NAN to 0.
+  ProgInfo.DX10Clamp = 0;
+
+  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+  ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+
+  ProgInfo.CodeLen = CodeSize;
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+                                         const SIProgramInfo &KernelInfo) {
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  unsigned RsrcReg;
+  switch (MFI->getShaderType()) {
+  default: // Fall through
+  case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
+  case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
+  case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
+  case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
+  }
+
+  unsigned LDSAlignShift;
+  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+    // LDS is allocated in 64 dword blocks.
+    LDSAlignShift = 8;
+  } else {
+    // LDS is allocated in 128 dword blocks.
+    LDSAlignShift = 9;
+  }
+
+  unsigned LDSBlocks =
+    RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+
+  // Scratch is allocated in 256 dword blocks.
+  unsigned ScratchAlignShift = 10;
+  // We need to program the hardware with the amount of scratch memory that
+  // is used by the entire wave.  KernelInfo.ScratchSize is the amount of
+  // scratch memory used per thread.
+  unsigned ScratchBlocks =
+    RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
+                       1 << ScratchAlignShift) >> ScratchAlignShift;
+
+  if (MFI->getShaderType() == ShaderType::COMPUTE) {
+    OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+
+    const uint32_t ComputePGMRSrc1 =
+      S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
+      S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
+      S_00B848_PRIORITY(KernelInfo.Priority) |
+      S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
+      S_00B848_PRIV(KernelInfo.Priv) |
+      S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
+      S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
+      S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
+
+    OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+
+    OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
+    const uint32_t ComputePGMRSrc2 =
+      S_00B84C_LDS_SIZE(LDSBlocks) |
+      S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
+
+    OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
+
+    OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
+    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
+  } else {
+    OutStreamer.EmitIntValue(RsrcReg, 4);
+    OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
+                             S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
+  }
+
+  if (MFI->getShaderType() == ShaderType::PIXEL) {
+    OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
+    OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+    OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
new file mode 100644
index 0000000..19907cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -0,0 +1,85 @@
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_ASMPRINTER_H
+#define AMDGPU_ASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include <vector>
+
+namespace llvm {
+
+class AMDGPUAsmPrinter : public AsmPrinter {
+private:
+  struct SIProgramInfo {
+    SIProgramInfo() :
+      NumVGPR(0),
+      NumSGPR(0),
+      Priority(0),
+      FloatMode(0),
+      Priv(0),
+      DX10Clamp(0),
+      DebugMode(0),
+      IEEEMode(0),
+      ScratchSize(0),
+      CodeLen(0) {}
+
+    // Fields set in PGM_RSRC1 pm4 packet.
+    uint32_t NumVGPR;
+    uint32_t NumSGPR;
+    uint32_t Priority;
+    uint32_t FloatMode;
+    uint32_t Priv;
+    uint32_t DX10Clamp;
+    uint32_t DebugMode;
+    uint32_t IEEEMode;
+    uint32_t ScratchSize;
+
+    // Bonus information for debugging.
+    uint64_t CodeLen;
+  };
+
+  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  void findNumUsedRegistersSI(const MachineFunction &MF,
+                              unsigned &NumSGPR,
+                              unsigned &NumVGPR) const;
+
+  /// \brief Emit register usage information so that the GPU driver
+  /// can correctly setup the GPU state.
+  void EmitProgramInfoR600(const MachineFunction &MF);
+  void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+
+public:
+  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AMDGPU Assembly Printer";
+  }
+
+  /// Implemented in AMDGPUMCInstLower.cpp
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void EmitEndOfAsmFile(Module &M) override;
+
+protected:
+  bool DisasmEnabled;
+  std::vector<std::string> DisasmLines, HexLines;
+  size_t DisasmLineMaxLen;
+};
+
+} // End anonymous llvm
+
+#endif //AMDGPU_ASMPRINTER_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td
new file mode 100644
index 0000000..3586c88
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td
@@ -0,0 +1,74 @@
+//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+def CC_SI : CallingConv<[
+
+  CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+  ]>>>,
+
+  CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+  >>>,
+
+  CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+  ]>>>,
+
+  CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+  >>>
+
+]>;
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+  CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+    T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+    T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+    T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+    T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+    T30_XYZW, T31_XYZW, T32_XYZW
+  ]>>>
+]>;
+
+// Calling convention for compute kernels
+def CC_AMDGPU_Kernel : CallingConv<[
+  CCCustom<"allocateStack">
+]>;
+
+def CC_AMDGPU : CallingConv<[
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= "
+       "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+       "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
+       "getShaderType() == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() < "
+       "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+       "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->"
+       "getShaderType() == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+       ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+       ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>>
+]>;
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
new file mode 100644
index 0000000..9e8302e
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -0,0 +1,113 @@
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+    int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
+
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
+
+  // XXX: Hardcoding to 1 for now.
+  //
+  // I think the StackWidth should stored as metadata associated with the
+  // MachineFunction.  This metadata can either be added by a frontend, or
+  // calculated by a R600 specific LLVM IR pass.
+  //
+  // The StackWidth determines how stack objects are laid out in memory.
+  // For a vector stack variable, like: int4 stack[2], the data will be stored
+  // in the following ways depending on the StackWidth.
+  //
+  // StackWidth = 1:
+  //
+  // T0.X = stack[0].x
+  // T1.X = stack[0].y
+  // T2.X = stack[0].z
+  // T3.X = stack[0].w
+  // T4.X = stack[1].x
+  // T5.X = stack[1].y
+  // T6.X = stack[1].z
+  // T7.X = stack[1].w
+  //
+  // StackWidth = 2:
+  //
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T1.X = stack[0].z
+  // T1.Y = stack[0].w
+  // T2.X = stack[1].x
+  // T2.Y = stack[1].y
+  // T3.X = stack[1].z
+  // T3.Y = stack[1].w
+  // 
+  // StackWidth = 4:
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T0.Z = stack[0].z
+  // T0.W = stack[0].w
+  // T1.X = stack[1].x
+  // T1.Y = stack[1].y
+  // T1.Z = stack[1].z
+  // T1.W = stack[1].w
+  return 1;
+}
+
+/// \returns The number of registers allocated for \p FI.
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Start the offset at 2 so we don't overwrite work group information.
+  // XXX: We should only do this when the shader actually uses this
+  // information.
+  unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
+  int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
+
+  for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
+    OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
+    OffsetBytes += MFI->getObjectSize(i);
+    // Each register holds 4 bytes, so we must always align the offset to at
+    // least 4 bytes, so that 2 frame objects won't share the same register.
+    OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
+  }
+
+  if (FI != -1)
+    OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
+
+  return OffsetBytes / (getStackWidth(MF) * 4);
+}
+
+const TargetFrameLowering::SpillSlot *
+AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  NumEntries = 0;
+  return nullptr;
+}
+void
+AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
+}
+void
+AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
+                                  MachineBasicBlock &MBB) const {
+}
+
+bool
+AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
new file mode 100644
index 0000000..d18ede5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
@@ -0,0 +1,45 @@
+//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe a layout of a stack frame on a AMDIL target
+/// machine.
+//
+//===----------------------------------------------------------------------===//
+#ifndef AMDILFRAME_LOWERING_H
+#define AMDILFRAME_LOWERING_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// \brief Information about the stack frame layout on the AMDGPU targets.
+///
+/// It holds the direction of the stack growth, the known stack alignment on
+/// entry to each function, and the offset to the locals area.
+/// See TargetFrameInfo for more comments.
+class AMDGPUFrameLowering : public TargetFrameLowering {
+public:
+  AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+                      unsigned TransAl = 1);
+  virtual ~AMDGPUFrameLowering();
+
+  /// \returns The number of 32-bit sub-registers that are used when storing
+  /// values to the stack.
+  unsigned getStackWidth(const MachineFunction &MF) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  const SpillSlot *
+    getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+};
+} // namespace llvm
+#endif // AMDILFRAME_LOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
new file mode 100644
index 0000000..cc17b7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -0,0 +1,906 @@
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "SIDefines.h"
+#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// AMDGPU specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  const AMDGPUSubtarget &Subtarget;
+public:
+  AMDGPUDAGToDAGISel(TargetMachine &TM);
+  virtual ~AMDGPUDAGToDAGISel();
+
+  SDNode *Select(SDNode *N) override;
+  const char *getPassName() const override;
+  void PostprocessISelDAG() override;
+
+private:
+  bool isInlineImmediate(SDNode *N) const;
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+  bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+                   const R600InstrInfo *TII);
+  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+  bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+
+  // Complex pattern selectors
+  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
+  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+
+  static bool checkType(const Value *ptr, unsigned int addrspace);
+  static bool checkPrivateAddress(const MachineMemOperand *Op);
+
+  static bool isGlobalStore(const StoreSDNode *N);
+  static bool isPrivateStore(const StoreSDNode *N);
+  static bool isLocalStore(const StoreSDNode *N);
+  static bool isRegionStore(const StoreSDNode *N);
+
+  bool isCPLoad(const LoadSDNode *N) const;
+  bool isConstantLoad(const LoadSDNode *N, int cbID) const;
+  bool isGlobalLoad(const LoadSDNode *N) const;
+  bool isParamLoad(const LoadSDNode *N) const;
+  bool isPrivateLoad(const LoadSDNode *N) const;
+  bool isLocalLoad(const LoadSDNode *N) const;
+  bool isRegionLoad(const LoadSDNode *N) const;
+
+  /// \returns True if the current basic block being selected is at control
+  ///          flow depth 0.  Meaning that the current block dominates the
+  //           exit block.
+  bool isCFDepth0() const;
+
+  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
+  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+                                       SDValue& Offset);
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
+                         SDValue &ImmOffset) const;
+  bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+                          SDValue &SOffset, SDValue &ImmOffset) const;
+  bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                         SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+                         SDValue &Idxen, SDValue &GLC, SDValue &SLC,
+                         SDValue &TFE) const;
+
+  SDNode *SelectADD_SUB_I64(SDNode *N);
+  SDNode *SelectDIV_SCALE(SDNode *N);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
+  return new AMDGPUDAGToDAGISel(TM);
+}
+
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
+  : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
+}
+
+AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
+}
+
+bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
+  const SITargetLowering *TL
+      = static_cast<const SITargetLowering *>(getTargetLowering());
+  return TL->analyzeImmediate(N) == 0;
+}
+
+/// \brief Determine the register class for \p OpNo
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \OpNo or NULL if the register class cannot be
+/// determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+                                                          unsigned OpNo) const {
+  if (!N->isMachineOpcode())
+    return nullptr;
+
+  switch (N->getMachineOpcode()) {
+  default: {
+    const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
+    unsigned OpIdx = Desc.getNumDefs() + OpNo;
+    if (OpIdx >= Desc.getNumOperands())
+      return nullptr;
+    int RegClass = Desc.OpInfo[OpIdx].RegClass;
+    if (RegClass == -1)
+      return nullptr;
+
+    return TM.getRegisterInfo()->getRegClass(RegClass);
+  }
+  case AMDGPU::REG_SEQUENCE: {
+    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID);
+
+    SDValue SubRegOp = N->getOperand(OpNo + 1);
+    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
+    return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
+  }
+  }
+}
+
+SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
+  return CurDAG->getTargetConstant(Imm, MVT::i32);
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRParam(
+  SDValue Addr, SDValue& R1, SDValue& R2) {
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+  return SelectADDRParam(Addr, R1, R2);
+}
+
+
+bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i64);
+  }
+  return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return nullptr;   // Already selected.
+  }
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+  switch (Opc) {
+  default: break;
+  // We are selecting i64 ADD here instead of custom lower it during
+  // DAG legalization, so we can fold some i64 ADDs used for address
+  // calculation into the LOAD and STORE instructions.
+  case ISD::ADD:
+  case ISD::SUB: {
+    if (N->getValueType(0) != MVT::i64 ||
+        ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    return SelectADD_SUB_I64(N);
+  }
+  case ISD::SCALAR_TO_VECTOR:
+  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+  case ISD::BUILD_VECTOR: {
+    unsigned RegClassID;
+    const AMDGPURegisterInfo *TRI =
+                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+    const SIRegisterInfo *SIRI =
+                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    assert(EltVT.bitsEq(MVT::i32));
+    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      bool UseVReg = true;
+      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                    U != E; ++U) {
+        if (!U->isMachineOpcode()) {
+          continue;
+        }
+        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+        if (!RC) {
+          continue;
+        }
+        if (SIRI->isSGPRClass(RC)) {
+          UseVReg = false;
+        }
+      }
+      switch(NumVectorElts) {
+      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+                                     AMDGPU::SReg_32RegClassID;
+        break;
+      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+                                     AMDGPU::SReg_64RegClassID;
+        break;
+      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+                                     AMDGPU::SReg_128RegClassID;
+        break;
+      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+                                     AMDGPU::SReg_256RegClassID;
+        break;
+      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+                                      AMDGPU::SReg_512RegClassID;
+        break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
+    } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+      // that adds a 128 bits reg copy when going through TwoAddressInstructions
+      // pass. We want to avoid 128 bits copies as much as possible because they
+      // can't be bundled by our scheduler.
+      switch(NumVectorElts) {
+      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+      case 4:
+        if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+          RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+        else
+          RegClassID = AMDGPU::R600_Reg128RegClassID;
+        break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
+    }
+
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+    if (NumVectorElts == 1) {
+      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
+                                  N->getOperand(0), RegClass);
+    }
+
+    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+                                  "supported yet");
+    // 16 = Max Num Vector Elements
+    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+    // 1 = Vector Register Class
+    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+    bool IsRegSeq = true;
+    unsigned NOps = N->getNumOperands();
+    for (unsigned i = 0; i < NOps; i++) {
+      // XXX: Why is this here?
+      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
+        IsRegSeq = false;
+        break;
+      }
+      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i) + 1] =
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+    }
+
+    if (NOps != NumVectorElts) {
+      // Fill in the missing undef elements if this was a scalar_to_vector.
+      assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+      MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                     SDLoc(N), EltVT);
+      for (unsigned i = NOps; i < NumVectorElts; ++i) {
+        RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+        RegSeqArgs[1 + (2 * i) + 1] =
+          CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+      }
+    }
+
+    if (!IsRegSeq)
+      break;
+    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+                                RegSeqArgs);
+  }
+  case ISD::BUILD_PAIR: {
+    SDValue RC, SubReg0, SubReg1;
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+      break;
+    }
+    if (N->getValueType(0) == MVT::i128) {
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
+    } else if (N->getValueType(0) == MVT::i64) {
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+    } else {
+      llvm_unreachable("Unhandled value type for BUILD_PAIR");
+    }
+    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+                            N->getOperand(1), SubReg1 };
+    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+                                  SDLoc(N), N->getValueType(0), Ops);
+  }
+
+  case ISD::Constant:
+  case ISD::ConstantFP: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+      break;
+
+    uint64_t Imm;
+    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+    else {
+      ConstantSDNode *C = cast<ConstantSDNode>(N);
+      Imm = C->getZExtValue();
+    }
+
+    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+                                CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32));
+    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+                                CurDAG->getConstant(Imm >> 32, MVT::i32));
+    const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+    };
+
+    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N),
+                                  N->getValueType(0), Ops);
+  }
+
+  case AMDGPUISD::REGISTER_LOAD: {
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      break;
+    SDValue Addr, Offset;
+
+    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
+    const SDValue Ops[] = {
+      Addr,
+      Offset,
+      CurDAG->getTargetConstant(0, MVT::i32),
+      N->getOperand(0),
+    };
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
+                                  CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
+                                  Ops);
+  }
+  case AMDGPUISD::REGISTER_STORE: {
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      break;
+    SDValue Addr, Offset;
+    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
+    const SDValue Ops[] = {
+      N->getOperand(1),
+      Addr,
+      Offset,
+      CurDAG->getTargetConstant(0, MVT::i32),
+      N->getOperand(0),
+    };
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
+                                        CurDAG->getVTList(MVT::Other),
+                                        Ops);
+  }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    // There is a scalar version available, but unlike the vector version which
+    // has a separate operand for the offset and width, the scalar version packs
+    // the width and offset into a single operand. Try to move to the scalar
+    // version if the offsets are constant, so that we can try to keep extended
+    // loads of kernel arguments in SGPRs.
+
+    // TODO: Technically we could try to pattern match scalar bitshifts of
+    // dynamic values, but it's probably not useful.
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+    // Transformation function, pack the offset and width of a BFE into
+    // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+    // source, bits [5:0] contain the offset and bits [22:16] the width.
+
+    uint32_t OffsetVal = Offset->getZExtValue();
+    uint32_t WidthVal = Width->getZExtValue();
+
+    uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+    SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+    return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+                                  SDLoc(N),
+                                  MVT::i32,
+                                  N->getOperand(0),
+                                  PackedOffsetWidth);
+
+  }
+  case AMDGPUISD::DIV_SCALE: {
+    return SelectDIV_SCALE(N);
+  }
+  }
+  return SelectCode(N);
+}
+
+
+bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
+  assert(AS != 0 && "Use checkPrivateAddress instead.");
+  if (!Ptr)
+    return false;
+
+  return Ptr->getType()->getPointerAddressSpace() == AS;
+}
+
+bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
+  if (Op->getPseudoValue())
+    return true;
+
+  if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
+    return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+  const Value *MemVal = N->getMemOperand()->getValue();
+  return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+          !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+          !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
+}
+
+bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
+  const Value *MemVal = N->getMemOperand()->getValue();
+  if (CbId == -1)
+    return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
+
+  return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
+  if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getMemoryVT().bitsLT(MVT::i32)) {
+      return true;
+    }
+  }
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) const {
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) const {
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (checkPrivateAddress(N->getMemOperand())) {
+    if (MMO) {
+      const PseudoSourceValue *PSV = MMO->getPseudoValue();
+      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
+  if (checkPrivateAddress(N->getMemOperand())) {
+    // Check to make sure we are not a constant pool load or a constant load
+    // that is marked as a private load
+    if (isCPLoad(N) || isConstantLoad(N, -1)) {
+      return false;
+    }
+  }
+
+  const Value *MemVal = N->getMemOperand()->getValue();
+  if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isCFDepth0() const {
+  // FIXME: Figure out a way to use DominatorTree analysis here.
+  const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock();
+  const Function *Fn = FuncInfo->Fn;
+  return &Fn->front() == CurBlock || &Fn->back() == CurBlock;
+}
+
+
+const char *AMDGPUDAGToDAGISel::getPassName() const {
+  return "AMDGPU DAG->DAG Pattern Instruction Selection";
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
+
+//===----------------------------------------------------------------------===//
+// Complex Patterns
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+                                                         SDValue& IntPtr) {
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+    SDValue& BaseReg, SDValue &Offset) {
+  if (!isa<ConstantSDNode>(Addr)) {
+    BaseReg = Addr;
+    Offset = CurDAG->getIntPtrConstant(0, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) {
+  ConstantSDNode *IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+      return true;
+  // If the pointer address is constant, we can move it to the offset field.
+  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+             && isInt<16>(IMMOffset->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  SDLoc(CurDAG->getEntryNode()),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  ConstantSDNode *C;
+
+  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+  } else {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+
+  return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  bool IsAdd = (N->getOpcode() == ISD::ADD);
+
+  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+
+  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, LHS, Sub0);
+  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, LHS, Sub1);
+
+  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, RHS, Sub0);
+  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, RHS, Sub1);
+
+  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+  SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
+
+
+  unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+
+  if (!isCFDepth0()) {
+    Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32;
+    CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32;
+  }
+
+  SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs);
+  SDValue Carry(AddLo, 1);
+  SDNode *AddHi
+    = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32,
+                             SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
+
+  SDValue Args[5] = {
+    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+    SDValue(AddLo,0),
+    Sub0,
+    SDValue(AddHi,0),
+    Sub1,
+  };
+  return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+}
+
+SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(VT == MVT::f32 || VT == MVT::f64);
+
+  unsigned Opc
+    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+  const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+
+  SDValue Ops[] = {
+    N->getOperand(0),
+    N->getOperand(1),
+    N->getOperand(2),
+    Zero,
+    Zero,
+    Zero,
+    Zero
+  };
+
+  return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+  return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32,
+                                     Ptr), 0);
+}
+
+static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
+  return isUInt<12>(Imm->getZExtValue());
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
+                                           SDValue &Offset,
+                                           SDValue &ImmOffset) const {
+  SDLoc DL(Addr);
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    if (isLegalMUBUFImmOffset(C1)) {
+
+      if (N0.getOpcode() == ISD::ADD) {
+        // (add (add N2, N3), C1)
+        SDValue N2 = N0.getOperand(0);
+        SDValue N3 = N0.getOperand(1);
+        Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
+        Offset = N3;
+        ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+        return true;
+      }
+
+      // (add N0, C1)
+      Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));;
+      Offset = N0;
+      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+      return true;
+    }
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    // (add N0, N1)
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
+    Offset = N1;
+    ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+    return true;
+  }
+
+  // default case
+  Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
+  Offset = Addr;
+  ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+  return true;
+}
+
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+///        The TID (Thread ID) is multipled by the stride value (bits [61:48]
+///        of the resource descriptor) to create an offset, which is added to the
+///        resource ponter.
+static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+
+  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+                  0xffffffff;
+
+  SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+  SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+  SDValue DataLo = DAG->getTargetConstant(
+      Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
+  SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
+
+  const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
+  return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
+                                     MVT::v4i32, Ops), 0);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
+                                            SDValue &VAddr, SDValue &SOffset,
+                                            SDValue &ImmOffset) const {
+
+  SDLoc DL(Addr);
+  MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+
+  unsigned ScratchPtrReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+  unsigned ScratchOffsetReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
+  Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64));
+  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+
+  // (add n0, c1)
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    if (isLegalMUBUFImmOffset(C1)) {
+      VAddr = Addr.getOperand(0);
+      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+      return true;
+    }
+  }
+
+  // (add FI, n0)
+  if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+       isa<FrameIndexSDNode>(Addr.getOperand(0))) {
+    VAddr = Addr.getOperand(1);
+    ImmOffset = Addr.getOperand(0);
+    return true;
+  }
+
+  // (FI)
+  if (isa<FrameIndexSDNode>(Addr)) {
+    VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+                                          CurDAG->getConstant(0, MVT::i32)), 0);
+    ImmOffset = Addr;
+    return true;
+  }
+
+  // (node)
+  VAddr = Addr;
+  ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &SOffset,
+                                           SDValue &Offset, SDValue &Offen,
+                                           SDValue &Idxen, SDValue &GLC,
+                                           SDValue &SLC, SDValue &TFE) const {
+
+  GLC = CurDAG->getTargetConstant(0, MVT::i1);
+  SLC = CurDAG->getTargetConstant(0, MVT::i1);
+  TFE = CurDAG->getTargetConstant(0, MVT::i1);
+
+  Idxen = CurDAG->getTargetConstant(0, MVT::i1);
+  Offen = CurDAG->getTargetConstant(1, MVT::i1);
+
+  return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
+}
+
+void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+  const AMDGPUTargetLowering& Lowering =
+    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
+  bool IsModified = false;
+  do {
+    IsModified = false;
+    // Go over all selected nodes and try to fold them a bit more
+    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+         E = CurDAG->allnodes_end(); I != E; ++I) {
+
+      SDNode *Node = I;
+
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+      if (!MachineNode)
+        continue;
+
+      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+      if (ResNode != Node) {
+        ReplaceUses(Node, ResNode);
+        IsModified = true;
+      }
+    }
+    CurDAG->RemoveDeadNodes();
+  } while (IsModified);
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
new file mode 100644
index 0000000..5a46297b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -0,0 +1,2325 @@
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This is the parent TargetLowering class for hardware code gen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600MachineFunctionInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+  const Twine &Description;
+  const Function &Fn;
+
+  static int KindID;
+
+  static int getKindID() {
+    if (KindID == 0)
+      KindID = llvm::getNextAvailablePluginDiagnosticKind();
+    return KindID;
+  }
+
+public:
+  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+                          DiagnosticSeverity Severity = DS_Error)
+    : DiagnosticInfo(getKindID(), Severity),
+      Description(Desc),
+      Fn(Fn) { }
+
+  const Function &getFunction() const { return Fn; }
+  const Twine &getDescription() const { return Description; }
+
+  void print(DiagnosticPrinter &DP) const override {
+    DP << "unsupported " << getDescription() << " in " << Fn.getName();
+  }
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == getKindID();
+  }
+};
+
+int DiagnosticInfoUnsupported::KindID = 0;
+}
+
+
+static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
+                      CCValAssign::LocInfo LocInfo,
+                      ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
+                                        ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+  return true;
+}
+
+#include "AMDGPUGenCallingConv.inc"
+
+// Find a larger type to do a load / store of a vector with.
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
+  unsigned StoreSize = VT.getStoreSizeInBits();
+  if (StoreSize <= 32)
+    return EVT::getIntegerVT(Ctx, StoreSize);
+
+  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
+  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
+// Type for a vector that will be loaded to.
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
+  unsigned StoreSize = VT.getStoreSizeInBits();
+  if (StoreSize <= 32)
+    return EVT::getIntegerVT(Ctx, 32);
+
+  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
+  TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+
+  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
+
+  setOperationAction(ISD::Constant, MVT::i32, Legal);
+  setOperationAction(ISD::Constant, MVT::i64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  // We need to custom lower some of the intrinsics
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // Library functions.  These default to Expand, but we have instructions
+  // for them.
+  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
+  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
+  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
+  setOperationAction(ISD::FABS,   MVT::f32, Legal);
+  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+  setOperationAction(ISD::FROUND, MVT::f32, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+
+  // Lower floating point store/load to integer store/load to reduce the number
+  // of patterns in tablegen.
+  setOperationAction(ISD::STORE, MVT::f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
+  setOperationAction(ISD::STORE, MVT::i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
+  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+
+  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
+
+  // Custom lowering of vector stores is required for local address space
+  // stores.
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  // XXX: Native v2i32 local address space stores are possible, but not
+  // currently implemented.
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+
+  // XXX: This can be change to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
+  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
+  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+
+
+  setOperationAction(ISD::LOAD, MVT::f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
+  setOperationAction(ISD::LOAD, MVT::i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+
+  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+
+  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
+
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+    setOperationAction(ISD::FRINT, MVT::f64, Custom);
+    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+  }
+
+  if (!Subtarget->hasBFI()) {
+    // fcopysign can be done in a single instruction with BFI.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  }
+
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
+  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+  for (MVT VT : ScalarIntVTs) {
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+
+    // GPU does not have divrem function for signed or unsigned.
+    setOperationAction(ISD::SDIVREM, VT, Custom);
+    setOperationAction(ISD::UDIVREM, VT, Custom);
+
+    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+  }
+
+  if (!Subtarget->hasBCNT(32))
+    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+  if (!Subtarget->hasBCNT(64))
+    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+  // The hardware supports 32-bit ROTR, but not ROTL.
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
+  setOperationAction(ISD::MUL, MVT::i64, Expand);
+  setOperationAction(ISD::MULHU, MVT::i64, Expand);
+  setOperationAction(ISD::MULHS, MVT::i64, Expand);
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
+  if (!Subtarget->hasFFBH())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  if (!Subtarget->hasFFBL())
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  static const MVT::SimpleValueType VectorIntTypes[] = {
+    MVT::v2i32, MVT::v4i32
+  };
+
+  for (MVT VT : VectorIntTypes) {
+    // Expand the following operations for the current type by default.
+    setOperationAction(ISD::ADD,  VT, Expand);
+    setOperationAction(ISD::AND,  VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    setOperationAction(ISD::MUL,  VT, Expand);
+    setOperationAction(ISD::OR,   VT, Expand);
+    setOperationAction(ISD::SHL,  VT, Expand);
+    setOperationAction(ISD::SRA,  VT, Expand);
+    setOperationAction(ISD::SRL,  VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::SUB,  VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    // TODO: Implement custom UREM / SREM routines.
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Custom);
+    setOperationAction(ISD::UDIVREM, VT, Custom);
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::SUBE, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::XOR,  VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+  }
+
+  static const MVT::SimpleValueType FloatVectorTypes[] = {
+    MVT::v2f32, MVT::v4f32
+  };
+
+  for (MVT VT : FloatVectorTypes) {
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FEXP2, VT, Expand);
+    setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::FMA, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FNEARBYINT, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+    setOperationAction(ISD::FNEG, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+  }
+
+  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::STORE);
+
+  setSchedulingPreference(Sched::RegPressure);
+  setJumpIsExpensive(true);
+
+  setSelectIsExpensive(false);
+  PredictableSelectIsExpensive = false;
+
+  // There are no integer divide instructions, and these expand to a pretty
+  // large sequence of instructions.
+  setIntDivIsCheap(false);
+  setPow2DivIsCheap(false);
+
+  // TODO: Investigate this when 64-bit divides are implemented.
+  addBypassSlowDiv(64, 32);
+
+  // FIXME: Need to really handle these.
+  MaxStoresPerMemcpy  = 4096;
+  MaxStoresPerMemmove = 4096;
+  MaxStoresPerMemset  = 4096;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Information
+//===----------------------------------------------------------------------===//
+
+MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+  return MVT::i32;
+}
+
+bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
+  return true;
+}
+
+// The backend supports 32 and 64 bit floating point immediates.
+// FIXME: Why are we reporting vectors of FP immediates as legal?
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  EVT ScalarVT = VT.getScalarType();
+  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
+}
+
+// We don't want to shrink f64 / f32 constants.
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  EVT ScalarVT = VT.getScalarType();
+  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
+}
+
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+    return true;
+
+  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+  return ((LScalarSize <= CastScalarSize) ||
+          (CastScalarSize >= 32) ||
+          (LScalarSize < 32));
+}
+
+//===---------------------------------------------------------------------===//
+// Target Properties
+//===---------------------------------------------------------------------===//
+
+bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32;
+}
+
+bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32;
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
+  // Truncate is just accessing a subregister.
+  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
+  // Truncate is just accessing a subregister.
+  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
+         (Dest->getPrimitiveSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
+  const DataLayout *DL = getDataLayout();
+  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
+  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+
+  return SrcSize == 32 && DestSize == 64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
+  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
+  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
+  // this will enable reducing 64-bit operations the 32-bit, which is always
+  // good.
+  return Src == MVT::i32 && Dest == MVT::i64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  return isZExtFree(Val.getValueType(), VT2);
+}
+
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
+  // limited number of native 64-bit operations. Shrinking an operation to fit
+  // in a single 32-bit register should always be helpful. As currently used,
+  // this is much less general than the name suggests, and is only used in
+  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
+  // not profitable, and may actually be harmful.
+  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+}
+
+//===---------------------------------------------------------------------===//
+// TargetLowering Callbacks
+//===---------------------------------------------------------------------===//
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
+
+  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
+}
+
+SDValue AMDGPUTargetLowering::LowerReturn(
+                                     SDValue Chain,
+                                     CallingConv::ID CallConv,
+                                     bool isVarArg,
+                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     SDLoc DL, SelectionDAG &DAG) const {
+  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+}
+
+//===---------------------------------------------------------------------===//
+// Target specific lowering
+//===---------------------------------------------------------------------===//
+
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                        SmallVectorImpl<SDValue> &InVals) const {
+  SDValue Callee = CLI.Callee;
+  SelectionDAG &DAG = CLI.DAG;
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+  StringRef FuncName("<unknown>");
+
+  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
+    FuncName = G->getSymbol();
+  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    FuncName = G->getGlobal()->getName();
+
+  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+  DAG.getContext()->diagnose(NoCalls);
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    Op.getNode()->dump();
+    llvm_unreachable("Custom lowering code for this"
+                     "instruction is not implemented yet!");
+    break;
+  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SDIV: return LowerSDIV(Op, DAG);
+  case ISD::SREM: return LowerSREM(Op, DAG);
+  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
+  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
+  case ISD::FRINT: return LowerFRINT(Op, DAG);
+  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+  }
+  return Op;
+}
+
+void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
+                                              SmallVectorImpl<SDValue> &Results,
+                                              SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  case ISD::SIGN_EXTEND_INREG:
+    // Different parts of legalization seem to interpret which type of
+    // sign_extend_inreg is the one to check for custom lowering. The extended
+    // from type is what really matters, but some places check for custom
+    // lowering of the result type. This results in trying to use
+    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
+    // nothing here and let the illegal result integer be handled normally.
+    return;
+  case ISD::LOAD: {
+    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+    if (!Node)
+      return;
+
+    Results.push_back(SDValue(Node, 0));
+    Results.push_back(SDValue(Node, 1));
+    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
+    // function
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+    return;
+  }
+  case ISD::STORE: {
+    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
+    if (Lowered.getNode())
+      Results.push_back(Lowered);
+    return;
+  }
+  default:
+    return;
+  }
+}
+
+// FIXME: This implements accesses to initialized globals in the constant
+// address space by copying them to private and accessing that. It does not
+// properly handle illegal types or vectors. The private vector loads are not
+// scalarized, and the illegal scalars hit an assertion. This technique will not
+// work well with large initializers, and this should eventually be
+// removed. Initialized globals should be placed into a data section that the
+// runtime will load into a buffer before the kernel is executed. Uses of the
+// global need to be replaced with a pointer loaded from an implicit kernel
+// argument into this buffer holding the copy of the data, which will remove the
+// need for any of this.
+SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
+                                                       const GlobalValue *GV,
+                                                       const SDValue &InitPtr,
+                                                       SDValue Chain,
+                                                       SelectionDAG &DAG) const {
+  const DataLayout *TD = getTargetMachine().getDataLayout();
+  SDLoc DL(InitPtr);
+  Type *InitTy = Init->getType();
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
+    EVT VT = EVT::getEVT(InitTy);
+    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
+    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+                        TD->getPrefTypeAlignment(InitTy));
+  }
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+    EVT VT = EVT::getEVT(CFP->getType());
+    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
+    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
+                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+                 TD->getPrefTypeAlignment(CFP->getType()));
+  }
+
+  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
+    const StructLayout *SL = TD->getStructLayout(ST);
+
+    EVT PtrVT = InitPtr.getValueType();
+    SmallVector<SDValue, 8> Chains;
+
+    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
+      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
+      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+
+      Constant *Elt = Init->getAggregateElement(I);
+      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
+    }
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
+    EVT PtrVT = InitPtr.getValueType();
+
+    unsigned NumElements;
+    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
+      NumElements = AT->getNumElements();
+    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
+      NumElements = VT->getNumElements();
+    else
+      llvm_unreachable("Unexpected type");
+
+    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
+    SmallVector<SDValue, 8> Chains;
+    for (unsigned i = 0; i < NumElements; ++i) {
+      SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
+      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+
+      Constant *Elt = Init->getAggregateElement(i);
+      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
+    }
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (isa<UndefValue>(Init)) {
+    EVT VT = EVT::getEVT(InitTy);
+    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
+    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+                        TD->getPrefTypeAlignment(InitTy));
+  }
+
+  Init->dump();
+  llvm_unreachable("Unhandled constant initializer");
+}
+
+SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
+                                                 SDValue Op,
+                                                 SelectionDAG &DAG) const {
+
+  const DataLayout *TD = getTargetMachine().getDataLayout();
+  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = G->getGlobal();
+
+  switch (G->getAddressSpace()) {
+  default: llvm_unreachable("Global Address lowering not implemented for this "
+                            "address space");
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    // XXX: What does the value of G->getOffset() mean?
+    assert(G->getOffset() == 0 &&
+         "Do not know what to do with an non-zero offset");
+
+    unsigned Offset;
+    if (MFI->LocalMemoryObjects.count(GV) == 0) {
+      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+      Offset = MFI->LDSSize;
+      MFI->LocalMemoryObjects[GV] = Offset;
+      // XXX: Account for alignment?
+      MFI->LDSSize += Size;
+    } else {
+      Offset = MFI->LocalMemoryObjects[GV];
+    }
+
+    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+    Type *EltType = GV->getType()->getElementType();
+    unsigned Size = TD->getTypeAllocSize(EltType);
+    unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+
+    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
+    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+
+    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+    SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
+
+    const GlobalVariable *Var = cast<GlobalVariable>(GV);
+    if (!Var->hasInitializer()) {
+      // This has no use, but bugpoint will hit it.
+      return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
+    }
+
+    const Constant *Init = Var->getInitializer();
+    SmallVector<SDNode*, 8> WorkList;
+
+    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
+                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
+      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
+        continue;
+      WorkList.push_back(*I);
+    }
+    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
+    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
+                                           E = WorkList.end(); I != E; ++I) {
+      SmallVector<SDValue, 8> Ops;
+      Ops.push_back(Chain);
+      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
+        Ops.push_back((*I)->getOperand(i));
+      }
+      DAG.UpdateNodeOperands(*I, Ops);
+    }
+    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
+  }
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Args;
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  DAG.ExtractVectorElements(A, Args);
+  DAG.ExtractVectorElements(B, Args);
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Args;
+  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  EVT VT = Op.getValueType();
+  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
+                            VT.getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
+}
+
+SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL =
+   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
+                         Op.getValueType());
+}
+
+SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+    SelectionDAG &DAG) const {
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  switch (IntrinsicID) {
+    default: return Op;
+    case AMDGPUIntrinsic::AMDGPU_abs:
+    case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
+      return LowerIntrinsicIABS(Op, DAG);
+    case AMDGPUIntrinsic::AMDGPU_lrp:
+      return LowerIntrinsicLRP(Op, DAG);
+    case AMDGPUIntrinsic::AMDGPU_fract:
+    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
+      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_clamp:
+    case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
+      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case Intrinsic::AMDGPU_div_scale: {
+      // 3rd parameter required to be a constant.
+      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+      if (!Param)
+        return DAG.getUNDEF(VT);
+
+      // Translate to the operands expected by the machine instruction. The
+      // first parameter must be the same as the first instruction.
+      SDValue Numerator = Op.getOperand(1);
+      SDValue Denominator = Op.getOperand(2);
+      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
+                         Src0, Denominator, Numerator);
+    }
+
+    case Intrinsic::AMDGPU_div_fmas:
+      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case Intrinsic::AMDGPU_div_fixup:
+      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case Intrinsic::AMDGPU_trig_preop:
+      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2));
+
+    case Intrinsic::AMDGPU_rcp:
+      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+
+    case Intrinsic::AMDGPU_rsq:
+      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
+      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+
+    case Intrinsic::AMDGPU_rsq_clamped:
+      return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_imax:
+      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_umax:
+      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_imin:
+      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_umin:
+      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_umul24:
+      return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_imul24:
+      return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_umad24:
+      return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_imad24:
+      return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
+      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
+      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
+      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
+      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfi:
+      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfm:
+      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_brev:
+      return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
+      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
+      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
+      return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
+  }
+}
+
+///IABS(a) = SMAX(sub(0, a), a)
+SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                              Op.getOperand(1));
+
+  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
+}
+
+/// Linear Interpolation
+/// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
+SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
+                                DAG.getConstantFP(1.0f, MVT::f32),
+                                Op.getOperand(1));
+  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
+                                                    Op.getOperand(3));
+  return DAG.getNode(ISD::FADD, DL, VT,
+      DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
+      OneSubAC);
+}
+
+/// \brief Generate Min/Max node
+SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue True = N->getOperand(2);
+  SDValue False = N->getOperand(3);
+  SDValue CC = N->getOperand(4);
+
+  if (VT != MVT::f32 ||
+      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
+    return SDValue();
+  }
+
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  switch (CCOpcode) {
+  case ISD::SETOEQ:
+  case ISD::SETONE:
+  case ISD::SETUNE:
+  case ISD::SETNE:
+  case ISD::SETUEQ:
+  case ISD::SETEQ:
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2:
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:
+  case ISD::SETUO:
+  case ISD::SETO:
+    llvm_unreachable("Operation should already be optimised!");
+  case ISD::SETULE:
+  case ISD::SETULT:
+  case ISD::SETOLE:
+  case ISD::SETOLT:
+  case ISD::SETLE:
+  case ISD::SETLT: {
+    unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
+    return DAG.getNode(Opc, DL, VT, LHS, RHS);
+  }
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETUGE:
+  case ISD::SETOGE:
+  case ISD::SETUGT:
+  case ISD::SETOGT: {
+    unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
+    return DAG.getNode(Opc, DL, VT, LHS, RHS);
+  }
+  case ISD::SETCC_INVALID:
+    llvm_unreachable("Invalid setcc condcode!");
+  }
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
+                                              SelectionDAG &DAG) const {
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+  EVT LoadVT = Op.getValueType();
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  EVT PtrVT = Load->getBasePtr().getValueType();
+
+  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
+  SmallVector<SDValue, 8> Loads;
+  SmallVector<SDValue, 8> Chains;
+
+  SDLoc SL(Op);
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
+                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+
+    SDValue NewLoad
+      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+                       Load->getChain(), Ptr,
+                       MachinePointerInfo(Load->getMemOperand()->getValue()),
+                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+                       Load->getAlignment());
+    Loads.push_back(NewLoad.getValue(0));
+    Chains.push_back(NewLoad.getValue(1));
+  }
+
+  SDValue Ops[] = {
+    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
+    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
+  };
+
+  return DAG.getMergeValues(Ops, SL);
+}
+
+SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
+
+  // Byte stores are really expensive, so if possible, try to pack 32-bit vector
+  // truncating store into an i32 store.
+  // XXX: We could also handle optimize other vector bitwidths.
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  SDValue Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  EVT ElemVT = VT.getVectorElementType();
+  SDValue Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  unsigned PackedSize = MemVT.getStoreSizeInBits();
+  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);
+
+  assert(Value.getValueType().getScalarSizeInBits() >= 32);
+
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
+    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
+
+    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
+    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
+
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
+    }
+  }
+
+  if (PackedSize < 32) {
+    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
+    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
+                             Store->getMemOperand()->getPointerInfo(),
+                             PackedVT,
+                             Store->isNonTemporal(), Store->isVolatile(),
+                             Store->getAlignment());
+  }
+
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      Store->getMemOperand()->getPointerInfo(),
+                      Store->isVolatile(),  Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
+  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
+  EVT PtrVT = Store->getBasePtr().getValueType();
+  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
+  SDLoc SL(Op);
+
+  SmallVector<SDValue, 8> Chains;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                              Store->getValue(), DAG.getConstant(i, MVT::i32));
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
+                              Store->getBasePtr(),
+                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
+                                            PtrVT));
+    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+                         MachinePointerInfo(Store->getMemOperand()->getValue()),
+                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
+                         Store->getAlignment()));
+  }
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
+}
+
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+  EVT VT = Op.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+
+  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
+    // We can do the extload to 32-bits, and then need to separately extend to
+    // 64-bits.
+
+    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
+                                       Load->getChain(),
+                                       Load->getBasePtr(),
+                                       MemVT,
+                                       Load->getMemOperand());
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
+      ExtLoad32.getValue(1)
+    };
+
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
+    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
+    // FIXME: Copied from PPC
+    // First, load into 32 bits, then truncate to 1 bit.
+
+    SDValue Chain = Load->getChain();
+    SDValue BasePtr = Load->getBasePtr();
+    MachineMemOperand *MMO = Load->getMemOperand();
+
+    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+                                   BasePtr, MVT::i8, MMO);
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
+      NewLD.getValue(1)
+    };
+
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+      Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+    return SDValue();
+
+
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, MVT::i32));
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, MVT::i32));
+
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+  EVT MemEltVT = MemVT.getScalarType();
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+      Load->getChain()
+    };
+
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  SDValue Ops[] = {
+    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+    Load->getChain()
+  };
+
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Chain = Store->getChain();
+  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+      Store->getValue().getValueType().isVector()) {
+    return SplitVectorStore(Op, DAG);
+  }
+
+  EVT MemVT = Store->getMemoryVT();
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+      MemVT.bitsLT(MVT::i32)) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
+    SDValue BasePtr = Store->getBasePtr();
+    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+                              DAG.getConstant(2, MVT::i32));
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+                                  DAG.getConstant(0x3, MVT::i32));
+
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+
+    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                    Store->getValue());
+
+    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+
+    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
+                                  ShiftAmt);
+    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                          DAG.getConstant(0xffffffff, MVT::i32));
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+  }
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  MVT INTTY;
+  MVT FLTTY;
+  if (!OVT.isVector()) {
+    INTTY = MVT::i32;
+    FLTTY = MVT::f32;
+  } else if (OVT.getVectorNumElements() == 2) {
+    INTTY = MVT::v2i32;
+    FLTTY = MVT::v2f32;
+  } else if (OVT.getVectorNumElements() == 4) {
+    INTTY = MVT::v4i32;
+    FLTTY = MVT::v4f32;
+  }
+  unsigned bitsize = OVT.getScalarType().getSizeInBits();
+  // char|short jq = ia ^ ib;
+  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+
+  // jq = jq >> (bitsize - 2)
+  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
+
+  // jq = jq | 0x1
+  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+
+  // jq = (int)jq
+  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+
+  // int ia = (int)LHS;
+  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+  // int ib, (int)RHS;
+  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+  // float fa = (float)ia;
+  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+  // float fb = (float)ib;
+  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+  // float fq = native_divide(fa, fb);
+  SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
+                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));
+
+  // fq = trunc(fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+  // float fqneg = -fq;
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+
+  // float fr = mad(fqneg, fb, fa);
+  SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
+      DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
+
+  // int iq = (int)fq;
+  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+
+  // fr = fabs(fr);
+  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+
+  // fb = fabs(fb);
+  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+
+  // int cv = fr >= fb;
+  SDValue cv;
+  if (INTTY == MVT::i32) {
+    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  } else {
+    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  }
+  // jq = (cv ? jq : 0);
+  jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
+      DAG.getConstant(0, OVT));
+  // dst = iq + jq;
+  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
+  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
+  return iq;
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSDIV32 function generates equivalent to the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r0, r0, r1
+  // ixor r10, r10, r11
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSelectCC(DL,
+      r0, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, OVT),
+      DAG.getConstant(0, OVT),
+      ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSelectCC(DL,
+      r1, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, OVT),
+      DAG.getConstant(0, OVT),
+      ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r0, r0, r1
+  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // ixor r10, r10, r11
+  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+  return DST;
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT OVT = Op.getValueType().getScalarType();
+
+  if (OVT == MVT::i64)
+    return LowerSDIV64(Op, DAG);
+
+  if (OVT.getScalarType() == MVT::i32)
+    return LowerSDIV32(Op, DAG);
+
+  if (OVT == MVT::i16 || OVT == MVT::i8) {
+    // FIXME: We should be checking for the masked bits. This isn't reached
+    // because i8 and i16 are not legal types.
+    return LowerSDIV24(Op, DAG);
+  }
+
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSREM32 function generates equivalent to the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r20, r0, r1
+  // umul r20, r20, r1
+  // sub r0, r0, r20
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r20, r0, r1
+  SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
+
+  // umul r20, r20, r1
+  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
+
+  // sub r0, r0, r20
+  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+  return DST;
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
+  EVT OVT = Op.getValueType();
+
+  if (OVT.getScalarType() == MVT::i64)
+    return LowerSREM64(Op, DAG);
+
+  if (OVT.getScalarType() == MVT::i32)
+    return LowerSREM32(Op, DAG);
+
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
+  // RCP =  URECIP(Den) = 2^32 / Den + e
+  // e is rounding error.
+  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+  // RCP_LO = umulo(RCP, Den) */
+  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
+
+  // RCP_HI = mulhu (RCP, Den) */
+  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                                     RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                           NEG_RCP_LO, RCP_LO,
+                                           ISD::SETEQ);
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+  // RCP_S_E = RCP - E
+  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
+  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                     RCP_A_E, RCP_S_E,
+                                     ISD::SETEQ);
+  // Quotient = mulhu(Tmp0, Num)
+  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+                                                 DAG.getConstant(-1, VT),
+                                                 DAG.getConstant(0, VT),
+                                                 ISD::SETUGE);
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+                                                  Num_S_Remainder,
+                                                  DAG.getConstant(-1, VT),
+                                                  DAG.getConstant(0, VT),
+                                                  ISD::SETUGE);
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
+                                               Remainder_GE_Zero);
+
+  // Calculate Division result:
+
+  // Quotient_A_One = Quotient + 1
+  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Quotient_S_One = Quotient - 1
+  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                     Quotient, Quotient_A_One, ISD::SETEQ);
+
+  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Quotient_S_One, Div, ISD::SETEQ);
+
+  // Calculate Rem result:
+
+  // Remainder_S_Den = Remainder - Den
+  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
+
+  // Remainder_A_Den = Remainder + Den
+  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
+
+  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                    Remainder, Remainder_S_Den, ISD::SETEQ);
+
+  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Remainder_A_Den, Rem, ISD::SETEQ);
+  SDValue Ops[2] = {
+    Div,
+    Rem
+  };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue NegOne = DAG.getConstant(-1, VT);
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
+  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
+  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
+  SDValue RSign = LHSign; // Remainder sign is the same as LHS
+
+  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
+  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
+
+  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
+  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
+
+  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
+  SDValue Rem = Div.getValue(1);
+
+  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
+  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
+
+  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
+  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
+
+  SDValue Res[2] = {
+    Div,
+    Rem
+  };
+  return DAG.getMergeValues(Res, DL);
+}
+
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  // result = trunc(src)
+  // if (src > 0.0 && src != result)
+  //   result += 1.0
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  const SDValue Zero = DAG.getConstant(0, MVT::i32);
+  const SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  // Extract the upper half, since this is where we will find the sign and
+  // exponent.
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  // Extract the exponent.
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, MVT::i32),
+                                DAG.getConstant(ExpBits, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, MVT::i32));
+
+  // Extract the sign bit.
+  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
+  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+  // Extend back to to 64-bits.
+  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                  Zero, SignBit);
+  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+  const SDValue FractMask
+    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
+
+  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+  SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
+  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+
+  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
+  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
+
+  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
+
+  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+  SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
+
+  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+  // FNEARBYINT and FRINT are the same, except in their handling of FP
+  // exceptions. Those aren't really meaningful for us, and OpenCL only has
+  // rint, so just treat them as equivalent.
+  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+}
+
+SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  // result = trunc(src);
+  // if (src < 0.0 && src != result)
+  //   result += -1.0.
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+  const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue S0 = Op.getOperand(0);
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+    return SDValue();
+
+  // f32 uint_to_fp i64
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(0, MVT::i32));
+  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(1, MVT::i32));
+  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
+                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
+  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
+}
+
+SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                                      unsigned BitsDiff,
+                                                      SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue Shift = DAG.getConstant(BitsDiff, VT);
+  // Shift left by 'Shift' bits.
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
+  // Signed shift Right by 'Shift' bits.
+  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue Src = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // TODO: Don't scalarize on Evergreen?
+  unsigned NElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> Args;
+  DAG.ExtractVectorElements(Src, Args, 0, NElts);
+
+  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+  for (unsigned I = 0; I < NElts; ++I)
+    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+static bool isU24(SDValue Op, SelectionDAG &DAG) {
+  APInt KnownZero, KnownOne;
+  EVT VT = Op.getValueType();
+  DAG.computeKnownBits(Op, KnownZero, KnownOne);
+
+  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+}
+
+static bool isI24(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // In order for this to be a signed 24-bit value, bit 23, must
+  // be a sign bit.
+  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
+                                     // as unsigned 24-bit values.
+         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+}
+
+static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = Op.getValueType();
+
+  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
+  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+    DCI.CommitTargetLoweringOpt(TLO);
+}
+
+template <typename IntTy>
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
+                               uint32_t Offset, uint32_t Width) {
+  if (Width + Offset < 32) {
+    IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
+    return DAG.getConstant(Result, MVT::i32);
+  }
+
+  return DAG.getConstant(Src0 >> Offset, MVT::i32);
+}
+
+static bool usesAllNormalStores(SDNode *LoadVal) {
+  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
+    if (!ISD::isNormalStore(*I))
+      return false;
+  }
+
+  return true;
+}
+
+// If we have a copy of an illegal type, replace it with a load / store of an
+// equivalently sized legal type. This avoids intermediate bit pack / unpack
+// instructions emitted when handling extloads and truncstores. Ideally we could
+// recognize the pack / unpack pattern to eliminate it.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *SN = cast<StoreSDNode>(N);
+  SDValue Value = SN->getValue();
+  EVT VT = Value.getValueType();
+
+  if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+    return SDValue();
+
+  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
+  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+    return SDValue();
+
+  EVT MemVT = LoadVal->getMemoryVT();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+
+  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+                                LoadVT, SL,
+                                LoadVal->getChain(),
+                                LoadVal->getBasePtr(),
+                                LoadVal->getOffset(),
+                                LoadVT,
+                                LoadVal->getMemOperand());
+
+  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
+  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+
+  return DAG.getStore(SN->getChain(), SL, NewLoad,
+                      SN->getBasePtr(), SN->getMemOperand());
+}
+
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector() || VT.getSizeInBits() > 32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Mul;
+
+  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+  } else {
+    return SDValue();
+  }
+
+  // We need to use sext even for MUL_U24, because MUL_U24 is used
+  // for signed multiply of 8 and 16-bit types.
+  return DAG.getSExtOrTrunc(Mul, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  switch(N->getOpcode()) {
+    default: break;
+    case ISD::MUL:
+      return performMulCombine(N, DCI);
+    case AMDGPUISD::MUL_I24:
+    case AMDGPUISD::MUL_U24: {
+      SDValue N0 = N->getOperand(0);
+      SDValue N1 = N->getOperand(1);
+      simplifyI24(N0, DCI);
+      simplifyI24(N1, DCI);
+      return SDValue();
+    }
+    case ISD::SELECT_CC: {
+      return CombineMinMax(N, DAG);
+    }
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    assert(!N->getValueType(0).isVector() &&
+           "Vector handling of BFE not implemented");
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
+    if (WidthVal == 0)
+      return DAG.getConstant(0, MVT::i32);
+
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    SDValue BitsFrom = N->getOperand(0);
+    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
+
+    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
+
+    if (OffsetVal == 0) {
+      // This is already sign / zero extended, so try to fold away extra BFEs.
+      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
+
+      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
+      if (OpSignBits >= SignBits)
+        return BitsFrom;
+
+      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
+      if (Signed) {
+        // This is a sign_extend_inreg. Replace it to take advantage of existing
+        // DAG Combines. If not eliminated, we will match back to BFE during
+        // selection.
+
+        // TODO: The sext_inreg of extended types ends, although we can could
+        // handle them in a single BFE.
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
+                           DAG.getValueType(SmallVT));
+      }
+
+      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
+    }
+
+    if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (Signed) {
+        return constantFoldBFE<int32_t>(DAG,
+                                        Val->getSExtValue(),
+                                        OffsetVal,
+                                        WidthVal);
+      }
+
+      return constantFoldBFE<uint32_t>(DAG,
+                                       Val->getZExtValue(),
+                                       OffsetVal,
+                                       WidthVal);
+    }
+
+    APInt Demanded = APInt::getBitsSet(32,
+                                       OffsetVal,
+                                       OffsetVal + WidthVal);
+
+    if ((OffsetVal + WidthVal) >= 32) {
+      SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
+      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+                         BitsFrom, ShiftVal);
+    }
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+        TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
+      DCI.CommitTargetLoweringOpt(TLO);
+    }
+
+    break;
+  }
+
+  case ISD::STORE:
+    return performStoreCombine(N, DCI);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+void AMDGPUTargetLowering::getOriginalFunctionArgs(
+                               SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    if (Ins[i].ArgVT == Ins[i].VT) {
+      OrigIns.push_back(Ins[i]);
+      continue;
+    }
+
+    EVT VT;
+    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
+      // Vector has been split into scalars.
+      VT = Ins[i].ArgVT.getVectorElementType();
+    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
+               Ins[i].ArgVT.getVectorElementType() !=
+               Ins[i].VT.getVectorElementType()) {
+      // Vector elements have been promoted
+      VT = Ins[i].ArgVT;
+    } else {
+      // Vector has been spilt into smaller vectors.
+      VT = Ins[i].VT;
+    }
+
+    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
+                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
+    OrigIns.push_back(Arg);
+  }
+}
+
+bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->isExactlyValue(1.0);
+  }
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    return C->isAllOnesValue();
+  }
+  return false;
+}
+
+bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->getValueAPF().isZero();
+  }
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    return C->isNullValue();
+  }
+  return false;
+}
+
+SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                                  const TargetRegisterClass *RC,
+                                                   unsigned Reg, EVT VT) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned VirtualRegister;
+  if (!MRI.isLiveIn(Reg)) {
+    VirtualRegister = MRI.createVirtualRegister(RC);
+    MRI.addLiveIn(Reg, VirtualRegister);
+  } else {
+    VirtualRegister = MRI.getLiveInVirtReg(Reg);
+  }
+  return DAG.getRegister(VirtualRegister, VT);
+}
+
+#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
+
+const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  // AMDIL DAG nodes
+  NODE_NAME_CASE(CALL);
+  NODE_NAME_CASE(UMUL);
+  NODE_NAME_CASE(RET_FLAG);
+  NODE_NAME_CASE(BRANCH_COND);
+
+  // AMDGPU DAG nodes
+  NODE_NAME_CASE(DWORDADDR)
+  NODE_NAME_CASE(FRACT)
+  NODE_NAME_CASE(CLAMP)
+  NODE_NAME_CASE(FMAX)
+  NODE_NAME_CASE(SMAX)
+  NODE_NAME_CASE(UMAX)
+  NODE_NAME_CASE(FMIN)
+  NODE_NAME_CASE(SMIN)
+  NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(URECIP)
+  NODE_NAME_CASE(DIV_SCALE)
+  NODE_NAME_CASE(DIV_FMAS)
+  NODE_NAME_CASE(DIV_FIXUP)
+  NODE_NAME_CASE(TRIG_PREOP)
+  NODE_NAME_CASE(RCP)
+  NODE_NAME_CASE(RSQ)
+  NODE_NAME_CASE(RSQ_LEGACY)
+  NODE_NAME_CASE(RSQ_CLAMPED)
+  NODE_NAME_CASE(DOT4)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
+  NODE_NAME_CASE(BFI)
+  NODE_NAME_CASE(BFM)
+  NODE_NAME_CASE(BREV)
+  NODE_NAME_CASE(MUL_U24)
+  NODE_NAME_CASE(MUL_I24)
+  NODE_NAME_CASE(MAD_U24)
+  NODE_NAME_CASE(MAD_I24)
+  NODE_NAME_CASE(EXPORT)
+  NODE_NAME_CASE(CONST_ADDRESS)
+  NODE_NAME_CASE(REGISTER_LOAD)
+  NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(CVT_F32_UBYTE0)
+  NODE_NAME_CASE(CVT_F32_UBYTE1)
+  NODE_NAME_CASE(CVT_F32_UBYTE2)
+  NODE_NAME_CASE(CVT_F32_UBYTE3)
+  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+  NODE_NAME_CASE(CONST_DATA_PTR)
+  NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  }
+}
+
+static void computeKnownBitsForMinMax(const SDValue Op0,
+                                      const SDValue Op1,
+                                      APInt &KnownZero,
+                                      APInt &KnownOne,
+                                      const SelectionDAG &DAG,
+                                      unsigned Depth) {
+  APInt Op0Zero, Op0One;
+  APInt Op1Zero, Op1One;
+  DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
+  DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
+
+  KnownZero = Op0Zero & Op1Zero;
+  KnownOne = Op0One & Op1One;
+}
+
+void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
+  const SDValue Op,
+  APInt &KnownZero,
+  APInt &KnownOne,
+  const SelectionDAG &DAG,
+  unsigned Depth) const {
+
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+
+  APInt KnownZero2;
+  APInt KnownOne2;
+  unsigned Opc = Op.getOpcode();
+
+  switch (Opc) {
+  default:
+    break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    // FIXME: The intrinsic should just use the node.
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    case AMDGPUIntrinsic::AMDGPU_imax:
+    case AMDGPUIntrinsic::AMDGPU_umax:
+    case AMDGPUIntrinsic::AMDGPU_imin:
+    case AMDGPUIntrinsic::AMDGPU_umin:
+      computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
+                                KnownZero, KnownOne, DAG, Depth);
+      break;
+    default:
+      break;
+    }
+
+    break;
+  }
+  case AMDGPUISD::SMAX:
+  case AMDGPUISD::UMAX:
+  case AMDGPUISD::SMIN:
+  case AMDGPUISD::UMIN:
+    computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
+                              KnownZero, KnownOne, DAG, Depth);
+    break;
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CWidth)
+      return;
+
+    unsigned BitWidth = 32;
+    uint32_t Width = CWidth->getZExtValue() & 0x1f;
+    if (Width == 0) {
+      KnownZero = APInt::getAllOnesValue(BitWidth);
+      KnownOne = APInt::getNullValue(BitWidth);
+      return;
+    }
+
+    // FIXME: This could do a lot more. If offset is 0, should be the same as
+    // sign_extend_inreg implementation, but that involves duplicating it.
+    if (Opc == AMDGPUISD::BFE_I32)
+      KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+    else
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+
+    break;
+  }
+  }
+}
+
+unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &DAG,
+  unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case AMDGPUISD::BFE_I32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!Width)
+      return 1;
+
+    unsigned SignBits = 32 - Width->getZExtValue() + 1;
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!Offset || !Offset->isNullValue())
+      return SignBits;
+
+    // TODO: Could probably figure something out with non-0 offsets.
+    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    return std::max(SignBits, Op0SignBits);
+  }
+
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
+  }
+
+  default:
+    return 1;
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h
new file mode 100644
index 0000000..624d4e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -0,0 +1,251 @@
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition of the TargetLowering class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUISELLOWERING_H
+#define AMDGPUISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class AMDGPUMachineFunction;
+class AMDGPUSubtarget;
+class MachineRegisterInfo;
+
+class AMDGPUTargetLowering : public TargetLowering {
+protected:
+  const AMDGPUSubtarget *Subtarget;
+
+private:
+  SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
+                                   const SDValue &InitPtr,
+                                   SDValue Chain,
+                                   SelectionDAG &DAG) const;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Lower vector stores by merging the vector elements into an integer
+  /// of the same bitwidth.
+  SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
+  /// \brief Split a vector store into multiple scalar stores.
+  /// \returns The resulting chain.
+
+  SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                  unsigned BitsDiff,
+                                  SelectionDAG &DAG) const;
+  SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+protected:
+  static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
+  static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
+
+  virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                                     SelectionDAG &DAG) const;
+  /// \brief Split a vector load into multiple scalar loads.
+  SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
+  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  bool isHWTrueValue(SDValue Op) const;
+  bool isHWFalseValue(SDValue Op) const;
+
+  /// The SelectionDAGBuilder will automatically promote function arguments
+  /// with illegal types.  However, this does not work for the AMDGPU targets
+  /// since the function arguments are stored in memory as these illegal types.
+  /// In order to handle this properly we need to get the origianl types sizes
+  /// from the LLVM IR Function and fixup the ISD:InputArg values before
+  /// passing them to AnalyzeFormalArguments()
+  void getOriginalFunctionArgs(SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const;
+  void AnalyzeFormalArguments(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
+public:
+  AMDGPUTargetLowering(TargetMachine &TM);
+
+  bool isFAbsFree(EVT VT) const override;
+  bool isFNegFree(EVT VT) const override;
+  bool isTruncateFree(EVT Src, EVT Dest) const override;
+  bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+  bool isZExtFree(Type *Src, Type *Dest) const override;
+  bool isZExtFree(EVT Src, EVT Dest) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+  MVT getVectorIdxTy() const override;
+  bool isSelectSupported(SelectSupportKind) const override;
+
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+  bool ShouldShrinkFPConstant(EVT VT) const override;
+
+  bool isLoadBitCastBeneficial(EVT, EVT) const override;
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals,
+                      SDLoc DL, SelectionDAG &DAG) const override;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
+  SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const;
+  const char* getTargetNodeName(unsigned Opcode) const override;
+
+  virtual SDNode *PostISelFolding(MachineSDNode *N,
+                                  SelectionDAG &DAG) const {
+    return N;
+  }
+
+  /// \brief Determine which of the bits specified in \p Mask are known to be
+  /// either zero or one and return them in the \p KnownZero and \p KnownOne
+  /// bitsets.
+  void computeKnownBitsForTargetNode(const SDValue Op,
+                                     APInt &KnownZero,
+                                     APInt &KnownOne,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  virtual unsigned ComputeNumSignBitsForTargetNode(
+    SDValue Op,
+    const SelectionDAG &DAG,
+    unsigned Depth = 0) const override;
+
+  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+  /// MachineFunction.
+  ///
+  /// \returns a RegisterSDNode representing Reg.
+  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Reg, EVT VT) const;
+};
+
+namespace AMDGPUISD {
+
+enum {
+  // AMDIL ISD Opcodes
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  CALL,        // Function call based on a single integer
+  UMUL,        // 32bit unsigned multiplication
+  RET_FLAG,
+  BRANCH_COND,
+  // End AMDIL ISD Opcodes
+  DWORDADDR,
+  FRACT,
+  CLAMP,
+
+  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+  // Denormals handled on some parts.
+  COS_HW,
+  SIN_HW,
+  FMAX,
+  SMAX,
+  UMAX,
+  FMIN,
+  SMIN,
+  UMIN,
+  URECIP,
+  DIV_SCALE,
+  DIV_FMAS,
+  DIV_FIXUP,
+  TRIG_PREOP, // 1 ULP max error for f64
+
+  // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
+  //            For f64, max error 2^29 ULP, handles denormals.
+  RCP,
+  RSQ,
+  RSQ_LEGACY,
+  RSQ_CLAMPED,
+  DOT4,
+  BFE_U32, // Extract range of bits with zero extension to 32-bits.
+  BFE_I32, // Extract range of bits with sign extension to 32-bits.
+  BFI, // (src0 & src1) | (~src0 & src2)
+  BFM, // Insert a range of bits into a 32-bit word.
+  BREV, // Reverse bits.
+  MUL_U24,
+  MUL_I24,
+  MAD_U24,
+  MAD_I24,
+  TEXTURE_FETCH,
+  EXPORT,
+  CONST_ADDRESS,
+  REGISTER_LOAD,
+  REGISTER_STORE,
+  LOAD_INPUT,
+  SAMPLE,
+  SAMPLEB,
+  SAMPLED,
+  SAMPLEL,
+
+  // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
+  CVT_F32_UBYTE0,
+  CVT_F32_UBYTE1,
+  CVT_F32_UBYTE2,
+  CVT_F32_UBYTE3,
+  /// This node is for VLIW targets and it is used to represent a vector
+  /// that is stored in consecutive registers with the same channel.
+  /// For example:
+  ///   |X  |Y|Z|W|
+  /// T0|v.x| | | |
+  /// T1|v.y| | | |
+  /// T2|v.z| | | |
+  /// T3|v.w| | | |
+  BUILD_VERTICAL_VECTOR,
+  /// Pointer to the start of the shader's constant data.
+  CONST_DATA_PTR,
+  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  STORE_MSKOR,
+  LOAD_CONSTANT,
+  TBUFFER_STORE_FORMAT,
+  LAST_AMDGPU_ISD_NUMBER
+};
+
+
+} // End namespace AMDGPUISD
+
+} // End namespace llvm
+
+#endif // AMDGPUISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp
new file mode 100644
index 0000000..fef5b8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -0,0 +1,346 @@
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implementation of the TargetInstrInfo class that is common to all
+/// AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRINFO_NAMED_OPS
+#define GET_INSTRMAP_INFO
+#include "AMDGPUGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void AMDGPUInstrInfo::anchor() {}
+
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st)
+  : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { }
+
+const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                           unsigned &SrcReg, unsigned &DstReg,
+                                           unsigned &SubIdx) const {
+// TODO: Implement this function
+  return false;
+}
+
+unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+
+unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                   int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+
+bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+                                          const MachineMemOperand *&MMO,
+                                          int &FrameIndex) const {
+// TODO: Implement this function
+  return false;
+}
+unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
+                                                    int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
+                                           const MachineMemOperand *&MMO,
+                                           int &FrameIndex) const {
+// TODO: Implement this function
+  return false;
+}
+
+MachineInstr *
+AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      LiveVariables *LV) const {
+// TODO: Implement this function
+  return nullptr;
+}
+bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
+                                        MachineBasicBlock &MBB) const {
+  while (iter != MBB.end()) {
+    switch (iter->getOpcode()) {
+    default:
+      break;
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32:
+    case AMDGPU::BRANCH:
+      return true;
+    };
+    ++iter;
+  }
+  return false;
+}
+
+void
+AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill,
+                                    int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const {
+  llvm_unreachable("Not Implemented");
+}
+
+void
+AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     unsigned DestReg, int FrameIndex,
+                                     const TargetRegisterClass *RC,
+                                     const TargetRegisterInfo *TRI) const {
+  llvm_unreachable("Not Implemented");
+}
+
+bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                               AMDGPU::OpName::addr);
+   // addr is a custom operand with multiple MI operands, and only the
+   // first MI operand is given a name.
+  int RegOpIdx = OffsetOpIdx + 1;
+  int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                             AMDGPU::OpName::chan);
+  if (isRegisterLoad(*MI)) {
+    int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                              AMDGPU::OpName::dst);
+    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+      buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+                    getIndirectAddrRegClass()->getRegister(Address));
+    } else {
+      buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+                        Address, OffsetReg);
+    }
+  } else if (isRegisterStore(*MI)) {
+    int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                              AMDGPU::OpName::val);
+    AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+      buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+                    MI->getOperand(ValOpIdx).getReg());
+    } else {
+      buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
+                         calculateIndirectAddress(RegIndex, Channel),
+                         OffsetReg);
+    }
+  } else {
+    return false;
+  }
+
+  MBB->erase(MI);
+  return true;
+}
+
+
+MachineInstr *
+AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const {
+// TODO: Implement this function
+  return nullptr;
+}
+MachineInstr*
+AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr *LoadMI) const {
+  // TODO: Implement this function
+  return nullptr;
+}
+bool
+AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                     const SmallVectorImpl<unsigned> &Ops) const {
+  // TODO: Implement this function
+  return false;
+}
+bool
+AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                 unsigned Reg, bool UnfoldLoad,
+                                 bool UnfoldStore,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool
+AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                    SmallVectorImpl<SDNode*> &NewNodes) const {
+  // TODO: Implement this function
+  return false;
+}
+
+unsigned
+AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                           bool UnfoldLoad, bool UnfoldStore,
+                                           unsigned *LoadRegIndex) const {
+  // TODO: Implement this function
+  return 0;
+}
+
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                             int64_t Offset1, int64_t Offset2,
+                                             unsigned NumLoads) const {
+  assert(Offset2 > Offset1
+         && "Second offset should be larger than first offset!");
+  // If we have less than 16 loads in a row, and the offsets are within 16,
+  // then schedule together.
+  // TODO: Make the loads schedule near if it fits in a cacheline
+  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+}
+
+bool
+AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+  const {
+  // TODO: Implement this function
+  return true;
+}
+void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const {
+  // TODO: Implement this function
+}
+
+bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
+  // TODO: Implement this function
+  return false;
+}
+bool
+AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                                  const SmallVectorImpl<MachineOperand> &Pred2)
+  const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                      std::vector<MachineOperand> &Pred) const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
+  // TODO: Implement this function
+  return MI->getDesc().isPredicable();
+}
+
+bool
+AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // TODO: Implement this function
+  return true;
+}
+
+bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
+}
+
+bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
+}
+
+int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = -1;
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  if (MRI.livein_empty()) {
+    return 0;
+  }
+
+  const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+                                            LE = MRI.livein_end();
+                                            LI != LE; ++LI) {
+    unsigned Reg = LI->first;
+    if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+        !IndirectRC->contains(Reg))
+      continue;
+
+    unsigned RegIndex;
+    unsigned RegEnd;
+    for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+                                                          ++RegIndex) {
+      if (IndirectRC->getRegister(RegIndex) == Reg)
+        break;
+    }
+    Offset = std::max(Offset, (int)RegIndex);
+  }
+
+  return Offset + 1;
+}
+
+int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  int Offset = 0;
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported
+  assert(!MFI->hasVarSizedObjects());
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
+  switch (Channels) {
+  default: return Opcode;
+  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
+  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
+  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+  }
+}
+
+// Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned 
+// instead.
+namespace llvm {
+namespace AMDGPU {
+int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+  return getMCOpcode(Opcode);
+}
+}
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
new file mode 100644
index 0000000..d5041f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
@@ -0,0 +1,199 @@
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTRUCTIONINFO_H
+#define AMDGPUINSTRUCTIONINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <map>
+
+#define GET_INSTRINFO_HEADER
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
+#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT
+#define OPCODE_IS_ZERO AMDGPU::PRED_SETE
+#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+
+class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
+private:
+  const AMDGPURegisterInfo RI;
+  bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
+                          MachineBasicBlock &MBB) const;
+  virtual void anchor();
+protected:
+  const AMDGPUSubtarget &ST;
+public:
+  explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
+
+  virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+                             unsigned &DstReg, unsigned &SubIdx) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const override;
+  bool hasLoadFromStackSlot(const MachineInstr *MI,
+                            const MachineMemOperand *&MMO,
+                            int &FrameIndex) const override;
+  unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  bool hasStoreFromStackSlot(const MachineInstr *MI,
+                             const MachineMemOperand *&MMO,
+                             int &FrameIndex) const;
+
+  MachineInstr *
+  convertToThreeAddress(MachineFunction::iterator &MFI,
+                        MachineBasicBlock::iterator &MBBI,
+                        LiveVariables *LV) const override;
+
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const = 0;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+protected:
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr *LoadMI) const override;
+  /// \returns the smallest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  /// \returns the largest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  int getIndirectIndexEnd(const MachineFunction &MF) const;
+
+public:
+  bool canFoldMemoryOperand(const MachineInstr *MI,
+                           const SmallVectorImpl<unsigned> &Ops) const override;
+  bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                        unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                        SmallVectorImpl<MachineInstr *> &NewMIs) const override;
+  bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                           SmallVectorImpl<SDNode *> &NewNodes) const override;
+  unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                               bool UnfoldLoad, bool UnfoldStore,
+                               unsigned *LoadRegIndex = nullptr) const override;
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+  bool isPredicated(const MachineInstr *MI) const override;
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const override;
+  bool isPredicable(MachineInstr *MI) const override;
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
+  // Helper functions that check the opcode for status information
+  bool isRegisterStore(const MachineInstr &MI) const;
+  bool isRegisterLoad(const MachineInstr &MI) const;
+
+//===---------------------------------------------------------------------===//
+// Pure virtual funtions to be implemented by sub-classes.
+//===---------------------------------------------------------------------===//
+
+  virtual bool isMov(unsigned opcode) const = 0;
+
+  /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
+  ///        \p Channel
+  ///
+  /// We model indirect addressing using a virtual address space that can be
+  /// accesed with loads and stores.  The "Indirect Address" is the memory
+  /// address in this virtual address space that maps to the given \p RegIndex
+  /// and \p Channel.
+  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
+                                            unsigned Channel) const = 0;
+
+  /// \returns The register class to be used for loading and storing values
+  /// from an "Indirect Address" .
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0;
+
+  /// \brief Build instruction(s) for an indirect register write.
+  ///
+  /// \returns The instruction that performs the indirect register write
+  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned ValueReg, unsigned Address,
+                                    unsigned OffsetReg) const = 0;
+
+  /// \brief Build instruction(s) for an indirect register read.
+  ///
+  /// \returns The instruction that performs the indirect register read
+  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned ValueReg, unsigned Address,
+                                    unsigned OffsetReg) const = 0;
+
+  /// \brief Build a MOV instruction.
+  virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned DstReg, unsigned SrcReg) const = 0;
+
+  /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
+  /// equivalent opcode that writes \p Channels Channels.
+  int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+
+};
+
+namespace AMDGPU {
+  int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
+}  // End namespace AMDGPU
+
+} // End llvm namespace
+
+#define AMDGPU_FLAG_REGISTER_LOAD  (UINT64_C(1) << 63)
+#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62)
+
+#endif // AMDGPUINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
new file mode 100644
index 0000000..820f1a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
@@ -0,0 +1,181 @@
+//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains DAG node defintions for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Profiles
+//===----------------------------------------------------------------------===//
+
+def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
+
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Nodes
+//
+
+// This argument to this node is a dword address.
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+
+def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
+def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
+
+// out = a - floor(a)
+def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a) result clamped to +/- max_float.
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+
+// out = max(a, b) a and b are floats
+def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
+// out = max(a, b) a and b are signed ints
+def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are unsigned ints
+def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are floats
+def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a snd b are signed ints
+def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are unsigned ints
+def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+  SDTIntToFPOp, []>;
+
+
+// urecip - This operation is a helper for integer division, it returns the
+// result of 1 / a as a fractional unsigned integer.
+// out = (2^32 / a) + e
+// e is rounding error
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+//  Special case divide FMA with scale and flags (src0 = Quotient,
+//  src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags(src0 = Quotient, src1 =
+// Denominator, src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look Up 2.0 / pi src0 with segment select src1[4:0]
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
+def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
+                          SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+                          [SDNPHasChain, SDNPMayLoad]>;
+
+def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
+                           SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// MSKOR instructions are atomic memory instructions used mainly for storing
+// 8-bit and 16-bit values.  The definition is:
+//
+// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
+//
+// src0: vec4(src, 0, 0, mask)
+// src1: dst - rat offset (aka pointer) in dwords  
+def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
+                        SDTypeProfile<0, 2, []>,
+                        [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AMDGPUround : SDNode<"ISD::FROUND",
+                         SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
+
+def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+
+def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
+
+// Signed and unsigned 24-bit mulitply.  The highest 8-bits are ignore when
+// performing the mulitply.  The result is a 32-bit value.
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+  [SDNPCommutative]
+>;
+def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+  [SDNPCommutative]
+>;
+
+def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
+  []
+>;
+def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
+  []
+>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Branch instruction where second and third are basic blocks
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond      : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_retflag       : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
new file mode 100644
index 0000000..cd35603
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
@@ -0,0 +1,575 @@
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction defs that are common to all hw codegen
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
+  field bit isRegisterLoad = 0;
+  field bit isRegisterStore = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = NullALU;
+
+  let TSFlags{63} = isRegisterLoad;
+  let TSFlags{62} = isRegisterStore;
+}
+
+class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
+    : AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<32> Inst = 0xffffffff;
+
+}
+
+def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
+def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
+def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+
+def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+
+def u32imm : Operand<i32> {
+  let PrintMethod = "printU32ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def u8imm : Operand<i8> {
+  let PrintMethod = "printU8ImmOperand";
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget   : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for floating-point comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_OEQ : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
+>;
+
+def COND_OGT : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
+>;
+
+def COND_OGE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
+>;
+
+def COND_OLT : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
+>;
+
+def COND_OLE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
+>;
+
+def COND_UNE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
+def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
+def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for unsigned comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
+def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
+def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
+def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for signed comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
+def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
+def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
+def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for integer equality
+//===----------------------------------------------------------------------===//
+
+def COND_EQ : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
+>;
+
+def COND_NE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
+>;
+
+def COND_NULL : PatLeaf <
+  (cond),
+  [{return false;}]
+>;
+
+//===----------------------------------------------------------------------===//
+// Load/Store Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+}]>;
+
+class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def extloadi8_private : PrivateLoad <extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+def extloadi16_private : PrivateLoad <extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+def load_private : PrivateLoad <load>;
+
+def truncstorei8_private : PrivateStore <truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_private : PrivateStore <store>;
+
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+// Constant address space loads
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  LoadSDNode *L = cast<LoadSDNode>(N);
+  return L->getExtensionType() == ISD::ZEXTLOAD ||
+         L->getExtensionType() == ISD::EXTLOAD;
+}]>;
+
+def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def az_extloadi32_global : PatFrag<(ops node:$ptr),
+                                   (az_extloadi32 node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi32_constant : PatFrag<(ops node:$ptr),
+                                     (az_extloadi32 node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_store : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+
+class local_binary_atomic_op<SDNode atomic_op> :
+  PatFrag<(ops node:$ptr, node:$value),
+    (atomic_op node:$ptr, node:$value), [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+
+def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
+def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
+def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
+def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
+def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
+def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
+def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
+def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
+def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
+def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
+def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
+
+def mskor_global : PatFrag<(ops node:$val, node:$ptr),
+                            (AMDGPUstore_mskor node:$val, node:$ptr), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+def atomic_cmp_swap_32_local :
+  PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
+          (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
+  AtomicSDNode *AN = cast<AtomicSDNode>(N);
+  return AN->getMemoryVT() == MVT::i32 &&
+         AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def atomic_cmp_swap_64_local :
+  PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
+          (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
+  AtomicSDNode *AN = cast<AtomicSDNode>(N);
+  return AN->getMemoryVT() == MVT::i64 &&
+         AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+
+class Constants {
+int TWO_PI = 0x40c90fdb;
+int PI = 0x40490fdb;
+int TWO_PI_INV = 0x3e22f983;
+int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
+int FP32_NEG_ONE = 0xbf800000;
+int FP32_ONE = 0x3f800000;
+}
+def CONST : Constants;
+
+def FP_ZERO : PatLeaf <
+  (fpimm),
+  [{return N->getValueAPF().isZero();}]
+>;
+
+def FP_ONE : PatLeaf <
+  (fpimm),
+  [{return N->isExactlyValue(1.0);}]
+>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let usesCustomInserter = 1  in {
+
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "CLAMP $dst, $src0",
+  [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+>;
+
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FABS $dst, $src0",
+  [(set f32:$dst, (fabs f32:$src0))]
+>;
+
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FNEG $dst, $src0",
+  [(set f32:$dst, (fneg f32:$src0))]
+>;
+
+} // usesCustomInserter = 1
+
+multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
+                    ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
+  def RegisterLoad : AMDGPUShaderInst <
+    (outs dstClass:$dst),
+    (ins addrClass:$addr, i32imm:$chan),
+    "RegisterLoad $dst, $addr",
+    [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))]
+  > {
+    let isRegisterLoad = 1;
+  }
+
+  def RegisterStore : AMDGPUShaderInst <
+    (outs),
+    (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
+    "RegisterStore $val, $addr",
+    [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))]
+  > {
+    let isRegisterStore = 1;
+  }
+}
+}
+
+} // End isCodeGenOnly = 1, isPseudo = 1
+
+/* Generic helper patterns for intrinsics */
+/* -------------------------------------- */
+
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
+  : Pat <
+  (fpow f32:$src0, f32:$src1),
+  (exp_ieee (mul f32:$src1, (log_ieee f32:$src0)))
+>;
+
+/* Other helper patterns */
+/* --------------------- */
+
+/* Extract element pattern */
+class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
+                       SubRegIndex sub_reg>
+  : Pat<
+  (sub_type (vector_extract vec_type:$src, sub_idx)),
+  (EXTRACT_SUBREG $src, sub_reg)
+>;
+
+/* Insert element pattern */
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+                      int sub_idx, SubRegIndex sub_reg>
+  : Pat <
+  (vector_insert vec_type:$vec, elem_type:$elem, sub_idx),
+  (INSERT_SUBREG $vec, $elem, sub_reg)
+>;
+
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
+// bitconvert pattern
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
+  (dt (bitconvert (st rc:$src0))),
+  (dt rc:$src0)
+>;
+
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
+  (vt (AMDGPUdwordaddr (vt rc:$addr))),
+  (vt rc:$addr)
+>;
+
+// BFI_INT patterns
+
+multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> {
+
+  // Definition from ISA doc:
+  // (y & x) | (z & ~x)
+  def : Pat <
+    (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+    (BFI_INT $x, $y, $z)
+  >;
+
+  // SHA-256 Ch function
+  // z ^ (x & (y ^ z))
+  def : Pat <
+    (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+    (BFI_INT $x, $y, $z)
+  >;
+
+  def : Pat <
+    (fcopysign f32:$src0, f32:$src1),
+    (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
+  >;
+
+  def : Pat <
+    (f64 (fcopysign f64:$src0, f64:$src1)),
+      (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+      (i32 (EXTRACT_SUBREG $src0, sub0)), sub0),
+      (BFI_INT (LoadImm32 0x7fffffff),
+               (i32 (EXTRACT_SUBREG $src0, sub1)),
+               (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
+  >;
+}
+
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+  (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+  (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+// Bitfield extract patterns
+
+/*
+
+XXX: The BFE pattern is not working correctly because the XForm is not being
+applied.
+
+def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
+def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
+                            SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+
+class BFEPattern <Instruction BFE> : Pat <
+  (and (srl i32:$x, legalshift32:$y), bfemask:$z),
+  (BFE $x, $y, $z)
+>;
+
+*/
+
+// rotr pattern
+class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+  (rotr i32:$src0, i32:$src1),
+  (BIT_ALIGN $src0, $src0, $src1)
+>;
+
+// 24-bit arithmetic patterns
+def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
+
+/*
+class UMUL24Pattern <Instruction UMUL24> : Pat <
+  (mul U24:$x, U24:$y),
+  (UMUL24 $x, $y)
+>;
+*/
+
+class IMad24Pat<Instruction Inst> : Pat <
+  (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
+  (Inst $src0, $src1, $src2)
+>;
+
+class UMad24Pat<Instruction Inst> : Pat <
+  (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
+  (Inst $src0, $src1, $src2)
+>;
+
+multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> {
+  def _expand_imad24 : Pat <
+    (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2),
+    (AddInst (MulInst $src0, $src1), $src2)
+  >;
+
+  def _expand_imul24 : Pat <
+    (AMDGPUmul_i24 i32:$src0, i32:$src1),
+    (MulInst $src0, $src1)
+  >;
+}
+
+multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
+  def _expand_umad24 : Pat <
+    (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2),
+    (AddInst (MulInst $src0, $src1), $src2)
+  >;
+
+  def _expand_umul24 : Pat <
+    (AMDGPUmul_u24 i32:$src0, i32:$src1),
+    (MulInst $src0, $src1)
+  >;
+}
+
+class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
+  (fdiv FP_ONE, vt:$src),
+  (RcpInst $src)
+>;
+
+multiclass RsqPat<Instruction RsqInst, ValueType vt> {
+  def : Pat <
+    (fdiv FP_ONE, (fsqrt vt:$src)),
+    (RsqInst $src)
+  >;
+
+  def : Pat <
+    (AMDGPUrcp (fsqrt vt:$src)),
+    (RsqInst $src)
+  >;
+}
+
+include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
+
+include "SIInstrInfo.td"
+
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
new file mode 100644
index 0000000..58916a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
@@ -0,0 +1,77 @@
+//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
+    : TargetIntrinsicInfo() {}
+
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+                                         unsigned numTys) const {
+  static const char *const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  if (IntrID < Intrinsic::num_intrinsics) {
+    return nullptr;
+  }
+  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+         "Invalid intrinsic ID");
+
+  std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+  return Result;
+}
+
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name,
+                                         unsigned Len) const {
+  if (!StringRef(Name, Len).startswith("llvm."))
+    return 0; // All intrinsics start with 'llvm.'
+
+#define GET_FUNCTION_RECOGNIZER
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  AMDGPUIntrinsic::ID IntrinsicID =
+      (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
+  IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
+
+  if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
+    return IntrinsicID;
+  }
+  return 0;
+}
+
+bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
+// Overload Table
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              Type **Tys,
+                                              unsigned numTys) const {
+  llvm_unreachable("Not implemented");
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.h
new file mode 100644
index 0000000..5be68a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.h
@@ -0,0 +1,48 @@
+//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+//
+//===-----------------------------------------------------------------------===//
+#ifndef AMDGPU_INTRINSICINFO_H
+#define AMDGPU_INTRINSICINFO_H
+
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+class TargetMachine;
+
+namespace AMDGPUIntrinsic {
+enum ID {
+  last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+      , num_AMDGPU_intrinsics
+};
+
+} // end namespace AMDGPUIntrinsic
+
+class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
+public:
+  AMDGPUIntrinsicInfo(TargetMachine *tm);
+  std::string getName(unsigned IntrId, Type **Tys = nullptr,
+                      unsigned numTys = 0) const override;
+  unsigned lookupName(const char *Name, unsigned Len) const override;
+  bool isOverloaded(unsigned IID) const override;
+  Function *getDeclaration(Module *M, unsigned ID,
+                           Type **Tys = nullptr,
+                           unsigned numTys = 0) const override;
+};
+
+} // end namespace llvm
+
+#endif // AMDGPU_INTRINSICINFO_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
new file mode 100644
index 0000000..eee9c29
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
@@ -0,0 +1,89 @@
+//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines intrinsics that are used by all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
+
+  def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+  // This is named backwards (instead of rsq_legacy) so we don't have
+  // to define it with the public builtins intrinsics. This is a
+  // workaround for how intrinsic names are parsed. If the name is
+  // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
+  // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name.
+  def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
+  def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
+  def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+  def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+  def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_barrier_local  : Intrinsic<[], [], []>;
+  def int_AMDGPU_barrier_global  : Intrinsic<[], [], []>;
+}
+
+// Legacy names for compatibility.
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+  def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+  def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+}
+
+let TargetPrefix = "TGSI", isTarget = 1 in {
+
+  def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
+}
+
+include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp
new file mode 100644
index 0000000..ce5c41c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -0,0 +1,159 @@
+//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUMCInstLower.h"
+#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUTargetMachine.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "R600InstrInfo.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
+
+using namespace llvm;
+
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
+  Ctx(ctx), ST(st)
+{ }
+
+enum AMDGPUMCInstLower::SISubtarget
+AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const {
+  return AMDGPUMCInstLower::SI;
+}
+
+unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
+
+  int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
+                              AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
+  if (MCOpcode == -1)
+    MCOpcode = MIOpcode;
+
+  return MCOpcode;
+}
+
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+
+  OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
+
+  for (const MachineOperand &MO : MI->explicit_operands()) {
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_FPImmediate: {
+      const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
+      assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
+             "Only floating point immediates are supported at the moment.");
+      MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
+      break;
+    }
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_Register:
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                   MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress: {
+      const GlobalValue *GV = MO.getGlobal();
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(GV->getName()));
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(Sym, Ctx));
+      break;
+    }
+    case MachineOperand::MO_TargetIndex: {
+      assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      MCOp = MCOperand::CreateExpr(Expr);
+      break;
+    }
+    }
+    OutMI.addOperand(MCOp);
+  }
+}
+
+void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  AMDGPUMCInstLower MCInstLowering(OutContext,
+                               MF->getTarget().getSubtarget<AMDGPUSubtarget>());
+
+#ifdef _DEBUG
+  StringRef Err;
+  if (!TM.getInstrInfo()->verifyInstruction(MI, Err)) {
+    errs() << "Warning: Illegal instruction detected: " << Err << "\n";
+    MI->dump();
+  }
+#endif
+  if (MI->isBundle()) {
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator I = MI;
+    ++I;
+    while (I != MBB->end() && I->isInsideBundle()) {
+      EmitInstruction(I);
+      ++I;
+    }
+  } else {
+    MCInst TmpInst;
+    MCInstLowering.lower(MI, TmpInst);
+    EmitToStreamer(OutStreamer, TmpInst);
+
+    if (DisasmEnabled) {
+      // Disassemble instruction/operands to text.
+      DisasmLines.resize(DisasmLines.size() + 1);
+      std::string &DisasmLine = DisasmLines.back();
+      raw_string_ostream DisasmStream(DisasmLine);
+
+      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(),
+                                    *TM.getRegisterInfo());
+      InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
+
+      // Disassemble instruction/operands to hex representation.
+      SmallVector<MCFixup, 4> Fixups;
+      SmallVector<char, 16> CodeBytes;
+      raw_svector_ostream CodeStream(CodeBytes);
+
+      MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
+      MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
+      InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups,
+                                    TM.getSubtarget<MCSubtargetInfo>());
+      CodeStream.flush();
+
+      HexLines.resize(HexLines.size() + 1);
+      std::string &HexLine = HexLines.back();
+      raw_string_ostream HexStream(HexLine);
+
+      for (size_t i = 0; i < CodeBytes.size(); i += 4) {
+        unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i];
+        HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
+      }
+
+      DisasmStream.flush();
+      DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h
new file mode 100644
index 0000000..58fe34d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h
@@ -0,0 +1,48 @@
+//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_MCINSTLOWER_H
+#define AMDGPU_MCINSTLOWER_H
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class MachineInstr;
+class MCContext;
+class MCInst;
+
+class AMDGPUMCInstLower {
+
+  // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+  enum SISubtarget {
+    SI = 0
+  };
+
+  MCContext &Ctx;
+  const AMDGPUSubtarget &ST;
+
+  /// Convert a member of the AMDGPUSubtarget::Generation enum to the
+  /// SISubtarget enum.
+  enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
+
+  /// Get the MC opcode for this MachineInstr.
+  unsigned getMCOpcode(unsigned MIOpcode) const;
+
+public:
+  AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
+
+  /// \brief Lower a MachineInstr to an MCInst
+  void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+} // End namespace llvm
+
+#endif //AMDGPU_MCINSTLOWER_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp
new file mode 100644
index 0000000..90af801
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -0,0 +1,25 @@
+#include "AMDGPUMachineFunction.h"
+#include "AMDGPU.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+static const char *const ShaderTypeAttribute = "ShaderType";
+
+// Pin the vtable to this file.
+void AMDGPUMachineFunction::anchor() {}
+
+AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
+  MachineFunctionInfo(),
+  ShaderType(ShaderType::COMPUTE),
+  LDSSize(0) {
+  AttributeSet Set = MF.getFunction()->getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
+                                 ShaderTypeAttribute);
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, ShaderType))
+      llvm_unreachable("Can't parse shader type!");
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h
new file mode 100644
index 0000000..0854d58
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h
@@ -0,0 +1,39 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUMACHINEFUNCTION_H
+#define AMDGPUMACHINEFUNCTION_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
+
+namespace llvm {
+
+class AMDGPUMachineFunction : public MachineFunctionInfo {
+  virtual void anchor();
+  unsigned ShaderType;
+
+public:
+  AMDGPUMachineFunction(const MachineFunction &MF);
+  /// A map to keep track of local memory objects and their offsets within
+  /// the local memory space.
+  std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
+  /// Number of bytes in the LDS that are being used.
+  unsigned LDSSize;
+
+  unsigned getShaderType() const {
+    return ShaderType;
+  }
+};
+
+}
+#endif // AMDGPUMACHINEFUNCTION_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp
new file mode 100644
index 0000000..218750d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp
@@ -0,0 +1,387 @@
+//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates allocas by either converting them into vectors or
+// by migrating them to local address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "amdgpu-promote-alloca"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteAlloca : public FunctionPass,
+                       public InstVisitor<AMDGPUPromoteAlloca> {
+
+  static char ID;
+  Module *Mod;
+  const AMDGPUSubtarget &ST;
+  int LocalMemAvailable;
+
+public:
+  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
+                                                   LocalMemAvailable(0) { }
+  virtual bool doInitialization(Module &M);
+  virtual bool runOnFunction(Function &F);
+  virtual const char *getPassName() const {
+    return "AMDGPU Promote Alloca";
+  }
+  void visitAlloca(AllocaInst &I);
+};
+
+} // End anonymous namespace
+
+char AMDGPUPromoteAlloca::ID = 0;
+
+bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+  Mod = &M;
+  return false;
+}
+
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+
+  const FunctionType *FTy = F.getFunctionType();
+
+  LocalMemAvailable = ST.getLocalMemorySize();
+
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+    const Type *ParamTy = FTy->getParamType(i);
+    if (ParamTy->isPointerTy() &&
+        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      LocalMemAvailable = 0;
+      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
+                      "local memory disabled.\n");
+      break;
+    }
+  }
+
+  if (LocalMemAvailable > 0) {
+    // Check how much local memory is being used by global objects
+    for (Module::global_iterator I = Mod->global_begin(),
+                                 E = Mod->global_end(); I != E; ++I) {
+      GlobalVariable *GV = I;
+      PointerType *GVTy = GV->getType();
+      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+        continue;
+      for (Value::use_iterator U = GV->use_begin(),
+                               UE = GV->use_end(); U != UE; ++U) {
+        Instruction *Use = dyn_cast<Instruction>(*U);
+        if (!Use)
+          continue;
+        if (Use->getParent()->getParent() == &F)
+          LocalMemAvailable -=
+              Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
+      }
+    }
+  }
+
+  LocalMemAvailable = std::max(0, LocalMemAvailable);
+  DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
+
+  visit(F);
+
+  return false;
+}
+
+static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+  return VectorType::get(ArrayTy->getArrayElementType(),
+                         ArrayTy->getArrayNumElements());
+}
+
+static Value* calculateVectorIndex(Value *Ptr,
+                                  std::map<GetElementPtrInst*, Value*> GEPIdx) {
+  if (isa<AllocaInst>(Ptr))
+    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+
+  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+
+  return GEPIdx[GEP];
+}
+
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+  // FIXME we only support simple cases
+  if (GEP->getNumOperands() != 3)
+    return NULL;
+
+  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+  if (!I0 || !I0->isZero())
+    return NULL;
+
+  return GEP->getOperand(2);
+}
+
+// Not an instruction handled below to turn into a vector.
+//
+// TODO: Check isTriviallyVectorizable for calls and handle other
+// instructions.
+static bool canVectorizeInst(Instruction *Inst) {
+  switch (Inst->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+  Type *AllocaTy = Alloca->getAllocatedType();
+
+  DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
+
+  // FIXME: There is no reason why we can't support larger arrays, we
+  // are just being conservative for now.
+  if (!AllocaTy->isArrayTy() ||
+      AllocaTy->getArrayElementType()->isVectorTy() ||
+      AllocaTy->getArrayNumElements() > 4) {
+
+    DEBUG(dbgs() << "  Cannot convert type to vector");
+    return false;
+  }
+
+  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+  std::vector<Value*> WorkList;
+  for (User *AllocaUser : Alloca->users()) {
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+    if (!GEP) {
+      if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
+        return false;
+
+      WorkList.push_back(AllocaUser);
+      continue;
+    }
+
+    Value *Index = GEPToVectorIndex(GEP);
+
+    // If we can't compute a vector index from this GEP, then we can't
+    // promote this alloca to vector.
+    if (!Index) {
+      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
+      return false;
+    }
+
+    GEPVectorIdx[GEP] = Index;
+    for (User *GEPUser : AllocaUser->users()) {
+      if (!canVectorizeInst(cast<Instruction>(GEPUser)))
+        return false;
+
+      WorkList.push_back(GEPUser);
+    }
+  }
+
+  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+  DEBUG(dbgs() << "  Converting alloca to vector "
+        << *AllocaTy << " -> " << *VectorTy << '\n');
+
+  for (std::vector<Value*>::iterator I = WorkList.begin(),
+                                     E = WorkList.end(); I != E; ++I) {
+    Instruction *Inst = cast<Instruction>(*I);
+    IRBuilder<> Builder(Inst);
+    switch (Inst->getOpcode()) {
+    case Instruction::Load: {
+      Value *Ptr = Inst->getOperand(0);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      Inst->replaceAllUsesWith(ExtractElement);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::Store: {
+      Value *Ptr = Inst->getOperand(1);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+                                                       Inst->getOperand(0),
+                                                       Index);
+      Builder.CreateStore(NewVecValue, BitCast);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::BitCast:
+    case Instruction::AddrSpaceCast:
+      break;
+
+    default:
+      Inst->dump();
+      llvm_unreachable("Inconsistency in instructions promotable to vector");
+    }
+  }
+  return true;
+}
+
+static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+  for (User *User : Val->users()) {
+    if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+      continue;
+    if (isa<CallInst>(User)) {
+      WorkList.push_back(User);
+      continue;
+    }
+    if (!User->getType()->isPointerTy())
+      continue;
+    WorkList.push_back(User);
+    collectUsesWithPtrTypes(User, WorkList);
+  }
+}
+
+void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+  IRBuilder<> Builder(&I);
+
+  // First try to replace the alloca with a vector
+  Type *AllocaTy = I.getAllocatedType();
+
+  DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  if (tryPromoteAllocaToVector(&I))
+    return;
+
+  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+
+  // FIXME: This is the maximum work group size.  We should try to get
+  // value from the reqd_work_group_size function attribute if it is
+  // available.
+  unsigned WorkGroupSize = 256;
+  int AllocaSize = WorkGroupSize *
+      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
+
+  if (AllocaSize > LocalMemAvailable) {
+    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Promoting alloca to local memory\n");
+  LocalMemAvailable -= AllocaSize;
+
+  GlobalVariable *GV = new GlobalVariable(
+      *Mod, ArrayType::get(I.getAllocatedType(), 256), false,
+      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
+      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+
+  FunctionType *FTy = FunctionType::get(
+      Type::getInt32Ty(Mod->getContext()), false);
+  AttributeSet AttrSet;
+  AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
+
+  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
+      "llvm.r600.read.local.size.y", FTy, AttrSet);
+  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
+      "llvm.r600.read.local.size.z", FTy, AttrSet);
+  Value *ReadTIDIGX = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.x", FTy, AttrSet);
+  Value *ReadTIDIGY = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.y", FTy, AttrSet);
+  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.z", FTy, AttrSet);
+
+
+  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
+  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
+  Value *TIdX  = Builder.CreateCall(ReadTIDIGX);
+  Value *TIdY  = Builder.CreateCall(ReadTIDIGY);
+  Value *TIdZ  = Builder.CreateCall(ReadTIDIGZ);
+
+  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+  TID = Builder.CreateAdd(TID, TIdZ);
+
+  std::vector<Value*> Indices;
+  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
+  Indices.push_back(TID);
+
+  Value *Offset = Builder.CreateGEP(GV, Indices);
+  I.mutateType(Offset->getType());
+  I.replaceAllUsesWith(Offset);
+  I.eraseFromParent();
+
+  std::vector<Value*> WorkList;
+
+  collectUsesWithPtrTypes(Offset, WorkList);
+
+  for (std::vector<Value*>::iterator i = WorkList.begin(),
+                                     e = WorkList.end(); i != e; ++i) {
+    Value *V = *i;
+    CallInst *Call = dyn_cast<CallInst>(V);
+    if (!Call) {
+      Type *EltTy = V->getType()->getPointerElementType();
+      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+      V->mutateType(NewTy);
+      continue;
+    }
+
+    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
+    if (!Intr) {
+      std::vector<Type*> ArgTypes;
+      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
+                                ArgIdx != ArgEnd; ++ArgIdx) {
+        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
+      }
+      Function *F = Call->getCalledFunction();
+      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
+                                                F->isVarArg());
+      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
+                                             F->getAttributes());
+      Function *NewF = cast<Function>(C);
+      Call->setCalledFunction(NewF);
+      continue;
+    }
+
+    Builder.SetInsertPoint(Intr);
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // These intrinsics are for address space 0 only
+      Intr->eraseFromParent();
+      continue;
+    case Intrinsic::memcpy: {
+      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+                           MemCpy->getLength(), MemCpy->getAlignment(),
+                           MemCpy->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memset: {
+      MemSetInst *MemSet = cast<MemSetInst>(Intr);
+      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+                           MemSet->getLength(), MemSet->getAlignment(),
+                           MemSet->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    default:
+      Intr->dump();
+      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
+    }
+  }
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
+  return new AMDGPUPromoteAlloca(ST);
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp
new file mode 100644
index 0000000..3433280
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -0,0 +1,67 @@
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPUGenRegisterInfo(0),
+  ST(st)
+  { }
+
+//===----------------------------------------------------------------------===//
+// Function handling callbacks - Functions are a seldom used feature of GPUS, so
+// they are not supported at this time.
+//===----------------------------------------------------------------------===//
+
+const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
+
+const MCPhysReg*
+AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return &CalleeSavedReg;
+}
+
+void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                             int SPAdj,
+                                             unsigned FIOperandNum,
+                                             RegScavenger *RS) const {
+  llvm_unreachable("Subroutines not supported yet");
+}
+
+unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  assert(!"Subroutines not supported yet");
+  return 0;
+}
+
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned SubRegs[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+    AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+    AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+    AMDGPU::sub15
+  };
+
+  assert(Channel < array_lengthof(SubRegs));
+  return SubRegs[Channel];
+}
+
+unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
+
+  return getSubRegFromChannel(IndirectIndex);
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "AMDGPUGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h
new file mode 100644
index 0000000..46aa7a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h
@@ -0,0 +1,65 @@
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUREGISTERINFO_H
+#define AMDGPUREGISTERINFO_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class TargetInstrInfo;
+
+struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
+  static const MCPhysReg CalleeSavedReg;
+  const AMDGPUSubtarget &ST;
+
+  AMDGPURegisterInfo(const AMDGPUSubtarget &st);
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override {
+    assert(!"Unimplemented");  return BitVector();
+  }
+
+  virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
+    assert(!"Unimplemented"); return nullptr;
+  }
+
+  virtual unsigned getHWRegIndex(unsigned Reg) const {
+    assert(!"Unimplemented"); return 0;
+  }
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
+  const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
+  virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  unsigned getIndirectSubReg(unsigned IndirectIndex) const;
+
+};
+
+} // End namespace llvm
+
+#endif // AMDIDSAREGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td
new file mode 100644
index 0000000..835a146
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td
@@ -0,0 +1,26 @@
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tablegen register definitions common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AMDGPU" in {
+
+foreach Index = 0-15 in {
+  // Indices are used in a variety of ways here, so don't set a size/offset.
+  def sub#Index : SubRegIndex<-1, -1>;
+}
+
+def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
+
+}
+
+include "R600RegisterInfo.td"
+include "SIRegisterInfo.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
new file mode 100644
index 0000000..e3c2a50
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -0,0 +1,89 @@
+//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallString.h"
+
+#include "llvm/ADT/SmallString.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AMDGPUGenSubtargetInfo.inc"
+
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) :
+  AMDGPUGenSubtargetInfo(TT, GPU, FS),
+  DevName(GPU),
+  Is64bit(false),
+  DumpCode(false),
+  R600ALUInst(false),
+  HasVertexCache(false),
+  TexVTXClauseSize(0),
+  Gen(AMDGPUSubtarget::R600),
+  FP64(false),
+  FP64Denormals(false),
+  FP32Denormals(false),
+  CaymanISA(false),
+  EnableIRStructurizer(true),
+  EnablePromoteAlloca(false),
+  EnableIfCvt(true),
+  WavefrontSize(0),
+  CFALUBug(false),
+  LocalMemorySize(0),
+  InstrItins(getInstrItineraryForCPU(GPU)) {
+  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
+  // enabled, but some instructions do not respect them and they run at the
+  // double precision rate, so don't enable by default.
+  //
+  // We want to be able to turn these off, but making this a subtarget feature
+  // for SI has the unhelpful behavior that it unsets everything else if you
+  // disable it.
+
+  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+  FullFS += FS;
+
+  ParseSubtargetFeatures(GPU, FullFS);
+
+  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    InstrInfo.reset(new R600InstrInfo(*this));
+
+    // FIXME: I don't think think Evergreen has any useful support for
+    // denormals, but should be checked. Should we issue a warning somewhere if
+    // someone tries to enable these?
+    FP32Denormals = false;
+    FP64Denormals = false;
+  } else {
+    InstrInfo.reset(new SIInstrInfo(*this));
+  }
+}
+
+unsigned AMDGPUSubtarget::getStackEntrySize() const {
+  assert(getGeneration() <= NORTHERN_ISLANDS);
+  switch(getWavefrontSize()) {
+  case 16:
+    return 8;
+  case 32:
+    return hasCaymanISA() ? 4 : 8;
+  case 64:
+    return 4;
+  default:
+    llvm_unreachable("Illegal wavefront size.");
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
new file mode 100644
index 0000000..a844b37
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
@@ -0,0 +1,199 @@
+//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUSUBTARGET_H
+#define AMDGPUSUBTARGET_H
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#define MAX_CB_SIZE (1 << 16)
+
+namespace llvm {
+
+class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+
+  std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
+
+public:
+  enum Generation {
+    R600 = 0,
+    R700,
+    EVERGREEN,
+    NORTHERN_ISLANDS,
+    SOUTHERN_ISLANDS,
+    SEA_ISLANDS
+  };
+
+private:
+  std::string DevName;
+  bool Is64bit;
+  bool DumpCode;
+  bool R600ALUInst;
+  bool HasVertexCache;
+  short TexVTXClauseSize;
+  Generation Gen;
+  bool FP64;
+  bool FP64Denormals;
+  bool FP32Denormals;
+  bool CaymanISA;
+  bool EnableIRStructurizer;
+  bool EnablePromoteAlloca;
+  bool EnableIfCvt;
+  unsigned WavefrontSize;
+  bool CFALUBug;
+  int LocalMemorySize;
+
+  InstrItineraryData InstrItins;
+
+public:
+  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+
+  const AMDGPUInstrInfo *getInstrInfo() const {
+    return InstrInfo.get();
+  }
+
+  const InstrItineraryData &getInstrItineraryData() const {
+    return InstrItins;
+  }
+
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool is64bit() const {
+    return Is64bit;
+  }
+
+  bool hasVertexCache() const {
+    return HasVertexCache;
+  }
+
+  short getTexVTXClauseSize() const {
+    return TexVTXClauseSize;
+  }
+
+  Generation getGeneration() const {
+    return Gen;
+  }
+
+  bool hasHWFP64() const {
+    return FP64;
+  }
+
+  bool hasCaymanISA() const {
+    return CaymanISA;
+  }
+
+  bool hasFP32Denormals() const {
+    return FP32Denormals;
+  }
+
+  bool hasFP64Denormals() const {
+    return FP64Denormals;
+  }
+
+  bool hasBFE() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBFI() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBFM() const {
+    return hasBFE();
+  }
+
+  bool hasBCNT(unsigned Size) const {
+    if (Size == 32)
+      return (getGeneration() >= EVERGREEN);
+
+    if (Size == 64)
+      return (getGeneration() >= SOUTHERN_ISLANDS);
+
+    return false;
+  }
+
+  bool hasMulU24() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasMulI24() const {
+    return (getGeneration() >= SOUTHERN_ISLANDS ||
+            hasCaymanISA());
+  }
+
+  bool hasFFBL() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasFFBH() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool IsIRStructurizerEnabled() const {
+    return EnableIRStructurizer;
+  }
+
+  bool isPromoteAllocaEnabled() const {
+    return EnablePromoteAlloca;
+  }
+
+  bool isIfCvtEnabled() const {
+    return EnableIfCvt;
+  }
+
+  unsigned getWavefrontSize() const {
+    return WavefrontSize;
+  }
+
+  unsigned getStackEntrySize() const;
+
+  bool hasCFAluBug() const {
+    assert(getGeneration() <= NORTHERN_ISLANDS);
+    return CFALUBug;
+  }
+
+  int getLocalMemorySize() const {
+    return LocalMemorySize;
+  }
+
+  bool enableMachineScheduler() const override {
+    return getGeneration() <= NORTHERN_ISLANDS;
+  }
+
+  // Helper functions to simplify if statements
+  bool isTargetELF() const {
+    return false;
+  }
+
+  StringRef getDeviceName() const {
+    return DevName;
+  }
+
+  bool dumpCode() const {
+    return DumpCode;
+  }
+  bool r600ALUEncoding() const {
+    return R600ALUInst;
+  }
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
new file mode 100644
index 0000000..56ba719
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -0,0 +1,221 @@
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU target machine contains all of the hardware specific
+/// information  needed to emit code for R600 and SI GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "R600MachineScheduler.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include <llvm/CodeGen/Passes.h>
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeR600Target() {
+  // Register the target
+  RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
+}
+
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("r600", "Run R600's custom scheduler",
+                    createR600MachineScheduler);
+
+static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
+  std::string Ret = "e-p:32:32";
+
+  if (ST.is64bit()) {
+    // 32-bit local, and region pointers. 64-bit private, global, and constant.
+    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+  }
+
+  Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
+         "-v512:512-v1024:1024-v2048:2048-n32:64";
+
+  return Ret;
+}
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
+    StringRef CPU, StringRef FS,
+  TargetOptions Options,
+  Reloc::Model RM, CodeModel::Model CM,
+  CodeGenOpt::Level OptLevel
+)
+:
+  LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+  Subtarget(TT, CPU, FS),
+  Layout(computeDataLayout(Subtarget)),
+  FrameLowering(TargetFrameLowering::StackGrowsUp,
+                64 * 16 // Maximum stack alignment (long16)
+               , 0),
+  IntrinsicInfo(this),
+  InstrItins(&Subtarget.getInstrItineraryData()) {
+  // TLInfo uses InstrInfo so it must be initialized after.
+  if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    TLInfo.reset(new R600TargetLowering(*this));
+  } else {
+    TLInfo.reset(new SITargetLowering(*this));
+  }
+  setRequiresStructuredCFG(true);
+  initAsmInfo();
+}
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() {
+}
+
+namespace {
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+  AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+    return getTM<AMDGPUTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      return createR600MachineScheduler(C);
+    return nullptr;
+  }
+
+  virtual void addCodeGenPrepare();
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
+};
+} // End of anonymous namespace
+
+TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AMDGPUPassConfig(this, PM);
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Analysis Pass Setup
+//===----------------------------------------------------------------------===//
+
+void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This
+  // allows the AMDGPU pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createAMDGPUTargetTransformInfoPass(this));
+}
+
+void AMDGPUPassConfig::addCodeGenPrepare() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  if (ST.isPromoteAllocaEnabled()) {
+    addPass(createAMDGPUPromoteAlloca(ST));
+    addPass(createSROAPass());
+  }
+
+  TargetPassConfig::addCodeGenPrepare();
+}
+
+bool
+AMDGPUPassConfig::addPreISel() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  addPass(createFlattenCFGPass());
+  if (ST.IsIRStructurizerEnabled())
+    addPass(createStructurizeCFGPass());
+  if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    addPass(createSinkingPass());
+    addPass(createSITypeRewriter());
+    addPass(createSIAnnotateControlFlowPass());
+  } else {
+    addPass(createR600TextureIntrinsicsReplacer());
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+  addPass(createSILowerI1CopiesPass());
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreRegAlloc() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createR600VectorRegMerger(*TM));
+  } else {
+    addPass(createSIFixSGPRCopiesPass(*TM));
+    // SIFixSGPRCopies can generate a lot of duplicate instructions,
+    // so we need to run MachineCSE afterwards.
+    addPass(&MachineCSEID);
+    addPass(createSIShrinkInstructionsPass());
+    initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+    insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID);
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addPostRegAlloc() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+  addPass(createSIShrinkInstructionsPass());
+  if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createSIInsertWaits(*TM));
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreSched2() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    addPass(createR600EmitClauseMarkers());
+  if (ST.isIfCvtEnabled())
+    addPass(&IfConverterID);
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    addPass(createR600ClauseMergePass(*TM));
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreEmitPass() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createAMDGPUCFGStructurizerPass());
+    addPass(createR600ExpandSpecialInstrsPass(*TM));
+    addPass(&FinalizeMachineBundlesID);
+    addPass(createR600Packetizer(*TM));
+    addPass(createR600ControlFlowFinalizer(*TM));
+  } else {
+    addPass(createSILowerControlFlowPass(*TM));
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h
new file mode 100644
index 0000000..3bb15be
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h
@@ -0,0 +1,71 @@
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_TARGET_MACHINE_H
+#define AMDGPU_TARGET_MACHINE_H
+
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600ISelLowering.h"
+#include "llvm/IR/DataLayout.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine : public LLVMTargetMachine {
+
+  AMDGPUSubtarget Subtarget;
+  const DataLayout Layout;
+  AMDGPUFrameLowering FrameLowering;
+  AMDGPUIntrinsicInfo IntrinsicInfo;
+  std::unique_ptr<AMDGPUTargetLowering> TLInfo;
+  const InstrItineraryData *InstrItins;
+
+public:
+  AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
+                      StringRef CPU, TargetOptions Options, Reloc::Model RM,
+                      CodeModel::Model CM, CodeGenOpt::Level OL);
+  ~AMDGPUTargetMachine();
+  const AMDGPUFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+    return &IntrinsicInfo;
+  }
+  const AMDGPUInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const AMDGPUSubtarget *getSubtargetImpl() const override {
+    return &Subtarget;
+  }
+  const AMDGPURegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  AMDGPUTargetLowering *getTargetLowering() const override {
+    return TLInfo.get();
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return InstrItins;
+  }
+  const DataLayout *getDataLayout() const override { return &Layout; }
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  /// \brief Register R600 analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
+};
+
+} // End namespace llvm
+
+#endif // AMDGPU_TARGET_MACHINE_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
new file mode 100644
index 0000000..88934b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -0,0 +1,153 @@
+//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements a TargetTransformInfo analysis pass specific to the
+// AMDGPU target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "AMDGPUtti"
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeAMDGPUTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
+  const AMDGPUTargetMachine *TM;
+  const AMDGPUSubtarget *ST;
+  const AMDGPUTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  AMDGPUTTI(const AMDGPUTargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  void initializePass() override { pushTTIStack(this); }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo *)this;
+    return this;
+  }
+
+  bool hasBranchDivergence() const override;
+
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
+
+  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;
+
+  unsigned getNumberOfRegisters(bool Vector) const override;
+  unsigned getRegisterBitWidth(bool Vector) const override;
+  unsigned getMaximumUnrollFactor() const override;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
+                   "AMDGPU Target Transform Info", true, true, false)
+char AMDGPUTTI::ID = 0;
+
+ImmutablePass *
+llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
+  return new AMDGPUTTI(TM);
+}
+
+bool AMDGPUTTI::hasBranchDivergence() const { return true; }
+
+void AMDGPUTTI::getUnrollingPreferences(Loop *L,
+                                        UnrollingPreferences &UP) const {
+  for (const BasicBlock *BB : L->getBlocks()) {
+    for (const Instruction &I : *BB) {
+      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
+      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+        continue;
+
+      const Value *Ptr = GEP->getPointerOperand();
+      const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
+      if (Alloca) {
+        // We want to do whatever we can to limit the number of alloca
+        // instructions that make it through to the code generator.  allocas
+        // require us to use indirect addressing, which is slow and prone to
+        // compiler bugs.  If this loop does an address calculation on an
+        // alloca ptr, then we want to use a higher than normal loop unroll
+        // threshold. This will give SROA a better chance to eliminate these
+        // allocas.
+        //
+        // Don't use the maximum allowed value here as it will make some
+        // programs way too big.
+        UP.Threshold = 500;
+      }
+    }
+  }
+}
+
+AMDGPUTTI::PopcntSupportKind
+AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software;
+}
+
+unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
+  if (Vec)
+    return 0;
+
+  // Number of VGPRs on SI.
+  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return 256;
+
+  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
+  return 32;
+}
+
+unsigned AMDGPUTTI::getMaximumUnrollFactor() const {
+  // Semi-arbitrary large amount.
+  return 64;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp
new file mode 100644
index 0000000..f3a0391
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -0,0 +1,1911 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "R600InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "structcfg"
+
+#define DEFAULT_VEC_SLOTS 8
+
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
+    "matched");
+STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
+    "matched");
+STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
+    "pattern matched");
+STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
+
+namespace llvm {
+  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace {
+#define SHOWNEWINSTR(i) \
+  DEBUG(dbgs() << "New instr: " << *i << "\n");
+
+#define SHOWNEWBLK(b, msg) \
+DEBUG( \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  dbgs() << "\n"; \
+);
+
+#define SHOWBLK_DETAIL(b, msg) \
+DEBUG( \
+  if (b) { \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(dbgs()); \
+  dbgs() << "\n"; \
+  } \
+);
+
+#define INVALIDSCCNUM -1
+
+template<class NodeT>
+void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+
+namespace {
+
+class BlockInformation {
+public:
+  bool IsRetired;
+  int  SccNum;
+  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
+public:
+  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
+  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
+  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+
+  enum PathToKind {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  };
+
+  static char ID;
+
+  AMDGPUCFGStructurizer() :
+      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
+    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
+
+   const char *getPassName() const override {
+    return "AMDGPU Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<MachineFunctionAnalysis>();
+    AU.addRequired<MachineFunctionAnalysis>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+  }
+
+  /// Perform the CFG structurization
+  bool run();
+
+  /// Perform the CFG preparation
+  /// This step will remove every unconditionnal/dead jump instructions and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TRI = &TII->getRegisterInfo();
+    DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    DEBUG(MF.dump(););
+    return true;
+  }
+
+protected:
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII;
+  const AMDGPURegisterInfo *TRI;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+  void printOrderedBlocks() const {
+    size_t i = 0;
+    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
+        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
+      dbgs() << "BB" << (*iterBlk)->getNumber();
+      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+      if (i != 0 && i % 10 == 0) {
+        dbgs() << "\n";
+      } else {
+        dbgs() << " ";
+      }
+    }
+  }
+  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
+    for (MachineLoop::iterator iter = LoopInfo.begin(),
+         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
+      (*iter)->print(dbgs(), 0);
+    }
+  }
+
+  // UTILITY FUNCTIONS
+  int getSCCNum(MachineBasicBlock *MBB) const;
+  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
+  bool hasBackEdge(MachineBasicBlock *MBB) const;
+  static unsigned getLoopDepth(MachineLoop *LoopRep);
+  bool isRetiredBlock(MachineBasicBlock *MBB) const;
+  bool isActiveLoophead(MachineBasicBlock *MBB) const;
+  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+      bool AllowSideEntry = true) const;
+  int countActiveBlock(MBBVector::const_iterator It,
+      MBBVector::const_iterator E) const;
+  bool needMigrateBlock(MachineBasicBlock *MBB) const;
+
+  // Utility Functions
+  void reversePredicateSetter(MachineBasicBlock::iterator I);
+  /// Compute the reversed DFS post order of Blocks
+  void orderBlocks(MachineFunction *MF);
+
+  // Function originally from CFGStructTraits
+  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
+      DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
+    DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
+  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
+      DebugLoc DL);
+  void insertCondBranchBefore(MachineBasicBlock *MBB,
+      MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+      DebugLoc DL);
+  void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
+  static int getBranchNzeroOpcode(int OldOpcode);
+  static int getBranchZeroOpcode(int OldOpcode);
+  static int getContinueNzeroOpcode(int OldOpcode);
+  static int getContinueZeroOpcode(int OldOpcode);
+  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
+  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
+  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
+      MachineInstr *MI);
+  static bool isCondBranch(MachineInstr *MI);
+  static bool isUncondBranch(MachineInstr *MI);
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  ///
+  /// BB with backward-edge could have move instructions after the branch
+  /// instruction.  Such move instruction "belong to" the loop backward-edge.
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB) ;
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as terminator fix this
+  /// and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+  static void wrapup(MachineBasicBlock *MBB);
+
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
+
+  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+      MachineLoop *ContLoop);
+  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+      MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+      MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineBasicBlock *ContMBB);
+  /// normalizeInfiniteLoopExit change
+  ///   B1:
+  ///        uncond_br LoopHeader
+  ///
+  /// to
+  ///   B1:
+  ///        cond_br 1 LoopHeader dummyExit
+  /// and return the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+  /// Remove duplicate branches instructions in a block.
+  /// For instance
+  /// B0:
+  ///    cond_br X B1 B2
+  ///    cond_br X B1 B2
+  /// is transformed to
+  /// B0:
+  ///    cond_br X B1 B2
+  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
+  void removeSuccessor(MachineBasicBlock *MBB);
+  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
+      MachineBasicBlock *PredMBB);
+  void migrateInstruction(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
+  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
+  void retireBlock(MachineBasicBlock *MBB);
+  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
+
+  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
+  /// This is work around solution for findNearestCommonDominator not avaiable
+  /// to post dom a proper fix should go to Dominators.h.
+  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
+      MachineBasicBlock *MBB2);
+
+private:
+  MBBInfoMap BlockInfoMap;
+  LoopLandInfoMap LLInfoMap;
+  std::map<MachineLoop *, bool> Visited;
+  MachineFunction *FuncRep;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
+};
+
+int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return INVALIDSCCNUM;
+  return (*It).second->SccNum;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+    const {
+  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
+  if (It == LLInfoMap.end())
+    return nullptr;
+  return (*It).second;
+}
+
+bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  if (!LoopRep)
+    return false;
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  return MBB->isSuccessor(LoopHeader);
+}
+
+unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
+  return LoopRep ? LoopRep->getLoopDepth() : 0;
+}
+
+bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return false;
+  return (*It).second->IsRetired;
+}
+
+bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  while (LoopRep && LoopRep->getHeader() == MBB) {
+    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
+    if(!LoopLand)
+      return true;
+    if (!isRetiredBlock(LoopLand))
+      return true;
+    LoopRep = LoopRep->getParentLoop();
+  }
+  return false;
+}
+AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+    bool AllowSideEntry) const {
+  assert(DstMBB);
+  if (SrcMBB == DstMBB)
+    return SinglePath_InPath;
+  while (SrcMBB && SrcMBB->succ_size() == 1) {
+    SrcMBB = *SrcMBB->succ_begin();
+    if (SrcMBB == DstMBB)
+      return SinglePath_InPath;
+    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
+      return Not_SinglePath;
+  }
+  if (SrcMBB && SrcMBB->succ_size()==0)
+    return SinglePath_NotInPath;
+  return Not_SinglePath;
+}
+
+int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+    MBBVector::const_iterator E) const {
+  int Count = 0;
+  while (It != E) {
+    if (!isRetiredBlock(*It))
+      ++Count;
+    ++It;
+  }
+  return Count;
+}
+
+bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+  unsigned BlockSizeThreshold = 30;
+  unsigned CloneInstrThreshold = 100;
+  bool MultiplePreds = MBB && (MBB->pred_size() > 1);
+
+  if(!MultiplePreds)
+    return false;
+  unsigned BlkSize = MBB->size();
+  return ((BlkSize > BlockSizeThreshold) &&
+      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
+}
+
+void AMDGPUCFGStructurizer::reversePredicateSetter(
+    MachineBasicBlock::iterator I) {
+  while (I--) {
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
+      case OPCODE_IS_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO_INT);
+        return;
+      case OPCODE_IS_NOT_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO_INT);
+        return;
+      case OPCODE_IS_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO);
+        return;
+      case OPCODE_IS_NOT_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO);
+        return;
+      default:
+        llvm_unreachable("PRED_X Opcode invalid!");
+      }
+    }
+  }
+}
+
+void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+ MachineInstr *MI = MBB->getParent()
+    ->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->push_back(MI);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(MI);
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+  MachineInstr *MI =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
+  if (MBB->begin() != MBB->end())
+    MBB->insert(MBB->begin(), MI);
+  else
+    MBB->push_back(MI);
+  SHOWNEWINSTR(MI);
+  return MI;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+    MachineBasicBlock::iterator I, int NewOpcode) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineInstr *NewMBB =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->insert(I, NewMBB);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(NewMBB);
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->insert(I, NewMI);
+  MachineInstrBuilder MIB(*MF, NewMI);
+  MIB.addReg(OldMI->getOperand(1).getReg(), false);
+  SHOWNEWINSTR(NewMI);
+  //erase later oldInstr->eraseFromParent();
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
+    MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+    DebugLoc DL) {
+  MachineFunction *MF = blk->getParent();
+  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  //insert before
+  blk->insert(I, NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
+    int NewOpcode, int RegNum) {
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewInstr =
+    MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->push_back(NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+  default: llvm_unreachable("internal error");
+  };
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+  return MI->getOperand(0).getMBB();
+}
+
+void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+    MachineBasicBlock *MBB) {
+  MI->getOperand(0).setMBB(MBB);
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+    MachineInstr *MI) {
+  assert(MBB->succ_size() == 2);
+  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
+  MachineBasicBlock::succ_iterator Next = It;
+  ++Next;
+  return (*It == TrueBranch) ? *Next : *It;
+}
+
+bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32: return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::JUMP:
+  case AMDGPU::BRANCH:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+  //get DebugLoc from the first MachineBasicBlock instruction with debug info
+  DebugLoc DL;
+  for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
+      ++It) {
+    MachineInstr *instr = &(*It);
+    if (instr->getDebugLoc().isUnknown() == false)
+      DL = instr->getDebugLoc();
+  }
+  return DL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  MachineInstr *MI = &*It;
+  if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
+    return MI;
+  return nullptr;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
+      It != E; ++It) {
+    // FIXME: Simplify
+    MachineInstr *MI = &*It;
+    if (MI) {
+      if (isCondBranch(MI) || isUncondBranch(MI))
+        return MI;
+      else if (!TII->isMov(MI->getOpcode()))
+        break;
+    }
+  }
+  return nullptr;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *instr = &(*It);
+    if (instr->getOpcode() == AMDGPU::RETURN)
+      return instr;
+  }
+  return nullptr;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *MI = &(*It);
+    if (MI->getOpcode() == AMDGPU::CONTINUE)
+      return MI;
+  }
+  return nullptr;
+}
+
+bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+  MachineInstr *MI = getReturnInstr(MBB);
+  bool IsReturn = (MBB->succ_size() == 0);
+  if (MI)
+    assert(IsReturn);
+  else if (IsReturn)
+    DEBUG(
+      dbgs() << "BB" << MBB->getNumber()
+             <<" is return block without RETURN instr\n";);
+  return  IsReturn;
+}
+
+void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
+       iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
+    DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+  MachineFunction *Func = MBB->getParent();
+  MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
+  Func->push_back(NewMBB);  //insert to function
+  for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
+      It != E; ++It) {
+    MachineInstr *MI = Func->CloneMachineInstr(It);
+    NewMBB->push_back(MI);
+  }
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
+    MachineBasicBlock *NewBlk) {
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
+  if (BranchMI && isCondBranch(BranchMI) &&
+      getTrueBranch(BranchMI) == OldMBB)
+    setTrueBranch(BranchMI, NewBlk);
+}
+
+void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+   //collect continue right before endloop
+   SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
+   MachineBasicBlock::iterator Pre = MBB->begin();
+   MachineBasicBlock::iterator E = MBB->end();
+   MachineBasicBlock::iterator It = Pre;
+   while (It != E) {
+     if (Pre->getOpcode() == AMDGPU::CONTINUE
+         && It->getOpcode() == AMDGPU::ENDLOOP)
+       ContInstr.push_back(Pre);
+     Pre = It;
+     ++It;
+   }
+
+   //delete continue right before endloop
+   for (unsigned i = 0; i < ContInstr.size(); ++i)
+      ContInstr[i]->eraseFromParent();
+
+   // TODO to fix up jump table so later phase won't be confused.  if
+   // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+   // there isn't such an interface yet.  alternatively, replace all the other
+   // blocks in the jump table with the entryBlk //}
+
+}
+
+
+bool AMDGPUCFGStructurizer::prepare() {
+  bool Changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
+
+  // Add an ExitBlk to loop that don't have one
+  for (MachineLoopInfo::iterator It = MLI->begin(),
+       E = MLI->end(); It != E; ++It) {
+    MachineLoop *LoopRep = (*It);
+    MBBVector ExitingMBBs;
+    LoopRep->getExitingBlocks(ExitingMBBs);
+
+    if (ExitingMBBs.size() == 0) {
+      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
+      if (DummyExitBlk)
+        RetBlks.push_back(DummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
+       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB)) {
+      RetBlks.push_back(MBB);
+    }
+    assert(MBB->succ_size() <= 2);
+  }
+
+  if (RetBlks.size() >= 2) {
+    addDummyExitBlock(RetBlks);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUCFGStructurizer::run() {
+
+  //Assume reducible CFG...
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+
+#ifdef STRESSTEST
+  //Use the worse block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
+
+  do {
+    ++NumIter;
+    DEBUG(
+      dbgs() << "numIter = " << NumIter
+             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
+    );
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = nullptr;
+    int SccNumBlk = 0;  // The number of active blocks, init to a
+                        // maximum possible number.
+    int SccNumIter;     // Number of iteration in this SCC.
+
+    while (It != E) {
+      MBB = *It;
+
+      if (!SccBeginMBB) {
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
+        DEBUG(
+              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+              dbgs() << "\n";
+        );
+      }
+
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
+
+      ++It;
+
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
+        // Just finish one scc.
+        ++SccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
+          DEBUG(
+            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+                   << ", sccNumIter = " << SccNumIter;
+            dbgs() << "doesn't make any progress\n";
+          );
+          ContNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
+          SccNumBlk = sccRemainedNumBlk;
+          It = SccBeginIter;
+          ContNextScc = false;
+          DEBUG(
+            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+                   << "sccNumIter = " << SccNumIter << '\n';
+          );
+        } else {
+          // Finish the current scc.
+          ContNextScc = true;
+        }
+      } else {
+        // Continue on next component in the current scc.
+        ContNextScc = false;
+      }
+
+      if (ContNextScc)
+        SccBeginMBB = nullptr;
+    } //while, "one iteration" over the function.
+
+    MachineBasicBlock *EntryMBB =
+        GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+    if (EntryMBB->succ_size() == 0) {
+      Finish = true;
+      DEBUG(
+        dbgs() << "Reduce to one block\n";
+      );
+    } else {
+      int NewnumRemainedBlk
+        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
+      // consider cloned blocks ??
+      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
+        MakeProgress = true;
+        NumRemainedBlk = NewnumRemainedBlk;
+      } else {
+        MakeProgress = false;
+        DEBUG(
+          dbgs() << "No progress\n";
+        );
+      }
+    }
+  } while (!Finish && MakeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
+
+  // Detach retired Block, release memory.
+  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
+      It != E; ++It) {
+    if ((*It).second && (*It).second->IsRetired) {
+      assert(((*It).first)->getNumber() != -1);
+      DEBUG(
+        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
+      );
+      (*It).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*It).second;
+  }
+  BlockInfoMap.clear();
+  LLInfoMap.clear();
+
+  if (!Finish) {
+    DEBUG(FuncRep->viewCFG());
+    llvm_unreachable("IRREDUCIBLE_CFG");
+  }
+
+  return true;
+}
+
+
+
+void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+  int SccNum = 0;
+  MachineBasicBlock *MBB;
+  for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
+       ++It, ++SccNum) {
+    const std::vector<MachineBasicBlock *> &SccNext = *It;
+    for (std::vector<MachineBasicBlock *>::const_iterator
+         blockIter = SccNext.begin(), blockEnd = SccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      MBB = *blockIter;
+      OrderedBlks.push_back(MBB);
+      recordSccnum(MBB, SccNum);
+    }
+  }
+
+  //walk through all the block in func to check for unreachable
+  typedef GraphTraits<MachineFunction *> GTM;
+  MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = &(*It);
+    SccNum = getSCCNum(MBB);
+    if (SccNum == INVALIDSCCNUM)
+      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
+  }
+}
+
+int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  int CurMatch;
+
+  DEBUG(
+        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
+  );
+
+  while ((CurMatch = patternMatchGroup(MBB)) > 0)
+    NumMatch += CurMatch;
+
+  DEBUG(
+        dbgs() << "End patternMatch BB" << MBB->getNumber()
+      << ", numMatch = " << NumMatch << "\n";
+  );
+
+  return NumMatch;
+}
+
+int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  NumMatch += loopendPatternMatch();
+  NumMatch += serialPatternMatch(MBB);
+  NumMatch += ifPatternMatch(MBB);
+  return NumMatch;
+}
+
+
+int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 1)
+    return 0;
+
+  MachineBasicBlock *childBlk = *MBB->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
+    return 0;
+
+  mergeSerialBlock(MBB, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+}
+
+int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
+  //two edges
+  if (MBB->succ_size() != 2)
+    return 0;
+  if (hasBackEdge(MBB))
+    return 0;
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  if (!BranchMI)
+    return 0;
+
+  assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
+
+  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
+  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
+  MachineBasicBlock *LandBlk;
+  int Cloned = 0;
+
+  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
+  // TODO: Simplify
+  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
+    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
+    // Diamond pattern
+    LandBlk = *TrueMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
+    // Triangle pattern, false is empty
+    LandBlk = FalseMBB;
+    FalseMBB = nullptr;
+  } else if (FalseMBB->succ_size() == 1
+             && *FalseMBB->succ_begin() == TrueMBB) {
+    // Triangle pattern, true is empty
+    // We reverse the predicate to make a triangle, empty false pattern;
+    std::swap(TrueMBB, FalseMBB);
+    reversePredicateSetter(MBB->end());
+    LandBlk = FalseMBB;
+    FalseMBB = nullptr;
+  } else if (FalseMBB->succ_size() == 1
+             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
+    LandBlk = *FalseMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1
+    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
+    LandBlk = *TrueMBB->succ_begin();
+  } else {
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
+  }
+
+  // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
+  // new BB created for landBlk==NULL may introduce new challenge to the
+  // reduction process.
+  if (LandBlk &&
+      ((TrueMBB && TrueMBB->pred_size() > 1)
+      || (FalseMBB && FalseMBB->pred_size() > 1))) {
+     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
+  }
+
+  if (TrueMBB && TrueMBB->pred_size() > 1) {
+    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+    ++Cloned;
+  }
+
+  if (FalseMBB && FalseMBB->pred_size() > 1) {
+    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+    ++Cloned;
+  }
+
+  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += Cloned;
+
+  return 1 + Cloned + NumMatch;
+}
+
+int AMDGPUCFGStructurizer::loopendPatternMatch() {
+  std::vector<MachineLoop *> NestedLoops;
+  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E;
+       ++It)
+    for (MachineLoop *ML : depth_first(*It))
+      NestedLoops.push_back(ML);
+
+  if (NestedLoops.size() == 0)
+    return 0;
+
+  // Process nested loop outside->inside, so "continue" to a outside loop won't
+  // be mistaken as "break" of the current loop.
+  int Num = 0;
+  for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(),
+      E = NestedLoops.rend(); It != E; ++It) {
+    MachineLoop *ExaminedLoop = *It;
+    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
+      continue;
+    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+    int NumBreak = mergeLoop(ExaminedLoop);
+    if (NumBreak == -1)
+      break;
+    Num += NumBreak;
+  }
+  return Num;
+}
+
+int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MBBVector ExitingMBBs;
+  LoopRep->getExitingBlocks(ExitingMBBs);
+  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+  // We assume a single ExitBlk
+  MBBVector ExitBlks;
+  LoopRep->getExitBlocks(ExitBlks);
+  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
+  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
+    ExitBlkSet.insert(ExitBlks[i]);
+  assert(ExitBlkSet.size() == 1);
+  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+  assert(ExitBlk && "Loop has several exit block");
+  MBBVector LatchBlks;
+  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
+  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
+      PE = InvMBBTraits::child_end(LoopHeader);
+  for (; PI != PE; PI++) {
+    if (LoopRep->contains(*PI))
+      LatchBlks.push_back(*PI);
+  }
+
+  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
+    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
+  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
+    settleLoopcontBlock(LatchBlks[i], LoopHeader);
+  int Match = 0;
+  do {
+    Match = 0;
+    Match += serialPatternMatch(LoopHeader);
+    Match += ifPatternMatch(LoopHeader);
+  } while (Match > 0);
+  mergeLooplandBlock(LoopHeader, ExitBlk);
+  MachineLoop *ParentLoop = LoopRep->getParentLoop();
+  if (ParentLoop)
+    MLI->changeLoopFor(LoopHeader, ParentLoop);
+  else
+    MLI->removeBlock(LoopHeader);
+  Visited[LoopRep] = true;
+  return 1;
+}
+
+int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
+    MachineBasicBlock *LoopHeader) {
+  int NumCont = 0;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
+  typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
+  GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
+      E = GTIM::child_end(LoopHeader);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (LoopRep->contains(MBB)) {
+      handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
+                          LoopHeader, LoopRep);
+      ContMBB.push_back(MBB);
+      ++NumCont;
+    }
+  }
+
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
+      E = ContMBB.end(); It != E; ++It) {
+    (*It)->removeSuccessor(LoopHeader);
+  }
+
+  numLoopcontPatternMatch += NumCont;
+
+  return NumCont;
+}
+
+
+bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+  if (Src1MBB->succ_size() == 0) {
+    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
+      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+      if (TheEntry) {
+        DEBUG(
+          dbgs() << "isLoopContBreakBlock yes src1 = BB"
+                 << Src1MBB->getNumber()
+                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
+        );
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+  if (Num == 0) {
+    DEBUG(
+      dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    );
+    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
+  }
+  return Num;
+}
+
+int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = 0;
+  MachineBasicBlock *DownBlk;
+
+  //trueBlk could be the common post dominator
+  DownBlk = TrueMBB;
+
+  DEBUG(
+    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+           << " true = BB" << TrueMBB->getNumber()
+           << ", numSucc=" << TrueMBB->succ_size()
+           << " false = BB" << FalseMBB->getNumber() << "\n";
+  );
+
+  while (DownBlk) {
+    DEBUG(
+      dbgs() << "check down = BB" << DownBlk->getNumber();
+    );
+
+    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
+      DEBUG(
+        dbgs() << " working\n";
+      );
+
+      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
+      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
+
+      numClonedBlock += Num;
+      Num += serialPatternMatch(*HeadMBB->succ_begin());
+      Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
+      Num += ifPatternMatch(HeadMBB);
+      assert(Num > 0);
+
+      break;
+    }
+    DEBUG(
+      dbgs() << " not working\n";
+    );
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
+  } // walk down the postDomTree
+
+  return Num;
+}
+
+void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
+  dbgs() << "head = BB" << HeadMBB->getNumber()
+         << " size = " << HeadMBB->size();
+  if (Detail) {
+    dbgs() << "\n";
+    HeadMBB->print(dbgs());
+    dbgs() << "\n";
+  }
+
+  if (TrueMBB) {
+    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
+           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      TrueMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+  if (FalseMBB) {
+    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
+           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      FalseMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+  if (LandMBB) {
+    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
+           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      LandMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+
+    dbgs() << "\n";
+}
+
+int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+    MachineBasicBlock **LandMBBPtr) {
+  bool MigrateTrue = false;
+  bool MigrateFalse = false;
+
+  MachineBasicBlock *LandBlk = *LandMBBPtr;
+
+  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
+         && (!FalseMBB || FalseMBB->succ_size() <= 1));
+
+  if (TrueMBB == FalseMBB)
+    return 0;
+
+  MigrateTrue = needMigrateBlock(TrueMBB);
+  MigrateFalse = needMigrateBlock(FalseMBB);
+
+  if (!MigrateTrue && !MigrateFalse)
+    return 0;
+
+  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
+  // have more than one predecessors.  without doing this, its predecessor
+  // rather than headBlk will have undefined value in initReg.
+  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+    MigrateTrue = true;
+  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+    MigrateFalse = true;
+
+  DEBUG(
+    dbgs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the about if-else inside
+  //      if (initReg !=2) {...}
+  //
+  // add initReg = initVal to headBlk
+
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity here to optimize the "branch into if" case
+    // here.  Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    // 
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing it.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order insert these MOV instruction, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly 
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
+  }
+
+  int NumNewBlk = 0;
+
+  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
+
+  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
+  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+
+  if (LandBlkHasOtherPred) {
+    llvm_unreachable("Extra register needed to handle CFG");
+    unsigned CmpResReg =
+      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+    llvm_unreachable("Extra compare instruction needed to handle CFG");
+    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+        CmpResReg, DebugLoc());
+  }
+
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+      DebugLoc());
+
+  if (MigrateTrue) {
+    migrateInstruction(TrueMBB, LandBlk, I);
+    // need to uncondionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 1).
+    llvm_unreachable("Extra register needed to handle CFG");
+  }
+  insertInstrBefore(I, AMDGPU::ELSE);
+
+  if (MigrateFalse) {
+    migrateInstruction(FalseMBB, LandBlk, I);
+    // need to uncondionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 0)
+    llvm_unreachable("Extra register needed to handle CFG");
+  }
+
+  if (LandBlkHasOtherPred) {
+    // add endif
+    insertInstrBefore(I, AMDGPU::ENDIF);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
+         PE = LandBlk->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *MBB = *PI;
+      if (MBB != TrueMBB && MBB != FalseMBB)
+        llvm_unreachable("Extra register needed to handle CFG");
+    }
+  }
+  DEBUG(
+    dbgs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
+
+  // update landBlk
+  *LandMBBPtr = LandBlk;
+
+  return NumNewBlk;
+}
+
+void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+    MachineLoop *ContLoop) {
+  DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
+               << " header = BB" << ContMBB->getNumber() << "\n";
+        dbgs() << "Trying to continue loop-depth = "
+               << getLoopDepth(ContLoop)
+               << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
+  settleLoopcontBlock(ContingMBB, ContMBB);
+}
+
+void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  DEBUG(
+    dbgs() << "serialPattern BB" << DstMBB->getNumber()
+           << " <= BB" << SrcMBB->getNumber() << "\n";
+  );
+  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
+
+  DstMBB->removeSuccessor(SrcMBB);
+  cloneSuccessorList(DstMBB, SrcMBB);
+
+  removeSuccessor(SrcMBB);
+  MLI->removeBlock(SrcMBB);
+  retireBlock(SrcMBB);
+}
+
+void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
+  DEBUG(
+    dbgs() << "ifPattern BB" << MBB->getNumber();
+    dbgs() << "{  ";
+    if (TrueMBB) {
+      dbgs() << "BB" << TrueMBB->getNumber();
+    }
+    dbgs() << "  } else ";
+    dbgs() << "{  ";
+    if (FalseMBB) {
+      dbgs() << "BB" << FalseMBB->getNumber();
+    }
+    dbgs() << "  }\n ";
+    dbgs() << "landBlock: ";
+    if (!LandMBB) {
+      dbgs() << "NULL";
+    } else {
+      dbgs() << "BB" << LandMBB->getNumber();
+    }
+    dbgs() << "\n";
+  );
+
+  int OldOpcode = BranchMI->getOpcode();
+  DebugLoc BranchDL = BranchMI->getDebugLoc();
+
+//    transform to
+//    if cond
+//       trueBlk
+//    else
+//       falseBlk
+//    endif
+//    landBlk
+
+  MachineBasicBlock::iterator I = BranchMI;
+  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+      BranchDL);
+
+  if (TrueMBB) {
+    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
+    MBB->removeSuccessor(TrueMBB);
+    if (LandMBB && TrueMBB->succ_size()!=0)
+      TrueMBB->removeSuccessor(LandMBB);
+    retireBlock(TrueMBB);
+    MLI->removeBlock(TrueMBB);
+  }
+
+  if (FalseMBB) {
+    insertInstrBefore(I, AMDGPU::ELSE);
+    MBB->splice(I, FalseMBB, FalseMBB->begin(),
+                   FalseMBB->end());
+    MBB->removeSuccessor(FalseMBB);
+    if (LandMBB && FalseMBB->succ_size() != 0)
+      FalseMBB->removeSuccessor(LandMBB);
+    retireBlock(FalseMBB);
+    MLI->removeBlock(FalseMBB);
+  }
+  insertInstrBefore(I, AMDGPU::ENDIF);
+
+  BranchMI->eraseFromParent();
+
+  if (LandMBB && TrueMBB && FalseMBB)
+    MBB->addSuccessor(LandMBB);
+
+}
+
+void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+  DstBlk->addSuccessor(LandMBB);
+  DstBlk->removeSuccessor(DstBlk);
+}
+
+
+void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DebugLoc DL = BranchMI->getDebugLoc();
+  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+  MachineBasicBlock::iterator I = BranchMI;
+  if (TrueBranch != LandMBB)
+    reversePredicateSetter(I);
+  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+  insertInstrBefore(I, AMDGPU::BREAK);
+  insertInstrBefore(I, AMDGPU::ENDIF);
+  //now branchInst can be erase safely
+  BranchMI->eraseFromParent();
+  //now take care of successors, retire blocks
+  ExitingMBB->removeSuccessor(LandMBB);
+}
+
+void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineBasicBlock *ContMBB) {
+  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+               << ContingMBB->getNumber()
+               << ", cont = BB" << ContMBB->getNumber() << "\n";);
+
+  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+  if (MI) {
+    assert(isCondBranch(MI));
+    MachineBasicBlock::iterator I = MI;
+    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+    int OldOpcode = MI->getOpcode();
+    DebugLoc DL = MI->getDebugLoc();
+
+    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
+
+    if (UseContinueLogical == false) {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
+          getBranchZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
+      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
+    } else {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
+          getContinueZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+    }
+
+    MI->eraseFromParent();
+  } else {
+    // if we've arrived here then we've already erased the branch instruction
+    // travel back up the basic block to see the last reference of our debug
+    // location we've just inserted that reference here so it should be
+    // representative insertEnd to ensure phi-moves, if exist, go before the
+    // continue-instr.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+        getLastDebugLocInBB(ContingMBB));
+  }
+}
+
+int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+  int Cloned = 0;
+  assert(PreMBB->isSuccessor(SrcMBB));
+  while (SrcMBB && SrcMBB != DstMBB) {
+    assert(SrcMBB->succ_size() == 1);
+    if (SrcMBB->pred_size() > 1) {
+      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
+      ++Cloned;
+    }
+
+    PreMBB = SrcMBB;
+    SrcMBB = *SrcMBB->succ_begin();
+  }
+
+  return Cloned;
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+    MachineBasicBlock *PredMBB) {
+  assert(PredMBB->isSuccessor(MBB) &&
+         "succBlk is not a prececessor of curBlk");
+
+  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
+  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
+  //srcBlk, oldBlk, newBlk
+
+  PredMBB->removeSuccessor(MBB);
+  PredMBB->addSuccessor(CloneMBB);
+
+  // add all successor to cloneBlk
+  cloneSuccessorList(CloneMBB, MBB);
+
+  numClonedInstr += MBB->size();
+
+  DEBUG(
+    dbgs() << "Cloned block: " << "BB"
+           << MBB->getNumber() << "size " << MBB->size() << "\n";
+  );
+
+  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
+
+  return CloneMBB;
+}
+
+void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator SpliceEnd;
+  //look for the input branchinstr, not the AMDGPU branchinstr
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+  if (!BranchMI) {
+    DEBUG(
+      dbgs() << "migrateInstruction don't see branch instr\n" ;
+    );
+    SpliceEnd = SrcMBB->end();
+  } else {
+    DEBUG(
+      dbgs() << "migrateInstruction see branch instr\n" ;
+      BranchMI->dump();
+    );
+    SpliceEnd = BranchMI;
+  }
+  DEBUG(
+    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
+
+  //splice insert before insertPos
+  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
+
+  DEBUG(
+    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  if (!LoopHeader || !LoopLatch)
+    return nullptr;
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+  // Is LoopRep an infinite loop ?
+  if (!BranchMI || !isUncondBranch(BranchMI))
+    return nullptr;
+
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+  MachineBasicBlock::iterator I = BranchMI;
+  unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
+  llvm_unreachable("Extra register needed to handle CFG");
+  MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
+  MachineInstrBuilder MIB(*FuncRep, NewMI);
+  MIB.addMBB(LoopHeader);
+  MIB.addReg(ImmReg, false);
+  SHOWNEWINSTR(NewMI);
+  BranchMI->eraseFromParent();
+  LoopLatch->addSuccessor(DummyExitBlk);
+
+  return DummyExitBlk;
+}
+
+void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+  MachineInstr *BranchMI;
+
+  // I saw two unconditional branch in one basic block in example
+  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
+  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
+          && isUncondBranch(BranchMI)) {
+    DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
+    BranchMI->eraseFromParent();
+  }
+}
+
+void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+    MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 2)
+    return;
+  MachineBasicBlock *MBB1 = *MBB->succ_begin();
+  MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
+  if (MBB1 != MBB2)
+    return;
+
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
+  BranchMI->eraseFromParent();
+  SHOWNEWBLK(MBB1, "Removing redundant successor");
+  MBB->removeSuccessor(MBB1);
+}
+
+void AMDGPUCFGStructurizer::addDummyExitBlock(
+    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
+       E = RetMBB.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    MachineInstr *MI = getReturnInstr(MBB);
+    if (MI)
+      MI->eraseFromParent();
+    MBB->addSuccessor(DummyExitBlk);
+    DEBUG(
+      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+             << " successors\n";
+    );
+  }
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
+}
+
+void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+  while (MBB->succ_size())
+    MBB->removeSuccessor(*MBB->succ_begin());
+}
+
+void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+    int SccNum) {
+  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
+  if (!srcBlkInfo)
+    srcBlkInfo = new BlockInformation();
+  srcBlkInfo->SccNum = SccNum;
+}
+
+void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
+  DEBUG(
+        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
+  );
+
+  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
+
+  if (!SrcBlkInfo)
+    SrcBlkInfo = new BlockInformation();
+
+  SrcBlkInfo->IsRetired = true;
+  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
+         && "can't retire block yet");
+}
+
+void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
+  if (!MBB) {
+    MBB = FuncRep->CreateMachineBasicBlock();
+    FuncRep->push_back(MBB);  //insert to function
+    SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
+  }
+  TheEntry = MBB;
+  DEBUG(
+    dbgs() << "setLoopLandBlock loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  landing-block = BB" << MBB->getNumber() << "\n";
+  );
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
+    MachineBasicBlock *MBB2) {
+
+  if (PDT->dominates(MBB1, MBB2))
+    return MBB1;
+  if (PDT->dominates(MBB2, MBB1))
+    return MBB2;
+
+  MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
+  MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
+
+  // Handle newly cloned node.
+  if (!Node1 && MBB1->succ_size() == 1)
+    return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
+  if (!Node2 && MBB2->succ_size() == 1)
+    return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
+
+  if (!Node1 || !Node2)
+    return nullptr;
+
+  Node1 = Node1->getIDom();
+  while (Node1) {
+    if (PDT->dominates(Node1, Node2))
+      return Node1->getBlock();
+    Node1 = Node1->getIDom();
+  }
+
+  return nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(
+    std::set<MachineBasicBlock *> &MBBs) {
+  MachineBasicBlock *CommonDom;
+  std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
+  std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
+  for (CommonDom = *It; It != E && CommonDom; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (MBB != CommonDom)
+      CommonDom = findNearestCommonPostDom(MBB, CommonDom);
+  }
+
+  DEBUG(
+    dbgs() << "Common post dominator for exit blocks is ";
+    if (CommonDom)
+          dbgs() << "BB" << CommonDom->getNumber() << "\n";
+    else
+      dbgs() << "NULL\n";
+  );
+
+  return CommonDom;
+}
+
+char AMDGPUCFGStructurizer::ID = 0;
+
+} // end anonymous namespace
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+  return new AMDGPUCFGStructurizer();
+}
diff --git a/contrib/llvm/lib/Target/R600/CaymanInstructions.td b/contrib/llvm/lib/Target/R600/CaymanInstructions.td
new file mode 100644
index 0000000..2630345
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/CaymanInstructions.td
@@ -0,0 +1,224 @@
+//===-- CaymanInstructions.td - CM Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available only on Cayman
+// family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
+
+//===----------------------------------------------------------------------===//
+// Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isCayman] in {
+
+def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
+  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
+>;
+def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
+  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
+>;
+
+def : IMad24Pat<MULADD_INT24_cm>;
+
+let isVector = 1 in {
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>;
+
+// RECIP_UINT emulation for Cayman
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+  (AMDGPUurecip i32:$src0),
+  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
+                            (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+>;
+
+  def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
+    let ADDR = 0;
+    let POP_COUNT = 0;
+    let COUNT = 0;
+  }
+
+
+def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
+
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
+  CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+                        (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+                        "STORE_DWORD $rw_gpr, $index_gpr",
+                        [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
+  let eop = 0; // This bit is not used on Cayman.
+}
+
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+
+class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
+
+  // Static fields
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let BUFFER_ID = buffer_id;
+  let SRC_REL = 0;
+  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
+  // to store vertex addresses in any channel, not just X.
+  let SRC_SEL_X = 0;
+  let SRC_SEL_Y = 0;
+  let STRUCTURED_READ = 0;
+  let LDS_REQ = 0;
+  let COALESCED_READ = 0;
+
+  let Inst{31-0} = Word0;
+}
+
+class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 1; // FMT_8
+}
+
+class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 5; // FMT_16
+
+}
+
+class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 7;   // Masked
+  let DST_SEL_Z        = 7;   // Masked
+  let DST_SEL_W        = 7;   // Masked
+  let DATA_FORMAT      = 0xD; // COLOR_32
+
+  // This is not really necessary, but there were some GPU hangs that appeared
+  // to be caused by ALU instructions in the next instruction group that wrote
+  // to the $src_gpr registers of the VTX_READ.
+  // e.g.
+  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+  // %T2_X<def> = MOV %ZERO
+  //Adding this constraint prevents this from happening.
+  let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_Reg64:$dst_gpr), pattern> {
+
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 7;
+  let DST_SEL_W        = 7;
+  let DATA_FORMAT      = 0x1D; // COLOR_32_32
+}
+
+class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
+                   (outs R600_Reg128:$dst_gpr), pattern> {
+
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  2;
+  let DST_SEL_W        =  3;
+  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
+
+  // XXX: Need to force VTX_READ_128 instructions to write to the same register
+  // that holds its buffer address to avoid potential hangs.  We can't use
+  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+  // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0,
+  [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0,
+  [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
+  [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
+  [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
+  [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1,
+  [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1,
+  [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
+  [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
+  [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
+  [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+} // End isCayman
+
diff --git a/contrib/llvm/lib/Target/R600/EvergreenInstructions.td b/contrib/llvm/lib/Target/R600/EvergreenInstructions.td
new file mode 100644
index 0000000..484e522
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/EvergreenInstructions.td
@@ -0,0 +1,609 @@
+//===-- EvergreenInstructions.td - EG Instruction defs  ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to Evergreen and newer VLIW4/VLIW5 GPUs
+// - Available only on Evergreen family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isEG : Predicate<
+  "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+  "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+  "!Subtarget.hasCaymanISA()"
+>;
+
+def isEGorCayman : Predicate<
+  "Subtarget.getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
+  "Subtarget.getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+                           string name, list<dag> pattern>
+    : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+                 "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+                  list<dag> pattern>
+    : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+                 "MEM_RAT "#name, pattern>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+  "MSKOR $rw_gpr.XW, $index_gpr",
+  [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+  let eop = 0;
+}
+
+} // End let Predicates = [isEGorCayman]
+
+//===----------------------------------------------------------------------===//
+// Evergreen Only instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEG] in {
+
+def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
+defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
+
+def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
+def MULHI_INT_eg : MULHI_INT_Common<0x90>;
+def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
+def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_eg : SIN_Common<0x8D>;
+def COS_eg : COS_Common<0x8E>;
+
+def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
+def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+
+defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>;
+
+//===----------------------------------------------------------------------===//
+// Memory read/write instructions
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+
+// 32-bit store
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
+  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr, $index_gpr, $eop",
+  [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
+  [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+//128-bit store
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
+  [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+} // End usesCustomInserter = 1
+
+class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
+
+  // Static fields
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let BUFFER_ID = buffer_id;
+  let SRC_REL = 0;
+  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
+  // to store vertex addresses in any channel, not just X.
+  let SRC_SEL_X = 0;
+
+  let Inst{31-0} = Word0;
+}
+
+class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 1;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 1; // FMT_8
+}
+
+class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+  let MEGA_FETCH_COUNT = 2;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 5; // FMT_16
+
+}
+
+class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 4;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 7;   // Masked
+  let DST_SEL_Z        = 7;   // Masked
+  let DST_SEL_W        = 7;   // Masked
+  let DATA_FORMAT      = 0xD; // COLOR_32
+
+  // This is not really necessary, but there were some GPU hangs that appeared
+  // to be caused by ALU instructions in the next instruction group that wrote
+  // to the $src_gpr registers of the VTX_READ.
+  // e.g.
+  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+  // %T2_X<def> = MOV %ZERO
+  //Adding this constraint prevents this from happening.
+  let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
+                   (outs R600_Reg64:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 8;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 7;
+  let DST_SEL_W        = 7;
+  let DATA_FORMAT      = 0x1D; // COLOR_32_32
+}
+
+class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
+                   (outs R600_Reg128:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  2;
+  let DST_SEL_W        =  3;
+  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
+
+  // XXX: Need to force VTX_READ_128 instructions to write to the same register
+  // that holds its buffer address to avoid potential hangs.  We can't use
+  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+  // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+  [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+  [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+  [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+  [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+  [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
+  [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1,
+  [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
+  [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+  [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
+  [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+} // End Predicates = [isEG]
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+// BFE_UINT - bit_extract, an optimization for mask and shift
+// Src0 = Input
+// Src1 = Offset
+// Src2 = Width
+//
+// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
+//
+// Example Usage:
+// (Offset, Width)
+//
+// (0, 8)  = (Input << 24) >> 24 = (Input &  0xff)       >> 0
+// (8, 8)  = (Input << 16) >> 24 = (Input &  0xffff)     >> 8
+// (16, 8) = (Input <<  8) >> 24 = (Input &  0xffffff)   >> 16
+// (24, 8) = (Input <<  0) >> 24 = (Input &  0xffffffff) >> 24
+def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
+  [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))],
+  VecALU
+>;
+
+def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
+  [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
+  VecALU
+>;
+
+// XXX: This pattern is broken, disabling for now.  See comment in
+// AMDGPUInstructions.td for more info.
+//  def : BFEPattern <BFE_UINT_eg>;
+def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
+  [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
+  VecALU
+>;
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
+def : Pat<(i32 (sext_inreg i32:$src, i8)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
+def : Pat<(i32 (sext_inreg i32:$src, i16)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
+
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>;
+
+def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
+  [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
+  VecALU
+>;
+
+def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
+  [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU
+>;
+
+def : UMad24Pat<MULADD_UINT24_eg>;
+
+def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+def : ROTRPattern <BIT_ALIGN_INT_eg>;
+def MULADD_eg : MULADD_Common<0x14>;
+def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+def ASHR_eg : ASHR_Common<0x15>;
+def LSHR_eg : LSHR_Common<0x16>;
+def LSHL_eg : LSHL_Common<0x17>;
+def CNDE_eg : CNDE_Common<0x19>;
+def CNDGT_eg : CNDGT_Common<0x1A>;
+def CNDGE_eg : CNDGE_Common<0x1B>;
+def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
+def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
+def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
+  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
+>;
+def DOT4_eg : DOT4_Common<0xBE>;
+defm CUBE_eg : CUBE_Common<0xC0>;
+
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
+
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+
+let hasSideEffects = 1 in {
+  def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
+}
+
+def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
+
+def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
+  let Pattern = [];
+  let Itinerary = AnyALU;
+}
+
+def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
+
+def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
+  let Pattern = [];
+}
+
+def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
+
+def GROUP_BARRIER : InstR600 <
+    (outs), (ins), "  GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <0x54> {
+
+  let dst = 0;
+  let dst_rel = 0;
+  let src0 = 0;
+  let src0_rel = 0;
+  let src0_neg = 0;
+  let src0_abs = 0;
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let write = 0;
+  let omod = 0;
+  let clamp = 0;
+  let last = 1;
+  let bank_swizzle = 0;
+  let pred_sel = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+}
+
+def : Pat <
+	(int_AMDGPU_barrier_global),
+	(GROUP_BARRIER)
+>;
+
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS  <bits<6> op, dag outs, dag ins, string asm,
+                 list<dag> pattern = []> :
+
+    InstR600 <outs, ins, asm, pattern, XALU>,
+    R600_ALU_LDS_Word0,
+    R600LDS_Word1 {
+
+  bits<6>  offset = 0;
+  let lds_op = op;
+
+  let Word1{27} = offset{0};
+  let Word1{12} = offset{1};
+  let Word1{28} = offset{2};
+  let Word1{31} = offset{3};
+  let Word0{12} = offset{4};
+  let Word0{25} = offset{5};
+
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+  let HasNativeOperands = 1;
+  let UseNamedOperandTable = 1;
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+  lds_op,
+  (outs R600_Reg32:$dst),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last OQAP, $src0$src0_rel $pred_sel",
+  pattern
+  > {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src2 = 0;
+  let src2_rel = 0;
+
+  let usesCustomInserter = 1;
+  let LDS_1A = 1;
+  let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+                     string dst =""> :
+    R600_LDS <
+  lds_op, outs,
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
+  pattern
+  > {
+
+  field string BaseOp;
+
+  let src2 = 0;
+  let src2_rel = 0;
+  let LDS_1A1D = 1;
+}
+
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+  let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op,  (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+  let BaseOp = name;
+  let usesCustomInserter = 1;
+  let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS <
+  lds_op,
+  (outs),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+       LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+  "  "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+  pattern> {
+  let LDS_1A2D = 1;
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+  [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+  [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+  [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+  [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
+  [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+>;
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+  [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+  [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+  [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+  [(set i32:$dst, (az_extloadi16_local i32:$src0))]
+>;
+
+// TRUNC is used for the FLT_TO_INT instructions to work around a
+// perceived problem where the rounding modes are applied differently
+// depending on the instruction and the slot they are in.
+// See:
+// https://bugs.freedesktop.org/show_bug.cgi?id=50232
+// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
+//
+// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
+// which do not need to be truncated since the fp values are 0.0f or 1.0f.
+// We should look into handling these cases separately.
+def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
+
+def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
+
+// SHA-256 Patterns
+def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
+def : FROUNDPat <CNDGE_eg>;
+
+def EG_ExportSwz : ExportSwzInst {
+  let Word1{19-16} = 0; // BURST_COUNT
+  let Word1{20} = 0; // VALID_PIXEL_MODE
+  let Word1{21} = eop;
+  let Word1{29-22} = inst;
+  let Word1{30} = 0; // MARK
+  let Word1{31} = 1; // BARRIER
+}
+defm : ExportPattern<EG_ExportSwz, 83>;
+
+def EG_ExportBuf : ExportBufInst {
+  let Word1{19-16} = 0; // BURST_COUNT
+  let Word1{20} = 0; // VALID_PIXEL_MODE
+  let Word1{21} = eop;
+  let Word1{29-22} = inst;
+  let Word1{30} = 0; // MARK
+  let Word1{31} = 1; // BARRIER
+}
+defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
+
+def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+  "TEX $COUNT @$ADDR"> {
+  let POP_COUNT = 0;
+}
+def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+  "VTX $COUNT @$ADDR"> {
+  let POP_COUNT = 0;
+}
+def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR),
+  "LOOP_START_DX10 @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR),
+  "LOOP_BREAK @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR),
+  "CONTINUE @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "JUMP @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+                              "PUSH @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "ELSE @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> {
+  let ADDR = 0;
+  let COUNT = 0;
+  let POP_COUNT = 0;
+}
+def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "POP @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_END_EG :  CF_CLAUSE_EG<0, (ins), "CF_END"> {
+  let COUNT = 0;
+  let POP_COUNT = 0;
+  let ADDR = 0;
+  let END_OF_PROGRAM = 1;
+}
+
+} // End Predicates = [isEGorCayman]
diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
new file mode 100644
index 0000000..0927040
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -0,0 +1,426 @@
+//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                             StringRef Annot) {
+  OS.flush();
+  printInstruction(MI, OS);
+
+  printAnnotation(OS, Annot);
+}
+
+void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+}
+
+void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
+  switch (reg) {
+  case AMDGPU::VCC:
+    O << "vcc";
+    return;
+  case AMDGPU::SCC:
+    O << "scc";
+    return;
+  case AMDGPU::EXEC:
+    O << "exec";
+    return;
+  case AMDGPU::M0:
+    O << "m0";
+    return;
+  default:
+    break;
+  }
+
+  char Type;
+  unsigned NumRegs;
+
+  if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 1;
+  } else  if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 1;
+  } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 2;
+  } else  if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 2;
+  } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 4;
+  } else  if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 4;
+  } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 3;
+  } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 8;
+  } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 8;
+  } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 16;
+  } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 16;
+  } else {
+    O << getRegisterName(reg);
+    return;
+  }
+
+  // The low 8 bits of the encoding value is the register index, for both VGPRs
+  // and SGPRs.
+  unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+  if (NumRegs == 1) {
+    O << Type << RegIdx;
+    return;
+  }
+
+  O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+}
+
+void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
+  int32_t SImm = static_cast<int32_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == FloatToBits(1.0f) ||
+      Imm == FloatToBits(-1.0f) ||
+      Imm == FloatToBits(0.5f) ||
+      Imm == FloatToBits(-0.5f) ||
+      Imm == FloatToBits(2.0f) ||
+      Imm == FloatToBits(-2.0f) ||
+      Imm == FloatToBits(4.0f) ||
+      Imm == FloatToBits(-4.0f)) {
+    O << BitsToFloat(Imm);
+    return;
+  }
+
+  O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    switch (Op.getReg()) {
+    // This is the default predicate state, so we don't need to print it.
+    case AMDGPU::PRED_SEL_OFF:
+      break;
+
+    default:
+      printRegOperand(Op.getReg(), O);
+      break;
+    }
+  } else if (Op.isImm()) {
+    printImmediate(Op.getImm(), O);
+  } else if (Op.isFPImm()) {
+    O << Op.getFPImm();
+  } else if (Op.isExpr()) {
+    const MCExpr *Exp = Op.getExpr();
+    Exp->print(O);
+  } else {
+    assert(!"unknown operand type in printOperand");
+  }
+}
+
+void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) {
+  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+  if (InputModifiers & 0x1)
+    O << "-";
+  if (InputModifiers & 0x2)
+    O << "|";
+  printOperand(MI, OpNo + 1, O);
+  if (InputModifiers & 0x2)
+    O << "|";
+}
+
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+
+  if (Imm == 2) {
+    O << "P0";
+  } else if (Imm == 1) {
+    O << "P20";
+  } else if (Imm == 0) {
+    O << "P10";
+  } else {
+    assert(!"Invalid interpolation parameter slot");
+  }
+}
+
+void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  printOperand(MI, OpNo, O);
+  O  << ", ";
+  printOperand(MI, OpNo + 1, O);
+}
+
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, StringRef Asm,
+                                   StringRef Default) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm());
+  if (Op.getImm() == 1) {
+    O << Asm;
+  } else {
+    O << Default;
+  }
+}
+
+void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "|");
+}
+
+void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "_SAT");
+}
+
+void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  int32_t Imm = MI->getOperand(OpNo).getImm();
+  O << Imm << '(' << BitsToFloat(Imm) << ')';
+}
+
+void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  printIfSet(MI, OpNo, O.indent(25 - O.GetNumBytesInBuffer()), "*", " ");
+}
+
+void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "-");
+}
+
+void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  switch (MI->getOperand(OpNo).getImm()) {
+  default: break;
+  case 1:
+    O << " * 2.0";
+    break;
+  case 2:
+    O << " * 4.0";
+    break;
+  case 3:
+    O << " / 2.0";
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "+");
+}
+
+void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "Pred,");
+}
+
+void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.getImm() == 0) {
+    O << " (MASKED)";
+  }
+}
+
+void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const char * chans = "XYZW";
+  int sel = MI->getOperand(OpNo).getImm();
+
+  int chan = sel & 3;
+  sel >>= 2;
+
+  if (sel >= 512) {
+    sel -= 512;
+    int cb = sel >> 12;
+    sel &= 4095;
+    O << cb << "[" << sel << "]";
+  } else if (sel >= 448) {
+    sel -= 448;
+    O << sel;
+  } else if (sel >= 0){
+    O << sel;
+  }
+
+  if (sel >= 0)
+    O << "." << chans[chan];
+}
+
+void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  int BankSwizzle = MI->getOperand(OpNo).getImm();
+  switch (BankSwizzle) {
+  case 1:
+    O << "BS:VEC_021/SCL_122";
+    break;
+  case 2:
+    O << "BS:VEC_120/SCL_212";
+    break;
+  case 3:
+    O << "BS:VEC_102/SCL_221";
+    break;
+  case 4:
+    O << "BS:VEC_201";
+    break;
+  case 5:
+    O << "BS:VEC_210";
+    break;
+  default:
+    break;
+  }
+  return;
+}
+
+void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  unsigned Sel = MI->getOperand(OpNo).getImm();
+  switch (Sel) {
+  case 0:
+    O << "X";
+    break;
+  case 1:
+    O << "Y";
+    break;
+  case 2:
+    O << "Z";
+    break;
+  case 3:
+    O << "W";
+    break;
+  case 4:
+    O << "0";
+    break;
+  case 5:
+    O << "1";
+    break;
+  case 7:
+    O << "_";
+    break;
+  default:
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  unsigned CT = MI->getOperand(OpNo).getImm();
+  switch (CT) {
+  case 0:
+    O << "U";
+    break;
+  case 1:
+    O << "N";
+    break;
+  default:
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  int KCacheMode = MI->getOperand(OpNo).getImm();
+  if (KCacheMode > 0) {
+    int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+    O << "CB" << KCacheBank <<":";
+    int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+    int LineSize = (KCacheMode == 1)?16:32;
+    O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize;
+  }
+}
+
+void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Msg = SImm16 & 0xF;
+  if (Msg == 2 || Msg == 3) {
+    unsigned Op = (SImm16 >> 4) & 0xF;
+    if (Msg == 3)
+      O << "Gs_done(";
+    else
+      O << "Gs(";
+    if (Op == 0) {
+      O << "nop";
+    } else {
+      unsigned Stream = (SImm16 >> 8) & 0x3;
+      if (Op == 1)
+	O << "cut";
+      else if (Op == 2)
+	O << "emit";
+      else if (Op == 3)
+	O << "emit-cut";
+      O << " stream " << Stream;
+    }
+    O << "), [m0] ";
+  } else if (Msg == 1)
+    O << "interrupt ";
+  else if (Msg == 15)
+    O << "system ";
+  else
+    O << "unknown(" << Msg << ") ";
+}
+
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
+  // SIInsertWaits.cpp bits usage does not match ISA docs description but it
+  // works so it might be a misprint in docs.
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Vmcnt = SImm16 & 0xF;
+  unsigned Expcnt = (SImm16 >> 4) & 0xF;
+  unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
+  if (Vmcnt != 0xF)
+    O << "vmcnt(" << Vmcnt << ") ";
+  if (Expcnt != 0x7)
+    O << "expcnt(" << Expcnt << ") ";
+  if (Lgkmcnt != 0x7)
+    O << "lgkmcnt(" << Lgkmcnt << ")";
+}
+
+#include "AMDGPUGenAsmWriter.inc"
diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
new file mode 100644
index 0000000..6ca7170
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -0,0 +1,68 @@
+//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTPRINTER_H
+#define AMDGPUINSTPRINTER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class AMDGPUInstPrinter : public MCInstPrinter {
+public:
+  AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  //Autogenerated by tblgen
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+
+private:
+  void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRegOperand(unsigned RegNo, raw_ostream &O);
+  void printImmediate(uint32_t Imm, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                         StringRef Asm, StringRef Default = "");
+  static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O);
+  static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUINSTRPRINTER_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
new file mode 100644
index 0000000..d55f27b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -0,0 +1,141 @@
+//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUMCObjectWriter : public MCObjectWriter {
+public:
+  AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
+  void ExecutePostLayoutBinding(MCAssembler &Asm,
+                                const MCAsmLayout &Layout) override {
+    //XXX: Implement if necessary.
+  }
+  void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, bool &IsPCRel,
+                        uint64_t &FixedValue) override {
+    assert(!"Not implemented");
+  }
+
+  void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+
+};
+
+class AMDGPUAsmBackend : public MCAsmBackend {
+public:
+  AMDGPUAsmBackend(const Target &T)
+    : MCAsmBackend() {}
+
+  unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+    assert(!"Not implemented");
+  }
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+    return true;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+};
+
+} //End anonymous namespace
+
+void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
+                                       const MCAsmLayout &Layout) {
+  for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) {
+    Asm.writeSectionData(I, Layout);
+  }
+}
+
+void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                  unsigned DataSize, uint64_t Value,
+                                  bool IsPCRel) const {
+
+  switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("Unknown fixup kind");
+    case AMDGPU::fixup_si_sopp_br: {
+      uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
+      *Dst = (Value - 4) / 4;
+      break;
+    }
+
+    case AMDGPU::fixup_si_rodata: {
+      uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
+      *Dst = Value;
+      break;
+    }
+
+    case AMDGPU::fixup_si_end_of_text: {
+      uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
+      // The value points to the last instruction in the text section, so we
+      // need to add 4 bytes to get to the start of the constants.
+      *Dst = Value + 4;
+      break;
+    }
+  }
+}
+
+const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
+                                                       MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
+    // name                   offset bits  flags
+    { "fixup_si_sopp_br",     0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_si_rodata",      0,     32,   0 },
+    { "fixup_si_end_of_text", 0,     32,   MCFixupKindInfo::FKF_IsPCRel }
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+//===----------------------------------------------------------------------===//
+// ELFAMDGPUAsmBackend class
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
+public:
+  ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createAMDGPUELFObjectWriter(OS);
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
+  return new ELFAMDGPUAsmBackend(T);
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
new file mode 100644
index 0000000..5fb94d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -0,0 +1,39 @@
+//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  AMDGPUELFObjectWriter();
+protected:
+  unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsPCRel) const override {
+    return Fixup.getKind();
+  }
+
+};
+
+
+} // End anonymous namespace
+
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter()
+  : MCELFObjectTargetWriter(false, 0, 0, false) { }
+
+MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_ostream &OS) {
+  MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter();
+  return createELFObjectWriter(MOTW, OS, true);
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h
new file mode 100644
index 0000000..4b12e54
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h
@@ -0,0 +1,34 @@
+//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AMDGPUFIXUPKINDS_H
+#define LLVM_AMDGPUFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AMDGPU {
+enum Fixups {
+  /// 16-bit PC relative fixup for SOPP branch instructions.
+  fixup_si_sopp_br = FirstTargetFixupKind,
+
+  /// fixup for global addresses with constant initializers
+  fixup_si_rodata,
+
+  /// fixup for offset from instruction to end of text section
+  fixup_si_end_of_text,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif // LLVM_AMDGPUFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
new file mode 100644
index 0000000..78bbe0a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -0,0 +1,62 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCAsmInfo.h"
+
+using namespace llvm;
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
+  HasSingleParameterDotFile = false;
+  //===------------------------------------------------------------------===//
+  HasSubsectionsViaSymbols = true;
+  HasMachoZeroFillDirective = false;
+  HasMachoTBSSDirective = false;
+  HasStaticCtorDtorReferenceInStaticMode = false;
+  LinkerRequiresNonEmptyDwarfLines = true;
+  MaxInstLength = 16;
+  SeparatorString = "\n";
+  CommentString = ";";
+  LabelSuffix = ":";
+  InlineAsmStart = ";#ASMSTART";
+  InlineAsmEnd = ";#ASMEND";
+  AssemblerDialect = 0;
+
+  //===--- Data Emission Directives -------------------------------------===//
+  ZeroDirective = ".zero";
+  AsciiDirective = ".ascii\t";
+  AscizDirective = ".asciz\t";
+  Data8bitsDirective = ".byte\t";
+  Data16bitsDirective = ".short\t";
+  Data32bitsDirective = ".long\t";
+  Data64bitsDirective = ".quad\t";
+  GPRel32Directive = nullptr;
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+
+  //===--- Alignment Information ----------------------------------------===//
+  AlignmentIsInBytes = true;
+  TextAlignFillValue = 0;
+
+  //===--- Global Variable Emission Directives --------------------------===//
+  GlobalDirective = ".global";
+  HasSetDirective = false;
+  HasAggressiveSymbolFolding = true;
+  COMMDirectiveAlignmentIsInBytes = false;
+  HasDotTypeDotSizeDirective = false;
+  HasNoDeadStrip = true;
+  WeakRefDirective = ".weakref\t";
+  //===--- Dwarf Emission Directives -----------------------------------===//
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+}
+
+const MCSection*
+AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
+  return nullptr;
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
new file mode 100644
index 0000000..59aebec
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -0,0 +1,28 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface  ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUMCASMINFO_H
+#define AMDGPUMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+namespace llvm {
+
+class StringRef;
+
+class AMDGPUMCAsmInfo : public MCAsmInfo {
+public:
+  explicit AMDGPUMCAsmInfo(StringRef &TT);
+  const MCSection* getNonexecutableStackSection(MCContext &CTX) const override;
+};
+} // namespace llvm
+#endif // AMDGPUMCASMINFO_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
new file mode 100644
index 0000000..521b3b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -0,0 +1,21 @@
+//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCCodeEmitter.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void AMDGPUMCCodeEmitter::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
new file mode 100644
index 0000000..d5e432d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -0,0 +1,50 @@
+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUCODEEMITTER_H
+#define AMDGPUCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCInst;
+class MCOperand;
+class MCSubtargetInfo;
+
+class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+  virtual void anchor();
+public:
+
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
+  virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUCODEEMITTER_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
new file mode 100644
index 0000000..38a2956
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -0,0 +1,114 @@
+//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "AMDGPUMCAsmInfo.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AMDGPUGenRegisterInfo.inc"
+
+static MCInstrInfo *createAMDGPUMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAMDGPUMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAMDGPUMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo * X = new MCSubtargetInfo();
+  InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI) {
+  return new AMDGPUInstPrinter(MAI, MII, MRI);
+}
+
+static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
+    return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
+  } else {
+    return createR600MCCodeEmitter(MCII, MRI, STI);
+  }
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    const MCSubtargetInfo &STI,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
+}
+
+extern "C" void LLVMInitializeR600TargetMC() {
+
+  RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
+
+  TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
+
+  TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
+
+  TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
+
+  TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
+
+  TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
+
+  TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
+
+  TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
+
+  TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
new file mode 100644
index 0000000..f6b3376
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -0,0 +1,58 @@
+//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef AMDGPUMCTARGETDESC_H
+#define AMDGPUMCTARGETDESC_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class raw_ostream;
+
+extern Target TheAMDGPUTarget;
+
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                       const MCRegisterInfo &MRI,
+                                       const MCSubtargetInfo &STI);
+
+MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                     const MCRegisterInfo &MRI,
+                                     const MCSubtargetInfo &STI,
+                                     MCContext &Ctx);
+
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
+
+MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS);
+} // End llvm namespace
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#endif // AMDGPUMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
new file mode 100644
index 0000000..dc1344f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -0,0 +1,184 @@
+//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// \brief The R600 code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
+  R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+
+public:
+
+  R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
+    : MCII(mcii), MRI(mri) { }
+
+  /// \brief Encode the instruction and write it to the OS.
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  /// \returns the encoding for an MCOperand.
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
+private:
+
+  void EmitByte(unsigned int byte, raw_ostream &OS) const;
+
+  void Emit(uint32_t value, raw_ostream &OS) const;
+  void Emit(uint64_t value, raw_ostream &OS) const;
+
+  unsigned getHWRegChan(unsigned reg) const;
+  unsigned getHWReg(unsigned regNo) const;
+
+};
+
+} // End anonymous namespace
+
+enum RegElement {
+  ELEMENT_X = 0,
+  ELEMENT_Y,
+  ELEMENT_Z,
+  ELEMENT_W
+};
+
+enum FCInstr {
+  FC_IF_PREDICATE = 0,
+  FC_ELSE,
+  FC_ENDIF,
+  FC_BGNLOOP,
+  FC_ENDLOOP,
+  FC_BREAK_PREDICATE,
+  FC_CONTINUE
+};
+
+MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           const MCSubtargetInfo &STI) {
+  return new R600MCCodeEmitter(MCII, MRI);
+}
+
+void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  if (MI.getOpcode() == AMDGPU::RETURN ||
+    MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
+    MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
+    MI.getOpcode() == AMDGPU::BUNDLE ||
+    MI.getOpcode() == AMDGPU::KILL) {
+    return;
+  } else if (IS_VTX(Desc)) {
+    uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
+    uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
+    if (!(STI.getFeatureBits() & AMDGPU::FeatureCaymanISA)) {
+      InstWord2 |= 1 << 19; // Mega-Fetch bit
+    }
+
+    Emit(InstWord01, OS);
+    Emit(InstWord2, OS);
+    Emit((uint32_t) 0, OS);
+  } else if (IS_TEX(Desc)) {
+      int64_t Sampler = MI.getOperand(14).getImm();
+
+      int64_t SrcSelect[4] = {
+        MI.getOperand(2).getImm(),
+        MI.getOperand(3).getImm(),
+        MI.getOperand(4).getImm(),
+        MI.getOperand(5).getImm()
+      };
+      int64_t Offsets[3] = {
+        MI.getOperand(6).getImm() & 0x1F,
+        MI.getOperand(7).getImm() & 0x1F,
+        MI.getOperand(8).getImm() & 0x1F
+      };
+
+      uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI);
+      uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
+          SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
+          SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
+          Offsets[2] << 10;
+
+      Emit(Word01, OS);
+      Emit(Word2, OS);
+      Emit((uint32_t) 0, OS);
+  } else {
+    uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
+    if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
+       ((Desc.TSFlags & R600_InstFlag::OP1) ||
+         Desc.TSFlags & R600_InstFlag::OP2)) {
+      uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
+      Inst &= ~(0x3FFULL << 39);
+      Inst |= ISAOpCode << 1;
+    }
+    Emit(Inst, OS);
+  }
+}
+
+void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
+  OS.write((uint8_t) Byte & 0xff);
+}
+
+void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
+  for (unsigned i = 0; i < 4; i++) {
+    OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
+  }
+}
+
+void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
+  for (unsigned i = 0; i < 8; i++) {
+    EmitByte((Value >> (8 * i)) & 0xff, OS);
+  }
+}
+
+unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
+  return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
+}
+
+unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
+  return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
+}
+
+uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                              const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixup,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
+      return MRI.getEncodingValue(MO.getReg());
+    return getHWReg(MO.getReg());
+  }
+
+  assert(MO.isImm());
+  return MO.getImm();
+}
+
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
new file mode 100644
index 0000000..78776c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -0,0 +1,245 @@
+//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The SI code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Helper type used in encoding
+typedef union {
+  int32_t I;
+  float F;
+} IntFloatUnion;
+
+class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
+  SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+  MCContext &Ctx;
+
+  /// \brief Can this operand also contain immediate values?
+  bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
+
+  /// \brief Encode an fp or int literal
+  uint32_t getLitEncoding(const MCOperand &MO) const;
+
+public:
+  SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+                  MCContext &ctx)
+    : MCII(mcii), MRI(mri), Ctx(ctx) { }
+
+  ~SIMCCodeEmitter() { }
+
+  /// \brief Encode the instruction and write it to the OS.
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  /// \returns the encoding for an MCOperand.
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
+
+  /// \brief Use a fixup to encode the simm16 field for SOPP branch
+  ///        instructions.
+  unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
+};
+
+} // End anonymous namespace
+
+MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           const MCSubtargetInfo &STI,
+                                           MCContext &Ctx) {
+  return new SIMCCodeEmitter(MCII, MRI, Ctx);
+}
+
+bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
+                                   unsigned OpNo) const {
+
+  unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
+  return (AMDGPU::SSrc_32RegClassID == RegClass) ||
+         (AMDGPU::SSrc_64RegClassID == RegClass) ||
+         (AMDGPU::VSrc_32RegClassID == RegClass) ||
+         (AMDGPU::VSrc_64RegClassID == RegClass);
+}
+
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
+
+  IntFloatUnion Imm;
+  if (MO.isImm())
+    Imm.I = MO.getImm();
+  else if (MO.isFPImm())
+    Imm.F = MO.getFPImm();
+  else if (MO.isExpr())
+    return 255;
+  else
+    return ~0;
+
+  if (Imm.I >= 0 && Imm.I <= 64)
+    return 128 + Imm.I;
+
+  if (Imm.I >= -16 && Imm.I <= -1)
+    return 192 + abs(Imm.I);
+
+  if (Imm.F == 0.5f)
+    return 240;
+
+  if (Imm.F == -0.5f)
+    return 241;
+
+  if (Imm.F == 1.0f)
+    return 242;
+
+  if (Imm.F == -1.0f)
+    return 243;
+
+  if (Imm.F == 2.0f)
+    return 244;
+
+  if (Imm.F == -2.0f)
+    return 245;
+
+  if (Imm.F == 4.0f)
+    return 246;
+
+  if (Imm.F == -4.0f)
+    return 247;
+
+  return 255;
+}
+
+void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+
+  uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  unsigned bytes = Desc.getSize();
+
+  for (unsigned i = 0; i < bytes; i++) {
+    OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
+  }
+
+  if (bytes > 4)
+    return;
+
+  // Check for additional literals in SRC0/1/2 (Op 1/2/3)
+  for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
+
+    // Check if this operand should be encoded as [SV]Src
+    if (!isSrcOperand(Desc, i))
+      continue;
+
+    // Is this operand a literal immediate?
+    const MCOperand &Op = MI.getOperand(i);
+    if (getLitEncoding(Op) != 255)
+      continue;
+
+    // Yes! Encode it
+    IntFloatUnion Imm;
+    if (Op.isImm())
+      Imm.I = Op.getImm();
+    else if (Op.isFPImm())
+      Imm.F = Op.getFPImm();
+    else {
+      assert(Op.isExpr());
+      // This will be replaced with a fixup value.
+      Imm.I = 0;
+    }
+
+    for (unsigned j = 0; j < 4; j++) {
+      OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
+    }
+
+    // Only one literal value allowed
+    break;
+  }
+}
+
+unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isExpr()) {
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br;
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+    return 0;
+  }
+
+  return getMachineOpValue(MI, MO, Fixups, STI);
+}
+
+uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                            const MCOperand &MO,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+
+  if (MO.isExpr()) {
+    const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
+    MCFixupKind Kind;
+    const MCSymbol *Sym =
+        Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+
+    if (&Expr->getSymbol() == Sym) {
+      // Add the offset to the beginning of the constant values.
+      Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text;
+    } else {
+      // This is used for constant data stored in .rodata.
+     Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
+    }
+    Fixups.push_back(MCFixup::Create(4, Expr, Kind, MI.getLoc()));
+  }
+
+  // Figure out the operand number, needed for isSrcOperand check
+  unsigned OpNo = 0;
+  for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
+    if (&MO == &MI.getOperand(OpNo))
+      break;
+  }
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  if (isSrcOperand(Desc, OpNo)) {
+    uint32_t Enc = getLitEncoding(MO);
+    if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
+      return Enc;
+
+  } else if (MO.isImm())
+    return MO.getImm();
+
+  llvm_unreachable("Encoding of this operand type is not supported yet.");
+  return 0;
+}
+
diff --git a/contrib/llvm/lib/Target/R600/Processors.td b/contrib/llvm/lib/Target/R600/Processors.td
new file mode 100644
index 0000000..ce17d7c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/Processors.td
@@ -0,0 +1,110 @@
+//===-- Processors.td - R600 Processor definitions ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
+: Processor<Name, itin, Features>;
+
+//===----------------------------------------------------------------------===//
+// R600
+//===----------------------------------------------------------------------===//
+def : Proc<"",           R600_VLIW5_Itin,
+    [FeatureR600, FeatureVertexCache]>;
+
+def : Proc<"r600",       R600_VLIW5_Itin,
+    [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"r630",       R600_VLIW5_Itin,
+    [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rs880",      R600_VLIW5_Itin,
+    [FeatureR600, FeatureWavefrontSize16]>;
+
+def : Proc<"rv670",      R600_VLIW5_Itin,
+    [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// R700
+//===----------------------------------------------------------------------===//
+
+def : Proc<"rv710",      R600_VLIW5_Itin,
+    [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rv730",      R600_VLIW5_Itin,
+    [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rv770",      R600_VLIW5_Itin,
+    [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen
+//===----------------------------------------------------------------------===//
+
+def : Proc<"cedar",      R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
+     FeatureCFALUBug]>;
+
+def : Proc<"redwood",    R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
+     FeatureCFALUBug]>;
+
+def : Proc<"sumo",       R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
+
+def : Proc<"juniper",    R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"cypress",    R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
+     FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Northern Islands
+//===----------------------------------------------------------------------===//
+
+def : Proc<"barts",      R600_VLIW5_Itin,
+    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
+def : Proc<"turks",      R600_VLIW5_Itin,
+    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
+def : Proc<"caicos",     R600_VLIW5_Itin,
+    [FeatureNorthernIslands, FeatureCFALUBug]>;
+
+def : Proc<"cayman",     R600_VLIW4_Itin,
+    [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
+
+//===----------------------------------------------------------------------===//
+// Southern Islands
+//===----------------------------------------------------------------------===//
+
+def : Proc<"SI",         SI_Itin, [FeatureSouthernIslands]>;
+
+def : Proc<"tahiti",     SI_Itin, [FeatureSouthernIslands]>;
+
+def : Proc<"pitcairn",   SI_Itin, [FeatureSouthernIslands]>;
+
+def : Proc<"verde",      SI_Itin, [FeatureSouthernIslands]>;
+
+def : Proc<"oland",      SI_Itin, [FeatureSouthernIslands]>;
+
+def : Proc<"hainan",     SI_Itin, [FeatureSouthernIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Sea Islands
+//===----------------------------------------------------------------------===//
+
+def : Proc<"bonaire",    SI_Itin, [FeatureSeaIslands]>;
+
+def : Proc<"kabini",     SI_Itin, [FeatureSeaIslands]>;
+
+def : Proc<"kaveri",     SI_Itin, [FeatureSeaIslands]>;
+
+def : Proc<"hawaii",     SI_Itin, [FeatureSeaIslands]>;
+
+def : Proc<"mullins",    SI_Itin, [FeatureSeaIslands]>;
diff --git a/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp
new file mode 100644
index 0000000..92bf0df
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp
@@ -0,0 +1,205 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer.
+/// This pass is merging consecutive CFAlus where applicable.
+/// It needs to be called after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600mergeclause"
+
+namespace {
+
+static bool isCFAlu(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::CF_ALU:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned getCFAluSize(const MachineInstr *MI) const;
+  bool isCFAluEnabled(const MachineInstr *MI) const;
+
+  /// IfCvt pass can generate "disabled" ALU clause marker that need to be
+  /// removed and their content affected to the previous alu clause.
+  /// This function parse instructions after CFAlu until it find a disabled
+  /// CFAlu and merge the content, or an enabled CFAlu.
+  void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if
+  /// it is the case.
+  bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu)
+      const;
+
+public:
+  R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override;
+};
+
+char R600ClauseMergePass::ID = 0;
+
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const {
+  assert(isCFAlu(MI));
+  return MI->getOperand(
+      TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm();
+}
+
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const {
+  assert(isCFAlu(MI));
+  return MI->getOperand(
+      TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm();
+}
+
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu)
+    const {
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end();
+  I++;
+  do {
+    while (I!= E && !isCFAlu(I))
+      I++;
+    if (I == E)
+      return;
+    MachineInstr *MI = I++;
+    if (isCFAluEnabled(MI))
+      break;
+    CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+    MI->eraseFromParent();
+  } while (I != E);
+}
+
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
+                                          const MachineInstr *LatrCFAlu) const {
+  assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  unsigned RootInstCount = getCFAluSize(RootCFAlu),
+      LaterInstCount = getCFAluSize(LatrCFAlu);
+  unsigned CumuledInsts = RootInstCount + LaterInstCount;
+  if (CumuledInsts >= TII->getMaxAlusPerClause()) {
+    DEBUG(dbgs() << "Excess inst counts\n");
+    return false;
+  }
+  if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+    return false;
+  // Is KCache Bank 0 compatible ?
+  int Mode0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+  int KBank0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+  int KBank0LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm() &&
+      RootCFAlu->getOperand(Mode0Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank0Idx).getImm() !=
+       RootCFAlu->getOperand(KBank0Idx).getImm() ||
+      LatrCFAlu->getOperand(KBank0LineIdx).getImm() !=
+      RootCFAlu->getOperand(KBank0LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  // Is KCache Bank 1 compatible ?
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
+      RootCFAlu->getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
+      RootCFAlu->getOperand(KBank1Idx).getImm() ||
+      LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
+      RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
+    RootCFAlu->getOperand(Mode0Idx).setImm(
+        LatrCFAlu->getOperand(Mode0Idx).getImm());
+    RootCFAlu->getOperand(KBank0Idx).setImm(
+        LatrCFAlu->getOperand(KBank0Idx).getImm());
+    RootCFAlu->getOperand(KBank0LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
+    RootCFAlu->getOperand(Mode1Idx).setImm(
+        LatrCFAlu->getOperand(Mode1Idx).getImm());
+    RootCFAlu->getOperand(KBank1Idx).setImm(
+        LatrCFAlu->getOperand(KBank1Idx).getImm());
+    RootCFAlu->getOperand(KBank1LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+  return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin(),  E = MBB.end();
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      MachineInstr *MI = I++;
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI->getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
+        MI->eraseFromParent();
+      } else {
+        assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+const char *R600ClauseMergePass::getPassName() const {
+  return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+  return new R600ClauseMergePass(TM);
+}
diff --git a/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp
new file mode 100644
index 0000000..e37767a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,682 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass compute turns all control flow pseudo instructions into native one
+/// computing their address on the fly ; it also sets STACK_SIZE info.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600cf"
+
+namespace {
+
+struct CFStack {
+
+  enum StackItem {
+    ENTRY = 0,
+    SUB_ENTRY = 1,
+    FIRST_NON_WQM_PUSH = 2,
+    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+  };
+
+  const AMDGPUSubtarget &ST;
+  std::vector<StackItem> BranchStack;
+  std::vector<StackItem> LoopStack;
+  unsigned MaxStackSize;
+  unsigned CurrentEntries;
+  unsigned CurrentSubEntries;
+
+  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+      // We need to reserve a stack entry for CALL_FS in vertex shaders.
+      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+      CurrentEntries(0), CurrentSubEntries(0) { }
+
+  unsigned getLoopDepth();
+  bool branchStackContains(CFStack::StackItem);
+  bool requiresWorkAroundForInst(unsigned Opcode);
+  unsigned getSubEntrySize(CFStack::StackItem Item);
+  void updateMaxStackSize();
+  void pushBranch(unsigned Opcode, bool isWQM = false);
+  void pushLoop();
+  void popBranch();
+  void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+  return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+       E = BranchStack.end(); I != E; ++I) {
+    if (*I == Item)
+      return true;
+  }
+  return false;
+}
+
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+      getLoopDepth() > 1)
+    return true;
+
+  if (!ST.hasCFAluBug())
+    return false;
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+  case AMDGPU::CF_ALU_ELSE_AFTER:
+  case AMDGPU::CF_ALU_BREAK:
+  case AMDGPU::CF_ALU_CONTINUE:
+    if (CurrentSubEntries == 0)
+      return false;
+    if (ST.getWavefrontSize() == 64) {
+      // We are being conservative here.  We only require this work-around if
+      // CurrentSubEntries > 3 &&
+      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+      //
+      // We have to be conservative, because we don't know for certain that
+      // our stack allocation algorithm for Evergreen/NI is correct.  Applying this
+      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
+      // resources without any problems.
+      return CurrentSubEntries > 3;
+    } else {
+      assert(ST.getWavefrontSize() == 32);
+      // We are being conservative here.  We only require the work-around if
+      // CurrentSubEntries > 7 &&
+      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+      // See the comment on the wavefront size == 64 case for why we are
+      // being conservative.
+      return CurrentSubEntries > 7;
+    }
+  }
+}
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch(Item) {
+  default:
+    return 0;
+  case CFStack::FIRST_NON_WQM_PUSH:
+  assert(!ST.hasCaymanISA());
+  if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+    // +1 For the push operation.
+    // +2 Extra space required.
+    return 3;
+  } else {
+    // Some documentation says that this is not necessary on Evergreen,
+    // but experimentation has show that we need to allocate 1 extra
+    // sub-entry for the first non-WQM push.
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  }
+  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  case CFStack::SUB_ENTRY:
+    return 1;
+  }
+}
+
+void CFStack::updateMaxStackSize() {
+  unsigned CurrentStackSize = CurrentEntries +
+                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+  CFStack::StackItem Item = CFStack::ENTRY;
+  switch(Opcode) {
+  case AMDGPU::CF_PUSH_EG:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    if (!isWQM) {
+      if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
+                                             // See comment in
+                                             // CFStack::getSubEntrySize()
+      else if (CurrentEntries > 0 &&
+               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+               !ST.hasCaymanISA() &&
+               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+      else
+        Item = CFStack::SUB_ENTRY;
+    } else
+      Item = CFStack::ENTRY;
+    break;
+  }
+  BranchStack.push_back(Item);
+  if (Item == CFStack::ENTRY)
+    CurrentEntries++;
+  else
+    CurrentSubEntries += getSubEntrySize(Item);
+  updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+  LoopStack.push_back(CFStack::ENTRY);
+  CurrentEntries++;
+  updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+  CFStack::StackItem Top = BranchStack.back();
+  if (Top == CFStack::ENTRY)
+    CurrentEntries--;
+  else
+    CurrentSubEntries-= getSubEntrySize(Top);
+  BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+  CurrentEntries--;
+  LoopStack.pop_back();
+}
+
+class R600ControlFlowFinalizer : public MachineFunctionPass {
+
+private:
+  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+
+  enum ControlFlowInstruction {
+    CF_TC,
+    CF_VC,
+    CF_CALL_FS,
+    CF_WHILE_LOOP,
+    CF_END_LOOP,
+    CF_LOOP_BREAK,
+    CF_LOOP_CONTINUE,
+    CF_JUMP,
+    CF_ELSE,
+    CF_POP,
+    CF_END
+  };
+
+  static char ID;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+  unsigned MaxFetchInst;
+  const AMDGPUSubtarget &ST;
+
+  bool IsTrivialInst(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
+    unsigned Opcode = 0;
+    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    switch (CFI) {
+    case CF_TC:
+      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+      break;
+    case CF_VC:
+      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+      break;
+    case CF_CALL_FS:
+      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+      break;
+    case CF_WHILE_LOOP:
+      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+      break;
+    case CF_END_LOOP:
+      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+      break;
+    case CF_LOOP_BREAK:
+      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+      break;
+    case CF_LOOP_CONTINUE:
+      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+      break;
+    case CF_JUMP:
+      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+      break;
+    case CF_ELSE:
+      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+      break;
+    case CF_POP:
+      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+      break;
+    case CF_END:
+      if (ST.hasCaymanISA()) {
+        Opcode = AMDGPU::CF_END_CM;
+        break;
+      }
+      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+      break;
+    }
+    assert (Opcode && "No opcode selected");
+    return TII->get(Opcode);
+  }
+
+  bool isCompatibleWithClause(const MachineInstr *MI,
+      std::set<unsigned> &DstRegs) const {
+    unsigned DstMI, SrcMI;
+    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+        E = MI->operands_end(); I != E; ++I) {
+      const MachineOperand &MO = *I;
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          DstMI = Reg;
+        else
+          DstMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
+      if (MO.isUse()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          SrcMI = Reg;
+        else
+          SrcMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
+    }
+    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
+      DstRegs.insert(DstMI);
+      return true;
+    } else
+      return false;
+  }
+
+  ClauseFile
+  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<MachineInstr *> ClauseContent;
+    unsigned AluInstCount = 0;
+    bool IsTex = TII->usesTextureCache(ClauseHead);
+    std::set<unsigned> DstRegs;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(I))
+        continue;
+      if (AluInstCount >= MaxFetchInst)
+        break;
+      if ((IsTex && !TII->usesTextureCache(I)) ||
+          (!IsTex && !TII->usesVertexCache(I)))
+        break;
+      if (!isCompatibleWithClause(I, DstRegs))
+        break;
+      AluInstCount ++;
+      ClauseContent.push_back(I);
+    }
+    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+        getHWInstrDesc(IsTex?CF_TC:CF_VC))
+        .addImm(0) // ADDR
+        .addImm(AluInstCount - 1); // COUNT
+    return ClauseFile(MIb, ClauseContent);
+  }
+
+  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
+    static const unsigned LiteralRegs[] = {
+      AMDGPU::ALU_LITERAL_X,
+      AMDGPU::ALU_LITERAL_Y,
+      AMDGPU::ALU_LITERAL_Z,
+      AMDGPU::ALU_LITERAL_W
+    };
+    const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
+        TII->getSrcs(MI);
+    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
+      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
+        continue;
+      int64_t Imm = Srcs[i].second;
+      std::vector<int64_t>::iterator It =
+          std::find(Lits.begin(), Lits.end(), Imm);
+      if (It != Lits.end()) {
+        unsigned Index = It - Lits.begin();
+        Srcs[i].first->setReg(LiteralRegs[Index]);
+      } else {
+        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
+        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
+        Lits.push_back(Imm);
+      }
+    }
+  }
+
+  MachineBasicBlock::iterator insertLiterals(
+      MachineBasicBlock::iterator InsertPos,
+      const std::vector<unsigned> &Literals) const {
+    MachineBasicBlock *MBB = InsertPos->getParent();
+    for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+      unsigned LiteralPair0 = Literals[i];
+      unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
+      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
+          TII->get(AMDGPU::LITERALS))
+          .addImm(LiteralPair0)
+          .addImm(LiteralPair1);
+    }
+    return InsertPos;
+  }
+
+  ClauseFile
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<MachineInstr *> ClauseContent;
+    I++;
+    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
+      if (IsTrivialInst(I)) {
+        ++I;
+        continue;
+      }
+      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
+        break;
+      std::vector<int64_t> Literals;
+      if (I->isBundle()) {
+        MachineInstr *DeleteMI = I;
+        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+        while (++BI != E && BI->isBundledWithPred()) {
+          BI->unbundleFromPred();
+          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
+            MachineOperand &MO = BI->getOperand(i);
+            if (MO.isReg() && MO.isInternalRead())
+              MO.setIsInternalRead(false);
+          }
+          getLiteral(BI, Literals);
+          ClauseContent.push_back(BI);
+        }
+        I = BI;
+        DeleteMI->eraseFromParent();
+      } else {
+        getLiteral(I, Literals);
+        ClauseContent.push_back(I);
+        I++;
+      }
+      for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+        unsigned literal0 = Literals[i];
+        unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
+        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
+            TII->get(AMDGPU::LITERALS))
+            .addImm(literal0)
+            .addImm(literal2);
+        ClauseContent.push_back(MILit);
+      }
+    }
+    assert(ClauseContent.size() < 128 && "ALU clause is too big");
+    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
+    return ClauseFile(ClauseHead, ClauseContent);
+  }
+
+  void
+  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+      unsigned &CfCount) {
+    CounterPropagateAddr(Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
+        .addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += 2 * Clause.second.size();
+  }
+
+  void
+  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+      unsigned &CfCount) {
+    Clause.first->getOperand(0).setImm(0);
+    CounterPropagateAddr(Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
+        .addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += Clause.second.size();
+  }
+
+  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
+    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
+  }
+  void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
+      const {
+    for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
+        It != E; ++It) {
+      MachineInstr *MI = *It;
+      CounterPropagateAddr(MI, Addr);
+    }
+  }
+
+public:
+  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
+    TII (nullptr), TRI(nullptr),
+    ST(tm.getSubtarget<AMDGPUSubtarget>()) {
+      const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
+      MaxFetchInst = ST.getTexVTXClauseSize();
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+    CFStack CFStack(ST, MFI->getShaderType());
+    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
+        ++MB) {
+      MachineBasicBlock &MBB = *MB;
+      unsigned CfCount = 0;
+      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+      std::vector<MachineInstr * > IfThenElseStack;
+      if (MFI->getShaderType() == ShaderType::VERTEX) {
+        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
+            getHWInstrDesc(CF_CALL_FS));
+        CfCount++;
+      }
+      std::vector<ClauseFile> FetchClauses, AluClauses;
+      std::vector<MachineInstr *> LastAlu(1);
+      std::vector<MachineInstr *> ToPopAfter;
+      
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+          I != E;) {
+        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
+          DEBUG(dbgs() << CfCount << ":"; I->dump(););
+          FetchClauses.push_back(MakeFetchClause(MBB, I));
+          CfCount++;
+          LastAlu.back() = nullptr;
+          continue;
+        }
+
+        MachineBasicBlock::iterator MI = I;
+        if (MI->getOpcode() != AMDGPU::ENDIF)
+          LastAlu.back() = nullptr;
+        if (MI->getOpcode() == AMDGPU::CF_ALU)
+          LastAlu.back() = MI;
+        I++;
+        bool RequiresWorkAround =
+            CFStack.requiresWorkAroundForInst(MI->getOpcode());
+        switch (MI->getOpcode()) {
+        case AMDGPU::CF_ALU_PUSH_BEFORE:
+          if (RequiresWorkAround) {
+            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            MI->setDesc(TII->get(AMDGPU::CF_ALU));
+            CfCount++;
+            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+          } else
+            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+
+        case AMDGPU::CF_ALU:
+          I = MI;
+          AluClauses.push_back(MakeALUClause(MBB, I));
+          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+          CfCount++;
+          break;
+        case AMDGPU::WHILELOOP: {
+          CFStack.pushLoop();
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_WHILE_LOOP))
+              .addImm(1);
+          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+              std::set<MachineInstr *>());
+          Pair.second.insert(MIb);
+          LoopStack.push_back(Pair);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDLOOP: {
+          CFStack.popLoop();
+          std::pair<unsigned, std::set<MachineInstr *> > Pair =
+              LoopStack.back();
+          LoopStack.pop_back();
+          CounterPropagateAddr(Pair.second, CfCount);
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
+              .addImm(Pair.first + 1);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::IF_PREDICATE_SET: {
+          LastAlu.push_back(nullptr);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_JUMP))
+              .addImm(0)
+              .addImm(0);
+          IfThenElseStack.push_back(MIb);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ELSE: {
+          MachineInstr * JumpInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(JumpInst, CfCount);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_ELSE))
+              .addImm(0)
+              .addImm(0);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          IfThenElseStack.push_back(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDIF: {
+          CFStack.popBranch();
+          if (LastAlu.back()) {
+            ToPopAfter.push_back(LastAlu.back());
+          } else {
+            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+                getHWInstrDesc(CF_POP))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            (void)MIb;
+            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+            CfCount++;
+          }
+          
+          MachineInstr *IfOrElseInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(IfOrElseInst, CfCount);
+          IfOrElseInst->getOperand(1).setImm(1);
+          LastAlu.pop_back();
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::BREAK: {
+          CfCount ++;
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_LOOP_BREAK))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::CONTINUE: {
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_LOOP_CONTINUE))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::RETURN: {
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
+          CfCount++;
+          MI->eraseFromParent();
+          if (CfCount % 2) {
+            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
+            CfCount++;
+          }
+          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
+            EmitFetchClause(I, FetchClauses[i], CfCount);
+          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
+            EmitALUClause(I, AluClauses[i], CfCount);
+        }
+        default:
+          if (TII->isExport(MI->getOpcode())) {
+            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+            CfCount++;
+          }
+          break;
+        }
+      }
+      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+        MachineInstr *Alu = ToPopAfter[i];
+        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+            TII->get(AMDGPU::CF_ALU_POP_AFTER))
+            .addImm(Alu->getOperand(0).getImm())
+            .addImm(Alu->getOperand(1).getImm())
+            .addImm(Alu->getOperand(2).getImm())
+            .addImm(Alu->getOperand(3).getImm())
+            .addImm(Alu->getOperand(4).getImm())
+            .addImm(Alu->getOperand(5).getImm())
+            .addImm(Alu->getOperand(6).getImm())
+            .addImm(Alu->getOperand(7).getImm())
+            .addImm(Alu->getOperand(8).getImm());
+        Alu->eraseFromParent();
+      }
+      MFI->StackSize = CFStack.MaxStackSize;
+    }
+
+    return false;
+  }
+
+  const char *getPassName() const override {
+    return "R600 Control Flow Finalizer Pass";
+  }
+};
+
+char R600ControlFlowFinalizer::ID = 0;
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
+  return new R600ControlFlowFinalizer(TM);
+}
diff --git a/contrib/llvm/lib/Target/R600/R600Defines.h b/contrib/llvm/lib/Target/R600/R600Defines.h
new file mode 100644
index 0000000..f2f28fe
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Defines.h
@@ -0,0 +1,171 @@
+//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef R600DEFINES_H_
+#define R600DEFINES_H_
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+// Operand Flags
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG   (1 << 1)
+#define MO_FLAG_ABS   (1 << 2)
+#define MO_FLAG_MASK  (1 << 3)
+#define MO_FLAG_PUSH  (1 << 4)
+#define MO_FLAG_NOT_LAST  (1 << 5)
+#define MO_FLAG_LAST  (1 << 6)
+#define NUM_MO_FLAGS 7
+
+/// \brief Helper for getting the operand index for the instruction flags
+/// operand.
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
+
+namespace R600_InstFlag {
+  enum TIF {
+    TRANS_ONLY = (1 << 0),
+    TEX = (1 << 1),
+    REDUCTION = (1 << 2),
+    FC = (1 << 3),
+    TRIG = (1 << 4),
+    OP3 = (1 << 5),
+    VECTOR = (1 << 6),
+    //FlagOperand bits 7, 8
+    NATIVE_OPERANDS = (1 << 9),
+    OP1 = (1 << 10),
+    OP2 = (1 << 11),
+    VTX_INST  = (1 << 12),
+    TEX_INST = (1 << 13),
+    ALU_INST = (1 << 14),
+    LDS_1A = (1 << 15),
+    LDS_1A1D = (1 << 16),
+    IS_EXPORT = (1 << 17),
+    LDS_1A2D = (1 << 18)
+  };
+}
+
+#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
+
+/// \brief Defines for extracting register information from register encoding
+#define HW_REG_MASK 0x1ff
+#define HW_CHAN_SHIFT 9
+
+#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
+#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
+
+#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST)
+#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST)
+
+namespace OpName {
+
+  enum VecOps {
+    UPDATE_EXEC_MASK_X,
+    UPDATE_PREDICATE_X,
+    WRITE_X,
+    OMOD_X,
+    DST_REL_X,
+    CLAMP_X,
+    SRC0_X,
+    SRC0_NEG_X,
+    SRC0_REL_X,
+    SRC0_ABS_X,
+    SRC0_SEL_X,
+    SRC1_X,
+    SRC1_NEG_X,
+    SRC1_REL_X,
+    SRC1_ABS_X,
+    SRC1_SEL_X,
+    PRED_SEL_X,
+    UPDATE_EXEC_MASK_Y,
+    UPDATE_PREDICATE_Y,
+    WRITE_Y,
+    OMOD_Y,
+    DST_REL_Y,
+    CLAMP_Y,
+    SRC0_Y,
+    SRC0_NEG_Y,
+    SRC0_REL_Y,
+    SRC0_ABS_Y,
+    SRC0_SEL_Y,
+    SRC1_Y,
+    SRC1_NEG_Y,
+    SRC1_REL_Y,
+    SRC1_ABS_Y,
+    SRC1_SEL_Y,
+    PRED_SEL_Y,
+    UPDATE_EXEC_MASK_Z,
+    UPDATE_PREDICATE_Z,
+    WRITE_Z,
+    OMOD_Z,
+    DST_REL_Z,
+    CLAMP_Z,
+    SRC0_Z,
+    SRC0_NEG_Z,
+    SRC0_REL_Z,
+    SRC0_ABS_Z,
+    SRC0_SEL_Z,
+    SRC1_Z,
+    SRC1_NEG_Z,
+    SRC1_REL_Z,
+    SRC1_ABS_Z,
+    SRC1_SEL_Z,
+    PRED_SEL_Z,
+    UPDATE_EXEC_MASK_W,
+    UPDATE_PREDICATE_W,
+    WRITE_W,
+    OMOD_W,
+    DST_REL_W,
+    CLAMP_W,
+    SRC0_W,
+    SRC0_NEG_W,
+    SRC0_REL_W,
+    SRC0_ABS_W,
+    SRC0_SEL_W,
+    SRC1_W,
+    SRC1_NEG_W,
+    SRC1_REL_W,
+    SRC1_ABS_W,
+    SRC1_SEL_W,
+    PRED_SEL_W,
+    IMM_0,
+    IMM_1,
+    VEC_COUNT
+ };
+
+}
+
+//===----------------------------------------------------------------------===//
+// Config register definitions
+//===----------------------------------------------------------------------===//
+
+#define R_02880C_DB_SHADER_CONTROL                    0x02880C
+#define   S_02880C_KILL_ENABLE(x)                      (((x) & 0x1) << 6)
+
+// These fields are the same for all shader types and families.
+#define   S_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
+#define   S_STACK_SIZE(x)                       (((x) & 0xFF) << 8)
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS                 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS                 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS                 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS                 0x0288d4
+
+#define R_0288E8_SQ_LDS_ALLOC                        0x0288E8
+
+#endif // R600DEFINES_H_
diff --git a/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp
new file mode 100644
index 0000000..38afebe
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -0,0 +1,335 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold
+/// 128 Alu instructions ; these instructions can access up to 4 prefetched
+/// 4 lines of 16 registers from constant buffers. Such ALU clauses are
+/// initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeR600EmitClauseMarkersPass(PassRegistry&);
+}
+
+namespace {
+
+class R600EmitClauseMarkers : public MachineFunctionPass {
+
+private:
+  const R600InstrInfo *TII;
+  int Address;
+
+  unsigned OccupiedDwords(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
+      return 4;
+    case AMDGPU::KILL:
+      return 0;
+    default:
+      break;
+    }
+
+    // These will be expanded to two ALU instructions in the
+    // ExpandSpecialInstructions pass.
+    if (TII->isLDSRetInstr(MI->getOpcode()))
+      return 2;
+
+    if(TII->isVector(*MI) ||
+        TII->isCubeOp(MI->getOpcode()) ||
+        TII->isReductionOp(MI->getOpcode()))
+      return 4;
+
+    unsigned NumLiteral = 0;
+    for (MachineInstr::mop_iterator It = MI->operands_begin(),
+        E = MI->operands_end(); It != E; ++It) {
+      MachineOperand &MO = *It;
+      if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+        ++NumLiteral;
+    }
+    return 1 + NumLiteral;
+  }
+
+  bool isALU(const MachineInstr *MI) const {
+    if (TII->isALUInstr(MI->getOpcode()))
+      return true;
+    if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
+      return true;
+    switch (MI->getOpcode()) {
+    case AMDGPU::PRED_X:
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::COPY:
+    case AMDGPU::DOT_4:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool IsTrivialInst(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+    case AMDGPU::IMPLICIT_DEF:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
+    // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
+    // (See also R600ISelLowering.cpp)
+    // ConstIndex value is in [0, 4095];
+    return std::pair<unsigned, unsigned>(
+        ((Sel >> 2) - 512) >> 12, // KC_BANK
+        // Line Number of ConstIndex
+        // A line contains 16 constant registers however KCX bank can lock
+        // two line at the same time ; thus we want to get an even line number.
+        // Line number can be retrieved with (>>4), using (>>5) <<1 generates
+        // an even number.
+        ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
+  }
+
+  bool SubstituteKCacheBank(MachineInstr *MI,
+      std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
+      bool UpdateInstr = true) const {
+    std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+
+    if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+      return true;
+
+    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
+        TII->getSrcs(MI);
+    assert((TII->isALUInstr(MI->getOpcode()) ||
+        MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
+    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+        continue;
+      unsigned Sel = Consts[i].second;
+      unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
+      unsigned KCacheIndex = Index * 4 + Chan;
+      const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
+      if (CachedConsts.empty()) {
+        CachedConsts.push_back(BankLine);
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts[0] == BankLine) {
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts.size() == 1) {
+        CachedConsts.push_back(BankLine);
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts[1] == BankLine) {
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
+        continue;
+      }
+      return false;
+    }
+
+    if (!UpdateInstr)
+      return true;
+
+    for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
+      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+        continue;
+      switch(UsedKCache[j].first) {
+      case 0:
+        Consts[i].first->setReg(
+            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
+        break;
+      case 1:
+        Consts[i].first->setReg(
+            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
+        break;
+      default:
+        llvm_unreachable("Wrong Cache Line");
+      }
+      j++;
+    }
+    return true;
+  }
+
+  bool canClauseLocalKillFitInClause(
+                        unsigned AluInstCount,
+                        std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+                        MachineBasicBlock::iterator Def,
+                        MachineBasicBlock::iterator BBEnd) {
+    const R600RegisterInfo &TRI = TII->getRegisterInfo();
+    for (MachineInstr::const_mop_iterator
+           MOI = Def->operands_begin(),
+           MOE = Def->operands_end(); MOI != MOE; ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef() ||
+          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
+        continue;
+
+      // Def defines a clause local register, so check that its use will fit
+      // in the clause.
+      unsigned LastUseCount = 0;
+      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
+        AluInstCount += OccupiedDwords(UseI);
+        // Make sure we won't need to end the clause due to KCache limitations.
+        if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
+          return false;
+
+        // We have reached the maximum instruction limit before finding the
+        // use that kills this register, so we cannot use this def in the
+        // current clause.
+        if (AluInstCount >= TII->getMaxAlusPerClause())
+          return false;
+
+        // Register kill flags have been cleared by the time we get to this
+        // pass, but it is safe to assume that all uses of this register
+        // occur in the same basic block as its definition, because
+        // it is illegal for the scheduler to schedule them in
+        // different blocks.
+        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+          LastUseCount = AluInstCount;
+
+        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+          break;
+      }
+      if (LastUseCount)
+        return LastUseCount <= TII->getMaxAlusPerClause();
+      llvm_unreachable("Clause local register live at end of clause.");
+    }
+    return true;
+  }
+
+  MachineBasicBlock::iterator
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
+    bool PushBeforeModifier = false;
+    unsigned AluInstCount = 0;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(I))
+        continue;
+      if (!isALU(I))
+        break;
+      if (AluInstCount > TII->getMaxAlusPerClause())
+        break;
+      if (I->getOpcode() == AMDGPU::PRED_X) {
+        // We put PRED_X in its own clause to ensure that ifcvt won't create
+        // clauses with more than 128 insts.
+        // IfCvt is indeed checking that "then" and "else" branches of an if
+        // statement have less than ~60 insts thus converted clauses can't be
+        // bigger than ~121 insts (predicate setter needs to be in the same
+        // clause as predicated alus).
+        if (AluInstCount > 0)
+          break;
+        if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
+          PushBeforeModifier = true;
+        AluInstCount ++;
+        continue;
+      }
+      // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
+      //
+      // * KILL or INTERP instructions
+      // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
+      // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
+      //
+      // XXX: These checks have not been implemented yet.
+      if (TII->mustBeLastInClause(I->getOpcode())) {
+        I++;
+        break;
+      }
+
+      // If this instruction defines a clause local register, make sure
+      // its use can fit in this clause.
+      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
+        break;
+
+      if (!SubstituteKCacheBank(I, KCacheBanks))
+        break;
+      AluInstCount += OccupiedDwords(I);
+    }
+    unsigned Opcode = PushBeforeModifier ?
+        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
+    // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
+    // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
+    // pass may assume that identical ALU clause starter at the beginning of a 
+    // true and false branch can be factorized which is not the case.
+        .addImm(Address++) // ADDR
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
+        .addImm(KCacheBanks.empty()?0:2) // KM0
+        .addImm((KCacheBanks.size() < 2)?0:2) // KM1
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
+        .addImm(AluInstCount) // COUNT
+        .addImm(1); // Enabled
+    return I;
+  }
+
+public:
+  static char ID;
+  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
+
+    initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+
+    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                    BB != BB_E; ++BB) {
+      MachineBasicBlock &MBB = *BB;
+      MachineBasicBlock::iterator I = MBB.begin();
+      if (I->getOpcode() == AMDGPU::CF_ALU)
+        continue; // BB was already parsed
+      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
+        if (isALU(I))
+          I = MakeALUClause(MBB, I);
+        else
+          ++I;
+      }
+    }
+    return false;
+  }
+
+  const char *getPassName() const override {
+    return "R600 Emit Clause Markers Pass";
+  }
+};
+
+char R600EmitClauseMarkers::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
+                      "R600 Emit Clause Markters", false, false)
+INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
+                      "R600 Emit Clause Markters", false, false)
+
+llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+  return new R600EmitClauseMarkers();
+}
+
diff --git a/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp
new file mode 100644
index 0000000..732b06d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -0,0 +1,348 @@
+//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Vector, Reduction, and Cube instructions need to fill the entire instruction
+/// group to work correctly.  This pass expands these individual instructions
+/// into several instructions that will completely fill the instruction group.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
+      unsigned Op);
+
+public:
+  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+    TII(nullptr) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "R600 Expand special instructions pass";
+  }
+};
+
+} // End anonymous namespace
+
+char R600ExpandSpecialInstrsPass::ID = 0;
+
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
+  return new R600ExpandSpecialInstrsPass(TM);
+}
+
+void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
+    const MachineInstr *OldMI, unsigned Op) {
+  int OpIdx = TII->getOperandIdx(*OldMI, Op);
+  if (OpIdx > -1) {
+    uint64_t Val = OldMI->getOperand(OpIdx).getImm();
+    TII->setImmOperand(NewMI, Op, Val);
+  }
+}
+
+bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+
+  const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin();
+    while (I != MBB.end()) {
+      MachineInstr &MI = *I;
+      I = std::next(I);
+
+      // Expand LDS_*_RET instructions
+      if (TII->isLDSRetInstr(MI.getOpcode())) {
+        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+        assert(DstIdx != -1);
+        MachineOperand &DstOp = MI.getOperand(DstIdx);
+        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
+                                               DstOp.getReg(), AMDGPU::OQAP);
+        DstOp.setReg(AMDGPU::OQAP);
+        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        // Copy the pred_sel bit
+        Mov->getOperand(MovPredSelIdx).setReg(
+            MI.getOperand(LDSPredSelIdx).getReg());
+      }
+
+      switch (MI.getOpcode()) {
+      default: break;
+      // Expand PRED_X to one of the PRED_SET instructions.
+      case AMDGPU::PRED_X: {
+        uint64_t Flags = MI.getOperand(3).getImm();
+        // The native opcode used by PRED_X is stored as an immediate in the
+        // third operand.
+        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
+                                            MI.getOperand(2).getImm(), // opcode
+                                            MI.getOperand(0).getReg(), // dst
+                                            MI.getOperand(1).getReg(), // src0
+                                            AMDGPU::ZERO);             // src1
+        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
+        if (Flags & MO_FLAG_PUSH) {
+          TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
+        } else {
+          TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
+        }
+        MI.eraseFromParent();
+        continue;
+        }
+
+      case AMDGPU::INTERP_PAIR_XY: {
+        MachineInstr *BMI;
+        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+                MI.getOperand(2).getImm());
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          unsigned DstReg;
+
+          if (Chan < 2)
+            DstReg = MI.getOperand(Chan).getReg();
+          else
+            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;
+
+          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
+              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
+
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Chan >= 2)
+            TII->addFlag(BMI, 0, MO_FLAG_MASK);
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+        }
+
+        MI.eraseFromParent();
+        continue;
+        }
+
+      case AMDGPU::INTERP_PAIR_ZW: {
+        MachineInstr *BMI;
+        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+                MI.getOperand(2).getImm());
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          unsigned DstReg;
+
+          if (Chan < 2)
+            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
+          else
+            DstReg = MI.getOperand(Chan-2).getReg();
+
+          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
+              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
+
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Chan < 2)
+            TII->addFlag(BMI, 0, MO_FLAG_MASK);
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+        }
+
+        MI.eraseFromParent();
+        continue;
+        }
+
+      case AMDGPU::INTERP_VEC_LOAD: {
+        const R600RegisterInfo &TRI = TII->getRegisterInfo();
+        MachineInstr *BMI;
+        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+                MI.getOperand(1).getImm());
+        unsigned DstReg = MI.getOperand(0).getReg();
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
+              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+        }
+
+        MI.eraseFromParent();
+        continue;
+        }
+      case AMDGPU::DOT_4: {
+
+        const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+        unsigned DstReg = MI.getOperand(0).getReg();
+        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
+          unsigned SubDstReg =
+              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+          MachineInstr *BMI =
+              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Mask) {
+            TII->addFlag(BMI, 0, MO_FLAG_MASK);
+          }
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+          unsigned Opcode = BMI->getOpcode();
+          // While not strictly necessary from hw point of view, we force
+          // all src operands of a dot4 inst to belong to the same slot.
+          unsigned Src0 = BMI->getOperand(
+              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+              .getReg();
+          unsigned Src1 = BMI->getOperand(
+              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+              .getReg();
+          (void) Src0;
+          (void) Src1;
+          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+              (TRI.getEncodingValue(Src1) & 0xff) < 127)
+            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
+        }
+        MI.eraseFromParent();
+        continue;
+      }
+      }
+
+      bool IsReduction = TII->isReductionOp(MI.getOpcode());
+      bool IsVector = TII->isVector(MI);
+      bool IsCube = TII->isCubeOp(MI.getOpcode());
+      if (!IsReduction && !IsVector && !IsCube) {
+        continue;
+      }
+
+      // Expand the instruction
+      //
+      // Reduction instructions:
+      // T0_X = DP4 T1_XYZW, T2_XYZW
+      // becomes:
+      // TO_X = DP4 T1_X, T2_X
+      // TO_Y (write masked) = DP4 T1_Y, T2_Y
+      // TO_Z (write masked) = DP4 T1_Z, T2_Z
+      // TO_W (write masked) = DP4 T1_W, T2_W
+      //
+      // Vector instructions:
+      // T0_X = MULLO_INT T1_X, T2_X
+      // becomes:
+      // T0_X = MULLO_INT T1_X, T2_X
+      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
+      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
+      // T0_W (write masked) = MULLO_INT T1_X, T2_X
+      //
+      // Cube instructions:
+      // T0_XYZW = CUBE T1_XYZW
+      // becomes:
+      // TO_X = CUBE T1_Z, T1_Y
+      // T0_Y = CUBE T1_Z, T1_X
+      // T0_Z = CUBE T1_X, T1_Z
+      // T0_W = CUBE T1_Y, T1_Z
+      for (unsigned Chan = 0; Chan < 4; Chan++) {
+        unsigned DstReg = MI.getOperand(
+                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
+        unsigned Src0 = MI.getOperand(
+                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
+        unsigned Src1 = 0;
+
+        // Determine the correct source registers
+        if (!IsCube) {
+          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
+          if (Src1Idx != -1) {
+            Src1 = MI.getOperand(Src1Idx).getReg();
+          }
+        }
+        if (IsReduction) {
+          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+          Src0 = TRI.getSubReg(Src0, SubRegIndex);
+          Src1 = TRI.getSubReg(Src1, SubRegIndex);
+        } else if (IsCube) {
+          static const int CubeSrcSwz[] = {2, 2, 0, 1};
+          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
+          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
+          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
+        }
+
+        // Determine the correct destination registers;
+        bool Mask = false;
+        bool NotLast = true;
+        if (IsCube) {
+          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
+        } else {
+          // Mask the write if the original instruction does not write to
+          // the current Channel.
+          Mask = (Chan != TRI.getHWRegChan(DstReg));
+          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+        }
+
+        // Set the IsLast bit
+        NotLast = (Chan != 3 );
+
+        // Add the new instruction
+        unsigned Opcode = MI.getOpcode();
+        switch (Opcode) {
+        case AMDGPU::CUBE_r600_pseudo:
+          Opcode = AMDGPU::CUBE_r600_real;
+          break;
+        case AMDGPU::CUBE_eg_pseudo:
+          Opcode = AMDGPU::CUBE_eg_real;
+          break;
+        default:
+          break;
+        }
+
+        MachineInstr *NewMI =
+          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
+
+        if (Chan != 0)
+          NewMI->bundleWithPred();
+        if (Mask) {
+          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
+        }
+        if (NotLast) {
+          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
+        }
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
new file mode 100644
index 0000000..52315bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
@@ -0,0 +1,2326 @@
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for R600
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600ISelLowering.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM),
+    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
+  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
+  computeRegisterProperties();
+
+  // Set condition code actions
+  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
+
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+
+  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
+
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+
+  setOperationAction(ISD::FSUB, MVT::f32, Expand);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+
+  // Expand sign extension of vectors
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
+
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
+
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
+
+  // Legalize loads and stores to the private address space.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+  // spaces, so it is custom lowered to handle those where it isn't.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+
+  setOperationAction(ISD::STORE, MVT::i8, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+  setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::FP_TO_SINT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+
+  setOperationAction(ISD::SUB, MVT::i64, Expand);
+
+  // These should be replaced by UDVIREM, but it does not happen automatically
+  // during Type Legalization
+  setOperationAction(ISD::UDIV, MVT::i64, Custom);
+  setOperationAction(ISD::UREM, MVT::i64, Custom);
+  setOperationAction(ISD::SDIV, MVT::i64, Custom);
+  setOperationAction(ISD::SREM, MVT::i64, Custom);
+
+  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
+  //  to be Legal/Custom in order to avoid library calls.
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+
+  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+  for (MVT VT : ScalarIntVTs) {
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::SUBE, VT, Expand);
+  }
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+  setSchedulingPreference(Sched::Source);
+}
+
+MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const {
+  MachineFunction * MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock::iterator I = *MI;
+  const R600InstrInfo *TII =
+    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
+
+  switch (MI->getOpcode()) {
+  default:
+    // Replace LDS_*_RET instruction that don't have any uses with the
+    // equivalent LDS_*_NORET instruction.
+    if (TII->isLDSRetInstr(MI->getOpcode())) {
+      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+      assert(DstIdx != -1);
+      MachineInstrBuilder NewMI;
+      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+        return BB;
+
+      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
+      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
+        NewMI.addOperand(MI->getOperand(i));
+      }
+    } else {
+      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+    }
+    break;
+  case AMDGPU::CLAMP_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
+                                                   AMDGPU::MOV,
+                                                   MI->getOperand(0).getReg(),
+                                                   MI->getOperand(1).getReg());
+    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
+    break;
+  }
+
+  case AMDGPU::FABS_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
+                                                    AMDGPU::MOV,
+                                                    MI->getOperand(0).getReg(),
+                                                    MI->getOperand(1).getReg());
+    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
+    break;
+  }
+
+  case AMDGPU::FNEG_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
+                                                    AMDGPU::MOV,
+                                                    MI->getOperand(0).getReg(),
+                                                    MI->getOperand(1).getReg());
+    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
+    break;
+  }
+
+  case AMDGPU::MASK_WRITE: {
+    unsigned maskedRegister = MI->getOperand(0).getReg();
+    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
+    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
+    break;
+  }
+
+  case AMDGPU::MOV_IMM_F32:
+    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
+                     MI->getOperand(1).getFPImm()->getValueAPF()
+                         .bitcastToAPInt().getZExtValue());
+    break;
+  case AMDGPU::MOV_IMM_I32:
+    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
+                     MI->getOperand(1).getImm());
+    break;
+  case AMDGPU::CONST_COPY: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
+        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
+    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
+        MI->getOperand(1).getImm());
+    break;
+  }
+
+  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
+    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addImm(EOP); // Set End of program bit
+    break;
+  }
+
+  case AMDGPU::TXD: {
+    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    MachineOperand &RID = MI->getOperand(4);
+    MachineOperand &SID = MI->getOperand(5);
+    unsigned TextureId = MI->getOperand(6).getImm();
+    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+    switch (TextureId) {
+    case 5: // Rect
+      CTX = CTY = 0;
+      break;
+    case 6: // Shadow1D
+      SrcW = SrcZ;
+      break;
+    case 7: // Shadow2D
+      SrcW = SrcZ;
+      break;
+    case 8: // ShadowRect
+      CTX = CTY = 0;
+      SrcW = SrcZ;
+      break;
+    case 9: // 1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 10: // 2DArray
+      CTZ = 0;
+      break;
+    case 11: // Shadow1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 12: // Shadow2DArray
+      CTZ = 0;
+      break;
+    }
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
+            .addOperand(MI->getOperand(3))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
+            .addOperand(MI->getOperand(2))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW)
+            .addReg(T0, RegState::Implicit)
+            .addReg(T1, RegState::Implicit);
+    break;
+  }
+
+  case AMDGPU::TXD_SHADOW: {
+    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    MachineOperand &RID = MI->getOperand(4);
+    MachineOperand &SID = MI->getOperand(5);
+    unsigned TextureId = MI->getOperand(6).getImm();
+    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+    switch (TextureId) {
+    case 5: // Rect
+      CTX = CTY = 0;
+      break;
+    case 6: // Shadow1D
+      SrcW = SrcZ;
+      break;
+    case 7: // Shadow2D
+      SrcW = SrcZ;
+      break;
+    case 8: // ShadowRect
+      CTX = CTY = 0;
+      SrcW = SrcZ;
+      break;
+    case 9: // 1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 10: // 2DArray
+      CTZ = 0;
+      break;
+    case 11: // Shadow1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 12: // Shadow2DArray
+      CTZ = 0;
+      break;
+    }
+
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
+            .addOperand(MI->getOperand(3))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
+            .addOperand(MI->getOperand(2))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW)
+            .addReg(T0, RegState::Implicit)
+            .addReg(T1, RegState::Implicit);
+    break;
+  }
+
+  case AMDGPU::BRANCH:
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+              .addOperand(MI->getOperand(0));
+      break;
+
+  case AMDGPU::BRANCH_COND_f32: {
+    MachineInstr *NewMI =
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+              AMDGPU::PREDICATE_BIT)
+              .addOperand(MI->getOperand(1))
+              .addImm(OPCODE_IS_NOT_ZERO)
+              .addImm(0); // Flags
+    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+            .addOperand(MI->getOperand(0))
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    break;
+  }
+
+  case AMDGPU::BRANCH_COND_i32: {
+    MachineInstr *NewMI =
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+            AMDGPU::PREDICATE_BIT)
+            .addOperand(MI->getOperand(1))
+            .addImm(OPCODE_IS_NOT_ZERO_INT)
+            .addImm(0); // Flags
+    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+           .addOperand(MI->getOperand(0))
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    break;
+  }
+
+  case AMDGPU::EG_ExportSwz:
+  case AMDGPU::R600_ExportSwz: {
+    // Instruction is left unmodified if its not the last one of its type
+    bool isLastInstructionOfItsType = true;
+    unsigned InstExportType = MI->getOperand(1).getImm();
+    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
+         EndBlock = BB->end(); NextExportInst != EndBlock;
+         NextExportInst = std::next(NextExportInst)) {
+      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
+          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
+            .getImm();
+        if (CurrentInstExportType == InstExportType) {
+          isLastInstructionOfItsType = false;
+          break;
+        }
+      }
+    }
+    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+    if (!EOP && !isLastInstructionOfItsType)
+      return BB;
+    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addOperand(MI->getOperand(2))
+            .addOperand(MI->getOperand(3))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6))
+            .addImm(CfInst)
+            .addImm(EOP);
+    break;
+  }
+  case AMDGPU::RETURN: {
+    // RETURN instructions must have the live-out registers as implicit uses,
+    // otherwise they appear dead.
+    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+    MachineInstrBuilder MIB(*MF, MI);
+    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
+      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
+    return BB;
+  }
+  }
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
+  case ISD::FCOS:
+  case ISD::FSIN: return LowerTrig(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::LOAD: {
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
+  }
+
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+  case ISD::INTRINSIC_VOID: {
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    switch (IntrinsicID) {
+    case AMDGPUIntrinsic::AMDGPU_store_output: {
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
+      MFI->LiveOuts.push_back(Reg);
+      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
+    }
+    case AMDGPUIntrinsic::R600_store_swizzle: {
+      const SDValue Args[8] = {
+        Chain,
+        Op.getOperand(2), // Export Value
+        Op.getOperand(3), // ArrayBase
+        Op.getOperand(4), // Type
+        DAG.getConstant(0, MVT::i32), // SWZ_X
+        DAG.getConstant(1, MVT::i32), // SWZ_Y
+        DAG.getConstant(2, MVT::i32), // SWZ_Z
+        DAG.getConstant(3, MVT::i32) // SWZ_W
+      };
+      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
+    }
+
+    // default for switch(IntrinsicID)
+    default: break;
+    }
+    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    SDLoc DL(Op);
+    switch(IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case AMDGPUIntrinsic::R600_load_input: {
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      MRI.addLiveIn(Reg);
+      return DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), Reg, VT);
+    }
+
+    case AMDGPUIntrinsic::R600_interp_input: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
+      MachineSDNode *interp;
+      if (ijb < 0) {
+        const MachineFunction &MF = DAG.getMachineFunction();
+        const R600InstrInfo *TII =
+          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
+        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
+            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
+        return DAG.getTargetExtractSubreg(
+            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
+            DL, MVT::f32, SDValue(interp, 0));
+      }
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
+      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
+      MRI.addLiveIn(RegisterI);
+      MRI.addLiveIn(RegisterJ);
+      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
+      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
+
+      if (slot % 4 < 2)
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
+            RegisterJNode, RegisterINode);
+      else
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
+            RegisterJNode, RegisterINode);
+      return SDValue(interp, slot % 2);
+    }
+    case AMDGPUIntrinsic::R600_interp_xy:
+    case AMDGPUIntrinsic::R600_interp_zw: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      MachineSDNode *interp;
+      SDValue RegisterINode = Op.getOperand(2);
+      SDValue RegisterJNode = Op.getOperand(3);
+
+      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+            RegisterJNode, RegisterINode);
+      else
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+            RegisterJNode, RegisterINode);
+      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
+          SDValue(interp, 0), SDValue(interp, 1));
+    }
+    case AMDGPUIntrinsic::R600_tex:
+    case AMDGPUIntrinsic::R600_texc:
+    case AMDGPUIntrinsic::R600_txl:
+    case AMDGPUIntrinsic::R600_txlc:
+    case AMDGPUIntrinsic::R600_txb:
+    case AMDGPUIntrinsic::R600_txbc:
+    case AMDGPUIntrinsic::R600_txf:
+    case AMDGPUIntrinsic::R600_txq:
+    case AMDGPUIntrinsic::R600_ddx:
+    case AMDGPUIntrinsic::R600_ddy:
+    case AMDGPUIntrinsic::R600_ldptr: {
+      unsigned TextureOp;
+      switch (IntrinsicID) {
+      case AMDGPUIntrinsic::R600_tex:
+        TextureOp = 0;
+        break;
+      case AMDGPUIntrinsic::R600_texc:
+        TextureOp = 1;
+        break;
+      case AMDGPUIntrinsic::R600_txl:
+        TextureOp = 2;
+        break;
+      case AMDGPUIntrinsic::R600_txlc:
+        TextureOp = 3;
+        break;
+      case AMDGPUIntrinsic::R600_txb:
+        TextureOp = 4;
+        break;
+      case AMDGPUIntrinsic::R600_txbc:
+        TextureOp = 5;
+        break;
+      case AMDGPUIntrinsic::R600_txf:
+        TextureOp = 6;
+        break;
+      case AMDGPUIntrinsic::R600_txq:
+        TextureOp = 7;
+        break;
+      case AMDGPUIntrinsic::R600_ddx:
+        TextureOp = 8;
+        break;
+      case AMDGPUIntrinsic::R600_ddy:
+        TextureOp = 9;
+        break;
+      case AMDGPUIntrinsic::R600_ldptr:
+        TextureOp = 10;
+        break;
+      default:
+        llvm_unreachable("Unknow Texture Operation");
+      }
+
+      SDValue TexArgs[19] = {
+        DAG.getConstant(TextureOp, MVT::i32),
+        Op.getOperand(1),
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(1, MVT::i32),
+        DAG.getConstant(2, MVT::i32),
+        DAG.getConstant(3, MVT::i32),
+        Op.getOperand(2),
+        Op.getOperand(3),
+        Op.getOperand(4),
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(1, MVT::i32),
+        DAG.getConstant(2, MVT::i32),
+        DAG.getConstant(3, MVT::i32),
+        Op.getOperand(5),
+        Op.getOperand(6),
+        Op.getOperand(7),
+        Op.getOperand(8),
+        Op.getOperand(9),
+        Op.getOperand(10)
+      };
+      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
+    }
+    case AMDGPUIntrinsic::AMDGPU_dp4: {
+      SDValue Args[8] = {
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(0, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(0, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(1, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(1, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(2, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(2, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(3, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(3, MVT::i32))
+      };
+      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
+    }
+
+    case Intrinsic::r600_read_ngroups_x:
+      return LowerImplicitParameter(DAG, VT, DL, 0);
+    case Intrinsic::r600_read_ngroups_y:
+      return LowerImplicitParameter(DAG, VT, DL, 1);
+    case Intrinsic::r600_read_ngroups_z:
+      return LowerImplicitParameter(DAG, VT, DL, 2);
+    case Intrinsic::r600_read_global_size_x:
+      return LowerImplicitParameter(DAG, VT, DL, 3);
+    case Intrinsic::r600_read_global_size_y:
+      return LowerImplicitParameter(DAG, VT, DL, 4);
+    case Intrinsic::r600_read_global_size_z:
+      return LowerImplicitParameter(DAG, VT, DL, 5);
+    case Intrinsic::r600_read_local_size_x:
+      return LowerImplicitParameter(DAG, VT, DL, 6);
+    case Intrinsic::r600_read_local_size_y:
+      return LowerImplicitParameter(DAG, VT, DL, 7);
+    case Intrinsic::r600_read_local_size_z:
+      return LowerImplicitParameter(DAG, VT, DL, 8);
+
+    case Intrinsic::r600_read_tgid_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_X, VT);
+    case Intrinsic::r600_read_tgid_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_Y, VT);
+    case Intrinsic::r600_read_tgid_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_Z, VT);
+    case Intrinsic::r600_read_tidig_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_X, VT);
+    case Intrinsic::r600_read_tidig_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_Y, VT);
+    case Intrinsic::r600_read_tidig_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_Z, VT);
+    case Intrinsic::AMDGPU_rsq:
+      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
+      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+    }
+    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
+    break;
+  }
+  } // end switch(Op.getOpcode())
+  return SDValue();
+}
+
+void R600TargetLowering::ReplaceNodeResults(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
+    return;
+  case ISD::FP_TO_UINT:
+    if (N->getValueType(0) == MVT::i1) {
+      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+      return;
+    }
+    // Fall-through. Since we don't care about out of bounds values
+    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
+    // considers some extra cases which are not necessary here.
+  case ISD::FP_TO_SINT: {
+    SDValue Result;
+    if (expandFP_TO_SINT(N, Result, DAG))
+      Results.push_back(Result);
+    return;
+  }
+  case ISD::UDIV: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+      N->getOperand(0), N->getOperand(1));
+    Results.push_back(UDIVREM);
+    break;
+  }
+  case ISD::UREM: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+      N->getOperand(0), N->getOperand(1));
+    Results.push_back(UDIVREM.getValue(1));
+    break;
+  }
+  case ISD::SDIV: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
+      N->getOperand(0), N->getOperand(1));
+    Results.push_back(SDIVREM);
+    break;
+  }
+  case ISD::SREM: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
+      N->getOperand(0), N->getOperand(1));
+    Results.push_back(SDIVREM.getValue(1));
+    break;
+  }
+  case ISD::SDIVREM: {
+    SDValue Op = SDValue(N, 1);
+    SDValue RES = LowerSDIVREM(Op, DAG);
+    Results.push_back(RES);
+    Results.push_back(RES.getValue(1));
+    break;
+  }
+  case ISD::UDIVREM: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+    SDValue one = DAG.getConstant(1, HalfVT);
+    SDValue zero = DAG.getConstant(0, HalfVT);
+
+    //HiLo split
+    SDValue LHS = N->getOperand(0);
+    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+    SDValue RHS = N->getOperand(1);
+    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+    // Get Speculative values
+    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+    SDValue REM_Hi = zero;
+    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+
+    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+    SDValue DIV_Lo = zero;
+
+    const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+    for (unsigned i = 0; i < halfBitWidth; ++i) {
+      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+      // Get Value of high bit
+      SDValue HBit;
+      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+      } else {
+        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+      }
+
+      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+        DAG.getConstant(halfBitWidth - 1, HalfVT));
+      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
+
+      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+
+
+      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+
+      SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
+
+      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+      // Update REM
+
+      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+
+      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
+      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+    }
+
+    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+    Results.push_back(DIV);
+    Results.push_back(REM);
+    break;
+  }
+  }
+}
+
+SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
+                                                   SDValue Vector) const {
+
+  SDLoc DL(Vector);
+  EVT VecVT = Vector.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  SmallVector<SDValue, 8> Args;
+
+  for (unsigned i = 0, e = VecVT.getVectorNumElements();
+                                                           i != e; ++i) {
+    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                               Vector, DAG.getConstant(i, getVectorIdxTy())));
+  }
+
+  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
+}
+
+SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+
+  SDLoc DL(Op);
+  SDValue Vector = Op.getOperand(0);
+  SDValue Index = Op.getOperand(1);
+
+  if (isa<ConstantSDNode>(Index) ||
+      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+    return Op;
+
+  Vector = vectorToVerticalVector(DAG, Vector);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
+                     Vector, Index);
+}
+
+SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vector = Op.getOperand(0);
+  SDValue Value = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+
+  if (isa<ConstantSDNode>(Index) ||
+      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+    return Op;
+
+  Vector = vectorToVerticalVector(DAG, Vector);
+  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
+                               Vector, Value, Index);
+  return vectorToVerticalVector(DAG, Insert);
+}
+
+SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  // On hw >= R700, COS/SIN input must be between -1. and 1.
+  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
+      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
+        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
+          DAG.getConstantFP(0.15915494309, MVT::f32)),
+        DAG.getConstantFP(0.5, MVT::f32)));
+  unsigned TrigNode;
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    TrigNode = AMDGPUISD::COS_HW;
+    break;
+  case ISD::FSIN:
+    TrigNode = AMDGPUISD::SIN_HW;
+    break;
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
+      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
+        DAG.getConstantFP(-0.5, MVT::f32)));
+  if (Gen >= AMDGPUSubtarget::R700)
+    return TrigVal;
+  // On R600 hw, COS/SIN input must be between -Pi and Pi.
+  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
+      DAG.getConstantFP(3.14159265359, MVT::f32));
+}
+
+SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Shift = Op.getOperand(2);
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue One  = DAG.getConstant(1, VT);
+
+  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
+  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
+  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+  // The dance around Width1 is necessary for 0 special case.
+  // Without it the CompShift might be 32, producing incorrect results in
+  // Overflow. So we do the shift in two steps, the alternative is to
+  // add a conditional to filter the special case.
+
+  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
+  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
+
+  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
+  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
+  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
+
+  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
+  SDValue LoBig = Zero;
+
+  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
+SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Shift = Op.getOperand(2);
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue One  = DAG.getConstant(1, VT);
+
+  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
+
+  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
+  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
+  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+  // The dance around Width1 is necessary for 0 special case.
+  // Without it the CompShift might be 32, producing incorrect results in
+  // Overflow. So we do the shift in two steps, the alternative is to
+  // add a conditional to filter the special case.
+
+  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
+  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
+
+  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
+  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
+  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
+
+  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
+  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
+
+  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
+SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
+  return DAG.getNode(
+      ISD::SETCC,
+      SDLoc(Op),
+      MVT::i1,
+      Op, DAG.getConstantFP(0.0f, MVT::f32),
+      DAG.getCondCode(ISD::SETNE)
+      );
+}
+
+SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
+                                                   SDLoc DL,
+                                                   unsigned DwordOffset) const {
+  unsigned ByteOffset = DwordOffset * 4;
+  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                      AMDGPUAS::CONSTANT_BUFFER_0);
+
+  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
+  assert(isInt<16>(ByteOffset));
+
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
+                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
+                     false, false, false, 0);
+}
+
+bool R600TargetLowering::isZero(SDValue Op) const {
+  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+    return Cst->isNullValue();
+  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
+    return CstFP->isZero();
+  } else {
+    return false;
+  }
+}
+
+SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  SDValue Temp;
+
+  // LHS and RHS are guaranteed to be the same value type
+  EVT CompareVT = LHS.getValueType();
+
+  // Check if we can lower this to a native operation.
+
+  // Try to lower to a SET* instruction:
+  //
+  // SET* can match the following patterns:
+  //
+  // select_cc f32, f32, -1,  0, cc_supported
+  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
+  // select_cc i32, i32, -1,  0, cc_supported
+  //
+
+  // Move hardware True/False values to the correct operand.
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  ISD::CondCode InverseCC =
+     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+      std::swap(False, True);
+      CC = DAG.getCondCode(InverseCC);
+    } else {
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+        std::swap(False, True);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(SwapInvCC);
+      }
+    }
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False) &&
+      (CompareVT == VT || VT == MVT::i32)) {
+    // This can be matched by a SET* instruction.
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
+  // Try to lower to a CND* instruction:
+  //
+  // CND* can match the following patterns:
+  //
+  // select_cc f32, 0.0, f32, f32, cc_supported
+  // select_cc f32, 0.0, i32, i32, cc_supported
+  // select_cc i32, 0,   f32, f32, cc_supported
+  // select_cc i32, 0,   i32, i32, cc_supported
+  //
+
+  // Try to move the zero value to the RHS
+  if (isZero(LHS)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    // Try swapping the operands
+    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(CCSwapped);
+    } else {
+      // Try inverting the conditon and then swapping the operands
+      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+        std::swap(True, False);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(CCSwapped);
+      }
+    }
+  }
+  if (isZero(RHS)) {
+    SDValue Cond = LHS;
+    SDValue Zero = RHS;
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    if (CompareVT != VT) {
+      // Bitcast True / False to the correct types.  This will end up being
+      // a nop, but it allows us to define only a single pattern in the
+      // .TD files for each CND* instruction rather than having to have
+      // one pattern for integer True/False and one for fp True/False
+      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
+      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
+    }
+
+    switch (CCOpcode) {
+    case ISD::SETONE:
+    case ISD::SETUNE:
+    case ISD::SETNE:
+      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+      Temp = True;
+      True = False;
+      False = Temp;
+      break;
+    default:
+      break;
+    }
+    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+        Cond, Zero,
+        True, False,
+        DAG.getCondCode(CCOpcode));
+    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
+  }
+
+  // If we make it this for it means we have no native instructions to handle
+  // this SELECT_CC, so we must lower it.
+  SDValue HWTrue, HWFalse;
+
+  if (CompareVT == MVT::f32) {
+    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
+    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
+  } else if (CompareVT == MVT::i32) {
+    HWTrue = DAG.getConstant(-1, CompareVT);
+    HWFalse = DAG.getConstant(0, CompareVT);
+  }
+  else {
+    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
+  }
+
+  // Lower this unsupported SELECT_CC into a combination of two supported
+  // SELECT_CC operations.
+  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
+
+  return DAG.getNode(ISD::SELECT_CC, DL, VT,
+      Cond, HWFalse,
+      True, False,
+      DAG.getCondCode(ISD::SETNE));
+}
+
+/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
+/// convert these pointers to a register index.  Each register holds
+/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
+/// for indirect addressing.
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned SRLPad;
+  switch(StackWidth) {
+  case 1:
+    SRLPad = 2;
+    break;
+  case 2:
+    SRLPad = 3;
+    break;
+  case 4:
+    SRLPad = 4;
+    break;
+  default: llvm_unreachable("Invalid stack width");
+  }
+
+  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
+                     DAG.getConstant(SRLPad, MVT::i32));
+}
+
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  switch (StackWidth) {
+  default:
+  case 1:
+    Channel = 0;
+    if (ElemIdx > 0) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 2:
+    Channel = ElemIdx % 2;
+    if (ElemIdx == 2) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 4:
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    break;
+  }
+}
+
+SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Value = Op.getOperand(1);
+  SDValue Ptr = Op.getOperand(2);
+
+  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+    if (StoreNode->isTruncatingStore()) {
+      EVT VT = Value.getValueType();
+      assert(VT.bitsLE(MVT::i32));
+      EVT MemVT = StoreNode->getMemoryVT();
+      SDValue MaskConstant;
+      if (MemVT == MVT::i8) {
+        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
+      } else {
+        assert(MemVT == MVT::i16);
+        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
+      }
+      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+                                      DAG.getConstant(2, MVT::i32));
+      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+                                      DAG.getConstant(0x00000003, VT));
+      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+                                   DAG.getConstant(3, VT));
+      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+      // vector instead.
+      SDValue Src[4] = {
+        ShiftedValue,
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(0, MVT::i32),
+        Mask
+      };
+      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
+      SDValue Args[3] = { Chain, Input, DWordAddr };
+      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+                                     Op->getVTList(), Args, MemVT,
+                                     StoreNode->getMemOperand());
+    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+               Value.getValueType().bitsGE(MVT::i32)) {
+      // Convert pointer from byte address to dword address.
+      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+                                    Ptr, DAG.getConstant(2, MVT::i32)));
+
+      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+        llvm_unreachable("Truncated and indexed stores not supported yet");
+      } else {
+        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+      }
+      return Chain;
+    }
+  }
+
+  EVT ValueVT = Value.getValueType();
+
+  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode()) {
+    return Ret;
+  }
+  // Lowering for indirect addressing
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+                                         getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (ValueVT.isVector()) {
+    unsigned NumElemVT = ValueVT.getVectorNumElements();
+    EVT ElemVT = ValueVT.getVectorElementType();
+    SmallVector<SDValue, 4> Stores(NumElemVT);
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+                                 Value, DAG.getConstant(i, MVT::i32));
+
+      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                              Chain, Elem, Ptr,
+                              DAG.getTargetConstant(Channel, MVT::i32));
+    }
+     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+   } else {
+    if (ValueVT == MVT::i8) {
+      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+    }
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+    DAG.getTargetConstant(0, MVT::i32)); // Channel
+  }
+
+  return Chain;
+}
+
+// return (512 + (kc_bank << 12)
+static int
+ConstantAddressBlock(unsigned AddressSpace) {
+  switch (AddressSpace) {
+  case AMDGPUAS::CONSTANT_BUFFER_0:
+    return 512;
+  case AMDGPUAS::CONSTANT_BUFFER_1:
+    return 512 + 4096;
+  case AMDGPUAS::CONSTANT_BUFFER_2:
+    return 512 + 4096 * 2;
+  case AMDGPUAS::CONSTANT_BUFFER_3:
+    return 512 + 4096 * 3;
+  case AMDGPUAS::CONSTANT_BUFFER_4:
+    return 512 + 4096 * 4;
+  case AMDGPUAS::CONSTANT_BUFFER_5:
+    return 512 + 4096 * 5;
+  case AMDGPUAS::CONSTANT_BUFFER_6:
+    return 512 + 4096 * 6;
+  case AMDGPUAS::CONSTANT_BUFFER_7:
+    return 512 + 4096 * 7;
+  case AMDGPUAS::CONSTANT_BUFFER_8:
+    return 512 + 4096 * 8;
+  case AMDGPUAS::CONSTANT_BUFFER_9:
+    return 512 + 4096 * 9;
+  case AMDGPUAS::CONSTANT_BUFFER_10:
+    return 512 + 4096 * 10;
+  case AMDGPUAS::CONSTANT_BUFFER_11:
+    return 512 + 4096 * 11;
+  case AMDGPUAS::CONSTANT_BUFFER_12:
+    return 512 + 4096 * 12;
+  case AMDGPUAS::CONSTANT_BUFFER_13:
+    return 512 + 4096 * 13;
+  case AMDGPUAS::CONSTANT_BUFFER_14:
+    return 512 + 4096 * 14;
+  case AMDGPUAS::CONSTANT_BUFFER_15:
+    return 512 + 4096 * 15;
+  default:
+    return -1;
+  }
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Ptr = Op.getOperand(1);
+  SDValue LoweredLoad;
+
+  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+  if (Ret.getNode()) {
+    SDValue Ops[2] = {
+      Ret,
+      Chain
+    };
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  // Lower loads constant address space global variable loads
+  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+      isa<GlobalVariable>(
+          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
+
+    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
+        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+        DAG.getConstant(2, MVT::i32));
+    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
+                       LoadNode->getChain(), Ptr,
+                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+  }
+
+  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+    SDValue MergedValues[2] = {
+      SplitVectorLoad(Op, DAG),
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  if (ConstantBlock > -1 &&
+      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
+    SDValue Result;
+    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
+        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+        isa<ConstantSDNode>(Ptr)) {
+      SDValue Slots[4];
+      for (unsigned i = 0; i < 4; i++) {
+        // We want Const position encoded with the following formula :
+        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+        // const_index is Ptr computed by llvm using an alignment of 16.
+        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+        // then div by 4 at the ISel step
+        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
+        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+      }
+      EVT NewVT = MVT::v4i32;
+      unsigned NumElements = 4;
+      if (VT.isVector()) {
+        NewVT = VT;
+        NumElements = VT.getVectorNumElements();
+      }
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
+                           makeArrayRef(Slots, NumElements));
+    } else {
+      // non-constant ptr can't be folded, keeps it as a v4f32 load
+      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
+          DAG.getConstant(LoadNode->getAddressSpace() -
+                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
+          );
+    }
+
+    if (!VT.isVector()) {
+      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+          DAG.getConstant(0, MVT::i32));
+    }
+
+    SDValue MergedValues[2] = {
+      Result,
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  // For most operations returning SDValue() will result in the node being
+  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+  // need to manually expand loads that may be legal in some address spaces and
+  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+  // compute shaders, since the data is sign extended when it is uploaded to the
+  // buffer. However SEXT loads from other address spaces are not supported, so
+  // we need to expand them here.
+  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
+    EVT MemVT = LoadNode->getMemoryVT();
+    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
+    SDValue ShiftAmount =
+          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
+    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
+                                  LoadNode->getPointerInfo(), MemVT,
+                                  LoadNode->isVolatile(),
+                                  LoadNode->isNonTemporal(),
+                                  LoadNode->getAlignment());
+    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
+    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
+
+    SDValue MergedValues[2] = { Sra, Chain };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+                                         getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (VT.isVector()) {
+    unsigned NumElemVT = VT.getVectorNumElements();
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Loads[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+                             Chain, Ptr,
+                             DAG.getTargetConstant(Channel, MVT::i32),
+                             Op.getOperand(2));
+    }
+    for (unsigned i = NumElemVT; i < 4; ++i) {
+      Loads[i] = DAG.getUNDEF(ElemVT);
+    }
+    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
+    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
+  } else {
+    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+                              Chain, Ptr,
+                              DAG.getTargetConstant(0, MVT::i32), // Channel
+                              Op.getOperand(2));
+  }
+
+  SDValue Ops[2] = {
+    LoweredLoad,
+    Chain
+  };
+
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Jump  = Op.getOperand(2);
+
+  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
+                     Chain, Jump, Cond);
+}
+
+/// XXX Only kernel functions are supported, so we can assume for now that
+/// every function is a kernel function, but in the future we should use
+/// separate calling conventions for kernel and non-kernel functions.
+SDValue R600TargetLowering::LowerFormalArguments(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SDLoc DL, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
+
+  SmallVector<ISD::InputArg, 8> LocalIns;
+
+  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
+
+  AnalyzeFormalArguments(CCInfo, LocalIns);
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    EVT VT = Ins[i].VT;
+    EVT MemVT = LocalIns[i].VT;
+
+    if (ShaderType != ShaderType::COMPUTE) {
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Register);
+      continue;
+    }
+
+    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                                   AMDGPUAS::CONSTANT_BUFFER_0);
+
+    // i64 isn't a legal type, so the register type used ends up as i32, which
+    // isn't expected here. It attempts to create this sextload, but it ends up
+    // being invalid. Somehow this seems to work with i64 arguments, but breaks
+    // for <1 x i64>.
+
+    // The first 36 bytes of the input buffer contains information about
+    // thread group and global sizes.
+
+    // FIXME: This should really check the extload type, but the handling of
+    // extload vecto parameters seems to be broken.
+    //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+    ISD::LoadExtType Ext = ISD::SEXTLOAD;
+    SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
+                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
+                                 MachinePointerInfo(UndefValue::get(PtrTy)),
+                                 MemVT, false, false, 4);
+
+    // 4 is the preferred alignment for the CONSTANT memory space.
+    InVals.push_back(Arg);
+  }
+  return Chain;
+}
+
+EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+   if (!VT.isVector())
+     return MVT::i32;
+   return VT.changeVectorElementTypeToInteger();
+}
+
+static SDValue CompactSwizzlableVector(
+  SelectionDAG &DAG, SDValue VectorEntry,
+  DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+    VectorEntry.getOperand(0),
+    VectorEntry.getOperand(1),
+    VectorEntry.getOperand(2),
+    VectorEntry.getOperand(3)
+  };
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      // We mask write here to teach later passes that the ith element of this
+      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
+      // break false dependencies and additionnaly make assembly easier to read.
+      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
+    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
+      if (C->isZero()) {
+        RemapSwizzle[i] = 4; // SEL_0
+        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+      } else if (C->isExactlyValue(1.0)) {
+        RemapSwizzle[i] = 5; // SEL_1
+        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+      }
+    }
+
+    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      continue;
+    for (unsigned j = 0; j < i; j++) {
+      if (NewBldVec[i] == NewBldVec[j]) {
+        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
+        RemapSwizzle[i] = j;
+        break;
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
+                     VectorEntry.getValueType(), NewBldVec);
+}
+
+static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
+                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+      VectorEntry.getOperand(0),
+      VectorEntry.getOperand(1),
+      VectorEntry.getOperand(2),
+      VectorEntry.getOperand(3)
+  };
+  bool isUnmovable[4] = { false, false, false, false };
+  for (unsigned i = 0; i < 4; i++) {
+    RemapSwizzle[i] = i;
+    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+          ->getZExtValue();
+      if (i == Idx)
+        isUnmovable[Idx] = true;
+    }
+  }
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+          ->getZExtValue();
+      if (isUnmovable[Idx])
+        continue;
+      // Swap i and Idx
+      std::swap(NewBldVec[Idx], NewBldVec[i]);
+      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+      break;
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
+                     VectorEntry.getValueType(), NewBldVec);
+}
+
+
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
+SDValue Swz[4], SelectionDAG &DAG) const {
+  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
+  // Old -> New swizzle values
+  DenseMap<unsigned, unsigned> SwizzleRemap;
+
+  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
+    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
+  }
+
+  SwizzleRemap.clear();
+  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
+    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
+  }
+
+  return BuildVector;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  switch (N->getOpcode()) {
+  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
+  case ISD::FP_ROUND: {
+      SDValue Arg = N->getOperand(0);
+      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
+        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
+                           Arg.getOperand(0));
+      }
+      break;
+    }
+
+  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
+  // (i32 select_cc f32, f32, -1, 0 cc)
+  //
+  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
+  // this to one of the SET*_DX10 instructions.
+  case ISD::FP_TO_SINT: {
+    SDValue FNeg = N->getOperand(0);
+    if (FNeg.getOpcode() != ISD::FNEG) {
+      return SDValue();
+    }
+    SDValue SelectCC = FNeg.getOperand(0);
+    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
+        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
+        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
+        !isHWTrueValue(SelectCC.getOperand(2)) ||
+        !isHWFalseValue(SelectCC.getOperand(3))) {
+      return SDValue();
+    }
+
+    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
+                           SelectCC.getOperand(0), // LHS
+                           SelectCC.getOperand(1), // RHS
+                           DAG.getConstant(-1, MVT::i32), // True
+                           DAG.getConstant(0, MVT::i32),  // Flase
+                           SelectCC.getOperand(4)); // CC
+
+    break;
+  }
+
+  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+  // => build_vector elt0, ... , NewEltIdx, ... , eltN
+  case ISD::INSERT_VECTOR_ELT: {
+    SDValue InVec = N->getOperand(0);
+    SDValue InVal = N->getOperand(1);
+    SDValue EltNo = N->getOperand(2);
+    SDLoc dl(N);
+
+    // If the inserted element is an UNDEF, just use the input vector.
+    if (InVal.getOpcode() == ISD::UNDEF)
+      return InVec;
+
+    EVT VT = InVec.getValueType();
+
+    // If we can't generate a legal BUILD_VECTOR, exit
+    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
+      return SDValue();
+
+    // Check that we know which element is being inserted
+    if (!isa<ConstantSDNode>(EltNo))
+      return SDValue();
+    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
+    // vector elements.
+    SmallVector<SDValue, 8> Ops;
+    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+      Ops.append(InVec.getNode()->op_begin(),
+                 InVec.getNode()->op_end());
+    } else if (InVec.getOpcode() == ISD::UNDEF) {
+      unsigned NElts = VT.getVectorNumElements();
+      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+    } else {
+      return SDValue();
+    }
+
+    // Insert the element
+    if (Elt < Ops.size()) {
+      // All the operands of BUILD_VECTOR must have the same type;
+      // we enforce that here.
+      EVT OpVT = Ops[0].getValueType();
+      if (InVal.getValueType() != OpVT)
+        InVal = OpVT.bitsGT(InVal.getValueType()) ?
+          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
+          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
+      Ops[Elt] = InVal;
+    }
+
+    // Return the new vector
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+  }
+
+  // Extract_vec (Build_vector) generated by custom lowering
+  // also needs to be customly combined
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Arg = N->getOperand(0);
+    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return Arg->getOperand(Element);
+      }
+    }
+    if (Arg.getOpcode() == ISD::BITCAST &&
+        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
+            Arg->getOperand(0).getOperand(Element));
+      }
+    }
+  }
+
+  case ISD::SELECT_CC: {
+    // Try common optimizations
+    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+    if (Ret.getNode())
+      return Ret;
+
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
+    //      selectcc x, y, a, b, inv(cc)
+    //
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+    //      selectcc x, y, a, b, cc
+    SDValue LHS = N->getOperand(0);
+    if (LHS.getOpcode() != ISD::SELECT_CC) {
+      return SDValue();
+    }
+
+    SDValue RHS = N->getOperand(1);
+    SDValue True = N->getOperand(2);
+    SDValue False = N->getOperand(3);
+    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+    if (LHS.getOperand(2).getNode() != True.getNode() ||
+        LHS.getOperand(3).getNode() != False.getNode() ||
+        RHS.getNode() != False.getNode()) {
+      return SDValue();
+    }
+
+    switch (NCC) {
+    default: return SDValue();
+    case ISD::SETNE: return LHS;
+    case ISD::SETEQ: {
+      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+      LHSCC = ISD::getSetCCInverse(LHSCC,
+                                  LHS.getOperand(0).getValueType().isInteger());
+      if (DCI.isBeforeLegalizeOps() ||
+          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
+        return DAG.getSelectCC(SDLoc(N),
+                               LHS.getOperand(0),
+                               LHS.getOperand(1),
+                               LHS.getOperand(2),
+                               LHS.getOperand(3),
+                               LHSCC);
+      break;
+    }
+    }
+    return SDValue();
+  }
+
+  case AMDGPUISD::EXPORT: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+
+    SDValue NewArgs[8] = {
+      N->getOperand(0), // Chain
+      SDValue(),
+      N->getOperand(2), // ArrayBase
+      N->getOperand(3), // Type
+      N->getOperand(4), // SWZ_X
+      N->getOperand(5), // SWZ_Y
+      N->getOperand(6), // SWZ_Z
+      N->getOperand(7) // SWZ_W
+    };
+    SDLoc DL(N);
+    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
+  }
+  case AMDGPUISD::TEXTURE_FETCH: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+
+    SDValue NewArgs[19] = {
+      N->getOperand(0),
+      N->getOperand(1),
+      N->getOperand(2),
+      N->getOperand(3),
+      N->getOperand(4),
+      N->getOperand(5),
+      N->getOperand(6),
+      N->getOperand(7),
+      N->getOperand(8),
+      N->getOperand(9),
+      N->getOperand(10),
+      N->getOperand(11),
+      N->getOperand(12),
+      N->getOperand(13),
+      N->getOperand(14),
+      N->getOperand(15),
+      N->getOperand(16),
+      N->getOperand(17),
+      N->getOperand(18),
+    };
+    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
+    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
+        NewArgs);
+  }
+  }
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+static bool
+FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
+            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
+  const R600InstrInfo *TII =
+      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+  if (!Src.isMachineOpcode())
+    return false;
+  switch (Src.getMachineOpcode()) {
+  case AMDGPU::FNEG_R600:
+    if (!Neg.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Neg = DAG.getTargetConstant(1, MVT::i32);
+    return true;
+  case AMDGPU::FABS_R600:
+    if (!Abs.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Abs = DAG.getTargetConstant(1, MVT::i32);
+    return true;
+  case AMDGPU::CONST_COPY: {
+    unsigned Opcode = ParentNode->getMachineOpcode();
+    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+
+    if (!Sel.getNode())
+      return false;
+
+    SDValue CstOffset = Src.getOperand(0);
+    if (ParentNode->getValueType(0).isVector())
+      return false;
+
+    // Gather constants values
+    int SrcIndices[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+    };
+    std::vector<unsigned> Consts;
+    for (int OtherSrcIdx : SrcIndices) {
+      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
+      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
+        continue;
+      if (HasDst) {
+        OtherSrcIdx--;
+        OtherSelIdx--;
+      }
+      if (RegisterSDNode *Reg =
+          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
+        if (Reg->getReg() == AMDGPU::ALU_CONST) {
+          ConstantSDNode *Cst
+            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
+          Consts.push_back(Cst->getZExtValue());
+        }
+      }
+    }
+
+    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
+    Consts.push_back(Cst->getZExtValue());
+    if (!TII->fitsConstReadLimitations(Consts)) {
+      return false;
+    }
+
+    Sel = CstOffset;
+    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+    return true;
+  }
+  case AMDGPU::MOV_IMM_I32:
+  case AMDGPU::MOV_IMM_F32: {
+    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+    uint64_t ImmValue = 0;
+
+
+    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+      float FloatValue = FPC->getValueAPF().convertToFloat();
+      if (FloatValue == 0.0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (FloatValue == 0.5) {
+        ImmReg = AMDGPU::HALF;
+      } else if (FloatValue == 1.0) {
+        ImmReg = AMDGPU::ONE;
+      } else {
+        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+      }
+    } else {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
+      uint64_t Value = C->getZExtValue();
+      if (Value == 0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (Value == 1) {
+        ImmReg = AMDGPU::ONE_INT;
+      } else {
+        ImmValue = Value;
+      }
+    }
+
+    // Check that we aren't already using an immediate.
+    // XXX: It's possible for an instruction to have more than one
+    // immediate operand, but this is not supported yet.
+    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+      if (!Imm.getNode())
+        return false;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
+      assert(C);
+      if (C->getZExtValue())
+        return false;
+      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
+    }
+    Src = DAG.getRegister(ImmReg, MVT::i32);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+
+/// \brief Fold the instructions after selecting them
+SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
+                                            SelectionDAG &DAG) const {
+  const R600InstrInfo *TII =
+      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+  if (!Node->isMachineOpcode())
+    return Node;
+  unsigned Opcode = Node->getMachineOpcode();
+  SDValue FakeOp;
+
+  std::vector<SDValue> Ops;
+  for (const SDUse &I : Node->ops())
+    Ops.push_back(I);
+
+  if (Opcode == AMDGPU::DOT_4) {
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+        };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+    };
+    for (unsigned i = 0; i < 8; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue &Abs = Ops[AbsIdx[i] - 1];
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      if (HasDst)
+        SelIdx--;
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
+      SDValue &Src = Ops[i];
+      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::CLAMP_R600) {
+    SDValue Src = Node->getOperand(0);
+    if (!Src.isMachineOpcode() ||
+        !TII->hasInstrModifiers(Src.getMachineOpcode()))
+      return Node;
+    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
+        AMDGPU::OpName::clamp);
+    if (ClampIdx < 0)
+      return Node;
+    std::vector<SDValue> Ops;
+    unsigned NumOp = Src.getNumOperands();
+    for(unsigned i = 0; i < NumOp; ++i)
+          Ops.push_back(Src.getOperand(i));
+    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
+    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
+        Node->getVTList(), Ops);
+  } else {
+    if (!TII->hasInstrModifiers(Opcode))
+      return Node;
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+    };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+      -1
+    };
+    for (unsigned i = 0; i < 3; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue FakeAbs;
+      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+      if (HasDst) {
+        SelIdx--;
+        ImmIdx--;
+      }
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      SDValue &Imm = Ops[ImmIdx];
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  }
+
+  return Node;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.h b/contrib/llvm/lib/Target/R600/R600ISelLowering.h
new file mode 100644
index 0000000..d22c8c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.h
@@ -0,0 +1,77 @@
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600ISELLOWERING_H
+#define R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+
+class R600TargetLowering : public AMDGPUTargetLowering {
+public:
+  R600TargetLowering(TargetMachine &TM);
+  MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+      MachineBasicBlock * BB) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+  SDValue LowerFormalArguments(
+                              SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SDLoc DL, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const override;
+  EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
+private:
+  unsigned Gen;
+  /// Each OpenCL kernel has nine implicit parameters that are stored in the
+  /// first nine dwords of a Vertex Buffer.  These implicit parameters are
+  /// lowered to load instructions which retrieve the values from the Vertex
+  /// Buffer.
+  SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
+                                 SDLoc DL, unsigned DwordOffset) const;
+
+  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+      MachineRegisterInfo & MRI, unsigned dword_offset) const;
+  SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
+  SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
+
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
+                                          SelectionDAG &DAG) const;
+  void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
+                       unsigned &Channel, unsigned &PtrIncr) const;
+  bool isZero(SDValue Op) const;
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+};
+
+} // End namespace llvm;
+
+#endif // R600ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/R600InstrFormats.td b/contrib/llvm/lib/Target/R600/R600InstrFormats.td
new file mode 100644
index 0000000..9428bab
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600InstrFormats.td
@@ -0,0 +1,492 @@
+//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
+                InstrItinClass itin>
+    : AMDGPUInst <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+  bit Trig = 0;
+  bit Op3 = 0;
+  bit isVector = 0;
+  bits<2> FlagOperandIdx = 0;
+  bit Op1 = 0;
+  bit Op2 = 0;
+  bit LDS_1A = 0;
+  bit LDS_1A1D = 0;
+  bit HasNativeOperands = 0;
+  bit VTXInst = 0;
+  bit TEXInst = 0;
+  bit ALUInst = 0;
+  bit IsExport = 0;
+  bit LDS_1A2D = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = itin;
+
+  let TSFlags{4} = Trig;
+  let TSFlags{5} = Op3;
+
+  // Vector instructions are instructions that must fill all slots in an
+  // instruction group
+  let TSFlags{6} = isVector;
+  let TSFlags{8-7} = FlagOperandIdx;
+  let TSFlags{9} = HasNativeOperands;
+  let TSFlags{10} = Op1;
+  let TSFlags{11} = Op2;
+  let TSFlags{12} = VTXInst;
+  let TSFlags{13} = TEXInst;
+  let TSFlags{14} = ALUInst;
+  let TSFlags{15} = LDS_1A;
+  let TSFlags{16} = LDS_1A1D;
+  let TSFlags{17} = IsExport;
+  let TSFlags{18} = LDS_1A2D;
+}
+
+//===----------------------------------------------------------------------===//
+// ALU instructions
+//===----------------------------------------------------------------------===//
+
+class R600_ALU_LDS_Word0 {
+  field bits<32> Word0;
+
+  bits<11> src0;
+  bits<1>  src0_rel;
+  bits<11> src1;
+  bits<1>  src1_rel;
+  bits<3>  index_mode = 0;
+  bits<2>  pred_sel;
+  bits<1>  last;
+
+  bits<9>  src0_sel  = src0{8-0};
+  bits<2>  src0_chan = src0{10-9};
+  bits<9>  src1_sel  = src1{8-0};
+  bits<2>  src1_chan = src1{10-9};
+
+  let Word0{8-0}   = src0_sel;
+  let Word0{9}     = src0_rel;
+  let Word0{11-10} = src0_chan;
+  let Word0{21-13} = src1_sel;
+  let Word0{22}    = src1_rel;
+  let Word0{24-23} = src1_chan;
+  let Word0{28-26} = index_mode;
+  let Word0{30-29} = pred_sel;
+  let Word0{31}    = last;
+}
+
+class R600ALU_Word0 : R600_ALU_LDS_Word0 {
+
+  bits<1>  src0_neg;
+  bits<1>  src1_neg;
+
+  let Word0{12}    = src0_neg;
+  let Word0{25}    = src1_neg;
+}
+
+class R600ALU_Word1 {
+  field bits<32> Word1;
+
+  bits<11> dst;
+  bits<3>  bank_swizzle;
+  bits<1>  dst_rel;
+  bits<1>  clamp;
+
+  bits<7>  dst_sel  = dst{6-0};
+  bits<2>  dst_chan = dst{10-9};
+
+  let Word1{20-18} = bank_swizzle;
+  let Word1{27-21} = dst_sel;
+  let Word1{28}    = dst_rel;
+  let Word1{30-29} = dst_chan;
+  let Word1{31}    = clamp;
+}
+
+class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
+
+  bits<1>  src0_abs;
+  bits<1>  src1_abs;
+  bits<1>  update_exec_mask;
+  bits<1>  update_pred;
+  bits<1>  write;
+  bits<2>  omod;
+
+  let Word1{0}     = src0_abs;
+  let Word1{1}     = src1_abs;
+  let Word1{2}     = update_exec_mask;
+  let Word1{3}     = update_pred;
+  let Word1{4}     = write;
+  let Word1{6-5}   = omod;
+  let Word1{17-7}  = alu_inst;
+}
+
+class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
+
+  bits<11> src2;
+  bits<1>  src2_rel;
+  bits<1>  src2_neg;
+
+  bits<9>  src2_sel = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{12}    = src2_neg;
+  let Word1{17-13} = alu_inst;
+}
+
+class R600LDS_Word1 {
+  field bits<32> Word1;
+
+  bits<11> src2;
+  bits<9>  src2_sel  = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+  bits<1>  src2_rel;
+  // offset specifies the stride offset to the second set of data to be read
+  // from.  This is a dword offset.
+  bits<5>  alu_inst = 17; // OP3_INST_LDS_IDX_OP
+  bits<3>  bank_swizzle;
+  bits<6>  lds_op;
+  bits<2>  dst_chan = 0;
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{17-13} = alu_inst;
+  let Word1{20-18} = bank_swizzle;
+  let Word1{26-21} = lds_op;
+  let Word1{30-29} = dst_chan;
+}
+
+
+/*
+XXX: R600 subtarget uses a slightly different encoding than the other
+subtargets.  We currently handle this in R600MCCodeEmitter, but we may
+want to use these instruction classes in the future.
+
+class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
+
+  bits<1>  fog_merge;
+  bits<10> alu_inst;
+
+  let Inst{37}    = fog_merge;
+  let Inst{39-38} = omod;
+  let Inst{49-40} = alu_inst;
+}
+
+class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
+
+  bits<11> alu_inst;
+
+  let Inst{38-37} = omod;
+  let Inst{49-39} = alu_inst;
+}
+*/
+
+//===----------------------------------------------------------------------===//
+// Vertex Fetch instructions
+//===----------------------------------------------------------------------===//
+
+class VTX_WORD0 {
+  field bits<32> Word0;
+  bits<7> src_gpr;
+  bits<5> VC_INST;
+  bits<2> FETCH_TYPE;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> BUFFER_ID;
+  bits<1> SRC_REL;
+  bits<2> SRC_SEL_X;
+
+  let Word0{4-0}   = VC_INST;
+  let Word0{6-5}   = FETCH_TYPE;
+  let Word0{7}     = FETCH_WHOLE_QUAD;
+  let Word0{15-8}  = BUFFER_ID;
+  let Word0{22-16} = src_gpr;
+  let Word0{23}    = SRC_REL;
+  let Word0{25-24} = SRC_SEL_X;
+}
+
+class VTX_WORD0_eg : VTX_WORD0 {
+
+  bits<6> MEGA_FETCH_COUNT;
+
+  let Word0{31-26} = MEGA_FETCH_COUNT;
+}
+
+class VTX_WORD0_cm : VTX_WORD0 {
+
+  bits<2> SRC_SEL_Y;
+  bits<2> STRUCTURED_READ;
+  bits<1> LDS_REQ;
+  bits<1> COALESCED_READ;
+
+  let Word0{27-26} = SRC_SEL_Y;
+  let Word0{29-28} = STRUCTURED_READ;
+  let Word0{30}    = LDS_REQ;
+  let Word0{31}    = COALESCED_READ;
+}
+
+class VTX_WORD1_GPR {
+  field bits<32> Word1;
+  bits<7> dst_gpr;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<1> USE_CONST_FIELDS;
+  bits<6> DATA_FORMAT;
+  bits<2> NUM_FORMAT_ALL;
+  bits<1> FORMAT_COMP_ALL;
+  bits<1> SRF_MODE_ALL;
+
+  let Word1{6-0} = dst_gpr;
+  let Word1{7}    = DST_REL;
+  let Word1{8}    = 0; // Reserved
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{21}    = USE_CONST_FIELDS;
+  let Word1{27-22} = DATA_FORMAT;
+  let Word1{29-28} = NUM_FORMAT_ALL;
+  let Word1{30}    = FORMAT_COMP_ALL;
+  let Word1{31}    = SRF_MODE_ALL;
+}
+
+//===----------------------------------------------------------------------===//
+// Texture fetch instructions
+//===----------------------------------------------------------------------===//
+
+class TEX_WORD0 {
+  field bits<32> Word0;
+
+  bits<5> TEX_INST;
+  bits<2> INST_MOD;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> RESOURCE_ID;
+  bits<7> SRC_GPR;
+  bits<1> SRC_REL;
+  bits<1> ALT_CONST;
+  bits<2> RESOURCE_INDEX_MODE;
+  bits<2> SAMPLER_INDEX_MODE;
+
+  let Word0{4-0} = TEX_INST;
+  let Word0{6-5} = INST_MOD;
+  let Word0{7} = FETCH_WHOLE_QUAD;
+  let Word0{15-8} = RESOURCE_ID;
+  let Word0{22-16} = SRC_GPR;
+  let Word0{23} = SRC_REL;
+  let Word0{24} = ALT_CONST;
+  let Word0{26-25} = RESOURCE_INDEX_MODE;
+  let Word0{28-27} = SAMPLER_INDEX_MODE;
+}
+
+class TEX_WORD1 {
+  field bits<32> Word1;
+
+  bits<7> DST_GPR;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<7> LOD_BIAS;
+  bits<1> COORD_TYPE_X;
+  bits<1> COORD_TYPE_Y;
+  bits<1> COORD_TYPE_Z;
+  bits<1> COORD_TYPE_W;
+
+  let Word1{6-0} = DST_GPR;
+  let Word1{7} = DST_REL;
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{27-21} = LOD_BIAS;
+  let Word1{28} = COORD_TYPE_X;
+  let Word1{29} = COORD_TYPE_Y;
+  let Word1{30} = COORD_TYPE_Z;
+  let Word1{31} = COORD_TYPE_W;
+}
+
+class TEX_WORD2 {
+  field bits<32> Word2;
+
+  bits<5> OFFSET_X;
+  bits<5> OFFSET_Y;
+  bits<5> OFFSET_Z;
+  bits<5> SAMPLER_ID;
+  bits<3> SRC_SEL_X;
+  bits<3> SRC_SEL_Y;
+  bits<3> SRC_SEL_Z;
+  bits<3> SRC_SEL_W;
+
+  let Word2{4-0} = OFFSET_X;
+  let Word2{9-5} = OFFSET_Y;
+  let Word2{14-10} = OFFSET_Z;
+  let Word2{19-15} = SAMPLER_ID;
+  let Word2{22-20} = SRC_SEL_X;
+  let Word2{25-23} = SRC_SEL_Y;
+  let Word2{28-26} = SRC_SEL_Z;
+  let Word2{31-29} = SRC_SEL_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+class CF_WORD1_R600 {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<3> COUNT;
+  bits<6> CALL_COUNT;
+  bits<1> COUNT_3;
+  bits<1> END_OF_PROGRAM;
+  bits<1> VALID_PIXEL_MODE;
+  bits<7> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{12-10} = COUNT;
+  let Word1{18-13} = CALL_COUNT;
+  let Word1{19} = COUNT_3;
+  let Word1{21} = END_OF_PROGRAM;
+  let Word1{22} = VALID_PIXEL_MODE;
+  let Word1{29-23} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class CF_WORD0_EG {
+  field bits<32> Word0;
+
+  bits<24> ADDR;
+  bits<3> JUMPTABLE_SEL;
+
+  let Word0{23-0} = ADDR;
+  let Word0{26-24} = JUMPTABLE_SEL;
+}
+
+class CF_WORD1_EG {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<6> COUNT;
+  bits<1> VALID_PIXEL_MODE;
+  bits<1> END_OF_PROGRAM;
+  bits<8> CF_INST;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{15-10} = COUNT;
+  let Word1{20} = VALID_PIXEL_MODE;
+  let Word1{21} = END_OF_PROGRAM;
+  let Word1{29-22} = CF_INST;
+  let Word1{31} = BARRIER;
+}
+
+class CF_ALU_WORD0 {
+  field bits<32> Word0;
+
+  bits<22> ADDR;
+  bits<4> KCACHE_BANK0;
+  bits<4> KCACHE_BANK1;
+  bits<2> KCACHE_MODE0;
+
+  let Word0{21-0} = ADDR;
+  let Word0{25-22} = KCACHE_BANK0;
+  let Word0{29-26} = KCACHE_BANK1;
+  let Word0{31-30} = KCACHE_MODE0;
+}
+
+class CF_ALU_WORD1 {
+  field bits<32> Word1;
+
+  bits<2> KCACHE_MODE1;
+  bits<8> KCACHE_ADDR0;
+  bits<8> KCACHE_ADDR1;
+  bits<7> COUNT;
+  bits<1> ALT_CONST;
+  bits<4> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{1-0} = KCACHE_MODE1;
+  let Word1{9-2} = KCACHE_ADDR0;
+  let Word1{17-10} = KCACHE_ADDR1;
+  let Word1{24-18} = COUNT;
+  let Word1{25} = ALT_CONST;
+  let Word1{29-26} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class CF_ALLOC_EXPORT_WORD0_RAT {
+  field bits<32> Word0;
+
+  bits<4> rat_id;
+  bits<6> rat_inst;
+  bits<2> rim;
+  bits<2> type;
+  bits<7> rw_gpr;
+  bits<1> rw_rel;
+  bits<7> index_gpr;
+  bits<2> elem_size;
+
+  let Word0{3-0}   = rat_id;
+  let Word0{9-4}   = rat_inst;
+  let Word0{10}    = 0; // Reserved
+  let Word0{12-11} = rim;
+  let Word0{14-13} = type;
+  let Word0{21-15} = rw_gpr;
+  let Word0{22}    = rw_rel;
+  let Word0{29-23} = index_gpr;
+  let Word0{31-30} = elem_size;
+}
+
+class CF_ALLOC_EXPORT_WORD1_BUF {
+  field bits<32> Word1;
+
+  bits<12> array_size;
+  bits<4>  comp_mask;
+  bits<4>  burst_count;
+  bits<1>  vpm;
+  bits<1>  eop;
+  bits<8>  cf_inst;
+  bits<1>  mark;
+  bits<1>  barrier;
+
+  let Word1{11-0} = array_size;
+  let Word1{15-12} = comp_mask;
+  let Word1{19-16} = burst_count;
+  let Word1{20}    = vpm;
+  let Word1{21}    = eop;
+  let Word1{29-22} = cf_inst;
+  let Word1{30}    = mark;
+  let Word1{31}    = barrier;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp b/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp
new file mode 100644
index 0000000..99920b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp
@@ -0,0 +1,1441 @@
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600InstrInfo.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenDFAPacketizer.inc"
+
+R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st)
+  : AMDGPUInstrInfo(st),
+    RI(st)
+  { }
+
+const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
+}
+
+bool R600InstrInfo::isVector(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
+}
+
+void
+R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const {
+  unsigned VectorComponents = 0;
+  if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
+      AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+      (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
+       AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
+    VectorComponents = 4;
+  } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
+            AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+            (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
+             AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
+    VectorComponents = 2;
+  }
+
+  if (VectorComponents > 0) {
+    for (unsigned I = 0; I < VectorComponents; I++) {
+      unsigned SubRegIndex = RI.getSubRegFromChannel(I);
+      buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                              RI.getSubReg(DestReg, SubRegIndex),
+                              RI.getSubReg(SrcReg, SubRegIndex))
+                              .addReg(DestReg,
+                                      RegState::Define | RegState::Implicit);
+    }
+  } else {
+    MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                                                  DestReg, SrcReg);
+    NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
+                                    .setIsKill(KillSrc);
+  }
+}
+
+/// \returns true if \p MBBI can be moved into a new basic.
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI) const {
+  for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
+                                        E = MBBI->operands_end(); I != E; ++I) {
+    if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
+        I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+      return false;
+  }
+  return true;
+}
+
+bool R600InstrInfo::isMov(unsigned Opcode) const {
+
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::MOV:
+  case AMDGPU::MOV_IMM_F32:
+  case AMDGPU::MOV_IMM_I32:
+    return true;
+  }
+}
+
+// Some instructions act as place holders to emulate operations that the GPU
+// hardware does automatically. This function can be used to check if
+// an opcode falls into this category.
+bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return false;
+  case AMDGPU::RETURN:
+    return true;
+  }
+}
+
+bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
+  return false;
+}
+
+bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
+  switch(Opcode) {
+    default: return false;
+    case AMDGPU::CUBE_r600_pseudo:
+    case AMDGPU::CUBE_r600_real:
+    case AMDGPU::CUBE_eg_pseudo:
+    case AMDGPU::CUBE_eg_real:
+      return true;
+  }
+}
+
+bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return (TargetFlags & R600_InstFlag::ALU_INST);
+}
+
+bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::OP1) |
+          (TargetFlags & R600_InstFlag::OP2) |
+          (TargetFlags & R600_InstFlag::OP3));
+}
+
+bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::LDS_1A) |
+          (TargetFlags & R600_InstFlag::LDS_1A1D) |
+          (TargetFlags & R600_InstFlag::LDS_1A2D));
+}
+
+bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1;
+}
+
+bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+}
+
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
+  if (isALUInstr(MI->getOpcode()))
+    return true;
+  if (isVector(*MI) || isCubeOp(MI->getOpcode()))
+    return true;
+  switch (MI->getOpcode()) {
+  case AMDGPU::PRED_X:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::COPY:
+  case AMDGPU::DOT_4:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
+  if (ST.hasCaymanISA())
+    return false;
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
+}
+
+bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
+  return isTransOnly(MI->getOpcode());
+}
+
+bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+}
+
+bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const {
+  return isVectorOnly(MI->getOpcode());
+}
+
+bool R600InstrInfo::isExport(unsigned Opcode) const {
+  return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT);
+}
+
+bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
+  return ST.hasVertexCache() && IS_VTX(get(Opcode));
+}
+
+bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+  return MFI->getShaderType() != ShaderType::COMPUTE &&
+    usesVertexCache(MI->getOpcode());
+}
+
+bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
+  return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode));
+}
+
+bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+  return (MFI->getShaderType() == ShaderType::COMPUTE &&
+          usesVertexCache(MI->getOpcode())) ||
+    usesTextureCache(MI->getOpcode());
+}
+
+bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::KILLGT:
+  case AMDGPU::GROUP_BARRIER:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const {
+  return  MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const {
+  return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
+  if (!isALUInstr(MI->getOpcode())) {
+    return false;
+  }
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+                                        E = MI->operands_end(); I != E; ++I) {
+    if (!I->isReg() || !I->isUse() ||
+        TargetRegisterInfo::isVirtualRegister(I->getReg()))
+      continue;
+
+    if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+      return true;
+  }
+  return false;
+}
+
+int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const {
+  static const unsigned OpTable[] = {
+    AMDGPU::OpName::src0,
+    AMDGPU::OpName::src1,
+    AMDGPU::OpName::src2
+  };
+
+  assert (SrcNum < 3);
+  return getOperandIdx(Opcode, OpTable[SrcNum]);
+}
+
+#define SRC_SEL_ROWS 11
+int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
+  static const unsigned SrcSelTable[SRC_SEL_ROWS][2] = {
+    {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+    {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+    {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+    {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+    {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+    {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+    {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+    {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+    {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+    {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+    {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+  };
+
+  for (unsigned i = 0; i < SRC_SEL_ROWS; ++i) {
+    if (getOperandIdx(Opcode, SrcSelTable[i][0]) == (int)SrcIdx) {
+      return getOperandIdx(Opcode, SrcSelTable[i][1]);
+    }
+  }
+  return -1;
+}
+#undef SRC_SEL_ROWS
+
+SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+R600InstrInfo::getSrcs(MachineInstr *MI) const {
+  SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
+
+  if (MI->getOpcode() == AMDGPU::DOT_4) {
+    static const unsigned OpTable[8][2] = {
+      {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+      {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+      {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+      {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+      {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+      {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+      {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+      {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+    };
+
+    for (unsigned j = 0; j < 8; j++) {
+      MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+                                                        OpTable[j][0]));
+      unsigned Reg = MO.getReg();
+      if (Reg == AMDGPU::ALU_CONST) {
+        unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(),
+                                                    OpTable[j][1])).getImm();
+        Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+        continue;
+      }
+
+    }
+    return Result;
+  }
+
+  static const unsigned OpTable[3][2] = {
+    {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+    {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+    {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+  };
+
+  for (unsigned j = 0; j < 3; j++) {
+    int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+    if (SrcIdx < 0)
+      break;
+    MachineOperand &MO = MI->getOperand(SrcIdx);
+    unsigned Reg = MI->getOperand(SrcIdx).getReg();
+    if (Reg == AMDGPU::ALU_CONST) {
+      unsigned Sel = MI->getOperand(
+          getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+      Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+      continue;
+    }
+    if (Reg == AMDGPU::ALU_LITERAL_X) {
+      unsigned Imm = MI->getOperand(
+          getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm();
+      Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm));
+      continue;
+    }
+    Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0));
+  }
+  return Result;
+}
+
+std::vector<std::pair<int, unsigned> >
+R600InstrInfo::ExtractSrcs(MachineInstr *MI,
+                           const DenseMap<unsigned, unsigned> &PV,
+                           unsigned &ConstCount) const {
+  ConstCount = 0;
+  const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI);
+  const std::pair<int, unsigned> DummyPair(-1, 0);
+  std::vector<std::pair<int, unsigned> > Result;
+  unsigned i = 0;
+  for (unsigned n = Srcs.size(); i < n; ++i) {
+    unsigned Reg = Srcs[i].first->getReg();
+    unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+    if (Reg == AMDGPU::OQAP) {
+      Result.push_back(std::pair<int, unsigned>(Index, 0));
+    }
+    if (PV.find(Reg) != PV.end()) {
+      // 255 is used to tells its a PS/PV reg
+      Result.push_back(std::pair<int, unsigned>(255, 0));
+      continue;
+    }
+    if (Index > 127) {
+      ConstCount++;
+      Result.push_back(DummyPair);
+      continue;
+    }
+    unsigned Chan = RI.getHWRegChan(Reg);
+    Result.push_back(std::pair<int, unsigned>(Index, Chan));
+  }
+  for (; i < 3; ++i)
+    Result.push_back(DummyPair);
+  return Result;
+}
+
+static std::vector<std::pair<int, unsigned> >
+Swizzle(std::vector<std::pair<int, unsigned> > Src,
+        R600InstrInfo::BankSwizzle Swz) {
+  if (Src[0] == Src[1])
+    Src[1].first = -1;
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210:
+    break;
+  case R600InstrInfo::ALU_VEC_021_SCL_122:
+    std::swap(Src[1], Src[2]);
+    break;
+  case R600InstrInfo::ALU_VEC_102_SCL_221:
+    std::swap(Src[0], Src[1]);
+    break;
+  case R600InstrInfo::ALU_VEC_120_SCL_212:
+    std::swap(Src[0], Src[1]);
+    std::swap(Src[0], Src[2]);
+    break;
+  case R600InstrInfo::ALU_VEC_201:
+    std::swap(Src[0], Src[2]);
+    std::swap(Src[0], Src[1]);
+    break;
+  case R600InstrInfo::ALU_VEC_210:
+    std::swap(Src[0], Src[2]);
+    break;
+  }
+  return Src;
+}
+
+static unsigned
+getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210: {
+    unsigned Cycles[3] = { 2, 1, 0};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_021_SCL_122: {
+    unsigned Cycles[3] = { 1, 2, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_120_SCL_212: {
+    unsigned Cycles[3] = { 2, 1, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_102_SCL_221: {
+    unsigned Cycles[3] = { 2, 2, 1};
+    return Cycles[Op];
+  }
+  default:
+    llvm_unreachable("Wrong Swizzle for Trans Slot");
+    return 0;
+  }
+}
+
+/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed
+/// in the same Instruction Group while meeting read port limitations given a
+/// Swz swizzle sequence.
+unsigned  R600InstrInfo::isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  int Vector[4][3];
+  memset(Vector, -1, sizeof(Vector));
+  for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
+    const std::vector<std::pair<int, unsigned> > &Srcs =
+        Swizzle(IGSrcs[i], Swz[i]);
+    for (unsigned j = 0; j < 3; j++) {
+      const std::pair<int, unsigned> &Src = Srcs[j];
+      if (Src.first < 0 || Src.first == 255)
+        continue;
+      if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+        if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
+            Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
+            // The value from output queue A (denoted by register OQAP) can
+            // only be fetched during the first cycle.
+            return false;
+        }
+        // OQAP does not count towards the normal read port restrictions
+        continue;
+      }
+      if (Vector[Src.second][j] < 0)
+        Vector[Src.second][j] = Src.first;
+      if (Vector[Src.second][j] != Src.first)
+        return i;
+    }
+  }
+  // Now check Trans Alu
+  for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransSrcs[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (Src.first == 255)
+      continue;
+    if (Vector[Src.second][Cycle] < 0)
+      Vector[Src.second][Cycle] = Src.first;
+    if (Vector[Src.second][Cycle] != Src.first)
+      return IGSrcs.size() - 1;
+  }
+  return IGSrcs.size();
+}
+
+/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
+/// (in lexicographic term) swizzle sequence assuming that all swizzles after
+/// Idx can be skipped
+static bool
+NextPossibleSolution(
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    unsigned Idx) {
+  assert(Idx < SwzCandidate.size());
+  int ResetIdx = Idx;
+  while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
+    ResetIdx --;
+  for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
+    SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
+  }
+  if (ResetIdx == -1)
+    return false;
+  int NextSwizzle = SwzCandidate[ResetIdx] + 1;
+  SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle;
+  return true;
+}
+
+/// Enumerate all possible Swizzle sequence to find one that can meet all
+/// read port requirements.
+bool R600InstrInfo::FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  unsigned ValidUpTo = 0;
+  do {
+    ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+    if (ValidUpTo == IGSrcs.size())
+      return true;
+  } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
+  return false;
+}
+
+/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
+/// a const, and can't read a gpr at cycle 1 if they read 2 const.
+static bool
+isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+                  const std::vector<std::pair<int, unsigned> > &TransOps,
+                  unsigned ConstCount) {
+  // TransALU can't read 3 constants
+  if (ConstCount > 2)
+    return false;
+  for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransOps[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (ConstCount > 0 && Cycle == 0)
+      return false;
+    if (ConstCount > 1 && Cycle == 1)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
+                                       const DenseMap<unsigned, unsigned> &PV,
+                                       std::vector<BankSwizzle> &ValidSwizzle,
+                                       bool isLastAluTrans)
+    const {
+  //Todo : support shared src0 - src1 operand
+
+  std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+  ValidSwizzle.clear();
+  unsigned ConstCount;
+  BankSwizzle TransBS = ALU_VEC_012_SCL_210;
+  for (unsigned i = 0, e = IG.size(); i < e; ++i) {
+    IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
+    unsigned Op = getOperandIdx(IG[i]->getOpcode(),
+        AMDGPU::OpName::bank_swizzle);
+    ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
+        IG[i]->getOperand(Op).getImm());
+  }
+  std::vector<std::pair<int, unsigned> > TransOps;
+  if (!isLastAluTrans)
+    return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+  TransOps = IGSrcs.back();
+  IGSrcs.pop_back();
+  ValidSwizzle.pop_back();
+
+  static const R600InstrInfo::BankSwizzle TransSwz[] = {
+    ALU_VEC_012_SCL_210,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221
+  };
+  for (unsigned i = 0; i < 4; i++) {
+    TransBS = TransSwz[i];
+    if (!isConstCompatible(TransBS, TransOps, ConstCount))
+      continue;
+    bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+        TransBS);
+    if (Result) {
+      ValidSwizzle.push_back(TransBS);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
+    const {
+  assert (Consts.size() <= 12 && "Too many operands in instructions group");
+  unsigned Pair1 = 0, Pair2 = 0;
+  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+    unsigned ReadConstHalf = Consts[i] & 2;
+    unsigned ReadConstIndex = Consts[i] & (~3);
+    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
+    if (!Pair1) {
+      Pair1 = ReadHalfConst;
+      continue;
+    }
+    if (Pair1 == ReadHalfConst)
+      continue;
+    if (!Pair2) {
+      Pair2 = ReadHalfConst;
+      continue;
+    }
+    if (Pair2 != ReadHalfConst)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+    const {
+  std::vector<unsigned> Consts;
+  SmallSet<int64_t, 4> Literals;
+  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
+    MachineInstr *MI = MIs[i];
+    if (!isALUInstr(MI->getOpcode()))
+      continue;
+
+    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Srcs =
+        getSrcs(MI);
+
+    for (unsigned j = 0, e = Srcs.size(); j < e; j++) {
+      std::pair<MachineOperand *, unsigned> Src = Srcs[j];
+      if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
+        Literals.insert(Src.second);
+      if (Literals.size() > 4)
+        return false;
+      if (Src.first->getReg() == AMDGPU::ALU_CONST)
+        Consts.push_back(Src.second);
+      if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
+          AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+        unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
+        unsigned Chan = RI.getHWRegChan(Src.first->getReg());
+        Consts.push_back((Index << 2) | Chan);
+      }
+    }
+  }
+  return fitsConstReadLimitations(Consts);
+}
+
+DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
+    const ScheduleDAG *DAG) const {
+  const InstrItineraryData *II = TM->getInstrItineraryData();
+  return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II);
+}
+
+static bool
+isPredicateSetter(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::PRED_X:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static MachineInstr *
+findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) {
+  while (I != MBB.begin()) {
+    --I;
+    MachineInstr *MI = I;
+    if (isPredicateSetter(MI->getOpcode()))
+      return MI;
+  }
+
+  return nullptr;
+}
+
+static
+bool isJump(unsigned Opcode) {
+  return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+}
+
+static bool isBranch(unsigned Opcode) {
+  return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
+      Opcode == AMDGPU::BRANCH_COND_f32;
+}
+
+bool
+R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                             MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const {
+  // Most of the following comes from the ARM implementation of AnalyzeBranch
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  // AMDGPU::BRANCH* instructions are only available after isel and are not
+  // handled
+  if (isBranch(I->getOpcode()))
+    return true;
+  if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
+    return false;
+  }
+
+  // Remove successive JUMP
+  while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+      MachineBasicBlock::iterator PriorI = std::prev(I);
+      if (AllowModify)
+        I->removeFromParent();
+      I = PriorI;
+  }
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() ||
+          !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
+    if (LastOpc == AMDGPU::JUMP) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastOpc == AMDGPU::JUMP_COND) {
+      MachineInstr *predSet = I;
+      while (!isPredicateSetter(predSet->getOpcode())) {
+        predSet = --I;
+      }
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(predSet->getOperand(1));
+      Cond.push_back(predSet->getOperand(2));
+      Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+    MachineInstr *predSet = --I;
+    while (!isPredicateSetter(predSet->getOpcode())) {
+      predSet = --I;
+    }
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    FBB = LastInst->getOperand(0).getMBB();
+    Cond.push_back(predSet->getOperand(1));
+    Cond.push_back(predSet->getOperand(2));
+    Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+static
+MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
+      It != E; ++It) {
+    if (It->getOpcode() == AMDGPU::CF_ALU ||
+        It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+      return std::prev(It.base());
+  }
+  return MBB.end();
+}
+
+unsigned
+R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                            MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            DebugLoc DL) const {
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (!FBB) {
+    if (Cond.empty()) {
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+      return 1;
+    } else {
+      MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+      assert(PredSet && "No previous predicate !");
+      addFlag(PredSet, 0, MO_FLAG_PUSH);
+      PredSet->getOperand(2).setImm(Cond[1].getImm());
+
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+             .addMBB(TBB)
+             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+      MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+      if (CfAlu == MBB.end())
+        return 1;
+      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+      CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+      return 1;
+    }
+  } else {
+    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+    assert(PredSet && "No previous predicate !");
+    addFlag(PredSet, 0, MO_FLAG_PUSH);
+    PredSet->getOperand(2).setImm(Cond[1].getImm());
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+            .addMBB(TBB)
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      return 2;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+    return 2;
+  }
+}
+
+unsigned
+R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+
+  // Note : we leave PRED* instructions there.
+  // They may be needed when predicating instructions.
+
+  MachineBasicBlock::iterator I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 0;
+  }
+  --I;
+  switch (I->getOpcode()) {
+  default:
+    return 0;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      break;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU));
+    break;
+  }
+  case AMDGPU::JUMP:
+    I->eraseFromParent();
+    break;
+  }
+  I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 1;
+  }
+  --I;
+  switch (I->getOpcode()) {
+    // FIXME: only one case??
+  default:
+    return 1;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      break;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU));
+    break;
+  }
+  case AMDGPU::JUMP:
+    I->eraseFromParent();
+    break;
+  }
+  return 2;
+}
+
+bool
+R600InstrInfo::isPredicated(const MachineInstr *MI) const {
+  int idx = MI->findFirstPredOperandIdx();
+  if (idx < 0)
+    return false;
+
+  unsigned Reg = MI->getOperand(idx).getReg();
+  switch (Reg) {
+  default: return false;
+  case AMDGPU::PRED_SEL_ONE:
+  case AMDGPU::PRED_SEL_ZERO:
+  case AMDGPU::PREDICATE_BIT:
+    return true;
+  }
+}
+
+bool
+R600InstrInfo::isPredicable(MachineInstr *MI) const {
+  // XXX: KILL* instructions can be predicated, but they must be the last
+  // instruction in a clause, so this means any instructions after them cannot
+  // be predicated.  Until we have proper support for instruction clauses in the
+  // backend, we will mark KILL* instructions as unpredicable.
+
+  if (MI->getOpcode() == AMDGPU::KILLGT) {
+    return false;
+  } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
+    // If the clause start in the middle of MBB then the MBB has more
+    // than a single clause, unable to predicate several clauses.
+    if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
+      return false;
+    // TODO: We don't support KC merging atm
+    if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
+      return false;
+    return true;
+  } else if (isVector(*MI)) {
+    return false;
+  } else {
+    return AMDGPUInstrInfo::isPredicable(MI);
+  }
+}
+
+
+bool
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+                                   unsigned NumCyles,
+                                   unsigned ExtraPredCycles,
+                                   const BranchProbability &Probability) const{
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumTCycles,
+                                   unsigned ExtraTCycles,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumFCycles,
+                                   unsigned ExtraFCycles,
+                                   const BranchProbability &Probability) const {
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                         unsigned NumCyles,
+                                         const BranchProbability &Probability)
+                                         const {
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                         MachineBasicBlock &FMBB) const {
+  return false;
+}
+
+
+bool
+R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  MachineOperand &MO = Cond[1];
+  switch (MO.getImm()) {
+  case OPCODE_IS_ZERO_INT:
+    MO.setImm(OPCODE_IS_NOT_ZERO_INT);
+    break;
+  case OPCODE_IS_NOT_ZERO_INT:
+    MO.setImm(OPCODE_IS_ZERO_INT);
+    break;
+  case OPCODE_IS_ZERO:
+    MO.setImm(OPCODE_IS_NOT_ZERO);
+    break;
+  case OPCODE_IS_NOT_ZERO:
+    MO.setImm(OPCODE_IS_ZERO);
+    break;
+  default:
+    return true;
+  }
+
+  MachineOperand &MO2 = Cond[2];
+  switch (MO2.getReg()) {
+  case AMDGPU::PRED_SEL_ZERO:
+    MO2.setReg(AMDGPU::PRED_SEL_ONE);
+    break;
+  case AMDGPU::PRED_SEL_ONE:
+    MO2.setReg(AMDGPU::PRED_SEL_ZERO);
+    break;
+  default:
+    return true;
+  }
+  return false;
+}
+
+bool
+R600InstrInfo::DefinesPredicate(MachineInstr *MI,
+                                std::vector<MachineOperand> &Pred) const {
+  return isPredicateSetter(MI->getOpcode());
+}
+
+
+bool
+R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                       const SmallVectorImpl<MachineOperand> &Pred2) const {
+  return false;
+}
+
+
+bool
+R600InstrInfo::PredicateInstruction(MachineInstr *MI,
+                      const SmallVectorImpl<MachineOperand> &Pred) const {
+  int PIdx = MI->findFirstPredOperandIdx();
+
+  if (MI->getOpcode() == AMDGPU::CF_ALU) {
+    MI->getOperand(8).setImm(0);
+    return true;
+  }
+
+  if (MI->getOpcode() == AMDGPU::DOT_4) {
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
+        .setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI->getOperand(PIdx);
+    PMO.setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
+  return false;
+}
+
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
+  return 2;
+}
+
+unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr *MI,
+                                            unsigned *PredCost) const {
+  if (PredCost)
+    *PredCost = 2;
+  return 2;
+}
+
+bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+
+  switch(MI->getOpcode()) {
+  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+  case AMDGPU::R600_EXTRACT_ELT_V2:
+  case AMDGPU::R600_EXTRACT_ELT_V4:
+    buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
+                      RI.getHWRegIndex(MI->getOperand(1).getReg()), //  Address
+                      MI->getOperand(2).getReg(),
+                      RI.getHWRegChan(MI->getOperand(1).getReg()));
+    break;
+  case AMDGPU::R600_INSERT_ELT_V2:
+  case AMDGPU::R600_INSERT_ELT_V4:
+    buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
+                       RI.getHWRegIndex(MI->getOperand(1).getReg()),  // Address
+                       MI->getOperand(3).getReg(),                    // Offset
+                       RI.getHWRegChan(MI->getOperand(1).getReg()));  // Channel
+    break;
+  }
+  MI->eraseFromParent();
+  return true;
+}
+
+void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+                                             const MachineFunction &MF) const {
+  const AMDGPUFrameLowering *TFL =
+    static_cast<const AMDGPUFrameLowering*>(
+    MF.getTarget().getFrameLowering());
+
+  unsigned StackWidth = TFL->getStackWidth(MF);
+  int End = getIndirectIndexEnd(MF);
+
+  if (End == -1)
+    return;
+
+  for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
+    unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
+    Reserved.set(SuperReg);
+    for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
+      Reserved.set(Reg);
+    }
+  }
+}
+
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  // XXX: Remove when we support a stack width > 2
+  assert(Channel == 0);
+  return RegIndex;
+}
+
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::R600_TReg32_XRegClass;
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg,
+                                       unsigned AddrChan) const {
+  unsigned AddrReg;
+  switch (AddrChan) {
+    default: llvm_unreachable("Invalid Channel");
+    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+  }
+  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                               AMDGPU::AR_X, OffsetReg);
+  setImmOperand(MOVA, AMDGPU::OpName::write, 0);
+
+  MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+                                      AddrReg, ValueReg)
+                                      .addReg(AMDGPU::AR_X,
+                                           RegState::Implicit | RegState::Kill);
+  setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1);
+  return Mov;
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg,
+                                       unsigned AddrChan) const {
+  unsigned AddrReg;
+  switch (AddrChan) {
+    default: llvm_unreachable("Invalid Channel");
+    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+  }
+  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                                       AMDGPU::AR_X,
+                                                       OffsetReg);
+  setImmOperand(MOVA, AMDGPU::OpName::write, 0);
+  MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+                                      ValueReg,
+                                      AddrReg)
+                                      .addReg(AMDGPU::AR_X,
+                                           RegState::Implicit | RegState::Kill);
+  setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1);
+
+  return Mov;
+}
+
+unsigned R600InstrInfo::getMaxAlusPerClause() const {
+  return 115;
+}
+
+MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
+                                                  MachineBasicBlock::iterator I,
+                                                  unsigned Opcode,
+                                                  unsigned DstReg,
+                                                  unsigned Src0Reg,
+                                                  unsigned Src1Reg) const {
+  MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
+    DstReg);           // $dst
+
+  if (Src1Reg) {
+    MIB.addImm(0)     // $update_exec_mask
+       .addImm(0);    // $update_predicate
+  }
+  MIB.addImm(1)        // $write
+     .addImm(0)        // $omod
+     .addImm(0)        // $dst_rel
+     .addImm(0)        // $dst_clamp
+     .addReg(Src0Reg)  // $src0
+     .addImm(0)        // $src0_neg
+     .addImm(0)        // $src0_rel
+     .addImm(0)        // $src0_abs
+     .addImm(-1);       // $src0_sel
+
+  if (Src1Reg) {
+    MIB.addReg(Src1Reg) // $src1
+       .addImm(0)       // $src1_neg
+       .addImm(0)       // $src1_rel
+       .addImm(0)       // $src1_abs
+       .addImm(-1);      // $src1_sel
+  }
+
+  //XXX: The r600g finalizer expects this to be 1, once we've moved the
+  //scheduling to the backend, we can change the default to 0.
+  MIB.addImm(1)        // $last
+      .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+      .addImm(0)         // $literal
+      .addImm(0);        // $bank_swizzle
+
+  return MIB;
+}
+
+#define OPERAND_CASE(Label) \
+  case Label: { \
+    static const unsigned Ops[] = \
+    { \
+      Label##_X, \
+      Label##_Y, \
+      Label##_Z, \
+      Label##_W \
+    }; \
+    return Ops[Slot]; \
+  }
+
+static unsigned getSlotedOps(unsigned  Op, unsigned Slot) {
+  switch (Op) {
+  OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
+  OPERAND_CASE(AMDGPU::OpName::update_pred)
+  OPERAND_CASE(AMDGPU::OpName::write)
+  OPERAND_CASE(AMDGPU::OpName::omod)
+  OPERAND_CASE(AMDGPU::OpName::dst_rel)
+  OPERAND_CASE(AMDGPU::OpName::clamp)
+  OPERAND_CASE(AMDGPU::OpName::src0)
+  OPERAND_CASE(AMDGPU::OpName::src0_neg)
+  OPERAND_CASE(AMDGPU::OpName::src0_rel)
+  OPERAND_CASE(AMDGPU::OpName::src0_abs)
+  OPERAND_CASE(AMDGPU::OpName::src0_sel)
+  OPERAND_CASE(AMDGPU::OpName::src1)
+  OPERAND_CASE(AMDGPU::OpName::src1_neg)
+  OPERAND_CASE(AMDGPU::OpName::src1_rel)
+  OPERAND_CASE(AMDGPU::OpName::src1_abs)
+  OPERAND_CASE(AMDGPU::OpName::src1_sel)
+  OPERAND_CASE(AMDGPU::OpName::pred_sel)
+  default:
+    llvm_unreachable("Wrong Operand");
+  }
+}
+
+#undef OPERAND_CASE
+
+MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
+    MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+    const {
+  assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+  unsigned Opcode;
+  if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+    Opcode = AMDGPU::DOT4_r600;
+  else
+    Opcode = AMDGPU::DOT4_eg;
+  MachineBasicBlock::iterator I = MI;
+  MachineOperand &Src0 = MI->getOperand(
+      getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
+  MachineOperand &Src1 = MI->getOperand(
+      getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
+  MachineInstr *MIB = buildDefaultInstruction(
+      MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
+  static const unsigned  Operands[14] = {
+    AMDGPU::OpName::update_exec_mask,
+    AMDGPU::OpName::update_pred,
+    AMDGPU::OpName::write,
+    AMDGPU::OpName::omod,
+    AMDGPU::OpName::dst_rel,
+    AMDGPU::OpName::clamp,
+    AMDGPU::OpName::src0_neg,
+    AMDGPU::OpName::src0_rel,
+    AMDGPU::OpName::src0_abs,
+    AMDGPU::OpName::src0_sel,
+    AMDGPU::OpName::src1_neg,
+    AMDGPU::OpName::src1_rel,
+    AMDGPU::OpName::src1_abs,
+    AMDGPU::OpName::src1_sel,
+  };
+
+  MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+      getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
+  MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+      .setReg(MO.getReg());
+
+  for (unsigned i = 0; i < 14; i++) {
+    MachineOperand &MO = MI->getOperand(
+        getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
+    assert (MO.isImm());
+    setImmOperand(MIB, Operands[i], MO.getImm());
+  }
+  MIB->getOperand(20).setImm(0);
+  return MIB;
+}
+
+MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         uint64_t Imm) const {
+  MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
+                                                  AMDGPU::ALU_LITERAL_X);
+  setImmOperand(MovImm, AMDGPU::OpName::literal, Imm);
+  return MovImm;
+}
+
+MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DstReg, unsigned SrcReg) const {
+  return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+}
+
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
+  return getOperandIdx(MI.getOpcode(), Op);
+}
+
+int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
+  return AMDGPU::getNamedOperandIdx(Opcode, Op);
+}
+
+void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op,
+                                  int64_t Imm) const {
+  int Idx = getOperandIdx(*MI, Op);
+  assert(Idx != -1 && "Operand not supported for this instruction.");
+  assert(MI->getOperand(Idx).isImm());
+  MI->getOperand(Idx).setImm(Imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction flag getters/setters
+//===----------------------------------------------------------------------===//
+
+bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
+  return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
+}
+
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
+                                         unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  int FlagIndex = 0;
+  if (Flag != 0) {
+    // If we pass something other than the default value of Flag to this
+    // function, it means we are want to set a flag on an instruction
+    // that uses native encoding.
+    assert(HAS_NATIVE_OPERANDS(TargetFlags));
+    bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
+    switch (Flag) {
+    case MO_FLAG_CLAMP:
+      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp);
+      break;
+    case MO_FLAG_MASK:
+      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write);
+      break;
+    case MO_FLAG_NOT_LAST:
+    case MO_FLAG_LAST:
+      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last);
+      break;
+    case MO_FLAG_NEG:
+      switch (SrcIdx) {
+      case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break;
+      case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break;
+      case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break;
+      }
+      break;
+
+    case MO_FLAG_ABS:
+      assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
+                       "instructions.");
+      (void)IsOP3;
+      switch (SrcIdx) {
+      case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break;
+      case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break;
+      }
+      break;
+
+    default:
+      FlagIndex = -1;
+      break;
+    }
+    assert(FlagIndex != -1 && "Flag not supported for this instruction");
+  } else {
+      FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
+      assert(FlagIndex != 0 &&
+         "Instruction flags not supported for this instruction");
+  }
+
+  MachineOperand &FlagOp = MI->getOperand(FlagIndex);
+  assert(FlagOp.isImm());
+  return FlagOp;
+}
+
+void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
+                            unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  if (Flag == 0) {
+    return;
+  }
+  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
+    if (Flag == MO_FLAG_NOT_LAST) {
+      clearFlag(MI, Operand, MO_FLAG_LAST);
+    } else if (Flag == MO_FLAG_MASK) {
+      clearFlag(MI, Operand, Flag);
+    } else {
+      FlagOp.setImm(1);
+    }
+  } else {
+      MachineOperand &FlagOp = getFlagOp(MI, Operand);
+      FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
+  }
+}
+
+void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
+                              unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
+    FlagOp.setImm(0);
+  } else {
+    MachineOperand &FlagOp = getFlagOp(MI);
+    unsigned InstFlags = FlagOp.getImm();
+    InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));
+    FlagOp.setImm(InstFlags);
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/R600InstrInfo.h b/contrib/llvm/lib/Target/R600/R600InstrInfo.h
new file mode 100644
index 0000000..1c3cb63
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600InstrInfo.h
@@ -0,0 +1,301 @@
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600InstrInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600INSTRUCTIONINFO_H_
+#define R600INSTRUCTIONINFO_H_
+
+#include "AMDGPUInstrInfo.h"
+#include "R600Defines.h"
+#include "R600RegisterInfo.h"
+#include <map>
+
+namespace llvm {
+
+  class AMDGPUTargetMachine;
+  class DFAPacketizer;
+  class ScheduleDAG;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineInstrBuilder;
+
+  class R600InstrInfo : public AMDGPUInstrInfo {
+  private:
+  const R600RegisterInfo RI;
+
+  std::vector<std::pair<int, unsigned> >
+  ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
+
+
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg, unsigned Address,
+                                        unsigned OffsetReg,
+                                        unsigned AddrChan) const;
+
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg, unsigned Address,
+                                        unsigned OffsetReg,
+                                        unsigned AddrChan) const;
+  public:
+  enum BankSwizzle {
+    ALU_VEC_012_SCL_210 = 0,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221,
+    ALU_VEC_201,
+    ALU_VEC_210
+  };
+
+  explicit R600InstrInfo(const AMDGPUSubtarget &st);
+
+  const R600RegisterInfo &getRegisterInfo() const override;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const override;
+
+  bool isTrig(const MachineInstr &MI) const;
+  bool isPlaceHolderOpcode(unsigned opcode) const;
+  bool isReductionOp(unsigned opcode) const;
+  bool isCubeOp(unsigned opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction.
+  bool isALUInstr(unsigned Opcode) const;
+  bool hasInstrModifiers(unsigned Opcode) const;
+  bool isLDSInstr(unsigned Opcode) const;
+  bool isLDSNoRetInstr(unsigned Opcode) const;
+  bool isLDSRetInstr(unsigned Opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction or an
+  /// instruction that will be lowered in ExpandSpecialInstrs Pass.
+  bool canBeConsideredALU(const MachineInstr *MI) const;
+
+  bool isTransOnly(unsigned Opcode) const;
+  bool isTransOnly(const MachineInstr *MI) const;
+  bool isVectorOnly(unsigned Opcode) const;
+  bool isVectorOnly(const MachineInstr *MI) const;
+  bool isExport(unsigned Opcode) const;
+
+  bool usesVertexCache(unsigned Opcode) const;
+  bool usesVertexCache(const MachineInstr *MI) const;
+  bool usesTextureCache(unsigned Opcode) const;
+  bool usesTextureCache(const MachineInstr *MI) const;
+
+  bool mustBeLastInClause(unsigned Opcode) const;
+  bool usesAddressRegister(MachineInstr *MI) const;
+  bool definesAddressRegister(MachineInstr *MI) const;
+  bool readsLDSSrcReg(const MachineInstr *MI) const;
+
+  /// \returns The operand index for the given source number.  Legal values
+  /// for SrcNum are 0, 1, and 2.
+  int getSrcIdx(unsigned Opcode, unsigned SrcNum) const;
+  /// \returns The operand Index for the Sel operand given an index to one
+  /// of the instruction's src operands.
+  int getSelIdx(unsigned Opcode, unsigned SrcIdx) const;
+
+  /// \returns a pair for each src of an ALU instructions.
+  /// The first member of a pair is the register id.
+  /// If register is ALU_CONST, second member is SEL.
+  /// If register is ALU_LITERAL, second member is IMM.
+  /// Otherwise, second member value is undefined.
+  SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+      getSrcs(MachineInstr *MI) const;
+
+  unsigned  isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  bool FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
+  /// returns true and the first (in lexical order) BankSwizzle affectation
+  /// starting from the one already provided in the Instruction Group MIs that
+  /// fits Read Port limitations in BS if available. Otherwise returns false
+  /// and undefined content in BS.
+  /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+  /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+  /// apply to the last instruction.
+  /// PV holds GPR to PV registers in the Instruction Group MIs.
+  bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
+                               const DenseMap<unsigned, unsigned> &PV,
+                               std::vector<BankSwizzle> &BS,
+                               bool isLastAluTrans) const;
+
+  /// An instruction group can only access 2 channel pair (either [XY] or [ZW])
+  /// from KCache bank on R700+. This function check if MI set in input meet
+  /// this limitations
+  bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+  /// Same but using const index set instead of MI set.
+  bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
+
+  /// \brief Vector instructions are instructions that must fill all
+  /// instruction slots within an instruction group.
+  bool isVector(const MachineInstr &MI) const;
+
+  bool isMov(unsigned Opcode) const override;
+
+  DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
+                                           const ScheduleDAG *DAG) const override;
+
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  bool isPredicated(const MachineInstr *MI) const override;
+
+  bool isPredicable(MachineInstr *MI) const override;
+
+  bool
+   isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                             const BranchProbability &Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const override ;
+
+  bool
+   isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                       unsigned NumTCycles, unsigned ExtraTCycles,
+                       MachineBasicBlock &FMBB,
+                       unsigned NumFCycles, unsigned ExtraFCycles,
+                       const BranchProbability &Probability) const override;
+
+  bool DefinesPredicate(MachineInstr *MI,
+                                  std::vector<MachineOperand> &Pred) const override;
+
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const override;
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                          MachineBasicBlock &FMBB) const override;
+
+  bool PredicateInstruction(MachineInstr *MI,
+                        const SmallVectorImpl<MachineOperand> &Pred) const override;
+
+  unsigned int getPredicationCost(const MachineInstr *) const override;
+
+  unsigned int getInstrLatency(const InstrItineraryData *ItinData,
+                               const MachineInstr *MI,
+                               unsigned *PredCost = nullptr) const override;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const override { return 1;}
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  /// \brief Reserve the registers that may be accesed using indirect addressing.
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
+
+  unsigned calculateIndirectAddress(unsigned RegIndex,
+                                    unsigned Channel) const override;
+
+  const TargetRegisterClass *getIndirectAddrRegClass() const override;
+
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                          MachineBasicBlock::iterator I,
+                          unsigned ValueReg, unsigned Address,
+                          unsigned OffsetReg) const override;
+
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg, unsigned Address,
+                                        unsigned OffsetReg) const override;
+
+  unsigned getMaxAlusPerClause() const;
+
+  ///buildDefaultInstruction - This function returns a MachineInstr with
+  /// all the instruction modifiers initialized to their default values.
+  /// You can use this function to avoid manually specifying each instruction
+  /// modifier operand when building a new instruction.
+  ///
+  /// \returns a MachineInstr with all the instruction modifiers initialized
+  /// to their default values.
+  MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              unsigned Opcode,
+                                              unsigned DstReg,
+                                              unsigned Src0Reg,
+                                              unsigned Src1Reg = 0) const;
+
+  MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
+                                             MachineInstr *MI,
+                                             unsigned Slot,
+                                             unsigned DstReg) const;
+
+  MachineInstr *buildMovImm(MachineBasicBlock &BB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned DstReg,
+                                  uint64_t Imm) const;
+
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const override;
+
+  /// \brief Get the index of Op in the MachineInstr.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
+
+  /// \brief Get the index of \p Op for the given Opcode.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(unsigned Opcode, unsigned Op) const;
+
+  /// \brief Helper function for setting instruction flag values.
+  void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const;
+
+  /// \returns true if this instruction has an operand for storing target flags.
+  bool hasFlagOperand(const MachineInstr &MI) const;
+
+  ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+  void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+
+  ///\brief Determine if the specified \p Flag is set on this \p Operand.
+  bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+  /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
+  /// \param Flag The flag being set.
+  ///
+  /// \returns the operand containing the flags for this instruction.
+  MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
+                            unsigned Flag = 0) const;
+
+  /// \brief Clear the specified flag on the instruction.
+  void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+};
+
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
+} // End llvm namespace
+
+#endif // R600INSTRINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/R600Instructions.td b/contrib/llvm/lib/Target/R600/R600Instructions.td
new file mode 100644
index 0000000..704507d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Instructions.td
@@ -0,0 +1,1728 @@
+//===-- R600Instructions.td - R600 Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available on R600 family
+// GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+include "R600Intrinsics.td"
+include "R600InstrFormats.td"
+
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstR600 <outs, ins, asm, pattern, NullALU> {
+
+  let Namespace = "AMDGPU";
+}
+
+def MEMxi : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);
+  let PrintMethod = "printMemOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
+}
+
+// Operands for non-registers
+
+class InstFlag<string PM = "printOperand", int Default = 0>
+    : OperandWithDefaultOps <i32, (ops (i32 Default))> {
+  let PrintMethod = PM;
+}
+
+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
+  let PrintMethod = "printSel";
+}
+def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> {
+  let PrintMethod = "printBankSwizzle";
+}
+
+def LITERAL : InstFlag<"printLiteral">;
+
+def WRITE : InstFlag <"printWrite", 1>;
+def OMOD : InstFlag <"printOMOD">;
+def REL : InstFlag <"printRel">;
+def CLAMP : InstFlag <"printClamp">;
+def NEG : InstFlag <"printNeg">;
+def ABS : InstFlag <"printAbs">;
+def UEM : InstFlag <"printUpdateExecMask">;
+def UP : InstFlag <"printUpdatePred">;
+
+// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
+// Once we start using the packetizer in this backend we should have this
+// default to 0.
+def LAST : InstFlag<"printLast", 1>;
+def RSel : Operand<i32> {
+  let PrintMethod = "printRSel";
+}
+def CT: Operand<i32> {
+  let PrintMethod = "printCT";
+}
+
+def FRAMEri : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
+}
+
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
+def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
+def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
+def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
+def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+
+
+def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
+                                     (ops PRED_SEL_OFF)>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+// Class for instructions with only one source register.
+// If you add new ins to this instruction, make sure they are listed before
+// $literal, because the backend currently assumes that the last operand is
+// a literal.  Also be sure to update the enum R600Op1OperandIndex::ROI in
+// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
+// and R600InstrInfo::getOperandIdx().
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <(outs R600_Reg32:$dst),
+              (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+                   BANK_SWIZZLE:$bank_swizzle),
+              !strconcat("  ", opName,
+                   "$clamp $last $dst$write$dst_rel$omod, "
+                   "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                   "$pred_sel $bank_swizzle"),
+              pattern,
+              itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+  let HasNativeOperands = 1;
+  let Op1 = 1;
+  let ALUInst = 1;
+  let DisableEncoding = "$literal";
+  let UseNamedOperandTable = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                    InstrItinClass itin = AnyALU> :
+    R600_1OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin
+>;
+
+// If you add or change the operands for R600_2OP instructions, you must
+// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <(outs R600_Reg32:$dst),
+          (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
+               OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
+               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+               BANK_SWIZZLE:$bank_swizzle),
+          !strconcat("  ", opName,
+                "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+                "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
+                "$pred_sel $bank_swizzle"),
+          pattern,
+          itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let HasNativeOperands = 1;
+  let Op2 = 1;
+  let ALUInst = 1;
+  let DisableEncoding = "$literal";
+  let UseNamedOperandTable = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                       InstrItinClass itin = AnyALU> :
+    R600_2OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
+                                           R600_Reg32:$src1))], itin
+>;
+
+// If you add our change the operands for R600_3OP instructions, you must
+// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and
+// R600InstrInfo::getOperandIdx().
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <(outs R600_Reg32:$dst),
+          (ins REL:$dst_rel, CLAMP:$clamp,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
+               R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
+               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+               BANK_SWIZZLE:$bank_swizzle),
+          !strconcat("  ", opName, "$clamp $last $dst$dst_rel, "
+                             "$src0_neg$src0$src0_rel, "
+                             "$src1_neg$src1$src1_rel, "
+                             "$src2_neg$src2$src2_rel, "
+                             "$pred_sel"
+                             "$bank_swizzle"),
+          pattern,
+          itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP3<inst>{
+
+  let HasNativeOperands = 1;
+  let DisableEncoding = "$literal";
+  let Op3 = 1;
+  let UseNamedOperandTable = 1;
+  let ALUInst = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
+                      InstrItinClass itin = VecALU> :
+  InstR600 <(outs R600_Reg32:$dst),
+          ins,
+          asm,
+          pattern,
+          itin>;
+
+
+
+} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
+
+def TEX_SHADOW : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return (TType >= 6 && TType <= 8) || TType == 13;
+  }]
+>;
+
+def TEX_RECT : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 5;
+  }]
+>;
+
+def TEX_ARRAY : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 9 || TType == 10 || TType == 16;
+  }]
+>;
+
+def TEX_SHADOW_ARRAY : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 11 || TType == 12 || TType == 17;
+  }]
+>;
+
+def TEX_MSAA : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 14;
+  }]
+>;
+
+def TEX_ARRAY_MSAA : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 15;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
+                 dag outs, dag ins, string asm, list<dag> pattern> :
+    InstR600ISA <outs, ins, asm, pattern>,
+    CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF  {
+
+  let rat_id = ratid;
+  let rat_inst = ratinst;
+  let rim         = 0;
+  // XXX: Have a separate instruction for non-indexed writes.
+  let type        = 1;
+  let rw_rel      = 0;
+  let elem_size   = 0;
+
+  let array_size  = 0;
+  let comp_mask   = mask;
+  let burst_count = 0;
+  let vpm         = 0;
+  let cf_inst = cfinst;
+  let mark        = 0;
+  let barrier     = 1;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+
+}
+
+class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>,
+      VTX_WORD1_GPR {
+
+  // Static fields
+  let DST_REL = 0;
+  // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
+  // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
+  // however, based on my testing if USE_CONST_FIELDS is set, then all
+  // these fields need to be set to 0.
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 1;
+  let FORMAT_COMP_ALL = 0;
+  let SRF_MODE_ALL = 0;
+
+  let Inst{63-32} = Word1;
+  // LLVM can only encode 64-bit instructions, so these fields are manually
+  // encoded in R600CodeEmitter
+  //
+  // bits<16> OFFSET;
+  // bits<2>  ENDIAN_SWAP = 0;
+  // bits<1>  CONST_BUF_NO_STRIDE = 0;
+  // bits<1>  MEGA_FETCH = 0;
+  // bits<1>  ALT_CONST = 0;
+  // bits<2>  BUFFER_INDEX_MODE = 0;
+
+  // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+  // is done in R600CodeEmitter
+  //
+  // Inst{79-64} = OFFSET;
+  // Inst{81-80} = ENDIAN_SWAP;
+  // Inst{82}    = CONST_BUF_NO_STRIDE;
+  // Inst{83}    = MEGA_FETCH;
+  // Inst{84}    = ALT_CONST;
+  // Inst{86-85} = BUFFER_INDEX_MODE;
+  // Inst{95-86} = 0; Reserved
+
+  // VTX_WORD3 (Padding)
+  //
+  // Inst{127-96} = 0;
+
+  let VTXInst = 1;
+}
+
+class LoadParamFrag <PatFrag load_type> : PatFrag <
+  (ops node:$ptr), (load_type node:$ptr),
+  [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }]
+>;
+
+def load_param : LoadParamFrag<load>;
+def load_param_exti8 : LoadParamFrag<az_extloadi8>;
+def load_param_exti16 : LoadParamFrag<az_extloadi16>;
+
+def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">;
+
+def isR600toCayman : Predicate<
+                     "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
+
+//===----------------------------------------------------------------------===//
+// R600 SDNodes
+//===----------------------------------------------------------------------===//
+
+def INTERP_PAIR_XY :  AMDGPUShaderInst <
+  (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
+  "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1",
+  []>;
+
+def INTERP_PAIR_ZW :  AMDGPUShaderInst <
+  (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
+  "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
+  []>;
+
+def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
+  SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+  [SDNPVariadic]
+>;
+
+def DOT4 : SDNode<"AMDGPUISD::DOT4",
+  SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
+      SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
+      SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
+  []
+>;
+
+def COS_HW : SDNode<"AMDGPUISD::COS_HW",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def SIN_HW : SDNode<"AMDGPUISD::SIN_HW",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
+
+def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
+
+multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> {
+def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
+          (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw),
+          (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz),
+          (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z),
+          (i32 imm:$DST_SEL_W),
+          (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID),
+          (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z),
+          (i32 imm:$COORD_TYPE_W)),
+          (inst R600_Reg128:$SRC_GPR,
+          imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw,
+          imm:$offsetx, imm:$offsety, imm:$offsetz,
+          imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z,
+          imm:$DST_SEL_W,
+          imm:$RESOURCE_ID, imm:$SAMPLER_ID,
+          imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z,
+          imm:$COORD_TYPE_W)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Interpolation Instructions
+//===----------------------------------------------------------------------===//
+
+def INTERP_VEC_LOAD :  AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins i32imm:$src0),
+  "INTERP_LOAD $src0 : $dst",
+  [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
+
+def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
+
+//===----------------------------------------------------------------------===//
+// Export Instructions
+//===----------------------------------------------------------------------===//
+
+def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
+  [SDNPHasChain, SDNPSideEffect]>;
+
+class ExportWord0 {
+  field bits<32> Word0;
+
+  bits<13> arraybase;
+  bits<2> type;
+  bits<7> gpr;
+  bits<2> elem_size;
+
+  let Word0{12-0} = arraybase;
+  let Word0{14-13} = type;
+  let Word0{21-15} = gpr;
+  let Word0{22} = 0; // RW_REL
+  let Word0{29-23} = 0; // INDEX_GPR
+  let Word0{31-30} = elem_size;
+}
+
+class ExportSwzWord1 {
+  field bits<32> Word1;
+
+  bits<3> sw_x;
+  bits<3> sw_y;
+  bits<3> sw_z;
+  bits<3> sw_w;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{2-0} = sw_x;
+  let Word1{5-3} = sw_y;
+  let Word1{8-6} = sw_z;
+  let Word1{11-9} = sw_w;
+}
+
+class ExportBufWord1 {
+  field bits<32> Word1;
+
+  bits<12> arraySize;
+  bits<4> compMask;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{11-0} = arraySize;
+  let Word1{15-12} = compMask;
+}
+
+multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
+  def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
+    (ExportInst
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+        0, 61, 0, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
+    (ExportInst
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+        0, 61, 7, 0, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_dummy (i32 imm:$type)),
+    (ExportInst
+        (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_dummy 1),
+    (ExportInst
+        (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+    (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
+        (ExportInst R600_Reg128:$src, imm:$type, imm:$base,
+        imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
+  >;
+
+}
+
+multiclass SteamOutputExportPattern<Instruction ExportInst,
+    bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
+// Stream0
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf0inst, 0)>;
+// Stream1
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf1inst, 0)>;
+// Stream2
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf2inst, 0)>;
+// Stream3
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf3inst, 0)>;
+}
+
+// Export Instructions should not be duplicated by TailDuplication pass
+// (which assumes that duplicable instruction are affected by exec mask)
+let usesCustomInserter = 1, isNotDuplicable = 1 in {
+
+class ExportSwzInst : InstR600ISA<(
+    outs),
+    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+    RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst,
+    i32imm:$eop),
+    !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"),
+    []>, ExportWord0, ExportSwzWord1 {
+  let elem_size = 3;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+}
+
+} // End usesCustomInserter = 1
+
+class ExportBufInst : InstR600ISA<(
+    outs),
+    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+    i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
+    !strconcat("EXPORT", " $gpr"),
+    []>, ExportWord0, ExportBufWord1 {
+  let elem_size = 0;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+
+def KCACHE : InstFlag<"printKCache">;
+
+class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
+(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
+KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
+i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
+i32imm:$COUNT, i32imm:$Enabled),
+!strconcat(OpName, " $COUNT, @$ADDR, "
+"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
+[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
+  field bits<64> Inst;
+
+  let CF_INST = inst;
+  let ALT_CONST = 0;
+  let WHOLE_QUAD_MODE = 0;
+  let BARRIER = 1;
+  let UseNamedOperandTable = 1;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class CF_WORD0_R600 {
+  field bits<32> Word0;
+
+  bits<32> ADDR;
+
+  let Word0 = ADDR;
+}
+
+class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
+  field bits<64> Inst;
+  bits<4> CNT;
+
+  let CF_INST = inst;
+  let BARRIER = 1;
+  let CF_CONST = 0;
+  let VALID_PIXEL_MODE = 0;
+  let COND = 0;
+  let COUNT = CNT{2-0};
+  let CALL_COUNT = 0;
+  let COUNT_3 = CNT{3};
+  let END_OF_PROGRAM = 0;
+  let WHOLE_QUAD_MODE = 0;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
+  field bits<64> Inst;
+
+  let CF_INST = inst;
+  let BARRIER = 1;
+  let JUMPTABLE_SEL = 0;
+  let CF_CONST = 0;
+  let VALID_PIXEL_MODE = 0;
+  let COND = 0;
+  let END_OF_PROGRAM = 0;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+def CF_ALU : ALU_CLAUSE<8, "ALU">;
+def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
+def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
+def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
+def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
+def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
+
+def FETCH_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
+  field bits<8> Inst;
+  bits<8> num;
+  let Inst = num;
+}
+
+def ALU_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
+  field bits<8> Inst;
+  bits<8> num;
+  let Inst = num;
+}
+
+def LITERALS : AMDGPUInst <(outs),
+(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
+  field bits<64> Inst;
+  bits<32> literal1;
+  bits<32> literal2;
+
+  let Inst{31-0} = literal1;
+  let Inst{63-32} = literal2;
+}
+
+def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
+  field bits<64> Inst;
+}
+
+let Predicates = [isR600toCayman] in {
+
+//===----------------------------------------------------------------------===//
+// Common Instructions R600, R700, Evergreen, Cayman
+//===----------------------------------------------------------------------===//
+
+def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
+// Non-IEEE MUL: 0 * anything = 0
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
+def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
+
+// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
+// so some of the instruction names don't match the asm string.
+// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
+def SETE : R600_2OP <
+  0x08, "SETE",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
+>;
+
+def SGT : R600_2OP <
+  0x09, "SETGT",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
+>;
+
+def SGE : R600_2OP <
+  0xA, "SETGE",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
+>;
+
+def SNE : R600_2OP <
+  0xB, "SETNE",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
+>;
+
+def SETE_DX10 : R600_2OP <
+  0xC, "SETE_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
+>;
+
+def SETGT_DX10 : R600_2OP <
+  0xD, "SETGT_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
+>;
+
+def SETGE_DX10 : R600_2OP <
+  0xE, "SETGE_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
+>;
+
+def SETNE_DX10 : R600_2OP <
+  0xF, "SETNE_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
+>;
+
+def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
+def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
+def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
+
+class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
+  (outs R600_Reg32:$dst),
+  (ins immType:$imm),
+  "",
+  []
+>;
+
+} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
+
+def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
+def : Pat <
+  (imm:$val),
+  (MOV_IMM_I32 imm:$val)
+>;
+
+def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
+def : Pat <
+  (fpimm:$val),
+  (MOV_IMM_F32  fpimm:$val)
+>;
+
+def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
+def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
+def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
+def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
+
+let hasSideEffects = 1 in {
+
+def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
+
+} // end hasSideEffects
+
+def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
+def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
+def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
+def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
+def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
+def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
+
+def SETE_INT : R600_2OP <
+  0x3A, "SETE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))]
+>;
+
+def SETGT_INT : R600_2OP <
+  0x3B, "SETGT_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))]
+>;
+
+def SETGE_INT : R600_2OP <
+  0x3C, "SETGE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))]
+>;
+
+def SETNE_INT : R600_2OP <
+  0x3D, "SETNE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))]
+>;
+
+def SETGT_UINT : R600_2OP <
+  0x3E, "SETGT_UINT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))]
+>;
+
+def SETGE_UINT : R600_2OP <
+  0x3F, "SETGE_UINT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))]
+>;
+
+def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
+def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>;
+def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
+
+def CNDE_INT : R600_3OP <
+  0x1C, "CNDE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))]
+>;
+
+def CNDGE_INT : R600_3OP <
+  0x1E, "CNDGE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
+>;
+
+def CNDGT_INT : R600_3OP <
+  0x1D, "CNDGT_INT",
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Texture instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+class R600_TEX <bits<11> inst, string opName> :
+  InstR600 <(outs R600_Reg128:$DST_GPR),
+          (ins R600_Reg128:$SRC_GPR,
+          RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw,
+          i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz,
+          RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W,
+          i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
+          CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
+          CT:$COORD_TYPE_W),
+          !strconcat(opName,
+          " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
+          "$SRC_GPR.$srcx$srcy$srcz$srcw "
+          "RID:$RESOURCE_ID SID:$SAMPLER_ID "
+          "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"),
+          [],
+          NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+
+  let TEX_INST = inst{4-0};
+  let SRC_REL = 0;
+  let DST_REL = 0;
+  let LOD_BIAS = 0;
+
+  let INST_MOD = 0;
+  let FETCH_WHOLE_QUAD = 0;
+  let ALT_CONST = 0;
+  let SAMPLER_INDEX_MODE = 0;
+  let RESOURCE_INDEX_MODE = 0;
+
+  let TEXInst = 1;
+}
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+
+
+def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">;
+def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">;
+def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">;
+def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
+def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
+def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
+def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
+  let INST_MOD = 1;
+}
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
+def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
+def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
+def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">;
+def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">;
+def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">;
+def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">;
+
+defm : TexPattern<0, TEX_SAMPLE>;
+defm : TexPattern<1, TEX_SAMPLE_C>;
+defm : TexPattern<2, TEX_SAMPLE_L>;
+defm : TexPattern<3, TEX_SAMPLE_C_L>;
+defm : TexPattern<4, TEX_SAMPLE_LB>;
+defm : TexPattern<5, TEX_SAMPLE_C_LB>;
+defm : TexPattern<6, TEX_LD, v4i32>;
+defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
+defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
+defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
+defm : TexPattern<10, TEX_LDPTR, v4i32>;
+
+//===----------------------------------------------------------------------===//
+// Helper classes for common instructions
+//===----------------------------------------------------------------------===//
+
+class MUL_LIT_Common <bits<5> inst> : R600_3OP <
+  inst, "MUL_LIT",
+  []
+>;
+
+class MULADD_Common <bits<5> inst> : R600_3OP <
+  inst, "MULADD",
+  []
+>;
+
+class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
+  inst, "MULADD_IEEE",
+  [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+>;
+
+class CNDE_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDE",
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
+>;
+
+class CNDGT_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDGT",
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+> {
+  let Itinerary = VecALU;
+}
+
+class CNDGE_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDGE",
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+> {
+  let Itinerary = VecALU;
+}
+
+
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+// Slot X
+   UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
+   OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+   R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+   R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
+   R600_Pred:$pred_sel_X,
+// Slot Y
+   UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
+   OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+   R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+   R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
+   R600_Pred:$pred_sel_Y,
+// Slot Z
+   UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
+   OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+   R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+   R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
+   R600_Pred:$pred_sel_Z,
+// Slot W
+   UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
+   OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+   R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+   R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
+   R600_Pred:$pred_sel_W,
+   LITERAL:$literal0, LITERAL:$literal1),
+  "",
+  pattern,
+  AnyALU> {
+
+  let UseNamedOperandTable = 1;
+
+}
+}
+
+def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
+  R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
+  R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
+  R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
+  R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
+
+
+class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+multiclass CUBE_Common <bits<11> inst> {
+
+  def _pseudo : InstR600 <
+    (outs R600_Reg128:$dst),
+    (ins R600_Reg128:$src0),
+    "CUBE $dst $src0",
+    [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
+    VecALU
+  > {
+    let isPseudo = 1;
+    let UseNamedOperandTable = 1;
+  }
+
+  def _real : R600_2OP <inst, "CUBE", []>;
+}
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "EXP_IEEE", fexp2
+> {
+  let Itinerary = TransALU;
+}
+
+class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "FLT_TO_INT", fp_to_sint
+> {
+  let Itinerary = TransALU;
+}
+
+class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "INT_TO_FLT", sint_to_fp
+> {
+  let Itinerary = TransALU;
+}
+
+class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "FLT_TO_UINT", fp_to_uint
+> {
+  let Itinerary = TransALU;
+}
+
+class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "UINT_TO_FLT", uint_to_fp
+> {
+  let Itinerary = TransALU;
+}
+
+class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
+  inst, "LOG_CLAMPED", []
+>;
+
+class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "LOG_IEEE", flog2
+> {
+  let Itinerary = TransALU;
+}
+
+class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
+class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
+class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
+class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI_INT", mulhs
+> {
+  let Itinerary = TransALU;
+}
+class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI", mulhu
+> {
+  let Itinerary = TransALU;
+}
+class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULLO_INT", mul
+> {
+  let Itinerary = TransALU;
+}
+class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
+  let Itinerary = TransALU;
+}
+
+class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIP_CLAMPED", []
+> {
+  let Itinerary = TransALU;
+}
+
+class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+> {
+  let Itinerary = TransALU;
+}
+
+class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIP_UINT", AMDGPUurecip
+> {
+  let Itinerary = TransALU;
+}
+
+// Clamped to maximum.
+class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
+> {
+  let Itinerary = TransALU;
+}
+
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
+> {
+  let Itinerary = TransALU;
+}
+
+// TODO: There is also RECIPSQRT_FF which clamps to zero.
+
+class SIN_Common <bits<11> inst> : R600_1OP <
+  inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
+  let Trig = 1;
+  let Itinerary = TransALU;
+}
+
+class COS_Common <bits<11> inst> : R600_1OP <
+  inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
+  let Trig = 1;
+  let Itinerary = TransALU;
+}
+
+def CLAMP_R600 :  CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
+//===----------------------------------------------------------------------===//
+// Helper patterns for complex intrinsics
+//===----------------------------------------------------------------------===//
+
+multiclass DIV_Common <InstR600 recip_ieee> {
+def : Pat<
+  (int_AMDGPU_div f32:$src0, f32:$src1),
+  (MUL_IEEE $src0, (recip_ieee $src1))
+>;
+
+def : Pat<
+  (fdiv f32:$src0, f32:$src1),
+  (MUL_IEEE $src0, (recip_ieee $src1))
+>;
+}
+
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
+  : Pat <
+  (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w),
+  (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
+>;
+
+// FROUND pattern
+class FROUNDPat<Instruction CNDGE> : Pat <
+  (AMDGPUround f32:$x),
+  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+>;
+
+
+//===----------------------------------------------------------------------===//
+// R600 / R700 Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isR600] in {
+
+  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+  def MULADD_r600 : MULADD_Common<0x10>;
+  def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>;
+  def CNDE_r600 : CNDE_Common<0x18>;
+  def CNDGT_r600 : CNDGT_Common<0x19>;
+  def CNDGE_r600 : CNDGE_Common<0x1A>;
+  def DOT4_r600 : DOT4_Common<0x50>;
+  defm CUBE_r600 : CUBE_Common<0x52>;
+  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+  def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
+  def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
+  def SIN_r600 : SIN_Common<0x6E>;
+  def COS_r600 : COS_Common<0x6F>;
+  def ASHR_r600 : ASHR_Common<0x70>;
+  def LSHR_r600 : LSHR_Common<0x71>;
+  def LSHL_r600 : LSHL_Common<0x72>;
+  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
+
+  defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+  def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
+  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
+
+  def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+  def : FROUNDPat <CNDGE_r600>;
+
+  def R600_ExportSwz : ExportSwzInst {
+    let Word1{20-17} = 0; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 0; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : ExportPattern<R600_ExportSwz, 39>;
+
+  def R600_ExportBuf : ExportBufInst {
+    let Word1{20-17} = 0; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 0; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
+
+  def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT),
+  "TEX $CNT @$ADDR"> {
+    let POP_COUNT = 0;
+  }
+  def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT),
+  "VTX $CNT @$ADDR"> {
+    let POP_COUNT = 0;
+  }
+  def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR),
+  "LOOP_START_DX10 @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR),
+  "LOOP_BREAK @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR),
+  "CONTINUE @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "JUMP @$ADDR POP:$POP_COUNT"> {
+    let CNT = 0;
+  }
+  def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
+  "PUSH_ELSE @$ADDR"> {
+    let CNT = 0;
+  }
+  def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "ELSE @$ADDR POP:$POP_COUNT"> {
+    let CNT = 0;
+  }
+  def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> {
+    let ADDR = 0;
+    let CNT = 0;
+    let POP_COUNT = 0;
+  }
+  def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "POP @$ADDR POP:$POP_COUNT"> {
+    let CNT = 0;
+  }
+  def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> {
+    let CNT = 0;
+    let POP_COUNT = 0;
+    let ADDR = 0;
+    let END_OF_PROGRAM = 1;
+  }
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Regist loads and stores - for indirect addressing
+//===----------------------------------------------------------------------===//
+
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+def PRED_X : InstR600 <
+  (outs R600_Predicate_Bit:$dst),
+  (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
+  "", [], NullALU> {
+  let FlagOperandIdx = 3;
+}
+
+let isTerminator = 1, isBranch = 1 in {
+def JUMP_COND : InstR600 <
+          (outs),
+          (ins brtarget:$target, R600_Predicate_Bit:$p),
+          "JUMP $target ($p)",
+          [], AnyALU
+  >;
+
+def JUMP : InstR600 <
+          (outs),
+          (ins brtarget:$target),
+          "JUMP $target",
+          [], AnyALU
+  >
+{
+  let isPredicable = 1;
+  let isBarrier = 1;
+}
+
+}  // End isTerminator = 1, isBranch = 1
+
+let usesCustomInserter = 1 in {
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
+
+def MASK_WRITE : AMDGPUShaderInst <
+    (outs),
+    (ins R600_Reg32:$src),
+    "MASK_WRITE $src",
+    []
+>;
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
+
+
+def TXD: InstR600 <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
+                     imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
+  NullALU > {
+  let TEXInst = 1;
+}
+
+def TXD_SHADOW: InstR600 <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
+        imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
+   NullALU
+> {
+  let TEXInst = 1;
+}
+} // End isPseudo = 1
+} // End usesCustomInserter = 1
+
+
+//===----------------------------------------------------------------------===//
+// Constant Buffer Addressing Support
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+def CONST_COPY : Instruction {
+  let OutOperandList = (outs R600_Reg32:$dst);
+  let InOperandList = (ins i32imm:$src);
+  let Pattern =
+      [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+  let AsmString = "CONST_COPY";
+  let neverHasSideEffects = 1;
+  let isAsCheapAsAMove = 1;
+  let Itinerary = NullALU;
+}
+} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+
+def TEX_VTX_CONSTBUF :
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
+      [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
+  VTX_WORD1_GPR, VTX_WORD0_eg {
+
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let SRC_REL = 0;
+  let SRC_SEL_X = 0;
+  let DST_REL = 0;
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 2;
+  let FORMAT_COMP_ALL = 1;
+  let SRF_MODE_ALL = 1;
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 2;
+  let DST_SEL_W        = 3;
+  let DATA_FORMAT      = 35;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+  let VTXInst = 1;
+}
+
+def TEX_VTX_TEXBUF:
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
+      [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
+VTX_WORD1_GPR, VTX_WORD0_eg {
+
+let VC_INST = 0;
+let FETCH_TYPE = 2;
+let FETCH_WHOLE_QUAD = 0;
+let SRC_REL = 0;
+let SRC_SEL_X = 0;
+let DST_REL = 0;
+let USE_CONST_FIELDS = 1;
+let NUM_FORMAT_ALL = 0;
+let FORMAT_COMP_ALL = 0;
+let SRF_MODE_ALL = 1;
+let MEGA_FETCH_COUNT = 16;
+let DST_SEL_X        = 0;
+let DST_SEL_Y        = 1;
+let DST_SEL_Z        = 2;
+let DST_SEL_W        = 3;
+let DATA_FORMAT      = 0;
+
+let Inst{31-0} = Word0;
+let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+  let VTXInst = 1;
+}
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+     let Namespace = "AMDGPU";
+     dag OutOperandList = outs;
+     dag InOperandList = ins;
+     let Pattern = pattern;
+     let AsmString = !strconcat(asmstr, "\n");
+     let isPseudo = 1;
+     let Itinerary = NullALU;
+     bit hasIEEEFlag = 0;
+     bit hasZeroOpFlag = 0;
+     let mayLoad = 0;
+     let mayStore = 0;
+     let hasSideEffects = 0;
+}
+
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
+    def _i32 : ILFormat<(outs),
+  (ins brtarget:$target, rci:$src0),
+        "; i32 Pseudo branch instruction",
+  [(Op bb:$target, (i32 rci:$src0))]>;
+    def _f32 : ILFormat<(outs),
+  (ins brtarget:$target, rcf:$src0),
+        "; f32 Pseudo branch instruction",
+  [(Op bb:$target, (f32 rcf:$src0))]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<string name> {
+  def _i32 : ILFormat<(outs), (ins R600_Reg32:$src),
+      !strconcat(name, " $src"), []>;
+  def _f32 : ILFormat<(outs), (ins R600_Reg32:$src),
+      !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+  def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+  def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+}
+
+//===---------------------------------------------------------------------===//
+// Custom Inserter for Branches and returns, this eventually will be a
+// separate pass
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
+      "; Pseudo unconditional branch instruction",
+      [(br bb:$target)]>;
+  defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
+}
+
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+    usesCustomInserter = 1 in {
+  def RETURN          : ILFormat<(outs), (ins variable_ops),
+      "RETURN", [(IL_retflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+def IF_PREDICATE_SET  : ILFormat<(outs), (ins R600_Reg32:$src),
+  "IF_PREDICATE_SET $src", []>;
+
+let isTerminator=1 in {
+  def BREAK       : ILFormat< (outs), (ins),
+      "BREAK", []>;
+  def CONTINUE    : ILFormat< (outs), (ins),
+      "CONTINUE", []>;
+  def DEFAULT     : ILFormat< (outs), (ins),
+      "DEFAULT", []>;
+  def ELSE        : ILFormat< (outs), (ins),
+      "ELSE", []>;
+  def ENDSWITCH   : ILFormat< (outs), (ins),
+      "ENDSWITCH", []>;
+  def ENDMAIN     : ILFormat< (outs), (ins),
+      "ENDMAIN", []>;
+  def END         : ILFormat< (outs), (ins),
+      "END", []>;
+  def ENDFUNC     : ILFormat< (outs), (ins),
+      "ENDFUNC", []>;
+  def ENDIF       : ILFormat< (outs), (ins),
+      "ENDIF", []>;
+  def WHILELOOP   : ILFormat< (outs), (ins),
+      "WHILE", []>;
+  def ENDLOOP     : ILFormat< (outs), (ins),
+      "ENDLOOP", []>;
+  def FUNC        : ILFormat< (outs), (ins),
+      "FUNC", []>;
+  def RETDYN      : ILFormat< (outs), (ins),
+      "RET_DYN", []>;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALNZ  : BranchInstr<"IF_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALZ   : BranchInstr<"IF_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
+  defm IFC         : BranchInstr2<"IFC">;
+  defm BREAKC      : BranchInstr2<"BREAKC">;
+  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
+}
+
+//===----------------------------------------------------------------------===//
+// Indirect addressing pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+class ExtractVertical <RegisterClass vec_rc> : InstR600 <
+  (outs R600_Reg32:$dst),
+  (ins vec_rc:$vec, R600_Reg32:$index), "",
+  [],
+  AnyALU
+>;
+
+let Constraints = "$dst = $vec" in {
+
+class InsertVertical <RegisterClass vec_rc> : InstR600 <
+  (outs vec_rc:$dst),
+  (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
+  [],
+  AnyALU
+>;
+
+} // End Constraints = "$dst = $vec"
+
+} // End isPseudo = 1
+
+def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
+def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
+
+def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
+def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
+
+class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
+                          ValueType scalar_ty> : Pat <
+  (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
+  (inst $vec, $index)
+>;
+
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
+
+class InsertVerticalPat <Instruction inst, ValueType vec_ty,
+                         ValueType scalar_ty> : Pat <
+  (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
+  (inst $vec, $value, $index)
+>;
+
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
+
+//===----------------------------------------------------------------------===//
+// ISel Patterns
+//===----------------------------------------------------------------------===//
+
+// CND*_INT Pattterns for f32 True / False values
+
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+  (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
+  (cnd $src0, $src1, $src2)
+>;
+
+def : CND_INT_f32 <CNDE_INT,  SETEQ>;
+def : CND_INT_f32 <CNDGT_INT, SETGT>;
+def : CND_INT_f32 <CNDGE_INT, SETGE>;
+
+//CNDGE_INT extra pattern
+def : Pat <
+  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
+  (CNDGE_INT $src0, $src1, $src2)
+>;
+
+// KIL Patterns
+def KILP : Pat <
+  (int_AMDGPU_kilp),
+  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+def KIL : Pat <
+  (int_AMDGPU_kill f32:$src0),
+  (MASK_WRITE (KILLGT (f32 ZERO), $src0))
+>;
+
+def : Extract_Element <f32, v4f32, 0, sub0>;
+def : Extract_Element <f32, v4f32, 1, sub1>;
+def : Extract_Element <f32, v4f32, 2, sub2>;
+def : Extract_Element <f32, v4f32, 3, sub3>;
+
+def : Insert_Element <f32, v4f32, 0, sub0>;
+def : Insert_Element <f32, v4f32, 1, sub1>;
+def : Insert_Element <f32, v4f32, 2, sub2>;
+def : Insert_Element <f32, v4f32, 3, sub3>;
+
+def : Extract_Element <i32, v4i32, 0, sub0>;
+def : Extract_Element <i32, v4i32, 1, sub1>;
+def : Extract_Element <i32, v4i32, 2, sub2>;
+def : Extract_Element <i32, v4i32, 3, sub3>;
+
+def : Insert_Element <i32, v4i32, 0, sub0>;
+def : Insert_Element <i32, v4i32, 1, sub1>;
+def : Insert_Element <i32, v4i32, 2, sub2>;
+def : Insert_Element <i32, v4i32, 3, sub3>;
+
+def : Extract_Element <f32, v2f32, 0, sub0>;
+def : Extract_Element <f32, v2f32, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, 0, sub0>;
+def : Insert_Element <f32, v2f32, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, 0, sub0>;
+def : Extract_Element <i32, v2i32, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, 0, sub0>;
+def : Insert_Element <i32, v2i32, 1, sub1>;
+
+// bitconvert patterns
+
+def : BitConvert <i32, f32, R600_Reg32>;
+def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
+def : BitConvert <v4f32, v4i32, R600_Reg128>;
+def : BitConvert <v4i32, v4f32, R600_Reg128>;
+
+// DWORDADDR pattern
+def : DwordAddrPat  <i32, R600_Reg32>;
+
+} // End isR600toCayman Predicate
+
+let Predicates = [isR600] in {
+// Intrinsic patterns
+defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>;
+defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>;
+} // End isR600
+
+def getLDSNoRetOp : InstrMapping {
+  let FilterClass = "R600_LDS_1A1D";
+  let RowFields = ["BaseOp"];
+  let ColFields = ["DisableEncoding"];
+  let KeyCol = ["$dst"];
+  let ValueCols = [[""""]];
+}
diff --git a/contrib/llvm/lib/Target/R600/R600Intrinsics.td b/contrib/llvm/lib/Target/R600/R600Intrinsics.td
new file mode 100644
index 0000000..9681747
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Intrinsics.td
@@ -0,0 +1,75 @@
+//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "R600", isTarget = 1 in {
+  class TextureIntrinsicFloatInput :
+    Intrinsic<[llvm_v4f32_ty], [
+      llvm_v4f32_ty, // Coord
+      llvm_i32_ty, // offset_x
+      llvm_i32_ty, // offset_y,
+      llvm_i32_ty, // offset_z,
+      llvm_i32_ty, // resource_id
+      llvm_i32_ty, // samplerid
+      llvm_i32_ty, // coord_type_x
+      llvm_i32_ty, // coord_type_y
+      llvm_i32_ty, // coord_type_z
+      llvm_i32_ty // coord_type_w
+    ], [IntrNoMem]>;
+  class TextureIntrinsicInt32Input :
+    Intrinsic<[llvm_v4i32_ty], [
+      llvm_v4i32_ty, // Coord
+      llvm_i32_ty, // offset_x
+      llvm_i32_ty, // offset_y,
+      llvm_i32_ty, // offset_z,
+      llvm_i32_ty, // resource_id
+      llvm_i32_ty, // samplerid
+      llvm_i32_ty, // coord_type_x
+      llvm_i32_ty, // coord_type_y
+      llvm_i32_ty, // coord_type_z
+      llvm_i32_ty // coord_type_w
+    ], [IntrNoMem]>;
+
+  def int_R600_load_input :
+    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_interp_input :
+    Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_interp_const :
+    Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_R600_interp_xy :
+    Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+def int_R600_interp_zw :
+    Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_R600_load_texbuf :
+    Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_tex : TextureIntrinsicFloatInput;
+  def int_R600_texc : TextureIntrinsicFloatInput;
+  def int_R600_txl : TextureIntrinsicFloatInput;
+  def int_R600_txlc : TextureIntrinsicFloatInput;
+  def int_R600_txb : TextureIntrinsicFloatInput;
+  def int_R600_txbc : TextureIntrinsicFloatInput;
+  def int_R600_txf : TextureIntrinsicInt32Input;
+  def int_R600_ldptr : TextureIntrinsicInt32Input;
+  def int_R600_txq : TextureIntrinsicInt32Input;
+  def int_R600_ddx : TextureIntrinsicFloatInput;
+  def int_R600_ddy : TextureIntrinsicFloatInput;
+  def int_R600_store_swizzle :
+    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_R600_store_stream_output :
+    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_R600_store_pixel_depth :
+      Intrinsic<[], [llvm_float_ty], []>;
+  def int_R600_store_pixel_stencil :
+      Intrinsic<[], [llvm_float_ty], []>;
+  def int_R600_store_dummy :
+      Intrinsic<[], [llvm_i32_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp
new file mode 100644
index 0000000..01105c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -0,0 +1,20 @@
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+
+// Pin the vtable to this file.
+void R600MachineFunctionInfo::anchor() {}
+
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF) { }
diff --git a/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h
new file mode 100644
index 0000000..b0ae22e
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h
@@ -0,0 +1,34 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINEFUNCTIONINFO_H
+#define R600MACHINEFUNCTIONINFO_H
+
+#include "AMDGPUMachineFunction.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include <vector>
+
+namespace llvm {
+
+class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+  void anchor() override;
+public:
+  R600MachineFunctionInfo(const MachineFunction &MF);
+  SmallVector<unsigned, 4> LiveOuts;
+  std::vector<unsigned> IndirectRegs;
+  unsigned StackSize;
+};
+
+} // End llvm namespace
+
+#endif //R600MACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp
new file mode 100644
index 0000000..7ea654c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp
@@ -0,0 +1,467 @@
+//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
+  DAG = static_cast<ScheduleDAGMILive*>(dag);
+  TII = static_cast<const R600InstrInfo*>(DAG->TII);
+  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+  MRI = &DAG->MRI;
+  CurInstKind = IDOther;
+  CurEmitted = 0;
+  OccupedSlotsMask = 31;
+  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
+  InstKindLimit[IDOther] = 32;
+
+  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
+  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
+  AluInstCount = 0;
+  FetchInstCount = 0;
+}
+
+void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
+                                  std::vector<SUnit *> &QDst)
+{
+  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
+  QSrc.clear();
+}
+
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+  assert (GPRCount && "GPRCount cannot be 0");
+  return 248 / GPRCount;
+}
+
+SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
+  SUnit *SU = nullptr;
+  NextInstKind = IDOther;
+
+  IsTopNode = false;
+
+  // check if we might want to switch current clause type
+  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
+      (Available[CurInstKind].empty());
+  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
+      (!Available[IDFetch].empty() || !Available[IDOther].empty());
+
+  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+    // We use the heuristic provided by AMD Accelerated Parallel Processing
+    // OpenCL Programming Guide :
+    // The approx. number of WF that allows TEX inst to hide ALU inst is :
+    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+    float ALUFetchRationEstimate =
+        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+        (FetchInstCount + Available[IDFetch].size());
+    unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+    // We assume the local GPR requirements to be "dominated" by the requirement
+    // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+    // after TEX are indeed likely to consume or generate values from/for the
+    // TEX clause.
+    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+    // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+    // (TODO : use RegisterPressure)
+    // If we are going too use too many GPR, we flush Fetch instruction to lower
+    // register pressure on 128 bits regs.
+    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+      AllowSwitchFromAlu = true;
+  }
+
+  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
+    // try to pick ALU
+    SU = pickAlu();
+    if (!SU && !PhysicalRegCopy.empty()) {
+      SU = PhysicalRegCopy.front();
+      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
+    }
+    if (SU) {
+      if (CurEmitted >= InstKindLimit[IDAlu])
+        CurEmitted = 0;
+      NextInstKind = IDAlu;
+    }
+  }
+
+  if (!SU) {
+    // try to pick FETCH
+    SU = pickOther(IDFetch);
+    if (SU)
+      NextInstKind = IDFetch;
+  }
+
+  // try to pick other
+  if (!SU) {
+    SU = pickOther(IDOther);
+    if (SU)
+      NextInstKind = IDOther;
+  }
+
+  DEBUG(
+      if (SU) {
+        dbgs() << " ** Pick node **\n";
+        SU->dump(DAG);
+      } else {
+        dbgs() << "NO NODE \n";
+        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+          const SUnit &S = DAG->SUnits[i];
+          if (!S.isScheduled)
+            S.dump(DAG);
+        }
+      }
+  );
+
+  return SU;
+}
+
+void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  if (NextInstKind != CurInstKind) {
+    DEBUG(dbgs() << "Instruction Type Switch\n");
+    if (NextInstKind != IDAlu)
+      OccupedSlotsMask |= 31;
+    CurEmitted = 0;
+    CurInstKind = NextInstKind;
+  }
+
+  if (CurInstKind == IDAlu) {
+    AluInstCount ++;
+    switch (getAluKind(SU)) {
+    case AluT_XYZW:
+      CurEmitted += 4;
+      break;
+    case AluDiscarded:
+      break;
+    default: {
+      ++CurEmitted;
+      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
+          E = SU->getInstr()->operands_end(); It != E; ++It) {
+        MachineOperand &MO = *It;
+        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+          ++CurEmitted;
+      }
+    }
+    }
+  } else {
+    ++CurEmitted;
+  }
+
+
+  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+
+  if (CurInstKind != IDFetch) {
+    MoveUnits(Pending[IDFetch], Available[IDFetch]);
+  } else
+    FetchInstCount++;
+}
+
+static bool
+isPhysicalRegCopy(MachineInstr *MI) {
+  if (MI->getOpcode() != AMDGPU::COPY)
+    return false;
+
+  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+}
+
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+  DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
+}
+
+void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+  DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+  if (isPhysicalRegCopy(SU->getInstr())) {
+    PhysicalRegCopy.push_back(SU);
+    return;
+  }
+
+  int IK = getInstKind(SU);
+
+  // There is no export clause, we can schedule one as soon as its ready
+  if (IK == IDOther)
+    Available[IDOther].push_back(SU);
+  else
+    Pending[IK].push_back(SU);
+
+}
+
+bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+                                          const TargetRegisterClass *RC) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    return RC->contains(Reg);
+  } else {
+    return MRI->getRegClass(Reg) == RC;
+  }
+}
+
+R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+
+  if (TII->isTransOnly(MI))
+    return AluTrans;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::PRED_X:
+      return AluPredX;
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
+      return AluT_XYZW;
+    case AMDGPU::COPY:
+      if (MI->getOperand(1).isUndef()) {
+        // MI will become a KILL, don't considers it in scheduling
+        return AluDiscarded;
+      }
+    default:
+      break;
+    }
+
+    // Does the instruction take a whole IG ?
+    // XXX: Is it possible to add a helper function in R600InstrInfo that can
+    // be used here and in R600PacketizerList::isSoloInstruction() ?
+    if(TII->isVector(*MI) ||
+        TII->isCubeOp(MI->getOpcode()) ||
+        TII->isReductionOp(MI->getOpcode()) ||
+        MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+      return AluT_XYZW;
+    }
+
+    if (TII->isLDSInstr(MI->getOpcode())) {
+      return AluT_X;
+    }
+
+    // Is the result already assigned to a channel ?
+    unsigned DestSubReg = MI->getOperand(0).getSubReg();
+    switch (DestSubReg) {
+    case AMDGPU::sub0:
+      return AluT_X;
+    case AMDGPU::sub1:
+      return AluT_Y;
+    case AMDGPU::sub2:
+      return AluT_Z;
+    case AMDGPU::sub3:
+      return AluT_W;
+    default:
+      break;
+    }
+
+    // Is the result already member of a X/Y/Z/W class ?
+    unsigned DestReg = MI->getOperand(0).getReg();
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+        regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+      return AluT_X;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+      return AluT_Y;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+      return AluT_Z;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+      return AluT_W;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+      return AluT_XYZW;
+
+    // LDS src registers cannot be used in the Trans slot.
+    if (TII->readsLDSSrcReg(MI))
+      return AluT_XYZW;
+
+    return AluAny;
+
+}
+
+int R600SchedStrategy::getInstKind(SUnit* SU) {
+  int Opcode = SU->getInstr()->getOpcode();
+
+  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
+    return IDFetch;
+
+  if (TII->isALUInstr(Opcode)) {
+    return IDAlu;
+  }
+
+  switch (Opcode) {
+  case AMDGPU::PRED_X:
+  case AMDGPU::COPY:
+  case AMDGPU::CONST_COPY:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::DOT_4:
+    return IDAlu;
+  default:
+    return IDOther;
+  }
+}
+
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
+  if (Q.empty())
+    return nullptr;
+  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
+      It != E; ++It) {
+    SUnit *SU = *It;
+    InstructionsGroupCandidate.push_back(SU->getInstr());
+    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
+        && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
+    ) {
+      InstructionsGroupCandidate.pop_back();
+      Q.erase((It + 1).base());
+      return SU;
+    } else {
+      InstructionsGroupCandidate.pop_back();
+    }
+  }
+  return nullptr;
+}
+
+void R600SchedStrategy::LoadAlu() {
+  std::vector<SUnit *> &QSrc = Pending[IDAlu];
+  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
+    AluKind AK = getAluKind(QSrc[i]);
+    AvailableAlus[AK].push_back(QSrc[i]);
+  }
+  QSrc.clear();
+}
+
+void R600SchedStrategy::PrepareNextSlot() {
+  DEBUG(dbgs() << "New Slot\n");
+  assert (OccupedSlotsMask && "Slot wasn't filled");
+  OccupedSlotsMask = 0;
+//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+//    OccupedSlotsMask |= 16;
+  InstructionsGroupCandidate.clear();
+  LoadAlu();
+}
+
+void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
+  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+  if (DstIndex == -1) {
+    return;
+  }
+  unsigned DestReg = MI->getOperand(DstIndex).getReg();
+  // PressureRegister crashes if an operand is def and used in the same inst
+  // and we try to constraint its regclass
+  for (MachineInstr::mop_iterator It = MI->operands_begin(),
+      E = MI->operands_end(); It != E; ++It) {
+    MachineOperand &MO = *It;
+    if (MO.isReg() && !MO.isDef() &&
+        MO.getReg() == DestReg)
+      return;
+  }
+  // Constrains the regclass of DestReg to assign it to Slot
+  switch (Slot) {
+  case 0:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+    break;
+  case 1:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+    break;
+  case 2:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+    break;
+  case 3:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+    break;
+  }
+}
+
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
+  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
+  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
+  if (SlotedSU)
+    return SlotedSU;
+  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
+  if (UnslotedSU)
+    AssignSlot(UnslotedSU->getInstr(), Slot);
+  return UnslotedSU;
+}
+
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
+      AvailableAlus[AluPredX].size();
+}
+
+SUnit* R600SchedStrategy::pickAlu() {
+  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
+    if (!OccupedSlotsMask) {
+      // Bottom up scheduling : predX must comes first
+      if (!AvailableAlus[AluPredX].empty()) {
+        OccupedSlotsMask |= 31;
+        return PopInst(AvailableAlus[AluPredX], false);
+      }
+      // Flush physical reg copies (RA will discard them)
+      if (!AvailableAlus[AluDiscarded].empty()) {
+        OccupedSlotsMask |= 31;
+        return PopInst(AvailableAlus[AluDiscarded], false);
+      }
+      // If there is a T_XYZW alu available, use it
+      if (!AvailableAlus[AluT_XYZW].empty()) {
+        OccupedSlotsMask |= 15;
+        return PopInst(AvailableAlus[AluT_XYZW], false);
+      }
+    }
+    bool TransSlotOccuped = OccupedSlotsMask & 16;
+    if (!TransSlotOccuped && VLIW5) {
+      if (!AvailableAlus[AluTrans].empty()) {
+        OccupedSlotsMask |= 16;
+        return PopInst(AvailableAlus[AluTrans], false);
+      }
+      SUnit *SU = AttemptFillSlot(3, true);
+      if (SU) {
+        OccupedSlotsMask |= 16;
+        return SU;
+      }
+    }
+    for (int Chan = 3; Chan > -1; --Chan) {
+      bool isOccupied = OccupedSlotsMask & (1 << Chan);
+      if (!isOccupied) {
+        SUnit *SU = AttemptFillSlot(Chan, false);
+        if (SU) {
+          OccupedSlotsMask |= (1 << Chan);
+          InstructionsGroupCandidate.push_back(SU->getInstr());
+          return SU;
+        }
+      }
+    }
+    PrepareNextSlot();
+  }
+  return nullptr;
+}
+
+SUnit* R600SchedStrategy::pickOther(int QID) {
+  SUnit *SU = nullptr;
+  std::vector<SUnit *> &AQ = Available[QID];
+
+  if (AQ.empty()) {
+    MoveUnits(Pending[QID], AQ);
+  }
+  if (!AQ.empty()) {
+    SU = AQ.back();
+    AQ.resize(AQ.size() - 1);
+  }
+  return SU;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600MachineScheduler.h b/contrib/llvm/lib/Target/R600/R600MachineScheduler.h
new file mode 100644
index 0000000..fd475af
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineScheduler.h
@@ -0,0 +1,103 @@
+//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINESCHEDULER_H_
+#define R600MACHINESCHEDULER_H_
+
+#include "R600InstrInfo.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class R600SchedStrategy : public MachineSchedStrategy {
+
+  const ScheduleDAGMILive *DAG;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+  enum InstKind {
+    IDAlu,
+    IDFetch,
+    IDOther,
+    IDLast
+  };
+
+  enum AluKind {
+    AluAny,
+    AluT_X,
+    AluT_Y,
+    AluT_Z,
+    AluT_W,
+    AluT_XYZW,
+    AluPredX,
+    AluTrans,
+    AluDiscarded, // LLVM Instructions that are going to be eliminated
+    AluLast
+  };
+
+  std::vector<SUnit *> Available[IDLast], Pending[IDLast];
+  std::vector<SUnit *> AvailableAlus[AluLast];
+  std::vector<SUnit *> PhysicalRegCopy;
+
+  InstKind CurInstKind;
+  int CurEmitted;
+  InstKind NextInstKind;
+
+  unsigned AluInstCount;
+  unsigned FetchInstCount;
+
+  int InstKindLimit[IDLast];
+
+  int OccupedSlotsMask;
+
+public:
+  R600SchedStrategy() :
+    DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) {
+  }
+
+  virtual ~R600SchedStrategy() {}
+
+  void initialize(ScheduleDAGMI *dag) override;
+  SUnit *pickNode(bool &IsTopNode) override;
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+  void releaseTopNode(SUnit *SU) override;
+  void releaseBottomNode(SUnit *SU) override;
+
+private:
+  std::vector<MachineInstr *> InstructionsGroupCandidate;
+  bool VLIW5;
+
+  int getInstKind(SUnit *SU);
+  bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+  AluKind getAluKind(SUnit *SU) const;
+  void LoadAlu();
+  unsigned AvailablesAluCount() const;
+  SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu);
+  void PrepareNextSlot();
+  SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU);
+
+  void AssignSlot(MachineInstr *MI, unsigned Slot);
+  SUnit* pickAlu();
+  SUnit* pickOther(int QID);
+  void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
+};
+
+} // namespace llvm
+
+#endif /* R600MACHINESCHEDULER_H_ */
diff --git a/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp
new file mode 100644
index 0000000..2314136
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -0,0 +1,382 @@
+//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass merges inputs of swizzeable instructions into vector sharing
+/// common data and/or have enough undef subreg using swizzle abilities.
+///
+/// For instance let's consider the following pseudo code :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+///
+/// is turned into :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+///
+/// This allow regalloc to reduce register pressure for vector registers and
+/// to reduce MOV count.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vec-merger"
+
+namespace {
+
+static bool
+isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+  for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg),
+      E = MRI.def_instr_end(); It != E; ++It) {
+    return (*It).isImplicitDef();
+  }
+  if (MRI.isReserved(Reg)) {
+    return false;
+  }
+  llvm_unreachable("Reg without a def");
+  return false;
+}
+
+class RegSeqInfo {
+public:
+  MachineInstr *Instr;
+  DenseMap<unsigned, unsigned> RegToChan;
+  std::vector<unsigned> UndefReg;
+  RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
+    assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+    for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
+      MachineOperand &MO = Instr->getOperand(i);
+      unsigned Chan = Instr->getOperand(i + 1).getImm();
+      if (isImplicitlyDef(MRI, MO.getReg()))
+        UndefReg.push_back(Chan);
+      else
+        RegToChan[MO.getReg()] = Chan;
+    }
+  }
+  RegSeqInfo() {}
+
+  bool operator==(const RegSeqInfo &RSI) const {
+    return RSI.Instr == Instr;
+  }
+};
+
+class R600VectorRegMerger : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const R600InstrInfo *TII;
+  bool canSwizzle(const MachineInstr &) const;
+  bool areAllUsesSwizzeable(unsigned Reg) const;
+  void SwizzleInput(MachineInstr &,
+      const std::vector<std::pair<unsigned, unsigned> > &) const;
+  bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
+      std::vector<std::pair<unsigned, unsigned> > &Remap) const;
+  bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+      std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+  bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+      std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+  MachineInstr *RebuildVector(RegSeqInfo *MI,
+      const RegSeqInfo *BaseVec,
+      const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
+  void RemoveMI(MachineInstr *);
+  void trackRSI(const RegSeqInfo &RSI);
+
+  typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
+  DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+  InstructionSetMap PreviousRegSeqByReg;
+  InstructionSetMap PreviousRegSeqByUndefCount;
+public:
+  static char ID;
+  R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+  TII(nullptr) { }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const char *getPassName() const override {
+    return "R600 Vector Registers Merge Pass";
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+char R600VectorRegMerger::ID = 0;
+
+bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
+    const {
+  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+    return true;
+  switch (MI.getOpcode()) {
+  case AMDGPU::R600_ExportSwz:
+  case AMDGPU::EG_ExportSwz:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
+    RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
+    const {
+  unsigned CurrentUndexIdx = 0;
+  for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+      E = ToMerge->RegToChan.end(); It != E; ++It) {
+    DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+        Untouched->RegToChan.find((*It).first);
+    if (PosInUntouched != Untouched->RegToChan.end()) {
+      Remap.push_back(std::pair<unsigned, unsigned>
+          ((*It).second, (*PosInUntouched).second));
+      continue;
+    }
+    if (CurrentUndexIdx >= Untouched->UndefReg.size())
+      return false;
+    Remap.push_back(std::pair<unsigned, unsigned>
+        ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
+  }
+
+  return true;
+}
+
+static
+unsigned getReassignedChan(
+    const std::vector<std::pair<unsigned, unsigned> > &RemapChan,
+    unsigned Chan) {
+  for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
+    if (RemapChan[j].first == Chan)
+      return RemapChan[j].second;
+  }
+  llvm_unreachable("Chan wasn't reassigned");
+}
+
+MachineInstr *R600VectorRegMerger::RebuildVector(
+    RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
+    const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+  unsigned Reg = RSI->Instr->getOperand(0).getReg();
+  MachineBasicBlock::iterator Pos = RSI->Instr;
+  MachineBasicBlock &MBB = *Pos->getParent();
+  DebugLoc DL = Pos->getDebugLoc();
+
+  unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+  DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+  std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
+  for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+      E = RSI->RegToChan.end(); It != E; ++It) {
+    unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned SubReg = (*It).first;
+    unsigned Swizzle = (*It).second;
+    unsigned Chan = getReassignedChan(RemapChan, Swizzle);
+
+    MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+        DstReg)
+        .addReg(SrcVec)
+        .addReg(SubReg)
+        .addImm(Chan);
+    UpdatedRegToChan[SubReg] = Chan;
+    std::vector<unsigned>::iterator ChanPos =
+        std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan);
+    if (ChanPos != UpdatedUndef.end())
+      UpdatedUndef.erase(ChanPos);
+    assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) ==
+               UpdatedUndef.end() &&
+           "UpdatedUndef shouldn't contain Chan more than once!");
+    DEBUG(dbgs() << "    ->"; Tmp->dump(););
+    (void)Tmp;
+    SrcVec = DstReg;
+  }
+  Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
+      .addReg(SrcVec);
+  DEBUG(dbgs() << "    ->"; Pos->dump(););
+
+  DEBUG(dbgs() << "  Updating Swizzle:\n");
+  for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+      E = MRI->use_instr_end(); It != E; ++It) {
+    DEBUG(dbgs() << "    ";(*It).dump(); dbgs() << "    ->");
+    SwizzleInput(*It, RemapChan);
+    DEBUG((*It).dump());
+  }
+  RSI->Instr->eraseFromParent();
+
+  // Update RSI
+  RSI->Instr = Pos;
+  RSI->RegToChan = UpdatedRegToChan;
+  RSI->UndefReg = UpdatedUndef;
+
+  return Pos;
+}
+
+void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
+  for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
+      E = PreviousRegSeqByReg.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+  }
+  for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
+      E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+  }
+}
+
+void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
+    const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+  unsigned Offset;
+  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+    Offset = 2;
+  else
+    Offset = 3;
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
+    for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
+      if (RemapChan[j].first == Swizzle) {
+        MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
+        break;
+      }
+    }
+  }
+}
+
+bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+  for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+      E = MRI->use_instr_end(); It != E; ++It) {
+    if (!canSwizzle(*It))
+      return false;
+  }
+  return true;
+}
+
+bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+  for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
+      MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
+    if (!MOp->isReg())
+      continue;
+    if (PreviousRegSeqByReg[MOp->getReg()].empty())
+      continue;
+    std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
+    for (unsigned i = 0, e = MIs.size(); i < e; i++) {
+      CompatibleRSI = PreviousRegSeq[MIs[i]];
+      if (RSI == CompatibleRSI)
+        continue;
+      if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+  unsigned NeededUndefs = 4 - RSI.UndefReg.size();
+  if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
+    return false;
+  std::vector<MachineInstr *> &MIs =
+      PreviousRegSeqByUndefCount[NeededUndefs];
+  CompatibleRSI = PreviousRegSeq[MIs.back()];
+  tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
+  return true;
+}
+
+void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
+  for (DenseMap<unsigned, unsigned>::const_iterator
+  It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
+    PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
+  }
+  PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
+  PreviousRegSeq[RSI.Instr] = RSI;
+}
+
+bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo());
+  MRI = &(Fn.getRegInfo());
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock *MB = MBB;
+    PreviousRegSeq.clear();
+    PreviousRegSeqByReg.clear();
+    PreviousRegSeqByUndefCount.clear();
+
+    for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
+         MII != MIIE; ++MII) {
+      MachineInstr *MI = MII;
+      if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) {
+        if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
+          unsigned Reg = MI->getOperand(1).getReg();
+          for (MachineRegisterInfo::def_instr_iterator
+               It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
+               It != E; ++It) {
+            RemoveMI(&(*It));
+          }
+        }
+        continue;
+      }
+
+
+      RegSeqInfo RSI(*MRI, MI);
+
+      // All uses of MI are swizzeable ?
+      unsigned Reg = MI->getOperand(0).getReg();
+      if (!areAllUsesSwizzeable(Reg))
+        continue;
+
+      DEBUG (dbgs() << "Trying to optimize ";
+          MI->dump();
+      );
+
+      RegSeqInfo CandidateRSI;
+      std::vector<std::pair<unsigned, unsigned> > RemapChan;
+      DEBUG(dbgs() << "Using common slots...\n";);
+      if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
+        // Remove CandidateRSI mapping
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        continue;
+      }
+      DEBUG(dbgs() << "Using free slots...\n";);
+      RemapChan.clear();
+      if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        continue;
+      }
+      //Failed to merge
+      trackRSI(RSI);
+    }
+  }
+  return false;
+}
+
+}
+
+llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
+  return new R600VectorRegMerger(tm);
+}
diff --git a/contrib/llvm/lib/Target/R600/R600Packetizer.cpp b/contrib/llvm/lib/Target/R600/R600Packetizer.cpp
new file mode 100644
index 0000000..74cf309
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Packetizer.cpp
@@ -0,0 +1,409 @@
+//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass implements instructions packetization for R600. It unsets isLast
+/// bit of instructions inside a bundle and substitutes src register with
+/// PreviousVector when applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+namespace {
+
+class R600Packetizer : public MachineFunctionPass {
+
+public:
+  static char ID;
+  R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const char *getPassName() const override {
+    return "R600 Packetizer";
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+char R600Packetizer::ID = 0;
+
+class R600PacketizerList : public VLIWPacketizerList {
+
+private:
+  const R600InstrInfo *TII;
+  const R600RegisterInfo &TRI;
+  bool VLIW5;
+  bool ConsideredInstUsesAlreadyWrittenVectorElement;
+
+  unsigned getSlot(const MachineInstr *MI) const {
+    return TRI.getHWRegChan(MI->getOperand(0).getReg());
+  }
+
+  /// \returns register to PV chan mapping for bundle/single instructions that
+  /// immediately precedes I.
+  DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
+      const {
+    DenseMap<unsigned, unsigned> Result;
+    I--;
+    if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle())
+      return Result;
+    MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+    if (I->isBundle())
+      BI++;
+    int LastDstChan = -1;
+    do {
+      bool isTrans = false;
+      int BISlot = getSlot(BI);
+      if (LastDstChan >= BISlot)
+        isTrans = true;
+      LastDstChan = BISlot;
+      if (TII->isPredicated(BI))
+        continue;
+      int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
+      if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
+        continue;
+      int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
+      if (DstIdx == -1) {
+        continue;
+      }
+      unsigned Dst = BI->getOperand(DstIdx).getReg();
+      if (isTrans || TII->isTransOnly(BI)) {
+        Result[Dst] = AMDGPU::PS;
+        continue;
+      }
+      if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
+          BI->getOpcode() == AMDGPU::DOT4_eg) {
+        Result[Dst] = AMDGPU::PV_X;
+        continue;
+      }
+      if (Dst == AMDGPU::OQAP) {
+        continue;
+      }
+      unsigned PVReg = 0;
+      switch (TRI.getHWRegChan(Dst)) {
+      case 0:
+        PVReg = AMDGPU::PV_X;
+        break;
+      case 1:
+        PVReg = AMDGPU::PV_Y;
+        break;
+      case 2:
+        PVReg = AMDGPU::PV_Z;
+        break;
+      case 3:
+        PVReg = AMDGPU::PV_W;
+        break;
+      default:
+        llvm_unreachable("Invalid Chan");
+      }
+      Result[Dst] = PVReg;
+    } while ((++BI)->isBundledWithPred());
+    return Result;
+  }
+
+  void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
+      const {
+    unsigned Ops[] = {
+      AMDGPU::OpName::src0,
+      AMDGPU::OpName::src1,
+      AMDGPU::OpName::src2
+    };
+    for (unsigned i = 0; i < 3; i++) {
+      int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
+      if (OperandIdx < 0)
+        continue;
+      unsigned Src = MI->getOperand(OperandIdx).getReg();
+      const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
+      if (It != PVs.end())
+        MI->getOperand(OperandIdx).setReg(It->second);
+    }
+  }
+public:
+  // Ctor.
+  R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                        MachineDominatorTree &MDT)
+  : VLIWPacketizerList(MF, MLI, MDT, true),
+    TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
+    TRI(TII->getRegisterInfo()) {
+    VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+  }
+
+  // initPacketizerState - initialize some internal flags.
+  void initPacketizerState() override {
+    ConsideredInstUsesAlreadyWrittenVectorElement = false;
+  }
+
+  // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+  bool ignorePseudoInstruction(MachineInstr *MI,
+                               MachineBasicBlock *MBB) override {
+    return false;
+  }
+
+  // isSoloInstruction - return true if instruction MI can not be packetized
+  // with any other instruction, which means that MI itself is a packet.
+  bool isSoloInstruction(MachineInstr *MI) override {
+    if (TII->isVector(*MI))
+      return true;
+    if (!TII->isALUInstr(MI->getOpcode()))
+      return true;
+    if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
+      return true;
+    // XXX: This can be removed once the packetizer properly handles all the
+    // LDS instruction group restrictions.
+    if (TII->isLDSInstr(MI->getOpcode()))
+      return true;
+    return false;
+  }
+
+  // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+  // together.
+  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
+    MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
+    if (getSlot(MII) == getSlot(MIJ))
+      ConsideredInstUsesAlreadyWrittenVectorElement = true;
+    // Does MII and MIJ share the same pred_sel ?
+    int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
+        OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
+    unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
+        PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
+    if (PredI != PredJ)
+      return false;
+    if (SUJ->isSucc(SUI)) {
+      for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) {
+        const SDep &Dep = SUJ->Succs[i];
+        if (Dep.getSUnit() != SUI)
+          continue;
+        if (Dep.getKind() == SDep::Anti)
+          continue;
+        if (Dep.getKind() == SDep::Output)
+          if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg())
+            continue;
+        return false;
+      }
+    }
+
+    bool ARDef = TII->definesAddressRegister(MII) ||
+                 TII->definesAddressRegister(MIJ);
+    bool ARUse = TII->usesAddressRegister(MII) ||
+                 TII->usesAddressRegister(MIJ);
+    if (ARDef && ARUse)
+      return false;
+
+    return true;
+  }
+
+  // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+  // and SUJ.
+  bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
+    return false;
+  }
+
+  void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
+    unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
+    MI->getOperand(LastOp).setImm(Bit);
+  }
+
+  bool isBundlableWithCurrentPMI(MachineInstr *MI,
+                                 const DenseMap<unsigned, unsigned> &PV,
+                                 std::vector<R600InstrInfo::BankSwizzle> &BS,
+                                 bool &isTransSlot) {
+    isTransSlot = TII->isTransOnly(MI);
+    assert (!isTransSlot || VLIW5);
+
+    // Is the dst reg sequence legal ?
+    if (!isTransSlot && !CurrentPacketMIs.empty()) {
+      if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
+        if (ConsideredInstUsesAlreadyWrittenVectorElement  &&
+            !TII->isVectorOnly(MI) && VLIW5) {
+          isTransSlot = true;
+          DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
+        }
+        else
+          return false;
+      }
+    }
+
+    // Are the Constants limitations met ?
+    CurrentPacketMIs.push_back(MI);
+    if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+      DEBUG(
+        dbgs() << "Couldn't pack :\n";
+        MI->dump();
+        dbgs() << "with the following packets :\n";
+        for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+          CurrentPacketMIs[i]->dump();
+          dbgs() << "\n";
+        }
+        dbgs() << "because of Consts read limitations\n";
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // Is there a BankSwizzle set that meet Read Port limitations ?
+    if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+            PV, BS, isTransSlot)) {
+      DEBUG(
+        dbgs() << "Couldn't pack :\n";
+        MI->dump();
+        dbgs() << "with the following packets :\n";
+        for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+          CurrentPacketMIs[i]->dump();
+          dbgs() << "\n";
+        }
+        dbgs() << "because of Read port limitations\n";
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // We cannot read LDS source registrs from the Trans slot.
+    if (isTransSlot && TII->readsLDSSrcReg(MI))
+      return false;
+
+    CurrentPacketMIs.pop_back();
+    return true;
+  }
+
+  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override {
+    MachineBasicBlock::iterator FirstInBundle =
+        CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+    const DenseMap<unsigned, unsigned> &PV =
+        getPreviousVector(FirstInBundle);
+    std::vector<R600InstrInfo::BankSwizzle> BS;
+    bool isTransSlot;
+
+    if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
+      for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
+        MachineInstr *MI = CurrentPacketMIs[i];
+        unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+            AMDGPU::OpName::bank_swizzle);
+        MI->getOperand(Op).setImm(BS[i]);
+      }
+      unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+          AMDGPU::OpName::bank_swizzle);
+      MI->getOperand(Op).setImm(BS.back());
+      if (!CurrentPacketMIs.empty())
+        setIsLastBit(CurrentPacketMIs.back(), 0);
+      substitutePV(MI, PV);
+      MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
+      if (isTransSlot) {
+        endPacket(std::next(It)->getParent(), std::next(It));
+      }
+      return It;
+    }
+    endPacket(MI->getParent(), MI);
+    if (TII->isTransOnly(MI))
+      return MI;
+    return VLIWPacketizerList::addToPacket(MI);
+  }
+};
+
+bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+  // Instantiate the packetizer.
+  R600PacketizerList Packetizer(Fn, MLI, MDT);
+
+  // DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions
+  // These instructions confuse the dependence analysis. Consider:
+  // D0 = ...   (Insn 0)
+  // R0 = KILL R0, D0 (Insn 1)
+  // R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization
+  //
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock::iterator End = MBB->end();
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != End) {
+      if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
+          (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
+        MachineBasicBlock::iterator DeleteMI = MI;
+        ++MI;
+        MBB->erase(DeleteMI);
+        End = MBB->end();
+        continue;
+      }
+      ++MI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    // Find scheduling regions and schedule / packetize each region.
+    unsigned RemainingCount = MBB->size();
+    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+        RegionEnd != MBB->begin();) {
+      // The next region starts above the previous region. Look backward in the
+      // instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for(;I != MBB->begin(); --I, --RemainingCount) {
+        if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
+          break;
+      }
+      I = MBB->begin();
+
+      // Skip empty scheduling regions.
+      if (I == RegionEnd) {
+        RegionEnd = std::prev(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (I == std::prev(RegionEnd)) {
+        RegionEnd = std::prev(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+      RegionEnd = I;
+    }
+  }
+
+  return true;
+
+}
+
+} // end anonymous namespace
+
+llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
+  return new R600Packetizer(tm);
+}
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp
new file mode 100644
index 0000000..dc95675
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp
@@ -0,0 +1,89 @@
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600RegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
+  { RCW.RegWeight = 0; RCW.WeightLimit = 0;}
+
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo());
+
+  Reserved.set(AMDGPU::ZERO);
+  Reserved.set(AMDGPU::HALF);
+  Reserved.set(AMDGPU::ONE);
+  Reserved.set(AMDGPU::ONE_INT);
+  Reserved.set(AMDGPU::NEG_HALF);
+  Reserved.set(AMDGPU::NEG_ONE);
+  Reserved.set(AMDGPU::PV_X);
+  Reserved.set(AMDGPU::ALU_LITERAL_X);
+  Reserved.set(AMDGPU::ALU_CONST);
+  Reserved.set(AMDGPU::PREDICATE_BIT);
+  Reserved.set(AMDGPU::PRED_SEL_OFF);
+  Reserved.set(AMDGPU::PRED_SEL_ZERO);
+  Reserved.set(AMDGPU::PRED_SEL_ONE);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+
+  for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
+                        E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
+    Reserved.set(*I);
+  }
+
+  TII->reserveIndirectRegisters(Reserved, MF);
+
+  return Reserved;
+}
+
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
+  return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
+}
+
+unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return GET_REG_INDEX(getEncodingValue(Reg));
+}
+
+const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
+                                                                   MVT VT) const {
+  switch(VT.SimpleTy) {
+  default:
+  case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
+  }
+}
+
+const RegClassWeight &R600RegisterInfo::getRegClassWeight(
+  const TargetRegisterClass *RC) const {
+  return RCW;
+}
+
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  switch (Reg) {
+  case AMDGPU::OQAP:
+  case AMDGPU::OQBP:
+  case AMDGPU::AR_X:
+    return false;
+  default:
+    return true;
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.h b/contrib/llvm/lib/Target/R600/R600RegisterInfo.h
new file mode 100644
index 0000000..247808b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.h
@@ -0,0 +1,49 @@
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600RegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600REGISTERINFO_H_
+#define R600REGISTERINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+
+struct R600RegisterInfo : public AMDGPURegisterInfo {
+  RegClassWeight RCW;
+
+  R600RegisterInfo(const AMDGPUSubtarget &st);
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  /// \brief get the HW encoding for a register's channel.
+  unsigned getHWRegChan(unsigned reg) const;
+
+  unsigned getHWRegIndex(unsigned Reg) const override;
+
+  /// \brief get the register class of the specified type to use in the
+  /// CFGStructurizer
+  const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
+
+  const RegClassWeight &
+    getRegClassWeight(const TargetRegisterClass *RC) const override;
+
+  // \returns true if \p Reg can be defined in one ALU caluse and used in another.
+  bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+};
+
+} // End namespace llvm
+
+#endif // AMDIDSAREGISTERINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.td b/contrib/llvm/lib/Target/R600/R600RegisterInfo.td
new file mode 100644
index 0000000..cc667d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.td
@@ -0,0 +1,252 @@
+
+class R600Reg <string name, bits<16> encoding> : Register<name> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+class R600RegWithChan <string name, bits<9> sel, string chan> :
+    Register <name> {
+
+  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
+                                !if(!eq(chan, "Y"), 1,
+                                !if(!eq(chan, "Z"), 2,
+                                !if(!eq(chan, "W"), 3, 0))));
+  let HWEncoding{8-0}  = sel;
+  let HWEncoding{10-9} = chan_encoding;
+  let Namespace = "AMDGPU";
+}
+
+class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  field bits<2> chan_encoding = 0;
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1, sub2, sub3];
+  let HWEncoding{8-0} = encoding{8-0};
+  let HWEncoding{10-9} = chan_encoding;
+}
+
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  field bits<2> chan_encoding = 0;
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+  let HWEncoding{8-0} = encoding{8-0};
+  let HWEncoding{10-9} = chan_encoding;
+}
+
+class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
+  "V"#lo#hi#"_"#chan,
+  [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
+  lo
+>;
+
+foreach Index = 0-127 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
+
+    // Indirect addressing offset registers
+    def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
+                                              Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y"),
+                                    !cast<Register>("T"#Index#"_Z"),
+                                    !cast<Register>("T"#Index#"_W")],
+                                   Index>;
+
+  def T#Index#_XY : R600Reg_64 <"T"#Index#"",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y")],
+                                   Index>;
+}
+
+foreach Chan = [ "X", "Y", "Z", "W"] in {
+
+  let chan_encoding = !if(!eq(Chan, "X"), 0,
+                      !if(!eq(Chan, "Y"), 1,
+                      !if(!eq(Chan, "Z"), 2,
+                      !if(!eq(Chan, "W"), 3, 0)))) in {
+    def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
+                                   [!cast<Register>("T0_"#Chan),
+                                    !cast<Register>("T1_"#Chan),
+                                    !cast<Register>("T2_"#Chan),
+                                    !cast<Register>("T3_"#Chan)],
+                                    0>;
+    def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
+    def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
+  }
+}
+
+
+// KCACHE_BANK0
+foreach Index = 159-128 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW",
+                                 [!cast<Register>("KC0_"#Index#"_X"),
+                                  !cast<Register>("KC0_"#Index#"_Y"),
+                                  !cast<Register>("KC0_"#Index#"_Z"),
+                                  !cast<Register>("KC0_"#Index#"_W")],
+                                 Index>;
+}
+
+// KCACHE_BANK1
+foreach Index = 191-160 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW",
+                                 [!cast<Register>("KC1_"#Index#"_X"),
+                                  !cast<Register>("KC1_"#Index#"_Y"),
+                                  !cast<Register>("KC1_"#Index#"_Z"),
+                                  !cast<Register>("KC1_"#Index#"_W")],
+                                 Index>;
+}
+
+
+// Array Base Register holding input in FS
+foreach Index = 448-480 in {
+  def ArrayBase#Index :  R600Reg<"ARRAY_BASE", Index>;
+}
+
+
+// Special Registers
+
+def OQA : R600Reg<"OQA", 219>;
+def OQB : R600Reg<"OQB", 220>;
+def OQAP : R600Reg<"OQAP", 221>;
+def OQBP : R600Reg<"OQAP", 222>;
+def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
+def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
+def ZERO : R600Reg<"0.0", 248>;
+def ONE : R600Reg<"1.0", 249>;
+def NEG_ONE : R600Reg<"-1.0", 249>;
+def ONE_INT : R600Reg<"1", 250>;
+def HALF : R600Reg<"0.5", 252>;
+def NEG_HALF : R600Reg<"-0.5", 252>;
+def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
+def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
+def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
+def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
+def PV_X : R600RegWithChan<"PV.X", 254, "X">;
+def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
+def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
+def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
+def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
+def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
+def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
+def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
+def AR_X : R600Reg<"AR.x", 0>;
+
+def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
+                          (add (sequence "ArrayBase%u", 448, 480))>;
+// special registers for ALU src operands
+// const buffer reference, SRCx_SEL contains index
+def ALU_CONST : R600Reg<"CBuf", 0>;
+// interpolation param reference, SRCx_SEL contains index
+def ALU_PARAM : R600Reg<"Param", 0>;
+
+let isAllocatable = 0 in {
+
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
+
+// We only use Addr_[YZW] for vertical vectors.
+// FIXME if we add more vertical vector registers we will need to ad more
+// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
+def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
+def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
+
+def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
+  (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
+
+def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_X", 128, 159))>;
+
+def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_Y", 128, 159))>;
+
+def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_Z", 128, 159))>;
+
+def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_W", 128, 159))>;
+
+def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_KC0_X, R600_KC0_Y,
+                                               R600_KC0_Z, R600_KC0_W)>;
+
+def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_X", 160, 191))>;
+
+def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_Y", 160, 191))>;
+
+def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_Z", 160, 191))>;
+
+def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_W", 160, 191))>;
+
+def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_KC1_X, R600_KC1_Y,
+                                               R600_KC1_Z, R600_KC1_W)>;
+
+} // End isAllocatable = 0
+
+def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_X", 0, 127), AR_X)>;
+
+def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_Y", 0, 127))>;
+
+def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_Z", 0, 127))>;
+
+def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_W", 0, 127))>;
+
+def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_TReg32_X, R600_TReg32_Y,
+                                               R600_TReg32_Z, R600_TReg32_W)>;
+
+def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
+    R600_TReg32,
+    R600_ArrayBase,
+    R600_Addr,
+    R600_KC0, R600_KC1,
+    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
+    ALU_CONST, ALU_PARAM, OQAP
+    )>;
+
+def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
+    PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
+
+def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
+    PREDICATE_BIT)>;
+
+def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+                                (add (sequence "T%u_XYZW", 0, 127))> {
+  let CopyCost = -1;
+}
+
+def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+  (add V0123_W, V0123_Z, V0123_Y, V0123_X)
+>;
+
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                (add (sequence "T%u_XY", 0, 63))>;
+
+def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                      (add V01_X, V01_Y, V01_Z, V01_W,
+                                           V23_X, V23_Y, V23_Z, V23_W)>;
diff --git a/contrib/llvm/lib/Target/R600/R600Schedule.td b/contrib/llvm/lib/Target/R600/R600Schedule.td
new file mode 100644
index 0000000..df62bf8
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Schedule.td
@@ -0,0 +1,49 @@
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 has a VLIW architecture.  On pre-cayman cards there are 5 instruction
+// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS.  For cayman cards, the TRANS
+// slot has been removed. 
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+def XALU : InstrItinClass;
+
+def R600_VLIW5_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+    InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
+    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+  ]
+>;
+
+def R600_VLIW4_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>,
+    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+  ]
+>;
diff --git a/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
new file mode 100644
index 0000000..419ec8b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -0,0 +1,303 @@
+//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass translates tgsi-like texture intrinsics into R600 texture
+/// closer to hardware intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+class R600TextureIntrinsicsReplacer :
+    public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
+  static char ID;
+
+  Module *Mod;
+  Type *FloatType;
+  Type *Int32Type;
+  Type *V4f32Type;
+  Type *V4i32Type;
+  FunctionType *TexSign;
+  FunctionType *TexQSign;
+
+  void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
+                                      unsigned SrcSelect[4], unsigned CT[4],
+                                      bool &useShadowVariant) {
+    enum TextureTypes {
+      TEXTURE_1D = 1,
+      TEXTURE_2D,
+      TEXTURE_3D,
+      TEXTURE_CUBE,
+      TEXTURE_RECT,
+      TEXTURE_SHADOW1D,
+      TEXTURE_SHADOW2D,
+      TEXTURE_SHADOWRECT,
+      TEXTURE_1D_ARRAY,
+      TEXTURE_2D_ARRAY,
+      TEXTURE_SHADOW1D_ARRAY,
+      TEXTURE_SHADOW2D_ARRAY,
+      TEXTURE_SHADOWCUBE,
+      TEXTURE_2D_MSAA,
+      TEXTURE_2D_ARRAY_MSAA,
+      TEXTURE_CUBE_ARRAY,
+      TEXTURE_SHADOWCUBE_ARRAY
+    };
+
+    switch (TextureType) {
+    case 0:
+      useShadowVariant = false;
+      return;
+    case TEXTURE_RECT:
+    case TEXTURE_1D:
+    case TEXTURE_2D:
+    case TEXTURE_3D:
+    case TEXTURE_CUBE:
+    case TEXTURE_1D_ARRAY:
+    case TEXTURE_2D_ARRAY:
+    case TEXTURE_CUBE_ARRAY:
+    case TEXTURE_2D_MSAA:
+    case TEXTURE_2D_ARRAY_MSAA:
+      useShadowVariant = false;
+      break;
+    case TEXTURE_SHADOW1D:
+    case TEXTURE_SHADOW2D:
+    case TEXTURE_SHADOWRECT:
+    case TEXTURE_SHADOW1D_ARRAY:
+    case TEXTURE_SHADOW2D_ARRAY:
+    case TEXTURE_SHADOWCUBE:
+    case TEXTURE_SHADOWCUBE_ARRAY:
+      useShadowVariant = true;
+      break;
+    default:
+      llvm_unreachable("Unknow Texture Type");
+    }
+
+    if (TextureType == TEXTURE_RECT ||
+        TextureType == TEXTURE_SHADOWRECT) {
+      CT[0] = 0;
+      CT[1] = 0;
+    }
+
+    if (TextureType == TEXTURE_CUBE_ARRAY ||
+        TextureType == TEXTURE_SHADOWCUBE_ARRAY)
+      CT[2] = 0;
+
+    if (TextureType == TEXTURE_1D_ARRAY ||
+        TextureType == TEXTURE_SHADOW1D_ARRAY) {
+      if (hasLOD && useShadowVariant) {
+        CT[1] = 0;
+      } else {
+        CT[2] = 0;
+        SrcSelect[2] = 1;
+      }
+    } else if (TextureType == TEXTURE_2D_ARRAY ||
+        TextureType == TEXTURE_SHADOW2D_ARRAY) {
+      CT[2] = 0;
+    }
+
+    if ((TextureType == TEXTURE_SHADOW1D ||
+        TextureType == TEXTURE_SHADOW2D ||
+        TextureType == TEXTURE_SHADOWRECT ||
+        TextureType == TEXTURE_SHADOW1D_ARRAY) &&
+        !(hasLOD && useShadowVariant))
+      SrcSelect[3] = 2;
+  }
+
+  void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
+                       unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
+                       Value *Sampler, unsigned CT[4], Value *Coord) {
+    IRBuilder<> Builder(&I);
+    Constant *Mask[] = {
+      ConstantInt::get(Int32Type, SrcSelect[0]),
+      ConstantInt::get(Int32Type, SrcSelect[1]),
+      ConstantInt::get(Int32Type, SrcSelect[2]),
+      ConstantInt::get(Int32Type, SrcSelect[3])
+    };
+    Value *SwizzleMask = ConstantVector::get(Mask);
+    Value *SwizzledCoord =
+        Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
+
+    Value *Args[] = {
+      SwizzledCoord,
+      Offset[0],
+      Offset[1],
+      Offset[2],
+      Resource,
+      Sampler,
+      ConstantInt::get(Int32Type, CT[0]),
+      ConstantInt::get(Int32Type, CT[1]),
+      ConstantInt::get(Int32Type, CT[2]),
+      ConstantInt::get(Int32Type, CT[3])
+    };
+
+    Function *F = Mod->getFunction(Name);
+    if (!F) {
+      F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
+      F->addFnAttr(Attribute::ReadNone);
+    }
+    I.replaceAllUsesWith(Builder.CreateCall(F, Args));
+    I.eraseFromParent();
+  }
+
+  void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
+                           const char *VanillaInt,
+                           const char *ShadowInt) {
+    Value *Coord = I.getArgOperand(0);
+    Value *ResourceId = I.getArgOperand(1);
+    Value *SamplerId = I.getArgOperand(2);
+
+    unsigned TextureType =
+        dyn_cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+
+    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
+    unsigned CT[4] = {1, 1, 1, 1};
+    Value *Offset[3] = {
+      ConstantInt::get(Int32Type, 0),
+      ConstantInt::get(Int32Type, 0),
+      ConstantInt::get(Int32Type, 0)
+    };
+    bool useShadowVariant;
+
+    getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
+                                   useShadowVariant);
+
+    ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
+                    Offset, ResourceId, SamplerId, CT, Coord);
+  }
+
+  void ReplaceTXF(CallInst &I) {
+    Value *Coord = I.getArgOperand(0);
+    Value *ResourceId = I.getArgOperand(4);
+    Value *SamplerId = I.getArgOperand(5);
+
+    unsigned TextureType =
+        dyn_cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
+
+    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
+    unsigned CT[4] = {1, 1, 1, 1};
+    Value *Offset[3] = {
+      I.getArgOperand(1),
+      I.getArgOperand(2),
+      I.getArgOperand(3),
+    };
+    bool useShadowVariant;
+
+    getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
+                                   useShadowVariant);
+
+    ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
+                    Offset, ResourceId, SamplerId, CT, Coord);
+  }
+
+public:
+  R600TextureIntrinsicsReplacer():
+    FunctionPass(ID) {
+  }
+
+  bool doInitialization(Module &M) override {
+    LLVMContext &Ctx = M.getContext();
+    Mod = &M;
+    FloatType = Type::getFloatTy(Ctx);
+    Int32Type = Type::getInt32Ty(Ctx);
+    V4f32Type = VectorType::get(FloatType, 4);
+    V4i32Type = VectorType::get(Int32Type, 4);
+    Type *ArgsType[] = {
+      V4f32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+    };
+    TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
+    Type *ArgsQType[] = {
+      V4i32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+    };
+    TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
+    return false;
+  }
+
+  bool runOnFunction(Function &F) override {
+    visit(F);
+    return false;
+  }
+
+  const char *getPassName() const override {
+    return "R600 Texture Intrinsics Replacer";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+  }
+
+  void visitCallInst(CallInst &I) {
+    if (!I.getCalledFunction())
+      return;
+
+    StringRef Name = I.getCalledFunction()->getName();
+    if (Name == "llvm.AMDGPU.tex") {
+      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txl") {
+      ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txb") {
+      ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txf") {
+      ReplaceTXF(I);
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txq") {
+      ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.ddx") {
+      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.ddy") {
+      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy");
+      return;
+    }
+  }
+
+};
+
+char R600TextureIntrinsicsReplacer::ID = 0;
+
+}
+
+FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
+  return new R600TextureIntrinsicsReplacer();
+}
diff --git a/contrib/llvm/lib/Target/R600/R700Instructions.td b/contrib/llvm/lib/Target/R600/R700Instructions.td
new file mode 100644
index 0000000..9aad85d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R700Instructions.td
@@ -0,0 +1,21 @@
+//===-- R700Instructions.td - R700 Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to R700 and newer VLIW4/VLIW5 GPUs
+// - Available only on R700 family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
+
+let Predicates = [isR700] in {
+  def SIN_r700 : SIN_Common<0x6E>;
+  def COS_r700 : COS_Common<0x6F>;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
new file mode 100644
index 0000000..91eb60b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -0,0 +1,329 @@
+//===-- SIAnnotateControlFlow.cpp -  ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Annotates the control flow with hardware specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-annotate-control-flow"
+
+namespace {
+
+// Complex types used in this pass
+typedef std::pair<BasicBlock *, Value *> StackEntry;
+typedef SmallVector<StackEntry, 16> StackVector;
+
+// Intrinsic names the control flow is annotated with
+static const char *const IfIntrinsic = "llvm.SI.if";
+static const char *const ElseIntrinsic = "llvm.SI.else";
+static const char *const BreakIntrinsic = "llvm.SI.break";
+static const char *const IfBreakIntrinsic = "llvm.SI.if.break";
+static const char *const ElseBreakIntrinsic = "llvm.SI.else.break";
+static const char *const LoopIntrinsic = "llvm.SI.loop";
+static const char *const EndCfIntrinsic = "llvm.SI.end.cf";
+
+class SIAnnotateControlFlow : public FunctionPass {
+
+  static char ID;
+
+  Type *Boolean;
+  Type *Void;
+  Type *Int64;
+  Type *ReturnStruct;
+
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+  Constant *Int64Zero;
+
+  Constant *If;
+  Constant *Else;
+  Constant *Break;
+  Constant *IfBreak;
+  Constant *ElseBreak;
+  Constant *Loop;
+  Constant *EndCf;
+
+  DominatorTree *DT;
+  StackVector Stack;
+
+  bool isTopOfStack(BasicBlock *BB);
+
+  Value *popSaved();
+
+  void push(BasicBlock *BB, Value *Saved);
+
+  bool isElse(PHINode *Phi);
+
+  void eraseIfUnused(PHINode *Phi);
+
+  void openIf(BranchInst *Term);
+
+  void insertElse(BranchInst *Term);
+
+  Value *handleLoopCondition(Value *Cond, PHINode *Broken);
+
+  void handleLoop(BranchInst *Term);
+
+  void closeControlFlow(BasicBlock *BB);
+
+public:
+  SIAnnotateControlFlow():
+    FunctionPass(ID) { }
+
+  bool doInitialization(Module &M) override;
+
+  bool runOnFunction(Function &F) override;
+
+  const char *getPassName() const override {
+    return "SI annotate control flow";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+};
+
+} // end anonymous namespace
+
+char SIAnnotateControlFlow::ID = 0;
+
+/// \brief Initialize all the types and constants used in the pass
+bool SIAnnotateControlFlow::doInitialization(Module &M) {
+  LLVMContext &Context = M.getContext();
+
+  Void = Type::getVoidTy(Context);
+  Boolean = Type::getInt1Ty(Context);
+  Int64 = Type::getInt64Ty(Context);
+  ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+  Int64Zero = ConstantInt::get(Int64, 0);
+
+  If = M.getOrInsertFunction(
+    IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
+
+  Else = M.getOrInsertFunction(
+    ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
+
+  Break = M.getOrInsertFunction(
+    BreakIntrinsic, Int64, Int64, (Type *)nullptr);
+
+  IfBreak = M.getOrInsertFunction(
+    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
+
+  ElseBreak = M.getOrInsertFunction(
+    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
+
+  Loop = M.getOrInsertFunction(
+    LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
+
+  EndCf = M.getOrInsertFunction(
+    EndCfIntrinsic, Void, Int64, (Type *)nullptr);
+
+  return false;
+}
+
+/// \brief Is BB the last block saved on the stack ?
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
+  return !Stack.empty() && Stack.back().first == BB;
+}
+
+/// \brief Pop the last saved value from the control flow stack
+Value *SIAnnotateControlFlow::popSaved() {
+  return Stack.pop_back_val().second;
+}
+
+/// \brief Push a BB and saved value to the control flow stack
+void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
+  Stack.push_back(std::make_pair(BB, Saved));
+}
+
+/// \brief Can the condition represented by this PHI node treated like
+/// an "Else" block?
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
+  BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
+  for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+    if (Phi->getIncomingBlock(i) == IDom) {
+
+      if (Phi->getIncomingValue(i) != BoolTrue)
+        return false;
+
+    } else {
+      if (Phi->getIncomingValue(i) != BoolFalse)
+        return false;
+
+    }
+  }
+  return true;
+}
+
+// \brief Erase "Phi" if it is not used any more
+void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
+  if (!Phi->hasNUsesOrMore(1))
+    Phi->eraseFromParent();
+}
+
+/// \brief Open a new "If" block
+void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+/// \brief Close the last "If" block and open a new "Else" block
+void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+/// \brief Recursively handle the condition leading to a loop
+Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) {
+  if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
+    BasicBlock *Parent = Phi->getParent();
+    PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+    Value *Ret = NewPhi;
+
+    // Handle all non-constant incoming values first
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+      Value *Incoming = Phi->getIncomingValue(i);
+      BasicBlock *From = Phi->getIncomingBlock(i);
+      if (isa<ConstantInt>(Incoming)) {
+        NewPhi->addIncoming(Broken, From);
+        continue;
+      }
+
+      Phi->setIncomingValue(i, BoolFalse);
+      Value *PhiArg = handleLoopCondition(Incoming, Broken);
+      NewPhi->addIncoming(PhiArg, From);
+    }
+
+    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
+
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+
+      Value *Incoming = Phi->getIncomingValue(i);
+      if (Incoming != BoolTrue)
+        continue;
+
+      BasicBlock *From = Phi->getIncomingBlock(i);
+      if (From == IDom) {
+        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
+          Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
+          Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
+          continue;
+        }
+      }
+      TerminatorInst *Insert = From->getTerminator();
+      Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
+      NewPhi->setIncomingValue(i, PhiArg);
+    }
+    eraseIfUnused(Phi);
+    return Ret;
+
+  } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+    BasicBlock *Parent = Inst->getParent();
+    TerminatorInst *Insert = Parent->getTerminator();
+    Value *Args[] = { Cond, Broken };
+    return CallInst::Create(IfBreak, Args, "", Insert);
+
+  } else {
+    llvm_unreachable("Unhandled loop condition!");
+  }
+  return 0;
+}
+
+/// \brief Handle a back edge (loop)
+void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+  BasicBlock *Target = Term->getSuccessor(1);
+  PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+
+  Value *Cond = Term->getCondition();
+  Term->setCondition(BoolTrue);
+  Value *Arg = handleLoopCondition(Cond, Broken);
+
+  BasicBlock *BB = Term->getParent();
+  for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
+       PI != PE; ++PI) {
+
+    Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+  }
+
+  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
+  push(Term->getSuccessor(0), Arg);
+}
+
+/// \brief Close the last opened control flow
+void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+  CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
+}
+
+/// \brief Annotate the control flow with intrinsics so the backend can
+/// recognize if/then/else and loops.
+bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
+       E = df_end(&F.getEntryBlock()); I != E; ++I) {
+
+    BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+
+    if (!Term || Term->isUnconditional()) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+      continue;
+    }
+
+    if (I.nodeVisited(Term->getSuccessor(1))) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+      handleLoop(Term);
+      continue;
+    }
+
+    if (isTopOfStack(*I)) {
+      PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
+      if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+        insertElse(Term);
+        eraseIfUnused(Phi);
+        continue;
+      }
+      closeControlFlow(*I);
+    }
+    openIf(Term);
+  }
+
+  assert(Stack.empty());
+  return true;
+}
+
+/// \brief Create the annotation pass
+FunctionPass *llvm::createSIAnnotateControlFlowPass() {
+  return new SIAnnotateControlFlow();
+}
diff --git a/contrib/llvm/lib/Target/R600/SIDefines.h b/contrib/llvm/lib/Target/R600/SIDefines.h
new file mode 100644
index 0000000..b7e7a2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIDefines.h
@@ -0,0 +1,92 @@
+//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef SIDEFINES_H_
+#define SIDEFINES_H_
+
+namespace SIInstrFlags {
+enum {
+  MIMG = 1 << 3,
+  SMRD = 1 << 4,
+  VOP1 = 1 << 5,
+  VOP2 = 1 << 6,
+  VOP3 = 1 << 7,
+  VOPC = 1 << 8,
+  SALU = 1 << 9
+};
+}
+
+#define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
+#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
+#define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
+#define R_00B128_SPI_SHADER_PGM_RSRC1_VS                                0x00B128
+#define R_00B228_SPI_SHADER_PGM_RSRC1_GS                                0x00B228
+#define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
+#define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
+#define   S_00B02C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
+#define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
+
+
+#define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
+#define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B848_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B848_VGPRS                                              0xFFFFFFC0
+#define   S_00B848_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B848_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B848_SGPRS                                              0xFFFFFC3F
+#define   S_00B848_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B848_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B848_PRIORITY                                           0xFFFFF3FF
+#define   S_00B848_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B848_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B848_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B848_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B848_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B848_PRIV                                               0xFFEFFFFF
+#define   S_00B848_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B848_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B848_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B848_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B848_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B848_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B848_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B848_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B848_IEEE_MODE                                          0xFF7FFFFF
+
+
+// Helpers for setting FLOAT_MODE
+#define FP_ROUND_ROUND_TO_NEAREST 0
+#define FP_ROUND_ROUND_TO_INF 1
+#define FP_ROUND_ROUND_TO_NEGINF 2
+#define FP_ROUND_ROUND_TO_ZERO 3
+
+// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double
+// precision.
+#define FP_ROUND_MODE_SP(x) ((x) & 0x3)
+#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)
+
+#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
+#define FP_DENORM_FLUSH_OUT 1
+#define FP_DENORM_FLUSH_IN 2
+#define FP_DENORM_FLUSH_NONE 3
+
+
+// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double
+// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
+#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+
+#define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
+#define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+
+#endif // SIDEFINES_H_
diff --git a/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp
new file mode 100644
index 0000000..5f71453
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -0,0 +1,278 @@
+//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Copies from VGPR to SGPR registers are illegal and the register coalescer
+/// will sometimes generate these illegal copies in situations like this:
+///
+///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///   %vreg1 <vsrc> = COPY %vreg0 <sgpr>
+///    ...
+///    BRANCH %cond BB1, BB2
+///  BB1:
+///    %vreg2 <vgpr> = VECTOR_INST
+///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+///  BB2:
+///    %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1>
+///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
+///
+/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
+/// code will look like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///    ...
+///    BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now that the result of the PHI instruction is an SGPR, the register
+/// allocator is now forced to constrain the register class of %vreg3 to
+/// <sgpr> so we end up with final code like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///    ...
+///    BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <sgpr> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
+///
+/// In order to avoid this problem, this pass searches for PHI instructions
+/// which define a <vsrc> register and constrains its definition class to
+/// <vgpr> if the user of the PHI's definition register is a vector instruction.
+/// If the PHI's definition class is constrained to <vgpr> then the coalescer
+/// will be unable to perform the COPY removal from the above example  which
+/// ultimately led to the creation of an illegal COPY.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sgpr-copies"
+
+namespace {
+
+class SIFixSGPRCopies : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI,
+                                           const MachineRegisterInfo &MRI,
+                                           unsigned Reg,
+                                           unsigned SubReg) const;
+  const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const;
+  bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI,
+                        const MachineRegisterInfo &MRI) const;
+
+public:
+  SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Fix SGPR copies";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIFixSGPRCopies::ID = 0;
+
+FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) {
+  return new SIFixSGPRCopies(tm);
+}
+
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    if (!MI.getOperand(i).isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+      continue;
+
+    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+      return true;
+  }
+  return false;
+}
+
+/// This functions walks the use list of Reg until it finds an Instruction
+/// that isn't a COPY returns the register class of that instruction.
+/// \return The register defined by the first non-COPY instruction.
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
+                                                 const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const {
+  // The Reg parameter to the function must always be defined by either a PHI
+  // or a COPY, therefore it cannot be a physical register.
+  assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+         "Reg cannot be a physical register");
+
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  RC = TRI->getSubRegClass(RC, SubReg);
+  for (MachineRegisterInfo::use_instr_iterator
+       I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
+    switch (I->getOpcode()) {
+    case AMDGPU::COPY:
+      RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
+                                  I->getOperand(0).getReg(),
+                                  I->getOperand(0).getSubReg()));
+      break;
+    }
+  }
+
+  return RC;
+}
+
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef(
+                                                 const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg);
+    return TRI->getSubRegClass(RC, SubReg);
+  }
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+  if (Def->getOpcode() != AMDGPU::COPY) {
+    return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg);
+  }
+
+  return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(),
+                                   Def->getOperand(1).getSubReg());
+}
+
+bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
+                                      const SIRegisterInfo *TRI,
+                                      const MachineRegisterInfo &MRI) const {
+
+  unsigned DstReg = Copy.getOperand(0).getReg();
+  unsigned SrcReg = Copy.getOperand(1).getReg();
+  unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+  const TargetRegisterClass *SrcRC;
+
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+      DstRC == &AMDGPU::M0RegRegClass ||
+      MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass)
+    return false;
+
+  SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg);
+  return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
+}
+
+bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      MF.getTarget().getInstrInfo());
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
+        DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
+        DEBUG(MI.print(dbgs()));
+        TII->moveToVALU(MI);
+
+      }
+
+      switch (MI.getOpcode()) {
+      default: continue;
+      case AMDGPU::PHI: {
+        DEBUG(dbgs() << " Fixing PHI:\n");
+        DEBUG(MI.print(dbgs()));
+
+        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
+                                                  MI.getOperand(0).getSubReg());
+          MRI.constrainRegClass(Reg, RC);
+        }
+        unsigned Reg = MI.getOperand(0).getReg();
+        const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
+                                                  MI.getOperand(0).getSubReg());
+        if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
+          MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+        }
+
+        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+          break;
+
+        // If a PHI node defines an SGPR and any of its operands are VGPRs,
+        // then we need to move it to the VALU.
+        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
+            TII->moveToVALU(MI);
+            break;
+          }
+        }
+
+        break;
+      }
+      case AMDGPU::REG_SEQUENCE: {
+        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+            !hasVGPROperands(MI, TRI))
+          continue;
+
+        DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n");
+        DEBUG(MI.print(dbgs()));
+
+        TII->moveToVALU(MI);
+        break;
+      }
+      case AMDGPU::INSERT_SUBREG: {
+        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
+        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
+        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+        if (TRI->isSGPRClass(DstRC) &&
+            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+          DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n");
+          DEBUG(MI.print(dbgs()));
+          TII->moveToVALU(MI);
+        }
+        break;
+      }
+      }
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp
new file mode 100644
index 0000000..7d116ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp
@@ -0,0 +1,110 @@
+//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// SALU instructions ignore control flow, so we need to modify the live ranges
+/// of the registers they define.
+///
+/// The strategy is to view the entire program as if it were a single basic
+/// block and calculate the intervals accordingly.  We implement this
+/// by walking this list of segments for each LiveRange and setting the
+/// end of each segment equal to the start of the segment that immediately
+/// follows it.
+
+#include "AMDGPU.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
+
+namespace {
+
+class SIFixSGPRLiveRanges : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
+    initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+  virtual const char *getPassName() const override {
+    return "SI Fix SGPR live ranges";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
+                      "SI Fix SGPR Live Ranges", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
+                    "SI Fix SGPR Live Ranges", false, false)
+
+char SIFixSGPRLiveRanges::ID = 0;
+
+char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
+
+FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
+  return new SIFixSGPRLiveRanges();
+}
+
+bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC);
+      if (ExecUse)
+        continue;
+
+      for (const MachineOperand &Def : MI.operands()) {
+        if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg()))
+          continue;
+
+        const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg());
+
+        if (!TRI->isSGPRClass(RC))
+          continue;
+        LiveInterval &LI = LIS->getInterval(Def.getReg());
+        for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) {
+          LiveRange::Segment &Seg = LI.segments[i];
+          LiveRange::Segment &Next = LI.segments[i + 1];
+          Seg.end = Next.start;
+        }
+      }
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
new file mode 100644
index 0000000..5a148a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -0,0 +1,1821 @@
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for SI
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef _MSC_VER
+// Provide M_PI.
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif
+
+#include "SIISelLowering.h"
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/ADT/SmallString.h"
+
+using namespace llvm;
+
+SITargetLowering::SITargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM) {
+  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
+  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
+  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
+
+  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
+
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+
+  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+
+  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
+  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+
+  computeRegisterProperties();
+
+  // Condition Codes
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+  setOperationAction(ISD::ADD, MVT::i32, Legal);
+  setOperationAction(ISD::ADDC, MVT::i32, Legal);
+  setOperationAction(ISD::ADDE, MVT::i32, Legal);
+  setOperationAction(ISD::SUBC, MVT::i32, Legal);
+  setOperationAction(ISD::SUBE, MVT::i32, Legal);
+
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
+
+  // We need to custom lower vector stores from local memory
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
+  setOperationAction(ISD::SELECT, MVT::f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
+  setOperationAction(ISD::SELECT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
+  // These should use UDIVREM, so set them to expand
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // We only support LOAD/STORE and vector manipulation ops for vectors
+  // with > 4 elements.
+  MVT VecTypes[] = {
+    MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
+  };
+
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT, MVT::i1, Promote);
+
+  for (MVT VT : VecTypes) {
+    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+      switch(Op) {
+      case ISD::LOAD:
+      case ISD::STORE:
+      case ISD::BUILD_VECTOR:
+      case ISD::BITCAST:
+      case ISD::EXTRACT_VECTOR_ELT:
+      case ISD::INSERT_VECTOR_ELT:
+      case ISD::CONCAT_VECTORS:
+      case ISD::INSERT_SUBVECTOR:
+      case ISD::EXTRACT_SUBVECTOR:
+        break;
+      default:
+        setOperationAction(Op, VT, Expand);
+        break;
+      }
+    }
+  }
+
+  for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
+    MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+  }
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::f64, Legal);
+  }
+
+  // FIXME: These should be removed and handled the same was as f32 fneg. Source
+  // modifiers also work for the double instructions.
+  setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  setOperationAction(ISD::FABS, MVT::f64, Expand);
+
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
+
+  setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::SETCC);
+
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+
+  setSchedulingPreference(Sched::RegPressure);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering queries
+//===----------------------------------------------------------------------===//
+
+bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT  VT,
+                                                     unsigned AddrSpace,
+                                                     bool *IsFast) const {
+  if (IsFast)
+    *IsFast = false;
+
+  // XXX: This depends on the address space and also we may want to revist
+  // the alignment values we specify in the DataLayout.
+
+  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+  // which isn't a simple VT.
+  if (!VT.isSimple() || VT == MVT::Other)
+    return false;
+
+  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
+  // see what for specifically. The wording everywhere else seems to be the
+  // same.
+
+  // XXX - The only mention I see of this in the ISA manual is for LDS direct
+  // reads the "byte address and must be dword aligned". Is it also true for the
+  // normal loads and stores?
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
+    return false;
+
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
+  if (IsFast)
+    *IsFast = true;
+  return VT.bitsGT(MVT::i32);
+}
+
+TargetLoweringBase::LegalizeTypeAction
+SITargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                         Type *Ty) const {
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  return TII->isInlineConstant(Imm);
+}
+
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                         SDLoc DL, SDValue Chain,
+                                         unsigned Offset, bool Signed) const {
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                            AMDGPUAS::CONSTANT_ADDRESS);
+  SDValue BasePtr =  DAG.getCopyFromReg(Chain, DL,
+                           MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
+  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                             DAG.getConstant(Offset, MVT::i64));
+  return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
+                            MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
+                            false, false, MemVT.getSizeInBits() >> 3);
+
+}
+
+SDValue SITargetLowering::LowerFormalArguments(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SDLoc DL, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  assert(CallConv == CallingConv::C);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  uint32_t Skipped = 0;
+
+  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+
+    // First check if it's a PS input addr
+    if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+        !Arg.Flags.isByVal()) {
+
+      assert((PSInputNum <= 15) && "Too many PS inputs!");
+
+      if (!Arg.Used) {
+        // We can savely skip PS inputs
+        Skipped |= 1 << i;
+        ++PSInputNum;
+        continue;
+      }
+
+      Info->PSInputAddr |= 1 << PSInputNum++;
+    }
+
+    // Second split vertices into their elements
+    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
+      ISD::InputArg NewArg = Arg;
+      NewArg.Flags.setSplit();
+      NewArg.VT = Arg.VT.getVectorElementType();
+
+      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+      // three or five element vertex only needs three or five registers,
+      // NOT four or eigth.
+      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        Splits.push_back(NewArg);
+        NewArg.PartOffset += NewArg.VT.getStoreSize();
+      }
+
+    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
+      Splits.push_back(Arg);
+    }
+  }
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // At least one interpolation mode must be enabled or else the GPU will hang.
+  if (Info->getShaderType() == ShaderType::PIXEL &&
+      (Info->PSInputAddr & 0x7F) == 0) {
+    Info->PSInputAddr |= 1;
+    CCInfo.AllocateReg(AMDGPU::VGPR0);
+    CCInfo.AllocateReg(AMDGPU::VGPR1);
+  }
+
+  // The pointer to the list of arguments is stored in SGPR0, SGPR1
+	// The pointer to the scratch buffer is stored in SGPR2, SGPR3
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
+    Info->NumUserSGPRs = 4;
+    CCInfo.AllocateReg(AMDGPU::SGPR0);
+    CCInfo.AllocateReg(AMDGPU::SGPR1);
+    CCInfo.AllocateReg(AMDGPU::SGPR2);
+    CCInfo.AllocateReg(AMDGPU::SGPR3);
+    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
+  }
+
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
+    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                            Splits);
+  }
+
+  AnalyzeFormalArguments(CCInfo, Splits);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+    const ISD::InputArg &Arg = Ins[i];
+    if (Skipped & (1 << i)) {
+      InVals.push_back(DAG.getUNDEF(Arg.VT));
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    EVT VT = VA.getLocVT();
+
+    if (VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = Splits[i].VT;
+      // The first 36 bytes of the input buffer contains information about
+      // thread group and global sizes.
+      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
+                                   36 + VA.getLocMemOffset(),
+                                   Ins[i].Flags.isSExt());
+      InVals.push_back(Arg);
+      continue;
+    }
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+
+    if (VT == MVT::i64) {
+      // For now assume it is a pointer
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+                                     &AMDGPU::SReg_64RegClass);
+      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
+      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      continue;
+    }
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    if (Arg.VT.isVector()) {
+
+      // Build a vector from the registers
+      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      }
+
+      // Fill up the missing vector elements
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      for (unsigned j = 0; j != NumElements; ++j)
+        Regs.push_back(DAG.getUNDEF(VT));
+
+      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+  return Chain;
+}
+
+MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const {
+
+  MachineBasicBlock::iterator I = *MI;
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  switch (MI->getOpcode()) {
+  default:
+    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case AMDGPU::BRANCH: return BB;
+  case AMDGPU::SI_ADDR64_RSRC: {
+    unsigned SuperReg = MI->getOperand(0).getReg();
+    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
+            .addOperand(MI->getOperand(1));
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
+            .addImm(0);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
+            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
+            .addReg(SubRegHiLo)
+            .addImm(AMDGPU::sub0)
+            .addReg(SubRegHiHi)
+            .addImm(AMDGPU::sub1);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
+            .addReg(SubRegLo)
+            .addImm(AMDGPU::sub0_sub1)
+            .addReg(SubRegHi)
+            .addImm(AMDGPU::sub2_sub3);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_BUFFER_RSRC: {
+    unsigned SuperReg = MI->getOperand(0).getReg();
+    unsigned Args[4];
+    for (unsigned i = 0, e = 4; i < e; ++i) {
+      MachineOperand &Arg = MI->getOperand(i + 1);
+
+      if (Arg.isReg()) {
+        Args[i] = Arg.getReg();
+        continue;
+      }
+
+      assert(Arg.isImm());
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
+              .addImm(Arg.getImm());
+      Args[i] = Reg;
+    }
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+            SuperReg)
+            .addReg(Args[0])
+            .addImm(AMDGPU::sub0)
+            .addReg(Args[1])
+            .addImm(AMDGPU::sub1)
+            .addReg(Args[2])
+            .addImm(AMDGPU::sub2)
+            .addReg(Args[3])
+            .addImm(AMDGPU::sub3);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::V_SUB_F64: {
+    unsigned DestReg = MI->getOperand(0).getReg();
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
+      .addImm(0)  // SRC0 modifiers
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(1)  // SRC1 modifiers
+      .addReg(MI->getOperand(2).getReg())
+      .addImm(0)  // SRC2 modifiers
+      .addImm(0)  // src2
+      .addImm(0)  // CLAMP
+      .addImm(0); // OMOD
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_RegisterStorePseudo: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
+                Reg);
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      MIB.addOperand(MI->getOperand(i));
+
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FABS_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x7fffffff);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FNEG_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x80000000);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FCLAMP_SI: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
+            MI->getOperand(0).getReg())
+            .addImm(0) // SRC0 modifiers
+            .addOperand(MI->getOperand(1))
+            .addImm(0) // SRC1 modifiers
+            .addImm(0) // SRC1
+            .addImm(1) // CLAMP
+            .addImm(0); // OMOD
+    MI->eraseFromParent();
+  }
+  }
+  return BB;
+}
+
+EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector()) {
+    return MVT::i1;
+  }
+  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+}
+
+MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+  return MVT::i32;
+}
+
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return false; /* There is V_MAD_F32 for f32 */
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
+  }
+
+  case ISD::FSIN:
+  case ISD::FCOS:
+    return LowerTrig(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    SDLoc DL(Op);
+    switch (IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case Intrinsic::r600_read_ngroups_x:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
+    case Intrinsic::r600_read_ngroups_y:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
+    case Intrinsic::r600_read_ngroups_z:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
+    case Intrinsic::r600_read_global_size_x:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
+    case Intrinsic::r600_read_global_size_y:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
+    case Intrinsic::r600_read_global_size_z:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
+    case Intrinsic::r600_read_local_size_x:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
+    case Intrinsic::r600_read_local_size_y:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
+    case Intrinsic::r600_read_local_size_z:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
+    case Intrinsic::r600_read_tgid_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
+    case Intrinsic::r600_read_tgid_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
+    case Intrinsic::r600_read_tgid_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
+    case Intrinsic::r600_read_tidig_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR0, VT);
+    case Intrinsic::r600_read_tidig_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR1, VT);
+    case Intrinsic::r600_read_tidig_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR2, VT);
+    case AMDGPUIntrinsic::SI_load_const: {
+      SDValue Ops [] = {
+        Op.getOperand(1),
+        Op.getOperand(2)
+      };
+
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          MachinePointerInfo(),
+          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+          VT.getSizeInBits() / 8, 4);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                     Op->getVTList(), Ops, VT, MMO);
+    }
+    case AMDGPUIntrinsic::SI_sample:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampleb:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampled:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+    case AMDGPUIntrinsic::SI_samplel:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+    case AMDGPUIntrinsic::SI_vs_load_input:
+      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+    }
+  }
+
+  case ISD::INTRINSIC_VOID:
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    switch (IntrinsicID) {
+      case AMDGPUIntrinsic::SI_tbuffer_store: {
+        SDLoc DL(Op);
+        SDValue Ops [] = {
+          Chain,
+          Op.getOperand(2),
+          Op.getOperand(3),
+          Op.getOperand(4),
+          Op.getOperand(5),
+          Op.getOperand(6),
+          Op.getOperand(7),
+          Op.getOperand(8),
+          Op.getOperand(9),
+          Op.getOperand(10),
+          Op.getOperand(11),
+          Op.getOperand(12),
+          Op.getOperand(13),
+          Op.getOperand(14)
+        };
+        EVT VT = Op.getOperand(3).getValueType();
+
+        MachineMemOperand *MMO = MF.getMachineMemOperand(
+            MachinePointerInfo(),
+            MachineMemOperand::MOStore,
+            VT.getSizeInBits() / 8, 4);
+        return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                       Op->getVTList(), Ops, VT, MMO);
+      }
+      default:
+        break;
+    }
+  }
+  return SDValue();
+}
+
+/// \brief Helper function for LowerBRCOND
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
+
+  SDNode *Parent = Value.getNode();
+  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
+       I != E; ++I) {
+
+    if (I.getUse().get() != Value)
+      continue;
+
+    if (I->getOpcode() == Opcode)
+      return *I;
+  }
+  return nullptr;
+}
+
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+    TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
+
+  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+}
+
+/// This transforms the control flow intrinsics to get the branch destination as
+/// last parameter, also switches branch target with BR if the need arise
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
+                                      SelectionDAG &DAG) const {
+
+  SDLoc DL(BRCOND);
+
+  SDNode *Intr = BRCOND.getOperand(1).getNode();
+  SDValue Target = BRCOND.getOperand(2);
+  SDNode *BR = nullptr;
+
+  if (Intr->getOpcode() == ISD::SETCC) {
+    // As long as we negate the condition everything is fine
+    SDNode *SetCC = Intr;
+    assert(SetCC->getConstantOperandVal(1) == 1);
+    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+           ISD::SETNE);
+    Intr = SetCC->getOperand(0).getNode();
+
+  } else {
+    // Get the target from BR if we don't negate the condition
+    BR = findUser(BRCOND, ISD::BR);
+    Target = BR->getOperand(1);
+  }
+
+  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+
+  // Build the result and
+  SmallVector<EVT, 4> Res;
+  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
+    Res.push_back(Intr->getValueType(i));
+
+  // operands of the new intrinsic call
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(BRCOND.getOperand(0));
+  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
+    Ops.push_back(Intr->getOperand(i));
+  Ops.push_back(Target);
+
+  // build the new intrinsic call
+  SDNode *Result = DAG.getNode(
+    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
+    DAG.getVTList(Res), Ops).getNode();
+
+  if (BR) {
+    // Give the branch instruction our target
+    SDValue Ops[] = {
+      BR->getOperand(0),
+      BRCOND.getOperand(2)
+    };
+    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
+  }
+
+  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
+
+  // Copy the intrinsic results to registers
+  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
+    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
+    if (!CopyToReg)
+      continue;
+
+    Chain = DAG.getCopyToReg(
+      Chain, DL,
+      CopyToReg->getOperand(1),
+      SDValue(Result, i - 1),
+      SDValue());
+
+    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
+  }
+
+  // Remove the old intrinsic from the chain
+  DAG.ReplaceAllUsesOfValueWith(
+    SDValue(Intr, Intr->getNumValues() - 1),
+    Intr->getOperand(0));
+
+  return Chain;
+}
+
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+
+  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+
+  SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(0, MVT::i32));
+  SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(1, MVT::i32));
+
+  SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrLo, GA);
+  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrHi, DAG.getConstant(0, MVT::i32),
+                           SDValue(Lo.getNode(), 1));
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+}
+
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+
+  if (Op.getValueType().isVector()) {
+    assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+           "Custom lowering for non-i32 vectors hasn't been implemented.");
+    unsigned NumElements = Op.getValueType().getVectorNumElements();
+    assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+    switch (Load->getAddressSpace()) {
+      default: break;
+      case AMDGPUAS::GLOBAL_ADDRESS:
+      case AMDGPUAS::PRIVATE_ADDRESS:
+        // v4 loads are supported for private and global memory.
+        if (NumElements <= 4)
+          break;
+        // fall-through
+      case AMDGPUAS::LOCAL_ADDRESS:
+        return SplitVectorLoad(Op, DAG);
+    }
+  }
+
+  return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+}
+
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+                                               const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2),
+                     Op.getOperand(3),
+                     Op.getOperand(4));
+}
+
+SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Cond = Op.getOperand(0);
+
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
+
+  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
+
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
+
+  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
+
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+}
+
+// Catch division cases where we can use shortcuts with rcp and rsq
+// instructions.
+SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+
+  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
+        CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation has a worst case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+
+      // 1.0 / sqrt(x) -> rsq(x)
+      //
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    }
+  }
+
+  if (Unsafe) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  SDValue FastLowered = LowerFastFDIV(Op, DAG);
+  if (FastLowered.getNode())
+    return FastLowered;
+
+  // This uses v_rcp_f32 which does not handle denormals. Let this hit a
+  // selection error for now rather than do something incorrect.
+  if (Subtarget->hasFP32Denormals())
+    return SDValue();
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+  const APFloat K0Val(BitsToFloat(0x6f800000));
+  const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);
+
+  const APFloat K1Val(BitsToFloat(0x2f800000));
+  const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
+
+  const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
+
+SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFDIV32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFDIV64(Op, DAG);
+
+  llvm_unreachable("Unexpected type for fdiv");
+}
+
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  // These stores are legal.
+  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      VT.isVector() && VT.getVectorNumElements() == 2 &&
+      VT.getVectorElementType() == MVT::i32)
+    return SDValue();
+
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (VT.isVector() && VT.getVectorNumElements() > 4)
+      return SplitVectorStore(Op, DAG);
+    return SDValue();
+  }
+
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode())
+    return Ret;
+
+  if (VT.isVector() && VT.getVectorNumElements() >= 8)
+      return SplitVectorStore(Op, DAG);
+
+  if (VT == MVT::i1)
+    return DAG.getTruncStore(Store->getChain(), DL,
+                        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+                        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
+        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
+          DAG.getConstantFP(0.5 / M_PI, VT)));
+
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+  case ISD::FSIN:
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  EVT ScalarVT = VT.getScalarType();
+  if (ScalarVT != MVT::f32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+      DCI.AddToWorklist(Cvt.getNode());
+      return Cvt;
+    }
+  }
+
+  // We are primarily trying to catch operations on illegal vector types
+  // before they are expanded.
+  // For scalars, we can use the more flexible method of checking masked bits
+  // after legalization.
+  if (!DCI.isBeforeLegalize() ||
+      !SrcVT.isVector() ||
+      SrcVT.getVectorElementType() != MVT::i8) {
+    return SDValue();
+  }
+
+  assert(DCI.isBeforeLegalize() && "Unexpected legal type");
+
+  // Weird sized vectors are a pain to handle, but we know 3 is really the same
+  // size as 4.
+  unsigned NElts = SrcVT.getVectorNumElements();
+  if (!SrcVT.isSimple() && NElts != 3)
+    return SDValue();
+
+  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+  // prevent a mess from expanding to v4i32 and repacking.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+
+    LoadSDNode *Load = cast<LoadSDNode>(Src);
+    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+                                     Load->getChain(),
+                                     Load->getBasePtr(),
+                                     LoadVT,
+                                     Load->getMemOperand());
+
+    // Make sure successors of the original load stay after it by updating
+    // them to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+    SmallVector<SDValue, 4> Elts;
+    if (RegVT.isVector())
+      DAG.ExtractVectorElements(NewLoad, Elts);
+    else
+      Elts.push_back(NewLoad);
+
+    SmallVector<SDValue, 4> Ops;
+
+    unsigned EltIdx = 0;
+    for (SDValue Elt : Elts) {
+      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+      for (unsigned I = 0; I < ComponentsInElt; ++I) {
+        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+        DCI.AddToWorklist(Cvt.getNode());
+        Ops.push_back(Cvt);
+      }
+
+      ++EltIdx;
+    }
+
+    assert(Ops.size() == NElts);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  switch (N->getOpcode()) {
+    default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+    case ISD::SETCC: {
+      SDValue Arg0 = N->getOperand(0);
+      SDValue Arg1 = N->getOperand(1);
+      SDValue CC = N->getOperand(2);
+      ConstantSDNode * C = nullptr;
+      ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
+
+      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
+      if (VT == MVT::i1
+          && Arg0.getOpcode() == ISD::SIGN_EXTEND
+          && Arg0.getOperand(0).getValueType() == MVT::i1
+          && (C = dyn_cast<ConstantSDNode>(Arg1))
+          && C->isNullValue()
+          && CCOp == ISD::SETNE) {
+        return SimplifySetCC(VT, Arg0.getOperand(0),
+                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
+      }
+      break;
+    }
+
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3: {
+    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+    SDValue Src = N->getOperand(0);
+    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+      DCI.CommitTargetLoweringOpt(TLO);
+    }
+
+    break;
+  }
+
+  case ISD::UINT_TO_FP: {
+    return performUCharToFloatCombine(N, DCI);
+  }
+  }
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+/// \brief Test if RegClass is one of the VSrc classes
+static bool isVSrc(unsigned RegClass) {
+  return AMDGPU::VSrc_32RegClassID == RegClass ||
+         AMDGPU::VSrc_64RegClassID == RegClass;
+}
+
+/// \brief Test if RegClass is one of the SSrc classes
+static bool isSSrc(unsigned RegClass) {
+  return AMDGPU::SSrc_32RegClassID == RegClass ||
+         AMDGPU::SSrc_64RegClassID == RegClass;
+}
+
+/// \brief Analyze the possible immediate value Op
+///
+/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate
+/// and the immediate value if it's a literal immediate
+int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
+
+  union {
+    int32_t I;
+    float F;
+  } Imm;
+
+  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
+    if (Node->getZExtValue() >> 32) {
+        return -1;
+    }
+    Imm.I = Node->getSExtValue();
+  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+    if (N->getValueType(0) != MVT::f32)
+      return -1;
+    Imm.F = Node->getValueAPF().convertToFloat();
+  } else
+    return -1; // It isn't an immediate
+
+  if ((Imm.I >= -16 && Imm.I <= 64) ||
+      Imm.F == 0.5f || Imm.F == -0.5f ||
+      Imm.F == 1.0f || Imm.F == -1.0f ||
+      Imm.F == 2.0f || Imm.F == -2.0f ||
+      Imm.F == 4.0f || Imm.F == -4.0f)
+    return 0; // It's an inline immediate
+
+  return Imm.I; // It's a literal immediate
+}
+
+/// \brief Try to fold an immediate directly into an instruction
+bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
+                               bool &ScalarSlotUsed) const {
+
+  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
+    return false;
+
+  const SDValue &Op = Mov->getOperand(0);
+  int32_t Value = analyzeImmediate(Op.getNode());
+  if (Value == -1) {
+    // Not an immediate at all
+    return false;
+
+  } else if (Value == 0) {
+    // Inline immediates can always be fold
+    Operand = Op;
+    return true;
+
+  } else if (Value == Immediate) {
+    // Already fold literal immediate
+    Operand = Op;
+    return true;
+
+  } else if (!ScalarSlotUsed && !Immediate) {
+    // Fold this literal immediate
+    ScalarSlotUsed = true;
+    Immediate = Value;
+    Operand = Op;
+    return true;
+
+  }
+
+  return false;
+}
+
+const TargetRegisterClass *SITargetLowering::getRegClassForNode(
+                                   SelectionDAG &DAG, const SDValue &Op) const {
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  if (!Op->isMachineOpcode()) {
+    switch(Op->getOpcode()) {
+    case ISD::CopyFromReg: {
+      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        return MRI.getRegClass(Reg);
+      }
+      return TRI.getPhysRegClass(Reg);
+    }
+    default:  return nullptr;
+    }
+  }
+  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
+  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
+  if (OpClassID != -1) {
+    return TRI.getRegClass(OpClassID);
+  }
+  switch(Op.getMachineOpcode()) {
+  case AMDGPU::COPY_TO_REGCLASS:
+    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
+    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+
+    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
+    // class, then the register class for the value could be either a
+    // VReg or and SReg.  In order to get a more accurate
+    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
+        OpClassID == AMDGPU::VSrc_64RegClassID) {
+      return getRegClassForNode(DAG, Op.getOperand(0));
+    }
+    return TRI.getRegClass(OpClassID);
+  case AMDGPU::EXTRACT_SUBREG: {
+    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    const TargetRegisterClass *SuperClass =
+      getRegClassForNode(DAG, Op.getOperand(0));
+    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
+  }
+  case AMDGPU::REG_SEQUENCE:
+    // Operand 0 is the register class id for REG_SEQUENCE instructions.
+    return TRI.getRegClass(
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
+  default:
+    return getRegClassFor(Op.getSimpleValueType());
+  }
+}
+
+/// \brief Does "Op" fit into register class "RegClass" ?
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
+                                    unsigned RegClass) const {
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
+  if (!RC) {
+    return false;
+  }
+  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
+}
+
+/// \brief Make sure that we don't exeed the number of allowed scalars
+void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
+                                       unsigned RegClass,
+                                       bool &ScalarSlotUsed) const {
+
+  // First map the operands register class to a destination class
+  if (RegClass == AMDGPU::VSrc_32RegClassID)
+    RegClass = AMDGPU::VReg_32RegClassID;
+  else if (RegClass == AMDGPU::VSrc_64RegClassID)
+    RegClass = AMDGPU::VReg_64RegClassID;
+  else
+    return;
+
+  // Nothing to do if they fit naturally
+  if (fitsRegClass(DAG, Operand, RegClass))
+    return;
+
+  // If the scalar slot isn't used yet use it now
+  if (!ScalarSlotUsed) {
+    ScalarSlotUsed = true;
+    return;
+  }
+
+  // This is a conservative aproach. It is possible that we can't determine the
+  // correct register class and copy too often, but better safe than sorry.
+
+  SDNode *Node;
+  // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
+  if (isa<FrameIndexSDNode>(Operand)) {
+    unsigned Opcode = Operand.getValueType() == MVT::i32 ?
+                      AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
+                              Operand);
+  } else {
+    SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+    Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+                              Operand.getValueType(), Operand, RC);
+  }
+  Operand = SDValue(Node, 0);
+}
+
+/// \returns true if \p Node's operands are different from the SDValue list
+/// \p Ops
+static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
+  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
+    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// \brief Try to fold the Nodes operands into the Node
+SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
+                                       SelectionDAG &DAG) const {
+
+  // Original encoding (either e32 or e64)
+  int Opcode = Node->getMachineOpcode();
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const MCInstrDesc *Desc = &TII->get(Opcode);
+
+  unsigned NumDefs = Desc->getNumDefs();
+  unsigned NumOps = Desc->getNumOperands();
+
+  // Commuted opcode if available
+  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
+  const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
+
+  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
+  assert(!DescRev || DescRev->getNumOperands() == NumOps);
+
+  // e64 version if available, -1 otherwise
+  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
+  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
+  int InputModifiers[3] = {0};
+
+  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
+
+  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
+  bool HaveVSrc = false, HaveSSrc = false;
+
+  // First figure out what we already have in this instruction.
+  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
+       i != e && Op < NumOps; ++i, ++Op) {
+
+    unsigned RegClass = Desc->OpInfo[Op].RegClass;
+    if (isVSrc(RegClass))
+      HaveVSrc = true;
+    else if (isSSrc(RegClass))
+      HaveSSrc = true;
+    else
+      continue;
+
+    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
+    if (Imm != -1 && Imm != 0) {
+      // Literal immediate
+      Immediate = Imm;
+    }
+  }
+
+  // If we neither have VSrc nor SSrc, it makes no sense to continue.
+  if (!HaveVSrc && !HaveSSrc)
+    return Node;
+
+  // No scalar allowed when we have both VSrc and SSrc
+  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
+
+  // Second go over the operands and try to fold them
+  std::vector<SDValue> Ops;
+  bool Promote2e64 = false;
+  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
+       i != e && Op < NumOps; ++i, ++Op) {
+
+    const SDValue &Operand = Node->getOperand(i);
+    Ops.push_back(Operand);
+
+    // Already folded immediate?
+    if (isa<ConstantSDNode>(Operand.getNode()) ||
+        isa<ConstantFPSDNode>(Operand.getNode()))
+      continue;
+
+    // Is this a VSrc or SSrc operand?
+    unsigned RegClass = Desc->OpInfo[Op].RegClass;
+    if (isVSrc(RegClass) || isSSrc(RegClass)) {
+      // Try to fold the immediates
+      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
+        // Folding didn't work, make sure we don't hit the SReg limit.
+        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
+      }
+      continue;
+    } else {
+      // If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
+      // These will be lowered to immediates, so we will need to insert a MOV.
+      if (isa<GlobalAddressSDNode>(Ops[i])) {
+        SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
+                                    Operand.getValueType(), Operand);
+        Ops[i] = SDValue(Node, 0);
+      }
+    }
+
+    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
+
+      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
+      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
+
+      // Test if it makes sense to swap operands
+      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
+          (!fitsRegClass(DAG, Ops[1], RegClass) &&
+           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
+
+        // Swap commutable operands
+        std::swap(Ops[0], Ops[1]);
+
+        Desc = DescRev;
+        DescRev = nullptr;
+        continue;
+      }
+    }
+
+    if (Immediate)
+      continue;
+
+    if (DescE64) {
+      // Test if it makes sense to switch to e64 encoding
+      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
+      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
+        continue;
+
+      int32_t TmpImm = -1;
+      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
+          (!fitsRegClass(DAG, Ops[i], RegClass) &&
+           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
+
+        // Switch to e64 encoding
+        Immediate = -1;
+        Promote2e64 = true;
+        Desc = DescE64;
+        DescE64 = nullptr;
+      }
+    }
+
+    if (!DescE64 && !Promote2e64)
+      continue;
+    if (!Operand.isMachineOpcode())
+      continue;
+    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 1;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
+    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 2;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
+  }
+
+  if (Promote2e64) {
+    std::vector<SDValue> OldOps(Ops);
+    Ops.clear();
+    for (unsigned i = 0; i < OldOps.size(); ++i) {
+      // src_modifier
+      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
+      Ops.push_back(OldOps[i]);
+    }
+    // Add the modifier flags while promoting
+    for (unsigned i = 0; i < 2; ++i)
+      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
+  }
+
+  // Add optional chain and glue
+  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
+    Ops.push_back(Node->getOperand(i));
+
+  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
+  // this case a brand new node is always be created, even if the operands
+  // are the same as before.  So, manually check if anything has been changed.
+  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
+    return Node;
+  }
+
+  // Create a complete new instruction
+  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
+}
+
+/// \brief Helper function for adjustWritemask
+static unsigned SubIdx2Lane(unsigned Idx) {
+  switch (Idx) {
+  default: return 0;
+  case AMDGPU::sub0: return 0;
+  case AMDGPU::sub1: return 1;
+  case AMDGPU::sub2: return 2;
+  case AMDGPU::sub3: return 3;
+  }
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+                                       SelectionDAG &DAG) const {
+  SDNode *Users[4] = { };
+  unsigned Lane = 0;
+  unsigned OldDmask = Node->getConstantOperandVal(0);
+  unsigned NewDmask = 0;
+
+  // Try to figure out the used register components
+  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
+       I != E; ++I) {
+
+    // Abort if we can't understand the usage
+    if (!I->isMachineOpcode() ||
+        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return;
+
+    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+    // Note that subregs are packed, i.e. Lane==0 is the first bit set
+    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+    // set, etc.
+    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+
+    // Set which texture component corresponds to the lane.
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+      assert(Dmask);
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
+    }
+
+    // Abort if we have more than one user per component
+    if (Users[Lane])
+      return;
+
+    Users[Lane] = *I;
+    NewDmask |= 1 << Comp;
+  }
+
+  // Abort if there's no change
+  if (NewDmask == OldDmask)
+    return;
+
+  // Adjust the writemask in the node
+  std::vector<SDValue> Ops;
+  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
+  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
+    Ops.push_back(Node->getOperand(i));
+  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
+
+  // If we only got one lane, replace it with a copy
+  // (if NewDmask has only one bit set...)
+  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
+    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                      SDLoc(), Users[Lane]->getValueType(0),
+                                      SDValue(Node, 0), RC);
+    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
+    return;
+  }
+
+  // Update the users of the node with the new indices
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+
+    SDNode *User = Users[i];
+    if (!User)
+      continue;
+
+    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
+    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+
+    switch (Idx) {
+    default: break;
+    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
+    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
+    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+    }
+  }
+}
+
+/// \brief Fold the instructions after selecting them.
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  Node = AdjustRegClass(Node, DAG);
+
+  if (TII->isMIMG(Node->getMachineOpcode()))
+    adjustWritemask(Node, DAG);
+
+  return foldOperands(Node, DAG);
+}
+
+/// \brief Assign the register class depending on the number of
+/// bits set in the writemask
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                                     SDNode *Node) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  if (!TII->isMIMG(MI->getOpcode()))
+    return;
+
+  unsigned VReg = MI->getOperand(0).getReg();
+  unsigned Writemask = MI->getOperand(1).getImm();
+  unsigned BitsSet = 0;
+  for (unsigned i = 0; i < 4; ++i)
+    BitsSet += Writemask & (1 << i) ? 1 : 0;
+
+  const TargetRegisterClass *RC;
+  switch (BitsSet) {
+  default: return;
+  case 1:  RC = &AMDGPU::VReg_32RegClass; break;
+  case 2:  RC = &AMDGPU::VReg_64RegClass; break;
+  case 3:  RC = &AMDGPU::VReg_96RegClass; break;
+  }
+
+  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+  MI->setDesc(TII->get(NewOpcode));
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  MRI.setRegClass(VReg, RC);
+}
+
+MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
+                                                SelectionDAG &DAG) const {
+
+  SDLoc DL(N);
+  unsigned NewOpcode = N->getMachineOpcode();
+
+  switch (N->getMachineOpcode()) {
+  default: return N;
+  case AMDGPU::S_LOAD_DWORD_IMM:
+    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+    // Fall-through
+  case AMDGPU::S_LOAD_DWORDX2_SGPR:
+    if (NewOpcode == N->getMachineOpcode()) {
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+    }
+    // Fall-through
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+    if (NewOpcode == N->getMachineOpcode()) {
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+    }
+    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
+      return N;
+    }
+    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
+    SDValue Ops[] = {
+      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
+                                 DAG.getConstant(0, MVT::i64)), 0),
+      N->getOperand(0),
+      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
+    };
+    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
+  }
+  }
+}
+
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                               const TargetRegisterClass *RC,
+                                               unsigned Reg, EVT VT) const {
+  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
+                            cast<RegisterSDNode>(VReg)->getReg(), VT);
+}
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.h b/contrib/llvm/lib/Target/R600/SIISelLowering.h
new file mode 100644
index 0000000..d106d4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.h
@@ -0,0 +1,92 @@
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SIISELLOWERING_H
+#define SIISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+#include "SIInstrInfo.h"
+
+namespace llvm {
+
+class SITargetLowering : public AMDGPUTargetLowering {
+  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
+                         SDValue Chain, unsigned Offset, bool Signed) const;
+  SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
+                               SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const override;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+
+  bool foldImm(SDValue &Operand, int32_t &Immediate,
+               bool &ScalarSlotUsed) const;
+  const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
+                                                const SDValue &Op) const;
+  bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
+                    unsigned RegClass) const;
+  void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
+                       unsigned RegClass, bool &ScalarSlotUsed) const;
+
+  SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
+  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+  MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
+
+  static SDValue performUCharToFloatCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI);
+
+public:
+  SITargetLowering(TargetMachine &tm);
+  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+                                     bool *IsFast) const override;
+
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                        Type *Ty) const override;
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SDLoc DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
+                                      MachineBasicBlock * BB) const override;
+  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+  MVT getScalarShiftAmountTy(EVT VT) const override;
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+  void AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                     SDNode *Node) const override;
+
+  int32_t analyzeImmediate(const SDNode *N) const;
+  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+                               unsigned Reg, EVT VT) const override;
+};
+
+} // End namespace llvm
+
+#endif //SIISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
new file mode 100644
index 0000000..7dfc31b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
@@ -0,0 +1,376 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+typedef union {
+  struct {
+    unsigned VM;
+    unsigned EXP;
+    unsigned LGKM;
+  } Named;
+  unsigned Array[3];
+
+} Counters;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+
+  /// \brief Constant hardware limits
+  static const Counters WaitCounts;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Get increment/decrement amount for this instruction.
+  Counters getHwCounts(MachineInstr &MI);
+
+  /// \brief Is operand relevant for async execution?
+  bool isOpRelevant(MachineOperand &Op);
+
+  /// \brief Get register interval an operand affects.
+  RegInterval getRegInterval(MachineOperand &Op);
+
+  /// \brief Handle instructions async components
+  void pushInstruction(MachineInstr &MI);
+
+  /// \brief Insert the actual wait instruction
+  bool insertWait(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  const Counters &Counts);
+
+  /// \brief Do we need def2def checks?
+  bool unorderedDefines(MachineInstr &MI);
+
+  /// \brief Resolve all operand dependencies to counter requirements
+  Counters handleOperands(MachineInstr &MI);
+
+public:
+  SIInsertWaits(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(nullptr),
+    TRI(nullptr),
+    ExpInstrTypesSeen(0) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI insert wait  instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIInsertWaits::ID = 0;
+
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
+  return new SIInsertWaits(tm);
+}
+
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+
+  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
+  Counters Result;
+
+  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
+
+  // Only consider stores or EXP for EXP_CNT
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
+      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+
+  // LGKM may uses larger values
+  if (TSFlags & SIInstrFlags::LGKM_CNT) {
+
+    if (TII->isSMRD(MI.getOpcode())) {
+
+      MachineOperand &Op = MI.getOperand(0);
+      assert(Op.isReg() && "First LGKM operand must be a register!");
+
+      unsigned Reg = Op.getReg();
+      unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+      Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+    } else {
+      // DS
+      Result.Named.LGKM = 1;
+    }
+
+  } else {
+    Result.Named.LGKM = 0;
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+
+  // Constants are always irrelevant
+  if (!Op.isReg())
+    return false;
+
+  // Defines are always relevant
+  if (Op.isDef())
+    return true;
+
+  // For exports all registers are relevant
+  MachineInstr &MI = *Op.getParent();
+  if (MI.getOpcode() == AMDGPU::EXP)
+    return true;
+
+  // For stores the stored value is also relevant
+  if (!MI.getDesc().mayStore())
+    return false;
+
+  for (MachineInstr::mop_iterator I = MI.operands_begin(),
+       E = MI.operands_end(); I != E; ++I) {
+
+    if (I->isReg() && I->isUse())
+      return Op.isIdenticalTo(*I);
+  }
+
+  return false;
+}
+
+RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
+
+  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+    return std::make_pair(0, 0);
+
+  unsigned Reg = Op.getReg();
+  unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+
+  assert(Size >= 4);
+
+  RegInterval Result;
+  Result.first = TRI->getEncodingValue(Reg);
+  Result.second = Result.first + Size / 4;
+
+  return Result;
+}
+
+void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+
+  // Get the hardware counter increments and sum them up
+  Counters Increment = getHwCounts(MI);
+  unsigned Sum = 0;
+
+  for (unsigned i = 0; i < 3; ++i) {
+    LastIssued.Array[i] += Increment.Array[i];
+    Sum += Increment.Array[i];
+  }
+
+  // If we don't increase anything then that's it
+  if (Sum == 0)
+    return;
+
+  // Remember which export instructions we have seen
+  if (Increment.Named.EXP) {
+    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+  }
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    if (!isOpRelevant(Op))
+      continue;
+
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      // Remember which registers we define
+      if (Op.isDef())
+        DefinedRegs[j] = LastIssued;
+
+      // and which one we are using
+      if (Op.isUse())
+        UsedRegs[j] = LastIssued;
+    }
+  }
+}
+
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const Counters &Required) {
+
+  // End of program? No need to wait on anything
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+    return false;
+
+  // Figure out if the async instructions execute in order
+  bool Ordered[3];
+
+  // VM_CNT is always ordered
+  Ordered[0] = true;
+
+  // EXP_CNT is unordered if we have both EXP & VM-writes
+  Ordered[1] = ExpInstrTypesSeen == 3;
+
+  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+  Ordered[2] = false;
+
+  // The values we are going to put into the S_WAITCNT instruction
+  Counters Counts = WaitCounts;
+
+  // Do we really need to wait?
+  bool NeedWait = false;
+
+  for (unsigned i = 0; i < 3; ++i) {
+
+    if (Required.Array[i] <= WaitedOn.Array[i])
+      continue;
+
+    NeedWait = true;
+
+    if (Ordered[i]) {
+      unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+      // Adjust the value to the real hardware possibilities.
+      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
+
+    } else
+      Counts.Array[i] = 0;
+
+    // Remember on what we have waited on.
+    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+  }
+
+  if (!NeedWait)
+    return false;
+
+  // Reset EXP_CNT instruction types
+  if (Counts.Named.EXP == 0)
+    ExpInstrTypesSeen = 0;
+
+  // Build the wait instruction
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+          .addImm((Counts.Named.VM & 0xF) |
+                  ((Counts.Named.EXP & 0x7) << 4) |
+                  ((Counts.Named.LGKM & 0x7) << 8));
+
+  return true;
+}
+
+/// \brief helper function for handleOperands
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+  for (unsigned i = 0; i < 3; ++i)
+    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
+}
+
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+  Counters Result = ZeroCounts;
+
+  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+  // but we also want to wait for any other outstanding transfers before
+  // signalling other hardware blocks
+  if (MI.getOpcode() == AMDGPU::S_SENDMSG)
+    return LastIssued;
+
+  // For each register affected by this
+  // instruction increase the result sequence
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      if (Op.isDef()) {
+        increaseCounters(Result, UsedRegs[j]);
+        increaseCounters(Result, DefinedRegs[j]);
+      }
+
+      if (Op.isUse())
+        increaseCounters(Result, DefinedRegs[j]);
+    }
+  }
+
+  return Result;
+}
+
+// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
+// around other non-memory instructions.
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+  bool Changes = false;
+
+  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
+  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  MRI = &MF.getRegInfo();
+
+  WaitedOn = ZeroCounts;
+  LastIssued = ZeroCounts;
+
+  memset(&UsedRegs, 0, sizeof(UsedRegs));
+  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+
+      Changes |= insertWait(MBB, I, handleOperands(*I));
+      pushInstruction(*I);
+    }
+
+    // Wait for everything at the end of the MBB
+    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+  }
+
+  return Changes;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIInstrFormats.td b/contrib/llvm/lib/Target/R600/SIInstrFormats.td
new file mode 100644
index 0000000..00e69dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrFormats.td
@@ -0,0 +1,535 @@
+//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+
+  field bits<1> VM_CNT = 0;
+  field bits<1> EXP_CNT = 0;
+  field bits<1> LGKM_CNT = 0;
+  field bits<1> MIMG = 0;
+  field bits<1> SMRD = 0;
+  field bits<1> VOP1 = 0;
+  field bits<1> VOP2 = 0;
+  field bits<1> VOP3 = 0;
+  field bits<1> VOPC = 0;
+  field bits<1> SALU = 0;
+
+  let TSFlags{0} = VM_CNT;
+  let TSFlags{1} = EXP_CNT;
+  let TSFlags{2} = LGKM_CNT;
+  let TSFlags{3} = MIMG;
+  let TSFlags{4} = SMRD;
+  let TSFlags{5} = VOP1;
+  let TSFlags{6} = VOP2;
+  let TSFlags{7} = VOP3;
+  let TSFlags{8} = VOPC;
+  let TSFlags{9} = SALU;
+}
+
+class Enc32 {
+
+  field bits<32> Inst;
+  int Size = 4;
+}
+
+class Enc64 {
+
+  field bits<64> Inst;
+  int Size = 8;
+}
+
+class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP3 = 1;
+
+  int Size = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar operations
+//===----------------------------------------------------------------------===//
+
+class SOP1e <bits<8> op> : Enc32 {
+
+  bits<7> SDST;
+  bits<8> SSRC0;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = op;
+  let Inst{22-16} = SDST;
+  let Inst{31-23} = 0x17d; //encoding;
+}
+
+class SOP2e <bits<7> op> : Enc32 {
+
+  bits<7> SDST;
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = SDST;
+  let Inst{29-23} = op;
+  let Inst{31-30} = 0x2; // encoding
+}
+
+class SOPCe <bits<7> op> : Enc32 {
+
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17e;
+}
+
+class SOPKe <bits<5> op> : Enc32 {
+
+  bits <7> SDST;
+  bits <16> SIMM16;
+
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = SDST;
+  let Inst{27-23} = op;
+  let Inst{31-28} = 0xb; //encoding
+}
+
+class SOPPe <bits<7> op> : Enc32 {
+
+  bits <16> simm16;
+
+  let Inst{15-0} = simm16;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17f; // encoding
+}
+
+class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
+
+  bits<7> SDST;
+  bits<7> SBASE;
+  bits<8> OFFSET;
+
+  let Inst{7-0} = OFFSET;
+  let Inst{8} = imm;
+  let Inst{14-9} = SBASE{6-1};
+  let Inst{21-15} = SDST;
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+}
+
+class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern>, SOP1e <op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+}
+
+class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, SOP2e<op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+  InstSI<outs, ins, asm, pattern>, SOPCe <op> {
+
+  let DisableEncoding = "$dst";
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+}
+
+class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+   InstSI <outs, ins , asm, pattern>, SOPKe<op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+}
+
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> :
+		InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+}
+
+class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
+            list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SMRDe<op, imm> {
+
+  let LGKM_CNT = 1;
+  let SMRD = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector ALU operations
+//===----------------------------------------------------------------------===//
+
+class VOP1e <bits<8> op> : Enc32 {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = op;
+  let Inst{24-17} = VDST;
+  let Inst{31-25} = 0x3f; //encoding
+}
+
+class VOP2e <bits<6> op> : Enc32 {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<8> VSRC1;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = VDST;
+  let Inst{30-25} = op;
+  let Inst{31} = 0x0; //encoding
+}
+
+class VOP3e <bits<9> op> : Enc64 {
+
+  bits<8> dst;
+  bits<2> src0_modifiers;
+  bits<9> src0;
+  bits<2> src1_modifiers;
+  bits<9> src1;
+  bits<2> src2_modifiers;
+  bits<9> src2;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{7-0} = dst;
+  let Inst{8} = src0_modifiers{1};
+  let Inst{9} = src1_modifiers{1};
+  let Inst{10} = src2_modifiers{1};
+  let Inst{11} = clamp;
+  let Inst{25-17} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{61} = src0_modifiers{0};
+  let Inst{62} = src1_modifiers{0};
+  let Inst{63} = src2_modifiers{0};
+}
+
+class VOP3be <bits<9> op> : Enc64 {
+
+  bits<8> dst;
+  bits<2> src0_modifiers;
+  bits<9> src0;
+  bits<2> src1_modifiers;
+  bits<9> src1;
+  bits<2> src2_modifiers;
+  bits<9> src2;
+  bits<7> sdst;
+  bits<2> omod;
+
+  let Inst{7-0} = dst;
+  let Inst{14-8} = sdst;
+  let Inst{25-17} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{61} = src0_modifiers{0};
+  let Inst{62} = src1_modifiers{0};
+  let Inst{63} = src2_modifiers{0};
+}
+
+class VOPCe <bits<8> op> : Enc32 {
+
+  bits<9> SRC0;
+  bits<8> VSRC1;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e;
+}
+
+class VINTRPe <bits<2> op> : Enc32 {
+
+  bits<8> VDST;
+  bits<8> VSRC;
+  bits<2> ATTRCHAN;
+  bits<6> ATTR;
+
+  let Inst{7-0} = VSRC;
+  let Inst{9-8} = ATTRCHAN;
+  let Inst{15-10} = ATTR;
+  let Inst{17-16} = op;
+  let Inst{25-18} = VDST;
+  let Inst{31-26} = 0x32; // encoding
+}
+
+class DSe <bits<8> op> : Enc64 {
+
+  bits<8> vdst;
+  bits<1> gds;
+  bits<8> addr;
+  bits<8> data0;
+  bits<8> data1;
+  bits<8> offset0;
+  bits<8> offset1;
+
+  let Inst{7-0} = offset0;
+  let Inst{15-8} = offset1;
+  let Inst{17} = gds;
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x36; //encoding
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data0;
+  let Inst{55-48} = data1;
+  let Inst{63-56} = vdst;
+}
+
+class MUBUFe <bits<7> op> : Enc64 {
+
+  bits<12> offset;
+  bits<1> offen;
+  bits<1> idxen;
+  bits<1> glc;
+  bits<1> addr64;
+  bits<1> lds;
+  bits<8> vaddr;
+  bits<8> vdata;
+  bits<7> srsrc;
+  bits<1> slc;
+  bits<1> tfe;
+  bits<8> soffset;
+
+  let Inst{11-0} = offset;
+  let Inst{12} = offen;
+  let Inst{13} = idxen;
+  let Inst{14} = glc;
+  let Inst{15} = addr64;
+  let Inst{16} = lds;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{54} = slc;
+  let Inst{55} = tfe;
+  let Inst{63-56} = soffset;
+}
+
+class MTBUFe <bits<3> op> : Enc64 {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<4> DFMT;
+  bits<3> NFMT;
+  bits<8> VADDR;
+  bits<7> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{18-16} = op;
+  let Inst{22-19} = DFMT;
+  let Inst{25-23} = NFMT;
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC{6-2};
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+}
+
+class MIMGe <bits<7> op> : Enc64 {
+
+  bits<8> VDATA;
+  bits<4> DMASK;
+  bits<1> UNORM;
+  bits<1> GLC;
+  bits<1> DA;
+  bits<1> R128;
+  bits<1> TFE;
+  bits<1> LWE;
+  bits<1> SLC;
+  bits<8> VADDR;
+  bits<7> SRSRC;
+  bits<7> SSAMP;
+
+  let Inst{11-8} = DMASK;
+  let Inst{12} = UNORM;
+  let Inst{13} = GLC;
+  let Inst{14} = DA;
+  let Inst{15} = R128;
+  let Inst{16} = TFE;
+  let Inst{17} = LWE;
+  let Inst{24-18} = op;
+  let Inst{25} = SLC;
+  let Inst{31-26} = 0x3c;
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC{6-2};
+  let Inst{57-53} = SSAMP{6-2};
+}
+
+class EXPe : Enc64 {
+
+  bits<4> EN;
+  bits<6> TGT;
+  bits<1> COMPR;
+  bits<1> DONE;
+  bits<1> VM;
+  bits<8> VSRC0;
+  bits<8> VSRC1;
+  bits<8> VSRC2;
+  bits<8> VSRC3;
+
+  let Inst{3-0} = EN;
+  let Inst{9-4} = TGT;
+  let Inst{10} = COMPR;
+  let Inst{11} = DONE;
+  let Inst{12} = VM;
+  let Inst{31-26} = 0x3e;
+  let Inst{39-32} = VSRC0;
+  let Inst{47-40} = VSRC1;
+  let Inst{55-48} = VSRC2;
+  let Inst{63-56} = VSRC3;
+}
+
+let Uses = [EXEC] in {
+
+class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, VOP1e<op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP1 = 1;
+}
+
+class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, VOP2e<op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP2 = 1;
+}
+
+class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
+
+class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
+
+class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
+    InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> {
+
+  let DisableEncoding = "$dst";
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOPC = 1;
+}
+
+class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, VINTRPe<op> {
+
+  let neverHasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+} // End Uses = [EXEC]
+
+//===----------------------------------------------------------------------===//
+// Vector I/O operations
+//===----------------------------------------------------------------------===//
+
+let Uses = [EXEC] in {
+
+class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> , DSe<op> {
+
+  let LGKM_CNT = 1;
+}
+
+class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern>, MUBUFe <op> {
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+
+  let neverHasSideEffects = 1;
+  let UseNamedOperandTable = 1;
+}
+
+class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern>, MTBUFe <op> {
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+
+  let neverHasSideEffects = 1;
+}
+
+class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, MIMGe <op> {
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+  let MIMG = 1;
+}
+
+def EXP : InstSI<
+  (outs),
+  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
+  [] >, EXPe {
+
+  let EXP_CNT = 1;
+}
+
+} // End Uses = [EXEC]
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp
new file mode 100644
index 0000000..51f4532
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp
@@ -0,0 +1,1654 @@
+//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+using namespace llvm;
+
+SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
+  : AMDGPUInstrInfo(st),
+    RI(st) { }
+
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo callbacks
+//===----------------------------------------------------------------------===//
+
+void
+SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MI, DebugLoc DL,
+                         unsigned DestReg, unsigned SrcReg,
+                         bool KillSrc) const {
+
+  // If we are trying to copy to or from SCC, there is a bug somewhere else in
+  // the backend.  While it may be theoretically possible to do this, it should
+  // never be necessary.
+  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
+
+  static const int16_t Sub0_15[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+  };
+
+  static const int16_t Sub0_7[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+  };
+
+  static const int16_t Sub0_3[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+  };
+
+  static const int16_t Sub0_2[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
+  };
+
+  static const int16_t Sub0_1[] = {
+    AMDGPU::sub0, AMDGPU::sub1, 0
+  };
+
+  unsigned Opcode;
+  const int16_t *SubIndices;
+
+  if (AMDGPU::M0 == DestReg) {
+    // Check if M0 isn't already set to this value
+    for (MachineBasicBlock::reverse_iterator E = MBB.rend(),
+      I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) {
+
+      if (!I->definesRegister(AMDGPU::M0))
+        continue;
+
+      unsigned Opc = I->getOpcode();
+      if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32)
+        break;
+
+      if (!I->readsRegister(SrcReg))
+        break;
+
+      // The copy isn't necessary
+      return;
+    }
+  }
+
+  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_3;
+
+  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_7;
+
+  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_15;
+
+  } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_64RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_1;
+
+  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_2;
+
+  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_128RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_3;
+
+  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_256RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_7;
+
+  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_512RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_15;
+
+  } else {
+    llvm_unreachable("Can't copy register!");
+  }
+
+  while (unsigned SubIdx = *SubIndices++) {
+    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+      get(Opcode), RI.getSubReg(DestReg, SubIdx));
+
+    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+
+    if (*SubIndices)
+      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+  }
+}
+
+unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
+  int NewOpc;
+
+  // Try to map original to commuted opcode
+  if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1)
+    return NewOpc;
+
+  // Try to map commuted to original opcode
+  if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1)
+    return NewOpc;
+
+  return Opcode;
+}
+
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      unsigned SrcReg, bool isKill,
+                                      int FrameIndex,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned KillFlag = isKill ? RegState::Kill : 0;
+
+  if (RI.hasVGPRs(RC)) {
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+            .addReg(SrcReg);
+  } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
+    unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF);
+    unsigned TgtReg = MFI->SpillTracker.LaneVGPR;
+
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg)
+            .addReg(SrcReg, KillFlag)
+            .addImm(Lane);
+    MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane);
+  } else if (RI.isSGPRClass(RC)) {
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use pseudo instruction for vector
+    // registers.
+    //
+    // Reserve a spot in the spill tracker for each sub-register of
+    // the vector register.
+    unsigned NumSubRegs = RC->getSize() / 4;
+    unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs);
+    MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
+                                    FirstLane);
+
+    unsigned Opcode;
+    switch (RC->getSize() * 8) {
+    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
+    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
+    default: llvm_unreachable("Cannot spill register class");
+    }
+
+    BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
+            .addReg(SrcReg)
+            .addImm(FrameIndex);
+  } else {
+    llvm_unreachable("VGPR spilling not supported");
+  }
+}
+
+void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned DestReg, int FrameIndex,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  if (RI.hasVGPRs(RC)) {
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addImm(0);
+  } else if (RI.isSGPRClass(RC)){
+    unsigned Opcode;
+    switch(RC->getSize() * 8) {
+    case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
+    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
+    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
+    default: llvm_unreachable("Cannot spill register class");
+    }
+
+    SIMachineFunctionInfo::SpilledReg Spill =
+        MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+            .addReg(Spill.VGPR)
+            .addImm(FrameIndex);
+  } else {
+    llvm_unreachable("VGPR spilling not supported");
+  }
+}
+
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+
+  switch (Op) {
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+    return 16;
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+    return 8;
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+    return 4;
+  case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+    return 2;
+  case AMDGPU::SI_SPILL_S32_RESTORE:
+    return 1;
+  default: llvm_unreachable("Invalid spill opcode");
+  }
+}
+
+void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
+                             int Count) const {
+  while (Count > 0) {
+    int Arg;
+    if (Count >= 8)
+      Arg = 7;
+    else
+      Arg = Count - 1;
+    Count -= 8;
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+            .addImm(Arg);
+  }
+}
+
+bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  SIMachineFunctionInfo *MFI =
+      MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  switch (MI->getOpcode()) {
+  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+
+  // SGPR register spill
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S64_SAVE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned FrameIndex = MI->getOperand(2).getImm();
+
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      SIMachineFunctionInfo::SpilledReg Spill;
+      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(),
+                                            &AMDGPU::SGPR_32RegClass, i);
+      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
+              MI->getOperand(0).getReg())
+              .addReg(SubReg)
+              .addImm(Spill.Lane + i);
+    }
+    MI->eraseFromParent();
+    break;
+  }
+
+  // SGPR register restore
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_S32_RESTORE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      SIMachineFunctionInfo::SpilledReg Spill;
+      unsigned FrameIndex = MI->getOperand(2).getImm();
+      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(),
+                                   &AMDGPU::SGPR_32RegClass, i);
+      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg)
+              .addReg(MI->getOperand(1).getReg())
+              .addImm(Spill.Lane + i);
+    }
+    insertNOPs(MI, 3);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_CONSTDATA_PTR: {
+    unsigned Reg = MI->getOperand(0).getReg();
+    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
+
+    // Add 32-bit offset from this instruction to the start of the constant data.
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_I32), RegLo)
+            .addReg(RegLo)
+            .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
+            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+            .addReg(RegHi)
+            .addImm(0)
+            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
+            .addReg(AMDGPU::SCC, RegState::Implicit);
+    MI->eraseFromParent();
+    break;
+  }
+  }
+  return true;
+}
+
+MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
+                                              bool NewMI) const {
+
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
+    return nullptr;
+
+  // Cannot commute VOP2 if src0 is SGPR.
+  if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
+      RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
+   return nullptr;
+
+  if (!MI->getOperand(2).isReg()) {
+    // XXX: Commute instructions with FPImm operands
+    if (NewMI || MI->getOperand(2).isFPImm() ||
+       (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
+      return nullptr;
+    }
+
+    // XXX: Commute VOP3 instructions with abs and neg set.
+    if (isVOP3(MI->getOpcode()) &&
+        (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                        AMDGPU::OpName::abs)).getImm() ||
+         MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                        AMDGPU::OpName::neg)).getImm()))
+      return nullptr;
+
+    unsigned Reg = MI->getOperand(1).getReg();
+    unsigned SubReg = MI->getOperand(1).getSubReg();
+    MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
+    MI->getOperand(2).ChangeToRegister(Reg, false);
+    MI->getOperand(2).setSubReg(SubReg);
+  } else {
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
+
+  if (MI)
+    MI->setDesc(get(commuteOpcode(MI->getOpcode())));
+
+  return MI;
+}
+
+MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         unsigned SrcReg) const {
+  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
+                 DstReg) .addReg(SrcReg);
+}
+
+bool SIInstrInfo::isMov(unsigned Opcode) const {
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+    return true;
+  }
+}
+
+bool
+SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  return RC != &AMDGPU::EXECRegRegClass;
+}
+
+bool
+SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
+                                         AliasAnalysis *AA) const {
+  switch(MI->getOpcode()) {
+  default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::V_MOV_B32_e32:
+    return MI->getOperand(1).isImm();
+  }
+}
+
+namespace llvm {
+namespace AMDGPU {
+// Helper function generated by tablegen.  We are wrapping this with
+// an SIInstrInfo function that reutrns bool rather than int.
+int isDS(uint16_t Opcode);
+}
+}
+
+bool SIInstrInfo::isDS(uint16_t Opcode) const {
+  return ::AMDGPU::isDS(Opcode) != -1;
+}
+
+int SIInstrInfo::isMIMG(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+}
+
+int SIInstrInfo::isSMRD(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+}
+
+bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+}
+
+bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+}
+
+bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+}
+
+bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+}
+
+bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
+}
+
+bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
+  int32_t Val = Imm.getSExtValue();
+  if (Val >= -16 && Val <= 64)
+    return true;
+
+  // The actual type of the operand does not seem to matter as long
+  // as the bits match one of the inline immediate values.  For example:
+  //
+  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+  // so it is a legal inline immediate.
+  //
+  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+  // floating-point, so it is a legal inline immediate.
+
+  return (APInt::floatToBits(0.0f) == Imm) ||
+         (APInt::floatToBits(1.0f) == Imm) ||
+         (APInt::floatToBits(-1.0f) == Imm) ||
+         (APInt::floatToBits(0.5f) == Imm) ||
+         (APInt::floatToBits(-0.5f) == Imm) ||
+         (APInt::floatToBits(2.0f) == Imm) ||
+         (APInt::floatToBits(-2.0f) == Imm) ||
+         (APInt::floatToBits(4.0f) == Imm) ||
+         (APInt::floatToBits(-4.0f) == Imm);
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
+  if (MO.isImm())
+    return isInlineConstant(APInt(32, MO.getImm(), true));
+
+  if (MO.isFPImm()) {
+    APFloat FpImm = MO.getFPImm()->getValueAPF();
+    return isInlineConstant(FpImm.bitcastToAPInt());
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
+  return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+}
+
+static bool compareMachineOp(const MachineOperand &Op0,
+                             const MachineOperand &Op1) {
+  if (Op0.getType() != Op1.getType())
+    return false;
+
+  switch (Op0.getType()) {
+  case MachineOperand::MO_Register:
+    return Op0.getReg() == Op1.getReg();
+  case MachineOperand::MO_Immediate:
+    return Op0.getImm() == Op1.getImm();
+  case MachineOperand::MO_FPImmediate:
+    return Op0.getFPImm() == Op1.getFPImm();
+  default:
+    llvm_unreachable("Didn't expect to be comparing these operand types");
+  }
+}
+
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+                                 const MachineOperand &MO) const {
+  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
+
+  assert(MO.isImm() || MO.isFPImm());
+
+  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
+    return true;
+
+  if (OpInfo.RegClass < 0)
+    return false;
+
+  return RI.regClassCanUseImmediate(OpInfo.RegClass);
+}
+
+bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+  return AMDGPU::getVOPe32(Opcode) != -1;
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+                                    StringRef &ErrInfo) const {
+  uint16_t Opcode = MI->getOpcode();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+  // Make sure the number of operands is correct.
+  const MCInstrDesc &Desc = get(Opcode);
+  if (!Desc.isVariadic() &&
+      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
+     ErrInfo = "Instruction has wrong number of operands.";
+     return false;
+  }
+
+  // Make sure the register classes are correct
+  for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+    switch (Desc.OpInfo[i].OperandType) {
+    case MCOI::OPERAND_REGISTER: {
+      int RegClass = Desc.OpInfo[i].RegClass;
+      if (!RI.regClassCanUseImmediate(RegClass) &&
+          (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) {
+        ErrInfo = "Expected register, but got immediate";
+        return false;
+      }
+    }
+      break;
+    case MCOI::OPERAND_IMMEDIATE:
+      // Check if this operand is an immediate.
+      // FrameIndex operands will be replaced by immediates, so they are
+      // allowed.
+      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
+          !MI->getOperand(i).isFI()) {
+        ErrInfo = "Expected immediate, but got non-immediate";
+        return false;
+      }
+      // Fall-through
+    default:
+      continue;
+    }
+
+    if (!MI->getOperand(i).isReg())
+      continue;
+
+    int RegClass = Desc.OpInfo[i].RegClass;
+    if (RegClass != -1) {
+      unsigned Reg = MI->getOperand(i).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        continue;
+
+      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
+      if (!RC->contains(Reg)) {
+        ErrInfo = "Operand has incorrect register class.";
+        return false;
+      }
+    }
+  }
+
+
+  // Verify VOP*
+  if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+    unsigned ConstantBusCount = 0;
+    unsigned SGPRUsed = AMDGPU::NoRegister;
+    for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isUse() &&
+          !TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+
+        // EXEC register uses the constant bus.
+        if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
+          ++ConstantBusCount;
+
+        // SGPRs use the constant bus
+        if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
+            (!MO.isImplicit() &&
+            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+            AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
+          if (SGPRUsed != MO.getReg()) {
+            ++ConstantBusCount;
+            SGPRUsed = MO.getReg();
+          }
+        }
+      }
+      // Literal constants use the constant bus.
+      if (isLiteralConstant(MO))
+        ++ConstantBusCount;
+    }
+    if (ConstantBusCount > 1) {
+      ErrInfo = "VOP* instruction uses the constant bus more than once";
+      return false;
+    }
+  }
+
+  // Verify SRC1 for VOP2 and VOPC
+  if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
+    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
+    if (Src1.isImm() || Src1.isFPImm()) {
+      ErrInfo = "VOP[2C] src1 cannot be an immediate.";
+      return false;
+    }
+  }
+
+  // Verify VOP3
+  if (isVOP3(Opcode)) {
+    if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
+      ErrInfo = "VOP3 src0 cannot be a literal constant.";
+      return false;
+    }
+    if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
+      ErrInfo = "VOP3 src1 cannot be a literal constant.";
+      return false;
+    }
+    if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
+      ErrInfo = "VOP3 src2 cannot be a literal constant.";
+      return false;
+    }
+  }
+
+  // Verify misc. restrictions on specific instructions.
+  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+    MI->dump();
+
+    const MachineOperand &Src0 = MI->getOperand(2);
+    const MachineOperand &Src1 = MI->getOperand(3);
+    const MachineOperand &Src2 = MI->getOperand(4);
+    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+      if (!compareMachineOp(Src0, Src1) &&
+          !compareMachineOp(Src0, Src2)) {
+        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: return AMDGPU::INSTRUCTION_LIST_END;
+  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
+  case AMDGPU::COPY: return AMDGPU::COPY;
+  case AMDGPU::PHI: return AMDGPU::PHI;
+  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+  case AMDGPU::S_MOV_B32:
+    return MI.getOperand(1).isReg() ?
+           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
+  case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
+  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+  case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
+  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
+  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
+  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
+  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
+  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
+  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
+  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
+  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
+  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
+  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
+  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
+  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
+  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
+  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
+  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
+  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
+  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
+  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
+  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
+  }
+}
+
+bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
+  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
+}
+
+const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
+                                                      unsigned OpNo) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &Desc = get(MI.getOpcode());
+  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
+      Desc.OpInfo[OpNo].RegClass == -1)
+    return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+
+  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+  return RI.getRegClass(RCID);
+}
+
+bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::REG_SEQUENCE:
+  case AMDGPU::PHI:
+  case AMDGPU::INSERT_SUBREG:
+    return RI.hasVGPRs(getOpRegClass(MI, 0));
+  default:
+    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
+  }
+}
+
+void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+  MachineBasicBlock::iterator I = MI;
+  MachineOperand &MO = MI->getOperand(OpIdx);
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+  const TargetRegisterClass *RC = RI.getRegClass(RCID);
+  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  if (MO.isReg()) {
+    Opcode = AMDGPU::COPY;
+  } else if (RI.isSGPRClass(RC)) {
+    Opcode = AMDGPU::S_MOV_B32;
+  }
+
+  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+  unsigned Reg = MRI.createVirtualRegister(VRC);
+  BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode),
+          Reg).addOperand(MO);
+  MO.ChangeToRegister(Reg, false);
+}
+
+unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineOperand &SuperReg,
+                                         const TargetRegisterClass *SuperRC,
+                                         unsigned SubIdx,
+                                         const TargetRegisterClass *SubRC)
+                                         const {
+  assert(SuperReg.isReg());
+
+  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+  unsigned SubReg = MRI.createVirtualRegister(SubRC);
+
+  // Just in case the super register is itself a sub-register, copy it to a new
+  // value so we don't need to worry about merging its subreg index with the
+  // SubIdx passed to this function. The register coalescer should be able to
+  // eliminate this extra copy.
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
+          NewSuperReg)
+          .addOperand(SuperReg);
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
+          SubReg)
+          .addReg(NewSuperReg, 0, SubIdx);
+  return SubReg;
+}
+
+MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
+  MachineBasicBlock::iterator MII,
+  MachineRegisterInfo &MRI,
+  MachineOperand &Op,
+  const TargetRegisterClass *SuperRC,
+  unsigned SubIdx,
+  const TargetRegisterClass *SubRC) const {
+  if (Op.isImm()) {
+    // XXX - Is there a better way to do this?
+    if (SubIdx == AMDGPU::sub0)
+      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
+    if (SubIdx == AMDGPU::sub1)
+      return MachineOperand::CreateImm(Op.getImm() >> 32);
+
+    llvm_unreachable("Unhandled register index for immediate");
+  }
+
+  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
+                                       SubIdx, SubRC);
+  return MachineOperand::CreateReg(SubReg, false);
+}
+
+unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
+                                    MachineBasicBlock::iterator MI,
+                                    MachineRegisterInfo &MRI,
+                                    const TargetRegisterClass *RC,
+                                    const MachineOperand &Op) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned Dst = MRI.createVirtualRegister(RC);
+
+  MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                             LoDst)
+    .addImm(Op.getImm() & 0xFFFFFFFF);
+  MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                             HiDst)
+    .addImm(Op.getImm() >> 32);
+
+  BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
+    .addReg(LoDst)
+    .addImm(AMDGPU::sub0)
+    .addReg(HiDst)
+    .addImm(AMDGPU::sub1);
+
+  Worklist.push_back(Lo);
+  Worklist.push_back(Hi);
+
+  return Dst;
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src2);
+
+  // Legalize VOP2
+  if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
+    MachineOperand &Src0 = MI->getOperand(Src0Idx);
+    MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+    // If the instruction implicitly reads VCC, we can't have any SGPR operands,
+    // so move any.
+    bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI);
+    if (ReadsVCC && Src0.isReg() &&
+        RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) {
+      legalizeOpWithMove(MI, Src0Idx);
+      return;
+    }
+
+    if (ReadsVCC && Src1.isReg() &&
+        RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+      legalizeOpWithMove(MI, Src1Idx);
+      return;
+    }
+
+    // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must
+    // be the first operand, and there can only be one.
+    if (Src1.isImm() || Src1.isFPImm() ||
+        (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) {
+      if (MI->isCommutable()) {
+        if (commuteInstruction(MI))
+          return;
+      }
+      legalizeOpWithMove(MI, Src1Idx);
+    }
+  }
+
+  // XXX - Do any VOP3 instructions read VCC?
+  // Legalize VOP3
+  if (isVOP3(MI->getOpcode())) {
+    int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
+    unsigned SGPRReg = AMDGPU::NoRegister;
+    for (unsigned i = 0; i < 3; ++i) {
+      int Idx = VOP3Idx[i];
+      if (Idx == -1)
+        continue;
+      MachineOperand &MO = MI->getOperand(Idx);
+
+      if (MO.isReg()) {
+        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+          continue; // VGPRs are legal
+
+        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+
+        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+          SGPRReg = MO.getReg();
+          // We can use one SGPR in each VOP3 instruction.
+          continue;
+        }
+      } else if (!isLiteralConstant(MO)) {
+        // If it is not a register and not a literal constant, then it must be
+        // an inline constant which is always legal.
+        continue;
+      }
+      // If we make it this far, then the operand is not legal and we must
+      // legalize it.
+      legalizeOpWithMove(MI, Idx);
+    }
+  }
+
+  // Legalize REG_SEQUENCE and PHI
+  // The register class of the operands much be the same type as the register
+  // class of the output.
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
+      MI->getOpcode() == AMDGPU::PHI) {
+    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      const TargetRegisterClass *OpRC =
+              MRI.getRegClass(MI->getOperand(i).getReg());
+      if (RI.hasVGPRs(OpRC)) {
+        VRC = OpRC;
+      } else {
+        SRC = OpRC;
+      }
+    }
+
+    // If any of the operands are VGPR registers, then they all most be
+    // otherwise we will create illegal VGPR->SGPR copies when legalizing
+    // them.
+    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+      if (!VRC) {
+        assert(SRC);
+        VRC = RI.getEquivalentVGPRClass(SRC);
+      }
+      RC = VRC;
+    } else {
+      RC = SRC;
+    }
+
+    // Update all the operands so they have the same type.
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      unsigned DstReg = MRI.createVirtualRegister(RC);
+      MachineBasicBlock *InsertBB;
+      MachineBasicBlock::iterator Insert;
+      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+        InsertBB = MI->getParent();
+        Insert = MI;
+      } else {
+        // MI is a PHI instruction.
+        InsertBB = MI->getOperand(i + 1).getMBB();
+        Insert = InsertBB->getFirstTerminator();
+      }
+      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
+              get(AMDGPU::COPY), DstReg)
+              .addOperand(MI->getOperand(i));
+      MI->getOperand(i).setReg(DstReg);
+    }
+  }
+
+  // Legalize INSERT_SUBREG
+  // src0 must have the same register class as dst
+  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
+    unsigned Dst = MI->getOperand(0).getReg();
+    unsigned Src0 = MI->getOperand(1).getReg();
+    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
+    if (DstRC != Src0RC) {
+      MachineBasicBlock &MBB = *MI->getParent();
+      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
+              .addReg(Src0);
+      MI->getOperand(1).setReg(NewSrc0);
+    }
+    return;
+  }
+
+  // Legalize MUBUF* instructions
+  // FIXME: If we start using the non-addr64 instructions for compute, we
+  // may need to legalize them here.
+
+  int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                            AMDGPU::OpName::srsrc);
+  int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                             AMDGPU::OpName::vaddr);
+  if (SRsrcIdx != -1 && VAddrIdx != -1) {
+    const TargetRegisterClass *VAddrRC =
+        RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
+
+    if(VAddrRC->getSize() == 8 &&
+       MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
+      // We have a MUBUF instruction that uses a 64-bit vaddr register and
+      // srsrc has the incorrect register class.  In order to fix this, we
+      // need to extract the pointer from the resource descriptor (srsrc),
+      // add it to the value of vadd,  then store the result in the vaddr
+      // operand.  Then, we need to set the pointer field of the resource
+      // descriptor to zero.
+
+      MachineBasicBlock &MBB = *MI->getParent();
+      MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
+      MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
+      unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
+      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+      unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+
+      // SRsrcPtrLo = srsrc:sub0
+      SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
+          &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+
+      // SRsrcPtrHi = srsrc:sub1
+      SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
+          &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+
+      // VAddrLo = vaddr:sub0
+      VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
+          &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+
+      // VAddrHi = vaddr:sub1
+      VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
+          &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+
+      // NewVaddrLo = SRsrcPtrLo + VAddrLo
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
+              NewVAddrLo)
+              .addReg(SRsrcPtrLo)
+              .addReg(VAddrLo)
+              .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
+
+      // NewVaddrHi = SRsrcPtrHi + VAddrHi
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
+              NewVAddrHi)
+              .addReg(SRsrcPtrHi)
+              .addReg(VAddrHi)
+              .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
+              .addReg(AMDGPU::VCC, RegState::Implicit);
+
+      // NewVaddr = {NewVaddrHi, NewVaddrLo}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+              NewVAddr)
+              .addReg(NewVAddrLo)
+              .addImm(AMDGPU::sub0)
+              .addReg(NewVAddrHi)
+              .addImm(AMDGPU::sub1);
+
+      // Zero64 = 0
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
+              Zero64)
+              .addImm(0);
+
+      // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+              SRsrcFormatLo)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+
+      // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+              SRsrcFormatHi)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+
+      // NewSRsrc = {Zero64, SRsrcFormat}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+              NewSRsrc)
+              .addReg(Zero64)
+              .addImm(AMDGPU::sub0_sub1)
+              .addReg(SRsrcFormatLo)
+              .addImm(AMDGPU::sub2)
+              .addReg(SRsrcFormatHi)
+              .addImm(AMDGPU::sub3);
+
+      // Update the instruction to use NewVaddr
+      MI->getOperand(VAddrIdx).setReg(NewVAddr);
+      // Update the instruction to use NewSRsrc
+      MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
+    }
+  }
+}
+
+void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  switch (MI->getOpcode()) {
+    case AMDGPU::S_LOAD_DWORD_IMM:
+    case AMDGPU::S_LOAD_DWORD_SGPR:
+    case AMDGPU::S_LOAD_DWORDX2_IMM:
+    case AMDGPU::S_LOAD_DWORDX2_SGPR:
+    case AMDGPU::S_LOAD_DWORDX4_IMM:
+    case AMDGPU::S_LOAD_DWORDX4_SGPR:
+      unsigned NewOpcode = getVALUOp(*MI);
+      unsigned RegOffset;
+      unsigned ImmOffset;
+
+      if (MI->getOperand(2).isReg()) {
+        RegOffset = MI->getOperand(2).getReg();
+        ImmOffset = 0;
+      } else {
+        assert(MI->getOperand(2).isImm());
+        // SMRD instructions take a dword offsets and MUBUF instructions
+        // take a byte offset.
+        ImmOffset = MI->getOperand(2).getImm() << 2;
+        RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+        if (isUInt<12>(ImmOffset)) {
+          BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+                  .addImm(0);
+        } else {
+          BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+                  .addImm(ImmOffset);
+          ImmOffset = 0;
+        }
+      }
+
+      unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+      unsigned DWord0 = RegOffset;
+      unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
+              .addImm(0);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
+              .addReg(DWord0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DWord1)
+              .addImm(AMDGPU::sub1)
+              .addReg(DWord2)
+              .addImm(AMDGPU::sub2)
+              .addReg(DWord3)
+              .addImm(AMDGPU::sub3);
+     MI->setDesc(get(NewOpcode));
+     if (MI->getOperand(2).isReg()) {
+       MI->getOperand(2).setReg(MI->getOperand(1).getReg());
+     } else {
+       MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
+     }
+     MI->getOperand(1).setReg(SRsrc);
+     MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+  }
+}
+
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+  SmallVector<MachineInstr *, 128> Worklist;
+  Worklist.push_back(&TopInst);
+
+  while (!Worklist.empty()) {
+    MachineInstr *Inst = Worklist.pop_back_val();
+    MachineBasicBlock *MBB = Inst->getParent();
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+    unsigned Opcode = Inst->getOpcode();
+    unsigned NewOpcode = getVALUOp(*Inst);
+
+    // Handle some special cases
+    switch (Opcode) {
+    default:
+      if (isSMRD(Inst->getOpcode())) {
+        moveSMRDToVALU(Inst, MRI);
+      }
+      break;
+    case AMDGPU::S_MOV_B64: {
+      DebugLoc DL = Inst->getDebugLoc();
+
+      // If the source operand is a register we can replace this with a
+      // copy.
+      if (Inst->getOperand(1).isReg()) {
+        MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
+          .addOperand(Inst->getOperand(0))
+          .addOperand(Inst->getOperand(1));
+        Worklist.push_back(Copy);
+      } else {
+        // Otherwise, we need to split this into two movs, because there is
+        // no 64-bit VALU move instruction.
+        unsigned Reg = Inst->getOperand(0).getReg();
+        unsigned Dst = split64BitImm(Worklist,
+                                     Inst,
+                                     MRI,
+                                     MRI.getRegClass(Reg),
+                                     Inst->getOperand(1));
+        MRI.replaceRegWith(Reg, Dst);
+      }
+      Inst->eraseFromParent();
+      continue;
+    }
+    case AMDGPU::S_AND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_OR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOT_B64:
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BCNT1_I32_B64:
+      splitScalar64BitBCNT(Worklist, Inst);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_BFE_I64:
+    case AMDGPU::S_BFM_B64:
+      llvm_unreachable("Moving this op to VALU not implemented");
+    }
+
+    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+      // We cannot move this instruction to the VALU, so we should try to
+      // legalize its operands instead.
+      legalizeOperands(Inst);
+      continue;
+    }
+
+    // Use the new VALU Opcode.
+    const MCInstrDesc &NewDesc = get(NewOpcode);
+    Inst->setDesc(NewDesc);
+
+    // Remove any references to SCC. Vector instructions can't read from it, and
+    // We're just about to add the implicit use / defs of VCC, and we don't want
+    // both.
+    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
+      MachineOperand &Op = Inst->getOperand(i);
+      if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
+        Inst->RemoveOperand(i);
+    }
+
+    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+      // We are converting these to a BFE, so we need to add the missing
+      // operands for the size and offset.
+      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+      Inst->addOperand(Inst->getOperand(1));
+      Inst->getOperand(1).ChangeToImmediate(0);
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(Size));
+
+      // XXX - Other pointless operands. There are 4, but it seems you only need
+      // 3 to not hit an assertion later in MCInstLower.
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+      // The VALU version adds the second operand to the result, so insert an
+      // extra 0 operand.
+      Inst->addOperand(MachineOperand::CreateImm(0));
+    }
+
+    addDescImplicitUseDef(NewDesc, Inst);
+
+    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+      // If we need to move this to VGPRs, we need to unpack the second operand
+      // back into the 2 separate ones for bit offset and width.
+      assert(OffsetWidthOp.isImm() &&
+             "Scalar BFE is only implemented for constant width and offset");
+      uint32_t Imm = OffsetWidthOp.getImm();
+
+      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+      Inst->RemoveOperand(2); // Remove old immediate.
+      Inst->addOperand(Inst->getOperand(1));
+      Inst->getOperand(1).ChangeToImmediate(0);
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(Offset));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+    }
+
+    // Update the destination register class.
+
+    const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
+
+    switch (Opcode) {
+      // For target instructions, getOpRegClass just returns the virtual
+      // register class associated with the operand, so we need to find an
+      // equivalent VGPR register class in order to move the instruction to the
+      // VALU.
+    case AMDGPU::COPY:
+    case AMDGPU::PHI:
+    case AMDGPU::REG_SEQUENCE:
+    case AMDGPU::INSERT_SUBREG:
+      if (RI.hasVGPRs(NewDstRC))
+        continue;
+      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+      if (!NewDstRC)
+        continue;
+      break;
+    default:
+      break;
+    }
+
+    unsigned DstReg = Inst->getOperand(0).getReg();
+    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+    MRI.replaceRegWith(DstReg, NewDstReg);
+
+    // Legalize the operands
+    legalizeOperands(Inst);
+
+    for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+           E = MRI.use_end(); I != E; ++I) {
+      MachineInstr &UseMI = *I->getParent();
+      if (!canReadVGPR(UseMI, I.getOperandNo())) {
+        Worklist.push_back(&UseMI);
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Indirect addressing callbacks
+//===----------------------------------------------------------------------===//
+
+unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  assert(Channel == 0);
+  return RegIndex;
+}
+
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::VReg_32RegClass;
+}
+
+void SIInstrInfo::splitScalar64BitUnaryOp(
+  SmallVectorImpl<MachineInstr *> &Worklist,
+  MachineInstr *Inst,
+  unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src0 = Inst->getOperand(1);
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+  const TargetRegisterClass *Src0RC = Src0.isReg() ?
+    MRI.getRegClass(Src0.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub0, Src0SubRC);
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+
+  unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+    .addOperand(SrcReg0Sub0);
+
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+
+  unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+    .addOperand(SrcReg0Sub1);
+
+  unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  Worklist.push_back(LoHalf);
+  Worklist.push_back(HiHalf);
+}
+
+void SIInstrInfo::splitScalar64BitBinaryOp(
+  SmallVectorImpl<MachineInstr *> &Worklist,
+  MachineInstr *Inst,
+  unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src0 = Inst->getOperand(1);
+  MachineOperand &Src1 = Inst->getOperand(2);
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+  const TargetRegisterClass *Src0RC = Src0.isReg() ?
+    MRI.getRegClass(Src0.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+  const TargetRegisterClass *Src1RC = Src1.isReg() ?
+    MRI.getRegClass(Src1.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub0, Src0SubRC);
+  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub0, Src1SubRC);
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+
+  unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+    .addOperand(SrcReg0Sub0)
+    .addOperand(SrcReg1Sub0);
+
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
+
+  unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+    .addOperand(SrcReg0Sub1)
+    .addOperand(SrcReg1Sub1);
+
+  unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  Worklist.push_back(LoHalf);
+  Worklist.push_back(HiHalf);
+}
+
+void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+                                       MachineInstr *Inst) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src = Inst->getOperand(1);
+
+  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+  const TargetRegisterClass *SrcRC = Src.isReg() ?
+    MRI.getRegClass(Src.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub0, SrcSubRC);
+  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub1, SrcSubRC);
+
+  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+    .addOperand(SrcRegSub0)
+    .addImm(0);
+
+  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+    .addOperand(SrcRegSub1)
+    .addReg(MidReg);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+
+  Worklist.push_back(First);
+  Worklist.push_back(Second);
+}
+
+void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
+                                        MachineInstr *Inst) const {
+  // Add the implict and explicit register definitions.
+  if (NewDesc.ImplicitUses) {
+    for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
+      unsigned Reg = NewDesc.ImplicitUses[i];
+      Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+    }
+  }
+
+  if (NewDesc.ImplicitDefs) {
+    for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
+      unsigned Reg = NewDesc.ImplicitDefs[i];
+      Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
+    }
+  }
+}
+
+MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
+                                   MachineBasicBlock *MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned ValueReg,
+                                   unsigned Address, unsigned OffsetReg) const {
+  const DebugLoc &DL = MBB->findDebugLoc(I);
+  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+                                      getIndirectIndexBegin(*MBB->getParent()));
+
+  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
+          .addReg(IndirectBaseReg, RegState::Define)
+          .addOperand(I->getOperand(0))
+          .addReg(IndirectBaseReg)
+          .addReg(OffsetReg)
+          .addImm(0)
+          .addReg(ValueReg);
+}
+
+MachineInstrBuilder SIInstrInfo::buildIndirectRead(
+                                   MachineBasicBlock *MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned ValueReg,
+                                   unsigned Address, unsigned OffsetReg) const {
+  const DebugLoc &DL = MBB->findDebugLoc(I);
+  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+                                      getIndirectIndexBegin(*MBB->getParent()));
+
+  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
+          .addOperand(I->getOperand(0))
+          .addOperand(I->getOperand(1))
+          .addReg(IndirectBaseReg)
+          .addReg(OffsetReg)
+          .addImm(0);
+
+}
+
+void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+                                            const MachineFunction &MF) const {
+  int End = getIndirectIndexEnd(MF);
+  int Begin = getIndirectIndexBegin(MF);
+
+  if (End == -1)
+    return;
+
+
+  for (int Index = Begin; Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
+}
+
+const MachineOperand *SIInstrInfo::getNamedOperand(const MachineInstr& MI,
+                                                   unsigned OperandName) const {
+  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
+  if (Idx == -1)
+    return nullptr;
+
+  return &MI.getOperand(Idx);
+}
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.h b/contrib/llvm/lib/Target/R600/SIInstrInfo.h
new file mode 100644
index 0000000..4687539
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.h
@@ -0,0 +1,212 @@
+//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIINSTRINFO_H
+#define SIINSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+namespace llvm {
+
+class SIInstrInfo : public AMDGPUInstrInfo {
+private:
+  const SIRegisterInfo RI;
+
+  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
+                              MachineRegisterInfo &MRI,
+                              MachineOperand &SuperReg,
+                              const TargetRegisterClass *SuperRC,
+                              unsigned SubIdx,
+                              const TargetRegisterClass *SubRC) const;
+  MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineOperand &SuperReg,
+                                         const TargetRegisterClass *SuperRC,
+                                         unsigned SubIdx,
+                                         const TargetRegisterClass *SubRC) const;
+
+  unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
+                         MachineBasicBlock::iterator MI,
+                         MachineRegisterInfo &MRI,
+                         const TargetRegisterClass *RC,
+                         const MachineOperand &Op) const;
+
+  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                               MachineInstr *Inst, unsigned Opcode) const;
+
+  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                                MachineInstr *Inst, unsigned Opcode) const;
+
+  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+                            MachineInstr *Inst) const;
+
+  void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
+
+public:
+  explicit SIInstrInfo(const AMDGPUSubtarget &st);
+
+  const SIRegisterInfo &getRegisterInfo() const override {
+    return RI;
+  }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  unsigned commuteOpcode(unsigned Opcode) const;
+
+  MachineInstr *commuteInstruction(MachineInstr *MI,
+                                   bool NewMI=false) const override;
+
+  bool isTriviallyReMaterializable(const MachineInstr *MI,
+                                   AliasAnalysis *AA = nullptr) const;
+
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const override;
+  bool isMov(unsigned Opcode) const override;
+
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+  bool isDS(uint16_t Opcode) const;
+  int isMIMG(uint16_t Opcode) const;
+  int isSMRD(uint16_t Opcode) const;
+  bool isVOP1(uint16_t Opcode) const;
+  bool isVOP2(uint16_t Opcode) const;
+  bool isVOP3(uint16_t Opcode) const;
+  bool isVOPC(uint16_t Opcode) const;
+  bool isInlineConstant(const APInt &Imm) const;
+  bool isInlineConstant(const MachineOperand &MO) const;
+  bool isLiteralConstant(const MachineOperand &MO) const;
+
+  bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+                         const MachineOperand &MO) const;
+
+  /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+  /// This function will return false if you pass it a 32-bit instruction.
+  bool hasVALU32BitEncoding(unsigned Opcode) const;
+
+  bool verifyInstruction(const MachineInstr *MI,
+                         StringRef &ErrInfo) const override;
+
+  bool isSALUInstr(const MachineInstr &MI) const;
+  static unsigned getVALUOp(const MachineInstr &MI);
+
+  bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+  /// \brief Return the correct register class for \p OpNo.  For target-specific
+  /// instructions, this will return the register class that has been defined
+  /// in tablegen.  For generic instructions, like REG_SEQUENCE it will return
+  /// the register class of its machine operand.
+  /// to infer the correct register class base on the other operands.
+  const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+                                           unsigned OpNo) const;\
+
+  /// \returns true if it is legal for the operand at index \p OpNo
+  /// to read a VGPR.
+  bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+  /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+  /// a MOV.  For example:
+  /// ADD_I32_e32 VGPR0, 15
+  /// to
+  /// MOV VGPR1, 15
+  /// ADD_I32_e32 VGPR0, VGPR1
+  ///
+  /// If the operand being legalized is a register, then a COPY will be used
+  /// instead of MOV.
+  void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+
+  /// \brief Legalize all operands in this instruction.  This function may
+  /// create new instruction and insert them before \p MI.
+  void legalizeOperands(MachineInstr *MI) const;
+
+  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+
+  /// \brief Replace this instruction's opcode with the equivalent VALU
+  /// opcode.  This function will also move the users of \p MI to the
+  /// VALU if necessary.
+  void moveToVALU(MachineInstr &MI) const;
+
+  unsigned calculateIndirectAddress(unsigned RegIndex,
+                                    unsigned Channel) const override;
+
+  const TargetRegisterClass *getIndirectAddrRegClass() const override;
+
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned ValueReg,
+                                         unsigned Address,
+                                         unsigned OffsetReg) const override;
+
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg,
+                                        unsigned Address,
+                                        unsigned OffsetReg) const override;
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
+
+  void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
+              unsigned SavReg, unsigned IndexReg) const;
+
+  void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
+
+  /// \brief Returns the operand named \p Op.  If \p MI does not have an
+  /// operand named \c Op, this function returns nullptr.
+  const MachineOperand *getNamedOperand(const MachineInstr& MI,
+                                        unsigned OperandName) const;
+};
+
+namespace AMDGPU {
+
+  int getVOPe64(uint16_t Opcode);
+  int getVOPe32(uint16_t Opcode);
+  int getCommuteRev(uint16_t Opcode);
+  int getCommuteOrig(uint16_t Opcode);
+  int getMCOpcode(uint16_t Opcode, unsigned Gen);
+
+  const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+  const uint64_t RSRC_TID_ENABLE = 1LL << 55;
+
+} // End namespace AMDGPU
+
+} // End namespace llvm
+
+namespace SIInstrFlags {
+  enum Flags {
+    // First 4 bits are the instruction encoding
+    VM_CNT = 1 << 0,
+    EXP_CNT = 1 << 1,
+    LGKM_CNT = 1 << 2
+  };
+}
+
+#endif //SIINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.td b/contrib/llvm/lib/Target/R600/SIInstrInfo.td
new file mode 100644
index 0000000..b0ac20f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.td
@@ -0,0 +1,917 @@
+//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum
+// in AMDGPUMCInstLower.h
+def SISubtarget {
+  int NONE = -1;
+  int SI = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// SI DAG Nodes
+//===----------------------------------------------------------------------===//
+
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
+                      [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+  SDTypeProfile<0, 13,
+    [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
+     SDTCisVT<1, iAny>,   // vdata(VGPR)
+     SDTCisVT<2, i32>,    // num_channels(imm)
+     SDTCisVT<3, i32>,    // vaddr(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // offen(imm)
+     SDTCisVT<9, i32>,    // idxen(imm)
+     SDTCisVT<10, i32>,   // glc(imm)
+     SDTCisVT<11, i32>,   // slc(imm)
+     SDTCisVT<12, i32>    // tfe(imm)
+    ]>,
+  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
+                       SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+                       SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
+def SIconstdata_ptr : SDNode<
+  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+>;
+
+// Transformation function, extract the lower 32bit of a 64bit immediate
+def LO32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
+}]>;
+
+def LO32f : SDNodeXForm<fpimm, [{
+  APInt V = N->getValueAPF().bitcastToAPInt().trunc(32);
+  return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32);
+}]>;
+
+// Transformation function, extract the upper 32bit of a 64bit immediate
+def HI32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
+}]>;
+
+def HI32f : SDNodeXForm<fpimm, [{
+  APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32);
+  return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32);
+}]>;
+
+def IMM8bitDWORD : PatLeaf <(imm),
+  [{return (N->getZExtValue() & ~0x3FC) == 0;}]
+>;
+
+def as_dword_i32imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() >> 2, MVT::i32);
+}]>;
+
+def as_i1imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i1);
+}]>;
+
+def as_i8imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i8);
+}]>;
+
+def as_i16imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
+}]>;
+
+def as_i32imm: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
+}]>;
+
+def IMM8bit : PatLeaf <(imm),
+  [{return isUInt<8>(N->getZExtValue());}]
+>;
+
+def IMM12bit : PatLeaf <(imm),
+  [{return isUInt<12>(N->getZExtValue());}]
+>;
+
+def IMM16bit : PatLeaf <(imm),
+  [{return isUInt<16>(N->getZExtValue());}]
+>;
+
+def IMM32bit : PatLeaf <(imm),
+  [{return isUInt<32>(N->getZExtValue());}]
+>;
+
+def mubuf_vaddr_offset : PatFrag<
+  (ops node:$ptr, node:$offset, node:$imm_offset),
+  (add (add node:$ptr, node:$offset), node:$imm_offset)
+>;
+
+class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
+  return isInlineImmediate(N);
+}]>;
+
+class SGPRImm <dag frag> : PatLeaf<frag, [{
+  if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
+      AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+                       static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                U != E; ++U) {
+    if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
+      return true;
+    }
+  }
+  return false;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Custom Operands
+//===----------------------------------------------------------------------===//
+
+def FRAMEri32 : Operand<iPTR> {
+  let MIOperandInfo = (ops i32:$ptr, i32imm:$index);
+}
+
+def sopp_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getSOPPBrEncoding";
+  let OperandType = "OPERAND_PCREL";
+}
+
+//===----------------------------------------------------------------------===//
+// Complex patterns
+//===----------------------------------------------------------------------===//
+
+def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
+def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+
+//===----------------------------------------------------------------------===//
+// SI assembler operands
+//===----------------------------------------------------------------------===//
+
+def SIOperand {
+  int ZERO = 0x80;
+  int VCC = 0x6A;
+}
+
+include "SIInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction multiclass helpers.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
+// encoding is the standard encoding, but instruction that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Scalar classes
+//===----------------------------------------------------------------------===//
+
+class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+  op, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+// 64-bit input, 32-bit output.
+class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+  op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+
+class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt,
+                    string opName, PatLeaf cond> : SOPC <
+  op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
+  opName#" $dst, $src0, $src1", []>;
+
+class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+  : SOPC_Helper<op, SSrc_32, i32, opName, cond>;
+
+class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+  : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
+
+class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
+  op, (outs SReg_32:$dst), (ins i16imm:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
+  op, (outs SReg_64:$dst), (ins i16imm:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
+                        RegisterClass dstClass> {
+  def _IMM : SMRD <
+    op, 1, (outs dstClass:$dst),
+    (ins baseClass:$sbase, u32imm:$offset),
+    asm#" $dst, $sbase, $offset", []
+  >;
+
+  def _SGPR : SMRD <
+    op, 0, (outs dstClass:$dst),
+    (ins baseClass:$sbase, SReg_32:$soff),
+    asm#" $dst, $sbase, $soff", []
+  >;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector ALU classes
+//===----------------------------------------------------------------------===//
+
+class VOP <string opName> {
+  string OpName = opName;
+}
+
+class VOP2_REV <string revOp, bit isOrig> {
+  string RevOp = revOp;
+  bit IsOrig = isOrig;
+}
+
+class SIMCInstr <string pseudo, int subtarget> {
+  string PseudoInstr = pseudo;
+  int Subtarget = subtarget;
+}
+
+class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+  VOP3Common <outs, ins, "", pattern>,
+  VOP <opName>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+}
+
+class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
+  VOP3 <op, outs, ins, asm, []>,
+  SIMCInstr<opName, SISubtarget.SI>;
+
+multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  def _si : VOP3_Real_si <op, outs, ins, asm, opName>;
+
+}
+
+multiclass VOP3_1_m <bits<8> op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  let src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0 in {
+
+    def _si : VOP3_Real_si <
+      {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+      outs, ins, asm, opName
+    >;
+
+  } // src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0
+}
+
+multiclass VOP3_2_m <bits<6> op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName, string revOp> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  let src2 = 0, src2_modifiers = 0 in {
+
+    def _si : VOP3_Real_si <
+        {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+        outs, ins, asm, opName>,
+        VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+  } // src2 = 0, src2_modifiers = 0
+}
+
+// This must always be right before the operand being input modified.
+def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
+  let PrintMethod = "printOperandAndMods";
+}
+
+multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
+                        string opName, list<dag> pattern> {
+
+  def _e32 : VOP1 <
+    op, (outs drc:$dst), (ins src:$src0),
+    opName#"_e32 $dst, $src0", pattern
+  >, VOP <opName>;
+
+  defm _e64 : VOP3_1_m <
+    op,
+    (outs drc:$dst),
+    (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [], opName>;
+}
+
+multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
+multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>;
+
+multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>;
+
+multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
+                        string opName, list<dag> pattern, string revOp> {
+  def _e32 : VOP2 <
+    op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1),
+    opName#"_e32 $dst, $src0, $src1", pattern
+  >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  defm _e64 : VOP3_2_m <
+    op,
+    (outs vrc:$dst),
+    (ins InputMods:$src0_modifiers, arc:$src0,
+         InputMods:$src1_modifiers, arc:$src1,
+         i32imm:$clamp, i32imm:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [],
+    opName, revOp>;
+}
+
+multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern,
+                    string revOp = opName>
+  : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>;
+
+multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern,
+                    string revOp = opName>
+  : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>;
+
+multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
+                     RegisterClass src0_rc, string revOp = opName> {
+
+  def _e32 : VOP2 <
+    op, (outs VReg_32:$dst), (ins src0_rc:$src0, VReg_32:$src1),
+    opName#"_e32 $dst, $src0, $src1", pattern
+  >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  def _e64 : VOP3b <
+    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    (outs VReg_32:$dst),
+    (ins InputMods: $src0_modifiers, VSrc_32:$src0,
+         InputMods:$src1_modifiers, VSrc_32:$src1,
+         i32imm:$clamp, i32imm:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
+  >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
+    let src2 = 0;
+    let src2_modifiers = 0;
+    /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
+       can write it into any SGPR. We currently don't use the carry out,
+       so for now hardcode it to VCC as well */
+    let sdst = SIOperand.VCC;
+  }
+}
+
+multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
+                        string opName, ValueType vt, PatLeaf cond, bit defExec = 0> {
+  def _e32 : VOPC <
+    op, (ins arc:$src0, vrc:$src1),
+    opName#"_e32 $dst, $src0, $src1", []
+  >, VOP <opName> {
+    let Defs = !if(defExec, [EXEC], []);
+  }
+
+  def _e64 : VOP3 <
+    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    (outs SReg_64:$dst),
+    (ins InputMods:$src0_modifiers, arc:$src0,
+         InputMods:$src1_modifiers, arc:$src1,
+         InstFlag:$clamp, InstFlag:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod",
+    !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>,
+      [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
+    )
+  >, VOP <opName> {
+    let Defs = !if(defExec, [EXEC], []);
+    let src2 = 0;
+    let src2_modifiers = 0;
+  }
+}
+
+multiclass VOPC_32 <bits<8> op, string opName,
+  ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>;
+
+multiclass VOPC_64 <bits<8> op, string opName,
+  ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
+
+multiclass VOPCX_32 <bits<8> op, string opName,
+  ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>;
+
+multiclass VOPCX_64 <bits<8> op, string opName,
+  ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>;
+
+multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
+  op, (outs VReg_32:$dst),
+  (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
+   VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2,
+   InstFlag:$clamp, InstFlag:$omod),
+  opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
+>;
+
+class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+  op, (outs VReg_64:$dst),
+  (ins VSrc_64:$src0, VSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>, VOP <opName> {
+
+  let src2 = 0;
+  let src2_modifiers = 0;
+  let src0_modifiers = 0;
+  let clamp = 0;
+  let omod = 0;
+}
+
+class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
+  op, (outs VReg_64:$dst),
+  (ins InputMods:$src0_modifiers, VSrc_64:$src0,
+       InputMods:$src1_modifiers, VSrc_64:$src1,
+       InputMods:$src2_modifiers, VSrc_64:$src2,
+       InstFlag:$clamp, InstFlag:$omod),
+  opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
+>, VOP <opName>;
+
+
+class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
+                    string opName, list<dag> pattern> : VOP3 <
+  op, (outs vrc:$dst0, SReg_64:$dst1),
+  (ins arc:$src0, arc:$src1, arc:$src2,
+   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
+  opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+>, VOP <opName>;
+
+
+class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+  VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
+  VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
+//===----------------------------------------------------------------------===//
+// Vector I/O classes
+//===----------------------------------------------------------------------===//
+
+class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
+    DS <op, outs, ins, asm, pat> {
+  bits<16> offset;
+
+  // Single load interpret the 2 i8imm operands as a single i16 offset.
+  let offset0 = offset{7-0};
+  let offset1 = offset{15-8};
+}
+
+class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+  op,
+  (outs regClass:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset),
+  asm#" $vdst, $addr, $offset, [M0]",
+  []> {
+  let data0 = 0;
+  let data1 = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+  op,
+  (outs regClass:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1),
+  asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]",
+  []> {
+  let data0 = 0;
+  let data1 = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset),
+  asm#" $addr, $data0, $offset [M0]",
+  []> {
+  let data1 = 0;
+  let mayStore = 1;
+  let mayLoad = 0;
+  let vdst = 0;
+}
+
+class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1),
+  asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 0;
+  let vdst = 0;
+}
+
+// 1 address, 1 data.
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs rc:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
+  asm#" $vdst, $addr, $data0, $offset, [M0]",
+  []> {
+
+  let data1 = 0;
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// 1 address, 2 data.
+class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs rc:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+  asm#" $vdst, $addr, $data0, $data1, $offset, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// 1 address, 2 data.
+class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+  asm#" $addr, $data0, $data1, $offset, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// 1 address, 1 data.
+class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
+  asm#" $addr, $data0, $offset, [M0]",
+  []> {
+
+  let data1 = 0;
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs),
+  (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+   i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
+   SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+  asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+     #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 0;
+}
+
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
+                              ValueType load_vt = i32,
+                              SDPatternOperator ld = null_frag> {
+
+  let lds = 0, mayLoad = 1 in {
+
+    let addr64 = 0 in {
+
+      let offen = 0, idxen = 0, vaddr = 0 in {
+        def _OFFSET : MUBUF <op, (outs regClass:$vdata),
+                             (ins SReg_128:$srsrc,
+                             u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+                             i1imm:$slc, i1imm:$tfe),
+                             asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+      }
+
+      let offen = 1, idxen = 0  in {
+        def _OFFEN  : MUBUF <op, (outs regClass:$vdata),
+                             (ins SReg_128:$srsrc, VReg_32:$vaddr,
+                             SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc,
+                             i1imm:$tfe),
+                             asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+      }
+
+      let offen = 0, idxen = 1 in {
+        def _IDXEN  : MUBUF <op, (outs regClass:$vdata),
+                             (ins SReg_128:$srsrc, VReg_32:$vaddr,
+                             u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+                             i1imm:$slc, i1imm:$tfe),
+                             asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+      }
+
+      let offen = 1, idxen = 1 in {
+        def _BOTHEN : MUBUF <op, (outs regClass:$vdata),
+                             (ins SReg_128:$srsrc, VReg_64:$vaddr,
+                             SSrc_32:$soffset, i1imm:$glc,
+                             i1imm:$slc, i1imm:$tfe),
+                             asm#" $vdata, $srsrc[$vaddr[0]] + $vaddr[1] + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+      }
+    }
+
+    let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
+      def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
+                           (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
+                           asm#" $vdata, $srsrc + $vaddr + $offset",
+                           [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
+                                                  i64:$vaddr, u16imm:$offset)))]>;
+    }
+  }
+}
+
+multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+                          ValueType store_vt, SDPatternOperator st> {
+
+  def "" : MUBUF <
+    op, (outs),
+    (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
+         u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc,
+         i1imm:$tfe),
+    name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe",
+    []
+  > {
+    let addr64 = 0;
+  }
+
+  def _ADDR64 : MUBUF <
+    op, (outs),
+    (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
+    name#" $vdata, $srsrc + $vaddr + $offset",
+    [(st store_vt:$vdata,
+     (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
+
+      let mayLoad = 0;
+      let mayStore = 1;
+
+      // Encoding
+      let offen = 0;
+      let idxen = 0;
+      let glc = 0;
+      let addr64 = 1;
+      let lds = 0;
+      let slc = 0;
+      let tfe = 0;
+      let soffset = 128; // ZERO
+   }
+}
+
+class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs regClass:$dst),
+  (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+  asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+     #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class MIMG_Mask <string op, int channels> {
+  string Op = op;
+  int Channels = channels;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+                             RegisterClass dst_rc,
+                             RegisterClass src_rc> : MIMG <
+  op,
+  (outs dst_rc:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+       SReg_256:$srsrc),
+  asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+     #" $tfe, $lwe, $slc, $vaddr, $srsrc",
+  []> {
+  let SSAMP = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+  let hasPostISelHook = 1;
+}
+
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+                                      RegisterClass dst_rc,
+                                      int channels> {
+  def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+                           RegisterClass dst_rc,
+                           RegisterClass src_rc> : MIMG <
+  op,
+  (outs dst_rc:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+       SReg_256:$srsrc, SReg_128:$ssamp),
+  asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+     #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+  let hasPostISelHook = 1;
+}
+
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels> {
+  def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
+}
+
+class MIMG_Gather_Helper <bits<7> op, string asm,
+                          RegisterClass dst_rc,
+                          RegisterClass src_rc> : MIMG <
+  op,
+  (outs dst_rc:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+       SReg_256:$srsrc, SReg_128:$ssamp),
+  asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+     #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+
+  // DMASK was repurposed for GATHER4. 4 components are always
+  // returned and DMASK works like a swizzle - it selects
+  // the component to fetch. The only useful DMASK values are
+  // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+  // (red,red,red,red) etc.) The ISA document doesn't mention
+  // this.
+  // Therefore, disable all code which updates DMASK by setting these two:
+  let MIMG = 0;
+  let hasPostISelHook = 0;
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels> {
+  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector instruction mappings
+//===----------------------------------------------------------------------===//
+
+// Maps an opcode in e32 form to its e64 equivalent
+def getVOPe64 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size"];
+  let KeyCol = ["4"];
+  let ValueCols = [["8"]];
+}
+
+// Maps an opcode in e64 form to its e32 equivalent
+def getVOPe32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size"];
+  let KeyCol = ["8"];
+  let ValueCols = [["4"]];
+}
+
+// Maps an original opcode to its commuted version
+def getCommuteRev : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+def getMaskedMIMGOp : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["4"];
+  let ValueCols = [["1"], ["2"], ["3"] ];
+}
+
+// Maps an commuted opcode to its original version
+def getCommuteOrig : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+def isDS : InstrMapping {
+  let FilterClass = "DS";
+  let RowFields = ["Inst"];
+  let ColFields = ["Size"];
+  let KeyCol = ["8"];
+  let ValueCols = [["8"]];
+}
+
+def getMCOpcode : InstrMapping {
+  let FilterClass = "SIMCInstr";
+  let RowFields = ["PseudoInstr"];
+  let ColFields = ["Subtarget"];
+  let KeyCol = [!cast<string>(SISubtarget.NONE)];
+  let ValueCols = [[!cast<string>(SISubtarget.SI)]];
+}
+
+include "SIInstructions.td"
diff --git a/contrib/llvm/lib/Target/R600/SIInstructions.td b/contrib/llvm/lib/Target/R600/SIInstructions.td
new file mode 100644
index 0000000..aecd847
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstructions.td
@@ -0,0 +1,2926 @@
+//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file was originally auto-generated from a GPU register header file and
+// all the instruction definitions were originally commented out.  Instructions
+// that are not yet supported remain commented out.
+//===----------------------------------------------------------------------===//
+
+class InterpSlots {
+int P0 = 2;
+int P10 = 0;
+int P20 = 1;
+}
+def INTERP : InterpSlots;
+
+def InterpSlot : Operand<i32> {
+  let PrintMethod = "printInterpSlot";
+}
+
+def SendMsgImm : Operand<i32> {
+  let PrintMethod = "printSendMsg";
+}
+
+def isSI : Predicate<"Subtarget.getGeneration() "
+                      ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+
+def isCI : Predicate<"Subtarget.getGeneration() "
+                      ">= AMDGPUSubtarget::SEA_ISLANDS">;
+
+def isCFDepth0 : Predicate<"isCFDepth0()">;
+
+def WAIT_FLAG : InstFlag<"printWaitFlag">;
+
+let SubtargetPredicate = isSI in {
+let OtherPredicates  = [isCFDepth0] in {
+
+//===----------------------------------------------------------------------===//
+// SMRD Instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
+
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+  0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+  0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+  0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+  0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+>;
+
+} // mayLoad = 1
+
+//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in {
+
+let isMoveImm = 1 in {
+def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
+def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
+def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
+def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
+} // End isMoveImm = 1
+
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
+  [(set i32:$dst, (not i32:$src0))]
+>;
+
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64",
+  [(set i64:$dst, (not i64:$src0))]
+>;
+def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
+def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32",
+  [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+>;
+def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
+} // End neverHasSideEffects = 1
+
+////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
+////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
+def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
+  [(set i32:$dst, (ctpop i32:$src0))]
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
+
+////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>;
+////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
+def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32",
+  [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+>;
+////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
+
+def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
+  [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+>;
+
+//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
+def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
+//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
+def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
+  [(set i32:$dst, (sext_inreg i32:$src0, i8))]
+>;
+def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16",
+  [(set i32:$dst, (sext_inreg i32:$src0, i16))]
+>;
+
+////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
+////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
+////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
+////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
+def S_GETPC_B64 : SOP1 <
+  0x0000001f, (outs SReg_64:$dst), (ins), "S_GETPC_B64 $dst", []
+> {
+  let SSRC0 = 0;
+}
+def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
+def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
+def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
+
+def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
+def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
+
+} // End hasSideEffects = 1
+
+def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
+def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
+def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
+def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
+def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
+def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
+//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
+def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
+def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
+def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
+def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+  [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+  [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+  [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+  [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End Uses = [SCC]
+} // End Defs = [SCC]
+
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
+  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
+  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
+  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
+  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+>;
+
+def S_CSELECT_B32 : SOP2 <
+  0x0000000a, (outs SReg_32:$dst),
+  (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
+  []
+>;
+
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
+  [(set i32:$dst, (and i32:$src0, i32:$src1))]
+>;
+
+def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
+  [(set i64:$dst, (and i64:$src0, i64:$src1))]
+>;
+
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
+  [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
+  [(set i64:$dst, (or i64:$src0, i64:$src1))]
+>;
+
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
+  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
+
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+  [(set i64:$dst, (xor i64:$src0, i64:$src1))]
+>;
+def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
+def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
+def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
+def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
+def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
+def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
+def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
+def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+} // End AddedComplexity = 1
+
+def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
+def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
+def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
+def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
+def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
+def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
+def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+
+//===----------------------------------------------------------------------===//
+// SOPC Instructions
+//===----------------------------------------------------------------------===//
+
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+//===----------------------------------------------------------------------===//
+// SOPK Instructions
+//===----------------------------------------------------------------------===//
+
+def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
+def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
+
+/*
+This instruction is disabled for now until we can figure out how to teach
+the instruction selector to correctly use the  S_CMP* vs V_CMP*
+instructions.
+
+When this instruction is enabled the code generator sometimes produces this
+invalid sequence:
+
+SCC = S_CMPK_EQ_I32 SGPR0, imm
+VCC = COPY SCC
+VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
+
+def S_CMPK_EQ_I32 : SOPK <
+  0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
+  "S_CMPK_EQ_I32",
+  [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
+>;
+*/
+
+let isCompare = 1, Defs = [SCC] in {
+def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
+def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
+def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
+def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
+def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
+def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
+def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
+def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
+def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
+def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
+def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
+} // End isCompare = 1, Defs = [SCC]
+
+let Defs = [SCC], isCommutable = 1 in {
+  def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
+  def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+}
+
+//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
+def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
+def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
+def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
+//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
+//def EXP : EXP_ <0x00000000, "EXP", []>;
+
+} // End let OtherPredicates = [isCFDepth0]
+
+//===----------------------------------------------------------------------===//
+// SOPP Instructions
+//===----------------------------------------------------------------------===//
+
+def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "S_NOP $simm16", []>;
+
+let isTerminator = 1 in {
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
+  [(IL_retflag)]> {
+  let simm16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+}
+
+let isBranch = 1 in {
+def S_BRANCH : SOPP <
+  0x00000002, (ins sopp_brtarget:$simm16), "S_BRANCH $simm16",
+  [(br bb:$simm16)]> {
+  let isBarrier = 1;
+}
+
+let DisableEncoding = "$scc" in {
+def S_CBRANCH_SCC0 : SOPP <
+  0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+  "S_CBRANCH_SCC0 $simm16", []
+>;
+def S_CBRANCH_SCC1 : SOPP <
+  0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+  "S_CBRANCH_SCC1 $simm16",
+  []
+>;
+} // End DisableEncoding = "$scc"
+
+def S_CBRANCH_VCCZ : SOPP <
+  0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+  "S_CBRANCH_VCCZ $simm16",
+  []
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+  0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+  "S_CBRANCH_VCCNZ $simm16",
+  []
+>;
+
+let DisableEncoding = "$exec" in {
+def S_CBRANCH_EXECZ : SOPP <
+  0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+  "S_CBRANCH_EXECZ $simm16",
+  []
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+  0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+  "S_CBRANCH_EXECNZ $simm16",
+  []
+>;
+} // End DisableEncoding = "$exec"
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+let hasSideEffects = 1 in {
+def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
+  [(int_AMDGPU_barrier_local)]
+> {
+  let simm16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
+  []
+>;
+//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
+//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
+//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+
+let Uses = [EXEC] in {
+  def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
+      [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
+  > {
+    let DisableEncoding = "$m0";
+  }
+} // End Uses = [EXEC]
+
+//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
+//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
+//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
+//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
+//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
+//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+} // End hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// VOPC Instructions
+//===----------------------------------------------------------------------===//
+
+let isCompare = 1 in {
+
+defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">;
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>;
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>;
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
+defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
+defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
+defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
+defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">;
+defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">;
+defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">;
+defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">;
+defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">;
+defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">;
+defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">;
+defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">;
+defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">;
+defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">;
+defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">;
+defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">;
+defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">;
+defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">;
+defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">;
+defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">;
+
+} // End hasSideEffects = 1
+
+defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>;
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>;
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
+defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
+defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
+defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
+defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">;
+defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">;
+defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">;
+defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">;
+defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">;
+defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">;
+defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">;
+defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">;
+defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">;
+defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">;
+defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">;
+defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">;
+defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">;
+defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">;
+defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">;
+defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">;
+
+} // End hasSideEffects = 1
+
+defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">;
+defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">;
+defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">;
+defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">;
+defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">;
+defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">;
+defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">;
+defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">;
+defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">;
+defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">;
+defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">;
+defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">;
+defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">;
+defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">;
+defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">;
+defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">;
+defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">;
+defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">;
+defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">;
+defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">;
+defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">;
+defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">;
+defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">;
+defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">;
+defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">;
+defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">;
+defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">;
+defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
+defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">;
+defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+
+} // End hasSideEffects = 1
+
+defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
+defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
+defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">;
+defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">;
+defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">;
+defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">;
+defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">;
+defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">;
+defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">;
+defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">;
+defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">;
+defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">;
+defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">;
+defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">;
+defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">;
+defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">;
+defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">;
+defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">;
+defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">;
+defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">;
+defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">;
+defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">;
+defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">;
+defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">;
+defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">;
+defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">;
+defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">;
+defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">;
+defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">;
+defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">;
+defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>;
+defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
+defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">;
+defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">;
+defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">;
+defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">;
+defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">;
+defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">;
+defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">;
+defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">;
+
+} // End hasSideEffects = 1
+
+defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
+defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">;
+defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">;
+defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">;
+defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">;
+defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">;
+defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">;
+defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">;
+defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">;
+
+} // End hasSideEffects = 1
+
+defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
+defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">;
+defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">;
+defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">;
+defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">;
+defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">;
+defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">;
+defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">;
+defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">;
+
+} // End hasSideEffects = 1
+
+defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
+defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
+
+let hasSideEffects = 1 in {
+
+defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">;
+defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">;
+defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">;
+defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">;
+defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">;
+defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">;
+defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">;
+defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">;
+
+} // End hasSideEffects = 1
+
+defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">;
+
+let hasSideEffects = 1 in {
+defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">;
+} // End hasSideEffects = 1
+
+defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">;
+
+let hasSideEffects = 1 in {
+defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">;
+} // End hasSideEffects = 1
+
+} // End isCompare = 1
+
+//===----------------------------------------------------------------------===//
+// DS Instructions
+//===----------------------------------------------------------------------===//
+
+
+def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>;
+def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>;
+def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>;
+def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>;
+def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>;
+def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>;
+def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>;
+def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>;
+def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>;
+def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>;
+def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>;
+def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>;
+def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>;
+def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>;
+def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>;
+def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>;
+def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>;
+
+def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>;
+def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>;
+def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>;
+def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>;
+def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>;
+def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>;
+def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>;
+def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>;
+def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>;
+def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>;
+def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>;
+def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>;
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>;
+
+let SubtargetPredicate = isCI in {
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>;
+} // End isCI
+
+
+def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>;
+def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>;
+def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>;
+def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>;
+def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>;
+def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>;
+def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>;
+def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>;
+def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>;
+def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>;
+def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>;
+def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>;
+def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>;
+def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>;
+def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>;
+def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>;
+def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>;
+def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>;
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>;
+//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>;
+//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>;
+def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>;
+def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>;
+
+//let SubtargetPredicate = isCI in {
+// DS_CONDXCHG32_RTN_B64
+// DS_CONDXCHG32_RTN_B128
+//} // End isCI
+
+// TODO: _SRC2_* forms
+
+def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
+def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
+def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
+def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "DS_WRITE_B64", VReg_64>;
+
+def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
+def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
+def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
+def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
+def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
+def DS_READ_B64 : DS_Load_Helper <0x00000076, "DS_READ_B64", VReg_64>;
+
+// 2 forms.
+def DS_WRITE2_B32 : DS_Load2_Helper <0x0000000E, "DS_WRITE2_B32", VReg_64>;
+def DS_WRITE2_B64 : DS_Load2_Helper <0x0000004E, "DS_WRITE2_B64", VReg_128>;
+
+def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "DS_READ2_B32", VReg_64>;
+def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>;
+
+// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64,
+// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64
+
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
+
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
+  0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
+  0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
+  0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
+  0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
+  0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
+  0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
+  0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
+>;
+
+defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
+  0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
+>;
+
+defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
+  0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
+>;
+
+defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
+  0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
+>;
+
+defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
+  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
+>;
+
+defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
+  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
+>;
+//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
+//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
+//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
+//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
+//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
+//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
+//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
+//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
+//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
+//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Instructions
+//===----------------------------------------------------------------------===//
+
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
+def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>;
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
+
+//===----------------------------------------------------------------------===//
+// MIMG Instructions
+//===----------------------------------------------------------------------===//
+
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
+//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
+//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">;
+//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
+//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
+//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
+//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
+//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
+//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
+//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
+//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
+//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
+//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
+//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
+//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
+//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
+defm IMAGE_SAMPLE           : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
+defm IMAGE_SAMPLE_CL        : MIMG_Sampler <0x00000021, "IMAGE_SAMPLE_CL">;
+defm IMAGE_SAMPLE_D         : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
+defm IMAGE_SAMPLE_D_CL      : MIMG_Sampler <0x00000023, "IMAGE_SAMPLE_D_CL">;
+defm IMAGE_SAMPLE_L         : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
+defm IMAGE_SAMPLE_B         : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
+defm IMAGE_SAMPLE_B_CL      : MIMG_Sampler <0x00000026, "IMAGE_SAMPLE_B_CL">;
+defm IMAGE_SAMPLE_LZ        : MIMG_Sampler <0x00000027, "IMAGE_SAMPLE_LZ">;
+defm IMAGE_SAMPLE_C         : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
+defm IMAGE_SAMPLE_C_CL      : MIMG_Sampler <0x00000029, "IMAGE_SAMPLE_C_CL">;
+defm IMAGE_SAMPLE_C_D       : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
+defm IMAGE_SAMPLE_C_D_CL    : MIMG_Sampler <0x0000002b, "IMAGE_SAMPLE_C_D_CL">;
+defm IMAGE_SAMPLE_C_L       : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
+defm IMAGE_SAMPLE_C_B       : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
+defm IMAGE_SAMPLE_C_B_CL    : MIMG_Sampler <0x0000002e, "IMAGE_SAMPLE_C_B_CL">;
+defm IMAGE_SAMPLE_C_LZ      : MIMG_Sampler <0x0000002f, "IMAGE_SAMPLE_C_LZ">;
+defm IMAGE_SAMPLE_O         : MIMG_Sampler <0x00000030, "IMAGE_SAMPLE_O">;
+defm IMAGE_SAMPLE_CL_O      : MIMG_Sampler <0x00000031, "IMAGE_SAMPLE_CL_O">;
+defm IMAGE_SAMPLE_D_O       : MIMG_Sampler <0x00000032, "IMAGE_SAMPLE_D_O">;
+defm IMAGE_SAMPLE_D_CL_O    : MIMG_Sampler <0x00000033, "IMAGE_SAMPLE_D_CL_O">;
+defm IMAGE_SAMPLE_L_O       : MIMG_Sampler <0x00000034, "IMAGE_SAMPLE_L_O">;
+defm IMAGE_SAMPLE_B_O       : MIMG_Sampler <0x00000035, "IMAGE_SAMPLE_B_O">;
+defm IMAGE_SAMPLE_B_CL_O    : MIMG_Sampler <0x00000036, "IMAGE_SAMPLE_B_CL_O">;
+defm IMAGE_SAMPLE_LZ_O      : MIMG_Sampler <0x00000037, "IMAGE_SAMPLE_LZ_O">;
+defm IMAGE_SAMPLE_C_O       : MIMG_Sampler <0x00000038, "IMAGE_SAMPLE_C_O">;
+defm IMAGE_SAMPLE_C_CL_O    : MIMG_Sampler <0x00000039, "IMAGE_SAMPLE_C_CL_O">;
+defm IMAGE_SAMPLE_C_D_O     : MIMG_Sampler <0x0000003a, "IMAGE_SAMPLE_C_D_O">;
+defm IMAGE_SAMPLE_C_D_CL_O  : MIMG_Sampler <0x0000003b, "IMAGE_SAMPLE_C_D_CL_O">;
+defm IMAGE_SAMPLE_C_L_O     : MIMG_Sampler <0x0000003c, "IMAGE_SAMPLE_C_L_O">;
+defm IMAGE_SAMPLE_C_B_O     : MIMG_Sampler <0x0000003d, "IMAGE_SAMPLE_C_B_O">;
+defm IMAGE_SAMPLE_C_B_CL_O  : MIMG_Sampler <0x0000003e, "IMAGE_SAMPLE_C_B_CL_O">;
+defm IMAGE_SAMPLE_C_LZ_O    : MIMG_Sampler <0x0000003f, "IMAGE_SAMPLE_C_LZ_O">;
+defm IMAGE_GATHER4          : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
+defm IMAGE_GATHER4_CL       : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
+defm IMAGE_GATHER4_L        : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
+defm IMAGE_GATHER4_B        : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
+defm IMAGE_GATHER4_B_CL     : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
+defm IMAGE_GATHER4_LZ       : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
+defm IMAGE_GATHER4_C        : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
+defm IMAGE_GATHER4_C_CL     : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
+defm IMAGE_GATHER4_C_L      : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
+defm IMAGE_GATHER4_C_B      : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
+defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
+defm IMAGE_GATHER4_C_LZ     : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
+defm IMAGE_GATHER4_O        : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
+defm IMAGE_GATHER4_CL_O     : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
+defm IMAGE_GATHER4_L_O      : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
+defm IMAGE_GATHER4_B_O      : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
+defm IMAGE_GATHER4_B_CL_O   : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
+defm IMAGE_GATHER4_LZ_O     : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
+defm IMAGE_GATHER4_C_O      : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
+defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
+defm IMAGE_GATHER4_C_L_O    : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
+defm IMAGE_GATHER4_C_B_O    : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
+defm IMAGE_GATHER4_C_LZ_O   : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
+defm IMAGE_GET_LOD          : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">;
+defm IMAGE_SAMPLE_CD        : MIMG_Sampler <0x00000068, "IMAGE_SAMPLE_CD">;
+defm IMAGE_SAMPLE_CD_CL     : MIMG_Sampler <0x00000069, "IMAGE_SAMPLE_CD_CL">;
+defm IMAGE_SAMPLE_C_CD      : MIMG_Sampler <0x0000006a, "IMAGE_SAMPLE_C_CD">;
+defm IMAGE_SAMPLE_C_CD_CL   : MIMG_Sampler <0x0000006b, "IMAGE_SAMPLE_C_CD_CL">;
+defm IMAGE_SAMPLE_CD_O      : MIMG_Sampler <0x0000006c, "IMAGE_SAMPLE_CD_O">;
+defm IMAGE_SAMPLE_CD_CL_O   : MIMG_Sampler <0x0000006d, "IMAGE_SAMPLE_CD_CL_O">;
+defm IMAGE_SAMPLE_C_CD_O    : MIMG_Sampler <0x0000006e, "IMAGE_SAMPLE_C_CD_O">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "IMAGE_SAMPLE_C_CD_CL_O">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
+
+let neverHasSideEffects = 1, isMoveImm = 1 in {
+defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
+} // End neverHasSideEffects = 1, isMoveImm = 1
+
+let Uses = [EXEC] in {
+
+def V_READFIRSTLANE_B32 : VOP1 <
+  0x00000002,
+  (outs SReg_32:$vdst),
+  (ins VReg_32:$src0),
+  "V_READFIRSTLANE_B32 $vdst, $src0",
+  []
+>;
+
+}
+
+defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
+  [(set i32:$dst, (fp_to_sint f64:$src0))]
+>;
+defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32",
+  [(set f64:$dst, (sint_to_fp i32:$src0))]
+>;
+defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
+  [(set f32:$dst, (sint_to_fp i32:$src0))]
+>;
+defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
+  [(set f32:$dst, (uint_to_fp i32:$src0))]
+>;
+defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32",
+  [(set i32:$dst, (fp_to_uint f32:$src0))]
+>;
+defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
+  [(set i32:$dst, (fp_to_sint f32:$src0))]
+>;
+defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+  [(set i32:$dst, (fp_to_f16 f32:$src0))]
+>;
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+  [(set f32:$dst, (f16_to_fp i32:$src0))]
+>;
+//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
+//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
+//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
+defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
+  [(set f32:$dst, (fround f64:$src0))]
+>;
+defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
+  [(set f64:$dst, (fextend f32:$src0))]
+>;
+defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+>;
+defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
+  [(set i32:$dst, (fp_to_uint f64:$src0))]
+>;
+defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32",
+  [(set f64:$dst, (uint_to_fp i32:$src0))]
+>;
+
+defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
+  [(set f32:$dst, (AMDGPUfract f32:$src0))]
+>;
+defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
+  [(set f32:$dst, (ftrunc f32:$src0))]
+>;
+defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
+  [(set f32:$dst, (fceil f32:$src0))]
+>;
+defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
+  [(set f32:$dst, (frint f32:$src0))]
+>;
+defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
+  [(set f32:$dst, (ffloor f32:$src0))]
+>;
+defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
+  [(set f32:$dst, (fexp2 f32:$src0))]
+>;
+defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
+defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
+  [(set f32:$dst, (flog2 f32:$src0))]
+>;
+
+defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
+defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
+defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
+  [(set f32:$dst, (AMDGPUrcp f32:$src0))]
+>;
+defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
+  [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+>;
+defm V_RSQ_LEGACY_F32 : VOP1_32 <
+  0x0000002d, "V_RSQ_LEGACY_F32",
+  [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
+>;
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
+  [(set f32:$dst, (AMDGPUrsq f32:$src0))]
+>;
+defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
+  [(set f64:$dst, (AMDGPUrcp f64:$src0))]
+>;
+defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
+  [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
+  [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
+>;
+defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
+  [(set f32:$dst, (fsqrt f32:$src0))]
+>;
+defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64",
+  [(set f64:$dst, (fsqrt f64:$src0))]
+>;
+defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32",
+  [(set f32:$dst, (AMDGPUsin f32:$src0))]
+>;
+defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32",
+  [(set f32:$dst, (AMDGPUcos f32:$src0))]
+>;
+defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
+defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
+defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
+defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
+defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
+//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
+defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
+defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
+//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
+defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
+//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
+defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
+defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
+defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
+
+
+//===----------------------------------------------------------------------===//
+// VINTRP Instructions
+//===----------------------------------------------------------------------===//
+
+def V_INTERP_P1_F32 : VINTRP <
+  0x00000000,
+  (outs VReg_32:$dst),
+  (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+  "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]",
+  []> {
+  let DisableEncoding = "$m0";
+}
+
+def V_INTERP_P2_F32 : VINTRP <
+  0x00000001,
+  (outs VReg_32:$dst),
+  (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+  "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
+  []> {
+
+  let Constraints = "$src0 = $dst";
+  let DisableEncoding = "$src0,$m0";
+
+}
+
+def V_INTERP_MOV_F32 : VINTRP <
+  0x00000002,
+  (outs VReg_32:$dst),
+  (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+  "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]",
+  []> {
+  let DisableEncoding = "$m0";
+}
+
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
+  (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
+  "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]",
+  []
+>{
+  let DisableEncoding = "$vcc";
+}
+
+def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
+  (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
+   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
+  "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
+  [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
+> {
+  let src0_modifiers = 0;
+  let src1_modifiers = 0;
+  let src2_modifiers = 0;
+}
+
+def V_READLANE_B32 : VOP2 <
+  0x00000001,
+  (outs SReg_32:$vdst),
+  (ins VReg_32:$src0, SSrc_32:$vsrc1),
+  "V_READLANE_B32 $vdst, $src0, $vsrc1",
+  []
+>;
+
+def V_WRITELANE_B32 : VOP2 <
+  0x00000002,
+  (outs VReg_32:$vdst),
+  (ins SReg_32:$src0, SSrc_32:$vsrc1),
+  "V_WRITELANE_B32 $vdst, $src0, $vsrc1",
+  []
+>;
+
+let isCommutable = 1 in {
+defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32",
+  [(set f32:$dst, (fadd f32:$src0, f32:$src1))]
+>;
+
+defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32",
+  [(set f32:$dst, (fsub f32:$src0, f32:$src1))]
+>;
+defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">;
+} // End isCommutable = 1
+
+defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
+
+let isCommutable = 1 in {
+
+defm V_MUL_LEGACY_F32 : VOP2_32 <
+  0x00000007, "V_MUL_LEGACY_F32",
+  [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))]
+>;
+
+defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
+  [(set f32:$dst, (fmul f32:$src0, f32:$src1))]
+>;
+
+
+defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
+  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
+>;
+//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
+defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
+  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
+>;
+//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
+
+
+defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
+  [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))]
+>;
+
+defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
+  [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))]
+>;
+
+defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
+defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
+  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
+  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
+  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
+  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>;
+
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
+  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+
+defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
+
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
+
+let hasPostISelHook = 1 in {
+
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+
+}
+defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
+
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
+  [(set i32:$dst, (and i32:$src0, i32:$src1))]>;
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
+  [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
+  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
+
+} // End isCommutable = 1
+
+defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
+  [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
+defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
+defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
+defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
+defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
+defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+
+let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector when needed later.
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
+  [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
+  [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>;
+defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32,
+                              "V_SUB_I32">;
+
+let Uses = [VCC] in { // Carry-in comes from VCC
+defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32",
+  [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>;
+defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32",
+  [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>;
+defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32,
+                               "V_SUBB_U32">;
+} // End Uses = [VCC]
+} // End isCommutable = 1, Defs = [VCC]
+
+defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
+////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
+////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
+////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
+ [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))]
+>;
+////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
+////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in {
+
+defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
+defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32",
+  [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+>;
+defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
+  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))]
+>;
+defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
+  [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))]
+>;
+
+} // End neverHasSideEffects
+
+defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
+defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
+defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
+defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
+
+let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
+  [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
+defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
+  [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
+}
+
+defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
+  [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>;
+defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
+  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
+  [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+>;
+//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
+defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
+
+defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
+defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
+////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
+////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
+////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
+////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
+////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
+////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
+////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
+////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
+////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
+//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
+//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
+defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
+////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
+defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
+  [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
+  [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
+>;
+
+def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64",
+  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+let isCommutable = 1 in {
+
+def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
+def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
+def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
+def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
+
+} // isCommutable = 1
+
+def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
+
+let isCommutable = 1 in {
+
+defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
+defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
+defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
+defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+
+} // isCommutable = 1
+
+def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+
+defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
+  [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
+  [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
+>;
+//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
+//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
+//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
+def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
+  [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+def V_MOV_I1 : InstSI <
+  (outs VReg_1:$dst),
+  (ins i1imm:$src),
+  "", [(set i1:$dst, (imm:$src))]
+>;
+
+def V_AND_I1 : InstSI <
+   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+   [(set i1:$dst, (and i1:$src0, i1:$src1))]
+>;
+
+def V_OR_I1 : InstSI <
+   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+   [(set i1:$dst, (or i1:$src0, i1:$src1))]
+>;
+
+def V_XOR_I1 : InstSI <
+  (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+  [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+>;
+
+// SI pseudo instructions. These are used by the CFG structurizer pass
+// and should be lowered to ISA instructions prior to codegen.
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
+    Uses = [EXEC], Defs = [EXEC] in {
+
+let isBranch = 1, isTerminator = 1 in {
+
+def SI_IF: InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$vcc, brtarget:$target),
+  "",
+  [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
+>;
+
+def SI_ELSE : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src, brtarget:$target),
+  "",
+  [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]
+> {
+  let Constraints = "$src = $dst";
+}
+
+def SI_LOOP : InstSI <
+  (outs),
+  (ins SReg_64:$saved, brtarget:$target),
+  "SI_LOOP $saved, $target",
+  [(int_SI_loop i64:$saved, bb:$target)]
+>;
+
+} // end isBranch = 1, isTerminator = 1
+
+def SI_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src),
+  "SI_ELSE $dst, $src",
+  [(set i64:$dst, (int_SI_break i64:$src))]
+>;
+
+def SI_IF_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$vcc, SReg_64:$src),
+  "SI_IF_BREAK $dst, $vcc, $src",
+  [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
+>;
+
+def SI_ELSE_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src0, SReg_64:$src1),
+  "SI_ELSE_BREAK $dst, $src0, $src1",
+  [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
+>;
+
+def SI_END_CF : InstSI <
+  (outs),
+  (ins SReg_64:$saved),
+  "SI_END_CF $saved",
+  [(int_SI_end_cf i64:$saved)]
+>;
+
+def SI_KILL : InstSI <
+  (outs),
+  (ins VSrc_32:$src),
+  "SI_KILL $src",
+  [(int_AMDGPU_kill f32:$src)]
+>;
+
+} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
+  // Uses = [EXEC], Defs = [EXEC]
+
+let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+
+//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri, ADDRIndirect>;
+
+let UseNamedOperandTable = 1 in {
+
+def SI_RegisterLoad : InstSI <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins FRAMEri32:$addr, i32imm:$chan),
+  "", []
+> {
+  let isRegisterLoad = 1;
+  let mayLoad = 1;
+}
+
+class SIRegStore<dag outs> : InstSI <
+  outs,
+  (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
+  "", []
+> {
+  let isRegisterStore = 1;
+  let mayStore = 1;
+}
+
+let usesCustomInserter = 1 in {
+def SI_RegisterStorePseudo : SIRegStore<(outs)>;
+} // End usesCustomInserter = 1
+def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
+
+
+} // End UseNamedOperandTable = 1
+
+def SI_INDIRECT_SRC : InstSI <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+  "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+  []
+>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
+  (outs rc:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+  "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+  []
+> {
+  let Constraints = "$src = $dst";
+}
+
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+
+let usesCustomInserter = 1 in {
+
+// This pseudo instruction takes a pointer as input and outputs a resource
+// constant that can be used with the ADDR64 MUBUF instructions.
+def SI_ADDR64_RSRC : InstSI <
+  (outs SReg_128:$srsrc),
+  (ins SSrc_64:$ptr),
+  "", []
+>;
+
+def SI_BUFFER_RSRC : InstSI <
+  (outs SReg_128:$srsrc),
+  (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi),
+  "", []
+>;
+
+def V_SUB_F64 : InstSI <
+  (outs VReg_64:$dst),
+  (ins VReg_64:$src0, VReg_64:$src1),
+  "V_SUB_F64 $dst, $src0, $src1",
+  [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
+>;
+
+} // end usesCustomInserter
+
+multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
+
+  def _SAVE : InstSI <
+    (outs VReg_32:$dst),
+    (ins sgpr_class:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+  def _RESTORE : InstSI <
+    (outs sgpr_class:$dst),
+    (ins VReg_32:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+}
+
+defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
+defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+
+let Defs = [SCC] in {
+
+def SI_CONSTDATA_PTR : InstSI <
+  (outs SReg_64:$dst),
+  (ins),
+  "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))]
+>;
+
+} // End Defs = [SCC]
+
+} // end IsCodeGenOnly, isPseudo
+
+} // end SubtargetPredicate = SI
+
+let Predicates = [isSI] in {
+
+def : Pat<
+  (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
+  (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
+>;
+
+def : Pat <
+  (int_AMDGPU_kilp),
+  (SI_KILL 0xbf800000)
+>;
+
+/* int_SI_vs_load_input */
+def : Pat<
+  (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
+>;
+
+/* int_SI_export */
+def : Pat <
+  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
+                 f32:$src0, f32:$src1, f32:$src2, f32:$src3),
+  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
+       $src0, $src1, $src2, $src3)
+>;
+
+//===----------------------------------------------------------------------===//
+// SMRD Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
+  // 1. Offset as 8bit DWORD immediate
+  def : Pat <
+    (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
+    (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
+  >;
+
+  // 2. Offset loaded in an 32bit SGPR
+  def : Pat <
+    (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
+    (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+  >;
+
+  // 3. No offset at all
+  def : Pat <
+    (constant_load i64:$sbase),
+    (vt (Instr_IMM $sbase, 0))
+  >;
+}
+
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+
+// 1. Offset as 8bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+>;
+
+// 2. Offset loaded in an 32bit SGPR
+def : Pat <
+  (SIload_constant v4i32:$sbase, imm:$offset),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+>;
+
+} // Predicates = [isSI] in {
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isSI, isCFDepth0] in {
+
+def : Pat <
+  (i64 (ctpop i64:$src)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_BCNT1_I32_B64 $src), sub0),
+    (S_MOV_B32 0), sub1)
+>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+  (i32 (addc i32:$src0, i32:$src1)),
+  (S_ADD_I32 $src0, $src1)
+>;
+
+} // Predicates = [isSI, isCFDepth0]
+
+let  Predicates = [isSI] in {
+
+//===----------------------------------------------------------------------===//
+// SOPP Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (int_AMDGPU_barrier_global),
+  (S_BARRIER)
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [UnsafeFPMath] in {
+def : RcpPat<V_RCP_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+class BinOp64Pat <SDNode node, Instruction inst> : Pat <
+  (node i64:$src0, i64:$src1),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (inst  (EXTRACT_SUBREG i64:$src0, sub0),
+                  (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
+    (inst (EXTRACT_SUBREG i64:$src0, sub1),
+                  (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
+>;
+
+def : BinOp64Pat <or, V_OR_B32_e32>;
+def : BinOp64Pat <xor, V_XOR_B32_e32>;
+
+class SextInReg <ValueType vt, int ShiftAmt> : Pat <
+  (sext_inreg i32:$src0, vt),
+  (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
+>;
+
+def : SextInReg <i8, 24>;
+def : SextInReg <i16, 16>;
+
+def : Pat <
+  (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+  (V_BCNT_U32_B32_e32 $popcnt, $val)
+>;
+
+def : Pat <
+   (i32 (ctpop i32:$popcnt)),
+   (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0)
+>;
+
+def : Pat <
+  (i64 (ctpop i64:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+      (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1),
+        (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)),
+      sub0),
+    (V_MOV_B32_e32 0), sub1)
+>;
+
+def : Pat <
+  (addc i32:$src0, i32:$src1),
+  (V_ADD_I32_e32 $src0, $src1)
+>;
+
+/********** ======================= **********/
+/********** Image sampling patterns **********/
+/********** ======================= **********/
+
+// Image + sampler
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
+        i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+  (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
+          (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+          $addr, $rsrc, $sampler)
+>;
+
+multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
+}
+
+// Image only
+class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm,
+        i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+  (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
+          (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+          $addr, $rsrc)
+>;
+
+multiclass ImagePatterns<SDPatternOperator name, string opcode> {
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+// Basic sample
+defm : SampleRawPatterns<int_SI_image_sample,           "IMAGE_SAMPLE">;
+defm : SampleRawPatterns<int_SI_image_sample_cl,        "IMAGE_SAMPLE_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_d,         "IMAGE_SAMPLE_D">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl,      "IMAGE_SAMPLE_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_l,         "IMAGE_SAMPLE_L">;
+defm : SampleRawPatterns<int_SI_image_sample_b,         "IMAGE_SAMPLE_B">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl,      "IMAGE_SAMPLE_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_lz,        "IMAGE_SAMPLE_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_cd,        "IMAGE_SAMPLE_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl,     "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : SampleRawPatterns<int_SI_image_sample_c,         "IMAGE_SAMPLE_C">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl,      "IMAGE_SAMPLE_C_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d,       "IMAGE_SAMPLE_C_D">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl,    "IMAGE_SAMPLE_C_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l,       "IMAGE_SAMPLE_C_L">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b,       "IMAGE_SAMPLE_C_B">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl,    "IMAGE_SAMPLE_C_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz,      "IMAGE_SAMPLE_C_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd,      "IMAGE_SAMPLE_C_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl,   "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : SampleRawPatterns<int_SI_image_sample_o,         "IMAGE_SAMPLE_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cl_o,      "IMAGE_SAMPLE_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_o,       "IMAGE_SAMPLE_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl_o,    "IMAGE_SAMPLE_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_l_o,       "IMAGE_SAMPLE_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_o,       "IMAGE_SAMPLE_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl_o,    "IMAGE_SAMPLE_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_lz_o,      "IMAGE_SAMPLE_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_o,      "IMAGE_SAMPLE_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o,   "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : SampleRawPatterns<int_SI_image_sample_c_o,       "IMAGE_SAMPLE_C_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl_o,    "IMAGE_SAMPLE_C_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_o,     "IMAGE_SAMPLE_C_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o,  "IMAGE_SAMPLE_C_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l_o,     "IMAGE_SAMPLE_C_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_o,     "IMAGE_SAMPLE_C_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o,  "IMAGE_SAMPLE_C_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz_o,    "IMAGE_SAMPLE_C_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_o,    "IMAGE_SAMPLE_C_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+// Only the variants which make sense are defined.
+def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V2,        v2i32>;
+def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V4,        v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl,        IMAGE_GATHER4_CL_V4_V4,     v4i32>;
+def : SampleRawPattern<int_SI_gather4_l,         IMAGE_GATHER4_L_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_b,         IMAGE_GATHER4_B_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V4,   v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V8,   v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz,        IMAGE_GATHER4_LZ_V4_V2,     v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz,        IMAGE_GATHER4_LZ_V4_V4,     v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c,         IMAGE_GATHER4_C_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V4,   v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V8,   v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l,       IMAGE_GATHER4_C_L_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l,       IMAGE_GATHER4_C_L_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b,       IMAGE_GATHER4_C_B_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b,       IMAGE_GATHER4_C_B_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl,    IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz,      IMAGE_GATHER4_C_LZ_V4_V4,   v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o,         IMAGE_GATHER4_O_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V4,   v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V8,   v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o,       IMAGE_GATHER4_L_O_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o,       IMAGE_GATHER4_L_O_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o,       IMAGE_GATHER4_B_O_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o,       IMAGE_GATHER4_B_O_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o,    IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o,      IMAGE_GATHER4_LZ_O_V4_V4,   v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o,       IMAGE_GATHER4_C_O_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o,       IMAGE_GATHER4_C_O_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o,    IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o,     IMAGE_GATHER4_C_L_O_V4_V8,  v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o,     IMAGE_GATHER4_C_B_O_V4_V8,  v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o,  IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
+
+def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
+defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
+defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+
+/* SIsample for simple 1D texture lookup */
+def : Pat <
+  (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
+  (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+>;
+
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+>;
+
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT),
+    (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+>;
+
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+>;
+
+class SampleShadowPattern<SDNode name, MIMG opcode,
+                          ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+>;
+
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
+                               ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+>;
+
+/* SIsample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+                          MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+  def : SamplePattern <SIsample, sample, addr_type>;
+  def : SampleRectPattern <SIsample, sample, addr_type>;
+  def : SampleArrayPattern <SIsample, sample, addr_type>;
+  def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+  def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+  def : SamplePattern <SIsamplel, sample_l, addr_type>;
+  def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+  def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+  def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+  def : SamplePattern <SIsampleb, sample_b, addr_type>;
+  def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+  def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+  def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+  def : SamplePattern <SIsampled, sample_d, addr_type>;
+  def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+  def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+  def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
+}
+
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+                      IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+                      IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+                      IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+                      v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+                      IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+                      IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+                      IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+                      v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+                      IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+                      IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+                      IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+                      v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+                      IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+                      IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+                      IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+                      v16i32>;
+
+/* int_SI_imageload for texture fetches consuming varying address parameters */
+class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, imm),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
+}
+
+multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>;
+}
+
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>;
+
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>;
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>;
+
+/* Image resource information */
+def : Pat <
+  (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+  (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+  (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA),
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+/********** ============================================ **********/
+/********** Extraction, Insertion, Building and Casting  **********/
+/********** ============================================ **********/
+
+foreach Index = 0-2 in {
+  def Extract_Element_v2i32_#Index : Extract_Element <
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2i32_#Index : Insert_Element <
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v2f32_#Index : Extract_Element <
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2f32_#Index : Insert_Element <
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-3 in {
+  def Extract_Element_v4i32_#Index : Extract_Element <
+    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4i32_#Index : Insert_Element <
+    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v4f32_#Index : Extract_Element <
+    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4f32_#Index : Insert_Element <
+    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-7 in {
+  def Extract_Element_v8i32_#Index : Extract_Element <
+    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8i32_#Index : Insert_Element <
+    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v8f32_#Index : Extract_Element <
+    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8f32_#Index : Insert_Element <
+    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-15 in {
+  def Extract_Element_v16i32_#Index : Extract_Element <
+    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16i32_#Index : Insert_Element <
+    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v16f32_#Index : Extract_Element <
+    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16f32_#Index : Insert_Element <
+    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+def : BitConvert <i32, f32, SReg_32>;
+def : BitConvert <i32, f32, VReg_32>;
+
+def : BitConvert <f32, i32, SReg_32>;
+def : BitConvert <f32, i32, VReg_32>;
+
+def : BitConvert <i64, f64, VReg_64>;
+
+def : BitConvert <f64, i64, VReg_64>;
+
+def : BitConvert <v2f32, v2i32, VReg_64>;
+def : BitConvert <v2i32, v2f32, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
+def : BitConvert <i64, v2i32, VReg_64>;
+def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2i32, VReg_64>;
+def : BitConvert <v4f32, v4i32, VReg_128>;
+def : BitConvert <v4i32, v4f32, VReg_128>;
+
+def : BitConvert <v8f32, v8i32, SReg_256>;
+def : BitConvert <v8i32, v8f32, SReg_256>;
+def : BitConvert <v8i32, v32i8, SReg_256>;
+def : BitConvert <v32i8, v8i32, SReg_256>;
+def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v8i32, v8f32, VReg_256>;
+def : BitConvert <v8f32, v8i32, VReg_256>;
+def : BitConvert <v32i8, v8i32, VReg_256>;
+
+def : BitConvert <v16i32, v16f32, VReg_512>;
+def : BitConvert <v16f32, v16i32, VReg_512>;
+
+/********** =================== **********/
+/********** Src & Dst modifiers **********/
+/********** =================== **********/
+
+def FCLAMP_SI : AMDGPUShaderInst <
+  (outs VReg_32:$dst),
+  (ins VSrc_32:$src0),
+  "FCLAMP_SI $dst, $src0",
+  []
+> {
+  let usesCustomInserter = 1;
+}
+
+def : Pat <
+  (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+  (FCLAMP_SI f32:$src)
+>;
+
+/********** ================================ **********/
+/********** Floating point absolute/negative **********/
+/********** ================================ **********/
+
+// Manipulate the sign bit directly, as e.g. using the source negation modifier
+// in V_ADD_F32_e64 $src, 0, [...] does not result in -0.0 for $src == +0.0,
+// breaking the piglit *s-floatBitsToInt-neg* tests
+
+// TODO: Look into not implementing isFNegFree/isFAbsFree for SI, and possibly
+// removing these patterns
+
+def : Pat <
+  (fneg (fabs f32:$src)),
+  (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
+>;
+
+def FABS_SI : AMDGPUShaderInst <
+  (outs VReg_32:$dst),
+  (ins VSrc_32:$src0),
+  "FABS_SI $dst, $src0",
+  []
+> {
+  let usesCustomInserter = 1;
+}
+
+def : Pat <
+  (fabs f32:$src),
+  (FABS_SI f32:$src)
+>;
+
+def FNEG_SI : AMDGPUShaderInst <
+  (outs VReg_32:$dst),
+  (ins VSrc_32:$src0),
+  "FNEG_SI $dst, $src0",
+  []
+> {
+  let usesCustomInserter = 1;
+}
+
+def : Pat <
+  (fneg f32:$src),
+  (FNEG_SI f32:$src)
+>;
+
+/********** ================== **********/
+/********** Immediate Patterns **********/
+/********** ================== **********/
+
+def : Pat <
+  (SGPRImm<(i32 imm)>:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+def : Pat <
+  (SGPRImm<(f32 fpimm)>:$imm),
+  (S_MOV_B32 fpimm:$imm)
+>;
+
+def : Pat <
+  (i32 imm:$imm),
+  (V_MOV_B32_e32 imm:$imm)
+>;
+
+def : Pat <
+  (f32 fpimm:$imm),
+  (V_MOV_B32_e32 fpimm:$imm)
+>;
+
+def : Pat <
+  (i64 InlineImm<i64>:$imm),
+  (S_MOV_B64 InlineImm<i64>:$imm)
+>;
+
+/********** ===================== **********/
+/********** Interpolation Paterns **********/
+/********** ===================== **********/
+
+def : Pat <
+  (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
+  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
+>;
+
+def : Pat <
+  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
+  (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
+                                    imm:$attr_chan, imm:$attr, i32:$params),
+                   (EXTRACT_SUBREG $ij, sub1),
+                   imm:$attr_chan, imm:$attr, $params)
+>;
+
+/********** ================== **********/
+/********** Intrinsic Patterns **********/
+/********** ================== **********/
+
+/* llvm.AMDGPU.pow */
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
+
+def : Pat <
+  (int_AMDGPU_div f32:$src0, f32:$src1),
+  (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
+>;
+
+def : Pat<
+  (fdiv f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+>;
+
+def : Pat <
+  (int_AMDGPU_cube v4f32:$src),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+    (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0),
+                  (EXTRACT_SUBREG $src, sub1),
+                  (EXTRACT_SUBREG $src, sub2)),
+                   sub0),
+    (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0),
+                  (EXTRACT_SUBREG $src, sub1),
+                  (EXTRACT_SUBREG $src, sub2)),
+                   sub1),
+    (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0),
+                  (EXTRACT_SUBREG $src, sub1),
+                  (EXTRACT_SUBREG $src, sub2)),
+                   sub2),
+    (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0),
+                  (EXTRACT_SUBREG $src, sub1),
+                  (EXTRACT_SUBREG $src, sub2)),
+                   sub3)
+>;
+
+def : Pat <
+  (i32 (sext i1:$src0)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
+>;
+
+class Ext32Pat <SDNode ext> : Pat <
+  (i32 (ext i1:$src0)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+>;
+
+def : Ext32Pat <zext>;
+def : Ext32Pat <anyext>;
+
+// Offset in an 32Bit VGPR
+def : Pat <
+  (SIload_constant v4i32:$sbase, i32:$voff),
+  (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
+>;
+
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+  (AMDGPUurecip i32:$src0),
+  (V_CVT_U32_F32_e32
+    (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
+                   (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
+>;
+
+def : Pat <
+  (int_SI_tid),
+  (V_MBCNT_HI_U32_B32_e32 0xffffffff,
+                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0))
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP3 Patterns
+//===----------------------------------------------------------------------===//
+
+def : IMad24Pat<V_MAD_I32_I24>;
+def : UMad24Pat<V_MAD_U32_U24>;
+
+def : Pat <
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+  (mul i32:$src0, i32:$src1),
+  (V_MUL_LO_I32 $src0, $src1, (i32 0))
+>;
+
+def : Pat <
+  (mulhu i32:$src0, i32:$src1),
+  (V_MUL_HI_U32 $src0, $src1, (i32 0))
+>;
+
+def : Pat <
+  (mulhs i32:$src0, i32:$src1),
+  (V_MUL_HI_I32 $src0, $src1, (i32 0))
+>;
+
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
+/********** ======================= **********/
+/**********   Load/Store Patterns   **********/
+/********** ======================= **********/
+
+multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> {
+  def : Pat <
+    (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))),
+    (inst (i1 0), $ptr, (as_i16imm $offset))
+  >;
+
+  def : Pat <
+    (frag i32:$src0),
+    (vt (inst 0, $src0, 0))
+  >;
+}
+
+defm : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
+defm : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
+defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+defm : DSReadPat <DS_READ_B32, i32, local_load>;
+defm : DSReadPat <DS_READ_B64, v2i32, local_load>;
+
+multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
+  def : Pat <
+    (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))),
+    (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+  >;
+
+  def : Pat <
+    (frag vt:$val, i32:$ptr),
+    (inst 0, $ptr, $val, 0)
+  >;
+}
+
+defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
+defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
+
+multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
+  def : Pat <
+    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value),
+    (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+  >;
+
+  def : Pat <
+    (frag i32:$ptr, vt:$val),
+    (inst 0, $ptr, $val, 0)
+  >;
+}
+
+// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
+//
+// We need to use something for the data0, so we set a register to
+// -1. For the non-rtn variants, the manual says it does
+// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
+// will always do the increment so I'm assuming it's the same.
+//
+// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
+// needs to be a VGPR. The SGPR copy pass will fix this, and it's
+// easier since there is no v_mov_b64.
+multiclass DSAtomicIncRetPat<DS inst, ValueType vt,
+                             Instruction LoadImm, PatFrag frag> {
+  def : Pat <
+    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)),
+    (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
+  >;
+
+  def : Pat <
+    (frag i32:$ptr, (vt 1)),
+    (inst 0, $ptr, (LoadImm (vt -1)), 0)
+  >;
+}
+
+multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> {
+  def : Pat <
+    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap),
+    (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
+  >;
+
+  def : Pat <
+    (frag i32:$ptr, vt:$cmp, vt:$swap),
+    (inst 0, $ptr, $cmp, $swap, 0)
+  >;
+}
+
+
+// 32-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
+                         S_MOV_B32, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
+                         S_MOV_B32, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+
+// 64-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
+                         S_MOV_B64, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
+                         S_MOV_B64, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+                              PatFrag constant_ld> {
+  def : Pat <
+     (vt (constant_ld (add i64:$ptr, i64:$offset))),
+     (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+  >;
+
+}
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
+
+class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
+  (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
+                        i32:$soffset, u16imm:$offset))),
+  (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+
+// BUFFER_LOAD_DWORD*, addr64=0
+multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
+                             MUBUF bothen> {
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
+                                  imm:$offset, 0, 0, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (offset $rsrc, (as_i16imm $offset), $soffset, (as_i1imm $glc),
+            (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+                                  imm:$offset, 1, 0, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+           (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+                                  imm:$offset, 0, 1, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
+           (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
+                                  imm, 1, 1, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+            (as_i1imm $tfe))
+  >;
+}
+
+defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
+                         BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
+defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
+                         BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
+defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
+                         BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+
+class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
+  (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+                              u16imm:$offset, i1imm:$offen, i1imm:$idxen,
+                              i1imm:$glc, i1imm:$slc, i1imm:$tfe)),
+  (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen,
+         $glc, $slc, $tfe)
+>;
+
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>;
+
+/*
+class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
+  (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
+  (Instr $value, $srsrc, $vaddr, $offset)
+>;
+
+def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
+
+*/
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
+  (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+                   i32:$soffset, imm:$inst_offset, imm:$dfmt,
+                   imm:$nfmt, imm:$offen, imm:$idxen,
+                   imm:$glc, imm:$slc, imm:$tfe),
+  (opcode
+    $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+
+let SubtargetPredicate = isCI in {
+
+// Sea island new arithmetic instructinos
+let neverHasSideEffects = 1 in {
+defm V_TRUNC_F64 : VOP1_64 <0x00000017, "V_TRUNC_F64",
+  [(set f64:$dst, (ftrunc f64:$src0))]
+>;
+defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64",
+  [(set f64:$dst, (fceil f64:$src0))]
+>;
+defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64",
+  [(set f64:$dst, (ffloor f64:$src0))]
+>;
+defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64",
+  [(set f64:$dst, (frint f64:$src0))]
+>;
+
+defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
+defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
+defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
+def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>;
+
+// XXX - Does this set VCC?
+def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
+} // End neverHasSideEffects = 1
+
+// Remaining instructions:
+// FLAT_*
+// S_CBRANCH_CDBGUSER
+// S_CBRANCH_CDBGSYS
+// S_CBRANCH_CDBGSYS_OR_USER
+// S_CBRANCH_CDBGSYS_AND_USER
+// S_DCACHE_INV_VOL
+// V_EXP_LEGACY_F32
+// V_LOG_LEGACY_F32
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CNDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
+
+} // End iSCI
+
+
+/********** ====================== **********/
+/**********   Indirect adressing   **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> {
+
+  // 1. Extract with offset
+  def : Pat<
+    (vector_extract vt:$vec, (add i32:$idx, imm:$off)),
+    (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
+  >;
+
+  // 2. Extract without offset
+  def : Pat<
+    (vector_extract vt:$vec, i32:$idx),
+    (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
+  >;
+
+  // 3. Insert with offset
+  def : Pat<
+    (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
+    (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val)
+  >;
+
+  // 4. Insert without offset
+  def : Pat<
+    (vector_insert vt:$vec, eltvt:$val, i32:$idx),
+    (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val)
+  >;
+}
+
+defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>;
+
+defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
+
+//===----------------------------------------------------------------------===//
+// Conversion Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+  (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
+
+// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+// might not be worth the effort, and will need to expand to shifts when
+// fixing SGPR copies.
+
+// Handle sext_inreg in i64
+def : Pat <
+  (i64 (sext_inreg i64:$src, i1)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16
+    (S_MOV_B32 -1), sub1)
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i8)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
+    (S_MOV_B32 -1), sub1)
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i16)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
+    (S_MOV_B32 -1), sub1)
+>;
+
+class ZExt_i64_i32_Pat <SDNode ext> : Pat <
+  (i64 (ext i32:$src)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+    (S_MOV_B32 0), sub1)
+>;
+
+class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+  (i64 (ext i1:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+      (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0),
+    (S_MOV_B32 0), sub1)
+>;
+
+
+def : ZExt_i64_i32_Pat<zext>;
+def : ZExt_i64_i32_Pat<anyext>;
+def : ZExt_i64_i1_Pat<zext>;
+def : ZExt_i64_i1_Pat<anyext>;
+
+def : Pat <
+  (i64 (sext i32:$src)),
+    (INSERT_SUBREG
+      (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+      (S_ASHR_I32 $src, 31), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i1:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG
+      (i64 (IMPLICIT_DEF)),
+      (V_CNDMASK_B32_e64 0, -1, $src), sub0),
+    (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+>;
+
+def : Pat <
+  (f32 (sint_to_fp i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
+>;
+
+def : Pat <
+  (f32 (uint_to_fp i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
+>;
+
+def : Pat <
+  (f64 (sint_to_fp i1:$src)),
+    (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+>;
+
+def : Pat <
+  (f64 (uint_to_fp i1:$src)),
+  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (i32 (trunc i64:$a)),
+  (EXTRACT_SUBREG $a, sub0)
+>;
+
+def : Pat <
+  (i1 (trunc i32:$a)),
+  (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1)
+>;
+
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
+
+} // End isSI predicate
diff --git a/contrib/llvm/lib/Target/R600/SIIntrinsics.td b/contrib/llvm/lib/Target/R600/SIIntrinsics.td
new file mode 100644
index 0000000..027a0a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIIntrinsics.td
@@ -0,0 +1,199 @@
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+
+let TargetPrefix = "SI", isTarget = 1 in {
+
+  def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
+  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+
+  // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
+  def int_SI_tbuffer_store : Intrinsic <
+    [],
+    [llvm_anyint_ty, // rsrc(SGPR)
+     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,    // vaddr(VGPR)
+     llvm_i32_ty,    // soffset(SGPR)
+     llvm_i32_ty,    // inst_offset(imm)
+     llvm_i32_ty,    // dfmt(imm)
+     llvm_i32_ty,    // nfmt(imm)
+     llvm_i32_ty,    // offen(imm)
+     llvm_i32_ty,    // idxen(imm)
+     llvm_i32_ty,    // glc(imm)
+     llvm_i32_ty,    // slc(imm)
+     llvm_i32_ty],   // tfe(imm)
+    []>;
+
+  // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
+  def int_SI_buffer_load_dword : Intrinsic <
+    [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+    [llvm_anyint_ty,  // rsrc(SGPR)
+     llvm_anyint_ty,  // vaddr(VGPR)
+     llvm_i32_ty,     // soffset(SGPR)
+     llvm_i32_ty,     // inst_offset(imm)
+     llvm_i32_ty,     // offen(imm)
+     llvm_i32_ty,     // idxen(imm)
+     llvm_i32_ty,     // glc(imm)
+     llvm_i32_ty,     // slc(imm)
+     llvm_i32_ty],    // tfe(imm)
+    [IntrReadArgMem]>;
+
+  def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  // Fully-flexible SAMPLE instruction.
+  class SampleRaw : Intrinsic <
+    [llvm_v4f32_ty],    // vdata(VGPR)
+    [llvm_anyint_ty,    // vaddr(VGPR)
+     llvm_v8i32_ty,     // rsrc(SGPR)
+     llvm_v4i32_ty,     // sampler(SGPR)
+     llvm_i32_ty,       // dmask(imm)
+     llvm_i32_ty,       // unorm(imm)
+     llvm_i32_ty,       // r128(imm)
+     llvm_i32_ty,       // da(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty,       // tfe(imm)
+     llvm_i32_ty],      // lwe(imm)
+    [IntrNoMem]>;
+
+  // Image instruction without a sampler.
+  class Image : Intrinsic <
+    [llvm_v4f32_ty],    // vdata(VGPR)
+    [llvm_anyint_ty,    // vaddr(VGPR)
+     llvm_v8i32_ty,     // rsrc(SGPR)
+     llvm_i32_ty,       // dmask(imm)
+     llvm_i32_ty,       // unorm(imm)
+     llvm_i32_ty,       // r128(imm)
+     llvm_i32_ty,       // da(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty,       // tfe(imm)
+     llvm_i32_ty],      // lwe(imm)
+    [IntrNoMem]>;
+
+  // Basic sample
+  def int_SI_image_sample : SampleRaw;
+  def int_SI_image_sample_cl : SampleRaw;
+  def int_SI_image_sample_d : SampleRaw;
+  def int_SI_image_sample_d_cl : SampleRaw;
+  def int_SI_image_sample_l : SampleRaw;
+  def int_SI_image_sample_b : SampleRaw;
+  def int_SI_image_sample_b_cl : SampleRaw;
+  def int_SI_image_sample_lz : SampleRaw;
+  def int_SI_image_sample_cd : SampleRaw;
+  def int_SI_image_sample_cd_cl : SampleRaw;
+
+  // Sample with comparison
+  def int_SI_image_sample_c : SampleRaw;
+  def int_SI_image_sample_c_cl : SampleRaw;
+  def int_SI_image_sample_c_d : SampleRaw;
+  def int_SI_image_sample_c_d_cl : SampleRaw;
+  def int_SI_image_sample_c_l : SampleRaw;
+  def int_SI_image_sample_c_b : SampleRaw;
+  def int_SI_image_sample_c_b_cl : SampleRaw;
+  def int_SI_image_sample_c_lz : SampleRaw;
+  def int_SI_image_sample_c_cd : SampleRaw;
+  def int_SI_image_sample_c_cd_cl : SampleRaw;
+
+  // Sample with offsets
+  def int_SI_image_sample_o : SampleRaw;
+  def int_SI_image_sample_cl_o : SampleRaw;
+  def int_SI_image_sample_d_o : SampleRaw;
+  def int_SI_image_sample_d_cl_o : SampleRaw;
+  def int_SI_image_sample_l_o : SampleRaw;
+  def int_SI_image_sample_b_o : SampleRaw;
+  def int_SI_image_sample_b_cl_o : SampleRaw;
+  def int_SI_image_sample_lz_o : SampleRaw;
+  def int_SI_image_sample_cd_o : SampleRaw;
+  def int_SI_image_sample_cd_cl_o : SampleRaw;
+
+  // Sample with comparison and offsets
+  def int_SI_image_sample_c_o : SampleRaw;
+  def int_SI_image_sample_c_cl_o : SampleRaw;
+  def int_SI_image_sample_c_d_o : SampleRaw;
+  def int_SI_image_sample_c_d_cl_o : SampleRaw;
+  def int_SI_image_sample_c_l_o : SampleRaw;
+  def int_SI_image_sample_c_b_o : SampleRaw;
+  def int_SI_image_sample_c_b_cl_o : SampleRaw;
+  def int_SI_image_sample_c_lz_o : SampleRaw;
+  def int_SI_image_sample_c_cd_o : SampleRaw;
+  def int_SI_image_sample_c_cd_cl_o : SampleRaw;
+
+  // Basic gather4
+  def int_SI_gather4 : SampleRaw;
+  def int_SI_gather4_cl : SampleRaw;
+  def int_SI_gather4_l : SampleRaw;
+  def int_SI_gather4_b : SampleRaw;
+  def int_SI_gather4_b_cl : SampleRaw;
+  def int_SI_gather4_lz : SampleRaw;
+
+  // Gather4 with comparison
+  def int_SI_gather4_c : SampleRaw;
+  def int_SI_gather4_c_cl : SampleRaw;
+  def int_SI_gather4_c_l : SampleRaw;
+  def int_SI_gather4_c_b : SampleRaw;
+  def int_SI_gather4_c_b_cl : SampleRaw;
+  def int_SI_gather4_c_lz : SampleRaw;
+
+  // Gather4 with offsets
+  def int_SI_gather4_o : SampleRaw;
+  def int_SI_gather4_cl_o : SampleRaw;
+  def int_SI_gather4_l_o : SampleRaw;
+  def int_SI_gather4_b_o : SampleRaw;
+  def int_SI_gather4_b_cl_o : SampleRaw;
+  def int_SI_gather4_lz_o : SampleRaw;
+
+  // Gather4 with comparison and offsets
+  def int_SI_gather4_c_o : SampleRaw;
+  def int_SI_gather4_c_cl_o : SampleRaw;
+  def int_SI_gather4_c_l_o : SampleRaw;
+  def int_SI_gather4_c_b_o : SampleRaw;
+  def int_SI_gather4_c_b_cl_o : SampleRaw;
+  def int_SI_gather4_c_lz_o : SampleRaw;
+
+  def int_SI_getlod : SampleRaw;
+
+  // Image instrinsics.
+  def int_SI_image_load : Image;
+  def int_SI_image_load_mip : Image;
+  def int_SI_getresinfo : Image;
+
+  // Deprecated image and sample intrinsics.
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
+  def int_SI_samplel : Sample;
+  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp
new file mode 100644
index 0000000..75b5a5e
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp
@@ -0,0 +1,552 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
+///
+/// All control flow is handled using predicated instructions and
+/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
+/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writting to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU).  Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
+/// Vector ALU) and then the ScalarALU will AND the VCC register with the
+/// EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// %SGPR0 = SI_IF %VCC
+///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// %SGPR0 = SI_ELSE %SGPR0
+///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// SI_END_CF %SGPR0
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an optional
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
+/// S_BRANCH_EXECZ label1              // Use our branch optimization
+///                                    // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR   // Do the THEN block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerControlFlowPass : public MachineFunctionPass {
+
+private:
+  static const unsigned SkipThreshold = 12;
+
+  static char ID;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
+
+  void Skip(MachineInstr &From, MachineOperand &To);
+  void SkipIfDead(MachineInstr &MI);
+
+  void If(MachineInstr &MI);
+  void Else(MachineInstr &MI);
+  void Break(MachineInstr &MI);
+  void IfBreak(MachineInstr &MI);
+  void ElseBreak(MachineInstr &MI);
+  void Loop(MachineInstr &MI);
+  void EndCf(MachineInstr &MI);
+
+  void Kill(MachineInstr &MI);
+  void Branch(MachineInstr &MI);
+
+  void InitM0ForLDS(MachineBasicBlock::iterator MI);
+  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
+  void IndirectSrc(MachineInstr &MI);
+  void IndirectDst(MachineInstr &MI);
+
+public:
+  SILowerControlFlowPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Lower control flow instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlowPass::ID = 0;
+
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
+  return new SILowerControlFlowPass(tm);
+}
+
+bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
+                                        MachineBasicBlock *To) {
+
+  unsigned NumInstr = 0;
+
+  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
+       MBB = *MBB->succ_begin()) {
+
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         NumInstr < SkipThreshold && I != E; ++I) {
+
+      if (I->isBundle() || !I->isBundled())
+        if (++NumInstr >= SkipThreshold)
+          return true;
+    }
+  }
+
+  return false;
+}
+
+void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+
+  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
+    return;
+
+  DebugLoc DL = From.getDebugLoc();
+  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+          .addOperand(To)
+          .addReg(AMDGPU::EXEC);
+}
+
+void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
+      ShaderType::PIXEL ||
+      !shouldSkip(&MBB, &MBB.getParent()->back()))
+    return;
+
+  MachineBasicBlock::iterator Insert = &MI;
+  ++Insert;
+
+  // If the exec mask is non-zero, skip the next two instructions
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addImm(3)
+          .addReg(AMDGPU::EXEC);
+
+  // Exec mask is zero: Export to NULL target...
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
+          .addImm(0)
+          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+          .addImm(0)
+          .addImm(1)
+          .addImm(1)
+          .addReg(AMDGPU::VGPR0)
+          .addReg(AMDGPU::VGPR0)
+          .addReg(AMDGPU::VGPR0)
+          .addReg(AMDGPU::VGPR0);
+
+  // ... and terminate wavefront
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+}
+
+void SILowerControlFlowPass::If(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+  unsigned Vcc = MI.getOperand(1).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
+          .addReg(Vcc);
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Reg);
+
+  Skip(MI, MI.getOperand(2));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Else(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+
+  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
+          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
+          .addReg(Src); // Saved EXEC
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Dst);
+
+  Skip(MI, MI.getOperand(2));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Break(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vcc = MI.getOperand(1).getReg();
+  unsigned Src = MI.getOperand(2).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(Vcc)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Saved = MI.getOperand(1).getReg();
+  unsigned Src = MI.getOperand(2).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(Saved)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Src = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Src);
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addOperand(MI.getOperand(1))
+          .addReg(AMDGPU::EXEC);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
+          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Reg);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Branch(MachineInstr &MI) {
+  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+    MI.eraseFromParent();
+
+  // If these aren't equal, this is probably an infinite loop.
+}
+
+void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  const MachineOperand &Op = MI.getOperand(0);
+
+#ifndef NDEBUG
+  const SIMachineFunctionInfo *MFI
+    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  // Kill is only allowed in pixel / geometry shaders.
+  assert(MFI->getShaderType() == ShaderType::PIXEL ||
+         MFI->getShaderType() == ShaderType::GEOMETRY);
+#endif
+
+  // Clear this thread from the exec mask if the operand is negative
+  if ((Op.isImm() || Op.isFPImm())) {
+    // Constant operand: Set exec mask to 0 or do nothing
+    if (Op.isImm() ? (Op.getImm() & 0x80000000) :
+        Op.getFPImm()->isNegative()) {
+      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+              .addImm(0);
+    }
+  } else {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
+           .addImm(0)
+           .addOperand(Op);
+  }
+
+  MI.eraseFromParent();
+}
+
+/// The m0 register stores the maximum allowable address for LDS reads and
+/// writes.  Its value must be at least the size in bytes of LDS allocated by
+/// the shader.  For simplicity, we set it to the maximum possible value.
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),  TII->get(AMDGPU::S_MOV_B32),
+            AMDGPU::M0).addImm(0xffffffff);
+}
+
+void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I = MI;
+
+  unsigned Save = MI.getOperand(1).getReg();
+  unsigned Idx = MI.getOperand(3).getReg();
+
+  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+            .addReg(Idx);
+    MBB.insert(I, MovRel);
+  } else {
+
+    assert(AMDGPU::SReg_64RegClass.contains(Save));
+    assert(AMDGPU::VReg_32RegClass.contains(Idx));
+
+    // Save the EXEC mask
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+            .addReg(AMDGPU::EXEC);
+
+    // Read the next variant into VCC (lower 32 bits) <- also loop target
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+            AMDGPU::VCC_LO)
+            .addReg(Idx);
+
+    // Move index from VCC into M0
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+            .addReg(AMDGPU::VCC_LO);
+
+    // Compare the just read M0 value to all possible Idx values
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+            .addReg(AMDGPU::M0)
+            .addReg(Idx);
+
+    // Update EXEC, save the original EXEC value to VCC
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+            .addReg(AMDGPU::VCC);
+
+    // Do the actual move
+    MBB.insert(I, MovRel);
+
+    // Update EXEC, switch all done bits to 0 and all todo bits to 1
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+            .addReg(AMDGPU::EXEC)
+            .addReg(AMDGPU::VCC);
+
+    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+            .addImm(-7)
+            .addReg(AMDGPU::EXEC);
+
+    // Restore EXEC
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+            .addReg(Save);
+
+  }
+  // FIXME: Are there any values other than the LDS address clamp that need to
+  // be stored in the m0 register and may be live for more than a few
+  // instructions?  If so, we should save the m0 register at the beginning
+  // of this function and restore it here.
+  // FIXME: Add support for LDS direct loads.
+  InitM0ForLDS(&MI);
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vec = MI.getOperand(2).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
+  if (!SubReg)
+    SubReg = Vec;
+
+  MachineInstr *MovRel =
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+            .addReg(SubReg + Off)
+            .addReg(AMDGPU::M0, RegState::Implicit)
+            .addReg(Vec, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
+void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+  unsigned Val = MI.getOperand(5).getReg();
+  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
+  if (!SubReg)
+    SubReg = Dst;
+
+  MachineInstr *MovRel = 
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
+            .addReg(SubReg + Off, RegState::Define)
+            .addReg(Val)
+            .addReg(AMDGPU::M0, RegState::Implicit)
+            .addReg(Dst, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
+bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
+  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  bool HaveKill = false;
+  bool NeedM0 = false;
+  bool NeedWQM = false;
+  unsigned Depth = 0;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+
+      MachineInstr &MI = *I;
+      if (TII->isDS(MI.getOpcode())) {
+        NeedM0 = true;
+        NeedWQM = true;
+      }
+
+      switch (MI.getOpcode()) {
+        default: break;
+        case AMDGPU::SI_IF:
+          ++Depth;
+          If(MI);
+          break;
+
+        case AMDGPU::SI_ELSE:
+          Else(MI);
+          break;
+
+        case AMDGPU::SI_BREAK:
+          Break(MI);
+          break;
+
+        case AMDGPU::SI_IF_BREAK:
+          IfBreak(MI);
+          break;
+
+        case AMDGPU::SI_ELSE_BREAK:
+          ElseBreak(MI);
+          break;
+
+        case AMDGPU::SI_LOOP:
+          ++Depth;
+          Loop(MI);
+          break;
+
+        case AMDGPU::SI_END_CF:
+          if (--Depth == 0 && HaveKill) {
+            SkipIfDead(MI);
+            HaveKill = false;
+          }
+          EndCf(MI);
+          break;
+
+        case AMDGPU::SI_KILL:
+          if (Depth == 0)
+            SkipIfDead(MI);
+          else
+            HaveKill = true;
+          Kill(MI);
+          break;
+
+        case AMDGPU::S_BRANCH:
+          Branch(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_SRC:
+          IndirectSrc(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_DST_V1:
+        case AMDGPU::SI_INDIRECT_DST_V2:
+        case AMDGPU::SI_INDIRECT_DST_V4:
+        case AMDGPU::SI_INDIRECT_DST_V8:
+        case AMDGPU::SI_INDIRECT_DST_V16:
+          IndirectDst(MI);
+          break;
+
+        case AMDGPU::V_INTERP_P1_F32:
+        case AMDGPU::V_INTERP_P2_F32:
+        case AMDGPU::V_INTERP_MOV_F32:
+          NeedWQM = true;
+          break;
+
+      }
+    }
+  }
+
+  if (NeedM0) {
+    MachineBasicBlock &MBB = MF.front();
+    // Initialize M0 to a value that won't cause LDS access to be discarded
+    // due to offset clamping
+    InitM0ForLDS(MBB.getFirstNonPHI());
+  }
+
+  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
+    MachineBasicBlock &MBB = MF.front();
+    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/R600/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/R600/SILowerI1Copies.cpp
new file mode 100644
index 0000000..db19235
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SILowerI1Copies.cpp
@@ -0,0 +1,154 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// i1 values are usually inserted by the CFG Structurize pass and they are
+/// unique in that they can be copied from VALU to SALU registers.
+/// This is not possible for any other value type.  Since there are no
+/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
+///
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "si-i1-copies"
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerI1Copies : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SILowerI1Copies() : MachineFunctionPass(ID) {
+    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+  virtual const char *getPassName() const override {
+    return "SI Lower il Copies";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  AU.addRequired<MachineDominatorTree>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
+                      "SI Lower il Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
+                    "SI Lower il Copies", false, false)
+
+char SILowerI1Copies::ID = 0;
+
+char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
+
+FunctionPass *llvm::createSILowerI1CopiesPass() {
+  return new SILowerI1Copies();
+}
+
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      MF.getTarget().getInstrInfo());
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  std::vector<unsigned> I1Defs;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::V_AND_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::V_OR_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() != AMDGPU::COPY ||
+          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
+          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
+        continue;
+
+
+      const TargetRegisterClass *DstRC =
+          MRI.getRegClass(MI.getOperand(0).getReg());
+      const TargetRegisterClass *SrcRC =
+          MRI.getRegClass(MI.getOperand(1).getReg());
+
+      if (DstRC == &AMDGPU::VReg_1RegClass &&
+          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
+                .addOperand(MI.getOperand(0))
+                .addImm(0)
+                .addImm(-1)
+                .addOperand(MI.getOperand(1))
+                .addImm(0)
+                .addImm(0)
+                .addImm(0)
+                .addImm(0);
+        MI.eraseFromParent();
+      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
+                 SrcRC == &AMDGPU::VReg_1RegClass) {
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
+                .addOperand(MI.getOperand(0))
+                .addImm(0)
+                .addOperand(MI.getOperand(1))
+                .addImm(0)
+                .addImm(0)
+                .addImm(0)
+                .addImm(0);
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  for (unsigned Reg : I1Defs)
+    MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
new file mode 100644
index 0000000..c53a7e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -0,0 +1,97 @@
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+
+#include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+#define MAX_LANES 64
+
+using namespace llvm;
+
+
+// Pin the vtable to this file.
+void SIMachineFunctionInfo::anchor() {}
+
+SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF),
+    PSInputAddr(0),
+    SpillTracker(),
+    NumUserSGPRs(0) { }
+
+static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
+  unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+
+  // We need to add this register as live out for the function, in order to
+  // have the live range calculated directly.
+  //
+  // When register spilling begins, we have already calculated the live
+  // live intervals for all the registers.  Since we are spilling SGPRs to
+  // VGPRs, we need to update the Lane VGPR's live interval every time we
+  // spill or restore a register.
+  //
+  // Unfortunately, there is no good way to update the live interval as
+  // the TargetInstrInfo callbacks for spilling and restoring don't give
+  // us access to the live interval information.
+  //
+  // We are lucky, though, because the InlineSpiller calls
+  // LiveRangeEdit::calculateRegClassAndHint() which iterates through
+  // all the new register that have been created when restoring a register
+  // and calls LiveIntervals::getInterval(), which creates and computes
+  // the live interval for the newly created register.  However, once this
+  // live intervals is created, it doesn't change and since we usually reuse
+  // the Lane VGPR multiple times, this means any uses after the first aren't
+  // added to the live interval.
+  //
+  // To work around this, we add Lane VGPRs to the functions live out list,
+  // so that we can guarantee its live range will cover all of its uses.
+
+  for (MachineBasicBlock &MBB : *MF) {
+    if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
+      MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
+      return VGPR;
+    }
+  }
+
+  LLVMContext &Ctx = MF->getFunction()->getContext();
+  Ctx.emitError("Could not find S_ENDPGM instruction.");
+
+  return VGPR;
+}
+
+unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes(
+    MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) {
+  unsigned StartLane = CurrentLane;
+  CurrentLane += NumRegs;
+  if (!LaneVGPR) {
+    LaneVGPR = createLaneVGPR(MRI, MF);
+  } else {
+    if (CurrentLane >= MAX_LANES) {
+      StartLane = CurrentLane = 0;
+      LaneVGPR = createLaneVGPR(MRI, MF);
+    }
+  }
+  return StartLane;
+}
+
+void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
+                                                           unsigned Reg,
+                                                           int Lane) {
+  SpilledRegisters[FrameIndex] = SpilledReg(Reg, Lane);
+}
+
+const SIMachineFunctionInfo::SpilledReg&
+SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
+  return SpilledRegisters[FrameIndex];
+}
diff --git a/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h
new file mode 100644
index 0000000..9684d28
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h
@@ -0,0 +1,68 @@
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIMACHINEFUNCTIONINFO_H_
+#define SIMACHINEFUNCTIONINFO_H_
+
+#include "AMDGPUMachineFunction.h"
+#include <map>
+
+namespace llvm {
+
+class MachineRegisterInfo;
+
+/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
+/// tells the hardware which interpolation parameters to load.
+class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+  void anchor() override;
+public:
+
+  struct SpilledReg {
+    unsigned VGPR;
+    int Lane;
+    SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
+    SpilledReg() : VGPR(0), Lane(-1) { }
+    bool hasLane() { return Lane != -1;}
+  };
+
+  struct RegSpillTracker {
+  private:
+    unsigned CurrentLane;
+    std::map<unsigned, SpilledReg> SpilledRegisters;
+  public:
+    unsigned LaneVGPR;
+    RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
+    /// \p NumRegs The number of consecutive registers what need to be spilled.
+    ///            This function will ensure that all registers are stored in
+    ///            the same VGPR.
+    /// \returns The lane to be used for storing the first register.
+    unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF,
+                          unsigned NumRegs = 1);
+    void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
+    const SpilledReg& getSpilledReg(unsigned FrameIndex);
+    bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
+  };
+
+  // SIMachineFunctionInfo definition
+
+  SIMachineFunctionInfo(const MachineFunction &MF);
+  unsigned PSInputAddr;
+  struct RegSpillTracker SpillTracker;
+  unsigned NumUserSGPRs;
+};
+
+} // End namespace llvm
+
+
+#endif //_SIMACHINEFUNCTIONINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp
new file mode 100644
index 0000000..2a9a2ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp
@@ -0,0 +1,188 @@
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+
+using namespace llvm;
+
+SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
+  { }
+
+BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AMDGPU::EXEC);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+  return Reserved;
+}
+
+unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                             MachineFunction &MF) const {
+  return RC->getNumRegs();
+}
+
+bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+  return Fn.getFrameInfo()->hasStackObjects();
+}
+
+void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  int Index = MI->getOperand(FIOperandNum).getIndex();
+  int64_t Offset = FrameInfo->getObjectOffset(Index);
+
+  FIOp.ChangeToImmediate(Offset);
+  if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
+    unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+            .addImm(Offset);
+    FIOp.ChangeToRegister(TmpReg, false);
+  }
+}
+
+const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
+                                                                   MVT VT) const {
+  switch(VT.SimpleTy) {
+    default:
+    case MVT::i32: return &AMDGPU::VReg_32RegClass;
+  }
+}
+
+unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return getEncodingValue(Reg) & 0xff;
+}
+
+const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  const TargetRegisterClass *BaseClasses[] = {
+    &AMDGPU::VReg_32RegClass,
+    &AMDGPU::SReg_32RegClass,
+    &AMDGPU::VReg_64RegClass,
+    &AMDGPU::SReg_64RegClass,
+    &AMDGPU::SReg_128RegClass,
+    &AMDGPU::SReg_256RegClass
+  };
+
+  for (const TargetRegisterClass *BaseClass : BaseClasses) {
+    if (BaseClass->contains(Reg)) {
+      return BaseClass;
+    }
+  }
+  return nullptr;
+}
+
+bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
+  if (!RC) {
+    return false;
+  }
+  return !hasVGPRs(RC);
+}
+
+bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
+  return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
+                                         const TargetRegisterClass *SRC) const {
+    if (hasVGPRs(SRC)) {
+      return SRC;
+    } else if (SRC == &AMDGPU::SCCRegRegClass) {
+      return &AMDGPU::VCCRegRegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
+      return &AMDGPU::VReg_32RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
+      return &AMDGPU::VReg_64RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
+      return &AMDGPU::VReg_128RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
+      return &AMDGPU::VReg_256RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
+      return &AMDGPU::VReg_512RegClass;
+    }
+    return nullptr;
+}
+
+const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
+                         const TargetRegisterClass *RC, unsigned SubIdx) const {
+  if (SubIdx == AMDGPU::NoSubRegister)
+    return RC;
+
+  // If this register has a sub-register, we can safely assume it is a 32-bit
+  // register, because all of SI's sub-registers are 32-bit.
+  if (isSGPRClass(RC)) {
+    return &AMDGPU::SGPR_32RegClass;
+  } else {
+    return &AMDGPU::VGPR_32RegClass;
+  }
+}
+
+unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
+                                          const TargetRegisterClass *SubRC,
+                                          unsigned Channel) const {
+  unsigned Index = getHWRegIndex(Reg);
+  return SubRC->getRegister(Index + Channel);
+}
+
+bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const {
+  switch (RCID) {
+  default: return false;
+  case AMDGPU::SSrc_32RegClassID:
+  case AMDGPU::SSrc_64RegClassID:
+  case AMDGPU::VSrc_32RegClassID:
+  case AMDGPU::VSrc_64RegClassID:
+    return true;
+  }
+}
+
+bool SIRegisterInfo::regClassCanUseImmediate(
+                             const TargetRegisterClass *RC) const {
+  return regClassCanUseImmediate(RC->getID());
+}
+
+unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
+                                           enum PreloadedValue Value) const {
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  switch (Value) {
+  case SIRegisterInfo::TGID_X:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
+  case SIRegisterInfo::TGID_Y:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
+  case SIRegisterInfo::TGID_Z:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
+  case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
+  case SIRegisterInfo::SCRATCH_PTR:
+    return AMDGPU::SGPR2_SGPR3;
+  }
+  llvm_unreachable("unexpected preloaded value type");
+}
diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.h b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h
new file mode 100644
index 0000000..5d0235c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h
@@ -0,0 +1,94 @@
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIRegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIREGISTERINFO_H_
+#define SIREGISTERINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+struct SIRegisterInfo : public AMDGPURegisterInfo {
+
+  SIRegisterInfo(const AMDGPUSubtarget &st);
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS) const override;
+
+  /// \brief get the register class of the specified type to use in the
+  /// CFGStructurizer
+  const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
+
+  unsigned getHWRegIndex(unsigned Reg) const override;
+
+  /// \brief Return the 'base' register class for this register.
+  /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
+  const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+
+  /// \returns true if this class contains only SGPR registers
+  bool isSGPRClass(const TargetRegisterClass *RC) const;
+
+  /// \returns true if this class contains VGPR registers.
+  bool hasVGPRs(const TargetRegisterClass *RC) const;
+
+  /// \returns A VGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentVGPRClass(
+                                          const TargetRegisterClass *SRC) const;
+
+  /// \returns The register class that is used for a sub-register of \p RC for
+  /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
+  /// be returned.
+  const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
+                                            unsigned SubIdx) const;
+
+  /// \p Channel This is the register channel (e.g. a value from 0-16), not the
+  ///            SubReg index.
+  /// \returns The sub-register of Reg that is in Channel.
+  unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
+                            unsigned Channel) const;
+
+  /// \returns True if operands defined with this register class can accept
+  /// inline immediates.
+  bool regClassCanUseImmediate(int RCID) const;
+
+  /// \returns True if operands defined with this register class can accept
+  /// inline immediates.
+  bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
+
+  enum PreloadedValue {
+    TGID_X,
+    TGID_Y,
+    TGID_Z,
+    SCRATCH_WAVE_OFFSET,
+    SCRATCH_PTR
+  };
+
+  /// \brief Returns the physical register that \p Value is stored in.
+  unsigned getPreloadedValue(const MachineFunction &MF,
+                             enum PreloadedValue Value) const;
+
+};
+
+} // End namespace llvm
+
+#endif // SIREGISTERINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.td b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td
new file mode 100644
index 0000000..8974b63
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td
@@ -0,0 +1,211 @@
+//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the SI registers
+//===----------------------------------------------------------------------===//
+
+class SIReg <string n, bits<16> encoding = 0> : Register<n> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+// Special Registers
+def VCC_LO : SIReg<"vcc_lo", 106>;
+def VCC_HI : SIReg<"vcc_hi", 107>;
+
+// VCC for 64-bit instructions
+def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 106;
+}
+
+def EXEC : SIReg<"EXEC", 126>;
+def SCC : SIReg<"SCC", 253>;
+def M0 : SIReg <"M0", 124>;
+
+// SGPR registers
+foreach Index = 0-101 in {
+  def SGPR#Index : SIReg <"SGPR"#Index, Index>;
+}
+
+// VGPR registers
+foreach Index = 0-255 in {
+  def VGPR#Index : SIReg <"VGPR"#Index, Index> {
+    let HWEncoding{8} = 1;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Groupings using register classes and tuples
+//===----------------------------------------------------------------------===//
+
+// SGPR 32-bit registers
+def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
+                            (add (sequence "SGPR%u", 0, 101))>;
+
+// SGPR 64-bit registers
+def SGPR_64Regs : RegisterTuples<[sub0, sub1],
+                             [(add (decimate (trunc SGPR_32, 101), 2)),
+                              (add (decimate (shl SGPR_32, 1), 2))]>;
+
+// SGPR 128-bit registers
+def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (decimate (trunc SGPR_32, 99), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4))]>;
+
+// SGPR 256-bit registers
+def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (decimate (trunc SGPR_32, 95), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4))]>;
+
+// SGPR 512-bit registers
+def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (decimate (trunc SGPR_32, 87), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4)),
+                               (add (decimate (shl SGPR_32, 8), 4)),
+                               (add (decimate (shl SGPR_32, 9), 4)),
+                               (add (decimate (shl SGPR_32, 10), 4)),
+                               (add (decimate (shl SGPR_32, 11), 4)),
+                               (add (decimate (shl SGPR_32, 12), 4)),
+                               (add (decimate (shl SGPR_32, 13), 4)),
+                               (add (decimate (shl SGPR_32, 14), 4)),
+                               (add (decimate (shl SGPR_32, 15), 4))]>;
+
+// VGPR 32-bit registers
+def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
+                            (add (sequence "VGPR%u", 0, 255))>;
+
+// VGPR 64-bit registers
+def VGPR_64 : RegisterTuples<[sub0, sub1],
+                             [(add (trunc VGPR_32, 255)),
+                              (add (shl VGPR_32, 1))]>;
+
+// VGPR 96-bit registers
+def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+                             [(add (trunc VGPR_32, 254)),
+                              (add (shl VGPR_32, 1)),
+                              (add (shl VGPR_32, 2))]>;
+
+// VGPR 128-bit registers
+def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (trunc VGPR_32, 253)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3))]>;
+
+// VGPR 256-bit registers
+def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (trunc VGPR_32, 249)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7))]>;
+
+// VGPR 512-bit registers
+def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (trunc VGPR_32, 241)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7)),
+                               (add (shl VGPR_32, 8)),
+                               (add (shl VGPR_32, 9)),
+                               (add (shl VGPR_32, 10)),
+                               (add (shl VGPR_32, 11)),
+                               (add (shl VGPR_32, 12)),
+                               (add (shl VGPR_32, 13)),
+                               (add (shl VGPR_32, 14)),
+                               (add (shl VGPR_32, 15))]>;
+
+//===----------------------------------------------------------------------===//
+//  Register classes used as source and destination
+//===----------------------------------------------------------------------===//
+
+// Special register classes for predicates and the M0 register
+def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>;
+def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
+def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
+def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
+
+// Register class for all scalar registers (SGPRs + Special Registers)
+def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
+  (add SGPR_32, M0Reg, VCC_LO)
+>;
+
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
+  (add SGPR_64Regs, VCCReg, EXECReg)
+>;
+
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
+
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
+
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
+
+// Register class for all vector registers (VGPRs + Interploation Registers)
+def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
+
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+  let Size = 96;
+}
+
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
+
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
+
+//===----------------------------------------------------------------------===//
+//  [SV]Src_(32|64) register classes, can have either an immediate or an register
+//===----------------------------------------------------------------------===//
+
+def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
+
+def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+
+def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+
+//===----------------------------------------------------------------------===//
+// SGPR and VGPR register classes
+//===----------------------------------------------------------------------===//
+
+def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128,
+                             (add VReg_128, SReg_128)>;
diff --git a/contrib/llvm/lib/Target/R600/SISchedule.td b/contrib/llvm/lib/Target/R600/SISchedule.td
new file mode 100644
index 0000000..28b65b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SISchedule.td
@@ -0,0 +1,15 @@
+//===-- SISchedule.td - SI Scheduling definitons -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: This is just a place holder for now.
+//
+//===----------------------------------------------------------------------===//
+
+
+def SI_Itin : ProcessorItineraries <[], [], []>;
diff --git a/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp
new file mode 100644
index 0000000..745c4b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp
@@ -0,0 +1,194 @@
+//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// The pass tries to use the 32-bit encoding for instructions when possible.
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-shrink-instructions"
+
+STATISTIC(NumInstructionsShrunk,
+          "Number of 64-bit instruction reduced to 32-bit.");
+
+namespace llvm {
+  void initializeSIShrinkInstructionsPass(PassRegistry&);
+}
+
+using namespace llvm;
+
+namespace {
+
+class SIShrinkInstructions : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIShrinkInstructions() : MachineFunctionPass(ID) {
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+  virtual const char *getPassName() const override {
+    return "SI Shrink Instructions";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
+                      "SI Lower il Copies", false, false)
+INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
+                    "SI Lower il Copies", false, false)
+
+char SIShrinkInstructions::ID = 0;
+
+FunctionPass *llvm::createSIShrinkInstructionsPass() {
+  return new SIShrinkInstructions();
+}
+
+static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
+                   const MachineRegisterInfo &MRI) {
+  if (!MO->isReg())
+    return false;
+
+  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
+    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
+
+  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
+}
+
+static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
+                      const SIRegisterInfo &TRI,
+                      const MachineRegisterInfo &MRI) {
+
+  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  // Can't shrink instruction with three operands.
+  if (Src2)
+    return false;
+
+  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src1Mod =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+  if (Src1 && (!isVGPR(Src1, TRI, MRI) || Src1Mod->getImm() != 0))
+    return false;
+
+  // We don't need to check src0, all input types are legal, so just make
+  // sure src0 isn't using any modifiers.
+  const MachineOperand *Src0Mod =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+  if (Src0Mod && Src0Mod->getImm() != 0)
+    return false;
+
+  // Check output modifiers
+  const MachineOperand *Omod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+  if (Omod && Omod->getImm() != 0)
+    return false;
+
+  const MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+  return !Clamp || Clamp->getImm() == 0;
+}
+
+bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      MF.getTarget().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  std::vector<unsigned> I1Defs;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+        continue;
+
+      if (!canShrink(MI, TII, TRI, MRI)) {
+        // Try commtuing the instruction and see if that enables us to shrink
+        // it.
+        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+            !canShrink(MI, TII, TRI, MRI))
+          continue;
+      }
+
+      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
+      // Op32 could be -1 here if we started with an instruction that had a
+      // a 32-bit encoding and then commuted it to an instruction that did not.
+      if (Op32 == -1)
+        continue;
+
+      if (TII->isVOPC(Op32)) {
+        unsigned DstReg = MI.getOperand(0).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+          // VOPC instructions can only write to the VCC register.  We can't
+          // force them to use VCC here, because the register allocator
+          // has trouble with sequences like this, which cause the allocator
+          // to run out of registes if vreg0 and vreg1 belong to the VCCReg
+          // register class:
+          // vreg0 = VOPC;
+          // vreg1 = VOPC;
+          // S_AND_B64 vreg0, vreg1
+          //
+          // So, instead of forcing the instruction to write to VCC, we provide a
+          // hint to the register allocator to use VCC and then we
+          // we will run this pass again after RA and shrink it if it outpus to
+          // VCC.
+          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+          continue;
+        }
+        if (DstReg != AMDGPU::VCC)
+          continue;
+      }
+
+      // We can shrink this instruction
+      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << "\n";);
+
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
+
+      // dst
+      MIB.addOperand(MI.getOperand(0));
+
+      MIB.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+
+      const MachineOperand *Src1 =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (Src1)
+        MIB.addOperand(*Src1);
+
+      for (const MachineOperand &MO : MI.implicit_operands())
+        MIB.addOperand(MO);
+
+      DEBUG(dbgs() << "e32 MI = "; MI.dump(); dbgs() << "\n";);
+      ++NumInstructionsShrunk;
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp b/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp
new file mode 100644
index 0000000..367963a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp
@@ -0,0 +1,162 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass removes performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+///   - v16i8 is used for constant memory resource descriptors.  This type is
+///      legal for some compute APIs, and we don't want to declare it as legal
+///      in the backend, because we want the legalizer to expand all v16i8
+///      operations.
+/// v1* => *
+///   - Having v1* types complicates the legalizer and we can easily replace
+///   - them with the element type.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+class SITypeRewriter : public FunctionPass,
+                       public InstVisitor<SITypeRewriter> {
+
+  static char ID;
+  Module *Mod;
+  Type *v16i8;
+  Type *v4i32;
+
+public:
+  SITypeRewriter() : FunctionPass(ID) { }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  const char *getPassName() const override {
+    return "SI Type Rewriter";
+  }
+  void visitLoadInst(LoadInst &I);
+  void visitCallInst(CallInst &I);
+  void visitBitCast(BitCastInst &I);
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+  Mod = &M;
+  v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
+  v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
+  return false;
+}
+
+bool SITypeRewriter::runOnFunction(Function &F) {
+  AttributeSet Set = F.getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+
+  unsigned ShaderType = ShaderType::COMPUTE;
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    Str.getAsInteger(0, ShaderType);
+  }
+  if (ShaderType == ShaderType::COMPUTE)
+    return false;
+
+  visit(F);
+  visit(F);
+
+  return false;
+}
+
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  Type *PtrTy = Ptr->getType();
+  Type *ElemTy = PtrTy->getPointerElementType();
+  IRBuilder<> Builder(&I);
+  if (ElemTy == v16i8)  {
+    Value *BitCast = Builder.CreateBitCast(Ptr,
+        PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
+    LoadInst *Load = Builder.CreateLoad(BitCast);
+    SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+    I.getAllMetadataOtherThanDebugLoc(MD);
+    for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+      Load->setMetadata(MD[i].first, MD[i].second);
+    }
+    Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
+    I.replaceAllUsesWith(BitCastLoad);
+    I.eraseFromParent();
+  }
+}
+
+void SITypeRewriter::visitCallInst(CallInst &I) {
+  IRBuilder<> Builder(&I);
+
+  SmallVector <Value*, 8> Args;
+  SmallVector <Type*, 8> Types;
+  bool NeedToReplace = false;
+  Function *F = I.getCalledFunction();
+  std::string Name = F->getName().str();
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    Value *Arg = I.getArgOperand(i);
+    if (Arg->getType() == v16i8) {
+      Args.push_back(Builder.CreateBitCast(Arg, v4i32));
+      Types.push_back(v4i32);
+      NeedToReplace = true;
+      Name = Name + ".v4i32";
+    } else if (Arg->getType()->isVectorTy() &&
+               Arg->getType()->getVectorNumElements() == 1 &&
+               Arg->getType()->getVectorElementType() ==
+                                              Type::getInt32Ty(I.getContext())){
+      Type *ElementTy = Arg->getType()->getVectorElementType();
+      std::string TypeName = "i32";
+      InsertElementInst *Def = cast<InsertElementInst>(Arg);
+      Args.push_back(Def->getOperand(1));
+      Types.push_back(ElementTy);
+      std::string VecTypeName = "v1" + TypeName;
+      Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
+      NeedToReplace = true;
+    } else {
+      Args.push_back(Arg);
+      Types.push_back(Arg->getType());
+    }
+  }
+
+  if (!NeedToReplace) {
+    return;
+  }
+  Function *NewF = Mod->getFunction(Name);
+  if (!NewF) {
+    NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
+    NewF->setAttributes(F->getAttributes());
+  }
+  I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+  I.eraseFromParent();
+}
+
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+  IRBuilder<> Builder(&I);
+  if (I.getDestTy() != v4i32) {
+    return;
+  }
+
+  if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
+    if (Op->getSrcTy() == v4i32) {
+      I.replaceAllUsesWith(Op->getOperand(0));
+      I.eraseFromParent();
+    }
+  }
+}
+
+FunctionPass *llvm::createSITypeRewriter() {
+  return new SITypeRewriter();
+}
diff --git a/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
new file mode 100644
index 0000000..f437564
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+/// \brief The target for the AMDGPU backend
+Target llvm::TheAMDGPUTarget;
+
+/// \brief Extern function to initialize the targets for the AMDGPU backend
+extern "C" void LLVMInitializeR600TargetInfo() {
+  RegisterTarget<Triple::r600, false>
+    R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+}
diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
new file mode 100644
index 0000000..9df0054
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -0,0 +1,939 @@
+//===-- SparcAsmParser.cpp - Parse Sparc assembly to MCInst instructions --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// The generated AsmMatcher SparcGenAsmMatcher uses "Sparc" as the target
+// namespace. But SPARC backend uses "SP" as its namespace.
+namespace llvm {
+  namespace Sparc {
+    using namespace SP;
+  }
+}
+
+namespace {
+class SparcOperand;
+class SparcAsmParser : public MCTargetAsmParser {
+
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "SparcGenAsmMatcher.inc"
+
+  /// }
+
+  // public interface of the MCTargetAsmParser.
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  // Custom parse functions for Sparc specific operands.
+  OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
+
+  OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
+
+  OperandMatchResultTy
+  parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Operand,
+                       bool isCall = false);
+
+  OperandMatchResultTy parseBranchModifiers(OperandVector &Operands);
+
+  // returns true if Tok is matched to a register and returns register in RegNo.
+  bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+                         unsigned &RegKind);
+
+  bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc);
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+
+  bool is64Bit() const { return STI.getTargetTriple().startswith("sparcv9"); }
+public:
+  SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                const MCInstrInfo &MII,
+                const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), Parser(parser) {
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+};
+
+  static unsigned IntRegs[32] = {
+    Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
+    Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
+    Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3,
+    Sparc::O4, Sparc::O5, Sparc::O6, Sparc::O7,
+    Sparc::L0, Sparc::L1, Sparc::L2, Sparc::L3,
+    Sparc::L4, Sparc::L5, Sparc::L6, Sparc::L7,
+    Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3,
+    Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 };
+
+  static unsigned FloatRegs[32] = {
+    Sparc::F0,  Sparc::F1,  Sparc::F2,  Sparc::F3,
+    Sparc::F4,  Sparc::F5,  Sparc::F6,  Sparc::F7,
+    Sparc::F8,  Sparc::F9,  Sparc::F10, Sparc::F11,
+    Sparc::F12, Sparc::F13, Sparc::F14, Sparc::F15,
+    Sparc::F16, Sparc::F17, Sparc::F18, Sparc::F19,
+    Sparc::F20, Sparc::F21, Sparc::F22, Sparc::F23,
+    Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27,
+    Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 };
+
+  static unsigned DoubleRegs[32] = {
+    Sparc::D0,  Sparc::D1,  Sparc::D2,  Sparc::D3,
+    Sparc::D4,  Sparc::D5,  Sparc::D6,  Sparc::D7,
+    Sparc::D8,  Sparc::D7,  Sparc::D8,  Sparc::D9,
+    Sparc::D12, Sparc::D13, Sparc::D14, Sparc::D15,
+    Sparc::D16, Sparc::D17, Sparc::D18, Sparc::D19,
+    Sparc::D20, Sparc::D21, Sparc::D22, Sparc::D23,
+    Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27,
+    Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 };
+
+  static unsigned QuadFPRegs[32] = {
+    Sparc::Q0,  Sparc::Q1,  Sparc::Q2,  Sparc::Q3,
+    Sparc::Q4,  Sparc::Q5,  Sparc::Q6,  Sparc::Q7,
+    Sparc::Q8,  Sparc::Q9,  Sparc::Q10, Sparc::Q11,
+    Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 };
+
+
+/// SparcOperand - Instances of this class represent a parsed Sparc machine
+/// instruction.
+class SparcOperand : public MCParsedAsmOperand {
+public:
+  enum RegisterKind {
+    rk_None,
+    rk_IntReg,
+    rk_FloatReg,
+    rk_DoubleReg,
+    rk_QuadReg,
+    rk_CCReg,
+    rk_Y
+  };
+private:
+  enum KindTy {
+    k_Token,
+    k_Register,
+    k_Immediate,
+    k_MemoryReg,
+    k_MemoryImm
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+    RegisterKind Kind;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    unsigned OffsetReg;
+    const MCExpr *Off;
+  };
+
+  union {
+    struct Token Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+public:
+  SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  bool isToken() const override { return Kind == k_Token; }
+  bool isReg() const override { return Kind == k_Register; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override { return isMEMrr() || isMEMri(); }
+  bool isMEMrr() const { return Kind == k_MemoryReg; }
+  bool isMEMri() const { return Kind == k_MemoryImm; }
+
+  bool isFloatReg() const {
+    return (Kind == k_Register && Reg.Kind == rk_FloatReg);
+  }
+
+  bool isFloatOrDoubleReg() const {
+    return (Kind == k_Register && (Reg.Kind == rk_FloatReg
+                                   || Reg.Kind == rk_DoubleReg));
+  }
+
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getMemBase() const {
+    assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!");
+    return Mem.Base;
+  }
+
+  unsigned getMemOffsetReg() const {
+    assert((Kind == k_MemoryReg) && "Invalid access!");
+    return Mem.OffsetReg;
+  }
+
+  const MCExpr *getMemOff() const {
+    assert((Kind == k_MemoryImm) && "Invalid access!");
+    return Mem.Off;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override {
+    return StartLoc;
+  }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override {
+    return EndLoc;
+  }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case k_Token:     OS << "Token: " << getToken() << "\n"; break;
+    case k_Register:  OS << "Reg: #" << getReg() << "\n"; break;
+    case k_Immediate: OS << "Imm: " << getImm() << "\n"; break;
+    case k_MemoryReg: OS << "Mem: " << getMemBase() << "+"
+                         << getMemOffsetReg() << "\n"; break;
+    case k_MemoryImm: assert(getMemOff() != nullptr);
+      OS << "Mem: " << getMemBase()
+         << "+" << *getMemOff()
+         << "\n"; break;
+    }
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addMEMrrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+    assert(getMemOffsetReg() != 0 && "Invalid offset");
+    Inst.addOperand(MCOperand::CreateReg(getMemOffsetReg()));
+  }
+
+  void addMEMriOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst, Expr);
+  }
+
+  static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
+    auto Op = make_unique<SparcOperand>(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind,
+                                                 SMLoc S, SMLoc E) {
+    auto Op = make_unique<SparcOperand>(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->Reg.Kind   = (SparcOperand::RegisterKind)Kind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                 SMLoc E) {
+    auto Op = make_unique<SparcOperand>(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static bool MorphToDoubleReg(SparcOperand &Op) {
+    unsigned Reg = Op.getReg();
+    assert(Op.Reg.Kind == rk_FloatReg);
+    unsigned regIdx = Reg - Sparc::F0;
+    if (regIdx % 2 || regIdx > 31)
+      return false;
+    Op.Reg.RegNum = DoubleRegs[regIdx / 2];
+    Op.Reg.Kind = rk_DoubleReg;
+    return true;
+  }
+
+  static bool MorphToQuadReg(SparcOperand &Op) {
+    unsigned Reg = Op.getReg();
+    unsigned regIdx = 0;
+    switch (Op.Reg.Kind) {
+    default: llvm_unreachable("Unexpected register kind!");
+    case rk_FloatReg:
+      regIdx = Reg - Sparc::F0;
+      if (regIdx % 4 || regIdx > 31)
+        return false;
+      Reg = QuadFPRegs[regIdx / 4];
+      break;
+    case rk_DoubleReg:
+      regIdx =  Reg - Sparc::D0;
+      if (regIdx % 2 || regIdx > 31)
+        return false;
+      Reg = QuadFPRegs[regIdx / 2];
+      break;
+    }
+    Op.Reg.RegNum = Reg;
+    Op.Reg.Kind = rk_QuadReg;
+    return true;
+  }
+
+  static std::unique_ptr<SparcOperand>
+  MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+    unsigned offsetReg = Op->getReg();
+    Op->Kind = k_MemoryReg;
+    Op->Mem.Base = Base;
+    Op->Mem.OffsetReg = offsetReg;
+    Op->Mem.Off = nullptr;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand>
+  CreateMEMri(unsigned Base, const MCExpr *Off, SMLoc S, SMLoc E) {
+    auto Op = make_unique<SparcOperand>(k_MemoryImm);
+    Op->Mem.Base = Base;
+    Op->Mem.OffsetReg = 0;
+    Op->Mem.Off = Off;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand>
+  MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+    const MCExpr *Imm  = Op->getImm();
+    Op->Kind = k_MemoryImm;
+    Op->Mem.Base = Base;
+    Op->Mem.OffsetReg = 0;
+    Op->Mem.Off = Imm;
+    return Op;
+  }
+};
+
+} // end namespace
+
+bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                             OperandVector &Operands,
+                                             MCStreamer &Out,
+                                             unsigned &ErrorInfo,
+                                             bool MatchingInlineAsm) {
+  MCInst Inst;
+  SmallVector<MCInst, 8> Instructions;
+  unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                              MatchingInlineAsm);
+  switch (MatchResult) {
+  default:
+    break;
+
+  case Match_Success: {
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  }
+
+  case Match_MissingFeature:
+    return Error(IDLoc,
+                 "instruction requires a CPU feature not currently enabled");
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((SparcOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction mnemonic");
+  }
+  return true;
+}
+
+bool SparcAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
+{
+  const AsmToken &Tok = Parser.getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = 0;
+  if (getLexer().getKind() != AsmToken::Percent)
+    return false;
+  Parser.Lex();
+  unsigned regKind = SparcOperand::rk_None;
+  if (matchRegisterName(Tok, RegNo, regKind)) {
+    Parser.Lex();
+    return false;
+  }
+
+  return Error(StartLoc, "invalid register name");
+}
+
+static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
+                                 unsigned VariantID);
+
+bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                      StringRef Name, SMLoc NameLoc,
+                                      OperandVector &Operands) {
+
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(SparcOperand::CreateToken(Name, NameLoc));
+
+  // apply mnemonic aliases, if any, so that we can parse operands correctly.
+  applyMnemonicAliases(Name, getAvailableFeatures(), 0);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (getLexer().is(AsmToken::Comma)) {
+      if (parseBranchModifiers(Operands) != MatchOperand_Success) {
+        SMLoc Loc = getLexer().getLoc();
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "unexpected token");
+      }
+    }
+    if (parseOperand(Operands, Name) != MatchOperand_Success) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token");
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Name) != MatchOperand_Success) {
+        SMLoc Loc = getLexer().getLoc();
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "unexpected token");
+      }
+    }
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token");
+  }
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool SparcAsmParser::
+ParseDirective(AsmToken DirectiveID)
+{
+  StringRef IDVal = DirectiveID.getString();
+
+  if (IDVal == ".byte")
+    return parseDirectiveWord(1, DirectiveID.getLoc());
+
+  if (IDVal == ".half")
+    return parseDirectiveWord(2, DirectiveID.getLoc());
+
+  if (IDVal == ".word")
+    return parseDirectiveWord(4, DirectiveID.getLoc());
+
+  if (IDVal == ".nword")
+    return parseDirectiveWord(is64Bit() ? 8 : 4, DirectiveID.getLoc());
+
+  if (is64Bit() && IDVal == ".xword")
+    return parseDirectiveWord(8, DirectiveID.getLoc());
+
+  if (IDVal == ".register") {
+    // For now, ignore .register directive.
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  // Let the MC layer to handle other directives.
+  return true;
+}
+
+bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+  Parser.Lex();
+  return false;
+}
+
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
+
+  SMLoc S, E;
+  unsigned BaseReg = 0;
+
+  if (ParseRegister(BaseReg, S, E)) {
+    return MatchOperand_NoMatch;
+  }
+
+  switch (getLexer().getKind()) {
+  default: return MatchOperand_NoMatch;
+
+  case AsmToken::Comma:
+  case AsmToken::RBrac:
+  case AsmToken::EndOfStatement:
+    Operands.push_back(SparcOperand::CreateMEMri(BaseReg, nullptr, S, E));
+    return MatchOperand_Success;
+
+  case AsmToken:: Plus:
+    Parser.Lex(); // Eat the '+'
+    break;
+  case AsmToken::Minus:
+    break;
+  }
+
+  std::unique_ptr<SparcOperand> Offset;
+  OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
+  if (ResTy != MatchOperand_Success || !Offset)
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(
+      Offset->isImm() ? SparcOperand::MorphToMEMri(BaseReg, std::move(Offset))
+                      : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset)));
+
+  return MatchOperand_Success;
+}
+
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+    return ResTy;
+
+  if (getLexer().is(AsmToken::LBrac)) {
+    // Memory operand
+    Operands.push_back(SparcOperand::CreateToken("[",
+                                                 Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the [
+
+    if (Mnemonic == "cas" || Mnemonic == "casx") {
+      SMLoc S = Parser.getTok().getLoc();
+      if (getLexer().getKind() != AsmToken::Percent)
+        return MatchOperand_NoMatch;
+      Parser.Lex(); // eat %
+
+      unsigned RegNo, RegKind;
+      if (!matchRegisterName(Parser.getTok(), RegNo, RegKind))
+        return MatchOperand_NoMatch;
+
+      Parser.Lex(); // Eat the identifier token.
+      SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer()-1);
+      Operands.push_back(SparcOperand::CreateReg(RegNo, RegKind, S, E));
+      ResTy = MatchOperand_Success;
+    } else {
+      ResTy = parseMEMOperand(Operands);
+    }
+
+    if (ResTy != MatchOperand_Success)
+      return ResTy;
+
+    if (!getLexer().is(AsmToken::RBrac))
+      return MatchOperand_ParseFail;
+
+    Operands.push_back(SparcOperand::CreateToken("]",
+                                                 Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the ]
+    return MatchOperand_Success;
+  }
+
+  std::unique_ptr<SparcOperand> Op;
+
+  ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call"));
+  if (ResTy != MatchOperand_Success || !Op)
+    return MatchOperand_ParseFail;
+
+  // Push the parsed operand into the list of operands
+  Operands.push_back(std::move(Op));
+
+  return MatchOperand_Success;
+}
+
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
+                                     bool isCall) {
+
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *EVal;
+
+  Op = nullptr;
+  switch (getLexer().getKind()) {
+  default:  break;
+
+  case AsmToken::Percent:
+    Parser.Lex(); // Eat the '%'.
+    unsigned RegNo;
+    unsigned RegKind;
+    if (matchRegisterName(Parser.getTok(), RegNo, RegKind)) {
+      StringRef name = Parser.getTok().getString();
+      Parser.Lex(); // Eat the identifier token.
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      switch (RegNo) {
+      default:
+        Op = SparcOperand::CreateReg(RegNo, RegKind, S, E);
+        break;
+      case Sparc::Y:
+        Op = SparcOperand::CreateToken("%y", S);
+        break;
+
+      case Sparc::ICC:
+        if (name == "xcc")
+          Op = SparcOperand::CreateToken("%xcc", S);
+        else
+          Op = SparcOperand::CreateToken("%icc", S);
+        break;
+      }
+      break;
+    }
+    if (matchSparcAsmModifiers(EVal, E)) {
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      Op = SparcOperand::CreateImm(EVal, S, E);
+    }
+    break;
+
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+    if (!getParser().parseExpression(EVal, E))
+      Op = SparcOperand::CreateImm(EVal, S, E);
+    break;
+
+  case AsmToken::Identifier: {
+    StringRef Identifier;
+    if (!getParser().parseIdentifier(Identifier)) {
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
+
+      const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                                  getContext());
+      if (isCall &&
+          getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
+        Res = SparcMCExpr::Create(SparcMCExpr::VK_Sparc_WPLT30, Res,
+                                  getContext());
+      Op = SparcOperand::CreateImm(Res, S, E);
+    }
+    break;
+  }
+  }
+  return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
+}
+
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
+
+  // parse (,a|,pn|,pt)+
+
+  while (getLexer().is(AsmToken::Comma)) {
+
+    Parser.Lex(); // Eat the comma
+
+    if (!getLexer().is(AsmToken::Identifier))
+      return MatchOperand_ParseFail;
+    StringRef modName = Parser.getTok().getString();
+    if (modName == "a" || modName == "pn" || modName == "pt") {
+      Operands.push_back(SparcOperand::CreateToken(modName,
+                                                   Parser.getTok().getLoc()));
+      Parser.Lex(); // eat the identifier.
+    }
+  }
+  return MatchOperand_Success;
+}
+
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
+                                       unsigned &RegNo,
+                                       unsigned &RegKind)
+{
+  int64_t intVal = 0;
+  RegNo = 0;
+  RegKind = SparcOperand::rk_None;
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef name = Tok.getString();
+
+    // %fp
+    if (name.equals("fp")) {
+      RegNo = Sparc::I6;
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    // %sp
+    if (name.equals("sp")) {
+      RegNo = Sparc::O6;
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+
+    if (name.equals("y")) {
+      RegNo = Sparc::Y;
+      RegKind = SparcOperand::rk_Y;
+      return true;
+    }
+
+    if (name.equals("icc")) {
+      RegNo = Sparc::ICC;
+      RegKind = SparcOperand::rk_CCReg;
+      return true;
+    }
+
+    if (name.equals("xcc")) {
+      // FIXME:: check 64bit.
+      RegNo = Sparc::ICC;
+      RegKind = SparcOperand::rk_CCReg;
+      return true;
+    }
+
+    // %fcc0 - %fcc3
+    if (name.substr(0, 3).equals_lower("fcc")
+        && !name.substr(3).getAsInteger(10, intVal)
+        && intVal < 4) {
+      // FIXME: check 64bit and  handle %fcc1 - %fcc3
+      RegNo = Sparc::FCC0 + intVal;
+      RegKind = SparcOperand::rk_CCReg;
+      return true;
+    }
+
+    // %g0 - %g7
+    if (name.substr(0, 1).equals_lower("g")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    // %o0 - %o7
+    if (name.substr(0, 1).equals_lower("o")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[8 + intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    if (name.substr(0, 1).equals_lower("l")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[16 + intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    if (name.substr(0, 1).equals_lower("i")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[24 + intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    // %f0 - %f31
+    if (name.substr(0, 1).equals_lower("f")
+        && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
+      RegNo = FloatRegs[intVal];
+      RegKind = SparcOperand::rk_FloatReg;
+      return true;
+    }
+    // %f32 - %f62
+    if (name.substr(0, 1).equals_lower("f")
+        && !name.substr(1, 2).getAsInteger(10, intVal)
+        && intVal >= 32 && intVal <= 62 && (intVal % 2 == 0)) {
+      // FIXME: Check V9
+      RegNo = DoubleRegs[intVal/2];
+      RegKind = SparcOperand::rk_DoubleReg;
+      return true;
+    }
+
+    // %r0 - %r31
+    if (name.substr(0, 1).equals_lower("r")
+        && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 31) {
+      RegNo = IntRegs[intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool hasGOTReference(const MCExpr *Expr) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    if (const SparcMCExpr *SE = dyn_cast<SparcMCExpr>(Expr))
+      return hasGOTReference(SE->getSubExpr());
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    return hasGOTReference(BE->getLHS()) || hasGOTReference(BE->getRHS());
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    return (SymRef.getSymbol().getName() == "_GLOBAL_OFFSET_TABLE_");
+  }
+
+  case MCExpr::Unary:
+    return hasGOTReference(cast<MCUnaryExpr>(Expr)->getSubExpr());
+  }
+  return false;
+}
+
+bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
+                                            SMLoc &EndLoc)
+{
+  AsmToken Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return false;
+
+  StringRef name = Tok.getString();
+
+  SparcMCExpr::VariantKind VK = SparcMCExpr::parseVariantKind(name);
+
+  if (VK == SparcMCExpr::VK_Sparc_None)
+    return false;
+
+  Parser.Lex(); // Eat the identifier.
+  if (Parser.getTok().getKind() != AsmToken::LParen)
+    return false;
+
+  Parser.Lex(); // Eat the LParen token.
+  const MCExpr *subExpr;
+  if (Parser.parseParenExpression(subExpr, EndLoc))
+    return false;
+
+  bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_;
+
+  switch(VK) {
+  default: break;
+  case SparcMCExpr::VK_Sparc_LO:
+    VK =  (hasGOTReference(subExpr)
+           ? SparcMCExpr::VK_Sparc_PC10
+           : (isPIC ? SparcMCExpr::VK_Sparc_GOT10 : VK));
+    break;
+  case SparcMCExpr::VK_Sparc_HI:
+    VK =  (hasGOTReference(subExpr)
+           ? SparcMCExpr::VK_Sparc_PC22
+           : (isPIC ? SparcMCExpr::VK_Sparc_GOT22 : VK));
+    break;
+  }
+
+  EVal = SparcMCExpr::Create(VK, subExpr, getContext());
+  return true;
+}
+
+
+extern "C" void LLVMInitializeSparcAsmParser() {
+  RegisterMCAsmParser<SparcAsmParser> A(TheSparcTarget);
+  RegisterMCAsmParser<SparcAsmParser> B(TheSparcV9Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "SparcGenAsmMatcher.inc"
+
+unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
+                                                    unsigned Kind) {
+  SparcOperand &Op = (SparcOperand &)GOp;
+  if (Op.isFloatOrDoubleReg()) {
+    switch (Kind) {
+    default: break;
+    case MCK_DFPRegs:
+      if (!Op.isFloatReg() || SparcOperand::MorphToDoubleReg(Op))
+        return MCTargetAsmParser::Match_Success;
+      break;
+    case MCK_QFPRegs:
+      if (SparcOperand::MorphToQuadReg(Op))
+        return MCTargetAsmParser::Match_Success;
+      break;
+    }
+  }
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 0000000..f3441ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,497 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that attempts to fill delay slots with useful
+// instructions. If no instructions can be moved into the delay slot, then a
+// NOP is placed.
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-sparc-delay-filler",
+  cl::init(false),
+  cl::desc("Disable the Sparc delay slot filler."),
+  cl::Hidden);
+
+namespace {
+  struct Filler : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    const SparcSubtarget *Subtarget;
+
+    static char ID;
+    Filler(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm),
+        Subtarget(&TM.getSubtarget<SparcSubtarget>()) {
+    }
+
+    const char *getPassName() const override {
+      return "SPARC Delay Slot Filler";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) override {
+      bool Changed = false;
+
+      // This pass invalidates liveness information when it reorders
+      // instructions to fill delay slot.
+      F.getRegInfo().invalidateLiveness();
+
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+    void insertCallDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses);
+
+    void insertDefsUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegDefs,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+                    unsigned Reg);
+
+    bool delayHasHazard(MachineBasicBlock::iterator candidate,
+                        bool &sawLoad, bool &sawStore,
+                        SmallSet<unsigned, 32> &RegDefs,
+                        SmallSet<unsigned, 32> &RegUses);
+
+    MachineBasicBlock::iterator
+    findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot);
+
+    bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize);
+
+    bool tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI);
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Sparc MachineFunctions
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+  return new Filler(tm);
+}
+
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineBasicBlock::iterator MI = I;
+    ++I;
+
+    // If MI is restore, try combining it with previous inst.
+    if (!DisableDelaySlotFiller &&
+        (MI->getOpcode() == SP::RESTORErr
+         || MI->getOpcode() == SP::RESTOREri)) {
+      Changed |= tryCombineRestoreWithPrevInst(MBB, MI);
+      continue;
+    }
+
+    if (!Subtarget->isV9() &&
+        (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
+         || MI->getOpcode() == SP::FCMPQ)) {
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+      Changed = true;
+      continue;
+    }
+
+    // If MI has no delay slot, skip.
+    if (!MI->hasDelaySlot())
+      continue;
+
+    MachineBasicBlock::iterator D = MBB.end();
+
+    if (!DisableDelaySlotFiller)
+      D = findDelayInstr(MBB, MI);
+
+    ++FilledSlots;
+    Changed = true;
+
+    if (D == MBB.end())
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+    else
+      MBB.splice(I, &MBB, D);
+
+    unsigned structSize = 0;
+    if (needsUnimp(MI, structSize)) {
+      MachineBasicBlock::iterator J = MI;
+      ++J; // skip the delay filler.
+      assert (J != MBB.end() && "MI needs a delay instruction.");
+      BuildMI(MBB, ++J, MI->getDebugLoc(),
+              TII->get(SP::UNIMP)).addImm(structSize);
+      // Bundle the delay filler and unimp with the instruction.
+      MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), J);
+    } else {
+      MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), I);
+    }
+  }
+  return Changed;
+}
+
+MachineBasicBlock::iterator
+Filler::findDelayInstr(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator slot)
+{
+  SmallSet<unsigned, 32> RegDefs;
+  SmallSet<unsigned, 32> RegUses;
+  bool sawLoad = false;
+  bool sawStore = false;
+
+  if (slot == MBB.begin())
+    return MBB.end();
+
+  if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL)
+    return MBB.end();
+
+  if (slot->getOpcode() == SP::RETL) {
+    MachineBasicBlock::iterator J = slot;
+    --J;
+
+    if (J->getOpcode() == SP::RESTORErr
+        || J->getOpcode() == SP::RESTOREri) {
+      // change retl to ret.
+      slot->setDesc(TM.getInstrInfo()->get(SP::RET));
+      return J;
+    }
+  }
+
+  // Call's delay filler can def some of call's uses.
+  if (slot->isCall())
+    insertCallDefsUses(slot, RegDefs, RegUses);
+  else
+    insertDefsUses(slot, RegDefs, RegUses);
+
+  bool done = false;
+
+  MachineBasicBlock::iterator I = slot;
+
+  while (!done) {
+    done = (I == MBB.begin());
+
+    if (!done)
+      --I;
+
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+    if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() ||
+        I->hasDelaySlot() || I->isBundledWithSucc())
+      break;
+
+    if (delayHasHazard(I, sawLoad, sawStore, RegDefs, RegUses)) {
+      insertDefsUses(I, RegDefs, RegUses);
+      continue;
+    }
+
+    return I;
+  }
+  return MBB.end();
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+                            bool &sawLoad,
+                            bool &sawStore,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses)
+{
+
+  if (candidate->isImplicitDef() || candidate->isKill())
+    return true;
+
+  if (candidate->mayLoad()) {
+    sawLoad = true;
+    if (sawStore)
+      return true;
+  }
+
+  if (candidate->mayStore()) {
+    if (sawStore)
+      return true;
+    sawStore = true;
+    if (sawLoad)
+      return true;
+  }
+
+  for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) {
+    const MachineOperand &MO = candidate->getOperand(i);
+    if (!MO.isReg())
+      continue; // skip
+
+    unsigned Reg = MO.getReg();
+
+    if (MO.isDef()) {
+      // check whether Reg is defined or used before delay slot.
+      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
+        return true;
+    }
+    if (MO.isUse()) {
+      // check whether Reg is defined before delay slot.
+      if (IsRegInSet(RegDefs, Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+
+void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
+                                SmallSet<unsigned, 32>& RegDefs,
+                                SmallSet<unsigned, 32>& RegUses)
+{
+  // Call defines o7, which is visible to the instruction in delay slot.
+  RegDefs.insert(SP::O7);
+
+  switch(MI->getOpcode()) {
+  default: llvm_unreachable("Unknown opcode.");
+  case SP::CALL: break;
+  case SP::CALLrr:
+  case SP::CALLri:
+    assert(MI->getNumOperands() >= 2);
+    const MachineOperand &Reg = MI->getOperand(0);
+    assert(Reg.isReg() && "CALL first operand is not a register.");
+    assert(Reg.isUse() && "CALL first operand is not a use.");
+    RegUses.insert(Reg.getReg());
+
+    const MachineOperand &RegOrImm = MI->getOperand(1);
+    if (RegOrImm.isImm())
+        break;
+    assert(RegOrImm.isReg() && "CALLrr second operand is not a register.");
+    assert(RegOrImm.isUse() && "CALLrr second operand is not a use.");
+    RegUses.insert(RegOrImm.getReg());
+    break;
+  }
+}
+
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses)
+{
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0)
+      continue;
+    if (MO.isDef())
+      RegDefs.insert(Reg);
+    if (MO.isUse()) {
+      // Implicit register uses of retl are return values and
+      // retl does not use them.
+      if (MO.isImplicit() && MI->getOpcode() == SP::RETL)
+        continue;
+      RegUses.insert(Reg);
+    }
+  }
+}
+
+// returns true if the Reg or its alias is in the RegSet.
+bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
+{
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true);
+       AI.isValid(); ++AI)
+    if (RegSet.count(*AI))
+      return true;
+  return false;
+}
+
+bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
+{
+  if (!I->isCall())
+    return false;
+
+  unsigned structSizeOpNum = 0;
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unknown call opcode.");
+  case SP::CALL: structSizeOpNum = 1; break;
+  case SP::CALLrr:
+  case SP::CALLri: structSizeOpNum = 2; break;
+  case SP::TLS_CALL: return false;
+  }
+
+  const MachineOperand &MO = I->getOperand(structSizeOpNum);
+  if (!MO.isImm())
+    return false;
+  StructSize = MO.getImm();
+  return true;
+}
+
+static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI,
+                              MachineBasicBlock::iterator AddMI,
+                              const TargetInstrInfo *TII)
+{
+  // Before:  add  <op0>, <op1>, %i[0-7]
+  //          restore %g0, %g0, %i[0-7]
+  //
+  // After :  restore <op0>, <op1>, %o[0-7]
+
+  unsigned reg = AddMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  // Erase RESTORE.
+  RestoreMI->eraseFromParent();
+
+  // Change ADD to RESTORE.
+  AddMI->setDesc(TII->get((AddMI->getOpcode() == SP::ADDrr)
+                          ? SP::RESTORErr
+                          : SP::RESTOREri));
+
+  // Map the destination register.
+  AddMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI,
+                             MachineBasicBlock::iterator OrMI,
+                             const TargetInstrInfo *TII)
+{
+  // Before:  or  <op0>, <op1>, %i[0-7]
+  //          restore %g0, %g0, %i[0-7]
+  //    and <op0> or <op1> is zero,
+  //
+  // After :  restore <op0>, <op1>, %o[0-7]
+
+  unsigned reg = OrMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  // check whether it is a copy.
+  if (OrMI->getOpcode() == SP::ORrr
+      && OrMI->getOperand(1).getReg() != SP::G0
+      && OrMI->getOperand(2).getReg() != SP::G0)
+    return false;
+
+  if (OrMI->getOpcode() == SP::ORri
+      && OrMI->getOperand(1).getReg() != SP::G0
+      && (!OrMI->getOperand(2).isImm() || OrMI->getOperand(2).getImm() != 0))
+    return false;
+
+  // Erase RESTORE.
+  RestoreMI->eraseFromParent();
+
+  // Change OR to RESTORE.
+  OrMI->setDesc(TII->get((OrMI->getOpcode() == SP::ORrr)
+                         ? SP::RESTORErr
+                         : SP::RESTOREri));
+
+  // Map the destination register.
+  OrMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI,
+                                 MachineBasicBlock::iterator SetHiMI,
+                                 const TargetInstrInfo *TII)
+{
+  // Before:  sethi imm3, %i[0-7]
+  //          restore %g0, %g0, %g0
+  //
+  // After :  restore %g0, (imm3<<10), %o[0-7]
+
+  unsigned reg = SetHiMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  if (!SetHiMI->getOperand(1).isImm())
+    return false;
+
+  int64_t imm = SetHiMI->getOperand(1).getImm();
+
+  // Is it a 3 bit immediate?
+  if (!isInt<3>(imm))
+    return false;
+
+  // Make it a 13 bit immediate.
+  imm = (imm << 10) & 0x1FFF;
+
+  assert(RestoreMI->getOpcode() == SP::RESTORErr);
+
+  RestoreMI->setDesc(TII->get(SP::RESTOREri));
+
+  RestoreMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+  RestoreMI->getOperand(1).setReg(SP::G0);
+  RestoreMI->getOperand(2).ChangeToImmediate(imm);
+
+
+  // Erase the original SETHI.
+  SetHiMI->eraseFromParent();
+
+  return true;
+}
+
+bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI)
+{
+  // No previous instruction.
+  if (MBBI == MBB.begin())
+    return false;
+
+  // assert that MBBI is a "restore %g0, %g0, %g0".
+  assert(MBBI->getOpcode() == SP::RESTORErr
+         && MBBI->getOperand(0).getReg() == SP::G0
+         && MBBI->getOperand(1).getReg() == SP::G0
+         && MBBI->getOperand(2).getReg() == SP::G0);
+
+  MachineBasicBlock::iterator PrevInst = std::prev(MBBI);
+
+  // It cannot be combined with a bundled instruction.
+  if (PrevInst->isBundledWithSucc())
+    return false;
+
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+
+  switch (PrevInst->getOpcode()) {
+  default: break;
+  case SP::ADDrr:
+  case SP::ADDri: return combineRestoreADD(MBBI, PrevInst, TII); break;
+  case SP::ORrr:
+  case SP::ORri:  return combineRestoreOR(MBBI, PrevInst, TII); break;
+  case SP::SETHIi: return combineRestoreSETHIi(MBBI, PrevInst, TII); break;
+  }
+  // It cannot combine with the previous instruction.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
new file mode 100644
index 0000000..4df0990
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -0,0 +1,480 @@
+//===- SparcDisassembler.cpp - Disassembler for Sparc -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Sparc Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// SparcDisassembler - a disassembler class for Sparc.
+class SparcDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx)
+  {}
+  virtual ~SparcDisassembler() {}
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+};
+
+}
+
+namespace llvm {
+  extern Target TheSparcTarget, TheSparcV9Target;
+}
+
+static MCDisassembler *createSparcDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new SparcDisassembler(STI, Ctx);
+}
+
+
+extern "C" void LLVMInitializeSparcDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheSparcTarget,
+                                         createSparcDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheSparcV9Target,
+                                         createSparcDisassembler);
+}
+
+
+
+static const unsigned IntRegDecoderTable[] = {
+  SP::G0,  SP::G1,  SP::G2,  SP::G3,
+  SP::G4,  SP::G5,  SP::G6,  SP::G7,
+  SP::O0,  SP::O1,  SP::O2,  SP::O3,
+  SP::O4,  SP::O5,  SP::O6,  SP::O7,
+  SP::L0,  SP::L1,  SP::L2,  SP::L3,
+  SP::L4,  SP::L5,  SP::L6,  SP::L7,
+  SP::I0,  SP::I1,  SP::I2,  SP::I3,
+  SP::I4,  SP::I5,  SP::I6,  SP::I7 };
+
+static const unsigned FPRegDecoderTable[] = {
+  SP::F0,   SP::F1,   SP::F2,   SP::F3,
+  SP::F4,   SP::F5,   SP::F6,   SP::F7,
+  SP::F8,   SP::F9,   SP::F10,  SP::F11,
+  SP::F12,  SP::F13,  SP::F14,  SP::F15,
+  SP::F16,  SP::F17,  SP::F18,  SP::F19,
+  SP::F20,  SP::F21,  SP::F22,  SP::F23,
+  SP::F24,  SP::F25,  SP::F26,  SP::F27,
+  SP::F28,  SP::F29,  SP::F30,  SP::F31 };
+
+static const unsigned DFPRegDecoderTable[] = {
+  SP::D0,   SP::D16,  SP::D1,   SP::D17,
+  SP::D2,   SP::D18,  SP::D3,   SP::D19,
+  SP::D4,   SP::D20,  SP::D5,   SP::D21,
+  SP::D6,   SP::D22,  SP::D7,   SP::D23,
+  SP::D8,   SP::D24,  SP::D9,   SP::D25,
+  SP::D10,  SP::D26,  SP::D11,  SP::D27,
+  SP::D12,  SP::D28,  SP::D13,  SP::D29,
+  SP::D14,  SP::D30,  SP::D15,  SP::D31 };
+
+static const unsigned QFPRegDecoderTable[] = {
+  SP::Q0,  SP::Q8,   ~0U,  ~0U,
+  SP::Q1,  SP::Q9,   ~0U,  ~0U,
+  SP::Q2,  SP::Q10,  ~0U,  ~0U,
+  SP::Q3,  SP::Q11,  ~0U,  ~0U,
+  SP::Q4,  SP::Q12,  ~0U,  ~0U,
+  SP::Q5,  SP::Q13,  ~0U,  ~0U,
+  SP::Q6,  SP::Q14,  ~0U,  ~0U,
+  SP::Q7,  SP::Q15,  ~0U,  ~0U } ;
+
+static const unsigned FCCRegDecoderTable[] = {
+  SP::FCC0, SP::FCC1, SP::FCC2, SP::FCC3 };
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = IntRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = IntRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = FPRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = DFPRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = QFPRegDecoderTable[RegNo];
+  if (Reg == ~0U)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 3)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(FCCRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                 const void *Decoder);
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn,
+                                  uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn,
+                                 uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeJMPL(MCInst &Inst, unsigned insn, uint64_t Address,
+                               const void *Decoder);
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+                                 const void *Decoder);
+static DecodeStatus DecodeSWAP(MCInst &Inst, unsigned insn, uint64_t Address,
+                               const void *Decoder);
+
+#include "SparcGenDisassemblerTables.inc"
+
+/// readInstruction - read four bytes from the MemoryObject
+/// and return 32 bit word.
+static DecodeStatus readInstruction32(const MemoryObject &region,
+                                      uint64_t address,
+                                      uint64_t &size,
+                                      uint32_t &insn) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 4 Bytes of data.
+  if (region.readBytes(address, 4, Bytes) == -1) {
+    size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a big-endian 32-bit word in the stream.
+  insn = (Bytes[3] <<  0) |
+    (Bytes[2] <<  8) |
+    (Bytes[1] << 16) |
+    (Bytes[0] << 24);
+
+  return MCDisassembler::Success;
+}
+
+
+DecodeStatus
+SparcDisassembler::getInstruction(MCInst &instr,
+                                 uint64_t &Size,
+                                 const MemoryObject &Region,
+                                 uint64_t Address,
+                                 raw_ostream &vStream,
+                                 raw_ostream &cStream) const {
+  uint32_t Insn;
+
+  DecodeStatus Result = readInstruction32(Region, Address, Size, Insn);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTableSparc32, instr, Insn, Address,
+                             this, STI);
+
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+                                   const void *Decoder);
+
+static DecodeStatus DecodeMem(MCInst &MI, unsigned insn, uint64_t Address,
+                              const void *Decoder,
+                              bool isLoad, DecodeFunc DecodeRD) {
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  bool isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  DecodeStatus status;
+  if (isLoad) {
+    status = DecodeRD(MI, rd, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  // Decode rs1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode imm|rs2.
+  if (isImm)
+    MI.addOperand(MCOperand::CreateImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  if (!isLoad) {
+    status = DecodeRD(MI, rd, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeIntRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeDFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeQFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeIntRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeDFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeQFPRegsRegisterClass);
+}
+
+static bool tryAddingSymbolicOperand(int64_t Value,  bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+                                       Offset, Width);
+}
+
+static DecodeStatus DecodeCall(MCInst &MI, unsigned insn,
+                               uint64_t Address, const void *Decoder) {
+  unsigned tgt = fieldFromInstruction(insn, 0, 30);
+  tgt <<= 2;
+  if (!tryAddingSymbolicOperand(tgt+Address, false, Address,
+                                0, 30, MI, Decoder))
+    MI.addOperand(MCOperand::CreateImm(tgt));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned tgt = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  MI.addOperand(MCOperand::CreateImm(tgt));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJMPL(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RD.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::CreateImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RS1.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS2 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::CreateImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RD.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::CreateImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
new file mode 100644
index 0000000..5975a51
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -0,0 +1,178 @@
+//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstPrinter.h"
+#include "Sparc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// The generated AsmMatcher SparcGenAsmWriter uses "Sparc" as the target
+// namespace. But SPARC backend uses "SP" as its namespace.
+namespace llvm {
+namespace Sparc {
+  using namespace SP;
+}
+}
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "SparcGenAsmWriter.inc"
+
+bool SparcInstPrinter::isV9() const {
+  return (STI.getFeatureBits() & Sparc::FeatureV9) != 0;
+}
+
+void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const
+{
+  OS << '%' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void SparcInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot)
+{
+  if (!printAliasInstr(MI, O) && !printSparcAliasInstr(MI, O))
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, raw_ostream &O)
+{
+  switch (MI->getOpcode()) {
+  default: return false;
+  case SP::JMPLrr:
+  case SP::JMPLri: {
+    if (MI->getNumOperands() != 3)
+      return false;
+    if (!MI->getOperand(0).isReg())
+      return false;
+    switch (MI->getOperand(0).getReg()) {
+    default: return false;
+    case SP::G0: // jmp $addr | ret | retl
+      if (MI->getOperand(2).isImm() &&
+          MI->getOperand(2).getImm() == 8) {
+        switch(MI->getOperand(1).getReg()) {
+        default: break;
+        case SP::I7: O << "\tret"; return true;
+        case SP::O7: O << "\tretl"; return true;
+        }
+      }
+      O << "\tjmp "; printMemOperand(MI, 1, O);
+      return true;
+    case SP::O7: // call $addr
+      O << "\tcall "; printMemOperand(MI, 1, O);
+      return true;
+    }
+  }
+  case SP::V9FCMPS:  case SP::V9FCMPD:  case SP::V9FCMPQ:
+  case SP::V9FCMPES: case SP::V9FCMPED: case SP::V9FCMPEQ: {
+    if (isV9()
+        || (MI->getNumOperands() != 3)
+        || (!MI->getOperand(0).isReg())
+        || (MI->getOperand(0).getReg() != SP::FCC0))
+      return false;
+    // if V8, skip printing %fcc0.
+    switch(MI->getOpcode()) {
+    default:
+    case SP::V9FCMPS:  O << "\tfcmps "; break;
+    case SP::V9FCMPD:  O << "\tfcmpd "; break;
+    case SP::V9FCMPQ:  O << "\tfcmpq "; break;
+    case SP::V9FCMPES: O << "\tfcmpes "; break;
+    case SP::V9FCMPED: O << "\tfcmped "; break;
+    case SP::V9FCMPEQ: O << "\tfcmpeq "; break;
+    }
+    printOperand(MI, 1, O);
+    O << ", ";
+    printOperand(MI, 2, O);
+    return true;
+  }
+  }
+}
+
+void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
+                                    raw_ostream &O)
+{
+  const MCOperand &MO = MI->getOperand (opNum);
+
+  if (MO.isReg()) {
+    printRegName(O, MO.getReg());
+    return ;
+  }
+
+  if (MO.isImm()) {
+    O << (int)MO.getImm();
+    return;
+  }
+
+  assert(MO.isExpr() && "Unknown operand kind in printOperand");
+  MO.getExpr()->print(O);
+}
+
+void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier)
+{
+  printOperand(MI, opNum, O);
+
+  // If this is an ADD operand, emit it like normal operands.
+  if (Modifier && !strcmp(Modifier, "arith")) {
+    O << ", ";
+    printOperand(MI, opNum+1, O);
+    return;
+  }
+  const MCOperand &MO = MI->getOperand(opNum+1);
+
+  if (MO.isReg() && MO.getReg() == SP::G0)
+    return;   // don't print "+%g0"
+  if (MO.isImm() && MO.getImm() == 0)
+    return;   // don't print "+0"
+
+  O << "+";
+
+  printOperand(MI, opNum+1, O);
+}
+
+void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum,
+                                     raw_ostream &O)
+{
+  int CC = (int)MI->getOperand(opNum).getImm();
+  switch (MI->getOpcode()) {
+  default: break;
+  case SP::FBCOND:
+  case SP::FBCONDA:
+  case SP::BPFCC:
+  case SP::BPFCCA:
+  case SP::BPFCCNT:
+  case SP::BPFCCANT:
+  case SP::MOVFCCrr:  case SP::V9MOVFCCrr:
+  case SP::MOVFCCri:  case SP::V9MOVFCCri:
+  case SP::FMOVS_FCC: case SP::V9FMOVS_FCC:
+  case SP::FMOVD_FCC: case SP::V9FMOVD_FCC:
+  case SP::FMOVQ_FCC: case SP::V9FMOVQ_FCC:
+    // Make sure CC is a fp conditional flag.
+    CC = (CC < 16) ? (CC + 16) : CC;
+    break;
+  }
+  O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+}
+
+bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum,
+                                  raw_ostream &O)
+{
+  llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX.");
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
new file mode 100644
index 0000000..8fe4075
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -0,0 +1,54 @@
+//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SparcINSTPRINTER_H
+#define SparcINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class SparcInstPrinter : public MCInstPrinter {
+  const MCSubtargetInfo &STI;
+public:
+ SparcInstPrinter(const MCAsmInfo &MAI,
+                  const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI,
+                  const MCSubtargetInfo &sti)
+   : MCInstPrinter(MAI, MII, MRI), STI(sti) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+  bool printSparcAliasInstr(const MCInst *MI, raw_ostream &OS);
+  bool isV9() const;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, int opNum, raw_ostream &OS);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &OS,
+                       const char *Modifier = nullptr);
+  void printCCOperand(const MCInst *MI, int opNum, raw_ostream &OS);
+  bool printGetPCX(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
new file mode 100644
index 0000000..dcd81e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -0,0 +1,261 @@
+//===-- SparcAsmBackend.cpp - Sparc Assembler Backend ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+    return Value;
+
+  case Sparc::fixup_sparc_wplt30:
+  case Sparc::fixup_sparc_call30:
+    return (Value >> 2) & 0x3fffffff;
+
+  case Sparc::fixup_sparc_br22:
+    return (Value >> 2) & 0x3fffff;
+
+  case Sparc::fixup_sparc_br19:
+    return (Value >> 2) & 0x7ffff;
+
+  case Sparc::fixup_sparc_br16_2:
+    return (Value >> 2) & 0xc000;
+
+  case Sparc::fixup_sparc_br16_14:
+    return (Value >> 2) & 0x3fff;
+
+  case Sparc::fixup_sparc_pc22:
+  case Sparc::fixup_sparc_got22:
+  case Sparc::fixup_sparc_tls_gd_hi22:
+  case Sparc::fixup_sparc_tls_ldm_hi22:
+  case Sparc::fixup_sparc_tls_ie_hi22:
+  case Sparc::fixup_sparc_hi22:
+    return (Value >> 10) & 0x3fffff;
+
+  case Sparc::fixup_sparc_pc10:
+  case Sparc::fixup_sparc_got10:
+  case Sparc::fixup_sparc_tls_gd_lo10:
+  case Sparc::fixup_sparc_tls_ldm_lo10:
+  case Sparc::fixup_sparc_tls_ie_lo10:
+  case Sparc::fixup_sparc_lo10:
+    return Value & 0x3ff;
+
+  case Sparc::fixup_sparc_tls_ldo_hix22:
+  case Sparc::fixup_sparc_tls_le_hix22:
+    return (~Value >> 10) & 0x3fffff;
+
+  case Sparc::fixup_sparc_tls_ldo_lox10:
+  case Sparc::fixup_sparc_tls_le_lox10:
+    return (~(~Value & 0x3ff)) & 0x1fff;
+
+  case Sparc::fixup_sparc_h44:
+    return (Value >> 22) & 0x3fffff;
+
+  case Sparc::fixup_sparc_m44:
+    return (Value >> 12) & 0x3ff;
+
+  case Sparc::fixup_sparc_l44:
+    return Value & 0xfff;
+
+  case Sparc::fixup_sparc_hh:
+    return (Value >> 42) & 0x3fffff;
+
+  case Sparc::fixup_sparc_hm:
+    return (Value >> 32) & 0x3ff;
+
+  case Sparc::fixup_sparc_tls_gd_add:
+  case Sparc::fixup_sparc_tls_gd_call:
+  case Sparc::fixup_sparc_tls_ldm_add:
+  case Sparc::fixup_sparc_tls_ldm_call:
+  case Sparc::fixup_sparc_tls_ldo_add:
+  case Sparc::fixup_sparc_tls_ie_ld:
+  case Sparc::fixup_sparc_tls_ie_ldx:
+  case Sparc::fixup_sparc_tls_ie_add:
+    return 0;
+  }
+}
+
+namespace {
+  class SparcAsmBackend : public MCAsmBackend {
+    const Target &TheTarget;
+  public:
+    SparcAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
+
+    unsigned getNumFixupKinds() const override {
+      return Sparc::NumTargetFixupKinds;
+    }
+
+    const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+      const static MCFixupKindInfo Infos[Sparc::NumTargetFixupKinds] = {
+        // name                    offset bits  flags
+        { "fixup_sparc_call30",     2,     30,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br22",      10,     22,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br19",      13,     19,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br16_2",    10,      2,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br16_14",   18,     14,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_hi22",      10,     22,  0 },
+        { "fixup_sparc_lo10",      22,     10,  0 },
+        { "fixup_sparc_h44",       10,     22,  0 },
+        { "fixup_sparc_m44",       22,     10,  0 },
+        { "fixup_sparc_l44",       20,     12,  0 },
+        { "fixup_sparc_hh",        10,     22,  0 },
+        { "fixup_sparc_hm",        22,     10,  0 },
+        { "fixup_sparc_pc22",      10,     22,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_pc10",      22,     10,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_got22",     10,     22,  0 },
+        { "fixup_sparc_got10",     22,     10,  0 },
+        { "fixup_sparc_wplt30",     2,     30,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_tls_gd_hi22",   10, 22,  0 },
+        { "fixup_sparc_tls_gd_lo10",   22, 10,  0 },
+        { "fixup_sparc_tls_gd_add",     0,  0,  0 },
+        { "fixup_sparc_tls_gd_call",    0,  0,  0 },
+        { "fixup_sparc_tls_ldm_hi22",  10, 22,  0 },
+        { "fixup_sparc_tls_ldm_lo10",  22, 10,  0 },
+        { "fixup_sparc_tls_ldm_add",    0,  0,  0 },
+        { "fixup_sparc_tls_ldm_call",   0,  0,  0 },
+        { "fixup_sparc_tls_ldo_hix22", 10, 22,  0 },
+        { "fixup_sparc_tls_ldo_lox10", 22, 10,  0 },
+        { "fixup_sparc_tls_ldo_add",    0,  0,  0 },
+        { "fixup_sparc_tls_ie_hi22",   10, 22,  0 },
+        { "fixup_sparc_tls_ie_lo10",   22, 10,  0 },
+        { "fixup_sparc_tls_ie_ld",      0,  0,  0 },
+        { "fixup_sparc_tls_ie_ldx",     0,  0,  0 },
+        { "fixup_sparc_tls_ie_add",     0,  0,  0 },
+        { "fixup_sparc_tls_le_hix22",   0,  0,  0 },
+        { "fixup_sparc_tls_le_lox10",   0,  0,  0 }
+      };
+
+      if (Kind < FirstTargetFixupKind)
+        return MCAsmBackend::getFixupKindInfo(Kind);
+
+      assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+             "Invalid kind!");
+      return Infos[Kind - FirstTargetFixupKind];
+    }
+
+    void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                           const MCFixup &Fixup, const MCFragment *DF,
+                           const MCValue &Target, uint64_t &Value,
+                           bool &IsResolved) override {
+      switch ((Sparc::Fixups)Fixup.getKind()) {
+      default: break;
+      case Sparc::fixup_sparc_wplt30:
+        if (Target.getSymA()->getSymbol().isTemporary())
+          return;
+      case Sparc::fixup_sparc_tls_gd_hi22:
+      case Sparc::fixup_sparc_tls_gd_lo10:
+      case Sparc::fixup_sparc_tls_gd_add:
+      case Sparc::fixup_sparc_tls_gd_call:
+      case Sparc::fixup_sparc_tls_ldm_hi22:
+      case Sparc::fixup_sparc_tls_ldm_lo10:
+      case Sparc::fixup_sparc_tls_ldm_add:
+      case Sparc::fixup_sparc_tls_ldm_call:
+      case Sparc::fixup_sparc_tls_ldo_hix22:
+      case Sparc::fixup_sparc_tls_ldo_lox10:
+      case Sparc::fixup_sparc_tls_ldo_add:
+      case Sparc::fixup_sparc_tls_ie_hi22:
+      case Sparc::fixup_sparc_tls_ie_lo10:
+      case Sparc::fixup_sparc_tls_ie_ld:
+      case Sparc::fixup_sparc_tls_ie_ldx:
+      case Sparc::fixup_sparc_tls_ie_add:
+      case Sparc::fixup_sparc_tls_le_hix22:
+      case Sparc::fixup_sparc_tls_le_lox10:  IsResolved = false; break;
+      }
+    }
+
+    bool mayNeedRelaxation(const MCInst &Inst) const override {
+      // FIXME.
+      return false;
+    }
+
+    /// fixupNeedsRelaxation - Target specific predicate for whether a given
+    /// fixup requires the associated instruction to be relaxed.
+    bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                              uint64_t Value,
+                              const MCRelaxableFragment *DF,
+                              const MCAsmLayout &Layout) const override {
+      // FIXME.
+      llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+      return false;
+    }
+    void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+      // FIXME.
+      llvm_unreachable("relaxInstruction() unimplemented");
+    }
+
+    bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+      // Cannot emit NOP with size not multiple of 32 bits.
+      if (Count % 4 != 0)
+        return false;
+
+      uint64_t NumNops = Count / 4;
+      for (uint64_t i = 0; i != NumNops; ++i)
+        OW->Write32(0x01000000);
+
+      return true;
+    }
+
+    bool is64Bit() const {
+      StringRef name = TheTarget.getName();
+      return name == "sparcv9";
+    }
+  };
+
+  class ELFSparcAsmBackend : public SparcAsmBackend {
+    Triple::OSType OSType;
+  public:
+    ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) :
+      SparcAsmBackend(T), OSType(OSType) { }
+
+    void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                    uint64_t Value, bool IsPCRel) const override {
+
+      Value = adjustFixupValue(Fixup.getKind(), Value);
+      if (!Value) return;           // Doesn't change encoding.
+
+      unsigned Offset = Fixup.getOffset();
+
+      // For each byte of the fragment that the fixup touches, mask in the bits
+      // from the fixup value. The Value has been "split up" into the
+      // appropriate bitfields above.
+      for (unsigned i = 0; i != 4; ++i)
+        Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+
+    }
+
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+      uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+      return createSparcELFObjectWriter(OS, is64Bit(), OSABI);
+    }
+  };
+
+} // end anonymous namespace
+
+
+MCAsmBackend *llvm::createSparcAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          StringRef TT,
+                                          StringRef CPU) {
+  return new ELFSparcAsmBackend(T, Triple(TT).getOS());
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
new file mode 100644
index 0000000..5ba82f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -0,0 +1,112 @@
+//===-- SparcELFObjectWriter.cpp - Sparc ELF Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class SparcELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    SparcELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+      : MCELFObjectTargetWriter(Is64Bit, OSABI,
+                                Is64Bit ?  ELF::EM_SPARCV9 : ELF::EM_SPARC,
+                                /*HasRelocationAddend*/ true) {}
+
+    virtual ~SparcELFObjectWriter() {}
+  protected:
+    unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsPCRel) const override;
+  };
+}
+
+unsigned SparcELFObjectWriter::GetRelocType(const MCValue &Target,
+                                            const MCFixup &Fixup,
+                                            bool IsPCRel) const {
+
+  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) {
+    if (SExpr->getKind() == SparcMCExpr::VK_Sparc_R_DISP32)
+      return ELF::R_SPARC_DISP32;
+  }
+
+  if (IsPCRel) {
+    switch((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented fixup -> relocation");
+    case FK_Data_1:                  return ELF::R_SPARC_DISP8;
+    case FK_Data_2:                  return ELF::R_SPARC_DISP16;
+    case FK_Data_4:                  return ELF::R_SPARC_DISP32;
+    case FK_Data_8:                  return ELF::R_SPARC_DISP64;
+    case Sparc::fixup_sparc_call30:  return ELF::R_SPARC_WDISP30;
+    case Sparc::fixup_sparc_br22:    return ELF::R_SPARC_WDISP22;
+    case Sparc::fixup_sparc_br19:    return ELF::R_SPARC_WDISP19;
+    case Sparc::fixup_sparc_pc22:    return ELF::R_SPARC_PC22;
+    case Sparc::fixup_sparc_pc10:    return ELF::R_SPARC_PC10;
+    case Sparc::fixup_sparc_wplt30:  return ELF::R_SPARC_WPLT30;
+    }
+  }
+
+  switch((unsigned)Fixup.getKind()) {
+  default:
+    llvm_unreachable("Unimplemented fixup -> relocation");
+  case FK_Data_1:                return ELF::R_SPARC_8;
+  case FK_Data_2:                return ((Fixup.getOffset() % 2)
+                                         ? ELF::R_SPARC_UA16
+                                         : ELF::R_SPARC_16);
+  case FK_Data_4:                return ((Fixup.getOffset() % 4)
+                                         ? ELF::R_SPARC_UA32
+                                         : ELF::R_SPARC_32);
+  case FK_Data_8:                return ((Fixup.getOffset() % 8)
+                                         ? ELF::R_SPARC_UA64
+                                         : ELF::R_SPARC_64);
+  case Sparc::fixup_sparc_hi22:  return ELF::R_SPARC_HI22;
+  case Sparc::fixup_sparc_lo10:  return ELF::R_SPARC_LO10;
+  case Sparc::fixup_sparc_h44:   return ELF::R_SPARC_H44;
+  case Sparc::fixup_sparc_m44:   return ELF::R_SPARC_M44;
+  case Sparc::fixup_sparc_l44:   return ELF::R_SPARC_L44;
+  case Sparc::fixup_sparc_hh:    return ELF::R_SPARC_HH22;
+  case Sparc::fixup_sparc_hm:    return ELF::R_SPARC_HM10;
+  case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
+  case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
+  case Sparc::fixup_sparc_tls_gd_hi22:   return ELF::R_SPARC_TLS_GD_HI22;
+  case Sparc::fixup_sparc_tls_gd_lo10:   return ELF::R_SPARC_TLS_GD_LO10;
+  case Sparc::fixup_sparc_tls_gd_add:    return ELF::R_SPARC_TLS_GD_ADD;
+  case Sparc::fixup_sparc_tls_gd_call:   return ELF::R_SPARC_TLS_GD_CALL;
+  case Sparc::fixup_sparc_tls_ldm_hi22:  return ELF::R_SPARC_TLS_LDM_HI22;
+  case Sparc::fixup_sparc_tls_ldm_lo10:  return ELF::R_SPARC_TLS_LDM_LO10;
+  case Sparc::fixup_sparc_tls_ldm_add:   return ELF::R_SPARC_TLS_LDM_ADD;
+  case Sparc::fixup_sparc_tls_ldm_call:  return ELF::R_SPARC_TLS_LDM_CALL;
+  case Sparc::fixup_sparc_tls_ldo_hix22: return ELF::R_SPARC_TLS_LDO_HIX22;
+  case Sparc::fixup_sparc_tls_ldo_lox10: return ELF::R_SPARC_TLS_LDO_LOX10;
+  case Sparc::fixup_sparc_tls_ldo_add:   return ELF::R_SPARC_TLS_LDO_ADD;
+  case Sparc::fixup_sparc_tls_ie_hi22:   return ELF::R_SPARC_TLS_IE_HI22;
+  case Sparc::fixup_sparc_tls_ie_lo10:   return ELF::R_SPARC_TLS_IE_LO10;
+  case Sparc::fixup_sparc_tls_ie_ld:     return ELF::R_SPARC_TLS_IE_LD;
+  case Sparc::fixup_sparc_tls_ie_ldx:    return ELF::R_SPARC_TLS_IE_LDX;
+  case Sparc::fixup_sparc_tls_ie_add:    return ELF::R_SPARC_TLS_IE_ADD;
+  case Sparc::fixup_sparc_tls_le_hix22:  return ELF::R_SPARC_TLS_LE_HIX22;
+  case Sparc::fixup_sparc_tls_le_lox10:  return ELF::R_SPARC_TLS_LE_LOX10;
+  }
+
+  return ELF::R_SPARC_NONE;
+}
+
+MCObjectWriter *llvm::createSparcELFObjectWriter(raw_ostream &OS,
+                                                 bool Is64Bit,
+                                                 uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new SparcELFObjectWriter(Is64Bit, OSABI);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
new file mode 100644
index 0000000..d42bcee
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -0,0 +1,97 @@
+//===-- SparcFixupKinds.h - Sparc Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SPARC_FIXUPKINDS_H
+#define LLVM_SPARC_FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+  namespace Sparc {
+    enum Fixups {
+      // fixup_sparc_call30 - 30-bit PC relative relocation for call
+      fixup_sparc_call30 = FirstTargetFixupKind,
+
+      /// fixup_sparc_br22 - 22-bit PC relative relocation for
+      /// branches
+      fixup_sparc_br22,
+
+      /// fixup_sparc_br19 - 19-bit PC relative relocation for
+      /// branches on icc/xcc
+      fixup_sparc_br19,
+
+      /// fixup_sparc_bpr  - 16-bit fixup for bpr
+      fixup_sparc_br16_2,
+      fixup_sparc_br16_14,
+
+      /// fixup_sparc_hi22  - 22-bit fixup corresponding to %hi(foo)
+      /// for sethi
+      fixup_sparc_hi22,
+
+      /// fixup_sparc_lo10  - 10-bit fixup corresponding to %lo(foo)
+      fixup_sparc_lo10,
+
+      /// fixup_sparc_h44  - 22-bit fixup corresponding to %h44(foo)
+      fixup_sparc_h44,
+
+      /// fixup_sparc_m44  - 10-bit fixup corresponding to %m44(foo)
+      fixup_sparc_m44,
+
+      /// fixup_sparc_l44  - 12-bit fixup corresponding to %l44(foo)
+      fixup_sparc_l44,
+
+      /// fixup_sparc_hh  -  22-bit fixup corresponding to %hh(foo)
+      fixup_sparc_hh,
+
+      /// fixup_sparc_hm  -  10-bit fixup corresponding to %hm(foo)
+      fixup_sparc_hm,
+
+      /// fixup_sparc_pc22 - 22-bit fixup corresponding to %pc22(foo)
+      fixup_sparc_pc22,
+
+      /// fixup_sparc_pc10 - 10-bit fixup corresponding to %pc10(foo)
+      fixup_sparc_pc10,
+
+      /// fixup_sparc_got22 - 22-bit fixup corresponding to %got22(foo)
+      fixup_sparc_got22,
+
+      /// fixup_sparc_got10 - 10-bit fixup corresponding to %got10(foo)
+      fixup_sparc_got10,
+
+      /// fixup_sparc_wplt30
+      fixup_sparc_wplt30,
+
+      /// fixups for Thread Local Storage
+      fixup_sparc_tls_gd_hi22,
+      fixup_sparc_tls_gd_lo10,
+      fixup_sparc_tls_gd_add,
+      fixup_sparc_tls_gd_call,
+      fixup_sparc_tls_ldm_hi22,
+      fixup_sparc_tls_ldm_lo10,
+      fixup_sparc_tls_ldm_add,
+      fixup_sparc_tls_ldm_call,
+      fixup_sparc_tls_ldo_hix22,
+      fixup_sparc_tls_ldo_lox10,
+      fixup_sparc_tls_ldo_add,
+      fixup_sparc_tls_ie_hi22,
+      fixup_sparc_tls_ie_lo10,
+      fixup_sparc_tls_ie_ld,
+      fixup_sparc_tls_ie_ldx,
+      fixup_sparc_tls_ie_add,
+      fixup_sparc_tls_le_hix22,
+      fixup_sparc_tls_le_lox10,
+
+      // Marker
+      LastTargetFixupKind,
+      NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
new file mode 100644
index 0000000..df66ca9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -0,0 +1,74 @@
+//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "SparcMCExpr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+void SparcELFMCAsmInfo::anchor() { }
+
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
+  IsLittleEndian = false;
+  Triple TheTriple(TT);
+  bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
+
+  if (isV9) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  // .xword is only supported by V9.
+  Data64bitsDirective = (isV9) ? "\t.xword\t" : nullptr;
+  ZeroDirective = "\t.skip\t";
+  CommentString = "!";
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+
+  if (TheTriple.getOS() == llvm::Triple::Solaris ||
+      TheTriple.getOS() == llvm::Triple::OpenBSD)
+    UseIntegratedAssembler = true;
+}
+
+const MCExpr*
+SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                               unsigned Encoding,
+                                               MCStreamer &Streamer) const {
+  if (Encoding & dwarf::DW_EH_PE_pcrel) {
+    MCContext &Ctx = Streamer.getContext();
+    return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+  }
+
+  return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
+}
+
+const MCExpr*
+SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
+                                       unsigned Encoding,
+                                       MCStreamer &Streamer) const {
+  if (Encoding & dwarf::DW_EH_PE_pcrel) {
+    MCContext &Ctx = Streamer.getContext();
+    return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+  }
+  return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
new file mode 100644
index 0000000..e126b68
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -0,0 +1,37 @@
+//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETASMINFO_H
+#define SPARCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class StringRef;
+
+class SparcELFMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+public:
+  explicit SparcELFMCAsmInfo(StringRef TT);
+  const MCExpr*
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+  const MCExpr* getExprForFDESymbol(const MCSymbol *Sym,
+                                    unsigned Encoding,
+                                    MCStreamer &Streamer) const override;
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
new file mode 100644
index 0000000..eea9626
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -0,0 +1,218 @@
+//===-- SparcMCCodeEmitter.cpp - Convert Sparc code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SparcMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class SparcMCCodeEmitter : public MCCodeEmitter {
+  SparcMCCodeEmitter(const SparcMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const SparcMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  MCContext &Ctx;
+
+public:
+  SparcMCCodeEmitter(MCContext &ctx): Ctx(ctx) {}
+
+  ~SparcMCCodeEmitter() {}
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const;
+
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  return new SparcMCCodeEmitter(Ctx);
+}
+
+void SparcMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+  // Output the constant in big endian byte order.
+  for (unsigned i = 0; i != 4; ++i) {
+    OS << (char)(Bits >> 24);
+    Bits <<= 8;
+  }
+  unsigned tlsOpNo = 0;
+  switch (MI.getOpcode()) {
+  default: break;
+  case SP::TLS_CALL:   tlsOpNo = 1; break;
+  case SP::TLS_ADDrr:
+  case SP::TLS_ADDXrr:
+  case SP::TLS_LDrr:
+  case SP::TLS_LDXrr:  tlsOpNo = 3; break;
+  }
+  if (tlsOpNo != 0) {
+    const MCOperand &MO = MI.getOperand(tlsOpNo);
+    uint64_t op = getMachineOpValue(MI, MO, Fixups, STI);
+    assert(op == 0 && "Unexpected operand value!");
+    (void)op; // suppress warning.
+  }
+
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+
+unsigned SparcMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr());
+  const MCExpr *Expr = MO.getExpr();
+  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+    MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind();
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+    return 0;
+  }
+
+  int64_t Res;
+  if (Expr->EvaluateAsAbsolute(Res))
+    return Res;
+
+  llvm_unreachable("Unhandled expression!");
+  return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  if (MI.getOpcode() == SP::TLS_CALL) {
+    // No fixups for __tls_get_addr. Will emit for fixups for tls_symbol in
+    // EncodeInstruction.
+#ifndef NDEBUG
+    // Verify that the callee is actually __tls_get_addr.
+    const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr());
+    assert(SExpr && SExpr->getSubExpr()->getKind() == MCExpr::SymbolRef &&
+           "Unexpected expression in TLS_CALL");
+    const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(SExpr->getSubExpr());
+    assert(SymExpr->getSymbol().getName() == "__tls_get_addr" &&
+           "Unexpected function for TLS_CALL");
+#endif
+    return 0;
+  }
+
+  MCFixupKind fixupKind = (MCFixupKind)Sparc::fixup_sparc_call30;
+
+  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr())) {
+    if (SExpr->getKind() == SparcMCExpr::VK_Sparc_WPLT30)
+      fixupKind = (MCFixupKind)Sparc::fixup_sparc_wplt30;
+  }
+
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), fixupKind));
+
+  return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br22));
+  return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br19));
+  return 0;
+}
+unsigned SparcMCCodeEmitter::
+getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br16_2));
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br16_14));
+
+  return 0;
+}
+
+
+
+#include "SparcGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
new file mode 100644
index 0000000..7f01ab0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -0,0 +1,225 @@
+//===-- SparcMCExpr.cpp - Sparc specific MC expression classes --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the Sparc architecture (e.g. "%hi", "%lo", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/ELF.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparcmcexpr"
+
+const SparcMCExpr*
+SparcMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                      MCContext &Ctx) {
+    return new (Ctx) SparcMCExpr(Kind, Expr);
+}
+
+
+
+void SparcMCExpr::PrintImpl(raw_ostream &OS) const
+{
+
+  bool closeParen = printVariantKind(OS, Kind);
+
+  const MCExpr *Expr = getSubExpr();
+  Expr->print(OS);
+
+  if (closeParen)
+    OS << ')';
+}
+
+bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
+{
+  bool closeParen = true;
+  switch (Kind) {
+  case VK_Sparc_None:     closeParen = false; break;
+  case VK_Sparc_LO:       OS << "%lo(";  break;
+  case VK_Sparc_HI:       OS << "%hi(";  break;
+  case VK_Sparc_H44:      OS << "%h44("; break;
+  case VK_Sparc_M44:      OS << "%m44("; break;
+  case VK_Sparc_L44:      OS << "%l44("; break;
+  case VK_Sparc_HH:       OS << "%hh(";  break;
+  case VK_Sparc_HM:       OS << "%hm(";  break;
+    // FIXME: use %pc22/%pc10, if system assembler supports them.
+  case VK_Sparc_PC22:     OS << "%hi("; break;
+  case VK_Sparc_PC10:     OS << "%lo("; break;
+    // FIXME: use %got22/%got10, if system assembler supports them.
+  case VK_Sparc_GOT22:    OS << "%hi("; break;
+  case VK_Sparc_GOT10:    OS << "%lo("; break;
+  case VK_Sparc_WPLT30:   closeParen = false; break;
+  case VK_Sparc_R_DISP32: OS << "%r_disp32("; break;
+  case VK_Sparc_TLS_GD_HI22:   OS << "%tgd_hi22(";   break;
+  case VK_Sparc_TLS_GD_LO10:   OS << "%tgd_lo10(";   break;
+  case VK_Sparc_TLS_GD_ADD:    OS << "%tgd_add(";    break;
+  case VK_Sparc_TLS_GD_CALL:   OS << "%tgd_call(";   break;
+  case VK_Sparc_TLS_LDM_HI22:  OS << "%tldm_hi22(";  break;
+  case VK_Sparc_TLS_LDM_LO10:  OS << "%tldm_lo10(";  break;
+  case VK_Sparc_TLS_LDM_ADD:   OS << "%tldm_add(";   break;
+  case VK_Sparc_TLS_LDM_CALL:  OS << "%tldm_call(";  break;
+  case VK_Sparc_TLS_LDO_HIX22: OS << "%tldo_hix22("; break;
+  case VK_Sparc_TLS_LDO_LOX10: OS << "%tldo_lox10("; break;
+  case VK_Sparc_TLS_LDO_ADD:   OS << "%tldo_add(";   break;
+  case VK_Sparc_TLS_IE_HI22:   OS << "%tie_hi22(";   break;
+  case VK_Sparc_TLS_IE_LO10:   OS << "%tie_lo10(";   break;
+  case VK_Sparc_TLS_IE_LD:     OS << "%tie_ld(";     break;
+  case VK_Sparc_TLS_IE_LDX:    OS << "%tie_ldx(";    break;
+  case VK_Sparc_TLS_IE_ADD:    OS << "%tie_add(";    break;
+  case VK_Sparc_TLS_LE_HIX22:  OS << "%tle_hix22(";  break;
+  case VK_Sparc_TLS_LE_LOX10:  OS << "%tle_lox10(";  break;
+  }
+  return closeParen;
+}
+
+SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
+{
+  return StringSwitch<SparcMCExpr::VariantKind>(name)
+    .Case("lo",  VK_Sparc_LO)
+    .Case("hi",  VK_Sparc_HI)
+    .Case("h44", VK_Sparc_H44)
+    .Case("m44", VK_Sparc_M44)
+    .Case("l44", VK_Sparc_L44)
+    .Case("hh",  VK_Sparc_HH)
+    .Case("hm",  VK_Sparc_HM)
+    .Case("pc22",  VK_Sparc_PC22)
+    .Case("pc10",  VK_Sparc_PC10)
+    .Case("got22", VK_Sparc_GOT22)
+    .Case("got10", VK_Sparc_GOT10)
+    .Case("r_disp32",   VK_Sparc_R_DISP32)
+    .Case("tgd_hi22",   VK_Sparc_TLS_GD_HI22)
+    .Case("tgd_lo10",   VK_Sparc_TLS_GD_LO10)
+    .Case("tgd_add",    VK_Sparc_TLS_GD_ADD)
+    .Case("tgd_call",   VK_Sparc_TLS_GD_CALL)
+    .Case("tldm_hi22",  VK_Sparc_TLS_LDM_HI22)
+    .Case("tldm_lo10",  VK_Sparc_TLS_LDM_LO10)
+    .Case("tldm_add",   VK_Sparc_TLS_LDM_ADD)
+    .Case("tldm_call",  VK_Sparc_TLS_LDM_CALL)
+    .Case("tldo_hix22", VK_Sparc_TLS_LDO_HIX22)
+    .Case("tldo_lox10", VK_Sparc_TLS_LDO_LOX10)
+    .Case("tldo_add",   VK_Sparc_TLS_LDO_ADD)
+    .Case("tie_hi22",   VK_Sparc_TLS_IE_HI22)
+    .Case("tie_lo10",   VK_Sparc_TLS_IE_LO10)
+    .Case("tie_ld",     VK_Sparc_TLS_IE_LD)
+    .Case("tie_ldx",    VK_Sparc_TLS_IE_LDX)
+    .Case("tie_add",    VK_Sparc_TLS_IE_ADD)
+    .Case("tle_hix22",  VK_Sparc_TLS_LE_HIX22)
+    .Case("tle_lox10",  VK_Sparc_TLS_LE_LOX10)
+    .Default(VK_Sparc_None);
+}
+
+Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
+  switch (Kind) {
+  default: llvm_unreachable("Unhandled SparcMCExpr::VariantKind");
+  case VK_Sparc_LO:       return Sparc::fixup_sparc_lo10;
+  case VK_Sparc_HI:       return Sparc::fixup_sparc_hi22;
+  case VK_Sparc_H44:      return Sparc::fixup_sparc_h44;
+  case VK_Sparc_M44:      return Sparc::fixup_sparc_m44;
+  case VK_Sparc_L44:      return Sparc::fixup_sparc_l44;
+  case VK_Sparc_HH:       return Sparc::fixup_sparc_hh;
+  case VK_Sparc_HM:       return Sparc::fixup_sparc_hm;
+  case VK_Sparc_PC22:     return Sparc::fixup_sparc_pc22;
+  case VK_Sparc_PC10:     return Sparc::fixup_sparc_pc10;
+  case VK_Sparc_GOT22:    return Sparc::fixup_sparc_got22;
+  case VK_Sparc_GOT10:    return Sparc::fixup_sparc_got10;
+  case VK_Sparc_WPLT30:   return Sparc::fixup_sparc_wplt30;
+  case VK_Sparc_TLS_GD_HI22:   return Sparc::fixup_sparc_tls_gd_hi22;
+  case VK_Sparc_TLS_GD_LO10:   return Sparc::fixup_sparc_tls_gd_lo10;
+  case VK_Sparc_TLS_GD_ADD:    return Sparc::fixup_sparc_tls_gd_add;
+  case VK_Sparc_TLS_GD_CALL:   return Sparc::fixup_sparc_tls_gd_call;
+  case VK_Sparc_TLS_LDM_HI22:  return Sparc::fixup_sparc_tls_ldm_hi22;
+  case VK_Sparc_TLS_LDM_LO10:  return Sparc::fixup_sparc_tls_ldm_lo10;
+  case VK_Sparc_TLS_LDM_ADD:   return Sparc::fixup_sparc_tls_ldm_add;
+  case VK_Sparc_TLS_LDM_CALL:  return Sparc::fixup_sparc_tls_ldm_call;
+  case VK_Sparc_TLS_LDO_HIX22: return Sparc::fixup_sparc_tls_ldo_hix22;
+  case VK_Sparc_TLS_LDO_LOX10: return Sparc::fixup_sparc_tls_ldo_lox10;
+  case VK_Sparc_TLS_LDO_ADD:   return Sparc::fixup_sparc_tls_ldo_add;
+  case VK_Sparc_TLS_IE_HI22:   return Sparc::fixup_sparc_tls_ie_hi22;
+  case VK_Sparc_TLS_IE_LO10:   return Sparc::fixup_sparc_tls_ie_lo10;
+  case VK_Sparc_TLS_IE_LD:     return Sparc::fixup_sparc_tls_ie_ld;
+  case VK_Sparc_TLS_IE_LDX:    return Sparc::fixup_sparc_tls_ie_ldx;
+  case VK_Sparc_TLS_IE_ADD:    return Sparc::fixup_sparc_tls_ie_add;
+  case VK_Sparc_TLS_LE_HIX22:  return Sparc::fixup_sparc_tls_le_hix22;
+  case VK_Sparc_TLS_LE_LOX10:  return Sparc::fixup_sparc_tls_le_lox10;
+  }
+}
+
+bool
+SparcMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                       const MCAsmLayout *Layout) const {
+  return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+    MCELF::SetType(SD, ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+
+}
+
+void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch(getKind()) {
+  default: return;
+  case VK_Sparc_TLS_GD_HI22:
+  case VK_Sparc_TLS_GD_LO10:
+  case VK_Sparc_TLS_GD_ADD:
+  case VK_Sparc_TLS_GD_CALL:
+  case VK_Sparc_TLS_LDM_HI22:
+  case VK_Sparc_TLS_LDM_LO10:
+  case VK_Sparc_TLS_LDM_ADD:
+  case VK_Sparc_TLS_LDM_CALL:
+  case VK_Sparc_TLS_LDO_HIX22:
+  case VK_Sparc_TLS_LDO_LOX10:
+  case VK_Sparc_TLS_LDO_ADD:
+  case VK_Sparc_TLS_IE_HI22:
+  case VK_Sparc_TLS_IE_LO10:
+  case VK_Sparc_TLS_IE_LD:
+  case VK_Sparc_TLS_IE_LDX:
+  case VK_Sparc_TLS_IE_ADD:
+  case VK_Sparc_TLS_LE_HIX22:
+  case VK_Sparc_TLS_LE_LOX10: break;
+  }
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
+void SparcMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
new file mode 100644
index 0000000..f0d0ef3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -0,0 +1,111 @@
+//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Sparc-specific MCExprs, used for modifiers like
+// "%hi" or "%lo" etc.,
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SPARCMCEXPR_H
+#define LLVM_SPARCMCEXPR_H
+
+#include "SparcFixupKinds.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class StringRef;
+class SparcMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_Sparc_None,
+    VK_Sparc_LO,
+    VK_Sparc_HI,
+    VK_Sparc_H44,
+    VK_Sparc_M44,
+    VK_Sparc_L44,
+    VK_Sparc_HH,
+    VK_Sparc_HM,
+    VK_Sparc_PC22,
+    VK_Sparc_PC10,
+    VK_Sparc_GOT22,
+    VK_Sparc_GOT10,
+    VK_Sparc_WPLT30,
+    VK_Sparc_R_DISP32,
+    VK_Sparc_TLS_GD_HI22,
+    VK_Sparc_TLS_GD_LO10,
+    VK_Sparc_TLS_GD_ADD,
+    VK_Sparc_TLS_GD_CALL,
+    VK_Sparc_TLS_LDM_HI22,
+    VK_Sparc_TLS_LDM_LO10,
+    VK_Sparc_TLS_LDM_ADD,
+    VK_Sparc_TLS_LDM_CALL,
+    VK_Sparc_TLS_LDO_HIX22,
+    VK_Sparc_TLS_LDO_LOX10,
+    VK_Sparc_TLS_LDO_ADD,
+    VK_Sparc_TLS_IE_HI22,
+    VK_Sparc_TLS_IE_LO10,
+    VK_Sparc_TLS_IE_LD,
+    VK_Sparc_TLS_IE_LDX,
+    VK_Sparc_TLS_IE_ADD,
+    VK_Sparc_TLS_LE_HIX22,
+    VK_Sparc_TLS_LE_LOX10
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit SparcMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+    : Kind(_Kind), Expr(_Expr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const SparcMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                 MCContext &Ctx);
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// getFixupKind - Get the fixup kind of this expression.
+  Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); }
+
+  /// @}
+  void PrintImpl(raw_ostream &OS) const override;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  const MCSection *FindAssociatedSection() const override {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  static bool classof(const SparcMCExpr *) { return true; }
+
+  static VariantKind parseVariantKind(StringRef name);
+  static bool printVariantKind(raw_ostream &OS, VariantKind Kind);
+  static Sparc::Fixups getFixupKind(VariantKind Kind);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
new file mode 100644
index 0000000..571017d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -0,0 +1,212 @@
+//===-- SparcMCTargetDesc.cpp - Sparc Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCTargetDesc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "SparcMCAsmInfo.h"
+#include "SparcTargetStreamer.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SparcGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SparcGenRegisterInfo.inc"
+
+static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
+                                       StringRef TT) {
+  MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
+  unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+  MAI->addInitialFrameState(Inst);
+  return MAI;
+}
+
+static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
+                                       StringRef TT) {
+  MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
+  unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 2047);
+  MAI->addInitialFrameState(Inst);
+  return MAI;
+}
+
+static MCInstrInfo *createSparcMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitSparcMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSparcMCRegisterInfo(X, SP::O7);
+  return X;
+}
+
+static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  Triple TheTriple(TT);
+  if (CPU.empty())
+    CPU = (TheTriple.getArch() == Triple::sparcv9) ? "v9" : "v8";
+  InitSparcMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+// Code models. Some only make sense for 64-bit code.
+//
+// SunCC  Reloc   CodeModel  Constraints
+// abs32  Static  Small      text+data+bss linked below 2^32 bytes
+// abs44  Static  Medium     text+data+bss linked below 2^44 bytes
+// abs64  Static  Large      text smaller than 2^31 bytes
+// pic13  PIC_    Small      GOT < 2^13 bytes
+// pic32  PIC_    Medium     GOT < 2^32 bytes
+//
+// All code models require that the text segment is smaller than 2GB.
+
+static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  // The default 32-bit code model is abs32/pic32 and the default 32-bit
+  // code model for JIT is abs32.
+  switch (CM) {
+  default: break;
+  case CodeModel::Default:
+  case CodeModel::JITDefault: CM = CodeModel::Small; break;
+  }
+
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCCodeGenInfo *createSparcV9MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                 CodeModel::Model CM,
+                                                 CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  // The default 64-bit code model is abs44/pic32 and the default 64-bit
+  // code model for JIT is abs64.
+  switch (CM) {
+  default:  break;
+  case CodeModel::Default:
+    CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+    break;
+  case CodeModel::JITDefault:
+    CM = CodeModel::Large;
+    break;
+  }
+
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Context, MCAsmBackend &MAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll,
+                                    bool NoExecStack) {
+  MCStreamer *S =
+      createELFStreamer(Context, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  new SparcTargetELFStreamer(*S);
+  return S;
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useDwarfDirectory,
+                    MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                    MCAsmBackend *TAB, bool ShowInst) {
+
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+  new SparcTargetAsmStreamer(*S, OS);
+  return S;
+}
+
+static MCInstPrinter *createSparcMCInstPrinter(const Target &T,
+                                              unsigned SyntaxVariant,
+                                              const MCAsmInfo &MAI,
+                                              const MCInstrInfo &MII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI) {
+  return new SparcInstPrinter(MAI, MII, MRI, STI);
+}
+
+extern "C" void LLVMInitializeSparcTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheSparcTarget, createSparcMCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheSparcV9Target, createSparcV9MCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget,
+                                       createSparcMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheSparcV9Target,
+                                       createSparcV9MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheSparcV9Target, createSparcMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheSparcTarget, createSparcMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheSparcV9Target,
+                                    createSparcMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
+                                          createSparcMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheSparcV9Target,
+                                          createSparcMCSubtargetInfo);
+
+  // Register the MC Code Emitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheSparcTarget,
+                                        createSparcMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheSparcV9Target,
+                                        createSparcMCCodeEmitter);
+
+  //Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheSparcTarget,
+                                       createSparcAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheSparcV9Target,
+                                       createSparcAsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheSparcTarget,
+                                           createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheSparcV9Target,
+                                           createMCStreamer);
+
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheSparcTarget,
+                                      createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheSparcV9Target,
+                                      createMCAsmStreamer);
+
+  // Register the MCInstPrinter
+  TargetRegistry::RegisterMCInstPrinter(TheSparcTarget,
+                                        createSparcMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheSparcV9Target,
+                                        createSparcMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
new file mode 100644
index 0000000..c8029a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -0,0 +1,61 @@
+//===-- SparcMCTargetDesc.h - Sparc Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCMCTARGETDESC_H
+#define SPARCMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target TheSparcTarget;
+extern Target TheSparcV9Target;
+
+MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        const MCSubtargetInfo &STI,
+                                        MCContext &Ctx);
+MCAsmBackend *createSparcAsmBackend(const Target &T,
+                                    const MCRegisterInfo &MRI,
+                                    StringRef TT,
+                                    StringRef CPU);
+MCObjectWriter *createSparcELFObjectWriter(raw_ostream &OS,
+                                           bool Is64Bit,
+                                           uint8_t OSABI);
+} // End llvm namespace
+
+// Defines symbolic names for Sparc registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "SparcGenRegisterInfo.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SparcGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
new file mode 100644
index 0000000..94af791
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
@@ -0,0 +1,46 @@
+//===-- SparcTargetStreamer.cpp - Sparc Target Streamer Methods -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetStreamer.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+SparcTargetStreamer::SparcTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+void SparcTargetStreamer::anchor() {}
+
+SparcTargetAsmStreamer::SparcTargetAsmStreamer(MCStreamer &S,
+                                               formatted_raw_ostream &OS)
+    : SparcTargetStreamer(S), OS(OS) {}
+
+void SparcTargetAsmStreamer::emitSparcRegisterIgnore(unsigned reg) {
+  OS << "\t.register "
+     << "%" << StringRef(SparcInstPrinter::getRegisterName(reg)).lower()
+     << ", #ignore\n";
+}
+
+void SparcTargetAsmStreamer::emitSparcRegisterScratch(unsigned reg) {
+  OS << "\t.register "
+     << "%" << StringRef(SparcInstPrinter::getRegisterName(reg)).lower()
+     << ", #scratch\n";
+}
+
+SparcTargetELFStreamer::SparcTargetELFStreamer(MCStreamer &S)
+    : SparcTargetStreamer(S) {}
+
+MCELFStreamer &SparcTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h
new file mode 100644
index 0000000..de20aaa
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,136 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_SPARC_H
+#define TARGET_SPARC_H
+
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class SparcTargetMachine;
+  class formatted_raw_ostream;
+  class AsmPrinter;
+  class MCInst;
+  class MachineInstr;
+
+  FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
+  FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+  FunctionPass *createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+                                              JITCodeEmitter &JCE);
+
+  void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+                                      MCInst &OutMI,
+                                      AsmPrinter &AP);
+} // end namespace llvm;
+
+namespace llvm {
+  // Enums corresponding to Sparc condition codes, both icc's and fcc's.  These
+  // values must be kept in sync with the ones in the .td file.
+  namespace SPCC {
+    enum CondCodes {
+      ICC_A   =  8   ,  // Always
+      ICC_N   =  0   ,  // Never
+      ICC_NE  =  9   ,  // Not Equal
+      ICC_E   =  1   ,  // Equal
+      ICC_G   = 10   ,  // Greater
+      ICC_LE  =  2   ,  // Less or Equal
+      ICC_GE  = 11   ,  // Greater or Equal
+      ICC_L   =  3   ,  // Less
+      ICC_GU  = 12   ,  // Greater Unsigned
+      ICC_LEU =  4   ,  // Less or Equal Unsigned
+      ICC_CC  = 13   ,  // Carry Clear/Great or Equal Unsigned
+      ICC_CS  =  5   ,  // Carry Set/Less Unsigned
+      ICC_POS = 14   ,  // Positive
+      ICC_NEG =  6   ,  // Negative
+      ICC_VC  = 15   ,  // Overflow Clear
+      ICC_VS  =  7   ,  // Overflow Set
+
+      FCC_A   =  8+16,  // Always
+      FCC_N   =  0+16,  // Never
+      FCC_U   =  7+16,  // Unordered
+      FCC_G   =  6+16,  // Greater
+      FCC_UG  =  5+16,  // Unordered or Greater
+      FCC_L   =  4+16,  // Less
+      FCC_UL  =  3+16,  // Unordered or Less
+      FCC_LG  =  2+16,  // Less or Greater
+      FCC_NE  =  1+16,  // Not Equal
+      FCC_E   =  9+16,  // Equal
+      FCC_UE  = 10+16,  // Unordered or Equal
+      FCC_GE  = 11+16,  // Greater or Equal
+      FCC_UGE = 12+16,  // Unordered or Greater or Equal
+      FCC_LE  = 13+16,  // Less or Equal
+      FCC_ULE = 14+16,  // Unordered or Less or Equal
+      FCC_O   = 15+16   // Ordered
+    };
+  }
+
+  inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+    switch (CC) {
+    case SPCC::ICC_A:   return "a";
+    case SPCC::ICC_N:   return "n";
+    case SPCC::ICC_NE:  return "ne";
+    case SPCC::ICC_E:   return "e";
+    case SPCC::ICC_G:   return "g";
+    case SPCC::ICC_LE:  return "le";
+    case SPCC::ICC_GE:  return "ge";
+    case SPCC::ICC_L:   return "l";
+    case SPCC::ICC_GU:  return "gu";
+    case SPCC::ICC_LEU: return "leu";
+    case SPCC::ICC_CC:  return "cc";
+    case SPCC::ICC_CS:  return "cs";
+    case SPCC::ICC_POS: return "pos";
+    case SPCC::ICC_NEG: return "neg";
+    case SPCC::ICC_VC:  return "vc";
+    case SPCC::ICC_VS:  return "vs";
+    case SPCC::FCC_A:   return "a";
+    case SPCC::FCC_N:   return "n";
+    case SPCC::FCC_U:   return "u";
+    case SPCC::FCC_G:   return "g";
+    case SPCC::FCC_UG:  return "ug";
+    case SPCC::FCC_L:   return "l";
+    case SPCC::FCC_UL:  return "ul";
+    case SPCC::FCC_LG:  return "lg";
+    case SPCC::FCC_NE:  return "ne";
+    case SPCC::FCC_E:   return "e";
+    case SPCC::FCC_UE:  return "ue";
+    case SPCC::FCC_GE:  return "ge";
+    case SPCC::FCC_UGE: return "uge";
+    case SPCC::FCC_LE:  return "le";
+    case SPCC::FCC_ULE: return "ule";
+    case SPCC::FCC_O:   return "o";
+    }
+    llvm_unreachable("Invalid cond code");
+  }
+
+  inline static unsigned HI22(int64_t imm) {
+    return (unsigned)((imm >> 10) & ((1 << 22)-1));
+  }
+
+  inline static unsigned LO10(int64_t imm) {
+    return (unsigned)(imm & 0x3FF);
+  }
+
+  inline static unsigned HIX22(int64_t imm) {
+    return HI22(~imm);
+  }
+
+  inline static unsigned LOX10(int64_t imm) {
+    return ~LO10(~imm);
+  }
+
+}  // end namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.td b/contrib/llvm/lib/Target/Sparc/Sparc.td
new file mode 100644
index 0000000..3159a46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,99 @@
+//===-- Sparc.td - Describe the Sparc Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+
+def FeatureV9
+  : SubtargetFeature<"v9", "IsV9", "true",
+                     "Enable SPARC-V9 instructions">;
+def FeatureV8Deprecated
+  : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+                     "Enable deprecated V8 instructions in V9 mode">;
+def FeatureVIS
+  : SubtargetFeature<"vis", "IsVIS", "true",
+                     "Enable UltraSPARC Visual Instruction Set extensions">;
+def FeatureVIS2
+  : SubtargetFeature<"vis2", "IsVIS2", "true",
+                     "Enable Visual Instruction Set extensions II">;
+def FeatureVIS3
+  : SubtargetFeature<"vis3", "IsVIS3", "true",
+                     "Enable Visual Instruction Set extensions III">;
+
+def FeatureHardQuad
+  : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+                     "Enable quad-word floating point instructions">;
+
+def UsePopc : SubtargetFeature<"popc", "UsePopc", "true",
+                               "Use the popc (population count) instruction">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+include "SparcCallingConv.td"
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo;
+
+def SparcAsmParser : AsmParser {
+  bit ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"v7",              []>;
+def : Proc<"v8",              []>;
+def : Proc<"supersparc",      []>;
+def : Proc<"sparclite",       []>;
+def : Proc<"f934",            []>;
+def : Proc<"hypersparc",      []>;
+def : Proc<"sparclite86x",    []>;
+def : Proc<"sparclet",        []>;
+def : Proc<"tsc701",          []>;
+def : Proc<"v9",              [FeatureV9]>;
+def : Proc<"ultrasparc",      [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+def : Proc<"ultrasparc3",     [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+                               FeatureVIS2]>;
+def : Proc<"niagara",         [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+                               FeatureVIS2]>;
+def : Proc<"niagara2",        [FeatureV9, FeatureV8Deprecated, UsePopc,
+                               FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara3",        [FeatureV9, FeatureV8Deprecated, UsePopc,
+                               FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara4",        [FeatureV9, FeatureV8Deprecated, UsePopc,
+                               FeatureVIS, FeatureVIS2, FeatureVIS3]>;
+
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Sparc : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = SparcInstrInfo;
+  let AssemblyParsers  = [SparcAsmParser];
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
new file mode 100644
index 0000000..1b7330e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -0,0 +1,465 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcInstrInfo.h"
+#include "SparcTargetMachine.h"
+#include "SparcTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+  class SparcAsmPrinter : public AsmPrinter {
+    SparcTargetStreamer &getTargetStreamer() {
+      return static_cast<SparcTargetStreamer &>(
+          *OutStreamer.getTargetStreamer());
+    }
+  public:
+    explicit SparcAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {}
+
+    const char *getPassName() const override {
+      return "Sparc Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+    void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
+                         const char *Modifier = nullptr);
+    void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+
+    void EmitFunctionBodyStart() override;
+    void EmitInstruction(const MachineInstr *MI) override;
+    void EmitEndOfAsmFile(Module &M) override;
+
+    static const char *getRegisterName(unsigned RegNo) {
+      return SparcInstPrinter::getRegisterName(RegNo);
+    }
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) override;
+
+    void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+                                   const MCSubtargetInfo &STI);
+
+  };
+} // end of anonymous namespace
+
+static MCOperand createSparcMCOperand(SparcMCExpr::VariantKind Kind,
+                                      MCSymbol *Sym, MCContext &OutContext) {
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Sym,
+                                                         OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::Create(Kind, MCSym, OutContext);
+  return MCOperand::CreateExpr(expr);
+
+}
+static MCOperand createPCXCallOP(MCSymbol *Label,
+                                 MCContext &OutContext) {
+  return createSparcMCOperand(SparcMCExpr::VK_Sparc_None, Label, OutContext);
+}
+
+static MCOperand createPCXRelExprOp(SparcMCExpr::VariantKind Kind,
+                                    MCSymbol *GOTLabel, MCSymbol *StartLabel,
+                                    MCSymbol *CurLabel,
+                                    MCContext &OutContext)
+{
+  const MCSymbolRefExpr *GOT = MCSymbolRefExpr::Create(GOTLabel, OutContext);
+  const MCSymbolRefExpr *Start = MCSymbolRefExpr::Create(StartLabel,
+                                                         OutContext);
+  const MCSymbolRefExpr *Cur = MCSymbolRefExpr::Create(CurLabel,
+                                                       OutContext);
+
+  const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Cur, Start, OutContext);
+  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(GOT, Sub, OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::Create(Kind,
+                                                Add, OutContext);
+  return MCOperand::CreateExpr(expr);
+}
+
+static void EmitCall(MCStreamer &OutStreamer,
+                     MCOperand &Callee,
+                     const MCSubtargetInfo &STI)
+{
+  MCInst CallInst;
+  CallInst.setOpcode(SP::CALL);
+  CallInst.addOperand(Callee);
+  OutStreamer.EmitInstruction(CallInst, STI);
+}
+
+static void EmitSETHI(MCStreamer &OutStreamer,
+                      MCOperand &Imm, MCOperand &RD,
+                      const MCSubtargetInfo &STI)
+{
+  MCInst SETHIInst;
+  SETHIInst.setOpcode(SP::SETHIi);
+  SETHIInst.addOperand(RD);
+  SETHIInst.addOperand(Imm);
+  OutStreamer.EmitInstruction(SETHIInst, STI);
+}
+
+static void EmitBinary(MCStreamer &OutStreamer, unsigned Opcode,
+                       MCOperand &RS1, MCOperand &Src2, MCOperand &RD,
+                       const MCSubtargetInfo &STI)
+{
+  MCInst Inst;
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(RD);
+  Inst.addOperand(RS1);
+  Inst.addOperand(Src2);
+  OutStreamer.EmitInstruction(Inst, STI);
+}
+
+static void EmitOR(MCStreamer &OutStreamer,
+                   MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+                   const MCSubtargetInfo &STI) {
+  EmitBinary(OutStreamer, SP::ORri, RS1, Imm, RD, STI);
+}
+
+static void EmitADD(MCStreamer &OutStreamer,
+                    MCOperand &RS1, MCOperand &RS2, MCOperand &RD,
+                    const MCSubtargetInfo &STI) {
+  EmitBinary(OutStreamer, SP::ADDrr, RS1, RS2, RD, STI);
+}
+
+static void EmitSHL(MCStreamer &OutStreamer,
+                    MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+                    const MCSubtargetInfo &STI) {
+  EmitBinary(OutStreamer, SP::SLLri, RS1, Imm, RD, STI);
+}
+
+
+static void EmitHiLo(MCStreamer &OutStreamer,  MCSymbol *GOTSym,
+                     SparcMCExpr::VariantKind HiKind,
+                     SparcMCExpr::VariantKind LoKind,
+                     MCOperand &RD,
+                     MCContext &OutContext,
+                     const MCSubtargetInfo &STI) {
+
+  MCOperand hi = createSparcMCOperand(HiKind, GOTSym, OutContext);
+  MCOperand lo = createSparcMCOperand(LoKind, GOTSym, OutContext);
+  EmitSETHI(OutStreamer, hi, RD, STI);
+  EmitOR(OutStreamer, RD, lo, RD, STI);
+}
+
+void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+                                                const MCSubtargetInfo &STI)
+{
+  MCSymbol *GOTLabel   =
+    OutContext.GetOrCreateSymbol(Twine("_GLOBAL_OFFSET_TABLE_"));
+
+  const MachineOperand &MO = MI->getOperand(0);
+  assert(MO.getReg() != SP::O7 &&
+         "%o7 is assigned as destination for getpcx!");
+
+  MCOperand MCRegOP = MCOperand::CreateReg(MO.getReg());
+
+
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    // Just load the address of GOT to MCRegOP.
+    switch(TM.getCodeModel()) {
+    default:
+      llvm_unreachable("Unsupported absolute code model");
+    case CodeModel::Small:
+      EmitHiLo(OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+               MCRegOP, OutContext, STI);
+      break;
+    case CodeModel::Medium: {
+      EmitHiLo(OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_H44, SparcMCExpr::VK_Sparc_M44,
+               MCRegOP, OutContext, STI);
+      MCOperand imm = MCOperand::CreateExpr(MCConstantExpr::Create(12,
+                                                                   OutContext));
+      EmitSHL(OutStreamer, MCRegOP, imm, MCRegOP, STI);
+      MCOperand lo = createSparcMCOperand(SparcMCExpr::VK_Sparc_L44,
+                                          GOTLabel, OutContext);
+      EmitOR(OutStreamer, MCRegOP, lo, MCRegOP, STI);
+      break;
+    }
+    case CodeModel::Large: {
+      EmitHiLo(OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_HH, SparcMCExpr::VK_Sparc_HM,
+               MCRegOP, OutContext, STI);
+      MCOperand imm = MCOperand::CreateExpr(MCConstantExpr::Create(32,
+                                                                   OutContext));
+      EmitSHL(OutStreamer, MCRegOP, imm, MCRegOP, STI);
+      // Use register %o7 to load the lower 32 bits.
+      MCOperand RegO7 = MCOperand::CreateReg(SP::O7);
+      EmitHiLo(OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+               RegO7, OutContext, STI);
+      EmitADD(OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+    }
+    }
+    return;
+  }
+
+  MCSymbol *StartLabel = OutContext.CreateTempSymbol();
+  MCSymbol *EndLabel   = OutContext.CreateTempSymbol();
+  MCSymbol *SethiLabel = OutContext.CreateTempSymbol();
+
+  MCOperand RegO7   = MCOperand::CreateReg(SP::O7);
+
+  // <StartLabel>:
+  //   call <EndLabel>
+  // <SethiLabel>:
+  //     sethi %hi(_GLOBAL_OFFSET_TABLE_+(<SethiLabel>-<StartLabel>)), <MO>
+  // <EndLabel>:
+  //   or  <MO>, %lo(_GLOBAL_OFFSET_TABLE_+(<EndLabel>-<StartLabel>))), <MO>
+  //   add <MO>, %o7, <MO>
+
+  OutStreamer.EmitLabel(StartLabel);
+  MCOperand Callee =  createPCXCallOP(EndLabel, OutContext);
+  EmitCall(OutStreamer, Callee, STI);
+  OutStreamer.EmitLabel(SethiLabel);
+  MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22,
+                                       GOTLabel, StartLabel, SethiLabel,
+                                       OutContext);
+  EmitSETHI(OutStreamer, hiImm, MCRegOP, STI);
+  OutStreamer.EmitLabel(EndLabel);
+  MCOperand loImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC10,
+                                       GOTLabel, StartLabel, EndLabel,
+                                       OutContext);
+  EmitOR(OutStreamer, MCRegOP, loImm, MCRegOP, STI);
+  EmitADD(OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+}
+
+void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
+{
+
+  switch (MI->getOpcode()) {
+  default: break;
+  case TargetOpcode::DBG_VALUE:
+    // FIXME: Debug Value.
+    return;
+  case SP::GETPCX:
+    LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo());
+    return;
+  }
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+  do {
+    MCInst TmpInst;
+    LowerSparcMachineInstrToMCInst(I, TmpInst, *this);
+    EmitToStreamer(OutStreamer, TmpInst);
+  } while ((++I != E) && I->isInsideBundle()); // Delay slot check.
+}
+
+void SparcAsmPrinter::EmitFunctionBodyStart() {
+  if (!TM.getSubtarget<SparcSubtarget>().is64Bit())
+    return;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const unsigned globalRegs[] = { SP::G2, SP::G3, SP::G6, SP::G7, 0 };
+  for (unsigned i = 0; globalRegs[i] != 0; ++i) {
+    unsigned reg = globalRegs[i];
+    if (MRI.use_empty(reg))
+      continue;
+
+    if  (reg == SP::G6 || reg == SP::G7)
+      getTargetStreamer().emitSparcRegisterIgnore(reg);
+    else
+      getTargetStreamer().emitSparcRegisterScratch(reg);
+  }
+}
+
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const DataLayout *DL = TM.getDataLayout();
+  const MachineOperand &MO = MI->getOperand (opNum);
+  SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
+
+#ifndef NDEBUG
+  // Verify the target flags.
+  if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
+    if (MI->getOpcode() == SP::CALL)
+      assert(TF == SparcMCExpr::VK_Sparc_None &&
+             "Cannot handle target flags on call address");
+    else if (MI->getOpcode() == SP::SETHIi || MI->getOpcode() == SP::SETHIXi)
+      assert((TF == SparcMCExpr::VK_Sparc_HI
+              || TF == SparcMCExpr::VK_Sparc_H44
+              || TF == SparcMCExpr::VK_Sparc_HH
+              || TF == SparcMCExpr::VK_Sparc_TLS_GD_HI22
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_HI22
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDO_HIX22
+              || TF == SparcMCExpr::VK_Sparc_TLS_IE_HI22
+              || TF == SparcMCExpr::VK_Sparc_TLS_LE_HIX22) &&
+             "Invalid target flags for address operand on sethi");
+    else if (MI->getOpcode() == SP::TLS_CALL)
+      assert((TF == SparcMCExpr::VK_Sparc_None
+              || TF == SparcMCExpr::VK_Sparc_TLS_GD_CALL
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_CALL) &&
+             "Cannot handle target flags on tls call address");
+    else if (MI->getOpcode() == SP::TLS_ADDrr)
+      assert((TF == SparcMCExpr::VK_Sparc_TLS_GD_ADD
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_ADD
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDO_ADD
+              || TF == SparcMCExpr::VK_Sparc_TLS_IE_ADD) &&
+             "Cannot handle target flags on add for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDrr)
+      assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LD &&
+             "Cannot handle target flags on ld for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDXrr)
+      assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LDX &&
+             "Cannot handle target flags on ldx for TLS");
+    else if (MI->getOpcode() == SP::XORri || MI->getOpcode() == SP::XORXri)
+      assert((TF == SparcMCExpr::VK_Sparc_TLS_LDO_LOX10
+              || TF == SparcMCExpr::VK_Sparc_TLS_LE_LOX10) &&
+             "Cannot handle target flags on xor for TLS");
+    else
+      assert((TF == SparcMCExpr::VK_Sparc_LO
+              || TF == SparcMCExpr::VK_Sparc_M44
+              || TF == SparcMCExpr::VK_Sparc_L44
+              || TF == SparcMCExpr::VK_Sparc_HM
+              || TF == SparcMCExpr::VK_Sparc_TLS_GD_LO10
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_LO10
+              || TF == SparcMCExpr::VK_Sparc_TLS_IE_LO10 ) &&
+             "Invalid target flags for small address operand");
+  }
+#endif
+
+
+  bool CloseParen = SparcMCExpr::printVariantKind(O, TF);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << (int)MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    O <<  GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getIndex();
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+  if (CloseParen) O << ")";
+}
+
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  // If this is an ADD operand, emit it like normal operands.
+  if (Modifier && !strcmp(Modifier, "arith")) {
+    O << ", ";
+    printOperand(MI, opNum+1, O);
+    return;
+  }
+
+  if (MI->getOperand(opNum+1).isReg() &&
+      MI->getOperand(opNum+1).getReg() == SP::G0)
+    return;   // don't print "+%g0"
+  if (MI->getOperand(opNum+1).isImm() &&
+      MI->getOperand(opNum+1).getImm() == 0)
+    return;   // don't print "+0"
+
+  O << "+";
+  printOperand(MI, opNum+1, O);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode,
+                                      raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+     break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo, unsigned AsmVariant,
+                                            const char *ExtraCode,
+                                            raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true;  // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+void SparcAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  const TargetLoweringObjectFileELF &TLOFELF =
+    static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+  MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+  // Generate stubs for global variables.
+  MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+  if (!Stubs.empty()) {
+    OutStreamer.SwitchSection(TLOFELF.getDataSection());
+    unsigned PtrSize = TM.getDataLayout()->getPointerSize(0);
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      OutStreamer.EmitLabel(Stubs[i].first);
+      OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), PtrSize);
+    }
+  }
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSparcAsmPrinter() {
+  RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget);
+  RegisterAsmPrinter<SparcAsmPrinter> Y(TheSparcV9Target);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
new file mode 100644
index 0000000..dfaaabf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
@@ -0,0 +1,139 @@
+//===-- SparcCallingConv.td - Calling Conventions Sparc ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Sparc architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SPARC v8 32-bit.
+//===----------------------------------------------------------------------===//
+
+def CC_Sparc32 : CallingConv<[
+  // Custom assign SRet to [sp+64].
+  CCIfSRet<CCCustom<"CC_Sparc_Assign_SRet">>,
+  // i32 f32 arguments get passed in integer registers if there is space.
+  CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  // f64 arguments are split and passed through registers or through stack.
+  CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+def RetCC_Sparc32 : CallingConv<[
+  CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// SPARC v9 64-bit.
+//===----------------------------------------------------------------------===//
+//
+// The 64-bit ABI conceptually assigns all function arguments to a parameter
+// array starting at [%fp+BIAS+128] in the callee's stack frame. All arguments
+// occupy a multiple of 8 bytes in the array. Integer arguments are extended to
+// 64 bits by the caller. Floats are right-aligned in their 8-byte slot, the
+// first 4 bytes in the slot are undefined.
+//
+// The integer registers %i0 to %i5 shadow the first 48 bytes of the parameter
+// array at fixed offsets. Integer arguments are promoted to registers when
+// possible.
+//
+// The floating point registers %f0 to %f31 shadow the first 128 bytes of the
+// parameter array at fixed offsets. Float and double parameters are promoted
+// to these registers when possible.
+//
+// Structs up to 16 bytes in size are passed by value. They are right-aligned
+// in one or two 8-byte slots in the parameter array. Struct members are
+// promoted to both floating point and integer registers when possible. A
+// struct containing two floats would thus be passed in %f0 and %f1, while two
+// float function arguments would occupy 8 bytes each, and be passed in %f1 and
+// %f3.
+//
+// When a struct { int, float } is passed by value, the int goes in the high
+// bits of an integer register while the float goes in a floating point
+// register.
+//
+// The difference is encoded in LLVM IR using the inreg atttribute on function
+// arguments:
+//
+//   C:   void f(float, float);
+//   IR:  declare void f(float %f1, float %f3)
+//
+//   C:   void f(struct { float f0, f1; });
+//   IR:  declare void f(float inreg %f0, float inreg %f1)
+//
+//   C:   void f(int, float);
+//   IR:  declare void f(int signext %i0, float %f3)
+//
+//   C:   void f(struct { int i0high; float f1; });
+//   IR:  declare void f(i32 inreg %i0high, float inreg %f1)
+//
+// Two ints in a struct are simply coerced to i64:
+//
+//   C:   void f(struct { int i0high, i0low; });
+//   IR:  declare void f(i64 %i0.coerced)
+//
+// The frontend and backend divide the task of producing ABI compliant code for
+// C functions. The C frontend will:
+//
+//  - Annotate integer arguments with zeroext or signext attributes.
+//
+//  - Split structs into one or two 64-bit sized chunks, or 32-bit chunks with
+//    inreg attributes.
+//
+//  - Pass structs larger than 16 bytes indirectly with an explicit pointer
+//    argument. The byval attribute is not used.
+//
+// The backend will:
+//
+//  - Assign all arguments to 64-bit aligned stack slots, 32-bits for inreg.
+//
+//  - Promote to integer or floating point registers depending on type.
+//
+// Function return values are passed exactly like function arguments, except a
+// struct up to 32 bytes in size can be returned in registers.
+
+// Function arguments AND most return values.
+def CC_Sparc64 : CallingConv<[
+  // The frontend uses the inreg flag to indicate i32 and float arguments from
+  // structs. These arguments are not promoted to 64 bits, but they can still
+  // be assigned to integer and float registers.
+  CCIfInReg<CCIfType<[i32, f32], CCCustom<"CC_Sparc64_Half">>>,
+
+  // All integers are promoted to i64 by the caller.
+  CCIfType<[i32], CCPromoteToType<i64>>,
+
+  // Custom assignment is required because stack space is reserved for all
+  // arguments whether they are passed in registers or not.
+  CCCustom<"CC_Sparc64_Full">
+]>;
+
+def RetCC_Sparc64 : CallingConv<[
+  // A single f32 return value always goes in %f0. The ABI doesn't specify what
+  // happens to multiple f32 return values outside a struct.
+  CCIfType<[f32], CCCustom<"CC_Sparc64_Half">>,
+
+  // Otherwise, return values are passed exactly like arguments.
+  CCDelegateTo<CC_Sparc64>
+]>;
+
+// Callee-saved registers are handled by the register window mechanism.
+def CSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add (sequence "I%u", 0, 7),
+                            (sequence "L%u", 0, 7));
+}
+
+// Callee-saved registers for calls with ReturnsTwice attribute.
+def RTCSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add I6, I7);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcCodeEmitter.cpp b/contrib/llvm/lib/Target/Sparc/SparcCodeEmitter.cpp
new file mode 100644
index 0000000..247da2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcCodeEmitter.cpp
@@ -0,0 +1,280 @@
+//===-- Sparc/SparcCodeEmitter.cpp - Convert Sparc Code to Machine Code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Sparc machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcRelocations.h"
+#include "SparcTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class SparcCodeEmitter : public MachineFunctionPass {
+  SparcJITInfo *JTI;
+  const SparcInstrInfo *II;
+  const DataLayout *TD;
+  const SparcSubtarget *Subtarget;
+  TargetMachine &TM;
+  JITCodeEmitter &MCE;
+  const std::vector<MachineConstantPoolEntry> *MCPEs;
+  bool IsPIC;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfo> ();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+public:
+  SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+    : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr),
+      TM(tm), MCE(mce), MCPEs(nullptr),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "Sparc Machine Code Emitter";
+  }
+
+  /// getBinaryCodeForInstr - This function, generated by the
+  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+  /// machine instructions.
+  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+  void emitInstruction(MachineBasicBlock::instr_iterator MI,
+                       MachineBasicBlock &MBB);
+
+private:
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MachineInstr &MI,
+                             const MachineOperand &MO) const;
+
+  unsigned getCallTargetOpValue(const MachineInstr &MI,
+                                unsigned) const;
+  unsigned getBranchTargetOpValue(const MachineInstr &MI,
+                                  unsigned) const;
+  unsigned getBranchPredTargetOpValue(const MachineInstr &MI,
+                                      unsigned) const;
+  unsigned getBranchOnRegTargetOpValue(const MachineInstr &MI,
+                                       unsigned) const;
+
+  void emitWord(unsigned Word);
+
+  unsigned getRelocation(const MachineInstr &MI,
+                         const MachineOperand &MO) const;
+
+  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc) const;
+  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+};
+}  // end anonymous namespace.
+
+char SparcCodeEmitter::ID = 0;
+
+bool SparcCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  SparcTargetMachine &Target = static_cast<SparcTargetMachine &>(
+                                const_cast<TargetMachine &>(MF.getTarget()));
+
+  JTI = Target.getJITInfo();
+  II = Target.getInstrInfo();
+  TD = Target.getDataLayout();
+  Subtarget = &TM.getSubtarget<SparcSubtarget> ();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  JTI->Initialize(MF, IsPIC);
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+        << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+        MBB != E; ++MBB){
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+           E = MBB->instr_end(); I != E;)
+        emitInstruction(*I++, *MBB);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void SparcCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
+                                      MachineBasicBlock &MBB) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
+
+  MCE.processDebugLoc(MI->getDebugLoc(), true);
+
+  ++NumEmitted;
+
+  switch (MI->getOpcode()) {
+  default: {
+    emitWord(getBinaryCodeForInstr(*MI));
+    break;
+  }
+  case TargetOpcode::INLINEASM: {
+    // We allow inline assembler nodes with empty bodies - they can
+    // implicitly define registers, which is ok for JIT.
+    if (MI->getOperand(0).getSymbolName()[0]) {
+      report_fatal_error("JIT does not support inline asm!");
+    }
+    break;
+  }
+  case TargetOpcode::CFI_INSTRUCTION:
+    break;
+  case TargetOpcode::EH_LABEL: {
+    MCE.emitLabel(MI->getOperand(0).getMCSymbol());
+    break;
+  }
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL: {
+    // Do nothing.
+    break;
+  }
+  case SP::GETPCX: {
+    report_fatal_error("JIT does not support pseudo instruction GETPCX yet!");
+    break;
+  }
+  }
+
+  MCE.processDebugLoc(MI->getDebugLoc(), false);
+}
+
+void SparcCodeEmitter::emitWord(unsigned Word) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Word) << "\n");
+  MCE.emitWordBE(Word);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned SparcCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                             const MachineOperand &MO) const {
+  if (MO.isReg())
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO));
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isCPI())
+    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+unsigned SparcCodeEmitter::getCallTargetOpValue(const MachineInstr &MI,
+                                                unsigned opIdx) const {
+  const MachineOperand MO = MI.getOperand(opIdx);
+  return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
+                                                  unsigned opIdx) const {
+  const MachineOperand MO = MI.getOperand(opIdx);
+  return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getBranchPredTargetOpValue(const MachineInstr &MI,
+                                                      unsigned opIdx) const {
+  const MachineOperand MO = MI.getOperand(opIdx);
+  return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getBranchOnRegTargetOpValue(const MachineInstr &MI,
+                                                       unsigned opIdx) const {
+  const MachineOperand MO = MI.getOperand(opIdx);
+  return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getRelocation(const MachineInstr &MI,
+                                         const MachineOperand &MO) const {
+
+  unsigned TF = MO.getTargetFlags();
+  switch (TF) {
+  default:
+  case SparcMCExpr::VK_Sparc_None:  break;
+  case SparcMCExpr::VK_Sparc_LO:    return SP::reloc_sparc_lo;
+  case SparcMCExpr::VK_Sparc_HI:    return SP::reloc_sparc_hi;
+  case SparcMCExpr::VK_Sparc_H44:   return SP::reloc_sparc_h44;
+  case SparcMCExpr::VK_Sparc_M44:   return SP::reloc_sparc_m44;
+  case SparcMCExpr::VK_Sparc_L44:   return SP::reloc_sparc_l44;
+  case SparcMCExpr::VK_Sparc_HH:    return SP::reloc_sparc_hh;
+  case SparcMCExpr::VK_Sparc_HM:    return SP::reloc_sparc_hm;
+  }
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default: break;
+  case SP::CALL:    return SP::reloc_sparc_pc30;
+  case SP::BA:
+  case SP::BCOND:
+  case SP::FBCOND:  return SP::reloc_sparc_pc22;
+  case SP::BPXCC:   return SP::reloc_sparc_pc19;
+  }
+  llvm_unreachable("unknown reloc!");
+}
+
+void SparcCodeEmitter::emitGlobalAddress(const GlobalValue *GV,
+                                        unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             const_cast<GlobalValue *>(GV), 0,
+                                             true));
+}
+
+void SparcCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, 0, 0));
+}
+
+void SparcCodeEmitter::
+emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, false));
+}
+
+void SparcCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                            unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB));
+}
+
+
+/// createSparcJITCodeEmitterPass - Return a pass that emits the collected Sparc
+/// code to the specified MCE object.
+FunctionPass *llvm::createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+                                                 JITCodeEmitter &JCE) {
+  return new SparcCodeEmitter(TM, JCE);
+}
+
+#include "SparcGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
new file mode 100644
index 0000000..3cdfda3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -0,0 +1,262 @@
+//===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcFrameLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableLeafProc("disable-sparc-leaf-proc",
+                cl::init(false),
+                cl::desc("Disable Sparc leaf procedure optimization."),
+                cl::Hidden);
+
+SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+                          ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {}
+
+void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          int NumBytes,
+                                          unsigned ADDrr,
+                                          unsigned ADDri) const {
+
+  DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  if (NumBytes >= -4096 && NumBytes < 4096) {
+    BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
+      .addReg(SP::O6).addImm(NumBytes);
+    return;
+  }
+
+  // Emit this the hard way.  This clobbers G1 which we always know is
+  // available here.
+  if (NumBytes >= 0) {
+    // Emit nonnegative numbers with sethi + or.
+    // sethi %hi(NumBytes), %g1
+    // or %g1, %lo(NumBytes), %g1
+    // add %sp, %g1, %sp
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(LO10(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+    return ;
+  }
+
+  // Emit negative numbers with sethi + xor.
+  // sethi %hix(NumBytes), %g1
+  // xor %g1, %lox(NumBytes), %g1
+  // add %sp, %g1, %sp
+  BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+    .addReg(SP::O6).addReg(SP::G1);
+}
+
+void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  unsigned SAVEri = SP::SAVEri;
+  unsigned SAVErr = SP::SAVErr;
+  if (FuncInfo->isLeafProc()) {
+    if (NumBytes == 0)
+      return;
+    SAVEri = SP::ADDri;
+    SAVErr = SP::ADDrr;
+  }
+  NumBytes =
+      -MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize(
+          NumBytes);
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  unsigned regFP = MRI->getDwarfRegNum(SP::I6, true);
+
+  // Emit ".cfi_def_cfa_register 30".
+  unsigned CFIIndex =
+      MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Emit ".cfi_window_save".
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
+  unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
+  // Emit ".cfi_register 15, 31".
+  CFIIndex = MMI.addFrameInst(
+      MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+void SparcFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    MachineInstr &MI = *I;
+    int Size = MI.getOperand(0).getImm();
+    if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+      Size = -Size;
+
+    if (Size)
+      emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
+  }
+  MBB.erase(I);
+}
+
+
+void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
+                                  MachineBasicBlock &MBB) const {
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  assert(MBBI->getOpcode() == SP::RETL &&
+         "Can only put epilog before 'retl' instruction!");
+  if (!FuncInfo->isLeafProc()) {
+    BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+      .addReg(SP::G0);
+    return;
+  }
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  int NumBytes = (int) MFI->getStackSize();
+  if (NumBytes == 0)
+    return;
+
+  NumBytes = MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize(
+      NumBytes);
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
+}
+
+bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // Reserve call frame if there are no variable sized objects on the stack.
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+
+static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
+{
+
+  for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
+    if (MRI->isPhysRegUsed(reg))
+      return false;
+
+  for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
+    if (MRI->isPhysRegUsed(reg))
+      return false;
+
+  return true;
+}
+
+bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
+{
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo    *MFI = MF.getFrameInfo();
+
+  return !(MFI->hasCalls()              // has calls
+           || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
+           || MRI.isPhysRegUsed(SP::O6) // %SP is used
+           || hasFP(MF));               // need %FP
+}
+
+void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Remap %i[0-7] to %o[0-7].
+  for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+    if (!MRI.isPhysRegUsed(reg))
+      continue;
+    unsigned mapped_reg = (reg - SP::I0 + SP::O0);
+    assert(!MRI.isPhysRegUsed(mapped_reg));
+
+    // Replace I register with O register.
+    MRI.replaceRegWith(reg, mapped_reg);
+
+    // Mark the reg unused.
+    MRI.setPhysRegUnused(reg);
+  }
+
+  // Rewrite MBB's Live-ins.
+  for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+       MBB != E; ++MBB) {
+    for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+      if (!MBB->isLiveIn(reg))
+        continue;
+      MBB->removeLiveIn(reg);
+      MBB->addLiveIn(reg - SP::I0 + SP::O0);
+    }
+  }
+
+  assert(verifyLeafProcRegUse(&MRI));
+#ifdef XDEBUG
+  MF.verify(0, "After LeafProc Remapping");
+#endif
+}
+
+void SparcFrameLowering::processFunctionBeforeCalleeSavedScan
+                  (MachineFunction &MF, RegScavenger *RS) const {
+
+  if (!DisableLeafProc && isLeafProc(MF)) {
+    SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>();
+    MFI->setLeafProc(true);
+
+    remapRegsForLeafProc(MF);
+  }
+
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
new file mode 100644
index 0000000..a7d1b89
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
@@ -0,0 +1,60 @@
+//===-- SparcFrameLowering.h - Define frame lowering for Sparc --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_FRAMEINFO_H
+#define SPARC_FRAMEINFO_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class SparcSubtarget;
+class SparcFrameLowering : public TargetFrameLowering {
+public:
+  explicit SparcFrameLowering(const SparcSubtarget &ST);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+
+private:
+  // Remap input registers to output registers for leaf procedure.
+  void remapRegsForLeafProc(MachineFunction &MF) const;
+
+  // Returns true if MF is a leaf procedure.
+  bool isLeafProc(MachineFunction &MF) const;
+
+
+  // Emits code for adjusting SP in function prologue/epilogue.
+  void emitSPAdjustment(MachineFunction &MF,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        int NumBytes, unsigned ADDrr, unsigned ADDri) const;
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 0000000..2fade27
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,221 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class SparcDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const SparcSubtarget &Subtarget;
+  SparcTargetMachine &TM;
+public:
+  explicit SparcDAGToDAGISel(SparcTargetMachine &tm)
+    : SelectionDAGISel(tm),
+      Subtarget(tm.getSubtarget<SparcSubtarget>()),
+      TM(tm) {
+  }
+
+  SDNode *Select(SDNode *N) override;
+
+  // Complex Pattern Selectors.
+  bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDRri(SDValue N, SDValue &Base, SDValue &Offset);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+
+  const char *getPassName() const override {
+    return "SPARC DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+
+private:
+  SDNode* getGlobalBaseReg();
+};
+}  // end anonymous namespace
+
+SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg,
+                             getTargetLowering()->getPointerTy()).getNode();
+}
+
+bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
+                                       getTargetLowering()->getPointerTy());
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (isInt<13>(CN->getSExtValue())) {
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          // Constant offset from frame ref.
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
+                                           getTargetLowering()->getPointerTy());
+        } else {
+          Base = Addr.getOperand(0);
+        }
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+        return true;
+      }
+    }
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(1);
+      Offset = Addr.getOperand(0).getOperand(0);
+      return true;
+    }
+    if (Addr.getOperand(1).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(0);
+      Offset = Addr.getOperand(1).getOperand(0);
+      return true;
+    }
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
+  if (Addr.getOpcode() == ISD::FrameIndex) return false;
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if (isInt<13>(CN->getSExtValue()))
+        return false;  // Let the reg+imm pattern catch this!
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo ||
+        Addr.getOperand(1).getOpcode() == SPISD::Lo)
+      return false;  // Let the reg+imm pattern catch this!
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    return true;
+  }
+
+  R1 = Addr;
+  R2 = CurDAG->getRegister(SP::G0, getTargetLowering()->getPointerTy());
+  return true;
+}
+
+SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return nullptr;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+  case SPISD::GLOBAL_BASE_REG:
+    return getGlobalBaseReg();
+
+  case ISD::SDIV:
+  case ISD::UDIV: {
+    // sdivx / udivx handle 64-bit divides.
+    if (N->getValueType(0) == MVT::i64)
+      break;
+    // FIXME: should use a custom expander to expose the SRA to the dag.
+    SDValue DivLHS = N->getOperand(0);
+    SDValue DivRHS = N->getOperand(1);
+
+    // Set the Y register to the high-part.
+    SDValue TopPart;
+    if (N->getOpcode() == ISD::SDIV) {
+      TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
+                                   CurDAG->getTargetConstant(31, MVT::i32)), 0);
+    } else {
+      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+    }
+    TopPart = SDValue(CurDAG->getMachineNode(SP::WRYrr, dl, MVT::Glue, TopPart,
+                                     CurDAG->getRegister(SP::G0, MVT::i32)), 0);
+
+    // FIXME: Handle div by immediate.
+    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+    return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
+                                TopPart);
+  }
+  case ISD::MULHU:
+  case ISD::MULHS: {
+    // FIXME: Handle mul by immediate.
+    SDValue MulLHS = N->getOperand(0);
+    SDValue MulRHS = N->getOperand(1);
+    unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
+    SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Glue,
+                                         MulLHS, MulRHS);
+    // The high part is in the Y register.
+    return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDValue(Mul, 1));
+  }
+  }
+
+  return SelectCode(N);
+}
+
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool
+SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                char ConstraintCode,
+                                                std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+   if (!SelectADDRrr(Op, Op0, Op1))
+     SelectADDRri(Op, Op0, Op1);
+   break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+/// createSparcISelDag - This pass converts a legalized DAG into a
+/// SPARC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSparcISelDag(SparcTargetMachine &TM) {
+  return new SparcDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
new file mode 100644
index 0000000..990f52a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -0,0 +1,3216 @@
+//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcRegisterInfo.h"
+#include "SparcTargetMachine.h"
+#include "SparcTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
+                                 MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  assert (ArgFlags.isSRet());
+
+  // Assign SRet argument.
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                         0,
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
+                                MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  static const MCPhysReg RegList[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+  // Try to get first reg.
+  if (unsigned Reg = State.AllocateReg(RegList, 6)) {
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  } else {
+    // Assign whole thing in stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8,4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get second reg.
+  if (unsigned Reg = State.AllocateReg(RegList, 6))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4,4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+// Allocate a full-sized argument for the 64-bit ABI.
+static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
+                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert((LocVT == MVT::f32 || LocVT == MVT::f128
+          || LocVT.getSizeInBits() == 64) &&
+         "Can't handle non-64 bits locations");
+
+  // Stack space is allocated for all arguments starting from [%fp+BIAS+128].
+  unsigned size      = (LocVT == MVT::f128) ? 16 : 8;
+  unsigned alignment = (LocVT == MVT::f128) ? 16 : 8;
+  unsigned Offset = State.AllocateStack(size, alignment);
+  unsigned Reg = 0;
+
+  if (LocVT == MVT::i64 && Offset < 6*8)
+    // Promote integers to %i0-%i5.
+    Reg = SP::I0 + Offset/8;
+  else if (LocVT == MVT::f64 && Offset < 16*8)
+    // Promote doubles to %d0-%d30. (Which LLVM calls D0-D15).
+    Reg = SP::D0 + Offset/8;
+  else if (LocVT == MVT::f32 && Offset < 16*8)
+    // Promote floats to %f1, %f3, ...
+    Reg = SP::F1 + Offset/4;
+  else if (LocVT == MVT::f128 && Offset < 16*8)
+    // Promote long doubles to %q0-%q28. (Which LLVM calls Q0-Q7).
+    Reg = SP::Q0 + Offset/16;
+
+  // Promote to register when possible, otherwise use the stack slot.
+  if (Reg) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // This argument goes on the stack in an 8-byte slot.
+  // When passing floats, LocVT is smaller than 8 bytes. Adjust the offset to
+  // the right-aligned float. The first 4 bytes of the stack slot are undefined.
+  if (LocVT == MVT::f32)
+    Offset += 4;
+
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
+// Allocate a half-sized argument for the 64-bit ABI.
+//
+// This is used when passing { float, int } structs by value in registers.
+static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
+                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(LocVT.getSizeInBits() == 32 && "Can't handle non-32 bits locations");
+  unsigned Offset = State.AllocateStack(4, 4);
+
+  if (LocVT == MVT::f32 && Offset < 16*8) {
+    // Promote floats to %f0-%f31.
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, SP::F0 + Offset/4,
+                                     LocVT, LocInfo));
+    return true;
+  }
+
+  if (LocVT == MVT::i32 && Offset < 6*8) {
+    // Promote integers to %i0-%i5, using half the register.
+    unsigned Reg = SP::I0 + Offset/8;
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::AExt;
+
+    // Set the Custom bit if this i32 goes in the high bits of a register.
+    if (Offset % 8 == 0)
+      State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg,
+                                             LocVT, LocInfo));
+    else
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
+#include "SparcGenCallingConv.inc"
+
+// The calling conventions in SparcCallingConv.td are described in terms of the
+// callee's register window. This function translates registers to the
+// corresponding caller window %o register.
+static unsigned toCallerWindow(unsigned Reg) {
+  assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7 && "Unexpected enum");
+  if (Reg >= SP::I0 && Reg <= SP::I7)
+    return Reg - SP::I0 + SP::O0;
+  return Reg;
+}
+
+SDValue
+SparcTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool IsVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 SDLoc DL, SelectionDAG &DAG) const {
+  if (Subtarget->is64Bit())
+    return LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
+  return LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
+}
+
+SDValue
+SparcTargetLowering::LowerReturn_32(SDValue Chain,
+                                    CallingConv::ID CallConv, bool IsVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    SDLoc DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  // Make room for the return address offset.
+  RetOps.push_back(SDValue());
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
+  // If the function returns a struct, copy the SRetReturnReg to I0
+  if (MF.getFunction()->hasStructRetAttr()) {
+    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+    unsigned Reg = SFI->getSRetReturnReg();
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in the entry block");
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+    Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(SP::I0, getPointerTy()));
+    RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+  RetOps[1] = DAG.getConstant(RetAddrOffset, MVT::i32);
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+// Lower return values for the 64-bit ABI.
+// Return values are passed the exactly the same way as function arguments.
+SDValue
+SparcTargetLowering::LowerReturn_64(SDValue Chain,
+                                    CallingConv::ID CallConv, bool IsVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    SDLoc DL, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // The second operand on the return instruction is the return address offset.
+  // The return address is always %i7+8 with the 64-bit ABI.
+  RetOps.push_back(DAG.getConstant(8, MVT::i32));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue OutVal = OutVals[i];
+
+    // Integer return values must be sign or zero extended by the callee.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
+      break;
+    case CCValAssign::ZExt:
+      OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
+      break;
+    case CCValAssign::AExt:
+      OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
+    }
+
+    // The custom bit on an i32 return value indicates that it should be passed
+    // in the high bits of the register.
+    if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+      OutVal = DAG.getNode(ISD::SHL, DL, MVT::i64, OutVal,
+                           DAG.getConstant(32, MVT::i32));
+
+      // The next value may go in the low bits of the same register.
+      // Handle both at once.
+      if (i+1 < RVLocs.size() && RVLocs[i+1].getLocReg() == VA.getLocReg()) {
+        SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, OutVals[i+1]);
+        OutVal = DAG.getNode(ISD::OR, DL, MVT::i64, OutVal, NV);
+        // Skip the next value, it's already done.
+        ++i;
+      }
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue SparcTargetLowering::
+LowerFormalArguments(SDValue Chain,
+                     CallingConv::ID CallConv,
+                     bool IsVarArg,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     SDLoc DL,
+                     SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget->is64Bit())
+    return LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins,
+                                   DL, DAG, InVals);
+  return LowerFormalArguments_32(Chain, CallConv, IsVarArg, Ins,
+                                 DL, DAG, InVals);
+}
+
+/// LowerFormalArguments32 - V8 uses a very simple ABI, where all values are
+/// passed in either one or two GPRs, including FP values.  TODO: we should
+/// pass FP values in FP registers for fastcc functions.
+SDValue SparcTargetLowering::
+LowerFormalArguments_32(SDValue Chain,
+                        CallingConv::ID CallConv,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::InputArg> &Ins,
+                        SDLoc dl,
+                        SelectionDAG &DAG,
+                        SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
+
+  const unsigned StackOffset = 92;
+
+  unsigned InIdx = 0;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (Ins[InIdx].Flags.isSRet()) {
+      if (InIdx != 0)
+        report_fatal_error("sparc only supports sret on the first parameter");
+      // Get SRet from [%fp+64].
+      int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+      SDValue Arg = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
+                                MachinePointerInfo(),
+                                false, false, false, 0);
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    if (VA.isRegLoc()) {
+      if (VA.needsCustom()) {
+        assert(VA.getLocVT() == MVT::f64);
+        unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+        MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
+        SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
+
+        assert(i+1 < e);
+        CCValAssign &NextVA = ArgLocs[++i];
+
+        SDValue LoVal;
+        if (NextVA.isMemLoc()) {
+          int FrameIdx = MF.getFrameInfo()->
+            CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
+          SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
+                              MachinePointerInfo(),
+                              false, false, false, 0);
+        } else {
+          unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
+                                        &SP::IntRegsRegClass);
+          LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
+        }
+        SDValue WholeValue =
+          DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+        WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
+        InVals.push_back(WholeValue);
+        continue;
+      }
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
+      SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+      if (VA.getLocVT() == MVT::f32)
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Arg);
+      else if (VA.getLocVT() != MVT::i32) {
+        Arg = DAG.getNode(ISD::AssertSext, dl, MVT::i32, Arg,
+                          DAG.getValueType(VA.getLocVT()));
+        Arg = DAG.getNode(ISD::TRUNCATE, dl, VA.getLocVT(), Arg);
+      }
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    unsigned Offset = VA.getLocMemOffset()+StackOffset;
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::f64);
+      // If it is double-word aligned, just load.
+      if (Offset % 8 == 0) {
+        int FI = MF.getFrameInfo()->CreateFixedObject(8,
+                                                      Offset,
+                                                      true);
+        SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+        SDValue Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
+                                   MachinePointerInfo(),
+                                   false,false, false, 0);
+        InVals.push_back(Load);
+        continue;
+      }
+
+      int FI = MF.getFrameInfo()->CreateFixedObject(4,
+                                                    Offset,
+                                                    true);
+      SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
+                                  MachinePointerInfo(),
+                                  false, false, false, 0);
+      int FI2 = MF.getFrameInfo()->CreateFixedObject(4,
+                                                     Offset+4,
+                                                     true);
+      SDValue FIPtr2 = DAG.getFrameIndex(FI2, getPointerTy());
+
+      SDValue LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr2,
+                                  MachinePointerInfo(),
+                                  false, false, false, 0);
+
+      SDValue WholeValue =
+        DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+      WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
+      InVals.push_back(WholeValue);
+      continue;
+    }
+
+    int FI = MF.getFrameInfo()->CreateFixedObject(4,
+                                                  Offset,
+                                                  true);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue Load ;
+    if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
+      Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
+                         MachinePointerInfo(),
+                         false, false, false, 0);
+    } else {
+      ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
+      // Sparc is big endian, so add an offset based on the ObjectVT.
+      unsigned Offset = 4-std::max(1U, VA.getValVT().getSizeInBits()/8);
+      FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr,
+                          DAG.getConstant(Offset, MVT::i32));
+      Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr,
+                            MachinePointerInfo(),
+                            VA.getValVT(), false, false,0);
+      Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load);
+    }
+    InVals.push_back(Load);
+  }
+
+  if (MF.getFunction()->hasStructRetAttr()) {
+    // Copy the SRet Argument to SRetReturnReg.
+    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+    unsigned Reg = SFI->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
+      SFI->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+  }
+
+  // Store remaining ArgRegs to the stack if this is a varargs function.
+  if (isVarArg) {
+    static const MCPhysReg ArgRegs[] = {
+      SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+    };
+    unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs, 6);
+    const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
+    unsigned ArgOffset = CCInfo.getNextStackOffset();
+    if (NumAllocated == 6)
+      ArgOffset += StackOffset;
+    else {
+      assert(!ArgOffset);
+      ArgOffset = 68+4*NumAllocated;
+    }
+
+    // Remember the vararg offset for the va_start implementation.
+    FuncInfo->setVarArgsFrameOffset(ArgOffset);
+
+    std::vector<SDValue> OutChains;
+
+    for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
+      SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
+
+      int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset,
+                                                          true);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+      OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+      ArgOffset += 4;
+    }
+
+    if (!OutChains.empty()) {
+      OutChains.push_back(Chain);
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+    }
+  }
+
+  return Chain;
+}
+
+// Lower formal arguments for the 64 bit ABI.
+SDValue SparcTargetLowering::
+LowerFormalArguments_64(SDValue Chain,
+                        CallingConv::ID CallConv,
+                        bool IsVarArg,
+                        const SmallVectorImpl<ISD::InputArg> &Ins,
+                        SDLoc DL,
+                        SelectionDAG &DAG,
+                        SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Analyze arguments according to CC_Sparc64.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);
+
+  // The argument array begins at %fp+BIAS+128, after the register save area.
+  const unsigned ArgArea = 128;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // This argument is passed in a register.
+      // All integer register arguments are promoted by the caller to i64.
+
+      // Create a virtual register for the promoted live-in value.
+      unsigned VReg = MF.addLiveIn(VA.getLocReg(),
+                                   getRegClassFor(VA.getLocVT()));
+      SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
+
+      // Get the high bits for i32 struct elements.
+      if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+        Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
+                          DAG.getConstant(32, MVT::i32));
+
+      // The caller promoted the argument, so insert an Assert?ext SDNode so we
+      // won't promote the value again in this function.
+      switch (VA.getLocInfo()) {
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
+                          DAG.getValueType(VA.getValVT()));
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
+                          DAG.getValueType(VA.getValVT()));
+        break;
+      default:
+        break;
+      }
+
+      // Truncate the register down to the argument type.
+      if (VA.isExtInLoc())
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    // The registers are exhausted. This argument was passed on the stack.
+    assert(VA.isMemLoc());
+    // The CC_Sparc64_Full/Half functions compute stack offsets relative to the
+    // beginning of the arguments area at %fp+BIAS+128.
+    unsigned Offset = VA.getLocMemOffset() + ArgArea;
+    unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
+    // Adjust offset for extended arguments, SPARC is big-endian.
+    // The caller will have written the full slot with extended bytes, but we
+    // prefer our own extending loads.
+    if (VA.isExtInLoc())
+      Offset += 8 - ValSize;
+    int FI = MF.getFrameInfo()->CreateFixedObject(ValSize, Offset, true);
+    InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain,
+                                 DAG.getFrameIndex(FI, getPointerTy()),
+                                 MachinePointerInfo::getFixedStack(FI),
+                                 false, false, false, 0));
+  }
+
+  if (!IsVarArg)
+    return Chain;
+
+  // This function takes variable arguments, some of which may have been passed
+  // in registers %i0-%i5. Variable floating point arguments are never passed
+  // in floating point registers. They go on %i0-%i5 or on the stack like
+  // integer arguments.
+  //
+  // The va_start intrinsic needs to know the offset to the first variable
+  // argument.
+  unsigned ArgOffset = CCInfo.getNextStackOffset();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  // Skip the 128 bytes of register save area.
+  FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgArea +
+                                  Subtarget->getStackPointerBias());
+
+  // Save the variable arguments that were passed in registers.
+  // The caller is required to reserve stack space for 6 arguments regardless
+  // of how many arguments were actually passed.
+  SmallVector<SDValue, 8> OutChains;
+  for (; ArgOffset < 6*8; ArgOffset += 8) {
+    unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
+    SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+    int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true);
+    OutChains.push_back(DAG.getStore(Chain, DL, VArg,
+                                     DAG.getFrameIndex(FI, getPointerTy()),
+                                     MachinePointerInfo::getFixedStack(FI),
+                                     false, false, 0));
+  }
+
+  if (!OutChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget->is64Bit())
+    return LowerCall_64(CLI, InVals);
+  return LowerCall_32(CLI, InVals);
+}
+
+static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
+                                     ImmutableCallSite *CS) {
+  if (CS)
+    return CS->hasFnAttr(Attribute::ReturnsTwice);
+
+  const Function *CalleeFn = nullptr;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+  }
+
+  if (!CalleeFn)
+    return false;
+  return CalleeFn->hasFnAttribute(Attribute::ReturnsTwice);
+}
+
+// Lower a call for the 32-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+                                  SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // Sparc target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+  // Keep stack frames 8-byte aligned.
+  ArgsSize = (ArgsSize+7) & ~7;
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Create local copies for byval args.
+  SmallVector<SDValue, 8> ByValArgs;
+  for (unsigned i = 0,  e = Outs.size(); i != e; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    SDValue Arg = OutVals[i];
+    unsigned Size = Flags.getByValSize();
+    unsigned Align = Flags.getByValAlign();
+
+    int FI = MFI->CreateStackObject(Size, Align, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue SizeNode = DAG.getConstant(Size, MVT::i32);
+
+    Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
+                          false,        // isVolatile,
+                          (Size <= 32), // AlwaysInline if size <= 32
+                          MachinePointerInfo(), MachinePointerInfo());
+    ByValArgs.push_back(FIPtr);
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                               dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  const unsigned StackOffset = 92;
+  bool hasStructRetAttr = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+    // Use local copy if it is a byval arg.
+    if (Flags.isByVal())
+      Arg = ByValArgs[byvalArgIdx++];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (Flags.isSRet()) {
+      assert(VA.needsCustom());
+      // store SRet argument in %sp+64
+      SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+      SDValue PtrOff = DAG.getIntPtrConstant(64);
+      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                         MachinePointerInfo(),
+                                         false, false, 0));
+      hasStructRetAttr = true;
+      continue;
+    }
+
+    if (VA.needsCustom()) {
+      assert(VA.getLocVT() == MVT::f64);
+
+      if (VA.isMemLoc()) {
+        unsigned Offset = VA.getLocMemOffset() + StackOffset;
+        // if it is double-word aligned, just store.
+        if (Offset % 8 == 0) {
+          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+          SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                             MachinePointerInfo(),
+                                             false, false, 0));
+          continue;
+        }
+      }
+
+      SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32);
+      SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+                                   Arg, StackPtr, MachinePointerInfo(),
+                                   false, false, 0);
+      // Sparc is big-endian, so the high part comes first.
+      SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
+                               MachinePointerInfo(), false, false, false, 0);
+      // Increment the pointer to the other half.
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             DAG.getIntPtrConstant(4));
+      // Load the low part.
+      SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
+                               MachinePointerInfo(), false, false, false, 0);
+
+      if (VA.isRegLoc()) {
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi));
+        assert(i+1 != e);
+        CCValAssign &NextVA = ArgLocs[++i];
+        if (NextVA.isRegLoc()) {
+          RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo));
+        } else {
+          // Store the low part in stack.
+          unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
+          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+          SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
+                                             MachinePointerInfo(),
+                                             false, false, 0));
+        }
+      } else {
+        unsigned Offset = VA.getLocMemOffset() + StackOffset;
+        // Store the high part.
+        SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+        // Store the low part.
+        PtrOff = DAG.getIntPtrConstant(Offset+4);
+        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      }
+      continue;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      if (VA.getLocVT() != MVT::f32) {
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+        continue;
+      }
+      Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()+StackOffset);
+    PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+  }
+
+
+  // Emit all stores, make sure the occur before any copies into physregs.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    unsigned Reg = toCallerWindow(RegsToPass[i].first);
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
+                 ? SparcMCExpr::VK_Sparc_WPLT30 : 0);
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32, TF);
+
+  // Returns a chain & a flag for retval copy to use
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  if (hasStructRetAttr)
+    Ops.push_back(DAG.getTargetConstant(SRetArgSize, MVT::i32));
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI =
+    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+  const uint32_t *Mask = ((hasReturnsTwice)
+                          ? TRI->getRTCallPreservedMask(CallConv)
+                          : TRI->getCallPreservedMask(CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+
+  RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+// This functions returns true if CalleeName is a ABI function that returns
+// a long double (fp128).
+static bool isFP128ABICall(const char *CalleeName)
+{
+  static const char *const ABICalls[] =
+    {  "_Q_add", "_Q_sub", "_Q_mul", "_Q_div",
+       "_Q_sqrt", "_Q_neg",
+       "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
+       "_Q_lltoq", "_Q_ulltoq",
+       nullptr
+    };
+  for (const char * const *I = ABICalls; *I != nullptr; ++I)
+    if (strcmp(CalleeName, *I) == 0)
+      return true;
+  return false;
+}
+
+unsigned
+SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
+{
+  const Function *CalleeFn = nullptr;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+    if (!CalleeFn && isFP128ABICall(CalleeName))
+      return 16; // Return sizeof(fp128)
+  }
+
+  if (!CalleeFn)
+    return 0;
+
+  assert(CalleeFn->hasStructRetAttr() &&
+         "Callee does not have the StructRet attribute.");
+
+  PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
+  Type *ElementTy = Ty->getElementType();
+  return getDataLayout()->getTypeAllocSize(ElementTy);
+}
+
+
+// Fixup floating point arguments in the ... part of a varargs call.
+//
+// The SPARC v9 ABI requires that floating point arguments are treated the same
+// as integers when calling a varargs function. This does not apply to the
+// fixed arguments that are part of the function's prototype.
+//
+// This function post-processes a CCValAssign array created by
+// AnalyzeCallOperands().
+static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
+                                   ArrayRef<ISD::OutputArg> Outs) {
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = ArgLocs[i];
+    MVT ValTy = VA.getLocVT();
+    // FIXME: What about f32 arguments? C promotes them to f64 when calling
+    // varargs functions.
+    if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
+      continue;
+    // The fixed arguments to a varargs function still go in FP registers.
+    if (Outs[VA.getValNo()].IsFixed)
+      continue;
+
+    // This floating point argument should be reassigned.
+    CCValAssign NewVA;
+
+    // Determine the offset into the argument array.
+    unsigned firstReg = (ValTy == MVT::f64) ? SP::D0 : SP::Q0;
+    unsigned argSize  = (ValTy == MVT::f64) ? 8 : 16;
+    unsigned Offset = argSize * (VA.getLocReg() - firstReg);
+    assert(Offset < 16*8 && "Offset out of range, bad register enum?");
+
+    if (Offset < 6*8) {
+      // This argument should go in %i0-%i5.
+      unsigned IReg = SP::I0 + Offset/8;
+      if (ValTy == MVT::f64)
+        // Full register, just bitconvert into i64.
+        NewVA = CCValAssign::getReg(VA.getValNo(), VA.getValVT(),
+                                    IReg, MVT::i64, CCValAssign::BCvt);
+      else {
+        assert(ValTy == MVT::f128 && "Unexpected type!");
+        // Full register, just bitconvert into i128 -- We will lower this into
+        // two i64s in LowerCall_64.
+        NewVA = CCValAssign::getCustomReg(VA.getValNo(), VA.getValVT(),
+                                          IReg, MVT::i128, CCValAssign::BCvt);
+      }
+    } else {
+      // This needs to go to memory, we're out of integer registers.
+      NewVA = CCValAssign::getMem(VA.getValNo(), VA.getValVT(),
+                                  Offset, VA.getLocVT(), VA.getLocInfo());
+    }
+    ArgLocs[i] = NewVA;
+  }
+}
+
+// Lower a call for the 64-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+                                  SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc DL = CLI.DL;
+  SDValue Chain = CLI.Chain;
+
+  // Sparc target does not yet support tail call optimization.
+  CLI.IsTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  // The stack offset computed by CC_Sparc64 includes all arguments.
+  // Called functions expect 6 argument words to exist in the stack frame, used
+  // or not.
+  unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset());
+
+  // Keep stack frames 16-byte aligned.
+  ArgsSize = RoundUpToAlignment(ArgsSize, 16);
+
+  // Varargs calls require special treatment.
+  if (CLI.IsVarArg)
+    fixupVariableFloatArgs(ArgLocs, CLI.Outs);
+
+  // Adjust the stack pointer to make room for the arguments.
+  // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
+  // with more than 6 arguments.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                               DL);
+
+  // Collect the set of registers to pass to the function and their values.
+  // This will be emitted as a sequence of CopyToReg nodes glued to the call
+  // instruction.
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+  // Collect chains from all the memory opeations that copy arguments to the
+  // stack. They must follow the stack pointer adjustment above and precede the
+  // call instruction itself.
+  SmallVector<SDValue, 8> MemOpChains;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = CLI.OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown location info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      // fixupVariableFloatArgs() may create bitcasts from f128 to i128. But
+      // SPARC does not support i128 natively. Lower it into two i64, see below.
+      if (!VA.needsCustom() || VA.getValVT() != MVT::f128
+          || VA.getLocVT() != MVT::i128)
+        Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      if (VA.needsCustom() && VA.getValVT() == MVT::f128
+          && VA.getLocVT() == MVT::i128) {
+        // Store and reload into the interger register reg and reg+1.
+        unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
+        unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
+        SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy());
+        SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset);
+        HiPtrOff         = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+                                       HiPtrOff);
+        SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8);
+        LoPtrOff         = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+                                       LoPtrOff);
+
+        // Store to %sp+BIAS+128+Offset
+        SDValue Store = DAG.getStore(Chain, DL, Arg, HiPtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, 0);
+        // Load into Reg and Reg+1
+        SDValue Hi64 = DAG.getLoad(MVT::i64, DL, Store, HiPtrOff,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+        SDValue Lo64 = DAG.getLoad(MVT::i64, DL, Store, LoPtrOff,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+        RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
+                                            Hi64));
+        RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
+                                            Lo64));
+        continue;
+      }
+
+      // The custom bit on an i32 return value indicates that it should be
+      // passed in the high bits of the register.
+      if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+        Arg = DAG.getNode(ISD::SHL, DL, MVT::i64, Arg,
+                          DAG.getConstant(32, MVT::i32));
+
+        // The next value may go in the low bits of the same register.
+        // Handle both at once.
+        if (i+1 < ArgLocs.size() && ArgLocs[i+1].isRegLoc() &&
+            ArgLocs[i+1].getLocReg() == VA.getLocReg()) {
+          SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64,
+                                   CLI.OutVals[i+1]);
+          Arg = DAG.getNode(ISD::OR, DL, MVT::i64, Arg, NV);
+          // Skip the next value, it's already done.
+          ++i;
+        }
+      }
+      RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy());
+    // The argument area starts at %fp+BIAS+128 in the callee frame,
+    // %sp+BIAS+128 in ours.
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
+                                           Subtarget->getStackPointerBias() +
+                                           128);
+    PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+    MemOpChains.push_back(DAG.getStore(Chain, DL, Arg, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+  }
+
+  // Emit all stores, make sure they occur before the call.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // Build a sequence of CopyToReg nodes glued together with token chain and
+  // glue operands which copy the outgoing args into registers. The InGlue is
+  // necessary since all emitted instructions must be stuck together in order
+  // to pass the live physical registers.
+  SDValue InGlue;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL,
+                             RegsToPass[i].first, RegsToPass[i].second, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  SDValue Callee = CLI.Callee;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+  unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
+                 ? SparcMCExpr::VK_Sparc_WPLT30 : 0);
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
+                                        TF);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), TF);
+
+  // Build the operands for the call instruction itself.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI =
+    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+  const uint32_t *Mask = ((hasReturnsTwice)
+                          ? TRI->getRTCallPreservedMask(CLI.CallConv)
+                          : TRI->getCallPreservedMask(CLI.CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Make sure the CopyToReg nodes are glued to the call instruction which
+  // consumes the registers.
+  if (InGlue.getNode())
+    Ops.push_back(InGlue);
+
+  // Now the call itself.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops);
+  InGlue = Chain.getValue(1);
+
+  // Revert the stack pointer immediately after the call.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                             DAG.getIntPtrConstant(0, true), InGlue, DL);
+  InGlue = Chain.getValue(1);
+
+  // Now extract the return values. This is more or less the same as
+  // LowerFormalArguments_64.
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+
+  // Set inreg flag manually for codegen generated library calls that
+  // return float.
+  if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == nullptr)
+    CLI.Ins[0].Flags.setInReg();
+
+  RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    unsigned Reg = toCallerWindow(VA.getLocReg());
+
+    // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
+    // reside in the same register in the high and low bits. Reuse the
+    // CopyFromReg previous node to avoid duplicate copies.
+    SDValue RV;
+    if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
+      if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
+        RV = Chain.getValue(0);
+
+    // But usually we'll create a new CopyFromReg for a different register.
+    if (!RV.getNode()) {
+      RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
+      Chain = RV.getValue(1);
+      InGlue = Chain.getValue(2);
+    }
+
+    // Get the high bits for i32 struct elements.
+    if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+      RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
+                       DAG.getConstant(32, MVT::i32));
+
+    // The callee promoted the return value, so insert an Assert?ext SDNode so
+    // we won't promote the value again in this function.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::SExt:
+      RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
+                       DAG.getValueType(VA.getValVT()));
+      break;
+    case CCValAssign::ZExt:
+      RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
+                       DAG.getValueType(VA.getValVT()));
+      break;
+    default:
+      break;
+    }
+
+    // Truncate the register down to the return value type.
+    if (VA.isExtInLoc())
+      RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
+
+    InVals.push_back(RV);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
+/// condition.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown integer condition code!");
+  case ISD::SETEQ:  return SPCC::ICC_E;
+  case ISD::SETNE:  return SPCC::ICC_NE;
+  case ISD::SETLT:  return SPCC::ICC_L;
+  case ISD::SETGT:  return SPCC::ICC_G;
+  case ISD::SETLE:  return SPCC::ICC_LE;
+  case ISD::SETGE:  return SPCC::ICC_GE;
+  case ISD::SETULT: return SPCC::ICC_CS;
+  case ISD::SETULE: return SPCC::ICC_LEU;
+  case ISD::SETUGT: return SPCC::ICC_GU;
+  case ISD::SETUGE: return SPCC::ICC_CC;
+  }
+}
+
+/// FPCondCCodeToFCC - Convert a DAG floatingp oint condition code to a SPARC
+/// FCC condition.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return SPCC::FCC_E;
+  case ISD::SETNE:
+  case ISD::SETUNE: return SPCC::FCC_NE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return SPCC::FCC_L;
+  case ISD::SETGT:
+  case ISD::SETOGT: return SPCC::FCC_G;
+  case ISD::SETLE:
+  case ISD::SETOLE: return SPCC::FCC_LE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return SPCC::FCC_GE;
+  case ISD::SETULT: return SPCC::FCC_UL;
+  case ISD::SETULE: return SPCC::FCC_ULE;
+  case ISD::SETUGT: return SPCC::FCC_UG;
+  case ISD::SETUGE: return SPCC::FCC_UGE;
+  case ISD::SETUO:  return SPCC::FCC_U;
+  case ISD::SETO:   return SPCC::FCC_O;
+  case ISD::SETONE: return SPCC::FCC_LG;
+  case ISD::SETUEQ: return SPCC::FCC_UE;
+  }
+}
+
+SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM, new SparcELFTargetObjectFile()) {
+  Subtarget = &TM.getSubtarget<SparcSubtarget>();
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
+  addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
+  addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+  addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
+
+  // Turn FP extload into load/fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+
+  // Sparc doesn't have i1 sign extending load
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+  // Custom legalize GlobalAddress nodes into LO/HI parts.
+  setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom);
+  setOperationAction(ISD::GlobalTLSAddress, getPointerTy(), Custom);
+  setOperationAction(ISD::ConstantPool, getPointerTy(), Custom);
+  setOperationAction(ISD::BlockAddress, getPointerTy(), Custom);
+
+  // Sparc doesn't have sext_inreg, replace them with shl/sra
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+  // Sparc has no REM or DIVREM operations.
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  // ... nor does SparcV9.
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UREM, MVT::i64, Expand);
+    setOperationAction(ISD::SREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  }
+
+  // Custom expand fp<->sint
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+
+  // Custom Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
+  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+
+  // Sparc has no select or setcc: expand to SELECT_CC.
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Expand);
+
+  // Sparc doesn't have BRCOND either, it has BR_CC.
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ADDC, MVT::i64, Custom);
+    setOperationAction(ISD::ADDE, MVT::i64, Custom);
+    setOperationAction(ISD::SUBC, MVT::i64, Custom);
+    setOperationAction(ISD::SUBE, MVT::i64, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::i64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+    setOperationAction(ISD::CTPOP, MVT::i64,
+                       Subtarget->usePopc() ? Legal : Expand);
+    setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+    setOperationAction(ISD::ROTL , MVT::i64, Expand);
+    setOperationAction(ISD::ROTR , MVT::i64, Expand);
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  }
+
+  // ATOMICs.
+  // FIXME: We insert fences for each atomics and generate sub-optimal code
+  // for PSO/TSO. Also, implement other atomicrmw operations.
+
+  setInsertFencesForAtomic(true);
+
+  setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32,
+                     (Subtarget->isV9() ? Legal: Expand));
+
+
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
+
+  // Custom Lower Atomic LOAD/STORE
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal);
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
+  }
+
+  if (!Subtarget->isV9()) {
+    // SparcV8 does not have FNEGD and FABSD.
+    setOperationAction(ISD::FNEG, MVT::f64, Custom);
+    setOperationAction(ISD::FABS, MVT::f64, Custom);
+  }
+
+  setOperationAction(ISD::FSIN , MVT::f128, Expand);
+  setOperationAction(ISD::FCOS , MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FREM , MVT::f128, Expand);
+  setOperationAction(ISD::FMA  , MVT::f128, Expand);
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f128, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // FIXME: Sparc provides these multiplies, but we don't have them yet.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHS,     MVT::i64, Expand);
+
+    setOperationAction(ISD::UMULO,     MVT::i64, Custom);
+    setOperationAction(ISD::SMULO,     MVT::i64, Custom);
+
+    setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+    setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+    setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  }
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  // VAARG needs to be lowered to not do unaligned accesses for doubles.
+  setOperationAction(ISD::VAARG             , MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP              , MVT::Other, Legal);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+
+  setExceptionPointerRegister(SP::I0);
+  setExceptionSelectorRegister(SP::I1);
+
+  setStackPointerRegisterToSaveRestore(SP::O6);
+
+  setOperationAction(ISD::CTPOP, MVT::i32,
+                     Subtarget->usePopc() ? Legal : Expand);
+
+  if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::LOAD, MVT::f128, Legal);
+    setOperationAction(ISD::STORE, MVT::f128, Legal);
+  } else {
+    setOperationAction(ISD::LOAD, MVT::f128, Custom);
+    setOperationAction(ISD::STORE, MVT::f128, Custom);
+  }
+
+  if (Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::FADD,  MVT::f128, Legal);
+    setOperationAction(ISD::FSUB,  MVT::f128, Legal);
+    setOperationAction(ISD::FMUL,  MVT::f128, Legal);
+    setOperationAction(ISD::FDIV,  MVT::f128, Legal);
+    setOperationAction(ISD::FSQRT, MVT::f128, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Legal);
+    if (Subtarget->isV9()) {
+      setOperationAction(ISD::FNEG, MVT::f128, Legal);
+      setOperationAction(ISD::FABS, MVT::f128, Legal);
+    } else {
+      setOperationAction(ISD::FNEG, MVT::f128, Custom);
+      setOperationAction(ISD::FABS, MVT::f128, Custom);
+    }
+
+    if (!Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+    }
+
+  } else {
+    // Custom legalize f128 operations.
+
+    setOperationAction(ISD::FADD,  MVT::f128, Custom);
+    setOperationAction(ISD::FSUB,  MVT::f128, Custom);
+    setOperationAction(ISD::FMUL,  MVT::f128, Custom);
+    setOperationAction(ISD::FDIV,  MVT::f128, Custom);
+    setOperationAction(ISD::FSQRT, MVT::f128, Custom);
+    setOperationAction(ISD::FNEG,  MVT::f128, Custom);
+    setOperationAction(ISD::FABS,  MVT::f128, Custom);
+
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
+
+    // Setup Runtime library names.
+    if (Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::ADD_F128,  "_Qp_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Qp_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Qp_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Qp_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
+    } else {
+      setLibcallName(RTLIB::ADD_F128,  "_Q_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Q_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Q_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Q_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
+    }
+  }
+
+  setMinFunctionAlignment(2);
+
+  computeRegisterProperties();
+}
+
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  case SPISD::CMPICC:     return "SPISD::CMPICC";
+  case SPISD::CMPFCC:     return "SPISD::CMPFCC";
+  case SPISD::BRICC:      return "SPISD::BRICC";
+  case SPISD::BRXCC:      return "SPISD::BRXCC";
+  case SPISD::BRFCC:      return "SPISD::BRFCC";
+  case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+  case SPISD::SELECT_XCC: return "SPISD::SELECT_XCC";
+  case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+  case SPISD::Hi:         return "SPISD::Hi";
+  case SPISD::Lo:         return "SPISD::Lo";
+  case SPISD::FTOI:       return "SPISD::FTOI";
+  case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::FTOX:       return "SPISD::FTOX";
+  case SPISD::XTOF:       return "SPISD::XTOF";
+  case SPISD::CALL:       return "SPISD::CALL";
+  case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
+  case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
+  case SPISD::FLUSHW:     return "SPISD::FLUSHW";
+  case SPISD::TLS_ADD:    return "SPISD::TLS_ADD";
+  case SPISD::TLS_LD:     return "SPISD::TLS_LD";
+  case SPISD::TLS_CALL:   return "SPISD::TLS_CALL";
+  }
+}
+
+EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
+/// be zero. Op is expected to be a target specific node. Used by DAG
+/// combiner.
+void SparcTargetLowering::computeKnownBitsForTargetNode
+                                (const SDValue Op,
+                                 APInt &KnownZero,
+                                 APInt &KnownOne,
+                                 const SelectionDAG &DAG,
+                                 unsigned Depth) const {
+  APInt KnownZero2, KnownOne2;
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case SPISD::SELECT_ICC:
+  case SPISD::SELECT_XCC:
+  case SPISD::SELECT_FCC:
+    DAG.computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction.  If so
+// set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  if (isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->isNullValue() &&
+      CC == ISD::SETNE &&
+      (((LHS.getOpcode() == SPISD::SELECT_ICC ||
+         LHS.getOpcode() == SPISD::SELECT_XCC) &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
+       (LHS.getOpcode() == SPISD::SELECT_FCC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
+      isa<ConstantSDNode>(LHS.getOperand(0)) &&
+      isa<ConstantSDNode>(LHS.getOperand(1)) &&
+      cast<ConstantSDNode>(LHS.getOperand(0))->isOne() &&
+      cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) {
+    SDValue CMPCC = LHS.getOperand(3);
+    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+    LHS = CMPCC.getOperand(0);
+    RHS = CMPCC.getOperand(1);
+  }
+}
+
+// Convert to a target node and set target flags.
+SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
+                                             SelectionDAG &DAG) const {
+  if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
+    return DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                      SDLoc(GA),
+                                      GA->getValueType(0),
+                                      GA->getOffset(), TF);
+
+  if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
+    return DAG.getTargetConstantPool(CP->getConstVal(),
+                                     CP->getValueType(0),
+                                     CP->getAlignment(),
+                                     CP->getOffset(), TF);
+
+  if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
+    return DAG.getTargetBlockAddress(BA->getBlockAddress(),
+                                     Op.getValueType(),
+                                     0,
+                                     TF);
+
+  if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
+    return DAG.getTargetExternalSymbol(ES->getSymbol(),
+                                       ES->getValueType(0), TF);
+
+  llvm_unreachable("Unhandled address SDNode");
+}
+
+// Split Op into high and low parts according to HiTF and LoTF.
+// Return an ADD node combining the parts.
+SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
+                                          unsigned HiTF, unsigned LoTF,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Hi = DAG.getNode(SPISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
+  SDValue Lo = DAG.getNode(SPISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
+  return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
+}
+
+// Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
+// or ExternalSymbol SDNode.
+SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = getPointerTy();
+
+  // Handle PIC mode first.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    // This is the pic32 code model, the GOT is known to be smaller than 4GB.
+    SDValue HiLo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
+                                SparcMCExpr::VK_Sparc_GOT10, DAG);
+    SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
+    SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setHasCalls(true);
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
+                       MachinePointerInfo::getGOT(), false, false, false, 0);
+  }
+
+  // This is one of the absolute code models.
+  switch(getTargetMachine().getCodeModel()) {
+  default:
+    llvm_unreachable("Unsupported absolute code model");
+  case CodeModel::Small:
+    // abs32.
+    return makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+                        SparcMCExpr::VK_Sparc_LO, DAG);
+  case CodeModel::Medium: {
+    // abs44.
+    SDValue H44 = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_H44,
+                               SparcMCExpr::VK_Sparc_M44, DAG);
+    H44 = DAG.getNode(ISD::SHL, DL, VT, H44, DAG.getConstant(12, MVT::i32));
+    SDValue L44 = withTargetFlags(Op, SparcMCExpr::VK_Sparc_L44, DAG);
+    L44 = DAG.getNode(SPISD::Lo, DL, VT, L44);
+    return DAG.getNode(ISD::ADD, DL, VT, H44, L44);
+  }
+  case CodeModel::Large: {
+    // abs64.
+    SDValue Hi = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HH,
+                              SparcMCExpr::VK_Sparc_HM, DAG);
+    Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, DAG.getConstant(32, MVT::i32));
+    SDValue Lo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+                              SparcMCExpr::VK_Sparc_LO, DAG);
+    return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
+  }
+  }
+}
+
+SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    unsigned HiTF = ((model == TLSModel::GeneralDynamic)
+                     ? SparcMCExpr::VK_Sparc_TLS_GD_HI22
+                     : SparcMCExpr::VK_Sparc_TLS_LDM_HI22);
+    unsigned LoTF = ((model == TLSModel::GeneralDynamic)
+                     ? SparcMCExpr::VK_Sparc_TLS_GD_LO10
+                     : SparcMCExpr::VK_Sparc_TLS_LDM_LO10);
+    unsigned addTF = ((model == TLSModel::GeneralDynamic)
+                      ? SparcMCExpr::VK_Sparc_TLS_GD_ADD
+                      : SparcMCExpr::VK_Sparc_TLS_LDM_ADD);
+    unsigned callTF = ((model == TLSModel::GeneralDynamic)
+                       ? SparcMCExpr::VK_Sparc_TLS_GD_CALL
+                       : SparcMCExpr::VK_Sparc_TLS_LDM_CALL);
+
+    SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+    SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
+                               withTargetFlags(Op, addTF, DAG));
+
+    SDValue Chain = DAG.getEntryNode();
+    SDValue InFlag;
+
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, true), DL);
+    Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
+    InFlag = Chain.getValue(1);
+    SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
+    SDValue Symbol = withTargetFlags(Op, callTF, DAG);
+
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+    Ops.push_back(Symbol);
+    Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
+    const uint32_t *Mask = getTargetMachine()
+      .getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
+    InFlag = Chain.getValue(1);
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+    SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                 withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_HIX22, DAG));
+    SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                 withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_LOX10, DAG));
+    HiLo =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
+                   withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_ADD, DAG));
+  }
+
+  if (model == TLSModel::InitialExec) {
+    unsigned ldTF     = ((PtrVT == MVT::i64)? SparcMCExpr::VK_Sparc_TLS_IE_LDX
+                         : SparcMCExpr::VK_Sparc_TLS_IE_LD);
+
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setHasCalls(true);
+
+    SDValue TGA = makeHiLoPair(Op,
+                               SparcMCExpr::VK_Sparc_TLS_IE_HI22,
+                               SparcMCExpr::VK_Sparc_TLS_IE_LO10, DAG);
+    SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
+    SDValue Offset = DAG.getNode(SPISD::TLS_LD,
+                                 DL, PtrVT, Ptr,
+                                 withTargetFlags(Op, ldTF, DAG));
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
+                       DAG.getRegister(SP::G7, PtrVT), Offset,
+                       withTargetFlags(Op,
+                                       SparcMCExpr::VK_Sparc_TLS_IE_ADD, DAG));
+  }
+
+  assert(model == TLSModel::LocalExec);
+  SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                  withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_HIX22, DAG));
+  SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                  withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_LOX10, DAG));
+  SDValue Offset =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getRegister(SP::G7, PtrVT), Offset);
+}
+
+SDValue
+SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+                                          SDValue Arg, SDLoc DL,
+                                          SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListEntry Entry;
+  Entry.Node = Arg;
+  Entry.Ty   = ArgTy;
+
+  if (ArgTy->isFP128Ty()) {
+    // Create a stack object and pass the pointer to the library function.
+    int FI = MFI->CreateStackObject(16, 8, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    Chain = DAG.getStore(Chain,
+                         DL,
+                         Entry.Node,
+                         FIPtr,
+                         MachinePointerInfo(),
+                         false,
+                         false,
+                         8);
+
+    Entry.Node = FIPtr;
+    Entry.Ty   = PointerType::getUnqual(ArgTy);
+  }
+  Args.push_back(Entry);
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                                 const char *LibFuncName,
+                                 unsigned numArgs) const {
+
+  ArgListTy Args;
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+  SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy());
+  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+  Type *RetTyABI = RetTy;
+  SDValue Chain = DAG.getEntryNode();
+  SDValue RetPtr;
+
+  if (RetTy->isFP128Ty()) {
+    // Create a Stack Object to receive the return value of type f128.
+    ArgListEntry Entry;
+    int RetFI = MFI->CreateStackObject(16, 8, false);
+    RetPtr = DAG.getFrameIndex(RetFI, getPointerTy());
+    Entry.Node = RetPtr;
+    Entry.Ty   = PointerType::getUnqual(RetTy);
+    if (!Subtarget->is64Bit())
+      Entry.isSRet = true;
+    Entry.isReturned = false;
+    Args.push_back(Entry);
+    RetTyABI = Type::getVoidTy(*DAG.getContext());
+  }
+
+  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
+  for (unsigned i = 0, e = numArgs; i != e; ++i) {
+    Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
+  }
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(SDLoc(Op)).setChain(Chain)
+    .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // chain is in second result.
+  if (RetTyABI == RetTy)
+    return CallInfo.first;
+
+  assert (RetTy->isFP128Ty() && "Unexpected return type!");
+
+  Chain = CallInfo.second;
+
+  // Load RetPtr to get the return value.
+  return DAG.getLoad(Op.getValueType(),
+                     SDLoc(Op),
+                     Chain,
+                     RetPtr,
+                     MachinePointerInfo(),
+                     false, false, false, 8);
+}
+
+SDValue
+SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+                                      unsigned &SPCC,
+                                      SDLoc DL,
+                                      SelectionDAG &DAG) const {
+
+  const char *LibCall = nullptr;
+  bool is64Bit = Subtarget->is64Bit();
+  switch(SPCC) {
+  default: llvm_unreachable("Unhandled conditional code!");
+  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
+  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
+  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
+  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
+  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
+  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
+  case SPCC::FCC_UL :
+  case SPCC::FCC_ULE:
+  case SPCC::FCC_UG :
+  case SPCC::FCC_UGE:
+  case SPCC::FCC_U  :
+  case SPCC::FCC_O  :
+  case SPCC::FCC_LG :
+  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy());
+  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
+  ArgListTy Args;
+  SDValue Chain = DAG.getEntryNode();
+  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
+  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain)
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // result is in first, and chain is in second result.
+  SDValue Result =  CallInfo.first;
+
+  switch(SPCC) {
+  default: {
+    SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UL : {
+    SDValue Mask   = DAG.getTargetConstant(1, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_ULE: {
+    SDValue RHS = DAG.getTargetConstant(2, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UG :  {
+    SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+    SPCC = SPCC::ICC_G;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UGE: {
+    SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+
+  case SPCC::FCC_U  :  {
+    SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_O  :  {
+    SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_LG :  {
+    SDValue Mask   = DAG.getTargetConstant(3, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UE : {
+    SDValue Mask   = DAG.getTargetConstant(3, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  }
+}
+
+static SDValue
+LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
+                   const SparcTargetLowering &TLI) {
+
+  if (Op.getOperand(0).getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F64_F128), 1);
+
+  if (Op.getOperand(0).getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1);
+
+  llvm_unreachable("fpextend with non-float operand!");
+  return SDValue();
+}
+
+static SDValue
+LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
+                  const SparcTargetLowering &TLI) {
+  // FP_ROUND on f64 and f32 are legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128)
+    return Op;
+
+  if (Op.getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F64), 1);
+  if (Op.getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1);
+
+  llvm_unreachable("fpround to non-float!");
+  return SDValue();
+}
+
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  // Expand f128 operations to fp128 abi calls.
+  if (Op.getOperand(0).getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
+    const char *libName = TLI.getLibcallName(VT == MVT::i32
+                                             ? RTLIB::FPTOSINT_F128_I32
+                                             : RTLIB::FPTOSINT_F128_I64);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the resulting type is illegal.
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // Otherwise, Convert the fp value to integer in an FP register.
+  if (VT == MVT::i32)
+    Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+  else
+    Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || (OpVT == MVT::i64));
+
+  EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+  // Expand f128 operations to fp128 ABI calls.
+  if (Op.getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
+    const char *libName = TLI.getLibcallName(OpVT == MVT::i32
+                                             ? RTLIB::SINTTOFP_I32_F128
+                                             : RTLIB::SINTTOFP_I64_F128);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the operand type is illegal.
+  if (!TLI.isTypeLegal(OpVT))
+    return SDValue();
+
+  // Otherwise, Convert the int value to FP in an FP register.
+  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+  unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
+  return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the resulting type is legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128 ||
+      (hasHardQuad && TLI.isTypeLegal(VT)))
+    return SDValue();
+
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(VT == MVT::i32
+                                            ? RTLIB::FPTOUINT_F128_I32
+                                            : RTLIB::FPTOUINT_F128_I64),
+                         1);
+}
+
+static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the operand type is legal.
+  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
+    return SDValue();
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(OpVT == MVT::i32
+                                            ? RTLIB::UINTTOFP_I32_F128
+                                            : RTLIB::UINTTOFP_I64_F128),
+                         1);
+}
+
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+                          const SparcTargetLowering &TLI,
+                          bool hasHardQuad) {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a br_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  // Get the condition flag.
+  SDValue CompareFlag;
+  if (LHS.getValueType().isInteger()) {
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+    // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
+    Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
+  } else {
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::BRICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      Opc = SPISD::BRFCC;
+    }
+  }
+  return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
+                     DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+                              const SparcTargetLowering &TLI,
+                              bool hasHardQuad) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  SDLoc dl(Op);
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a select_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  SDValue CompareFlag;
+  if (LHS.getValueType().isInteger()) {
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
+    Opc = LHS.getValueType() == MVT::i32 ?
+          SPISD::SELECT_ICC : SPISD::SELECT_XCC;
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+  } else {
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::SELECT_ICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      Opc = SPISD::SELECT_FCC;
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    }
+  }
+  return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+                     DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                            const SparcTargetLowering &TLI) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  // Need frame address to find the address of VarArgsFrameIndex.
+  MF.getFrameInfo()->setFrameAddressIsTaken(true);
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDLoc DL(Op);
+  SDValue Offset =
+    DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(),
+                DAG.getRegister(SP::I6, TLI.getPointerTy()),
+                DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset()));
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  EVT PtrVT = VAListPtr.getValueType();
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc DL(Node);
+  SDValue VAList = DAG.getLoad(PtrVT, DL, InChain, VAListPtr,
+                               MachinePointerInfo(SV), false, false, false, 0);
+  // Increment the pointer, VAList, to the next vaarg.
+  SDValue NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+                                DAG.getIntPtrConstant(VT.getSizeInBits()/8));
+  // Store the incremented VAList to the legalized pointer.
+  InChain = DAG.getStore(VAList.getValue(1), DL, NextPtr,
+                         VAListPtr, MachinePointerInfo(SV), false, false, 0);
+  // Load the actual argument out of the pointer VAList.
+  // We can't count on greater alignment than the word size.
+  return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
+                     false, false, false,
+                     std::min(PtrVT.getSizeInBits(), VT.getSizeInBits())/8);
+}
+
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                       const SparcSubtarget *Subtarget) {
+  SDValue Chain = Op.getOperand(0);  // Legalize the chain.
+  SDValue Size  = Op.getOperand(1);  // Legalize the size.
+  EVT VT = Size->getValueType(0);
+  SDLoc dl(Op);
+
+  unsigned SPReg = SP::O6;
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+  Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP);    // Output chain
+
+  // The resultant pointer is actually 16 words from the bottom of the stack,
+  // to provide a register spill area.
+  unsigned regSpillArea = Subtarget->is64Bit() ? 128 : 96;
+  regSpillArea += Subtarget->getStackPointerBias();
+
+  SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
+                               DAG.getConstant(regSpillArea, VT));
+  SDValue Ops[2] = { NewVal, Chain };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+
+static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  SDValue Chain = DAG.getNode(SPISD::FLUSHW,
+                              dl, MVT::Other, DAG.getEntryNode());
+  return Chain;
+}
+
+static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
+                            const SparcSubtarget *Subtarget) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned FrameReg = SP::I6;
+  unsigned stackBias = Subtarget->getStackPointerBias();
+
+  SDValue FrameAddr;
+
+  if (depth == 0) {
+    FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+    if (Subtarget->is64Bit())
+      FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+                              DAG.getIntPtrConstant(stackBias));
+    return FrameAddr;
+  }
+
+  // flush first to make sure the windowed registers' values are in stack
+  SDValue Chain = getFLUSHW(Op, DAG);
+  FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+
+  unsigned Offset = (Subtarget->is64Bit()) ? (stackBias + 112) : 56;
+
+  while (depth--) {
+    SDValue Ptr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+                              DAG.getIntPtrConstant(Offset));
+    FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo(),
+                            false, false, false, 0);
+  }
+  if (Subtarget->is64Bit())
+    FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+                            DAG.getIntPtrConstant(stackBias));
+  return FrameAddr;
+}
+
+
+static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
+                              const SparcSubtarget *Subtarget) {
+
+  uint64_t depth = Op.getConstantOperandVal(0);
+
+  return getFRAMEADDR(depth, Op, DAG, Subtarget);
+
+}
+
+static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               const SparcSubtarget *Subtarget) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  uint64_t depth = Op.getConstantOperandVal(0);
+
+  SDValue RetAddr;
+  if (depth == 0) {
+    unsigned RetReg = MF.addLiveIn(SP::I7,
+                                   TLI.getRegClassFor(TLI.getPointerTy()));
+    RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
+    return RetAddr;
+  }
+
+  // Need frame address to find return address of the caller.
+  SDValue FrameAddr = getFRAMEADDR(depth - 1, Op, DAG, Subtarget);
+
+  unsigned Offset = (Subtarget->is64Bit()) ? 120 : 60;
+  SDValue Ptr = DAG.getNode(ISD::ADD,
+                            dl, VT,
+                            FrameAddr,
+                            DAG.getIntPtrConstant(Offset));
+  RetAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Ptr,
+                        MachinePointerInfo(), false, false, false, 0);
+
+  return RetAddr;
+}
+
+static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG, unsigned opcode)
+{
+  SDLoc dl(Op);
+
+  assert(Op.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
+  assert(opcode == ISD::FNEG || opcode == ISD::FABS);
+
+  // Lower fneg/fabs on f64 to fneg/fabs on f32.
+  // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
+  // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
+
+  SDValue SrcReg64 = Op.getOperand(0);
+  SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
+                                            SrcReg64);
+  SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
+                                            SrcReg64);
+
+  Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
+
+  SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                dl, MVT::f64), 0);
+  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
+                                       DstReg64, Hi32);
+  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
+                                       DstReg64, Lo32);
+  return DstReg64;
+}
+
+// Lower a f128 load into two f64 loads.
+static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
+{
+  SDLoc dl(Op);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+  assert(LdNode && LdNode->getOffset().getOpcode() == ISD::UNDEF
+         && "Unexpected node type");
+
+  unsigned alignment = LdNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue Hi64 = DAG.getLoad(MVT::f64,
+                             dl,
+                             LdNode->getChain(),
+                             LdNode->getBasePtr(),
+                             LdNode->getPointerInfo(),
+                             false, false, false, alignment);
+  EVT addrVT = LdNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              LdNode->getBasePtr(),
+                              DAG.getConstant(8, addrVT));
+  SDValue Lo64 = DAG.getLoad(MVT::f64,
+                             dl,
+                             LdNode->getChain(),
+                             LoPtr,
+                             LdNode->getPointerInfo(),
+                             false, false, false, alignment);
+
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+  SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                       dl, MVT::f128);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Hi64,
+                               SubRegEven);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Lo64,
+                               SubRegOdd);
+  SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
+                           SDValue(Lo64.getNode(), 1) };
+  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Lower a f128 store into two f64 stores.
+static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+  assert(StNode && StNode->getOffset().getOpcode() == ISD::UNDEF
+         && "Unexpected node type");
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+  SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegEven);
+  SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegOdd);
+
+  unsigned alignment = StNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue OutChains[2];
+  OutChains[0] = DAG.getStore(StNode->getChain(),
+                              dl,
+                              SDValue(Hi64, 0),
+                              StNode->getBasePtr(),
+                              MachinePointerInfo(),
+                              false, false, alignment);
+  EVT addrVT = StNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              StNode->getBasePtr(),
+                              DAG.getConstant(8, addrVT));
+  OutChains[1] = DAG.getStore(StNode->getChain(),
+                             dl,
+                             SDValue(Lo64, 0),
+                             LoPtr,
+                             MachinePointerInfo(),
+                             false, false, alignment);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+  assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
+         && "invalid opcode");
+
+  if (Op.getValueType() == MVT::f64)
+    return LowerF64Op(Op, DAG, Op.getOpcode());
+  if (Op.getValueType() != MVT::f128)
+    return Op;
+
+  // Lower fabs/fneg on f128 to fabs/fneg on f64
+  // fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
+
+  SDLoc dl(Op);
+  SDValue SrcReg128 = Op.getOperand(0);
+  SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
+                                            SrcReg128);
+  SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
+                                            SrcReg128);
+  if (isV9)
+    Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+  else
+    Hi64 = LowerF64Op(Hi64, DAG, Op.getOpcode());
+
+  SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, MVT::f128), 0);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
+                                        DstReg128, Hi64);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
+                                        DstReg128, Lo64);
+  return DstReg128;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+
+  if (Op.getValueType() != MVT::i64)
+    return Op;
+
+  SDLoc dl(Op);
+  SDValue Src1 = Op.getOperand(0);
+  SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
+  SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
+                               DAG.getConstant(32, MVT::i64));
+  Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
+
+  SDValue Src2 = Op.getOperand(1);
+  SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
+  SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
+                               DAG.getConstant(32, MVT::i64));
+  Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
+
+
+  bool hasChain = false;
+  unsigned hiOpc = Op.getOpcode();
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode");
+  case ISD::ADDC: hiOpc = ISD::ADDE; break;
+  case ISD::ADDE: hasChain = true; break;
+  case ISD::SUBC: hiOpc = ISD::SUBE; break;
+  case ISD::SUBE: hasChain = true; break;
+  }
+  SDValue Lo;
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
+  if (hasChain) {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
+                     Op.getOperand(2));
+  } else {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
+  }
+  SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
+  SDValue Carry = Hi.getValue(1);
+
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
+  Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
+                   DAG.getConstant(32, MVT::i64));
+
+  SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
+  SDValue Ops[2] = { Dst, Carry };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
+// in LegalizeDAG.cpp except the order of arguments to the library function.
+static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
+                                const SparcTargetLowering &TLI)
+{
+  unsigned opcode = Op.getOpcode();
+  assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");
+
+  bool isSigned = (opcode == ISD::SMULO);
+  EVT VT = MVT::i64;
+  EVT WideVT = MVT::i128;
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+
+  if (LHS.getValueType() != VT)
+    return Op;
+
+  SDValue ShiftAmt = DAG.getConstant(63, VT);
+
+  SDValue RHS = Op.getOperand(1);
+  SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
+  SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
+  SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
+
+  SDValue MulResult = TLI.makeLibCall(DAG,
+                                      RTLIB::MUL_I128, WideVT,
+                                      Args, 4, isSigned, dl).first;
+  SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+                                   MulResult, DAG.getIntPtrConstant(0));
+  SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+                                MulResult, DAG.getIntPtrConstant(1));
+  if (isSigned) {
+    SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
+    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
+  } else {
+    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, VT),
+                           ISD::SETNE);
+  }
+  // MulResult is a node with an illegal type. Because such things are not
+  // generally permitted during this phase of legalization, delete the
+  // node. The above EXTRACT_ELEMENT nodes should have been folded.
+  DAG.DeleteNode(MulResult.getNode());
+
+  SDValue Ops[2] = { BottomHalf, TopHalf } ;
+  return DAG.getMergeValues(Ops, dl);
+}
+
+static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
+  // Monotonic load/stores are legal.
+  if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
+    return Op;
+
+  // Otherwise, expand with a fence.
+  return SDValue();
+}
+
+
+SDValue SparcTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+
+  bool hasHardQuad = Subtarget->hasHardQuad();
+  bool isV9        = Subtarget->isV9();
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG, *this,
+                                                       Subtarget);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG,
+                                                      Subtarget);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::BR_CC:              return LowerBR_CC(Op, DAG, *this,
+                                                  hasHardQuad);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG, *this,
+                                                      hasHardQuad);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG, *this);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
+                                                               Subtarget);
+
+  case ISD::LOAD:               return LowerF128Load(Op, DAG);
+  case ISD::STORE:              return LowerF128Store(Op, DAG);
+  case ISD::FADD:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::ADD_F128), 2);
+  case ISD::FSUB:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SUB_F128), 2);
+  case ISD::FMUL:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::MUL_F128), 2);
+  case ISD::FDIV:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::DIV_F128), 2);
+  case ISD::FSQRT:              return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SQRT_F128),1);
+  case ISD::FABS:
+  case ISD::FNEG:               return LowerFNEGorFABS(Op, DAG, isV9);
+  case ISD::FP_EXTEND:          return LowerF128_FPEXTEND(Op, DAG, *this);
+  case ISD::FP_ROUND:           return LowerF128_FPROUND(Op, DAG, *this);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::UMULO:
+  case ISD::SMULO:              return LowerUMULO_SMULO(Op, DAG, *this);
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_LOAD_STORE(Op, DAG);
+  }
+}
+
+MachineBasicBlock *
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown SELECT_CC!");
+  case SP::SELECT_CC_Int_ICC:
+  case SP::SELECT_CC_FP_ICC:
+  case SP::SELECT_CC_DFP_ICC:
+  case SP::SELECT_CC_QFP_ICC:
+    return expandSelectCC(MI, BB, SP::BCOND);
+  case SP::SELECT_CC_Int_FCC:
+  case SP::SELECT_CC_FP_FCC:
+  case SP::SELECT_CC_DFP_FCC:
+  case SP::SELECT_CC_QFP_FCC:
+    return expandSelectCC(MI, BB, SP::FBCOND);
+
+  case SP::ATOMIC_LOAD_ADD_32:
+    return expandAtomicRMW(MI, BB, SP::ADDrr);
+  case SP::ATOMIC_LOAD_ADD_64:
+    return expandAtomicRMW(MI, BB, SP::ADDXrr);
+  case SP::ATOMIC_LOAD_SUB_32:
+    return expandAtomicRMW(MI, BB, SP::SUBrr);
+  case SP::ATOMIC_LOAD_SUB_64:
+    return expandAtomicRMW(MI, BB, SP::SUBXrr);
+  case SP::ATOMIC_LOAD_AND_32:
+    return expandAtomicRMW(MI, BB, SP::ANDrr);
+  case SP::ATOMIC_LOAD_AND_64:
+    return expandAtomicRMW(MI, BB, SP::ANDXrr);
+  case SP::ATOMIC_LOAD_OR_32:
+    return expandAtomicRMW(MI, BB, SP::ORrr);
+  case SP::ATOMIC_LOAD_OR_64:
+    return expandAtomicRMW(MI, BB, SP::ORXrr);
+  case SP::ATOMIC_LOAD_XOR_32:
+    return expandAtomicRMW(MI, BB, SP::XORrr);
+  case SP::ATOMIC_LOAD_XOR_64:
+    return expandAtomicRMW(MI, BB, SP::XORXrr);
+  case SP::ATOMIC_LOAD_NAND_32:
+    return expandAtomicRMW(MI, BB, SP::ANDrr);
+  case SP::ATOMIC_LOAD_NAND_64:
+    return expandAtomicRMW(MI, BB, SP::ANDXrr);
+
+  case SP::ATOMIC_SWAP_64:
+    return expandAtomicRMW(MI, BB, 0);
+
+  case SP::ATOMIC_LOAD_MAX_32:
+    return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_G);
+  case SP::ATOMIC_LOAD_MAX_64:
+    return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_G);
+  case SP::ATOMIC_LOAD_MIN_32:
+    return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_LE);
+  case SP::ATOMIC_LOAD_MIN_64:
+    return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_LE);
+  case SP::ATOMIC_LOAD_UMAX_32:
+    return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_GU);
+  case SP::ATOMIC_LOAD_UMAX_64:
+    return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_GU);
+  case SP::ATOMIC_LOAD_UMIN_32:
+    return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_LEU);
+  case SP::ATOMIC_LOAD_UMIN_64:
+    return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_LEU);
+  }
+}
+
+MachineBasicBlock*
+SparcTargetLowering::expandSelectCC(MachineInstr *MI,
+                                    MachineBasicBlock *BB,
+                                    unsigned BROpcode) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  unsigned CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   [f]bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock*
+SparcTargetLowering::expandAtomicRMW(MachineInstr *MI,
+                                     MachineBasicBlock *MBB,
+                                     unsigned Opcode,
+                                     unsigned CondCode) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // MI is an atomic read-modify-write instruction of the form:
+  //
+  //   rd = atomicrmw<op> addr, rs2
+  //
+  // All three operands are registers.
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned AddrReg = MI->getOperand(1).getReg();
+  unsigned Rs2Reg  = MI->getOperand(2).getReg();
+
+  // SelectionDAG has already inserted memory barriers before and after MI, so
+  // we simply have to implement the operatiuon in terms of compare-and-swap.
+  //
+  //   %val0 = load %addr
+  // loop:
+  //   %val = phi %val0, %dest
+  //   %upd = op %val, %rs2
+  //   %dest = cas %addr, %val, %upd
+  //   cmp %val, %dest
+  //   bne loop
+  // done:
+  //
+  bool is64Bit = SP::I64RegsRegClass.hasSubClassEq(MRI.getRegClass(DestReg));
+  const TargetRegisterClass *ValueRC =
+    is64Bit ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+  unsigned Val0Reg = MRI.createVirtualRegister(ValueRC);
+
+  BuildMI(*MBB, MI, DL, TII.get(is64Bit ? SP::LDXri : SP::LDri), Val0Reg)
+    .addReg(AddrReg).addImm(0);
+
+  // Split the basic block MBB before MI and insert the loop block in the hole.
+  MachineFunction::iterator MFI = MBB;
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *DoneMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  ++MFI;
+  MF->insert(MFI, LoopMBB);
+  MF->insert(MFI, DoneMBB);
+
+  // Move MI and following instructions to DoneMBB.
+  DoneMBB->splice(DoneMBB->begin(), MBB, MI, MBB->end());
+  DoneMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Connect the CFG again.
+  MBB->addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(DoneMBB);
+
+  // Build the loop block.
+  unsigned ValReg = MRI.createVirtualRegister(ValueRC);
+  // Opcode == 0 means try to write Rs2Reg directly (ATOMIC_SWAP).
+  unsigned UpdReg = (Opcode ? MRI.createVirtualRegister(ValueRC) : Rs2Reg);
+
+  BuildMI(LoopMBB, DL, TII.get(SP::PHI), ValReg)
+    .addReg(Val0Reg).addMBB(MBB)
+    .addReg(DestReg).addMBB(LoopMBB);
+
+  if (CondCode) {
+    // This is one of the min/max operations. We need a CMPrr followed by a
+    // MOVXCC/MOVICC.
+    BuildMI(LoopMBB, DL, TII.get(SP::CMPrr)).addReg(ValReg).addReg(Rs2Reg);
+    BuildMI(LoopMBB, DL, TII.get(Opcode), UpdReg)
+      .addReg(ValReg).addReg(Rs2Reg).addImm(CondCode);
+  } else if (Opcode) {
+    BuildMI(LoopMBB, DL, TII.get(Opcode), UpdReg)
+      .addReg(ValReg).addReg(Rs2Reg);
+  }
+
+  if (MI->getOpcode() == SP::ATOMIC_LOAD_NAND_32 ||
+      MI->getOpcode() == SP::ATOMIC_LOAD_NAND_64) {
+    unsigned TmpReg = UpdReg;
+    UpdReg = MRI.createVirtualRegister(ValueRC);
+    BuildMI(LoopMBB, DL, TII.get(SP::XORri), UpdReg).addReg(TmpReg).addImm(-1);
+  }
+
+  BuildMI(LoopMBB, DL, TII.get(is64Bit ? SP::CASXrr : SP::CASrr), DestReg)
+    .addReg(AddrReg).addReg(ValReg).addReg(UpdReg)
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  BuildMI(LoopMBB, DL, TII.get(SP::CMPrr)).addReg(ValReg).addReg(DestReg);
+  BuildMI(LoopMBB, DL, TII.get(is64Bit ? SP::BPXCC : SP::BCOND))
+    .addMBB(LoopMBB).addImm(SPCC::ICC_NE);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Sparc Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SparcTargetLowering::ConstraintType
+SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'r': return C_RegisterClass;
+    case 'I': // SIMM13
+      return C_Other;
+    }
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+TargetLowering::ConstraintWeight SparcTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                               const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'I': // SIMM13
+    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+      if (isInt<13>(C->getSExtValue()))
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void SparcTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op,
+                             std::string &Constraint,
+                             std::vector<SDValue> &Ops,
+                             SelectionDAG &DAG) const {
+  SDValue Result(nullptr, 0);
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1)
+    return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'I':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isInt<13>(C->getSExtValue())) {
+        Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+        break;
+      }
+      return;
+    }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                  MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return std::make_pair(0U, &SP::IntRegsRegClass);
+    }
+  } else  if (!Constraint.empty() && Constraint.size() <= 5
+              && Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
+    // constraint = '{r<d>}'
+    // Remove the braces from around the name.
+    StringRef name(Constraint.data()+1, Constraint.size()-2);
+    // Handle register aliases:
+    //       r0-r7   -> g0-g7
+    //       r8-r15  -> o0-o7
+    //       r16-r23 -> l0-l7
+    //       r24-r31 -> i0-i7
+    uint64_t intVal = 0;
+    if (name.substr(0, 1).equals("r")
+        && !name.substr(1).getAsInteger(10, intVal) && intVal <= 31) {
+      const char regTypes[] = { 'g', 'o', 'l', 'i' };
+      char regType = regTypes[intVal/8];
+      char regIdx = '0' + (intVal % 8);
+      char tmp[] = { '{', regType, regIdx, '}', 0 };
+      std::string newConstraint = std::string(tmp);
+      return TargetLowering::getRegForInlineAsmConstraint(newConstraint, VT);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+bool
+SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Sparc target isn't yet aware of offsets.
+  return false;
+}
+
+void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>& Results,
+                                             SelectionDAG &DAG) const {
+
+  SDLoc dl(N);
+
+  RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getOperand(0).getValueType() != MVT::f128
+        || N->getValueType(0) != MVT::i64)
+      return;
+    libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
+               ? RTLIB::FPTOSINT_F128_I64
+               : RTLIB::FPTOUINT_F128_I64);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getValueType(0) != MVT::f128
+        || N->getOperand(0).getValueType() != MVT::i64)
+      return;
+
+    libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
+               ? RTLIB::SINTTOFP_I64_F128
+               : RTLIB::UINTTOFP_I64_F128);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
new file mode 100644
index 0000000..a24cc82
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -0,0 +1,178 @@
+//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_ISELLOWERING_H
+#define SPARC_ISELLOWERING_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  class SparcSubtarget;
+
+  namespace SPISD {
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      CMPICC,      // Compare two GPR operands, set icc+xcc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRXCC,       // Branch to dest on xcc condition (64-bit only).
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags.
+      SELECT_XCC,  // Select between two values using the current XCC flags.
+      SELECT_FCC,  // Select between two values using the current FCC flags.
+
+      Hi, Lo,      // Hi/Lo operations, typically on a global address.
+
+      FTOI,        // FP to Int within a FP register.
+      ITOF,        // Int to FP within a FP register.
+      FTOX,        // FP to Int64 within a FP register.
+      XTOF,        // Int64 to FP within a FP register.
+
+      CALL,        // A call instruction.
+      RET_FLAG,    // Return with a flag operand.
+      GLOBAL_BASE_REG, // Global base reg for PIC.
+      FLUSHW,      // FLUSH register windows to stack.
+
+      TLS_ADD,     // For Thread Local Storage (TLS).
+      TLS_LD,
+      TLS_CALL
+    };
+  }
+
+  class SparcTargetLowering : public TargetLowering {
+    const SparcSubtarget *Subtarget;
+  public:
+    SparcTargetLowering(TargetMachine &TM);
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// computeKnownBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const override;
+
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const override;
+    ConstraintWeight
+    getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                   const char *constraint) const override;
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerFormalArguments_32(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerFormalArguments_64(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+                         SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+                         SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  SDLoc dl, SelectionDAG &DAG) const override;
+    SDValue LowerReturn_32(SDValue Chain,
+                           CallingConv::ID CallConv, bool IsVarArg,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           SDLoc DL, SelectionDAG &DAG) const;
+    SDValue LowerReturn_64(SDValue Chain,
+                           CallingConv::ID CallConv, bool IsVarArg,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           SDLoc DL, SelectionDAG &DAG) const;
+
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
+    SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
+    SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
+                         SelectionDAG &DAG) const;
+    SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+                                 SDValue Arg, SDLoc DL,
+                                 SelectionDAG &DAG) const;
+    SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                        const char *LibFuncName,
+                        unsigned numArgs) const;
+    SDValue LowerF128Compare(SDValue LHS, SDValue RHS,
+                             unsigned &SPCC,
+                             SDLoc DL,
+                             SelectionDAG &DAG) const;
+
+    bool ShouldShrinkFPConstant(EVT VT) const override {
+      // Do not shrink FP constpool if VT == MVT::f128.
+      // (ldd, call _Q_fdtoq) is more expensive than two ldds.
+      return VT != MVT::f128;
+    }
+
+    void ReplaceNodeResults(SDNode *N,
+                                    SmallVectorImpl<SDValue>& Results,
+                                    SelectionDAG &DAG) const override;
+
+    MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB,
+                                      unsigned BROpcode) const;
+    MachineBasicBlock *expandAtomicRMW(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode,
+                                       unsigned CondCode = 0) const;
+  };
+} // end namespace llvm
+
+#endif    // SPARC_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
new file mode 100644
index 0000000..54d8240
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
@@ -0,0 +1,573 @@
+//===-- SparcInstr64Bit.td - 64-bit instructions for Sparc Target ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction definitions and patterns needed for 64-bit
+// code generation on SPARC v9.
+//
+// Some SPARC v9 instructions are defined in SparcInstrInfo.td because they can
+// also be used in 32-bit code running on a SPARC v9 CPU.
+//
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+// The same integer registers are used for i32 and i64 values.
+// When registers hold i32 values, the high bits are don't care.
+// This give us free trunc and anyext.
+def : Pat<(i64 (anyext i32:$val)), (COPY_TO_REGCLASS $val, I64Regs)>;
+def : Pat<(i32 (trunc i64:$val)), (COPY_TO_REGCLASS $val, IntRegs)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Shift Instructions.
+//===----------------------------------------------------------------------===//
+//
+// The 32-bit shift instructions are still available. The left shift srl
+// instructions shift all 64 bits, but it only accepts a 5-bit shift amount.
+//
+// The srl instructions only shift the low 32 bits and clear the high 32 bits.
+// Finally, sra shifts the low 32 bits and sign-extends to 64 bits.
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(i64 (zext i32:$val)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext i32:$val)), (SRAri $val, 0)>;
+
+def : Pat<(i64 (and i64:$val, 0xffffffff)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext_inreg i64:$val, i32)), (SRAri $val, 0)>;
+
+defm SLLX : F3_S<"sllx", 0b100101, 1, shl, i64, I64Regs>;
+defm SRLX : F3_S<"srlx", 0b100110, 1, srl, i64, I64Regs>;
+defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Immediates.
+//===----------------------------------------------------------------------===//
+//
+// All 32-bit immediates can be materialized with sethi+or, but 64-bit
+// immediates may require more code. There may be a point where it is
+// preferable to use a constant pool load instead, depending on the
+// microarchitecture.
+
+// Single-instruction patterns.
+
+// The ALU instructions want their simm13 operands as i32 immediates.
+def as_i32imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
+}]>;
+def : Pat<(i64 simm13:$val), (ORri (i64 G0), (as_i32imm $val))>;
+def : Pat<(i64 SETHIimm:$val), (SETHIi (HI22 $val))>;
+
+// Double-instruction patterns.
+
+// All unsigned i32 immediates can be handled by sethi+or.
+def uimm32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+def : Pat<(i64 uimm32:$val), (ORri (SETHIi (HI22 $val)), (LO10 $val))>,
+      Requires<[Is64Bit]>;
+
+// All negative i33 immediates can be handled by sethi+xor.
+def nimm33 : PatLeaf<(imm), [{
+  int64_t Imm = N->getSExtValue();
+  return Imm < 0 && isInt<33>(Imm);
+}]>;
+// Bits 10-31 inverted. Same as assembler's %hix.
+def HIX22 : SDNodeXForm<imm, [{
+  uint64_t Val = (~N->getZExtValue() >> 10) & ((1u << 22) - 1);
+  return CurDAG->getTargetConstant(Val, MVT::i32);
+}]>;
+// Bits 0-9 with ones in bits 10-31. Same as assembler's %lox.
+def LOX10 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(~N->getZExtValue() & 0x3ff), MVT::i32);
+}]>;
+def : Pat<(i64 nimm33:$val), (XORri (SETHIi (HIX22 $val)), (LOX10 $val))>,
+      Requires<[Is64Bit]>;
+
+// More possible patterns:
+//
+//   (sllx sethi, n)
+//   (sllx simm13, n)
+//
+// 3 instrs:
+//
+//   (xor (sllx sethi), simm13)
+//   (sllx (xor sethi, simm13))
+//
+// 4 instrs:
+//
+//   (or sethi, (sllx sethi))
+//   (xnor sethi, (sllx sethi))
+//
+// 5 instrs:
+//
+//   (or (sllx sethi), (or sethi, simm13))
+//   (xnor (sllx sethi), (or sethi, simm13))
+//   (or (sllx sethi), (sllx sethi))
+//   (xnor (sllx sethi), (sllx sethi))
+//
+// Worst case is 6 instrs:
+//
+//   (or (sllx (or sethi, simmm13)), (or sethi, simm13))
+
+// Bits 42-63, same as assembler's %hh.
+def HH22 : SDNodeXForm<imm, [{
+  uint64_t Val = (N->getZExtValue() >> 42) & ((1u << 22) - 1);
+  return CurDAG->getTargetConstant(Val, MVT::i32);
+}]>;
+// Bits 32-41, same as assembler's %hm.
+def HM10 : SDNodeXForm<imm, [{
+  uint64_t Val = (N->getZExtValue() >> 32) & ((1u << 10) - 1);
+  return CurDAG->getTargetConstant(Val, MVT::i32);
+}]>;
+def : Pat<(i64 imm:$val),
+          (ORrr (SLLXri (ORri (SETHIi (HH22 $val)), (HM10 $val)), (i32 32)),
+                (ORri (SETHIi (HI22 $val)), (LO10 $val)))>,
+      Requires<[Is64Bit]>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Arithmetic and Logic.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+// Register-register instructions.
+let isCodeGenOnly = 1 in {
+defm ANDX    : F3_12<"and", 0b000001, and, I64Regs, i64, i64imm>;
+defm ORX     : F3_12<"or",  0b000010, or,  I64Regs, i64, i64imm>;
+defm XORX    : F3_12<"xor", 0b000011, xor, I64Regs, i64, i64imm>;
+
+def ANDXNrr  : F3_1<2, 0b000101,
+                 (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+                 "andn $b, $c, $dst",
+                 [(set i64:$dst, (and i64:$b, (not i64:$c)))]>;
+def ORXNrr   : F3_1<2, 0b000110,
+                 (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+                 "orn $b, $c, $dst",
+                 [(set i64:$dst, (or i64:$b, (not i64:$c)))]>;
+def XNORXrr  : F3_1<2, 0b000111,
+                   (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+                   "xnor $b, $c, $dst",
+                   [(set i64:$dst, (not (xor i64:$b, i64:$c)))]>;
+
+defm ADDX    : F3_12<"add", 0b000000, add, I64Regs, i64, i64imm>;
+defm SUBX    : F3_12<"sub", 0b000100, sub, I64Regs, i64, i64imm>;
+
+def TLS_ADDXrr : F3_1<2, 0b000000, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs1, I64Regs:$rs2, TLSSym:$sym),
+                   "add $rs1, $rs2, $rd, $sym",
+                   [(set i64:$rd,
+                       (tlsadd i64:$rs1, i64:$rs2, tglobaltlsaddr:$sym))]>;
+
+// "LEA" form of add
+def LEAX_ADDri : F3_2<2, 0b000000,
+                     (outs I64Regs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
+}
+
+def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Multiply and Divide.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def MULXrr : F3_1<2, 0b001001,
+                  (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+                  "mulx $rs1, $rs2, $rd",
+                  [(set i64:$rd, (mul i64:$rs1, i64:$rs2))]>;
+def MULXri : F3_2<2, 0b001001,
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+                  "mulx $rs1, $simm13, $rd",
+                  [(set i64:$rd, (mul i64:$rs1, (i64 simm13:$simm13)))]>;
+
+// Division can trap.
+let hasSideEffects = 1 in {
+def SDIVXrr : F3_1<2, 0b101101,
+                   (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+                   "sdivx $rs1, $rs2, $rd",
+                   [(set i64:$rd, (sdiv i64:$rs1, i64:$rs2))]>;
+def SDIVXri : F3_2<2, 0b101101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+                   "sdivx $rs1, $simm13, $rd",
+                   [(set i64:$rd, (sdiv i64:$rs1, (i64 simm13:$simm13)))]>;
+
+def UDIVXrr : F3_1<2, 0b001101,
+                   (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+                   "udivx $rs1, $rs2, $rd",
+                   [(set i64:$rd, (udiv i64:$rs1, i64:$rs2))]>;
+def UDIVXri : F3_2<2, 0b001101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+                   "udivx $rs1, $simm13, $rd",
+                   [(set i64:$rd, (udiv i64:$rs1, (i64 simm13:$simm13)))]>;
+} // hasSideEffects = 1
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Loads and Stores.
+//===----------------------------------------------------------------------===//
+//
+// All the 32-bit loads and stores are available. The extending loads are sign
+// or zero-extending to 64 bits. The LDrr and LDri instructions load 32 bits
+// zero-extended to i64. Their mnemonic is lduw in SPARC v9 (Load Unsigned
+// Word).
+//
+// SPARC v9 adds 64-bit loads as well as a sign-extending ldsw i32 loads.
+
+let Predicates = [Is64Bit] in {
+
+// 64-bit loads.
+let DecoderMethod = "DecodeLoadInt" in
+  defm LDX   : Load<"ldx", 0b001011, load, I64Regs, i64>;
+
+let mayLoad = 1, isCodeGenOnly = 1, isAsmParserOnly = 1 in
+  def TLS_LDXrr : F3_1<3, 0b001011,
+                       (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                       "ldx [$addr], $dst, $sym",
+                       [(set i64:$dst,
+                           (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+// Extending loads to i64.
+def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi8 ADDRrr:$addr)),  (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi8 ADDRri:$addr)),  (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRrr:$addr)), (LDSBrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRri:$addr)), (LDSBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi16 ADDRrr:$addr)), (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi16 ADDRri:$addr)), (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi16 ADDRrr:$addr)),  (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi16 ADDRri:$addr)),  (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRrr:$addr)), (LDSHrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRri:$addr)), (LDSHri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi32 ADDRrr:$addr)), (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi32 ADDRri:$addr)), (LDri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi32 ADDRrr:$addr)),  (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi32 ADDRri:$addr)),  (LDri ADDRri:$addr)>;
+
+// Sign-extending load of i32 into i64 is a new SPARC v9 instruction.
+let DecoderMethod = "DecodeLoadInt" in
+  defm LDSW   : Load<"ldsw", 0b001000, sextloadi32, I64Regs, i64>;
+
+// 64-bit stores.
+let DecoderMethod = "DecodeStoreInt" in
+  defm STX    : Store<"stx", 0b001110, store,  I64Regs, i64>;
+
+// Truncating stores from i64 are identical to the i32 stores.
+def : Pat<(truncstorei8  i64:$src, ADDRrr:$addr), (STBrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei8  i64:$src, ADDRri:$addr), (STBri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRrr:$addr), (STHrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRri:$addr), (STHri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRrr:$addr), (STrr  ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri  ADDRri:$addr, $src)>;
+
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i64 0), ADDRrr:$dst), (STXrr ADDRrr:$dst, (i64 G0))>;
+def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Conditionals.
+//===----------------------------------------------------------------------===//
+
+//
+// Flag-setting instructions like subcc and addcc set both icc and xcc flags.
+// The icc flags correspond to the 32-bit result, and the xcc are for the
+// full 64-bit result.
+//
+// We reuse CMPICC SDNodes for compares, but use new BRXCC branch nodes for
+// 64-bit compares. See LowerBR_CC.
+
+let Predicates = [Is64Bit] in {
+
+let Uses = [ICC], cc = 0b10 in
+  defm BPX : IPredBranch<"%xcc", [(SPbrxcc bb:$imm19, imm:$cond)]>;
+
+// Conditional moves on %xcc.
+let Uses = [ICC], Constraints = "$f = $rd" in {
+let intcc = 1, cc = 0b10 in {
+def MOVXCCrr : F4_1<0b101100, (outs IntRegs:$rd),
+                      (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+                      "mov$cond %xcc, $rs2, $rd",
+                      [(set i32:$rd,
+                       (SPselectxcc i32:$rs2, i32:$f, imm:$cond))]>;
+def MOVXCCri : F4_2<0b101100, (outs IntRegs:$rd),
+                      (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+                      "mov$cond %xcc, $simm11, $rd",
+                      [(set i32:$rd,
+                       (SPselectxcc simm11:$simm11, i32:$f, imm:$cond))]>;
+} // cc
+
+let intcc = 1, opf_cc = 0b10 in {
+def FMOVS_XCC : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+                      (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+                      "fmovs$cond %xcc, $rs2, $rd",
+                      [(set f32:$rd,
+                       (SPselectxcc f32:$rs2, f32:$f, imm:$cond))]>;
+def FMOVD_XCC : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+                      (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+                      "fmovd$cond %xcc, $rs2, $rd",
+                      [(set f64:$rd,
+                       (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
+def FMOVQ_XCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+                      (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+                      "fmovq$cond %xcc, $rs2, $rd",
+                      [(set f128:$rd,
+                       (SPselectxcc f128:$rs2, f128:$f, imm:$cond))]>;
+} // opf_cc
+} // Uses, Constraints
+
+// Branch On integer register with Prediction (BPr).
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in
+multiclass BranchOnReg<bits<3> cond, string OpcStr> {
+  def napt : F2_4<cond, 0, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, " $rs1, $imm16"), []>;
+  def apt  : F2_4<cond, 1, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, ",a $rs1, $imm16"), []>;
+  def napn  : F2_4<cond, 0, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, ",pn $rs1, $imm16"), []>;
+  def apn : F2_4<cond, 1, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, ",a,pn $rs1, $imm16"), []>;
+}
+
+multiclass bpr_alias<string OpcStr, Instruction NAPT, Instruction APT> {
+  def : InstAlias<!strconcat(OpcStr, ",pt $rs1, $imm16"),
+                  (NAPT I64Regs:$rs1, bprtarget16:$imm16), 0>;
+  def : InstAlias<!strconcat(OpcStr, ",a,pt $rs1, $imm16"),
+                  (APT I64Regs:$rs1, bprtarget16:$imm16), 0>;
+}
+
+defm BPZ   : BranchOnReg<0b001, "brz">;
+defm BPLEZ : BranchOnReg<0b010, "brlez">;
+defm BPLZ  : BranchOnReg<0b011, "brlz">;
+defm BPNZ  : BranchOnReg<0b101, "brnz">;
+defm BPGZ  : BranchOnReg<0b110, "brgz">;
+defm BPGEZ : BranchOnReg<0b111, "brgez">;
+
+defm : bpr_alias<"brz",   BPZnapt,   BPZapt  >;
+defm : bpr_alias<"brlez", BPLEZnapt, BPLEZapt>;
+defm : bpr_alias<"brlz",  BPLZnapt,  BPLZapt >;
+defm : bpr_alias<"brnz",  BPNZnapt,  BPNZapt >;
+defm : bpr_alias<"brgz",  BPGZnapt,  BPGZapt >;
+defm : bpr_alias<"brgez", BPGEZnapt, BPGEZapt>;
+
+// Move integer register on register condition (MOVr).
+multiclass MOVR< bits<3> rcond,  string OpcStr> {
+  def rr : F4_4r<0b101111, 0b00000, rcond, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs1, IntRegs:$rs2),
+                   !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+  def ri : F4_4i<0b101111, rcond, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs1, i64imm:$simm10),
+                   !strconcat(OpcStr, " $rs1, $simm10, $rd"), []>;
+}
+
+defm MOVRRZ  : MOVR<0b001, "movrz">;
+defm MOVRLEZ : MOVR<0b010, "movrlez">;
+defm MOVRLZ  : MOVR<0b011, "movrlz">;
+defm MOVRNZ  : MOVR<0b101, "movrnz">;
+defm MOVRGZ  : MOVR<0b110, "movrgz">;
+defm MOVRGEZ : MOVR<0b111, "movrgez">;
+
+// Move FP register on integer register condition (FMOVr).
+multiclass FMOVR<bits<3> rcond, string OpcStr> {
+
+  def S : F4_4r<0b110101, 0b00101, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat(!strconcat("fmovrs", OpcStr)," $rs1, $rs2, $rd"),
+                []>;
+  def D : F4_4r<0b110101, 0b00110, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat(!strconcat("fmovrd", OpcStr)," $rs1, $rs2, $rd"),
+                []>;
+  def Q : F4_4r<0b110101, 0b00111, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat(!strconcat("fmovrq", OpcStr)," $rs1, $rs2, $rd"),
+                []>, Requires<[HasHardQuad]>;
+}
+
+let Predicates = [HasV9] in {
+  defm FMOVRZ   : FMOVR<0b001, "z">;
+  defm FMOVRLEZ : FMOVR<0b010, "lez">;
+  defm FMOVRLZ  : FMOVR<0b011, "lz">;
+  defm FMOVRNZ  : FMOVR<0b101, "nz">;
+  defm FMOVRGZ  : FMOVR<0b110, "gz">;
+  defm FMOVRGEZ : FMOVR<0b111, "gez">;
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit Floating Point Conversions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def FXTOS : F3_3u<2, 0b110100, 0b010000100,
+                 (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fxtos $rs2, $rd",
+                 [(set FPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
+def FXTOD : F3_3u<2, 0b110100, 0b010001000,
+                 (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fxtod $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
+def FXTOQ : F3_3u<2, 0b110100, 0b010001100,
+                 (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fxtoq $rs2, $rd",
+                 [(set QFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+def FSTOX : F3_3u<2, 0b110100, 0b010000001,
+                 (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstox $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPftox FPRegs:$rs2))]>;
+def FDTOX : F3_3u<2, 0b110100, 0b010000010,
+                 (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtox $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPftox DFPRegs:$rs2))]>;
+def FQTOX : F3_3u<2, 0b110100, 0b010000011,
+                 (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtox $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPftox QFPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+} // Predicates = [Is64Bit]
+
+def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
+          (MOVXCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVXCCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselecticc i64:$t, i64:$f, imm:$cond),
+          (MOVICCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselecticc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVICCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselectfcc i64:$t, i64:$f, imm:$cond),
+          (MOVFCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectfcc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVFCCri (as_i32imm $t), $f, imm:$cond)>;
+
+} // Predicates = [Is64Bit]
+
+
+// 64 bit SETHI
+let Predicates = [Is64Bit], isCodeGenOnly = 1 in {
+def SETHIXi : F2_1<0b100,
+                   (outs IntRegs:$rd), (ins i64imm:$imm22),
+                   "sethi $imm22, $rd",
+                   [(set i64:$rd, SETHIimm:$imm22)]>;
+}
+
+// ATOMICS.
+let Predicates = [Is64Bit], Constraints = "$swap = $rd" in {
+  def CASXrr: F3_1_asi<3, 0b111110, 0b10000000,
+                (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2,
+                                     I64Regs:$swap),
+                 "casx [$rs1], $rs2, $rd",
+                 [(set i64:$rd,
+                     (atomic_cmp_swap i64:$rs1, i64:$rs2, i64:$swap))]>;
+
+} // Predicates = [Is64Bit], Constraints = ...
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(atomic_fence imm, imm), (MEMBARi 0xf)>;
+
+// atomic_load_64 addr -> load addr
+def : Pat<(i64 (atomic_load ADDRrr:$src)), (LDXrr ADDRrr:$src)>;
+def : Pat<(i64 (atomic_load ADDRri:$src)), (LDXri ADDRri:$src)>;
+
+// atomic_store_64 val, addr -> store val, addr
+def : Pat<(atomic_store ADDRrr:$dst, i64:$val), (STXrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store ADDRri:$dst, i64:$val), (STXri ADDRri:$dst, $val)>;
+
+} // Predicates = [Is64Bit]
+
+let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1,
+    Defs = [ICC] in
+multiclass AtomicRMW<SDPatternOperator op32, SDPatternOperator op64> {
+
+  def _32 : Pseudo<(outs IntRegs:$rd),
+                   (ins ptr_rc:$addr, IntRegs:$rs2), "",
+                   [(set i32:$rd, (op32 iPTR:$addr, i32:$rs2))]>;
+
+  let Predicates = [Is64Bit] in
+  def _64 : Pseudo<(outs I64Regs:$rd),
+                   (ins ptr_rc:$addr, I64Regs:$rs2), "",
+                   [(set i64:$rd, (op64 iPTR:$addr, i64:$rs2))]>;
+}
+
+defm ATOMIC_LOAD_ADD  : AtomicRMW<atomic_load_add_32,  atomic_load_add_64>;
+defm ATOMIC_LOAD_SUB  : AtomicRMW<atomic_load_sub_32,  atomic_load_sub_64>;
+defm ATOMIC_LOAD_AND  : AtomicRMW<atomic_load_and_32,  atomic_load_and_64>;
+defm ATOMIC_LOAD_OR   : AtomicRMW<atomic_load_or_32,   atomic_load_or_64>;
+defm ATOMIC_LOAD_XOR  : AtomicRMW<atomic_load_xor_32,  atomic_load_xor_64>;
+defm ATOMIC_LOAD_NAND : AtomicRMW<atomic_load_nand_32, atomic_load_nand_64>;
+defm ATOMIC_LOAD_MIN  : AtomicRMW<atomic_load_min_32,  atomic_load_min_64>;
+defm ATOMIC_LOAD_MAX  : AtomicRMW<atomic_load_max_32,  atomic_load_max_64>;
+defm ATOMIC_LOAD_UMIN : AtomicRMW<atomic_load_umin_32, atomic_load_umin_64>;
+defm ATOMIC_LOAD_UMAX : AtomicRMW<atomic_load_umax_32, atomic_load_umax_64>;
+
+// There is no 64-bit variant of SWAP, so use a pseudo.
+let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1,
+    Defs = [ICC], Predicates = [Is64Bit] in
+def ATOMIC_SWAP_64 : Pseudo<(outs I64Regs:$rd),
+                            (ins ptr_rc:$addr, I64Regs:$rs2), "",
+                            [(set i64:$rd,
+                                  (atomic_swap_64 iPTR:$addr, i64:$rs2))]>;
+
+let Predicates = [Is64Bit], hasSideEffects = 1, Uses = [ICC], cc = 0b10 in
+ defm TXCC : TRAP<"%xcc">;
+
+// Global addresses, constant pool entries
+let Predicates = [Is64Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORXri (i64 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORXri (i64 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORXri (i64 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (ADDXri (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (XORXri  (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORXri (i64 G0), tblockaddress:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDXri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)),  (ADDXri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+                        (ADDXri $r, tblockaddress:$in)>;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
new file mode 100644
index 0000000..d36f67b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -0,0 +1,325 @@
+//===-- SparcInstrAliases.td - Instruction Aliases for Sparc Target -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction aliases for Sparc.
+//===----------------------------------------------------------------------===//
+
+// Instruction aliases for conditional moves.
+
+// mov<cond> <ccreg> rs2, rd
+multiclass intcond_mov_alias<string cond, int condVal, string ccreg,
+                          Instruction movrr, Instruction movri,
+                          Instruction fmovs, Instruction fmovd> {
+
+  // mov<cond> (%icc|%xcc), rs2, rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("mov", cond), ccreg),
+                             ", $rs2, $rd"),
+                  (movrr IntRegs:$rd, IntRegs:$rs2, condVal)>;
+
+  // mov<cond> (%icc|%xcc), simm11, rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("mov", cond), ccreg),
+                             ", $simm11, $rd"),
+                  (movri IntRegs:$rd, i32imm:$simm11, condVal)>;
+
+  // fmovs<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("fmovs", cond), ccreg),
+                             ", $rs2, $rd"),
+                  (fmovs FPRegs:$rd, FPRegs:$rs2, condVal)>;
+
+  // fmovd<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("fmovd", cond), ccreg),
+                             ", $rs2, $rd"),
+                  (fmovd DFPRegs:$rd, DFPRegs:$rs2, condVal)>;
+}
+
+// mov<cond> <ccreg> rs2, rd
+multiclass fpcond_mov_alias<string cond, int condVal,
+                           Instruction movrr, Instruction movri,
+                           Instruction fmovs, Instruction fmovd> {
+
+  // mov<cond> %fcc[0-3], rs2, rd
+  def : InstAlias<!strconcat(!strconcat("mov", cond), " $cc, $rs2, $rd"),
+                  (movrr IntRegs:$rd, FCCRegs:$cc, IntRegs:$rs2, condVal)>;
+
+  // mov<cond> %fcc[0-3], simm11, rd
+  def : InstAlias<!strconcat(!strconcat("mov", cond), " $cc, $simm11, $rd"),
+                  (movri IntRegs:$rd, FCCRegs:$cc, i32imm:$simm11, condVal)>;
+
+  // fmovs<cond> %fcc[0-3], $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovs", cond), " $cc, $rs2, $rd"),
+                  (fmovs FPRegs:$rd, FCCRegs:$cc, FPRegs:$rs2, condVal)>;
+
+  // fmovd<cond> %fcc[0-3], $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovd", cond), " $cc, $rs2, $rd"),
+                  (fmovd DFPRegs:$rd, FCCRegs:$cc, DFPRegs:$rs2, condVal)>;
+}
+
+// Instruction aliases for integer conditional branches and moves.
+multiclass int_cond_alias<string cond, int condVal> {
+
+  // b<cond> $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), " $imm"),
+                  (BCOND brtarget:$imm, condVal)>;
+
+  // b<cond>,a $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a $imm"),
+                  (BCONDA brtarget:$imm, condVal)>;
+
+  // b<cond> %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), " %icc, $imm"),
+                  (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,pt %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %icc, $imm"),
+                  (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,a %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a %icc, $imm"),
+                  (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,a,pt %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %icc, $imm"),
+                  (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,pn %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %icc, $imm"),
+                  (BPICCNT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,a,pn %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %icc, $imm"),
+                  (BPICCANT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond> %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), " %xcc, $imm"),
+                  (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,pt %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %xcc, $imm"),
+                  (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,a %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a %xcc, $imm"),
+                  (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,a,pt %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %xcc, $imm"),
+                  (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,pn %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %xcc, $imm"),
+                  (BPXCCNT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,a,pn %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %xcc, $imm"),
+                  (BPXCCANT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+
+  defm : intcond_mov_alias<cond, condVal, " %icc",
+                            MOVICCrr, MOVICCri,
+                            FMOVS_ICC, FMOVD_ICC>, Requires<[HasV9]>;
+
+  defm : intcond_mov_alias<cond, condVal, " %xcc",
+                            MOVXCCrr, MOVXCCri,
+                            FMOVS_XCC, FMOVD_XCC>, Requires<[Is64Bit]>;
+
+  // fmovq<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %icc, $rs2, $rd"),
+                  (FMOVQ_ICC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+                  Requires<[HasV9, HasHardQuad]>;
+  def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %xcc, $rs2, $rd"),
+                  (FMOVQ_XCC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+                  Requires<[Is64Bit, HasHardQuad]>;
+
+  // t<cond> %icc, rs1 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $rs2"),
+                  (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+
+  // t<cond> %icc,  rs => t<cond> %icc, G0 + rs
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs2"),
+                  (TICCrr G0, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+
+  // t<cond> %xcc, rs1 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $rs2"),
+                  (TXCCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+
+  // t<cond> %xcc, rs => t<cond> %xcc, G0 + rs
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs2"),
+                  (TXCCrr G0, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+
+  // t<cond> rs1 + rs2 => t<cond> %icc, rs1 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+                  (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>;
+
+  // t<cond> rs=> t<cond> %icc,  G0 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $rs2"),
+                  (TICCrr G0, IntRegs:$rs2, condVal)>;
+
+  // t<cond> %icc, rs1 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $imm"),
+                  (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %icc, imm => t<cond> %icc, G0 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $imm"),
+                  (TICCri G0, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %xcc, rs1 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $imm"),
+                  (TXCCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %xcc, imm => t<cond> %xcc, G0 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $imm"),
+                  (TXCCri G0, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+
+  // t<cond> rs1 + imm => t<cond> %icc, rs1 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $imm"),
+                  (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>;
+
+  // t<cond> imm => t<cond> %icc, G0 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $imm"),
+                  (TICCri G0, i32imm:$imm, condVal)>;
+
+}
+
+
+// Instruction aliases for floating point conditional branches and moves.
+multiclass fp_cond_alias<string cond, int condVal> {
+
+  // fb<cond> $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), " $imm"),
+                  (FBCOND brtarget:$imm, condVal), 0>;
+
+  // fb<cond>,a $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $imm"),
+                  (FBCONDA brtarget:$imm, condVal), 0>;
+
+  // fb<cond> %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), " $cc, $imm"),
+                  (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  // fb<cond>,pt %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",pt $cc, $imm"),
+                  (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  // fb<cond>,a %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $cc, $imm"),
+                  (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  // fb<cond>,a,pt %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pt $cc, $imm"),
+                  (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+                   Requires<[HasV9]>;
+
+  // fb<cond>,pn %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",pn $cc, $imm"),
+                  (BPFCCNT brtarget:$imm, condVal, FCCRegs:$cc)>,
+                   Requires<[HasV9]>;
+
+  // fb<cond>,a,pn %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pn $cc, $imm"),
+                  (BPFCCANT brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  defm : fpcond_mov_alias<cond, condVal,
+                          V9MOVFCCrr, V9MOVFCCri,
+                          V9FMOVS_FCC, V9FMOVD_FCC>, Requires<[HasV9]>;
+
+  // fmovq<cond> %fcc0, $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovq", cond), " $cc, $rs2, $rd"),
+                  (V9FMOVQ_FCC QFPRegs:$rd, FCCRegs:$cc, QFPRegs:$rs2,
+                                                          condVal)>,
+                  Requires<[HasV9, HasHardQuad]>;
+}
+
+defm : int_cond_alias<"a",    0b1000>;
+defm : int_cond_alias<"n",    0b0000>;
+defm : int_cond_alias<"ne",   0b1001>;
+defm : int_cond_alias<"e",    0b0001>;
+defm : int_cond_alias<"g",    0b1010>;
+defm : int_cond_alias<"le",   0b0010>;
+defm : int_cond_alias<"ge",   0b1011>;
+defm : int_cond_alias<"l",    0b0011>;
+defm : int_cond_alias<"gu",   0b1100>;
+defm : int_cond_alias<"leu",  0b0100>;
+defm : int_cond_alias<"cc",   0b1101>;
+defm : int_cond_alias<"cs",   0b0101>;
+defm : int_cond_alias<"pos",  0b1110>;
+defm : int_cond_alias<"neg",  0b0110>;
+defm : int_cond_alias<"vc",   0b1111>;
+defm : int_cond_alias<"vs",   0b0111>;
+
+defm : fp_cond_alias<"a",     0b0000>;
+defm : fp_cond_alias<"n",     0b1000>;
+defm : fp_cond_alias<"u",     0b0111>;
+defm : fp_cond_alias<"g",     0b0110>;
+defm : fp_cond_alias<"ug",    0b0101>;
+defm : fp_cond_alias<"l",     0b0100>;
+defm : fp_cond_alias<"ul",    0b0011>;
+defm : fp_cond_alias<"lg",    0b0010>;
+defm : fp_cond_alias<"ne",    0b0001>;
+defm : fp_cond_alias<"e",     0b1001>;
+defm : fp_cond_alias<"ue",    0b1010>;
+defm : fp_cond_alias<"ge",    0b1011>;
+defm : fp_cond_alias<"uge",   0b1100>;
+defm : fp_cond_alias<"le",    0b1101>;
+defm : fp_cond_alias<"ule",   0b1110>;
+defm : fp_cond_alias<"o",     0b1111>;
+
+// Instruction aliases for JMPL.
+
+// jmp addr -> jmpl addr, %g0
+def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>;
+def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>;
+
+// call addr -> jmpl addr, %o7
+def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>;
+def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>;
+
+// retl -> RETL 8
+def : InstAlias<"retl", (RETL 8)>;
+
+// ret -> RET 8
+def : InstAlias<"ret", (RET 8)>;
+
+// mov reg, rd -> or %g0, reg, rd
+def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>;
+
+// mov simm13, rd -> or %g0, simm13, rd
+def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>;
+
+// restore -> restore %g0, %g0, %g0
+def : InstAlias<"restore", (RESTORErr G0, G0, G0)>;
+
+def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>;
+def : MnemonicAlias<"addccc", "addxcc">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"subc", "subx">, Requires<[HasV9]>;
+def : MnemonicAlias<"subccc", "subxcc">, Requires<[HasV9]>;
+
+
+def : InstAlias<"fcmps $rs1, $rs2", (V9FCMPS FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmpd $rs1, $rs2", (V9FCMPD FCC0, DFPRegs:$rs1, DFPRegs:$rs2)>;
+def : InstAlias<"fcmpq $rs1, $rs2", (V9FCMPQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>,
+                Requires<[HasHardQuad]>;
+
+def : InstAlias<"fcmpes $rs1, $rs2", (V9FCMPES FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmped $rs1, $rs2", (V9FCMPED FCC0, DFPRegs:$rs1,
+                                                     DFPRegs:$rs2)>;
+def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1,
+                                                     QFPRegs:$rs2)>,
+                Requires<[HasHardQuad]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 0000000..3b5e238
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,330 @@
+//===-- SparcInstrFormats.td - Sparc Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
+          : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "SP";
+  let Size = 4;
+
+  bits<2> op;
+  let Inst{31-30} = op;               // Top two bits are the 'op' field
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+
+  let DecoderNamespace = "Sparc";
+  field bits<32> SoftFail = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern> {
+  bits<3>  op2;
+  bits<22> imm22;
+  let op          = 0;    // op = 0
+  let Inst{24-22} = op2;
+  let Inst{21-0}  = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
+   : F2<outs, ins, asmstr, pattern> {
+  bits<5>  rd;
+
+  let op2         = op2Val;
+
+  let Inst{29-25} = rd;
+}
+
+class F2_2<bits<3> op2Val, bit annul, dag outs, dag ins, string asmstr,
+           list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
+  bits<4>   cond;
+  let op2         = op2Val;
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+}
+
+class F2_3<bits<3> op2Val, bit annul, bit pred,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+      : InstSP<outs, ins, asmstr, pattern> {
+  bits<2>  cc;
+  bits<4>  cond;
+  bits<19> imm19;
+
+  let op          = 0;    // op = 0
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+  let Inst{24-22} = op2Val;
+  let Inst{21-20} = cc;
+  let Inst{19}    = pred;
+  let Inst{18-0}  = imm19;
+}
+
+class F2_4<bits<3> cond, bit annul, bit pred,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+      : InstSP<outs, ins, asmstr, pattern> {
+  bits<16> imm16;
+  bits<5>  rs1;
+
+  let op          = 0;    // op = 0
+
+  let Inst{29}    = annul;
+  let Inst{28}    = 0;
+  let Inst{27-25} = cond;
+  let Inst{24-22} = 0b011;
+  let Inst{21-20} = imm16{15-14};
+  let Inst{19}    = pred;
+  let Inst{18-14} = rs1;
+  let Inst{13-0}  = imm16{13-0};
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstSP<outs, ins, asmstr, pattern> {
+  bits<5> rd;
+  bits<6> op3;
+  bits<5> rs1;
+  let op{1} = 1;   // Op = 2 or 3
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+  let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+class F3_1_asi<bits<2> opVal, bits<6> op3val, bits<8> asi, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12-5} = asi;   // address space identifier
+  let Inst{4-0}  = rs2;
+}
+
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins, string asmstr,
+       list<dag> pattern> : F3_1_asi<opVal, op3val, 0, outs, ins,
+                                                     asmstr, pattern>;
+
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<13> simm13;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12-0} = simm13;
+}
+
+// floating-point
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// floating-point unary operations.
+class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+  let rs1        = 0;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// floating-point compares.
+class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// Shift by register rs2.
+class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+            string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bit x = xVal;           // 1 for 64-bit shifts.
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12}   = x;     // extended registers.
+  let Inst{4-0}  = rs2;
+}
+
+// Shift by immediate.
+class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+            string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bit x = xVal;           // 1 for 64-bit shifts.
+  bits<6> shcnt;          // shcnt32 / shcnt64.
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12}   = x;     // extended registers.
+  let Inst{5-0}  = shcnt;
+}
+
+// Define rr and ri shift instructions with patterns.
+multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
+                ValueType VT, RegisterClass RC> {
+  def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
+                 !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+                 [(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))]>;
+  def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, i32imm:$shcnt),
+                 !strconcat(OpcStr, " $rs1, $shcnt, $rd"),
+                 [(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))]>;
+}
+
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
+      : InstSP<outs, ins, asmstr, pattern> {
+  bits<5> rd;
+
+  let op          = 2;
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+}
+
+
+class F4_1<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+
+  bit    intcc;
+  bits<2> cc;
+  bits<4> cond;
+  bits<5> rs2;
+
+  let Inst{4-0}   = rs2;
+  let Inst{12-11} = cc;
+  let Inst{13}    = 0;
+  let Inst{17-14} = cond;
+  let Inst{18}    = intcc;
+
+}
+
+class F4_2<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+  bit      intcc;
+  bits<2>  cc;
+  bits<4>  cond;
+  bits<11> simm11;
+
+  let Inst{10-0}  = simm11;
+  let Inst{12-11} = cc;
+  let Inst{13}    = 1;
+  let Inst{17-14} = cond;
+  let Inst{18}    = intcc;
+}
+
+class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
+           string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+  bits<4> cond;
+  bit     intcc;
+  bits<2> opf_cc;
+  bits<5> rs2;
+
+  let Inst{18}     = 0;
+  let Inst{17-14}  = cond;
+  let Inst{13}     = intcc;
+  let Inst{12-11}  = opf_cc;
+  let Inst{10-5}   = opf_low;
+  let Inst{4-0}    = rs2;
+}
+
+class F4_4r<bits<6> op3, bits<5> opf_low, bits<3> rcond, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+       : F4<op3, outs, ins, asmstr, pattern> {
+  bits <5> rs1;
+  bits <5> rs2;
+  let Inst{18-14} = rs1;
+  let Inst{13}    = 0;  // IsImm
+  let Inst{12-10} = rcond;
+  let Inst{9-5}   = opf_low;
+  let Inst{4-0}   = rs2;
+}
+
+
+class F4_4i<bits<6> op3, bits<3> rcond, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+       : F4<op3, outs, ins, asmstr, pattern> {
+  bits<5> rs1;
+  bits<10> simm10;
+  let Inst{18-14} = rs1;
+  let Inst{13}    = 1;  // IsImm
+  let Inst{12-10} = rcond;
+  let Inst{9-0}   = simm10;
+}
+
+
+class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins, string asmstr,
+       list<dag> pattern>: F3<outs, ins, asmstr, pattern> {
+
+   bits<4> cond;
+   bits<2> cc;
+
+   let op = 0b10;
+   let rd{4} = 0;
+   let rd{3-0} = cond;
+   let op3 = op3Val;
+   let Inst{13} = isimm;
+   let Inst{12-11} = cc;
+
+}
+
+class TRAPSPrr<bits<6> op3Val, dag outs, dag ins, string asmstr,
+    list<dag> pattern>: TRAPSP<op3Val, 0, outs, ins, asmstr, pattern> {
+   bits<5> rs2;
+
+   let Inst{10-5} = 0;
+   let Inst{4-0}  = rs2;
+}
+class TRAPSPri<bits<6> op3Val, dag outs, dag ins, string asmstr,
+    list<dag> pattern>: TRAPSP<op3Val, 1, outs, ins, asmstr, pattern> {
+   bits<8> imm;
+
+   let Inst{10-8} = 0;
+   let Inst{7-0}  = imm;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 0000000..8b2e6bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,446 @@
+//===-- SparcInstrInfo.cpp - Sparc Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "SparcGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void SparcInstrInfo::anchor() {}
+
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+  : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
+    RI(ST), Subtarget(ST) {
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  if (MI->getOpcode() == SP::LDri ||
+      MI->getOpcode() == SP::LDXri ||
+      MI->getOpcode() == SP::LDFri ||
+      MI->getOpcode() == SP::LDDFri ||
+      MI->getOpcode() == SP::LDQFri) {
+    if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  if (MI->getOpcode() == SP::STri ||
+      MI->getOpcode() == SP::STXri ||
+      MI->getOpcode() == SP::STFri ||
+      MI->getOpcode() == SP::STDFri ||
+      MI->getOpcode() == SP::STQFri) {
+    if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
+        MI->getOperand(1).getImm() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+  }
+  return 0;
+}
+
+static bool IsIntegerCC(unsigned CC)
+{
+  return  (CC <= SPCC::ICC_VC);
+}
+
+
+static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
+{
+  switch(CC) {
+  case SPCC::ICC_A:    return SPCC::ICC_N;
+  case SPCC::ICC_N:    return SPCC::ICC_A;
+  case SPCC::ICC_NE:   return SPCC::ICC_E;
+  case SPCC::ICC_E:    return SPCC::ICC_NE;
+  case SPCC::ICC_G:    return SPCC::ICC_LE;
+  case SPCC::ICC_LE:   return SPCC::ICC_G;
+  case SPCC::ICC_GE:   return SPCC::ICC_L;
+  case SPCC::ICC_L:    return SPCC::ICC_GE;
+  case SPCC::ICC_GU:   return SPCC::ICC_LEU;
+  case SPCC::ICC_LEU:  return SPCC::ICC_GU;
+  case SPCC::ICC_CC:   return SPCC::ICC_CS;
+  case SPCC::ICC_CS:   return SPCC::ICC_CC;
+  case SPCC::ICC_POS:  return SPCC::ICC_NEG;
+  case SPCC::ICC_NEG:  return SPCC::ICC_POS;
+  case SPCC::ICC_VC:   return SPCC::ICC_VS;
+  case SPCC::ICC_VS:   return SPCC::ICC_VC;
+
+  case SPCC::FCC_A:    return SPCC::FCC_N;
+  case SPCC::FCC_N:    return SPCC::FCC_A;
+  case SPCC::FCC_U:    return SPCC::FCC_O;
+  case SPCC::FCC_O:    return SPCC::FCC_U;
+  case SPCC::FCC_G:    return SPCC::FCC_ULE;
+  case SPCC::FCC_LE:   return SPCC::FCC_UG;
+  case SPCC::FCC_UG:   return SPCC::FCC_LE;
+  case SPCC::FCC_ULE:  return SPCC::FCC_G;
+  case SPCC::FCC_L:    return SPCC::FCC_UGE;
+  case SPCC::FCC_GE:   return SPCC::FCC_UL;
+  case SPCC::FCC_UL:   return SPCC::FCC_GE;
+  case SPCC::FCC_UGE:  return SPCC::FCC_L;
+  case SPCC::FCC_LG:   return SPCC::FCC_UE;
+  case SPCC::FCC_UE:   return SPCC::FCC_LG;
+  case SPCC::FCC_NE:   return SPCC::FCC_E;
+  case SPCC::FCC_E:    return SPCC::FCC_NE;
+  }
+  llvm_unreachable("Invalid cond code");
+}
+
+bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const
+{
+
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+
+    if (I->isDebugValue())
+      continue;
+
+    // When we see a non-terminator, we are done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // Terminator is not a branch.
+    if (!I->isBranch())
+      return true;
+
+    // Handle Unconditional branches.
+    if (I->getOpcode() == SP::BA) {
+      UnCondBrIter = I;
+
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = nullptr;
+
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        UnCondBrIter = MBB.end();
+        continue;
+      }
+
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    unsigned Opcode = I->getOpcode();
+    if (Opcode != SP::BCOND && Opcode != SP::FBCOND)
+      return true; // Unknown Opcode.
+
+    SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm();
+
+    if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+
+        // Transform the code
+        //
+        //    brCC L1
+        //    ba L2
+        // L1:
+        //    ..
+        // L2:
+        //
+        // into
+        //
+        //   brnCC L2
+        // L1:
+        //   ...
+        // L2:
+        //
+        BranchCode = GetOppositeBranchCondition(BranchCode);
+        MachineBasicBlock::iterator OldInst = I;
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(Opcode))
+          .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode);
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA))
+          .addMBB(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+    // FIXME: Handle subsequent conditional branches.
+    // For now, we can't handle multiple conditional branches.
+    return true;
+  }
+  return false;
+}
+
+unsigned
+SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL) const {
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "Sparc branch conditions should have one component!");
+
+  if (Cond.empty()) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SP::BA)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch
+  unsigned CC = Cond[0].getImm();
+
+  if (IsIntegerCC(CC))
+    BuildMI(&MBB, DL, get(SP::BCOND)).addMBB(TBB).addImm(CC);
+  else
+    BuildMI(&MBB, DL, get(SP::FBCOND)).addMBB(TBB).addImm(CC);
+  if (!FBB)
+    return 1;
+
+  BuildMI(&MBB, DL, get(SP::BA)).addMBB(FBB);
+  return 2;
+}
+
+unsigned SparcInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const
+{
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+  while (I != MBB.begin()) {
+    --I;
+
+    if (I->isDebugValue())
+      continue;
+
+    if (I->getOpcode() != SP::BA
+        && I->getOpcode() != SP::BCOND
+        && I->getOpcode() != SP::FBCOND)
+      break; // Not a branch
+
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+  return Count;
+}
+
+void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I, DebugLoc DL,
+                                 unsigned DestReg, unsigned SrcReg,
+                                 bool KillSrc) const {
+  unsigned numSubRegs = 0;
+  unsigned movOpc     = 0;
+  const unsigned *subRegIdx = nullptr;
+
+  const unsigned DFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd };
+  const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
+  const unsigned QFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd,
+                                          SP::sub_odd64_then_sub_even,
+                                          SP::sub_odd64_then_sub_odd };
+
+  if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      BuildMI(MBB, I, DL, get(SP::FMOVD), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      // Use two FMOVS instructions.
+      subRegIdx  = DFP_FP_SubRegsIdx;
+      numSubRegs = 2;
+      movOpc     = SP::FMOVS;
+    }
+  } else if (SP::QFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      if (Subtarget.hasHardQuad()) {
+        BuildMI(MBB, I, DL, get(SP::FMOVQ), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      } else {
+        // Use two FMOVD instructions.
+        subRegIdx  = QFP_DFP_SubRegsIdx;
+        numSubRegs = 2;
+        movOpc     = SP::FMOVD;
+      }
+    } else {
+      // Use four FMOVS instructions.
+      subRegIdx  = QFP_FP_SubRegsIdx;
+      numSubRegs = 4;
+      movOpc     = SP::FMOVS;
+    }
+  } else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  if (numSubRegs == 0 || subRegIdx == nullptr || movOpc == 0)
+    return;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstr *MovMI = nullptr;
+
+  for (unsigned i = 0; i != numSubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+    unsigned Src = TRI->getSubReg(SrcReg,  subRegIdx[i]);
+    assert(Dst && Src && "Bad sub-register");
+
+    MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src);
+  }
+  // Add implicit super-register defs and kills to the last MovMI.
+  MovMI->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    MovMI->addRegisterKilled(SrcReg, TRI);
+}
+
+void SparcInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOStore,
+                             MFI.getObjectSize(FI),
+                             MFI.getObjectAlignment(FI));
+
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+ if (RC == &SP::I64RegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &SP::FPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use STQFri irrespective of its legality. If STQ is not legal, it will be
+    // lowered into two STDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::STQFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void SparcInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOLoad,
+                             MFI.getObjectSize(FI),
+                             MFI.getObjectAlignment(FI));
+
+  if (RC == &SP::I64RegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (RC == &SP::FPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use LDQFri irrespective of its legality. If LDQ is not legal, it will be
+    // lowered into two LDDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::LDQFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
+{
+  SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>();
+  unsigned GlobalBaseReg = SparcFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+  const TargetRegisterClass *PtrRC =
+    Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+  GlobalBaseReg = RegInfo.createVirtualRegister(PtrRC);
+
+  DebugLoc dl;
+
+  BuildMI(FirstMBB, MBBI, dl, get(SP::GETPCX), GlobalBaseReg);
+  SparcFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 0000000..3a1472e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,100 @@
+//===-- SparcInstrInfo.h - Sparc Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCINSTRUCTIONINFO_H
+#define SPARCINSTRUCTIONINFO_H
+
+#include "SparcRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SparcGenInstrInfo.inc"
+
+namespace llvm {
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SPII {
+  enum {
+    Pseudo = (1<<0),
+    Load = (1<<1),
+    Store = (1<<2),
+    DelaySlot = (1<<3)
+  };
+}
+
+class SparcInstrInfo : public SparcGenInstrInfo {
+  const SparcRegisterInfo RI;
+  const SparcSubtarget& Subtarget;
+  virtual void anchor();
+public:
+  explicit SparcInstrInfo(SparcSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override ;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 0000000..960261c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,1227 @@
+//===-- SparcInstrInfo.td - Target Description for Sparc Target -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// True when generating 32-bit code.
+def Is32Bit : Predicate<"!Subtarget.is64Bit()">;
+
+// True when generating 64-bit code. This also implies HasV9.
+def Is64Bit : Predicate<"Subtarget.is64Bit()">;
+
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions.  Note that the machine may be running in 32-bit mode.
+def HasV9   : Predicate<"Subtarget.isV9()">,
+              AssemblerPredicate<"FeatureV9">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions.  Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget.isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget.isVIS()">,
+             AssemblerPredicate<"FeatureVIS">;
+def HasVIS2 : Predicate<"Subtarget.isVIS2()">,
+             AssemblerPredicate<"FeatureVIS2">;
+def HasVIS3 : Predicate<"Subtarget.isVIS3()">,
+             AssemblerPredicate<"FeatureVIS3">;
+
+// HasHardQuad - This is true when the target processor supports quad floating
+// point instructions.
+def HasHardQuad : Predicate<"Subtarget.hasHardQuad()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate.  In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11  : PatLeaf<(imm), [{ return isInt<11>(N->getSExtValue()); }]>;
+
+def simm13  : PatLeaf<(imm), [{ return isInt<13>(N->getSExtValue()); }]>;
+
+def LO10 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023,
+                                   MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 10, MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+  return isShiftedUInt<22, 10>(N->getZExtValue());
+}], HI22>;
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<iPTR, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<iPTR, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def SparcMEMrrAsmOperand : AsmOperandClass {
+  let Name = "MEMrr";
+  let ParserMethod = "parseMEMOperand";
+}
+
+def SparcMEMriAsmOperand : AsmOperandClass {
+  let Name = "MEMri";
+  let ParserMethod = "parseMEMOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+  let ParserMatchClass = SparcMEMrrAsmOperand;
+}
+def MEMri : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, i32imm);
+  let ParserMatchClass = SparcMEMriAsmOperand;
+}
+
+def TLSSym : Operand<iPTR>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+}
+
+def bprtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchPredTargetOpValue";
+}
+
+def bprtarget16 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchOnRegTargetOpValue";
+}
+
+def calltarget : Operand<i32> {
+  let EncoderMethod = "getCallTargetOpValue";
+  let DecoderMethod = "DecodeCall";
+}
+
+def simm13Op : Operand<i32> {
+  let DecoderMethod = "DecodeSIMM13";
+}
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+  def CCOp : Operand<i32>;
+
+def SDTSPcmpicc :
+SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPcmpfcc :
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc :
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDTSPFTOX :
+SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisFP<1>]>;
+def SDTSPXTOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f64>]>;
+
+def SDTSPtlsadd :
+SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDTSPtlsld :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrxcc : SDNode<"SPISD::BRXCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+
+def SPhi    : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+def SPftox  : SDNode<"SPISD::FTOX", SDTSPFTOX>;
+def SPxtof  : SDNode<"SPISD::XTOF", SDTSPXTOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInGlue]>;
+
+//  These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_SPCall    : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"SPISD::CALL", SDT_SPCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                            SDNPVariadic]>;
+
+def SDT_SPRet     : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
+                           [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+
+def tlsadd        : SDNode<"SPISD::TLS_ADD", SDTSPtlsadd>;
+def tlsld         : SDNode<"SPISD::TLS_LD",  SDTSPtlsld>;
+def tlscall       : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
+def getPCX        : Operand<iPTR> {
+  let PrintMethod = "printGetPCX";
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the CCOp::CondCode enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+def ICC_NE  : ICC_VAL< 9>;  // Not Equal
+def ICC_E   : ICC_VAL< 1>;  // Equal
+def ICC_G   : ICC_VAL<10>;  // Greater
+def ICC_LE  : ICC_VAL< 2>;  // Less or Equal
+def ICC_GE  : ICC_VAL<11>;  // Greater or Equal
+def ICC_L   : ICC_VAL< 3>;  // Less
+def ICC_GU  : ICC_VAL<12>;  // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>;  // Less or Equal Unsigned
+def ICC_CC  : ICC_VAL<13>;  // Carry Clear/Great or Equal Unsigned
+def ICC_CS  : ICC_VAL< 5>;  // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>;  // Positive
+def ICC_NEG : ICC_VAL< 6>;  // Negative
+def ICC_VC  : ICC_VAL<15>;  // Overflow Clear
+def ICC_VS  : ICC_VAL< 7>;  // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+def FCC_U   : FCC_VAL<23>;  // Unordered
+def FCC_G   : FCC_VAL<22>;  // Greater
+def FCC_UG  : FCC_VAL<21>;  // Unordered or Greater
+def FCC_L   : FCC_VAL<20>;  // Less
+def FCC_UL  : FCC_VAL<19>;  // Unordered or Less
+def FCC_LG  : FCC_VAL<18>;  // Less or Greater
+def FCC_NE  : FCC_VAL<17>;  // Not Equal
+def FCC_E   : FCC_VAL<25>;  // Equal
+def FCC_UE  : FCC_VAL<24>;  // Unordered or Equal
+def FCC_GE  : FCC_VAL<25>;  // Greater or Equal
+def FCC_UGE : FCC_VAL<26>;  // Unordered or Greater or Equal
+def FCC_LE  : FCC_VAL<27>;  // Less or Equal
+def FCC_ULE : FCC_VAL<28>;  // Unordered or Less or Equal
+def FCC_O   : FCC_VAL<29>;  // Ordered
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode,
+                 RegisterClass RC, ValueType Ty, Operand immOp> {
+  def rr  : F3_1<2, Op3Val,
+                 (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+                 !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+                 [(set Ty:$rd, (OpNode Ty:$rs1, Ty:$rs2))]>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs RC:$rd), (ins RC:$rs1, immOp:$simm13),
+                 !strconcat(OpcStr, " $rs1, $simm13, $rd"),
+                 [(set Ty:$rd, (OpNode Ty:$rs1, (Ty simm13:$simm13)))]>;
+}
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
+  def rr  : F3_1<2, Op3Val,
+                 (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                 !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                 !strconcat(OpcStr, " $rs1, $simm13, $rd"), []>;
+}
+
+// Load multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+           RegisterClass RC, ValueType Ty> {
+  def rr  : F3_1<3, Op3Val,
+                 (outs RC:$dst), (ins MEMrr:$addr),
+                 !strconcat(OpcStr, " [$addr], $dst"),
+                 [(set Ty:$dst, (OpNode ADDRrr:$addr))]>;
+  def ri  : F3_2<3, Op3Val,
+                 (outs RC:$dst), (ins MEMri:$addr),
+                 !strconcat(OpcStr, " [$addr], $dst"),
+                 [(set Ty:$dst, (OpNode ADDRri:$addr))]>;
+}
+
+// Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+           RegisterClass RC, ValueType Ty> {
+  def rr  : F3_1<3, Op3Val,
+                 (outs), (ins MEMrr:$addr, RC:$rd),
+                 !strconcat(OpcStr, " $rd, [$addr]"),
+                 [(OpNode Ty:$rd, ADDRrr:$addr)]>;
+  def ri  : F3_2<3, Op3Val,
+                 (outs), (ins MEMri:$addr, RC:$rd),
+                 !strconcat(OpcStr, " $rd, [$addr]"),
+                 [(OpNode Ty:$rd, ADDRri:$addr)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// GETPCX for PIC
+let Defs = [O7] in {
+  def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
+}
+
+let Defs = [O6], Uses = [O6] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                               "!ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "!ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let hasSideEffects = 1, mayStore = 1 in {
+  let rd = 0, rs1 = 0, rs2 = 0 in
+    def FLUSHW : F3_1<0b10, 0b101011, (outs), (ins),
+                      "flushw",
+                      [(flushw)]>, Requires<[HasV9]>;
+  let rd = 0, rs1 = 1, simm13 = 3 in
+    def TA3 : F3_2<0b10, 0b111010, (outs), (ins),
+                   "ta 3",
+                   [(flushw)]>;
+}
+
+let isBarrier = 1, isTerminator = 1, rd = 0b1000, rs1 = 0, simm13 = 5 in
+  def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
+
+let rd = 0 in
+  def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22),
+                  "unimp $imm22", []>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.  This has to handle all
+// permutations of selection between i32/f32/f64 on ICC and FCC.
+// Expanded after instruction selection.
+let Uses = [ICC], usesCustomInserter = 1 in {
+  def SELECT_CC_Int_ICC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_ICC PSEUDO!",
+            [(set i32:$dst, (SPselecticc i32:$T, i32:$F, imm:$Cond))]>;
+  def SELECT_CC_FP_ICC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_ICC PSEUDO!",
+            [(set f32:$dst, (SPselecticc f32:$T, f32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_DFP_ICC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_ICC PSEUDO!",
+            [(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+
+  def SELECT_CC_QFP_ICC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_ICC PSEUDO!",
+            [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
+let usesCustomInserter = 1, Uses = [FCC0] in {
+
+  def SELECT_CC_Int_FCC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_FCC PSEUDO!",
+            [(set i32:$dst, (SPselectfcc i32:$T, i32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_FP_FCC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_FCC PSEUDO!",
+            [(set f32:$dst, (SPselectfcc f32:$T, f32:$F, imm:$Cond))]>;
+  def SELECT_CC_DFP_FCC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_FCC PSEUDO!",
+            [(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+  def SELECT_CC_QFP_FCC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_FCC PSEUDO!",
+            [(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
+// JMPL Instruction.
+let isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+    DecoderMethod = "DecodeJMPL" in {
+  def JMPLrr: F3_1<2, 0b111000, (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "jmpl $addr, $dst", []>;
+  def JMPLri: F3_2<2, 0b111000, (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "jmpl $addr, $dst", []>;
+}
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+    isCodeGenOnly = 1 in {
+  let rd = 0, rs1 = 15 in
+    def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
+                   "jmp %o7+$val", [(retflag simm13:$val)]>;
+
+  let rd = 0, rs1 = 31 in
+    def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
+                  "jmp %i7+$val", []>;
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1,
+     isBarrier = 1, rd = 0, DecoderMethod = "DecodeReturn" in {
+  def RETTrr : F3_1<2, 0b111001, (outs), (ins MEMrr:$addr),
+                       "rett $addr", []>;
+  def RETTri : F3_2<2, 0b111001, (outs), (ins MEMri:$addr),
+                       "rett $addr", []>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+let DecoderMethod = "DecodeLoadInt" in {
+  defm LDSB : Load<"ldsb", 0b001001, sextloadi8,  IntRegs, i32>;
+  defm LDSH : Load<"ldsh", 0b001010, sextloadi16, IntRegs, i32>;
+  defm LDUB : Load<"ldub", 0b000001, zextloadi8,  IntRegs, i32>;
+  defm LDUH : Load<"lduh", 0b000010, zextloadi16, IntRegs, i32>;
+  defm LD   : Load<"ld",   0b000000, load,        IntRegs, i32>;
+}
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+let DecoderMethod = "DecodeLoadFP" in
+  defm LDF   : Load<"ld",  0b100000, load, FPRegs,  f32>;
+let DecoderMethod = "DecodeLoadDFP" in
+  defm LDDF  : Load<"ldd", 0b100011, load, DFPRegs, f64>;
+let DecoderMethod = "DecodeLoadQFP" in
+  defm LDQF  : Load<"ldq", 0b100010, load, QFPRegs, f128>,
+               Requires<[HasV9, HasHardQuad]>;
+
+// Section B.4 - Store Integer Instructions, p. 95
+let DecoderMethod = "DecodeStoreInt" in {
+  defm STB   : Store<"stb", 0b000101, truncstorei8,  IntRegs, i32>;
+  defm STH   : Store<"sth", 0b000110, truncstorei16, IntRegs, i32>;
+  defm ST    : Store<"st",  0b000100, store,         IntRegs, i32>;
+}
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+let DecoderMethod = "DecodeStoreFP" in
+  defm STF   : Store<"st",  0b100100, store,         FPRegs,  f32>;
+let DecoderMethod = "DecodeStoreDFP" in
+  defm STDF  : Store<"std", 0b100111, store,         DFPRegs, f64>;
+let DecoderMethod = "DecodeStoreQFP" in
+  defm STQF  : Store<"stq", 0b100110, store,         QFPRegs, f128>,
+               Requires<[HasV9, HasHardQuad]>;
+
+// Section B.9 - SETHI Instruction, p. 104
+def SETHIi: F2_1<0b100,
+                 (outs IntRegs:$rd), (ins i32imm:$imm22),
+                 "sethi $imm22, $rd",
+                 [(set i32:$rd, SETHIimm:$imm22)]>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI)
+let rd = 0, imm22 = 0 in
+  def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND    : F3_12<"and", 0b000001, and, IntRegs, i32, simm13Op>;
+
+def ANDNrr  : F3_1<2, 0b000101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "andn $rs1, $rs2, $rd",
+                   [(set i32:$rd, (and i32:$rs1, (not i32:$rs2)))]>;
+def ANDNri  : F3_2<2, 0b000101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "andn $rs1, $simm13, $rd", []>;
+
+defm OR     : F3_12<"or", 0b000010, or, IntRegs, i32, simm13Op>;
+
+def ORNrr   : F3_1<2, 0b000110,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "orn $rs1, $rs2, $rd",
+                   [(set i32:$rd, (or i32:$rs1, (not i32:$rs2)))]>;
+def ORNri   : F3_2<2, 0b000110,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "orn $rs1, $simm13, $rd", []>;
+defm XOR    : F3_12<"xor", 0b000011, xor, IntRegs, i32, simm13Op>;
+
+def XNORrr  : F3_1<2, 0b000111,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "xnor $rs1, $rs2, $rd",
+                   [(set i32:$rd, (not (xor i32:$rs1, i32:$rs2)))]>;
+def XNORri  : F3_2<2, 0b000111,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "xnor $rs1, $simm13, $rd", []>;
+
+let Defs = [ICC] in {
+  defm ANDCC  : F3_12np<"andcc",  0b010001>;
+  defm ANDNCC : F3_12np<"andncc", 0b010101>;
+  defm ORCC   : F3_12np<"orcc",   0b010010>;
+  defm ORNCC  : F3_12np<"orncc",  0b010110>;
+  defm XORCC  : F3_12np<"xorcc",  0b010011>;
+  defm XNORCC : F3_12np<"xnorcc", 0b010111>;
+}
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl, IntRegs, i32, simm13Op>;
+defm SRL : F3_12<"srl", 0b100110, srl, IntRegs, i32, simm13Op>;
+defm SRA : F3_12<"sra", 0b100111, sra, IntRegs, i32, simm13Op>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD   : F3_12<"add", 0b000000, add, IntRegs, i32, simm13Op>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+let Predicates = [Is32Bit], isCodeGenOnly = 1 in
+  def LEA_ADDri   : F3_2<2, 0b000000,
+                     (outs IntRegs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
+
+let Defs = [ICC] in
+  defm ADDCC  : F3_12<"addcc", 0b010000, addc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+  defm ADDC   : F3_12np<"addx", 0b001000>;
+
+let Uses = [ICC], Defs = [ICC] in
+  defm ADDE  : F3_12<"addxcc", 0b011000, adde, IntRegs, i32, simm13Op>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB    : F3_12  <"sub"  , 0b000100, sub, IntRegs, i32, simm13Op>;
+let Uses = [ICC], Defs = [ICC] in
+  defm SUBE   : F3_12  <"subxcc" , 0b011100, sube, IntRegs, i32, simm13Op>;
+
+let Defs = [ICC] in
+  defm SUBCC  : F3_12  <"subcc", 0b010100, subc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+  defm SUBC   : F3_12np <"subx", 0b001100>;
+
+let Defs = [ICC], rd = 0 in {
+  def CMPrr   : F3_1<2, 0b010100,
+                     (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+                     "cmp $rs1, $rs2",
+                     [(SPcmpicc i32:$rs1, i32:$rs2)]>;
+  def CMPri   : F3_2<2, 0b010100,
+                     (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+                     "cmp $rs1, $simm13",
+                     [(SPcmpicc i32:$rs1, (i32 simm13:$simm13))]>;
+}
+
+// Section B.18 - Multiply Instructions, p. 113
+let Defs = [Y] in {
+  defm UMUL : F3_12np<"umul", 0b001010>;
+  defm SMUL : F3_12  <"smul", 0b001011, mul, IntRegs, i32, simm13Op>;
+}
+
+let Defs = [Y, ICC] in {
+  defm UMULCC : F3_12np<"umulcc", 0b011010>;
+  defm SMULCC : F3_12np<"smulcc", 0b011011>;
+}
+
+// Section B.19 - Divide Instructions, p. 115
+let Defs = [Y] in {
+  defm UDIV : F3_12np<"udiv", 0b001110>;
+  defm SDIV : F3_12np<"sdiv", 0b001111>;
+}
+
+let Defs = [Y, ICC] in {
+  defm UDIVCC : F3_12np<"udivcc", 0b011110>;
+  defm SDIVCC : F3_12np<"sdivcc", 0b011111>;
+}
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE    : F3_12np<"save"   , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// unconditional branch class.
+class BranchAlways<dag ins, string asmstr, list<dag> pattern>
+  : F2_2<0b010, 0, (outs), ins, asmstr, pattern> {
+  let isBranch     = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let isBarrier    = 1;
+}
+
+let cond = 8 in
+  def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// conditional branch class:
+class BranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern>;
+
+// conditional branch with annul class:
+class BranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 1, (outs), ins, asmstr, pattern>;
+
+// Conditional branch class on %icc|%xcc with predication:
+multiclass IPredBranch<string regstr, list<dag> CCPattern> {
+  def CC    : F2_3<0b001, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                  !strconcat("b$cond ", !strconcat(regstr, ", $imm19")),
+                   CCPattern>;
+  def CCA   : F2_3<0b001, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                  !strconcat("b$cond,a ", !strconcat(regstr, ", $imm19")),
+                   []>;
+  def CCNT  : F2_3<0b001, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                   !strconcat("b$cond,pn ", !strconcat(regstr, ", $imm19")),
+                   []>;
+  def CCANT : F2_3<0b001, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                   !strconcat("b$cond,a,pn ", !strconcat(regstr, ", $imm19")),
+                   []>;
+}
+
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+
+// Indirect branch instructions.
+let isTerminator = 1, isBarrier = 1,  hasDelaySlot = 1, isBranch =1,
+     isIndirectBranch = 1, rd = 0, isCodeGenOnly = 1 in {
+  def BINDrr  : F3_1<2, 0b111000,
+                   (outs), (ins MEMrr:$ptr),
+                   "jmp $ptr",
+                   [(brind ADDRrr:$ptr)]>;
+  def BINDri  : F3_2<2, 0b111000,
+                   (outs), (ins MEMri:$ptr),
+                   "jmp $ptr",
+                   [(brind ADDRri:$ptr)]>;
+}
+
+let Uses = [ICC] in {
+  def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                         "b$cond $imm22",
+                        [(SPbricc bb:$imm22, imm:$cond)]>;
+  def BCONDA : BranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+                         "b$cond,a $imm22", []>;
+
+  let Predicates = [HasV9], cc = 0b00 in
+    defm BPI : IPredBranch<"%icc", []>;
+}
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// floating-point conditional branch class:
+class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 0, (outs), ins, asmstr, pattern>;
+
+// floating-point conditional branch with annul class:
+class FPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 1, (outs), ins, asmstr, pattern>;
+
+// Conditional branch class on %fcc0-%fcc3 with predication:
+multiclass FPredBranch {
+  def CC    : F2_3<0b101, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond $cc, $imm19", []>;
+  def CCA   : F2_3<0b101, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond,a $cc, $imm19", []>;
+  def CCNT  : F2_3<0b101, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond,pn $cc, $imm19", []>;
+  def CCANT : F2_3<0b101, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond,a,pn $cc, $imm19", []>;
+}
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+let Uses = [FCC0] in {
+  def FBCOND  : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                              "fb$cond $imm22",
+                              [(SPbrfcc bb:$imm22, imm:$cond)]>;
+  def FBCONDA : FPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+                             "fb$cond,a $imm22", []>;
+}
+
+let Predicates = [HasV9] in
+  defm BPF : FPredBranch;
+
+
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
+let Uses = [O6],
+    hasDelaySlot = 1, isCall = 1 in {
+  def CALL : InstSP<(outs), (ins calltarget:$disp, variable_ops),
+                    "call $disp", []> {
+    bits<30> disp;
+    let op = 1;
+    let Inst{29-0} = disp;
+  }
+
+  // indirect calls: special cases of JMPL.
+  let isCodeGenOnly = 1, rd = 15 in {
+    def CALLrr : F3_1<2, 0b111000,
+                      (outs), (ins MEMrr:$ptr, variable_ops),
+                      "call $ptr",
+                      [(call ADDRrr:$ptr)]>;
+    def CALLri : F3_2<2, 0b111000,
+                      (outs), (ins MEMri:$ptr, variable_ops),
+                      "call $ptr",
+                      [(call ADDRri:$ptr)]>;
+  }
+}
+
+// Section B.28 - Read State Register Instructions
+let Uses = [Y], rs1 = 0, rs2 = 0 in
+  def RDY : F3_1<2, 0b101000,
+                 (outs IntRegs:$dst), (ins),
+                 "rd %y, $dst", []>;
+
+// Section B.29 - Write State Register Instructions
+let Defs = [Y], rd = 0 in {
+  def WRYrr : F3_1<2, 0b110000,
+                   (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "wr $rs1, $rs2, %y", []>;
+  def WRYri : F3_2<2, 0b110000,
+                   (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "wr $rs1, $simm13, %y", []>;
+}
+// Convert Integer to Floating-point Instructions, p. 141
+def FITOS : F3_3u<2, 0b110100, 0b011000100,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fitos $rs2, $rd",
+                 [(set FPRegs:$rd, (SPitof FPRegs:$rs2))]>;
+def FITOD : F3_3u<2, 0b110100, 0b011001000,
+                 (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fitod $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPitof FPRegs:$rs2))]>;
+def FITOQ : F3_3u<2, 0b110100, 0b011001100,
+                 (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fitoq $rs2, $rd",
+                 [(set QFPRegs:$rd, (SPitof FPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+def FSTOI : F3_3u<2, 0b110100, 0b011010001,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstoi $rs2, $rd",
+                 [(set FPRegs:$rd, (SPftoi FPRegs:$rs2))]>;
+def FDTOI : F3_3u<2, 0b110100, 0b011010010,
+                 (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtoi $rs2, $rd",
+                 [(set FPRegs:$rd, (SPftoi DFPRegs:$rs2))]>;
+def FQTOI : F3_3u<2, 0b110100, 0b011010011,
+                 (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtoi $rs2, $rd",
+                 [(set FPRegs:$rd, (SPftoi QFPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3u<2, 0b110100, 0b011001001,
+                 (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstod $rs2, $rd",
+                 [(set f64:$rd, (fextend f32:$rs2))]>;
+def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+                 (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstoq $rs2, $rd",
+                 [(set f128:$rd, (fextend f32:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+def FDTOS : F3_3u<2, 0b110100, 0b011000110,
+                 (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtos $rs2, $rd",
+                 [(set f32:$rd, (fround f64:$rs2))]>;
+def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
+                 (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtoq $rs2, $rd",
+                 [(set f128:$rd, (fextend f64:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+                 (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtos $rs2, $rd",
+                 [(set f32:$rd, (fround f128:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+                 (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtod $rs2, $rd",
+                 [(set f64:$rd, (fround f128:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Floating-point Move Instructions, p. 144
+def FMOVS : F3_3u<2, 0b110100, 0b000000001,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fmovs $rs2, $rd", []>;
+def FNEGS : F3_3u<2, 0b110100, 0b000000101,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fnegs $rs2, $rd",
+                 [(set f32:$rd, (fneg f32:$rs2))]>;
+def FABSS : F3_3u<2, 0b110100, 0b000001001,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fabss $rs2, $rd",
+                 [(set f32:$rd, (fabs f32:$rs2))]>;
+
+
+// Floating-point Square Root Instructions, p.145
+def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                  "fsqrts $rs2, $rd",
+                  [(set f32:$rd, (fsqrt f32:$rs2))]>;
+def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                  "fsqrtd $rs2, $rd",
+                  [(set f64:$rd, (fsqrt f64:$rs2))]>;
+def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                  "fsqrtq $rs2, $rd",
+                  [(set f128:$rd, (fsqrt f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS  : F3_3<2, 0b110100, 0b001000001,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fadds $rs1, $rs2, $rd",
+                  [(set f32:$rd, (fadd f32:$rs1, f32:$rs2))]>;
+def FADDD  : F3_3<2, 0b110100, 0b001000010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "faddd $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fadd f64:$rs1, f64:$rs2))]>;
+def FADDQ  : F3_3<2, 0b110100, 0b001000011,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                  "faddq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fadd f128:$rs1, f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+def FSUBS  : F3_3<2, 0b110100, 0b001000101,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fsubs $rs1, $rs2, $rd",
+                  [(set f32:$rd, (fsub f32:$rs1, f32:$rs2))]>;
+def FSUBD  : F3_3<2, 0b110100, 0b001000110,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fsubd $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fsub f64:$rs1, f64:$rs2))]>;
+def FSUBQ  : F3_3<2, 0b110100, 0b001000111,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                  "fsubq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fsub f128:$rs1, f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+
+// Floating-point Multiply and Divide Instructions, p. 147
+def FMULS  : F3_3<2, 0b110100, 0b001001001,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fmuls $rs1, $rs2, $rd",
+                  [(set f32:$rd, (fmul f32:$rs1, f32:$rs2))]>;
+def FMULD  : F3_3<2, 0b110100, 0b001001010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fmuld $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fmul f64:$rs1, f64:$rs2))]>;
+def FMULQ  : F3_3<2, 0b110100, 0b001001011,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                  "fmulq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fmul f128:$rs1, f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+                  (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fsmuld $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fmul (fextend f32:$rs1),
+                                        (fextend f32:$rs2)))]>;
+def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+                  (outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fdmulq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fmul (fextend f64:$rs1),
+                                         (fextend f64:$rs2)))]>,
+                  Requires<[HasHardQuad]>;
+
+def FDIVS  : F3_3<2, 0b110100, 0b001001101,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                 "fdivs $rs1, $rs2, $rd",
+                 [(set f32:$rd, (fdiv f32:$rs1, f32:$rs2))]>;
+def FDIVD  : F3_3<2, 0b110100, 0b001001110,
+                 (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                 "fdivd $rs1, $rs2, $rd",
+                 [(set f64:$rd, (fdiv f64:$rs1, f64:$rs2))]>;
+def FDIVQ  : F3_3<2, 0b110100, 0b001001111,
+                 (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                 "fdivq $rs1, $rs2, $rd",
+                 [(set f128:$rd, (fdiv f128:$rs1, f128:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the 2nd template arg is different for these guys.
+// Note 2: the result of a FCMP is not available until the 2nd cycle
+// after the instr is retired, but there is no interlock in Sparc V8.
+// This behavior is modeled with a forced noop after the instruction in
+// DelaySlotFiller.
+
+let Defs = [FCC0], rd = 0, isCodeGenOnly = 1 in {
+  def FCMPS  : F3_3c<2, 0b110101, 0b001010001,
+                   (outs), (ins FPRegs:$rs1, FPRegs:$rs2),
+                   "fcmps $rs1, $rs2",
+                   [(SPcmpfcc f32:$rs1, f32:$rs2)]>;
+  def FCMPD  : F3_3c<2, 0b110101, 0b001010010,
+                   (outs), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                   "fcmpd $rs1, $rs2",
+                   [(SPcmpfcc f64:$rs1, f64:$rs2)]>;
+  def FCMPQ  : F3_3c<2, 0b110101, 0b001010011,
+                   (outs), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                   "fcmpq $rs1, $rs2",
+                   [(SPcmpfcc f128:$rs1, f128:$rs2)]>,
+                   Requires<[HasHardQuad]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions for Thread Local Storage(TLS).
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, isAsmParserOnly = 1 in {
+def TLS_ADDrr : F3_1<2, 0b000000,
+                    (outs IntRegs:$rd),
+                    (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
+                    "add $rs1, $rs2, $rd, $sym",
+                    [(set i32:$rd,
+                        (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>;
+
+let mayLoad = 1 in
+  def TLS_LDrr : F3_1<3, 0b000000,
+                      (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                      "ld [$addr], $dst, $sym",
+                      [(set i32:$dst,
+                          (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
+  def TLS_CALL : InstSP<(outs),
+                        (ins calltarget:$disp, TLSSym:$sym, variable_ops),
+                        "call $disp, $sym",
+                        [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)]> {
+  bits<30> disp;
+  let op = 1;
+  let Inst{29-0} = disp;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], Constraints = "$f = $rd" in {
+  // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+  let Uses = [ICC], intcc = 1, cc = 0b00 in {
+    def MOVICCrr
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $rs2, $rd",
+             [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cond))]>;
+
+    def MOVICCri
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
+  }
+
+  let Uses = [FCC0], intcc = 0, cc = 0b00 in {
+    def MOVFCCrr
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $rs2, $rd",
+             [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cond))]>;
+    def MOVFCCri
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
+  }
+
+  let Uses = [ICC], intcc = 1, opf_cc = 0b00 in {
+    def FMOVS_ICC
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %icc, $rs2, $rd",
+             [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cond))]>;
+    def FMOVD_ICC
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+               (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+               "fmovd$cond %icc, $rs2, $rd",
+               [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_ICC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+               (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+               "fmovq$cond %icc, $rs2, $rd",
+               [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>,
+               Requires<[HasHardQuad]>;
+  }
+
+  let Uses = [FCC0], intcc = 0, opf_cc = 0b00 in {
+    def FMOVS_FCC
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %fcc0, $rs2, $rd",
+             [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cond))]>;
+    def FMOVD_FCC
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+             (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+             "fmovd$cond %fcc0, $rs2, $rd",
+             [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_FCC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+             (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+             "fmovq$cond %fcc0, $rs2, $rd",
+             [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>,
+             Requires<[HasHardQuad]>;
+  }
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+let Predicates = [HasV9] in {
+  def FMOVD : F3_3u<2, 0b110100, 0b000000010,
+                   (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                   "fmovd $rs2, $rd", []>;
+  def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
+                   (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                   "fmovq $rs2, $rd", []>,
+                   Requires<[HasHardQuad]>;
+  def FNEGD : F3_3u<2, 0b110100, 0b000000110,
+                   (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                   "fnegd $rs2, $rd",
+                   [(set f64:$rd, (fneg f64:$rs2))]>;
+  def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
+                   (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                   "fnegq $rs2, $rd",
+                   [(set f128:$rd, (fneg f128:$rs2))]>,
+                   Requires<[HasHardQuad]>;
+  def FABSD : F3_3u<2, 0b110100, 0b000001010,
+                   (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                   "fabsd $rs2, $rd",
+                   [(set f64:$rd, (fabs f64:$rs2))]>;
+  def FABSQ : F3_3u<2, 0b110100, 0b000001011,
+                   (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                   "fabsq $rs2, $rd",
+                   [(set f128:$rd, (fabs f128:$rs2))]>,
+                   Requires<[HasHardQuad]>;
+}
+
+// Floating-point compare instruction with %fcc0-%fcc3.
+def V9FCMPS  : F3_3c<2, 0b110101, 0b001010001,
+               (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+               "fcmps $rd, $rs1, $rs2", []>;
+def V9FCMPD  : F3_3c<2, 0b110101, 0b001010010,
+                (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                "fcmpd $rd, $rs1, $rs2", []>;
+def V9FCMPQ  : F3_3c<2, 0b110101, 0b001010011,
+                (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                "fcmpq $rd, $rs1, $rs2", []>,
+                 Requires<[HasHardQuad]>;
+
+let hasSideEffects = 1 in {
+  def V9FCMPES  : F3_3c<2, 0b110101, 0b001010101,
+                   (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                   "fcmpes $rd, $rs1, $rs2", []>;
+  def V9FCMPED  : F3_3c<2, 0b110101, 0b001010110,
+                   (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                   "fcmped $rd, $rs1, $rs2", []>;
+  def V9FCMPEQ  : F3_3c<2, 0b110101, 0b001010111,
+                   (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                   "fcmpeq $rd, $rs1, $rs2", []>,
+                   Requires<[HasHardQuad]>;
+}
+
+// Floating point conditional move instrucitons with %fcc0-%fcc3.
+let Predicates = [HasV9] in {
+  let Constraints = "$f = $rd", intcc = 0 in {
+    def V9MOVFCCrr
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins FCCRegs:$cc, IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond $cc, $rs2, $rd", []>;
+    def V9MOVFCCri
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins FCCRegs:$cc, i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond $cc, $simm11, $rd", []>;
+    def V9FMOVS_FCC
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FCCRegs:$opf_cc, FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond $opf_cc, $rs2, $rd", []>;
+    def V9FMOVD_FCC
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+             (ins FCCRegs:$opf_cc, DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+             "fmovd$cond $opf_cc, $rs2, $rd", []>;
+    def V9FMOVQ_FCC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+             (ins FCCRegs:$opf_cc, QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+             "fmovq$cond $opf_cc, $rs2, $rd", []>,
+             Requires<[HasHardQuad]>;
+  } // Constraints = "$f = $rd", ...
+} // let Predicates = [hasV9]
+
+
+// POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
+// the top 32-bits before using it.  To do this clearing, we use a SRLri X,0.
+let rs1 = 0 in
+  def POPCrr : F3_1<2, 0b101110,
+                    (outs IntRegs:$dst), (ins IntRegs:$src),
+                    "popc $src, $dst", []>, Requires<[HasV9]>;
+def : Pat<(ctpop i32:$src),
+          (POPCrr (SRLri $src, 0))>;
+
+// Atomic swap.
+let hasSideEffects =1, rd = 0, rs1 = 0b01111, rs2 = 0 in
+  def STBAR : F3_1<2, 0b101000, (outs), (ins), "stbar", []>;
+
+let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
+ def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13),
+                    "membar $simm13", []>;
+
+let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in {
+  def SWAPrr : F3_1<3, 0b001111,
+                 (outs IntRegs:$dst), (ins MEMrr:$addr, IntRegs:$val),
+                 "swap [$addr], $dst",
+                 [(set i32:$dst, (atomic_swap_32 ADDRrr:$addr, i32:$val))]>;
+  def SWAPri : F3_2<3, 0b001111,
+                 (outs IntRegs:$dst), (ins MEMri:$addr, IntRegs:$val),
+                 "swap [$addr], $dst",
+                 [(set i32:$dst, (atomic_swap_32 ADDRri:$addr, i32:$val))]>;
+}
+
+let Predicates = [HasV9], Constraints = "$swap = $rd" in
+  def CASrr: F3_1_asi<3, 0b111100, 0b10000000,
+                (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+                                     IntRegs:$swap),
+                 "cas [$rs1], $rs2, $rd",
+                 [(set i32:$rd,
+                     (atomic_cmp_swap iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+let Defs = [ICC] in {
+defm TADDCC   : F3_12np<"taddcc",   0b100000>;
+defm TSUBCC   : F3_12np<"tsubcc",   0b100001>;
+
+let hasSideEffects = 1 in {
+  defm TADDCCTV : F3_12np<"taddcctv", 0b100010>;
+  defm TSUBCCTV : F3_12np<"tsubcctv", 0b100011>;
+}
+}
+
+multiclass TRAP<string regStr> {
+  def rr : TRAPSPrr<0b111010, (outs), (ins IntRegs:$rs1, IntRegs:$rs2,
+                                       CCOp:$cond),
+              !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $rs2"), []>;
+  def ri : TRAPSPri<0b111010, (outs), (ins IntRegs:$rs1, i32imm:$imm,
+                                      CCOp:$cond),
+              !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $imm"), []>;
+}
+
+let hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+  defm TICC : TRAP<"%icc">;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm13:$val),
+          (ORri (i32 G0), imm:$val)>;
+// Arbitrary immediates.
+def : Pat<(i32 imm:$val),
+          (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+
+
+// Global addresses, constant pool entries
+let Predicates = [Is32Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORri (i32 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (ADDri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (XORri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)),  (ADDri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+                        (ADDri $r, tblockaddress:$in)>;
+}
+
+// Calls:
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
+def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
+
+// store bar for all atomic_fence in V8.
+let Predicates = [HasNoV9] in
+  def : Pat<(atomic_fence imm, imm), (STBAR)>;
+
+// atomic_load_32 addr -> load addr
+def : Pat<(i32 (atomic_load ADDRrr:$src)), (LDrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>;
+
+// atomic_store_32 val, addr -> store val, addr
+def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
+
+
+include "SparcInstr64Bit.td"
+include "SparcInstrVIS.td"
+include "SparcInstrAliases.td"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td b/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td
new file mode 100644
index 0000000..3e2b49d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td
@@ -0,0 +1,263 @@
+//===---- SparcInstrVIS.td - Visual Instruction Set extensions (VIS) -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction formats, definitions and patterns needed for
+// VIS, VIS II, VIS II instructions on SPARC.
+//===----------------------------------------------------------------------===//
+
+// VIS Instruction Format.
+class VISInstFormat<bits<9> opfval, dag outs, dag ins, string asmstr,
+      list<dag> pattern>
+      : F3_3<0b10, 0b110110, opfval, outs, ins, asmstr, pattern>;
+
+class VISInst<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+        !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// VIS Instruction with integer destination register.
+class VISInstID<bits<9> opfval, string OpcStr>
+       : VISInstFormat<opfval,
+        (outs I64Regs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+        !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// For VIS Instructions with no operand.
+let rd = 0, rs1 = 0, rs2 = 0 in
+class VISInst0<bits<9> opfval, string asmstr>
+       : VISInstFormat<opfval, (outs), (ins), asmstr, []>;
+
+// For VIS Instructions with only rs1, rd operands.
+let rs2 = 0 in
+class VISInst1<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$rs1),
+        !strconcat(OpcStr, " $rs1, $rd"), []>;
+
+// For VIS Instructions with only rs2, rd operands.
+let rs1 = 0 in
+class VISInst2<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$rs2),
+        !strconcat(OpcStr, " $rs2, $rd"), []>;
+
+// For VIS Instructions with only rd operand.
+let Constraints = "$rd = $f", rs1 = 0, rs2 = 0 in
+class VISInstD<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$f),
+        !strconcat(OpcStr, " $rd"), []>;
+
+// VIS 1 Instructions
+let Predicates = [HasVIS] in {
+
+def FPADD16     : VISInst<0b001010000, "fpadd16">;
+def FPADD16S    : VISInst<0b001010001, "fpadd16s">;
+def FPADD32     : VISInst<0b001010010, "fpadd32">;
+def FPADD32S    : VISInst<0b001010011, "fpadd32s">;
+def FPSUB16     : VISInst<0b001010100, "fpsub16">;
+def FPSUB16S    : VISInst<0b001010101, "fpsub16S">;
+def FPSUB32     : VISInst<0b001010110, "fpsub32">;
+def FPSUB32S    : VISInst<0b001010111, "fpsub32S">;
+
+def FPACK16     : VISInst2<0b000111011, "fpack16">;
+def FPACK32     : VISInst <0b000111010, "fpack32">;
+def FPACKFIX    : VISInst2<0b000111101, "fpackfix">;
+def FEXPAND     : VISInst2<0b001001101, "fexpand">;
+def FPMERGE     : VISInst <0b001001011, "fpmerge">;
+
+def FMUL8X16    : VISInst<0b00110001, "fmul8x16">;
+def FMUL8X16AU  : VISInst<0b00110011, "fmul8x16au">;
+def FMUL8X16AL  : VISInst<0b00110101, "fmul8x16al">;
+def FMUL8SUX16  : VISInst<0b00110110, "fmul8sux16">;
+def FMUL8ULX16  : VISInst<0b00110111, "fmul8ulx16">;
+def FMULD8SUX16 : VISInst<0b00111000, "fmuld8sux16">;
+def FMULD8ULX16 : VISInst<0b00111001, "fmuld8ulx16">;
+
+def ALIGNADDR   : VISInst<0b000011000, "alignaddr", I64Regs>;
+def ALIGNADDRL  : VISInst<0b000011010, "alignaddrl", I64Regs>;
+def FALIGNADATA : VISInst<0b001001000, "faligndata">;
+
+def FZERO       : VISInstD<0b001100000, "fzero">;
+def FZEROS      : VISInstD<0b001100001, "fzeros", FPRegs>;
+def FONE        : VISInstD<0b001111110, "fone">;
+def FONES       : VISInstD<0b001111111, "fones", FPRegs>;
+def FSRC1       : VISInst1<0b001110100, "fsrc1">;
+def FSRC1S      : VISInst1<0b001110101, "fsrc1s", FPRegs>;
+def FSRC2       : VISInst2<0b001111000, "fsrc2">;
+def FSRC2S      : VISInst2<0b001111001, "fsrc2s", FPRegs>;
+def FNOT1       : VISInst1<0b001101010, "fnot1">;
+def FNOT1S      : VISInst1<0b001101011, "fnot1s", FPRegs>;
+def FNOT2       : VISInst2<0b001100110, "fnot2">;
+def FNOT2S      : VISInst2<0b001100111, "fnot2s", FPRegs>;
+def FOR         : VISInst<0b001111100,  "for">;
+def FORS        : VISInst<0b001111101,  "fors",  FPRegs>;
+def FNOR        : VISInst<0b001100010,  "fnor">;
+def FNORS       : VISInst<0b001100011,  "fnors", FPRegs>;
+def FAND        : VISInst<0b001110000,  "fand">;
+def FANDS       : VISInst<0b001110001,  "fands", FPRegs>;
+def FNAND       : VISInst<0b001101110,  "fnand">;
+def FNANDS      : VISInst<0b001101111,  "fnands", FPRegs>;
+def FXOR        : VISInst<0b001101100,  "fxor">;
+def FXORS       : VISInst<0b001101101,  "fxors", FPRegs>;
+def FXNOR       : VISInst<0b001110010,  "fxnor">;
+def FXNORS      : VISInst<0b001110011,  "fxnors", FPRegs>;
+
+def FORNOT1     : VISInst<0b001111010,  "fornot1">;
+def FORNOT1S    : VISInst<0b001111011,  "fornot1s",  FPRegs>;
+def FORNOT2     : VISInst<0b001110110,  "fornot2">;
+def FORNOT2S    : VISInst<0b001110111,  "fornot2s",  FPRegs>;
+def FANDNOT1    : VISInst<0b001101000,  "fandnot1">;
+def FANDNOT1S   : VISInst<0b001101001,  "fandnot1s", FPRegs>;
+def FANDNOT2    : VISInst<0b001100100,  "fandnot2">;
+def FANDNOT2S   : VISInst<0b001100101,  "fandnot2s", FPRegs>;
+
+def FCMPGT16    : VISInstID<0b000101000,  "fcmpgt16">;
+def FCMPGT32    : VISInstID<0b000101100,  "fcmpgt32">;
+def FCMPLE16    : VISInstID<0b000100000,  "fcmple16">;
+def FCMPLE32    : VISInstID<0b000100100,  "fcmple32">;
+def FCMPNE16    : VISInstID<0b000100010,  "fcmpne16">;
+def FCMPNE32    : VISInstID<0b000100110,  "fcmpne32">;
+def FCMPEQ16    : VISInstID<0b000101010,  "fcmpeq16">;
+def FCMPEQ32    : VISInstID<0b000101110,  "fcmpeq32">;
+
+
+def EDGE8       : VISInst<0b000000000,  "edge8",   I64Regs>;
+def EDGE8L      : VISInst<0b000000010,  "edge8l",  I64Regs>;
+def EDGE16      : VISInst<0b000000100,  "edge16",  I64Regs>;
+def EDGE16L     : VISInst<0b000000110,  "edge16l", I64Regs>;
+def EDGE32      : VISInst<0b000001000,  "edge32",  I64Regs>;
+def EDGE32L     : VISInst<0b000001010,  "edge32l", I64Regs>;
+
+def PDIST       : VISInst<0b00111110, "pdist">;
+
+def ARRAY8      : VISInst<0b000010000, "array8",  I64Regs>;
+def ARRAY16     : VISInst<0b000010010, "array16", I64Regs>;
+def ARRAY32     : VISInst<0b000010100, "array32", I64Regs>;
+
+def SHUTDOWN    : VISInst0<0b010000000, "shutdown">;
+
+} // Predicates = [HasVIS]
+
+
+// VIS 2 Instructions.
+let Predicates = [HasVIS2] in {
+
+def BMASK     : VISInst<0b000011001, "bmask", I64Regs>;
+def BSHUFFLE  : VISInst<0b000011100, "bshuffle">;
+
+def SIAM      : VISInst0<0b010000001, "siam">;
+
+def EDGE8N    : VISInst<0b000000001,  "edge8n",   I64Regs>;
+def EDGE8LN   : VISInst<0b000000011,  "edge8ln",  I64Regs>;
+def EDGE16N   : VISInst<0b000000101,  "edge16n",  I64Regs>;
+def EDGE16LN  : VISInst<0b000000111,  "edge16ln", I64Regs>;
+def EDGE32N   : VISInst<0b000001001,  "edge32n",  I64Regs>;
+def EDGE32LN  : VISInst<0b000001011,  "edge32ln", I64Regs>;
+} // Predicates = [HasVIS2]
+
+
+// VIS 3 Instructions.
+let Predicates = [HasVIS3] in {
+
+let Uses = [ICC] in
+def ADDXC : VISInst<0b000010001, "addxc", I64Regs>;
+
+let Defs = [ICC], Uses = [ICC] in
+def ADDXCCC : VISInst<0b000010011, "addxccc", I64Regs>;
+
+let rd = 0, rs1 = 0 in {
+def CMASK8  : VISInstFormat<0b000011011, (outs), (ins I64Regs:$rs2),
+              "cmask8 $rs2", []>;
+def CMASK16  : VISInstFormat<0b000011101, (outs), (ins I64Regs:$rs2),
+              "cmask16 $rs2", []>;
+def CMASK32  : VISInstFormat<0b000011111, (outs), (ins I64Regs:$rs2),
+              "cmask32 $rs2", []>;
+
+}
+
+def FCHKSM16 : VISInst<0b01000100, "fchksm16">;
+
+def FHADDS   : F3_3<0b10, 0b110100, 0b001100001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhadds $rs1, $rs2, $rd", []>;
+def FHADDD   : F3_3<0b10, 0b110100, 0b001100010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhaddd $rs1, $rs2, $rd", []>;
+def FHSUBS   : F3_3<0b10, 0b110100, 0b001100101,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhsubs $rs1, $rs2, $rd", []>;
+def FHSUBD   : F3_3<0b10, 0b110100, 0b001100110,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhsubd $rs1, $rs2, $rd", []>;
+def FLCMPS   : VISInstFormat<0b101010001, (outs FCCRegs:$rd),
+                     (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                     "flcmps $rd, $rs1, $rs2", []>;
+def FLCMPD   : VISInstFormat<0b101010010, (outs FCCRegs:$rd),
+                     (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                     "flcmpd $rd, $rs1, $rs2", []>;
+
+def FMEAN16  : VISInst<0b001000000, "fmean16">;
+
+def FNADDS   : F3_3<0b10, 0b110100, 0b001010001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnadds $rs1, $rs2, $rd", []>;
+def FNADDD   : F3_3<0b10, 0b110100, 0b001010010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnaddd $rs1, $rs2, $rd", []>;
+def FNHADDS  : F3_3<0b10, 0b110100, 0b001110001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhadds $rs1, $rs2, $rd", []>;
+def FNHADDD  : F3_3<0b10, 0b110100, 0b001110010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhaddd $rs1, $rs2, $rd", []>;
+
+def FNMULS   : F3_3<0b10, 0b110100, 0b001011001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhadds $rs1, $rs2, $rd", []>;
+def FNMULD   : F3_3<0b10, 0b110100, 0b001011010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhaddd $rs1, $rs2, $rd", []>;
+def FNSMULD  : F3_3<0b10, 0b110100, 0b001111001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhadds $rs1, $rs2, $rd", []>;
+
+def FPADD64   : VISInst<0b001000010, "fpadd64">;
+
+def FSLL16    : VISInst<0b00100001, "fsll16">;
+def FSRL16    : VISInst<0b00100011, "fsrl16">;
+def FSLL32    : VISInst<0b00100101, "fsll32">;
+def FSRL32    : VISInst<0b00100111, "fsrl32">;
+def FSLAS16   : VISInst<0b00101001, "fslas16">;
+def FSRA16    : VISInst<0b00101011, "fsra16">;
+def FSLAS32   : VISInst<0b00101101, "fslas32">;
+def FSRA32    : VISInst<0b00101111, "fsra32">;
+
+let rs1 = 0 in
+def LZCNT     : VISInstFormat<0b000010111, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs2), "lzcnt $rs2, $rd", []>;
+
+let rs1 = 0 in {
+def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd),
+                   (ins DFPRegs:$rs2), "movstosw $rs2, $rd", []>;
+def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd),
+                   (ins DFPRegs:$rs2), "movstouw $rs2, $rd", []>;
+def MOVDTOX  : VISInstFormat<0b100010000, (outs I64Regs:$rd),
+                   (ins DFPRegs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVWTOS  :  VISInstFormat<0b100011001, (outs DFPRegs:$rd),
+                   (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVXTOD  :  VISInstFormat<0b100011000, (outs DFPRegs:$rd),
+                   (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>;
+}
+
+def PDISTN   : VISInst<0b000111111, "pdistn">;
+
+def UMULXHI  : VISInst<0b000010110, "umulxhi", I64Regs>;
+def XMULX    : VISInst<0b100010101, "xmulx",   I64Regs>;
+def XMULXHI  : VISInst<0b100010111, "xmulxhi", I64Regs>;
+} // Predicates = [IsVIS3]
diff --git a/contrib/llvm/lib/Target/Sparc/SparcJITInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcJITInfo.cpp
new file mode 100644
index 0000000..d0eec98
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcJITInfo.cpp
@@ -0,0 +1,326 @@
+//===-- SparcJITInfo.cpp - Implement the Sparc JIT Interface --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Sparc target.
+//
+//===----------------------------------------------------------------------===//
+#include "SparcJITInfo.h"
+#include "Sparc.h"
+#include "SparcRelocations.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Memory.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+extern "C" void SparcCompilationCallback();
+
+extern "C" {
+#if defined (__sparc__)
+
+#if defined(__arch64__)
+#define FRAME_PTR(X) #X "+2047"
+#else
+#define FRAME_PTR(X) #X
+#endif
+
+  asm(
+      ".text\n"
+      "\t.align 4\n"
+      "\t.global SparcCompilationCallback\n"
+      "\t.type SparcCompilationCallback, #function\n"
+      "SparcCompilationCallback:\n"
+      // Save current register window and create stack.
+      // 128 (save area) + 6*8 (for arguments) + 16*8 (for float regfile) = 304
+      "\tsave %sp, -304, %sp\n"
+      // save float regfile to the stack.
+      "\tstd %f0,  [" FRAME_PTR(%fp) "-0]\n"
+      "\tstd %f2,  [" FRAME_PTR(%fp) "-8]\n"
+      "\tstd %f4,  [" FRAME_PTR(%fp) "-16]\n"
+      "\tstd %f6,  [" FRAME_PTR(%fp) "-24]\n"
+      "\tstd %f8,  [" FRAME_PTR(%fp) "-32]\n"
+      "\tstd %f10, [" FRAME_PTR(%fp) "-40]\n"
+      "\tstd %f12, [" FRAME_PTR(%fp) "-48]\n"
+      "\tstd %f14, [" FRAME_PTR(%fp) "-56]\n"
+      "\tstd %f16, [" FRAME_PTR(%fp) "-64]\n"
+      "\tstd %f18, [" FRAME_PTR(%fp) "-72]\n"
+      "\tstd %f20, [" FRAME_PTR(%fp) "-80]\n"
+      "\tstd %f22, [" FRAME_PTR(%fp) "-88]\n"
+      "\tstd %f24, [" FRAME_PTR(%fp) "-96]\n"
+      "\tstd %f26, [" FRAME_PTR(%fp) "-104]\n"
+      "\tstd %f28, [" FRAME_PTR(%fp) "-112]\n"
+      "\tstd %f30, [" FRAME_PTR(%fp) "-120]\n"
+      // stubaddr is in %g1.
+      "\tcall SparcCompilationCallbackC\n"
+      "\t  mov %g1, %o0\n"
+      // restore float regfile from the stack.
+      "\tldd [" FRAME_PTR(%fp) "-0],   %f0\n"
+      "\tldd [" FRAME_PTR(%fp) "-8],   %f2\n"
+      "\tldd [" FRAME_PTR(%fp) "-16],  %f4\n"
+      "\tldd [" FRAME_PTR(%fp) "-24],  %f6\n"
+      "\tldd [" FRAME_PTR(%fp) "-32],  %f8\n"
+      "\tldd [" FRAME_PTR(%fp) "-40],  %f10\n"
+      "\tldd [" FRAME_PTR(%fp) "-48],  %f12\n"
+      "\tldd [" FRAME_PTR(%fp) "-56],  %f14\n"
+      "\tldd [" FRAME_PTR(%fp) "-64],  %f16\n"
+      "\tldd [" FRAME_PTR(%fp) "-72],  %f18\n"
+      "\tldd [" FRAME_PTR(%fp) "-80],  %f20\n"
+      "\tldd [" FRAME_PTR(%fp) "-88],  %f22\n"
+      "\tldd [" FRAME_PTR(%fp) "-96],  %f24\n"
+      "\tldd [" FRAME_PTR(%fp) "-104], %f26\n"
+      "\tldd [" FRAME_PTR(%fp) "-112], %f28\n"
+      "\tldd [" FRAME_PTR(%fp) "-120], %f30\n"
+      // restore original register window and
+      // copy %o0 to %g1
+      "\trestore %o0, 0, %g1\n"
+      // call the new stub
+      "\tjmp %g1\n"
+      "\t  nop\n"
+      "\t.size   SparcCompilationCallback, .-SparcCompilationCallback"
+      );
+#else
+  void SparcCompilationCallback() {
+    llvm_unreachable(
+      "Cannot call SparcCompilationCallback() on a non-sparc arch!");
+  }
+#endif
+}
+
+
+#define SETHI_INST(imm, rd)    (0x01000000 | ((rd) << 25) | ((imm) & 0x3FFFFF))
+#define JMP_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x38 << 19) \
+                                | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define NOP_INST               SETHI_INST(0, 0)
+#define OR_INST_I(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x02 << 19) \
+                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define OR_INST_R(rs1, rs2, rd) (0x80000000 | ((rd) << 25) | (0x02 << 19) \
+                                 | ((rs1) << 14) | (0 << 13) | ((rs2) & 0x1F))
+#define RDPC_INST(rd)           (0x80000000 | ((rd) << 25) | (0x28 << 19) \
+                                 | (5 << 14))
+#define LDX_INST(rs1, imm, rd)  (0xC0000000 | ((rd) << 25) | (0x0B << 19) \
+                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define SLLX_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x25 << 19) \
+                                 | ((rs1) << 14) | (3 << 12) | ((imm) & 0x3F))
+#define SUB_INST(rs1, imm, rd)  (0x80000000 | ((rd) << 25) | (0x04 << 19) \
+                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define XOR_INST(rs1, imm, rd)  (0x80000000 | ((rd) << 25) | (0x03 << 19) \
+                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define BA_INST(tgt)             (0x10800000 | ((tgt) & 0x3FFFFF))
+
+// Emit instructions to jump to Addr and store the starting address of
+// the instructions emitted in the scratch register.
+static void emitInstrForIndirectJump(intptr_t Addr,
+                                     unsigned scratch,
+                                     SmallVectorImpl<uint32_t> &Insts) {
+
+  if (isInt<13>(Addr)) {
+    // Emit: jmpl %g0+Addr, <scratch>
+    //         nop
+    Insts.push_back(JMP_INST(0, LO10(Addr), scratch));
+    Insts.push_back(NOP_INST);
+    return;
+  }
+
+  if (isUInt<32>(Addr)) {
+    // Emit: sethi %hi(Addr), scratch
+    //       jmpl scratch+%lo(Addr), scratch
+    //         sub scratch, 4, scratch
+    Insts.push_back(SETHI_INST(HI22(Addr), scratch));
+    Insts.push_back(JMP_INST(scratch, LO10(Addr), scratch));
+    Insts.push_back(SUB_INST(scratch, 4, scratch));
+    return;
+  }
+
+  if (Addr < 0 && isInt<33>(Addr)) {
+    // Emit: sethi %hix(Addr), scratch)
+    //       xor   scratch, %lox(Addr), scratch
+    //       jmpl scratch+0, scratch
+    //         sub scratch, 8, scratch
+    Insts.push_back(SETHI_INST(HIX22(Addr), scratch));
+    Insts.push_back(XOR_INST(scratch, LOX10(Addr), scratch));
+    Insts.push_back(JMP_INST(scratch, 0, scratch));
+    Insts.push_back(SUB_INST(scratch, 8, scratch));
+    return;
+  }
+
+  // Emit: rd %pc, scratch
+  //       ldx [scratch+16], scratch
+  //       jmpl scratch+0, scratch
+  //         sub scratch, 8, scratch
+  //       <Addr: 8 byte>
+  Insts.push_back(RDPC_INST(scratch));
+  Insts.push_back(LDX_INST(scratch, 16, scratch));
+  Insts.push_back(JMP_INST(scratch, 0, scratch));
+  Insts.push_back(SUB_INST(scratch, 8, scratch));
+  Insts.push_back((uint32_t)(((int64_t)Addr) >> 32) & 0xffffffff);
+  Insts.push_back((uint32_t)(Addr & 0xffffffff));
+
+  // Instruction sequence without rdpc instruction
+  // 7 instruction and 2 scratch register
+  // Emit: sethi %hh(Addr), scratch
+  //       or scratch, %hm(Addr), scratch
+  //       sllx scratch, 32, scratch
+  //       sethi %hi(Addr), scratch2
+  //       or scratch, scratch2, scratch
+  //       jmpl scratch+%lo(Addr), scratch
+  //         sub scratch, 20, scratch
+  // Insts.push_back(SETHI_INST(HH22(Addr), scratch));
+  // Insts.push_back(OR_INST_I(scratch, HM10(Addr), scratch));
+  // Insts.push_back(SLLX_INST(scratch, 32, scratch));
+  // Insts.push_back(SETHI_INST(HI22(Addr), scratch2));
+  // Insts.push_back(OR_INST_R(scratch, scratch2, scratch));
+  // Insts.push_back(JMP_INST(scratch, LO10(Addr), scratch));
+  // Insts.push_back(SUB_INST(scratch, 20, scratch));
+}
+
+extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+  // Rewrite the function stub so that we don't end up here every time we
+  // execute the call. We're replacing the stub instructions with code
+  // that jumps to the compiled function:
+
+  SmallVector<uint32_t, 8> Insts;
+  intptr_t diff = (NewVal - StubAddr) >> 2;
+  if (isInt<22>(diff)) {
+    // Use branch instruction to jump
+    Insts.push_back(BA_INST(diff));
+    Insts.push_back(NOP_INST);
+  } else {
+    // Otherwise, use indirect jump to the compiled function
+    emitInstrForIndirectJump(NewVal, 1, Insts);
+  }
+
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i)
+    *(uint32_t *)(StubAddr + i*4) = Insts[i];
+
+  sys::Memory::InvalidateInstructionCache((void*) StubAddr, Insts.size() * 4);
+  return (void*)StubAddr;
+}
+
+
+void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  llvm_unreachable("FIXME: Implement SparcJITInfo::"
+                   "replaceMachineCodeForFunction");
+}
+
+
+TargetJITInfo::StubLayout SparcJITInfo::getStubLayout() {
+  // The stub contains maximum of 4 4-byte instructions and 8 bytes for address,
+  // aligned at 32 bytes.
+  // See emitFunctionStub and emitInstrForIndirectJump for details.
+  StubLayout Result = { 4*4 + 8, 32 };
+  return Result;
+}
+
+void *SparcJITInfo::emitFunctionStub(const Function *F, void *Fn,
+                                     JITCodeEmitter &JCE)
+{
+  JCE.emitAlignment(32);
+  void *Addr = (void*) (JCE.getCurrentPCValue());
+
+  intptr_t CurrentAddr = (intptr_t)Addr;
+  intptr_t EmittedAddr;
+  SmallVector<uint32_t, 8> Insts;
+  if (Fn != (void*)(intptr_t)SparcCompilationCallback) {
+    EmittedAddr = (intptr_t)Fn;
+    intptr_t diff = (EmittedAddr - CurrentAddr) >> 2;
+    if (isInt<22>(diff)) {
+      Insts.push_back(BA_INST(diff));
+      Insts.push_back(NOP_INST);
+    }
+  } else {
+    EmittedAddr = (intptr_t)SparcCompilationCallback;
+  }
+
+  if (Insts.size() == 0)
+    emitInstrForIndirectJump(EmittedAddr, 1, Insts);
+
+
+  if (!sys::Memory::setRangeWritable(Addr, 4 * Insts.size()))
+    llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i)
+    JCE.emitWordBE(Insts[i]);
+
+  sys::Memory::InvalidateInstructionCache(Addr, 4 * Insts.size());
+  if (!sys::Memory::setRangeExecutable(Addr, 4 * Insts.size()))
+    llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+  return Addr;
+}
+
+
+TargetJITInfo::LazyResolverFn
+SparcJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return SparcCompilationCallback;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void SparcJITInfo::relocate(void *Function, MachineRelocation *MR,
+                            unsigned NumRelocs, unsigned char *GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+    switch ((SP::RelocationType) MR->getRelocationType()) {
+    case SP::reloc_sparc_hi:
+      ResultPtr = (ResultPtr >> 10) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_lo:
+      ResultPtr = (ResultPtr & 0x3ff);
+      break;
+
+    case SP::reloc_sparc_pc30:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffffff;
+      break;
+
+    case SP::reloc_sparc_pc22:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_pc19:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x7ffff;
+      break;
+
+    case SP::reloc_sparc_h44:
+      ResultPtr = (ResultPtr >> 22) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_m44:
+      ResultPtr = (ResultPtr >> 12) & 0x3ff;
+      break;
+
+    case SP::reloc_sparc_l44:
+      ResultPtr = (ResultPtr & 0xfff);
+      break;
+
+    case SP::reloc_sparc_hh:
+      ResultPtr = (((int64_t)ResultPtr) >> 42) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_hm:
+      ResultPtr = (((int64_t)ResultPtr) >> 32) & 0x3ff;
+      break;
+
+    }
+    *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcJITInfo.h b/contrib/llvm/lib/Target/Sparc/SparcJITInfo.h
new file mode 100644
index 0000000..ff1b43a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcJITInfo.h
@@ -0,0 +1,67 @@
+//==- SparcJITInfo.h - Sparc Implementation of the JIT Interface -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCJITINFO_H
+#define SPARCJITINFO_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class SparcTargetMachine;
+
+class SparcJITInfo : public TargetJITInfo {
+
+  bool IsPIC;
+
+  public:
+  explicit SparcJITInfo()
+    :  IsPIC(false) {}
+
+  /// replaceMachineCodeForFunction - Make it so that calling the function
+  /// whose machine code is at OLD turns into a call to NEW, perhaps by
+  /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+  /// code.
+  ///
+  void replaceMachineCodeForFunction(void *Old, void *New) override;
+
+  // getStubLayout - Returns the size and alignment of the largest call stub
+  // on Sparc.
+  StubLayout getStubLayout() override;
+
+
+  /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+  /// small native function that simply calls the function at the specified
+  /// address.
+  void *emitFunctionStub(const Function *F, void *Fn,
+                         JITCodeEmitter &JCE) override;
+
+  /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+  LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+
+  /// relocate - Before the JIT can run a block of code that has been emitted,
+  /// it must rewrite the code to contain the actual addresses of any
+  /// referenced global symbols.
+  void relocate(void *Function, MachineRelocation *MR,
+                unsigned NumRelocs, unsigned char *GOTBase) override;
+
+  /// Initialize - Initialize internal stage for the function being JITted.
+  void Initialize(const MachineFunction &MF, bool isPIC) {
+    IsPIC = isPIC;
+  }
+
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
new file mode 100644
index 0000000..9e94d2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -0,0 +1,109 @@
+//===-- SparcMCInstLower.cpp - Convert Sparc MachineInstr to MCInst -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Sparc MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+
+static MCOperand LowerSymbolOperand(const MachineInstr *MI,
+                                    const MachineOperand &MO,
+                                    AsmPrinter &AP) {
+
+  SparcMCExpr::VariantKind Kind =
+    (SparcMCExpr::VariantKind)MO.getTargetFlags();
+  const MCSymbol *Symbol = nullptr;
+
+  switch(MO.getType()) {
+  default: llvm_unreachable("Unknown type in LowerSymbolOperand");
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AP.getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AP.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AP.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AP.GetCPISymbol(MO.getIndex());
+    break;
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol,
+                                                         AP.OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::Create(Kind, MCSym,
+                                                AP.OutContext);
+  return MCOperand::CreateExpr(expr);
+}
+
+static MCOperand LowerOperand(const MachineInstr *MI,
+                              const MachineOperand &MO,
+                              AsmPrinter &AP) {
+  switch(MO.getType()) {
+  default: llvm_unreachable("unknown operand type"); break;
+  case MachineOperand::MO_Register:
+    if (MO.isImplicit())
+      break;
+    return MCOperand::CreateReg(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MCOperand::CreateImm(MO.getImm());
+
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_ConstantPoolIndex:
+    return LowerSymbolOperand(MI, MO, AP);
+
+  case MachineOperand::MO_RegisterMask:   break;
+
+  }
+  return MCOperand();
+}
+
+void llvm::LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+                                          MCInst &OutMI,
+                                          AsmPrinter &AP)
+{
+
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MI, MO, AP);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
new file mode 100644
index 0000000..e744282
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- SparcMachineFunctionInfo.cpp - Sparc Machine Function Info --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void SparcMachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
new file mode 100644
index 0000000..3783c16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -0,0 +1,56 @@
+//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares  Sparc specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SPARCMACHINEFUNCTIONINFO_H
+#define SPARCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+  class SparcMachineFunctionInfo : public MachineFunctionInfo {
+    virtual void anchor();
+  private:
+    unsigned GlobalBaseReg;
+
+    /// VarArgsFrameOffset - Frame offset to start of varargs area.
+    int VarArgsFrameOffset;
+
+    /// SRetReturnReg - Holds the virtual register into which the sret
+    /// argument is passed.
+    unsigned SRetReturnReg;
+
+    /// IsLeafProc - True if the function is a leaf procedure.
+    bool IsLeafProc;
+  public:
+    SparcMachineFunctionInfo()
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
+    explicit SparcMachineFunctionInfo(MachineFunction &MF)
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
+
+    unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+    void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+    int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+    void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
+
+    unsigned getSRetReturnReg() const { return SRetReturnReg; }
+    void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+    void setLeafProc(bool rhs) { IsLeafProc = rhs; }
+    bool isLeafProc() const { return IsLeafProc; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 0000000..dc1ec7c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,211 @@
+//===-- SparcRegisterInfo.cpp - SPARC Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcRegisterInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "SparcGenRegisterInfo.inc"
+
+static cl::opt<bool>
+ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
+                    cl::desc("Reserve application registers (%g2-%g4)"));
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
+  : SparcGenRegisterInfo(SP::O7), Subtarget(st) {
+}
+
+const MCPhysReg*
+SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return CSR_SaveList;
+}
+
+const uint32_t*
+SparcRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  return CSR_RegMask;
+}
+
+const uint32_t*
+SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
+  return RTCSR_RegMask;
+}
+
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  Reserved.set(SP::G1);
+
+  // G1-G4 can be used in applications.
+  if (ReserveAppRegisters) {
+    Reserved.set(SP::G2);
+    Reserved.set(SP::G3);
+    Reserved.set(SP::G4);
+  }
+  // G5 is not reserved in 64 bit mode.
+  if (!Subtarget.is64Bit())
+    Reserved.set(SP::G5);
+
+  Reserved.set(SP::O6);
+  Reserved.set(SP::I6);
+  Reserved.set(SP::I7);
+  Reserved.set(SP::G0);
+  Reserved.set(SP::G6);
+  Reserved.set(SP::G7);
+
+  // Unaliased double registers are not available in non-V9 targets.
+  if (!Subtarget.isV9()) {
+    for (unsigned n = 0; n != 16; ++n) {
+      for (MCRegAliasIterator AI(SP::D16 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
+  return Reserved;
+}
+
+const TargetRegisterClass*
+SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+}
+
+static void replaceFI(MachineFunction &MF,
+                      MachineBasicBlock::iterator II,
+                      MachineInstr &MI,
+                      DebugLoc dl,
+                      unsigned FIOperandNum, int Offset,
+                      unsigned FramePtr)
+{
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving G1 all of the time.
+  if (Offset >= 0) {
+    // Emit nonnegaive immediates with sethi + or.
+    // sethi %hi(Offset), %g1
+    // add %g1, %fp, %g1
+    // Insert G1+%lo(offset) into the user.
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(Offset));
+
+
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(FramePtr);
+    // Insert: G1+%lo(offset) into the user.
+    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(LO10(Offset));
+    return;
+  }
+
+  // Emit Negative numbers with sethi + xor
+  // sethi %hix(Offset), %g1
+  // xor  %g1, %lox(offset), %g1
+  // add %g1, %fp, %g1
+  // Insert: G1 + 0 into the user.
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(Offset));
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(Offset));
+
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+    .addReg(FramePtr);
+  // Insert: G1+%lo(offset) into the user.
+  MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+}
+
+
+void
+SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Addressable stack objects are accessed using neg. offsets from %fp
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+                   MI.getOperand(FIOperandNum + 1).getImm() +
+                   Subtarget.getStackPointerBias();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  unsigned FramePtr = SP::I6;
+  if (FuncInfo->isLeafProc()) {
+    // Use %sp and adjust offset if needed.
+    FramePtr = SP::O6;
+    int stackSize = MF.getFrameInfo()->getStackSize();
+    Offset += (stackSize) ? Subtarget.getAdjustedFrameSize(stackSize) : 0 ;
+  }
+
+  if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
+    if (MI.getOpcode() == SP::STQFri) {
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+      unsigned SrcOddReg  = getSubReg(SrcReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
+        .addReg(FramePtr).addImm(0).addReg(SrcEvenReg);
+      replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr);
+      MI.setDesc(TII.get(SP::STDFri));
+      MI.getOperand(2).setReg(SrcOddReg);
+      Offset += 8;
+    } else if (MI.getOpcode() == SP::LDQFri) {
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      unsigned DestReg     = MI.getOperand(0).getReg();
+      unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+      unsigned DestOddReg  = getSubReg(DestReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
+        .addReg(FramePtr).addImm(0);
+      replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr);
+
+      MI.setDesc(TII.get(SP::LDDFri));
+      MI.getOperand(0).setReg(DestOddReg);
+      Offset += 8;
+    }
+  }
+
+  replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr);
+
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return SP::I6;
+}
+
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 0000000..77f879a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,58 @@
+//===-- SparcRegisterInfo.h - Sparc Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCREGISTERINFO_H
+#define SPARCREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SparcGenRegisterInfo.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+class TargetInstrInfo;
+class Type;
+
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+  SparcSubtarget &Subtarget;
+
+  SparcRegisterInfo(SparcSubtarget &st);
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override;
+  const uint32_t* getCallPreservedMask(CallingConv::ID CC) const override;
+
+  const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = nullptr) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 0000000..2cadff1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,211 @@
+//===-- SparcRegisterInfo.td - Sparc Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Sparc register file
+//===----------------------------------------------------------------------===//
+
+class SparcReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "SP";
+}
+
+class SparcCtrlReg<bits<16> Enc, string n>: Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "SP";
+}
+
+let Namespace = "SP" in {
+def sub_even : SubRegIndex<32>;
+def sub_odd  : SubRegIndex<32, 32>;
+def sub_even64 : SubRegIndex<64>;
+def sub_odd64  : SubRegIndex<64, 64>;
+}
+
+// Registers are identified with 5-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
+// Rf - 32-bit floating-point registers
+class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+class Rd<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even, sub_odd];
+  let CoveredBySubRegs = 1;
+}
+
+// Rq - Slots in the FP register file for 128-bit floating-point values.
+class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even64, sub_odd64];
+  let CoveredBySubRegs = 1;
+}
+
+// Control Registers
+def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code.
+foreach I = 0-3 in
+  def FCC#I : SparcCtrlReg<I, "FCC"#I>;
+
+// Y register
+def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>;
+
+// Integer registers
+def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
+def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
+def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
+def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
+def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
+def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
+def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
+def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
+def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
+def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
+def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
+def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
+def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
+def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
+def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
+def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
+def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
+def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
+def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0  : Rf< 0,  "F0">, DwarfRegNum<[32]>;
+def F1  : Rf< 1,  "F1">, DwarfRegNum<[33]>;
+def F2  : Rf< 2,  "F2">, DwarfRegNum<[34]>;
+def F3  : Rf< 3,  "F3">, DwarfRegNum<[35]>;
+def F4  : Rf< 4,  "F4">, DwarfRegNum<[36]>;
+def F5  : Rf< 5,  "F5">, DwarfRegNum<[37]>;
+def F6  : Rf< 6,  "F6">, DwarfRegNum<[38]>;
+def F7  : Rf< 7,  "F7">, DwarfRegNum<[39]>;
+def F8  : Rf< 8,  "F8">, DwarfRegNum<[40]>;
+def F9  : Rf< 9,  "F9">, DwarfRegNum<[41]>;
+def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
+def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
+def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
+def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
+def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
+def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
+def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
+def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
+def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
+def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
+def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
+def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
+def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
+def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
+def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
+def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[72]>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[73]>;
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[74]>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[75]>;
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[76]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
+
+// Unaliased double precision floating point registers.
+// FIXME: Define DwarfRegNum for these registers.
+def D16 : SparcReg< 1, "F32">;
+def D17 : SparcReg< 3, "F34">;
+def D18 : SparcReg< 5, "F36">;
+def D19 : SparcReg< 7, "F38">;
+def D20 : SparcReg< 9, "F40">;
+def D21 : SparcReg<11, "F42">;
+def D22 : SparcReg<13, "F44">;
+def D23 : SparcReg<15, "F46">;
+def D24 : SparcReg<17, "F48">;
+def D25 : SparcReg<19, "F50">;
+def D26 : SparcReg<21, "F52">;
+def D27 : SparcReg<23, "F54">;
+def D28 : SparcReg<25, "F56">;
+def D29 : SparcReg<27, "F58">;
+def D30 : SparcReg<29, "F60">;
+def D31 : SparcReg<31, "F62">;
+
+// Aliases of the F* registers used to hold 128-bit for values (long doubles).
+def Q0  : Rq< 0,  "F0", [D0,   D1]>;
+def Q1  : Rq< 4,  "F4", [D2,   D3]>;
+def Q2  : Rq< 8,  "F8", [D4,   D5]>;
+def Q3  : Rq<12, "F12", [D6,   D7]>;
+def Q4  : Rq<16, "F16", [D8,   D9]>;
+def Q5  : Rq<20, "F20", [D10, D11]>;
+def Q6  : Rq<24, "F24", [D12, D13]>;
+def Q7  : Rq<28, "F28", [D14, D15]>;
+def Q8  : Rq< 1, "F32", [D16, D17]>;
+def Q9  : Rq< 5, "F36", [D18, D19]>;
+def Q10 : Rq< 9, "F40", [D20, D21]>;
+def Q11 : Rq<13, "F44", [D22, D23]>;
+def Q12 : Rq<17, "F48", [D24, D25]>;
+def Q13 : Rq<21, "F52", [D26, D27]>;
+def Q14 : Rq<25, "F56", [D28, D29]>;
+def Q15 : Rq<29, "F60", [D30, D31]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+// This register class should not be used to hold i64 values, use the I64Regs
+// register class for that. The i64 type is included here to allow i64 patterns
+// using the integer instructions.
+def IntRegs : RegisterClass<"SP", [i32, i64], 32,
+                            (add (sequence "I%u", 0, 7),
+                                 (sequence "G%u", 0, 7),
+                                 (sequence "L%u", 0, 7),
+                                 (sequence "O%u", 0, 7))>;
+
+// Register class for 64-bit mode, with a 64-bit spill slot size.
+// These are the same as the 32-bit registers, so TableGen will consider this
+// to be a sub-class of IntRegs. That works out because requiring a 64-bit
+// spill slot is a stricter constraint than only requiring a 32-bit spill slot.
+def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
+
+// Floating point register classes.
+def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
+
+def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+
+// Floating point control register classes.
+def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRelocations.h b/contrib/llvm/lib/Target/Sparc/SparcRelocations.h
new file mode 100644
index 0000000..c1ff78d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRelocations.h
@@ -0,0 +1,56 @@
+//===-- SparcRelocations.h - Sparc Code Relocations -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sparc target-specific relocation types
+// (for relocation-model=static).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_RELOCATIONS_H
+#define SPARC_RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace SP {
+    enum RelocationType {
+      // reloc_sparc_hi - upper 22 bits
+      reloc_sparc_hi = 1,
+
+      // reloc_sparc_lo - lower 10 bits
+      reloc_sparc_lo = 2,
+
+      // reloc_sparc_pc30 - pc rel. 30 bits for call
+      reloc_sparc_pc30 = 3,
+
+     // reloc_sparc_pc22 - pc rel. 22 bits for branch
+      reloc_sparc_pc22 = 4,
+
+      // reloc_sparc_pc22 - pc rel. 19 bits for branch with icc/xcc
+      reloc_sparc_pc19 = 5,
+
+      // reloc_sparc_h44 - 43-22 bits
+      reloc_sparc_h44 = 6,
+
+      // reloc_sparc_m44 - 21-12 bits
+      reloc_sparc_m44 = 7,
+
+      // reloc_sparc_l44 - lower 12 bits
+      reloc_sparc_l44 = 8,
+
+      // reloc_sparc_hh - 63-42 bits
+      reloc_sparc_hh  = 9,
+
+      // reloc_sparc_hm - 41-32 bits
+      reloc_sparc_hm  = 10
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
new file mode 100644
index 0000000..a308fc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
@@ -0,0 +1,24 @@
+//===-- SparcSelectionDAGInfo.cpp - Sparc SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SparcSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSelectionDAGInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-selectiondag-info"
+
+SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL)
+  : TargetSelectionDAGInfo(&DL) {
+}
+
+SparcSelectionDAGInfo::~SparcSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h
new file mode 100644
index 0000000..2346f41
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- SparcSelectionDAGInfo.h - Sparc SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sparc subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCSELECTIONDAGINFO_H
+#define SPARCSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class SparcTargetMachine;
+
+class SparcSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit SparcSelectionDAGInfo(const DataLayout &DL);
+  ~SparcSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 0000000..eea0c8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,109 @@
+//===-- SparcSubtarget.cpp - SPARC Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "Sparc.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SparcGenSubtargetInfo.inc"
+
+void SparcSubtarget::anchor() { }
+
+static std::string computeDataLayout(const SparcSubtarget &ST) {
+  // Sparc is big endian.
+  std::string Ret = "E-m:e";
+
+  // Some ABIs have 32bit pointers.
+  if (!ST.is64Bit())
+    Ret += "-p:32:32";
+
+  // Alignments for 64 bit integers.
+  Ret += "-i64:64";
+
+  // On SparcV9 128 floats are aligned to 128 bits, on others only to 64.
+  // On SparcV9 registers can hold 64 or 32 bits, on others only 32.
+  if (ST.is64Bit())
+    Ret += "-n32:64";
+  else
+    Ret += "-f128:64-n32";
+
+  if (ST.is64Bit())
+    Ret += "-S128";
+  else
+    Ret += "-S64";
+
+  return Ret;
+}
+
+SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+  IsV9 = false;
+  V8DeprecatedInsts = false;
+  IsVIS = false;
+  HasHardQuad = false;
+  UsePopc = false;
+
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = (Is64Bit) ? "v9" : "v8";
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+
+  // Popc is a v9-only instruction.
+  if (!IsV9)
+    UsePopc = false;
+
+  return *this;
+}
+
+SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
+                               const std::string &FS, TargetMachine &TM,
+                               bool is64Bit)
+    : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit),
+      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
+      InstrInfo(*this), TLInfo(TM), TSInfo(DL), FrameLowering(*this) {}
+
+int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
+
+  if (is64Bit()) {
+    // All 64-bit stack frames must be 16-byte aligned, and must reserve space
+    // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
+    frameSize += 128;
+    // Frames with calls must also reserve space for 6 outgoing arguments
+    // whether they are used or not. LowerCall_64 takes care of that.
+    assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned");
+  } else {
+    // Emit the correct save instruction based on the number of bytes in
+    // the frame. Minimum stack frame size according to V8 ABI is:
+    //   16 words for register window spill
+    //    1 word for address of returned aggregate-value
+    // +  6 words for passing parameters on the stack
+    // ----------
+    //   23 words * 4 bytes per word = 92 bytes
+    frameSize += 92;
+
+    // Round up to next doubleword boundary -- a double-word boundary
+    // is required by the ABI.
+    frameSize = RoundUpToAlignment(frameSize, 8);
+  }
+  return frameSize;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 0000000..a335778
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,92 @@
+//===-- SparcSubtarget.h - Define Subtarget for the SPARC -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_SUBTARGET_H
+#define SPARC_SUBTARGET_H
+
+#include "SparcFrameLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcISelLowering.h"
+#include "SparcJITInfo.h"
+#include "SparcSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SparcGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class SparcSubtarget : public SparcGenSubtargetInfo {
+  virtual void anchor();
+  bool IsV9;
+  bool V8DeprecatedInsts;
+  bool IsVIS, IsVIS2, IsVIS3;
+  bool Is64Bit;
+  bool HasHardQuad;
+  bool UsePopc;
+  const DataLayout DL;       // Calculates type size & alignment
+  SparcInstrInfo InstrInfo;
+  SparcTargetLowering TLInfo;
+  SparcSelectionDAGInfo TSInfo;
+  SparcFrameLowering FrameLowering;
+  SparcJITInfo JITInfo;
+
+public:
+  SparcSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS, TargetMachine &TM, bool is64bit);
+
+  const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const SparcRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const SparcTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const SparcSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  SparcJITInfo *getJITInfo() { return &JITInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+
+  bool isV9() const { return IsV9; }
+  bool isVIS() const { return IsVIS; }
+  bool isVIS2() const { return IsVIS2; }
+  bool isVIS3() const { return IsVIS3; }
+  bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+  bool hasHardQuad() const { return HasHardQuad; }
+  bool usePopc() const { return UsePopc; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+  SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  bool is64Bit() const { return Is64Bit; }
+
+  /// The 64-bit ABI uses biased stack and frame pointers, so the stack frame
+  /// of the current function is the area from [%sp+BIAS] to [%fp+BIAS].
+  int64_t getStackPointerBias() const {
+    return is64Bit() ? 2047 : 0;
+  }
+
+  /// Given a actual stack size as determined by FrameInfo, this function
+  /// returns adjusted framesize which includes space for register window
+  /// spills and arguments.
+  int getAdjustedFrameSize(int stackSize) const;
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 0000000..0130fac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,101 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "Sparc.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeSparcTarget() {
+  // Register the target.
+  RegisterTargetMachine<SparcV8TargetMachine> X(TheSparcTarget);
+  RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target);
+}
+
+/// SparcTargetMachine ctor - Create an ILP32 architecture model
+///
+SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL,
+                                       bool is64bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, *this, is64bit) {
+  initAsmInfo();
+}
+
+namespace {
+/// Sparc Code Generator Pass Configuration Options.
+class SparcPassConfig : public TargetPassConfig {
+public:
+  SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  SparcTargetMachine &getSparcTargetMachine() const {
+    return getTM<SparcTargetMachine>();
+  }
+
+  bool addInstSelector() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SparcPassConfig(this, PM);
+}
+
+bool SparcPassConfig::addInstSelector() {
+  addPass(createSparcISelDag(getSparcTargetMachine()));
+  return false;
+}
+
+bool SparcTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                        JITCodeEmitter &JCE) {
+  // Machine code emitter pass for Sparc.
+  PM.add(createSparcJITCodeEmitterPass(*this, JCE));
+  return false;
+}
+
+/// addPreEmitPass - This pass may be implemented by targets that want to run
+/// passes immediately before machine code is emitted.  This should return
+/// true if -print-machineinstrs should print out the code after the passes.
+bool SparcPassConfig::addPreEmitPass(){
+  addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+  return true;
+}
+
+void SparcV8TargetMachine::anchor() { }
+
+SparcV8TargetMachine::SparcV8TargetMachine(const Target &T,
+                                           StringRef TT, StringRef CPU,
+                                           StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
+}
+
+void SparcV9TargetMachine::anchor() { }
+
+SparcV9TargetMachine::SparcV9TargetMachine(const Target &T,
+                                           StringRef TT,  StringRef CPU,
+                                           StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 0000000..03b5137
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,83 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETMACHINE_H
+#define SPARCTARGETMACHINE_H
+
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class SparcTargetMachine : public LLVMTargetMachine {
+  SparcSubtarget Subtarget;
+public:
+  SparcTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL, bool is64bit);
+
+  const SparcInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const SparcRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const SparcTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const SparcSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  SparcJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+};
+
+/// SparcV8TargetMachine - Sparc 32-bit target machine
+///
+class SparcV8TargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+public:
+  SparcV8TargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+/// SparcV9TargetMachine - Sparc 64-bit target machine
+///
+class SparcV9TargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+public:
+  SparcV9TargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
new file mode 100644
index 0000000..32b2240
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -0,0 +1,43 @@
+//===------- SparcTargetObjectFile.cpp - Sparc Object Info Impl -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetObjectFile.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+    const TargetMachine &TM, MachineModuleInfo *MMI,
+    MCStreamer &Streamer) const {
+
+  if (Encoding & dwarf::DW_EH_PE_pcrel) {
+    MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", Mang, TM);
+
+    // Add information about the stub reference to ELFMMI so that the stub
+    // gets emitted by the asmprinter.
+    MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
+    if (!StubSym.getPointer()) {
+      MCSymbol *Sym = TM.getSymbol(GV, Mang);
+      StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+    }
+
+    MCContext &Ctx = getContext();
+    return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::Create(SSym, Ctx), Ctx);
+  }
+
+  return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+      GV, Encoding, Mang, TM, MMI, Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
new file mode 100644
index 0000000..c60675b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -0,0 +1,35 @@
+//===-- SparcTargetObjectFile.h - Sparc Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SPARC_TARGETOBJECTFILE_H
+#define LLVM_TARGET_SPARC_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF {
+public:
+  SparcELFTargetObjectFile() :
+    TargetLoweringObjectFileELF()
+  {}
+
+  const MCExpr *
+  getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+                          Mangler &Mang, const TargetMachine &TM,
+                          MachineModuleInfo *MMI,
+                          MCStreamer &Streamer) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h b/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h
new file mode 100644
index 0000000..3767d8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h
@@ -0,0 +1,49 @@
+//===-- SparcTargetStreamer.h - Sparc Target Streamer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETSTREAMER_H
+#define SPARCTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class SparcTargetStreamer : public MCTargetStreamer {
+  virtual void anchor();
+
+public:
+  SparcTargetStreamer(MCStreamer &S);
+  /// Emit ".register <reg>, #ignore".
+  virtual void emitSparcRegisterIgnore(unsigned reg) = 0;
+  /// Emit ".register <reg>, #scratch".
+  virtual void emitSparcRegisterScratch(unsigned reg) = 0;
+};
+
+// This part is for ascii assembly output
+class SparcTargetAsmStreamer : public SparcTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  SparcTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  void emitSparcRegisterIgnore(unsigned reg) override;
+  void emitSparcRegisterScratch(unsigned reg) override;
+
+};
+
+// This part is for ELF object output
+class SparcTargetELFStreamer : public SparcTargetStreamer {
+public:
+  SparcTargetELFStreamer(MCStreamer &S);
+  MCELFStreamer &getStreamer();
+  void emitSparcRegisterIgnore(unsigned reg) override {}
+  void emitSparcRegisterScratch(unsigned reg) override {}
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
new file mode 100644
index 0000000..4eea163
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheSparcTarget;
+Target llvm::TheSparcV9Target;
+
+extern "C" void LLVMInitializeSparcTargetInfo() {
+  RegisterTarget<Triple::sparc, /*HasJIT=*/ true>
+    X(TheSparcTarget, "sparc", "Sparc");
+  RegisterTarget<Triple::sparcv9, /*HasJIT=*/ true>
+    Y(TheSparcV9Target, "sparcv9", "Sparc V9");
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
new file mode 100644
index 0000000..758be41
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -0,0 +1,779 @@
+//===-- SystemZAsmParser.cpp - Parse SystemZ assembly instructions --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return true if Expr is in the range [MinValue, MaxValue].
+static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
+  if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+    int64_t Value = CE->getValue();
+    return Value >= MinValue && Value <= MaxValue;
+  }
+  return false;
+}
+
+namespace {
+enum RegisterKind {
+  GR32Reg,
+  GRH32Reg,
+  GR64Reg,
+  GR128Reg,
+  ADDR32Reg,
+  ADDR64Reg,
+  FP32Reg,
+  FP64Reg,
+  FP128Reg
+};
+
+enum MemoryKind {
+  BDMem,
+  BDXMem,
+  BDLMem
+};
+
+class SystemZOperand : public MCParsedAsmOperand {
+public:
+private:
+  enum OperandKind {
+    KindInvalid,
+    KindToken,
+    KindReg,
+    KindAccessReg,
+    KindImm,
+    KindMem
+  };
+
+  OperandKind Kind;
+  SMLoc StartLoc, EndLoc;
+
+  // A string of length Length, starting at Data.
+  struct TokenOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  // LLVM register Num, which has kind Kind.  In some ways it might be
+  // easier for this class to have a register bank (general, floating-point
+  // or access) and a raw register number (0-15).  This would postpone the
+  // interpretation of the operand to the add*() methods and avoid the need
+  // for context-dependent parsing.  However, we do things the current way
+  // because of the virtual getReg() method, which needs to distinguish
+  // between (say) %r0 used as a single register and %r0 used as a pair.
+  // Context-dependent parsing can also give us slightly better error
+  // messages when invalid pairs like %r1 are used.
+  struct RegOp {
+    RegisterKind Kind;
+    unsigned Num;
+  };
+
+  // Base + Disp + Index, where Base and Index are LLVM registers or 0.
+  // RegKind says what type the registers have (ADDR32Reg or ADDR64Reg).
+  // Length is the operand length for D(L,B)-style operands, otherwise
+  // it is null.
+  struct MemOp {
+    unsigned Base : 8;
+    unsigned Index : 8;
+    unsigned RegKind : 8;
+    unsigned Unused : 8;
+    const MCExpr *Disp;
+    const MCExpr *Length;
+  };
+
+  union {
+    TokenOp Token;
+    RegOp Reg;
+    unsigned AccessReg;
+    const MCExpr *Imm;
+    MemOp Mem;
+  };
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+public:
+  SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc)
+      : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) {}
+
+  // Create particular kinds of operand.
+  static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc,
+                                                       SMLoc EndLoc) {
+    return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
+  }
+  static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
+    auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
+    Op->Token.Data = Str.data();
+    Op->Token.Length = Str.size();
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
+    Op->Reg.Kind = Kind;
+    Op->Reg.Num = Num;
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createAccessReg(unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindAccessReg, StartLoc, EndLoc);
+    Op->AccessReg = Num;
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
+    Op->Imm = Expr;
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createMem(RegisterKind RegKind, unsigned Base, const MCExpr *Disp,
+            unsigned Index, const MCExpr *Length, SMLoc StartLoc,
+            SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc);
+    Op->Mem.RegKind = RegKind;
+    Op->Mem.Base = Base;
+    Op->Mem.Index = Index;
+    Op->Mem.Disp = Disp;
+    Op->Mem.Length = Length;
+    return Op;
+  }
+
+  // Token operands
+  bool isToken() const override {
+    return Kind == KindToken;
+  }
+  StringRef getToken() const {
+    assert(Kind == KindToken && "Not a token");
+    return StringRef(Token.Data, Token.Length);
+  }
+
+  // Register operands.
+  bool isReg() const override {
+    return Kind == KindReg;
+  }
+  bool isReg(RegisterKind RegKind) const {
+    return Kind == KindReg && Reg.Kind == RegKind;
+  }
+  unsigned getReg() const override {
+    assert(Kind == KindReg && "Not a register");
+    return Reg.Num;
+  }
+
+  // Access register operands.  Access registers aren't exposed to LLVM
+  // as registers.
+  bool isAccessReg() const {
+    return Kind == KindAccessReg;
+  }
+
+  // Immediate operands.
+  bool isImm() const override {
+    return Kind == KindImm;
+  }
+  bool isImm(int64_t MinValue, int64_t MaxValue) const {
+    return Kind == KindImm && inRange(Imm, MinValue, MaxValue);
+  }
+  const MCExpr *getImm() const {
+    assert(Kind == KindImm && "Not an immediate");
+    return Imm;
+  }
+
+  // Memory operands.
+  bool isMem() const override {
+    return Kind == KindMem;
+  }
+  bool isMem(RegisterKind RegKind, MemoryKind MemKind) const {
+    return (Kind == KindMem &&
+            Mem.RegKind == RegKind &&
+            (MemKind == BDXMem || !Mem.Index) &&
+            (MemKind == BDLMem) == (Mem.Length != nullptr));
+  }
+  bool isMemDisp12(RegisterKind RegKind, MemoryKind MemKind) const {
+    return isMem(RegKind, MemKind) && inRange(Mem.Disp, 0, 0xfff);
+  }
+  bool isMemDisp20(RegisterKind RegKind, MemoryKind MemKind) const {
+    return isMem(RegKind, MemKind) && inRange(Mem.Disp, -524288, 524287);
+  }
+  bool isMemDisp12Len8(RegisterKind RegKind) const {
+    return isMemDisp12(RegKind, BDLMem) && inRange(Mem.Length, 1, 0x100);
+  }
+
+  // Override MCParsedAsmOperand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  SMLoc getEndLoc() const override { return EndLoc; }
+  void print(raw_ostream &OS) const override;
+
+  // Used by the TableGen code to add particular types of operand
+  // to an instruction.
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+  void addAccessRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    assert(Kind == KindAccessReg && "Invalid operand type");
+    Inst.addOperand(MCOperand::CreateImm(AccessReg));
+  }
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    addExpr(Inst, getImm());
+  }
+  void addBDAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands");
+    assert(Kind == KindMem && Mem.Index == 0 && "Invalid operand type");
+    Inst.addOperand(MCOperand::CreateReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+  }
+  void addBDXAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(Kind == KindMem && "Invalid operand type");
+    Inst.addOperand(MCOperand::CreateReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    Inst.addOperand(MCOperand::CreateReg(Mem.Index));
+  }
+  void addBDLAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(Kind == KindMem && "Invalid operand type");
+    Inst.addOperand(MCOperand::CreateReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    addExpr(Inst, Mem.Length);
+  }
+
+  // Used by the TableGen code to check for particular operand types.
+  bool isGR32() const { return isReg(GR32Reg); }
+  bool isGRH32() const { return isReg(GRH32Reg); }
+  bool isGRX32() const { return false; }
+  bool isGR64() const { return isReg(GR64Reg); }
+  bool isGR128() const { return isReg(GR128Reg); }
+  bool isADDR32() const { return isReg(ADDR32Reg); }
+  bool isADDR64() const { return isReg(ADDR64Reg); }
+  bool isADDR128() const { return false; }
+  bool isFP32() const { return isReg(FP32Reg); }
+  bool isFP64() const { return isReg(FP64Reg); }
+  bool isFP128() const { return isReg(FP128Reg); }
+  bool isBDAddr32Disp12() const { return isMemDisp12(ADDR32Reg, BDMem); }
+  bool isBDAddr32Disp20() const { return isMemDisp20(ADDR32Reg, BDMem); }
+  bool isBDAddr64Disp12() const { return isMemDisp12(ADDR64Reg, BDMem); }
+  bool isBDAddr64Disp20() const { return isMemDisp20(ADDR64Reg, BDMem); }
+  bool isBDXAddr64Disp12() const { return isMemDisp12(ADDR64Reg, BDXMem); }
+  bool isBDXAddr64Disp20() const { return isMemDisp20(ADDR64Reg, BDXMem); }
+  bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
+  bool isU4Imm() const { return isImm(0, 15); }
+  bool isU6Imm() const { return isImm(0, 63); }
+  bool isU8Imm() const { return isImm(0, 255); }
+  bool isS8Imm() const { return isImm(-128, 127); }
+  bool isU16Imm() const { return isImm(0, 65535); }
+  bool isS16Imm() const { return isImm(-32768, 32767); }
+  bool isU32Imm() const { return isImm(0, (1LL << 32) - 1); }
+  bool isS32Imm() const { return isImm(-(1LL << 31), (1LL << 31) - 1); }
+};
+
+class SystemZAsmParser : public MCTargetAsmParser {
+#define GET_ASSEMBLER_HEADER
+#include "SystemZGenAsmMatcher.inc"
+
+private:
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  enum RegisterGroup {
+    RegGR,
+    RegFP,
+    RegAccess
+  };
+  struct Register {
+    RegisterGroup Group;
+    unsigned Num;
+    SMLoc StartLoc, EndLoc;
+  };
+
+  bool parseRegister(Register &Reg);
+
+  bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
+                     bool IsAddress = false);
+
+  OperandMatchResultTy parseRegister(OperandVector &Operands,
+                                     RegisterGroup Group, const unsigned *Regs,
+                                     RegisterKind Kind);
+
+  bool parseAddress(unsigned &Base, const MCExpr *&Disp,
+                    unsigned &Index, const MCExpr *&Length,
+                    const unsigned *Regs, RegisterKind RegKind);
+
+  OperandMatchResultTy parseAddress(OperandVector &Operands,
+                                    const unsigned *Regs, RegisterKind RegKind,
+                                    MemoryKind MemKind);
+
+  bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
+
+public:
+  SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                   const MCInstrInfo &MII,
+                   const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), Parser(parser) {
+    MCAsmParserExtension::Initialize(Parser);
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  // Override MCTargetAsmParser.
+  bool ParseDirective(AsmToken DirectiveID) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  // Used by the TableGen code to parse particular operand types.
+  OperandMatchResultTy parseGR32(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
+  }
+  OperandMatchResultTy parseGRH32(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+  }
+  OperandMatchResultTy parseGRX32(OperandVector &Operands) {
+    llvm_unreachable("GRX32 should only be used for pseudo instructions");
+  }
+  OperandMatchResultTy parseGR64(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
+  }
+  OperandMatchResultTy parseGR128(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg);
+  }
+  OperandMatchResultTy parseADDR32(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg);
+  }
+  OperandMatchResultTy parseADDR64(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parseADDR128(OperandVector &Operands) {
+    llvm_unreachable("Shouldn't be used as an operand");
+  }
+  OperandMatchResultTy parseFP32(OperandVector &Operands) {
+    return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg);
+  }
+  OperandMatchResultTy parseFP64(OperandVector &Operands) {
+    return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg);
+  }
+  OperandMatchResultTy parseFP128(OperandVector &Operands) {
+    return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg);
+  }
+  OperandMatchResultTy parseBDAddr32(OperandVector &Operands) {
+    return parseAddress(Operands, SystemZMC::GR32Regs, ADDR32Reg, BDMem);
+  }
+  OperandMatchResultTy parseBDAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDMem);
+  }
+  OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDXMem);
+  }
+  OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem);
+  }
+  OperandMatchResultTy parseAccessReg(OperandVector &Operands);
+  OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
+                                  int64_t MaxVal);
+  OperandMatchResultTy parsePCRel16(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1);
+  }
+  OperandMatchResultTy parsePCRel32(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1);
+  }
+};
+} // end anonymous namespace
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "SystemZGenAsmMatcher.inc"
+
+void SystemZOperand::print(raw_ostream &OS) const {
+  llvm_unreachable("Not implemented");
+}
+
+// Parse one register of the form %<prefix><number>.
+bool SystemZAsmParser::parseRegister(Register &Reg) {
+  Reg.StartLoc = Parser.getTok().getLoc();
+
+  // Eat the % prefix.
+  if (Parser.getTok().isNot(AsmToken::Percent))
+    return Error(Parser.getTok().getLoc(), "register expected");
+  Parser.Lex();
+
+  // Expect a register name.
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return Error(Reg.StartLoc, "invalid register");
+
+  // Check that there's a prefix.
+  StringRef Name = Parser.getTok().getString();
+  if (Name.size() < 2)
+    return Error(Reg.StartLoc, "invalid register");
+  char Prefix = Name[0];
+
+  // Treat the rest of the register name as a register number.
+  if (Name.substr(1).getAsInteger(10, Reg.Num))
+    return Error(Reg.StartLoc, "invalid register");
+
+  // Look for valid combinations of prefix and number.
+  if (Prefix == 'r' && Reg.Num < 16)
+    Reg.Group = RegGR;
+  else if (Prefix == 'f' && Reg.Num < 16)
+    Reg.Group = RegFP;
+  else if (Prefix == 'a' && Reg.Num < 16)
+    Reg.Group = RegAccess;
+  else
+    return Error(Reg.StartLoc, "invalid register");
+
+  Reg.EndLoc = Parser.getTok().getLoc();
+  Parser.Lex();
+  return false;
+}
+
+// Parse a register of group Group.  If Regs is nonnull, use it to map
+// the raw register number to LLVM numbering, with zero entries indicating
+// an invalid register.  IsAddress says whether the register appears in an
+// address context.
+bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
+                                     const unsigned *Regs, bool IsAddress) {
+  if (parseRegister(Reg))
+    return true;
+  if (Reg.Group != Group)
+    return Error(Reg.StartLoc, "invalid operand for instruction");
+  if (Regs && Regs[Reg.Num] == 0)
+    return Error(Reg.StartLoc, "invalid register pair");
+  if (Reg.Num == 0 && IsAddress)
+    return Error(Reg.StartLoc, "%r0 used in an address");
+  if (Regs)
+    Reg.Num = Regs[Reg.Num];
+  return false;
+}
+
+// Parse a register and add it to Operands.  The other arguments are as above.
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group,
+                                const unsigned *Regs, RegisterKind Kind) {
+  if (Parser.getTok().isNot(AsmToken::Percent))
+    return MatchOperand_NoMatch;
+
+  Register Reg;
+  bool IsAddress = (Kind == ADDR32Reg || Kind == ADDR64Reg);
+  if (parseRegister(Reg, Group, Regs, IsAddress))
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(SystemZOperand::createReg(Kind, Reg.Num,
+                                               Reg.StartLoc, Reg.EndLoc));
+  return MatchOperand_Success;
+}
+
+// Parse a memory operand into Base, Disp, Index and Length.
+// Regs maps asm register numbers to LLVM register numbers and RegKind
+// says what kind of address register we're using (ADDR32Reg or ADDR64Reg).
+bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp,
+                                    unsigned &Index, const MCExpr *&Length,
+                                    const unsigned *Regs,
+                                    RegisterKind RegKind) {
+  // Parse the displacement, which must always be present.
+  if (getParser().parseExpression(Disp))
+    return true;
+
+  // Parse the optional base and index.
+  Index = 0;
+  Base = 0;
+  Length = nullptr;
+  if (getLexer().is(AsmToken::LParen)) {
+    Parser.Lex();
+
+    if (getLexer().is(AsmToken::Percent)) {
+      // Parse the first register and decide whether it's a base or an index.
+      Register Reg;
+      if (parseRegister(Reg, RegGR, Regs, RegKind))
+        return true;
+      if (getLexer().is(AsmToken::Comma))
+        Index = Reg.Num;
+      else
+        Base = Reg.Num;
+    } else {
+      // Parse the length.
+      if (getParser().parseExpression(Length))
+        return true;
+    }
+
+    // Check whether there's a second register.  It's the base if so.
+    if (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();
+      Register Reg;
+      if (parseRegister(Reg, RegGR, Regs, RegKind))
+        return true;
+      Base = Reg.Num;
+    }
+
+    // Consume the closing bracket.
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "unexpected token in address");
+    Parser.Lex();
+  }
+  return false;
+}
+
+// Parse a memory operand and add it to Operands.  The other arguments
+// are as above.
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseAddress(OperandVector &Operands, const unsigned *Regs,
+                               RegisterKind RegKind, MemoryKind MemKind) {
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  unsigned Base, Index;
+  const MCExpr *Disp;
+  const MCExpr *Length;
+  if (parseAddress(Base, Disp, Index, Length, Regs, RegKind))
+    return MatchOperand_ParseFail;
+
+  if (Index && MemKind != BDXMem)
+    {
+      Error(StartLoc, "invalid use of indexed addressing");
+      return MatchOperand_ParseFail;
+    }
+
+  if (Length && MemKind != BDLMem)
+    {
+      Error(StartLoc, "invalid use of length addressing");
+      return MatchOperand_ParseFail;
+    }
+
+  if (!Length && MemKind == BDLMem)
+    {
+      Error(StartLoc, "missing length in address");
+      return MatchOperand_ParseFail;
+    }
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(SystemZOperand::createMem(RegKind, Base, Disp, Index,
+                                               Length, StartLoc, EndLoc));
+  return MatchOperand_Success;
+}
+
+bool SystemZAsmParser::ParseDirective(AsmToken DirectiveID) {
+  return true;
+}
+
+bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                     SMLoc &EndLoc) {
+  Register Reg;
+  if (parseRegister(Reg))
+    return true;
+  if (Reg.Group == RegGR)
+    RegNo = SystemZMC::GR64Regs[Reg.Num];
+  else if (Reg.Group == RegFP)
+    RegNo = SystemZMC::FP64Regs[Reg.Num];
+  else
+    // FIXME: Access registers aren't modelled as LLVM registers yet.
+    return Error(Reg.StartLoc, "invalid operand for instruction");
+  StartLoc = Reg.StartLoc;
+  EndLoc = Reg.EndLoc;
+  return false;
+}
+
+bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                        StringRef Name, SMLoc NameLoc,
+                                        OperandVector &Operands) {
+  Operands.push_back(SystemZOperand::createToken(Name, NameLoc));
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Name)) {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    // Read any subsequent operands.
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();
+      if (parseOperand(Operands, Name)) {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+    }
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+  }
+
+  // Consume the EndOfStatement.
+  Parser.Lex();
+  return false;
+}
+
+bool SystemZAsmParser::parseOperand(OperandVector &Operands,
+                                    StringRef Mnemonic) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  // Check for a register.  All real register operands should have used
+  // a context-dependent parse routine, which gives the required register
+  // class.  The code is here to mop up other cases, like those where
+  // the instruction isn't recognized.
+  if (Parser.getTok().is(AsmToken::Percent)) {
+    Register Reg;
+    if (parseRegister(Reg))
+      return true;
+    Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc));
+    return false;
+  }
+
+  // The only other type of operand is an immediate or address.  As above,
+  // real address operands should have used a context-dependent parse routine,
+  // so we treat any plain expression as an immediate.
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  unsigned Base, Index;
+  const MCExpr *Expr, *Length;
+  if (parseAddress(Base, Expr, Index, Length, SystemZMC::GR64Regs, ADDR64Reg))
+    return true;
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  if (Base || Index || Length)
+    Operands.push_back(SystemZOperand::createInvalid(StartLoc, EndLoc));
+  else
+    Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+  return false;
+}
+
+bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                               OperandVector &Operands,
+                                               MCStreamer &Out,
+                                               unsigned &ErrorInfo,
+                                               bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult;
+
+  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                     MatchingInlineAsm);
+  switch (MatchResult) {
+  default: break;
+  case Match_Success:
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((SystemZOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+  }
+
+  llvm_unreachable("Unexpected match type");
+}
+
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseAccessReg(OperandVector &Operands) {
+  if (Parser.getTok().isNot(AsmToken::Percent))
+    return MatchOperand_NoMatch;
+
+  Register Reg;
+  if (parseRegister(Reg, RegAccess, nullptr))
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(SystemZOperand::createAccessReg(Reg.Num,
+                                                     Reg.StartLoc,
+                                                     Reg.EndLoc));
+  return MatchOperand_Success;
+}
+
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
+                             int64_t MaxVal) {
+  MCContext &Ctx = getContext();
+  MCStreamer &Out = getStreamer();
+  const MCExpr *Expr;
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_NoMatch;
+
+  // For consistency with the GNU assembler, treat immediates as offsets
+  // from ".".
+  if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+    int64_t Value = CE->getValue();
+    if ((Value & 1) || Value < MinVal || Value > MaxVal) {
+      Error(StartLoc, "offset out of range");
+      return MatchOperand_ParseFail;
+    }
+    MCSymbol *Sym = Ctx.CreateTempSymbol();
+    Out.EmitLabel(Sym);
+    const MCExpr *Base = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                                 Ctx);
+    Expr = Value == 0 ? Base : MCBinaryExpr::CreateAdd(Base, Expr, Ctx);
+  }
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+  return MatchOperand_Success;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmParser() {
+  RegisterMCAsmParser<SystemZAsmParser> X(TheSystemZTarget);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
new file mode 100644
index 0000000..2350776
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -0,0 +1,324 @@
+//===-- SystemZDisassembler.cpp - Disassembler for SystemZ ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class SystemZDisassembler : public MCDisassembler {
+public:
+  SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+  virtual ~SystemZDisassembler() {}
+
+  // Override MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+                              const MemoryObject &region, uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createSystemZDisassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+  return new SystemZDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeSystemZDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheSystemZTarget,
+                                         createSystemZDisassembler);
+}
+
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                        const unsigned *Regs) {
+  assert(RegNo < 16 && "Invalid register");
+  RegNo = Regs[RegNo];
+  if (RegNo == 0)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(RegNo));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs);
+}
+
+static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs);
+}
+
+static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs);
+}
+
+static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs);
+}
+
+static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs);
+}
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::CreateImm(Imm));
+  return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeAccessRegOperand(MCInst &Inst, uint64_t Imm,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU6ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<6>(Inst, Imm);
+}
+
+static DecodeStatus decodeU8ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeU32ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<32>(Inst, Imm);
+}
+
+static DecodeStatus decodeS8ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeS16ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeS32ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<32>(Inst, Imm);
+}
+
+template<unsigned N>
+static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address) {
+  assert(isUInt<N>(Imm) && "Invalid PC-relative offset");
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm) * 2 + Address));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePC16DBLOperand(MCInst &Inst, uint64_t Imm,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  return decodePCDBLOperand<16>(Inst, Imm, Address);
+}
+
+static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  return decodePCDBLOperand<32>(Inst, Imm, Address);
+}
+
+static DecodeStatus decodeBDAddr12Operand(MCInst &Inst, uint64_t Field,
+                                          const unsigned *Regs) {
+  uint64_t Base = Field >> 12;
+  uint64_t Disp = Field & 0xfff;
+  assert(Base < 16 && "Invalid BDAddr12");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr20Operand(MCInst &Inst, uint64_t Field,
+                                          const unsigned *Regs) {
+  uint64_t Base = Field >> 20;
+  uint64_t Disp = ((Field << 12) & 0xff000) | ((Field >> 8) & 0xfff);
+  assert(Base < 16 && "Invalid BDAddr20");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<20>(Disp)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr12Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Index < 16 && "Invalid BDXAddr12");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  Inst.addOperand(MCOperand::CreateReg(Index == 0 ? 0 : Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 24;
+  uint64_t Base = (Field >> 20) & 0xf;
+  uint64_t Disp = ((Field & 0xfff00) >> 8) | ((Field & 0xff) << 12);
+  assert(Index < 16 && "Invalid BDXAddr20");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<20>(Disp)));
+  Inst.addOperand(MCOperand::CreateReg(Index == 0 ? 0 : Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
+                                               const unsigned *Regs) {
+  uint64_t Length = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Length < 256 && "Invalid BDLAddr12Len8");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  Inst.addOperand(MCOperand::CreateImm(Length + 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr32Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDXAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
+                                                     uint64_t Field,
+                                                     uint64_t Address,
+                                                     const void *Decoder) {
+  return decodeBDLAddr12Len8Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+#include "SystemZGenDisassemblerTables.inc"
+
+DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 const MemoryObject &Region,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
+  // Get the first two bytes of the instruction.
+  uint8_t Bytes[6];
+  Size = 0;
+  if (Region.readBytes(Address, 2, Bytes) == -1)
+    return MCDisassembler::Fail;
+
+  // The top 2 bits of the first byte specify the size.
+  const uint8_t *Table;
+  if (Bytes[0] < 0x40) {
+    Size = 2;
+    Table = DecoderTable16;
+  } else if (Bytes[0] < 0xc0) {
+    Size = 4;
+    Table = DecoderTable32;
+  } else {
+    Size = 6;
+    Table = DecoderTable48;
+  }
+
+  // Read any remaining bytes.
+  if (Size > 2 && Region.readBytes(Address + 2, Size - 2, Bytes + 2) == -1)
+    return MCDisassembler::Fail;
+
+  // Construct the instruction.
+  uint64_t Inst = 0;
+  for (uint64_t I = 0; I < Size; ++I)
+    Inst = (Inst << 8) | Bytes[I];
+
+  return decodeInstruction(Table, MI, Inst, Address, this, STI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
new file mode 100644
index 0000000..d2ba9b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -0,0 +1,165 @@
+//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstPrinter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "SystemZGenAsmWriter.inc"
+
+void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
+                                      unsigned Index, raw_ostream &O) {
+  O << Disp;
+  if (Base) {
+    O << '(';
+    if (Index)
+      O << '%' << getRegisterName(Index) << ',';
+    O << '%' << getRegisterName(Base) << ')';
+  } else
+    assert(!Index && "Shouldn't have an index without a base");
+}
+
+void SystemZInstPrinter::printOperand(const MCOperand &MO, raw_ostream &O) {
+  if (MO.isReg())
+    O << '%' << getRegisterName(MO.getReg());
+  else if (MO.isImm())
+    O << MO.getImm();
+  else if (MO.isExpr())
+    O << *MO.getExpr();
+  else
+    llvm_unreachable("Invalid operand");
+}
+
+void SystemZInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+  O << '%' << getRegisterName(RegNo);
+}
+
+void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isUInt<4>(Value) && "Invalid u4imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isUInt<6>(Value) && "Invalid u6imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isInt<8>(Value) && "Invalid s8imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isUInt<8>(Value) && "Invalid u8imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isInt<16>(Value) && "Invalid s16imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isUInt<16>(Value) && "Invalid u16imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isInt<32>(Value) && "Invalid s32imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isUInt<32>(Value) && "Invalid u32imm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printAccessRegOperand(const MCInst *MI, int OpNum,
+                                               raw_ostream &O) {
+  uint64_t Value = MI->getOperand(OpNum).getImm();
+  assert(Value < 16 && "Invalid access register number");
+  O << "%a" << (unsigned int)Value;
+}
+
+void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "0x";
+    O.write_hex(MO.getImm());
+  } else
+    O << *MO.getExpr();
+}
+
+void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
+                                      raw_ostream &O) {
+  printOperand(MI->getOperand(OpNum), O);
+}
+
+void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printAddress(MI->getOperand(OpNum).getReg(),
+               MI->getOperand(OpNum + 1).getImm(), 0, O);
+}
+
+void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  printAddress(MI->getOperand(OpNum).getReg(),
+               MI->getOperand(OpNum + 1).getImm(),
+               MI->getOperand(OpNum + 2).getReg(), O);
+}
+
+void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  unsigned Base = MI->getOperand(OpNum).getReg();
+  uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+  uint64_t Length = MI->getOperand(OpNum + 2).getImm();
+  O << Disp << '(' << Length;
+  if (Base)
+    O << ",%" << getRegisterName(Base);
+  O << ')';
+}
+
+void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  static const char *const CondNames[] = {
+    "o", "h", "nle", "l", "nhe", "lh", "ne",
+    "e", "nlh", "he", "nl", "le", "nh", "no"
+  };
+  uint64_t Imm = MI->getOperand(OpNum).getImm();
+  assert(Imm > 0 && Imm < 15 && "Invalid condition");
+  O << CondNames[Imm - 1];
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
new file mode 100644
index 0000000..dce482b
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -0,0 +1,67 @@
+//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a SystemZ MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEMZINSTPRINTER_H
+#define LLVM_SYSTEMZINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCOperand;
+
+class SystemZInstPrinter : public MCInstPrinter {
+public:
+  SystemZInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Automatically generated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  // Print an address with the given base, displacement and index.
+  static void printAddress(unsigned Base, int64_t Disp, unsigned Index,
+                           raw_ostream &O);
+
+  // Print the given operand.
+  static void printOperand(const MCOperand &MO, raw_ostream &O);
+
+  // Override MCInstPrinter.
+  void printRegName(raw_ostream &O, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+
+private:
+  // Print various types of operand.
+  void printOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+
+  // Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
+  // This forms part of the instruction name rather than the operand list.
+  void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
new file mode 100644
index 0000000..6e7268d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -0,0 +1,117 @@
+//===-- SystemZMCAsmBackend.cpp - SystemZ assembler backend ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
+
+using namespace llvm;
+
+// Value is a fully-resolved relocation value: Symbol + Addend [- Pivot].
+// Return the bits that should be installed in a relocation field for
+// fixup kind Kind.
+static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
+  if (Kind < FirstTargetFixupKind)
+    return Value;
+
+  switch (unsigned(Kind)) {
+  case SystemZ::FK_390_PC16DBL:
+  case SystemZ::FK_390_PC32DBL:
+  case SystemZ::FK_390_PLT16DBL:
+  case SystemZ::FK_390_PLT32DBL:
+    return (int64_t)Value / 2;
+  }
+
+  llvm_unreachable("Unknown fixup kind!");
+}
+
+namespace {
+class SystemZMCAsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+public:
+  SystemZMCAsmBackend(uint8_t osABI)
+    : OSABI(osABI) {}
+
+  // Override MCAsmBackend
+  unsigned getNumFixupKinds() const override {
+    return SystemZ::NumTargetFixupKinds;
+  }
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    return false;
+  }
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *Fragment,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+    llvm_unreachable("SystemZ does do not have assembler relaxation");
+  }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createSystemZObjectWriter(OS, OSABI);
+  }
+};
+} // end anonymous namespace
+
+const MCFixupKindInfo &
+SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = {
+    { "FK_390_PC16DBL",  0, 16, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_PC32DBL",  0, 32, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_PLT16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_PLT32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value,
+                                     bool IsPCRel) const {
+  MCFixupKind Kind = Fixup.getKind();
+  unsigned Offset = Fixup.getOffset();
+  unsigned Size = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+
+  assert(Offset + Size <= DataSize && "Invalid fixup offset!");
+
+  // Big-endian insertion of Size bytes.
+  Value = extractBitsForFixup(Kind, Value);
+  unsigned ShiftValue = (Size * 8) - 8;
+  for (unsigned I = 0; I != Size; ++I) {
+    Data[Offset + I] |= uint8_t(Value >> ShiftValue);
+    ShiftValue -= 8;
+  }
+}
+
+bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
+                                       MCObjectWriter *OW) const {
+  for (uint64_t I = 0; I != Count; ++I)
+    OW->Write8(7);
+  return true;
+}
+
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              StringRef TT, StringRef CPU) {
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+  return new SystemZMCAsmBackend(OSABI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
new file mode 100644
index 0000000..c46a36b
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -0,0 +1,34 @@
+//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
+  PointerSize = 8;
+  CalleeSaveStackSlotSize = 8;
+  IsLittleEndian = false;
+
+  CommentString = "#";
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = "\t.quad\t";
+  UsesELFSectionDirectiveForBSS = true;
+  SupportsDebugInformation = true;
+  HasLEB128 = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCSection *
+SystemZMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const {
+  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+                           0, SectionKind::getMetadata());
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
new file mode 100644
index 0000000..1de97af
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -0,0 +1,29 @@
+//====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZTARGETASMINFO_H
+#define SystemZTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class StringRef;
+
+class SystemZMCAsmInfo : public MCAsmInfoELF {
+public:
+  explicit SystemZMCAsmInfo(StringRef TT);
+
+  // Override MCAsmInfo;
+  const MCSection *getNonexecutableStackSection(MCContext &Ctx) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
new file mode 100644
index 0000000..27b4bd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -0,0 +1,204 @@
+//===-- SystemZMCCodeEmitter.cpp - Convert SystemZ code to machine code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class SystemZMCCodeEmitter : public MCCodeEmitter {
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+
+public:
+  SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+    : MCII(mcii), Ctx(ctx) {
+  }
+
+  ~SystemZMCCodeEmitter() {}
+
+  // OVerride MCCodeEmitter.
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+private:
+  // Automatically generated by TableGen.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // Called by the TableGen code to get the binary encoding of operand
+  // MO in MI.  Fixups is the list of fixups against MI.
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  // Called by the TableGen code to get the binary encoding of an address.
+  // The index or length, if any, is encoded first, followed by the base,
+  // followed by the displacement.  In a 20-bit displacement,
+  // the low 12 bits are encoded before the high 8 bits.
+  uint64_t getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  uint64_t getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  uint64_t getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  // Operand OpNum of MI needs a PC-relative fixup of kind Kind at
+  // Offset bytes from the start of MI.  Add the fixup to Fixups
+  // and return the in-place addend, which since we're a RELA target
+  // is always 0.
+  uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            unsigned Kind, int64_t Offset) const;
+
+  uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC16DBL, 2);
+  }
+  uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
+  }
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &MCSTI,
+                                                MCContext &Ctx) {
+  return new SystemZMCCodeEmitter(MCII, Ctx);
+}
+
+void SystemZMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+  unsigned Size = MCII.get(MI.getOpcode()).getSize();
+  // Big-endian insertion of Size bytes.
+  unsigned ShiftValue = (Size * 8) - 8;
+  for (unsigned I = 0; I != Size; ++I) {
+    OS << uint8_t(Bits >> ShiftValue);
+    ShiftValue -= 8;
+  }
+}
+
+uint64_t SystemZMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+  if (MO.isImm())
+    return static_cast<uint64_t>(MO.getImm());
+  llvm_unreachable("Unexpected operand type!");
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp));
+  return (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  assert(isUInt<4>(Base) && isInt<20>(Disp));
+  return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
+  return (Index << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+  assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
+  return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
+    | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Len  = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<8>(Len));
+  return (Len << 16) | (Base << 12) | Disp;
+}
+
+uint64_t
+SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       unsigned Kind, int64_t Offset) const {
+  const MCOperand &MO = MI.getOperand(OpNum);
+  const MCExpr *Expr;
+  if (MO.isImm())
+    Expr = MCConstantExpr::Create(MO.getImm() + Offset, Ctx);
+  else {
+    Expr = MO.getExpr();
+    if (Offset) {
+      // The operand value is relative to the start of MI, but the fixup
+      // is relative to the operand field itself, which is Offset bytes
+      // into MI.  Add Offset to the relocation value to cancel out
+      // this difference.
+      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+    }
+  }
+  Fixups.push_back(MCFixup::Create(Offset, Expr, (MCFixupKind)Kind));
+  return 0;
+}
+
+#include "SystemZGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
new file mode 100644
index 0000000..a3aab71
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -0,0 +1,31 @@
+//===-- SystemZMCFixups.h - SystemZ-specific fixup entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEMZMCFIXUPS_H
+#define LLVM_SYSTEMZMCFIXUPS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace SystemZ {
+enum FixupKind {
+  // These correspond directly to R_390_* relocations.
+  FK_390_PC16DBL = FirstTargetFixupKind,
+  FK_390_PC32DBL,
+  FK_390_PLT16DBL,
+  FK_390_PLT32DBL,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace SystemZ
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
new file mode 100644
index 0000000..c6a1816
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -0,0 +1,115 @@
+//===-- SystemZMCObjectWriter.cpp - SystemZ ELF writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+namespace {
+class SystemZObjectWriter : public MCELFObjectTargetWriter {
+public:
+  SystemZObjectWriter(uint8_t OSABI);
+
+  virtual ~SystemZObjectWriter();
+
+protected:
+  // Override MCELFObjectTargetWriter.
+  unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
+                            /*HasRelocationAddend=*/ true) {}
+
+SystemZObjectWriter::~SystemZObjectWriter() {
+}
+
+// Return the relocation type for an absolute value of MCFixupKind Kind.
+static unsigned getAbsoluteReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_1: return ELF::R_390_8;
+  case FK_Data_2: return ELF::R_390_16;
+  case FK_Data_4: return ELF::R_390_32;
+  case FK_Data_8: return ELF::R_390_64;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the relocation type for a PC-relative value of MCFixupKind Kind.
+static unsigned getPCRelReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_2:                return ELF::R_390_PC16;
+  case FK_Data_4:                return ELF::R_390_PC32;
+  case FK_Data_8:                return ELF::R_390_PC64;
+  case SystemZ::FK_390_PC16DBL:  return ELF::R_390_PC16DBL;
+  case SystemZ::FK_390_PC32DBL:  return ELF::R_390_PC32DBL;
+  case SystemZ::FK_390_PLT16DBL: return ELF::R_390_PLT16DBL;
+  case SystemZ::FK_390_PLT32DBL: return ELF::R_390_PLT32DBL;
+  }
+  llvm_unreachable("Unsupported PC-relative address");
+}
+
+// Return the R_390_TLS_LE* relocation type for MCFixupKind Kind.
+static unsigned getTLSLEReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_LE32;
+  case FK_Data_8: return ELF::R_390_TLS_LE64;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the PLT relocation counterpart of MCFixupKind Kind.
+static unsigned getPLTReloc(unsigned Kind) {
+  switch (Kind) {
+  case SystemZ::FK_390_PC16DBL: return ELF::R_390_PLT16DBL;
+  case SystemZ::FK_390_PC32DBL: return ELF::R_390_PLT32DBL;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+  unsigned Kind = Fixup.getKind();
+  switch (Modifier) {
+  case MCSymbolRefExpr::VK_None:
+    if (IsPCRel)
+      return getPCRelReloc(Kind);
+    return getAbsoluteReloc(Kind);
+
+  case MCSymbolRefExpr::VK_NTPOFF:
+    assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
+    return getTLSLEReloc(Kind);
+
+  case MCSymbolRefExpr::VK_GOT:
+    if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+      return ELF::R_390_GOTENT;
+    llvm_unreachable("Only PC-relative GOT accesses are supported for now");
+
+  case MCSymbolRefExpr::VK_PLT:
+    assert(IsPCRel && "@PLT shouldt be PC-relative");
+    return getPLTReloc(Kind);
+
+  default:
+    llvm_unreachable("Modifier not supported");
+  }
+}
+
+MCObjectWriter *llvm::createSystemZObjectWriter(raw_ostream &OS,
+                                                uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new SystemZObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
new file mode 100644
index 0000000..cc94869
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -0,0 +1,231 @@
+//===-- SystemZMCTargetDesc.cpp - SystemZ target descriptions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCTargetDesc.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SystemZGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+const unsigned SystemZMC::GR32Regs[16] = {
+  SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+  SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+  SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+  SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
+};
+
+const unsigned SystemZMC::GRH32Regs[16] = {
+  SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+  SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+  SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+  SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
+};
+
+const unsigned SystemZMC::GR64Regs[16] = {
+  SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
+  SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
+  SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
+  SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
+};
+
+const unsigned SystemZMC::GR128Regs[16] = {
+  SystemZ::R0Q, 0, SystemZ::R2Q, 0,
+  SystemZ::R4Q, 0, SystemZ::R6Q, 0,
+  SystemZ::R8Q, 0, SystemZ::R10Q, 0,
+  SystemZ::R12Q, 0, SystemZ::R14Q, 0
+};
+
+const unsigned SystemZMC::FP32Regs[16] = {
+  SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+  SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+  SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+  SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
+};
+
+const unsigned SystemZMC::FP64Regs[16] = {
+  SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+  SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+  SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+  SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
+};
+
+const unsigned SystemZMC::FP128Regs[16] = {
+  SystemZ::F0Q, SystemZ::F1Q, 0, 0,
+  SystemZ::F4Q, SystemZ::F5Q, 0, 0,
+  SystemZ::F8Q, SystemZ::F9Q, 0, 0,
+  SystemZ::F12Q, SystemZ::F13Q, 0, 0
+};
+
+unsigned SystemZMC::getFirstReg(unsigned Reg) {
+  static unsigned Map[SystemZ::NUM_TARGET_REGS];
+  static bool Initialized = false;
+  if (!Initialized) {
+    for (unsigned I = 0; I < 16; ++I) {
+      Map[GR32Regs[I]] = I;
+      Map[GRH32Regs[I]] = I;
+      Map[GR64Regs[I]] = I;
+      Map[GR128Regs[I]] = I;
+      Map[FP32Regs[I]] = I;
+      Map[FP64Regs[I]] = I;
+      Map[FP128Regs[I]] = I;
+    }
+  }
+  assert(Reg < SystemZ::NUM_TARGET_REGS);
+  return Map[Reg];
+}
+
+static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
+                                         StringRef TT) {
+  MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(nullptr,
+                                     MRI.getDwarfRegNum(SystemZ::R15D, true),
+                                     SystemZMC::CFAOffsetFromInitialSP);
+  MAI->addInitialFrameState(Inst);
+  return MAI;
+}
+
+static MCInstrInfo *createSystemZMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitSystemZMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSystemZMCRegisterInfo(X, SystemZ::R14D);
+  return X;
+}
+
+static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT,
+                                                     StringRef CPU,
+                                                     StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitSystemZMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                 CodeModel::Model CM,
+                                                 CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  // Static code is suitable for use in a dynamic executable; there is no
+  // separate DynamicNoPIC model.
+  if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+    RM = Reloc::Static;
+
+  // For SystemZ we define the models as follows:
+  //
+  // Small:  BRASL can call any function and will use a stub if necessary.
+  //         Locally-binding symbols will always be in range of LARL.
+  //
+  // Medium: BRASL can call any function and will use a stub if necessary.
+  //         GOT slots and locally-defined text will always be in range
+  //         of LARL, but other symbols might not be.
+  //
+  // Large:  Equivalent to Medium for now.
+  //
+  // Kernel: Equivalent to Medium for now.
+  //
+  // This means that any PIC module smaller than 4GB meets the
+  // requirements of Small, so Small seems like the best default there.
+  //
+  // All symbols bind locally in a non-PIC module, so the choice is less
+  // obvious.  There are two cases:
+  //
+  // - When creating an executable, PLTs and copy relocations allow
+  //   us to treat external symbols as part of the executable.
+  //   Any executable smaller than 4GB meets the requirements of Small,
+  //   so that seems like the best default.
+  //
+  // - When creating JIT code, stubs will be in range of BRASL if the
+  //   image is less than 4GB in size.  GOT entries will likewise be
+  //   in range of LARL.  However, the JIT environment has no equivalent
+  //   of copy relocs, so locally-binding data symbols might not be in
+  //   the range of LARL.  We need the Medium model in that case.
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  else if (CM == CodeModel::JITDefault)
+    CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createSystemZMCInstPrinter(const Target &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI) {
+  return new SystemZInstPrinter(MAI, MII, MRI);
+}
+
+static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
+                                                 MCContext &Ctx,
+                                                 MCAsmBackend &MAB,
+                                                 raw_ostream &OS,
+                                                 MCCodeEmitter *Emitter,
+                                                 const MCSubtargetInfo &STI,
+                                                 bool RelaxAll,
+                                                 bool NoExecStack) {
+  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+extern "C" void LLVMInitializeSystemZTargetMC() {
+  // Register the MCAsmInfo.
+  TargetRegistry::RegisterMCAsmInfo(TheSystemZTarget,
+                                    createSystemZMCAsmInfo);
+
+  // Register the MCCodeGenInfo.
+  TargetRegistry::RegisterMCCodeGenInfo(TheSystemZTarget,
+                                        createSystemZMCCodeGenInfo);
+
+  // Register the MCCodeEmitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget,
+					createSystemZMCCodeEmitter);
+
+  // Register the MCInstrInfo.
+  TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
+                                      createSystemZMCInstrInfo);
+
+  // Register the MCRegisterInfo.
+  TargetRegistry::RegisterMCRegInfo(TheSystemZTarget,
+                                    createSystemZMCRegisterInfo);
+
+  // Register the MCSubtargetInfo.
+  TargetRegistry::RegisterMCSubtargetInfo(TheSystemZTarget,
+                                          createSystemZMCSubtargetInfo);
+
+  // Register the MCAsmBackend.
+  TargetRegistry::RegisterMCAsmBackend(TheSystemZTarget,
+                                       createSystemZMCAsmBackend);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheSystemZTarget,
+                                        createSystemZMCInstPrinter);
+
+  // Register the MCObjectStreamer;
+  TargetRegistry::RegisterMCObjectStreamer(TheSystemZTarget,
+                                           createSystemZMCObjectStreamer);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
new file mode 100644
index 0000000..cbaf9a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -0,0 +1,96 @@
+//===-- SystemZMCTargetDesc.h - SystemZ target descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZMCTARGETDESC_H
+#define SYSTEMZMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheSystemZTarget;
+
+namespace SystemZMC {
+// How many bytes are in the ABI-defined, caller-allocated part of
+// a stack frame.
+const int64_t CallFrameSize = 160;
+
+// The offset of the DWARF CFA from the incoming stack pointer.
+const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+
+// Maps of asm register numbers to LLVM register numbers, with 0 indicating
+// an invalid register.  In principle we could use 32-bit and 64-bit register
+// classes directly, provided that we relegated the GPR allocation order
+// in SystemZRegisterInfo.td to an AltOrder and left the default order
+// as %r0-%r15.  It seems better to provide the same interface for
+// all classes though.
+extern const unsigned GR32Regs[16];
+extern const unsigned GRH32Regs[16];
+extern const unsigned GR64Regs[16];
+extern const unsigned GR128Regs[16];
+extern const unsigned FP32Regs[16];
+extern const unsigned FP64Regs[16];
+extern const unsigned FP128Regs[16];
+
+// Return the 0-based number of the first architectural register that
+// contains the given LLVM register.   E.g. R1D -> 1.
+unsigned getFirstReg(unsigned Reg);
+
+// Return the given register as a GR64.
+inline unsigned getRegAsGR64(unsigned Reg) {
+  return GR64Regs[getFirstReg(Reg)];
+}
+
+// Return the given register as a low GR32.
+inline unsigned getRegAsGR32(unsigned Reg) {
+  return GR32Regs[getFirstReg(Reg)];
+}
+
+// Return the given register as a high GR32.
+inline unsigned getRegAsGRH32(unsigned Reg) {
+  return GRH32Regs[getFirstReg(Reg)];
+}
+} // end namespace SystemZMC
+
+MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+                                          const MCRegisterInfo &MRI,
+                                          const MCSubtargetInfo &STI,
+                                          MCContext &Ctx);
+
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU);
+
+MCObjectWriter *createSystemZObjectWriter(raw_ostream &OS, uint8_t OSABI);
+} // end namespace llvm
+
+// Defines symbolic names for SystemZ registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "SystemZGenRegisterInfo.inc"
+
+// Defines symbolic names for the SystemZ instructions.
+#define GET_INSTRINFO_ENUM
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SystemZGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt
new file mode 100644
index 0000000..e089047
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/README.txt
@@ -0,0 +1,174 @@
+//===---------------------------------------------------------------------===//
+// Random notes about and ideas for the SystemZ backend.
+//===---------------------------------------------------------------------===//
+
+The initial backend is deliberately restricted to z10.  We should add support
+for later architectures at some point.
+
+--
+
+SystemZDAGToDAGISel::SelectInlineAsmMemoryOperand() is passed "m" for all
+inline asm memory constraints; it doesn't get to see the original constraint.
+This means that it must conservatively treat all inline asm constraints
+as the most restricted type, "R".
+
+--
+
+If an inline asm ties an i32 "r" result to an i64 input, the input
+will be treated as an i32, leaving the upper bits uninitialised.
+For example:
+
+define void @f4(i32 *%dst) {
+  %val = call i32 asm "blah $0", "=r,0" (i64 103)
+  store i32 %val, i32 *%dst
+  ret void
+}
+
+from CodeGen/SystemZ/asm-09.ll will use LHI rather than LGHI.
+to load 103.  This seems to be a general target-independent problem.
+
+--
+
+The tuning of the choice between LOAD ADDRESS (LA) and addition in
+SystemZISelDAGToDAG.cpp is suspect.  It should be tweaked based on
+performance measurements.
+
+--
+
+There is no scheduling support.
+
+--
+
+We don't use the BRANCH ON INDEX instructions.
+
+--
+
+We might want to use BRANCH ON CONDITION for conditional indirect calls
+and conditional returns.
+
+--
+
+We don't use the TEST DATA CLASS instructions.
+
+--
+
+We could use the generic floating-point forms of LOAD COMPLEMENT,
+LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the
+condition codes.  For example, we could use LCDFR instead of LCDBR.
+
+--
+
+We only use MVC, XC and CLC for constant-length block operations.
+We could extend them to variable-length operations too,
+using EXECUTE RELATIVE LONG.
+
+MVCIN, MVCLE and CLCLE may be worthwhile too.
+
+--
+
+We don't use CUSE or the TRANSLATE family of instructions for string
+operations.  The TRANSLATE ones are probably more difficult to exploit.
+
+--
+
+We don't take full advantage of builtins like fabsl because the calling
+conventions require f128s to be returned by invisible reference.
+
+--
+
+ADD LOGICAL WITH SIGNED IMMEDIATE could be useful when we need to
+produce a carry.  SUBTRACT LOGICAL IMMEDIATE could be useful when we
+need to produce a borrow.  (Note that there are no memory forms of
+ADD LOGICAL WITH CARRY and SUBTRACT LOGICAL WITH BORROW, so the high
+part of 128-bit memory operations would probably need to be done
+via a register.)
+
+--
+
+We don't use the halfword forms of LOAD REVERSED and STORE REVERSED
+(LRVH and STRVH).
+
+--
+
+We don't use ICM or STCM.
+
+--
+
+DAGCombiner doesn't yet fold truncations of extended loads.  Functions like:
+
+    unsigned long f (unsigned long x, unsigned short *y)
+    {
+      return (x << 32) | *y;
+    }
+
+therefore end up as:
+
+        sllg    %r2, %r2, 32
+        llgh    %r0, 0(%r3)
+        lr      %r2, %r0
+        br      %r14
+
+but truncating the load would give:
+
+        sllg    %r2, %r2, 32
+        lh      %r2, 0(%r3)
+        br      %r14
+
+--
+
+Functions like:
+
+define i64 @f1(i64 %a) {
+  %and = and i64 %a, 1
+  ret i64 %and
+}
+
+ought to be implemented as:
+
+        lhi     %r0, 1
+        ngr     %r2, %r0
+        br      %r14
+
+but two-address optimisations reverse the order of the AND and force:
+
+        lhi     %r0, 1
+        ngr     %r0, %r2
+        lgr     %r2, %r0
+        br      %r14
+
+CodeGen/SystemZ/and-04.ll has several examples of this.
+
+--
+
+Out-of-range displacements are usually handled by loading the full
+address into a register.  In many cases it would be better to create
+an anchor point instead.  E.g. for:
+
+define void @f4a(i128 *%aptr, i64 %base) {
+  %addr = add i64 %base, 524288
+  %bptr = inttoptr i64 %addr to i128 *
+  %a = load volatile i128 *%aptr
+  %b = load i128 *%bptr
+  %add = add i128 %a, %b
+  store i128 %add, i128 *%aptr
+  ret void
+}
+
+(from CodeGen/SystemZ/int-add-08.ll) we load %base+524288 and %base+524296
+into separate registers, rather than using %base+524288 as a base for both.
+
+--
+
+Dynamic stack allocations round the size to 8 bytes and then allocate
+that rounded amount.  It would be simpler to subtract the unrounded
+size from the copy of the stack pointer and then align the result.
+See CodeGen/SystemZ/alloca-01.ll for an example.
+
+--
+
+If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
+
+--
+
+We might want to model all access registers and use them to spill
+32-bit values.
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.h b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
new file mode 100644
index 0000000..15792494
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
@@ -0,0 +1,116 @@
+//==- SystemZ.h - Top-Level Interface for SystemZ representation -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM SystemZ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZ_H
+#define SYSTEMZ_H
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class FunctionPass;
+
+namespace SystemZ {
+// Condition-code mask values.
+const unsigned CCMASK_0 = 1 << 3;
+const unsigned CCMASK_1 = 1 << 2;
+const unsigned CCMASK_2 = 1 << 1;
+const unsigned CCMASK_3 = 1 << 0;
+const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for integer and floating-point
+// comparisons.
+const unsigned CCMASK_CMP_EQ = CCMASK_0;
+const unsigned CCMASK_CMP_LT = CCMASK_1;
+const unsigned CCMASK_CMP_GT = CCMASK_2;
+const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
+const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
+const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
+
+// Condition-code mask assignments for floating-point comparisons only.
+const unsigned CCMASK_CMP_UO = CCMASK_3;
+const unsigned CCMASK_CMP_O  = CCMASK_ANY ^ CCMASK_CMP_UO;
+
+// All condition-code values produced by comparisons.
+const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
+const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for CS.
+const unsigned CCMASK_CS_EQ = CCMASK_0;
+const unsigned CCMASK_CS_NE = CCMASK_1;
+const unsigned CCMASK_CS    = CCMASK_0 | CCMASK_1;
+
+// Condition-code mask assignments for a completed SRST loop.
+const unsigned CCMASK_SRST_FOUND    = CCMASK_1;
+const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+const unsigned CCMASK_SRST          = CCMASK_1 | CCMASK_2;
+
+// Condition-code mask assignments for TEST UNDER MASK.
+const unsigned CCMASK_TM_ALL_0       = CCMASK_0;
+const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+const unsigned CCMASK_TM_ALL_1       = CCMASK_3;
+const unsigned CCMASK_TM_SOME_0      = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_SOME_1      = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_MSB_0       = CCMASK_0 | CCMASK_1;
+const unsigned CCMASK_TM_MSB_1       = CCMASK_2 | CCMASK_3;
+const unsigned CCMASK_TM             = CCMASK_ANY;
+
+// The position of the low CC bit in an IPM result.
+const unsigned IPM_CC = 28;
+
+// Mask assignments for PFD.
+const unsigned PFD_READ  = 1;
+const unsigned PFD_WRITE = 2;
+
+// Return true if Val fits an LLILL operand.
+static inline bool isImmLL(uint64_t Val) {
+  return (Val & ~0x000000000000ffffULL) == 0;
+}
+
+// Return true if Val fits an LLILH operand.
+static inline bool isImmLH(uint64_t Val) {
+  return (Val & ~0x00000000ffff0000ULL) == 0;
+}
+
+// Return true if Val fits an LLIHL operand.
+static inline bool isImmHL(uint64_t Val) {
+  return (Val & ~0x00000ffff00000000ULL) == 0;
+}
+
+// Return true if Val fits an LLIHH operand.
+static inline bool isImmHH(uint64_t Val) {
+  return (Val & ~0xffff000000000000ULL) == 0;
+}
+
+// Return true if Val fits an LLILF operand.
+static inline bool isImmLF(uint64_t Val) {
+  return (Val & ~0x00000000ffffffffULL) == 0;
+}
+
+// Return true if Val fits an LLIHF operand.
+static inline bool isImmHF(uint64_t Val) {
+  return (Val & ~0xffffffff00000000ULL) == 0;
+}
+} // end namespace SystemZ
+
+FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.td b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
new file mode 100644
index 0000000..5f82903
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
@@ -0,0 +1,62 @@
+//===-- SystemZ.td - Describe the SystemZ target machine -----*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ supported processors and features
+//===----------------------------------------------------------------------===//
+
+include "SystemZProcessors.td"
+
+//===----------------------------------------------------------------------===//
+// Register file description
+//===----------------------------------------------------------------------===//
+
+include "SystemZRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling convention description
+//===----------------------------------------------------------------------===//
+
+include "SystemZCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction descriptions
+//===----------------------------------------------------------------------===//
+
+include "SystemZOperators.td"
+include "SystemZOperands.td"
+include "SystemZPatterns.td"
+include "SystemZInstrFormats.td"
+include "SystemZInstrInfo.td"
+include "SystemZInstrFP.td"
+
+def SystemZInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def SystemZAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-level target declaration
+//===----------------------------------------------------------------------===//
+
+def SystemZ : Target {
+  let InstructionSet = SystemZInstrInfo;
+  let AssemblyParsers = [SystemZAsmParser];
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
new file mode 100644
index 0000000..8b18bc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -0,0 +1,247 @@
+//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly printer -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Streams SystemZ assembly language and associated data, in the form of
+// MCInsts and MCExprs respectively.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZAsmPrinter.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMCInstLower.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GR32s.
+static MCInst lowerRILow(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GRH32s.
+static MCInst lowerRIHigh(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// R2 register turned into a GR64.
+static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg())
+    .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()))
+    .addImm(MI->getOperand(3).getImm())
+    .addImm(MI->getOperand(4).getImm())
+    .addImm(MI->getOperand(5).getImm());
+}
+
+void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SystemZMCInstLower Lower(MF->getContext(), *this);
+  MCInst LoweredMI;
+  switch (MI->getOpcode()) {
+  case SystemZ::Return:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+    break;
+
+  case SystemZ::CallBRASL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBASR:
+    LoweredMI = MCInstBuilder(SystemZ::BASR)
+      .addReg(SystemZ::R14D)
+      .addReg(MI->getOperand(0).getReg());
+    break;
+
+  case SystemZ::CallJG:
+    LoweredMI = MCInstBuilder(SystemZ::JG)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBR:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+    break;
+
+  case SystemZ::IILF64:
+    LoweredMI = MCInstBuilder(SystemZ::IILF)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::IIHF64:
+    LoweredMI = MCInstBuilder(SystemZ::IIHF)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::RISBHH:
+  case SystemZ::RISBHL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBHG);
+    break;
+
+  case SystemZ::RISBLH:
+  case SystemZ::RISBLL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
+    break;
+
+#define LOWER_LOW(NAME)                                                 \
+  case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
+
+  LOWER_LOW(IILL);
+  LOWER_LOW(IILH);
+  LOWER_LOW(TMLL);
+  LOWER_LOW(TMLH);
+  LOWER_LOW(NILL);
+  LOWER_LOW(NILH);
+  LOWER_LOW(NILF);
+  LOWER_LOW(OILL);
+  LOWER_LOW(OILH);
+  LOWER_LOW(OILF);
+  LOWER_LOW(XILF);
+
+#undef LOWER_LOW
+
+#define LOWER_HIGH(NAME) \
+  case SystemZ::NAME##64: LoweredMI = lowerRIHigh(MI, SystemZ::NAME); break
+
+  LOWER_HIGH(IIHL);
+  LOWER_HIGH(IIHH);
+  LOWER_HIGH(TMHL);
+  LOWER_HIGH(TMHH);
+  LOWER_HIGH(NIHL);
+  LOWER_HIGH(NIHH);
+  LOWER_HIGH(NIHF);
+  LOWER_HIGH(OIHL);
+  LOWER_HIGH(OIHH);
+  LOWER_HIGH(OIHF);
+  LOWER_HIGH(XIHF);
+
+#undef LOWER_HIGH
+
+  case SystemZ::Serialize:
+    if (Subtarget->hasFastSerialization())
+      LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
+        .addImm(14).addReg(SystemZ::R0D);
+    else
+      LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
+        .addImm(15).addReg(SystemZ::R0D);
+    break;
+
+  default:
+    Lower.lower(MI, LoweredMI);
+    break;
+  }
+  EmitToStreamer(OutStreamer, LoweredMI);
+}
+
+// Convert a SystemZ-specific constant pool modifier into the associated
+// MCSymbolRefExpr variant kind.
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
+  switch (Modifier) {
+  case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF;
+  }
+  llvm_unreachable("Invalid SystemCPModifier!");
+}
+
+void SystemZAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  auto *ZCPV = static_cast<SystemZConstantPoolValue*>(MCPV);
+
+  const MCExpr *Expr =
+    MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
+                            getModifierVariantKind(ZCPV->getModifier()),
+                            OutContext);
+  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
+
+  OutStreamer.EmitValue(Expr, Size);
+}
+
+bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+                                        unsigned OpNo,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode,
+                                        raw_ostream &OS) {
+  if (ExtraCode && *ExtraCode == 'n') {
+    if (!MI->getOperand(OpNo).isImm())
+      return true;
+    OS << -int64_t(MI->getOperand(OpNo).getImm());
+  } else {
+    SystemZMCInstLower Lower(MF->getContext(), *this);
+    MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
+    SystemZInstPrinter::printOperand(MO, OS);
+  }
+  return false;
+}
+
+bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNo,
+                                              unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &OS) {
+  SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(),
+                                   MI->getOperand(OpNo + 1).getImm(),
+                                   MI->getOperand(OpNo + 2).getReg(), OS);
+  return false;
+}
+
+void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetELF()) {
+    auto &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+                                    TD->getPointerSize(0));
+      }
+      Stubs.clear();
+    }
+  }
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmPrinter() {
+  RegisterAsmPrinter<SystemZAsmPrinter> X(TheSystemZTarget);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
new file mode 100644
index 0000000..20093bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -0,0 +1,50 @@
+//===-- SystemZAsmPrinter.h - SystemZ LLVM assembly printer ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZASMPRINTER_H
+#define SYSTEMZASMPRINTER_H
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineBasicBlock;
+class MachineInstr;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+private:
+  const SystemZSubtarget *Subtarget;
+
+public:
+  SystemZAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) {
+    Subtarget = &TM.getSubtarget<SystemZSubtarget>();
+  }
+
+  // Override AsmPrinter.
+  const char *getPassName() const override {
+    return "SystemZ Assembly Printer";
+  }
+  void EmitInstruction(const MachineInstr *MI) override;
+  void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+  void EmitEndOfAsmFile(Module &M) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
new file mode 100644
index 0000000..cc9c84b
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -0,0 +1,21 @@
+//===-- SystemZCallingConv.cpp - Calling conventions for SystemZ ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZCallingConv.h"
+#include "SystemZRegisterInfo.h"
+
+using namespace llvm;
+
+const unsigned SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
+  SystemZ::R2D, SystemZ::R3D, SystemZ::R4D, SystemZ::R5D, SystemZ::R6D
+};
+
+const unsigned SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
+  SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
+};
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h
new file mode 100644
index 0000000..4b1569d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -0,0 +1,23 @@
+//===-- SystemZCallingConv.h - Calling conventions for SystemZ --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZCALLINGCONV_H
+#define SYSTEMZCALLINGCONV_H
+
+namespace llvm {
+namespace SystemZ {
+  const unsigned NumArgGPRs = 5;
+  extern const unsigned ArgGPRs[NumArgGPRs];
+
+  const unsigned NumArgFPRs = 4;
+  extern const unsigned ArgFPRs[NumArgFPRs];
+} // end namespace SystemZ
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
new file mode 100644
index 0000000..fb0d1d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -0,0 +1,71 @@
+//=- SystemZCallingConv.td - Calling conventions for SystemZ -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the SystemZ ABI.
+//===----------------------------------------------------------------------===//
+
+class CCIfExtend<CCAction A>
+  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux return value calling convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ : CallingConv<[
+  // Promote i32 to i64 if it has an explicit extension type.
+  CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+  // ABI-compliant code returns 64-bit integers in R2.  Make the other
+  // call-clobbered argument registers available for code that doesn't
+  // care about the ABI.  (R6 is an argument register too, but is
+  // call-saved and therefore not suitable for return values.)
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L]>>,
+  CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
+
+  // ABI-complaint code returns float and double in F0.  Make the
+  // other floating-point argument registers available for code that
+  // doesn't care about the ABI.  All floating-point argument registers
+  // are call-clobbered, so we can use all of them here.
+  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>
+
+  // ABI-compliant code returns long double by reference, but that conversion
+  // is left to higher-level code.  Perhaps we could add an f128 definition
+  // here for code that doesn't care about the ABI?
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux argument calling conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+  // Promote i32 to i64 if it has an explicit extension type.
+  // The convention is that true integer arguments that are smaller
+  // than 64 bits should be marked as extended, but structures that
+  // are smaller than 64 bits shouldn't.
+  CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+  // Force long double values to the stack and pass i64 pointers to them.
+  CCIfType<[f128], CCPassIndirect<i64>>,
+
+  // The first 5 integer arguments are passed in R2-R6.  Note that R6
+  // is call-saved.
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L, R6L]>>,
+  CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
+
+  // The first 4 float and double arguments are passed in even registers F0-F6.
+  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+  // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux callee-saved registers
+//===----------------------------------------------------------------------===//
+def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
+                                       (sequence "F%dD", 8, 15))>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
new file mode 100644
index 0000000..19cec21
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -0,0 +1,62 @@
+//===-- SystemZConstantPoolValue.cpp - SystemZ constant-pool value --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+SystemZConstantPoolValue::
+SystemZConstantPoolValue(const GlobalValue *gv,
+                         SystemZCP::SystemZCPModifier modifier)
+  : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+
+SystemZConstantPoolValue *
+SystemZConstantPoolValue::Create(const GlobalValue *GV,
+                                 SystemZCP::SystemZCPModifier Modifier) {
+  return new SystemZConstantPoolValue(GV, Modifier);
+}
+
+unsigned SystemZConstantPoolValue::getRelocationInfo() const {
+  switch (Modifier) {
+  case SystemZCP::NTPOFF:
+    // May require a relocation, but the relocations are always resolved
+    // by the static linker.
+    return 1;
+  }
+  llvm_unreachable("Unknown modifier");
+}
+
+int SystemZConstantPoolValue::
+getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+  for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
+    if (Constants[I].isMachineConstantPoolEntry() &&
+        (Constants[I].getAlignment() & AlignMask) == 0) {
+      auto *ZCPV =
+        static_cast<SystemZConstantPoolValue *>(Constants[I].Val.MachineCPVal);
+      if (ZCPV->GV == GV && ZCPV->Modifier == Modifier)
+        return I;
+    }
+  }
+  return -1;
+}
+
+void SystemZConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(GV);
+  ID.AddInteger(Modifier);
+}
+
+void SystemZConstantPoolValue::print(raw_ostream &O) const {
+  O << GV << "@" << int(Modifier);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
new file mode 100644
index 0000000..699718f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -0,0 +1,55 @@
+//===- SystemZConstantPoolValue.h - SystemZ constant-pool value -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZCONSTANTPOOLVALUE_H
+#define SYSTEMZCONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+namespace SystemZCP {
+enum SystemZCPModifier {
+  NTPOFF
+};
+} // end namespace SystemZCP
+
+/// A SystemZ-specific constant pool value.  At present, the only
+/// defined constant pool values are offsets of thread-local variables
+/// (written x@NTPOFF).
+class SystemZConstantPoolValue : public MachineConstantPoolValue {
+  const GlobalValue *GV;
+  SystemZCP::SystemZCPModifier Modifier;
+
+protected:
+  SystemZConstantPoolValue(const GlobalValue *GV,
+                           SystemZCP::SystemZCPModifier Modifier);
+
+public:
+  static SystemZConstantPoolValue *
+    Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier);
+
+  // Override MachineConstantPoolValue.
+  unsigned getRelocationInfo() const override;
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+  void print(raw_ostream &O) const override;
+
+  // Access SystemZ-specific fields.
+  const GlobalValue *getGlobalValue() const { return GV; }
+  SystemZCP::SystemZCPModifier getModifier() const { return Modifier; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
new file mode 100644
index 0000000..dc210d60
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -0,0 +1,469 @@
+//===-- SystemZElimCompare.cpp - Eliminate comparison instructions --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass:
+// (1) tries to remove compares if CC already contains the required information
+// (2) fuses compares and branches into COMPARE AND BRANCH instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-elim-compare"
+
+STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
+STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
+STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
+
+namespace {
+// Represents the references to a particular register in one or more
+// instructions.
+struct Reference {
+  Reference()
+    : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}
+
+  Reference &operator|=(const Reference &Other) {
+    Def |= Other.Def;
+    IndirectDef |= Other.IndirectDef;
+    Use |= Other.Use;
+    IndirectUse |= Other.IndirectUse;
+    return *this;
+  }
+
+  operator bool() const { return Def || Use; }
+
+  // True if the register is defined or used in some form, either directly or
+  // via a sub- or super-register.
+  bool Def;
+  bool Use;
+
+  // True if the register is defined or used indirectly, by a sub- or
+  // super-register.
+  bool IndirectDef;
+  bool IndirectUse;
+};
+
+class SystemZElimCompare : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZElimCompare(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
+
+  const char *getPassName() const override {
+    return "SystemZ Comparison Elimination";
+  }
+
+  bool processBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+  Reference getRegReferences(MachineInstr *MI, unsigned Reg);
+  bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+                     SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool convertToLoadAndTest(MachineInstr *MI);
+  bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
+                             SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool optimizeCompareZero(MachineInstr *Compare,
+                           SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool fuseCompareAndBranch(MachineInstr *Compare,
+                            SmallVectorImpl<MachineInstr *> &CCUsers);
+
+  const SystemZInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+};
+
+char SystemZElimCompare::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
+  return new SystemZElimCompare(TM);
+}
+
+// Return true if CC is live out of MBB.
+static bool isCCLiveOut(MachineBasicBlock &MBB) {
+  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
+    if ((*SI)->isLiveIn(SystemZ::CC))
+      return true;
+  return false;
+}
+
+// Return true if any CC result of MI would reflect the value of subreg
+// SubReg of Reg.
+static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) {
+  if (MI->getNumOperands() > 0 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(0).isDef() &&
+      MI->getOperand(0).getReg() == Reg &&
+      MI->getOperand(0).getSubReg() == SubReg)
+    return true;
+
+  switch (MI->getOpcode()) {
+  case SystemZ::LR:
+  case SystemZ::LGR:
+  case SystemZ::LGFR:
+  case SystemZ::LTR:
+  case SystemZ::LTGR:
+  case SystemZ::LTGFR:
+  case SystemZ::LER:
+  case SystemZ::LDR:
+  case SystemZ::LXR:
+  case SystemZ::LTEBR:
+  case SystemZ::LTDBR:
+  case SystemZ::LTXBR:
+    if (MI->getOperand(1).getReg() == Reg &&
+        MI->getOperand(1).getSubReg() == SubReg)
+      return true;
+  }
+
+  return false;
+}
+
+// Describe the references to Reg in MI, including sub- and super-registers.
+Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
+  Reference Ref;
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    if (MO.isReg()) {
+      if (unsigned MOReg = MO.getReg()) {
+        if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) {
+          if (MO.isUse()) {
+            Ref.Use = true;
+            Ref.IndirectUse |= (MOReg != Reg);
+          }
+          if (MO.isDef()) {
+            Ref.Def = true;
+            Ref.IndirectDef |= (MOReg != Reg);
+          }
+        }
+      }
+    }
+  }
+  return Ref;
+}
+
+// Compare compares the result of MI against zero.  If MI is an addition
+// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
+// and convert the branch to a BRCT(G).  Return true on success.
+bool
+SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+                                  SmallVectorImpl<MachineInstr *> &CCUsers) {
+  // Check whether we have an addition of -1.
+  unsigned Opcode = MI->getOpcode();
+  unsigned BRCT;
+  if (Opcode == SystemZ::AHI)
+    BRCT = SystemZ::BRCT;
+  else if (Opcode == SystemZ::AGHI)
+    BRCT = SystemZ::BRCTG;
+  else
+    return false;
+  if (MI->getOperand(2).getImm() != -1)
+    return false;
+
+  // Check whether we have a single JLH.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  if (Branch->getOpcode() != SystemZ::BRC ||
+      Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP ||
+      Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE)
+    return false;
+
+  // We already know that there are no references to the register between
+  // MI and Compare.  Make sure that there are also no references between
+  // Compare and Branch.
+  unsigned SrcReg = Compare->getOperand(0).getReg();
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (getRegReferences(MBBI, SrcReg))
+      return false;
+
+  // The transformation is OK.  Rebuild Branch as a BRCT(G).
+  MachineOperand Target(Branch->getOperand(2));
+  Branch->RemoveOperand(2);
+  Branch->RemoveOperand(1);
+  Branch->RemoveOperand(0);
+  Branch->setDesc(TII->get(BRCT));
+  MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1))
+    .addOperand(Target)
+    .addReg(SystemZ::CC, RegState::ImplicitDefine);
+  MI->removeFromParent();
+  return true;
+}
+
+// If MI is a load instruction, try to convert it into a LOAD AND TEST.
+// Return true on success.
+bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) {
+  unsigned Opcode = TII->getLoadAndTest(MI->getOpcode());
+  if (!Opcode)
+    return false;
+
+  MI->setDesc(TII->get(Opcode));
+  MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+    .addReg(SystemZ::CC, RegState::ImplicitDefine);
+  return true;
+}
+
+// The CC users in CCUsers are testing the result of a comparison of some
+// value X against zero and we know that any CC value produced by MI
+// would also reflect the value of X.  Try to adjust CCUsers so that
+// they test the result of MI directly, returning true on success.
+// Leave everything unchanged on failure.
+bool SystemZElimCompare::
+adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
+                      SmallVectorImpl<MachineInstr *> &CCUsers) {
+  int Opcode = MI->getOpcode();
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  unsigned MIFlags = Desc.TSFlags;
+
+  // See which compare-style condition codes are available.
+  unsigned ReusableCCMask = SystemZII::getCompareZeroCCMask(MIFlags);
+
+  // For unsigned comparisons with zero, only equality makes sense.
+  unsigned CompareFlags = Compare->getDesc().TSFlags;
+  if (CompareFlags & SystemZII::IsLogical)
+    ReusableCCMask &= SystemZ::CCMASK_CMP_EQ;
+
+  if (ReusableCCMask == 0)
+    return false;
+
+  unsigned CCValues = SystemZII::getCCValues(MIFlags);
+  assert((ReusableCCMask & ~CCValues) == 0 && "Invalid CCValues");
+
+  // Now check whether these flags are enough for all users.
+  SmallVector<MachineOperand *, 4> AlterMasks;
+  for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
+    MachineInstr *MI = CCUsers[I];
+
+    // Fail if this isn't a use of CC that we understand.
+    unsigned Flags = MI->getDesc().TSFlags;
+    unsigned FirstOpNum;
+    if (Flags & SystemZII::CCMaskFirst)
+      FirstOpNum = 0;
+    else if (Flags & SystemZII::CCMaskLast)
+      FirstOpNum = MI->getNumExplicitOperands() - 2;
+    else
+      return false;
+
+    // Check whether the instruction predicate treats all CC values
+    // outside of ReusableCCMask in the same way.  In that case it
+    // doesn't matter what those CC values mean.
+    unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
+    unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
+    unsigned OutValid = ~ReusableCCMask & CCValid;
+    unsigned OutMask = ~ReusableCCMask & CCMask;
+    if (OutMask != 0 && OutMask != OutValid)
+      return false;
+
+    AlterMasks.push_back(&MI->getOperand(FirstOpNum));
+    AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+  }
+
+  // All users are OK.  Adjust the masks for MI.
+  for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
+    AlterMasks[I]->setImm(CCValues);
+    unsigned CCMask = AlterMasks[I + 1]->getImm();
+    if (CCMask & ~ReusableCCMask)
+      AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
+                                (CCValues & ~ReusableCCMask));
+  }
+
+  // CC is now live after MI.
+  int CCDef = MI->findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+  assert(CCDef >= 0 && "Couldn't find CC set");
+  MI->getOperand(CCDef).setIsDead(false);
+
+  // Clear any intervening kills of CC.
+  MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    MBBI->clearRegisterKills(SystemZ::CC, TRI);
+
+  return true;
+}
+
+// Return true if Compare is a comparison against zero.
+static bool isCompareZero(MachineInstr *Compare) {
+  switch (Compare->getOpcode()) {
+  case SystemZ::LTEBRCompare:
+  case SystemZ::LTDBRCompare:
+  case SystemZ::LTXBRCompare:
+    return true;
+
+  default:
+    return (Compare->getNumExplicitOperands() == 2 &&
+            Compare->getOperand(1).isImm() &&
+            Compare->getOperand(1).getImm() == 0);
+  }
+}
+
+// Try to optimize cases where comparison instruction Compare is testing
+// a value against zero.  Return true on success and if Compare should be
+// deleted as dead.  CCUsers is the list of instructions that use the CC
+// value produced by Compare.
+bool SystemZElimCompare::
+optimizeCompareZero(MachineInstr *Compare,
+                    SmallVectorImpl<MachineInstr *> &CCUsers) {
+  if (!isCompareZero(Compare))
+    return false;
+
+  // Search back for CC results that are based on the first operand.
+  unsigned SrcReg = Compare->getOperand(0).getReg();
+  unsigned SrcSubReg = Compare->getOperand(0).getSubReg();
+  MachineBasicBlock &MBB = *Compare->getParent();
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
+  Reference CCRefs;
+  Reference SrcRefs;
+  while (MBBI != MBBE) {
+    --MBBI;
+    MachineInstr *MI = MBBI;
+    if (resultTests(MI, SrcReg, SrcSubReg)) {
+      // Try to remove both MI and Compare by converting a branch to BRCT(G).
+      // We don't care in this case whether CC is modified between MI and
+      // Compare.
+      if (!CCRefs.Use && !SrcRefs && convertToBRCT(MI, Compare, CCUsers)) {
+        BranchOnCounts += 1;
+        return true;
+      }
+      // Try to eliminate Compare by reusing a CC result from MI.
+      if ((!CCRefs && convertToLoadAndTest(MI)) ||
+          (!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
+        EliminatedComparisons += 1;
+        return true;
+      }
+    }
+    SrcRefs |= getRegReferences(MI, SrcReg);
+    if (SrcRefs.Def)
+      return false;
+    CCRefs |= getRegReferences(MI, SystemZ::CC);
+    if (CCRefs.Use && CCRefs.Def)
+      return false;
+  }
+  return false;
+}
+
+// Try to fuse comparison instruction Compare into a later branch.
+// Return true on success and if Compare is therefore redundant.
+bool SystemZElimCompare::
+fuseCompareAndBranch(MachineInstr *Compare,
+                     SmallVectorImpl<MachineInstr *> &CCUsers) {
+  // See whether we have a comparison that can be fused.
+  unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(),
+                                                  Compare);
+  if (!FusedOpcode)
+    return false;
+
+  // See whether we have a single branch with which to fuse.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  if (Branch->getOpcode() != SystemZ::BRC)
+    return false;
+
+  // Make sure that the operands are available at the branch.
+  unsigned SrcReg = Compare->getOperand(0).getReg();
+  unsigned SrcReg2 = (Compare->getOperand(1).isReg() ?
+                      Compare->getOperand(1).getReg() : 0);
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (MBBI->modifiesRegister(SrcReg, TRI) ||
+        (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
+      return false;
+
+  // Read the branch mask and target.
+  MachineOperand CCMask(MBBI->getOperand(1));
+  MachineOperand Target(MBBI->getOperand(2));
+  assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
+         "Invalid condition-code mask for integer comparison");
+
+  // Clear out all current operands.
+  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
+  assert(CCUse >= 0 && "BRC must use CC");
+  Branch->RemoveOperand(CCUse);
+  Branch->RemoveOperand(2);
+  Branch->RemoveOperand(1);
+  Branch->RemoveOperand(0);
+
+  // Rebuild Branch as a fused compare and branch.
+  Branch->setDesc(TII->get(FusedOpcode));
+  MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+    .addOperand(Compare->getOperand(0))
+    .addOperand(Compare->getOperand(1))
+    .addOperand(CCMask)
+    .addOperand(Target)
+    .addReg(SystemZ::CC, RegState::ImplicitDefine);
+
+  // Clear any intervening kills of SrcReg and SrcReg2.
+  MBBI = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MBBI->clearRegisterKills(SrcReg, TRI);
+    if (SrcReg2)
+      MBBI->clearRegisterKills(SrcReg2, TRI);
+  }
+  FusedComparisons += 1;
+  return true;
+}
+
+// Process all comparison instructions in MBB.  Return true if something
+// changed.
+bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  // Walk backwards through the block looking for comparisons, recording
+  // all CC users as we go.  The subroutines can delete Compare and
+  // instructions before it.
+  bool CompleteCCUsers = !isCCLiveOut(MBB);
+  SmallVector<MachineInstr *, 4> CCUsers;
+  MachineBasicBlock::iterator MBBI = MBB.end();
+  while (MBBI != MBB.begin()) {
+    MachineInstr *MI = --MBBI;
+    if (CompleteCCUsers &&
+        MI->isCompare() &&
+        (optimizeCompareZero(MI, CCUsers) ||
+         fuseCompareAndBranch(MI, CCUsers))) {
+      ++MBBI;
+      MI->removeFromParent();
+      Changed = true;
+      CCUsers.clear();
+      CompleteCCUsers = true;
+      continue;
+    }
+
+    Reference CCRefs(getRegReferences(MI, SystemZ::CC));
+    if (CCRefs.Def) {
+      CCUsers.clear();
+      CompleteCCUsers = !CCRefs.IndirectDef;
+    }
+    if (CompleteCCUsers && CCRefs.Use)
+      CCUsers.push_back(MI);
+  }
+  return Changed;
+}
+
+bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  TRI = &TII->getRegisterInfo();
+
+  bool Changed = false;
+  for (auto &MBB : F)
+    Changed |= processBlock(MBB);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
new file mode 100644
index 0000000..055dbe9
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -0,0 +1,513 @@
+//===-- SystemZFrameLowering.cpp - Frame lowering for SystemZ -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZFrameLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+namespace {
+// The ABI-defined register save slots, relative to the incoming stack
+// pointer.
+static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+  { SystemZ::R2D,  0x10 },
+  { SystemZ::R3D,  0x18 },
+  { SystemZ::R4D,  0x20 },
+  { SystemZ::R5D,  0x28 },
+  { SystemZ::R6D,  0x30 },
+  { SystemZ::R7D,  0x38 },
+  { SystemZ::R8D,  0x40 },
+  { SystemZ::R9D,  0x48 },
+  { SystemZ::R10D, 0x50 },
+  { SystemZ::R11D, 0x58 },
+  { SystemZ::R12D, 0x60 },
+  { SystemZ::R13D, 0x68 },
+  { SystemZ::R14D, 0x70 },
+  { SystemZ::R15D, 0x78 },
+  { SystemZ::F0D,  0x80 },
+  { SystemZ::F2D,  0x88 },
+  { SystemZ::F4D,  0x90 },
+  { SystemZ::F6D,  0x98 }
+};
+} // end anonymous namespace
+
+SystemZFrameLowering::SystemZFrameLowering()
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
+                          -SystemZMC::CallFrameSize, 8) {
+  // Create a mapping from register number to save slot offset.
+  RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+  for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
+    RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+}
+
+const TargetFrameLowering::SpillSlot *
+SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  NumEntries = array_lengthof(SpillOffsetTable);
+  return SpillOffsetTable;
+}
+
+void SystemZFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineFrameInfo *MFFrame = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  bool HasFP = hasFP(MF);
+  SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  bool IsVarArg = MF.getFunction()->isVarArg();
+
+  // va_start stores incoming FPR varargs in the normal way, but delegates
+  // the saving of incoming GPR varargs to spillCalleeSavedRegisters().
+  // Record these pending uses, which typically include the call-saved
+  // argument register R6D.
+  if (IsVarArg)
+    for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+      MRI.setPhysRegUsed(SystemZ::ArgGPRs[I]);
+
+  // If the function requires a frame pointer, record that the hard
+  // frame pointer will be clobbered.
+  if (HasFP)
+    MRI.setPhysRegUsed(SystemZ::R11D);
+
+  // If the function calls other functions, record that the return
+  // address register will be clobbered.
+  if (MFFrame->hasCalls())
+    MRI.setPhysRegUsed(SystemZ::R14D);
+
+  // If we are saving GPRs other than the stack pointer, we might as well
+  // save and restore the stack pointer at the same time, via STMG and LMG.
+  // This allows the deallocation to be done by the LMG, rather than needing
+  // a separate %r15 addition.
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  for (unsigned I = 0; CSRegs[I]; ++I) {
+    unsigned Reg = CSRegs[I];
+    if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) {
+      MRI.setPhysRegUsed(SystemZ::R15D);
+      break;
+    }
+  }
+}
+
+// Add GPR64 to the save instruction being built by MIB, which is in basic
+// block MBB.  IsImplicit says whether this is an explicit operand to the
+// instruction, or an implicit one that comes between the explicit start
+// and end registers.
+static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
+                        unsigned GPR64, bool IsImplicit) {
+  const TargetRegisterInfo *RI = MBB.getParent()->getTarget().getRegisterInfo();
+  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
+  bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
+  if (!IsLive || !IsImplicit) {
+    MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
+    if (!IsLive)
+      MBB.addLiveIn(GPR64);
+  }
+}
+
+bool SystemZFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  bool IsVarArg = MF.getFunction()->isVarArg();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Scan the call-saved GPRs and find the bounds of the register spill area.
+  unsigned LowGPR = 0;
+  unsigned HighGPR = SystemZ::R15D;
+  unsigned StartOffset = -1U;
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::GR64BitRegClass.contains(Reg)) {
+      unsigned Offset = RegSpillOffsets[Reg];
+      assert(Offset && "Unexpected GPR save");
+      if (StartOffset > Offset) {
+        LowGPR = Reg;
+        StartOffset = Offset;
+      }
+    }
+  }
+
+  // Save the range of call-saved registers, for use by the epilogue inserter.
+  ZFI->setLowSavedGPR(LowGPR);
+  ZFI->setHighSavedGPR(HighGPR);
+
+  // Include the GPR varargs, if any.  R6D is call-saved, so would
+  // be included by the loop above, but we also need to handle the
+  // call-clobbered argument registers.
+  if (IsVarArg) {
+    unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
+    if (FirstGPR < SystemZ::NumArgGPRs) {
+      unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
+      unsigned Offset = RegSpillOffsets[Reg];
+      if (StartOffset > Offset) {
+        LowGPR = Reg; StartOffset = Offset;
+      }
+    }
+  }
+
+  // Save GPRs
+  if (LowGPR) {
+    assert(LowGPR != HighGPR && "Should be saving %r15 and something else");
+
+    // Build an STMG instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+    // Add the explicit register operands.
+    addSavedGPR(MBB, MIB, LowGPR, false);
+    addSavedGPR(MBB, MIB, HighGPR, false);
+
+    // Add the address.
+    MIB.addReg(SystemZ::R15D).addImm(StartOffset);
+
+    // Make sure all call-saved GPRs are included as operands and are
+    // marked as live on entry.
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (SystemZ::GR64BitRegClass.contains(Reg))
+        addSavedGPR(MBB, MIB, Reg, true);
+    }
+
+    // ...likewise GPR varargs.
+    if (IsVarArg)
+      for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+        addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
+  }
+
+  // Save FPRs in the normal TargetInstrInfo way.
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg)) {
+      MBB.addLiveIn(Reg);
+      TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+                               &SystemZ::FP64BitRegClass, TRI);
+    }
+  }
+
+  return true;
+}
+
+bool SystemZFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  bool HasFP = hasFP(MF);
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Restore FPRs in the normal TargetInstrInfo way.
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg))
+      TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+                                &SystemZ::FP64BitRegClass, TRI);
+  }
+
+  // Restore call-saved GPRs (but not call-clobbered varargs, which at
+  // this point might hold return values).
+  unsigned LowGPR = ZFI->getLowSavedGPR();
+  unsigned HighGPR = ZFI->getHighSavedGPR();
+  unsigned StartOffset = RegSpillOffsets[LowGPR];
+  if (LowGPR) {
+    // If we saved any of %r2-%r5 as varargs, we should also be saving
+    // and restoring %r6.  If we're saving %r6 or above, we should be
+    // restoring it too.
+    assert(LowGPR != HighGPR && "Should be loading %r15 and something else");
+
+    // Build an LMG instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG));
+
+    // Add the explicit register operands.
+    MIB.addReg(LowGPR, RegState::Define);
+    MIB.addReg(HighGPR, RegState::Define);
+
+    // Add the address.
+    MIB.addReg(HasFP ? SystemZ::R11D : SystemZ::R15D);
+    MIB.addImm(StartOffset);
+
+    // Do a second scan adding regs as being defined by instruction
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (Reg != LowGPR && Reg != HighGPR)
+        MIB.addReg(Reg, RegState::ImplicitDefine);
+    }
+  }
+
+  return true;
+}
+
+void SystemZFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                    RegScavenger *RS) const {
+  MachineFrameInfo *MFFrame = MF.getFrameInfo();
+  uint64_t MaxReach = (MFFrame->estimateStackSize(MF) +
+                       SystemZMC::CallFrameSize * 2);
+  if (!isUInt<12>(MaxReach)) {
+    // We may need register scavenging slots if some parts of the frame
+    // are outside the reach of an unsigned 12-bit displacement.
+    // Create 2 for the case where both addresses in an MVC are
+    // out of range.
+    RS->addScavengingFrameIndex(MFFrame->CreateStackObject(8, 8, false));
+    RS->addScavengingFrameIndex(MFFrame->CreateStackObject(8, 8, false));
+  }
+}
+
+// Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
+static void emitIncrement(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI,
+                          const DebugLoc &DL,
+                          unsigned Reg, int64_t NumBytes,
+                          const TargetInstrInfo *TII) {
+  while (NumBytes) {
+    unsigned Opcode;
+    int64_t ThisVal = NumBytes;
+    if (isInt<16>(NumBytes))
+      Opcode = SystemZ::AGHI;
+    else {
+      Opcode = SystemZ::AGFI;
+      // Make sure we maintain 8-byte stack alignment.
+      int64_t MinVal = -int64_t(1) << 31;
+      int64_t MaxVal = (int64_t(1) << 31) - 8;
+      if (ThisVal < MinVal)
+        ThisVal = MinVal;
+      else if (ThisVal > MaxVal)
+        ThisVal = MaxVal;
+    }
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(Opcode), Reg)
+      .addReg(Reg).addImm(ThisVal);
+    // The CC implicit def is dead.
+    MI->getOperand(3).setIsDead();
+    NumBytes -= ThisVal;
+  }
+}
+
+void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFFrame = MF.getFrameInfo();
+  auto *ZII =
+    static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo();
+  bool HasFP = hasFP(MF);
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // The current offset of the stack pointer from the CFA.
+  int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+
+  if (ZFI->getLowSavedGPR()) {
+    // Skip over the GPR saves.
+    if (MBBI != MBB.end() && MBBI->getOpcode() == SystemZ::STMG)
+      ++MBBI;
+    else
+      llvm_unreachable("Couldn't skip over GPR saves");
+
+    // Add CFI for the GPR saves.
+    for (auto &Save : CSI) {
+      unsigned Reg = Save.getReg();
+      if (SystemZ::GR64BitRegClass.contains(Reg)) {
+        int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
+        unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+        BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+    }
+  }
+
+  uint64_t StackSize = getAllocatedStackSize(MF);
+  if (StackSize) {
+    // Allocate StackSize bytes.
+    int64_t Delta = -int64_t(StackSize);
+    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+
+    // Add CFI for the allocation.
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
+    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+    SPOffsetFromCFA += Delta;
+  }
+
+  if (HasFP) {
+    // Copy the base of the frame to R11.
+    BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R11D)
+      .addReg(SystemZ::R15D);
+
+    // Add CFI for the new frame location.
+    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
+    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+
+    // Mark the FramePtr as live at the beginning of every block except
+    // the entry block.  (We'll have marked R11 as live on entry when
+    // saving the GPRs.)
+    for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
+      I->addLiveIn(SystemZ::R11D);
+  }
+
+  // Skip over the FPR saves.
+  SmallVector<unsigned, 8> CFIIndexes;
+  for (auto &Save : CSI) {
+    unsigned Reg = Save.getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg)) {
+      if (MBBI != MBB.end() &&
+          (MBBI->getOpcode() == SystemZ::STD ||
+           MBBI->getOpcode() == SystemZ::STDY))
+        ++MBBI;
+      else
+        llvm_unreachable("Couldn't skip over FPR save");
+
+      // Add CFI for the this save.
+      unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+      int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx());
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, DwarfReg, SPOffsetFromCFA + Offset));
+      CFIIndexes.push_back(CFIIndex);
+    }
+  }
+  // Complete the CFI for the FPR saves, modelling them as taking effect
+  // after the last save.
+  for (auto CFIIndex : CFIIndexes) {
+    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+}
+
+void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  auto *ZII =
+    static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+
+  // Skip the return instruction.
+  assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
+
+  uint64_t StackSize = getAllocatedStackSize(MF);
+  if (ZFI->getLowSavedGPR()) {
+    --MBBI;
+    unsigned Opcode = MBBI->getOpcode();
+    if (Opcode != SystemZ::LMG)
+      llvm_unreachable("Expected to see callee-save register restore code");
+
+    unsigned AddrOpNo = 2;
+    DebugLoc DL = MBBI->getDebugLoc();
+    uint64_t Offset = StackSize + MBBI->getOperand(AddrOpNo + 1).getImm();
+    unsigned NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+
+    // If the offset is too large, use the largest stack-aligned offset
+    // and add the rest to the base register (the stack or frame pointer).
+    if (!NewOpcode) {
+      uint64_t NumBytes = Offset - 0x7fff8;
+      emitIncrement(MBB, MBBI, DL, MBBI->getOperand(AddrOpNo).getReg(),
+                    NumBytes, ZII);
+      Offset -= NumBytes;
+      NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+      assert(NewOpcode && "No restore instruction available");
+    }
+
+    MBBI->setDesc(ZII->get(NewOpcode));
+    MBBI->getOperand(AddrOpNo + 1).ChangeToImmediate(Offset);
+  } else if (StackSize) {
+    DebugLoc DL = MBBI->getDebugLoc();
+    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, StackSize, ZII);
+  }
+}
+
+bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          MF.getFrameInfo()->hasVarSizedObjects() ||
+          MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
+}
+
+int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                              int FI) const {
+  const MachineFrameInfo *MFFrame = MF.getFrameInfo();
+
+  // Start with the offset of FI from the top of the caller-allocated frame
+  // (i.e. the top of the 160 bytes allocated by the caller).  This initial
+  // offset is therefore negative.
+  int64_t Offset = (MFFrame->getObjectOffset(FI) +
+                    MFFrame->getOffsetAdjustment());
+
+  // Make the offset relative to the incoming stack pointer.
+  Offset -= getOffsetOfLocalArea();
+
+  // Make the offset relative to the bottom of the frame.
+  Offset += getAllocatedStackSize(MF);
+
+  return Offset;
+}
+
+uint64_t SystemZFrameLowering::
+getAllocatedStackSize(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFFrame = MF.getFrameInfo();
+
+  // Start with the size of the local variables and spill slots.
+  uint64_t StackSize = MFFrame->getStackSize();
+
+  // We need to allocate the ABI-defined 160-byte base area whenever
+  // we allocate stack space for our own use and whenever we call another
+  // function.
+  if (StackSize || MFFrame->hasVarSizedObjects() || MFFrame->hasCalls())
+    StackSize += SystemZMC::CallFrameSize;
+
+  return StackSize;
+}
+
+bool
+SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // The ABI requires us to allocate 160 bytes of stack space for the callee,
+  // with any outgoing stack arguments being placed above that.  It seems
+  // better to make that area a permanent feature of the frame even if
+  // we're using a frame pointer.
+  return true;
+}
+
+void SystemZFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::ADJCALLSTACKDOWN:
+  case SystemZ::ADJCALLSTACKUP:
+    assert(hasReservedCallFrame(MF) &&
+           "ADJSTACKDOWN and ADJSTACKUP should be no-ops");
+    MBB.erase(MI);
+    break;
+
+  default:
+    llvm_unreachable("Unexpected call frame instruction");
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
new file mode 100644
index 0000000..4d5fe6d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -0,0 +1,64 @@
+//===-- SystemZFrameLowering.h - Frame lowering for SystemZ -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZFRAMELOWERING_H
+#define SYSTEMZFRAMELOWERING_H
+
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class SystemZSubtarget;
+
+class SystemZFrameLowering : public TargetFrameLowering {
+  IndexedMap<unsigned> RegSpillOffsets;
+
+public:
+  SystemZFrameLowering();
+
+  // Override TargetFrameLowering.
+  bool isFPCloseToIncomingSP() const override { return false; }
+  const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
+    override;
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBII,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const
+    override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS) const override;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const
+    override;
+
+  // Return the number of bytes in the callee-allocated part of the frame.
+  uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
+
+  // Return the byte offset from the incoming stack pointer of Reg's
+  // ABI-defined save slot.  Return 0 if no slot is defined for Reg.
+  unsigned getRegSpillOffset(unsigned Reg) const {
+    return RegSpillOffsets[Reg];
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
new file mode 100644
index 0000000..24f7584
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -0,0 +1,1144 @@
+//===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SystemZ target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-isel"
+
+namespace {
+// Used to build addressing modes.
+struct SystemZAddressingMode {
+  // The shape of the address.
+  enum AddrForm {
+    // base+displacement
+    FormBD,
+
+    // base+displacement+index for load and store operands
+    FormBDXNormal,
+
+    // base+displacement+index for load address operands
+    FormBDXLA,
+
+    // base+displacement+index+ADJDYNALLOC
+    FormBDXDynAlloc
+  };
+  AddrForm Form;
+
+  // The type of displacement.  The enum names here correspond directly
+  // to the definitions in SystemZOperand.td.  We could split them into
+  // flags -- single/pair, 128-bit, etc. -- but it hardly seems worth it.
+  enum DispRange {
+    Disp12Only,
+    Disp12Pair,
+    Disp20Only,
+    Disp20Only128,
+    Disp20Pair
+  };
+  DispRange DR;
+
+  // The parts of the address.  The address is equivalent to:
+  //
+  //     Base + Disp + Index + (IncludesDynAlloc ? ADJDYNALLOC : 0)
+  SDValue Base;
+  int64_t Disp;
+  SDValue Index;
+  bool IncludesDynAlloc;
+
+  SystemZAddressingMode(AddrForm form, DispRange dr)
+    : Form(form), DR(dr), Base(), Disp(0), Index(),
+      IncludesDynAlloc(false) {}
+
+  // True if the address can have an index register.
+  bool hasIndexField() { return Form != FormBD; }
+
+  // True if the address can (and must) include ADJDYNALLOC.
+  bool isDynAlloc() { return Form == FormBDXDynAlloc; }
+
+  void dump() {
+    errs() << "SystemZAddressingMode " << this << '\n';
+
+    errs() << " Base ";
+    if (Base.getNode())
+      Base.getNode()->dump();
+    else
+      errs() << "null\n";
+
+    if (hasIndexField()) {
+      errs() << " Index ";
+      if (Index.getNode())
+        Index.getNode()->dump();
+      else
+        errs() << "null\n";
+    }
+
+    errs() << " Disp " << Disp;
+    if (IncludesDynAlloc)
+      errs() << " + ADJDYNALLOC";
+    errs() << '\n';
+  }
+};
+
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+  return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
+}
+
+// Represents operands 2 to 5 of the ROTATE AND ... SELECTED BITS operation
+// given by Opcode.  The operands are: Input (R2), Start (I3), End (I4) and
+// Rotate (I5).  The combined operand value is effectively:
+//
+//   (or (rotl Input, Rotate), ~Mask)
+//
+// for RNSBG and:
+//
+//   (and (rotl Input, Rotate), Mask)
+//
+// otherwise.  The output value has BitSize bits, although Input may be
+// narrower (in which case the upper bits are don't care).
+struct RxSBGOperands {
+  RxSBGOperands(unsigned Op, SDValue N)
+    : Opcode(Op), BitSize(N.getValueType().getSizeInBits()),
+      Mask(allOnes(BitSize)), Input(N), Start(64 - BitSize), End(63),
+      Rotate(0) {}
+
+  unsigned Opcode;
+  unsigned BitSize;
+  uint64_t Mask;
+  SDValue Input;
+  unsigned Start;
+  unsigned End;
+  unsigned Rotate;
+};
+
+class SystemZDAGToDAGISel : public SelectionDAGISel {
+  const SystemZTargetLowering &Lowering;
+  const SystemZSubtarget &Subtarget;
+
+  // Used by SystemZOperands.td to create integer constants.
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
+    return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
+  }
+
+  const SystemZTargetMachine &getTargetMachine() const {
+    return static_cast<const SystemZTargetMachine &>(TM);
+  }
+
+  const SystemZInstrInfo *getInstrInfo() const {
+    return getTargetMachine().getInstrInfo();
+  }
+
+  // Try to fold more of the base or index of AM into AM, where IsBase
+  // selects between the base and index.
+  bool expandAddress(SystemZAddressingMode &AM, bool IsBase) const;
+
+  // Try to describe N in AM, returning true on success.
+  bool selectAddress(SDValue N, SystemZAddressingMode &AM) const;
+
+  // Extract individual target operands from matched address AM.
+  void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+                          SDValue &Base, SDValue &Disp) const;
+  void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+                          SDValue &Base, SDValue &Disp, SDValue &Index) const;
+
+  // Try to match Addr as a FormBD address with displacement type DR.
+  // Return true on success, storing the base and displacement in
+  // Base and Disp respectively.
+  bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+                    SDValue &Base, SDValue &Disp) const;
+
+  // Try to match Addr as a FormBDX address with displacement type DR.
+  // Return true on success and if the result had no index.  Store the
+  // base and displacement in Base and Disp respectively.
+  bool selectMVIAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+                     SDValue &Base, SDValue &Disp) const;
+
+  // Try to match Addr as a FormBDX* address of form Form with
+  // displacement type DR.  Return true on success, storing the base,
+  // displacement and index in Base, Disp and Index respectively.
+  bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+                     SystemZAddressingMode::DispRange DR, SDValue Addr,
+                     SDValue &Base, SDValue &Disp, SDValue &Index) const;
+
+  // PC-relative address matching routines used by SystemZOperands.td.
+  bool selectPCRelAddress(SDValue Addr, SDValue &Target) const {
+    if (SystemZISD::isPCREL(Addr.getOpcode())) {
+      Target = Addr.getOperand(0);
+      return true;
+    }
+    return false;
+  }
+
+  // BD matching routines used by SystemZOperands.td.
+  bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
+  }
+  bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+  }
+  bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
+  }
+  bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+  }
+
+  // MVI matching routines used by SystemZOperands.td.
+  bool selectMVIAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+  }
+  bool selectMVIAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+  }
+
+  // BDX matching routines used by SystemZOperands.td.
+  bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp12Only,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp12Pair,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+                            SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
+                         SystemZAddressingMode::Disp12Only,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp20Only,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
+                              SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp20Only128,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp20Pair,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                          SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+                         SystemZAddressingMode::Disp12Pair,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                          SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+                         SystemZAddressingMode::Disp20Pair,
+                         Addr, Base, Disp, Index);
+  }
+
+  // Check whether (or Op (and X InsertMask)) is effectively an insertion
+  // of X into bits InsertMask of some Y != Op.  Return true if so and
+  // set Op to that Y.
+  bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const;
+
+  // Try to update RxSBG so that only the bits of RxSBG.Input in Mask are used.
+  // Return true on success.
+  bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) const;
+
+  // Try to fold some of RxSBG.Input into other fields of RxSBG.
+  // Return true on success.
+  bool expandRxSBG(RxSBGOperands &RxSBG) const;
+
+  // Return an undefined value of type VT.
+  SDValue getUNDEF(SDLoc DL, EVT VT) const;
+
+  // Convert N to VT, if it isn't already.
+  SDValue convertTo(SDLoc DL, EVT VT, SDValue N) const;
+
+  // Try to implement AND or shift node N using RISBG with the zero flag set.
+  // Return the selected node on success, otherwise return null.
+  SDNode *tryRISBGZero(SDNode *N);
+
+  // Try to use RISBG or Opcode to implement OR or XOR node N.
+  // Return the selected node on success, otherwise return null.
+  SDNode *tryRxSBG(SDNode *N, unsigned Opcode);
+
+  // If Op0 is null, then Node is a constant that can be loaded using:
+  //
+  //   (Opcode UpperVal LowerVal)
+  //
+  // If Op0 is nonnull, then Node can be implemented using:
+  //
+  //   (Opcode (Opcode Op0 UpperVal) LowerVal)
+  SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
+                              uint64_t UpperVal, uint64_t LowerVal);
+
+  // Return true if Load and Store are loads and stores of the same size
+  // and are guaranteed not to overlap.  Such operations can be implemented
+  // using block (SS-format) instructions.
+  //
+  // Partial overlap would lead to incorrect code, since the block operations
+  // are logically bytewise, even though they have a fast path for the
+  // non-overlapping case.  We also need to avoid full overlap (i.e. two
+  // addresses that might be equal at run time) because although that case
+  // would be handled correctly, it might be implemented by millicode.
+  bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
+  // N is a (store (load Y), X) pattern.  Return true if it can use an MVC
+  // from Y to X.
+  bool storeLoadCanUseMVC(SDNode *N) const;
+
+  // N is a (store (op (load A[0]), (load A[1])), X) pattern.  Return true
+  // if A[1 - I] == X and if N can use a block operation like NC from A[I]
+  // to X.
+  bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+
+public:
+  SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(TM, OptLevel),
+      Lowering(*TM.getTargetLowering()),
+      Subtarget(*TM.getSubtargetImpl()) { }
+
+  // Override MachineFunctionPass.
+  const char *getPassName() const override {
+    return "SystemZ DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Override SelectionDAGISel.
+  SDNode *Select(SDNode *Node) override;
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+
+  // Include the pieces autogenerated from the target description.
+  #include "SystemZGenDAGISel.inc"
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZISelDag(SystemZTargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new SystemZDAGToDAGISel(TM, OptLevel);
+}
+
+// Return true if Val should be selected as a displacement for an address
+// with range DR.  Here we're interested in the range of both the instruction
+// described by DR and of any pairing instruction.
+static bool selectDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+  switch (DR) {
+  case SystemZAddressingMode::Disp12Only:
+    return isUInt<12>(Val);
+
+  case SystemZAddressingMode::Disp12Pair:
+  case SystemZAddressingMode::Disp20Only:
+  case SystemZAddressingMode::Disp20Pair:
+    return isInt<20>(Val);
+
+  case SystemZAddressingMode::Disp20Only128:
+    return isInt<20>(Val) && isInt<20>(Val + 8);
+  }
+  llvm_unreachable("Unhandled displacement range");
+}
+
+// Change the base or index in AM to Value, where IsBase selects
+// between the base and index.
+static void changeComponent(SystemZAddressingMode &AM, bool IsBase,
+                            SDValue Value) {
+  if (IsBase)
+    AM.Base = Value;
+  else
+    AM.Index = Value;
+}
+
+// The base or index of AM is equivalent to Value + ADJDYNALLOC,
+// where IsBase selects between the base and index.  Try to fold the
+// ADJDYNALLOC into AM.
+static bool expandAdjDynAlloc(SystemZAddressingMode &AM, bool IsBase,
+                              SDValue Value) {
+  if (AM.isDynAlloc() && !AM.IncludesDynAlloc) {
+    changeComponent(AM, IsBase, Value);
+    AM.IncludesDynAlloc = true;
+    return true;
+  }
+  return false;
+}
+
+// The base of AM is equivalent to Base + Index.  Try to use Index as
+// the index register.
+static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
+                        SDValue Index) {
+  if (AM.hasIndexField() && !AM.Index.getNode()) {
+    AM.Base = Base;
+    AM.Index = Index;
+    return true;
+  }
+  return false;
+}
+
+// The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
+// between the base and index.  Try to fold Op1 into AM's displacement.
+static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
+                       SDValue Op0, uint64_t Op1) {
+  // First try adjusting the displacement.
+  int64_t TestDisp = AM.Disp + Op1;
+  if (selectDisp(AM.DR, TestDisp)) {
+    changeComponent(AM, IsBase, Op0);
+    AM.Disp = TestDisp;
+    return true;
+  }
+
+  // We could consider forcing the displacement into a register and
+  // using it as an index, but it would need to be carefully tuned.
+  return false;
+}
+
+bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
+                                        bool IsBase) const {
+  SDValue N = IsBase ? AM.Base : AM.Index;
+  unsigned Opcode = N.getOpcode();
+  if (Opcode == ISD::TRUNCATE) {
+    N = N.getOperand(0);
+    Opcode = N.getOpcode();
+  }
+  if (Opcode == ISD::ADD || CurDAG->isBaseWithConstantOffset(N)) {
+    SDValue Op0 = N.getOperand(0);
+    SDValue Op1 = N.getOperand(1);
+
+    unsigned Op0Code = Op0->getOpcode();
+    unsigned Op1Code = Op1->getOpcode();
+
+    if (Op0Code == SystemZISD::ADJDYNALLOC)
+      return expandAdjDynAlloc(AM, IsBase, Op1);
+    if (Op1Code == SystemZISD::ADJDYNALLOC)
+      return expandAdjDynAlloc(AM, IsBase, Op0);
+
+    if (Op0Code == ISD::Constant)
+      return expandDisp(AM, IsBase, Op1,
+                        cast<ConstantSDNode>(Op0)->getSExtValue());
+    if (Op1Code == ISD::Constant)
+      return expandDisp(AM, IsBase, Op0,
+                        cast<ConstantSDNode>(Op1)->getSExtValue());
+
+    if (IsBase && expandIndex(AM, Op0, Op1))
+      return true;
+  }
+  if (Opcode == SystemZISD::PCREL_OFFSET) {
+    SDValue Full = N.getOperand(0);
+    SDValue Base = N.getOperand(1);
+    SDValue Anchor = Base.getOperand(0);
+    uint64_t Offset = (cast<GlobalAddressSDNode>(Full)->getOffset() -
+                       cast<GlobalAddressSDNode>(Anchor)->getOffset());
+    return expandDisp(AM, IsBase, Base, Offset);
+  }
+  return false;
+}
+
+// Return true if an instruction with displacement range DR should be
+// used for displacement value Val.  selectDisp(DR, Val) must already hold.
+static bool isValidDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+  assert(selectDisp(DR, Val) && "Invalid displacement");
+  switch (DR) {
+  case SystemZAddressingMode::Disp12Only:
+  case SystemZAddressingMode::Disp20Only:
+  case SystemZAddressingMode::Disp20Only128:
+    return true;
+
+  case SystemZAddressingMode::Disp12Pair:
+    // Use the other instruction if the displacement is too large.
+    return isUInt<12>(Val);
+
+  case SystemZAddressingMode::Disp20Pair:
+    // Use the other instruction if the displacement is small enough.
+    return !isUInt<12>(Val);
+  }
+  llvm_unreachable("Unhandled displacement range");
+}
+
+// Return true if Base + Disp + Index should be performed by LA(Y).
+static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
+  // Don't use LA(Y) for constants.
+  if (!Base)
+    return false;
+
+  // Always use LA(Y) for frame addresses, since we know that the destination
+  // register is almost always (perhaps always) going to be different from
+  // the frame register.
+  if (Base->getOpcode() == ISD::FrameIndex)
+    return true;
+
+  if (Disp) {
+    // Always use LA(Y) if there is a base, displacement and index.
+    if (Index)
+      return true;
+
+    // Always use LA if the displacement is small enough.  It should always
+    // be no worse than AGHI (and better if it avoids a move).
+    if (isUInt<12>(Disp))
+      return true;
+
+    // For similar reasons, always use LAY if the constant is too big for AGHI.
+    // LAY should be no worse than AGFI.
+    if (!isInt<16>(Disp))
+      return true;
+  } else {
+    // Don't use LA for plain registers.
+    if (!Index)
+      return false;
+
+    // Don't use LA for plain addition if the index operand is only used
+    // once.  It should be a natural two-operand addition in that case.
+    if (Index->hasOneUse())
+      return false;
+
+    // Prefer addition if the second operation is sign-extended, in the
+    // hope of using AGF.
+    unsigned IndexOpcode = Index->getOpcode();
+    if (IndexOpcode == ISD::SIGN_EXTEND ||
+        IndexOpcode == ISD::SIGN_EXTEND_INREG)
+      return false;
+  }
+
+  // Don't use LA for two-operand addition if either operand is only
+  // used once.  The addition instructions are better in that case.
+  if (Base->hasOneUse())
+    return false;
+
+  return true;
+}
+
+// Return true if Addr is suitable for AM, updating AM if so.
+bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
+                                        SystemZAddressingMode &AM) const {
+  // Start out assuming that the address will need to be loaded separately,
+  // then try to extend it as much as we can.
+  AM.Base = Addr;
+
+  // First try treating the address as a constant.
+  if (Addr.getOpcode() == ISD::Constant &&
+      expandDisp(AM, true, SDValue(),
+                 cast<ConstantSDNode>(Addr)->getSExtValue()))
+    ;
+  else
+    // Otherwise try expanding each component.
+    while (expandAddress(AM, true) ||
+           (AM.Index.getNode() && expandAddress(AM, false)))
+      continue;
+
+  // Reject cases where it isn't profitable to use LA(Y).
+  if (AM.Form == SystemZAddressingMode::FormBDXLA &&
+      !shouldUseLA(AM.Base.getNode(), AM.Disp, AM.Index.getNode()))
+    return false;
+
+  // Reject cases where the other instruction in a pair should be used.
+  if (!isValidDisp(AM.DR, AM.Disp))
+    return false;
+
+  // Make sure that ADJDYNALLOC is included where necessary.
+  if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
+    return false;
+
+  DEBUG(AM.dump());
+  return true;
+}
+
+// Insert a node into the DAG at least before Pos.  This will reposition
+// the node as needed, and will assign it a node ID that is <= Pos's ID.
+// Note that this does *not* preserve the uniqueness of node IDs!
+// The selection DAG must no longer depend on their uniqueness when this
+// function is used.
+static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
+  if (N.getNode()->getNodeId() == -1 ||
+      N.getNode()->getNodeId() > Pos->getNodeId()) {
+    DAG->RepositionNode(Pos, N.getNode());
+    N.getNode()->setNodeId(Pos->getNodeId());
+  }
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+                                             EVT VT, SDValue &Base,
+                                             SDValue &Disp) const {
+  Base = AM.Base;
+  if (!Base.getNode())
+    // Register 0 means "no base".  This is mostly useful for shifts.
+    Base = CurDAG->getRegister(0, VT);
+  else if (Base.getOpcode() == ISD::FrameIndex) {
+    // Lower a FrameIndex to a TargetFrameIndex.
+    int64_t FrameIndex = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FrameIndex, VT);
+  } else if (Base.getValueType() != VT) {
+    // Truncate values from i64 to i32, for shifts.
+    assert(VT == MVT::i32 && Base.getValueType() == MVT::i64 &&
+           "Unexpected truncation");
+    SDLoc DL(Base);
+    SDValue Trunc = CurDAG->getNode(ISD::TRUNCATE, DL, VT, Base);
+    insertDAGNode(CurDAG, Base.getNode(), Trunc);
+    Base = Trunc;
+  }
+
+  // Lower the displacement to a TargetConstant.
+  Disp = CurDAG->getTargetConstant(AM.Disp, VT);
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+                                             EVT VT, SDValue &Base,
+                                             SDValue &Disp,
+                                             SDValue &Index) const {
+  getAddressOperands(AM, VT, Base, Disp);
+
+  Index = AM.Index;
+  if (!Index.getNode())
+    // Register 0 means "no index".
+    Index = CurDAG->getRegister(0, VT);
+}
+
+bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
+                                       SDValue Addr, SDValue &Base,
+                                       SDValue &Disp) const {
+  SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
+  if (!selectAddress(Addr, AM))
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::selectMVIAddr(SystemZAddressingMode::DispRange DR,
+                                        SDValue Addr, SDValue &Base,
+                                        SDValue &Disp) const {
+  SystemZAddressingMode AM(SystemZAddressingMode::FormBDXNormal, DR);
+  if (!selectAddress(Addr, AM) || AM.Index.getNode())
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+                                        SystemZAddressingMode::DispRange DR,
+                                        SDValue Addr, SDValue &Base,
+                                        SDValue &Disp, SDValue &Index) const {
+  SystemZAddressingMode AM(Form, DR);
+  if (!selectAddress(Addr, AM))
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp, Index);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
+                                               uint64_t InsertMask) const {
+  // We're only interested in cases where the insertion is into some operand
+  // of Op, rather than into Op itself.  The only useful case is an AND.
+  if (Op.getOpcode() != ISD::AND)
+    return false;
+
+  // We need a constant mask.
+  auto *MaskNode = dyn_cast<ConstantSDNode>(Op.getOperand(1).getNode());
+  if (!MaskNode)
+    return false;
+
+  // It's not an insertion of Op.getOperand(0) if the two masks overlap.
+  uint64_t AndMask = MaskNode->getZExtValue();
+  if (InsertMask & AndMask)
+    return false;
+
+  // It's only an insertion if all bits are covered or are known to be zero.
+  // The inner check covers all cases but is more expensive.
+  uint64_t Used = allOnes(Op.getValueType().getSizeInBits());
+  if (Used != (AndMask | InsertMask)) {
+    APInt KnownZero, KnownOne;
+    CurDAG->computeKnownBits(Op.getOperand(0), KnownZero, KnownOne);
+    if (Used != (AndMask | InsertMask | KnownZero.getZExtValue()))
+      return false;
+  }
+
+  Op = Op.getOperand(0);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG,
+                                          uint64_t Mask) const {
+  const SystemZInstrInfo *TII = getInstrInfo();
+  if (RxSBG.Rotate != 0)
+    Mask = (Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate));
+  Mask &= RxSBG.Mask;
+  if (TII->isRxSBGMask(Mask, RxSBG.BitSize, RxSBG.Start, RxSBG.End)) {
+    RxSBG.Mask = Mask;
+    return true;
+  }
+  return false;
+}
+
+// Return true if any bits of (RxSBG.Input & Mask) are significant.
+static bool maskMatters(RxSBGOperands &RxSBG, uint64_t Mask) {
+  // Rotate the mask in the same way as RxSBG.Input is rotated.
+  if (RxSBG.Rotate != 0)
+    Mask = ((Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate)));
+  return (Mask & RxSBG.Mask) != 0;
+}
+
+bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
+  SDValue N = RxSBG.Input;
+  unsigned Opcode = N.getOpcode();
+  switch (Opcode) {
+  case ISD::AND: {
+    if (RxSBG.Opcode == SystemZ::RNSBG)
+      return false;
+
+    auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!MaskNode)
+      return false;
+
+    SDValue Input = N.getOperand(0);
+    uint64_t Mask = MaskNode->getZExtValue();
+    if (!refineRxSBGMask(RxSBG, Mask)) {
+      // If some bits of Input are already known zeros, those bits will have
+      // been removed from the mask.  See if adding them back in makes the
+      // mask suitable.
+      APInt KnownZero, KnownOne;
+      CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
+      Mask |= KnownZero.getZExtValue();
+      if (!refineRxSBGMask(RxSBG, Mask))
+        return false;
+    }
+    RxSBG.Input = Input;
+    return true;
+  }
+
+  case ISD::OR: {
+    if (RxSBG.Opcode != SystemZ::RNSBG)
+      return false;
+
+    auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!MaskNode)
+      return false;
+
+    SDValue Input = N.getOperand(0);
+    uint64_t Mask = ~MaskNode->getZExtValue();
+    if (!refineRxSBGMask(RxSBG, Mask)) {
+      // If some bits of Input are already known ones, those bits will have
+      // been removed from the mask.  See if adding them back in makes the
+      // mask suitable.
+      APInt KnownZero, KnownOne;
+      CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
+      Mask &= ~KnownOne.getZExtValue();
+      if (!refineRxSBGMask(RxSBG, Mask))
+        return false;
+    }
+    RxSBG.Input = Input;
+    return true;
+  }
+
+  case ISD::ROTL: {
+    // Any 64-bit rotate left can be merged into the RxSBG.
+    if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
+      return false;
+    auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    RxSBG.Rotate = (RxSBG.Rotate + CountNode->getZExtValue()) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+      
+  case ISD::ANY_EXTEND:
+    // Bits above the extended operand are don't-care.
+    RxSBG.Input = N.getOperand(0);
+    return true;
+
+  case ISD::ZERO_EXTEND:
+    if (RxSBG.Opcode != SystemZ::RNSBG) {
+      // Restrict the mask to the extended operand.
+      unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
+      if (!refineRxSBGMask(RxSBG, allOnes(InnerBitSize)))
+        return false;
+
+      RxSBG.Input = N.getOperand(0);
+      return true;
+    }
+    // Fall through.
+    
+  case ISD::SIGN_EXTEND: {
+    // Check that the extension bits are don't-care (i.e. are masked out
+    // by the final mask).
+    unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
+    if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
+      return false;
+
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::SHL: {
+    auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    uint64_t Count = CountNode->getZExtValue();
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    if (Count < 1 || Count >= BitSize)
+      return false;
+
+    if (RxSBG.Opcode == SystemZ::RNSBG) {
+      // Treat (shl X, count) as (rotl X, size-count) as long as the bottom
+      // count bits from RxSBG.Input are ignored.
+      if (maskMatters(RxSBG, allOnes(Count)))
+        return false;
+    } else {
+      // Treat (shl X, count) as (and (rotl X, count), ~0<<count).
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count) << Count))
+        return false;
+    }
+
+    RxSBG.Rotate = (RxSBG.Rotate + Count) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::SRL:
+  case ISD::SRA: {
+    auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    uint64_t Count = CountNode->getZExtValue();
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    if (Count < 1 || Count >= BitSize)
+      return false;
+
+    if (RxSBG.Opcode == SystemZ::RNSBG || Opcode == ISD::SRA) {
+      // Treat (srl|sra X, count) as (rotl X, size-count) as long as the top
+      // count bits from RxSBG.Input are ignored.
+      if (maskMatters(RxSBG, allOnes(Count) << (BitSize - Count)))
+        return false;
+    } else {
+      // Treat (srl X, count), mask) as (and (rotl X, size-count), ~0>>count),
+      // which is similar to SLL above.
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count)))
+        return false;
+    }
+
+    RxSBG.Rotate = (RxSBG.Rotate - Count) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+SDValue SystemZDAGToDAGISel::getUNDEF(SDLoc DL, EVT VT) const {
+  SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+  return SDValue(N, 0);
+}
+
+SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) const {
+  if (N.getValueType() == MVT::i32 && VT == MVT::i64)
+    return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
+                                         DL, VT, getUNDEF(DL, MVT::i64), N);
+  if (N.getValueType() == MVT::i64 && VT == MVT::i32)
+    return CurDAG->getTargetExtractSubreg(SystemZ::subreg_l32, DL, VT, N);
+  assert(N.getValueType() == VT && "Unexpected value types");
+  return N;
+}
+
+SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
+  unsigned Count = 0;
+  while (expandRxSBG(RISBG))
+    if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND)
+      Count += 1;
+  if (Count == 0)
+    return nullptr;
+  if (Count == 1) {
+    // Prefer to use normal shift instructions over RISBG, since they can handle
+    // all cases and are sometimes shorter.
+    if (N->getOpcode() != ISD::AND)
+      return nullptr;
+
+    // Prefer register extensions like LLC over RISBG.  Also prefer to start
+    // out with normal ANDs if one instruction would be enough.  We can convert
+    // these ANDs into an RISBG later if a three-address instruction is useful.
+    if (VT == MVT::i32 ||
+        RISBG.Mask == 0xff ||
+        RISBG.Mask == 0xffff ||
+        SystemZ::isImmLF(~RISBG.Mask) ||
+        SystemZ::isImmHF(~RISBG.Mask)) {
+      // Force the new mask into the DAG, since it may include known-one bits.
+      auto *MaskN = cast<ConstantSDNode>(N->getOperand(1).getNode());
+      if (MaskN->getZExtValue() != RISBG.Mask) {
+        SDValue NewMask = CurDAG->getConstant(RISBG.Mask, VT);
+        N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewMask);
+        return SelectCode(N);
+      }
+      return nullptr;
+    }
+  }  
+
+  unsigned Opcode = SystemZ::RISBG;
+  EVT OpcodeVT = MVT::i64;
+  if (VT == MVT::i32 && Subtarget.hasHighWord()) {
+    Opcode = SystemZ::RISBMux;
+    OpcodeVT = MVT::i32;
+    RISBG.Start &= 31;
+    RISBG.End &= 31;
+  }
+  SDValue Ops[5] = {
+    getUNDEF(SDLoc(N), OpcodeVT),
+    convertTo(SDLoc(N), OpcodeVT, RISBG.Input),
+    CurDAG->getTargetConstant(RISBG.Start, MVT::i32),
+    CurDAG->getTargetConstant(RISBG.End | 128, MVT::i32),
+    CurDAG->getTargetConstant(RISBG.Rotate, MVT::i32)
+  };
+  N = CurDAG->getMachineNode(Opcode, SDLoc(N), OpcodeVT, Ops);
+  return convertTo(SDLoc(N), VT, SDValue(N, 0)).getNode();
+}
+
+SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
+  // Try treating each operand of N as the second operand of the RxSBG
+  // and see which goes deepest.
+  RxSBGOperands RxSBG[] = {
+    RxSBGOperands(Opcode, N->getOperand(0)),
+    RxSBGOperands(Opcode, N->getOperand(1))
+  };
+  unsigned Count[] = { 0, 0 };
+  for (unsigned I = 0; I < 2; ++I)
+    while (expandRxSBG(RxSBG[I]))
+      if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND)
+        Count[I] += 1;
+
+  // Do nothing if neither operand is suitable.
+  if (Count[0] == 0 && Count[1] == 0)
+    return nullptr;
+
+  // Pick the deepest second operand.
+  unsigned I = Count[0] > Count[1] ? 0 : 1;
+  SDValue Op0 = N->getOperand(I ^ 1);
+
+  // Prefer IC for character insertions from memory.
+  if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
+    if (auto *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
+      if (Load->getMemoryVT() == MVT::i8)
+        return nullptr;
+
+  // See whether we can avoid an AND in the first operand by converting
+  // ROSBG to RISBG.
+  if (Opcode == SystemZ::ROSBG && detectOrAndInsertion(Op0, RxSBG[I].Mask))
+    Opcode = SystemZ::RISBG;
+           
+  EVT VT = N->getValueType(0);
+  SDValue Ops[5] = {
+    convertTo(SDLoc(N), MVT::i64, Op0),
+    convertTo(SDLoc(N), MVT::i64, RxSBG[I].Input),
+    CurDAG->getTargetConstant(RxSBG[I].Start, MVT::i32),
+    CurDAG->getTargetConstant(RxSBG[I].End, MVT::i32),
+    CurDAG->getTargetConstant(RxSBG[I].Rotate, MVT::i32)
+  };
+  N = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, Ops);
+  return convertTo(SDLoc(N), VT, SDValue(N, 0)).getNode();
+}
+
+SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
+                                                 SDValue Op0, uint64_t UpperVal,
+                                                 uint64_t LowerVal) {
+  EVT VT = Node->getValueType(0);
+  SDLoc DL(Node);
+  SDValue Upper = CurDAG->getConstant(UpperVal, VT);
+  if (Op0.getNode())
+    Upper = CurDAG->getNode(Opcode, DL, VT, Op0, Upper);
+  Upper = SDValue(Select(Upper.getNode()), 0);
+
+  SDValue Lower = CurDAG->getConstant(LowerVal, VT);
+  SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
+  return Or.getNode();
+}
+
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+                                               LoadSDNode *Load) const {
+  // Check that the two memory operands have the same size.
+  if (Load->getMemoryVT() != Store->getMemoryVT())
+    return false;
+
+  // Volatility stops an access from being decomposed.
+  if (Load->isVolatile() || Store->isVolatile())
+    return false;
+
+  // There's no chance of overlap if the load is invariant.
+  if (Load->isInvariant())
+    return true;
+
+  // Otherwise we need to check whether there's an alias.
+  const Value *V1 = Load->getMemOperand()->getValue();
+  const Value *V2 = Store->getMemOperand()->getValue();
+  if (!V1 || !V2)
+    return false;
+
+  // Reject equality.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  int64_t End1 = Load->getSrcValueOffset() + Size;
+  int64_t End2 = Store->getSrcValueOffset() + Size;
+  if (V1 == V2 && End1 == End2)
+    return false;
+
+  return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getTBAAInfo()),
+                    AliasAnalysis::Location(V2, End2, Store->getTBAAInfo()));
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
+  auto *Store = cast<StoreSDNode>(N);
+  auto *Load = cast<LoadSDNode>(Store->getValue());
+
+  // Prefer not to use MVC if either address can use ... RELATIVE LONG
+  // instructions.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  if (Size > 1 && Size <= 8) {
+    // Prefer LHRL, LRL and LGRL.
+    if (SystemZISD::isPCREL(Load->getBasePtr().getOpcode()))
+      return false;
+    // Prefer STHRL, STRL and STGRL.
+    if (SystemZISD::isPCREL(Store->getBasePtr().getOpcode()))
+      return false;
+  }
+
+  return canUseBlockOperation(Store, Load);
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
+                                                     unsigned I) const {
+  auto *StoreA = cast<StoreSDNode>(N);
+  auto *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
+  auto *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
+  return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+}
+
+SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return nullptr;
+  }
+
+  unsigned Opcode = Node->getOpcode();
+  SDNode *ResNode = nullptr;
+  switch (Opcode) {
+  case ISD::OR:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      ResNode = tryRxSBG(Node, SystemZ::ROSBG);
+    goto or_xor;
+
+  case ISD::XOR:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      ResNode = tryRxSBG(Node, SystemZ::RXSBG);
+    // Fall through.
+  or_xor:
+    // If this is a 64-bit operation in which both 32-bit halves are nonzero,
+    // split the operation into two.
+    if (!ResNode && Node->getValueType(0) == MVT::i64)
+      if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
+        uint64_t Val = Op1->getZExtValue();
+        if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val))
+          Node = splitLargeImmediate(Opcode, Node, Node->getOperand(0),
+                                     Val - uint32_t(Val), uint32_t(Val));
+      }
+    break;
+
+  case ISD::AND:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      ResNode = tryRxSBG(Node, SystemZ::RNSBG);
+    // Fall through.
+  case ISD::ROTL:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::ZERO_EXTEND:
+    if (!ResNode)
+      ResNode = tryRISBGZero(Node);
+    break;
+
+  case ISD::Constant:
+    // If this is a 64-bit constant that is out of the range of LLILF,
+    // LLIHF and LGFI, split it into two 32-bit pieces.
+    if (Node->getValueType(0) == MVT::i64) {
+      uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue();
+      if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val))
+        Node = splitLargeImmediate(ISD::OR, Node, SDValue(),
+                                   Val - uint32_t(Val), uint32_t(Val));
+    }
+    break;
+
+  case SystemZISD::SELECT_CCMASK: {
+    SDValue Op0 = Node->getOperand(0);
+    SDValue Op1 = Node->getOperand(1);
+    // Prefer to put any load first, so that it can be matched as a
+    // conditional load.
+    if (Op1.getOpcode() == ISD::LOAD && Op0.getOpcode() != ISD::LOAD) {
+      SDValue CCValid = Node->getOperand(2);
+      SDValue CCMask = Node->getOperand(3);
+      uint64_t ConstCCValid =
+        cast<ConstantSDNode>(CCValid.getNode())->getZExtValue();
+      uint64_t ConstCCMask =
+        cast<ConstantSDNode>(CCMask.getNode())->getZExtValue();
+      // Invert the condition.
+      CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask,
+                                   CCMask.getValueType());
+      SDValue Op4 = Node->getOperand(4);
+      Node = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+    }
+    break;
+  }
+  }
+
+  // Select the default instruction
+  if (!ResNode)
+    ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ";
+        if (ResNode == nullptr || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        errs() << "\n";
+        );
+  return ResNode;
+}
+
+bool SystemZDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op,
+                             char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "Unexpected constraint code");
+  // Accept addresses with short displacements, which are compatible
+  // with Q, R, S and T.  But keep the index operand for future expansion.
+  SDValue Base, Disp, Index;
+  if (!selectBDXAddr(SystemZAddressingMode::FormBD,
+                     SystemZAddressingMode::Disp12Only,
+                     Op, Base, Disp, Index))
+    return true;
+  OutOps.push_back(Base);
+  OutOps.push_back(Disp);
+  OutOps.push_back(Index);
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
new file mode 100644
index 0000000..00c65f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -0,0 +1,3588 @@
+//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZISelLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-lower"
+
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+  IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+    : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+  int64_t XORValue;
+  int64_t AddValue;
+  unsigned Bit;
+};
+
+// Represents information about a comparison.
+struct Comparison {
+  Comparison(SDValue Op0In, SDValue Op1In)
+    : Op0(Op0In), Op1(Op1In), Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
+
+  // The operands to the comparison.
+  SDValue Op0, Op1;
+
+  // The opcode that should be used to compare Op0 and Op1.
+  unsigned Opcode;
+
+  // A SystemZICMP value.  Only used for integer comparisons.
+  unsigned ICmpType;
+
+  // The mask of CC values that Opcode can produce.
+  unsigned CCValid;
+
+  // The mask of CC values for which the original condition is true.
+  unsigned CCMask;
+};
+} // end anonymous namespace
+
+// Classify VT as either 32 or 64 bit.
+static bool is32Bit(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i32:
+    return true;
+  case MVT::i64:
+    return false;
+  default:
+    llvm_unreachable("Unsupported type");
+  }
+}
+
+// Return a version of MachineOperand that can be safely used before the
+// final use.
+static MachineOperand earlyUseOperand(MachineOperand Op) {
+  if (Op.isReg())
+    Op.setIsKill(false);
+  return Op;
+}
+
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
+    : TargetLowering(tm, new TargetLoweringObjectFileELF()),
+      Subtarget(tm.getSubtarget<SystemZSubtarget>()) {
+  MVT PtrVT = getPointerTy();
+
+  // Set up the register classes.
+  if (Subtarget.hasHighWord())
+    addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
+  else
+    addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
+  addRegisterClass(MVT::i64,  &SystemZ::GR64BitRegClass);
+  addRegisterClass(MVT::f32,  &SystemZ::FP32BitRegClass);
+  addRegisterClass(MVT::f64,  &SystemZ::FP64BitRegClass);
+  addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Set up special registers.
+  setExceptionPointerRegister(SystemZ::R6D);
+  setExceptionSelectorRegister(SystemZ::R7D);
+  setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+
+  // TODO: It may be better to default to latency-oriented scheduling, however
+  // LLVM's current latency-oriented scheduler can't handle physreg definitions
+  // such as SystemZ has with CC, so set this to the register-pressure
+  // scheduler, because it can.
+  setSchedulingPreference(Sched::RegPressure);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // Instructions are strings of 2-byte aligned 2-byte values.
+  setMinFunctionAlignment(2);
+
+  // Handle operations that are handled in a similar way for all types.
+  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+       I <= MVT::LAST_FP_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      // Lower SET_CC into an IPM-based sequence.
+      setOperationAction(ISD::SETCC, VT, Custom);
+
+      // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
+      setOperationAction(ISD::SELECT, VT, Expand);
+
+      // Lower SELECT_CC and BR_CC into separate comparisons and branches.
+      setOperationAction(ISD::SELECT_CC, VT, Custom);
+      setOperationAction(ISD::BR_CC,     VT, Custom);
+    }
+  }
+
+  // Expand jump table branches as address arithmetic followed by an
+  // indirect jump.
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+  // Expand BRCOND into a BR_CC (see above).
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  // Handle integer types.
+  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+       I <= MVT::LAST_INTEGER_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      // Expand individual DIV and REMs into DIVREMs.
+      setOperationAction(ISD::SDIV, VT, Expand);
+      setOperationAction(ISD::UDIV, VT, Expand);
+      setOperationAction(ISD::SREM, VT, Expand);
+      setOperationAction(ISD::UREM, VT, Expand);
+      setOperationAction(ISD::SDIVREM, VT, Custom);
+      setOperationAction(ISD::UDIVREM, VT, Custom);
+
+      // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
+      // stores, putting a serialization instruction after the stores.
+      setOperationAction(ISD::ATOMIC_LOAD,  VT, Custom);
+      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+
+      // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
+      // available, or if the operand is constant.
+      setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+
+      // No special instructions for these.
+      setOperationAction(ISD::CTPOP,           VT, Expand);
+      setOperationAction(ISD::CTTZ,            VT, Expand);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::ROTR,            VT, Expand);
+
+      // Use *MUL_LOHI where possible instead of MULH*.
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+      setOperationAction(ISD::UMUL_LOHI, VT, Custom);
+
+      // Only z196 and above have native support for conversions to unsigned.
+      if (!Subtarget.hasFPExtension())
+        setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    }
+  }
+
+  // Type legalization will convert 8- and 16-bit atomic operations into
+  // forms that operate on i32s (but still keeping the original memory VT).
+  // Lower them into full i32 operations.
+  setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Custom);
+
+  // z10 has instructions for signed but not unsigned FP conversion.
+  // Handle unsigned 32-bit types as signed 64-bit types.
+  if (!Subtarget.hasFPExtension()) {
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+  }
+
+  // We have native support for a 64-bit CTLZ, via FLOGR.
+  setOperationAction(ISD::CTLZ, MVT::i32, Promote);
+  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+
+  // Give LowerOperation the chance to replace 64-bit ORs with subregs.
+  setOperationAction(ISD::OR, MVT::i64, Custom);
+
+  // FIXME: Can we support these natively?
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+  // We have native instructions for i8, i16 and i32 extensions, but not i1.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // Handle the various types of symbolic address.
+  setOperationAction(ISD::ConstantPool,     PtrVT, Custom);
+  setOperationAction(ISD::GlobalAddress,    PtrVT, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+  setOperationAction(ISD::BlockAddress,     PtrVT, Custom);
+  setOperationAction(ISD::JumpTable,        PtrVT, Custom);
+
+  // We need to handle dynamic allocations specially because of the
+  // 160-byte area at the bottom of the stack.
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+  // Use custom expanders so that we can force the function to use
+  // a frame pointer.
+  setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
+
+  // Handle prefetches with PFD or PFDRL.
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+  // Handle floating-point types.
+  for (unsigned I = MVT::FIRST_FP_VALUETYPE;
+       I <= MVT::LAST_FP_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      // We can use FI for FRINT.
+      setOperationAction(ISD::FRINT, VT, Legal);
+
+      // We can use the extended form of FI for other rounding operations.
+      if (Subtarget.hasFPExtension()) {
+        setOperationAction(ISD::FNEARBYINT, VT, Legal);
+        setOperationAction(ISD::FFLOOR, VT, Legal);
+        setOperationAction(ISD::FCEIL, VT, Legal);
+        setOperationAction(ISD::FTRUNC, VT, Legal);
+        setOperationAction(ISD::FROUND, VT, Legal);
+      }
+
+      // No special instructions for these.
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+    }
+  }
+
+  // We have fused multiply-addition for f32 and f64 but not f128.
+  setOperationAction(ISD::FMA, MVT::f32,  Legal);
+  setOperationAction(ISD::FMA, MVT::f64,  Legal);
+  setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+  // Needed so that we don't try to implement f128 constant loads using
+  // a load-and-extend of a f80 constant (in cases where the constant
+  // would fit in an f80).
+  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+
+  // Floating-point truncation and stores need to be done separately.
+  setTruncStoreAction(MVT::f64,  MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+  // We have 64-bit FPR<->GPR moves, but need special handling for
+  // 32-bit forms.
+  setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+  setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+
+  // VASTART and VACOPY need to deal with the SystemZ-specific varargs
+  // structure, but VAEND is a no-op.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
+  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
+
+  // Codes for which we want to perform some z-specific combinations.
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+
+  // We want to use MVC in preference to even a single load/store pair.
+  MaxStoresPerMemcpy = 0;
+  MaxStoresPerMemcpyOptSize = 0;
+
+  // The main memset sequence is a byte store followed by an MVC.
+  // Two STC or MV..I stores win over that, but the kind of fused stores
+  // generated by target-independent code don't when the byte value is
+  // variable.  E.g.  "STC <reg>;MHI <reg>,257;STH <reg>" is not better
+  // than "STC;MVC".  Handle the choice in target-specific code instead.
+  MaxStoresPerMemset = 0;
+  MaxStoresPerMemsetOptSize = 0;
+}
+
+EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  case MVT::f128:
+    return false;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
+  return Imm.isZero() || Imm.isNegZero();
+}
+
+bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                          unsigned,
+                                                          bool *Fast) const {
+  // Unaligned accesses should never be slower than the expanded version.
+  // We check specifically for aligned accesses in the few cases where
+  // they are required.
+  if (Fast)
+    *Fast = true;
+  return true;
+}
+  
+bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                  Type *Ty) const {
+  // Punt on globals for now, although they can be used in limited
+  // RELATIVE LONG cases.
+  if (AM.BaseGV)
+    return false;
+
+  // Require a 20-bit signed offset.
+  if (!isInt<20>(AM.BaseOffs))
+    return false;
+
+  // Indexing is OK but no scale factor can be applied.
+  return AM.Scale == 0 || AM.Scale == 1;
+}
+
+bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
+  if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
+    return false;
+  unsigned FromBits = FromType->getPrimitiveSizeInBits();
+  unsigned ToBits = ToType->getPrimitiveSizeInBits();
+  return FromBits > ToBits;
+}
+
+bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
+  if (!FromVT.isInteger() || !ToVT.isInteger())
+    return false;
+  unsigned FromBits = FromVT.getSizeInBits();
+  unsigned ToBits = ToVT.getSizeInBits();
+  return FromBits > ToBits;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline asm support
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'a': // Address register
+    case 'd': // Data register (equivalent to 'r')
+    case 'f': // Floating-point register
+    case 'h': // High-part register
+    case 'r': // General-purpose register
+      return C_RegisterClass;
+
+    case 'Q': // Memory with base and unsigned 12-bit displacement
+    case 'R': // Likewise, plus an index
+    case 'S': // Memory with base and signed 20-bit displacement
+    case 'T': // Likewise, plus an index
+    case 'm': // Equivalent to 'T'.
+      return C_Memory;
+
+    case 'I': // Unsigned 8-bit constant
+    case 'J': // Unsigned 12-bit constant
+    case 'K': // Signed 16-bit constant
+    case 'L': // Signed 20-bit displacement (on all targets we support)
+    case 'M': // 0x7fffffff
+      return C_Other;
+
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+TargetLowering::ConstraintWeight SystemZTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                               const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+
+  case 'a': // Address register
+  case 'd': // Data register (equivalent to 'r')
+  case 'h': // High-part register
+  case 'r': // General-purpose register
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_Register;
+    break;
+
+  case 'f': // Floating-point register
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+
+  case 'I': // Unsigned 8-bit constant
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isUInt<8>(C->getZExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'J': // Unsigned 12-bit constant
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isUInt<12>(C->getZExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'K': // Signed 16-bit constant
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isInt<16>(C->getSExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'L': // Signed 20-bit displacement (on all targets we support)
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isInt<20>(C->getSExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'M': // 0x7fffffff
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (C->getZExtValue() == 0x7fffffff)
+        weight = CW_Constant;
+    break;
+  }
+  return weight;
+}
+
+// Parse a "{tNNN}" register constraint for which the register type "t"
+// has already been verified.  MC is the class associated with "t" and
+// Map maps 0-based register numbers to LLVM register numbers.
+static std::pair<unsigned, const TargetRegisterClass *>
+parseRegisterNumber(const std::string &Constraint,
+                    const TargetRegisterClass *RC, const unsigned *Map) {
+  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
+  if (isdigit(Constraint[2])) {
+    std::string Suffix(Constraint.data() + 2, Constraint.size() - 2);
+    unsigned Index = atoi(Suffix.c_str());
+    if (Index < 16 && Map[Index])
+      return std::make_pair(Map[Index], RC);
+  }
+  return std::make_pair(0U, nullptr);
+}
+
+std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'd': // Data register (equivalent to 'r')
+    case 'r': // General-purpose register
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &SystemZ::GR64BitRegClass);
+      else if (VT == MVT::i128)
+        return std::make_pair(0U, &SystemZ::GR128BitRegClass);
+      return std::make_pair(0U, &SystemZ::GR32BitRegClass);
+
+    case 'a': // Address register
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
+      else if (VT == MVT::i128)
+        return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
+      return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
+
+    case 'h': // High-part register (an LLVM extension)
+      return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
+
+    case 'f': // Floating-point register
+      if (VT == MVT::f64)
+        return std::make_pair(0U, &SystemZ::FP64BitRegClass);
+      else if (VT == MVT::f128)
+        return std::make_pair(0U, &SystemZ::FP128BitRegClass);
+      return std::make_pair(0U, &SystemZ::FP32BitRegClass);
+    }
+  }
+  if (Constraint[0] == '{') {
+    // We need to override the default register parsing for GPRs and FPRs
+    // because the interpretation depends on VT.  The internal names of
+    // the registers are also different from the external names
+    // (F0D and F0S instead of F0, etc.).
+    if (Constraint[1] == 'r') {
+      if (VT == MVT::i32)
+        return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
+                                   SystemZMC::GR32Regs);
+      if (VT == MVT::i128)
+        return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
+                                   SystemZMC::GR128Regs);
+      return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
+                                 SystemZMC::GR64Regs);
+    }
+    if (Constraint[1] == 'f') {
+      if (VT == MVT::f32)
+        return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
+                                   SystemZMC::FP32Regs);
+      if (VT == MVT::f128)
+        return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
+                                   SystemZMC::FP128Regs);
+      return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
+                                 SystemZMC::FP64Regs);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+void SystemZTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                             std::vector<SDValue> &Ops,
+                             SelectionDAG &DAG) const {
+  // Only support length 1 constraints for now.
+  if (Constraint.length() == 1) {
+    switch (Constraint[0]) {
+    case 'I': // Unsigned 8-bit constant
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isUInt<8>(C->getZExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
+                                              Op.getValueType()));
+      return;
+
+    case 'J': // Unsigned 12-bit constant
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isUInt<12>(C->getZExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
+                                              Op.getValueType()));
+      return;
+
+    case 'K': // Signed 16-bit constant
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isInt<16>(C->getSExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
+                                              Op.getValueType()));
+      return;
+
+    case 'L': // Signed 20-bit displacement (on all targets we support)
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isInt<20>(C->getSExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
+                                              Op.getValueType()));
+      return;
+
+    case 'M': // 0x7fffffff
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (C->getZExtValue() == 0x7fffffff)
+          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
+                                              Op.getValueType()));
+      return;
+    }
+  }
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling conventions
+//===----------------------------------------------------------------------===//
+
+#include "SystemZGenCallingConv.inc"
+
+bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
+                                                     Type *ToType) const {
+  return isTruncateFree(FromType, ToType);
+}
+
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+  return true;
+}
+
+// Value is a value that has been passed to us in the location described by VA
+// (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
+// any loads onto Chain.
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
+                                   CCValAssign &VA, SDValue Chain,
+                                   SDValue Value) {
+  // If the argument has been promoted from a smaller type, insert an
+  // assertion to capture this.
+  if (VA.getLocInfo() == CCValAssign::SExt)
+    Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
+                        DAG.getValueType(VA.getValVT()));
+  else if (VA.getLocInfo() == CCValAssign::ZExt)
+    Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
+                        DAG.getValueType(VA.getValVT()));
+
+  if (VA.isExtInLoc())
+    Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
+  else if (VA.getLocInfo() == CCValAssign::Indirect)
+    Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
+                        MachinePointerInfo(), false, false, false, 0);
+  else
+    assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
+  return Value;
+}
+
+// Value is a value of type VA.getValVT() that we need to copy into
+// the location described by VA.  Return a copy of Value converted to
+// VA.getValVT().  The caller is responsible for handling indirect values.
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
+                                   CCValAssign &VA, SDValue Value) {
+  switch (VA.getLocInfo()) {
+  case CCValAssign::SExt:
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::ZExt:
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::AExt:
+    return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::Full:
+    return Value;
+  default:
+    llvm_unreachable("Unhandled getLocInfo()");
+  }
+}
+
+SDValue SystemZTargetLowering::
+LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     SDLoc DL, SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  SystemZMachineFunctionInfo *FuncInfo =
+    MF.getInfo<SystemZMachineFunctionInfo>();
+  auto *TFL = static_cast<const SystemZFrameLowering *>(
+      DAG.getTarget().getFrameLowering());
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
+
+  unsigned NumFixedGPRs = 0;
+  unsigned NumFixedFPRs = 0;
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    SDValue ArgValue;
+    CCValAssign &VA = ArgLocs[I];
+    EVT LocVT = VA.getLocVT();
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      const TargetRegisterClass *RC;
+      switch (LocVT.getSimpleVT().SimpleTy) {
+      default:
+        // Integers smaller than i64 should be promoted to i64.
+        llvm_unreachable("Unexpected argument type");
+      case MVT::i32:
+        NumFixedGPRs += 1;
+        RC = &SystemZ::GR32BitRegClass;
+        break;
+      case MVT::i64:
+        NumFixedGPRs += 1;
+        RC = &SystemZ::GR64BitRegClass;
+        break;
+      case MVT::f32:
+        NumFixedFPRs += 1;
+        RC = &SystemZ::FP32BitRegClass;
+        break;
+      case MVT::f64:
+        NumFixedFPRs += 1;
+        RC = &SystemZ::FP64BitRegClass;
+        break;
+      }
+
+      unsigned VReg = MRI.createVirtualRegister(RC);
+      MRI.addLiveIn(VA.getLocReg(), VReg);
+      ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+    } else {
+      assert(VA.isMemLoc() && "Argument not register or memory");
+
+      // Create the frame index object for this incoming parameter.
+      int FI = MFI->CreateFixedObject(LocVT.getSizeInBits() / 8,
+                                      VA.getLocMemOffset(), true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter.  Unpromoted ints and floats are
+      // passed as right-justified 8-byte values.
+      EVT PtrVT = getPointerTy();
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4));
+      ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
+                             MachinePointerInfo::getFixedStack(FI),
+                             false, false, false, 0);
+    }
+
+    // Convert the value of the argument register into the value that's
+    // being passed.
+    InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
+  }
+
+  if (IsVarArg) {
+    // Save the number of non-varargs registers for later use by va_start, etc.
+    FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
+    FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
+
+    // Likewise the address (in the form of a frame index) of where the
+    // first stack vararg would be.  The 1-byte size here is arbitrary.
+    int64_t StackSize = CCInfo.getNextStackOffset();
+    FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
+
+    // ...and a similar frame index for the caller-allocated save area
+    // that will be used to store the incoming registers.
+    int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
+    unsigned RegSaveIndex = MFI->CreateFixedObject(1, RegSaveOffset, true);
+    FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
+
+    // Store the FPR varargs in the reserved frame slots.  (We store the
+    // GPRs as part of the prologue.)
+    if (NumFixedFPRs < SystemZ::NumArgFPRs) {
+      SDValue MemOps[SystemZ::NumArgFPRs];
+      for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
+        unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
+        int FI = MFI->CreateFixedObject(8, RegSaveOffset + Offset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+        unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
+                                     &SystemZ::FP64BitRegClass);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
+        MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
+                                 MachinePointerInfo::getFixedStack(FI),
+                                 false, false, 0);
+
+      }
+      // Join the stores, which are independent of one another.
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                          makeArrayRef(&MemOps[NumFixedFPRs],
+                                       SystemZ::NumArgFPRs-NumFixedFPRs));
+    }
+  }
+
+  return Chain;
+}
+
+static bool canUseSiblingCall(CCState ArgCCInfo,
+                              SmallVectorImpl<CCValAssign> &ArgLocs) {
+  // Punt if there are any indirect or stack arguments, or if the call
+  // needs the call-saved argument register R6.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      return false;
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = VA.getLocReg();
+    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
+      return false;
+  }
+  return true;
+}
+
+SDValue
+SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT PtrVT = getPointerTy();
+
+  // Analyze the operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState ArgCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs,
+                    *DAG.getContext());
+  ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+
+  // We don't support GuaranteedTailCallOpt, only automatically-detected
+  // sibling calls.
+  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs))
+    IsTailCall = false;
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+  // Mark the start of the call.
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
+                                 DL);
+
+  // Copy argument values to their designated locations.
+  SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    SDValue ArgValue = OutVals[I];
+
+    if (VA.getLocInfo() == CCValAssign::Indirect) {
+      // Store the argument in a stack slot and pass its address.
+      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot,
+                                         MachinePointerInfo::getFixedStack(FI),
+                                         false, false, 0));
+      ArgValue = SpillSlot;
+    } else
+      ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
+
+    if (VA.isRegLoc())
+      // Queue up the argument copies and emit them at the end.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+    else {
+      assert(VA.isMemLoc() && "Argument not register or memory");
+
+      // Work out the address of the stack slot.  Unpromoted ints and
+      // floats are passed as right-justified 8-byte values.
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
+      unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
+      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+        Offset += 4;
+      SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
+                                    DAG.getIntPtrConstant(Offset));
+
+      // Emit the store.
+      MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, Address,
+                                         MachinePointerInfo(),
+                                         false, false, 0));
+    }
+  }
+
+  // Join the stores, which are independent of one another.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // Accept direct calls by converting symbolic call addresses to the
+  // associated Target* opcodes.  Force %r1 to be used for indirect
+  // tail calls.
+  SDValue Glue;
+  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
+    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+  } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
+    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+  } else if (IsTailCall) {
+    Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
+    Glue = Chain.getValue(1);
+    Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
+  }
+
+  // Build a sequence of copy-to-reg nodes, chained and glued together.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+                             RegsToPass[I].second, Glue);
+    Glue = Chain.getValue(1);
+  }
+
+  // The first call operand is the chain and the second is the target address.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+    Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+                                  RegsToPass[I].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Glue the call to the argument copies, if any.
+  if (Glue.getNode())
+    Ops.push_back(Glue);
+
+  // Emit the call.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (IsTailCall)
+    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
+  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  // Mark the end of the call, which is glued to the call itself.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, PtrVT, true),
+                             DAG.getConstant(0, PtrVT, true),
+                             Glue, DL);
+  Glue = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs,
+                    *DAG.getContext());
+  RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+    CCValAssign &VA = RetLocs[I];
+
+    // Copy the value out, gluing the copy to the end of the call sequence.
+    SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
+                                          VA.getLocVT(), Glue);
+    Chain = RetValue.getValue(1);
+    Glue = RetValue.getValue(2);
+
+    // Convert the value of the return register into the value that's
+    // being returned.
+    InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
+  }
+
+  return Chain;
+}
+
+SDValue
+SystemZTargetLowering::LowerReturn(SDValue Chain,
+                                   CallingConv::ID CallConv, bool IsVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   SDLoc DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Assign locations to each returned value.
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs,
+                    *DAG.getContext());
+  RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
+
+  // Quick exit for void returns
+  if (RetLocs.empty())
+    return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);
+
+  // Copy the result values into the output registers.
+  SDValue Glue;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain);
+  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+    CCValAssign &VA = RetLocs[I];
+    SDValue RetValue = OutVals[I];
+
+    // Make the return register live on exit.
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // Promote the value as required.
+    RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
+
+    // Chain and glue the copies together.
+    unsigned Reg = VA.getLocReg();
+    Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue SystemZTargetLowering::
+prepareVolatileOrAtomicLoad(SDValue Chain, SDLoc DL, SelectionDAG &DAG) const {
+  return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
+}
+
+// CC is a comparison that will be implemented using an integer or
+// floating-point comparison.  Return the condition code mask for
+// a branch on true.  In the integer case, CCMASK_CMP_UO is set for
+// unsigned comparisons and clear for signed ones.  In the floating-point
+// case, CCMASK_CMP_UO has its normal mask meaning (unordered).
+static unsigned CCMaskForCondCode(ISD::CondCode CC) {
+#define CONV(X) \
+  case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
+  case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
+  case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
+
+  switch (CC) {
+  default:
+    llvm_unreachable("Invalid integer condition!");
+
+  CONV(EQ);
+  CONV(NE);
+  CONV(GT);
+  CONV(GE);
+  CONV(LT);
+  CONV(LE);
+
+  case ISD::SETO:  return SystemZ::CCMASK_CMP_O;
+  case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
+  }
+#undef CONV
+}
+
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+  // Deal with cases where the result can be taken directly from a bit
+  // of the IPM result.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+  // Deal with cases where we can add a value to force the sign bit
+  // to contain the right value.  Putting the bit in 31 means we can
+  // use SRL rather than RISBG(L), and also makes it easier to get a
+  // 0/-1 value, so it has priority over the other tests below.
+  //
+  // These sequences rely on the fact that the upper two bits of the
+  // IPM result are zero.
+  uint64_t TopBit = uint64_t(1) << 31;
+  if (CCMask == (CCValid & SystemZ::CCMASK_0))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+    return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2)))
+    return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_3))
+    return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  // Next try inverting the value and testing a bit.  0/1 could be
+  // handled this way too, but we dealt with that case above.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+    return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+  // Handle cases where adding a value forces a non-sign bit to contain
+  // the right value.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+    return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+  // The remaining cases are 1, 2, 0/1/3 and 0/2/3.  All these are
+  // can be done by inverting the low CC bit and applying one of the
+  // sign-based extractions above.
+  if (CCMask == (CCValid & SystemZ::CCMASK_1))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_2))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  llvm_unreachable("Unexpected CC combination");
+}
+
+// If C can be converted to a comparison against zero, adjust the operands
+// as necessary.
+static void adjustZeroCmp(SelectionDAG &DAG, Comparison &C) {
+  if (C.ICmpType == SystemZICMP::UnsignedOnly)
+    return;
+
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
+  if (!ConstOp1)
+    return;
+
+  int64_t Value = ConstOp1->getSExtValue();
+  if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
+      (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
+      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
+      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
+    C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+    C.Op1 = DAG.getConstant(0, C.Op1.getValueType());
+  }
+}
+
+// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
+// adjust the operands as necessary.
+static void adjustSubwordCmp(SelectionDAG &DAG, Comparison &C) {
+  // For us to make any changes, it must a comparison between a single-use
+  // load and a constant.
+  if (!C.Op0.hasOneUse() ||
+      C.Op0.getOpcode() != ISD::LOAD ||
+      C.Op1.getOpcode() != ISD::Constant)
+    return;
+
+  // We must have an 8- or 16-bit load.
+  auto *Load = cast<LoadSDNode>(C.Op0);
+  unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
+  if (NumBits != 8 && NumBits != 16)
+    return;
+
+  // The load must be an extending one and the constant must be within the
+  // range of the unextended value.
+  auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
+  uint64_t Value = ConstOp1->getZExtValue();
+  uint64_t Mask = (1 << NumBits) - 1;
+  if (Load->getExtensionType() == ISD::SEXTLOAD) {
+    // Make sure that ConstOp1 is in range of C.Op0.
+    int64_t SignedValue = ConstOp1->getSExtValue();
+    if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
+      return;
+    if (C.ICmpType != SystemZICMP::SignedOnly) {
+      // Unsigned comparison between two sign-extended values is equivalent
+      // to unsigned comparison between two zero-extended values.
+      Value &= Mask;
+    } else if (NumBits == 8) {
+      // Try to treat the comparison as unsigned, so that we can use CLI.
+      // Adjust CCMask and Value as necessary.
+      if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
+        // Test whether the high bit of the byte is set.
+        Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
+      else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
+        // Test whether the high bit of the byte is clear.
+        Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
+      else
+        // No instruction exists for this combination.
+        return;
+      C.ICmpType = SystemZICMP::UnsignedOnly;
+    }
+  } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
+    if (Value > Mask)
+      return;
+    assert(C.ICmpType == SystemZICMP::Any &&
+           "Signedness shouldn't matter here.");
+  } else
+    return;
+
+  // Make sure that the first operand is an i32 of the right extension type.
+  ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
+                              ISD::SEXTLOAD :
+                              ISD::ZEXTLOAD);
+  if (C.Op0.getValueType() != MVT::i32 ||
+      Load->getExtensionType() != ExtType)
+    C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
+                           Load->getChain(), Load->getBasePtr(),
+                           Load->getPointerInfo(), Load->getMemoryVT(),
+                           Load->isVolatile(), Load->isNonTemporal(),
+                           Load->getAlignment());
+
+  // Make sure that the second operand is an i32 with the right value.
+  if (C.Op1.getValueType() != MVT::i32 ||
+      Value != ConstOp1->getZExtValue())
+    C.Op1 = DAG.getConstant(Value, MVT::i32);
+}
+
+// Return true if Op is either an unextended load, or a load suitable
+// for integer register-memory comparisons of type ICmpType.
+static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
+  auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
+  if (Load) {
+    // There are no instructions to compare a register with a memory byte.
+    if (Load->getMemoryVT() == MVT::i8)
+      return false;
+    // Otherwise decide on extension type.
+    switch (Load->getExtensionType()) {
+    case ISD::NON_EXTLOAD:
+      return true;
+    case ISD::SEXTLOAD:
+      return ICmpType != SystemZICMP::UnsignedOnly;
+    case ISD::ZEXTLOAD:
+      return ICmpType != SystemZICMP::SignedOnly;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// Return true if it is better to swap the operands of C.
+static bool shouldSwapCmpOperands(const Comparison &C) {
+  // Leave f128 comparisons alone, since they have no memory forms.
+  if (C.Op0.getValueType() == MVT::f128)
+    return false;
+
+  // Always keep a floating-point constant second, since comparisons with
+  // zero can use LOAD TEST and comparisons with other constants make a
+  // natural memory operand.
+  if (isa<ConstantFPSDNode>(C.Op1))
+    return false;
+
+  // Never swap comparisons with zero since there are many ways to optimize
+  // those later.
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+  if (ConstOp1 && ConstOp1->getZExtValue() == 0)
+    return false;
+
+  // Also keep natural memory operands second if the loaded value is
+  // only used here.  Several comparisons have memory forms.
+  if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
+    return false;
+
+  // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
+  // In that case we generally prefer the memory to be second.
+  if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
+    // The only exceptions are when the second operand is a constant and
+    // we can use things like CHHSI.
+    if (!ConstOp1)
+      return true;
+    // The unsigned memory-immediate instructions can handle 16-bit
+    // unsigned integers.
+    if (C.ICmpType != SystemZICMP::SignedOnly &&
+        isUInt<16>(ConstOp1->getZExtValue()))
+      return false;
+    // The signed memory-immediate instructions can handle 16-bit
+    // signed integers.
+    if (C.ICmpType != SystemZICMP::UnsignedOnly &&
+        isInt<16>(ConstOp1->getSExtValue()))
+      return false;
+    return true;
+  }
+
+  // Try to promote the use of CGFR and CLGFR.
+  unsigned Opcode0 = C.Op0.getOpcode();
+  if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
+    return true;
+  if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
+    return true;
+  if (C.ICmpType != SystemZICMP::SignedOnly &&
+      Opcode0 == ISD::AND &&
+      C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
+    return true;
+
+  return false;
+}
+
+// Return a version of comparison CC mask CCMask in which the LT and GT
+// actions are swapped.
+static unsigned reverseCCMask(unsigned CCMask) {
+  return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+          (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+          (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+          (CCMask & SystemZ::CCMASK_CMP_UO));
+}
+
+// Check whether C tests for equality between X and Y and whether X - Y
+// or Y - X is also computed.  In that case it's better to compare the
+// result of the subtraction against zero.
+static void adjustForSubtraction(SelectionDAG &DAG, Comparison &C) {
+  if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+      C.CCMask == SystemZ::CCMASK_CMP_NE) {
+    for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
+      SDNode *N = *I;
+      if (N->getOpcode() == ISD::SUB &&
+          ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
+           (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
+        C.Op0 = SDValue(N, 0);
+        C.Op1 = DAG.getConstant(0, N->getValueType(0));
+        return;
+      }
+    }
+  }
+}
+
+// Check whether C compares a floating-point value with zero and if that
+// floating-point value is also negated.  In this case we can use the
+// negation to set CC, so avoiding separate LOAD AND TEST and
+// LOAD (NEGATIVE/COMPLEMENT) instructions.
+static void adjustForFNeg(Comparison &C) {
+  auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
+  if (C1 && C1->isZero()) {
+    for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
+      SDNode *N = *I;
+      if (N->getOpcode() == ISD::FNEG) {
+        C.Op0 = SDValue(N, 0);
+        C.CCMask = reverseCCMask(C.CCMask);
+        return;
+      }
+    }
+  }
+}
+
+// Check whether C compares (shl X, 32) with 0 and whether X is
+// also sign-extended.  In that case it is better to test the result
+// of the sign extension using LTGFR.
+//
+// This case is important because InstCombine transforms a comparison
+// with (sext (trunc X)) into a comparison with (shl X, 32).
+static void adjustForLTGFR(Comparison &C) {
+  // Check for a comparison between (shl X, 32) and 0.
+  if (C.Op0.getOpcode() == ISD::SHL &&
+      C.Op0.getValueType() == MVT::i64 &&
+      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+    auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
+    if (C1 && C1->getZExtValue() == 32) {
+      SDValue ShlOp0 = C.Op0.getOperand(0);
+      // See whether X has any SIGN_EXTEND_INREG uses.
+      for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
+        SDNode *N = *I;
+        if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
+            cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
+          C.Op0 = SDValue(N, 0);
+          return;
+        }
+      }
+    }
+  }
+}
+
+// If C compares the truncation of an extending load, try to compare
+// the untruncated value instead.  This exposes more opportunities to
+// reuse CC.
+static void adjustICmpTruncate(SelectionDAG &DAG, Comparison &C) {
+  if (C.Op0.getOpcode() == ISD::TRUNCATE &&
+      C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
+      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+    auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
+    if (L->getMemoryVT().getStoreSizeInBits()
+        <= C.Op0.getValueType().getSizeInBits()) {
+      unsigned Type = L->getExtensionType();
+      if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
+          (Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
+        C.Op0 = C.Op0.getOperand(0);
+        C.Op1 = DAG.getConstant(0, C.Op0.getValueType());
+      }
+    }
+  }
+}
+
+// Return true if shift operation N has an in-range constant shift value.
+// Store it in ShiftVal if so.
+static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
+  auto *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!Shift)
+    return false;
+
+  uint64_t Amount = Shift->getZExtValue();
+  if (Amount >= N.getValueType().getSizeInBits())
+    return false;
+
+  ShiftVal = Amount;
+  return true;
+}
+
+// Check whether an AND with Mask is suitable for a TEST UNDER MASK
+// instruction and whether the CC value is descriptive enough to handle
+// a comparison of type Opcode between the AND result and CmpVal.
+// CCMask says which comparison result is being tested and BitSize is
+// the number of bits in the operands.  If TEST UNDER MASK can be used,
+// return the corresponding CC mask, otherwise return 0.
+static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
+                                     uint64_t Mask, uint64_t CmpVal,
+                                     unsigned ICmpType) {
+  assert(Mask != 0 && "ANDs with zero should have been removed by now");
+
+  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
+  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
+      !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
+    return 0;
+
+  // Work out the masks for the lowest and highest bits.
+  unsigned HighShift = 63 - countLeadingZeros(Mask);
+  uint64_t High = uint64_t(1) << HighShift;
+  uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
+
+  // Signed ordered comparisons are effectively unsigned if the sign
+  // bit is dropped.
+  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
+
+  // Check for equality comparisons with 0, or the equivalent.
+  if (CmpVal == 0) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal <= Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal < Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+
+  // Check for equality comparisons with the mask, or the equivalent.
+  if (CmpVal == Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+
+  // Check for ordered comparisons with the top bit.
+  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_MSB_1;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_MSB_1;
+  }
+
+  // If there are just two bits, we can do equality checks for Low and High
+  // as well.
+  if (Mask == Low + High) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
+  }
+
+  // Looks like we've exhausted our options.
+  return 0;
+}
+
+// See whether C can be implemented as a TEST UNDER MASK instruction.
+// Update the arguments with the TM version if so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, Comparison &C) {
+  // Check that we have a comparison with a constant.
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+  if (!ConstOp1)
+    return;
+  uint64_t CmpVal = ConstOp1->getZExtValue();
+
+  // Check whether the nonconstant input is an AND with a constant mask.
+  Comparison NewC(C);
+  uint64_t MaskVal;
+  ConstantSDNode *Mask = nullptr;
+  if (C.Op0.getOpcode() == ISD::AND) {
+    NewC.Op0 = C.Op0.getOperand(0);
+    NewC.Op1 = C.Op0.getOperand(1);
+    Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
+    if (!Mask)
+      return;
+    MaskVal = Mask->getZExtValue();
+  } else {
+    // There is no instruction to compare with a 64-bit immediate
+    // so use TMHH instead if possible.  We need an unsigned ordered
+    // comparison with an i64 immediate.
+    if (NewC.Op0.getValueType() != MVT::i64 ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
+        NewC.ICmpType == SystemZICMP::SignedOnly)
+      return;
+    // Convert LE and GT comparisons into LT and GE.
+    if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
+      if (CmpVal == uint64_t(-1))
+        return;
+      CmpVal += 1;
+      NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+    }
+    // If the low N bits of Op1 are zero than the low N bits of Op0 can
+    // be masked off without changing the result.
+    MaskVal = -(CmpVal & -CmpVal);
+    NewC.ICmpType = SystemZICMP::UnsignedOnly;
+  }
+
+  // Check whether the combination of mask, comparison value and comparison
+  // type are suitable.
+  unsigned BitSize = NewC.Op0.getValueType().getSizeInBits();
+  unsigned NewCCMask, ShiftVal;
+  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+      NewC.Op0.getOpcode() == ISD::SHL &&
+      isSimpleShift(NewC.Op0, ShiftVal) &&
+      (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+                                        MaskVal >> ShiftVal,
+                                        CmpVal >> ShiftVal,
+                                        SystemZICMP::Any))) {
+    NewC.Op0 = NewC.Op0.getOperand(0);
+    MaskVal >>= ShiftVal;
+  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+             NewC.Op0.getOpcode() == ISD::SRL &&
+             isSimpleShift(NewC.Op0, ShiftVal) &&
+             (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+                                               MaskVal << ShiftVal,
+                                               CmpVal << ShiftVal,
+                                               SystemZICMP::UnsignedOnly))) {
+    NewC.Op0 = NewC.Op0.getOperand(0);
+    MaskVal <<= ShiftVal;
+  } else {
+    NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
+                                     NewC.ICmpType);
+    if (!NewCCMask)
+      return;
+  }
+
+  // Go ahead and make the change.
+  C.Opcode = SystemZISD::TM;
+  C.Op0 = NewC.Op0;
+  if (Mask && Mask->getZExtValue() == MaskVal)
+    C.Op1 = SDValue(Mask, 0);
+  else
+    C.Op1 = DAG.getConstant(MaskVal, C.Op0.getValueType());
+  C.CCValid = SystemZ::CCMASK_TM;
+  C.CCMask = NewCCMask;
+}
+
+// Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
+static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+                         ISD::CondCode Cond) {
+  Comparison C(CmpOp0, CmpOp1);
+  C.CCMask = CCMaskForCondCode(Cond);
+  if (C.Op0.getValueType().isFloatingPoint()) {
+    C.CCValid = SystemZ::CCMASK_FCMP;
+    C.Opcode = SystemZISD::FCMP;
+    adjustForFNeg(C);
+  } else {
+    C.CCValid = SystemZ::CCMASK_ICMP;
+    C.Opcode = SystemZISD::ICMP;
+    // Choose the type of comparison.  Equality and inequality tests can
+    // use either signed or unsigned comparisons.  The choice also doesn't
+    // matter if both sign bits are known to be clear.  In those cases we
+    // want to give the main isel code the freedom to choose whichever
+    // form fits best.
+    if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+        C.CCMask == SystemZ::CCMASK_CMP_NE ||
+        (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
+      C.ICmpType = SystemZICMP::Any;
+    else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
+      C.ICmpType = SystemZICMP::UnsignedOnly;
+    else
+      C.ICmpType = SystemZICMP::SignedOnly;
+    C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
+    adjustZeroCmp(DAG, C);
+    adjustSubwordCmp(DAG, C);
+    adjustForSubtraction(DAG, C);
+    adjustForLTGFR(C);
+    adjustICmpTruncate(DAG, C);
+  }
+
+  if (shouldSwapCmpOperands(C)) {
+    std::swap(C.Op0, C.Op1);
+    C.CCMask = reverseCCMask(C.CCMask);
+  }
+
+  adjustForTestUnderMask(DAG, C);
+  return C;
+}
+
+// Emit the comparison instruction described by C.
+static SDValue emitCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+  if (C.Opcode == SystemZISD::ICMP)
+    return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+                       DAG.getConstant(C.ICmpType, MVT::i32));
+  if (C.Opcode == SystemZISD::TM) {
+    bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
+                         bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
+    return DAG.getNode(SystemZISD::TM, DL, MVT::Glue, C.Op0, C.Op1,
+                       DAG.getConstant(RegisterOnly, MVT::i32));
+  }
+  return DAG.getNode(C.Opcode, DL, MVT::Glue, C.Op0, C.Op1);
+}
+
+// Implement a 32-bit *MUL_LOHI operation by extending both operands to
+// 64 bits.  Extend is the extension type to use.  Store the high part
+// in Hi and the low part in Lo.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
+                            unsigned Extend, SDValue Op0, SDValue Op1,
+                            SDValue &Hi, SDValue &Lo) {
+  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
+  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, MVT::i64));
+  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
+}
+
+// Lower a binary operation that produces two VT results, one in each
+// half of a GR128 pair.  Op0 and Op1 are the VT operands to the operation,
+// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
+// on the extended Op0 and (unextended) Op1.  Store the even register result
+// in Even and the odd register result in Odd.
+static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
+                             unsigned Extend, unsigned Opcode,
+                             SDValue Op0, SDValue Op1,
+                             SDValue &Even, SDValue &Odd) {
+  SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0);
+  SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
+                               SDValue(In128, 0), Op1);
+  bool Is32Bit = is32Bit(VT);
+  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
+  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
+}
+
+// Return an i32 value that is 1 if the CC value produced by Glue is
+// in the mask CCMask and 0 otherwise.  CC is known to have a value
+// in CCValid, so other values can be ignored.
+static SDValue emitSETCC(SelectionDAG &DAG, SDLoc DL, SDValue Glue,
+                         unsigned CCValid, unsigned CCMask) {
+  IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
+  SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+  if (Conversion.XORValue)
+    Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.XORValue, MVT::i32));
+
+  if (Conversion.AddValue)
+    Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.AddValue, MVT::i32));
+
+  // The SHR/AND sequence should get optimized to an RISBG.
+  Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
+                       DAG.getConstant(Conversion.Bit, MVT::i32));
+  if (Conversion.Bit != 31)
+    Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+                         DAG.getConstant(1, MVT::i32));
+  return Result;
+}
+
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDValue CmpOp0   = Op.getOperand(0);
+  SDValue CmpOp1   = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+
+  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC));
+  SDValue Glue = emitCmp(DAG, DL, C);
+  return emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+}
+
+SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain    = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue CmpOp0   = Op.getOperand(2);
+  SDValue CmpOp1   = Op.getOperand(3);
+  SDValue Dest     = Op.getOperand(4);
+  SDLoc DL(Op);
+
+  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC));
+  SDValue Glue = emitCmp(DAG, DL, C);
+  return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
+                     Chain, DAG.getConstant(C.CCValid, MVT::i32),
+                     DAG.getConstant(C.CCMask, MVT::i32), Dest, Glue);
+}
+
+// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
+// allowing Pos and Neg to be wider than CmpOp.
+static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
+  return (Neg.getOpcode() == ISD::SUB &&
+          Neg.getOperand(0).getOpcode() == ISD::Constant &&
+          cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
+          Neg.getOperand(1) == Pos &&
+          (Pos == CmpOp ||
+           (Pos.getOpcode() == ISD::SIGN_EXTEND &&
+            Pos.getOperand(0) == CmpOp)));
+}
+
+// Return the absolute or negative absolute of Op; IsNegative decides which.
+static SDValue getAbsolute(SelectionDAG &DAG, SDLoc DL, SDValue Op,
+                           bool IsNegative) {
+  Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
+  if (IsNegative)
+    Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
+                     DAG.getConstant(0, Op.getValueType()), Op);
+  return Op;
+}
+
+SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue CmpOp0   = Op.getOperand(0);
+  SDValue CmpOp1   = Op.getOperand(1);
+  SDValue TrueOp   = Op.getOperand(2);
+  SDValue FalseOp  = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc DL(Op);
+
+  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC));
+
+  // Check for absolute and negative-absolute selections, including those
+  // where the comparison value is sign-extended (for LPGFR and LNGFR).
+  // This check supplements the one in DAGCombiner.
+  if (C.Opcode == SystemZISD::ICMP &&
+      C.CCMask != SystemZ::CCMASK_CMP_EQ &&
+      C.CCMask != SystemZ::CCMASK_CMP_NE &&
+      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+    if (isAbsolute(C.Op0, TrueOp, FalseOp))
+      return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
+    if (isAbsolute(C.Op0, FalseOp, TrueOp))
+      return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
+  }
+
+  SDValue Glue = emitCmp(DAG, DL, C);
+
+  // Special case for handling -1/0 results.  The shifts we use here
+  // should get optimized with the IPM conversion sequence.
+  auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
+  auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
+  if (TrueC && FalseC) {
+    int64_t TrueVal = TrueC->getSExtValue();
+    int64_t FalseVal = FalseC->getSExtValue();
+    if ((TrueVal == -1 && FalseVal == 0) || (TrueVal == 0 && FalseVal == -1)) {
+      // Invert the condition if we want -1 on false.
+      if (TrueVal == 0)
+        C.CCMask ^= C.CCValid;
+      SDValue Result = emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+      EVT VT = Op.getValueType();
+      // Extend the result to VT.  Upper bits are ignored.
+      if (!is32Bit(VT))
+        Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
+      // Sign-extend from the low bit.
+      SDValue ShAmt = DAG.getConstant(VT.getSizeInBits() - 1, MVT::i32);
+      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Result, ShAmt);
+      return DAG.getNode(ISD::SRA, DL, VT, Shl, ShAmt);
+    }
+  }
+
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(TrueOp);
+  Ops.push_back(FalseOp);
+  Ops.push_back(DAG.getConstant(C.CCValid, MVT::i32));
+  Ops.push_back(DAG.getConstant(C.CCMask, MVT::i32));
+  Ops.push_back(Glue);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+}
+
+SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  const GlobalValue *GV = Node->getGlobal();
+  int64_t Offset = Node->getOffset();
+  EVT PtrVT = getPointerTy();
+  Reloc::Model RM = DAG.getTarget().getRelocationModel();
+  CodeModel::Model CM = DAG.getTarget().getCodeModel();
+
+  SDValue Result;
+  if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
+    // Assign anchors at 1<<12 byte boundaries.
+    uint64_t Anchor = Offset & ~uint64_t(0xfff);
+    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
+    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+
+    // The offset can be folded into the address if it is aligned to a halfword.
+    Offset -= Anchor;
+    if (Offset != 0 && (Offset & 1) == 0) {
+      SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
+      Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
+      Offset = 0;
+    }
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
+    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  }
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
+                         DAG.getConstant(Offset, PtrVT));
+
+  return Result;
+}
+
+SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+						     SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  const GlobalValue *GV = Node->getGlobal();
+  EVT PtrVT = getPointerTy();
+  TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+
+  if (model != TLSModel::LocalExec)
+    llvm_unreachable("only local-exec TLS mode supported");
+
+  // The high part of the thread pointer is in access register 0.
+  SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
+                             DAG.getConstant(0, MVT::i32));
+  TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi);
+
+  // The low part of the thread pointer is in access register 1.
+  SDValue TPLo = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
+                             DAG.getConstant(1, MVT::i32));
+  TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo);
+
+  // Merge them into a single 64-bit address.
+  SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
+				    DAG.getConstant(32, PtrVT));
+  SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
+
+  // Get the offset of GA from the thread pointer.
+  SystemZConstantPoolValue *CPV =
+    SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+  // Force the offset into the constant pool and load it from there.
+  SDValue CPAddr = DAG.getConstantPool(CPV, PtrVT, 8);
+  SDValue Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+			       CPAddr, MachinePointerInfo::getConstantPool(),
+			       false, false, false, 0);
+
+  // Add the base and offset together.
+  return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
+}
+
+SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  const BlockAddress *BA = Node->getBlockAddress();
+  int64_t Offset = Node->getOffset();
+  EVT PtrVT = getPointerTy();
+
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
+  Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+  return Result;
+}
+
+SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(JT);
+  EVT PtrVT = getPointerTy();
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+
+  // Use LARL to load the address of the table.
+  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(CP);
+  EVT PtrVT = getPointerTy();
+
+  SDValue Result;
+  if (CP->isMachineConstantPoolEntry())
+    Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+				       CP->getAlignment());
+  else
+    Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+				       CP->getAlignment(), CP->getOffset());
+
+  // Use LARL to load the address of the constant pool entry.
+  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue In = Op.getOperand(0);
+  EVT InVT = In.getValueType();
+  EVT ResVT = Op.getValueType();
+
+  if (InVT == MVT::i32 && ResVT == MVT::f32) {
+    SDValue In64;
+    if (Subtarget.hasHighWord()) {
+      SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+                                       MVT::i64);
+      In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                       MVT::i64, SDValue(U64, 0), In);
+    } else {
+      In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+      In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
+                         DAG.getConstant(32, MVT::i64));
+    }
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
+    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
+                                      DL, MVT::f32, Out64);
+  }
+  if (InVT == MVT::f32 && ResVT == MVT::i32) {
+    SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
+    SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                             MVT::f64, SDValue(U64, 0), In);
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
+    if (Subtarget.hasHighWord())
+      return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
+                                        MVT::i32, Out64);
+    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
+                                DAG.getConstant(32, MVT::i64));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
+  }
+  llvm_unreachable("Unexpected bitcast combination");
+}
+
+SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SystemZMachineFunctionInfo *FuncInfo =
+    MF.getInfo<SystemZMachineFunctionInfo>();
+  EVT PtrVT = getPointerTy();
+
+  SDValue Chain   = Op.getOperand(0);
+  SDValue Addr    = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SDLoc DL(Op);
+
+  // The initial values of each field.
+  const unsigned NumFields = 4;
+  SDValue Fields[NumFields] = {
+    DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), PtrVT),
+    DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), PtrVT),
+    DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
+    DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
+  };
+
+  // Store each field into its respective slot.
+  SDValue MemOps[NumFields];
+  unsigned Offset = 0;
+  for (unsigned I = 0; I < NumFields; ++I) {
+    SDValue FieldAddr = Addr;
+    if (Offset != 0)
+      FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
+                              DAG.getIntPtrConstant(Offset));
+    MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
+                             MachinePointerInfo(SV, Offset),
+                             false, false, 0);
+    Offset += 8;
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue Chain      = Op.getOperand(0);
+  SDValue DstPtr     = Op.getOperand(1);
+  SDValue SrcPtr     = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  SDLoc DL(Op);
+
+  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32),
+                       /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
+                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+SDValue SystemZTargetLowering::
+lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+
+  // Get a reference to the stack pointer.
+  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
+
+  // Get the new stack pointer value.
+  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, Size);
+
+  // Copy the new stack pointer back.
+  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+
+  // The allocated data lives above the 160 bytes allocated for the standard
+  // frame, plus any outgoing stack arguments.  We don't know how much that
+  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
+  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
+  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
+
+  SDValue Ops[2] = { Result, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else {
+    // Do a full 128-bit multiplication based on UMUL_LOHI64:
+    //
+    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+    //
+    // but using the fact that the upper halves are either all zeros
+    // or all ones:
+    //
+    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+    //
+    // and grouping the right terms together since they are quicker than the
+    // multiplication:
+    //
+    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
+    SDValue C63 = DAG.getConstant(63, MVT::i64);
+    SDValue LL = Op.getOperand(0);
+    SDValue RL = Op.getOperand(1);
+    SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
+    SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  SMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     LL, RL, Ops[1], Ops[0]);
+    SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+    SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+    SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+    Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
+  }
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  UMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Opcode;
+
+  // We use DSGF for 32-bit division.
+  if (is32Bit(VT)) {
+    Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
+    Opcode = SystemZISD::SDIVREM32;
+  } else if (DAG.ComputeNumSignBits(Op1) > 32) {
+    Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+    Opcode = SystemZISD::SDIVREM32;
+  } else    
+    Opcode = SystemZISD::SDIVREM64;
+
+  // DSG(F) takes a 64-bit dividend, so the even register in the GR128
+  // input is "don't care".  The instruction returns the remainder in
+  // the even register and the quotient in the odd register.
+  SDValue Ops[2];
+  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
+                   Op0, Op1, Ops[1], Ops[0]);
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  // DL(G) uses a double-width dividend, so we need to clear the even
+  // register in the GR128 input.  The instruction returns the remainder
+  // in the even register and the quotient in the odd register.
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_32, SystemZISD::UDIVREM32,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
+
+  // Get the known-zero masks for each operand.
+  SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+  APInt KnownZero[2], KnownOne[2];
+  DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]);
+  DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]);
+
+  // See if the upper 32 bits of one operand and the lower 32 bits of the
+  // other are known zero.  They are the low and high operands respectively.
+  uint64_t Masks[] = { KnownZero[0].getZExtValue(),
+                       KnownZero[1].getZExtValue() };
+  unsigned High, Low;
+  if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
+    High = 1, Low = 0;
+  else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
+    High = 0, Low = 1;
+  else
+    return Op;
+
+  SDValue LowOp = Ops[Low];
+  SDValue HighOp = Ops[High];
+
+  // If the high part is a constant, we're better off using IILH.
+  if (HighOp.getOpcode() == ISD::Constant)
+    return Op;
+
+  // If the low part is a constant that is outside the range of LHI,
+  // then we're better off using IILF.
+  if (LowOp.getOpcode() == ISD::Constant) {
+    int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
+    if (!isInt<16>(Value))
+      return Op;
+  }
+
+  // Check whether the high part is an AND that doesn't change the
+  // high 32 bits and just masks out low bits.  We can skip it if so.
+  if (HighOp.getOpcode() == ISD::AND &&
+      HighOp.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue HighOp0 = HighOp.getOperand(0);
+    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+    if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
+      HighOp = HighOp0;
+  }
+
+  // Take advantage of the fact that all GR32 operations only change the
+  // low 32 bits by truncating Low to an i32 and inserting it directly
+  // using a subreg.  The interesting cases are those where the truncation
+  // can be folded.
+  SDLoc DL(Op);
+  SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
+  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
+                                   MVT::i64, HighOp, Low32);
+}
+
+// Op is an atomic load.  Lower it into a normal volatile load.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
+                        Node->getChain(), Node->getBasePtr(),
+                        Node->getMemoryVT(), Node->getMemOperand());
+}
+
+// Op is an atomic store.  Lower it into a normal volatile store followed
+// by a serialization.
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
+                                    Node->getBasePtr(), Node->getMemoryVT(),
+                                    Node->getMemOperand());
+  return SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), MVT::Other,
+                                    Chain), 0);
+}
+
+// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
+// two into the fullword ATOMIC_LOADW_* operation given by Opcode.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
+                                                   SelectionDAG &DAG,
+                                                   unsigned Opcode) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+
+  // 32-bit operations need no code outside the main loop.
+  EVT NarrowVT = Node->getMemoryVT();
+  EVT WideVT = MVT::i32;
+  if (NarrowVT == WideVT)
+    return Op;
+
+  int64_t BitSize = NarrowVT.getSizeInBits();
+  SDValue ChainIn = Node->getChain();
+  SDValue Addr = Node->getBasePtr();
+  SDValue Src2 = Node->getVal();
+  MachineMemOperand *MMO = Node->getMemOperand();
+  SDLoc DL(Node);
+  EVT PtrVT = Addr.getValueType();
+
+  // Convert atomic subtracts of constants into additions.
+  if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
+    if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
+      Opcode = SystemZISD::ATOMIC_LOADW_ADD;
+      Src2 = DAG.getConstant(-Const->getSExtValue(), Src2.getValueType());
+    }
+
+  // Get the address of the containing word.
+  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+                                    DAG.getConstant(-4, PtrVT));
+
+  // Get the number of bits that the word must be rotated left in order
+  // to bring the field to the top bits of a GR32.
+  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+                                 DAG.getConstant(3, PtrVT));
+  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+  // Get the complementing shift amount, for rotating a field in the top
+  // bits back to its proper position.
+  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+                                    DAG.getConstant(0, WideVT), BitShift);
+
+  // Extend the source operand to 32 bits and prepare it for the inner loop.
+  // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
+  // operations require the source to be shifted in advance.  (This shift
+  // can be folded if the source is constant.)  For AND and NAND, the lower
+  // bits must be set, while for other opcodes they should be left clear.
+  if (Opcode != SystemZISD::ATOMIC_SWAPW)
+    Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
+                       DAG.getConstant(32 - BitSize, WideVT));
+  if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
+      Opcode == SystemZISD::ATOMIC_LOADW_NAND)
+    Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
+                       DAG.getConstant(uint32_t(-1) >> BitSize, WideVT));
+
+  // Construct the ATOMIC_LOADW_* node.
+  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+  SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
+                    DAG.getConstant(BitSize, WideVT) };
+  SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
+                                             NarrowVT, MMO);
+
+  // Rotate the result of the final CS so that the field is in the lower
+  // bits of a GR32, then truncate it.
+  SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
+                                    DAG.getConstant(BitSize, WideVT));
+  SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
+
+  SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
+  return DAG.getMergeValues(RetOps, DL);
+}
+
+// Op is an ATOMIC_LOAD_SUB operation.  Lower 8- and 16-bit operations
+// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
+// operations into additions.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  EVT MemVT = Node->getMemoryVT();
+  if (MemVT == MVT::i32 || MemVT == MVT::i64) {
+    // A full-width operation.
+    assert(Op.getValueType() == MemVT && "Mismatched VTs");
+    SDValue Src2 = Node->getVal();
+    SDValue NegSrc2;
+    SDLoc DL(Src2);
+
+    if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
+      // Use an addition if the operand is constant and either LAA(G) is
+      // available or the negative value is in the range of A(G)FHI.
+      int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
+      if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
+        NegSrc2 = DAG.getConstant(Value, MemVT);
+    } else if (Subtarget.hasInterlockedAccess1())
+      // Use LAA(G) if available.
+      NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, MemVT),
+                            Src2);
+
+    if (NegSrc2.getNode())
+      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
+                           Node->getChain(), Node->getBasePtr(), NegSrc2,
+                           Node->getMemOperand(), Node->getOrdering(),
+                           Node->getSynchScope());
+
+    // Use the node as-is.
+    return Op;
+  }
+
+  return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
+}
+
+// Node is an 8- or 16-bit ATOMIC_CMP_SWAP operation.  Lower the first two
+// into a fullword ATOMIC_CMP_SWAPW operation.
+SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+
+  // We have native support for 32-bit compare and swap.
+  EVT NarrowVT = Node->getMemoryVT();
+  EVT WideVT = MVT::i32;
+  if (NarrowVT == WideVT)
+    return Op;
+
+  int64_t BitSize = NarrowVT.getSizeInBits();
+  SDValue ChainIn = Node->getOperand(0);
+  SDValue Addr = Node->getOperand(1);
+  SDValue CmpVal = Node->getOperand(2);
+  SDValue SwapVal = Node->getOperand(3);
+  MachineMemOperand *MMO = Node->getMemOperand();
+  SDLoc DL(Node);
+  EVT PtrVT = Addr.getValueType();
+
+  // Get the address of the containing word.
+  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+                                    DAG.getConstant(-4, PtrVT));
+
+  // Get the number of bits that the word must be rotated left in order
+  // to bring the field to the top bits of a GR32.
+  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+                                 DAG.getConstant(3, PtrVT));
+  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+  // Get the complementing shift amount, for rotating a field in the top
+  // bits back to its proper position.
+  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+                                    DAG.getConstant(0, WideVT), BitShift);
+
+  // Construct the ATOMIC_CMP_SWAPW node.
+  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+  SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
+                    NegBitShift, DAG.getConstant(BitSize, WideVT) };
+  SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
+                                             VTList, Ops, NarrowVT, MMO);
+  return AtomicOp;
+}
+
+SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+  return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
+                            SystemZ::R15D, Op.getValueType());
+}
+
+SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+  return DAG.getCopyToReg(Op.getOperand(0), SDLoc(Op),
+                          SystemZ::R15D, Op.getOperand(1));
+}
+
+SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (!IsData)
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
+  auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
+  SDValue Ops[] = {
+    Op.getOperand(0),
+    DAG.getConstant(Code, MVT::i32),
+    Op.getOperand(1)
+  };
+  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, SDLoc(Op),
+                                 Node->getVTList(), Ops,
+                                 Node->getMemoryVT(), Node->getMemOperand());
+}
+
+SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::BR_CC:
+    return lowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:
+    return lowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:
+    return lowerSETCC(Op, DAG);
+  case ISD::GlobalAddress:
+    return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
+  case ISD::GlobalTLSAddress:
+    return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
+  case ISD::BlockAddress:
+    return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
+  case ISD::JumpTable:
+    return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
+  case ISD::ConstantPool:
+    return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
+  case ISD::BITCAST:
+    return lowerBITCAST(Op, DAG);
+  case ISD::VASTART:
+    return lowerVASTART(Op, DAG);
+  case ISD::VACOPY:
+    return lowerVACOPY(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return lowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::SMUL_LOHI:
+    return lowerSMUL_LOHI(Op, DAG);
+  case ISD::UMUL_LOHI:
+    return lowerUMUL_LOHI(Op, DAG);
+  case ISD::SDIVREM:
+    return lowerSDIVREM(Op, DAG);
+  case ISD::UDIVREM:
+    return lowerUDIVREM(Op, DAG);
+  case ISD::OR:
+    return lowerOR(Op, DAG);
+  case ISD::ATOMIC_SWAP:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+  case ISD::ATOMIC_STORE:
+    return lowerATOMIC_STORE(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+    return lowerATOMIC_LOAD(Op, DAG);
+  case ISD::ATOMIC_LOAD_ADD:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
+  case ISD::ATOMIC_LOAD_SUB:
+    return lowerATOMIC_LOAD_SUB(Op, DAG);
+  case ISD::ATOMIC_LOAD_AND:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
+  case ISD::ATOMIC_LOAD_OR:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
+  case ISD::ATOMIC_LOAD_XOR:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
+  case ISD::ATOMIC_LOAD_NAND:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
+  case ISD::ATOMIC_LOAD_MIN:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
+  case ISD::ATOMIC_LOAD_MAX:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
+  case ISD::ATOMIC_LOAD_UMIN:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
+  case ISD::ATOMIC_LOAD_UMAX:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
+  case ISD::ATOMIC_CMP_SWAP:
+    return lowerATOMIC_CMP_SWAP(Op, DAG);
+  case ISD::STACKSAVE:
+    return lowerSTACKSAVE(Op, DAG);
+  case ISD::STACKRESTORE:
+    return lowerSTACKRESTORE(Op, DAG);
+  case ISD::PREFETCH:
+    return lowerPREFETCH(Op, DAG);
+  default:
+    llvm_unreachable("Unexpected node to lower");
+  }
+}
+
+const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
+  switch (Opcode) {
+    OPCODE(RET_FLAG);
+    OPCODE(CALL);
+    OPCODE(SIBCALL);
+    OPCODE(PCREL_WRAPPER);
+    OPCODE(PCREL_OFFSET);
+    OPCODE(IABS);
+    OPCODE(ICMP);
+    OPCODE(FCMP);
+    OPCODE(TM);
+    OPCODE(BR_CCMASK);
+    OPCODE(SELECT_CCMASK);
+    OPCODE(ADJDYNALLOC);
+    OPCODE(EXTRACT_ACCESS);
+    OPCODE(UMUL_LOHI64);
+    OPCODE(SDIVREM64);
+    OPCODE(UDIVREM32);
+    OPCODE(UDIVREM64);
+    OPCODE(MVC);
+    OPCODE(MVC_LOOP);
+    OPCODE(NC);
+    OPCODE(NC_LOOP);
+    OPCODE(OC);
+    OPCODE(OC_LOOP);
+    OPCODE(XC);
+    OPCODE(XC_LOOP);
+    OPCODE(CLC);
+    OPCODE(CLC_LOOP);
+    OPCODE(STRCMP);
+    OPCODE(STPCPY);
+    OPCODE(SEARCH_STRING);
+    OPCODE(IPM);
+    OPCODE(SERIALIZE);
+    OPCODE(ATOMIC_SWAPW);
+    OPCODE(ATOMIC_LOADW_ADD);
+    OPCODE(ATOMIC_LOADW_SUB);
+    OPCODE(ATOMIC_LOADW_AND);
+    OPCODE(ATOMIC_LOADW_OR);
+    OPCODE(ATOMIC_LOADW_XOR);
+    OPCODE(ATOMIC_LOADW_NAND);
+    OPCODE(ATOMIC_LOADW_MIN);
+    OPCODE(ATOMIC_LOADW_MAX);
+    OPCODE(ATOMIC_LOADW_UMIN);
+    OPCODE(ATOMIC_LOADW_UMAX);
+    OPCODE(ATOMIC_CMP_SWAPW);
+    OPCODE(PREFETCH);
+  }
+  return nullptr;
+#undef OPCODE
+}
+
+SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::SIGN_EXTEND) {
+    // Convert (sext (ashr (shl X, C1), C2)) to
+    // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
+    // cheap as narrower ones.
+    SDValue N0 = N->getOperand(0);
+    EVT VT = N->getValueType(0);
+    if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
+      auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      SDValue Inner = N0.getOperand(0);
+      if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
+        if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
+          unsigned Extra = (VT.getSizeInBits() -
+                            N0.getValueType().getSizeInBits());
+          unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
+          unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
+          EVT ShiftVT = N0.getOperand(1).getValueType();
+          SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
+                                    Inner.getOperand(0));
+          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
+                                    DAG.getConstant(NewShlAmt, ShiftVT));
+          return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
+                             DAG.getConstant(NewSraAmt, ShiftVT));
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Custom insertion
+//===----------------------------------------------------------------------===//
+
+// Create a new basic block after MBB.
+static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+  return NewMBB;
+}
+
+// Split MBB after MI and return the new block (the one that contains
+// instructions after MI).
+static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
+                                          MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Force base value Base into a register before MI.  Return the register.
+static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+                         const SystemZInstrInfo *TII) {
+  if (Base.isReg())
+    return Base.getReg();
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
+    .addOperand(Base).addImm(0).addReg(0);
+  return Reg;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitSelect(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const {
+  const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
+      MBB->getParent()->getTarget().getInstrInfo());
+
+  unsigned DestReg  = MI->getOperand(0).getReg();
+  unsigned TrueReg  = MI->getOperand(1).getReg();
+  unsigned FalseReg = MI->getOperand(2).getReg();
+  unsigned CCValid  = MI->getOperand(3).getImm();
+  unsigned CCMask   = MI->getOperand(4).getImm();
+  DebugLoc DL       = MI->getDebugLoc();
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  MBB->addSuccessor(JoinMBB);
+
+  //  JoinMBB:
+  //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
+  //  ...
+  MBB = JoinMBB;
+  BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
+    .addReg(TrueReg).addMBB(StartMBB)
+    .addReg(FalseReg).addMBB(FalseMBB);
+
+  MI->eraseFromParent();
+  return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
+// StoreOpcode is the store to use and Invert says whether the store should
+// happen when the condition is false rather than true.  If a STORE ON
+// CONDITION is available, STOCOpcode is its opcode, otherwise it is 0.
+MachineBasicBlock *
+SystemZTargetLowering::emitCondStore(MachineInstr *MI,
+                                     MachineBasicBlock *MBB,
+                                     unsigned StoreOpcode, unsigned STOCOpcode,
+                                     bool Invert) const {
+  const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
+      MBB->getParent()->getTarget().getInstrInfo());
+
+  unsigned SrcReg     = MI->getOperand(0).getReg();
+  MachineOperand Base = MI->getOperand(1);
+  int64_t Disp        = MI->getOperand(2).getImm();
+  unsigned IndexReg   = MI->getOperand(3).getReg();
+  unsigned CCValid    = MI->getOperand(4).getImm();
+  unsigned CCMask     = MI->getOperand(5).getImm();
+  DebugLoc DL         = MI->getDebugLoc();
+
+  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+
+  // Use STOCOpcode if possible.  We could use different store patterns in
+  // order to avoid matching the index register, but the performance trade-offs
+  // might be more complicated in that case.
+  if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
+    if (Invert)
+      CCMask ^= CCValid;
+    BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
+      .addReg(SrcReg).addOperand(Base).addImm(Disp)
+      .addImm(CCValid).addImm(CCMask);
+    MI->eraseFromParent();
+    return MBB;
+  }
+
+  // Get the condition needed to branch around the store.
+  if (!Invert)
+    CCMask ^= CCValid;
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   store %SrcReg, %Disp(%Index,%Base)
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  BuildMI(MBB, DL, TII->get(StoreOpcode))
+    .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+  MBB->addSuccessor(JoinMBB);
+
+  MI->eraseFromParent();
+  return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
+// or ATOMIC_SWAP{,W} instruction MI.  BinOpcode is the instruction that
+// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
+// BitSize is the width of the field in bits, or 0 if this is a partword
+// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
+// is one of the operands.  Invert says whether the field should be
+// inverted after performing BinOpcode (e.g. for NAND).
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
+                                            MachineBasicBlock *MBB,
+                                            unsigned BinOpcode,
+                                            unsigned BitSize,
+                                            bool Invert) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool IsSubWord = (BitSize < 32);
+
+  // Extract the operands.  Base can be a register or a frame index.
+  // Src2 can be a register or immediate.
+  unsigned Dest        = MI->getOperand(0).getReg();
+  MachineOperand Base  = earlyUseOperand(MI->getOperand(1));
+  int64_t Disp         = MI->getOperand(2).getImm();
+  MachineOperand Src2  = earlyUseOperand(MI->getOperand(3));
+  unsigned BitShift    = (IsSubWord ? MI->getOperand(4).getReg() : 0);
+  unsigned NegBitShift = (IsSubWord ? MI->getOperand(5).getReg() : 0);
+  DebugLoc DL          = MI->getDebugLoc();
+  if (IsSubWord)
+    BitSize = MI->getOperand(6).getImm();
+
+  // Subword operations use 32-bit registers.
+  const TargetRegisterClass *RC = (BitSize <= 32 ?
+                                   &SystemZ::GR32BitRegClass :
+                                   &SystemZ::GR64BitRegClass);
+  unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
+  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+  // Get the right opcodes for the displacement.
+  LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
+  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+  assert(LOpcode && CSOpcode && "Displacement out of range");
+
+  // Create virtual registers for temporary results.
+  unsigned OrigVal       = MRI.createVirtualRegister(RC);
+  unsigned OldVal        = MRI.createVirtualRegister(RC);
+  unsigned NewVal        = (BinOpcode || IsSubWord ?
+                            MRI.createVirtualRegister(RC) : Src2.getReg());
+  unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+  unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+  // Insert a basic block for the main loop.
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   ...
+  //   %OrigVal = L Disp(%Base)
+  //   # fall through to LoopMMB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+    .addOperand(Base).addImm(Disp).addReg(0);
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
+  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+  //   %RotatedNewVal = OP %RotatedOldVal, %Src2
+  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
+  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
+  //   JNE LoopMBB
+  //   # fall through to DoneMMB
+  MBB = LoopMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+    .addReg(OrigVal).addMBB(StartMBB)
+    .addReg(Dest).addMBB(LoopMBB);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+      .addReg(OldVal).addReg(BitShift).addImm(0);
+  if (Invert) {
+    // Perform the operation normally and then invert every bit of the field.
+    unsigned Tmp = MRI.createVirtualRegister(RC);
+    BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
+      .addReg(RotatedOldVal).addOperand(Src2);
+    if (BitSize < 32)
+      // XILF with the upper BitSize bits set.
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
+        .addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
+    else if (BitSize == 32)
+      // XILF with every bit set.
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
+        .addReg(Tmp).addImm(~uint32_t(0));
+    else {
+      // Use LCGR and add -1 to the result, which is more compact than
+      // an XILF, XILH pair.
+      unsigned Tmp2 = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
+      BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
+        .addReg(Tmp2).addImm(-1);
+    }
+  } else if (BinOpcode)
+    // A simply binary operation.
+    BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
+      .addReg(RotatedOldVal).addOperand(Src2);
+  else if (IsSubWord)
+    // Use RISBG to rotate Src2 into position and use it to replace the
+    // field in RotatedOldVal.
+    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
+      .addReg(RotatedOldVal).addReg(Src2.getReg())
+      .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+    .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo
+// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI.  CompareOpcode is the
+// instruction that should be used to compare the current field with the
+// minimum or maximum value.  KeepOldMask is the BRC condition-code mask
+// for when the current field should be kept.  BitSize is the width of
+// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
+                                            MachineBasicBlock *MBB,
+                                            unsigned CompareOpcode,
+                                            unsigned KeepOldMask,
+                                            unsigned BitSize) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool IsSubWord = (BitSize < 32);
+
+  // Extract the operands.  Base can be a register or a frame index.
+  unsigned Dest        = MI->getOperand(0).getReg();
+  MachineOperand Base  = earlyUseOperand(MI->getOperand(1));
+  int64_t  Disp        = MI->getOperand(2).getImm();
+  unsigned Src2        = MI->getOperand(3).getReg();
+  unsigned BitShift    = (IsSubWord ? MI->getOperand(4).getReg() : 0);
+  unsigned NegBitShift = (IsSubWord ? MI->getOperand(5).getReg() : 0);
+  DebugLoc DL          = MI->getDebugLoc();
+  if (IsSubWord)
+    BitSize = MI->getOperand(6).getImm();
+
+  // Subword operations use 32-bit registers.
+  const TargetRegisterClass *RC = (BitSize <= 32 ?
+                                   &SystemZ::GR32BitRegClass :
+                                   &SystemZ::GR64BitRegClass);
+  unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
+  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+  // Get the right opcodes for the displacement.
+  LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
+  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+  assert(LOpcode && CSOpcode && "Displacement out of range");
+
+  // Create virtual registers for temporary results.
+  unsigned OrigVal       = MRI.createVirtualRegister(RC);
+  unsigned OldVal        = MRI.createVirtualRegister(RC);
+  unsigned NewVal        = MRI.createVirtualRegister(RC);
+  unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+  unsigned RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
+  unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+  // Insert 3 basic blocks for the loop.
+  MachineBasicBlock *StartMBB  = MBB;
+  MachineBasicBlock *DoneMBB   = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB   = emitBlockAfter(StartMBB);
+  MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
+  MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
+
+  //  StartMBB:
+  //   ...
+  //   %OrigVal     = L Disp(%Base)
+  //   # fall through to LoopMMB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+    .addOperand(Base).addImm(Disp).addReg(0);
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
+  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+  //   CompareOpcode %RotatedOldVal, %Src2
+  //   BRC KeepOldMask, UpdateMBB
+  MBB = LoopMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+    .addReg(OrigVal).addMBB(StartMBB)
+    .addReg(Dest).addMBB(UpdateMBB);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+      .addReg(OldVal).addReg(BitShift).addImm(0);
+  BuildMI(MBB, DL, TII->get(CompareOpcode))
+    .addReg(RotatedOldVal).addReg(Src2);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
+  MBB->addSuccessor(UpdateMBB);
+  MBB->addSuccessor(UseAltMBB);
+
+  //  UseAltMBB:
+  //   %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
+  //   # fall through to UpdateMMB
+  MBB = UseAltMBB;
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
+      .addReg(RotatedOldVal).addReg(Src2)
+      .addImm(32).addImm(31 + BitSize).addImm(0);
+  MBB->addSuccessor(UpdateMBB);
+
+  //  UpdateMBB:
+  //   %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
+  //                        [ %RotatedAltVal, UseAltMBB ]
+  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
+  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
+  //   JNE LoopMBB
+  //   # fall through to DoneMMB
+  MBB = UpdateMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
+    .addReg(RotatedOldVal).addMBB(LoopMBB)
+    .addReg(RotatedAltVal).addMBB(UseAltMBB);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+    .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
+// instruction MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
+                                          MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Extract the operands.  Base can be a register or a frame index.
+  unsigned Dest        = MI->getOperand(0).getReg();
+  MachineOperand Base  = earlyUseOperand(MI->getOperand(1));
+  int64_t  Disp        = MI->getOperand(2).getImm();
+  unsigned OrigCmpVal  = MI->getOperand(3).getReg();
+  unsigned OrigSwapVal = MI->getOperand(4).getReg();
+  unsigned BitShift    = MI->getOperand(5).getReg();
+  unsigned NegBitShift = MI->getOperand(6).getReg();
+  int64_t  BitSize     = MI->getOperand(7).getImm();
+  DebugLoc DL          = MI->getDebugLoc();
+
+  const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
+
+  // Get the right opcodes for the displacement.
+  unsigned LOpcode  = TII->getOpcodeForOffset(SystemZ::L,  Disp);
+  unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
+  assert(LOpcode && CSOpcode && "Displacement out of range");
+
+  // Create virtual registers for temporary results.
+  unsigned OrigOldVal   = MRI.createVirtualRegister(RC);
+  unsigned OldVal       = MRI.createVirtualRegister(RC);
+  unsigned CmpVal       = MRI.createVirtualRegister(RC);
+  unsigned SwapVal      = MRI.createVirtualRegister(RC);
+  unsigned StoreVal     = MRI.createVirtualRegister(RC);
+  unsigned RetryOldVal  = MRI.createVirtualRegister(RC);
+  unsigned RetryCmpVal  = MRI.createVirtualRegister(RC);
+  unsigned RetrySwapVal = MRI.createVirtualRegister(RC);
+
+  // Insert 2 basic blocks for the loop.
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
+  MachineBasicBlock *SetMBB   = emitBlockAfter(LoopMBB);
+
+  //  StartMBB:
+  //   ...
+  //   %OrigOldVal     = L Disp(%Base)
+  //   # fall through to LoopMMB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
+    .addOperand(Base).addImm(Disp).addReg(0);
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %OldVal        = phi [ %OrigOldVal, EntryBB ], [ %RetryOldVal, SetMBB ]
+  //   %CmpVal        = phi [ %OrigCmpVal, EntryBB ], [ %RetryCmpVal, SetMBB ]
+  //   %SwapVal       = phi [ %OrigSwapVal, EntryBB ], [ %RetrySwapVal, SetMBB ]
+  //   %Dest          = RLL %OldVal, BitSize(%BitShift)
+  //                      ^^ The low BitSize bits contain the field
+  //                         of interest.
+  //   %RetryCmpVal   = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
+  //                      ^^ Replace the upper 32-BitSize bits of the
+  //                         comparison value with those that we loaded,
+  //                         so that we can use a full word comparison.
+  //   CR %Dest, %RetryCmpVal
+  //   JNE DoneMBB
+  //   # Fall through to SetMBB
+  MBB = LoopMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+    .addReg(OrigOldVal).addMBB(StartMBB)
+    .addReg(RetryOldVal).addMBB(SetMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
+    .addReg(OrigCmpVal).addMBB(StartMBB)
+    .addReg(RetryCmpVal).addMBB(SetMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
+    .addReg(OrigSwapVal).addMBB(StartMBB)
+    .addReg(RetrySwapVal).addMBB(SetMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
+    .addReg(OldVal).addReg(BitShift).addImm(BitSize);
+  BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
+    .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+  BuildMI(MBB, DL, TII->get(SystemZ::CR))
+    .addReg(Dest).addReg(RetryCmpVal);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
+  MBB->addSuccessor(DoneMBB);
+  MBB->addSuccessor(SetMBB);
+
+  //  SetMBB:
+  //   %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
+  //                      ^^ Replace the upper 32-BitSize bits of the new
+  //                         value with those that we loaded.
+  //   %StoreVal    = RLL %RetrySwapVal, -BitSize(%NegBitShift)
+  //                      ^^ Rotate the new field to its proper position.
+  //   %RetryOldVal = CS %Dest, %StoreVal, Disp(%Base)
+  //   JNE LoopMBB
+  //   # fall through to ExitMMB
+  MBB = SetMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
+    .addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+  BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
+    .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
+  BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
+    .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
+// Emit an extension from a GR32 or GR64 to a GR128.  ClearEven is true
+// if the high register of the GR128 value must be cleared or false if
+// it's "don't care".  SubReg is subreg_l32 when extending a GR32
+// and subreg_l64 when extending a GR64.
+MachineBasicBlock *
+SystemZTargetLowering::emitExt128(MachineInstr *MI,
+                                  MachineBasicBlock *MBB,
+                                  bool ClearEven, unsigned SubReg) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned Dest  = MI->getOperand(0).getReg();
+  unsigned Src   = MI->getOperand(1).getReg();
+  unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
+  if (ClearEven) {
+    unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+    unsigned Zero64   = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+    BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
+      .addImm(0);
+    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
+      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
+    In128 = NewIn128;
+  }
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
+    .addReg(In128).addReg(Src).addImm(SubReg);
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
+  uint64_t       DestDisp = MI->getOperand(1).getImm();
+  MachineOperand SrcBase  = earlyUseOperand(MI->getOperand(2));
+  uint64_t       SrcDisp  = MI->getOperand(3).getImm();
+  uint64_t       Length   = MI->getOperand(4).getImm();
+
+  // When generating more than one CLC, all but the last will need to
+  // branch to the end when a difference is found.
+  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+                               splitBlockAfter(MI, MBB) : nullptr);
+
+  // Check for the loop form, in which operand 5 is the trip count.
+  if (MI->getNumExplicitOperands() > 5) {
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+    uint64_t StartCountReg = MI->getOperand(5).getReg();
+    uint64_t StartSrcReg   = forceReg(MI, SrcBase, TII);
+    uint64_t StartDestReg  = (HaveSingleBase ? StartSrcReg :
+                              forceReg(MI, DestBase, TII));
+
+    const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+    uint64_t ThisSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+                            MRI.createVirtualRegister(RC));
+    uint64_t NextSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+                            MRI.createVirtualRegister(RC));
+
+    RC = &SystemZ::GR64BitRegClass;
+    uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+    uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+    MachineBasicBlock *StartMBB = MBB;
+    MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+    MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+    MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
+
+    //  StartMBB:
+    //   # fall through to LoopMMB
+    MBB->addSuccessor(LoopMBB);
+
+    //  LoopMBB:
+    //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+    //                      [ %NextDestReg, NextMBB ]
+    //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+    //                     [ %NextSrcReg, NextMBB ]
+    //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+    //                       [ %NextCountReg, NextMBB ]
+    //   ( PFD 2, 768+DestDisp(%ThisDestReg) )
+    //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+    //   ( JLH EndMBB )
+    //
+    // The prefetch is used only for MVC.  The JLH is used only for CLC.
+    MBB = LoopMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+      .addReg(StartDestReg).addMBB(StartMBB)
+      .addReg(NextDestReg).addMBB(NextMBB);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+        .addReg(StartSrcReg).addMBB(StartMBB)
+        .addReg(NextSrcReg).addMBB(NextMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+      .addReg(StartCountReg).addMBB(StartMBB)
+      .addReg(NextCountReg).addMBB(NextMBB);
+    if (Opcode == SystemZ::MVC)
+      BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+        .addImm(SystemZ::PFD_WRITE)
+        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+    BuildMI(MBB, DL, TII->get(Opcode))
+      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+      .addReg(ThisSrcReg).addImm(SrcDisp);
+    if (EndMBB) {
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+    }
+
+    // NextMBB:
+    //   %NextDestReg = LA 256(%ThisDestReg)
+    //   %NextSrcReg = LA 256(%ThisSrcReg)
+    //   %NextCountReg = AGHI %ThisCountReg, -1
+    //   CGHI %NextCountReg, 0
+    //   JLH LoopMBB
+    //   # fall through to DoneMMB
+    //
+    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+    MBB = NextMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+      .addReg(ThisDestReg).addImm(256).addReg(0);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+        .addReg(ThisSrcReg).addImm(256).addReg(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+      .addReg(ThisCountReg).addImm(-1);
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(NextCountReg).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+      .addMBB(LoopMBB);
+    MBB->addSuccessor(LoopMBB);
+    MBB->addSuccessor(DoneMBB);
+
+    DestBase = MachineOperand::CreateReg(NextDestReg, false);
+    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+    Length &= 255;
+    MBB = DoneMBB;
+  }
+  // Handle any remaining bytes with straight-line code.
+  while (Length > 0) {
+    uint64_t ThisLength = std::min(Length, uint64_t(256));
+    // The previous iteration might have created out-of-range displacements.
+    // Apply them using LAY if so.
+    if (!isUInt<12>(DestDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(DestBase).addImm(DestDisp).addReg(0);
+      DestBase = MachineOperand::CreateReg(Reg, false);
+      DestDisp = 0;
+    }
+    if (!isUInt<12>(SrcDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+      SrcBase = MachineOperand::CreateReg(Reg, false);
+      SrcDisp = 0;
+    }
+    BuildMI(*MBB, MI, DL, TII->get(Opcode))
+      .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+      .addOperand(SrcBase).addImm(SrcDisp);
+    DestDisp += ThisLength;
+    SrcDisp += ThisLength;
+    Length -= ThisLength;
+    // If there's another CLC to go, branch to the end if a difference
+    // was found.
+    if (EndMBB && Length > 0) {
+      MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+      MBB = NextMBB;
+    }
+  }
+  if (EndMBB) {
+    MBB->addSuccessor(EndMBB);
+    MBB = EndMBB;
+    MBB->addLiveIn(SystemZ::CC);
+  }
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+// Decompose string pseudo-instruction MI into a loop that continually performs
+// Opcode until CC != 3.
+MachineBasicBlock *
+SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  uint64_t End1Reg   = MI->getOperand(0).getReg();
+  uint64_t Start1Reg = MI->getOperand(1).getReg();
+  uint64_t Start2Reg = MI->getOperand(2).getReg();
+  uint64_t CharReg   = MI->getOperand(3).getReg();
+
+  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
+  uint64_t This1Reg = MRI.createVirtualRegister(RC);
+  uint64_t This2Reg = MRI.createVirtualRegister(RC);
+  uint64_t End2Reg  = MRI.createVirtualRegister(RC);
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   # fall through to LoopMMB
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
+  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
+  //   R0L = %CharReg
+  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
+  //   JO LoopMBB
+  //   # fall through to DoneMMB
+  //
+  // The load of R0L can be hoisted by post-RA LICM.
+  MBB = LoopMBB;
+
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
+    .addReg(Start1Reg).addMBB(StartMBB)
+    .addReg(End1Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
+    .addReg(Start2Reg).addMBB(StartMBB)
+    .addReg(End2Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
+  BuildMI(MBB, DL, TII->get(Opcode))
+    .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
+    .addReg(This1Reg).addReg(This2Reg);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  DoneMBB->addLiveIn(SystemZ::CC);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::
+EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::Select32Mux:
+  case SystemZ::Select32:
+  case SystemZ::SelectF32:
+  case SystemZ::Select64:
+  case SystemZ::SelectF64:
+  case SystemZ::SelectF128:
+    return emitSelect(MI, MBB);
+
+  case SystemZ::CondStore8Mux:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
+  case SystemZ::CondStore8MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
+  case SystemZ::CondStore16Mux:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
+  case SystemZ::CondStore16MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
+  case SystemZ::CondStore8:
+    return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
+  case SystemZ::CondStore8Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
+  case SystemZ::CondStore16:
+    return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
+  case SystemZ::CondStore16Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
+  case SystemZ::CondStore32:
+    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
+  case SystemZ::CondStore32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
+  case SystemZ::CondStore64:
+    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
+  case SystemZ::CondStore64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
+  case SystemZ::CondStoreF32:
+    return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
+  case SystemZ::CondStoreF32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
+  case SystemZ::CondStoreF64:
+    return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
+  case SystemZ::CondStoreF64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
+
+  case SystemZ::AEXT128_64:
+    return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
+  case SystemZ::ZEXT128_32:
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
+  case SystemZ::ZEXT128_64:
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
+
+  case SystemZ::ATOMIC_SWAPW:
+    return emitAtomicLoadBinary(MI, MBB, 0, 0);
+  case SystemZ::ATOMIC_SWAP_32:
+    return emitAtomicLoadBinary(MI, MBB, 0, 32);
+  case SystemZ::ATOMIC_SWAP_64:
+    return emitAtomicLoadBinary(MI, MBB, 0, 64);
+
+  case SystemZ::ATOMIC_LOADW_AR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
+  case SystemZ::ATOMIC_LOADW_AFI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
+  case SystemZ::ATOMIC_LOAD_AR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
+  case SystemZ::ATOMIC_LOAD_AHI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
+  case SystemZ::ATOMIC_LOAD_AFI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
+  case SystemZ::ATOMIC_LOAD_AGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
+  case SystemZ::ATOMIC_LOAD_AGHI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
+  case SystemZ::ATOMIC_LOAD_AGFI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);
+
+  case SystemZ::ATOMIC_LOADW_SR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
+  case SystemZ::ATOMIC_LOAD_SR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
+  case SystemZ::ATOMIC_LOAD_SGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);
+
+  case SystemZ::ATOMIC_LOADW_NR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
+  case SystemZ::ATOMIC_LOADW_NILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
+  case SystemZ::ATOMIC_LOAD_NR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
+  case SystemZ::ATOMIC_LOAD_NILL:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
+  case SystemZ::ATOMIC_LOAD_NILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
+  case SystemZ::ATOMIC_LOAD_NILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
+  case SystemZ::ATOMIC_LOAD_NGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+  case SystemZ::ATOMIC_LOAD_NILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
+  case SystemZ::ATOMIC_LOAD_NILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_NILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
+
+  case SystemZ::ATOMIC_LOADW_OR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
+  case SystemZ::ATOMIC_LOADW_OILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
+  case SystemZ::ATOMIC_LOAD_OR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
+  case SystemZ::ATOMIC_LOAD_OILL:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
+  case SystemZ::ATOMIC_LOAD_OILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
+  case SystemZ::ATOMIC_LOAD_OILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
+  case SystemZ::ATOMIC_LOAD_OGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+  case SystemZ::ATOMIC_LOAD_OILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
+  case SystemZ::ATOMIC_LOAD_OILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_OILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
+
+  case SystemZ::ATOMIC_LOADW_XR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
+  case SystemZ::ATOMIC_LOADW_XILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
+  case SystemZ::ATOMIC_LOAD_XR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
+  case SystemZ::ATOMIC_LOAD_XILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
+  case SystemZ::ATOMIC_LOAD_XGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
+  case SystemZ::ATOMIC_LOAD_XILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
+  case SystemZ::ATOMIC_LOAD_XIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
+
+  case SystemZ::ATOMIC_LOADW_NRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
+  case SystemZ::ATOMIC_LOADW_NILHi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
+  case SystemZ::ATOMIC_LOAD_NRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
+  case SystemZ::ATOMIC_LOAD_NILLi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
+  case SystemZ::ATOMIC_LOAD_NILHi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
+  case SystemZ::ATOMIC_LOAD_NILFi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
+  case SystemZ::ATOMIC_LOAD_NGRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
+
+  case SystemZ::ATOMIC_LOADW_MIN:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_LE, 0);
+  case SystemZ::ATOMIC_LOAD_MIN_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_LE, 32);
+  case SystemZ::ATOMIC_LOAD_MIN_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+                                SystemZ::CCMASK_CMP_LE, 64);
+
+  case SystemZ::ATOMIC_LOADW_MAX:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_GE, 0);
+  case SystemZ::ATOMIC_LOAD_MAX_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_GE, 32);
+  case SystemZ::ATOMIC_LOAD_MAX_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+                                SystemZ::CCMASK_CMP_GE, 64);
+
+  case SystemZ::ATOMIC_LOADW_UMIN:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_LE, 0);
+  case SystemZ::ATOMIC_LOAD_UMIN_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_LE, 32);
+  case SystemZ::ATOMIC_LOAD_UMIN_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+                                SystemZ::CCMASK_CMP_LE, 64);
+
+  case SystemZ::ATOMIC_LOADW_UMAX:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_GE, 0);
+  case SystemZ::ATOMIC_LOAD_UMAX_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_GE, 32);
+  case SystemZ::ATOMIC_LOAD_UMAX_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+                                SystemZ::CCMASK_CMP_GE, 64);
+
+  case SystemZ::ATOMIC_CMP_SWAPW:
+    return emitAtomicCmpSwapW(MI, MBB);
+  case SystemZ::MVCSequence:
+  case SystemZ::MVCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+  case SystemZ::NCSequence:
+  case SystemZ::NCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::NC);
+  case SystemZ::OCSequence:
+  case SystemZ::OCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::OC);
+  case SystemZ::XCSequence:
+  case SystemZ::XCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::XC);
+  case SystemZ::CLCSequence:
+  case SystemZ::CLCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+  case SystemZ::CLSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::CLST);
+  case SystemZ::MVSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::MVST);
+  case SystemZ::SRSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::SRST);
+  default:
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
new file mode 100644
index 0000000..e21b050
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -0,0 +1,324 @@
+//===-- SystemZISelLowering.h - SystemZ DAG lowering interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SystemZ uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_ISELLOWERING_H
+#define LLVM_TARGET_SystemZ_ISELLOWERING_H
+
+#include "SystemZ.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace SystemZISD {
+enum {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+  // Return with a flag operand.  Operand 0 is the chain operand.
+  RET_FLAG,
+
+  // Calls a function.  Operand 0 is the chain operand and operand 1
+  // is the target address.  The arguments start at operand 2.
+  // There is an optional glue operand at the end.
+  CALL,
+  SIBCALL,
+
+  // Wraps a TargetGlobalAddress that should be loaded using PC-relative
+  // accesses (LARL).  Operand 0 is the address.
+  PCREL_WRAPPER,
+
+  // Used in cases where an offset is applied to a TargetGlobalAddress.
+  // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+  // PCREL_WRAPPER for an anchor point.  This is used so that we can
+  // cheaply refer to either the full address or the anchor point
+  // as a register base.
+  PCREL_OFFSET,
+
+  // Integer absolute.
+  IABS,
+
+  // Integer comparisons.  There are three operands: the two values
+  // to compare, and an integer of type SystemZICMP.
+  ICMP,
+
+  // Floating-point comparisons.  The two operands are the values to compare.
+  FCMP,
+
+  // Test under mask.  The first operand is ANDed with the second operand
+  // and the condition codes are set on the result.  The third operand is
+  // a boolean that is true if the condition codes need to distinguish
+  // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+  // register forms do but the memory forms don't).
+  TM,
+
+  // Branches if a condition is true.  Operand 0 is the chain operand;
+  // operand 1 is the 4-bit condition-code mask, with bit N in
+  // big-endian order meaning "branch if CC=N"; operand 2 is the
+  // target block and operand 3 is the flag operand.
+  BR_CCMASK,
+
+  // Selects between operand 0 and operand 1.  Operand 2 is the
+  // mask of condition-code values for which operand 0 should be
+  // chosen over operand 1; it has the same form as BR_CCMASK.
+  // Operand 3 is the flag operand.
+  SELECT_CCMASK,
+
+  // Evaluates to the gap between the stack pointer and the
+  // base of the dynamically-allocatable area.
+  ADJDYNALLOC,
+
+  // Extracts the value of a 32-bit access register.  Operand 0 is
+  // the number of the register.
+  EXTRACT_ACCESS,
+
+  // Wrappers around the ISD opcodes of the same name.  The output and
+  // first input operands are GR128s.  The trailing numbers are the
+  // widths of the second operand in bits.
+  UMUL_LOHI64,
+  SDIVREM32,
+  SDIVREM64,
+  UDIVREM32,
+  UDIVREM64,
+
+  // Use a series of MVCs to copy bytes from one memory location to another.
+  // The operands are:
+  // - the target address
+  // - the source address
+  // - the constant length
+  //
+  // This isn't a memory opcode because we'd need to attach two
+  // MachineMemOperands rather than one.
+  MVC,
+
+  // Like MVC, but implemented as a loop that handles X*256 bytes
+  // followed by straight-line code to handle the rest (if any).
+  // The value of X is passed as an additional operand.
+  MVC_LOOP,
+
+  // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+  NC,
+  NC_LOOP,
+  OC,
+  OC_LOOP,
+  XC,
+  XC_LOOP,
+
+  // Use CLC to compare two blocks of memory, with the same comments
+  // as for MVC and MVC_LOOP.
+  CLC,
+  CLC_LOOP,
+
+  // Use an MVST-based sequence to implement stpcpy().
+  STPCPY,
+
+  // Use a CLST-based sequence to implement strcmp().  The two input operands
+  // are the addresses of the strings to compare.
+  STRCMP,
+
+  // Use an SRST-based sequence to search a block of memory.  The first
+  // operand is the end address, the second is the start, and the third
+  // is the character to search for.  CC is set to 1 on success and 2
+  // on failure.
+  SEARCH_STRING,
+
+  // Store the CC value in bits 29 and 28 of an integer.
+  IPM,
+
+  // Perform a serialization operation.  (BCR 15,0 or BCR 14,0.)
+  SERIALIZE,
+
+  // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+  // ATOMIC_LOAD_<op>.
+  //
+  // Operand 0: the address of the containing 32-bit-aligned field
+  // Operand 1: the second operand of <op>, in the high bits of an i32
+  //            for everything except ATOMIC_SWAPW
+  // Operand 2: how many bits to rotate the i32 left to bring the first
+  //            operand into the high bits
+  // Operand 3: the negative of operand 2, for rotating the other way
+  // Operand 4: the width of the field in bits (8 or 16)
+  ATOMIC_SWAPW = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  ATOMIC_LOADW_ADD,
+  ATOMIC_LOADW_SUB,
+  ATOMIC_LOADW_AND,
+  ATOMIC_LOADW_OR,
+  ATOMIC_LOADW_XOR,
+  ATOMIC_LOADW_NAND,
+  ATOMIC_LOADW_MIN,
+  ATOMIC_LOADW_MAX,
+  ATOMIC_LOADW_UMIN,
+  ATOMIC_LOADW_UMAX,
+
+  // A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
+  //
+  // Operand 0: the address of the containing 32-bit-aligned field
+  // Operand 1: the compare value, in the low bits of an i32
+  // Operand 2: the swap value, in the low bits of an i32
+  // Operand 3: how many bits to rotate the i32 left to bring the first
+  //            operand into the high bits
+  // Operand 4: the negative of operand 2, for rotating the other way
+  // Operand 5: the width of the field in bits (8 or 16)
+  ATOMIC_CMP_SWAPW,
+
+  // Prefetch from the second operand using the 4-bit control code in
+  // the first operand.  The code is 1 for a load prefetch and 2 for
+  // a store prefetch.
+  PREFETCH
+};
+
+// Return true if OPCODE is some kind of PC-relative address.
+inline bool isPCREL(unsigned Opcode) {
+  return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
+}
+} // end namespace SystemZISD
+
+namespace SystemZICMP {
+// Describes whether an integer comparison needs to be signed or unsigned,
+// or whether either type is OK.
+enum {
+  Any,
+  UnsignedOnly,
+  SignedOnly
+};
+} // end namespace SystemZICMP
+
+class SystemZSubtarget;
+class SystemZTargetMachine;
+
+class SystemZTargetLowering : public TargetLowering {
+public:
+  explicit SystemZTargetLowering(const TargetMachine &TM);
+
+  // Override TargetLowering.
+  MVT getScalarShiftAmountTy(EVT LHSTy) const override {
+    return MVT::i32;
+  }
+  EVT getSetCCResultType(LLVMContext &, EVT) const override;
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+                                     bool *Fast) const override;
+  bool isTruncateFree(Type *, Type *) const override;
+  bool isTruncateFree(EVT, EVT) const override;
+  const char *getTargetNodeName(unsigned Opcode) const override;
+  std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 MVT VT) const override;
+  TargetLowering::ConstraintType
+    getConstraintType(const std::string &Constraint) const override;
+  TargetLowering::ConstraintWeight
+    getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                   const char *constraint) const override;
+  void LowerAsmOperandForConstraint(SDValue Op,
+                                    std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+  MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const
+    override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  bool allowTruncateForTailCall(Type *, Type *) const override;
+  bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SDLoc DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals,
+                      SDLoc DL, SelectionDAG &DAG) const override;
+  SDValue prepareVolatileOrAtomicLoad(SDValue Chain, SDLoc DL,
+                                      SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+private:
+  const SystemZSubtarget &Subtarget;
+
+  // Implement LowerOperation for individual opcodes.
+  SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
+                             SelectionDAG &DAG) const;
+  SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+                                SelectionDAG &DAG) const;
+  SDValue lowerBlockAddress(BlockAddressSDNode *Node,
+                            SelectionDAG &DAG) const;
+  SDValue lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const;
+  SDValue lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const;
+  SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
+                              unsigned Opcode) const;
+  SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+
+  // If the last instruction before MBBI in MBB was some form of COMPARE,
+  // try to replace it with a COMPARE AND BRANCH just before MBBI.
+  // CCMask and Target are the BRC-like operands for the branch.
+  // Return true if the change was made.
+  bool convertPrevCompareToBranch(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  unsigned CCMask,
+                                  MachineBasicBlock *Target) const;
+
+  // Implement EmitInstrWithCustomInserter for individual operation types.
+  MachineBasicBlock *emitSelect(MachineInstr *MI,
+                                MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitCondStore(MachineInstr *MI,
+                                   MachineBasicBlock *BB,
+                                   unsigned StoreOpcode, unsigned STOCOpcode,
+                                   bool Invert) const;
+  MachineBasicBlock *emitExt128(MachineInstr *MI,
+                                MachineBasicBlock *MBB,
+                                bool ClearEven, unsigned SubReg) const;
+  MachineBasicBlock *emitAtomicLoadBinary(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned BinOpcode, unsigned BitSize,
+                                          bool Invert = false) const;
+  MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr *MI,
+                                          MachineBasicBlock *MBB,
+                                          unsigned CompareOpcode,
+                                          unsigned KeepOldMask,
+                                          unsigned BitSize) const;
+  MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
+                                        MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
+  MachineBasicBlock *emitStringWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
+};
+} // end namespace llvm
+
+#endif // LLVM_TARGET_SystemZ_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
new file mode 100644
index 0000000..84196e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -0,0 +1,48 @@
+//===-- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle SystemZ'isms in a clean way.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZINSTRBUILDER_H
+#define SYSTEMZINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// Add a BDX memory reference for frame object FI to MIB.
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo *MFFrame = MF.getFrameInfo();
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Flags = 0;
+  if (MCID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (MCID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  int64_t Offset = 0;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo(
+                              PseudoSourceValue::getFixedStack(FI), Offset),
+                            Flags, MFFrame->getObjectSize(FI),
+                            MFFrame->getObjectAlignment(FI));
+  return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
new file mode 100644
index 0000000..e8841e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -0,0 +1,411 @@
+//==- SystemZInstrFP.td - Floating-point SystemZ instructions --*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// C's ?: operator for floating-point operands.
+def SelectF32  : SelectWrapper<FP32>;
+def SelectF64  : SelectWrapper<FP64>;
+def SelectF128 : SelectWrapper<FP128>;
+
+defm CondStoreF32 : CondStores<FP32, nonvolatile_store,
+                               nonvolatile_load, bdxaddr20only>;
+defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
+                               nonvolatile_load, bdxaddr20only>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+  def LZER : InherentRRE<"lzer", 0xB374, FP32,  (fpimm0)>;
+  def LZDR : InherentRRE<"lzdr", 0xB375, FP64,  (fpimm0)>;
+  def LZXR : InherentRRE<"lzxr", 0xB376, FP128, (fpimm0)>;
+}
+
+// Moves between two floating-point registers.
+let neverHasSideEffects = 1 in {
+  def LER : UnaryRR <"le", 0x38,   null_frag, FP32,  FP32>;
+  def LDR : UnaryRR <"ld", 0x28,   null_frag, FP64,  FP64>;
+  def LXR : UnaryRRE<"lx", 0xB365, null_frag, FP128, FP128>;
+}
+
+// Moves between two floating-point registers that also set the condition
+// codes.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  defm LTEBR : LoadAndTestRRE<"lteb", 0xB302, FP32>;
+  defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
+  defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
+}
+defm : CompareZeroFP<LTEBRCompare, FP32>;
+defm : CompareZeroFP<LTDBRCompare, FP64>;
+defm : CompareZeroFP<LTXBRCompare, FP128>;
+
+// Moves between 64-bit integer and floating-point registers.
+def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
+def LDGR : UnaryRRE<"ldg", 0xB3C1, bitconvert, FP64, GR64>;
+
+// fcopysign with an FP32 result.
+let isCodeGenOnly = 1 in {
+  def CPSDRss : BinaryRRF<"cpsd", 0xB372, fcopysign, FP32, FP32>;
+  def CPSDRsd : BinaryRRF<"cpsd", 0xB372, fcopysign, FP32, FP64>;
+}
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP32:$src1, FP128:$src2),
+          (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+// fcopysign with an FP64 result.
+let isCodeGenOnly = 1 in
+  def CPSDRds : BinaryRRF<"cpsd", 0xB372, fcopysign, FP64, FP32>;
+def CPSDRdd : BinaryRRF<"cpsd", 0xB372, fcopysign, FP64, FP64>;
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP64:$src1, FP128:$src2),
+          (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+// fcopysign with an FP128 result.  Use "upper" as the high half and leave
+// the low half as-is.
+class CopySign128<RegisterOperand cls, dag upper>
+  : Pat<(fcopysign FP128:$src1, cls:$src2),
+        (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
+
+def : CopySign128<FP32,  (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  FP32:$src2)>;
+def : CopySign128<FP64,  (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  FP64:$src2)>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCSequence, 4>;
+defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions
+//===----------------------------------------------------------------------===//
+
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+  defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
+  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def LX : Pseudo<(outs FP128:$dst), (ins bdxaddr20only128:$src),
+                     [(set FP128:$dst, (load bdxaddr20only128:$src))]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Store instructions
+//===----------------------------------------------------------------------===//
+
+let SimpleBDXStore = 1 in {
+  defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>;
+  defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def STX : Pseudo<(outs), (ins FP128:$src, bdxaddr20only128:$dst),
+                     [(store FP128:$src, bdxaddr20only128:$dst)]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations, rounding
+// according to the current mode.  The destination of LEXBR and LDXBR
+// is a 128-bit value, but only the first register of the pair is used.
+def LEDBR : UnaryRRE<"ledb", 0xB344, fround,    FP32,  FP64>;
+def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>;
+def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>;
+
+def LEDBRA : UnaryRRF4<"ledbra", 0xB344, FP32,  FP64>,
+             Requires<[FeatureFPExtension]>;
+def LEXBRA : UnaryRRF4<"lexbra", 0xB346, FP128, FP128>,
+             Requires<[FeatureFPExtension]>;
+def LDXBRA : UnaryRRF4<"ldxbra", 0xB345, FP128, FP128>,
+             Requires<[FeatureFPExtension]>;
+
+def : Pat<(f32 (fround FP128:$src)),
+          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
+def : Pat<(f64 (fround FP128:$src)),
+          (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+
+// Extend register floating-point values to wider representations.
+def LDEBR : UnaryRRE<"ldeb", 0xB304, fextend, FP64,  FP32>;
+def LXEBR : UnaryRRE<"lxeb", 0xB306, fextend, FP128, FP32>;
+def LXDBR : UnaryRRE<"lxdb", 0xB305, fextend, FP128, FP64>;
+
+// Extend memory floating-point values to wider representations.
+def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64,  4>;
+def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>;
+def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>;
+
+// Convert a signed integer register value to a floating-point one.
+def CEFBR : UnaryRRE<"cefb", 0xB394, sint_to_fp, FP32,  GR32>;
+def CDFBR : UnaryRRE<"cdfb", 0xB395, sint_to_fp, FP64,  GR32>;
+def CXFBR : UnaryRRE<"cxfb", 0xB396, sint_to_fp, FP128, GR32>;
+
+def CEGBR : UnaryRRE<"cegb", 0xB3A4, sint_to_fp, FP32,  GR64>;
+def CDGBR : UnaryRRE<"cdgb", 0xB3A5, sint_to_fp, FP64,  GR64>;
+def CXGBR : UnaryRRE<"cxgb", 0xB3A6, sint_to_fp, FP128, GR64>;
+
+// Convert am unsigned integer register value to a floating-point one.
+let Predicates = [FeatureFPExtension] in {
+  def CELFBR : UnaryRRF4<"celfbr", 0xB390, FP32,  GR32>;
+  def CDLFBR : UnaryRRF4<"cdlfbr", 0xB391, FP64,  GR32>;
+  def CXLFBR : UnaryRRF4<"cxlfbr", 0xB392, FP128, GR32>;
+
+  def CELGBR : UnaryRRF4<"celgbr", 0xB3A0, FP32,  GR64>;
+  def CDLGBR : UnaryRRF4<"cdlgbr", 0xB3A1, FP64,  GR64>;
+  def CXLGBR : UnaryRRF4<"cxlgbr", 0xB3A2, FP128, GR64>;
+
+  def : Pat<(f32  (uint_to_fp GR32:$src)), (CELFBR 0, GR32:$src, 0)>;
+  def : Pat<(f64  (uint_to_fp GR32:$src)), (CDLFBR 0, GR32:$src, 0)>;
+  def : Pat<(f128 (uint_to_fp GR32:$src)), (CXLFBR 0, GR32:$src, 0)>;
+
+  def : Pat<(f32  (uint_to_fp GR64:$src)), (CELGBR 0, GR64:$src, 0)>;
+  def : Pat<(f64  (uint_to_fp GR64:$src)), (CDLGBR 0, GR64:$src, 0)>;
+  def : Pat<(f128 (uint_to_fp GR64:$src)), (CXLGBR 0, GR64:$src, 0)>;
+}
+
+// Convert a floating-point register value to a signed integer value,
+// with the second operand (modifier M3) specifying the rounding mode.
+let Defs = [CC] in {
+  def CFEBR : UnaryRRF<"cfeb", 0xB398, GR32, FP32>;
+  def CFDBR : UnaryRRF<"cfdb", 0xB399, GR32, FP64>;
+  def CFXBR : UnaryRRF<"cfxb", 0xB39A, GR32, FP128>;
+
+  def CGEBR : UnaryRRF<"cgeb", 0xB3A8, GR64, FP32>;
+  def CGDBR : UnaryRRF<"cgdb", 0xB3A9, GR64, FP64>;
+  def CGXBR : UnaryRRF<"cgxb", 0xB3AA, GR64, FP128>;
+}
+
+// fp_to_sint always rounds towards zero, which is modifier value 5.
+def : Pat<(i32 (fp_to_sint FP32:$src)),  (CFEBR 5, FP32:$src)>;
+def : Pat<(i32 (fp_to_sint FP64:$src)),  (CFDBR 5, FP64:$src)>;
+def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
+
+def : Pat<(i64 (fp_to_sint FP32:$src)),  (CGEBR 5, FP32:$src)>;
+def : Pat<(i64 (fp_to_sint FP64:$src)),  (CGDBR 5, FP64:$src)>;
+def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
+
+// Convert a floating-point register value to an unsigned integer value.
+let Predicates = [FeatureFPExtension] in {
+  let Defs = [CC] in {
+    def CLFEBR : UnaryRRF4<"clfebr", 0xB39C, GR32, FP32>;
+    def CLFDBR : UnaryRRF4<"clfdbr", 0xB39D, GR32, FP64>;
+    def CLFXBR : UnaryRRF4<"clfxbr", 0xB39E, GR32, FP128>;
+
+    def CLGEBR : UnaryRRF4<"clgebr", 0xB3AC, GR64, FP32>;
+    def CLGDBR : UnaryRRF4<"clgdbr", 0xB3AD, GR64, FP64>;
+    def CLGXBR : UnaryRRF4<"clgxbr", 0xB3AE, GR64, FP128>;
+  }
+
+  def : Pat<(i32 (fp_to_uint FP32:$src)),  (CLFEBR 5, FP32:$src,  0)>;
+  def : Pat<(i32 (fp_to_uint FP64:$src)),  (CLFDBR 5, FP64:$src,  0)>;
+  def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>;
+
+  def : Pat<(i64 (fp_to_uint FP32:$src)),  (CLGEBR 5, FP32:$src,  0)>;
+  def : Pat<(i64 (fp_to_uint FP64:$src)),  (CLGDBR 5, FP64:$src,  0)>;
+  def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Negation (Load Complement).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LCEBR : UnaryRRE<"lceb", 0xB303, fneg, FP32,  FP32>;
+  def LCDBR : UnaryRRE<"lcdb", 0xB313, fneg, FP64,  FP64>;
+  def LCXBR : UnaryRRE<"lcxb", 0xB343, fneg, FP128, FP128>;
+}
+
+// Absolute value (Load Positive).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LPEBR : UnaryRRE<"lpeb", 0xB300, fabs, FP32,  FP32>;
+  def LPDBR : UnaryRRE<"lpdb", 0xB310, fabs, FP64,  FP64>;
+  def LPXBR : UnaryRRE<"lpxb", 0xB340, fabs, FP128, FP128>;
+}
+
+// Negative absolute value (Load Negative).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LNEBR : UnaryRRE<"lneb", 0xB301, fnabs, FP32,  FP32>;
+  def LNDBR : UnaryRRE<"lndb", 0xB311, fnabs, FP64,  FP64>;
+  def LNXBR : UnaryRRE<"lnxb", 0xB341, fnabs, FP128, FP128>;
+}
+
+// Square root.
+def SQEBR : UnaryRRE<"sqeb", 0xB314, fsqrt, FP32,  FP32>;
+def SQDBR : UnaryRRE<"sqdb", 0xB315, fsqrt, FP64,  FP64>;
+def SQXBR : UnaryRRE<"sqxb", 0xB316, fsqrt, FP128, FP128>;
+
+def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
+def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
+
+// Round to an integer, with the second operand (modifier M3) specifying
+// the rounding mode.  These forms always check for inexact conditions.
+def FIEBR : UnaryRRF<"fieb", 0xB357, FP32,  FP32>;
+def FIDBR : UnaryRRF<"fidb", 0xB35F, FP64,  FP64>;
+def FIXBR : UnaryRRF<"fixb", 0xB347, FP128, FP128>;
+
+// frint rounds according to the current mode (modifier 0) and detects
+// inexact conditions.
+def : Pat<(frint FP32:$src),  (FIEBR 0, FP32:$src)>;
+def : Pat<(frint FP64:$src),  (FIDBR 0, FP64:$src)>;
+def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+
+let Predicates = [FeatureFPExtension] in {
+  // Extended forms of the FIxBR instructions.  M4 can be set to 4
+  // to suppress detection of inexact conditions.
+  def FIEBRA : UnaryRRF4<"fiebra", 0xB357, FP32,  FP32>;
+  def FIDBRA : UnaryRRF4<"fidbra", 0xB35F, FP64,  FP64>;
+  def FIXBRA : UnaryRRF4<"fixbra", 0xB347, FP128, FP128>;
+
+  // fnearbyint is like frint but does not detect inexact conditions.
+  def : Pat<(fnearbyint FP32:$src),  (FIEBRA 0, FP32:$src,  4)>;
+  def : Pat<(fnearbyint FP64:$src),  (FIDBRA 0, FP64:$src,  4)>;
+  def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+
+  // floor is no longer allowed to raise an inexact condition,
+  // so restrict it to the cases where the condition can be suppressed.
+  // Mode 7 is round towards -inf.
+  def : Pat<(ffloor FP32:$src),  (FIEBRA 7, FP32:$src,  4)>;
+  def : Pat<(ffloor FP64:$src),  (FIDBRA 7, FP64:$src,  4)>;
+  def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+
+  // Same idea for ceil, where mode 6 is round towards +inf.
+  def : Pat<(fceil FP32:$src),  (FIEBRA 6, FP32:$src,  4)>;
+  def : Pat<(fceil FP64:$src),  (FIDBRA 6, FP64:$src,  4)>;
+  def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+
+  // Same idea for trunc, where mode 5 is round towards zero.
+  def : Pat<(ftrunc FP32:$src),  (FIEBRA 5, FP32:$src,  4)>;
+  def : Pat<(ftrunc FP64:$src),  (FIDBRA 5, FP64:$src,  4)>;
+  def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+
+  // Same idea for round, where mode 1 is round towards nearest with
+  // ties away from zero.
+  def : Pat<(frnd FP32:$src),  (FIEBRA 1, FP32:$src,  4)>;
+  def : Pat<(frnd FP64:$src),  (FIDBRA 1, FP64:$src,  4)>;
+  def : Pat<(frnd FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  let isCommutable = 1 in {
+    def AEBR : BinaryRRE<"aeb", 0xB30A, fadd, FP32,  FP32>;
+    def ADBR : BinaryRRE<"adb", 0xB31A, fadd, FP64,  FP64>;
+    def AXBR : BinaryRRE<"axb", 0xB34A, fadd, FP128, FP128>;
+  }
+  def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load, 4>;
+  def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load, 8>;
+}
+
+// Subtraction.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def SEBR : BinaryRRE<"seb", 0xB30B, fsub, FP32,  FP32>;
+  def SDBR : BinaryRRE<"sdb", 0xB31B, fsub, FP64,  FP64>;
+  def SXBR : BinaryRRE<"sxb", 0xB34B, fsub, FP128, FP128>;
+
+  def SEB : BinaryRXE<"seb",  0xED0B, fsub, FP32, load, 4>;
+  def SDB : BinaryRXE<"sdb",  0xED1B, fsub, FP64, load, 8>;
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+  def MEEBR : BinaryRRE<"meeb", 0xB317, fmul, FP32,  FP32>;
+  def MDBR  : BinaryRRE<"mdb",  0xB31C, fmul, FP64,  FP64>;
+  def MXBR  : BinaryRRE<"mxb",  0xB34C, fmul, FP128, FP128>;
+}
+def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load, 4>;
+def MDB  : BinaryRXE<"mdb",  0xED1C, fmul, FP64, load, 8>;
+
+// f64 multiplication of two FP32 registers.
+def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>;
+def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
+          (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                FP32:$src1, subreg_h32), FP32:$src2)>;
+
+// f64 multiplication of an FP32 register and an f32 memory.
+def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+def : Pat<(fmul (f64 (fextend FP32:$src1)),
+                (f64 (extloadf32 bdxaddr12only:$addr))),
+          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
+                bdxaddr12only:$addr)>;
+
+// f128 multiplication of two FP64 registers.
+def MXDBR : BinaryRRE<"mxdb", 0xB307, null_frag, FP128, FP64>;
+def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))),
+          (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+                                FP64:$src1, subreg_h64), FP64:$src2)>;
+
+// f128 multiplication of an FP64 register and an f64 memory.
+def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+def : Pat<(fmul (f128 (fextend FP64:$src1)),
+                (f128 (extloadf64 bdxaddr12only:$addr))),
+          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
+                bdxaddr12only:$addr)>;
+
+// Fused multiply-add.
+def MAEBR : TernaryRRD<"maeb", 0xB30E, z_fma, FP32>;
+def MADBR : TernaryRRD<"madb", 0xB31E, z_fma, FP64>;
+
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load, 4>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load, 8>;
+
+// Fused multiply-subtract.
+def MSEBR : TernaryRRD<"mseb", 0xB30F, z_fms, FP32>;
+def MSDBR : TernaryRRD<"msdb", 0xB31F, z_fms, FP64>;
+
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load, 4>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load, 8>;
+
+// Division.
+def DEBR : BinaryRRE<"deb", 0xB30D, fdiv, FP32,  FP32>;
+def DDBR : BinaryRRE<"ddb", 0xB31D, fdiv, FP64,  FP64>;
+def DXBR : BinaryRRE<"dxb", 0xB34D, fdiv, FP128, FP128>;
+
+def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
+def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC], CCValues = 0xF in {
+  def CEBR : CompareRRE<"ceb", 0xB309, z_fcmp, FP32,  FP32>;
+  def CDBR : CompareRRE<"cdb", 0xB319, z_fcmp, FP64,  FP64>;
+  def CXBR : CompareRRE<"cxb", 0xB349, z_fcmp, FP128, FP128>;
+
+  def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
+  def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes
+//===----------------------------------------------------------------------===//
+
+def : Pat<(f32  fpimmneg0), (LCEBR (LZER))>;
+def : Pat<(f64  fpimmneg0), (LCDBR (LZDR))>;
+def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
new file mode 100644
index 0000000..9f59a1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -0,0 +1,1609 @@
+//==- SystemZInstrFormats.td - SystemZ Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Basic SystemZ instruction definition
+//===----------------------------------------------------------------------===//
+
+class InstSystemZ<int size, dag outs, dag ins, string asmstr,
+                  list<dag> pattern> : Instruction {
+  let Namespace = "SystemZ";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let Size = size;
+  let Pattern = pattern;
+  let AsmString = asmstr;
+
+  // Some instructions come in pairs, one having a 12-bit displacement
+  // and the other having a 20-bit displacement.  Both instructions in
+  // the pair have the same DispKey and their DispSizes are "12" and "20"
+  // respectively.
+  string DispKey = "";
+  string DispSize = "none";
+
+  // Many register-based <INSN>R instructions have a memory-based <INSN>
+  // counterpart.  OpKey uniquely identifies <INSN>, while OpType is
+  // "reg" for <INSN>R and "mem" for <INSN>.
+  string OpKey = "";
+  string OpType = "none";
+
+  // Many distinct-operands instructions have older 2-operand equivalents.
+  // NumOpsKey uniquely identifies one of these 2-operand and 3-operand pairs,
+  // with NumOpsValue being "2" or "3" as appropriate.
+  string NumOpsKey = "";
+  string NumOpsValue = "none";
+
+  // True if this instruction is a simple D(X,B) load of a register
+  // (with no sign or zero extension).
+  bit SimpleBDXLoad = 0;
+
+  // True if this instruction is a simple D(X,B) store of a register
+  // (with no truncation).
+  bit SimpleBDXStore = 0;
+
+  // True if this instruction has a 20-bit displacement field.
+  bit Has20BitOffset = 0;
+
+  // True if addresses in this instruction have an index register.
+  bit HasIndex = 0;
+
+  // True if this is a 128-bit pseudo instruction that combines two 64-bit
+  // operations.
+  bit Is128Bit = 0;
+
+  // The access size of all memory operands in bytes, or 0 if not known.
+  bits<5> AccessBytes = 0;
+
+  // If the instruction sets CC to a useful value, this gives the mask
+  // of all possible CC results.  The mask has the same form as
+  // SystemZ::CCMASK_*.
+  bits<4> CCValues = 0;
+
+  // The subset of CCValues that have the same meaning as they would after
+  // a comparison of the first operand against zero.
+  bits<4> CompareZeroCCMask = 0;
+
+  // True if the instruction is conditional and if the CC mask operand
+  // comes first (as for BRC, etc.).
+  bit CCMaskFirst = 0;
+
+  // Similar, but true if the CC mask operand comes last (as for LOC, etc.).
+  bit CCMaskLast = 0;
+
+  // True if the instruction is the "logical" rather than "arithmetic" form,
+  // in cases where a distinction exists.
+  bit IsLogical = 0;
+
+  let TSFlags{0}     = SimpleBDXLoad;
+  let TSFlags{1}     = SimpleBDXStore;
+  let TSFlags{2}     = Has20BitOffset;
+  let TSFlags{3}     = HasIndex;
+  let TSFlags{4}     = Is128Bit;
+  let TSFlags{9-5}   = AccessBytes;
+  let TSFlags{13-10} = CCValues;
+  let TSFlags{17-14} = CompareZeroCCMask;
+  let TSFlags{18}    = CCMaskFirst;
+  let TSFlags{19}    = CCMaskLast;
+  let TSFlags{20}    = IsLogical;
+}
+
+//===----------------------------------------------------------------------===//
+// Mappings between instructions
+//===----------------------------------------------------------------------===//
+
+// Return the version of an instruction that has an unsigned 12-bit
+// displacement.
+def getDisp12Opcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["DispKey"];
+  let ColFields = ["DispSize"];
+  let KeyCol = ["20"];
+  let ValueCols = [["12"]];
+}
+
+// Return the version of an instruction that has a signed 20-bit displacement.
+def getDisp20Opcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["DispKey"];
+  let ColFields = ["DispSize"];
+  let KeyCol = ["12"];
+  let ValueCols = [["20"]];
+}
+
+// Return the memory form of a register instruction.
+def getMemOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["OpKey"];
+  let ColFields = ["OpType"];
+  let KeyCol = ["reg"];
+  let ValueCols = [["mem"]];
+}
+
+// Return the 3-operand form of a 2-operand instruction.
+def getThreeOperandOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["NumOpsKey"];
+  let ColFields = ["NumOpsValue"];
+  let KeyCol = ["2"];
+  let ValueCols = [["3"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+//
+// Formats are specified using operand field declarations of the form:
+//
+//   bits<4> Rn   : register input or output for operand n
+//   bits<m> In   : immediate value of width m for operand n
+//   bits<4> BDn  : address operand n, which has a base and a displacement
+//   bits<m> XBDn : address operand n, which has an index, a base and a
+//                  displacement
+//   bits<4> Xn   : index register for address operand n
+//   bits<4> Mn   : mode value for operand n
+//
+// The operand numbers ("n" in the list above) follow the architecture manual.
+// Assembly operands sometimes have a different order; in particular, R3 often
+// is often written between operands 1 and 2.
+//
+//===----------------------------------------------------------------------===//
+
+class InstRI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<16> I2;
+
+  let Inst{31-24} = op{11-4};
+  let Inst{23-20} = R1;
+  let Inst{19-16} = op{3-0};
+  let Inst{15-0}  = I2;
+}
+
+class InstRIEb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M3;
+  bits<16> RI4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R2;
+  let Inst{31-16} = RI4;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<8> I2;
+  bits<4> M3;
+  bits<16> RI4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = M3;
+  let Inst{31-16} = RI4;
+  let Inst{15-8}  = I2;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> I2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-16} = I2;
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<8> I3;
+  bits<8> I4;
+  bits<8> I5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R2;
+  let Inst{31-24} = I3;
+  let Inst{23-16} = I4;
+  let Inst{15-8}  = I5;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIL<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<32> I2;
+
+  let Inst{47-40} = op{11-4};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = op{3-0};
+  let Inst{31-0}  = I2;
+}
+
+class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<2, outs, ins, asmstr, pattern> {
+  field bits<16> Inst;
+  field bits<16> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+
+  let Inst{15-8} = op;
+  let Inst{7-4}  = R1;
+  let Inst{3-0}  = R2;
+}
+
+class InstRRD<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<4> R2;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = R1;
+  let Inst{11-8}  = 0;
+  let Inst{7-4}   = R3;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+
+  let Inst{31-16} = op;
+  let Inst{15-8}  = 0;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> R3;
+  bits<4> R4;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = R3;
+  let Inst{11-8}  = R4;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRX<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<20> XBD2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = R1;
+  let Inst{19-0}  = XBD2;
+
+  let HasIndex = 1;
+}
+
+class InstRXE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<20> XBD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-16} = XBD2;
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+
+  let HasIndex = 1;
+}
+
+class InstRXF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<20> XBD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R3;
+  let Inst{35-16} = XBD2;
+  let Inst{15-12} = R1;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+
+  let HasIndex = 1;
+}
+
+class InstRXY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<28> XBD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-8}  = XBD2;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+}
+
+class InstRS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> BD2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = R1;
+  let Inst{19-16} = R3;
+  let Inst{15-0}  = BD2;
+}
+
+class InstRSY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<24> BD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-8}  = BD2;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+}
+
+class InstSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<16> BD1;
+  bits<8> I2;
+
+  let Inst{31-24} = op;
+  let Inst{23-16} = I2;
+  let Inst{15-0}  = BD1;
+}
+
+class InstSIL<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<16> BD1;
+  bits<16> I2;
+
+  let Inst{47-32} = op;
+  let Inst{31-16} = BD1;
+  let Inst{15-0}  = I2;
+}
+
+class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<24> BD1;
+  bits<8> I2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-32} = I2;
+  let Inst{31-8}  = BD1;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+}
+
+class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<24> BDL1;
+  bits<16> BD2;
+
+  let Inst{47-40} = op;
+  let Inst{39-16} = BDL1;
+  let Inst{15-0}  = BD2;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions with semantics
+//===----------------------------------------------------------------------===//
+//
+// These classes have the form [Cond]<Category><Format>, where <Format> is one
+// of the formats defined above and where <Category> describes the inputs
+// and outputs.  "Cond" is used if the instruction is conditional,
+// in which case the 4-bit condition-code mask is added as a final operand.
+// <Category> can be one of:
+//
+//   Inherent:
+//     One register output operand and no input operands.
+//
+//   BranchUnary:
+//     One register output operand, one register input operand and
+//     one branch displacement.  The instructions stores a modified
+//     form of the source register in the destination register and
+//     branches on the result.
+//
+//   Store:
+//     One register or immediate input operand and one address input operand.
+//     The instruction stores the first operand to the address.
+//
+//     This category is used for both pure and truncating stores.
+//
+//   LoadMultiple:
+//     One address input operand and two explicit output operands.
+//     The instruction loads a range of registers from the address,
+//     with the explicit operands giving the first and last register
+//     to load.  Other loaded registers are added as implicit definitions.
+//
+//   StoreMultiple:
+//     Two explicit input register operands and an address operand.
+//     The instruction stores a range of registers to the address,
+//     with the explicit operands giving the first and last register
+//     to store.  Other stored registers are added as implicit uses.
+//
+//   Unary:
+//     One register output operand and one input operand.
+//
+//   Binary:
+//     One register output operand and two input operands.
+//
+//   Compare:
+//     Two input operands and an implicit CC output operand.
+//
+//   Ternary:
+//     One register output operand and three input operands.
+//
+//   LoadAndOp:
+//     One output operand and two input operands, one of which is an address.
+//     The instruction both reads from and writes to the address.
+//
+//   CmpSwap:
+//     One output operand and three input operands, one of which is an address.
+//     The instruction both reads from and writes to the address.
+//
+//   RotateSelect:
+//     One output operand and five input operands.  The first two operands
+//     are registers and the other three are immediates.
+//
+//   Prefetch:
+//     One 4-bit immediate operand and one address operand.  The immediate
+//     operand is 1 for a load prefetch and 2 for a store prefetch.
+//
+// The format determines which input operands are tied to output operands,
+// and also determines the shape of any address operand.
+//
+// Multiclasses of the form <Category><Format>Pair define two instructions,
+// one with <Category><Format> and one with <Category><Format>Y.  The name
+// of the first instruction has no suffix, the name of the second has
+// an extra "y".
+//
+//===----------------------------------------------------------------------===//
+
+class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                  dag src>
+  : InstRRE<opcode, (outs cls:$R1), (ins),
+            mnemonic#"\t$R1",
+            [(set cls:$R1, src)]> {
+  let R2 = 0;
+}
+
+class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
+  : InstRI<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$I2),
+           mnemonic##"\t$R1, $I2", []> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRSY<opcode, (outs cls:$R1, cls:$R3), (ins bdaddr20only:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2", []> {
+  let mayLoad = 1;
+}
+
+class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                 RegisterOperand cls>
+  : InstRIL<opcode, (outs), (ins cls:$R1, pcrel32:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, pcrel32:$I2)]> {
+  let mayStore = 1;
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class StoreRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              RegisterOperand cls, bits<5> bytes,
+              AddressingMode mode = bdxaddr12only>
+  : InstRX<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(operator cls:$R1, mode:$XBD2)]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class StoreRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls, bits<5> bytes,
+               AddressingMode mode = bdxaddr20only>
+  : InstRXY<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, mode:$XBD2)]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass StoreRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                       SDPatternOperator operator, RegisterOperand cls,
+                       bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : StoreRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+                        bdxaddr20pair>;
+  }
+}
+
+class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRSY<opcode, (outs), (ins cls:$R1, cls:$R3, bdaddr20only:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2", []> {
+  let mayStore = 1;
+}
+
+// StoreSI* instructions are used to store an integer to memory, but the
+// addresses are more restricted than for normal stores.  If we are in the
+// situation of having to force either the address into a register or the
+// constant into a register, it's usually better to do the latter.
+// We therefore match the address in the same way as a normal store and
+// only use the StoreSI* instruction if the matched address is suitable.
+class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              Immediate imm>
+  : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(operator imm:$I2, mviaddr12pair:$BD1)]> {
+  let mayStore = 1;
+}
+
+class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               Immediate imm>
+  : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator imm:$I2, mviaddr20pair:$BD1)]> {
+  let mayStore = 1;
+}
+
+class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               Immediate imm>
+  : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator imm:$I2, mviaddr12pair:$BD1)]> {
+  let mayStore = 1;
+}
+
+multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+                       SDPatternOperator operator, Immediate imm> {
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
+      def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
+    let DispSize = "20" in
+      def Y  : StoreSIY<mnemonic#"y", siyOpcode, operator, imm>;
+  }
+}
+
+class CondStoreRSY<string mnemonic, bits<16> opcode,
+                   RegisterOperand cls, bits<5> bytes,
+                   AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, cond4:$valid, cond4:$R3),
+            mnemonic#"$R3\t$R1, $BD2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondStoreRSY, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondStoreRSY<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, bits<5> bytes,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, imm32zx4:$R3),
+            mnemonic#"\t$R1, $BD2, $R3", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+// Like CondStoreRSY, but with a fixed CC mask.
+class FixedCondStoreRSY<string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, bits<4> ccmask, bits<5> bytes,
+                        AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2),
+            mnemonic#"\t$R1, $BD2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let R3 = ccmask;
+}
+
+class UnaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode, (outs cls1:$R1), (ins cls2:$R2),
+           mnemonic#"r\t$R1, $R2",
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+}
+
+class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode, (outs cls1:$R1), (ins cls2:$R2),
+            mnemonic#"r\t$R1, $R2",
+            [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+}
+
+class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+               RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins imm32zx4:$R3, cls2:$R2),
+            mnemonic#"r\t$R1, $R3, $R2", []> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let R4 = 0;
+}
+
+class UnaryRRF4<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins imm32zx4:$R3, cls2:$R2, imm32zx4:$R4),
+            mnemonic#"\t$R1, $R3, $R2, $R4", []>;
+
+// These instructions are generated by if conversion.  The old value of R1
+// is added as an implicit use.
+class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                   RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls2:$R2, cond4:$valid, cond4:$R3),
+            mnemonic#"r$R3\t$R1, $R2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let CCMaskLast = 1;
+  let R4 = 0;
+}
+
+// Like CondUnaryRRF, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                      RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, imm32zx4:$R3),
+            mnemonic#"r\t$R1, $R2, $R3", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let R4 = 0;
+}
+
+// Like CondUnaryRRF, but with a fixed CC mask.
+class FixedCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                        RegisterOperand cls2, bits<4> ccmask>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+            mnemonic#"\t$R1, $R2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let R3 = ccmask;
+  let R4 = 0;
+}
+
+class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+              RegisterOperand cls, Immediate imm>
+  : InstRI<opcode, (outs cls:$R1), (ins imm:$I2),
+           mnemonic#"\t$R1, $I2",
+           [(set cls:$R1, (operator imm:$I2))]>;
+
+class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+               RegisterOperand cls, Immediate imm>
+  : InstRIL<opcode, (outs cls:$R1), (ins imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator imm:$I2))]>;
+
+class UnaryRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                 RegisterOperand cls>
+  : InstRIL<opcode, (outs cls:$R1), (ins pcrel32:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator pcrel32:$I2))]> {
+  let mayLoad = 1;
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class CondUnaryRSY<string mnemonic, bits<16> opcode,
+                   SDPatternOperator operator, RegisterOperand cls,
+                   bits<5> bytes, AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1),
+            (ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$R3),
+            mnemonic#"$R3\t$R1, $BD2",
+            [(set cls:$R1,
+                  (z_select_ccmask (load bdaddr20only:$BD2), cls:$R1src,
+                                   cond4:$valid, cond4:$R3))]>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondUnaryRSY, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, bits<5> bytes,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, imm32zx4:$R3),
+            mnemonic#"\t$R1, $BD2, $R3", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Like CondUnaryRSY, but with a fixed CC mask.
+class FixedCondUnaryRSY<string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, bits<4> ccmask, bits<5> bytes,
+                        AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2),
+            mnemonic#"\t$R1, $BD2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let R3 = ccmask;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              RegisterOperand cls, bits<5> bytes,
+              AddressingMode mode = bdxaddr12only>
+  : InstRX<opcode, (outs cls:$R1), (ins mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class UnaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls, bits<5> bytes>
+  : InstRXE<opcode, (outs cls:$R1), (ins bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator bdxaddr12only:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class UnaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls, bits<5> bytes,
+               AddressingMode mode = bdxaddr20only>
+  : InstRXY<opcode, (outs cls:$R1), (ins mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                       SDPatternOperator operator, RegisterOperand cls,
+                       bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : UnaryRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+                        bdxaddr20pair>;
+  }
+}
+
+class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+           mnemonic#"r\t$R1, $R2",
+           [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+            mnemonic#"r\t$R1, $R2",
+            [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R3, cls2:$R2),
+            mnemonic#"r\t$R1, $R3, $R2",
+            [(set cls1:$R1, (operator cls1:$R3, cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let R4 = 0;
+}
+
+class BinaryRRFK<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R2, cls2:$R3),
+            mnemonic#"rk\t$R1, $R2, $R3",
+            [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]> {
+  let R4 = 0;
+}
+
+multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls1,
+                        RegisterOperand cls2> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRRFK<mnemonic, opcode2, null_frag, cls1, cls2>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRR<mnemonic, opcode1, operator, cls1, cls2>;
+  }
+}
+
+multiclass BinaryRREAndK<string mnemonic, bits<16> opcode1, bits<16> opcode2,
+                         SDPatternOperator operator, RegisterOperand cls1,
+                         RegisterOperand cls2> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRRFK<mnemonic, opcode2, null_frag, cls1, cls2>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRRE<mnemonic, opcode1, operator, cls1, cls2>;
+  }
+}
+
+class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+               RegisterOperand cls, Immediate imm>
+  : InstRI<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+           mnemonic#"\t$R1, $I2",
+           [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRIEd<opcode, (outs cls:$R1), (ins cls:$R3, imm:$I2),
+             mnemonic#"\t$R1, $R3, $I2",
+             [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls,
+                        Immediate imm> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIE<mnemonic##"k", opcode2, null_frag, cls, imm>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
+  }
+}
+
+class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRIL<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               RegisterOperand cls>
+  : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2),
+           mnemonic#"\t$R1, $BD2",
+           [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
+  let R3 = 0;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2",
+            [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>;
+
+multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K  : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>,
+               Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
+  }
+}
+
+class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+               AddressingMode mode = bdxaddr12only>
+  : InstRX<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src,
+                                     (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+                AddressingMode mode = bdxaddr20only>
+  : InstRXY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                        SDPatternOperator operator, RegisterOperand cls,
+                        SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
+                        bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load, bytes,
+                         bdxaddr20pair>;
+  }
+}
+
+class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               Operand imm, AddressingMode mode = bdaddr12only>
+  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                Operand imm, AddressingMode mode = bdaddr20only>
+  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
+                        bits<16> siyOpcode, SDPatternOperator operator,
+                        Operand imm> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : BinarySI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : BinarySIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+  }
+}
+
+class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+           mnemonic#"r\t$R1, $R2",
+           [(operator cls1:$R1, cls2:$R2)]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let isCompare = 1;
+}
+
+class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+            mnemonic#"r\t$R1, $R2",
+            [(operator cls1:$R1, cls2:$R2)]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let isCompare = 1;
+}
+
+class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRI<opcode, (outs), (ins cls:$R1, imm:$I2),
+           mnemonic#"\t$R1, $I2",
+           [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, Immediate imm>
+  : InstRIL<opcode, (outs), (ins cls:$R1, imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                   RegisterOperand cls, SDPatternOperator load>
+  : InstRIL<opcode, (outs), (ins cls:$R1, pcrel32:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, (load pcrel32:$I2))]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+                AddressingMode mode = bdxaddr12only>
+  : InstRX<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(operator cls:$R1, (load mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+                 AddressingMode mode = bdxaddr20only>
+  : InstRXY<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, (load mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                         SDPatternOperator operator, RegisterOperand cls,
+                         SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : CompareRX<mnemonic, rxOpcode, operator, cls,
+                         load, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : CompareRXY<mnemonic#"y", rxyOpcode, operator, cls,
+                          load, bytes, bdxaddr20pair>;
+  }
+}
+
+class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                SDPatternOperator load, Immediate imm,
+                AddressingMode mode = bdaddr12only>
+  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(operator (load mode:$BD1), imm:$I2)]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+}
+
+class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 SDPatternOperator load, Immediate imm>
+  : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+}
+
+class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 SDPatternOperator load, Immediate imm,
+                 AddressingMode mode = bdaddr20only>
+  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator (load mode:$BD1), imm:$I2)]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+}
+
+multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+                         SDPatternOperator operator, SDPatternOperator load,
+                         Immediate imm> {
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
+      def "" : CompareSI<mnemonic, siOpcode, operator, load, imm, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : CompareSIY<mnemonic#"y", siyOpcode, operator, load, imm,
+                          bdaddr20pair>;
+  }
+}
+
+class TernaryRRD<string mnemonic, bits<16> opcode,
+                 SDPatternOperator operator, RegisterOperand cls>
+  : InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
+            mnemonic#"r\t$R1, $R3, $R2",
+            [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXF<opcode, (outs cls:$R1),
+            (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $R3, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src, cls:$R3,
+                                     (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  RegisterOperand cls, AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, mode:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2",
+            [(set cls:$R1, (operator mode:$BD2, cls:$R3))]> {
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                RegisterOperand cls, AddressingMode mode = bdaddr12only>
+  : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+           mnemonic#"\t$R1, $R3, $BD2",
+           [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2",
+            [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+                         SDPatternOperator operator, RegisterOperand cls> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : CmpSwapRS<mnemonic, rsOpcode, operator, cls, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : CmpSwapRSY<mnemonic#"y", rsyOpcode, operator, cls, bdaddr20pair>;
+  }
+}
+
+class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                       RegisterOperand cls2>
+  : InstRIEf<opcode, (outs cls1:$R1),
+             (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+                  imm32zx6:$I5),
+             mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
+  : InstRXY<opcode, (outs), (ins imm32zx4:$R1, bdxaddr20only:$XBD2),
+            mnemonic##"\t$R1, $XBD2",
+            [(operator imm32zx4:$R1, bdxaddr20only:$XBD2)]>;
+
+class PrefetchRILPC<string mnemonic, bits<12> opcode,
+                    SDPatternOperator operator>
+  : InstRIL<opcode, (outs), (ins imm32zx4:$R1, pcrel32:$I2),
+            mnemonic##"\t$R1, $I2",
+            [(operator imm32zx4:$R1, pcrel32:$I2)]> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+// A floating-point load-and test operation.  Create both a normal unary
+// operation and one that acts as a comparison against zero.
+multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
+                          RegisterOperand cls> {
+  def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
+  let isCodeGenOnly = 1 in
+    def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+//
+// Convenience instructions that get lowered to real instructions
+// by either SystemZTargetLowering::EmitInstrWithCustomInserter()
+// or SystemZInstrInfo::expandPostRAPseudo().
+//
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<0, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+// Like UnaryRI, but expanded after RA depending on the choice of register.
+class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins imm:$I2),
+           [(set cls:$R1, (operator imm:$I2))]>;
+
+// Like UnaryRXY, but expanded after RA depending on the choice of register.
+class UnaryRXYPseudo<string key, SDPatternOperator operator,
+                     RegisterOperand cls, bits<5> bytes,
+                     AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs cls:$R1), (ins mode:$XBD2),
+           [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = key ## cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like UnaryRR, but expanded after RA depending on the choice of registers.
+class UnaryRRPseudo<string key, SDPatternOperator operator,
+                    RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1), (ins cls2:$R2),
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = key ## cls1;
+  let OpType = "reg";
+}
+
+// Like BinaryRI, but expanded after RA depending on the choice of register.
+class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
+           [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// Like BinaryRIE, but expanded after RA depending on the choice of register.
+class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
+           [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
+multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
+                              RegisterOperand cls, Immediate imm> {
+  let NumOpsKey = key in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIEPseudo<null_frag, cls, imm>,
+              Requires<[FeatureHighWord, FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRIPseudo<operator, cls, imm>,
+               Requires<[FeatureHighWord]>;
+  }
+}
+
+// Like CompareRI, but expanded after RA depending on the choice of register.
+class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]>;
+
+// Like CompareRXY, but expanded after RA depending on the choice of register.
+class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                       SDPatternOperator load, bits<5> bytes,
+                       AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, (load mode:$XBD2))]> {
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like StoreRXY, but expanded after RA depending on the choice of register.
+class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     bits<5> bytes, AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, mode:$XBD2)]> {
+  let mayStore = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like RotateSelectRIEf, but expanded after RA depending on the choice
+// of registers.
+class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1),
+           (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+                imm32zx6:$I5),
+           []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
+// the value of the PSW's 2-bit condition code field.
+class SelectWrapper<RegisterOperand cls>
+  : Pseudo<(outs cls:$dst),
+           (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc),
+           [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2,
+                                            imm32zx4:$valid, imm32zx4:$cc))]> {
+  let usesCustomInserter = 1;
+  // Although the instructions used by these nodes do not in themselves
+  // change CC, the insertion requires new blocks, and CC cannot be live
+  // across them.
+  let Defs = [CC];
+  let Uses = [CC];
+}
+
+// Stores $new to $addr if $cc is true ("" case) or false (Inv case).
+multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
+                      SDPatternOperator load, AddressingMode mode> {
+  let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in {
+    def "" : Pseudo<(outs),
+                    (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
+                    [(store (z_select_ccmask cls:$new, (load mode:$addr),
+                                             imm32zx4:$valid, imm32zx4:$cc),
+                            mode:$addr)]>;
+    def Inv : Pseudo<(outs),
+                     (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
+                     [(store (z_select_ccmask (load mode:$addr), cls:$new,
+                                              imm32zx4:$valid, imm32zx4:$cc),
+                              mode:$addr)]>;
+  }
+}
+
+// OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation.  PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls,
+                       dag pat, DAGOperand operand>
+  : Pseudo<(outs cls:$dst), (ins bdaddr20only:$ptr, operand:$src2),
+           [(set cls:$dst, (operator bdaddr20only:$ptr, pat))]> {
+  let Defs = [CC];
+  let Has20BitOffset = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+}
+
+// Specializations of AtomicLoadWBinary.
+class AtomicLoadBinaryReg32<SDPatternOperator operator>
+  : AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>;
+class AtomicLoadBinaryImm32<SDPatternOperator operator, Immediate imm>
+  : AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>;
+class AtomicLoadBinaryReg64<SDPatternOperator operator>
+  : AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>;
+class AtomicLoadBinaryImm64<SDPatternOperator operator, Immediate imm>
+  : AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>;
+
+// OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation.  PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadWBinary<SDPatternOperator operator, dag pat,
+                        DAGOperand operand>
+  : Pseudo<(outs GR32:$dst),
+           (ins bdaddr20only:$ptr, operand:$src2, ADDR32:$bitshift,
+                ADDR32:$negbitshift, uimm32:$bitsize),
+           [(set GR32:$dst, (operator bdaddr20only:$ptr, pat, ADDR32:$bitshift,
+                                      ADDR32:$negbitshift, uimm32:$bitsize))]> {
+  let Defs = [CC];
+  let Has20BitOffset = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+}
+
+// Specializations of AtomicLoadWBinary.
+class AtomicLoadWBinaryReg<SDPatternOperator operator>
+  : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
+class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
+  : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+                    SDPatternOperator sequence, SDPatternOperator loop> {
+  def "" : InstSS<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
+                                       bdaddr12only:$BD2),
+                  mnemonic##"\t$BDL1, $BD2", []>;
+  let usesCustomInserter = 1 in {
+    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                       imm64:$length),
+                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+                                      imm64:$length)]>;
+    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                   imm64:$length, GR64:$count256),
+                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                             imm64:$length, GR64:$count256)]>;
+  }
+}
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0.  The instruction processes a CPU-determinated
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated.  Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator> {
+  def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
+                   (ins GR64:$R1src, GR64:$R2src),
+                   mnemonic#"\t$R1, $R2", []> {
+    let Constraints = "$R1 = $R1src, $R2 = $R2src";
+    let DisableEncoding = "$R1src, $R2src";
+  }
+  let usesCustomInserter = 1 in
+    def Loop : Pseudo<(outs GR64:$end),
+                      (ins GR64:$start1, GR64:$start2, GR32:$char),
+                      [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+                                                 GR32:$char))]>;
+}
+
+// A pseudo instruction that is a direct alias of a real instruction.
+// These aliases are used in cases where a particular register operand is
+// fixed or where the same instruction is used with different register sizes.
+// The size parameter is the size in bytes of the associated real instruction.
+class Alias<int size, dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<size, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+// An alias of a BinaryRI, but with different register sizes.
+class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryRIL, but with different register sizes.
+class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a CompareRI, but with different register sizes.
+class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// An alias of a RotateSelectRIEf, but with different register sizes.
+class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
+  : Alias<6, (outs cls1:$R1),
+          (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+               imm32zx6:$I5), []> {
+  let Constraints = "$R1 = $R1src";
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
new file mode 100644
index 0000000..f58ab47
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -0,0 +1,1247 @@
+//===-- SystemZInstrInfo.cpp - SystemZ instruction information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "SystemZGenInstrInfo.inc"
+
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+  return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
+}
+
+// Reg should be a 32-bit GPR.  Return true if it is a high register rather
+// than a low register.
+static bool isHighReg(unsigned int Reg) {
+  if (SystemZ::GRH32BitRegClass.contains(Reg))
+    return true;
+  assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
+  return false;
+}
+
+// Pin the vtable to this file.
+void SystemZInstrInfo::anchor() {}
+
+SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
+  : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
+    RI(), STI(sti) {
+}
+
+// MI is a 128-bit load or store.  Split it into two 64-bit loads or stores,
+// each having the opcode given by NewOpcode.
+void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
+                                 unsigned NewOpcode) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+
+  // Get two load or store instructions.  Use the original instruction for one
+  // of them (arbitrarily the second here) and create a clone for the other.
+  MachineInstr *EarlierMI = MF.CloneMachineInstr(MI);
+  MBB->insert(MI, EarlierMI);
+
+  // Set up the two 64-bit registers.
+  MachineOperand &HighRegOp = EarlierMI->getOperand(0);
+  MachineOperand &LowRegOp = MI->getOperand(0);
+  HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
+  LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
+
+  // The address in the first (high) instruction is already correct.
+  // Adjust the offset in the second (low) instruction.
+  MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
+  MachineOperand &LowOffsetOp = MI->getOperand(2);
+  LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
+
+  // Set the opcodes.
+  unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm());
+  unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm());
+  assert(HighOpcode && LowOpcode && "Both offsets should be in range");
+
+  EarlierMI->setDesc(get(HighOpcode));
+  MI->setDesc(get(LowOpcode));
+}
+
+// Split ADJDYNALLOC instruction MI.
+void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineFrameInfo *MFFrame = MF.getFrameInfo();
+  MachineOperand &OffsetMO = MI->getOperand(2);
+
+  uint64_t Offset = (MFFrame->getMaxCallFrameSize() +
+                     SystemZMC::CallFrameSize +
+                     OffsetMO.getImm());
+  unsigned NewOpcode = getOpcodeForOffset(SystemZ::LA, Offset);
+  assert(NewOpcode && "No support for huge argument lists yet");
+  MI->setDesc(get(NewOpcode));
+  OffsetMO.setImm(Offset);
+}
+
+// MI is an RI-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.  ConvertHigh is true if LowOpcode takes a signed operand
+// and HighOpcode takes an unsigned 32-bit operand.  In those cases,
+// MI has the same kind of operand as LowOpcode, so needs to be converted
+// if HighOpcode is used.
+void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                      unsigned HighOpcode,
+                                      bool ConvertHigh) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  bool IsHigh = isHighReg(Reg);
+  MI->setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+  if (IsHigh && ConvertHigh)
+    MI->getOperand(1).setImm(uint32_t(MI->getOperand(1).getImm()));
+}
+
+// MI is a three-operand RIE-style pseudo instruction.  Replace it with
+// LowOpcode3 if the registers are both low GR32s, otherwise use a move
+// followed by HighOpcode or LowOpcode, depending on whether the target
+// is a high or low GR32.
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                       unsigned LowOpcodeK,
+                                       unsigned HighOpcode) const {
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (!DestIsHigh && !SrcIsHigh)
+    MI->setDesc(get(LowOpcodeK));
+  else {
+    emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+                  DestReg, SrcReg, SystemZ::LR, 32,
+                  MI->getOperand(1).isKill());
+    MI->setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+    MI->getOperand(1).setReg(DestReg);
+  }
+}
+
+// MI is an RXY-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                       unsigned HighOpcode) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
+                                       MI->getOperand(2).getImm());
+  MI->setDesc(get(Opcode));
+}
+
+// MI is an RR-style pseudo instruction that zero-extends the low Size bits
+// of one GRX32 into another.  Replace it with LowOpcode if both operands
+// are low registers, otherwise use RISB[LH]G.
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                        unsigned Size) const {
+  emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+                MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
+                LowOpcode, Size, MI->getOperand(1).isKill());
+  MI->eraseFromParent();
+}
+
+// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
+// DestReg before MBBI in MBB.  Use LowLowOpcode when both DestReg and SrcReg
+// are low registers, otherwise use RISB[LH]G.  Size is the number of bits
+// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
+// KillSrc is true if this move is the last use of SrcReg.
+void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, unsigned DestReg,
+                                     unsigned SrcReg, unsigned LowLowOpcode,
+                                     unsigned Size, bool KillSrc) const {
+  unsigned Opcode;
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBHH;
+  else if (DestIsHigh && !SrcIsHigh)
+    Opcode = SystemZ::RISBHL;
+  else if (!DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBLH;
+  else {
+    BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+    .addReg(DestReg, RegState::Undef)
+    .addReg(SrcReg, getKillRegState(KillSrc))
+    .addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
+}
+
+// If MI is a simple load or store for a frame object, return the register
+// it loads or stores and set FrameIndex to the index of the frame object.
+// Return 0 otherwise.
+//
+// Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static int isSimpleMove(const MachineInstr *MI, int &FrameIndex,
+                        unsigned Flag) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  if ((MCID.TSFlags & Flag) &&
+      MI->getOperand(1).isFI() &&
+      MI->getOperand(2).getImm() == 0 &&
+      MI->getOperand(3).getReg() == 0) {
+    FrameIndex = MI->getOperand(1).getIndex();
+    return MI->getOperand(0).getReg();
+  }
+  return 0;
+}
+
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                               int &FrameIndex) const {
+  return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXLoad);
+}
+
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+  return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXStore);
+}
+
+bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr *MI,
+                                       int &DestFrameIndex,
+                                       int &SrcFrameIndex) const {
+  // Check for MVC 0(Length,FI1),0(FI2)
+  const MachineFrameInfo *MFI = MI->getParent()->getParent()->getFrameInfo();
+  if (MI->getOpcode() != SystemZ::MVC ||
+      !MI->getOperand(0).isFI() ||
+      MI->getOperand(1).getImm() != 0 ||
+      !MI->getOperand(3).isFI() ||
+      MI->getOperand(4).getImm() != 0)
+    return false;
+
+  // Check that Length covers the full slots.
+  int64_t Length = MI->getOperand(2).getImm();
+  unsigned FI1 = MI->getOperand(0).getIndex();
+  unsigned FI2 = MI->getOperand(3).getIndex();
+  if (MFI->getObjectSize(FI1) != Length ||
+      MFI->getObjectSize(FI2) != Length)
+    return false;
+
+  DestFrameIndex = FI1;
+  SrcFrameIndex = FI2;
+  return true;
+}
+
+bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
+  // Most of the code and comments here are boilerplate.
+
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator instruction, we're
+    // done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled by this
+    // analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Can't handle indirect branches.
+    SystemZII::Branch Branch(getBranchInfo(I));
+    if (!Branch.Target->isMBB())
+      return true;
+
+    // Punt on compound branches.
+    if (Branch.Type != SystemZII::BranchNormal)
+      return true;
+
+    if (Branch.CCMask == SystemZ::CCMASK_ANY) {
+      // Handle unconditional branches.
+      if (!AllowModify) {
+        TBB = Branch.Target->getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = Branch.Target->getMBB();
+      continue;
+    }
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      // FIXME: add X86-style branch swap
+      FBB = TBB;
+      TBB = Branch.Target->getMBB();
+      Cond.push_back(MachineOperand::CreateImm(Branch.CCValid));
+      Cond.push_back(MachineOperand::CreateImm(Branch.CCMask));
+      continue;
+    }
+
+    // Handle subsequent conditional branches.
+    assert(Cond.size() == 2 && TBB && "Should have seen a conditional branch");
+
+    // Only handle the case where all conditional branches branch to the same
+    // destination.
+    if (TBB != Branch.Target->getMBB())
+      return true;
+
+    // If the conditions are the same, we can leave them alone.
+    unsigned OldCCValid = Cond[0].getImm();
+    unsigned OldCCMask = Cond[1].getImm();
+    if (OldCCValid == Branch.CCValid && OldCCMask == Branch.CCMask)
+      continue;
+
+    // FIXME: Try combining conditions like X86 does.  Should be easy on Z!
+    return false;
+  }
+
+  return false;
+}
+
+unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  // Most of the code and comments here are boilerplate.
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (!I->isBranch())
+      break;
+    if (!getBranchInfo(I).Target->isMBB())
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool SystemZInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid condition");
+  Cond[1].setImm(Cond[1].getImm() ^ Cond[0].getImm());
+  return false;
+}
+
+unsigned
+SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                               MachineBasicBlock *FBB,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               DebugLoc DL) const {
+  // In this function we output 32-bit branches, which should always
+  // have enough range.  They can be shortened and relaxed by later code
+  // in the pipeline, if desired.
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "SystemZ branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  unsigned CCValid = Cond[0].getImm();
+  unsigned CCMask = Cond[1].getImm();
+  BuildMI(&MBB, DL, get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(TBB);
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                      unsigned &SrcReg, unsigned &SrcReg2,
+                                      int &Mask, int &Value) const {
+  assert(MI->isCompare() && "Caller should have checked for a comparison");
+
+  if (MI->getNumExplicitOperands() == 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isImm()) {
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    Value = MI->getOperand(1).getImm();
+    Mask = ~0;
+    return true;
+  }
+
+  return false;
+}
+
+// If Reg is a virtual register, return its definition, otherwise return null.
+static MachineInstr *getDef(unsigned Reg,
+                            const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return nullptr;
+  return MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI is a shift of type Opcode by Imm bits.
+static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
+  return (MI->getOpcode() == Opcode &&
+          !MI->getOperand(2).getReg() &&
+          MI->getOperand(3).getImm() == Imm);
+}
+
+// If the destination of MI has no uses, delete it as dead.
+static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
+  if (MRI->use_nodbg_empty(MI->getOperand(0).getReg()))
+    MI->eraseFromParent();
+}
+
+// Compare compares SrcReg against zero.  Check whether SrcReg contains
+// the result of an IPM sequence whose input CC survives until Compare,
+// and whether Compare is therefore redundant.  Delete it and return
+// true if so.
+static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
+                                  const MachineRegisterInfo *MRI,
+                                  const TargetRegisterInfo *TRI) {
+  MachineInstr *LGFR = nullptr;
+  MachineInstr *RLL = getDef(SrcReg, MRI);
+  if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
+    LGFR = RLL;
+    RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
+  }
+  if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
+    return false;
+
+  MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI);
+  if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC))
+    return false;
+
+  MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI);
+  if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+    return false;
+
+  // Check that there are no assignments to CC between the IPM and Compare,
+  if (IPM->getParent() != Compare->getParent())
+    return false;
+  MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MachineInstr *MI = MBBI;
+    if (MI->modifiesRegister(SystemZ::CC, TRI))
+      return false;
+  }
+
+  Compare->eraseFromParent();
+  if (LGFR)
+    eraseIfDead(LGFR, MRI);
+  eraseIfDead(RLL, MRI);
+  eraseIfDead(SRL, MRI);
+  eraseIfDead(IPM, MRI);
+
+  return true;
+}
+
+bool
+SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
+                                       unsigned SrcReg, unsigned SrcReg2,
+                                       int Mask, int Value,
+                                       const MachineRegisterInfo *MRI) const {
+  assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+  bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+  if (Value == 0 &&
+      !IsLogical &&
+      removeIPMBasedCompare(Compare, SrcReg, MRI, &RI))
+    return true;
+  return false;
+}
+
+// If Opcode is a move that has a conditional variant, return that variant,
+// otherwise return 0.
+static unsigned getConditionalMove(unsigned Opcode) {
+  switch (Opcode) {
+  case SystemZ::LR:  return SystemZ::LOCR;
+  case SystemZ::LGR: return SystemZ::LOCGR;
+  default:           return 0;
+  }
+}
+
+bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const {
+  unsigned Opcode = MI->getOpcode();
+  if (STI.hasLoadStoreOnCond() &&
+      getConditionalMove(Opcode))
+    return true;
+  return false;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    const BranchProbability &Probability) const {
+  // For now only convert single instructions.
+  return NumCycles == 1;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+                    MachineBasicBlock &FMBB,
+                    unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+                    const BranchProbability &Probability) const {
+  // For now avoid converting mutually-exclusive cases.
+  return false;
+}
+
+bool SystemZInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  assert(Pred.size() == 2 && "Invalid condition");
+  unsigned CCValid = Pred[0].getImm();
+  unsigned CCMask = Pred[1].getImm();
+  assert(CCMask > 0 && CCMask < 15 && "Invalid predicate");
+  unsigned Opcode = MI->getOpcode();
+  if (STI.hasLoadStoreOnCond()) {
+    if (unsigned CondOpcode = getConditionalMove(Opcode)) {
+      MI->setDesc(get(CondOpcode));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addImm(CCValid).addImm(CCMask)
+        .addReg(SystemZ::CC, RegState::Implicit);
+      return true;
+    }
+  }
+  return false;
+}
+
+void
+SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+			      MachineBasicBlock::iterator MBBI, DebugLoc DL,
+			      unsigned DestReg, unsigned SrcReg,
+			      bool KillSrc) const {
+  // Split 128-bit GPR moves into two 64-bit moves.  This handles ADDR128 too.
+  if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_h64), KillSrc);
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_l64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_l64), KillSrc);
+    return;
+  }
+
+  if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
+    emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
+    return;
+  }
+
+  // Everything else needs only one instruction.
+  unsigned Opcode;
+  if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LGR;
+  else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LER;
+  else if (SystemZ::FP64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LDR;
+  else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LXR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void
+SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+				      MachineBasicBlock::iterator MBBI,
+				      unsigned SrcReg, bool isKill,
+				      int FrameIdx,
+				      const TargetRegisterClass *RC,
+				      const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Callers may expect a single instruction, so keep 128-bit moves
+  // together for now and lower them after register allocation.
+  unsigned LoadOpcode, StoreOpcode;
+  getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+  addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode))
+		    .addReg(SrcReg, getKillRegState(isKill)), FrameIdx);
+}
+
+void
+SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+				       MachineBasicBlock::iterator MBBI,
+				       unsigned DestReg, int FrameIdx,
+				       const TargetRegisterClass *RC,
+				       const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Callers may expect a single instruction, so keep 128-bit moves
+  // together for now and lower them after register allocation.
+  unsigned LoadOpcode, StoreOpcode;
+  getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+  addFrameReference(BuildMI(MBB, MBBI, DL, get(LoadOpcode), DestReg),
+                    FrameIdx);
+}
+
+// Return true if MI is a simple load or store with a 12-bit displacement
+// and no index.  Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  return ((MCID.TSFlags & Flag) &&
+          isUInt<12>(MI->getOperand(2).getImm()) &&
+          MI->getOperand(3).getReg() == 0);
+}
+
+namespace {
+struct LogicOp {
+  LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
+  LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
+    : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
+
+  operator bool() const { return RegSize; }
+
+  unsigned RegSize, ImmLSB, ImmSize;
+};
+} // end anonymous namespace
+
+static LogicOp interpretAndImmediate(unsigned Opcode) {
+  switch (Opcode) {
+  case SystemZ::NILMux: return LogicOp(32,  0, 16);
+  case SystemZ::NIHMux: return LogicOp(32, 16, 16);
+  case SystemZ::NILL64: return LogicOp(64,  0, 16);
+  case SystemZ::NILH64: return LogicOp(64, 16, 16);
+  case SystemZ::NIHL64: return LogicOp(64, 32, 16);
+  case SystemZ::NIHH64: return LogicOp(64, 48, 16);
+  case SystemZ::NIFMux: return LogicOp(32,  0, 32);
+  case SystemZ::NILF64: return LogicOp(64,  0, 32);
+  case SystemZ::NIHF64: return LogicOp(64, 32, 32);
+  default:              return LogicOp();
+  }
+}
+
+// Used to return from convertToThreeAddress after replacing two-address
+// instruction OldMI with three-address instruction NewMI.
+static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
+                                                 MachineInstr *NewMI,
+                                                 LiveVariables *LV) {
+  if (LV) {
+    unsigned NumOps = OldMI->getNumOperands();
+    for (unsigned I = 1; I < NumOps; ++I) {
+      MachineOperand &Op = OldMI->getOperand(I);
+      if (Op.isReg() && Op.isKill())
+        LV->replaceKillInstruction(Op.getReg(), OldMI, NewMI);
+    }
+  }
+  return NewMI;
+}
+
+MachineInstr *
+SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                        MachineBasicBlock::iterator &MBBI,
+                                        LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  unsigned Opcode = MI->getOpcode();
+  unsigned NumOps = MI->getNumOperands();
+
+  // Try to convert something like SLL into SLLK, if supported.
+  // We prefer to keep the two-operand form where possible both
+  // because it tends to be shorter and because some instructions
+  // have memory forms that can be used during spilling.
+  if (STI.hasDistinctOps()) {
+    MachineOperand &Dest = MI->getOperand(0);
+    MachineOperand &Src = MI->getOperand(1);
+    unsigned DestReg = Dest.getReg();
+    unsigned SrcReg = Src.getReg();
+    // AHIMux is only really a three-operand instruction when both operands
+    // are low registers.  Try to constrain both operands to be low if
+    // possible.
+    if (Opcode == SystemZ::AHIMux &&
+        TargetRegisterInfo::isVirtualRegister(DestReg) &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+        MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
+        MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
+      MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
+      MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
+    }
+    int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
+    if (ThreeOperandOpcode >= 0) {
+      MachineInstrBuilder MIB =
+        BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode))
+        .addOperand(Dest);
+      // Keep the kill state, but drop the tied flag.
+      MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
+      // Keep the remaining operands as-is.
+      for (unsigned I = 2; I < NumOps; ++I)
+        MIB.addOperand(MI->getOperand(I));
+      return finishConvertToThreeAddress(MI, MIB, LV);
+    }
+  }
+
+  // Try to convert an AND into an RISBG-type instruction.
+  if (LogicOp And = interpretAndImmediate(Opcode)) {
+    uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
+    // AND IMMEDIATE leaves the other bits of the register unchanged.
+    Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
+    unsigned Start, End;
+    if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
+      unsigned NewOpcode;
+      if (And.RegSize == 64)
+        NewOpcode = SystemZ::RISBG;
+      else {
+        NewOpcode = SystemZ::RISBMux;
+        Start &= 31;
+        End &= 31;
+      }
+      MachineOperand &Dest = MI->getOperand(0);
+      MachineOperand &Src = MI->getOperand(1);
+      MachineInstrBuilder MIB =
+        BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
+        .addOperand(Dest).addReg(0)
+        .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
+        .addImm(Start).addImm(End + 128).addImm(0);
+      return finishConvertToThreeAddress(MI, MIB, LV);
+    }
+  }
+  return nullptr;
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                        MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        int FrameIndex) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+  unsigned Opcode = MI->getOpcode();
+
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    if ((Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+        isInt<8>(MI->getOperand(2).getImm()) &&
+        !MI->getOperand(3).getReg()) {
+      // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+      return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::AGSI))
+        .addFrameIndex(FrameIndex).addImm(0)
+        .addImm(MI->getOperand(2).getImm());
+    }
+    return nullptr;
+  }
+
+  // All other cases require a single operand.
+  if (Ops.size() != 1)
+    return nullptr;
+
+  unsigned OpNum = Ops[0];
+  assert(Size == MF.getRegInfo()
+         .getRegClass(MI->getOperand(OpNum).getReg())->getSize() &&
+         "Invalid size combination");
+
+  if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) &&
+      OpNum == 0 &&
+      isInt<8>(MI->getOperand(2).getImm())) {
+    // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
+    Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
+    return BuildMI(MF, MI->getDebugLoc(), get(Opcode))
+      .addFrameIndex(FrameIndex).addImm(0)
+      .addImm(MI->getOperand(2).getImm());
+  }
+
+  if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
+    bool Op0IsGPR = (Opcode == SystemZ::LGDR);
+    bool Op1IsGPR = (Opcode == SystemZ::LDGR);
+    // If we're spilling the destination of an LDGR or LGDR, store the
+    // source register instead.
+    if (OpNum == 0) {
+      unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
+      return BuildMI(MF, MI->getDebugLoc(), get(StoreOpcode))
+        .addOperand(MI->getOperand(1)).addFrameIndex(FrameIndex)
+        .addImm(0).addReg(0);
+    }
+    // If we're spilling the source of an LDGR or LGDR, load the
+    // destination register instead.
+    if (OpNum == 1) {
+      unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
+      unsigned Dest = MI->getOperand(0).getReg();
+      return BuildMI(MF, MI->getDebugLoc(), get(LoadOpcode), Dest)
+        .addFrameIndex(FrameIndex).addImm(0).addReg(0);
+    }
+  }
+
+  // Look for cases where the source of a simple store or the destination
+  // of a simple load is being spilled.  Try to use MVC instead.
+  //
+  // Although MVC is in practice a fast choice in these cases, it is still
+  // logically a bytewise copy.  This means that we cannot use it if the
+  // load or store is volatile.  We also wouldn't be able to use MVC if
+  // the two memories partially overlap, but that case cannot occur here,
+  // because we know that one of the memories is a full frame index.
+  //
+  // For performance reasons, we also want to avoid using MVC if the addresses
+  // might be equal.  We don't worry about that case here, because spill slot
+  // coloring happens later, and because we have special code to remove
+  // MVCs that turn out to be redundant.
+  if (OpNum == 0 && MI->hasOneMemOperand()) {
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    if (MMO->getSize() == Size && !MMO->isVolatile()) {
+      // Handle conversion of loads.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXLoad)) {
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addFrameIndex(FrameIndex).addImm(0).addImm(Size)
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addMemOperand(MMO);
+      }
+      // Handle conversion of stores.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXStore)) {
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addImm(Size).addFrameIndex(FrameIndex).addImm(0)
+          .addMemOperand(MMO);
+      }
+    }
+  }
+
+  // If the spilled operand is the final one, try to change <INSN>R
+  // into <INSN>.
+  int MemOpcode = SystemZ::getMemOpcode(Opcode);
+  if (MemOpcode >= 0) {
+    unsigned NumOps = MI->getNumExplicitOperands();
+    if (OpNum == NumOps - 1) {
+      const MCInstrDesc &MemDesc = get(MemOpcode);
+      uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
+      assert(AccessBytes != 0 && "Size of access should be known");
+      assert(AccessBytes <= Size && "Access outside the frame index");
+      uint64_t Offset = Size - AccessBytes;
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(MemOpcode));
+      for (unsigned I = 0; I < OpNum; ++I)
+        MIB.addOperand(MI->getOperand(I));
+      MIB.addFrameIndex(FrameIndex).addImm(Offset);
+      if (MemDesc.TSFlags & SystemZII::HasIndex)
+        MIB.addReg(0);
+      return MIB;
+    }
+  }
+
+  return nullptr;
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        MachineInstr* LoadMI) const {
+  return nullptr;
+}
+
+bool
+SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::L128:
+    splitMove(MI, SystemZ::LG);
+    return true;
+
+  case SystemZ::ST128:
+    splitMove(MI, SystemZ::STG);
+    return true;
+
+  case SystemZ::LX:
+    splitMove(MI, SystemZ::LD);
+    return true;
+
+  case SystemZ::STX:
+    splitMove(MI, SystemZ::STD);
+    return true;
+
+  case SystemZ::LBMux:
+    expandRXYPseudo(MI, SystemZ::LB, SystemZ::LBH);
+    return true;
+
+  case SystemZ::LHMux:
+    expandRXYPseudo(MI, SystemZ::LH, SystemZ::LHH);
+    return true;
+
+  case SystemZ::LLCRMux:
+    expandZExtPseudo(MI, SystemZ::LLCR, 8);
+    return true;
+
+  case SystemZ::LLHRMux:
+    expandZExtPseudo(MI, SystemZ::LLHR, 16);
+    return true;
+
+  case SystemZ::LLCMux:
+    expandRXYPseudo(MI, SystemZ::LLC, SystemZ::LLCH);
+    return true;
+
+  case SystemZ::LLHMux:
+    expandRXYPseudo(MI, SystemZ::LLH, SystemZ::LLHH);
+    return true;
+
+  case SystemZ::LMux:
+    expandRXYPseudo(MI, SystemZ::L, SystemZ::LFH);
+    return true;
+
+  case SystemZ::STCMux:
+    expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
+    return true;
+
+  case SystemZ::STHMux:
+    expandRXYPseudo(MI, SystemZ::STH, SystemZ::STHH);
+    return true;
+
+  case SystemZ::STMux:
+    expandRXYPseudo(MI, SystemZ::ST, SystemZ::STFH);
+    return true;
+
+  case SystemZ::LHIMux:
+    expandRIPseudo(MI, SystemZ::LHI, SystemZ::IIHF, true);
+    return true;
+
+  case SystemZ::IIFMux:
+    expandRIPseudo(MI, SystemZ::IILF, SystemZ::IIHF, false);
+    return true;
+
+  case SystemZ::IILMux:
+    expandRIPseudo(MI, SystemZ::IILL, SystemZ::IIHL, false);
+    return true;
+
+  case SystemZ::IIHMux:
+    expandRIPseudo(MI, SystemZ::IILH, SystemZ::IIHH, false);
+    return true;
+
+  case SystemZ::NIFMux:
+    expandRIPseudo(MI, SystemZ::NILF, SystemZ::NIHF, false);
+    return true;
+
+  case SystemZ::NILMux:
+    expandRIPseudo(MI, SystemZ::NILL, SystemZ::NIHL, false);
+    return true;
+
+  case SystemZ::NIHMux:
+    expandRIPseudo(MI, SystemZ::NILH, SystemZ::NIHH, false);
+    return true;
+
+  case SystemZ::OIFMux:
+    expandRIPseudo(MI, SystemZ::OILF, SystemZ::OIHF, false);
+    return true;
+
+  case SystemZ::OILMux:
+    expandRIPseudo(MI, SystemZ::OILL, SystemZ::OIHL, false);
+    return true;
+
+  case SystemZ::OIHMux:
+    expandRIPseudo(MI, SystemZ::OILH, SystemZ::OIHH, false);
+    return true;
+
+  case SystemZ::XIFMux:
+    expandRIPseudo(MI, SystemZ::XILF, SystemZ::XIHF, false);
+    return true;
+
+  case SystemZ::TMLMux:
+    expandRIPseudo(MI, SystemZ::TMLL, SystemZ::TMHL, false);
+    return true;
+
+  case SystemZ::TMHMux:
+    expandRIPseudo(MI, SystemZ::TMLH, SystemZ::TMHH, false);
+    return true;
+
+  case SystemZ::AHIMux:
+    expandRIPseudo(MI, SystemZ::AHI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::AHIMuxK:
+    expandRIEPseudo(MI, SystemZ::AHI, SystemZ::AHIK, SystemZ::AIH);
+    return true;
+
+  case SystemZ::AFIMux:
+    expandRIPseudo(MI, SystemZ::AFI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::CFIMux:
+    expandRIPseudo(MI, SystemZ::CFI, SystemZ::CIH, false);
+    return true;
+
+  case SystemZ::CLFIMux:
+    expandRIPseudo(MI, SystemZ::CLFI, SystemZ::CLIH, false);
+    return true;
+
+  case SystemZ::CMux:
+    expandRXYPseudo(MI, SystemZ::C, SystemZ::CHF);
+    return true;
+
+  case SystemZ::CLMux:
+    expandRXYPseudo(MI, SystemZ::CL, SystemZ::CLHF);
+    return true;
+
+  case SystemZ::RISBMux: {
+    bool DestIsHigh = isHighReg(MI->getOperand(0).getReg());
+    bool SrcIsHigh = isHighReg(MI->getOperand(2).getReg());
+    if (SrcIsHigh == DestIsHigh)
+      MI->setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+    else {
+      MI->setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+      MI->getOperand(5).setImm(MI->getOperand(5).getImm() ^ 32);
+    }
+    return true;
+  }
+
+  case SystemZ::ADJDYNALLOC:
+    splitAdjDynAlloc(MI);
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+uint64_t SystemZInstrInfo::getInstSizeInBytes(const MachineInstr *MI) const {
+  if (MI->getOpcode() == TargetOpcode::INLINEASM) {
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  return MI->getDesc().getSize();
+}
+
+SystemZII::Branch
+SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::BR:
+  case SystemZ::J:
+  case SystemZ::JG:
+    return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
+                             SystemZ::CCMASK_ANY, &MI->getOperand(0));
+
+  case SystemZ::BRC:
+  case SystemZ::BRCL:
+    return SystemZII::Branch(SystemZII::BranchNormal,
+                             MI->getOperand(0).getImm(),
+                             MI->getOperand(1).getImm(), &MI->getOperand(2));
+
+  case SystemZ::BRCT:
+    return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP,
+                             SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+
+  case SystemZ::BRCTG:
+    return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP,
+                             SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+
+  case SystemZ::CIJ:
+  case SystemZ::CRJ:
+    return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  case SystemZ::CLIJ:
+  case SystemZ::CLRJ:
+    return SystemZII::Branch(SystemZII::BranchCL, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  case SystemZ::CGIJ:
+  case SystemZ::CGRJ:
+    return SystemZII::Branch(SystemZII::BranchCG, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  case SystemZ::CLGIJ:
+  case SystemZ::CLGRJ:
+    return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  default:
+    llvm_unreachable("Unrecognized branch opcode");
+  }
+}
+
+void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
+                                           unsigned &LoadOpcode,
+                                           unsigned &StoreOpcode) const {
+  if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass) {
+    LoadOpcode = SystemZ::L;
+    StoreOpcode = SystemZ::ST;
+  } else if (RC == &SystemZ::GRH32BitRegClass) {
+    LoadOpcode = SystemZ::LFH;
+    StoreOpcode = SystemZ::STFH;
+  } else if (RC == &SystemZ::GRX32BitRegClass) {
+    LoadOpcode = SystemZ::LMux;
+    StoreOpcode = SystemZ::STMux;
+  } else if (RC == &SystemZ::GR64BitRegClass ||
+             RC == &SystemZ::ADDR64BitRegClass) {
+    LoadOpcode = SystemZ::LG;
+    StoreOpcode = SystemZ::STG;
+  } else if (RC == &SystemZ::GR128BitRegClass ||
+             RC == &SystemZ::ADDR128BitRegClass) {
+    LoadOpcode = SystemZ::L128;
+    StoreOpcode = SystemZ::ST128;
+  } else if (RC == &SystemZ::FP32BitRegClass) {
+    LoadOpcode = SystemZ::LE;
+    StoreOpcode = SystemZ::STE;
+  } else if (RC == &SystemZ::FP64BitRegClass) {
+    LoadOpcode = SystemZ::LD;
+    StoreOpcode = SystemZ::STD;
+  } else if (RC == &SystemZ::FP128BitRegClass) {
+    LoadOpcode = SystemZ::LX;
+    StoreOpcode = SystemZ::STX;
+  } else
+    llvm_unreachable("Unsupported regclass to load or store");
+}
+
+unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode,
+                                              int64_t Offset) const {
+  const MCInstrDesc &MCID = get(Opcode);
+  int64_t Offset2 = (MCID.TSFlags & SystemZII::Is128Bit ? Offset + 8 : Offset);
+  if (isUInt<12>(Offset) && isUInt<12>(Offset2)) {
+    // Get the instruction to use for unsigned 12-bit displacements.
+    int Disp12Opcode = SystemZ::getDisp12Opcode(Opcode);
+    if (Disp12Opcode >= 0)
+      return Disp12Opcode;
+
+    // All address-related instructions can use unsigned 12-bit
+    // displacements.
+    return Opcode;
+  }
+  if (isInt<20>(Offset) && isInt<20>(Offset2)) {
+    // Get the instruction to use for signed 20-bit displacements.
+    int Disp20Opcode = SystemZ::getDisp20Opcode(Opcode);
+    if (Disp20Opcode >= 0)
+      return Disp20Opcode;
+
+    // Check whether Opcode allows signed 20-bit displacements.
+    if (MCID.TSFlags & SystemZII::Has20BitOffset)
+      return Opcode;
+  }
+  return 0;
+}
+
+unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const {
+  switch (Opcode) {
+  case SystemZ::L:    return SystemZ::LT;
+  case SystemZ::LY:   return SystemZ::LT;
+  case SystemZ::LG:   return SystemZ::LTG;
+  case SystemZ::LGF:  return SystemZ::LTGF;
+  case SystemZ::LR:   return SystemZ::LTR;
+  case SystemZ::LGFR: return SystemZ::LTGFR;
+  case SystemZ::LGR:  return SystemZ::LTGR;
+  case SystemZ::LER:  return SystemZ::LTEBR;
+  case SystemZ::LDR:  return SystemZ::LTDBR;
+  case SystemZ::LXR:  return SystemZ::LTXBR;
+  default:            return 0;
+  }
+}
+
+// Return true if Mask matches the regexp 0*1+0*, given that zero masks
+// have already been filtered out.  Store the first set bit in LSB and
+// the number of set bits in Length if so.
+static bool isStringOfOnes(uint64_t Mask, unsigned &LSB, unsigned &Length) {
+  unsigned First = findFirstSet(Mask);
+  uint64_t Top = (Mask >> First) + 1;
+  if ((Top & -Top) == Top) {
+    LSB = First;
+    Length = findFirstSet(Top);
+    return true;
+  }
+  return false;
+}
+
+bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                                   unsigned &Start, unsigned &End) const {
+  // Reject trivial all-zero masks.
+  if (Mask == 0)
+    return false;
+
+  // Handle the 1+0+ or 0+1+0* cases.  Start then specifies the index of
+  // the msb and End specifies the index of the lsb.
+  unsigned LSB, Length;
+  if (isStringOfOnes(Mask, LSB, Length)) {
+    Start = 63 - (LSB + Length - 1);
+    End = 63 - LSB;
+    return true;
+  }
+
+  // Handle the wrap-around 1+0+1+ cases.  Start then specifies the msb
+  // of the low 1s and End specifies the lsb of the high 1s.
+  if (isStringOfOnes(Mask ^ allOnes(BitSize), LSB, Length)) {
+    assert(LSB > 0 && "Bottom bit must be set");
+    assert(LSB + Length < BitSize && "Top bit must be set");
+    Start = 63 - (LSB - 1);
+    End = 63 - (LSB + Length);
+    return true;
+  }
+
+  return false;
+}
+
+unsigned SystemZInstrInfo::getCompareAndBranch(unsigned Opcode,
+                                               const MachineInstr *MI) const {
+  switch (Opcode) {
+  case SystemZ::CR:
+    return SystemZ::CRJ;
+  case SystemZ::CGR:
+    return SystemZ::CGRJ;
+  case SystemZ::CHI:
+    return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CIJ : 0;
+  case SystemZ::CGHI:
+    return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CGIJ : 0;
+  case SystemZ::CLR:
+    return SystemZ::CLRJ;
+  case SystemZ::CLGR:
+    return SystemZ::CLGRJ;
+  case SystemZ::CLFI:
+    return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLIJ : 0;
+  case SystemZ::CLGFI:
+    return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLGIJ : 0;
+  default:
+    return 0;
+  }
+}
+
+void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned Reg, uint64_t Value) const {
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned Opcode;
+  if (isInt<16>(Value))
+    Opcode = SystemZ::LGHI;
+  else if (SystemZ::isImmLL(Value))
+    Opcode = SystemZ::LLILL;
+  else if (SystemZ::isImmLH(Value)) {
+    Opcode = SystemZ::LLILH;
+    Value >>= 16;
+  } else {
+    assert(isInt<32>(Value) && "Huge values not handled yet");
+    Opcode = SystemZ::LGFI;
+  }
+  BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Value);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
new file mode 100644
index 0000000..83009cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -0,0 +1,243 @@
+//===-- SystemZInstrInfo.h - SystemZ instruction information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SYSTEMZINSTRINFO_H
+#define LLVM_TARGET_SYSTEMZINSTRINFO_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SystemZGenInstrInfo.inc"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+namespace SystemZII {
+enum {
+  // See comments in SystemZInstrFormats.td.
+  SimpleBDXLoad          = (1 << 0),
+  SimpleBDXStore         = (1 << 1),
+  Has20BitOffset         = (1 << 2),
+  HasIndex               = (1 << 3),
+  Is128Bit               = (1 << 4),
+  AccessSizeMask         = (31 << 5),
+  AccessSizeShift        = 5,
+  CCValuesMask           = (15 << 10),
+  CCValuesShift          = 10,
+  CompareZeroCCMaskMask  = (15 << 14),
+  CompareZeroCCMaskShift = 14,
+  CCMaskFirst            = (1 << 18),
+  CCMaskLast             = (1 << 19),
+  IsLogical              = (1 << 20)
+};
+static inline unsigned getAccessSize(unsigned int Flags) {
+  return (Flags & AccessSizeMask) >> AccessSizeShift;
+}
+static inline unsigned getCCValues(unsigned int Flags) {
+  return (Flags & CCValuesMask) >> CCValuesShift;
+}
+static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
+  return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
+}
+
+// SystemZ MachineOperand target flags.
+enum {
+  // Masks out the bits for the access model.
+  MO_SYMBOL_MODIFIER = (1 << 0),
+
+  // @GOT (aka @GOTENT)
+  MO_GOT = (1 << 0)
+};
+// Classifies a branch.
+enum BranchType {
+  // An instruction that branches on the current value of CC.
+  BranchNormal,
+
+  // An instruction that peforms a 32-bit signed comparison and branches
+  // on the result.
+  BranchC,
+
+  // An instruction that peforms a 32-bit unsigned comparison and branches
+  // on the result.
+  BranchCL,
+
+  // An instruction that peforms a 64-bit signed comparison and branches
+  // on the result.
+  BranchCG,
+
+  // An instruction that peforms a 64-bit unsigned comparison and branches
+  // on the result.
+  BranchCLG,
+
+  // An instruction that decrements a 32-bit register and branches if
+  // the result is nonzero.
+  BranchCT,
+
+  // An instruction that decrements a 64-bit register and branches if
+  // the result is nonzero.
+  BranchCTG
+};
+// Information about a branch instruction.
+struct Branch {
+  // The type of the branch.
+  BranchType Type;
+
+  // CCMASK_<N> is set if CC might be equal to N.
+  unsigned CCValid;
+
+  // CCMASK_<N> is set if the branch should be taken when CC == N.
+  unsigned CCMask;
+
+  // The target of the branch.
+  const MachineOperand *Target;
+
+  Branch(BranchType type, unsigned ccValid, unsigned ccMask,
+         const MachineOperand *target)
+    : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
+};
+} // end namespace SystemZII
+
+class SystemZSubtarget;
+class SystemZInstrInfo : public SystemZGenInstrInfo {
+  const SystemZRegisterInfo RI;
+  SystemZSubtarget &STI;
+
+  void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
+  void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
+  void expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+                      unsigned HighOpcode, bool ConvertHigh) const;
+  void expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+                       unsigned LowOpcodeK, unsigned HighOpcode) const;
+  void expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+                       unsigned HighOpcode) const;
+  void expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+                        unsigned Size) const;
+  void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                     unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+  virtual void anchor();
+  
+public:
+  explicit SystemZInstrInfo(SystemZSubtarget &STI);
+
+  // Override TargetInstrInfo.
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+  bool isStackSlotCopy(const MachineInstr *MI, int &DestFrameIndex,
+                       int &SrcFrameIndex) const override;
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const override;
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const override;
+  bool isPredicable(MachineInstr *MI) const override;
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const override;
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+                           const BranchProbability &Probability) const override;
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const
+    override;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      LiveVariables *LV) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr* LoadMI) const override;
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const override;
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+    override;
+
+  // Return the SystemZRegisterInfo, which this class owns.
+  const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
+
+  // Return the size in bytes of MI.
+  uint64_t getInstSizeInBytes(const MachineInstr *MI) const;
+
+  // Return true if MI is a conditional or unconditional branch.
+  // When returning true, set Cond to the mask of condition-code
+  // values on which the instruction will branch, and set Target
+  // to the operand that contains the branch target.  This target
+  // can be a register or a basic block.
+  SystemZII::Branch getBranchInfo(const MachineInstr *MI) const;
+
+  // Get the load and store opcodes for a given register class.
+  void getLoadStoreOpcodes(const TargetRegisterClass *RC,
+                           unsigned &LoadOpcode, unsigned &StoreOpcode) const;
+
+  // Opcode is the opcode of an instruction that has an address operand,
+  // and the caller wants to perform that instruction's operation on an
+  // address that has displacement Offset.  Return the opcode of a suitable
+  // instruction (which might be Opcode itself) or 0 if no such instruction
+  // exists.
+  unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const;
+
+  // If Opcode is a load instruction that has a LOAD AND TEST form,
+  // return the opcode for the testing form, otherwise return 0.
+  unsigned getLoadAndTest(unsigned Opcode) const;
+
+  // Return true if ROTATE AND ... SELECTED BITS can be used to select bits
+  // Mask of the R2 operand, given that only the low BitSize bits of Mask are
+  // significant.  Set Start and End to the I3 and I4 operands if so.
+  bool isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                   unsigned &Start, unsigned &End) const;
+
+  // If Opcode is a COMPARE opcode for which an associated COMPARE AND
+  // BRANCH exists, return the opcode for the latter, otherwise return 0.
+  // MI, if nonnull, is the compare instruction.
+  unsigned getCompareAndBranch(unsigned Opcode,
+                               const MachineInstr *MI = nullptr) const;
+
+  // Emit code before MBBI in MI to move immediate value Value into
+  // physical register Reg.
+  void loadImmediate(MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator MBBI,
+                     unsigned Reg, uint64_t Value) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
new file mode 100644
index 0000000..f4951ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -0,0 +1,1428 @@
+//===-- SystemZInstrInfo.td - General SystemZ instructions ----*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+                              [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+                              [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+let neverHasSideEffects = 1 in {
+  // Takes as input the value of the stack pointer after a dynamic allocation
+  // has been made.  Sets the output to the address of the dynamically-
+  // allocated area itself, skipping the outgoing arguments.
+  //
+  // This expands to an LA or LAY instruction.  We restrict the offset
+  // to the range of LA and keep the LAY range in reserve for when
+  // the size of the outgoing arguments is added.
+  def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
+                           [(set GR64:$dst, dynalloc12only:$src)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Control flow instructions
+//===----------------------------------------------------------------------===//
+
+// A return instruction (br %r14).
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+  def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
+
+// Unconditional branches.  R1 is the condition-code mask (all 1s).
+let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
+  let isIndirectBranch = 1 in
+    def BR : InstRR<0x07, (outs), (ins ADDR64:$R2),
+                    "br\t$R2", [(brind ADDR64:$R2)]>;
+
+  // An assembler extended mnemonic for BRC.
+  def J : InstRI<0xA74, (outs), (ins brtarget16:$I2), "j\t$I2",
+                 [(br bb:$I2)]>;
+
+  // An assembler extended mnemonic for BRCL.  (The extension is "G"
+  // rather than "L" because "JL" is "Jump if Less".)
+  def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2), "jg\t$I2", []>;
+}
+
+// Conditional branches.  It's easier for LLVM to handle these branches
+// in their raw BRC/BRCL form, with the 4-bit condition-code mask being
+// the first operand.  It seems friendlier to use mnemonic forms like
+// JE and JLH when writing out the assembly though.
+let isBranch = 1, isTerminator = 1, Uses = [CC] in {
+  let isCodeGenOnly = 1, CCMaskFirst = 1 in {
+    def BRC : InstRI<0xA74, (outs), (ins cond4:$valid, cond4:$R1,
+                                         brtarget16:$I2), "j$R1\t$I2",
+                     [(z_br_ccmask cond4:$valid, cond4:$R1, bb:$I2)]>;
+    def BRCL : InstRIL<0xC04, (outs), (ins cond4:$valid, cond4:$R1,
+                                           brtarget32:$I2), "jg$R1\t$I2", []>;
+  }
+  def AsmBRC : InstRI<0xA74, (outs), (ins imm32zx4:$R1, brtarget16:$I2),
+                      "brc\t$R1, $I2", []>;
+  def AsmBRCL : InstRIL<0xC04, (outs), (ins imm32zx4:$R1, brtarget32:$I2),
+                        "brcl\t$R1, $I2", []>;
+  def AsmBCR : InstRR<0x07, (outs), (ins imm32zx4:$R1, GR64:$R2),
+                      "bcr\t$R1, $R2", []>;
+}
+
+// Fused compare-and-branch instructions.  As for normal branches,
+// we handle these instructions internally in their raw CRJ-like form,
+// but use assembly macros like CRJE when writing them out.
+//
+// These instructions do not use or clobber the condition codes.
+// We nevertheless pretend that they clobber CC, so that we can lower
+// them to separate comparisons and BRCLs if the branch ends up being
+// out of range.
+multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
+  let isBranch = 1, isTerminator = 1, Defs = [CC] in {
+    def RJ  : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "crj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def GRJ : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "cgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def IJ  : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "cij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def GIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "cgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def LRJ  : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def LGRJ : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def LIJ  : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def LGIJ : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+  }
+}
+let isCodeGenOnly = 1 in
+  defm C : CompareBranches<cond4, "$M3", "">;
+defm AsmC : CompareBranches<imm32zx4, "", "$M3, ">;
+
+// Define AsmParser mnemonics for each general condition-code mask
+// (integer or floating-point)
+multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
+  let R1 = ccmask in {
+    def J : InstRI<0xA74, (outs), (ins brtarget16:$I2),
+                   "j"##name##"\t$I2", []>;
+    def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2),
+                     "jg"##name##"\t$I2", []>;
+    def BR : InstRR<0x07, (outs), (ins ADDR64:$R2), "b"##name##"r\t$R2", []>;
+  }
+  def LOCR  : FixedCondUnaryRRF<"locr"##name,  0xB9F2, GR32, GR32, ccmask>;
+  def LOCGR : FixedCondUnaryRRF<"locgr"##name, 0xB9E2, GR64, GR64, ccmask>;
+  def LOC   : FixedCondUnaryRSY<"loc"##name,   0xEBF2, GR32, ccmask, 4>;
+  def LOCG  : FixedCondUnaryRSY<"locg"##name,  0xEBE2, GR64, ccmask, 8>;
+  def STOC  : FixedCondStoreRSY<"stoc"##name,  0xEBF3, GR32, ccmask, 4>;
+  def STOCG : FixedCondStoreRSY<"stocg"##name, 0xEBE3, GR64, ccmask, 8>;
+}
+defm AsmO   : CondExtendedMnemonic<1,  "o">;
+defm AsmH   : CondExtendedMnemonic<2,  "h">;
+defm AsmNLE : CondExtendedMnemonic<3,  "nle">;
+defm AsmL   : CondExtendedMnemonic<4,  "l">;
+defm AsmNHE : CondExtendedMnemonic<5,  "nhe">;
+defm AsmLH  : CondExtendedMnemonic<6,  "lh">;
+defm AsmNE  : CondExtendedMnemonic<7,  "ne">;
+defm AsmE   : CondExtendedMnemonic<8,  "e">;
+defm AsmNLH : CondExtendedMnemonic<9,  "nlh">;
+defm AsmHE  : CondExtendedMnemonic<10, "he">;
+defm AsmNL  : CondExtendedMnemonic<11, "nl">;
+defm AsmLE  : CondExtendedMnemonic<12, "le">;
+defm AsmNH  : CondExtendedMnemonic<13, "nh">;
+defm AsmNO  : CondExtendedMnemonic<14, "no">;
+
+// Define AsmParser mnemonics for each integer condition-code mask.
+// This is like the list above, except that condition 3 is not possible
+// and that the low bit of the mask is therefore always 0.  This means
+// that each condition has two names.  Conditions "o" and "no" are not used.
+//
+// We don't make one of the two names an alias of the other because
+// we need the custom parsing routines to select the correct register class.
+multiclass IntCondExtendedMnemonicA<bits<4> ccmask, string name> {
+  let M3 = ccmask in {
+    def CR  : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2,
+                                            brtarget16:$RI4),
+                       "crj"##name##"\t$R1, $R2, $RI4", []>;
+    def CGR : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2,
+                                            brtarget16:$RI4),
+                       "cgrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CI  : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2,
+                                            brtarget16:$RI4),
+                       "cij"##name##"\t$R1, $I2, $RI4", []>;
+    def CGI : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
+                                            brtarget16:$RI4),
+                       "cgij"##name##"\t$R1, $I2, $RI4", []>;
+    def CLR  : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2,
+                                            brtarget16:$RI4),
+                        "clrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CLGR : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2,
+                                             brtarget16:$RI4),
+                        "clgrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CLI  : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2,
+                                             brtarget16:$RI4),
+                        "clij"##name##"\t$R1, $I2, $RI4", []>;
+    def CLGI : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2,
+                                             brtarget16:$RI4),
+                        "clgij"##name##"\t$R1, $I2, $RI4", []>;
+  }
+}
+multiclass IntCondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
+  : IntCondExtendedMnemonicA<ccmask, name1> {
+  let isAsmParserOnly = 1 in
+    defm Alt : IntCondExtendedMnemonicA<ccmask, name2>;
+}
+defm AsmJH   : IntCondExtendedMnemonic<2,  "h",  "nle">;
+defm AsmJL   : IntCondExtendedMnemonic<4,  "l",  "nhe">;
+defm AsmJLH  : IntCondExtendedMnemonic<6,  "lh", "ne">;
+defm AsmJE   : IntCondExtendedMnemonic<8,  "e",  "nlh">;
+defm AsmJHE  : IntCondExtendedMnemonic<10, "he", "nl">;
+defm AsmJLE  : IntCondExtendedMnemonic<12, "le", "nh">;
+
+// Decrement a register and branch if it is nonzero.  These don't clobber CC,
+// but we might need to split long branches into sequences that do.
+let Defs = [CC] in {
+  def BRCT  : BranchUnaryRI<"brct",  0xA76, GR32>;
+  def BRCTG : BranchUnaryRI<"brctg", 0xA77, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
+def Select32    : SelectWrapper<GR32>;
+def Select64    : SelectWrapper<GR64>;
+
+// We don't define 32-bit Mux stores because the low-only STOC should
+// always be used if possible.
+defm CondStore8Mux  : CondStores<GRX32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore8     : CondStores<GR32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>;
+defm CondStore16    : CondStores<GR32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>;
+defm CondStore32    : CondStores<GR32, nonvolatile_store,
+                                 nonvolatile_load, bdxaddr20only>;
+
+defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
+                    nonvolatile_anyextloadi8, bdxaddr20only>;
+defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
+                    nonvolatile_anyextloadi16, bdxaddr20only>;
+defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
+                    nonvolatile_anyextloadi32, bdxaddr20only>;
+defm CondStore64 : CondStores<GR64, nonvolatile_store,
+                              nonvolatile_load, bdxaddr20only>;
+
+//===----------------------------------------------------------------------===//
+// Call instructions
+//===----------------------------------------------------------------------===//
+
+let isCall = 1, Defs = [R14D, CC] in {
+  def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+                        [(z_call pcrel32:$I2)]>;
+  def CallBASR  : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+                        [(z_call ADDR64:$R2)]>;
+}
+
+// Sibling calls.  Indirect sibling calls must be via R1, since R2 upwards
+// are argument registers and since branching to R0 is a no-op.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
+                     [(z_sibcall pcrel32:$I2)]>;
+  let Uses = [R1D] in
+    def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
+}
+
+// Define the general form of the call instructions for the asm parser.
+// These instructions don't hard-code %r14 as the return address register.
+def BRAS  : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
+                   "bras\t$R1, $I2", []>;
+def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
+                    "brasl\t$R1, $I2", []>;
+def BASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
+                   "basr\t$R1, $R2", []>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Register moves.
+let neverHasSideEffects = 1 in {
+  // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+  def LRMux : UnaryRRPseudo<"l", null_frag, GRX32, GRX32>,
+              Requires<[FeatureHighWord]>;
+  def LR  : UnaryRR <"l",  0x18,   null_frag, GR32, GR32>;
+  def LGR : UnaryRRE<"lg", 0xB904, null_frag, GR64, GR64>;
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  def LTR  : UnaryRR <"lt",  0x12,   null_frag, GR32, GR32>;
+  def LTGR : UnaryRRE<"ltg", 0xB902, null_frag, GR64, GR64>;
+}
+
+// Move on condition.
+let isCodeGenOnly = 1, Uses = [CC] in {
+  def LOCR  : CondUnaryRRF<"loc",  0xB9F2, GR32, GR32>;
+  def LOCGR : CondUnaryRRF<"locg", 0xB9E2, GR64, GR64>;
+}
+let Uses = [CC] in {
+  def AsmLOCR  : AsmCondUnaryRRF<"loc",  0xB9F2, GR32, GR32>;
+  def AsmLOCGR : AsmCondUnaryRRF<"locg", 0xB9E2, GR64, GR64>;
+}
+
+// Immediate moves.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    isReMaterializable = 1 in {
+  // 16-bit sign-extended immediates.  LHIMux expands to LHI or IIHF,
+  // deopending on the choice of register.
+  def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
+               Requires<[FeatureHighWord]>;
+  def LHI  : UnaryRI<"lhi",  0xA78, bitconvert, GR32, imm32sx16>;
+  def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
+
+  // Other 16-bit immediates.
+  def LLILL : UnaryRI<"llill", 0xA5F, bitconvert, GR64, imm64ll16>;
+  def LLILH : UnaryRI<"llilh", 0xA5E, bitconvert, GR64, imm64lh16>;
+  def LLIHL : UnaryRI<"llihl", 0xA5D, bitconvert, GR64, imm64hl16>;
+  def LLIHH : UnaryRI<"llihh", 0xA5C, bitconvert, GR64, imm64hh16>;
+
+  // 32-bit immediates.
+  def LGFI  : UnaryRIL<"lgfi",  0xC01, bitconvert, GR64, imm64sx32>;
+  def LLILF : UnaryRIL<"llilf", 0xC0F, bitconvert, GR64, imm64lf32>;
+  def LLIHF : UnaryRIL<"llihf", 0xC0E, bitconvert, GR64, imm64hf32>;
+}
+
+// Register loads.
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+  // Expands to L, LY or LFH, depending on the choice of register.
+  def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+             Requires<[FeatureHighWord]>;
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
+  def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+            Requires<[FeatureHighWord]>;
+  def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def L128 : Pseudo<(outs GR128:$dst), (ins bdxaddr20only128:$src),
+                      [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
+  }
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  def LT  : UnaryRXY<"lt",  0xE312, load, GR32, 4>;
+  def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+}
+
+let canFoldAsLoad = 1 in {
+  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+}
+
+// Load on condition.
+let isCodeGenOnly = 1, Uses = [CC] in {
+  def LOC  : CondUnaryRSY<"loc",  0xEBF2, nonvolatile_load, GR32, 4>;
+  def LOCG : CondUnaryRSY<"locg", 0xEBE2, nonvolatile_load, GR64, 8>;
+}
+let Uses = [CC] in {
+  def AsmLOC  : AsmCondUnaryRSY<"loc",  0xEBF2, GR32, 4>;
+  def AsmLOCG : AsmCondUnaryRSY<"locg", 0xEBE2, GR64, 8>;
+}
+
+// Register stores.
+let SimpleBDXStore = 1 in {
+  // Expands to ST, STY or STFH, depending on the choice of register.
+  def STMux : StoreRXYPseudo<store, GRX32, 4>,
+              Requires<[FeatureHighWord]>;
+  defm ST : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+  def STFH : StoreRXY<"stfh", 0xE3CB, store, GRH32, 4>,
+             Requires<[FeatureHighWord]>;
+  def STG : StoreRXY<"stg", 0xE324, store, GR64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def ST128 : Pseudo<(outs), (ins GR128:$src, bdxaddr20only128:$dst),
+                       [(store GR128:$src, bdxaddr20only128:$dst)]>;
+  }
+}
+def STRL  : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+
+// Store on condition.
+let isCodeGenOnly = 1, Uses = [CC] in {
+  def STOC  : CondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
+  def STOCG : CondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
+}
+let Uses = [CC] in {
+  def AsmSTOC  : AsmCondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
+  def AsmSTOCG : AsmCondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
+}
+
+// 8-bit immediate stores to 8-bit fields.
+defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>;
+
+// 16-bit immediate stores to 16-, 32- or 64-bit fields.
+def MVHHI : StoreSIL<"mvhhi", 0xE544, truncstorei16, imm32sx16trunc>;
+def MVHI  : StoreSIL<"mvhi",  0xE54C, store,         imm32sx16>;
+def MVGHI : StoreSIL<"mvghi", 0xE548, store,         imm64sx16>;
+
+// Memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+
+// String moves.
+let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in
+  defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+//
+// Note that putting these before zero extensions mean that we will prefer
+// them for anyextload*.  There's not really much to choose between the two
+// either way, but signed-extending loads have a short LH and a long LHY,
+// while zero-extending loads have only the long LLH.
+//
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+  def LBR : UnaryRRE<"lb", 0xB926, sext8,  GR32, GR32>;
+  def LHR : UnaryRRE<"lh", 0xB927, sext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+  def LGBR : UnaryRRE<"lgb", 0xB906, sext8,  GR64, GR64>;
+  def LGHR : UnaryRRE<"lgh", 0xB907, sext16, GR64, GR64>;
+  def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>;
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+  def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR64>;
+
+// Match 32-to-64-bit sign extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(sext_inreg GR64:$src, i32),
+          (LGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LBMux expands to LB or LBH,
+// depending on the choice of register.
+def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+            Requires<[FeatureHighWord]>;
+def LB  : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+          Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LHMux expands to LH or LHH,
+// depending on the choice of register.
+def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+            Requires<[FeatureHighWord]>;
+defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
+def  LHH  : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+            Requires<[FeatureHighWord]>;
+def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LGB   : UnaryRXY<"lgb", 0xE377, asextloadi8,  GR64, 1>;
+def LGH   : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
+def LGF   : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+  def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+  // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+  def LLCRMux : UnaryRRPseudo<"llc", zext8, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLCR    : UnaryRRE<"llc", 0xB994, zext8,  GR32, GR32>;
+  // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+  def LLHRMux : UnaryRRPseudo<"llh", zext16, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLHR    : UnaryRRE<"llh", 0xB995, zext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+  def LLGCR : UnaryRRE<"llgc", 0xB984, zext8,  GR64, GR64>;
+  def LLGHR : UnaryRRE<"llgh", 0xB985, zext16, GR64, GR64>;
+  def LLGFR : UnaryRRE<"llgf", 0xB916, zext32, GR64, GR32>;
+}
+
+// Match 32-to-64-bit zero extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(and GR64:$src, 0xffffffff),
+          (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LLCMux expands to LLC or LLCH,
+// depending on the choice of register.
+def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+def LLC  : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LLHMux expands to LLH or LLHH,
+// depending on the choice of register.
+def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+             Requires<[FeatureHighWord]>;
+def LLH   : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
+def LLHH  : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>,
+            Requires<[FeatureHighWord]>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LLGC   : UnaryRXY<"llgc", 0xE390, azextloadi8,  GR64, 1>;
+def LLGH   : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
+def LLGF   : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+// Truncations of 64-bit registers to 32-bit registers.
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, subreg_l32)>;
+
+// Truncations of 32-bit registers to 8-bit memory.  STCMux expands to
+// STC, STCY or STCH, depending on the choice of register.
+def STCMux : StoreRXYPseudo<truncstorei8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
+def STCH : StoreRXY<"stch", 0xE3C3, truncstorei8, GRH32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// Truncations of 32-bit registers to 16-bit memory.  STHMux expands to
+// STH, STHY or STHH, depending on the choice of register.
+def STHMux : StoreRXYPseudo<truncstorei16, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
+def STHH : StoreRXY<"sthh", 0xE3C7, truncstorei16, GRH32, 2>,
+           Requires<[FeatureHighWord]>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
+
+// Truncations of 64-bit registers to memory.
+defm : StoreGR64Pair<STC, STCY, truncstorei8>;
+defm : StoreGR64Pair<STH, STHY, truncstorei16>;
+def  : StoreGR64PC<STHRL, aligned_truncstorei16>;
+defm : StoreGR64Pair<ST, STY, truncstorei32>;
+def  : StoreGR64PC<STRL, aligned_truncstorei32>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Multi-register loads.
+def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
+
+// Multi-register stores.
+def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+// Byte-swapping register moves.
+let neverHasSideEffects = 1 in {
+  def LRVR  : UnaryRRE<"lrv",  0xB91F, bswap, GR32, GR32>;
+  def LRVGR : UnaryRRE<"lrvg", 0xB90F, bswap, GR64, GR64>;
+}
+
+// Byte-swapping loads.  Unlike normal loads, these instructions are
+// allowed to access storage more than once.
+def LRV  : UnaryRXY<"lrv",  0xE31E, loadu<bswap, nonvolatile_load>, GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, loadu<bswap, nonvolatile_load>, GR64, 8>;
+
+// Likewise byte-swapping stores.
+def STRV  : StoreRXY<"strv", 0xE33E, storeu<bswap, nonvolatile_store>, GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap, nonvolatile_store>,
+                     GR64, 8>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+// Load BDX-style addresses.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+    DispKey = "la" in {
+  let DispSize = "12" in
+    def LA : InstRX<0x41, (outs GR64:$R1), (ins laaddr12pair:$XBD2),
+                    "la\t$R1, $XBD2",
+                    [(set GR64:$R1, laaddr12pair:$XBD2)]>;
+  let DispSize = "20" in
+    def LAY : InstRXY<0xE371, (outs GR64:$R1), (ins laaddr20pair:$XBD2),
+                      "lay\t$R1, $XBD2",
+                      [(set GR64:$R1, laaddr20pair:$XBD2)]>;
+}
+
+// Load a PC-relative address.  There's no version of this instruction
+// with a 16-bit offset, so there's no relaxation.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    isReMaterializable = 1 in {
+  def LARL : InstRIL<0xC00, (outs GR64:$R1), (ins pcrel32:$I2),
+                     "larl\t$R1, $I2",
+                     [(set GR64:$R1, pcrel32:$I2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LPR  : UnaryRR <"lp",  0x10,   z_iabs, GR32, GR32>;
+    def LPGR : UnaryRRE<"lpg", 0xB900, z_iabs, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LPGFR : UnaryRRE<"lpgf", 0xB910, null_frag, GR64, GR32>;
+}
+def : Pat<(z_iabs32 GR32:$src), (LPR  GR32:$src)>;
+def : Pat<(z_iabs64 GR64:$src), (LPGR GR64:$src)>;
+defm : SXU<z_iabs,   LPGFR>;
+defm : SXU<z_iabs64, LPGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LNR  : UnaryRR <"ln",  0x11,   z_inegabs, GR32, GR32>;
+    def LNGR : UnaryRRE<"lng", 0xB901, z_inegabs, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LNGFR : UnaryRRE<"lngf", 0xB911, null_frag, GR64, GR32>;
+}
+def : Pat<(z_inegabs32 GR32:$src), (LNR  GR32:$src)>;
+def : Pat<(z_inegabs64 GR64:$src), (LNGR GR64:$src)>;
+defm : SXU<z_inegabs,   LNGFR>;
+defm : SXU<z_inegabs64, LNGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LCR  : UnaryRR <"lc",  0x13,   ineg, GR32, GR32>;
+    def LCGR : UnaryRRE<"lcg", 0xB903, ineg, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LCGFR : UnaryRRE<"lcgf", 0xB913, null_frag, GR64, GR32>;
+}
+defm : SXU<ineg, LCGFR>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in
+  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
+
+defm : InsertMem<"inserti8", IC32,  GR32, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
+
+defm : InsertMem<"inserti8", IC,  GR64, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+
+// Insertions of a 16-bit immediate, leaving other bits unaffected.
+// We don't have or_as_insert equivalents of these operations because
+// OI is available instead.
+//
+// IIxMux expands to II[LH]x, depending on the choice of register.
+def IILMux : BinaryRIPseudo<insertll, GRX32, imm32ll16>,
+             Requires<[FeatureHighWord]>;
+def IIHMux : BinaryRIPseudo<insertlh, GRX32, imm32lh16>,
+             Requires<[FeatureHighWord]>;
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, insertll, GRH32, imm32ll16>;
+def IIHH : BinaryRI<"iihh", 0xA50, insertlh, GRH32, imm32lh16>;
+def IILL64 : BinaryAliasRI<insertll, GR64, imm64ll16>;
+def IILH64 : BinaryAliasRI<insertlh, GR64, imm64lh16>;
+def IIHL64 : BinaryAliasRI<inserthl, GR64, imm64hl16>;
+def IIHH64 : BinaryAliasRI<inserthh, GR64, imm64hh16>;
+
+// ...likewise for 32-bit immediates.  For GR32s this is a general
+// full-width move.  (We use IILF rather than something like LLILF
+// for 32-bit moves because IILF leaves the upper 32 bits of the
+// GR64 unchanged.)
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
+  def IIFMux : UnaryRIPseudo<bitconvert, GRX32, uimm32>,
+               Requires<[FeatureHighWord]>;
+  def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+  def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>;
+}
+def IILF64 : BinaryAliasRIL<insertlf, GR64, imm64lf32>;
+def IIHF64 : BinaryAliasRIL<inserthf, GR64, imm64hf32>;
+
+// An alternative model of inserthf, with the first operand being
+// a zero-extended value.
+def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
+          (IIHF64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+                  imm64hf32:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+// Plain addition.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+  // Addition of a register.
+  let isCommutable = 1 in {
+    defm AR : BinaryRRAndK<"a", 0x1A, 0xB9F8, add, GR32, GR32>;
+    defm AGR : BinaryRREAndK<"ag", 0xB908, 0xB9E8, add, GR64, GR64>;
+  }
+  def AGFR : BinaryRRE<"agf", 0xB918, null_frag, GR64, GR32>;
+
+  // Addition of signed 16-bit immediates.
+  defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
+  defm AHI  : BinaryRIAndK<"ahi",  0xA7A, 0xECD8, add, GR32, imm32sx16>;
+  defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
+
+  // Addition of signed 32-bit immediates.
+  def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
+  def AFI  : BinaryRIL<"afi",  0xC29, add, GR32, simm32>;
+  def AIH  : BinaryRIL<"aih",  0xCC8, add, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
+  def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
+
+  // Addition of memory.
+  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
+  defm A   : BinaryRXPair<"a",  0x5A, 0xE35A, add, GR32, load, 4>;
+  def  AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
+  def  AG  : BinaryRXY<"ag",  0xE308, add, GR64, load, 8>;
+
+  // Addition to memory.
+  def ASI  : BinarySIY<"asi",  0xEB6A, add, imm32sx8>;
+  def AGSI : BinarySIY<"agsi", 0xEB7A, add, imm64sx8>;
+}
+defm : SXB<add, GR64, AGFR>;
+
+// Addition producing a carry.
+let Defs = [CC] in {
+  // Addition of a register.
+  let isCommutable = 1 in {
+    defm ALR : BinaryRRAndK<"al", 0x1E, 0xB9FA, addc, GR32, GR32>;
+    defm ALGR : BinaryRREAndK<"alg", 0xB90A, 0xB9EA, addc, GR64, GR64>;
+  }
+  def ALGFR : BinaryRRE<"algf", 0xB91A, null_frag, GR64, GR32>;
+
+  // Addition of signed 16-bit immediates.
+  def ALHSIK  : BinaryRIE<"alhsik",  0xECDA, addc, GR32, imm32sx16>,
+                Requires<[FeatureDistinctOps]>;
+  def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, addc, GR64, imm64sx16>,
+                Requires<[FeatureDistinctOps]>;
+
+  // Addition of unsigned 32-bit immediates.
+  def ALFI  : BinaryRIL<"alfi",  0xC2B, addc, GR32, uimm32>;
+  def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+
+  // Addition of memory.
+  defm AL   : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
+  def  ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
+  def  ALG  : BinaryRXY<"alg",  0xE30A, addc, GR64, load, 8>;
+}
+defm : ZXB<addc, GR64, ALGFR>;
+
+// Addition producing and using a carry.
+let Defs = [CC], Uses = [CC] in {
+  // Addition of a register.
+  def ALCR  : BinaryRRE<"alc",  0xB998, adde, GR32, GR32>;
+  def ALCGR : BinaryRRE<"alcg", 0xB988, adde, GR64, GR64>;
+
+  // Addition of memory.
+  def ALC  : BinaryRXY<"alc",  0xE398, adde, GR32, load, 4>;
+  def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+// Plain subtraction.  Although immediate forms exist, we use the
+// add-immediate instruction instead.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+  // Subtraction of a register.
+  defm SR : BinaryRRAndK<"s", 0x1B, 0xB9F9, sub, GR32, GR32>;
+  def SGFR : BinaryRRE<"sgf", 0xB919, null_frag, GR64, GR32>;
+  defm SGR : BinaryRREAndK<"sg", 0xB909, 0xB9E9, sub, GR64, GR64>;
+
+  // Subtraction of memory.
+  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
+  defm S   : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
+  def  SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
+  def  SG  : BinaryRXY<"sg",  0xE309, sub, GR64, load, 8>;
+}
+defm : SXB<sub, GR64, SGFR>;
+
+// Subtraction producing a carry.
+let Defs = [CC] in {
+  // Subtraction of a register.
+  defm SLR : BinaryRRAndK<"sl", 0x1F, 0xB9FB, subc, GR32, GR32>;
+  def SLGFR : BinaryRRE<"slgf", 0xB91B, null_frag, GR64, GR32>;
+  defm SLGR : BinaryRREAndK<"slg", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+
+  // Subtraction of unsigned 32-bit immediates.  These don't match
+  // subc because we prefer addc for constants.
+  def SLFI  : BinaryRIL<"slfi",  0xC25, null_frag, GR32, uimm32>;
+  def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
+
+  // Subtraction of memory.
+  defm SL   : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
+  def  SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
+  def  SLG  : BinaryRXY<"slg",  0xE30B, subc, GR64, load, 8>;
+}
+defm : ZXB<subc, GR64, SLGFR>;
+
+// Subtraction producing and using a carry.
+let Defs = [CC], Uses = [CC] in {
+  // Subtraction of a register.
+  def SLBR  : BinaryRRE<"slb",  0xB999, sube, GR32, GR32>;
+  def SLGBR : BinaryRRE<"slbg", 0xB989, sube, GR64, GR64>;
+
+  // Subtraction of memory.
+  def SLB  : BinaryRXY<"slb",  0xE399, sube, GR32, load, 4>;
+  def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  // ANDs of a register.
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm NR : BinaryRRAndK<"n", 0x14, 0xB9F4, and, GR32, GR32>;
+    defm NGR : BinaryRREAndK<"ng", 0xB980, 0xB9E4, and, GR64, GR64>;
+  }
+
+  let isConvertibleToThreeAddress = 1 in {
+    // ANDs of a 16-bit immediate, leaving other bits unaffected.
+    // The CC result only reflects the 16-bit field, not the full register.
+    //
+    // NIxMux expands to NI[LH]x, depending on the choice of register.
+    def NILMux : BinaryRIPseudo<and, GRX32, imm32ll16c>,
+                 Requires<[FeatureHighWord]>;
+    def NIHMux : BinaryRIPseudo<and, GRX32, imm32lh16c>,
+                 Requires<[FeatureHighWord]>;
+    def NILL : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+    def NILH : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+    def NIHL : BinaryRI<"nihl", 0xA55, and, GRH32, imm32ll16c>;
+    def NIHH : BinaryRI<"nihh", 0xA54, and, GRH32, imm32lh16c>;
+    def NILL64 : BinaryAliasRI<and, GR64, imm64ll16c>;
+    def NILH64 : BinaryAliasRI<and, GR64, imm64lh16c>;
+    def NIHL64 : BinaryAliasRI<and, GR64, imm64hl16c>;
+    def NIHH64 : BinaryAliasRI<and, GR64, imm64hh16c>;
+
+    // ANDs of a 32-bit immediate, leaving other bits unaffected.
+    // The CC result only reflects the 32-bit field, which means we can
+    // use it as a zero indicator for i32 operations but not otherwise.
+    let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+      // Expands to NILF or NIHF, depending on the choice of register.
+      def NIFMux : BinaryRIPseudo<and, GRX32, uimm32>,
+                   Requires<[FeatureHighWord]>;
+      def NILF : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+      def NIHF : BinaryRIL<"nihf", 0xC0A, and, GRH32, uimm32>;
+    }
+    def NILF64 : BinaryAliasRIL<and, GR64, imm64lf32c>;
+    def NIHF64 : BinaryAliasRIL<and, GR64, imm64hf32c>;
+  }
+
+  // ANDs of memory.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm N  : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
+    def  NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>; 
+  }
+
+  // AND to memory
+  defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, imm32zx8>;
+
+  // Block AND.
+  let mayLoad = 1, mayStore = 1 in
+    defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
+}
+defm : RMWIByte<and, bdaddr12pair, NI>;
+defm : RMWIByte<and, bdaddr20pair, NIY>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  // ORs of a register.
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm OR : BinaryRRAndK<"o", 0x16, 0xB9F6, or, GR32, GR32>;
+    defm OGR : BinaryRREAndK<"og", 0xB981, 0xB9E6, or, GR64, GR64>;
+  }
+
+  // ORs of a 16-bit immediate, leaving other bits unaffected.
+  // The CC result only reflects the 16-bit field, not the full register.
+  //
+  // OIxMux expands to OI[LH]x, depending on the choice of register.
+  def OILMux : BinaryRIPseudo<or, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def OIHMux : BinaryRIPseudo<or, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def OILL : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+  def OILH : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+  def OIHL : BinaryRI<"oihl", 0xA59, or, GRH32, imm32ll16>;
+  def OIHH : BinaryRI<"oihh", 0xA58, or, GRH32, imm32lh16>;
+  def OILL64 : BinaryAliasRI<or, GR64, imm64ll16>;
+  def OILH64 : BinaryAliasRI<or, GR64, imm64lh16>;
+  def OIHL64 : BinaryAliasRI<or, GR64, imm64hl16>;
+  def OIHH64 : BinaryAliasRI<or, GR64, imm64hh16>;
+
+  // ORs of a 32-bit immediate, leaving other bits unaffected.
+  // The CC result only reflects the 32-bit field, which means we can
+  // use it as a zero indicator for i32 operations but not otherwise.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to OILF or OIHF, depending on the choice of register.
+    def OIFMux : BinaryRIPseudo<or, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def OILF : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+    def OIHF : BinaryRIL<"oihf", 0xC0C, or, GRH32, uimm32>;
+  }
+  def OILF64 : BinaryAliasRIL<or, GR64, imm64lf32>;
+  def OIHF64 : BinaryAliasRIL<or, GR64, imm64hf32>;
+
+  // ORs of memory.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm O  : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>;
+    def  OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>;
+  }
+
+  // OR to memory
+  defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, imm32zx8>;
+
+  // Block OR.
+  let mayLoad = 1, mayStore = 1 in
+    defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
+}
+defm : RMWIByte<or, bdaddr12pair, OI>;
+defm : RMWIByte<or, bdaddr20pair, OIY>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  // XORs of a register.
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm XR : BinaryRRAndK<"x", 0x17, 0xB9F7, xor, GR32, GR32>;
+    defm XGR : BinaryRREAndK<"xg", 0xB982, 0xB9E7, xor, GR64, GR64>;
+  }
+
+  // XORs of a 32-bit immediate, leaving other bits unaffected.
+  // The CC result only reflects the 32-bit field, which means we can
+  // use it as a zero indicator for i32 operations but not otherwise.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to XILF or XIHF, depending on the choice of register.
+    def XIFMux : BinaryRIPseudo<xor, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def XILF : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+    def XIHF : BinaryRIL<"xihf", 0xC06, xor, GRH32, uimm32>;
+  }
+  def XILF64 : BinaryAliasRIL<xor, GR64, imm64lf32>;
+  def XIHF64 : BinaryAliasRIL<xor, GR64, imm64hf32>;
+
+  // XORs of memory.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm X  : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>;
+    def  XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>;
+  }
+
+  // XOR to memory
+  defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, imm32zx8>;
+
+  // Block XOR.
+  let mayLoad = 1, mayStore = 1 in
+    defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
+}
+defm : RMWIByte<xor, bdaddr12pair, XI>;
+defm : RMWIByte<xor, bdaddr20pair, XIY>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+// Multiplication of a register.
+let isCommutable = 1 in {
+  def MSR  : BinaryRRE<"ms",  0xB252, mul, GR32, GR32>;
+  def MSGR : BinaryRRE<"msg", 0xB90C, mul, GR64, GR64>;
+}
+def MSGFR : BinaryRRE<"msgf", 0xB91C, null_frag, GR64, GR32>;
+defm : SXB<mul, GR64, MSGFR>;
+
+// Multiplication of a signed 16-bit immediate.
+def MHI  : BinaryRI<"mhi",  0xA7C, mul, GR32, imm32sx16>;
+def MGHI : BinaryRI<"mghi", 0xA7D, mul, GR64, imm64sx16>;
+
+// Multiplication of a signed 32-bit immediate.
+def MSFI  : BinaryRIL<"msfi",  0xC21, mul, GR32, simm32>;
+def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
+
+// Multiplication of memory.
+defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
+defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
+def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
+def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load, 8>;
+
+// Multiplication of a register, producing two results.
+def MLGR : BinaryRRE<"mlg", 0xB986, z_umul_lohi64, GR128, GR64>;
+
+// Multiplication of memory, producing two results.
+def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+// Division and remainder, from registers.
+def DSGFR : BinaryRRE<"dsgf", 0xB91D, z_sdivrem32, GR128, GR32>;
+def DSGR  : BinaryRRE<"dsg",  0xB90D, z_sdivrem64, GR128, GR64>;
+def DLR   : BinaryRRE<"dl",   0xB997, z_udivrem32, GR128, GR32>;
+def DLGR  : BinaryRRE<"dlg",  0xB987, z_udivrem64, GR128, GR64>;
+
+// Division and remainder, from memory.
+def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
+def DSG  : BinaryRXY<"dsg",  0xE30D, z_sdivrem64, GR128, load, 8>;
+def DL   : BinaryRXY<"dl",   0xE397, z_udivrem32, GR128, load, 4>;
+def DLG  : BinaryRXY<"dlg",  0xE387, z_udivrem64, GR128, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+// Shift left.
+let neverHasSideEffects = 1 in {
+  defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+  def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+}
+
+// Logical shift right.
+let neverHasSideEffects = 1 in {
+  defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+  def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+}
+
+// Arithmetic shift right.
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
+  def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+}
+
+// Rotate left.
+let neverHasSideEffects = 1 in {
+  def RLL  : BinaryRSY<"rll",  0xEB1D, rotl, GR32>;
+  def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
+}
+
+// Rotate second operand left and inserted selected bits into first operand.
+// These can act like 32-bit operands provided that the constant start and
+// end bits (operands 2 and 3) are in the range [32, 64).
+let Defs = [CC] in {
+  let isCodeGenOnly = 1 in
+    def RISBG32 : RotateSelectRIEf<"risbg", 0xEC55, GR32, GR32>;
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>;
+}
+
+// Forms of RISBG that only affect one word of the destination register.
+// They do not set CC.
+let Predicates = [FeatureHighWord] in {
+  def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>;
+  def RISBLL  : RotateSelectAliasRIEf<GR32,  GR32>;
+  def RISBLH  : RotateSelectAliasRIEf<GR32,  GRH32>;
+  def RISBHL  : RotateSelectAliasRIEf<GRH32, GR32>;
+  def RISBHH  : RotateSelectAliasRIEf<GRH32, GRH32>;
+  def RISBLG  : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>;
+  def RISBHG  : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>;
+}
+
+// Rotate second operand left and perform a logical operation with selected
+// bits of the first operand.  The CC result only describes the selected bits,
+// so isn't useful for a full comparison against zero.
+let Defs = [CC] in {
+  def RNSBG : RotateSelectRIEf<"rnsbg", 0xEC54, GR64, GR64>;
+  def ROSBG : RotateSelectRIEf<"rosbg", 0xEC56, GR64, GR64>;
+  def RXSBG : RotateSelectRIEf<"rxsbg", 0xEC57, GR64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+// Signed comparisons.  We put these before the unsigned comparisons because
+// some of the signed forms have COMPARE AND BRANCH equivalents whereas none
+// of the unsigned forms do.
+let Defs = [CC], CCValues = 0xE in {
+  // Comparison with a register.
+  def CR   : CompareRR <"c",   0x19,   z_scmp,    GR32, GR32>;
+  def CGFR : CompareRRE<"cgf", 0xB930, null_frag, GR64, GR32>;
+  def CGR  : CompareRRE<"cg",  0xB920, z_scmp,    GR64, GR64>;
+
+  // Comparison with a signed 16-bit immediate.
+  def CHI  : CompareRI<"chi",  0xA7E, z_scmp, GR32, imm32sx16>;
+  def CGHI : CompareRI<"cghi", 0xA7F, z_scmp, GR64, imm64sx16>;
+
+  // Comparison with a signed 32-bit immediate.  CFIMux expands to CFI or CIH,
+  // depending on the choice of register.
+  def CFIMux : CompareRIPseudo<z_scmp, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
+  def CFI  : CompareRIL<"cfi",  0xC2D, z_scmp, GR32, simm32>;
+  def CIH  : CompareRIL<"cih",  0xCCD, z_scmp, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
+  def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
+
+  // Comparison with memory.
+  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
+  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, load, 4>;
+  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
+  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
+  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, load, 8>;
+  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_asextloadi16>;
+  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_load>;
+  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
+  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
+  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_load>;
+
+  // Comparison between memory and a signed 16-bit immediate.
+  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
+  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, load, imm32sx16>;
+  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
+}
+defm : SXB<z_scmp, GR64, CGFR>;
+
+// Unsigned comparisons.
+let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
+  // Comparison with a register.
+  def CLR   : CompareRR <"cl",   0x15,   z_ucmp,    GR32, GR32>;
+  def CLGFR : CompareRRE<"clgf", 0xB931, null_frag, GR64, GR32>;
+  def CLGR  : CompareRRE<"clg",  0xB921, z_ucmp,    GR64, GR64>;
+
+  // Comparison with an unsigned 32-bit immediate.  CLFIMux expands to CLFI
+  // or CLIH, depending on the choice of register.
+  def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
+                Requires<[FeatureHighWord]>;
+  def CLFI  : CompareRIL<"clfi",  0xC2F, z_ucmp, GR32, uimm32>;
+  def CLIH  : CompareRIL<"clih",  0xCCF, z_ucmp, GR32, uimm32>,
+              Requires<[FeatureHighWord]>;
+  def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
+
+  // Comparison with memory.
+  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
+  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
+  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load, 8>;
+  def  CLHRL  : CompareRILPC<"clhrl",  0xC67, z_ucmp, GR32,
+                             aligned_azextloadi16>;
+  def  CLRL   : CompareRILPC<"clrl",   0xC6F, z_ucmp, GR32,
+                             aligned_load>;
+  def  CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
+                             aligned_azextloadi16>;
+  def  CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
+                             aligned_azextloadi32>;
+  def  CLGRL  : CompareRILPC<"clgrl",  0xC6A, z_ucmp, GR64,
+                             aligned_load>;
+
+  // Comparison between memory and an unsigned 8-bit immediate.
+  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
+
+  // Comparison between memory and an unsigned 16-bit immediate.
+  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
+  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+}
+defm : ZXB<z_ucmp, GR64, CLGFR>;
+
+// Memory-to-memory comparison.
+let mayLoad = 1, Defs = [CC] in
+  defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+
+// String comparison.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+  defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
+
+// Test under mask.
+let Defs = [CC] in {
+  // TMxMux expands to TM[LH]x, depending on the choice of register.
+  def TMLMux : CompareRIPseudo<z_tm_reg, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def TMHMux : CompareRIPseudo<z_tm_reg, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def TMLL : CompareRI<"tmll", 0xA71, z_tm_reg, GR32, imm32ll16>;
+  def TMLH : CompareRI<"tmlh", 0xA70, z_tm_reg, GR32, imm32lh16>;
+  def TMHL : CompareRI<"tmhl", 0xA73, z_tm_reg, GRH32, imm32ll16>;
+  def TMHH : CompareRI<"tmhh", 0xA72, z_tm_reg, GRH32, imm32lh16>;
+
+  def TMLL64 : CompareAliasRI<z_tm_reg, GR64, imm64ll16>;
+  def TMLH64 : CompareAliasRI<z_tm_reg, GR64, imm64lh16>;
+  def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
+  def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
+
+  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Prefetch
+//===----------------------------------------------------------------------===//
+
+def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+
+let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
+  def LAA   : LoadAndOpRSY<"laa",   0xEBF8, atomic_load_add_32, GR32>;
+  def LAAG  : LoadAndOpRSY<"laag",  0xEBE8, atomic_load_add_64, GR64>;
+  def LAAL  : LoadAndOpRSY<"laal",  0xEBFA, null_frag, GR32>;
+  def LAALG : LoadAndOpRSY<"laalg", 0xEBEA, null_frag, GR64>;
+  def LAN   : LoadAndOpRSY<"lan",   0xEBF4, atomic_load_and_32, GR32>;
+  def LANG  : LoadAndOpRSY<"lang",  0xEBE4, atomic_load_and_64, GR64>;
+  def LAO   : LoadAndOpRSY<"lao",   0xEBF6, atomic_load_or_32, GR32>;
+  def LAOG  : LoadAndOpRSY<"laog",  0xEBE6, atomic_load_or_64, GR64>;
+  def LAX   : LoadAndOpRSY<"lax",   0xEBF7, atomic_load_xor_32, GR32>;
+  def LAXG  : LoadAndOpRSY<"laxg",  0xEBE7, atomic_load_xor_64, GR64>;
+}
+
+def ATOMIC_SWAPW   : AtomicLoadWBinaryReg<z_atomic_swapw>;
+def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32<atomic_swap_32>;
+def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64<atomic_swap_64>;
+
+def ATOMIC_LOADW_AR  : AtomicLoadWBinaryReg<z_atomic_loadw_add>;
+def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm<z_atomic_loadw_add, simm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_AR   : AtomicLoadBinaryReg32<atomic_load_add_32>;
+  def ATOMIC_LOAD_AHI  : AtomicLoadBinaryImm32<atomic_load_add_32, imm32sx16>;
+  def ATOMIC_LOAD_AFI  : AtomicLoadBinaryImm32<atomic_load_add_32, simm32>;
+  def ATOMIC_LOAD_AGR  : AtomicLoadBinaryReg64<atomic_load_add_64>;
+  def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx16>;
+  def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx32>;
+}
+
+def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg<z_atomic_loadw_sub>;
+def ATOMIC_LOAD_SR  : AtomicLoadBinaryReg32<atomic_load_sub_32>;
+def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
+
+def ATOMIC_LOADW_NR   : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
+def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_NR     : AtomicLoadBinaryReg32<atomic_load_and_32>;
+  def ATOMIC_LOAD_NILL   : AtomicLoadBinaryImm32<atomic_load_and_32,
+                                                 imm32ll16c>;
+  def ATOMIC_LOAD_NILH   : AtomicLoadBinaryImm32<atomic_load_and_32,
+                                                 imm32lh16c>;
+  def ATOMIC_LOAD_NILF   : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+  def ATOMIC_LOAD_NGR    : AtomicLoadBinaryReg64<atomic_load_and_64>;
+  def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64ll16c>;
+  def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64lh16c>;
+  def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64hl16c>;
+  def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64hh16c>;
+  def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64lf32c>;
+  def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64hf32c>;
+}
+
+def ATOMIC_LOADW_OR     : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
+def ATOMIC_LOADW_OILH   : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_OR     : AtomicLoadBinaryReg32<atomic_load_or_32>;
+  def ATOMIC_LOAD_OILL   : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+  def ATOMIC_LOAD_OILH   : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+  def ATOMIC_LOAD_OILF   : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+  def ATOMIC_LOAD_OGR    : AtomicLoadBinaryReg64<atomic_load_or_64>;
+  def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+  def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+  def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+  def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+  def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+  def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+}
+
+def ATOMIC_LOADW_XR     : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
+def ATOMIC_LOADW_XILF   : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_XR     : AtomicLoadBinaryReg32<atomic_load_xor_32>;
+  def ATOMIC_LOAD_XILF   : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+  def ATOMIC_LOAD_XGR    : AtomicLoadBinaryReg64<atomic_load_xor_64>;
+  def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+  def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+}
+
+def ATOMIC_LOADW_NRi    : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
+def ATOMIC_LOADW_NILHi  : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
+                                               imm32lh16c>;
+def ATOMIC_LOAD_NRi     : AtomicLoadBinaryReg32<atomic_load_nand_32>;
+def ATOMIC_LOAD_NILLi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
+                                                imm32ll16c>;
+def ATOMIC_LOAD_NILHi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
+                                                imm32lh16c>;
+def ATOMIC_LOAD_NILFi   : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NGRi    : AtomicLoadBinaryReg64<atomic_load_nand_64>;
+def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64ll16c>;
+def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64lh16c>;
+def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64hl16c>;
+def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64hh16c>;
+def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64lf32c>;
+def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64hf32c>;
+
+def ATOMIC_LOADW_MIN    : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
+def ATOMIC_LOAD_MIN_32  : AtomicLoadBinaryReg32<atomic_load_min_32>;
+def ATOMIC_LOAD_MIN_64  : AtomicLoadBinaryReg64<atomic_load_min_64>;
+
+def ATOMIC_LOADW_MAX    : AtomicLoadWBinaryReg<z_atomic_loadw_max>;
+def ATOMIC_LOAD_MAX_32  : AtomicLoadBinaryReg32<atomic_load_max_32>;
+def ATOMIC_LOAD_MAX_64  : AtomicLoadBinaryReg64<atomic_load_max_64>;
+
+def ATOMIC_LOADW_UMIN   : AtomicLoadWBinaryReg<z_atomic_loadw_umin>;
+def ATOMIC_LOAD_UMIN_32 : AtomicLoadBinaryReg32<atomic_load_umin_32>;
+def ATOMIC_LOAD_UMIN_64 : AtomicLoadBinaryReg64<atomic_load_umin_64>;
+
+def ATOMIC_LOADW_UMAX   : AtomicLoadWBinaryReg<z_atomic_loadw_umax>;
+def ATOMIC_LOAD_UMAX_32 : AtomicLoadBinaryReg32<atomic_load_umax_32>;
+def ATOMIC_LOAD_UMAX_64 : AtomicLoadBinaryReg64<atomic_load_umax_64>;
+
+def ATOMIC_CMP_SWAPW
+  : Pseudo<(outs GR32:$dst), (ins bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+                                  ADDR32:$bitshift, ADDR32:$negbitshift,
+                                  uimm32:$bitsize),
+           [(set GR32:$dst,
+                 (z_atomic_cmp_swapw bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+                                     ADDR32:$bitshift, ADDR32:$negbitshift,
+                                     uimm32:$bitsize))]> {
+  let Defs = [CC];
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+}
+
+let Defs = [CC] in {
+  defm CS  : CmpSwapRSPair<"cs", 0xBA, 0xEB14, atomic_cmp_swap_32, GR32>;
+  def  CSG : CmpSwapRSY<"csg", 0xEB30, atomic_cmp_swap_64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Extract CC into bits 29 and 28 of a register.
+let Uses = [CC] in
+  def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;
+
+// Read a 32-bit access register into a GR32.  As with all GR32 operations,
+// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
+// when a 64-bit address is stored in a pair of access registers.
+def EAR : InstRRE<0xB24F, (outs GR32:$R1), (ins access_reg:$R2),
+                  "ear\t$R1, $R2",
+                  [(set GR32:$R1, (z_extract_access access_reg:$R2))]>;
+
+// Find leftmost one, AKA count leading zeros.  The instruction actually
+// returns a pair of GR64s, the first giving the number of leading zeros
+// and the second giving a copy of the source with the leftmost one bit
+// cleared.  We only use the first result here.
+let Defs = [CC] in {
+  def FLOGR : UnaryRRE<"flog", 0xB983, null_frag, GR128, GR64>;
+}
+def : Pat<(ctlz GR64:$src),
+          (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
+
+// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
+def : Pat<(i64 (anyext GR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
+
+// Extend GR32s and GR64s to GR128s.
+let usesCustomInserter = 1 in {
+  def AEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+  def ZEXT128_32 : Pseudo<(outs GR128:$dst), (ins GR32:$src), []>;
+  def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+}
+
+// Search a block of memory for a character.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+  defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+
+//===----------------------------------------------------------------------===//
+// Peepholes.
+//===----------------------------------------------------------------------===//
+
+// Use AL* for GR64 additions of unsigned 32-bit values.
+defm : ZXB<add, GR64, ALGFR>;
+def  : Pat<(add GR64:$src1, imm64zx32:$src2),
+           (ALGFI GR64:$src1, imm64zx32:$src2)>;
+def  : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
+           (ALGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Use SL* for GR64 subtractions of unsigned 32-bit values.
+defm : ZXB<sub, GR64, SLGFR>;
+def  : Pat<(add GR64:$src1, imm64zx32n:$src2),
+           (SLGFI GR64:$src1, imm64zx32n:$src2)>;
+def  : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
+           (SLGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Optimize sign-extended 1/0 selects to -1/0 selects.  This is important
+// for vector legalization.
+def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)),
+                         (i32 31)),
+                    (i32 31)),
+          (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
+                                                       imm32zx4:$cc)))),
+                    (i32 63)),
+               (i32 63)),
+          (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+
+// Peepholes for turning scalar operations into block operations.
+defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
+                      OCSequence, XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 8>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
new file mode 100644
index 0000000..8081334
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -0,0 +1,460 @@
+//===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass makes sure that all branches are in range.  There are several ways
+// in which this could be done.  One aggressive approach is to assume that all
+// branches are in range and successively replace those that turn out not
+// to be in range with a longer form (branch relaxation).  A simple
+// implementation is to continually walk through the function relaxing
+// branches until no more changes are needed and a fixed point is reached.
+// However, in the pathological worst case, this implementation is
+// quadratic in the number of blocks; relaxing branch N can make branch N-1
+// go out of range, which in turn can make branch N-2 go out of range,
+// and so on.
+//
+// An alternative approach is to assume that all branches must be
+// converted to their long forms, then reinstate the short forms of
+// branches that, even under this pessimistic assumption, turn out to be
+// in range (branch shortening).  This too can be implemented as a function
+// walk that is repeated until a fixed point is reached.  In general,
+// the result of shortening is not as good as that of relaxation, and
+// shortening is also quadratic in the worst case; shortening branch N
+// can bring branch N-1 in range of the short form, which in turn can do
+// the same for branch N-2, and so on.  The main advantage of shortening
+// is that each walk through the function produces valid code, so it is
+// possible to stop at any point after the first walk.  The quadraticness
+// could therefore be handled with a maximum pass count, although the
+// question then becomes: what maximum count should be used?
+//
+// On SystemZ, long branches are only needed for functions bigger than 64k,
+// which are relatively rare to begin with, and the long branch sequences
+// are actually relatively cheap.  It therefore doesn't seem worth spending
+// much compilation time on the problem.  Instead, the approach we take is:
+//
+// (1) Work out the address that each block would have if no branches
+//     need relaxing.  Exit the pass early if all branches are in range
+//     according to this assumption.
+//
+// (2) Work out the address that each block would have if all branches
+//     need relaxing.
+//
+// (3) Walk through the block calculating the final address of each instruction
+//     and relaxing those that need to be relaxed.  For backward branches,
+//     this check uses the final address of the target block, as calculated
+//     earlier in the walk.  For forward branches, this check uses the
+//     address of the target block that was calculated in (2).  Both checks
+//     give a conservatively-correct range.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-long-branch"
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+namespace {
+// Represents positional information about a basic block.
+struct MBBInfo {
+  // The address that we currently assume the block has.
+  uint64_t Address;
+
+  // The size of the block in bytes, excluding terminators.
+  // This value never changes.
+  uint64_t Size;
+
+  // The minimum alignment of the block, as a log2 value.
+  // This value never changes.
+  unsigned Alignment;
+
+  // The number of terminators in this block.  This value never changes.
+  unsigned NumTerminators;
+
+  MBBInfo()
+    : Address(0), Size(0), Alignment(0), NumTerminators(0) {} 
+};
+
+// Represents the state of a block terminator.
+struct TerminatorInfo {
+  // If this terminator is a relaxable branch, this points to the branch
+  // instruction, otherwise it is null.
+  MachineInstr *Branch;
+
+  // The address that we currently assume the terminator has.
+  uint64_t Address;
+
+  // The current size of the terminator in bytes.
+  uint64_t Size;
+
+  // If Branch is nonnull, this is the number of the target block,
+  // otherwise it is unused.
+  unsigned TargetBlock;
+
+  // If Branch is nonnull, this is the length of the longest relaxed form,
+  // otherwise it is zero.
+  unsigned ExtraRelaxSize;
+
+  TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0),
+                     ExtraRelaxSize(0) {}
+};
+
+// Used to keep track of the current position while iterating over the blocks.
+struct BlockPosition {
+  // The address that we assume this position has.
+  uint64_t Address;
+
+  // The number of low bits in Address that are known to be the same
+  // as the runtime address.
+  unsigned KnownBits;
+
+  BlockPosition(unsigned InitialAlignment)
+    : Address(0), KnownBits(InitialAlignment) {}
+};
+
+class SystemZLongBranch : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZLongBranch(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(nullptr) {}
+
+  const char *getPassName() const override {
+    return "SystemZ Long Branch";
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+  void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
+  void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
+                      bool AssumeRelaxed);
+  TerminatorInfo describeTerminator(MachineInstr *MI);
+  uint64_t initMBBInfo();
+  bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
+  bool mustRelaxABranch();
+  void setWorstCaseAddresses();
+  void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
+  void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
+  void relaxBranch(TerminatorInfo &Terminator);
+  void relaxBranches();
+
+  const SystemZInstrInfo *TII;
+  MachineFunction *MF;
+  SmallVector<MBBInfo, 16> MBBs;
+  SmallVector<TerminatorInfo, 16> Terminators;
+};
+
+char SystemZLongBranch::ID = 0;
+
+const uint64_t MaxBackwardRange = 0x10000;
+const uint64_t MaxForwardRange = 0xfffe;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
+  return new SystemZLongBranch(TM);
+}
+
+// Position describes the state immediately before Block.  Update Block
+// accordingly and move Position to the end of the block's non-terminator
+// instructions.
+void SystemZLongBranch::skipNonTerminators(BlockPosition &Position,
+                                           MBBInfo &Block) {
+  if (Block.Alignment > Position.KnownBits) {
+    // When calculating the address of Block, we need to conservatively
+    // assume that Block had the worst possible misalignment.
+    Position.Address += ((uint64_t(1) << Block.Alignment) -
+                         (uint64_t(1) << Position.KnownBits));
+    Position.KnownBits = Block.Alignment;
+  }
+
+  // Align the addresses.
+  uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1;
+  Position.Address = (Position.Address + AlignMask) & ~AlignMask;
+
+  // Record the block's position.
+  Block.Address = Position.Address;
+
+  // Move past the non-terminators in the block.
+  Position.Address += Block.Size;
+}
+
+// Position describes the state immediately before Terminator.
+// Update Terminator accordingly and move Position past it.
+// Assume that Terminator will be relaxed if AssumeRelaxed.
+void SystemZLongBranch::skipTerminator(BlockPosition &Position,
+                                       TerminatorInfo &Terminator,
+                                       bool AssumeRelaxed) {
+  Terminator.Address = Position.Address;
+  Position.Address += Terminator.Size;
+  if (AssumeRelaxed)
+    Position.Address += Terminator.ExtraRelaxSize;
+}
+
+// Return a description of terminator instruction MI.
+TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
+  TerminatorInfo Terminator;
+  Terminator.Size = TII->getInstSizeInBytes(MI);
+  if (MI->isConditionalBranch() || MI->isUnconditionalBranch()) {
+    switch (MI->getOpcode()) {
+    case SystemZ::J:
+      // Relaxes to JG, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::BRC:
+      // Relaxes to BRCL, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::BRCT:
+    case SystemZ::BRCTG:
+      // Relaxes to A(G)HI and BRCL, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
+    case SystemZ::CRJ:
+    case SystemZ::CLRJ:
+      // Relaxes to a C(L)R/BRCL sequence, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::CGRJ:
+    case SystemZ::CLGRJ:
+      // Relaxes to a C(L)GR/BRCL sequence, which is 4 bytes longer.
+      Terminator.ExtraRelaxSize = 4;
+      break;
+    case SystemZ::CIJ:
+    case SystemZ::CGIJ:
+      // Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
+      Terminator.ExtraRelaxSize = 4;
+      break;
+    case SystemZ::CLIJ:
+    case SystemZ::CLGIJ:
+      // Relaxes to a CL(G)FI/BRCL sequence, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
+    default:
+      llvm_unreachable("Unrecognized branch instruction");
+    }
+    Terminator.Branch = MI;
+    Terminator.TargetBlock =
+      TII->getBranchInfo(MI).Target->getMBB()->getNumber();
+  }
+  return Terminator;
+}
+
+// Fill MBBs and Terminators, setting the addresses on the assumption
+// that no branches need relaxation.  Return the size of the function under
+// this assumption.
+uint64_t SystemZLongBranch::initMBBInfo() {
+  MF->RenumberBlocks();
+  unsigned NumBlocks = MF->size();
+
+  MBBs.clear();
+  MBBs.resize(NumBlocks);
+
+  Terminators.clear();
+  Terminators.reserve(NumBlocks);
+
+  BlockPosition Position(MF->getAlignment());
+  for (unsigned I = 0; I < NumBlocks; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+    MBBInfo &Block = MBBs[I];
+
+    // Record the alignment, for quick access.
+    Block.Alignment = MBB->getAlignment();
+
+    // Calculate the size of the fixed part of the block.
+    MachineBasicBlock::iterator MI = MBB->begin();
+    MachineBasicBlock::iterator End = MBB->end();
+    while (MI != End && !MI->isTerminator()) {
+      Block.Size += TII->getInstSizeInBytes(MI);
+      ++MI;
+    }
+    skipNonTerminators(Position, Block);
+
+    // Add the terminators.
+    while (MI != End) {
+      if (!MI->isDebugValue()) {
+        assert(MI->isTerminator() && "Terminator followed by non-terminator");
+        Terminators.push_back(describeTerminator(MI));
+        skipTerminator(Position, Terminators.back(), false);
+        ++Block.NumTerminators;
+      }
+      ++MI;
+    }
+  }
+
+  return Position.Address;
+}
+
+// Return true if, under current assumptions, Terminator would need to be
+// relaxed if it were placed at address Address.
+bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
+                                        uint64_t Address) {
+  if (!Terminator.Branch)
+    return false;
+
+  const MBBInfo &Target = MBBs[Terminator.TargetBlock];
+  if (Address >= Target.Address) {
+    if (Address - Target.Address <= MaxBackwardRange)
+      return false;
+  } else {
+    if (Target.Address - Address <= MaxForwardRange)
+      return false;
+  }
+
+  return true;
+}
+
+// Return true if, under current assumptions, any terminator needs
+// to be relaxed.
+bool SystemZLongBranch::mustRelaxABranch() {
+  for (auto &Terminator : Terminators)
+    if (mustRelaxBranch(Terminator, Terminator.Address))
+      return true;
+  return false;
+}
+
+// Set the address of each block on the assumption that all branches
+// must be long.
+void SystemZLongBranch::setWorstCaseAddresses() {
+  SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+  BlockPosition Position(MF->getAlignment());
+  for (auto &Block : MBBs) {
+    skipNonTerminators(Position, Block);
+    for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
+      skipTerminator(Position, *TI, true);
+      ++TI;
+    }
+  }
+}
+
+// Split BRANCH ON COUNT MI into the addition given by AddOpcode followed
+// by a BRCL on the result.
+void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
+                                           unsigned AddOpcode) {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1))
+    .addImm(-1);
+  MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addImm(SystemZ::CCMASK_CMP_NE)
+    .addOperand(MI->getOperand(2));
+  // The implicit use of CC is a killing use.
+  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+  MI->eraseFromParent();
+}
+
+// Split MI into the comparison given by CompareOpcode followed
+// a BRCL on the result.
+void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
+                                           unsigned CompareOpcode) {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1));
+  MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addOperand(MI->getOperand(2))
+    .addOperand(MI->getOperand(3));
+  // The implicit use of CC is a killing use.
+  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+  MI->eraseFromParent();
+}
+
+// Relax the branch described by Terminator.
+void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
+  MachineInstr *Branch = Terminator.Branch;
+  switch (Branch->getOpcode()) {
+  case SystemZ::J:
+    Branch->setDesc(TII->get(SystemZ::JG));
+    break;
+  case SystemZ::BRC:
+    Branch->setDesc(TII->get(SystemZ::BRCL));
+    break;
+  case SystemZ::BRCT:
+    splitBranchOnCount(Branch, SystemZ::AHI);
+    break;
+  case SystemZ::BRCTG:
+    splitBranchOnCount(Branch, SystemZ::AGHI);
+    break;
+  case SystemZ::CRJ:
+    splitCompareBranch(Branch, SystemZ::CR);
+    break;
+  case SystemZ::CGRJ:
+    splitCompareBranch(Branch, SystemZ::CGR);
+    break;
+  case SystemZ::CIJ:
+    splitCompareBranch(Branch, SystemZ::CHI);
+    break;
+  case SystemZ::CGIJ:
+    splitCompareBranch(Branch, SystemZ::CGHI);
+    break;
+  case SystemZ::CLRJ:
+    splitCompareBranch(Branch, SystemZ::CLR);
+    break;
+  case SystemZ::CLGRJ:
+    splitCompareBranch(Branch, SystemZ::CLGR);
+    break;
+  case SystemZ::CLIJ:
+    splitCompareBranch(Branch, SystemZ::CLFI);
+    break;
+  case SystemZ::CLGIJ:
+    splitCompareBranch(Branch, SystemZ::CLGFI);
+    break;
+  default:
+    llvm_unreachable("Unrecognized branch");
+  }
+
+  Terminator.Size += Terminator.ExtraRelaxSize;
+  Terminator.ExtraRelaxSize = 0;
+  Terminator.Branch = nullptr;
+
+  ++LongBranches;
+}
+
+// Run a shortening pass and relax any branches that need to be relaxed.
+void SystemZLongBranch::relaxBranches() {
+  SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+  BlockPosition Position(MF->getAlignment());
+  for (auto &Block : MBBs) {
+    skipNonTerminators(Position, Block);
+    for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
+      assert(Position.Address <= TI->Address &&
+             "Addresses shouldn't go forwards");
+      if (mustRelaxBranch(*TI, Position.Address))
+        relaxBranch(*TI);
+      skipTerminator(Position, *TI, false);
+      ++TI;
+    }
+  }
+}
+
+bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  MF = &F;
+  uint64_t Size = initMBBInfo();
+  if (Size <= MaxForwardRange || !mustRelaxABranch())
+    return false;
+
+  setWorstCaseAddresses();
+  relaxBranches();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
new file mode 100644
index 0000000..df561e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -0,0 +1,100 @@
+//===-- SystemZMCInstLower.cpp - Lower MachineInstr to MCInst -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCInstLower.h"
+#include "SystemZAsmPrinter.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// Return the VK_* enumeration for MachineOperand target flags Flags.
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
+  switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
+    case 0:
+      return MCSymbolRefExpr::VK_None;
+    case SystemZII::MO_GOT:
+      return MCSymbolRefExpr::VK_GOT;
+  }
+  llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
+}
+
+SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
+                                       SystemZAsmPrinter &asmprinter)
+  : Ctx(ctx), AsmPrinter(asmprinter) {}
+
+const MCExpr *
+SystemZMCInstLower::getExpr(const MachineOperand &MO,
+                            MCSymbolRefExpr::VariantKind Kind) const {
+  const MCSymbol *Symbol;
+  bool HasOffset = true;
+  switch (MO.getType()) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+
+  default:
+    llvm_unreachable("unknown operand type");
+  }
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
+  if (HasOffset)
+    if (int64_t Offset = MO.getOffset()) {
+      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+    }
+  return Expr;
+}
+
+MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    return MCOperand::CreateReg(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MCOperand::CreateImm(MO.getImm());
+
+  default: {
+    MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+    return MCOperand::CreateExpr(getExpr(MO, Kind));
+  }
+  }
+}
+
+void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    // Ignore all implicit register operands.
+    if (!MO.isReg() || !MO.isImplicit())
+      OutMI.addOperand(lowerOperand(MO));
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
new file mode 100644
index 0000000..90447ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -0,0 +1,44 @@
+//===-- SystemZMCInstLower.h - Lower MachineInstr to MCInst ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEMZMCINSTLOWER_H
+#define LLVM_SYSTEMZMCINSTLOWER_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCInst;
+class MCOperand;
+class MachineInstr;
+class MachineOperand;
+class Mangler;
+class SystemZAsmPrinter;
+
+class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
+  MCContext &Ctx;
+  SystemZAsmPrinter &AsmPrinter;
+
+public:
+  SystemZMCInstLower(MCContext &ctx, SystemZAsmPrinter &asmPrinter);
+
+  // Lower MachineInstr MI to MCInst OutMI.
+  void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  // Return an MCOperand for MO.
+  MCOperand lowerOperand(const MachineOperand& MO) const;
+
+  // Return an MCExpr for symbolic operand MO with variant kind Kind.
+  const MCExpr *getExpr(const MachineOperand &MO,
+                        MCSymbolRefExpr::VariantKind Kind) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
new file mode 100644
index 0000000..00572d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -0,0 +1,17 @@
+//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+
+using namespace llvm;
+
+
+// pin vtable to this file
+void SystemZMachineFunctionInfo::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
new file mode 100644
index 0000000..50865f13
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -0,0 +1,68 @@
+//==- SystemZMachineFuctionInfo.h - SystemZ machine function info -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZMACHINEFUNCTIONINFO_H
+#define SYSTEMZMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  unsigned LowSavedGPR;
+  unsigned HighSavedGPR;
+  unsigned VarArgsFirstGPR;
+  unsigned VarArgsFirstFPR;
+  unsigned VarArgsFrameIndex;
+  unsigned RegSaveFrameIndex;
+  bool ManipulatesSP;
+
+public:
+  explicit SystemZMachineFunctionInfo(MachineFunction &MF)
+    : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
+      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false) {}
+
+  // Get and set the first call-saved GPR that should be saved and restored
+  // by this function.  This is 0 if no GPRs need to be saved or restored.
+  unsigned getLowSavedGPR() const { return LowSavedGPR; }
+  void setLowSavedGPR(unsigned Reg) { LowSavedGPR = Reg; }
+
+  // Get and set the last call-saved GPR that should be saved and restored
+  // by this function.
+  unsigned getHighSavedGPR() const { return HighSavedGPR; }
+  void setHighSavedGPR(unsigned Reg) { HighSavedGPR = Reg; }
+
+  // Get and set the number of fixed (as opposed to variable) arguments
+  // that are passed in GPRs to this function.
+  unsigned getVarArgsFirstGPR() const { return VarArgsFirstGPR; }
+  void setVarArgsFirstGPR(unsigned GPR) { VarArgsFirstGPR = GPR; }
+
+  // Likewise FPRs.
+  unsigned getVarArgsFirstFPR() const { return VarArgsFirstFPR; }
+  void setVarArgsFirstFPR(unsigned FPR) { VarArgsFirstFPR = FPR; }
+
+  // Get and set the frame index of the first stack vararg.
+  unsigned getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(unsigned FI) { VarArgsFrameIndex = FI; }
+
+  // Get and set the frame index of the register save area
+  // (i.e. the incoming stack pointer).
+  unsigned getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+  void setRegSaveFrameIndex(unsigned FI) { RegSaveFrameIndex = FI; }
+
+  // Get and set whether the function directly manipulates the stack pointer,
+  // e.g. through STACKSAVE or STACKRESTORE.
+  bool getManipulatesSP() const { return ManipulatesSP; }
+  void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
new file mode 100644
index 0000000..7be81dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
@@ -0,0 +1,475 @@
+//===-- SystemZOperands.td - SystemZ instruction operands ----*- tblgen-*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions
+//===----------------------------------------------------------------------===//
+
+class ImmediateAsmOperand<string name>
+  : AsmOperandClass {
+  let Name = name;
+  let RenderMethod = "addImmOperands";
+}
+
+// Constructs both a DAG pattern and instruction operand for an immediate
+// of type VT.  PRED returns true if a node is acceptable and XFORM returns
+// the operand value associated with the node.  ASMOP is the name of the
+// associated asm operand, and also forms the basis of the asm print method.
+class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop>
+  : PatLeaf<(vt imm), pred, xform>, Operand<vt> {
+  let PrintMethod = "print"##asmop##"Operand";
+  let DecoderMethod = "decode"##asmop##"Operand";
+  let ParserMatchClass = !cast<AsmOperandClass>(asmop);
+}
+
+// Constructs an asm operand for a PC-relative address.  SIZE says how
+// many bits there are.
+class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
+  let PredicateMethod = "isImm";
+  let ParserMethod = "parsePCRel"##size;
+}
+
+// Constructs an operand for a PC-relative address with address type VT.
+// ASMOP is the associated asm operand.
+class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+  let PrintMethod = "printPCRelOperand";
+  let ParserMatchClass = asmop;
+}
+
+// Constructs both a DAG pattern and instruction operand for a PC-relative
+// address with address size VT.  SELF is the name of the operand and
+// ASMOP is the associated asm operand.
+class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
+  : ComplexPattern<vt, 1, "selectPCRelAddress",
+                   [z_pcrel_wrapper, z_pcrel_offset]>,
+    PCRelOperand<vt, asmop> {
+  let MIOperandInfo = (ops !cast<Operand>(self));
+}
+
+// Constructs an AsmOperandClass for addressing mode FORMAT, treating the
+// registers as having BITSIZE bits and displacements as having DISPSIZE bits.
+// LENGTH is "LenN" for addresses with an N-bit length field, otherwise it
+// is "".
+class AddressAsmOperand<string format, string bitsize, string dispsize,
+                        string length = "">
+  : AsmOperandClass {
+  let Name = format##bitsize##"Disp"##dispsize##length;
+  let ParserMethod = "parse"##format##bitsize;
+  let RenderMethod = "add"##format##"Operands";
+}
+
+// Constructs both a DAG pattern and instruction operand for an addressing mode.
+// FORMAT, BITSIZE, DISPSIZE and LENGTH are the parameters to an associated
+// AddressAsmOperand.  OPERANDS is a list of NUMOPS individual operands
+// (base register, displacement, etc.).  SELTYPE is the type of the memory
+// operand for selection purposes; sometimes we want different selection
+// choices for the same underlying addressing mode.  SUFFIX is similarly
+// a suffix appended to the displacement for selection purposes;
+// e.g. we want to reject small 20-bit displacements if a 12-bit form
+// also exists, but we want to accept them otherwise.
+class AddressingMode<string seltype, string bitsize, string dispsize,
+                     string suffix, string length, int numops, string format,
+                     dag operands>
+  : ComplexPattern<!cast<ValueType>("i"##bitsize), numops,
+                   "select"##seltype##dispsize##suffix##length,
+                   [add, sub, or, frameindex, z_adjdynalloc]>,
+    Operand<!cast<ValueType>("i"##bitsize)> {
+  let PrintMethod = "print"##format##"Operand";
+  let EncoderMethod = "get"##format##dispsize##length##"Encoding";
+  let DecoderMethod =
+    "decode"##format##bitsize##"Disp"##dispsize##length##"Operand";
+  let MIOperandInfo = operands;
+  let ParserMatchClass =
+    !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize##length);
+}
+
+// An addressing mode with a base and displacement but no index.
+class BDMode<string type, string bitsize, string dispsize, string suffix>
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
+
+// An addressing mode with a base, displacement and index.
+class BDXMode<string type, string bitsize, string dispsize, string suffix>
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<RegisterOperand>("ADDR"##bitsize))>;
+
+// A BDMode paired with an immediate length operand of LENSIZE bits.
+class BDLMode<string type, string bitsize, string dispsize, string suffix,
+              string lensize>
+  : AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
+                   "BDLAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<Immediate>("imm"##bitsize))>;
+
+//===----------------------------------------------------------------------===//
+// Extracting immediate operands from nodes
+// These all create MVT::i64 nodes to ensure the value is not sign-extended
+// when converted from an SDNode to a MachineOperand later on.
+//===----------------------------------------------------------------------===//
+
+// Bits 0-15 (counting from the lsb).
+def LL16 : SDNodeXForm<imm, [{
+  uint64_t Value = N->getZExtValue() & 0x000000000000FFFFULL;
+  return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Bits 16-31 (counting from the lsb).
+def LH16 : SDNodeXForm<imm, [{
+  uint64_t Value = (N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16;
+  return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Bits 32-47 (counting from the lsb).
+def HL16 : SDNodeXForm<imm, [{
+  uint64_t Value = (N->getZExtValue() & 0x0000FFFF00000000ULL) >> 32;
+  return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Bits 48-63 (counting from the lsb).
+def HH16 : SDNodeXForm<imm, [{
+  uint64_t Value = (N->getZExtValue() & 0xFFFF000000000000ULL) >> 48;
+  return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Low 32 bits.
+def LF32 : SDNodeXForm<imm, [{
+  uint64_t Value = N->getZExtValue() & 0x00000000FFFFFFFFULL;
+  return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// High 32 bits.
+def HF32 : SDNodeXForm<imm, [{
+  uint64_t Value = N->getZExtValue() >> 32;
+  return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit signed quantity.
+def SIMM8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(int8_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit unsigned quantity.
+def UIMM8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit signed quantity.
+def SIMM16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(int16_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit unsigned quantity.
+def UIMM16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint16_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit signed quantity.
+def SIMM32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(int32_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit unsigned quantity.
+def UIMM32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint32_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Negate and then truncate an immediate to a 32-bit unsigned quantity.
+def NEGIMM32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), MVT::i64);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Immediate asm operands.
+//===----------------------------------------------------------------------===//
+
+def U4Imm  : ImmediateAsmOperand<"U4Imm">;
+def U6Imm  : ImmediateAsmOperand<"U6Imm">;
+def S8Imm  : ImmediateAsmOperand<"S8Imm">;
+def U8Imm  : ImmediateAsmOperand<"U8Imm">;
+def S16Imm : ImmediateAsmOperand<"S16Imm">;
+def U16Imm : ImmediateAsmOperand<"U16Imm">;
+def S32Imm : ImmediateAsmOperand<"S32Imm">;
+def U32Imm : ImmediateAsmOperand<"U32Imm">;
+
+//===----------------------------------------------------------------------===//
+// i32 immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being zero.
+def imm32ll16 : Immediate<i32, [{
+  return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm32lh16 : Immediate<i32, [{
+  return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being one.
+def imm32ll16c : Immediate<i32, [{
+  return SystemZ::isImmLL(uint32_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm32lh16c : Immediate<i32, [{
+  return SystemZ::isImmLH(uint32_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+// Short immediates
+def imm32zx4 : Immediate<i32, [{
+  return isUInt<4>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U4Imm">;
+
+def imm32zx6 : Immediate<i32, [{
+  return isUInt<6>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U6Imm">;
+
+def imm32sx8 : Immediate<i32, [{
+  return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm32zx8 : Immediate<i32, [{
+  return isUInt<8>(N->getZExtValue());
+}], UIMM8, "U8Imm">;
+
+def imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">;
+
+def imm32sx16 : Immediate<i32, [{
+  return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm32zx16 : Immediate<i32, [{
+  return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
+
+// Full 32-bit immediates.  we need both signed and unsigned versions
+// because the assembler is picky.  E.g. AFI requires signed operands
+// while NILF requires unsigned ones.
+def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
+def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+
+def imm32 : ImmLeaf<i32, [{}]>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i32 being zero.
+def imm64ll16 : Immediate<i64, [{
+  return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm64lh16 : Immediate<i64, [{
+  return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+def imm64hl16 : Immediate<i64, [{
+  return SystemZ::isImmHL(N->getZExtValue());
+}], HL16, "U16Imm">;
+
+def imm64hh16 : Immediate<i64, [{
+  return SystemZ::isImmHH(N->getZExtValue());
+}], HH16, "U16Imm">;
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i32 being one.
+def imm64ll16c : Immediate<i64, [{
+  return SystemZ::isImmLL(uint64_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm64lh16c : Immediate<i64, [{
+  return SystemZ::isImmLH(uint64_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+def imm64hl16c : Immediate<i64, [{
+  return SystemZ::isImmHL(uint64_t(~N->getZExtValue()));
+}], HL16, "U16Imm">;
+
+def imm64hh16c : Immediate<i64, [{
+  return SystemZ::isImmHH(uint64_t(~N->getZExtValue()));
+}], HH16, "U16Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i32 being zero.
+def imm64lf32 : Immediate<i64, [{
+  return SystemZ::isImmLF(N->getZExtValue());
+}], LF32, "U32Imm">;
+
+def imm64hf32 : Immediate<i64, [{
+  return SystemZ::isImmHF(N->getZExtValue());
+}], HF32, "U32Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i32 being one.
+def imm64lf32c : Immediate<i64, [{
+  return SystemZ::isImmLF(uint64_t(~N->getZExtValue()));
+}], LF32, "U32Imm">;
+
+def imm64hf32c : Immediate<i64, [{
+  return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
+}], HF32, "U32Imm">;
+
+// Short immediates.
+def imm64sx8 : Immediate<i64, [{
+  return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm64zx8 : Immediate<i64, [{
+  return isUInt<8>(N->getSExtValue());
+}], UIMM8, "U8Imm">;
+
+def imm64sx16 : Immediate<i64, [{
+  return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm64zx16 : Immediate<i64, [{
+  return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm64sx32 : Immediate<i64, [{
+  return isInt<32>(N->getSExtValue());
+}], SIMM32, "S32Imm">;
+
+def imm64zx32 : Immediate<i64, [{
+  return isUInt<32>(N->getZExtValue());
+}], UIMM32, "U32Imm">;
+
+def imm64zx32n : Immediate<i64, [{
+  return isUInt<32>(-N->getSExtValue());
+}], NEGIMM32, "U32Imm">;
+
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point immediates
+//===----------------------------------------------------------------------===//
+
+// Floating-point zero.
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+// Floating point negative zero.
+def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
+
+//===----------------------------------------------------------------------===//
+// Symbolic address operands
+//===----------------------------------------------------------------------===//
+
+// PC-relative asm operands.
+def PCRel16 : PCRelAsmOperand<"16">;
+def PCRel32 : PCRelAsmOperand<"32">;
+
+// PC-relative offsets of a basic block.  The offset is sign-extended
+// and multiplied by 2.
+def brtarget16 : PCRelOperand<OtherVT, PCRel16> {
+  let EncoderMethod = "getPC16DBLEncoding";
+  let DecoderMethod = "decodePC16DBLOperand";
+}
+def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
+  let EncoderMethod = "getPC32DBLEncoding";
+  let DecoderMethod = "decodePC32DBLOperand";
+}
+
+// A PC-relative offset of a global value.  The offset is sign-extended
+// and multiplied by 2.
+def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
+  let EncoderMethod = "getPC32DBLEncoding";
+  let DecoderMethod = "decodePC32DBLOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing modes
+//===----------------------------------------------------------------------===//
+
+// 12-bit displacement operands.
+def disp12imm32 : Operand<i32>;
+def disp12imm64 : Operand<i64>;
+
+// 20-bit displacement operands.
+def disp20imm32 : Operand<i32>;
+def disp20imm64 : Operand<i64>;
+
+def BDAddr32Disp12      : AddressAsmOperand<"BDAddr",   "32", "12">;
+def BDAddr32Disp20      : AddressAsmOperand<"BDAddr",   "32", "20">;
+def BDAddr64Disp12      : AddressAsmOperand<"BDAddr",   "64", "12">;
+def BDAddr64Disp20      : AddressAsmOperand<"BDAddr",   "64", "20">;
+def BDXAddr64Disp12     : AddressAsmOperand<"BDXAddr",  "64", "12">;
+def BDXAddr64Disp20     : AddressAsmOperand<"BDXAddr",  "64", "20">;
+def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr",  "64", "12", "Len8">;
+
+// DAG patterns and operands for addressing modes.  Each mode has
+// the form <type><range><group>[<len>] where:
+//
+// <type> is one of:
+//   shift    : base + displacement (32-bit)
+//   bdaddr   : base + displacement
+//   mviaddr  : like bdaddr, but reject cases with a natural index
+//   bdxaddr  : base + displacement + index
+//   laaddr   : like bdxaddr, but used for Load Address operations
+//   dynalloc : base + displacement + index + ADJDYNALLOC
+//   bdladdr  : base + displacement with a length field
+//
+// <range> is one of:
+//   12       : the displacement is an unsigned 12-bit value
+//   20       : the displacement is a signed 20-bit value
+//
+// <group> is one of:
+//   pair     : used when there is an equivalent instruction with the opposite
+//              range value (12 or 20)
+//   only     : used when there is no equivalent instruction with the opposite
+//              range value
+//
+// <len> is one of:
+//
+//   <empty>  : there is no length field
+//   len8     : the length field is 8 bits, with a range of [1, 0x100].
+def shift12only       : BDMode <"BDAddr",   "32", "12", "Only">;
+def shift20only       : BDMode <"BDAddr",   "32", "20", "Only">;
+def bdaddr12only      : BDMode <"BDAddr",   "64", "12", "Only">;
+def bdaddr12pair      : BDMode <"BDAddr",   "64", "12", "Pair">;
+def bdaddr20only      : BDMode <"BDAddr",   "64", "20", "Only">;
+def bdaddr20pair      : BDMode <"BDAddr",   "64", "20", "Pair">;
+def mviaddr12pair     : BDMode <"MVIAddr",  "64", "12", "Pair">;
+def mviaddr20pair     : BDMode <"MVIAddr",  "64", "20", "Pair">;
+def bdxaddr12only     : BDXMode<"BDXAddr",  "64", "12", "Only">;
+def bdxaddr12pair     : BDXMode<"BDXAddr",  "64", "12", "Pair">;
+def bdxaddr20only     : BDXMode<"BDXAddr",  "64", "20", "Only">;
+def bdxaddr20only128  : BDXMode<"BDXAddr",  "64", "20", "Only128">;
+def bdxaddr20pair     : BDXMode<"BDXAddr",  "64", "20", "Pair">;
+def dynalloc12only    : BDXMode<"DynAlloc", "64", "12", "Only">;
+def laaddr12pair      : BDXMode<"LAAddr",   "64", "12", "Pair">;
+def laaddr20pair      : BDXMode<"LAAddr",   "64", "20", "Pair">;
+def bdladdr12onlylen8 : BDLMode<"BDLAddr",  "64", "12", "Only", "8">;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous
+//===----------------------------------------------------------------------===//
+
+// Access registers.  At present we just use them for accessing the thread
+// pointer, so we don't expose them as register to LLVM.
+def AccessReg : AsmOperandClass {
+  let Name = "AccessReg";
+  let ParserMethod = "parseAccessReg";
+}
+def access_reg : Immediate<i32, [{ return N->getZExtValue() < 16; }],
+                           NOOP_SDNodeXForm, "AccessReg"> {
+  let ParserMatchClass = AccessReg;
+}
+
+// A 4-bit condition-code mask.
+def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>,
+            Operand<i32> {
+  let PrintMethod = "printCond4Operand";
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
new file mode 100644
index 0000000..c70e662
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -0,0 +1,385 @@
+//===-- SystemZOperators.td - SystemZ-specific operators ------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Type profiles
+//===----------------------------------------------------------------------===//
+def SDT_CallSeqStart        : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqEnd          : SDCallSeqEnd<[SDTCisVT<0, i64>,
+                                            SDTCisVT<1, i64>]>;
+def SDT_ZCall               : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_ZCmp                : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ZICmp               : SDTypeProfile<0, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZBRCCMask           : SDTypeProfile<0, 3,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<2, OtherVT>]>;
+def SDT_ZSelectCCMask       : SDTypeProfile<1, 4,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<1, 2>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>]>;
+def SDT_ZWrapPtr            : SDTypeProfile<1, 1,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisPtrTy<0>]>;
+def SDT_ZWrapOffset         : SDTypeProfile<1, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisPtrTy<0>]>;
+def SDT_ZAdjDynAlloc        : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def SDT_ZExtractAccess      : SDTypeProfile<1, 1,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisVT<1, i32>]>;
+def SDT_ZGR128Binary32      : SDTypeProfile<1, 2,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisVT<1, untyped>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZGR128Binary64      : SDTypeProfile<1, 2,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisVT<1, untyped>,
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZAtomicLoadBinaryW  : SDTypeProfile<1, 5,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i32>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>,
+                                             SDTCisVT<5, i32>]>;
+def SDT_ZAtomicCmpSwapW     : SDTypeProfile<1, 6,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i32>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>,
+                                             SDTCisVT<5, i32>,
+                                             SDTCisVT<6, i32>]>;
+def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>,
+                                             SDTCisVT<3, i64>]>;
+def SDT_ZString             : SDTypeProfile<1, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisPtrTy<2>,
+                                             SDTCisVT<3, i32>]>;
+def SDT_ZI32Intrinsic       : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZPrefetch           : SDTypeProfile<0, 2,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisPtrTy<1>]>;
+
+//===----------------------------------------------------------------------===//
+// Node definitions
+//===----------------------------------------------------------------------===//
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start       : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+                                 [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end         : SDNode<"ISD::CALLSEQ_END",   SDT_CallSeqEnd,
+                                 [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
+                                  SDNPOutGlue]>;
+
+// Nodes for SystemZISD::*.  See SystemZISelLowering.h for more details.
+def z_retflag           : SDNode<"SystemZISD::RET_FLAG", SDTNone,
+                                 [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def z_call              : SDNode<"SystemZISD::CALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                                  SDNPVariadic]>;
+def z_sibcall           : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                                  SDNPVariadic]>;
+def z_pcrel_wrapper     : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
+def z_pcrel_offset      : SDNode<"SystemZISD::PCREL_OFFSET",
+                                 SDT_ZWrapOffset, []>;
+def z_iabs              : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
+def z_icmp              : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
+def z_fcmp              : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_tm                : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
+def z_br_ccmask         : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
+                                 [SDNPHasChain, SDNPInGlue]>;
+def z_select_ccmask     : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
+    		                 [SDNPInGlue]>;
+def z_adjdynalloc       : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_extract_access    : SDNode<"SystemZISD::EXTRACT_ACCESS",
+                                 SDT_ZExtractAccess>;
+def z_umul_lohi64       : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
+def z_sdivrem32         : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
+def z_sdivrem64         : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
+def z_udivrem32         : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
+def z_udivrem64         : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
+
+def z_serialize         : SDNode<"SystemZISD::SERIALIZE", SDTNone,
+                                 [SDNPHasChain, SDNPMayStore]>;
+
+class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
+  : SDNode<"SystemZISD::"##name, profile,
+           [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
+def z_atomic_swapw      : AtomicWOp<"ATOMIC_SWAPW">;
+def z_atomic_loadw_add  : AtomicWOp<"ATOMIC_LOADW_ADD">;
+def z_atomic_loadw_sub  : AtomicWOp<"ATOMIC_LOADW_SUB">;
+def z_atomic_loadw_and  : AtomicWOp<"ATOMIC_LOADW_AND">;
+def z_atomic_loadw_or   : AtomicWOp<"ATOMIC_LOADW_OR">;
+def z_atomic_loadw_xor  : AtomicWOp<"ATOMIC_LOADW_XOR">;
+def z_atomic_loadw_nand : AtomicWOp<"ATOMIC_LOADW_NAND">;
+def z_atomic_loadw_min  : AtomicWOp<"ATOMIC_LOADW_MIN">;
+def z_atomic_loadw_max  : AtomicWOp<"ATOMIC_LOADW_MAX">;
+def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
+def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
+def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
+
+def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc_loop           : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc_loop           : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc_loop           : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_search_string     : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm               : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+                                 [SDNPInGlue]>;
+def z_prefetch          : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
+                                 [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                                  SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Signed and unsigned comparisons.
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::UnsignedOnly;
+}]>;
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::SignedOnly;
+}]>;
+
+// Register- and memory-based TEST UNDER MASK.
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>;
+def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>;
+
+// Register sign-extend operations.  Sub-32-bit values are represented as i32s.
+def sext8  : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
+def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
+def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>;
+
+// Register zero-extend operations.  Sub-32-bit values are represented as i32s.
+def zext8  : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
+def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
+def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
+
+// Typed floating-point loads.
+def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>;
+def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>;
+
+// Extending loads in which the extension type can be signed.
+def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+}]>;
+def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type can be unsigned.
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type doesn't matter.
+def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
+}]>;
+def anyextloadi8 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def anyextloadi16 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Aligned loads.
+class AlignedLoad<SDPatternOperator load>
+  : PatFrag<(ops node:$addr), (load node:$addr), [{
+  auto *Load = cast<LoadSDNode>(N);
+  return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
+}]>;
+def aligned_load         : AlignedLoad<load>;
+def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
+def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
+def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
+def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
+
+// Aligned stores.
+class AlignedStore<SDPatternOperator store>
+  : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+  auto *Store = cast<StoreSDNode>(N);
+  return Store->getAlignment() >= Store->getMemoryVT().getStoreSize();
+}]>;
+def aligned_store         : AlignedStore<store>;
+def aligned_truncstorei16 : AlignedStore<truncstorei16>;
+def aligned_truncstorei32 : AlignedStore<truncstorei32>;
+
+// Non-volatile loads.  Used for instructions that might access the storage
+// location multiple times.
+class NonvolatileLoad<SDPatternOperator load>
+  : PatFrag<(ops node:$addr), (load node:$addr), [{
+  auto *Load = cast<LoadSDNode>(N);
+  return !Load->isVolatile();
+}]>;
+def nonvolatile_load          : NonvolatileLoad<load>;
+def nonvolatile_anyextloadi8  : NonvolatileLoad<anyextloadi8>;
+def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
+def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
+
+// Non-volatile stores.
+class NonvolatileStore<SDPatternOperator store>
+  : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+  auto *Store = cast<StoreSDNode>(N);
+  return !Store->isVolatile();
+}]>;
+def nonvolatile_store         : NonvolatileStore<store>;
+def nonvolatile_truncstorei8  : NonvolatileStore<truncstorei8>;
+def nonvolatile_truncstorei16 : NonvolatileStore<truncstorei16>;
+def nonvolatile_truncstorei32 : NonvolatileStore<truncstorei32>;
+
+// A store of a load that can be implemented using MVC.
+def mvc_store : PatFrag<(ops node:$value, node:$addr),
+                        (unindexedstore node:$value, node:$addr),
+                        [{ return storeLoadCanUseMVC(N); }]>;
+
+// Binary read-modify-write operations on memory in which the other
+// operand is also memory and for which block operations like NC can
+// be used.  There are two patterns for each operator, depending on
+// which operand contains the "other" load.
+multiclass block_op<SDPatternOperator operator> {
+  def "1" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator node:$value,
+                                              (unindexedload node:$addr)),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 0); }]>;
+  def "2" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator (unindexedload node:$addr),
+                                              node:$value),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 1); }]>;
+}
+defm block_and : block_op<and>;
+defm block_or  : block_op<or>;
+defm block_xor : block_op<xor>;
+
+// Insertions.
+def inserti8 : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, -256), node:$src2)>;
+def insertll : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffffffffffff0000), node:$src2)>;
+def insertlh : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffffffff0000ffff), node:$src2)>;
+def inserthl : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffff0000ffffffff), node:$src2)>;
+def inserthh : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0x0000ffffffffffff), node:$src2)>;
+def insertlf : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffffffff00000000), node:$src2)>;
+def inserthf : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0x00000000ffffffff), node:$src2)>;
+
+// ORs that can be treated as insertions.
+def or_as_inserti8 : PatFrag<(ops node:$src1, node:$src2),
+                             (or node:$src1, node:$src2), [{
+  unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+  return CurDAG->MaskedValueIsZero(N->getOperand(0),
+                                   APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// ORs that can be treated as reversed insertions.
+def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
+                                (or node:$src1, node:$src2), [{
+  unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+  return CurDAG->MaskedValueIsZero(N->getOperand(1),
+                                   APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// Negative integer absolute.
+def z_inegabs : PatFrag<(ops node:$src), (ineg (z_iabs node:$src))>;
+
+// Integer absolute, matching the canonical form generated by DAGCombiner.
+def z_iabs32 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 31))),
+                            (sra node:$src, (i32 31)))>;
+def z_iabs64 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 63))),
+                            (sra node:$src, (i32 63)))>;
+def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
+def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+
+// Fused multiply-add and multiply-subtract, but with the order of the
+// operands matching SystemZ's MA and MS instructions.
+def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                    (fma node:$src2, node:$src3, node:$src1)>;
+def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                    (fma node:$src2, node:$src3, (fneg node:$src1))>;
+
+// Floating-point negative absolute.
+def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
+
+// Create a unary operator that loads from memory and then performs
+// the given operation on it.
+class loadu<SDPatternOperator operator, SDPatternOperator load = load>
+  : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
+
+// Create a store operator that performs the given unary operation
+// on the value before storing it.
+class storeu<SDPatternOperator operator, SDPatternOperator store = store>
+  : PatFrag<(ops node:$value, node:$addr),
+            (store (operator node:$value), node:$addr)>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
new file mode 100644
index 0000000..e307f8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -0,0 +1,155 @@
+//===-- SystemZPatterns.td - SystemZ-specific pattern rules ---*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Record that INSN performs a 64-bit version of unary operator OPERATOR
+// in which the operand is sign-extended from 32 to 64 bits.
+multiclass SXU<SDPatternOperator operator, Instruction insn> {
+  def : Pat<(operator (sext (i32 GR32:$src))),
+            (insn GR32:$src)>;
+  def : Pat<(operator (sext_inreg GR64:$src, i32)),
+            (insn (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+}
+
+// Record that INSN performs a 64-bit version of binary operator OPERATOR
+// in which the first operand has class CLS and which the second operand
+// is sign-extended from a 32-bit register.
+multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
+               Instruction insn> {
+  def : Pat<(operator cls:$src1, (sext GR32:$src2)),
+            (insn cls:$src1, GR32:$src2)>;
+  def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
+}
+
+// Like SXB, but for zero extension.
+multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
+               Instruction insn> {
+  def : Pat<(operator cls:$src1, (zext GR32:$src2)),
+            (insn cls:$src1, GR32:$src2)>;
+  def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
+}
+
+// Record that INSN performs a binary read-modify-write operation,
+// with LOAD, OPERATOR and STORE being the read, modify and write
+// respectively.  MODE is the addressing mode and IMM is the type
+// of the second operand.
+class RMWI<SDPatternOperator load, SDPatternOperator operator,
+           SDPatternOperator store, AddressingMode mode,
+           PatFrag imm, Instruction insn>
+  : Pat<(store (operator (load mode:$addr), imm:$src), mode:$addr),
+        (insn mode:$addr, (UIMM8 imm:$src))>;
+
+// Record that INSN performs binary operation OPERATION on a byte
+// memory location.  IMM is the type of the second operand.
+multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
+                    Instruction insn> {
+  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
+}
+
+// Record that INSN performs insertion TYPE into a register of class CLS.
+// The inserted operand is loaded using LOAD from an address of mode MODE.
+multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
+                     SDPatternOperator load, AddressingMode mode> {
+  def : Pat<(!cast<SDPatternOperator>("or_as_"##type)
+              cls:$src1, (load mode:$src2)),
+            (insn cls:$src1, mode:$src2)>;
+  def : Pat<(!cast<SDPatternOperator>("or_as_rev"##type)
+              (load mode:$src2), cls:$src1),
+            (insn cls:$src1, mode:$src2)>;
+}
+
+// INSN stores the low 32 bits of a GPR to a memory with addressing mode MODE.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64<Instruction insn, SDPatternOperator operator,
+                AddressingMode mode>
+  : Pat<(operator GR64:$R1, mode:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), mode:$XBD2)>;
+
+// INSN and INSNY are an RX/RXY pair of instructions that store the low
+// 32 bits of a GPR to memory.  Record that they are equivalent to using
+// OPERATOR to store a GR64.
+multiclass StoreGR64Pair<Instruction insn, Instruction insny,
+                         SDPatternOperator operator> {
+  def : StoreGR64<insn, operator, bdxaddr12pair>;
+  def : StoreGR64<insny, operator, bdxaddr20pair>;
+}
+
+// INSN stores the low 32 bits of a GPR using PC-relative addressing.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64PC<Instruction insn, SDPatternOperator operator>
+  : Pat<(operator GR64:$R1, pcrel32:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), pcrel32:$XBD2)> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+// INSN and INSNINV conditionally store the low 32 bits of a GPR to memory,
+// with INSN storing when the condition is true and INSNINV storing when the
+// condition is false.  Record that they are equivalent to a LOAD/select/STORE
+// sequence for GR64s.
+multiclass CondStores64<Instruction insn, Instruction insninv,
+                        SDPatternOperator store, SDPatternOperator load,
+                        AddressingMode mode> {
+  def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
+                                    imm32zx4:$valid, imm32zx4:$cc),
+                   mode:$addr),
+            (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                  imm32zx4:$valid, imm32zx4:$cc)>;
+  def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
+                                    imm32zx4:$valid, imm32zx4:$cc),
+                   mode:$addr),
+            (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                     imm32zx4:$valid, imm32zx4:$cc)>;
+}
+
+// Try to use MVC instruction INSN for a load of type LOAD followed by a store
+// of the same size.  VT is the type of the intermediate (legalized) value and
+// LENGTH is the number of bytes loaded by LOAD.
+multiclass MVCLoadStore<SDPatternOperator load, ValueType vt, Instruction insn,
+                        bits<5> length> {
+  def : Pat<(mvc_store (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// Use NC-like instruction INSN for block_op operation OPERATOR.
+// The other operand is a load of type LOAD, which accesses LENGTH bytes.
+// VT is the intermediate legalized type in which the binary operation
+// is actually done.
+multiclass BinaryLoadStore<SDPatternOperator operator, SDPatternOperator load,
+                           ValueType vt, Instruction insn, bits<5> length> {
+  def : Pat<(operator (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// A convenient way of generating all block peepholes for a particular
+// LOAD/VT/LENGTH combination.
+multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
+                          Instruction mvc, Instruction nc, Instruction oc,
+                          Instruction xc, bits<5> length> {
+  defm : MVCLoadStore<load, vt, mvc, length>;
+  defm : BinaryLoadStore<block_and1, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_and2, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_or1,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_or2,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_xor1, load, vt, xc, length>;
+  defm : BinaryLoadStore<block_xor2, load, vt, xc, length>;
+}
+
+// Record that INSN is a LOAD AND TEST that can be used to compare
+// registers in CLS against zero.  The instruction has separate R1 and R2
+// operands, but they must be the same when the instruction is used like this.
+multiclass CompareZeroFP<Instruction insn, RegisterOperand cls> {
+  def : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+  // The sign of the zero makes no difference.
+  def : Pat<(z_fcmp cls:$reg, (fpimmneg0)), (insn cls:$reg, cls:$reg)>;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
new file mode 100644
index 0000000..e6b58f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -0,0 +1,62 @@
+//===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Processor and feature definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class SystemZFeature<string extname, string intname, string desc>
+  : Predicate<"Subtarget.has"##intname##"()">,
+    AssemblerPredicate<"Feature"##intname, extname>,
+    SubtargetFeature<extname, "Has"##intname, "true", desc>;
+
+class SystemZMissingFeature<string intname>
+  : Predicate<"!Subtarget.has"##intname##"()">;
+
+def FeatureDistinctOps : SystemZFeature<
+  "distinct-ops", "DistinctOps",
+  "Assume that the distinct-operands facility is installed"
+>;
+
+def FeatureLoadStoreOnCond : SystemZFeature<
+  "load-store-on-cond", "LoadStoreOnCond",
+  "Assume that the load/store-on-condition facility is installed"
+>;
+
+def FeatureHighWord : SystemZFeature<
+  "high-word", "HighWord",
+  "Assume that the high-word facility is installed"
+>;
+
+def FeatureFPExtension : SystemZFeature<
+  "fp-extension", "FPExtension",
+  "Assume that the floating-point extension facility is installed"
+>;
+
+def FeatureFastSerialization : SystemZFeature<
+  "fast-serialization", "FastSerialization",
+  "Assume that the fast-serialization facility is installed"
+>;
+
+def FeatureInterlockedAccess1 : SystemZFeature<
+  "interlocked-access1", "InterlockedAccess1",
+  "Assume that interlocked-access facility 1 is installed"
+>;
+def FeatureNoInterlockedAccess1 : SystemZMissingFeature<"InterlockedAccess1">;
+
+def : Processor<"generic", NoItineraries, []>;
+def : Processor<"z10", NoItineraries, []>;
+def : Processor<"z196", NoItineraries,
+                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+                 FeatureFPExtension, FeatureFastSerialization,
+                 FeatureInterlockedAccess1]>;
+def : Processor<"zEC12", NoItineraries,
+                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+                 FeatureFPExtension, FeatureFastSerialization,
+                 FeatureInterlockedAccess1]>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
new file mode 100644
index 0000000..f03bcc4
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -0,0 +1,139 @@
+//===-- SystemZRegisterInfo.cpp - SystemZ register information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+SystemZRegisterInfo::SystemZRegisterInfo()
+    : SystemZGenRegisterInfo(SystemZ::R14D) {}
+
+const MCPhysReg *
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return CSR_SystemZ_SaveList;
+}
+
+const uint32_t *
+SystemZRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  return CSR_SystemZ_RegMask;
+}
+
+BitVector
+SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (TFI->hasFP(MF)) {
+    // R11D is the frame pointer.  Reserve all aliases.
+    Reserved.set(SystemZ::R11D);
+    Reserved.set(SystemZ::R11L);
+    Reserved.set(SystemZ::R11H);
+    Reserved.set(SystemZ::R10Q);
+  }
+
+  // R15D is the stack pointer.  Reserve all aliases.
+  Reserved.set(SystemZ::R15D);
+  Reserved.set(SystemZ::R15L);
+  Reserved.set(SystemZ::R15H);
+  Reserved.set(SystemZ::R14Q);
+  return Reserved;
+}
+
+void
+SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                         int SPAdj, unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Outgoing arguments should be part of the frame");
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Decompose the frame index into a base and offset.
+  int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
+  unsigned BasePtr = getFrameRegister(MF);
+  int64_t Offset = (TFI->getFrameIndexOffset(MF, FrameIndex) +
+                    MI->getOperand(FIOperandNum + 1).getImm());
+
+  // Special handling of dbg_value instructions.
+  if (MI->isDebugValue()) {
+    MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, /*isDef*/ false);
+    MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // See if the offset is in range, or if an equivalent instruction that
+  // accepts the offset exists.
+  unsigned Opcode = MI->getOpcode();
+  unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+  if (OpcodeForOffset)
+    MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+  else {
+    // Create an anchor point that is in range.  Start at 0xffff so that
+    // can use LLILH to load the immediate.
+    int64_t OldOffset = Offset;
+    int64_t Mask = 0xffff;
+    do {
+      Offset = OldOffset & Mask;
+      OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+      Mask >>= 1;
+      assert(Mask && "One offset must be OK");
+    } while (!OpcodeForOffset);
+
+    unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    int64_t HighOffset = OldOffset - Offset;
+
+    if (MI->getDesc().TSFlags & SystemZII::HasIndex
+        && MI->getOperand(FIOperandNum + 2).getReg() == 0) {
+      // Load the offset into the scratch register and use it as an index.
+      // The scratch register then dies here.
+      TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+      MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+      MI->getOperand(FIOperandNum + 2).ChangeToRegister(ScratchReg,
+                                                        false, false, true);
+    } else {
+      // Load the anchor address into a scratch register.
+      unsigned LAOpcode = TII->getOpcodeForOffset(SystemZ::LA, HighOffset);
+      if (LAOpcode)
+        BuildMI(MBB, MI, DL, TII->get(LAOpcode),ScratchReg)
+          .addReg(BasePtr).addImm(HighOffset).addReg(0);
+      else {
+        // Load the high offset into the scratch register and use it as
+        // an index.
+        TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+        BuildMI(MBB, MI, DL, TII->get(SystemZ::AGR),ScratchReg)
+          .addReg(ScratchReg, RegState::Kill).addReg(BasePtr);
+      }
+
+      // Use the scratch register as the base.  It then dies here.
+      MI->getOperand(FIOperandNum).ChangeToRegister(ScratchReg,
+                                                    false, false, true);
+    }
+  }
+  MI->setDesc(TII->get(OpcodeForOffset));
+  MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned
+SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
new file mode 100644
index 0000000..9bffa46
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -0,0 +1,58 @@
+//===-- SystemZRegisterInfo.h - SystemZ register information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZREGISTERINFO_H
+#define SystemZREGISTERINFO_H
+
+#include "SystemZ.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SystemZGenRegisterInfo.inc"
+
+namespace llvm {
+
+namespace SystemZ {
+// Return the subreg to use for referring to the even and odd registers
+// in a GR128 pair.  Is32Bit says whether we want a GR32 or GR64.
+inline unsigned even128(bool Is32bit) {
+  return Is32bit ? subreg_hl32 : subreg_h64;
+}
+inline unsigned odd128(bool Is32bit) {
+  return Is32bit ? subreg_l32 : subreg_l64;
+}
+} // end namespace SystemZ
+
+struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
+public:
+  SystemZRegisterInfo();
+
+  // Override TargetRegisterInfo.h.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+    return true;
+  }
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const
+    override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
new file mode 100644
index 0000000..47ac20d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -0,0 +1,190 @@
+//==- SystemZRegisterInfo.td - SystemZ register definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions.
+//===----------------------------------------------------------------------===//
+
+class SystemZReg<string n> : Register<n> {
+  let Namespace = "SystemZ";
+}
+
+class SystemZRegWithSubregs<string n, list<Register> subregs>
+  : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "SystemZ";
+}
+
+let Namespace = "SystemZ" in {
+def subreg_l32   : SubRegIndex<32, 0>;  // Also acts as subreg_ll32.
+def subreg_h32   : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+def subreg_l64   : SubRegIndex<64, 0>;
+def subreg_h64   : SubRegIndex<64, 64>;
+def subreg_hh32  : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+def subreg_hl32  : ComposedSubRegIndex<subreg_h64, subreg_l32>;
+}
+
+// Define a register class that contains values of type TYPE and an
+// associated operand called NAME.  SIZE is the size and alignment
+// of the registers and REGLIST is the list of individual registers.
+multiclass SystemZRegClass<string name, ValueType type, int size, dag regList> {
+  def AsmOperand : AsmOperandClass {
+    let Name = name;
+    let ParserMethod = "parse"##name;
+    let RenderMethod = "addRegOperands";
+  }
+  def Bit : RegisterClass<"SystemZ", [type], size, regList> {
+    let Size = size;
+  }
+  def "" : RegisterOperand<!cast<RegisterClass>(name##"Bit")> {
+    let ParserMatchClass = !cast<AsmOperandClass>(name##"AsmOperand");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// General-purpose registers
+//===----------------------------------------------------------------------===//
+
+// Lower 32 bits of one of the 16 64-bit general-purpose registers
+class GPR32<bits<16> num, string n> : SystemZReg<n> {
+  let HWEncoding = num;
+}
+
+// One of the 16 64-bit general-purpose registers.
+class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_l32, subreg_h32];
+}
+
+// 8 even-odd pairs of GPR64s.
+class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_l64, subreg_h64];
+}
+
+// General-purpose registers
+foreach I = 0-15 in {
+  def R#I#L : GPR32<I, "r"#I>;
+  def R#I#H : GPR32<I, "r"#I>;
+  def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"L"), !cast<GPR32>("R"#I#"H")>,
+                    DwarfRegNum<[I]>;
+}
+
+foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
+  def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#!add(I, 1)#"D"),
+                     !cast<GPR64>("R"#I#"D")>;
+}
+
+/// Allocate the callee-saved R6-R13 backwards. That way they can be saved
+/// together with R14 and R15 in one prolog instruction.
+defm GR32  : SystemZRegClass<"GR32",  i32, 32, (add (sequence "R%uL",  0, 5),
+                                                    (sequence "R%uL", 15, 6))>;
+defm GRH32 : SystemZRegClass<"GRH32", i32, 32, (add (sequence "R%uH",  0, 5),
+                                                    (sequence "R%uH", 15, 6))>;
+defm GR64  : SystemZRegClass<"GR64",  i64, 64, (add (sequence "R%uD",  0, 5),
+                                                    (sequence "R%uD", 15, 6))>;
+
+// Combine the low and high GR32s into a single class.  This can only be
+// used for virtual registers if the high-word facility is available.
+defm GRX32 : SystemZRegClass<"GRX32", i32, 32,
+                             (add (sequence "R%uL",  0, 5),
+                                  (sequence "R%uH",  0, 5),
+                                  R15L, R15H, R14L, R14H, R13L, R13H,
+                                  R12L, R12H, R11L, R11H, R10L, R10H,
+                                  R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
+
+// The architecture doesn't really have any i128 support, so model the
+// register pairs as untyped instead.
+defm GR128 : SystemZRegClass<"GR128", untyped, 128, (add R0Q, R2Q, R4Q,
+                                                         R12Q, R10Q, R8Q, R6Q,
+                                                         R14Q)>;
+
+// Base and index registers.  Everything except R0, which in an address
+// context evaluates as 0.
+defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0L)>;
+defm ADDR64 : SystemZRegClass<"ADDR64", i64, 64, (sub GR64Bit, R0D)>;
+
+// Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
+// of a GR128.
+defm ADDR128 : SystemZRegClass<"ADDR128", untyped, 128, (sub GR128Bit, R0Q)>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point registers
+//===----------------------------------------------------------------------===//
+
+// Maps FPR register numbers to their DWARF encoding.
+class DwarfMapping<int id> { int Id = id; }
+
+def F0Dwarf  : DwarfMapping<16>;
+def F2Dwarf  : DwarfMapping<17>;
+def F4Dwarf  : DwarfMapping<18>;
+def F6Dwarf  : DwarfMapping<19>;
+
+def F1Dwarf  : DwarfMapping<20>;
+def F3Dwarf  : DwarfMapping<21>;
+def F5Dwarf  : DwarfMapping<22>;
+def F7Dwarf  : DwarfMapping<23>;
+
+def F8Dwarf  : DwarfMapping<24>;
+def F10Dwarf : DwarfMapping<25>;
+def F12Dwarf : DwarfMapping<26>;
+def F14Dwarf : DwarfMapping<27>;
+
+def F9Dwarf  : DwarfMapping<28>;
+def F11Dwarf : DwarfMapping<29>;
+def F13Dwarf : DwarfMapping<30>;
+def F15Dwarf : DwarfMapping<31>;
+
+// Lower 32 bits of one of the 16 64-bit floating-point registers
+class FPR32<bits<16> num, string n> : SystemZReg<n> {
+  let HWEncoding = num;
+}
+
+// One of the 16 64-bit floating-point registers
+class FPR64<bits<16> num, string n, FPR32 low>
+ : SystemZRegWithSubregs<n, [low]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_h32];
+}
+
+// 8 pairs of FPR64s, with a one-register gap inbetween.
+class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_l64, subreg_h64];
+}
+
+// Floating-point registers
+foreach I = 0-15 in {
+  def F#I#S : FPR32<I, "f"#I>;
+  def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
+              DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+
+foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
+  def F#I#Q  : FPR128<I, "f"#I, !cast<FPR64>("F"#!add(I, 2)#"D"),
+                     !cast<FPR64>("F"#I#"D")>;
+}
+
+// There's no store-multiple instruction for FPRs, so we're not fussy
+// about the order in which call-saved registers are allocated.
+defm FP32  : SystemZRegClass<"FP32", f32, 32, (sequence "F%uS", 0, 15)>;
+defm FP64  : SystemZRegClass<"FP64", f64, 64, (sequence "F%uD", 0, 15)>;
+defm FP128 : SystemZRegClass<"FP128", f128, 128, (add F0Q, F1Q, F4Q, F5Q,
+                                                      F8Q, F9Q, F12Q, F13Q)>;
+
+//===----------------------------------------------------------------------===//
+// Other registers
+//===----------------------------------------------------------------------===//
+
+// The 2-bit condition code field of the PSW.  Every register named in an
+// inline asm needs a class associated with it.
+def CC : SystemZReg<"cc">;
+def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
new file mode 100644
index 0000000..a3cba64
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -0,0 +1,292 @@
+//===-- SystemZSelectionDAGInfo.cpp - SystemZ SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-selectiondag-info"
+
+SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
+}
+
+// Decide whether it is best to use a loop or straight-line code for
+// a block operation of Size bytes with source address Src and destination
+// address Dest.  Sequence is the opcode to use for straight-line code
+// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
+// Return the chain for the completed operation.
+static SDValue emitMemMem(SelectionDAG &DAG, SDLoc DL, unsigned Sequence,
+                          unsigned Loop, SDValue Chain, SDValue Dst,
+                          SDValue Src, uint64_t Size) {
+  EVT PtrVT = Src.getValueType();
+  // The heuristic we use is to prefer loops for anything that would
+  // require 7 or more MVCs.  With these kinds of sizes there isn't
+  // much to choose between straight-line code and looping code,
+  // since the time will be dominated by the MVCs themselves.
+  // However, the loop has 4 or 5 instructions (depending on whether
+  // the base addresses can be proved equal), so there doesn't seem
+  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
+  // the range (5 * 256, 6 * 256) will need another instruction after
+  // the loop, so it doesn't seem worth using a loop then either.
+  // The next value up, 6 * 256, can be implemented in the same
+  // number of straight-line MVCs as 6 * 256 - 1.
+  if (Size > 6 * 256)
+    return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size, PtrVT));
+}
+
+SDValue SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+                        bool IsVolatile, bool AlwaysInline,
+                        MachinePointerInfo DstPtrInfo,
+                        MachinePointerInfo SrcPtrInfo) const {
+  if (IsVolatile)
+    return SDValue();
+
+  if (auto *CSize = dyn_cast<ConstantSDNode>(Size))
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, Dst, Src, CSize->getZExtValue());
+  return SDValue();
+}
+
+// Handle a memset of 1, 2, 4 or 8 bytes with the operands given by
+// Chain, Dst, ByteVal and Size.  These cases are expected to use
+// MVI, MVHHI, MVHI and MVGHI respectively.
+static SDValue memsetStore(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                           SDValue Dst, uint64_t ByteVal, uint64_t Size,
+                           unsigned Align,
+                           MachinePointerInfo DstPtrInfo) {
+  uint64_t StoreVal = ByteVal;
+  for (unsigned I = 1; I < Size; ++I)
+    StoreVal |= ByteVal << (I * 8);
+  return DAG.getStore(Chain, DL,
+                      DAG.getConstant(StoreVal, MVT::getIntegerVT(Size * 8)),
+                      Dst, DstPtrInfo, false, false, Align);
+}
+
+SDValue SystemZSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dst, SDValue Byte, SDValue Size,
+                        unsigned Align, bool IsVolatile,
+                        MachinePointerInfo DstPtrInfo) const {
+  EVT PtrVT = Dst.getValueType();
+
+  if (IsVolatile)
+    return SDValue();
+
+  if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    if (Bytes == 0)
+      return SDValue();
+    if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+      // Handle cases that can be done using at most two of
+      // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
+      // used if ByteVal is all zeros or all ones; in other casees,
+      // we can move at most 2 halfwords.
+      uint64_t ByteVal = CByte->getZExtValue();
+      if (ByteVal == 0 || ByteVal == 255 ?
+          Bytes <= 16 && CountPopulation_64(Bytes) <= 2 :
+          Bytes <= 4) {
+        unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
+        unsigned Size2 = Bytes - Size1;
+        SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
+                                     Align, DstPtrInfo);
+        if (Size2 == 0)
+          return Chain1;
+        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                          DAG.getConstant(Size1, PtrVT));
+        DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
+        SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
+                                     std::min(Align, Size1), DstPtrInfo);
+        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+      }
+    } else {
+      // Handle one and two bytes using STC.
+      if (Bytes <= 2) {
+        SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+                                      false, false, Align);
+        if (Bytes == 1)
+          return Chain1;
+        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
+        SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
+                                      DstPtrInfo.getWithOffset(1),
+                                      false, false, 1);
+        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+      }
+    }
+    assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
+
+    // Handle the special case of a memset of 0, which can use XC.
+    auto *CByte = dyn_cast<ConstantSDNode>(Byte);
+    if (CByte && CByte->getZExtValue() == 0)
+      return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
+                        Chain, Dst, Dst, Bytes);
+
+    // Copy the byte to the first location and then use MVC to copy
+    // it to the rest.
+    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+                         false, false, Align);
+    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, DstPlus1, Dst, Bytes - 1);
+  }
+  return SDValue();
+}
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
+// deciding whether to use a loop or straight-line code.
+static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  EVT PtrVT = Src1.getValueType();
+  // A two-CLC sequence is a clear win over a loop, not least because it
+  // needs only one branch.  A three-CLC sequence needs the same number
+  // of branches as a loop (i.e. 2), but is shorter.  That brings us to
+  // lengths greater than 768 bytes.  It seems relatively likely that
+  // a difference will be found within the first 768 bytes, so we just
+  // optimize for the smallest number of branch instructions, in order
+  // to avoid polluting the prediction buffer too much.  A loop only ever
+  // needs 2 branches, whereas a straight-line sequence would need 3 or more.
+  if (Size > 3 * 256)
+    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+                     DAG.getConstant(Size, PtrVT));
+}
+
+// Convert the current CC value into an integer that is 0 if CC == 0,
+// less than zero if CC == 1 and greater than zero if CC >= 2.
+// The sequence starts with IPM, which puts CC into bits 29 and 28
+// of an integer and clears bits 30 and 31.
+static SDValue addIPMSequence(SDLoc DL, SDValue Glue, SelectionDAG &DAG) {
+  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+                            DAG.getConstant(SystemZ::IPM_CC, MVT::i32));
+  SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
+                             DAG.getConstant(31, MVT::i32));
+  return ROTL;
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2, SDValue Size,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    assert(Bytes > 0 && "Caller should have handled 0-size case");
+    Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+    SDValue Glue = Chain.getValue(1);
+    return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+  }
+  return std::make_pair(SDValue(), SDValue());
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src, SDValue Char, SDValue Length,
+                        MachinePointerInfo SrcPtrInfo) const {
+  // Use SRST to find the character.  End is its address on success.
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
+  Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
+  Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
+                     DAG.getConstant(255, MVT::i32));
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, Char);
+  Chain = End.getValue(1);
+  SDValue Glue = End.getValue(2);
+
+  // Now select between End and null, depending on whether the character
+  // was found.
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(End);
+  Ops.push_back(DAG.getConstant(0, PtrVT));
+  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST, MVT::i32));
+  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32));
+  Ops.push_back(Glue);
+  VTs = DAG.getVTList(PtrVT, MVT::Glue);
+  End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+  return std::make_pair(End, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dest, SDValue Src,
+                        MachinePointerInfo DestPtrInfo,
+                        MachinePointerInfo SrcPtrInfo, bool isStpcpy) const {
+  SDVTList VTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
+  SDValue EndDest = DAG.getNode(SystemZISD::STPCPY, DL, VTs, Chain, Dest, Src,
+                                DAG.getConstant(0, MVT::i32));
+  return std::make_pair(isStpcpy ? EndDest : Dest, EndDest.getValue(1));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+  SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+                               DAG.getConstant(0, MVT::i32));
+  Chain = Unused.getValue(1);
+  SDValue Glue = Chain.getValue(2);
+  return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+}
+
+// Search from Src for a null character, stopping once Src reaches Limit.
+// Return a pair of values, the first being the number of nonnull characters
+// and the second being the out chain.
+//
+// This can be used for strlen by setting Limit to 0.
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG, SDLoc DL,
+                                                    SDValue Chain, SDValue Src,
+                                                    SDValue Limit) {
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, DAG.getConstant(0, MVT::i32));
+  Chain = End.getValue(1);
+  SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
+  return std::make_pair(Len, Chain);
+}    
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src, MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  return getBoundedStrlen(DAG, DL, Chain, Src, DAG.getConstant(0, PtrVT));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                         SDValue Src, SDValue MaxLength,
+                         MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  MaxLength = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, MaxLength);
+  return getBoundedStrlen(DAG, DL, Chain, Src, Limit);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
new file mode 100644
index 0000000..e9de146
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -0,0 +1,77 @@
+//===-- SystemZSelectionDAGInfo.h - SystemZ SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SystemZ subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZSELECTIONDAGINFO_H
+#define SYSTEMZSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit SystemZSelectionDAGInfo(const DataLayout &DL);
+  ~SystemZSelectionDAGInfo();
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool IsVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
+                                  SDValue Chain, SDValue Dst, SDValue Byte,
+                                  SDValue Size, unsigned Align, bool IsVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2, SDValue Size,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src, SDValue Char, SDValue Length,
+                          MachinePointerInfo SrcPtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Dest, SDValue Src,
+                          MachinePointerInfo DestPtrInfo,
+                          MachinePointerInfo SrcPtrInfo,
+                          bool isStpcpy) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src,
+                          MachinePointerInfo SrcPtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                           SDValue Src, SDValue MaxLength,
+                           MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
new file mode 100644
index 0000000..aad899c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -0,0 +1,160 @@
+//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to replace instructions with shorter forms.  For example,
+// IILF can be replaced with LLILL or LLILH if the constant fits and if the
+// other 32 bits of the GR64 destination are not live.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-shorten-inst"
+
+namespace {
+class SystemZShortenInst : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZShortenInst(const SystemZTargetMachine &tm);
+
+  const char *getPassName() const override {
+    return "SystemZ Instruction Shortening";
+  }
+
+  bool processBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+  bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
+                  unsigned LLIxL, unsigned LLIxH);
+
+  const SystemZInstrInfo *TII;
+
+  // LowGPRs[I] has bit N set if LLVM register I includes the low
+  // word of GPR N.  HighGPRs is the same for the high word.
+  unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
+  unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
+};
+
+char SystemZShortenInst::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
+  return new SystemZShortenInst(TM);
+}
+
+SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
+  : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() {
+  // Set up LowGPRs and HighGPRs.
+  for (unsigned I = 0; I < 16; ++I) {
+    LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I;
+    LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+    HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I;
+    HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+    if (unsigned GR128 = SystemZMC::GR128Regs[I]) {
+      LowGPRs[GR128] |= 3 << I;
+      HighGPRs[GR128] |= 3 << I;
+    }
+  }
+}
+
+// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
+// are the halfword immediate loads for the same word.  Try to use one of them
+// instead of IIxF.  If MI loads the high word, GPRMap[X] is the set of high
+// words referenced by LLVM register X while LiveOther is the mask of low
+// words that are currently live, and vice versa.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
+                                    unsigned LiveOther, unsigned LLIxL,
+                                    unsigned LLIxH) {
+  unsigned Reg = MI.getOperand(0).getReg();
+  assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+  unsigned GPRs = GPRMap[Reg];
+  assert(GPRs != 0 && "Register must be a GPR");
+  if (GPRs & LiveOther)
+    return false;
+
+  uint64_t Imm = MI.getOperand(1).getImm();
+  if (SystemZ::isImmLL(Imm)) {
+    MI.setDesc(TII->get(LLIxL));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    return true;
+  }
+  if (SystemZ::isImmLH(Imm)) {
+    MI.setDesc(TII->get(LLIxH));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    MI.getOperand(1).setImm(Imm >> 16);
+    return true;
+  }
+  return false;
+}
+
+// Process all instructions in MBB.  Return true if something changed.
+bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  // Work out which words are live on exit from the block.
+  unsigned LiveLow = 0;
+  unsigned LiveHigh = 0;
+  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) {
+    for (auto LI = (*SI)->livein_begin(), LE = (*SI)->livein_end();
+         LI != LE; ++LI) {
+      unsigned Reg = *LI;
+      assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+      LiveLow |= LowGPRs[Reg];
+      LiveHigh |= HighGPRs[Reg];
+    }
+  }
+
+  // Iterate backwards through the block looking for instructions to change.
+  for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == SystemZ::IILF)
+      Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
+                            SystemZ::LLILH);
+    else if (Opcode == SystemZ::IIHF)
+      Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
+                            SystemZ::LLIHH);
+    unsigned UsedLow = 0;
+    unsigned UsedHigh = 0;
+    for (auto MOI = MI.operands_begin(), MOE = MI.operands_end();
+         MOI != MOE; ++MOI) {
+      MachineOperand &MO = *MOI;
+      if (MO.isReg()) {
+        if (unsigned Reg = MO.getReg()) {
+          assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+          if (MO.isDef()) {
+            LiveLow &= ~LowGPRs[Reg];
+            LiveHigh &= ~HighGPRs[Reg];
+          } else if (!MO.isUndef()) {
+            UsedLow |= LowGPRs[Reg];
+            UsedHigh |= HighGPRs[Reg];
+          }
+        }
+      }
+    }
+    LiveLow |= UsedLow;
+    LiveHigh |= UsedHigh;
+  }
+
+  return Changed;
+}
+
+bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+
+  bool Changed = false;
+  for (auto &MBB : F)
+    Changed |= processBlock(MBB);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
new file mode 100644
index 0000000..e160bc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -0,0 +1,80 @@
+//===-- SystemZSubtarget.cpp - SystemZ subtarget information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZSubtarget.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Host.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SystemZGenSubtargetInfo.inc"
+
+// Pin the vtable to this file.
+void SystemZSubtarget::anchor() {}
+
+SystemZSubtarget &
+SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+#if defined(__linux__) && defined(__s390x__)
+  if (CPUName == "generic")
+    CPUName = sys::getHostCPUName();
+#endif
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+  return *this;
+}
+
+SystemZSubtarget::SystemZSubtarget(const std::string &TT,
+                                   const std::string &CPU,
+                                   const std::string &FS,
+                                   const TargetMachine &TM)
+    : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
+      HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+      HasFastSerialization(false), HasInterlockedAccess1(false),
+      TargetTriple(TT),
+      // Make sure that global data has at least 16 bits of alignment by
+      // default, so that we can refer to it using LARL.  We don't have any
+      // special requirements for stack variables though.
+      DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+      TSInfo(DL), FrameLowering() {}
+
+// Return true if GV binds locally under reloc model RM.
+static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) {
+  // For non-PIC, all symbols bind locally.
+  if (RM == Reloc::Static)
+    return true;
+
+  return GV->hasLocalLinkage() || !GV->hasDefaultVisibility();
+}
+
+bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
+                                       Reloc::Model RM,
+                                       CodeModel::Model CM) const {
+  // PC32DBL accesses require the low bit to be clear.  Note that a zero
+  // value selects the default alignment and is therefore OK.
+  if (GV->getAlignment() == 1)
+    return false;
+
+  // For the small model, all locally-binding symbols are in range.
+  if (CM == CodeModel::Small)
+    return bindsLocally(GV, RM);
+
+  // For Medium and above, assume that the symbol is not within the 4GB range.
+  // Taking the address of locally-defined text would be OK, but that
+  // case isn't easy to detect.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
new file mode 100644
index 0000000..4e8c710
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -0,0 +1,100 @@
+//===-- SystemZSubtarget.h - SystemZ subtarget information -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZSUBTARGET_H
+#define SYSTEMZSUBTARGET_H
+
+#include "SystemZFrameLowering.h"
+#include "SystemZISelLowering.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SystemZGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+
+class SystemZSubtarget : public SystemZGenSubtargetInfo {
+  virtual void anchor();
+protected:
+  bool HasDistinctOps;
+  bool HasLoadStoreOnCond;
+  bool HasHighWord;
+  bool HasFPExtension;
+  bool HasFastSerialization;
+  bool HasInterlockedAccess1;
+
+private:
+  Triple TargetTriple;
+  const DataLayout DL;
+  SystemZInstrInfo InstrInfo;
+  SystemZTargetLowering TLInfo;
+  SystemZSelectionDAGInfo TSInfo;
+  SystemZFrameLowering FrameLowering;
+
+  SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
+                                                    StringRef FS);
+public:
+  SystemZSubtarget(const std::string &TT, const std::string &CPU,
+                   const std::string &FS, const TargetMachine &TM);
+
+  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const SystemZInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const SystemZRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const SystemZTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+
+  // This is important for reducing register pressure in vector code.
+  bool useAA() const override { return true; }
+
+  // Automatically generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  // Return true if the target has the distinct-operands facility.
+  bool hasDistinctOps() const { return HasDistinctOps; }
+
+  // Return true if the target has the load/store-on-condition facility.
+  bool hasLoadStoreOnCond() const { return HasLoadStoreOnCond; }
+
+  // Return true if the target has the high-word facility.
+  bool hasHighWord() const { return HasHighWord; }
+
+  // Return true if the target has the floating-point extension facility.
+  bool hasFPExtension() const { return HasFPExtension; }
+
+  // Return true if the target has the fast-serialization facility.
+  bool hasFastSerialization() const { return HasFastSerialization; }
+
+  // Return true if the target has interlocked-access facility 1.
+  bool hasInterlockedAccess1() const { return HasInterlockedAccess1; }
+
+  // Return true if GV can be accessed using LARL for reloc model RM
+  // and code model CM.
+  bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
+                       CodeModel::Model CM) const;
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
new file mode 100644
index 0000000..0122e99
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -0,0 +1,101 @@
+//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeSystemZTarget() {
+  // Register the target.
+  RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget);
+}
+
+SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+
+namespace {
+/// SystemZ Code Generator Pass Configuration Options.
+class SystemZPassConfig : public TargetPassConfig {
+public:
+  SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  SystemZTargetMachine &getSystemZTargetMachine() const {
+    return getTM<SystemZTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
+};
+} // end anonymous namespace
+
+void SystemZPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+  addPass(createPartiallyInlineLibCallsPass());
+}
+
+bool SystemZPassConfig::addInstSelector() {
+  addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool SystemZPassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None &&
+      getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond())
+    addPass(&IfConverterID);
+  return true;
+}
+
+bool SystemZPassConfig::addPreEmitPass() {
+  // We eliminate comparisons here rather than earlier because some
+  // transformations can change the set of available CC values and we
+  // generally want those transformations to have priority.  This is
+  // especially true in the commonest case where the result of the comparison
+  // is used by a single in-range branch instruction, since we will then
+  // be able to fuse the compare and the branch instead.
+  //
+  // For example, two-address NILF can sometimes be converted into
+  // three-address RISBLG.  NILF produces a CC value that indicates whether
+  // the low word is zero, but RISBLG does not modify CC at all.  On the
+  // other hand, 64-bit ANDs like NILL can sometimes be converted to RISBG.
+  // The CC value produced by NILL isn't useful for our purposes, but the
+  // value produced by RISBG can be used for any comparison with zero
+  // (not just equality).  So there are some transformations that lose
+  // CC values (while still being worthwhile) and others that happen to make
+  // the CC result more useful than it was originally.
+  //
+  // Another reason is that we only want to use BRANCH ON COUNT in cases
+  // where we know that the count register is not going to be spilled.
+  //
+  // Doing it so late makes it more likely that a register will be reused
+  // between the comparison and the branch, but it isn't clear whether
+  // preventing that would be a win or not.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZElimComparePass(getSystemZTargetMachine()));
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZShortenInstPass(getSystemZTargetMachine()));
+  addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
+  return true;
+}
+
+TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SystemZPassConfig(this, PM);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
new file mode 100644
index 0000000..ded07e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -0,0 +1,63 @@
+//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SYSTEMZTARGETMACHINE_H
+#define SYSTEMZTARGETMACHINE_H
+
+#include "SystemZSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class TargetFrameLowering;
+
+class SystemZTargetMachine : public LLVMTargetMachine {
+  SystemZSubtarget        Subtarget;
+
+public:
+  SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+
+  // Override TargetMachine.
+  const TargetFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const SystemZInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const SystemZSubtarget *getSubtargetImpl() const override {
+    return &Subtarget;
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const SystemZRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const SystemZTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+
+  // Override LLVMTargetMachine
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
new file mode 100644
index 0000000..8f9aa28
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- SystemZTargetInfo.cpp - SystemZ target implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target llvm::TheSystemZTarget;
+
+extern "C" void LLVMInitializeSystemZTargetInfo() {
+  RegisterTarget<Triple::systemz, /*HasJIT=*/true>
+    X(TheSystemZTarget, "systemz", "SystemZ");
+}
diff --git a/contrib/llvm/lib/Target/Target.cpp b/contrib/llvm/lib/Target/Target.cpp
new file mode 100644
index 0000000..d277f82
--- /dev/null
+++ b/contrib/llvm/lib/Target/Target.cpp
@@ -0,0 +1,136 @@
+//===-- Target.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for 
+// libLLVMTarget.a, which implements target information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Target.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <cstring>
+
+using namespace llvm;
+
+inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) {
+  return reinterpret_cast<TargetLibraryInfo*>(P);
+}
+
+inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfo *P) {
+  TargetLibraryInfo *X = const_cast<TargetLibraryInfo*>(P);
+  return reinterpret_cast<LLVMTargetLibraryInfoRef>(X);
+}
+
+void llvm::initializeTarget(PassRegistry &Registry) {
+  initializeDataLayoutPassPass(Registry);
+  initializeTargetLibraryInfoPass(Registry);
+}
+
+void LLVMInitializeTarget(LLVMPassRegistryRef R) {
+  initializeTarget(*unwrap(R));
+}
+
+LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
+  return wrap(new DataLayout(StringRep));
+}
+
+void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
+  // The DataLayoutPass must now be in sync with the module. Unfortunatelly we
+  // cannot enforce that from the C api.
+  unwrap(PM)->add(new DataLayoutPass(*unwrap(TD)));
+}
+
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+                              LLVMPassManagerRef PM) {
+  unwrap(PM)->add(new TargetLibraryInfo(*unwrap(TLI)));
+}
+
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
+  std::string StringRep = unwrap(TD)->getStringRepresentation();
+  return strdup(StringRep.c_str());
+}
+
+LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
+  return unwrap(TD)->isLittleEndian() ? LLVMLittleEndian : LLVMBigEndian;
+}
+
+unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
+  return unwrap(TD)->getPointerSize(0);
+}
+
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return unwrap(TD)->getPointerSize(AS);
+}
+
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
+}
+
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C)));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C), AS));
+}
+
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
+}
+
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeStoreSize(unwrap(Ty));
+}
+
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeAllocSize(unwrap(Ty));
+}
+
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getPrefTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
+                                        LLVMValueRef GlobalVar) {
+  return unwrap(TD)->getPreferredAlignment(unwrap<GlobalVariable>(GlobalVar));
+}
+
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                             unsigned long long Offset) {
+  StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
+}
+
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                                       unsigned Element) {
+  StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
+}
+
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+  delete unwrap(TD);
+}
diff --git a/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
new file mode 100644
index 0000000..64bd56f
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
@@ -0,0 +1,30 @@
+//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetIntrinsicInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+TargetIntrinsicInfo::TargetIntrinsicInfo() {
+}
+
+TargetIntrinsicInfo::~TargetIntrinsicInfo() {
+}
+
+unsigned TargetIntrinsicInfo::getIntrinsicID(Function *F) const {
+  const ValueName *ValName = F->getValueName();
+  if (!ValName)
+    return 0;
+  return lookupName(ValName->getKeyData(), ValName->getKeyLength());
+}
diff --git a/contrib/llvm/lib/Target/TargetJITInfo.cpp b/contrib/llvm/lib/Target/TargetJITInfo.cpp
new file mode 100644
index 0000000..aafedf8
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetJITInfo.cpp
@@ -0,0 +1,14 @@
+//===- Target/TargetJITInfo.h - Target Information for JIT ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetJITInfo.h"
+
+using namespace llvm;
+
+void TargetJITInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/TargetLibraryInfo.cpp b/contrib/llvm/lib/Target/TargetLibraryInfo.cpp
new file mode 100644
index 0000000..616ff90
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetLibraryInfo.cpp
@@ -0,0 +1,743 @@
+//===-- TargetLibraryInfo.cpp - Runtime library information ----------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetLibraryInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+// Register the default implementation.
+INITIALIZE_PASS(TargetLibraryInfo, "targetlibinfo",
+                "Target Library Information", false, true)
+char TargetLibraryInfo::ID = 0;
+
+void TargetLibraryInfo::anchor() { }
+
+const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
+  {
+    "_IO_getc",
+    "_IO_putc",
+    "_ZdaPv",
+    "_ZdaPvRKSt9nothrow_t",
+    "_ZdlPv",
+    "_ZdlPvRKSt9nothrow_t",
+    "_Znaj",
+    "_ZnajRKSt9nothrow_t",
+    "_Znam",
+    "_ZnamRKSt9nothrow_t",
+    "_Znwj",
+    "_ZnwjRKSt9nothrow_t",
+    "_Znwm",
+    "_ZnwmRKSt9nothrow_t",
+    "__cospi",
+    "__cospif",
+    "__cxa_atexit",
+    "__cxa_guard_abort",
+    "__cxa_guard_acquire",
+    "__cxa_guard_release",
+    "__isoc99_scanf",
+    "__isoc99_sscanf",
+    "__memcpy_chk",
+    "__sincospi_stret",
+    "__sincospif_stret",
+    "__sinpi",
+    "__sinpif",
+    "__sqrt_finite",
+    "__sqrtf_finite",
+    "__sqrtl_finite",
+    "__strdup",
+    "__strndup",
+    "__strtok_r",
+    "abs",
+    "access",
+    "acos",
+    "acosf",
+    "acosh",
+    "acoshf",
+    "acoshl",
+    "acosl",
+    "asin",
+    "asinf",
+    "asinh",
+    "asinhf",
+    "asinhl",
+    "asinl",
+    "atan",
+    "atan2",
+    "atan2f",
+    "atan2l",
+    "atanf",
+    "atanh",
+    "atanhf",
+    "atanhl",
+    "atanl",
+    "atof",
+    "atoi",
+    "atol",
+    "atoll",
+    "bcmp",
+    "bcopy",
+    "bzero",
+    "calloc",
+    "cbrt",
+    "cbrtf",
+    "cbrtl",
+    "ceil",
+    "ceilf",
+    "ceill",
+    "chmod",
+    "chown",
+    "clearerr",
+    "closedir",
+    "copysign",
+    "copysignf",
+    "copysignl",
+    "cos",
+    "cosf",
+    "cosh",
+    "coshf",
+    "coshl",
+    "cosl",
+    "ctermid",
+    "exp",
+    "exp10",
+    "exp10f",
+    "exp10l",
+    "exp2",
+    "exp2f",
+    "exp2l",
+    "expf",
+    "expl",
+    "expm1",
+    "expm1f",
+    "expm1l",
+    "fabs",
+    "fabsf",
+    "fabsl",
+    "fclose",
+    "fdopen",
+    "feof",
+    "ferror",
+    "fflush",
+    "ffs",
+    "ffsl",
+    "ffsll",
+    "fgetc",
+    "fgetpos",
+    "fgets",
+    "fileno",
+    "fiprintf",
+    "flockfile",
+    "floor",
+    "floorf",
+    "floorl",
+    "fmax",
+    "fmaxf",
+    "fmaxl",
+    "fmin",
+    "fminf",
+    "fminl",
+    "fmod",
+    "fmodf",
+    "fmodl",
+    "fopen",
+    "fopen64",
+    "fprintf",
+    "fputc",
+    "fputs",
+    "fread",
+    "free",
+    "frexp",
+    "frexpf",
+    "frexpl",
+    "fscanf",
+    "fseek",
+    "fseeko",
+    "fseeko64",
+    "fsetpos",
+    "fstat",
+    "fstat64",
+    "fstatvfs",
+    "fstatvfs64",
+    "ftell",
+    "ftello",
+    "ftello64",
+    "ftrylockfile",
+    "funlockfile",
+    "fwrite",
+    "getc",
+    "getc_unlocked",
+    "getchar",
+    "getenv",
+    "getitimer",
+    "getlogin_r",
+    "getpwnam",
+    "gets",
+    "gettimeofday",
+    "htonl",
+    "htons",
+    "iprintf",
+    "isascii",
+    "isdigit",
+    "labs",
+    "lchown",
+    "ldexp",
+    "ldexpf",
+    "ldexpl",
+    "llabs",
+    "log",
+    "log10",
+    "log10f",
+    "log10l",
+    "log1p",
+    "log1pf",
+    "log1pl",
+    "log2",
+    "log2f",
+    "log2l",
+    "logb",
+    "logbf",
+    "logbl",
+    "logf",
+    "logl",
+    "lstat",
+    "lstat64",
+    "malloc",
+    "memalign",
+    "memccpy",
+    "memchr",
+    "memcmp",
+    "memcpy",
+    "memmove",
+    "memrchr",
+    "memset",
+    "memset_pattern16",
+    "mkdir",
+    "mktime",
+    "modf",
+    "modff",
+    "modfl",
+    "nearbyint",
+    "nearbyintf",
+    "nearbyintl",
+    "ntohl",
+    "ntohs",
+    "open",
+    "open64",
+    "opendir",
+    "pclose",
+    "perror",
+    "popen",
+    "posix_memalign",
+    "pow",
+    "powf",
+    "powl",
+    "pread",
+    "printf",
+    "putc",
+    "putchar",
+    "puts",
+    "pwrite",
+    "qsort",
+    "read",
+    "readlink",
+    "realloc",
+    "reallocf",
+    "realpath",
+    "remove",
+    "rename",
+    "rewind",
+    "rint",
+    "rintf",
+    "rintl",
+    "rmdir",
+    "round",
+    "roundf",
+    "roundl",
+    "scanf",
+    "setbuf",
+    "setitimer",
+    "setvbuf",
+    "sin",
+    "sinf",
+    "sinh",
+    "sinhf",
+    "sinhl",
+    "sinl",
+    "siprintf",
+    "snprintf",
+    "sprintf",
+    "sqrt",
+    "sqrtf",
+    "sqrtl",
+    "sscanf",
+    "stat",
+    "stat64",
+    "statvfs",
+    "statvfs64",
+    "stpcpy",
+    "stpncpy",
+    "strcasecmp",
+    "strcat",
+    "strchr",
+    "strcmp",
+    "strcoll",
+    "strcpy",
+    "strcspn",
+    "strdup",
+    "strlen",
+    "strncasecmp",
+    "strncat",
+    "strncmp",
+    "strncpy",
+    "strndup",
+    "strnlen",
+    "strpbrk",
+    "strrchr",
+    "strspn",
+    "strstr",
+    "strtod",
+    "strtof",
+    "strtok",
+    "strtok_r",
+    "strtol",
+    "strtold",
+    "strtoll",
+    "strtoul",
+    "strtoull",
+    "strxfrm",
+    "system",
+    "tan",
+    "tanf",
+    "tanh",
+    "tanhf",
+    "tanhl",
+    "tanl",
+    "times",
+    "tmpfile",
+    "tmpfile64",
+    "toascii",
+    "trunc",
+    "truncf",
+    "truncl",
+    "uname",
+    "ungetc",
+    "unlink",
+    "unsetenv",
+    "utime",
+    "utimes",
+    "valloc",
+    "vfprintf",
+    "vfscanf",
+    "vprintf",
+    "vscanf",
+    "vsnprintf",
+    "vsprintf",
+    "vsscanf",
+    "write"
+  };
+
+static bool hasSinCosPiStret(const Triple &T) {
+  // Only Darwin variants have _stret versions of combined trig functions.
+  if (!T.isMacOSX() && T.getOS() != Triple::IOS)
+    return false;
+
+  // The ABI is rather complicated on x86, so don't do anything special there.
+  if (T.getArch() == Triple::x86)
+    return false;
+
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
+    return false;
+
+  if (T.getOS() == Triple::IOS && T.isOSVersionLT(7, 0))
+    return false;
+
+  return true;
+}
+
+/// initialize - Initialize the set of available library functions based on the
+/// specified target triple.  This should be carefully written so that a missing
+/// target triple gets a sane set of defaults.
+static void initialize(TargetLibraryInfo &TLI, const Triple &T,
+                       const char **StandardNames) {
+  initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
+
+#ifndef NDEBUG
+  // Verify that the StandardNames array is in alphabetical order.
+  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
+    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
+      llvm_unreachable("TargetLibraryInfo function names must be sorted");
+  }
+#endif // !NDEBUG
+
+  // There are no library implementations of mempcy and memset for r600 and
+  // these can be difficult to lower in the backend.
+  if (T.getArch() == Triple::r600) {
+    TLI.setUnavailable(LibFunc::memcpy);
+    TLI.setUnavailable(LibFunc::memset);
+    TLI.setUnavailable(LibFunc::memset_pattern16);
+    return;
+  }
+
+  // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
+  if (T.isMacOSX()) {
+    if (T.isMacOSXVersionLT(10, 5))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else if (T.isiOS()) {
+    if (T.isOSVersionLT(3, 0))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else {
+    TLI.setUnavailable(LibFunc::memset_pattern16);
+  }
+
+  if (!hasSinCosPiStret(T)) {
+    TLI.setUnavailable(LibFunc::sinpi);
+    TLI.setUnavailable(LibFunc::sinpif);
+    TLI.setUnavailable(LibFunc::cospi);
+    TLI.setUnavailable(LibFunc::cospif);
+    TLI.setUnavailable(LibFunc::sincospi_stret);
+    TLI.setUnavailable(LibFunc::sincospif_stret);
+  }
+
+  if (T.isMacOSX() && T.getArch() == Triple::x86 &&
+      !T.isMacOSXVersionLT(10, 7)) {
+    // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
+    // we don't care about) have two versions; on recent OSX, the one we want
+    // has a $UNIX2003 suffix. The two implementations are identical except
+    // for the return value in some edge cases.  However, we don't want to
+    // generate code that depends on the old symbols.
+    TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003");
+    TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003");
+  }
+
+  // iprintf and friends are only available on XCore and TCE.
+  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
+    TLI.setUnavailable(LibFunc::iprintf);
+    TLI.setUnavailable(LibFunc::siprintf);
+    TLI.setUnavailable(LibFunc::fiprintf);
+  }
+
+  if (T.isOSWindows() && !T.isOSCygMing()) {
+    // Win32 does not support long double
+    TLI.setUnavailable(LibFunc::acosl);
+    TLI.setUnavailable(LibFunc::asinl);
+    TLI.setUnavailable(LibFunc::atanl);
+    TLI.setUnavailable(LibFunc::atan2l);
+    TLI.setUnavailable(LibFunc::ceill);
+    TLI.setUnavailable(LibFunc::copysignl);
+    TLI.setUnavailable(LibFunc::cosl);
+    TLI.setUnavailable(LibFunc::coshl);
+    TLI.setUnavailable(LibFunc::expl);
+    TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
+    TLI.setUnavailable(LibFunc::fabsl);
+    TLI.setUnavailable(LibFunc::floorl);
+    TLI.setUnavailable(LibFunc::fmaxl);
+    TLI.setUnavailable(LibFunc::fminl);
+    TLI.setUnavailable(LibFunc::fmodl);
+    TLI.setUnavailable(LibFunc::frexpl);
+    TLI.setUnavailable(LibFunc::ldexpf);
+    TLI.setUnavailable(LibFunc::ldexpl);
+    TLI.setUnavailable(LibFunc::logl);
+    TLI.setUnavailable(LibFunc::modfl);
+    TLI.setUnavailable(LibFunc::powl);
+    TLI.setUnavailable(LibFunc::sinl);
+    TLI.setUnavailable(LibFunc::sinhl);
+    TLI.setUnavailable(LibFunc::sqrtl);
+    TLI.setUnavailable(LibFunc::tanl);
+    TLI.setUnavailable(LibFunc::tanhl);
+
+    // Win32 only has C89 math
+    TLI.setUnavailable(LibFunc::acosh);
+    TLI.setUnavailable(LibFunc::acoshf);
+    TLI.setUnavailable(LibFunc::acoshl);
+    TLI.setUnavailable(LibFunc::asinh);
+    TLI.setUnavailable(LibFunc::asinhf);
+    TLI.setUnavailable(LibFunc::asinhl);
+    TLI.setUnavailable(LibFunc::atanh);
+    TLI.setUnavailable(LibFunc::atanhf);
+    TLI.setUnavailable(LibFunc::atanhl);
+    TLI.setUnavailable(LibFunc::cbrt);
+    TLI.setUnavailable(LibFunc::cbrtf);
+    TLI.setUnavailable(LibFunc::cbrtl);
+    TLI.setUnavailable(LibFunc::exp2);
+    TLI.setUnavailable(LibFunc::exp2f);
+    TLI.setUnavailable(LibFunc::exp2l);
+    TLI.setUnavailable(LibFunc::expm1);
+    TLI.setUnavailable(LibFunc::expm1f);
+    TLI.setUnavailable(LibFunc::expm1l);
+    TLI.setUnavailable(LibFunc::log2);
+    TLI.setUnavailable(LibFunc::log2f);
+    TLI.setUnavailable(LibFunc::log2l);
+    TLI.setUnavailable(LibFunc::log1p);
+    TLI.setUnavailable(LibFunc::log1pf);
+    TLI.setUnavailable(LibFunc::log1pl);
+    TLI.setUnavailable(LibFunc::logb);
+    TLI.setUnavailable(LibFunc::logbf);
+    TLI.setUnavailable(LibFunc::logbl);
+    TLI.setUnavailable(LibFunc::nearbyint);
+    TLI.setUnavailable(LibFunc::nearbyintf);
+    TLI.setUnavailable(LibFunc::nearbyintl);
+    TLI.setUnavailable(LibFunc::rint);
+    TLI.setUnavailable(LibFunc::rintf);
+    TLI.setUnavailable(LibFunc::rintl);
+    TLI.setUnavailable(LibFunc::round);
+    TLI.setUnavailable(LibFunc::roundf);
+    TLI.setUnavailable(LibFunc::roundl);
+    TLI.setUnavailable(LibFunc::trunc);
+    TLI.setUnavailable(LibFunc::truncf);
+    TLI.setUnavailable(LibFunc::truncl);
+
+    // Win32 provides some C99 math with mangled names
+    TLI.setAvailableWithName(LibFunc::copysign, "_copysign");
+
+    if (T.getArch() == Triple::x86) {
+      // Win32 on x86 implements single-precision math functions as macros
+      TLI.setUnavailable(LibFunc::acosf);
+      TLI.setUnavailable(LibFunc::asinf);
+      TLI.setUnavailable(LibFunc::atanf);
+      TLI.setUnavailable(LibFunc::atan2f);
+      TLI.setUnavailable(LibFunc::ceilf);
+      TLI.setUnavailable(LibFunc::copysignf);
+      TLI.setUnavailable(LibFunc::cosf);
+      TLI.setUnavailable(LibFunc::coshf);
+      TLI.setUnavailable(LibFunc::expf);
+      TLI.setUnavailable(LibFunc::floorf);
+      TLI.setUnavailable(LibFunc::fminf);
+      TLI.setUnavailable(LibFunc::fmaxf);
+      TLI.setUnavailable(LibFunc::fmodf);
+      TLI.setUnavailable(LibFunc::logf);
+      TLI.setUnavailable(LibFunc::powf);
+      TLI.setUnavailable(LibFunc::sinf);
+      TLI.setUnavailable(LibFunc::sinhf);
+      TLI.setUnavailable(LibFunc::sqrtf);
+      TLI.setUnavailable(LibFunc::tanf);
+      TLI.setUnavailable(LibFunc::tanhf);
+    }
+
+    // Win32 does *not* provide provide these functions, but they are
+    // generally available on POSIX-compliant systems:
+    TLI.setUnavailable(LibFunc::access);
+    TLI.setUnavailable(LibFunc::bcmp);
+    TLI.setUnavailable(LibFunc::bcopy);
+    TLI.setUnavailable(LibFunc::bzero);
+    TLI.setUnavailable(LibFunc::chmod);
+    TLI.setUnavailable(LibFunc::chown);
+    TLI.setUnavailable(LibFunc::closedir);
+    TLI.setUnavailable(LibFunc::ctermid);
+    TLI.setUnavailable(LibFunc::fdopen);
+    TLI.setUnavailable(LibFunc::ffs);
+    TLI.setUnavailable(LibFunc::fileno);
+    TLI.setUnavailable(LibFunc::flockfile);
+    TLI.setUnavailable(LibFunc::fseeko);
+    TLI.setUnavailable(LibFunc::fstat);
+    TLI.setUnavailable(LibFunc::fstatvfs);
+    TLI.setUnavailable(LibFunc::ftello);
+    TLI.setUnavailable(LibFunc::ftrylockfile);
+    TLI.setUnavailable(LibFunc::funlockfile);
+    TLI.setUnavailable(LibFunc::getc_unlocked);
+    TLI.setUnavailable(LibFunc::getitimer);
+    TLI.setUnavailable(LibFunc::getlogin_r);
+    TLI.setUnavailable(LibFunc::getpwnam);
+    TLI.setUnavailable(LibFunc::gettimeofday);
+    TLI.setUnavailable(LibFunc::htonl);
+    TLI.setUnavailable(LibFunc::htons);
+    TLI.setUnavailable(LibFunc::lchown);
+    TLI.setUnavailable(LibFunc::lstat);
+    TLI.setUnavailable(LibFunc::memccpy);
+    TLI.setUnavailable(LibFunc::mkdir);
+    TLI.setUnavailable(LibFunc::ntohl);
+    TLI.setUnavailable(LibFunc::ntohs);
+    TLI.setUnavailable(LibFunc::open);
+    TLI.setUnavailable(LibFunc::opendir);
+    TLI.setUnavailable(LibFunc::pclose);
+    TLI.setUnavailable(LibFunc::popen);
+    TLI.setUnavailable(LibFunc::pread);
+    TLI.setUnavailable(LibFunc::pwrite);
+    TLI.setUnavailable(LibFunc::read);
+    TLI.setUnavailable(LibFunc::readlink);
+    TLI.setUnavailable(LibFunc::realpath);
+    TLI.setUnavailable(LibFunc::rmdir);
+    TLI.setUnavailable(LibFunc::setitimer);
+    TLI.setUnavailable(LibFunc::stat);
+    TLI.setUnavailable(LibFunc::statvfs);
+    TLI.setUnavailable(LibFunc::stpcpy);
+    TLI.setUnavailable(LibFunc::stpncpy);
+    TLI.setUnavailable(LibFunc::strcasecmp);
+    TLI.setUnavailable(LibFunc::strncasecmp);
+    TLI.setUnavailable(LibFunc::times);
+    TLI.setUnavailable(LibFunc::uname);
+    TLI.setUnavailable(LibFunc::unlink);
+    TLI.setUnavailable(LibFunc::unsetenv);
+    TLI.setUnavailable(LibFunc::utime);
+    TLI.setUnavailable(LibFunc::utimes);
+    TLI.setUnavailable(LibFunc::write);
+
+    // Win32 does *not* provide provide these functions, but they are
+    // specified by C99:
+    TLI.setUnavailable(LibFunc::atoll);
+    TLI.setUnavailable(LibFunc::frexpf);
+    TLI.setUnavailable(LibFunc::llabs);
+  }
+
+  switch (T.getOS()) {
+  case Triple::MacOSX:
+    // exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
+    // and their names are __exp10 and __exp10f. exp10l is not available on
+    // OS X or iOS.
+    TLI.setUnavailable(LibFunc::exp10l);
+    if (T.isMacOSXVersionLT(10, 9)) {
+      TLI.setUnavailable(LibFunc::exp10);
+      TLI.setUnavailable(LibFunc::exp10f);
+    } else {
+      TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
+      TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+    }
+    break;
+  case Triple::IOS:
+    TLI.setUnavailable(LibFunc::exp10l);
+    if (T.isOSVersionLT(7, 0)) {
+      TLI.setUnavailable(LibFunc::exp10);
+      TLI.setUnavailable(LibFunc::exp10f);
+    } else {
+      TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
+      TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+    }
+    break;
+  case Triple::Linux:
+    // exp10, exp10f, exp10l is available on Linux (GLIBC) but are extremely
+    // buggy prior to glibc version 2.18. Until this version is widely deployed
+    // or we have a reasonable detection strategy, we cannot use exp10 reliably
+    // on Linux.
+    //
+    // Fall through to disable all of them.
+  default:
+    TLI.setUnavailable(LibFunc::exp10);
+    TLI.setUnavailable(LibFunc::exp10f);
+    TLI.setUnavailable(LibFunc::exp10l);
+  }
+
+  // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
+  // Linux (GLIBC):
+  // http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html
+  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsl.c
+  // http://www.gnu.org/software/gnulib/manual/html_node/ffsl.html
+  switch (T.getOS()) {
+  case Triple::Darwin:
+  case Triple::MacOSX:
+  case Triple::IOS:
+  case Triple::FreeBSD:
+  case Triple::Linux:
+    break;
+  default:
+    TLI.setUnavailable(LibFunc::ffsl);
+  }
+
+  // ffsll is available on at least FreeBSD and Linux (GLIBC):
+  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsll.c
+  // http://www.gnu.org/software/gnulib/manual/html_node/ffsll.html
+  switch (T.getOS()) {
+  case Triple::FreeBSD:
+  case Triple::Linux:
+    break;
+  default:
+    TLI.setUnavailable(LibFunc::ffsll);
+  }
+
+  // The following functions are available on at least Linux:
+  if (!T.isOSLinux()) {
+    TLI.setUnavailable(LibFunc::dunder_strdup);
+    TLI.setUnavailable(LibFunc::dunder_strtok_r);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
+    TLI.setUnavailable(LibFunc::under_IO_getc);
+    TLI.setUnavailable(LibFunc::under_IO_putc);
+    TLI.setUnavailable(LibFunc::memalign);
+    TLI.setUnavailable(LibFunc::fopen64);
+    TLI.setUnavailable(LibFunc::fseeko64);
+    TLI.setUnavailable(LibFunc::fstat64);
+    TLI.setUnavailable(LibFunc::fstatvfs64);
+    TLI.setUnavailable(LibFunc::ftello64);
+    TLI.setUnavailable(LibFunc::lstat64);
+    TLI.setUnavailable(LibFunc::open64);
+    TLI.setUnavailable(LibFunc::stat64);
+    TLI.setUnavailable(LibFunc::statvfs64);
+    TLI.setUnavailable(LibFunc::tmpfile64);
+  }
+}
+
+
+TargetLibraryInfo::TargetLibraryInfo() : ImmutablePass(ID) {
+  // Default to everything being available.
+  memset(AvailableArray, -1, sizeof(AvailableArray));
+
+  initialize(*this, Triple(), StandardNames);
+}
+
+TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
+  // Default to everything being available.
+  memset(AvailableArray, -1, sizeof(AvailableArray));
+  
+  initialize(*this, T, StandardNames);
+}
+
+TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
+  : ImmutablePass(ID) {
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+  CustomNames = TLI.CustomNames;
+}
+
+namespace {
+struct StringComparator {
+  /// Compare two strings and return true if LHS is lexicographically less than
+  /// RHS. Requires that RHS doesn't contain any zero bytes.
+  bool operator()(const char *LHS, StringRef RHS) const {
+    // Compare prefixes with strncmp. If prefixes match we know that LHS is
+    // greater or equal to RHS as RHS can't contain any '\0'.
+    return std::strncmp(LHS, RHS.data(), RHS.size()) < 0;
+  }
+
+  // Provided for compatibility with MSVC's debug mode.
+  bool operator()(StringRef LHS, const char *RHS) const { return LHS < RHS; }
+  bool operator()(StringRef LHS, StringRef RHS) const { return LHS < RHS; }
+  bool operator()(const char *LHS, const char *RHS) const {
+    return std::strcmp(LHS, RHS) < 0;
+  }
+};
+}
+
+bool TargetLibraryInfo::getLibFunc(StringRef funcName,
+                                   LibFunc::Func &F) const {
+  const char **Start = &StandardNames[0];
+  const char **End = &StandardNames[LibFunc::NumLibFuncs];
+
+  // Filter out empty names and names containing null bytes, those can't be in
+  // our table.
+  if (funcName.empty() || funcName.find('\0') != StringRef::npos)
+    return false;
+
+  // Check for \01 prefix that is used to mangle __asm declarations and
+  // strip it if present.
+  if (funcName.front() == '\01')
+    funcName = funcName.substr(1);
+  const char **I = std::lower_bound(Start, End, funcName, StringComparator());
+  if (I != End && *I == funcName) {
+    F = (LibFunc::Func)(I - Start);
+    return true;
+  }
+  return false;
+}
+
+/// disableAllFunctions - This disables all builtins, which is used for options
+/// like -fno-builtin.
+void TargetLibraryInfo::disableAllFunctions() {
+  memset(AvailableArray, 0, sizeof(AvailableArray));
+}
diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
new file mode 100644
index 0000000..2569e92
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -0,0 +1,345 @@
+//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              Generic Code
+//===----------------------------------------------------------------------===//
+
+/// Initialize - this method must be called before any actual lowering is
+/// done.  This specifies the current context for codegen, and gives the
+/// lowering implementations a chance to set up their default sections.
+void TargetLoweringObjectFile::Initialize(MCContext &ctx,
+                                          const TargetMachine &TM) {
+  Ctx = &ctx;
+  DL = TM.getDataLayout();
+  InitMCObjectFileInfo(TM.getTargetTriple(),
+                       TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
+}
+
+TargetLoweringObjectFile::~TargetLoweringObjectFile() {
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
+  const Constant *C = GV->getInitializer();
+
+  // Must have zero initializer.
+  if (!C->isNullValue())
+    return false;
+
+  // Leave constant zeros in readonly constant sections, so they can be shared.
+  if (GV->isConstant())
+    return false;
+
+  // If the global has an explicit section specified, don't put it in BSS.
+  if (GV->hasSection())
+    return false;
+
+  // If -nozero-initialized-in-bss is specified, don't ever use BSS.
+  if (NoZerosInBSS)
+    return false;
+
+  // Otherwise, put it in BSS!
+  return true;
+}
+
+/// IsNullTerminatedString - Return true if the specified constant (which is
+/// known to have a type that is an array of 1/2/4 byte elements) ends with a
+/// nul value and contains no other nuls in it.  Note that this is more general
+/// than ConstantDataSequential::isString because we allow 2 & 4 byte strings.
+static bool IsNullTerminatedString(const Constant *C) {
+  // First check: is we have constant array terminated with zero
+  if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    unsigned NumElts = CDS->getNumElements();
+    assert(NumElts != 0 && "Can't have an empty CDS");
+    
+    if (CDS->getElementAsInteger(NumElts-1) != 0)
+      return false; // Not null terminated.
+    
+    // Verify that the null doesn't occur anywhere else in the string.
+    for (unsigned i = 0; i != NumElts-1; ++i)
+      if (CDS->getElementAsInteger(i) == 0)
+        return false;
+    return true;
+  }
+
+  // Another possibility: [1 x i8] zeroinitializer
+  if (isa<ConstantAggregateZero>(C))
+    return cast<ArrayType>(C->getType())->getNumElements() == 1;
+
+  return false;
+}
+
+MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase(
+    const GlobalValue *GV, StringRef Suffix, Mangler &Mang,
+    const TargetMachine &TM) const {
+  assert(!Suffix.empty());
+
+  SmallString<60> NameStr;
+  NameStr += DL->getPrivateGlobalPrefix();
+  TM.getNameWithPrefix(NameStr, GV, Mang);
+  NameStr.append(Suffix.begin(), Suffix.end());
+  return Ctx->GetOrCreateSymbol(NameStr.str());
+}
+
+MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV, Mang);
+}
+
+void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
+                                                    const TargetMachine &TM,
+                                                    const MCSymbol *Sym) const {
+}
+
+
+/// getKindForGlobal - This is a top-level target-independent classifier for
+/// a global variable.  Given an global variable and information from TM, it
+/// classifies the global in a variety of ways that make various target
+/// implementations simpler.  The target implementation is free to ignore this
+/// extra info of course.
+SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
+                                                       const TargetMachine &TM){
+  assert(!GV->isDeclaration() && !GV->hasAvailableExternallyLinkage() &&
+         "Can only be used for global definitions");
+
+  Reloc::Model ReloModel = TM.getRelocationModel();
+
+  // Early exit - functions should be always in text sections.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar)
+    return SectionKind::getText();
+
+  // Handle thread-local data first.
+  if (GVar->isThreadLocal()) {
+    if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS))
+      return SectionKind::getThreadBSS();
+    return SectionKind::getThreadData();
+  }
+
+  // Variables with common linkage always get classified as common.
+  if (GVar->hasCommonLinkage())
+    return SectionKind::getCommon();
+
+  // Variable can be easily put to BSS section.
+  if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) {
+    if (GVar->hasLocalLinkage())
+      return SectionKind::getBSSLocal();
+    else if (GVar->hasExternalLinkage())
+      return SectionKind::getBSSExtern();
+    return SectionKind::getBSS();
+  }
+
+  const Constant *C = GVar->getInitializer();
+
+  // If the global is marked constant, we can put it into a mergable section,
+  // a mergable string section, or general .data if it contains relocations.
+  if (GVar->isConstant()) {
+    // If the initializer for the global contains something that requires a
+    // relocation, then we may have to drop this into a writable data section
+    // even though it is marked const.
+    switch (C->getRelocationInfo()) {
+    case Constant::NoRelocation:
+      // If the global is required to have a unique address, it can't be put
+      // into a mergable section: just drop it into the general read-only
+      // section instead.
+      if (!GVar->hasUnnamedAddr())
+        return SectionKind::getReadOnly();
+        
+      // If initializer is a null-terminated string, put it in a "cstring"
+      // section of the right width.
+      if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+        if (IntegerType *ITy =
+              dyn_cast<IntegerType>(ATy->getElementType())) {
+          if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
+               ITy->getBitWidth() == 32) &&
+              IsNullTerminatedString(C)) {
+            if (ITy->getBitWidth() == 8)
+              return SectionKind::getMergeable1ByteCString();
+            if (ITy->getBitWidth() == 16)
+              return SectionKind::getMergeable2ByteCString();
+
+            assert(ITy->getBitWidth() == 32 && "Unknown width");
+            return SectionKind::getMergeable4ByteCString();
+          }
+        }
+      }
+
+      // Otherwise, just drop it into a mergable constant section.  If we have
+      // a section for this size, use it, otherwise use the arbitrary sized
+      // mergable section.
+      switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
+      case 4:  return SectionKind::getMergeableConst4();
+      case 8:  return SectionKind::getMergeableConst8();
+      case 16: return SectionKind::getMergeableConst16();
+      default: return SectionKind::getMergeableConst();
+      }
+
+    case Constant::LocalRelocation:
+      // In static relocation model, the linker will resolve all addresses, so
+      // the relocation entries will actually be constants by the time the app
+      // starts up.  However, we can't put this into a mergable section, because
+      // the linker doesn't take relocations into consideration when it tries to
+      // merge entries in the section.
+      if (ReloModel == Reloc::Static)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel.local section.
+      return SectionKind::getReadOnlyWithRelLocal();
+
+    case Constant::GlobalRelocations:
+      // In static relocation model, the linker will resolve all addresses, so
+      // the relocation entries will actually be constants by the time the app
+      // starts up.  However, we can't put this into a mergable section, because
+      // the linker doesn't take relocations into consideration when it tries to
+      // merge entries in the section.
+      if (ReloModel == Reloc::Static)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel section.
+      return SectionKind::getReadOnlyWithRel();
+    }
+  }
+
+  // Okay, this isn't a constant.  If the initializer for the global is going
+  // to require a runtime relocation by the dynamic linker, put it into a more
+  // specific section to improve startup time of the app.  This coalesces these
+  // globals together onto fewer pages, improving the locality of the dynamic
+  // linker.
+  if (ReloModel == Reloc::Static)
+    return SectionKind::getDataNoRel();
+
+  switch (C->getRelocationInfo()) {
+  case Constant::NoRelocation:
+    return SectionKind::getDataNoRel();
+  case Constant::LocalRelocation:
+    return SectionKind::getDataRelLocal();
+  case Constant::GlobalRelocations:
+    return SectionKind::getDataRel();
+  }
+  llvm_unreachable("Invalid relocation");
+}
+
+/// SectionForGlobal - This method computes the appropriate section to emit
+/// the specified global variable or function definition.  This should not
+/// be passed external (or available externally) globals.
+const MCSection *TargetLoweringObjectFile::
+SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+                 const TargetMachine &TM) const {
+  // Select section name.
+  if (GV->hasSection())
+    return getExplicitSectionGlobal(GV, Kind, Mang, TM);
+
+
+  // Use default section depending on the 'type' of global
+  return SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
+
+bool TargetLoweringObjectFile::isSectionAtomizableBySymbols(
+    const MCSection &Section) const {
+  return false;
+}
+
+// Lame default implementation. Calculate the section name for global.
+const MCSection *
+TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+                                                 SectionKind Kind,
+                                                 Mangler &Mang,
+                                                 const TargetMachine &TM) const{
+  assert(!Kind.isThreadLocal() && "Doesn't support TLS");
+
+  if (Kind.isText())
+    return getTextSection();
+
+  if (Kind.isBSS() && BSSSection != nullptr)
+    return BSSSection;
+
+  if (Kind.isReadOnly() && ReadOnlySection != nullptr)
+    return ReadOnlySection;
+
+  return getDataSection();
+}
+
+/// getSectionForConstant - Given a mergable constant with the
+/// specified size and relocation information, return a section that it
+/// should be placed in.
+const MCSection *
+TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind,
+                                                const Constant *C) const {
+  if (Kind.isReadOnly() && ReadOnlySection != nullptr)
+    return ReadOnlySection;
+
+  return DataSection;
+}
+
+/// getTTypeGlobalReference - Return an MCExpr to use for a
+/// reference to the specified global variable from exception
+/// handling information.
+const MCExpr *TargetLoweringObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+    const TargetMachine &TM, MachineModuleInfo *MMI,
+    MCStreamer &Streamer) const {
+  const MCSymbolRefExpr *Ref =
+      MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang), getContext());
+
+  return getTTypeReference(Ref, Encoding, Streamer);
+}
+
+const MCExpr *TargetLoweringObjectFile::
+getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
+                  MCStreamer &Streamer) const {
+  switch (Encoding & 0x70) {
+  default:
+    report_fatal_error("We do not support this DWARF encoding yet!");
+  case dwarf::DW_EH_PE_absptr:
+    // Do nothing special
+    return Sym;
+  case dwarf::DW_EH_PE_pcrel: {
+    // Emit a label to the streamer for the current position.  This gives us
+    // .-foo addressing.
+    MCSymbol *PCSym = getContext().CreateTempSymbol();
+    Streamer.EmitLabel(PCSym);
+    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
+    return MCBinaryExpr::CreateSub(Sym, PC, getContext());
+  }
+  }
+}
+
+const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  // FIXME: It's not clear what, if any, default this should have - perhaps a
+  // null return could mean 'no location' & we should just do that here.
+  return MCSymbolRefExpr::Create(Sym, *Ctx);
+}
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
new file mode 100644
index 0000000..95c8cb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -0,0 +1,198 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+TargetMachine::TargetMachine(const Target &T,
+                             StringRef TT, StringRef CPU, StringRef FS,
+                             const TargetOptions &Options)
+  : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS),
+    CodeGenInfo(nullptr), AsmInfo(nullptr),
+    RequireStructuredCFG(false),
+    Options(Options) {
+}
+
+TargetMachine::~TargetMachine() {
+  delete CodeGenInfo;
+  delete AsmInfo;
+}
+
+/// \brief Reset the target options based on the function's attributes.
+void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
+  const Function *F = MF->getFunction();
+  TargetOptions &TO = MF->getTarget().Options;
+
+#define RESET_OPTION(X, Y)                                              \
+  do {                                                                  \
+    if (F->hasFnAttribute(Y))                                           \
+      TO.X =                                                            \
+        (F->getAttributes().                                            \
+           getAttribute(AttributeSet::FunctionIndex,                    \
+                        Y).getValueAsString() == "true");               \
+  } while (0)
+
+  RESET_OPTION(NoFramePointerElim, "no-frame-pointer-elim");
+  RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
+  RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
+  RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
+  RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+  RESET_OPTION(UseSoftFloat, "use-soft-float");
+  RESET_OPTION(DisableTailCalls, "disable-tail-calls");
+
+  TO.MCOptions.SanitizeAddress = F->hasFnAttribute(Attribute::SanitizeAddress);
+}
+
+/// getRelocationModel - Returns the code generation relocation model. The
+/// choices are static, PIC, and dynamic-no-pic, and target default.
+Reloc::Model TargetMachine::getRelocationModel() const {
+  if (!CodeGenInfo)
+    return Reloc::Default;
+  return CodeGenInfo->getRelocationModel();
+}
+
+/// getCodeModel - Returns the code model. The choices are small, kernel,
+/// medium, large, and target default.
+CodeModel::Model TargetMachine::getCodeModel() const {
+  if (!CodeGenInfo)
+    return CodeModel::Default;
+  return CodeGenInfo->getCodeModel();
+}
+
+/// Get the IR-specified TLS model for Var.
+static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
+  switch (GV->getThreadLocalMode()) {
+  case GlobalVariable::NotThreadLocal:
+    llvm_unreachable("getSelectedTLSModel for non-TLS variable");
+    break;
+  case GlobalVariable::GeneralDynamicTLSModel:
+    return TLSModel::GeneralDynamic;
+  case GlobalVariable::LocalDynamicTLSModel:
+    return TLSModel::LocalDynamic;
+  case GlobalVariable::InitialExecTLSModel:
+    return TLSModel::InitialExec;
+  case GlobalVariable::LocalExecTLSModel:
+    return TLSModel::LocalExec;
+  }
+  llvm_unreachable("invalid TLS model");
+}
+
+TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
+  bool isLocal = GV->hasLocalLinkage();
+  bool isDeclaration = GV->isDeclaration();
+  bool isPIC = getRelocationModel() == Reloc::PIC_;
+  bool isPIE = Options.PositionIndependentExecutable;
+  // FIXME: what should we do for protected and internal visibility?
+  // For variables, is internal different from hidden?
+  bool isHidden = GV->hasHiddenVisibility();
+
+  TLSModel::Model Model;
+  if (isPIC && !isPIE) {
+    if (isLocal || isHidden)
+      Model = TLSModel::LocalDynamic;
+    else
+      Model = TLSModel::GeneralDynamic;
+  } else {
+    if (!isDeclaration || isHidden)
+      Model = TLSModel::LocalExec;
+    else
+      Model = TLSModel::InitialExec;
+  }
+
+  // If the user specified a more specific model, use that.
+  TLSModel::Model SelectedModel = getSelectedTLSModel(GV);
+  if (SelectedModel > Model)
+    return SelectedModel;
+
+  return Model;
+}
+
+/// getOptLevel - Returns the optimization level: None, Less,
+/// Default, or Aggressive.
+CodeGenOpt::Level TargetMachine::getOptLevel() const {
+  if (!CodeGenInfo)
+    return CodeGenOpt::Default;
+  return CodeGenInfo->getOptLevel();
+}
+
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
+  if (CodeGenInfo)
+    CodeGenInfo->setOptLevel(Level);
+}
+
+bool TargetMachine::getAsmVerbosityDefault() const {
+  return Options.MCOptions.AsmVerbose;
+}
+
+void TargetMachine::setAsmVerbosityDefault(bool V) {
+  Options.MCOptions.AsmVerbose = V;
+}
+
+bool TargetMachine::getFunctionSections() const {
+  return Options.FunctionSections;
+}
+
+bool TargetMachine::getDataSections() const {
+  return Options.DataSections;
+}
+
+void TargetMachine::setFunctionSections(bool V) {
+  Options.FunctionSections = V;
+}
+
+void TargetMachine::setDataSections(bool V) {
+  Options.DataSections = V;
+}
+
+void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
+                                      const GlobalValue *GV, Mangler &Mang,
+                                      bool MayAlwaysUsePrivate) const {
+  if (MayAlwaysUsePrivate || !GV->hasPrivateLinkage()) {
+    // Simple case: If GV is not private, it is not important to find out if
+    // private labels are legal in this case or not.
+    Mang.getNameWithPrefix(Name, GV, false);
+    return;
+  }
+  SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this);
+  const TargetLoweringObjectFile &TLOF =
+      getTargetLowering()->getObjFileLowering();
+  const MCSection *TheSection = TLOF.SectionForGlobal(GV, GVKind, Mang, *this);
+  bool CannotUsePrivateLabel = TLOF.isSectionAtomizableBySymbols(*TheSection);
+  Mang.getNameWithPrefix(Name, GV, CannotUsePrivateLabel);
+}
+
+MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const {
+  SmallString<60> NameStr;
+  getNameWithPrefix(NameStr, GV, Mang);
+  const TargetLoweringObjectFile &TLOF =
+      getTargetLowering()->getObjFileLowering();
+  return TLOF.getContext().GetOrCreateSymbol(NameStr.str());
+}
diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp
new file mode 100644
index 0000000..20923c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachineC.cpp
@@ -0,0 +1,258 @@
+//===-- TargetMachine.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM-C part of TargetMachine.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/TargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Target.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+inline TargetMachine *unwrap(LLVMTargetMachineRef P) {
+  return reinterpret_cast<TargetMachine*>(P);
+}
+inline Target *unwrap(LLVMTargetRef P) {
+  return reinterpret_cast<Target*>(P);
+}
+inline LLVMTargetMachineRef wrap(const TargetMachine *P) {
+  return
+    reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P));
+}
+inline LLVMTargetRef wrap(const Target * P) {
+  return reinterpret_cast<LLVMTargetRef>(const_cast<Target*>(P));
+}
+
+LLVMTargetRef LLVMGetFirstTarget() {
+  if(TargetRegistry::begin() == TargetRegistry::end()) {
+    return nullptr;
+  }
+
+  const Target* target = &*TargetRegistry::begin();
+  return wrap(target);
+}
+LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
+  return wrap(unwrap(T)->getNext());
+}
+
+LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
+  StringRef NameRef = Name;
+  for (TargetRegistry::iterator IT = TargetRegistry::begin(),
+                                IE = TargetRegistry::end(); IT != IE; ++IT) {
+    if (IT->getName() == NameRef)
+      return wrap(&*IT);
+  }
+  
+  return nullptr;
+}
+
+LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
+                                 char **ErrorMessage) {
+  std::string Error;
+  
+  *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+  
+  if (!*T) {
+    if (ErrorMessage)
+      *ErrorMessage = strdup(Error.c_str());
+
+    return 1;
+  }
+  
+  return 0;
+}
+
+const char * LLVMGetTargetName(LLVMTargetRef T) {
+  return unwrap(T)->getName();
+}
+
+const char * LLVMGetTargetDescription(LLVMTargetRef T) {
+  return unwrap(T)->getShortDescription();
+}
+
+LLVMBool LLVMTargetHasJIT(LLVMTargetRef T) {
+  return unwrap(T)->hasJIT();
+}
+
+LLVMBool LLVMTargetHasTargetMachine(LLVMTargetRef T) {
+  return unwrap(T)->hasTargetMachine();
+}
+
+LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
+  return unwrap(T)->hasMCAsmBackend();
+}
+
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+        const char* Triple, const char* CPU, const char* Features,
+        LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+        LLVMCodeModel CodeModel) {
+  Reloc::Model RM;
+  switch (Reloc){
+    case LLVMRelocStatic:
+      RM = Reloc::Static;
+      break;
+    case LLVMRelocPIC:
+      RM = Reloc::PIC_;
+      break;
+    case LLVMRelocDynamicNoPic:
+      RM = Reloc::DynamicNoPIC;
+      break;
+    default:
+      RM = Reloc::Default;
+      break;
+  }
+
+  CodeModel::Model CM = unwrap(CodeModel);
+
+  CodeGenOpt::Level OL;
+  switch (Level) {
+    case LLVMCodeGenLevelNone:
+      OL = CodeGenOpt::None;
+      break;
+    case LLVMCodeGenLevelLess:
+      OL = CodeGenOpt::Less;
+      break;
+    case LLVMCodeGenLevelAggressive:
+      OL = CodeGenOpt::Aggressive;
+      break;
+    default:
+      OL = CodeGenOpt::Default;
+      break;
+  }
+
+  TargetOptions opt;
+  return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM,
+    CM, OL));
+}
+
+
+void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) {
+  delete unwrap(T);
+}
+
+LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) {
+  const Target* target = &(unwrap(T)->getTarget());
+  return wrap(target);
+}
+
+char* LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetTriple();
+  return strdup(StringRep.c_str());
+}
+
+char* LLVMGetTargetMachineCPU(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetCPU();
+  return strdup(StringRep.c_str());
+}
+
+char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetFeatureString();
+  return strdup(StringRep.c_str());
+}
+
+LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
+  return wrap(unwrap(T)->getDataLayout());
+}
+
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+                                      LLVMBool VerboseAsm) {
+  unwrap(T)->setAsmVerbosityDefault(VerboseAsm);
+}
+
+static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
+  formatted_raw_ostream &OS, LLVMCodeGenFileType codegen, char **ErrorMessage) {
+  TargetMachine* TM = unwrap(T);
+  Module* Mod = unwrap(M);
+
+  PassManager pass;
+
+  std::string error;
+
+  const DataLayout* td = TM->getDataLayout();
+
+  if (!td) {
+    error = "No DataLayout in TargetMachine";
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+  Mod->setDataLayout(td);
+  pass.add(new DataLayoutPass(Mod));
+
+  TargetMachine::CodeGenFileType ft;
+  switch (codegen) {
+    case LLVMAssemblyFile:
+      ft = TargetMachine::CGFT_AssemblyFile;
+      break;
+    default:
+      ft = TargetMachine::CGFT_ObjectFile;
+      break;
+  }
+  if (TM->addPassesToEmitFile(pass, OS, ft)) {
+    error = "TargetMachine can't emit a file of this type";
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+
+  pass.run(*Mod);
+
+  OS.flush();
+  return false;
+}
+
+LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
+  char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
+  std::string error;
+  raw_fd_ostream dest(Filename, error, sys::fs::F_None);
+  if (!error.empty()) {
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+  formatted_raw_ostream destf(dest);
+  bool Result = LLVMTargetMachineEmit(T, M, destf, codegen, ErrorMessage);
+  dest.flush();
+  return Result;
+}
+
+LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
+  LLVMModuleRef M, LLVMCodeGenFileType codegen, char** ErrorMessage,
+  LLVMMemoryBufferRef *OutMemBuf) {
+  std::string CodeString;
+  raw_string_ostream OStream(CodeString);
+  formatted_raw_ostream Out(OStream);
+  bool Result = LLVMTargetMachineEmit(T, M, Out, codegen, ErrorMessage);
+  OStream.flush();
+
+  std::string &Data = OStream.str();
+  *OutMemBuf = LLVMCreateMemoryBufferWithMemoryRangeCopy(Data.c_str(),
+                                                     Data.length(), "");
+  return Result;
+}
+
+char *LLVMGetDefaultTargetTriple(void) {
+  return strdup(sys::getDefaultTargetTriple().c_str());
+}
+
+void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
+  unwrap(T)->addAnalysisPasses(*unwrap(PM));
+}
diff --git a/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp b/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp
new file mode 100644
index 0000000..386a813
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp
@@ -0,0 +1,61 @@
+//===-- TargetSubtargetInfo.cpp - General Target Information ---------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// TargetSubtargetInfo Class
+//
+TargetSubtargetInfo::TargetSubtargetInfo() {}
+
+TargetSubtargetInfo::~TargetSubtargetInfo() {}
+
+// Temporary option to compare overall performance change when moving from the
+// SD scheduler to the MachineScheduler pass pipeline. This is convenient for
+// benchmarking during the transition from SD to MI scheduling. Once armv7 makes
+// the switch, it should go away. The normal way to enable/disable the
+// MachineScheduling pass itself is by using -enable-misched. For targets that
+// already use MI sched (via MySubTarget::enableMachineScheduler())
+// -misched-bench=false negates the subtarget hook.
+static cl::opt<bool> BenchMachineSched("misched-bench", cl::Hidden,
+    cl::desc("Migrate from the target's default SD scheduler to MI scheduler"));
+
+bool TargetSubtargetInfo::useMachineScheduler() const {
+  if (BenchMachineSched.getNumOccurrences())
+    return BenchMachineSched;
+  return enableMachineScheduler();
+}
+
+bool TargetSubtargetInfo::enableAtomicExpandLoadLinked() const {
+  return true;
+}
+
+bool TargetSubtargetInfo::enableMachineScheduler() const {
+  return false;
+}
+
+bool TargetSubtargetInfo::enableRALocalReassignment(
+    CodeGenOpt::Level OptLevel) const {
+  return true;
+}
+
+bool TargetSubtargetInfo::enablePostMachineScheduler() const {
+  return getSchedModel()->PostRAScheduler;
+}
+
+bool TargetSubtargetInfo::useAA() const {
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
new file mode 100644
index 0000000..a365f62
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -0,0 +1,507 @@
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86Operand.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+namespace {
+
+static cl::opt<bool> ClAsanInstrumentAssembly(
+    "asan-instrument-assembly",
+    cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+    cl::init(false));
+
+bool IsStackReg(unsigned Reg) {
+  return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
+}
+
+std::string FuncName(unsigned AccessSize, bool IsWrite) {
+  return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
+         utostr(AccessSize);
+}
+
+class X86AddressSanitizer : public X86AsmInstrumentation {
+public:
+  X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {}
+  virtual ~X86AddressSanitizer() {}
+
+  // X86AsmInstrumentation implementation:
+  virtual void InstrumentInstruction(
+      const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+      const MCInstrInfo &MII, MCStreamer &Out) override {
+    InstrumentMOV(Inst, Operands, Ctx, MII, Out);
+  }
+
+  // Should be implemented differently in x86_32 and x86_64 subclasses.
+  virtual void InstrumentMemOperandSmallImpl(
+      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+      MCStreamer &Out) = 0;
+  virtual void InstrumentMemOperandLargeImpl(
+      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+      MCStreamer &Out) = 0;
+
+  void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize,
+                            bool IsWrite, MCContext &Ctx, MCStreamer &Out);
+  void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
+                     MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+  void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
+    Out.EmitInstruction(Inst, STI);
+  }
+
+  void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
+protected:
+  const MCSubtargetInfo &STI;
+};
+
+void X86AddressSanitizer::InstrumentMemOperand(
+    MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+    MCStreamer &Out) {
+  assert(Op.isMem() && "Op should be a memory operand.");
+  assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
+         "AccessSize should be a power of two, less or equal than 16.");
+
+  X86Operand &MemOp = static_cast<X86Operand &>(Op);
+  // FIXME: get rid of this limitation.
+  if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg()))
+    return;
+
+  // FIXME: take into account load/store alignment.
+  if (AccessSize < 8)
+    InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+  else
+    InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOV(
+    const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+    const MCInstrInfo &MII, MCStreamer &Out) {
+  // Access size in bytes.
+  unsigned AccessSize = 0;
+
+  switch (Inst.getOpcode()) {
+  case X86::MOV8mi:
+  case X86::MOV8mr:
+  case X86::MOV8rm:
+    AccessSize = 1;
+    break;
+  case X86::MOV16mi:
+  case X86::MOV16mr:
+  case X86::MOV16rm:
+    AccessSize = 2;
+    break;
+  case X86::MOV32mi:
+  case X86::MOV32mr:
+  case X86::MOV32rm:
+    AccessSize = 4;
+    break;
+  case X86::MOV64mi32:
+  case X86::MOV64mr:
+  case X86::MOV64rm:
+    AccessSize = 8;
+    break;
+  case X86::MOVAPDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDrm:
+  case X86::MOVAPSrm:
+    AccessSize = 16;
+    break;
+  default:
+    return;
+  }
+
+  const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+  for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
+    assert(Operands[Ix]);
+    MCParsedAsmOperand &Op = *Operands[Ix];
+    if (Op.isMem())
+      InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
+  }
+}
+
+class X86AddressSanitizer32 : public X86AddressSanitizer {
+public:
+  static const long kShadowOffset = 0x20000000;
+
+  X86AddressSanitizer32(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
+  virtual ~X86AddressSanitizer32() {}
+
+  virtual void InstrumentMemOperandSmallImpl(
+      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+      MCStreamer &Out) override;
+  virtual void InstrumentMemOperandLargeImpl(
+      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+      MCStreamer &Out) override;
+
+ private:
+  void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
+                          bool IsWrite, unsigned AddressReg) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP)
+                             .addReg(X86::ESP).addImm(-16));
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg));
+
+
+    const std::string& Fn = FuncName(AccessSize, IsWrite);
+    MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
+  }
+};
+
+void X86AddressSanitizer32::InstrumentMemOperandSmallImpl(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+    MCStreamer &Out) {
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA32r);
+    Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
+                           .addReg(X86::ECX).addImm(3));
+
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::MOV8rm);
+    Inst.addOperand(MCOperand::CreateReg(X86::CL));
+    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  EmitInstruction(Out,
+                  MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL));
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX));
+  EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX)
+                           .addReg(X86::EDX).addImm(7));
+
+  switch (AccessSize) {
+  case 1:
+    break;
+  case 2: {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA32r);
+    Inst.addOperand(MCOperand::CreateReg(X86::EDX));
+
+    const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+    break;
+  }
+  case 4:
+    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX)
+                             .addReg(X86::EDX).addImm(3));
+    break;
+  default:
+    assert(false && "Incorrect access size");
+    break;
+  }
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL));
+  EmitInstruction(
+      Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+  EmitLabel(Out, DoneSym);
+
+  EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+}
+
+void X86AddressSanitizer32::InstrumentMemOperandLargeImpl(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+    MCStreamer &Out) {
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA32r);
+    Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
+                           .addReg(X86::ECX).addImm(3));
+  {
+    MCInst Inst;
+    switch (AccessSize) {
+      case 8:
+        Inst.setOpcode(X86::CMP8mi);
+        break;
+      case 16:
+        Inst.setOpcode(X86::CMP16mi);
+        break;
+      default:
+        assert(false && "Incorrect access size");
+        break;
+    }
+    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    Inst.addOperand(MCOperand::CreateImm(0));
+    EmitInstruction(Out, Inst);
+  }
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+  EmitLabel(Out, DoneSym);
+
+  EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+}
+
+class X86AddressSanitizer64 : public X86AddressSanitizer {
+public:
+  static const long kShadowOffset = 0x7fff8000;
+
+  X86AddressSanitizer64(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
+  virtual ~X86AddressSanitizer64() {}
+
+  virtual void InstrumentMemOperandSmallImpl(
+      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+      MCStreamer &Out) override;
+  virtual void InstrumentMemOperandLargeImpl(
+      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+      MCStreamer &Out) override;
+
+private:
+  void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+    const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
+                          bool IsWrite) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP)
+                             .addReg(X86::RSP).addImm(-16));
+
+    const std::string& Fn = FuncName(AccessSize, IsWrite);
+    MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+  }
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandSmallImpl(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+    MCStreamer &Out) {
+  EmitAdjustRSP(Ctx, Out, -128);
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RDI));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
+                           .addReg(X86::RAX).addImm(3));
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::MOV8rm);
+    Inst.addOperand(MCOperand::CreateReg(X86::AL));
+    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  EmitInstruction(Out,
+                  MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL));
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI));
+  EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX)
+                           .addReg(X86::ECX).addImm(7));
+
+  switch (AccessSize) {
+  case 1:
+    break;
+  case 2: {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA32r);
+    Inst.addOperand(MCOperand::CreateReg(X86::ECX));
+
+    const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+    break;
+  }
+  case 4:
+    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX)
+                             .addReg(X86::ECX).addImm(3));
+    break;
+  default:
+    assert(false && "Incorrect access size");
+    break;
+  }
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL));
+  EmitInstruction(
+      Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+  EmitLabel(Out, DoneSym);
+
+  EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
+  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX));
+  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+  EmitAdjustRSP(Ctx, Out, 128);
+}
+
+void X86AddressSanitizer64::InstrumentMemOperandLargeImpl(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+    MCStreamer &Out) {
+  EmitAdjustRSP(Ctx, Out, -128);
+  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+  EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
+
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RAX));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
+                           .addReg(X86::RAX).addImm(3));
+  {
+    MCInst Inst;
+    switch (AccessSize) {
+    case 8:
+      Inst.setOpcode(X86::CMP8mi);
+      break;
+    case 16:
+      Inst.setOpcode(X86::CMP16mi);
+      break;
+    default:
+      assert(false && "Incorrect access size");
+      break;
+    }
+    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    Inst.addOperand(MCOperand::CreateImm(0));
+    EmitInstruction(Out, Inst);
+  }
+
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+  EmitLabel(Out, DoneSym);
+
+  EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+  EmitAdjustRSP(Ctx, Out, 128);
+}
+
+} // End anonymous namespace
+
+X86AsmInstrumentation::X86AsmInstrumentation() {}
+X86AsmInstrumentation::~X86AsmInstrumentation() {}
+
+void X86AsmInstrumentation::InstrumentInstruction(
+    const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+    const MCInstrInfo &MII, MCStreamer &Out) {}
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo &STI) {
+  Triple T(STI.getTargetTriple());
+  const bool hasCompilerRTSupport = T.isOSLinux();
+  if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+      MCOptions.SanitizeAddress) {
+    if ((STI.getFeatureBits() & X86::Mode32Bit) != 0)
+      return new X86AddressSanitizer32(STI);
+    if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
+      return new X86AddressSanitizer64(STI);
+  }
+  return new X86AsmInstrumentation();
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
new file mode 100644
index 0000000..1bc3c09
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -0,0 +1,54 @@
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ASM_INSTRUMENTATION_H
+#define X86_ASM_INSTRUMENTATION_H
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <memory>
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCInstrInfo;
+class MCParsedAsmOperand;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+
+class X86AsmInstrumentation;
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo &STI);
+
+class X86AsmInstrumentation {
+public:
+  virtual ~X86AsmInstrumentation();
+
+  // Instruments Inst. Should be called just before the original
+  // instruction is sent to Out.
+  virtual void InstrumentInstruction(
+      const MCInst &Inst,
+      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
+      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+  friend X86AsmInstrumentation *
+  CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                              const MCContext &Ctx, const MCSubtargetInfo &STI);
+
+  X86AsmInstrumentation();
+};
+
+} // End llvm namespace
+
+#endif // X86_ASM_INSTRUMENTATION_H
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 0000000..a11a238
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,2617 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86AsmParserCommon.h"
+#include "X86Operand.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+
+using namespace llvm;
+
+namespace {
+
+static const char OpPrecedence[] = {
+  0, // IC_OR
+  1, // IC_AND
+  2, // IC_LSHIFT
+  2, // IC_RSHIFT
+  3, // IC_PLUS
+  3, // IC_MINUS
+  4, // IC_MULTIPLY
+  4, // IC_DIVIDE
+  5, // IC_RPAREN
+  6, // IC_LPAREN
+  0, // IC_IMM
+  0  // IC_REGISTER
+};
+
+class X86AsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCInstrInfo &MII;
+  ParseInstructionInfo *InstInfo;
+  std::unique_ptr<X86AsmInstrumentation> Instrumentation;
+private:
+  SMLoc consumeToken() {
+    SMLoc Result = Parser.getTok().getLoc();
+    Parser.Lex();
+    return Result;
+  }
+
+  enum InfixCalculatorTok {
+    IC_OR = 0,
+    IC_AND,
+    IC_LSHIFT,
+    IC_RSHIFT,
+    IC_PLUS,
+    IC_MINUS,
+    IC_MULTIPLY,
+    IC_DIVIDE,
+    IC_RPAREN,
+    IC_LPAREN,
+    IC_IMM,
+    IC_REGISTER
+  };
+
+  class InfixCalculator {
+    typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+    SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+    SmallVector<ICToken, 4> PostfixStack;
+    
+  public:
+    int64_t popOperand() {
+      assert (!PostfixStack.empty() && "Poped an empty stack!");
+      ICToken Op = PostfixStack.pop_back_val();
+      assert ((Op.first == IC_IMM || Op.first == IC_REGISTER)
+              && "Expected and immediate or register!");
+      return Op.second;
+    }
+    void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+      assert ((Op == IC_IMM || Op == IC_REGISTER) &&
+              "Unexpected operand!");
+      PostfixStack.push_back(std::make_pair(Op, Val));
+    }
+    
+    void popOperator() { InfixOperatorStack.pop_back(); }
+    void pushOperator(InfixCalculatorTok Op) {
+      // Push the new operator if the stack is empty.
+      if (InfixOperatorStack.empty()) {
+        InfixOperatorStack.push_back(Op);
+        return;
+      }
+      
+      // Push the new operator if it has a higher precedence than the operator
+      // on the top of the stack or the operator on the top of the stack is a
+      // left parentheses.
+      unsigned Idx = InfixOperatorStack.size() - 1;
+      InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+      if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+        InfixOperatorStack.push_back(Op);
+        return;
+      }
+      
+      // The operator on the top of the stack has higher precedence than the
+      // new operator.
+      unsigned ParenCount = 0;
+      while (1) {
+        // Nothing to process.
+        if (InfixOperatorStack.empty())
+          break;
+        
+        Idx = InfixOperatorStack.size() - 1;
+        StackOp = InfixOperatorStack[Idx];
+        if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+          break;
+        
+        // If we have an even parentheses count and we see a left parentheses,
+        // then stop processing.
+        if (!ParenCount && StackOp == IC_LPAREN)
+          break;
+        
+        if (StackOp == IC_RPAREN) {
+          ++ParenCount;
+          InfixOperatorStack.pop_back();
+        } else if (StackOp == IC_LPAREN) {
+          --ParenCount;
+          InfixOperatorStack.pop_back();
+        } else {
+          InfixOperatorStack.pop_back();
+          PostfixStack.push_back(std::make_pair(StackOp, 0));
+        }
+      }
+      // Push the new operator.
+      InfixOperatorStack.push_back(Op);
+    }
+    int64_t execute() {
+      // Push any remaining operators onto the postfix stack.
+      while (!InfixOperatorStack.empty()) {
+        InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+        if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+          PostfixStack.push_back(std::make_pair(StackOp, 0));
+      }
+      
+      if (PostfixStack.empty())
+        return 0;
+      
+      SmallVector<ICToken, 16> OperandStack;
+      for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
+        ICToken Op = PostfixStack[i];
+        if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+          OperandStack.push_back(Op);
+        } else {
+          assert (OperandStack.size() > 1 && "Too few operands.");
+          int64_t Val;
+          ICToken Op2 = OperandStack.pop_back_val();
+          ICToken Op1 = OperandStack.pop_back_val();
+          switch (Op.first) {
+          default:
+            report_fatal_error("Unexpected operator!");
+            break;
+          case IC_PLUS:
+            Val = Op1.second + Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_MINUS:
+            Val = Op1.second - Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_MULTIPLY:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Multiply operation with an immediate and a register!");
+            Val = Op1.second * Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_DIVIDE:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Divide operation with an immediate and a register!");
+            assert (Op2.second != 0 && "Division by zero!");
+            Val = Op1.second / Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_OR:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Or operation with an immediate and a register!");
+            Val = Op1.second | Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_AND:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "And operation with an immediate and a register!");
+            Val = Op1.second & Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_LSHIFT:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Left shift operation with an immediate and a register!");
+            Val = Op1.second << Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_RSHIFT:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Right shift operation with an immediate and a register!");
+            Val = Op1.second >> Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          }
+        }
+      }
+      assert (OperandStack.size() == 1 && "Expected a single result.");
+      return OperandStack.pop_back_val().second;
+    }
+  };
+
+  enum IntelExprState {
+    IES_OR,
+    IES_AND,
+    IES_LSHIFT,
+    IES_RSHIFT,
+    IES_PLUS,
+    IES_MINUS,
+    IES_NOT,
+    IES_MULTIPLY,
+    IES_DIVIDE,
+    IES_LBRAC,
+    IES_RBRAC,
+    IES_LPAREN,
+    IES_RPAREN,
+    IES_REGISTER,
+    IES_INTEGER,
+    IES_IDENTIFIER,
+    IES_ERROR
+  };
+
+  class IntelExprStateMachine {
+    IntelExprState State, PrevState;
+    unsigned BaseReg, IndexReg, TmpReg, Scale;
+    int64_t Imm;
+    const MCExpr *Sym;
+    StringRef SymName;
+    bool StopOnLBrac, AddImmPrefix;
+    InfixCalculator IC;
+    InlineAsmIdentifierInfo Info;
+  public:
+    IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
+      State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
+      Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
+      AddImmPrefix(addimmprefix) { Info.clear(); }
+    
+    unsigned getBaseReg() { return BaseReg; }
+    unsigned getIndexReg() { return IndexReg; }
+    unsigned getScale() { return Scale; }
+    const MCExpr *getSym() { return Sym; }
+    StringRef getSymName() { return SymName; }
+    int64_t getImm() { return Imm + IC.execute(); }
+    bool isValidEndState() {
+      return State == IES_RBRAC || State == IES_INTEGER;
+    }
+    bool getStopOnLBrac() { return StopOnLBrac; }
+    bool getAddImmPrefix() { return AddImmPrefix; }
+    bool hadError() { return State == IES_ERROR; }
+
+    InlineAsmIdentifierInfo &getIdentifierInfo() {
+      return Info;
+    }
+
+    void onOr() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_OR;
+        IC.pushOperator(IC_OR);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onAnd() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_AND;
+        IC.pushOperator(IC_AND);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onLShift() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_LSHIFT;
+        IC.pushOperator(IC_LSHIFT);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onRShift() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_RSHIFT;
+        IC.pushOperator(IC_RSHIFT);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onPlus() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_PLUS;
+        IC.pushOperator(IC_PLUS);
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert (!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onMinus() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_NOT:
+      case IES_MULTIPLY:
+      case IES_DIVIDE:
+      case IES_LPAREN:
+      case IES_RPAREN:
+      case IES_LBRAC:
+      case IES_RBRAC:
+      case IES_INTEGER:
+      case IES_REGISTER:
+        State = IES_MINUS;
+        // Only push the minus operator if it is not a unary operator.
+        if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
+              CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
+              CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+          IC.pushOperator(IC_MINUS);
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert (!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onNot() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_NOT:
+        State = IES_NOT;
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onRegister(unsigned Reg) {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_LPAREN:
+        State = IES_REGISTER;
+        TmpReg = Reg;
+        IC.pushOperand(IC_REGISTER);
+        break;
+      case IES_MULTIPLY:
+        // Index Register - Scale * Register
+        if (PrevState == IES_INTEGER) {
+          assert (!IndexReg && "IndexReg already set!");
+          State = IES_REGISTER;
+          IndexReg = Reg;
+          // Get the scale and replace the 'Scale * Register' with '0'.
+          Scale = IC.popOperand();
+          IC.pushOperand(IC_IMM);
+          IC.popOperator();
+        } else {
+          State = IES_ERROR;
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+        State = IES_INTEGER;
+        Sym = SymRef;
+        SymName = SymRefName;
+        IC.pushOperand(IC_IMM);
+        break;
+      }
+    }
+    bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+      case IES_OR:
+      case IES_AND:
+      case IES_LSHIFT:
+      case IES_RSHIFT:
+      case IES_DIVIDE:
+      case IES_MULTIPLY:
+      case IES_LPAREN:
+        State = IES_INTEGER;
+        if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+          // Index Register - Register * Scale
+          assert (!IndexReg && "IndexReg already set!");
+          IndexReg = TmpReg;
+          Scale = TmpInt;
+          if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+            ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+            return true;
+          }
+          // Get the scale and replace the 'Register * Scale' with '0'.
+          IC.popOperator();
+        } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+                    PrevState == IES_OR || PrevState == IES_AND ||
+                    PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+                    PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+                    PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+                    PrevState == IES_NOT) &&
+                   CurrState == IES_MINUS) {
+          // Unary minus.  No need to pop the minus operand because it was never
+          // pushed.
+          IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+        } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+                    PrevState == IES_OR || PrevState == IES_AND ||
+                    PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+                    PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+                    PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+                    PrevState == IES_NOT) &&
+                   CurrState == IES_NOT) {
+          // Unary not.  No need to pop the not operand because it was never
+          // pushed.
+          IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
+        } else {
+          IC.pushOperand(IC_IMM, TmpInt);
+        }
+        break;
+      }
+      PrevState = CurrState;
+      return false;
+    }
+    void onStar() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_REGISTER:
+      case IES_RPAREN:
+        State = IES_MULTIPLY;
+        IC.pushOperator(IC_MULTIPLY);
+        break;
+      }
+    }
+    void onDivide() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+        State = IES_DIVIDE;
+        IC.pushOperator(IC_DIVIDE);
+        break;
+      }
+    }
+    void onLBrac() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_RBRAC:
+        State = IES_PLUS;
+        IC.pushOperator(IC_PLUS);
+        break;
+      }
+    }
+    void onRBrac() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_REGISTER:
+      case IES_RPAREN:
+        State = IES_RBRAC;
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert (!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onLParen() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+      case IES_OR:
+      case IES_AND:
+      case IES_LSHIFT:
+      case IES_RSHIFT:
+      case IES_MULTIPLY:
+      case IES_DIVIDE:
+      case IES_LPAREN:
+        // FIXME: We don't handle this type of unary minus or not, yet.
+        if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+            PrevState == IES_OR || PrevState == IES_AND ||
+            PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+            PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+            PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+            PrevState == IES_NOT) &&
+            (CurrState == IES_MINUS || CurrState == IES_NOT)) {
+          State = IES_ERROR;
+          break;
+        }
+        State = IES_LPAREN;
+        IC.pushOperator(IC_LPAREN);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onRParen() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_REGISTER:
+      case IES_RPAREN:
+        State = IES_RPAREN;
+        IC.pushOperator(IC_RPAREN);
+        break;
+      }
+    }
+  };
+
+  MCAsmParser &getParser() const { return Parser; }
+
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  bool Error(SMLoc L, const Twine &Msg,
+             ArrayRef<SMRange> Ranges = None,
+             bool MatchingInlineAsm = false) {
+    if (MatchingInlineAsm) return true;
+    return Parser.Error(L, Msg, Ranges);
+  }
+
+  bool ErrorAndEatStatement(SMLoc L, const Twine &Msg,
+          ArrayRef<SMRange> Ranges = None,
+          bool MatchingInlineAsm = false) {
+      Parser.eatToEndOfStatement();
+      return Error(L, Msg, Ranges, MatchingInlineAsm);
+  }
+
+  std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
+    Error(Loc, Msg);
+    return nullptr;
+  }
+
+  std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+  std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+  std::unique_ptr<X86Operand> ParseOperand();
+  std::unique_ptr<X86Operand> ParseATTOperand();
+  std::unique_ptr<X86Operand> ParseIntelOperand();
+  std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
+  bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+  std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+  std::unique_ptr<X86Operand>
+  ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+  std::unique_ptr<X86Operand>
+  ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size);
+  bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+  std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg,
+                                                       SMLoc Start,
+                                                       int64_t ImmDisp,
+                                                       unsigned Size);
+  bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+                            InlineAsmIdentifierInfo &Info,
+                            bool IsUnevaluatedOperand, SMLoc &End);
+
+  std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+  std::unique_ptr<X86Operand>
+  CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+                        unsigned IndexReg, unsigned Scale, SMLoc Start,
+                        SMLoc End, unsigned Size, StringRef Identifier,
+                        InlineAsmIdentifierInfo &Info);
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+  bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+
+  /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+  /// instrumentation around Inst.
+  void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  virtual bool OmitRegisterFromClobberLists(unsigned RegNo) override;
+
+  /// doSrcDstMatch - Returns true if operands are matching in their
+  /// word size (%si and %di, %esi and %edi, etc.). Order depends on
+  /// the parsing mode (Intel vs. AT&T).
+  bool doSrcDstMatch(X86Operand &Op1, X86Operand &Op2);
+
+  /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
+  /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
+  /// \return \c true if no parsing errors occurred, \c false otherwise.
+  bool HandleAVX512Operand(OperandVector &Operands,
+                           const MCParsedAsmOperand &Op);
+
+  bool is64BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+  }
+  bool is32BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode32Bit) != 0;
+  }
+  bool is16BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode16Bit) != 0;
+  }
+  void SwitchMode(uint64_t mode) {
+    uint64_t oldMode = STI.getFeatureBits() &
+        (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit);
+    unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(oldMode | mode));
+    setAvailableFeatures(FB);
+    assert(mode == (STI.getFeatureBits() &
+                    (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit)));
+  }
+
+  bool isParsingIntelSyntax() {
+    return getParser().getAssemblerDialect();
+  }
+
+  /// @name Auto-generated Matcher Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+  /// }
+
+public:
+  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+               const MCInstrInfo &mii,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii),
+        InstInfo(nullptr) {
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+    Instrumentation.reset(
+        CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
+  }
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
+                                    StringRef &ErrMsg) {
+  // If we have both a base register and an index register make sure they are
+  // both 64-bit or 32-bit registers.
+  // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+  if (BaseReg != 0 && IndexReg != 0) {
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+         X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
+        IndexReg != X86::RIZ) {
+      ErrMsg = "base register is 64-bit, but index register is not";
+      return true;
+    }
+    if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+         X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
+        IndexReg != X86::EIZ){
+      ErrMsg = "base register is 32-bit, but index register is not";
+      return true;
+    }
+    if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) {
+      if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+          X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) {
+        ErrMsg = "base register is 16-bit, but index register is not";
+        return true;
+      }
+      if (((BaseReg == X86::BX || BaseReg == X86::BP) &&
+           IndexReg != X86::SI && IndexReg != X86::DI) ||
+          ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+           IndexReg != X86::BX && IndexReg != X86::BP)) {
+        ErrMsg = "invalid 16-bit base/index register combination";
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2)
+{
+  // Return true and let a normal complaint about bogus operands happen.
+  if (!Op1.isMem() || !Op2.isMem())
+    return true;
+
+  // Actually these might be the other way round if Intel syntax is
+  // being used. It doesn't matter.
+  unsigned diReg = Op1.Mem.BaseReg;
+  unsigned siReg = Op2.Mem.BaseReg;
+
+  if (X86MCRegisterClasses[X86::GR16RegClassID].contains(siReg))
+    return X86MCRegisterClasses[X86::GR16RegClassID].contains(diReg);
+  if (X86MCRegisterClasses[X86::GR32RegClassID].contains(siReg))
+    return X86MCRegisterClasses[X86::GR32RegClassID].contains(diReg);
+  if (X86MCRegisterClasses[X86::GR64RegClassID].contains(siReg))
+    return X86MCRegisterClasses[X86::GR64RegClassID].contains(diReg);
+  // Again, return true and let another error happen.
+  return true;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  RegNo = 0;
+  const AsmToken &PercentTok = Parser.getTok();
+  StartLoc = PercentTok.getLoc();
+
+  // If we encounter a %, ignore it. This code handles registers with and
+  // without the prefix, unprefixed registers can occur in cfi directives.
+  if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
+    Parser.Lex(); // Eat percent token.
+
+  const AsmToken &Tok = Parser.getTok();
+  EndLoc = Tok.getEndLoc();
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    if (isParsingIntelSyntax()) return true;
+    return Error(StartLoc, "invalid register name",
+                 SMRange(StartLoc, EndLoc));
+  }
+
+  RegNo = MatchRegisterName(Tok.getString());
+
+  // If the match failed, try the register name as lowercase.
+  if (RegNo == 0)
+    RegNo = MatchRegisterName(Tok.getString().lower());
+
+  if (!is64BitMode()) {
+    // FIXME: This should be done using Requires<Not64BitMode> and
+    // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+    // checked.
+    // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+    // REX prefix.
+    if (RegNo == X86::RIZ ||
+        X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+        X86II::isX86_64NonExtLowByteReg(RegNo) ||
+        X86II::isX86_64ExtendedReg(RegNo))
+      return Error(StartLoc, "register %"
+                   + Tok.getString() + " is only available in 64-bit mode",
+                   SMRange(StartLoc, EndLoc));
+  }
+
+  // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+  if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
+    RegNo = X86::ST0;
+    Parser.Lex(); // Eat 'st'
+
+    // Check to see if we have '(4)' after %st.
+    if (getLexer().isNot(AsmToken::LParen))
+      return false;
+    // Lex the paren.
+    getParser().Lex();
+
+    const AsmToken &IntTok = Parser.getTok();
+    if (IntTok.isNot(AsmToken::Integer))
+      return Error(IntTok.getLoc(), "expected stack index");
+    switch (IntTok.getIntVal()) {
+    case 0: RegNo = X86::ST0; break;
+    case 1: RegNo = X86::ST1; break;
+    case 2: RegNo = X86::ST2; break;
+    case 3: RegNo = X86::ST3; break;
+    case 4: RegNo = X86::ST4; break;
+    case 5: RegNo = X86::ST5; break;
+    case 6: RegNo = X86::ST6; break;
+    case 7: RegNo = X86::ST7; break;
+    default: return Error(IntTok.getLoc(), "invalid stack index");
+    }
+
+    if (getParser().Lex().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat ')'
+    return false;
+  }
+
+  EndLoc = Parser.getTok().getEndLoc();
+
+  // If this is "db[0-7]", match it as an alias
+  // for dr[0-7].
+  if (RegNo == 0 && Tok.getString().size() == 3 &&
+      Tok.getString().startswith("db")) {
+    switch (Tok.getString()[2]) {
+    case '0': RegNo = X86::DR0; break;
+    case '1': RegNo = X86::DR1; break;
+    case '2': RegNo = X86::DR2; break;
+    case '3': RegNo = X86::DR3; break;
+    case '4': RegNo = X86::DR4; break;
+    case '5': RegNo = X86::DR5; break;
+    case '6': RegNo = X86::DR6; break;
+    case '7': RegNo = X86::DR7; break;
+    }
+
+    if (RegNo != 0) {
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat it.
+      return false;
+    }
+  }
+
+  if (RegNo == 0) {
+    if (isParsingIntelSyntax()) return true;
+    return Error(StartLoc, "invalid register name",
+                 SMRange(StartLoc, EndLoc));
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  return false;
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+  unsigned basereg =
+    is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
+  const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
+  return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg,
+                               /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+  unsigned basereg =
+    is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
+  const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
+  return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg,
+                               /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
+  if (isParsingIntelSyntax())
+    return ParseIntelOperand();
+  return ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Return intel memory operand size.
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+  unsigned Size = StringSwitch<unsigned>(OpStr)
+    .Cases("BYTE", "byte", 8)
+    .Cases("WORD", "word", 16)
+    .Cases("DWORD", "dword", 32)
+    .Cases("QWORD", "qword", 64)
+    .Cases("XWORD", "xword", 80)
+    .Cases("XMMWORD", "xmmword", 128)
+    .Cases("YMMWORD", "ymmword", 256)
+    .Cases("ZMMWORD", "zmmword", 512)
+    .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
+    .Default(0);
+  return Size;
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+    unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+    unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+    InlineAsmIdentifierInfo &Info) {
+  // If this is not a VarDecl then assume it is a FuncDecl or some other label
+  // reference.  We need an 'r' constraint here, so we need to create register
+  // operand to ensure proper matching.  Just pick a GPR based on the size of
+  // a pointer.
+  if (isa<MCSymbolRefExpr>(Disp) && !Info.IsVarDecl) {
+    unsigned RegNo =
+        is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
+    return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true,
+                                 SMLoc(), Identifier, Info.OpDecl);
+  }
+
+  // We either have a direct symbol reference, or an offset from a symbol.  The
+  // parser always puts the symbol on the LHS, so look there for size
+  // calculation purposes.
+  const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
+  bool IsSymRef =
+      isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
+  if (IsSymRef) {
+    if (!Size) {
+      Size = Info.Type * 8; // Size is in terms of bits in this context.
+      if (Size)
+        InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start,
+                                                    /*Len=*/0, Size));
+    }
+  }
+
+  // When parsing inline assembly we set the base register to a non-zero value
+  // if we don't know the actual value at this time.  This is necessary to
+  // get the matching correct in some cases.
+  BaseReg = BaseReg ? BaseReg : 1;
+  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+                               End, Size, Identifier, Info.OpDecl);
+}
+
+static void
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
+                           StringRef SymName, int64_t ImmDisp,
+                           int64_t FinalImmDisp, SMLoc &BracLoc,
+                           SMLoc &StartInBrac, SMLoc &End) {
+  // Remove the '[' and ']' from the IR string.
+  AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1));
+  AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1));
+
+  // If ImmDisp is non-zero, then we parsed a displacement before the
+  // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
+  // If ImmDisp doesn't match the displacement computed by the state machine
+  // then we have an additional displacement in the bracketed expression.
+  if (ImmDisp != FinalImmDisp) {
+    if (ImmDisp) {
+      // We have an immediate displacement before the bracketed expression.
+      // Adjust this to match the final immediate displacement.
+      bool Found = false;
+      for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
+             E = AsmRewrites->end(); I != E; ++I) {
+        if ((*I).Loc.getPointer() > BracLoc.getPointer())
+          continue;
+        if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) {
+          assert (!Found && "ImmDisp already rewritten.");
+          (*I).Kind = AOK_Imm;
+          (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer();
+          (*I).Val = FinalImmDisp;
+          Found = true;
+          break;
+        }
+      }
+      assert (Found && "Unable to rewrite ImmDisp.");
+      (void)Found;
+    } else {
+      // We have a symbolic and an immediate displacement, but no displacement
+      // before the bracketed expression.  Put the immediate displacement
+      // before the bracketed expression.
+      AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp));
+    }
+  }
+  // Remove all the ImmPrefix rewrites within the brackets.
+  for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
+         E = AsmRewrites->end(); I != E; ++I) {
+    if ((*I).Loc.getPointer() < StartInBrac.getPointer())
+      continue;
+    if ((*I).Kind == AOK_ImmPrefix)
+      (*I).Kind = AOK_Delete;
+  }
+  const char *SymLocPtr = SymName.data();
+  // Skip everything before the symbol.        
+  if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
+    assert(Len > 0 && "Expected a non-negative length.");
+    AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len));
+  }
+  // Skip everything after the symbol.
+  if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
+    SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
+    assert(Len > 0 && "Expected a non-negative length.");
+    AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len));
+  }
+}
+
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+  const AsmToken &Tok = Parser.getTok();
+
+  bool Done = false;
+  while (!Done) {
+    bool UpdateLocLex = true;
+
+    // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+    // identifier.  Don't try an parse it as a register.
+    if (Tok.getString().startswith("."))
+      break;
+    
+    // If we're parsing an immediate expression, we don't expect a '['.
+    if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
+      break;
+
+    AsmToken::TokenKind TK = getLexer().getKind();
+    switch (TK) {
+    default: {
+      if (SM.isValidEndState()) {
+        Done = true;
+        break;
+      }
+      return Error(Tok.getLoc(), "unknown token in expression");
+    }
+    case AsmToken::EndOfStatement: {
+      Done = true;
+      break;
+    }
+    case AsmToken::String:
+    case AsmToken::Identifier: {
+      // This could be a register or a symbolic displacement.
+      unsigned TmpReg;
+      const MCExpr *Val;
+      SMLoc IdentLoc = Tok.getLoc();
+      StringRef Identifier = Tok.getString();
+      if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
+        SM.onRegister(TmpReg);
+        UpdateLocLex = false;
+        break;
+      } else {
+        if (!isParsingInlineAsm()) {
+          if (getParser().parsePrimaryExpr(Val, End))
+            return Error(Tok.getLoc(), "Unexpected identifier!");
+        } else {
+          // This is a dot operator, not an adjacent identifier.
+          if (Identifier.find('.') != StringRef::npos) {
+            return false;
+          } else {
+            InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+            if (ParseIntelIdentifier(Val, Identifier, Info,
+                                     /*Unevaluated=*/false, End))
+              return true;
+          }
+        }
+        SM.onIdentifierExpr(Val, Identifier);
+        UpdateLocLex = false;
+        break;
+      }
+      return Error(Tok.getLoc(), "Unexpected identifier!");
+    }
+    case AsmToken::Integer: {
+      StringRef ErrMsg;
+      if (isParsingInlineAsm() && SM.getAddImmPrefix())
+        InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
+                                                    Tok.getLoc()));
+      // Look for 'b' or 'f' following an Integer as a directional label
+      SMLoc Loc = getTok().getLoc();
+      int64_t IntVal = getTok().getIntVal();
+      End = consumeToken();
+      UpdateLocLex = false;
+      if (getLexer().getKind() == AsmToken::Identifier) {
+        StringRef IDVal = getTok().getString();
+        if (IDVal == "f" || IDVal == "b") {
+          MCSymbol *Sym =
+              getContext().GetDirectionalLocalSymbol(IntVal, IDVal == "b");
+          MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+          const MCExpr *Val = 
+	    MCSymbolRefExpr::Create(Sym, Variant, getContext());
+          if (IDVal == "b" && Sym->isUndefined())
+            return Error(Loc, "invalid reference to undefined symbol");
+          StringRef Identifier = Sym->getName();
+          SM.onIdentifierExpr(Val, Identifier);
+          End = consumeToken();
+        } else {
+          if (SM.onInteger(IntVal, ErrMsg))
+            return Error(Loc, ErrMsg);
+        }
+      } else {
+        if (SM.onInteger(IntVal, ErrMsg))
+          return Error(Loc, ErrMsg);
+      }
+      break;
+    }
+    case AsmToken::Plus:    SM.onPlus(); break;
+    case AsmToken::Minus:   SM.onMinus(); break;
+    case AsmToken::Tilde:   SM.onNot(); break;
+    case AsmToken::Star:    SM.onStar(); break;
+    case AsmToken::Slash:   SM.onDivide(); break;
+    case AsmToken::Pipe:    SM.onOr(); break;
+    case AsmToken::Amp:     SM.onAnd(); break;
+    case AsmToken::LessLess:
+                            SM.onLShift(); break;
+    case AsmToken::GreaterGreater:
+                            SM.onRShift(); break;
+    case AsmToken::LBrac:   SM.onLBrac(); break;
+    case AsmToken::RBrac:   SM.onRBrac(); break;
+    case AsmToken::LParen:  SM.onLParen(); break;
+    case AsmToken::RParen:  SM.onRParen(); break;
+    }
+    if (SM.hadError())
+      return Error(Tok.getLoc(), "unknown token in expression");
+
+    if (!Done && UpdateLocLex)
+      End = consumeToken();
+  }
+  return false;
+}
+
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+                                       int64_t ImmDisp, unsigned Size) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
+  if (getLexer().isNot(AsmToken::LBrac))
+    return ErrorOperand(BracLoc, "Expected '[' token!");
+  Parser.Lex(); // Eat '['
+
+  SMLoc StartInBrac = Tok.getLoc();
+  // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ].  We
+  // may have already parsed an immediate displacement before the bracketed
+  // expression.
+  IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
+  if (ParseIntelExpression(SM, End))
+    return nullptr;
+
+  const MCExpr *Disp = nullptr;
+  if (const MCExpr *Sym = SM.getSym()) {
+    // A symbolic displacement.
+    Disp = Sym;
+    if (isParsingInlineAsm())
+      RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(),
+                                 ImmDisp, SM.getImm(), BracLoc, StartInBrac,
+                                 End);
+  }
+
+  if (SM.getImm() || !Disp) {
+    const MCExpr *Imm = MCConstantExpr::Create(SM.getImm(), getContext());
+    if (Disp)
+      Disp = MCBinaryExpr::CreateAdd(Disp, Imm, getContext());
+    else
+      Disp = Imm;  // An immediate displacement only.
+  }
+
+  // Parse struct field access.  Intel requires a dot, but MSVC doesn't.  MSVC
+  // will in fact do global lookup the field name inside all global typedefs,
+  // but we don't emulate that.
+  if (Tok.getString().find('.') != StringRef::npos) {
+    const MCExpr *NewDisp;
+    if (ParseIntelDotOperator(Disp, NewDisp))
+      return nullptr;
+    
+    End = Tok.getEndLoc();
+    Parser.Lex();  // Eat the field.
+    Disp = NewDisp;
+  }
+
+  int BaseReg = SM.getBaseReg();
+  int IndexReg = SM.getIndexReg();
+  int Scale = SM.getScale();
+  if (!isParsingInlineAsm()) {
+    // handle [-42]
+    if (!BaseReg && !IndexReg) {
+      if (!SegReg)
+        return X86Operand::CreateMem(Disp, Start, End, Size);
+      else
+        return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+    }
+    StringRef ErrMsg;
+    if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+      Error(StartInBrac, ErrMsg);
+      return nullptr;
+    }
+    return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+                                 End, Size);
+  }
+
+  InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+  return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+                               End, Size, SM.getSymName(), Info);
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+                                        StringRef &Identifier,
+                                        InlineAsmIdentifierInfo &Info,
+                                        bool IsUnevaluatedOperand, SMLoc &End) {
+  assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+  Val = nullptr;
+
+  StringRef LineBuf(Identifier.data());
+  SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+  const AsmToken &Tok = Parser.getTok();
+
+  // Advance the token stream until the end of the current token is
+  // after the end of what the frontend claimed.
+  const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+  while (true) {
+    End = Tok.getEndLoc();
+    getLexer().Lex();
+
+    assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?");
+    if (End.getPointer() == EndPtr) break;
+  }
+
+  // Create the symbol reference.
+  Identifier = LineBuf;
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
+  return false;
+}
+
+/// \brief Parse intel style segment override.
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
+                                        unsigned Size) {
+  assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+  const AsmToken &Tok = Parser.getTok(); // Eat colon.
+  if (Tok.isNot(AsmToken::Colon))
+    return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+  Parser.Lex(); // Eat ':'
+
+  int64_t ImmDisp = 0;
+  if (getLexer().is(AsmToken::Integer)) {
+    ImmDisp = Tok.getIntVal();
+    AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
+
+    if (isParsingInlineAsm())
+      InstInfo->AsmRewrites->push_back(
+          AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc()));
+
+    if (getLexer().isNot(AsmToken::LBrac)) {
+      // An immediate following a 'segment register', 'colon' token sequence can
+      // be followed by a bracketed expression.  If it isn't we know we have our
+      // final segment override.
+      const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext());
+      return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0,
+                                   /*Scale=*/1, Start, ImmDispToken.getEndLoc(),
+                                   Size);
+    }
+  }
+
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+
+  const MCExpr *Val;
+  SMLoc End;
+  if (!isParsingInlineAsm()) {
+    if (getParser().parsePrimaryExpr(Val, End))
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+    return X86Operand::CreateMem(Val, Start, End, Size);
+  }
+
+  InlineAsmIdentifierInfo Info;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return nullptr;
+  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+                               /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+/// ParseIntelMemOperand - Parse intel style memory operand.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
+                                                               SMLoc Start,
+                                                               unsigned Size) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc End;
+
+  // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
+  assert(ImmDisp == 0);
+
+  const MCExpr *Val;
+  if (!isParsingInlineAsm()) {
+    if (getParser().parsePrimaryExpr(Val, End))
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+    return X86Operand::CreateMem(Val, Start, End, Size);
+  }
+
+  InlineAsmIdentifierInfo Info;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return nullptr;
+
+  if (!getLexer().is(AsmToken::LBrac))
+    return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
+                                 /*Scale=*/1, Start, End, Size, Identifier, Info);
+
+  Parser.Lex(); // Eat '['
+
+  // Parse Identifier [ ImmDisp ]
+  IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
+                           /*AddImmPrefix=*/false);
+  if (ParseIntelExpression(SM, End))
+    return nullptr;
+
+  if (SM.getSym()) {
+    Error(Start, "cannot use more than one symbol in memory operand");
+    return nullptr;
+  }
+  if (SM.getBaseReg()) {
+    Error(Start, "cannot use base register with variable reference");
+    return nullptr;
+  }
+  if (SM.getIndexReg()) {
+    Error(Start, "cannot use index register with variable reference");
+    return nullptr;
+  }
+
+  const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext());
+  // BaseReg is non-zero to avoid assertions.  In the context of inline asm,
+  // we're pointing to a local variable in memory, so the base register is
+  // really the frame or stack pointer.
+  return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/1, /*IndexReg=*/0,
+                               /*Scale=*/1, Start, End, Size, Identifier,
+                               Info.OpDecl);
+}
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+                                                const MCExpr *&NewDisp) {
+  const AsmToken &Tok = Parser.getTok();
+  int64_t OrigDispVal, DotDispVal;
+
+  // FIXME: Handle non-constant expressions.
+  if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
+    OrigDispVal = OrigDisp->getValue();
+  else
+    return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
+
+  // Drop the optional '.'.
+  StringRef DotDispStr = Tok.getString();
+  if (DotDispStr.startswith("."))
+    DotDispStr = DotDispStr.drop_front(1);
+
+  // .Imm gets lexed as a real.
+  if (Tok.is(AsmToken::Real)) {
+    APInt DotDisp;
+    DotDispStr.getAsInteger(10, DotDisp);
+    DotDispVal = DotDisp.getZExtValue();
+  } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+    unsigned DotDisp;
+    std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+    if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+                                           DotDisp))
+      return Error(Tok.getLoc(), "Unable to lookup field reference!");
+    DotDispVal = DotDisp;
+  } else
+    return Error(Tok.getLoc(), "Unexpected token type!");
+
+  if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+    SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+    unsigned Len = DotDispStr.size();
+    unsigned Val = OrigDispVal + DotDispVal;
+    InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len,
+                                                Val));
+  }
+
+  NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+  return false;
+}
+
+/// Parse the 'offset' operator.  This operator is used to specify the
+/// location rather then the content of a variable.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc OffsetOfLoc = Tok.getLoc();
+  Parser.Lex(); // Eat offset.
+
+  const MCExpr *Val;
+  InlineAsmIdentifierInfo Info;
+  SMLoc Start = Tok.getLoc(), End;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return nullptr;
+
+  // Don't emit the offset operator.
+  InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
+
+  // The offset operator will have an 'r' constraint, thus we need to create
+  // register operand to ensure proper matching.  Just pick a GPR based on
+  // the size of a pointer.
+  unsigned RegNo =
+      is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
+  return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+                               OffsetOfLoc, Identifier, Info.OpDecl);
+}
+
+enum IntelOperatorKind {
+  IOK_LENGTH,
+  IOK_SIZE,
+  IOK_TYPE
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.  The LENGTH operator
+/// returns the number of elements in an array.  It returns the value 1 for
+/// non-array variables.  The SIZE operator returns the size of a C or C++
+/// variable.  A variable's size is the product of its LENGTH and TYPE.  The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc TypeLoc = Tok.getLoc();
+  Parser.Lex(); // Eat operator.
+
+  const MCExpr *Val = nullptr;
+  InlineAsmIdentifierInfo Info;
+  SMLoc Start = Tok.getLoc(), End;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/true, End))
+    return nullptr;
+
+  if (!Info.OpDecl)
+    return ErrorOperand(Start, "unable to lookup expression");
+
+  unsigned CVal = 0;
+  switch(OpKind) {
+  default: llvm_unreachable("Unexpected operand kind!");
+  case IOK_LENGTH: CVal = Info.Length; break;
+  case IOK_SIZE: CVal = Info.Size; break;
+  case IOK_TYPE: CVal = Info.Type; break;
+  }
+
+  // Rewrite the type operator and the C or C++ type or variable in terms of an
+  // immediate.  E.g. TYPE foo -> $$4
+  unsigned Len = End.getPointer() - TypeLoc.getPointer();
+  InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
+
+  const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext());
+  return X86Operand::CreateImm(Imm, Start, End);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Start, End;
+
+  // Offset, length, type and size operators.
+  if (isParsingInlineAsm()) {
+    StringRef AsmTokStr = Tok.getString();
+    if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+      return ParseIntelOffsetOfOperator();
+    if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+      return ParseIntelOperator(IOK_LENGTH);
+    if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+      return ParseIntelOperator(IOK_SIZE);
+    if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+      return ParseIntelOperator(IOK_TYPE);
+  }
+
+  unsigned Size = getIntelMemOperandSize(Tok.getString());
+  if (Size) {
+    Parser.Lex(); // Eat operand size (e.g., byte, word).
+    if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+      return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+    Parser.Lex(); // Eat ptr.
+  }
+  Start = Tok.getLoc();
+
+  // Immediate.
+  if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
+      getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) {
+    AsmToken StartTok = Tok;
+    IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+                             /*AddImmPrefix=*/false);
+    if (ParseIntelExpression(SM, End))
+      return nullptr;
+
+    int64_t Imm = SM.getImm();
+    if (isParsingInlineAsm()) {
+      unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+      if (StartTok.getString().size() == Len)
+        // Just add a prefix if this wasn't a complex immediate expression.
+        InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start));
+      else
+        // Otherwise, rewrite the complex expression as a single immediate.
+        InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm));
+    }
+
+    if (getLexer().isNot(AsmToken::LBrac)) {
+      // If a directional label (ie. 1f or 2b) was parsed above from
+      // ParseIntelExpression() then SM.getSym() was set to a pointer to
+      // to the MCExpr with the directional local symbol and this is a
+      // memory operand not an immediate operand.
+      if (SM.getSym())
+        return X86Operand::CreateMem(SM.getSym(), Start, End, Size);
+
+      const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext());
+      return X86Operand::CreateImm(ImmExpr, Start, End);
+    }
+
+    // Only positive immediates are valid.
+    if (Imm < 0)
+      return ErrorOperand(Start, "expected a positive immediate displacement "
+                          "before bracketed expr.");
+
+    // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+    return ParseIntelMemOperand(Imm, Start, Size);
+  }
+
+  // Register.
+  unsigned RegNo = 0;
+  if (!ParseRegister(RegNo, Start, End)) {
+    // If this is a segment register followed by a ':', then this is the start
+    // of a segment override, otherwise this is a normal register reference.
+    if (getLexer().isNot(AsmToken::Colon))
+      return X86Operand::CreateReg(RegNo, Start, End);
+
+    return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
+  }
+
+  // Memory operand.
+  return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
+  switch (getLexer().getKind()) {
+  default:
+    // Parse a memory operand with no segment register.
+    return ParseMemOperand(0, Parser.getTok().getLoc());
+  case AsmToken::Percent: {
+    // Read the register.
+    unsigned RegNo;
+    SMLoc Start, End;
+    if (ParseRegister(RegNo, Start, End)) return nullptr;
+    if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
+      Error(Start, "%eiz and %riz can only be used as index registers",
+            SMRange(Start, End));
+      return nullptr;
+    }
+
+    // If this is a segment register followed by a ':', then this is the start
+    // of a memory reference, otherwise this is a normal register reference.
+    if (getLexer().isNot(AsmToken::Colon))
+      return X86Operand::CreateReg(RegNo, Start, End);
+
+    getParser().Lex(); // Eat the colon.
+    return ParseMemOperand(RegNo, Start);
+  }
+  case AsmToken::Dollar: {
+    // $42 -> immediate.
+    SMLoc Start = Parser.getTok().getLoc(), End;
+    Parser.Lex();
+    const MCExpr *Val;
+    if (getParser().parseExpression(Val, End))
+      return nullptr;
+    return X86Operand::CreateImm(Val, Start, End);
+  }
+  }
+}
+
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
+                                       const MCParsedAsmOperand &Op) {
+  if(STI.getFeatureBits() & X86::FeatureAVX512) {
+    if (getLexer().is(AsmToken::LCurly)) {
+      // Eat "{" and mark the current place.
+      const SMLoc consumedToken = consumeToken();
+      // Distinguish {1to<NUM>} from {%k<NUM>}.
+      if(getLexer().is(AsmToken::Integer)) {
+        // Parse memory broadcasting ({1to<NUM>}).
+        if (getLexer().getTok().getIntVal() != 1)
+          return !ErrorAndEatStatement(getLexer().getLoc(),
+                                       "Expected 1to<NUM> at this point");
+        Parser.Lex();  // Eat "1" of 1to8
+        if (!getLexer().is(AsmToken::Identifier) ||
+            !getLexer().getTok().getIdentifier().startswith("to"))
+          return !ErrorAndEatStatement(getLexer().getLoc(),
+                                       "Expected 1to<NUM> at this point");
+        // Recognize only reasonable suffixes.
+        const char *BroadcastPrimitive =
+          StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+            .Case("to2",  "{1to2}")
+            .Case("to4",  "{1to4}")
+            .Case("to8",  "{1to8}")
+            .Case("to16", "{1to16}")
+            .Default(nullptr);
+        if (!BroadcastPrimitive)
+          return !ErrorAndEatStatement(getLexer().getLoc(),
+                                       "Invalid memory broadcast primitive.");
+        Parser.Lex();  // Eat "toN" of 1toN
+        if (!getLexer().is(AsmToken::RCurly))
+          return !ErrorAndEatStatement(getLexer().getLoc(),
+                                       "Expected } at this point");
+        Parser.Lex();  // Eat "}"
+        Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+                                                   consumedToken));
+        // No AVX512 specific primitives can pass
+        // after memory broadcasting, so return.
+        return true;
+      } else {
+        // Parse mask register {%k1}
+        Operands.push_back(X86Operand::CreateToken("{", consumedToken));
+        if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+          Operands.push_back(std::move(Op));
+          if (!getLexer().is(AsmToken::RCurly))
+            return !ErrorAndEatStatement(getLexer().getLoc(),
+                                         "Expected } at this point");
+          Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+
+          // Parse "zeroing non-masked" semantic {z}
+          if (getLexer().is(AsmToken::LCurly)) {
+            Operands.push_back(X86Operand::CreateToken("{z}", consumeToken()));
+            if (!getLexer().is(AsmToken::Identifier) ||
+                getLexer().getTok().getIdentifier() != "z")
+              return !ErrorAndEatStatement(getLexer().getLoc(),
+                                           "Expected z at this point");
+            Parser.Lex();  // Eat the z
+            if (!getLexer().is(AsmToken::RCurly))
+              return !ErrorAndEatStatement(getLexer().getLoc(),
+                                           "Expected } at this point");
+            Parser.Lex();  // Eat the }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale).  The '%ds:' prefix
+/// has already been parsed if present.
+std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
+                                                          SMLoc MemStart) {
+
+  // We have to disambiguate a parenthesized expression "(4+5)" from the start
+  // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)".  The
+  // only way to do this without lookahead is to eat the '(' and see what is
+  // after it.
+  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+  if (getLexer().isNot(AsmToken::LParen)) {
+    SMLoc ExprEnd;
+    if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
+
+    // After parsing the base expression we could either have a parenthesized
+    // memory address or not.  If not, return now.  If so, eat the (.
+    if (getLexer().isNot(AsmToken::LParen)) {
+      // Unless we have a segment register, treat this as an immediate.
+      if (SegReg == 0)
+        return X86Operand::CreateMem(Disp, MemStart, ExprEnd);
+      return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+    }
+
+    // Eat the '('.
+    Parser.Lex();
+  } else {
+    // Okay, we have a '('.  We don't know if this is an expression or not, but
+    // so we have to eat the ( to see beyond it.
+    SMLoc LParenLoc = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the '('.
+
+    if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+      // Nothing to do here, fall into the code below with the '(' part of the
+      // memory operand consumed.
+    } else {
+      SMLoc ExprEnd;
+
+      // It must be an parenthesized expression, parse it now.
+      if (getParser().parseParenExpression(Disp, ExprEnd))
+        return nullptr;
+
+      // After parsing the base expression we could either have a parenthesized
+      // memory address or not.  If not, return now.  If so, eat the (.
+      if (getLexer().isNot(AsmToken::LParen)) {
+        // Unless we have a segment register, treat this as an immediate.
+        if (SegReg == 0)
+          return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd);
+        return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+      }
+
+      // Eat the '('.
+      Parser.Lex();
+    }
+  }
+
+  // If we reached here, then we just ate the ( of the memory operand.  Process
+  // the rest of the memory operand.
+  unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+  SMLoc IndexLoc, BaseLoc;
+
+  if (getLexer().is(AsmToken::Percent)) {
+    SMLoc StartLoc, EndLoc;
+    BaseLoc = Parser.getTok().getLoc();
+    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
+    if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
+      Error(StartLoc, "eiz and riz can only be used as index registers",
+            SMRange(StartLoc, EndLoc));
+      return nullptr;
+    }
+  }
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+    IndexLoc = Parser.getTok().getLoc();
+
+    // Following the comma we should have either an index register, or a scale
+    // value. We don't support the later form, but we want to parse it
+    // correctly.
+    //
+    // Not that even though it would be completely consistent to support syntax
+    // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+    if (getLexer().is(AsmToken::Percent)) {
+      SMLoc L;
+      if (ParseRegister(IndexReg, L, L)) return nullptr;
+
+      if (getLexer().isNot(AsmToken::RParen)) {
+        // Parse the scale amount:
+        //  ::= ',' [scale-expression]
+        if (getLexer().isNot(AsmToken::Comma)) {
+          Error(Parser.getTok().getLoc(),
+                "expected comma in scale expression");
+          return nullptr;
+        }
+        Parser.Lex(); // Eat the comma.
+
+        if (getLexer().isNot(AsmToken::RParen)) {
+          SMLoc Loc = Parser.getTok().getLoc();
+
+          int64_t ScaleVal;
+          if (getParser().parseAbsoluteExpression(ScaleVal)){
+            Error(Loc, "expected scale expression");
+            return nullptr;
+          }
+
+          // Validate the scale amount.
+	  if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+              ScaleVal != 1) {
+            Error(Loc, "scale factor in 16-bit address must be 1");
+            return nullptr;
+	  }
+          if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
+            Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+            return nullptr;
+          }
+          Scale = (unsigned)ScaleVal;
+        }
+      }
+    } else if (getLexer().isNot(AsmToken::RParen)) {
+      // A scale amount without an index is ignored.
+      // index.
+      SMLoc Loc = Parser.getTok().getLoc();
+
+      int64_t Value;
+      if (getParser().parseAbsoluteExpression(Value))
+        return nullptr;
+
+      if (Value != 1)
+        Warning(Loc, "scale factor without index register is ignored");
+      Scale = 1;
+    }
+  }
+
+  // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+  if (getLexer().isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+    return nullptr;
+  }
+  SMLoc MemEnd = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the ')'.
+
+  // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+  // and then only in non-64-bit modes. Except for DX, which is a special case
+  // because an unofficial form of in/out instructions uses it.
+  if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+      (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
+                         BaseReg != X86::SI && BaseReg != X86::DI)) &&
+      BaseReg != X86::DX) {
+    Error(BaseLoc, "invalid 16-bit base register");
+    return nullptr;
+  }
+  if (BaseReg == 0 &&
+      X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+    Error(IndexLoc, "16-bit memory operand may not include only index register");
+    return nullptr;
+  }
+
+  StringRef ErrMsg;
+  if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+    Error(BaseLoc, ErrMsg);
+    return nullptr;
+  }
+
+  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+                               MemStart, MemEnd);
+}
+
+bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  InstInfo = &Info;
+  StringRef PatchedName = Name;
+
+  // FIXME: Hack to recognize setneb as setne.
+  if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+      PatchedName != "setb" && PatchedName != "setnb")
+    PatchedName = PatchedName.substr(0, Name.size()-1);
+
+  // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+  const MCExpr *ExtraImmOp = nullptr;
+  if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+      (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+       PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+    bool IsVCMP = PatchedName[0] == 'v';
+    unsigned SSECCIdx = IsVCMP ? 4 : 3;
+    unsigned SSEComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(SSECCIdx, PatchedName.size() - 2))
+      .Case("eq",       0x00)
+      .Case("lt",       0x01)
+      .Case("le",       0x02)
+      .Case("unord",    0x03)
+      .Case("neq",      0x04)
+      .Case("nlt",      0x05)
+      .Case("nle",      0x06)
+      .Case("ord",      0x07)
+      /* AVX only from here */
+      .Case("eq_uq",    0x08)
+      .Case("nge",      0x09)
+      .Case("ngt",      0x0A)
+      .Case("false",    0x0B)
+      .Case("neq_oq",   0x0C)
+      .Case("ge",       0x0D)
+      .Case("gt",       0x0E)
+      .Case("true",     0x0F)
+      .Case("eq_os",    0x10)
+      .Case("lt_oq",    0x11)
+      .Case("le_oq",    0x12)
+      .Case("unord_s",  0x13)
+      .Case("neq_us",   0x14)
+      .Case("nlt_uq",   0x15)
+      .Case("nle_uq",   0x16)
+      .Case("ord_s",    0x17)
+      .Case("eq_us",    0x18)
+      .Case("nge_uq",   0x19)
+      .Case("ngt_uq",   0x1A)
+      .Case("false_os", 0x1B)
+      .Case("neq_os",   0x1C)
+      .Case("ge_oq",    0x1D)
+      .Case("gt_oq",    0x1E)
+      .Case("true_us",  0x1F)
+      .Default(~0U);
+    if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) {
+      ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode,
+                                          getParser().getContext());
+      if (PatchedName.endswith("ss")) {
+        PatchedName = IsVCMP ? "vcmpss" : "cmpss";
+      } else if (PatchedName.endswith("sd")) {
+        PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
+      } else if (PatchedName.endswith("ps")) {
+        PatchedName = IsVCMP ? "vcmpps" : "cmpps";
+      } else {
+        assert(PatchedName.endswith("pd") && "Unexpected mnemonic!");
+        PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+      }
+    }
+  }
+
+  Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+  if (ExtraImmOp && !isParsingIntelSyntax())
+    Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+  // Determine whether this is an instruction prefix.
+  bool isPrefix =
+    Name == "lock" || Name == "rep" ||
+    Name == "repe" || Name == "repz" ||
+    Name == "repne" || Name == "repnz" ||
+    Name == "rex64" || Name == "data16";
+
+
+  // This does the actual operand parsing.  Don't parse any more if we have a
+  // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+  // just want to parse the "lock" as the first instruction and the "incl" as
+  // the next one.
+  if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+
+    // Parse '*' modifier.
+    if (getLexer().is(AsmToken::Star))
+      Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+
+    // Read the operands.
+    while(1) {
+      if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+        Operands.push_back(std::move(Op));
+        if (!HandleAVX512Operand(Operands, *Operands.back()))
+          return true;
+      } else {
+         Parser.eatToEndOfStatement();
+         return true;
+      }
+      // check for comma and eat it
+      if (getLexer().is(AsmToken::Comma))
+        Parser.Lex();
+      else
+        break;
+     }
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return ErrorAndEatStatement(getLexer().getLoc(),
+                                  "unexpected token in argument list");
+   }
+
+  // Consume the EndOfStatement or the prefix separator Slash
+  if (getLexer().is(AsmToken::EndOfStatement) ||
+      (isPrefix && getLexer().is(AsmToken::Slash)))
+    Parser.Lex();
+
+  if (ExtraImmOp && isParsingIntelSyntax())
+    Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+  // This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
+  // "outb %al, %dx".  Out doesn't take a memory form, but this is a widely
+  // documented form in various unofficial manuals, so a lot of code uses it.
+  if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
+      Operands.size() == 3) {
+    X86Operand &Op = (X86Operand &)*Operands.back();
+    if (Op.isMem() && Op.Mem.SegReg == 0 &&
+        isa<MCConstantExpr>(Op.Mem.Disp) &&
+        cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+        Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+      SMLoc Loc = Op.getEndLoc();
+      Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+    }
+  }
+  // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
+  if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
+      Operands.size() == 3) {
+    X86Operand &Op = (X86Operand &)*Operands[1];
+    if (Op.isMem() && Op.Mem.SegReg == 0 &&
+        isa<MCConstantExpr>(Op.Mem.Disp) &&
+        cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+        Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+      SMLoc Loc = Op.getEndLoc();
+      Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+    }
+  }
+
+  // Append default arguments to "ins[bwld]"
+  if (Name.startswith("ins") && Operands.size() == 1 &&
+      (Name == "insb" || Name == "insw" || Name == "insl" ||
+       Name == "insd" )) {
+    if (isParsingIntelSyntax()) {
+      Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+      Operands.push_back(DefaultMemDIOperand(NameLoc));
+    } else {
+      Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+      Operands.push_back(DefaultMemDIOperand(NameLoc));
+    }
+  }
+
+  // Append default arguments to "outs[bwld]"
+  if (Name.startswith("outs") && Operands.size() == 1 &&
+      (Name == "outsb" || Name == "outsw" || Name == "outsl" ||
+       Name == "outsd" )) {
+    if (isParsingIntelSyntax()) {
+      Operands.push_back(DefaultMemSIOperand(NameLoc));
+      Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+    } else {
+      Operands.push_back(DefaultMemSIOperand(NameLoc));
+      Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+    }
+  }
+
+  // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
+  // values of $SIREG according to the mode. It would be nice if this
+  // could be achieved with InstAlias in the tables.
+  if (Name.startswith("lods") && Operands.size() == 1 &&
+      (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+       Name == "lodsl" || Name == "lodsd" || Name == "lodsq"))
+    Operands.push_back(DefaultMemSIOperand(NameLoc));
+
+  // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
+  // values of $DIREG according to the mode. It would be nice if this
+  // could be achieved with InstAlias in the tables.
+  if (Name.startswith("stos") && Operands.size() == 1 &&
+      (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+       Name == "stosl" || Name == "stosd" || Name == "stosq"))
+    Operands.push_back(DefaultMemDIOperand(NameLoc));
+
+  // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
+  // values of $DIREG according to the mode. It would be nice if this
+  // could be achieved with InstAlias in the tables.
+  if (Name.startswith("scas") && Operands.size() == 1 &&
+      (Name == "scas" || Name == "scasb" || Name == "scasw" ||
+       Name == "scasl" || Name == "scasd" || Name == "scasq"))
+    Operands.push_back(DefaultMemDIOperand(NameLoc));
+
+  // Add default SI and DI operands to "cmps[bwlq]".
+  if (Name.startswith("cmps") &&
+      (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
+       Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
+    if (Operands.size() == 1) {
+      if (isParsingIntelSyntax()) {
+        Operands.push_back(DefaultMemSIOperand(NameLoc));
+        Operands.push_back(DefaultMemDIOperand(NameLoc));
+      } else {
+        Operands.push_back(DefaultMemDIOperand(NameLoc));
+        Operands.push_back(DefaultMemSIOperand(NameLoc));
+      }
+    } else if (Operands.size() == 3) {
+      X86Operand &Op = (X86Operand &)*Operands[1];
+      X86Operand &Op2 = (X86Operand &)*Operands[2];
+      if (!doSrcDstMatch(Op, Op2))
+        return Error(Op.getStartLoc(),
+                     "mismatching source and destination index registers");
+    }
+  }
+
+  // Add default SI and DI operands to "movs[bwlq]".
+  if ((Name.startswith("movs") &&
+      (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+       Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+      (Name.startswith("smov") &&
+      (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+       Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
+    if (Operands.size() == 1) {
+      if (Name == "movsd")
+        Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+      if (isParsingIntelSyntax()) {
+        Operands.push_back(DefaultMemDIOperand(NameLoc));
+        Operands.push_back(DefaultMemSIOperand(NameLoc));
+      } else {
+        Operands.push_back(DefaultMemSIOperand(NameLoc));
+        Operands.push_back(DefaultMemDIOperand(NameLoc));
+      }
+    } else if (Operands.size() == 3) {
+      X86Operand &Op = (X86Operand &)*Operands[1];
+      X86Operand &Op2 = (X86Operand &)*Operands[2];
+      if (!doSrcDstMatch(Op, Op2))
+        return Error(Op.getStartLoc(),
+                     "mismatching source and destination index registers");
+    }
+  }
+
+  // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>.  Canonicalize to
+  // "shift <op>".
+  if ((Name.startswith("shr") || Name.startswith("sar") ||
+       Name.startswith("shl") || Name.startswith("sal") ||
+       Name.startswith("rcl") || Name.startswith("rcr") ||
+       Name.startswith("rol") || Name.startswith("ror")) &&
+      Operands.size() == 3) {
+    if (isParsingIntelSyntax()) {
+      // Intel syntax
+      X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
+      if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+          cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+        Operands.pop_back();
+    } else {
+      X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+      if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+          cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+        Operands.erase(Operands.begin() + 1);
+    }
+  }
+
+  // Transforms "int $3" into "int3" as a size optimization.  We can't write an
+  // instalias with an immediate operand yet.
+  if (Name == "int" && Operands.size() == 2) {
+    X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+    if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+        cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) {
+      Operands.erase(Operands.begin() + 1);
+      static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
+    }
+  }
+
+  return false;
+}
+
+static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg,
+                            bool isCmp) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  if (!isCmp)
+    TmpInst.addOperand(MCOperand::CreateReg(Reg));
+  TmpInst.addOperand(MCOperand::CreateReg(Reg));
+  TmpInst.addOperand(Inst.getOperand(0));
+  Inst = TmpInst;
+  return true;
+}
+
+static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+    return false;
+
+  return convertToSExti8(Inst, Opcode, X86::AX, isCmp);
+}
+
+static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+    return false;
+
+  return convertToSExti8(Inst, Opcode, X86::EAX, isCmp);
+}
+
+static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+    return false;
+
+  return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
+}
+
+bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
+  switch (Inst.getOpcode()) {
+  default: return false;
+  case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
+  case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8);
+  case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8);
+  case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8);
+  case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8);
+  case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8);
+  case X86::OR16i16:  return convert16i16to16ri8(Inst, X86::OR16ri8);
+  case X86::OR32i32:  return convert32i32to32ri8(Inst, X86::OR32ri8);
+  case X86::OR64i32:  return convert64i32to64ri8(Inst, X86::OR64ri8);
+  case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true);
+  case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true);
+  case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true);
+  case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8);
+  case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8);
+  case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8);
+  case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8);
+  case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8);
+  case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8);
+  case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8);
+  case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8);
+  case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8);
+  case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
+  case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
+  case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+        !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg()))
+      return false;
+
+    unsigned NewOpc;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VMOVAPDrr:  NewOpc = X86::VMOVAPDrr_REV;  break;
+    case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+    case X86::VMOVAPSrr:  NewOpc = X86::VMOVAPSrr_REV;  break;
+    case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+    case X86::VMOVDQArr:  NewOpc = X86::VMOVDQArr_REV;  break;
+    case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+    case X86::VMOVDQUrr:  NewOpc = X86::VMOVDQUrr_REV;  break;
+    case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+    case X86::VMOVUPDrr:  NewOpc = X86::VMOVUPDrr_REV;  break;
+    case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+    case X86::VMOVUPSrr:  NewOpc = X86::VMOVUPSrr_REV;  break;
+    case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+    }
+    Inst.setOpcode(NewOpc);
+    return true;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+        !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg()))
+      return false;
+    unsigned NewOpc;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV;   break;
+    case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV;   break;
+    }
+    Inst.setOpcode(NewOpc);
+    return true;
+  }
+  }
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+
+void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+                                   MCStreamer &Out) {
+  Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
+                                         Out);
+  Out.EmitInstruction(Inst, STI);
+}
+
+bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, unsigned &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+  assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+  ArrayRef<SMRange> EmptyRanges = None;
+
+  // First, handle aliases that expand to multiple instructions.
+  // FIXME: This should be replaced with a real .td file alias mechanism.
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+  // call.
+  if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" ||
+      Op.getToken() == "fstsww" || Op.getToken() == "fstcww" ||
+      Op.getToken() == "finit" || Op.getToken() == "fsave" ||
+      Op.getToken() == "fstenv" || Op.getToken() == "fclex") {
+    MCInst Inst;
+    Inst.setOpcode(X86::WAIT);
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+
+    const char *Repl = StringSwitch<const char *>(Op.getToken())
+                           .Case("finit", "fninit")
+                           .Case("fsave", "fnsave")
+                           .Case("fstcw", "fnstcw")
+                           .Case("fstcww", "fnstcw")
+                           .Case("fstenv", "fnstenv")
+                           .Case("fstsw", "fnstsw")
+                           .Case("fstsww", "fnstsw")
+                           .Case("fclex", "fnclex")
+                           .Default(nullptr);
+    assert(Repl && "Unknown wait-prefixed instruction");
+    Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+  }
+
+  bool WasOriginallyInvalidOperand = false;
+  MCInst Inst;
+
+  // First, try a direct match.
+  switch (MatchInstructionImpl(Operands, Inst,
+                               ErrorInfo, MatchingInlineAsm,
+                               isParsingIntelSyntax())) {
+  default: break;
+  case Match_Success:
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the
+    // individual transformations can chain off each other.
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
+
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Opcode = Inst.getOpcode();
+    return false;
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing.
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+  }
+  case Match_InvalidOperand:
+    WasOriginallyInvalidOperand = true;
+    break;
+  case Match_MnemonicFail:
+    break;
+  }
+
+  // FIXME: Ideally, we would only attempt suffix matches for things which are
+  // valid prefixes, and we could just infer the right unambiguous
+  // type. However, that requires substantially more matcher support than the
+  // following hack.
+
+  // Change the operand to point to a temporary token.
+  StringRef Base = Op.getToken();
+  SmallString<16> Tmp;
+  Tmp += Base;
+  Tmp += ' ';
+  Op.setTokenValue(Tmp.str());
+
+  // If this instruction starts with an 'f', then it is a floating point stack
+  // instruction.  These come in up to three forms for 32-bit, 64-bit, and
+  // 80-bit floating point, which use the suffixes s,l,t respectively.
+  //
+  // Otherwise, we assume that this may be an integer instruction, which comes
+  // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+  const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+
+  // Check for the various suffix matches.
+  Tmp[Base.size()] = Suffixes[0];
+  unsigned ErrorInfoIgnore;
+  unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+  unsigned Match1, Match2, Match3, Match4;
+
+  Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                MatchingInlineAsm, isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match1 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  Tmp[Base.size()] = Suffixes[1];
+  Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                MatchingInlineAsm, isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match2 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  Tmp[Base.size()] = Suffixes[2];
+  Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                MatchingInlineAsm, isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match3 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  Tmp[Base.size()] = Suffixes[3];
+  Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                MatchingInlineAsm, isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match4 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+
+  // Restore the old token.
+  Op.setTokenValue(Base);
+
+  // If exactly one matched, then we treat that as a successful match (and the
+  // instruction will already have been filled in correctly, since the failing
+  // matches won't have modified it).
+  unsigned NumSuccessfulMatches =
+    (Match1 == Match_Success) + (Match2 == Match_Success) +
+    (Match3 == Match_Success) + (Match4 == Match_Success);
+  if (NumSuccessfulMatches == 1) {
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Opcode = Inst.getOpcode();
+    return false;
+  }
+
+  // Otherwise, the match failed, try to produce a decent error message.
+
+  // If we had multiple suffix matches, then identify this as an ambiguous
+  // match.
+  if (NumSuccessfulMatches > 1) {
+    char MatchChars[4];
+    unsigned NumMatches = 0;
+    if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0];
+    if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1];
+    if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2];
+    if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3];
+
+    SmallString<126> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "ambiguous instructions require an explicit suffix (could be ";
+    for (unsigned i = 0; i != NumMatches; ++i) {
+      if (i != 0)
+        OS << ", ";
+      if (i + 1 == NumMatches)
+        OS << "or ";
+      OS << "'" << Base << MatchChars[i] << "'";
+    }
+    OS << ")";
+    Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+    return true;
+  }
+
+  // Okay, we know that none of the variants matched successfully.
+
+  // If all of the instructions reported an invalid mnemonic, then the original
+  // mnemonic was invalid.
+  if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
+      (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
+    if (!WasOriginallyInvalidOperand) {
+      ArrayRef<SMRange> Ranges =
+          MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
+      return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+                   Ranges, MatchingInlineAsm);
+    }
+
+    // Recover location info for the operand if we know which was the problem.
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction",
+                     EmptyRanges, MatchingInlineAsm);
+
+      X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+      if (Operand.getStartLoc().isValid()) {
+        SMRange OperandRange = Operand.getLocRange();
+        return Error(Operand.getStartLoc(), "invalid operand for instruction",
+                     OperandRange, MatchingInlineAsm);
+      }
+    }
+
+    return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+                 MatchingInlineAsm);
+  }
+
+  // If one instruction matched with a missing feature, report this as a
+  // missing feature.
+  if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) +
+      (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) {
+      if (ErrorInfoMissingFeature & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+  }
+
+  // If one instruction matched with an invalid operand, report this as an
+  // operand failure.
+  if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) +
+      (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){
+    Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+          MatchingInlineAsm);
+    return true;
+  }
+
+  // If all of these were an outright failure, report it in a useless way.
+  Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+        EmptyRanges, MatchingInlineAsm);
+  return true;
+}
+
+bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
+  return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo);
+}
+
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  else if (IDVal.startswith(".code"))
+    return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+  else if (IDVal.startswith(".att_syntax")) {
+    getParser().setAssemblerDialect(0);
+    return false;
+  } else if (IDVal.startswith(".intel_syntax")) {
+    getParser().setAssemblerDialect(1);
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      // FIXME: Handle noprefix
+      if (Parser.getTok().getString() == "noprefix")
+        Parser.Lex();
+    }
+    return false;
+  }
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return false;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma)) {
+        Error(L, "unexpected token in directive");
+        return false;
+      }
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveCode
+///  ::= .code16 | .code32 | .code64
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+  if (IDVal == ".code16") {
+    Parser.Lex();
+    if (!is16BitMode()) {
+      SwitchMode(X86::Mode16Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+    }
+  } else if (IDVal == ".code32") {
+    Parser.Lex();
+    if (!is32BitMode()) {
+      SwitchMode(X86::Mode32Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+    }
+  } else if (IDVal == ".code64") {
+    Parser.Lex();
+    if (!is64BitMode()) {
+      SwitchMode(X86::Mode64Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+    }
+  } else {
+    Error(L, "unknown directive " + IDVal);
+    return false;
+  }
+
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+  RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
+  RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
new file mode 100644
index 0000000..ef1565f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -0,0 +1,43 @@
+//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ASM_PARSER_COMMON_H
+#define X86_ASM_PARSER_COMMON_H
+
+namespace llvm {
+
+inline bool isImmSExti16i8Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000000000007FULL)||
+          (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+inline bool isImmSExti32i8Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000000000007FULL)||
+          (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+inline bool isImmZExtu32u8Value(uint64_t Value) {
+    return (Value <= 0x00000000000000FFULL);
+}
+
+inline bool isImmSExti64i8Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000000000007FULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+inline bool isImmSExti64i32Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000007FFFFFFFULL)||
+          (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+} // End of namespace llvm
+
+#endif // X86_ASM_PARSER_COMMON_H
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
new file mode 100644
index 0000000..1bbfc11
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -0,0 +1,488 @@
+//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_OPERAND_H
+#define X86_OPERAND_H
+
+#include "X86AsmParserCommon.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
+
+namespace llvm {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Register,
+    Immediate,
+    Memory
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  SMLoc OffsetOfLoc;
+  StringRef SymName;
+  void *OpDecl;
+  bool AddressOf;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNo;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned SegReg;
+    const MCExpr *Disp;
+    unsigned BaseReg;
+    unsigned IndexReg;
+    unsigned Scale;
+    unsigned Size;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  X86Operand(KindTy K, SMLoc Start, SMLoc End)
+    : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+  StringRef getSymName() override { return SymName; }
+  void *getOpDecl() override { return OpDecl; }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+  /// getOffsetOfLoc - Get the location of the offset operator.
+  SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
+
+  void print(raw_ostream &OS) const override {}
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  void setTokenValue(StringRef Value) {
+    assert(Kind == Token && "Invalid access!");
+    Tok.Data = Value.data();
+    Tok.Length = Value.size();
+  }
+
+  unsigned getReg() const override {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNo;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getMemDisp() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Disp;
+  }
+  unsigned getMemSegReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.SegReg;
+  }
+  unsigned getMemBaseReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.BaseReg;
+  }
+  unsigned getMemIndexReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg;
+  }
+  unsigned getMemScale() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Scale;
+  }
+
+  bool isToken() const override {return Kind == Token; }
+
+  bool isImm() const override { return Kind == Immediate; }
+
+  bool isImmSExti16i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti16i8Value(CE->getValue());
+  }
+  bool isImmSExti32i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti32i8Value(CE->getValue());
+  }
+  bool isImmZExtu32u8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmZExtu32u8Value(CE->getValue());
+  }
+  bool isImmSExti64i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i8Value(CE->getValue());
+  }
+  bool isImmSExti64i32() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i32Value(CE->getValue());
+  }
+
+  bool isOffsetOf() const override {
+    return OffsetOfLoc.getPointer();
+  }
+
+  bool needAddressOf() const override {
+    return AddressOf;
+  }
+
+  bool isMem() const override { return Kind == Memory; }
+  bool isMem8() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMem16() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMem32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMem64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMem80() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+  }
+  bool isMem128() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+  }
+  bool isMem256() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+  }
+  bool isMem512() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+  }
+
+  bool isMemVX32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+  }
+  bool isMemVY32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+  }
+  bool isMemVX64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+  }
+  bool isMemVY64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+  }
+  bool isMemVZ32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+  }
+  bool isMemVZ64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+  }
+
+  bool isAbsMem() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1;
+  }
+
+  bool isSrcIdx() const {
+    return !getMemIndexReg() && getMemScale() == 1 &&
+      (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
+       getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) &&
+      cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+  }
+  bool isSrcIdx8() const {
+    return isMem8() && isSrcIdx();
+  }
+  bool isSrcIdx16() const {
+    return isMem16() && isSrcIdx();
+  }
+  bool isSrcIdx32() const {
+    return isMem32() && isSrcIdx();
+  }
+  bool isSrcIdx64() const {
+    return isMem64() && isSrcIdx();
+  }
+
+  bool isDstIdx() const {
+    return !getMemIndexReg() && getMemScale() == 1 &&
+      (getMemSegReg() == 0 || getMemSegReg() == X86::ES) &&
+      (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI ||
+       getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) &&
+      cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+  }
+  bool isDstIdx8() const {
+    return isMem8() && isDstIdx();
+  }
+  bool isDstIdx16() const {
+    return isMem16() && isDstIdx();
+  }
+  bool isDstIdx32() const {
+    return isMem32() && isDstIdx();
+  }
+  bool isDstIdx64() const {
+    return isMem64() && isDstIdx();
+  }
+
+  bool isMemOffs8() const {
+    return Kind == Memory && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs16() const {
+    return Kind == Memory && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs32() const {
+    return Kind == Memory && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs64() const {
+    return Kind == Memory && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
+  }
+
+  bool isReg() const override { return Kind == Register; }
+
+  bool isGR32orGR64() const {
+    return Kind == Register &&
+      (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+      X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  static unsigned getGR32FromGR64(unsigned RegNo) {
+    switch (RegNo) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::RAX: return X86::EAX;
+    case X86::RCX: return X86::ECX;
+    case X86::RDX: return X86::EDX;
+    case X86::RBX: return X86::EBX;
+    case X86::RBP: return X86::EBP;
+    case X86::RSP: return X86::ESP;
+    case X86::RSI: return X86::ESI;
+    case X86::RDI: return X86::EDI;
+    case X86::R8: return X86::R8D;
+    case X86::R9: return X86::R9D;
+    case X86::R10: return X86::R10D;
+    case X86::R11: return X86::R11D;
+    case X86::R12: return X86::R12D;
+    case X86::R13: return X86::R13D;
+    case X86::R14: return X86::R14D;
+    case X86::R15: return X86::R15D;
+    case X86::RIP: return X86::EIP;
+    }
+  }
+
+  void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned RegNo = getReg();
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+      RegNo = getGR32FromGR64(RegNo);
+    Inst.addOperand(MCOperand::CreateReg(RegNo));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 5) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+    Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+    addExpr(Inst, getMemDisp());
+    Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+  }
+
+  void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+  }
+
+  void addSrcIdxOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 2) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+  }
+  void addDstIdxOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+  }
+
+  void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 2) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+    Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+  }
+
+  static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
+    SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+    auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  static std::unique_ptr<X86Operand>
+  CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+            bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+            StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+    auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+    Res->Reg.RegNo = RegNo;
+    Res->AddressOf = AddressOf;
+    Res->OffsetOfLoc = OffsetOfLoc;
+    Res->SymName = SymName;
+    Res->OpDecl = OpDecl;
+    return Res;
+  }
+
+  static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+                                               SMLoc StartLoc, SMLoc EndLoc) {
+    auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+    Res->Imm.Val = Val;
+    return Res;
+  }
+
+  /// Create an absolute memory operand.
+  static std::unique_ptr<X86Operand>
+  CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0,
+            StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+    auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = 0;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = 0;
+    Res->Mem.IndexReg = 0;
+    Res->Mem.Scale    = 1;
+    Res->Mem.Size     = Size;
+    Res->SymName      = SymName;
+    Res->OpDecl       = OpDecl;
+    Res->AddressOf    = false;
+    return Res;
+  }
+
+  /// Create a generalized memory operand.
+  static std::unique_ptr<X86Operand>
+  CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+            unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+            unsigned Size = 0, StringRef SymName = StringRef(),
+            void *OpDecl = nullptr) {
+    // We should never just have a displacement, that should be parsed as an
+    // absolute memory operand.
+    assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+    // The scale should always be one of {1,2,4,8}.
+    assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+           "Invalid scale!");
+    auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = SegReg;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = BaseReg;
+    Res->Mem.IndexReg = IndexReg;
+    Res->Mem.Scale    = Scale;
+    Res->Mem.Size     = Size;
+    Res->SymName      = SymName;
+    Res->OpDecl       = OpDecl;
+    Res->AddressOf    = false;
+    return Res;
+  }
+};
+
+} // End of namespace llvm
+
+#endif // X86_OPERAND
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 0000000..521bd21
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,816 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+//  MCInsts.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Disassembler.h"
+#include "X86DisassemblerDecoder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+                                  const char *s) {
+  dbgs() << file << ":" << line << ": " << s;
+}
+
+const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+                                                const void *mii) {
+  const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
+  return MII->getName(Opcode);
+}
+
+#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
+
+namespace llvm {  
+  
+// Fill-ins to make the compiler happy.  These constants are never actually
+//   assigned; they are just filler to make an automatically-generated switch
+//   statement work.
+namespace X86 {
+  enum {
+    BX_SI = 500,
+    BX_DI = 501,
+    BP_SI = 502,
+    BP_DI = 503,
+    sib   = 504,
+    sib64 = 505
+  };
+}
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+}
+
+static bool translateInstruction(MCInst &target,
+                                InternalInstruction &source,
+                                const MCDisassembler *Dis);
+
+X86GenericDisassembler::X86GenericDisassembler(
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx,
+                                         std::unique_ptr<const MCInstrInfo> MII)
+  : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
+  switch (STI.getFeatureBits() &
+          (X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) {
+  case X86::Mode16Bit:
+    fMode = MODE_16BIT;
+    break;
+  case X86::Mode32Bit:
+    fMode = MODE_32BIT;
+    break;
+  case X86::Mode64Bit:
+    fMode = MODE_64BIT;
+    break;
+  default:
+    llvm_unreachable("Invalid CPU mode");
+  }
+}
+
+/// regionReader - a callback function that wraps the readByte method from
+///   MemoryObject.
+///
+/// @param arg      - The generic callback parameter.  In this case, this should
+///                   be a pointer to a MemoryObject.
+/// @param byte     - A pointer to the byte to be read.
+/// @param address  - The address to be read.
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+  const MemoryObject* region = static_cast<const MemoryObject*>(arg);
+  return region->readByte(address, byte);
+}
+
+/// logger - a callback function that wraps the operator<< method from
+///   raw_ostream.
+///
+/// @param arg      - The generic callback parameter.  This should be a pointe
+///                   to a raw_ostream.
+/// @param log      - A string to be logged.  logger() adds a newline.
+static void logger(void* arg, const char* log) {
+  if (!arg)
+    return;
+  
+  raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+  vStream << log << "\n";
+}  
+  
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus
+X86GenericDisassembler::getInstruction(MCInst &instr,
+                                       uint64_t &size,
+                                       const MemoryObject &region,
+                                       uint64_t address,
+                                       raw_ostream &vStream,
+                                       raw_ostream &cStream) const {
+  CommentStream = &cStream;
+
+  InternalInstruction internalInstr;
+
+  dlog_t loggerFn = logger;
+  if (&vStream == &nulls())
+    loggerFn = nullptr; // Disable logging completely if it's going to nulls().
+  
+  int ret = decodeInstruction(&internalInstr,
+                              regionReader,
+                              (const void*)&region,
+                              loggerFn,
+                              (void*)&vStream,
+                              (const void*)MII.get(),
+                              address,
+                              fMode);
+
+  if (ret) {
+    size = internalInstr.readerCursor - address;
+    return Fail;
+  }
+  else {
+    size = internalInstr.length;
+    return (!translateInstruction(instr, internalInstr, this)) ?
+            Success : Fail;
+  }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+///   register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst     - The MCInst to append to.
+/// @param reg        - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+  uint8_t llvmRegnums[] = {
+    ALL_REGS
+    0
+  };
+#undef ENTRY
+
+  uint8_t llvmRegnum = llvmRegnums[reg];
+  mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst. 
+///
+/// @param Value      - The immediate Value, has had any PC adjustment made by
+///                     the caller.
+/// @param isBranch   - If the instruction is a branch instruction
+/// @param Address    - The starting address of the instruction
+/// @param Offset     - The byte offset to this immediate in the instruction
+/// @param Width      - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width.  If that
+/// returns non-zero then the symbolic information it returns is used to create 
+/// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created.  This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI, 
+                                     const MCDisassembler *Dis) {  
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+                                       Offset, Width);
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool.  The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry.  The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an 
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  Dis->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+  0,        // SEG_OVERRIDE_NONE
+  X86::CS,
+  X86::SS,
+  X86::DS,
+  X86::ES,
+  X86::FS,
+  X86::GS
+};
+
+/// translateSrcIndex   - Appends a source index operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction.
+static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
+  unsigned baseRegNo;
+
+  if (insn.mode == MODE_64BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI;
+  else if (insn.mode == MODE_32BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI;
+  else {
+    assert(insn.mode == MODE_16BIT);
+    baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI;
+  }
+  MCOperand baseReg = MCOperand::CreateReg(baseRegNo);
+  mcInst.addOperand(baseReg);
+
+  MCOperand segmentReg;
+  segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+  mcInst.addOperand(segmentReg);
+  return false;
+}
+
+/// translateDstIndex   - Appends a destination index operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction.
+
+static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
+  unsigned baseRegNo;
+
+  if (insn.mode == MODE_64BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI;
+  else if (insn.mode == MODE_32BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI;
+  else {
+    assert(insn.mode == MODE_16BIT);
+    baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI;
+  }
+  MCOperand baseReg = MCOperand::CreateReg(baseRegNo);
+  mcInst.addOperand(baseReg);
+  return false;
+}
+
+/// translateImmediate  - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param immediate    - The immediate value to append.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+                               const OperandSpecifier &operand,
+                               InternalInstruction &insn,
+                               const MCDisassembler *Dis) {  
+  // Sign-extend the immediate if necessary.
+
+  OperandType type = (OperandType)operand.type;
+
+  bool isBranch = false;
+  uint64_t pcrel = 0;
+  if (type == TYPE_RELv) {
+    isBranch = true;
+    pcrel = insn.startLocation +
+            insn.immediateOffset + insn.immediateSize;
+    switch (insn.displacementSize) {
+    default:
+      break;
+    case 1:
+      if(immediate & 0x80)
+        immediate |= ~(0xffull);
+      break;
+    case 2:
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
+      break;
+    case 4:
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
+      break;
+    case 8:
+      break;
+    }
+  }
+  // By default sign-extend all X86 immediates based on their encoding.
+  else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+           type == TYPE_IMM64 || type == TYPE_IMMv) {
+    uint32_t Opcode = mcInst.getOpcode();
+    switch (operand.encoding) {
+    default:
+      break;
+    case ENCODING_IB:
+      // Special case those X86 instructions that use the imm8 as a set of
+      // bits, bit count, etc. and are not sign-extend.
+      if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
+          Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+          Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+          Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+          Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+          Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+          Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+          Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+          Opcode != X86::VINSERTPSrr)
+        if(immediate & 0x80)
+          immediate |= ~(0xffull);
+      break;
+    case ENCODING_IW:
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
+      break;
+    case ENCODING_ID:
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
+      break;
+    case ENCODING_IO:
+      break;
+    }
+  }
+
+  switch (type) {
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+    mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM256:
+    mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM512:
+    mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4)));
+    return;
+  case TYPE_REL8:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    if(immediate & 0x80)
+      immediate |= ~(0xffull);
+    break;
+  case TYPE_REL32:
+  case TYPE_REL64:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    if(immediate & 0x80000000)
+      immediate |= ~(0xffffffffull);
+    break;
+  default:
+    // operand is 64 bits wide.  Do nothing.
+    break;
+  }
+
+  if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+                               insn.immediateOffset, insn.immediateSize,
+                               mcInst, Dis))
+    mcInst.addOperand(MCOperand::CreateImm(immediate));
+
+  if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
+      type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+    MCOperand segmentReg;
+    segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+    mcInst.addOperand(segmentReg);
+  }
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+///   ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction to extract the R/M field
+///                       from.
+/// @return             - 0 on success; -1 otherwise
+static bool translateRMRegister(MCInst &mcInst,
+                                InternalInstruction &insn) {
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    debug("A R/M register operand may not have a SIB byte");
+    return true;
+  }
+  
+  switch (insn.eaBase) {
+  default:
+    debug("Unexpected EA base register");
+    return true;
+  case EA_BASE_NONE:
+    debug("EA_BASE_NONE for ModR/M base");
+    return true;
+#define ENTRY(x) case EA_BASE_##x:
+  ALL_EA_BASES
+#undef ENTRY
+    debug("A R/M register operand may not have a base; "
+          "the operand must be a register.");
+    return true;
+#define ENTRY(x)                                                      \
+  case EA_REG_##x:                                                    \
+    mcInst.addOperand(MCOperand::CreateReg(X86::x)); break;
+  ALL_REGS
+#undef ENTRY
+  }
+  
+  return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+///   fields of an internal instruction (and possibly its SIB byte) to a memory
+///   operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @return             - 0 on success; nonzero otherwise
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+                              const MCDisassembler *Dis) {  
+  // Addresses in an MCInst are represented as five operands:
+  //   1. basereg       (register)  The R/M base, or (if there is a SIB) the 
+  //                                SIB base
+  //   2. scaleamount   (immediate) 1, or (if there is a SIB) the specified 
+  //                                scale amount
+  //   3. indexreg      (register)  x86_registerNONE, or (if there is a SIB)
+  //                                the index (which is multiplied by the 
+  //                                scale amount)
+  //   4. displacement  (immediate) 0, or the displacement if there is one
+  //   5. segmentreg    (register)  x86_registerNONE for now, but could be set
+  //                                if we have segment overrides
+  
+  MCOperand baseReg;
+  MCOperand scaleAmount;
+  MCOperand indexReg;
+  MCOperand displacement;
+  MCOperand segmentReg;
+  uint64_t pcrel = 0;
+  
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    if (insn.sibBase != SIB_BASE_NONE) {
+      switch (insn.sibBase) {
+      default:
+        debug("Unexpected sibBase");
+        return true;
+#define ENTRY(x)                                          \
+      case SIB_BASE_##x:                                  \
+        baseReg = MCOperand::CreateReg(X86::x); break;
+      ALL_SIB_BASES
+#undef ENTRY
+      }
+    } else {
+      baseReg = MCOperand::CreateReg(0);
+    }
+
+    // Check whether we are handling VSIB addressing mode for GATHER.
+    // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+    // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+    // I don't see a way to get the correct IndexReg in readSIB:
+    //   We can tell whether it is VSIB or SIB after instruction ID is decoded,
+    //   but instruction ID may not be decoded yet when calling readSIB.
+    uint32_t Opcode = mcInst.getOpcode();
+    bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+                       Opcode == X86::VGATHERDPDYrm ||
+                       Opcode == X86::VGATHERQPDrm ||
+                       Opcode == X86::VGATHERDPSrm ||
+                       Opcode == X86::VGATHERQPSrm ||
+                       Opcode == X86::VPGATHERDQrm ||
+                       Opcode == X86::VPGATHERDQYrm ||
+                       Opcode == X86::VPGATHERQQrm ||
+                       Opcode == X86::VPGATHERDDrm ||
+                       Opcode == X86::VPGATHERQDrm);
+    bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+                       Opcode == X86::VGATHERDPSYrm ||
+                       Opcode == X86::VGATHERQPSYrm ||
+                       Opcode == X86::VGATHERDPDZrm ||
+                       Opcode == X86::VPGATHERDQZrm ||
+                       Opcode == X86::VPGATHERQQYrm ||
+                       Opcode == X86::VPGATHERDDYrm ||
+                       Opcode == X86::VPGATHERQDYrm);
+    bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
+                       Opcode == X86::VGATHERDPSZrm ||
+                       Opcode == X86::VGATHERQPSZrm ||
+                       Opcode == X86::VPGATHERQQZrm ||
+                       Opcode == X86::VPGATHERDDZrm ||
+                       Opcode == X86::VPGATHERQDZrm);
+    if (IndexIs128 || IndexIs256 || IndexIs512) {
+      unsigned IndexOffset = insn.sibIndex -
+                         (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+      SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
+                           IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+      insn.sibIndex = (SIBIndex)(IndexBase + 
+                           (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+    }
+
+    if (insn.sibIndex != SIB_INDEX_NONE) {
+      switch (insn.sibIndex) {
+      default:
+        debug("Unexpected sibIndex");
+        return true;
+#define ENTRY(x)                                          \
+      case SIB_INDEX_##x:                                 \
+        indexReg = MCOperand::CreateReg(X86::x); break;
+      EA_BASES_32BIT
+      EA_BASES_64BIT
+      REGS_XMM
+      REGS_YMM
+      REGS_ZMM
+#undef ENTRY
+      }
+    } else {
+      indexReg = MCOperand::CreateReg(0);
+    }
+    
+    scaleAmount = MCOperand::CreateImm(insn.sibScale);
+  } else {
+    switch (insn.eaBase) {
+    case EA_BASE_NONE:
+      if (insn.eaDisplacement == EA_DISP_NONE) {
+        debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+        return true;
+      }
+      if (insn.mode == MODE_64BIT){
+        pcrel = insn.startLocation +
+                insn.displacementOffset + insn.displacementSize;
+        tryAddingPcLoadReferenceComment(insn.startLocation +
+                                        insn.displacementOffset,
+                                        insn.displacement + pcrel, Dis);
+        baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6
+      }
+      else
+        baseReg = MCOperand::CreateReg(0);
+      
+      indexReg = MCOperand::CreateReg(0);
+      break;
+    case EA_BASE_BX_SI:
+      baseReg = MCOperand::CreateReg(X86::BX);
+      indexReg = MCOperand::CreateReg(X86::SI);
+      break;
+    case EA_BASE_BX_DI:
+      baseReg = MCOperand::CreateReg(X86::BX);
+      indexReg = MCOperand::CreateReg(X86::DI);
+      break;
+    case EA_BASE_BP_SI:
+      baseReg = MCOperand::CreateReg(X86::BP);
+      indexReg = MCOperand::CreateReg(X86::SI);
+      break;
+    case EA_BASE_BP_DI:
+      baseReg = MCOperand::CreateReg(X86::BP);
+      indexReg = MCOperand::CreateReg(X86::DI);
+      break;
+    default:
+      indexReg = MCOperand::CreateReg(0);
+      switch (insn.eaBase) {
+      default:
+        debug("Unexpected eaBase");
+        return true;
+        // Here, we will use the fill-ins defined above.  However,
+        //   BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+        //   sib and sib64 were handled in the top-level if, so they're only
+        //   placeholders to keep the compiler happy.
+#define ENTRY(x)                                        \
+      case EA_BASE_##x:                                 \
+        baseReg = MCOperand::CreateReg(X86::x); break; 
+      ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+      ALL_REGS
+#undef ENTRY
+        debug("A R/M memory operand may not be a register; "
+              "the base field must be a base.");
+        return true;
+      }
+    }
+    
+    scaleAmount = MCOperand::CreateImm(1);
+  }
+  
+  displacement = MCOperand::CreateImm(insn.displacement);
+
+  segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+  
+  mcInst.addOperand(baseReg);
+  mcInst.addOperand(scaleAmount);
+  mcInst.addOperand(indexReg);
+  if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+                               insn.startLocation, insn.displacementOffset,
+                               insn.displacementSize, mcInst, Dis))
+    mcInst.addOperand(displacement);
+  mcInst.addOperand(segmentReg);
+  return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+///   byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @return             - 0 on success; nonzero otherwise
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+                        InternalInstruction &insn, const MCDisassembler *Dis) {  
+  switch (operand.type) {
+  default:
+    debug("Unexpected type for a R/M operand");
+    return true;
+  case TYPE_R8:
+  case TYPE_R16:
+  case TYPE_R32:
+  case TYPE_R64:
+  case TYPE_Rv:
+  case TYPE_MM:
+  case TYPE_MM32:
+  case TYPE_MM64:
+  case TYPE_XMM:
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+  case TYPE_XMM256:
+  case TYPE_XMM512:
+  case TYPE_VK1:
+  case TYPE_VK8:
+  case TYPE_VK16:
+  case TYPE_DEBUGREG:
+  case TYPE_CONTROLREG:
+    return translateRMRegister(mcInst, insn);
+  case TYPE_M:
+  case TYPE_M8:
+  case TYPE_M16:
+  case TYPE_M32:
+  case TYPE_M64:
+  case TYPE_M128:
+  case TYPE_M256:
+  case TYPE_M512:
+  case TYPE_Mv:
+  case TYPE_M32FP:
+  case TYPE_M64FP:
+  case TYPE_M80FP:
+  case TYPE_M16INT:
+  case TYPE_M32INT:
+  case TYPE_M64INT:
+  case TYPE_M1616:
+  case TYPE_M1632:
+  case TYPE_M1664:
+  case TYPE_LEA:
+    return translateRMMemory(mcInst, insn, Dis);
+  }
+}
+  
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+///   LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param stackPos     - The stack position to translate.
+static void translateFPRegister(MCInst &mcInst,
+                                uint8_t stackPos) {
+  mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+}
+
+/// translateMaskRegister - Translates a 3-bit mask register number to
+///   LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param maskRegNum   - Number of mask register from 0 to 7.
+/// @return             - false on success; true otherwise.
+static bool translateMaskRegister(MCInst &mcInst,
+                                uint8_t maskRegNum) {
+  if (maskRegNum >= 8) {
+    debug("Invalid mask register number");
+    return true;
+  }
+
+  mcInst.addOperand(MCOperand::CreateReg(X86::K0 + maskRegNum));
+  return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction 
+///   to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+/// @return             - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+                             InternalInstruction &insn,
+                             const MCDisassembler *Dis) {  
+  switch (operand.encoding) {
+  default:
+    debug("Unhandled operand encoding during translation");
+    return true;
+  case ENCODING_REG:
+    translateRegister(mcInst, insn.reg);
+    return false;
+  case ENCODING_WRITEMASK:
+    return translateMaskRegister(mcInst, insn.writemask);
+  CASE_ENCODING_RM:
+    return translateRM(mcInst, operand, insn, Dis);
+  case ENCODING_CB:
+  case ENCODING_CW:
+  case ENCODING_CD:
+  case ENCODING_CP:
+  case ENCODING_CO:
+  case ENCODING_CT:
+    debug("Translation of code offsets isn't supported.");
+    return true;
+  case ENCODING_IB:
+  case ENCODING_IW:
+  case ENCODING_ID:
+  case ENCODING_IO:
+  case ENCODING_Iv:
+  case ENCODING_Ia:
+    translateImmediate(mcInst,
+                       insn.immediates[insn.numImmediatesTranslated++],
+                       operand,
+                       insn,
+                       Dis);
+    return false;
+  case ENCODING_SI:
+    return translateSrcIndex(mcInst, insn);
+  case ENCODING_DI:
+    return translateDstIndex(mcInst, insn);
+  case ENCODING_RB:
+  case ENCODING_RW:
+  case ENCODING_RD:
+  case ENCODING_RO:
+  case ENCODING_Rv:
+    translateRegister(mcInst, insn.opcodeRegister);
+    return false;
+  case ENCODING_FP:
+    translateFPRegister(mcInst, insn.modRM & 7);
+    return false;
+  case ENCODING_VVVV:
+    translateRegister(mcInst, insn.vvvv);
+    return false;
+  case ENCODING_DUP:
+    return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+                            insn, Dis);
+  }
+}
+  
+/// translateInstruction - Translates an internal instruction and all its
+///   operands to an MCInst.
+///
+/// @param mcInst       - The MCInst to populate with the instruction's data.
+/// @param insn         - The internal instruction.
+/// @return             - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+                                InternalInstruction &insn,
+                                const MCDisassembler *Dis) {  
+  if (!insn.spec) {
+    debug("Instruction has no specification");
+    return true;
+  }
+  
+  mcInst.setOpcode(insn.instructionID);
+  // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
+  // prefix bytes should be disassembled as xrelease and xacquire then set the
+  // opcode to those instead of the rep and repne opcodes.
+  if (insn.xAcquireRelease) {
+    if(mcInst.getOpcode() == X86::REP_PREFIX)
+      mcInst.setOpcode(X86::XRELEASE_PREFIX);
+    else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
+      mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+  }
+  
+  insn.numImmediatesTranslated = 0;
+  
+  for (const auto &Op : insn.operands) {
+    if (Op.encoding != ENCODING_NONE) {
+      if (translateOperand(mcInst, Op, insn, Dis)) {
+        return true;
+      }
+    }
+  }
+  
+  return false;
+}
+
+static MCDisassembler *createX86Disassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+  return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
+}
+
+extern "C" void LLVMInitializeX86Disassembler() { 
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheX86_32Target, 
+                                         createX86Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+                                         createX86Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
new file mode 100644
index 0000000..4dc7c29
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -0,0 +1,116 @@
+//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets.  The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+//    These attributes, recorded in enum attributeBits
+//    (X86DisassemblerDecoderCommon.h), form a bitmask.  The table CONTEXTS_SYM
+//    provides a mapping from bitmasks to contexts, which are represented by
+//    enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is.  The
+//    disassembler distinguishes four kinds of opcodes, which are enumerated in
+//    OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+//    (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+//    (0x0f 0x3a 0xnn).  Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+//    (X86DisassemblerDecoderCommon.h).  Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode in.  Look up the opcode, to get
+//    a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+//    instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+//    ModR/M byte to complete decode.  The ModRMDecision's type is an entry from
+//    ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+//    ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+//    of type InstrUID (X86DisassemblerDecoderCommon.h).  Looking this ID up in
+//    INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+//    meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+//    (X86DisassemblerDecoderCommon.h) and its type is an entry from
+//    OperandType (ibid.).  The encoding indicates how to read it from the
+//    instruction; the type indicates how to interpret the value once it has
+//    been read.  For example, a register operand could be stored in the R/M
+//    field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode.  This is orthogonal from its meaning (an GPR or an XMM
+//    register, for instance).  Given this information, the operands can be
+//    extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+//    and operands into a format understandable by the client - in this case, an
+//    MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself.  The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+//   adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+//   invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+//   table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+//   factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+//   responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLER_H
+#define X86DISASSEMBLER_H
+
+#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MemoryObject;
+class raw_ostream;
+
+namespace X86Disassembler {
+
+/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
+///   All each platform class should have to do is subclass the constructor, and
+///   provide a different disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+  std::unique_ptr<const MCInstrInfo> MII;
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                         std::unique_ptr<const MCInstrInfo> MII);
+public:
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+                              const MemoryObject &region, uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+
+private:
+  DisassemblerMode              fMode;
+};
+
+} // namespace X86Disassembler
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
new file mode 100644
index 0000000..ab3d1f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -0,0 +1,1842 @@
+//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdarg.h>   /* for va_*()       */
+#include <stdio.h>    /* for vsnprintf()  */
+#include <stdlib.h>   /* for exit()       */
+#include <string.h>   /* for memset()     */
+
+#include "X86DisassemblerDecoder.h"
+
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to.  Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+  uint8_t modrm_type;
+  uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+  ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes).  Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes.  Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+  OpcodeDecision opcodeDecisions[IC_max];
+};
+
+#include "X86GenDisassemblerTables.inc"
+
+#ifndef NDEBUG
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+
+/*
+ * contextForAttrs - Client for the instruction context table.  Takes a set of
+ *   attributes and returns the appropriate decode context.
+ *
+ * @param attrMask  - Attributes, from the enumeration attributeBits.
+ * @return          - The InstructionContext to use when looking up an
+ *                    an instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint16_t attrMask) {
+  return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ *   the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type        - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ *                      contextForAttrs.
+ * @param opcode      - The last byte of the instruction's opcode, not counting
+ *                      ModR/M extensions and escapes.
+ * @return            - true if the ModR/M byte is required, false otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+                         InstructionContext insnContext,
+                         uint16_t opcode) {
+  const struct ContextDecision* decision = nullptr;
+
+  switch (type) {
+  case ONEBYTE:
+    decision = &ONEBYTE_SYM;
+    break;
+  case TWOBYTE:
+    decision = &TWOBYTE_SYM;
+    break;
+  case THREEBYTE_38:
+    decision = &THREEBYTE38_SYM;
+    break;
+  case THREEBYTE_3A:
+    decision = &THREEBYTE3A_SYM;
+    break;
+  case XOP8_MAP:
+    decision = &XOP8_MAP_SYM;
+    break;
+  case XOP9_MAP:
+    decision = &XOP9_MAP_SYM;
+    break;
+  case XOPA_MAP:
+    decision = &XOPA_MAP_SYM;
+    break;
+  }
+
+  return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+    modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ *   an instruction.
+ *
+ * @param type        - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode      - See modRMRequired().
+ * @param modRM       - The ModR/M byte if required, or any value if not.
+ * @return            - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+                       InstructionContext insnContext,
+                       uint8_t opcode,
+                       uint8_t modRM) {
+  const struct ModRMDecision* dec = nullptr;
+
+  switch (type) {
+  case ONEBYTE:
+    dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case TWOBYTE:
+    dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_38:
+    dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_3A:
+    dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOP8_MAP:
+    dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOP9_MAP:
+    dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOPA_MAP:
+    dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  }
+
+  switch (dec->modrm_type) {
+  default:
+    debug("Corrupt table!  Unknown modrm_type");
+    return 0;
+  case MODRM_ONEENTRY:
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITRM:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+1];
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITREG:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_SPLITMISC:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_FULL:
+    return modRMTable[dec->instructionIDs+modRM];
+  }
+}
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ *   that instruction.
+ *
+ * @param uid - The unique ID for the instruction.  This should be returned by
+ *              decode(); specifierForUID will not check bounds.
+ * @return    - A pointer to the specification for that instruction.
+ */
+static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
+  return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ *   byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn  - The instruction with the reader function to use.  The cursor
+ *                for this instruction is advanced.
+ * @param byte  - A pointer to a pre-allocated memory buffer to be populated
+ *                with the data read.
+ * @return      - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+  int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+  if (!ret)
+    ++(insn->readerCursor);
+
+  return ret;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn  - See consumeByte().
+ * @param byte  - See consumeByte().
+ * @return      - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+  return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+static void unconsumeByte(struct InternalInstruction* insn) {
+  insn->readerCursor--;
+}
+
+#define CONSUME_FUNC(name, type)                                  \
+  static int name(struct InternalInstruction* insn, type* ptr) {  \
+    type combined = 0;                                            \
+    unsigned offset;                                              \
+    for (offset = 0; offset < sizeof(type); ++offset) {           \
+      uint8_t byte;                                               \
+      int ret = insn->reader(insn->readerArg,                     \
+                             &byte,                               \
+                             insn->readerCursor + offset);        \
+      if (ret)                                                    \
+        return ret;                                               \
+      combined = combined | ((uint64_t)byte << (offset * 8));     \
+    }                                                             \
+    *ptr = combined;                                              \
+    insn->readerCursor += sizeof(type);                           \
+    return 0;                                                     \
+  }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ *   values of various sizes from the instruction's memory and advance the
+ *   cursor appropriately.  These readers perform endian conversion.
+ *
+ * @param insn    - See consumeByte().
+ * @param ptr     - A pointer to a pre-allocated memory of appropriate size to
+ *                  be populated with the data read.
+ * @return        - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ *   message, typically without a carriage-return.
+ *
+ * @param insn    - The instruction containing the logging function.
+ * @param format  - See printf().
+ * @param ...     - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+                      const char* format,
+                      ...) {
+  char buffer[256];
+  va_list ap;
+
+  if (!insn->dlog)
+    return;
+
+  va_start(ap, format);
+  (void)vsnprintf(buffer, sizeof(buffer), format, ap);
+  va_end(ap);
+
+  insn->dlog(insn->dlogArg, buffer);
+
+  return;
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ *   location.
+ *
+ * @param insn      - The instruction to be marked as having the prefix.
+ * @param prefix    - The prefix that is present.
+ * @param location  - The location where the prefix is located (in the address
+ *                    space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+                                    uint8_t prefix,
+                                    uint64_t location)
+{
+  insn->prefixPresent[prefix] = 1;
+  insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ *   present at a given location.
+ *
+ * @param insn      - The instruction to be queried.
+ * @param prefix    - The prefix.
+ * @param location  - The location to query.
+ * @return          - Whether the prefix is at that location.
+ */
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
+                               uint8_t prefix,
+                               uint64_t location)
+{
+  if (insn->prefixPresent[prefix] == 1 &&
+     insn->prefixLocations[prefix] == location)
+    return true;
+  else
+    return false;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ *   instruction as having them.  Also sets the instruction's default operand,
+ *   address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn  - The instruction whose prefixes are to be read.
+ * @return      - 0 if the instruction could be read until the end of the prefix
+ *                bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+  bool isPrefix = true;
+  bool prefixGroups[4] = { false };
+  uint64_t prefixLocation;
+  uint8_t byte = 0;
+  uint8_t nextByte;
+
+  bool hasAdSize = false;
+  bool hasOpSize = false;
+
+  dbgprintf(insn, "readPrefixes()");
+
+  while (isPrefix) {
+    prefixLocation = insn->readerCursor;
+
+    /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
+    if (consumeByte(insn, &byte))
+      break;
+
+    /*
+     * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+     * break and let it be disassembled as a normal "instruction".
+     */
+    if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+      break;
+
+    if (insn->readerCursor - 1 == insn->startLocation
+        && (byte == 0xf2 || byte == 0xf3)
+        && !lookAtByte(insn, &nextByte))
+    {
+      /*
+       * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+       * met:
+       * - it is followed by a LOCK (0xf0) prefix
+       * - it is followed by an xchg instruction
+       * then it should be disassembled as a xacquire/xrelease not repne/rep.
+       */
+      if ((byte == 0xf2 || byte == 0xf3) &&
+          ((nextByte == 0xf0) |
+          ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+        insn->xAcquireRelease = true;
+      /*
+       * Also if the byte is 0xf3, and the following condition is met:
+       * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+       *                       "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+       * then it should be disassembled as an xrelease not rep.
+       */
+      if (byte == 0xf3 &&
+          (nextByte == 0x88 || nextByte == 0x89 ||
+           nextByte == 0xc6 || nextByte == 0xc7))
+        insn->xAcquireRelease = true;
+      if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+        if (consumeByte(insn, &nextByte))
+          return -1;
+        if (lookAtByte(insn, &nextByte))
+          return -1;
+        unconsumeByte(insn);
+      }
+      if (nextByte != 0x0f && nextByte != 0x90)
+        break;
+    }
+
+    switch (byte) {
+    case 0xf0:  /* LOCK */
+    case 0xf2:  /* REPNE/REPNZ */
+    case 0xf3:  /* REP or REPE/REPZ */
+      if (prefixGroups[0])
+        dbgprintf(insn, "Redundant Group 1 prefix");
+      prefixGroups[0] = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x2e:  /* CS segment override -OR- Branch not taken */
+    case 0x36:  /* SS segment override -OR- Branch taken */
+    case 0x3e:  /* DS segment override */
+    case 0x26:  /* ES segment override */
+    case 0x64:  /* FS segment override */
+    case 0x65:  /* GS segment override */
+      switch (byte) {
+      case 0x2e:
+        insn->segmentOverride = SEG_OVERRIDE_CS;
+        break;
+      case 0x36:
+        insn->segmentOverride = SEG_OVERRIDE_SS;
+        break;
+      case 0x3e:
+        insn->segmentOverride = SEG_OVERRIDE_DS;
+        break;
+      case 0x26:
+        insn->segmentOverride = SEG_OVERRIDE_ES;
+        break;
+      case 0x64:
+        insn->segmentOverride = SEG_OVERRIDE_FS;
+        break;
+      case 0x65:
+        insn->segmentOverride = SEG_OVERRIDE_GS;
+        break;
+      default:
+        debug("Unhandled override");
+        return -1;
+      }
+      if (prefixGroups[1])
+        dbgprintf(insn, "Redundant Group 2 prefix");
+      prefixGroups[1] = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x66:  /* Operand-size override */
+      if (prefixGroups[2])
+        dbgprintf(insn, "Redundant Group 3 prefix");
+      prefixGroups[2] = true;
+      hasOpSize = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x67:  /* Address-size override */
+      if (prefixGroups[3])
+        dbgprintf(insn, "Redundant Group 4 prefix");
+      prefixGroups[3] = true;
+      hasAdSize = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    default:    /* Not a prefix byte */
+      isPrefix = false;
+      break;
+    }
+
+    if (isPrefix)
+      dbgprintf(insn, "Found prefix 0x%hhx", byte);
+  }
+
+  insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+  if (byte == 0x62) {
+    uint8_t byte1, byte2;
+
+    if (consumeByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of EVEX prefix");
+      return -1;
+    }
+
+    if (lookAtByte(insn, &byte2)) {
+      dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+      return -1;
+    }
+
+    if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+       ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+      insn->vectorExtensionType = TYPE_EVEX;
+    }
+    else {
+      unconsumeByte(insn); /* unconsume byte1 */
+      unconsumeByte(insn); /* unconsume byte  */
+      insn->necessaryPrefixLocation = insn->readerCursor - 2;
+    }
+
+    if (insn->vectorExtensionType == TYPE_EVEX) {
+      insn->vectorExtensionPrefix[0] = byte;
+      insn->vectorExtensionPrefix[1] = byte1;
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+        dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+        return -1;
+      }
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) {
+        dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix");
+        return -1;
+      }
+
+      /* We simulate the REX prefix for simplicity's sake */
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+              insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+              insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
+    }
+  }
+  else if (byte == 0xc4) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vectorExtensionType = TYPE_VEX_3B;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+    else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vectorExtensionType == TYPE_VEX_3B) {
+      insn->vectorExtensionPrefix[0] = byte;
+      consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+      consumeByte(insn, &insn->vectorExtensionPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+                insn->vectorExtensionPrefix[2]);
+    }
+  }
+  else if (byte == 0xc5) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vectorExtensionType = TYPE_VEX_2B;
+    }
+    else {
+      unconsumeByte(insn);
+    }
+
+    if (insn->vectorExtensionType == TYPE_VEX_2B) {
+      insn->vectorExtensionPrefix[0] = byte;
+      consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
+      }
+
+      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1]))
+      {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = true;
+        break;
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0],
+                insn->vectorExtensionPrefix[1]);
+    }
+  }
+  else if (byte == 0x8f) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of XOP");
+      return -1;
+    }
+
+    if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+      insn->vectorExtensionType = TYPE_XOP;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+    else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vectorExtensionType == TYPE_XOP) {
+      insn->vectorExtensionPrefix[0] = byte;
+      consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+      consumeByte(insn, &insn->vectorExtensionPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2]))
+      {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = true;
+        break;
+      }
+
+      dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+                insn->vectorExtensionPrefix[2]);
+    }
+  }
+  else {
+    if (insn->mode == MODE_64BIT) {
+      if ((byte & 0xf0) == 0x40) {
+        uint8_t opcodeByte;
+
+        if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+          dbgprintf(insn, "Redundant REX prefix");
+          return -1;
+        }
+
+        insn->rexPrefix = byte;
+        insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+        dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+      } else {
+        unconsumeByte(insn);
+        insn->necessaryPrefixLocation = insn->readerCursor - 1;
+      }
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+  }
+
+  if (insn->mode == MODE_16BIT) {
+    insn->registerSize       = (hasOpSize ? 4 : 2);
+    insn->addressSize        = (hasAdSize ? 4 : 2);
+    insn->displacementSize   = (hasAdSize ? 4 : 2);
+    insn->immediateSize      = (hasOpSize ? 4 : 2);
+  } else if (insn->mode == MODE_32BIT) {
+    insn->registerSize       = (hasOpSize ? 2 : 4);
+    insn->addressSize        = (hasAdSize ? 2 : 4);
+    insn->displacementSize   = (hasAdSize ? 2 : 4);
+    insn->immediateSize      = (hasOpSize ? 2 : 4);
+  } else if (insn->mode == MODE_64BIT) {
+    if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+      insn->registerSize       = 8;
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = 4;
+      insn->immediateSize      = 4;
+    } else if (insn->rexPrefix) {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    } else {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ *   extended or escape opcodes).
+ *
+ * @param insn  - The instruction whose opcode is to be read.
+ * @return      - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vectorExtensionType == TYPE_EVEX)
+  {
+    switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+                mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+  else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+    switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+  else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  }
+  else if (insn->vectorExtensionType == TYPE_XOP) {
+    switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *                        instruction.
+ * @param insn          - The instruction whose ID is to be determined.
+ * @param attrMask      - The attribute mask to search.
+ * @return              - 0 if the ModR/M could be read when needed or was not
+ *                        needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+                             struct InternalInstruction* insn,
+                             uint16_t attrMask) {
+  bool hasModRMExtension;
+
+  InstructionContext instructionClass = contextForAttrs(attrMask);
+
+  hasModRMExtension = modRMRequired(insn->opcodeType,
+                                    instructionClass,
+                                    insn->opcode);
+
+  if (hasModRMExtension) {
+    if (readModRM(insn))
+      return -1;
+
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            insn->modRM);
+  } else {
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            0);
+  }
+
+  return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig  - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static bool is16BitEquivalent(const char* orig, const char* equiv) {
+  off_t i;
+
+  for (i = 0;; i++) {
+    if (orig[i] == '\0' && equiv[i] == '\0')
+      return true;
+    if (orig[i] == '\0' || equiv[i] == '\0')
+      return false;
+    if (orig[i] != equiv[i]) {
+      if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+        continue;
+      if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+        continue;
+      if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+        continue;
+      return false;
+    }
+  }
+}
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ *   appropriate for extended and escape opcodes.  Determines the attributes and
+ *   context for the instruction before doing so.
+ *
+ * @param insn  - The instruction whose ID is to be determined.
+ * @return      - 0 if the ModR/M could be read when needed or was not needed;
+ *                nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
+  uint16_t attrMask;
+  uint16_t instructionID;
+
+  dbgprintf(insn, "getID()");
+
+  attrMask = ATTR_NONE;
+
+  if (insn->mode == MODE_64BIT)
+    attrMask |= ATTR_64BIT;
+
+  if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
+
+    if (insn->vectorExtensionType == TYPE_EVEX) {
+      switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXKZ;
+      if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXB;
+      if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXK;
+      if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXL;
+      if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXL2;
+    }
+    else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+      switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    }
+    else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
+        attrMask |= ATTR_VEXL;
+    }
+    else if (insn->vectorExtensionType == TYPE_XOP) {
+      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    }
+    else {
+      return -1;
+    }
+  }
+  else {
+    if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_OPSIZE;
+    else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_ADSIZE;
+    else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_XS;
+    else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_XD;
+  }
+
+  if (insn->rexPrefix & 0x08)
+    attrMask |= ATTR_REXW;
+
+  if (getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
+
+  /*
+   * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+   * of the AdSize prefix is inverted w.r.t. 32-bit mode.
+   */
+  if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) {
+    const struct InstructionSpecifier *spec;
+    spec = specifierForUID(instructionID);
+
+    /*
+     * Check for Ii8PCRel instructions. We could alternatively do a
+     * string-compare on the names, but this is probably cheaper.
+     */
+    if (x86OperandSets[spec->operands][0].type == TYPE_REL8) {
+      attrMask ^= ATTR_ADSIZE;
+      if (getIDWithAttrMask(&instructionID, insn, attrMask))
+        return -1;
+    }
+  }
+
+  /* The following clauses compensate for limitations of the tables. */
+
+  if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+      !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * The instruction tables make no distinction between instructions that
+     * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+     * particular spot (i.e., many MMX operations).  In general we're
+     * conservative, but in the specific case where OpSize is present but not
+     * in the right place we check if there's a 16-bit operation.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithOpsize;
+    const char *specName, *specWithOpSizeName;
+
+    spec = specifierForUID(instructionID);
+
+    if (getIDWithAttrMask(&instructionIDWithOpsize,
+                          insn,
+                          attrMask | ATTR_OPSIZE)) {
+      /*
+       * ModRM required with OpSize but not present; give up and return version
+       * without OpSize set
+       */
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specName = GetInstrName(instructionID, miiArg);
+    specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
+
+    if (is16BitEquivalent(specName, specWithOpSizeName) &&
+        (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
+      insn->instructionID = instructionIDWithOpsize;
+      insn->spec = specifierForUID(instructionIDWithOpsize);
+    } else {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+    }
+    return 0;
+  }
+
+  if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+      insn->rexPrefix & 0x01) {
+    /*
+     * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+     * it should decode as XCHG %r8, %eax.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithNewOpcode;
+    const struct InstructionSpecifier *specWithNewOpcode;
+
+    spec = specifierForUID(instructionID);
+
+    /* Borrow opcode from one of the other XCHGar opcodes */
+    insn->opcode = 0x91;
+
+    if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+                          insn,
+                          attrMask)) {
+      insn->opcode = 0x90;
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+    /* Change back */
+    insn->opcode = 0x90;
+
+    insn->instructionID = instructionIDWithNewOpcode;
+    insn->spec = specWithNewOpcode;
+
+    return 0;
+  }
+
+  insn->instructionID = instructionID;
+  insn->spec = specifierForUID(insn->instructionID);
+
+  return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ *   instruction.
+ *
+ * @param insn  - The instruction whose SIB byte is to be read.
+ * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+  SIBIndex sibIndexBase = SIB_INDEX_NONE;
+  SIBBase sibBaseBase = SIB_BASE_NONE;
+  uint8_t index, base;
+
+  dbgprintf(insn, "readSIB()");
+
+  if (insn->consumedSIB)
+    return 0;
+
+  insn->consumedSIB = true;
+
+  switch (insn->addressSize) {
+  case 2:
+    dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+    return -1;
+  case 4:
+    sibIndexBase = SIB_INDEX_EAX;
+    sibBaseBase = SIB_BASE_EAX;
+    break;
+  case 8:
+    sibIndexBase = SIB_INDEX_RAX;
+    sibBaseBase = SIB_BASE_RAX;
+    break;
+  }
+
+  if (consumeByte(insn, &insn->sib))
+    return -1;
+
+  index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+  if (insn->vectorExtensionType == TYPE_EVEX)
+    index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4;
+
+  switch (index) {
+  case 0x4:
+    insn->sibIndex = SIB_INDEX_NONE;
+    break;
+  default:
+    insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+    if (insn->sibIndex == SIB_INDEX_sib ||
+        insn->sibIndex == SIB_INDEX_sib64)
+      insn->sibIndex = SIB_INDEX_NONE;
+    break;
+  }
+
+  switch (scaleFromSIB(insn->sib)) {
+  case 0:
+    insn->sibScale = 1;
+    break;
+  case 1:
+    insn->sibScale = 2;
+    break;
+  case 2:
+    insn->sibScale = 4;
+    break;
+  case 3:
+    insn->sibScale = 8;
+    break;
+  }
+
+  base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+  switch (base) {
+  case 0x5:
+  case 0xd:
+    switch (modFromModRM(insn->modRM)) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = SIB_BASE_NONE;
+      break;
+    case 0x1:
+      insn->eaDisplacement = EA_DISP_8;
+      insn->sibBase = (SIBBase)(sibBaseBase + base);
+      break;
+    case 0x2:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = (SIBBase)(sibBaseBase + base);
+      break;
+    case 0x3:
+      debug("Cannot have Mod = 0b11 and a SIB byte");
+      return -1;
+    }
+    break;
+  default:
+    insn->sibBase = (SIBBase)(sibBaseBase + base);
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn  - The instruction whose displacement is to be read.
+ * @return      - 0 if the displacement byte was successfully read; nonzero
+ *                otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+  int8_t d8;
+  int16_t d16;
+  int32_t d32;
+
+  dbgprintf(insn, "readDisplacement()");
+
+  if (insn->consumedDisplacement)
+    return 0;
+
+  insn->consumedDisplacement = true;
+  insn->displacementOffset = insn->readerCursor - insn->startLocation;
+
+  switch (insn->eaDisplacement) {
+  case EA_DISP_NONE:
+    insn->consumedDisplacement = false;
+    break;
+  case EA_DISP_8:
+    if (consumeInt8(insn, &d8))
+      return -1;
+    insn->displacement = d8;
+    break;
+  case EA_DISP_16:
+    if (consumeInt16(insn, &d16))
+      return -1;
+    insn->displacement = d16;
+    break;
+  case EA_DISP_32:
+    if (consumeInt32(insn, &d32))
+      return -1;
+    insn->displacement = d32;
+    break;
+  }
+
+  insn->consumedDisplacement = true;
+  return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ *   displacement) for an instruction and interprets it.
+ *
+ * @param insn  - The instruction whose addressing information is to be read.
+ * @return      - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+  uint8_t mod, rm, reg;
+
+  dbgprintf(insn, "readModRM()");
+
+  if (insn->consumedModRM)
+    return 0;
+
+  if (consumeByte(insn, &insn->modRM))
+    return -1;
+  insn->consumedModRM = true;
+
+  mod     = modFromModRM(insn->modRM);
+  rm      = rmFromModRM(insn->modRM);
+  reg     = regFromModRM(insn->modRM);
+
+  /*
+   * This goes by insn->registerSize to pick the correct register, which messes
+   * up if we're using (say) XMM or 8-bit register operands.  That gets fixed in
+   * fixupReg().
+   */
+  switch (insn->registerSize) {
+  case 2:
+    insn->regBase = MODRM_REG_AX;
+    insn->eaRegBase = EA_REG_AX;
+    break;
+  case 4:
+    insn->regBase = MODRM_REG_EAX;
+    insn->eaRegBase = EA_REG_EAX;
+    break;
+  case 8:
+    insn->regBase = MODRM_REG_RAX;
+    insn->eaRegBase = EA_REG_RAX;
+    break;
+  }
+
+  reg |= rFromREX(insn->rexPrefix) << 3;
+  rm  |= bFromREX(insn->rexPrefix) << 3;
+  if (insn->vectorExtensionType == TYPE_EVEX) {
+    reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+    rm  |=  xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+  }
+
+  insn->reg = (Reg)(insn->regBase + reg);
+
+  switch (insn->addressSize) {
+  case 2:
+    insn->eaBaseBase = EA_BASE_BX_SI;
+
+    switch (mod) {
+    case 0x0:
+      if (rm == 0x6) {
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_16;
+        if (readDisplacement(insn))
+          return -1;
+      } else {
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        insn->eaDisplacement = EA_DISP_NONE;
+      }
+      break;
+    case 0x1:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_8;
+      insn->displacementSize = 1;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x2:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_16;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x3:
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    }
+    break;
+  case 4:
+  case 8:
+    insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+    switch (mod) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+      switch (rm) {
+      case 0x14:
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = (insn->addressSize == 4 ?
+                        EA_BASE_sib : EA_BASE_sib64);
+        if (readSIB(insn) || readDisplacement(insn))
+          return -1;
+        break;
+      case 0x5:
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_32;
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        break;
+      }
+      break;
+    case 0x1:
+      insn->displacementSize = 1;
+      /* FALLTHROUGH */
+    case 0x2:
+      insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+      switch (rm) {
+      case 0x14:
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = EA_BASE_sib;
+        if (readSIB(insn) || readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      }
+      break;
+    case 0x3:
+      insn->eaDisplacement = EA_DISP_NONE;
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      break;
+    }
+    break;
+  } /* switch (insn->addressSize) */
+
+  return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix)            \
+  static uint8_t name(struct InternalInstruction *insn,   \
+                      OperandType type,                   \
+                      uint8_t index,                      \
+                      uint8_t *valid) {                   \
+    *valid = 1;                                           \
+    switch (type) {                                       \
+    default:                                              \
+      debug("Unhandled register type");                   \
+      *valid = 0;                                         \
+      return 0;                                           \
+    case TYPE_Rv:                                         \
+      return base + index;                                \
+    case TYPE_R8:                                         \
+      if (insn->rexPrefix &&                              \
+         index >= 4 && index <= 7) {                      \
+        return prefix##_SPL + (index - 4);                \
+      } else {                                            \
+        return prefix##_AL + index;                       \
+      }                                                   \
+    case TYPE_R16:                                        \
+      return prefix##_AX + index;                         \
+    case TYPE_R32:                                        \
+      return prefix##_EAX + index;                        \
+    case TYPE_R64:                                        \
+      return prefix##_RAX + index;                        \
+    case TYPE_XMM512:                                     \
+      return prefix##_ZMM0 + index;                       \
+    case TYPE_XMM256:                                     \
+      return prefix##_YMM0 + index;                       \
+    case TYPE_XMM128:                                     \
+    case TYPE_XMM64:                                      \
+    case TYPE_XMM32:                                      \
+    case TYPE_XMM:                                        \
+      return prefix##_XMM0 + index;                       \
+    case TYPE_VK1:                                        \
+    case TYPE_VK8:                                        \
+    case TYPE_VK16:                                       \
+      return prefix##_K0 + index;                         \
+    case TYPE_MM64:                                       \
+    case TYPE_MM32:                                       \
+    case TYPE_MM:                                         \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_MM0 + index;                        \
+    case TYPE_SEGMENTREG:                                 \
+      if (index > 5)                                      \
+        *valid = 0;                                       \
+      return prefix##_ES + index;                         \
+    case TYPE_DEBUGREG:                                   \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_DR0 + index;                        \
+    case TYPE_CONTROLREG:                                 \
+      if (index > 8)                                      \
+        *valid = 0;                                       \
+      return prefix##_CR0 + index;                        \
+    }                                                     \
+  }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ *   reg or R/M field.  If the operand is an XMM operand, for example, an
+ *   operand would be XMM0 instead of AX, which readModRM() would otherwise
+ *   misinterpret it as.
+ *
+ * @param insn  - The instruction containing the operand.
+ * @param type  - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t.  The target is set to 1 if the
+ *                field is valid for the register class; 0 if not.
+ * @return      - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase,    MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue,  insn->eaRegBase,  EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ *   fixup*Value functions to use in correcting readModRM()'ss interpretation.
+ *
+ * @param insn  - See fixup*Value().
+ * @param op    - The operand specifier.
+ * @return      - 0 if fixup was successful; -1 if the register returned was
+ *                invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+                    const struct OperandSpecifier *op) {
+  uint8_t valid;
+
+  dbgprintf(insn, "fixupReg()");
+
+  switch ((OperandEncoding)op->encoding) {
+  default:
+    debug("Expected a REG or R/M encoding in fixupReg");
+    return -1;
+  case ENCODING_VVVV:
+    insn->vvvv = (Reg)fixupRegValue(insn,
+                                    (OperandType)op->type,
+                                    insn->vvvv,
+                                    &valid);
+    if (!valid)
+      return -1;
+    break;
+  case ENCODING_REG:
+    insn->reg = (Reg)fixupRegValue(insn,
+                                   (OperandType)op->type,
+                                   insn->reg - insn->regBase,
+                                   &valid);
+    if (!valid)
+      return -1;
+    break;
+  CASE_ENCODING_RM:
+    if (insn->eaBase >= insn->eaRegBase) {
+      insn->eaBase = (EABase)fixupRMValue(insn,
+                                          (OperandType)op->type,
+                                          insn->eaBase - insn->eaRegBase,
+                                          &valid);
+      if (!valid)
+        return -1;
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ *   instruction and interprets it appropriately given the operand width.
+ *   Handles AddRegFrm instructions.
+ *
+ * @param insn  - the instruction whose opcode field is to be read.
+ * @param size  - The width (in bytes) of the register being specified.
+ *                1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ *                RAX.
+ * @return      - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+  dbgprintf(insn, "readOpcodeRegister()");
+
+  if (size == 0)
+    size = insn->registerSize;
+
+  switch (size) {
+  case 1:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+                                                  | (insn->opcode & 7)));
+    if (insn->rexPrefix &&
+        insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+        insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+      insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+                                   + (insn->opcodeRegister - MODRM_REG_AL - 4));
+    }
+
+    break;
+  case 2:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | (insn->opcode & 7)));
+    break;
+  case 4:
+    insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | (insn->opcode & 7)));
+    break;
+  case 8:
+    insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | (insn->opcode & 7)));
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ *   desired operand size.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @param size  - The width (in bytes) of the operand.
+ * @return      - 0 if the immediate was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+  uint8_t imm8;
+  uint16_t imm16;
+  uint32_t imm32;
+  uint64_t imm64;
+
+  dbgprintf(insn, "readImmediate()");
+
+  if (insn->numImmediatesConsumed == 2) {
+    debug("Already consumed two immediates");
+    return -1;
+  }
+
+  if (size == 0)
+    size = insn->immediateSize;
+  else
+    insn->immediateSize = size;
+  insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+  switch (size) {
+  case 1:
+    if (consumeByte(insn, &imm8))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm8;
+    break;
+  case 2:
+    if (consumeUInt16(insn, &imm16))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm16;
+    break;
+  case 4:
+    if (consumeUInt32(insn, &imm32))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm32;
+    break;
+  case 8:
+    if (consumeUInt64(insn, &imm64))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm64;
+    break;
+  }
+
+  insn->numImmediatesConsumed++;
+
+  return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @return      - 0 if the vvvv was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readVVVV(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readVVVV()");
+
+  int vvvv;
+  if (insn->vectorExtensionType == TYPE_EVEX)
+    vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+            vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
+  else if (insn->vectorExtensionType == TYPE_VEX_3B)
+    vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+  else if (insn->vectorExtensionType == TYPE_VEX_2B)
+    vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+  else if (insn->vectorExtensionType == TYPE_XOP)
+    vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+  else
+    return -1;
+
+  if (insn->mode != MODE_64BIT)
+    vvvv &= 0x7;
+
+  insn->vvvv = static_cast<Reg>(vvvv);
+  return 0;
+}
+
+/*
+ * readMaskRegister - Reads an mask register from the opcode field of an
+ *   instruction.
+ *
+ * @param insn    - The instruction whose opcode field is to be read.
+ * @return        - 0 on success; nonzero otherwise.
+ */
+static int readMaskRegister(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readMaskRegister()");
+
+  if (insn->vectorExtensionType != TYPE_EVEX)
+    return -1;
+
+  insn->writemask =
+      static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
+  return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ *   operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn  - The instruction whose operands are to be read and interpreted.
+ * @return      - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+  int hasVVVV, needVVVV;
+  int sawRegImm = 0;
+
+  dbgprintf(insn, "readOperands()");
+
+  /* If non-zero vvvv specified, need to make sure one of the operands
+     uses it. */
+  hasVVVV = !readVVVV(insn);
+  needVVVV = hasVVVV && (insn->vvvv != 0);
+
+  for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+    switch (Op.encoding) {
+    case ENCODING_NONE:
+    case ENCODING_SI:
+    case ENCODING_DI:
+      break;
+    case ENCODING_REG:
+    CASE_ENCODING_RM:
+      if (readModRM(insn))
+        return -1;
+      if (fixupReg(insn, &Op))
+        return -1;
+      // Apply the AVX512 compressed displacement scaling factor.
+      if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+        insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
+      break;
+    case ENCODING_CB:
+    case ENCODING_CW:
+    case ENCODING_CD:
+    case ENCODING_CP:
+    case ENCODING_CO:
+    case ENCODING_CT:
+      dbgprintf(insn, "We currently don't hande code-offset encodings");
+      return -1;
+    case ENCODING_IB:
+      if (sawRegImm) {
+        /* Saw a register immediate so don't read again and instead split the
+           previous immediate.  FIXME: This is a hack. */
+        insn->immediates[insn->numImmediatesConsumed] =
+          insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+        ++insn->numImmediatesConsumed;
+        break;
+      }
+      if (readImmediate(insn, 1))
+        return -1;
+      if (Op.type == TYPE_IMM3 &&
+          insn->immediates[insn->numImmediatesConsumed - 1] > 7)
+        return -1;
+      if (Op.type == TYPE_IMM5 &&
+          insn->immediates[insn->numImmediatesConsumed - 1] > 31)
+        return -1;
+      if (Op.type == TYPE_XMM128 ||
+          Op.type == TYPE_XMM256)
+        sawRegImm = 1;
+      break;
+    case ENCODING_IW:
+      if (readImmediate(insn, 2))
+        return -1;
+      break;
+    case ENCODING_ID:
+      if (readImmediate(insn, 4))
+        return -1;
+      break;
+    case ENCODING_IO:
+      if (readImmediate(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Iv:
+      if (readImmediate(insn, insn->immediateSize))
+        return -1;
+      break;
+    case ENCODING_Ia:
+      if (readImmediate(insn, insn->addressSize))
+        return -1;
+      break;
+    case ENCODING_RB:
+      if (readOpcodeRegister(insn, 1))
+        return -1;
+      break;
+    case ENCODING_RW:
+      if (readOpcodeRegister(insn, 2))
+        return -1;
+      break;
+    case ENCODING_RD:
+      if (readOpcodeRegister(insn, 4))
+        return -1;
+      break;
+    case ENCODING_RO:
+      if (readOpcodeRegister(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Rv:
+      if (readOpcodeRegister(insn, 0))
+        return -1;
+      break;
+    case ENCODING_FP:
+      break;
+    case ENCODING_VVVV:
+      needVVVV = 0; /* Mark that we have found a VVVV operand. */
+      if (!hasVVVV)
+        return -1;
+      if (fixupReg(insn, &Op))
+        return -1;
+      break;
+    case ENCODING_WRITEMASK:
+      if (readMaskRegister(insn))
+        return -1;
+      break;
+    case ENCODING_DUP:
+      break;
+    default:
+      dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+      return -1;
+    }
+  }
+
+  /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+  if (needVVVV) return -1;
+
+  return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ *   user.
+ *
+ * @param insn      - A pointer to the instruction to be populated.  Must be
+ *                    pre-allocated.
+ * @param reader    - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ *                    any internal state.
+ * @param logger    - If non-NULL, the function to be used to write log messages
+ *                    and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ *                    any internal state.
+ * @param startLoc  - The address (in the reader's address space) of the first
+ *                    byte in the instruction.
+ * @param mode      - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ *                    decode the instruction in.
+ * @return          - 0 if the instruction's memory could be read; nonzero if
+ *                    not.
+ */
+int llvm::X86Disassembler::decodeInstruction(
+    struct InternalInstruction *insn, byteReader_t reader,
+    const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+    uint64_t startLoc, DisassemblerMode mode) {
+  memset(insn, 0, sizeof(struct InternalInstruction));
+
+  insn->reader = reader;
+  insn->readerArg = readerArg;
+  insn->dlog = logger;
+  insn->dlogArg = loggerArg;
+  insn->startLocation = startLoc;
+  insn->readerCursor = startLoc;
+  insn->mode = mode;
+  insn->numImmediatesConsumed = 0;
+
+  if (readPrefixes(insn)       ||
+      readOpcode(insn)         ||
+      getID(insn, miiArg)      ||
+      insn->instructionID == 0 ||
+      readOperands(insn))
+    return -1;
+
+  insn->operands = x86OperandSets[insn->spec->operands];
+
+  insn->length = insn->readerCursor - insn->startLocation;
+
+  dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+            startLoc, insn->readerCursor, insn->length);
+
+  if (insn->length > 15)
+    dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 0000000..8c45402
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,662 @@
+//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLERDECODER_H
+#define X86DISASSEMBLERDECODER_H
+
+#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+// Accessor functions for various fields of an Intel instruction
+#define modFromModRM(modRM)  (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM)  (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM)   ((modRM) & 0x7)
+#define scaleFromSIB(sib)    (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib)    (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib)     ((sib) & 0x7)
+#define wFromREX(rex)        (((rex) & 0x8) >> 3)
+#define rFromREX(rex)        (((rex) & 0x4) >> 2)
+#define xFromREX(rex)        (((rex) & 0x2) >> 1)
+#define bFromREX(rex)        ((rex) & 0x1)
+
+#define rFromEVEX2of4(evex)     (((~(evex)) & 0x80) >> 7)
+#define xFromEVEX2of4(evex)     (((~(evex)) & 0x40) >> 6)
+#define bFromEVEX2of4(evex)     (((~(evex)) & 0x20) >> 5)
+#define r2FromEVEX2of4(evex)    (((~(evex)) & 0x10) >> 4)
+#define mmFromEVEX2of4(evex)    ((evex) & 0x3)
+#define wFromEVEX3of4(evex)     (((evex) & 0x80) >> 7)
+#define vvvvFromEVEX3of4(evex)  (((~(evex)) & 0x78) >> 3)
+#define ppFromEVEX3of4(evex)    ((evex) & 0x3)
+#define zFromEVEX4of4(evex)     (((evex) & 0x80) >> 7)
+#define l2FromEVEX4of4(evex)    (((evex) & 0x40) >> 6)
+#define lFromEVEX4of4(evex)     (((evex) & 0x20) >> 5)
+#define bFromEVEX4of4(evex)     (((evex) & 0x10) >> 4)
+#define v2FromEVEX4of4(evex)    (((~evex) & 0x8) >> 3)
+#define aaaFromEVEX4of4(evex)   ((evex) & 0x7)
+
+#define rFromVEX2of3(vex)       (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex)       (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex)       (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex)   ((vex) & 0x1f)
+#define wFromVEX3of3(vex)       (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex)       (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex)      ((vex) & 0x3)
+
+#define rFromVEX2of2(vex)       (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex)       (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex)      ((vex) & 0x3)
+
+#define rFromXOP2of3(xop)       (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop)       (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop)       (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop)   ((xop) & 0x1f)
+#define wFromXOP3of3(xop)       (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop)       (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop)      ((xop) & 0x3)
+
+// These enums represent Intel registers for use by the decoder.
+#define REGS_8BIT     \
+  ENTRY(AL)           \
+  ENTRY(CL)           \
+  ENTRY(DL)           \
+  ENTRY(BL)           \
+  ENTRY(AH)           \
+  ENTRY(CH)           \
+  ENTRY(DH)           \
+  ENTRY(BH)           \
+  ENTRY(R8B)          \
+  ENTRY(R9B)          \
+  ENTRY(R10B)         \
+  ENTRY(R11B)         \
+  ENTRY(R12B)         \
+  ENTRY(R13B)         \
+  ENTRY(R14B)         \
+  ENTRY(R15B)         \
+  ENTRY(SPL)          \
+  ENTRY(BPL)          \
+  ENTRY(SIL)          \
+  ENTRY(DIL)
+
+#define EA_BASES_16BIT  \
+  ENTRY(BX_SI)          \
+  ENTRY(BX_DI)          \
+  ENTRY(BP_SI)          \
+  ENTRY(BP_DI)          \
+  ENTRY(SI)             \
+  ENTRY(DI)             \
+  ENTRY(BP)             \
+  ENTRY(BX)             \
+  ENTRY(R8W)            \
+  ENTRY(R9W)            \
+  ENTRY(R10W)           \
+  ENTRY(R11W)           \
+  ENTRY(R12W)           \
+  ENTRY(R13W)           \
+  ENTRY(R14W)           \
+  ENTRY(R15W)
+
+#define REGS_16BIT    \
+  ENTRY(AX)           \
+  ENTRY(CX)           \
+  ENTRY(DX)           \
+  ENTRY(BX)           \
+  ENTRY(SP)           \
+  ENTRY(BP)           \
+  ENTRY(SI)           \
+  ENTRY(DI)           \
+  ENTRY(R8W)          \
+  ENTRY(R9W)          \
+  ENTRY(R10W)         \
+  ENTRY(R11W)         \
+  ENTRY(R12W)         \
+  ENTRY(R13W)         \
+  ENTRY(R14W)         \
+  ENTRY(R15W)
+
+#define EA_BASES_32BIT  \
+  ENTRY(EAX)            \
+  ENTRY(ECX)            \
+  ENTRY(EDX)            \
+  ENTRY(EBX)            \
+  ENTRY(sib)            \
+  ENTRY(EBP)            \
+  ENTRY(ESI)            \
+  ENTRY(EDI)            \
+  ENTRY(R8D)            \
+  ENTRY(R9D)            \
+  ENTRY(R10D)           \
+  ENTRY(R11D)           \
+  ENTRY(R12D)           \
+  ENTRY(R13D)           \
+  ENTRY(R14D)           \
+  ENTRY(R15D)
+
+#define REGS_32BIT  \
+  ENTRY(EAX)        \
+  ENTRY(ECX)        \
+  ENTRY(EDX)        \
+  ENTRY(EBX)        \
+  ENTRY(ESP)        \
+  ENTRY(EBP)        \
+  ENTRY(ESI)        \
+  ENTRY(EDI)        \
+  ENTRY(R8D)        \
+  ENTRY(R9D)        \
+  ENTRY(R10D)       \
+  ENTRY(R11D)       \
+  ENTRY(R12D)       \
+  ENTRY(R13D)       \
+  ENTRY(R14D)       \
+  ENTRY(R15D)
+
+#define EA_BASES_64BIT  \
+  ENTRY(RAX)            \
+  ENTRY(RCX)            \
+  ENTRY(RDX)            \
+  ENTRY(RBX)            \
+  ENTRY(sib64)          \
+  ENTRY(RBP)            \
+  ENTRY(RSI)            \
+  ENTRY(RDI)            \
+  ENTRY(R8)             \
+  ENTRY(R9)             \
+  ENTRY(R10)            \
+  ENTRY(R11)            \
+  ENTRY(R12)            \
+  ENTRY(R13)            \
+  ENTRY(R14)            \
+  ENTRY(R15)
+
+#define REGS_64BIT  \
+  ENTRY(RAX)        \
+  ENTRY(RCX)        \
+  ENTRY(RDX)        \
+  ENTRY(RBX)        \
+  ENTRY(RSP)        \
+  ENTRY(RBP)        \
+  ENTRY(RSI)        \
+  ENTRY(RDI)        \
+  ENTRY(R8)         \
+  ENTRY(R9)         \
+  ENTRY(R10)        \
+  ENTRY(R11)        \
+  ENTRY(R12)        \
+  ENTRY(R13)        \
+  ENTRY(R14)        \
+  ENTRY(R15)
+
+#define REGS_MMX  \
+  ENTRY(MM0)      \
+  ENTRY(MM1)      \
+  ENTRY(MM2)      \
+  ENTRY(MM3)      \
+  ENTRY(MM4)      \
+  ENTRY(MM5)      \
+  ENTRY(MM6)      \
+  ENTRY(MM7)
+
+#define REGS_XMM  \
+  ENTRY(XMM0)     \
+  ENTRY(XMM1)     \
+  ENTRY(XMM2)     \
+  ENTRY(XMM3)     \
+  ENTRY(XMM4)     \
+  ENTRY(XMM5)     \
+  ENTRY(XMM6)     \
+  ENTRY(XMM7)     \
+  ENTRY(XMM8)     \
+  ENTRY(XMM9)     \
+  ENTRY(XMM10)    \
+  ENTRY(XMM11)    \
+  ENTRY(XMM12)    \
+  ENTRY(XMM13)    \
+  ENTRY(XMM14)    \
+  ENTRY(XMM15)    \
+  ENTRY(XMM16)    \
+  ENTRY(XMM17)    \
+  ENTRY(XMM18)    \
+  ENTRY(XMM19)    \
+  ENTRY(XMM20)    \
+  ENTRY(XMM21)    \
+  ENTRY(XMM22)    \
+  ENTRY(XMM23)    \
+  ENTRY(XMM24)    \
+  ENTRY(XMM25)    \
+  ENTRY(XMM26)    \
+  ENTRY(XMM27)    \
+  ENTRY(XMM28)    \
+  ENTRY(XMM29)    \
+  ENTRY(XMM30)    \
+  ENTRY(XMM31)
+
+#define REGS_YMM  \
+  ENTRY(YMM0)     \
+  ENTRY(YMM1)     \
+  ENTRY(YMM2)     \
+  ENTRY(YMM3)     \
+  ENTRY(YMM4)     \
+  ENTRY(YMM5)     \
+  ENTRY(YMM6)     \
+  ENTRY(YMM7)     \
+  ENTRY(YMM8)     \
+  ENTRY(YMM9)     \
+  ENTRY(YMM10)    \
+  ENTRY(YMM11)    \
+  ENTRY(YMM12)    \
+  ENTRY(YMM13)    \
+  ENTRY(YMM14)    \
+  ENTRY(YMM15)    \
+  ENTRY(YMM16)    \
+  ENTRY(YMM17)    \
+  ENTRY(YMM18)    \
+  ENTRY(YMM19)    \
+  ENTRY(YMM20)    \
+  ENTRY(YMM21)    \
+  ENTRY(YMM22)    \
+  ENTRY(YMM23)    \
+  ENTRY(YMM24)    \
+  ENTRY(YMM25)    \
+  ENTRY(YMM26)    \
+  ENTRY(YMM27)    \
+  ENTRY(YMM28)    \
+  ENTRY(YMM29)    \
+  ENTRY(YMM30)    \
+  ENTRY(YMM31)
+
+#define REGS_ZMM  \
+  ENTRY(ZMM0)     \
+  ENTRY(ZMM1)     \
+  ENTRY(ZMM2)     \
+  ENTRY(ZMM3)     \
+  ENTRY(ZMM4)     \
+  ENTRY(ZMM5)     \
+  ENTRY(ZMM6)     \
+  ENTRY(ZMM7)     \
+  ENTRY(ZMM8)     \
+  ENTRY(ZMM9)     \
+  ENTRY(ZMM10)    \
+  ENTRY(ZMM11)    \
+  ENTRY(ZMM12)    \
+  ENTRY(ZMM13)    \
+  ENTRY(ZMM14)    \
+  ENTRY(ZMM15)    \
+  ENTRY(ZMM16)    \
+  ENTRY(ZMM17)    \
+  ENTRY(ZMM18)    \
+  ENTRY(ZMM19)    \
+  ENTRY(ZMM20)    \
+  ENTRY(ZMM21)    \
+  ENTRY(ZMM22)    \
+  ENTRY(ZMM23)    \
+  ENTRY(ZMM24)    \
+  ENTRY(ZMM25)    \
+  ENTRY(ZMM26)    \
+  ENTRY(ZMM27)    \
+  ENTRY(ZMM28)    \
+  ENTRY(ZMM29)    \
+  ENTRY(ZMM30)    \
+  ENTRY(ZMM31)
+
+#define REGS_MASKS \
+  ENTRY(K0)        \
+  ENTRY(K1)        \
+  ENTRY(K2)        \
+  ENTRY(K3)        \
+  ENTRY(K4)        \
+  ENTRY(K5)        \
+  ENTRY(K6)        \
+  ENTRY(K7)
+
+#define REGS_SEGMENT \
+  ENTRY(ES)          \
+  ENTRY(CS)          \
+  ENTRY(SS)          \
+  ENTRY(DS)          \
+  ENTRY(FS)          \
+  ENTRY(GS)
+
+#define REGS_DEBUG  \
+  ENTRY(DR0)        \
+  ENTRY(DR1)        \
+  ENTRY(DR2)        \
+  ENTRY(DR3)        \
+  ENTRY(DR4)        \
+  ENTRY(DR5)        \
+  ENTRY(DR6)        \
+  ENTRY(DR7)
+
+#define REGS_CONTROL  \
+  ENTRY(CR0)          \
+  ENTRY(CR1)          \
+  ENTRY(CR2)          \
+  ENTRY(CR3)          \
+  ENTRY(CR4)          \
+  ENTRY(CR5)          \
+  ENTRY(CR6)          \
+  ENTRY(CR7)          \
+  ENTRY(CR8)
+
+#define ALL_EA_BASES  \
+  EA_BASES_16BIT      \
+  EA_BASES_32BIT      \
+  EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+  REGS_32BIT          \
+  REGS_64BIT
+
+#define ALL_REGS      \
+  REGS_8BIT           \
+  REGS_16BIT          \
+  REGS_32BIT          \
+  REGS_64BIT          \
+  REGS_MMX            \
+  REGS_XMM            \
+  REGS_YMM            \
+  REGS_ZMM            \
+  REGS_MASKS          \
+  REGS_SEGMENT        \
+  REGS_DEBUG          \
+  REGS_CONTROL        \
+  ENTRY(RIP)
+
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
+  EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+  ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  EA_max
+};
+
+/// \brief All possible values of the SIB index field.
+/// borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: index can be XMM or YMM.
+enum SIBIndex {
+  SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+  ALL_EA_BASES
+  REGS_XMM
+  REGS_YMM
+  REGS_ZMM
+#undef ENTRY
+  SIB_INDEX_max
+};
+
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
+  SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+  ALL_SIB_BASES
+#undef ENTRY
+  SIB_BASE_max
+};
+
+/// \brief Possible displacement types for effective-address computations.
+typedef enum {
+  EA_DISP_NONE,
+  EA_DISP_8,
+  EA_DISP_16,
+  EA_DISP_32
+} EADisplacement;
+
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
+#define ENTRY(x) MODRM_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  MODRM_REG_max
+};
+
+/// \brief All possible segment overrides.
+enum SegmentOverride {
+  SEG_OVERRIDE_NONE,
+  SEG_OVERRIDE_CS,
+  SEG_OVERRIDE_SS,
+  SEG_OVERRIDE_DS,
+  SEG_OVERRIDE_ES,
+  SEG_OVERRIDE_FS,
+  SEG_OVERRIDE_GS,
+  SEG_OVERRIDE_max
+};
+
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
+  VEX_LOB_0F = 0x1,
+  VEX_LOB_0F38 = 0x2,
+  VEX_LOB_0F3A = 0x3
+};
+
+enum XOPMapSelect {
+  XOP_MAP_SELECT_8 = 0x8,
+  XOP_MAP_SELECT_9 = 0x9,
+  XOP_MAP_SELECT_A = 0xA
+};
+
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
+  VEX_PREFIX_NONE = 0x0,
+  VEX_PREFIX_66 = 0x1,
+  VEX_PREFIX_F3 = 0x2,
+  VEX_PREFIX_F2 = 0x3
+};
+
+enum VectorExtensionType {
+  TYPE_NO_VEX_XOP   = 0x0,
+  TYPE_VEX_2B       = 0x1,
+  TYPE_VEX_3B       = 0x2,
+  TYPE_EVEX         = 0x3,
+  TYPE_XOP          = 0x4
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg     A baton that the consumer can associate with any internal
+///                state that it needs.
+/// \param byte    A pointer to a single byte in memory that should be set to
+///                contain the value at address.
+/// \param address The address in the instruction's address space that should
+///                be read from.
+/// \return        -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+///            state that it needs.
+/// \param log A string that contains the message.  Will be reused after
+///            the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+  uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
+struct InternalInstruction {
+  // Reader interface (C)
+  byteReader_t reader;
+  // Opaque value passed to the reader
+  const void* readerArg;
+  // The address of the next byte to read via the reader
+  uint64_t readerCursor;
+
+  // Logger interface (C)
+  dlog_t dlog;
+  // Opaque value passed to the logger
+  void* dlogArg;
+
+  // General instruction information
+
+  // The mode to disassemble for (64-bit, protected, real)
+  DisassemblerMode mode;
+  // The start of the instruction, usable with the reader
+  uint64_t startLocation;
+  // The length of the instruction, in bytes
+  size_t length;
+
+  // Prefix state
+
+  // 1 if the prefix byte corresponding to the entry is present; 0 if not
+  uint8_t prefixPresent[0x100];
+  // contains the location (for use with the reader) of the prefix byte
+  uint64_t prefixLocations[0x100];
+  // The value of the vector extension prefix(EVEX/VEX/XOP), if present
+  uint8_t vectorExtensionPrefix[4];
+  // The type of the vector extension prefix
+  VectorExtensionType vectorExtensionType;
+  // The value of the REX prefix, if present
+  uint8_t rexPrefix;
+  // The location where a mandatory prefix would have to be (i.e., right before
+  // the opcode, or right before the REX prefix if one is present).
+  uint64_t necessaryPrefixLocation;
+  // The segment override type
+  SegmentOverride segmentOverride;
+  // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease
+  bool xAcquireRelease;
+
+  // Sizes of various critical pieces of data, in bytes
+  uint8_t registerSize;
+  uint8_t addressSize;
+  uint8_t displacementSize;
+  uint8_t immediateSize;
+
+  // Offsets from the start of the instruction to the pieces of data, which is
+  // needed to find relocation entries for adding symbolic operands.
+  uint8_t displacementOffset;
+  uint8_t immediateOffset;
+
+  // opcode state
+
+  // The last byte of the opcode, not counting any ModR/M extension
+  uint8_t opcode;
+  // The ModR/M byte of the instruction, if it is an opcode extension
+  uint8_t modRMExtension;
+
+  // decode state
+
+  // The type of opcode, used for indexing into the array of decode tables
+  OpcodeType opcodeType;
+  // The instruction ID, extracted from the decode table
+  uint16_t instructionID;
+  // The specifier for the instruction, from the instruction info table
+  const InstructionSpecifier *spec;
+
+  // state for additional bytes, consumed during operand decode.  Pattern:
+  // consumed___ indicates that the byte was already consumed and does not
+  // need to be consumed again.
+
+  // The VEX.vvvv field, which contains a third register operand for some AVX
+  // instructions.
+  Reg                           vvvv;
+
+  // The writemask for AVX-512 instructions which is contained in EVEX.aaa
+  Reg                           writemask;
+
+  // The ModR/M byte, which contains most register operands and some portion of
+  // all memory operands.
+  bool                          consumedModRM;
+  uint8_t                       modRM;
+
+  // The SIB byte, used for more complex 32- or 64-bit memory operands
+  bool                          consumedSIB;
+  uint8_t                       sib;
+
+  // The displacement, used for memory operands
+  bool                          consumedDisplacement;
+  int32_t                       displacement;
+
+  // Immediates.  There can be two in some cases
+  uint8_t                       numImmediatesConsumed;
+  uint8_t                       numImmediatesTranslated;
+  uint64_t                      immediates[2];
+
+  // A register or immediate operand encoded into the opcode
+  Reg                           opcodeRegister;
+
+  // Portions of the ModR/M byte
+
+  // These fields determine the allowable values for the ModR/M fields, which
+  // depend on operand and address widths.
+  EABase                        eaBaseBase;
+  EABase                        eaRegBase;
+  Reg                           regBase;
+
+  // The Mod and R/M fields can encode a base for an effective address, or a
+  // register.  These are separated into two fields here.
+  EABase                        eaBase;
+  EADisplacement                eaDisplacement;
+  // The reg field always encodes a register
+  Reg                           reg;
+
+  // SIB state
+  SIBIndex                      sibIndex;
+  uint8_t                       sibScale;
+  SIBBase                       sibBase;
+
+  ArrayRef<OperandSpecifier> operands;
+};
+
+/// \brief Decode one instruction and store the decoding results in
+/// a buffer provided by the consumer.
+/// \param insn      The buffer to store the instruction in.  Allocated by the
+///                  consumer.
+/// \param reader    The byteReader_t for the bytes to be read.
+/// \param readerArg An argument to pass to the reader for storing context
+///                  specific to the consumer.  May be NULL.
+/// \param logger    The dlog_t to be used in printing status messages from the
+///                  disassembler.  May be NULL.
+/// \param loggerArg An argument to pass to the logger for storing context
+///                  specific to the logger.  May be NULL.
+/// \param startLoc  The address (in the reader's address space) of the first
+///                  byte in the instruction.
+/// \param mode      The mode (16-bit, 32-bit, 64-bit) to decode in.
+/// \return          Nonzero if there was an error during decode, 0 otherwise.
+int decodeInstruction(InternalInstruction *insn,
+                      byteReader_t reader,
+                      const void *readerArg,
+                      dlog_t logger,
+                      void *loggerArg,
+                      const void *miiArg,
+                      uint64_t startLoc,
+                      DisassemblerMode mode);
+
+/// \brief Print a message to debugs()
+/// \param file The name of the file printing the debug message.
+/// \param line The line number that printed the debug message.
+/// \param s    The message to print.
+void Debug(const char *file, unsigned line, const char *s);
+
+const char *GetInstrName(unsigned Opcode, const void *mii);
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 0000000..13a7b55
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,517 @@
+//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains common definitions used by both the disassembler and the table
+//  generator.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLERDECODERCOMMON_H
+#define X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+#define INSTRUCTIONS_SYM  x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM      x86DisassemblerContexts
+#define ONEBYTE_SYM       x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM       x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM   x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM   x86DisassemblerThreeByte3AOpcodes
+#define XOP8_MAP_SYM      x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM      x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM      x86DisassemblerXOPAOpcodes
+
+#define INSTRUCTIONS_STR  "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR      "x86DisassemblerContexts"
+#define ONEBYTE_STR       "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR       "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR   "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR   "x86DisassemblerThreeByte3AOpcodes"
+#define XOP8_MAP_STR      "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR      "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR      "x86DisassemblerXOPAOpcodes"
+
+// Attributes of an instruction that must be known before the opcode can be
+// processed correctly.  Most of these indicate the presence of particular
+// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+#define ATTRIBUTE_BITS                  \
+  ENUM_ENTRY(ATTR_NONE,   0x00)         \
+  ENUM_ENTRY(ATTR_64BIT,  (0x1 << 0))   \
+  ENUM_ENTRY(ATTR_XS,     (0x1 << 1))   \
+  ENUM_ENTRY(ATTR_XD,     (0x1 << 2))   \
+  ENUM_ENTRY(ATTR_REXW,   (0x1 << 3))   \
+  ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4))   \
+  ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5))   \
+  ENUM_ENTRY(ATTR_VEX,    (0x1 << 6))   \
+  ENUM_ENTRY(ATTR_VEXL,   (0x1 << 7))   \
+  ENUM_ENTRY(ATTR_EVEX,   (0x1 << 8))   \
+  ENUM_ENTRY(ATTR_EVEXL,  (0x1 << 9))   \
+  ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10))  \
+  ENUM_ENTRY(ATTR_EVEXK,  (0x1 << 11))  \
+  ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12))  \
+  ENUM_ENTRY(ATTR_EVEXB,  (0x1 << 13))
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+  ATTRIBUTE_BITS
+  ATTR_max
+};
+#undef ENUM_ENTRY
+
+// Combinations of the above attributes that are relevant to instruction
+// decode. Although other combinations are possible, they can be reduced to
+// these without affecting the ultimately decoded instruction.
+
+//           Class name           Rank  Rationale for rank assignment
+#define INSTRUCTION_CONTEXTS                                                   \
+  ENUM_ENTRY(IC,                    0,  "says nothing about the instruction")  \
+  ENUM_ENTRY(IC_64BIT,              1,  "says the instruction applies in "     \
+                                        "64-bit mode but no more")             \
+  ENUM_ENTRY(IC_OPSIZE,             3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_ADSIZE,             3,  "requires an ADSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XD,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XS,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XD_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XS_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_64BIT_REXW,         4,  "requires a REX.W prefix, so operands "\
+                                        "change width; overrides IC_OPSIZE")   \
+  ENUM_ENTRY(IC_64BIT_OPSIZE,       3,  "Just as meaningful as IC_OPSIZE")     \
+  ENUM_ENTRY(IC_64BIT_ADSIZE,       3,  "Just as meaningful as IC_ADSIZE")     \
+  ENUM_ENTRY(IC_64BIT_XD,           5,  "XD instructions are SSE; REX.W is "   \
+                                        "secondary")                           \
+  ENUM_ENTRY(IC_64BIT_XS,           5,  "Just as meaningful as IC_64BIT_XD")   \
+  ENUM_ENTRY(IC_64BIT_XD_OPSIZE,    3,  "Just as meaningful as IC_XD_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_XS_OPSIZE,    3,  "Just as meaningful as IC_XS_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_REXW_XS,      6,  "OPSIZE could mean a different "       \
+                                        "opcode")                              \
+  ENUM_ENTRY(IC_64BIT_REXW_XD,      6,  "Just as meaningful as "               \
+                                        "IC_64BIT_REXW_XS")                    \
+  ENUM_ENTRY(IC_64BIT_REXW_OPSIZE,  7,  "The Dynamic Duo!  Prefer over all "   \
+                                        "else because this changes most "      \
+                                        "operands' meaning")                   \
+  ENUM_ENTRY(IC_VEX,                1,  "requires a VEX prefix")               \
+  ENUM_ENTRY(IC_VEX_XS,             2,  "requires VEX and the XS prefix")      \
+  ENUM_ENTRY(IC_VEX_XD,             2,  "requires VEX and the XD prefix")      \
+  ENUM_ENTRY(IC_VEX_OPSIZE,         2,  "requires VEX and the OpSize prefix")  \
+  ENUM_ENTRY(IC_VEX_W,              3,  "requires VEX and the W prefix")       \
+  ENUM_ENTRY(IC_VEX_W_XS,           4,  "requires VEX, W, and XS prefix")      \
+  ENUM_ENTRY(IC_VEX_W_XD,           4,  "requires VEX, W, and XD prefix")      \
+  ENUM_ENTRY(IC_VEX_W_OPSIZE,       4,  "requires VEX, W, and OpSize")         \
+  ENUM_ENTRY(IC_VEX_L,              3,  "requires VEX and the L prefix")       \
+  ENUM_ENTRY(IC_VEX_L_XS,           4,  "requires VEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XD prefix")\
+  ENUM_ENTRY(IC_VEX_L_OPSIZE,       4,  "requires VEX, L, and OpSize")         \
+  ENUM_ENTRY(IC_VEX_L_W,            4,  "requires VEX, L and W")               \
+  ENUM_ENTRY(IC_VEX_L_W_XS,         5,  "requires VEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_XD,         5,  "requires VEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     5,  "requires VEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX,               1,  "requires an EVEX prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS,            2,  "requires EVEX and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD,            2,  "requires EVEX and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE,        2,  "requires EVEX and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W,             3,  "requires EVEX and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS,          4,  "requires EVEX, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD,          4,  "requires EVEX, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE,      4,  "requires EVEX, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L,             3,  "requires EVEX and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS,          4,  "requires EVEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD,          4,  "requires EVEX and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE,      4,  "requires EVEX, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W,           3,  "requires EVEX, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS,        4,  "requires EVEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD,        4,  "requires EVEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE,    4,  "requires EVEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2,            3,  "requires EVEX and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS,         4,  "requires EVEX and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD,         4,  "requires EVEX and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE,     4,  "requires EVEX, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W,          3,  "requires EVEX, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS,       4,  "requires EVEX, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD,       4,  "requires EVEX, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE,   4,  "requires EVEX, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_K,             1,  "requires an EVEX_K prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_K,          2,  "requires EVEX_K and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_K,          2,  "requires EVEX_K and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_K,      2,  "requires EVEX_K and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_K,           3,  "requires EVEX_K and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_K,        4,  "requires EVEX_K, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_K,        4,  "requires EVEX_K, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_K,    4,  "requires EVEX_K, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_K,           3,  "requires EVEX_K and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_K,        4,  "requires EVEX_K and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_K,        4,  "requires EVEX_K and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_K,    4,  "requires EVEX_K, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_K,         3,  "requires EVEX_K, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_K,      4,  "requires EVEX_K, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_K,      4,  "requires EVEX_K, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K,  4,  "requires EVEX_K, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_K,          3,  "requires EVEX_K and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_K,       4,  "requires EVEX_K and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_K,       4,  "requires EVEX_K and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K,   4,  "requires EVEX_K, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_K,        3,  "requires EVEX_K, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_K,     4,  "requires EVEX_K, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_K,     4,  "requires EVEX_K, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4,  "requires EVEX_K, L2, W and OpSize")     \
+  ENUM_ENTRY(IC_EVEX_B,             1,  "requires an EVEX_B prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_B,          2,  "requires EVEX_B and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_B,          2,  "requires EVEX_B and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_B,      2,  "requires EVEX_B and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_B,           3,  "requires EVEX_B and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_B,        4,  "requires EVEX_B, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_B,        4,  "requires EVEX_B, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_B,    4,  "requires EVEX_B, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_B,           3,  "requires EVEX_B and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_B,        4,  "requires EVEX_B and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_B,        4,  "requires EVEX_B and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_B,    4,  "requires EVEX_B, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_B,         3,  "requires EVEX_B, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_B,      4,  "requires EVEX_B, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_B,      4,  "requires EVEX_B, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B,  4,  "requires EVEX_B, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_B,          3,  "requires EVEX_B and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_B,       4,  "requires EVEX_B and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_B,       4,  "requires EVEX_B and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B,   4,  "requires EVEX_B, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_B,        3,  "requires EVEX_B, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_B,     4,  "requires EVEX_B, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_B,     4,  "requires EVEX_B, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4,  "requires EVEX_B, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_K_B,           1,  "requires EVEX_B and EVEX_K prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_K_B,        2,  "requires EVEX_B, EVEX_K and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_K_B,        2,  "requires EVEX_B, EVEX_K and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_K_B,    2,  "requires EVEX_B, EVEX_K and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_K_B,         3,  "requires EVEX_B, EVEX_K and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_K_B,      4,  "requires EVEX_B, EVEX_K, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_K_B,      4,  "requires EVEX_B, EVEX_K, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B,  4,  "requires EVEX_B, EVEX_K, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_K_B,         3,  "requires EVEX_B, EVEX_K and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_K_B,      4,  "requires EVEX_B, EVEX_K and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_K_B,      4,  "requires EVEX_B, EVEX_K and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B,  4,  "requires EVEX_B, EVEX_K, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_K_B,       3,  "requires EVEX_B, EVEX_K, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_K_B,    4,  "requires EVEX_B, EVEX_K, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_K_B,    4,  "requires EVEX_B, EVEX_K, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4,  "requires EVEX_B, EVEX_K, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_K_B,        3,  "requires EVEX_B, EVEX_K and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_K_B,     4,  "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_K_B,     4,  "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4,  "requires EVEX_B, EVEX_K, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_K_B,      3,  "requires EVEX_B, EVEX_K, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B,   4,  "requires EVEX_B, EVEX_K, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B,   4,  "requires EVEX_B, EVEX_K, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4,  "requires EVEX_B, EVEX_K, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ_B,           1,  "requires EVEX_B and EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ_B,        2,  "requires EVEX_B, EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ_B,        2,  "requires EVEX_B, EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B,    2,  "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ_B,         3,  "requires EVEX_B, EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B,  4,  "requires EVEX_B, EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ_B,           3,  "requires EVEX_B, EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B,    4,  "requires EVEX_B, EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ_B,         3,  "requires EVEX_B, EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B,  4,  "requires EVEX_B, EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ_B,          3,  "requires EVEX_B, EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B,   4,  "requires EVEX_B, EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ_B,        3,  "requires EVEX_B, EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4,  "requires EVEX_B, EVEX_KZ, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ,             1,  "requires an EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ,          2,  "requires EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ,          2,  "requires EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ,      2,  "requires EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ,           3,  "requires EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ,        4,  "requires EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ,        4,  "requires EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ,    4,  "requires EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ,           3,  "requires EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ,        4,  "requires EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ,        4,  "requires EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ,    4,  "requires EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ,         3,  "requires EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ,      4,  "requires EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ,      4,  "requires EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ,  4,  "requires EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ,          3,  "requires EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ,       4,  "requires EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ,       4,  "requires EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ,   4,  "requires EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ,        3,  "requires EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ,     4,  "requires EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ,     4,  "requires EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")
+
+#define ENUM_ENTRY(n, r, d) n,
+enum InstructionContext {
+  INSTRUCTION_CONTEXTS
+  IC_max
+};
+#undef ENUM_ENTRY
+
+// Opcode types, which determine which decode table to use, both in the Intel
+// manual and also for the decoder.
+enum OpcodeType {
+  ONEBYTE       = 0,
+  TWOBYTE       = 1,
+  THREEBYTE_38  = 2,
+  THREEBYTE_3A  = 3,
+  XOP8_MAP      = 4,
+  XOP9_MAP      = 5,
+  XOPA_MAP      = 6
+};
+
+// The following structs are used for the hierarchical decode table.  After
+// determining the instruction's class (i.e., which IC_* constant applies to
+// it), the decoder reads the opcode.  Some instructions require specific
+// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+//
+// If a ModR/M byte is not required, "required" is left unset, and the values
+// for each instructionID are identical.
+typedef uint16_t InstrUID;
+
+// ModRMDecisionType - describes the type of ModR/M decision, allowing the
+// consumer to determine the number of entries in it.
+//
+// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+//                  instruction is the same.
+// MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+//                  corresponds to one instruction; otherwise, it corresponds to
+//                  a different instruction.
+// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+//                  divided by 8 is used to select instruction; otherwise, each
+//                  value of the ModR/M byte could correspond to a different
+//                  instruction.
+// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
+//                  corresponds to instructions that use reg field as opcode
+// MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
+//                  to a different instruction.
+#define MODRMTYPES            \
+  ENUM_ENTRY(MODRM_ONEENTRY)  \
+  ENUM_ENTRY(MODRM_SPLITRM)   \
+  ENUM_ENTRY(MODRM_SPLITMISC)  \
+  ENUM_ENTRY(MODRM_SPLITREG)  \
+  ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+enum ModRMDecisionType {
+  MODRMTYPES
+  MODRM_max
+};
+#undef ENUM_ENTRY
+
+#define CASE_ENCODING_RM     \
+    case ENCODING_RM:        \
+    case ENCODING_RM_CD2:    \
+    case ENCODING_RM_CD4:    \
+    case ENCODING_RM_CD8:    \
+    case ENCODING_RM_CD16:   \
+    case ENCODING_RM_CD32:   \
+    case ENCODING_RM_CD64
+
+// Physical encodings of instruction operands.
+#define ENCODINGS                                                              \
+  ENUM_ENTRY(ENCODING_NONE,   "")                                              \
+  ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
+  ENUM_ENTRY(ENCODING_RM,     "R/M operand in ModR/M byte.")                   \
+  ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2")           \
+  ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4")           \
+  ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8")           \
+  ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16")          \
+  ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32")          \
+  ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64")          \
+  ENUM_ENTRY(ENCODING_VVVV,   "Register operand in VEX.vvvv byte.")            \
+  ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.")         \
+  ENUM_ENTRY(ENCODING_CB,     "1-byte code offset (possible new CS value)")    \
+  ENUM_ENTRY(ENCODING_CW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_CD,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_CP,     "6-byte")                                        \
+  ENUM_ENTRY(ENCODING_CO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_CT,     "10-byte")                                       \
+  ENUM_ENTRY(ENCODING_IB,     "1-byte immediate")                              \
+  ENUM_ENTRY(ENCODING_IW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_ID,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_IO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_RB,     "(AL..DIL, R8L..R15L) Register code added to "   \
+                              "the opcode byte")                               \
+  ENUM_ENTRY(ENCODING_RW,     "(AX..DI, R8W..R15W)")                           \
+  ENUM_ENTRY(ENCODING_RD,     "(EAX..EDI, R8D..R15D)")                         \
+  ENUM_ENTRY(ENCODING_RO,     "(RAX..RDI, R8..R15)")                           \
+  ENUM_ENTRY(ENCODING_FP,     "Position on floating-point stack in ModR/M "    \
+                              "byte.")                                         \
+                                                                               \
+  ENUM_ENTRY(ENCODING_Iv,     "Immediate of operand size")                     \
+  ENUM_ENTRY(ENCODING_Ia,     "Immediate of address size")                     \
+  ENUM_ENTRY(ENCODING_Rv,     "Register code of operand size added to the "    \
+                              "opcode byte")                                   \
+  ENUM_ENTRY(ENCODING_DUP,    "Duplicate of another operand; ID is encoded "   \
+                              "in type")                                       \
+  ENUM_ENTRY(ENCODING_SI,     "Source index; encoded in OpSize/Adsize prefix") \
+  ENUM_ENTRY(ENCODING_DI,     "Destination index; encoded in prefixes")
+
+#define ENUM_ENTRY(n, d) n,
+enum OperandEncoding {
+  ENCODINGS
+  ENCODING_max
+};
+#undef ENUM_ENTRY
+
+// Semantic interpretations of instruction operands.
+#define TYPES                                                                  \
+  ENUM_ENTRY(TYPE_NONE,       "")                                              \
+  ENUM_ENTRY(TYPE_REL8,       "1-byte immediate address")                      \
+  ENUM_ENTRY(TYPE_REL16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_REL32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_REL64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_PTR1616,    "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_PTR1632,    "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_PTR1664,    "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_R8,         "1-byte register operand")                       \
+  ENUM_ENTRY(TYPE_R16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_R32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_R64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM8,       "1-byte immediate operand")                      \
+  ENUM_ENTRY(TYPE_IMM16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM3,       "1-byte immediate operand between 0 and 7")      \
+  ENUM_ENTRY(TYPE_IMM5,       "1-byte immediate operand between 0 and 31")     \
+  ENUM_ENTRY(TYPE_RM8,        "1-byte register or memory operand")             \
+  ENUM_ENTRY(TYPE_RM16,       "2-byte")                                        \
+  ENUM_ENTRY(TYPE_RM32,       "4-byte")                                        \
+  ENUM_ENTRY(TYPE_RM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_M,          "Memory operand")                                \
+  ENUM_ENTRY(TYPE_M8,         "1-byte")                                        \
+  ENUM_ENTRY(TYPE_M16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_M32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_LEA,        "Effective address")                             \
+  ENUM_ENTRY(TYPE_M128,       "16-byte (SSE/SSE2)")                            \
+  ENUM_ENTRY(TYPE_M256,       "256-byte (AVX)")                                \
+  ENUM_ENTRY(TYPE_M1616,      "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_M1632,      "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_M1664,      "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_M16_32,     "2+4-byte two-part memory operand (LIDT, LGDT)") \
+  ENUM_ENTRY(TYPE_M16_16,     "2+2-byte (BOUND)")                              \
+  ENUM_ENTRY(TYPE_M32_32,     "4+4-byte (BOUND)")                              \
+  ENUM_ENTRY(TYPE_M16_64,     "2+8-byte (LIDT, LGDT)")                         \
+  ENUM_ENTRY(TYPE_SRCIDX8,    "1-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_SRCIDX16,   "2-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_SRCIDX32,   "4-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_SRCIDX64,   "8-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_DSTIDX8,    "1-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_DSTIDX16,   "2-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_DSTIDX32,   "4-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_DSTIDX64,   "8-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_MOFFS8,     "1-byte memory offset (relative to segment "     \
+                              "base)")                                         \
+  ENUM_ENTRY(TYPE_MOFFS16,    "2-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS32,    "4-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS64,    "8-byte")                                        \
+  ENUM_ENTRY(TYPE_SREG,       "Byte with single bit set: 0 = ES, 1 = CS, "     \
+                              "2 = SS, 3 = DS, 4 = FS, 5 = GS")                \
+  ENUM_ENTRY(TYPE_M32FP,      "32-bit IEE754 memory floating-point operand")   \
+  ENUM_ENTRY(TYPE_M64FP,      "64-bit")                                        \
+  ENUM_ENTRY(TYPE_M80FP,      "80-bit extended")                               \
+  ENUM_ENTRY(TYPE_M16INT,     "2-byte memory integer operand for use in "      \
+                              "floating-point instructions")                   \
+  ENUM_ENTRY(TYPE_M32INT,     "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64INT,     "8-byte")                                        \
+  ENUM_ENTRY(TYPE_ST,         "Position on the floating-point stack")          \
+  ENUM_ENTRY(TYPE_MM,         "MMX register operand")                          \
+  ENUM_ENTRY(TYPE_MM32,       "4-byte MMX register or memory operand")         \
+  ENUM_ENTRY(TYPE_MM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM,        "XMM register operand")                          \
+  ENUM_ENTRY(TYPE_XMM32,      "4-byte XMM register or memory operand")         \
+  ENUM_ENTRY(TYPE_XMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM128,     "16-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM256,     "32-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM512,     "64-byte")                                       \
+  ENUM_ENTRY(TYPE_VK1,        "1-bit")                                         \
+  ENUM_ENTRY(TYPE_VK2,        "2-bit")                                         \
+  ENUM_ENTRY(TYPE_VK4,        "4-bit")                                         \
+  ENUM_ENTRY(TYPE_VK8,        "8-bit")                                         \
+  ENUM_ENTRY(TYPE_VK16,       "16-bit")                                        \
+  ENUM_ENTRY(TYPE_VK32,       "32-bit")                                        \
+  ENUM_ENTRY(TYPE_VK64,       "64-bit")                                        \
+  ENUM_ENTRY(TYPE_XMM0,       "Implicit use of XMM0")                          \
+  ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
+  ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
+  ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand")                      \
+                                                                               \
+  ENUM_ENTRY(TYPE_Mv,         "Memory operand of operand size")                \
+  ENUM_ENTRY(TYPE_Rv,         "Register operand of operand size")              \
+  ENUM_ENTRY(TYPE_IMMv,       "Immediate operand of operand size")             \
+  ENUM_ENTRY(TYPE_RELv,       "Immediate address of operand size")             \
+  ENUM_ENTRY(TYPE_DUP0,       "Duplicate of operand 0")                        \
+  ENUM_ENTRY(TYPE_DUP1,       "operand 1")                                     \
+  ENUM_ENTRY(TYPE_DUP2,       "operand 2")                                     \
+  ENUM_ENTRY(TYPE_DUP3,       "operand 3")                                     \
+  ENUM_ENTRY(TYPE_DUP4,       "operand 4")                                     \
+  ENUM_ENTRY(TYPE_M512,       "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+enum OperandType {
+  TYPES
+  TYPE_max
+};
+#undef ENUM_ENTRY
+
+/// \brief The specification for how to extract and interpret one operand.
+struct OperandSpecifier {
+  uint8_t encoding;
+  uint8_t type;
+};
+
+// Indicates where the opcode modifier (if any) is to be found.  Extended
+// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
+#define MODIFIER_TYPES        \
+  ENUM_ENTRY(MODIFIER_NONE)
+
+#define ENUM_ENTRY(n) n,
+enum ModifierType {
+  MODIFIER_TYPES
+  MODIFIER_max
+};
+#undef ENUM_ENTRY
+
+static const unsigned X86_MAX_OPERANDS = 5;
+
+/// Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
+/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+/// respectively.
+enum DisassemblerMode {
+  MODE_16BIT,
+  MODE_32BIT,
+  MODE_64BIT
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 0000000..b45b118
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,281 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <map>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+                                     unsigned RegNo) const {
+  OS << markup("<reg:")
+     << '%' << getRegisterName(RegNo)
+     << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, OS))
+    printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid ssecc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  }
+}
+
+void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+  switch (Imm) {
+  case 0: O << "{rn-sae}"; break;
+  case 1: O << "{rd-sae}"; break;
+  case 2: O << "{ru-sae}"; break;
+  case 3: O << "{rz-sae}"; break;
+  }
+}
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls).  These
+/// print slightly differently than normal immediates.  For example, a $ is not
+/// emitted.
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << formatImm(Op.getImm());
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
+  }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else if (Op.isImm()) {
+    // Print X86 immediates as signed values.
+    O << markup("<imm:")
+      << '$' << formatImm((int64_t)Op.getImm())
+      << markup(">");
+
+    if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
+      *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
+
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << markup("<imm:")
+      << '$' << *Op.getExpr()
+      << markup(">");
+  }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                          raw_ostream &O) {
+  const MCOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+  const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+X86::AddrSegmentReg, O);
+    O << ':';
+  }
+
+  if (DispSpec.isImm()) {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+      O << formatImm(DispVal);
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    O << *DispSpec.getExpr();
+  }
+
+  if (IndexReg.getReg() || BaseReg.getReg()) {
+    O << '(';
+    if (BaseReg.getReg())
+      printOperand(MI, Op+X86::AddrBaseReg, O);
+
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(MI, Op+X86::AddrIndexReg, O);
+      unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+      if (ScaleVal != 1) {
+        O << ','
+          << markup("<imm:")
+          << ScaleVal // never printed in hex.
+          << markup(">");
+      }
+    }
+    O << ')';
+  }
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  const MCOperand &SegReg = MI->getOperand(Op+1);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+1, O);
+    O << ':';
+  }
+
+  O << "(";
+  printOperand(MI, Op, O);
+  O << ")";
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  O << markup("<mem:");
+
+  O << "%es:(";
+  printOperand(MI, Op, O);
+  O << ")";
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                       raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+  const MCOperand &SegReg = MI->getOperand(Op+1);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+1, O);
+    O << ':';
+  }
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    O << *DispSpec.getExpr();
+  }
+
+  O << markup(">");
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
new file mode 100644
index 0000000..531183b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,136 @@
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ATT_INST_PRINTER_H
+#define X86_ATT_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class X86ATTInstPrinter final : public MCInstPrinter {
+public:
+  X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                    const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
+
+  // Autogenerated by tblgen, returns true if we successfully printed an
+  // alias.
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &OS);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+};
+  
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 0000000..baf6507
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,542 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline terminated strings to the specified string if desired.  This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const char *(*getRegName)(unsigned)) {
+  // If this is a shuffle operation, the switch should fill in this state.
+  SmallVector<int, 8> ShuffleMask;
+  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+
+  switch (MI->getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    if(MI->getOperand(3).isImm())
+      DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
+    break;
+
+  case X86::MOVLHPSrr:
+  case X86::VMOVLHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVLHPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVHLPSMask(2, ShuffleMask);
+    break;
+
+  case X86::PALIGNR128rr:
+  case X86::VPALIGNR128rr:
+    Src1Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PALIGNR128rm:
+  case X86::VPALIGNR128rm:
+    Src2Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePALIGNRMask(MVT::v16i8,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::VPALIGNR256rr:
+    Src1Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPALIGNR256rm:
+    Src2Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePALIGNRMask(MVT::v32i8,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+
+  case X86::PSHUFDri:
+  case X86::VPSHUFDri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFDmi:
+  case X86::VPSHUFDmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v4i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+  case X86::VPSHUFDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFDYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v8i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+
+
+  case X86::PSHUFHWri:
+  case X86::VPSHUFHWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFHWmi:
+  case X86::VPSHUFHWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFHWMask(MVT::v8i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::VPSHUFHWYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFHWYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFHWMask(MVT::v16i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::PSHUFLWri:
+  case X86::VPSHUFLWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFLWmi:
+  case X86::VPSHUFLWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFLWMask(MVT::v8i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::VPSHUFLWYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFLWYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFLWMask(MVT::v16i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+
+  case X86::PUNPCKHBWrr:
+  case X86::VPUNPCKHBWrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHBWrm:
+  case X86::VPUNPCKHBWrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
+    break;
+  case X86::VPUNPCKHBWYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHBWYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
+    break;
+  case X86::PUNPCKHWDrr:
+  case X86::VPUNPCKHWDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHWDrm:
+  case X86::VPUNPCKHWDrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
+    break;
+  case X86::VPUNPCKHWDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHWDYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
+    break;
+  case X86::PUNPCKHDQrr:
+  case X86::VPUNPCKHDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHDQrm:
+  case X86::VPUNPCKHDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKHDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
+    break;
+  case X86::PUNPCKHQDQrr:
+  case X86::VPUNPCKHQDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHQDQrm:
+  case X86::VPUNPCKHQDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKHQDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHQDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
+    break;
+
+  case X86::PUNPCKLBWrr:
+  case X86::VPUNPCKLBWrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLBWrm:
+  case X86::VPUNPCKLBWrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
+    break;
+  case X86::VPUNPCKLBWYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLBWYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
+    break;
+  case X86::PUNPCKLWDrr:
+  case X86::VPUNPCKLWDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLWDrm:
+  case X86::VPUNPCKLWDrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
+    break;
+  case X86::VPUNPCKLWDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLWDYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
+    break;
+  case X86::PUNPCKLDQrr:
+  case X86::VPUNPCKLDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLDQrm:
+  case X86::VPUNPCKLDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKLDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
+    break;
+  case X86::PUNPCKLQDQrr:
+  case X86::VPUNPCKLQDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLQDQrm:
+  case X86::VPUNPCKLQDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKLQDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLQDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
+    break;
+
+  case X86::SHUFPDrri:
+  case X86::VSHUFPDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPDrmi:
+  case X86::VSHUFPDrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v2f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VSHUFPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VSHUFPDYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v4f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::SHUFPSrri:
+  case X86::VSHUFPSrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPSrmi:
+  case X86::VSHUFPSrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v4f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VSHUFPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VSHUFPSYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v8f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::UNPCKLPDrr:
+  case X86::VUNPCKLPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKLPDrm:
+  case X86::VUNPCKLPDrm:
+    DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPDYrm:
+    DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKLPSrr:
+  case X86::VUNPCKLPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKLPSrm:
+  case X86::VUNPCKLPSrm:
+    DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPSYrm:
+    DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKHPDrr:
+  case X86::VUNPCKHPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKHPDrm:
+  case X86::VUNPCKHPDrm:
+    DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPDYrm:
+    DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKHPSrr:
+  case X86::VUNPCKHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKHPSrm:
+  case X86::VUNPCKHPSrm:
+    DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPSYrm:
+    DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPSmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v4f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPSYmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v8f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPDmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v2f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPDYmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v4f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERM2F128rr:
+  case X86::VPERM2I128rr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPERM2F128rm:
+  case X86::VPERM2I128rm:
+    // For instruction comments purpose, assume the 256-bit vector is v4i64.
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeVPERM2X128Mask(MVT::v4i64,
+                           MI->getOperand(MI->getNumOperands()-1).getImm(),
+                           ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMQYri:
+  case X86::VPERMPDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMQYmi:
+  case X86::VPERMPDYmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+
+
+  // If this was a shuffle operation, print the shuffle mask.
+  if (!ShuffleMask.empty()) {
+    if (!DestName) DestName = Src1Name;
+    OS << (DestName ? DestName : "mem") << " = ";
+
+    // If the two sources are the same, canonicalize the input elements to be
+    // from the first src so that we get larger element spans.
+    if (Src1Name == Src2Name) {
+      for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+        if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+            ShuffleMask[i] >= (int)e)        // From second mask.
+          ShuffleMask[i] -= e;
+      }
+    }
+
+    // The shuffle mask specifies which elements of the src1/src2 fill in the
+    // destination, with a few sentinel values.  Loop through and print them
+    // out.
+    for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+      if (i != 0)
+        OS << ',';
+      if (ShuffleMask[i] == SM_SentinelZero) {
+        OS << "zero";
+        continue;
+      }
+
+      // Otherwise, it must come from src1 or src2.  Print the span of elements
+      // that comes from this src.
+      bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+      const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+      OS << (SrcName ? SrcName : "mem") << '[';
+      bool IsFirst = true;
+      while (i != e &&
+             (int)ShuffleMask[i] >= 0 &&
+             (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+        if (!IsFirst)
+          OS << ',';
+        else
+          IsFirst = false;
+        OS << ShuffleMask[i] % ShuffleMask.size();
+        ++i;
+      }
+      OS << ']';
+      --i;  // For loop increments element #.
+    }
+    //MI->print(OS, 0);
+    OS << "\n";
+  }
+
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
new file mode 100644
index 0000000..13fdf9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -0,0 +1,25 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INST_COMMENTS_H
+#define X86_INST_COMMENTS_H
+
+namespace llvm {
+  class MCInst;
+  class raw_ostream;
+  void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                              const char *(*getRegName)(unsigned));
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 0000000..1c8466b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,259 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                    StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid ssecc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  }
+}
+
+void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+  switch (Imm) {
+  case 0: O << "{rn-sae}"; break;
+  case 1: O << "{rd-sae}"; break;
+  case 2: O << "{ru-sae}"; break;
+  case 3: O << "{rz-sae}"; break;
+  }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << formatImm(Op.getImm());
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
+  }
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else if (Op.isImm()) {
+    O << formatImm((int64_t)Op.getImm());
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                            raw_ostream &O) {
+  const MCOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  unsigned ScaleVal         = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+  const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+  const MCOperand &SegReg   = MI->getOperand(Op+X86::AddrSegmentReg);
+  
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+X86::AddrSegmentReg, O);
+    O << ':';
+  }
+  
+  O << '[';
+  
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(MI, Op+X86::AddrBaseReg, O);
+    NeedPlus = true;
+  }
+  
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(MI, Op+X86::AddrIndexReg, O);
+    NeedPlus = true;
+  }
+
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    O << *DispSpec.getExpr();
+  } else {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << formatImm(DispVal);
+    }
+  }
+  
+  O << ']';
+}
+
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  const MCOperand &SegReg   = MI->getOperand(Op+1);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+1, O);
+    O << ':';
+  }
+  O << '[';
+  printOperand(MI, Op, O);
+  O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  // DI accesses are always ES-based.
+  O << "es:[";
+  printOperand(MI, Op, O);
+  O << ']';
+}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+  const MCOperand &SegReg   = MI->getOperand(Op+1);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+1, O);
+    O << ':';
+  }
+
+  O << '[';
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    O << *DispSpec.getExpr();
+  }
+
+  O << ']';
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
new file mode 100644
index 0000000..4d9b481
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,158 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INTEL_INST_PRINTER_H
+#define X86_INTEL_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class X86IntelInstPrinter final : public MCInstPrinter {
+public:
+  X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "opaque ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "xmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "ymmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "zmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "xword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "xmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "ymmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "zmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+
+
+  void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+};
+  
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 0000000..23bca0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,842 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Option to allow disabling arithmetic relaxation to workaround PR9807, which
+// is useful when running bitwise comparison experiments on Darwin. We should be
+// able to remove this once PR9807 is resolved.
+static cl::opt<bool>
+MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
+         cl::desc("Disable relaxation of arithmetic instruction for X86"));
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_SecRel_1:
+  case FK_Data_1:
+    return 0;
+  case FK_PCRel_2:
+  case FK_SecRel_2:
+  case FK_Data_2:
+    return 1;
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_global_offset_table:
+  case FK_SecRel_4:
+  case FK_Data_4:
+    return 2;
+  case FK_PCRel_8:
+  case FK_SecRel_8:
+  case FK_Data_8:
+  case X86::reloc_global_offset_table8:
+    return 3;
+  }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+                     bool HasRelocationAddend, bool foobar)
+    : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+  const StringRef CPU;
+  bool HasNopl;
+  const uint64_t MaxNopLength;
+public:
+  X86AsmBackend(const Target &T, StringRef _CPU)
+    : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) {
+    HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
+              CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
+              CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
+              CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
+              CPU != "c3" && CPU != "c3-2";
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return X86::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+      { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+      { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
+      { "reloc_signed_4byte", 0, 4 * 8, 0},
+      { "reloc_global_offset_table", 0, 4 * 8, 0}
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override {
+    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+    assert(Fixup.getOffset() + Size <= DataSize &&
+           "Invalid fixup offset!");
+
+    // Check that uppper bits are either all zeros or all ones.
+    // Specifically ignore overflow/underflow as long as the leakage is
+    // limited to the lower bits. This is to remain compatible with
+    // other assemblers.
+    assert(isIntN(Size * 8 + 1, Value) &&
+           "Value does not fit in the Fixup field");
+
+    for (unsigned i = 0; i != Size; ++i)
+      Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(unsigned Op) {
+  switch (Op) {
+  default:
+    return Op;
+
+  case X86::JAE_1: return X86::JAE_4;
+  case X86::JA_1:  return X86::JA_4;
+  case X86::JBE_1: return X86::JBE_4;
+  case X86::JB_1:  return X86::JB_4;
+  case X86::JE_1:  return X86::JE_4;
+  case X86::JGE_1: return X86::JGE_4;
+  case X86::JG_1:  return X86::JG_4;
+  case X86::JLE_1: return X86::JLE_4;
+  case X86::JL_1:  return X86::JL_4;
+  case X86::JMP_1: return X86::JMP_4;
+  case X86::JNE_1: return X86::JNE_4;
+  case X86::JNO_1: return X86::JNO_4;
+  case X86::JNP_1: return X86::JNP_4;
+  case X86::JNS_1: return X86::JNS_4;
+  case X86::JO_1:  return X86::JO_4;
+  case X86::JP_1:  return X86::JP_4;
+  case X86::JS_1:  return X86::JS_4;
+  }
+}
+
+static unsigned getRelaxedOpcodeArith(unsigned Op) {
+  switch (Op) {
+  default:
+    return Op;
+
+    // IMUL
+  case X86::IMUL16rri8: return X86::IMUL16rri;
+  case X86::IMUL16rmi8: return X86::IMUL16rmi;
+  case X86::IMUL32rri8: return X86::IMUL32rri;
+  case X86::IMUL32rmi8: return X86::IMUL32rmi;
+  case X86::IMUL64rri8: return X86::IMUL64rri32;
+  case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+    // AND
+  case X86::AND16ri8: return X86::AND16ri;
+  case X86::AND16mi8: return X86::AND16mi;
+  case X86::AND32ri8: return X86::AND32ri;
+  case X86::AND32mi8: return X86::AND32mi;
+  case X86::AND64ri8: return X86::AND64ri32;
+  case X86::AND64mi8: return X86::AND64mi32;
+
+    // OR
+  case X86::OR16ri8: return X86::OR16ri;
+  case X86::OR16mi8: return X86::OR16mi;
+  case X86::OR32ri8: return X86::OR32ri;
+  case X86::OR32mi8: return X86::OR32mi;
+  case X86::OR64ri8: return X86::OR64ri32;
+  case X86::OR64mi8: return X86::OR64mi32;
+
+    // XOR
+  case X86::XOR16ri8: return X86::XOR16ri;
+  case X86::XOR16mi8: return X86::XOR16mi;
+  case X86::XOR32ri8: return X86::XOR32ri;
+  case X86::XOR32mi8: return X86::XOR32mi;
+  case X86::XOR64ri8: return X86::XOR64ri32;
+  case X86::XOR64mi8: return X86::XOR64mi32;
+
+    // ADD
+  case X86::ADD16ri8: return X86::ADD16ri;
+  case X86::ADD16mi8: return X86::ADD16mi;
+  case X86::ADD32ri8: return X86::ADD32ri;
+  case X86::ADD32mi8: return X86::ADD32mi;
+  case X86::ADD64ri8: return X86::ADD64ri32;
+  case X86::ADD64mi8: return X86::ADD64mi32;
+
+    // SUB
+  case X86::SUB16ri8: return X86::SUB16ri;
+  case X86::SUB16mi8: return X86::SUB16mi;
+  case X86::SUB32ri8: return X86::SUB32ri;
+  case X86::SUB32mi8: return X86::SUB32mi;
+  case X86::SUB64ri8: return X86::SUB64ri32;
+  case X86::SUB64mi8: return X86::SUB64mi32;
+
+    // CMP
+  case X86::CMP16ri8: return X86::CMP16ri;
+  case X86::CMP16mi8: return X86::CMP16mi;
+  case X86::CMP32ri8: return X86::CMP32ri;
+  case X86::CMP32mi8: return X86::CMP32mi;
+  case X86::CMP64ri8: return X86::CMP64ri32;
+  case X86::CMP64mi8: return X86::CMP64mi32;
+
+    // PUSH
+  case X86::PUSH32i8:  return X86::PUSHi32;
+  case X86::PUSH16i8:  return X86::PUSHi16;
+  case X86::PUSH64i8:  return X86::PUSH64i32;
+  case X86::PUSH64i16: return X86::PUSH64i32;
+  }
+}
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+  unsigned R = getRelaxedOpcodeArith(Op);
+  if (R != Op)
+    return R;
+  return getRelaxedOpcodeBranch(Op);
+}
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  // Branches can always be relaxed.
+  if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+
+  if (MCDisableArithRelaxation)
+    return false;
+
+  // Check if this instruction is ever relaxable.
+  if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+    return false;
+
+
+  // Check if it has an expression and is not RIP relative.
+  bool hasExp = false;
+  bool hasRIP = false;
+  for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
+    const MCOperand &Op = Inst.getOperand(i);
+    if (Op.isExpr())
+      hasExp = true;
+
+    if (Op.isReg() && Op.getReg() == X86::RIP)
+      hasRIP = true;
+  }
+
+  // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on
+  // how we do relaxations?
+  return hasExp && !hasRIP;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  static const uint8_t Nops[10][10] = {
+    // nop
+    {0x90},
+    // xchg %ax,%ax
+    {0x66, 0x90},
+    // nopl (%[re]ax)
+    {0x0f, 0x1f, 0x00},
+    // nopl 0(%[re]ax)
+    {0x0f, 0x1f, 0x40, 0x00},
+    // nopl 0(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopw 0(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopl 0L(%[re]ax)
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    // nopl 0L(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw 0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw %cs:0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // This CPU doesn't support long nops. If needed add more.
+  // FIXME: Can we get this from the subtarget somehow?
+  // FIXME: We could generated something better than plain 0x90.
+  if (!HasNopl) {
+    for (uint64_t i = 0; i < Count; ++i)
+      OW->Write8(0x90);
+    return true;
+  }
+
+  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
+  // needed, then emit a nop of the remaining length.
+  do {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
+    const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+    for (uint8_t i = 0; i < Prefixes; i++)
+      OW->Write8(0x66);
+    const uint8_t Rest = ThisNopLength - Prefixes;
+    for (uint8_t i = 0; i < Rest; i++)
+      OW->Write8(Nops[Rest - 1][i]);
+    Count -= ThisNopLength;
+  } while (Count != 0);
+
+  return true;
+}
+
+/* *** */
+
+namespace {
+
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+  uint8_t OSABI;
+  ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU)
+      : X86AsmBackend(T, CPU), OSABI(_OSABI) {}
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+  }
+};
+
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+                                    ELF::EM_X86_64);
+  }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+  }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+  bool Is64Bit;
+
+public:
+  WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+    : X86AsmBackend(T, CPU)
+    , Is64Bit(is64Bit) {
+  }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createX86WinCOFFObjectWriter(OS, Is64Bit);
+  }
+};
+
+namespace CU {
+
+  /// Compact unwind encoding values.
+  enum CompactUnwindEncodings {
+    /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
+    /// the return address, then [RE]SP is moved to [RE]BP.
+    UNWIND_MODE_BP_FRAME                   = 0x01000000,
+
+    /// A frameless function with a small constant stack size.
+    UNWIND_MODE_STACK_IMMD                 = 0x02000000,
+
+    /// A frameless function with a large constant stack size.
+    UNWIND_MODE_STACK_IND                  = 0x03000000,
+
+    /// No compact unwind encoding is available.
+    UNWIND_MODE_DWARF                      = 0x04000000,
+
+    /// Mask for encoding the frame registers.
+    UNWIND_BP_FRAME_REGISTERS              = 0x00007FFF,
+
+    /// Mask for encoding the frameless registers.
+    UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+  };
+
+} // end CU namespace
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Number of registers that can be saved in a compact unwind encoding.
+  enum { CU_NUM_SAVED_REGS = 6 };
+
+  mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+  bool Is64Bit;
+
+  unsigned OffsetSize;                   ///< Offset of a "push" instruction.
+  unsigned PushInstrSize;                ///< Size of a "push" instruction.
+  unsigned MoveInstrSize;                ///< Size of a "move" instruction.
+  unsigned StackDivide;                  ///< Amount to adjust stack stize by.
+protected:
+  /// \brief Implementation of algorithm to generate the compact unwind encoding
+  /// for the CFI instructions.
+  uint32_t
+  generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+    if (Instrs.empty()) return 0;
+
+    // Reset the saved registers.
+    unsigned SavedRegIdx = 0;
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+
+    bool HasFP = false;
+
+    // Encode that we are using EBP/RBP as the frame pointer.
+    uint32_t CompactUnwindEncoding = 0;
+
+    unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+    unsigned InstrOffset = 0;
+    unsigned StackAdjust = 0;
+    unsigned StackSize = 0;
+    unsigned PrevStackSize = 0;
+    unsigned NumDefCFAOffsets = 0;
+
+    for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Any other CFI directives indicate a frame that we aren't prepared
+        // to represent via compact unwind, so just bail out.
+        return 0;
+      case MCCFIInstruction::OpDefCfaRegister: {
+        // Defines a frame pointer. E.g.
+        //
+        //     movq %rsp, %rbp
+        //  L0:
+        //     .cfi_def_cfa_register %rbp
+        //
+        HasFP = true;
+        assert(MRI.getLLVMRegNum(Inst.getRegister(), true) ==
+               (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!");
+
+        // Reset the counts.
+        memset(SavedRegs, 0, sizeof(SavedRegs));
+        StackAdjust = 0;
+        SavedRegIdx = 0;
+        InstrOffset += MoveInstrSize;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        // Defines a new offset for the CFA. E.g.
+        //
+        //  With frame:
+        //  
+        //     pushq %rbp
+        //  L0:
+        //     .cfi_def_cfa_offset 16
+        //
+        //  Without frame:
+        //
+        //     subq $72, %rsp
+        //  L0:
+        //     .cfi_def_cfa_offset 80
+        //
+        PrevStackSize = StackSize;
+        StackSize = std::abs(Inst.getOffset()) / StackDivide;
+        ++NumDefCFAOffsets;
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Defines a "push" of a callee-saved register. E.g.
+        //
+        //     pushq %r15
+        //     pushq %r14
+        //     pushq %rbx
+        //  L0:
+        //     subq $120, %rsp
+        //  L1:
+        //     .cfi_offset %rbx, -40
+        //     .cfi_offset %r14, -32
+        //     .cfi_offset %r15, -24
+        //
+        if (SavedRegIdx == CU_NUM_SAVED_REGS)
+          // If there are too many saved registers, we cannot use a compact
+          // unwind encoding.
+          return CU::UNWIND_MODE_DWARF;
+
+        unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        SavedRegs[SavedRegIdx++] = Reg;
+        StackAdjust += OffsetSize;
+        InstrOffset += PushInstrSize;
+        break;
+      }
+      }
+    }
+
+    StackAdjust /= StackDivide;
+
+    if (HasFP) {
+      if ((StackAdjust & 0xFF) != StackAdjust)
+        // Offset was too big for a compact unwind encoding.
+        return CU::UNWIND_MODE_DWARF;
+
+      // Get the encoding of the saved registers when we have a frame pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+      CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+      CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+    } else {
+      // If the amount of the stack allocation is the size of a register, then
+      // we "push" the RAX/EAX register onto the stack instead of adjusting the
+      // stack pointer with a SUB instruction. We don't support the push of the
+      // RAX/EAX register with compact unwind. So we check for that situation
+      // here.
+      if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+           StackSize - PrevStackSize == 1) ||
+          (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+        return CU::UNWIND_MODE_DWARF;
+
+      SubtractInstrIdx += InstrOffset;
+      ++StackAdjust;
+
+      if ((StackSize & 0xFF) == StackSize) {
+        // Frameless stack with a small stack size.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+        // Encode the stack size.
+        CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+      } else {
+        if ((StackAdjust & 0x7) != StackAdjust)
+          // The extra stack adjustments are too big for us to handle.
+          return CU::UNWIND_MODE_DWARF;
+
+        // Frameless stack with an offset too large for us to encode compactly.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+        // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+        // instruction.
+        CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+        // Encode any extra stack stack adjustments (done via push
+        // instructions).
+        CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+      }
+
+      // Encode the number of registers saved. (Reverse the list first.)
+      std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+      CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+      // Get the encoding of the saved registers when we don't have a frame
+      // pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      // Encode the register encoding.
+      CompactUnwindEncoding |=
+        RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+    }
+
+    return CompactUnwindEncoding;
+  }
+
+private:
+  /// \brief Get the compact unwind number for a given register. The number
+  /// corresponds to the enum lists in compact_unwind_encoding.h.
+  int getCompactUnwindRegNum(unsigned Reg) const {
+    static const uint16_t CU32BitRegs[7] = {
+      X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+    };
+    static const uint16_t CU64BitRegs[] = {
+      X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+    };
+    const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+    for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+      if (*CURegs == Reg)
+        return Idx;
+
+    return -1;
+  }
+
+  /// \brief Return the registers encoded for a compact encoding with a frame
+  /// pointer.
+  uint32_t encodeCompactUnwindRegistersWithFrame() const {
+    // Encode the registers in the order they were saved --- 3-bits per
+    // register. The list of saved registers is assumed to be in reverse
+    // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+    uint32_t RegEnc = 0;
+    for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      unsigned Reg = SavedRegs[i];
+      if (Reg == 0) break;
+
+      int CURegNum = getCompactUnwindRegNum(Reg);
+      if (CURegNum == -1) return ~0U;
+
+      // Encode the 3-bit register number in order, skipping over 3-bits for
+      // each register.
+      RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+    }
+
+    assert((RegEnc & 0x3FFFF) == RegEnc &&
+           "Invalid compact register encoding!");
+    return RegEnc;
+  }
+
+  /// \brief Create the permutation encoding used with frameless stacks. It is
+  /// passed the number of registers to be saved and an array of the registers
+  /// saved.
+  uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+    // The saved registers are numbered from 1 to 6. In order to encode the
+    // order in which they were saved, we re-number them according to their
+    // place in the register order. The re-numbering is relative to the last
+    // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+    // that order:
+    //
+    //    Orig  Re-Num
+    //    ----  ------
+    //     6       6
+    //     2       2
+    //     4       3
+    //     5       3
+    //
+    for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+      if (CUReg == -1) return ~0U;
+      SavedRegs[i] = CUReg;
+    }
+
+    // Reverse the list.
+    std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+    uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+    for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+      unsigned Countless = 0;
+      for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+        if (SavedRegs[j] < SavedRegs[i])
+          ++Countless;
+
+      RenumRegs[i] = SavedRegs[i] - Countless - 1;
+    }
+
+    // Take the renumbered values and encode them into a 10-bit number.
+    uint32_t permutationEncoding = 0;
+    switch (RegCount) {
+    case 6:
+      permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+                             + 6 * RenumRegs[2] +  2 * RenumRegs[3]
+                             +     RenumRegs[4];
+      break;
+    case 5:
+      permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+                             + 6 * RenumRegs[3] +  2 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 4:
+      permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
+                             + 3 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 3:
+      permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 2:
+      permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 1:
+      permutationEncoding |=       RenumRegs[5];
+      break;
+    }
+
+    assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+           "Invalid compact register encoding!");
+    return permutationEncoding;
+  }
+
+public:
+  DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+                      bool Is64Bit)
+    : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+    OffsetSize = Is64Bit ? 8 : 4;
+    MoveInstrSize = Is64Bit ? 3 : 2;
+    StackDivide = Is64Bit ? 8 : 4;
+    PushInstrSize = 1;
+  }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+  DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU)
+      : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+                                     MachO::CPU_TYPE_I386,
+                                     MachO::CPU_SUBTYPE_I386_ALL);
+  }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    return generateCompactUnwindEncodingImpl(Instrs);
+  }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+  const MachO::CPUSubTypeX86 Subtype;
+public:
+  DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU, MachO::CPUSubTypeX86 st)
+      : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+                                     MachO::CPU_TYPE_X86_64, Subtype);
+  }
+
+  bool doesSectionRequireSymbols(const MCSection &Section) const override {
+    // Temporary labels in the string literals sections require symbols. The
+    // issue is that the x86_64 relocation format does not allow symbol +
+    // offset, and so the linker does not have enough information to resolve the
+    // access to the appropriate atom unless an external relocation is used. For
+    // non-cstring sections, we expect the compiler to use a non-temporary label
+    // for anything that could have an addend pointing outside the symbol.
+    //
+    // See <rdar://problem/4765733>.
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+    return SMO.getType() == MachO::S_CSTRING_LITERALS;
+  }
+
+  bool isSectionAtomizable(const MCSection &Section) const override {
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+    // Fixed sized data sections are uniqued, they cannot be diced into atoms.
+    switch (SMO.getType()) {
+    default:
+      return true;
+
+    case MachO::S_4BYTE_LITERALS:
+    case MachO::S_8BYTE_LITERALS:
+    case MachO::S_16BYTE_LITERALS:
+    case MachO::S_LITERAL_POINTERS:
+    case MachO::S_NON_LAZY_SYMBOL_POINTERS:
+    case MachO::S_LAZY_SYMBOL_POINTERS:
+    case MachO::S_MOD_INIT_FUNC_POINTERS:
+    case MachO::S_MOD_TERM_FUNC_POINTERS:
+    case MachO::S_INTERPOSING:
+      return false;
+    }
+  }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    return generateCompactUnwindEncodingImpl(Instrs);
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSBinFormatMachO())
+    return new DarwinX86_32AsmBackend(T, MRI, CPU);
+
+  if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
+    return new WindowsX86AsmBackend(T, false, CPU);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  return new ELFX86_32AsmBackend(T, OSABI, CPU);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSBinFormatMachO()) {
+    MachO::CPUSubTypeX86 CS =
+        StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+            .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+            .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+    return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+  }
+
+  if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
+    return new WindowsX86AsmBackend(T, true, CPU);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+  if (TheTriple.getEnvironment() == Triple::GNUX32)
+    return new ELFX86_X32AsmBackend(T, OSABI, CPU);
+  return new ELFX86_64AsmBackend(T, OSABI, CPU);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 0000000..026e4c4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,763 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86BASEINFO_H
+#define X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace X86 {
+  // Enums for memory operand decoding.  Each memory operand is represented with
+  // a 5 operand sequence in the form:
+  //   [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+  // These enums help decode this.
+  enum {
+    AddrBaseReg = 0,
+    AddrScaleAmt = 1,
+    AddrIndexReg = 2,
+    AddrDisp = 3,
+
+    /// AddrSegmentReg - The operand # of the segment in the memory operand.
+    AddrSegmentReg = 4,
+
+    /// AddrNumOperands - Total number of operands in a memory reference.
+    AddrNumOperands = 5
+  };
+} // end namespace X86;
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // X86 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+    /// relocation of:
+    ///    SYMBOL_LABEL + [. - PICBASELABEL]
+    MO_GOT_ABSOLUTE_ADDRESS,
+
+    /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+    /// immediate should get the value of the symbol minus the PIC base label:
+    ///    SYMBOL_LABEL - PICBASELABEL
+    MO_PIC_BASE_OFFSET,
+
+    /// MO_GOT - On a symbol operand this indicates that the immediate is the
+    /// offset to the GOT entry for the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOT
+    MO_GOT,
+
+    /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+    /// the offset to the location of the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTOFF
+    MO_GOTOFF,
+
+    /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+    /// offset to the GOT entry for the symbol name from the current code
+    /// location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTPCREL
+    MO_GOTPCREL,
+
+    /// MO_PLT - On a symbol operand this indicates that the immediate is
+    /// offset to the PLT entry of symbol name from the current code location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @PLT
+    MO_PLT,
+
+    /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index structure that contains
+    /// the module number and variable offset for the symbol. Used in the
+    /// general dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSGD
+    MO_TLSGD,
+
+    /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index for the module that
+    /// contains the symbol. When this index is passed to a call to
+    /// __tls_get_addr, the function will return the base address of the TLS
+    /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSLD
+    MO_TLSLD,
+
+    /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index for the module that
+    /// contains the symbol. When this index is passed to a call to
+    /// ___tls_get_addr, the function will return the base address of the TLS
+    /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSLDM
+    MO_TLSLDM,
+
+    /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the thread-pointer offset for the
+    /// symbol. Used in the x86-64 initial exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTTPOFF
+    MO_GOTTPOFF,
+
+    /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+    /// the absolute address of the GOT entry with the negative thread-pointer
+    /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+    /// model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @INDNTPOFF
+    MO_INDNTPOFF,
+
+    /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+    /// the thread-pointer offset for the symbol. Used in the x86-64 local
+    /// exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TPOFF
+    MO_TPOFF,
+
+    /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS offset of the symbol. Used
+    /// in the local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @DTPOFF
+    MO_DTPOFF,
+
+    /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+    /// the negative thread-pointer offset for the symbol. Used in the IA32
+    /// local exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @NTPOFF
+    MO_NTPOFF,
+
+    /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the negative thread-pointer offset for
+    /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTNTPOFF
+    MO_GOTNTPOFF,
+
+    /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "__imp_FOO" symbol.  This is used for
+    /// dllimport linkage on windows.
+    MO_DLLIMPORT,
+
+    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
+    /// and jumps to external functions on Tiger and earlier.
+    MO_DARWIN_STUB,
+
+    /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+    /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY,
+
+    /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+    /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+    /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY_PIC_BASE,
+
+    /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+    /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
+    /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+    /// stub.
+    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+    /// MO_TLVP - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// This is the TLS offset for the Darwin TLS mechanism.
+    MO_TLVP,
+
+    /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+    /// is some TLS offset from the picbase.
+    ///
+    /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+    MO_TLVP_PIC_BASE,
+
+    /// MO_SECREL - On a symbol operand this indicates that the immediate is
+    /// the offset from beginning of section.
+    ///
+    /// This is the TLS offset for the COFF/Windows TLS mechanism.
+    MO_SECREL
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for X86
+    // instructions.
+    //
+
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo         = 0,
+
+    /// Raw - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm         = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that have
+    /// their one register operand added to their opcode.
+    AddRegFrm      = 2,
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg     = 3,
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem     = 4,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg      = 5,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem      = 6,
+
+    /// RawFrmMemOffs - This form is for instructions that store an absolute
+    /// memory offset as an immediate with a possible segment override.
+    RawFrmMemOffs  = 7,
+
+    /// RawFrmSrc - This form is for instructions that use the source index
+    /// register SI/ESI/RSI with a possible segment override.
+    RawFrmSrc      = 8,
+
+    /// RawFrmDst - This form is for instructions that use the destination index
+    /// register DI/EDI/ESI.
+    RawFrmDst      = 9,
+
+    /// RawFrmSrc - This form is for instructions that use the the source index
+    /// register SI/ESI/ERI with a possible segment override, and also the
+    /// destination index register DI/ESI/RDI.
+    RawFrmDstSrc   = 10,
+
+    /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+    /// immediates, the first of which is a 16-bit immediate (specified by
+    /// the imm encoding) and the second is a 8-bit fixed value.
+    RawFrmImm8 = 11,
+
+    /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+    /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+    /// the imm encoding) and the second is a 16-bit fixed value.  In the AMD
+    /// manual, this operand is described as pntr16:32 and pntr16:16
+    RawFrmImm16 = 12,
+
+    /// MRMX[rm] - The forms are used to represent instructions that use a
+    /// Mod/RM byte, and don't use the middle field for anything.
+    MRMXr = 14, MRMXm = 15,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the intel manual these are represented as /0, /1, ...
+    ///
+
+    // First, instructions that operate on a register r/m operand...
+    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
+    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
+    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
+
+    //// MRM_XX - A mod/rm byte of exactly 0xXX.
+    MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35,
+    MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39,
+    MRM_CB = 40, MRM_D0 = 41, MRM_D1 = 42, MRM_D4 = 43,
+    MRM_D5 = 44, MRM_D6 = 45, MRM_D8 = 46, MRM_D9 = 47,
+    MRM_DA = 48, MRM_DB = 49, MRM_DC = 50, MRM_DD = 51,
+    MRM_DE = 52, MRM_DF = 53, MRM_E0 = 54, MRM_E1 = 55,
+    MRM_E2 = 56, MRM_E3 = 57, MRM_E4 = 58, MRM_E5 = 59,
+    MRM_E8 = 60, MRM_E9 = 61, MRM_EA = 62, MRM_EB = 63,
+    MRM_EC = 64, MRM_ED = 65, MRM_EE = 66, MRM_F0 = 67,
+    MRM_F1 = 68, MRM_F2 = 69, MRM_F3 = 70, MRM_F4 = 71,
+    MRM_F5 = 72, MRM_F6 = 73, MRM_F7 = 74, MRM_F8 = 75,
+    MRM_F9 = 76, MRM_FA = 77, MRM_FB = 78, MRM_FC = 79,
+    MRM_FD = 80, MRM_FE = 81, MRM_FF = 82,
+
+    FormMask       = 127,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
+    // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
+    // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
+    // prefix in 16-bit mode.
+    OpSizeShift = 7,
+    OpSizeMask = 0x3 << OpSizeShift,
+
+    OpSize16 = 1,
+    OpSize32 = 2,
+
+    // AsSize - Set if this instruction requires an operand size prefix (0x67),
+    // which most often indicates that the instruction address 16 bit address
+    // instead of 32 bit address (or 32 bit address in 64 bit mode).
+    AdSizeShift = OpSizeShift + 2,
+    AdSize      = 1 << AdSizeShift,
+
+    //===------------------------------------------------------------------===//
+    // OpPrefix - There are several prefix bytes that are used as opcode
+    // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
+    // no prefix.
+    //
+    OpPrefixShift = AdSizeShift + 1,
+    OpPrefixMask  = 0x7 << OpPrefixShift,
+
+    // PS, PD - Prefix code for packed single and double precision vector
+    // floating point operations performed in the SSE registers.
+    PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XS = 3 << OpPrefixShift,  XD = 4 << OpPrefixShift,
+
+    //===------------------------------------------------------------------===//
+    // OpMap - This field determines which opcode map this instruction
+    // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
+    //
+    OpMapShift = OpPrefixShift + 3,
+    OpMapMask  = 0x7 << OpMapShift,
+
+    // OB - OneByte - Set if this instruction has a one byte opcode.
+    OB = 0 << OpMapShift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB = 1 << OpMapShift,
+
+    // T8, TA - Prefix after the 0x0F prefix.
+    T8 = 2 << OpMapShift,  TA = 3 << OpMapShift,
+
+    // XOP8 - Prefix to include use of imm byte.
+    XOP8 = 4 << OpMapShift,
+
+    // XOP9 - Prefix to exclude use of imm byte.
+    XOP9 = 5 << OpMapShift,
+
+    // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+    XOPA = 6 << OpMapShift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only cares about REX.W and REX.R bits and only the former is
+    // statically determined.
+    //
+    REXShift    = OpMapShift + 3,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero is
+    // unused so that we can tell if we forgot to set a value.
+    ImmShift = REXShift + 1,
+    ImmMask    = 15 << ImmShift,
+    Imm8       = 1 << ImmShift,
+    Imm8PCRel  = 2 << ImmShift,
+    Imm16      = 3 << ImmShift,
+    Imm16PCRel = 4 << ImmShift,
+    Imm32      = 5 << ImmShift,
+    Imm32PCRel = 6 << ImmShift,
+    Imm32S     = 7 << ImmShift,
+    Imm64      = 8 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = ImmShift + 4,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+    // result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Lock prefix
+    LOCKShift = FPTypeShift + 3,
+    LOCK = 1 << LOCKShift,
+
+    // REP prefix
+    REPShift = LOCKShift + 1,
+    REP = 1 << REPShift,
+
+    // Execution domain for SSE instructions.
+    // 0 means normal, non-SSE instruction.
+    SSEDomainShift = REPShift + 1,
+
+    // Encoding
+    EncodingShift = SSEDomainShift + 2,
+    EncodingMask = 0x3 << EncodingShift,
+
+    // VEX - encoding using 0xC4/0xC5
+    VEX = 1,
+
+    /// XOP - Opcode prefix used by XOP instructions.
+    XOP = 2,
+
+    // VEX_EVEX - Specifies that this instruction use EVEX form which provides
+    // syntax support up to 32 512-bit register operands and up to 7 16-bit
+    // mask operands as well as source operand data swizzling/memory operand
+    // conversion, eviction hint, and rounding mode.
+    EVEX = 3,
+
+    // Opcode
+    OpcodeShift   = EncodingShift + 2,
+
+    //===------------------------------------------------------------------===//
+    /// VEX - The opcode prefix used by AVX instructions
+    VEXShift = OpcodeShift + 8,
+
+    /// VEX_W - Has a opcode specific functionality, but is used in the same
+    /// way as REX_W is for regular SSE instructions.
+    VEX_W       = 1U << 0,
+
+    /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+    /// address instructions in SSE are represented as 3 address ones in AVX
+    /// and the additional register is encoded in VEX_VVVV prefix.
+    VEX_4V      = 1U << 1,
+
+    /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
+    /// operand 3 with VEX.vvvv.
+    VEX_4VOp3   = 1U << 2,
+
+    /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
+    /// must be encoded in the i8 immediate field. This usually happens in
+    /// instructions with 4 operands.
+    VEX_I8IMM   = 1U << 3,
+
+    /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+    /// instruction uses 256-bit wide registers. This is usually auto detected
+    /// if a VR256 register is used, but some AVX instructions also have this
+    /// field marked when using a f256 memory references.
+    VEX_L       = 1U << 4,
+
+    // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+    // prefix. Usually used for scalar instructions. Needed by disassembler.
+    VEX_LIG     = 1U << 5,
+
+    // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
+    // with following encoding:
+    // - 00 V128
+    // - 01 V256
+    // - 10 V512
+    // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros.
+    // this will save 1 tsflag bit
+
+    // EVEX_K - Set if this instruction requires masking
+    EVEX_K      = 1U << 6,
+
+    // EVEX_Z - Set if this instruction has EVEX.Z field set.
+    EVEX_Z      = 1U << 7,
+
+    // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+    EVEX_L2     = 1U << 8,
+
+    // EVEX_B - Set if this instruction has EVEX.B field set.
+    EVEX_B      = 1U << 9,
+
+    // The scaling factor for the AVX512's 8-bit compressed displacement.
+    CD8_Scale_Shift = VEXShift + 10,
+    CD8_Scale_Mask = 127,
+
+    /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+    /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
+    /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+    /// storing a classifier in the imm8 field.  To simplify our implementation,
+    /// we handle this by storeing the classifier in the opcode field and using
+    /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+    Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
+    Has3DNow0F0FOpcode = 1U << (Has3DNow0F0FOpcodeShift - VEXShift),
+
+    /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
+    /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
+    MemOp4Shift = Has3DNow0F0FOpcodeShift + 1,
+    MemOp4 = 1U << (MemOp4Shift - VEXShift),
+
+    /// Explicitly specified rounding control
+    EVEX_RCShift = MemOp4Shift + 1,
+    EVEX_RC = 1U << (EVEX_RCShift - VEXShift)
+  };
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified machine instruction.
+  //
+  inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+    return TSFlags >> X86II::OpcodeShift;
+  }
+
+  inline bool hasImm(uint64_t TSFlags) {
+    return (TSFlags & X86II::ImmMask) != 0;
+  }
+
+  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+  /// of the specified instruction.
+  inline unsigned getSizeOfImm(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:  return 1;
+    case X86II::Imm16:
+    case X86II::Imm16PCRel: return 2;
+    case X86II::Imm32:
+    case X86II::Imm32S:
+    case X86II::Imm32PCRel: return 4;
+    case X86II::Imm64:      return 8;
+    }
+  }
+
+  /// isImmPCRel - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is pc relative.
+  inline unsigned isImmPCRel(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8PCRel:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32PCRel:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm16:
+    case X86II::Imm32:
+    case X86II::Imm32S:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// isImmSigned - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is signed.
+  inline unsigned isImmSigned(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate signedness");
+    case X86II::Imm32S:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:
+    case X86II::Imm16:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32:
+    case X86II::Imm32PCRel:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// getOperandBias - compute any additional adjustment needed to
+  ///                  the offset to the start of the memory operand
+  ///                  in this instruction.
+  /// If this is a two-address instruction,skip one of the register operands.
+  /// FIXME: This should be handled during MCInst lowering.
+  inline int getOperandBias(const MCInstrDesc& Desc)
+  {
+    unsigned NumOps = Desc.getNumOperands();
+    unsigned CurOp = 0;
+    if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+      ++CurOp;
+    else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+             Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+      // Special case for AVX-512 GATHER with 2 TIED_TO operands
+      // Skip the first 2 operands: dst, mask_wb
+      CurOp += 2;
+    else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+             Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
+      // Special case for GATHER with 2 TIED_TO operands
+      // Skip the first 2 operands: dst, mask_wb
+      CurOp += 2;
+    else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+      // SCATTER
+      ++CurOp;
+    return CurOp;
+  }
+
+  /// getMemoryOperandNo - The function returns the MCInst operand # for the
+  /// first field of the memory operand.  If the instruction doesn't have a
+  /// memory operand, this returns -1.
+  ///
+  /// Note that this ignores tied operands.  If there is a tied register which
+  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+  /// counted as one operand.
+  ///
+  inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+    bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+    bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+    bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+    
+    switch (TSFlags & X86II::FormMask) {
+    default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+    case X86II::Pseudo:
+    case X86II::RawFrm:
+    case X86II::AddRegFrm:
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+    case X86II::RawFrmImm8:
+    case X86II::RawFrmImm16:
+    case X86II::RawFrmMemOffs:
+    case X86II::RawFrmSrc:
+    case X86II::RawFrmDst:
+    case X86II::RawFrmDstSrc:
+       return -1;
+    case X86II::MRMDestMem:
+      return 0;
+    case X86II::MRMSrcMem: {
+      unsigned FirstMemOp = 1;
+      if (HasVEX_4V)
+        ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
+      if (HasMemOp4)
+        ++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
+      if (HasEVEX_K)
+        ++FirstMemOp;// Skip the mask register
+      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
+      //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+      //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+      return FirstMemOp;
+    }
+    case X86II::MRMXr:
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      return -1;
+    case X86II::MRMXm:
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m: {
+      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+      unsigned FirstMemOp = 0;
+      if (HasVEX_4V)
+        ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
+      if (HasEVEX_K)
+        ++FirstMemOp;// Skip the mask register
+      return FirstMemOp;
+    }
+    case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+    case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+    case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+    case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+    case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+    case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+    case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+    case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
+    case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
+    case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
+    case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+    case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
+    case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
+    case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
+    case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
+    case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
+    case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
+      return -1;
+    }
+  }
+
+  /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
+  /// higher) register?  e.g. r8, xmm8, xmm13, etc.
+  inline bool isX86_64ExtendedReg(unsigned RegNo) {
+    if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) ||
+        (RegNo > X86::XMM23 && RegNo <= X86::XMM31) ||
+        (RegNo > X86::YMM7 && RegNo <= X86::YMM15) ||
+        (RegNo > X86::YMM23 && RegNo <= X86::YMM31) ||
+        (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) ||
+        (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31))
+      return true;
+
+    switch (RegNo) {
+    default: break;
+    case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
+    case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
+    case X86::R8D:   case X86::R9D:   case X86::R10D:  case X86::R11D:
+    case X86::R12D:  case X86::R13D:  case X86::R14D:  case X86::R15D:
+    case X86::R8W:   case X86::R9W:   case X86::R10W:  case X86::R11W:
+    case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
+    case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
+    case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
+    case X86::CR8:   case X86::CR9:   case X86::CR10:  case X86::CR11:
+    case X86::CR12:  case X86::CR13:  case X86::CR14:  case X86::CR15:
+        return true;
+    }
+    return false;
+  }
+
+  /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
+  /// registers? e.g. zmm21, etc.
+  static inline bool is32ExtendedReg(unsigned RegNo) {
+    return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) ||
+            (RegNo > X86::YMM15 && RegNo <= X86::YMM31) ||
+            (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
+  }
+
+  
+  inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+    return (reg == X86::SPL || reg == X86::BPL ||
+            reg == X86::SIL || reg == X86::DIL);
+  }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 0000000..3fdec87
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,272 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+
+    virtual ~X86ELFObjectWriter();
+  protected:
+    unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsPCRel) const override;
+  };
+}
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+                                       uint16_t EMachine)
+  : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+                            // Only i386 uses Rel instead of RelA.
+                            /*HasRelocationAddend*/ EMachine != ELF::EM_386) {}
+
+X86ELFObjectWriter::~X86ELFObjectWriter()
+{}
+
+unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  // determine the type of the relocation
+
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+  unsigned Type;
+  if (getEMachine() == ELF::EM_X86_64) {
+    if (IsPCRel) {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+
+      case FK_Data_8: Type = ELF::R_X86_64_PC64; break;
+      case FK_Data_4: Type = ELF::R_X86_64_PC32; break;
+      case FK_Data_2: Type = ELF::R_X86_64_PC16; break;
+      case FK_Data_1: Type = ELF::R_X86_64_PC8; break;
+
+      case FK_PCRel_8:
+        assert(Modifier == MCSymbolRefExpr::VK_None);
+        Type = ELF::R_X86_64_PC64;
+        break;
+      case X86::reloc_signed_4byte:
+      case X86::reloc_riprel_4byte_movq_load:
+      case X86::reloc_riprel_4byte:
+      case FK_PCRel_4:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_X86_64_PC32;
+          break;
+        case MCSymbolRefExpr::VK_PLT:
+          Type = ELF::R_X86_64_PLT32;
+          break;
+        case MCSymbolRefExpr::VK_GOTPCREL:
+          Type = ELF::R_X86_64_GOTPCREL;
+          break;
+        case MCSymbolRefExpr::VK_GOTTPOFF:
+          Type = ELF::R_X86_64_GOTTPOFF;
+        break;
+        case MCSymbolRefExpr::VK_TLSGD:
+          Type = ELF::R_X86_64_TLSGD;
+          break;
+        case MCSymbolRefExpr::VK_TLSLD:
+          Type = ELF::R_X86_64_TLSLD;
+          break;
+        }
+        break;
+      case FK_PCRel_2:
+        assert(Modifier == MCSymbolRefExpr::VK_None);
+        Type = ELF::R_X86_64_PC16;
+        break;
+      case FK_PCRel_1:
+        assert(Modifier == MCSymbolRefExpr::VK_None);
+        Type = ELF::R_X86_64_PC8;
+        break;
+      }
+    } else {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+      case X86::reloc_global_offset_table8:
+        Type = ELF::R_X86_64_GOTPC64;
+        break;
+      case X86::reloc_global_offset_table:
+        Type = ELF::R_X86_64_GOTPC32;
+        break;
+      case FK_Data_8:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_X86_64_64;
+          break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_X86_64_GOT64;
+          break;
+        case MCSymbolRefExpr::VK_GOTOFF:
+          Type = ELF::R_X86_64_GOTOFF64;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_X86_64_TPOFF64;
+          break;
+        case MCSymbolRefExpr::VK_DTPOFF:
+          Type = ELF::R_X86_64_DTPOFF64;
+          break;
+        }
+        break;
+      case X86::reloc_signed_4byte:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_X86_64_32S;
+          break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_X86_64_GOT32;
+          break;
+        case MCSymbolRefExpr::VK_GOTPCREL:
+          Type = ELF::R_X86_64_GOTPCREL;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_X86_64_TPOFF32;
+          break;
+        case MCSymbolRefExpr::VK_DTPOFF:
+          Type = ELF::R_X86_64_DTPOFF32;
+          break;
+        }
+        break;
+      case FK_Data_4:
+        Type = ELF::R_X86_64_32;
+        break;
+      case FK_Data_2: Type = ELF::R_X86_64_16; break;
+      case FK_PCRel_1:
+      case FK_Data_1: Type = ELF::R_X86_64_8; break;
+      }
+    }
+  } else if (getEMachine() == ELF::EM_386) {
+    if (IsPCRel) {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+
+      case X86::reloc_global_offset_table:
+        Type = ELF::R_386_GOTPC;
+        break;
+
+      case FK_PCRel_1:
+      case FK_Data_1:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_386_PC8;
+          break;
+        }
+        break;
+
+      case FK_PCRel_2:
+      case FK_Data_2:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_386_PC16;
+          break;
+        }
+        break;
+
+      case X86::reloc_signed_4byte:
+      case FK_PCRel_4:
+      case FK_Data_4:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_386_PC32;
+          break;
+        case MCSymbolRefExpr::VK_PLT:
+          Type = ELF::R_386_PLT32;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+
+      case X86::reloc_global_offset_table:
+        Type = ELF::R_386_GOTPC;
+        break;
+
+      // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode
+      // instead?
+      case X86::reloc_signed_4byte:
+      case FK_PCRel_4:
+      case FK_Data_4:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_386_32;
+          break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_386_GOT32;
+          break;
+        case MCSymbolRefExpr::VK_GOTOFF:
+          Type = ELF::R_386_GOTOFF;
+          break;
+        case MCSymbolRefExpr::VK_TLSGD:
+          Type = ELF::R_386_TLS_GD;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_386_TLS_LE_32;
+          break;
+        case MCSymbolRefExpr::VK_INDNTPOFF:
+          Type = ELF::R_386_TLS_IE;
+          break;
+        case MCSymbolRefExpr::VK_NTPOFF:
+          Type = ELF::R_386_TLS_LE;
+          break;
+        case MCSymbolRefExpr::VK_GOTNTPOFF:
+          Type = ELF::R_386_TLS_GOTIE;
+          break;
+        case MCSymbolRefExpr::VK_TLSLDM:
+          Type = ELF::R_386_TLS_LDM;
+          break;
+        case MCSymbolRefExpr::VK_DTPOFF:
+          Type = ELF::R_386_TLS_LDO_32;
+          break;
+        case MCSymbolRefExpr::VK_GOTTPOFF:
+          Type = ELF::R_386_TLS_IE_32;
+          break;
+        }
+        break;
+      case FK_Data_2: Type = ELF::R_386_16; break;
+      case FK_PCRel_1:
+      case FK_Data_1: Type = ELF::R_386_8; break;
+      }
+    }
+  } else
+    llvm_unreachable("Unsupported ELF machine type.");
+
+  return Type;
+}
+
+MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS,
+                                               bool IsELF64,
+                                               uint8_t OSABI,
+                                               uint16_t EMachine) {
+  MCELFObjectTargetWriter *MOTW =
+    new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
new file mode 100644
index 0000000..b679316
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -0,0 +1,135 @@
+//===-- X86ELFRelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+using namespace object;
+using namespace ELF;
+
+namespace {
+class X86_64ELFRelocationInfo : public MCRelocationInfo {
+public:
+  X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForRelocation(RelocationRef Rel) override {
+    uint64_t RelType; Rel.getType(RelType);
+    symbol_iterator SymI = Rel.getSymbol();
+
+    StringRef SymName; SymI->getName(SymName);
+    uint64_t  SymAddr; SymI->getAddress(SymAddr);
+    uint64_t  SymSize; SymI->getSize(SymSize);
+    int64_t  Addend;  getELFRelocationAddend(Rel, Addend);
+
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+    // FIXME: check that the value is actually the same.
+    if (Sym->isVariable() == false)
+      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+
+    const MCExpr *Expr = nullptr;
+    // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
+    bool hasAddend = false;
+
+    // The AMD64 SysV ABI says:
+    // A: the addend used to compute the value of the relocatable field.
+    // B: the base address at which a shared object has been loaded into memory
+    //    during execution. Generally, a shared object is built with a 0 base
+    //    virtual address, but the execution address will be different.
+    // G: the offset into the global offset table at which the relocation
+    //    entry's symbol will reside during execution.
+    // GOT: the address of the global offset table.
+    // L: the place (section offset or address) of the Procedure Linkage Table
+    //    entry for a symbol.
+    // P: the place (section offset or address) of the storage unit being
+    //    relocated (computed using r_offset).
+    // S: the value of the symbol whose index resides in the relocation entry.
+    // Z: the size of the symbol whose index resides in the relocation entry.
+
+    switch(RelType) {
+    case R_X86_64_NONE:
+    case R_X86_64_COPY:
+      // none
+      break;
+    case R_X86_64_64:
+    case R_X86_64_16:
+    case R_X86_64_8:
+      // S + A
+    case R_X86_64_32:
+    case R_X86_64_32S:
+      // S + A (We don't care about the result not fitting in 32 bits.)
+    case R_X86_64_PC32:
+    case R_X86_64_PC16:
+    case R_X86_64_PC8:
+    case R_X86_64_PC64:
+      // S + A - P (P/pcrel is implicit)
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    case R_X86_64_GOT32:
+    case R_X86_64_GOT64:
+    case R_X86_64_GOTPC32:
+    case R_X86_64_GOTPC64:
+    case R_X86_64_GOTPLT64:
+      // G + A
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+      break;
+    case R_X86_64_PLT32:
+      // L + A - P -> S@PLT + A
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
+      break;
+    case R_X86_64_GLOB_DAT:
+    case R_X86_64_JUMP_SLOT:
+      // S
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    case R_X86_64_GOTPCREL:
+    case R_X86_64_GOTPCREL64:
+      // G + GOT + A - P -> S@GOTPCREL + A
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+      break;
+    case R_X86_64_GOTOFF64:
+      // S + A - GOT
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+      break;
+    case R_X86_64_PLTOFF64:
+      // L + A - GOT
+      break;
+    case R_X86_64_SIZE32:
+    case R_X86_64_SIZE64:
+      // Z + A
+      Expr = MCConstantExpr::Create(SymSize, Ctx);
+      break;
+    default:
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    }
+    if (Expr && hasAddend && Addend != 0)
+      Expr = MCBinaryExpr::CreateAdd(Expr,
+                                     MCConstantExpr::Create(Addend, Ctx),
+                                     Ctx);
+    return Expr;
+  }
+};
+} // End unnamed namespace
+
+/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) {
+  // We only handle x86-64 for now.
+  return new X86_64ELFRelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 0000000..09396b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,34 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_X86_X86FIXUPKINDS_H
+#define LLVM_X86_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+  reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+  reloc_riprel_4byte_movq_load,              // 32-bit rip-relative in movq
+  reloc_signed_4byte,                        // 32-bit signed. Unlike FK_Data_4
+                                             // this will be sign extended at
+                                             // runtime.
+  reloc_global_offset_table,                 // 32-bit, relative to the start
+                                             // of the instruction. Used only
+                                             // for _GLOBAL_OFFSET_TABLE_.
+  reloc_global_offset_table8,                // 64-bit variant.
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 0000000..b1411bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,177 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+  // Note: This numbering has to match the GCC assembler dialects for inline
+  // asm alternatives to work right.
+  ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+  cl::desc("Choose style of code to emit from X86 backend:"),
+  cl::values(clEnumValN(ATT,   "att",   "Emit AT&T-style assembly"),
+             clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+             clEnumValEnd));
+
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(false),
+  cl::desc("Mark code section jump table data regions."),
+  cl::Hidden);
+
+void X86MCAsmInfoDarwin::anchor() { }
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::x86_64;
+  if (is64Bit)
+    PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  if (!is64Bit)
+    Data64bitsDirective = nullptr;       // we can't emit a 64-bit unit
+
+  // Use ## as a comment string so that .s files generated by llvm can go
+  // through the GCC preprocessor without causing an error.  This is needed
+  // because "clang foo.s" runs the C preprocessor, which is usually reserved
+  // for .S files on other systems.  Perhaps this is because the file system
+  // wasn't always case preserving or something.
+  CommentString = "##";
+
+  SupportsDebugInformation = true;
+  UseDataRegionDirectives = MarkedJTDataRegions;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  // old assembler lacks some directives
+  // FIXME: this should really be a check on the assembler characteristics
+  // rather than OS version
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+    HasWeakDefCanBeHiddenDirective = false;
+
+  // Assume ld64 is new enough that the abs-ified FDE relocs may be used
+  // (actually, must, since otherwise the non-extern relocations we produce
+  // overwhelm ld64's tiny little mind and it fails).
+  DwarfFDESymbolsUseAbsDiff = true;
+
+  UseIntegratedAssembler = true;
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+  : X86MCAsmInfoDarwin(Triple) {
+}
+
+void X86ELFMCAsmInfo::anchor() { }
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::x86_64;
+  bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+  // For ELF, x86-64 pointer size depends on the ABI.
+  // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+  // with the x32 ABI, pointer size remains the default 4.
+  PointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+  // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+  CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+
+  // Debug Information
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split
+  // into two .words.
+  if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
+       T.getArch() == Triple::x86)
+    Data64bitsDirective = nullptr;
+
+  // Always enable the integrated assembler by default.
+  // Clang also enabled it when the OS is Solaris but that is redundant here.
+  UseIntegratedAssembler = true;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                                   unsigned Encoding,
+                                                   MCStreamer &Streamer) const {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+  const MCExpr *Four = MCConstantExpr::Create(4, Context);
+  return MCBinaryExpr::CreateAdd(Res, Four, Context);
+}
+
+const MCSection *X86ELFMCAsmInfo::
+getNonexecutableStackSection(MCContext &Ctx) const {
+  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+                           0, SectionKind::getMetadata());
+}
+
+void X86MCAsmInfoMicrosoft::anchor() { }
+
+X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+  if (Triple.getArch() == Triple::x86_64) {
+    PrivateGlobalPrefix = ".L";
+    PointerSize = 8;
+    ExceptionsType = ExceptionHandling::WinEH;
+  }
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  AllowAtInName = true;
+
+  UseIntegratedAssembler = true;
+}
+
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
+X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+  assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
+  if (Triple.getArch() == Triple::x86_64) {
+    PrivateGlobalPrefix = ".L";
+    PointerSize = 8;
+    ExceptionsType = ExceptionHandling::WinEH;
+  } else {
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+  }
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 0000000..a7509b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,59 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+  class Triple;
+
+  class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+    void anchor() override;
+  public:
+    explicit X86MCAsmInfoDarwin(const Triple &Triple);
+  };
+
+  struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+    explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+    const MCExpr *
+    getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                                MCStreamer &Streamer) const override;
+  };
+
+  class X86ELFMCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit X86ELFMCAsmInfo(const Triple &Triple);
+    const MCSection *
+    getNonexecutableStackSection(MCContext &Ctx) const override;
+  };
+
+  class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+    void anchor() override;
+  public:
+    explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+  };
+
+  class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+    void anchor() override;
+  public:
+    explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+  };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 0000000..075db11
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1559 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+  X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+public:
+  X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+    : MCII(mcii), Ctx(ctx) {
+  }
+
+  ~X86MCCodeEmitter() {}
+
+  bool is64BitMode(const MCSubtargetInfo &STI) const {
+    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+  }
+
+  bool is32BitMode(const MCSubtargetInfo &STI) const {
+    return (STI.getFeatureBits() & X86::Mode32Bit) != 0;
+  }
+
+  bool is16BitMode(const MCSubtargetInfo &STI) const {
+    return (STI.getFeatureBits() & X86::Mode16Bit) != 0;
+  }
+
+  /// Is16BitMemOperand - Return true if the specified instruction has
+  /// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+  bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
+                         const MCSubtargetInfo &STI) const {
+    const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+    const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+    const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
+
+    if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
+        Disp.isImm() && Disp.getImm() < 0x10000)
+      return true;
+    if ((BaseReg.getReg() != 0 &&
+         X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+        (IndexReg.getReg() != 0 &&
+         X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+      return true;
+    return false;
+  }
+
+  unsigned GetX86RegNum(const MCOperand &MO) const {
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+  }
+
+  // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+  // 0-7 and the difference between the 2 groups is given by the REX prefix.
+  // In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+  // in 1's complement form, example:
+  //
+  //  ModRM field => XMM9 => 1
+  //  VEX.VVVV    => XMM9 => ~9
+  //
+  // See table 4-35 of Intel AVX Programming Reference for details.
+  unsigned char getVEXRegisterEncoding(const MCInst &MI,
+                                       unsigned OpNum) const {
+    unsigned SrcReg = MI.getOperand(OpNum).getReg();
+    unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    if (X86II::isX86_64ExtendedReg(SrcReg))
+      SrcRegNum |= 8;
+
+    // The registers represented through VEX_VVVV should
+    // be encoded in 1's complement form.
+    return (~SrcRegNum) & 0xf;
+  }
+
+  unsigned char getWriteMaskRegisterEncoding(const MCInst &MI,
+                                             unsigned OpNum) const {
+    assert(X86::K0 != MI.getOperand(OpNum).getReg() &&
+           "Invalid mask register as write-mask!");
+    unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    return MaskRegNum;
+  }
+
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+                     unsigned ImmSize, MCFixupKind FixupKind,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     int ImmOffset = 0) const;
+
+  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                        unsigned RM) {
+    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+    return RM | (RegOpcode << 3) | (Mod << 6);
+  }
+
+  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+                        unsigned &CurByte, raw_ostream &OS) const {
+    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+  }
+
+  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+                   unsigned &CurByte, raw_ostream &OS) const {
+    // SIB byte is in the same format as the ModRMByte.
+    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+  }
+
+
+  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                        unsigned RegOpcodeField,
+                        uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                           const MCInst &MI, const MCInstrDesc &Desc,
+                           raw_ostream &OS) const;
+
+  void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
+                                 const MCInst &MI, raw_ostream &OS) const;
+
+  void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                        const MCInst &MI, const MCInstrDesc &Desc,
+                        const MCSubtargetInfo &STI,
+                        raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new X86MCCodeEmitter(MCII, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+/// isCDisp8 - Return true if this signed displacement fits in a 8-bit
+/// compressed dispacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+  assert(((TSFlags & X86II::EncodingMask) >>
+          X86II::EncodingShift == X86II::EVEX) &&
+         "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+  unsigned CD8_Scale =
+    (TSFlags >> X86II::CD8_Scale_Shift) & X86II::CD8_Scale_Mask;
+  if (CD8_Scale == 0) {
+    CValue = Value;
+    return isDisp8(Value);
+  }
+
+  unsigned Mask = CD8_Scale - 1;
+  assert((CD8_Scale & Mask) == 0 && "Invalid memory object size.");
+  if (Value & Mask) // Unaligned offset
+    return false;
+  Value /= (int)CD8_Scale;
+  bool Ret = (Value == (signed char)Value);
+
+  if (Ret)
+    CValue = Value;
+  return Ret;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+  unsigned Size = X86II::getSizeOfImm(TSFlags);
+  bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+  if (X86II::isImmSigned(TSFlags)) {
+    switch (Size) {
+    default: llvm_unreachable("Unsupported signed fixup size!");
+    case 4: return MCFixupKind(X86::reloc_signed_4byte);
+    }
+  }
+  return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+#endif
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+///  _GLOBAL_OFFSET_TABLE_ and if it is of the form
+///  _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that
+/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+  GOT_None,
+  GOT_Normal,
+  GOT_SymDiff
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+  const MCExpr *RHS = nullptr;
+  if (Expr->getKind() == MCExpr::Binary) {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+    Expr = BE->getLHS();
+    RHS = BE->getRHS();
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    return GOT_None;
+
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+  const MCSymbol &S = Ref->getSymbol();
+  if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+    return GOT_None;
+  if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+    return GOT_SymDiff;
+  return GOT_Normal;
+}
+
+static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+  if (Expr->getKind() == MCExpr::SymbolRef) {
+    const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+    return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+  }
+  return false;
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+              MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+              SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+  const MCExpr *Expr = nullptr;
+  if (DispOp.isImm()) {
+    // If this is a simple integer displacement that doesn't require a
+    // relocation, emit it now.
+    if (FixupKind != FK_PCRel_1 &&
+        FixupKind != FK_PCRel_2 &&
+        FixupKind != FK_PCRel_4) {
+      EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+      return;
+    }
+    Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx);
+  } else {
+    Expr = DispOp.getExpr();
+  }
+
+  // If we have an immoffset, add it to the expression.
+  if ((FixupKind == FK_Data_4 ||
+       FixupKind == FK_Data_8 ||
+       FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+    GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+    if (Kind != GOT_None) {
+      assert(ImmOffset == 0);
+
+      if (Size == 8) {
+        FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+      } else {
+        assert(Size == 4);
+        FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+      }
+
+      if (Kind == GOT_Normal)
+        ImmOffset = CurByte;
+    } else if (Expr->getKind() == MCExpr::SymbolRef) {
+      if (HasSecRelSymbolRef(Expr)) {
+        FixupKind = MCFixupKind(FK_SecRel_4);
+      }
+    } else if (Expr->getKind() == MCExpr::Binary) {
+      const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
+      if (HasSecRelSymbolRef(Bin->getLHS())
+          || HasSecRelSymbolRef(Bin->getRHS())) {
+        FixupKind = MCFixupKind(FK_SecRel_4);
+      }
+    }
+  }
+
+  // If the fixup is pc-relative, we need to bias the value to be relative to
+  // the start of the field, not the end of the field.
+  if (FixupKind == FK_PCRel_4 ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+    ImmOffset -= 4;
+  if (FixupKind == FK_PCRel_2)
+    ImmOffset -= 2;
+  if (FixupKind == FK_PCRel_1)
+    ImmOffset -= 1;
+
+  if (ImmOffset)
+    Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
+                                   Ctx);
+
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc));
+  EmitConstant(0, Size, CurByte, OS);
+}
+
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                                        unsigned RegOpcodeField,
+                                        uint64_t TSFlags, unsigned &CurByte,
+                                        raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const{
+  const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
+  const MCOperand &Base     = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+  unsigned BaseReg = Base.getReg();
+  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+                           X86II::EncodingShift;
+  bool HasEVEX = (Encoding == X86II::EVEX);
+
+  // Handle %rip relative addressing.
+  if (BaseReg == X86::RIP) {    // [disp32+RIP] in X86-64 mode
+    assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
+    assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+    EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+    unsigned FixupKind = X86::reloc_riprel_4byte;
+
+    // movq loads are handled with a special relocation form which allows the
+    // linker to eliminate some loads for GOT references which end up in the
+    // same linkage unit.
+    if (MI.getOpcode() == X86::MOV64rm)
+      FixupKind = X86::reloc_riprel_4byte_movq_load;
+
+    // rip-relative addressing is actually relative to the *next* instruction.
+    // Since an immediate can follow the mod/rm byte for an instruction, this
+    // means that we need to bias the immediate field of the instruction with
+    // the size of the immediate field.  If we have this case, add it into the
+    // expression to emit.
+    int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
+                  CurByte, OS, Fixups, -ImmSize);
+    return;
+  }
+
+  unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+  // 16-bit addressing forms of the ModR/M byte have a different encoding for
+  // the R/M field and are far more limited in which registers can be used.
+  if (Is16BitMemOperand(MI, Op, STI)) {
+    if (BaseReg) {
+      // For 32-bit addressing, the row and column values in Table 2-2 are
+      // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
+      // some special cases. And GetX86RegNum reflects that numbering.
+      // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
+      // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
+      // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
+      // while values 0-3 indicate the allowed combinations (base+index) of
+      // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
+      //
+      // R16Table[] is a lookup from the normal RegNo, to the row values from
+      // Table 2-1 for 16-bit addressing modes. Where zero means disallowed.
+      static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
+      unsigned RMfield = R16Table[BaseRegNo];
+
+      assert(RMfield && "invalid 16-bit base register");
+
+      if (IndexReg.getReg()) {
+        unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];
+
+        assert(IndexReg16 && "invalid 16-bit index register");
+        // We must have one of SI/DI (4,5), and one of BP/BX (6,7).
+        assert(((IndexReg16 ^ RMfield) & 2) &&
+               "invalid 16-bit base/index register combination");
+        assert(Scale.getImm() == 1 &&
+               "invalid scale for 16-bit memory reference");
+
+        // Allow base/index to appear in either order (although GAS doesn't).
+        if (IndexReg16 & 2)
+          RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
+        else
+          RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
+      }
+
+      if (Disp.isImm() && isDisp8(Disp.getImm())) {
+        if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+          // There is no displacement; just the register.
+          EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+          return;
+        }
+        // Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
+        EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+        return;
+      }
+      // This is the [REG]+disp16 case.
+      EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+    } else {
+      // There is no BaseReg; this is the plain [disp16] case.
+      EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
+    }
+
+    // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
+    EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+    return;
+  }
+
+  // Determine whether a SIB byte is needed.
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 &&
+      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+      // encode to an R/M value of 4, which indicates that a SIB byte is
+      // present.
+      BaseRegNo != N86::ESP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!is64BitMode(STI) || BaseReg != 0)) {
+
+    if (BaseReg == 0) {          // [disp32]     in X86-32 mode
+      EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+      EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+      return;
+    }
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+      EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+      return;
+    }
+
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (Disp.isImm()) {
+      if (!HasEVEX && isDisp8(Disp.getImm())) {
+        EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+        return;
+      }
+      // Try EVEX compressed 8-bit displacement first; if failed, fall back to
+      // 32-bit displacement.
+      int CDisp8 = 0;
+      if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+        EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+                      CDisp8 - Disp.getImm());
+        return;
+      }
+    }
+
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
+                  Fixups);
+    return;
+  }
+
+  // We need a SIB byte, so start by outputting the ModR/M byte first
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  int CDisp8 = 0;
+  int ImmOffset = 0;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (!Disp.isImm()) {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (Disp.getImm() == 0 &&
+             // Base reg can't be anything that ends up with '5' as the base
+             // reg, it is the magic [*] nomenclature that indicates no base.
+             BaseRegNo != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+  } else if (!HasEVEX && isDisp8(Disp.getImm())) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+    ImmOffset = CDisp8 - Disp.getImm();
+  } else {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+  } else {
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8)
+    EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
+  else if (ForceDisp32 || Disp.getImm() != 0)
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+                  CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                           int MemOperand, const MCInst &MI,
+                                           const MCInstrDesc &Desc,
+                                           raw_ostream &OS) const {
+  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+                           X86II::EncodingShift;
+  bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+  bool HasEVEX_RC = (TSFlags >> X86II::VEXShift) & X86II::EVEX_RC;
+
+  // VEX_R: opcode externsion equivalent to REX.R in
+  // 1's complement (inverted) form
+  //
+  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX_R=1 (64 bit mode only)
+  //
+  unsigned char VEX_R = 0x1;
+  unsigned char EVEX_R2 = 0x1;
+
+  // VEX_X: equivalent to REX.X, only used when a
+  // register is used for index in SIB Byte.
+  //
+  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX.X=1 (64-bit mode only)
+  unsigned char VEX_X = 0x1;
+
+  // VEX_B:
+  //
+  //  1: Same as REX_B=0 (ignored in 32-bit mode)
+  //  0: Same as REX_B=1 (64 bit mode only)
+  //
+  unsigned char VEX_B = 0x1;
+
+  // VEX_W: opcode specific (use like REX.W, or used for
+  // opcode extension, or ignored, depending on the opcode byte)
+  unsigned char VEX_W = 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b00100-0b11111: Reserved for future use
+  //  0b01000: XOP map select - 08h instructions with imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
+  unsigned char VEX_5M = 0;
+
+  // VEX_4V (VEX vvvv field): a register specifier
+  // (in 1's complement form) or 1111 if unused.
+  unsigned char VEX_4V = 0xf;
+  unsigned char EVEX_V2 = 0x1;
+
+  // VEX_L (Vector Length):
+  //
+  //  0: scalar or 128-bit vector
+  //  1: 256-bit vector
+  //
+  unsigned char VEX_L = 0;
+  unsigned char EVEX_L2 = 0;
+
+  // VEX_PP: opcode extension providing equivalent
+  // functionality of a SIMD prefix
+  //
+  //  0b00: None
+  //  0b01: 66
+  //  0b10: F3
+  //  0b11: F2
+  //
+  unsigned char VEX_PP = 0;
+
+  // EVEX_U
+  unsigned char EVEX_U = 1; // Always '1' so far
+
+  // EVEX_z
+  unsigned char EVEX_z = 0;
+
+  // EVEX_b
+  unsigned char EVEX_b = 0;
+
+  // EVEX_rc
+  unsigned char EVEX_rc = 0;
+
+  // EVEX_aaa
+  unsigned char EVEX_aaa = 0;
+
+  bool EncodeRC = false;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+    VEX_W = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+    VEX_L = 1;
+  if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
+    EVEX_L2 = 1;
+
+  if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
+    EVEX_z = 1;
+
+  if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
+    EVEX_b = 1;
+
+  switch (TSFlags & X86II::OpPrefixMask) {
+  default: break; // VEX_PP already correct
+  case X86II::PD: VEX_PP = 0x1; break; // 66
+  case X86II::XS: VEX_PP = 0x2; break; // F3
+  case X86II::XD: VEX_PP = 0x3; break; // F2
+  }
+
+  switch (TSFlags & X86II::OpMapMask) {
+  default: llvm_unreachable("Invalid prefix!");
+  case X86II::TB:   VEX_5M = 0x1; break; // 0F
+  case X86II::T8:   VEX_5M = 0x2; break; // 0F 38
+  case X86II::TA:   VEX_5M = 0x3; break; // 0F 3A
+  case X86II::XOP8: VEX_5M = 0x8; break;
+  case X86II::XOP9: VEX_5M = 0x9; break;
+  case X86II::XOPA: VEX_5M = 0xA; break;
+  }
+
+  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = X86II::getOperandBias(Desc);
+
+  switch (TSFlags & X86II::FormMask) {
+  default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
+  case X86II::RawFrm:
+    break;
+  case X86II::MRMDestMem: {
+    // MRMDestMem instructions forms:
+    //  MemAddr, src1(ModR/M)
+    //  MemAddr, src1(VEX_4V), src2(ModR/M)
+    //  MemAddr, src1(ModR/M), imm8
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + 
+                                                 X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+                                                 X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+                                          X86::AddrIndexReg).getReg()))
+      EVEX_V2 = 0x0;
+
+    CurOp += X86::AddrNumOperands;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
+
+    const MCOperand &MO = MI.getOperand(CurOp);
+    if (MO.isReg()) {
+      if (X86II::isX86_64ExtendedReg(MO.getReg()))
+        VEX_R = 0x0;
+      if (X86II::is32ExtendedReg(MO.getReg()))
+        EVEX_R2 = 0x0;
+    }
+    break;
+  }
+  case X86II::MRMSrcMem:
+    // MRMSrcMem instructions forms:
+    //  src1(ModR/M), MemAddr
+    //  src1(ModR/M), src2(VEX_4V), MemAddr
+    //  src1(ModR/M), MemAddr, imm8
+    //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      EVEX_R2 = 0x0;
+    CurOp++;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
+
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+                               X86::AddrIndexReg).getReg()))
+      EVEX_V2 = 0x0;
+
+    if (HasVEX_4VOp3)
+      // Instruction format for 4VOp3:
+      //   src1(ModR/M), MemAddr, src3(VEX_4V)
+      // CurOp points to start of the MemoryOperand,
+      //   it skips TIED_TO operands if exist, then increments past src1.
+      // CurOp + X86::AddrNumOperands will point to src3.
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+    break;
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    // MRM[0-9]m instructions forms:
+    //  MemAddr
+    //  src1(VEX_4V), MemAddr
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    break;
+  }
+  case X86II::MRMSrcReg:
+    // MRMSrcReg instructions forms:
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M), src1(ModR/M)
+    //  dst(ModR/M), src1(ModR/M), imm8
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      EVEX_R2 = 0x0;
+    CurOp++;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
+
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      CurOp++;
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_X = 0x0;
+    CurOp++;
+    if (HasVEX_4VOp3)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+    if (EVEX_b) {
+      if (HasEVEX_RC) {
+        unsigned RcOperand = NumOps-1;
+        assert(RcOperand >= CurOp);
+        EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+      }
+      EncodeRC = true;
+    }      
+    break;
+  case X86II::MRMDestReg:
+    // MRMDestReg instructions forms:
+    //  dst(ModR/M), src(ModR/M)
+    //  dst(ModR/M), src(ModR/M), imm8
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_X = 0x0;
+    CurOp++;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      EVEX_R2 = 0x0;
+    if (EVEX_b)
+      EncodeRC = true;
+    break;
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    // MRM0r-MRM7r instructions forms:
+    //  dst(VEX_4V), src(ModR/M), imm8
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+          EVEX_V2 = 0x0;
+      CurOp++;
+    }
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_X = 0x0;
+    break;
+  }
+
+  if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
+    // VEX opcode prefix can have 2 or 3 bytes
+    //
+    //  3 bytes:
+    //    +-----+ +--------------+ +-------------------+
+    //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+    //    +-----+ +--------------+ +-------------------+
+    //  2 bytes:
+    //    +-----+ +-------------------+
+    //    | C5h | | R | vvvv | L | pp |
+    //    +-----+ +-------------------+
+    //
+    //  XOP uses a similar prefix:
+    //    +-----+ +--------------+ +-------------------+
+    //    | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+    //    +-----+ +--------------+ +-------------------+
+    unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+    // Can we use the 2 byte VEX prefix?
+    if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+      EmitByte(0xC5, CurByte, OS);
+      EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+      return;
+    }
+
+    // 3 byte VEX prefix
+    EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
+    EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+    EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+  } else {
+    assert(Encoding == X86II::EVEX && "unknown encoding!");
+    // EVEX opcode prefix can have 4 bytes
+    //
+    // +-----+ +--------------+ +-------------------+ +------------------------+
+    // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+    // +-----+ +--------------+ +-------------------+ +------------------------+
+    assert((VEX_5M & 0x3) == VEX_5M
+           && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+    VEX_5M &= 0x3;
+
+    EmitByte(0x62, CurByte, OS);
+    EmitByte((VEX_R   << 7) |
+             (VEX_X   << 6) |
+             (VEX_B   << 5) |
+             (EVEX_R2 << 4) |
+             VEX_5M, CurByte, OS);
+    EmitByte((VEX_W   << 7) |
+             (VEX_4V  << 3) |
+             (EVEX_U  << 2) |
+             VEX_PP, CurByte, OS);
+    if (EncodeRC)
+      EmitByte((EVEX_z  << 7) |
+              (EVEX_rc << 5) |
+              (EVEX_b  << 4) |
+              (EVEX_V2 << 3) |
+              EVEX_aaa, CurByte, OS);
+    else
+      EmitByte((EVEX_z  << 7) |
+              (EVEX_L2 << 6) |
+              (VEX_L   << 5) |
+              (EVEX_b  << 4) |
+              (EVEX_V2 << 3) |
+              EVEX_aaa, CurByte, OS);
+  }
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                                   const MCInstrDesc &Desc) {
+  unsigned REX = 0;
+  if (TSFlags & X86II::REX_W)
+    REX |= 1 << 3; // set REX.W
+
+  if (MI.getNumOperands() == 0) return REX;
+
+  unsigned NumOps = MI.getNumOperands();
+  // FIXME: MCInst should explicitize the two-addrness.
+  bool isTwoAddr = NumOps > 1 &&
+                      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+  unsigned i = isTwoAddr ? 1 : 0;
+  for (; i != NumOps; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
+    // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+    // that returns non-zero.
+    REX |= 0x40; // REX fixed encoding prefix
+    break;
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMSrcReg:
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2; // set REX.R
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 0; // set REX.B
+    }
+    break;
+  case X86II::MRMSrcMem: {
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2; // set REX.R
+    unsigned Bit = 0;
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86II::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
+        Bit++;
+      }
+    }
+    break;
+  }
+  case X86II::MRMXm:
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+  case X86II::MRMDestMem: {
+    unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+    i = isTwoAddr ? 1 : 0;
+    if (NumOps > e && MI.getOperand(e).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+      REX |= 1 << 2; // set REX.R
+    unsigned Bit = 0;
+    for (; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86II::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
+        Bit++;
+      }
+    }
+    break;
+  }
+  default:
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 0; // set REX.B
+    i = isTwoAddr ? 2 : 1;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 2; // set REX.R
+    }
+    break;
+  }
+  return REX;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
+                                                 unsigned SegOperand,
+                                                 const MCInst &MI,
+                                                 raw_ostream &OS) const {
+  // Check for explicit segment override on memory operand.
+  switch (MI.getOperand(SegOperand).getReg()) {
+  default: llvm_unreachable("Unknown segment register!");
+  case 0: break;
+  case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+  case X86::SS: EmitByte(0x36, CurByte, OS); break;
+  case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+  case X86::ES: EmitByte(0x26, CurByte, OS); break;
+  case X86::FS: EmitByte(0x64, CurByte, OS); break;
+  case X86::GS: EmitByte(0x65, CurByte, OS); break;
+  }
+}
+
+/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
+///
+/// MemOperand is the operand # of the start of a memory operand if present.  If
+/// Not present, it is -1.
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                        int MemOperand, const MCInst &MI,
+                                        const MCInstrDesc &Desc,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &OS) const {
+
+  // Emit the operand size opcode prefix as needed.
+  unsigned char OpSize = (TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift;
+  if (OpSize == (is16BitMode(STI) ? X86II::OpSize32 : X86II::OpSize16))
+    EmitByte(0x66, CurByte, OS);
+
+  switch (TSFlags & X86II::OpPrefixMask) {
+  case X86II::PD:   // 66
+    EmitByte(0x66, CurByte, OS);
+    break;
+  case X86II::XS:   // F3
+    EmitByte(0xF3, CurByte, OS);
+    break;
+  case X86II::XD:   // F2
+    EmitByte(0xF2, CurByte, OS);
+    break;
+  }
+
+  // Handle REX prefix.
+  // FIXME: Can this come before F2 etc to simplify emission?
+  if (is64BitMode(STI)) {
+    if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+      EmitByte(0x40 | REX, CurByte, OS);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  switch (TSFlags & X86II::OpMapMask) {
+  case X86II::TB:  // Two-byte opcode map
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+    EmitByte(0x0F, CurByte, OS);
+    break;
+  }
+
+  switch (TSFlags & X86II::OpMapMask) {
+  case X86II::T8:    // 0F 38
+    EmitByte(0x38, CurByte, OS);
+    break;
+  case X86II::TA:    // 0F 3A
+    EmitByte(0x3A, CurByte, OS);
+    break;
+  }
+}
+
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  uint64_t TSFlags = Desc.TSFlags;
+
+  // Pseudo instructions don't get encoded.
+  if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return;
+
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = X86II::getOperandBias(Desc);
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  // Encoding type for this instruction.
+  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+                           X86II::EncodingShift;
+
+  // It uses the VEX.VVVV field?
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+  const unsigned MemOp4_I8IMMOperand = 2;
+
+  // It uses the EVEX.aaa field?
+  bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+  bool HasEVEX_RC = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_RC);
+  
+  // Determine where the memory operand starts, if present.
+  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+  if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+  // Emit the lock opcode prefix as needed.
+  if (TSFlags & X86II::LOCK)
+    EmitByte(0xF0, CurByte, OS);
+
+  // Emit segment override opcode prefix as needed.
+  if (MemoryOperand >= 0)
+    EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
+                              MI, OS);
+
+  // Emit the repeat opcode prefix as needed.
+  if (TSFlags & X86II::REP)
+    EmitByte(0xF3, CurByte, OS);
+
+  // Emit the address size opcode prefix as needed.
+  bool need_address_override;
+  // The AdSize prefix is only for 32-bit and 64-bit modes. Hm, perhaps we
+  // should introduce an AdSize16 bit instead of having seven special cases?
+  if ((!is16BitMode(STI) && TSFlags & X86II::AdSize) ||
+      (is16BitMode(STI) && (MI.getOpcode() == X86::JECXZ_32 ||
+                         MI.getOpcode() == X86::MOV8o8a ||
+                         MI.getOpcode() == X86::MOV16o16a ||
+                         MI.getOpcode() == X86::MOV32o32a ||
+                         MI.getOpcode() == X86::MOV8ao8 ||
+                         MI.getOpcode() == X86::MOV16ao16 ||
+                         MI.getOpcode() == X86::MOV32ao32))) {
+    need_address_override = true;
+  } else if (MemoryOperand < 0) {
+    need_address_override = false;
+  } else if (is64BitMode(STI)) {
+    assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
+    need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+  } else if (is32BitMode(STI)) {
+    assert(!Is64BitMemOperand(MI, MemoryOperand));
+    need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
+  } else {
+    assert(is16BitMode(STI));
+    assert(!Is64BitMemOperand(MI, MemoryOperand));
+    need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
+  }
+
+  if (need_address_override)
+    EmitByte(0x67, CurByte, OS);
+
+  if (Encoding == 0)
+    EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+  else
+    EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+    BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
+
+  unsigned SrcRegNum = 0;
+  switch (TSFlags & X86II::FormMask) {
+  default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+    llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+  case X86II::Pseudo:
+    llvm_unreachable("Pseudo instruction shouldn't be emitted");
+  case X86II::RawFrmDstSrc: {
+    unsigned siReg = MI.getOperand(1).getReg();
+    assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+            (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+            (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+           "SI and DI register sizes do not match");
+    // Emit segment override opcode prefix as needed (not for %ds).
+    if (MI.getOperand(2).getReg() != X86::DS)
+      EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
+    // Emit AdSize prefix as needed.
+    if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+        (is32BitMode(STI) && siReg == X86::SI))
+      EmitByte(0x67, CurByte, OS);
+    CurOp += 3; // Consume operands.
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  }
+  case X86II::RawFrmSrc: {
+    unsigned siReg = MI.getOperand(0).getReg();
+    // Emit segment override opcode prefix as needed (not for %ds).
+    if (MI.getOperand(1).getReg() != X86::DS)
+      EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+    // Emit AdSize prefix as needed.
+    if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+        (is32BitMode(STI) && siReg == X86::SI))
+      EmitByte(0x67, CurByte, OS);
+    CurOp += 2; // Consume operands.
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  }
+  case X86II::RawFrmDst: {
+    unsigned siReg = MI.getOperand(0).getReg();
+    // Emit AdSize prefix as needed.
+    if ((!is32BitMode(STI) && siReg == X86::EDI) ||
+        (is32BitMode(STI) && siReg == X86::DI))
+      EmitByte(0x67, CurByte, OS);
+    ++CurOp; // Consume operand.
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  }
+  case X86II::RawFrm:
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  case X86II::RawFrmMemOffs:
+    // Emit segment override opcode prefix as needed.
+    EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    ++CurOp; // skip segment operand
+    break;
+  case X86II::RawFrmImm8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+                  OS, Fixups);
+    break;
+  case X86II::RawFrmImm16:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+                  OS, Fixups);
+    break;
+
+  case X86II::AddRegFrm:
+    EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+    break;
+
+  case X86II::MRMDestReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + 1;
+
+    if (HasEVEX_K) // Skip writemask
+      SrcRegNum++;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    EmitRegModRMByte(MI.getOperand(CurOp),
+                     GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    break;
+
+  case X86II::MRMDestMem:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + X86::AddrNumOperands;
+
+    if (HasEVEX_K) // Skip writemask
+      SrcRegNum++;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    EmitMemModRMByte(MI, CurOp,
+                     GetX86RegNum(MI.getOperand(SrcRegNum)),
+                     TSFlags, CurByte, OS, Fixups, STI);
+    CurOp = SrcRegNum + 1;
+    break;
+
+  case X86II::MRMSrcReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + 1;
+
+    if (HasEVEX_K) // Skip writemask
+      SrcRegNum++;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+      ++SrcRegNum;
+
+    EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+
+    // 2 operands skipped with HasMemOp4, compensate accordingly
+    CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    // do not count the rounding control operand
+    if (HasEVEX_RC)
+      NumOps--;
+    break;
+
+  case X86II::MRMSrcMem: {
+    int AddrOperands = X86::AddrNumOperands;
+    unsigned FirstMemOp = CurOp+1;
+
+    if (HasEVEX_K) { // Skip writemask
+      ++AddrOperands;
+      ++FirstMemOp;
+    }
+
+    if (HasVEX_4V) {
+      ++AddrOperands;
+      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+    }
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      ++FirstMemOp;
+
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                     TSFlags, CurByte, OS, Fixups, STI);
+    CurOp += AddrOperands + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+  }
+
+  case X86II::MRMXr:
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    if (HasEVEX_K) // Skip writemask
+      ++CurOp;
+    EmitByte(BaseOpcode, CurByte, OS);
+    uint64_t Form = TSFlags & X86II::FormMask;
+    EmitRegModRMByte(MI.getOperand(CurOp++),
+                     (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
+                     CurByte, OS);
+    break;
+  }
+
+  case X86II::MRMXm:
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    if (HasEVEX_K) // Skip writemask
+      ++CurOp;
+    EmitByte(BaseOpcode, CurByte, OS);
+    uint64_t Form = TSFlags & X86II::FormMask;
+    EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
+                     TSFlags, CurByte, OS, Fixups, STI);
+    CurOp += X86::AddrNumOperands;
+    break;
+  }
+  case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+  case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+  case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+  case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+  case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+  case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
+  case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
+  case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
+  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
+  case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
+  case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
+  case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
+  case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
+  case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    unsigned char MRM;
+    switch (TSFlags & X86II::FormMask) {
+    default: llvm_unreachable("Invalid Form");
+    case X86II::MRM_C0: MRM = 0xC0; break;
+    case X86II::MRM_C1: MRM = 0xC1; break;
+    case X86II::MRM_C2: MRM = 0xC2; break;
+    case X86II::MRM_C3: MRM = 0xC3; break;
+    case X86II::MRM_C4: MRM = 0xC4; break;
+    case X86II::MRM_C8: MRM = 0xC8; break;
+    case X86II::MRM_C9: MRM = 0xC9; break;
+    case X86II::MRM_CA: MRM = 0xCA; break;
+    case X86II::MRM_CB: MRM = 0xCB; break;
+    case X86II::MRM_D0: MRM = 0xD0; break;
+    case X86II::MRM_D1: MRM = 0xD1; break;
+    case X86II::MRM_D4: MRM = 0xD4; break;
+    case X86II::MRM_D5: MRM = 0xD5; break;
+    case X86II::MRM_D6: MRM = 0xD6; break;
+    case X86II::MRM_D8: MRM = 0xD8; break;
+    case X86II::MRM_D9: MRM = 0xD9; break;
+    case X86II::MRM_DA: MRM = 0xDA; break;
+    case X86II::MRM_DB: MRM = 0xDB; break;
+    case X86II::MRM_DC: MRM = 0xDC; break;
+    case X86II::MRM_DD: MRM = 0xDD; break;
+    case X86II::MRM_DE: MRM = 0xDE; break;
+    case X86II::MRM_DF: MRM = 0xDF; break;
+    case X86II::MRM_E0: MRM = 0xE0; break;
+    case X86II::MRM_E1: MRM = 0xE1; break;
+    case X86II::MRM_E2: MRM = 0xE2; break;
+    case X86II::MRM_E3: MRM = 0xE3; break;
+    case X86II::MRM_E4: MRM = 0xE4; break;
+    case X86II::MRM_E5: MRM = 0xE5; break;
+    case X86II::MRM_E8: MRM = 0xE8; break;
+    case X86II::MRM_E9: MRM = 0xE9; break;
+    case X86II::MRM_EA: MRM = 0xEA; break;
+    case X86II::MRM_EB: MRM = 0xEB; break;
+    case X86II::MRM_EC: MRM = 0xEC; break;
+    case X86II::MRM_ED: MRM = 0xED; break;
+    case X86II::MRM_EE: MRM = 0xEE; break;
+    case X86II::MRM_F0: MRM = 0xF0; break;
+    case X86II::MRM_F1: MRM = 0xF1; break;
+    case X86II::MRM_F2: MRM = 0xF2; break;
+    case X86II::MRM_F3: MRM = 0xF3; break;
+    case X86II::MRM_F4: MRM = 0xF4; break;
+    case X86II::MRM_F5: MRM = 0xF5; break;
+    case X86II::MRM_F6: MRM = 0xF6; break;
+    case X86II::MRM_F7: MRM = 0xF7; break;
+    case X86II::MRM_F8: MRM = 0xF8; break;
+    case X86II::MRM_F9: MRM = 0xF9; break;
+    case X86II::MRM_FA: MRM = 0xFA; break;
+    case X86II::MRM_FB: MRM = 0xFB; break;
+    case X86II::MRM_FC: MRM = 0xFC; break;
+    case X86II::MRM_FD: MRM = 0xFD; break;
+    case X86II::MRM_FE: MRM = 0xFE; break;
+    case X86II::MRM_FF: MRM = 0xFF; break;
+    }
+    EmitByte(MRM, CurByte, OS);
+    break;
+  }
+
+  // If there is a remaining operand, it must be a trailing immediate.  Emit it
+  // according to the right size for the instruction. Some instructions
+  // (SSE4a extrq and insertq) have two trailing immediates.
+  while (CurOp != NumOps && NumOps - CurOp <= 2) {
+    // The last source register of a 4 operand instruction in AVX is encoded
+    // in bits[7:4] of a immediate byte.
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+      const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+                                                    : CurOp);
+      ++CurOp;
+      unsigned RegNum = GetX86RegNum(MO) << 4;
+      if (X86II::isX86_64ExtendedReg(MO.getReg()))
+        RegNum |= 1 << 7;
+      // If there is an additional 5th operand it must be an immediate, which
+      // is encoded in bits[3:0]
+      if (CurOp != NumOps) {
+        const MCOperand &MIMM = MI.getOperand(CurOp++);
+        if (MIMM.isImm()) {
+          unsigned Val = MIMM.getImm();
+          assert(Val < 16 && "Immediate operand value out of range");
+          RegNum |= Val;
+        }
+      }
+      EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1,
+                    CurByte, OS, Fixups);
+    } else {
+      EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                    X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                    CurByte, OS, Fixups);
+    }
+  }
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+    EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 0000000..3bfad6c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,460 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+std::string X86_MC::ParseX86Triple(StringRef TT) {
+  Triple TheTriple(TT);
+  std::string FS;
+  if (TheTriple.getArch() == Triple::x86_64)
+    FS = "+64bit-mode,-32bit-mode,-16bit-mode";
+  else if (TheTriple.getEnvironment() != Triple::CODE16)
+    FS = "-64bit-mode,+32bit-mode,-16bit-mode";
+  else
+    FS = "-64bit-mode,-32bit-mode,+16bit-mode";
+
+  return FS;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments.  If we can't run cpuid on the host, return true.
+bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+                             unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+  #if defined(__GNUC__)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    int registers[4];
+    __cpuid(registers, value);
+    *rEAX = registers[0];
+    *rEBX = registers[1];
+    *rECX = registers[2];
+    *rEDX = registers[3];
+    return false;
+  #else
+    return true;
+  #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  #if defined(__GNUC__)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    __asm {
+      mov   eax,value
+      cpuid
+      mov   esi,rEAX
+      mov   dword ptr [esi],eax
+      mov   esi,rEBX
+      mov   dword ptr [esi],ebx
+      mov   esi,rECX
+      mov   dword ptr [esi],ecx
+      mov   esi,rEDX
+      mov   dword ptr [esi],edx
+    }
+    return false;
+  #else
+    return true;
+  #endif
+#else
+  return true;
+#endif
+}
+
+/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
+/// 4 values in the specified arguments.  If we can't run cpuid on the host,
+/// return true.
+bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+                               unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+  #if defined(__GNUC__)
+    // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value),
+            "c" (subleaf));
+    return false;
+  #elif defined(_MSC_VER)
+    // __cpuidex was added in MSVC++ 9.0 SP1
+    #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
+      int registers[4];
+      __cpuidex(registers, value, subleaf);
+      *rEAX = registers[0];
+      *rEBX = registers[1];
+      *rECX = registers[2];
+      *rEDX = registers[3];
+      return false;
+    #else
+      return true;
+    #endif
+  #else
+    return true;
+  #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  #if defined(__GNUC__)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value),
+            "c" (subleaf));
+    return false;
+  #elif defined(_MSC_VER)
+    __asm {
+      mov   eax,value
+      mov   ecx,subleaf
+      cpuid
+      mov   esi,rEAX
+      mov   dword ptr [esi],eax
+      mov   esi,rEBX
+      mov   dword ptr [esi],ebx
+      mov   esi,rECX
+      mov   dword ptr [esi],ecx
+      mov   esi,rEDX
+      mov   dword ptr [esi],edx
+    }
+    return false;
+  #else
+    return true;
+  #endif
+#else
+  return true;
+#endif
+}
+
+void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
+                               unsigned &Model) {
+  Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+  Model  = (EAX >> 4) & 0xf; // Bits 4 - 7
+  if (Family == 6 || Family == 0xf) {
+    if (Family == 0xf)
+      // Examine extended family ID if family ID is F.
+      Family += (EAX >> 20) & 0xff;    // Bits 20 - 27
+    // Examine extended model ID if family ID is 6 or F.
+    Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+  }
+}
+
+unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) {
+  if (TT.getArch() == Triple::x86_64)
+    return DWARFFlavour::X86_64;
+
+  if (TT.isOSDarwin())
+    return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+  if (TT.isOSCygMing())
+    // Unsupported by now, just quick fallback
+    return DWARFFlavour::X86_32_Generic;
+  return DWARFFlavour::X86_32_Generic;
+}
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+  // FIXME: TableGen these.
+  for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+    unsigned SEH = MRI->getEncodingValue(Reg);
+    MRI->mapLLVMRegToSEHReg(Reg, SEH);
+  }
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                  StringRef FS) {
+  std::string ArchFS = X86_MC::ParseX86Triple(TT);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
+  return X;
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitX86MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  unsigned RA = (TheTriple.getArch() == Triple::x86_64)
+    ? X86::RIP     // Should have dwarf #16.
+    : X86::EIP;    // Should have dwarf #8.
+
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitX86MCRegisterInfo(X, RA,
+                        X86_MC::getDwarfRegFlavour(TheTriple, false),
+                        X86_MC::getDwarfRegFlavour(TheTriple, true),
+                        RA);
+  X86_MC::InitLLVM2SEHRegisterMapping(X);
+  return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+  Triple TheTriple(TT);
+  bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSBinFormatMachO()) {
+    if (is64Bit)
+      MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+    else
+      MAI = new X86MCAsmInfoDarwin(TheTriple);
+  } else if (TheTriple.isOSBinFormatELF()) {
+    // Force the use of an ELF container.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
+  } else if (TheTriple.isWindowsMSVCEnvironment()) {
+    MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+  } else if (TheTriple.isOSCygMing() ||
+             TheTriple.isWindowsItaniumEnvironment()) {
+    MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+  } else {
+    // The default is ELF.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
+  }
+
+  // Initialize initial frame state.
+  // Calculate amount of bytes used for return address storing
+  int stackGrowth = is64Bit ? -8 : -4;
+
+  // Initial state of the frame pointer is esp+stackGrowth.
+  unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+      nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+  MAI->addInitialFrameState(Inst);
+
+  // Add return address to move list
+  unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+  MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+      nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+  MAI->addInitialFrameState(Inst2);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  Triple T(TT);
+  bool is64Bit = T.getArch() == Triple::x86_64;
+
+  if (RM == Reloc::Default) {
+    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+    // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
+    // use static relocation model by default.
+    if (T.isOSDarwin()) {
+      if (is64Bit)
+        RM = Reloc::PIC_;
+      else
+        RM = Reloc::DynamicNoPIC;
+    } else if (T.isOSWindows() && is64Bit)
+      RM = Reloc::PIC_;
+    else
+      RM = Reloc::Static;
+  }
+
+  // ELF and X86-64 don't have a distinct DynamicNoPIC model.  DynamicNoPIC
+  // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode, in x86-64 we use PIC.
+  if (RM == Reloc::DynamicNoPIC) {
+    if (is64Bit)
+      RM = Reloc::PIC_;
+    else if (!T.isOSDarwin())
+      RM = Reloc::Static;
+  }
+
+  // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+  // the Mach-O file format doesn't support it.
+  if (RM == Reloc::Static && T.isOSDarwin() && is64Bit)
+    RM = Reloc::PIC_;
+
+  // For static codegen, if we're not already set, use Small codegen.
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  else if (CM == CodeModel::JITDefault)
+    // 64-bit JIT places everything in the same buffer except external funcs.
+    CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    const MCSubtargetInfo &STI,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  switch (TheTriple.getObjectFormat()) {
+  default: llvm_unreachable("unsupported object format");
+  case Triple::MachO:
+    return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "only Windows COFF is supported");
+    return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll);
+  case Triple::ELF:
+    return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+  }
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI,
+                                             const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new X86ATTInstPrinter(MAI, MII, MRI);
+  if (SyntaxVariant == 1)
+    return new X86IntelInstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
+                                                   MCContext &Ctx) {
+  Triple TheTriple(TT);
+  if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64)
+    return createX86_64MachORelocationInfo(Ctx);
+  else if (TheTriple.isOSBinFormatELF())
+    return createX86_64ELFRelocationInfo(Ctx);
+  // Default to the stock relocation info.
+  return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+  return new MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo);
+  RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo);
+
+  // Register the MC codegen info.
+  RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo);
+  RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
+                                          X86_MC::createX86MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
+                                          X86_MC::createX86MCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target,
+                                          createX86MCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target,
+                                          createX86MCInstrAnalysis);
+
+  // Register the code emitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target,
+                                        createX86MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target,
+                                        createX86MCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+                                       createX86_32AsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+                                       createX86_64AsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target,
+                                           createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target,
+                                           createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,
+                                        createX86MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
+                                        createX86MCInstPrinter);
+
+  // Register the MC relocation info.
+  TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target,
+                                           createX86MCRelocationInfo);
+  TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target,
+                                           createX86MCRelocationInfo);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 0000000..ebe74cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,133 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MCTARGETDESC_H
+#define X86MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCRelocationInfo;
+class MCStreamer;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+  };
+} 
+  
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
+
+namespace X86_MC {
+  std::string ParseX86Triple(StringRef TT);
+
+  /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+  /// the specified arguments.  If we can't run cpuid on the host, return true.
+  bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+                       unsigned *rEBX, unsigned *rECX, unsigned *rEDX);
+  /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return
+  /// the 4 values in the specified arguments.  If we can't run cpuid on the
+  /// host, return true.
+  bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+                       unsigned *rEBX, unsigned *rECX, unsigned *rEDX);
+
+  void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
+
+  unsigned getDwarfRegFlavour(Triple TT, bool isEH);
+
+  void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+
+  /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
+  /// This is exposed so Asm parser, etc. do not need to go through
+  /// TargetRegistry.
+  MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                            StringRef FS);
+}
+
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
+
+/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code
+/// streamer which will generate PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                     MCCodeEmitter *CE, raw_ostream &OS,
+                                     bool RelaxAll);
+
+/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
+                                          bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// createX86ELFObjectWriter - Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
+                                         bool IsELF64,
+                                         uint8_t OSABI,
+                                         uint16_t EMachine);
+/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+
+/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info.
+MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
+
+/// createX86_64ELFORelocationInfo - Construct X86-64 ELF relocation info.
+MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
new file mode 100644
index 0000000..3b81d53
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -0,0 +1,116 @@
+//===-- X86MachORelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/MachO.h"
+
+using namespace llvm;
+using namespace object;
+using namespace MachO;
+
+namespace {
+class X86_64MachORelocationInfo : public MCRelocationInfo {
+public:
+  X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForRelocation(RelocationRef Rel) override {
+    const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile());
+
+    uint64_t RelType; Rel.getType(RelType);
+    symbol_iterator SymI = Rel.getSymbol();
+
+    StringRef SymName; SymI->getName(SymName);
+    uint64_t  SymAddr; SymI->getAddress(SymAddr);
+
+    any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+    bool isPCRel = Obj->getAnyRelocationPCRel(RE);
+
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+    // FIXME: check that the value is actually the same.
+    if (Sym->isVariable() == false)
+      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+    const MCExpr *Expr = nullptr;
+
+    switch(RelType) {
+    case X86_64_RELOC_TLV:
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+      break;
+    case X86_64_RELOC_SIGNED_4:
+      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+                                     MCConstantExpr::Create(4, Ctx),
+                                     Ctx);
+      break;
+    case X86_64_RELOC_SIGNED_2:
+      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+                                     MCConstantExpr::Create(2, Ctx),
+                                     Ctx);
+      break;
+    case X86_64_RELOC_SIGNED_1:
+      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+                                     MCConstantExpr::Create(1, Ctx),
+                                     Ctx);
+      break;
+    case X86_64_RELOC_GOT_LOAD:
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+      break;
+    case X86_64_RELOC_GOT:
+      Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
+                                     MCSymbolRefExpr::VK_GOTPCREL :
+                                     MCSymbolRefExpr::VK_GOT,
+                                     Ctx);
+      break;
+    case X86_64_RELOC_SUBTRACTOR:
+      {
+        Rel.moveNext();
+        any_relocation_info RENext =
+            Obj->getRelocation(Rel.getRawDataRefImpl());
+
+        // X86_64_SUBTRACTOR must be followed by a relocation of type
+        // X86_64_RELOC_UNSIGNED.
+        // NOTE: Scattered relocations don't exist on x86_64.
+        unsigned RType = Obj->getAnyRelocationType(RENext);
+        if (RType != X86_64_RELOC_UNSIGNED)
+          report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
+                             "X86_64_RELOC_SUBTRACTOR.");
+
+        const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx);
+
+        symbol_iterator RSymI = Rel.getSymbol();
+        uint64_t RSymAddr;
+        RSymI->getAddress(RSymAddr);
+        StringRef RSymName;
+        RSymI->getName(RSymName);
+
+        MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName);
+        if (RSym->isVariable() == false)
+          RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx));
+
+        const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx);
+
+        Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+        break;
+      }
+    default:
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    }
+    return Expr;
+  }
+};
+} // End unnamed namespace
+
+/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) {
+  return new X86_64MachORelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 0000000..ead3338
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,609 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 unsigned Log2Size,
+                                 uint64_t &FixedValue);
+  void RecordTLVPRelocation(MachObjectWriter *Writer,
+                            const MCAssembler &Asm,
+                            const MCAsmLayout &Layout,
+                            const MCFragment *Fragment,
+                            const MCFixup &Fixup,
+                            MCValue Target,
+                            uint64_t &FixedValue);
+
+  void RecordX86Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+  void RecordX86_64Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+public:
+  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue) override {
+    if (Writer->is64Bit())
+      RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                             FixedValue);
+    else
+      RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+  return Kind == X86::reloc_riprel_4byte ||
+    Kind == X86::reloc_riprel_4byte_movq_load;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+    // FIXME: Remove these!!!
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case FK_Data_4: return 2;
+  case FK_Data_8: return 3;
+  }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
+                                                 const MCAssembler &Asm,
+                                                 const MCAsmLayout &Layout,
+                                                 const MCFragment *Fragment,
+                                                 const MCFixup &Fixup,
+                                                 MCValue Target,
+                                                 uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset =
+    Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  uint32_t FixupAddress =
+    Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  Value = Target.getConstant();
+
+  if (IsPCRel) {
+    // Compensate for the relocation offset, Darwin x86_64 relocations only have
+    // the addend and appear to have attempted to define it to be the actual
+    // expression addend without the PCrel bias. However, instructions with data
+    // following the relocation are not accommodated for (see comment below
+    // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+    Value += 1LL << Log2Size;
+  }
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    Type = MachO::X86_64_RELOC_UNSIGNED;
+    Index = 0;
+
+    // FIXME: I believe this is broken, I don't think the linker can understand
+    // it. I think it would require a local relocation, but I'm not sure if that
+    // would work either. The official way to get an absolute PCrel relocation
+    // is to use an absolute symbol (which we don't support yet).
+    if (IsPCRel) {
+      IsExtern = 1;
+      Type = MachO::X86_64_RELOC_BRANCH;
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    if (A->isTemporary())
+      A = &A->AliasedSymbol();
+    const MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    if (B->isTemporary())
+      B = &B->AliasedSymbol();
+    const MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+    // Neither symbol can be modified.
+    if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+        Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+      report_fatal_error("unsupported relocation of modified symbol", false);
+
+    // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+    // implement most of these correctly.
+    if (IsPCRel)
+      report_fatal_error("unsupported pc-relative relocation of difference",
+                         false);
+
+    // The support for the situation where one or both of the symbols would
+    // require a local relocation is handled just like if the symbols were
+    // external.  This is certainly used in the case of debug sections where the
+    // section has only temporary symbols and thus the symbols don't have base
+    // symbols.  This is encoded using the section ordinal and non-extern
+    // relocation entries.
+
+    // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+    // single SIGNED relocation); reject it for now.  Except the case where both
+    // symbols don't have a base, equal but both NULL.
+    if (A_Base == B_Base && A_Base)
+      report_fatal_error("unsupported relocation with identical base", false);
+
+    // A subtraction expression where both symbols are undefined is a
+    // non-relocatable expression.
+    if (A->isUndefined() && B->isUndefined())
+      report_fatal_error("unsupported relocation with subtraction expression",
+                         false);
+
+    Value += Writer->getSymbolAddress(&A_SD, Layout) -
+      (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+    Value -= Writer->getSymbolAddress(&B_SD, Layout) -
+      (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout));
+
+    if (A_Base) {
+      Index = A_Base->getIndex();
+      IsExtern = 1;
+    }
+    else {
+      Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+    }
+    Type = MachO::X86_64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index     <<  0) |
+                   (IsPCRel   << 24) |
+                   (Log2Size  << 25) |
+                   (IsExtern  << 27) |
+                   (Type      << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    if (B_Base) {
+      Index = B_Base->getIndex();
+      IsExtern = 1;
+    }
+    else {
+      Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+    }
+    Type = MachO::X86_64_RELOC_SUBTRACTOR;
+  } else {
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData *Base = Asm.getAtom(&SD);
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand x86_64 relocation entries, and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
+        Fragment->getParent()->getSection());
+      if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+        Base = nullptr;
+    }
+
+    // x86_64 almost always uses external relocations, except when there is no
+    // symbol to use as a base address (a local symbol with no preceding
+    // non-local symbol).
+    if (Base) {
+      Index = Base->getIndex();
+      IsExtern = 1;
+
+      // Add the local offset, if needed.
+      if (Base != &SD)
+        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+    } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+      // The index is the section ordinal (1-based).
+      Index = SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+      Value += Writer->getSymbolAddress(&SD, Layout);
+
+      if (IsPCRel)
+        Value -= FixupAddress + (1 << Log2Size);
+    } else if (Symbol->isVariable()) {
+      const MCExpr *Value = Symbol->getVariableValue();
+      int64_t Res;
+      bool isAbs = Value->EvaluateAsAbsolute(Res, Layout,
+                                             Writer->getSectionAddressMap());
+      if (isAbs) {
+        FixedValue = Res;
+        return;
+      } else {
+        report_fatal_error("unsupported relocation of variable '" +
+                           Symbol->getName() + "'", false);
+      }
+    } else {
+      report_fatal_error("unsupported relocation of undefined symbol '" +
+                         Symbol->getName() + "'", false);
+    }
+
+    MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+    if (IsPCRel) {
+      if (IsRIPRel) {
+        if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+          // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+          // rewrite the movq to an leaq at link time if the symbol ends up in
+          // the same linkage unit.
+          if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+            Type = MachO::X86_64_RELOC_GOT_LOAD;
+          else
+            Type = MachO::X86_64_RELOC_GOT;
+        }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+          Type = MachO::X86_64_RELOC_TLV;
+        }  else if (Modifier != MCSymbolRefExpr::VK_None) {
+          report_fatal_error("unsupported symbol modifier in relocation",
+                             false);
+        } else {
+          Type = MachO::X86_64_RELOC_SIGNED;
+
+          // The Darwin x86_64 relocation format has a problem where it cannot
+          // encode an address (L<foo> + <constant>) which is outside the atom
+          // containing L<foo>. Generally, this shouldn't occur but it does
+          // happen when we have a RIPrel instruction with data following the
+          // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+          // adjustment Darwin x86_64 uses, the offset is still negative and the
+          // linker has no way to recognize this.
+          //
+          // To work around this, Darwin uses several special relocation types
+          // to indicate the offsets. However, the specification or
+          // implementation of these seems to also be incomplete; they should
+          // adjust the addend as well based on the actual encoded instruction
+          // (the additional bias), but instead appear to just look at the final
+          // offset.
+          switch (-(Target.getConstant() + (1LL << Log2Size))) {
+          case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+          case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+          case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
+          }
+        }
+      } else {
+        if (Modifier != MCSymbolRefExpr::VK_None)
+          report_fatal_error("unsupported symbol modifier in branch "
+                             "relocation", false);
+
+        Type = MachO::X86_64_RELOC_BRANCH;
+      }
+    } else {
+      if (Modifier == MCSymbolRefExpr::VK_GOT) {
+        Type = MachO::X86_64_RELOC_GOT;
+      } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+        // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+        // case all we do is set the PCrel bit in the relocation entry; this is
+        // used with exception handling, for example. The source is required to
+        // include any necessary offset directly.
+        Type = MachO::X86_64_RELOC_GOT;
+        IsPCRel = 1;
+      } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+        report_fatal_error("TLVP symbol modifier should have been rip-rel",
+                           false);
+      } else if (Modifier != MCSymbolRefExpr::VK_None)
+        report_fatal_error("unsupported symbol modifier in relocation", false);
+      else {
+        Type = MachO::X86_64_RELOC_UNSIGNED;
+        unsigned Kind = Fixup.getKind();
+        if (Kind == X86::reloc_signed_4byte)
+          report_fatal_error("32-bit absolute addressing is not supported in "
+                             "64-bit mode", false);
+      }
+    }
+  }
+
+  // x86_64 always writes custom values into the fixups.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint64_t OriginalFixedValue = FixedValue;
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression",
+                       false);
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression",
+                         false);
+
+    // Select the appropriate difference relocation type.
+    //
+    // Note that there is no longer any semantic difference between these two
+    // relocation types from the linkers point of view, this is done solely for
+    // pedantic compatibility with 'as'.
+    Type = A_SD->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF :
+      (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+      Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         Twine("Section too large, can't encode "
+                                "r_address (") + Buffer +
+                         ") into 24 bits of scattered "
+                         "relocation entry.");
+      llvm_unreachable("fatal error returned?!");
+    }
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                         <<  0) | // r_address
+                   (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+                   (Log2Size                  << 28) |
+                   (IsPCRel                   << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff) {
+      FixedValue = OriginalFixedValue;
+      return false;
+    }
+  }
+
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+  return true;
+}
+
+void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
+                                               const MCAssembler &Asm,
+                                               const MCAsmLayout &Layout,
+                                               const MCFragment *Fragment,
+                                               const MCFixup &Fixup,
+                                               MCValue Target,
+                                               uint64_t &FixedValue) {
+  assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+         !is64Bit() &&
+         "Should only be called with a 32-bit TLVP relocation!");
+
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+  uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = 0;
+
+  // Get the symbol data.
+  const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+  unsigned Index = SD_A->getIndex();
+
+  // We're only going to have a second symbol in pic mode and it'll be a
+  // subtraction from the picbase. For 32-bit pic the addend is the difference
+  // between the picbase and the next address.  For 32-bit static the addend is
+  // zero.
+  if (Target.getSymB()) {
+    // If this is a subtraction then we're pcrel.
+    uint32_t FixupAddress =
+      Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+    const MCSymbolData *SD_B =
+        &Asm.getSymbolData(Target.getSymB()->getSymbol());
+    IsPCRel = 1;
+    FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
+                  Target.getConstant());
+    FixedValue += 1ULL << Log2Size;
+  } else {
+    FixedValue = 0;
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = Value;
+  MRE.r_word1 = ((Index                    <<  0) |
+                 (IsPCRel                  << 24) |
+                 (Log2Size                 << 25) |
+                 (1                        << 27) | // r_extern
+                 (MachO::GENERIC_RELOC_TLV << 28)); // r_type
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+                                              const MCAssembler &Asm,
+                                              const MCAsmLayout &Layout,
+                                              const MCFragment *Fragment,
+                                              const MCFixup &Fixup,
+                                              MCValue Target,
+                                              uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // If this is a 32-bit TLVP reloc it's handled a bit differently.
+  if (Target.getSymA() &&
+      Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+    RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                         FixedValue);
+    return;
+  }
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                              Target, Log2Size, FixedValue);
+    return;
+  }
+
+  // Get the symbol data, if any.
+  const MCSymbolData *SD = nullptr;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // If this is an internal relocation with an offset, it also needs a scattered
+  // relocation entry.
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel)
+    Offset += 1 << Log2Size;
+  // Try to record the scattered relocation if needed. Fall back to non
+  // scattered if necessary (see comments in RecordScatteredRelocation()
+  // for details).
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) &&
+      RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                Target, Log2Size, FixedValue))
+    return;
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    Type = MachO::GENERIC_RELOC_VANILLA;
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+            Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD = Asm.getSectionData(
+        SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    Type = MachO::GENERIC_RELOC_VANILLA;
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
+                                                bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..40af822
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,95 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+  class MCObjectWriter;
+}
+
+namespace {
+  class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+  public:
+    X86WinCOFFObjectWriter(bool Is64Bit);
+    virtual ~X86WinCOFFObjectWriter();
+
+    unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsCrossSection) const override;
+  };
+}
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
+                                          : COFF::IMAGE_FILE_MACHINE_I386) {}
+
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsCrossSection) const {
+  unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+      return COFF::IMAGE_REL_AMD64_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_AMD64_ADDR32NB;
+      return COFF::IMAGE_REL_AMD64_ADDR32;
+    case FK_Data_8:
+      return COFF::IMAGE_REL_AMD64_ADDR64;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_AMD64_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_AMD64_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+      return COFF::IMAGE_REL_I386_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_I386_DIR32NB;
+      return COFF::IMAGE_REL_I386_DIR32;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_I386_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_I386_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else
+    llvm_unreachable("Unsupported COFF machine type.");
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
+                                                   bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 0000000..6727f5e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,51 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+  X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
+                     raw_ostream &OS)
+    : MCWinCOFFStreamer(C, AB, *CE, OS) { }
+
+  void EmitWinEHHandlerData() override;
+  void FinishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWinEHHandlerData() {
+  MCStreamer::EmitWinEHHandlerData();
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+void X86WinCOFFStreamer::FinishImpl() {
+  EmitFrames(nullptr);
+  EmitWindowsUnwindTables();
+
+  MCWinCOFFStreamer::FinishImpl();
+}
+}
+
+namespace llvm {
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                     MCCodeEmitter *CE, raw_ostream &OS,
+                                     bool RelaxAll) {
+  X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
+  S->getAssembler().setRelaxAll(RelaxAll);
+  return S;
+}
+}
+
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..1ea8798
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,22 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() { 
+  RegisterTarget<Triple::x86, /*HasJIT=*/true>
+    X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+  RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+    Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 0000000..5f2441c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,218 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  // Defaults the copying the dest value.
+  ShuffleMask.push_back(0);
+  ShuffleMask.push_back(1);
+  ShuffleMask.push_back(2);
+  ShuffleMask.push_back(3);
+
+  // Decode the immediate.
+  unsigned ZMask = Imm & 15;
+  unsigned CountD = (Imm >> 4) & 3;
+  unsigned CountS = (Imm >> 6) & 3;
+
+  // CountS selects which input element to use.
+  unsigned InVal = 4+CountS;
+  // CountD specifies which element of destination to update.
+  ShuffleMask[CountD] = InVal;
+  // ZMask zaps values, potentially overriding the CountD elt.
+  if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+  if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+  if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+  if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = NElts/2; i != NElts; ++i)
+    ShuffleMask.push_back(NElts+i);
+
+  for (unsigned i = NElts/2; i != NElts; ++i)
+    ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != NElts/2; ++i)
+    ShuffleMask.push_back(i);
+
+  for (unsigned i = 0; i != NElts/2; ++i)
+    ShuffleMask.push_back(NElts+i);
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      unsigned Base = i + Offset;
+      // if i+offset is out of this lane then we actually need the other source
+      if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+      ShuffleMask.push_back(Base + l);
+    }
+  }
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      ShuffleMask.push_back(NewImm % NumLaneElts + l);
+      NewImm /= NumLaneElts;
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + 4 + (NewImm & 3));
+      NewImm >>= 2;
+    }
+  }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + (NewImm & 3));
+      NewImm >>= 2;
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+  }
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    // each half of a lane comes from different source
+    for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+      for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+        ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+        NewImm /= NumLaneElts;
+      }
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
+      ShuffleMask.push_back(i);          // Reads from dest/src1
+      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
+    }
+  }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
+      ShuffleMask.push_back(i);          // Reads from dest/src1
+      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
+    }
+  }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask) {
+  if (Imm & 0x88)
+    return; // Not a shuffle
+
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+
+  for (unsigned l = 0; l != 2; ++l) {
+    unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+    for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+      ShuffleMask.push_back(i);
+  }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != 4; ++i) {
+    ShuffleMask.push_back((Imm >> (2*i)) & 3);
+  }
+}
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
new file mode 100644
index 0000000..9e75b6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -0,0 +1,72 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_SHUFFLE_DECODE_H
+#define X86_SHUFFLE_DECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class MVT;
+
+enum {
+  SM_SentinelZero = -1
+};
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 0000000..d5522ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,82 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class FunctionPass;
+class ImmutablePass;
+class JITCodeEmitter;
+class X86TargetMachine;
+
+/// createX86AtomicExpandPass - This pass expands atomic operations that cannot
+/// be handled natively in terms of a loop using cmpxchg.
+FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM);
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+/// createX86GlobalBaseRegPass - This pass initializes a global base
+/// register for PIC on x86-32.
+FunctionPass* createX86GlobalBaseRegPass();
+
+/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
+/// to local-dynamic TLS variables so that the TLS base address for the module
+/// is only fetched once per execution path through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid transition penalty between functions encoded with
+/// AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+/// createX86EmitCodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// \brief Creates an X86-specific Target Transformation Info pass.
+ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
+
+/// createX86PadShortFunctions - Return a pass that pads short functions
+/// with NOOPs. This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+/// createX86FixupLEAs - Return a a pass that selectively replaces
+/// certain instructions (like add, sub, inc, dec, some shifts,
+/// and some multiplies) by equivalent LEA instructions, in order
+/// to eliminate execution delays in some Atom processors.
+FunctionPass *createX86FixupLEAs();
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 0000000..cd32a0f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,460 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+                                  "64-bit mode (x86_64)">;
+def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
+                                  "32-bit mode (80386)">;
+def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
+                                  "16-bit mode (i8086)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureCMOV    : SubtargetFeature<"cmov","HasCMov", "true",
+                                      "Enable conditional move instructions">;
+
+def FeaturePOPCNT   : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+                                       "Support POPCNT instruction">;
+
+
+def FeatureMMX     : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+                                      "Enable MMX instructions">;
+def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+                                      "Enable SSE instructions",
+                                      // SSE codegen depends on cmovs, and all
+                                      // SSE1+ processors support them.
+                                      [FeatureMMX, FeatureCMOV]>;
+def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+                                      "Enable SSE2 instructions",
+                                      [FeatureSSE1]>;
+def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+                                      "Enable SSE3 instructions",
+                                      [FeatureSSE2]>;
+def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+                                      "Enable SSSE3 instructions",
+                                      [FeatureSSE3]>;
+def FeatureSSE41   : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
+                                      "Enable SSE 4.1 instructions",
+                                      [FeatureSSSE3]>;
+def FeatureSSE42   : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
+                                      "Enable SSE 4.2 instructions",
+                                      [FeatureSSE41]>;
+def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+                                      "Enable 3DNow! instructions",
+                                      [FeatureMMX]>;
+def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+                                      "Enable 3DNow! Athlon instructions",
+                                      [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
+                                      "Support 64-bit instructions",
+                                      [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
+                                      "64-bit with cmpxchg16b",
+                                      [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+                                       "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+                                       "SHLD instruction is slow">;
+def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
+                                        "IsUAMemFast", "true",
+                                        "Fast unaligned memory access">;
+def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+                                      "Support SSE 4a instructions",
+                                      [FeatureSSE3]>;
+
+def FeatureAVX     : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+                                      "Enable AVX instructions",
+                                      [FeatureSSE42]>;
+def FeatureAVX2    : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+                                      "Enable AVX2 instructions",
+                                      [FeatureAVX]>;
+def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+                                      "Enable AVX-512 instructions",
+                                      [FeatureAVX2]>;
+def FeatureERI      : SubtargetFeature<"avx512er", "HasERI", "true",
+                      "Enable AVX-512 Exponential and Reciprocal Instructions",
+                                      [FeatureAVX512]>;
+def FeatureCDI      : SubtargetFeature<"avx512cd", "HasCDI", "true",
+                      "Enable AVX-512 Conflict Detection Instructions",
+                                      [FeatureAVX512]>;
+def FeaturePFI      : SubtargetFeature<"avx512pf", "HasPFI", "true",
+                      "Enable AVX-512 PreFetch Instructions",
+                                      [FeatureAVX512]>;
+def FeatureDQI     : SubtargetFeature<"avx512dq", "HasDQI", "true",
+                      "Enable AVX-512 Doubleword and Quadword Instructions",
+                                      [FeatureAVX512]>;
+def FeatureBWI     : SubtargetFeature<"avx512bw", "HasBWI", "true",
+                      "Enable AVX-512 Byte and Word Instructions",
+                                      [FeatureAVX512]>;
+def FeatureVLX     : SubtargetFeature<"avx512vl", "HasVLX", "true",
+                      "Enable AVX-512 Vector Length eXtensions",
+                                      [FeatureAVX512]>;
+def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+                         "Enable packed carry-less multiplication instructions",
+                               [FeatureSSE2]>;
+def FeatureFMA     : SubtargetFeature<"fma", "HasFMA", "true",
+                                      "Enable three-operand fused multiple-add",
+                                      [FeatureAVX]>;
+def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
+                                      "Enable four-operand fused multiple-add",
+                                      [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP     : SubtargetFeature<"xop", "HasXOP", "true",
+                                      "Enable XOP instructions",
+                                      [FeatureFMA4]>;
+def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
+                                          "HasVectorUAMem", "true",
+                 "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
+                                      "Enable AES instructions",
+                                      [FeatureSSE2]>;
+def FeatureTBM     : SubtargetFeature<"tbm", "HasTBM", "true",
+                                      "Enable TBM instructions">;
+def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
+                                      "Support MOVBE instruction">;
+def FeatureRDRAND  : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
+                                      "Support RDRAND instruction">;
+def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
+                       "Support 16-bit floating point conversion instructions",
+                       [FeatureAVX]>;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+                                       "Support FS/GS Base instructions">;
+def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+                                      "Support LZCNT instruction">;
+def FeatureBMI     : SubtargetFeature<"bmi", "HasBMI", "true",
+                                      "Support BMI instructions">;
+def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
+                                      "Support BMI2 instructions">;
+def FeatureRTM     : SubtargetFeature<"rtm", "HasRTM", "true",
+                                      "Support RTM instructions">;
+def FeatureHLE     : SubtargetFeature<"hle", "HasHLE", "true",
+                                      "Support HLE">;
+def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
+                                      "Support ADX instructions">;
+def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
+                                      "Enable SHA instructions",
+                                      [FeatureSSE2]>;
+def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+                                      "Support PRFCHW instructions">;
+def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+                                      "Support RDSEED instruction">;
+def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+                                     "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+                                     "HasSlowDivide", "true",
+                                     "Use small divide for positive values less than 256">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+                                     "PadShortFunctions", "true",
+                                     "Pad short functions">;
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+                                     "CallRegIndirect", "true",
+                                     "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+                                   "LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+                                   "LEA instruction with certain arguments is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+                                   "INC and DEC instructions are slower than ADD and SUB">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+                    "Intel Atom processors">;
+def ProcIntelSLM  : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+                    "Intel Silvermont processors">;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericModel, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"i386",            []>;
+def : Proc<"i486",            []>;
+def : Proc<"i586",            []>;
+def : Proc<"pentium",         []>;
+def : Proc<"pentium-mmx",     [FeatureMMX]>;
+def : Proc<"i686",            []>;
+def : Proc<"pentiumpro",      [FeatureCMOV]>;
+def : Proc<"pentium2",        [FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;
+def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>;
+
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+                     [FeatureSSE3, FeatureSlowBTMem]>;
+
+// NetBurst.
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona",   [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel,
+                     [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+def : ProcessorModel<"penryn", SandyBridgeModel,
+                     [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Atom.
+def : ProcessorModel<"atom", AtomModel,
+                     [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
+                      FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+                      FeatureSlowDivide,
+                      FeatureCallRegIndirect,
+                      FeatureLEAUsesAG,
+                      FeaturePadShortFunctions]>;
+
+// Atom Silvermont.
+def : ProcessorModel<"slm",  SLMModel, [ProcIntelSLM,
+                               FeatureSSE42, FeatureCMPXCHG16B,
+                               FeatureMOVBE, FeaturePOPCNT,
+                               FeaturePCLMUL, FeatureAES,
+                               FeatureCallRegIndirect,
+                               FeaturePRFCHW,
+                               FeatureSlowLEA, FeatureSlowIncDec,
+                               FeatureSlowBTMem, FeatureFastUAMem]>;
+// "Arrandale" along with corei3 and corei5
+def : ProcessorModel<"corei7", SandyBridgeModel,
+                     [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+                      FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>;
+
+def : ProcessorModel<"nehalem", SandyBridgeModel,
+                     [FeatureSSE42,  FeatureCMPXCHG16B, FeatureSlowBTMem,
+                      FeatureFastUAMem, FeaturePOPCNT]>;
+// Westmere is a similar machine to nehalem with some additional features.
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
+def : ProcessorModel<"westmere", SandyBridgeModel,
+                     [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+                      FeatureFastUAMem, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL]>;
+// Sandy Bridge
+// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
+// rather than a superset.
+def : ProcessorModel<"corei7-avx", SandyBridgeModel,
+                     [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+// Ivy Bridge
+def : ProcessorModel<"core-avx-i", SandyBridgeModel,
+                     [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+                      FeatureF16C, FeatureFSGSBase]>;
+
+// Haswell
+def : ProcessorModel<"core-avx2", HaswellModel,
+                     [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+                      FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
+                      FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
+                      FeatureHLE]>;
+
+// KNL
+// FIXME: define KNL model
+def : ProcessorModel<"knl", HaswellModel,
+                     [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
+                      FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+                      FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+                      FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
+                      FeatureSlowIncDec]>;
+
+// SKX
+// FIXME: define SKX model
+def : ProcessorModel<"skx", HaswellModel,
+                     [FeatureAVX512, FeatureCDI,
+                      FeatureDQI, FeatureBWI, FeatureVLX,
+                      FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+                      FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+                      FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
+                      FeatureSlowIncDec]>;
+
+def : Proc<"k6",              [FeatureMMX]>;
+def : Proc<"k6-2",            [Feature3DNow]>;
+def : Proc<"k6-3",            [Feature3DNow]>;
+def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem, 
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"k8",              [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron",         [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64",        [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-fx",       [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"amdfam10",        [FeatureSSE4A,
+                               Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+// Bobcat
+def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
+                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+                               FeatureSlowSHLD]>;
+// Jaguar
+def : Proc<"btver2",          [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
+                               FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
+                               FeatureBMI, FeatureF16C, FeatureMOVBE,
+                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
+// Bulldozer
+def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
+                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
+// Piledriver
+def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
+                               FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI, FeatureTBM,
+                               FeatureFMA, FeatureSlowSHLD]>;
+
+// Steamroller
+def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
+                               FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
+                               FeatureFMA, FeatureFSGSBase]>;
+
+// Excavator
+def : Proc<"bdver4",          [FeatureAVX2, FeatureXOP, FeatureFMA4,
+                               FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW,
+                               FeaturePCLMUL, FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI, FeatureBMI2,
+                               FeatureTBM, FeatureFMA, FeatureFSGSBase]>;
+
+def : Proc<"geode",           [Feature3DNowA]>;
+
+def : Proc<"winchip-c6",      [FeatureMMX]>;
+def : Proc<"winchip2",        [Feature3DNow]>;
+def : Proc<"c3",              [Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSSE1]>;
+
+// We also provide a generic 64-bit specific x86 processor model which tries to
+// be good for modern chips without enabling instruction set encodings past the
+// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+// 
+// We currently use the Sandy Bridge model as the default scheduling model as
+// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
+// covers a huge swath of x86 processors. If there are specific scheduling
+// knobs which need to be tuned differently for AMD chips, we might consider
+// forming a common base for them.
+def : ProcessorModel<"x86-64", SandyBridgeModel,
+                     [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+                      FeatureFastUAMem]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParser : AsmParser {
+  string AsmParserClassName = "AsmParser";
+}
+
+def ATTAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Variant name.
+  string Name = "att";
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = "#";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+  int Variant = 1;
+
+  // Variant name.
+  string Name = "intel";
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = ";";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel}
+def ATTAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "ATTInstPrinter";
+  int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "IntelInstPrinter";
+  int Variant = 1;
+}
+
+def X86 : Target {
+  // Information about the instructions...
+  let InstructionSet = X86InstrInfo;
+  let AssemblyParsers = [ATTAsmParser];
+  let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+  let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 0000000..57c7a62
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,748 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+
+  if (Subtarget->isTargetCOFF()) {
+    bool Intrn = MF.getFunction()->hasInternalLinkage();
+    OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC
+                                              : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+    OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                               << COFF::SCT_COMPLEX_TYPE_SHIFT);
+    OutStreamer.EndCOFFSymbolDef();
+  }
+
+  // Have common code print out the function header with linkage info etc.
+  EmitFunctionHeader();
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // We didn't modify anything.
+  return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand.  This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
+                               raw_ostream &O) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown symbol type!");
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *P.GetCPISymbol(MO.getIndex());
+    P.printOffset(MO.getOffset(), O);
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+
+    MCSymbol *GVSym;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+      GVSym = P.getSymbolWithGlobalValueBase(GV, "$stub");
+    else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+             MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+             MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+      GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    else
+      GVSym = P.getSymbol(GV);
+
+    // Handle dllimport linkage.
+    if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+      GVSym =
+          P.OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+      MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+          P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+      if (!StubSym.getPointer())
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+    } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+      MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+          P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
+              Sym);
+      if (!StubSym.getPointer())
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+    } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+      MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+          P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+      if (!StubSym.getPointer())
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+    }
+
+    // If the name begins with a dollar-sign, enclose it in parens.  We do this
+    // to avoid having it look like an integer immediate to the assembler.
+    if (GVSym->getName()[0] != '$')
+      O << *GVSym;
+    else
+      O << '(' << *GVSym << ')';
+    P.printOffset(MO.getOffset(), O);
+    break;
+  }
+  }
+
+  switch (MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+    break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+  case X86II::MO_DARWIN_STUB:
+    // These affect the name of the symbol, not any suffix.
+    break;
+  case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+    O << " + [.-" << *P.MF->getPICBaseSymbol() << ']';
+    break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    O << '-' << *P.MF->getPICBaseSymbol();
+    break;
+  case X86II::MO_TLSGD:     O << "@TLSGD";     break;
+  case X86II::MO_TLSLD:     O << "@TLSLD";     break;
+  case X86II::MO_TLSLDM:    O << "@TLSLDM";    break;
+  case X86II::MO_GOTTPOFF:  O << "@GOTTPOFF";  break;
+  case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+  case X86II::MO_TPOFF:     O << "@TPOFF";     break;
+  case X86II::MO_DTPOFF:    O << "@DTPOFF";    break;
+  case X86II::MO_NTPOFF:    O << "@NTPOFF";    break;
+  case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+  case X86II::MO_GOTPCREL:  O << "@GOTPCREL";  break;
+  case X86II::MO_GOT:       O << "@GOT";       break;
+  case X86II::MO_GOTOFF:    O << "@GOTOFF";    break;
+  case X86II::MO_PLT:       O << "@PLT";       break;
+  case X86II::MO_TLVP:      O << "@TLVP";      break;
+  case X86II::MO_TLVP_PIC_BASE:
+    O << "@TLVP" << '-' << *P.MF->getPICBaseSymbol();
+    break;
+  case X86II::MO_SECREL:    O << "@SECREL32";  break;
+  }
+}
+
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+                         unsigned OpNo, raw_ostream &O,
+                         const char *Modifier = nullptr, unsigned AsmVariant = 0);
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.  These print slightly differently, for
+/// example, a $ is not emitted.
+static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
+                          unsigned OpNo, raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Unknown pcrel immediate operand");
+  case MachineOperand::MO_Register:
+    // pc-relativeness was handled when computing the value in the reg.
+    printOperand(P, MI, OpNo, O);
+    return;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    printSymbolOperand(P, MO, O);
+    return;
+  }
+}
+
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+                         unsigned OpNo, raw_ostream &O, const char *Modifier,
+                         unsigned AsmVariant) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type!");
+  case MachineOperand::MO_Register: {
+    // FIXME: Enumerating AsmVariant, so we can remove magic number.
+    if (AsmVariant == 0) O << '%';
+    unsigned Reg = MO.getReg();
+    if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+      MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ?
+        MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+                    ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+      Reg = getX86SubSuperRegister(Reg, VT);
+    }
+    O << X86ATTInstPrinter::getRegisterName(Reg);
+    return;
+  }
+
+  case MachineOperand::MO_Immediate:
+    if (AsmVariant == 0) O << '$';
+    O << MO.getImm();
+    return;
+
+  case MachineOperand::MO_GlobalAddress: {
+    if (AsmVariant == 0) O << '$';
+    printSymbolOperand(P, MO, O);
+    break;
+  }
+  }
+}
+
+static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                                 unsigned Op, raw_ostream &O,
+                                 const char *Modifier = nullptr) {
+  const MachineOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+
+  // If we really don't want to print out (rip), don't.
+  bool HasBaseReg = BaseReg.getReg() != 0;
+  if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+      BaseReg.getReg() == X86::RIP)
+    HasBaseReg = false;
+
+  // HasParenPart - True if we will print out the () part of the mem ref.
+  bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+  switch (DispSpec.getType()) {
+  default:
+    llvm_unreachable("unknown operand type!");
+  case MachineOperand::MO_Immediate: {
+    int DispVal = DispSpec.getImm();
+    if (DispVal || !HasParenPart)
+      O << DispVal;
+    break;
+  }
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ConstantPoolIndex:
+    printSymbolOperand(P, DispSpec, O);
+  }
+
+  if (Modifier && strcmp(Modifier, "H") == 0)
+    O << "+8";
+
+  if (HasParenPart) {
+    assert(IndexReg.getReg() != X86::ESP &&
+           "X86 doesn't allow scaling by ESP");
+
+    O << '(';
+    if (HasBaseReg)
+      printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);
+
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
+      unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+      if (ScaleVal != 1)
+        O << ',' << ScaleVal;
+    }
+    O << ')';
+  }
+}
+
+static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                              unsigned Op, raw_ostream &O,
+                              const char *Modifier = nullptr) {
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+  const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
+  if (Segment.getReg()) {
+    printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
+    O << ':';
+  }
+  printLeaMemReference(P, MI, Op, O, Modifier);
+}
+
+static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                                   unsigned Op, raw_ostream &O,
+                                   const char *Modifier = nullptr,
+                                   unsigned AsmVariant = 1) {
+  const MachineOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+  const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+  const MachineOperand &SegReg   = MI->getOperand(Op+X86::AddrSegmentReg);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
+    O << ':';
+  }
+
+  O << '[';
+
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
+  } else {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << DispVal;
+    }
+  }
+  O << ']';
+}
+
+static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
+                              char Mode, raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  case 'q':
+    // Print 64-bit register names if 64-bit integer registers are available.
+    // Otherwise, print 32-bit register names.
+    MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32;
+    Reg = getX86SubSuperRegister(Reg, Ty);
+    break;
+  }
+
+  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNo);
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'a': // This is an address.  Currently only 'i' and 'r' are expected.
+      switch (MO.getType()) {
+      default:
+        return true;
+      case MachineOperand::MO_Immediate:
+        O << MO.getImm();
+        return false;
+      case MachineOperand::MO_ConstantPoolIndex:
+      case MachineOperand::MO_JumpTableIndex:
+      case MachineOperand::MO_ExternalSymbol:
+        llvm_unreachable("unexpected operand type!");
+      case MachineOperand::MO_GlobalAddress:
+        printSymbolOperand(*this, MO, O);
+        if (Subtarget->isPICStyleRIPRel())
+          O << "(%rip)";
+        return false;
+      case MachineOperand::MO_Register:
+        O << '(';
+        printOperand(*this, MI, OpNo, O);
+        O << ')';
+        return false;
+      }
+
+    case 'c': // Don't print "$" before a global var name or constant.
+      switch (MO.getType()) {
+      default:
+        printOperand(*this, MI, OpNo, O);
+        break;
+      case MachineOperand::MO_Immediate:
+        O << MO.getImm();
+        break;
+      case MachineOperand::MO_ConstantPoolIndex:
+      case MachineOperand::MO_JumpTableIndex:
+      case MachineOperand::MO_ExternalSymbol:
+        llvm_unreachable("unexpected operand type!");
+      case MachineOperand::MO_GlobalAddress:
+        printSymbolOperand(*this, MO, O);
+        break;
+      }
+      return false;
+
+    case 'A': // Print '*' before a register (it must be a register)
+      if (MO.isReg()) {
+        O << '*';
+        printOperand(*this, MI, OpNo, O);
+        return false;
+      }
+      return true;
+
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print DImode register
+      if (MO.isReg())
+        return printAsmMRegister(*this, MO, ExtraCode[0], O);
+      printOperand(*this, MI, OpNo, O);
+      return false;
+
+    case 'P': // This is the operand of a call, treat specially.
+      printPCRelImm(*this, MI, OpNo, O);
+      return false;
+
+    case 'n':  // Negate the immediate or print a '-' before the operand.
+      // Note: this is a temporary solution. It should be handled target
+      // independently as part of the 'MC' work.
+      if (MO.isImm()) {
+        O << -MO.getImm();
+        return false;
+      }
+      O << '-';
+    }
+  }
+
+  printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
+  return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNo, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (AsmVariant) {
+    printIntelMemReference(*this, MI, OpNo, O);
+    return false;
+  }
+
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print SImode register
+      // These only apply to registers, ignore on mem.
+      break;
+    case 'H':
+      printMemReference(*this, MI, OpNo, O, "H");
+      return false;
+    case 'P': // Don't print @PLT, but do print as memory.
+      printMemReference(*this, MI, OpNo, O, "no-rip");
+      return false;
+    }
+  }
+  printMemReference(*this, MI, OpNo, O);
+  return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMacho())
+    OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+  if (Subtarget->isTargetCOFF()) {
+    // Emit an absolute @feat.00 symbol.  This appears to be some kind of
+    // compiler features bitfield read by link.exe.
+    if (!Subtarget->is64Bit()) {
+      MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00"));
+      OutStreamer.BeginCOFFSymbolDef(S);
+      OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+      OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+      OutStreamer.EndCOFFSymbolDef();
+      // According to the PE-COFF spec, the LSB of this value marks the object
+      // for "registered SEH".  This means that all SEH handler entry points
+      // must be registered in .sxdata.  Use of any unregistered handlers will
+      // cause the process to terminate immediately.  LLVM does not know how to
+      // register any SEH handlers, so its object files should be safe.
+      S->setAbsolute();
+      OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+      OutStreamer.EmitAssignment(
+          S, MCConstantExpr::Create(int64_t(1), MMI->getContext()));
+    }
+  }
+}
+
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  //   .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+  if (MCSym.getInt())
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+  else
+    // Internal to current translation unit.
+    //
+    // When we place the LSDA into the TEXT section, the type info
+    // pointers need to be indirect and pc-rel. We accomplish this by
+    // using NLPs; however, sometimes the types are local to the file.
+    // We need to fill in the value for the NLP in those cases.
+    OutStreamer.EmitValue(
+        MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+        4 /*size*/);
+}
+
+MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  if (Subtarget->isTargetKnownWindowsMSVC()) {
+    const MachineConstantPoolEntry &CPE =
+        MF->getConstantPool()->getConstants()[CPID];
+    if (!CPE.isMachineConstantPoolEntry()) {
+      SectionKind Kind = CPE.getSectionKind(TM.getDataLayout());
+      const Constant *C = CPE.Val.ConstVal;
+      const MCSectionCOFF *S = cast<MCSectionCOFF>(
+          getObjFileLowering().getSectionForConstant(Kind, C));
+      if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+        if (Sym->isUndefined())
+          OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+        return Sym;
+      }
+    }
+  }
+
+  return AsmPrinter::GetCPISymbol(CPID);
+}
+
+void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) {
+  SmallString<128> Directive;
+  raw_svector_ostream OS(Directive);
+  StringRef Name = Sym->getName();
+
+  if (Subtarget->isTargetKnownWindowsMSVC())
+    OS << " /EXPORT:";
+  else
+    OS << " -export:";
+
+  if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) &&
+      (Name[0] == getDataLayout().getGlobalPrefix()))
+    Name = Name.drop_front();
+
+  OS << Name;
+
+  if (IsData) {
+    if (Subtarget->isTargetKnownWindowsMSVC())
+      OS << ",DATA";
+    else
+      OS << ",data";
+  }
+
+  OS.flush();
+  OutStreamer.EmitBytes(Directive);
+}
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMacho()) {
+    // All darwin targets use mach-o.
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output stubs for dynamically-linked functions.
+    MachineModuleInfoMachO::SymbolListTy Stubs;
+
+    Stubs = MMIMacho.GetFnStubList();
+    if (!Stubs.empty()) {
+      const MCSection *TheSection =
+        OutContext.getMachOSection("__IMPORT", "__jump_table",
+                                   MachO::S_SYMBOL_STUBS |
+                                   MachO::S_ATTR_SELF_MODIFYING_CODE |
+                                   MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   5, SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (const auto &Stub : Stubs) {
+        // L_foo$stub:
+        OutStreamer.EmitLabel(Stub.first);
+        //   .indirect_symbol _foo
+        OutStreamer.EmitSymbolAttribute(Stub.second.getPointer(),
+                                        MCSA_IndirectSymbol);
+        // hlt; hlt; hlt; hlt; hlt     hlt = 0xf4.
+        const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
+        OutStreamer.EmitBytes(StringRef(HltInsts, 5));
+      }
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Output stubs for external and common global variables.
+    Stubs = MMIMacho.GetGVStubList();
+    if (!Stubs.empty()) {
+      const MCSection *TheSection =
+        OutContext.getMachOSection("__IMPORT", "__pointers",
+                                   MachO::S_NON_LAZY_SYMBOL_POINTERS,
+                                   SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
+      const MCSection *TheSection =
+        OutContext.getMachOSection("__IMPORT", "__pointers",
+                                   MachO::S_NON_LAZY_SYMBOL_POINTERS,
+                                   SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    SM.serializeToStackMapSection();
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  if (Subtarget->isTargetKnownWindowsMSVC() && MMI->usesVAFloatArgument()) {
+    StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused";
+    MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName);
+    OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+  }
+
+  if (Subtarget->isTargetCOFF()) {
+    // Necessary for dllexport support
+    std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
+
+    for (const auto &Function : M)
+      if (Function.hasDLLExportStorageClass())
+        DLLExportedFns.push_back(getSymbol(&Function));
+
+    for (const auto &Global : M.globals())
+      if (Global.hasDLLExportStorageClass())
+        DLLExportedGlobals.push_back(getSymbol(&Global));
+
+    for (const auto &Alias : M.aliases()) {
+      if (!Alias.hasDLLExportStorageClass())
+        continue;
+
+      if (Alias.getType()->getElementType()->isFunctionTy())
+        DLLExportedFns.push_back(getSymbol(&Alias));
+      else
+        DLLExportedGlobals.push_back(getSymbol(&Alias));
+    }
+
+    // Output linker support code for dllexported globals on windows.
+    if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+      const TargetLoweringObjectFileCOFF &TLOFCOFF =
+        static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+      OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
+
+      for (auto & Symbol : DLLExportedGlobals)
+        GenerateExportDirective(Symbol, /*IsData=*/true);
+      for (auto & Symbol : DLLExportedFns)
+        GenerateExportDirective(Symbol, /*IsData=*/false);
+    }
+  }
+
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (const auto &Stub : Stubs) {
+        OutStreamer.EmitLabel(Stub.first);
+        OutStreamer.EmitSymbolValue(Stub.second.getPointer(),
+                                    TD->getPointerSize());
+      }
+      Stubs.clear();
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+  RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+  RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 0000000..b1bbe8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,61 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCStreamer;
+class MCSymbol;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+  const X86Subtarget *Subtarget;
+  StackMaps SM;
+
+  void GenerateExportDirective(const MCSymbol *Sym, bool IsData);
+
+ public:
+  explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer), SM(*this) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+  }
+
+  const char *getPassName() const override {
+    return "X86 Assembly / Object Emitter";
+  }
+
+  const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+  void EmitStartOfAsmFile(Module &M) override;
+
+  void EmitEndOfAsmFile(Module &M) override;
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+
+  /// \brief Return the symbol for the specified constant pool entry.
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86AtomicExpandPass.cpp b/contrib/llvm/lib/Target/X86/X86AtomicExpandPass.cpp
new file mode 100644
index 0000000..3dcadb1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AtomicExpandPass.cpp
@@ -0,0 +1,283 @@
+//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions which
+// cannot be implemented as a single instruction with cmpxchg-based loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-atomic-expand"
+
+namespace {
+  class X86AtomicExpandPass : public FunctionPass {
+    const X86TargetMachine *TM;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit X86AtomicExpandPass(const X86TargetMachine *TM)
+      : FunctionPass(ID), TM(TM) {}
+
+    bool runOnFunction(Function &F) override;
+    bool expandAtomicInsts(Function &F);
+
+    bool needsCmpXchgNb(Type *MemType);
+
+    /// There are four kinds of atomic operations. Two never need expanding:
+    /// cmpxchg is what we expand the others *to*, and loads are easily handled
+    /// by ISelLowering. Atomicrmw and store can need expanding in some
+    /// circumstances.
+    bool shouldExpand(Instruction *Inst);
+
+    /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms
+    /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic.
+    bool shouldExpandStore(StoreInst *SI);
+
+    /// Only some atomicrmw instructions need expanding -- some operations
+    /// (e.g. max) have absolutely no architectural support; some (e.g. or) have
+    /// limited support but can't return the previous value; some (e.g. add)
+    /// have complete support in the instruction set.
+    ///
+    /// Also, naturally, 128-bit operations always need to be expanded.
+    bool shouldExpandAtomicRMW(AtomicRMWInst *AI);
+
+    bool expandAtomicRMW(AtomicRMWInst *AI);
+    bool expandAtomicStore(StoreInst *SI);
+  };
+}
+
+char X86AtomicExpandPass::ID = 0;
+
+FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) {
+  return new X86AtomicExpandPass(TM);
+}
+
+bool X86AtomicExpandPass::runOnFunction(Function &F) {
+  SmallVector<Instruction *, 1> AtomicInsts;
+
+  // Changing control-flow while iterating through it is a bad idea, so gather a
+  // list of all atomic instructions before we start.
+  for (BasicBlock &BB : F)
+    for (Instruction &Inst : BB) {
+      if (isa<AtomicRMWInst>(&Inst) ||
+          (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
+        AtomicInsts.push_back(&Inst);
+    }
+
+  bool MadeChange = false;
+  for (Instruction *Inst : AtomicInsts) {
+    if (!shouldExpand(Inst))
+      continue;
+
+    if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+      MadeChange |= expandAtomicRMW(AI);
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      MadeChange |= expandAtomicStore(SI);
+
+    assert(MadeChange && "Atomic inst not expanded when it should be?");
+    Inst->eraseFromParent();
+  }
+
+  return MadeChange;
+}
+
+/// Returns true if the operand type is 1 step up from the native width, and
+/// the corresponding cmpxchg8b or cmpxchg16b instruction is available
+/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
+bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) {
+  const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
+  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+
+  if (OpWidth == 64)
+    return !Subtarget.is64Bit();  // FIXME this should be Subtarget.hasCmpxchg8b
+  if (OpWidth == 128)
+    return Subtarget.hasCmpxchg16b();
+
+  return false;
+}
+
+bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) {
+  const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+
+  if (needsCmpXchgNb(AI->getType()))
+    return true;
+
+  if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth)
+    return false;
+
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+  switch (Op) {
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  case AtomicRMWInst::Xchg:
+  case AtomicRMWInst::Add:
+  case AtomicRMWInst::Sub:
+    // It's better to use xadd, xsub or xchg for these in all cases.
+    return false;
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Xor:
+    // If the atomicrmw's result isn't actually used, we can just add a "lock"
+    // prefix to a normal instruction for these operations.
+    return !AI->use_empty();
+  case AtomicRMWInst::Nand:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::UMax:
+  case AtomicRMWInst::UMin:
+    // These always require a non-trivial set of data operations on x86. We must
+    // use a cmpxchg loop.
+    return true;
+  }
+}
+
+bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) {
+  if (needsCmpXchgNb(SI->getValueOperand()->getType()))
+    return true;
+
+  return false;
+}
+
+bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) {
+  if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+    return shouldExpandAtomicRMW(AI);
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return shouldExpandStore(SI);
+  return false;
+}
+
+/// Emit IR to implement the given atomicrmw operation on values in registers,
+/// returning the new value.
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
+                              Value *Loaded, Value *Inc) {
+  Value *NewVal;
+  switch (Op) {
+  case AtomicRMWInst::Xchg:
+    return Inc;
+  case AtomicRMWInst::Add:
+    return Builder.CreateAdd(Loaded, Inc, "new");
+  case AtomicRMWInst::Sub:
+    return Builder.CreateSub(Loaded, Inc, "new");
+  case AtomicRMWInst::And:
+    return Builder.CreateAnd(Loaded, Inc, "new");
+  case AtomicRMWInst::Nand:
+    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
+  case AtomicRMWInst::Or:
+    return Builder.CreateOr(Loaded, Inc, "new");
+  case AtomicRMWInst::Xor:
+    return Builder.CreateXor(Loaded, Inc, "new");
+  case AtomicRMWInst::Max:
+    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
+    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+  case AtomicRMWInst::Min:
+    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
+    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+  case AtomicRMWInst::UMax:
+    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
+    return  Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+  case AtomicRMWInst::UMin:
+    NewVal = Builder.CreateICmpULE(Loaded, Inc);
+    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+  default:
+    break;
+  }
+  llvm_unreachable("Unknown atomic op");
+}
+
+bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
+  AtomicOrdering Order =
+      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
+  Value *Addr = AI->getPointerOperand();
+  BasicBlock *BB = AI->getParent();
+  Function *F = BB->getParent();
+  LLVMContext &Ctx = F->getContext();
+
+  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+  //
+  // The standard expansion we produce is:
+  //     [...]
+  //     %init_loaded = load atomic iN* %addr
+  //     br label %loop
+  // loop:
+  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
+  //     %new = some_op iN %loaded, %incr
+  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
+  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
+  //     %success = extractvalue { iN, i1 } %pair, 1
+  //     br i1 %success, label %atomicrmw.end, label %loop
+  // atomicrmw.end:
+  //     [...]
+  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+  // This grabs the DebugLoc from AI.
+  IRBuilder<> Builder(AI);
+
+  // The split call above "helpfully" added a branch at the end of BB (to the
+  // wrong place), but we want a load. It's easiest to just remove
+  // the branch entirely.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
+  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits());
+  Builder.CreateBr(LoopBB);
+
+  // Start the main loop block now that we've taken care of the preliminaries.
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
+  Loaded->addIncoming(InitLoaded, BB);
+
+  Value *NewVal =
+      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+
+  Value *Pair = Builder.CreateAtomicCmpXchg(
+      Addr, Loaded, NewVal, Order,
+      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+  Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+  Loaded->addIncoming(NewLoaded, LoopBB);
+
+  Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
+  Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+  AI->replaceAllUsesWith(NewLoaded);
+
+  return true;
+}
+
+bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) {
+  // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express
+  // this in terms of the usual expansion to "atomicrmw xchg".
+  IRBuilder<> Builder(SI);
+  AtomicOrdering Order =
+      SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering();
+  AtomicRMWInst *AI =
+      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
+                              SI->getValueOperand(), Order);
+
+  // Now we have an appropriate swap instruction, lower it as usual.
+  if (shouldExpandAtomicRMW(AI)) {
+    expandAtomicRMW(AI);
+    AI->eraseFromParent();
+    return true;
+  }
+
+  return AI;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 0000000..e76f9fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,35 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86CALLINGCONV_H
+#define X86CALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+                                CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+                                CCState &) {
+  llvm_unreachable("The AnyReg calling convention is only supported by the " \
+                   "stackmap and patchpoint intrinsics.");
+  // gracefully fallback to X86 C calling convention on Release builds.
+  return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..86c01bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,662 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+  // Scalar values are returned in AX first, then DX.  For i8, the ABI
+  // requires the values to be in AL and AH, however this code uses AL and DL
+  // instead. This is because using AH for the second register conflicts with
+  // the way LLVM does multiple return values -- a return of {i16,i8} would end
+  // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+  // for functions that return two i8 values are currently expected to pack the
+  // values into an i16 (which uses AX, and thus AL:AH).
+  //
+  // For code that doesn't care about the ABI, we allow returning more than two
+  // integer values in registers.
+  CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+  // Vector types are returned in XMM0 and XMM1, when they fit.  XMM2 and XMM3
+  // can only be used by ABI non-compliant code. If the target doesn't have XMM
+  // registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX target feature.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX-512 target feature.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // MMX vector types are always returned in MM0. If the target doesn't have
+  // MM0, it doesn't support these vector types.
+  CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+  // Long double types are always returned in ST0 (even with SSE).
+  CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+  // The X86-32 calling convention returns FP values in ST0, unless marked
+  // with "inreg" (used here to distinguish one kind of reg from another,
+  // weirdly; this is really the sse-regparm calling convention) in which
+  // case they use XMM0, otherwise it is the same as the common X86 calling
+  // conv.
+  CCIfInReg<CCIfSubtarget<"hasSSE2()",
+    CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+  CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+  // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+  // SSE2.
+  // This can happen when a float, 2 x float, or 3 x float vector is split by
+  // target lowering, and is returned in 1-3 sse regs.
+  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+  // For integers, ECX can be used as an extra return register
+  CCIfType<[i8],  CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+  // Otherwise, it is the same as the common X86 calling convention.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit FP vectors
+  // No more than 4 registers
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // i32, i64 in the standard way
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+  // Promote all types to i32
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Return: HP, P, VAL1, VAL2
+  CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+  // MMX vector types are always returned in XMM0.
+  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+  // The X86-Win64 calling convention always returns __m64 values in RAX.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // Otherwise, everything is the same as 'normal' X86-64 C CC.
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: HP, P, VAL1, VAL2
+  CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: RAX
+  CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+  // If FastCC, use RetCC_X86_32_Fast.
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+  // If HiPE, use RetCC_X86_32_HiPE.
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+
+  // Otherwise, use RetCC_X86_32_C.
+  CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+  // HiPE uses RetCC_X86_64_HiPE
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+  // Handle JavaScript calls.
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
+  // Handle explicit CC selection
+  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+
+  // Check if this is the Intel OpenCL built-ins calling convention
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+  CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // The first 6 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+  // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+  CCIfType<[x86mmx],
+            CCIfSubtarget<"isTargetDarwin()",
+            CCIfSubtarget<"hasSSE2()",
+            CCPromoteToType<v2i64>>>>,
+
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+  // The first 8 256-bit vector arguments are passed in YMM registers, unless
+  // this is a vararg function.
+  // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+  // fixed arguments to vararg functions are supposed to be passed in
+  // registers.  Actually modeling that would be a lot of work, though.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          CCIfSubtarget<"hasFp256()",
+                          CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+                                         YMM4, YMM5, YMM6, YMM7]>>>>,
+
+  // The first 8 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+            CCIfSubtarget<"hasAVX512()",
+            CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+  // FIXME: Handle byval stuff.
+  // FIXME: Handle varargs.
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // 128 bit vectors are passed by pointer
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+
+  // 256 bit vectors are passed by pointer
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+  // 512 bit vectors are passed by pointer
+  CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
+  // The first 4 MMX vector arguments are passed in GPRs.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Do not pass the sret argument in RCX, the Win64 thiscall calling
+  // convention requires "this" to be passed in RCX.
+  CCIfCC<"CallingConv::X86_ThisCall",
+    CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8  , R9  ],
+                                                     [XMM1, XMM2, XMM3]>>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8  , R9  ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The first 4 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+                                   [RCX , RDX , R8  , R9  ]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 0>>
+]>;
+
+def CC_X86_64_GHC : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64],
+            CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+  // Pass in STG registers: F1, F2, F3, F4, D1, D2
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+  CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_64_WebKit_JS : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Only the first integer argument is passed in register.
+  CCIfType<[i32], CCAssignToReg<[EAX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX]>>,
+
+  // The remaining integer arguments are passed on the stack. 32bit integer and
+  // floating-point arguments are aligned to 4 byte and stored in 4 byte slots.
+  // 64bit integer and floating-point arguments are aligned to 8 byte and stored
+  // in 8 byte stack slots.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // The first 3 float or double arguments, if marked 'inreg' and if the call
+  // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+  // The first 3 __m64 vector arguments are passed in mmx registers if the
+  // call is not a vararg call.
+  CCIfNotVarArg<CCIfType<[x86mmx],
+                CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Doubles get 8-byte slots that are 4-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+  // Long doubles get slots whose size depends on the subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+  // The first 4 SSE vector arguments are passed in XMM registers.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+  // The first 4 AVX 256-bit vector arguments are passed in YMM registers.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                CCIfSubtarget<"hasFp256()",
+                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+  // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+  // passed in the parameter area.
+  CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
+
+def CC_X86_32_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in ECX.
+  CCIfNest<CCAssignToReg<[ECX]>>,
+
+  // The first 3 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Common : CallingConv<[
+  // The first integer argument is passed in ECX
+  CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Mingw : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall_Win : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass sret arguments indirectly through stack.
+  CCIfSRet<CCAssignToStack<4, 4>>,
+
+  CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+  CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
+  CCDelegateTo<CC_X86_32_ThisCall_Win>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+  // Handles byval parameters.  Note that we can't rely on the delegation
+  // to CC_X86_32_Common for this because that happens after code that
+  // puts arguments in registers.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+  // The first 3 float or double arguments, if the call is not a vararg
+  // call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+  // Doubles get 8-byte slots that are 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1
+  CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+  CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+  CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8,  R9 ]>>>,
+
+  CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+  CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+  CCIfSubtarget<"is64Bit()",       CCDelegateTo<CC_X86_64_C>>,
+  CCDelegateTo<CC_X86_32_C>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+  CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+  CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+  CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+
+  // Otherwise, drop to normal X86-32 CC
+  CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+  CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
+  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+def CC_X86 : CallingConv<[
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+  CCDelegateTo<CC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
+                                     (sequence "XMM%u", 6, 15))>;
+
+// All GPRs - except r11
+def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
+                                              R8, R9, R10, RSP)>;
+
+// All registers - except r11
+def CSR_64_RT_AllRegs     : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+                                                 (sequence "XMM%u", 0, 15))>;
+def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+                                                 (sequence "YMM%u", 0, 15))>;
+
+def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+                                           R11, R12, R13, R14, R15, RBP,
+                                           (sequence "XMM%u", 0, 15))>;
+
+def CSR_64_AllRegs     : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP,
+                                              (sequence "XMM%u", 16, 31))>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP,
+                                                   (sequence "YMM%u", 0, 31)),
+                                              (sequence "XMM%u", 0, 15))>;
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+                                                  R13, R14, R15,
+                                                  (sequence "YMM%u", 6, 15))>;
+
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+                                                     R12, R13, R14, R15,
+                                                     (sequence "ZMM%u", 6, 21),
+                                                     K4, K5, K6, K7)>;
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
+                                                 (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX    : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
+                                                  (sequence "ZMM%u", 16, 31),
+                                                  K4, K5, K6, K7)>;
diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..a3ae7ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,1498 @@
+//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-emitter"
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+  template<class CodeEmitter>
+  class Emitter : public MachineFunctionPass {
+    const X86InstrInfo  *II;
+    const DataLayout    *TD;
+    X86TargetMachine    &TM;
+    CodeEmitter         &MCE;
+    MachineModuleInfo   *MMI;
+    intptr_t PICBaseOffset;
+    bool Is64BitMode;
+    bool IsPIC;
+  public:
+    static char ID;
+    explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+      : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm),
+        MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "X86 Machine Code Emitter";
+    }
+
+    void emitOpcodePrefix(uint64_t TSFlags, int MemOperand,
+                          const MachineInstr &MI,
+                          const MCInstrDesc *Desc) const;
+
+    void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand,
+                             const MachineInstr &MI,
+                             const MCInstrDesc *Desc) const;
+
+    void emitSegmentOverridePrefix(uint64_t TSFlags,
+                                   int MemOperand,
+                                   const MachineInstr &MI) const;
+
+    void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                           intptr_t Disp = 0, intptr_t PCAdj = 0,
+                           bool Indirect = false);
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+                              intptr_t PCAdj = 0);
+    void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                              intptr_t PCAdj = 0);
+
+    void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+                               intptr_t Adj = 0, bool IsPCRel = true);
+
+    void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+    void emitRegModRMByte(unsigned RegOpcodeField);
+    void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+    void emitConstant(uint64_t Val, unsigned Size);
+
+    void emitMemModRMByte(const MachineInstr &MI,
+                          unsigned Op, unsigned RegOpcodeField,
+                          intptr_t PCAdj = 0);
+
+    unsigned getX86RegNum(unsigned RegNo) const {
+      const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+      return TRI->getEncodingValue(RegNo) & 0x7;
+    }
+
+    unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+                                         unsigned OpNum) const;
+  };
+
+template<class CodeEmitter>
+  char Emitter<CodeEmitter>::ID = 0;
+} // end anonymous namespace.
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified JITCodeEmitter object.
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+
+  II = TM.getInstrInfo();
+  TD = TM.getDataLayout();
+  Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+
+  do {
+    DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I) {
+        const MCInstrDesc &Desc = I->getDesc();
+        emitInstruction(*I, &Desc);
+        // MOVPC32r is basically a call plus a pop instruction.
+        if (Desc.getOpcode() == X86::MOVPC32r)
+          emitInstruction(*I, &II->get(X86::POP32r));
+        ++NumEmitted;  // Keep track of the # of mi's emitted
+      }
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned determineREX(const MachineInstr &MI) {
+  unsigned REX = 0;
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // Pseudo instructions do not need REX prefix byte.
+  if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return 0;
+  if (Desc.TSFlags & X86II::REX_W)
+    REX |= 1 << 3;
+
+  unsigned NumOps = Desc.getNumOperands();
+  if (NumOps) {
+    bool isTwoAddr = NumOps > 1 &&
+      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+    // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+    unsigned i = isTwoAddr ? 1 : 0;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MachineOperand& MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        unsigned Reg = MO.getReg();
+        if (X86II::isX86_64NonExtLowByteReg(Reg))
+          REX |= 0x40;
+      }
+    }
+
+    switch (Desc.TSFlags & X86II::FormMask) {
+      case X86II::MRMSrcReg: {
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= 1 << 2;
+        i = isTwoAddr ? 2 : 1;
+        for (unsigned e = NumOps; i != e; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (X86InstrInfo::isX86_64ExtendedReg(MO))
+            REX |= 1 << 0;
+        }
+        break;
+      }
+      case X86II::MRMSrcMem: {
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= 1 << 2;
+        unsigned Bit = 0;
+        i = isTwoAddr ? 2 : 1;
+        for (; i != NumOps; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (MO.isReg()) {
+            if (X86InstrInfo::isX86_64ExtendedReg(MO))
+              REX |= 1 << Bit;
+            Bit++;
+          }
+        }
+        break;
+      }
+      case X86II::MRMXm:
+      case X86II::MRM0m: case X86II::MRM1m:
+      case X86II::MRM2m: case X86II::MRM3m:
+      case X86II::MRM4m: case X86II::MRM5m:
+      case X86II::MRM6m: case X86II::MRM7m:
+      case X86II::MRMDestMem: {
+        unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+        i = isTwoAddr ? 1 : 0;
+        if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e)))
+          REX |= 1 << 2;
+        unsigned Bit = 0;
+        for (; i != e; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (MO.isReg()) {
+            if (X86InstrInfo::isX86_64ExtendedReg(MO))
+              REX |= 1 << Bit;
+            Bit++;
+          }
+        }
+        break;
+      }
+      default: {
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= 1 << 0;
+        i = isTwoAddr ? 2 : 1;
+        for (unsigned e = NumOps; i != e; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (X86InstrInfo::isX86_64ExtendedReg(MO))
+            REX |= 1 << 2;
+        }
+        break;
+      }
+    }
+  }
+  return REX;
+}
+
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+  // Remember where this reference was and where it is to so we can
+  // deal with it later.
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             X86::reloc_pcrel_word, MBB));
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV,
+                                unsigned Reloc,
+                                intptr_t Disp /* = 0 */,
+                                intptr_t PCAdj /* = 0 */,
+                                bool Indirect /* = false */) {
+  intptr_t RelocCST = Disp;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MachineRelocation MR = Indirect
+    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+                                           const_cast<GlobalValue *>(GV),
+                                           RelocCST, false)
+    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                               const_cast<GlobalValue *>(GV), RelocCST, false);
+  MCE.addRelocation(MR);
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(Disp);
+  else
+    MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+                                                     unsigned Reloc) {
+  intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+
+  // X86 never needs stubs because instruction selection will always pick
+  // an instruction sequence that is large enough to hold any address
+  // to a symbol.
+  // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
+  bool NeedStub = false;
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, RelocCST,
+                                                 0, NeedStub));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(0);
+  else
+    MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of an constant pool
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+                                   intptr_t Disp /* = 0 */,
+                                   intptr_t PCAdj /* = 0 */) {
+  intptr_t RelocCST = 0;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, RelocCST));
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(Disp);
+  else
+    MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                                   intptr_t PCAdj /* = 0 */) {
+  intptr_t RelocCST = 0;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTI, RelocCST));
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(0);
+  else
+    MCE.emitWordLE(0);
+}
+
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                      unsigned RM) {
+  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+  return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+                                            unsigned RegOpcodeFld){
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+                                       unsigned Index,
+                                       unsigned Base) {
+  // SIB byte is in the same format as the ModRMByte...
+  MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+  // Output the constant in little endian byte order...
+  for (unsigned i = 0; i != Size; ++i) {
+    MCE.emitByte(Val & 255);
+    Val >>= 8;
+  }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+                              const TargetMachine &TM) {
+  // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
+  // mechanism as 32-bit mode.
+  if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+      !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+    return false;
+
+  // Return true if this is a reference to a stub containing the address of the
+  // global, not the global itself.
+  return isGlobalStubReference(GVOp.getTargetFlags());
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+                                                 int DispVal,
+                                                 intptr_t Adj /* = 0 */,
+                                                 bool IsPCRel /* = true */) {
+  // If this is a simple integer displacement that doesn't require a relocation,
+  // emit it now.
+  if (!RelocOp) {
+    emitConstant(DispVal, 4);
+    return;
+  }
+
+  // Otherwise, this is something that requires a relocation.  Emit it as such
+  // now.
+  unsigned RelocType = Is64BitMode ?
+    (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+    : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+  if (RelocOp->isGlobal()) {
+    // In 64-bit static small code model, we could potentially emit absolute.
+    // But it's probably not beneficial. If the MCE supports using RIP directly
+    // do it, otherwise fallback to absolute (this is determined by IsPCRel).
+    //  89 05 00 00 00 00     mov    %eax,0(%rip)  # PC-relative
+    //  89 04 25 00 00 00 00  mov    %eax,0x0      # Absolute
+    bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+    emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+                      Adj, Indirect);
+  } else if (RelocOp->isSymbol()) {
+    emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
+  } else if (RelocOp->isCPI()) {
+    emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+                         RelocOp->getOffset(), Adj);
+  } else {
+    assert(RelocOp->isJTI() && "Unexpected machine operand!");
+    emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
+  }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+                                            unsigned Op,unsigned RegOpcodeField,
+                                            intptr_t PCAdj) {
+  const MachineOperand &Op3 = MI.getOperand(Op+3);
+  int DispVal = 0;
+  const MachineOperand *DispForReloc = nullptr;
+
+  // Figure out what sort of displacement we have to handle here.
+  if (Op3.isGlobal()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isSymbol()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isCPI()) {
+    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+      DispVal += Op3.getOffset();
+    }
+  } else if (Op3.isJTI()) {
+    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+    }
+  } else {
+    DispVal = Op3.getImm();
+  }
+
+  const MachineOperand &Base     = MI.getOperand(Op);
+  const MachineOperand &Scale    = MI.getOperand(Op+1);
+  const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+  unsigned BaseReg = Base.getReg();
+
+  // Handle %rip relative addressing.
+  if (BaseReg == X86::RIP ||
+      (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
+    assert(IndexReg.getReg() == 0 && Is64BitMode &&
+           "Invalid rip-relative address");
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+    return;
+  }
+
+  // Indicate that the displacement will use an pcrel or absolute reference
+  // by default. MCEs able to resolve addresses on-the-fly use pcrel by default
+  // while others, unless explicit asked to use RIP, use absolute references.
+  bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
+
+  // Is a SIB byte needed?
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+  unsigned BaseRegNo = -1U;
+  if (BaseReg != 0 && BaseReg != X86::RIP)
+    BaseRegNo = getX86RegNum(BaseReg);
+
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 &&
+      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+      // encode to an R/M value of 4, which indicates that a SIB byte is
+      // present.
+      BaseRegNo != N86::ESP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!Is64BitMode || BaseReg != 0)) {
+    if (BaseReg == 0 ||          // [disp32]     in X86-32 mode
+        BaseReg == X86::RIP) {   // [disp32+RIP] in X86-64 mode
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+      emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+      return;
+    }
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+      return;
+    }
+
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (!DispForReloc && isDisp8(DispVal)) {
+      MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+      emitConstant(DispVal, 1);
+      return;
+    }
+
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+    return;
+  }
+
+  // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=4, to JUST get the index, scale, and displacement.
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+    ForceDisp32 = true;
+  } else if (DispForReloc) {
+    // Emit the normal disp32 encoding.
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+    ForceDisp32 = true;
+  } else if (DispVal == 0 && BaseRegNo != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+  } else if (isDisp8(DispVal)) {
+    // Emit the disp8 encoding...
+    MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else {
+    // Emit the normal disp32 encoding...
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    emitSIBByte(SS, IndexRegNo, 5);
+  } else {
+    unsigned BaseRegNo = getX86RegNum(BaseReg);
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    emitSIBByte(SS, IndexRegNo, BaseRegNo);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8) {
+    emitConstant(DispVal, 1);
+  } else if (DispVal != 0 || ForceDisp32) {
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+  }
+}
+
+static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II,
+                                   unsigned Opcode) {
+  const MCInstrDesc *Desc = &II->get(Opcode);
+  MI.setDesc(*Desc);
+  return Desc;
+}
+
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) {
+  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) {
+  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
+  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+#endif
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
+                                            int MemOperand,
+                                            const MachineInstr &MI,
+                                            const MCInstrDesc *Desc) const {
+  // Emit the operand size opcode prefix as needed.
+  if (((TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift) == X86II::OpSize16)
+    MCE.emitByte(0x66);
+
+  switch (Desc->TSFlags & X86II::OpPrefixMask) {
+  case X86II::PD:   // 66
+    MCE.emitByte(0x66);
+    break;
+  case X86II::XS:   // F3
+    MCE.emitByte(0xF3);
+    break;
+  case X86II::XD:   // F2
+    MCE.emitByte(0xF2);
+    break;
+  }
+
+  // Handle REX prefix.
+  if (Is64BitMode) {
+    if (unsigned REX = determineREX(MI))
+      MCE.emitByte(0x40 | REX);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  switch (Desc->TSFlags & X86II::OpMapMask) {
+  case X86II::TB:  // Two-byte opcode map
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+    MCE.emitByte(0x0F);
+    break;
+  }
+
+  switch (Desc->TSFlags & X86II::OpMapMask) {
+  case X86II::T8:    // 0F 38
+    MCE.emitByte(0x38);
+    break;
+  case X86II::TA:    // 0F 3A
+    MCE.emitByte(0x3A);
+    break;
+  }
+}
+
+// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+// 0-7 and the difference between the 2 groups is given by the REX prefix.
+// In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+// in 1's complement form, example:
+//
+//  ModRM field => XMM9 => 1
+//  VEX.VVVV    => XMM9 => ~9
+//
+// See table 4-35 of Intel AVX Programming Reference for details.
+template<class CodeEmitter>
+unsigned char
+Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
+                                             unsigned OpNum) const {
+  unsigned SrcReg = MI.getOperand(OpNum).getReg();
+  unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
+  if (X86II::isX86_64ExtendedReg(SrcReg))
+    SrcRegNum |= 8;
+
+  // The registers represented through VEX_VVVV should
+  // be encoded in 1's complement form.
+  return (~SrcRegNum) & 0xf;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
+                                                 int MemOperand,
+                                                 const MachineInstr &MI) const {
+  if (MemOperand < 0)
+    return; // No memory operand
+
+  // Check for explicit segment override on memory operand.
+  switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+  default: llvm_unreachable("Unknown segment register!");
+  case 0: break;
+  case X86::CS: MCE.emitByte(0x2E); break;
+  case X86::SS: MCE.emitByte(0x36); break;
+  case X86::DS: MCE.emitByte(0x3E); break;
+  case X86::ES: MCE.emitByte(0x26); break;
+  case X86::FS: MCE.emitByte(0x64); break;
+  case X86::GS: MCE.emitByte(0x65); break;
+  }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+                                               int MemOperand,
+                                               const MachineInstr &MI,
+                                               const MCInstrDesc *Desc) const {
+  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+                           X86II::EncodingShift;
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+  // VEX_R: opcode externsion equivalent to REX.R in
+  // 1's complement (inverted) form
+  //
+  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX_R=1 (64 bit mode only)
+  //
+  unsigned char VEX_R = 0x1;
+
+  // VEX_X: equivalent to REX.X, only used when a
+  // register is used for index in SIB Byte.
+  //
+  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX.X=1 (64-bit mode only)
+  unsigned char VEX_X = 0x1;
+
+  // VEX_B:
+  //
+  //  1: Same as REX_B=0 (ignored in 32-bit mode)
+  //  0: Same as REX_B=1 (64 bit mode only)
+  //
+  unsigned char VEX_B = 0x1;
+
+  // VEX_W: opcode specific (use like REX.W, or used for
+  // opcode extension, or ignored, depending on the opcode byte)
+  unsigned char VEX_W = 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b00100-0b11111: Reserved for future use
+  //  0b01000: XOP map select - 08h instructions with imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
+  unsigned char VEX_5M = 0;
+
+  // VEX_4V (VEX vvvv field): a register specifier
+  // (in 1's complement form) or 1111 if unused.
+  unsigned char VEX_4V = 0xf;
+
+  // VEX_L (Vector Length):
+  //
+  //  0: scalar or 128-bit vector
+  //  1: 256-bit vector
+  //
+  unsigned char VEX_L = 0;
+
+  // VEX_PP: opcode extension providing equivalent
+  // functionality of a SIMD prefix
+  //
+  //  0b00: None
+  //  0b01: 66
+  //  0b10: F3
+  //  0b11: F2
+  //
+  unsigned char VEX_PP = 0;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+    VEX_W = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+    VEX_L = 1;
+
+  switch (TSFlags & X86II::OpPrefixMask) {
+  default: break; // VEX_PP already correct
+  case X86II::PD: VEX_PP = 0x1; break; // 66
+  case X86II::XS: VEX_PP = 0x2; break; // F3
+  case X86II::XD: VEX_PP = 0x3; break; // F2
+  }
+
+  switch (TSFlags & X86II::OpMapMask) {
+  default: llvm_unreachable("Invalid prefix!");
+  case X86II::TB:   VEX_5M = 0x1; break; // 0F
+  case X86II::T8:   VEX_5M = 0x2; break; // 0F 38
+  case X86II::TA:   VEX_5M = 0x3; break; // 0F 3A
+  case X86II::XOP8: VEX_5M = 0x8; break;
+  case X86II::XOP9: VEX_5M = 0x9; break;
+  case X86II::XOPA: VEX_5M = 0xA; break;
+  }
+
+  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+  unsigned NumOps = Desc->getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+    ++CurOp;
+  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    // Special case for GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+    default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
+    case X86II::RawFrm:
+      break;
+    case X86II::MRMDestMem: {
+      // MRMDestMem instructions forms:
+      //  MemAddr, src1(ModR/M)
+      //  MemAddr, src1(VEX_4V), src2(ModR/M)
+      //  MemAddr, src1(ModR/M), imm8
+      //
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+        VEX_B = 0x0;
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+        VEX_X = 0x0;
+
+      CurOp = X86::AddrNumOperands;
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      const MachineOperand &MO = MI.getOperand(CurOp);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        VEX_R = 0x0;
+      break;
+    }
+    case X86II::MRMSrcMem:
+      // MRMSrcMem instructions forms:
+      //  src1(ModR/M), MemAddr
+      //  src1(ModR/M), src2(VEX_4V), MemAddr
+      //  src1(ModR/M), MemAddr, imm8
+      //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+      //
+      //  FMA4:
+      //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+      //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_R = 0x0;
+      CurOp++;
+
+      if (HasVEX_4V) {
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+        CurOp++;
+      }
+
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+        VEX_B = 0x0;
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+        VEX_X = 0x0;
+
+      if (HasVEX_4VOp3)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+      break;
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m: {
+      // MRM[0-9]m instructions forms:
+      //  MemAddr
+      //  src1(VEX_4V), MemAddr
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+        VEX_B = 0x0;
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+        VEX_X = 0x0;
+      break;
+    }
+    case X86II::MRMSrcReg:
+      // MRMSrcReg instructions forms:
+      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+      //  dst(ModR/M), src1(ModR/M)
+      //  dst(ModR/M), src1(ModR/M), imm8
+      //
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_R = 0x0;
+      CurOp++;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+        CurOp++;
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_B = 0x0;
+      CurOp++;
+      if (HasVEX_4VOp3)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      break;
+    case X86II::MRMDestReg:
+      // MRMDestReg instructions forms:
+      //  dst(ModR/M), src(ModR/M)
+      //  dst(ModR/M), src(ModR/M), imm8
+      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_B = 0x0;
+      CurOp++;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_R = 0x0;
+      break;
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      // MRM0r-MRM7r instructions forms:
+      //  dst(VEX_4V), src(ModR/M), imm8
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      CurOp++;
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_B = 0x0;
+      break;
+  }
+
+  // Emit segment override opcode prefix as needed.
+  emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+  // VEX opcode prefix can have 2 or 3 bytes
+  //
+  //  3 bytes:
+  //    +-----+ +--------------+ +-------------------+
+  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+  //    +-----+ +--------------+ +-------------------+
+  //  2 bytes:
+  //    +-----+ +-------------------+
+  //    | C5h | | R | vvvv | L | pp |
+  //    +-----+ +-------------------+
+  //
+  //  XOP uses a similar prefix:
+  //    +-----+ +--------------+ +-------------------+
+  //    | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+  //    +-----+ +--------------+ +-------------------+
+  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+  // Can this use the 2 byte VEX prefix?
+  if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+    MCE.emitByte(0xC5);
+    MCE.emitByte(LastByte | (VEX_R << 7));
+    return;
+  }
+
+  // 3 byte VEX prefix
+  MCE.emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4);
+  MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
+  MCE.emitByte(LastByte | (VEX_W << 7));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
+                                           const MCInstrDesc *Desc) {
+  DEBUG(dbgs() << MI);
+
+  // If this is a pseudo instruction, lower it.
+  switch (Desc->getOpcode()) {
+  case X86::ADD16rr_DB:      Desc = UpdateOp(MI, II, X86::OR16rr); break;
+  case X86::ADD32rr_DB:      Desc = UpdateOp(MI, II, X86::OR32rr); break;
+  case X86::ADD64rr_DB:      Desc = UpdateOp(MI, II, X86::OR64rr); break;
+  case X86::ADD16ri_DB:      Desc = UpdateOp(MI, II, X86::OR16ri); break;
+  case X86::ADD32ri_DB:      Desc = UpdateOp(MI, II, X86::OR32ri); break;
+  case X86::ADD64ri32_DB:    Desc = UpdateOp(MI, II, X86::OR64ri32); break;
+  case X86::ADD16ri8_DB:     Desc = UpdateOp(MI, II, X86::OR16ri8); break;
+  case X86::ADD32ri8_DB:     Desc = UpdateOp(MI, II, X86::OR32ri8); break;
+  case X86::ADD64ri8_DB:     Desc = UpdateOp(MI, II, X86::OR64ri8); break;
+  case X86::ACQUIRE_MOV8rm:  Desc = UpdateOp(MI, II, X86::MOV8rm); break;
+  case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
+  case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
+  case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
+  case X86::RELEASE_MOV8mr:  Desc = UpdateOp(MI, II, X86::MOV8mr); break;
+  case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
+  case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
+  case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
+  }
+
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  unsigned Opcode = Desc->Opcode;
+
+  // If this is a two-address instruction, skip one of the register operands.
+  unsigned NumOps = Desc->getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+    ++CurOp;
+  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    // Special case for GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  }
+
+  uint64_t TSFlags = Desc->TSFlags;
+
+  // Encoding type for this instruction.
+  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+                           X86II::EncodingShift;
+
+  // It uses the VEX.VVVV field?
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+  const unsigned MemOp4_I8IMMOperand = 2;
+
+  // Determine where the memory operand starts, if present.
+  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+  if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+  // Emit the lock opcode prefix as needed.
+  if (Desc->TSFlags & X86II::LOCK)
+    MCE.emitByte(0xF0);
+
+  // Emit segment override opcode prefix as needed.
+  emitSegmentOverridePrefix(TSFlags, MemoryOperand, MI);
+
+  // Emit the repeat opcode prefix as needed.
+  if (Desc->TSFlags & X86II::REP)
+    MCE.emitByte(0xF3);
+
+  // Emit the address size opcode prefix as needed.
+  bool need_address_override;
+  if (TSFlags & X86II::AdSize) {
+    need_address_override = true;
+  } else if (MemoryOperand < 0) {
+    need_address_override = false;
+  } else if (Is64BitMode) {
+    assert(!Is16BitMemOperand(MI, MemoryOperand));
+    need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+  } else {
+    assert(!Is64BitMemOperand(MI, MemoryOperand));
+    need_address_override = Is16BitMemOperand(MI, MemoryOperand);
+  }
+
+  if (need_address_override)
+    MCE.emitByte(0x67);
+
+  if (Encoding == 0)
+    emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+  else
+    emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
+  switch (TSFlags & X86II::FormMask) {
+  default:
+    llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
+  case X86II::Pseudo:
+    // Remember the current PC offset, this is the PIC relocation
+    // base address.
+    switch (Opcode) {
+    default:
+      llvm_unreachable("pseudo instructions should be removed before code"
+                       " emission");
+    // Do nothing for Int_MemBarrier - it's just a comment.  Add a debug
+    // to make it slightly easier to see.
+    case X86::Int_MemBarrier:
+      DEBUG(dbgs() << "#MEMBARRIER\n");
+      break;
+
+    case TargetOpcode::INLINEASM:
+      // We allow inline assembler nodes with empty bodies - they can
+      // implicitly define registers, which is ok for JIT.
+      if (MI.getOperand(0).getSymbolName()[0]) {
+        DebugLoc DL = MI.getDebugLoc();
+        DL.print(MI.getParent()->getParent()->getFunction()->getContext(),
+                 llvm::errs());
+        report_fatal_error("JIT does not support inline asm!");
+      }
+      break;
+    case TargetOpcode::DBG_VALUE:
+    case TargetOpcode::CFI_INSTRUCTION:
+      break;
+    case TargetOpcode::GC_LABEL:
+    case TargetOpcode::EH_LABEL:
+      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+      break;
+
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break;
+
+    case X86::SEH_PushReg:
+    case X86::SEH_SaveReg:
+    case X86::SEH_SaveXMM:
+    case X86::SEH_StackAlloc:
+    case X86::SEH_SetFrame:
+    case X86::SEH_PushFrame:
+    case X86::SEH_EndPrologue:
+      break;
+
+    case X86::MOVPC32r: {
+      // This emits the "call" portion of this pseudo instruction.
+      MCE.emitByte(BaseOpcode);
+      emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags));
+      // Remember PIC base.
+      PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+      X86JITInfo *JTI = TM.getJITInfo();
+      JTI->setPICBase(MCE.getCurrentPCValue());
+      break;
+    }
+    }
+    CurOp = NumOps;
+    break;
+  case X86II::RawFrm: {
+    MCE.emitByte(BaseOpcode);
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO = MI.getOperand(CurOp++);
+
+    DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
+    DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n");
+    DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n");
+    DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n");
+    DEBUG(dbgs() << "isImm " << MO.isImm() << "\n");
+
+    if (MO.isMBB()) {
+      emitPCRelativeBlockAddress(MO.getMBB());
+      break;
+    }
+
+    if (MO.isGlobal()) {
+      emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+                        MO.getOffset(), 0);
+      break;
+    }
+
+    if (MO.isSymbol()) {
+      emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+      break;
+    }
+
+    // FIXME: Only used by hackish MCCodeEmitter, remove when dead.
+    if (MO.isJTI()) {
+      emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
+      break;
+    }
+
+    assert(MO.isImm() && "Unknown RawFrm operand!");
+    if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+      // Fix up immediate operand for pc relative calls.
+      intptr_t Imm = (intptr_t)MO.getImm();
+      Imm = Imm - MCE.getCurrentPCValue() - 4;
+      emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags));
+    } else
+      emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+  }
+
+  case X86II::AddRegFrm: {
+    MCE.emitByte(BaseOpcode +
+                 getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO1 = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO1.isImm()) {
+      emitConstant(MO1.getImm(), Size);
+      break;
+    }
+
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV32ri64)
+      rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+    // This should not occur on Darwin for relocatable objects.
+    if (Opcode == X86::MOV64ri)
+      rt = X86::reloc_absolute_dword;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRMDestReg: {
+    MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp+1;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+  case X86II::MRMDestMem: {
+    MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+    emitMemModRMByte(MI, CurOp,
+                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+
+  case X86II::MRMSrcReg: {
+    MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp+1;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+      ++SrcRegNum;
+
+    emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    // 2 operands skipped with HasMemOp4, compensate accordingly
+    CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+  }
+  case X86II::MRMSrcMem: {
+    int AddrOperands = X86::AddrNumOperands;
+    unsigned FirstMemOp = CurOp+1;
+    if (HasVEX_4V) {
+      ++AddrOperands;
+      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+    }
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      ++FirstMemOp;
+
+    MCE.emitByte(BaseOpcode);
+
+    intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+      X86II::getSizeOfImm(Desc->TSFlags) : 0;
+    emitMemModRMByte(MI, FirstMemOp,
+                     getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
+    CurOp += AddrOperands + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+  }
+
+  case X86II::MRMXr:
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    MCE.emitByte(BaseOpcode);
+    uint64_t Form = (Desc->TSFlags & X86II::FormMask);
+    emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+                     (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r);
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO1 = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO1.isImm()) {
+      emitConstant(MO1.getImm(), Size);
+      break;
+    }
+
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64ri32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRMXm:
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ?
+      (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
+          X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    uint64_t Form = (Desc->TSFlags & X86II::FormMask);
+    emitMemModRMByte(MI, CurOp, (Form==X86II::MRMXm) ? 0 : Form - X86II::MRM0m,
+                     PCAdj);
+    CurOp += X86::AddrNumOperands;
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO.isImm()) {
+      emitConstant(MO.getImm(), Size);
+      break;
+    }
+
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64mi32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+      emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+                        Indirect);
+    } else if (MO.isSymbol())
+      emitExternalSymbolAddress(MO.getSymbolName(), rt);
+    else if (MO.isCPI())
+      emitConstPoolAddress(MO.getIndex(), rt);
+    else if (MO.isJTI())
+      emitJumpTableAddress(MO.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+  case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+  case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+  case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+  case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+  case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
+  case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
+  case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
+  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
+  case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
+  case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
+  case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
+  case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
+  case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
+    MCE.emitByte(BaseOpcode);
+
+    unsigned char MRM;
+    switch (TSFlags & X86II::FormMask) {
+    default: llvm_unreachable("Invalid Form");
+    case X86II::MRM_C0: MRM = 0xC0; break;
+    case X86II::MRM_C1: MRM = 0xC1; break;
+    case X86II::MRM_C2: MRM = 0xC2; break;
+    case X86II::MRM_C3: MRM = 0xC3; break;
+    case X86II::MRM_C4: MRM = 0xC4; break;
+    case X86II::MRM_C8: MRM = 0xC8; break;
+    case X86II::MRM_C9: MRM = 0xC9; break;
+    case X86II::MRM_CA: MRM = 0xCA; break;
+    case X86II::MRM_CB: MRM = 0xCB; break;
+    case X86II::MRM_D0: MRM = 0xD0; break;
+    case X86II::MRM_D1: MRM = 0xD1; break;
+    case X86II::MRM_D4: MRM = 0xD4; break;
+    case X86II::MRM_D5: MRM = 0xD5; break;
+    case X86II::MRM_D6: MRM = 0xD6; break;
+    case X86II::MRM_D8: MRM = 0xD8; break;
+    case X86II::MRM_D9: MRM = 0xD9; break;
+    case X86II::MRM_DA: MRM = 0xDA; break;
+    case X86II::MRM_DB: MRM = 0xDB; break;
+    case X86II::MRM_DC: MRM = 0xDC; break;
+    case X86II::MRM_DD: MRM = 0xDD; break;
+    case X86II::MRM_DE: MRM = 0xDE; break;
+    case X86II::MRM_DF: MRM = 0xDF; break;
+    case X86II::MRM_E0: MRM = 0xE0; break;
+    case X86II::MRM_E1: MRM = 0xE1; break;
+    case X86II::MRM_E2: MRM = 0xE2; break;
+    case X86II::MRM_E3: MRM = 0xE3; break;
+    case X86II::MRM_E4: MRM = 0xE4; break;
+    case X86II::MRM_E5: MRM = 0xE5; break;
+    case X86II::MRM_E8: MRM = 0xE8; break;
+    case X86II::MRM_E9: MRM = 0xE9; break;
+    case X86II::MRM_EA: MRM = 0xEA; break;
+    case X86II::MRM_EB: MRM = 0xEB; break;
+    case X86II::MRM_EC: MRM = 0xEC; break;
+    case X86II::MRM_ED: MRM = 0xED; break;
+    case X86II::MRM_EE: MRM = 0xEE; break;
+    case X86II::MRM_F0: MRM = 0xF0; break;
+    case X86II::MRM_F1: MRM = 0xF1; break;
+    case X86II::MRM_F2: MRM = 0xF2; break;
+    case X86II::MRM_F3: MRM = 0xF3; break;
+    case X86II::MRM_F4: MRM = 0xF4; break;
+    case X86II::MRM_F5: MRM = 0xF5; break;
+    case X86II::MRM_F6: MRM = 0xF6; break;
+    case X86II::MRM_F7: MRM = 0xF7; break;
+    case X86II::MRM_F8: MRM = 0xF8; break;
+    case X86II::MRM_F9: MRM = 0xF9; break;
+    case X86II::MRM_FA: MRM = 0xFA; break;
+    case X86II::MRM_FB: MRM = 0xFB; break;
+    case X86II::MRM_FC: MRM = 0xFC; break;
+    case X86II::MRM_FD: MRM = 0xFD; break;
+    case X86II::MRM_FE: MRM = 0xFE; break;
+    case X86II::MRM_FF: MRM = 0xFF; break;
+    }
+    MCE.emitByte(MRM);
+    break;
+  }
+
+  while (CurOp != NumOps && NumOps - CurOp <= 2) {
+    // The last source register of a 4 operand instruction in AVX is encoded
+    // in bits[7:4] of a immediate byte.
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+      const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+                                                         : CurOp);
+      ++CurOp;
+      unsigned RegNum = getX86RegNum(MO.getReg()) << 4;
+      if (X86II::isX86_64ExtendedReg(MO.getReg()))
+        RegNum |= 1 << 7;
+      // If there is an additional 5th operand it must be an immediate, which
+      // is encoded in bits[3:0]
+      if (CurOp != NumOps) {
+        const MachineOperand &MIMM = MI.getOperand(CurOp++);
+        if (MIMM.isImm()) {
+          unsigned Val = MIMM.getImm();
+          assert(Val < 16 && "Immediate operand value out of range");
+          RegNum |= Val;
+        }
+      }
+      emitConstant(RegNum, 1);
+    } else {
+      emitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(Desc->TSFlags));
+    }
+  }
+
+  if (!MI.isVariadic() && CurOp != NumOps) {
+#ifndef NDEBUG
+    dbgs() << "Cannot encode all operands of: " << MI << "\n";
+#endif
+    llvm_unreachable(nullptr);
+  }
+
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..2d494b4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,3304 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86CallingConv.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel final : public FastISel {
+  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget *Subtarget;
+
+  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+  /// floating point ops.
+  /// When SSE is available, use it for f32 operations.
+  /// When SSE2 is available, use it for f64 operations.
+  bool X86ScalarSSEf64;
+  bool X86ScalarSSEf32;
+
+public:
+  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+                       const TargetLibraryInfo *libInfo)
+    : FastISel(funcInfo, libInfo) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+    X86ScalarSSEf64 = Subtarget->hasSSE2();
+    X86ScalarSSEf32 = Subtarget->hasSSE1();
+  }
+
+  bool TargetSelectInstruction(const Instruction *I) override;
+
+  /// \brief The specified machine instr operand is a vreg, and that
+  /// vreg is being provided by the specified load instruction.  If possible,
+  /// try to fold the load as an operand to the instruction, returning true if
+  /// possible.
+  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                           const LoadInst *LI) override;
+
+  bool FastLowerArguments() override;
+  bool FastLowerCall(CallLoweringInfo &CLI) override;
+  bool FastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+#include "X86GenFastISel.inc"
+
+private:
+  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
+
+  bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
+                       unsigned &ResultReg);
+
+  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
+                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
+  bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+                        const X86AddressMode &AM,
+                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
+
+  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                         unsigned &ResultReg);
+
+  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+  bool X86SelectLoad(const Instruction *I);
+
+  bool X86SelectStore(const Instruction *I);
+
+  bool X86SelectRet(const Instruction *I);
+
+  bool X86SelectCmp(const Instruction *I);
+
+  bool X86SelectZExt(const Instruction *I);
+
+  bool X86SelectBranch(const Instruction *I);
+
+  bool X86SelectShift(const Instruction *I);
+
+  bool X86SelectDivRem(const Instruction *I);
+
+  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
+  bool X86SelectSelect(const Instruction *I);
+
+  bool X86SelectTrunc(const Instruction *I);
+
+  bool X86SelectFPExt(const Instruction *I);
+  bool X86SelectFPTrunc(const Instruction *I);
+
+  const X86InstrInfo *getInstrInfo() const {
+    return getTargetMachine()->getInstrInfo();
+  }
+  const X86TargetMachine *getTargetMachine() const {
+    return static_cast<const X86TargetMachine *>(&TM);
+  }
+
+  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
+  unsigned TargetMaterializeConstant(const Constant *C) override;
+
+  unsigned TargetMaterializeAlloca(const AllocaInst *C) override;
+
+  unsigned TargetMaterializeFloatZero(const ConstantFP *CF) override;
+
+  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+  /// computed in an SSE register, not on the X87 floating point stack.
+  bool isScalarFPTypeInSSEReg(EVT VT) const {
+    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+  }
+
+  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+  bool IsMemcpySmall(uint64_t Len);
+
+  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+                          X86AddressMode SrcAM, uint64_t Len);
+
+  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+                            const Value *Cond);
+};
+
+} // end anonymous namespace.
+
+static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) {
+  // If both operands are the same, then try to optimize or fold the cmp.
+  CmpInst::Predicate Predicate = CI->getPredicate();
+  if (CI->getOperand(0) != CI->getOperand(1))
+    return Predicate;
+
+  switch (Predicate) {
+  default: llvm_unreachable("Invalid predicate!");
+  case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_OEQ:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_OGT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_OGE:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_OLT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_OLE:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_ONE:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_ORD:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_UNO:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_UEQ:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::FCMP_UGT:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_UGE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::FCMP_ULT:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_ULE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::FCMP_UNE:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_TRUE:  Predicate = CmpInst::FCMP_TRUE;  break;
+
+  case CmpInst::ICMP_EQ:    Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_NE:    Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_UGT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_UGE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_ULT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_ULE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_SGT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_SGE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_SLT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_SLE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  }
+
+  return Predicate;
+}
+
+static std::pair<X86::CondCode, bool>
+getX86ConditionCode(CmpInst::Predicate Predicate) {
+  X86::CondCode CC = X86::COND_INVALID;
+  bool NeedSwap = false;
+  switch (Predicate) {
+  default: break;
+  // Floating-point Predicates
+  case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break;
+  case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_OGT: CC = X86::COND_A;       break;
+  case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break;
+  case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_ULT: CC = X86::COND_B;       break;
+  case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break;
+  case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break;
+  case CmpInst::FCMP_UNO: CC = X86::COND_P;       break;
+  case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break;
+  case CmpInst::FCMP_OEQ: // fall-through
+  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+  // Integer Predicates
+  case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break;
+  case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break;
+  case CmpInst::ICMP_UGT: CC = X86::COND_A;       break;
+  case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break;
+  case CmpInst::ICMP_ULT: CC = X86::COND_B;       break;
+  case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break;
+  case CmpInst::ICMP_SGT: CC = X86::COND_G;       break;
+  case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break;
+  case CmpInst::ICMP_SLT: CC = X86::COND_L;       break;
+  case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break;
+  }
+
+  return std::make_pair(CC, NeedSwap);
+}
+
+static std::pair<unsigned, bool>
+getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+  unsigned CC;
+  bool NeedSwap = false;
+
+  // SSE Condition code mapping:
+  //  0 - EQ
+  //  1 - LT
+  //  2 - LE
+  //  3 - UNORD
+  //  4 - NEQ
+  //  5 - NLT
+  //  6 - NLE
+  //  7 - ORD
+  switch (Predicate) {
+  default: llvm_unreachable("Unexpected predicate");
+  case CmpInst::FCMP_OEQ: CC = 0;          break;
+  case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_OLT: CC = 1;          break;
+  case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_OLE: CC = 2;          break;
+  case CmpInst::FCMP_UNO: CC = 3;          break;
+  case CmpInst::FCMP_UNE: CC = 4;          break;
+  case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_UGE: CC = 5;          break;
+  case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
+  case CmpInst::FCMP_UGT: CC = 6;          break;
+  case CmpInst::FCMP_ORD: CC = 7;          break;
+  case CmpInst::FCMP_UEQ:
+  case CmpInst::FCMP_ONE: CC = 8;          break;
+  }
+
+  return std::make_pair(CC, NeedSwap);
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+                                       const Value *Cond) {
+  if (!isa<ExtractValueInst>(Cond))
+    return false;
+
+  const auto *EV = cast<ExtractValueInst>(Cond);
+  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+    return false;
+
+  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+  MVT RetVT;
+  const Function *Callee = II->getCalledFunction();
+  Type *RetTy =
+    cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+  if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return false;
+
+  X86::CondCode TmpCC;
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
+  }
+
+  // Check if both instructions are in the same basic block.
+  if (II->getParent() != I->getParent())
+    return false;
+
+  // Make sure nothing is in the way
+  BasicBlock::const_iterator Start = I;
+  BasicBlock::const_iterator End = II;
+  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+    // We only expect extractvalue instructions between the intrinsic and the
+    // instruction to be selected.
+    if (!isa<ExtractValueInst>(Itr))
+      return false;
+
+    // Check that the extractvalue operand comes from the intrinsic.
+    const auto *EVI = cast<ExtractValueInst>(Itr);
+    if (EVI->getAggregateOperand() != II)
+      return false;
+  }
+
+  CC = TmpCC;
+  return true;
+}
+
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+  EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+  if (evt == MVT::Other || !evt.isSimple())
+    // Unhandled type. Halt "fast" selection and bail.
+    return false;
+
+  VT = evt.getSimpleVT();
+  // For now, require SSE/SSE2 for performing floating-point operations,
+  // since x87 requires additional work.
+  if (VT == MVT::f64 && !X86ScalarSSEf64)
+    return false;
+  if (VT == MVT::f32 && !X86ScalarSSEf32)
+    return false;
+  // Similarly, no f80 support yet.
+  if (VT == MVT::f80)
+    return false;
+  // We only handle legal types. For example, on x86-32 the instruction
+  // selector contains all of the 64-bit instructions from x86-64,
+  // under the assumption that i64 won't be used if the target doesn't
+  // support it.
+  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
+                                  MachineMemOperand *MMO, unsigned &ResultReg) {
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = &X86::GR8RegClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = &X86::GR16RegClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = &X86::GR32RegClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = &X86::GR64RegClass;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return false;
+  }
+
+  ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB =
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+  addFullAddress(MIB, AM);
+  if (MMO)
+    MIB->addMemOperand(*FuncInfo.MF, MMO);
+  return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+                                   const X86AddressMode &AM,
+                                   MachineMemOperand *MMO, bool Aligned) {
+  // Get opcode and regclass of the output for the given store instruction.
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f80: // No f80 support yet.
+  default: return false;
+  case MVT::i1: {
+    // Mask out all but lowest bit.
+    unsigned AndResult = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(X86::AND8ri), AndResult)
+      .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
+    ValReg = AndResult;
+  }
+  // FALLTHROUGH, handling i1 as i8.
+  case MVT::i8:  Opc = X86::MOV8mr;  break;
+  case MVT::i16: Opc = X86::MOV16mr; break;
+  case MVT::i32: Opc = X86::MOV32mr; break;
+  case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+  case MVT::f32:
+    Opc = X86ScalarSSEf32 ?
+          (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
+    break;
+  case MVT::f64:
+    Opc = X86ScalarSSEf64 ?
+          (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
+    break;
+  case MVT::v4f32:
+    if (Aligned)
+      Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+    else
+      Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
+    break;
+  case MVT::v2f64:
+    if (Aligned)
+      Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
+    else
+      Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
+    break;
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v8i16:
+  case MVT::v16i8:
+    if (Aligned)
+      Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
+    else
+      Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
+    break;
+  }
+
+  MachineInstrBuilder MIB =
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+  addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+  if (MMO)
+    MIB->addMemOperand(*FuncInfo.MF, MMO);
+
+  return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+                                   const X86AddressMode &AM,
+                                   MachineMemOperand *MMO, bool Aligned) {
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Val))
+    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
+
+  // If this is a store of a simple constant, fold the constant into the store.
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    unsigned Opc = 0;
+    bool Signed = true;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: break;
+    case MVT::i1:  Signed = false;     // FALLTHROUGH to handle as i8.
+    case MVT::i8:  Opc = X86::MOV8mi;  break;
+    case MVT::i16: Opc = X86::MOV16mi; break;
+    case MVT::i32: Opc = X86::MOV32mi; break;
+    case MVT::i64:
+      // Must be a 32-bit sign extended value.
+      if (isInt<32>(CI->getSExtValue()))
+        Opc = X86::MOV64mi32;
+      break;
+    }
+
+    if (Opc) {
+      MachineInstrBuilder MIB =
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+      addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
+                                            : CI->getZExtValue());
+      if (MMO)
+        MIB->addMemOperand(*FuncInfo.MF, MMO);
+      return true;
+    }
+  }
+
+  unsigned ValReg = getRegForValue(Val);
+  if (ValReg == 0)
+    return false;
+
+  bool ValKill = hasTrivialKill(Val);
+  return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+                                    unsigned Src, EVT SrcVT,
+                                    unsigned &ResultReg) {
+  unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+                           Src, /*TODO: Kill=*/false);
+  if (RR == 0)
+    return false;
+
+  ResultReg = RR;
+  return true;
+}
+
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // Can't handle TLS yet.
+    if (GV->isThreadLocal())
+      return false;
+
+    // RIP-relative addresses can't have additional register operands, so if
+    // we've already folded stuff into the addressing mode, just force the
+    // global value into its own register, which we can use as the basereg.
+    if (!Subtarget->isPICStyleRIPRel() ||
+        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+      // Okay, we've committed to selecting this global. Set up the address.
+      AM.GV = GV;
+
+      // Allow the subtarget to classify the global.
+      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+      // If this reference is relative to the pic base, set it now.
+      if (isGlobalRelativeToPICBase(GVFlags)) {
+        // FIXME: How do we know Base.Reg is free??
+        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+      }
+
+      // Unless the ABI requires an extra load, return a direct reference to
+      // the global.
+      if (!isGlobalStubReference(GVFlags)) {
+        if (Subtarget->isPICStyleRIPRel()) {
+          // Use rip-relative addressing if we can.  Above we verified that the
+          // base and index registers are unused.
+          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+          AM.Base.Reg = X86::RIP;
+        }
+        AM.GVOpFlags = GVFlags;
+        return true;
+      }
+
+      // Ok, we need to do a load from a stub.  If we've already loaded from
+      // this stub, reuse the loaded pointer, otherwise emit the load now.
+      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+      unsigned LoadReg;
+      if (I != LocalValueMap.end() && I->second != 0) {
+        LoadReg = I->second;
+      } else {
+        // Issue load from stub.
+        unsigned Opc = 0;
+        const TargetRegisterClass *RC = nullptr;
+        X86AddressMode StubAM;
+        StubAM.Base.Reg = AM.Base.Reg;
+        StubAM.GV = GV;
+        StubAM.GVOpFlags = GVFlags;
+
+        // Prepare for inserting code in the local-value area.
+        SavePoint SaveInsertPt = enterLocalValueArea();
+
+        if (TLI.getPointerTy() == MVT::i64) {
+          Opc = X86::MOV64rm;
+          RC  = &X86::GR64RegClass;
+
+          if (Subtarget->isPICStyleRIPRel())
+            StubAM.Base.Reg = X86::RIP;
+        } else {
+          Opc = X86::MOV32rm;
+          RC  = &X86::GR32RegClass;
+        }
+
+        LoadReg = createResultReg(RC);
+        MachineInstrBuilder LoadMI =
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
+        addFullAddress(LoadMI, StubAM);
+
+        // Ok, back to normal mode.
+        leaveLocalValueArea(SaveInsertPt);
+
+        // Prevent loading GV stub multiple times in same MBB.
+        LocalValueMap[V] = LoadReg;
+      }
+
+      // Now construct the final address. Note that the Disp, Scale,
+      // and Index values may already be set here.
+      AM.Base.Reg = LoadReg;
+      AM.GV = nullptr;
+      return true;
+    }
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+  SmallVector<const Value *, 32> GEPs;
+redo_gep:
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Don't walk into other basic blocks; it's possible we haven't
+    // visited them yet, so the instructions may not yet be assigned
+    // virtual registers.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts.
+    return X86SelectAddress(U->getOperand(0), AM);
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::Alloca: {
+    // Do static allocas.
+    const AllocaInst *A = cast<AllocaInst>(V);
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(A);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = SI->second;
+      return true;
+    }
+    break;
+  }
+
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+      // They have to fit in the 32-bit signed displacement field though.
+      if (isInt<32>(Disp)) {
+        AM.Disp = (uint32_t)Disp;
+        return X86SelectAddress(U->getOperand(0), AM);
+      }
+    }
+    break;
+  }
+
+  case Instruction::GetElementPtr: {
+    X86AddressMode SavedAM = AM;
+
+    // Pattern-match simple GEPs.
+    uint64_t Disp = (int32_t)AM.Disp;
+    unsigned IndexReg = AM.IndexReg;
+    unsigned Scale = AM.Scale;
+    gep_type_iterator GTI = gep_type_begin(U);
+    // Iterate through the indices, folding what we can. Constants can be
+    // folded, and one dynamic index can be handled, if the scale is supported.
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+         i != e; ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+        continue;
+      }
+
+      // A array/variable index is always of the form i*S where S is the
+      // constant scale size.  See if we can push the scale into immediates.
+      uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+      for (;;) {
+        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+          // Constant-offset addressing.
+          Disp += CI->getSExtValue() * S;
+          break;
+        }
+        if (canFoldAddIntoGEP(U, Op)) {
+          // A compatible add with a constant operand. Fold the constant.
+          ConstantInt *CI =
+            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+          Disp += CI->getSExtValue() * S;
+          // Iterate on the other operand.
+          Op = cast<AddOperator>(Op)->getOperand(0);
+          continue;
+        }
+        if (IndexReg == 0 &&
+            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+            (S == 1 || S == 2 || S == 4 || S == 8)) {
+          // Scaled-index addressing.
+          Scale = S;
+          IndexReg = getRegForGEPIndex(Op).first;
+          if (IndexReg == 0)
+            return false;
+          break;
+        }
+        // Unsupported.
+        goto unsupported_gep;
+      }
+    }
+
+    // Check for displacement overflow.
+    if (!isInt<32>(Disp))
+      break;
+
+    AM.IndexReg = IndexReg;
+    AM.Scale = Scale;
+    AM.Disp = (uint32_t)Disp;
+    GEPs.push_back(V);
+
+    if (const GetElementPtrInst *GEP =
+          dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+      // Ok, the GEP indices were covered by constant-offset and scaled-index
+      // addressing. Update the address state and move on to examining the base.
+      V = GEP;
+      goto redo_gep;
+    } else if (X86SelectAddress(U->getOperand(0), AM)) {
+      return true;
+    }
+
+    // If we couldn't merge the gep value into this addr mode, revert back to
+    // our address and just match the value instead of completely failing.
+    AM = SavedAM;
+
+    for (SmallVectorImpl<const Value *>::reverse_iterator
+           I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
+      if (handleConstantAddresses(*I, AM))
+        return true;
+
+    return false;
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
+  }
+  }
+
+  return handleConstantAddresses(V, AM);
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  const Instruction *I = dyn_cast<Instruction>(V);
+  // Record if the value is defined in the same basic block.
+  //
+  // This information is crucial to know whether or not folding an
+  // operand is valid.
+  // Indeed, FastISel generates or reuses a virtual register for all
+  // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are consistently set between across basic block, even
+  // if different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, this is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guarantee for X86.
+  // Moreover, things like hasOneUse could not be used accurately, if we
+  // allow to reference values across basic blocks whereas they are not
+  // alive across basic blocks initially.
+  bool InMBB = true;
+  if (I) {
+    Opcode = I->getOpcode();
+    U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts if its operand is in the same BB.
+    if (InMBB)
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle DLL Import.
+    if (GV->hasDLLImportStorageClass())
+      return false;
+
+    // Can't handle TLS.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+
+    // No ABI requires an extra load for anything other than DLLImport, which
+    // we rejected above. Return a direct reference to the global.
+    if (Subtarget->isPICStyleRIPRel()) {
+      // Use rip-relative addressing if we can.  Above we verified that the
+      // base and index registers are unused.
+      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+      AM.Base.Reg = X86::RIP;
+    } else if (Subtarget->isPICStyleStubPIC()) {
+      AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+    } else if (Subtarget->isPICStyleGOT()) {
+      AM.GVOpFlags = X86II::MO_GOTOFF;
+    }
+
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+  // Atomic stores need special handling.
+  const StoreInst *S = cast<StoreInst>(I);
+
+  if (S->isAtomic())
+    return false;
+
+  const Value *Val = S->getValueOperand();
+  const Value *Ptr = S->getPointerOperand();
+
+  MVT VT;
+  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  unsigned Alignment = S->getAlignment();
+  unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = ABIAlignment;
+  bool Aligned = Alignment >= ABIAlignment;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(Ptr, AM))
+    return false;
+
+  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions.
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+  const X86MachineFunctionInfo *X86MFInfo =
+      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::C &&
+      CC != CallingConv::Fast &&
+      CC != CallingConv::X86_FastCall &&
+      CC != CallingConv::X86_64_SysV)
+    return false;
+
+  if (Subtarget->isCallingConvWin64(CC))
+    return false;
+
+  // Don't handle popping bytes on return for now.
+  if (X86MFInfo->getBytesToPopOnReturn() != 0)
+    return false;
+
+  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. Fastisel doesn't know how to do that.
+  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (F.isVarArg())
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+                   I->getContext());
+    CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    // The calling-convention tables for x87 returns don't tell
+    // the whole story.
+    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    EVT SrcVT = TLI.getValueType(RV->getType());
+    EVT DstVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (SrcVT != DstVT) {
+      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+      if (SrcVT == MVT::i1) {
+        if (Outs[0].Flags.isSExt())
+          return false;
+        SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+        SrcVT = MVT::i8;
+      }
+      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+                                             ISD::SIGN_EXTEND;
+      SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+                          SrcReg, /*TODO: Kill=*/false);
+    }
+
+    // Make the copy.
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+            DstReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into %rax. We also do the same with %eax for Win32.
+  if (F.hasStructRetAttr() &&
+      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
+    unsigned Reg = X86MFInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments()!");
+    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+            RetReg).addReg(Reg);
+    RetRegs.push_back(RetReg);
+  }
+
+  // Now emit the RET.
+  MachineInstrBuilder MIB =
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+  const LoadInst *LI = cast<LoadInst>(I);
+
+  // Atomic loads need special handling.
+  if (LI->isAtomic())
+    return false;
+
+  MVT VT;
+  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  const Value *Ptr = LI->getPointerOperand();
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(Ptr, AM))
+    return false;
+
+  unsigned ResultReg = 0;
+  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+  bool HasAVX = Subtarget->hasAVX();
+  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:       return 0;
+  case MVT::i8:  return X86::CMP8rr;
+  case MVT::i16: return X86::CMP16rr;
+  case MVT::i32: return X86::CMP32rr;
+  case MVT::i64: return X86::CMP64rr;
+  case MVT::f32:
+    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+  case MVT::f64:
+    return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+  }
+}
+
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS
+/// of the comparison, return an opcode that works for the compare (e.g.
+/// CMP32ri) otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Otherwise, we can't fold the immediate into this comparison.
+  default: return 0;
+  case MVT::i8: return X86::CMP8ri;
+  case MVT::i16: return X86::CMP16ri;
+  case MVT::i32: return X86::CMP32ri;
+  case MVT::i64:
+    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+    // field.
+    if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+      return X86::CMP64ri32;
+    return 0;
+  }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
+                                     EVT VT) {
+  unsigned Op0Reg = getRegForValue(Op0);
+  if (Op0Reg == 0) return false;
+
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Op1))
+    Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
+
+  // We have two options: compare with register or immediate.  If the RHS of
+  // the compare is an immediate that we can fold into this compare, use
+  // CMPri, otherwise use CMPrr.
+  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc))
+        .addReg(Op0Reg)
+        .addImm(Op1C->getSExtValue());
+      return true;
+    }
+  }
+
+  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+  if (CompareOpc == 0) return false;
+
+  unsigned Op1Reg = getRegForValue(Op1);
+  if (Op1Reg == 0) return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc))
+    .addReg(Op0Reg)
+    .addReg(Op1Reg);
+
+  return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  MVT VT;
+  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Try to optimize or fold the cmp.
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+  unsigned ResultReg = 0;
+  switch (Predicate) {
+  default: break;
+  case CmpInst::FCMP_FALSE: {
+    ResultReg = createResultReg(&X86::GR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+            ResultReg);
+    ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+                                           X86::sub_8bit);
+    if (!ResultReg)
+      return false;
+    break;
+  }
+  case CmpInst::FCMP_TRUE: {
+    ResultReg = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+            ResultReg).addImm(1);
+    break;
+  }
+  }
+
+  if (ResultReg) {
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+
+  const Value *LHS = CI->getOperand(0);
+  const Value *RHS = CI->getOperand(1);
+
+  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+  // We don't have to materialize a zero constant for this case and can just use
+  // %x again on the RHS.
+  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+    const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+    if (RHSC && RHSC->isNullValue())
+      RHS = LHS;
+  }
+
+  // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+  static unsigned SETFOpcTable[2][3] = {
+    { X86::SETEr,  X86::SETNPr, X86::AND8rr },
+    { X86::SETNEr, X86::SETPr,  X86::OR8rr  }
+  };
+  unsigned *SETFOpc = nullptr;
+  switch (Predicate) {
+  default: break;
+  case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+  case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+  }
+
+  ResultReg = createResultReg(&X86::GR8RegClass);
+  if (SETFOpc) {
+    if (!X86FastEmitCompare(LHS, RHS, VT))
+      return false;
+
+    unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+    unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+            FlagReg1);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+            FlagReg2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+            ResultReg).addReg(FlagReg1).addReg(FlagReg2);
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+
+  X86::CondCode CC;
+  bool SwapArgs;
+  std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+  assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+  unsigned Opc = X86::getSETFromCond(CC);
+
+  if (SwapArgs)
+    std::swap(LHS, RHS);
+
+  // Emit a compare of LHS/RHS.
+  if (!X86FastEmitCompare(LHS, RHS, VT))
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+  EVT DstVT = TLI.getValueType(I->getType());
+  if (!TLI.isTypeLegal(DstVT))
+    return false;
+
+  unsigned ResultReg = getRegForValue(I->getOperand(0));
+  if (ResultReg == 0)
+    return false;
+
+  // Handle zero-extension from i1 to i8, which is common.
+  MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
+  if (SrcVT.SimpleTy == MVT::i1) {
+    // Set the high bits to zero.
+    ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+    SrcVT = MVT::i8;
+
+    if (ResultReg == 0)
+      return false;
+  }
+
+  if (DstVT == MVT::i64) {
+    // Handle extension to 64-bits via sub-register shenanigans.
+    unsigned MovInst;
+
+    switch (SrcVT.SimpleTy) {
+    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
+    case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+    case MVT::i32: MovInst = X86::MOV32rr;     break;
+    default: llvm_unreachable("Unexpected zext to i64 source type");
+    }
+
+    unsigned Result32 = createResultReg(&X86::GR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
+      .addReg(ResultReg);
+
+    ResultReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
+            ResultReg)
+      .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+  } else if (DstVT != MVT::i8) {
+    ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+                           ResultReg, /*Kill=*/true);
+    if (ResultReg == 0)
+      return false;
+  }
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+  // Unconditional branches are selected by tablegen-generated code.
+  // Handle a conditional branch.
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Fold the common case of a conditional branch with a comparison
+  // in the same block (values defined on other blocks may not have
+  // initialized registers).
+  X86::CondCode CC;
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+      EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+
+      // Try to optimize or fold the cmp.
+      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+      switch (Predicate) {
+      default: break;
+      case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true;
+      case CmpInst::FCMP_TRUE:  FastEmitBranch(TrueMBB, DbgLoc); return true;
+      }
+
+      const Value *CmpLHS = CI->getOperand(0);
+      const Value *CmpRHS = CI->getOperand(1);
+
+      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+      // 0.0.
+      // We don't have to materialize a zero constant for this case and can just
+      // use %x again on the RHS.
+      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+        const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+        if (CmpRHSC && CmpRHSC->isNullValue())
+          CmpRHS = CmpLHS;
+      }
+
+      // Try to take advantage of fallthrough opportunities.
+      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+        std::swap(TrueMBB, FalseMBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+      // code check. Instead two branch instructions are required to check all
+      // the flags. First we change the predicate to a supported condition code,
+      // which will be the first branch. Later one we will emit the second
+      // branch.
+      bool NeedExtraBranch = false;
+      switch (Predicate) {
+      default: break;
+      case CmpInst::FCMP_OEQ:
+        std::swap(TrueMBB, FalseMBB); // fall-through
+      case CmpInst::FCMP_UNE:
+        NeedExtraBranch = true;
+        Predicate = CmpInst::FCMP_ONE;
+        break;
+      }
+
+      bool SwapArgs;
+      unsigned BranchOpc;
+      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+      BranchOpc = X86::GetCondBranchFromCond(CC);
+      if (SwapArgs)
+        std::swap(CmpLHS, CmpRHS);
+
+      // Emit a compare of the LHS and RHS, setting the flags.
+      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
+        return false;
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+        .addMBB(TrueMBB);
+
+      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+      // to UNE above).
+      if (NeedExtraBranch) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4))
+          .addMBB(TrueMBB);
+      }
+
+      // Obtain the branch weight and add the TrueBB to the successor list.
+      uint32_t BranchWeight = 0;
+      if (FuncInfo.BPI)
+        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                   TrueMBB->getBasicBlock());
+      FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+
+      // Emits an unconditional branch to the FalseBB, obtains the branch
+      // weight, and adds it to the successor list.
+      FastEmitBranch(FalseMBB, DbgLoc);
+
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+    // typically happen for _Bool and C++ bools.
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+      unsigned TestOpc = 0;
+      switch (SourceVT.SimpleTy) {
+      default: break;
+      case MVT::i8:  TestOpc = X86::TEST8ri; break;
+      case MVT::i16: TestOpc = X86::TEST16ri; break;
+      case MVT::i32: TestOpc = X86::TEST32ri; break;
+      case MVT::i64: TestOpc = X86::TEST64ri32; break;
+      }
+      if (TestOpc) {
+        unsigned OpReg = getRegForValue(TI->getOperand(0));
+        if (OpReg == 0) return false;
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
+          .addReg(OpReg).addImm(1);
+
+        unsigned JmpOpc = X86::JNE_4;
+        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+          std::swap(TrueMBB, FalseMBB);
+          JmpOpc = X86::JE_4;
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
+          .addMBB(TrueMBB);
+        FastEmitBranch(FalseMBB, DbgLoc);
+        uint32_t BranchWeight = 0;
+        if (FuncInfo.BPI)
+          BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                     TrueMBB->getBasicBlock());
+        FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+        return true;
+      }
+    }
+  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+    // Fake request the condition, otherwise the intrinsic might be completely
+    // optimized away.
+    unsigned TmpReg = getRegForValue(BI->getCondition());
+    if (TmpReg == 0)
+      return false;
+
+    unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+      .addMBB(TrueMBB);
+    FastEmitBranch(FalseMBB, DbgLoc);
+    uint32_t BranchWeight = 0;
+    if (FuncInfo.BPI)
+      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                 TrueMBB->getBasicBlock());
+    FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+    return true;
+  }
+
+  // Otherwise do a clumsy setcc and re-test it.
+  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+  // in an explicit cast, so make sure to handle that correctly.
+  unsigned OpReg = getRegForValue(BI->getCondition());
+  if (OpReg == 0) return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+    .addReg(OpReg).addImm(1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
+    .addMBB(TrueMBB);
+  FastEmitBranch(FalseMBB, DbgLoc);
+  uint32_t BranchWeight = 0;
+  if (FuncInfo.BPI)
+    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                               TrueMBB->getBasicBlock());
+  FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+  return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+  unsigned CReg = 0, OpReg = 0;
+  const TargetRegisterClass *RC = nullptr;
+  if (I->getType()->isIntegerTy(8)) {
+    CReg = X86::CL;
+    RC = &X86::GR8RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(16)) {
+    CReg = X86::CX;
+    RC = &X86::GR16RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(32)) {
+    CReg = X86::ECX;
+    RC = &X86::GR32RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(64)) {
+    CReg = X86::RCX;
+    RC = &X86::GR64RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
+    default: return false;
+    }
+  } else {
+    return false;
+  }
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0) return false;
+
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+          CReg).addReg(Op1Reg);
+
+  // The shift instruction uses X86::CL. If we defined a super-register
+  // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+  if (CReg != X86::CL)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::KILL), X86::CL)
+      .addReg(CReg, RegState::Kill);
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
+    .addReg(Op0Reg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectDivRem(const Instruction *I) {
+  const static unsigned NumTypes = 4; // i8, i16, i32, i64
+  const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
+  const static bool S = true;  // IsSigned
+  const static bool U = false; // !IsSigned
+  const static unsigned Copy = TargetOpcode::COPY;
+  // For the X86 DIV/IDIV instruction, in most cases the dividend
+  // (numerator) must be in a specific register pair highreg:lowreg,
+  // producing the quotient in lowreg and the remainder in highreg.
+  // For most data types, to set up the instruction, the dividend is
+  // copied into lowreg, and lowreg is sign-extended or zero-extended
+  // into highreg.  The exception is i8, where the dividend is defined
+  // as a single register rather than a register pair, and we
+  // therefore directly sign-extend or zero-extend the dividend into
+  // lowreg, instead of copying, and ignore the highreg.
+  const static struct DivRemEntry {
+    // The following portion depends only on the data type.
+    const TargetRegisterClass *RC;
+    unsigned LowInReg;  // low part of the register pair
+    unsigned HighInReg; // high part of the register pair
+    // The following portion depends on both the data type and the operation.
+    struct DivRemResult {
+    unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
+    unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
+                              // highreg, or copying a zero into highreg.
+    unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
+                              // zero/sign-extending into lowreg for i8.
+    unsigned DivRemResultReg; // Register containing the desired result.
+    bool IsOpSigned;          // Whether to use signed or unsigned form.
+    } ResultTable[NumOps];
+  } OpTable[NumTypes] = {
+    { &X86::GR8RegClass,  X86::AX,  0, {
+        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
+        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
+        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
+        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
+      }
+    }, // i8
+    { &X86::GR16RegClass, X86::AX,  X86::DX, {
+        { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
+        { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
+        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
+        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
+      }
+    }, // i16
+    { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
+        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
+        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
+        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
+      }
+    }, // i32
+    { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+        { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
+        { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
+      }
+    }, // i64
+  };
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  unsigned TypeIndex, OpIndex;
+  switch (VT.SimpleTy) {
+  default: return false;
+  case MVT::i8:  TypeIndex = 0; break;
+  case MVT::i16: TypeIndex = 1; break;
+  case MVT::i32: TypeIndex = 2; break;
+  case MVT::i64: TypeIndex = 3;
+    if (!Subtarget->is64Bit())
+      return false;
+    break;
+  }
+
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unexpected div/rem opcode");
+  case Instruction::SDiv: OpIndex = 0; break;
+  case Instruction::SRem: OpIndex = 1; break;
+  case Instruction::UDiv: OpIndex = 2; break;
+  case Instruction::URem: OpIndex = 3; break;
+  }
+
+  const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0)
+    return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0)
+    return false;
+
+  // Move op0 into low-order input register.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+  // Zero-extend or sign-extend into high-order input register.
+  if (OpEntry.OpSignExtend) {
+    if (OpEntry.IsOpSigned)
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(OpEntry.OpSignExtend));
+    else {
+      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::MOV32r0), Zero32);
+
+      // Copy the zero into the appropriate sub/super/identical physical
+      // register. Unfortunately the operations needed are not uniform enough to
+      // fit neatly into the table above.
+      if (VT.SimpleTy == MVT::i16) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(Copy), TypeEntry.HighInReg)
+          .addReg(Zero32, 0, X86::sub_16bit);
+      } else if (VT.SimpleTy == MVT::i32) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(Copy), TypeEntry.HighInReg)
+            .addReg(Zero32);
+      } else if (VT.SimpleTy == MVT::i64) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+      }
+    }
+  }
+  // Generate the DIV/IDIV instruction.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+  // For i8 remainder, we can't reference AH directly, as we'll end
+  // up with bogus copies like %R9B = COPY %AH. Reference AX
+  // instead to prevent AH references in a REX instruction.
+  //
+  // The current assumption of the fast register allocator is that isel
+  // won't generate explicit references to the GPR8_NOREX registers. If
+  // the allocator and/or the backend get enhanced to be more robust in
+  // that regard, this can be, and should be, removed.
+  unsigned ResultReg = 0;
+  if ((I->getOpcode() == Instruction::SRem ||
+       I->getOpcode() == Instruction::URem) &&
+      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+    // Shift AX right by 8 bits instead of using AH.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
+            ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+    // Now reference the 8-bit subreg of the result.
+    ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+                                           /*Kill=*/true, X86::sub_8bit);
+  }
+  // Copy the result out of the physreg if we haven't already.
+  if (!ResultReg) {
+    ResultReg = createResultReg(TypeEntry.RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
+        .addReg(OpEntry.DivRemResultReg);
+  }
+  UpdateValueMap(I, ResultReg);
+
+  return true;
+}
+
+/// \brief Emit a conditional move instruction (if the are supported) to lower
+/// the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+  // Check if the subtarget supports these instructions.
+  if (!Subtarget->hasCMov())
+    return false;
+
+  // FIXME: Add support for i8.
+  if (RetVT < MVT::i16 || RetVT > MVT::i64)
+    return false;
+
+  const Value *Cond = I->getOperand(0);
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  bool NeedTest = true;
+  X86::CondCode CC = X86::COND_NE;
+
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<CmpInst>(Cond);
+  if (CI && (CI->getParent() == I->getParent())) {
+    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+    // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+    static unsigned SETFOpcTable[2][3] = {
+      { X86::SETNPr, X86::SETEr , X86::TEST8rr },
+      { X86::SETPr,  X86::SETNEr, X86::OR8rr   }
+    };
+    unsigned *SETFOpc = nullptr;
+    switch (Predicate) {
+    default: break;
+    case CmpInst::FCMP_OEQ:
+      SETFOpc = &SETFOpcTable[0][0];
+      Predicate = CmpInst::ICMP_NE;
+      break;
+    case CmpInst::FCMP_UNE:
+      SETFOpc = &SETFOpcTable[1][0];
+      Predicate = CmpInst::ICMP_NE;
+      break;
+    }
+
+    bool NeedSwap;
+    std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+    assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+    const Value *CmpLHS = CI->getOperand(0);
+    const Value *CmpRHS = CI->getOperand(1);
+    if (NeedSwap)
+      std::swap(CmpLHS, CmpRHS);
+
+    EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+    // Emit a compare of the LHS and RHS, setting the flags.
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+     return false;
+
+    if (SETFOpc) {
+      unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+      unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+              FlagReg1);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+              FlagReg2);
+      auto const &II = TII.get(SETFOpc[2]);
+      if (II.getNumDefs()) {
+        unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
+          .addReg(FlagReg2).addReg(FlagReg1);
+      } else {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+          .addReg(FlagReg2).addReg(FlagReg1);
+      }
+    }
+    NeedTest = false;
+  } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
+    // Fake request the condition, otherwise the intrinsic might be completely
+    // optimized away.
+    unsigned TmpReg = getRegForValue(Cond);
+    if (TmpReg == 0)
+      return false;
+
+    NeedTest = false;
+  }
+
+  if (NeedTest) {
+    // Selects operate on i1, however, CondReg is 8 bits width and may contain
+    // garbage. Indeed, only the less significant bit is supposed to be
+    // accurate. If we read more than the lsb, we may see non-zero values
+    // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
+    // the select. This is achieved by performing TEST against 1.
+    unsigned CondReg = getRegForValue(Cond);
+    if (CondReg == 0)
+      return false;
+    bool CondIsKill = hasTrivialKill(Cond);
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+      .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  if (!LHSReg || !RHSReg)
+    return false;
+
+  unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
+  unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
+                                       LHSReg, LHSIsKill);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Emit SSE instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available.
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+  if (!CI || (CI->getParent() != I->getParent()))
+    return false;
+
+  if (I->getType() != CI->getOperand(0)->getType() ||
+      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+        (Subtarget->hasSSE2() && RetVT == MVT::f64)    ))
+    return false;
+
+  const Value *CmpLHS = CI->getOperand(0);
+  const Value *CmpRHS = CI->getOperand(1);
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+  // We don't have to materialize a zero constant for this case and can just use
+  // %x again on the RHS.
+  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+    if (CmpRHSC && CmpRHSC->isNullValue())
+      CmpRHS = CmpLHS;
+  }
+
+  unsigned CC;
+  bool NeedSwap;
+  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
+  if (CC > 7)
+    return false;
+
+  if (NeedSwap)
+    std::swap(CmpLHS, CmpRHS);
+
+  static unsigned OpcTable[2][2][4] = {
+    { { X86::CMPSSrr,  X86::FsANDPSrr,  X86::FsANDNPSrr,  X86::FsORPSrr  },
+      { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr }  },
+    { { X86::CMPSDrr,  X86::FsANDPDrr,  X86::FsANDNPDrr,  X86::FsORPDrr  },
+      { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr }  }
+  };
+
+  bool HasAVX = Subtarget->hasAVX();
+  unsigned *Opc = nullptr;
+  switch (RetVT.SimpleTy) {
+  default: return false;
+  case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
+  case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  unsigned CmpLHSReg = getRegForValue(CmpLHS);
+  bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+  unsigned CmpRHSReg = getRegForValue(CmpRHS);
+  bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+  if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS)
+    return false;
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+                                     CmpRHSReg, CmpRHSIsKill, CC);
+  unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+                                    LHSReg, LHSIsKill);
+  unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+                                     RHSReg, RHSIsKill);
+  unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+                                       AndReg, /*IsKill=*/true);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+  // These are pseudo CMOV instructions and will be later expanded into control-
+  // flow.
+  unsigned Opc;
+  switch (RetVT.SimpleTy) {
+  default: return false;
+  case MVT::i8:  Opc = X86::CMOV_GR8;  break;
+  case MVT::i16: Opc = X86::CMOV_GR16; break;
+  case MVT::i32: Opc = X86::CMOV_GR32; break;
+  case MVT::f32: Opc = X86::CMOV_FR32; break;
+  case MVT::f64: Opc = X86::CMOV_FR64; break;
+  }
+
+  const Value *Cond = I->getOperand(0);
+  X86::CondCode CC = X86::COND_NE;
+
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<CmpInst>(Cond);
+  if (CI && (CI->getParent() == I->getParent())) {
+    bool NeedSwap;
+    std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+    if (CC > X86::LAST_VALID_COND)
+      return false;
+
+    const Value *CmpLHS = CI->getOperand(0);
+    const Value *CmpRHS = CI->getOperand(1);
+
+    if (NeedSwap)
+      std::swap(CmpLHS, CmpRHS);
+
+    EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+      return false;
+  } else {
+    unsigned CondReg = getRegForValue(Cond);
+    if (CondReg == 0)
+      return false;
+    bool CondIsKill = hasTrivialKill(Cond);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+      .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  if (!LHSReg || !RHSReg)
+    return false;
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+  unsigned ResultReg =
+    FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeLegal(I->getType(), RetVT))
+    return false;
+
+  // Check if we can fold the select.
+  if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
+    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+    const Value *Opnd = nullptr;
+    switch (Predicate) {
+    default:                              break;
+    case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
+    case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
+    }
+    // No need for a select anymore - this is an unconditional move.
+    if (Opnd) {
+      unsigned OpReg = getRegForValue(Opnd);
+      if (OpReg == 0)
+        return false;
+      bool OpIsKill = hasTrivialKill(Opnd);
+      const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(OpReg, getKillRegState(OpIsKill));
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  // First try to use real conditional move instructions.
+  if (X86FastEmitCMoveSelect(RetVT, I))
+    return true;
+
+  // Try to use a sequence of SSE instructions to simulate a conditional move.
+  if (X86FastEmitSSESelect(RetVT, I))
+    return true;
+
+  // Fall-back to pseudo conditional move instructions, which will be later
+  // converted to control-flow.
+  if (X86FastEmitPseudoSelect(RetVT, I))
+    return true;
+
+  return false;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+  // fpext from float to double.
+  if (X86ScalarSSEf64 &&
+      I->getType()->isDoubleTy()) {
+    const Value *V = I->getOperand(0);
+    if (V->getType()->isFloatTy()) {
+      unsigned OpReg = getRegForValue(V);
+      if (OpReg == 0) return false;
+      unsigned ResultReg = createResultReg(&X86::FR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::CVTSS2SDrr), ResultReg)
+        .addReg(OpReg);
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+  if (X86ScalarSSEf64) {
+    if (I->getType()->isFloatTy()) {
+      const Value *V = I->getOperand(0);
+      if (V->getType()->isDoubleTy()) {
+        unsigned OpReg = getRegForValue(V);
+        if (OpReg == 0) return false;
+        unsigned ResultReg = createResultReg(&X86::FR32RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(X86::CVTSD2SSrr), ResultReg)
+          .addReg(OpReg);
+        UpdateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(I->getType());
+
+  // This code only handles truncation to byte.
+  if (DstVT != MVT::i8 && DstVT != MVT::i1)
+    return false;
+  if (!TLI.isTypeLegal(SrcVT))
+    return false;
+
+  unsigned InputReg = getRegForValue(I->getOperand(0));
+  if (!InputReg)
+    // Unhandled operand.  Halt "fast" selection and bail.
+    return false;
+
+  if (SrcVT == MVT::i8) {
+    // Truncate from i8 to i1; no code needed.
+    UpdateValueMap(I, InputReg);
+    return true;
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // If we're on x86-32; we can't extract an i8 from a general register.
+    // First issue a copy to GR16_ABCD or GR32_ABCD.
+    const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ?
+      (const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
+      (const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
+    unsigned CopyReg = createResultReg(CopyRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+            CopyReg).addReg(InputReg);
+    InputReg = CopyReg;
+  }
+
+  // Issue an extract_subreg.
+  unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+                                                  InputReg, /*Kill=*/true,
+                                                  X86::sub_8bit);
+  if (!ResultReg)
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+  return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
+
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+                                     X86AddressMode SrcAM, uint64_t Len) {
+
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!IsMemcpySmall(Len))
+    return false;
+
+  bool i64Legal = Subtarget->is64Bit();
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    MVT VT;
+    if (Len >= 8 && i64Legal)
+      VT = MVT::i64;
+    else if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else {
+      VT = MVT::i8;
+    }
+
+    unsigned Reg;
+    bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+    RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
+    assert(RV && "Failed to emit load or store??");
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    DestAM.Disp += Size;
+    SrcAM.Disp += Size;
+  }
+
+  return true;
+}
+
+static bool isCommutativeIntrinsic(IntrinsicInst const *II) {
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) {
+  // FIXME: Handle more intrinsics.
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    unsigned Opc;
+    const TargetRegisterClass *RC = nullptr;
+
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Invalid result type for frameaddress.");
+    case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+    case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+    }
+
+    // This needs to be set before we call getFrameRegister, otherwise we get
+    // the wrong frame register.
+    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+    MFI->setFrameAddressIsTaken(true);
+
+    const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+    unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+            (FrameReg == X86::EBP && VT == MVT::i32)) &&
+           "Invalid Frame Register!");
+
+    // Always make a copy of the frame register to to a vreg first, so that we
+    // never directly reference the frame register (the TwoAddressInstruction-
+    // Pass doesn't like that).
+    unsigned SrcReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+    // Now recursively load from the frame address.
+    // movq (%rbp), %rax
+    // movq (%rax), %rax
+    // movq (%rax), %rax
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = createResultReg(RC);
+      addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(Opc), DestReg), SrcReg);
+      SrcReg = DestReg;
+    }
+
+    UpdateValueMap(II, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy: {
+    const MemCpyInst *MCI = cast<MemCpyInst>(II);
+    // Don't handle volatile or variable length memcpys.
+    if (MCI->isVolatile())
+      return false;
+
+    if (isa<ConstantInt>(MCI->getLength())) {
+      // Small memcpy's are common enough that we want to do them
+      // without a call if possible.
+      uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+      if (IsMemcpySmall(Len)) {
+        X86AddressMode DestAM, SrcAM;
+        if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
+            !X86SelectAddress(MCI->getRawSource(), SrcAM))
+          return false;
+        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+        return true;
+      }
+    }
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
+      return false;
+
+    return LowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst *MSI = cast<MemSetInst>(II);
+
+    if (MSI->isVolatile())
+      return false;
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MSI->getDestAddressSpace() > 255)
+      return false;
+
+    return LowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::stackprotector: {
+    // Emit code to store the stack guard onto the stack.
+    EVT PtrTy = TLI.getPointerTy();
+
+    const Value *Op1 = II->getArgOperand(0); // The guard's value.
+    const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
+
+    MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
+
+    // Grab the frame index.
+    X86AddressMode AM;
+    if (!X86SelectAddress(Slot, AM)) return false;
+    if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+    return true;
+  }
+  case Intrinsic::dbg_declare: {
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+    X86AddressMode AM;
+    assert(DI->getAddress() && "Null address should be checked earlier!");
+    if (!X86SelectAddress(DI->getAddress(), AM))
+      return false;
+    const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+    // FIXME may need to add RegState::Debug to any registers produced,
+    // although ESP/EBP should be the only ones at the moment.
+    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM).
+      addImm(0).addMetadata(DI->getVariable());
+    return true;
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
+    return true;
+  }
+  case Intrinsic::sqrt: {
+    if (!Subtarget->hasSSE1())
+      return false;
+
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT
+    // is not generated by FastISel yet.
+    // FIXME: Update this code once tablegen can handle it.
+    static const unsigned SqrtOpc[2][2] = {
+      {X86::SQRTSSr, X86::VSQRTSSr},
+      {X86::SQRTSDr, X86::VSQRTSDr}
+    };
+    bool HasAVX = Subtarget->hasAVX();
+    unsigned Opc;
+    const TargetRegisterClass *RC;
+    switch (VT.SimpleTy) {
+    default: return false;
+    case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
+    case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+    }
+
+    const Value *SrcVal = II->getArgOperand(0);
+    unsigned SrcReg = getRegForValue(SrcVal);
+
+    if (SrcReg == 0)
+      return false;
+
+    unsigned ImplicitDefReg = 0;
+    if (HasAVX) {
+      ImplicitDefReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+    }
+
+    unsigned ResultReg = createResultReg(RC);
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                  ResultReg);
+
+    if (ImplicitDefReg)
+      MIB.addReg(ImplicitDefReg);
+
+    MIB.addReg(SrcReg);
+
+    UpdateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics
+    // into add/sub/mul followed by either seto or setb.
+    const Function *Callee = II->getCalledFunction();
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+    Type *CondTy = Ty->getTypeAtIndex(1);
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    if (VT < MVT::i8 || VT > MVT::i64)
+      return false;
+
+    const Value *LHS = II->getArgOperand(0);
+    const Value *RHS = II->getArgOperand(1);
+
+    // Canonicalize immediate to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(II))
+      std::swap(LHS, RHS);
+
+    unsigned BaseOpc, CondOpc;
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+    case Intrinsic::uadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+    case Intrinsic::ssub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+    case Intrinsic::usub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+    case Intrinsic::smul_with_overflow:
+      BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+    case Intrinsic::umul_with_overflow:
+      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+    }
+
+    unsigned LHSReg = getRegForValue(LHS);
+    if (LHSReg == 0)
+      return false;
+    bool LHSIsKill = hasTrivialKill(LHS);
+
+    unsigned ResultReg = 0;
+    // Check if we have an immediate version.
+    if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
+      ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+                              C->getZExtValue());
+    }
+
+    unsigned RHSReg;
+    bool RHSIsKill;
+    if (!ResultReg) {
+      RHSReg = getRegForValue(RHS);
+      if (RHSReg == 0)
+        return false;
+      RHSIsKill = hasTrivialKill(RHS);
+      ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+                              RHSIsKill);
+    }
+
+    // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
+    // it manually.
+    if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+      static const unsigned MULOpc[] =
+        { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+      static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+      // First copy the first operand into RAX, which is an implicit input to
+      // the X86::MUL*r instruction.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+        .addReg(LHSReg, getKillRegState(LHSIsKill));
+      ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+    } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+      static const unsigned MULOpc[] =
+        { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+      if (VT == MVT::i8) {
+        // Copy the first operand into AL, which is an implicit input to the
+        // X86::IMUL8r instruction.
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+               TII.get(TargetOpcode::COPY), X86::AL)
+          .addReg(LHSReg, getKillRegState(LHSIsKill));
+        ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+                                   RHSIsKill);
+      } else
+        ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+                                    TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+                                    RHSReg, RHSIsKill);
+    }
+
+    if (!ResultReg)
+      return false;
+
+    unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+    assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+            ResultReg2);
+
+    UpdateValueMap(II, ResultReg, 2);
+    return true;
+  }
+  case Intrinsic::x86_sse_cvttss2si:
+  case Intrinsic::x86_sse_cvttss2si64:
+  case Intrinsic::x86_sse2_cvttsd2si:
+  case Intrinsic::x86_sse2_cvttsd2si64: {
+    bool IsInputDouble;
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic.");
+    case Intrinsic::x86_sse_cvttss2si:
+    case Intrinsic::x86_sse_cvttss2si64:
+      if (!Subtarget->hasSSE1())
+        return false;
+      IsInputDouble = false;
+      break;
+    case Intrinsic::x86_sse2_cvttsd2si:
+    case Intrinsic::x86_sse2_cvttsd2si64:
+      if (!Subtarget->hasSSE2())
+        return false;
+      IsInputDouble = true;
+      break;
+    }
+
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    static const unsigned CvtOpc[2][2][2] = {
+      { { X86::CVTTSS2SIrr,   X86::VCVTTSS2SIrr   },
+        { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr }  },
+      { { X86::CVTTSD2SIrr,   X86::VCVTTSD2SIrr   },
+        { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr }  }
+    };
+    bool HasAVX = Subtarget->hasAVX();
+    unsigned Opc;
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected result type.");
+    case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
+    case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+    }
+
+    // Check if we can fold insertelement instructions into the convert.
+    const Value *Op = II->getArgOperand(0);
+    while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+      const Value *Index = IE->getOperand(2);
+      if (!isa<ConstantInt>(Index))
+        break;
+      unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+      if (Idx == 0) {
+        Op = IE->getOperand(1);
+        break;
+      }
+      Op = IE->getOperand(0);
+    }
+
+    unsigned Reg = getRegForValue(Op);
+    if (Reg == 0)
+      return false;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(Reg);
+
+    UpdateValueMap(II, ResultReg);
+    return true;
+  }
+  }
+}
+
+bool X86FastISel::FastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C)
+    return false;
+
+  if (Subtarget->isCallingConvWin64(CC))
+    return false;
+
+  if (!Subtarget->is64Bit())
+    return false;
+  
+  // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
+  unsigned GPRCnt = 0;
+  unsigned FPRCnt = 0;
+  unsigned Idx = 0;
+  for (auto const &Arg : F->args()) {
+    // The first argument is at index 1.
+    ++Idx;
+    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    default: return false;
+    case MVT::i32:
+    case MVT::i64:
+      ++GPRCnt;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (!Subtarget->hasSSE1())
+        return false;
+      ++FPRCnt;
+      break;
+    }
+
+    if (GPRCnt > 6)
+      return false;
+
+    if (FPRCnt > 8)
+      return false;
+  }
+
+  static const MCPhysReg GPR32ArgRegs[] = {
+    X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+  };
+  static const MCPhysReg GPR64ArgRegs[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+  };
+  static const MCPhysReg XMMArgRegs[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+
+  unsigned GPRIdx = 0;
+  unsigned FPRIdx = 0;
+  for (auto const &Arg : F->args()) {
+    MVT VT = TLI.getSimpleValueType(Arg.getType());
+    const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+    unsigned SrcReg;
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type.");
+    case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+    case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+    case MVT::f32: // fall-through
+    case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+    }
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(DstReg, getKillRegState(true));
+    UpdateValueMap(&Arg, ResultReg);
+  }
+  return true;
+}
+
+static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
+                                           CallingConv::ID CC,
+                                           ImmutableCallSite *CS) {
+  if (Subtarget->is64Bit())
+    return 0;
+  if (Subtarget->getTargetTriple().isOSMSVCRT())
+    return 0;
+  if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+      CC == CallingConv::HiPE)
+    return 0;
+  if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
+    return 0;
+  if (CS && CS->paramHasAttr(1, Attribute::InReg))
+    return 0;
+  return 4;
+}
+
+bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) {
+  auto &OutVals       = CLI.OutVals;
+  auto &OutFlags      = CLI.OutFlags;
+  auto &OutRegs       = CLI.OutRegs;
+  auto &Ins           = CLI.Ins;
+  auto &InRegs        = CLI.InRegs;
+  CallingConv::ID CC  = CLI.CallConv;
+  bool &IsTailCall    = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  const char *SymName = CLI.SymName;
+
+  bool Is64Bit        = Subtarget->is64Bit();
+  bool IsWin64        = Subtarget->isCallingConvWin64(CC);
+
+  // Handle only C, fastcc, and webkit_js calling conventions for now.
+  switch (CC) {
+  default: return false;
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::WebKit_JS:
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_64_Win64:
+  case CallingConv::X86_64_SysV:
+    break;
+  }
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. Fastisel doesn't know how to do that.
+  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Don't know how to handle Win64 varargs yet.  Nothing special needed for
+  // x86-32. Special handling for x86-64 is implemented.
+  if (IsVarArg && IsWin64)
+    return false;
+
+  // Don't know about inalloca yet.
+  if (CLI.CS && CLI.CS->hasInAllocaArgument())
+    return false;
+
+  // Fast-isel doesn't know about callee-pop yet.
+  if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
+                       TM.Options.GuaranteedTailCallOpt))
+    return false;
+
+  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
+  // instruction. This is safe because it is common to all FastISel supported
+  // calling conventions on x86.
+  for (int i = 0, e = OutVals.size(); i != e; ++i) {
+    Value *&Val = OutVals[i];
+    ISD::ArgFlagsTy Flags = OutFlags[i];
+    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+      if (CI->getBitWidth() < 32) {
+        if (Flags.isSExt())
+          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
+        else
+          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
+      }
+    }
+
+    // Passing bools around ends up doing a trunc to i1 and passing it.
+    // Codegen this as an argument + "and 1".
+    if (auto *TI = dyn_cast<TruncInst>(Val)) {
+      if (TI->getType()->isIntegerTy(1) && CLI.CS &&
+          (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
+          TI->hasOneUse()) {
+        Val = cast<TruncInst>(Val)->getOperand(0);
+        unsigned ResultReg = getRegForValue(Val);
+
+        if (!ResultReg)
+          return false;
+
+        MVT ArgVT;
+        if (!isTypeLegal(Val->getType(), ArgVT))
+          return false;
+
+        ResultReg =
+          FastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1);
+
+        if (!ResultReg)
+          return false;
+        UpdateValueMap(Val, ResultReg);
+      }
+    }
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs,
+                 CLI.RetTy->getContext());
+
+  // Allocate shadow area for Win64
+  if (IsWin64)
+    CCInfo.AllocateStack(32, 8);
+
+  SmallVector<MVT, 16> OutVTs;
+  for (auto *Val : OutVals) {
+    MVT VT;
+    if (!isTypeLegal(Val->getType(), VT))
+      return false;
+    OutVTs.push_back(VT);
+  }
+  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+    .addImm(NumBytes);
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo *>(TM.getRegisterInfo());
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign const &VA = ArgLocs[i];
+    const Value *ArgVal = OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    if (ArgVT == MVT::x86mmx)
+      return false;
+
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
+      assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::ZExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
+      assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::AExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
+      if (!Emitted)
+        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
+      if (!Emitted)
+        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
+
+      assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::BCvt: {
+      ArgReg = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+                          /*TODO: Kill=*/false);
+      assert(ArgReg && "Failed to emit a bitcast!");
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::VExt:
+      // VExt has not been implemented, so this should be impossible to reach
+      // for now.  However, fallback to Selection DAG isel once implemented.
+      return false;
+    case CCValAssign::FPExt:
+      llvm_unreachable("Unexpected loc info!");
+    case CCValAssign::Indirect:
+      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+      // support this.
+      return false;
+    }
+
+    if (VA.isRegLoc()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      OutRegs.push_back(VA.getLocReg());
+    } else {
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      X86AddressMode AM;
+      AM.Base.Reg = RegInfo->getStackRegister();
+      AM.Disp = LocMemOffset;
+      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
+        ArgVT.getStoreSize(), Alignment);
+      if (Flags.isByVal()) {
+        X86AddressMode SrcAM;
+        SrcAM.Base.Reg = ArgReg;
+        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+          return false;
+      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+        // If this is a really simple value, emit this with the Value* version
+        // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
+        // as it can cause us to reevaluate the argument.
+        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
+          return false;
+      } else {
+        bool ValIsKill = hasTrivialKill(ArgVal);
+        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
+          return false;
+      }
+    }
+  }
+
+  // ELF / PIC requires GOT in the EBX register before function calls via PLT
+  // GOT pointer.
+  if (Subtarget->isPICStyleGOT()) {
+    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
+  }
+
+  if (Is64Bit && IsVarArg && !IsWin64) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an ubound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // Count the number of XMM registers allocated.
+    static const MCPhysReg XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+            X86::AL).addImm(NumXMMRegs);
+  }
+
+  // Materialize callee address in a register. FIXME: GV address can be
+  // handled with a CALLpcrel32 instead.
+  X86AddressMode CalleeAM;
+  if (!X86SelectCallAddress(Callee, CalleeAM))
+    return false;
+
+  unsigned CalleeOp = 0;
+  const GlobalValue *GV = nullptr;
+  if (CalleeAM.GV != nullptr) {
+    GV = CalleeAM.GV;
+  } else if (CalleeAM.Base.Reg != 0) {
+    CalleeOp = CalleeAM.Base.Reg;
+  } else
+    return false;
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  if (CalleeOp) {
+    // Register-indirect call.
+    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
+      .addReg(CalleeOp);
+  } else {
+    // Direct call.
+    assert(GV && "Not a direct call");
+    unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+
+    // See if we need any target-specific flags on the GV operand.
+    unsigned char OpFlags = 0;
+
+    // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+    // external symbols most go through the PLT in PIC mode.  If the symbol
+    // has hidden or protected visibility, or if it is static or local, then
+    // we don't need to use the PLT - we can directly call it.
+    if (Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_ &&
+        GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+      OpFlags = X86II::MO_PLT;
+    } else if (Subtarget->isPICStyleStubAny() &&
+               (GV->isDeclaration() || GV->isWeakForLinker()) &&
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = X86II::MO_DARWIN_STUB;
+    }
+
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
+    if (SymName)
+      MIB.addExternalSymbol(SymName, OpFlags);
+    else
+      MIB.addGlobalAddress(GV, 0, OpFlags);
+  }
+
+  // Add a register mask operand representing the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (Subtarget->isPICStyleGOT())
+    MIB.addReg(X86::EBX, RegState::Implicit);
+
+  if (Is64Bit && IsVarArg && !IsWin64)
+    MIB.addReg(X86::AL, RegState::Implicit);
+
+  // Add implicit physical register uses to the call.
+  for (auto Reg : OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
+
+  // Issue CALLSEQ_END
+  unsigned NumBytesForCalleeToPop =
+    computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+    .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
+
+  // Now handle call return values.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs,
+                    CLI.RetTy->getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    EVT CopyVT = VA.getValVT();
+    unsigned CopyReg = ResultReg + i;
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+
+    // If this is a call to a function that returns an fp value on the floating
+    // point stack, we must guarantee the value is popped from the stack, so
+    // a COPY is not good enough - the copy instruction may be eliminated if the
+    // return value is not used. We use the FpPOP_RETVAL instruction instead.
+    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
+      // If we prefer to use the value in xmm registers, copy it out as f80 and
+      // use a truncate to move it from fp stack reg to xmm reg.
+      if (isScalarFPTypeInSSEReg(VA.getValVT())) {
+        CopyVT = MVT::f80;
+        CopyReg = createResultReg(&X86::RFP80RegClass);
+      }
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::FpPOP_RETVAL), CopyReg);
+
+      // Round the f80 to the right size, which also moves it to the appropriate
+      // xmm register. This is accomplished by storing the f80 value in memory
+      // and then loading it back.
+      if (CopyVT != VA.getValVT()) {
+        EVT ResVT = VA.getValVT();
+        unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+        unsigned MemSize = ResVT.getSizeInBits()/8;
+        int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+        addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                  TII.get(Opc)), FI)
+          .addReg(CopyReg);
+        Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+        addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                  TII.get(Opc), ResultReg + i), FI);
+      }
+    } else {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+      InRegs.push_back(VA.getLocReg());
+    }
+  }
+
+  CLI.ResultReg = ResultReg;
+  CLI.NumResultRegs = RVLocs.size();
+  CLI.Call = MIB;
+
+  return true;
+}
+
+bool
+X86FastISel::TargetSelectInstruction(const Instruction *I)  {
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::Load:
+    return X86SelectLoad(I);
+  case Instruction::Store:
+    return X86SelectStore(I);
+  case Instruction::Ret:
+    return X86SelectRet(I);
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return X86SelectCmp(I);
+  case Instruction::ZExt:
+    return X86SelectZExt(I);
+  case Instruction::Br:
+    return X86SelectBranch(I);
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::Shl:
+    return X86SelectShift(I);
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    return X86SelectDivRem(I);
+  case Instruction::Select:
+    return X86SelectSelect(I);
+  case Instruction::Trunc:
+    return X86SelectTrunc(I);
+  case Instruction::FPExt:
+    return X86SelectFPExt(I);
+  case Instruction::FPTrunc:
+    return X86SelectFPTrunc(I);
+  case Instruction::IntToPtr: // Deliberate fall-through.
+  case Instruction::PtrToInt: {
+    EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+    EVT DstVT = TLI.getValueType(I->getType());
+    if (DstVT.bitsGT(SrcVT))
+      return X86SelectZExt(I);
+    if (DstVT.bitsLT(SrcVT))
+      return X86SelectTrunc(I);
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0) return false;
+    UpdateValueMap(I, Reg);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
+  MVT VT;
+  if (!isTypeLegal(C->getType(), VT))
+    return 0;
+
+  // Can't handle alternate code models yet.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
+
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = &X86::GR8RegClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = &X86::GR16RegClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = &X86::GR32RegClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = &X86::GR64RegClass;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  // Materialize addresses with LEA/MOV instructions.
+  if (isa<GlobalValue>(C)) {
+    X86AddressMode AM;
+    if (X86SelectAddress(C, AM)) {
+      // If the expression is just a basereg, then we're done, otherwise we need
+      // to emit an LEA.
+      if (AM.BaseType == X86AddressMode::RegBase &&
+          AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
+        return AM.Base.Reg;
+
+      unsigned ResultReg = createResultReg(RC);
+      if (TM.getRelocationModel() == Reloc::Static &&
+          TLI.getPointerTy() == MVT::i64) {
+        // The displacement code be more than 32 bits away so we need to use
+        // an instruction with a 64 bit immediate
+        Opc = X86::MOV64ri;
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C));
+      } else {
+        Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+        addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                             TII.get(Opc), ResultReg), AM);
+      }
+      return ResultReg;
+    }
+    return 0;
+  }
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // Alignment of vector types.  FIXME!
+    Align = DL.getTypeAllocSize(C->getType());
+  }
+
+  // x86-32 PIC requires a PIC base register for constant pools.
+  unsigned PICBase = 0;
+  unsigned char OpFlag = 0;
+  if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  } else if (Subtarget->isPICStyleRIPRel() &&
+             TM.getCodeModel() == CodeModel::Small) {
+    PICBase = X86::RIP;
+  }
+
+  // Create the load from the constant pool.
+  unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+  unsigned ResultReg = createResultReg(RC);
+  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                   TII.get(Opc), ResultReg),
+                           MCPOffset, PICBase, OpFlag);
+
+  return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
+  // Fail on dynamic allocas. At this point, getRegForValue has already
+  // checked its CSE maps, so if we're here trying to handle a dynamic
+  // alloca, we're not going to succeed. X86SelectAddress has a
+  // check for dynamic allocas, because it's called directly from
+  // various places, but TargetMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddrss, and TargetMaterializeAlloca.
+  if (!FuncInfo.StaticAllocaMap.count(C))
+    return 0;
+  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(C, AM))
+    return 0;
+  unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+  unsigned ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                         TII.get(Opc), ResultReg), AM);
+  return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
+  MVT VT;
+  if (!isTypeLegal(CF->getType(), VT))
+    return 0;
+
+  // Get opcode and regclass for the given zero.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = X86::FsFLD0SS;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp032;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = X86::FsFLD0SD;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp064;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+  return ResultReg;
+}
+
+
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  const Value *Ptr = LI->getPointerOperand();
+  X86AddressMode AM;
+  if (!X86SelectAddress(Ptr, AM))
+    return false;
+
+  const X86InstrInfo &XII = (const X86InstrInfo&)TII;
+
+  unsigned Size = DL.getTypeAllocSize(LI->getType());
+  unsigned Alignment = LI->getAlignment();
+
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = DL.getABITypeAlignment(LI->getType());
+
+  SmallVector<MachineOperand, 8> AddrOps;
+  AM.getFullAddress(AddrOps);
+
+  MachineInstr *Result =
+    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+  if (!Result)
+    return false;
+
+  Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+  FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
+  MI->eraseFromParent();
+  return true;
+}
+
+
+namespace llvm {
+  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    return new X86FastISel(funcInfo, libInfo);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 0000000..eb9f743
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,341 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that finds instructions that can be
+// re-written as LEA instructions in order to reduce pipeline delays.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+class FixupLEAPass : public MachineFunctionPass {
+  enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+  static char ID;
+  /// \brief Loop over all of the instructions in the basic block
+  /// replacing applicable instructions with LEA instructions,
+  /// where appropriate.
+  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+  const char *getPassName() const override { return "X86 LEA Fixup"; }
+
+  /// \brief Given a machine register, look for the instruction
+  /// which writes it in the current basic block. If found,
+  /// try to replace it with an equivalent LEA instruction.
+  /// If replacement succeeds, then also process the the newly created
+  /// instruction.
+  void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+                    MachineFunction::iterator MFI);
+
+  /// \brief Given a memory access or LEA instruction
+  /// whose address mode uses a base and/or index register, look for
+  /// an opportunity to replace the instruction which sets the base or index
+  /// register with an equivalent LEA instruction.
+  void processInstruction(MachineBasicBlock::iterator &I,
+                          MachineFunction::iterator MFI);
+
+  /// \brief Given a LEA instruction which is unprofitable
+  /// on Silvermont try to replace it with an equivalent ADD instruction
+  void processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                MachineFunction::iterator MFI);
+
+  /// \brief Determine if an instruction references a machine register
+  /// and, if so, whether it reads or writes the register.
+  RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
+
+  /// \brief Step backwards through a basic block, looking
+  /// for an instruction which writes a register within
+  /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+  MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+                                              MachineBasicBlock::iterator &I,
+                                              MachineFunction::iterator MFI);
+
+  /// \brief if an instruction can be converted to an
+  /// equivalent LEA, insert the new instruction into the basic block
+  /// and return a pointer to it. Otherwise, return zero.
+  MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+                                   MachineBasicBlock::iterator &MBBI) const;
+
+public:
+  FixupLEAPass() : MachineFunctionPass(ID) {}
+
+  /// \brief Loop over all of the basic blocks,
+  /// replacing instructions by equivalent LEA instructions
+  /// if needed and when possible.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  MachineFunction *MF;
+  const TargetMachine *TM;
+  const X86InstrInfo *TII; // Machine instruction info.
+};
+char FixupLEAPass::ID = 0;
+}
+
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+                                 MachineBasicBlock::iterator &MBBI) const {
+  MachineInstr *MI = MBBI;
+  MachineInstr *NewMI;
+  switch (MI->getOpcode()) {
+  case X86::MOV32rr:
+  case X86::MOV64rr: {
+    const MachineOperand &Src = MI->getOperand(1);
+    const MachineOperand &Dest = MI->getOperand(0);
+    NewMI = BuildMI(*MF, MI->getDebugLoc(),
+                    TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r
+                                                             : X86::LEA64r))
+                .addOperand(Dest)
+                .addOperand(Src)
+                .addImm(1)
+                .addReg(0)
+                .addImm(0)
+                .addReg(0);
+    MFI->insert(MBBI, NewMI); // Insert the new inst
+    return NewMI;
+  }
+  case X86::ADD64ri32:
+  case X86::ADD64ri8:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8_DB:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri_DB:
+  case X86::ADD32ri8_DB:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    if (!MI->getOperand(2).isImm()) {
+      // convertToThreeAddress will call getImm()
+      // which requires isImm() to be true
+      return nullptr;
+    }
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+    if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) {
+      // if src1 != src2, then convertToThreeAddress will
+      // need to create a Virtual register, which we cannot do
+      // after register allocation.
+      return nullptr;
+    }
+  }
+  return TII->convertToThreeAddress(MFI, MBBI, nullptr);
+}
+
+FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+  MF = &Func;
+  TM = &Func.getTarget();
+  const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>();
+  if (!ST.LEAusesAG() && !ST.slowLEA())
+    return false;
+
+  TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo());
+
+  DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+  // Process all basic blocks.
+  for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
+    processBasicBlock(Func, I);
+  DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+  return true;
+}
+
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
+  RegUsageState RegUsage = RU_NotUsed;
+  MachineInstr *MI = I;
+
+  for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &opnd = MI->getOperand(i);
+    if (opnd.isReg() && opnd.getReg() == p.getReg()) {
+      if (opnd.isDef())
+        return RU_Write;
+      RegUsage = RU_Read;
+    }
+  }
+  return RegUsage;
+}
+
+/// getPreviousInstr - Given a reference to an instruction in a basic
+/// block, return a reference to the previous instruction in the block,
+/// wrapping around to the last instruction of the block if the block
+/// branches to itself.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
+                                    MachineFunction::iterator MFI) {
+  if (I == MFI->begin()) {
+    if (MFI->isPredecessor(MFI)) {
+      I = --MFI->end();
+      return true;
+    } else
+      return false;
+  }
+  --I;
+  return true;
+}
+
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+                              MachineFunction::iterator MFI) {
+  int InstrDistance = 1;
+  MachineBasicBlock::iterator CurInst;
+  static const int INSTR_DISTANCE_THRESHOLD = 5;
+
+  CurInst = I;
+  bool Found;
+  Found = getPreviousInstr(CurInst, MFI);
+  while (Found && I != CurInst) {
+    if (CurInst->isCall() || CurInst->isInlineAsm())
+      break;
+    if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
+      break; // too far back to make a difference
+    if (usesRegister(p, CurInst) == RU_Write) {
+      return CurInst;
+    }
+    InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
+    Found = getPreviousInstr(CurInst, MFI);
+  }
+  return nullptr;
+}
+
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
+                                      MachineFunction::iterator MFI) {
+  // Process a load, store, or LEA instruction.
+  MachineInstr *MI = I;
+  int opcode = MI->getOpcode();
+  const MCInstrDesc &Desc = MI->getDesc();
+  int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
+  if (AddrOffset >= 0) {
+    AddrOffset += X86II::getOperandBias(Desc);
+    MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+    if (p.isReg() && p.getReg() != X86::ESP) {
+      seekLEAFixup(p, I, MFI);
+    }
+    MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+    if (q.isReg() && q.getReg() != X86::ESP) {
+      seekLEAFixup(q, I, MFI);
+    }
+  }
+}
+
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+                                MachineBasicBlock::iterator &I,
+                                MachineFunction::iterator MFI) {
+  MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+  if (MBI) {
+    MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
+    if (NewMI) {
+      ++NumLEAs;
+      DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
+      // now to replace with an equivalent LEA...
+      DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+      MFI->erase(MBI);
+      MachineBasicBlock::iterator J =
+          static_cast<MachineBasicBlock::iterator>(NewMI);
+      processInstruction(J, MFI);
+    }
+  }
+}
+
+void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                            MachineFunction::iterator MFI) {
+  MachineInstr *MI = I;
+  const int opcode = MI->getOpcode();
+  if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r &&
+      opcode != X86::LEA64_32r)
+    return;
+  if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() ||
+      !TII->isSafeToClobberEFLAGS(*MFI, I))
+    return;
+  const unsigned DstR = MI->getOperand(0).getReg();
+  const unsigned SrcR1 = MI->getOperand(1).getReg();
+  const unsigned SrcR2 = MI->getOperand(3).getReg();
+  if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
+    return;
+  if (MI->getOperand(2).getImm() > 1)
+    return;
+  int addrr_opcode, addri_opcode;
+  switch (opcode) {
+  case X86::LEA16r:
+    addrr_opcode = X86::ADD16rr;
+    addri_opcode = X86::ADD16ri;
+    break;
+  case X86::LEA32r:
+    addrr_opcode = X86::ADD32rr;
+    addri_opcode = X86::ADD32ri;
+    break;
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+    addrr_opcode = X86::ADD64rr;
+    addri_opcode = X86::ADD64ri32;
+    break;
+  default:
+    assert(false && "Unexpected LEA instruction");
+  }
+  DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+  DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+  MachineInstr *NewMI = nullptr;
+  const MachineOperand &Dst = MI->getOperand(0);
+  // Make ADD instruction for two registers writing to LEA's destination
+  if (SrcR1 != 0 && SrcR2 != 0) {
+    const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+    const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1);
+    NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode))
+                .addOperand(Dst)
+                .addOperand(Src1)
+                .addOperand(Src2);
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+  // Make ADD instruction for immediate
+  if (MI->getOperand(4).getImm() != 0) {
+    const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+    NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode))
+                .addOperand(Dst)
+                .addOperand(SrcR)
+                .addImm(MI->getOperand(4).getImm());
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+  if (NewMI) {
+    MFI->erase(I);
+    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+  }
+}
+
+bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
+                                     MachineFunction::iterator MFI) {
+
+  for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+    if (TM->getSubtarget<X86Subtarget>().isSLM())
+      processInstructionForSLM(I, MFI);
+    else
+      processInstruction(I, MFI);
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..c8a3ab3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1773 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-codegen"
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP  , "Number of floating point instructions");
+
+namespace {
+  struct FPS : public MachineFunctionPass {
+    static char ID;
+    FPS() : MachineFunctionPass(ID) {
+      initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+      // This is really only to keep valgrind quiet.
+      // The logic in isLive() is too much for it.
+      memset(Stack, 0, sizeof(Stack));
+      memset(RegMap, 0, sizeof(RegMap));
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<EdgeBundles>();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override { return "X86 FP Stackifier"; }
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+
+    // Two CFG edges are related if they leave the same block, or enter the same
+    // block. The transitive closure of an edge under this relation is a
+    // LiveBundle. It represents a set of CFG edges where the live FP stack
+    // registers must be allocated identically in the x87 stack.
+    //
+    // A LiveBundle is usually all the edges leaving a block, or all the edges
+    // entering a block, but it can contain more edges if critical edges are
+    // present.
+    //
+    // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+    // but the exact mapping of FP registers to stack slots is fixed later.
+    struct LiveBundle {
+      // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
+      unsigned Mask;
+
+      // Number of pre-assigned live registers in FixStack. This is 0 when the
+      // stack order has not yet been fixed.
+      unsigned FixCount;
+
+      // Assigned stack order for live-in registers.
+      // FixStack[i] == getStackEntry(i) for all i < FixCount.
+      unsigned char FixStack[8];
+
+      LiveBundle() : Mask(0), FixCount(0) {}
+
+      // Have the live registers been assigned a stack order yet?
+      bool isFixed() const { return !Mask || FixCount; }
+    };
+
+    // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+    // with no live FP registers.
+    SmallVector<LiveBundle, 8> LiveBundles;
+
+    // The edge bundle analysis provides indices into the LiveBundles vector.
+    EdgeBundles *Bundles;
+
+    // Return a bitmask of FP registers in block's live-in list.
+    static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+      unsigned Mask = 0;
+      for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+           E = MBB->livein_end(); I != E; ++I) {
+        unsigned Reg = *I;
+        if (Reg < X86::FP0 || Reg > X86::FP6)
+          continue;
+        Mask |= 1 << (Reg - X86::FP0);
+      }
+      return Mask;
+    }
+
+    // Partition all the CFG edges into LiveBundles.
+    void bundleCFG(MachineFunction &MF);
+
+    MachineBasicBlock *MBB;     // Current basic block
+
+    // The hardware keeps track of how many FP registers are live, so we have
+    // to model that exactly. Usually, each live register corresponds to an
+    // FP<n> register, but when dealing with calls, returns, and inline
+    // assembly, it is sometimes necessary to have live scratch registers.
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned StackTop;          // The current top of the FP stack.
+
+    enum {
+      NumFPRegs = 16            // Including scratch pseudo-registers.
+    };
+
+    // For each live FP<n> register, point to its Stack[] entry.
+    // The first entries correspond to FP0-FP6, the rest are scratch registers
+    // used when we need slightly different live registers than what the
+    // register allocator thinks.
+    unsigned RegMap[NumFPRegs];
+
+    // Pending fixed registers - Inline assembly needs FP registers to appear
+    // in fixed stack slot positions. This is handled by copying FP registers
+    // to ST registers before the instruction, and copying back after the
+    // instruction.
+    //
+    // This is modeled with pending ST registers. NumPendingSTs is the number
+    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
+    // register that holds the ST value. The ST registers are not moved into
+    // place until immediately before the instruction that needs them.
+    //
+    // It can happen that we need an ST register to be live when no FP register
+    // holds the value:
+    //
+    //   %ST0 = COPY %FP4<kill>
+    //
+    // When that happens, we allocate a scratch FP register to hold the ST
+    // value. That means every register in PendingST must be live.
+
+    unsigned NumPendingSTs;
+    unsigned char PendingST[8];
+
+    // Set up our stack model to match the incoming registers to MBB.
+    void setupBlockStack();
+
+    // Shuffle live registers to match the expectations of successor blocks.
+    void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dumpStack() const {
+      dbgs() << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        dbgs() << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      for (unsigned i = 0; i != NumPendingSTs; ++i)
+        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
+      dbgs() << "\n";
+    }
+#endif
+
+    /// getSlot - Return the stack slot number a particular register number is
+    /// in.
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < NumFPRegs && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    /// isLive - Is RegNo currently live in the stack?
+    bool isLive(unsigned RegNo) const {
+      unsigned Slot = getSlot(RegNo);
+      return Slot < StackTop && Stack[Slot] == RegNo;
+    }
+
+    /// getScratchReg - Return an FP register that is not currently in use.
+    unsigned getScratchReg() const {
+      for (int i = NumFPRegs - 1; i >= 8; --i)
+        if (!isLive(i))
+          return i;
+      llvm_unreachable("Ran out of scratch FP registers");
+    }
+
+    /// isScratchReg - Returns trus if RegNo is a scratch FP register.
+    static bool isScratchReg(unsigned RegNo) {
+      return RegNo > 8 && RegNo < NumFPRegs;
+    }
+
+    /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+    unsigned getStackEntry(unsigned STi) const {
+      if (STi >= StackTop)
+        report_fatal_error("Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    /// getSTReg - Return the X86::ST(i) register which contains the specified
+    /// FP<RegNo> register.
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack.
+    void pushReg(unsigned Reg) {
+      assert(Reg < NumFPRegs && "Register number out of range!");
+      if (StackTop >= 8)
+        report_fatal_error("Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      if (isAtTop(RegNo)) return;
+
+      unsigned STReg = getSTReg(RegNo);
+      unsigned RegOnTop = getStackEntry(0);
+
+      // Swap the slots the regs are in.
+      std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+      // Swap stack slot contents.
+      if (RegMap[RegOnTop] >= StackTop)
+        report_fatal_error("Access past stack top!");
+      std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processors version of the state.
+      BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+      ++NumFXCH;
+    }
+
+    void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      unsigned STReg = getSTReg(RegNo);
+      pushReg(AsReg);   // New register on top of stack
+
+      BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+    }
+
+    /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
+    /// RegNo. If any PendingST registers still need the RegNo value, duplicate
+    /// them to new scratch registers.
+    void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
+      for (unsigned i = 0; i != NumPendingSTs; ++i) {
+        if (PendingST[i] != RegNo)
+          continue;
+        unsigned SR = getScratchReg();
+        DEBUG(dbgs() << "Duplicating pending ST" << i
+                     << " in FP" << RegNo << " to FP" << SR << '\n');
+        duplicateToTop(RegNo, SR, I);
+        PendingST[i] = SR;
+      }
+    }
+
+    /// popStackAfter - Pop the current value off of the top of the FP stack
+    /// after the specified instruction.
+    void popStackAfter(MachineBasicBlock::iterator &I);
+
+    /// freeStackSlotAfter - Free the specified register from the register
+    /// stack, so that it is no longer in a register.  If the register is
+    /// currently at the top of the stack, we just pop the current instruction,
+    /// otherwise we store the current top-of-stack into the specified slot,
+    /// then pop the top of stack.
+    void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+    /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+    /// instruction.
+    MachineBasicBlock::iterator
+    freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+    /// Adjust the live registers to be the set in Mask.
+    void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+    /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+    /// st(0), FP reg FixStack[1] is st(1) etc.
+    void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+                         MachineBasicBlock::iterator I);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    void handleZeroArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+    void handleTwoArgFP(MachineBasicBlock::iterator &I);
+    void handleCompareFP(MachineBasicBlock::iterator &I);
+    void handleCondMovFP(MachineBasicBlock::iterator &I);
+    void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+    // Check if a COPY instruction is using FP registers.
+    static bool isFPCopy(MachineInstr *MI) {
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned SrcReg = MI->getOperand(1).getReg();
+
+      return X86::RFP80RegClass.contains(DstReg) ||
+        X86::RFP80RegClass.contains(SrcReg);
+    }
+  };
+  char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isReg() && "Expected an FP register!");
+  unsigned Reg = MO.getReg();
+  assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+  return Reg - X86::FP0;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+  // We only need to run this pass if there are any FP registers used in this
+  // function.  If it is all integer, there is nothing for us to do!
+  bool FPIsUsed = false;
+
+  assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+  for (unsigned i = 0; i <= 6; ++i)
+    if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+      FPIsUsed = true;
+      break;
+    }
+
+  // Early exit.
+  if (!FPIsUsed) return false;
+
+  Bundles = &getAnalysis<EdgeBundles>();
+  TII = MF.getTarget().getInstrInfo();
+
+  // Prepare cross-MBB liveness.
+  bundleCFG(MF);
+
+  StackTop = 0;
+
+  // Process the function in depth first order so that we process at least one
+  // of the predecessors for every reachable block in the function.
+  SmallPtrSet<MachineBasicBlock*, 8> Processed;
+  MachineBasicBlock *Entry = MF.begin();
+
+  bool Changed = false;
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+         I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+       I != E; ++I)
+    Changed |= processBasicBlock(MF, **I);
+
+  // Process any unreachable blocks in arbitrary order now.
+  if (MF.size() != Processed.size())
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      if (Processed.insert(BB))
+        Changed |= processBasicBlock(MF, *BB);
+
+  LiveBundles.clear();
+
+  return Changed;
+}
+
+/// bundleCFG - Scan all the basic blocks to determine consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFG(MachineFunction &MF) {
+  assert(LiveBundles.empty() && "Stale data in LiveBundles");
+  LiveBundles.resize(Bundles->getNumBundles());
+
+  // Gather the actual live-in masks for all MBBs.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    const unsigned Mask = calcLiveInMask(MBB);
+    if (!Mask)
+      continue;
+    // Update MBB ingoing bundle mask.
+    LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask;
+  }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+  NumPendingSTs = 0;
+
+  setupBlockStack();
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    uint64_t Flags = MI->getDesc().TSFlags;
+
+    unsigned FPInstClass = Flags & X86II::FPTypeMask;
+    if (MI->isInlineAsm())
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI->isCopy() && isFPCopy(MI))
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI->isImplicitDef() &&
+        X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+      FPInstClass = X86II::SpecialFP;
+
+    if (FPInstClass == X86II::NotFP)
+      continue;  // Efficiently ignore non-fp insts!
+
+    MachineInstr *PrevMI = nullptr;
+    if (I != BB.begin())
+      PrevMI = std::prev(I);
+
+    ++NumFP;  // Keep track of # of pseudo instrs
+    DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    switch (FPInstClass) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I); break;
+    case X86II::OneArgFP:   handleOneArgFP(I);  break;  // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);  break;
+    case X86II::CompareFP:  handleCompareFP(I); break;
+    case X86II::CondMovFP:  handleCondMovFP(I); break;
+    case X86II::SpecialFP:  handleSpecialFP(I); break;
+    default: llvm_unreachable("Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition.  If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+        DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to if -debug
+    DEBUG(
+      MachineBasicBlock::iterator PrevI(PrevMI);
+      if (I == PrevI) {
+        dbgs() << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && std::prev(Start) != PrevI) --Start;
+        dbgs() << "Inserted instructions:\n\t";
+        Start->print(dbgs(), &MF.getTarget());
+        while (++Start != std::next(I)) {}
+      }
+      dumpStack();
+    );
+    (void)PrevMI;
+
+    Changed = true;
+  }
+
+  finishBlockStack();
+
+  return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+  DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+               << " derived from " << MBB->getName() << ".\n");
+  StackTop = 0;
+  // Get the live-in bundle for MBB.
+  const LiveBundle &Bundle =
+    LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "Block has no FP live-ins.\n");
+    return;
+  }
+
+  // Depth-first iteration should ensure that we always have an assigned stack.
+  assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+  // Push the fixed live-in registers.
+  for (unsigned i = Bundle.FixCount; i > 0; --i) {
+    MBB->addLiveIn(X86::ST0+i-1);
+    DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+                 << unsigned(Bundle.FixStack[i-1]) << '\n');
+    pushReg(Bundle.FixStack[i-1]);
+  }
+
+  // Kill off unwanted live-ins. This can happen with a critical edge.
+  // FIXME: We could keep these live registers around as zombies. They may need
+  // to be revived at the end of a short block. It might save a few instrs.
+  adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+  DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// predecessors, and ensure that all predecessors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+  // The RET handling below takes care of return blocks for us.
+  if (MBB->succ_empty())
+    return;
+
+  DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+               << " derived from " << MBB->getName() << ".\n");
+
+  // Get MBB's live-out bundle.
+  unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+  LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+  // We may need to kill and define some registers to match successors.
+  // FIXME: This can probably be combined with the shuffle below.
+  MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+  adjustLiveRegs(Bundle.Mask, Term);
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "No live-outs.\n");
+    return;
+  }
+
+  // Has the stack order been fixed yet?
+  DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+  if (Bundle.isFixed()) {
+    DEBUG(dbgs() << "Shuffling stack to match.\n");
+    shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+  } else {
+    // Not fixed yet, we get to choose.
+    DEBUG(dbgs() << "Fixing stack order now.\n");
+    Bundle.FixCount = StackTop;
+    for (unsigned i = 0; i < StackTop; ++i)
+      Bundle.FixStack[i] = getStackEntry(i);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct TableEntry {
+    uint16_t from;
+    uint16_t to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+                                                const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 0; i != NumEntries-1; ++i)
+    if (!(Table[i] < Table[i+1])) return false;
+  return true;
+}
+#endif
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I->to;
+  return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, array_lengthof(TABLE)) &&              \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is an register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+  { X86::ABS_Fp32     , X86::ABS_F     },
+  { X86::ABS_Fp64     , X86::ABS_F     },
+  { X86::ABS_Fp80     , X86::ABS_F     },
+  { X86::ADD_Fp32m    , X86::ADD_F32m  },
+  { X86::ADD_Fp64m    , X86::ADD_F64m  },
+  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m64  , X86::ADD_F64m  },
+  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+  { X86::CHS_Fp32     , X86::CHS_F     },
+  { X86::CHS_Fp64     , X86::CHS_F     },
+  { X86::CHS_Fp80     , X86::CHS_F     },
+  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp80  , X86::CMOVBE_F  },
+  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp64   , X86::CMOVB_F  },
+  { X86::CMOVB_Fp80   , X86::CMOVB_F  },
+  { X86::CMOVE_Fp32   , X86::CMOVE_F  },
+  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp80   , X86::CMOVE_F   },
+  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp80  , X86::CMOVNB_F  },
+  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp80  , X86::CMOVNE_F  },
+  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp80  , X86::CMOVNP_F  },
+  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp80   , X86::CMOVP_F   },
+  { X86::COS_Fp32     , X86::COS_F     },
+  { X86::COS_Fp64     , X86::COS_F     },
+  { X86::COS_Fp80     , X86::COS_F     },
+  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
+  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
+  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+  { X86::DIV_Fp32m    , X86::DIV_F32m  },
+  { X86::DIV_Fp64m    , X86::DIV_F64m  },
+  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m64  , X86::DIV_F64m  },
+  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m80  , X86::ILD_F16m  },
+  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m80  , X86::ILD_F32m  },
+  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m80  , X86::ILD_F64m  },
+  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+  { X86::IST_Fp16m32  , X86::IST_F16m  },
+  { X86::IST_Fp16m64  , X86::IST_F16m  },
+  { X86::IST_Fp16m80  , X86::IST_F16m  },
+  { X86::IST_Fp32m32  , X86::IST_F32m  },
+  { X86::IST_Fp32m64  , X86::IST_F32m  },
+  { X86::IST_Fp32m80  , X86::IST_F32m  },
+  { X86::IST_Fp64m32  , X86::IST_FP64m },
+  { X86::IST_Fp64m64  , X86::IST_FP64m },
+  { X86::IST_Fp64m80  , X86::IST_FP64m },
+  { X86::LD_Fp032     , X86::LD_F0     },
+  { X86::LD_Fp064     , X86::LD_F0     },
+  { X86::LD_Fp080     , X86::LD_F0     },
+  { X86::LD_Fp132     , X86::LD_F1     },
+  { X86::LD_Fp164     , X86::LD_F1     },
+  { X86::LD_Fp180     , X86::LD_F1     },
+  { X86::LD_Fp32m     , X86::LD_F32m   },
+  { X86::LD_Fp32m64   , X86::LD_F32m   },
+  { X86::LD_Fp32m80   , X86::LD_F32m   },
+  { X86::LD_Fp64m     , X86::LD_F64m   },
+  { X86::LD_Fp64m80   , X86::LD_F64m   },
+  { X86::LD_Fp80m     , X86::LD_F80m   },
+  { X86::MUL_Fp32m    , X86::MUL_F32m  },
+  { X86::MUL_Fp64m    , X86::MUL_F64m  },
+  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m64  , X86::MUL_F64m  },
+  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+  { X86::SIN_Fp32     , X86::SIN_F     },
+  { X86::SIN_Fp64     , X86::SIN_F     },
+  { X86::SIN_Fp80     , X86::SIN_F     },
+  { X86::SQRT_Fp32    , X86::SQRT_F    },
+  { X86::SQRT_Fp64    , X86::SQRT_F    },
+  { X86::SQRT_Fp80    , X86::SQRT_F    },
+  { X86::ST_Fp32m     , X86::ST_F32m   },
+  { X86::ST_Fp64m     , X86::ST_F64m   },
+  { X86::ST_Fp64m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m64   , X86::ST_F64m   },
+  { X86::ST_FpP80m    , X86::ST_FP80m  },
+  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
+  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
+  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+  { X86::SUB_Fp32m    , X86::SUB_F32m  },
+  { X86::SUB_Fp64m    , X86::SUB_F64m  },
+  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m64  , X86::SUB_F64m  },
+  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+  { X86::TST_Fp32     , X86::TST_F     },
+  { X86::TST_Fp64     , X86::TST_F     },
+  { X86::TST_Fp80     , X86::TST_F     },
+  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr80  , X86::UCOM_FIr  },
+  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr80   , X86::UCOM_Fr   },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+  assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+  return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version.  The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+  { X86::ADD_FrST0 , X86::ADD_FPrST0  },
+
+  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+  { X86::DIV_FrST0 , X86::DIV_FPrST0  },
+
+  { X86::IST_F16m  , X86::IST_FP16m   },
+  { X86::IST_F32m  , X86::IST_FP32m   },
+
+  { X86::MUL_FrST0 , X86::MUL_FPrST0  },
+
+  { X86::ST_F32m   , X86::ST_FP32m    },
+  { X86::ST_F64m   , X86::ST_FP64m    },
+  { X86::ST_Frr    , X86::ST_FPrr     },
+
+  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+  { X86::SUB_FrST0 , X86::SUB_FPrST0  },
+
+  { X86::UCOM_FIr  , X86::UCOM_FIPr   },
+
+  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
+  { X86::UCOM_Fr   , X86::UCOM_FPr    },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction.  This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible.  The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+  MachineInstr* MI = I;
+  DebugLoc dl = MI->getDebugLoc();
+  ASSERT_SORTED(PopTable);
+  if (StackTop == 0)
+    report_fatal_error("Cannot pop empty stack!");
+  RegMap[Stack[--StackTop]] = ~0;     // Update state
+
+  // Check to see if there is a popping version of this instruction...
+  int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+  if (Opcode != -1) {
+    I->setDesc(TII->get(Opcode));
+    if (Opcode == X86::UCOM_FPPr)
+      I->RemoveOperand(0);
+  } else {    // Insert an explicit pop
+    I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+  }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register.  If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
+    popStackAfter(I);
+    return;
+  }
+
+  // Otherwise, store the top of stack into the dead slot, killing the operand
+  // without having to add in an explicit xchg then pop.
+  //
+  I = freeStackSlotBefore(++I, FPRegNo);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+  unsigned STReg    = getSTReg(FPRegNo);
+  unsigned OldSlot  = getSlot(FPRegNo);
+  unsigned TopReg   = Stack[StackTop-1];
+  Stack[OldSlot]    = TopReg;
+  RegMap[TopReg]    = OldSlot;
+  RegMap[FPRegNo]   = ~0;
+  Stack[--StackTop] = ~0;
+  return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+  unsigned Defs = Mask;
+  unsigned Kills = 0;
+  for (unsigned i = 0; i < StackTop; ++i) {
+    unsigned RegNo = Stack[i];
+    if (!(Defs & (1 << RegNo)))
+      // This register is live, but we don't want it.
+      Kills |= (1 << RegNo);
+    else
+      // We don't need to imp-def this live register.
+      Defs &= ~(1 << RegNo);
+  }
+  assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+  // Produce implicit-defs for free by using killed registers.
+  while (Kills && Defs) {
+    unsigned KReg = countTrailingZeros(Kills);
+    unsigned DReg = countTrailingZeros(Defs);
+    DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+    std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+    std::swap(RegMap[KReg], RegMap[DReg]);
+    Kills &= ~(1 << KReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Kill registers by popping.
+  if (Kills && I != MBB->begin()) {
+    MachineBasicBlock::iterator I2 = std::prev(I);
+    while (StackTop) {
+      unsigned KReg = getStackEntry(0);
+      if (!(Kills & (1 << KReg)))
+        break;
+      DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+      popStackAfter(I2);
+      Kills &= ~(1 << KReg);
+    }
+  }
+
+  // Manually kill the rest.
+  while (Kills) {
+    unsigned KReg = countTrailingZeros(Kills);
+    DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+    freeStackSlotBefore(I, KReg);
+    Kills &= ~(1 << KReg);
+  }
+
+  // Load zeros for all the imp-defs.
+  while(Defs) {
+    unsigned DReg = countTrailingZeros(Defs);
+    DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+    BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+    pushReg(DReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Now we should have the correct registers live.
+  DEBUG(dumpStack());
+  assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+                          unsigned FixCount,
+                          MachineBasicBlock::iterator I) {
+  // Move items into place, starting from the desired stack bottom.
+  while (FixCount--) {
+    // Old register at position FixCount.
+    unsigned OldReg = getStackEntry(FixCount);
+    // Desired register at position FixCount.
+    unsigned Reg = FixStack[FixCount];
+    if (Reg == OldReg)
+      continue;
+    // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+    moveToTop(Reg, I);
+    if (FixCount > 0)
+      moveToTop(OldReg, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  if (KillsSrc)
+    duplicatePendingSTBeforeKill(Reg, I);
+
+  // FISTP64m is strange because there isn't a non-popping versions.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it.  This ensure that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64 ||
+       MI->getOpcode() == X86::IST_Fp64m80 ||
+       MI->getOpcode() == X86::ISTT_Fp16m80 ||
+       MI->getOpcode() == X86::ISTT_Fp32m80 ||
+       MI->getOpcode() == X86::ISTT_Fp64m80 ||
+       MI->getOpcode() == X86::ST_FpP80m)) {
+    duplicateToTop(Reg, getScratchReg(), I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m ||
+      MI->getOpcode() == X86::ST_FP80m) {
+    if (StackTop == 0)
+      report_fatal_error("Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.
+///
+///  Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+#ifndef NDEBUG
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  if (KillsSrc) {
+    duplicatePendingSTBeforeKill(Reg, I);
+    // If this is the last use of the source register, just make sure it's on
+    // the top of the stack.
+    moveToTop(Reg, I);
+    if (StackTop == 0)
+      report_fatal_error("Stack cannot be empty!");
+    --StackTop;
+    pushReg(getFPReg(MI->getOperand(0)));
+  } else {
+    // If this is not the last use of the source register, _copy_ it to the top
+    // of the stack.
+    duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(1);   // Drop the source operand.
+  MI->RemoveOperand(0);   // Drop the destination operand.
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r },
+  { X86::ADD_Fp64  , X86::ADD_FST0r },
+  { X86::ADD_Fp80  , X86::ADD_FST0r },
+  { X86::DIV_Fp32  , X86::DIV_FST0r },
+  { X86::DIV_Fp64  , X86::DIV_FST0r },
+  { X86::DIV_Fp80  , X86::DIV_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r },
+  { X86::MUL_Fp64  , X86::MUL_FST0r },
+  { X86::MUL_Fp80  , X86::MUL_FST0r },
+  { X86::SUB_Fp32  , X86::SUB_FST0r },
+  { X86::SUB_Fp64  , X86::SUB_FST0r },
+  { X86::SUB_Fp80  , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FST0r  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FST0r },
+  { X86::DIV_Fp64  , X86::DIVR_FST0r },
+  { X86::DIV_Fp80  , X86::DIVR_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FST0r  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FST0r },
+  { X86::SUB_Fp64  , X86::SUBR_FST0r },
+  { X86::SUB_Fp80  , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FrST0  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp80  , X86::DIVR_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FrST0  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp80  , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0 },
+  { X86::ADD_Fp64  , X86::ADD_FrST0 },
+  { X86::ADD_Fp80  , X86::ADD_FrST0 },
+  { X86::DIV_Fp32  , X86::DIV_FrST0 },
+  { X86::DIV_Fp64  , X86::DIV_FrST0 },
+  { X86::DIV_Fp80  , X86::DIV_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0 },
+  { X86::MUL_Fp64  , X86::MUL_FrST0 },
+  { X86::MUL_Fp80  , X86::MUL_FrST0 },
+  { X86::SUB_Fp32  , X86::SUB_FrST0 },
+  { X86::SUB_Fp64  , X86::SUB_FrST0 },
+  { X86::SUB_Fp80  , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub  ST(0), ST(i)
+///         ST(i) = fsub  ST(0), ST(i)
+///         ST(0) = fsubr ST(0), ST(i)
+///         ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+  unsigned Dest = getFPReg(MI->getOperand(0));
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned TOS = getStackEntry(0);
+
+  // One of our operands must be on the top of the stack.  If neither is yet, we
+  // need to move one.
+  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
+    // We can choose to move either operand to the top of the stack.  If one of
+    // the operands is killed by this instruction, we want that one so that we
+    // can update right on top of the old version.
+    if (KillsOp0) {
+      moveToTop(Op0, I);         // Move dead operand to TOS.
+      TOS = Op0;
+    } else if (KillsOp1) {
+      moveToTop(Op1, I);
+      TOS = Op1;
+    } else {
+      // All of the operands are live after this instruction executes, so we
+      // cannot update on top of any operand.  Because of this, we must
+      // duplicate one of the stack elements to the top.  It doesn't matter
+      // which one we pick.
+      //
+      duplicateToTop(Op0, Dest, I);
+      Op0 = TOS = Dest;
+      KillsOp0 = true;
+    }
+  } else if (!KillsOp0 && !KillsOp1) {
+    // If we DO have one of our operands at the top of the stack, but we don't
+    // have a dead operand, we must duplicate one of the operands to a new slot
+    // on the stack.
+    duplicateToTop(Op0, Dest, I);
+    Op0 = TOS = Dest;
+    KillsOp0 = true;
+  }
+
+  // Now we know that one of our operands is on the top of the stack, and at
+  // least one of our operands is killed by this instruction.
+  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+         "Stack conditions not set up right!");
+
+  // We decide which form to use based on what is on the top of the stack, and
+  // which operand is killed by this instruction.
+  const TableEntry *InstTable;
+  bool isForward = TOS == Op0;
+  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+  if (updateST0) {
+    if (isForward)
+      InstTable = ForwardST0Table;
+    else
+      InstTable = ReverseST0Table;
+  } else {
+    if (isForward)
+      InstTable = ForwardSTiTable;
+    else
+      InstTable = ReverseSTiTable;
+  }
+
+  int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
+                      MI->getOpcode());
+  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+  // NotTOS - The register which is not on the top of stack...
+  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+  // Replace the old instruction with a new instruction
+  MBB->remove(I++);
+  I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+  // If both operands are killed, pop one off of the stack in addition to
+  // overwriting the other one.
+  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+    assert(!updateST0 && "Should have updated other operand!");
+    popStackAfter(I);   // Pop the top of stack
+  }
+
+  // Update stack information so that we know the destination register is now on
+  // the stack.
+  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+  assert(UpdatedSlot < StackTop && Dest < 7);
+  Stack[UpdatedSlot]   = Dest;
+  RegMap[Dest]         = UpdatedSlot;
+  MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(Op0, I);
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->RemoveOperand(1);
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.
+  if (KillsOp0) freeStackSlotAfter(I, Op0);
+  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+
+  unsigned Op0 = getFPReg(MI->getOperand(0));
+  unsigned Op1 = getFPReg(MI->getOperand(2));
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(Op0, I);
+
+  // Change the second operand to the stack register that the operand is in.
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);
+  MI->RemoveOperand(1);
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If we kill the second operand, make sure to pop it from the stack.
+  if (Op0 != Op1 && KillsOp1) {
+    // Get this value off of the register stack.
+    freeStackSlotAfter(I, Op1);
+  }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions.  This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown SpecialFP instruction!");
+  case TargetOpcode::COPY: {
+    // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
+    const MachineOperand &MO1 = MI->getOperand(1);
+    const MachineOperand &MO0 = MI->getOperand(0);
+    unsigned DstST = MO0.getReg() - X86::ST0;
+    unsigned SrcST = MO1.getReg() - X86::ST0;
+    bool KillsSrc = MI->killsRegister(MO1.getReg());
+
+    // ST = COPY FP. Set up a pending ST register.
+    if (DstST < 8) {
+      unsigned SrcFP = getFPReg(MO1);
+      assert(isLive(SrcFP) && "Cannot copy dead register");
+      assert(!MO0.isDead() && "Cannot copy to dead ST register");
+
+      // Unallocated STs are marked as the nonexistent FP255.
+      while (NumPendingSTs <= DstST)
+        PendingST[NumPendingSTs++] = NumFPRegs;
+
+      // STi could still be live from a previous inline asm.
+      if (isScratchReg(PendingST[DstST])) {
+        DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
+                     << '\n');
+        freeStackSlotBefore(MI, PendingST[DstST]);
+      }
+
+      // When the source is killed, allocate a scratch FP register.
+      if (KillsSrc) {
+        duplicatePendingSTBeforeKill(SrcFP, I);
+        unsigned Slot = getSlot(SrcFP);
+        unsigned SR = getScratchReg();
+        PendingST[DstST] = SR;
+        Stack[Slot] = SR;
+        RegMap[SR] = Slot;
+      } else
+        PendingST[DstST] = SrcFP;
+      break;
+    }
+
+    // FP = COPY ST. Extract fixed stack value.
+    // Any instruction defining ST registers must have assigned them to a
+    // scratch register.
+    if (SrcST < 8) {
+      unsigned DstFP = getFPReg(MO0);
+      assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
+      assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
+      unsigned SrcFP = PendingST[SrcST];
+      assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
+      assert(isLive(SrcFP) && "Scratch holding ST is dead");
+
+      // DstFP steals the stack slot from SrcFP.
+      unsigned Slot = getSlot(SrcFP);
+      Stack[Slot] = DstFP;
+      RegMap[DstFP] = Slot;
+
+      // Always treat the ST as killed.
+      PendingST[SrcST] = NumFPRegs;
+      while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
+        --NumPendingSTs;
+      break;
+    }
+
+    // FP <- FP copy.
+    unsigned DstFP = getFPReg(MO0);
+    unsigned SrcFP = getFPReg(MO1);
+    assert(isLive(SrcFP) && "Cannot copy dead register");
+    if (KillsSrc) {
+      // If the input operand is killed, we can just change the owner of the
+      // incoming stack slot into the result.
+      unsigned Slot = getSlot(SrcFP);
+      Stack[Slot] = DstFP;
+      RegMap[DstFP] = Slot;
+    } else {
+      // For COPY we just duplicate the specified value to a new stack slot.
+      // This could be made better, but would require substantial changes.
+      duplicateToTop(SrcFP, DstFP, I);
+    }
+    break;
+  }
+
+  case TargetOpcode::IMPLICIT_DEF: {
+    // All FP registers must be explicitly defined, so load a 0 instead.
+    unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+    DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+    pushReg(Reg);
+    break;
+  }
+
+  case X86::FpPOP_RETVAL: {
+    // The FpPOP_RETVAL instruction is used after calls that return a value on
+    // the floating point stack. We cannot model this with ST defs since CALL
+    // instructions have fixed clobber lists. This instruction is interpreted
+    // to mean that there is one more live register on the stack than we
+    // thought.
+    //
+    // This means that StackTop does not match the hardware stack between a
+    // call and the FpPOP_RETVAL instructions.  We do tolerate FP instructions
+    // between CALL and FpPOP_RETVAL as long as they don't overflow the
+    // hardware stack.
+    unsigned DstFP = getFPReg(MI->getOperand(0));
+
+    // Move existing stack elements up to reflect reality.
+    assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
+    if (StackTop) {
+      std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
+      for (unsigned i = 0; i != NumFPRegs; ++i)
+        ++RegMap[i];
+    }
+    ++StackTop;
+
+    // DstFP is the new bottom of the stack.
+    Stack[0] = DstFP;
+    RegMap[DstFP] = 0;
+
+    // DstFP will be killed by processBasicBlock if this was a dead def.
+    break;
+  }
+
+  case TargetOpcode::INLINEASM: {
+    // The inline asm MachineInstr currently only *uses* FP registers for the
+    // 'f' constraint.  These should be turned into the current ST(x) register
+    // in the machine instr.
+    //
+    // There are special rules for x87 inline assembly. The compiler must know
+    // exactly how many registers are popped and pushed implicitly by the asm.
+    // Otherwise it is not possible to restore the stack state after the inline
+    // asm.
+    //
+    // There are 3 kinds of input operands:
+    //
+    // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+    //    popped input operand must be in a fixed stack slot, and it is either
+    //    tied to an output operand, or in the clobber list. The MI has ST use
+    //    and def operands for these inputs.
+    //
+    // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+    //    preserved by the inline asm. The fixed stack slots must be STn-STm
+    //    following the popped inputs. A fixed input operand cannot be tied to
+    //    an output or appear in the clobber list. The MI has ST use operands
+    //    and no defs for these inputs.
+    //
+    // 3. Preserved inputs. These inputs use the "f" constraint which is
+    //    represented as an FP register. The inline asm won't change these
+    //    stack slots.
+    //
+    // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+    // registers do not count as output operands. The inline asm changes the
+    // stack as if it popped all the popped inputs and then pushed all the
+    // output operands.
+
+    // Scan the assembly for ST registers used, defined and clobbered. We can
+    // only tell clobbers from defs by looking at the asm descriptor.
+    unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+    unsigned NumOps = 0;
+    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+         i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+      unsigned Flags = MI->getOperand(i).getImm();
+      NumOps = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumOps != 1)
+        continue;
+      const MachineOperand &MO = MI->getOperand(i + 1);
+      if (!MO.isReg())
+        continue;
+      unsigned STReg = MO.getReg() - X86::ST0;
+      if (STReg >= 8)
+        continue;
+
+      switch (InlineAsm::getKind(Flags)) {
+      case InlineAsm::Kind_RegUse:
+        STUses |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_RegDef:
+      case InlineAsm::Kind_RegDefEarlyClobber:
+        STDefs |= (1u << STReg);
+        if (MO.isDead())
+          STDeadDefs |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_Clobber:
+        STClobbers |= (1u << STReg);
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (STUses && !isMask_32(STUses))
+      MI->emitError("fixed input regs must be last on the x87 stack");
+    unsigned NumSTUses = CountTrailingOnes_32(STUses);
+
+    // Defs must be contiguous from the stack top. ST0-STn.
+    if (STDefs && !isMask_32(STDefs)) {
+      MI->emitError("output regs must be last on the x87 stack");
+      STDefs = NextPowerOf2(STDefs) - 1;
+    }
+    unsigned NumSTDefs = CountTrailingOnes_32(STDefs);
+
+    // So must the clobbered stack slots. ST0-STm, m >= n.
+    if (STClobbers && !isMask_32(STDefs | STClobbers))
+      MI->emitError("clobbers must be last on the x87 stack");
+
+    // Popped inputs are the ones that are also clobbered or defined.
+    unsigned STPopped = STUses & (STDefs | STClobbers);
+    if (STPopped && !isMask_32(STPopped))
+      MI->emitError("implicitly popped regs must be last on the x87 stack");
+    unsigned NumSTPopped = CountTrailingOnes_32(STPopped);
+
+    DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+                 << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+    // Scan the instruction for FP uses corresponding to "f" constraints.
+    // Collect FP registers to kill afer the instruction.
+    // Always kill all the scratch regs.
+    unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+    unsigned FPUsed = 0;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      if (!Op.isUse())
+        MI->emitError("illegal \"f\" output constraint");
+      unsigned FPReg = getFPReg(Op);
+      FPUsed |= 1U << FPReg;
+
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm.  We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isKill())
+        FPKills |= 1U << FPReg;
+    }
+
+    // The popped inputs will be killed by the instruction, so duplicate them
+    // if the FP register needs to be live after the instruction, or if it is
+    // used in the instruction itself. We effectively treat the popped inputs
+    // as early clobbers.
+    for (unsigned i = 0; i < NumSTPopped; ++i) {
+      if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
+        continue;
+      unsigned SR = getScratchReg();
+      duplicateToTop(PendingST[i], SR, I);
+      DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+                   << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
+      PendingST[i] = SR;
+    }
+
+    // Make sure we have a unique live register for every fixed use. Some of
+    // them could be undef uses, and we need to emit LD_F0 instructions.
+    for (unsigned i = 0; i < NumSTUses; ++i) {
+      if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
+        // Check for shared assignments.
+        for (unsigned j = 0; j < i; ++j) {
+          if (PendingST[j] != PendingST[i])
+            continue;
+          // STi and STj are inn the same register, create a copy.
+          unsigned SR = getScratchReg();
+          duplicateToTop(PendingST[i], SR, I);
+          DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+                       << unsigned(PendingST[i])
+                       << " to avoid collision with ST" << j << '\n');
+          PendingST[i] = SR;
+        }
+        continue;
+      }
+      unsigned SR = getScratchReg();
+      DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
+      BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+      pushReg(SR);
+      PendingST[i] = SR;
+      if (NumPendingSTs == i)
+        ++NumPendingSTs;
+    }
+    assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+
+    // Now we can rearrange the live registers to match what was requested.
+    shuffleStackTop(PendingST, NumPendingSTs, I);
+    DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+    // With the stack layout fixed, rewrite the FP registers.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      unsigned FPReg = getFPReg(Op);
+      Op.setReg(getSTReg(FPReg));
+    }
+
+    // Simulate the inline asm popping its inputs and pushing its outputs.
+    StackTop -= NumSTPopped;
+
+    // Hold the fixed output registers in scratch FP registers. They will be
+    // transferred to real FP registers by copies.
+    NumPendingSTs = 0;
+    for (unsigned i = 0; i < NumSTDefs; ++i) {
+      unsigned SR = getScratchReg();
+      pushReg(SR);
+      FPKills &= ~(1u << SR);
+    }
+    for (unsigned i = 0; i < NumSTDefs; ++i)
+      PendingST[NumPendingSTs++] = getStackEntry(i);
+    DEBUG({dbgs() << "After asm: "; dumpStack();});
+
+    // If any of the ST defs were dead, pop them immediately. Our caller only
+    // handles dead FP defs.
+    MachineBasicBlock::iterator InsertPt = MI;
+    for (unsigned i = 0; STDefs & (1u << i); ++i) {
+      if (!(STDeadDefs & (1u << i)))
+        continue;
+      freeStackSlotAfter(InsertPt, PendingST[i]);
+      PendingST[i] = NumFPRegs;
+    }
+    while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
+      --NumPendingSTs;
+
+    // If this asm kills any FP registers (is the last use of them) we must
+    // explicitly emit pop instructions for them.  Do this now after the asm has
+    // executed so that the ST(x) numbers are not off (which would happen if we
+    // did this inline with operand rewriting).
+    //
+    // Note: this might be a non-optimal pop sequence.  We might be able to do
+    // better by trying to pop in stack order or something.
+    while (FPKills) {
+      unsigned FPReg = countTrailingZeros(FPKills);
+      if (isLive(FPReg))
+        freeStackSlotAfter(InsertPt, FPReg);
+      FPKills &= ~(1U << FPReg);
+    }
+    // Don't delete the inline asm!
+    return;
+  }
+
+  case X86::WIN_FTOL_32:
+  case X86::WIN_FTOL_64: {
+    // Push the operand into ST0.
+    MachineOperand &Op = MI->getOperand(0);
+    assert(Op.isUse() && Op.isReg() &&
+      Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6);
+    unsigned FPReg = getFPReg(Op);
+    if (Op.isKill())
+      moveToTop(FPReg, I);
+    else
+      duplicateToTop(FPReg, FPReg, I);
+
+    // Emit the call. This will pop the operand.
+    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("_ftol2")
+      .addReg(X86::ST0, RegState::ImplicitKill)
+      .addReg(X86::ECX, RegState::ImplicitDefine)
+      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+      .addReg(X86::EDX, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+    --StackTop;
+
+    break;
+  }
+
+  case X86::RETQ:
+  case X86::RETL:
+  case X86::RETIL:
+  case X86::RETIQ:
+    // If RET has an FP register use operand, pass the first one in ST(0) and
+    // the second one in ST(1).
+
+    // Find the register operands.
+    unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+    unsigned LiveMask = 0;
+
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      // FP Register uses must be kills unless there are two uses of the same
+      // register, in which case only one will be a kill.
+      assert(Op.isUse() &&
+             (Op.isKill() ||                        // Marked kill.
+              getFPReg(Op) == FirstFPRegOp ||       // Second instance.
+              MI->killsRegister(Op.getReg())) &&    // Later use is marked kill.
+             "Ret only defs operands, and values aren't live beyond it");
+
+      if (FirstFPRegOp == ~0U)
+        FirstFPRegOp = getFPReg(Op);
+      else {
+        assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+        SecondFPRegOp = getFPReg(Op);
+      }
+      LiveMask |= (1 << getFPReg(Op));
+
+      // Remove the operand so that later passes don't see it.
+      MI->RemoveOperand(i);
+      --i, --e;
+    }
+
+    // We may have been carrying spurious live-ins, so make sure only the returned
+    // registers are left live.
+    adjustLiveRegs(LiveMask, MI);
+    if (!LiveMask) return;  // Quick check to see if any are possible.
+
+    // There are only four possibilities here:
+    // 1) we are returning a single FP value.  In this case, it has to be in
+    //    ST(0) already, so just declare success by removing the value from the
+    //    FP Stack.
+    if (SecondFPRegOp == ~0U) {
+      // Assert that the top of stack contains the right FP register.
+      assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+             "Top of stack not the right register for RET!");
+
+      // Ok, everything is good, mark the value as not being on the stack
+      // anymore so that our assertion about the stack being empty at end of
+      // block doesn't fire.
+      StackTop = 0;
+      return;
+    }
+
+    // Otherwise, we are returning two values:
+    // 2) If returning the same value for both, we only have one thing in the FP
+    //    stack.  Consider:  RET FP1, FP1
+    if (StackTop == 1) {
+      assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+             "Stack misconfiguration for RET!");
+
+      // Duplicate the TOS so that we return it twice.  Just pick some other FPx
+      // register to hold it.
+      unsigned NewReg = getScratchReg();
+      duplicateToTop(FirstFPRegOp, NewReg, MI);
+      FirstFPRegOp = NewReg;
+    }
+
+    /// Okay we know we have two different FPx operands now:
+    assert(StackTop == 2 && "Must have two values live!");
+
+    /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+    ///    in ST(1).  In this case, emit an fxch.
+    if (getStackEntry(0) == SecondFPRegOp) {
+      assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+      moveToTop(FirstFPRegOp, MI);
+    }
+
+    /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+    /// ST(1).  Just remove both from our understanding of the stack and return.
+    assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+    assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+    StackTop = 0;
+    return;
+  }
+
+  I = MBB->erase(I);  // Remove the pseudo instruction
+
+  // We want to leave I pointing to the previous instruction, but what if we
+  // just erased the first instruction?
+  if (I == MBB->begin()) {
+    DEBUG(dbgs() << "Inserting dummy KILL\n");
+    I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL));
+  } else
+    --I;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 0000000..8c029a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,1739 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// FIXME: completely move here.
+extern cl::opt<bool> ForceStackAlign;
+
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const MachineModuleInfo &MMI = MF.getMMI();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          RegInfo->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
+          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          MMI.callsUnwindInit() || MMI.callsEHReturn());
+}
+
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::SUB64ri8;
+    return X86::SUB64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::SUB32ri8;
+    return X86::SUB32ri;
+  }
+}
+
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::ADD64ri8;
+    return X86::ADD64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::ADD32ri8;
+    return X86::ADD32ri;
+  }
+}
+
+static unsigned getLEArOpcode(unsigned IsLP64) {
+  return IsLP64 ? X86::LEA64r : X86::LEA32r;
+}
+
+/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worry about clobbering it.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator &MBBI,
+                                       const TargetRegisterInfo &TRI,
+                                       bool Is64Bit) {
+  const MachineFunction *MF = MBB.getParent();
+  const Function *F = MF->getFunction();
+  if (!F || MF->getMMI().callsEHReturn())
+    return 0;
+
+  static const uint16_t CallerSavedRegs32Bit[] = {
+    X86::EAX, X86::EDX, X86::ECX, 0
+  };
+
+  static const uint16_t CallerSavedRegs64Bit[] = {
+    X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
+    X86::R8,  X86::R9,  X86::R10, X86::R11, 0
+  };
+
+  unsigned Opc = MBBI->getOpcode();
+  switch (Opc) {
+  default: return 0;
+  case X86::RETL:
+  case X86::RETQ:
+  case X86::RETIL:
+  case X86::RETIQ:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    SmallSet<uint16_t, 8> Uses;
+    for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MBBI->getOperand(i);
+      if (!MO.isReg() || MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+        Uses.insert(*AI);
+    }
+
+    const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
+    for (; *CS; ++CS)
+      if (!Uses.count(*CS))
+        return *CS;
+  }
+  }
+
+  return 0;
+}
+
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  unsigned StackPtr, int64_t NumBytes,
+                  bool Is64Bit, bool IsLP64, bool UseLEA,
+                  const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  unsigned Opc;
+  if (UseLEA)
+    Opc = getLEArOpcode(IsLP64);
+  else
+    Opc = isSub
+      ? getSUBriOpcode(IsLP64, Offset)
+      : getADDriOpcode(IsLP64, Offset);
+
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      unsigned Reg = isSub
+        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        Opc = isSub
+          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64Bit ? X86::POP64r  : X86::POP32r);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
+        Offset -= ThisVal;
+        continue;
+      }
+    }
+
+    MachineInstr *MI = nullptr;
+
+    if (UseLEA) {
+      MI =  addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                          StackPtr, false, isSub ? -ThisVal : ThisVal);
+    } else {
+      MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+            .addReg(StackPtr)
+            .addImm(ThisVal);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+    }
+
+    if (isSub)
+      MI->setFlag(MachineInstr::FrameSetup);
+
+    Offset -= ThisVal;
+  }
+}
+
+/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                      unsigned StackPtr, uint64_t *NumBytes = nullptr) {
+  if (MBBI == MBB.begin()) return;
+
+  MachineBasicBlock::iterator PI = std::prev(MBBI);
+  unsigned Opc = PI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+      PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  }
+}
+
+/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower
+/// iterator.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator &MBBI,
+                        unsigned StackPtr, uint64_t *NumBytes = nullptr) {
+  // FIXME:  THIS ISN'T RUN!!!
+  return;
+
+  if (MBBI == MBB.end()) return;
+
+  MachineBasicBlock::iterator NI = std::next(MBBI);
+  if (NI == MBB.end()) return;
+
+  unsigned Opc = NI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      NI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= NI->getOperand(2).getImm();
+    MBB.erase(NI);
+    MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             NI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += NI->getOperand(2).getImm();
+    MBB.erase(NI);
+    MBBI = NI;
+  }
+}
+
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and
+/// the stack adjustment is returned as a positive value for ADD/LEA and a
+/// negative for SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI, unsigned StackPtr,
+                          bool doMergeWithPrevious) {
+  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+      (!doMergeWithPrevious && MBBI == MBB.end()))
+    return 0;
+
+  MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
+                                                       : std::next(MBBI);
+  unsigned Opc = PI->getOpcode();
+  int Offset = 0;
+
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+      PI->getOperand(0).getReg() == StackPtr){
+    Offset += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    Offset -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  }
+
+  return Offset;
+}
+
+static bool isEAXLiveIn(MachineFunction &MF) {
+  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+    unsigned Reg = II->first;
+
+    if (Reg == X86::EAX || Reg == X86::AX ||
+        Reg == X86::AH || Reg == X86::AL)
+      return true;
+  }
+
+  return false;
+}
+
+void
+X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI,
+                                            DebugLoc DL) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  if (CSI.empty()) return;
+
+  // Calculate offsets.
+  for (std::vector<CalleeSavedInfo>::const_iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+    unsigned Reg = I->getReg();
+
+    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+    unsigned CFIIndex =
+        MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg,
+                                                        Offset));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+}
+
+/// usesTheStack - This function checks if any of the users of EFLAGS
+/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
+/// to use the stack, and if we don't adjust the stack we clobber the first
+/// frame index.
+/// See X86InstrInfo::copyPhysReg.
+static bool usesTheStack(const MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  for (MachineRegisterInfo::reg_instr_iterator
+       ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end();
+       ri != re; ++ri)
+    if (ri->isCopy())
+      return true;
+
+  return false;
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+
+/*
+  Here's a gist of what gets emitted:
+
+  ; Establish frame pointer, if needed
+  [if needs FP]
+      push  %rbp
+      .cfi_def_cfa_offset 16
+      .cfi_offset %rbp, -16
+      .seh_pushreg %rpb
+      mov  %rsp, %rbp
+      .cfi_def_cfa_register %rbp
+
+  ; Spill general-purpose registers
+  [for all callee-saved GPRs]
+      pushq %<reg>
+      [if not needs FP]
+         .cfi_def_cfa_offset (offset from RETADDR)
+      .seh_pushreg %<reg>
+
+  ; If the required stack alignment > default stack alignment
+  ; rsp needs to be re-aligned.  This creates a "re-alignment gap"
+  ; of unknown size in the stack frame.
+  [if stack needs re-alignment]
+      and  $MASK, %rsp
+
+  ; Allocate space for locals
+  [if target is Windows and allocated space > 4096 bytes]
+      ; Windows needs special care for allocations larger
+      ; than one page.
+      mov $NNN, %rax
+      call ___chkstk_ms/___chkstk
+      sub  %rax, %rsp
+  [else]
+      sub  $NNN, %rsp
+
+  [if needs FP]
+      .seh_stackalloc (size of XMM spill slots)
+      .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+  [else]
+      .seh_stackalloc NNN
+
+  ; Spill XMMs
+  ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
+  ; they may get spilled on any platform, if the current function
+  ; calls @llvm.eh.unwind.init
+  [if needs FP]
+      [for all callee-saved XMM registers]
+          movaps  %<xmm reg>, -MMM(%rbp)
+      [for all callee-saved XMM registers]
+          .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+              ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+  [else]
+      [for all callee-saved XMM registers]
+          movaps  %<xmm reg>, KKK(%rsp)
+      [for all callee-saved XMM registers]
+          .seh_savexmm %<xmm reg>, KKK
+
+  .seh_endprologue
+
+  [if needs base pointer]
+      mov  %rsp, %rbx
+
+  ; Emit CFI info
+  [if needs FP]
+      [for all callee-saved registers]
+          .cfi_offset %<reg>, (offset from %rbp)
+  [else]
+       .cfi_def_cfa_offset (offset from RETADDR)
+      [for all callee-saved registers]
+          .cfi_offset %<reg>, (offset from %rsp)
+
+  Notes:
+  - .seh directives are emitted only for Windows 64 ABI
+  - .cfi directives are emitted for all other ABIs
+  - for 32-bit code, substitute %e?? registers for %r??
+*/
+
+void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *Fn = MF.getFunction();
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
+  uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
+  bool HasFP = hasFP(MF);
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  bool Is64Bit = STI.is64Bit();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  bool IsWin64 = STI.isTargetWin64();
+  bool IsWinEH =
+      MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
+      ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64.
+  bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
+  bool NeedsDwarfCFI =
+      !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+  bool UseLEA = STI.useLeaForSP();
+  unsigned StackAlign = getStackAlignment();
+  unsigned SlotSize = RegInfo->getSlotSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned StackPtr = RegInfo->getStackRegister();
+  unsigned BasePtr = RegInfo->getBaseRegister();
+  DebugLoc DL;
+
+  // If we're forcing a stack realignment we can't rely on just the frame
+  // info, we need to know the ABI stack alignment as well in case we
+  // have a call out.  Otherwise just make sure we have some alignment - we'll
+  // go with the minimum SlotSize.
+  if (ForceStackAlign) {
+    if (MFI->hasCalls())
+      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+    else if (MaxAlign < SlotSize)
+      MaxAlign = SlotSize;
+  }
+
+  // Add RETADDR move area to callee saved frame size.
+  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  if (TailCallReturnAddrDelta < 0)
+    X86FI->setCalleeSavedFrameSize(
+      X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+  // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
+  // function, and use up to 128 bytes of stack space, don't have a frame
+  // pointer, calls, or dynamic alloca then we do not need to adjust the
+  // stack pointer (we fit in the Red Zone). We also check that we don't
+  // push and pop from the stack.
+  if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                   Attribute::NoRedZone) &&
+      !RegInfo->needsStackRealignment(MF) &&
+      !MFI->hasVarSizedObjects() &&                     // No dynamic alloca.
+      !MFI->adjustsStack() &&                           // No calls.
+      !IsWin64 &&                                       // Win64 has no Red Zone
+      !usesTheStack(MF) &&                              // Don't push and pop.
+      !MF.shouldSplitStack()) {                         // Regular stack
+    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+    if (HasFP) MinSize += SlotSize;
+    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+    MFI->setStackSize(StackSize);
+  }
+
+  // Insert stack pointer adjustment for later moving of return addr.  Only
+  // applies to tail call optimized functions where the callee argument stack
+  // size is bigger than the callers.
+  if (TailCallReturnAddrDelta < 0) {
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL,
+              TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)),
+              StackPtr)
+        .addReg(StackPtr)
+        .addImm(-TailCallReturnAddrDelta)
+        .setMIFlag(MachineInstr::FrameSetup);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+  }
+
+  // Mapping for machine moves:
+  //
+  //   DST: VirtualFP AND
+  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
+  //        ELSE                        => DW_CFA_def_cfa
+  //
+  //   SRC: VirtualFP AND
+  //        DST: Register               => DW_CFA_def_cfa_register
+  //
+  //   ELSE
+  //        OFFSET < 0                  => DW_CFA_offset_extended_sf
+  //        REG < 64                    => DW_CFA_offset + Reg
+  //        ELSE                        => DW_CFA_offset_extended
+
+  uint64_t NumBytes = 0;
+  int stackGrowth = -SlotSize;
+
+  if (HasFP) {
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    if (RegInfo->needsStackRealignment(MF)) {
+      // Callee-saved registers are pushed on stack before the stack
+      // is realigned.
+      FrameSize -= X86FI->getCalleeSavedFrameSize();
+      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+    } else {
+      NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+    }
+
+    // Get the offset of the stack slot for the EBP register, which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI->setOffsetAdjustment(-NumBytes);
+
+    // Save EBP/RBP into the appropriate stack slot.
+    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(FramePtr, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    if (NeedsDwarfCFI) {
+      // Mark the place where EBP/RBP was saved.
+      // Define the current CFA rule to use the provided offset.
+      assert(StackSize);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Change the rule for the FramePtr to be an "offset" rule.
+      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr,
+                                         DwarfFramePtr, 2 * stackGrowth));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    if (NeedsWinEH) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+          .addImm(FramePtr)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    // Update EBP with the new base value.
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+        .addReg(StackPtr)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    if (NeedsDwarfCFI) {
+      // Mark effective beginning of when frame pointer becomes valid.
+      // Define the current CFA to use the EBP/RBP register.
+      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    // Mark the FramePtr as live-in in every block.
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      I->addLiveIn(FramePtr);
+  } else {
+    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+  }
+
+  // Skip the callee-saved push instructions.
+  bool PushedRegs = false;
+  int StackOffset = 2 * stackGrowth;
+
+  while (MBBI != MBB.end() &&
+         (MBBI->getOpcode() == X86::PUSH32r ||
+          MBBI->getOpcode() == X86::PUSH64r)) {
+    PushedRegs = true;
+    unsigned Reg = MBBI->getOperand(0).getReg();
+    ++MBBI;
+
+    if (!HasFP && NeedsDwarfCFI) {
+      // Mark callee-saved push instruction.
+      // Define the current CFA rule to use the provided offset.
+      assert(StackSize);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+      StackOffset += stackGrowth;
+    }
+
+    if (NeedsWinEH) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+          MachineInstr::FrameSetup);
+    }
+  }
+
+  // Realign stack after we pushed callee-saved registers (so that we'll be
+  // able to calculate their offsets from the frame pointer).
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL,
+              TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+      .addReg(StackPtr)
+      .addImm(-MaxAlign)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    // The EFLAGS implicit def is dead.
+    MI->getOperand(3).setIsDead();
+  }
+
+  // If there is an SUB32ri of ESP immediately before this instruction, merge
+  // the two. This can be the case when tail call elimination is enabled and
+  // the callee has more arguments then the caller.
+  NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+
+  // If there is an ADD32ri or SUB32ri of ESP immediately after this
+  // instruction, merge the two instructions.
+  mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+  // Adjust stack pointer: ESP -= numbytes.
+
+  // Windows and cygwin/mingw require a prologue helper routine when allocating
+  // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
+  // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
+  // stack and adjust the stack pointer in one go.  The 64-bit version of
+  // __chkstk is only responsible for probing the stack.  The 64-bit prologue is
+  // responsible for adjusting the stack pointer.  Touching the stack at 4K
+  // increments is necessary to ensure that the guard pages used by the OS
+  // virtual memory manager are allocated in correct sequence.
+  if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) {
+    const char *StackProbeSymbol;
+
+    if (Is64Bit) {
+      if (STI.isTargetCygMing()) {
+        StackProbeSymbol = "___chkstk_ms";
+      } else {
+        StackProbeSymbol = "__chkstk";
+      }
+    } else if (STI.isTargetCygMing())
+      StackProbeSymbol = "_alloca";
+    else
+      StackProbeSymbol = "_chkstk";
+
+    // Check whether EAX is livein for this function.
+    bool isEAXAlive = isEAXLiveIn(MF);
+
+    if (isEAXAlive) {
+      // Sanity check that EAX is not livein for this function.
+      // It should not be, so throw an assert.
+      assert(!Is64Bit && "EAX is livein in x64 case!");
+
+      // Save EAX
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+        .addReg(X86::EAX, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    if (Is64Bit) {
+      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+      // Function prologue is responsible for adjusting the stack pointer.
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+        .addImm(NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+      // We'll also use 4 already allocated bytes for EAX.
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
+      .addExternalSymbol(StackProbeSymbol)
+      .addReg(StackPtr,    RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    if (Is64Bit) {
+      // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+      // themself. It also does not clobber %rax so we can reuse it when
+      // adjusting %rsp.
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
+        .addReg(StackPtr)
+        .addReg(X86::RAX)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+    if (isEAXAlive) {
+        // Restore EAX
+        MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+                                                X86::EAX),
+                                        StackPtr, false, NumBytes - 4);
+        MI->setFlag(MachineInstr::FrameSetup);
+        MBB.insert(MBBI, MI);
+    }
+  } else if (NumBytes) {
+    emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
+                 UseLEA, TII, *RegInfo);
+  }
+
+  int SEHFrameOffset = 0;
+  if (NeedsWinEH) {
+    if (HasFP) {
+      // We need to set frame base offset low enough such that all saved
+      // register offsets would be positive relative to it, but we can't
+      // just use NumBytes, because .seh_setframe offset must be <=240.
+      // So we pretend to have only allocated enough space to spill the
+      // non-volatile registers.
+      // We don't care about the rest of stack allocation, because unwinder
+      // will restore SP to (BP - SEHFrameOffset)
+      for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+        int offset = MFI->getObjectOffset(Info.getFrameIdx());
+        SEHFrameOffset = std::max(SEHFrameOffset, abs(offset));
+      }
+      SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant
+
+      // This only needs to account for XMM spill slots, GPR slots
+      // are covered by the .seh_pushreg's emitted above.
+      unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
+      if (Size) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+            .addImm(Size)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+          .addImm(FramePtr)
+          .addImm(SEHFrameOffset)
+          .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      // SP will be the base register for restoring XMMs
+      if (NumBytes) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+    }
+  }
+
+  // Skip the rest of register spilling code
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+    ++MBBI;
+
+  // Emit SEH info for non-GPRs
+  if (NeedsWinEH) {
+    for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+      unsigned Reg = Info.getReg();
+      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+        continue;
+      assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
+
+      int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
+      Offset += SEHFrameOffset;
+
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+          .addImm(Reg)
+          .addImm(Offset)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF)) {
+    // Update the base pointer with the current stack pointer.
+    unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+      .addReg(StackPtr)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
+    // Mark end of stack pointer adjustment.
+    if (!HasFP && NumBytes) {
+      // Define the current CFA rule to use the provided offset.
+      assert(StackSize);
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr,
+                                               -StackSize + stackGrowth));
+
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    // Emit DWARF info specifying the offsets of the callee-saved registers.
+    if (PushedRegs)
+      emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+  }
+}
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI != MBB.end() && "Returning block has no instructions");
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  bool Is64Bit = STI.is64Bit();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  bool UseLEA = STI.useLeaForSP();
+  unsigned StackAlign = getStackAlignment();
+  unsigned SlotSize = RegInfo->getSlotSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned StackPtr = RegInfo->getStackRegister();
+
+  switch (RetOpcode) {
+  default:
+    llvm_unreachable("Can only insert epilog into returning blocks");
+  case X86::RETQ:
+  case X86::RETL:
+  case X86::RETIL:
+  case X86::RETIQ:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64:
+    break;  // These are ok
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI->getStackSize();
+  uint64_t MaxAlign  = MFI->getMaxAlignment();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  // If we're forcing a stack realignment we can't rely on just the frame
+  // info, we need to know the ABI stack alignment as well in case we
+  // have a call out.  Otherwise just make sure we have some alignment - we'll
+  // go with the minimum.
+  if (ForceStackAlign) {
+    if (MFI->hasCalls())
+      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+    else
+      MaxAlign = MaxAlign ? MaxAlign : 4;
+  }
+
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    if (RegInfo->needsStackRealignment(MF)) {
+      // Callee-saved registers were pushed on stack before the stack
+      // was realigned.
+      FrameSize -= CSSize;
+      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+    } else {
+      NumBytes = FrameSize - CSSize;
+    }
+
+    // Pop EBP.
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+  } else {
+    NumBytes = StackSize - CSSize;
+  }
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = std::prev(MBBI);
+    unsigned Opc = PI->getOpcode();
+
+    if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
+        !PI->isTerminator())
+      break;
+
+    --MBBI;
+  }
+  MachineBasicBlock::iterator FirstCSPop = MBBI;
+
+  DL = MBBI->getDebugLoc();
+
+  // If there is an ADD32ri or SUB32ri of ESP immediately before this
+  // instruction, merge the two instructions.
+  if (NumBytes || MFI->hasVarSizedObjects())
+    mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  // If dynamic alloca is used, then reset esp to point to the last callee-saved
+  // slot before popping them off! Same applies for the case, when stack was
+  // realigned.
+  if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+    if (RegInfo->needsStackRealignment(MF))
+      MBBI = FirstCSPop;
+    if (CSSize != 0) {
+      unsigned Opc = getLEArOpcode(IsLP64);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                   FramePtr, false, -CSSize);
+    } else {
+      unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+        .addReg(FramePtr);
+    }
+  } else if (NumBytes) {
+    // Adjust stack pointer back: ESP += numbytes.
+    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA,
+                 TII, *RegInfo);
+  }
+
+  // We're returning from function via eh_return.
+  if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &DestAddr  = MBBI->getOperand(0);
+    assert(DestAddr.isReg() && "Offset should be in register!");
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+            StackPtr).addReg(DestAddr.getReg());
+  } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+             RetOpcode == X86::TCRETURNmi ||
+             RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
+             RetOpcode == X86::TCRETURNmi64) {
+    bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
+    // Tail call return: adjust the stack pointer and jump to callee.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+    assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+    // Adjust stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+    int Offset = 0;
+    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+    // Incoporate the retaddr area.
+    Offset = StackAdj-MaxTCDelta;
+    assert(Offset >= 0 && "Offset should never be negative");
+
+    if (Offset) {
+      // Check for possible merge with preceding ADD instruction.
+      Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64,
+                   UseLEA, TII, *RegInfo);
+    }
+
+    // Jump to label or value in register.
+    if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
+                                       ? X86::TAILJMPd : X86::TAILJMPd64));
+      if (JumpTarget.isGlobal())
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      else {
+        assert(JumpTarget.isSymbol());
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+    } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
+                                       ? X86::TAILJMPm : X86::TAILJMPm64));
+      for (unsigned i = 0; i != 5; ++i)
+        MIB.addOperand(MBBI->getOperand(i));
+    } else if (RetOpcode == X86::TCRETURNri64) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    } else {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    MachineInstr *NewMI = std::prev(MBBI);
+    NewMI->copyImplicitOps(MF, MBBI);
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+  } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL ||
+              RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) &&
+             (X86FI->getTCReturnAddrDelta() < 0)) {
+    // Add the return addr area delta back since we are not tail calling.
+    int delta = -1*X86FI->getTCReturnAddrDelta();
+    MBBI = MBB.getLastNonDebugInstr();
+
+    // Check for possible merge with preceding ADD instruction.
+    delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII,
+                 *RegInfo);
+  }
+}
+
+int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                          int FI) const {
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (RegInfo->hasBasePointer(MF)) {
+    assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + RegInfo->getSlotSize();
+    } else {
+      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+  } else if (RegInfo->needsStackRealignment(MF)) {
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + RegInfo->getSlotSize();
+    } else {
+      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+    // FIXME: Support tail calls
+  } else {
+    if (!hasFP(MF))
+      return Offset + StackSize;
+
+    // Skip the saved EBP.
+    Offset += RegInfo->getSlotSize();
+
+    // Skip the RETADDR move area
+    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+    if (TailCallReturnAddrDelta < 0)
+      Offset -= TailCallReturnAddrDelta;
+  }
+
+  return Offset;
+}
+
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                             unsigned &FrameReg) const {
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  // We can't calculate offset from frame pointer if the stack is realigned,
+  // so enforce usage of stack/base pointer.  The base pointer is used when we
+  // have dynamic allocas in addition to dynamic realignment.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else if (RegInfo->needsStackRealignment(MF))
+    FrameReg = RegInfo->getStackRegister();
+  else
+    FrameReg = RegInfo->getFrameRegister(MF);
+  return getFrameIndexOffset(MF, FI);
+}
+
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  unsigned SlotSize = RegInfo->getSlotSize();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+  unsigned CalleeSavedFrameSize = 0;
+  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
+
+  if (hasFP(MF)) {
+    // emitPrologue always spills frame register the first thing.
+    SpillSlotOffset -= SlotSize;
+    MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+    // Since emitPrologue and emitEpilogue will handle spilling and restoring of
+    // the frame register, we can delete it from CSI list and not have to worry
+    // about avoiding it later.
+    unsigned FPReg = RegInfo->getFrameRegister(MF);
+    for (unsigned i = 0; i < CSI.size(); ++i) {
+      if (CSI[i].getReg() == FPReg) {
+        CSI.erase(CSI.begin() + i);
+        break;
+      }
+    }
+  }
+
+  // Assign slots for GPRs. It increases frame size.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    SpillSlotOffset -= SlotSize;
+    CalleeSavedFrameSize += SlotSize;
+
+    int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+    CSI[i - 1].setFrameIdx(SlotIndex);
+  }
+
+  X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
+
+  // Assign slots for XMMs.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+      continue;
+
+    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+    // ensure alignment
+    SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment();
+    // spill into slot
+    SpillSlotOffset -= RC->getSize();
+    int SlotIndex =
+        MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+    CSI[i - 1].setFrameIdx(SlotIndex);
+    MFI->ensureMaxAlignment(RC->getAlignment());
+  }
+
+  return true;
+}
+
+bool X86FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+
+  // Push GPRs. It increases frame size.
+  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+      continue;
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+
+    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Make XMM regs spilled. X86 does not have ability of push/pop XMM.
+  // It can be done by spilling XMMs to stack frame.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
+                            TRI);
+    --MI;
+    MI->setFlag(MachineInstr::FrameSetup);
+    ++MI;
+  }
+
+  return true;
+}
+
+bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+
+  // Reload XMMs from stack frame.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+  }
+
+  // POP GPRs.
+  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (!X86::GR64RegClass.contains(Reg) &&
+        !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
+  }
+  return true;
+}
+
+void
+X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  unsigned SlotSize = RegInfo->getSlotSize();
+
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+  if (TailCallReturnAddrDelta < 0) {
+    // create RETURNADDR area
+    //   arg
+    //   arg
+    //   RETADDR
+    //   { ...
+    //     RETADDR area
+    //     ...
+    //   }
+    //   [EBP]
+    MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+                           TailCallReturnAddrDelta - SlotSize, true);
+  }
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
+}
+
+static bool
+HasNestArgument(const MachineFunction *MF) {
+  const Function *F = MF->getFunction();
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; I++) {
+    if (I->hasNestAttr())
+      return true;
+  }
+  return false;
+}
+
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+  // Erlang stuff.
+  if (CallingConvention == CallingConv::HiPE) {
+    if (Is64Bit)
+      return Primary ? X86::R14 : X86::R13;
+    else
+      return Primary ? X86::EBX : X86::EDI;
+  }
+
+  if (Is64Bit)
+    return Primary ? X86::R11 : X86::R12;
+
+  bool IsNested = HasNestArgument(&MF);
+
+  if (CallingConvention == CallingConv::X86_FastCall ||
+      CallingConvention == CallingConv::Fast) {
+    if (IsNested)
+      report_fatal_error("Segmented stacks does not support fastcall with "
+                         "nested function.");
+    return Primary ? X86::EAX : X86::ECX;
+  }
+  if (IsNested)
+    return Primary ? X86::EDX : X86::EAX;
+  return Primary ? X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void
+X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+  MachineBasicBlock &prologueMBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  uint64_t StackSize;
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  bool Is64Bit = STI.is64Bit();
+  unsigned TlsReg, TlsOffset;
+  DebugLoc DL;
+
+  unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+         "Scratch register is live-in");
+
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
+      !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD())
+    report_fatal_error("Segmented stacks not supported on this platform.");
+
+  // Eventually StackSize will be calculated by a link-time pass; which will
+  // also decide whether checking code needs to be injected into this particular
+  // prologue.
+  StackSize = MFI->getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero
+  if (StackSize == 0)
+    return;
+
+  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  bool IsNested = false;
+
+  // We need to know if the function has a nest argument only in 64 bit mode.
+  if (Is64Bit)
+    IsNested = HasNestArgument(&MF);
+
+  // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+  // allocMBB needs to be last (terminating) instruction.
+
+  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+         e = prologueMBB.livein_end(); i != e; i++) {
+    allocMBB->addLiveIn(*i);
+    checkMBB->addLiveIn(*i);
+  }
+
+  if (IsNested)
+    allocMBB->addLiveIn(X86::R10);
+
+  MF.push_front(allocMBB);
+  MF.push_front(checkMBB);
+
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+  // Read the limit off the current stacklet off the stack_guard location.
+  if (Is64Bit) {
+    if (STI.isTargetLinux()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x70;
+    } else if (STI.isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+    } else if (STI.isTargetWin64()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x28; // pvArbitrary, reserved for application use
+    } else if (STI.isTargetFreeBSD()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x18;
+    } else {
+      report_fatal_error("Segmented stacks not supported on this platform.");
+    }
+
+    if (CompareStackPointer)
+      ScratchReg = X86::RSP;
+    else
+      BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
+        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+    BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
+      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+  } else {
+    if (STI.isTargetLinux()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x30;
+    } else if (STI.isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x48 + 90*4;
+    } else if (STI.isTargetWin32()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x14; // pvArbitrary, reserved for application use
+    } else if (STI.isTargetFreeBSD()) {
+      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+    } else {
+      report_fatal_error("Segmented stacks not supported on this platform.");
+    }
+
+    if (CompareStackPointer)
+      ScratchReg = X86::ESP;
+    else
+      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64()) {
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+    } else if (STI.isTargetDarwin()) {
+
+      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
+      unsigned ScratchReg2;
+      bool SaveScratch2;
+      if (CompareStackPointer) {
+        // The primary scratch register is available for holding the TLS offset.
+        ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
+        SaveScratch2 = false;
+      } else {
+        // Need to use a second register to hold the TLS offset
+        ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
+
+        // Unfortunately, with fastcc the second scratch register may hold an
+        // argument.
+        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+      }
+
+      // If Scratch2 is live-in then it needs to be saved.
+      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+             "Scratch register is live-in and not saved");
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+          .addReg(ScratchReg2, RegState::Kill);
+
+      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+        .addImm(TlsOffset);
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+        .addReg(ScratchReg)
+        .addReg(ScratchReg2).addImm(1).addReg(0)
+        .addImm(0)
+        .addReg(TlsReg);
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+    }
+  }
+
+  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+  // It jumps to normal execution of the function body.
+  BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB);
+
+  // On 32 bit we first push the arguments size and then the frame size. On 64
+  // bit, we pass the stack frame size in r10 and the argument size in r11.
+  if (Is64Bit) {
+    // Functions with nested arguments use R10, so it needs to be saved across
+    // the call to _morestack
+
+    if (IsNested)
+      BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10);
+
+    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10)
+      .addImm(StackSize);
+    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11)
+      .addImm(X86FI->getArgumentStackSize());
+    MF.getRegInfo().setPhysRegUsed(X86::R10);
+    MF.getRegInfo().setPhysRegUsed(X86::R11);
+  } else {
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(X86FI->getArgumentStackSize());
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(StackSize);
+  }
+
+  // __morestack is in libgcc
+  if (Is64Bit)
+    BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack");
+  else
+    BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack");
+
+  if (IsNested)
+    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+  else
+    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+  allocMBB->addSuccessor(&prologueMBB);
+
+  checkMBB->addSuccessor(allocMBB);
+  checkMBB->addSuccessor(&prologueMBB);
+
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+///       temp0 = sp - MaxStack
+///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+///       ...
+/// IncStack:
+///       call inc_stack   # doubles the stack space
+///       temp0 = sp - MaxStack
+///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const unsigned SlotSize =
+      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo())
+          ->getSlotSize();
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  const bool Is64Bit = STI.is64Bit();
+  DebugLoc DL;
+  // HiPE-specific values
+  const unsigned HipeLeafWords = 24;
+  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+  const unsigned Guaranteed = HipeLeafWords * SlotSize;
+  unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+                            MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+  unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+  assert(STI.isTargetLinux() &&
+         "HiPE prologue is only supported on Linux operating systems.");
+
+  // Compute the largest caller's frame that is needed to fit the callees'
+  // frames. This 'MaxStack' is computed from:
+  //
+  // a) the fixed frame size, which is the space needed for all spilled temps,
+  // b) outgoing on-stack parameter areas, and
+  // c) the minimum stack space this function needs to make available for the
+  //    functions it calls (a tunable ABI property).
+  if (MFI->hasCalls()) {
+    unsigned MoreStackForCalls = 0;
+
+    for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+         MBBI != MBBE; ++MBBI)
+      for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+           MI != ME; ++MI) {
+        if (!MI->isCall())
+          continue;
+
+        // Get callee operand.
+        const MachineOperand &MO = MI->getOperand(0);
+
+        // Only take account of global function calls (no closures etc.).
+        if (!MO.isGlobal())
+          continue;
+
+        const Function *F = dyn_cast<Function>(MO.getGlobal());
+        if (!F)
+          continue;
+
+        // Do not update 'MaxStack' for primitive and built-in functions
+        // (encoded with names either starting with "erlang."/"bif_" or not
+        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+        // "_", such as the BIF "suspend_0") as they are executed on another
+        // stack.
+        if (F->getName().find("erlang.") != StringRef::npos ||
+            F->getName().find("bif_") != StringRef::npos ||
+            F->getName().find_first_of("._") == StringRef::npos)
+          continue;
+
+        unsigned CalleeStkArity =
+          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+        if (HipeLeafWords - 1 > CalleeStkArity)
+          MoreStackForCalls = std::max(MoreStackForCalls,
+                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+      }
+    MaxStack += MoreStackForCalls;
+  }
+
+  // If the stack frame needed is larger than the guaranteed then runtime checks
+  // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
+  if (MaxStack > Guaranteed) {
+    MachineBasicBlock &prologueMBB = MF.front();
+    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+    for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
+           E = prologueMBB.livein_end(); I != E; I++) {
+      stackCheckMBB->addLiveIn(*I);
+      incStackMBB->addLiveIn(*I);
+    }
+
+    MF.push_front(incStackMBB);
+    MF.push_front(stackCheckMBB);
+
+    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+    unsigned LEAop, CMPop, CALLop;
+    if (Is64Bit) {
+      SPReg = X86::RSP;
+      PReg  = X86::RBP;
+      LEAop = X86::LEA64r;
+      CMPop = X86::CMP64rm;
+      CALLop = X86::CALL64pcrel32;
+      SPLimitOffset = 0x90;
+    } else {
+      SPReg = X86::ESP;
+      PReg  = X86::EBP;
+      LEAop = X86::LEA32r;
+      CMPop = X86::CMP32rm;
+      CALLop = X86::CALLpcrel32;
+      SPLimitOffset = 0x4c;
+    }
+
+    ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+           "HiPE prologue scratch register is live-in");
+
+    // Create new MBB for StackCheck:
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    // SPLimitOffset is in a fixed heap location (pointed by BP).
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+
+    // Create new MBB for IncStack:
+    BuildMI(incStackMBB, DL, TII.get(CALLop)).
+      addExternalSymbol("inc_stack_0");
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+    stackCheckMBB->addSuccessor(&prologueMBB, 99);
+    stackCheckMBB->addSuccessor(incStackMBB, 1);
+    incStackMBB->addSuccessor(&prologueMBB, 99);
+    incStackMBB->addSuccessor(incStackMBB, 1);
+  }
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const X86RegisterInfo &RegInfo =
+      *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  unsigned StackPtr = RegInfo.getStackRegister();
+  bool reseveCallFrame = hasReservedCallFrame(MF);
+  int Opcode = I->getOpcode();
+  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  DebugLoc DL = I->getDebugLoc();
+  uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+  I = MBB.erase(I);
+
+  if (!reseveCallFrame) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackdown instruction into 'add ESP, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    if (Amount == 0)
+      return;
+
+    // We need to keep the stack aligned properly.  To do this, we round the
+    // amount of space needed for the outgoing arguments up to the next
+    // alignment boundary.
+    unsigned StackAlign =
+        MF.getTarget().getFrameLowering()->getStackAlignment();
+    Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+    MachineInstr *New = nullptr;
+    if (Opcode == TII.getCallFrameSetupOpcode()) {
+      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
+                    StackPtr)
+        .addReg(StackPtr)
+        .addImm(Amount);
+    } else {
+      assert(Opcode == TII.getCallFrameDestroyOpcode());
+
+      // Factor out the amount the callee already popped.
+      Amount -= CalleeAmt;
+
+      if (Amount) {
+        unsigned Opc = getADDriOpcode(IsLP64, Amount);
+        New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      }
+    }
+
+    if (New) {
+      // The EFLAGS implicit def is dead.
+      New->getOperand(3).setIsDead();
+
+      // Replace the pseudo instruction with a new instruction.
+      MBB.insert(I, New);
+    }
+
+    return;
+  }
+
+  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.  We do this until we have
+    // more advanced stack pointer tracking ability.
+    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+    MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+      .addReg(StackPtr).addImm(CalleeAmt);
+
+    // The EFLAGS implicit def is dead.
+    New->getOperand(3).setIsDead();
+
+    // We are not tracking the stack pointer adjustment by the callee, so make
+    // sure we restore the stack pointer immediately after the call, there may
+    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+    MachineBasicBlock::iterator B = MBB.begin();
+    while (I != B && !std::prev(I)->isCall())
+      --I;
+    MBB.insert(I, New);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 0000000..5ad3d4d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,74 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements X86-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_FRAMELOWERING_H
+#define X86_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class MCSymbol;
+class X86TargetMachine;
+
+class X86FrameLowering : public TargetFrameLowering {
+public:
+  explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
+    : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
+
+  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 DebugLoc DL) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void adjustForSegmentedStacks(MachineFunction &MF) const override;
+
+  void adjustForHiPEPrologue(MachineFunction &MF) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                 MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..ba2f5f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2788 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+//                      Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+  /// SDValue's instead of register numbers for the leaves of the matched
+  /// tree.
+  struct X86ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    // This is really a union, discriminated by BaseType!
+    SDValue Base_Reg;
+    int Base_FrameIndex;
+
+    unsigned Scale;
+    SDValue IndexReg;
+    int32_t Disp;
+    SDValue Segment;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+    unsigned char SymbolFlags;  // X86II::MO_*
+
+    X86ISelAddressMode()
+      : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+        Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+        JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {
+    }
+
+    bool hasSymbolicDisplacement() const {
+      return GV != nullptr || CP != nullptr || ES != nullptr ||
+             JT != -1 || BlockAddr != nullptr;
+    }
+
+    bool hasBaseOrIndexReg() const {
+      return BaseType == FrameIndexBase ||
+             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
+    }
+
+    /// isRIPRelative - Return true if this addressing mode is already RIP
+    /// relative.
+    bool isRIPRelative() const {
+      if (BaseType != RegBase) return false;
+      if (RegisterSDNode *RegNode =
+            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+        return RegNode->getReg() == X86::RIP;
+      return false;
+    }
+
+    void setBaseReg(SDValue Reg) {
+      BaseType = RegBase;
+      Base_Reg = Reg;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump() {
+      dbgs() << "X86ISelAddressMode " << this << '\n';
+      dbgs() << "Base_Reg ";
+      if (Base_Reg.getNode())
+        Base_Reg.getNode()->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+             << " Scale" << Scale << '\n'
+             << "IndexReg ";
+      if (IndexReg.getNode())
+        IndexReg.getNode()->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " Disp " << Disp << '\n'
+             << "GV ";
+      if (GV)
+        GV->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " CP ";
+      if (CP)
+        CP->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << '\n'
+             << "ES ";
+      if (ES)
+        dbgs() << ES;
+      else
+        dbgs() << "nul";
+      dbgs() << " JT" << JT << " Align" << Align << '\n';
+    }
+#endif
+  };
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// ISel - X86 specific code to select X86 machine instructions for
+  /// SelectionDAG operations.
+  ///
+  class X86DAGToDAGISel final : public SelectionDAGISel {
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+
+    /// OptForSize - If true, selector should try to optimize for code size
+    /// instead of performance.
+    bool OptForSize;
+
+  public:
+    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel),
+        Subtarget(&tm.getSubtarget<X86Subtarget>()),
+        OptForSize(false) {}
+
+    const char *getPassName() const override {
+      return "X86 DAG->DAG Instruction Selection";
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // Reset the subtarget each time through.
+      Subtarget = &TM.getSubtarget<X86Subtarget>();
+      SelectionDAGISel::runOnMachineFunction(MF);
+      return true;
+    }
+
+    void EmitFunctionEntryCode() override;
+
+    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+
+    void PreprocessISelDAG() override;
+
+    inline bool immSext8(SDNode *N) const {
+      return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
+    }
+
+    // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+    // sign extended field.
+    inline bool i64immSExt32(SDNode *N) const {
+      uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
+      return (int64_t)v == (int32_t)v;
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDNode *N) override;
+    SDNode *SelectGather(SDNode *N, unsigned Opc);
+    SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+    SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
+
+    bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+    bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+    bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
+    bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                 unsigned Depth);
+    bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+    bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                    SDValue &Scale, SDValue &Index, SDValue &Disp,
+                    SDValue &Segment);
+    bool SelectMOV64Imm32(SDValue N, SDValue &Imm);
+    bool SelectLEAAddr(SDValue N, SDValue &Base,
+                       SDValue &Scale, SDValue &Index, SDValue &Disp,
+                       SDValue &Segment);
+    bool SelectLEA64_32Addr(SDValue N, SDValue &Base,
+                            SDValue &Scale, SDValue &Index, SDValue &Disp,
+                            SDValue &Segment);
+    bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
+                           SDValue &Scale, SDValue &Index, SDValue &Disp,
+                           SDValue &Segment);
+    bool SelectScalarSSELoad(SDNode *Root, SDValue N,
+                             SDValue &Base, SDValue &Scale,
+                             SDValue &Index, SDValue &Disp,
+                             SDValue &Segment,
+                             SDValue &NodeWithChain);
+
+    bool TryFoldLoad(SDNode *P, SDValue N,
+                     SDValue &Base, SDValue &Scale,
+                     SDValue &Index, SDValue &Disp,
+                     SDValue &Segment);
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override;
+
+    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+    inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+                                   SDValue &Scale, SDValue &Index,
+                                   SDValue &Disp, SDValue &Segment) {
+      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+        CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
+                                    getTargetLowering()->getPointerTy()) :
+        AM.Base_Reg;
+      Scale = getI8Imm(AM.Scale);
+      Index = AM.IndexReg;
+      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // is 32-bit.
+      if (AM.GV)
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
+                                              MVT::i32, AM.Disp,
+                                              AM.SymbolFlags);
+      else if (AM.CP)
+        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+                                             AM.Align, AM.Disp, AM.SymbolFlags);
+      else if (AM.ES) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+      } else if (AM.JT != -1) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+      } else if (AM.BlockAddr)
+        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+                                             AM.SymbolFlags);
+      else
+        Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+      if (AM.Segment.getNode())
+        Segment = AM.Segment;
+      else
+        Segment = CurDAG->getRegister(0, MVT::i32);
+    }
+
+    /// getI8Imm - Return a target constant with the specified value, of type
+    /// i8.
+    inline SDValue getI8Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i8);
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getGlobalBaseReg - Return an SDNode that returns the value of
+    /// the global base register. Output instructions required to
+    /// initialize the global base register, if necessary.
+    ///
+    SDNode *getGlobalBaseReg();
+
+    /// getTargetMachine - Return a reference to the TargetMachine, casted
+    /// to the target-specific type.
+    const X86TargetMachine &getTargetMachine() const {
+      return static_cast<const X86TargetMachine &>(TM);
+    }
+
+    /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+    /// to the target-specific type.
+    const X86InstrInfo *getInstrInfo() const {
+      return getTargetMachine().getInstrInfo();
+    }
+  };
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  if (!N.hasOneUse())
+    return false;
+
+  if (N.getOpcode() != ISD::LOAD)
+    return true;
+
+  // If N is a load, do additional profitability checks.
+  if (U == Root) {
+    switch (U->getOpcode()) {
+    default: break;
+    case X86ISD::ADD:
+    case X86ISD::SUB:
+    case X86ISD::AND:
+    case X86ISD::XOR:
+    case X86ISD::OR:
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is a 8-bit immediate we should fold the immediate
+      // instead. This reduces code size.
+      // e.g.
+      // movl 4(%esp), %eax
+      // addl $4, %eax
+      // vs.
+      // movl $4, %eax
+      // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In case where the increment is 1, then
+      // the saving can be 4 bytes (by using incl %eax).
+      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+        if (Imm->getAPIntValue().isSignedIntN(8))
+          return false;
+
+      // If the other operand is a TLS address, we should fold it instead.
+      // This produces
+      // movl    %gs:0, %eax
+      // leal    i@NTPOFF(%eax), %eax
+      // instead of
+      // movl    $i@NTPOFF, %eax
+      // addl    %gs:0, %eax
+      // if the block also has an access to a second TLS address this will save
+      // a load.
+      // FIXME: This is probably also true for non-TLS addresses.
+      if (Op1.getOpcode() == X86ISD::Wrapper) {
+        SDValue Val = Op1.getOperand(0);
+        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+          return false;
+      }
+    }
+    }
+  }
+
+  return true;
+}
+
+/// MoveBelowCallOrigChain - Replace the original chain operand of the call with
+/// load's chain operand and move load below the call's chain operand.
+static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+                               SDValue Call, SDValue OrigChain) {
+  SmallVector<SDValue, 8> Ops;
+  SDValue Chain = OrigChain.getOperand(0);
+  if (Chain.getNode() == Load.getNode())
+    Ops.push_back(Load.getOperand(0));
+  else {
+    assert(Chain.getOpcode() == ISD::TokenFactor &&
+           "Unexpected chain operand");
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+      if (Chain.getOperand(i).getNode() == Load.getNode())
+        Ops.push_back(Load.getOperand(0));
+      else
+        Ops.push_back(Chain.getOperand(i));
+    SDValue NewChain =
+      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
+    Ops.clear();
+    Ops.push_back(NewChain);
+  }
+  for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
+    Ops.push_back(OrigChain.getOperand(i));
+  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
+  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+                             Load.getOperand(1), Load.getOperand(2));
+
+  unsigned NumOps = Call.getNode()->getNumOperands();
+  Ops.clear();
+  Ops.push_back(SDValue(Load.getNode(), 1));
+  for (unsigned i = 1, e = NumOps; i != e; ++i)
+    Ops.push_back(Call.getOperand(i));
+  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
+}
+
+/// isCalleeLoad - Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After MoveBelowOrigChain the load is moved between the call and
+  // the chain, this can create a cycle if the load is not folded. So it is
+  // *really* important that we are sure the load will be folded.
+  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+    return false;
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+  if (!LD ||
+      LD->isVolatile() ||
+      LD->getAddressingMode() != ISD::UNINDEXED ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  // Now let's find the callseq_start.
+  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+    if (!Chain.hasOneUse())
+      return false;
+    Chain = Chain.getOperand(0);
+  }
+
+  if (!Chain.getNumOperands())
+    return false;
+  // Since we are not checking for AA here, conservatively abort if the chain
+  // writes to memory. It's not safe to move the callee (a load) across a store.
+  if (isa<MemSDNode>(Chain.getNode()) &&
+      cast<MemSDNode>(Chain.getNode())->writeMem())
+    return false;
+  if (Chain.getOperand(0).getNode() == Callee.getNode())
+    return true;
+  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+      Callee.getValue(1).hasOneUse())
+    return true;
+  return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+  // OptForSize is used in pattern predicates that isel is matching.
+  OptForSize = MF->getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+
+    if (OptLevel != CodeGenOpt::None &&
+        // Only does this when target favors doesn't favor register indirect
+        // call.
+        ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+         (N->getOpcode() == X86ISD::TC_RETURN &&
+          // Only does this if load can be folded into TC_RETURN.
+          (Subtarget->is64Bit() ||
+           getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
+      /// Also try moving call address load from outside callseq_start to just
+      /// before the call to allow it to be folded.
+      ///
+      ///     [Load chain]
+      ///         ^
+      ///         |
+      ///       [Load]
+      ///       ^    ^
+      ///       |    |
+      ///      /      \--
+      ///     /          |
+      ///[CALLSEQ_START] |
+      ///     ^          |
+      ///     |          |
+      /// [LOAD/C2Reg]   |
+      ///     |          |
+      ///      \        /
+      ///       \      /
+      ///       [CALL]
+      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+      SDValue Chain = N->getOperand(0);
+      SDValue Load  = N->getOperand(1);
+      if (!isCalleeLoad(Load, Chain, HasCallSeq))
+        continue;
+      MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+      ++NumLoadMoved;
+      continue;
+    }
+
+    // Lower fpround and fpextend nodes that target the FP stack to be store and
+    // load to the stack.  This is a gross hack.  We would like to simply mark
+    // these as being illegal, but when we do that, legalize produces these when
+    // it expands calls, then expands these in the same legalize pass.  We would
+    // like dag combine to be able to hack on these between the call expansion
+    // and the node legalization.  As such this pass basically does "really
+    // late" legalization of these inline with the X86 isel pass.
+    // FIXME: This should only happen when not compiled with -O0.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+
+    MVT SrcVT = N->getOperand(0).getSimpleValueType();
+    MVT DstVT = N->getSimpleValueType(0);
+
+    // If any of the sources are vectors, no fp stack involved.
+    if (SrcVT.isVector() || DstVT.isVector())
+      continue;
+
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
+    const X86TargetLowering *X86Lowering =
+        static_cast<const X86TargetLowering *>(getTargetLowering());
+    bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    if (!SrcIsSSE && !DstIsSSE) {
+      // If this is an FPStack extension, it is a noop.
+      if (N->getOpcode() == ISD::FP_EXTEND)
+        continue;
+      // If this is a value-preserving FPStack truncation, it is a noop.
+      if (N->getConstantOperandVal(1))
+        continue;
+    }
+
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide what we want to do.
+    MVT MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+    SDLoc dl(N);
+
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+                                          N->getOperand(0),
+                                          MemTmp, MachinePointerInfo(), MemVT,
+                                          false, false, 0);
+    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+                                        MachinePointerInfo(),
+                                        MemVT, false, false, 0);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havok on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    CurDAG->DeleteNode(N);
+  }
+}
+
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+                                             MachineFrameInfo *MFI) {
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  if (Subtarget->isTargetCygMing()) {
+    unsigned CallOp =
+      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+    BuildMI(BB, DebugLoc(),
+            TII->get(CallOp)).addExternalSymbol("__main");
+  }
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+  // If this is main, emit special code for main.
+  if (const Function *Fn = MF->getFunction())
+    if (Fn->hasExternalLinkage() && Fn->getName() == "main")
+      EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo());
+}
+
+static bool isDispSafeForFrameIndex(int64_t Val) {
+  // On 64-bit platforms, we can run into an issue where a frame index
+  // includes a displacement that, when added to the explicit displacement,
+  // will overflow the displacement field. Assuming that the frame index
+  // displacement fits into a 31-bit integer  (which is only slightly more
+  // aggressive than the current fundamental assumption that it fits into
+  // a 32-bit integer), a 31-bit disp should always be safe.
+  return isInt<31>(Val);
+}
+
+bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
+                                            X86ISelAddressMode &AM) {
+  int64_t Val = AM.Disp + Offset;
+  CodeModel::Model M = TM.getCodeModel();
+  if (Subtarget->is64Bit()) {
+    if (!X86::isOffsetSuitableForCodeModel(Val, M,
+                                           AM.hasSymbolicDisplacement()))
+      return true;
+    // In addition to the checks required for a register base, check that
+    // we do not try to use an unsafe Disp with a frame index.
+    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+        !isDispSafeForFrameIndex(Val))
+      return true;
+  }
+  AM.Disp = Val;
+  return false;
+
+}
+
+bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+  SDValue Address = N->getOperand(1);
+
+  // load gs:0 -> GS segment register.
+  // load fs:0 -> FS segment register.
+  //
+  // This optimization is valid because the GNU TLS model defines that
+  // gs:0 (or fs:0 on X86-64) contains its own address.
+  // For more information see http://people.redhat.com/drepper/tls.pdf
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
+    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
+        Subtarget->isTargetLinux())
+      switch (N->getPointerInfo().getAddrSpace()) {
+      case 256:
+        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+        return false;
+      case 257:
+        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+        return false;
+      }
+
+  return true;
+}
+
+/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
+/// into an addressing mode.  These wrap things that will resolve down into a
+/// symbol reference.  If no match is possible, this returns true, otherwise it
+/// returns false.
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+  CodeModel::Model M = TM.getCodeModel();
+
+  // Handle X86-64 rip-relative addresses.  We check this before checking direct
+  // folding because RIP is preferable to non-RIP accesses.
+  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
+      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+      // they cannot be folded into immediate fields.
+      // FIXME: This can be improved for kernel and other models?
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    // Base and index reg must be 0 in order to use %rip as base.
+    if (AM.hasBaseOrIndexReg())
+      return true;
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.GV = G->getGlobal();
+      AM.SymbolFlags = G->getTargetFlags();
+      if (FoldOffsetIntoAddress(G->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.SymbolFlags = CP->getTargetFlags();
+      if (FoldOffsetIntoAddress(CP->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.SymbolFlags = BA->getTargetFlags();
+      if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
+
+    if (N.getOpcode() == X86ISD::WrapperRIP)
+      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+    return false;
+  }
+
+  // Handle the case when globals fit in our immediate field: This is true for
+  // X86-32 always and X86-64 when in -mcmodel=small mode.  In 64-bit
+  // mode, this only applies to a non-RIP-relative computation.
+  if (!Subtarget->is64Bit() ||
+      M == CodeModel::Small || M == CodeModel::Kernel) {
+    assert(N.getOpcode() != X86ISD::WrapperRIP &&
+           "RIP-relative addressing already handled");
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      AM.GV = G->getGlobal();
+      AM.Disp += G->getOffset();
+      AM.SymbolFlags = G->getTargetFlags();
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.Disp += CP->getOffset();
+      AM.SymbolFlags = CP->getTargetFlags();
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.Disp += BA->getOffset();
+      AM.SymbolFlags = BA->getTargetFlags();
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
+    return false;
+  }
+
+  return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done.  This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
+  if (MatchAddressRecursively(N, AM, 0))
+    return true;
+
+  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+  // a smaller encoding and avoids a scaled-index.
+  if (AM.Scale == 2 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base_Reg.getNode() == nullptr) {
+    AM.Base_Reg = AM.IndexReg;
+    AM.Scale = 1;
+  }
+
+  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+  // because it has a smaller encoding.
+  // TODO: Which other code models can use this?
+  if (TM.getCodeModel() == CodeModel::Small &&
+      Subtarget->is64Bit() &&
+      AM.Scale == 1 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base_Reg.getNode() == nullptr &&
+      AM.IndexReg.getNode() == nullptr &&
+      AM.SymbolFlags == X86II::MO_NO_FLAG &&
+      AM.hasSymbolicDisplacement())
+    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+  return false;
+}
+
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+  if (N.getNode()->getNodeId() == -1 ||
+      N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+    DAG.RepositionNode(Pos.getNode(), N.getNode());
+    N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+  }
+}
+
+// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This
+// allows us to convert the shift and and into an h-register extract and
+// a scaled index. Returns false if the simplification is performed.
+static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+                                      uint64_t Mask,
+                                      SDValue Shift, SDValue X,
+                                      X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      !Shift.hasOneUse())
+    return true;
+
+  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+  if (ScaleLog <= 0 || ScaleLog >= 4 ||
+      Mask != (0xffu << ScaleLog))
+    return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue Eight = DAG.getConstant(8, MVT::i8);
+  SDValue NewMask = DAG.getConstant(0xff, VT);
+  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+  SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, Eight);
+  InsertDAGNode(DAG, N, Srl);
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, And);
+  InsertDAGNode(DAG, N, ShlCount);
+  InsertDAGNode(DAG, N, Shl);
+  DAG.ReplaceAllUsesWith(N, Shl);
+  AM.IndexReg = And;
+  AM.Scale = (1 << ScaleLog);
+  return false;
+}
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeded.
+static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+                                        uint64_t Mask,
+                                        SDValue Shift, SDValue X,
+                                        X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SHL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  // Not likely to be profitable if either the AND or SHIFT node has more
+  // than one use (unless all uses are for address computation). Besides,
+  // isel mechanism requires their node ids to be reused.
+  if (!N.hasOneUse() || !Shift.hasOneUse())
+    return true;
+
+  // Verify that the shift amount is something we can fold.
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+    return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, NewAnd);
+  InsertDAGNode(DAG, N, NewShift);
+  DAG.ReplaceAllUsesWith(N, NewShift);
+
+  AM.Scale = 1 << ShiftAmt;
+  AM.IndexReg = NewAnd;
+  return false;
+}
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+//   int f(short *y, int *lookup_table) {
+//     ...
+//     return *y + lookup_table[*y >> 11];
+//   }
+//
+// Turning into:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $11, %ecx
+//   addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $9, %ecx
+//   andl $124, %rcx
+//   addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+                                    uint64_t Mask,
+                                    SDValue Shift, SDValue X,
+                                    X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  unsigned MaskLZ = countLeadingZeros(Mask);
+  unsigned MaskTZ = countTrailingZeros(Mask);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = MaskTZ;
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  // We also need to ensure that mask is a continuous run of bits.
+  if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+  // Scale the leading zero count down based on the actual size of the value.
+  // Also scale it down based on the size of the shift.
+  MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+
+  // The final check is to ensure that any masked out high bits of X are
+  // already known to be zero. Otherwise, the mask has a semantic impact
+  // other than masking out a couple of low bits. Unfortunately, because of
+  // the mask, zero extensions will be removed from operands in some cases.
+  // This code works extra hard to look through extensions because we can
+  // replace them with zero extensions cheaply if necessary.
+  bool ReplacingAnyExtend = false;
+  if (X.getOpcode() == ISD::ANY_EXTEND) {
+    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+                          X.getOperand(0).getSimpleValueType().getSizeInBits();
+    // Assume that we'll replace the any-extend with a zero-extend, and
+    // narrow the search to the extended value.
+    X = X.getOperand(0);
+    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+    ReplacingAnyExtend = true;
+  }
+  APInt MaskedHighBits =
+    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(X, KnownZero, KnownOne);
+  if (MaskedHighBits != KnownZero) return true;
+
+  // We've identified a pattern that can be transformed into a single shift
+  // and an addressing mode. Make it so.
+  MVT VT = N.getSimpleValueType();
+  if (ReplacingAnyExtend) {
+    assert(X.getValueType() != VT);
+    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
+    InsertDAGNode(DAG, N, NewX);
+    X = NewX;
+  }
+  SDLoc DL(N);
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewSRLAmt);
+  InsertDAGNode(DAG, N, NewSRL);
+  InsertDAGNode(DAG, N, NewSHLAmt);
+  InsertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewSRL;
+  return false;
+}
+
+bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                              unsigned Depth) {
+  SDLoc dl(N);
+  DEBUG({
+      dbgs() << "MatchAddress: ";
+      AM.dump();
+    });
+  // Limit recursion.
+  if (Depth > 5)
+    return MatchAddressBase(N, AM);
+
+  // If this is already a %rip relative address, we can only merge immediates
+  // into it.  Instead of handling this in every case, we handle it here.
+  // RIP relative addressing: %rip + 32-bit displacement!
+  if (AM.isRIPRelative()) {
+    // FIXME: JumpTable and ExternalSymbol address currently don't like
+    // displacements.  It isn't very important, but this should be fixed for
+    // consistency.
+    if (!AM.ES && AM.JT != -1) return true;
+
+    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+      if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM))
+        return false;
+    return true;
+  }
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    if (!FoldOffsetIntoAddress(Val, AM))
+      return false;
+    break;
+  }
+
+  case X86ISD::Wrapper:
+  case X86ISD::WrapperRIP:
+    if (!MatchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::LOAD:
+    if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base_Reg.getNode() == nullptr &&
+        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::SHL:
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+      break;
+
+    if (ConstantSDNode
+          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+      unsigned Val = CN->getZExtValue();
+      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+      // that the base operand remains free for further matching. If
+      // the base doesn't end up getting used, a post-processing step
+      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+      if (Val == 1 || Val == 2 || Val == 3) {
+        AM.Scale = 1 << Val;
+        SDValue ShVal = N.getNode()->getOperand(0);
+
+        // Okay, we know that we have a scale by now.  However, if the scaled
+        // value is an add of something and a constant, we can fold the
+        // constant into the disp field here.
+        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+          AM.IndexReg = ShVal.getNode()->getOperand(0);
+          ConstantSDNode *AddVal =
+            cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+          if (!FoldOffsetIntoAddress(Disp, AM))
+            return false;
+        }
+
+        AM.IndexReg = ShVal;
+        return false;
+      }
+    }
+    break;
+
+  case ISD::SRL: {
+    // Scale must not be used already.
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+    SDValue And = N.getOperand(0);
+    if (And.getOpcode() != ISD::AND) break;
+    SDValue X = And.getOperand(0);
+
+    // We only handle up to 64-bit values here as those are what matter for
+    // addressing mode optimizations.
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+    // The mask used for the transform is expected to be post-shift, but we
+    // found the shift first so just apply the shift to the mask before passing
+    // it down.
+    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+        !isa<ConstantSDNode>(And.getOperand(1)))
+      break;
+    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+    // Try to fold the mask and shift into the scale, and return false if we
+    // succeed.
+    if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+      return false;
+    break;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI:
+    // A mul_lohi where we need the low part can be folded as a plain multiply.
+    if (N.getResNo() != 0) break;
+    // FALL THROUGH
+  case ISD::MUL:
+  case X86ISD::MUL_IMM:
+    // X*[3,5,9] -> X+X*[2,4,8]
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base_Reg.getNode() == nullptr &&
+        AM.IndexReg.getNode() == nullptr) {
+      if (ConstantSDNode
+            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+            CN->getZExtValue() == 9) {
+          AM.Scale = unsigned(CN->getZExtValue())-1;
+
+          SDValue MulVal = N.getNode()->getOperand(0);
+          SDValue Reg;
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+            Reg = MulVal.getNode()->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+            if (FoldOffsetIntoAddress(Disp, AM))
+              Reg = N.getNode()->getOperand(0);
+          } else {
+            Reg = N.getNode()->getOperand(0);
+          }
+
+          AM.IndexReg = AM.Base_Reg = Reg;
+          return false;
+        }
+    }
+    break;
+
+  case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address and
+    // the index field with the index field unused, use -B as the index.
+    // This is a win if a has multiple parts that can be folded into
+    // the address. Also, this saves a mov if the base register has
+    // other uses, since it avoids a two-address sub instruction, however
+    // it costs an additional mov if the index register has other uses.
+
+    // Add an artificial use to this node so that we can keep track of
+    // it if it gets CSE'd with a different node.
+    HandleSDNode Handle(N);
+
+    // Test if the LHS of the sub can be folded.
+    X86ISelAddressMode Backup = AM;
+    if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+      AM = Backup;
+      break;
+    }
+    // Test if the index field is free for use.
+    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+      AM = Backup;
+      break;
+    }
+
+    int Cost = 0;
+    SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+    // If the RHS involves a register with multiple uses, this
+    // transformation incurs an extra mov, due to the neg instruction
+    // clobbering its operand.
+    if (!RHS.getNode()->hasOneUse() ||
+        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+      ++Cost;
+    // If the base is a register with multiple uses, this
+    // transformation may save a mov.
+    if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+         AM.Base_Reg.getNode() &&
+         !AM.Base_Reg.getNode()->hasOneUse()) ||
+        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+      --Cost;
+    // If the folded LHS was interesting, this transformation saves
+    // address arithmetic.
+    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+        ((AM.Disp != 0) && (Backup.Disp == 0)) +
+        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+      --Cost;
+    // If it doesn't look like it may be an overall win, don't do it.
+    if (Cost >= 0) {
+      AM = Backup;
+      break;
+    }
+
+    // Ok, the transformation is legal and appears profitable. Go for it.
+    SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+    AM.IndexReg = Neg;
+    AM.Scale = 1;
+
+    // Insert the new nodes into the topological ordering.
+    InsertDAGNode(*CurDAG, N, Zero);
+    InsertDAGNode(*CurDAG, N, Neg);
+    return false;
+  }
+
+  case ISD::ADD: {
+    // Add an artificial use to this node so that we can keep track of
+    // it if it gets CSE'd with a different node.
+    HandleSDNode Handle(N);
+
+    X86ISelAddressMode Backup = AM;
+    if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+        !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+      return false;
+    AM = Backup;
+
+    // Try again after commuting the operands.
+    if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&&
+        !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+      return false;
+    AM = Backup;
+
+    // If we couldn't fold both operands into the address at the same time,
+    // see if we can just put each operand into a register and fold at least
+    // the add.
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        !AM.Base_Reg.getNode() &&
+        !AM.IndexReg.getNode()) {
+      N = Handle.getValue();
+      AM.Base_Reg = N.getOperand(0);
+      AM.IndexReg = N.getOperand(1);
+      AM.Scale = 1;
+      return false;
+    }
+    N = Handle.getValue();
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (CurDAG->isBaseWithConstantOffset(N)) {
+      X86ISelAddressMode Backup = AM;
+      ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
+
+      // Start with the LHS as an addr mode.
+      if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+          !FoldOffsetIntoAddress(CN->getSExtValue(), AM))
+        return false;
+      AM = Backup;
+    }
+    break;
+
+  case ISD::AND: {
+    // Perform some heroic transforms on an and of a constant-count shift
+    // with a constant to enable use of the scaled offset field.
+
+    // Scale must not be used already.
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+    SDValue Shift = N.getOperand(0);
+    if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
+    SDValue X = Shift.getOperand(0);
+
+    // We only handle up to 64-bit values here as those are what matter for
+    // addressing mode optimizations.
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+    if (!isa<ConstantSDNode>(N.getOperand(1)))
+      break;
+    uint64_t Mask = N.getConstantOperandVal(1);
+
+    // Try to fold the mask and shift into an extract and scale.
+    if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+
+    // Try to fold the mask and shift directly into the scale.
+    if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+
+    // Try to swap the mask and shift to place shifts which can be done as
+    // a scale on the outside of the mask.
+    if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+    break;
+  }
+  }
+
+  return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+    // If so, check to see if the scale index register is set.
+    if (!AM.IndexReg.getNode()) {
+      AM.IndexReg = N;
+      AM.Scale = 1;
+      return false;
+    }
+
+    // Otherwise, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base_Reg = N;
+  return false;
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+///
+/// Parent is the parent node of the addr operand that is being matched.  It
+/// is always a load, store, atomic node, or null.  It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                 SDValue &Scale, SDValue &Index,
+                                 SDValue &Disp, SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  if (Parent &&
+      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+      // that are not a MemSDNode, and thus don't have proper addrspace info.
+      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+    unsigned AddrSpace =
+      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+    // AddrSpace 256 -> GS, 257 -> FS.
+    if (AddrSpace == 256)
+      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+    if (AddrSpace == 257)
+      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+  }
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  MVT VT = N.getSimpleValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base_Reg.getNode())
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+///   PatternChainNode: this is the matched node that has a chain input and
+///   output.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
+                                          SDValue N, SDValue &Base,
+                                          SDValue &Scale, SDValue &Index,
+                                          SDValue &Disp, SDValue &Segment,
+                                          SDValue &PatternNodeWithChain) {
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    PatternNodeWithChain = N.getOperand(0);
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        PatternNodeWithChain.hasOneUse() &&
+        IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+        IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+      if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+        return false;
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).getNode()->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+      N.getOperand(0).getOperand(0).hasOneUse() &&
+      IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+      IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+    // Okay, this is a zero extending load.  Fold it.
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+    if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+      return false;
+    PatternNodeWithChain = SDValue(LD, 0);
+    return true;
+  }
+  return false;
+}
+
+
+bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
+  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    uint64_t ImmVal = CN->getZExtValue();
+    if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+      return false;
+
+    Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64);
+    return true;
+  }
+
+  // In static codegen with small code model, we can get the address of a label
+  // into a register with 'movl'. TableGen has already made sure we're looking
+  // at a label of some kind.
+  assert(N->getOpcode() == X86ISD::Wrapper &&
+         "Unexpected node type for MOV32ri64");
+  N = N.getOperand(0);
+
+  if (N->getOpcode() != ISD::TargetConstantPool &&
+      N->getOpcode() != ISD::TargetJumpTable &&
+      N->getOpcode() != ISD::TargetGlobalAddress &&
+      N->getOpcode() != ISD::TargetExternalSymbol &&
+      N->getOpcode() != ISD::TargetBlockAddress)
+    return false;
+
+  Imm = N;
+  return TM.getCodeModel() == CodeModel::Small;
+}
+
+bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base,
+                                         SDValue &Scale, SDValue &Index,
+                                         SDValue &Disp, SDValue &Segment) {
+  if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+    return false;
+
+  SDLoc DL(N);
+  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+  if (RN && RN->getReg() == 0)
+    Base = CurDAG->getRegister(0, MVT::i64);
+  else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) {
+    // Base could already be %rip, particularly in the x32 ABI.
+    Base = SDValue(CurDAG->getMachineNode(
+                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                       CurDAG->getTargetConstant(0, MVT::i64),
+                       Base,
+                       CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                   0);
+  }
+
+  RN = dyn_cast<RegisterSDNode>(Index);
+  if (RN && RN->getReg() == 0)
+    Index = CurDAG->getRegister(0, MVT::i64);
+  else {
+    assert(Index.getValueType() == MVT::i32 &&
+           "Expect to be extending 32-bit registers for use in LEA");
+    Index = SDValue(CurDAG->getMachineNode(
+                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                        CurDAG->getTargetConstant(0, MVT::i64),
+                        Index,
+                        CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                    0);
+  }
+
+  return true;
+}
+
+/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
+                                    SDValue &Base, SDValue &Scale,
+                                    SDValue &Index, SDValue &Disp,
+                                    SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+  // segments.
+  SDValue Copy = AM.Segment;
+  SDValue T = CurDAG->getRegister(0, MVT::i32);
+  AM.Segment = T;
+  if (MatchAddress(N, AM))
+    return false;
+  assert (T == AM.Segment);
+  AM.Segment = Copy;
+
+  MVT VT = N.getSimpleValueType();
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase)
+    if (AM.Base_Reg.getNode())
+      Complexity = 1;
+    else
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.getNode())
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+  // a simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA. This is determined with some expermentation but is by no means
+  // optimal (especially for code size consideration). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.hasSymbolicDisplacement()) {
+    // For X86-64, we should always use lea to materialize RIP relative
+    // addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+    Complexity++;
+
+  // If it isn't worth using an LEA, reject it.
+  if (Complexity <= 2)
+    return false;
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
+                                        SDValue &Scale, SDValue &Index,
+                                        SDValue &Disp, SDValue &Segment) {
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+  X86ISelAddressMode AM;
+  AM.GV = GA->getGlobal();
+  AM.Disp += GA->getOffset();
+  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+  AM.SymbolFlags = GA->getTargetFlags();
+
+  if (N.getValueType() == MVT::i32) {
+    AM.Scale = 1;
+    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+  }
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+
+bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+                                  SDValue &Base, SDValue &Scale,
+                                  SDValue &Index, SDValue &Disp,
+                                  SDValue &Segment) {
+  if (!ISD::isNON_EXTLoad(N.getNode()) ||
+      !IsProfitableToFold(N, P, P) ||
+      !IsLegalToFold(N, P, P, OptLevel))
+    return false;
+
+  return SelectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg,
+                             getTargetLowering()->getPointerTy()).getNode();
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue In1 = Node->getOperand(1);
+  SDValue In2L = Node->getOperand(2);
+  SDValue In2H = Node->getOperand(3);
+
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return nullptr;
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
+                                           MVT::i32, MVT::i32, MVT::Other, Ops);
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+  return ResNode;
+}
+
+/// Atomic opcode table
+///
+enum AtomicOpc {
+  ADD,
+  SUB,
+  INC,
+  DEC,
+  OR,
+  AND,
+  XOR,
+  AtomicOpcEnd
+};
+
+enum AtomicSz {
+  ConstantI8,
+  I8,
+  SextConstantI16,
+  ConstantI16,
+  I16,
+  SextConstantI32,
+  ConstantI32,
+  I32,
+  SextConstantI64,
+  ConstantI64,
+  I64,
+  AtomicSzEnd
+};
+
+static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+  {
+    X86::LOCK_ADD8mi,
+    X86::LOCK_ADD8mr,
+    X86::LOCK_ADD16mi8,
+    X86::LOCK_ADD16mi,
+    X86::LOCK_ADD16mr,
+    X86::LOCK_ADD32mi8,
+    X86::LOCK_ADD32mi,
+    X86::LOCK_ADD32mr,
+    X86::LOCK_ADD64mi8,
+    X86::LOCK_ADD64mi32,
+    X86::LOCK_ADD64mr,
+  },
+  {
+    X86::LOCK_SUB8mi,
+    X86::LOCK_SUB8mr,
+    X86::LOCK_SUB16mi8,
+    X86::LOCK_SUB16mi,
+    X86::LOCK_SUB16mr,
+    X86::LOCK_SUB32mi8,
+    X86::LOCK_SUB32mi,
+    X86::LOCK_SUB32mr,
+    X86::LOCK_SUB64mi8,
+    X86::LOCK_SUB64mi32,
+    X86::LOCK_SUB64mr,
+  },
+  {
+    0,
+    X86::LOCK_INC8m,
+    0,
+    0,
+    X86::LOCK_INC16m,
+    0,
+    0,
+    X86::LOCK_INC32m,
+    0,
+    0,
+    X86::LOCK_INC64m,
+  },
+  {
+    0,
+    X86::LOCK_DEC8m,
+    0,
+    0,
+    X86::LOCK_DEC16m,
+    0,
+    0,
+    X86::LOCK_DEC32m,
+    0,
+    0,
+    X86::LOCK_DEC64m,
+  },
+  {
+    X86::LOCK_OR8mi,
+    X86::LOCK_OR8mr,
+    X86::LOCK_OR16mi8,
+    X86::LOCK_OR16mi,
+    X86::LOCK_OR16mr,
+    X86::LOCK_OR32mi8,
+    X86::LOCK_OR32mi,
+    X86::LOCK_OR32mr,
+    X86::LOCK_OR64mi8,
+    X86::LOCK_OR64mi32,
+    X86::LOCK_OR64mr,
+  },
+  {
+    X86::LOCK_AND8mi,
+    X86::LOCK_AND8mr,
+    X86::LOCK_AND16mi8,
+    X86::LOCK_AND16mi,
+    X86::LOCK_AND16mr,
+    X86::LOCK_AND32mi8,
+    X86::LOCK_AND32mi,
+    X86::LOCK_AND32mr,
+    X86::LOCK_AND64mi8,
+    X86::LOCK_AND64mi32,
+    X86::LOCK_AND64mr,
+  },
+  {
+    X86::LOCK_XOR8mi,
+    X86::LOCK_XOR8mr,
+    X86::LOCK_XOR16mi8,
+    X86::LOCK_XOR16mi,
+    X86::LOCK_XOR16mr,
+    X86::LOCK_XOR32mi8,
+    X86::LOCK_XOR32mi,
+    X86::LOCK_XOR32mr,
+    X86::LOCK_XOR64mi8,
+    X86::LOCK_XOR64mi32,
+    X86::LOCK_XOR64mr,
+  }
+};
+
+// Return the target constant operand for atomic-load-op and do simple
+// translations, such as from atomic-load-add to lock-sub. The return value is
+// one of the following 3 cases:
+// + target-constant, the operand could be supported as a target constant.
+// + empty, the operand is not needed any more with the new op selected.
+// + non-empty, otherwise.
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+                                                SDLoc dl,
+                                                enum AtomicOpc &Op, MVT NVT,
+                                                SDValue Val) {
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+    int64_t CNVal = CN->getSExtValue();
+    // Quit if not 32-bit imm.
+    if ((int32_t)CNVal != CNVal)
+      return Val;
+    // For atomic-load-add, we could do some optimizations.
+    if (Op == ADD) {
+      // Translate to INC/DEC if ADD by 1 or -1.
+      if ((CNVal == 1) || (CNVal == -1)) {
+        Op = (CNVal == 1) ? INC : DEC;
+        // No more constant operand after being translated into INC/DEC.
+        return SDValue();
+      }
+      // Translate to SUB if ADD by negative value.
+      if (CNVal < 0) {
+        Op = SUB;
+        CNVal = -CNVal;
+      }
+    }
+    return CurDAG->getTargetConstant(CNVal, NVT);
+  }
+
+  // If the value operand is single-used, try to optimize it.
+  if (Op == ADD && Val.hasOneUse()) {
+    // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+    if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+      Op = SUB;
+      return Val.getOperand(1);
+    }
+    // A special case for i16, which needs truncating as, in most cases, it's
+    // promoted to i32. We will translate
+    // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
+    if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+        Val.getOperand(0).getOpcode() == ISD::SUB &&
+        X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+      Op = SUB;
+      Val = Val.getOperand(0);
+      return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+                                            Val.getOperand(1));
+    }
+  }
+
+  return Val;
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
+  if (Node->hasAnyUseOfValue(0))
+    return nullptr;
+
+  SDLoc dl(Node);
+
+  // Optimize common patterns for __sync_or_and_fetch and similar arith
+  // operations where the result is not used. This allows us to use the "lock"
+  // version of the arithmetic instruction.
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ptr = Node->getOperand(1);
+  SDValue Val = Node->getOperand(2);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return nullptr;
+
+  // Which index into the table.
+  enum AtomicOpc Op;
+  switch (Node->getOpcode()) {
+    default:
+      return nullptr;
+    case ISD::ATOMIC_LOAD_OR:
+      Op = OR;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Op = AND;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Op = XOR;
+      break;
+    case ISD::ATOMIC_LOAD_ADD:
+      Op = ADD;
+      break;
+  }
+
+  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+  bool isUnOp = !Val.getNode();
+  bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
+
+  unsigned Opc = 0;
+  switch (NVT.SimpleTy) {
+    default: return nullptr;
+    case MVT::i8:
+      if (isCN)
+        Opc = AtomicOpcTbl[Op][ConstantI8];
+      else
+        Opc = AtomicOpcTbl[Op][I8];
+      break;
+    case MVT::i16:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI16];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI16];
+      } else
+        Opc = AtomicOpcTbl[Op][I16];
+      break;
+    case MVT::i32:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI32];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI32];
+      } else
+        Opc = AtomicOpcTbl[Op][I32];
+      break;
+    case MVT::i64:
+      Opc = AtomicOpcTbl[Op][I64];
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI64];
+        else if (i64immSExt32(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][ConstantI64];
+      }
+      break;
+  }
+
+  assert(Opc != 0 && "Invalid arith lock transform!");
+
+  SDValue Ret;
+  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, NVT), 0);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  if (isUnOp) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
+  } else {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
+  }
+  cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+  SDValue RetVals[] = { Undef, Ret };
+  return CurDAG->getMergeValues(RetVals, dl).getNode();
+}
+
+/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
+/// any uses which require the SF or OF bits to be accurate.
+static bool HasNoSignedComparisonUses(SDNode *N) {
+  // Examine each user of the node.
+  for (SDNode::use_iterator UI = N->use_begin(),
+         UE = N->use_end(); UI != UE; ++UI) {
+    // Only examine CopyToReg uses.
+    if (UI->getOpcode() != ISD::CopyToReg)
+      return false;
+    // Only examine CopyToReg uses that copy to EFLAGS.
+    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+          X86::EFLAGS)
+      return false;
+    // Examine each user of the CopyToReg use.
+    for (SDNode::use_iterator FlagUI = UI->use_begin(),
+           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+      // Only examine the Flag result.
+      if (FlagUI.getUse().getResNo() != 1) continue;
+      // Anything unusual: assume conservatively.
+      if (!FlagUI->isMachineOpcode()) return false;
+      // Examine the opcode of the user.
+      switch (FlagUI->getMachineOpcode()) {
+      // These comparisons don't treat the most significant bit specially.
+      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+      case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4:
+      case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4:
+      case X86::CMOVA16rr: case X86::CMOVA16rm:
+      case X86::CMOVA32rr: case X86::CMOVA32rm:
+      case X86::CMOVA64rr: case X86::CMOVA64rm:
+      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+      case X86::CMOVB16rr: case X86::CMOVB16rm:
+      case X86::CMOVB32rr: case X86::CMOVB32rm:
+      case X86::CMOVB64rr: case X86::CMOVB64rm:
+      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+      case X86::CMOVE16rr: case X86::CMOVE16rm:
+      case X86::CMOVE32rr: case X86::CMOVE32rm:
+      case X86::CMOVE64rr: case X86::CMOVE64rm:
+      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+      case X86::CMOVP16rr: case X86::CMOVP16rm:
+      case X86::CMOVP32rr: case X86::CMOVP32rm:
+      case X86::CMOVP64rr: case X86::CMOVP64rm:
+        continue;
+      // Anything else: assume conservatively.
+      default: return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
+/// is suitable for doing the {load; increment or decrement; store} to modify
+/// transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+                                SDValue StoredVal, SelectionDAG *CurDAG,
+                                LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // is the value stored the result of a DEC or INC?
+  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // is the stored value result 0 of the load?
+  if (StoredVal.getResNo() != 0) return false;
+
+  // are there other uses of the loaded value than the inc or dec?
+  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // is the store non-extending and non-indexed?
+  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+    return false;
+
+  SDValue Load = StoredVal->getOperand(0);
+  // Is the stored value a non-extending and non-indexed load?
+  if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+  // Return LoadNode by reference.
+  LoadNode = cast<LoadSDNode>(Load);
+  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+  EVT LdVT = LoadNode->getMemoryVT();
+  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+      LdVT != MVT::i8)
+    return false;
+
+  // Is store the only read of the loaded value?
+  if (!Load.hasOneUse())
+    return false;
+
+  // Is the address of the store the same as the load?
+  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+      LoadNode->getOffset() != StoreNode->getOffset())
+    return false;
+
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
+  SDValue Chain = StoreNode->getChain();
+
+  bool ChainCheck = false;
+  if (Chain == Load.getValue(1)) {
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+      SDValue Op = Chain.getOperand(i);
+      if (Op == Load.getValue(1)) {
+        ChainCheck = true;
+        continue;
+      }
+
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = UI->use_end(); UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
+
+      ChainOps.push_back(Op);
+    }
+
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+                                   MVT::Other, ChainOps);
+  }
+  if (!ChainCheck)
+    return false;
+
+  return true;
+}
+
+/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory
+/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+  if (Opc == X86ISD::DEC) {
+    if (LdVT == MVT::i64) return X86::DEC64m;
+    if (LdVT == MVT::i32) return X86::DEC32m;
+    if (LdVT == MVT::i16) return X86::DEC16m;
+    if (LdVT == MVT::i8)  return X86::DEC8m;
+  } else {
+    assert(Opc == X86ISD::INC && "unrecognized opcode");
+    if (LdVT == MVT::i64) return X86::INC64m;
+    if (LdVT == MVT::i32) return X86::INC32m;
+    if (LdVT == MVT::i16) return X86::INC16m;
+    if (LdVT == MVT::i8)  return X86::INC8m;
+  }
+  llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// SelectGather - Customized ISel for GATHER operations.
+///
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+  SDValue Chain = Node->getOperand(0);
+  SDValue VSrc = Node->getOperand(2);
+  SDValue Base = Node->getOperand(3);
+  SDValue VIdx = Node->getOperand(4);
+  SDValue VMask = Node->getOperand(5);
+  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+  if (!Scale)
+    return nullptr;
+
+  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+                                   MVT::Other);
+
+  // Memory Operands: Base, Scale, Index, Disp, Segment
+  SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+                          Disp, Segment, VMask, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops);
+  // Node has 2 outputs: VDst and MVT::Other.
+  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+  // of ResNode.
+  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+  return ResNode;
+}
+
+SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+  MVT NVT = Node->getSimpleValueType(0);
+  unsigned Opc, MOpc;
+  unsigned Opcode = Node->getOpcode();
+  SDLoc dl(Node);
+
+  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+  if (Node->isMachineOpcode()) {
+    DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
+    Node->setNodeId(-1);
+    return nullptr;   // Already selected.
+  }
+
+  switch (Opcode) {
+  default: break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_avx2_gather_d_pd:
+    case Intrinsic::x86_avx2_gather_d_pd_256:
+    case Intrinsic::x86_avx2_gather_q_pd:
+    case Intrinsic::x86_avx2_gather_q_pd_256:
+    case Intrinsic::x86_avx2_gather_d_ps:
+    case Intrinsic::x86_avx2_gather_d_ps_256:
+    case Intrinsic::x86_avx2_gather_q_ps:
+    case Intrinsic::x86_avx2_gather_q_ps_256:
+    case Intrinsic::x86_avx2_gather_d_q:
+    case Intrinsic::x86_avx2_gather_d_q_256:
+    case Intrinsic::x86_avx2_gather_q_q:
+    case Intrinsic::x86_avx2_gather_q_q_256:
+    case Intrinsic::x86_avx2_gather_d_d:
+    case Intrinsic::x86_avx2_gather_d_d_256:
+    case Intrinsic::x86_avx2_gather_q_d:
+    case Intrinsic::x86_avx2_gather_q_d_256: {
+      if (!Subtarget->hasAVX2())
+        break;
+      unsigned Opc;
+      switch (IntNo) {
+      default: llvm_unreachable("Impossible intrinsic");
+      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
+      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
+      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
+      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
+      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
+      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
+      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
+      }
+      SDNode *RetVal = SelectGather(Node, Opc);
+      if (RetVal)
+        // We already called ReplaceUses inside SelectGather.
+        return nullptr;
+      break;
+    }
+    }
+    break;
+  }
+  case X86ISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+
+
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_ADD: {
+    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    // For operations of the form (x << C1) op C2, check if we can use a smaller
+    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+      break;
+
+    // i8 is unshrinkable, i16 should be promoted to i32.
+    if (NVT != MVT::i32 && NVT != MVT::i64)
+      break;
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!Cst || !ShlCst)
+      break;
+
+    int64_t Val = Cst->getSExtValue();
+    uint64_t ShlVal = ShlCst->getZExtValue();
+
+    // Make sure that we don't change the operation by removing bits.
+    // This only matters for OR and XOR, AND is unaffected.
+    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+      break;
+
+    unsigned ShlOp, Op;
+    MVT CstVT = NVT;
+
+    // Check the minimum bitwidth for the new constant.
+    // TODO: AND32ri is the same as AND64ri32 with zext imm.
+    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+      CstVT = MVT::i8;
+    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+      CstVT = MVT::i32;
+
+    // Bail if there is no smaller encoding.
+    if (NVT == CstVT)
+      break;
+
+    switch (NVT.SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i32:
+      assert(CstVT == MVT::i8);
+      ShlOp = X86::SHL32ri;
+
+      switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
+      case ISD::AND: Op = X86::AND32ri8; break;
+      case ISD::OR:  Op =  X86::OR32ri8; break;
+      case ISD::XOR: Op = X86::XOR32ri8; break;
+      }
+      break;
+    case MVT::i64:
+      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+      ShlOp = X86::SHL64ri;
+
+      switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
+      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+      case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
+      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+      }
+      break;
+    }
+
+    // Emit the smaller op and the shift.
+    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
+    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+    return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+                                getI8Imm(ShlVal));
+  }
+  case X86ISD::UMUL: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    unsigned LoReg;
+    switch (NVT.SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:  LoReg = X86::AL;  Opc = X86::MUL8r; break;
+    case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
+    case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
+    case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+                                          N0, SDValue()).getValue(1);
+
+    SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+    SDValue Ops[] = {N1, InFlag};
+    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
+    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
+    return nullptr;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SMUL_LOHI;
+    bool hasBMI2 = Subtarget->hasBMI2();
+    if (!isSigned) {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
+      case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+      case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+                     MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+      case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+                     MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+      }
+    } else {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+      case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+      }
+    }
+
+    unsigned SrcReg, LoReg, HiReg;
+    switch (Opc) {
+    default: llvm_unreachable("Unknown MUL opcode!");
+    case X86::IMUL8r:
+    case X86::MUL8r:
+      SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+      break;
+    case X86::IMUL16r:
+    case X86::MUL16r:
+      SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+      break;
+    case X86::IMUL32r:
+    case X86::MUL32r:
+      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+      break;
+    case X86::IMUL64r:
+    case X86::MUL64r:
+      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+      break;
+    case X86::MULX32rr:
+      SrcReg = X86::EDX; LoReg = HiReg = 0;
+      break;
+    case X86::MULX64rr:
+      SrcReg = X86::RDX; LoReg = HiReg = 0;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiply is commmutative.
+    if (!foldedLoad) {
+      foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+      if (foldedLoad)
+        std::swap(N0, N1);
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+                                          N0, SDValue()).getValue(1);
+    SDValue ResHi, ResLo;
+
+    if (foldedLoad) {
+      SDValue Chain;
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        Chain = SDValue(CNode, 2);
+        InFlag = SDValue(CNode, 3);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        Chain = SDValue(CNode, 0);
+        InFlag = SDValue(CNode, 1);
+      }
+
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), Chain);
+    } else {
+      SDValue Ops[] = { N1, InFlag };
+      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        InFlag = SDValue(CNode, 2);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        InFlag = SDValue(CNode, 0);
+      }
+    }
+
+    // Prevent use of AH in a REX instruction by referencing AX instead.
+    if (HiReg == X86::AH && Subtarget->is64Bit() &&
+        !SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              X86::AX, MVT::i16, InFlag);
+      InFlag = Result.getValue(2);
+      // Get the low part if needed. Don't use getCopyFromReg for aliasing
+      // registers.
+      if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 1),
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+      // Shift AX down 8 bits.
+      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                              Result,
+                                     CurDAG->getTargetConstant(8, MVT::i8)), 0);
+      // Then truncate it down to i8.
+      ReplaceUses(SDValue(Node, 1),
+        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    }
+    // Copy the low half of the result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      if (!ResLo.getNode()) {
+        assert(LoReg && "Register for low half is not defined!");
+        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+                                       InFlag);
+        InFlag = ResLo.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 0), ResLo);
+      DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the high half of the result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      if (!ResHi.getNode()) {
+        assert(HiReg && "Register for high half is not defined!");
+        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+                                       InFlag);
+        InFlag = ResHi.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 1), ResHi);
+      DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+
+    return nullptr;
+  }
+
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SDIVREM;
+    if (!isSigned) {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
+      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+      }
+    } else {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
+      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+      }
+    }
+
+    unsigned LoReg, HiReg, ClrReg;
+    unsigned SExtOpcode;
+    switch (NVT.SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:
+      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
+      SExtOpcode = X86::CBW;
+      break;
+    case MVT::i16:
+      LoReg = X86::AX;  HiReg = X86::DX;
+      ClrReg = X86::DX;
+      SExtOpcode = X86::CWD;
+      break;
+    case MVT::i32:
+      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+      SExtOpcode = X86::CDQ;
+      break;
+    case MVT::i64:
+      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+      SExtOpcode = X86::CQO;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+    SDValue InFlag;
+    if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+      // Special case for div8, just use a move with zero extension to AX to
+      // clear the upper 8 bits (AH).
+      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+      if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
+                                         MVT::Other, Ops), 0);
+        Chain = Move.getValue(1);
+        ReplaceUses(N0.getValue(1), Chain);
+      } else {
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
+        Chain = CurDAG->getEntryNode();
+      }
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
+      InFlag = Chain.getValue(1);
+    } else {
+      InFlag =
+        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+                             LoReg, N0, SDValue()).getValue(1);
+      if (isSigned && !signBitIsZero) {
+        // Sign extend the low part into the high part.
+        InFlag =
+          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+      } else {
+        // Zero out the high part, effectively zero extending the input.
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);       
+        switch (NVT.SimpleTy) {
+        case MVT::i16:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)),
+                      0);
+          break;
+        case MVT::i32:
+          break;
+        case MVT::i64:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, MVT::i64), ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                      0);
+          break;
+        default:
+          llvm_unreachable("Unexpected division source");
+        }
+
+        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+                                      ClrNode, InFlag).getValue(1);
+      }
+    }
+
+    if (foldedLoad) {
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      SDNode *CNode =
+        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
+      InFlag = SDValue(CNode, 1);
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+    } else {
+      InFlag =
+        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+    }
+
+    // Prevent use of AH in a REX instruction by referencing AX instead.
+    // Shift it down 8 bits.
+    //
+    // The current assumption of the register allocator is that isel
+    // won't generate explicit references to the GPR8_NOREX registers. If
+    // the allocator and/or the backend get enhanced to be more robust in
+    // that regard, this can be, and should be, removed.
+    if (HiReg == X86::AH && Subtarget->is64Bit() &&
+        !SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              X86::AX, MVT::i16, InFlag);
+      InFlag = Result.getValue(2);
+
+      // If we also need AL (the quotient), get it by extracting a subreg from
+      // Result. The fast register allocator does not like multiple CopyFromReg
+      // nodes using aliasing registers.
+      if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 0),
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+      // Shift AX right by 8 bits instead of using AH.
+      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                         Result,
+                                         CurDAG->getTargetConstant(8, MVT::i8)),
+                       0);
+      ReplaceUses(SDValue(Node, 1),
+        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    }
+    // Copy the division (low) result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                LoReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 0), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the remainder (high) result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              HiReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 1), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    return nullptr;
+  }
+
+  case X86ISD::CMP:
+  case X86ISD::SUB: {
+    // Sometimes a SUB is used to perform comparison.
+    if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+      // This node is not a CMP.
+      break;
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+    // use a smaller encoding.
+    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+        HasNoSignedComparisonUses(Node))
+      // Look past the truncate if CMP is the only use of it.
+      N0 = N0.getOperand(0);
+    if ((N0.getNode()->getOpcode() == ISD::AND ||
+         (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+        N0.getNode()->hasOneUse() &&
+        N0.getValueType() != MVT::i8 &&
+        X86::isZeroNode(N1)) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+      if (!C) break;
+
+      // For example, convert "testl %eax, $8" to "testb %al, $8"
+      if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+          (!(C->getZExtValue() & 0x80) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // On x86-32, only the ABCD registers have 8-bit subregisters.
+        if (!Subtarget->is64Bit()) {
+          const TargetRegisterClass *TRC;
+          switch (N0.getSimpleValueType().SimpleTy) {
+          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+          case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+          default: llvm_unreachable("Unsupported TEST operand type!");
+          }
+          SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+          Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                               Reg.getValueType(), Reg, RC), 0);
+        }
+
+        // Extract the l-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return nullptr;
+      }
+
+      // For example, "testl %eax, $2048" to "testb %ah, $8".
+      if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        // Shift the immediate right by 8 bits.
+        SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+                                                       MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Put the value in an ABCD register.
+        const TargetRegisterClass *TRC;
+        switch (N0.getSimpleValueType().SimpleTy) {
+        case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+        case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+        case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+        default: llvm_unreachable("Unsupported TEST operand type!");
+        }
+        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+        Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                             Reg.getValueType(), Reg, RC), 0);
+
+        // Extract the h-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
+        // target GR8_NOREX registers, so make sure the register class is
+        // forced.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+                                                 MVT::i32, Subreg, ShiftedImm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return nullptr;
+      }
+
+      // For example, "testl %eax, $32776" to "testw %ax, $32776".
+      if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+          N0.getValueType() != MVT::i16 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 16-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+                                                        MVT::i16, Reg);
+
+        // Emit a testw.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return nullptr;
+      }
+
+      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+      if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+          N0.getValueType() == MVT::i64 &&
+          (!(C->getZExtValue() & 0x80000000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 32-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+                                                        MVT::i32, Reg);
+
+        // Emit a testl.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return nullptr;
+      }
+    }
+    break;
+  }
+  case ISD::STORE: {
+    // Change a chain of {load; incr or dec; store} of the same value into
+    // a simple increment or decrement through memory of that value, if the
+    // uses of the modified value and its address are suitable.
+    // The DEC64m tablegen pattern is currently not able to match the case where
+    // the EFLAGS on the original DEC are used. (This also applies to
+    // {INC,DEC}X{64,32,16,8}.)
+    // We'll need to improve tablegen to allow flags to be transferred from a
+    // node in the pattern to the result node.  probably with a new keyword
+    // for example, we have this
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (implicit EFLAGS)]>;
+    // but maybe need something like this
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (transferrable EFLAGS)]>;
+
+    StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+    SDValue StoredVal = StoreNode->getOperand(1);
+    unsigned Opc = StoredVal->getOpcode();
+
+    LoadSDNode *LoadNode = nullptr;
+    SDValue InputChain;
+    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+                             LoadNode, InputChain))
+      break;
+
+    SDValue Base, Scale, Index, Disp, Segment;
+    if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
+                    Base, Scale, Index, Disp, Segment))
+      break;
+
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+    MemOp[0] = StoreNode->getMemOperand();
+    MemOp[1] = LoadNode->getMemOperand();
+    const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+    EVT LdVT = LoadNode->getMemoryVT();
+    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+                                                   SDLoc(Node),
+                                                   MVT::i32, MVT::Other, Ops);
+    Result->setMemRefs(MemOp, MemOp + 2);
+
+    ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+    ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+
+    return Result;
+  }
+  }
+
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(dbgs() << "=> ";
+        if (ResNode == nullptr || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        dbgs() << '\n');
+
+  return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1, Op2, Op3, Op4;
+  switch (ConstraintCode) {
+  case 'o':   // offsetable        ??
+  case 'v':   // not offsetable    ??
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(Op2);
+  OutOps.push_back(Op3);
+  OutOps.push_back(Op4);
+  return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..6fc4c84
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,22910 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86CallingConv.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/VariadicFunction.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <bitset>
+#include <numeric>
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+    "x86-experimental-vector-widening-legalization", cl::init(false),
+    cl::desc("Enable an experimental vector type legalization through widening "
+             "rather than promotion."),
+    cl::Hidden);
+
+static cl::opt<bool> ExperimentalVectorShuffleLowering(
+    "x86-experimental-vector-shuffle-lowering", cl::init(false),
+    cl::desc("Enable an experimental vector shuffle lowering code path."),
+    cl::Hidden);
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+                       SDValue V2);
+
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+                                SelectionDAG &DAG, SDLoc dl,
+                                unsigned vectorWidth) {
+  assert((vectorWidth == 128 || vectorWidth == 256) &&
+         "Unsupported vector width");
+  EVT VT = Vec.getValueType();
+  EVT ElVT = VT.getVectorElementType();
+  unsigned Factor = VT.getSizeInBits()/vectorWidth;
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+                                  VT.getVectorNumElements()/Factor);
+
+  // Extract from UNDEF is UNDEF.
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(ResultVT);
+
+  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
+  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+
+  // This is the index of the first element of the vectorWidth-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
+                               * ElemsPerChunk);
+
+  // If the input is a buildvector just emit a smaller one.
+  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                                    ElemsPerChunk));
+
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+                               VecIdx);
+
+  return Result;
+
+}
+/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, SDLoc dl) {
+  assert((Vec.getValueType().is256BitVector() ||
+          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, SDLoc dl) {
+  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+                               unsigned IdxVal, SelectionDAG &DAG,
+                               SDLoc dl, unsigned vectorWidth) {
+  assert((vectorWidth == 128 || vectorWidth == 256) &&
+         "Unsupported vector width");
+  // Inserting UNDEF is Result
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return Result;
+  EVT VT = Vec.getValueType();
+  EVT ElVT = VT.getVectorElementType();
+  EVT ResultVT = Result.getValueType();
+
+  // Insert the relevant vectorWidth bits.
+  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+
+  // This is the index of the first element of the vectorWidth-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
+                               * ElemsPerChunk);
+
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+                     VecIdx);
+}
+/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference.  Idx is an index in the 128 bits
+/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+                                  unsigned IdxVal, SelectionDAG &DAG,
+                                  SDLoc dl) {
+  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
+                                  unsigned IdxVal, SelectionDAG &DAG,
+                                  SDLoc dl) {
+  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   SDLoc dl) {
+  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   SDLoc dl) {
+  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getArch() == Triple::x86_64)
+      return new X86_64MachoTargetObjectFile();
+    return new TargetLoweringObjectFileMachO();
+  }
+
+  if (TT.isOSLinux())
+    return new X86LinuxTargetObjectFile();
+  if (TT.isOSBinFormatELF())
+    return new TargetLoweringObjectFileELF();
+  if (TT.isKnownWindowsMSVCEnvironment())
+    return new X86WindowsTargetObjectFile();
+  if (TT.isOSBinFormatCOFF())
+    return new TargetLoweringObjectFileCOFF();
+  llvm_unreachable("unknown subtarget type");
+}
+
+// FIXME: This should stop caching the target machine as soon as
+// we can remove resetOperationActions et al.
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+  : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
+  TD = getDataLayout();
+
+  resetOperationActions();
+}
+
+void X86TargetLowering::resetOperationActions() {
+  const TargetMachine &TM = getTargetMachine();
+  static bool FirstTimeThrough = true;
+
+  // If none of the target options have changed, then we don't need to reset the
+  // operation actions.
+  if (!FirstTimeThrough && TO == TM.Options) return;
+
+  if (!FirstTimeThrough) {
+    // Reinitialize the actions.
+    initActions();
+    FirstTimeThrough = false;
+  }
+
+  TO = TM.Options;
+
+  // Set up the TargetLowering object.
+  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+
+  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // For 64-bit since we have so many registers use the ILP scheduler, for
+  // 32-bit code use the register pressure specific scheduling.
+  // For Atom, always use ILP scheduling.
+  if (Subtarget->isAtom())
+    setSchedulingPreference(Sched::ILP);
+  else if (Subtarget->is64Bit())
+    setSchedulingPreference(Sched::ILP);
+  else
+    setSchedulingPreference(Sched::RegPressure);
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+  // Bypass expensive divides on Atom when compiling with O2
+  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
+    addBypassSlowDiv(32, 8);
+    if (Subtarget->is64Bit())
+      addBypassSlowDiv(64, 16);
+  }
+
+  if (Subtarget->isTargetKnownWindowsMSVC()) {
+    // Setup Windows compiler runtime calls.
+    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
+    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
+    setLibcallName(RTLIB::SREM_I64, "_allrem");
+    setLibcallName(RTLIB::UREM_I64, "_aullrem");
+    setLibcallName(RTLIB::MUL_I64, "_allmul");
+    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+
+    // The _ftol2 runtime function has an unusual calling conv, which
+    // is modeled by a special pseudo-instruction.
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
+  }
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget->isTargetWindowsGNU()) {
+    // MS runtime is weird: it exports _setjmp, but longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, &X86::GR8RegClass);
+  addRegisterClass(MVT::i16, &X86::GR16RegClass);
+  addRegisterClass(MVT::i32, &X86::GR32RegClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // We don't accept any truncstore of integer registers.
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // SETOEQ and SETUNE require checking two conditions.
+  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
+  } else if (!TM.Options.UseSoftFloat) {
+    // We have an algorithm for SSE2->double, and we turn this into a
+    // 64-bit FILD followed by conditional FADD for other targets.
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
+    // We have an algorithm for SSE2, and we turn this into a 64-bit
+    // FILD for other targets.
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+
+  if (!TM.Options.UseSoftFloat) {
+    // SSE has no i16 to fp conversion, only i32
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    }
+  } else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
+  }
+
+  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
+  // are Legal, f80 is custom lowered.
+  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (X86ScalarSSEf32) {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+    // f32 and f64 cases are Legal, f80 case is not
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
+  } else if (!TM.Options.UseSoftFloat) {
+    // Since AVX is a superset of SSE3, only check for SSE here.
+    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With SSE3 we can use fisttpll to convert to a signed i64; without
+      // SSE, we're stuck with a fistpll.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
+  }
+
+  if (isTargetFTOL()) {
+    // Use the _ftol2 runtime function, which has a pseudo-instruction
+    // to handle its weird calling convention.
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSEf64) {
+    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
+    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
+      // Without SSE, i64->f64 goes through memory.
+      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
+    }
+  }
+
+  // Scalar integer divide and remainder are lowered to use operations that
+  // produce two results, to match the available instructions. This exposes
+  // the two-result form to trivial CSE, which is able to combine x/y and x%y
+  // into a single instruction.
+  //
+  // Scalar integer multiply-high is also lowered to use two-result
+  // operations, to match the available instructions. However, plain multiply
+  // (low) operations are left as Legal, as there are single-result
+  // instructions for this in x86. Using the two-result multiply instructions
+  // when both high and low results are needed must be arranged by dagcombine.
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+
+    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
+    setOperationAction(ISD::ADDC, VT, Custom);
+    setOperationAction(ISD::ADDE, VT, Custom);
+    setOperationAction(ISD::SUBC, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Custom);
+  }
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
+  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
+
+  // Promote the i8 variants and force them on up to i32 which has a shorter
+  // encoding.
+  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
+  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
+  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
+  if (Subtarget->hasBMI()) {
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  } else {
+    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasLZCNT()) {
+    // When promoting the i8 variants, force them to i32 for a shorter
+    // encoding.
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
+    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
+    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+  } else {
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
+  }
+
+  // Special handling for half-precision floating point conversions.
+  // If we don't have F16C support, then lower half float conversions
+  // into library calls.
+  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+  }
+
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
+  if (Subtarget->hasPOPCNT()) {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+
+  if (!Subtarget->hasMOVBE())
+    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+  }
+  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and etc.. As a result, no
+  // other SjLj exception interfaces are implemented and please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
+  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
+    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
+  }
+  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasSSE1())
+    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
+
+  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
+
+  // Expand certain atomics
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+  }
+
+  if (Subtarget->hasCmpxchg16b()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
+  }
+
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
+    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  }
+
+  if (Subtarget->is64Bit()) {
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
+    // TargetInfo::X86_64ABIBuiltinVaList
+    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
+    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
+  } else {
+    // TargetInfo::CharPtrBuiltinVaList
+    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
+    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                     MVT::i64 : MVT::i32, Custom);
+
+  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
+    // f32 and f64 use SSE.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::FR64RegClass);
+
+    // Use ANDPD to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f64, Custom);
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f64, Custom);
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    // Use ANDPD and ORPD to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // Lower this to FGETSIGNx86 plus an AND.
+    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    addLegalFPImmediate(APFloat(+0.0)); // xorpd
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
+    // Use SSE for f32, x87 for f64.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+    // Use ANDPS to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+
+    // Use ANDPS and ORPS to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // Special cases we handle for FP constants.
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    }
+  } else if (!TM.Options.UseSoftFloat) {
+    // f32 and f64 in x87.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+    }
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+  }
+
+  // We don't support FMA.
+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+  // Long double always uses X87.
+  if (!TM.Options.UseSoftFloat) {
+    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+    {
+      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
+      addLegalFPImmediate(TmpFlt);  // FLD0
+      TmpFlt.changeSign();
+      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+
+      bool ignored;
+      APFloat TmpFlt2(+1.0);
+      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+                      &ignored);
+      addLegalFPImmediate(TmpFlt2);  // FLD1
+      TmpFlt2.changeSign();
+      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
+    }
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+    }
+
+    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+    setOperationAction(ISD::FMA, MVT::f80, Expand);
+  }
+
+  // Always use a library call for pow.
+  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
+
+  setOperationAction(ISD::FLOG, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+  // First set operation action for all vector types to either promote
+  // (for widening) or expand (for scalarization). Then we will selectively
+  // turn on ones that can be effectively codegen'd.
+  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT VT = (MVT::SimpleValueType)i;
+    setOperationAction(ISD::ADD , VT, Expand);
+    setOperationAction(ISD::SUB , VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FNEG, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+    setOperationAction(ISD::MUL , VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::LOAD, VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
+    setOperationAction(ISD::FREM, VT, Expand);
+    setOperationAction(ISD::FMA,  VT, Expand);
+    setOperationAction(ISD::FPOWI, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FNEARBYINT, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::SHL, VT, Expand);
+    setOperationAction(ISD::SRA, VT, Expand);
+    setOperationAction(ISD::SRL, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::FLOG, VT, Expand);
+    setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FLOG10, VT, Expand);
+    setOperationAction(ISD::FEXP, VT, Expand);
+    setOperationAction(ISD::FEXP2, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+    setOperationAction(ISD::TRUNCATE, VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction(VT,
+                          (MVT::SimpleValueType)InnerVT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+
+    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
+    // we have to deal with them whether we ask for Expansion or not. Setting
+    // Expand causes its own optimisation problems though, so leave them legal.
+    if (VT.getVectorElementType() == MVT::i1)
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+  }
+
+  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+  // with -msoft-float, disable use of MMX as well.
+  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
+    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+    // No operations on x86mmx supported, everything uses intrinsics.
+  }
+
+  // MMX-sized vectors (other than x86mmx) are expected to be expanded
+  // into smaller operations.
+  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
+  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
+  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
+  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
+  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
+  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
+  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
+  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
+  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
+  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
+  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
+  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
+  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
+  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
+  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
+  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
+  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
+  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
+    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
+
+    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
+    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
+
+    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
+    // registers cannot be used even for integer operations.
+    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
+
+    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
+
+    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+      // Do not attempt to custom lower non-power-of-2 vectors
+      if (!isPowerOf2_32(VT.getVectorNumElements()))
+        continue;
+      // Do not attempt to custom lower non-128-bit vectors
+      if (!VT.is128BitVector())
+        continue;
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    }
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+    }
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-128-bit vectors
+      if (!VT.is128BitVector())
+        continue;
+
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+    }
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
+    // As there is no 64-bit GPR available, we need build a special custom
+    // sequence to convert from v2i32 to v2f32.
+    if (!Subtarget->is64Bit())
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
+
+    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
+
+    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
+    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
+    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
+    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
+    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
+    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
+
+    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
+
+    // FIXME: Do we need to handle scalar-to-vector here?
+    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
+
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors. We don't need to custom lower
+    // some vselects for now.
+    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
+
+    // i8 and i16 vectors are custom , because the source register and source
+    // source memory operand types are not the same width.  f32 vectors are
+    // custom since the immediate controlling the insert encodes additional
+    // information.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+    // FIXME: these should be Legal but thats only for the case where
+    // the index is constant.  For now custom expand to deal with that.
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+    }
+  }
+
+  if (Subtarget->hasSSE2()) {
+    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
+    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
+    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
+    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
+
+    // In the customized shift lowering, the legal cases in AVX2 will be
+    // recognized.
+    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
+    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
+
+    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
+
+    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
+
+    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
+
+    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+    // even though v8i16 is a legal type.
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
+
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
+
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
+
+    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
+
+    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
+
+    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
+
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
+
+    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
+      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
+      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
+      setOperationAction(ISD::FMA,             MVT::f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::f64, Legal);
+    }
+
+    if (Subtarget->hasInt256()) {
+      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
+      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
+      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
+      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
+      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
+      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
+      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
+
+      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
+      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
+      // Don't lower v32i8 because there is no 128-bit byte mul
+
+      setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
+      setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
+      setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
+      setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
+
+      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
+      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+    } else {
+      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
+      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
+      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
+      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
+
+      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
+      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
+      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
+      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
+
+      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
+      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
+      // Don't lower v32i8 because there is no 128-bit byte mul
+    }
+
+    // In the customized shift lowering, the legal cases in AVX2 will be
+    // recognized.
+    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
+
+    // Custom lower several nodes for 256-bit types.
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Extract subvector is special because the value type
+      // (result) is 128-bit but the source is 256-bit wide.
+      if (VT.is128BitVector())
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+      // Do not attempt to custom lower other non-256-bit vectors
+      if (!VT.is256BitVector())
+        continue;
+
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
+    }
+
+    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-256-bit vectors
+      if (!VT.is256BitVector())
+        continue;
+
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+    }
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
+    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
+
+    addRegisterClass(MVT::i1,     &X86::VK1RegClass);
+    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
+    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
+
+    setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
+    setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
+    setOperationAction(ISD::XOR,                MVT::i1,    Legal);
+    setOperationAction(ISD::OR,                 MVT::i1,    Legal);
+    setOperationAction(ISD::AND,                MVT::i1,    Legal);
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
+
+    setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
+
+    setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
+    setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
+    setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
+    }
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
+    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
+
+    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
+
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
+
+    setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
+
+    setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
+
+    setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
+    setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
+
+    setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
+    setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
+
+    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
+
+    setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
+    setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
+
+    setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
+    setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
+
+    setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
+    setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
+
+    setOperationAction(ISD::AND,                MVT::v8i64, Legal);
+    setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
+    setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
+    setOperationAction(ISD::AND,                MVT::v16i32, Legal);
+    setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
+    setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
+
+    if (Subtarget->hasCDI()) {
+      setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
+      setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
+    }
+
+    // Custom lower several nodes.
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      // Extract subvector is special because the value type
+      // (result) is 256/128-bit but the source is 512-bit wide.
+      if (VT.is128BitVector() || VT.is256BitVector())
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+      if (VT.getVectorElementType() == MVT::i1)
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+      // Do not attempt to custom lower other non-512-bit vectors
+      if (!VT.is512BitVector())
+        continue;
+
+      if ( EltSize >= 32) {
+        setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
+        setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
+        setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
+        setOperationAction(ISD::VSELECT,             VT, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
+        setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
+        setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+      }
+    }
+    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-256-bit vectors
+      if (!VT.is512BitVector())
+        continue;
+
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+    }
+  }// has  AVX-512
+
+  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
+  // of this type with custom code.
+  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+                       Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  if (!Subtarget->is64Bit())
+    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+
+  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+  // handle type legalization for these operations here.
+  //
+  // FIXME: We really should do custom legalization for addition and
+  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
+  // than generic legalization for 64-bit multiplication-with-overflow, though.
+  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
+    // Add/Sub/Mul with overflow operations are custom lowered.
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::SADDO, VT, Custom);
+    setOperationAction(ISD::UADDO, VT, Custom);
+    setOperationAction(ISD::SSUBO, VT, Custom);
+    setOperationAction(ISD::USUBO, VT, Custom);
+    setOperationAction(ISD::SMULO, VT, Custom);
+    setOperationAction(ISD::UMULO, VT, Custom);
+  }
+
+  // There are no 8-bit 3-address imul/mul instructions
+  setOperationAction(ISD::SMULO, MVT::i8, Expand);
+  setOperationAction(ISD::UMULO, MVT::i8, Expand);
+
+  if (!Subtarget->is64Bit()) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->isTargetDarwin()) {
+      // For MacOSX, we don't want to the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+      // traffic.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
+  if (Subtarget->isTargetWin64()) {
+    setOperationAction(ISD::SDIV, MVT::i128, Custom);
+    setOperationAction(ISD::UDIV, MVT::i128, Custom);
+    setOperationAction(ISD::SREM, MVT::i128, Custom);
+    setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::VSELECT);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FMA);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  setTargetDAGCombine(ISD::TRUNCATE);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
+  if (Subtarget->is64Bit())
+    setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::XOR);
+
+  computeRegisterProperties();
+
+  // On Darwin, -Os means optimize for size without hurting performance,
+  // do not reduce the limit.
+  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  setPrefLoopAlignment(4); // 2^4 bytes.
+
+  // Predictable cmov don't hurt on atom because it's in-order.
+  PredictableSelectIsExpensive = !Subtarget->isAtom();
+
+  setPrefFunctionAlignment(4); // 2^4 bytes.
+}
+
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (ExperimentalVectorWideningLegalization &&
+      VT.getVectorNumElements() != 1 &&
+      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
+
+  if (Subtarget->hasAVX512())
+    switch(VT.getVectorNumElements()) {
+    case  8: return MVT::v8i1;
+    case 16: return MVT::v16i1;
+  }
+
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+  if (MaxAlign == 16)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (VTy->getBitWidth() == 128)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == 16)
+        break;
+    }
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+  if (Subtarget->is64Bit()) {
+    // Max of 8 and alignment of type.
+    unsigned TyAlign = TD->getABITypeAlignment(Ty);
+    if (TyAlign > 8)
+      return TyAlign;
+    return 8;
+  }
+
+  unsigned Align = 4;
+  if (Subtarget->hasSSE1())
+    getMaxByValAlign(Ty, Align);
+  return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+                                       unsigned DstAlign, unsigned SrcAlign,
+                                       bool IsMemset, bool ZeroMemset,
+                                       bool MemcpyStrSrc,
+                                       MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+  if ((!IsMemset || ZeroMemset) &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat)) {
+    if (Size >= 16 &&
+        (Subtarget->isUnalignedMemAccessFast() ||
+         ((DstAlign == 0 || DstAlign >= 16) &&
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      if (Size >= 32) {
+        if (Subtarget->hasInt256())
+          return MVT::v8i32;
+        if (Subtarget->hasFp256())
+          return MVT::v8f32;
+      }
+      if (Subtarget->hasSSE2())
+        return MVT::v4i32;
+      if (Subtarget->hasSSE1())
+        return MVT::v4f32;
+    } else if (!MemcpyStrSrc && Size >= 8 &&
+               !Subtarget->is64Bit() &&
+               Subtarget->hasSSE2()) {
+      // Do not use f64 to lower memcpy if source is string constant. It's
+      // better to use i32 to avoid the loads.
+      return MVT::f64;
+    }
+  }
+  if (Subtarget->is64Bit() && Size >= 8)
+    return MVT::i64;
+  return MVT::i32;
+}
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  if (VT == MVT::f32)
+    return X86ScalarSSEf32;
+  else if (VT == MVT::f64)
+    return X86ScalarSSEf64;
+  return true;
+}
+
+bool
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                 unsigned,
+                                                 bool *Fast) const {
+  if (Fast)
+    *Fast = Subtarget->isUnalignedMemAccessFast();
+  return true;
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function.  The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+  // symbol.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    return MachineJumpTableInfo::EK_Custom32;
+
+  // Otherwise, use the normal jump table encoding heuristics.
+  return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                             const MachineBasicBlock *MBB,
+                                             unsigned uid,MCContext &Ctx) const{
+  assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
+         Subtarget->isPICStyleGOT());
+  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+  // entries.
+  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  if (!Subtarget->is64Bit())
+    // This doesn't have SDLoc associated with it, but is not really the
+    // same as a Register.
+    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
+  return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget->isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+}
+
+// FIXME: Why this routine is here? Move to RegInfo!
+std::pair<const TargetRegisterClass*, uint8_t>
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+    RRC = Subtarget->is64Bit() ?
+      (const TargetRegisterClass*)&X86::GR64RegClass :
+      (const TargetRegisterClass*)&X86::GR32RegClass;
+    break;
+  case MVT::x86mmx:
+    RRC = &X86::VR64RegClass;
+    break;
+  case MVT::f32: case MVT::f64:
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
+  case MVT::v4f64:
+    RRC = &X86::VR128RegClass;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+                                               unsigned &Offset) const {
+  if (!Subtarget->isTargetLinux())
+    return false;
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x28;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x14 on i386
+    Offset = 0x14;
+    AddressSpace = 256;
+  }
+  return true;
+}
+
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
+                   MVT::i16));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = OutVals[i];
+    EVT ValVT = ValToCopy.getValueType();
+
+    // Promote values to the appropriate types
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::AExt)
+      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::BCvt)
+      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
+    assert(VA.getLocInfo() != CCValAssign::FPExt &&
+           "Unexpected FP-extend for return value.");  
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values,
+    // or SSE or MMX vectors.
+    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
+         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
+          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
+    // llvm-gcc has never done it right and no one has noticed, so this
+    // should be OK for now.
+    if (ValVT == MVT::f64 &&
+        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+      report_fatal_error("SSE2 register return with SSE2 disabled");
+
+    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+    // the RET instruction and handled by the FP Stackifier.
+    if (VA.getLocReg() == X86::ST0 ||
+        VA.getLocReg() == X86::ST1) {
+      // If this is a copy from an xmm register to ST(0), use an FPExtend to
+      // change the value to the FP stack register class.
+      if (isScalarFPTypeInSSEReg(VA.getValVT()))
+        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+      RetOps.push_back(ValToCopy);
+      // Don't emit a copytoreg.
+      continue;
+    }
+
+    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+    // which is returned in RAX / RDX.
+    if (Subtarget->is64Bit()) {
+      if (ValVT == MVT::x86mmx) {
+        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
+          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                  ValToCopy);
+          // If we don't have SSE2 available, convert to v4f32 so the generated
+          // register is legal.
+          if (!Subtarget->hasSSE2())
+            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
+        }
+      }
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // The x86-64 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // Win32 requires us to put the sret argument to %eax as well.
+  // We saved the argument into a virtual register in the entry block,
+  // so now we copy the value out and into %rax/%eax.
+  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
+      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments().");
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+    unsigned RetValReg
+        = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+          X86::RAX : X86::EAX;
+    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+    Flag = Chain.getValue(1);
+
+    // RAX/EAX now acts like a return value.
+    RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
+
+  bool HasRet = false;
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != X86ISD::RET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
+                                            ISD::NodeType ExtendKind) const {
+  MVT ReturnMVT;
+  // TODO: Is this also valid on 32-bit?
+  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+    ReturnMVT = MVT::i8;
+  else
+    ReturnMVT = MVT::i32;
+
+  MVT MinVT = getRegisterType(ReturnMVT);
+  return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   SDLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool Is64Bit = Subtarget->is64Bit();
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    EVT CopyVT = VA.getValVT();
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+
+    SDValue Val;
+
+    // If this is a call to a function that returns an fp value on the floating
+    // point stack, we must guarantee the value is popped from the stack, so
+    // a CopyFromReg is not good enough - the copy instruction may be eliminated
+    // if the return value is not used. We use the FpPOP_RETVAL instruction
+    // instead.
+    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
+      // If we prefer to use the value in xmm registers, copy it out as f80 and
+      // use a truncate to move it from fp stack reg to xmm reg.
+      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
+      SDValue Ops[] = { Chain, InFlag };
+      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
+                                         MVT::Other, MVT::Glue, Ops), 1);
+      Val = Chain.getValue(0);
+
+      // Round the f80 to the right size, which also moves it to the appropriate
+      // xmm register.
+      if (CopyVT != VA.getValVT())
+        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                          // This truncation won't change the value.
+                          DAG.getIntPtrConstant(1));
+    } else {
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                                 CopyVT, InFlag).getValue(1);
+      Val = Chain.getValue(0);
+    }
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  StdCall calling convention seems to be standard for many Windows' API
+//  routines and around. It differs from C calling convention just a little:
+//  callee should clean up the stack, not caller. Symbols should be also
+//  decorated in some fancy way :) It doesn't support any vector arguments.
+//  For info on fast calling convention see Fast Calling Convention (tail call)
+//  implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a call uses struct return
+/// semantics.
+enum StructReturnType {
+  NotStructReturn,
+  RegStructReturn,
+  StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  if (Outs.empty())
+    return NotStructReturn;
+
+  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// ArgsAreStructReturn - Determines whether a function uses struct
+/// return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+  if (Ins.empty())
+    return NotStructReturn;
+
+  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          SDLoc dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*isVolatile*/false, /*AlwaysInline=*/true,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// IsTailCallConvention - Return true if the calling convention is one that
+/// supports tail call optimization.
+static bool IsTailCallConvention(CallingConv::ID CC) {
+  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+          CC == CallingConv::HiPE);
+}
+
+/// \brief Return true if the calling convention is a C calling convention.
+static bool IsCCallConvention(CallingConv::ID CC) {
+  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
+          CC == CallingConv::X86_64_SysV);
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+    return false;
+
+  CallSite CS(CI);
+  CallingConv::ID CalleeCC = CS.getCallingConv();
+  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
+    return false;
+
+  return true;
+}
+
+/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// a tailcall target by changing its ABI.
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+                                   bool GuaranteedTailCallOpt) {
+  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    MachineFrameInfo *MFI,
+                                    unsigned i) const {
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
+      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+  EVT ValVT;
+
+  // If value is passed by pointer we have address passed instead of the value
+  // itself.
+  if (VA.getLocInfo() == CCValAssign::Indirect)
+    ValVT = VA.getLocVT();
+  else
+    ValVT = VA.getValVT();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can be
+  // changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable. Since they
+  // could be overwritten by lowering of arguments in case of a tail call.
+  if (Flags.isByVal()) {
+    unsigned Bytes = Flags.getByValSize();
+    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+    return DAG.getFrameIndex(FI, getPointerTy());
+  } else {
+    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+                                    VA.getLocMemOffset(), isImmutable);
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    return DAG.getLoad(ValVT, dl, Chain, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       false, false, false, 0);
+  }
+}
+
+SDValue
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv,
+                                        bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                        SDLoc dl,
+                                        SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Function* Fn = MF.getFunction();
+  if (Fn->hasExternalLinkage() &&
+      Subtarget->isTargetCygMing() &&
+      Fn->getName() == "main")
+    FuncInfo->setForceFramePointer(true);
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool Is64Bit = Subtarget->is64Bit();
+  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
+
+  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+         "Var args not supported with calling convention fastcc, ghc or hipe");
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
+                 ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64
+  if (IsWin64)
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+
+  unsigned LastVal = ~0U;
+  SDValue ArgValue;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    (void)LastVal;
+    LastVal = VA.getValNo();
+
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      const TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = &X86::GR32RegClass;
+      else if (Is64Bit && RegVT == MVT::i64)
+        RC = &X86::GR64RegClass;
+      else if (RegVT == MVT::f32)
+        RC = &X86::FR32RegClass;
+      else if (RegVT == MVT::f64)
+        RC = &X86::FR64RegClass;
+      else if (RegVT.is512BitVector())
+        RC = &X86::VR512RegClass;
+      else if (RegVT.is256BitVector())
+        RC = &X86::VR256RegClass;
+      else if (RegVT.is128BitVector())
+        RC = &X86::VR128RegClass;
+      else if (RegVT == MVT::x86mmx)
+        RC = &X86::VR64RegClass;
+      else if (RegVT == MVT::i1)
+        RC = &X86::VK1RegClass;
+      else if (RegVT == MVT::v8i1)
+        RC = &X86::VK8RegClass;
+      else if (RegVT == MVT::v16i1)
+        RC = &X86::VK16RegClass;
+      else
+        llvm_unreachable("Unknown argument type!");
+
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::BCvt)
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+
+      if (VA.isExtInLoc()) {
+        // Handle MMX values passed in XMM regs.
+        if (RegVT.isVector())
+          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+        else
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+      }
+    } else {
+      assert(VA.isMemLoc());
+      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+    }
+
+    // If value is passed via pointer - do a load.
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
+                             MachinePointerInfo(), false, false, false, 0);
+
+    InVals.push_back(ArgValue);
+  }
+
+  if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      // The x86-64 ABIs require that for returning structs by value we copy
+      // the sret argument into %rax/%eax (depending on ABI) for the return.
+      // Win32 requires us to put the sret argument to %eax as well.
+      // Save the argument into a virtual register so that we can access it
+      // from the return points.
+      if (Ins[i].Flags.isSRet()) {
+        unsigned Reg = FuncInfo->getSRetReturnReg();
+        if (!Reg) {
+          MVT PtrTy = getPointerTy();
+          Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+          FuncInfo->setSRetReturnReg(Reg);
+        }
+        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+        break;
+      }
+    }
+  }
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+  // Align stack specially for tail calls.
+  if (FuncIsMadeTailCallSafe(CallConv,
+                             MF.getTarget().Options.GuaranteedTailCallOpt))
+    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+                    CallConv != CallingConv::X86_ThisCall)) {
+      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
+    }
+    if (Is64Bit) {
+      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+      // FIXME: We should really autogenerate these arrays
+      static const MCPhysReg GPR64ArgRegsWin64[] = {
+        X86::RCX, X86::RDX, X86::R8,  X86::R9
+      };
+      static const MCPhysReg GPR64ArgRegs64Bit[] = {
+        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+      };
+      static const MCPhysReg XMMArgRegs64Bit[] = {
+        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+      };
+      const MCPhysReg *GPR64ArgRegs;
+      unsigned NumXMMRegs = 0;
+
+      if (IsWin64) {
+        // The XMM registers which might contain var arg parameters are shadowed
+        // in their paired GPR.  So we only need to save the GPR to their home
+        // slots.
+        TotalNumIntRegs = 4;
+        GPR64ArgRegs = GPR64ArgRegsWin64;
+      } else {
+        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+        GPR64ArgRegs = GPR64ArgRegs64Bit;
+
+        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
+                                                TotalNumXMMRegs);
+      }
+      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+                                                       TotalNumIntRegs);
+
+      bool NoImplicitFloatOps = Fn->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+             "SSE register cannot be used when SSE is disabled!");
+      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+               NoImplicitFloatOps) &&
+             "SSE register cannot be used when SSE is disabled!");
+      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+          !Subtarget->hasSSE1())
+        // Kernel mode asks for SSE to be disabled, so don't push them
+        // on the stack.
+        TotalNumXMMRegs = 0;
+
+      if (IsWin64) {
+        const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+        // Get to the caller-allocated home save location.  Add 8 to account
+        // for the return address.
+        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+        FuncInfo->setRegSaveFrameIndex(
+          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+        // Fixup to set vararg frame on shadow area (4 x i64).
+        if (NumIntRegs < 4)
+          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+      } else {
+        // For X86-64, if there are vararg parameters that are passed via
+        // registers, then we must store them to their spots on the stack so
+        // they may be loaded by deferencing the result of va_next.
+        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
+        FuncInfo->setRegSaveFrameIndex(
+          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
+                               false));
+      }
+
+      // Store the integer parameter registers.
+      SmallVector<SDValue, 8> MemOps;
+      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                        getPointerTy());
+      unsigned Offset = FuncInfo->getVarArgsGPOffset();
+      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+                                  DAG.getIntPtrConstant(Offset));
+        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+                                     &X86::GR64RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+        SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                       MachinePointerInfo::getFixedStack(
+                         FuncInfo->getRegSaveFrameIndex(), Offset),
+                       false, false, 0);
+        MemOps.push_back(Store);
+        Offset += 8;
+      }
+
+      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+        // Now store the XMM (fp + vector) parameter registers.
+        SmallVector<SDValue, 11> SaveXMMOps;
+        SaveXMMOps.push_back(Chain);
+
+        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+        SaveXMMOps.push_back(ALVal);
+
+        SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                               FuncInfo->getRegSaveFrameIndex()));
+        SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                               FuncInfo->getVarArgsFPOffset()));
+
+        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
+                                       &X86::VR128RegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+          SaveXMMOps.push_back(Val);
+        }
+        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+                                     MVT::Other, SaveXMMOps));
+      }
+
+      if (!MemOps.empty())
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+    }
+  }
+
+  // Some CCs need callee pop.
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
+    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+  } else {
+    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+        !Subtarget->getTargetTriple().isOSMSVCRT() &&
+        argsAreStructReturn(Ins) == StackStructReturn)
+      FuncInfo->setBytesToPopOnReturn(4);
+  }
+
+  if (!Is64Bit) {
+    // RegSaveFrameIndex is X86-64 only.
+    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+    if (CallConv == CallingConv::X86_FastCall ||
+        CallConv == CallingConv::X86_ThisCall)
+      // fastcc functions can't have varargs.
+      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+  }
+
+  FuncInfo->setArgumentStackSize(StackSize);
+
+  return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+                                    SDValue StackPtr, SDValue Arg,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    ISD::ArgFlagsTy Flags) const {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+  if (Flags.isByVal())
+    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+
+  return DAG.getStore(Chain, dl, Arg, PtrOff,
+                      MachinePointerInfo::getStack(LocMemOffset),
+                      false, false, 0);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+                                           SDValue &OutRetAddr, SDValue Chain,
+                                           bool IsTailCall, bool Is64Bit,
+                                           int FPDiff, SDLoc dl) const {
+  // Adjust the Return address stack slot.
+  EVT VT = getPointerTy();
+  OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+  // Load the "old" Return address.
+  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
+                           false, false, false, 0);
+  return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, SDLoc dl) {
+  // Store the return address to the appropriate stack slot.
+  if (!FPDiff) return Chain;
+  // Calculate the new stack slot for the return address.
+  int NewReturnAddrFI =
+    MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+                                         false);
+  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
+                       false, false, 0);
+  return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool &isTailCall                      = CLI.IsTailCall;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool Is64Bit        = Subtarget->is64Bit();
+  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
+  StructReturnType SR = callIsStructReturn(Outs);
+  bool IsSibcall      = false;
+
+  if (MF.getTarget().Options.DisableTailCalls)
+    isTailCall = false;
+
+  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+  if (IsMustTail) {
+    // Force this to be a tail call.  The verifier rules are enough to ensure
+    // that we can lower this successfully without moving the return address
+    // around.
+    isTailCall = true;
+  } else if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, SR != NotStructReturn,
+                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+                    Outs, OutVals, Ins, DAG);
+
+    // Sibcalls are automatically detected tailcalls which do not require
+    // ABI changes.
+    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+      IsSibcall = true;
+
+    if (isTailCall)
+      ++NumTailCalls;
+  }
+
+  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+         "Var args not supported with calling convention fastcc, ghc or hipe");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
+                 ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64
+  if (IsWin64)
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  if (IsSibcall)
+    // This is a sibcall. The memory operands are available in caller's
+    // own caller's stack.
+    NumBytes = 0;
+  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+           IsTailCallConvention(CallConv))
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+  int FPDiff = 0;
+  if (isTailCall && !IsSibcall && !IsMustTail) {
+    // Lower arguments at fp - stackoffset + fpdiff.
+    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+    FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the returnaddr stackslot.
+    // But only set if delta is greater than previous delta.
+    if (FPDiff < X86Info->getTCReturnAddrDelta())
+      X86Info->setTCReturnAddrDelta(FPDiff);
+  }
+
+  unsigned NumBytesToPush = NumBytes;
+  unsigned NumBytesToPop = NumBytes;
+
+  // If we have an inalloca argument, all stack space has already been allocated
+  // for us and be right at the top of the stack.  We don't support multiple
+  // arguments passed in memory when using inalloca.
+  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+    NumBytesToPush = 0;
+    assert(ArgLocs.back().getLocMemOffset() == 0 &&
+           "an inalloca argument must be the only memory argument");
+  }
+
+  if (!IsSibcall)
+    Chain = DAG.getCALLSEQ_START(
+        Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
+
+  SDValue RetAddrFrIdx;
+  // Load return address for tail calls.
+  if (isTailCall && FPDiff)
+    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+                                    Is64Bit, FPDiff, dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization arguments are handle later.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // Skip inalloca arguments, they have already been written.
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (Flags.isInAlloca())
+      continue;
+
+    CCValAssign &VA = ArgLocs[i];
+    EVT RegVT = VA.getLocVT();
+    SDValue Arg = OutVals[i];
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      if (RegVT.is128BitVector()) {
+        // Special case: passing MMX values in XMM registers.
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+      } else
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+      break;
+    case CCValAssign::Indirect: {
+      // Store the argument.
+      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+                           MachinePointerInfo::getFixedStack(FI),
+                           false, false, 0);
+      Arg = SpillSlot;
+      break;
+    }
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      if (isVarArg && IsWin64) {
+        // Win64 ABI requires argument XMM reg to be copied to the corresponding
+        // shadow reg if callee is a varargs function.
+        unsigned ShadowReg = 0;
+        switch (VA.getLocReg()) {
+        case X86::XMM0: ShadowReg = X86::RCX; break;
+        case X86::XMM1: ShadowReg = X86::RDX; break;
+        case X86::XMM2: ShadowReg = X86::R8; break;
+        case X86::XMM3: ShadowReg = X86::R9; break;
+        }
+        if (ShadowReg)
+          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+      }
+    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+      assert(VA.isMemLoc());
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                      getPointerTy());
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  if (Subtarget->isPICStyleGOT()) {
+    // ELF / PIC requires GOT in the EBX register before function calls via PLT
+    // GOT pointer.
+    if (!isTailCall) {
+      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
+    } else {
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
+      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+      if (G && !G->getGlobal()->hasHiddenVisibility() &&
+          !G->getGlobal()->hasProtectedVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
+    }
+  }
+
+  if (Is64Bit && isVarArg && !IsWin64) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an ubound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // Count the number of XMM registers allocated.
+    static const MCPhysReg XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+
+    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
+  }
+
+  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+    SmallVector<SDValue, 8> MemOpChains2;
+    SDValue FIN;
+    int FI = 0;
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isRegLoc())
+        continue;
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[i];
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+      // Skip inalloca arguments.  They don't require any work.
+      if (Flags.isInAlloca())
+        continue;
+      // Create frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                        RegInfo->getStackRegister(),
+                                        getPointerTy());
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(
+          DAG.getStore(ArgChain, dl, Arg, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       false, false, 0));
+      }
+    }
+
+    if (!MemOpChains2.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+    // Store the return address to the appropriate stack slot.
+    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+                                     getPointerTy(), RegInfo->getSlotSize(),
+                                     FPDiff, dl);
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
+    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+    // In the 64-bit large code model, we have to make all calls
+    // through a register, since the call instruction's 32-bit
+    // pc-relative offset may not be large enough to hold the whole
+    // address.
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // If the callee is a GlobalAddress node (quite common, every direct call
+    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+    // it.
+
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    const GlobalValue *GV = G->getGlobal();
+    if (!GV->hasDLLImportStorageClass()) {
+      unsigned char OpFlags = 0;
+      bool ExtraLoad = false;
+      unsigned WrapperKind = ISD::DELETED_NODE;
+
+      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+      // external symbols most go through the PLT in PIC mode.  If the symbol
+      // has hidden or protected visibility, or if it is static or local, then
+      // we don't need to use the PLT - we can directly call it.
+      if (Subtarget->isTargetELF() &&
+          DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
+          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+        OpFlags = X86II::MO_PLT;
+      } else if (Subtarget->isPICStyleStubAny() &&
+                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
+                 (!Subtarget->getTargetTriple().isMacOSX() ||
+                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+        // PC-relative references to external symbols should go through $stub,
+        // unless we're building with the leopard linker or later, which
+        // automatically synthesizes these stubs.
+        OpFlags = X86II::MO_DARWIN_STUB;
+      } else if (Subtarget->isPICStyleRIPRel() &&
+                 isa<Function>(GV) &&
+                 cast<Function>(GV)->getAttributes().
+                   hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::NonLazyBind)) {
+        // If the function is marked as non-lazy, generate an indirect call
+        // which loads from the GOT directly. This avoids runtime overhead
+        // at the cost of eager binding (and one extra byte of encoding).
+        OpFlags = X86II::MO_GOTPCREL;
+        WrapperKind = X86ISD::WrapperRIP;
+        ExtraLoad = true;
+      }
+
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
+                                          G->getOffset(), OpFlags);
+
+      // Add a wrapper if needed.
+      if (WrapperKind != ISD::DELETED_NODE)
+        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+      // Add extra indirection if needed.
+      if (ExtraLoad)
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+                             MachinePointerInfo::getGOT(),
+                             false, false, false, 0);
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    unsigned char OpFlags = 0;
+
+    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
+    // external symbols should go through the PLT.
+    if (Subtarget->isTargetELF() &&
+        DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
+      OpFlags = X86II::MO_PLT;
+    } else if (Subtarget->isPICStyleStubAny() &&
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = X86II::MO_DARWIN_STUB;
+    }
+
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+                                         OpFlags);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+
+  if (!IsSibcall && isTailCall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytesToPop, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, dl);
+    InFlag = Chain.getValue(1);
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  if (isTailCall) {
+    // We used to do:
+    //// If this is the first return lowered for this function, add the regs
+    //// to the liveout set for the function.
+    // This isn't right, although it's probably harmless on x86; liveouts
+    // should be computed from returns not tail calls.  Consider a void
+    // function making a tail call to a function returning int.
+    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+  }
+
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPop;
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       DAG.getTarget().Options.GuaranteedTailCallOpt))
+    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+           !Subtarget->getTargetTriple().isOSMSVCRT() &&
+           SR == StackStructReturn)
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+    NumBytesForCalleeToPop = 4;
+  else
+    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
+
+  // Returns a flag for retval copy to use.
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytesToPop, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPop,
+                                                     true),
+                               InFlag, dl);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like std call, callee cleans arguments, convention except that ECX is
+//  reserved for storing the tail called function address. Only 2 registers are
+//  free for argument passing (inreg). Tail call optimization is performed
+//  provided:
+//                * tailcallopt is enabled
+//                * caller/callee are fastcc
+//  On X86_64 architecture with GOT-style position independent code only local
+//  (within module) calls are supported at the moment.
+//  To keep the stack aligned according to platform abi the function
+//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
+//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+//  If a tail called function callee has more arguments than the caller the
+//  caller needs to make sure that there is room to move the RETADDR to. This is
+//  achieved by reserving an area the size of the argument delta right after the
+//  original RETADDR, but before the saved framepointer or the spilled registers
+//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
+/// for a 16 byte align requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                               SelectionDAG& DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetMachine &TM = MF.getTarget();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  uint64_t AlignMask = StackAlignment - 1;
+  int64_t Offset = StackSize;
+  unsigned SlotSize = RegInfo->getSlotSize();
+  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+    // Number smaller than 12 so just add the difference.
+    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+  } else {
+    // Mask out lower bits, add stackalignment once plus the 12 bytes.
+    Offset = ((~AlignMask) & Offset) + StackAlignment +
+      (StackAlignment-SlotSize);
+  }
+  return Offset;
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const X86InstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))
+        return false;
+    } else {
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        Bytes = Flags.getByValSize();
+      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+    FI = FINode->getIndex();
+    Bytes = Flags.getByValSize();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
+                                                     Type *RetTy,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG &DAG) const {
+  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
+    return false;
+
+  // If -tailcallopt is specified, make fastcc functions tail-callable.
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+
+  // If the function return type is x86_fp80 and the callee return type is not,
+  // then the FP_EXTEND of the call result is not a nop. It's not safe to
+  // perform a tailcall optimization here.
+  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+    return false;
+
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+
+  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+    if (IsTailCallConvention(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+  // emit a special epilogue.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  if (RegInfo->needsStackRealignment(MF))
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // An stdcall/thiscall caller is expected to clean up its arguments; the
+  // callee isn't going to do that.
+  // FIXME: this is more restrictive than needed. We could produce a tailcall
+  // when the stack adjustment matches. For example, with a thiscall that takes
+  // only one argument.
+  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
+                   CallerCC == CallingConv::X86_ThisCall))
+    return false;
+
+  // Do not sibcall optimize vararg calls unless all arguments are passed via
+  // registers.
+  if (isVarArg && !Outs.empty()) {
+
+    // Optimizing for varargs on Win64 is unlikely to be safe without
+    // additional testing.
+    if (IsCalleeWin64 || IsCallerWin64)
+      return false;
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                   DAG.getTarget(), ArgLocs, *DAG.getContext());
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87
+  // stack.  Therefore, if it's not used by the call it is not safe to optimize
+  // this into a sibcall.
+  bool Unused = false;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (!Ins[i].Used) {
+      Unused = true;
+      break;
+    }
+  }
+  if (Unused) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
+                   DAG.getTarget(), RVLocs, *DAG.getContext());
+    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+      CCValAssign &VA = RVLocs[i];
+      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+        return false;
+    }
+  }
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                    DAG.getTarget(), RVLocs1, *DAG.getContext());
+    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                    DAG.getTarget(), RVLocs2, *DAG.getContext());
+    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                   DAG.getTarget(), ArgLocs, *DAG.getContext());
+
+    // Allocate shadow area for Win64
+    if (IsCalleeWin64)
+      CCInfo.AllocateStack(32, 8);
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    if (CCInfo.getNextStackOffset()) {
+      MachineFunction &MF = DAG.getMachineFunction();
+      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
+        return false;
+
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const X86InstrInfo *TII =
+          static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        SDValue Arg = OutVals[i];
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+
+    // If the tailcall address may be in a register, then make sure it's
+    // possible to register allocate for it. In 32-bit, the call address can
+    // only target EAX, EDX, or ECX since the tail call must be scheduled after
+    // callee-saved registers are restored. These happen to be the same
+    // registers used to pass 'inreg' arguments so watch out for those.
+    if (!Subtarget->is64Bit() &&
+        ((!isa<GlobalAddressSDNode>(Callee) &&
+          !isa<ExternalSymbolSDNode>(Callee)) ||
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+      unsigned NumInRegs = 0;
+      // In PIC we need an extra register to formulate the address computation
+      // for the callee.
+      unsigned MaxInRegs =
+	(DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        if (!VA.isRegLoc())
+          continue;
+        unsigned Reg = VA.getLocReg();
+        switch (Reg) {
+        default: break;
+        case X86::EAX: case X86::EDX: case X86::ECX:
+          if (++NumInRegs == MaxInRegs)
+            return false;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return X86::createFastISel(funcInfo, libInfo);
+}
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+static bool MayFoldLoad(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+static bool isTargetShuffle(unsigned Opcode) {
+  switch(Opcode) {
+  default: return false;
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::SHUFP:
+  case X86ISD::PALIGNR:
+  case X86ISD::MOVLHPS:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
+  case X86ISD::VPERMI:
+    return true;
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+                                    SDValue V1, SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::MOVDDUP:
+    return DAG.getNode(Opc, dl, VT, V1);
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+                                    SDValue V1, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERMI:
+    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+                                    SDValue V1, SDValue V2, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::PALIGNR:
+  case X86ISD::SHUFP:
+  case X86ISD::VPERM2X128:
+    return DAG.getNode(Opc, dl, VT, V1, V2,
+                       DAG.getConstant(TargetMask, MVT::i8));
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::MOVLHPS:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+    return DAG.getNode(Opc, dl, VT, V1, V2);
+  }
+}
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    unsigned SlotSize = RegInfo->getSlotSize();
+    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                           -(int64_t)SlotSize,
+                                                           false);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                       bool hasSymbolicDisplacement) {
+  // Offset should fit into 32 bit immediate field.
+  if (!isInt<32>(Offset))
+    return false;
+
+  // If we don't have a symbolic displacement - we don't have any extra
+  // restrictions.
+  if (!hasSymbolicDisplacement)
+    return true;
+
+  // FIXME: Some tweaks might be needed for medium code model.
+  if (M != CodeModel::Small && M != CodeModel::Kernel)
+    return false;
+
+  // For small code model we assume that latest object is 16MB before end of 31
+  // bits boundary. We may also accept pretty large negative constants knowing
+  // that all objects are in the positive half of address space.
+  if (M == CodeModel::Small && Offset < 16*1024*1024)
+    return true;
+
+  // For kernel code model we know that all object resist in the negative half
+  // of 32bits address space. We may not accept negative offsets, since they may
+  // be just off and we may accept pretty large positive ones.
+  if (M == CodeModel::Kernel && Offset > 0)
+    return true;
+
+  return false;
+}
+
+/// isCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+  if (IsVarArg)
+    return false;
+
+  switch (CallingConv) {
+  default:
+    return false;
+  case CallingConv::X86_StdCall:
+    return !is64Bit;
+  case CallingConv::X86_FastCall:
+    return !is64Bit;
+  case CallingConv::X86_ThisCall:
+    return !is64Bit;
+  case CallingConv::Fast:
+    return TailCallOpt;
+  case CallingConv::GHC:
+    return TailCallOpt;
+  case CallingConv::HiPE:
+    return TailCallOpt;
+  }
+}
+
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+  switch (X86CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case X86::COND_E:     return true;
+  case X86::COND_G:     return false;
+  case X86::COND_GE:    return false;
+  case X86::COND_L:     return false;
+  case X86::COND_LE:    return false;
+  case X86::COND_NE:    return true;
+  case X86::COND_B:     return true;
+  case X86::COND_A:     return true;
+  case X86::COND_BE:    return true;
+  case X86::COND_AE:    return true;
+  }
+  llvm_unreachable("covered switch fell through?!");
+}
+
+/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
+/// specific condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+  if (!isFP) {
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+        // X > -1   -> X == 0, jump !sign.
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        return X86::COND_NS;
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+        // X < 0   -> X == 0, jump on sign.
+        return X86::COND_S;
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+        // X < 1   -> X <= 0
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        return X86::COND_LE;
+      }
+    }
+
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Invalid integer condition!");
+    case ISD::SETEQ:  return X86::COND_E;
+    case ISD::SETGT:  return X86::COND_G;
+    case ISD::SETGE:  return X86::COND_GE;
+    case ISD::SETLT:  return X86::COND_L;
+    case ISD::SETLE:  return X86::COND_LE;
+    case ISD::SETNE:  return X86::COND_NE;
+    case ISD::SETULT: return X86::COND_B;
+    case ISD::SETUGT: return X86::COND_A;
+    case ISD::SETULE: return X86::COND_BE;
+    case ISD::SETUGE: return X86::COND_AE;
+    }
+  }
+
+  // First determine if it is required or is profitable to flip the operands.
+
+  // If LHS is a foldable load, but RHS is not, flip the condition.
+  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+      !ISD::isNON_EXTLoad(RHS.getNode())) {
+    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+    std::swap(LHS, RHS);
+  }
+
+  switch (SetCCOpcode) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    std::swap(LHS, RHS);
+    break;
+  }
+
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Condcode should be pre-legalized away");
+  case ISD::SETUEQ:
+  case ISD::SETEQ:   return X86::COND_E;
+  case ISD::SETOLT:              // flipped
+  case ISD::SETOGT:
+  case ISD::SETGT:   return X86::COND_A;
+  case ISD::SETOLE:              // flipped
+  case ISD::SETOGE:
+  case ISD::SETGE:   return X86::COND_AE;
+  case ISD::SETUGT:              // flipped
+  case ISD::SETULT:
+  case ISD::SETLT:   return X86::COND_B;
+  case ISD::SETUGE:              // flipped
+  case ISD::SETULE:
+  case ISD::SETLE:   return X86::COND_BE;
+  case ISD::SETONE:
+  case ISD::SETNE:   return X86::COND_NE;
+  case ISD::SETUO:   return X86::COND_P;
+  case ISD::SETO:    return X86::COND_NP;
+  case ISD::SETOEQ:
+  case ISD::SETUNE:  return X86::COND_INVALID;
+  }
+}
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0 || BitSize > 64)
+    return false;
+  return true;
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls within
+/// the specified range (L, H].
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+  return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+  return (Val < 0 || Val == CmpVal);
+}
+
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+                                       unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
+/// the second operand.
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
+  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
+    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+  if (VT == MVT::v2f64 || VT == MVT::v2i64)
+    return (Mask[0] < 2 && Mask[1] < 2);
+  return false;
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
+static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
+  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
+    return false;
+
+  // Lower quadword copied in order or undef.
+  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+    return false;
+
+  // Upper quadword shuffled.
+  for (unsigned i = 4; i != 8; ++i)
+    if (!isUndefOrInRange(Mask[i], 4, 8))
+      return false;
+
+  if (VT == MVT::v16i16) {
+    // Lower quadword copied in order or undef.
+    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
+      return false;
+
+    // Upper quadword shuffled.
+    for (unsigned i = 12; i != 16; ++i)
+      if (!isUndefOrInRange(Mask[i], 12, 16))
+        return false;
+  }
+
+  return true;
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
+static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
+  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
+    return false;
+
+  // Upper quadword copied in order.
+  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+    return false;
+
+  // Lower quadword shuffled.
+  for (unsigned i = 0; i != 4; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, 4))
+      return false;
+
+  if (VT == MVT::v16i16) {
+    // Upper quadword copied in order.
+    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
+      return false;
+
+    // Lower quadword shuffled.
+    for (unsigned i = 8; i != 12; ++i)
+      if (!isUndefOrInRange(Mask[i], 8, 12))
+        return false;
+  }
+
+  return true;
+}
+
+/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PALIGNR.
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
+                          const X86Subtarget *Subtarget) {
+  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
+      (VT.is256BitVector() && !Subtarget->hasInt256()))
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  // Do not handle 64-bit element shuffles with palignr.
+  if (NumLaneElts == 2)
+    return false;
+
+  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+    unsigned i;
+    for (i = 0; i != NumLaneElts; ++i) {
+      if (Mask[i+l] >= 0)
+        break;
+    }
+
+    // Lane is all undef, go to next lane
+    if (i == NumLaneElts)
+      continue;
+
+    int Start = Mask[i+l];
+
+    // Make sure its in this lane in one of the sources
+    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+      return false;
+
+    // If not lane 0, then we must match lane 0
+    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
+      return false;
+
+    // Correct second source to be contiguous with first source
+    if (Start >= (int)NumElts)
+      Start -= NumElts - NumLaneElts;
+
+    // Make sure we're shifting in the right direction.
+    if (Start <= (int)(i+l))
+      return false;
+
+    Start -= i;
+
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != NumLaneElts; ++i) {
+      int Idx = Mask[i+l];
+
+      // Make sure its in this lane
+      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+        return false;
+
+      // If not lane 0, then we must match lane 0
+      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+
+      if (Idx >= (int)NumElts)
+        Idx -= NumElts - NumLaneElts;
+
+      if (!isUndefOrEqual(Idx, Start+i))
+        return false;
+
+    }
+  }
+
+  return true;
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+                                     unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int idx = Mask[i];
+    if (idx < 0)
+      continue;
+    else if (idx < (int)NumElems)
+      Mask[i] = idx + NumElems;
+    else
+      Mask[i] = idx - NumElems;
+  }
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElems = NumElems/NumLanes;
+
+  if (NumLaneElems != 2 && NumLaneElems != 4)
+    return false;
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  bool symetricMaskRequired =
+    (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
+  // VSHUFPSY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
+  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
+  //
+  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
+  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
+  //
+  // VSHUFPDY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>      X3       X2       X1       X0
+  //  SRC2 =>      Y3       Y2       Y1       Y0
+  //
+  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
+  //
+  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
+  unsigned HalfLaneElems = NumLaneElems/2;
+  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      int Idx = Mask[i+l];
+      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+        return false;
+      // For VSHUFPSY, the mask of the second half must be the same as the
+      // first but with the appropriate offsets. This works in the same way as
+      // VPERMILPS works with masks.
+      if (!symetricMaskRequired || Idx < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Idx - l;
+        continue;
+      }
+      if ((signed)(Idx - l) != MaskVal[i])
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 4)
+    return false;
+
+  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
+  return isUndefOrEqual(Mask[0], 6) &&
+         isUndefOrEqual(Mask[1], 7) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 4)
+    return false;
+
+  return isUndefOrEqual(Mask[0], 2) &&
+         isUndefOrEqual(Mask[1], 3) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i + NumElems))
+      return false;
+
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  return true;
+}
+
+/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
+static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
+      return false;
+
+  return true;
+}
+
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+    return false;
+
+  unsigned CorrectPosV1 = 0;
+  unsigned CorrectPosV2 = 0;
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+    if (Mask[i] == -1) {
+      ++CorrectPosV1;
+      ++CorrectPosV2;
+      continue;
+    }
+
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+  }
+
+  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+    // We have 3 elements (undefs count as elements from any vector) from one
+    // vector, and one from another.
+    return true;
+
+  return false;
+}
+
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+                               SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDLoc dl(SVOp);
+
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return SDValue();
+
+  ArrayRef<int> Mask = SVOp->getMask();
+
+  // These are the special masks that may be optimized.
+  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
+  bool MatchEvenMask = true;
+  bool MatchOddMask  = true;
+  for (int i=0; i<8; ++i) {
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+      MatchEvenMask = false;
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+      MatchOddMask = false;
+  }
+
+  if (!MatchEvenMask && !MatchOddMask)
+    return SDValue();
+
+  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+  SDValue Op0 = SVOp->getOperand(0);
+  SDValue Op1 = SVOp->getOperand(1);
+
+  if (MatchEvenMask) {
+    // Shift the second operand right to 32 bits.
+    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+  } else {
+    // Shift the first operand left to 32 bits.
+    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+  }
+  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
+                         bool HasInt256, bool V2IsSplat = false) {
+
+  assert(VT.getSizeInBits() >= 128 &&
+         "Unsupported vector type for unpckl");
+
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
+
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
+
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
+          return false;
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
+                         bool HasInt256, bool V2IsSplat = false) {
+  assert(VT.getSizeInBits() >= 128 &&
+         "Unsupported vector type for unpckh");
+
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
+
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
+
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
+          return false;
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
+  unsigned NumElts = VT.getVectorNumElements();
+  bool Is256BitVec = VT.is256BitVector();
+
+  if (VT.is512BitVector())
+    return false;
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+
+  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+  // FIXME: Need a better way to get rid of this, there's no latency difference
+  // between UNPCKLPD and MOVDDUP, the later should always be checked first and
+  // the former later. We should also remove the "_undef" special mask.
+  if (NumElts == 4 && Is256BitVec)
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
+
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (!isUndefOrEqual(BitI1, j))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if (VT.is512BitVector())
+    return false;
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (!isUndefOrEqual(BitI1, j))
+        return false;
+    }
+  }
+  return true;
+}
+
+// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
+// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+  if (!VT.is512BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts/2;
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+      *Imm = 1;
+      return true;
+    }
+  }
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+      *Imm = 0;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
+  if (VT.getVectorElementType().getSizeInBits() < 32)
+    return false;
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if (!isUndefOrEqual(Mask[0], NumElts))
+    return false;
+
+  for (unsigned i = 1; i != NumElts; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  return true;
+}
+
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
+  if (!HasFp256 || !VT.is256BitVector())
+    return false;
+
+  // The shuffle result is divided into half A and half B. In total the two
+  // sources have 4 halves, namely: C, D, E, F. The final values of A and
+  // B must come from C, D, E or F.
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+  bool MatchA = false, MatchB = false;
+
+  // Check if A comes from one of C, D, E, F.
+  for (unsigned Half = 0; Half != 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  // Check if B comes from one of C, D, E, F.
+  for (unsigned Half = 0; Half != 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
+  MVT VT = SVOp->getSimpleValueType(0);
+
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+
+  unsigned FstHalf = 0, SndHalf = 0;
+  for (unsigned i = 0; i < HalfSize; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      FstHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      SndHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+
+  return (FstHalf | (SndHalf << 4));
+}
+
+// Symetric in-lane mask. Each lane has 4 elements (for imm8)
+static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize < 32)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  Imm8 = 0;
+  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      Imm8 |= Mask[i] << (i*2);
+    }
+    return true;
+  }
+
+  unsigned LaneSize = 4;
+  SmallVector<int, 4> MaskVal(LaneSize, -1);
+
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (Mask[i+l] < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Mask[i+l] - l;
+        Imm8 |= MaskVal[i] << (i*2);
+        continue;
+      }
+      if (Mask[i+l] != (signed)(MaskVal[i]+l))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (VT.getSizeInBits() < 256 || EltSize < 32)
+    return false;
+  bool symetricMaskRequired = (EltSize == 32);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned LaneSize = NumElts/NumLanes;
+  // 2 or 4 elements in one lane
+
+  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (symetricMaskRequired) {
+        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+          ExpectedMaskVal[i] = Mask[i+l] - l;
+          continue;
+        }
+        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
+/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
+/// element of vector 2 and the other elements to come from vector 1 in order.
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
+                               bool V2IsSplat = false, bool V2IsUndef = false) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumOps = VT.getVectorNumElements();
+  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+    return false;
+
+  if (!isUndefOrEqual(Mask[0], 0))
+    return false;
+
+  for (unsigned i = 1; i != NumOps; ++i)
+    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+      return false;
+
+  return true;
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
+                           const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
+    return false;
+
+  // "i+1" is the value the indexed mask element must have
+  for (unsigned i = 0; i != NumElems; i += 2)
+    if (!isUndefOrEqual(Mask[i], i+1) ||
+        !isUndefOrEqual(Mask[i+1], i+1))
+      return false;
+
+  return true;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
+                           const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
+    return false;
+
+  // "i" is the value the indexed mask element must have
+  for (unsigned i = 0; i != NumElems; i += 2)
+    if (!isUndefOrEqual(Mask[i], i) ||
+        !isUndefOrEqual(Mask[i+1], i))
+      return false;
+
+  return true;
+}
+
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
+  if (!HasFp256 || !VT.is256BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts != 4)
+    return false;
+
+  for (unsigned i = 0; i != NumElts/2; ++i)
+    if (!isUndefOrEqual(Mask[i], 0))
+      return false;
+  for (unsigned i = NumElts/2; i != NumElts; ++i)
+    if (!isUndefOrEqual(Mask[i], NumElts/2))
+      return false;
+  return true;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned e = VT.getVectorNumElements() / 2;
+  for (unsigned i = 0; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+  for (unsigned i = 0; i != e; ++i)
+    if (!isUndefOrEqual(Mask[e+i], i))
+      return false;
+  return true;
+}
+
+/// isVEXTRACTIndex - Return true if the specified
+/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+/// suitable for instruction that extract 128 or 256 bit vectors
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    return false;
+
+  // The index should be aligned on a vecWidth-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+  bool Result = (Index * ElSize) % vecWidth == 0;
+
+  return Result;
+}
+
+/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for input to
+/// insertion of 128 or 256-bit subvectors
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    return false;
+  // The index should be aligned on a vecWidth-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+  bool Result = (Index * ElSize) % vecWidth == 0;
+
+  return Result;
+}
+
+bool X86::isVINSERT128Index(SDNode *N) {
+  return isVINSERTIndex(N, 128);
+}
+
+bool X86::isVINSERT256Index(SDNode *N) {
+  return isVINSERTIndex(N, 256);
+}
+
+bool X86::isVEXTRACT128Index(SDNode *N) {
+  return isVEXTRACTIndex(N, 128);
+}
+
+bool X86::isVEXTRACT256Index(SDNode *N) {
+  return isVEXTRACTIndex(N, 256);
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
+/// Handles 128-bit and 256-bit.
+static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getSimpleValueType(0);
+
+  assert((VT.getSizeInBits() >= 128) &&
+         "Unsupported vector type for PSHUF/SHUFP");
+
+  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
+  // independently on 128-bit lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+         "Only supports 2, 4 or 8 elements per lane");
+
+  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
+  unsigned Mask = 0;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt < 0) continue;
+    Elt &= NumLaneElts - 1;
+    unsigned ShAmt = (i << Shift) % 8;
+    Mask |= Elt << ShAmt;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
+static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getSimpleValueType(0);
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned Mask = 0;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 nodes per lane, but we only care about the last 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i+4);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits.
+      Mask |= Elt << (i * 2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
+static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getSimpleValueType(0);
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned Mask = 0;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 nodes per lane, but we only care about the first 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits
+      Mask |= Elt << (i * 2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  unsigned EltSize = VT.is512BitVector() ? 1 :
+    VT.getVectorElementType().getSizeInBits() >> 3;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  int Val = 0;
+  unsigned i;
+  for (i = 0; i != NumElts; ++i) {
+    Val = SVOp->getMaskElt(i);
+    if (Val >= 0)
+      break;
+  }
+  if (Val >= (int)NumElts)
+    Val -= NumElts - NumLaneElts;
+
+  assert(Val - i > 0 && "PALIGNR imm should be positive");
+  return (Val - i) * EltSize;
+}
+
+static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    llvm_unreachable("Illegal extract subvector for VEXTRACT");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VecVT = N->getOperand(0).getSimpleValueType();
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    llvm_unreachable("Illegal insert subvector for VINSERT");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VecVT = N->getSimpleValueType(0);
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+  return getExtractVEXTRACTImmediate(N, 128);
+}
+
+/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+  return getExtractVEXTRACTImmediate(N, 256);
+}
+
+/// getInsertVINSERT128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+  return getInsertVINSERTImmediate(N, 128);
+}
+
+/// getInsertVINSERT256Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+  return getInsertVINSERTImmediate(N, 256);
+}
+
+/// isZero - Returns true if Elt is a constant integer zero
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+  if (isZero(Elt))
+    return true;
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
+    return CFP->getValueAPF().isPosZero();
+  return false;
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+  if (VT.getVectorNumElements() != 4)
+    return false;
+  for (unsigned i = 0, e = 2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i+2))
+      return false;
+  for (unsigned i = 2; i != 4; ++i)
+    if (!isUndefOrEqual(Mask[i], i+4))
+      return false;
+  return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
+  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+  N = N->getOperand(0).getNode();
+  if (!ISD::isNON_EXTLoad(N))
+    return false;
+  if (LD)
+    *LD = cast<LoadSDNode>(N);
+  return true;
+}
+
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  // Check for any non-constant elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    switch (N->getOperand(i).getNode()->getOpcode()) {
+    case ISD::UNDEF:
+    case ISD::ConstantFP:
+    case ISD::Constant:
+      break;
+    default:
+      return false;
+    }
+
+  // Vectors of all-zeros and all-ones are materialized with special
+  // instructions rather than being loaded.
+  return !ISD::isBuildVectorAllZeros(N) &&
+         !ISD::isBuildVectorAllOnes(N);
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+                               ArrayRef<int> Mask, MVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+    return false;
+  // Is V2 is a vector load, don't do this transformation. We will try to use
+  // load folding shufps op.
+  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i+NumElems))
+      return false;
+  return true;
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to an zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int Idx = N->getMaskElt(i);
+    if (Idx >= (int)NumElems) {
+      unsigned Opc = V2.getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
+        return false;
+    } else if (Idx >= 0) {
+      unsigned Opc = V1.getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !X86::isZeroNode(V1.getOperand(Idx)))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG, SDLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build SSE zero vectors as <4 x i32> bitcasted
+  // to their dest type. This ensures they get CSE'd.
+  SDValue Vec;
+  if (VT.is128BitVector()) {  // SSE
+    if (Subtarget->hasSSE2()) {  // SSE2
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    } else { // SSE1
+      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+    }
+  } else if (VT.is256BitVector()) { // AVX
+    if (Subtarget->hasInt256()) { // AVX2
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
+    } else {
+      // 256-bit logic and arithmetic instructions in AVX are all
+      // floating-point, no support for integer ops. Emit fp zeroed vectors.
+      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
+    }
+  } else if (VT.is512BitVector()) { // AVX-512
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                        Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+  } else if (VT.getScalarType() == MVT::i1) {
+    assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+  } else
+    llvm_unreachable("Unexpected vector type");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
+                             SDLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec;
+  if (VT.is256BitVector()) {
+    if (HasInt256) { // AVX2
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
+    } else { // AVX
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+    }
+  } else if (VT.is128BitVector()) {
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  } else
+    llvm_unreachable("Unexpected vector type");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+/// that point to V2 points to its first element.
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    if (Mask[i] > (int)NumElems) {
+      Mask[i] = NumElems;
+    }
+  }
+}
+
+/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+                       SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  Mask.push_back(NumElems);
+  for (unsigned i = 1; i != NumElems; ++i)
+    Mask.push_back(i);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+    Mask.push_back(i);
+    Mask.push_back(i + NumElems);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
+    Mask.push_back(i + Half);
+    Mask.push_back(i + NumElems + Half);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target suported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+  MVT VT = V.getSimpleValueType();
+  int NumElems = VT.getVectorNumElements();
+  SDLoc dl(V);
+
+  while (NumElems > 4) {
+    if (EltNo < NumElems/2) {
+      V = getUnpackl(DAG, dl, VT, V, V);
+    } else {
+      V = getUnpackh(DAG, dl, VT, V, V);
+      EltNo -= NumElems/2;
+    }
+    NumElems >>= 1;
+  }
+  return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+  MVT VT = V.getSimpleValueType();
+  SDLoc dl(V);
+
+  if (VT.is128BitVector()) {
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+                             &SplatMask[0]);
+  } else if (VT.is256BitVector()) {
+    // To use VPERMILPS to splat scalars, the second half of indicies must
+    // refer to the higher part, which is a duplication of the lower one,
+    // because VPERMILPS can only handle in-lane permutations.
+    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+                             &SplatMask[0]);
+  } else
+    llvm_unreachable("Vector size not supported");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+  MVT SrcVT = SV->getSimpleValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  SDLoc dl(SV);
+
+  int EltNo = SV->getSplatIndex();
+  int NumElems = SrcVT.getVectorNumElements();
+  bool Is256BitVec = SrcVT.is256BitVector();
+
+  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
+         "Unknown how to promote splat for type");
+
+  // Extract the 128-bit part containing the splat element and update
+  // the splat element index when it refers to the higher register.
+  if (Is256BitVec) {
+    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+    if (EltNo >= NumElems/2)
+      EltNo -= NumElems/2;
+  }
+
+  // All i16 and i8 vector types can't be used directly by a generic shuffle
+  // instruction because the target has no such instruction. Generate shuffles
+  // which repeat i16 and i8 several times until they fit in i32, and then can
+  // be manipulated by target suported shuffles.
+  MVT EltVT = SrcVT.getVectorElementType();
+  if (EltVT == MVT::i8 || EltVT == MVT::i16)
+    V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+  // Recreate the 256-bit vector and place the same 128-bit vector
+  // into the low and high part. This is necessary because we want
+  // to use VPERM* to shuffle the vectors
+  if (Is256BitVec) {
+    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
+  }
+
+  return getLegalSplat(DAG, V1, EltNo);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector of zero or undef vector.  This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+                                           bool IsZero,
+                                           const X86Subtarget *Subtarget,
+                                           SelectionDAG &DAG) {
+  MVT VT = V2.getSimpleValueType();
+  SDValue V1 = IsZero
+    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 16> MaskVec;
+  for (unsigned i = 0; i != NumElems; ++i)
+    // If this is the insertion idx, put the low elt of V2 here.
+    MaskVec.push_back(i == Idx ? NumElems : i);
+  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
+}
+
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if only uses one source.
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
+                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue ImmN;
+
+  IsUnary = false;
+  switch(N->getOpcode()) {
+  case X86ISD::SHUFP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::UNPCKH:
+    DecodeUNPCKHMask(VT, Mask);
+    break;
+  case X86ISD::UNPCKL:
+    DecodeUNPCKLMask(VT, Mask);
+    break;
+  case X86ISD::MOVHLPS:
+    DecodeMOVHLPSMask(NumElems, Mask);
+    break;
+  case X86ISD::MOVLHPS:
+    DecodeMOVLHPSMask(NumElems, Mask);
+    break;
+  case X86ISD::PALIGNR:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::PSHUFD:
+  case X86ISD::VPERMILP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFHW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFLW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VPERMI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD: {
+    // The index 0 always comes from the first element of the second source,
+    // this is why MOVSS and MOVSD are used in the first place. The other
+    // elements come from the other positions of the first source vector
+    Mask.push_back(NumElems);
+    for (unsigned i = 1; i != NumElems; ++i) {
+      Mask.push_back(i);
+    }
+    break;
+  }
+  case X86ISD::VPERM2X128:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    if (Mask.empty()) return false;
+    break;
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+    // Not yet implemented
+    return false;
+  default: llvm_unreachable("unknown target shuffle node");
+  }
+
+  return true;
+}
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+                                   unsigned Depth) {
+  if (Depth == 6)
+    return SDValue();  // Limit search depth.
+
+  SDValue V = SDValue(N, 0);
+  EVT VT = V.getValueType();
+  unsigned Opcode = V.getOpcode();
+
+  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+    int Elt = SV->getMaskElt(Index);
+
+    if (Elt < 0)
+      return DAG.getUNDEF(VT.getVectorElementType());
+
+    unsigned NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+                                         : SV->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+  }
+
+  // Recurse into target specific vector shuffles to find scalars.
+  if (isTargetShuffle(Opcode)) {
+    MVT ShufVT = V.getSimpleValueType();
+    unsigned NumElems = ShufVT.getVectorNumElements();
+    SmallVector<int, 16> ShuffleMask;
+    bool IsUnary;
+
+    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+      return SDValue();
+
+    int Elt = ShuffleMask[Index];
+    if (Elt < 0)
+      return DAG.getUNDEF(ShufVT.getVectorElementType());
+
+    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+                                         : N->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
+                               Depth+1);
+  }
+
+  // Actual nodes that may contain scalar elements
+  if (Opcode == ISD::BITCAST) {
+    V = V.getOperand(0);
+    EVT SrcVT = V.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+
+    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+      return SDValue();
+  }
+
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Index == 0) ? V.getOperand(0)
+                        : DAG.getUNDEF(VT.getVectorElementType());
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Index);
+
+  return SDValue();
+}
+
+/// getNumOfConsecutiveZeros - Return the number of elements of a vector
+/// shuffle operation which come from a consecutively from a zero. The
+/// search can start in two different directions, from left or right.
+/// We count undefs as zeros until PreferredNum is reached.
+static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
+                                         unsigned NumElems, bool ZerosFromLeft,
+                                         SelectionDAG &DAG,
+                                         unsigned PreferredNum = -1U) {
+  unsigned NumZeros = 0;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
+    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
+    if (!Elt.getNode())
+      break;
+
+    if (X86::isZeroNode(Elt))
+      ++NumZeros;
+    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
+      NumZeros = std::min(NumZeros + 1, PreferredNum);
+    else
+      break;
+  }
+
+  return NumZeros;
+}
+
+/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
+/// starting from its index OpIdx. Also tell OpNum which source vector operand.
+static
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+                              unsigned NumElems, unsigned &OpNum) {
+  bool SeenV1 = false;
+  bool SeenV2 = false;
+
+  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
+    int Idx = SVOp->getMaskElt(i);
+    // Ignore undef indicies
+    if (Idx < 0)
+      continue;
+
+    if (Idx < (int)NumElems)
+      SeenV1 = true;
+    else
+      SeenV2 = true;
+
+    // Only accept consecutive elements from the same vector
+    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
+      return false;
+  }
+
+  OpNum = SeenV1 ? 0 : 1;
+  return true;
+}
+
+/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(
+      SVOp, NumElems, false /* check zeros from right */, DAG,
+      SVOp->getMaskElt(0));
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+  //
+  //               V1 = {X, A, B, C}     0
+  //                         \  \  \    /
+  //   vector_shuffle V1, V2 <1, 2, 3, X>
+  //
+  if (!isShuffleMaskConsecutive(SVOp,
+            0,                   // Mask Start Index
+            NumElems-NumZeros,   // Mask End Index(exclusive)
+            NumZeros,            // Where to start looking in the src vector
+            NumElems,            // Number of elements in vector
+            OpSrc))              // Which source operand ?
+    return false;
+
+  isLeft = false;
+  ShAmt = NumZeros;
+  ShVal = SVOp->getOperand(OpSrc);
+  return true;
+}
+
+/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(
+      SVOp, NumElems, true /* check zeros from left */, DAG,
+      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+  //
+  //                           0    { A, B, X, X } = V2
+  //                          / \    /  /
+  //   vector_shuffle V1, V2 <X, X, 4, 5>
+  //
+  if (!isShuffleMaskConsecutive(SVOp,
+            NumZeros,     // Mask Start Index
+            NumElems,     // Mask End Index(exclusive)
+            0,            // Where to start looking in the src vector
+            NumElems,     // Number of elements in vector
+            OpSrc))       // Which source operand ?
+    return false;
+
+  isLeft = true;
+  ShAmt = NumZeros;
+  ShVal = SVOp->getOperand(OpSrc);
+  return true;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  // Although the logic below support any bitwidth size, there are no
+  // shift instructions which handle more than 128-bit vectors.
+  if (!SVOp->getSimpleValueType(0).is128BitVector())
+    return false;
+
+  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
+      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
+    return true;
+
+  return false;
+}
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG,
+                                       const X86Subtarget* Subtarget,
+                                       const TargetLowering &TLI) {
+  if (NumNonZero > 8)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue V;
+  bool First = true;
+  for (unsigned i = 0; i < 16; ++i) {
+    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+    if (ThisIsNonZero && First) {
+      if (NumZero)
+        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+      else
+        V = DAG.getUNDEF(MVT::v8i16);
+      First = false;
+    }
+
+    if ((i & 1) != 0) {
+      SDValue ThisElt, LastElt;
+      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+      if (LastIsNonZero) {
+        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+                              MVT::i16, Op.getOperand(i-1));
+      }
+      if (ThisIsNonZero) {
+        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+                              ThisElt, DAG.getConstant(8, MVT::i8));
+        if (LastIsNonZero)
+          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+      } else
+        ThisElt = LastElt;
+
+      if (ThisElt.getNode())
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+                        DAG.getIntPtrConstant(i/2));
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+                                     unsigned NumNonZero, unsigned NumZero,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget* Subtarget,
+                                     const TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue V;
+  bool First = true;
+  for (unsigned i = 0; i < 8; ++i) {
+    bool isNonZero = (NonZeros & (1 << i)) != 0;
+    if (isNonZero) {
+      if (First) {
+        if (NumZero)
+          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+        else
+          V = DAG.getUNDEF(MVT::v8i16);
+        First = false;
+      }
+      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                      MVT::v8i16, V, Op.getOperand(i),
+                      DAG.getIntPtrConstant(i));
+    }
+  }
+
+  return V;
+}
+
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+                                     unsigned NonZeros, unsigned NumNonZero,
+                                     unsigned NumZero, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget,
+                                     const TargetLowering &TLI) {
+  // We know there's at least one non-zero element
+  unsigned FirstNonZeroIdx = 0;
+  SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+         X86::isZeroNode(FirstNonZero)) {
+    ++FirstNonZeroIdx;
+    FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  }
+
+  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+    return SDValue();
+
+  SDValue V = FirstNonZero.getOperand(0);
+  MVT VVT = V.getSimpleValueType();
+  if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
+    return SDValue();
+
+  unsigned FirstNonZeroDst =
+      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+    SDValue Elem = Op.getOperand(Idx);
+    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+      continue;
+
+    // TODO: What else can be here? Deal with it.
+    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // TODO: Some optimizations are still possible here
+    // ex: Getting one element from a vector, and the rest from another.
+    if (Elem.getOperand(0) != V)
+      return SDValue();
+
+    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+    if (Dst == Idx)
+      ++CorrectIdx;
+    else if (IncorrectIdx == -1U) {
+      IncorrectIdx = Idx;
+      IncorrectDst = Dst;
+    } else
+      // There was already one element with an incorrect index.
+      // We can't optimize this case to an insertps.
+      return SDValue();
+  }
+
+  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+    SDLoc dl(Op);
+    EVT VT = Op.getSimpleValueType();
+    unsigned ElementMoveMask = 0;
+    if (IncorrectIdx == -1U)
+      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+    else
+      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+    SDValue InsertpsMask =
+        DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+  }
+
+  return SDValue();
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
+                         unsigned NumBits, SelectionDAG &DAG,
+                         const TargetLowering &TLI, SDLoc dl) {
+  assert(VT.is128BitVector() && "Unknown type for VShift");
+  EVT ShVT = MVT::v2i64;
+  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
+  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(Opc, dl, ShVT, SrcOp,
+                             DAG.getConstant(NumBits,
+                                  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+}
+
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
+
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    // FIXME: 256-bit vector instructions don't require a strict alignment,
+    // improve this code to support it better.
+    unsigned RequiredAlign = VT.getSizeInBits()/8;
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16 or 32.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute
+        // the exact stack offset and reference FI + adjust offset instead.
+        // If someone *really* cares about this. That's the way to implement it.
+        return SDValue();
+      } else {
+        MFI->setObjectAlignment(FI, RequiredAlign);
+      }
+    }
+
+    // (Offset % 16 or 32) must be multiple of 4. Then address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % RequiredAlign) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~(RequiredAlign-1);
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
+                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    unsigned NumElems = VT.getVectorNumElements();
+
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+                             LD->getPointerInfo().getWithOffset(StartOffset),
+                             false, false, false, 0);
+
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumElems; ++i)
+      Mask.push_back(EltNo);
+
+    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+  }
+
+  return SDValue();
+}
+
+/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
+/// vector of type 'VT', see if the elements can be replaced by a single large
+/// load which has the same value as a build_vector whose operands are 'elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
+///
+/// FIXME: we'd also like to handle the case where the last elements are zero
+/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
+/// There's even a handy isZeroNode for that purpose.
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = Elts.size();
+
+  LoadSDNode *LDBase = nullptr;
+  unsigned LastLoadedElt = -1U;
+
+  // For each element in the initializer, see if we've found a load or an undef.
+  // If we don't find an initial load element, or later load elements are
+  // non-consecutive, bail out.
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Elts[i];
+
+    if (!Elt.getNode() ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+      return SDValue();
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+        return SDValue();
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    LoadSDNode *LD = cast<LoadSDNode>(Elt);
+    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+      return SDValue();
+    LastLoadedElt = i;
+  }
+
+  // If we have found an entire vector of loads and undefs, then return a large
+  // load of the entire vector width starting at the base pointer.  If we found
+  // consecutive loads for the low half, generate a vzext_load node.
+  if (LastLoadedElt == NumElems - 1) {
+
+    if (isAfterLegalize &&
+        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
+    SDValue NewLd = SDValue();
+
+    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                          LDBase->getPointerInfo(),
+                          LDBase->isVolatile(), LDBase->isNonTemporal(),
+                          LDBase->isInvariant(), 0);
+    NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                        LDBase->getPointerInfo(),
+                        LDBase->isVolatile(), LDBase->isNonTemporal(),
+                        LDBase->isInvariant(), LDBase->getAlignment());
+
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                     SDValue(LDBase, 1),
+                                     SDValue(NewLd.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(NewLd.getNode(), 1));
+    }
+
+    return NewLd;
+  }
+  if (NumElems == 4 && LastLoadedElt == 1 &&
+      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+    SDValue ResNode =
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
+                                LDBase->getPointerInfo(),
+                                LDBase->getAlignment(),
+                                false/*isVolatile*/, true/*ReadMem*/,
+                                false/*WriteMem*/);
+
+    // Make sure the newly-created LOAD is in the same position as LDBase in
+    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+    // update uses of LDBase's output chain to use the TokenFactor.
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(ResNode.getNode(), 1));
+    }
+
+    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
+  }
+  return SDValue();
+}
+
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+                                    SelectionDAG &DAG) {
+  if (!Subtarget->hasFp256())
+    return SDValue();
+
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+         "Unsupported vector type for broadcast.");
+
+  SDValue Ld;
+  bool ConstSplatVal;
+
+  switch (Op.getOpcode()) {
+    default:
+      // Unknown pattern found.
+      return SDValue();
+
+    case ISD::BUILD_VECTOR: {
+      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+      BitVector UndefElements;
+      SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+      // We need a splat of a single value to use broadcast, and it doesn't
+      // make any sense if the value is only in one element of the vector.
+      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+        return SDValue();
+
+      Ld = Splat;
+      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                       Ld.getOpcode() == ISD::ConstantFP);
+
+      // Make sure that all of the users of a non-constant load are from the
+      // BUILD_VECTOR node.
+      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+        return SDValue();
+      break;
+    }
+
+    case ISD::VECTOR_SHUFFLE: {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+      // Shuffles must have a splat mask where the first element is
+      // broadcasted.
+      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+        return SDValue();
+
+      SDValue Sc = Op.getOperand(0);
+      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+          Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+        if (!Subtarget->hasInt256())
+          return SDValue();
+
+        // Use the register form of the broadcast instruction available on AVX2.
+        if (VT.getSizeInBits() >= 256)
+          Sc = Extract128BitVector(Sc, 0, DAG, dl);
+        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+      }
+
+      Ld = Sc.getOperand(0);
+      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                       Ld.getOpcode() == ISD::ConstantFP);
+
+      // The scalar_to_vector node and the suspected
+      // load node must have exactly one user.
+      // Constants may have multiple users.
+
+      // AVX-512 has register version of the broadcast
+      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+        Ld.getValueType().getSizeInBits() >= 32;
+      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
+          !hasRegVer))
+        return SDValue();
+      break;
+    }
+  }
+
+  bool IsGE256 = (VT.getSizeInBits() >= 256);
+
+  // Handle the broadcasting a single constant scalar from the constant pool
+  // into a vector. On Sandybridge it is still better to load a constant vector
+  // from the constant pool and not to broadcast it from a scalar.
+  if (ConstSplatVal && Subtarget->hasInt256()) {
+    EVT CVT = Ld.getValueType();
+    assert(!CVT.isVector() && "Must not broadcast a vector type");
+    unsigned ScalarSize = CVT.getSizeInBits();
+
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
+      const Constant *C = nullptr;
+      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+        C = CI->getConstantIntValue();
+      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+        C = CF->getConstantFPValue();
+
+      assert(C && "Invalid constant type");
+
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, Alignment);
+
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+    }
+  }
+
+  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+
+  // Handle AVX2 in-register broadcasts.
+  if (!IsLoad && Subtarget->hasInt256() &&
+      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The scalar source must be a normal load.
+  if (!IsLoad)
+    return SDValue();
+
+  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
+  // double since there is no vbroadcastsd xmm
+  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+  }
+
+  // Unsupported broadcast.
+  return SDValue();
+}
+
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+                                         SDValue ExtIdx) {
+  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+    return Idx;
+
+  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+  // lowered this:
+  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+  // to:
+  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
+  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
+  //                           undef)
+  //                       Constant<0>)
+  // In this case the vector is the extract_subvector expression and the index
+  // is 2, as specified by the shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+  SDValue ShuffleVec = SVOp->getOperand(0);
+  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+  assert(ShuffleVecVT.getVectorElementType() ==
+         ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+  int ShuffleIdx = SVOp->getMaskElt(Idx);
+  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+    ExtractedFromVec = ShuffleVec;
+    return ShuffleIdx;
+  }
+  return Idx;
+}
+
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  // Skip if insert_vec_elt is not supported.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  unsigned NumElems = Op.getNumOperands();
+
+  SDValue VecIn1;
+  SDValue VecIn2;
+  SmallVector<unsigned, 4> InsertIndices;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Opc = Op.getOperand(i).getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than 1 elements need inserting.
+      if (InsertIndices.size() > 1)
+        return SDValue();
+
+      InsertIndices.push_back(i);
+      continue;
+    }
+
+    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
+
+    // Quit if extracted from vector of different type.
+    if (ExtractedFromVec.getValueType() != VT)
+      return SDValue();
+
+    if (!VecIn1.getNode())
+      VecIn1 = ExtractedFromVec;
+    else if (VecIn1 != ExtractedFromVec) {
+      if (!VecIn2.getNode())
+        VecIn2 = ExtractedFromVec;
+      else if (VecIn2 != ExtractedFromVec)
+        // Quit if more than 2 vectors to shuffle
+        return SDValue();
+    }
+
+    if (ExtractedFromVec == VecIn1)
+      Mask[i] = Idx;
+    else if (ExtractedFromVec == VecIn2)
+      Mask[i] = Idx + NumElems;
+  }
+
+  if (!VecIn1.getNode())
+    return SDValue();
+
+  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+    unsigned Idx = InsertIndices[i];
+    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+                     DAG.getIntPtrConstant(Idx));
+  }
+
+  return NV;
+}
+
+// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getSimpleValueType();
+  assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
+         "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+  SDLoc dl(Op);
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+  }
+
+  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+    SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+  }
+
+  bool AllContants = true;
+  uint64_t Immediate = 0;
+  int NonConstIdx = -1;
+  bool IsSplat = true;
+  unsigned NumNonConsts = 0;
+  unsigned NumConsts = 0;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    if (In.getOpcode() == ISD::UNDEF)
+      continue;
+    if (!isa<ConstantSDNode>(In)) {
+      AllContants = false;
+      NonConstIdx = idx;
+      NumNonConsts++;
+    }
+    else {
+      NumConsts++;
+      if (cast<ConstantSDNode>(In)->getZExtValue())
+      Immediate |= (1ULL << idx);
+    }
+    if (In != Op.getOperand(0))
+      IsSplat = false;
+  }
+
+  if (AllContants) {
+    SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
+      DAG.getConstant(Immediate, MVT::i16));
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  if (NumNonConsts == 1 && NonConstIdx != 0) {
+    SDValue DstVec;
+    if (NumConsts) {
+      SDValue VecAsImm = DAG.getConstant(Immediate,
+                                         MVT::getIntegerVT(VT.getSizeInBits()));
+      DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
+    }
+    else 
+      DstVec = DAG.getUNDEF(VT);
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+                       Op.getOperand(NonConstIdx),
+                       DAG.getIntPtrConstant(NonConstIdx));
+  }
+  if (!IsSplat && (NonConstIdx != 0))
+    llvm_unreachable("Unsupported BUILD_VECTOR operation");
+  MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
+  SDValue Select;
+  if (IsSplat)
+    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+                          DAG.getConstant(-1, SelectVT),
+                          DAG.getConstant(0, SelectVT));
+  else
+    Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+                         DAG.getConstant((Immediate | 1), SelectVT),
+                         DAG.getConstant(Immediate, SelectVT));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Select);
+}
+
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+/// 
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function checks that the build_vector \p N in input implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+                              SelectionDAG &DAG,
+                              unsigned BaseIdx, unsigned LastIdx,
+                              SDValue &V0, SDValue &V1) {
+  EVT VT = N->getValueType(0);
+
+  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+         "Invalid Vector in input!");
+  
+  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+  bool CanFold = true;
+  unsigned ExpectedVExtractIdx = BaseIdx;
+  unsigned NumElts = LastIdx - BaseIdx;
+  V0 = DAG.getUNDEF(VT);
+  V1 = DAG.getUNDEF(VT);
+
+  // Check if N implements a horizontal binop.
+  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+    SDValue Op = N->getOperand(i + BaseIdx);
+
+    // Skip UNDEFs.
+    if (Op->getOpcode() == ISD::UNDEF) {
+      // Update the expected vector extract index.
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+      ExpectedVExtractIdx += 2;
+      continue;
+    }
+
+    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+    if (!CanFold)
+      break;
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    // Try to match the following pattern:
+    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0) == Op1.getOperand(0) &&
+        isa<ConstantSDNode>(Op0.getOperand(1)) &&
+        isa<ConstantSDNode>(Op1.getOperand(1)));
+    if (!CanFold)
+      break;
+
+    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+    if (i * 2 < NumElts) {
+      if (V0.getOpcode() == ISD::UNDEF)
+        V0 = Op0.getOperand(0);
+    } else {
+      if (V1.getOpcode() == ISD::UNDEF)
+        V1 = Op0.getOperand(0);
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+    }
+
+    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+    if (I0 == ExpectedVExtractIdx)
+      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+      // Try to match the following dag sequence:
+      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+    } else
+      CanFold = false;
+
+    ExpectedVExtractIdx += 2;
+  }
+
+  return CanFold;
+}
+
+/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector. 
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations. 
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
+/// the two new horizontal binop.
+/// When Mode is set, the first horizontal binop dag node would take as input
+/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
+/// horizontal binop dag node would take as input the lower 128-bit of V1
+/// and the upper 128-bit of V1.
+///   Example:
+///     HADD V0_LO, V0_HI
+///     HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
+/// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
+///   Example:
+///     HADD V0_LO, V1_LO
+///     HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128-bits of the result.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+                                     SDLoc DL, SelectionDAG &DAG,
+                                     unsigned X86Opcode, bool Mode,
+                                     bool isUndefLO, bool isUndefHI) {
+  EVT VT = V0.getValueType();
+  assert(VT.is256BitVector() && VT == V1.getValueType() &&
+         "Invalid nodes in input!");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
+  SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
+  SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
+  SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
+  EVT NewVT = V0_LO.getValueType();
+
+  SDValue LO = DAG.getUNDEF(NewVT);
+  SDValue HI = DAG.getUNDEF(NewVT);
+
+  if (Mode) {
+    // Don't emit a horizontal binop if the result is expected to be UNDEF.
+    if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+    if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+  } else {
+    // Don't emit a horizontal binop if the result is expected to be UNDEF.
+    if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
+                       V1_LO->getOpcode() != ISD::UNDEF))
+      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+    if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
+                       V1_HI->getOpcode() != ISD::UNDEF))
+      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
+  }
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
+}
+
+/// \brief Try to fold a build_vector that performs an 'addsub' into the
+/// sequence of 'vadd + vsub + blendi'.
+static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
+                           const X86Subtarget *Subtarget) {
+  SDLoc DL(BV);
+  EVT VT = BV->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue InVec0 = DAG.getUNDEF(VT);
+  SDValue InVec1 = DAG.getUNDEF(VT);
+
+  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+  // Don't try to emit a VSELECT that cannot be lowered into a blend.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+    return SDValue();
+
+  // Odd-numbered elements in the input build vector are obtained from
+  // adding two integer/float elements.
+  // Even-numbered elements in the input build vector are obtained from
+  // subtracting two integer/float elements.
+  unsigned ExpectedOpcode = ISD::FSUB;
+  unsigned NextExpectedOpcode = ISD::FADD;
+  bool AddFound = false;
+  bool SubFound = false;
+
+  for (unsigned i = 0, e = NumElts; i != e; i++) {
+    SDValue Op = BV->getOperand(i);
+      
+    // Skip 'undef' values.
+    unsigned Opcode = Op.getOpcode();
+    if (Opcode == ISD::UNDEF) {
+      std::swap(ExpectedOpcode, NextExpectedOpcode);
+      continue;
+    }
+      
+    // Early exit if we found an unexpected opcode.
+    if (Opcode != ExpectedOpcode)
+      return SDValue();
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    // Try to match the following pattern:
+    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+    // Early exit if we cannot match that sequence.
+    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+        Op0.getOperand(1) != Op1.getOperand(1))
+      return SDValue();
+
+    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    if (I0 != i)
+      return SDValue();
+
+    // We found a valid add/sub node. Update the information accordingly.
+    if (i & 1)
+      AddFound = true;
+    else
+      SubFound = true;
+
+    // Update InVec0 and InVec1.
+    if (InVec0.getOpcode() == ISD::UNDEF)
+      InVec0 = Op0.getOperand(0);
+    if (InVec1.getOpcode() == ISD::UNDEF)
+      InVec1 = Op1.getOperand(0);
+
+    // Make sure that operands in input to each add/sub node always
+    // come from a same pair of vectors.
+    if (InVec0 != Op0.getOperand(0)) {
+      if (ExpectedOpcode == ISD::FSUB)
+        return SDValue();
+
+      // FADD is commutable. Try to commute the operands
+      // and then test again.
+      std::swap(Op0, Op1);
+      if (InVec0 != Op0.getOperand(0))
+        return SDValue();
+    }
+
+    if (InVec1 != Op1.getOperand(0))
+      return SDValue();
+
+    // Update the pair of expected opcodes.
+    std::swap(ExpectedOpcode, NextExpectedOpcode);
+  }
+
+  // Don't try to fold this build_vector into a VSELECT if it has
+  // too many UNDEF operands.
+  if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
+      InVec1.getOpcode() != ISD::UNDEF) {
+    // Emit a sequence of vector add and sub followed by a VSELECT.
+    // The new VSELECT will be lowered into a BLENDI.
+    // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
+    // and emit a single ADDSUB instruction.
+    SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
+    SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
+
+    // Construct the VSELECT mask.
+    EVT MaskVT = VT.changeVectorElementTypeToInteger();
+    EVT SVT = MaskVT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+    SmallVector<SDValue, 8> Ops;
+
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
+                            APInt::getAllOnesValue(SVTBits);
+      SDValue Constant = DAG.getConstant(Value, SVT);
+      Ops.push_back(Constant);
+    }
+
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
+    return DAG.getSelect(DL, VT, Mask, Sub, Add);
+  }
+  
+  return SDValue();
+}
+
+static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+  SDValue InVec0, InVec1;
+
+  // Try to match an ADDSUB.
+  if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+      (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
+    SDValue Value = matchAddSub(BV, DAG, Subtarget);
+    if (Value.getNode())
+      return Value;
+  }
+
+  // Try to match horizontal ADD/SUB.
+  unsigned NumUndefsLO = 0;
+  unsigned NumUndefsHI = 0;
+  unsigned Half = NumElts/2;
+
+  // Count the number of UNDEF operands in the build_vector in input.
+  for (unsigned i = 0, e = Half; i != e; ++i)
+    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+      NumUndefsLO++;
+
+  for (unsigned i = Half, e = NumElts; i != e; ++i)
+    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+      NumUndefsHI++;
+
+  // Early exit if this is either a build_vector of all UNDEFs or all the
+  // operands but one are UNDEF.
+  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+    return SDValue();
+
+  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+    // Try to match an SSE3 float HADD/HSUB.
+    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+    
+    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
+    // Try to match an SSSE3 integer HADD/HSUB.
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
+    
+    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
+  }
+  
+  if (!Subtarget->hasAVX())
+    return SDValue();
+
+  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
+    // Try to match an AVX horizontal add/sub of packed single/double
+    // precision floating point values from 256-bit vectors.
+    SDValue InVec2, InVec3;
+    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+    // Try to match an AVX2 horizontal add/sub of signed integers.
+    SDValue InVec2, InVec3;
+    unsigned X86Opcode;
+    bool CanFold = true;
+
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+      X86Opcode = X86ISD::HADD;
+    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+      X86Opcode = X86ISD::HSUB;
+    else
+      CanFold = false;
+
+    if (CanFold) {
+      // Fold this build_vector into a single horizontal add/sub.
+      // Do this only if the target has AVX2.
+      if (Subtarget->hasAVX2())
+        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+ 
+      // Do not try to expand this build_vector into a pair of horizontal
+      // add/sub if we can emit a pair of scalar add/sub.
+      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+        return SDValue();
+
+      // Convert this build_vector into a pair of horizontal binop followed by
+      // a concat vector.
+      bool isUndefLO = NumUndefsLO == Half;
+      bool isUndefHI = NumUndefsHI == Half;
+      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+                                   isUndefLO, isUndefHI);
+    }
+  }
+
+  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+       VT == MVT::v16i16) && Subtarget->hasAVX()) {
+    unsigned X86Opcode;
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::HADD;
+    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::HSUB;
+    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::FHADD;
+    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::FHSUB;
+    else
+      return SDValue();
+
+    // Don't try to expand this build_vector into a pair of horizontal add/sub
+    // if we can simply emit a pair of scalar add/sub.
+    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+      return SDValue();
+
+    // Convert this build_vector into two horizontal add/sub followed by
+    // a concat vector.
+    bool isUndefLO = NumUndefsLO == Half;
+    bool isUndefHI = NumUndefsHI == Half;
+    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+                                 isUndefLO, isUndefHI);
+  }
+
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  MVT VT = Op.getSimpleValueType();
+  MVT ExtVT = VT.getVectorElementType();
+  unsigned NumElems = Op.getNumOperands();
+
+  // Generate vectors for predicate vectors.
+  if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
+    return LowerBUILD_VECTORvXi1(Op, DAG);
+
+  // Vectors containing all zeros can be matched by pxor and xorps later
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
+      return Op;
+
+    return getZeroVector(VT, Subtarget, DAG, dl);
+  }
+
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+  // vpcmpeqd on 256-bit vectors.
+  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+      return Op;
+
+    if (!VT.is512BitVector())
+      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+  }
+
+  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
+  if (Broadcast.getNode())
+    return Broadcast;
+
+  unsigned EVTBits = ExtVT.getSizeInBits();
+
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  bool IsAllConstants = true;
+  SmallSet<SDValue, 8> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    Values.insert(Elt);
+    if (Elt.getOpcode() != ISD::Constant &&
+        Elt.getOpcode() != ISD::ConstantFP)
+      IsAllConstants = false;
+    if (X86::isZeroNode(Elt))
+      NumZero++;
+    else {
+      NonZeros |= (1 << i);
+      NumNonZero++;
+    }
+  }
+
+  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
+  if (NumNonZero == 0)
+    return DAG.getUNDEF(VT);
+
+  // Special case for single non-zero, non-undef, element.
+  if (NumNonZero == 1) {
+    unsigned Idx = countTrailingZeros(NonZeros);
+    SDValue Item = Op.getOperand(Idx);
+
+    // If this is an insertion of an i64 value on x86-32, and if the top bits of
+    // the value are obviously zero, truncate the value to i32 and do the
+    // insertion that way.  Only do this if the value is non-constant or if the
+    // value is a constant being inserted into element 0.  It is cheaper to do
+    // a constant pool load than it is to do a movd + shuffle.
+    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+        (!IsAllConstants || Idx == 0)) {
+      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+        // Handle SSE only.
+        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+        EVT VecVT = MVT::v4i32;
+        unsigned VecElts = 4;
+
+        // Truncate the value (which may itself be a constant) to i32, and
+        // convert it to a vector with movd (S2V+shuffle to zero extend).
+        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+
+        // Now we have our 32-bit value zero extended in the low element of
+        // a vector.  If Idx != 0, swizzle it into place.
+        if (Idx != 0) {
+          SmallVector<int, 4> Mask;
+          Mask.push_back(Idx);
+          for (unsigned i = 1; i != VecElts; ++i)
+            Mask.push_back(i);
+          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
+                                      &Mask[0]);
+        }
+        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+      }
+    }
+
+    // If we have a constant or non-constant insertion into the low element of
+    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
+    // depending on what the source datatype is.
+    if (Idx == 0) {
+      if (NumZero == 0)
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+        if (VT.is256BitVector() || VT.is512BitVector()) {
+          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
+          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+                             Item, DAG.getIntPtrConstant(0));
+        }
+        assert(VT.is128BitVector() && "Expected an SSE value type!");
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+      }
+
+      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+        if (VT.is256BitVector()) {
+          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+        } else {
+          assert(VT.is128BitVector() && "Expected an SSE value type!");
+          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+        }
+        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+      }
+    }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        X86::isZeroNode(Op.getOperand(0)) &&
+        !X86::isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = VT.getSizeInBits();
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this, dl);
+    }
+
+    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+      return SDValue();
+
+    // Otherwise, if this is a vector with i32 or f32 elements, and the element
+    // is a non-constant being inserted into an element other than the low one,
+    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
+    // movd/movss) to move this into the low element, then shuffle it into
+    // place.
+    if (EVTBits == 32) {
+      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      // Turn it into a shuffle of zero and zero-extended scalar to vector.
+      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
+      SmallVector<int, 8> MaskVec;
+      for (unsigned i = 0; i != NumElems; ++i)
+        MaskVec.push_back(i == Idx ? 0 : 1);
+      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+    }
+  }
+
+  // Splat is obviously ok. Let legalizer expand it to a shuffle.
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+      unsigned Idx = countTrailingZeros(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
+    return SDValue();
+  }
+
+  // A vector full of immediates; various special cases are already
+  // handled, so this is best done with a single constant-pool load.
+  if (IsAllConstants)
+    return SDValue();
+
+  // For AVX-length vectors, build the individual 128-bit pieces and use
+  // shuffles to put them in place.
+  if (VT.is256BitVector() || VT.is512BitVector()) {
+    SmallVector<SDValue, 64> V;
+    for (unsigned i = 0; i != NumElems; ++i)
+      V.push_back(Op.getOperand(i));
+
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+    // Build both the lower and upper subvector.
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[0], NumElems/2));
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[NumElems / 2], NumElems/2));
+
+    // Recreate the wider vector with the lower and upper part.
+    if (VT.is256BitVector())
+      return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+    return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+  }
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = countTrailingZeros(NonZeros);
+      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+                                 Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+    }
+    return SDValue();
+  }
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16) {
+    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        Subtarget, *this);
+    if (V.getNode()) return V;
+  }
+
+  if (EVTBits == 16 && NumElems == 8) {
+    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                      Subtarget, *this);
+    if (V.getNode()) return V;
+  }
+
+  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+  if (EVTBits == 32 && NumElems == 4) {
+    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+                                      NumZero, DAG, Subtarget, *this);
+    if (V.getNode())
+      return V;
+  }
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  SmallVector<SDValue, 8> V(NumElems);
+  if (NumElems == 4 && NumZero > 0) {
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1 << i));
+      if (isZero)
+        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          V[i] = V[i*2];  // Must be a zero vector.
+          break;
+        case 1:
+          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+          break;
+        case 2:
+          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+          break;
+        case 3:
+          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+          break;
+      }
+    }
+
+    bool Reverse1 = (NonZeros & 0x3) == 2;
+    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+    int MaskVec[] = {
+      Reverse1 ? 1 : 0,
+      Reverse1 ? 0 : 1,
+      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
+    };
+    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+  }
+
+  if (Values.size() > 1 && VT.is128BitVector()) {
+    // Check for a build vector of consecutive loads.
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = Op.getOperand(i);
+
+    // Check for elements which are consecutive loads.
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
+    if (LD.getNode())
+      return LD;
+
+    // Check for a build vector from mostly shuffle plus few inserting.
+    SDValue Sh = buildFromShuffleMostly(Op, DAG);
+    if (Sh.getNode())
+      return Sh;
+
+    // For SSE 4.1, use insertps to put the high elements into the low element.
+    if (getSubtarget()->hasSSE41()) {
+      SDValue Result;
+      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
+        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+      else
+        Result = DAG.getUNDEF(VT);
+
+      for (unsigned i = 1; i < NumElems; ++i) {
+        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
+        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+                             Op.getOperand(i), DAG.getIntPtrConstant(i));
+      }
+      return Result;
+    }
+
+    // Otherwise, expand into a number of unpckl*, start by extending each of
+    // our (non-undef) elements to the full vector width with the element in the
+    // bottom slot of the vector (which generates no code for SSE).
+    for (unsigned i = 0; i < NumElems; ++i) {
+      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+      else
+        V[i] = DAG.getUNDEF(VT);
+    }
+
+    // Next, we iteratively mix elements, e.g. for v4f32:
+    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
+    unsigned EltStride = NumElems >> 1;
+    while (EltStride != 0) {
+      for (unsigned i = 0; i < EltStride; ++i) {
+        // If V[i+EltStride] is undef and this is the first round of mixing,
+        // then it is safe to just drop this shuffle: V[i] is already in the
+        // right place, the one element (since it's the first round) being
+        // inserted as undef can be dropped.  This isn't safe for successive
+        // rounds because they will permute elements within both vectors.
+        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
+            EltStride == NumElems/2)
+          continue;
+
+        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
+      }
+      EltStride >>= 1;
+    }
+    return V[0];
+  }
+  return SDValue();
+}
+
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT ResVT = Op.getSimpleValueType();
+
+  assert((ResVT.is256BitVector() ||
+          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  if(ResVT.is256BitVector())
+    return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+
+  if (Op.getNumOperands() == 4) {
+    MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+                                ResVT.getVectorNumElements()/2);
+    SDValue V3 = Op.getOperand(2);
+    SDValue V4 = Op.getOperand(3);
+    return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
+      Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
+  }
+  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+          Op.getNumOperands() == 4)));
+
+  // AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // from two other 128-bit ones.
+
+  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
+  return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// \brief Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
+/// in-place shuffle are 'no-op's.
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (Mask[i] != -1 && Mask[i] != i)
+      return false;
+  return true;
+}
+
+/// \brief Helper function to classify a mask as a single-input mask.
+///
+/// This isn't a generic single-input test because in the vector shuffle
+/// lowering we canonicalize single inputs to be the first input operand. This
+/// means we can more quickly test for a single input by only checking whether
+/// an input from the second operand exists. We also assume that the size of
+/// mask corresponds to the size of the input vectors which isn't true in the
+/// fully general case.
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
+  for (int M : Mask)
+    if (M >= (int)Mask.size())
+      return false;
+  return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
+                                          SelectionDAG &DAG) {
+  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+  unsigned Imm = 0;
+  Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
+  Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
+  Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
+  Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
+  return DAG.getConstant(Imm, MVT::i8);
+}
+
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask)) {
+    // Straight shuffle of a single input vector. Simulate this by using the
+    // single input as both of the "inputs" to this instruction..
+    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+    return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+  return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
+                     DAG.getConstant(SHUFPDMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask)) {
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    // We have to map the mask as it is actually a v4i32 shuffle instruction.
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+    int WidenedMask[4] = {
+        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+    return DAG.getNode(
+        ISD::BITCAST, DL, MVT::v2i64,
+        DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
+                    getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
+  }
+
+  // We implement this with SHUFPD which is pretty lame because it will likely
+  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+  // However, all the alternatives are still more cycles and newer chips don't
+  // have this problem. It would be really nice if x86 had better shuffles here.
+  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
+  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                     DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  SDValue LowV = V1, HighV = V2;
+  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0)
+    // Straight shuffle of a single input vector. We pass the input vector to
+    // both operands to simulate this with a SHUFPS.
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  if (NumV2Elements == 1) {
+    int V2Index =
+        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+        Mask.begin();
+    // Compute the index adjacent to V2Index and in the same half by toggling
+    // the low bit.
+    int V2AdjIndex = V2Index ^ 1;
+
+    if (Mask[V2AdjIndex] == -1) {
+      // Handles all the cases where we have a single V2 element and an undef.
+      // This will only ever happen in the high lanes because we commute the
+      // vector otherwise.
+      if (V2Index < 2)
+        std::swap(LowV, HighV);
+      NewMask[V2Index] -= 4;
+    } else {
+      // Handle the case where the V2 element ends up adjacent to a V1 element.
+      // To make this work, blend them together as the first step.
+      int V1Index = V2AdjIndex;
+      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now proceed to reconstruct the final blend as we have the necessary
+      // high or low half formed.
+      if (V2Index < 2) {
+        LowV = V2;
+        HighV = V1;
+      } else {
+        HighV = V2;
+      }
+      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+    }
+  } else if (NumV2Elements == 2) {
+    if (Mask[0] < 4 && Mask[1] < 4) {
+      // Handle the easy case where we have V1 in the low lanes and V2 in the
+      // high lanes. We never see this reversed because we sort the shuffle.
+      NewMask[2] -= 4;
+      NewMask[3] -= 4;
+    } else {
+      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+      // trying to place elements directly, just blend them and set up the final
+      // shuffle to place them.
+
+      // The first two blend mask elements are for V1, the second two are for
+      // V2.
+      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+                          Mask[2] < 4 ? Mask[2] : Mask[3],
+                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now we do a normal shuffle of V1 by giving V1 as both operands to
+      // a blend.
+      LowV = HighV = V1;
+      NewMask[0] = Mask[0] < 4 ? 0 : 2;
+      NewMask[1] = Mask[0] < 4 ? 2 : 0;
+      NewMask[2] = Mask[2] < 4 ? 1 : 3;
+      NewMask[3] = Mask[2] < 4 ? 3 : 1;
+    }
+  }
+  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
+}
+
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask))
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // We implement this with SHUFPS because it can blend from two vectors.
+  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+  // up the inputs, bypassing domain shift penalties that we would encur if we
+  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+  // relevant.
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
+                     DAG.getVectorShuffle(
+                         MVT::v4f32, DL,
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+static SDValue lowerV8I16SingleInputVectorShuffle(
+    SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+  SmallVector<int, 4> LoInputs;
+  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+               [](int M) { return M >= 0; });
+  std::sort(LoInputs.begin(), LoInputs.end());
+  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+  SmallVector<int, 4> HiInputs;
+  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+               [](int M) { return M >= 0; });
+  std::sort(HiInputs.begin(), HiInputs.end());
+  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+  int NumLToL =
+      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+  int NumHToL = LoInputs.size() - NumLToL;
+  int NumLToH =
+      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+  int NumHToH = HiInputs.size() - NumLToH;
+  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+  // such inputs we can swap two of the dwords across the half mark and end up
+  // with <=2 inputs to each half in each half. Once there, we can fall through
+  // to the generic code below. For example:
+  //
+  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+  //
+  // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2
+  // and 2-2.
+  auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput,
+                          int ThreeInputHalfSum, int OneInputHalfOffset) {
+    // Compute the index of dword with only one word among the three inputs in
+    // a half by taking the sum of the half with three inputs and subtracting
+    // the sum of the actual three inputs. The difference is the remaining
+    // slot.
+    int DWordA = (ThreeInputHalfSum -
+                  std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) /
+                 2;
+    int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2;
+
+    int PSHUFDMask[] = {0, 1, 2, 3};
+    PSHUFDMask[DWordA] = DWordB;
+    PSHUFDMask[DWordB] = DWordA;
+    V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                    DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                                DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+    // Adjust the mask to match the new locations of A and B.
+    for (int &M : Mask)
+      if (M != -1 && M/2 == DWordA)
+        M = 2 * DWordB + M % 2;
+      else if (M != -1 && M/2 == DWordB)
+        M = 2 * DWordA + M % 2;
+
+    // Recurse back into this routine to re-compute state now that this isn't
+    // a 3 and 1 problem.
+    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+                                Mask);
+  };
+  if (NumLToL == 3 && NumHToL == 1)
+    return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4);
+  else if (NumLToL == 1 && NumHToL == 3)
+    return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0);
+  else if (NumLToH == 1 && NumHToH == 3)
+    return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0);
+  else if (NumLToH == 3 && NumHToH == 1)
+    return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4);
+
+  // At this point there are at most two inputs to the low and high halves from
+  // each half. That means the inputs can always be grouped into dwords and
+  // those dwords can then be moved to the correct half with a dword shuffle.
+  // We use at most one low and one high word shuffle to collect these paired
+  // inputs into dwords, and finally a dword shuffle to place them.
+  int PSHUFLMask[4] = {-1, -1, -1, -1};
+  int PSHUFHMask[4] = {-1, -1, -1, -1};
+  int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+  // First fix the masks for all the inputs that are staying in their
+  // original halves. This will then dictate the targets of the cross-half
+  // shuffles.
+  auto fixInPlaceInputs = [&PSHUFDMask](
+      ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
+      MutableArrayRef<int> HalfMask, int HalfOffset) {
+    if (InPlaceInputs.empty())
+      return;
+    if (InPlaceInputs.size() == 1) {
+      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+          InPlaceInputs[0] - HalfOffset;
+      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+      return;
+    }
+
+    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+        InPlaceInputs[0] - HalfOffset;
+    // Put the second input next to the first so that they are packed into
+    // a dword. We find the adjacent index by toggling the low bit.
+    int AdjIndex = InPlaceInputs[0] ^ 1;
+    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+  };
+  if (!HToLInputs.empty())
+    fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
+  if (!LToHInputs.empty())
+    fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
+
+  // Now gather the cross-half inputs and place them into a free dword of
+  // their target half.
+  // FIXME: This operation could almost certainly be simplified dramatically to
+  // look more like the 3-1 fixing operation.
+  auto moveInputsToRightHalf = [&PSHUFDMask](
+      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+      int SourceOffset, int DestOffset) {
+    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+      return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+    };
+    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+                                               int Word) {
+      int LowWord = Word & ~1;
+      int HighWord = Word | 1;
+      return isWordClobbered(SourceHalfMask, LowWord) ||
+             isWordClobbered(SourceHalfMask, HighWord);
+    };
+
+    if (IncomingInputs.empty())
+      return;
+
+    if (ExistingInputs.empty()) {
+      // Map any dwords with inputs from them into the right half.
+      for (int Input : IncomingInputs) {
+        // If the source half mask maps over the inputs, turn those into
+        // swaps and use the swapped lane.
+        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+                Input - SourceOffset;
+            // We have to swap the uses in our half mask in one sweep.
+            for (int &M : HalfMask)
+              if (M == SourceHalfMask[Input - SourceOffset])
+                M = Input;
+              else if (M == Input)
+                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+          } else {
+            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+                       Input - SourceOffset &&
+                   "Previous placement doesn't match!");
+          }
+          // Note that this correctly re-maps both when we do a swap and when
+          // we observe the other side of the swap above. We rely on that to
+          // avoid swapping the members of the input list directly.
+          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+        }
+
+        // Map the input's dword into the correct half.
+        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+        else
+          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+                     Input / 2 &&
+                 "Previous placement doesn't match!");
+      }
+
+      // And just directly shift any other-half mask elements to be same-half
+      // as we will have mirrored the dword containing the element into the
+      // same position within that half.
+      for (int &M : HalfMask)
+        if (M >= SourceOffset && M < SourceOffset + 4) {
+          M = M - SourceOffset + DestOffset;
+          assert(M >= 0 && "This should never wrap below zero!");
+        }
+      return;
+    }
+
+    // Ensure we have the input in a viable dword of its current half. This
+    // is particularly tricky because the original position may be clobbered
+    // by inputs being moved and *staying* in that half.
+    if (IncomingInputs.size() == 1) {
+      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+        int InputFixed = std::find(std::begin(SourceHalfMask),
+                                   std::end(SourceHalfMask), -1) -
+                         std::begin(SourceHalfMask) + SourceOffset;
+        SourceHalfMask[InputFixed - SourceOffset] =
+            IncomingInputs[0] - SourceOffset;
+        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+                     InputFixed);
+        IncomingInputs[0] = InputFixed;
+      }
+    } else if (IncomingInputs.size() == 2) {
+      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+        int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
+        assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
+               "Not all dwords can be clobbered!");
+        SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
+        SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
+        for (int &M : HalfMask)
+          if (M == IncomingInputs[0])
+            M = SourceDWordBase + SourceOffset;
+          else if (M == IncomingInputs[1])
+            M = SourceDWordBase + 1 + SourceOffset;
+        IncomingInputs[0] = SourceDWordBase + SourceOffset;
+        IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
+      }
+    } else {
+      llvm_unreachable("Unhandled input size!");
+    }
+
+    // Now hoist the DWord down to the right half.
+    int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
+    assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+    for (int Input : IncomingInputs)
+      std::replace(HalfMask.begin(), HalfMask.end(), Input,
+                   FreeDWord * 2 + Input % 2);
+  };
+  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
+                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
+  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
+                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+  // Now enact all the shuffles we've computed to move the inputs into their
+  // target half.
+  if (!isNoopShuffleMask(PSHUFLMask))
+    V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
+  if (!isNoopShuffleMask(PSHUFHMask))
+    V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
+  if (!isNoopShuffleMask(PSHUFDMask))
+    V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                    DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                                DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+  // At this point, each half should contain all its inputs, and we can then
+  // just shuffle them into their final position.
+  assert(std::count_if(LoMask.begin(), LoMask.end(),
+                       [](int M) { return M >= 4; }) == 0 &&
+         "Failed to lift all the high half inputs to the low mask!");
+  assert(std::count_if(HiMask.begin(), HiMask.end(),
+                       [](int M) { return M >= 0 && M < 4; }) == 0 &&
+         "Failed to lift all the low half inputs to the high mask!");
+
+  // Do a half shuffle for the low mask.
+  if (!isNoopShuffleMask(LoMask))
+    V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(LoMask, DAG));
+
+  // Do a half shuffle with the high mask after shifting its values down.
+  for (int &M : HiMask)
+    if (M >= 0)
+      M -= 4;
+  if (!isNoopShuffleMask(HiMask))
+    V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(HiMask, DAG));
+
+  return V;
+}
+
+/// \brief Detect whether the mask pattern should be lowered through
+/// interleaving.
+///
+/// This essentially tests whether viewing the mask as an interleaving of two
+/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
+/// lowering it through interleaving is a significantly better strategy.
+static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
+  int NumEvenInputs[2] = {0, 0};
+  int NumOddInputs[2] = {0, 0};
+  int NumLoInputs[2] = {0, 0};
+  int NumHiInputs[2] = {0, 0};
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    int InputIdx = Mask[i] >= Size;
+
+    if (i < Size / 2)
+      ++NumLoInputs[InputIdx];
+    else
+      ++NumHiInputs[InputIdx];
+
+    if ((i % 2) == 0)
+      ++NumEvenInputs[InputIdx];
+    else
+      ++NumOddInputs[InputIdx];
+  }
+
+  // The minimum number of cross-input results for both the interleaved and
+  // split cases. If interleaving results in fewer cross-input results, return
+  // true.
+  int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
+                                    NumEvenInputs[0] + NumOddInputs[1]);
+  int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
+                              NumLoInputs[0] + NumHiInputs[1]);
+  return InterleavedCrosses < SplitCrosses;
+}
+
+/// \brief Blend two v8i16 vectors using a naive unpack strategy.
+///
+/// This strategy only works when the inputs from each vector fit into a single
+/// half of that vector, and generally there are not so many inputs as to leave
+/// the in-place shuffles required highly constrained (and thus expensive). It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+                                                 SDValue V2,
+                                                 MutableArrayRef<int> Mask,
+                                                 const X86Subtarget *Subtarget,
+                                                 SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+  for (int i = 0; i < 8; ++i)
+    if (Mask[i] >= 0 && Mask[i] < 4)
+      LoV1Inputs.push_back(i);
+    else if (Mask[i] >= 4 && Mask[i] < 8)
+      HiV1Inputs.push_back(i);
+    else if (Mask[i] >= 8 && Mask[i] < 12)
+      LoV2Inputs.push_back(i);
+    else if (Mask[i] >= 12)
+      HiV2Inputs.push_back(i);
+
+  int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+  int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+  (void)NumV1Inputs;
+  (void)NumV2Inputs;
+  assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+  bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+                     HiV1Inputs.size() + HiV2Inputs.size();
+
+  auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+                              ArrayRef<int> HiInputs, bool MoveToLo,
+                              int MaskOffset) {
+    ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+    ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+    if (BadInputs.empty())
+      return V;
+
+    int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+    int MoveOffset = MoveToLo ? 0 : 4;
+
+    if (GoodInputs.empty()) {
+      for (int BadInput : BadInputs) {
+        MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+        Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+      }
+    } else {
+      if (GoodInputs.size() == 2) {
+        // If the low inputs are spread across two dwords, pack them into
+        // a single dword.
+        MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
+            Mask[GoodInputs[0]] - MaskOffset;
+        MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
+            Mask[GoodInputs[1]] - MaskOffset;
+        Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+        Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+      } else {
+        // Otherwise pin the low inputs.
+        for (int GoodInput : GoodInputs)
+          MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
+      }
+
+      int MoveMaskIdx =
+          std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
+          std::begin(MoveMask);
+      assert(MoveMaskIdx >= MoveOffset && "Established above");
+
+      if (BadInputs.size() == 2) {
+        assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
+        assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
+        MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
+            Mask[BadInputs[0]] - MaskOffset;
+        MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
+            Mask[BadInputs[1]] - MaskOffset;
+        Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
+        Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+      } else {
+        assert(BadInputs.size() == 1 && "All sizes handled");
+        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+      }
+    }
+
+    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+                                MoveMask);
+  };
+  V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
+                        /*MaskOffset*/ 0);
+  V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
+                        /*MaskOffset*/ 8);
+
+  // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
+  // cross-half traffic in the final shuffle.
+
+  // Munge the mask to be a single-input mask after the unpack merges the
+  // results.
+  for (int &M : Mask)
+    if (M != -1)
+      M = 2 * (M % 4) + (M / 8);
+
+  return DAG.getVectorShuffle(
+      MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+                                  DL, MVT::v8i16, V1, V2),
+      DAG.getUNDEF(MVT::v8i16), Mask);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> OrigMask = SVOp->getMask();
+  int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+                        OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
+  MutableArrayRef<int> Mask(MaskStorage);
+
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  auto isV1 = [](int M) { return M >= 0 && M < 8; };
+  auto isV2 = [](int M) { return M >= 8; };
+
+  int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
+  int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+
+  if (NumV2Inputs == 0)
+    return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+
+  assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
+                            "to be V1-input shuffles.");
+
+  if (NumV1Inputs + NumV2Inputs <= 4)
+    return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
+
+  // Check whether an interleaving lowering is likely to be more efficient.
+  // This isn't perfect but it is a strong heuristic that tends to work well on
+  // the kinds of shuffles that show up in practice.
+  //
+  // FIXME: Handle 1x, 2x, and 4x interleaving.
+  if (shouldLowerAsInterleaving(Mask)) {
+    // FIXME: Figure out whether we should pack these into the low or high
+    // halves.
+
+    int EMask[8], OMask[8];
+    for (int i = 0; i < 4; ++i) {
+      EMask[i] = Mask[2*i];
+      OMask[i] = Mask[2*i + 1];
+      EMask[i + 4] = -1;
+      OMask[i + 4] = -1;
+    }
+
+    SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
+    SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
+
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
+  }
+
+  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+  for (int i = 0; i < 4; ++i) {
+    LoBlendMask[i] = Mask[i];
+    HiBlendMask[i] = Mask[i + 4];
+  }
+
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+  LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
+  HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
+
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+}
+
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> OrigMask = SVOp->getMask();
+  assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+  int MaskStorage[16] = {
+      OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
+      OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
+      OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
+      OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
+  MutableArrayRef<int> Mask(MaskStorage);
+  MutableArrayRef<int> LoMask = Mask.slice(0, 8);
+  MutableArrayRef<int> HiMask = Mask.slice(8, 8);
+
+  // For single-input shuffles, there are some nicer lowering tricks we can use.
+  if (isSingleInputShuffleMask(Mask)) {
+    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+    // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
+    //
+    // FIXME: We should check for other patterns which can be widened into an
+    // i16 shuffle as well.
+    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != Mask[i + 1])
+          return false;
+      }
+      return true;
+    };
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
+      SmallVector<int, 4> LoInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+                   [](int M) { return M >= 0 && M < 8; });
+      std::sort(LoInputs.begin(), LoInputs.end());
+      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+                     LoInputs.end());
+      SmallVector<int, 4> HiInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+                   [](int M) { return M >= 8; });
+      std::sort(HiInputs.begin(), HiInputs.end());
+      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+                     HiInputs.end());
+
+      bool TargetLo = LoInputs.size() >= HiInputs.size();
+      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      SmallDenseMap<int, int, 8> LaneMap;
+      for (int I : InPlaceInputs) {
+        PreDupI16Shuffle[I/2] = I/2;
+        LaneMap[I] = I;
+      }
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] != -1)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            return SDValue();
+
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
+
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+      }
+      V1 = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+      // Unpack the bytes to form the i16s that will be shuffled into place.
+      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, V1, V1);
+
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != -1)
+          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+      }
+      return DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
+  }
+
+  // Check whether an interleaving lowering is likely to be more efficient.
+  // This isn't perfect but it is a strong heuristic that tends to work well on
+  // the kinds of shuffles that show up in practice.
+  //
+  // FIXME: We need to handle other interleaving widths (i16, i32, ...).
+  if (shouldLowerAsInterleaving(Mask)) {
+    // FIXME: Figure out whether we should pack these into the low or high
+    // halves.
+
+    int EMask[16], OMask[16];
+    for (int i = 0; i < 8; ++i) {
+      EMask[i] = Mask[2*i];
+      OMask[i] = Mask[2*i + 1];
+      EMask[i + 8] = -1;
+      OMask[i + 8] = -1;
+    }
+
+    SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
+    SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
+
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+  }
+
+  int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+  auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
+                            MutableArrayRef<int> V1HalfBlendMask,
+                            MutableArrayRef<int> V2HalfBlendMask) {
+    for (int i = 0; i < 8; ++i)
+      if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
+        V1HalfBlendMask[i] = HalfMask[i];
+        HalfMask[i] = i;
+      } else if (HalfMask[i] >= 16) {
+        V2HalfBlendMask[i] = HalfMask[i] - 16;
+        HalfMask[i] = i + 8;
+      }
+  };
+  buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
+  buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+
+  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+                             MutableArrayRef<int> HiBlendMask) {
+    SDValue V1, V2;
+    // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+    // them out and avoid using UNPCK{L,H} to extract the elements of V as
+    // i16s.
+    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
+        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; })) {
+      // Use a mask to drop the high bytes.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+                       DAG.getConstant(0x00FF, MVT::v8i16));
+
+      // This will be a single vector shuffle instead of a blend so nuke V2.
+      V2 = DAG.getUNDEF(MVT::v8i16);
+
+      // Squash the masks to point directly into V1.
+      for (int &M : LoBlendMask)
+        if (M >= 0)
+          M /= 2;
+      for (int &M : HiBlendMask)
+        if (M >= 0)
+          M /= 2;
+    } else {
+      // Otherwise just unpack the low half of V into V1 and the high half into
+      // V2 so that we can blend them as i16s.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    }
+
+    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+    return std::make_pair(BlendedLo, BlendedHi);
+  };
+  SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+  std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+  std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+
+  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        MVT VT, const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  switch (VT.SimpleTy) {
+  case MVT::v2i64:
+    return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v2f64:
+    return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4i32:
+    return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4f32:
+    return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i16:
+    return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16i8:
+    return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Unimplemented!");
+  }
+}
+
+/// \brief Tiny helper function to test whether adjacent masks are sequential.
+static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+    if (Mask[i] + 1 != Mask[i+1])
+      return false;
+
+  return true;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                  SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDLoc dl(Op);
+
+  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // Check for non-undef masks pointing at an undef vector and make the masks
+  // undef as well. This makes it easier to match the shuffle based solely on
+  // the mask.
+  if (V2IsUndef)
+    for (int M : Mask)
+      if (M >= NumElements) {
+        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+        for (int &M : NewMask)
+          if (M >= NumElements)
+            M = -1;
+        return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+      }
+
+  // For integer vector shuffles, try to collapse them into a shuffle of fewer
+  // lanes but wider integers. We cap this to not form integers larger than i64
+  // but it might be interesting to form i128 integers to handle flipping the
+  // low and high halves of AVX 256-bit vectors.
+  if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+      areAdjacentMasksSequential(Mask)) {
+    SmallVector<int, 8> NewMask;
+    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+      NewMask.push_back(Mask[i] / 2);
+    MVT NewVT =
+        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
+                         VT.getVectorNumElements() / 2);
+    V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+  }
+
+  int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+  for (int M : SVOp->getMask())
+    if (M < 0)
+      ++NumUndefElements;
+    else if (M < NumElements)
+      ++NumV1Elements;
+    else
+      ++NumV2Elements;
+
+  // Commute the shuffle as needed such that more elements come from V1 than
+  // V2. This allows us to match the shuffle pattern strictly on how many
+  // elements come from V1 without handling the symmetric cases.
+  if (NumV2Elements > NumV1Elements)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // When the number of V1 and V2 elements are the same, try to minimize the
+  // number of uses of V2 in the low half of the vector.
+  if (NumV1Elements == NumV2Elements) {
+    int LowV1Elements = 0, LowV2Elements = 0;
+    for (int M : SVOp->getMask().slice(0, NumElements / 2))
+      if (M >= NumElements)
+        ++LowV2Elements;
+      else if (M >= 0)
+        ++LowV1Elements;
+    if (LowV2Elements > LowV1Elements)
+      return DAG.getCommutedVectorShuffle(*SVOp);
+  }
+
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.getSizeInBits() == 128)
+    return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+  llvm_unreachable("Unimplemented!");
+}
+
+
+//===----------------------------------------------------------------------===//
+// Legacy vector shuffle lowering
+//
+// This code is the legacy code handling vector shuffles until the above
+// replaces its functionality and performance.
+//===----------------------------------------------------------------------===//
+
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+                        bool hasInt256, unsigned *MaskOut = nullptr) {
+  MVT EltVT = VT.getVectorElementType();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return false;
+
+  if (!hasSSE41 || EltVT == MVT::i8)
+    return false;
+  if (!hasInt256 && VT == MVT::v16i16)
+    return false;
+
+  unsigned MaskValue = 0;
+  unsigned NumElems = VT.getVectorNumElements();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+
+    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+    int EltIdx = MaskVals[i];
+
+    if ((EltIdx < 0 || EltIdx == (int)i) &&
+        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) &&
+        (SndLaneEltIdx < 0 ||
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1 << i);
+    else
+      return false;
+  }
+
+  if (MaskOut)
+    *MaskOut = MaskValue;
+  return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+// This function assumes isBlendMask returns true for this
+// SuffleVectorSDNode
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                                          unsigned MaskValue,
+                                          const X86Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+                     Subtarget->hasInt256() && "Trying to lower a "
+                                               "VECTOR_SHUFFLE to a Blend but "
+                                               "with the wrong mask"));
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls on a different 128-bit lane than \p OutputIdx.
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+                                     unsigned OutputIdx) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
+
+/// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
+/// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+                         SelectionDAG &DAG) {
+  MVT VT = V1.getSimpleValueType();
+  assert(VT.is128BitVector() || VT.is256BitVector());
+
+  MVT EltVT = VT.getVectorElementType();
+  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 32> PshufbMask;
+  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+    int InputIdx = MaskVals[OutputIdx];
+    unsigned InputByteIdx;
+
+    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+      InputByteIdx = 0x80;
+    else {
+      // Cross lane is not allowed.
+      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+        return SDValue();
+      InputByteIdx = InputIdx * EltSizeInBytes;
+      // Index is an byte offset within the 128-bit lane.
+      InputByteIdx &= 0xf;
+    }
+
+    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
+      PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
+      if (InputByteIdx != 0x80)
+        ++InputByteIdx;
+    }
+  }
+
+  MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
+  if (ShufVT != VT)
+    V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
+  return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all]   pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+                         SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  SmallVector<int, 8> MaskVals;
+
+  // Determine if more than 1 of the words in each of the low and high quadwords
+  // of the result come from the same quadword of one of the two inputs.  Undef
+  // mask values count as coming from any quadword, for better codegen.
+  //
+  // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
+  // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
+  unsigned LoQuad[] = { 0, 0, 0, 0 };
+  unsigned HiQuad[] = { 0, 0, 0, 0 };
+  // Indices of quads used.
+  std::bitset<4> InputQuads;
+  for (unsigned i = 0; i < 8; ++i) {
+    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
+    int EltIdx = SVOp->getMaskElt(i);
+    MaskVals.push_back(EltIdx);
+    if (EltIdx < 0) {
+      ++Quad[0];
+      ++Quad[1];
+      ++Quad[2];
+      ++Quad[3];
+      continue;
+    }
+    ++Quad[EltIdx / 4];
+    InputQuads.set(EltIdx / 4);
+  }
+
+  int BestLoQuad = -1;
+  unsigned MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (LoQuad[i] > MaxQuad) {
+      BestLoQuad = i;
+      MaxQuad = LoQuad[i];
+    }
+  }
+
+  int BestHiQuad = -1;
+  MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (HiQuad[i] > MaxQuad) {
+      BestHiQuad = i;
+      MaxQuad = HiQuad[i];
+    }
+  }
+
+  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If there are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+  bool V1Used = InputQuads[0] || InputQuads[1];
+  bool V2Used = InputQuads[2] || InputQuads[3];
+  if (Subtarget->hasSSSE3()) {
+    if (InputQuads.count() == 2 && V1Used && V2Used) {
+      BestLoQuad = InputQuads[0] ? 0 : 1;
+      BestHiQuad = InputQuads[2] ? 2 : 3;
+    }
+    if (InputQuads.count() > 2) {
+      BestLoQuad = -1;
+      BestHiQuad = -1;
+    }
+  }
+
+  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+  // the shuffle mask.  If a quad is scored as -1, that means that it contains
+  // words from all 4 input quadwords.
+  SDValue NewV;
+  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+    int MaskV[] = {
+      BestLoQuad < 0 ? 0 : BestLoQuad,
+      BestHiQuad < 0 ? 1 : BestHiQuad
+    };
+    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
+    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
+
+    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+    // source words for the shuffle, to aid later transformations.
+    bool AllWordsInNewV = true;
+    bool InOrder[2] = { true, true };
+    for (unsigned i = 0; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx != (int)i)
+        InOrder[i/4] = false;
+      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+        continue;
+      AllWordsInNewV = false;
+      break;
+    }
+
+    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+    if (AllWordsInNewV) {
+      for (int i = 0; i != 8; ++i) {
+        int idx = MaskVals[i];
+        if (idx < 0)
+          continue;
+        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+        if ((idx != i) && idx < 4)
+          pshufhw = false;
+        if ((idx != i) && idx > 3)
+          pshuflw = false;
+      }
+      V1 = NewV;
+      V2Used = false;
+      BestLoQuad = 0;
+      BestHiQuad = 1;
+    }
+
+    // If we've eliminated the use of V2, and the new mask is a pshuflw or
+    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
+    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
+      unsigned TargetMask = 0;
+      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
+                             getShufflePSHUFLWImmediate(SVOp);
+      V1 = NewV.getOperand(0);
+      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
+    }
+  }
+
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
+
+  // If we have SSSE3, and all words of the result are from 1 input vector,
+  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
+  // is present, fall back to case 4.
+  if (Subtarget->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+
+    // If we have elements from both input vectors, set the high bit of the
+    // shuffle mask element to zero out elements that come from V2 in the V1
+    // mask, and elements that come from V1 in the V2 mask, so that the two
+    // results can be OR'd together.
+    bool TwoInputs = V1Used && V2Used;
+    V1 = getPSHUFB(MaskVals, V1, dl, DAG);
+    if (!TwoInputs)
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    CommuteVectorShuffleMask(MaskVals, 8);
+    V2 = getPSHUFB(MaskVals, V2, dl, DAG);
+    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  }
+
+  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+  // and update MaskVals with new element order.
+  std::bitset<8> InOrder;
+  if (BestLoQuad >= 0) {
+    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
+    for (int i = 0; i != 4; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        InOrder.set(i);
+      } else if ((idx / 4) == BestLoQuad) {
+        MaskV[i] = idx & 3;
+        InOrder.set(i);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
+
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
+                                  NewV.getOperand(0),
+                                  getShufflePSHUFLWImmediate(SVOp), DAG);
+    }
+  }
+
+  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
+  // and update MaskVals with the new element order.
+  if (BestHiQuad >= 0) {
+    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
+    for (unsigned i = 4; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        InOrder.set(i);
+      } else if ((idx / 4) == BestHiQuad) {
+        MaskV[i] = (idx & 3) + 4;
+        InOrder.set(i);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
+
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
+                                  NewV.getOperand(0),
+                                  getShufflePSHUFHWImmediate(SVOp), DAG);
+    }
+  }
+
+  // In case BestHi & BestLo were both -1, which means each quadword has a word
+  // from each of the four input quadwords, calculate the InOrder bitvector now
+  // before falling through to the insert/extract cleanup.
+  if (BestLoQuad == -1 && BestHiQuad == -1) {
+    NewV = V1;
+    for (int i = 0; i != 8; ++i)
+      if (MaskVals[i] < 0 || MaskVals[i] == i)
+        InOrder.set(i);
+  }
+
+  // The other elements are put in the right place using pextrw and pinsrw.
+  for (unsigned i = 0; i != 8; ++i) {
+    if (InOrder[i])
+      continue;
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0)
+      continue;
+    SDValue ExtOp = (EltIdx < 8) ?
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+                  DAG.getIntPtrConstant(EltIdx)) :
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+                  DAG.getIntPtrConstant(EltIdx - 8));
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+                       DAG.getIntPtrConstant(i));
+  }
+  return NewV;
+}
+
+/// \brief v16i16 shuffles
+///
+/// FIXME: We only support generation of a single pshufb currently.  We can
+/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
+/// well (e.g 2 x pshufb + 1 x por).
+static SDValue
+LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+
+  if (V2.getOpcode() != ISD::UNDEF)
+    return SDValue();
+
+  SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+  return getPSHUFB(MaskVals, V1, dl, DAG);
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
+static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+                                        const X86Subtarget* Subtarget,
+                                        SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  ArrayRef<int> MaskVals = SVOp->getMask();
+
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
+
+  // If we have SSSE3, case 1 is generated when all result bytes come from
+  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
+  // present, fall back to case 3.
+
+  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+  if (Subtarget->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+
+    // If all result elements are from one input vector, then only translate
+    // undef mask values to 0x80 (zero out result) in the pshufb mask.
+    //
+    // Otherwise, we have elements from both input vectors, and must zero out
+    // elements that come from V2 in the first mask, and V1 in the second mask
+    // so that we can OR them together.
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 0 || EltIdx >= 16)
+        EltIdx = 0x80;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, pshufbMask));
+
+    // As PSHUFB will zero elements with negative indices, it's safe to ignore
+    // the 2nd operand if it's undefined or zero.
+    if (V2.getOpcode() == ISD::UNDEF ||
+        ISD::isBuildVectorAllZeros(V2.getNode()))
+      return V1;
+
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, pshufbMask));
+    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+  }
+
+  // No SSSE3 - Calculate in place words and then fix all out of place words
+  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
+  // the 16 different words that comprise the two doublequadword input vectors.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+  SDValue NewV = V1;
+  for (int i = 0; i != 8; ++i) {
+    int Elt0 = MaskVals[i*2];
+    int Elt1 = MaskVals[i*2+1];
+
+    // This word of the result is all undef, skip it.
+    if (Elt0 < 0 && Elt1 < 0)
+      continue;
+
+    // This word of the result is already in the correct place, skip it.
+    if ((Elt0 == i*2) && (Elt1 == i*2+1))
+      continue;
+
+    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+    SDValue InsElt;
+
+    // If Elt0 and Elt1 are defined, are consecutive, and can be load
+    // using a single extract together, load it and store it.
+    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                        DAG.getIntPtrConstant(i));
+      continue;
+    }
+
+    // If Elt1 is defined, extract it from the appropriate source.  If the
+    // source byte is not also odd, shift the extracted word left 8 bits
+    // otherwise clear the bottom 8 bits if we need to do an or.
+    if (Elt1 >= 0) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      if ((Elt1 & 1) == 0)
+        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+                             DAG.getConstant(8,
+                                  TLI.getShiftAmountTy(InsElt.getValueType())));
+      else if (Elt0 >= 0)
+        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+                             DAG.getConstant(0xFF00, MVT::i16));
+    }
+    // If Elt0 is defined, extract it from the appropriate source.  If the
+    // source byte is not also even, shift the extracted word right 8 bits. If
+    // Elt1 was also defined, OR the extracted values together before
+    // inserting them in the result.
+    if (Elt0 >= 0) {
+      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+      if ((Elt0 & 1) != 0)
+        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(8,
+                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
+      else if (Elt1 >= 0)
+        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+                             DAG.getConstant(0x00FF, MVT::i16));
+      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+                         : InsElt0;
+    }
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                       DAG.getIntPtrConstant(i));
+  }
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
+}
+
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+                                 const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // VPSHUFB may be generated if
+  // (1) one of input vector is undefined or zeroinitializer.
+  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+  // And (2) the mask indexes don't cross the 128-bit lane.
+  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
+      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+    return SDValue();
+
+  if (V1IsAllZero && !V2IsAllZero) {
+    CommuteVectorShuffleMask(MaskVals, 32);
+    V1 = V2;
+  }
+  return getPSHUFB(MaskVals, V1, dl, DAG);
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g.
+/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDLoc dl(SVOp);
+  unsigned NumElems = VT.getVectorNumElements();
+  MVT NewVT;
+  unsigned Scale;
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected!");
+  case MVT::v2i64:
+  case MVT::v2f64:
+           return SDValue(SVOp, 0);
+  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
+  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
+  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
+  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
+  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
+  }
+
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i != NumElems; i += Scale) {
+    int StartIdx = -1;
+    for (unsigned j = 0; j != Scale; ++j) {
+      int EltIdx = SVOp->getMaskElt(i+j);
+      if (EltIdx < 0)
+        continue;
+      if (StartIdx < 0)
+        StartIdx = (EltIdx / Scale);
+      if (EltIdx != (int)(StartIdx*Scale + j))
+        return SDValue();
+    }
+    MaskVec.push_back(StartIdx);
+  }
+
+  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
+  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
+                            SDValue SrcOp, SelectionDAG &DAG,
+                            const X86Subtarget *Subtarget, SDLoc dl) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = nullptr;
+    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BITCAST, dl, VT,
+                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   OpVT,
+                                                   SrcOp.getOperand(0)
+                                                          .getOperand(0))));
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                 DAG.getNode(ISD::BITCAST, dl,
+                                             OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  MVT VT = SVOp->getSimpleValueType(0);
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLaneElems = NumElems / 2;
+
+  SDLoc dl(SVOp);
+  MVT EltVT = VT.getVectorElementType();
+  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+  SDValue Output[2];
+
+  SmallVector<int, 16> Mask;
+  for (unsigned l = 0; l < 2; ++l) {
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with UseBuildVector set.
+    bool UseBuildVector = false;
+    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
+    unsigned LaneStart = l * NumLaneElems;
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      // The mask element.  This indexes into the input.
+      int Idx = SVOp->getMaskElt(i+LaneStart);
+      if (Idx < 0) {
+        // the mask element does not index into any input vector.
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // The input vector this mask element indexes into.
+      int Input = Idx / NumLaneElems;
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NumLaneElems;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input)
+          // This input vector is already an operand.
+          break;
+        if (InputUsed[OpNo] < 0) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
+
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        UseBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Mask.push_back(Idx + OpNo * NumLaneElems);
+    }
+
+    if (UseBuildVector) {
+      SmallVector<SDValue, 16> SVOps;
+      for (unsigned i = 0; i != NumLaneElems; ++i) {
+        // The mask element.  This indexes into the input.
+        int Idx = SVOp->getMaskElt(i+LaneStart);
+        if (Idx < 0) {
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
+
+        // The input vector this mask element indexes into.
+        int Input = Idx / NumElems;
+
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NumElems;
+
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    SVOp->getOperand(Input),
+                                    DAG.getIntPtrConstant(Idx)));
+      }
+
+      // Construct the output using a BUILD_VECTOR.
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
+    } else if (InputUsed[0] < 0) {
+      // No input vectors were used! The result is undefined.
+      Output[l] = DAG.getUNDEF(NVT);
+    } else {
+      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
+                                        (InputUsed[0] % 2) * NumLaneElems,
+                                        DAG, dl);
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
+        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
+                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
+      // At least one input vector was used. Create a new shuffle vector.
+      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+    }
+
+    Mask.clear();
+  }
+
+  // Concatenate the result back
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
+static SDValue
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  MVT VT = SVOp->getSimpleValueType(0);
+
+  assert(VT.is128BitVector() && "Unsupported vector size");
+
+  std::pair<int, int> Locs[4];
+  int Mask1[] = { -1, -1, -1, -1 };
+  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  unsigned NumHi = 0;
+  unsigned NumLo = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else {
+      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+      if (Idx < 4) {
+        Locs[i] = std::make_pair(0, NumLo);
+        Mask1[NumLo] = Idx;
+        NumLo++;
+      } else {
+        Locs[i] = std::make_pair(1, NumHi);
+        if (2+NumHi < 4)
+          Mask1[2+NumHi] = Idx;
+        NumHi++;
+      }
+    }
+  }
+
+  if (NumLo <= 2 && NumHi <= 2) {
+    // If no more than two elements come from either vector. This can be
+    // implemented with two shuffles. First shuffle gather the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, put the elements into the right order.
+    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    int Mask2[] = { -1, -1, -1, -1 };
+
+    for (unsigned i = 0; i != 4; ++i)
+      if (Locs[i].first != -1) {
+        unsigned Idx = (i < 2) ? 0 : 4;
+        Idx += Locs[i].first * 2 + Locs[i].second;
+        Mask2[i] = Idx;
+      }
+
+    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+  }
+
+  if (NumLo == 3 || NumHi == 3) {
+    // Otherwise, we must have three elements from one vector, call it X, and
+    // one element from the other, call it Y.  First, use a shufps to build an
+    // intermediate vector with the one element from Y and the element from X
+    // that will be in the same half in the final destination (the indexes don't
+    // matter). Then, use a shufps to build the final vector, taking the half
+    // containing the element from Y from the intermediate, and the other half
+    // from X.
+    if (NumHi == 3) {
+      // Normalize it so the 3 elements come from V1.
+      CommuteVectorShuffleMask(PermMask, 4);
+      std::swap(V1, V2);
+    }
+
+    // Find the element from V2.
+    unsigned HiIndex;
+    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+      int Val = PermMask[HiIndex];
+      if (Val < 0)
+        continue;
+      if (Val >= 4)
+        break;
+    }
+
+    Mask1[0] = PermMask[HiIndex];
+    Mask1[1] = -1;
+    Mask1[2] = PermMask[HiIndex^1];
+    Mask1[3] = -1;
+    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    if (HiIndex >= 2) {
+      Mask1[0] = PermMask[0];
+      Mask1[1] = PermMask[1];
+      Mask1[2] = HiIndex & 1 ? 6 : 4;
+      Mask1[3] = HiIndex & 1 ? 4 : 6;
+      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+    }
+
+    Mask1[0] = HiIndex & 1 ? 2 : 0;
+    Mask1[1] = HiIndex & 1 ? 0 : 2;
+    Mask1[2] = PermMask[2];
+    Mask1[3] = PermMask[3];
+    if (Mask1[2] >= 0)
+      Mask1[2] += 4;
+    if (Mask1[3] >= 0)
+      Mask1[3] += 4;
+    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+  }
+
+  // Break it into (shuffle shuffle_hi, shuffle_lo).
+  int LoMask[] = { -1, -1, -1, -1 };
+  int HiMask[] = { -1, -1, -1, -1 };
+
+  int *MaskPtr = LoMask;
+  unsigned MaskIdx = 0;
+  unsigned LoIdx = 0;
+  unsigned HiIdx = 2;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (i == 2) {
+      MaskPtr = HiMask;
+      MaskIdx = 1;
+      LoIdx = 0;
+      HiIdx = 2;
+    }
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else if (Idx < 4) {
+      Locs[i] = std::make_pair(MaskIdx, LoIdx);
+      MaskPtr[LoIdx] = Idx;
+      LoIdx++;
+    } else {
+      Locs[i] = std::make_pair(MaskIdx, HiIdx);
+      MaskPtr[HiIdx] = Idx;
+      HiIdx++;
+    }
+  }
+
+  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+  int MaskOps[] = { -1, -1, -1, -1 };
+  for (unsigned i = 0; i != 4; ++i)
+    if (Locs[i].first != -1)
+      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
+  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+static bool MayFoldVectorLoad(SDValue V) {
+  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    V = V.getOperand(0);
+  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+    // BUILD_VECTOR (load), undef
+    V = V.getOperand(0);
+
+  return MayFoldLoad(V);
+}
+
+static
+SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  // Canonizalize to v2f64.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
+                                          V1, DAG));
+}
+
+static
+SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
+                        bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT != MVT::v2i64 && "unsupported shuffle type");
+
+  if (HasSSE2 && VT == MVT::v2f64)
+    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
+
+  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
+}
+
+static
+SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+
+  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
+         "unsupported shuffle type");
+
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
+
+  // v4i32 or v4f32
+  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
+}
+
+static
+SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
+  // operand of these instructions is only memory, so check if there's a
+  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
+  // same masks.
+  bool CanFoldLoad = false;
+
+  // Trivial case, when V2 comes from a load.
+  if (MayFoldVectorLoad(V2))
+    CanFoldLoad = true;
+
+  // When V1 is a load, it can be folded later into a store in isel, example:
+  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
+  //    turns into:
+  //  (MOVLPSmr addr:$src1, VR128:$src2)
+  // So, recognize this potential and also use MOVLPS or MOVLPD
+  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+    CanFoldLoad = true;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  if (CanFoldLoad) {
+    if (HasSSE2 && NumElems == 2)
+      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
+
+    if (NumElems == 4)
+      // If we don't care about the second element, proceed to use movss.
+      if (SVOp->getMaskElt(1) != -1)
+        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+  }
+
+  // movl and movlp will both match v2i64, but v2i64 is never matched by
+  // movl earlier because we make it strict to avoid messing with the movlp load
+  // folding logic (see the code above getMOVLP call). Match it here then,
+  // this is horrible, but will stay like this until we move all shuffle
+  // matching to x86 specific nodes. Note that for the 1st condition all
+  // types are matched with movsd.
+  if (HasSSE2) {
+    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+    // as to remove this logic from here, as much as possible
+    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
+      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+  }
+
+  assert(VT != MVT::v4i32 && "unsupported shuffle type");
+
+  // Invert the operand order and use SHUFPS to match it.
+  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
+                              getShuffleSHUFImmediate(SVOp), DAG);
+}
+
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
+  auto FromV2Predicate = [](const int &i) { return i >= 4; };
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
+
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
+                Mask.begin();
+
+    // If we have 1 element from each vector, we have to check if we're
+    // changing V1's element's place. If so, we're done. Otherwise, we
+    // should assume we're changing V2's element's place and behave
+    // accordingly.
+    int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
+    if (FromV1 == FromV2 && DestIndex == Mask[DestIndex] % 4) {
+      From = V2;
+      To = V1;
+      DestIndex =
+          std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
+    }
+  } else {
+    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
+           "More than one element from V1 and from V2, or no elements from one "
+           "of the vectors. This case should not have returned true from "
+           "isINSERTPSMask");
+    From = V2;
+    To = V1;
+    DestIndex =
+        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
+  }
+
+  // Get an index into the source vector in the range [0,4) (the mask is
+  // in the range [0,8) because it can address V1 and V2)
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    SDValue NewLoad =
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
+
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    } else { // EVT == MVT::i32
+      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+      // instruction, to match the PINSRD instruction, which loads an i32 to a
+      // certain vector element.
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                         DAG.getConstant(DestIndex, MVT::i32));
+    }
+  }
+
+  // Vector-element-to-vector
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
+// Reduce a vector shuffle to zext.
+static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  // PMOVZX is only available from SSE41.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  MVT VT = Op.getSimpleValueType();
+
+  // Only AVX2 support 256-bit vector integer extending.
+  if (!Subtarget->hasInt256() && VT.is256BitVector())
+    return SDValue();
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDLoc DL(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Extending is an unary operation and the element type of the source vector
+  // won't be equal to or larger than i64.
+  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+      VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+  while ((1U << Shift) < NumElems) {
+    if (SVOp->getMaskElt(1U << Shift) == 1)
+      break;
+    Shift += 1;
+    // The maximal ratio is 8, i.e. from i8 to i64.
+    if (Shift > 3)
+      return SDValue();
+  }
+
+  // Check the shuffle mask.
+  unsigned Mask = (1U << Shift) - 1;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if ((i & Mask) != 0 && EltIdx != -1)
+      return SDValue();
+    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+      return SDValue();
+  }
+
+  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+  MVT NeVT = MVT::getIntegerVT(NBits);
+  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
+
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
+    return SDValue();
+
+  // Simplify the operand as it's prepared to be fed into shuffle.
+  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+  if (V1.getOpcode() == ISD::BITCAST &&
+      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V1.getOperand(0).getOperand(0)
+        .getSimpleValueType().getSizeInBits() == SignificantBits) {
+    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+    ConstantSDNode *CIdx =
+      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+    // If it's foldable, i.e. normal load with single use, we will let code
+    // selection to fold it. Otherwise, we will short the conversion sequence.
+    if (CIdx && CIdx->getZExtValue() == 0 &&
+        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
+      MVT FullVT = V.getSimpleValueType();
+      MVT V1VT = V1.getSimpleValueType();
+      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
+        // The "ext_vec_elt" node is wider than the result node.
+        // In this case we should extract subvector from V.
+        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
+        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
+                                        FullVT.getVectorNumElements()/Ratio);
+        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
+                        DAG.getIntPtrConstant(0));
+      }
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
+static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+
+  if (isZeroShuffle(SVOp))
+    return getZeroVector(VT, Subtarget, DAG, dl);
+
+  // Handle splat operations
+  if (SVOp->isSplat()) {
+    // Use vbroadcast whenever the splat comes from a foldable load
+    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
+    if (Broadcast.getNode())
+      return Broadcast;
+  }
+
+  // Check integer expanding shuffles.
+  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  // If the shuffle can be profitably rewritten as a narrower shuffle, then
+  // do it!
+  if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
+      VT == MVT::v32i8) {
+    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+    if (NewOp.getNode())
+      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
+  } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
+    // FIXME: Figure out a cleaner way to do this.
+    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+      if (NewOp.getNode()) {
+        MVT NewVT = NewOp.getSimpleValueType();
+        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
+                               NewVT, true, false))
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
+                              dl);
+      }
+    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+      if (NewOp.getNode()) {
+        MVT NewVT = NewOp.getSimpleValueType();
+        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
+                              dl);
+      }
+    }
+  }
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  unsigned NumElems = VT.getVectorNumElements();
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+  bool HasSSE2 = Subtarget->hasSSE2();
+  bool HasFp256    = Subtarget->hasFp256();
+  bool HasInt256   = Subtarget->hasInt256();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // Check if we should use the experimental vector shuffle lowering. If so,
+  // delegate completely to that code path.
+  if (ExperimentalVectorShuffleLowering)
+    return lowerVectorShuffle(Op, Subtarget, DAG);
+
+  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // Vector shuffle lowering takes 3 steps:
+  //
+  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
+  //    narrowing and commutation of operands should be handled.
+  // 2) Matching of shuffles with known shuffle masks to x86 target specific
+  //    shuffle nodes.
+  // 3) Rewriting of unmatched masks into new generic shuffle operations,
+  //    so the shuffle can be broken into other shuffles and the legalizer can
+  //    try the lowering again.
+  //
+  // The general idea is that no vector_shuffle operation should be left to
+  // be matched during isel, all of them must be converted to a target specific
+  // node here.
+
+  // Normalize the input vectors. Here splats, zeroed vectors, profitable
+  // narrowing and commutation of operands should be handled. The actual code
+  // doesn't include all of those, work in progress...
+  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
+  // unpckh_undef). Only use pshufd if speed is more important than size.
+  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
+      V2IsUndef && MayFoldVectorLoad(V1))
+    return getMOVDDup(Op, dl, V1, DAG);
+
+  if (isMOVHLPS_v_undef_Mask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
+
+  // Use to match splats
+  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
+      (VT == MVT::v2f64 || VT == MVT::v2i64))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+  if (isPSHUFDMask(M, VT)) {
+    // The actual implementation will match the mask in the if above and then
+    // during isel it can match several different instructions, not only pshufd
+    // as its name says, sad but true, emulate the behavior for now...
+    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
+      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+
+    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
+
+    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
+
+    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+                                  DAG);
+
+    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
+                                TargetMask, DAG);
+  }
+
+  if (isPALIGNRMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+                                getShufflePALIGNRImmediate(SVOp),
+                                DAG);
+
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDValue ShVal;
+  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use
+    // v_set0 + movlhps or movhlps, etc.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  if (isMOVLMask(M, VT)) {
+    if (ISD::isBuildVectorAllZeros(V1.getNode()))
+      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+    if (!isMOVLPMask(M, VT)) {
+      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+
+      if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+    }
+  }
+
+  // FIXME: fold these into legal mask.
+  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
+    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+
+  if (isMOVHLPSMask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
+
+  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
+
+  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
+
+  if (isMOVLPMask(M, VT))
+    return getMOVLP(Op, dl, DAG, HasSSE2);
+
+  if (ShouldXformToMOVHLPS(M, VT) ||
+      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  if (isShift) {
+    // No better options. Use a vshldq / vsrldq.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
+  BitVector UndefElements;
+  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
+    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+      V1IsSplat = true;
+  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
+    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+      V2IsSplat = true;
+
+  // Canonicalize the splat or undef, if present, to be on the RHS.
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = true;
+  }
+
+  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
+    // Shuffling low element of v1 into undef, just return v1.
+    if (V2IsUndef)
+      return V1;
+    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+    // the instruction selector will not match, so get a canonical MOVL with
+    // swapped operands to undo the commute.
+    return getMOVL(DAG, dl, VT, V2, V1);
+  }
+
+  if (isUNPCKLMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+  if (isUNPCKHMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+
+  if (V2IsSplat) {
+    // Normalize mask so all entries that point to V2 points to its first
+    // element then try to match unpck{h|l} again. If match, return a
+    // new vector_shuffle with the corrected mask.p
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
+
+  if (Commuted) {
+    // Commute is back and try unpck* again.
+    // FIXME: this seems wrong.
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+
+    if (isUNPCKLMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+    if (isUNPCKHMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
+
+  // Normalize the node to match x86 shuffle ops if needed
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // The checks below are all present in isShuffleMaskLegal, but they are
+  // inlined here right now to enable us to directly emit target specific
+  // nodes, and remove one by one until they don't return Op anymore.
+
+  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
+      SVOp->getSplatIndex() == 0 && V2IsUndef) {
+    if (VT == MVT::v2f64 || VT == MVT::v2i64)
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  }
+
+  if (isPSHUFHWMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
+                                getShufflePSHUFHWImmediate(SVOp),
+                                DAG);
+
+  if (isPSHUFLWMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
+                                getShufflePSHUFLWImmediate(SVOp),
+                                DAG);
+
+  unsigned MaskValue;
+  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
+                  &MaskValue))
+    return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
+
+  if (isSHUFPMask(M, VT))
+    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
+                                getShuffleSHUFImmediate(SVOp), DAG);
+
+  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+  //===--------------------------------------------------------------------===//
+  // Generate target specific nodes for 128 or 256-bit shuffles only
+  // supported in the AVX instruction set.
+  //
+
+  // Handle VMOVDDUPY permutations
+  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+  // Handle VPERMILPS/D* permutations
+  if (isVPERMILPMask(M, VT)) {
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+                                  getShuffleSHUFImmediate(SVOp), DAG);
+    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+                                getShuffleSHUFImmediate(SVOp), DAG);
+  }
+
+  unsigned Idx;
+  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+                              Idx*(NumElems/2), DAG, dl);
+
+  // Handle VPERM2F128/VPERM2I128 permutations
+  if (isVPERM2X128Mask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
+                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
+
+  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+    return getINSERTPS(SVOp, dl, DAG);
+
+  unsigned Imm8;
+  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
+
+  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
+      VT.is512BitVector()) {
+    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
+    SmallVector<SDValue, 16> permclMask;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
+    }
+
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
+    if (V2IsUndef)
+      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+      return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+    return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Since no target specific shuffle was selected for this generic one,
+  // lower it into other known shuffles. FIXME: this isn't true yet, but
+  // this is the plan.
+  //
+
+  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+  if (VT == MVT::v8i16) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  if (VT == MVT::v16i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  if (VT == MVT::v32i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  // Handle all 128-bit wide vectors with 4 elements, and match them with
+  // several different shuffle types.
+  if (NumElems == 4 && VT.is128BitVector())
+    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+  // Handle general 256-bit shuffles
+  if (VT.is256BitVector())
+    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
+
+  return SDValue();
+}
+
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      // Lane1Cond != 0, means we want the first argument.
+      // Lane1Cond == 0, means we want the second argument.
+      // The encoding of this argument is 0 for the first argument, 1
+      // for the second. Therefore, invert the condition.
+      MaskValue |= !Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  // Some types for vselect were previously set to Expand, not Legal or
+  // Custom. Return an empty SDValue so we fall-through to Expand, after
+  // the Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
+  }
+
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
+  return Op;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+    return SDValue();
+
+  if (VT.getSizeInBits() == 8) {
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT.getSizeInBits() == 16) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32,
+                                                 Op.getOperand(0)),
+                                     Op.getOperand(1)));
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT == MVT::f32) {
+    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+    // the result back to FR32 register. It's only worth matching if the
+    // result has a single use which is a store or a bitcast to i32.  And in
+    // the case of a store, it's not worth it if the index is a constant 0,
+    // because a MOVSSmr can be used instead, which is smaller and faster.
+    if (!Op.hasOneUse())
+      return SDValue();
+    SDNode *User = *Op.getNode()->use_begin();
+    if ((User->getOpcode() != ISD::STORE ||
+         (isa<ConstantSDNode>(Op.getOperand(1)) &&
+          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+        (User->getOpcode() != ISD::BITCAST ||
+         User->getValueType(0) != MVT::i32))
+      return SDValue();
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
+                                              Op.getOperand(0)),
+                                              Op.getOperand(1));
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+  }
+
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    // ExtractPS/pextrq works with constant index.
+    if (isa<ConstantSDNode>(Op.getOperand(1)))
+      return Op;
+  }
+  return SDValue();
+}
+
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+  MVT EltVT = Op.getSimpleValueType();
+
+  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+
+  // variable index can't be handled in mask registers,
+  // extend vector to VR512
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext, Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  unsigned MaxSift = rc->getSize()*8 - 1;
+  Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+  Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+                    DAG.getConstant(MaxSift, MVT::i8));
+  return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+                       DAG.getIntPtrConstant(0));
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+
+  if (Op.getSimpleValueType() == MVT::i1)
+    return ExtractBitFromMaskVector(Op, DAG);
+
+  if (!isa<ConstantSDNode>(Idx)) {
+    if (VecVT.is512BitVector() ||
+        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+         VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+      MVT MaskEltVT =
+        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+                                    MaskEltVT.getSizeInBits());
+
+      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+                                getZeroVector(MaskVT, Subtarget, DAG, dl),
+                                Idx, DAG.getConstant(0, getPointerTy()));
+      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+                        Perm, DAG.getConstant(0, getPointerTy()));
+    }
+    return SDValue();
+  }
+
+  // If this is a 256-bit vector result, first extract the 128-bit vector and
+  // then extract the element from the 128-bit vector.
+  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
+
+    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    // Get the 128-bit vector.
+    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+    MVT EltVT = VecVT.getVectorElementType();
+
+    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+
+    //if (IdxVal >= NumElems/2)
+    //  IdxVal -= NumElems/2;
+    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+                       DAG.getConstant(IdxVal, MVT::i32));
+  }
+
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+  if (Subtarget->hasSSE41()) {
+    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  MVT VT = Op.getSimpleValueType();
+  // TODO: handle v16i8.
+  if (VT.getSizeInBits() == 16) {
+    SDValue Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32, Vec),
+                                     Op.getOperand(1)));
+    // Transform it so it match pextrw which produces a 32-bit result.
+    MVT EltVT = MVT::i32;
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT.getSizeInBits() == 32) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // SHUFPS the element to the lowest double word, then movss.
+    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  if (VT.getSizeInBits() == 64) {
+    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+    //        to match extract_elt for f64.
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+    int Mask[2] = { 1, -1 };
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  SDLoc dl(Op);
+
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  if (!VT.is128BitVector())
+    return SDValue();
+
+  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
+      isa<ConstantSDNode>(N2)) {
+    unsigned Opc;
+    if (VT == MVT::v8i16)
+      Opc = X86ISD::PINSRW;
+    else if (VT == MVT::v16i8)
+      Opc = X86ISD::PINSRB;
+    else
+      Opc = X86ISD::PINSRB;
+
+    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
+    // argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+  }
+
+  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+    // Bits [7:6] of the constant are the source select.  This will always be
+    //  zero here.  The DAG Combiner may combine an extract_elt index into these
+    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
+    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select.  This is the
+    //  value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
+    //   combine either bitwise AND or insert of float 0.0 to set these bits.
+    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+    // Create this as a scalar to vector..
+    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+  }
+
+  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
+    // PINSR* works with constant index.
+    return Op;
+  }
+  return SDValue();
+}
+
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue 
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  MVT VecVT = Vec.getSimpleValueType();
+
+  if (!isa<ConstantSDNode>(Idx)) {
+    // Non constant index. Extend source and destination,
+    // insert element and then truncate the result.
+    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                       DAG.getConstant(IdxVal, MVT::i8));
+  const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  unsigned MaxSift = rc->getSize()*8 - 1;
+  EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                    DAG.getConstant(MaxSift, MVT::i8));
+  EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  
+  if (EltVT == MVT::i1)
+    return InsertBitToMaskVector(Op, DAG);
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  // If this is a 256-bit vector result, first extract the 128-bit vector,
+  // insert the element into the extracted half and then place it back.
+  if (VT.is256BitVector() || VT.is512BitVector()) {
+    if (!isa<ConstantSDNode>(N2))
+      return SDValue();
+
+    // Get the desired 128-bit vector half.
+    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
+    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+
+    // Insert the element into the desired half.
+    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
+    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
+
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+                    DAG.getConstant(IdxIn128, MVT::i32));
+
+    // Insert the changed part back to the 256-bit vector
+    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+  }
+
+  if (Subtarget->hasSSE41())
+    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
+    // as its second argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+  }
+  return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT OpVT = Op.getSimpleValueType();
+
+  // If this is a 256-bit vector result, first insert into a 128-bit
+  // vector and then insert into the 256-bit vector.
+  if (!OpVT.is128BitVector()) {
+    // Insert into a 128-bit vector.
+    unsigned SizeFactor = OpVT.getSizeInBits()/128;
+    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+                                 OpVT.getVectorNumElements() / SizeFactor);
+
+    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+    // Insert the 128-bit vector.
+    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+  }
+
+  if (OpVT == MVT::v1i64 &&
+      Op.getOperand(0).getValueType() == MVT::i64)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+  return DAG.getNode(ISD::BITCAST, dl, OpVT,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  SDValue In =  Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT ResVT   = Op.getSimpleValueType();
+  MVT InVT    = In.getSimpleValueType();
+
+  if (Subtarget->hasFp256()) {
+    if (ResVT.is128BitVector() &&
+        (InVT.is256BitVector() || InVT.is512BitVector()) &&
+        isa<ConstantSDNode>(Idx)) {
+      return Extract128BitVector(In, IdxVal, DAG, dl);
+    }
+    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      return Extract256BitVector(In, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    SDLoc dl(Op.getNode());
+    SDValue Vec = Op.getNode()->getOperand(0);
+    SDValue SubVec = Op.getNode()->getOperand(1);
+    SDValue Idx = Op.getNode()->getOperand(2);
+
+    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
+         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
+        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
+
+    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
+        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                             CP->getAlignment(),
+                                             CP->getOffset(), OpFlag);
+  SDLoc DL(CP);
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     SDLoc(), getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                          OpFlag);
+  SDLoc DL(JT);
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     SDLoc(), getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+      OpFlag = X86II::MO_GOTPCREL;
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOT;
+  } else if (Subtarget->isPICStyleStubPIC()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  } else if (Subtarget->isPICStyleStubNoDynamic()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY;
+  }
+
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+
+  SDLoc DL(Op);
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->is64Bit()) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     SDLoc(), getPointerTy()),
+                         Result);
+  }
+
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  // Create the TargetBlockAddressAddress node.
+  unsigned char OpFlags =
+    Subtarget->ClassifyBlockAddressReference();
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+  SDLoc dl(Op);
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+                                             OpFlags);
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
+                                      int64_t Offset, SelectionDAG &DAG) const {
+  // Create the TargetGlobalAddress node, folding in the constant
+  // offset if it is legal.
+  unsigned char OpFlags =
+      Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  SDValue Result;
+  if (OpFlags == X86II::MO_NO_FLAG &&
+      X86::isOffsetSuitableForCodeModel(Offset, M)) {
+    // A direct static reference to a global.
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+    Offset = 0;
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+  }
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
+
+  // For globals that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlags))
+    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags, bool LocalDynamic = false) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDLoc dl(GA);
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
+
+  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+                                           : X86ISD::TLSADDR;
+
+  if (InFlag) {
+    SDValue Ops[] = { Chain,  TGA, *InFlag };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+  } else {
+    SDValue Ops[]  = { Chain, TGA };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+  }
+
+  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+  MFI->setAdjustsStack(true);
+
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  SDValue InFlag;
+  SDLoc dl(GA);  // ? function entry point might be better
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+                                   DAG.getNode(X86ISD::GlobalBaseReg,
+                                               SDLoc(), PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                           SelectionDAG &DAG,
+                                           const EVT PtrVT,
+                                           bool is64Bit) {
+  SDLoc dl(GA);
+
+  // Get the start address of the TLS block for this module.
+  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+      .getInfo<X86MachineFunctionInfo>();
+  MFI->incNumLocalDynamicTLSAccesses();
+
+  SDValue Base;
+  if (is64Bit) {
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
+                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
+  } else {
+    SDValue InFlag;
+    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
+    InFlag = Chain.getValue(1);
+    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+  }
+
+  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+  // of Base.
+
+  // Build x@dtpoff.
+  unsigned char OperandFlags = X86II::MO_DTPOFF;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  // Add x@dtpoff with the base.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                   const EVT PtrVT, TLSModel::Model model,
+                                   bool is64Bit, bool isPIC) {
+  SDLoc dl(GA);
+
+  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+                                                         is64Bit ? 257 : 256));
+
+  SDValue ThreadPointer =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+                  MachinePointerInfo(Ptr), false, false, false, 0);
+
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (model == TLSModel::InitialExec) {
+    if (is64Bit) {
+      OperandFlags = X86II::MO_GOTTPOFF;
+      WrapperKind = X86ISD::WrapperRIP;
+    } else {
+      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+    }
+  } else {
+    llvm_unreachable("Unexpected model");
+  }
+
+  // emit "addl x@ntpoff,%eax" (local exec)
+  // or "addl x@indntpoff,%eax" (initial exec)
+  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+  SDValue TGA =
+      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                 GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  if (model == TLSModel::InitialExec) {
+    if (isPIC && !is64Bit) {
+      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+                           Offset);
+    }
+
+    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  if (Subtarget->isTargetELF()) {
+    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+
+    switch (model) {
+      case TLSModel::GeneralDynamic:
+        if (Subtarget->is64Bit())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+      case TLSModel::LocalDynamic:
+        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+                                           Subtarget->is64Bit());
+      case TLSModel::InitialExec:
+      case TLSModel::LocalExec:
+        return LowerToTLSExecModel(
+            GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
+            DAG.getTarget().getRelocationModel() == Reloc::PIC_);
+    }
+    llvm_unreachable("Unknown TLS model.");
+  }
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin only has one model of TLS.  Lower to that.
+    unsigned char OpFlag = 0;
+    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+                           X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+    // global base reg.
+    bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
+                 !Subtarget->is64Bit();
+    if (PIC32)
+      OpFlag = X86II::MO_TLVP_PIC_BASE;
+    else
+      OpFlag = X86II::MO_TLVP;
+    SDLoc DL(Op);
+    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+                                                GA->getValueType(0),
+                                                GA->getOffset(), OpFlag);
+    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+    // With PIC32, the address is actually $g + Offset.
+    if (PIC32)
+      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                           DAG.getNode(X86ISD::GlobalBaseReg,
+                                       SDLoc(), getPointerTy()),
+                           Offset);
+
+    // Lowering the machine isd will make sure everything is in the right
+    // location.
+    SDValue Chain = DAG.getEntryNode();
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Args[] = { Chain, Offset };
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+
+    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setAdjustsStack(true);
+
+    // And our return value (tls address) is in the standard call return value
+    // location.
+    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+                              Chain.getValue(1));
+  }
+
+  if (Subtarget->isTargetKnownWindowsMSVC() ||
+      Subtarget->isTargetWindowsGNU()) {
+    // Just use the implicit TLS architecture
+    // Need to generate someting similar to:
+    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+    //                                  ; from TEB
+    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
+    //   mov     rcx, qword [rdx+rcx*8]
+    //   mov     eax, .tls$:tlsvar
+    //   [rax+rcx] contains the address
+    // Windows 64bit: gs:0x58
+    // Windows 32bit: fs:__tls_array
+
+    SDLoc dl(GA);
+    SDValue Chain = DAG.getEntryNode();
+
+    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+    // use its literal value of 0x2C.
+    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+                                        ? Type::getInt8PtrTy(*DAG.getContext(),
+                                                             256)
+                                        : Type::getInt32PtrTy(*DAG.getContext(),
+                                                              257));
+
+    SDValue TlsArray =
+        Subtarget->is64Bit()
+            ? DAG.getIntPtrConstant(0x58)
+            : (Subtarget->isTargetWindowsGNU()
+                   ? DAG.getIntPtrConstant(0x2C)
+                   : DAG.getExternalSymbol("_tls_array", getPointerTy()));
+
+    SDValue ThreadPointer =
+        DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+                    MachinePointerInfo(Ptr), false, false, false, 0);
+
+    // Load the _tls_index variable
+    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+    if (Subtarget->is64Bit())
+      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+                           IDX, MachinePointerInfo(), MVT::i32,
+                           false, false, 0);
+    else
+      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+                        false, false, false, 0);
+
+    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+                                    getPointerTy());
+    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+                      false, false, false, 0);
+
+    // Get the offset of start of .tls section
+    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                             GA->getValueType(0),
+                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+    // The address of the thread local variable is the add of the thread
+    // pointer with the offset of the variable.
+    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+  }
+
+  llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  MVT VT = Op.getSimpleValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, MVT::i8));
+  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+                                     DAG.getConstant(VTBits - 1, MVT::i8))
+                       : DAG.getConstant(0, VT);
+
+  SDValue Tmp2, Tmp3;
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
+  } else {
+    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
+  }
+
+  // If the shift amount is larger or equal than the width of a part we can't
+  // rely on the results of shld/shrd. Insert a test and select the appropriate
+  // values for large shift amounts.
+  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                DAG.getConstant(VTBits, MVT::i8));
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                             AndNode, DAG.getConstant(0, MVT::i8));
+
+  SDValue Hi, Lo;
+  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+  } else {
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+  }
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+
+  if (SrcVT.isVector())
+    return SDValue();
+
+  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
+  // These are really Legal; return the operand so the caller accepts it as
+  // Legal.
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+    return Op;
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      Subtarget->is64Bit()) {
+    return Op;
+  }
+
+  SDLoc dl(Op);
+  unsigned Size = SrcVT.getSizeInBits()/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot,
+                               MachinePointerInfo::getFixedStack(SSFI),
+                               false, false, 0);
+  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                     SDValue StackSlot,
+                                     SelectionDAG &DAG) const {
+  // Build the FILD
+  SDLoc DL(Op);
+  SDVTList Tys;
+  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+  if (useSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+
+  unsigned ByteSize = SrcVT.getSizeInBits()/8;
+
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
+  }
+  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+                                           X86ISD::FILD, DL,
+                                           Tys, Ops, SrcVT, MMO);
+
+  if (useSSE) {
+    Chain = Result.getValue(1);
+    SDValue InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
+    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = {
+      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+    };
+    MachineMemOperand *MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, SSFISize, SSFISize);
+
+    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+                                    Ops, Op.getValueType(), MMO);
+    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, false, 0);
+  }
+
+  return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // This algorithm is not obvious. Here it is what we're trying to output:
+  /*
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1
+       addpd    %xmm1, %xmm0
+     #endif
+  */
+
+  SDLoc dl(Op);
+  LLVMContext *Context = DAG.getContext();
+
+  // Build some magic constants.
+  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+  SmallVector<Constant*,2> CV1;
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4530000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
+  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
+
+  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
+
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+                     DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // FP constant to bias correct the final result.
+  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                   MVT::f64);
+
+  // Load the 32-bit value into an XMM register.
+  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             Op.getOperand(0));
+
+  // Zero out the upper parts of the register.
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+                     DAG.getIntPtrConstant(0));
+
+  // Or the load with the bias.
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Load)),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Bias)));
+  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
+                   DAG.getIntPtrConstant(0));
+
+  // Subtract the bias.
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+  // Handle final rounding.
+  EVT DestVT = Op.getValueType();
+
+  if (DestVT.bitsLT(MVT::f64))
+    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                       DAG.getIntPtrConstant(0));
+  if (DestVT.bitsGT(MVT::f64))
+    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+  // Handle final rounding.
+  return Sub;
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  MVT SVT = N0.getSimpleValueType();
+  SDLoc dl(Op);
+
+  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+         "Custom UINT_TO_FP is not supported!");
+
+  MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+
+  if (Op.getValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+  MVT SrcVT = N0.getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
+
+  // Make a 64-bit buffer, and use it to build an FILD.
+  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+  if (SrcVT == MVT::i32) {
+    SDValue WordOff = DAG.getConstant(4, getPointerTy());
+    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+                                     getPointerTy(), StackSlot, WordOff);
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                                  StackSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+                                  OffsetSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+    return Fild;
+  }
+
+  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot, MachinePointerInfo(),
+                               false, false, 0);
+  // For i64 source, we need to add the appropriate power of 2 if the input
+  // was negative.  This is the same as the optimization in
+  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
+  // we must be careful to do the computation in x87 extended precision, not
+  // in SSE. (The generic code can't know it's OK to do this, or how to.)
+  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction()
+    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                          MachineMemOperand::MOLoad, 8, 8);
+
+  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+                                         MVT::i64, MMO);
+
+  APInt FF(32, 0x5F800000ULL);
+
+  // Check whether the sign bit is set.
+  SDValue SignSet = DAG.getSetCC(dl,
+                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
+                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
+                                 ISD::SETLT);
+
+  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+  SDValue FudgePtr = DAG.getConstantPool(
+                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+                                         getPointerTy());
+
+  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+  SDValue Zero = DAG.getIntPtrConstant(0);
+  SDValue Four = DAG.getIntPtrConstant(4);
+  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                               Zero, Four);
+  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+
+  // Load the value out, extending it from f32 to f80.
+  // FIXME: Avoid the extend by constructing the right constant pool?
+  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+                                 FudgePtr, MachinePointerInfo::getConstantPool(),
+                                 MVT::f32, false, false, 4);
+  // Extend everything to 80 bits to force it to be done on x87.
+  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+}
+
+std::pair<SDValue,SDValue>
+X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                    bool IsSigned, bool IsReplace) const {
+  SDLoc DL(Op);
+
+  EVT DstTy = Op.getValueType();
+
+  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_INT to lower!");
+
+  // These are really Legal.
+  if (DstTy == MVT::i32 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+  if (Subtarget->is64Bit() &&
+      DstTy == MVT::i64 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+
+  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
+  // stack slot, or into the FTOL runtime function.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  unsigned Opc;
+  if (!IsSigned && isIntegerTypeFTOL(DstTy))
+    Opc = X86ISD::WIN_FTOL;
+  else
+    switch (DstTy.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+    }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Value = Op.getOperand(0);
+  EVT TheVT = Op.getOperand(0).getValueType();
+  // FIXME This causes a redundant load/store if the SSE-class value is already
+  // in memory, such as if it is on the callstack.
+  if (isScalarFPTypeInSSEReg(TheVT)) {
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDValue Ops[] = {
+      Chain, StackSlot, DAG.getValueType(TheVT)
+    };
+
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                              MachineMemOperand::MOLoad, MemSize, MemSize);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, MemSize, MemSize);
+
+  if (Opc != X86ISD::WIN_FTOL) {
+    // Build the FP_TO_INT*_IN_MEM
+    SDValue Ops[] = { Chain, Value, StackSlot };
+    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+                                           Ops, DstTy, MMO);
+    return std::make_pair(FIST, StackSlot);
+  } else {
+    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
+      DAG.getVTList(MVT::Other, MVT::Glue),
+      Chain, Value);
+    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
+      MVT::i32, ftol.getValue(1));
+    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
+      MVT::i32, eax.getValue(2));
+    SDValue Ops[] = { eax, edx };
+    SDValue pair = IsReplace
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+      : DAG.getMergeValues(Ops, DL);
+    return std::make_pair(pair, SDValue());
+  }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+                              const X86Subtarget *Subtarget) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  // Optimize vectors in AVX mode:
+  //
+  //   v8i16 -> v8i32
+  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
+  //   Concat upper and lower parts.
+  //
+  //   v4i32 -> v4i64
+  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
+  //   Concat upper and lower parts.
+  //
+
+  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+    return SDValue();
+
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+
+  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  SDValue Undef = DAG.getUNDEF(InVT);
+  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+                             VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc DL(Op);
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Now we have only mask extension
+  assert(InVT.getVectorElementType() == MVT::i1);
+  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
+  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                               SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+    return LowerZERO_EXTEND_AVX512(Op, DAG);
+
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+         VT.getVectorNumElements() != SVT.getVectorNumElements());
+  return SDValue();
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  if (VT == MVT::i1) {
+    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+           "Invalid scalar TRUNCATE operation");
+    if (InVT == MVT::i32)
+      return SDValue();
+    if (InVT.getSizeInBits() == 64)
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
+    else if (InVT.getSizeInBits() < 32)
+      In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
+  }
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+         "Invalid TRUNCATE operation");
+
+  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
+    if (VT.getVectorElementType().getSizeInBits() >=8)
+      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+
+    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+    unsigned NumElts = InVT.getVectorNumElements();
+    assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
+    if (InVT.getSizeInBits() < 512) {
+      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
+      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+      InVT = ExtVT;
+    }
+    
+    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
+    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+    SDValue CP = DAG.getConstantPool(C, getPointerTy());
+    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
+    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
+  }
+
+  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+    if (Subtarget->hasInt256()) {
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+                         DAG.getIntPtrConstant(0));
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(0));
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(2));
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+    static const int ShufMask[] = {0, 2, 4, 6};
+    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+  }
+
+  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+    // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+    if (Subtarget->hasInt256()) {
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
+      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+
+      static const int ShufMask[] = {0,  2,  -1,  -1};
+      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
+                                &ShufMask[0]);
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                       DAG.getIntPtrConstant(0));
+      return DAG.getNode(ISD::BITCAST, DL, VT, In);
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+
+    // The PSHUFB mask:
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+    // The MOVLHPS Mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+  }
+
+  // Handle truncation of V256 to V128 using shuffles.
+  if (!VT.is128BitVector() || !InVT.is256BitVector())
+    return SDValue();
+
+  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
+
+  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+  // Prepare truncation shuffle mask
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = i * 2;
+  SDValue V = DAG.getVectorShuffle(NVT, DL,
+                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+                                   DAG.getUNDEF(NVT), &MaskVec[0]);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+                     DAG.getIntPtrConstant(0));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  assert(!Op.getSimpleValueType().isVector());
+
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ true, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+  if (!FIST.getNode()) return Op;
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ false, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  assert(FIST.getNode() && "Unexpected failure");
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                 In, DAG.getUNDEF(SVT)));
+}
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, ~(1ULL << 63))));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, ~(1U << 31))));
+  C = ConstantVector::getSplat(NumElts, C);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::AND, dl, ANDVT,
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+  }
+  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, 1ULL << 63)));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, 1U << 31)));
+  C = ConstantVector::getSplat(NumElts, C);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::XOR, dl, XORVT,
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+  }
+
+  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  LLVMContext *Context = DAG.getContext();
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT SrcVT = Op1.getSimpleValueType();
+
+  // If second operand is smaller, extend it first.
+  if (SrcVT.bitsLT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+    SrcVT = VT;
+  }
+  // And if it is bigger, shrink it first.
+  if (SrcVT.bitsGT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+    SrcVT = VT;
+  }
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+
+  // First get the sign bit of second operand.
+  SmallVector<Constant*,4> CV;
+  if (SrcVT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+  // Shift sign bit right or left if the two operands have different types.
+  if (SrcVT.bitsGT(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+                          DAG.getIntPtrConstant(0));
+  }
+
+  // Clear first operand sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(64, ~(1ULL << 63)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(32, ~(1U << 31)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  C = ConstantVector::get(CV);
+  CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+                                  DAG.getConstant(1, VT));
+  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  SDLoc DL(N);
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  SmallVector<SDValue, 8> VecIns;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+      VecIns.push_back(ExtractedFromVec);
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
+/// \brief return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+       ++UI) {
+    SDNode *User = *UI;
+    unsigned UOpNo = UI.getOperandNo();
+    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look pass truncate.
+      UOpNo = User->use_begin().getOperandNo();
+      User = *User->use_begin();
+    }
+
+    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+      return true;
+  }
+  return false;
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
+                                    SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::i1)
+    // KORTEST instruction should be selected
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  // CF and OF aren't always set the way we want. Determine which
+  // of these we need.
+  bool NeedCF = false;
+  bool NeedOF = false;
+  switch (X86CC) {
+  default: break;
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  case X86::COND_G: case X86::COND_GE:
+  case X86::COND_L: case X86::COND_LE:
+  case X86::COND_O: case X86::COND_NO: {
+    // Check if we really need to set the
+    // Overflow flag. If NoSignedWrap is present
+    // that is not actually needed.
+    switch (Op->getOpcode()) {
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::SHL: {
+      const BinaryWithFlagsSDNode *BinNode =
+          cast<BinaryWithFlagsSDNode>(Op.getNode());
+      if (BinNode->hasNoSignedWrap())
+        break;
+    }
+    default:
+      NeedOF = true;
+      break;
+    }
+    break;
+  }
+  }
+  // See if we can use the EFLAGS value from the operand instead of
+  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+  // we prove that the arithmetic won't overflow, we can't use OF or CF.
+  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+    // Emit a CMP with 0, which is the TEST pattern.
+    //if (Op.getValueType() == MVT::i1)
+    //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
+    //                     DAG.getConstant(0, MVT::i1));
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+  }
+  unsigned Opcode = 0;
+  unsigned NumOperands = 0;
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic instruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+        default: break;
+        case ISD::ADD:
+        case ISD::SUB:
+        case ISD::AND:
+        case ISD::OR:
+        case ISD::XOR: {
+          NeedTruncation = true;
+          ArithOp = Arith;
+        }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+  // which may be the result of a CAST.  We use the variable 'Op', which is the
+  // non-casted variable when we check for possible users.
+  switch (ArithOp.getOpcode()) {
+  case ISD::ADD:
+    // Due to an isel shortcoming, be conservative if this add is likely to be
+    // selected as part of a load-modify-store instruction. When the root node
+    // in a match is a store, isel doesn't know how to remap non-chain non-flag
+    // uses of other nodes in the match, such as the ADD in this case. This
+    // leads to the ADD being left around and reselected, with the result being
+    // two adds in the output.  Alas, even if none our users are stores, that
+    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
+    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
+    // climbing the DAG back to the root, and it doesn't seem to be worth the
+    // effort.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+         UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() != ISD::CopyToReg &&
+          UI->getOpcode() != ISD::SETCC &&
+          UI->getOpcode() != ISD::STORE)
+        goto default_case;
+
+    if (ConstantSDNode *C =
+        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
+      // An add of one will be selected as an INC.
+      if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
+        Opcode = X86ISD::INC;
+        NumOperands = 1;
+        break;
+      }
+
+      // An add of negative one (subtract of one) will be selected as a DEC.
+      if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
+        Opcode = X86ISD::DEC;
+        NumOperands = 1;
+        break;
+      }
+    }
+
+    // Otherwise use a regular EFLAGS-setting add.
+    Opcode = X86ISD::ADD;
+    NumOperands = 2;
+    break;
+  case ISD::SHL:
+  case ISD::SRL:
+    // If we have a constant logical shift that's only used in a comparison
+    // against zero turn it into an equivalent AND. This allows turning it into
+    // a TEST instruction later.
+    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+      EVT VT = Op.getValueType();
+      unsigned BitWidth = VT.getSizeInBits();
+      unsigned ShAmt = Op->getConstantOperandVal(1);
+      if (ShAmt >= BitWidth) // Avoid undefined shifts.
+        break;
+      APInt Mask = ArithOp.getOpcode() == ISD::SRL
+                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+        break;
+      SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+                                DAG.getConstant(Mask, VT));
+      DAG.ReplaceAllUsesWith(Op, New);
+      Op = New;
+    }
+    break;
+
+  case ISD::AND:
+    // If the primary and result isn't used, don't bother using X86ISD::AND,
+    // because a TEST instruction will be better.
+    if (!hasNonFlagsUse(Op))
+      break;
+    // FALL THROUGH
+  case ISD::SUB:
+  case ISD::OR:
+  case ISD::XOR:
+    // Due to the ISEL shortcoming noted above, be conservative if this op is
+    // likely to be selected as part of a load-modify-store instruction.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() == ISD::STORE)
+        goto default_case;
+
+    // Otherwise use a regular EFLAGS-setting instruction.
+    switch (ArithOp.getOpcode()) {
+    default: llvm_unreachable("unexpected operator!");
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
+    case ISD::XOR: Opcode = X86ISD::XOR; break;
+    case ISD::AND: Opcode = X86ISD::AND; break;
+    case ISD::OR: {
+      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
+        if (EFLAGS.getNode())
+          return EFLAGS;
+      }
+      Opcode = X86ISD::OR;
+      break;
+    }
+    }
+
+    NumOperands = 2;
+    break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return SDValue(Op.getNode(), 1);
+  default:
+  default_case:
+    break;
+  }
+
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
+  if (NeedTruncation) {
+    EVT VT = Op.getValueType();
+    SDValue WideVal = Op->getOperand(0);
+    EVT WideVT = WideVal.getValueType();
+    unsigned ConvertedOp = 0;
+    // Use a target machine opcode to prevent further DAGCombine
+    // optimizations that may separate the arithmetic operations
+    // from the setcc node.
+    switch (WideVal.getOpcode()) {
+      default: break;
+      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+      case ISD::AND: ConvertedOp = X86ISD::AND; break;
+      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
+      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+    }
+
+    if (ConvertedOp) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+      }
+    }
+  }
+
+  if (Opcode == 0)
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i != NumOperands; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
+  DAG.ReplaceAllUsesWith(Op, New);
+  return SDValue(New.getNode(), 1);
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                                   SDLoc dl, SelectionDAG &DAG) const {
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
+    if (C->getAPIntValue() == 0)
+      return EmitTest(Op0, X86CC, dl, DAG);
+
+     if (Op0.getValueType() == MVT::i1)
+       llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
+  }
+ 
+  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Do the comparison at i32 if it's smaller, besides the Atom case. 
+    // This avoids subregister aliasing issues. Keep the smaller reference 
+    // if we're optimizing for size, however, as that'll allow better folding 
+    // of memory operations.
+    if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+        !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+             AttributeSet::FunctionIndex, Attribute::MinSize) &&
+        !Subtarget->isAtom()) {
+      unsigned ExtendOp =
+          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+    }
+    // Use SUB instead of CMP to enable CSE between SUB and CMP.
+    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+                              Op0, Op1);
+    return SDValue(Sub.getNode(), 1);
+  }
+  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+                                                 SelectionDAG &DAG) const {
+  // If the subtarget does not support the FUCOMI instruction, floating-point
+  // comparisons have to be converted.
+  if (Subtarget->hasCMov() ||
+      Cmp.getOpcode() != X86ISD::CMP ||
+      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+      !Cmp.getOperand(1).getValueType().isFloatingPoint())
+    return Cmp;
+
+  // The instruction selector will select an FUCOM instruction instead of
+  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+  SDLoc dl(Cmp);
+  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+                            DAG.getConstant(8, MVT::i8));
+  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
+static bool isAllOnes(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isAllOnesValue();
+}
+
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
+/// if it's possible.
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+                                     SDLoc dl, SelectionDAG &DAG) const {
+  SDValue Op0 = And.getOperand(0);
+  SDValue Op1 = And.getOperand(1);
+  if (Op0.getOpcode() == ISD::TRUNCATE)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::TRUNCATE)
+    Op1 = Op1.getOperand(0);
+
+  SDValue LHS, RHS;
+  if (Op1.getOpcode() == ISD::SHL)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
+      if (And00C->getZExtValue() == 1) {
+        // If we looked past a truncate, check that it's only truncating away
+        // known zeros.
+        unsigned BitWidth = Op0.getValueSizeInBits();
+        unsigned AndBitWidth = And.getValueSizeInBits();
+        if (BitWidth > AndBitWidth) {
+          APInt Zeros, Ones;
+          DAG.computeKnownBits(Op0, Zeros, Ones);
+          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+            return SDValue();
+        }
+        LHS = Op1;
+        RHS = Op0.getOperand(1);
+      }
+  } else if (Op1.getOpcode() == ISD::Constant) {
+    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+    uint64_t AndRHSVal = AndRHS->getZExtValue();
+    SDValue AndLHS = Op0;
+
+    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+      LHS = AndLHS.getOperand(0);
+      RHS = AndLHS.getOperand(1);
+    }
+
+    // Use BT if the immediate can't be encoded in a TEST instruction.
+    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+      LHS = AndLHS;
+      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
+    }
+  }
+
+  if (LHS.getNode()) {
+    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
+    // instruction.  Since the shift amount is in-range-or-undefined, we know
+    // that doing a bittest on the i32 value is ok.  We extend to i32 because
+    // the encoding for the i16 version is larger than the i32 version.
+    // Also promote i16 to i32 for performance / code size reason.
+    if (LHS.getValueType() == MVT::i8 ||
+        LHS.getValueType() == MVT::i16)
+      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+    // If the operand types disagree, extend the shift amount to match.  Since
+    // BT ignores high bits (like shifts) we can use anyextend.
+    if (LHS.getValueType() != RHS.getValueType())
+      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                       DAG.getConstant(Cond, MVT::i8), BT);
+  }
+
+  return SDValue();
+}
+
+/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
+/// mask CMPs.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+                              SDValue &Op1) {
+  unsigned SSECC;
+  bool Swap = false;
+
+  // SSE Condition code mapping:
+  //  0 - EQ
+  //  1 - LT
+  //  2 - LE
+  //  3 - UNORD
+  //  4 - NEQ
+  //  5 - NLT
+  //  6 - NLE
+  //  7 - ORD
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  SSECC = 0; break;
+  case ISD::SETOGT:
+  case ISD::SETGT:  Swap = true; // Fallthrough
+  case ISD::SETLT:
+  case ISD::SETOLT: SSECC = 1; break;
+  case ISD::SETOGE:
+  case ISD::SETGE:  Swap = true; // Fallthrough
+  case ISD::SETLE:
+  case ISD::SETOLE: SSECC = 2; break;
+  case ISD::SETUO:  SSECC = 3; break;
+  case ISD::SETUNE:
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETULE: Swap = true; // Fallthrough
+  case ISD::SETUGE: SSECC = 5; break;
+  case ISD::SETULT: Swap = true; // Fallthrough
+  case ISD::SETUGT: SSECC = 6; break;
+  case ISD::SETO:   SSECC = 7; break;
+  case ISD::SETUEQ:
+  case ISD::SETONE: SSECC = 8; break;
+  }
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  return SSECC;
+}
+
+// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  SDLoc dl(Op);
+  SDValue CC = Op.getOperand(2);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+  // Issue the operation on the smaller types and concatenate the result back
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
+         Op.getValueType().getScalarType() == MVT::i1 &&
+         "Cannot set masked compare for this operation");
+
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  unsigned  Opc = 0;
+  bool Unsigned = false;
+  bool Swap = false;
+  unsigned SSECC;
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
+  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
+  case ISD::SETLT:  Swap = true; //fall-through
+  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
+  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
+  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
+  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
+  case ISD::SETULE: Unsigned = true; //fall-through
+  case ISD::SETLE:  SSECC = 2; break;
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+  if (Opc)
+    return DAG.getNode(Opc, dl, VT, Op0, Op1);
+  Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+  return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                     DAG.getConstant(SSECC, MVT::i8));
+}
+
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1.  If non-trivial (for example because it's not constant)
+/// return an empty value.
+static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
+{
+  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+  if (!BV)
+    return SDValue();
+
+  MVT VT = Op1.getSimpleValueType();
+  MVT EVT = VT.getVectorElementType();
+  unsigned n = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> ULTOp1;
+
+  for (unsigned i = 0; i < n; ++i) {
+    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+    if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
+      return SDValue();
+
+    // Avoid underflow.
+    APInt Val = Elt->getAPIntValue();
+    if (Val == 0)
+      return SDValue();
+
+    ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
+}
+
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+  SDLoc dl(Op);
+
+  if (isFP) {
+#ifndef NDEBUG
+    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+    unsigned Opc = X86ISD::CMPP;
+    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+      assert(VT.getVectorNumElements() <= 16);
+      Opc = X86ISD::CMPM;
+    }
+    // In the two special cases we can't handle, emit two comparisons.
+    if (SSECC == 8) {
+      unsigned CC0, CC1;
+      unsigned CombineOpc;
+      if (SetCCOpcode == ISD::SETUEQ) {
+        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+      } else {
+        assert(SetCCOpcode == ISD::SETONE);
+        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+      }
+
+      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC0, MVT::i8));
+      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC1, MVT::i8));
+      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+    }
+    // Handle all other FP comparisons here.
+    return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                       DAG.getConstant(SSECC, MVT::i8));
+  }
+
+  // Break 256-bit integer vector compare into smaller ones.
+  if (VT.is256BitVector() && !Subtarget->hasInt256())
+    return Lower256IntVSETCC(Op, DAG);
+
+  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
+  EVT OpVT = Op1.getValueType();
+  if (Subtarget->hasAVX512()) {
+    if (Op1.getValueType().is512BitVector() ||
+        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
+      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
+
+    // In AVX-512 architecture setcc returns mask with i1 elements,
+    // But there is no compare instruction for i8 and i16 elements.
+    // We are not talking about 512-bit operands in this case, these
+    // types are illegal.
+    if (MaskResult &&
+        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
+         OpVT.getVectorElementType().getSizeInBits() >= 8))
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+  }
+
+  // We are handling one of the integer comparisons here.  Since SSE only has
+  // GT and EQ comparisons for integer, swapping operands and multiple
+  // operations may be required for some comparisons.
+  unsigned Opc;
+  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+  bool Subus = false;
+
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  Invert = true;
+  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
+  case ISD::SETLT:  Swap = true;
+  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
+  case ISD::SETGE:  Swap = true;
+  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
+                    Invert = true; break;
+  case ISD::SETULT: Swap = true;
+  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
+                    FlipSigns = true; break;
+  case ISD::SETUGE: Swap = true;
+  case ISD::SETULE: Opc = X86ISD::PCMPGT;
+                    FlipSigns = true; Invert = true; break;
+  }
+
+  // Special case: Use min/max operations for SETULE/SETUGE
+  MVT VET = VT.getVectorElementType();
+  bool hasMinMax =
+       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+    || (Subtarget->hasSSE2()  && (VET == MVT::i8));
+
+  if (hasMinMax) {
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
+    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+    }
+
+    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+  }
+
+  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+  if (!MinMax && hasSubus) {
+    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+    // Op0 u<= Op1:
+    //   t = psubus Op0, Op1
+    //   pcmpeq t, <0..0>
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETULT: {
+      // If the comparison is against a constant we can turn this into a
+      // setule.  With psubus, setule does not require a swap.  This is
+      // beneficial because the constant in the register is no longer
+      // destructed as the destination so it can be hoisted out of a loop.
+      // Only do this pre-AVX since vpcmp* is no longer destructive.
+      if (Subtarget->hasAVX())
+        break;
+      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+      if (ULEOp1.getNode()) {
+        Op1 = ULEOp1;
+        Subus = true; Invert = false; Swap = false;
+      }
+      break;
+    }
+    // Psubus is better than flip-sign because it requires no inversion.
+    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
+    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+    }
+
+    if (Subus) {
+      Opc = X86ISD::SUBUS;
+      FlipSigns = false;
+    }
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Check that the operation in question is available (most are plain SSE2,
+  // but PCMPGTQ and PCMPEQQ have different requirements).
+  if (VT == MVT::v2i64) {
+    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
+      assert(Subtarget->hasSSE2() && "Don't know how to lower!");
+
+      // First cast everything to the right type.
+      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+      // Since SSE has no unsigned integer comparisons, we need to flip the sign
+      // bits of the inputs before performing those operations. The lower
+      // compare is always unsigned.
+      SDValue SB;
+      if (FlipSigns) {
+        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
+      } else {
+        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
+        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
+        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                         Sign, Zero, Sign, Zero);
+      }
+      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
+      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+      // Create masks for only the low parts/high parts of the 64 bit integers.
+      static const int MaskHi[] = { 1, 1, 3, 3 };
+      static const int MaskLo[] = { 0, 0, 2, 2 };
+      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
+
+    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+      // pcmpeqd + pshufd + pand.
+      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+      // First cast everything to the right type.
+      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+      // Do the compare.
+      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+      // Make sure the lower and upper halves are both all-ones.
+      static const int Mask[] = { 1, 0, 3, 2 };
+      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
+  }
+
+  // Since SSE has no unsigned integer comparisons, we need to flip the sign
+  // bits of the inputs before performing those operations.
+  if (FlipSigns) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
+    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+  }
+
+  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+  // If the logical-not of the result is required, perform that now.
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  if (MinMax)
+    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+  if (Subus)
+    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+                         getZeroVector(VT, Subtarget, DAG, dl));
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+  assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+         && "SetCC type must be 8-bit or 1-bit integer");
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDLoc dl(Op);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Optimize to BT if possible.
+  // Lower (X & (1 << N)) == 0 to BT(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
+      Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(Op1)->isNullValue() &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
+    if (NewSetCC.getNode())
+      return NewSetCC;
+  }
+
+  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
+  // these.
+  if (Op1.getOpcode() == ISD::Constant &&
+      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+       cast<ConstantSDNode>(Op1)->isNullValue()) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    // If the input is a setcc, then reuse the input setcc or use a new one with
+    // the inverted condition.
+    if (Op0.getOpcode() == X86ISD::SETCC) {
+      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+      bool Invert = (CC == ISD::SETNE) ^
+        cast<ConstantSDNode>(Op1)->isNullValue();
+      if (!Invert)
+        return Op0;
+
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                  DAG.getConstant(CCode, MVT::i8),
+                                  Op0.getOperand(1));
+      if (VT == MVT::i1)
+        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+      return SetCC;
+    }
+  }
+  if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
+      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+    return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
+  }
+
+  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
+  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+  if (X86CC == X86::COND_INVALID)
+    return SDValue();
+
+  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
+  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                              DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+  if (VT == MVT::i1)
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+  return SetCC;
+}
+
+// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+  unsigned Opc = Op.getNode()->getOpcode();
+  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+      Opc == X86ISD::SAHF)
+    return true;
+  if (Op.getResNo() == 1 &&
+      (Opc == X86ISD::ADD ||
+       Opc == X86ISD::SUB ||
+       Opc == X86ISD::ADC ||
+       Opc == X86ISD::SBB ||
+       Opc == X86ISD::SMUL ||
+       Opc == X86ISD::UMUL ||
+       Opc == X86ISD::INC ||
+       Opc == X86ISD::DEC ||
+       Opc == X86ISD::OR ||
+       Opc == X86ISD::XOR ||
+       Opc == X86ISD::AND))
+    return true;
+
+  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
+    return true;
+
+  return false;
+}
+
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+  if (V.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue VOp0 = V.getOperand(0);
+  unsigned InBits = VOp0.getValueSizeInBits();
+  unsigned Bits = V.getValueSizeInBits();
+  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  bool addTest = true;
+  SDValue Cond  = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  SDLoc DL(Op);
+  EVT VT = Op1.getValueType();
+  SDValue CC;
+
+  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+  // are available. Otherwise fp cmovs get lowered into a less efficient branch
+  // sequence later on.
+  if (Cond.getOpcode() == ISD::SETCC &&
+      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+      VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
+    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+    int SSECC = translateX86FSETCC(
+        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+    if (SSECC != 8) {
+      if (Subtarget->hasAVX512()) {
+        SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
+                                  DAG.getConstant(SSECC, MVT::i8));
+        return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
+      }
+      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
+                                DAG.getConstant(SSECC, MVT::i8));
+      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+    }
+  }
+
+  if (Cond.getOpcode() == ISD::SETCC) {
+    SDValue NewCond = LowerSETCC(Cond, DAG);
+    if (NewCond.getNode())
+      Cond = NewCond;
+  }
+
+  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+  if (Cond.getOpcode() == X86ISD::SETCC &&
+      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+      isZero(Cond.getOperand(1).getOperand(1))) {
+    SDValue Cmp = Cond.getOperand(1);
+
+    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
+        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
+
+      SDValue CmpOp0 = Cmp.getOperand(0);
+      // Apply further optimizations for special cases
+      // (select (x != 0), -1, 0) -> neg & sbb
+      // (select (x == 0), 0, -1) -> neg & sbb
+      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+        if (YC->isNullValue() &&
+            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+                                    DAG.getConstant(0, CmpOp0.getValueType()),
+                                    CmpOp0);
+          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                    DAG.getConstant(X86::COND_B, MVT::i8),
+                                    SDValue(Neg.getNode(), 1));
+          return Res;
+        }
+
+      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+
+      SDValue Res =   // Res = 0 or -1.
+        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
+
+      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
+        Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
+      if (!N2C || !N2C->isNullValue())
+        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+      return Res;
+    }
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1)
+      Cond = Cond.getOperand(0);
+  }
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  unsigned CondOpcode = Cond.getOpcode();
+  if (CondOpcode == X86ISD::SETCC ||
+      CondOpcode == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    MVT VT = Op.getSimpleValueType();
+
+    bool IllegalFPCMov = false;
+    if (VT.isFloatingPoint() && !VT.isVector() &&
+        !isScalarFPTypeInSSEReg(VT))  // FPStack?
+      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+        Opc == X86ISD::BT) { // FIXME
+      Cond = Cmp;
+      addTest = false;
+    }
+  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+              Cond.getOperand(0).getValueType() != MVT::i8)) {
+    SDValue LHS = Cond.getOperand(0);
+    SDValue RHS = Cond.getOperand(1);
+    unsigned X86Opcode;
+    unsigned X86Cond;
+    SDVTList VTs;
+    switch (CondOpcode) {
+    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+    default: llvm_unreachable("unexpected overflowing operator");
+    }
+    if (CondOpcode == ISD::UMULO)
+      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+                          MVT::i32);
+    else
+      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+    if (CondOpcode == ISD::UMULO)
+      Cond = X86Op.getValue(2);
+    else
+      Cond = X86Op.getValue(1);
+
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    addTest = false;
+  }
+
+  if (addTest) {
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
+  }
+
+  // a <  b ? -1 :  0 -> RES = ~setcc_carry
+  // a <  b ?  0 : -1 -> RES = setcc_carry
+  // a >= b ? -1 :  0 -> RES = setcc_carry
+  // a >= b ?  0 : -1 -> RES = ~setcc_carry
+  if (Cond.getOpcode() == X86ISD::SUB) {
+    Cond = ConvertCmpIfNecessary(Cond, DAG);
+    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
+      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
+      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
+        return DAG.getNOT(DL, Res, Res.getValueType());
+      return Res;
+    }
+  }
+
+  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+  // widen the cmov and push the truncate through. This avoids introducing a new
+  // branch during isel and doesn't add any extensions.
+  if (Op.getValueType() == MVT::i8 &&
+      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+    if (T1.getValueType() == T2.getValueType() &&
+        // Blacklist CopyFromReg to avoid partial register stalls.
+        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+    }
+  }
+
+  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+  // condition is true.
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = { Op2, Op1, CC, Cond };
+  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
+}
+
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+
+  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
+  Constant *C = ConstantInt::get(*DAG.getContext(),
+    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
+
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
+                          MachinePointerInfo::getConstantPool(),
+                          false, false, false, Alignment);
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return LowerSIGN_EXTEND_AVX512(Op, DAG);
+
+  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+      (VT != MVT::v16i16 || InVT != MVT::v16i8))
+    return SDValue();
+
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  // Optimize vectors in AVX mode
+  // Sign extend  v8i16 to v8i32 and
+  //              v4i32 to v4i64
+  //
+  // Divide input vector into two parts
+  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+  // concat the vectors to original VT
+
+  unsigned NumElems = InVT.getVectorNumElements();
+  SDValue Undef = DAG.getUNDEF(InVT);
+
+  SmallVector<int,8> ShufMask1(NumElems, -1);
+  for (unsigned i = 0; i != NumElems/2; ++i)
+    ShufMask1[i] = i;
+
+  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+
+  SmallVector<int,8> ShufMask2(NumElems, -1);
+  for (unsigned i = 0; i != NumElems/2; ++i)
+    ShufMask2[i] = i + NumElems/2;
+
+  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+
+  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
+                                VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
+  OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
+// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
+// from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+  Opc = Op.getOpcode();
+  if (Opc != ISD::OR && Opc != ISD::AND)
+    return false;
+  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(0).hasOneUse() &&
+          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
+// 1 and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+  if (Op.getOpcode() != ISD::XOR)
+    return false;
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (N1C && N1C->getAPIntValue() == 1) {
+    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+      Op.getOperand(0).hasOneUse();
+  }
+  return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  bool addTest = true;
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Dest  = Op.getOperand(2);
+  SDLoc dl(Op);
+  SDValue CC;
+  bool Inverted = false;
+
+  if (Cond.getOpcode() == ISD::SETCC) {
+    // Check for setcc([su]{add,sub,mul}o == 0).
+    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+        isa<ConstantSDNode>(Cond.getOperand(1)) &&
+        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+        Cond.getOperand(0).getResNo() == 1 &&
+        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+      Inverted = true;
+      Cond = Cond.getOperand(0);
+    } else {
+      SDValue NewCond = LowerSETCC(Cond, DAG);
+      if (NewCond.getNode())
+        Cond = NewCond;
+    }
+  }
+#if 0
+  // FIXME: LowerXALUO doesn't handle these!!
+  else if (Cond.getOpcode() == X86ISD::ADD  ||
+           Cond.getOpcode() == X86ISD::SUB  ||
+           Cond.getOpcode() == X86ISD::SMUL ||
+           Cond.getOpcode() == X86ISD::UMUL)
+    Cond = LowerXALUO(Cond, DAG);
+#endif
+
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1)
+      Cond = Cond.getOperand(0);
+  }
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  unsigned CondOpcode = Cond.getOpcode();
+  if (CondOpcode == X86ISD::SETCC ||
+      CondOpcode == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+      Cond = Cmp;
+      addTest = false;
+    } else {
+      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+      default: break;
+      case X86::COND_O:
+      case X86::COND_B:
+        // These can only come from an arithmetic instruction with overflow,
+        // e.g. SADDO, UADDO.
+        Cond = Cond.getNode()->getOperand(1);
+        addTest = false;
+        break;
+      }
+    }
+  }
+  CondOpcode = Cond.getOpcode();
+  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+       Cond.getOperand(0).getValueType() != MVT::i8)) {
+    SDValue LHS = Cond.getOperand(0);
+    SDValue RHS = Cond.getOperand(1);
+    unsigned X86Opcode;
+    unsigned X86Cond;
+    SDVTList VTs;
+    // Keep this in sync with LowerXALUO, otherwise we might create redundant
+    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
+    // X86ISD::INC).
+    switch (CondOpcode) {
+    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+    case ISD::SADDO:
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+        if (C->isOne()) {
+          X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
+          break;
+        }
+      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+    case ISD::SSUBO:
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+        if (C->isOne()) {
+          X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
+          break;
+        }
+      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+    default: llvm_unreachable("unexpected overflowing operator");
+    }
+    if (Inverted)
+      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+    if (CondOpcode == ISD::UMULO)
+      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+                          MVT::i32);
+    else
+      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+    if (CondOpcode == ISD::UMULO)
+      Cond = X86Op.getValue(2);
+    else
+      Cond = X86Op.getValue(1);
+
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    addTest = false;
+  } else {
+    unsigned CondOpc;
+    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+      SDValue Cmp = Cond.getOperand(0).getOperand(1);
+      if (CondOpc == ISD::OR) {
+        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+        // two branches instead of an explicit OR instruction with a
+        // separate test.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp)) {
+          CC = Cond.getOperand(0).getOperand(0);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = Cond.getOperand(1).getOperand(0);
+          Cond = Cmp;
+          addTest = false;
+        }
+      } else { // ISD::AND
+        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+        // two branches instead of an explicit AND instruction with a
+        // separate test. However, we only do this if this block doesn't
+        // have a fall-through edge, because this requires an explicit
+        // jmp when the condition is false.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp) &&
+            Op.getNode()->hasOneUse()) {
+          X86::CondCode CCode =
+            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+          CCode = X86::GetOppositeBranchCondition(CCode);
+          CC = DAG.getConstant(CCode, MVT::i8);
+          SDNode *User = *Op.getNode()->use_begin();
+          // Look for an unconditional branch following this conditional branch.
+          // We need this because we need to reverse the successors in order
+          // to implement FCMP_OEQ.
+          if (User->getOpcode() == ISD::BR) {
+            SDValue FalseBB = User->getOperand(1);
+            SDNode *NewBR =
+              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+            assert(NewBR == User);
+            (void)NewBR;
+            Dest = FalseBB;
+
+            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                                Chain, Dest, CC, Cmp);
+            X86::CondCode CCode =
+              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+            CCode = X86::GetOppositeBranchCondition(CCode);
+            CC = DAG.getConstant(CCode, MVT::i8);
+            Cond = Cmp;
+            addTest = false;
+          }
+        }
+      }
+    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
+      // It should be transformed during dag combiner except when the condition
+      // is set by a arithmetics with overflow node.
+      X86::CondCode CCode =
+        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      CC = DAG.getConstant(CCode, MVT::i8);
+      Cond = Cond.getOperand(0).getOperand(1);
+      addTest = false;
+    } else if (Cond.getOpcode() == ISD::SETCC &&
+               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+      // For FCMP_OEQ, we can emit
+      // two branches instead of an explicit AND instruction with a
+      // separate test. However, we only do this if this block doesn't
+      // have a fall-through edge, because this requires an explicit
+      // jmp when the condition is false.
+      if (Op.getNode()->hasOneUse()) {
+        SDNode *User = *Op.getNode()->use_begin();
+        // Look for an unconditional branch following this conditional branch.
+        // We need this because we need to reverse the successors in order
+        // to implement FCMP_OEQ.
+        if (User->getOpcode() == ISD::BR) {
+          SDValue FalseBB = User->getOperand(1);
+          SDNode *NewBR =
+            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+          assert(NewBR == User);
+          (void)NewBR;
+          Dest = FalseBB;
+
+          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                                    Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = DAG.getConstant(X86::COND_P, MVT::i8);
+          Cond = Cmp;
+          addTest = false;
+        }
+      }
+    } else if (Cond.getOpcode() == ISD::SETCC &&
+               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+      // For FCMP_UNE, we can emit
+      // two branches instead of an explicit AND instruction with a
+      // separate test. However, we only do this if this block doesn't
+      // have a fall-through edge, because this requires an explicit
+      // jmp when the condition is false.
+      if (Op.getNode()->hasOneUse()) {
+        SDNode *User = *Op.getNode()->use_begin();
+        // Look for an unconditional branch following this conditional branch.
+        // We need this because we need to reverse the successors in order
+        // to implement FCMP_UNE.
+        if (User->getOpcode() == ISD::BR) {
+          SDValue FalseBB = User->getOperand(1);
+          SDNode *NewBR =
+            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+          assert(NewBR == User);
+          (void)NewBR;
+
+          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                                    Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
+          Cond = Cmp;
+          addTest = false;
+          Dest = FalseBB;
+        }
+      }
+    }
+  }
+
+  if (addTest) {
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  if (addTest) {
+    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    Cond = EmitTest(Cond, X86Cond, dl, DAG);
+  }
+  Cond = ConvertCmpIfNecessary(Cond, DAG);
+  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                     Chain, Dest, CC, Cond);
+}
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca is needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool SplitStack = MF.shouldSplitStack();
+  bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+               SplitStack;
+  SDLoc dl(Op);
+
+  if (!Lower) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SDNode* Node = Op.getNode();
+
+    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+        " not tell us which reg is the stack pointer!");
+    EVT VT = Node->getValueType(0);
+    SDValue Tmp1 = SDValue(Node, 0);
+    SDValue Tmp2 = SDValue(Node, 1);
+    SDValue Tmp3 = Node->getOperand(2);
+    SDValue Chain = Tmp1.getOperand(0);
+
+    // Chain the dynamic stack allocation so that it doesn't modify the stack
+    // pointer when other instructions are using the stack.
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+        SDLoc(Node));
+
+    SDValue Size = Tmp2.getOperand(1);
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+    Chain = SP.getValue(1);
+    unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+    const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
+    unsigned StackAlign = TFI.getStackAlignment();
+    Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+    if (Align > StackAlign)
+      Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+          DAG.getConstant(-(uint64_t)Align, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+    Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+        DAG.getIntPtrConstant(0, true), SDValue(),
+        SDLoc(Node));
+
+    SDValue Ops[2] = { Tmp1, Tmp2 };
+    return DAG.getMergeValues(Ops, dl);
+  }
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Op.getNode()->getValueType(0);
+
+  bool Is64Bit = Subtarget->is64Bit();
+  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+
+  if (SplitStack) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    if (Is64Bit) {
+      // The 64 bit implementation of segmented stacks needs to clobber both r10
+      // r11. This makes it impossible to use it along with nested parameters.
+      const Function *F = MF.getFunction();
+
+      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+           I != E; ++I)
+        if (I->hasNestAttr())
+          report_fatal_error("Cannot use segmented stacks with functions that "
+                             "have nested arguments.");
+    }
+
+    const TargetRegisterClass *AddrRegClass =
+      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+                                DAG.getRegister(Vreg, SPTy));
+    SDValue Ops1[2] = { Value, Chain };
+    return DAG.getMergeValues(Ops1, dl);
+  } else {
+    SDValue Flag;
+    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+    Flag = Chain.getValue(1);
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+
+    const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    unsigned SPReg = RegInfo->getStackRegister();
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+    Chain = SP.getValue(1);
+
+    if (Align) {
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align, VT));
+      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+    }
+
+    SDValue Ops1[2] = { SP, Chain };
+    return DAG.getMergeValues(Ops1, dl);
+  }
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SDLoc DL(Op);
+
+  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                   getPointerTy());
+    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+                        MachinePointerInfo(SV), false, false, 0);
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (point to parameters coming in memory).
+  //   reg_save_area
+  SmallVector<SDValue, 8> MemOps;
+  SDValue FIN = Op.getOperand(1);
+  // Store gp_offset
+  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
+                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
+                                               MVT::i32),
+                               FIN, MachinePointerInfo(SV), false, false, 0);
+  MemOps.push_back(Store);
+
+  // Store fp_offset
+  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(4));
+  Store = DAG.getStore(Op.getOperand(0), DL,
+                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
+                                       MVT::i32),
+                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
+  MemOps.push_back(Store);
+
+  // Store ptr to overflow_arg_area
+  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(4));
+  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                    getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
+                       MachinePointerInfo(SV, 8),
+                       false, false, 0);
+  MemOps.push_back(Store);
+
+  // Store ptr to reg_save_area.
+  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(8));
+  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                    getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
+                       MachinePointerInfo(SV, 16), false, false, 0);
+  MemOps.push_back(Store);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->is64Bit() &&
+         "LowerVAARG only handles 64-bit va_arg!");
+  assert((Subtarget->isTargetLinux() ||
+          Subtarget->isTargetDarwin()) &&
+          "Unhandled target in LowerVAARG");
+  assert(Op.getNode()->getNumOperands() == 4);
+  SDValue Chain = Op.getOperand(0);
+  SDValue SrcPtr = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  unsigned Align = Op.getConstantOperandVal(3);
+  SDLoc dl(Op);
+
+  EVT ArgVT = Op.getNode()->getValueType(0);
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+  uint8_t ArgMode;
+
+  // Decide which area this value should be read from.
+  // TODO: Implement the AMD64 ABI in its entirety. This simple
+  // selection mechanism works only for the basic types.
+  if (ArgVT == MVT::f80) {
+    llvm_unreachable("va_arg for f80 not yet implemented");
+  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
+  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
+    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
+  } else {
+    llvm_unreachable("Unhandled argument type in LowerVAARG");
+  }
+
+  if (ArgMode == 2) {
+    // Sanity Check: Make sure using fp_offset makes sense.
+    assert(!DAG.getTarget().Options.UseSoftFloat &&
+           !(DAG.getMachineFunction()
+                .getFunction()->getAttributes()
+                .hasAttribute(AttributeSet::FunctionIndex,
+                              Attribute::NoImplicitFloat)) &&
+           Subtarget->hasSSE1());
+  }
+
+  // Insert VAARG_64 node into the DAG
+  // VAARG_64 returns two values: Variable Argument Address, Chain
+  SmallVector<SDValue, 11> InstOps;
+  InstOps.push_back(Chain);
+  InstOps.push_back(SrcPtr);
+  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
+  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
+  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
+  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
+                                          VTs, InstOps, MVT::i64,
+                                          MachinePointerInfo(SV),
+                                          /*Align=*/0,
+                                          /*Volatile=*/false,
+                                          /*ReadMem=*/true,
+                                          /*WriteMem=*/true);
+  Chain = VAARG.getValue(1);
+
+  // Load the next argument and return it
+  return DAG.getLoad(ArgVT, dl,
+                     Chain,
+                     VAARG,
+                     MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+  SDValue Chain = Op.getOperand(0);
+  SDValue DstPtr = Op.getOperand(1);
+  SDValue SrcPtr = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  SDLoc DL(Op);
+
+  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
+                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
+                       false,
+                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+// getTargetVShiftByConstNode - Handle vector element shifts where the shift
+// amount is a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
+                                          SDValue SrcOp, uint64_t ShiftAmt,
+                                          SelectionDAG &DAG) {
+  MVT ElementType = VT.getVectorElementType();
+
+  // Fold this packed shift into its first operand if ShiftAmt is 0.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
+  // Check for ShiftAmt >= element width
+  if (ShiftAmt >= ElementType.getSizeInBits()) {
+    if (Opc == X86ISD::VSRAI)
+      ShiftAmt = ElementType.getSizeInBits() - 1;
+    else
+      return DAG.getConstant(0, VT);
+  }
+
+  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+         && "Unknown target vector shift-by-constant node");
+
+  // Fold this packed vector shift into a build vector if SrcOp is a
+  // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
+  if (VT == SrcOp.getSimpleValueType() &&
+      ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+    SmallVector<SDValue, 8> Elts;
+    unsigned NumElts = SrcOp->getNumOperands();
+    ConstantSDNode *ND;
+
+    switch(Opc) {
+    default: llvm_unreachable(nullptr);
+    case X86ISD::VSHLI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->getOpcode() == ISD::UNDEF) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+      }
+      break;
+    case X86ISD::VSRLI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->getOpcode() == ISD::UNDEF) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+      }
+      break;
+    case X86ISD::VSRAI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->getOpcode() == ISD::UNDEF) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+      }
+      break;
+    }
+
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+  }
+
+  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
+}
+
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
+// may or may not be a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
+                                   SDValue SrcOp, SDValue ShAmt,
+                                   SelectionDAG &DAG) {
+  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+
+  // Catch shift-by-constant.
+  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+                                      CShAmt->getZExtValue(), DAG);
+
+  // Change opcode to non-immediate version
+  switch (Opc) {
+    default: llvm_unreachable("Unknown target vector shift node");
+    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+  }
+
+  // Need to build a vector containing shift amount
+  // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
+  SDValue ShOps[4];
+  ShOps[0] = ShAmt;
+  ShOps[1] = DAG.getConstant(0, MVT::i32);
+  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
+  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
+
+  // The return type has to be a 128-bit type with the same element
+  // type as the input type.
+  MVT EltVT = VT.getVectorElementType();
+  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
+  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  // Comparison intrinsics.
+  case Intrinsic::x86_sse_comieq_ss:
+  case Intrinsic::x86_sse_comilt_ss:
+  case Intrinsic::x86_sse_comile_ss:
+  case Intrinsic::x86_sse_comigt_ss:
+  case Intrinsic::x86_sse_comige_ss:
+  case Intrinsic::x86_sse_comineq_ss:
+  case Intrinsic::x86_sse_ucomieq_ss:
+  case Intrinsic::x86_sse_ucomilt_ss:
+  case Intrinsic::x86_sse_ucomile_ss:
+  case Intrinsic::x86_sse_ucomigt_ss:
+  case Intrinsic::x86_sse_ucomige_ss:
+  case Intrinsic::x86_sse_ucomineq_ss:
+  case Intrinsic::x86_sse2_comieq_sd:
+  case Intrinsic::x86_sse2_comilt_sd:
+  case Intrinsic::x86_sse2_comile_sd:
+  case Intrinsic::x86_sse2_comigt_sd:
+  case Intrinsic::x86_sse2_comige_sd:
+  case Intrinsic::x86_sse2_comineq_sd:
+  case Intrinsic::x86_sse2_ucomieq_sd:
+  case Intrinsic::x86_sse2_ucomilt_sd:
+  case Intrinsic::x86_sse2_ucomile_sd:
+  case Intrinsic::x86_sse2_ucomigt_sd:
+  case Intrinsic::x86_sse2_ucomige_sd:
+  case Intrinsic::x86_sse2_ucomineq_sd: {
+    unsigned Opc;
+    ISD::CondCode CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse_comieq_ss:
+    case Intrinsic::x86_sse2_comieq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_comilt_ss:
+    case Intrinsic::x86_sse2_comilt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_comile_ss:
+    case Intrinsic::x86_sse2_comile_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_comigt_ss:
+    case Intrinsic::x86_sse2_comigt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_comige_ss:
+    case Intrinsic::x86_sse2_comige_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_comineq_ss:
+    case Intrinsic::x86_sse2_comineq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETNE;
+      break;
+    case Intrinsic::x86_sse_ucomieq_ss:
+    case Intrinsic::x86_sse2_ucomieq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_ucomilt_ss:
+    case Intrinsic::x86_sse2_ucomilt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_ucomile_ss:
+    case Intrinsic::x86_sse2_ucomile_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_ucomigt_ss:
+    case Intrinsic::x86_sse2_ucomigt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_ucomige_ss:
+    case Intrinsic::x86_sse2_ucomige_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_ucomineq_ss:
+    case Intrinsic::x86_sse2_ucomineq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETNE;
+      break;
+    }
+
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
+    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86CC, MVT::i8), Cond);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  // Arithmetic intrinsics.
+  case Intrinsic::x86_sse2_pmulu_dq:
+  case Intrinsic::x86_avx2_pmulu_dq:
+    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse41_pmuldq:
+  case Intrinsic::x86_avx2_pmul_dq:
+    return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pmulhu_w:
+  case Intrinsic::x86_avx2_pmulhu_w:
+    return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pmulh_w:
+  case Intrinsic::x86_avx2_pmulh_w:
+    return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  // SSE2/AVX2 sub with unsigned saturation intrinsics
+  case Intrinsic::x86_sse2_psubus_b:
+  case Intrinsic::x86_sse2_psubus_w:
+  case Intrinsic::x86_avx2_psubus_b:
+  case Intrinsic::x86_avx2_psubus_w:
+    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  // SSE3/AVX horizontal add/sub intrinsics
+  case Intrinsic::x86_sse3_hadd_ps:
+  case Intrinsic::x86_sse3_hadd_pd:
+  case Intrinsic::x86_avx_hadd_ps_256:
+  case Intrinsic::x86_avx_hadd_pd_256:
+  case Intrinsic::x86_sse3_hsub_ps:
+  case Intrinsic::x86_sse3_hsub_pd:
+  case Intrinsic::x86_avx_hsub_ps_256:
+  case Intrinsic::x86_avx_hsub_pd_256:
+  case Intrinsic::x86_ssse3_phadd_w_128:
+  case Intrinsic::x86_ssse3_phadd_d_128:
+  case Intrinsic::x86_avx2_phadd_w:
+  case Intrinsic::x86_avx2_phadd_d:
+  case Intrinsic::x86_ssse3_phsub_w_128:
+  case Intrinsic::x86_ssse3_phsub_d_128:
+  case Intrinsic::x86_avx2_phsub_w:
+  case Intrinsic::x86_avx2_phsub_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
+    case Intrinsic::x86_avx_hadd_ps_256:
+    case Intrinsic::x86_avx_hadd_pd_256:
+      Opcode = X86ISD::FHADD;
+      break;
+    case Intrinsic::x86_sse3_hsub_ps:
+    case Intrinsic::x86_sse3_hsub_pd:
+    case Intrinsic::x86_avx_hsub_ps_256:
+    case Intrinsic::x86_avx_hsub_pd_256:
+      Opcode = X86ISD::FHSUB;
+      break;
+    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_avx2_phadd_d:
+      Opcode = X86ISD::HADD;
+      break;
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_w:
+    case Intrinsic::x86_avx2_phsub_d:
+      Opcode = X86ISD::HSUB;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE2/SSE41/AVX2 integer max/min intrinsics.
+  case Intrinsic::x86_sse2_pmaxu_b:
+  case Intrinsic::x86_sse41_pmaxuw:
+  case Intrinsic::x86_sse41_pmaxud:
+  case Intrinsic::x86_avx2_pmaxu_b:
+  case Intrinsic::x86_avx2_pmaxu_w:
+  case Intrinsic::x86_avx2_pmaxu_d:
+  case Intrinsic::x86_sse2_pminu_b:
+  case Intrinsic::x86_sse41_pminuw:
+  case Intrinsic::x86_sse41_pminud:
+  case Intrinsic::x86_avx2_pminu_b:
+  case Intrinsic::x86_avx2_pminu_w:
+  case Intrinsic::x86_avx2_pminu_d:
+  case Intrinsic::x86_sse41_pmaxsb:
+  case Intrinsic::x86_sse2_pmaxs_w:
+  case Intrinsic::x86_sse41_pmaxsd:
+  case Intrinsic::x86_avx2_pmaxs_b:
+  case Intrinsic::x86_avx2_pmaxs_w:
+  case Intrinsic::x86_avx2_pmaxs_d:
+  case Intrinsic::x86_sse41_pminsb:
+  case Intrinsic::x86_sse2_pmins_w:
+  case Intrinsic::x86_sse41_pminsd:
+  case Intrinsic::x86_avx2_pmins_b:
+  case Intrinsic::x86_avx2_pmins_w:
+  case Intrinsic::x86_avx2_pmins_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_pmaxu_b:
+    case Intrinsic::x86_sse41_pmaxuw:
+    case Intrinsic::x86_sse41_pmaxud:
+    case Intrinsic::x86_avx2_pmaxu_b:
+    case Intrinsic::x86_avx2_pmaxu_w:
+    case Intrinsic::x86_avx2_pmaxu_d:
+      Opcode = X86ISD::UMAX;
+      break;
+    case Intrinsic::x86_sse2_pminu_b:
+    case Intrinsic::x86_sse41_pminuw:
+    case Intrinsic::x86_sse41_pminud:
+    case Intrinsic::x86_avx2_pminu_b:
+    case Intrinsic::x86_avx2_pminu_w:
+    case Intrinsic::x86_avx2_pminu_d:
+      Opcode = X86ISD::UMIN;
+      break;
+    case Intrinsic::x86_sse41_pmaxsb:
+    case Intrinsic::x86_sse2_pmaxs_w:
+    case Intrinsic::x86_sse41_pmaxsd:
+    case Intrinsic::x86_avx2_pmaxs_b:
+    case Intrinsic::x86_avx2_pmaxs_w:
+    case Intrinsic::x86_avx2_pmaxs_d:
+      Opcode = X86ISD::SMAX;
+      break;
+    case Intrinsic::x86_sse41_pminsb:
+    case Intrinsic::x86_sse2_pmins_w:
+    case Intrinsic::x86_sse41_pminsd:
+    case Intrinsic::x86_avx2_pmins_b:
+    case Intrinsic::x86_avx2_pmins_w:
+    case Intrinsic::x86_avx2_pmins_d:
+      Opcode = X86ISD::SMIN;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE/SSE2/AVX floating point max/min intrinsics.
+  case Intrinsic::x86_sse_max_ps:
+  case Intrinsic::x86_sse2_max_pd:
+  case Intrinsic::x86_avx_max_ps_256:
+  case Intrinsic::x86_avx_max_pd_256:
+  case Intrinsic::x86_sse_min_ps:
+  case Intrinsic::x86_sse2_min_pd:
+  case Intrinsic::x86_avx_min_ps_256:
+  case Intrinsic::x86_avx_min_pd_256: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse_max_ps:
+    case Intrinsic::x86_sse2_max_pd:
+    case Intrinsic::x86_avx_max_ps_256:
+    case Intrinsic::x86_avx_max_pd_256:
+      Opcode = X86ISD::FMAX;
+      break;
+    case Intrinsic::x86_sse_min_ps:
+    case Intrinsic::x86_sse2_min_pd:
+    case Intrinsic::x86_avx_min_ps_256:
+    case Intrinsic::x86_avx_min_pd_256:
+      Opcode = X86ISD::FMIN;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // AVX2 variable shift intrinsics
+  case Intrinsic::x86_avx2_psllv_d:
+  case Intrinsic::x86_avx2_psllv_q:
+  case Intrinsic::x86_avx2_psllv_d_256:
+  case Intrinsic::x86_avx2_psllv_q_256:
+  case Intrinsic::x86_avx2_psrlv_d:
+  case Intrinsic::x86_avx2_psrlv_q:
+  case Intrinsic::x86_avx2_psrlv_d_256:
+  case Intrinsic::x86_avx2_psrlv_q_256:
+  case Intrinsic::x86_avx2_psrav_d:
+  case Intrinsic::x86_avx2_psrav_d_256: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_avx2_psllv_d:
+    case Intrinsic::x86_avx2_psllv_q:
+    case Intrinsic::x86_avx2_psllv_d_256:
+    case Intrinsic::x86_avx2_psllv_q_256:
+      Opcode = ISD::SHL;
+      break;
+    case Intrinsic::x86_avx2_psrlv_d:
+    case Intrinsic::x86_avx2_psrlv_q:
+    case Intrinsic::x86_avx2_psrlv_d_256:
+    case Intrinsic::x86_avx2_psrlv_q_256:
+      Opcode = ISD::SRL;
+      break;
+    case Intrinsic::x86_avx2_psrav_d:
+    case Intrinsic::x86_avx2_psrav_d_256:
+      Opcode = ISD::SRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  case Intrinsic::x86_sse2_packssdw_128:
+  case Intrinsic::x86_sse2_packsswb_128:
+  case Intrinsic::x86_avx2_packssdw:
+  case Intrinsic::x86_avx2_packsswb:
+    return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_packuswb_128:
+  case Intrinsic::x86_sse41_packusdw:
+  case Intrinsic::x86_avx2_packuswb:
+  case Intrinsic::x86_avx2_packusdw:
+    return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_ssse3_pshuf_b_128:
+  case Intrinsic::x86_avx2_pshuf_b:
+    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pshuf_d:
+    return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pshufl_w:
+    return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pshufh_w:
+    return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_ssse3_psign_b_128:
+  case Intrinsic::x86_ssse3_psign_w_128:
+  case Intrinsic::x86_ssse3_psign_d_128:
+  case Intrinsic::x86_avx2_psign_b:
+  case Intrinsic::x86_avx2_psign_w:
+  case Intrinsic::x86_avx2_psign_d:
+    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse41_insertps:
+    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::x86_avx_vperm2f128_ps_256:
+  case Intrinsic::x86_avx_vperm2f128_pd_256:
+  case Intrinsic::x86_avx_vperm2f128_si_256:
+  case Intrinsic::x86_avx2_vperm2i128:
+    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::x86_avx2_permd:
+  case Intrinsic::x86_avx2_permps:
+    // Operands intentionally swapped. Mask is last operand to intrinsic,
+    // but second operand for node/instruction.
+    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(1));
+
+  case Intrinsic::x86_sse_sqrt_ps:
+  case Intrinsic::x86_sse2_sqrt_pd:
+  case Intrinsic::x86_avx_sqrt_ps_256:
+  case Intrinsic::x86_avx_sqrt_pd_256:
+    return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
+
+  // ptest and testp intrinsics. The intrinsic these come from are designed to
+  // return an integer value, not just an instruction so lower it to the ptest
+  // or testp pattern and a setcc for the result.
+  case Intrinsic::x86_sse41_ptestz:
+  case Intrinsic::x86_sse41_ptestc:
+  case Intrinsic::x86_sse41_ptestnzc:
+  case Intrinsic::x86_avx_ptestz_256:
+  case Intrinsic::x86_avx_ptestc_256:
+  case Intrinsic::x86_avx_ptestnzc_256:
+  case Intrinsic::x86_avx_vtestz_ps:
+  case Intrinsic::x86_avx_vtestc_ps:
+  case Intrinsic::x86_avx_vtestnzc_ps:
+  case Intrinsic::x86_avx_vtestz_pd:
+  case Intrinsic::x86_avx_vtestc_pd:
+  case Intrinsic::x86_avx_vtestnzc_pd:
+  case Intrinsic::x86_avx_vtestz_ps_256:
+  case Intrinsic::x86_avx_vtestc_ps_256:
+  case Intrinsic::x86_avx_vtestnzc_ps_256:
+  case Intrinsic::x86_avx_vtestz_pd_256:
+  case Intrinsic::x86_avx_vtestc_pd_256:
+  case Intrinsic::x86_avx_vtestnzc_pd_256: {
+    bool IsTestPacked = false;
+    unsigned X86CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+    case Intrinsic::x86_avx_vtestz_ps:
+    case Intrinsic::x86_avx_vtestz_pd:
+    case Intrinsic::x86_avx_vtestz_ps_256:
+    case Intrinsic::x86_avx_vtestz_pd_256:
+      IsTestPacked = true; // Fallthrough
+    case Intrinsic::x86_sse41_ptestz:
+    case Intrinsic::x86_avx_ptestz_256:
+      // ZF = 1
+      X86CC = X86::COND_E;
+      break;
+    case Intrinsic::x86_avx_vtestc_ps:
+    case Intrinsic::x86_avx_vtestc_pd:
+    case Intrinsic::x86_avx_vtestc_ps_256:
+    case Intrinsic::x86_avx_vtestc_pd_256:
+      IsTestPacked = true; // Fallthrough
+    case Intrinsic::x86_sse41_ptestc:
+    case Intrinsic::x86_avx_ptestc_256:
+      // CF = 1
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_avx_vtestnzc_ps:
+    case Intrinsic::x86_avx_vtestnzc_pd:
+    case Intrinsic::x86_avx_vtestnzc_ps_256:
+    case Intrinsic::x86_avx_vtestnzc_pd_256:
+      IsTestPacked = true; // Fallthrough
+    case Intrinsic::x86_sse41_ptestnzc:
+    case Intrinsic::x86_avx_ptestnzc_256:
+      // ZF and CF = 0
+      X86CC = X86::COND_A;
+      break;
+    }
+
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
+    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+  case Intrinsic::x86_avx512_kortestz_w:
+  case Intrinsic::x86_avx512_kortestc_w: {
+    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
+    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  // SSE/AVX shift intrinsics
+  case Intrinsic::x86_sse2_psll_w:
+  case Intrinsic::x86_sse2_psll_d:
+  case Intrinsic::x86_sse2_psll_q:
+  case Intrinsic::x86_avx2_psll_w:
+  case Intrinsic::x86_avx2_psll_d:
+  case Intrinsic::x86_avx2_psll_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_psll_w:
+    case Intrinsic::x86_sse2_psll_d:
+    case Intrinsic::x86_sse2_psll_q:
+    case Intrinsic::x86_avx2_psll_w:
+    case Intrinsic::x86_avx2_psll_d:
+    case Intrinsic::x86_avx2_psll_q:
+      Opcode = X86ISD::VSHL;
+      break;
+    case Intrinsic::x86_sse2_psrl_w:
+    case Intrinsic::x86_sse2_psrl_d:
+    case Intrinsic::x86_sse2_psrl_q:
+    case Intrinsic::x86_avx2_psrl_w:
+    case Intrinsic::x86_avx2_psrl_d:
+    case Intrinsic::x86_avx2_psrl_q:
+      Opcode = X86ISD::VSRL;
+      break;
+    case Intrinsic::x86_sse2_psra_w:
+    case Intrinsic::x86_sse2_psra_d:
+    case Intrinsic::x86_avx2_psra_w:
+    case Intrinsic::x86_avx2_psra_d:
+      Opcode = X86ISD::VSRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE/AVX immediate shift intrinsics
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_sse2_psrli_w:
+  case Intrinsic::x86_sse2_psrli_d:
+  case Intrinsic::x86_sse2_psrli_q:
+  case Intrinsic::x86_avx2_psrli_w:
+  case Intrinsic::x86_avx2_psrli_d:
+  case Intrinsic::x86_avx2_psrli_q:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_pslli_w:
+    case Intrinsic::x86_sse2_pslli_d:
+    case Intrinsic::x86_sse2_pslli_q:
+    case Intrinsic::x86_avx2_pslli_w:
+    case Intrinsic::x86_avx2_pslli_d:
+    case Intrinsic::x86_avx2_pslli_q:
+      Opcode = X86ISD::VSHLI;
+      break;
+    case Intrinsic::x86_sse2_psrli_w:
+    case Intrinsic::x86_sse2_psrli_d:
+    case Intrinsic::x86_sse2_psrli_q:
+    case Intrinsic::x86_avx2_psrli_w:
+    case Intrinsic::x86_avx2_psrli_d:
+    case Intrinsic::x86_avx2_psrli_q:
+      Opcode = X86ISD::VSRLI;
+      break;
+    case Intrinsic::x86_sse2_psrai_w:
+    case Intrinsic::x86_sse2_psrai_d:
+    case Intrinsic::x86_avx2_psrai_w:
+    case Intrinsic::x86_avx2_psrai_d:
+      Opcode = X86ISD::VSRAI;
+      break;
+    }
+    return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(),
+                               Op.getOperand(1), Op.getOperand(2), DAG);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistria128:
+  case Intrinsic::x86_sse42_pcmpestria128:
+  case Intrinsic::x86_sse42_pcmpistric128:
+  case Intrinsic::x86_sse42_pcmpestric128:
+  case Intrinsic::x86_sse42_pcmpistrio128:
+  case Intrinsic::x86_sse42_pcmpestrio128:
+  case Intrinsic::x86_sse42_pcmpistris128:
+  case Intrinsic::x86_sse42_pcmpestris128:
+  case Intrinsic::x86_sse42_pcmpistriz128:
+  case Intrinsic::x86_sse42_pcmpestriz128: {
+    unsigned Opcode;
+    unsigned X86CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse42_pcmpistria128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpestria128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpistric128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpestric128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpistrio128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_O;
+      break;
+    case Intrinsic::x86_sse42_pcmpestrio128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_O;
+      break;
+    case Intrinsic::x86_sse42_pcmpistris128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_S;
+      break;
+    case Intrinsic::x86_sse42_pcmpestris128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_S;
+      break;
+    case Intrinsic::x86_sse42_pcmpistriz128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_E;
+      break;
+    case Intrinsic::x86_sse42_pcmpestriz128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_E;
+      break;
+    }
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86CC, MVT::i8),
+                                SDValue(PCMP.getNode(), 1));
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistri128:
+  case Intrinsic::x86_sse42_pcmpestri128: {
+    unsigned Opcode;
+    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+      Opcode = X86ISD::PCMPISTRI;
+    else
+      Opcode = X86ISD::PCMPESTRI;
+
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    return DAG.getNode(Opcode, dl, VTs, NewOps);
+  }
+  case Intrinsic::x86_fma_vfmadd_ps:
+  case Intrinsic::x86_fma_vfmadd_pd:
+  case Intrinsic::x86_fma_vfmsub_ps:
+  case Intrinsic::x86_fma_vfmsub_pd:
+  case Intrinsic::x86_fma_vfnmadd_ps:
+  case Intrinsic::x86_fma_vfnmadd_pd:
+  case Intrinsic::x86_fma_vfnmsub_ps:
+  case Intrinsic::x86_fma_vfnmsub_pd:
+  case Intrinsic::x86_fma_vfmaddsub_ps:
+  case Intrinsic::x86_fma_vfmaddsub_pd:
+  case Intrinsic::x86_fma_vfmsubadd_ps:
+  case Intrinsic::x86_fma_vfmsubadd_pd:
+  case Intrinsic::x86_fma_vfmadd_ps_256:
+  case Intrinsic::x86_fma_vfmadd_pd_256:
+  case Intrinsic::x86_fma_vfmsub_ps_256:
+  case Intrinsic::x86_fma_vfmsub_pd_256:
+  case Intrinsic::x86_fma_vfnmadd_ps_256:
+  case Intrinsic::x86_fma_vfnmadd_pd_256:
+  case Intrinsic::x86_fma_vfnmsub_ps_256:
+  case Intrinsic::x86_fma_vfnmsub_pd_256:
+  case Intrinsic::x86_fma_vfmaddsub_ps_256:
+  case Intrinsic::x86_fma_vfmaddsub_pd_256:
+  case Intrinsic::x86_fma_vfmsubadd_ps_256:
+  case Intrinsic::x86_fma_vfmsubadd_pd_256:
+  case Intrinsic::x86_fma_vfmadd_ps_512:
+  case Intrinsic::x86_fma_vfmadd_pd_512:
+  case Intrinsic::x86_fma_vfmsub_ps_512:
+  case Intrinsic::x86_fma_vfmsub_pd_512:
+  case Intrinsic::x86_fma_vfnmadd_ps_512:
+  case Intrinsic::x86_fma_vfnmadd_pd_512:
+  case Intrinsic::x86_fma_vfnmsub_ps_512:
+  case Intrinsic::x86_fma_vfnmsub_pd_512:
+  case Intrinsic::x86_fma_vfmaddsub_ps_512:
+  case Intrinsic::x86_fma_vfmaddsub_pd_512:
+  case Intrinsic::x86_fma_vfmsubadd_ps_512:
+  case Intrinsic::x86_fma_vfmsubadd_pd_512: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+    case Intrinsic::x86_fma_vfmadd_ps_512:
+    case Intrinsic::x86_fma_vfmadd_pd_512:
+      Opc = X86ISD::FMADD;
+      break;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+    case Intrinsic::x86_fma_vfmsub_ps_512:
+    case Intrinsic::x86_fma_vfmsub_pd_512:
+      Opc = X86ISD::FMSUB;
+      break;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+    case Intrinsic::x86_fma_vfnmadd_ps_512:
+    case Intrinsic::x86_fma_vfnmadd_pd_512:
+      Opc = X86ISD::FNMADD;
+      break;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+    case Intrinsic::x86_fma_vfnmsub_ps_512:
+    case Intrinsic::x86_fma_vfnmsub_pd_512:
+      Opc = X86ISD::FNMSUB;
+      break;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+    case Intrinsic::x86_fma_vfmaddsub_ps_512:
+    case Intrinsic::x86_fma_vfmaddsub_pd_512:
+      Opc = X86ISD::FMADDSUB;
+      break;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+    case Intrinsic::x86_fma_vfmsubadd_ps_512:
+    case Intrinsic::x86_fma_vfmsubadd_pd_512:
+      Opc = X86ISD::FMSUBADD;
+      break;
+    }
+
+    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
+  }
+}
+
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                             Index.getSimpleValueType().getVectorNumElements());
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  if (Src.getOpcode() == ISD::UNDEF)
+    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                             Index.getSimpleValueType().getVectorNumElements());
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Mask, SDValue Base, SDValue Index,
+                               SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT =
+    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  //SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
+// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
+// read performance monitor counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
+                              SelectionDAG &DAG, const X86Subtarget *Subtarget,
+                              SmallVectorImpl<SDValue> &Results) {
+  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue LO, HI;
+
+  // The ECX register is used to select the index of the performance counter
+  // to read.
+  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+                                   N->getOperand(2));
+  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+  // Reads the content of a 64-bit performance counter and returns it in the
+  // registers EDX:EAX.
+  if (Subtarget->is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  Chain = HI.getValue(1);
+
+  if (Subtarget->is64Bit()) {
+    // The EAX register is loaded with the low-order 32 bits. The EDX register
+    // is loaded with the supported high-order bits of the counter.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
+// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
+// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
+// also used to custom lower READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
+                              SelectionDAG &DAG, const X86Subtarget *Subtarget,
+                              SmallVectorImpl<SDValue> &Results) {
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+  SDValue LO, HI;
+
+  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+  // and the EAX register is loaded with the low-order 32 bits.
+  if (Subtarget->is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  SDValue Chain = HI.getValue(1);
+
+  if (Opcode == X86ISD::RDTSCP_DAG) {
+    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+    // the ECX register. Add 'ecx' explicitly to the chain.
+    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+                                     HI.getValue(2));
+    // Explicitly store the content of ECX at the location passed in input
+    // to the 'rdtscp' intrinsic.
+    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+                         MachinePointerInfo(), false, false, 0);
+  }
+
+  if (Subtarget->is64Bit()) {
+    // The EDX register is loaded with the high-order 32 bits of the MSR, and
+    // the EAX register is loaded with the low-order 32 bits.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  SmallVector<SDValue, 2> Results;
+  SDLoc DL(Op);
+  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                          Results);
+  return DAG.getMergeValues(Results, DL);
+}
+
+enum IntrinsicType {
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
+};
+
+struct IntrinsicData {
+  IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
+    :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
+  IntrinsicType Type;
+  unsigned      Opc0;
+  unsigned      Opc1;
+};
+
+std::map < unsigned, IntrinsicData> IntrMap;
+static void InitIntinsicsMap() {
+  static bool Initialized = false;
+  if (Initialized) 
+    return;
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
+                                IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
+                                IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
+
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
+                                IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
+   
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
+                                                        X86::VGATHERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
+                                                        X86::VGATHERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
+                                                        X86::VGATHERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
+                                                        X86::VGATHERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+                                                        X86::VSCATTERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+                                                        X86::VSCATTERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+                                                        X86::VSCATTERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+                                                        X86::VSCATTERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+                                IntrinsicData(XTEST,  X86ISD::XTEST,  0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
+                                IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+                                IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
+                                IntrinsicData(RDPMC,  X86ISD::RDPMC_DAG, 0)));
+  Initialized = true;
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  InitIntinsicsMap();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+  if (itr == IntrMap.end())
+    return SDValue();
+
+  SDLoc dl(Op);
+  IntrinsicData Intr = itr->second;
+  switch(Intr.Type) {
+  case RDSEED:
+  case RDRAND: {
+    // Emit the node with the right value type.
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+    SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
+
+    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+    // Otherwise return the value from Rand, which is always 0, casted to i32.
+    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+                      DAG.getConstant(1, Op->getValueType(1)),
+                      DAG.getConstant(X86::COND_B, MVT::i32),
+                      SDValue(Result.getNode(), 1) };
+    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
+                                  Ops);
+
+    // Return { result, isValid, chain }.
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+                       SDValue(Result.getNode(), 2));
+  }
+  case GATHER: {
+  //gather(v1, mask, index, base, scale);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src   = Op.getOperand(2);
+    SDValue Base  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Mask  = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+                          Subtarget);
+  }
+  case SCATTER: {
+  //scatter(base, mask, index, v1, scale);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base  = Op.getOperand(2);
+    SDValue Mask  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Src   = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+  }
+  case PREFETCH: {
+    SDValue Hint = Op.getOperand(6);
+    unsigned HintVal;
+    if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
+        (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
+      llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+    unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Mask  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+  }
+  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+  case RDTSC: {
+    SmallVector<SDValue, 2> Results;
+    getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
+  }
+  // Read Performance Monitoring Counters.
+  case RDPMC: {
+    SmallVector<SDValue, 2> Results;
+    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
+  }
+  // XTEST intrinsics.
+  case XTEST: {
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+    SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86::COND_NE, MVT::i8),
+                                InTrans);
+    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+                       Ret, SDValue(InTrans.getNode(), 1));
+  }
+  }
+  llvm_unreachable("Unknown Intrinsic Type");
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  EVT PtrVT = getPointerTy();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, PtrVT,
+                                   FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+          (FrameReg == X86::EBP && VT == MVT::i32)) &&
+         "Invalid Frame Register!");
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("esp", X86::ESP)
+                       .Case("rsp", X86::RSP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc dl      (Op);
+
+  EVT PtrVT = getPointerTy();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+         "Invalid Frame Register!");
+  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+
+  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+                                 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
+  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
+  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
+                       false, false, 0);
+  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+
+  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
+                     DAG.getRegister(StoreAddrReg, PtrVT));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+  return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Root = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl (Op);
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
+
+  if (Subtarget->is64Bit()) {
+    SDValue OutChains[6];
+
+    // Large code-model.
+    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
+    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+    // Load the pointer to the nested function into R11.
+    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+    SDValue Addr = Trmp;
+    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr),
+                                false, false, 0);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(2, MVT::i64));
+    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+                                MachinePointerInfo(TrmpAddr, 2),
+                                false, false, 2);
+
+    // Load the 'nest' parameter value into R10.
+    // R10 is specified in X86CallingConv.td
+    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(10, MVT::i64));
+    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr, 10),
+                                false, false, 0);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(12, MVT::i64));
+    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+                                MachinePointerInfo(TrmpAddr, 12),
+                                false, false, 2);
+
+    // Jump to the nested function.
+    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(20, MVT::i64));
+    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr, 20),
+                                false, false, 0);
+
+    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(22, MVT::i64));
+    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
+                                MachinePointerInfo(TrmpAddr, 22),
+                                false, false, 0);
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  } else {
+    const Function *Func =
+      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+    CallingConv::ID CC = Func->getCallingConv();
+    unsigned NestReg;
+
+    switch (CC) {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::C:
+    case CallingConv::X86_StdCall: {
+      // Pass 'nest' parameter in ECX.
+      // Must be kept in sync with X86CallingConv.td
+      NestReg = X86::ECX;
+
+      // Check that ECX wasn't needed by an 'inreg' parameter.
+      FunctionType *FTy = Func->getFunctionType();
+      const AttributeSet &Attrs = Func->getAttributes();
+
+      if (!Attrs.isEmpty() && !Func->isVarArg()) {
+        unsigned InRegCount = 0;
+        unsigned Idx = 1;
+
+        for (FunctionType::param_iterator I = FTy->param_begin(),
+             E = FTy->param_end(); I != E; ++I, ++Idx)
+          if (Attrs.hasAttribute(Idx, Attribute::InReg))
+            // FIXME: should only count parameters that are lowered to integers.
+            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+
+        if (InRegCount > 2) {
+          report_fatal_error("Nest register in use - reduce number of inreg"
+                             " parameters!");
+        }
+      }
+      break;
+    }
+    case CallingConv::X86_FastCall:
+    case CallingConv::X86_ThisCall:
+    case CallingConv::Fast:
+      // Pass 'nest' parameter in EAX.
+      // Must be kept in sync with X86CallingConv.td
+      NestReg = X86::EAX;
+      break;
+    }
+
+    SDValue OutChains[4];
+    SDValue Addr, Disp;
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(10, MVT::i32));
+    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
+    // This is storing the opcode for MOV32ri.
+    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+    OutChains[0] = DAG.getStore(Root, dl,
+                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+                                Trmp, MachinePointerInfo(TrmpAddr),
+                                false, false, 0);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(1, MVT::i32));
+    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+                                MachinePointerInfo(TrmpAddr, 1),
+                                false, false, 1);
+
+    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(5, MVT::i32));
+    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
+                                MachinePointerInfo(TrmpAddr, 5),
+                                false, false, 1);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(6, MVT::i32));
+    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+                                MachinePointerInfo(TrmpAddr, 6),
+                                false, false, 1);
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  /*
+   The rounding mode is in bits 11:10 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to -inf
+     10 Round to +inf
+     11 Round to 0
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetMachine &TM = MF.getTarget();
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+
+  // Save FP Control Word to stack slot
+  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  MachineMemOperand *MMO =
+   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                           MachineMemOperand::MOStore, 2, 2);
+
+  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
+  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+                                          DAG.getVTList(MVT::Other),
+                                          Ops, MVT::i16, MMO);
+
+  // Load FP Control Word from stack slot
+  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
+                            MachinePointerInfo(), false, false, false, 0);
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::SRL, DL, MVT::i16,
+                DAG.getNode(ISD::AND, DL, MVT::i16,
+                            CWD, DAG.getConstant(0x800, MVT::i16)),
+                DAG.getConstant(11, MVT::i8));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, DL, MVT::i16,
+                DAG.getNode(ISD::AND, DL, MVT::i16,
+                            CWD, DAG.getConstant(0x400, MVT::i16)),
+                DAG.getConstant(9, MVT::i8));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::AND, DL, MVT::i16,
+                DAG.getNode(ISD::ADD, DL, MVT::i16,
+                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
+                            DAG.getConstant(1, MVT::i16)),
+                DAG.getConstant(3, MVT::i16));
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
+}
+
+static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // If src is zero (i.e. bsr sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits+NumBits-1, OpVT),
+    DAG.getConstant(X86::COND_E, MVT::i8),
+    Op.getValue(1)
+  };
+  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+
+  // Finally xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse).
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // And xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  Op = Op.getOperand(0);
+
+  // Issue a bsf (scan bits forward) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+  // If src is zero (i.e. bsf sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits, VT),
+    DAG.getConstant(X86::COND_E, MVT::i8),
+    Op.getValue(1)
+  };
+  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
+}
+
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is256BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  SDLoc dl(Op);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+                        SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget->hasInt256())
+    return Lower256IntArith(Op, DAG);
+
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+  if (VT == MVT::v4i32) {
+    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+           "Should not custom lower when pmuldq is available!");
+
+    // Extract the odd parts.
+    static const int UnpackMask[] = { 1, -1, 3, -1 };
+    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+    // Multiply the even parts.
+    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+    // Now multiply odd parts.
+    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+    // Merge the two vectors back together with a shuffle. This expands into 2
+    // shuffles.
+    static const int ShufMask[] = { 0, 4, 2, 6 };
+    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+  }
+
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+         "Only know how to lower V2I64/V4I64/V8I64 multiply");
+
+  //  Ahi = psrlqi(a, 32);
+  //  Bhi = psrlqi(b, 32);
+  //
+  //  AloBlo = pmuludq(a, b);
+  //  AloBhi = pmuludq(a, Bhi);
+  //  AhiBlo = pmuludq(Ahi, b);
+
+  //  AloBhi = psllqi(AloBhi, 32);
+  //  AhiBlo = psllqi(AhiBlo, 32);
+  //  return AloBlo + AloBhi + AhiBlo;
+
+  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
+
+  // Bit cast to 32-bit vectors for MULUDQ
+  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
+  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
+  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
+  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
+  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
+
+  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+
+  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+
+  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+}
+
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  RTLIB::Libcall LC;
+  bool isSigned;
+  switch (Op->getOpcode()) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
+  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
+  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
+  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
+  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
+  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+  }
+
+  SDLoc dl(Op);
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+           "Unexpected argument type for lowering");
+    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+    Entry.Node = StackPtr;
+    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
+                           false, false, 16);
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Ty = PointerType::get(ArgTy,0);
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC),
+               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+               Callee, std::move(Args), 0)
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+}
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+  EVT VT = Op0.getValueType();
+  SDLoc dl(Op);
+
+  assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+         (VT == MVT::v8i32 && Subtarget->hasInt256()));
+
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widen result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other word, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, with need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+
+  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+  // ints.
+  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+  unsigned Opcode =
+      (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
+                             DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
+  SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+
+  // Shuffle it back into the right order.
+  // The internal representation is big endian.
+  // In other words, a i64 bitcasted to 2 x i32 has its high part at index 0
+  // and its low part at index 1.
+  // Moreover, we have: Mul1 = <ae|cg> ; Mul2 = <bf|dh>
+  // Vector index                0 1   ;          2 3
+  // We want      <ae|bf|cg|dh>
+  // Vector index   0  2  1  3
+  // Since each element is seen as 2 x i32, we get:
+  // high_mask[i] = 2 x vector_index[i]
+  // low_mask[i] = 2 x vector_index[i] + 1
+  // where vector_index = {0, Size/2, 1, Size/2 + 1, ...,
+  //                       Size/2 - 1, Size/2 + Size/2 - 1}
+  // where Size is the number of element of the final vector.
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {0, 4, 2, 6};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 5, 3, 7};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
+
+  // If we have a signed multiply but no PMULDQ fix up the high parts of a
+  // unsigned multiply.
+  if (IsSigned && !Subtarget->hasSSE41()) {
+    SDValue ShAmt =
+        DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
+  }
+
+  // The low part of a MUL_LOHI is supposed to be the first value and the
+  // high part the second value.
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs);
+}
+
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+                                         const X86Subtarget *Subtarget) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  // Optimize shl/srl/sra with constant shift amount.
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+      uint64_t ShiftAmt = ShiftConst->getZExtValue();
+
+      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+          (Subtarget->hasInt256() &&
+           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+          (Subtarget->hasAVX512() &&
+           (VT == MVT::v8i64 || VT == MVT::v16i32))) {
+        if (Op.getOpcode() == ISD::SHL)
+          return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                            DAG);
+        if (Op.getOpcode() == ISD::SRL)
+          return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                            DAG);
+        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
+          return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                            DAG);
+      }
+
+      if (VT == MVT::v16i8) {
+        if (Op.getOpcode() == ISD::SHL) {
+          // Make a large shift.
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                   MVT::v8i16, R, ShiftAmt,
+                                                   DAG);
+          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+          // Zero out the rightmost bits.
+          SmallVector<SDValue, 16> V(16,
+                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SHL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+        }
+        if (Op.getOpcode() == ISD::SRL) {
+          // Make a large shift.
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                   MVT::v8i16, R, ShiftAmt,
+                                                   DAG);
+          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+          // Zero out the leftmost bits.
+          SmallVector<SDValue, 16> V(16,
+                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SRL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+        }
+        if (Op.getOpcode() == ISD::SRA) {
+          if (ShiftAmt == 7) {
+            // R s>> 7  ===  R s< 0
+            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+          }
+
+          // R s>> a === ((R u>> a) ^ m) - m
+          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+                                                         MVT::i8));
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
+          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+          return Res;
+        }
+        llvm_unreachable("Unknown shift opcode.");
+      }
+
+      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
+        if (Op.getOpcode() == ISD::SHL) {
+          // Make a large shift.
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                   MVT::v16i16, R, ShiftAmt,
+                                                   DAG);
+          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+          // Zero out the rightmost bits.
+          SmallVector<SDValue, 32> V(32,
+                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SHL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+        }
+        if (Op.getOpcode() == ISD::SRL) {
+          // Make a large shift.
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                   MVT::v16i16, R, ShiftAmt,
+                                                   DAG);
+          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+          // Zero out the leftmost bits.
+          SmallVector<SDValue, 32> V(32,
+                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SRL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+        }
+        if (Op.getOpcode() == ISD::SRA) {
+          if (ShiftAmt == 7) {
+            // R s>> 7  ===  R s< 0
+            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+          }
+
+          // R s>> a === ((R u>> a) ^ m) - m
+          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
+                                                         MVT::i8));
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
+          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+          return Res;
+        }
+        llvm_unreachable("Unknown shift opcode.");
+      }
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget->is64Bit() &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      Amt.getOpcode() == ISD::BITCAST &&
+      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+    uint64_t ShiftAmt = 0;
+    for (unsigned i = 0; i != Ratio; ++i) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
+      if (!C)
+        return SDValue();
+      // 6 == Log2(64)
+      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+    }
+    // Check remaining shift amounts.
+    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+      uint64_t ShAmt = 0;
+      for (unsigned j = 0; j != Ratio; ++j) {
+        ConstantSDNode *C =
+          dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+        if (!C)
+          return SDValue();
+        // 6 == Log2(64)
+        ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+      }
+      if (ShAmt != ShiftAmt)
+        return SDValue();
+    }
+    switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("Unknown shift opcode!");
+    case ISD::SHL:
+      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                        DAG);
+    case ISD::SRL:
+      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                        DAG);
+    case ISD::SRA:
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                        DAG);
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+                                        const X86Subtarget* Subtarget) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
+      VT == MVT::v4i32 || VT == MVT::v8i16 ||
+      (Subtarget->hasInt256() &&
+       ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
+        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+       (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
+    SDValue BaseShAmt;
+    EVT EltVT = VT.getVectorElementType();
+
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      unsigned NumElts = VT.getVectorNumElements();
+      unsigned i, j;
+      for (i = 0; i != NumElts; ++i) {
+        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
+          continue;
+        break;
+      }
+      for (j = i; j != NumElts; ++j) {
+        SDValue Arg = Amt.getOperand(j);
+        if (Arg.getOpcode() == ISD::UNDEF) continue;
+        if (Arg != Amt.getOperand(i))
+          break;
+      }
+      if (i != NumElts && j == NumElts)
+        BaseShAmt = Amt.getOperand(i);
+    } else {
+      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+        Amt = Amt.getOperand(0);
+      if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
+               cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+        SDValue InVec = Amt.getOperand(0);
+        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+          unsigned NumElts = InVec.getValueType().getVectorNumElements();
+          unsigned i = 0;
+          for (; i != NumElts; ++i) {
+            SDValue Arg = InVec.getOperand(i);
+            if (Arg.getOpcode() == ISD::UNDEF) continue;
+            BaseShAmt = Arg;
+            break;
+          }
+        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+           if (ConstantSDNode *C =
+               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+             unsigned SplatIdx =
+               cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
+             if (C->getZExtValue() == SplatIdx)
+               BaseShAmt = InVec.getOperand(1);
+           }
+        }
+        if (!BaseShAmt.getNode())
+          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
+                                  DAG.getIntPtrConstant(0));
+      }
+    }
+
+    if (BaseShAmt.getNode()) {
+      if (EltVT.bitsGT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+      else if (EltVT.bitsLT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+      switch (Op.getOpcode()) {
+      default:
+        llvm_unreachable("Unknown shift opcode!");
+      case ISD::SHL:
+        switch (VT.SimpleTy) {
+        default: return SDValue();
+        case MVT::v2i64:
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v4i64:
+        case MVT::v8i32:
+        case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
+          return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
+        }
+      case ISD::SRA:
+        switch (VT.SimpleTy) {
+        default: return SDValue();
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v8i32:
+        case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
+          return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
+        }
+      case ISD::SRL:
+        switch (VT.SimpleTy) {
+        default: return SDValue();
+        case MVT::v2i64:
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v4i64:
+        case MVT::v8i32:
+        case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
+          return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
+        }
+      }
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget->is64Bit() &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+      (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
+      Amt.getOpcode() == ISD::BITCAST &&
+      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    std::vector<SDValue> Vals(Ratio);
+    for (unsigned i = 0; i != Ratio; ++i)
+      Vals[i] = Amt.getOperand(i);
+    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+      for (unsigned j = 0; j != Ratio; ++j)
+        if (Vals[j] != Amt.getOperand(i + j))
+          return SDValue();
+    }
+    switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("Unknown shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+                          SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+  SDValue V;
+
+  assert(VT.isVector() && "Custom lowering only for vector shifts!");
+  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
+
+  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
+  if (V.getNode())
+    return V;
+
+  V = LowerScalarVariableShift(Op, DAG, Subtarget);
+  if (V.getNode())
+      return V;
+
+  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+    return Op;
+  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
+  if (Subtarget->hasInt256()) {
+    if (Op.getOpcode() == ISD::SRL &&
+        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+         VT == MVT::v4i64 || VT == MVT::v8i32))
+      return Op;
+    if (Op.getOpcode() == ISD::SHL &&
+        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+         VT == MVT::v4i64 || VT == MVT::v8i32))
+      return Op;
+    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
+      return Op;
+  }
+
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  // Do this only if the vector shift count is a constant build_vector.
+  if (Op.getOpcode() == ISD::SHL && 
+      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+       (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
+      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+    SmallVector<SDValue, 8> Elts;
+    EVT SVT = VT.getScalarType();
+    unsigned SVTBits = SVT.getSizeInBits();
+    const APInt &One = APInt(SVTBits, 1);
+    unsigned NumElems = VT.getVectorNumElements();
+
+    for (unsigned i=0; i !=NumElems; ++i) {
+      SDValue Op = Amt->getOperand(i);
+      if (Op->getOpcode() == ISD::UNDEF) {
+        Elts.push_back(Op);
+        continue;
+      }
+
+      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+      const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+      uint64_t ShAmt = C.getZExtValue();
+      if (ShAmt >= SVTBits) {
+        Elts.push_back(DAG.getUNDEF(SVT));
+        continue;
+      }
+      Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+    }
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+  }
+
+  // Lower SHL with variable shift amount.
+  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
+    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
+
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
+    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
+    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+  }
+
+  // If possible, lower this shift as a sequence of two shifts by
+  // constant plus a MOVSS/MOVSD instead of scalarizing it.
+  // Example:
+  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+  //
+  // Could be rewritten as:
+  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+  //
+  // The advantage is that the two shifts from the example would be
+  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+  // the vector shift into four scalar shifts plus four pairs of vector
+  // insert/extract.
+  if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
+      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+    unsigned TargetOpcode = X86ISD::MOVSS;
+    bool CanBeSimplified;
+    // The splat value for the first packed shift (the 'X' from the example).
+    SDValue Amt1 = Amt->getOperand(0);
+    // The splat value for the second packed shift (the 'Y' from the example).
+    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
+                                        Amt->getOperand(2);
+
+    // See if it is possible to replace this node with a sequence of
+    // two shifts followed by a MOVSS/MOVSD
+    if (VT == MVT::v4i32) {
+      // Check if it is legal to use a MOVSS.
+      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+                        Amt2 == Amt->getOperand(3);
+      if (!CanBeSimplified) {
+        // Otherwise, check if we can still simplify this node using a MOVSD.
+        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+                          Amt->getOperand(2) == Amt->getOperand(3);
+        TargetOpcode = X86ISD::MOVSD;
+        Amt2 = Amt->getOperand(2);
+      }
+    } else {
+      // Do similar checks for the case where the machine value type
+      // is MVT::v8i16.
+      CanBeSimplified = Amt1 == Amt->getOperand(1);
+      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+        CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+      if (!CanBeSimplified) {
+        TargetOpcode = X86ISD::MOVSD;
+        CanBeSimplified = true;
+        Amt2 = Amt->getOperand(4);
+        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+          CanBeSimplified = Amt1 == Amt->getOperand(i);
+        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+          CanBeSimplified = Amt2 == Amt->getOperand(j);
+      }
+    }
+    
+    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+        isa<ConstantSDNode>(Amt2)) {
+      // Replace this node with two shifts followed by a MOVSS/MOVSD.
+      EVT CastVT = MVT::v4i32;
+      SDValue Splat1 = 
+        DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
+      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+      SDValue Splat2 = 
+        DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
+      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+      if (TargetOpcode == X86ISD::MOVSD)
+        CastVT = MVT::v2i64;
+      SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
+      SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+      SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
+                                            BitCast1, DAG);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
+  }
+
+  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
+    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
+
+    // a = a << 5;
+    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
+    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+    // Turn 'a' into a mask suitable for VSELECT
+    SDValue VSelM = DAG.getConstant(0x80, VT);
+    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+    SDValue CM1 = DAG.getConstant(0x0f, VT);
+    SDValue CM2 = DAG.getConstant(0x3f, VT);
+
+    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
+    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
+    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
+    // a += a
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
+    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
+    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
+    // a += a
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+    // return VSELECT(r, r+r, a);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
+                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
+    return R;
+  }
+
+  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+  // solution better.
+  if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+    MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+    unsigned ExtOpc =
+        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    R = DAG.getNode(ExtOpc, dl, NewVT, R);
+    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
+    }
+
+  // Decompose 256-bit shifts into smaller 128-bit shifts.
+  if (VT.is256BitVector()) {
+    unsigned NumElems = VT.getVectorNumElements();
+    MVT EltVT = VT.getVectorElementType();
+    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+    // Extract the two vectors
+    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
+
+    // Recreate the shift amount vectors
+    SDValue Amt1, Amt2;
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      // Constant shift amount
+      SmallVector<SDValue, 4> Amt1Csts;
+      SmallVector<SDValue, 4> Amt2Csts;
+      for (unsigned i = 0; i != NumElems/2; ++i)
+        Amt1Csts.push_back(Amt->getOperand(i));
+      for (unsigned i = NumElems/2; i != NumElems; ++i)
+        Amt2Csts.push_back(Amt->getOperand(i));
+
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
+    } else {
+      // Variable shift amount
+      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
+    }
+
+    // Issue new vector shifts for the smaller types
+    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+    // Concatenate the result back
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+  // looks for this combo and may remove the "setcc" instruction if the "setcc"
+  // has only one use.
+  SDNode *N = Op.getNode();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  unsigned BaseOp = 0;
+  unsigned Cond = 0;
+  SDLoc DL(Op);
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Unknown ovf instruction!");
+  case ISD::SADDO:
+    // A subtract of one will be selected as a INC. Note that INC doesn't
+    // set CF, so we can't do this for UADDO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+      if (C->isOne()) {
+        BaseOp = X86ISD::INC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UADDO:
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SSUBO:
+    // A subtract of one will be selected as a DEC. Note that DEC doesn't
+    // set CF, so we can't do this for USUBO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+      if (C->isOne()) {
+        BaseOp = X86ISD::DEC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_O;
+    break;
+  case ISD::USUBO:
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SMULO:
+    BaseOp = X86ISD::SMUL;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
+                                 MVT::i32);
+    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
+
+    SDValue SetCC =
+      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                  DAG.getConstant(X86::COND_O, MVT::i32),
+                  SDValue(Sum.getNode(), 2));
+
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+  }
+  }
+
+  // Also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+  SDValue SetCC =
+    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
+                DAG.getConstant(Cond, MVT::i32),
+                SDValue(Sum.getNode(), 1));
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+}
+
+SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+
+  if (!Subtarget->hasSSE2() || !VT.isVector())
+    return SDValue();
+
+  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                      ExtraVT.getScalarType().getSizeInBits();
+
+  switch (VT.SimpleTy) {
+    default: return SDValue();
+    case MVT::v8i32:
+    case MVT::v16i16:
+      if (!Subtarget->hasFp256())
+        return SDValue();
+      if (!Subtarget->hasInt256()) {
+        // needs to be split
+        unsigned NumElems = VT.getVectorNumElements();
+
+        // Extract the LHS vectors
+        SDValue LHS = Op.getOperand(0);
+        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+        MVT EltVT = VT.getVectorElementType();
+        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+        EVT ExtraEltVT = ExtraVT.getVectorElementType();
+        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
+        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+                                   ExtraNumElems/2);
+        SDValue Extra = DAG.getValueType(ExtraVT);
+
+        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+      }
+      // fall through
+    case MVT::v4i32:
+    case MVT::v8i16: {
+      SDValue Op0 = Op.getOperand(0);
+      SDValue Op00 = Op0.getOperand(0);
+      SDValue Tmp1;
+      // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
+      if (Op0.getOpcode() == ISD::BITCAST &&
+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // (sext (vzext x)) -> (vsext x)
+        Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
+        if (Tmp1.getNode()) {
+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
+          // This folding is only valid when the in-reg type is a vector of i8,
+          // i16, or i32.
+          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+              ExtraEltVT == MVT::i32) {
+            SDValue Tmp1Op0 = Tmp1.getOperand(0);
+            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+                   "This optimization is invalid without a VZEXT.");
+            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          }
+          Op0 = Tmp1;
+        }
+      }
+
+      // If the above didn't work, then just use Shift-Left + Shift-Right.
+      Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
+                                        DAG);
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+                                        DAG);
+    }
+  }
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // The only fence that needs an instruction is a sequentially-consistent
+  // cross-thread fence.
+  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+    // no-sse2). There isn't any reason to disable it if the target processor
+    // supports it.
+    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+    SDValue Chain = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, MVT::i32);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::ESP, MVT::i32), // Base
+      DAG.getTargetConstant(1, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i32),        // Index
+      DAG.getTargetConstant(0, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),        // Segment.
+      Zero,
+      Chain
+    };
+    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG) {
+  MVT T = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  unsigned Reg = 0;
+  unsigned size = 0;
+  switch(T.SimpleTy) {
+  default: llvm_unreachable("Invalid value type!");
+  case MVT::i8:  Reg = X86::AL;  size = 1; break;
+  case MVT::i16: Reg = X86::AX;  size = 2; break;
+  case MVT::i32: Reg = X86::EAX; size = 4; break;
+  case MVT::i64:
+    assert(Subtarget->is64Bit() && "Node not type legal!");
+    Reg = X86::RAX; size = 8;
+    break;
+  }
+  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+                                  Op.getOperand(2), SDValue());
+  SDValue Ops[] = { cpIn.getValue(0),
+                    Op.getOperand(1),
+                    Op.getOperand(3),
+                    DAG.getTargetConstant(size, MVT::i8),
+                    cpIn.getValue(1) };
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+                                           Ops, T, MMO);
+
+  SDValue cpOut =
+    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+                                      MVT::i32, cpOut.getValue(2));
+  SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
+                                DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+  return SDValue();
+}
+
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+
+  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    if (DstVT != MVT::f64)
+      // This conversion needs to be expanded.
+      return SDValue();
+
+    SDValue InVec = Op->getOperand(0);
+    SDLoc dl(Op);
+    unsigned NumElts = SrcVT.getVectorNumElements();
+    EVT SVT = SrcVT.getVectorElementType();
+
+    // Widen the vector in input in the case of MVT::v2i32.
+    // Example: from MVT::v2i32 to MVT::v4i32.
+    SmallVector<SDValue, 16> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
+                                 DAG.getIntPtrConstant(i)));
+
+    // Explicitly mark the extra elements as Undef.
+    SDValue Undef = DAG.getUNDEF(SVT);
+    for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
+      Elts.push_back(Undef);
+
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+    SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+         Subtarget->hasMMX() && "Unexpected custom BITCAST");
+  assert((DstVT == MVT::i64 ||
+          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
+         "Unexpected custom BITCAST");
+  // i64 <=> MMX conversions are Legal.
+  if (SrcVT==MVT::i64 && DstVT.isVector())
+    return Op;
+  if (DstVT==MVT::i64 && SrcVT.isVector())
+    return Op;
+  // MMX <=> MMX conversions are Legal.
+  if (SrcVT.isVector() && DstVT.isVector())
+    return Op;
+  // All other conversions need to be expanded.
+  return SDValue();
+}
+
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  EVT T = Node->getValueType(0);
+  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+                              DAG.getConstant(0, T), Node->getOperand(2));
+  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+                       cast<AtomicSDNode>(Node)->getMemoryVT(),
+                       Node->getOperand(0),
+                       Node->getOperand(1), negOp,
+                       cast<AtomicSDNode>(Node)->getMemOperand(),
+                       cast<AtomicSDNode>(Node)->getOrdering(),
+                       cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert seq_cst store -> xchg
+  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+  // FIXME: On 32-bit, store -> fist or movq would be more efficient
+  //        (The only way to get a 16-byte store is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
+                                 Node->getOperand(0),
+                                 Node->getOperand(1), Node->getOperand(2),
+                                 cast<AtomicSDNode>(Node)->getMemOperand(),
+                                 cast<AtomicSDNode>(Node)->getOrdering(),
+                                 cast<AtomicSDNode>(Node)->getSynchScope());
+    return Swap.getValue(1);
+  }
+  // Other atomic stores have a simple pattern.
+  return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getNode()->getSimpleValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid code");
+  case ISD::ADDC: Opc = X86ISD::ADD; break;
+  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
+  case ISD::SUBC: Opc = X86ISD::SUB; break;
+  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
+  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+
+  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+  // which returns the values as { float, float } (in XMM0) or
+  // { double, double } (which is returned in XMM0, XMM1).
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  bool isF64 = ArgVT == MVT::f64;
+  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+  // the small struct {f32, f32} is returned in (eax, edx). For f64,
+  // the results are returned via SRet in memory.
+  const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
+
+  Type *RetTy = isF64
+    ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
+    : (Type*)VectorType::get(ArgTy, 4);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+  if (isF64)
+    // Returned in xmm0 and xmm1.
+    return CallResult.first;
+
+  // Returned in bits 0:31 and 32:64 xmm0.
+  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+                               CallResult.first, DAG.getIntPtrConstant(0));
+  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+                               CallResult.first, DAG.getIntPtrConstant(1));
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+    return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::SHL_PARTS:
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
+  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
+  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::FABS:               return LowerFABS(Op, DAG);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG);
+  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET:
+                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
+  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:              return LowerXALUO(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ADD:                return LowerADD(Op, DAG);
+  case ISD::SUB:                return LowerSUB(Op, DAG);
+  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
+  }
+}
+
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+                               SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) {
+  SDLoc dl(Node);
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert wide load -> cmpxchg8b/cmpxchg16b
+  // FIXME: On 32-bit, load -> fild or movq would be more efficient
+  //        (The only way to get a 16-byte load is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+  SDValue Swap =
+      DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
+                           Node->getOperand(0), Node->getOperand(1), Zero, Zero,
+                           cast<AtomicSDNode>(Node)->getMemOperand(),
+                           cast<AtomicSDNode>(Node)->getOrdering(),
+                           cast<AtomicSDNode>(Node)->getOrdering(),
+                           cast<AtomicSDNode>(Node)->getSynchScope());
+  Results.push_back(Swap.getValue(0));
+  Results.push_back(Swap.getValue(2));
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:
+    // We don't want to expand or promote these.
+    return;
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    Results.push_back(V);
+    return;
+  }
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: {
+    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
+      return;
+
+    std::pair<SDValue,SDValue> Vals =
+        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
+    SDValue FIST = Vals.first, StackSlot = Vals.second;
+    if (FIST.getNode()) {
+      EVT VT = N->getValueType(0);
+      // Return a load from the stack slot.
+      if (StackSlot.getNode())
+        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
+                                      MachinePointerInfo(),
+                                      false, false, false, 0));
+      else
+        Results.push_back(FIST);
+    }
+    return;
+  }
+  case ISD::UINT_TO_FP: {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
+        N->getValueType(0) != MVT::v2f32)
+      return;
+    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+                                 N->getOperand(0));
+    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                     MVT::f64);
+    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
+    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+    return;
+  }
+  case ISD::FP_ROUND: {
+    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+        return;
+    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+    Results.push_back(V);
+    return;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default : llvm_unreachable("Do not know how to custom type "
+                               "legalize this intrinsic operation!");
+    case Intrinsic::x86_rdtsc:
+      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                                     Results);
+    case Intrinsic::x86_rdtscp:
+      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+                                     Results);
+    case Intrinsic::x86_rdpmc:
+      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
+    }
+  }
+  case ISD::READCYCLECOUNTER: {
+    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                                   Results);
+  }
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+    EVT T = N->getValueType(0);
+    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+    bool Regs64bit = T == MVT::i128;
+    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+    SDValue cpInL, cpInH;
+    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(0, HalfT));
+    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(1, HalfT));
+    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+                             Regs64bit ? X86::RAX : X86::EAX,
+                             cpInL, SDValue());
+    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+                             Regs64bit ? X86::RDX : X86::EDX,
+                             cpInH, cpInL.getValue(1));
+    SDValue swapInL, swapInH;
+    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(0, HalfT));
+    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(1, HalfT));
+    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+                               Regs64bit ? X86::RBX : X86::EBX,
+                               swapInL, cpInH.getValue(1));
+    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+                               Regs64bit ? X86::RCX : X86::ECX,
+                               swapInH, swapInL.getValue(1));
+    SDValue Ops[] = { swapInH.getValue(0),
+                      N->getOperand(1),
+                      swapInH.getValue(1) };
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+                                  X86ISD::LCMPXCHG8_DAG;
+    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+                                        Regs64bit ? X86::RAX : X86::EAX,
+                                        HalfT, Result.getValue(1));
+    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+                                        Regs64bit ? X86::RDX : X86::EDX,
+                                        HalfT, cpOutL.getValue(2));
+    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+                                        MVT::i32, cpOutH.getValue(2));
+    SDValue Success =
+        DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                    DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
+    Results.push_back(Success);
+    Results.push_back(EFLAGS.getValue(1));
+    return;
+  }
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+    // Delegate to generic TypeLegalization. Situations we can really handle
+    // should have already been dealt with by X86AtomicExpand.cpp.
+    break;
+  case ISD::ATOMIC_LOAD: {
+    ReplaceATOMIC_LOAD(N, Results, DAG);
+    return;
+  }
+  case ISD::BITCAST: {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    EVT DstVT = N->getValueType(0);
+    EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+    if (SrcVT != MVT::f64 ||
+        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+      return;
+
+    unsigned NumElts = DstVT.getVectorNumElements();
+    EVT SVT = DstVT.getVectorElementType();
+    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   MVT::v2f64, N->getOperand(0));
+    SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+
+    if (ExperimentalVectorWideningLegalization) {
+      // If we are legalizing vectors by widening, we already have the desired
+      // legal vector type, just return it.
+      Results.push_back(ToVecInt);
+      return;
+    }
+
+    SmallVector<SDValue, 8> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+                                   ToVecInt, DAG.getIntPtrConstant(i)));
+
+    Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+  }
+  }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  case X86ISD::BSF:                return "X86ISD::BSF";
+  case X86ISD::BSR:                return "X86ISD::BSR";
+  case X86ISD::SHLD:               return "X86ISD::SHLD";
+  case X86ISD::SHRD:               return "X86ISD::SHRD";
+  case X86ISD::FAND:               return "X86ISD::FAND";
+  case X86ISD::FANDN:              return "X86ISD::FANDN";
+  case X86ISD::FOR:                return "X86ISD::FOR";
+  case X86ISD::FXOR:               return "X86ISD::FXOR";
+  case X86ISD::FSRL:               return "X86ISD::FSRL";
+  case X86ISD::FILD:               return "X86ISD::FILD";
+  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
+  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+  case X86ISD::FLD:                return "X86ISD::FLD";
+  case X86ISD::FST:                return "X86ISD::FST";
+  case X86ISD::CALL:               return "X86ISD::CALL";
+  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
+  case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
+  case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
+  case X86ISD::BT:                 return "X86ISD::BT";
+  case X86ISD::CMP:                return "X86ISD::CMP";
+  case X86ISD::COMI:               return "X86ISD::COMI";
+  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::CMPM:               return "X86ISD::CMPM";
+  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
+  case X86ISD::SETCC:              return "X86ISD::SETCC";
+  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
+  case X86ISD::FSETCC:             return "X86ISD::FSETCC";
+  case X86ISD::CMOV:               return "X86ISD::CMOV";
+  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
+  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
+  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
+  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
+  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
+  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
+  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
+  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
+  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
+  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
+  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
+  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
+  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
+  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
+  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
+  case X86ISD::HADD:               return "X86ISD::HADD";
+  case X86ISD::HSUB:               return "X86ISD::HSUB";
+  case X86ISD::FHADD:              return "X86ISD::FHADD";
+  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
+  case X86ISD::UMAX:               return "X86ISD::UMAX";
+  case X86ISD::UMIN:               return "X86ISD::UMIN";
+  case X86ISD::SMAX:               return "X86ISD::SMAX";
+  case X86ISD::SMIN:               return "X86ISD::SMIN";
+  case X86ISD::FMAX:               return "X86ISD::FMAX";
+  case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
+  case X86ISD::FMINC:              return "X86ISD::FMINC";
+  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
+  case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
+  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
+  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
+  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
+  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
+  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
+  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
+  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
+  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
+  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
+  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
+  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
+  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
+  case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
+  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
+  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
+  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
+  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
+  case X86ISD::VSHL:               return "X86ISD::VSHL";
+  case X86ISD::VSRL:               return "X86ISD::VSRL";
+  case X86ISD::VSRA:               return "X86ISD::VSRA";
+  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
+  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
+  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+  case X86ISD::CMPP:               return "X86ISD::CMPP";
+  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
+  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
+  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
+  case X86ISD::ADD:                return "X86ISD::ADD";
+  case X86ISD::SUB:                return "X86ISD::SUB";
+  case X86ISD::ADC:                return "X86ISD::ADC";
+  case X86ISD::SBB:                return "X86ISD::SBB";
+  case X86ISD::SMUL:               return "X86ISD::SMUL";
+  case X86ISD::UMUL:               return "X86ISD::UMUL";
+  case X86ISD::INC:                return "X86ISD::INC";
+  case X86ISD::DEC:                return "X86ISD::DEC";
+  case X86ISD::OR:                 return "X86ISD::OR";
+  case X86ISD::XOR:                return "X86ISD::XOR";
+  case X86ISD::AND:                return "X86ISD::AND";
+  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
+  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
+  case X86ISD::PTEST:              return "X86ISD::PTEST";
+  case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::TESTM:              return "X86ISD::TESTM";
+  case X86ISD::TESTNM:             return "X86ISD::TESTNM";
+  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
+  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
+  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
+  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
+  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
+  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
+  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
+  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
+  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
+  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
+  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
+  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
+  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
+  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
+  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
+  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
+  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
+  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
+  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
+  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
+  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
+  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
+  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
+  case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
+  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
+  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
+  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
+  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
+  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
+  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
+  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
+  case X86ISD::SAHF:               return "X86ISD::SAHF";
+  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
+  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
+  case X86ISD::FMADD:              return "X86ISD::FMADD";
+  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
+  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
+  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
+  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
+  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
+  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
+  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
+  case X86ISD::XTEST:              return "X86ISD::XTEST";
+  }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  // X86 supports extremely general addressing modes.
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  Reloc::Model R = getTargetMachine().getRelocationModel();
+
+  // X86 allows a sign-extended 32-bit immediate field as a displacement.
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
+    return false;
+
+  if (AM.BaseGV) {
+    unsigned GVFlags =
+      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+    // If a reference to this global requires an extra load, we can't fold it.
+    if (isGlobalStubReference(GVFlags))
+      return false;
+
+    // If BaseGV requires a register for the PIC base, we cannot also have a
+    // BaseReg specified.
+    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+      return false;
+
+    // If lower 4G is not available, then we must use rip-relative addressing.
+    if ((M != CodeModel::Small || R != Reloc::Static) &&
+        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+      return false;
+  }
+
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // These scales always work.
+    break;
+  case 3:
+  case 5:
+  case 9:
+    // These scales are formed with basereg+scalereg.  Only accept if there is
+    // no basereg yet.
+    if (AM.HasBaseReg)
+      return false;
+    break;
+  default:  // Other stuff never works.
+    return false;
+  }
+
+  return true;
+}
+
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+  unsigned Bits = Ty->getScalarSizeInBits();
+
+  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+  // particularly cheaper than those without.
+  if (Bits == 8)
+    return false;
+
+  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
+  // variable shifts just as cheap as scalar ones.
+  if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+    return false;
+
+  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+  // fully general vector.
+  return true;
+}
+
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Can also use sub to handle negated immediates.
+  return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2))
+    return true;
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // X86 has 8, 16, and 32-bit zero-extending loads.
+    return true;
+  }
+
+  return false;
+}
+
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+    return false;
+
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+  // i16 instructions are longer (0x66 prefix) and potentially slower.
+  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+
+  // Very little shuffling can be done for 64-bit vectors right now.
+  if (VT.getSizeInBits() == 64)
+    return false;
+
+  // If this is a single-input shuffle with no 128 bit lane crossings we can
+  // lower it into pshufb.
+  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+      (SVT.is256BitVector() && Subtarget->hasInt256())) {
+    bool isLegal = true;
+    for (unsigned I = 0, E = M.size(); I != E; ++I) {
+      if (M[I] >= (int)SVT.getVectorNumElements() ||
+          ShuffleCrosses128bitLane(SVT, I, M[I])) {
+        isLegal = false;
+        break;
+      }
+    }
+    if (isLegal)
+      return true;
+  }
+
+  // FIXME: blends, shifts.
+  return (SVT.getVectorNumElements() == 2 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isMOVLMask(M, SVT) ||
+          isMOVHLPSMask(M, SVT) ||
+          isSHUFPMask(M, SVT) ||
+          isPSHUFDMask(M, SVT) ||
+          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPALIGNRMask(M, SVT, Subtarget) ||
+          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+          isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                          EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+  unsigned NumElts = SVT.getVectorNumElements();
+  // FIXME: This collection of masks seems suspect.
+  if (NumElts == 2)
+    return true;
+  if (NumElts == 4 && SVT.is128BitVector()) {
+    return (isMOVLMask(Mask, SVT)  ||
+            isCommutedMOVLMask(Mask, SVT, true) ||
+            isSHUFPMask(Mask, SVT) ||
+            isSHUFPMask(Mask, SVT, /* Commuted */ true));
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+                                     const TargetInstrInfo *TII) {
+  DebugLoc DL = MI->getDebugLoc();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // For the v = xbegin(), we generate
+  //
+  // thisMBB:
+  //  xbegin sinkMBB
+  //
+  // mainMBB:
+  //  eax = -1
+  //
+  // sinkMBB:
+  //  v = eax
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  //  xbegin sinkMBB
+  //  # fallthrough to mainMBB
+  //  # abortion to sinkMBB
+  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(sinkMBB);
+
+  // mainMBB:
+  //  EAX = -1
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  // EAX is live into the sinkMBB
+  sinkMBB->addLiveIn(X86::EAX);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::EAX);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// or XMM0_V32I8 in AVX all of this code can be replaced with that
+// in the .td file.
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
+  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
+  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
+  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
+  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+  }
+
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI->getNumOperands();
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI->hasOneMemOperand())
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  BuildMI(*BB, MI, dl,
+    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::XMM0);
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
+  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
+  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
+  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
+  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
+  }
+
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI->getNumOperands(); // remove the results
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI->hasOneMemOperand())
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  BuildMI(*BB, MI, dl,
+    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::ECX);
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII,
+                                       const X86Subtarget* Subtarget) {
+  DebugLoc dl = MI->getDebugLoc();
+
+  // Address into RAX/EAX, other two args into ECX, EDX.
+  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+  for (int i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+
+  unsigned ValOps = X86::AddrNumOperands;
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+    .addReg(MI->getOperand(ValOps).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+    .addReg(MI->getOperand(ValOps+1).getReg());
+
+  // The instruction doesn't actually take any operands though.
+  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARG64WithCustomInserter(
+                   MachineInstr *MI,
+                   MachineBasicBlock *MBB) const {
+  // Emit va_arg instruction on X86-64.
+
+  // Operands to this pseudo-instruction:
+  // 0  ) Output        : destination address (reg)
+  // 1-5) Input         : va_list address (addr, i64mem)
+  // 6  ) ArgSize       : Size (in bytes) of vararg type
+  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+  // 8  ) Align         : Alignment of type
+  // 9  ) EFLAGS (implicit-def)
+
+  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+
+  unsigned DestReg = MI->getOperand(0).getReg();
+  MachineOperand &Base = MI->getOperand(1);
+  MachineOperand &Scale = MI->getOperand(2);
+  MachineOperand &Index = MI->getOperand(3);
+  MachineOperand &Disp = MI->getOperand(4);
+  MachineOperand &Segment = MI->getOperand(5);
+  unsigned ArgSize = MI->getOperand(6).getImm();
+  unsigned ArgMode = MI->getOperand(7).getImm();
+  unsigned Align = MI->getOperand(8).getImm();
+
+  // Memory Reference
+  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  // Machine Information
+  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // struct va_list {
+  //   i32   gp_offset
+  //   i32   fp_offset
+  //   i64   overflow_area (address)
+  //   i64   reg_save_area (address)
+  // }
+  // sizeof(va_list) = 24
+  // alignment(va_list) = 8
+
+  unsigned TotalNumIntRegs = 6;
+  unsigned TotalNumXMMRegs = 8;
+  bool UseGPOffset = (ArgMode == 1);
+  bool UseFPOffset = (ArgMode == 2);
+  unsigned MaxOffset = TotalNumIntRegs * 8 +
+                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+  /* Align ArgSize to a multiple of 8 */
+  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+  bool NeedsAlign = (Align > 8);
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *overflowMBB;
+  MachineBasicBlock *offsetMBB;
+  MachineBasicBlock *endMBB;
+
+  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
+  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
+  unsigned OffsetReg = 0;
+
+  if (!UseGPOffset && !UseFPOffset) {
+    // If we only pull from the overflow region, we don't create a branch.
+    // We don't need to alter control flow.
+    OffsetDestReg = 0; // unused
+    OverflowDestReg = DestReg;
+
+    offsetMBB = nullptr;
+    overflowMBB = thisMBB;
+    endMBB = thisMBB;
+  } else {
+    // First emit code to check if gp_offset (or fp_offset) is below the bound.
+    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+    // If not, pull from overflow_area. (branch to overflowMBB)
+    //
+    //       thisMBB
+    //         |     .
+    //         |        .
+    //     offsetMBB   overflowMBB
+    //         |        .
+    //         |     .
+    //        endMBB
+
+    // Registers for the PHI in endMBB
+    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+    MachineFunction *MF = MBB->getParent();
+    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+    MachineFunction::iterator MBBIter = MBB;
+    ++MBBIter;
+
+    // Insert the new basic blocks
+    MF->insert(MBBIter, offsetMBB);
+    MF->insert(MBBIter, overflowMBB);
+    MF->insert(MBBIter, endMBB);
+
+    // Transfer the remainder of MBB and its successor edges to endMBB.
+    endMBB->splice(endMBB->begin(), thisMBB,
+                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+    // Make offsetMBB and overflowMBB successors of thisMBB
+    thisMBB->addSuccessor(offsetMBB);
+    thisMBB->addSuccessor(overflowMBB);
+
+    // endMBB is a successor of both offsetMBB and overflowMBB
+    offsetMBB->addSuccessor(endMBB);
+    overflowMBB->addSuccessor(endMBB);
+
+    // Load the offset value into a register
+    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Check if there is enough room left to pull this argument.
+    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+      .addReg(OffsetReg)
+      .addImm(MaxOffset + 8 - ArgSizeA8);
+
+    // Branch to "overflowMBB" if offset >= max
+    // Fall through to "offsetMBB" otherwise
+    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+      .addMBB(overflowMBB);
+  }
+
+  // In offsetMBB, emit code to use the reg_save_area.
+  if (offsetMBB) {
+    assert(OffsetReg != 0);
+
+    // Read the reg_save_area address.
+    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, 16)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Zero-extend the offset
+    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+        .addImm(0)
+        .addReg(OffsetReg)
+        .addImm(X86::sub_32bit);
+
+    // Add the offset to the reg_save_area to get the final address.
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+      .addReg(OffsetReg64)
+      .addReg(RegSaveReg);
+
+    // Compute the offset for the next argument
+    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+      .addReg(OffsetReg)
+      .addImm(UseFPOffset ? 16 : 8);
+
+    // Store it back into the va_list.
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .addReg(NextOffsetReg)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Jump to endMBB
+    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+      .addMBB(endMBB);
+  }
+
+  //
+  // Emit code to use overflow area
+  //
+
+  // Load the overflow_area address into a register.
+  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we need to align it, do so. Otherwise, just copy the address
+  // to OverflowDestReg.
+  if (NeedsAlign) {
+    // Align the overflow address
+    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+    // aligned_addr = (addr + (align-1)) & ~(align-1)
+    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+      .addReg(OverflowAddrReg)
+      .addImm(Align-1);
+
+    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+      .addReg(TmpReg)
+      .addImm(~(uint64_t)(Align-1));
+  } else {
+    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+      .addReg(OverflowAddrReg);
+  }
+
+  // Compute the next overflow address after this argument.
+  // (the overflow address should be kept 8-byte aligned)
+  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+    .addReg(OverflowDestReg)
+    .addImm(ArgSizeA8);
+
+  // Store the new overflow address.
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .addReg(NextAddrReg)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we branched, emit the PHI to the front of endMBB.
+  if (offsetMBB) {
+    BuildMI(*endMBB, endMBB->begin(), DL,
+            TII->get(X86::PHI), DestReg)
+      .addReg(OffsetDestReg).addMBB(offsetMBB)
+      .addReg(OverflowDestReg).addMBB(overflowMBB);
+  }
+
+  // Erase the pseudo instruction
+  MI->eraseFromParent();
+
+  return endMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                 MachineInstr *MI,
+                                                 MachineBasicBlock *MBB) const {
+  // Emit code to save XMM registers to the stack. The ABI says that the
+  // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them,
+  // however this code takes a simpler approach and just executes all
+  // of the stores if %al is non-zero. It's less code, and it's probably
+  // easier on the hardware branch predictor, and stores aren't all that
+  // expensive anyway.
+
+  // Create the new basic blocks. One block contains all the XMM stores,
+  // and one block is the final destination regardless of whether any
+  // stores were performed.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *F = MBB->getParent();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, XMMSaveMBB);
+  F->insert(MBBIter, EndMBB);
+
+  // Transfer the remainder of MBB and its successor edges to EndMBB.
+  EndMBB->splice(EndMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // The original block will now fall through to the XMM save block.
+  MBB->addSuccessor(XMMSaveMBB);
+  // The XMMSaveMBB will fall through to the end block.
+  XMMSaveMBB->addSuccessor(EndMBB);
+
+  // Now add the instructions.
+  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned CountReg = MI->getOperand(0).getReg();
+  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+
+  if (!Subtarget->isTargetWin64()) {
+    // If %al is 0, branch around the XMM save block.
+    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+    MBB->addSuccessor(EndMBB);
+  }
+
+  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+  // that was just emitted, but clearly shouldn't be "saved".
+  assert((MI->getNumOperands() <= 3 ||
+          !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
+          MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
+         && "Expected last argument to be EFLAGS");
+  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+  // In the XMM save block, save all the XMM argument registers.
+  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
+    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+    MachineMemOperand *MMO =
+      F->getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+        MachineMemOperand::MOStore,
+        /*Size=*/16, /*Align=*/16);
+    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+      .addFrameIndex(RegSaveFrameIndex)
+      .addImm(/*Scale=*/1)
+      .addReg(/*IndexReg=*/0)
+      .addImm(/*Disp=*/Offset)
+      .addReg(/*Segment=*/0)
+      .addReg(MI->getOperand(i).getReg())
+      .addMemOperand(MMO);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+
+  return EndMBB;
+}
+
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(std::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return false;
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false;
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectMI should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // If the EFLAGS register isn't dead in the terminator, then claim that it's
+  // live into the sink and copy blocks.
+  const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
+  if (!MI->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+    copy0MBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
+  }
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  // Create the conditional branch instruction.
+  unsigned Opc =
+    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  copy0MBB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+                                        bool Is64Bit) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+  assert(MF->shouldSplitStack());
+
+  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+  // BB:
+  //  ... [Till the alloca]
+  // If stacklet is not large enough, jump to mallocMBB
+  //
+  // bumpMBB:
+  //  Allocate by subtracting from RSP
+  //  Jump to continueMBB
+  //
+  // mallocMBB:
+  //  Allocate by call to runtime
+  //
+  // continueMBB:
+  //  ...
+  //  [rest of original BB]
+  //
+
+  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *AddrRegClass =
+    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+    sizeVReg = MI->getOperand(1).getReg(),
+    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+  MachineFunction::iterator MBBIter = BB;
+  ++MBBIter;
+
+  MF->insert(MBBIter, bumpMBB);
+  MF->insert(MBBIter, mallocMBB);
+  MF->insert(MBBIter, continueMBB);
+
+  continueMBB->splice(continueMBB->begin(), BB,
+                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add code to the main basic block to check if the stack limit has been hit,
+  // and if so, jump to mallocMBB otherwise to bumpMBB.
+  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+    .addReg(tmpSPVReg).addReg(sizeVReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(SPLimitVReg);
+  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+  // bumpMBB simply decreases the stack pointer, since we know the current
+  // stacklet has enough space.
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Calls into a routine in libgcc to allocate more space from the heap.
+  const uint32_t *RegMask =
+    MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  if (Is64Bit) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::RDI, RegState::Implicit)
+      .addReg(X86::RAX, RegState::ImplicitDefine);
+  } else {
+    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+      .addImm(12);
+    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
+  }
+
+  if (!Is64Bit)
+    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+      .addImm(16);
+
+  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+    .addReg(Is64Bit ? X86::RAX : X86::EAX);
+  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Set up the CFG correctly.
+  BB->addSuccessor(bumpMBB);
+  BB->addSuccessor(mallocMBB);
+  mallocMBB->addSuccessor(continueMBB);
+  bumpMBB->addSuccessor(continueMBB);
+
+  // Take care of the PHI nodes.
+  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(mallocPtrVReg).addMBB(mallocMBB)
+    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+  // Delete the original pseudo instruction.
+  MI->eraseFromParent();
+
+  // And we're done.
+  return continueMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
+                                        MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(!Subtarget->isTargetMacho());
+
+  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
+  // non-trivial part is impdef of ESP.
+
+  if (Subtarget->isTargetWin64()) {
+    if (Subtarget->isTargetCygMing()) {
+      // ___chkstk(Mingw64):
+      // Clobbers R10, R11, RAX and EFLAGS.
+      // Updates RSP.
+      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+        .addExternalSymbol("___chkstk")
+        .addReg(X86::RAX, RegState::Implicit)
+        .addReg(X86::RSP, RegState::Implicit)
+        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
+        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+    } else {
+      // __chkstk(MSVCRT): does not update stack pointer.
+      // Clobbers R10, R11 and EFLAGS.
+      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+        .addExternalSymbol("__chkstk")
+        .addReg(X86::RAX, RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+      // RAX has the offset to be subtracted from RSP.
+      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
+        .addReg(X86::RSP)
+        .addReg(X86::RAX);
+    }
+  } else {
+    const char *StackProbeSymbol =
+      Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
+
+    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol(StackProbeSymbol)
+      .addReg(X86::EAX, RegState::Implicit)
+      .addReg(X86::ESP, RegState::Implicit)
+      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+                                      MachineBasicBlock *BB) const {
+  // This is pretty easy.  We're taking the value that we received from
+  // our load from the relocation, sticking it in either RDI (x86-64)
+  // or EAX and doing an indirect call.  The return value will then
+  // be in the normal return register.
+  MachineFunction *F = BB->getParent();
+  const X86InstrInfo *TII
+    = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
+  assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+  // Get a register mask for the lowered call.
+  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+  // proper register mask.
+  const uint32_t *RegMask =
+    F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  if (Subtarget->is64Bit()) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV64rm), X86::RDI)
+    .addReg(X86::RIP)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    addDirectMem(MIB, X86::RDI);
+    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(0)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(TII->getGlobalBaseReg(F))
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  }
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg;
+  unsigned MemOpndSlot = 0;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MemOpndSlot = CurOp;
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  buf[LabelOffset] = restoreMBB
+  //  SjLjSetup restoreMBB
+  //
+  // mainMBB:
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+  // restoreMBB:
+  //  v_restore = 1
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+  MF->push_back(restoreMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  unsigned PtrStoreOpc = 0;
+  unsigned LabelReg = 0;
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  Reloc::Model RM = MF->getTarget().getRelocationModel();
+  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+  // Prepare IP either in reg or imm.
+  if (!UseImmLabel) {
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+    LabelReg = MRI.createVirtualRegister(PtrRC);
+    if (Subtarget->is64Bit()) {
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB)
+              .addReg(0);
+    } else {
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+              .addReg(XII->getGlobalBaseReg(MF))
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+              .addReg(0);
+    }
+  } else
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+  // Store IP
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  }
+  if (!UseImmLabel)
+    MIB.addReg(LabelReg);
+  else
+    MIB.addMBB(restoreMBB);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+          .addMBB(restoreMBB);
+
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
+  MIB.addRegMask(RegInfo->getNoPreservedMask());
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(restoreMBB);
+
+  // mainMBB:
+  //  EAX = 0
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(restoreMBB);
+
+  // restoreMBB:
+  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+  restoreMBB->addSuccessor(sinkMBB);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
+  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+  unsigned SP = RegInfo->getStackRegister();
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+  // Reload FP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload IP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload SP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), SPOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Jump
+  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+// Replace 213-type (isel default) FMA3 instructions with 231-type for
+// accumulator loops. Writing back to the accumulator allows the coalescer
+// to remove extra copies in the loop.   
+MachineBasicBlock *
+X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+                                 MachineBasicBlock *MBB) const {
+  MachineOperand &AddendOp = MI->getOperand(3);
+
+  // Bail out early if the addend isn't a register - we can't switch these.
+  if (!AddendOp.isReg())
+    return MBB;
+
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Check whether the addend is defined by a PHI:
+  assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
+  MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
+  if (!AddendDef.isPHI())
+    return MBB;
+
+  // Look for the following pattern:
+  // loop:
+  //   %addend = phi [%entry, 0], [%loop, %result]
+  //   ...
+  //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
+
+  // Replace with:
+  //   loop:
+  //   %addend = phi [%entry, 0], [%loop, %result]
+  //   ...
+  //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
+
+  for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
+    assert(AddendDef.getOperand(i).isReg());
+    MachineOperand PHISrcOp = AddendDef.getOperand(i);
+    MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
+    if (&PHISrcInst == MI) {
+      // Found a matching instruction.
+      unsigned NewFMAOpc = 0;
+      switch (MI->getOpcode()) {
+        case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
+        case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
+        case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
+        case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
+        case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
+        case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
+        case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
+        case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
+        case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
+        case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
+        case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
+        case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
+        case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
+        case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
+        case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
+        case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+        case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
+        case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
+        case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
+        case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
+        case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
+        case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
+        case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
+        case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+        default: llvm_unreachable("Unrecognized FMA variant.");
+      }
+
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      MachineInstrBuilder MIB =
+        BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
+        .addOperand(MI->getOperand(0))
+        .addOperand(MI->getOperand(3))
+        .addOperand(MI->getOperand(2))
+        .addOperand(MI->getOperand(1));
+      MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+      MI->eraseFromParent();
+    }
+  }
+
+  return MBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unexpected instr type to insert");
+  case X86::TAILJMPd64:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+    llvm_unreachable("TAILJMP64 would not be touched here.");
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+    return BB;
+  case X86::WIN_ALLOCA:
+    return EmitLoweredWinAlloca(MI, BB);
+  case X86::SEG_ALLOCA_32:
+    return EmitLoweredSegAlloca(MI, BB, false);
+  case X86::SEG_ALLOCA_64:
+    return EmitLoweredSegAlloca(MI, BB, true);
+  case X86::TLSCall_32:
+  case X86::TLSCall_64:
+    return EmitLoweredTLSCall(MI, BB);
+  case X86::CMOV_GR8:
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+    return EmitLoweredSelect(MI, BB);
+
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM:
+  case X86::FP80_TO_INT16_IN_MEM:
+  case X86::FP80_TO_INT32_IN_MEM:
+  case X86::FP80_TO_INT64_IN_MEM: {
+    MachineFunction *F = BB->getParent();
+    const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
+    DebugLoc DL = MI->getDebugLoc();
+
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the high byte of the control word...
+    unsigned OldCW =
+      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+                      CWFrameIdx);
+
+    // Set the high part to be round to zero...
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
+
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+    }
+
+    X86AddressMode AM;
+    MachineOperand &Op = MI->getOperand(0);
+    if (Op.isReg()) {
+      AM.BaseType = X86AddressMode::RegBase;
+      AM.Base.Reg = Op.getReg();
+    } else {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = Op.getIndex();
+    }
+    Op = MI->getOperand(1);
+    if (Op.isImm())
+      AM.Scale = Op.getImm();
+    Op = MI->getOperand(2);
+    if (Op.isImm())
+      AM.IndexReg = Op.getImm();
+    Op = MI->getOperand(3);
+    if (Op.isGlobal()) {
+      AM.GV = Op.getGlobal();
+    } else {
+      AM.Disp = Op.getImm();
+    }
+    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+    // String/text processing lowering.
+  case X86::PCMPISTRM128REG:
+  case X86::VPCMPISTRM128REG:
+  case X86::PCMPISTRM128MEM:
+  case X86::VPCMPISTRM128MEM:
+  case X86::PCMPESTRM128REG:
+  case X86::VPCMPESTRM128REG:
+  case X86::PCMPESTRM128MEM:
+  case X86::VPCMPESTRM128MEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+
+  // String/text processing lowering.
+  case X86::PCMPISTRIREG:
+  case X86::VPCMPISTRIREG:
+  case X86::PCMPISTRIMEM:
+  case X86::VPCMPISTRIMEM:
+  case X86::PCMPESTRIREG:
+  case X86::VPCMPESTRIREG:
+  case X86::PCMPESTRIMEM:
+  case X86::VPCMPESTRIMEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+
+  // Thread synchronization.
+  case X86::MONITOR:
+    return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
+
+  // xbegin
+  case X86::XBEGIN:
+    return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+
+  case X86::VASTART_SAVE_XMM_REGS:
+    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+  case X86::VAARG_64:
+    return EmitVAARG64WithCustomInserter(MI, BB);
+
+  case X86::EH_SjLj_SetJmp32:
+  case X86::EH_SjLj_SetJmp64:
+    return emitEHSjLjSetJmp(MI, BB);
+
+  case X86::EH_SjLj_LongJmp32:
+  case X86::EH_SjLj_LongJmp64:
+    return emitEHSjLjLongJmp(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
+
+  case X86::VFMADDPDr213r:
+  case X86::VFMADDPSr213r:
+  case X86::VFMADDSDr213r:
+  case X86::VFMADDSSr213r:
+  case X86::VFMSUBPDr213r:
+  case X86::VFMSUBPSr213r:
+  case X86::VFMSUBSDr213r:
+  case X86::VFMSUBSSr213r:
+  case X86::VFNMADDPDr213r:
+  case X86::VFNMADDPSr213r:
+  case X86::VFNMADDSDr213r:
+  case X86::VFNMADDSSr213r:
+  case X86::VFNMSUBPDr213r:
+  case X86::VFNMSUBPSr213r:
+  case X86::VFNMSUBSDr213r:
+  case X86::VFNMSUBSSr213r:
+  case X86::VFMADDPDr213rY:
+  case X86::VFMADDPSr213rY:
+  case X86::VFMSUBPDr213rY:
+  case X86::VFMSUBPSr213rY:
+  case X86::VFNMADDPDr213rY:
+  case X86::VFNMADDPSr213rY:
+  case X86::VFNMSUBPDr213rY:
+  case X86::VFNMSUBPSr213rY:
+    return emitFMA3Instr(MI, BB);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  unsigned BitWidth = KnownZero.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::ADC:
+  case X86ISD::SBB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    // These nodes' second result is a boolean.
+    if (Op.getResNo() == 0)
+      break;
+    // Fallthrough
+  case X86ISD::SETCC:
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    unsigned NumLoBits = 0;
+    switch (IntId) {
+    default: break;
+    case Intrinsic::x86_sse_movmsk_ps:
+    case Intrinsic::x86_avx_movmsk_ps_256:
+    case Intrinsic::x86_sse2_movmsk_pd:
+    case Intrinsic::x86_avx_movmsk_pd_256:
+    case Intrinsic::x86_mmx_pmovmskb:
+    case Intrinsic::x86_sse2_pmovmskb_128:
+    case Intrinsic::x86_avx2_pmovmskb: {
+      // High bits of movmskp{s|d}, pmovmskb are known zero.
+      switch (IntId) {
+        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
+        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
+        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
+        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
+        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
+      }
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+      break;
+    }
+    }
+    break;
+  }
+  }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &,
+  unsigned Depth) const {
+  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
+    return Op.getValueType().getScalarType().getSizeInBits();
+
+  // Fallback case.
+  return 1;
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       const GlobalValue* &GA,
+                                       int64_t &Offset) const {
+  if (N->getOpcode() == X86ISD::Wrapper) {
+    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+      return true;
+    }
+  }
+  return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget* Subtarget) {
+  SDLoc dl(N);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+      V2.getOpcode() == ISD::CONCAT_VECTORS) {
+    //
+    //                   0,0,0,...
+    //                      |
+    //    V      UNDEF    BUILD_VECTOR    UNDEF
+    //     \      /           \           /
+    //  CONCAT_VECTOR         CONCAT_VECTOR
+    //         \                  /
+    //          \                /
+    //          RESULT: V + zero extended
+    //
+    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+        V1.getOperand(1).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+      return SDValue();
+
+    // To match the shuffle mask, the first half of the mask should
+    // be exactly the first vector, and all the rest a splat with the
+    // first element of the second one.
+    for (unsigned i = 0; i != NumElems/2; ++i)
+      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+        return SDValue();
+
+    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+      if (Ld->hasNUsesOfValue(1, 0)) {
+        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+        SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+                                  Ld->getMemoryVT(),
+                                  Ld->getPointerInfo(),
+                                  Ld->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+
+        // Make sure the newly-created LOAD is in the same position as Ld in
+        // terms of dependency. We create a TokenFactor for Ld and ResNode,
+        // and update uses of Ld's output chain to use the TokenFactor.
+        if (Ld->hasAnyUseOfValue(1)) {
+          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
+          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+                                 SDValue(ResNode.getNode(), 1));
+        }
+
+        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      }
+    }
+
+    // Emit a zeroed vector and insert the desired subvector on its
+    // first half.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Combine some shuffles into subvector extracts and inserts:
+  //
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (isShuffleHigh128VectorInsertLow(SVOp)) {
+    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (isShuffleLow128VectorInsertHigh(SVOp)) {
+    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  return SDValue();
+}
+
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  SmallVector<int, 4> Mask;
+  bool IsUnary;
+  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  (void)HaveMask;
+  assert(HaveMask);
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+    return Mask;
+  case X86ISD::PSHUFLW:
+    Mask.resize(4);
+    return Mask;
+  case X86ISD::PSHUFHW:
+    Mask.erase(Mask.begin(), Mask.begin() + 4);
+    for (int &M : Mask)
+      M -= 4;
+    return Mask;
+  default:
+    llvm_unreachable("No valid shuffle instruction found!");
+  }
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+        return false;
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return false;
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return false; // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          // Fallthrough!
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
+      break;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // It is possible that one of the combinable shuffles was completely absorbed
+  // by the other, just replace it and revisit all users in that case.
+  if (Old.getNode() == V.getNode()) {
+    DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
+    return true;
+  }
+
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                        SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  assert(
+      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+      "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+  unsigned CombineOpcode = N.getOpcode();
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+      if (V.getOpcode() == CombineOpcode)
+        break;
+
+      // Other-half shuffles are no-ops.
+      continue;
+
+    case X86ISD::PSHUFD: {
+      // We can only handle pshufd if the half we are combining either stays in
+      // its half, or switches to the other half. Bail if one of these isn't
+      // true.
+      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+      int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
+      if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
+            (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
+        return false;
+
+      // Map the mask through the pshufd and keep walking up the chain.
+      for (int i = 0; i < 4; ++i)
+        Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
+
+      // Switch halves if the pshufd does.
+      CombineOpcode =
+          VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      continue;
+    }
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask (adjusted to account for all
+  // the pshufd instructions encountered).
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
+}
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  MVT VT = N.getSimpleValueType();
+  SmallVector<int, 4> Mask;
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    Mask = getPSHUFShuffleMask(N);
+    assert(Mask.size() == 4);
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Nuke no-op shuffles that show up after combining.
+  if (isNoopShuffleMask(Mask))
+    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+  // Look for simplifications involving one or two shuffle instructions.
+  SDValue V = N.getOperand(0);
+  switch (N.getOpcode()) {
+  default:
+    break;
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    assert(VT == MVT::v8i16);
+    (void)VT;
+
+    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle, so we're done.
+
+    // See if this reduces to a PSHUFD which is no more expensive and can
+    // combine with more operations.
+    if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
+        areAdjacentMasksSequential(Mask)) {
+      int DMask[] = {-1, -1, -1, -1};
+      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+      DMask[DOffset + 0] = DOffset + Mask[0] / 2;
+      DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+      V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+      DCI.AddToWorklist(V.getNode());
+      V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+                      getV4X86ShuffleImm8ForMask(DMask, DAG));
+      DCI.AddToWorklist(V.getNode());
+      return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+    }
+
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = V.getOperand(0);
+      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+        D = D.getOperand(0);
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+        const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+        if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackLoMask)) ||
+            std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackHiMask))) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, MVT::v8i16, V, V);
+        }
+      }
+    }
+
+    break;
+
+  case X86ISD::PSHUFD:
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
+
+    break;
+  }
+
+  return SDValue();
+}
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // Canonicalize shuffles that perform 'addsub' on packed float vectors
+  // according to the rule:
+  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
+  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
+  //
+  // Where 'Mask' is:
+  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
+  //  <0,3>                 -- for v2f64 shuffles;
+  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
+  //
+  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
+  // during ISel stage.
+  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
+      // Operands to the FADD and FSUB must be the same.
+      ((N0->getOperand(0) == N1->getOperand(0) &&
+        N0->getOperand(1) == N1->getOperand(1)) ||
+       // FADD is commutable. See if by commuting the operands of the FADD
+       // we would still be able to match the operands of the FSUB dag node.
+       (N0->getOperand(1) == N1->getOperand(0) &&
+        N0->getOperand(0) == N1->getOperand(1))) &&
+      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
+      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
+    
+    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+    unsigned NumElts = VT.getVectorNumElements();
+    ArrayRef<int> Mask = SV->getMask();
+    bool CanFold = true;
+
+    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
+      CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
+
+    if (CanFold) {
+      SDValue Op0 = N1->getOperand(0);
+      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
+      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
+      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
+      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
+    }
+  }
+
+  // Don't create instructions with illegal types after legalize types has run.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+    return SDValue();
+
+  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+  if (Subtarget->hasFp256() && VT.is256BitVector() &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE)
+    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+
+  // During Type Legalization, when promoting illegal vector types,
+  // the backend might introduce new shuffle dag nodes and bitcasts.
+  //
+  // This code performs the following transformation:
+  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+  //
+  // We do this only if both the bitcast and the BINOP dag nodes have
+  // one use. Also, perform this transformation only if the new binary
+  // operation is legal. This is to avoid introducing dag nodes that
+  // potentially need to be further expanded (or custom lowered) into a
+  // less optimal sequence of dag nodes.
+  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+      N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+      N0.getOpcode() == ISD::BITCAST) {
+    SDValue BC0 = N0.getOperand(0);
+    EVT SVT = BC0.getValueType();
+    unsigned Opcode = BC0.getOpcode();
+    unsigned NumElts = VT.getVectorNumElements();
+    
+    if (BC0.hasOneUse() && SVT.isVector() &&
+        SVT.getVectorNumElements() * 2 == NumElts &&
+        TLI.isOperationLegal(Opcode, VT)) {
+      bool CanFold = false;
+      switch (Opcode) {
+      default : break;
+      case ISD::ADD :
+      case ISD::FADD :
+      case ISD::SUB :
+      case ISD::FSUB :
+      case ISD::MUL :
+      case ISD::FMUL :
+        CanFold = true;
+      }
+
+      unsigned SVTNumElts = SVT.getVectorNumElements();
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) < 0;
+
+      if (CanFold) {
+        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+      }
+    }
+  }
+
+  // Only handle 128 wide vector from here on.
+  if (!VT.is128BitVector())
+    return SDValue();
+
+  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+  // consecutive, non-overlapping, and in the right order.
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+  SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+  if (LD.getNode())
+    return LD;
+
+  if (isTargetShuffle(N->getOpcode())) {
+    SDValue Shuffle =
+        PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
+  return SDValue();
+}
+
+/// PerformTruncateCombine - Converts truncate operation to
+/// a sequence of vector shuffle operations.
+/// It is possible when we truncate 256-bit vector to 128-bit vector
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const X86Subtarget *Subtarget)  {
+  return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been customed lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+
+  EVT VT = InVec.getValueType();
+
+  bool HasShuffleIntoBitcast = false;
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+    HasShuffleIntoBitcast = true;
+  }
+
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
+
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
+
+  SmallVector<int, 16> ShuffleMask;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+                            UnaryShuffle))
+    return SDValue();
+
+  // Select the input vector, guarding against out of range extract vector.
+  unsigned NumElems = VT.getVectorNumElements();
+  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
+                                         : InVec.getOperand(1);
+
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
+
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
+
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
+
+  if (HasShuffleIntoBitcast) {
+    // If there's a bitcast before the shuffle, check if the load type and
+    // alignment is valid.
+    unsigned Align = LN0->getAlignment();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NewAlign = TLI.getDataLayout()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+      return SDValue();
+  }
+
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  SDLoc dl(N);
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+                                 InVec.getOperand(0), Shuffle,
+                                 &ShuffleMask[0]);
+  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+  if (NewOp.getNode())
+    return NewOp;
+
+  SDValue InputVector = N->getOperand(0);
+
+  // Detect whether we are trying to convert from mmx to i32 and the bitcast
+  // from mmx to v2i32 has a single usage.
+  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                       N->getValueType(0),
+                       InputVector.getNode()->getOperand(0));
+
+  // Only operate on vectors of 4 elements, where the alternative shuffling
+  // gets to be more expensive.
+  if (InputVector.getValueType() != MVT::v4i32)
+    return SDValue();
+
+  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+  // single use which is a sign-extend or zero-extend, and all elements are
+  // used.
+  SmallVector<SDNode *, 4> Uses;
+  unsigned ExtractedElements = 0;
+  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+    if (UI.getUse().getResNo() != InputVector.getResNo())
+      return SDValue();
+
+    SDNode *Extract = *UI;
+    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    if (Extract->getValueType(0) != MVT::i32)
+      return SDValue();
+    if (!Extract->hasOneUse())
+      return SDValue();
+    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+      return SDValue();
+
+    // Record which element was extracted.
+    ExtractedElements |=
+      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+    Uses.push_back(Extract);
+  }
+
+  // If not all the elements were used, this may not be worthwhile.
+  if (ExtractedElements != 15)
+    return SDValue();
+
+  // Ok, we've now decided to do the transformation.
+  SDLoc dl(InputVector);
+
+  // Store the value to a temporary stack slot.
+  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+                            MachinePointerInfo(), false, false, 0);
+
+  // Replace each use (extract) with a load of the appropriate element.
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+       UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
+
+    // cOMpute the element's address.
+    SDValue Idx = Extract->getOperand(1);
+    unsigned EltSize =
+        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+                                     StackPtr, OffsetVal);
+
+    // Load the scalar.
+    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+                                     ScalarAddr, MachinePointerInfo(),
+                                     false, false, false, 0);
+
+    // Replace the exact with the load.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+  }
+
+  // The replacement was made in place; don't return anything.
+  return SDValue();
+}
+
+/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
+static std::pair<unsigned, bool>
+matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
+                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
+  if (!VT.isVector())
+    return std::make_pair(0, false);
+
+  bool NeedSplit = false;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return std::make_pair(0, false);
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+    if (!Subtarget->hasAVX2())
+      NeedSplit = true;
+    if (!Subtarget->hasAVX())
+      return std::make_pair(0, false);
+    break;
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+    if (!Subtarget->hasSSE2())
+      return std::make_pair(0, false);
+  }
+
+  // SSE2 has only a small subset of the operations.
+  bool hasUnsigned = Subtarget->hasSSE41() ||
+                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
+  bool hasSigned = Subtarget->hasSSE41() ||
+                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  unsigned Opc = 0;
+  // Check for x CC y ? x : y.
+  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+    case ISD::SETULE:
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
+    }
+  // Check for x CC y ? y : x -- a min/max with reversed arms.
+  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+    case ISD::SETULE:
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
+    }
+  }
+
+  return std::make_pair(Opc, NeedSplit);
+}
+
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Cond = N->getOperand(0);
+  // Get the LHS/RHS of the select.
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+  // instructions match the semantics of the common C idiom x<y?x:y but not
+  // x<=y?x:y, because of how they handle negative zero (which can be
+  // ignored in unsafe-math mode).
+  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
+      (Subtarget->hasSSE2() ||
+       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    unsigned Opcode = 0;
+    // Check for x CC y ? x : y.
+    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETULT:
+        // Converting this to a min would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETULE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETOGE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a max would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    // Check for x CC y ? y : x -- a min/max with reversed arms.
+    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETOGE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a min would handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETULT:
+        // Converting this to a max would handle NaNs incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETULE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    }
+
+    if (Opcode)
+      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+  }
+
+  EVT CondVT = Cond.getValueType();
+  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+    // lowering on AVX-512. In this case we convert it to
+    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+    // The same situation for all 128 and 256-bit vectors of i8 and i16
+    EVT OpVT = LHS.getValueType();
+    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+        (OpVT.getVectorElementType() == MVT::i8 ||
+         OpVT.getVectorElementType() == MVT::i16)) {
+      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+      DCI.AddToWorklist(Cond.getNode());
+      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+    }
+  }
+  // If this is a select between two integer constants, try to do some
+  // optimizations.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+      // Don't do this for crazy integer types.
+      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+        // so that TrueC (the true value) is larger than FalseC.
+        bool NeedsCondInvert = false;
+
+        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+            // Efficiently invertible.
+            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
+             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
+              isa<ConstantSDNode>(Cond.getOperand(1))))) {
+          NeedsCondInvert = true;
+          std::swap(TrueC, FalseC);
+        }
+
+        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
+        if (FalseC->getAPIntValue() == 0 &&
+            TrueC->getAPIntValue().isPowerOf2()) {
+          if (NeedsCondInvert) // Invert the condition if needed.
+            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(1, Cond.getValueType()));
+
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+                             DAG.getConstant(ShAmt, MVT::i8));
+        }
+
+        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
+        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+          if (NeedsCondInvert) // Invert the condition if needed.
+            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(1, Cond.getValueType()));
+
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                             FalseC->getValueType(0), Cond);
+          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                             SDValue(FalseC, 0));
+        }
+
+        // Optimize cases that will turn into an LEA instruction.  This requires
+        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+          bool isFastMultiplier = false;
+          if (Diff < 10) {
+            switch ((unsigned char)Diff) {
+              default: break;
+              case 1:  // result = add base, cond
+              case 2:  // result = lea base(    , cond*2)
+              case 3:  // result = lea base(cond, cond*2)
+              case 4:  // result = lea base(    , cond*4)
+              case 5:  // result = lea base(cond, cond*4)
+              case 8:  // result = lea base(    , cond*8)
+              case 9:  // result = lea base(cond, cond*8)
+                isFastMultiplier = true;
+                break;
+            }
+          }
+
+          if (isFastMultiplier) {
+            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+            if (NeedsCondInvert) // Invert the condition if needed.
+              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                                 DAG.getConstant(1, Cond.getValueType()));
+
+            // Zero extend the condition if needed.
+            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                               Cond);
+            // Scale the condition by the difference.
+            if (Diff != 1)
+              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                                 DAG.getConstant(Diff, Cond.getValueType()));
+
+            // Add the base if non-zero.
+            if (FalseC->getAPIntValue() != 0)
+              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                                 SDValue(FalseC, 0));
+            return Cond;
+          }
+        }
+      }
+  }
+
+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare
+  // against zero. e.g.
+  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
+  // subl   %esi, %edi
+  // testl  %edi, %edi
+  // movl   $0, %eax
+  // cmovgl %edi, %eax
+  // =>
+  // xorl   %eax, %eax
+  // subl   %esi, $edi
+  // cmovsl %eax, %edi
+  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    switch (CC) {
+    default: break;
+    case ISD::SETLT:
+    case ISD::SETGT: {
+      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+    }
+    }
+  }
+
+  // Early exit check
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // Match VSELECTs into subs with unsigned saturation.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+      SDValue CondRHS = Cond->getOperand(1);
+
+      // Look for a general sub with unsigned saturation first.
+      // x >= y ? x-y : 0 --> subus x, y
+      // x >  y ? x-y : 0 --> subus x, y
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the const
+              // canonicalization.
+              // x > C-1 ? x+-C : 0 --> subus x, C
+              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+                  CondRHSConst->getAPIntValue() ==
+                      (-OpRHSConst->getAPIntValue() - 1))
+                return DAG.getNode(
+                    X86ISD::SUBUS, DL, VT, OpLHS,
+                    DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+
+          // Another special case: If C was a sign bit, the sub has been
+          // canonicalized into a xor.
+          // FIXME: Would it be better to use computeKnownBits to determine
+          //        whether it's safe to decanonicalize the xor?
+          // x s< 0 ? x^C : 0 --> subus x, C
+          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+              OpRHSConst->getAPIntValue().isSignBit())
+            // Note that we have to rebuild the RHS constant here to ensure we
+            // don't rely on particular values of undef lanes.
+            return DAG.getNode(
+                X86ISD::SUBUS, DL, VT, OpLHS,
+                DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+        }
+    }
+  }
+
+  // Try to match a min/max vector operation.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
+    std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
+    unsigned Opc = ret.first;
+    bool NeedSplit = ret.second;
+
+    if (Opc && NeedSplit) {
+      unsigned NumElems = VT.getVectorNumElements();
+      // Extract the LHS vectors
+      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
+      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
+
+      // Extract the RHS vectors
+      SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
+      SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
+
+      // Create min/max for each subvector
+      LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
+      RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
+
+      // Merge the result
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
+    } else if (Opc)
+      return DAG.getNode(Opc, DL, VT, LHS, RHS);
+  }
+
+  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // Check if SETCC has already been promoted
+      TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
+      // Check that condition value type matches vselect operand type
+      CondVT == VT) { 
+
+    assert(Cond.getValueType().isVector() &&
+           "vector select expects a vector selector!");
+
+    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!TValIsAllOnes && !FValIsAllZeros) {
+      // Try invert the condition if true value is not all 1s and false value
+      // is not all 0s.
+      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+      if (TValIsAllZeros || FValIsAllOnes) {
+        SDValue CC = Cond.getOperand(2);
+        ISD::CondCode NewCC =
+          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                               Cond.getOperand(0).getValueType().isInteger());
+        Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+        std::swap(LHS, RHS);
+        TValIsAllOnes = FValIsAllOnes;
+        FValIsAllZeros = TValIsAllZeros;
+      }
+    }
+
+    if (TValIsAllOnes || FValIsAllZeros) {
+      SDValue Ret;
+
+      if (TValIsAllOnes && FValIsAllZeros)
+        Ret = Cond;
+      else if (TValIsAllOnes)
+        Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
+      else if (FValIsAllZeros)
+        Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
+
+      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+    }
+  }
+
+  // Try to fold this VSELECT into a MOVSS/MOVSD
+  if (N->getOpcode() == ISD::VSELECT &&
+      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
+    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
+        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
+      bool CanFold = false;
+      unsigned NumElems = Cond.getNumOperands();
+      SDValue A = LHS;
+      SDValue B = RHS;
+      
+      if (isZero(Cond.getOperand(0))) {
+        CanFold = true;
+
+        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
+        // fold (vselect <0,-1> -> (movsd A, B)
+        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+          CanFold = isAllOnes(Cond.getOperand(i));
+      } else if (isAllOnes(Cond.getOperand(0))) {
+        CanFold = true;
+        std::swap(A, B);
+
+        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
+        // fold (vselect <-1,0> -> (movsd B, A)
+        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+          CanFold = isZero(Cond.getOperand(i));
+      }
+
+      if (CanFold) {
+        if (VT == MVT::v4i32 || VT == MVT::v4f32)
+          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
+        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
+      }
+
+      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
+        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
+        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
+        //                             (v2i64 (bitcast B)))))
+        //
+        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
+        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
+        //                             (v2f64 (bitcast B)))))
+        //
+        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
+        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
+        //                             (v2i64 (bitcast A)))))
+        //
+        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
+        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
+        //                             (v2f64 (bitcast A)))))
+
+        CanFold = (isZero(Cond.getOperand(0)) &&
+                   isZero(Cond.getOperand(1)) &&
+                   isAllOnes(Cond.getOperand(2)) &&
+                   isAllOnes(Cond.getOperand(3)));
+
+        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
+            isAllOnes(Cond.getOperand(1)) &&
+            isZero(Cond.getOperand(2)) &&
+            isZero(Cond.getOperand(3))) {
+          CanFold = true;
+          std::swap(LHS, RHS);
+        }
+
+        if (CanFold) {
+          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
+          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
+          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
+          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
+                                                NewB, DAG);
+          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
+        }
+      }
+    }
+  }
+
+  // If we know that this node is legal then we know that it is going to be
+  // matched by one of the SSE/AVX BLEND instructions. These instructions only
+  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+  // to simplify previous instructions.
+  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+      !DCI.isBeforeLegalize() &&
+      // We explicitly check against v8i16 and v16i16 because, although
+      // they're marked as Custom, they might only be legal when Cond is a
+      // build_vector of constants. This will be taken care in a later
+      // condition.
+      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+       VT != MVT::v8i16)) {
+    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+    // Don't optimize vector selects that map to mask-registers.
+    if (BitWidth == 1)
+      return SDValue();
+
+    // Check all uses of that condition operand to check whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits are set
+    // properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
+    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                          DCI.isBeforeLegalizeOps());
+    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512 bit vector in a machine without AVX512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
+  return SDValue();
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Quit if not CMP and SUB with its value result used.
+  if (Cmp.getOpcode() != X86ISD::CMP &&
+      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+      return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // Check CMP operands. One of them should be 0 or 1 and the other should be
+  // an SetCC or extended from it.
+  SDValue Op1 = Cmp.getOperand(0);
+  SDValue Op2 = Cmp.getOperand(1);
+
+  SDValue SetCC;
+  const ConstantSDNode* C = nullptr;
+  bool needOppositeCond = (CC == X86::COND_E);
+  bool checkAgainstTrue = false; // Is it a comparison against 1?
+
+  if ((C = dyn_cast<ConstantSDNode>(Op1)))
+    SetCC = Op2;
+  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+    SetCC = Op1;
+  else // Quit if all operands are not constants.
+    return SDValue();
+
+  if (C->getZExtValue() == 1) {
+    needOppositeCond = !needOppositeCond;
+    checkAgainstTrue = true;
+  } else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 or 1.
+    return SDValue();
+
+  bool truncatedToBoolWithAnd = false;
+  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
+  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+         SetCC.getOpcode() == ISD::TRUNCATE ||
+         SetCC.getOpcode() == ISD::AND) {
+    if (SetCC.getOpcode() == ISD::AND) {
+      int OpIdx = -1;
+      ConstantSDNode *CS;
+      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
+          CS->getZExtValue() == 1)
+        OpIdx = 1;
+      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
+          CS->getZExtValue() == 1)
+        OpIdx = 0;
+      if (OpIdx == -1)
+        break;
+      SetCC = SetCC.getOperand(OpIdx);
+      truncatedToBoolWithAnd = true;
+    } else
+      SetCC = SetCC.getOperand(0);
+  }
+
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC_CARRY:
+    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+    // truncated to i1 using 'and'.
+    if (checkAgainstTrue && !truncatedToBoolWithAnd)
+      break;
+    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+           "Invalid use of SETCC_CARRY!");
+    // FALL THROUGH
+  case X86ISD::SETCC:
+    // Set the condition code or opposite one if necessary.
+    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(1);
+  case X86ISD::CMOV: {
+    // Check whether false/true value has canonical one, i.e. 0 or 1.
+    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+    // Quit if true value is not a constant.
+    if (!TVal)
+      return SDValue();
+    // Quit if false value is not a constant.
+    if (!FVal) {
+      SDValue Op = SetCC.getOperand(0);
+      // Skip 'zext' or 'trunc' node.
+      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+          Op.getOpcode() == ISD::TRUNCATE)
+        Op = Op.getOperand(0);
+      // A special case for rdrand/rdseed, where 0 is set if false cond is
+      // found.
+      if ((Op.getOpcode() != X86ISD::RDRAND &&
+           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+        return SDValue();
+    }
+    // Quit if false value is not the constant 0 or 1.
+    bool FValIsFalse = true;
+    if (FVal && FVal->getZExtValue() != 0) {
+      if (FVal->getZExtValue() != 1)
+        return SDValue();
+      // If FVal is 1, opposite cond is needed.
+      needOppositeCond = !needOppositeCond;
+      FValIsFalse = false;
+    }
+    // Quit if TVal is not the constant opposite of FVal.
+    if (FValIsFalse && TVal->getZExtValue() != 1)
+      return SDValue();
+    if (!FValIsFalse && TVal->getZExtValue() != 0)
+      return SDValue();
+    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(3);
+  }
+  }
+
+  return SDValue();
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+
+  // If the flag operand isn't dead, don't touch this CMOV.
+  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+    return SDValue();
+
+  SDValue FalseOp = N->getOperand(0);
+  SDValue TrueOp = N->getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+  SDValue Cond = N->getOperand(3);
+
+  if (CC == X86::COND_E || CC == X86::COND_NE) {
+    switch (Cond.getOpcode()) {
+    default: break;
+    case X86ISD::BSR:
+    case X86ISD::BSF:
+      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+        return (CC == X86::COND_E) ? FalseOp : TrueOp;
+    }
+  }
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(Cond, CC);
+  if (Flags.getNode() &&
+      // Extra check as FCMOV only supports a subset of X86 cond.
+      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+  }
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations.  Note that the operands are ordered the opposite of SELECT
+  // operands.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
+      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+      // larger than FalseC (the false value).
+      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueC, FalseC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
+      // This is efficient for any integer data type (including i8/i16) and
+      // shift amount.
+      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(ShAmt, MVT::i8));
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
+      // for any integer data type, including i8/i16.
+      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                           SDValue(FalseC, 0));
+
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize cases that will turn into an LEA instruction.  This requires
+      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+        bool isFastMultiplier = false;
+        if (Diff < 10) {
+          switch ((unsigned char)Diff) {
+          default: break;
+          case 1:  // result = add base, cond
+          case 2:  // result = lea base(    , cond*2)
+          case 3:  // result = lea base(cond, cond*2)
+          case 4:  // result = lea base(    , cond*4)
+          case 5:  // result = lea base(cond, cond*4)
+          case 8:  // result = lea base(    , cond*8)
+          case 9:  // result = lea base(cond, cond*8)
+            isFastMultiplier = true;
+            break;
+          }
+        }
+
+        if (isFastMultiplier) {
+          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                             DAG.getConstant(CC, MVT::i8), Cond);
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                             Cond);
+          // Scale the condition by the difference.
+          if (Diff != 1)
+            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(Diff, Cond.getValueType()));
+
+          // Add the base if non-zero.
+          if (FalseC->getAPIntValue() != 0)
+            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                               SDValue(FalseC, 0));
+          if (N->getNumValues() == 2)  // Dead flag value?
+            return DCI.CombineTo(N, Cond, SDValue());
+          return Cond;
+        }
+      }
+    }
+  }
+
+  // Handle these cases:
+  //   (select (x != c), e, c) -> select (x != c), e, x),
+  //   (select (x == c), c, e) -> select (x == c), x, e)
+  // where the c is an integer constant, and the "select" is the combination
+  // of CMOV and CMP.
+  //
+  // The rationale for this change is that the conditional-move from a constant
+  // needs two instructions, however, conditional-move from a register needs
+  // only one instruction.
+  //
+  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+  //  some instruction-combining opportunities. This opt needs to be
+  //  postponed as late as possible.
+  //
+  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+    // the DCI.xxxx conditions are provided to postpone the optimization as
+    // late as possible.
+
+    ConstantSDNode *CmpAgainst = nullptr;
+    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+        !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+      if (CC == X86::COND_NE &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      if (CC == X86::COND_E &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+                          DAG.getConstant(CC, MVT::i8), Cond };
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
+                                                const X86Subtarget *Subtarget) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();
+  // SSE/AVX/AVX2 blend intrinsics.
+  case Intrinsic::x86_avx2_pblendvb:
+  case Intrinsic::x86_avx2_pblendw:
+  case Intrinsic::x86_avx2_pblendd_128:
+  case Intrinsic::x86_avx2_pblendd_256:
+    // Don't try to simplify this intrinsic if we don't have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+    // FALL-THROUGH
+  case Intrinsic::x86_avx_blend_pd_256:
+  case Intrinsic::x86_avx_blend_ps_256:
+  case Intrinsic::x86_avx_blendv_pd_256:
+  case Intrinsic::x86_avx_blendv_ps_256:
+    // Don't try to simplify this intrinsic if we don't have AVX.
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    // FALL-THROUGH
+  case Intrinsic::x86_sse41_pblendw:
+  case Intrinsic::x86_sse41_blendpd:
+  case Intrinsic::x86_sse41_blendps:
+  case Intrinsic::x86_sse41_blendvps:
+  case Intrinsic::x86_sse41_blendvpd:
+  case Intrinsic::x86_sse41_pblendvb: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    SDValue Mask = N->getOperand(3);
+
+    // Don't try to simplify this intrinsic if we don't have SSE4.1.
+    if (!Subtarget->hasSSE41())
+      return SDValue();
+
+    // fold (blend A, A, Mask) -> A
+    if (Op0 == Op1)
+      return Op0;
+    // fold (blend A, B, allZeros) -> A
+    if (ISD::isBuildVectorAllZeros(Mask.getNode()))
+      return Op0;
+    // fold (blend A, B, allOnes) -> B
+    if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+      return Op1;
+    
+    // Simplify the case where the mask is a constant i32 value.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
+      if (C->isNullValue())
+        return Op0;
+      if (C->isAllOnesValue())
+        return Op1;
+    }
+
+    return SDValue();
+  }
+
+  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    EVT VT = Op0.getValueType();
+    assert(VT.isVector() && "Expected a vector type!");
+
+    if (isa<BuildVectorSDNode>(Op1))
+      Op1 = Op1.getOperand(0);
+
+    if (!isa<ConstantSDNode>(Op1))
+      return SDValue();
+
+    EVT SVT = VT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+
+    ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+    const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
+    uint64_t ShAmt = C.getZExtValue();
+
+    // Don't try to convert this shift into a ISD::SRA if the shift
+    // count is bigger than or equal to the element size.
+    if (ShAmt >= SVTBits)
+      return SDValue();
+
+    // Trivial case: if the shift count is zero, then fold this
+    // into the first operand.
+    if (ShAmt == 0)
+      return Op0;
+
+    // Replace this packed shift intrinsic with a target independent
+    // shift dag node.
+    SDValue Splat = DAG.getConstant(C, VT);
+    return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+  }
+  }
+}
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+  uint64_t MulAmt = C->getZExtValue();
+  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+    return SDValue();
+
+  uint64_t MulAmt1 = 0;
+  uint64_t MulAmt2 = 0;
+  if ((MulAmt % 9) == 0) {
+    MulAmt1 = 9;
+    MulAmt2 = MulAmt / 9;
+  } else if ((MulAmt % 5) == 0) {
+    MulAmt1 = 5;
+    MulAmt2 = MulAmt / 5;
+  } else if ((MulAmt % 3) == 0) {
+    MulAmt1 = 3;
+    MulAmt2 = MulAmt / 3;
+  }
+  if (MulAmt2 &&
+      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+    SDLoc DL(N);
+
+    if (isPowerOf2_64(MulAmt2) &&
+        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If second multiplifer is pow2, issue it first. We want the multiply by
+      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+      // is an add.
+      std::swap(MulAmt1, MulAmt2);
+
+    SDValue NewMul;
+    if (isPowerOf2_64(MulAmt1))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                           DAG.getConstant(MulAmt1, VT));
+
+    if (isPowerOf2_64(MulAmt2))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+                           DAG.getConstant(MulAmt2, VT));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, NewMul, false);
+  }
+  return SDValue();
+}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+  // since the result of setcc_c is all zero's or all ones.
+  if (VT.isInteger() && !VT.isVector() &&
+      N1C && N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
+        ((N00.getOpcode() == ISD::ANY_EXTEND ||
+          N00.getOpcode() == ISD::ZERO_EXTEND) &&
+         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
+      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+      APInt ShAmt = N1C->getAPIntValue();
+      Mask = Mask.shl(ShAmt);
+      if (Mask != 0)
+        return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                           N00, DAG.getConstant(Mask, VT));
+    }
+  }
+
+  // Hardware support for vector shifts is sparse which makes us scalarize the
+  // vector operations in many cases. Also, on sandybridge ADD is faster than
+  // shl.
+  // (shl V, 1) -> add V,V
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+      assert(N0.getValueType().isVector() && "Invalid vector shift type");
+      // We shift all of the values by one. In many cases we do not have
+      // hardware support for this operation. This is better expressed as an ADD
+      // of two values.
+      if (N1SplatC->getZExtValue() == 1)
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+    }
+
+  return SDValue();
+}
+
+/// \brief Returns a vector of 0s if the node in input is a vector logical
+/// shift by a constant amount which is known to be bigger than or equal
+/// to the vector element size in bits.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+      (!Subtarget->hasInt256() ||
+       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
+    return SDValue();
+
+  SDValue Amt = N->getOperand(1);
+  SDLoc DL(N);
+  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+      APInt ShiftAmt = AmtSplat->getAPIntValue();
+      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+
+      // SSE2/AVX2 logical shifts always return a vector of 0s
+      // if the shift amount is bigger than or equal to
+      // the element size. The constant shift amount will be
+      // encoded as a 8-bit immediate.
+      if (ShiftAmt.trunc(8).uge(MaxAmount))
+        return getZeroVector(VT, Subtarget, DAG, DL);
+    }
+
+  return SDValue();
+}
+
+/// PerformShiftCombine - Combine shifts.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  if (N->getOpcode() == ISD::SHL) {
+    SDValue V = PerformSHLCombine(N, DAG);
+    if (V.getNode()) return V;
+  }
+
+  if (N->getOpcode() != ISD::SRA) {
+    // Try to fold this logical shift into a zero vector.
+    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
+    if (V.getNode()) return V;
+  }
+
+  return SDValue();
+}
+
+// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends.  Likewise for OR -> CMPNEQSS.
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget *Subtarget) {
+  unsigned opcode;
+
+  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+  // we're requiring SSE2 for both.
+  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue CMP0 = N0->getOperand(1);
+    SDValue CMP1 = N1->getOperand(1);
+    SDLoc DL(N);
+
+    // The SETCCs should both refer to the same CMP.
+    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+      return SDValue();
+
+    SDValue CMP00 = CMP0->getOperand(0);
+    SDValue CMP01 = CMP0->getOperand(1);
+    EVT     VT    = CMP00.getValueType();
+
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      bool ExpectingFlags = false;
+      // Check for any users that want flags:
+      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+           !ExpectingFlags && UI != UE; ++UI)
+        switch (UI->getOpcode()) {
+        default:
+        case ISD::BR_CC:
+        case ISD::BRCOND:
+        case ISD::SELECT:
+          ExpectingFlags = true;
+          break;
+        case ISD::CopyToReg:
+        case ISD::SIGN_EXTEND:
+        case ISD::ZERO_EXTEND:
+        case ISD::ANY_EXTEND:
+          break;
+        }
+
+      if (!ExpectingFlags) {
+        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+          X86::CondCode tmp = cc0;
+          cc0 = cc1;
+          cc1 = tmp;
+        }
+
+        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+          // FIXME: need symbolic constants for these magic numbers.
+          // See X86ATTInstPrinter.cpp:printSSECC().
+          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+          if (Subtarget->hasAVX512()) {
+            SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
+                                         CMP01, DAG.getConstant(x86cc, MVT::i8));
+            if (N->getValueType(0) != MVT::i1)
+              return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+                                 FSetCC);
+            return FSetCC;
+          }
+          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+                                              CMP00.getValueType(), CMP00, CMP01,
+                                              DAG.getConstant(x86cc, MVT::i8));
+
+          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+          if (is64BitFP && !Subtarget->is64Bit()) {
+            // On a 32-bit target, we cannot bitcast the 64-bit float to a
+            // 64-bit integer, since that's not a legal type. Since
+            // OnesOrZeroesF is all ones of all zeroes, we don't need all the
+            // bits, but can do this little dance to extract the lowest 32 bits
+            // and work with those going forward.
+            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+                                           OnesOrZeroesF);
+            SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
+                                           Vector64);
+            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+                                        Vector32, DAG.getIntPtrConstant(0));
+            IntVT = MVT::i32;
+          }
+
+          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+                                      DAG.getConstant(1, IntVT));
+          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+          return OneBitOfTruth;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match direct AllOnes for 128 and 256-bit vectors
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from a insert_subvector building a 256-bit
+  // allones vector
+  if (VT.is256BitVector() &&
+      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue V1 = N->getOperand(0);
+    SDValue V2 = N->getOperand(1);
+
+    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+        ISD::isBuildVectorAllOnes(V2.getNode()))
+      return true;
+  }
+
+  return false;
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::ANY_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+  SDValue Narrow = N->getOperand(0);
+  EVT NarrowVT = Narrow->getValueType(0);
+  if (!NarrowVT.is128BitVector())
+    return SDValue();
+
+  if (Narrow->getOpcode() != ISD::XOR &&
+      Narrow->getOpcode() != ISD::AND &&
+      Narrow->getOpcode() != ISD::OR)
+    return SDValue();
+
+  SDValue N0  = Narrow->getOperand(0);
+  SDValue N1  = Narrow->getOperand(1);
+  SDLoc DL(Narrow);
+
+  // The Left side has to be a trunc.
+  if (N0.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  // The type of the truncated inputs.
+  EVT WideVT = N0->getOperand(0)->getValueType(0);
+  if (WideVT != VT)
+    return SDValue();
+
+  // The right side has to be a 'trunc' or a constant vector.
+  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+  ConstantSDNode *RHSConstSplat = nullptr;
+  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+    RHSConstSplat = RHSBV->getConstantSplatNode();
+  if (!RHSTrunc && !RHSConstSplat)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+    return SDValue();
+
+  // Set N0 and N1 to hold the inputs to the new wide operation.
+  N0 = N0->getOperand(0);
+  if (RHSConstSplat) {
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
+                     SDValue(RHSConstSplat, 0));
+    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
+    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
+  } else if (RHSTrunc) {
+    N1 = N1->getOperand(0);
+  }
+
+  // Generate the wide operation.
+  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  case ISD::ANY_EXTEND:
+    return Op;
+  case ISD::ZERO_EXTEND: {
+    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
+    APInt Mask = APInt::getAllOnesValue(InBits);
+    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
+    return DAG.getNode(ISD::AND, DL, VT,
+                       Op, DAG.getConstant(Mask, VT));
+  }
+  case ISD::SIGN_EXTEND:
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+                       Op, DAG.getValueType(NarrowVT));
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
+  // Create BEXTR instructions
+  // BEXTR is ((X >> imm) & (2**size-1))
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDLoc DL(N);
+
+    // Check for BEXTR.
+    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (MaskNode && ShiftNode) {
+        uint64_t Mask = MaskNode->getZExtValue();
+        uint64_t Shift = ShiftNode->getZExtValue();
+        if (isMask_64(Mask)) {
+          uint64_t MaskSize = CountPopulation_64(Mask);
+          if (Shift + MaskSize <= VT.getSizeInBits())
+            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+                               DAG.getConstant(Shift | (MaskSize << 8), VT));
+        }
+      }
+    } // BEXTR
+
+    return SDValue();
+  }
+
+  // Want to form ANDNP nodes:
+  // 1) In the hopes of then easily combining them with OR and AND nodes
+  //    to form PBLEND/PSIGN.
+  // 2) To match ANDN packed intrinsics
+  if (VT != MVT::v2i64 && VT != MVT::v4i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check LHS for vnot
+  if (N0.getOpcode() == ISD::XOR &&
+      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+
+  // Check RHS for vnot
+  if (N1.getOpcode() == ISD::XOR &&
+      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const X86Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // look for psign/blend
+  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
+    if (!Subtarget->hasSSSE3() ||
+        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
+      return SDValue();
+
+    // Canonicalize pandn to RHS
+    if (N0.getOpcode() == X86ISD::ANDNP)
+      std::swap(N0, N1);
+    // or (and (m, y), (pandn m, x))
+    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
+      SDValue Mask = N1.getOperand(0);
+      SDValue X    = N1.getOperand(1);
+      SDValue Y;
+      if (N0.getOperand(0) == Mask)
+        Y = N0.getOperand(1);
+      if (N0.getOperand(1) == Mask)
+        Y = N0.getOperand(0);
+
+      // Check to see if the mask appeared in both the AND and ANDNP and
+      if (!Y.getNode())
+        return SDValue();
+
+      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+      // Look through mask bitcast.
+      if (Mask.getOpcode() == ISD::BITCAST)
+        Mask = Mask.getOperand(0);
+      if (X.getOpcode() == ISD::BITCAST)
+        X = X.getOperand(0);
+      if (Y.getOpcode() == ISD::BITCAST)
+        Y = Y.getOperand(0);
+
+      EVT MaskVT = Mask.getValueType();
+
+      // Validate that the Mask operand is a vector sra node.
+      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+      // there is no psrai.b
+      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+      unsigned SraAmt = ~0;
+      if (Mask.getOpcode() == ISD::SRA) {
+        if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+          if (auto *AmtConst = AmtBV->getConstantSplatNode())
+            SraAmt = AmtConst->getZExtValue();
+      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+        SDValue SraC = Mask.getOperand(1);
+        SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
+      }
+      if ((SraAmt + 1) != EltBits)
+        return SDValue();
+
+      SDLoc DL(N);
+
+      // Now we know we at least have a plendvb with the mask val.  See if
+      // we can form a psignb/w/d.
+      // psign = x.type == y.type == mask.type && y = sub(0, x);
+      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+               "Unsupported VT for PSIGN");
+        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+      }
+      // PBLENDVB only available on SSE 4.1
+      if (!Subtarget->hasSSE41())
+        return SDValue();
+
+      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
+      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
+      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+    }
+  }
+
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // SHLD/SHRD instructions have lower register pressure, but on some
+  // platforms they have higher latency than the equivalent
+  // series of shifts/or that would otherwise be generated.
+  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+  // have higher latencies and we are not optimizing for size.
+  if (!OptForSize && Subtarget->isSHLDSlow())
+    return SDValue();
+
+  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+    return SDValue();
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  SDValue ShAmt0 = N0.getOperand(1);
+  if (ShAmt0.getValueType() != MVT::i8)
+    return SDValue();
+  SDValue ShAmt1 = N1.getOperand(1);
+  if (ShAmt1.getValueType() != MVT::i8)
+    return SDValue();
+  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+    ShAmt0 = ShAmt0.getOperand(0);
+  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+    ShAmt1 = ShAmt1.getOperand(0);
+
+  SDLoc DL(N);
+  unsigned Opc = X86ISD::SHLD;
+  SDValue Op0 = N0.getOperand(0);
+  SDValue Op1 = N1.getOperand(0);
+  if (ShAmt0.getOpcode() == ISD::SUB) {
+    Opc = X86ISD::SHRD;
+    std::swap(Op0, Op1);
+    std::swap(ShAmt0, ShAmt1);
+  }
+
+  unsigned Bits = VT.getSizeInBits();
+  if (ShAmt1.getOpcode() == ISD::SUB) {
+    SDValue Sum = ShAmt1.getOperand(0);
+    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
+        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
+        return DAG.getNode(Opc, DL, VT,
+                           Op0, Op1,
+                           DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+    }
+  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+    if (ShAmt0C &&
+        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
+      return DAG.getNode(Opc, DL, VT,
+                         N0.getOperand(0), N1.getOperand(0),
+                         DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+  }
+
+  return SDValue();
+}
+
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // Since X86 does not have CMOV for 8-bit integer, we don't convert
+  // 8-bit integer abs to NEG and CMOV.
+  if (VT.isInteger() && VT.getSizeInBits() == 8)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CMOV.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD &&
+      N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA &&
+      N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+        // Generate SUB & CMOV.
+        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+                                  DAG.getConstant(0, VT), N0.getOperand(0));
+
+        SDValue Ops[] = { N0.getOperand(0), Neg,
+                          DAG.getConstant(X86::COND_GE, MVT::i8),
+                          SDValue(Neg.getNode(), 1) };
+        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
+      }
+  return SDValue();
+}
+
+// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
+static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (Subtarget->hasCMov()) {
+    SDValue RV = performIntegerAbsCombine(N, DAG);
+    if (RV.getNode())
+      return RV;
+  }
+
+  return SDValue();
+}
+
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  SDLoc dl(Ld);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  // On Sandybridge unaligned 256bit loads are inefficient.
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+  unsigned Alignment = Ld->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
+  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                std::min(16U, Alignment));
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }
+
+  // If this is a vector EXT Load then attempt to optimize it using a
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and a arithmetic shift.
+  // TODO: It is possible to support ZExt by zeroing the undef values
+  // during the shuffle phase or after the shuffle.
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
+    assert(MemVT != RegVT && "Cannot extend to the same type");
+    assert(MemVT.isVector() && "Must load a vector from memory");
+
+    unsigned NumElems = RegVT.getVectorNumElements();
+    unsigned MemSz = MemVT.getSizeInBits();
+    assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
+
+    // All sizes must be a power of two.
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+      return SDValue();
+
+    // Attempt to load the original value using scalar loads.
+    // Find the largest scalar type that divides the total loaded size.
+    MVT SclrLoadTy = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+        SclrLoadTy = Tp;
+      }
+    }
+
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+        (64 <= MemSz))
+      SclrLoadTy = MVT::f64;
+
+    // Calculate the number of scalar loads that we need to perform
+    // in order to load our vector from memory.
+    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegZize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegZize /= 2;
+
+    // Represent our vector as a sequence of elements which are the
+    // largest scalar that we can load.
+    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+      loadRegZize/SclrLoadTy.getSizeInBits());
+
+    // Represent the data using the same element type that is stored in
+    // memory. In practice, we ''widen'' MemVT.
+    EVT WideVecVT =
+          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize/MemVT.getScalarType().getSizeInBits());
+
+    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+      "Invalid vector type");
+
+    // We can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+    for (unsigned i = 0; i < NumLoads; ++i) {
+      // Perform a single load.
+      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                       Ptr, Ld->getPointerInfo(),
+                                       Ld->isVolatile(), Ld->isNonTemporal(),
+                                       Ld->isInvariant(), Ld->getAlignment());
+      Chains.push_back(ScalarLoad.getValue(1));
+      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+      // another round of DAGCombining.
+      if (i == 0)
+        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+      else
+        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                          ScalarLoad, DAG.getIntPtrConstant(i));
+
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    }
+
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+    // Bitcast the loaded value to a vector of the original element type, in
+    // the size of the target vector type.
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+    unsigned SizeRatio = RegSz/MemSz;
+
+    if (Ext == ISD::SEXTLOAD) {
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
+                          DAG.getConstant(Amt, RegVT));
+
+      return DCI.CombineTo(N, Shuff, TF, true);
+    }
+
+    // Redistribute the loaded elements into the different locations.
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i*SizeRatio] = i;
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                         DAG.getUNDEF(WideVecVT),
+                                         &ShuffleVec[0]);
+
+    // Bitcast to the requested type.
+    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+    // Replace the original load with the new sequence
+    // and return the new chain.
+    return DCI.CombineTo(N, Shuff, TF, true);
+  }
+
+  return SDValue();
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  SDLoc dl(St);
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatenation of two XMM registers, perform two stores.
+  // On Sandy Bridge, 256-bit memory operations are executed by two
+  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+  // memory  operation.
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
+  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+      StVT == VT && !IsAligned) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
+
+    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), Alignment);
+    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(),
+                                std::min(16U, Alignment));
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                         DAG.getUNDEF(WideVecVT),
+                                         &ShuffleVec[0]);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+        (64 <= NumElems * ToSz))
+      StoreType = MVT::f64;
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i));
+      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  }
+
+  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
+  // the FP state in cases where an emms may be missing.
+  // A preferable solution to the general problem is to figure out the right
+  // places to insert EMMS.  This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
+                     && Subtarget->hasSSE2();
+  if ((VT.isVector() ||
+       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+      isa<LoadSDNode>(St->getValue()) &&
+      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+      St->getChain().hasOneUse() && !St->isVolatile()) {
+    SDNode* LdVal = St->getValue().getNode();
+    LoadSDNode *Ld = nullptr;
+    int TokenFactorIndex = -1;
+    SmallVector<SDValue, 8> Ops;
+    SDNode* ChainVal = St->getChain().getNode();
+    // Must be a store of a load.  We currently handle two cases:  the load
+    // is a direct child, and it's under an intervening TokenFactor.  It is
+    // possible to dig deeper under nested TokenFactors.
+    if (ChainVal == LdVal)
+      Ld = cast<LoadSDNode>(St->getChain());
+    else if (St->getValue().hasOneUse() &&
+             ChainVal->getOpcode() == ISD::TokenFactor) {
+      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
+        if (ChainVal->getOperand(i).getNode() == LdVal) {
+          TokenFactorIndex = i;
+          Ld = cast<LoadSDNode>(St->getValue());
+        } else
+          Ops.push_back(ChainVal->getOperand(i));
+      }
+    }
+
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
+
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
+
+    SDLoc LdDL(Ld);
+    SDLoc StDL(N);
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget->is64Bit() || F64IsLegal) {
+      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
+                                  Ld->getPointerInfo(), Ld->isVolatile(),
+                                  Ld->isNonTemporal(), Ld->isInvariant(),
+                                  Ld->getAlignment());
+      SDValue NewChain = NewLd.getValue(1);
+      if (TokenFactorIndex != -1) {
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+      }
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+                          St->getPointerInfo(),
+                          St->isVolatile(), St->isNonTemporal(),
+                          St->getAlignment());
+    }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+                                 DAG.getConstant(4, MVT::i32));
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getPointerInfo(),
+                               Ld->isVolatile(), Ld->isNonTemporal(),
+                               Ld->isInvariant(), Ld->getAlignment());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getPointerInfo().getWithOffset(4),
+                               Ld->isVolatile(), Ld->isNonTemporal(),
+                               Ld->isInvariant(),
+                               MinAlign(Ld->getAlignment(), 4));
+
+    SDValue NewChain = LoLd.getValue(1);
+    if (TokenFactorIndex != -1) {
+      Ops.push_back(LoLd);
+      Ops.push_back(HiLd);
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+    }
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+                         DAG.getConstant(4, MVT::i32));
+
+    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+                                St->getPointerInfo(),
+                                St->isVolatile(), St->isNonTemporal(),
+                                St->getAlignment());
+    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+                                St->getPointerInfo().getWithOffset(4),
+                                St->isVolatile(),
+                                St->isNonTemporal(),
+                                MinAlign(St->getAlignment(), 4));
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+  }
+  return SDValue();
+}
+
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS.  A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector.  For example, if
+///   A = < float a0, float a1, float a2, float a3 >
+/// and
+///   B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+  // Look for the following pattern: if
+  //   A = < float a0, float a1, float a2, float a3 >
+  //   B = < float b0, float b1, float b2, float b3 >
+  // and
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+  // which is A horizontal-op B.
+
+  // At least one of the operands should be a vector shuffle.
+  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  MVT VT = LHS.getSimpleValueType();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for horizontal add/sub");
+
+  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
+  // operate independently on 128-bit lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  assert((NumLaneElts % 2 == 0) &&
+         "Vector type should have an even number of elements in each lane");
+  unsigned HalfLaneElts = NumLaneElts/2;
+
+  // View LHS in the form
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  // If LHS is not a shuffle then pretend it is the shuffle
+  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+  // type VT.
+  SDValue A, B;
+  SmallVector<int, 16> LMask(NumElts);
+  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      A = LHS.getOperand(0);
+    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      B = LHS.getOperand(1);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), LMask.begin());
+  } else {
+    if (LHS.getOpcode() != ISD::UNDEF)
+      A = LHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      LMask[i] = i;
+  }
+
+  // Likewise, view RHS in the form
+  //   RHS = VECTOR_SHUFFLE C, D, RMask
+  SDValue C, D;
+  SmallVector<int, 16> RMask(NumElts);
+  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      C = RHS.getOperand(0);
+    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      D = RHS.getOperand(1);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), RMask.begin());
+  } else {
+    if (RHS.getOpcode() != ISD::UNDEF)
+      C = RHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      RMask[i] = i;
+  }
+
+  // Check that the shuffles are both shuffling the same vectors.
+  if (!(A == C && B == D) && !(A == D && B == C))
+    return false;
+
+  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+  if (!A.getNode() && !B.getNode())
+    return false;
+
+  // If A and B occur in reverse order in RHS, then "swap" them (which means
+  // rewriting the mask).
+  if (A != C)
+    CommuteVectorShuffleMask(RMask, NumElts);
+
+  // At this point LHS and RHS are equivalent to
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // Check that the masks correspond to performing a horizontal operation.
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+      // Ignore any UNDEF components.
+      if (LIdx < 0 || RIdx < 0 ||
+          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+        continue;
+
+      // Check that successive elements are being operated on.  If not, this is
+      // not a horizontal operation.
+      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+      if (!(LIdx == Index && RIdx == Index + 1) &&
+          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+        return false;
+    }
+  }
+
+  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+  return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      isHorizontalBinOp(LHS, RHS, true))
+    return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
+  return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      isHorizontalBinOp(LHS, RHS, false))
+    return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
+  return SDValue();
+}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+  // F[X]OR(0.0, x) -> x
+  // F[X]OR(x, 0.0) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+  return SDValue();
+}
+
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // Only perform optimizations if UnsafeMath is used.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMINC and FMAXC, which are Commutative operations.
+  unsigned NewOp = 0;
+  switch (N->getOpcode()) {
+    default: llvm_unreachable("unknown opcode");
+    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
+    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  }
+
+  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+  // FAND(0.0, x) -> 0.0
+  // FAND(x, 0.0) -> 0.0
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  return SDValue();
+}
+
+/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+  // FANDN(x, 0.0) -> 0.0
+  // FANDN(0.0, x) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+                                SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // BT ignores high bits in the bit index operand.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.hasOneUse()) {
+    unsigned BitWidth = Op1.getValueSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+  return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() ==
+      OpVT.getVectorElementType().getSizeInBits()) {
+    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+  }
+  return SDValue();
+}
+
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+                                               const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+  SDLoc dl(N);
+
+  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
+  // both SSE and AVX2 since there is no sign-extended shift right
+  // operation on a vector with 64-bit elements.
+  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
+  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
+  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND)) {
+    SDValue N00 = N0.getOperand(0);
+
+    // EXTLOAD has a better solution on AVX2,
+    // it may be replaced with X86ISD::VSEXT node.
+    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+      if (!ISD::isNormalLoad(N00.getNode()))
+        return SDValue();
+
+    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+        SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+                                  N00, N1);
+      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+    }
+  }
+  return SDValue();
+}
+
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasFp256())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT.isVector() && VT.getSizeInBits() == 256) {
+    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (R.getNode())
+      return R;
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget* Subtarget) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  EVT ScalarVT = VT.getScalarType();
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
+    return SDValue();
+
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+  SDValue C = N->getOperand(2);
+
+  bool NegA = (A.getOpcode() == ISD::FNEG);
+  bool NegB = (B.getOpcode() == ISD::FNEG);
+  bool NegC = (C.getOpcode() == ISD::FNEG);
+
+  // Negative multiplication when NegA xor NegB
+  bool NegMul = (NegA != NegB);
+  if (NegA)
+    A = A.getOperand(0);
+  if (NegB)
+    B = B.getOperand(0);
+  if (NegC)
+    C = C.getOperand(0);
+
+  unsigned Opcode;
+  if (!NegMul)
+    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+  else
+    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+  return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
+  //           (and (i32 x86isd::setcc_carry), 1)
+  // This eliminates the zext. This transformation is necessary because
+  // ISD::SETCC is always legalized to i8.
+  SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() == ISD::AND &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (!C || C->getZExtValue() != 1)
+        return SDValue();
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, VT));
+    }
+  }
+
+  if (N0.getOpcode() == ISD::TRUNCATE &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, VT));
+    }
+  }
+  if (VT.is256BitVector()) {
+    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (R.getNode())
+      return R;
+  }
+
+  return SDValue();
+}
+
+// Optimize x == -y --> x+y == 0
+//          x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget* Subtarget) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
+                                   LHS.getValueType(), RHS, LHS.getOperand(1));
+        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
+                                   RHS.getValueType(), LHS, RHS.getOperand(1));
+        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+
+  if (VT.getScalarType() == MVT::i1) {
+    bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+      (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
+    bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
+    if (!IsSEXT0 && !IsVZero0)
+      return SDValue();
+    bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
+      (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
+    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!IsSEXT1 && !IsVZero1)
+      return SDValue();
+
+    if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
+      if (CC == ISD::SETEQ)
+        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+      return LHS.getOperand(0);
+    }
+    if (IsSEXT1 && IsVZero0) {
+      assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
+      if (CC == ISD::SETEQ)
+        return DAG.getNOT(DL, RHS.getOperand(0), VT);
+      return RHS.getOperand(0);
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    // Extract the countS bits from the immediate so we can get the proper
+    // address when narrowing the vector load to a specific element.
+    // When the second source op is a memory address, interps doesn't use
+    // countS and just gets an f32 from that address.
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
+// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+                               MVT VT) {
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                   DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+                       DAG.getConstant(1, VT));
+  assert (VT == MVT::i1 && "Unexpected type for SECCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
+}
+
+// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+    // cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
+    }
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  return SDValue();
+}
+
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+                                        const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
+  SDValue Op0 = N->getOperand(0);
+  EVT InVT = Op0->getValueType(0);
+
+  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+    SDLoc dl(N);
+    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+  }
+
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations.
+  if (Op0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+    EVT VT = Ld->getValueType(0);
+    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+        !XTLI->getSubtarget()->is64Bit() &&
+        VT == MVT::i64) {
+      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
+                                          Ld->getChain(), Op0, DAG);
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+      return FILDChain;
+    }
+  }
+  return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
+                                 X86TargetLowering::DAGCombinerInfo &DCI) {
+  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+  // the result is either zero or one (depending on the input carry bit).
+  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+  if (X86::isZeroNode(N->getOperand(0)) &&
+      X86::isZeroNode(N->getOperand(1)) &&
+      // We don't have a good way to replace an EFLAGS use, so only do this when
+      // dead right now.
+      SDValue(N, 1).use_empty()) {
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
+    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
+                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+                                           DAG.getConstant(X86::COND_B,MVT::i8),
+                                           N->getOperand(2)),
+                               DAG.getConstant(1, VT));
+    return DCI.CombineTo(N, Res1, CarryOut);
+  }
+
+  return SDValue();
+}
+
+// fold (add Y, (sete  X, 0)) -> adc  0, Y
+//      (add Y, (setne X, 0)) -> sbb -1, Y
+//      (sub (sete  X, 0), Y) -> sbb  0, Y
+//      (sub (setne X, 0), Y) -> adc -1, Y
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // Look through ZExts.
+  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
+  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+    return SDValue();
+
+  SDValue SetCC = Ext.getOperand(0);
+  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+    return SDValue();
+
+  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SDValue Cmp = SetCC.getOperand(1);
+  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+      !X86::isZeroNode(Cmp.getOperand(1)) ||
+      !Cmp.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue CmpOp0 = Cmp.getOperand(0);
+  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
+                               DAG.getConstant(1, CmpOp0.getValueType()));
+
+  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+  if (CC == X86::COND_NE)
+    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
+                       DL, OtherVal.getValueType(), OtherVal,
+                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
+  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
+                     DL, OtherVal.getValueType(), OtherVal,
+                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
+}
+
+/// PerformADDCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // X86 can't encode an immediate LHS of a sub. See if we can push the
+  // negation into a preceding instruction.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+    // If the RHS of the sub is a XOR with one use and a constant, invert the
+    // immediate. Then add one to the LHS of the sub so we can turn
+    // X-Y -> X+~Y+1, saving one register.
+    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+        isa<ConstantSDNode>(Op1.getOperand(1))) {
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+      EVT VT = Op0.getValueType();
+      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
+                                   Op1.getOperand(0),
+                                   DAG.getConstant(~XorC, VT));
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
+                         DAG.getConstant(C->getAPIntValue()+1, VT));
+    }
+  }
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  EVT VT = N->getValueType(0);
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// performVZEXTCombine - Performs build vector combines
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget *Subtarget) {
+  // (vzext (bitcast (vzext (x)) -> (vzext x)
+  SDValue In = N->getOperand(0);
+  while (In.getOpcode() == ISD::BITCAST)
+    In = In.getOperand(0);
+
+  if (In.getOpcode() != X86ISD::VZEXT)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
+                     In.getOperand(0));
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+  case ISD::VSELECT:
+  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
+  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
+  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
+  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
+  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
+  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
+  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
+  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case X86ISD::FXOR:
+  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
+  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
+  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
+  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG:
+    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::PALIGNR:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
+  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
+  }
+
+  return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  if (!isTypeLegal(VT))
+    return false;
+  if (VT != MVT::i16)
+    return true;
+
+  switch (Opc) {
+  default:
+    return true;
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return false;
+  }
+}
+
+/// IsDesirableToPromoteOp - This method query the target whether it is
+/// beneficial for dag combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  EVT VT = Op.getValueType();
+  if (VT != MVT::i16)
+    return false;
+
+  bool Promote = false;
+  bool Commute = false;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If the non-extending load has a single use and it's not live out, then it
+    // might be folded.
+    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+                                                     Op.hasOneUse()*/) {
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather then it being
+        // promoted as an operand is when it's only use is liveout.
+        if (UI->getOpcode() != ISD::CopyToReg)
+          return false;
+      }
+    }
+    Promote = true;
+    break;
+  }
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Promote = true;
+    break;
+  case ISD::SHL:
+  case ISD::SRL: {
+    SDValue N0 = Op.getOperand(0);
+    // Look out for (store (shl (load), x)).
+    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+      return false;
+    Promote = true;
+    break;
+  }
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    Commute = true;
+    // fallthrough
+  case ISD::SUB: {
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    if (!Commute && MayFoldLoad(N1))
+      return false;
+    // Avoid disabling potential load folding opportunities.
+    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+      return false;
+    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+      return false;
+    Promote = true;
+  }
+  }
+
+  PVT = MVT::i32;
+  return Promote;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  // Helper to match a string separated by whitespace.
+  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
+    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+
+    for (unsigned i = 0, e = args.size(); i != e; ++i) {
+      StringRef piece(*args[i]);
+      if (!s.startswith(piece)) // Check if the piece matches.
+        return false;
+
+      s = s.substr(piece.size());
+      StringRef::size_type pos = s.find_first_not_of(" \t");
+      if (pos == 0) // We matched a prefix.
+        return false;
+
+      s = s.substr(pos);
+    }
+
+    return s.empty();
+  }
+  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+}
+
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+      if (AsmPieces.size() == 3)
+        return true;
+      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+
+  std::string AsmStr = IA->getAsmString();
+
+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+
+  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    // FIXME: this should verify that we are targeting a 486 or better.  If not,
+    // we will turn this bswap into something that will be lowered to logical
+    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
+    // lower so don't worry about this.
+    // bswap $0
+    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
+        matchAsm(AsmPieces[0], "bswapl", "$0") ||
+        matchAsm(AsmPieces[0], "bswapq", "$0") ||
+        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
+        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
+        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
+      // No need to check constraints, nothing other than the equivalent of
+      // "=r,0" would be valid here.
+      return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+
+    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
+    if (CI->getType()->isIntegerTy(16) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
+         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
+      AsmPieces.clear();
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  case 3:
+    if (CI->getType()->isIntegerTy(32) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
+        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
+        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+      AsmPieces.clear();
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+
+    if (CI->getType()->isIntegerTy(64)) {
+      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+      if (Constraints.size() >= 2 &&
+          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
+        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
+            matchAsm(AsmPieces[1], "bswap", "%edx") &&
+            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+          return IntrinsicLowering::LowerToByteSwap(CI);
+      }
+    }
+    break;
+  }
+  return false;
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'R':
+    case 'q':
+    case 'Q':
+    case 'f':
+    case 't':
+    case 'u':
+    case 'y':
+    case 'x':
+    case 'Y':
+    case 'l':
+      return C_RegisterClass;
+    case 'a':
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'S':
+    case 'D':
+    case 'A':
+      return C_Register;
+    case 'I':
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'G':
+    case 'C':
+    case 'e':
+    case 'Z':
+      return C_Other;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+  X86TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':
+    if (type->isFloatingPointTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'y':
+    if (type->isX86_MMXTy() && Subtarget->hasMMX())
+      weight = CW_SpecificReg;
+    break;
+  case 'x':
+  case 'Y':
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+      weight = CW_Register;
+    break;
+  case 'I':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+      if (C->getZExtValue() <= 31)
+        weight = CW_Constant;
+    }
+    break;
+  case 'J':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C':
+    if (dyn_cast<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+  // 'f' like normal targets.
+  if (ConstraintVT.isFloatingPoint()) {
+    if (Subtarget->hasSSE2())
+      return "Y";
+    if (Subtarget->hasSSE1())
+      return "x";
+  }
+
+  return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'I':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 31) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'J':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 63) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'K':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isInt<8>(C->getSExtValue())) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'N':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 255) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'e': {
+    // 32-bit signed value
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getSExtValue())) {
+        // Widen to 64 bits here to get it sign extended.
+        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+        break;
+      }
+    // FIXME gcc accepts some relocatable values here too, but only in certain
+    // memory models; it's complicated.
+    }
+    return;
+  }
+  case 'Z': {
+    // 32-bit unsigned value
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getZExtValue())) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    // FIXME gcc accepts some relocatable values here too, but only in certain
+    // memory models; it's complicated.
+    return;
+  }
+  case 'i': {
+    // Literal immediates are always ok.
+    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+      // Widen to 64 bits here to get it sign extended.
+      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+      break;
+    }
+
+    // In any sort of PIC mode addresses need to be computed at runtime by
+    // adding in a register or some sort of table lookup.  These can't
+    // be used as immediates.
+    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
+      return;
+
+    // If we are in non-pic codegen mode, we allow the address of a global (with
+    // an optional displacement) to be used with 'i'.
+    GlobalAddressSDNode *GA = nullptr;
+    int64_t Offset = 0;
+
+    // Match either (GA), (GA+C), (GA+C1+C2), etc.
+    while (1) {
+      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+        Offset += GA->getOffset();
+        break;
+      } else if (Op.getOpcode() == ISD::ADD) {
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          Offset += C->getZExtValue();
+          Op = Op.getOperand(0);
+          continue;
+        }
+      } else if (Op.getOpcode() == ISD::SUB) {
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          Offset += -C->getZExtValue();
+          Op = Op.getOperand(0);
+          continue;
+        }
+      }
+
+      // Otherwise, this isn't something we can handle, reject it.
+      return;
+    }
+
+    const GlobalValue *GV = GA->getGlobal();
+    // If we require an extra load to get this address, as in PIC mode, we
+    // can't accept it.
+    if (isGlobalStubReference(
+            Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
+      return;
+
+    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
+                                        GA->getValueType(0), Offset);
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT VT) const {
+  // First, see if this is a constraint that directly corresponds to an LLVM
+  // register class.
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+      // TODO: Slight differences here in allocation order and leaving
+      // RIP in the class. Do they matter any more here than they do
+      // in the normal allocation?
+    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+      if (Subtarget->is64Bit()) {
+        if (VT == MVT::i32 || VT == MVT::f32)
+          return std::make_pair(0U, &X86::GR32RegClass);
+        if (VT == MVT::i16)
+          return std::make_pair(0U, &X86::GR16RegClass);
+        if (VT == MVT::i8 || VT == MVT::i1)
+          return std::make_pair(0U, &X86::GR8RegClass);
+        if (VT == MVT::i64 || VT == MVT::f64)
+          return std::make_pair(0U, &X86::GR64RegClass);
+        break;
+      }
+      // 32-bit fallthrough
+    case 'Q':   // Q_REGS
+      if (VT == MVT::i32 || VT == MVT::f32)
+        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+      break;
+    case 'r':   // GENERAL_REGS
+    case 'l':   // INDEX_REGS
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8RegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16RegClass);
+      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
+        return std::make_pair(0U, &X86::GR32RegClass);
+      return std::make_pair(0U, &X86::GR64RegClass);
+    case 'R':   // LEGACY_REGS
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+      if (VT == MVT::i32 || !Subtarget->is64Bit())
+        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+    case 'f':  // FP Stack registers.
+      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+      // value to the correct fpstack register class.
+      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+        return std::make_pair(0U, &X86::RFP32RegClass);
+      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+        return std::make_pair(0U, &X86::RFP64RegClass);
+      return std::make_pair(0U, &X86::RFP80RegClass);
+    case 'y':   // MMX_REGS if MMX allowed.
+      if (!Subtarget->hasMMX()) break;
+      return std::make_pair(0U, &X86::VR64RegClass);
+    case 'Y':   // SSE_REGS if SSE2 allowed
+      if (!Subtarget->hasSSE2()) break;
+      // FALL THROUGH.
+    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget->hasSSE1()) break;
+
+      switch (VT.SimpleTy) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        return std::make_pair(0U, &X86::FR32RegClass);
+      case MVT::f64:
+      case MVT::i64:
+        return std::make_pair(0U, &X86::FR64RegClass);
+      // Vector types.
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(0U, &X86::VR128RegClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        return std::make_pair(0U, &X86::VR256RegClass);
+      case MVT::v8f64:
+      case MVT::v16f32:
+      case MVT::v16i32:
+      case MVT::v8i64:
+        return std::make_pair(0U, &X86::VR512RegClass);
+      }
+      break;
+    }
+  }
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass*> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (!Res.second) {
+    // Map st(0) -> st(7) -> ST0
+    if (Constraint.size() == 7 && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 's' &&
+        tolower(Constraint[2]) == 't' &&
+        Constraint[3] == '(' &&
+        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+        Constraint[5] == ')' &&
+        Constraint[6] == '}') {
+
+      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // GCC allows "st(0)" to be called just plain "st".
+    if (StringRef("{st}").equals_lower(Constraint)) {
+      Res.first = X86::ST0;
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // flags -> EFLAGS
+    if (StringRef("{flags}").equals_lower(Constraint)) {
+      Res.first = X86::EFLAGS;
+      Res.second = &X86::CCRRegClass;
+      return Res;
+    }
+
+    // 'A' means EAX + EDX.
+    if (Constraint == "A") {
+      Res.first = X86::EAX;
+      Res.second = &X86::GR32_ADRegClass;
+      return Res;
+    }
+    return Res;
+  }
+
+  // Otherwise, check to see if this is a register class of the wrong value
+  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+  // turn into {ax},{dx}.
+  if (Res.second->hasType(VT))
+    return Res;   // Correct type already, nothing to do.
+
+  // All of the single-register GCC register classes map their values onto
+  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
+  // really want an 8-bit or 32-bit register, map to the appropriate register
+  // class and return the appropriate register.
+  if (Res.second == &X86::GR16RegClass) {
+    if (VT == MVT::i8 || VT == MVT::i1) {
+      unsigned DestReg = 0;
+      switch (Res.first) {
+      default: break;
+      case X86::AX: DestReg = X86::AL; break;
+      case X86::DX: DestReg = X86::DL; break;
+      case X86::CX: DestReg = X86::CL; break;
+      case X86::BX: DestReg = X86::BL; break;
+      }
+      if (DestReg) {
+        Res.first = DestReg;
+        Res.second = &X86::GR8RegClass;
+      }
+    } else if (VT == MVT::i32 || VT == MVT::f32) {
+      unsigned DestReg = 0;
+      switch (Res.first) {
+      default: break;
+      case X86::AX: DestReg = X86::EAX; break;
+      case X86::DX: DestReg = X86::EDX; break;
+      case X86::CX: DestReg = X86::ECX; break;
+      case X86::BX: DestReg = X86::EBX; break;
+      case X86::SI: DestReg = X86::ESI; break;
+      case X86::DI: DestReg = X86::EDI; break;
+      case X86::BP: DestReg = X86::EBP; break;
+      case X86::SP: DestReg = X86::ESP; break;
+      }
+      if (DestReg) {
+        Res.first = DestReg;
+        Res.second = &X86::GR32RegClass;
+      }
+    } else if (VT == MVT::i64 || VT == MVT::f64) {
+      unsigned DestReg = 0;
+      switch (Res.first) {
+      default: break;
+      case X86::AX: DestReg = X86::RAX; break;
+      case X86::DX: DestReg = X86::RDX; break;
+      case X86::CX: DestReg = X86::RCX; break;
+      case X86::BX: DestReg = X86::RBX; break;
+      case X86::SI: DestReg = X86::RSI; break;
+      case X86::DI: DestReg = X86::RDI; break;
+      case X86::BP: DestReg = X86::RBP; break;
+      case X86::SP: DestReg = X86::RSP; break;
+      }
+      if (DestReg) {
+        Res.first = DestReg;
+        Res.second = &X86::GR64RegClass;
+      }
+    }
+  } else if (Res.second == &X86::FR32RegClass ||
+             Res.second == &X86::FR64RegClass ||
+             Res.second == &X86::VR128RegClass ||
+             Res.second == &X86::VR256RegClass ||
+             Res.second == &X86::FR32XRegClass ||
+             Res.second == &X86::FR64XRegClass ||
+             Res.second == &X86::VR128XRegClass ||
+             Res.second == &X86::VR256XRegClass ||
+             Res.second == &X86::VR512RegClass) {
+    // Handle references to XMM physical registers that got mapped into the
+    // wrong class.  This can happen with constraints like {xmm0} where the
+    // target independent register mapper will just pick the first match it can
+    // find, ignoring the required type.
+
+    if (VT == MVT::f32 || VT == MVT::i32)
+      Res.second = &X86::FR32RegClass;
+    else if (VT == MVT::f64 || VT == MVT::i64)
+      Res.second = &X86::FR64RegClass;
+    else if (X86::VR128RegClass.hasType(VT))
+      Res.second = &X86::VR128RegClass;
+    else if (X86::VR256RegClass.hasType(VT))
+      Res.second = &X86::VR256RegClass;
+    else if (X86::VR512RegClass.hasType(VT))
+      Res.second = &X86::VR512RegClass;
+  }
+
+  return Res;
+}
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                            Type *Ty) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%drx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having less micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.   
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
+
+bool X86TargetLowering::isTargetFTOL() const {
+  return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..c8cdce7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,1016 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+  class X86Subtarget;
+  class X86TargetMachine;
+
+  namespace X86ISD {
+    // X86 Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// BSF - Bit scan forward.
+      /// BSR - Bit scan reverse.
+      BSF,
+      BSR,
+
+      /// SHLD, SHRD - Double shift instructions. These correspond to
+      /// X86::SHLDxx and X86::SHRDxx instructions.
+      SHLD,
+      SHRD,
+
+      /// FAND - Bitwise logical AND of floating point values. This corresponds
+      /// to X86::ANDPS or X86::ANDPD.
+      FAND,
+
+      /// FOR - Bitwise logical OR of floating point values. This corresponds
+      /// to X86::ORPS or X86::ORPD.
+      FOR,
+
+      /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+      /// to X86::XORPS or X86::XORPD.
+      FXOR,
+
+      /// FANDN - Bitwise logical ANDNOT of floating point values. This
+      /// corresponds to X86::ANDNPS or X86::ANDNPD.
+      FANDN,
+
+      /// FSRL - Bitwise logical right shift of floating point values. These
+      /// corresponds to X86::PSRLDQ.
+      FSRL,
+
+      /// CALL - These operations represent an abstract X86 call
+      /// instruction, which includes a bunch of information.  In particular the
+      /// operands of these node are:
+      ///
+      ///     #0 - The incoming token chain
+      ///     #1 - The callee
+      ///     #2 - The number of arg bytes the caller pushes on the stack.
+      ///     #3 - The number of arg bytes the callee pops off the stack.
+      ///     #4 - The value to pass in AL/AX/EAX (optional)
+      ///     #5 - The value to pass in DL/DX/EDX (optional)
+      ///
+      /// The result values of these nodes are:
+      ///
+      ///     #0 - The outgoing token chain
+      ///     #1 - The first register result value (optional)
+      ///     #2 - The second register result value (optional)
+      ///
+      CALL,
+
+      /// RDTSC_DAG - This operation implements the lowering for
+      /// readcyclecounter
+      RDTSC_DAG,
+
+      /// X86 Read Time-Stamp Counter and Processor ID.
+      RDTSCP_DAG,
+
+      /// X86 Read Performance Monitoring Counters.
+      RDPMC_DAG,
+
+      /// X86 compare and logical compare instructions.
+      CMP, COMI, UCOMI,
+
+      /// X86 bit-test instructions.
+      BT,
+
+      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+      /// operand, usually produced by a CMP instruction.
+      SETCC,
+
+      /// X86 Select
+      SELECT,
+
+      // Same as SETCC except it's materialized with a sbb and the value is all
+      // one's or all zero's.
+      SETCC_CARRY,  // R = carry_bit ? ~0 : 0
+
+      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+      /// Operands are two FP values to compare; result is a mask of
+      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+      FSETCC,
+
+      /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+      /// result in an integer GPR.  Needs masking for scalar result.
+      FGETSIGNx86,
+
+      /// X86 conditional moves. Operand 0 and operand 1 are the two values
+      /// to select from. Operand 2 is the condition code, and operand 3 is the
+      /// flag operand produced by a CMP or TEST instruction. It also writes a
+      /// flag result.
+      CMOV,
+
+      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// or TEST instruction.
+      BRCOND,
+
+      /// Return with a flag operand. Operand 0 is the chain operand, operand
+      /// 1 is the number of bytes of stack to pop.
+      RET_FLAG,
+
+      /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+      REP_STOS,
+
+      /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+      REP_MOVS,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// Wrapper - A wrapper node for TargetConstantPool,
+      /// TargetExternalSymbol, and TargetGlobalAddress.
+      Wrapper,
+
+      /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+      /// relative displacements.
+      WrapperRIP,
+
+      /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
+      /// to an MMX vector.  If you think this is too close to the previous
+      /// mnemonic, so do I; blame Intel.
+      MOVDQ2Q,
+
+      /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX
+      /// vector to a GPR.
+      MMX_MOVD2W,
+
+      /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRB.
+      PEXTRB,
+
+      /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRW.
+      PEXTRW,
+
+      /// INSERTPS - Insert any element of a 4 x float vector into any element
+      /// of a destination 4 x floatvector.
+      INSERTPS,
+
+      /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRB.
+      PINSRB,
+
+      /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRW.
+      PINSRW, MMX_PINSRW,
+
+      /// PSHUFB - Shuffle 16 8-bit values within a vector.
+      PSHUFB,
+
+      /// ANDNP - Bitwise Logical AND NOT of Packed FP values.
+      ANDNP,
+
+      /// PSIGN - Copy integer sign.
+      PSIGN,
+
+      /// BLENDV - Blend where the selector is a register.
+      BLENDV,
+
+      /// BLENDI - Blend where the selector is an immediate.
+      BLENDI,
+
+      // SUBUS - Integer sub with unsigned saturation.
+      SUBUS,
+
+      /// HADD - Integer horizontal add.
+      HADD,
+
+      /// HSUB - Integer horizontal sub.
+      HSUB,
+
+      /// FHADD - Floating point horizontal add.
+      FHADD,
+
+      /// FHSUB - Floating point horizontal sub.
+      FHSUB,
+
+      /// UMAX, UMIN - Unsigned integer max and min.
+      UMAX, UMIN,
+
+      /// SMAX, SMIN - Signed integer max and min.
+      SMAX, SMIN,
+
+      /// FMAX, FMIN - Floating point max and min.
+      ///
+      FMAX, FMIN,
+
+      /// FMAXC, FMINC - Commutative FMIN and FMAX.
+      FMAXC, FMINC,
+
+      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+      /// approximation.  Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+
+      // TLSADDR - Thread Local Storage.
+      TLSADDR,
+
+      // TLSBASEADDR - Thread Local Storage. A call to get the start address
+      // of the TLS block for the current module.
+      TLSBASEADDR,
+
+      // TLSCALL - Thread Local Storage.  When calling to an OS provided
+      // thunk at the address from an earlier relocation.
+      TLSCALL,
+
+      // EH_RETURN - Exception Handling helpers.
+      EH_RETURN,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for
+      /// the list of operands.
+      TC_RETURN,
+
+      // VZEXT_MOVL - Vector move to low scalar and zero higher vector elements.
+      VZEXT_MOVL,
+
+      // VZEXT - Vector integer zero-extend.
+      VZEXT,
+
+      // VSEXT - Vector integer signed-extend.
+      VSEXT,
+
+      // VTRUNC - Vector integer truncate.
+      VTRUNC,
+
+      // VTRUNC - Vector integer truncate with mask.
+      VTRUNCM,
+
+      // VFPEXT - Vector FP extend.
+      VFPEXT,
+
+      // VFPROUND - Vector FP round.
+      VFPROUND,
+
+      // VSHL, VSRL - 128-bit vector logical left / right shift
+      VSHLDQ, VSRLDQ,
+
+      // VSHL, VSRL, VSRA - Vector shift elements
+      VSHL, VSRL, VSRA,
+
+      // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate
+      VSHLI, VSRLI, VSRAI,
+
+      // CMPP - Vector packed double/float comparison.
+      CMPP,
+
+      // PCMP* - Vector integer comparisons.
+      PCMPEQ, PCMPGT,
+      // PCMP*M - Vector integer comparisons, the result is in a mask vector.
+      PCMPEQM, PCMPGTM,
+
+      /// CMPM, CMPMU - Vector comparison generating mask bits for fp and
+      /// integer signed and unsigned data types.
+      CMPM,
+      CMPMU,
+
+      // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
+      ADD, SUB, ADC, SBB, SMUL,
+      INC, DEC, OR, XOR, AND,
+
+      BEXTR,  // BEXTR - Bit field extract
+
+      UMUL, // LOW, HI, FLAGS = umul LHS, RHS
+
+      // MUL_IMM - X86 specific multiply by immediate.
+      MUL_IMM,
+
+      // PTEST - Vector bitwise comparisons.
+      PTEST,
+
+      // TESTP - Vector packed fp sign bitwise comparisons.
+      TESTP,
+
+      // TESTM, TESTNM - Vector "test" in AVX-512, the result is in a mask vector.
+      TESTM,
+      TESTNM,
+
+      // OR/AND test for masks
+      KORTEST,
+
+      // Several flavors of instructions with vector shuffle behaviors.
+      PACKSS,
+      PACKUS,
+      PALIGNR,
+      PSHUFD,
+      PSHUFHW,
+      PSHUFLW,
+      SHUFP,
+      MOVDDUP,
+      MOVSHDUP,
+      MOVSLDUP,
+      MOVLHPS,
+      MOVLHPD,
+      MOVHLPS,
+      MOVLPS,
+      MOVLPD,
+      MOVSD,
+      MOVSS,
+      UNPCKL,
+      UNPCKH,
+      VPERMILP,
+      VPERMV,
+      VPERMV3,
+      VPERMIV3,
+      VPERMI,
+      VPERM2X128,
+      VBROADCAST,
+      // masked broadcast
+      VBROADCASTM,
+      // Insert/Extract vector element
+      VINSERT,
+      VEXTRACT,
+
+      // PMULUDQ - Vector multiply packed unsigned doubleword integers
+      PMULUDQ,
+      // PMULUDQ - Vector multiply packed signed doubleword integers
+      PMULDQ,
+
+      // FMA nodes
+      FMADD,
+      FNMADD,
+      FMSUB,
+      FNMSUB,
+      FMADDSUB,
+      FMSUBADD,
+
+      // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+      // according to %al. An operator is needed so that this can be expanded
+      // with control flow.
+      VASTART_SAVE_XMM_REGS,
+
+      // WIN_ALLOCA - Windows's _chkstk call to do stack probing.
+      WIN_ALLOCA,
+
+      // SEG_ALLOCA - For allocating variable amounts of stack space when using
+      // segmented stacks. Check if the current stacklet has enough space, and
+      // falls back to heap allocation if not.
+      SEG_ALLOCA,
+
+      // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui.
+      WIN_FTOL,
+
+      // Memory barrier
+      MEMBARRIER,
+      MFENCE,
+      SFENCE,
+      LFENCE,
+
+      // FNSTSW16r - Store FP status word into i16 register.
+      FNSTSW16r,
+
+      // SAHF - Store contents of %ah into %eflags.
+      SAHF,
+
+      // RDRAND - Get a random integer and indicate whether it is valid in CF.
+      RDRAND,
+
+      // RDSEED - Get a NIST SP800-90B & C compliant random integer and
+      // indicate whether it is valid in CF.
+      RDSEED,
+
+      // PCMP*STRI
+      PCMPISTRI,
+      PCMPESTRI,
+
+      // XTEST - Test if in transactional execution.
+      XTEST,
+
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
+      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      LCMPXCHG8_DAG,
+      LCMPXCHG16_DAG,
+
+      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+      VZEXT_LOAD,
+
+      // FNSTCW16m - Store FP control world into i16 memory.
+      FNSTCW16m,
+
+      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+      /// integer destination in memory and a FP reg source.  This corresponds
+      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+      /// has two inputs (token chain and address) and two outputs (int value
+      /// and token chain).
+      FP_TO_INT16_IN_MEM,
+      FP_TO_INT32_IN_MEM,
+      FP_TO_INT64_IN_MEM,
+
+      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+      /// integer source in memory and FP reg result.  This corresponds to the
+      /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag).
+      FILD,
+      FILD_FLAG,
+
+      /// FLD - This instruction implements an extending load to FP stack slots.
+      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+      /// operand, ptr to load from, and a ValueType node indicating the type
+      /// to load to.
+      FLD,
+
+      /// FST - This instruction implements a truncating store to FP stack
+      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+      /// chain operand, value to store, address, and a ValueType to store it
+      /// as.
+      FST,
+
+      /// VAARG_64 - This instruction grabs the address of the next argument
+      /// from a va_list. (reads and modifies the va_list in memory)
+      VAARG_64
+
+      // WARNING: Do not add anything in the end unless you want the node to
+      // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be
+      // thought as target memory ops!
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace X86 {
+    /// isVEXTRACT128Index - Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+    bool isVEXTRACT128Index(SDNode *N);
+
+    /// isVINSERT128Index - Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+    bool isVINSERT128Index(SDNode *N);
+
+    /// isVEXTRACT256Index - Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+    bool isVEXTRACT256Index(SDNode *N);
+
+    /// isVINSERT256Index - Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+    bool isVINSERT256Index(SDNode *N);
+
+    /// getExtractVEXTRACT128Immediate - Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF128, VEXTRACTI128 instructions.
+    unsigned getExtractVEXTRACT128Immediate(SDNode *N);
+
+    /// getInsertVINSERT128Immediate - Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF128, VINSERT128 instructions.
+    unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+    /// getExtractVEXTRACT256Immediate - Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
+    unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+    /// getInsertVINSERT256Immediate - Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF64x4, VINSERTI64x4 instructions.
+    unsigned getInsertVINSERT256Immediate(SDNode *N);
+
+    /// isZeroNode - Returns true if Elt is a constant zero or a floating point
+    /// constant +0.0.
+    bool isZeroNode(SDValue Elt);
+
+    /// isOffsetSuitableForCodeModel - Returns true of the given offset can be
+    /// fit into displacement field of the instruction.
+    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                      bool hasSymbolicDisplacement = true);
+
+
+    /// isCalleePop - Determines whether the callee is required to pop its
+    /// own arguments. Callee pop is necessary to support tail calls.
+    bool isCalleePop(CallingConv::ID CallingConv,
+                     bool is64Bit, bool IsVarArg, bool TailCallOpt);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  class X86TargetLowering final : public TargetLowering {
+  public:
+    explicit X86TargetLowering(X86TargetMachine &TM);
+
+    unsigned getJumpTableEncoding() const override;
+
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
+
+    const MCExpr *
+    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                              const MachineBasicBlock *MBB, unsigned uid,
+                              MCContext &Ctx) const override;
+
+    /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
+    /// jumptable.
+    SDValue getPICJumpTableRelocBase(SDValue Table,
+                                     SelectionDAG &DAG) const override;
+    const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const override;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contains are placed at 16-byte boundaries while the rest are at
+    /// 4-byte boundaries.
+    unsigned getByValTypeAlignment(Type *Ty) const override;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero that means it's safe to destination
+    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+    /// means there isn't a need to check it against alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    bool isSafeMemOpType(MVT VT) const override;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses. of the specified type. Returns whether it
+    /// is "fast" by reference in the second argument.
+    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+                                       bool *Fast) const override;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    /// isTypeDesirableForOp - Return true if the target has native support for
+    /// the specified value type and it is 'desirable' to use the type for the
+    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+    /// instruction encodings are longer and some i16 instructions are slow.
+    bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+    /// isTypeDesirable - Return true if the target has native support for the
+    /// specified value type and it is 'desirable' to use the type. e.g. On x86
+    /// i16 is legal, but undesirable since i16 instruction encodings are longer
+    /// and some i16 instructions are slow.
+    bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+    MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const override;
+
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+    /// computeKnownBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
+    // operation that are sign bits.
+    unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                             const SelectionDAG &DAG,
+                                             unsigned Depth) const override;
+
+    bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
+                        int64_t &Offset) const override;
+
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    bool ExpandInlineAsm(CallInst *CI) const override;
+
+    ConstraintType
+      getConstraintType(const std::string &Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight
+      getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                     const char *constraint) const override;
+
+    const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register.  This should only be used for C_Register constraints.  On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT VT) const override;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// \brief Return the cost of the scaling factor used in the addressing
+    /// mode represented by AM for this target, for a load/store
+    /// of the specified type.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, it returns a negative value.
+    int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
+
+    bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    /// isZExtFree - Return true if any actual instruction that defines a
+    /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicit zero-extend the
+    /// result out to 64 bits.
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    /// isShuffleMaskLegal - Targets can use this to indicate that they only
+    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+    /// values are assumed to be legal.
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is
+    /// used by Targets can use this to indicate if there is a suitable
+    /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
+    /// pool entry.
+    bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                EVT VT) const override;
+
+    /// ShouldShrinkFPConstant - If true, then instruction selection should
+    /// seek to shrink the FP constant of the specified type to a smaller type
+    /// in order to save space and / or reduce runtime.
+    bool ShouldShrinkFPConstant(EVT VT) const override {
+      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+      // expensive than a straight movsd. On the other hand, it's important to
+      // shrink long double fp constant since fldt is very slow.
+      return !X86ScalarSSEf64 || VT == MVT::f80;
+    }
+
+    const X86Subtarget* getSubtarget() const {
+      return Subtarget;
+    }
+
+    /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+    /// computed in an SSE register, not on the X87 floating point stack.
+    bool isScalarFPTypeInSSEReg(EVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+    }
+
+    /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
+    /// for fptoui.
+    bool isTargetFTOL() const;
+
+    /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
+    /// used for fptoui to the given type.
+    bool isIntegerTypeFTOL(EVT VT) const {
+      return isTargetFTOL() && VT == MVT::i64;
+    }
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    /// Intel processors have a unified instruction and data cache
+    const char * getClearCacheBuiltinName() const override {
+      return nullptr; // nothing to do, move along.
+    }
+
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
+
+    /// getStackCookieLocation - Return true if the target stores stack
+    /// protector cookies at a fixed offset in some non-standard address
+    /// space, and populates the address space and offset as
+    /// appropriate.
+    bool getStackCookieLocation(unsigned &AddressSpace,
+                                unsigned &Offset) const override;
+
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG) const;
+
+    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+    /// \brief Reset the operation actions based on target options.
+    void resetOperationActions() override;
+
+    /// \brief Customize the preferred legalization strategy for certain types.
+    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
+  protected:
+    std::pair<const TargetRegisterClass*, uint8_t>
+    findRepresentativeClass(MVT VT) const override;
+
+  private:
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+    const DataLayout *TD;
+
+    /// Used to store the TargetOptions so that we don't waste time resetting
+    /// the operation actions unless we have to.
+    TargetOptions TO;
+
+    /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+    /// floating point ops.
+    /// When SSE is available, use it for f32 operations.
+    /// When SSE2 is available, use it for f64 operations.
+    bool X86ScalarSSEf32;
+    bool X86ScalarSSEf64;
+
+    /// LegalFPImmediates - A list of legal fp immediates.
+    std::vector<APFloat> LegalFPImmediates;
+
+    /// addLegalFPImmediate - Indicate that this x86 target can instruction
+    /// select the specified FP immediate natively.
+    void addLegalFPImmediate(const APFloat& Imm) {
+      LegalFPImmediates.push_back(Imm);
+    }
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            SDLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerMemArgument(SDValue Chain,
+                             CallingConv::ID CallConv,
+                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+                             SDLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,  MachineFrameInfo *MFI,
+                              unsigned i) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             SDLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
+
+    // Call lowering helpers.
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                           Type *RetTy,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+    bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
+    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+                                SDValue Chain, bool IsTailCall, bool Is64Bit,
+                                int FPDiff, SDLoc dl) const;
+
+    unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+                                         SelectionDAG &DAG) const;
+
+    std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                               bool isSigned,
+                                               bool isReplace) const;
+
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
+    SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
+                               int64_t Offset, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToBT(SDValue And, ISD::CondCode CC,
+                      SDLoc dl, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerCall(CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        SDLoc dl, SelectionDAG &DAG) const override;
+
+    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+    bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+    MVT getTypeForExtArgOrReturn(MVT VT,
+                                 ISD::NodeType ExtendKind) const override;
+
+    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, max, min, umax, umin). It takes the corresponding instruction to
+    /// expand, the associated machine basic block, and the associated X86
+    /// opcodes for reg/reg.
+    MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
+    MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+                                               MachineBasicBlock *MBB) const;
+
+    // Utility function to emit the low-level va_arg code for X86-64.
+    MachineBasicBlock *EmitVAARG64WithCustomInserter(
+                       MachineInstr *MI,
+                       MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit the xmm reg save portion of va_start.
+    MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                   MachineInstr *BInstr,
+                                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+                                         MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
+                                              MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            bool Is64Bit) const;
+
+    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitFMA3Instr(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const;
+
+    /// Emit nodes that will be selected as "test Op0,Op0", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl,
+                     SelectionDAG &DAG) const;
+
+    /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl,
+                    SelectionDAG &DAG) const;
+
+    /// Convert a comparison if required by the subtarget.
+    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+  };
+
+  namespace X86 {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif    // X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 0000000..ba1aede
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+      : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+        (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+                   [(int_x86_mmx_femms)]>;
+
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+                      "prefetch\t$addr",
+                      [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+                  [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+                Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 0000000..d289408
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,4543 @@
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8f64  (bitconvert (v8i64 VR512:$src))),  (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v32i16 VR512:$src))),  (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))),  (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))),  (v16f32 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v8f64 VR512:$src))),  (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))),  (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))),  (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))),  (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v8f64 VR512:$src))),  (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
+
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+  def : Pat<(v4f64  (bitconvert (v8f32 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v8i32 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v4i64 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v32i8 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v8i32 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4i64 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4f64 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v32i8 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8f32 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8i32 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v4f64 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v32i8 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4f64 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4i64 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8f32 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8i32 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v32i8 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v8f32 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4i64 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4f64 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))),  (v16i16 VR256X:$src)>;
+}
+
+//
+// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+               [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+// -- 32x8 form --
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+          "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, f128mem:$src2, i8imm:$src3),
+          "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+}
+
+// -- 64x4 fp form --
+let hasSideEffects = 0, ExeDomain = SSEPackedDouble in {
+def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+          "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+          "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+// -- 32x4 integer form --
+let hasSideEffects = 0 in {
+def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+          "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, i128mem:$src2, i8imm:$src3),
+          "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+}
+
+let hasSideEffects = 0 in {
+// -- 64x4 form --
+def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+          "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+          "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2),
+           (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64  VR512:$src1), (v2f64 VR128X:$src2),
+           (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (v2i64 VR128X:$src2),
+           (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2),
+           (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2),
+           (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1),
+                  (bc_v4i32 (loadv2i64 addr:$src2)),
+           (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64  VR512:$src1), (loadv2f64 addr:$src2),
+           (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (loadv2i64 addr:$src2),
+           (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32  VR512:$src1), (v8f32 VR256X:$src2),
+           (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64  VR512:$src1), (v4f64 VR256X:$src2),
+           (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (v4i64 VR256X:$src2),
+           (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2),
+           (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32  VR512:$src1), (loadv8f32 addr:$src2),
+           (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64  VR512:$src1), (loadv4f64 addr:$src2),
+           (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8i64  VR512:$src1), (loadv4i64 addr:$src2),
+           (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
+	                (bc_v8i32 (loadv4i64 addr:$src2)),
+           (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+// vinsertps - insert f32 to XMM
+def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+      (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
+      "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+      EVEX_4V;
+def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+      (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
+      "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set VR128X:$dst, (X86insertps VR128X:$src1,
+                          (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                          imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+// -- 32x4 form --
+def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512;
+def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs),
+          (ins f128mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs),
+          (ins f256mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+let hasSideEffects = 0 in {
+// -- 32x4 form --
+def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512;
+def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs),
+          (ins i128mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs),
+          (ins i256mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+          (v4f32 (VEXTRACTF32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)),
+          (v4i32 (VEXTRACTF32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+          (v2f64 (VEXTRACTF32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+          (v2i64 (VEXTRACTI32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+
+def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+          (v8f32 (VEXTRACTF64x4rr VR512:$src1,
+                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)),
+          (v8i32 (VEXTRACTI64x4rr VR512:$src1,
+                    (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+          (v4f64 (VEXTRACTF64x4rr VR512:$src1,
+                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+          (v4i64 (VEXTRACTI64x4rr VR512:$src1,
+                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+// A 256-bit subvector extract from the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+          (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+          (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+          (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+          (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+
+// zmm -> xmm
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+          (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+          (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+          (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+          (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+
+
+// A 128-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+
+def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+      (ins VR128X:$src1, u32u8imm:$src2),
+      "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+      EVEX;
+
+def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+      (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2),
+      "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+                          addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, 
+                         RegisterClass DestRC,
+                         RegisterClass SrcRC, X86MemOperand x86memop> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
+         !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+         []>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),[]>, EVEX;
+}
+let ExeDomain = SSEPackedSingle in {
+  defm VBROADCASTSSZ  : avx512_fp_broadcast<0x18, "vbroadcastss", VR512,
+                                       VR128X, f32mem>,
+                                       EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VBROADCASTSDZ  : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512,
+                                       VR128X, f64mem>,
+                                       EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VBROADCASTSDZrm addr:$src)>;
+
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+          (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+          (VBROADCASTSDZrm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
+                          RegisterClass SrcRC, RegisterClass KRC> {
+  def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
+                   !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                   []>, EVEX, EVEX_V512;
+  def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), 
+                   (ins KRC:$mask, SrcRC:$src),
+                   !strconcat(OpcodeStr, 
+                        " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                   []>, EVEX, EVEX_V512, EVEX_KZ;
+}
+
+defm VPBROADCASTDr  : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
+defm VPBROADCASTQr  : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
+                                            VEX_W;
+                                            
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+           (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+           (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
+        (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))),
+        (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>;
+def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
+        (VPBROADCASTQrZrr GR64:$src)>;
+def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
+        (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
+        (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
+        (VPBROADCASTQrZrr GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src),
+                   (v16i32 immAllZerosV), (i16 GR16:$mask))),
+          (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
+                   (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
+          (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
+
+multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
+                          X86MemOperand x86memop, PatFrag ld_frag,
+                          RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
+                          RegisterClass KRC> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
+                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  [(set DstRC:$dst,
+                    (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
+  def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
+                                                         VR128X:$src),
+                    !strconcat(OpcodeStr, 
+                    " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                    [(set DstRC:$dst,
+                      (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
+                    EVEX, EVEX_KZ;
+  let mayLoad = 1 in {
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  [(set DstRC:$dst, 
+                    (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
+  def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
+                                                         x86memop:$src),
+                  !strconcat(OpcodeStr, 
+                      " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                  [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, 
+                                     (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+  }
+}
+
+defm VPBROADCASTDZ  : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
+                      loadi32, VR512, v16i32, v4i32, VK16WM>,
+                      EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPBROADCASTQZ  : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
+                      loadi64, VR512, v8i64, v2i64, VK8WM>,  EVEX_V512, VEX_W,
+                      EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+                          X86MemOperand x86memop, PatFrag ld_frag,
+                          RegisterClass KRC> {
+  let mayLoad = 1 in {
+  def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src),
+                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  []>, EVEX;
+  def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask,
+                                                         x86memop:$src),
+                  !strconcat(OpcodeStr,
+                      " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                  []>, EVEX, EVEX_KZ;
+  }
+}
+
+defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+                       i128mem, loadv2i64, VK16WM>,
+                       EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+                       i256mem, loadv4i64, VK16WM>, VEX_W,
+                       EVEX_V512, EVEX_CD8<64, CD8VT4>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
+          (VPBROADCASTDZrr VR128X:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
+          (VPBROADCASTQZrr VR128X:$src)>;
+
+def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
+          (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
+          (VBROADCASTSDZrr VR128X:$src)>;
+
+def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
+          (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
+          (VBROADCASTSDZrr VR128X:$src)>;
+    
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
+          (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
+          (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
+           (EXTRACT_SUBREG 
+              (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+                       addr:$src)), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+                       RegisterClass DstRC, RegisterClass KRC,
+                       ValueType OpVT, ValueType SrcVT> {
+def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
+                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  []>, EVEX;
+}
+
+let Predicates = [HasCDI] in {
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
+                                             VK16, v16i32, v16i1>, EVEX_V512;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
+                                            VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERM
+//
+// -- immediate form --
+multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPERMQZ  : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64,
+                        i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedDouble in 
+defm VPERMPDZ  : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, 
+                        f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM - register form --
+multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, 
+                     PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
+
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2),
+                   !strconcat(OpcodeStr,
+                       " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
+
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2),
+                   !strconcat(OpcodeStr,
+                       " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
+                     EVEX_4V;
+}
+
+defm VPERMDZ   : avx512_perm<0x36, "vpermd",  VR512,  memopv16i32, i512mem,
+                           v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMQZ   : avx512_perm<0x36, "vpermq",  VR512,  memopv8i64,  i512mem, 
+                           v8i64>,  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPSZ  : avx512_perm<0x16, "vpermps", VR512,  memopv16f32, f512mem,
+                           v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ  : avx512_perm<0x16, "vpermpd", VR512,  memopv8f64, f512mem, 
+                           v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM2I - 3 source operands form --
+multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          PatFrag mem_frag, X86MemOperand x86memop,
+                          SDNode OpNode, ValueType OpVT, RegisterClass KRC> {
+let Constraints = "$src1 = $dst" in {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                       " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
+                    EVEX_4V;
+
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                       " \t{$src3, $src2, $dst {${mask}}|"
+                       "$dst {${mask}}, $src2, $src3}"),
+                   [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                           (OpNode RC:$src1, RC:$src2,
+                                              RC:$src3),
+                                           RC:$src1)))]>,
+                    EVEX_4V, EVEX_K;
+
+  let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+    def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                       " \t{$src3, $src2, $dst {${mask}} {z} |",
+                       "$dst {${mask}} {z}, $src2, $src3}"),
+                   [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                           (OpNode RC:$src1, RC:$src2,
+                                              RC:$src3),
+                                           (OpVT (bitconvert
+                                              (v16i32 immAllZerosV))))))]>,
+                    EVEX_4V, EVEX_KZ;
+
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                    " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src1, RC:$src2,
+                      (mem_frag addr:$src3))))]>, EVEX_4V;
+
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                    " \t{$src3, $src2, $dst {${mask}}|"
+                    "$dst {${mask}}, $src2, $src3}"),
+                   [(set RC:$dst,
+                       (OpVT (vselect KRC:$mask,
+                                      (OpNode RC:$src1, RC:$src2,
+                                         (mem_frag addr:$src3)),
+                                      RC:$src1)))]>,
+                    EVEX_4V, EVEX_K;
+
+  let AddedComplexity = 10 in // Prefer over the rrkz variant
+    def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                    " \t{$src3, $src2, $dst {${mask}} {z}|"
+                    "$dst {${mask}} {z}, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (vselect KRC:$mask,
+                                    (OpNode RC:$src1, RC:$src2,
+                                            (mem_frag addr:$src3)),
+                                    (OpVT (bitconvert
+                                       (v16i32 immAllZerosV))))))]>,
+                    EVEX_4V, EVEX_KZ;
+  }
+}
+defm VPERMI2D  : avx512_perm_3src<0x76, "vpermi2d",  VR512, memopv16i32,
+                                  i512mem, X86VPermiv3, v16i32, VK16WM>,
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q  : avx512_perm_3src<0x76, "vpermi2q",  VR512, memopv8i64,
+                                  i512mem, X86VPermiv3, v8i64, VK8WM>,
+                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps",  VR512, memopv16f32,
+                                  i512mem, X86VPermiv3, v16f32, VK16WM>,
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd",  VR512, memopv8f64,
+                                  i512mem, X86VPermiv3, v8f64, VK8WM>,
+                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC,
+                          PatFrag mem_frag, X86MemOperand x86memop,
+                          SDNode OpNode, ValueType OpVT, RegisterClass KRC,
+                          ValueType MaskVT, RegisterClass MRC> :
+        avx512_perm_3src<opc, "vpermt2"##Suffix, RC, mem_frag, x86memop, OpNode,
+                         OpVT, KRC> {
+  def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512")
+                     VR512:$idx, VR512:$src1, VR512:$src2, -1)),
+            (!cast<Instruction>(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+  def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512")
+                     VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)),
+            (!cast<Instruction>(NAME#rrk) VR512:$src1,
+              (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>;
+}
+
+defm VPERMT2D  : avx512_perm_table_3src<0x7E, "d",  VR512, memopv16i32, i512mem,
+                               X86VPermv3, v16i32, VK16WM, v16i1, GR16>,
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q  : avx512_perm_table_3src<0x7E, "q",  VR512, memopv8i64, i512mem,
+                               X86VPermv3, v8i64, VK8WM, v8i1, GR8>,
+                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps",  VR512, memopv16f32, i512mem,
+                               X86VPermv3, v16f32, VK16WM, v16i1, GR16>,
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd",  VR512, memopv8f64, i512mem,
+                               X86VPermv3, v8f64, VK8WM, v8i1, GR8>,
+                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr,
+                          RegisterClass KRC, RegisterClass RC,
+                          X86MemOperand x86memop, PatFrag mem_frag,
+                          SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+             (ins KRC:$mask, RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr,
+             " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2),
+                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+  let mayLoad = 1 in
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins KRC:$mask, RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr,
+             " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_K;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", 
+                              VK16WM, VR512, f512mem,
+                              memopv16f32, vselect, v16f32>, 
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+let ExeDomain = SSEPackedDouble in
+defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", 
+                              VK8WM, VR512, f512mem,
+                              memopv8f64, vselect, v8f64>, 
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1),
+                 (v16f32 VR512:$src2), (i16 GR16:$mask))),
+        (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM),
+         VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1),
+                 (v8f64 VR512:$src2), (i8 GR8:$mask))),
+        (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM),
+         VR512:$src1, VR512:$src2)>;
+
+defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", 
+                              VK16WM, VR512, f512mem, 
+                              memopv16i32, vselect, v16i32>, 
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+
+defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", 
+                              VK8WM, VR512, f512mem, 
+                              memopv8i64, vselect, v8i64>, 
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1),
+                 (v16i32 VR512:$src2), (i16 GR16:$mask))),
+        (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16),
+         VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1),
+                 (v8i64 VR512:$src2), (i8 GR8:$mask))),
+        (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8),
+         VR512:$src1, VR512:$src2)>;
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+                            (v8f32 VR256X:$src2))),
+            (EXTRACT_SUBREG 
+              (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+                            (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG 
+                (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
+multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, SDNode OpNode, ValueType VT,
+                            PatFrag ld_frag, string asm, string asm_alt> {
+  def rr : AVX512Ii8<0xC2, MRMSrcReg,
+                (outs VK1:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+                IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+  def rm : AVX512Ii8<0xC2, MRMSrcMem,
+                (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                [(set VK1:$dst, (OpNode (VT RC:$src1),
+                (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rri_alt : AVX512Ii8<0xC2, MRMSrcReg,
+               (outs VK1:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+    def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem,
+               (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  }
+}
+
+let Predicates = [HasAVX512] in {
+defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, AVXCC, X86cmpms, f32, loadf32,
+                 "vcmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "vcmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+                 XS;
+defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64,
+                 "vcmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "vcmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+                 XD, VEX_W;
+}
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX512BI<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
+             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], 
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
+             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+}
+
+defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, 
+                           memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512,
+                           EVEX_CD8<32, CD8VF>;
+defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, 
+                           memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512,
+                           VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, 
+                           memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512,
+                           EVEX_CD8<32, CD8VF>;
+defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, 
+                           memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512,
+                           VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPGTDZrr 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPEQDZrr 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt, Operand CC, string Suffix> {
+  def rri : AVX512AIi8<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc),
+             !strconcat("vpcmp${cc}", Suffix,
+                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], 
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rmi : AVX512AIi8<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc),
+             !strconcat("vpcmp${cc}", Suffix,
+                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
+                              imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                  "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+    def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                  "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+    def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                  "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+    def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                  "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+  }
+}
+
+defm VPCMPDZ :  avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32,
+                               X86cmpm, v16i32, AVXCC, "d">,
+                EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32,
+                               X86cmpmu, v16i32, AVXCC, "ud">,
+                EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ :  avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64,
+                               X86cmpm, v8i64, AVXCC, "q">,
+                VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64,
+                               X86cmpmu, v8i64, AVXCC, "uq">,
+                VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// avx512_cmp_packed - compare packed instructions
+multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
+                           X86MemOperand x86memop, ValueType vt,
+                           string suffix, Domain d> {
+  def rri : AVX512PIi8<0xC2, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
+             !strconcat("vcmp${cc}", suffix,
+                        " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (X86cmpm (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+  def rrib: AVX512PIi8<0xC2, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
+     !strconcat("vcmp${cc}", suffix,
+                " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
+                [], d>, EVEX_B;
+  def rmi : AVX512PIi8<0xC2, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc),
+              !strconcat("vcmp${cc}", suffix,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+             [(set KRC:$dst,
+              (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
+               (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+              !strconcat("vcmp", suffix,
+                        " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
+    def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
+               (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+              !strconcat("vcmp", suffix,
+                        " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
+  }
+}
+
+defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, v16f32,
+               "ps", SSEPackedSingle>, PS, EVEX_4V, EVEX_V512,
+               EVEX_CD8<32, CD8VF>;
+defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, v8f64,
+               "pd", SSEPackedDouble>, PD, EVEX_4V, VEX_W, EVEX_V512,
+               EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VCMPPSZrri
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPUDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+
+def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1),
+                (v16f32 VR512:$src2), imm:$cc, (i16 -1),
+                 FROUND_NO_EXC)),
+          (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2,
+                             (I8Imm imm:$cc)), GR16)>;
+           
+def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
+                (v8f64 VR512:$src2), imm:$cc, (i8 -1),
+                 FROUND_NO_EXC)),
+          (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2,
+                             (I8Imm imm:$cc)), GR8)>;
+
+def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1),
+                (v16f32 VR512:$src2), imm:$cc, (i16 -1),
+                FROUND_CURRENT)),
+          (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2,
+                             (I8Imm imm:$cc)), GR16)>;
+
+def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
+                (v8f64 VR512:$src2), imm:$cc, (i8 -1),
+                 FROUND_CURRENT)),
+          (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2,
+                             (I8Imm imm:$cc)), GR8)>;
+
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+                         string OpcodeStr, RegisterClass KRC,
+                         ValueType vt, X86MemOperand x86memop> {
+  let hasSideEffects = 0 in {
+    def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+    let mayLoad = 1 in
+    def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+               [(set KRC:$dst, (vt (load addr:$src)))]>;
+    let mayStore = 1 in
+    def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+  }
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+                             string OpcodeStr,
+                             RegisterClass KRC, RegisterClass GRC> {
+  let hasSideEffects = 0 in {
+    def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+    def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+  }
+}
+
+let Predicates = [HasAVX512] in {
+  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+               VEX, PS;
+  defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+               VEX, PS;
+}
+
+let Predicates = [HasAVX512] in {
+  // GR16 from/to 16-bit mask
+  def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+            (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
+  def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+            (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+
+  // Store kreg in memory
+  def : Pat<(store (v16i1 VK16:$src), addr:$dst),
+            (KMOVWmk addr:$dst, VK16:$src)>;
+
+  def : Pat<(store VK8:$src, addr:$dst),
+            (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+  def : Pat<(i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
+
+  def : Pat<(v8i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+
+  def : Pat<(i1 (trunc (i32 GR32:$src))),
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>;
+
+  def : Pat<(i1 (trunc (i8 GR8:$src))),
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
+       VK1)>;
+  def : Pat<(i1 (trunc (i16 GR16:$src))),
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
+       VK1)>;
+            
+  def : Pat<(i32 (zext VK1:$src)),
+            (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+  def : Pat<(i8 (zext VK1:$src)),
+            (EXTRACT_SUBREG
+             (AND32ri (KMOVWrk
+                       (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+  def : Pat<(i64 (zext VK1:$src)),
+            (AND64ri8 (SUBREG_TO_REG (i64 0),
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+  def : Pat<(i16 (zext VK1:$src)),
+            (EXTRACT_SUBREG
+             (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+              sub_16bit)>;
+  def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, VK16)>;
+  def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, VK8)>;
+}
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+  // GR from/to 8-bit mask without native support
+  def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+            (COPY_TO_REGCLASS
+              (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+              VK8)>;
+  def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+            (EXTRACT_SUBREG
+              (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+              sub_8bit)>;
+
+  def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK16:$src, VK1)>;
+  def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK8:$src, VK1)>;
+
+}
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+                         RegisterClass KRC, SDPatternOperator OpNode> {
+  let Predicates = [HasAVX512] in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+               [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr,
+                               SDPatternOperator OpNode> {
+  defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                          VEX, PS;
+}
+
+defm KNOT : avx512_mask_unop_w<0x44, "knot", not>;
+
+multiclass avx512_mask_unop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+                (i16 GR16:$src)),
+              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+              (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
+}
+defm : avx512_mask_unop_int<"knot", "KNOT">;
+
+def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
+def : Pat<(xor VK8:$src1,  (v8i1 immAllOnesV)),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
+
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+def : Pat<(not VK8:$src),
+          (COPY_TO_REGCLASS
+            (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+// Mask binary operation
+// - KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+                           RegisterClass KRC, SDPatternOperator OpNode> {
+  let Predicates = [HasAVX512] in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr,
+                          " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr,
+                             SDPatternOperator OpNode> {
+  defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                           VEX_4V, VEX_L, PS;
+}
+
+def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+
+let isCommutable = 1 in {
+  defm KAND  : avx512_mask_binop_w<0x41, "kand",  and>;
+  let isCommutable = 0 in
+  defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>;
+  defm KOR   : avx512_mask_binop_w<0x45, "kor",   or>;
+  defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>;
+  defm KXOR  : avx512_mask_binop_w<0x47, "kxor",  xor>;
+}
+
+def : Pat<(xor VK1:$src1, VK1:$src2),
+     (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+                                (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
+def : Pat<(or VK1:$src1, VK1:$src2),
+     (COPY_TO_REGCLASS (KORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+                               (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
+def : Pat<(and VK1:$src1, VK1:$src2),
+     (COPY_TO_REGCLASS (KANDWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+                                (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+                (i16 GR16:$src1), (i16 GR16:$src2)),
+              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+              (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+              (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
+}
+
+defm : avx512_mask_binop_int<"kand",  "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor",   "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor",  "KXOR">;
+
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(OpNode VK8:$src1, VK8:$src2),
+              (COPY_TO_REGCLASS
+                (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+                      (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+}
+
+defm : avx512_binop_pat<and,  KANDWrr>;
+defm : avx512_binop_pat<andn, KANDNWrr>;
+defm : avx512_binop_pat<or,   KORWrr>;
+defm : avx512_binop_pat<xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor,  KXORWrr>;
+
+// Mask unpacking
+multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
+                           RegisterClass KRC> {
+  let Predicates = [HasAVX512] in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr,
+                          " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+}
+
+multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
+  defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>,
+                            VEX_4V, VEX_L, PD;
+}
+
+defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">;
+def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))),
+          (KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16),
+                  (COPY_TO_REGCLASS VK8:$src1, VK16))>;
+
+
+multiclass avx512_mask_unpck_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw")
+                (i16 GR16:$src1), (i16 GR16:$src2)),
+              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr")
+              (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+              (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
+}
+defm : avx512_mask_unpck_int<"kunpck",  "KUNPCK">;
+
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                            SDNode OpNode> {
+  let Predicates = [HasAVX512], Defs = [EFLAGS] in
+    def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr, " \t{$src2, $src1|$src1, $src2}"),
+               [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                            VEX, PS;
+}
+
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+
+def : Pat<(X86cmp VK1:$src1, (i1 0)),
+          (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+           (COPY_TO_REGCLASS VK1:$src1, VK16))>;
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                             SDNode OpNode> {
+  let Predicates = [HasAVX512] in
+    def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm),
+                 !strconcat(OpcodeStr,
+                            " \t{$imm, $src, $dst|$dst, $src, $imm}"),
+                            [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+                               SDNode OpNode> {
+  defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                             VEX, TAPD, VEX_W;
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+  let Predicates = [HasAVX512] in
+    let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+      def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+                     [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+  defm B : avx512_mask_setop<VK8,   v8i1, Val>;
+  defm W : avx512_mask_setop<VK16, v16i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+  def : Pat<(v8i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK8)>;
+  def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+  def : Pat<(i1 1), (COPY_TO_REGCLASS (KSET1W), VK1)>;
+  def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSET1W), VK1)>;
+}
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
+          (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
+
+def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
+          (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+
+def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
+          (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
+
+def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
+          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+multiclass avx512_load<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+                            X86MemOperand x86memop, PatFrag ld_frag, 
+                            string asm, Domain d,
+                            ValueType vt, bit IsReMaterializable = 1> {
+let hasSideEffects = 0 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
+              EVEX;
+  def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
+               !strconcat(asm,
+               " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+               [], d>, EVEX, EVEX_KZ;
+  }
+  let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+               [(set (vt RC:$dst), (ld_frag addr:$src))], d>, EVEX;
+  let Constraints = "$src1 = $dst",  hasSideEffects = 0 in {
+  def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), 
+                                     (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(asm, 
+              " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+              EVEX, EVEX_K;
+  let mayLoad = 1 in
+  def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+                                (ins RC:$src1, KRC:$mask, x86memop:$src2),
+              !strconcat(asm, 
+              " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+               [], d>, EVEX, EVEX_K;
+  }
+  let mayLoad = 1 in
+  def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+                      (ins KRC:$mask, x86memop:$src2),
+              !strconcat(asm,
+              " \t{$src2, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src2}"),
+               [], d>, EVEX, EVEX_KZ;
+}
+
+multiclass avx512_store<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+                            X86MemOperand x86memop, PatFrag store_frag,
+                            string asm, Domain d, ValueType vt> {
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+  def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
+              EVEX;
+  let Constraints = "$src1 = $dst" in
+  def alt_rrk : AVX512PI<opc, MRMDestReg, (outs  RC:$dst),
+                                     (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(asm,
+              " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+              EVEX, EVEX_K;
+  def alt_rrkz : AVX512PI<opc, MRMDestReg, (outs  RC:$dst),
+                                           (ins KRC:$mask, RC:$src),
+              !strconcat(asm,
+              " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+              [], d>, EVEX, EVEX_KZ;
+  }
+  let mayStore = 1 in {
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+               [(store_frag (vt RC:$src), addr:$dst)], d>, EVEX;
+  def mrk : AVX512PI<opc, MRMDestMem, (outs),
+                                (ins x86memop:$dst, KRC:$mask, RC:$src),
+              !strconcat(asm,
+              " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+               [], d>, EVEX, EVEX_K;
+  def mrkz : AVX512PI<opc, MRMDestMem, (outs),
+                      (ins x86memop:$dst, KRC:$mask, RC:$src),
+              !strconcat(asm,
+              " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+               [], d>, EVEX, EVEX_KZ;
+  }
+}
+
+defm VMOVAPSZ : avx512_load<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
+                              "vmovaps", SSEPackedSingle, v16f32>,
+                avx512_store<0x29, VR512, VK16WM, f512mem, alignedstore512,
+                              "vmovaps", SSEPackedSingle, v16f32>,
+                               PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVAPDZ : avx512_load<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
+                              "vmovapd", SSEPackedDouble, v8f64>,
+                avx512_store<0x29, VR512, VK8WM, f512mem, alignedstore512,
+                              "vmovapd", SSEPackedDouble, v8f64>,
+                              PD, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+defm VMOVUPSZ : avx512_load<0x10, VR512, VK16WM, f512mem, loadv16f32,
+                              "vmovups", SSEPackedSingle, v16f32>,
+                avx512_store<0x11, VR512, VK16WM, f512mem, store,
+                              "vmovups", SSEPackedSingle, v16f32>,
+                              PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVUPDZ : avx512_load<0x10, VR512, VK8WM, f512mem, loadv8f64,
+                              "vmovupd", SSEPackedDouble, v8f64, 0>,
+                avx512_store<0x11, VR512, VK8WM, f512mem, store,
+                              "vmovupd", SSEPackedDouble, v8f64>,
+                               PD, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
+                 (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+       (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
+                 (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
+       (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
+          GR16:$mask),
+         (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+            VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
+          GR8:$mask),
+         (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+            VR512:$src)>;
+
+defm VMOVDQA32: avx512_load<0x6F, VR512, VK16WM, i512mem, alignedloadv16i32,
+                              "vmovdqa32", SSEPackedInt, v16i32>,
+                avx512_store<0x7F, VR512, VK16WM, i512mem, alignedstore512,
+                              "vmovdqa32", SSEPackedInt, v16i32>,
+                               PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQA64: avx512_load<0x6F, VR512, VK8WM, i512mem, alignedloadv8i64,
+                              "vmovdqa64", SSEPackedInt, v8i64>,
+                avx512_store<0x7F, VR512, VK8WM, i512mem, alignedstore512,
+                              "vmovdqa64", SSEPackedInt, v8i64>,
+                               PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VMOVDQU32: avx512_load<0x6F, VR512, VK16WM, i512mem, load,
+                              "vmovdqu32", SSEPackedInt, v16i32>,
+                avx512_store<0x7F, VR512, VK16WM, i512mem, store,
+                              "vmovdqu32", SSEPackedInt, v16i32>,
+                               XS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load,
+                              "vmovdqu64", SSEPackedInt, v8i64>,
+                avx512_store<0x7F, VR512, VK8WM, i512mem, store,
+                              "vmovdqu64", SSEPackedInt, v8i64>,
+                               XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
+                 (v16i32 immAllZerosV), GR16:$mask)),
+       (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
+                 (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
+       (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
+          GR16:$mask),
+         (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+            VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
+          GR8:$mask),
+         (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+            VR512:$src)>;
+
+let AddedComplexity = 20 in {
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
+                           (bc_v8i64 (v16i32 immAllZerosV)))),
+                  (VMOVDQU64rrkz VK8WM:$mask, VR512:$src)>;
+
+def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+                  (v8i64 VR512:$src))),
+   (VMOVDQU64rrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+                                              VK8), VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
+                           (v16i32 immAllZerosV))),
+                  (VMOVDQU32rrkz VK16WM:$mask, VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
+                   (v16i32 VR512:$src))),
+   (VMOVDQU32rrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+                                              
+def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), 
+                           (v16f32 VR512:$src2))),
+                  (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), 
+                           (v8f64 VR512:$src2))),
+                  (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), 
+                           (v16i32 VR512:$src2))),
+                  (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), 
+                           (v8i64 VR512:$src2))),
+                  (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+}
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        EVEX, VEX_LIG;
+def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                        [(set VR128X:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "vmovq\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                         "vmovq\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64:$src))],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+}
+def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "vmovq\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+                         EVEX_CD8<64, CD8VT1>;
+
+// Move Int Doubleword to Single Scalar
+//
+let isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr  : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert GR32:$src))],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
+
+def VMOVDI2SSZrm  : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move doubleword from xmm register to r/m32
+//
+def VMOVPDI2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+                       "vmovd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                       EVEX, VEX_LIG;
+def VMOVPDI2DIZmr  : AVX512BI<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128X:$src),
+                       "vmovd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+
+// Move quadword from xmm1 register to r/m64
+//
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+                                                   (iPTR 0)))],
+                      IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W,
+                      Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs),
+                       (ins i64mem:$dst, VR128X:$src),
+                       "vmovq\t{$src, $dst|$dst, $src}",
+                       [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+                               addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>,
+                       Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+// Move Scalar Single to Double Int
+//
+let isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+                      (ins FR32X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32X:$src))],
+                      IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
+def VMOVSS2DIZmr  : AVX512BI<0x7E, MRMDestMem, (outs),
+                      (ins i32mem:$dst, FR32X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Quadword Int to Packed Quadword Int
+//
+def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+                      EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_move_scalar <string asm, RegisterClass RC, 
+                              SDNode OpNode, ValueType vt,
+                              X86MemOperand x86memop, PatFrag mem_pat> {
+  let hasSideEffects = 0 in {
+  def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), 
+              !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
+                                      (scalar_to_vector RC:$src2))))],
+              IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
+  let Constraints = "$src1 = $dst" in
+  def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst),
+              (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3),
+              !strconcat(asm,
+                " \t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"),
+              [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K;
+  def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+              [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
+              EVEX, VEX_LIG;
+  def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+             !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+             [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+             EVEX, VEX_LIG;
+  } //hasSideEffects = 0
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem,
+                                 loadf32>, XS, EVEX_CD8<32, CD8VT1>;
+
+let ExeDomain = SSEPackedDouble in
+defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem,
+                                 loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+          (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+           VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
+
+def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+          (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+           VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
+
+// For the disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+  def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+                        (ins VR128X:$src1, FR32X:$src2),
+                        "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                        IIC_SSE_MOV_S_RR>,
+                        XS, EVEX_4V, VEX_LIG;
+  def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+                        (ins VR128X:$src1, FR64X:$src2),
+                        "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                        IIC_SSE_MOV_S_RR>,
+                        XD, EVEX_4V, VEX_LIG, VEX_W;
+}
+
+let Predicates = [HasAVX512] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128X then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+            (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+            (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+            (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+            (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4f32 (V_SET0)), 
+              (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4i32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+                                            FR32X:$src)), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+                                     FR64X:$src)), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDZrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4i32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+  def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4f32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+                                (ins VR128X:$src),
+                                "vmovq\t{$src, $dst|$dst, $src}",
+                                [(set VR128X:$dst, (v2i64 (X86vzmovl 
+                                                   (v2i64 VR128X:$src))))],
+                                IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+                                 (ins i128mem:$src),
+                                 "vmovq\t{$src, $dst|$dst, $src}",
+                                 [(set VR128X:$dst, (v2i64 (X86vzmovl
+                                                     (loadv2i64 addr:$src))))],
+                                 IIC_SSE_MOVDQ>, EVEX, VEX_W,
+                                 EVEX_CD8<8, CD8VT8>;
+
+let Predicates = [HasAVX512] in {
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+              (VMOV64toPQIZrr GR64:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIZrr GR32:$src)>;
+              
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVZPQILo2PQIZrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+            (VMOVZPQILo2PQIZrr VR128X:$src)>;
+    def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVZPQILo2PQIZrm addr:$src)>;
+  }
+
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+}
+
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+
+def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst),
+                            (ins i512mem:$src),
+                            "vmovntdqa\t{$src, $dst|$dst, $src}",
+                            [(set VR512:$dst,
+                              (int_x86_avx512_movntdqa addr:$src))]>,
+                   EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// Prefer non-temporal over temporal versions
+let AddedComplexity = 400, SchedRW = [WriteStore] in {
+
+def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs),
+                            (ins f512mem:$dst, VR512:$src),
+                            "vmovntps\t{$src, $dst|$dst, $src}",
+                            [(alignednontemporalstore (v16f32 VR512:$src),
+                                                      addr:$dst)],
+                            IIC_SSE_MOVNT>,
+                  EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
+                            (ins f512mem:$dst, VR512:$src),
+                            "vmovntpd\t{$src, $dst|$dst, $src}",
+                            [(alignednontemporalstore (v8f64 VR512:$src),
+                                                      addr:$dst)],
+			    IIC_SSE_MOVNT>,
+                  EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+
+def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
+                           (ins i512mem:$dst, VR512:$src),
+                           "vmovntdq\t{$src, $dst|$dst, $src}",
+                           [(alignednontemporalstore (v8i64 VR512:$src),
+                                                     addr:$dst)],
+                           IIC_SSE_MOVNT>,
+                  EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass KRC,
+                        RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop, PatFrag scalar_mfrag,
+                        X86MemOperand x86scalar_mop, string BrdcstStr,
+                        OpndItins itins, bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+    def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+              (ins RC:$src1, RC:$src2),
+              !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
+              itins.rr>, EVEX_4V;
+  let AddedComplexity = 30 in {
+    let Constraints = "$src0 = $dst" in
+      def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+                 (ins RC:$src0, KRC:$mask, RC:$src1, RC:$src2),
+                 !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+                 [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                  (OpNode (OpVT RC:$src1), (OpVT RC:$src2)),
+                                  RC:$src0)))],
+                 itins.rr>, EVEX_4V, EVEX_K;
+    def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+                (ins KRC:$mask, RC:$src1, RC:$src2),
+                !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" ,
+                    "|$dst {${mask}} {z}, $src1, $src2}"),
+                [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                  (OpNode (OpVT RC:$src1), (OpVT RC:$src2)),
+                                  (OpVT immAllZerosV))))],
+                itins.rr>, EVEX_4V, EVEX_KZ;
+  }
+
+  let mayLoad = 1 in {
+    def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+              (ins RC:$src1, x86memop:$src2),
+              !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
+              itins.rm>, EVEX_4V;
+    let AddedComplexity = 30 in {
+    let Constraints = "$src0 = $dst" in
+      def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins RC:$src0, KRC:$mask, RC:$src1, x86memop:$src2),
+                 !strconcat(OpcodeStr,
+                     " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+                 [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                    (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)),
+                                    RC:$src0)))],
+                 itins.rm>, EVEX_4V, EVEX_K;
+    def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                (ins KRC:$mask, RC:$src1, x86memop:$src2),
+                !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+                [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                    (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)),
+                                    (OpVT immAllZerosV))))],
+                itins.rm>, EVEX_4V, EVEX_KZ;
+    }
+    def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, x86scalar_mop:$src2),
+               !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                          ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+               [(set RC:$dst, (OpNode RC:$src1,
+                               (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+               itins.rm>, EVEX_4V, EVEX_B;
+    let AddedComplexity = 30 in {
+    let Constraints = "$src0 = $dst" in
+      def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                  (ins RC:$src0, KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+                  !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                             ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}",
+                             BrdcstStr, "}"),
+                  [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                    (OpNode (OpVT RC:$src1),
+                                     (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))),
+                                    RC:$src0)))],
+                  itins.rm>, EVEX_4V, EVEX_B, EVEX_K;
+    def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+                 !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                            ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
+                            BrdcstStr, "}"),
+                 [(set RC:$dst, (OpVT (vselect KRC:$mask,
+                                    (OpNode (OpVT RC:$src1),
+                                     (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))),
+                                    (OpVT immAllZerosV))))],
+                 itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ;
+    }
+  }
+}
+
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT,
+                            ValueType SrcVT, RegisterClass KRC, RegisterClass RC,
+                            PatFrag memop_frag, X86MemOperand x86memop,
+                            PatFrag scalar_mfrag, X86MemOperand x86scalar_mop,
+                            string BrdcstStr, OpndItins itins, bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  {
+    def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       []>, EVEX_4V;
+    def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+               (ins KRC:$mask, RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+                  " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+               [], itins.rr>, EVEX_4V, EVEX_K;
+    def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+                (ins KRC:$mask, RC:$src1, RC:$src2),
+                !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" ,
+                    "|$dst {${mask}} {z}, $src1, $src2}"),
+                [], itins.rr>, EVEX_4V, EVEX_KZ;
+  }
+  let mayLoad = 1 in {
+    def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+              (ins RC:$src1, x86memop:$src2),
+              !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              []>, EVEX_4V;
+    def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+               (ins KRC:$mask, RC:$src1, x86memop:$src2),
+               !strconcat(OpcodeStr,
+                   " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+               [], itins.rm>, EVEX_4V, EVEX_K;
+    def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                (ins KRC:$mask, RC:$src1, x86memop:$src2),
+                !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+                [], itins.rm>, EVEX_4V, EVEX_KZ;
+    def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, x86scalar_mop:$src2),
+               !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                          ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+               [], itins.rm>, EVEX_4V, EVEX_B;
+    def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+                !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                           ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}",
+                           BrdcstStr, "}"),
+                [], itins.rm>, EVEX_4V, EVEX_B, EVEX_K;
+    def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+                 !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                            ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
+                            BrdcstStr, "}"),
+                 [], itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ;
+  }
+}
+
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512,
+                   EVEX_CD8<64, CD8VF>, VEX_W;
+
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTMUL_ITINS_P, 1>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
+
+def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
+          (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_pmulu_dq_512 (v16i32 VR512:$src1),
+           (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1),
+           (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VPMULDQZrr VR512:$src1, VR512:$src2)>;
+
+defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 1>,
+                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 0>,
+                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 1>,
+                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 0>,
+                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 1>,
+                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 0>,
+                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512,
+                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                   SSE_INTALU_ITINS_P, 1>,
+                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512,
+                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   SSE_INTALU_ITINS_P, 0>,
+                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMAXSDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pmaxu_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMAXUDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmaxs_q_512 (v8i64 VR512:$src1),
+                (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMAXSQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmaxu_q_512 (v8i64 VR512:$src1),
+                (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMAXUQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pmins_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMINSDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pminu_d_512 (v16i32 VR512:$src1),
+                    (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+           (VPMINUDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmins_q_512 (v8i64 VR512:$src1),
+                (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMINSQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pminu_q_512 (v8i64 VR512:$src1),
+                (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VPMINUQZrr VR512:$src1, VR512:$src2)>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d> {
+    def rr : AVX512PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))],
+                           d>, EVEX_4V;
+    def rm : AVX512PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                       (vt (OpNode RC:$src1,
+                            (bitconvert (mem_frag addr:$src2)))))],
+                        d>, EVEX_4V;
+}
+
+defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64,
+      VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64,
+      VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64,
+      VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64,
+      VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop> {
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], 
+       IIC_SSE_UNPCK>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))],
+                                     IIC_SSE_UNPCK>, EVEX_4V;
+}
+defm VPUNPCKLDQZ  : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32,
+                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64,
+                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPUNPCKHDQZ  : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
+                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
+                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+                      i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+                      memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512,
+                      EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+                      memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512,
+                      VEX_W, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPDZri VR512:$src1, imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPORDZ  : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPORQZ  : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512,
+                      memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                      SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512,
+                      memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                      SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32X,
+                             f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG,
+                             EVEX_CD8<32, CD8VT1>;
+  defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64X,
+                             f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG,
+                             EVEX_CD8<64, CD8VT1>;
+}
+
+let isCommutable = 1 in {
+defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>;
+defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>;
+defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>;
+defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>;
+}
+let isCommutable = 0 in {
+defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>;
+defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
+}
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass KRC,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+                           string BrdcstStr,
+                           Domain d, OpndItins itins, bit commutable> {
+  let isCommutable = commutable in {
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       EVEX_4V;
+
+    def rrk: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr,
+           " \t{$src2, $src1, $dst {${mask}} |$dst {${mask}}, $src1, $src2}"),
+       [], itins.rr, d>, EVEX_4V, EVEX_K;
+
+    def rrkz: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr,
+           " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+       [], itins.rr, d>, EVEX_4V, EVEX_KZ;
+  }
+
+  let mayLoad = 1 in {
+    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+          itins.rm, d>, EVEX_4V;
+
+    def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+           ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+       [(set RC:$dst, (OpNode RC:$src1, 
+                       (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+       itins.rm, d>, EVEX_4V, EVEX_B;
+
+    def rmk : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr,
+           "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], itins.rm, d>, EVEX_4V, EVEX_K;
+
+    def rmkz : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr,
+           "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+       [], itins.rm, d>, EVEX_4V, EVEX_KZ;
+
+    def rmbk : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr,
+           " \t{${src2}", BrdcstStr,
+           ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"),
+       [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_K;
+
+    def rmbkz : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr,
+           " \t{${src2}", BrdcstStr,
+           ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
+           BrdcstStr, "}"),
+       [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_KZ;
+  }
+}
+
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, 
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+                   
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VK16WM, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>, 
+                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>, 
+                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1),
+                   (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
+                   (i16 -1), FROUND_CURRENT)),
+          (VMAXPSZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1),
+                   (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
+                   (i8 -1), FROUND_CURRENT)),
+          (VMAXPDZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1),
+                   (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
+                   (i16 -1), FROUND_CURRENT)),
+          (VMINPSZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1),
+                   (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
+                   (i8 -1), FROUND_CURRENT)),
+          (VMINPDZrr VR512:$src1, VR512:$src2)>;
+//===----------------------------------------------------------------------===//
+// AVX-512  VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX512PI<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
+             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))],
+             SSEPackedInt>, EVEX_4V;
+  def rm : AVX512PI<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
+             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), 
+              (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V;
+}
+
+defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512,  f512mem,
+                              memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512,
+                              EVEX_CD8<32, CD8VF>;
+defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512,  f512mem,
+                              memopv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+
+let Predicates = [HasCDI] in {
+defm VPTESTNMDZ  : avx512_vptest<0x27, "vptestnmd", VK16, VR512,  f512mem,
+                              memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512,
+                              EVEX_CD8<32, CD8VF>;
+defm VPTESTNMQZ  : avx512_vptest<0x27, "vptestnmq", VK8, VR512,  f512mem,
+                              memopv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+}
+
+def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1),
+                 (v16i32 VR512:$src2), (i16 -1))),
+                 (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>;
+
+def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1),
+                 (v8i64 VR512:$src2), (i8 -1))),
+                 (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>;
+//===----------------------------------------------------------------------===//
+// AVX-512  Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                         string OpcodeStr, SDNode OpNode, RegisterClass RC,
+                         ValueType vt, X86MemOperand x86memop, PatFrag mem_frag,
+                         RegisterClass KRC> {
+  def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+       (ins RC:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))],
+        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+  def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr,
+                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+  def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+       (ins x86memop:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpNode (mem_frag addr:$src1),
+                     (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+  def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+       (ins KRC:$mask, x86memop:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr,
+                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          RegisterClass RC, ValueType vt, ValueType SrcVT,
+                          PatFrag bc_frag, RegisterClass KRC> {
+  // src2 is always 128-bit
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, VR128X:$src2),
+           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
+        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+  def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, VR128X:$src2),
+           !strconcat(OpcodeStr,
+                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1,
+                       (bc_frag (memopv2i64 addr:$src2)))))],
+                        SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+  def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr,
+                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VF>;
+defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag> {
+  def rr  : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
+             EVEX_4V;
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
+             EVEX_4V;
+}
+
+defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, 
+                        X86MemOperand x86memop, PatFrag memop_frag> {
+def rr  : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm  : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst,
+                      (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+          (VMOVDDUPZrm addr:$src)>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+  def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
+  let mayLoad = 1 in
+  def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
+}
+
+defm VMOVSHDUPZ  : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       EVEX_CD8<32, CD8VF>;
+defm VMOVSLDUPZ  : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))),
+           (VMOVSHDUPZrm addr:$src)>;
+def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))),
+           (VMOVSLDUPZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+           IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+          IIC_SSE_MOV_LH>, EVEX_4V;
+
+let Predicates = [HasAVX512] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+            (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr,
+            RegisterClass RC, X86MemOperand x86memop,
+            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+  def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+          (ins RC:$src1, RC:$src2, RC:$src3),
+          !strconcat(OpcodeStr," \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>;
+
+  let mayLoad = 1 in
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+          (ins RC:$src1, RC:$src2, x86memop:$src3),
+          !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+                                               (mem_frag addr:$src3))))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86scalar_mop:$src3),
+           !strconcat(OpcodeStr, " \t{${src3}", BrdcstStr,
+            ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+           (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD213PSZ    : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMSUB213PSZ    : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmaddsub, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsubadd, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD213PSZ   : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB213PSZ   : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD213PDZ    : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmadd, v8f64>, EVEX_V512,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB213PDZ    : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr,
+            RegisterClass RC, X86MemOperand x86memop,
+            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+  let mayLoad = 1 in
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+          (ins RC:$src1, RC:$src3, x86memop:$src2),
+          !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"),
+          [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src3, x86scalar_mop:$src2),
+           !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+            ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"),
+           [(set RC:$dst, (OpNode RC:$src1, 
+           (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD132PSZ    : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMSUB132PSZ    : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmaddsub, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsubadd, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD132PSZ   : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB132PSZ   : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD132PDZ    : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmadd, v8f64>, EVEX_V512,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB132PDZ    : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+}
+
+// Scalar FMA
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 
+                 RegisterClass RC, ValueType OpVT, 
+                 X86MemOperand x86memop, Operand memop, 
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def r     : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+  let mayLoad = 1 in
+  def m     : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, f128mem:$src3),
+                   !strconcat(OpcodeStr,
+                              " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1,
+                            (mem_frag addr:$src3))))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+
+defm VFMADDSSZ  : avx512_fma3s_rm<0xA9, "vfmadd213ss", X86Fmadd, FR32X,
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMADDSDZ  : avx512_fma3s_rm<0xA9, "vfmadd213sd", X86Fmadd, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFMSUBSSZ  : avx512_fma3s_rm<0xAB, "vfmsub213ss", X86Fmsub, FR32X,
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMSUBSDZ  : avx512_fma3s_rm<0xAB, "vfmsub213sd", X86Fmsub, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMADDSSZ  : avx512_fma3s_rm<0xAD, "vfnmadd213ss", X86Fnmadd, FR32X,
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMADDSDZ  : avx512_fma3s_rm<0xAD, "vfnmadd213sd", X86Fnmadd, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMSUBSSZ  : avx512_fma3s_rm<0xAF, "vfnmsub213ss", X86Fnmsub, FR32X,
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMSUBSDZ  : avx512_fma3s_rm<0xAF, "vfnmsub213sd", X86Fnmsub, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let hasSideEffects = 0 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+} // hasSideEffects = 0
+}
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ   : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}">,
+                                  XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}">,
+                                  XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ   : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}">,
+                                  XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+          (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+          (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+          (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+          (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ   : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}">,
+                                  XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}">,
+                                  XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ   : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}">,
+                                  XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+                          string asm> {
+let hasSideEffects = 0 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG,
+              Requires<[HasAVX512]>;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG,
+              Requires<[HasAVX512]>;
+} // hasSideEffects = 0
+}
+let Predicates = [HasAVX512] in {
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ:    avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
+                                   ssmem, sse_load_f32, "cvtss2si">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z:  avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64,
+                                   ssmem, sse_load_f32, "cvtss2si">,
+                                   XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ:   avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi,
+                                   ssmem, sse_load_f32, "cvtss2usi">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+                                   int_x86_avx512_cvtss2usi64, ssmem,
+                                   sse_load_f32, "cvtss2usi">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ:    avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
+                                   sdmem, sse_load_f64, "cvtsd2si">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z:  avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64,
+                                   sdmem, sse_load_f64, "cvtsd2si">,
+                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ:   avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi,
+                                   sdmem, sse_load_f64, "cvtsd2usi">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+                                   int_x86_avx512_cvtsd2usi64, sdmem,
+                                   sse_load_f64, "cvtsd2usi">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+
+let isCodeGenOnly = 1 in {
+  defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+            SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+  defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+            SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+  defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+            SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+  defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+            SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+  defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+            int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}",
+            SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+  defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+            int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}",
+            SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+  defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+            int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}",
+            SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+  defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+            int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}",
+            SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+} // isCodeGenOnly = 1
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+let isCodeGenOnly = 1 in {
+  defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
+                                     ssmem, sse_load_f32, "cvttss2si">,
+                                     XS, EVEX_CD8<32, CD8VT1>;
+  defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+                                     int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                     "cvttss2si">, XS, VEX_W,
+                                     EVEX_CD8<32, CD8VT1>;
+  defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
+                                     sdmem, sse_load_f64, "cvttsd2si">, XD,
+                                     EVEX_CD8<64, CD8VT1>;
+  defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+                                     int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                     "cvttsd2si">, XD, VEX_W,
+                                     EVEX_CD8<64, CD8VT1>;
+  defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                     int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
+                                     "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>;
+  defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+                                     int_x86_avx512_cvttss2usi64, ssmem,
+                                     sse_load_f32, "cvttss2usi">, XS, VEX_W,
+                                     EVEX_CD8<32, CD8VT1>;
+  defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                     int_x86_avx512_cvttsd2usi,
+                                     sdmem, sse_load_f64, "cvttsd2usi">, XD,
+                                     EVEX_CD8<64, CD8VT1>;
+  defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+                                     int_x86_avx512_cvttsd2usi64, sdmem,
+                                     sse_load_f64, "cvttsd2usi">, XD, VEX_W,
+                                     EVEX_CD8<64, CD8VT1>;
+} // isCodeGenOnly = 1
+
+multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+                         string asm> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
+}
+
+defm VCVTTSS2SIZ    : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem,
+                                  loadf32, "cvttss2si">, XS,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USIZ   : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem,
+                                  loadf32, "cvttss2usi">, XS,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z  : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem,
+                                  loadf32, "cvttss2si">, XS, VEX_W,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem,
+                                  loadf32, "cvttss2usi">, XS, VEX_W,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ    : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem,
+                                  loadf64, "cvttsd2si">, XD,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USIZ   : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem,
+                                  loadf64, "cvttsd2usi">, XD,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z  : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem,
+                                  loadf64, "cvttsd2si">, XD, VEX_W,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem,
+                                  loadf64, "cvttsd2usi">, XD, VEX_W,
+                                  EVEX_CD8<64, CD8VT1>;
+} // HasAVX512
+//===----------------------------------------------------------------------===//
+// AVX-512  Convert form float to double and back
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in {
+def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
+                    (ins FR32X:$src1, FR32X:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
+                    (ins FR32X:$src1, f32mem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+                    EVEX_CD8<32, CD8VT1>;
+
+// Convert scalar double to scalar single
+def VCVTSD2SSZrr  : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
+                      (ins FR64X:$src1, FR64X:$src2),
+                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSZrm  : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
+                      (ins FR64X:$src1, f64mem:$src2),
+                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      []>, EVEX_4V, VEX_LIG, VEX_W,
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>,
+      Requires<[HasAVX512]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+      Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
+    Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
+           Requires<[HasAVX512]>;
+
+multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, 
+               RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, 
+               X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
+               Domain d> {
+let hasSideEffects = 0 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+  def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
+              !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"),
+              [], d>, EVEX, EVEX_B, EVEX_RC;
+  let mayLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
+} // hasSideEffects = 0
+}
+
+multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC,
+               RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
+               X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
+               Domain d> {
+let hasSideEffects = 0 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+  let mayLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
+} // hasSideEffects = 0
+}
+
+defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
+                                memopv8f64, f512mem, v8f32, v8f64,
+                                SSEPackedSingle>, EVEX_V512, VEX_W, PD,
+                                EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
+                                memopv4f64, f256mem, v8f64, v8f32,
+                                SSEPackedDouble>, EVEX_V512, PS,
+                                EVEX_CD8<32, CD8VH>;
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+            
+def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
+                   (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))),
+          (VCVTPD2PSZrr VR512:$src)>;
+
+def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
+                   (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), imm:$rc)),
+          (VCVTPD2PSZrrb VR512:$src, imm:$rc)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Vector convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
+                                memopv8i64, i512mem, v16f32, v16i32,
+                                SSEPackedSingle>, EVEX_V512, PS,
+                                EVEX_CD8<32, CD8VF>;
+
+defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
+                                memopv4i64, i256mem, v8f64, v8i32,
+                                SSEPackedDouble>, EVEX_V512, XS,
+                                EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
+                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 SSEPackedSingle>, EVEX_V512, XS,
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
+                                 memopv8f64, f512mem, v8i32, v8f64, 
+                                 SSEPackedDouble>, EVEX_V512, PD, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
+                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 SSEPackedSingle>, EVEX_V512, PS,
+                                 EVEX_CD8<32, CD8VF>;
+
+// cvttps2udq (src, 0, mask-all-ones, sae-current)
+def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src),
+                   (v16i32 immAllZerosV), (i16 -1), FROUND_CURRENT)),
+          (VCVTTPS2UDQZrr VR512:$src)>;
+
+defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
+                                 memopv8f64, f512mem, v8i32, v8f64,
+                                 SSEPackedDouble>, EVEX_V512, PS, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+                                 
+// cvttpd2udq (src, 0, mask-all-ones, sae-current)
+def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src),
+                   (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)),
+          (VCVTTPD2UDQZrr VR512:$src)>;
+
+defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
+                                 memopv4i64, f256mem, v8f64, v8i32,
+                                 SSEPackedDouble>, EVEX_V512, XS,
+                                 EVEX_CD8<32, CD8VH>;
+                                 
+defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
+                                 memopv16i32, f512mem, v16f32, v16i32,
+                                 SSEPackedSingle>, EVEX_V512, XD,
+                                 EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr 
+           (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+                                 
+def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+           (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+           (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+           
+def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+           (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+           (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
+                   (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
+          (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>;
+def : Pat<(v8f64 (int_x86_avx512_mask_cvtdq2pd_512 (v8i32 VR256X:$src),
+                   (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VCVTDQ2PDZrr VR256X:$src)>;
+def : Pat<(v16f32 (int_x86_avx512_mask_cvtudq2ps_512 (v16i32 VR512:$src),
+                   (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
+          (VCVTUDQ2PSZrrb VR512:$src, imm:$rc)>;
+def : Pat<(v8f64 (int_x86_avx512_mask_cvtudq2pd_512 (v8i32 VR256X:$src),
+                   (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VCVTUDQ2PDZrr VR256X:$src)>;
+
+multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC,
+               RegisterClass DstRC, PatFrag mem_frag,
+               X86MemOperand x86memop, Domain d> {
+let hasSideEffects = 0 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [], d>, EVEX;
+  def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
+              !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"),
+              [], d>, EVEX, EVEX_B, EVEX_RC;
+  let mayLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              [], d>, EVEX;
+} // hasSideEffects = 0
+}
+
+defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512,
+                                 memopv16f32, f512mem, SSEPackedSingle>, PD,
+                                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X,
+                                 memopv8f64, f512mem, SSEPackedDouble>, XD, VEX_W,
+                                 EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src),
+                    (v16i32 immAllZerosV), (i16 -1), imm:$rc)),
+           (VCVTPS2DQZrrb VR512:$src, imm:$rc)>;
+
+def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src),
+                    (v8i32 immAllZerosV), (i8 -1), imm:$rc)),
+           (VCVTPD2DQZrrb VR512:$src, imm:$rc)>;
+
+defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512,
+                                 memopv16f32, f512mem, SSEPackedSingle>,
+                                 PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X,
+                                 memopv8f64, f512mem, SSEPackedDouble>, VEX_W,
+                                 PS, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src),
+                    (v16i32 immAllZerosV), (i16 -1), imm:$rc)),
+           (VCVTPS2UDQZrrb VR512:$src, imm:$rc)>;
+
+def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2udq_512 (v8f64 VR512:$src),
+                    (v8i32 immAllZerosV), (i8 -1), imm:$rc)),
+           (VCVTPD2UDQZrrb VR512:$src, imm:$rc)>;
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
+            (VCVTPD2PSZrm addr:$src)>;
+  def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC,
+                             X86MemOperand x86memop> {
+  def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             []>, EVEX;
+  let hasSideEffects = 0, mayLoad = 1 in
+  def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
+}
+
+multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC,
+                             X86MemOperand x86memop> {
+  def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
+               (ins srcRC:$src1, i32i8imm:$src2),
+               "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               []>, EVEX;
+  let hasSideEffects = 0, mayStore = 1 in
+  def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
+               "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+}
+
+defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VH>;
+defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VH>;
+
+def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src),
+           imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))),
+           (VCVTPS2PHZrr VR512:$src, imm:$rc)>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src),
+           (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))),
+           (VCVTPH2PSZrr VR256X:$src)>;
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
+                                 "ucomiss">, PS, EVEX, VEX_LIG,
+                                 EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd">, PD, EVEX,
+                                  VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  let Pattern = []<dag> in {
+    defm VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
+                                   "comiss">, PS, EVEX, VEX_LIG,
+                                   EVEX_CD8<32, CD8VT1>;
+    defm VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
+                                   "comisd">, PD, EVEX,
+                                    VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+  let isCodeGenOnly = 1 in {
+    defm Int_VUCOMISSZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+                              load, "ucomiss">, PS, EVEX, VEX_LIG,
+                              EVEX_CD8<32, CD8VT1>;
+    defm Int_VUCOMISDZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+                              load, "ucomisd">, PD, EVEX,
+                              VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+    defm Int_VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+                              load, "comiss">, PS, EVEX, VEX_LIG,
+                              EVEX_CD8<32, CD8VT1>;
+    defm Int_VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+                              load, "comisd">, PD, EVEX,
+                              VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+  
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                            X86MemOperand x86memop> {
+  let hasSideEffects = 0 in {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+  let mayLoad = 1 in {
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, x86memop:$src2),
+               !strconcat(OpcodeStr,
+               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+  }
+}
+}
+
+defm VRCP14SS   : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>,
+                  EVEX_CD8<32, CD8VT1>;
+defm VRCP14SD   : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>,
+                  VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VRSQRT14SS   : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>,
+                  EVEX_CD8<32, CD8VT1>;
+defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>,
+                  VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1),
+              (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))),
+           (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1),
+              (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))),
+           (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1),
+              (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))),
+           (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1),
+              (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))),
+           (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
+multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         RegisterClass RC, X86MemOperand x86memop,
+                         PatFrag mem_frag, ValueType OpVt> {
+  def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                        !strconcat(OpcodeStr,
+                                   " \t{$src, $dst|$dst, $src}"),
+                        [(set RC:$dst, (OpVt (OpNode RC:$src)))]>,
+                        EVEX;
+  def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                        !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                        [(set RC:$dst, (OpVt (OpNode (mem_frag addr:$src))))]>,
+                        EVEX;
+}
+defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem,
+                        memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem,
+                        memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem,
+                        memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem,
+                        memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src),
+              (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
+           (VRSQRT14PSZr VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src),
+              (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VRSQRT14PDZr VR512:$src)>;
+
+def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src),
+              (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
+           (VRCP14PSZr VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src),
+              (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+           (VRCP14PDZr VR512:$src)>;
+
+/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                            X86MemOperand x86memop> {
+  let hasSideEffects = 0, Predicates = [HasERI] in {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+  def rrb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+               " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
+               []>, EVEX_4V, EVEX_B;
+  let mayLoad = 1 in {
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, x86memop:$src2),
+               !strconcat(OpcodeStr,
+               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+  }
+}
+}
+
+defm VRCP28SS   : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>,
+                  EVEX_CD8<32, CD8VT1>;
+defm VRCP28SD   : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>,
+                  VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VRSQRT28SS   : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>,
+                  EVEX_CD8<32, CD8VT1>;
+defm VRSQRT28SD   : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>,
+                  VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1),
+              (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),
+                   FROUND_NO_EXC)),
+           (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1),
+              (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),
+                   FROUND_NO_EXC)),
+           (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1),
+              (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),
+                   FROUND_NO_EXC)),
+           (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1),
+              (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),
+                   FROUND_NO_EXC)),
+           (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr,
+                         RegisterClass RC, X86MemOperand x86memop> {
+  let hasSideEffects = 0, Predicates = [HasERI] in {
+  def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                        !strconcat(OpcodeStr,
+                                   " \t{$src, $dst|$dst, $src}"),
+                        []>, EVEX;
+  def rb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                        !strconcat(OpcodeStr,
+                                   " \t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+                        []>, EVEX, EVEX_B;
+  def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                        !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                        []>, EVEX;
+  }
+}
+defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>,
+                        EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>,
+                        VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>,
+                        EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>,
+                        VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src),
+              (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
+           (VRSQRT28PSZrb VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src),
+              (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
+           (VRSQRT28PDZrb VR512:$src)>;
+
+def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src),
+              (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
+           (VRCP28PSZrb VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src),
+              (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
+           (VRCP28PDZrb VR512:$src)>;
+
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              Intrinsic V16F32Int, Intrinsic V8F64Int,
+                              OpndItins itins_s, OpndItins itins_d> {
+  def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+             !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+             [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>,
+             EVEX, EVEX_V512;
+
+  let mayLoad = 1 in
+  def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+              [(set VR512:$dst, 
+                (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))],
+              itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+  def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>,
+              EVEX, EVEX_V512;
+
+  let mayLoad = 1 in
+    def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR512:$dst, (OpNode
+                  (v8f64 (bitconvert (memopv16f32 addr:$src)))))],
+                itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+let isCodeGenOnly = 1 in {
+  def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V16F32Int VR512:$src))]>, 
+                           EVEX, EVEX_V512;
+  def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                          !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR512:$dst, 
+                           (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+                          EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V8F64Int VR512:$src))]>, 
+                           EVEX, EVEX_V512, VEX_W;
+  def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "pd\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>,
+                         EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; 
+} // isCodeGenOnly = 1
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
+                          Intrinsic F32Int, Intrinsic F64Int,
+                          OpndItins itins_s, OpndItins itins_d> {
+  def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
+               (ins FR32X:$src1, FR32X:$src2),
+               !strconcat(OpcodeStr,
+                          "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [], itins_s.rr>, XS, EVEX_4V;
+  let isCodeGenOnly = 1 in
+  def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+               (ins VR128X:$src1, VR128X:$src2),
+               !strconcat(OpcodeStr,
+                "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set VR128X:$dst, 
+                 (F32Int VR128X:$src1, VR128X:$src2))],
+               itins_s.rr>, XS, EVEX_4V;
+  let mayLoad = 1 in {
+  def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
+               (ins FR32X:$src1, f32mem:$src2),
+               !strconcat(OpcodeStr,
+                          "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  let isCodeGenOnly = 1 in
+  def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+                   (ins VR128X:$src1, ssmem:$src2),
+                   !strconcat(OpcodeStr,
+                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR128X:$dst, 
+                     (F32Int VR128X:$src1, sse_load_f32:$src2))],
+                   itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  }
+  def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
+               (ins FR64X:$src1, FR64X:$src2),
+               !strconcat(OpcodeStr,
+                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+                      XD, EVEX_4V, VEX_W;
+  let isCodeGenOnly = 1 in
+  def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+               (ins VR128X:$src1, VR128X:$src2),
+               !strconcat(OpcodeStr,
+                "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set VR128X:$dst, 
+                 (F64Int VR128X:$src1, VR128X:$src2))],
+               itins_s.rr>, XD, EVEX_4V, VEX_W;
+  let mayLoad = 1 in {
+  def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
+               (ins FR64X:$src1, f64mem:$src2),
+               !strconcat(OpcodeStr,
+                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+               XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  let isCodeGenOnly = 1 in
+  def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+                  (ins VR128X:$src1, sdmem:$src2),
+                   !strconcat(OpcodeStr,
+                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR128X:$dst, 
+                    (F64Int VR128X:$src1, sse_load_f64:$src2))]>, 
+                  XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+
+
+defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt", 
+                int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, 
+                SSE_SQRTSS, SSE_SQRTSD>,
+              avx512_sqrt_packed<0x51, "vsqrt", fsqrt,
+                int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512,
+                SSE_SQRTPS, SSE_SQRTPD>;
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(f32 (fsqrt FR32X:$src)),
+            (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (fsqrt (load addr:$src))),
+            (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+  def : Pat<(f64 (fsqrt FR64X:$src)),
+            (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
+  def : Pat<(f64 (fsqrt (load addr:$src))),
+            (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(f32 (X86frsqrt FR32X:$src)),
+            (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(f32 (X86frcp FR32X:$src)),
+            (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+            (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
+            (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR64)),
+                              VR128X)>;
+  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+            (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
+
+
+multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag32, PatFrag mem_frag64,
+                            Intrinsic V4F32Int, Intrinsic V2F64Int,
+                            CD8VForm VForm> {
+let ExeDomain = SSEPackedSingle in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr : AVX512AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>;
+
+  // Vector intrinsic operation, mem
+  def PSm : AVX512AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+                    EVEX_CD8<32, VForm>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : AVX512AIi8<opcpd, MRMSrcReg,
+                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>;
+
+  // Vector intrinsic operation, mem
+  def PDm : AVX512AIi8<opcpd, MRMSrcMem,
+                     (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+                     EVEX_CD8<64, VForm>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int> {
+let ExeDomain = GenericDomain in {
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SSr : AVX512AIi8<opcss, MRMSrcReg,
+      (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3),
+      !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>;
+
+  // Intrinsic operation, reg.
+  let isCodeGenOnly = 1 in
+  def SSr_Int : AVX512AIi8<opcss, MRMSrcReg,
+        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>;
+
+  // Intrinsic operation, mem.
+  def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst),
+                     (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     [(set VR128X:$dst, (F32Int VR128X:$src1, 
+                                         sse_load_f32:$src2, imm:$src3))]>,
+                     EVEX_CD8<32, CD8VT1>;
+
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SDr : AVX512AIi8<opcsd, MRMSrcReg,
+        (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, VEX_W;
+
+  // Intrinsic operation, reg.
+  let isCodeGenOnly = 1 in
+  def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg,
+        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+        VEX_W;
+
+  // Intrinsic operation, mem.
+  def SDm : AVX512AIi8<opcsd, MRMSrcMem,
+        (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst,
+              (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>,
+        VEX_W, EVEX_CD8<64, CD8VT1>;
+} // ExeDomain = GenericDomain
+}
+
+multiclass avx512_rndscale<bits<8> opc, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag, Domain d> {
+let ExeDomain = d in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def r : AVX512AIi8<opc, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, EVEX;
+
+  // Vector intrinsic operation, mem
+  def m : AVX512AIi8<opc, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, EVEX;
+} // ExeDomain
+}
+
+
+defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512,
+                                memopv16f32, SSEPackedSingle>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
+                   imm:$src2, (v16f32 VR512:$src1), (i16 -1),
+                   FROUND_CURRENT)),
+                   (VRNDSCALEPSZr VR512:$src1, imm:$src2)>;
+
+
+defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512,
+                                memopv8f64, SSEPackedDouble>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
+                  imm:$src2, (v8f64 VR512:$src1), (i8 -1),
+                  FROUND_CURRENT)),
+                   (VRNDSCALEPDZr VR512:$src1, imm:$src2)>;
+
+multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
+                     Operand x86memop, RegisterClass RC, Domain d> {
+let ExeDomain = d in {
+  def r : AVX512AIi8<opc, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, RC:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, EVEX_4V;
+
+  def m : AVX512AIi8<opc, MRMSrcMem,
+                    (outs RC:$dst), (ins RC:$src1, x86memop:$src2,  i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, EVEX_4V;
+} // ExeDomain
+}
+
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X,
+                                SSEPackedSingle>, EVEX_CD8<32, CD8VT1>;
+                                
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X,
+                                SSEPackedDouble>, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(ffloor FR32X:$src),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>;
+def : Pat<(f64 (ffloor FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>;
+def : Pat<(f32 (fnearbyint FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>;
+def : Pat<(f64 (fnearbyint FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>;
+def : Pat<(f32 (fceil FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>;
+def : Pat<(f64 (fceil FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>;
+def : Pat<(f32 (frint FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>;
+def : Pat<(f64 (frint FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>;
+def : Pat<(f32 (ftrunc FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>;
+def : Pat<(f64 (ftrunc FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
+
+def : Pat<(v16f32 (ffloor VR512:$src)),
+          (VRNDSCALEPSZr VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+          (VRNDSCALEPSZr VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+          (VRNDSCALEPSZr VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+          (VRNDSCALEPSZr VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+          (VRNDSCALEPSZr VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+          (VRNDSCALEPDZr VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+          (VRNDSCALEPDZr VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+          (VRNDSCALEPDZr VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+          (VRNDSCALEPDZr VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+          (VRNDSCALEPDZr VR512:$src, (i32 0x3))>;
+
+//-------------------------------------------------
+// Integer truncate and extend operations
+//-------------------------------------------------
+
+multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
+                          RegisterClass dstRC, RegisterClass srcRC,
+                          RegisterClass KRC, X86MemOperand x86memop> {
+  def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins srcRC:$src),
+               !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
+               []>, EVEX;
+
+  def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr,
+                 " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+               []>, EVEX, EVEX_K;
+
+  def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr,
+                 " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+               []>, EVEX, EVEX_KZ;
+
+  def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+               []>, EVEX;
+
+  def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+               (ins x86memop:$dst, KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
+               []>, EVEX, EVEX_K;
+
+}
+defm VPMOVQB    : avx512_trunc_sat<0x32, "vpmovqb",   VR128X, VR512, VK8WM, 
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVSQB   : avx512_trunc_sat<0x22, "vpmovsqb",  VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVUSQB  : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVQW    : avx512_trunc_sat<0x34, "vpmovqw",   VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVSQW   : avx512_trunc_sat<0x24, "vpmovsqw",  VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVUSQW  : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVQD    : avx512_trunc_sat<0x35, "vpmovqd",   VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVSQD   : avx512_trunc_sat<0x25, "vpmovsqd",  VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVUSQD  : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVDW    : avx512_trunc_sat<0x33, "vpmovdw",   VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVSDW   : avx512_trunc_sat<0x23, "vpmovsdw",  VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVUSDW  : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVDB    : avx512_trunc_sat<0x31, "vpmovdb",   VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVSDB   : avx512_trunc_sat<0x21, "vpmovsdb",  VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVUSDB  : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+
+def : Pat<(v16i8  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQBrr  VR512:$src)>;
+def : Pat<(v8i16  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQWrr  VR512:$src)>;
+def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr  VR512:$src)>;
+def : Pat<(v16i8  (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr  VR512:$src)>;
+def : Pat<(v8i32  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQDrr  VR512:$src)>;
+
+def : Pat<(v16i8  (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+                  (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>;
+def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+                  (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>;
+def : Pat<(v8i16  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
+                  (VPMOVQWrrkz  VK8WM:$mask, VR512:$src)>;
+def : Pat<(v8i32  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
+                  (VPMOVQDrrkz  VK8WM:$mask, VR512:$src)>;
+
+
+multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                      RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode,
+                      PatFrag mem_frag, X86MemOperand x86memop,
+                      ValueType OpVT, ValueType InVT> {
+
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins SrcRC:$src),
+              !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
+
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins KRC:$mask, SrcRC:$src),
+              !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+              []>, EVEX, EVEX_K;
+
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins KRC:$mask, SrcRC:$src),
+              !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              []>, EVEX, EVEX_KZ;
+
+  let mayLoad = 1 in {
+    def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins x86memop:$src),
+              !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
+              EVEX;
+
+    def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins KRC:$mask, x86memop:$src),
+              !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+              []>,
+              EVEX, EVEX_K;
+
+    def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins KRC:$mask, x86memop:$src),
+              !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              []>,
+              EVEX, EVEX_KZ;
+  }
+}
+
+defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext,
+                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VQ>;
+defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext,
+                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VO>;
+defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext,
+                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VH>;
+defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext,
+                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VQ>;
+defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext,
+                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             EVEX_CD8<32, CD8VH>;
+
+defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext,
+                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VQ>;
+defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext,
+                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VO>;
+defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext,
+                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VH>;
+defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext,
+                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VQ>;
+defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext,
+                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             EVEX_CD8<32, CD8VH>;
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                       RegisterClass RC, X86MemOperand memop> {
+let mayLoad = 1,
+  Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
+            (ins RC:$src1, KRC:$mask, memop:$src2),
+            !strconcat(OpcodeStr,
+            " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+
+let ExeDomain = SSEPackedDouble in {
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+  
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512,  vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512,  vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X,  vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                       RegisterClass RC, X86MemOperand memop> {
+let mayStore = 1, Constraints = "$mask = $mask_wb" in
+  def mr  : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
+            (ins memop:$dst, KRC:$mask, RC:$src2),
+            !strconcat(OpcodeStr,
+            " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+
+let ExeDomain = SSEPackedDouble in {
+defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+
+defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
+                                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
+                                  EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+                       RegisterClass KRC, X86MemOperand memop> {
+  let Predicates = [HasPFI], hasSideEffects = 1 in
+  def m  : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+            !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"),
+            []>, EVEX, EVEX_K;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+                     
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+                      ValueType vt, string OpcodeStr, PatFrag mem_frag,
+                      Domain d> {
+  def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+                  SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+                  SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v16i32 (X86Shufp VR512:$src1,
+                    (memopv16i32 addr:$src2), (i8 imm:$imm))),
+          (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v8i64 (X86Shufp VR512:$src1,
+                            (memopv8i64 addr:$src2), (i8 imm:$imm))),
+          (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, RC:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+  let mayLoad = 1 in
+  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+                     (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, 
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, 
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i32  : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+def v8i1sextv8i64  : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+
+multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, ValueType OpVT,
+                        RegisterClass KRC, RegisterClass RC,
+                        X86MemOperand x86memop, X86MemOperand x86scalar_mop,
+                        string BrdcstStr> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+            !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+            []>, EVEX;
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
+             !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+             []>, EVEX, EVEX_K;
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
+              !strconcat(OpcodeStr,
+                         " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              []>, EVEX, EVEX_KZ;
+  let mayLoad = 1 in {
+    def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+              (ins x86memop:$src),
+              !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+              []>, EVEX;
+    def rmk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+               (ins KRC:$mask, x86memop:$src),
+               !strconcat(OpcodeStr,
+                          " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+               []>, EVEX, EVEX_K;
+    def rmkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+                (ins KRC:$mask, x86memop:$src),
+                !strconcat(OpcodeStr,
+                           " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                []>, EVEX, EVEX_KZ;
+    def rmb : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+               (ins x86scalar_mop:$src),
+               !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                          ", $dst|$dst, ${src}", BrdcstStr, "}"),
+               []>, EVEX, EVEX_B;
+    def rmbk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+                (ins KRC:$mask, x86scalar_mop:$src),
+                !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                           ", $dst {${mask}}|$dst {${mask}}, ${src}", BrdcstStr, "}"),
+                []>, EVEX, EVEX_B, EVEX_K;
+    def rmbkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+                 (ins KRC:$mask, x86scalar_mop:$src),
+                 !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                            ", $dst {${mask}} {z}|$dst {${mask}} {z}, ${src}",
+                            BrdcstStr, "}"),
+                 []>, EVEX, EVEX_B, EVEX_KZ;
+  }
+}
+
+defm VPABSDZ : avx512_vpabs<0x1E, "vpabsd", v16i32, VK16WM, VR512,
+                           i512mem, i32mem, "{1to16}">, EVEX_V512,
+                           EVEX_CD8<32, CD8VF>;
+defm VPABSQZ : avx512_vpabs<0x1F, "vpabsq", v8i64, VK8WM, VR512,
+                           i512mem, i64mem, "{1to8}">, EVEX_V512, VEX_W,
+                           EVEX_CD8<64, CD8VF>;
+
+def : Pat<(xor
+          (bc_v16i32 (v16i1sextv16i32)),
+          (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+          (VPABSDZrr VR512:$src)>;
+def : Pat<(xor
+          (bc_v8i64 (v8i1sextv8i64)),
+          (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
+          (VPABSQZrr VR512:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_pabs_d_512 (v16i32 VR512:$src),
+                   (v16i32 immAllZerosV), (i16 -1))),
+          (VPABSDZrr VR512:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src),
+                   (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+          (VPABSQZrr VR512:$src)>;
+
+multiclass avx512_conflict<bits<8> opc, string OpcodeStr, 
+                        RegisterClass RC, RegisterClass KRC,
+                        X86MemOperand x86memop,
+                        X86MemOperand x86scalar_mop, string BrdcstStr> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src),
+       !strconcat(OpcodeStr, " \t{$src, ${dst} |${dst}, $src}"),
+       []>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins x86memop:$src),
+       !strconcat(OpcodeStr, " \t{$src, ${dst}|${dst}, $src}"),
+       []>, EVEX;
+  def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins x86scalar_mop:$src),
+       !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                  ", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
+       []>, EVEX, EVEX_B;
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src),
+       !strconcat(OpcodeStr,
+                  " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+       []>, EVEX, EVEX_KZ;
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, x86memop:$src),
+       !strconcat(OpcodeStr,
+                  " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+       []>, EVEX, EVEX_KZ;
+  def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, x86scalar_mop:$src),
+       !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                  ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
+                  BrdcstStr, "}"),
+       []>, EVEX, EVEX_KZ, EVEX_B;
+       
+  let Constraints = "$src1 = $dst" in {
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, RC:$src2),
+       !strconcat(OpcodeStr,
+                  " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+       []>, EVEX, EVEX_K;
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, x86memop:$src2),
+       !strconcat(OpcodeStr,
+                  " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+       []>, EVEX, EVEX_K;
+  def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                  ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
+       []>, EVEX, EVEX_K, EVEX_B;
+   }
+}
+
+let Predicates = [HasCDI] in {
+defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM,
+                    i512mem, i32mem, "{1to16}">,
+                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+
+defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM,
+                    i512mem, i64mem, "{1to8}">,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+}
+
+def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1,
+                                              GR16:$mask),
+          (VPCONFLICTDrrk VR512:$src1,
+           (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
+
+def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
+                                              GR8:$mask),
+          (VPCONFLICTQrrk VR512:$src1,
+           (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+
+let Predicates = [HasCDI] in {
+defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM,
+                    i512mem, i32mem, "{1to16}">,
+                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+
+defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM,
+                    i512mem, i64mem, "{1to8}">,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+}
+
+def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1,
+                                              GR16:$mask),
+          (VPLZCNTDrrk VR512:$src1,
+           (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
+
+def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1,
+                                              GR8:$mask),
+          (VPLZCNTQrrk VR512:$src1,
+           (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+
+def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))),
+          (VPLZCNTDrm addr:$src)>;
+def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))),
+          (VPLZCNTDrr VR512:$src)>;
+def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))),
+          (VPLZCNTQrm addr:$src)>;
+def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))),
+          (VPLZCNTQrr VR512:$src)>;
+
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1  1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1  0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+
+def : Pat<(store VK1:$src, addr:$dst),
+          (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>;
+
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+                           (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
+}]>;
+
+def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+          (MOV8mr addr:$dst, GR8:$src)>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 0000000..f2574cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1403 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let neverHasSideEffects = 1 in
+def LEA16r   : I<0x8D, MRMSrcMem,
+                 (outs GR16:$dst), (ins i32mem:$src),
+                 "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
+let isReMaterializable = 1 in
+def LEA32r   : I<0x8D, MRMSrcMem,
+                 (outs GR32:$dst), (ins i32mem:$src),
+                 "lea{l}\t{$src|$dst}, {$dst|$src}",
+                 [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+                 OpSize32, Requires<[Not64BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (outs GR32:$dst), (ins lea64_32mem:$src),
+                  "lea{l}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+                  OpSize32, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r   : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+                  "lea{q}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instruction that loads one value and gets the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+  // Memory operand.
+  ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+  // Register reads (implicit or explicit).
+  ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r  : I<0xF6, MRM4r, (outs),  (ins GR8:$src), "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src)),
+                (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs),  (ins GR16:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs),  (ins GR32:$src),
+               "mul{l}\t$src",
+               [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+               IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+                "mul{q}\t$src",
+                [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+                IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+               "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src))),
+                (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+              "mul{l}\t$src",
+              [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+                "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+}
+
+let neverHasSideEffects = 1 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r  : I<0xF6, MRM5r, (outs),  (ins GR8:$src), "imul{b}\t$src", [],
+              IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs),  (ins GR16:$src), "imul{w}\t$src", [],
+              IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs),  (ins GR32:$src), "imul{l}\t$src", [],
+              IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+              IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m  : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+                "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+                "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
+              SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+                "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
+              SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+                 "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+}
+} // neverHasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+                       TB, OpSize16;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+                 TB, OpSize32;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+                                   (ins GR64:$src1, GR64:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+                 TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+                                  (ins GR16:$src1, i16mem:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, (load addr:$src2)))],
+                       IIC_IMUL16_RM>,
+               TB, OpSize16;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+                 (ins GR32:$src1, i32mem:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, (load addr:$src2)))],
+                       IIC_IMUL32_RM>,
+               TB, OpSize32;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+                                   (ins GR64:$src1, i64mem:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, (load addr:$src2)))],
+                        IIC_IMUL64_RM>,
+               TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag GR16:$src1, imm:$src2))],
+                            IIC_IMUL16_RRI>, OpSize16;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+                           IIC_IMUL16_RRI>, OpSize16;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag GR32:$src1, imm:$src2))],
+                            IIC_IMUL32_RRI>, OpSize32;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+                           IIC_IMUL32_RRI>, OpSize32;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                         (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                         "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set GR64:$dst, EFLAGS,
+                             (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+                             IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+                            IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
+                      (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                            IIC_IMUL16_RMI>,
+                 OpSize16;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+                                         OpSize16;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                     // GR32 = [mem32]*I32
+                      (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                            IIC_IMUL32_RMI>, OpSize32;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i32immSExt8:$src2))],
+                                         IIC_IMUL32_RMI>, OpSize32;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                         (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+                         "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set GR64:$dst, EFLAGS,
+                              (X86smul_flag (load addr:$src1),
+                                            i64immSExt32:$src2))],
+                                            IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1),
+                                          i64immSExt8:$src2))],
+                                          IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+
+
+// unsigned division/remainder
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r  : I<0xF6, MRM6r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "div{l}\t$src", [], IIC_DIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+                "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_MEM>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize16,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+               "div{l}\t$src", [], IIC_DIV32>,
+             SchedLoadReg<WriteIDivLd>, OpSize32;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+                "div{q}\t$src", [], IIC_DIV64>,
+             SchedLoadReg<WriteIDivLd>;
+}
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+                "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "idiv{b}\t$src", [], IIC_IDIV8>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+               "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+                "idiv{q}\t$src", [], IIC_IDIV64>,
+             SchedLoadReg<WriteIDivLd>;
+}
+} // hasSideEffects = 0
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "neg{b}\t$dst",
+               [(set GR8:$dst, (ineg GR8:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+               "neg{w}\t$dst",
+               [(set GR16:$dst, (ineg GR16:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+               "neg{l}\t$dst",
+               [(set GR32:$dst, (ineg GR32:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+                [(set GR64:$dst, (ineg GR64:$src1)),
+                 (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+               "neg{b}\t$dst",
+               [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+               "neg{w}\t$dst",
+               [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+               "neg{l}\t$dst",
+               [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+                 (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "not{b}\t$dst",
+               [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+               "not{w}\t$dst",
+               [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+               "not{l}\t$dst",
+               [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+                [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+               "not{b}\t$dst",
+               [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+               "not{w}\t$dst",
+               [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+               OpSize16;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+               "not{l}\t$dst",
+               [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+               OpSize32;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "inc{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+               "inc{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
+             OpSize16, Requires<[Not64BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               "inc{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+               IIC_UNARY_REG>,
+             OpSize32, Requires<[Not64BitMode]>;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 1
+
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                  "inc{w}\t$dst",
+                  [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize16, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                  "inc{l}\t$dst",
+                  [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize32, Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                  "dec{w}\t$dst",
+                  [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize16, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                  "dec{l}\t$dst",
+                  [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize32, Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    CodeSize = 2 in {
+def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                  "inc{w}\t$dst", [], IIC_UNARY_REG>,
+                OpSize16, Requires<[Not64BitMode]>;
+def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                  "inc{l}\t$dst", [], IIC_UNARY_REG>,
+                OpSize32, Requires<[Not64BitMode]>;
+def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                  "dec{w}\t$dst", [], IIC_UNARY_REG>,
+                OpSize16, Requires<[Not64BitMode]>;
+def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                  "dec{l}\t$dst", [], IIC_UNARY_REG>,
+                OpSize32, Requires<[Not64BitMode]>;
+} // isCodeGenOnly = 1, ForceDisassemble = 1, HasSideEffects = 0, CodeSize = 2
+
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize16, Requires<[Not64BitMode]>;
+  def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize32, Requires<[Not64BitMode]>;
+  def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+// FIXME: What is this for??
+def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+                  [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize16, Requires<[In64BitMode]>;
+def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+                  [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize32, Requires<[In64BitMode]>;
+def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+                  [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize16, Requires<[In64BitMode]>;
+def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+                  [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize32, Requires<[In64BitMode]>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "dec{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {   // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+               "dec{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+               IIC_UNARY_REG>,
+             OpSize16, Requires<[Not64BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               "dec{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+               IIC_UNARY_REG>,
+             OpSize32, Requires<[Not64BitMode]>;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // CodeSize = 2
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize16, Requires<[Not64BitMode]>;
+  def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize32, Requires<[Not64BitMode]>;
+  def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types.  For example, it can tell you what the
+/// register class and preferred load to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+                  PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+                  Operand immoperand, SDPatternOperator immoperator,
+                  Operand imm8operand, SDPatternOperator imm8operator,
+                  bit hasOddOpcode, OperandSize opSize,
+                  bit hasREX_WPrefix> {
+  /// VT - This is the value type itself.
+  ValueType VT = vt;
+
+  /// InstrSuffix - This is the suffix used on instructions with this type.  For
+  /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+  string InstrSuffix = instrsuffix;
+
+  /// RegClass - This is the register class associated with this type.  For
+  /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+  RegisterClass RegClass = regclass;
+
+  /// LoadNode - This is the load node associated with this type.  For
+  /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+  PatFrag LoadNode = loadnode;
+
+  /// MemOperand - This is the memory operand associated with this type.  For
+  /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+  X86MemOperand MemOperand = memoperand;
+
+  /// ImmEncoding - This is the encoding of an immediate of this type.  For
+  /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32.  Note that i64 -> Imm32
+  /// since the immediate fields of i64 instructions is a 32-bit sign extended
+  /// value.
+  ImmType ImmEncoding = immkind;
+
+  /// ImmOperand - This is the operand kind of an immediate of this type.  For
+  /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm.  Note that i64 ->
+  /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign
+  /// extended value.
+  Operand ImmOperand = immoperand;
+
+  /// ImmOperator - This is the operator that should be used to match an
+  /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+  SDPatternOperator ImmOperator = immoperator;
+
+  /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+  /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm.  This is
+  /// only used for instructions that have a sign-extended imm8 field form.
+  Operand Imm8Operand = imm8operand;
+
+  /// Imm8Operator - This is the operator that should be used to match an 8-bit
+  /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8).
+  SDPatternOperator Imm8Operator = imm8operator;
+
+  /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+  /// opposed to even) opcode.  Operations on i8 are usually even, operations on
+  /// other datatypes are odd.
+  bit HasOddOpcode = hasOddOpcode;
+
+  /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
+  /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
+  /// to Opsize16. i32 sets this to OpSize32.
+  OperandSize OpSize = opSize;
+
+  /// HasREX_WPrefix - This bit is set to true if the instruction should have
+  /// the 0x40 REX prefix.  This is set for i64 types.
+  bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8  : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
+                       Imm8 , i8imm ,    imm,          i8imm   , invalid_node,
+                       0, OpSizeFixed, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+                       Imm16, i16imm,    imm,          i16i8imm, i16immSExt8,
+                       1, OpSize16, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+                       Imm32, i32imm,    imm,          i32i8imm, i32immSExt8,
+                       1, OpSize32, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+                       Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+                       1, OpSizeFixed, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+///    suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+///    or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+          string mnemonic, string args, list<dag> pattern,
+          InstrItinClass itin = IIC_BIN_NONMEM>
+  : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+       opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+      f, outs, ins,
+      !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+      itin> {
+
+  // Infer instruction prefixes from type info.
+  let OpSize = typeinfo.OpSize;
+  let hasREX_WPrefix  = typeinfo.HasREX_WPrefix;
+}
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern, InstrItinClass itin,
+              Format f = MRMDestReg>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]>;
+
+// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has
+// just a regclass (no eflags) as a result.
+class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+                  IIC_BIN_NONMEM>;
+
+// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
+// just a EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f = MRMDestReg>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+                  IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+                          EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, MRMSrcReg, typeinfo,
+        (outs typeinfo.RegClass:$dst),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let ForceDisassemble = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let ForceDisassemble = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Instructions like "add reg, reg, [mem]".
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              SDPatternOperator opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+                    EFLAGS))], IIC_BIN_CARRY_MEM>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Format f, dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_R - Instructions like "add reg, reg, imm".
+class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+            [(set EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+                        EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+               Format f, dag outlist, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_R - Instructions like "add reg, reg, imm8".
+class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+             [(set EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+                       EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<opcode, MRMDestMem, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+    Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+           (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                    SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+                  addr:$dst),
+           (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+            [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
+              Format f, list<dag> pattern, bits<8> opcode = 0x80,
+              InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<opcode, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src), addr:$dst),
+             (implicit EFLAGS)]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+             (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f, bits<8> opcode = 0x80>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+                                               typeinfo.ImmOperator:$src))],
+            opcode>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+               Format f, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<0x82, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8".
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(store (opnode (load addr:$dst),
+                             typeinfo.Imm8Operator:$src), addr:$dst),
+              (implicit EFLAGS)]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(store (opnode (load addr:$dst),
+                             typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+              (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8".
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(set EFLAGS, (opnode (load addr:$dst),
+                                   typeinfo.Imm8Operator:$src))]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Register areg, string operands,
+              InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, RawFrm, typeinfo,
+        (outs), (ins typeinfo.ImmOperand:$src),
+        mnemonic, operands, [], itin>, Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+  let Uses = [areg];
+  let Defs = [areg, EFLAGS];
+  let hasSideEffects = 0;
+}
+
+// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                Register areg, string operands>
+  : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+            IIC_BIN_CARRY_NONMEM> {
+  let Uses = [areg, EFLAGS];
+}
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                         string mnemonic, Format RegMRM, Format MemMRM,
+                         SDNode opnodeflag, SDNode opnode,
+                         bit CommutableRR, bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let Constraints = "$src1 = $dst" in {
+      let isCommutable = CommutableRR,
+          isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+        def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+        def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+        def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+      } // isCommutable
+
+      def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+      def NAME#8rm   : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+      def NAME#16rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+      def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+      def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        // NOTE: These are order specific, we want the ri8 forms to be listed
+        // first so that they are slightly preferred to the ri forms.
+        def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+        def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+        def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+        def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+        def NAME#16ri  : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+        def NAME#32ri  : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+        def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+      }
+    } // Constraints = "$src1 = $dst"
+
+    def NAME#8mr    : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
+  } // Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                           "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                           "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                           "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                           "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                          string mnemonic, Format RegMRM, Format MemMRM,
+                          SDNode opnode, bit CommutableRR,
+                           bit ConvertibleToThreeAddress> {
+  let Uses = [EFLAGS], Defs = [EFLAGS] in {
+    let Constraints = "$src1 = $dst" in {
+      let isCommutable = CommutableRR,
+          isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#8rr  : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+        def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+        def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+        def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+      } // isCommutable
+
+      def NAME#8rr_REV  : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
+
+      def NAME#8rm   : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+      def NAME#16rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+      def NAME#32rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+      def NAME#64rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        // NOTE: These are order specific, we want the ri8 forms to be listed
+        // first so that they are slightly preferred to the ri forms.
+        def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+        def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+        def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+        def NAME#8ri   : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+        def NAME#16ri  : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+        def NAME#32ri  : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+        def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+      }
+    } // Constraints = "$src1 = $dst"
+
+    def NAME#8mr    : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+  } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
+                              "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX,
+                              "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX,
+                              "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX,
+                              "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...".  It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+///
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                        string mnemonic, Format RegMRM, Format MemMRM,
+                        SDNode opnode,
+                        bit CommutableRR, bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let isCommutable = CommutableRR,
+        isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      def NAME#8rr  : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+      def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+      def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+      def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+    } // isCommutable
+
+    def NAME#8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+    def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+    def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+    def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+    def NAME#8rm   : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+    def NAME#16rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+    def NAME#32rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+    def NAME#64rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+    let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      // NOTE: These are order specific, we want the ri8 forms to be listed
+      // first so that they are slightly preferred to the ri forms.
+      def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+      def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+      def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+      def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+      def NAME#16ri  : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+      def NAME#32ri  : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+      def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+    }
+
+    def NAME#8mr    : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+  } // Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                           "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                           "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                           "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                           "{$src, %rax|rax, $src}">;
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+                         X86and_flag, and, 1, 0>;
+defm OR  : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+                         X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+                         X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+                         X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+                         X86sub_flag, sub, 0, 0>;
+}
+
+// Arithmetic.
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+                          1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+                          0, 0>;
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar like AND, except they don't
+// generate a result.  From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+                         (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
+
+let isCompare = 1 in {
+  let Defs = [EFLAGS] in {
+    let isCommutable = 1 in {
+      def TEST8rr  : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
+      def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
+      def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
+      def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
+    } // isCommutable
+
+    def TEST8rm    : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+    def TEST16rm   : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+    def TEST32rm   : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+    def TEST64rm   : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+    def TEST8ri    : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+    def TEST16ri   : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+    def TEST32ri   : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+    def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+    def TEST8mi    : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
+    def TEST16mi   : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
+    def TEST32mi   : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
+    def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+
+    // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+    // register class is constrained to GR8_NOREX. This pseudo is explicitly
+    // marked side-effect free, since it doesn't have an isel pattern like
+    // other test instructions. 
+    let isPseudo = 1, hasSideEffects = 0 in
+    def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+                          "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+  } // Defs = [EFLAGS]
+
+  def TEST8i8    : BinOpAI<0xA8, "test", Xi8 , AL,
+                           "{$src, %al|al, $src}">;
+  def TEST16i16  : BinOpAI<0xA8, "test", Xi16, AX,
+                           "{$src, %ax|ax, $src}">;
+  def TEST32i32  : BinOpAI<0xA8, "test", Xi32, EAX,
+                           "{$src, %eax|eax, $src}">;
+  def TEST64i32  : BinOpAI<0xA8, "test", Xi64, RAX,
+                           "{$src, %rax|rax, $src}">;
+} // isCompare
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+                    PatFrag ld_frag> {
+  def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
+            IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+  def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS,
+             (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+           Sched<[WriteALULd, ReadAfterLd]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
+  defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI] in {
+  def : Pat<(and (not GR32:$src1), GR32:$src2),
+            (ANDN32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(and (not GR64:$src1), GR64:$src2),
+            (ANDN64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+            (ANDN32rm GR32:$src1, addr:$src2)>;
+  def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+            (ANDN64rm GR64:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+//
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+  let isCommutable = 1 in
+  def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+             !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
+
+  let mayLoad = 1 in
+  def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+             !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+  let Uses = [EDX] in
+    defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+  let Uses = [RDX] in
+    defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+  let SchedRW = [WriteALU] in {
+  def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+             "adcx{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8PD;
+
+  def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+             "adcx{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8PD, Requires<[In64BitMode]>;
+  } // SchedRW
+
+  let mayLoad = 1, SchedRW = [WriteALULd] in {
+  def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+             "adcx{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8PD;
+
+  def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+             "adcx{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8PD, Requires<[In64BitMode]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+  let SchedRW = [WriteALU] in {
+  def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+             "adox{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8XS;
+
+  def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+             "adox{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8XS, Requires<[In64BitMode]>;
+  } // SchedRW
+
+  let mayLoad = 1, SchedRW = [WriteALULd] in {
+  def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+             "adox{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8XS;
+
+  def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+             "adox{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8XS, Requires<[In64BitMode]>;
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..e421f8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,184 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call.  X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense.  Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP and Disp being offsetted accordingly.  The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned Scale;
+  unsigned IndexReg;
+  int Disp;
+  const GlobalValue *GV;
+  unsigned GVOpFlags;
+
+  X86AddressMode()
+    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+      GVOpFlags(0) {
+    Base.Reg = 0;
+  }
+
+
+  void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+    assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+    if (BaseType == X86AddressMode::RegBase)
+      MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
+                                             false, false, false, 0, false));
+    else {
+      assert(BaseType == X86AddressMode::FrameIndexBase);
+      MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+    }
+
+    MO.push_back(MachineOperand::CreateImm(Scale));
+    MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
+                                           false, false, false, 0, false));
+
+    if (GV)
+      MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+    else
+      MO.push_back(MachineOperand::CreateImm(Disp));
+
+    MO.push_back(MachineOperand::CreateReg(0, false, false,
+                                           false, false, false, 0, false));
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+  // Because memory references are always represented with five
+  // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+  return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+             unsigned Reg, bool isKill, int Offset) {
+  return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg1, bool isKill1,
+                                            unsigned Reg2, bool isKill2) {
+  return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+    .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+               const X86AddressMode &AM) {
+  assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else {
+    assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  }
+
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+  else
+    MIB.addImm(AM.Disp);
+
+  return MIB.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Flags = 0;
+  if (MCID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (MCID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset),
+                            Flags, MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+  return addOffset(MIB.addFrameIndex(FI), Offset)
+            .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         unsigned GlobalBaseReg, unsigned char OpFlags) {
+  //FIXME: factor this
+  return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+    .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 0000000..315f213
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,112 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// SetCC instructions.
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+  let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+      isCommutable = 1, SchedRW = [WriteALU] in {
+    def NAME#16rr
+      : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+          !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR16:$dst,
+                (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+                IIC_CMOV16_RR>, TB, OpSize16;
+    def NAME#32rr
+      : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+          !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR32:$dst,
+                (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+                IIC_CMOV32_RR>, TB, OpSize32;
+    def NAME#64rr
+      :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+          !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR64:$dst,
+                (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+                IIC_CMOV32_RR>, TB;
+  }
+
+  let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+      SchedRW = [WriteALULd, ReadAfterLd] in {
+    def NAME#16rm
+      : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+          !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV16_RM>,
+                                    TB, OpSize16;
+    def NAME#32rm
+      : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+          !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV32_RM>,
+                                    TB, OpSize32;
+    def NAME#64rm
+      :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+          !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+  } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // end multiclass
+
+
+// Conditional Moves.
+defm CMOVO  : CMOV<0x40, "cmovo" , X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
+defm CMOVB  : CMOV<0x42, "cmovb" , X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
+defm CMOVE  : CMOV<0x44, "cmove" , X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
+defm CMOVA  : CMOV<0x47, "cmova" , X86_COND_A>;
+defm CMOVS  : CMOV<0x48, "cmovs" , X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
+defm CMOVP  : CMOV<0x4A, "cmovp" , X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
+defm CMOVL  : CMOV<0x4C, "cmovl" , X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
+defm CMOVG  : CMOV<0x4F, "cmovg" , X86_COND_G>;
+
+
+// SetCC instructions.
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+  let Uses = [EFLAGS] in {
+    def r    : I<opc, MRMXr,  (outs GR8:$dst), (ins),
+                     !strconcat(Mnemonic, "\t$dst"),
+                     [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+                     IIC_SET_R>, TB, Sched<[WriteALU]>;
+    def m    : I<opc, MRMXm,  (outs), (ins i8mem:$dst),
+                     !strconcat(Mnemonic, "\t$dst"),
+                     [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+                     IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+  } // Uses = [EFLAGS]
+}
+
+defm SETO  : SETCC<0x90, "seto",  X86_COND_O>;   // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>;  // is overflow bit not set
+defm SETB  : SETCC<0x92, "setb",  X86_COND_B>;   // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>;  // unsigned greater or equal
+defm SETE  : SETCC<0x94, "sete",  X86_COND_E>;   // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>;  // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>;  // unsigned less than or equal
+defm SETA  : SETCC<0x97, "seta",  X86_COND_A>;   // unsigned greater than
+defm SETS  : SETCC<0x98, "sets",  X86_COND_S>;   // is signed bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>;  // is not signed
+defm SETP  : SETCC<0x9A, "setp",  X86_COND_P>;   // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>;  // is parity bit not set
+defm SETL  : SETCC<0x9C, "setl",  X86_COND_L>;   // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>;  // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>;  // signed less than or equal
+defm SETG  : SETCC<0x9F, "setg",  X86_COND_G>;   // signed greater than
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 0000000..ca4f608
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1776 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 32 bits.
+  return getI32Imm((unsigned)N->getZExtValue());
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 8 bits.
+  return getI8Imm((uint8_t)N->getZExtValue());
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction.  This expands to code that looks like this:
+//     call  $next_inst
+//     popl %destreg"
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+                      "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[Not64BitMode]>;
+def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[Not64BitMode]>;
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[In64BitMode]>;
+}
+
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+                              (outs),
+                              (ins GR8:$al,
+                                   i64imm:$regsavefi, i64imm:$offset,
+                                   variable_ops),
+                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+                              [(X86vastart_save_xmm_regs GR8:$al,
+                                                         imm:$regsavefi,
+                                                         imm:$offset),
+                               (implicit EFLAGS)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+                 (outs GR64:$dst),
+                 (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+                 "#VAARG_64 $dst, $ap, $size, $mode, $align",
+                 [(set GR64:$dst,
+                    (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+                  (implicit EFLAGS)]>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets.  These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+  def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
+                     "# dynamic stack allocation",
+                     [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR32:$dst,
+                         (X86SegAlloca GR32:$size))]>,
+                    Requires<[Not64BitMode]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR64:$dst,
+                         (X86SegAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
+}
+
+// The MSVC runtime contains an _ftol2 routine for converting floating-point
+// to integer values. It has a strange calling convention: the input is
+// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
+// used as a temporary register. No other registers (aside from flags) are
+// touched.
+// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
+// variant is unnecessary.
+
+let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
+  def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
+                      "# win32 fptoui",
+                      [(X86WinFTOL RFP32:$src)]>,
+                    Requires<[Not64BitMode]>;
+
+  def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
+                      "# win32 fptoui",
+                      [(X86WinFTOL RFP64:$src)]>,
+                    Requires<[Not64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+                    "ret\t#eh_return, addr: $addr",
+                    [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+                     "ret\t#eh_return, addr: $addr",
+                     [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in {
+  def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[Not64BitMode]>;
+  def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in {
+  def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[Not64BitMode]>;
+  def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+  }
+}
+} // SchedRW
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+  def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1 in {
+  def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+                            "#SEH_PushReg $reg", []>;
+  def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+                            "#SEH_SaveReg $reg, $dst", []>;
+  def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+                            "#SEH_SaveXMM $reg, $dst", []>;
+  def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+                            "#SEH_StackAlloc $size", []>;
+  def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+                            "#SEH_SetFrame $reg, $offset", []>;
+  def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+                            "#SEH_PushFrame $mode", []>;
+  def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+                            "#SEH_EndPrologue", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower.  We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1 in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
+                          "", []>;
+
+// This instruction is lowered to a RET followed by a MOV.  The two
+// instructions are not generated on a higher level since then the
+// verifier sees a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
+                                  "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instruction mapping movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isPseudo = 1 in
+def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+  let AddedComplexity = 20;
+}
+
+// Materialize i64 constant where top 32-bits are zero. This could theoretically
+// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
+// that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isCodeGenOnly = 1, neverHasSideEffects = 1 in
+def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
+                     "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// This 64-bit pseudo-move can be used for both a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant, and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // isCodeGenOnly
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type.  When
+// this happens, it is great.  However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+          (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+          (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC64ri8 GR64:$op, 0)>;
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                   Requires<[Not64BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+                   Requires<[Not64BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+                   Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+                   Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+                   Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+                    [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,ECX,EDI] in
+  def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                     Requires<[Not64BitMode]>;
+  let Uses = [AX,ECX,EDI] in
+  def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+                     Requires<[Not64BitMode]>;
+  let Uses = [EAX,ECX,EDI] in
+  def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+                     Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,RCX,RDI] in
+  def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+  let Uses = [AX,RCX,RDI] in
+  def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+                     Requires<[In64BitMode]>;
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+                     Requires<[In64BitMode]>;
+
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+                      [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    Uses = [ESP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                  "# TLS_addr32",
+                  [(X86tlsaddr tls32addr:$sym)]>,
+                  Requires<[Not64BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                  "# TLS_base_addr32",
+                  [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+                  Requires<[Not64BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    Uses = [RSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLS_addr64",
+                  [(X86tlsaddr tls64addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLS_base_addr64",
+                  [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+                  Requires<[In64BitMode]>;
+}
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax.  %ecx is trashed during the function
+// call.  All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS],
+    Uses = [ESP],
+    usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                "# TLSCall_32",
+                [(X86TLSCall addr:$sym)]>,
+                Requires<[Not64BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, on return
+// the address of the variable is in %rax.  All other registers are preserved.
+let Defs = [RAX, EFLAGS],
+    Uses = [RSP, RDI],
+    usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                  "# TLSCall_64",
+                  [(X86TLSCall addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// X86 doesn't have 8-bit conditional moves. Use a customInserter to
+// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+// however that requires promoting the operands, and can induce additional
+// i8 register pressure.
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+def CMOV_GR8 : I<0, Pseudo,
+                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+                 "#CMOV_GR8 PSEUDO!",
+                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+                                          imm:$cond, EFLAGS))]>;
+
+let Predicates = [NoCMov] in {
+def CMOV_GR32 : I<0, Pseudo,
+                    (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+                    "#CMOV_GR32* PSEUDO!",
+                    [(set GR32:$dst,
+                      (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16 : I<0, Pseudo,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+                    "#CMOV_GR16* PSEUDO!",
+                    [(set GR16:$dst,
+                      (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+} // Predicates = [NoCMov]
+
+// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+// SSE1.
+let Predicates = [FPStackf32] in
+def CMOV_RFP32 : I<0, Pseudo,
+                    (outs RFP32:$dst),
+                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
+                    "#CMOV_RFP32 PSEUDO!",
+                    [(set RFP32:$dst,
+                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
+                                                  EFLAGS))]>;
+// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+// SSE2.
+let Predicates = [FPStackf64] in
+def CMOV_RFP64 : I<0, Pseudo,
+                    (outs RFP64:$dst),
+                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
+                    "#CMOV_RFP64 PSEUDO!",
+                    [(set RFP64:$dst,
+                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
+                                                  EFLAGS))]>;
+def CMOV_RFP80 : I<0, Pseudo,
+                    (outs RFP80:$dst),
+                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
+                    "#CMOV_RFP80 PSEUDO!",
+                    [(set RFP80:$dst,
+                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
+                                                  EFLAGS))]>;
+} // UsesCustomInserter = 1, Uses = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+// TODO: Get this to fold the constant into the instruction.
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
+                      "or{l}\t{$zero, $dst|$dst, $zero}",
+                      [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
+                    Sched<[WriteALULd, WriteRMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+                     "#MEMBARRIER",
+                     [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+                           Format ImmMod, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+    SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                  RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                  !strconcat(mnemonic, "{b}\t",
+                             "{$src2, $dst|$dst, $src2}"),
+                  [], IIC_ALU_NONMEM>, LOCK;
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   !strconcat(mnemonic, "{w}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   [], IIC_ALU_NONMEM>, OpSize16, LOCK;
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   !strconcat(mnemonic, "{l}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   [], IIC_ALU_NONMEM>, OpSize32, LOCK;
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    !strconcat(mnemonic, "{q}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    [], IIC_ALU_NONMEM>, LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                    ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+                    !strconcat(mnemonic, "{b}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+                      !strconcat(mnemonic, "{w}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, OpSize16, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+                      !strconcat(mnemonic, "{l}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, OpSize32, LOCK;
+
+def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                         ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                         ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+                         !strconcat(mnemonic, "{q}\t",
+                                    "{$src2, $dst|$dst, $src2}"),
+                         [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+                      !strconcat(mnemonic, "{w}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, OpSize16, LOCK;
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+                      !strconcat(mnemonic, "{l}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, OpSize32, LOCK;
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+                       !strconcat(mnemonic, "{q}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       [], IIC_ALU_MEM>, LOCK;
+
+}
+
+}
+
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
+defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
+
+// Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+    SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
+                 !strconcat(mnemonic, "{b}\t$dst"),
+                 [], IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+                 !strconcat(mnemonic, "{w}\t$dst"),
+                 [], IIC_UNARY_MEM>, OpSize16, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+                 !strconcat(mnemonic, "{l}\t$dst"),
+                 [], IIC_UNARY_MEM>, OpSize32, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+                  !strconcat(mnemonic, "{q}\t$dst"),
+                  [], IIC_UNARY_MEM>, LOCK;
+}
+}
+
+defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
+
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+                         SDPatternOperator frag, X86MemOperand x86memop,
+                         InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+               !strconcat(mnemonic, "\t$ptr"),
+               [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic, SDPatternOperator frag,
+                          InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+  let Defs = [AL, EFLAGS], Uses = [AL] in
+  def NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+                  !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+  let Defs = [AX, EFLAGS], Uses = [AX] in
+  def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+                  !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
+  let Defs = [EAX, EFLAGS], Uses = [EAX] in
+  def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+                  !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
+  let Defs = [RAX, EFLAGS], Uses = [RAX] in
+  def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+                   !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+    SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+                                X86cas8, i64mem,
+                                IIC_CMPX_LOCK_8B>;
+}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+                                 X86cas16, i128mem,
+                                 IIC_CMPX_LOCK_16B>, REX_W;
+}
+
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
+// Atomic exchange and add
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+                             string frag,
+                             InstrItinClass itin8, InstrItinClass itin> {
+  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+      SchedRW = [WriteALULd, WriteRMW] in {
+    def NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                    (ins GR8:$val, i8mem:$ptr),
+                    !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                    [(set GR8:$dst,
+                          (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                    itin8>;
+    def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+                    (ins GR16:$val, i16mem:$ptr),
+                    !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                    [(set
+                       GR16:$dst,
+                       (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                    itin>, OpSize16;
+    def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+                    (ins GR32:$val, i32mem:$ptr),
+                    !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                    [(set
+                       GR32:$dst,
+                       (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                    itin>, OpSize32;
+    def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                     (ins GR64:$val, i64mem:$ptr),
+                     !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                     [(set
+                        GR64:$dst,
+                        (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                     itin>;
+  }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+             TB, LOCK;
+
+def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_8  addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions.
+//===----------------------------------------------------------------------===//
+
+
+// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+  def CMOV_FR32 : I<0, Pseudo,
+                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+                    "#CMOV_FR32 PSEUDO!",
+                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+                                                  EFLAGS))]>;
+  def CMOV_FR64 : I<0, Pseudo,
+                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+                    "#CMOV_FR64 PSEUDO!",
+                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+                                                  EFLAGS))]>;
+  def CMOV_V4F32 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V4F32 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V2F64 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2F64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V2I64 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2I64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V8F32 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V8F32 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V4F64 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V4F64 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V4I64 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V4I64 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V8I64 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V8I64 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V8F64 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V8F64 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V16F32 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V16F32 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+          (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+          (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+          (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+          (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
+// code model mode, should use 'movabs'.  FIXME: This is really a hack, the
+//  'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates.  FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tconstpool:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tjumptable:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, texternalsym:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+          Requires<[NearData, IsStatic]>;
+
+// Calls
+
+// tls has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+          (MOV64ri32 tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+          (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+                             (X86tcret node:$ptr, node:$off), [{
+  // X86tcret args: (*chain, ptr, imm, regs..., glue)
+  unsigned NumRegs = 0;
+  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+      return false;
+  return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[Not64BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+          (TCRETURNmi addr:$dst, imm:$off)>,
+          Requires<[Not64BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi texternalsym:$dst, imm:$off)>,
+          Requires<[Not64BitMode]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+          (TCRETURNdi texternalsym:$dst, imm:$off)>,
+          Requires<[Not64BitMode]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
+          (TCRETURNmi64 addr:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+          (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+          (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+          (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+          (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+          (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
+                  Instruction Inst64> {
+  let Predicates = [HasCMov] in {
+    def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+              (Inst16 GR16:$src2, addr:$src1)>;
+    def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+              (Inst32 GR32:$src2, addr:$src1)>;
+    def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+              (Inst64 GR64:$src2, addr:$src1)>;
+  }
+}
+
+defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
+defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
+defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
+defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
+defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
+defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
+defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
+defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
+defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
+defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
+defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
+defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
+defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
+defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
+defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
+defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i32 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
+
+// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8  GR8  :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. And x86's cmov doesn't do anything if the
+// condition is false. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies.  However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read.  To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  APInt KnownZero0, KnownOne0;
+  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+  APInt KnownZero1, KnownOne1;
+  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+  return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// Try this before the selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "", // orw/addw REG, REG
+                    [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                    "", // orl/addl REG, REG
+                    [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB  : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                    "", // orq/addq REG, REG
+                    [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD16ri8_DB : I<0, Pseudo,
+                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                    "", // orw/addw REG, imm8
+                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "", // orw/addw REG, imm
+                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                    "", // orl/addl REG, imm8
+                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "", // orl/addl REG, imm
+                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                    "", // orq/addq REG, imm8
+                    [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                      "", // orq/addq REG, imm
+                      [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                  i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+          (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+          (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+          (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+          (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+          (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+          (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst),
+          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If in addition the last 32 bits can be
+// represented with a sign extension of a 8 bit constant, use that.
+
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri8
+              (EXTRACT_SUBREG GR64:$src, sub_32bit),
+              (i32 (GetLo8XForm imm:$imm))),
+            sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri
+              (EXTRACT_SUBREG GR64:$src, sub_32bit),
+              (i32 (GetLo32XForm imm:$imm))),
+            sub_32bit)>;
+
+
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+                                                             GR32_ABCD)),
+                                      sub_8bit))>,
+      Requires<[Not64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+             sub_16bit)>,
+      Requires<[Not64BitMode]>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+          (SUBREG_TO_REG (i64 0),
+                         (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+                         sub_32bit)>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+          (SUBREG_TO_REG (i64 0),
+                      (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+                      sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+          (SUBREG_TO_REG (i64 0),
+                         (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+                         sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+      Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (EXTRACT_SUBREG (MOVZX32rr8 (i8
+            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
+      Requires<[In64BitMode]>;
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit))>,
+      Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+             sub_16bit)>,
+      Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+      Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+           (EXTRACT_SUBREG (MOVSX32rr8
+            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
+      Requires<[In64BitMode]>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          sub_8bit)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+      Requires<[In64BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32rr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_16bit)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+                                                             GR16_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+                                                             GR16_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                                   GR32_ABCD)),
+                                             sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_16bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                            sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// Helper imms that check if a mask doesn't change significant shift bits.
+def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
+def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
+
+// Shift amount is implicitly masked.
+multiclass MaskedShiftAmountPats<SDNode frag, string name> {
+  // (shift x (and y, 31)) ==> (shift x, y)
+  def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+            (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+  def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+            (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+  def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+            (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+  def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+            (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+  def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+            (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+  def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+            (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+  // (shift x (and y, 63)) ==> (shift x, y)
+  def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+            (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+  def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+            (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+
+defm : MaskedShiftAmountPats<shl, "SHL">;
+defm : MaskedShiftAmountPats<srl, "SHR">;
+defm : MaskedShiftAmountPats<sra, "SAR">;
+defm : MaskedShiftAmountPats<rotl, "ROL">;
+defm : MaskedShiftAmountPats<rotr, "ROR">;
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+
+
+
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+          (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+          (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri  GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+          (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+          (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+          (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+          (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r  GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+          (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+          (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+          (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+          (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+          (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+          (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+          (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+          (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+          (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+          (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment reg.
+// Do not make INC if it is slow
+def : Pat<(add GR8:$src, 1),
+          (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
+def : Pat<(add GR16:$src, 1),
+          (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR16:$src, 1),
+          (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR32:$src, 1),
+          (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR32:$src, 1),
+          (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR64:$src, 1),
+          (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
+
+// Decrement reg.
+// Do not make DEC if it is slow
+def : Pat<(add GR8:$src, -1),
+          (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
+def : Pat<(add GR16:$src, -1),
+          (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR16:$src, -1),
+          (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR32:$src, -1),
+          (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR32:$src, -1),
+          (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR64:$src, -1),
+          (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+          (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+          (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+          (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+          (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri  GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+          (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+          (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+          (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+          (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+          (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+          (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+          (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+          (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+          (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+          (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+          (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+          (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+          (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+          (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+
+// When HasMOVBE is enabled it is possible to get a non-legalized
+// register-register 16 bit bswap. This maps it to a ROL instruction.
+let Predicates = [HasMOVBE] in {
+ def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 0000000..39ad395
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,315 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+  def RETL   : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret{l}", [(X86retflag 0)], IIC_RET>, OpSize32,
+                    Requires<[Not64BitMode]>;
+  def RETQ   : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret{q}", [(X86retflag 0)], IIC_RET>, OpSize32,
+                    Requires<[In64BitMode]>;
+  def RETW   : I   <0xC3, RawFrm, (outs), (ins),
+                    "ret{w}",
+                    [], IIC_RET>, OpSize16;
+  def RETIL  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret{l}\t$amt",
+                    [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+               Requires<[Not64BitMode]>;
+  def RETIQ  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret{q}\t$amt",
+                    [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+               Requires<[In64BitMode]>;
+  def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+                    "ret{w}\t$amt",
+                    [], IIC_RET_IMM>, OpSize16;
+  def LRETL  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{l|f}", [], IIC_RET>, OpSize32;
+  def LRETQ  : RI  <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+  def LRETW  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+  def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+  def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
+  def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+  def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+                        "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize32;
+  def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+                        "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize16,
+                        Requires<[In16BitMode]>;
+  let hasSideEffects = 0 in
+  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+                       "jmp\t$dst", [], IIC_JMP_REL>;
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+  multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+    let hasSideEffects = 0 in
+    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
+                       IIC_Jcc>;
+    def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, OpSize16,
+		       TB, Requires<[In16BitMode]>;
+    def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB,
+             OpSize32;
+  }
+}
+
+defm JO  : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>;
+defm JB  : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE  : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA  : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS  : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP  : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL  : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcx/jecx/jrcx instructions.
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
+  // These are the 32-bit versions of this instruction for the asmparser.  In
+  // 32-bit mode, the address size prefix is jcxz and the unprefixed version is
+  // jecxz.
+  let Uses = [CX] in
+    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[Not64BitMode]>;
+  let Uses = [ECX] in
+    def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                           "jecxz\t$dst", [], IIC_JCXZ>, Requires<[Not64BitMode]>;
+
+  // J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
+  // In 64-bit mode, the address size prefix is jecxz and the unprefixed version
+  // is jrcxz.
+  let Uses = [ECX] in
+    def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                            "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>;
+  let Uses = [RCX] in
+    def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                           "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def JMP16r     : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
+                     [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+                   OpSize16, Sched<[WriteJump]>;
+  def JMP16m     : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
+                     [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+
+  def JMP32r     : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+                     [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+                   OpSize32, Sched<[WriteJump]>;
+  def JMP32m     : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+                     [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
+
+  def JMP64r     : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+                     [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+                   Sched<[WriteJump]>;
+  def JMP64m     : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+                     [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+
+  def FARJMP16i  : Iseg16<0xEA, RawFrmImm16, (outs),
+                          (ins i16imm:$off, i16imm:$seg),
+                          "ljmp{w}\t{$seg, $off|$off, $seg}", [],
+                          IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+  def FARJMP32i  : Iseg32<0xEA, RawFrmImm16, (outs),
+                          (ins i32imm:$off, i16imm:$seg),
+                          "ljmp{l}\t{$seg, $off|$off, $seg}", [],
+                          IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+  def FARJMP64   : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+                      "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+                   Sched<[WriteJump]>;
+
+  def FARJMP16m  : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+                     "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
+                   Sched<[WriteJumpLd]>;
+  def FARJMP32m  : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+                     "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
+                   Sched<[WriteJumpLd]>;
+}
+
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP   : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE  : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. ESP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Uses = [ESP] in {
+    def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+                           (outs), (ins i32imm_pcrel:$dst),
+                           "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+                      Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+    def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+                           (outs), (ins i16imm_pcrel:$dst),
+                           "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+                      Sched<[WriteJump]>;
+    def CALL16r     : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
+                        "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+                      OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+    def CALL16m     : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
+                        "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
+                        IIC_CALL_MEM>, OpSize16,
+                      Requires<[Not64BitMode,FavorMemIndirectCall]>,
+                      Sched<[WriteJumpLd]>;
+    def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+                        "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+                      OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+    def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+                        "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+                        IIC_CALL_MEM>, OpSize32,
+                      Requires<[Not64BitMode,FavorMemIndirectCall]>,
+                      Sched<[WriteJumpLd]>;
+
+    def FARCALL16i  : Iseg16<0x9A, RawFrmImm16, (outs),
+                             (ins i16imm:$off, i16imm:$seg),
+                             "lcall{w}\t{$seg, $off|$off, $seg}", [],
+                             IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+    def FARCALL32i  : Iseg32<0x9A, RawFrmImm16, (outs),
+                             (ins i32imm:$off, i16imm:$seg),
+                             "lcall{l}\t{$seg, $off|$off, $seg}", [],
+                             IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+
+    def FARCALL16m  : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+                        "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
+                      Sched<[WriteJumpLd]>;
+    def FARCALL32m  : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+                        "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
+                      Sched<[WriteJumpLd]>;
+  }
+
+
+// Tail call stuff.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+  let Uses = [ESP] in {
+  def TCRETURNdi : PseudoI<(outs),
+                     (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+  def TCRETURNri : PseudoI<(outs),
+                     (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi : PseudoI<(outs),
+                     (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+  // FIXME: The should be pseudo instructions that are lowered when going to
+  // mcinst.
+  def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+                           (ins i32imm_pcrel:$dst),
+                           "jmp\t$dst  # TAILCALL",
+                           [], IIC_JMP_REL>;
+  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                   "", [], IIC_JMP_REG>;  // FIXME: Remove encoding when JIT is dead.
+  let mayLoad = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+                   "jmp{l}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+  // NOTE: this pattern doesn't match "X86call imm", because we do not know
+  // that the offset between an arbitrary immediate and the call will fit in
+  // the 32-bit pcrel field that we have.
+  def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+                        (outs), (ins i64i32imm_pcrel:$dst),
+                        "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
+                      Requires<[In64BitMode]>;
+  def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+                        "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+                        IIC_CALL_RI>,
+                      Requires<[In64BitMode]>;
+  def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+                        "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+                        IIC_CALL_MEM>,
+                      Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+  def FARCALL64   : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+                       "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isCodeGenOnly = 1 in
+  // __chkstk(MSVC):     clobber R10, R11 and EFLAGS.
+  // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP.
+  let Defs = [RAX, R10, R11, RSP, EFLAGS],
+      Uses = [RSP] in {
+    def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
+                      (outs), (ins i64i32imm_pcrel:$dst),
+                      "call{q}\t$dst", [], IIC_CALL_RI>,
+                    Requires<[IsWin64]>, Sched<[WriteJump]>;
+  }
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+    SchedRW = [WriteJump] in {
+  def TCRETURNdi64 : PseudoI<(outs),
+                      (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+                      []>;
+  def TCRETURNri64 : PseudoI<(outs),
+                      (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi64 : PseudoI<(outs),
+                       (ins i64mem_TC:$dst, i32imm:$offset), []>;
+
+  def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
+                                      (ins i64i32imm_pcrel:$dst),
+                   "jmp\t$dst  # TAILCALL", [], IIC_JMP_REL>;
+  def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                     "jmp{q}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+
+  let mayLoad = 1 in
+  def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+                     "jmp{q}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 0000000..6be6a1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,172 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in {
+  let Defs = [AX], Uses = [AL] in
+  def CBW : I<0x98, RawFrm, (outs), (ins),
+              "{cbtw|cbw}", [], IIC_CBW>, OpSize16;  // AX = signext(AL)
+  let Defs = [EAX], Uses = [AX] in
+  def CWDE : I<0x98, RawFrm, (outs), (ins),
+              "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX)
+
+  let Defs = [AX,DX], Uses = [AX] in
+  def CWD : I<0x99, RawFrm, (outs), (ins),
+              "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX)
+  let Defs = [EAX,EDX], Uses = [EAX] in
+  def CDQ : I<0x99, RawFrm, (outs), (ins),
+              "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX)
+
+
+  let Defs = [RAX], Uses = [EAX] in
+  def CDQE : RI<0x98, RawFrm, (outs), (ins),
+               "{cltq|cdqe}", [], IIC_CBW>;     // RAX = signext(EAX)
+
+  let Defs = [RAX,RDX], Uses = [RAX] in
+  def CQO  : RI<0x99, RawFrm, (outs), (ins),
+                "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
+}
+
+
+
+// Sign/Zero extenders
+let neverHasSideEffects = 1 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+                   TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+                   TB, OpSize16, Sched<[WriteALULd]>;
+} // neverHasSideEffects = 1
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+                   OpSize32, Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+                   OpSize32, TB, Sched<[WriteALULd]>;
+
+let neverHasSideEffects = 1 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+                   TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+                   TB, OpSize16, Sched<[WriteALULd]>;
+} // neverHasSideEffects = 1
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+                   OpSize32, Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+                   TB, OpSize32, Sched<[WriteALULd]>;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+}
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+                    Sched<[WriteALU]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+                    Sched<[WriteALULd]>;
+
+// movzbq and movzwq encodings for the disassembler
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+                       "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALU]>;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+                       "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALULd]>;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALU]>;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALULd]>;
+
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 0000000..c0a6864
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,390 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+                    PatFrag MemFrag128, PatFrag MemFrag256,
+                    ValueType OpVT128, ValueType OpVT256,
+                    bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
+                    SDPatternOperator Op = null_frag> {
+  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+  def r     : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
+                                               VR128:$src1, VR128:$src3)))]>;
+
+  let mayLoad = 1, isCommutable = IsMVariantCommutable in
+  def m     : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
+                                               (MemFrag128 addr:$src3))))]>;
+
+  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+  def rY    : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2, VR256:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+                                               VR256:$src3)))]>, VEX_L;
+
+  let mayLoad = 1, isCommutable = IsMVariantCommutable in
+  def mY    : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR256:$dst,
+                     (OpVT256 (Op VR256:$src2, VR256:$src1,
+                               (MemFrag256 addr:$src3))))]>, VEX_L;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                       string OpcodeStr, string PackTy,
+                       PatFrag MemFrag128, PatFrag MemFrag256,
+                       SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+  // For 213, both the register and memory variant are commutable.
+  // Indeed, the commutable operands are 1 and 2 and both live in registers
+  // for both variants.
+  defm r213 : fma3p_rm<opc213,
+                       !strconcat(OpcodeStr, "213", PackTy),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 1,
+                       Op>;
+let neverHasSideEffects = 1 in {
+  defm r132 : fma3p_rm<opc132,
+                       !strconcat(OpcodeStr, "132", PackTy),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
+  // For 231, only the register variant is commutable.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, it cannot be swapped with 2.
+  defm r231 : fma3p_rm<opc231,
+                       !strconcat(OpcodeStr, "231", PackTy),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 0>;
+} // neverHasSideEffects = 1
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
+                                 loadv8f32, X86Fmadd, v4f32, v8f32>;
+  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
+                                 loadv8f32, X86Fmsub, v4f32, v8f32>;
+  defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
+                                 loadv4f32, loadv8f32, X86Fmaddsub,
+                                 v4f32, v8f32>;
+  defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
+                                 loadv4f32, loadv8f32, X86Fmsubadd,
+                                 v4f32, v8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64,
+                                 loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64,
+                                 loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+  defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
+                                 loadv2f64, loadv4f64, X86Fmaddsub,
+                                 v2f64, v4f64>, VEX_W;
+  defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
+                                 loadv2f64, loadv4f64, X86Fmsubadd,
+                                 v2f64, v4f64>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  loadv4f32,
+                               loadv8f32, X86Fnmadd, v4f32, v8f32>;
+  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  loadv4f32,
+                               loadv8f32, X86Fnmsub, v4f32, v8f32>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64,
+                               loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+  defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
+                               loadv2f64, loadv4f64, X86Fnmsub, v2f64,
+                               v4f64>, VEX_W;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
+                    RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+                    bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
+                    SDPatternOperator OpNode = null_frag> {
+  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+
+  let mayLoad = 1, isCommutable = IsMVariantCommutable in
+  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1,
+                            (mem_frag addr:$src3))))]>;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                       string OpStr, string PackTy, string PT2, Intrinsic Int,
+                       SDNode OpNode, RegisterClass RC, ValueType OpVT,
+                       X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
+                       ComplexPattern mem_cpat> {
+let neverHasSideEffects = 1 in {
+  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+                       x86memop, RC, OpVT, mem_frag>;
+  // See the other defm of r231 for the explanation regarding the
+  // commutable flags.
+  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+                       x86memop, RC, OpVT, mem_frag,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 0>;
+}
+
+// See the other defm of r213 for the explanation regarding the
+// commutable flags.
+defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+                     x86memop, RC, OpVT, mem_frag,
+                     /* IsRVariantCommutable */ 1,
+                     /* IsMVariantCommutable */ 1,
+                     OpNode>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                 string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+                 SDNode OpNode> {
+  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
+                        FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
+  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
+                        FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+
+  def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+            (COPY_TO_REGCLASS
+              (!cast<Instruction>(NAME#"SSr213r")
+                (COPY_TO_REGCLASS $src2, FR32),
+                (COPY_TO_REGCLASS $src1, FR32),
+                (COPY_TO_REGCLASS $src3, FR32)),
+              VR128)>;
+
+  def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+            (COPY_TO_REGCLASS
+              (!cast<Instruction>(NAME#"SDr213r")
+                (COPY_TO_REGCLASS $src2, FR64),
+                (COPY_TO_REGCLASS $src1, FR64),
+                (COPY_TO_REGCLASS $src3, FR64)),
+              VR128)>;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+                    int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+                    int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+                     int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+                     int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                 X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, RC:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst,
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
+  def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86memop:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+                           (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
+  def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, x86memop:$src2, RC:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst,
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2, RC:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+               VEX_LIG;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+                     ComplexPattern mem_cpat, Intrinsic Int> {
+let isCodeGenOnly = 1 in {
+  let isCommutable = 1 in
+  def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4;
+  def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, memop:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+                                  mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4;
+  def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, memop:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+} // isCodeGenOnly = 1
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                 ValueType OpVT128, ValueType OpVT256,
+                 PatFrag ld_frag128, PatFrag ld_frag256> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+           VEX_W, MemOp4;
+  def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+                              (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
+  def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+  let isCommutable = 1 in
+  def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+           VEX_W, MemOp4, VEX_L;
+  def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+                              (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
+  def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (OpNode VR256:$src1,
+                              (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+  def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+                (ins VR256:$src1, VR256:$src2, VR256:$src3),
+                !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+                VEX_L;
+} // isCodeGenOnly = 1
+}
+
+defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+                  fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+                  fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+                  fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+                  fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+                        X86Fnmadd, loadf32>,
+                  fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+                        X86Fnmadd, loadf64>,
+                  fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+                        X86Fnmsub, loadf32>,
+                  fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+                        X86Fnmsub, loadf64>,
+                  fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmsub_sd>;
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..4ad7b7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,700 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2    : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, 
+                                           SDTCisVT<1, f80>]>;
+def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw    : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDTX86FpToIMem  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
+                             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst          : SDNode<"X86ISD::FST", SDTX86Fst,
+                             [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+                              SDNPMemOperand]>;
+def X86fild         : SDNode<"X86ISD::FILD", SDTX86Fild,
+                             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+                             [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+                              SDNPMemOperand]>;
+def X86fp_stsw      : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m",          SDTX86CwdStore,
+                             [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+                              SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in {  // Expanded after instruction selection.
+  def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+                              [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+                              [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+                              [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+                              [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+                              [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+                              [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+  def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+                              [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+                              [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+                              [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here.  The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values.  These sizes apply to the values, 
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information.  These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The second instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler.  These use "RST" registers, although frequently
+// the actual register(s) used are implicit.  These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation 
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// Pseudo Instruction for FP stack return values.
+def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+                [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpIf32<(outs RFP32:$dst), 
+                     (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP32:$dst, 
+                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m  : FpIf64<(outs RFP64:$dst), 
+                     (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst), 
+                     (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst), 
+                   (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP80:$dst, 
+                    (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst), 
+                   (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP80:$dst, 
+                    (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src), 
+                 !strconcat("f", asmstring, "{s}\t$src")> { 
+  let mayLoad = 1; 
+}
+def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src), 
+                 !strconcat("f", asmstring, "{l}\t$src")> { 
+  let mayLoad = 1; 
+}
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP80:$dst, (OpNode RFP80:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP80:$dst, (OpNode RFP80:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src), 
+                  !strconcat("fi", asmstring, "{s}\t$src")> { 
+  let mayLoad = 1; 
+}
+def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src), 
+                  !strconcat("fi", asmstring, "{l}\t$src")> { 
+  let mayLoad = 1; 
+}
+}
+
+let Defs = [FPSW] in {
+// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
+// resources.
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
+let SchedRW = [WriteFAddLd] in {
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+}
+let SchedRW = [WriteFMulLd] in {
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+}
+let SchedRW = [WriteFDivLd] in {
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+}
+}
+
+class FPST0rInst<Format fp, string asm>
+  : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+class FPrST0Inst<Format fp, string asm>
+  : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+class FPrST0PInst<Format fp, string asm>
+  : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
+// we have to put some 'r's in and take them out of weird places.
+let SchedRW = [WriteFAdd] in {
+def ADD_FST0r   : FPST0rInst <MRM0r, "fadd\t$op">;
+def ADD_FrST0   : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
+def ADD_FPrST0  : FPrST0PInst<MRM0r, "faddp\t$op">;
+def SUBR_FST0r  : FPST0rInst <MRM5r, "fsubr\t$op">;
+def SUB_FrST0   : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
+def SUB_FPrST0  : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
+def SUB_FST0r   : FPST0rInst <MRM4r, "fsub\t$op">;
+def SUBR_FrST0  : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFMul] in {
+def MUL_FST0r   : FPST0rInst <MRM1r, "fmul\t$op">;
+def MUL_FrST0   : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
+def MUL_FPrST0  : FPrST0PInst<MRM1r, "fmulp\t$op">;
+} // SchedRW
+let SchedRW = [WriteFDiv] in {
+def DIVR_FST0r  : FPST0rInst <MRM7r, "fdivr\t$op">;
+def DIV_FrST0   : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
+def DIV_FPrST0  : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
+def DIV_FST0r   : FPST0rInst <MRM6r, "fdiv\t$op">;
+def DIVR_FrST0  : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+} // SchedRW
+
+def COM_FST0r   : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r  : FPST0rInst <MRM3r, "fcomp\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+                 [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+                 [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+                 [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F     : FPI<0xD9, fp, (outs), (ins), asmstring>;
+}
+
+let Defs = [FPSW] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
+let SchedRW = [WriteFSqrt] in {
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
+}
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
+
+let neverHasSideEffects = 1 in {
+def TST_Fp32  : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64  : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80  : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F  : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
+} // Defs = [FPSW]
+
+// Versions of FP instructions that take a single memory operand.  Added for the
+//   disassembler; remove as they are included with patterns elsewhere.
+def FCOM32m  : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FLDENVm  : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm  : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m  : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm  : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm   : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
+def FNSTSWm  : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm    : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
+def FBSTPm   : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+                       CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp64  : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+                       CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+                     CondMovFP,
+                     [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+                                        cc, EFLAGS))]>,
+                                        Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xDA, MRM0r, (outs RST:$op), (ins),
+                  "fcmovb\t{$op, %st(0)|st(0), $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs RST:$op), (ins),
+                  "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVE_F  : FPI<0xDA, MRM1r, (outs RST:$op), (ins),
+                  "fcmove\t{$op, %st(0)|st(0), $op}">;
+def CMOVP_F  : FPI<0xDA, MRM3r, (outs RST:$op), (ins),
+                  "fcmovu\t{$op, %st(0)|st(0), $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs RST:$op), (ins),
+                  "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs RST:$op), (ins),
+                  "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs RST:$op), (ins),
+                  "fcmovne\t{$op, %st(0)|st(0), $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs RST:$op), (ins),
+                  "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m   : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+  def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m   : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m   : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+                  [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+                  [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+                  [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m    : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32  : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m    : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32  : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64  : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m    : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+                    [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32  : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32  : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32  : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64  : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64  : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64  : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80  : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80  : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80  : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def LD_F32m   : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+                    IIC_FLD>;
+def LD_F64m   : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+                    IIC_FLD>;
+def LD_F80m   : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+                    IIC_FLD80>;
+def ILD_F16m  : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+                    IIC_FILD>;
+def ILD_F32m  : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+                    IIC_FILD>;
+def ILD_F64m  : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+                    IIC_FILD>;
+}
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ST_F32m   : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+                    IIC_FST>;
+def ST_F64m   : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+                    IIC_FST>;
+def ST_FP32m  : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+                    IIC_FST>;
+def ST_FP64m  : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+                    IIC_FST>;
+def ST_FP80m  : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+                    IIC_FST80>;
+def IST_F16m  : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+                    IIC_FIST>;
+def IST_F32m  : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+                    IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+                    IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+                    IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+                    IIC_FIST>;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+let Predicates = [HasSSE3] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+  IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+  IIC_FST>;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), 
+  "fisttp{ll}\t$dst", IIC_FST>;
+}
+
+// FP Stack manipulation instructions.
+let SchedRW = [WriteMove] in {
+def LD_Frr   : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
+def ST_Frr   : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
+def ST_FPrr  : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
+def XCH_F    : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
+}
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm1)]>;
+}
+
+let SchedRW = [WriteZero] in {
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
+}
+
+// Floating point compares.
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_  <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr    : FPI<0xDD, MRM4r,    // FPSW = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+def UCOM_FPr   : FPI<0xDD, MRM5r,    // FPSW = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+def UCOM_FPPr  : FPI<0xDA, MRM_E9,       // cmp ST(0) with ST(1), pop, pop
+                    (outs), (ins), "fucompp", IIC_FUCOM>;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr   : FPI<0xDB, MRM5r,     // CC = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+def UCOM_FIPr  : FPI<0xDF, MRM5r,     // CC = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
+}
+
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
+                  "fcomi\t$reg", IIC_FCOMI>;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
+                   "fcompi\t$reg", IIC_FCOMI>;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xDF, MRM_E0,                  // AX = fp flags
+                  (outs), (ins), "fnstsw\t{%ax|ax}",
+                  [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control world
+                  (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+                  [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control world = [mem16]
+                  (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+                Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
+                "ffree\t$reg", IIC_FFREE>;
+// Clear exceptions
+
+let Defs = [FPSW] in
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+
+def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+               "fxsave\t$dst", [], IIC_FXSAVE>, TB;
+def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+                  "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB, 
+                  Requires<[In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                  "fxrstor{q|64}\t$src", [], IIC_FXRSTOR>, TB,
+                  Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, 
+                                                          RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, 
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, 
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+                                                         RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to conv. i64 to f64 since there isn't a SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+           Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack.  We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+           Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..8ef5f90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,876 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<7> val> {
+  bits<7> Value = val;
+}
+
+def Pseudo     : Format<0>; def RawFrm     : Format<1>;
+def AddRegFrm  : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg  : Format<5>;
+def MRMSrcMem  : Format<6>; def RawFrmMemOffs : Format<7>;
+def RawFrmSrc  : Format<8>; def RawFrmDst     : Format<9>;
+def RawFrmDstSrc: Format<10>;
+def RawFrmImm8 : Format<11>;
+def RawFrmImm16 : Format<12>;
+def MRMXr  : Format<14>; def MRMXm  : Format<15>;
+def MRM0r  : Format<16>; def MRM1r  : Format<17>; def MRM2r  : Format<18>;
+def MRM3r  : Format<19>; def MRM4r  : Format<20>; def MRM5r  : Format<21>;
+def MRM6r  : Format<22>; def MRM7r  : Format<23>;
+def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
+def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
+def MRM6m  : Format<30>; def MRM7m  : Format<31>;
+def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>;
+def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>;
+def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>;
+def MRM_D0 : Format<41>; def MRM_D1 : Format<42>; def MRM_D4 : Format<43>;
+def MRM_D5 : Format<44>; def MRM_D6 : Format<45>; def MRM_D8 : Format<46>;
+def MRM_D9 : Format<47>; def MRM_DA : Format<48>; def MRM_DB : Format<49>;
+def MRM_DC : Format<50>; def MRM_DD : Format<51>; def MRM_DE : Format<52>;
+def MRM_DF : Format<53>; def MRM_E0 : Format<54>; def MRM_E1 : Format<55>;
+def MRM_E2 : Format<56>; def MRM_E3 : Format<57>; def MRM_E4 : Format<58>;
+def MRM_E5 : Format<59>; def MRM_E8 : Format<60>; def MRM_E9 : Format<61>;
+def MRM_EA : Format<62>; def MRM_EB : Format<63>; def MRM_EC : Format<64>;
+def MRM_ED : Format<65>; def MRM_EE : Format<66>; def MRM_F0 : Format<67>;
+def MRM_F1 : Format<68>; def MRM_F2 : Format<69>; def MRM_F3 : Format<70>;
+def MRM_F4 : Format<71>; def MRM_F5 : Format<72>; def MRM_F6 : Format<73>;
+def MRM_F7 : Format<74>; def MRM_F8 : Format<75>; def MRM_F9 : Format<76>;
+def MRM_FA : Format<77>; def MRM_FB : Format<78>; def MRM_FC : Format<79>;
+def MRM_FD : Format<80>; def MRM_FE : Format<81>; def MRM_FF : Format<82>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<4> val> {
+  bits<4> Value = val;
+}
+def NoImm      : ImmType<0>;
+def Imm8       : ImmType<1>;
+def Imm8PCRel  : ImmType<2>;
+def Imm16      : ImmType<3>;
+def Imm16PCRel : ImmType<4>;
+def Imm32      : ImmType<5>;
+def Imm32PCRel : ImmType<6>;
+def Imm32S     : ImmType<7>;
+def Imm64      : ImmType<8>;
+
+// FPFormat - This specifies what form this FP instruction has.  This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP      : FPFormat<0>;
+def ZeroArgFP  : FPFormat<1>;
+def OneArgFP   : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP   : FPFormat<4>;
+def CompareFP  : FPFormat<5>;
+def CondMovFP  : FPFormat<6>;
+def SpecialFP  : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+  bits<2> Value = val;
+}
+def GenericDomain   : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt    : Domain<3>;
+
+// Class specifying the vector form of the decompressed
+// displacement of 8-bit.
+class CD8VForm<bits<3> val> {
+  bits<3> Value = val;
+}
+def CD8VF  : CD8VForm<0>;  // v := VL
+def CD8VH  : CD8VForm<1>;  // v := VL/2
+def CD8VQ  : CD8VForm<2>;  // v := VL/4
+def CD8VO  : CD8VForm<3>;  // v := VL/8
+def CD8VT1 : CD8VForm<4>;  // v := 1
+def CD8VT2 : CD8VForm<5>;  // v := 2
+def CD8VT4 : CD8VForm<6>;  // v := 4
+def CD8VT8 : CD8VForm<7>;  // v := 8
+
+// Class specifying the prefix used an opcode extension.
+class Prefix<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PS     : Prefix<1>;
+def PD     : Prefix<2>;
+def XS     : Prefix<3>;
+def XD     : Prefix<4>;
+
+// Class specifying the opcode map.
+class Map<bits<3> val> {
+  bits<3> Value = val;
+}
+def OB   : Map<0>;
+def TB   : Map<1>;
+def T8   : Map<2>;
+def TA   : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+
+// Class specifying the encoding
+class Encoding<bits<2> val> {
+  bits<2> Value = val;
+}
+def EncNormal : Encoding<0>;
+def EncVEX    : Encoding<1>;
+def EncXOP    : Encoding<2>;
+def EncEVEX   : Encoding<3>;
+
+// Operand size for encodings that change based on mode.
+class OperandSize<bits<2> val> {
+  bits<2> Value = val;
+}
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16    : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32    : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize16 { OperandSize OpSize = OpSize16; }
+class OpSize32 { OperandSize OpSize = OpSize32; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W  { bit hasREX_WPrefix = 1; }
+class LOCK   { bit hasLockPrefix = 1; }
+class REP    { bit hasREPPrefix = 1; }
+class TB     { Map OpMap = TB; }
+class T8     { Map OpMap = T8; }
+class TA     { Map OpMap = TA; }
+class XOP8   { Map OpMap = XOP8; Prefix OpPrefix = PS; }
+class XOP9   { Map OpMap = XOP9; Prefix OpPrefix = PS; }
+class XOPA   { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class OBXS   { Prefix OpPrefix = XS; }
+class PS   : TB { Prefix OpPrefix = PS; }
+class PD   : TB { Prefix OpPrefix = PD; }
+class XD   : TB { Prefix OpPrefix = XD; }
+class XS   : TB { Prefix OpPrefix = XS; }
+class T8PS : T8 { Prefix OpPrefix = PS; }
+class T8PD : T8 { Prefix OpPrefix = PD; }
+class T8XD : T8 { Prefix OpPrefix = XD; }
+class T8XS : T8 { Prefix OpPrefix = XS; }
+class TAPS : TA { Prefix OpPrefix = PS; }
+class TAPD : TA { Prefix OpPrefix = PD; }
+class TAXD : TA { Prefix OpPrefix = XD; }
+class VEX    { Encoding OpEnc = EncVEX; }
+class VEX_W  { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4V = 1; }
+class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; }
+class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
+class VEX_L  { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_RC { bit hasEVEX_RC = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
+class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+
+// Specify AVX512 8-bit compressed displacement encoding based on the vector
+// element size in bits (8, 16, 32, 64) and the CDisp8 form.
+class EVEX_CD8<int esize, CD8VForm form> {
+  int CD8_EltSize = !srl(esize, 3);
+  bits<3> CD8_Form = form.Value;
+}
+
+class Has3DNow0F0FOpcode  { bit has3DNow0F0FOpcode = 1; }
+class MemOp4 { bit hasMemOp4Prefix = 1; }
+class XOP { Encoding OpEnc = EncXOP; }
+class XOP_4V : XOP { bit hasVEX_4V = 1; }
+class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+              string AsmStr,
+              InstrItinClass itin,
+              Domain d = GenericDomain>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<7> FormBits = Form.Value;
+  ImmType ImmT = i;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  string AsmString = AsmStr;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  let Itinerary = itin;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit ForceDisassemble = 0; // Force instruction to disassemble even though it's
+                            // isCodeGenonly. Needed to hide an ambiguous
+                            // AsmString from the parser, but still disassemble.
+
+  OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
+                                    // based on operand size of the mode
+  bits<2> OpSizeBits = OpSize.Value;
+  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+
+  Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+  bits<3> OpPrefixBits = OpPrefix.Value;
+  Map OpMap = OB;           // Which opcode map does this inst have?
+  bits<3> OpMapBits = OpMap.Value;
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+  FPFormat FPForm = NotFP;  // What flavor of FP instruction is this?
+  bit hasLockPrefix = 0;    // Does this inst have a 0xF0 prefix?
+  Domain ExeDomain = d;
+  bit hasREPPrefix = 0;     // Does this inst have a REP prefix?
+  Encoding OpEnc = EncNormal; // Encoding used by this instruction
+  bits<2> OpEncBits = OpEnc.Value;
+  bit hasVEX_WPrefix = 0;   // Does this inst set the VEX_W field?
+  bit hasVEX_4V = 0;        // Does this inst require the VEX.VVVV field?
+  bit hasVEX_4VOp3 = 0;     // Does this inst require the VEX.VVVV field to
+                            // encode the third operand?
+  bit hasVEX_i8ImmReg = 0;  // Does this inst require the last source register
+                            // to be encoded in a immediate field?
+  bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
+  bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit
+  bit hasEVEX_K = 0;        // Does this inst require masking?
+  bit hasEVEX_Z = 0;        // Does this inst set the EVEX_Z field?
+  bit hasEVEX_L2 = 0;       // Does this inst set the EVEX_L2 field?
+  bit hasEVEX_B = 0;        // Does this inst set the EVEX_B field?
+  bits<3> CD8_Form = 0;     // Compressed disp8 form - vector-width.
+  // Declare it int rather than bits<4> so that all bits are defined when
+  // assigning to bits<7>.
+  int CD8_EltSize = 0;      // Compressed disp8 form - element-size in bytes.
+  bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
+  bit hasMemOp4Prefix = 0;  // Same bit as VEX_W, but used for swapping operands
+  bit hasEVEX_RC = 0;       // Explicitly specified rounding control in FP instruction.
+
+  bits<2> EVEX_LL;
+  let EVEX_LL{0} = hasVEX_L;
+  let EVEX_LL{1} = hasEVEX_L2;
+  // Vector size in bytes.
+  bits<7> VectSize = !shl(16, EVEX_LL);
+
+  // The scaling factor for AVX512's compressed displacement is either
+  //   - the size of a  power-of-two number of elements or
+  //   - the size of a single element for broadcasts or
+  //   - the total vector size divided by a power-of-two number.
+  // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+  bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+                           !if (CD8_Form{2},
+                                !shl(CD8_EltSize, CD8_Form{1-0}),
+                                !if (hasEVEX_B,
+                                     CD8_EltSize,
+                                     !srl(VectSize, CD8_Form{1-0}))), 0);
+
+  // TSFlags layout should be kept in sync with X86InstrInfo.h.
+  let TSFlags{6-0}   = FormBits;
+  let TSFlags{8-7}   = OpSizeBits;
+  let TSFlags{9}     = hasAdSizePrefix;
+  let TSFlags{12-10} = OpPrefixBits;
+  let TSFlags{15-13} = OpMapBits;
+  let TSFlags{16}    = hasREX_WPrefix;
+  let TSFlags{20-17} = ImmT.Value;
+  let TSFlags{23-21} = FPForm.Value;
+  let TSFlags{24}    = hasLockPrefix;
+  let TSFlags{25}    = hasREPPrefix;
+  let TSFlags{27-26} = ExeDomain.Value;
+  let TSFlags{29-28} = OpEncBits;
+  let TSFlags{37-30} = Opcode;
+  let TSFlags{38}    = hasVEX_WPrefix;
+  let TSFlags{39}    = hasVEX_4V;
+  let TSFlags{40}    = hasVEX_4VOp3;
+  let TSFlags{41}    = hasVEX_i8ImmReg;
+  let TSFlags{42}    = hasVEX_L;
+  let TSFlags{43}    = ignoresVEX_L;
+  let TSFlags{44}    = hasEVEX_K;
+  let TSFlags{45}    = hasEVEX_Z;
+  let TSFlags{46}    = hasEVEX_L2;
+  let TSFlags{47}    = hasEVEX_B;
+  // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+  let TSFlags{54-48} = CD8_Scale;
+  let TSFlags{55}    = has3DNow0F0FOpcode;
+  let TSFlags{56}    = hasMemOp4Prefix;
+  let TSFlags{57}    = hasEVEX_RC;
+}
+
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+  let Pattern = pattern;
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+        list<dag> pattern, InstrItinClass itin = NoItinerary,
+        Domain d = GenericDomain>
+  : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary,
+           Domain d = GenericDomain>
+  : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+           : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          InstrItinClass itin = NoItinerary>
+  : I<o, F, outs, ins, asm, [], itin> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+  : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+  let FPForm = fp;
+  let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+//  their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+//   Iseg16 - 16-bit segment selector, 16-bit offset
+//   Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// SI - SSE 1 & 2 scalar instructions
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+                   !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+                   !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])))));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// SIi8 - SSE 1 & 2 scalar instructions
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+                   !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+                   [UseSSE2])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// PI - SSE 1 & 2 packed instructions
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+         InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+            InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
+                       [HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin, Domain d>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// SSE1 Instruction Templates:
+// 
+//   SSI   - SSE1 instructions with XS prefix.
+//   PSI   - SSE1 instructions with PS prefix.
+//   PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
+//   VSSI  - SSE1 instructions with XS prefix in AVX form.
+//   VPSI  - SSE1 instructions with PS prefix in AVX form, packed single.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+        Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+        Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
+        Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+// 
+//   SDI    - SSE2 instructions with XD prefix.
+//   SDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix.
+//   S2SI   - SSE2 instructions with XS prefix.
+//   SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+//   PDI    - SSE2 instructions with PD prefix, packed double domain.
+//   PDIi8  - SSE2 instructions with ImmT == Imm8 and PD prefix.
+//   VSDI   - SSE2 scalar instructions with XD prefix in AVX form.
+//   VPDI   - SSE2 vector instructions with PD prefix in AVX form,
+//                 packed double domain.
+//   VS2I   - SSE2 scalar instructions with PD prefix in AVX form.
+//   S2I    - SSE2 scalar instructions with PD prefix.
+//   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+//               MMX operands.
+//   MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+//               MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+        Requires<[UseAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+        PD, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+        Requires<[UseAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+// 
+//   S3I   - SSE3 instructions with PD prefixes.
+//   S3SI  - SSE3 instructions with XS prefix.
+//   S3DI  - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+        Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+        Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+// 
+//   SS38I - SSSE3 instructions with T8 prefix.
+//   SS3AI - SSSE3 instructions with TA prefix.
+//   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+//   MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
+        Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
+        Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+// 
+//   SS48I - SSE 4.1 instructions with T8 prefix.
+//   SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+// 
+//   SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[UseSSE42]>;
+
+//   SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+//   SS42AI = SSE 4.2 instructions with TA prefix
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+//   Instructions introduced in AVX (no SSE equivalent forms)
+//
+//   AVX8I - AVX instructions with T8PD prefix.
+//   AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+//   Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+//   AVX28I - AVX2 instructions with T8PD prefix.
+//   AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAVX2]>;
+
+
+// AVX-512 Instruction Templates:
+//   Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+//   AVX5128I - AVX-512 instructions with T8PD prefix.
+//   AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+//   AVX512PDI  - AVX-512 instructions with PD, double packed.
+//   AVX512PSI  - AVX-512 instructions with PS, single packed.
+//   AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+//   AVX512XSI  - AVX-512 instructions with XS prefix, generic domain.
+//   AVX512BI   - AVX-512 instructions with PD, int packed domain.
+//   AVX512SI   - AVX-512 scalar instructions with PD prefix.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAVX512]>;
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+        Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS,
+        Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+        Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+        Requires<[HasAVX512]>;
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+        Requires<[HasAVX512]>;
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAVX512]>;
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+        Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+        Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+        EVEX_4V, Requires<[HasAVX512]>;
+
+// AES Instruction Templates:
+//
+// AES8I
+// These use the same encoding as the SSE4.2 T8 and TA encodings.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag>pattern, InstrItinClass itin = IIC_AES>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasPCLMUL]>;
+
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                  list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+        VEX_4V, FMASC, Requires<[HasFMA]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD,
+        VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP9, Requires<[HasXOP]>;
+
+// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP8, Requires<[HasXOP]>;
+
+//  XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I  - MMX / SSE2 instructions with PD prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 0000000..6f0fa94
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,570 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def bc_mmx  : PatFrag<(ops node:$in), (x86mmx  (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+                                            SDTCisFP<0>, SDTCisInt<2> ]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<1>, SDTCisVT<3, i8>,
+                                       SDTCisVec<1>]>;
+
+def X86umin    : SDNode<"X86ISD::UMIN",      SDTIntBinOp>;
+def X86umax    : SDNode<"X86ISD::UMAX",      SDTIntBinOp>;
+def X86smin    : SDNode<"X86ISD::SMIN",      SDTIntBinOp>;
+def X86smax    : SDNode<"X86ISD::SMAX",      SDTIntBinOp>;
+
+def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
+def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc    : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc    : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fandn   : SDNode<"X86ISD::FANDN",     SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
+def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
+def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
+def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
+def X86hsub    : SDNode<"X86ISD::HSUB",      SDTIntBinOp>;
+def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
+def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86cmps    : SDNode<"X86ISD::FSETCC",     SDTX86Cmps>;
+//def X86cmpsd   : SDNode<"X86ISD::FSETCCsd",    SDTX86Cmpsd>;
+def X86pshufb  : SDNode<"X86ISD::PSHUFB",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86andnp   : SDNode<"X86ISD::ANDNP",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86psign   : SDNode<"X86ISD::PSIGN",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86pextrb  : SDNode<"X86ISD::PEXTRB",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw  : SDNode<"X86ISD::PEXTRW",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb  : SDNode<"X86ISD::PINSRB",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw  : SDNode<"X86ISD::PINSRW",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insertps : SDNode<"X86ISD::INSERTPS",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext   : SDNode<"X86ISD::VZEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86vsext   : SDNode<"X86ISD::VSEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86vtrunc   : SDNode<"X86ISD::VTRUNC",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86trunc    : SDNode<"X86ISD::TRUNC",
+                         SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisOpSmallerThanOp<0, 1>]>>;
+
+def X86vtruncm   : SDNode<"X86ISD::VTRUNCM",
+                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisVec<2>, SDTCisInt<2>,
+                                              SDTCisOpSmallerThanOp<0, 2>]>>;
+def X86vfpext  : SDNode<"X86ISD::VFPEXT",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>,
+                                             SDTCisOpSmallerThanOp<1, 0>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>,
+                                             SDTCisOpSmallerThanOp<0, 1>]>>;
+
+def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
+def X86vshrdq  : SDNode<"X86ISD::VSRLDQ",    SDTIntShiftOp>;
+def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
+def X86pcmpeq  : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt  : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86IntCmpMask : SDTypeProfile<1, 2,
+    [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+def X86pcmpeqm  : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm  : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC :
+      SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<0>, SDTCisVec<1>,
+                           SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86CmpMaskCCScalar :
+      SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def X86cmpm   : SDNode<"X86ISD::CMPM",    X86CmpMaskCC>;
+def X86cmpmu  : SDNode<"X86ISD::CMPMU",   X86CmpMaskCC>;
+def X86cmpms  : SDNode<"X86ISD::FSETCC",  X86CmpMaskCCScalar>;
+
+def X86vshl    : SDNode<"X86ISD::VSHL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+def X86vsrl    : SDNode<"X86ISD::VSRL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+def X86vsra    : SDNode<"X86ISD::VSRA",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+
+def X86vshli   : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli   : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai   : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>;
+def X86subus   : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
+def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp   : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86testm   : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>>;
+def X86testnm  : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>>;
+def X86select  : SDNode<"X86ISD::SELECT"     , SDTSelect>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                      SDTCisSameAs<1,2>]>>;
+def X86pmuldq  : SDNode<"X86ISD::PMULDQ",
+                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                       SDTCisSameAs<1,2>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>]>;
+def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
+
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                 SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                 SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+
+def SDTVBroadcast  : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
+
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                             SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+
+def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+
+def X86Movddup  : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
+def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+
+def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
+def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86VPermilp  : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
+def X86VPermv    : SDNode<"X86ISD::VPERMV",   SDTShuff2Op>;
+def X86VPermi    : SDNode<"X86ISD::VPERMI",   SDTShuff2OpI>;
+def X86VPermv3   : SDNode<"X86ISD::VPERMV3",  SDTShuff3Op>;
+def X86VPermiv3   : SDNode<"X86ISD::VPERMIV3",  SDTShuff3Op>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert   : SDNode<"X86ISD::VINSERT",  SDTypeProfile<1, 3,
+                              [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
+def X86Vextract   : SDNode<"X86ISD::VEXTRACT",  SDTypeProfile<1, 2,
+                              [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>;
+
+def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
+def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
+def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
+def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
+def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
+def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+                                         SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+                                         SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+                                         SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+                                   SDNPWantRoot]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+                                   SDNPWantRoot]>;
+
+def ssmem : Operand<v4f32> {
+  let PrintMethod = "printf32mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86Mem32AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+def sdmem : Operand<v2f64> {
+  let PrintMethod = "printf64mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86Mem64AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+
+// 512-bit load pattern fragments
+def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+
+// Like 'store', but always requires 128-bit vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                           (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'X86vzload', but always requires 128-bit vector alignment.
+def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
+  return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr),
+                               (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr),
+                               (f64 (alignedload node:$ptr))>;
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+                               (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+                               (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+                               (v2i64 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+                               (v8f32 (alignedload256 node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+                               (v4f64 (alignedload256 node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+                               (v4i64 (alignedload256 node:$ptr))>;
+
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+                                (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+                                (v16i32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64  : PatFrag<(ops node:$ptr),
+                                (v8f64  (alignedload512 node:$ptr))>;
+def alignedloadv8i64  : PatFrag<(ops node:$ptr),
+                                (v8i64  (alignedload512 node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others.  If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+
+// 256-bit memop pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
+def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
+def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
+
+// 512-bit memop pattern fragments
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
+def memopv8f64  : PatFrag<(ops node:$ptr), (v8f64  (memop8 node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
+def memopv8i64  : PatFrag<(ops node:$ptr), (v8i64  (memop8 node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopmmx  : PatFrag<(ops node:$ptr), (x86mmx  (memop64 node:$ptr))>;
+
+// MOVNT Support
+// Like 'store', but requires the non-temporal bit to be set
+def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+                           (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isNonTemporal();
+  return false;
+}]>;
+
+def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isNonTemporal() && !ST->isTruncatingStore() &&
+           ST->getAddressingMode() == ISD::UNINDEXED &&
+           ST->getAlignment() >= 16;
+  return false;
+}]>;
+
+def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+                                      (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isNonTemporal() &&
+           ST->getAlignment() < 16;
+  return false;
+}]>;
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+
+// 512-bit bitconvert pattern fragments
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
+def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzmovl
+                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+                           (bitconvert (v4i32 (X86vzmovl
+                             (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def I8Imm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 8 bits.
+  return getI8Imm((uint8_t)N->getZExtValue());
+}]>;
+
+def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
+def FROUND_CURRENT : ImmLeaf<i32, [{ return Imm == 4; }]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm  : SDNodeXForm<imm, [{
+  // Transformation function: imm >> 3
+  return getI32Imm(N->getZExtValue() >> 3);
+}]>;
+
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACT128Immediate(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERT128Immediate(N));
+}]>;
+
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACT256Immediate(N));
+}]>;
+
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERT256Immediate(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+                                   (extract_subvector node:$bigvec,
+                                                      node:$index), [{
+  return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+                                      node:$index),
+                                 (insert_subvector node:$bigvec, node:$smallvec,
+                                                   node:$index), [{
+  return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
+                                   (extract_subvector node:$bigvec,
+                                                      node:$index), [{
+  return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
+
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+                                      node:$index),
+                                 (insert_subvector node:$bigvec, node:$smallvec,
+                                                   node:$index), [{
+  return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..0d3afc4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,5580 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <limits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "X86GenInstrInfo.inc"
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+         cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+                  cl::desc("Print instructions that the allocator wants to"
+                           " fuse, but the X86 backend currently can't"),
+                  cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+                 cl::desc("Re-materialize load from stub in PIC mode"),
+                 cl::init(false), cl::Hidden);
+
+enum {
+  // Select which memory operand is being unfolded.
+  // (stored in bits 0 - 3)
+  TB_INDEX_0    = 0,
+  TB_INDEX_1    = 1,
+  TB_INDEX_2    = 2,
+  TB_INDEX_3    = 3,
+  TB_INDEX_MASK = 0xf,
+
+  // Do not insert the reverse map (MemOp -> RegOp) into the table.
+  // This may be needed because there is a many -> one mapping.
+  TB_NO_REVERSE   = 1 << 4,
+
+  // Do not insert the forward map (RegOp -> MemOp) into the table.
+  // This is needed for Native Client, which prohibits branch
+  // instructions from using a memory operand.
+  TB_NO_FORWARD   = 1 << 5,
+
+  TB_FOLDED_LOAD  = 1 << 6,
+  TB_FOLDED_STORE = 1 << 7,
+
+  // Minimum alignment required for load/store.
+  // Used for RegOp->MemOp conversion.
+  // (stored in bits 8 - 15)
+  TB_ALIGN_SHIFT = 8,
+  TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
+  TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
+  TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
+  TB_ALIGN_64    =   64 << TB_ALIGN_SHIFT,
+  TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
+};
+
+struct X86OpTblEntry {
+  uint16_t RegOp;
+  uint16_t MemOp;
+  uint16_t Flags;
+};
+
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+    : X86GenInstrInfo(
+          (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
+          (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
+      Subtarget(STI), RI(STI) {
+
+  static const X86OpTblEntry OpTbl2Addr[] = {
+    { X86::ADC32ri,     X86::ADC32mi,    0 },
+    { X86::ADC32ri8,    X86::ADC32mi8,   0 },
+    { X86::ADC32rr,     X86::ADC32mr,    0 },
+    { X86::ADC64ri32,   X86::ADC64mi32,  0 },
+    { X86::ADC64ri8,    X86::ADC64mi8,   0 },
+    { X86::ADC64rr,     X86::ADC64mr,    0 },
+    { X86::ADD16ri,     X86::ADD16mi,    0 },
+    { X86::ADD16ri8,    X86::ADD16mi8,   0 },
+    { X86::ADD16ri_DB,  X86::ADD16mi,    TB_NO_REVERSE },
+    { X86::ADD16ri8_DB, X86::ADD16mi8,   TB_NO_REVERSE },
+    { X86::ADD16rr,     X86::ADD16mr,    0 },
+    { X86::ADD16rr_DB,  X86::ADD16mr,    TB_NO_REVERSE },
+    { X86::ADD32ri,     X86::ADD32mi,    0 },
+    { X86::ADD32ri8,    X86::ADD32mi8,   0 },
+    { X86::ADD32ri_DB,  X86::ADD32mi,    TB_NO_REVERSE },
+    { X86::ADD32ri8_DB, X86::ADD32mi8,   TB_NO_REVERSE },
+    { X86::ADD32rr,     X86::ADD32mr,    0 },
+    { X86::ADD32rr_DB,  X86::ADD32mr,    TB_NO_REVERSE },
+    { X86::ADD64ri32,   X86::ADD64mi32,  0 },
+    { X86::ADD64ri8,    X86::ADD64mi8,   0 },
+    { X86::ADD64ri32_DB,X86::ADD64mi32,  TB_NO_REVERSE },
+    { X86::ADD64ri8_DB, X86::ADD64mi8,   TB_NO_REVERSE },
+    { X86::ADD64rr,     X86::ADD64mr,    0 },
+    { X86::ADD64rr_DB,  X86::ADD64mr,    TB_NO_REVERSE },
+    { X86::ADD8ri,      X86::ADD8mi,     0 },
+    { X86::ADD8rr,      X86::ADD8mr,     0 },
+    { X86::AND16ri,     X86::AND16mi,    0 },
+    { X86::AND16ri8,    X86::AND16mi8,   0 },
+    { X86::AND16rr,     X86::AND16mr,    0 },
+    { X86::AND32ri,     X86::AND32mi,    0 },
+    { X86::AND32ri8,    X86::AND32mi8,   0 },
+    { X86::AND32rr,     X86::AND32mr,    0 },
+    { X86::AND64ri32,   X86::AND64mi32,  0 },
+    { X86::AND64ri8,    X86::AND64mi8,   0 },
+    { X86::AND64rr,     X86::AND64mr,    0 },
+    { X86::AND8ri,      X86::AND8mi,     0 },
+    { X86::AND8rr,      X86::AND8mr,     0 },
+    { X86::DEC16r,      X86::DEC16m,     0 },
+    { X86::DEC32r,      X86::DEC32m,     0 },
+    { X86::DEC64_16r,   X86::DEC64_16m,  0 },
+    { X86::DEC64_32r,   X86::DEC64_32m,  0 },
+    { X86::DEC64r,      X86::DEC64m,     0 },
+    { X86::DEC8r,       X86::DEC8m,      0 },
+    { X86::INC16r,      X86::INC16m,     0 },
+    { X86::INC32r,      X86::INC32m,     0 },
+    { X86::INC64_16r,   X86::INC64_16m,  0 },
+    { X86::INC64_32r,   X86::INC64_32m,  0 },
+    { X86::INC64r,      X86::INC64m,     0 },
+    { X86::INC8r,       X86::INC8m,      0 },
+    { X86::NEG16r,      X86::NEG16m,     0 },
+    { X86::NEG32r,      X86::NEG32m,     0 },
+    { X86::NEG64r,      X86::NEG64m,     0 },
+    { X86::NEG8r,       X86::NEG8m,      0 },
+    { X86::NOT16r,      X86::NOT16m,     0 },
+    { X86::NOT32r,      X86::NOT32m,     0 },
+    { X86::NOT64r,      X86::NOT64m,     0 },
+    { X86::NOT8r,       X86::NOT8m,      0 },
+    { X86::OR16ri,      X86::OR16mi,     0 },
+    { X86::OR16ri8,     X86::OR16mi8,    0 },
+    { X86::OR16rr,      X86::OR16mr,     0 },
+    { X86::OR32ri,      X86::OR32mi,     0 },
+    { X86::OR32ri8,     X86::OR32mi8,    0 },
+    { X86::OR32rr,      X86::OR32mr,     0 },
+    { X86::OR64ri32,    X86::OR64mi32,   0 },
+    { X86::OR64ri8,     X86::OR64mi8,    0 },
+    { X86::OR64rr,      X86::OR64mr,     0 },
+    { X86::OR8ri,       X86::OR8mi,      0 },
+    { X86::OR8rr,       X86::OR8mr,      0 },
+    { X86::ROL16r1,     X86::ROL16m1,    0 },
+    { X86::ROL16rCL,    X86::ROL16mCL,   0 },
+    { X86::ROL16ri,     X86::ROL16mi,    0 },
+    { X86::ROL32r1,     X86::ROL32m1,    0 },
+    { X86::ROL32rCL,    X86::ROL32mCL,   0 },
+    { X86::ROL32ri,     X86::ROL32mi,    0 },
+    { X86::ROL64r1,     X86::ROL64m1,    0 },
+    { X86::ROL64rCL,    X86::ROL64mCL,   0 },
+    { X86::ROL64ri,     X86::ROL64mi,    0 },
+    { X86::ROL8r1,      X86::ROL8m1,     0 },
+    { X86::ROL8rCL,     X86::ROL8mCL,    0 },
+    { X86::ROL8ri,      X86::ROL8mi,     0 },
+    { X86::ROR16r1,     X86::ROR16m1,    0 },
+    { X86::ROR16rCL,    X86::ROR16mCL,   0 },
+    { X86::ROR16ri,     X86::ROR16mi,    0 },
+    { X86::ROR32r1,     X86::ROR32m1,    0 },
+    { X86::ROR32rCL,    X86::ROR32mCL,   0 },
+    { X86::ROR32ri,     X86::ROR32mi,    0 },
+    { X86::ROR64r1,     X86::ROR64m1,    0 },
+    { X86::ROR64rCL,    X86::ROR64mCL,   0 },
+    { X86::ROR64ri,     X86::ROR64mi,    0 },
+    { X86::ROR8r1,      X86::ROR8m1,     0 },
+    { X86::ROR8rCL,     X86::ROR8mCL,    0 },
+    { X86::ROR8ri,      X86::ROR8mi,     0 },
+    { X86::SAR16r1,     X86::SAR16m1,    0 },
+    { X86::SAR16rCL,    X86::SAR16mCL,   0 },
+    { X86::SAR16ri,     X86::SAR16mi,    0 },
+    { X86::SAR32r1,     X86::SAR32m1,    0 },
+    { X86::SAR32rCL,    X86::SAR32mCL,   0 },
+    { X86::SAR32ri,     X86::SAR32mi,    0 },
+    { X86::SAR64r1,     X86::SAR64m1,    0 },
+    { X86::SAR64rCL,    X86::SAR64mCL,   0 },
+    { X86::SAR64ri,     X86::SAR64mi,    0 },
+    { X86::SAR8r1,      X86::SAR8m1,     0 },
+    { X86::SAR8rCL,     X86::SAR8mCL,    0 },
+    { X86::SAR8ri,      X86::SAR8mi,     0 },
+    { X86::SBB32ri,     X86::SBB32mi,    0 },
+    { X86::SBB32ri8,    X86::SBB32mi8,   0 },
+    { X86::SBB32rr,     X86::SBB32mr,    0 },
+    { X86::SBB64ri32,   X86::SBB64mi32,  0 },
+    { X86::SBB64ri8,    X86::SBB64mi8,   0 },
+    { X86::SBB64rr,     X86::SBB64mr,    0 },
+    { X86::SHL16rCL,    X86::SHL16mCL,   0 },
+    { X86::SHL16ri,     X86::SHL16mi,    0 },
+    { X86::SHL32rCL,    X86::SHL32mCL,   0 },
+    { X86::SHL32ri,     X86::SHL32mi,    0 },
+    { X86::SHL64rCL,    X86::SHL64mCL,   0 },
+    { X86::SHL64ri,     X86::SHL64mi,    0 },
+    { X86::SHL8rCL,     X86::SHL8mCL,    0 },
+    { X86::SHL8ri,      X86::SHL8mi,     0 },
+    { X86::SHLD16rrCL,  X86::SHLD16mrCL, 0 },
+    { X86::SHLD16rri8,  X86::SHLD16mri8, 0 },
+    { X86::SHLD32rrCL,  X86::SHLD32mrCL, 0 },
+    { X86::SHLD32rri8,  X86::SHLD32mri8, 0 },
+    { X86::SHLD64rrCL,  X86::SHLD64mrCL, 0 },
+    { X86::SHLD64rri8,  X86::SHLD64mri8, 0 },
+    { X86::SHR16r1,     X86::SHR16m1,    0 },
+    { X86::SHR16rCL,    X86::SHR16mCL,   0 },
+    { X86::SHR16ri,     X86::SHR16mi,    0 },
+    { X86::SHR32r1,     X86::SHR32m1,    0 },
+    { X86::SHR32rCL,    X86::SHR32mCL,   0 },
+    { X86::SHR32ri,     X86::SHR32mi,    0 },
+    { X86::SHR64r1,     X86::SHR64m1,    0 },
+    { X86::SHR64rCL,    X86::SHR64mCL,   0 },
+    { X86::SHR64ri,     X86::SHR64mi,    0 },
+    { X86::SHR8r1,      X86::SHR8m1,     0 },
+    { X86::SHR8rCL,     X86::SHR8mCL,    0 },
+    { X86::SHR8ri,      X86::SHR8mi,     0 },
+    { X86::SHRD16rrCL,  X86::SHRD16mrCL, 0 },
+    { X86::SHRD16rri8,  X86::SHRD16mri8, 0 },
+    { X86::SHRD32rrCL,  X86::SHRD32mrCL, 0 },
+    { X86::SHRD32rri8,  X86::SHRD32mri8, 0 },
+    { X86::SHRD64rrCL,  X86::SHRD64mrCL, 0 },
+    { X86::SHRD64rri8,  X86::SHRD64mri8, 0 },
+    { X86::SUB16ri,     X86::SUB16mi,    0 },
+    { X86::SUB16ri8,    X86::SUB16mi8,   0 },
+    { X86::SUB16rr,     X86::SUB16mr,    0 },
+    { X86::SUB32ri,     X86::SUB32mi,    0 },
+    { X86::SUB32ri8,    X86::SUB32mi8,   0 },
+    { X86::SUB32rr,     X86::SUB32mr,    0 },
+    { X86::SUB64ri32,   X86::SUB64mi32,  0 },
+    { X86::SUB64ri8,    X86::SUB64mi8,   0 },
+    { X86::SUB64rr,     X86::SUB64mr,    0 },
+    { X86::SUB8ri,      X86::SUB8mi,     0 },
+    { X86::SUB8rr,      X86::SUB8mr,     0 },
+    { X86::XOR16ri,     X86::XOR16mi,    0 },
+    { X86::XOR16ri8,    X86::XOR16mi8,   0 },
+    { X86::XOR16rr,     X86::XOR16mr,    0 },
+    { X86::XOR32ri,     X86::XOR32mi,    0 },
+    { X86::XOR32ri8,    X86::XOR32mi8,   0 },
+    { X86::XOR32rr,     X86::XOR32mr,    0 },
+    { X86::XOR64ri32,   X86::XOR64mi32,  0 },
+    { X86::XOR64ri8,    X86::XOR64mi8,   0 },
+    { X86::XOR64rr,     X86::XOR64mr,    0 },
+    { X86::XOR8ri,      X86::XOR8mi,     0 },
+    { X86::XOR8rr,      X86::XOR8mr,     0 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+    unsigned RegOp = OpTbl2Addr[i].RegOp;
+    unsigned MemOp = OpTbl2Addr[i].MemOp;
+    unsigned Flags = OpTbl2Addr[i].Flags;
+    AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 0, folded load and store, no alignment requirement.
+                  Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+  }
+
+  static const X86OpTblEntry OpTbl0[] = {
+    { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
+    { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
+    { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
+    { X86::CALL32r,     X86::CALL32m,       TB_FOLDED_LOAD },
+    { X86::CALL64r,     X86::CALL64m,       TB_FOLDED_LOAD },
+    { X86::CMP16ri,     X86::CMP16mi,       TB_FOLDED_LOAD },
+    { X86::CMP16ri8,    X86::CMP16mi8,      TB_FOLDED_LOAD },
+    { X86::CMP16rr,     X86::CMP16mr,       TB_FOLDED_LOAD },
+    { X86::CMP32ri,     X86::CMP32mi,       TB_FOLDED_LOAD },
+    { X86::CMP32ri8,    X86::CMP32mi8,      TB_FOLDED_LOAD },
+    { X86::CMP32rr,     X86::CMP32mr,       TB_FOLDED_LOAD },
+    { X86::CMP64ri32,   X86::CMP64mi32,     TB_FOLDED_LOAD },
+    { X86::CMP64ri8,    X86::CMP64mi8,      TB_FOLDED_LOAD },
+    { X86::CMP64rr,     X86::CMP64mr,       TB_FOLDED_LOAD },
+    { X86::CMP8ri,      X86::CMP8mi,        TB_FOLDED_LOAD },
+    { X86::CMP8rr,      X86::CMP8mr,        TB_FOLDED_LOAD },
+    { X86::DIV16r,      X86::DIV16m,        TB_FOLDED_LOAD },
+    { X86::DIV32r,      X86::DIV32m,        TB_FOLDED_LOAD },
+    { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
+    { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
+    { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE },
+    { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
+    { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
+    { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
+    { X86::IDIV8r,      X86::IDIV8m,        TB_FOLDED_LOAD },
+    { X86::IMUL16r,     X86::IMUL16m,       TB_FOLDED_LOAD },
+    { X86::IMUL32r,     X86::IMUL32m,       TB_FOLDED_LOAD },
+    { X86::IMUL64r,     X86::IMUL64m,       TB_FOLDED_LOAD },
+    { X86::IMUL8r,      X86::IMUL8m,        TB_FOLDED_LOAD },
+    { X86::JMP32r,      X86::JMP32m,        TB_FOLDED_LOAD },
+    { X86::JMP64r,      X86::JMP64m,        TB_FOLDED_LOAD },
+    { X86::MOV16ri,     X86::MOV16mi,       TB_FOLDED_STORE },
+    { X86::MOV16rr,     X86::MOV16mr,       TB_FOLDED_STORE },
+    { X86::MOV32ri,     X86::MOV32mi,       TB_FOLDED_STORE },
+    { X86::MOV32rr,     X86::MOV32mr,       TB_FOLDED_STORE },
+    { X86::MOV64ri32,   X86::MOV64mi32,     TB_FOLDED_STORE },
+    { X86::MOV64rr,     X86::MOV64mr,       TB_FOLDED_STORE },
+    { X86::MOV8ri,      X86::MOV8mi,        TB_FOLDED_STORE },
+    { X86::MOV8rr,      X86::MOV8mr,        TB_FOLDED_STORE },
+    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+    { X86::MOVAPDrr,    X86::MOVAPDmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVAPSrr,    X86::MOVAPSmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVDQArr,    X86::MOVDQAmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr,   TB_FOLDED_STORE },
+    { X86::MOVPQIto64rr,X86::MOVPQI2QImr,   TB_FOLDED_STORE },
+    { X86::MOVSDto64rr, X86::MOVSDto64mr,   TB_FOLDED_STORE },
+    { X86::MOVSS2DIrr,  X86::MOVSS2DImr,    TB_FOLDED_STORE },
+    { X86::MOVUPDrr,    X86::MOVUPDmr,      TB_FOLDED_STORE },
+    { X86::MOVUPSrr,    X86::MOVUPSmr,      TB_FOLDED_STORE },
+    { X86::MUL16r,      X86::MUL16m,        TB_FOLDED_LOAD },
+    { X86::MUL32r,      X86::MUL32m,        TB_FOLDED_LOAD },
+    { X86::MUL64r,      X86::MUL64m,        TB_FOLDED_LOAD },
+    { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
+    { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
+    { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
+    { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
+    { X86::SETBr,       X86::SETBm,         TB_FOLDED_STORE },
+    { X86::SETEr,       X86::SETEm,         TB_FOLDED_STORE },
+    { X86::SETGEr,      X86::SETGEm,        TB_FOLDED_STORE },
+    { X86::SETGr,       X86::SETGm,         TB_FOLDED_STORE },
+    { X86::SETLEr,      X86::SETLEm,        TB_FOLDED_STORE },
+    { X86::SETLr,       X86::SETLm,         TB_FOLDED_STORE },
+    { X86::SETNEr,      X86::SETNEm,        TB_FOLDED_STORE },
+    { X86::SETNOr,      X86::SETNOm,        TB_FOLDED_STORE },
+    { X86::SETNPr,      X86::SETNPm,        TB_FOLDED_STORE },
+    { X86::SETNSr,      X86::SETNSm,        TB_FOLDED_STORE },
+    { X86::SETOr,       X86::SETOm,         TB_FOLDED_STORE },
+    { X86::SETPr,       X86::SETPm,         TB_FOLDED_STORE },
+    { X86::SETSr,       X86::SETSm,         TB_FOLDED_STORE },
+    { X86::TAILJMPr,    X86::TAILJMPm,      TB_FOLDED_LOAD },
+    { X86::TAILJMPr64,  X86::TAILJMPm64,    TB_FOLDED_LOAD },
+    { X86::TEST16ri,    X86::TEST16mi,      TB_FOLDED_LOAD },
+    { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
+    { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
+    { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
+    // AVX 128-bit versions of foldable instructions
+    { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
+    { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQArr,   X86::VMOVDQAmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr,  TB_FOLDED_STORE },
+    { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+    { X86::VMOVSDto64rr,X86::VMOVSDto64mr,  TB_FOLDED_STORE },
+    { X86::VMOVSS2DIrr, X86::VMOVSS2DImr,   TB_FOLDED_STORE },
+    { X86::VMOVUPDrr,   X86::VMOVUPDmr,     TB_FOLDED_STORE },
+    { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
+    // AVX 256-bit foldable instructions
+    { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
+    // AVX-512 foldable instructions
+    { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr,  TB_FOLDED_STORE }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+    unsigned RegOp      = OpTbl0[i].RegOp;
+    unsigned MemOp      = OpTbl0[i].MemOp;
+    unsigned Flags      = OpTbl0[i].Flags;
+    AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+                  RegOp, MemOp, TB_INDEX_0 | Flags);
+  }
+
+  static const X86OpTblEntry OpTbl1[] = {
+    { X86::CMP16rr,         X86::CMP16rm,             0 },
+    { X86::CMP32rr,         X86::CMP32rm,             0 },
+    { X86::CMP64rr,         X86::CMP64rm,             0 },
+    { X86::CMP8rr,          X86::CMP8rm,              0 },
+    { X86::CVTSD2SSrr,      X86::CVTSD2SSrm,          0 },
+    { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm,        0 },
+    { X86::CVTSI2SDrr,      X86::CVTSI2SDrm,          0 },
+    { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm,        0 },
+    { X86::CVTSI2SSrr,      X86::CVTSI2SSrm,          0 },
+    { X86::CVTSS2SDrr,      X86::CVTSS2SDrm,          0 },
+    { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm,       0 },
+    { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
+    { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
+    { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
+    { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
+    { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
+    { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
+    { X86::IMUL32rri8,      X86::IMUL32rmi8,          0 },
+    { X86::IMUL64rri32,     X86::IMUL64rmi32,         0 },
+    { X86::IMUL64rri8,      X86::IMUL64rmi8,          0 },
+    { X86::Int_COMISDrr,    X86::Int_COMISDrm,        0 },
+    { X86::Int_COMISSrr,    X86::Int_COMISSrm,        0 },
+    { X86::CVTSD2SI64rr,    X86::CVTSD2SI64rm,        0 },
+    { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
+    { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
+    { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
+    { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
+    { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
+    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
+    { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm,     0 },
+    { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm,  0 },
+    { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm,     0 },
+    { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm,       0 },
+    { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm,       0 },
+    { X86::MOV16rr,         X86::MOV16rm,             0 },
+    { X86::MOV32rr,         X86::MOV32rm,             0 },
+    { X86::MOV64rr,         X86::MOV64rm,             0 },
+    { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm,         0 },
+    { X86::MOV64toSDrr,     X86::MOV64toSDrm,         0 },
+    { X86::MOV8rr,          X86::MOV8rm,              0 },
+    { X86::MOVAPDrr,        X86::MOVAPDrm,            TB_ALIGN_16 },
+    { X86::MOVAPSrr,        X86::MOVAPSrm,            TB_ALIGN_16 },
+    { X86::MOVDDUPrr,       X86::MOVDDUPrm,           0 },
+    { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm,         0 },
+    { X86::MOVDI2SSrr,      X86::MOVDI2SSrm,          0 },
+    { X86::MOVDQArr,        X86::MOVDQArm,            TB_ALIGN_16 },
+    { X86::MOVSHDUPrr,      X86::MOVSHDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSLDUPrr,      X86::MOVSLDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSX16rr8,      X86::MOVSX16rm8,          0 },
+    { X86::MOVSX32rr16,     X86::MOVSX32rm16,         0 },
+    { X86::MOVSX32rr8,      X86::MOVSX32rm8,          0 },
+    { X86::MOVSX64rr16,     X86::MOVSX64rm16,         0 },
+    { X86::MOVSX64rr32,     X86::MOVSX64rm32,         0 },
+    { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
+    { X86::MOVUPDrr,        X86::MOVUPDrm,            TB_ALIGN_16 },
+    { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
+    { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm,        0 },
+    { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm,     TB_ALIGN_16 },
+    { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
+    { X86::MOVZX32rr16,     X86::MOVZX32rm16,         0 },
+    { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8,   0 },
+    { X86::MOVZX32rr8,      X86::MOVZX32rm8,          0 },
+    { X86::PABSBrr128,      X86::PABSBrm128,          TB_ALIGN_16 },
+    { X86::PABSDrr128,      X86::PABSDrm128,          TB_ALIGN_16 },
+    { X86::PABSWrr128,      X86::PABSWrm128,          TB_ALIGN_16 },
+    { X86::PSHUFDri,        X86::PSHUFDmi,            TB_ALIGN_16 },
+    { X86::PSHUFHWri,       X86::PSHUFHWmi,           TB_ALIGN_16 },
+    { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
+    { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
+    { X86::RCPPSr_Int,      X86::RCPPSm_Int,          TB_ALIGN_16 },
+    { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
+    { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int,        TB_ALIGN_16 },
+    { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
+    { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        0 },
+    { X86::SQRTPDr,         X86::SQRTPDm,             TB_ALIGN_16 },
+    { X86::SQRTPSr,         X86::SQRTPSm,             TB_ALIGN_16 },
+    { X86::SQRTSDr,         X86::SQRTSDm,             0 },
+    { X86::SQRTSDr_Int,     X86::SQRTSDm_Int,         0 },
+    { X86::SQRTSSr,         X86::SQRTSSm,             0 },
+    { X86::SQRTSSr_Int,     X86::SQRTSSm_Int,         0 },
+    { X86::TEST16rr,        X86::TEST16rm,            0 },
+    { X86::TEST32rr,        X86::TEST32rm,            0 },
+    { X86::TEST64rr,        X86::TEST64rm,            0 },
+    { X86::TEST8rr,         X86::TEST8rm,             0 },
+    // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+    { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
+    { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
+    // AVX 128-bit versions of foldable instructions
+    { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       0 },
+    { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       0 },
+    { X86::Int_VUCOMISDrr,  X86::Int_VUCOMISDrm,      0 },
+    { X86::Int_VUCOMISSrr,  X86::Int_VUCOMISSrm,      0 },
+    { X86::VCVTTSD2SI64rr,  X86::VCVTTSD2SI64rm,      0 },
+    { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+    { X86::VCVTTSD2SIrr,    X86::VCVTTSD2SIrm,        0 },
+    { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm,    0 },
+    { X86::VCVTTSS2SI64rr,  X86::VCVTTSS2SI64rm,      0 },
+    { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+    { X86::VCVTTSS2SIrr,    X86::VCVTTSS2SIrm,        0 },
+    { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm,    0 },
+    { X86::VCVTSD2SI64rr,   X86::VCVTSD2SI64rm,       0 },
+    { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
+    { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
+    { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
+    { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
+    { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
+    { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
+    { X86::VMOVAPSrr,       X86::VMOVAPSrm,           TB_ALIGN_16 },
+    { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,          0 },
+    { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,        0 },
+    { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,         0 },
+    { X86::VMOVDQArr,       X86::VMOVDQArm,           TB_ALIGN_16 },
+    { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
+    { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
+    { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
+    { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm,    TB_ALIGN_16 },
+    { X86::VPABSBrr128,     X86::VPABSBrm128,         0 },
+    { X86::VPABSDrr128,     X86::VPABSDrm128,         0 },
+    { X86::VPABSWrr128,     X86::VPABSWrm128,         0 },
+    { X86::VPERMILPDri,     X86::VPERMILPDmi,         0 },
+    { X86::VPERMILPSri,     X86::VPERMILPSmi,         0 },
+    { X86::VPSHUFDri,       X86::VPSHUFDmi,           0 },
+    { X86::VPSHUFHWri,      X86::VPSHUFHWmi,          0 },
+    { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          0 },
+    { X86::VRCPPSr,         X86::VRCPPSm,             0 },
+    { X86::VRCPPSr_Int,     X86::VRCPPSm_Int,         0 },
+    { X86::VRSQRTPSr,       X86::VRSQRTPSm,           0 },
+    { X86::VRSQRTPSr_Int,   X86::VRSQRTPSm_Int,       0 },
+    { X86::VSQRTPDr,        X86::VSQRTPDm,            0 },
+    { X86::VSQRTPSr,        X86::VSQRTPSm,            0 },
+    { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
+    { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
+    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
+
+    // AVX 256-bit foldable instructions
+    { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_32 },
+    { X86::VMOVUPDYrr,      X86::VMOVUPDYrm,          0 },
+    { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
+    { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
+    { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+
+    // AVX2 foldable instructions
+    { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
+    { X86::VPABSDrr256,     X86::VPABSDrm256,         0 },
+    { X86::VPABSWrr256,     X86::VPABSWrm256,         0 },
+    { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
+    { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
+    { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
+    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
+    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
+    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
+    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
+
+    // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
+    { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
+    { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
+    { X86::BEXTRI32ri,      X86::BEXTRI32mi,          0 },
+    { X86::BEXTRI64ri,      X86::BEXTRI64mi,          0 },
+    { X86::BLCFILL32rr,     X86::BLCFILL32rm,         0 },
+    { X86::BLCFILL64rr,     X86::BLCFILL64rm,         0 },
+    { X86::BLCI32rr,        X86::BLCI32rm,            0 },
+    { X86::BLCI64rr,        X86::BLCI64rm,            0 },
+    { X86::BLCIC32rr,       X86::BLCIC32rm,           0 },
+    { X86::BLCIC64rr,       X86::BLCIC64rm,           0 },
+    { X86::BLCMSK32rr,      X86::BLCMSK32rm,          0 },
+    { X86::BLCMSK64rr,      X86::BLCMSK64rm,          0 },
+    { X86::BLCS32rr,        X86::BLCS32rm,            0 },
+    { X86::BLCS64rr,        X86::BLCS64rm,            0 },
+    { X86::BLSFILL32rr,     X86::BLSFILL32rm,         0 },
+    { X86::BLSFILL64rr,     X86::BLSFILL64rm,         0 },
+    { X86::BLSI32rr,        X86::BLSI32rm,            0 },
+    { X86::BLSI64rr,        X86::BLSI64rm,            0 },
+    { X86::BLSIC32rr,       X86::BLSIC32rm,           0 },
+    { X86::BLSIC64rr,       X86::BLSIC64rm,           0 },
+    { X86::BLSMSK32rr,      X86::BLSMSK32rm,          0 },
+    { X86::BLSMSK64rr,      X86::BLSMSK64rm,          0 },
+    { X86::BLSR32rr,        X86::BLSR32rm,            0 },
+    { X86::BLSR64rr,        X86::BLSR64rm,            0 },
+    { X86::BZHI32rr,        X86::BZHI32rm,            0 },
+    { X86::BZHI64rr,        X86::BZHI64rm,            0 },
+    { X86::LZCNT16rr,       X86::LZCNT16rm,           0 },
+    { X86::LZCNT32rr,       X86::LZCNT32rm,           0 },
+    { X86::LZCNT64rr,       X86::LZCNT64rm,           0 },
+    { X86::POPCNT16rr,      X86::POPCNT16rm,          0 },
+    { X86::POPCNT32rr,      X86::POPCNT32rm,          0 },
+    { X86::POPCNT64rr,      X86::POPCNT64rm,          0 },
+    { X86::RORX32ri,        X86::RORX32mi,            0 },
+    { X86::RORX64ri,        X86::RORX64mi,            0 },
+    { X86::SARX32rr,        X86::SARX32rm,            0 },
+    { X86::SARX64rr,        X86::SARX64rm,            0 },
+    { X86::SHRX32rr,        X86::SHRX32rm,            0 },
+    { X86::SHRX64rr,        X86::SHRX64rm,            0 },
+    { X86::SHLX32rr,        X86::SHLX32rm,            0 },
+    { X86::SHLX64rr,        X86::SHLX64rm,            0 },
+    { X86::T1MSKC32rr,      X86::T1MSKC32rm,          0 },
+    { X86::T1MSKC64rr,      X86::T1MSKC64rm,          0 },
+    { X86::TZCNT16rr,       X86::TZCNT16rm,           0 },
+    { X86::TZCNT32rr,       X86::TZCNT32rm,           0 },
+    { X86::TZCNT64rr,       X86::TZCNT64rm,           0 },
+    { X86::TZMSK32rr,       X86::TZMSK32rm,           0 },
+    { X86::TZMSK64rr,       X86::TZMSK64rm,           0 },
+
+    // AVX-512 foldable instructions
+    { X86::VMOV64toPQIZrr,  X86::VMOVQI2PQIZrm,       0 },
+    { X86::VMOVDI2SSZrr,    X86::VMOVDI2SSZrm,        0 },
+    { X86::VMOVDQA32rr,     X86::VMOVDQA32rm,         TB_ALIGN_64 },
+    { X86::VMOVDQA64rr,     X86::VMOVDQA64rm,         TB_ALIGN_64 },
+    { X86::VMOVDQU32rr,     X86::VMOVDQU32rm,         0 },
+    { X86::VMOVDQU64rr,     X86::VMOVDQU64rm,         0 },
+    { X86::VPABSDZrr,       X86::VPABSDZrm,           0 },
+    { X86::VPABSQZrr,       X86::VPABSQZrm,           0 },
+
+    // AES foldable instructions
+    { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
+    { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
+    { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 },
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+    unsigned RegOp = OpTbl1[i].RegOp;
+    unsigned MemOp = OpTbl1[i].MemOp;
+    unsigned Flags = OpTbl1[i].Flags;
+    AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 1, folded load
+                  Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+  }
+
+  static const X86OpTblEntry OpTbl2[] = {
+    { X86::ADC32rr,         X86::ADC32rm,       0 },
+    { X86::ADC64rr,         X86::ADC64rm,       0 },
+    { X86::ADD16rr,         X86::ADD16rm,       0 },
+    { X86::ADD16rr_DB,      X86::ADD16rm,       TB_NO_REVERSE },
+    { X86::ADD32rr,         X86::ADD32rm,       0 },
+    { X86::ADD32rr_DB,      X86::ADD32rm,       TB_NO_REVERSE },
+    { X86::ADD64rr,         X86::ADD64rm,       0 },
+    { X86::ADD64rr_DB,      X86::ADD64rm,       TB_NO_REVERSE },
+    { X86::ADD8rr,          X86::ADD8rm,        0 },
+    { X86::ADDPDrr,         X86::ADDPDrm,       TB_ALIGN_16 },
+    { X86::ADDPSrr,         X86::ADDPSrm,       TB_ALIGN_16 },
+    { X86::ADDSDrr,         X86::ADDSDrm,       0 },
+    { X86::ADDSSrr,         X86::ADDSSrm,       0 },
+    { X86::ADDSUBPDrr,      X86::ADDSUBPDrm,    TB_ALIGN_16 },
+    { X86::ADDSUBPSrr,      X86::ADDSUBPSrm,    TB_ALIGN_16 },
+    { X86::AND16rr,         X86::AND16rm,       0 },
+    { X86::AND32rr,         X86::AND32rm,       0 },
+    { X86::AND64rr,         X86::AND64rm,       0 },
+    { X86::AND8rr,          X86::AND8rm,        0 },
+    { X86::ANDNPDrr,        X86::ANDNPDrm,      TB_ALIGN_16 },
+    { X86::ANDNPSrr,        X86::ANDNPSrm,      TB_ALIGN_16 },
+    { X86::ANDPDrr,         X86::ANDPDrm,       TB_ALIGN_16 },
+    { X86::ANDPSrr,         X86::ANDPSrm,       TB_ALIGN_16 },
+    { X86::BLENDPDrri,      X86::BLENDPDrmi,    TB_ALIGN_16 },
+    { X86::BLENDPSrri,      X86::BLENDPSrmi,    TB_ALIGN_16 },
+    { X86::BLENDVPDrr0,     X86::BLENDVPDrm0,   TB_ALIGN_16 },
+    { X86::BLENDVPSrr0,     X86::BLENDVPSrm0,   TB_ALIGN_16 },
+    { X86::CMOVA16rr,       X86::CMOVA16rm,     0 },
+    { X86::CMOVA32rr,       X86::CMOVA32rm,     0 },
+    { X86::CMOVA64rr,       X86::CMOVA64rm,     0 },
+    { X86::CMOVAE16rr,      X86::CMOVAE16rm,    0 },
+    { X86::CMOVAE32rr,      X86::CMOVAE32rm,    0 },
+    { X86::CMOVAE64rr,      X86::CMOVAE64rm,    0 },
+    { X86::CMOVB16rr,       X86::CMOVB16rm,     0 },
+    { X86::CMOVB32rr,       X86::CMOVB32rm,     0 },
+    { X86::CMOVB64rr,       X86::CMOVB64rm,     0 },
+    { X86::CMOVBE16rr,      X86::CMOVBE16rm,    0 },
+    { X86::CMOVBE32rr,      X86::CMOVBE32rm,    0 },
+    { X86::CMOVBE64rr,      X86::CMOVBE64rm,    0 },
+    { X86::CMOVE16rr,       X86::CMOVE16rm,     0 },
+    { X86::CMOVE32rr,       X86::CMOVE32rm,     0 },
+    { X86::CMOVE64rr,       X86::CMOVE64rm,     0 },
+    { X86::CMOVG16rr,       X86::CMOVG16rm,     0 },
+    { X86::CMOVG32rr,       X86::CMOVG32rm,     0 },
+    { X86::CMOVG64rr,       X86::CMOVG64rm,     0 },
+    { X86::CMOVGE16rr,      X86::CMOVGE16rm,    0 },
+    { X86::CMOVGE32rr,      X86::CMOVGE32rm,    0 },
+    { X86::CMOVGE64rr,      X86::CMOVGE64rm,    0 },
+    { X86::CMOVL16rr,       X86::CMOVL16rm,     0 },
+    { X86::CMOVL32rr,       X86::CMOVL32rm,     0 },
+    { X86::CMOVL64rr,       X86::CMOVL64rm,     0 },
+    { X86::CMOVLE16rr,      X86::CMOVLE16rm,    0 },
+    { X86::CMOVLE32rr,      X86::CMOVLE32rm,    0 },
+    { X86::CMOVLE64rr,      X86::CMOVLE64rm,    0 },
+    { X86::CMOVNE16rr,      X86::CMOVNE16rm,    0 },
+    { X86::CMOVNE32rr,      X86::CMOVNE32rm,    0 },
+    { X86::CMOVNE64rr,      X86::CMOVNE64rm,    0 },
+    { X86::CMOVNO16rr,      X86::CMOVNO16rm,    0 },
+    { X86::CMOVNO32rr,      X86::CMOVNO32rm,    0 },
+    { X86::CMOVNO64rr,      X86::CMOVNO64rm,    0 },
+    { X86::CMOVNP16rr,      X86::CMOVNP16rm,    0 },
+    { X86::CMOVNP32rr,      X86::CMOVNP32rm,    0 },
+    { X86::CMOVNP64rr,      X86::CMOVNP64rm,    0 },
+    { X86::CMOVNS16rr,      X86::CMOVNS16rm,    0 },
+    { X86::CMOVNS32rr,      X86::CMOVNS32rm,    0 },
+    { X86::CMOVNS64rr,      X86::CMOVNS64rm,    0 },
+    { X86::CMOVO16rr,       X86::CMOVO16rm,     0 },
+    { X86::CMOVO32rr,       X86::CMOVO32rm,     0 },
+    { X86::CMOVO64rr,       X86::CMOVO64rm,     0 },
+    { X86::CMOVP16rr,       X86::CMOVP16rm,     0 },
+    { X86::CMOVP32rr,       X86::CMOVP32rm,     0 },
+    { X86::CMOVP64rr,       X86::CMOVP64rm,     0 },
+    { X86::CMOVS16rr,       X86::CMOVS16rm,     0 },
+    { X86::CMOVS32rr,       X86::CMOVS32rm,     0 },
+    { X86::CMOVS64rr,       X86::CMOVS64rm,     0 },
+    { X86::CMPPDrri,        X86::CMPPDrmi,      TB_ALIGN_16 },
+    { X86::CMPPSrri,        X86::CMPPSrmi,      TB_ALIGN_16 },
+    { X86::CMPSDrr,         X86::CMPSDrm,       0 },
+    { X86::CMPSSrr,         X86::CMPSSrm,       0 },
+    { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
+    { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
+    { X86::DIVSDrr,         X86::DIVSDrm,       0 },
+    { X86::DIVSSrr,         X86::DIVSSrm,       0 },
+    { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
+    { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
+    { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
+    { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
+    { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
+    { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
+    { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
+    { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+    { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
+    { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
+    { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
+    { X86::HSUBPSrr,        X86::HSUBPSrm,      TB_ALIGN_16 },
+    { X86::IMUL16rr,        X86::IMUL16rm,      0 },
+    { X86::IMUL32rr,        X86::IMUL32rm,      0 },
+    { X86::IMUL64rr,        X86::IMUL64rm,      0 },
+    { X86::Int_CMPSDrr,     X86::Int_CMPSDrm,   0 },
+    { X86::Int_CMPSSrr,     X86::Int_CMPSSrm,   0 },
+    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      0 },
+    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
+    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
+    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
+    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
+    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      0 },
+    { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
+    { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
+    { X86::MAXSDrr,         X86::MAXSDrm,       0 },
+    { X86::MAXSSrr,         X86::MAXSSrm,       0 },
+    { X86::MINPDrr,         X86::MINPDrm,       TB_ALIGN_16 },
+    { X86::MINPSrr,         X86::MINPSrm,       TB_ALIGN_16 },
+    { X86::MINSDrr,         X86::MINSDrm,       0 },
+    { X86::MINSSrr,         X86::MINSSrm,       0 },
+    { X86::MPSADBWrri,      X86::MPSADBWrmi,    TB_ALIGN_16 },
+    { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
+    { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
+    { X86::MULSDrr,         X86::MULSDrm,       0 },
+    { X86::MULSSrr,         X86::MULSSrm,       0 },
+    { X86::OR16rr,          X86::OR16rm,        0 },
+    { X86::OR32rr,          X86::OR32rm,        0 },
+    { X86::OR64rr,          X86::OR64rm,        0 },
+    { X86::OR8rr,           X86::OR8rm,         0 },
+    { X86::ORPDrr,          X86::ORPDrm,        TB_ALIGN_16 },
+    { X86::ORPSrr,          X86::ORPSrm,        TB_ALIGN_16 },
+    { X86::PACKSSDWrr,      X86::PACKSSDWrm,    TB_ALIGN_16 },
+    { X86::PACKSSWBrr,      X86::PACKSSWBrm,    TB_ALIGN_16 },
+    { X86::PACKUSDWrr,      X86::PACKUSDWrm,    TB_ALIGN_16 },
+    { X86::PACKUSWBrr,      X86::PACKUSWBrm,    TB_ALIGN_16 },
+    { X86::PADDBrr,         X86::PADDBrm,       TB_ALIGN_16 },
+    { X86::PADDDrr,         X86::PADDDrm,       TB_ALIGN_16 },
+    { X86::PADDQrr,         X86::PADDQrm,       TB_ALIGN_16 },
+    { X86::PADDSBrr,        X86::PADDSBrm,      TB_ALIGN_16 },
+    { X86::PADDSWrr,        X86::PADDSWrm,      TB_ALIGN_16 },
+    { X86::PADDUSBrr,       X86::PADDUSBrm,     TB_ALIGN_16 },
+    { X86::PADDUSWrr,       X86::PADDUSWrm,     TB_ALIGN_16 },
+    { X86::PADDWrr,         X86::PADDWrm,       TB_ALIGN_16 },
+    { X86::PALIGNR128rr,    X86::PALIGNR128rm,  TB_ALIGN_16 },
+    { X86::PANDNrr,         X86::PANDNrm,       TB_ALIGN_16 },
+    { X86::PANDrr,          X86::PANDrm,        TB_ALIGN_16 },
+    { X86::PAVGBrr,         X86::PAVGBrm,       TB_ALIGN_16 },
+    { X86::PAVGWrr,         X86::PAVGWrm,       TB_ALIGN_16 },
+    { X86::PBLENDWrri,      X86::PBLENDWrmi,    TB_ALIGN_16 },
+    { X86::PCMPEQBrr,       X86::PCMPEQBrm,     TB_ALIGN_16 },
+    { X86::PCMPEQDrr,       X86::PCMPEQDrm,     TB_ALIGN_16 },
+    { X86::PCMPEQQrr,       X86::PCMPEQQrm,     TB_ALIGN_16 },
+    { X86::PCMPEQWrr,       X86::PCMPEQWrm,     TB_ALIGN_16 },
+    { X86::PCMPGTBrr,       X86::PCMPGTBrm,     TB_ALIGN_16 },
+    { X86::PCMPGTDrr,       X86::PCMPGTDrm,     TB_ALIGN_16 },
+    { X86::PCMPGTQrr,       X86::PCMPGTQrm,     TB_ALIGN_16 },
+    { X86::PCMPGTWrr,       X86::PCMPGTWrm,     TB_ALIGN_16 },
+    { X86::PHADDDrr,        X86::PHADDDrm,      TB_ALIGN_16 },
+    { X86::PHADDWrr,        X86::PHADDWrm,      TB_ALIGN_16 },
+    { X86::PHADDSWrr128,    X86::PHADDSWrm128,  TB_ALIGN_16 },
+    { X86::PHSUBDrr,        X86::PHSUBDrm,      TB_ALIGN_16 },
+    { X86::PHSUBSWrr128,    X86::PHSUBSWrm128,  TB_ALIGN_16 },
+    { X86::PHSUBWrr,        X86::PHSUBWrm,      TB_ALIGN_16 },
+    { X86::PINSRWrri,       X86::PINSRWrmi,     TB_ALIGN_16 },
+    { X86::PMADDUBSWrr128,  X86::PMADDUBSWrm128, TB_ALIGN_16 },
+    { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
+    { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
+    { X86::PMAXUBrr,        X86::PMAXUBrm,      TB_ALIGN_16 },
+    { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
+    { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
+    { X86::PMINSBrr,        X86::PMINSBrm,      TB_ALIGN_16 },
+    { X86::PMINSDrr,        X86::PMINSDrm,      TB_ALIGN_16 },
+    { X86::PMINUDrr,        X86::PMINUDrm,      TB_ALIGN_16 },
+    { X86::PMINUWrr,        X86::PMINUWrm,      TB_ALIGN_16 },
+    { X86::PMAXSBrr,        X86::PMAXSBrm,      TB_ALIGN_16 },
+    { X86::PMAXSDrr,        X86::PMAXSDrm,      TB_ALIGN_16 },
+    { X86::PMAXUDrr,        X86::PMAXUDrm,      TB_ALIGN_16 },
+    { X86::PMAXUWrr,        X86::PMAXUWrm,      TB_ALIGN_16 },
+    { X86::PMULDQrr,        X86::PMULDQrm,      TB_ALIGN_16 },
+    { X86::PMULHRSWrr128,   X86::PMULHRSWrm128, TB_ALIGN_16 },
+    { X86::PMULHUWrr,       X86::PMULHUWrm,     TB_ALIGN_16 },
+    { X86::PMULHWrr,        X86::PMULHWrm,      TB_ALIGN_16 },
+    { X86::PMULLDrr,        X86::PMULLDrm,      TB_ALIGN_16 },
+    { X86::PMULLWrr,        X86::PMULLWrm,      TB_ALIGN_16 },
+    { X86::PMULUDQrr,       X86::PMULUDQrm,     TB_ALIGN_16 },
+    { X86::PORrr,           X86::PORrm,         TB_ALIGN_16 },
+    { X86::PSADBWrr,        X86::PSADBWrm,      TB_ALIGN_16 },
+    { X86::PSHUFBrr,        X86::PSHUFBrm,      TB_ALIGN_16 },
+    { X86::PSIGNBrr,        X86::PSIGNBrm,      TB_ALIGN_16 },
+    { X86::PSIGNWrr,        X86::PSIGNWrm,      TB_ALIGN_16 },
+    { X86::PSIGNDrr,        X86::PSIGNDrm,      TB_ALIGN_16 },
+    { X86::PSLLDrr,         X86::PSLLDrm,       TB_ALIGN_16 },
+    { X86::PSLLQrr,         X86::PSLLQrm,       TB_ALIGN_16 },
+    { X86::PSLLWrr,         X86::PSLLWrm,       TB_ALIGN_16 },
+    { X86::PSRADrr,         X86::PSRADrm,       TB_ALIGN_16 },
+    { X86::PSRAWrr,         X86::PSRAWrm,       TB_ALIGN_16 },
+    { X86::PSRLDrr,         X86::PSRLDrm,       TB_ALIGN_16 },
+    { X86::PSRLQrr,         X86::PSRLQrm,       TB_ALIGN_16 },
+    { X86::PSRLWrr,         X86::PSRLWrm,       TB_ALIGN_16 },
+    { X86::PSUBBrr,         X86::PSUBBrm,       TB_ALIGN_16 },
+    { X86::PSUBDrr,         X86::PSUBDrm,       TB_ALIGN_16 },
+    { X86::PSUBSBrr,        X86::PSUBSBrm,      TB_ALIGN_16 },
+    { X86::PSUBSWrr,        X86::PSUBSWrm,      TB_ALIGN_16 },
+    { X86::PSUBWrr,         X86::PSUBWrm,       TB_ALIGN_16 },
+    { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
+    { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
+    { X86::SBB32rr,         X86::SBB32rm,       0 },
+    { X86::SBB64rr,         X86::SBB64rm,       0 },
+    { X86::SHUFPDrri,       X86::SHUFPDrmi,     TB_ALIGN_16 },
+    { X86::SHUFPSrri,       X86::SHUFPSrmi,     TB_ALIGN_16 },
+    { X86::SUB16rr,         X86::SUB16rm,       0 },
+    { X86::SUB32rr,         X86::SUB32rm,       0 },
+    { X86::SUB64rr,         X86::SUB64rm,       0 },
+    { X86::SUB8rr,          X86::SUB8rm,        0 },
+    { X86::SUBPDrr,         X86::SUBPDrm,       TB_ALIGN_16 },
+    { X86::SUBPSrr,         X86::SUBPSrm,       TB_ALIGN_16 },
+    { X86::SUBSDrr,         X86::SUBSDrm,       0 },
+    { X86::SUBSSrr,         X86::SUBSSrm,       0 },
+    // FIXME: TEST*rr -> swapped operand of TEST*mr.
+    { X86::UNPCKHPDrr,      X86::UNPCKHPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKHPSrr,      X86::UNPCKHPSrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPDrr,      X86::UNPCKLPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPSrr,      X86::UNPCKLPSrm,    TB_ALIGN_16 },
+    { X86::XOR16rr,         X86::XOR16rm,       0 },
+    { X86::XOR32rr,         X86::XOR32rm,       0 },
+    { X86::XOR64rr,         X86::XOR64rm,       0 },
+    { X86::XOR8rr,          X86::XOR8rm,        0 },
+    { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
+    { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
+    // AVX 128-bit versions of foldable instructions
+    { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
+    { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    0 },
+    { X86::VCVTSI2SD64rr,     X86::VCVTSI2SD64rm,      0 },
+    { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm,  0 },
+    { X86::VCVTSI2SDrr,       X86::VCVTSI2SDrm,        0 },
+    { X86::Int_VCVTSI2SDrr,   X86::Int_VCVTSI2SDrm,    0 },
+    { X86::VCVTSI2SS64rr,     X86::VCVTSI2SS64rm,      0 },
+    { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm,  0 },
+    { X86::VCVTSI2SSrr,       X86::VCVTSI2SSrm,        0 },
+    { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
+    { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
+    { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
+    { X86::VCVTTPD2DQrr,      X86::VCVTTPD2DQXrm,      0 },
+    { X86::VCVTTPS2DQrr,      X86::VCVTTPS2DQrm,       0 },
+    { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
+    { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
+    { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
+    { X86::VADDPDrr,          X86::VADDPDrm,           0 },
+    { X86::VADDPSrr,          X86::VADDPSrm,           0 },
+    { X86::VADDSDrr,          X86::VADDSDrm,           0 },
+    { X86::VADDSSrr,          X86::VADDSSrm,           0 },
+    { X86::VADDSUBPDrr,       X86::VADDSUBPDrm,        0 },
+    { X86::VADDSUBPSrr,       X86::VADDSUBPSrm,        0 },
+    { X86::VANDNPDrr,         X86::VANDNPDrm,          0 },
+    { X86::VANDNPSrr,         X86::VANDNPSrm,          0 },
+    { X86::VANDPDrr,          X86::VANDPDrm,           0 },
+    { X86::VANDPSrr,          X86::VANDPSrm,           0 },
+    { X86::VBLENDPDrri,       X86::VBLENDPDrmi,        0 },
+    { X86::VBLENDPSrri,       X86::VBLENDPSrmi,        0 },
+    { X86::VBLENDVPDrr,       X86::VBLENDVPDrm,        0 },
+    { X86::VBLENDVPSrr,       X86::VBLENDVPSrm,        0 },
+    { X86::VCMPPDrri,         X86::VCMPPDrmi,          0 },
+    { X86::VCMPPSrri,         X86::VCMPPSrmi,          0 },
+    { X86::VCMPSDrr,          X86::VCMPSDrm,           0 },
+    { X86::VCMPSSrr,          X86::VCMPSSrm,           0 },
+    { X86::VDIVPDrr,          X86::VDIVPDrm,           0 },
+    { X86::VDIVPSrr,          X86::VDIVPSrm,           0 },
+    { X86::VDIVSDrr,          X86::VDIVSDrm,           0 },
+    { X86::VDIVSSrr,          X86::VDIVSSrm,           0 },
+    { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        TB_ALIGN_16 },
+    { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        TB_ALIGN_16 },
+    { X86::VFsANDPDrr,        X86::VFsANDPDrm,         TB_ALIGN_16 },
+    { X86::VFsANDPSrr,        X86::VFsANDPSrm,         TB_ALIGN_16 },
+    { X86::VFsORPDrr,         X86::VFsORPDrm,          TB_ALIGN_16 },
+    { X86::VFsORPSrr,         X86::VFsORPSrm,          TB_ALIGN_16 },
+    { X86::VFsXORPDrr,        X86::VFsXORPDrm,         TB_ALIGN_16 },
+    { X86::VFsXORPSrr,        X86::VFsXORPSrm,         TB_ALIGN_16 },
+    { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
+    { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
+    { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
+    { X86::VHSUBPSrr,         X86::VHSUBPSrm,          0 },
+    { X86::Int_VCMPSDrr,      X86::Int_VCMPSDrm,       0 },
+    { X86::Int_VCMPSSrr,      X86::Int_VCMPSSrm,       0 },
+    { X86::VMAXPDrr,          X86::VMAXPDrm,           0 },
+    { X86::VMAXPSrr,          X86::VMAXPSrm,           0 },
+    { X86::VMAXSDrr,          X86::VMAXSDrm,           0 },
+    { X86::VMAXSSrr,          X86::VMAXSSrm,           0 },
+    { X86::VMINPDrr,          X86::VMINPDrm,           0 },
+    { X86::VMINPSrr,          X86::VMINPSrm,           0 },
+    { X86::VMINSDrr,          X86::VMINSDrm,           0 },
+    { X86::VMINSSrr,          X86::VMINSSrm,           0 },
+    { X86::VMPSADBWrri,       X86::VMPSADBWrmi,        0 },
+    { X86::VMULPDrr,          X86::VMULPDrm,           0 },
+    { X86::VMULPSrr,          X86::VMULPSrm,           0 },
+    { X86::VMULSDrr,          X86::VMULSDrm,           0 },
+    { X86::VMULSSrr,          X86::VMULSSrm,           0 },
+    { X86::VORPDrr,           X86::VORPDrm,            0 },
+    { X86::VORPSrr,           X86::VORPSrm,            0 },
+    { X86::VPACKSSDWrr,       X86::VPACKSSDWrm,        0 },
+    { X86::VPACKSSWBrr,       X86::VPACKSSWBrm,        0 },
+    { X86::VPACKUSDWrr,       X86::VPACKUSDWrm,        0 },
+    { X86::VPACKUSWBrr,       X86::VPACKUSWBrm,        0 },
+    { X86::VPADDBrr,          X86::VPADDBrm,           0 },
+    { X86::VPADDDrr,          X86::VPADDDrm,           0 },
+    { X86::VPADDQrr,          X86::VPADDQrm,           0 },
+    { X86::VPADDSBrr,         X86::VPADDSBrm,          0 },
+    { X86::VPADDSWrr,         X86::VPADDSWrm,          0 },
+    { X86::VPADDUSBrr,        X86::VPADDUSBrm,         0 },
+    { X86::VPADDUSWrr,        X86::VPADDUSWrm,         0 },
+    { X86::VPADDWrr,          X86::VPADDWrm,           0 },
+    { X86::VPALIGNR128rr,     X86::VPALIGNR128rm,      0 },
+    { X86::VPANDNrr,          X86::VPANDNrm,           0 },
+    { X86::VPANDrr,           X86::VPANDrm,            0 },
+    { X86::VPAVGBrr,          X86::VPAVGBrm,           0 },
+    { X86::VPAVGWrr,          X86::VPAVGWrm,           0 },
+    { X86::VPBLENDWrri,       X86::VPBLENDWrmi,        0 },
+    { X86::VPCMPEQBrr,        X86::VPCMPEQBrm,         0 },
+    { X86::VPCMPEQDrr,        X86::VPCMPEQDrm,         0 },
+    { X86::VPCMPEQQrr,        X86::VPCMPEQQrm,         0 },
+    { X86::VPCMPEQWrr,        X86::VPCMPEQWrm,         0 },
+    { X86::VPCMPGTBrr,        X86::VPCMPGTBrm,         0 },
+    { X86::VPCMPGTDrr,        X86::VPCMPGTDrm,         0 },
+    { X86::VPCMPGTQrr,        X86::VPCMPGTQrm,         0 },
+    { X86::VPCMPGTWrr,        X86::VPCMPGTWrm,         0 },
+    { X86::VPHADDDrr,         X86::VPHADDDrm,          0 },
+    { X86::VPHADDSWrr128,     X86::VPHADDSWrm128,      0 },
+    { X86::VPHADDWrr,         X86::VPHADDWrm,          0 },
+    { X86::VPHSUBDrr,         X86::VPHSUBDrm,          0 },
+    { X86::VPHSUBSWrr128,     X86::VPHSUBSWrm128,      0 },
+    { X86::VPHSUBWrr,         X86::VPHSUBWrm,          0 },
+    { X86::VPERMILPDrr,       X86::VPERMILPDrm,        0 },
+    { X86::VPERMILPSrr,       X86::VPERMILPSrm,        0 },
+    { X86::VPINSRWrri,        X86::VPINSRWrmi,         0 },
+    { X86::VPMADDUBSWrr128,   X86::VPMADDUBSWrm128,    0 },
+    { X86::VPMADDWDrr,        X86::VPMADDWDrm,         0 },
+    { X86::VPMAXSWrr,         X86::VPMAXSWrm,          0 },
+    { X86::VPMAXUBrr,         X86::VPMAXUBrm,          0 },
+    { X86::VPMINSWrr,         X86::VPMINSWrm,          0 },
+    { X86::VPMINUBrr,         X86::VPMINUBrm,          0 },
+    { X86::VPMINSBrr,         X86::VPMINSBrm,          0 },
+    { X86::VPMINSDrr,         X86::VPMINSDrm,          0 },
+    { X86::VPMINUDrr,         X86::VPMINUDrm,          0 },
+    { X86::VPMINUWrr,         X86::VPMINUWrm,          0 },
+    { X86::VPMAXSBrr,         X86::VPMAXSBrm,          0 },
+    { X86::VPMAXSDrr,         X86::VPMAXSDrm,          0 },
+    { X86::VPMAXUDrr,         X86::VPMAXUDrm,          0 },
+    { X86::VPMAXUWrr,         X86::VPMAXUWrm,          0 },
+    { X86::VPMULDQrr,         X86::VPMULDQrm,          0 },
+    { X86::VPMULHRSWrr128,    X86::VPMULHRSWrm128,     0 },
+    { X86::VPMULHUWrr,        X86::VPMULHUWrm,         0 },
+    { X86::VPMULHWrr,         X86::VPMULHWrm,          0 },
+    { X86::VPMULLDrr,         X86::VPMULLDrm,          0 },
+    { X86::VPMULLWrr,         X86::VPMULLWrm,          0 },
+    { X86::VPMULUDQrr,        X86::VPMULUDQrm,         0 },
+    { X86::VPORrr,            X86::VPORrm,             0 },
+    { X86::VPSADBWrr,         X86::VPSADBWrm,          0 },
+    { X86::VPSHUFBrr,         X86::VPSHUFBrm,          0 },
+    { X86::VPSIGNBrr,         X86::VPSIGNBrm,          0 },
+    { X86::VPSIGNWrr,         X86::VPSIGNWrm,          0 },
+    { X86::VPSIGNDrr,         X86::VPSIGNDrm,          0 },
+    { X86::VPSLLDrr,          X86::VPSLLDrm,           0 },
+    { X86::VPSLLQrr,          X86::VPSLLQrm,           0 },
+    { X86::VPSLLWrr,          X86::VPSLLWrm,           0 },
+    { X86::VPSRADrr,          X86::VPSRADrm,           0 },
+    { X86::VPSRAWrr,          X86::VPSRAWrm,           0 },
+    { X86::VPSRLDrr,          X86::VPSRLDrm,           0 },
+    { X86::VPSRLQrr,          X86::VPSRLQrm,           0 },
+    { X86::VPSRLWrr,          X86::VPSRLWrm,           0 },
+    { X86::VPSUBBrr,          X86::VPSUBBrm,           0 },
+    { X86::VPSUBDrr,          X86::VPSUBDrm,           0 },
+    { X86::VPSUBSBrr,         X86::VPSUBSBrm,          0 },
+    { X86::VPSUBSWrr,         X86::VPSUBSWrm,          0 },
+    { X86::VPSUBWrr,          X86::VPSUBWrm,           0 },
+    { X86::VPUNPCKHBWrr,      X86::VPUNPCKHBWrm,       0 },
+    { X86::VPUNPCKHDQrr,      X86::VPUNPCKHDQrm,       0 },
+    { X86::VPUNPCKHQDQrr,     X86::VPUNPCKHQDQrm,      0 },
+    { X86::VPUNPCKHWDrr,      X86::VPUNPCKHWDrm,       0 },
+    { X86::VPUNPCKLBWrr,      X86::VPUNPCKLBWrm,       0 },
+    { X86::VPUNPCKLDQrr,      X86::VPUNPCKLDQrm,       0 },
+    { X86::VPUNPCKLQDQrr,     X86::VPUNPCKLQDQrm,      0 },
+    { X86::VPUNPCKLWDrr,      X86::VPUNPCKLWDrm,       0 },
+    { X86::VPXORrr,           X86::VPXORrm,            0 },
+    { X86::VSHUFPDrri,        X86::VSHUFPDrmi,         0 },
+    { X86::VSHUFPSrri,        X86::VSHUFPSrmi,         0 },
+    { X86::VSUBPDrr,          X86::VSUBPDrm,           0 },
+    { X86::VSUBPSrr,          X86::VSUBPSrm,           0 },
+    { X86::VSUBSDrr,          X86::VSUBSDrm,           0 },
+    { X86::VSUBSSrr,          X86::VSUBSSrm,           0 },
+    { X86::VUNPCKHPDrr,       X86::VUNPCKHPDrm,        0 },
+    { X86::VUNPCKHPSrr,       X86::VUNPCKHPSrm,        0 },
+    { X86::VUNPCKLPDrr,       X86::VUNPCKLPDrm,        0 },
+    { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        0 },
+    { X86::VXORPDrr,          X86::VXORPDrm,           0 },
+    { X86::VXORPSrr,          X86::VXORPSrm,           0 },
+    // AVX 256-bit foldable instructions
+    { X86::VADDPDYrr,         X86::VADDPDYrm,          0 },
+    { X86::VADDPSYrr,         X86::VADDPSYrm,          0 },
+    { X86::VADDSUBPDYrr,      X86::VADDSUBPDYrm,       0 },
+    { X86::VADDSUBPSYrr,      X86::VADDSUBPSYrm,       0 },
+    { X86::VANDNPDYrr,        X86::VANDNPDYrm,         0 },
+    { X86::VANDNPSYrr,        X86::VANDNPSYrm,         0 },
+    { X86::VANDPDYrr,         X86::VANDPDYrm,          0 },
+    { X86::VANDPSYrr,         X86::VANDPSYrm,          0 },
+    { X86::VBLENDPDYrri,      X86::VBLENDPDYrmi,       0 },
+    { X86::VBLENDPSYrri,      X86::VBLENDPSYrmi,       0 },
+    { X86::VBLENDVPDYrr,      X86::VBLENDVPDYrm,       0 },
+    { X86::VBLENDVPSYrr,      X86::VBLENDVPSYrm,       0 },
+    { X86::VCMPPDYrri,        X86::VCMPPDYrmi,         0 },
+    { X86::VCMPPSYrri,        X86::VCMPPSYrmi,         0 },
+    { X86::VDIVPDYrr,         X86::VDIVPDYrm,          0 },
+    { X86::VDIVPSYrr,         X86::VDIVPSYrm,          0 },
+    { X86::VHADDPDYrr,        X86::VHADDPDYrm,         0 },
+    { X86::VHADDPSYrr,        X86::VHADDPSYrm,         0 },
+    { X86::VHSUBPDYrr,        X86::VHSUBPDYrm,         0 },
+    { X86::VHSUBPSYrr,        X86::VHSUBPSYrm,         0 },
+    { X86::VINSERTF128rr,     X86::VINSERTF128rm,      0 },
+    { X86::VMAXPDYrr,         X86::VMAXPDYrm,          0 },
+    { X86::VMAXPSYrr,         X86::VMAXPSYrm,          0 },
+    { X86::VMINPDYrr,         X86::VMINPDYrm,          0 },
+    { X86::VMINPSYrr,         X86::VMINPSYrm,          0 },
+    { X86::VMULPDYrr,         X86::VMULPDYrm,          0 },
+    { X86::VMULPSYrr,         X86::VMULPSYrm,          0 },
+    { X86::VORPDYrr,          X86::VORPDYrm,           0 },
+    { X86::VORPSYrr,          X86::VORPSYrm,           0 },
+    { X86::VPERM2F128rr,      X86::VPERM2F128rm,       0 },
+    { X86::VPERMILPDYrr,      X86::VPERMILPDYrm,       0 },
+    { X86::VPERMILPSYrr,      X86::VPERMILPSYrm,       0 },
+    { X86::VSHUFPDYrri,       X86::VSHUFPDYrmi,        0 },
+    { X86::VSHUFPSYrri,       X86::VSHUFPSYrmi,        0 },
+    { X86::VSUBPDYrr,         X86::VSUBPDYrm,          0 },
+    { X86::VSUBPSYrr,         X86::VSUBPSYrm,          0 },
+    { X86::VUNPCKHPDYrr,      X86::VUNPCKHPDYrm,       0 },
+    { X86::VUNPCKHPSYrr,      X86::VUNPCKHPSYrm,       0 },
+    { X86::VUNPCKLPDYrr,      X86::VUNPCKLPDYrm,       0 },
+    { X86::VUNPCKLPSYrr,      X86::VUNPCKLPSYrm,       0 },
+    { X86::VXORPDYrr,         X86::VXORPDYrm,          0 },
+    { X86::VXORPSYrr,         X86::VXORPSYrm,          0 },
+    // AVX2 foldable instructions
+    { X86::VINSERTI128rr,     X86::VINSERTI128rm,      0 },
+    { X86::VPACKSSDWYrr,      X86::VPACKSSDWYrm,       0 },
+    { X86::VPACKSSWBYrr,      X86::VPACKSSWBYrm,       0 },
+    { X86::VPACKUSDWYrr,      X86::VPACKUSDWYrm,       0 },
+    { X86::VPACKUSWBYrr,      X86::VPACKUSWBYrm,       0 },
+    { X86::VPADDBYrr,         X86::VPADDBYrm,          0 },
+    { X86::VPADDDYrr,         X86::VPADDDYrm,          0 },
+    { X86::VPADDQYrr,         X86::VPADDQYrm,          0 },
+    { X86::VPADDSBYrr,        X86::VPADDSBYrm,         0 },
+    { X86::VPADDSWYrr,        X86::VPADDSWYrm,         0 },
+    { X86::VPADDUSBYrr,       X86::VPADDUSBYrm,        0 },
+    { X86::VPADDUSWYrr,       X86::VPADDUSWYrm,        0 },
+    { X86::VPADDWYrr,         X86::VPADDWYrm,          0 },
+    { X86::VPALIGNR256rr,     X86::VPALIGNR256rm,      0 },
+    { X86::VPANDNYrr,         X86::VPANDNYrm,          0 },
+    { X86::VPANDYrr,          X86::VPANDYrm,           0 },
+    { X86::VPAVGBYrr,         X86::VPAVGBYrm,          0 },
+    { X86::VPAVGWYrr,         X86::VPAVGWYrm,          0 },
+    { X86::VPBLENDDrri,       X86::VPBLENDDrmi,        0 },
+    { X86::VPBLENDDYrri,      X86::VPBLENDDYrmi,       0 },
+    { X86::VPBLENDWYrri,      X86::VPBLENDWYrmi,       0 },
+    { X86::VPCMPEQBYrr,       X86::VPCMPEQBYrm,        0 },
+    { X86::VPCMPEQDYrr,       X86::VPCMPEQDYrm,        0 },
+    { X86::VPCMPEQQYrr,       X86::VPCMPEQQYrm,        0 },
+    { X86::VPCMPEQWYrr,       X86::VPCMPEQWYrm,        0 },
+    { X86::VPCMPGTBYrr,       X86::VPCMPGTBYrm,        0 },
+    { X86::VPCMPGTDYrr,       X86::VPCMPGTDYrm,        0 },
+    { X86::VPCMPGTQYrr,       X86::VPCMPGTQYrm,        0 },
+    { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        0 },
+    { X86::VPERM2I128rr,      X86::VPERM2I128rm,       0 },
+    { X86::VPERMDYrr,         X86::VPERMDYrm,          0 },
+    { X86::VPERMPDYri,        X86::VPERMPDYmi,         0 },
+    { X86::VPERMPSYrr,        X86::VPERMPSYrm,         0 },
+    { X86::VPERMQYri,         X86::VPERMQYmi,          0 },
+    { X86::VPHADDDYrr,        X86::VPHADDDYrm,         0 },
+    { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      0 },
+    { X86::VPHADDWYrr,        X86::VPHADDWYrm,         0 },
+    { X86::VPHSUBDYrr,        X86::VPHSUBDYrm,         0 },
+    { X86::VPHSUBSWrr256,     X86::VPHSUBSWrm256,      0 },
+    { X86::VPHSUBWYrr,        X86::VPHSUBWYrm,         0 },
+    { X86::VPMADDUBSWrr256,   X86::VPMADDUBSWrm256,    0 },
+    { X86::VPMADDWDYrr,       X86::VPMADDWDYrm,        0 },
+    { X86::VPMAXSWYrr,        X86::VPMAXSWYrm,         0 },
+    { X86::VPMAXUBYrr,        X86::VPMAXUBYrm,         0 },
+    { X86::VPMINSWYrr,        X86::VPMINSWYrm,         0 },
+    { X86::VPMINUBYrr,        X86::VPMINUBYrm,         0 },
+    { X86::VPMINSBYrr,        X86::VPMINSBYrm,         0 },
+    { X86::VPMINSDYrr,        X86::VPMINSDYrm,         0 },
+    { X86::VPMINUDYrr,        X86::VPMINUDYrm,         0 },
+    { X86::VPMINUWYrr,        X86::VPMINUWYrm,         0 },
+    { X86::VPMAXSBYrr,        X86::VPMAXSBYrm,         0 },
+    { X86::VPMAXSDYrr,        X86::VPMAXSDYrm,         0 },
+    { X86::VPMAXUDYrr,        X86::VPMAXUDYrm,         0 },
+    { X86::VPMAXUWYrr,        X86::VPMAXUWYrm,         0 },
+    { X86::VMPSADBWYrri,      X86::VMPSADBWYrmi,       0 },
+    { X86::VPMULDQYrr,        X86::VPMULDQYrm,         0 },
+    { X86::VPMULHRSWrr256,    X86::VPMULHRSWrm256,     0 },
+    { X86::VPMULHUWYrr,       X86::VPMULHUWYrm,        0 },
+    { X86::VPMULHWYrr,        X86::VPMULHWYrm,         0 },
+    { X86::VPMULLDYrr,        X86::VPMULLDYrm,         0 },
+    { X86::VPMULLWYrr,        X86::VPMULLWYrm,         0 },
+    { X86::VPMULUDQYrr,       X86::VPMULUDQYrm,        0 },
+    { X86::VPORYrr,           X86::VPORYrm,            0 },
+    { X86::VPSADBWYrr,        X86::VPSADBWYrm,         0 },
+    { X86::VPSHUFBYrr,        X86::VPSHUFBYrm,         0 },
+    { X86::VPSIGNBYrr,        X86::VPSIGNBYrm,         0 },
+    { X86::VPSIGNWYrr,        X86::VPSIGNWYrm,         0 },
+    { X86::VPSIGNDYrr,        X86::VPSIGNDYrm,         0 },
+    { X86::VPSLLDYrr,         X86::VPSLLDYrm,          0 },
+    { X86::VPSLLQYrr,         X86::VPSLLQYrm,          0 },
+    { X86::VPSLLWYrr,         X86::VPSLLWYrm,          0 },
+    { X86::VPSLLVDrr,         X86::VPSLLVDrm,          0 },
+    { X86::VPSLLVDYrr,        X86::VPSLLVDYrm,         0 },
+    { X86::VPSLLVQrr,         X86::VPSLLVQrm,          0 },
+    { X86::VPSLLVQYrr,        X86::VPSLLVQYrm,         0 },
+    { X86::VPSRADYrr,         X86::VPSRADYrm,          0 },
+    { X86::VPSRAWYrr,         X86::VPSRAWYrm,          0 },
+    { X86::VPSRAVDrr,         X86::VPSRAVDrm,          0 },
+    { X86::VPSRAVDYrr,        X86::VPSRAVDYrm,         0 },
+    { X86::VPSRLDYrr,         X86::VPSRLDYrm,          0 },
+    { X86::VPSRLQYrr,         X86::VPSRLQYrm,          0 },
+    { X86::VPSRLWYrr,         X86::VPSRLWYrm,          0 },
+    { X86::VPSRLVDrr,         X86::VPSRLVDrm,          0 },
+    { X86::VPSRLVDYrr,        X86::VPSRLVDYrm,         0 },
+    { X86::VPSRLVQrr,         X86::VPSRLVQrm,          0 },
+    { X86::VPSRLVQYrr,        X86::VPSRLVQYrm,         0 },
+    { X86::VPSUBBYrr,         X86::VPSUBBYrm,          0 },
+    { X86::VPSUBDYrr,         X86::VPSUBDYrm,          0 },
+    { X86::VPSUBSBYrr,        X86::VPSUBSBYrm,         0 },
+    { X86::VPSUBSWYrr,        X86::VPSUBSWYrm,         0 },
+    { X86::VPSUBWYrr,         X86::VPSUBWYrm,          0 },
+    { X86::VPUNPCKHBWYrr,     X86::VPUNPCKHBWYrm,      0 },
+    { X86::VPUNPCKHDQYrr,     X86::VPUNPCKHDQYrm,      0 },
+    { X86::VPUNPCKHQDQYrr,    X86::VPUNPCKHQDQYrm,     0 },
+    { X86::VPUNPCKHWDYrr,     X86::VPUNPCKHWDYrm,      0 },
+    { X86::VPUNPCKLBWYrr,     X86::VPUNPCKLBWYrm,      0 },
+    { X86::VPUNPCKLDQYrr,     X86::VPUNPCKLDQYrm,      0 },
+    { X86::VPUNPCKLQDQYrr,    X86::VPUNPCKLQDQYrm,     0 },
+    { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      0 },
+    { X86::VPXORYrr,          X86::VPXORYrm,           0 },
+    // FIXME: add AVX 256-bit foldable instructions
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0           },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0           },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0           },
+    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0           },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0           },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0           },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0           },
+    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0           },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
+
+    // BMI/BMI2 foldable instructions
+    { X86::ANDN32rr,          X86::ANDN32rm,            0 },
+    { X86::ANDN64rr,          X86::ANDN64rm,            0 },
+    { X86::MULX32rr,          X86::MULX32rm,            0 },
+    { X86::MULX64rr,          X86::MULX64rm,            0 },
+    { X86::PDEP32rr,          X86::PDEP32rm,            0 },
+    { X86::PDEP64rr,          X86::PDEP64rm,            0 },
+    { X86::PEXT32rr,          X86::PEXT32rm,            0 },
+    { X86::PEXT64rr,          X86::PEXT64rm,            0 },
+
+    // AVX-512 foldable instructions
+    { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
+    { X86::VADDPDZrr,         X86::VADDPDZrm,           0 },
+    { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
+    { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
+    { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
+    { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
+    { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
+    { X86::VDIVPDZrr,         X86::VDIVPDZrm,           0 },
+    { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
+    { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
+    { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
+    { X86::VMAXPDZrr,         X86::VMAXPDZrm,           0 },
+    { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
+    { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
+    { X86::VPERMPDZri,        X86::VPERMPDZmi,          0 },
+    { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
+    { X86::VPMAXSDZrr,        X86::VPMAXSDZrm,          0 },
+    { X86::VPMAXSQZrr,        X86::VPMAXSQZrm,          0 },
+    { X86::VPMAXUDZrr,        X86::VPMAXUDZrm,          0 },
+    { X86::VPMAXUQZrr,        X86::VPMAXUQZrm,          0 },
+    { X86::VPMINSDZrr,        X86::VPMINSDZrm,          0 },
+    { X86::VPMINSQZrr,        X86::VPMINSQZrm,          0 },
+    { X86::VPMINUDZrr,        X86::VPMINUDZrm,          0 },
+    { X86::VPMINUQZrr,        X86::VPMINUQZrm,          0 },
+    { X86::VPMULDQZrr,        X86::VPMULDQZrm,          0 },
+    { X86::VPSLLVDZrr,        X86::VPSLLVDZrm,          0 },
+    { X86::VPSLLVQZrr,        X86::VPSLLVQZrm,          0 },
+    { X86::VPSRAVDZrr,        X86::VPSRAVDZrm,          0 },
+    { X86::VPSRLVDZrr,        X86::VPSRLVDZrm,          0 },
+    { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
+    { X86::VPSUBDZrr,         X86::VPSUBDZrm,           0 },
+    { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
+    { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
+    { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
+    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
+    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+    { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
+
+    // AES foldable instructions
+    { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
+    { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
+    { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
+    { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
+    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       TB_ALIGN_16 },
+    { X86::VAESDECrr,         X86::VAESDECrm,           TB_ALIGN_16 },
+    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       TB_ALIGN_16 },
+    { X86::VAESENCrr,         X86::VAESENCrm,           TB_ALIGN_16 },
+
+    // SHA foldable instructions
+    { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
+    { X86::SHA1MSG2rr,        X86::SHA1MSG2rm,          TB_ALIGN_16 },
+    { X86::SHA1NEXTErr,       X86::SHA1NEXTErm,         TB_ALIGN_16 },
+    { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
+    { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
+    { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
+    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 },
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+    unsigned RegOp = OpTbl2[i].RegOp;
+    unsigned MemOp = OpTbl2[i].MemOp;
+    unsigned Flags = OpTbl2[i].Flags;
+    AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 2, folded load
+                  Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+  }
+
+  static const X86OpTblEntry OpTbl3[] = {
+    // FMA foldable instructions
+    { X86::VFMADDSSr231r,         X86::VFMADDSSr231m,         TB_ALIGN_NONE },
+    { X86::VFMADDSDr231r,         X86::VFMADDSDr231m,         TB_ALIGN_NONE },
+    { X86::VFMADDSSr132r,         X86::VFMADDSSr132m,         TB_ALIGN_NONE },
+    { X86::VFMADDSDr132r,         X86::VFMADDSDr132m,         TB_ALIGN_NONE },
+    { X86::VFMADDSSr213r,         X86::VFMADDSSr213m,         TB_ALIGN_NONE },
+    { X86::VFMADDSDr213r,         X86::VFMADDSDr213m,         TB_ALIGN_NONE },
+
+    { X86::VFMADDPSr231r,         X86::VFMADDPSr231m,         TB_ALIGN_NONE },
+    { X86::VFMADDPDr231r,         X86::VFMADDPDr231m,         TB_ALIGN_NONE },
+    { X86::VFMADDPSr132r,         X86::VFMADDPSr132m,         TB_ALIGN_NONE },
+    { X86::VFMADDPDr132r,         X86::VFMADDPDr132m,         TB_ALIGN_NONE },
+    { X86::VFMADDPSr213r,         X86::VFMADDPSr213m,         TB_ALIGN_NONE },
+    { X86::VFMADDPDr213r,         X86::VFMADDPDr213m,         TB_ALIGN_NONE },
+    { X86::VFMADDPSr231rY,        X86::VFMADDPSr231mY,        TB_ALIGN_NONE },
+    { X86::VFMADDPDr231rY,        X86::VFMADDPDr231mY,        TB_ALIGN_NONE },
+    { X86::VFMADDPSr132rY,        X86::VFMADDPSr132mY,        TB_ALIGN_NONE },
+    { X86::VFMADDPDr132rY,        X86::VFMADDPDr132mY,        TB_ALIGN_NONE },
+    { X86::VFMADDPSr213rY,        X86::VFMADDPSr213mY,        TB_ALIGN_NONE },
+    { X86::VFMADDPDr213rY,        X86::VFMADDPDr213mY,        TB_ALIGN_NONE },
+
+    { X86::VFNMADDSSr231r,        X86::VFNMADDSSr231m,        TB_ALIGN_NONE },
+    { X86::VFNMADDSDr231r,        X86::VFNMADDSDr231m,        TB_ALIGN_NONE },
+    { X86::VFNMADDSSr132r,        X86::VFNMADDSSr132m,        TB_ALIGN_NONE },
+    { X86::VFNMADDSDr132r,        X86::VFNMADDSDr132m,        TB_ALIGN_NONE },
+    { X86::VFNMADDSSr213r,        X86::VFNMADDSSr213m,        TB_ALIGN_NONE },
+    { X86::VFNMADDSDr213r,        X86::VFNMADDSDr213m,        TB_ALIGN_NONE },
+
+    { X86::VFNMADDPSr231r,        X86::VFNMADDPSr231m,        TB_ALIGN_NONE },
+    { X86::VFNMADDPDr231r,        X86::VFNMADDPDr231m,        TB_ALIGN_NONE },
+    { X86::VFNMADDPSr132r,        X86::VFNMADDPSr132m,        TB_ALIGN_NONE },
+    { X86::VFNMADDPDr132r,        X86::VFNMADDPDr132m,        TB_ALIGN_NONE },
+    { X86::VFNMADDPSr213r,        X86::VFNMADDPSr213m,        TB_ALIGN_NONE },
+    { X86::VFNMADDPDr213r,        X86::VFNMADDPDr213m,        TB_ALIGN_NONE },
+    { X86::VFNMADDPSr231rY,       X86::VFNMADDPSr231mY,       TB_ALIGN_NONE },
+    { X86::VFNMADDPDr231rY,       X86::VFNMADDPDr231mY,       TB_ALIGN_NONE },
+    { X86::VFNMADDPSr132rY,       X86::VFNMADDPSr132mY,       TB_ALIGN_NONE },
+    { X86::VFNMADDPDr132rY,       X86::VFNMADDPDr132mY,       TB_ALIGN_NONE },
+    { X86::VFNMADDPSr213rY,       X86::VFNMADDPSr213mY,       TB_ALIGN_NONE },
+    { X86::VFNMADDPDr213rY,       X86::VFNMADDPDr213mY,       TB_ALIGN_NONE },
+
+    { X86::VFMSUBSSr231r,         X86::VFMSUBSSr231m,         TB_ALIGN_NONE },
+    { X86::VFMSUBSDr231r,         X86::VFMSUBSDr231m,         TB_ALIGN_NONE },
+    { X86::VFMSUBSSr132r,         X86::VFMSUBSSr132m,         TB_ALIGN_NONE },
+    { X86::VFMSUBSDr132r,         X86::VFMSUBSDr132m,         TB_ALIGN_NONE },
+    { X86::VFMSUBSSr213r,         X86::VFMSUBSSr213m,         TB_ALIGN_NONE },
+    { X86::VFMSUBSDr213r,         X86::VFMSUBSDr213m,         TB_ALIGN_NONE },
+
+    { X86::VFMSUBPSr231r,         X86::VFMSUBPSr231m,         TB_ALIGN_NONE },
+    { X86::VFMSUBPDr231r,         X86::VFMSUBPDr231m,         TB_ALIGN_NONE },
+    { X86::VFMSUBPSr132r,         X86::VFMSUBPSr132m,         TB_ALIGN_NONE },
+    { X86::VFMSUBPDr132r,         X86::VFMSUBPDr132m,         TB_ALIGN_NONE },
+    { X86::VFMSUBPSr213r,         X86::VFMSUBPSr213m,         TB_ALIGN_NONE },
+    { X86::VFMSUBPDr213r,         X86::VFMSUBPDr213m,         TB_ALIGN_NONE },
+    { X86::VFMSUBPSr231rY,        X86::VFMSUBPSr231mY,        TB_ALIGN_NONE },
+    { X86::VFMSUBPDr231rY,        X86::VFMSUBPDr231mY,        TB_ALIGN_NONE },
+    { X86::VFMSUBPSr132rY,        X86::VFMSUBPSr132mY,        TB_ALIGN_NONE },
+    { X86::VFMSUBPDr132rY,        X86::VFMSUBPDr132mY,        TB_ALIGN_NONE },
+    { X86::VFMSUBPSr213rY,        X86::VFMSUBPSr213mY,        TB_ALIGN_NONE },
+    { X86::VFMSUBPDr213rY,        X86::VFMSUBPDr213mY,        TB_ALIGN_NONE },
+
+    { X86::VFNMSUBSSr231r,        X86::VFNMSUBSSr231m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBSDr231r,        X86::VFNMSUBSDr231m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBSSr132r,        X86::VFNMSUBSSr132m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBSDr132r,        X86::VFNMSUBSDr132m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBSSr213r,        X86::VFNMSUBSSr213m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBSDr213r,        X86::VFNMSUBSDr213m,        TB_ALIGN_NONE },
+
+    { X86::VFNMSUBPSr231r,        X86::VFNMSUBPSr231m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBPDr231r,        X86::VFNMSUBPDr231m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBPSr132r,        X86::VFNMSUBPSr132m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBPDr132r,        X86::VFNMSUBPDr132m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBPSr213r,        X86::VFNMSUBPSr213m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBPDr213r,        X86::VFNMSUBPDr213m,        TB_ALIGN_NONE },
+    { X86::VFNMSUBPSr231rY,       X86::VFNMSUBPSr231mY,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPDr231rY,       X86::VFNMSUBPDr231mY,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPSr132rY,       X86::VFNMSUBPSr132mY,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPDr132rY,       X86::VFNMSUBPDr132mY,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPSr213rY,       X86::VFNMSUBPSr213mY,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPDr213rY,       X86::VFNMSUBPDr213mY,       TB_ALIGN_NONE },
+
+    { X86::VFMADDSUBPSr231r,      X86::VFMADDSUBPSr231m,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPDr231r,      X86::VFMADDSUBPDr231m,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPSr132r,      X86::VFMADDSUBPSr132m,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPDr132r,      X86::VFMADDSUBPDr132m,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPSr213r,      X86::VFMADDSUBPSr213m,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPDr213r,      X86::VFMADDSUBPDr213m,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPSr231rY,     X86::VFMADDSUBPSr231mY,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPDr231rY,     X86::VFMADDSUBPDr231mY,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPSr132rY,     X86::VFMADDSUBPSr132mY,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPDr132rY,     X86::VFMADDSUBPDr132mY,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPSr213rY,     X86::VFMADDSUBPSr213mY,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPDr213rY,     X86::VFMADDSUBPDr213mY,     TB_ALIGN_NONE },
+
+    { X86::VFMSUBADDPSr231r,      X86::VFMSUBADDPSr231m,      TB_ALIGN_NONE },
+    { X86::VFMSUBADDPDr231r,      X86::VFMSUBADDPDr231m,      TB_ALIGN_NONE },
+    { X86::VFMSUBADDPSr132r,      X86::VFMSUBADDPSr132m,      TB_ALIGN_NONE },
+    { X86::VFMSUBADDPDr132r,      X86::VFMSUBADDPDr132m,      TB_ALIGN_NONE },
+    { X86::VFMSUBADDPSr213r,      X86::VFMSUBADDPSr213m,      TB_ALIGN_NONE },
+    { X86::VFMSUBADDPDr213r,      X86::VFMSUBADDPDr213m,      TB_ALIGN_NONE },
+    { X86::VFMSUBADDPSr231rY,     X86::VFMSUBADDPSr231mY,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPDr231rY,     X86::VFMSUBADDPDr231mY,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPSr132rY,     X86::VFMSUBADDPSr132mY,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPDr132rY,     X86::VFMSUBADDPDr132mY,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPSr213rY,     X86::VFMSUBADDPSr213mY,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPDr213rY,     X86::VFMSUBADDPDr213mY,     TB_ALIGN_NONE },
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           0           },
+    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           0           },
+    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          0           },
+    { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          0           },
+    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           0           },
+    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           0           },
+    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          0           },
+    { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          0           },
+    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
+    // AVX-512 VPERMI instructions with 3 source operands.
+    { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
+    { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
+    { X86::VPERMI2PSrr,           X86::VPERMI2PSrm,           0 },
+    { X86::VPERMI2PDrr,           X86::VPERMI2PDrm,           0 },
+    { X86::VBLENDMPDZrr,          X86::VBLENDMPDZrm,          0 },
+    { X86::VBLENDMPSZrr,          X86::VBLENDMPSZrm,          0 },
+    { X86::VPBLENDMDZrr,          X86::VPBLENDMDZrm,          0 },
+    { X86::VPBLENDMQZrr,          X86::VPBLENDMQZrm,          0 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
+    unsigned RegOp = OpTbl3[i].RegOp;
+    unsigned MemOp = OpTbl3[i].MemOp;
+    unsigned Flags = OpTbl3[i].Flags;
+    AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 3, folded load
+                  Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+  }
+
+}
+
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags) {
+    if ((Flags & TB_NO_FORWARD) == 0) {
+      assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+      R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+    }
+    if ((Flags & TB_NO_REVERSE) == 0) {
+      assert(!M2RTable.count(MemOp) &&
+           "Duplicated entries in unfolding maps?");
+      M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+    }
+}
+
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                    unsigned &SrcReg, unsigned &DstReg,
+                                    unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case X86::MOVSX16rr8:
+  case X86::MOVZX16rr8:
+  case X86::MOVSX32rr8:
+  case X86::MOVZX32rr8:
+  case X86::MOVSX64rr8:
+    if (!Subtarget.is64Bit())
+      // It's not always legal to reference the low 8-bit of the larger
+      // register in 32-bit mode.
+      return false;
+  case X86::MOVSX32rr16:
+  case X86::MOVZX32rr16:
+  case X86::MOVSX64rr16:
+  case X86::MOVSX64rr32: {
+    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+      // Be conservative.
+      return false;
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVSX16rr8:
+    case X86::MOVZX16rr8:
+    case X86::MOVSX32rr8:
+    case X86::MOVZX32rr8:
+    case X86::MOVSX64rr8:
+      SubIdx = X86::sub_8bit;
+      break;
+    case X86::MOVSX32rr16:
+    case X86::MOVZX32rr16:
+    case X86::MOVSX64rr16:
+      SubIdx = X86::sub_16bit;
+      break;
+    case X86::MOVSX64rr32:
+      SubIdx = X86::sub_32bit;
+      break;
+    }
+    return true;
+  }
+  }
+  return false;
+}
+
+/// isFrameOperand - Return true and the FrameIndex if the specified
+/// operand and follow operands form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+                                  int &FrameIndex) const {
+  if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
+      MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
+      MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+      MI->getOperand(Op+X86::AddrDisp).isImm() &&
+      MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
+      MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
+      MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
+    FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
+    return true;
+  }
+  return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+    return true;
+  }
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+  switch (Opcode) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV32mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MOVDQAmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVAPSmr:
+  case X86::VMOVAPDmr:
+  case X86::VMOVDQAmr:
+  case X86::VMOVAPSYmr:
+  case X86::VMOVAPDYmr:
+  case X86::VMOVDQAYmr:
+  case X86::VMOVUPSZmr:
+  case X86::VMOVAPSZmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                           int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode()))
+    if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+      return MI->getOperand(0).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                 int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                          int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode()))
+    if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+        isFrameOperand(MI, 0, FrameIndex))
+      return MI->getOperand(X86::AddrNumOperands).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+/// regIsPICBase - Return true if register is PIC base (i.e.g defined by
+/// X86::MOVPC32r.
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+  // Don't waste compile time scanning use-def chains of physregs.
+  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+    return false;
+  bool isPICBase = false;
+  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
+         E = MRI.def_instr_end(); I != E; ++I) {
+    MachineInstr *DefMI = &*I;
+    if (DefMI->getOpcode() != X86::MOVPC32r)
+      return false;
+    assert(!isPICBase && "More than one PIC base?");
+    isPICBase = true;
+  }
+  return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+                                                AliasAnalysis *AA) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm: {
+    // Loads from constant pools are trivially rematerializable.
+    if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
+        MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+        MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+        MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
+        MI->isInvariantLoad(AA)) {
+      unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
+      if (BaseReg == 0 || BaseReg == X86::RIP)
+        return true;
+      // Allow re-materialization of PIC load.
+      if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
+        return false;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+
+  case X86::LEA32r:
+  case X86::LEA64r: {
+    if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+        MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+        MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
+        !MI->getOperand(1+X86::AddrDisp).isReg()) {
+      // lea fi#, lea GV, etc. are all rematerializable.
+      if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
+        return true;
+      unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
+      if (BaseReg == 0)
+        return true;
+      // Allow re-materialization of lea PICBase + x.
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+  }
+
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
+bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I) const {
+  MachineBasicBlock::iterator E = MBB.end();
+
+  // For compile time consideration, if we are not able to determine the
+  // safety after visiting 4 instructions in each direction, we will assume
+  // it's not safe.
+  MachineBasicBlock::iterator Iter = I;
+  for (unsigned i = 0; Iter != E && i < 4; ++i) {
+    bool SeenDef = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SeenDef = true;
+      if (!MO.isReg())
+        continue;
+      if (MO.getReg() == X86::EFLAGS) {
+        if (MO.isUse())
+          return false;
+        SeenDef = true;
+      }
+    }
+
+    if (SeenDef)
+      // This instruction defines EFLAGS, no need to look any further.
+      return true;
+    ++Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != E && Iter->isDebugValue())
+      ++Iter;
+  }
+
+  // It is safe to clobber EFLAGS at the end of a block of no successor has it
+  // live in.
+  if (Iter == E) {
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+           SE = MBB.succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+    return true;
+  }
+
+  MachineBasicBlock::iterator B = MBB.begin();
+  Iter = I;
+  for (unsigned i = 0; i < 4; ++i) {
+    // If we make it to the beginning of the block, it's safe to clobber
+    // EFLAGS iff EFLAGS is not live-in.
+    if (Iter == B)
+      return !MBB.isLiveIn(X86::EFLAGS);
+
+    --Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != B && Iter->isDebugValue())
+      --Iter;
+
+    bool SawKill = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      // A register mask may clobber EFLAGS, but we should still look for a
+      // live EFLAGS def.
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SawKill = true;
+      if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+        if (MO.isDef()) return MO.isDead();
+        if (MO.isKill()) SawKill = true;
+      }
+    }
+
+    if (SawKill)
+      // This instruction kills EFLAGS and doesn't redefine it, so
+      // there's no need to look further.
+      return true;
+  }
+
+  // Conservative answer.
+  return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 unsigned DestReg, unsigned SubIdx,
+                                 const MachineInstr *Orig,
+                                 const TargetRegisterInfo &TRI) const {
+  // MOV32r0 is implemented with a xor which clobbers condition code.
+  // Re-materialize it as movri instructions to avoid side effects.
+  unsigned Opc = Orig->getOpcode();
+  if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
+    DebugLoc DL = Orig->getDebugLoc();
+    BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
+      .addImm(0);
+  } else {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+    MBB.insert(I, MI);
+  }
+
+  MachineInstr *NewMI = std::prev(I);
+  NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// getTruncatedShiftCount - check whether the shift count for a machine operand
+/// is non-zero.
+inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
+                                              unsigned ShiftAmtOperandIdx) {
+  // The shift count is six bits with the REX.W prefix and five bits without.
+  unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+  unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
+  return Imm & ShiftCountMask;
+}
+
+/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate
+/// can be represented by a LEA instruction.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+  // Left shift instructions can be transformed into load-effective-address
+  // instructions if we can encode them appropriately.
+  // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+  // The SIB.scale field is two bits wide which means that we can encode any
+  // shift amount less than 4.
+  return ShAmt < 4 && ShAmt > 0;
+}
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+                                  unsigned Opc, bool AllowSP,
+                                  unsigned &NewSrc, bool &isKill, bool &isUndef,
+                                  MachineOperand &ImplicitOp) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterClass *RC;
+  if (AllowSP) {
+    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+  } else {
+    RC = Opc != X86::LEA32r ?
+      &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+  }
+  unsigned SrcReg = Src.getReg();
+
+  // For both LEA64 and LEA32 the register already has essentially the right
+  // type (32-bit or 64-bit) we may just need to forbid SP.
+  if (Opc != X86::LEA64_32r) {
+    NewSrc = SrcReg;
+    isKill = Src.isKill();
+    isUndef = Src.isUndef();
+
+    if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+        !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+      return false;
+
+    return true;
+  }
+
+  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+  // another we need to add 64-bit registers to the final MI.
+  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+    ImplicitOp = Src;
+    ImplicitOp.setImplicit();
+
+    NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
+    MachineBasicBlock::LivenessQueryResult LQR =
+      MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
+
+    switch (LQR) {
+    case MachineBasicBlock::LQR_Unknown:
+      // We can't give sane liveness flags to the instruction, abandon LEA
+      // formation.
+      return false;
+    case MachineBasicBlock::LQR_Live:
+      isKill = MI->killsRegister(SrcReg);
+      isUndef = false;
+      break;
+    default:
+      // The physreg itself is dead, so we have to use it as an <undef>.
+      isKill = false;
+      isUndef = true;
+      break;
+    }
+  } else {
+    // Virtual register of the wrong class, we have to create a temporary 64-bit
+    // vreg to feed into the LEA.
+    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            get(TargetOpcode::COPY))
+      .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+        .addOperand(Src);
+
+    // Which is obviously going to be dead after we're done with it.
+    isKill = true;
+    isUndef = false;
+  }
+
+  // We've set all the parameters without issue.
+  return true;
+}
+
+/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
+/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
+/// to a 32-bit superregister and then truncating back down to a 16-bit
+/// subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+                                           MachineFunction::iterator &MFI,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+  bool isDead = MI->getOperand(0).isDead();
+  bool isKill = MI->getOperand(1).isKill();
+
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+  unsigned Opc, leaInReg;
+  if (Subtarget.is64Bit()) {
+    Opc = X86::LEA64_32r;
+    leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+  } else {
+    Opc = X86::LEA32r;
+    leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  }
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // well be shifting and then extracting the lower 16-bits.
+  // This has the potential to cause partial register stall. e.g.
+  //   movw    (%rbp,%rcx,2), %dx
+  //   leal    -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineInstr *InsMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+    .addReg(Src, getKillRegState(isKill));
+
+  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+                                    get(Opc), leaOutReg);
+  switch (MIOpc) {
+  default: llvm_unreachable("Unreachable!");
+  case X86::SHL16ri: {
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    MIB.addReg(0).addImm(1 << ShAmt)
+       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+    break;
+  }
+  case X86::INC16r:
+  case X86::INC64_16r:
+    addRegOffset(MIB, leaInReg, true, 1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    addRegOffset(MIB, leaInReg, true, -1);
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    unsigned Src2 = MI->getOperand(2).getReg();
+    bool isKill2 = MI->getOperand(2).isKill();
+    unsigned leaInReg2 = 0;
+    MachineInstr *InsMI2 = nullptr;
+    if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028
+      // just a single insert_subreg.
+      addRegReg(MIB, leaInReg, true, leaInReg, false);
+    } else {
+      if (Subtarget.is64Bit())
+        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+      else
+        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // well be shifting and then extracting the lower 16-bits.
+      BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
+      InsMI2 =
+        BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
+        .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+        .addReg(Src2, getKillRegState(isKill2));
+      addRegReg(MIB, leaInReg, true, leaInReg2, true);
+    }
+    if (LV && isKill2 && InsMI2)
+      LV->replaceKillInstruction(Src2, MI, InsMI2);
+    break;
+  }
+  }
+
+  MachineInstr *NewMI = MIB;
+  MachineInstr *ExtMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+    .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+  if (LV) {
+    // Update live variables
+    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+    if (isKill)
+      LV->replaceKillInstruction(Src, MI, InsMI);
+    if (isDead)
+      LV->replaceKillInstruction(Dest, MI, ExtMI);
+  }
+
+  return ExtMI;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand.  This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+
+  // The following opcodes also sets the condition code register(s). Only
+  // convert them to equivalent lea if the condition code register def's
+  // are dead!
+  if (hasLiveCondCodeDef(MI))
+    return nullptr;
+
+  MachineFunction &MF = *MI->getParent()->getParent();
+  // All instructions input are two-addr instructions.  Get the known operands.
+  const MachineOperand &Dest = MI->getOperand(0);
+  const MachineOperand &Src = MI->getOperand(1);
+
+  MachineInstr *NewMI = nullptr;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  // 16-bit LEA is also slow on Core2.
+  bool DisableLEA16 = true;
+  bool is64Bit = Subtarget.is64Bit();
+
+  unsigned MIOpc = MI->getOpcode();
+  switch (MIOpc) {
+  case X86::SHUFPSrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+    if (!Subtarget.hasSSE2()) return nullptr;
+
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    if (B != C) return nullptr;
+    unsigned M = MI->getOperand(3).getImm();
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+      .addOperand(Dest).addOperand(Src).addImm(M);
+    break;
+  }
+  case X86::SHUFPDrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
+    if (!Subtarget.hasSSE2()) return nullptr;
+
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    if (B != C) return nullptr;
+    unsigned M = MI->getOperand(3).getImm();
+
+    // Convert to PSHUFD mask.
+    M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
+
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+      .addOperand(Dest).addOperand(Src).addImm(M);
+    break;
+  }
+  case X86::SHL64ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+    // LEA can't handle RSP.
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR64_NOSPRegClass))
+      return nullptr;
+
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+    // LEA can't handle ESP.
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp))
+      return nullptr;
+
+    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt)
+      .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+      .addImm(0).addReg(0);
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+    NewMI = MIB;
+
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr;
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  default: {
+
+    switch (MIOpc) {
+    default: return nullptr;
+    case X86::INC64r:
+    case X86::INC32r:
+    case X86::INC64_32r: {
+      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+      unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                          SrcReg, isKill, isUndef, ImplicitOp))
+        return nullptr;
+
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+          .addOperand(Dest)
+          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+
+      NewMI = addOffset(MIB, 1);
+      break;
+    }
+    case X86::INC16r:
+    case X86::INC64_16r:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
+      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), 1);
+      break;
+    case X86::DEC64r:
+    case X86::DEC32r:
+    case X86::DEC64_32r: {
+      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+      unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                          SrcReg, isKill, isUndef, ImplicitOp))
+        return nullptr;
+
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+          .addOperand(Dest)
+          .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+
+      NewMI = addOffset(MIB, -1);
+
+      break;
+    }
+    case X86::DEC16r:
+    case X86::DEC64_16r:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
+      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), -1);
+      break;
+    case X86::ADD64rr:
+    case X86::ADD64rr_DB:
+    case X86::ADD32rr:
+    case X86::ADD32rr_DB: {
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Opc;
+      if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
+        Opc = X86::LEA64r;
+      else
+        Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                          SrcReg, isKill, isUndef, ImplicitOp))
+        return nullptr;
+
+      const MachineOperand &Src2 = MI->getOperand(2);
+      bool isKill2, isUndef2;
+      unsigned SrcReg2;
+      MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+                          SrcReg2, isKill2, isUndef2, ImplicitOp2))
+        return nullptr;
+
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+        .addOperand(Dest);
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+      if (ImplicitOp2.getReg() != 0)
+        MIB.addOperand(ImplicitOp2);
+
+      NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
+
+      // Preserve undefness of the operands.
+      NewMI->getOperand(1).setIsUndef(isUndef);
+      NewMI->getOperand(3).setIsUndef(isUndef2);
+
+      if (LV && Src2.isKill())
+        LV->replaceKillInstruction(SrcReg2, MI, NewMI);
+      break;
+    }
+    case X86::ADD16rr:
+    case X86::ADD16rr_DB: {
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Src2 = MI->getOperand(2).getReg();
+      bool isKill2 = MI->getOperand(2).isKill();
+      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
+
+      // Preserve undefness of the operands.
+      bool isUndef = MI->getOperand(1).isUndef();
+      bool isUndef2 = MI->getOperand(2).isUndef();
+      NewMI->getOperand(1).setIsUndef(isUndef);
+      NewMI->getOperand(3).setIsUndef(isUndef2);
+
+      if (LV && isKill2)
+        LV->replaceKillInstruction(Src2, MI, NewMI);
+      break;
+    }
+    case X86::ADD64ri32:
+    case X86::ADD64ri8:
+    case X86::ADD64ri32_DB:
+    case X86::ADD64ri8_DB:
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
+      break;
+    case X86::ADD32ri:
+    case X86::ADD32ri8:
+    case X86::ADD32ri_DB:
+    case X86::ADD32ri8_DB: {
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                          SrcReg, isKill, isUndef, ImplicitOp))
+        return nullptr;
+
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+          .addOperand(Dest)
+          .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+
+      NewMI = addOffset(MIB, MI->getOperand(2).getImm());
+      break;
+    }
+    case X86::ADD16ri:
+    case X86::ADD16ri8:
+    case X86::ADD16ri_DB:
+    case X86::ADD16ri8_DB:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
+      break;
+    }
+  }
+  }
+
+  if (!NewMI) return nullptr;
+
+  if (LV) {  // Update live variables
+    if (Src.isKill())
+      LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+    if (Dest.isDead())
+      LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
+  }
+
+  MFI->insert(MBBI, NewMI);          // Insert the new inst
+  return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+  switch (MI->getOpcode()) {
+  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+    unsigned Opc;
+    unsigned Size;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+    }
+    unsigned Amt = MI->getOperand(3).getImm();
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->setDesc(get(Opc));
+    MI->getOperand(3).setImm(Size-Amt);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
+  case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
+  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+  case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
+  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+  case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
+  case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
+  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+  case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
+  case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
+  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+  case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
+  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+  case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
+  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
+    case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
+    case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
+    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+    case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
+    case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
+    case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
+    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+    case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
+    case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
+    case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
+    case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
+    case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
+    case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
+    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+    case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
+    case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
+    case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
+    case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
+    case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
+    case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
+    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+    case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
+    case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
+    case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
+    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
+    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
+    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
+    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+    }
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->setDesc(get(Opc));
+    // Fallthrough intended.
+  }
+  default:
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
+}
+
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                                         unsigned &SrcOpIdx2) const {
+  switch (MI->getOpcode()) {
+    case X86::VFMADDPDr231r:
+    case X86::VFMADDPSr231r:
+    case X86::VFMADDSDr231r:
+    case X86::VFMADDSSr231r:
+    case X86::VFMSUBPDr231r:
+    case X86::VFMSUBPSr231r:
+    case X86::VFMSUBSDr231r:
+    case X86::VFMSUBSSr231r:
+    case X86::VFNMADDPDr231r:
+    case X86::VFNMADDPSr231r:
+    case X86::VFNMADDSDr231r:
+    case X86::VFNMADDSSr231r:
+    case X86::VFNMSUBPDr231r:
+    case X86::VFNMSUBPSr231r:
+    case X86::VFNMSUBSDr231r:
+    case X86::VFNMSUBSSr231r:
+    case X86::VFMADDPDr231rY:
+    case X86::VFMADDPSr231rY:
+    case X86::VFMSUBPDr231rY:
+    case X86::VFMSUBPSr231rY:
+    case X86::VFNMADDPDr231rY:
+    case X86::VFNMADDPSr231rY:
+    case X86::VFNMSUBPDr231rY:
+    case X86::VFNMSUBPSr231rY:
+      SrcOpIdx1 = 2;
+      SrcOpIdx2 = 3;
+      return true;
+    default:
+      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+  }
+}
+
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  default: return X86::COND_INVALID;
+  case X86::JE_4:  return X86::COND_E;
+  case X86::JNE_4: return X86::COND_NE;
+  case X86::JL_4:  return X86::COND_L;
+  case X86::JLE_4: return X86::COND_LE;
+  case X86::JG_4:  return X86::COND_G;
+  case X86::JGE_4: return X86::COND_GE;
+  case X86::JB_4:  return X86::COND_B;
+  case X86::JBE_4: return X86::COND_BE;
+  case X86::JA_4:  return X86::COND_A;
+  case X86::JAE_4: return X86::COND_AE;
+  case X86::JS_4:  return X86::COND_S;
+  case X86::JNS_4: return X86::COND_NS;
+  case X86::JP_4:  return X86::COND_P;
+  case X86::JNP_4: return X86::COND_NP;
+  case X86::JO_4:  return X86::COND_O;
+  case X86::JNO_4: return X86::COND_NO;
+  }
+}
+
+/// getCondFromSETOpc - return condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
+  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
+  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
+  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
+  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
+  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
+  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
+  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
+  }
+}
+
+/// getCondFromCmovOpc - return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
+  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
+    return X86::COND_A;
+  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+    return X86::COND_AE;
+  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
+  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
+    return X86::COND_B;
+  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+    return X86::COND_BE;
+  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
+  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
+    return X86::COND_E;
+  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
+  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
+    return X86::COND_G;
+  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+    return X86::COND_GE;
+  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
+  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
+    return X86::COND_L;
+  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+    return X86::COND_LE;
+  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+    return X86::COND_NE;
+  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+    return X86::COND_NO;
+  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+    return X86::COND_NP;
+  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+    return X86::COND_NS;
+  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
+  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
+    return X86::COND_O;
+  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
+  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
+    return X86::COND_P;
+  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
+  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
+    return X86::COND_S;
+  }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::JE_4;
+  case X86::COND_NE: return X86::JNE_4;
+  case X86::COND_L:  return X86::JL_4;
+  case X86::COND_LE: return X86::JLE_4;
+  case X86::COND_G:  return X86::JG_4;
+  case X86::COND_GE: return X86::JGE_4;
+  case X86::COND_B:  return X86::JB_4;
+  case X86::COND_BE: return X86::JBE_4;
+  case X86::COND_A:  return X86::JA_4;
+  case X86::COND_AE: return X86::JAE_4;
+  case X86::COND_S:  return X86::JS_4;
+  case X86::COND_NS: return X86::JNS_4;
+  case X86::COND_P:  return X86::JP_4;
+  case X86::COND_NP: return X86::JNP_4;
+  case X86::COND_O:  return X86::JO_4;
+  case X86::COND_NO: return X86::JNO_4;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::COND_NE;
+  case X86::COND_NE: return X86::COND_E;
+  case X86::COND_L:  return X86::COND_GE;
+  case X86::COND_LE: return X86::COND_G;
+  case X86::COND_G:  return X86::COND_LE;
+  case X86::COND_GE: return X86::COND_L;
+  case X86::COND_B:  return X86::COND_AE;
+  case X86::COND_BE: return X86::COND_A;
+  case X86::COND_A:  return X86::COND_BE;
+  case X86::COND_AE: return X86::COND_B;
+  case X86::COND_S:  return X86::COND_NS;
+  case X86::COND_NS: return X86::COND_S;
+  case X86::COND_P:  return X86::COND_NP;
+  case X86::COND_NP: return X86::COND_P;
+  case X86::COND_O:  return X86::COND_NO;
+  case X86::COND_NO: return X86::COND_O;
+  }
+}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: return X86::COND_INVALID;
+  case X86::COND_E:  return X86::COND_E;
+  case X86::COND_NE: return X86::COND_NE;
+  case X86::COND_L:  return X86::COND_G;
+  case X86::COND_LE: return X86::COND_GE;
+  case X86::COND_G:  return X86::COND_L;
+  case X86::COND_GE: return X86::COND_LE;
+  case X86::COND_B:  return X86::COND_A;
+  case X86::COND_BE: return X86::COND_AE;
+  case X86::COND_A:  return X86::COND_B;
+  case X86::COND_AE: return X86::COND_BE;
+  }
+}
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has memory operand.
+unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
+  static const uint16_t Opc[16][2] = {
+    { X86::SETAr,  X86::SETAm  },
+    { X86::SETAEr, X86::SETAEm },
+    { X86::SETBr,  X86::SETBm  },
+    { X86::SETBEr, X86::SETBEm },
+    { X86::SETEr,  X86::SETEm  },
+    { X86::SETGr,  X86::SETGm  },
+    { X86::SETGEr, X86::SETGEm },
+    { X86::SETLr,  X86::SETLm  },
+    { X86::SETLEr, X86::SETLEm },
+    { X86::SETNEr, X86::SETNEm },
+    { X86::SETNOr, X86::SETNOm },
+    { X86::SETNPr, X86::SETNPm },
+    { X86::SETNSr, X86::SETNSm },
+    { X86::SETOr,  X86::SETOm  },
+    { X86::SETPr,  X86::SETPm  },
+    { X86::SETSr,  X86::SETSm  }
+  };
+
+  assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
+  return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
+                              bool HasMemoryOperand) {
+  static const uint16_t Opc[32][3] = {
+    { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
+    { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+    { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
+    { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+    { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
+    { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
+    { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+    { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
+    { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+    { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+    { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+    { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+    { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+    { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
+    { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
+    { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
+    { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
+    { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+    { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
+    { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+    { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
+    { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
+    { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+    { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
+    { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+    { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+    { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+    { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+    { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+    { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
+    { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
+    { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+  unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+  switch(RegBytes) {
+  default: llvm_unreachable("Illegal register size!");
+  case 2: return Opc[Idx][0];
+  case 4: return Opc[Idx][1];
+  case 8: return Opc[Idx][2];
+  }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+  if (!MI->isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator instruction, we're
+    // done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled by this
+    // analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == X86::JMP_4) {
+      UnCondBrIter = I;
+
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        UnCondBrIter = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+    if (BranchCode == X86::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+        // If we can modify the code and it ends in something like:
+        //
+        //     jCC L1
+        //     jmp L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Then we can change this to:
+        //
+        //     jnCC L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Which is a bit more efficient.
+        // We conditionally jump to the fall-through block.
+        BranchCode = GetOppositeBranchCondition(BranchCode);
+        unsigned JNCC = GetCondBranchFromCond(BranchCode);
+        MachineBasicBlock::iterator OldInst = I;
+
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+          .addMBB(UnCondBrIter->getOperand(0).getMBB());
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
+          .addMBB(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        // Restart the analysis.
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination and their condition
+    // opcodes fit one of the special multi-branch idioms.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to the same
+    // destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    // If the conditions are the same, we can leave them alone.
+    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    // If they differ, see if they fit one of the known patterns. Theoretically,
+    // we could handle more patterns here, but we shouldn't expect to see them
+    // if instruction selection has done a reasonable job.
+    if ((OldBranchCode == X86::COND_NP &&
+         BranchCode == X86::COND_E) ||
+        (OldBranchCode == X86::COND_E &&
+         BranchCode == X86::COND_NP))
+      BranchCode = X86::COND_NP_OR_E;
+    else if ((OldBranchCode == X86::COND_P &&
+              BranchCode == X86::COND_NE) ||
+             (OldBranchCode == X86::COND_NE &&
+              BranchCode == X86::COND_P))
+      BranchCode = X86::COND_NE_OR_P;
+    else
+      return true;
+
+    // Update the MachineOperand.
+    Cond[0].setImm(BranchCode);
+  }
+
+  return false;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != X86::JMP_4 &&
+        getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const SmallVectorImpl<MachineOperand> &Cond,
+                           DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "X86 branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  switch (CC) {
+  case X86::COND_NP_OR_E:
+    // Synthesize NP_OR_E with two branches.
+    BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB);
+    ++Count;
+    break;
+  case X86::COND_NE_OR_P:
+    // Synthesize NE_OR_P with two branches.
+    BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB);
+    ++Count;
+    break;
+  default: {
+    unsigned Opc = GetCondBranchFromCond(CC);
+    BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+    ++Count;
+  }
+  }
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+                const SmallVectorImpl<MachineOperand> &Cond,
+                unsigned TrueReg, unsigned FalseReg,
+                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+  // Not all subtargets have cmov instructions.
+  if (!Subtarget.hasCMov())
+    return false;
+  if (Cond.size() != 1)
+    return false;
+  // We cannot do the composite conditions, at least not in SSA form.
+  if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
+  if (X86::GR16RegClass.hasSubClassEq(RC) ||
+      X86::GR32RegClass.hasSubClassEq(RC) ||
+      X86::GR64RegClass.hasSubClassEq(RC)) {
+    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+    // Bridge. Probably Ivy Bridge as well.
+    CondCycles = 2;
+    TrueCycles = 2;
+    FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I, DebugLoc DL,
+                                unsigned DstReg,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                unsigned TrueReg, unsigned FalseReg) const {
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   assert(Cond.size() == 1 && "Invalid Cond array");
+   unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+                                  MRI.getRegClass(DstReg)->getSize(),
+                                  false/*HasMemoryOperand*/);
+   BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+  return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// Try and copy between VR128/VR64 and GR64 registers.
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+                                        const X86Subtarget &Subtarget) {
+
+  // SrcReg(VR128) -> DestReg(GR64)
+  // SrcReg(VR64)  -> DestReg(GR64)
+  // SrcReg(GR64)  -> DestReg(VR128)
+  // SrcReg(GR64)  -> DestReg(VR64)
+
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+  if (X86::GR64RegClass.contains(DestReg)) {
+    if (X86::VR128XRegClass.contains(SrcReg))
+      // Copy from a VR128 register to a GR64 register.
+      return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
+                                               X86::MOVPQIto64rr);
+    if (X86::VR64RegClass.contains(SrcReg))
+      // Copy from a VR64 register to a GR64 register.
+      return X86::MOVSDto64rr;
+  } else if (X86::GR64RegClass.contains(SrcReg)) {
+    // Copy from a GR64 register to a VR128 register.
+    if (X86::VR128XRegClass.contains(DestReg))
+      return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
+                                               X86::MOV64toPQIrr);
+    // Copy from a GR64 register to a VR64 register.
+    if (X86::VR64RegClass.contains(DestReg))
+      return X86::MOV64toSDrr;
+  }
+
+  // SrcReg(FR32) -> DestReg(GR32)
+  // SrcReg(GR32) -> DestReg(FR32)
+
+  if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
+    // Copy from a FR32 register to a GR32 register.
+    return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
+
+  if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+    // Copy from a GR32 register to a FR32 register.
+    return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
+  return 0;
+}
+
+inline static bool MaskRegClassContains(unsigned Reg) {
+  return X86::VK8RegClass.contains(Reg) ||
+         X86::VK16RegClass.contains(Reg) ||
+         X86::VK1RegClass.contains(Reg);
+}
+static
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+  if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
+      X86::VR256XRegClass.contains(DestReg, SrcReg) ||
+      X86::VR512RegClass.contains(DestReg, SrcReg)) {
+     DestReg = get512BitSuperRegister(DestReg);
+     SrcReg = get512BitSuperRegister(SrcReg);
+     return X86::VMOVAPSZrr;
+  }
+  if (MaskRegClassContains(DestReg) &&
+      MaskRegClassContains(SrcReg))
+    return X86::KMOVWkk;
+  if (MaskRegClassContains(DestReg) &&
+      (X86::GR32RegClass.contains(SrcReg) ||
+       X86::GR16RegClass.contains(SrcReg) ||
+       X86::GR8RegClass.contains(SrcReg))) {
+    SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
+    return X86::KMOVWkr;
+  }
+  if ((X86::GR32RegClass.contains(DestReg) ||
+       X86::GR16RegClass.contains(DestReg) ||
+       X86::GR8RegClass.contains(DestReg)) &&
+       MaskRegClassContains(SrcReg)) {
+    DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
+    return X86::KMOVWrk;
+  }
+  return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // First deal with the normal symmetric copies.
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+  unsigned Opc = 0;
+  if (X86::GR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV64rr;
+  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV32rr;
+  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV16rr;
+  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move.  Otherwise use a normal move.
+    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+        Subtarget.is64Bit()) {
+      Opc = X86::MOV8rr_NOREX;
+      // Both operands must be encodable without an REX prefix.
+      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+             "8-bit H register can not be copied outside GR8_NOREX");
+    } else
+      Opc = X86::MOV8rr;
+  }
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+  else if (HasAVX512)
+    Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+    Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+    Opc = X86::VMOVAPSYrr;
+  if (!Opc)
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
+
+  if (Opc) {
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  // Moving EFLAGS to / from another register requires a push and a pop.
+  // Notice that we have to adjust the stack if we don't want to clobber the
+  // first frame index. See X86FrameLowering.cpp - colobbersTheStack.
+  if (SrcReg == X86::EFLAGS) {
+    if (X86::GR64RegClass.contains(DestReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHF64));
+      BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+      return;
+    }
+    if (X86::GR32RegClass.contains(DestReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHF32));
+      BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+      return;
+    }
+  }
+  if (DestReg == X86::EFLAGS) {
+    if (X86::GR64RegClass.contains(SrcReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH64r))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::POPF64));
+      return;
+    }
+    if (X86::GR32RegClass.contains(SrcReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH32r))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::POPF32));
+      return;
+    }
+  }
+
+  DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+               << " to " << RI.getName(DestReg) << '\n');
+  llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+static unsigned getLoadStoreRegOpcode(unsigned Reg,
+                                      const TargetRegisterClass *RC,
+                                      bool isStackAligned,
+                                      const X86Subtarget &STI,
+                                      bool load) {
+  if (STI.hasAVX512()) {
+    if (X86::VK8RegClass.hasSubClassEq(RC)  ||
+      X86::VK16RegClass.hasSubClassEq(RC))
+      return load ? X86::KMOVWkm : X86::KMOVWmk;
+    if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+    if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
+    if (X86::VR512RegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+  }
+
+  bool HasAVX = STI.hasAVX();
+  switch (RC->getSize()) {
+  default:
+    llvm_unreachable("Unknown spill size");
+  case 1:
+    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+    if (STI.is64Bit())
+      // Copying to or from a physical H register on x86-64 requires a NOREX
+      // move.  Otherwise use a normal move.
+      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+        return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+    return load ? X86::MOV8rm : X86::MOV8mr;
+  case 2:
+    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+    return load ? X86::MOV16rm : X86::MOV16mr;
+  case 4:
+    if (X86::GR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV32rm : X86::MOV32mr;
+    if (X86::FR32RegClass.hasSubClassEq(RC))
+      return load ?
+        (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+        (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+    if (X86::RFP32RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+    llvm_unreachable("Unknown 4-byte regclass");
+  case 8:
+    if (X86::GR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV64rm : X86::MOV64mr;
+    if (X86::FR64RegClass.hasSubClassEq(RC))
+      return load ?
+        (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+        (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+    if (X86::VR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+    if (X86::RFP64RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+    llvm_unreachable("Unknown 8-byte regclass");
+  case 10:
+    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+    return load ? X86::LD_Fp80m : X86::ST_FpP80m;
+  case 16: {
+    assert((X86::VR128RegClass.hasSubClassEq(RC) ||
+            X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
+    // If stack is realigned we can use aligned stores.
+    if (isStackAligned)
+      return load ?
+        (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
+        (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+    else
+      return load ?
+        (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
+        (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+  }
+  case 32:
+    assert((X86::VR256RegClass.hasSubClassEq(RC) ||
+            X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
+    // If stack is realigned we can use aligned stores.
+    if (isStackAligned)
+      return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+    else
+      return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+  case 64:
+    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+    if (isStackAligned)
+      return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+    else
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+  }
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+                                  const TargetRegisterClass *RC,
+                                  bool isStackAligned,
+                                  const X86Subtarget &STI) {
+  return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
+}
+
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+                                 const TargetRegisterClass *RC,
+                                 bool isStackAligned,
+                                 const X86Subtarget &STI) {
+  return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned SrcReg, bool isKill, int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  const MachineFunction &MF = *MBB.getParent();
+  assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
+         "Stack slot too small for store");
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned =
+      (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+    .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+                                  bool isKill,
+                                  SmallVectorImpl<MachineOperand> &Addr,
+                                  const TargetRegisterClass *RC,
+                                  MachineInstr::mmo_iterator MMOBegin,
+                                  MachineInstr::mmo_iterator MMOEnd,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  MIB.addReg(SrcReg, getKillRegState(isKill));
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  const MachineFunction &MF = *MBB.getParent();
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned =
+      (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 MachineInstr::mmo_iterator MMOBegin,
+                                 MachineInstr::mmo_iterator MMOEnd,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  // A SUB can be used to perform comparison.
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(2).getImm();
+    return true;
+  case X86::CMP64rr:
+  case X86::CMP32rr:
+  case X86::CMP16rr:
+  case X86::CMP8rr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+    SrcReg = MI->getOperand(0).getReg();
+    if (MI->getOperand(1).getReg() != SrcReg) return false;
+    // Compare against zero.
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+  return false;
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcRegs: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr *OI) {
+  if (((FlagI->getOpcode() == X86::CMP64rr &&
+        OI->getOpcode() == X86::SUB64rr) ||
+       (FlagI->getOpcode() == X86::CMP32rr &&
+        OI->getOpcode() == X86::SUB32rr)||
+       (FlagI->getOpcode() == X86::CMP16rr &&
+        OI->getOpcode() == X86::SUB16rr)||
+       (FlagI->getOpcode() == X86::CMP8rr &&
+        OI->getOpcode() == X86::SUB8rr)) &&
+      ((OI->getOperand(1).getReg() == SrcReg &&
+        OI->getOperand(2).getReg() == SrcReg2) ||
+       (OI->getOperand(1).getReg() == SrcReg2 &&
+        OI->getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if (((FlagI->getOpcode() == X86::CMP64ri32 &&
+        OI->getOpcode() == X86::SUB64ri32) ||
+       (FlagI->getOpcode() == X86::CMP64ri8 &&
+        OI->getOpcode() == X86::SUB64ri8) ||
+       (FlagI->getOpcode() == X86::CMP32ri &&
+        OI->getOpcode() == X86::SUB32ri) ||
+       (FlagI->getOpcode() == X86::CMP32ri8 &&
+        OI->getOpcode() == X86::SUB32ri8) ||
+       (FlagI->getOpcode() == X86::CMP16ri &&
+        OI->getOpcode() == X86::SUB16ri) ||
+       (FlagI->getOpcode() == X86::CMP16ri8 &&
+        OI->getOpcode() == X86::SUB16ri8) ||
+       (FlagI->getOpcode() == X86::CMP8ri &&
+        OI->getOpcode() == X86::SUB8ri)) &&
+      OI->getOperand(1).getReg() == SrcReg &&
+      OI->getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+/// isDefConvertible - check whether the definition can be converted
+/// to remove a comparison against zero.
+inline static bool isDefConvertible(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return false;
+
+  // The shift instructions only modify ZF if their shift count is non-zero.
+  // N.B.: The processor truncates the shift count depending on the encoding.
+  case X86::SAR8ri:    case X86::SAR16ri:  case X86::SAR32ri:case X86::SAR64ri:
+  case X86::SHR8ri:    case X86::SHR16ri:  case X86::SHR32ri:case X86::SHR64ri:
+     return getTruncatedShiftCount(MI, 2) != 0;
+
+  // Some left shift instructions can be turned into LEA instructions but only
+  // if their flags aren't used. Avoid transforming such instructions.
+  case X86::SHL8ri:    case X86::SHL16ri:  case X86::SHL32ri:case X86::SHL64ri:{
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+    return ShAmt != 0;
+  }
+
+  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+     return getTruncatedShiftCount(MI, 3) != 0;
+
+  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+  case X86::SUB32ri8:  case X86::SUB16ri:  case X86::SUB16ri8:
+  case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
+  case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
+  case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
+  case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
+  case X86::DEC64_32r: case X86::DEC64_16r:
+  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+  case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
+  case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
+  case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
+  case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
+  case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
+  case X86::INC64_32r: case X86::INC64_16r:
+  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+  case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
+  case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
+  case X86::AND16rr:   case X86::AND8rr:   case X86::AND64rm:
+  case X86::AND32rm:   case X86::AND16rm:  case X86::AND8rm:
+  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+  case X86::XOR32ri8:  case X86::XOR16ri:  case X86::XOR16ri8:
+  case X86::XOR8ri:    case X86::XOR64rr:  case X86::XOR32rr:
+  case X86::XOR16rr:   case X86::XOR8rr:   case X86::XOR64rm:
+  case X86::XOR32rm:   case X86::XOR16rm:  case X86::XOR8rm:
+  case X86::OR64ri32:  case X86::OR64ri8:  case X86::OR32ri:
+  case X86::OR32ri8:   case X86::OR16ri:   case X86::OR16ri8:
+  case X86::OR8ri:     case X86::OR64rr:   case X86::OR32rr:
+  case X86::OR16rr:    case X86::OR8rr:    case X86::OR64rm:
+  case X86::OR32rm:    case X86::OR16rm:   case X86::OR8rm:
+  case X86::NEG8r:     case X86::NEG16r:   case X86::NEG32r: case X86::NEG64r:
+  case X86::SAR8r1:    case X86::SAR16r1:  case X86::SAR32r1:case X86::SAR64r1:
+  case X86::SHR8r1:    case X86::SHR16r1:  case X86::SHR32r1:case X86::SHR64r1:
+  case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
+  case X86::ADC32ri:   case X86::ADC32ri8:
+  case X86::ADC32rr:   case X86::ADC64ri32:
+  case X86::ADC64ri8:  case X86::ADC64rr:
+  case X86::SBB32ri:   case X86::SBB32ri8:
+  case X86::SBB32rr:   case X86::SBB64ri32:
+  case X86::SBB64ri8:  case X86::SBB64rr:
+  case X86::ANDN32rr:  case X86::ANDN32rm:
+  case X86::ANDN64rr:  case X86::ANDN64rm:
+  case X86::BEXTR32rr: case X86::BEXTR64rr:
+  case X86::BEXTR32rm: case X86::BEXTR64rm:
+  case X86::BLSI32rr:  case X86::BLSI32rm:
+  case X86::BLSI64rr:  case X86::BLSI64rm:
+  case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+  case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+  case X86::BLSR32rr:  case X86::BLSR32rm:
+  case X86::BLSR64rr:  case X86::BLSR64rm:
+  case X86::BZHI32rr:  case X86::BZHI32rm:
+  case X86::BZHI64rr:  case X86::BZHI64rm:
+  case X86::LZCNT16rr: case X86::LZCNT16rm:
+  case X86::LZCNT32rr: case X86::LZCNT32rm:
+  case X86::LZCNT64rr: case X86::LZCNT64rm:
+  case X86::POPCNT16rr:case X86::POPCNT16rm:
+  case X86::POPCNT32rr:case X86::POPCNT32rm:
+  case X86::POPCNT64rr:case X86::POPCNT64rm:
+  case X86::TZCNT16rr: case X86::TZCNT16rm:
+  case X86::TZCNT32rr: case X86::TZCNT32rm:
+  case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return true;
+  }
+}
+
+/// isUseDefConvertible - check whether the use can be converted
+/// to remove a comparison against zero.
+static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return X86::COND_INVALID;
+  case X86::LZCNT16rr: case X86::LZCNT16rm:
+  case X86::LZCNT32rr: case X86::LZCNT32rm:
+  case X86::LZCNT64rr: case X86::LZCNT64rm:
+    return X86::COND_B;
+  case X86::POPCNT16rr:case X86::POPCNT16rm:
+  case X86::POPCNT32rr:case X86::POPCNT32rm:
+  case X86::POPCNT64rr:case X86::POPCNT64rm:
+    return X86::COND_E;
+  case X86::TZCNT16rr: case X86::TZCNT16rm:
+  case X86::TZCNT32rr: case X86::TZCNT32rm:
+  case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return X86::COND_B;
+  }
+}
+
+/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+                     int CmpMask, int CmpValue,
+                     const MachineRegisterInfo *MRI) const {
+  // Check whether we can replace SUB with CMP.
+  unsigned NewOpcode = 0;
+  switch (CmpInstr->getOpcode()) {
+  default: break;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr: {
+    if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+      return false;
+    // There is no use of the destination register, we can replace SUB with CMP.
+    switch (CmpInstr->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
+    case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
+    case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
+    case X86::SUB8rm:    NewOpcode = X86::CMP8rm;    break;
+    case X86::SUB64rr:   NewOpcode = X86::CMP64rr;   break;
+    case X86::SUB32rr:   NewOpcode = X86::CMP32rr;   break;
+    case X86::SUB16rr:   NewOpcode = X86::CMP16rr;   break;
+    case X86::SUB8rr:    NewOpcode = X86::CMP8rr;    break;
+    case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+    case X86::SUB64ri8:  NewOpcode = X86::CMP64ri8;  break;
+    case X86::SUB32ri:   NewOpcode = X86::CMP32ri;   break;
+    case X86::SUB32ri8:  NewOpcode = X86::CMP32ri8;  break;
+    case X86::SUB16ri:   NewOpcode = X86::CMP16ri;   break;
+    case X86::SUB16ri8:  NewOpcode = X86::CMP16ri8;  break;
+    case X86::SUB8ri:    NewOpcode = X86::CMP8ri;    break;
+    }
+    CmpInstr->setDesc(get(NewOpcode));
+    CmpInstr->RemoveOperand(0);
+    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+      return false;
+  }
+  }
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // CmpInstr is the first instruction of the BB.
+  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+  // If we are comparing against zero, check whether we can use MI to update
+  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+  bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+  if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
+    return false;
+
+  // If we have a use of the source register between the def and our compare
+  // instruction we can eliminate the compare iff the use sets EFLAGS in the
+  // right way.
+  bool ShouldUpdateCC = false;
+  X86::CondCode NewCC = X86::COND_INVALID;
+  if (IsCmpZero && !isDefConvertible(MI)) {
+    // Scan forward from the use until we hit the use we're looking for or the
+    // compare instruction.
+    for (MachineBasicBlock::iterator J = MI;; ++J) {
+      // Do we have a convertible instruction?
+      NewCC = isUseDefConvertible(J);
+      if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+          J->getOperand(1).getReg() == SrcReg) {
+        assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+        ShouldUpdateCC = true; // Update CC later on.
+        // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+        // with the new def.
+        MI = Def = J;
+        break;
+      }
+
+      if (J == I)
+        return false;
+    }
+  }
+
+  // We are searching for an earlier instruction that can make CmpInstr
+  // redundant and that instruction will be saved in Sub.
+  MachineInstr *Sub = nullptr;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  // We iterate backward, starting from the instruction before CmpInstr and
+  // stop when reaching the definition of a source register or done with the BB.
+  // RI points to the instruction before CmpInstr.
+  // If the definition is in this basic block, RE points to the definition;
+  // otherwise, RE is the rend of the basic block.
+  MachineBasicBlock::reverse_iterator
+      RI = MachineBasicBlock::reverse_iterator(I),
+      RE = CmpInstr->getParent() == MI->getParent() ?
+           MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+           CmpInstr->getParent()->rend();
+  MachineInstr *Movr0Inst = nullptr;
+  for (; RI != RE; ++RI) {
+    MachineInstr *Instr = &*RI;
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (!IsCmpZero &&
+        isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+      Sub = Instr;
+      break;
+    }
+
+    if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
+        Instr->readsRegister(X86::EFLAGS, TRI)) {
+      // This instruction modifies or uses EFLAGS.
+
+      // MOV32r0 etc. are implemented with xor which clobbers condition code.
+      // They are safe to move up, if the definition to EFLAGS is dead and
+      // earlier instructions do not read or write EFLAGS.
+      if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
+          Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
+        Movr0Inst = Instr;
+        continue;
+      }
+
+      // We can't remove CmpInstr.
+      return false;
+    }
+  }
+
+  // Return false if no candidates exist.
+  if (!IsCmpZero && !Sub)
+    return false;
+
+  bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+                    Sub->getOperand(2).getReg() == SrcReg);
+
+  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+  // If we are done with the basic block, we need to check whether EFLAGS is
+  // live-out.
+  bool IsSafe = false;
+  SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+  MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+  for (++I; I != E; ++I) {
+    const MachineInstr &Instr = *I;
+    bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+    bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+    // We should check the usage if this instruction uses and updates EFLAGS.
+    if (!UseEFLAGS && ModifyEFLAGS) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again.
+      IsSafe = true;
+      break;
+    }
+    if (!UseEFLAGS && !ModifyEFLAGS)
+      continue;
+
+    // EFLAGS is used by this instruction.
+    X86::CondCode OldCC = X86::COND_INVALID;
+    bool OpcIsSET = false;
+    if (IsCmpZero || IsSwapped) {
+      // We decode the condition code from opcode.
+      if (Instr.isBranch())
+        OldCC = getCondFromBranchOpc(Instr.getOpcode());
+      else {
+        OldCC = getCondFromSETOpc(Instr.getOpcode());
+        if (OldCC != X86::COND_INVALID)
+          OpcIsSET = true;
+        else
+          OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+      }
+      if (OldCC == X86::COND_INVALID) return false;
+    }
+    if (IsCmpZero) {
+      switch (OldCC) {
+      default: break;
+      case X86::COND_A: case X86::COND_AE:
+      case X86::COND_B: case X86::COND_BE:
+      case X86::COND_G: case X86::COND_GE:
+      case X86::COND_L: case X86::COND_LE:
+      case X86::COND_O: case X86::COND_NO:
+        // CF and OF are used, we can't perform this optimization.
+        return false;
+      }
+
+      // If we're updating the condition code check if we have to reverse the
+      // condition.
+      if (ShouldUpdateCC)
+        switch (OldCC) {
+        default:
+          return false;
+        case X86::COND_E:
+          break;
+        case X86::COND_NE:
+          NewCC = GetOppositeBranchCondition(NewCC);
+          break;
+        }
+    } else if (IsSwapped) {
+      // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+      // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+      // We swap the condition code and synthesize the new opcode.
+      NewCC = getSwappedCondition(OldCC);
+      if (NewCC == X86::COND_INVALID) return false;
+    }
+
+    if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
+      // Synthesize the new opcode.
+      bool HasMemoryOperand = Instr.hasOneMemOperand();
+      unsigned NewOpc;
+      if (Instr.isBranch())
+        NewOpc = GetCondBranchFromCond(NewCC);
+      else if(OpcIsSET)
+        NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+      else {
+        unsigned DstReg = Instr.getOperand(0).getReg();
+        NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+                                 HasMemoryOperand);
+      }
+
+      // Push the MachineInstr to OpsToUpdate.
+      // If it is safe to remove CmpInstr, the condition code of these
+      // instructions will be modified.
+      OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+    }
+    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+      IsSafe = true;
+      break;
+    }
+  }
+
+  // If EFLAGS is not killed nor re-defined, we should check whether it is
+  // live-out. If it is live-out, do not optimize.
+  if ((IsCmpZero || IsSwapped) && !IsSafe) {
+    MachineBasicBlock *MBB = CmpInstr->getParent();
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+             SE = MBB->succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+  }
+
+  // The instruction to be updated is either Sub or MI.
+  Sub = IsCmpZero ? MI : Sub;
+  // Move Movr0Inst to the appropriate place before Sub.
+  if (Movr0Inst) {
+    // Look backwards until we find a def that doesn't use the current EFLAGS.
+    Def = Sub;
+    MachineBasicBlock::reverse_iterator
+      InsertI = MachineBasicBlock::reverse_iterator(++Def),
+                InsertE = Sub->getParent()->rend();
+    for (; InsertI != InsertE; ++InsertI) {
+      MachineInstr *Instr = &*InsertI;
+      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+        Sub->getParent()->remove(Movr0Inst);
+        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+                                   Movr0Inst);
+        break;
+      }
+    }
+    if (InsertI == InsertE)
+      return false;
+  }
+
+  // Make sure Sub instruction defines EFLAGS and mark the def live.
+  unsigned i = 0, e = Sub->getNumOperands();
+  for (; i != e; ++i) {
+    MachineOperand &MO = Sub->getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+      MO.setIsDead(false);
+      break;
+    }
+  }
+  assert(i != e && "Unable to locate a def EFLAGS operand");
+
+  CmpInstr->eraseFromParent();
+
+  // Modify the condition code of instructions in OpsToUpdate.
+  for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
+    OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+  return true;
+}
+
+/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// operand at the use. We fold the load instructions if load defines a virtual
+/// register, the virtual register is used once in the same BB, and the
+/// instructions in-between do not load or store, and have no side effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+                  unsigned &FoldAsLoadDefReg,
+                  MachineInstr *&DefMI) const {
+  if (FoldAsLoadDefReg == 0)
+    return nullptr;
+  // To be conservative, if there exists another load, clear the load candidate.
+  if (MI->mayLoad()) {
+    FoldAsLoadDefReg = 0;
+    return nullptr;
+  }
+
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(this, nullptr, SawStore))
+    return nullptr;
+
+  // We try to commute MI if possible.
+  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+    // Collect information about virtual register operands of MI.
+    unsigned SrcOperandId = 0;
+    bool FoundSrcOperand = false;
+    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != FoldAsLoadDefReg)
+        continue;
+      // Do not fold if we have a subreg use or a def or multiple uses.
+      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+        return nullptr;
+
+      SrcOperandId = i;
+      FoundSrcOperand = true;
+    }
+    if (!FoundSrcOperand) return nullptr;
+
+    // Check whether we can fold the def into SrcOperandId.
+    SmallVector<unsigned, 8> Ops;
+    Ops.push_back(SrcOperandId);
+    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+    if (FoldMI) {
+      FoldAsLoadDefReg = 0;
+      return FoldMI;
+    }
+
+    if (Idx == 1) {
+      // MI was changed but it didn't help, commute it back!
+      commuteInstruction(MI, false);
+      return nullptr;
+    }
+
+    // Check whether we can commute MI and enable folding.
+    if (MI->isCommutable()) {
+      MachineInstr *NewMI = commuteInstruction(MI, false);
+      // Unable to commute.
+      if (!NewMI) return nullptr;
+      if (NewMI != MI) {
+        // New instruction. It doesn't need to be kept.
+        NewMI->eraseFromParent();
+        return nullptr;
+      }
+    }
+  }
+  return nullptr;
+}
+
+/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.  This is
+/// used for mapping:
+///   %xmm4 = V_SET0
+/// to:
+///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+                             const MCInstrDesc &Desc) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  unsigned Reg = MIB->getOperand(0).getReg();
+  MIB->setDesc(Desc);
+
+  // MachineInstr::addOperand() will insert explicit operands before any
+  // implicit operands.
+  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  // But we don't trust that.
+  assert(MIB->getOperand(1).getReg() == Reg &&
+         MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+  return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  bool HasAVX = Subtarget.hasAVX();
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+  switch (MI->getOpcode()) {
+  case X86::MOV32r0:
+    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::SETB_C8r:
+    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
+  case X86::SETB_C16r:
+    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
+  case X86::SETB_C32r:
+    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+  case X86::SETB_C64r:
+    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+  case X86::V_SET0:
+  case X86::FsFLD0SS:
+  case X86::FsFLD0SD:
+    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+  case X86::AVX_SET0:
+    assert(HasAVX && "AVX not supported");
+    return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+  case X86::AVX512_512_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+  case X86::V_SETALLONES:
+    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+  case X86::AVX2_SETALLONES:
+    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+  case X86::TEST8ri_NOREX:
+    MI->setDesc(get(X86::TEST8ri));
+    return true;
+  case X86::KSET0B: 
+  case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
+  case X86::KSET1B:
+  case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
+  }
+  return false;
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+                                     const SmallVectorImpl<MachineOperand> &MOs,
+                                     MachineInstr *MI,
+                                     const TargetInstrInfo &TII) {
+  // Create the base instruction with the memory operand as the first part.
+  // Omit the implicit operands, something BuildMI can't do.
+  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+                                              MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4)  // FrameIndex only
+    addOffset(MIB, 0);
+
+  // Loop over the rest of the ri operands, converting them over.
+  unsigned NumOps = MI->getDesc().getNumOperands()-2;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MachineOperand &MO = MI->getOperand(i+2);
+    MIB.addOperand(MO);
+  }
+  for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    MIB.addOperand(MO);
+  }
+  return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF,
+                              unsigned Opcode, unsigned OpNo,
+                              const SmallVectorImpl<MachineOperand> &MOs,
+                              MachineInstr *MI, const TargetInstrInfo &TII) {
+  // Omit the implicit operands, something BuildMI can't do.
+  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+                                              MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (i == OpNo) {
+      assert(MO.isReg() && "Expected to fold into reg operand!");
+      unsigned NumAddrOps = MOs.size();
+      for (unsigned i = 0; i != NumAddrOps; ++i)
+        MIB.addOperand(MOs[i]);
+      if (NumAddrOps < 4)  // FrameIndex only
+        addOffset(MIB, 0);
+    } else {
+      MIB.addOperand(MO);
+    }
+  }
+  return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+                                const SmallVectorImpl<MachineOperand> &MOs,
+                                MachineInstr *MI) {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4)  // FrameIndex only
+    addOffset(MIB, 0);
+  return MIB.addImm(0);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                    MachineInstr *MI, unsigned i,
+                                    const SmallVectorImpl<MachineOperand> &MOs,
+                                    unsigned Size, unsigned Align) const {
+  const DenseMap<unsigned,
+                 std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
+  bool isCallRegIndirect = Subtarget.callRegIndirect();
+  bool isTwoAddrFold = false;
+
+  // Atom favors register form of call. So, we do not fold loads into calls
+  // when X86Subtarget is Atom.
+  if (isCallRegIndirect &&
+    (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
+    return nullptr;
+  }
+
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // FIXME: AsmPrinter doesn't know how to handle
+  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+  if (MI->getOpcode() == X86::ADD32ri &&
+      MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+    return nullptr;
+
+  MachineInstr *NewMI = nullptr;
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isReg() &&
+      MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+    isTwoAddrFold = true;
+  } else if (i == 0) { // If operand 0
+    if (MI->getOpcode() == X86::MOV32r0) {
+      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+      if (NewMI)
+        return NewMI;
+    }
+
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (i == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (i == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (i == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
+  }
+
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+      OpcodeTablePtr->find(MI->getOpcode());
+    if (I != OpcodeTablePtr->end()) {
+      unsigned Opcode = I->second.first;
+      unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+      if (Align < MinAlign)
+        return nullptr;
+      bool NarrowToMOV32rm = false;
+      if (Size) {
+        unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
+        if (Size < RCSize) {
+          // Check if it's safe to fold the load. If the size of the object is
+          // narrower than the load width, then it's not.
+          if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+            return nullptr;
+          // If this is a 64-bit load, but the spill slot is 32, then we can do
+          // a 32-bit load which is implicitly zero-extended. This likely is due
+          // to liveintervalanalysis remat'ing a load from stack slot.
+          if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+            return nullptr;
+          Opcode = X86::MOV32rm;
+          NarrowToMOV32rm = true;
+        }
+      }
+
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+      else
+        NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+      if (NarrowToMOV32rm) {
+        // If this is the special case where we use a MOV32rm to load a 32-bit
+        // value and zero-extend the top bits. Change the destination register
+        // to a 32-bit one.
+        unsigned DstReg = NewMI->getOperand(0).getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+                                                   X86::sub_32bit));
+        else
+          NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+      }
+      return NewMI;
+    }
+  }
+
+  // No fusion
+  if (PrintFailedFusing && !MI->isCopy())
+    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+  return nullptr;
+}
+
+/// hasPartialRegUpdate - Return true for all instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+///   movss (%rdi), %xmm0
+///   cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+///   cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::CVTSI2SSrr:
+  case X86::CVTSI2SS64rr:
+  case X86::CVTSI2SDrr:
+  case X86::CVTSI2SD64rr:
+  case X86::CVTSD2SSrr:
+  case X86::Int_CVTSD2SSrr:
+  case X86::CVTSS2SDrr:
+  case X86::Int_CVTSS2SDrr:
+  case X86::RCPSSr:
+  case X86::RCPSSr_Int:
+  case X86::ROUNDSDr:
+  case X86::ROUNDSDr_Int:
+  case X86::ROUNDSSr:
+  case X86::ROUNDSSr_Int:
+  case X86::RSQRTSSr:
+  case X86::RSQRTSSr_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSr_Int:
+    return true;
+  }
+
+  return false;
+}
+
+/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                             const TargetRegisterInfo *TRI) const {
+  if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // If MI is marked as reading Reg, the partial register update is wanted.
+  const MachineOperand &MO = MI->getOperand(0);
+  unsigned Reg = MO.getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+      return 0;
+  } else {
+    if (MI->readsRegister(Reg, TRI))
+      return 0;
+  }
+
+  // If any of the preceding 16 instructions are reading Reg, insert a
+  // dependency breaking instruction.  The magic number is based on a few
+  // Nehalem experiments.
+  return 16;
+}
+
+// Return true for any instruction the copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::VCVTSI2SSrr:
+  case X86::Int_VCVTSI2SSrr:
+  case X86::VCVTSI2SS64rr:
+  case X86::Int_VCVTSI2SS64rr:
+  case X86::VCVTSI2SDrr:
+  case X86::Int_VCVTSI2SDrr:
+  case X86::VCVTSI2SD64rr:
+  case X86::Int_VCVTSI2SD64rr:
+  case X86::VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrr:
+  case X86::VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrr:
+  case X86::VRCPSSr:
+  case X86::VROUNDSDr:
+  case X86::VROUNDSDr_Int:
+  case X86::VROUNDSSr:
+  case X86::VROUNDSSr_Int:
+  case X86::VRSQRTSSr:
+  case X86::VSQRTSSr:
+
+  // AVX-512
+  case X86::VCVTSD2SSZrr:
+  case X86::VCVTSS2SDZrr:
+    return true;
+  }
+
+  return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should to be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                     const TargetRegisterInfo *TRI) const {
+  if (!hasUndefRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // Set the OpNum parameter to the first source operand.
+  OpNum = 1;
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+    // Use the same magic number as getPartialRegUpdateClearance.
+    return 16;
+  }
+  return 0;
+}
+
+void X86InstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                          const TargetRegisterInfo *TRI) const {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  // If MI kills this register, the false dependence is already broken.
+  if (MI->killsRegister(Reg, TRI))
+    return;
+  if (X86::VR128RegClass.contains(Reg)) {
+    // These instructions are all floating point domain, so xorps is the best
+    // choice.
+    bool HasAVX = Subtarget.hasAVX();
+    unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
+      .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  } else if (X86::VR256RegClass.contains(Reg)) {
+    // Use vxorps to clear the full ymm register.
+    // It wants to read and write the xmm sub-register.
+    unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
+      .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
+      .addReg(Reg, RegState::ImplicitDefine);
+  } else
+    return;
+  MI->addRegisterKilled(Reg, TRI, true);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops,
+                                    int FrameIndex) const {
+  // Check switch flag
+  if (NoFusing) return nullptr;
+
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      hasPartialRegUpdate(MI->getOpcode()))
+    return nullptr;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+  unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+  // If the function stack isn't realigned we don't want to fold instructions
+  // that need increased alignment.
+  if (!RI.needsStackRealignment(MF))
+    Alignment = std::min(
+        Alignment, MF.getTarget().getFrameLowering()->getStackAlignment());
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    unsigned RCSize = 0;
+    switch (MI->getOpcode()) {
+    default: return nullptr;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+    }
+    // Check if it's safe to fold the load. If the size of the object is
+    // narrower than the load width, then it's not.
+    if (Size < RCSize)
+      return nullptr;
+    // Change to CMPXXri r, 0 first.
+    MI->setDesc(get(NewOpc));
+    MI->getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return nullptr;
+
+  SmallVector<MachineOperand,4> MOs;
+  MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                                  MachineInstr *LoadMI) const {
+  // If loading from a FrameIndex, fold directly from the FrameIndex.
+  unsigned NumOps = LoadMI->getDesc().getNumOperands();
+  int FrameIndex;
+  if (isLoadFromStackSlot(LoadMI, FrameIndex))
+    return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+
+  // Check switch flag
+  if (NoFusing) return nullptr;
+
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      hasPartialRegUpdate(MI->getOpcode()))
+    return nullptr;
+
+  // Determine the alignment of the load.
+  unsigned Alignment = 0;
+  if (LoadMI->hasOneMemOperand())
+    Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+  else
+    switch (LoadMI->getOpcode()) {
+    case X86::AVX2_SETALLONES:
+    case X86::AVX_SET0:
+      Alignment = 32;
+      break;
+    case X86::V_SET0:
+    case X86::V_SETALLONES:
+      Alignment = 16;
+      break;
+    case X86::FsFLD0SD:
+      Alignment = 8;
+      break;
+    case X86::FsFLD0SS:
+      Alignment = 4;
+      break;
+    default:
+      return nullptr;
+    }
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    switch (MI->getOpcode()) {
+    default: return nullptr;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+    }
+    // Change to CMPXXri r, 0 first.
+    MI->setDesc(get(NewOpc));
+    MI->getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return nullptr;
+
+  // Make sure the subregisters match.
+  // Otherwise we risk changing the size of the load.
+  if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
+    return nullptr;
+
+  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+  switch (LoadMI->getOpcode()) {
+  case X86::V_SET0:
+  case X86::V_SETALLONES:
+  case X86::AVX2_SETALLONES:
+  case X86::AVX_SET0:
+  case X86::FsFLD0SD:
+  case X86::FsFLD0SS: {
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+    // Create a constant-pool entry and operands to load from it.
+
+    // Medium and large mode can't fold loads this way.
+    if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+        MF.getTarget().getCodeModel() != CodeModel::Kernel)
+      return nullptr;
+
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
+      if (Subtarget.is64Bit())
+        PICBase = X86::RIP;
+      else
+        // FIXME: PICBase = getGlobalBaseReg(&MF);
+        // This doesn't work for several reasons.
+        // 1. GlobalBaseReg may have been spilled.
+        // 2. It may not be live at MI.
+        return nullptr;
+    }
+
+    // Create a constant-pool entry.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    Type *Ty;
+    unsigned Opc = LoadMI->getOpcode();
+    if (Opc == X86::FsFLD0SS)
+      Ty = Type::getFloatTy(MF.getFunction()->getContext());
+    else if (Opc == X86::FsFLD0SD)
+      Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+    else
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+                                    Constant::getNullValue(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+    // Create operands to load from the constant pool entry.
+    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+    MOs.push_back(MachineOperand::CreateImm(1));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    break;
+  }
+  default: {
+    if ((LoadMI->getOpcode() == X86::MOVSSrm ||
+         LoadMI->getOpcode() == X86::VMOVSSrm) &&
+        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+          > 4)
+      // These instructions only load 32 bits, we can't fold them if the
+      // destination register is wider than 32 bits (4 bytes).
+      return nullptr;
+    if ((LoadMI->getOpcode() == X86::MOVSDrm ||
+         LoadMI->getOpcode() == X86::VMOVSDrm) &&
+        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+          > 8)
+      // These instructions only load 64 bits, we can't fold them if the
+      // destination register is wider than 64 bits (8 bytes).
+      return nullptr;
+
+    // Folding a normal load. Just copy the load's address operands.
+    for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+      MOs.push_back(LoadMI->getOperand(i));
+    break;
+  }
+  }
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+}
+
+
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                  const SmallVectorImpl<unsigned> &Ops) const {
+  // Check switch flag
+  if (NoFusing) return 0;
+
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    switch (MI->getOpcode()) {
+    default: return false;
+    case X86::TEST8rr:
+    case X86::TEST16rr:
+    case X86::TEST32rr:
+    case X86::TEST64rr:
+      return true;
+    case X86::ADD32ri:
+      // FIXME: AsmPrinter doesn't know how to handle
+      // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+      if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+        return false;
+      break;
+    }
+  }
+
+  if (Ops.size() != 1)
+    return false;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  const DenseMap<unsigned,
+                 std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
+  if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+  } else if (OpNum == 0) { // If operand 0
+    if (Opc == X86::MOV32r0)
+      return true;
+
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (OpNum == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (OpNum == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (OpNum == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
+  }
+
+  if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
+    return true;
+  return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                                SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(MI->getOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return false;
+  UnfoldLoad &= FoldedLoad;
+  if (UnfoldStore && !FoldedStore)
+    return false;
+  UnfoldStore &= FoldedStore;
+
+  const MCInstrDesc &MCID = get(Opc);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  if (!MI->hasOneMemOperand() &&
+      RC == &X86::VR128RegClass &&
+      !Subtarget.isUnalignedMemAccessFast())
+    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
+  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+  SmallVector<MachineOperand,2> BeforeOps;
+  SmallVector<MachineOperand,2> AfterOps;
+  SmallVector<MachineOperand,4> ImpOps;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (i >= Index && i < Index + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (Op.isReg() && Op.isImplicit())
+      ImpOps.push_back(Op);
+    else if (i < Index)
+      BeforeOps.push_back(Op);
+    else if (i > Index)
+      AfterOps.push_back(Op);
+  }
+
+  // Emit the load instruction.
+  if (UnfoldLoad) {
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(MI->memoperands_begin(),
+                            MI->memoperands_end());
+    loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+    if (UnfoldStore) {
+      // Address operands cannot be marked isKill.
+      for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+        MachineOperand &MO = NewMIs[0]->getOperand(i);
+        if (MO.isReg())
+          MO.setIsKill(false);
+      }
+    }
+  }
+
+  // Emit the data processing instruction.
+  MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, DataMI);
+
+  if (FoldedStore)
+    MIB.addReg(Reg, RegState::Define);
+  for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+    MIB.addOperand(BeforeOps[i]);
+  if (FoldedLoad)
+    MIB.addReg(Reg);
+  for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+    MIB.addOperand(AfterOps[i]);
+  for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+    MachineOperand &MO = ImpOps[i];
+    MIB.addReg(MO.getReg(),
+               getDefRegState(MO.isDef()) |
+               RegState::Implicit |
+               getKillRegState(MO.isKill()) |
+               getDeadRegState(MO.isDead()) |
+               getUndefRegState(MO.isUndef()));
+  }
+  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+  switch (DataMI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri: {
+    MachineOperand &MO0 = DataMI->getOperand(0);
+    MachineOperand &MO1 = DataMI->getOperand(1);
+    if (MO1.getImm() == 0) {
+      unsigned NewOpc;
+      switch (DataMI->getOpcode()) {
+      default: llvm_unreachable("Unreachable!");
+      case X86::CMP64ri8:
+      case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+      case X86::CMP32ri8:
+      case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
+      case X86::CMP16ri8:
+      case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
+      case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
+      }
+      DataMI->setDesc(get(NewOpc));
+      MO1.ChangeToRegister(MO0.getReg(), false);
+    }
+  }
+  }
+  NewMIs.push_back(DataMI);
+
+  // Emit the store instruction.
+  if (UnfoldStore) {
+    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(MI->memoperands_begin(),
+                             MI->memoperands_end());
+    storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+  }
+
+  return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                  SmallVectorImpl<SDNode*> &NewNodes) const {
+  if (!N->isMachineOpcode())
+    return false;
+
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(N->getMachineOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  const MCInstrDesc &MCID = get(Opc);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  unsigned NumDefs = MCID.NumDefs;
+  std::vector<SDValue> AddrOps;
+  std::vector<SDValue> BeforeOps;
+  std::vector<SDValue> AfterOps;
+  SDLoc dl(N);
+  unsigned NumOps = N->getNumOperands();
+  for (unsigned i = 0; i != NumOps-1; ++i) {
+    SDValue Op = N->getOperand(i);
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (i < Index-NumDefs)
+      BeforeOps.push_back(Op);
+    else if (i > Index-NumDefs)
+      AfterOps.push_back(Op);
+  }
+  SDValue Chain = N->getOperand(NumOps-1);
+  AddrOps.push_back(Chain);
+
+  // Emit the load instruction.
+  SDNode *Load = nullptr;
+  if (FoldedLoad) {
+    EVT VT = *RC->vt_begin();
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                            cast<MachineSDNode>(N)->memoperands_end());
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !Subtarget.isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned load.
+      return false;
+    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
+                              VT, MVT::Other, AddrOps);
+    NewNodes.push_back(Load);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  // Emit the data processing instruction.
+  std::vector<EVT> VTs;
+  const TargetRegisterClass *DstRC = nullptr;
+  if (MCID.getNumDefs() > 0) {
+    DstRC = getRegClass(MCID, 0, &RI, MF);
+    VTs.push_back(*DstRC->vt_begin());
+  }
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+      VTs.push_back(VT);
+  }
+  if (Load)
+    BeforeOps.push_back(SDValue(Load, 0));
+  std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+  NewNodes.push_back(NewNode);
+
+  // Emit the store instruction.
+  if (FoldedStore) {
+    AddrOps.pop_back();
+    AddrOps.push_back(SDValue(NewNode, 0));
+    AddrOps.push_back(Chain);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                             cast<MachineSDNode>(N)->memoperands_end());
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !Subtarget.isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned store.
+      return false;
+    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    SDNode *Store =
+        DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+                           dl, MVT::Other, AddrOps);
+    NewNodes.push_back(Store);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex) const {
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(Opc);
+  if (I == MemOp2RegOpTable.end())
+    return 0;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return 0;
+  if (UnfoldStore && !FoldedStore)
+    return 0;
+  if (LoadRegIndex)
+    *LoadRegIndex = I->second.second & TB_INDEX_MASK;
+  return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                     int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  switch (Opc1) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    break;
+  }
+  switch (Opc2) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    break;
+  }
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+
+    // Now let's examine the displacements.
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  if (Opc1 != Opc2)
+    return false;  // FIXME: overly conservative?
+
+  switch (Opc1) {
+  default: break;
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return false;
+  }
+
+  EVT VT = Load1->getValueType(0);
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+    // have 16 of them to play with.
+    if (Subtarget.is64Bit()) {
+      if (NumLoads >= 3)
+        return false;
+    } else if (NumLoads) {
+      return false;
+    }
+    break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    if (NumLoads)
+      return false;
+    break;
+  }
+
+  return true;
+}
+
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
+                                          MachineInstr *Second) const {
+  // Check if this processor supports macro-fusion. Since this is a minor
+  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+  // proxy for SandyBridge+.
+  if (!Subtarget.hasAVX())
+    return false;
+
+  enum {
+    FuseTest,
+    FuseCmp,
+    FuseInc
+  } FuseKind;
+
+  switch(Second->getOpcode()) {
+  default:
+    return false;
+  case X86::JE_4:
+  case X86::JNE_4:
+  case X86::JL_4:
+  case X86::JLE_4:
+  case X86::JG_4:
+  case X86::JGE_4:
+    FuseKind = FuseInc;
+    break;
+  case X86::JB_4:
+  case X86::JBE_4:
+  case X86::JA_4:
+  case X86::JAE_4:
+    FuseKind = FuseCmp;
+    break;
+  case X86::JS_4:
+  case X86::JNS_4:
+  case X86::JP_4:
+  case X86::JNP_4:
+  case X86::JO_4:
+  case X86::JNO_4:
+    FuseKind = FuseTest;
+    break;
+  }
+  switch (First->getOpcode()) {
+  default:
+    return false;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+  case X86::TEST8ri:
+  case X86::TEST16ri:
+  case X86::TEST32ri:
+  case X86::TEST32i32:
+  case X86::TEST64i32:
+  case X86::TEST64ri32:
+  case X86::TEST8rm:
+  case X86::TEST16rm:
+  case X86::TEST32rm:
+  case X86::TEST64rm:
+  case X86::TEST8ri_NOREX:
+  case X86::AND16i16:
+  case X86::AND16ri:
+  case X86::AND16ri8:
+  case X86::AND16rm:
+  case X86::AND16rr:
+  case X86::AND32i32:
+  case X86::AND32ri:
+  case X86::AND32ri8:
+  case X86::AND32rm:
+  case X86::AND32rr:
+  case X86::AND64i32:
+  case X86::AND64ri32:
+  case X86::AND64ri8:
+  case X86::AND64rm:
+  case X86::AND64rr:
+  case X86::AND8i8:
+  case X86::AND8ri:
+  case X86::AND8rm:
+  case X86::AND8rr:
+    return true;
+  case X86::CMP16i16:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP16rm:
+  case X86::CMP16rr:
+  case X86::CMP32i32:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP32rm:
+  case X86::CMP32rr:
+  case X86::CMP64i32:
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP64rm:
+  case X86::CMP64rr:
+  case X86::CMP8i8:
+  case X86::CMP8ri:
+  case X86::CMP8rm:
+  case X86::CMP8rr:
+  case X86::ADD16i16:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri8_DB:
+  case X86::ADD16ri_DB:
+  case X86::ADD16rm:
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+  case X86::ADD32i32:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri8_DB:
+  case X86::ADD32ri_DB:
+  case X86::ADD32rm:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB:
+  case X86::ADD64i32:
+  case X86::ADD64ri32:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8:
+  case X86::ADD64ri8_DB:
+  case X86::ADD64rm:
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD8i8:
+  case X86::ADD8mi:
+  case X86::ADD8mr:
+  case X86::ADD8ri:
+  case X86::ADD8rm:
+  case X86::ADD8rr:
+  case X86::SUB16i16:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB16rm:
+  case X86::SUB16rr:
+  case X86::SUB32i32:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB32rm:
+  case X86::SUB32rr:
+  case X86::SUB64i32:
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB64rm:
+  case X86::SUB64rr:
+  case X86::SUB8i8:
+  case X86::SUB8ri:
+  case X86::SUB8rm:
+  case X86::SUB8rr:
+    return FuseKind == FuseCmp || FuseKind == FuseInc;
+  case X86::INC16r:
+  case X86::INC32r:
+  case X86::INC64_16r:
+  case X86::INC64_32r:
+  case X86::INC64r:
+  case X86::INC8r:
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::DEC64_16r:
+  case X86::DEC64_32r:
+  case X86::DEC64r:
+  case X86::DEC8r:
+    return FuseKind == FuseInc;
+  }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+  if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+    return true;
+  Cond[0].setImm(GetOppositeBranchCondition(CC));
+  return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // FIXME: Return false for x87 stack register classes for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  assert(!Subtarget.is64Bit() &&
+         "X86-64 PIC uses RIP relative addressing");
+
+  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+  unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Create the register. The code to initialize it is inserted
+  // later, by the CGBR pass (below).
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  X86FI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+static const uint16_t ReplaceableInstrs[][3] = {
+  //PackedSingle     PackedDouble    PackedInt
+  { X86::MOVAPSmr,   X86::MOVAPDmr,  X86::MOVDQAmr  },
+  { X86::MOVAPSrm,   X86::MOVAPDrm,  X86::MOVDQArm  },
+  { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
+  { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
+  { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
+  { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
+  { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
+  { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
+  { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
+  { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
+  { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
+  { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
+  { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
+  { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
+  // AVX 128-bit support
+  { X86::VMOVAPSmr,  X86::VMOVAPDmr,  X86::VMOVDQAmr  },
+  { X86::VMOVAPSrm,  X86::VMOVAPDrm,  X86::VMOVDQArm  },
+  { X86::VMOVAPSrr,  X86::VMOVAPDrr,  X86::VMOVDQArr  },
+  { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
+  { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
+  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+  { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
+  { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
+  { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDrm    },
+  { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
+  { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
+  { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
+  { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
+  { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
+  // AVX 256-bit support
+  { X86::VMOVAPSYmr,   X86::VMOVAPDYmr,   X86::VMOVDQAYmr  },
+  { X86::VMOVAPSYrm,   X86::VMOVAPDYrm,   X86::VMOVDQAYrm  },
+  { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,   X86::VMOVDQAYrr  },
+  { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,   X86::VMOVDQUYmr  },
+  { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,   X86::VMOVDQUYrm  },
+  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr }
+};
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+  //PackedSingle       PackedDouble       PackedInt
+  { X86::VANDNPSYrm,   X86::VANDNPDYrm,   X86::VPANDNYrm   },
+  { X86::VANDNPSYrr,   X86::VANDNPDYrr,   X86::VPANDNYrr   },
+  { X86::VANDPSYrm,    X86::VANDPDYrm,    X86::VPANDYrm    },
+  { X86::VANDPSYrr,    X86::VANDPDYrr,    X86::VPANDYrr    },
+  { X86::VORPSYrm,     X86::VORPDYrm,     X86::VPORYrm     },
+  { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
+  { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
+  { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
+  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
+  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
+  { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
+  { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr },
+  { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
+  { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+  { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
+  { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
+  { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
+  { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const uint16_t *lookup(unsigned opcode, unsigned domain) {
+  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
+    if (ReplaceableInstrs[i][domain-1] == opcode)
+      return ReplaceableInstrs[i];
+  return nullptr;
+}
+
+static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
+  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
+    if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
+      return ReplaceableInstrsAVX2[i];
+  return nullptr;
+}
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+  uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  bool hasAVX2 = Subtarget.hasAVX2();
+  uint16_t validDomains = 0;
+  if (domain && lookup(MI->getOpcode(), domain))
+    validDomains = 0xe;
+  else if (domain && lookupAVX2(MI->getOpcode(), domain))
+    validDomains = hasAVX2 ? 0xe : 0x6;
+  return std::make_pair(domain, validDomains);
+}
+
+void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  assert(dom && "Not an SSE instruction");
+  const uint16_t *table = lookup(MI->getOpcode(), dom);
+  if (!table) { // try the other table
+    assert((Subtarget.hasAVX2() || Domain < 3) &&
+           "256-bit vector operations only available in AVX2");
+    table = lookupAVX2(MI->getOpcode(), dom);
+  }
+  assert(table && "Cannot change domain");
+  MI->setDesc(get(table[Domain-1]));
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(X86::NOOP);
+}
+
+void X86InstrInfo::getUnconditionalBranch(
+    MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
+  Branch.setOpcode(X86::JMP_4);
+  Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+}
+
+void X86InstrInfo::getTrap(MCInst &MI) const {
+  MI.setOpcode(X86::TRAP);
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+  switch (opc) {
+  default: return false;
+  case X86::DIVSDrm:
+  case X86::DIVSDrm_Int:
+  case X86::DIVSDrr:
+  case X86::DIVSDrr_Int:
+  case X86::DIVSSrm:
+  case X86::DIVSSrm_Int:
+  case X86::DIVSSrr:
+  case X86::DIVSSrr_Int:
+  case X86::SQRTPDm:
+  case X86::SQRTPDr:
+  case X86::SQRTPSm:
+  case X86::SQRTPSr:
+  case X86::SQRTSDm:
+  case X86::SQRTSDm_Int:
+  case X86::SQRTSDr:
+  case X86::SQRTSDr_Int:
+  case X86::SQRTSSm:
+  case X86::SQRTSSm_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSr_Int:
+  // AVX instructions with high latency
+  case X86::VDIVSDrm:
+  case X86::VDIVSDrm_Int:
+  case X86::VDIVSDrr:
+  case X86::VDIVSDrr_Int:
+  case X86::VDIVSSrm:
+  case X86::VDIVSSrm_Int:
+  case X86::VDIVSSrr:
+  case X86::VDIVSSrr_Int:
+  case X86::VSQRTPDm:
+  case X86::VSQRTPDr:
+  case X86::VSQRTPSm:
+  case X86::VSQRTPSr:
+  case X86::VSQRTSDm:
+  case X86::VSQRTSDm_Int:
+  case X86::VSQRTSDr:
+  case X86::VSQRTSSm:
+  case X86::VSQRTSSm_Int:
+  case X86::VSQRTSSr:
+  case X86::VSQRTPDZrm:
+  case X86::VSQRTPDZrr:
+  case X86::VSQRTPSZrm:
+  case X86::VSQRTPSZrr:
+  case X86::VSQRTSDZm:
+  case X86::VSQRTSDZm_Int:
+  case X86::VSQRTSDZr:
+  case X86::VSQRTSSZm_Int:
+  case X86::VSQRTSSZr:
+  case X86::VSQRTSSZm:
+  case X86::VDIVSDZrm:
+  case X86::VDIVSDZrr:
+  case X86::VDIVSSZrm:
+  case X86::VDIVSSZrr:
+
+  case X86::VGATHERQPSZrm:
+  case X86::VGATHERQPDZrm:
+  case X86::VGATHERDPDZrm:
+  case X86::VGATHERDPSZrm:
+  case X86::VPGATHERQDZrm:
+  case X86::VPGATHERQQZrm:
+  case X86::VPGATHERDDZrm:
+  case X86::VPGATHERDQZrm:
+  case X86::VSCATTERQPDZmr:
+  case X86::VSCATTERQPSZmr:
+  case X86::VSCATTERDPDZmr:
+  case X86::VSCATTERDPSZmr:
+  case X86::VPSCATTERQDZmr:
+  case X86::VPSCATTERQQZmr:
+  case X86::VPSCATTERDDZmr:
+  case X86::VPSCATTERDQZmr:
+    return true;
+  }
+}
+
+bool X86InstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  return isHighLatencyDef(DefMI->getOpcode());
+}
+
+namespace {
+  /// CGBR - Create Global Base Reg pass. This initializes the PIC
+  /// global base register for x86-32.
+  struct CGBR : public MachineFunctionPass {
+    static char ID;
+    CGBR() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      const X86TargetMachine *TM =
+        static_cast<const X86TargetMachine *>(&MF.getTarget());
+
+      // Don't do anything if this is 64-bit as 64-bit PIC
+      // uses RIP relative addressing.
+      if (TM->getSubtarget<X86Subtarget>().is64Bit())
+        return false;
+
+      // Only emit a global base reg in PIC mode.
+      if (TM->getRelocationModel() != Reloc::PIC_)
+        return false;
+
+      X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+      unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+      // If we didn't need a GlobalBaseReg, don't insert code.
+      if (GlobalBaseReg == 0)
+        return false;
+
+      // Insert the set of GlobalBaseReg into the first MBB of the function
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      unsigned PC;
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+      else
+        PC = GlobalBaseReg;
+
+      // Operand of MovePCtoStack is completely ignored by asm printer. It's
+      // only used in JIT code emission as displacement to pc.
+      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+      // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+      // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+          .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                        X86II::MO_GOT_ABSOLUTE_ADDRESS);
+      }
+
+      return true;
+    }
+
+    const char *getPassName() const override {
+      return "X86 PIC Global Base Reg Initialization";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there isn't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+          case X86::TLS_base_addr32:
+          case X86::TLS_base_addr64:
+            if (TLSBaseAddrReg)
+              I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+            else
+              I = SetRegister(I, &TLSBaseAddrReg);
+            Changed = true;
+            break;
+          default:
+            break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   is64Bit ? X86::RAX : X86::EAX)
+                                   .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtal register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+                                                      ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
+
+      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+      MachineInstr *Next = I->getNextNode();
+      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   *TLSBaseAddrReg)
+                                   .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+      return Copy;
+    }
+
+    const char *getPassName() const override {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..c177e3a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,464 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+  class X86RegisterInfo;
+  class X86Subtarget;
+
+namespace X86 {
+  // X86 specific condition code. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in synch.
+  enum CondCode {
+    COND_A  = 0,
+    COND_AE = 1,
+    COND_B  = 2,
+    COND_BE = 3,
+    COND_E  = 4,
+    COND_G  = 5,
+    COND_GE = 6,
+    COND_L  = 7,
+    COND_LE = 8,
+    COND_NE = 9,
+    COND_NO = 10,
+    COND_NP = 11,
+    COND_NS = 12,
+    COND_O  = 13,
+    COND_P  = 14,
+    COND_S  = 15,
+    LAST_VALID_COND = COND_S,
+
+    // Artificial condition codes. These are used by AnalyzeBranch
+    // to indicate a block terminated with two conditional branches to
+    // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+    // which can't be represented on x86 with a single condition. These
+    // are never used in MachineInstrs.
+    COND_NE_OR_P,
+    COND_NP_OR_E,
+
+    COND_INVALID
+  };
+
+  // Turn condition code into conditional branch opcode.
+  unsigned GetCondBranchFromCond(CondCode CC);
+
+  /// \brief Return a set opcode for the given condition and whether it has
+  /// a memory operand.
+  unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+  /// \brief Return a cmov opcode for the given condition, register size in
+  /// bytes, and operand type.
+  unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+                           bool HasMemoryOperand = false);
+
+  // Turn CMov opcode into condition code.
+  CondCode getCondFromCMovOpc(unsigned Opc);
+
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  CondCode GetOppositeBranchCondition(CondCode CC);
+}  // end namespace X86;
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_DLLIMPORT: // dllimport stub.
+  case X86II::MO_GOTPCREL:  // rip-relative GOT reference.
+  case X86II::MO_GOT:       // normal GOT reference.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_NONLAZY:                 // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg).  If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_GOTOFF:                         // isPICStyleGOT: local global.
+  case X86II::MO_GOT:                            // isPICStyleGOT: other global.
+  case X86II::MO_PIC_BASE_OFFSET:                // Darwin local global.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Darwin/32 external global.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+  case X86II::MO_TLVP:                           // ??? Pretty sure..
+    return true;
+  default:
+    return false;
+  }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+  return MO.isImm() &&
+    (MO.getImm() == 1 || MO.getImm() == 2 ||
+     MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+X86::AddrSegmentReg <= MI->getNumOperands() &&
+    MI->getOperand(Op+X86::AddrBaseReg).isReg() &&
+    isScale(MI->getOperand(Op+X86::AddrScaleAmt)) &&
+    MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+    (MI->getOperand(Op+X86::AddrDisp).isImm() ||
+     MI->getOperand(Op+X86::AddrDisp).isGlobal() ||
+     MI->getOperand(Op+X86::AddrDisp).isCPI() ||
+     MI->getOperand(Op+X86::AddrDisp).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+X86::AddrNumOperands <= MI->getNumOperands() &&
+    MI->getOperand(Op+X86::AddrSegmentReg).isReg() &&
+    isLeaMem(MI, Op);
+}
+
+class X86InstrInfo final : public X86GenInstrInfo {
+  X86Subtarget &Subtarget;
+  const X86RegisterInfo RI;
+
+  /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+  /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
+  ///
+  typedef DenseMap<unsigned,
+                   std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+  RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+  RegOp2MemOpTableType RegOp2MemOpTable0;
+  RegOp2MemOpTableType RegOp2MemOpTable1;
+  RegOp2MemOpTableType RegOp2MemOpTable2;
+  RegOp2MemOpTableType RegOp2MemOpTable3;
+
+  /// MemOp2RegOpTable - Load / store unfolding opcode map.
+  ///
+  typedef DenseMap<unsigned,
+                   std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+  MemOp2RegOpTableType MemOp2RegOpTable;
+
+  static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags);
+
+  virtual void anchor();
+
+public:
+  explicit X86InstrInfo(X86Subtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+  /// extension instruction. That is, it's like a copy where it's legal for the
+  /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+  /// true, then it's expected the pre-extension value is available as a subreg
+  /// of the result register. This also returns the sub-register index in
+  /// SubIdx.
+  bool isCoalescableExtInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SubIdx) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+  /// stack locations as well.  This uses a heuristic so it isn't
+  /// reliable for correctness.
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const override;
+
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+  /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+  /// stack locations as well.  This uses a heuristic so it isn't
+  /// reliable for correctness.
+  unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                    int &FrameIndex) const override;
+
+  bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+                                         AliasAnalysis *AA) const override;
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr *Orig,
+                     const TargetRegisterInfo &TRI) const override;
+
+  /// Given an operand within a MachineInstr, insert preceding code to put it
+  /// into the right format for a particular kind of LEA instruction. This may
+  /// involve using an appropriate super-register instead (with an implicit use
+  /// of the original) or creating a new virtual register and inserting COPY
+  /// instructions to get the data into the right class.
+  ///
+  /// Reference parameters are set to indicate how caller should add this
+  /// operand to the LEA instruction.
+  bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+                      unsigned LEAOpcode, bool AllowSP,
+                      unsigned &NewSrc, bool &isKill,
+                      bool &isUndef, MachineOperand &ImplicitOp) const;
+
+  /// convertToThreeAddress - This method must be implemented by targets that
+  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+  /// may be able to convert a two-address instruction into a true
+  /// three-address instruction on demand.  This allows the X86 target (for
+  /// example) to convert ADD and SHL instructions into LEA instructions if they
+  /// would require register copies due to two-addressness.
+  ///
+  /// This method returns a null pointer if the transformation cannot be
+  /// performed, otherwise it returns the new instruction.
+  ///
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      LiveVariables *LV) const override;
+
+  /// commuteInstruction - We have a few instructions that must be hacked on to
+  /// commute them.
+  ///
+  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
+
+  bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
+
+  // Branch analysis.
+  bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+  bool canInsertSelect(const MachineBasicBlock&,
+                       const SmallVectorImpl<MachineOperand> &Cond,
+                       unsigned, unsigned, int&, int&, int&) const override;
+  void insertSelect(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator MI, DebugLoc DL,
+                    unsigned DstReg,
+                    const SmallVectorImpl<MachineOperand> &Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                      SmallVectorImpl<MachineOperand> &Addr,
+                      const TargetRegisterClass *RC,
+                      MachineInstr::mmo_iterator MMOBegin,
+                      MachineInstr::mmo_iterator MMOEnd,
+                      SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                       SmallVectorImpl<MachineOperand> &Addr,
+                       const TargetRegisterClass *RC,
+                       MachineInstr::mmo_iterator MMOBegin,
+                       MachineInstr::mmo_iterator MMOEnd,
+                       SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand(s).  If this is possible, the target should perform the
+  /// folding and return true, otherwise it should return false.  If it folds
+  /// the instruction, it is likely that the MachineInstruction the iterator
+  /// references has been changed.
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const override;
+
+  /// foldMemoryOperand - Same as the previous version except it allows folding
+  /// of any load and store from / to any address, not just from a specific
+  /// stack slot.
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr* LoadMI) const override;
+
+  /// canFoldMemoryOperand - Returns true if the specified load / store is
+  /// folding is possible.
+  bool canFoldMemoryOperand(const MachineInstr*,
+                            const SmallVectorImpl<unsigned> &) const override;
+
+  /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+  /// a store or a load and a store into two or more instruction. If this is
+  /// possible, returns true as well as the new instructions by reference.
+  bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                         unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                         SmallVectorImpl<MachineInstr*> &NewMIs) const override;
+
+  bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                           SmallVectorImpl<SDNode*> &NewNodes) const override;
+
+  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+  /// instruction after load / store are unfolded from an instruction of the
+  /// specified opcode. It returns zero if the specified unfolding is not
+  /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+  /// index of the operand which will hold the register holding the loaded
+  /// value.
+  unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                              bool UnfoldLoad, bool UnfoldStore,
+                              unsigned *LoadRegIndex = nullptr) const override;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+  /// to determine if two loads are loading from the same base address. It
+  /// should only return true if the base pointers are the same and the
+  /// only differences between the two addresses are the offset. It also returns
+  /// the offsets by reference.
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+  /// be scheduled togther. On some targets if two loads are loading from
+  /// addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load offsets
+  /// from the common base address. It returns true if it decides it's desirable
+  /// to schedule the two loads together. "NumLoads" is the number of loads that
+  /// have already been scheduled after Load1.
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+  bool shouldScheduleAdjacent(MachineInstr* First,
+                              MachineInstr *Second) const override;
+
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+  /// instruction that defines the specified register class.
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
+  /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha
+  /// would clobber the EFLAGS condition register. Note the result may be
+  /// conservative. If it cannot definitely determine the safety after visiting
+  /// a few instructions in each direction it assumes it's not safe.
+  bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) const;
+
+  static bool isX86_64ExtendedReg(const MachineOperand &MO) {
+    if (!MO.isReg()) return false;
+    return X86II::isX86_64ExtendedReg(MO.getReg());
+  }
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// the global base register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr *MI) const override;
+
+  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+
+  unsigned
+    getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                                 const TargetRegisterInfo *TRI) const override;
+  unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                                const TargetRegisterInfo *TRI) const override;
+  void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      unsigned OpNum,
+                                      const SmallVectorImpl<MachineOperand> &MOs,
+                                      unsigned Size, unsigned Alignment) const;
+
+  void
+  getUnconditionalBranch(MCInst &Branch,
+                         const MCSymbolRefExpr *BranchTarget) const override;
+
+  void getTrap(MCInst &MI) const override;
+
+  bool isHighLatencyDef(int opc) const override;
+
+  bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI,
+                             unsigned UseIdx) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+
+  /// optimizeCompareInstr - Check if there exists an earlier instruction that
+  /// operates on the same source operands and sets flags in the same way as
+  /// Compare; remove Compare if possible.
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+  /// optimizeLoadInstr - Try to remove the load by folding it to a register
+  /// operand at the use. We fold the load instructions if and only if the
+  /// def and use are in the same BB. We only look at one load and see
+  /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+  /// defined by the load we are trying to fold. DefMI returns the machine
+  /// instruction that defines FoldAsLoadDefReg, and the function returns
+  /// the machine instruction generated due to folding.
+  MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+                                  const MachineRegisterInfo *MRI,
+                                  unsigned &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
+
+private:
+  MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
+                                              MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables *LV) const;
+
+  /// isFrameOperand - Return true and the FrameIndex if the specified
+  /// operand and follow operands form a reference to the stack frame.
+  bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+                      int &FrameIndex) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..0f872a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2867 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def SDTX86Cmov    : SDTypeProfile<1, 4,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+                                           [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond  : SDTypeProfile<0, 3,
+                                  [SDTCisVT<0, OtherVT>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC   : SDTypeProfile<1, 2,
+                                  [SDTCisVT<0, i8>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+                                  [SDTCisInt<0>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+                                     SDTCisVT<2, i8>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret     : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32>]>;
+
+def SDT_X86Call   : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+                                                         SDTCisVT<1, iPTR>,
+                                                         SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+                                            SDTCisPtrTy<1>,
+                                            SDTCisVT<2, i32>,
+                                            SDTCisVT<3, i8>,
+                                            SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void    : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+                            [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+
+
+def X86bsf     : SDNode<"X86ISD::BSF",      SDTUnaryArithWithFlags>;
+def X86bsr     : SDNode<"X86ISD::BSR",      SDTUnaryArithWithFlags>;
+def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
+def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+
+def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest>;
+def X86bt      : SDNode<"X86ISD::BT",       SDTX86CmpTest>;
+
+def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov>;
+def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
+                        [SDNPHasChain]>;
+def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86sahf    : SDNode<"X86ISD::SAHF",     SDTX86sahf>;
+
+def X86rdrand  : SDNode<"X86ISD::RDRAND",   SDTX86rdrand,
+                        [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed  : SDNode<"X86ISD::RDSEED",   SDTX86rdrand,
+                        [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86vastart_save_xmm_regs :
+                 SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+                        SDT_X86VASTART_SAVE_XMM_REGS,
+                        [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+                 SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                         SDNPMemOperand]>;
+def X86callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+                        [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                         SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad]>;
+
+def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdtscp  : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdpmc   : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; 
+
+def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+                        [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp  : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag  : SDNode<"X86ISD::ADD",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86sub_flag  : SDNode<"X86ISD::SUB",  SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86adc_flag  : SDNode<"X86ISD::ADC",  SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag  : SDNode<"X86ISD::SBB",  SDTBinaryArithWithFlagsInOut>;
+
+def X86inc_flag  : SDNode<"X86ISD::INC",  SDTUnaryArithWithFlags>;
+def X86dec_flag  : SDNode<"X86ISD::DEC",  SDTUnaryArithWithFlags>;
+def X86or_flag   : SDNode<"X86ISD::OR",   SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+
+def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
+                          [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+                          [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL,
+                        [SDNPHasChain, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+}
+def X86Mem8AsmOperand : AsmOperandClass {
+  let Name = "Mem8"; let RenderMethod = "addMemOperands";
+}
+def X86Mem16AsmOperand : AsmOperandClass {
+  let Name = "Mem16"; let RenderMethod = "addMemOperands";
+}
+def X86Mem32AsmOperand : AsmOperandClass {
+  let Name = "Mem32"; let RenderMethod = "addMemOperands";
+}
+def X86Mem64AsmOperand : AsmOperandClass {
+  let Name = "Mem64"; let RenderMethod = "addMemOperands";
+}
+def X86Mem80AsmOperand : AsmOperandClass {
+  let Name = "Mem80"; let RenderMethod = "addMemOperands";
+}
+def X86Mem128AsmOperand : AsmOperandClass {
+  let Name = "Mem128"; let RenderMethod = "addMemOperands";
+}
+def X86Mem256AsmOperand : AsmOperandClass {
+  let Name = "Mem256"; let RenderMethod = "addMemOperands";
+}
+def X86Mem512AsmOperand : AsmOperandClass {
+  let Name = "Mem512"; let RenderMethod = "addMemOperands";
+}
+
+// Gather mem operands
+def X86MemVX32Operand : AsmOperandClass {
+  let Name = "MemVX32"; let RenderMethod = "addMemOperands";
+}
+def X86MemVY32Operand : AsmOperandClass {
+  let Name = "MemVY32"; let RenderMethod = "addMemOperands";
+}
+def X86MemVZ32Operand : AsmOperandClass {
+  let Name = "MemVZ32"; let RenderMethod = "addMemOperands";
+}
+def X86MemVX64Operand : AsmOperandClass {
+  let Name = "MemVX64"; let RenderMethod = "addMemOperands";
+}
+def X86MemVY64Operand : AsmOperandClass {
+  let Name = "MemVY64"; let RenderMethod = "addMemOperands";
+}
+def X86MemVZ64Operand : AsmOperandClass {
+  let Name = "MemVZ64"; let RenderMethod = "addMemOperands";
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+  let Name = "AbsMem";
+  let SuperClasses = [X86MemAsmOperand];
+}
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+  let PrintMethod = printMethod;
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+let OperandType = "OPERAND_MEMORY" in {
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem   : X86MemOperand<"printi8mem"> {
+  let ParserMatchClass = X86Mem8AsmOperand; }
+def i16mem  : X86MemOperand<"printi16mem"> {
+  let ParserMatchClass = X86Mem16AsmOperand; }
+def i32mem  : X86MemOperand<"printi32mem"> {
+  let ParserMatchClass = X86Mem32AsmOperand; }
+def i64mem  : X86MemOperand<"printi64mem"> {
+  let ParserMatchClass = X86Mem64AsmOperand; }
+def i128mem : X86MemOperand<"printi128mem"> {
+  let ParserMatchClass = X86Mem128AsmOperand; }
+def i256mem : X86MemOperand<"printi256mem"> {
+  let ParserMatchClass = X86Mem256AsmOperand; }
+def i512mem : X86MemOperand<"printi512mem"> {
+  let ParserMatchClass = X86Mem512AsmOperand; }
+def f32mem  : X86MemOperand<"printf32mem"> {
+  let ParserMatchClass = X86Mem32AsmOperand; }
+def f64mem  : X86MemOperand<"printf64mem"> {
+  let ParserMatchClass = X86Mem64AsmOperand; }
+def f80mem  : X86MemOperand<"printf80mem"> {
+  let ParserMatchClass = X86Mem80AsmOperand; }
+def f128mem : X86MemOperand<"printf128mem"> {
+  let ParserMatchClass = X86Mem128AsmOperand; }
+def f256mem : X86MemOperand<"printf256mem">{
+  let ParserMatchClass = X86Mem256AsmOperand; }
+def f512mem : X86MemOperand<"printf512mem">{
+  let ParserMatchClass = X86Mem512AsmOperand; }
+def v512mem : Operand<iPTR> {
+  let PrintMethod = "printf512mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+  let ParserMatchClass = X86Mem512AsmOperand; }
+
+// Gather mem operands
+def vx32mem : X86MemOperand<"printi32mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+  let ParserMatchClass = X86MemVX32Operand; }
+def vy32mem : X86MemOperand<"printi32mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+  let ParserMatchClass = X86MemVY32Operand; }
+def vx64mem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+  let ParserMatchClass = X86MemVX64Operand; }
+def vy64mem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+  let ParserMatchClass = X86MemVY64Operand; }
+def vy64xmem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm);
+  let ParserMatchClass = X86MemVY64Operand; }
+def vz32mem : X86MemOperand<"printi32mem">{
+  let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm);
+  let ParserMatchClass = X86MemVZ32Operand; }
+def vz64mem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+  let ParserMatchClass = X86MemVZ64Operand; }
+}
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+  let PrintMethod = "printi8mem";
+  let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86Mem8AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<2>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i32mem_TC : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+                       i32imm, i8imm);
+  let ParserMatchClass = X86Mem32AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i64mem_TC : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+                       ptr_rc_tailcall, i32imm, i8imm);
+  let ParserMatchClass = X86Mem64AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+    ParserMatchClass = X86AbsMemAsmOperand,
+    PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+def X86SrcIdx8Operand : AsmOperandClass {
+  let Name = "SrcIdx8";
+  let RenderMethod = "addSrcIdxOperands";
+  let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86SrcIdx16Operand : AsmOperandClass {
+  let Name = "SrcIdx16";
+  let RenderMethod = "addSrcIdxOperands";
+  let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86SrcIdx32Operand : AsmOperandClass {
+  let Name = "SrcIdx32";
+  let RenderMethod = "addSrcIdxOperands";
+  let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86SrcIdx64Operand : AsmOperandClass {
+  let Name = "SrcIdx64";
+  let RenderMethod = "addSrcIdxOperands";
+  let SuperClasses = [X86Mem64AsmOperand];
+}
+def X86DstIdx8Operand : AsmOperandClass {
+  let Name = "DstIdx8";
+  let RenderMethod = "addDstIdxOperands";
+  let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86DstIdx16Operand : AsmOperandClass {
+  let Name = "DstIdx16";
+  let RenderMethod = "addDstIdxOperands";
+  let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86DstIdx32Operand : AsmOperandClass {
+  let Name = "DstIdx32";
+  let RenderMethod = "addDstIdxOperands";
+  let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86DstIdx64Operand : AsmOperandClass {
+  let Name = "DstIdx64";
+  let RenderMethod = "addDstIdxOperands";
+  let SuperClasses = [X86Mem64AsmOperand];
+}
+def X86MemOffs8AsmOperand : AsmOperandClass {
+  let Name = "MemOffs8";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86MemOffs16AsmOperand : AsmOperandClass {
+  let Name = "MemOffs16";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86MemOffs32AsmOperand : AsmOperandClass {
+  let Name = "MemOffs32";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86MemOffs64AsmOperand : AsmOperandClass {
+  let Name = "MemOffs64";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem64AsmOperand];
+}
+let OperandType = "OPERAND_MEMORY" in {
+def srcidx8 : Operand<iPTR> {
+  let ParserMatchClass = X86SrcIdx8Operand;
+  let MIOperandInfo = (ops ptr_rc, i8imm);
+  let PrintMethod = "printSrcIdx8"; }
+def srcidx16 : Operand<iPTR> {
+  let ParserMatchClass = X86SrcIdx16Operand;
+  let MIOperandInfo = (ops ptr_rc, i8imm);
+  let PrintMethod = "printSrcIdx16"; }
+def srcidx32 : Operand<iPTR> {
+  let ParserMatchClass = X86SrcIdx32Operand;
+  let MIOperandInfo = (ops ptr_rc, i8imm);
+  let PrintMethod = "printSrcIdx32"; }
+def srcidx64 : Operand<iPTR> {
+  let ParserMatchClass = X86SrcIdx64Operand;
+  let MIOperandInfo = (ops ptr_rc, i8imm);
+  let PrintMethod = "printSrcIdx64"; }
+def dstidx8 : Operand<iPTR> {
+  let ParserMatchClass = X86DstIdx8Operand;
+  let MIOperandInfo = (ops ptr_rc);
+  let PrintMethod = "printDstIdx8"; }
+def dstidx16 : Operand<iPTR> {
+  let ParserMatchClass = X86DstIdx16Operand;
+  let MIOperandInfo = (ops ptr_rc);
+  let PrintMethod = "printDstIdx16"; }
+def dstidx32 : Operand<iPTR> {
+  let ParserMatchClass = X86DstIdx32Operand;
+  let MIOperandInfo = (ops ptr_rc);
+  let PrintMethod = "printDstIdx32"; }
+def dstidx64 : Operand<iPTR> {
+  let ParserMatchClass = X86DstIdx64Operand;
+  let MIOperandInfo = (ops ptr_rc);
+  let PrintMethod = "printDstIdx64"; }
+def offset8 : Operand<iPTR> {
+  let ParserMatchClass = X86MemOffs8AsmOperand;
+  let MIOperandInfo = (ops i64imm, i8imm);
+  let PrintMethod = "printMemOffs8"; }
+def offset16 : Operand<iPTR> {
+  let ParserMatchClass = X86MemOffs16AsmOperand;
+  let MIOperandInfo = (ops i64imm, i8imm);
+  let PrintMethod = "printMemOffs16"; }
+def offset32 : Operand<iPTR> {
+  let ParserMatchClass = X86MemOffs32AsmOperand;
+  let MIOperandInfo = (ops i64imm, i8imm);
+  let PrintMethod = "printMemOffs32"; }
+def offset64 : Operand<iPTR> {
+  let ParserMatchClass = X86MemOffs64AsmOperand;
+  let MIOperandInfo = (ops i64imm, i8imm);
+  let PrintMethod = "printMemOffs64"; }
+}
+
+
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AVXCC : Operand<i8> {
+  let PrintMethod = "printAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+  let SuperClasses = [ImmAsmOperand];
+  let RenderMethod = "addImmOperands";
+}
+
+class ImmZExtAsmOperandClass : AsmOperandClass {
+  let SuperClasses = [ImmAsmOperand];
+  let RenderMethod = "addImmOperands";
+}
+
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+  let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+  let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+
+def AVX512RC : Operand<i32> {
+  let PrintMethod = "printRoundingControl";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFF" (-1 in 16-bits).
+
+// [0, 0x7FFFFFFF]                                            |
+//   [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti16i8";
+  let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti32i8";
+}
+
+// [0, 0x000000FF]
+def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass {
+  let Name = "ImmZExtu32u8";
+}
+
+
+// [0, 0x0000007F]                                            |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti64i8";
+  let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+                      ImmSExti64i32AsmOperand];
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm  : Operand<i16> {
+  let ParserMatchClass = ImmSExti16i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm  : Operand<i32> {
+  let ParserMatchClass = ImmSExti32i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant, and those 8 bits are unsigned.
+def u32u8imm  : Operand<i32> {
+  let ParserMatchClass = ImmZExtu32u8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm  : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i32AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+  let PrintMethod = "printPCRelImm";
+  let ParserMatchClass = X86AbsMemAsmOperand;
+  let OperandType = "OPERAND_PCREL";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm   : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def lea64_32mem : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr      : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+                               [add, sub, mul, X86mul_imm, shl, or, frameindex],
+                               []>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr",
+                                  [add, sub, mul, X86mul_imm, shl, or,
+                                   frameindex, X86WrapperRIP],
+                                  []>;
+
+def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
+                        [add, sub, mul, X86mul_imm, shl, or, frameindex,
+                         X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasCMov      : Predicate<"Subtarget->hasCMov()">;
+def NoCMov       : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow     : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA    : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1      : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2      : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3      : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3     : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41     : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42     : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX       : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512    : Predicate<"Subtarget->hasAVX512()">,
+                     AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
+def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512     : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI       : Predicate<"Subtarget->hasCDI()">;
+def HasPFI       : Predicate<"Subtarget->hasPFI()">;
+def HasERI       : Predicate<"Subtarget->hasERI()">;
+def HasDQI       : Predicate<"Subtarget->hasDQI()">;
+def HasBWI       : Predicate<"Subtarget->hasBWI()">;
+def HasVLX       : Predicate<"Subtarget->hasVLX()">,
+                     AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">;
+
+def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
+def HasAES       : Predicate<"Subtarget->hasAES()">;
+def HasPCLMUL    : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA       : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX  : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
+def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
+def HasXOP       : Predicate<"Subtarget->hasXOP()">;
+def HasTBM       : Predicate<"Subtarget->hasTBM()">;
+def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C      : Predicate<"Subtarget->hasF16C()">;
+def HasFSGSBase  : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI       : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2      : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM       : Predicate<"Subtarget->hasRTM()">;
+def HasHLE       : Predicate<"Subtarget->hasHLE()">;
+def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
+def HasADX       : Predicate<"Subtarget->hasADX()">;
+def HasSHA       : Predicate<"Subtarget->hasSHA()">;
+def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
+                             AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
+def In64BitMode  : Predicate<"Subtarget->is64Bit()">,
+                             AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def In16BitMode  : Predicate<"Subtarget->is16Bit()">,
+                             AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
+                             AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+def In32BitMode  : Predicate<"Subtarget->is32Bit()">,
+                             AssemblerPredicate<"Mode32Bit", "32-bit mode">;
+def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
+def IsNaCl       : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl      : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode   : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData      : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+                             "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData     : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+                             "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def IsNotPIC     : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def OptForSize   : Predicate<"OptForSize">;
+def OptForSpeed  : Predicate<"!OptForSize">;
+def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def FavorMemIndirectCall  : Predicate<"!Subtarget->callRegIndirect()">;
+def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A   : PatLeaf<(i8 0)>;  // alt. COND_NBE
+def X86_COND_AE  : PatLeaf<(i8 1)>;  // alt. COND_NC
+def X86_COND_B   : PatLeaf<(i8 2)>;  // alt. COND_C
+def X86_COND_BE  : PatLeaf<(i8 3)>;  // alt. COND_NA
+def X86_COND_E   : PatLeaf<(i8 4)>;  // alt. COND_Z
+def X86_COND_G   : PatLeaf<(i8 5)>;  // alt. COND_NLE
+def X86_COND_GE  : PatLeaf<(i8 6)>;  // alt. COND_NL
+def X86_COND_L   : PatLeaf<(i8 7)>;  // alt. COND_NGE
+def X86_COND_LE  : PatLeaf<(i8 8)>;  // alt. COND_NG
+def X86_COND_NE  : PatLeaf<(i8 9)>;  // alt. COND_NZ
+def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NP  : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS  : PatLeaf<(i8 12)>;
+def X86_COND_O   : PatLeaf<(i8 13)>;
+def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S   : PatLeaf<(i8 15)>;
+
+let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
+  def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
+  def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
+  def i64immSExt8  : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+}
+
+def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+  return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 4 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+
+def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
+def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
+def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+  return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
+  def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+  def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
+                "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+  def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
+                "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+                 "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE    : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", [], IIC_LEAVE>,
+                 Requires<[Not64BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64  : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", [], IIC_LEAVE>,
+                 Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r  : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+                IIC_POP_REG16>, OpSize16;
+def POP32r  : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+                IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+                IIC_POP_REG>, OpSize16;
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
+                IIC_POP_MEM>, OpSize16;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+                IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
+                IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
+
+def POPF16   : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
+                OpSize16;
+def POPF32   : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+                OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r  : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize16;
+def PUSH32r  : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize16;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+                 IIC_PUSH_MEM>, OpSize16;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+                 IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+
+def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
+                   "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
+                   Requires<[Not64BitMode]>;
+def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+                   "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                   Requires<[Not64BitMode]>;
+def PUSHi16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+                   "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
+                   Requires<[Not64BitMode]>;
+def PUSHi32  : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+                   "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                   Requires<[Not64BitMode]>;
+
+def PUSHF16  : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+                 OpSize16;
+def PUSHF32  : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
+               OpSize32, Requires<[Not64BitMode]>;
+
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP64r   : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+                 IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+                IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
+                IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH64r  : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+                 IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+                 IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+                 IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1,
+    SchedRW = [WriteStore] in {
+def PUSH64i8   : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
+                    "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>;
+def PUSH64i16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+                    "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
+                    Requires<[In64BitMode]>;
+def PUSH64i32  : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+                    "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                    Requires<[In64BitMode]>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
+def POPF64   : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+               OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+def PUSHF64    : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+                 OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+    mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def POPA32   : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
+               OpSize32, Requires<[Not64BitMode]>;
+def POPA16   : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
+               OpSize16, Requires<[Not64BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+    mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
+               OpSize32, Requires<[Not64BitMode]>;
+def PUSHA16  : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
+               OpSize16, Requires<[Not64BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm,
+                 (outs GR32:$dst), (ins GR32:$src),
+                 "bswap{l}\t$dst",
+                 [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
+
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+                  "bswap{q}\t$dst",
+                  [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+} // Constraints = "$src = $dst", SchedRW
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr  : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+                  IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSF16rm  : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+                  IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSF32rr  : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+                 IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSF32rm  : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+                 IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSF64rr  : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+                  IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSF64rm  : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+                  IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+
+def BSR16rr  : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+                 IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSR16rm  : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+                 IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSR32rr  : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+                 IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+                 IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSR64rr  : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
+                  IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+                  IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs dstidx8:$dst), (ins srcidx8:$src),
+              "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs dstidx16:$dst), (ins srcidx16:$src),
+              "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs dstidx32:$dst), (ins srcidx32:$src),
+              "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs dstidx64:$dst), (ins srcidx64:$src),
+               "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrmDst, (outs dstidx8:$dst), (ins),
+              "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins),
+              "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins),
+              "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins),
+               "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
+def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
+              "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
+def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
+              "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
+def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
+              "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in
+def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
+               "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in {
+def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+              "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+              "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+              "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+               "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in {
+def MOV8rr  : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)], IIC_MOV>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32;
+def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+                       "mov{q}\t{$src, $dst|$dst, $src}",
+                       [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+let isReMaterializable = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+                    "movabs{q}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, imm:$src)], IIC_MOV>;
+}
+
+// Longer forms that use a ModR/M byte. Needed for disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOV8ri_alt  : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi  : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+                       "mov{q}\t{$src, $dst|$dst, $src}",
+                       [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+
+/// moffs8, moffs16 and moffs32 versions of moves.  The immediate is a
+/// 32-bit offset from the segment base. These are only valid in x86-32 mode.
+let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8o8a : Ii32 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
+                   "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+                   Requires<[In32BitMode]>;
+let Defs = [AX] in
+def MOV16o16a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
+                      "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+                      OpSize16, Requires<[In32BitMode]>;
+let Defs = [EAX] in
+def MOV32o32a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
+                      "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+                      OpSize32, Requires<[In32BitMode]>;
+
+let Defs = [AL] in
+def MOV8o8a_16 : Ii16 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
+                   "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+                   AdSize, Requires<[In16BitMode]>;
+let Defs = [AX] in
+def MOV16o16a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
+                      "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+                      OpSize16, AdSize, Requires<[In16BitMode]>;
+let Defs = [EAX] in
+def MOV32o32a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
+                      "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+                      AdSize, OpSize32, Requires<[In16BitMode]>;
+}
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8ao8 : Ii32 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
+                   "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
+                  Requires<[In32BitMode]>;
+let Uses = [AX] in
+def MOV16ao16 : Ii32 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
+                      "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+                      OpSize16, Requires<[In32BitMode]>;
+let Uses = [EAX] in
+def MOV32ao32 : Ii32 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
+                      "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+                     OpSize32, Requires<[In32BitMode]>;
+
+let Uses = [AL] in
+def MOV8ao8_16 : Ii16 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
+                   "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
+                  AdSize, Requires<[In16BitMode]>;
+let Uses = [AX] in
+def MOV16ao16_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
+                      "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+                      OpSize16, AdSize, Requires<[In16BitMode]>;
+let Uses = [EAX] in
+def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
+                      "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize, Requires<[In16BitMode]>;
+}
+}
+
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
+                     "movabs{b}\t{$src, %al|al, $src}", []>,
+                     Requires<[In64BitMode]>;
+let Defs = [AX] in
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
+                     "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16,
+                     Requires<[In64BitMode]>;
+let Defs = [EAX] in
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
+                     "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
+                     Requires<[In64BitMode]>;
+let Defs = [RAX] in
+def MOV64o64a : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64:$src),
+                     "movabs{q}\t{$src, %rax|rax, $src}", []>,
+                     Requires<[In64BitMode]>;
+}
+
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
+                     "movabs{b}\t{%al, $dst|$dst, al}", []>,
+                     Requires<[In64BitMode]>;
+let Uses = [AX] in
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
+                     "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16,
+                     Requires<[In64BitMode]>;
+let Uses = [EAX] in
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
+                     "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
+                     Requires<[In64BitMode]>;
+let Uses = [RAX] in
+def MOV64ao64 : RIi64<0xA3, RawFrmMemOffs, (outs offset64:$dst), (ins),
+                     "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
+                     Requires<[In64BitMode]>;
+}
+} // hasSideEffects = 0
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteMove] in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm  : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr  : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let isCodeGenOnly = 1 in {
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+                     (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [], IIC_MOV>,
+                   Sched<[WriteMove]>;
+let mayStore = 1, neverHasSideEffects = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+                     (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
+                     IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, neverHasSideEffects = 1,
+    canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+                     (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
+                     IIC_MOV_MEM>, Sched<[WriteLoad]>;
+}
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF     : I<0x9E, RawFrm, (outs),  (ins), "sahf",
+                 [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF     : I<0x9F, RawFrm, (outs),  (ins), "lahf", [],
+                IIC_AHF>;  // AH = flags
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit tests instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+               "bt{w}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+               OpSize16, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+               "bt{l}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+               OpSize32, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+               "bt{q}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
+  def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                 "bt{w}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi16 addr:$src1), GR16:$src2),
+  //                (implicit EFLAGS)]
+                 [], IIC_BT_MR
+                 >, OpSize16, TB, Requires<[FastBTMem]>;
+  def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                 "bt{l}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi32 addr:$src1), GR32:$src2),
+  //                (implicit EFLAGS)]
+                 [], IIC_BT_MR
+                 >, OpSize32, TB, Requires<[FastBTMem]>;
+  def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "bt{q}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi64 addr:$src1), GR64:$src2),
+  //                (implicit EFLAGS)]
+                  [], IIC_BT_MR
+                  >, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+                IIC_BT_RI>, OpSize16, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+                IIC_BT_RI>, OpSize32, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                "bt{q}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+                IIC_BT_RI>, TB;
+} // SchedRW
+
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+let SchedRW = [WriteALU] in {
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+                 ], IIC_BT_MI>, OpSize16, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+                 ], IIC_BT_MI>, OpSize32, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                "bt{q}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+                                     i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize16, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize32, TB;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize16, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize32, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize16, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize32, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize16, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize32, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize16, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize32, TB;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize16, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize32, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize16, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize32, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize16, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize32, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize16, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+              OpSize32, TB;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+               "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+              "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+              OpSize16, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+              "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+              OpSize32, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize16, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize32, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize16, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize32, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions. But since a memory
+// operand is referenced, the atomicity is ensured.
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+                       InstrItinClass itin> {
+  let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
+    def NAME#8rm  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                      (ins GR8:$val, i8mem:$ptr),
+                      !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR8:$dst,
+                         (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                      itin>;
+    def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+                      (ins GR16:$val, i16mem:$ptr),
+                      !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR16:$dst,
+                         (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                      itin>, OpSize16;
+    def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$val, i32mem:$ptr),
+                      !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR32:$dst,
+                         (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                      itin>, OpSize32;
+    def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                       (ins GR64:$val, i64mem:$ptr),
+                       !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                       [(set
+                         GR64:$dst,
+                         (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                       itin>;
+  }
+}
+
+defm XCHG    : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
+let SchedRW = [WriteALU] in {
+let Constraints = "$val = $dst" in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+                "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+                 "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+                 OpSize16;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+                 "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+                 OpSize32;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+                  "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+}
+
+// Swap between EAX and other registers.
+let Uses = [AX], Defs = [AX] in
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+                  "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
+let Uses = [EAX], Defs = [EAX] in
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+                  "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+                  OpSize32, Requires<[Not64BitMode]>;
+let Uses = [EAX], Defs = [EAX] in
+// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+                   "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+                   OpSize32, Requires<[In64BitMode]>;
+let Uses = [RAX], Defs = [RAX] in
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+                  "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+
+let SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+                 OpSize16;
+def XADD32rr  : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+                 OpSize32;
+def XADD64rr  : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm   : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                 "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD16rm  : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+                 OpSize16;
+def XADD32rm  : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+                 OpSize32;
+def XADD64rm  : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+
+}
+
+let SchedRW = [WriteALU] in {
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                   "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+                   IIC_CMPXCHG_REG8>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                    "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+                    IIC_CMPXCHG_REG>, TB, OpSize16;
+def CMPXCHG32rr  : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_REG>, TB, OpSize32;
+def CMPXCHG64rr  : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+                      IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm   : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                     "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM8>, TB;
+def CMPXCHG16rm  : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                     "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM>, TB, OpSize16;
+def CMPXCHG32rm  : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM>, TB, OpSize32;
+def CMPXCHG64rm  : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+                      IIC_CMPXCHG_MEM>, TB;
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+                  "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+                    "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+                    TB, Requires<[HasCmpxchg16b]>;
+} // SchedRW
+
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs),  (ins), "lock", []>;
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, RawFrm, (outs),  (ins), "rex64", []>,
+                     Requires<[In64BitMode]>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, RawFrm, (outs),  (ins), "data16", []>;
+
+// Repeat string operation instruction prefixes
+// These uses the DF flag in the EFLAGS register to inc or dec ECX
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs),  (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs),  (ins), "repne", []>;
+}
+
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in
+def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
+              "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
+let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in
+def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
+              "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
+              "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
+               "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
+}
+
+let SchedRW = [WriteSystem] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in {
+def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
+             "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
+              "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
+              "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
+def INSB : I<0x6C, RawFrmDst, (outs dstidx8:$dst), (ins),
+             "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+def INSW : I<0x6D, RawFrmDst, (outs dstidx16:$dst), (ins),
+             "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>,  OpSize16;
+def INSL : I<0x6D, RawFrmDst, (outs dstidx32:$dst), (ins),
+             "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+}
+}
+
+// Flag instructions
+let SchedRW = [WriteALU] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+}
+
+// Table lookup instructions
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+           Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+            Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX Before Division
+// sets AL, AH and EFLAGS and uses AL and AH
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+                 "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX After Multiply
+// sets AL, AH and EFLAGS and uses AL
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+                 "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AL After Subtraction - sets
+// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+            Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Addition
+// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+            Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+            Requires<[Not64BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                   "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+                   Requires<[Not64BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                   "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+                   Requires<[Not64BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+                 Requires<[Not64BitMode]>;
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+                 Requires<[Not64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+  let SchedRW = [WriteALULd] in {
+  def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+                    OpSize16, T8PS;
+  def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+                    OpSize32, T8PS;
+  def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+                     T8PS;
+  }
+  let SchedRW = [WriteStore] in {
+  def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+                    OpSize16, T8PS;
+  def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+                    OpSize32, T8PS;
+  def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+                     T8PS;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+  def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+                    "rdrand{w}\t$dst",
+                    [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB;
+  def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+                    "rdrand{l}\t$dst",
+                    [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB;
+  def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+                     "rdrand{q}\t$dst",
+                     [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+  def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+                    "rdseed{w}\t$dst",
+                    [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
+  def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+                    "rdseed{l}\t$dst",
+                    [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
+  def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+                     "rdseed{q}\t$dst",
+                     [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+  def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize16;
+  def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize16;
+
+  def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize32;
+  def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize32;
+
+  def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+let Predicates = [HasLZCNT] in {
+  def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))), 
+            (LZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (LZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (LZCNT64rr GR64:$src)>;
+  def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))),
+            (LZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (LZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (LZCNT64rr GR64:$src)>;
+
+  def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (LZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (LZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (LZCNT64rm addr:$src)>;
+  def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (LZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (LZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (LZCNT64rm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize16;
+  def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize16;
+
+  def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize32;
+  def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize32;
+
+  def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+                  RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+  def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+             []>, T8PS, VEX_4V;
+  let mayLoad = 1 in
+  def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+             []>, T8PS, VEX_4V;
+}
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
+  defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
+  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
+  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
+  defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
+  defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate BMI instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBMI] in {
+  // FIXME: patterns for the load versions are not implemented
+  def : Pat<(and GR32:$src, (add GR32:$src, -1)),
+            (BLSR32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (add GR64:$src, -1)),
+            (BLSR64rr GR64:$src)>;
+
+  def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
+            (BLSMSK32rr GR32:$src)>;
+  def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
+            (BLSMSK64rr GR64:$src)>;
+
+  def : Pat<(and GR32:$src, (ineg GR32:$src)),
+            (BLSI32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (ineg GR64:$src)),
+            (BLSI64rr GR64:$src)>;
+}
+
+let Predicates = [HasBMI] in {
+  def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))),
+            (TZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (TZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (TZCNT64rr GR64:$src)>;
+  def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))),
+            (TZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (TZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (TZCNT64rr GR64:$src)>;
+
+  def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (TZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (TZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (TZCNT64rm addr:$src)>;
+  def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (TZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (TZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (TZCNT64rm addr:$src)>;
+}
+
+
+multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+                          X86MemOperand x86memop, Intrinsic Int,
+                          PatFrag ld_frag> {
+  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+             T8PS, VEX_4VOp3;
+  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+              (implicit EFLAGS)]>, T8PS, VEX_4VOp3;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
+                                int_x86_bmi_bextr_32, loadi32>;
+  defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
+                                int_x86_bmi_bextr_64, loadi64>, VEX_W;
+}
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in {
+  defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+                               int_x86_bmi_bzhi_32, loadi32>;
+  defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+                               int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+}
+
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+  // Count the trailing ones in the immediate.
+  return getI8Imm(CountTrailingOnes_64(N->getZExtValue()));
+}]>;
+
+def BZHIMask : ImmLeaf<i64, [{
+  return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32);
+}]>;
+
+let Predicates = [HasBMI2] in {
+  def : Pat<(and GR64:$src, BZHIMask:$mask),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                             (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+
+  def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
+            (BZHI32rr GR32:$src,
+              (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
+            (BZHI32rm addr:$src,
+              (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
+            (BZHI64rm addr:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+} // HasBMI2
+
+let Predicates = [HasBMI] in {
+  def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+            (BEXTR32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+            (BEXTR32rm addr:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+            (BEXTR64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+            (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+                         X86MemOperand x86memop, Intrinsic Int,
+                         PatFrag ld_frag> {
+  def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+             VEX_4V;
+  def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+}
+
+let Predicates = [HasBMI2] in {
+  defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+                               int_x86_bmi_pdep_32, loadi32>, T8XD;
+  defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+                               int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+  defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+                               int_x86_bmi_pext_32, loadi32>, T8XS;
+  defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+                               int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+                                X86MemOperand x86memop, PatFrag ld_frag,
+                                Intrinsic Int, Operand immtype,
+                                SDPatternOperator immoperator> {
+  def ri : Ii32<opc,  MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+           XOP, XOPA;
+  def mi : Ii32<opc,  MRMSrcMem, (outs RC:$dst),
+                (ins x86memop:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+           XOP, XOPA;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+                                     int_x86_tbm_bextri_u32, i32imm, imm>;
+let ImmT = Imm32S in
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+                                     int_x86_tbm_bextri_u64, i64i32imm,
+                                     i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+                         RegisterClass RC, string OpcodeStr,
+                         X86MemOperand x86memop, PatFrag ld_frag> {
+let hasSideEffects = 0 in {
+  def rr : I<opc,  FormReg, (outs RC:$dst), (ins RC:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP_4V, XOP9;
+  let mayLoad = 1 in
+  def rm : I<opc,  FormMem, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP_4V, XOP9;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+                           Format FormReg, Format FormMem> {
+  defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+                               loadi32>;
+  defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+                               loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
+defm BLCI    : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
+defm BLCIC   : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK  : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
+defm BLCS    : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC   : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
+defm T1MSKC  : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK   : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+            (BEXTRI32ri GR32:$src1, imm:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+            (BEXTRI32mi addr:$src1, imm:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+            (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+            (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+  // FIXME: patterns for the load versions are not implemented
+  def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+            (BLCFILL32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+            (BLCFILL64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+            (BLCI64rr GR64:$src)>;
+
+  // Extra patterns because opt can optimize the above patterns to this.
+  def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+            (BLCI64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+            (BLCIC32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+            (BLCIC64rr GR64:$src)>;
+
+  def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+            (BLCMSK32rr GR32:$src)>;
+  def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+            (BLCMSK64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+            (BLCS32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+            (BLCS64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+            (BLSFILL32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+            (BLSFILL64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+            (BLSIC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+            (BLSIC64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+            (T1MSKC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+            (T1MSKC64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+            (TZMSK32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+            (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+
+include "X86InstrTSX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw",  "cbtw", "att">;
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd",  "cwtd", "att">;
+def : MnemonicAlias<"cdq",  "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo",  "cqto", "att">;
+
+// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
+
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz",  "loope",  "att">;
+def : MnemonicAlias<"loopnz", "loopne", "att">;
+
+def : MnemonicAlias<"pop",   "popw",  "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pop",   "popl",  "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop",   "popq",  "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf",  "popfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popf",  "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf",  "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">;
+
+// FIXME: This is wrong for "push reg".  "push %bx" should turn into pushw in
+// all modes.  However: "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode.  Similar for "pop %bx"
+def : MnemonicAlias<"push",   "pushw",  "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"push",   "pushl",  "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push",   "pushq",  "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf",  "pushfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pushf",  "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf",  "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">;
+
+def : MnemonicAlias<"popad",  "popal",  "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"popa",   "popaw",  "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha",  "pushaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa",   "popal",  "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha",  "pushal", "intel">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"popa",   "popaw",  "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha",  "pushaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa",   "popal",  "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha",  "pushal", "att">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"repe",  "rep",   "att">;
+def : MnemonicAlias<"repz",  "rep",   "att">;
+def : MnemonicAlias<"repnz", "repne", "att">;
+
+def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
+
+def : MnemonicAlias<"smovb", "movsb", "att">;
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
+
+def : MnemonicAlias<"ud2a",  "ud2",  "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
+
+// System instruction aliases.
+def : MnemonicAlias<"iret",    "iretw",    "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"iret",    "iretl",    "att">, Requires<[Not16BitMode]>;
+def : MnemonicAlias<"sysret",  "sysretl",  "att">;
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
+
+def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+
+
+// Floating point stack aliases.
+def : MnemonicAlias<"fcmovz",   "fcmove",   "att">;
+def : MnemonicAlias<"fcmova",   "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb",   "att">;
+def : MnemonicAlias<"fcmovna",  "fcmovbe",  "att">;
+def : MnemonicAlias<"fcmovae",  "fcmovnb",  "att">;
+def : MnemonicAlias<"fcomip",   "fcompi",   "att">;
+def : MnemonicAlias<"fildq",    "fildll",   "att">;
+def : MnemonicAlias<"fistpq",   "fistpll",  "att">;
+def : MnemonicAlias<"fisttpq",  "fisttpll", "att">;
+def : MnemonicAlias<"fldcww",   "fldcw",    "att">;
+def : MnemonicAlias<"fnstcww",  "fnstcw",   "att">;
+def : MnemonicAlias<"fnstsww",  "fnstsw",   "att">;
+def : MnemonicAlias<"fucomip",  "fucompi",  "att">;
+def : MnemonicAlias<"fwait",    "wait",     "att">;
+
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+                    string VariantName>
+  : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
+                  !strconcat(Prefix, NewCond, Suffix), VariantName>;
+
+/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
+/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
+/// example "setz" -> "sete".
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+                                        string V = ""> {
+  def C   : CondCodeAlias<Prefix, Suffix, "c",   "b",  V>; // setc   -> setb
+  def Z   : CondCodeAlias<Prefix, Suffix, "z" ,  "e",  V>; // setz   -> sete
+  def NA  : CondCodeAlias<Prefix, Suffix, "na",  "be", V>; // setna  -> setbe
+  def NB  : CondCodeAlias<Prefix, Suffix, "nb",  "ae", V>; // setnb  -> setae
+  def NC  : CondCodeAlias<Prefix, Suffix, "nc",  "ae", V>; // setnc  -> setae
+  def NG  : CondCodeAlias<Prefix, Suffix, "ng",  "le", V>; // setng  -> setle
+  def NL  : CondCodeAlias<Prefix, Suffix, "nl",  "ge", V>; // setnl  -> setge
+  def NZ  : CondCodeAlias<Prefix, Suffix, "nz",  "ne", V>; // setnz  -> setne
+  def PE  : CondCodeAlias<Prefix, Suffix, "pe",  "p",  V>; // setpe  -> setp
+  def PO  : CondCodeAlias<Prefix, Suffix, "po",  "np", V>; // setpo  -> setnp
+
+  def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b",  V>; // setnae -> setb
+  def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a",  V>; // setnbe -> seta
+  def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l",  V>; // setnge -> setl
+  def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g",  V>; // setnle -> setg
+}
+
+// Aliases for set<CC>
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC>
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified.
+def : InstAlias<"aad", (AAD8i8 10)>;
+def : InstAlias<"aam", (AAM8i8 10)>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}",
+                (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc {$imm, $mem|$mem, $imm}",
+                (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr {$imm, $mem|$mem, $imm}",
+                (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts {$imm, $mem|$mem, $imm}",
+                (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+
+// clr aliases.
+def : InstAlias<"clrb $reg", (XOR8rr  GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+
+// lods aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"lodsb $src", (LODSB srcidx8:$src),  0>;
+def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src),  0>;
+def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// stos aliases. Accept the source being omitted because it's implicit in
+// the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst),  0>;
+def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst),  0>;
+def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// scas aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst),  0>;
+def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst),  0>;
+def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// div and idiv aliases for explicit A register.
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r  GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m  i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r  GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m  i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp",        (ADD_FPrST0  ST1), 0>;
+def : InstAlias<"fsub{|r}p",    (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p",    (SUB_FPrST0  ST1), 0>;
+def : InstAlias<"fmulp",        (MUL_FPrST0  ST1), 0>;
+def : InstAlias<"fdiv{|r}p",    (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p",    (DIV_FPrST0  ST1), 0>;
+def : InstAlias<"fxch",         (XCH_F       ST1), 0>;
+def : InstAlias<"fcom",         (COM_FST0r   ST1), 0>;
+def : InstAlias<"fcomp",        (COMP_FST0r  ST1), 0>;
+def : InstAlias<"fcomi",        (COM_FIr     ST1), 0>;
+def : InstAlias<"fcompi",       (COM_FIPr    ST1), 0>;
+def : InstAlias<"fucom",        (UCOM_Fr     ST1), 0>;
+def : InstAlias<"fucomp",       (UCOM_FPr    ST1), 0>;
+def : InstAlias<"fucomi",       (UCOM_FIr    ST1), 0>;
+def : InstAlias<"fucompi",      (UCOM_FIPr   ST1), 0>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)".  We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
+                 (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
+                 (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd",   ADD_FST0r>;
+defm : FpUnaryAlias<"faddp",  ADD_FPrST0, 0>;
+defm : FpUnaryAlias<"fsub",   SUB_FST0r>;
+defm : FpUnaryAlias<"fsub{|r}p",  SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsubr",  SUBR_FST0r>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
+defm : FpUnaryAlias<"fmul",   MUL_FST0r>;
+defm : FpUnaryAlias<"fmulp",  MUL_FPrST0>;
+defm : FpUnaryAlias<"fdiv",   DIV_FST0r>;
+defm : FpUnaryAlias<"fdiv{|r}p",  DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdivr",  DIVR_FST0r>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
+defm : FpUnaryAlias<"fcomi",   COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi",  UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi",   COM_FIPr>;
+defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
+
+
+// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// commute.  We also allow fdiv[r]p/fsubrp even though they don't commute,
+// solely because gas supports it.
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
+
+// We accept "fnstsw %eax" even though it only writes %ax.
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw"     , (FNSTSW16r)>;
+
+// lcall and ljmp aliases.  This seems to be an odd mapping in 64-bit mode, but
+// this is compatible with what GAS does.
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall *$dst",      (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp *$dst",       (FARJMP32m  opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp $seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall *$dst",      (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp *$dst",       (FARJMP16m  opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call *$dst",       (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp *$dst",        (JMP64m  i16mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call *$dst",       (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp *$dst",        (JMP32m  i16mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call *$dst",       (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp *$dst",        (JMP16m  i16mem:$dst), 0>, Requires<[In16BitMode]>;
+
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B".
+def : InstAlias<"imulw $imm, $r", (IMUL16rri  GR16:$r, GR16:$r, i16imm:$imm)>;
+def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>;
+def : InstAlias<"imull $imm, $r", (IMUL32rri  GR32:$r, GR32:$r, i32imm:$imm)>;
+def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>;
+def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
+def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
+
+// inb %dx -> inb %al, %dx
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>;
+
+
+// jmp and call aliases for lcall and ljmp.  jmp $42,$5 -> ljmp
+def : InstAlias<"call $seg, $off",  (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp $seg, $off",   (FARJMP16i  i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"call $seg, $off",  (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"jmp $seg, $off",   (FARJMP32i  i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpw $seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpl $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>;
+
+// Force mov without a suffix with a segment and mem to prefer the 'l' form of
+// the move.  All segment/mem forms are equivalent, this has the shortest
+// encoding.
+def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
+def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+
+// Match 'movq GR64, MMX' as an alias for movd.
+def : InstAlias<"movq $src, $dst",
+                (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq $src, $dst",
+                (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsx aliases
+def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+
+// movzx aliases
+def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
+// Note: No GR32->GR64 movzx form.
+
+// outb %dx -> outb %al, %dx
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem).  Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
+
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
+
+/*  FIXME: This is disabled because the asm matcher is currently incapable of
+ *  matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}",
+                (TEST8rm  GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}",
+                (TEST16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}",
+                (TEST32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}",
+                (TEST64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
+                (XCHG8rm  GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
+                (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
+                (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
+                (XCHG64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+                (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+                (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..ecf80a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,623 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def MMX_INTALU_ITINS : OpndItins<
+  IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+  IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+  IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+  IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def MMX_PMUL_ITINS : OpndItins<
+  IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+let Sched = WriteVecALU in {
+def MMX_PSADBW_ITINS : OpndItins<
+  IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+  IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
+>;
+}
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+  IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+let Sched = WriteShuffle in {
+def MMX_UNPCK_H_ITINS : OpndItins<
+  IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+  IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+  IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+  IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+} // Sched
+
+let Sched = WriteCvtF2I in {
+def MMX_CVT_PD_ITINS : OpndItins<
+  IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+  IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+}
+
+let Constraints = "$src1 = $dst" in {
+  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+  // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+  multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins, bit Commutable = 0> {
+    def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                 (ins VR64:$src1, VR64:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+              Sched<[itins.Sched]> {
+      let isCommutable = Commutable;
+    }
+    def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                 (ins VR64:$src1, i64mem:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1,
+                                   (bitconvert (load_mmx addr:$src2))))],
+                 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId,
+                                Intrinsic IntId2, ShiftOpndItins itins> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                                  (ins VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+             Sched<[WriteVecShift]>;
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                                  (ins VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (load_mmx addr:$src2))))],
+                  itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+    def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+                                   (ins VR64:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>,
+           Sched<[WriteVecShift]>;
+  }
+}
+
+/// Unary MMX instructions requiring SSSE3.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId64, OpndItins itins> {
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+             Sched<[itins.Sched]>;
+
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst,
+                     (IntId64 (bitconvert (memopmmx addr:$src))))],
+                   itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+                             Intrinsic IntId64, OpndItins itins> {
+  let isCommutable = 0 in
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+       (ins VR64:$src1, VR64:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+       [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+      Sched<[itins.Sched]>;
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+       (ins VR64:$src1, i64mem:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+       [(set VR64:$dst,
+         (IntId64 VR64:$src1,
+          (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
+  def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+      (ins VR64:$src1, VR64:$src2, i8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
+      [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
+  def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+      (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+      [(set VR64:$dst, (IntId VR64:$src1,
+                       (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>;
+}
+
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+                         string asm, OpndItins itins, Domain d> {
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                  [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+            Sched<[itins.Sched]>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                  [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm, Domain d> {
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+                  (ins DstRC:$src1, SrcRC:$src2), asm,
+                  [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+                  NoItinerary, d>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+                  (ins DstRC:$src1, x86memop:$src2), asm,
+                  [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+                  NoItinerary, d>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+                     [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, 
+                         (x86mmx (scalar_to_vector GR32:$src)))],
+                        IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+let canFoldAsLoad = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst,
+                        (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+                        "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+                   Sched<[WriteStore]>;
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+                         "movd\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst,
+                          (MMX_X86movd2w (x86mmx VR64:$src)))],
+                          IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst, (bitconvert GR64:$src))],
+                             IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+
+// These are 64 bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as
+// movd.
+let SchedRW = [WriteMove] in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+                               (outs GR64:$dst), (ins VR64:$src),
+                               "movd\t{$src, $dst|$dst, $src}", 
+                             [(set GR64:$dst,
+                              (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", [],
+                        IIC_MMX_MOVQ_RR>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", [],
+                        IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let SchedRW = [WriteLoad] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))],
+                        IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteStore] in
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(store (x86mmx VR64:$src), addr:$dst)],
+                        IIC_MMX_MOVQ_RM>;
+
+let SchedRW = [WriteMove] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                             (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (x86mmx (bitconvert
+                               (i64 (vector_extract (v2i64 VR128:$src),
+                                     (iPTR 0))))))],
+                             IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+                              (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                              [(set VR128:$dst,
+                                (v2i64
+                                  (scalar_to_vector
+                                    (i64 (bitconvert (x86mmx VR64:$src))))))],
+                              IIC_MMX_MOVQ_RR>;
+
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+                               (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                              (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                              [], IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                         "movntq\t{$src, $dst|$dst, $src}",
+                         [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+                         IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+
+let Predicates = [HasMMX] in {
+  let AddedComplexity = 15 in
+  // movd to MMX register zero-extends
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+            (MMX_MOVD64rr GR32:$src)>;
+  let AddedComplexity = 20 in
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+            (MMX_MOVD64rm addr:$src)>;
+}
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+                                     MMX_INTALU_ITINS>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+                                   MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+                                   MMX_PHADDSUBW>;
+defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+                                   MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+                                   MMX_PHADDSUBW>;
+
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+                                   MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+                                   MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+                                   MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+                                   MMX_PHADDSUBW>;
+defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+                                   MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+                                   MMX_PHADDSUBW>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+                                     MMX_PMUL_ITINS, 1>;
+let isCommutable = 1 in
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+                                     int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+                                     MMX_PSADBW_ITINS, 1>;
+
+defm MMX_PSIGNB :  SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+                                        MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW :  SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+                                        MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND :  SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+                                        MMX_MISC_FUNC_ITINS>;
+let Constraints = "$src1 = $dst" in
+  defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+                                  MMX_INTALU_ITINS, 1>;
+defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+                                  MMX_INTALU_ITINS, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+                                  MMX_INTALU_ITINS, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+                                  MMX_INTALU_ITINS>;
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+                                    MMX_SHIFT_ITINS>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+                                    MMX_SHIFT_ITINS>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+                                    MMX_SHIFT_ITINS>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+                                     MMX_INTALU_ITINS>;
+
+// -- Unpack Instructions
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", 
+                                       int_x86_mmx_punpckhbw,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", 
+                                       int_x86_mmx_punpckhwd,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", 
+                                       int_x86_mmx_punpckhdq,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", 
+                                       int_x86_mmx_punpcklbw,
+                                       MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", 
+                                       int_x86_mmx_punpcklwd,
+                                       MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+                                       int_x86_mmx_punpckldq,
+                                       MMX_UNPCK_L_ITINS>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+                                      MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+                                      MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+                                      MMX_PCK_ITINS>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+                                       MMX_PSHUF_ITINS>;
+
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+                          (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                             (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+                          IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+                          (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                             (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+                                                   imm:$src2))],
+                          IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+
+
+
+
+// -- Conversion Instructions
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+                      f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+                      MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+                      f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+                      MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+                       f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+                       MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+                       f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+                       MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+                         i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+                         MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+let Constraints = "$src1 = $dst" in {
+  defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+                         int_x86_sse_cvtpi2ps,
+                         i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+                          SSEPackedSingle>, PS;
+}
+
+// Extract / Insert
+def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
+                       (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2),
+                       "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+                                         (iPTR imm:$src2)))],
+                       IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Constraints = "$src1 = $dst" in {
+  def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
+                      (outs VR64:$dst), 
+                      (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3),
+                      "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+                                        GR32orGR64:$src2, (iPTR imm:$src3)))],
+                      IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
+
+  def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
+                     (outs VR64:$dst),
+                     (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3),
+                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+                                         (i32 (anyext (loadi16 addr:$src2))),
+                                       (iPTR imm:$src3)))],
+                     IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+                          (ins VR64:$src),
+                          "pmovmskb\t{$src, $dst|$dst, $src}",
+                          [(set GR32orGR64:$dst,
+                                (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+          (x86mmx (MMX_MOVQ64rm addr:$src))>;
+
+// Misc.
+let SchedRW = [WriteShuffle] in {
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                          "maskmovq\t{$mask, $src|$src, $mask}",
+                          [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+                          IIC_MMX_MASKMOV>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                           "maskmovq\t{$mask, $src|$src, $mask}",
+                           [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+                           IIC_MMX_MASKMOV>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
+def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+          (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..2bb898e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,9061 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  // InstrSchedModel info.
+  X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+  OpndItins s = arg_s;
+  OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+  InstrItinClass arg_ri> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+  SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins<
+  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+  SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins<
+  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+  SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+  SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins<
+  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+  SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins<
+  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+  SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+let Sched = WriteVecLogic in
+def SSE_VEC_BIT_ITINS_P : OpndItins<
+  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def SSE_INTMUL_ITINS_P : OpndItins<
+  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
+
+def SSE_DPPD_ITINS : OpndItins<
+  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins<
+  IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+let Sched = WriteMPSAD in
+def SSE_MPSADBW_ITINS : OpndItins<
+  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+def SSE_PMULLD_ITINS : OpndItins<
+  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
+// Definitions for backward compatibility.
+// The instructions mapped on these definitions uses a different itinerary
+// than the actual scheduling model.
+let Sched = WriteShuffle in
+def DEFAULT_ITINS_SHUFFLESCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVecIMul in
+def DEFAULT_ITINS_VECIMULSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteShuffle in
+def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteMPSAD in
+def DEFAULT_ITINS_MPSADSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def DEFAULT_ITINS_FBLENDSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteBlend in
+def DEFAULT_ITINS_BLENDSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, X86MemOperand x86memop,
+                           OpndItins itins,
+                           bit Is2Addr = 1> {
+  let isCommutable = 1 in {
+    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  }
+  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                             string asm, string SSEVer, string FPSizeStr,
+                             Operand memopr, ComplexPattern mem_cpat,
+                             OpndItins itins,
+                             bit Is2Addr = 1> {
+let isCodeGenOnly = 1 in {
+  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (!cast<Intrinsic>(
+                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
+             RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
+                                          SSEVer, "_", OpcodeStr, FPSizeStr))
+             RC:$src1, mem_cpat:$src2))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           Domain d, OpndItins itins, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+          itins.rm, d>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+                                      string OpcodeStr, X86MemOperand x86memop,
+                                      list<dag> pat_rr, list<dag> pat_rm,
+                                      bit Is2Addr = 1> {
+  let isCommutable = 1, hasSideEffects = 0 in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       pat_rr, NoItinerary, d>,
+       Sched<[WriteVecLogic]>;
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       pat_rm, NoItinerary, d>,
+       Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
+          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
+          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
+          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
+          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
+          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+          (COPY_TO_REGCLASS FR32:$src, VR128)>;
+def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
+          (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
+          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
+// and doesn't need it because on sandy bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX] in
+  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+}
+
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe build zeros.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+  let Predicates = [HasAVX2] in
+  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; Register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and just mentioned, we
+// don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+                         X86MemOperand x86memop, string base_opc,
+                         string asm_opr> {
+  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src1, RC:$src2),
+              !strconcat(base_opc, asm_opr),
+              [(set VR128:$dst, (vt (OpNode VR128:$src1,
+                                 (scalar_to_vector RC:$src2))))],
+              IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+
+  // For the disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                  (ins VR128:$src1, RC:$src2),
+                  !strconcat(base_opc, asm_opr),
+                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+                      X86MemOperand x86memop, string OpcodeStr> {
+  // AVX
+  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+                              VEX_4V, VEX_LIG;
+
+  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                     VEX, VEX_LIG, Sched<[WriteStore]>;
+  // SSE1 & 2
+  let Constraints = "$src1 = $dst" in {
+    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+                              "\t{$src2, $dst|$dst, $src2}">;
+  }
+
+  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                  Sched<[WriteStore]>;
+}
+
+// Loading from memory automatically zeroing upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+                         PatFrag mem_pat, string OpcodeStr> {
+  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (mem_pat addr:$src))],
+                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (mem_pat addr:$src))],
+                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
+
+  let AddedComplexity = 20 in
+    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
+}
+
+// Patterns
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSrr (v4f32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSrr (v4i32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0),
+                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+                           sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0),
+                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+                           sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+let Predicates = [UseSSE1] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm already zeros the high parts of the register.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+
+  // Shuffle with MOVSS
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+}
+
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSD to the lower bits.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSDrm already zeros the high parts of the register.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
+
+  // Shuffle with MOVSD
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+                            X86MemOperand x86memop, PatFrag ld_frag,
+                            string asm, Domain d,
+                            OpndItins itins,
+                            bit IsReMaterializable = 1> {
+let neverHasSideEffects = 1 in
+  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+           Sched<[WriteFShuffle]>;
+let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+           Sched<[WriteLoad]>;
+}
+
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              PS, VEX;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD, VEX;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              PS, VEX;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+                              PD, VEX;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              PS, VEX, VEX_L;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD, VEX, VEX_L;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              PS, VEX, VEX_L;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+                              PD, VEX, VEX_L;
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              PS;
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              PS;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+                              PD;
+
+let SchedRW = [WriteStore] in {
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v8f32 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v4f64 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteFShuffle] in {
+  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+                          (ins VR128:$src),
+                          "movaps\t{$src, $dst|$dst, $src}", [],
+                          IIC_SSE_MOVA_P_RR>, VEX;
+  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movapd\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVA_P_RR>, VEX;
+  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movups\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVU_P_RR>, VEX;
+  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movupd\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVU_P_RR>, VEX;
+  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movaps\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movapd\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movups\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movupd\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86vzmovl
+                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzmovl
+                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v8f32 (X86vzmovl
+                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v4f64 (X86vzmovl
+                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+}
+
+
+def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
+          (VMOVUPSYmr addr:$dst, VR256:$src)>;
+def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
+          (VMOVUPDYmr addr:$dst, VR256:$src)>;
+
+let SchedRW = [WriteStore] in {
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteMove] in {
+  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movaps\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVA_P_RR>;
+  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movapd\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVA_P_RR>;
+  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movups\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVU_P_RR>;
+  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movupd\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVU_P_RR>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+            (VMOVUPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE1] in
+  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+            (MOVUPDmr addr:$dst, VR128:$src)>;
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX] in {
+  // 128-bit load/store
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (VMOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (VMOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+  // 256-bit load/store
+  def : Pat<(alignedloadv4i64 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(loadv4i64 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of lower 128-bits
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (MOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+let isCodeGenOnly = 1 in {
+  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                         "movaps\t{$src, $dst|$dst, $src}",
+                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+                         IIC_SSE_MOVA_P_RM>, VEX;
+  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                         "movapd\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+                         IIC_SSE_MOVA_P_RM>, VEX;
+  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                       "movaps\t{$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+                       IIC_SSE_MOVA_P_RM>;
+  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                       "movapd\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+                       IIC_SSE_MOVA_P_RM>;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
+                                      string base_opc, string asm_opr,
+                                      InstrItinClass itin> {
+  def PSrm : PI<opc, MRMSrcMem,
+         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+         !strconcat(base_opc, "s", asm_opr),
+     [(set VR128:$dst,
+       (psnode VR128:$src1,
+              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
+              itin, SSEPackedSingle>, PS,
+     Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+  def PDrm : PI<opc, MRMSrcMem,
+         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+         !strconcat(base_opc, "d", asm_opr),
+     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+                              (scalar_to_vector (loadf64 addr:$src2)))))],
+              itin, SSEPackedDouble>, PD,
+     Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+}
+
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
+                                 string base_opc, InstrItinClass itin> {
+  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                                    itin>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+                                    "\t{$src2, $dst|$dst, $src2}",
+                                    itin>;
+}
+
+let AddedComplexity = 20 in {
+  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+                                    IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>, VEX;
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [HasAVX] in {
+  // Shuffle with VMOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Shuffle with VMOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
+                                 (iPTR 0))), addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+  // Shuffle with MOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlps VR128:$src1,
+                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                                      addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+                              addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Shuffle with MOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+                                    IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                            (bc_v2f64 (v4f32 VR128:$src))),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                            (bc_v2f64 (v4f32 VR128:$src))),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [HasAVX] in {
+  // VMOVHPS patterns
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold
+  // cause it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // MOVHPS patterns
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold
+  // cause it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20, Predicates = [UseAVX] in {
+  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>,
+                      VEX_4V, Sched<[WriteFShuffle]>;
+  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>,
+                      VEX_4V, Sched<[WriteFShuffle]>;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movlhps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movhlps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+}
+
+let Predicates = [UseAVX] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+def SSE_CVT_PD : OpndItins<
+  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_PS : OpndItins<
+  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_Scalar : OpndItins<
+  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_32 : OpndItins<
+  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SD2SI : OpndItins<
+  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
+
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+                     string asm, OpndItins itins> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
+                        itins.rr>, Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+                        itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                       X86MemOperand x86memop, string asm, Domain d,
+                       OpndItins itins> {
+let neverHasSideEffects = 1 in {
+  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+             [], itins.rr, d>, Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+}
+}
+
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+           Sched<[WriteCvtI2F]>;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // neverHasSideEffects = 1
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_32>,
+                                XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_64>,
+                                XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+}
+// The assembler can recognize rr 64-bit instructions by seeing a rxx
+// register, but the same isn't true when only using memory operands,
+// provide other assembly "l" and "q" forms to address this explicitly
+// where appropriate to do so.
+defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
+                                  XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+                                  XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+                                  XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+                                  XD, VEX_4V, VEX_W, VEX_LIG;
+
+let Predicates = [UseAVX] in {
+  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+
+  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+  def : Pat<(f32 (sint_to_fp GR32:$src)),
+            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+  def : Pat<(f32 (sint_to_fp GR64:$src)),
+            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+  def : Pat<(f64 (sint_to_fp GR32:$src)),
+            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+  def : Pat<(f64 (sint_to_fp GR64:$src)),
+            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SS2SI_32>, XS;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SS2SI_64>, XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SD2SI>, XD;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SD2SI>, XD, REX_W;
+defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XS;
+defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XS, REX_W;
+defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XD;
+defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XD, REX_W;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
+
+// Conversion Instructions Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
+
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+                         string asm, OpndItins itins> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+           Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+           Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm, OpndItins itins,
+                    bit Is2Addr = 1> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+              !if(Is2Addr,
+                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+              itins.rr>, Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src2),
+              !if(Is2Addr,
+                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
+                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
+                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+
+let isCodeGenOnly = 1 in {
+  let Predicates = [UseAVX] in {
+  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+            SSE_CVT_Scalar, 0>, XS, VEX_4V;
+  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+            SSE_CVT_Scalar, 0>, XS, VEX_4V,
+            VEX_W;
+  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+            SSE_CVT_Scalar, 0>, XD, VEX_4V;
+  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+            SSE_CVT_Scalar, 0>, XD,
+            VEX_4V, VEX_W;
+  }
+  let Constraints = "$src1 = $dst" in {
+    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
+                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
+                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+  }
+} // isCodeGenOnly = 1
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+let isCodeGenOnly = 1 in {
+let Predicates = [UseAVX] in {
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                    ssmem, sse_load_f32, "cvttss2si",
+                                    SSE_CVT_SS2SI_32>, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si", SSE_CVT_SS2SI_64>,
+                                   XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                    sdmem, sse_load_f64, "cvttsd2si",
+                                    SSE_CVT_SD2SI>, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                  "cvttsd2si", SSE_CVT_SD2SI>,
+                                  XD, VEX, VEX_W;
+}
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                    ssmem, sse_load_f32, "cvttss2si",
+                                    SSE_CVT_SS2SI_32>, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                    sdmem, sse_load_f64, "cvttsd2si",
+                                    SSE_CVT_SD2SI>, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+} // isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                                  ssmem, sse_load_f32, "cvtss2si",
+                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                                  ssmem, sse_load_f32, "cvtss2si",
+                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                               ssmem, sse_load_f32, "cvtss2si",
+                               SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                                 ssmem, sse_load_f32, "cvtss2si",
+                                 SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+                               SSEPackedSingle, SSE_CVT_PS>,
+                               PS, VEX, Requires<[HasAVX]>;
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+                               SSEPackedSingle, SSE_CVT_PS>,
+                               PS, VEX, VEX_L, Requires<[HasAVX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                            SSEPackedSingle, SSE_CVT_PS>,
+                            PS, Requires<[UseSSE2]>;
+
+let Predicates = [UseAVX] in {
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SI64rm GR64:$dst, sdmem:$src)>;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
+def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+                       (ins FR64:$src1, FR64:$src2),
+                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+                      Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+                       (ins FR64:$src1, f64mem:$src2),
+                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [], IIC_SSE_CVT_Scalar_RM>,
+                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+          Requires<[UseAVX]>;
+
+def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround FR64:$src))],
+                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+                      IIC_SSE_CVT_Scalar_RM>,
+                      XD,
+                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+let isCodeGenOnly = 1 in {
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
+                       Sched<[WriteCvtF2F]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+                                          VR128:$src1, sse_load_f64:$src2))],
+                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+                       Sched<[WriteCvtF2F]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+                                          VR128:$src1, sse_load_f64:$src2))],
+                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+                    (ins FR32:$src1, FR32:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [], IIC_SSE_CVT_Scalar_RR>,
+                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+                    Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+                    (ins FR32:$src1, f32mem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [], IIC_SSE_CVT_Scalar_RM>,
+                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f64 (fextend FR32:$src)),
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+    Requires<[UseAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+    Requires<[UseAVX, OptForSpeed]>;
+
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (fextend FR32:$src))],
+                   IIC_SSE_CVT_Scalar_RR>, XS,
+                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (extloadf32 addr:$src))],
+                   IIC_SSE_CVT_Scalar_RM>, XS,
+                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// extload f32 -> f64.  This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+def : Pat<(fextend (loadf32 addr:$src)),
+          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+
+let isCodeGenOnly = 1 in {
+def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
+                    Sched<[WriteCvtF2F]>;
+def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+                    Sched<[WriteCvtF2F]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
+                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR256:$dst,
+                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
+                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR256:$dst,
+                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
+                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+                       VEX, Sched<[WriteCvtF2I]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
+def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
+                       Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
+                       Sched<[WriteCvtF2I]>;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
+                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+}
+
+def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+
+// Convert with truncation packed single/double fp to doubleword
+// SSE2 packed instructions with XS prefix
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                         "cvttps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (int_x86_sse2_cvttps2dq VR128:$src))],
+                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvttps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+                                            (loadv4f32 addr:$src)))],
+                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                          "cvttps2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR256:$dst,
+                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                          "cvttps2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+                                             (loadv8f32 addr:$src)))],
+                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+                          Sched<[WriteCvtF2ILd]>;
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvttps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvttps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+            (VCVTDQ2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VCVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+            (VCVTDQ2PSrr VR128:$src)>;
+  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
+            (VCVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+            (VCVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
+            (VCVTTPS2DQrm addr:$src)>;
+
+  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
+            (VCVTDQ2PSYrr VR256:$src)>;
+  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
+            (VCVTDQ2PSYrm addr:$src)>;
+
+  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+            (VCVTTPS2DQYrr VR256:$src)>;
+  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
+            (VCVTTPS2DQYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+            (CVTDQ2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+            (CVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+            (CVTDQ2PSrr VR128:$src)>;
+  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+            (CVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+            (CVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+            (CVTTPS2DQrm addr:$src)>;
+}
+
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                              (int_x86_sse2_cvttpd2dq VR128:$src))],
+                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+
+// XMM only
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                            (loadv2f64 addr:$src)))],
+                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+            (VCVTTPD2DQYrr VR256:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+            (VCVTTPD2DQYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                        (memopv2f64 addr:$src)))],
+                                        IIC_SSE_CVT_PD_RM>,
+                      Sched<[WriteCvtF2ILd]>;
+
+// Convert packed single to packed double
+let Predicates = [HasAVX] in {
+                  // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+                       IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                   "cvtps2pd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX] in {
+let neverHasSideEffects = 1, mayLoad = 1 in
+def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     []>, VEX, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
+                   Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvtdq2_pd_256
+                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
+                    Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
+                    Sched<[WriteCvtI2F]>;
+}
+
+let neverHasSideEffects = 1, mayLoad = 1 in
+def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
+
+// AVX 256-bit register conversion intrinsics
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+            (VCVTDQ2PDYrr VR128:$src)>;
+  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
+def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+
+// YMM only
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
+// whenever possible to avoid declaring two versions of each one.
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
+            (VCVTDQ2PSYrr VR256:$src)>;
+  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
+            (VCVTDQ2PSYrm addr:$src)>;
+
+  // Match fround and fextend for 128/256-bit conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (VCVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
+            (VCVTPD2PSXrm addr:$src)>;
+  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+            (VCVTPD2PSYrr VR256:$src)>;
+  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+            (VCVTPD2PSYrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (VCVTPS2PDrr VR128:$src)>;
+  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+            (VCVTPS2PDYrr VR128:$src)>;
+  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+            (VCVTPS2PDYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Match fround and fextend for 128 conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (CVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (CVTPD2PSrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (CVTPS2PDrr VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, SDNode OpNode, ValueType VT,
+                            PatFrag ld_frag, string asm, string asm_alt,
+                            OpndItins itins> {
+  def rr : SIi8<0xC2, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+                itins.rr>, Sched<[itins.Sched]>;
+  def rm : SIi8<0xC2, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1),
+                                         (ld_frag addr:$src2), imm:$cc))],
+                                         itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
+                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+    let mayLoad = 1 in
+    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
+                      IIC_SSE_ALU_F32S_RM>,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
+                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S>,
+                 XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
+                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S>, // same latency as 32 bit compare
+                 XD, VEX_4V, VEX_LIG;
+
+let Constraints = "$src1 = $dst" in {
+  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
+                  XS;
+  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                  SSE_ALU_F64S>,
+                  XD;
+}
+
+multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
+                         Intrinsic Int, string asm, OpndItins itins> {
+  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+                        [(set VR128:$dst, (Int VR128:$src1,
+                                               VR128:$src, imm:$cc))],
+                                               itins.rr>,
+           Sched<[itins.Sched]>;
+  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
+                        [(set VR128:$dst, (Int VR128:$src1,
+                                               (load addr:$src), imm:$cc))],
+                                               itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let isCodeGenOnly = 1 in {
+  // Aliases to match intrinsics which expect XMM operand(s).
+  defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                       SSE_ALU_F32S>,
+                       XS, VEX_4V;
+  defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                       SSE_ALU_F32S>, // same latency as f32
+                       XD, VEX_4V;
+  let Constraints = "$src1 = $dst" in {
+    defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+                         SSE_ALU_F32S>, XS;
+    defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+                         SSE_ALU_F64S>,
+                         XD;
+}
+}
+
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+                            ValueType vt, X86MemOperand x86memop,
+                            PatFrag ld_frag, string OpcodeStr> {
+  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+                     IIC_SSE_COMIS_RR>,
+          Sched<[WriteFAdd]>;
+  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                     [(set EFLAGS, (OpNode (vt RC:$src1),
+                                           (ld_frag addr:$src2)))],
+                                           IIC_SSE_COMIS_RM>,
+          Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Defs = [EFLAGS] in {
+  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss">, PS, VEX, VEX_LIG;
+  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd">, PD, VEX, VEX_LIG;
+  let Pattern = []<dag> in {
+    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+                                    "comiss">, PS, VEX, VEX_LIG;
+    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+                                    "comisd">, PD, VEX, VEX_LIG;
+  }
+
+  let isCodeGenOnly = 1 in {
+    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                              load, "ucomiss">, PS, VEX;
+    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                              load, "ucomisd">, PD, VEX;
+
+    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+                              load, "comiss">, PS, VEX;
+    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+                              load, "comisd">, PD, VEX;
+  }
+  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss">, PS;
+  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd">, PD;
+
+  let Pattern = []<dag> in {
+    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+                                    "comiss">, PS;
+    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+                                    "comisd">, PD;
+  }
+
+  let isCodeGenOnly = 1 in {
+    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                                load, "ucomiss">, PS;
+    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                                load, "ucomisd">, PD;
+
+    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+                                    "comiss">, PS;
+    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+                                    "comisd">, PD;
+  }
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, Intrinsic Int, string asm,
+                            string asm_alt, Domain d,
+                            OpndItins itins = SSE_ALU_F32P> {
+  def rri : PIi8<0xC2, MRMSrcReg,
+             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
+             itins.rr, d>,
+            Sched<[WriteFAdd]>;
+  def rmi : PIi8<0xC2, MRMSrcMem,
+             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
+             itins.rm, d>,
+            Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rri_alt : PIi8<0xC2, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+    def rmi_alt : PIi8<0xC2, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], itins.rm, d>,
+               Sched<[WriteFAddLd, ReadAfterLd]>;
+  }
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, PS, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, PD, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, PD, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in {
+  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                 SSEPackedSingle, SSE_ALU_F32P>, PS;
+  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                 SSEPackedDouble, SSE_ALU_F64P>, PD;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+                         ValueType vt, string asm, PatFrag mem_frag,
+                         Domain d, bit IsConvertibleToThreeAddress = 0> {
+  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+            Sched<[WriteFShuffleLd, ReadAfterLd]>;
+  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+              Sched<[WriteFShuffle]>;
+}
+
+defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS;
+  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  // 256-bit patterns
+  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8i32 (X86Shufp VR256:$src1,
+                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4i64 (X86Shufp VR256:$src1,
+                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Generic SHUFPD patterns
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack FP Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d> {
+    def rr : PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))],
+                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
+    def rm : PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1,
+                                       (mem_frag addr:$src2))))],
+                                       IIC_SSE_UNPCK, d>,
+             Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
+      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedSingle>, PS;
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble>, PD;
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedSingle>, PS;
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble>, PD;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
+  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold cause it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold cause it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (UNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
+multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
+                                Domain d> {
+  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+              Sched<[WriteVecLogic]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+                                        "movmskps", SSEPackedSingle>, PS, VEX;
+  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+                                        "movmskpd", SSEPackedDouble>, PD, VEX;
+  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+                                        "movmskps", SSEPackedSingle>, PS,
+                                        VEX, VEX_L;
+  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+                                        "movmskpd", SSEPackedDouble>, PD,
+                                        VEX, VEX_L;
+
+  def : Pat<(i32 (X86fgetsign FR32:$src)),
+            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(i64 (X86fgetsign FR32:$src)),
+            (SUBREG_TO_REG (i64 0),
+             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
+  def : Pat<(i32 (X86fgetsign FR64:$src)),
+            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(i64 (X86fgetsign FR64:$src)),
+            (SUBREG_TO_REG (i64 0),
+             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
+}
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+                                     SSEPackedSingle>, PS;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+                                     SSEPackedDouble>, PD;
+
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
+      Requires<[UseSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+          (SUBREG_TO_REG (i64 0),
+           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
+      Requires<[UseSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
+      Requires<[UseSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+          (SUBREG_TO_REG (i64 0),
+           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
+      Requires<[UseSSE2]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop, OpndItins itins,
+                        bit IsCommutable, bit Is2Addr> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1,
+                                     (bitconvert (memop_frag addr:$src2)))))],
+                                     itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+                         ValueType OpVT128, ValueType OpVT256,
+                         OpndItins itins, bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+                           memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+                               OpVT256, VR256, loadv4i64, i256mem, itins,
+                               IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+// These are ordered here for pattern ordering requirements with the fp versions
+
+defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 1>;
+defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 1>;
+defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 1>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 0>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
+///
+multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
+                                       SDNode OpNode, OpndItins itins> {
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+              PS, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
+        PD, VEX_4V;
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
+                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
+                PS;
+
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
+                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
+                PD;
+  }
+}
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isCodeGenOnly = 1 in {
+  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+                SSE_BIT_ITINS_P>;
+  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+                SSE_BIT_ITINS_P>;
+  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+                SSE_BIT_ITINS_P>;
+
+  let isCommutable = 0 in
+    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+                  SSE_BIT_ITINS_P>;
+}
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+///
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+                                   SDNode OpNode> {
+  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+        !strconcat(OpcodeStr, "ps"), f256mem,
+        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
+
+  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+        !strconcat(OpcodeStr, "pd"), f256mem,
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+                                  (bc_v4i64 (v4f64 VR256:$src2))))],
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+                                  (loadv4i64 addr:$src2)))], 0>,
+                                  PD, VEX_4V, VEX_L;
+
+  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
+  // are all promoted to v2i64, and the patterns are covered by the int
+  // version. This is needed in SSE only, because v2i64 isn't supported on
+  // SSE1, but only on SSE2.
+  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+       !strconcat(OpcodeStr, "ps"), f128mem, [],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+       !strconcat(OpcodeStr, "pd"), f128mem,
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (bc_v2i64 (v2f64 VR128:$src2))))],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (loadv2i64 addr:$src2)))], 0>,
+                                                 PD, VEX_4V;
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+         !strconcat(OpcodeStr, "ps"), f128mem,
+         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                   (memopv2i64 addr:$src2)))]>, PS;
+
+    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+         !strconcat(OpcodeStr, "pd"), f128mem,
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                   (bc_v2i64 (v2f64 VR128:$src2))))],
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                   (memopv2i64 addr:$src2)))]>, PD;
+  }
+}
+
+defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
+defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
+defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
+let isCommutable = 0 in
+  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+
+// AVX1 requires type coercions in order to fold loads directly into logical
+// operations.
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
+/// classes below
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SizeItins itins> {
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                               VR128, v4f32, f128mem, loadv4f32,
+                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                               VR128, v2f64, f128mem, loadv2f64,
+                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+
+  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+                        OpNode, VR256, v8f32, f256mem, loadv8f32,
+                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+                        OpNode, VR256, v4f64, f256mem, loadv4f64,
+                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
+                              itins.s>, PS;
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
+                              itins.d>, PD;
+  }
+}
+
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                              OpNode, FR32, f32mem, itins.s>, XS;
+    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                              OpNode, FR64, f64mem, itins.d>, XD;
+  }
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+                                      SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+                   itins.d, 0>, XD, VEX_4V, VEX_LIG;
+
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+                   itins.s>, XS;
+    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+                   itins.d>, XD;
+  }
+}
+
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+let isCommutable = 0 in {
+  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
+}
+
+let isCodeGenOnly = 1 in {
+  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// a scalar fp operation followed by a blend.
+//
+// These patterns know, for example, how to select an ADDSS from a
+// float add plus vector insert.
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     A[0] += B[0];
+//     return A;
+//   }
+//
+// previously we generated:
+//   addss %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+// 
+// we now generate:
+//   addss %xmm1, %xmm0
+
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))))),
+            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))))),
+            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))))),
+            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))))),
+            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+let Predicates = [UseSSE2] in {
+  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
+
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+let Predicates = [UseSSE41] in {
+  // If the subtarget has SSE4.1 but not AVX, the vector insert
+  // instruction is lowered into a X86insertps rather than a X86Movss.
+  // When selecting SSE scalar single-precision fp arithmetic instructions,
+  // make sure that we correctly match the X86insertps.
+
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                    FR32:$src))), (iPTR 0))),
+            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+let Predicates = [HasAVX] in {
+  // The following patterns select AVX Scalar single/double precision fp
+  // arithmetic instructions.
+
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))))),
+            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// a vector packed single/double fp operation followed by a vector insert.
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     __m128 C = A + B;
+//     return (__m128) {c[0], a[1], a[2], a[3]};
+//   }
+//
+// previously we generated:
+//   addps %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+// 
+// we now generate:
+//   addss %xmm1, %xmm0
+
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
+                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
+                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
+  // from a packed double-precision fp instruction plus movsd.
+
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // The following patterns select AVX Scalar single/double precision fp
+  // arithmetic instructions from a packed single precision fp instruction
+  // plus movss/movsd.
+
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
+/// Unop Arithmetic
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+
+let Sched = WriteFSqrt in {
+def SSE_SQRTPS : OpndItins<
+  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
+>;
+
+def SSE_SQRTSS : OpndItins<
+  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
+>;
+
+def SSE_SQRTPD : OpndItins<
+  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
+>;
+
+def SSE_SQRTSD : OpndItins<
+  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
+>;
+}
+
+let Sched = WriteFRcp in {
+def SSE_RCPP : OpndItins<
+  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+}
+
+/// sse1_fp_unop_s - SSE1 unops in scalar form.
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+                      (ins FR32:$src1, FR32:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+  let mayLoad = 1 in {
+  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                      (ins FR32:$src1,f32mem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let isCodeGenOnly = 1 in
+  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
+  // For scalar unary operations, fold a load into the operation
+  // only in OptForSize mode. It eliminates an instruction, but it also
+  // eliminates a whole-register clobber (the load), so it introduces a
+  // partial register update condition.
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+let isCodeGenOnly = 1 in {
+  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
+                Sched<[itins.Sched]>;
+  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
+                Sched<[itins.Sched.Folded]>;
+}
+}
+
+/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+                       (ins FR32:$src1, FR32:$src2),
+                       !strconcat("v", OpcodeStr,
+                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+  let mayLoad = 1 in {
+  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                      (ins FR32:$src1,f32mem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let isCodeGenOnly = 1 in
+  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
+  // For scalar unary operations, fold a load into the operation
+  // only in OptForSize mode. It eliminates an instruction, but it also
+  // eliminates a whole-register clobber (the load), so it introduces a
+  // partial register update condition.
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rr>, Sched<[itins.Sched]>;
+    let mayLoad = 1, hasSideEffects = 0 in
+    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins> {
+let Predicates = [HasAVX] in {
+  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "ps\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "ps\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
+  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
+multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic V4F32Int, Intrinsic V8F32Int,
+                              OpndItins itins> {
+let isCodeGenOnly = 1 in {
+let Predicates = [HasAVX] in {
+  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                           !strconcat("v", OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR128:$dst, (V4F32Int VR128:$src))],
+                           itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                          !strconcat("v", OpcodeStr,
+                          "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
+                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                            !strconcat("v", OpcodeStr,
+                                       "ps\t{$src, $dst|$dst, $src}"),
+                            [(set VR256:$dst, (V8F32Int VR256:$src))],
+                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
+                          (ins f256mem:$src),
+                          !strconcat("v", OpcodeStr,
+                                    "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
+                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))],
+                    itins.rr>, Sched<[itins.Sched]>;
+  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+                    itins.rm>, Sched<[itins.Sched.Folded]>;
+} // isCodeGenOnly = 1
+}
+
+/// sse2_fp_unop_s - SSE2 unops in scalar form.
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
+                      (ins FR64:$src1, FR64:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+  let mayLoad = 1 in {
+  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+                      (ins FR64:$src1,f64mem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let isCodeGenOnly = 1 in
+  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, sdmem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
+            Sched<[itins.Sched]>;
+  // See the comments in sse1_fp_unop_s for why this is OptForSize.
+  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
+            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
+let isCodeGenOnly = 1 in {
+  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
+                Sched<[itins.Sched]>;
+  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
+                Sched<[itins.Sched.Folded]>;
+}
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in {
+  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "pd\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "pd\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
+  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+// Square root.
+defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
+                            SSE_SQRTSS>,
+             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
+             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
+                            SSE_SQRTSD>,
+             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
+             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
+                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
+defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
+             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
+             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
+                                int_x86_avx_rcp_ps_256, SSE_RCPP>;
+
+let Predicates = [UseAVX] in {
+  def : Pat<(f32 (fsqrt FR32:$src)),
+            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (fsqrt (load addr:$src))),
+            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+  def : Pat<(f64 (fsqrt FR64:$src)),
+            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f64 (fsqrt (load addr:$src))),
+            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+
+  def : Pat<(f32 (X86frsqrt FR32:$src)),
+            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+
+  def : Pat<(f32 (X86frcp FR32:$src)),
+            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+}
+let Predicates = [UseAVX] in {
+  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
+            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
+                              VR128)>;
+  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
+            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
+                              VR128)>;
+  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
+                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
+                              VR128)>;
+  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
+            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
+                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
+                              VR128)>;
+  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
+            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+let Predicates = [UseSSE1] in {
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (RCPSSr_Int VR128:$src, VR128:$src)>;
+}
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteStore] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f32 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v2f64 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
+                         (ins f128mem:$dst, VR128:$src),
+                         "movntdq\t{$src, $dst|$dst, $src}",
+                         [(alignednontemporalstore (v2i64 VR128:$src),
+                                                   addr:$dst)],
+                                                   IIC_SSE_MOVNT>, VEX;
+
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v8f32 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f64 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+                    (ins f256mem:$dst, VR256:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4i64 VR256:$src),
+                                              addr:$dst)],
+                                              IIC_SSE_MOVNT>, VEX, VEX_L;
+
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntps\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntpd\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+
+// There is no AVX form for instructions below this point
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "movnti{l}\t{$src, $dst|$dst, $src}",
+                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+                 IIC_SSE_MOVNT>,
+               PS, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movnti{q}\t{$src, $dst|$dst, $src}",
+                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+                     IIC_SSE_MOVNT>,
+                  PS, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStore]
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+}
+
+// FIXME: How should flush instruction be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteNop] in {
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins),  
+              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, 
+              OBXS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteFence] in {
+// Load, store, and memory fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+               TB, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+               TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+               TB, Requires<[HasSSE2]>;
+} // SchedRW
+
+def : Pat<(X86SFence), (SFENCE)>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store XCSR register
+//===----------------------------------------------------------------------===//
+
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                  IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                  IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
+def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+                    VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+                    VEX, VEX_L;
+def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+                    VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+                    VEX, VEX_L;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteMove] in {
+def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVA_P_RR>,
+                        VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVU_P_RR>,
+                        VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+                   VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+                   VEX, VEX_L;
+let Predicates = [HasAVX] in {
+  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                    XS, VEX;
+  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                    XS, VEX, VEX_L;
+}
+}
+
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i128mem:$dst, VR128:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+                     VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i256mem:$dst, VR256:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+                     VEX, VEX_L;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX, VEX_L;
+}
+}
+
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqa\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqu\t{$src, $dst|$dst, $src}",
+                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+                   IIC_SSE_MOVA_P_RM>;
+def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+                   IIC_SSE_MOVU_P_RM>,
+                 XS, Requires<[UseSSE2]>;
+}
+
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+                   IIC_SSE_MOVU_P_MR>,
+                 XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+            (VMOVDQUmr addr:$dst, VR128:$src)>;
+  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
+            (VMOVDQUYmr addr:$dst, VR256:$src)>;
+}
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+          (MOVDQUmr addr:$dst, VR128:$src)>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecIMul in
+def SSE_PMADD : OpndItins<
+  IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                            RegisterClass RC, PatFrag memop_frag,
+                            X86MemOperand x86memop,
+                            OpndItins itins,
+                            bit IsCommutable = 0,
+                            bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
+      Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
+       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+                             Intrinsic IntId256, OpndItins itins,
+                             bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
+                                 VR128, loadv2i64, i128mem, itins,
+                                 IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
+                               i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
+                                   VR256, loadv4i64, i256mem, itins,
+                                   IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+                         string OpcodeStr, SDNode OpNode,
+                         SDNode OpNode2, RegisterClass RC,
+                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
+                         ShiftOpndItins itins,
+                         bit Is2Addr = 1> {
+  // src2 is always 128-bit
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+        itins.rr>, Sched<[WriteVecShift]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode RC:$src1,
+                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
+      Sched<[WriteVecShiftLd, ReadAfterLd]>;
+  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+       (ins RC:$src1, i8imm:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
+       Sched<[WriteVecShift]>;
+}
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0, bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+                             SSE_INTALUQ_ITINS_P, 1>;
+defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
+defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+                             SSE_INTALUQ_ITINS_P, 0>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+
+// Intrinsic forms
+defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
+                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
+defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
+                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
+defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
+                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
+                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
+                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
+                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
+defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
+                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
+                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
+defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
+                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
+defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
+                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
+
+let Predicates = [HasAVX] in
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+                              VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+                               VR256, loadv4i64, i256mem,
+                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let Predicates = [HasAVX] in {
+defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+                            VR128, v8i16, v8i16, bc_v8i16,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+                            VR128, v4i32, v4i32, bc_v4i32,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+                            VR128, v2i64, v2i64, bc_v2i64,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+                            VR128, v8i16, v8i16, bc_v8i16,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+                            VR128, v4i32, v4i32, bc_v4i32,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+                            VR128, v2i64, v2i64, bc_v2i64,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+                            VR128, v8i16, v8i16, bc_v8i16,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+                            VR128, v4i32, v4i32, bc_v4i32,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+  // 128-bit logical shifts.
+  def VPSLLDQri : PDIi8<0x73, MRM7r,
+                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
+                    VEX_4V;
+  def VPSRLDQri : PDIi8<0x73, MRM3r,
+                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
+                    VEX_4V;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Predicates = [HasAVX]
+
+let Predicates = [HasAVX2] in {
+defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+                             VR256, v16i16, v8i16, bc_v8i16,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+                             VR256, v8i32, v4i32, bc_v4i32,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+                             VR256, v4i64, v2i64, bc_v2i64,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+                             VR256, v16i16, v8i16, bc_v8i16,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+                             VR256, v8i32, v4i32, bc_v4i32,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+                             VR256, v4i64, v2i64, bc_v2i64,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+                             VR256, v16i16, v8i16, bc_v8i16,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+                             VR256, v8i32, v4i32, bc_v4i32,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+  // 256-bit logical shifts.
+  def VPSLLDQYri : PDIi8<0x73, MRM7r,
+                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR256:$dst,
+                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
+                    VEX_4V, VEX_L;
+  def VPSRLDQYri : PDIi8<0x73, MRM3r,
+                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR256:$dst,
+                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
+                    VEX_4V, VEX_L;
+  // PSRADQYri doesn't exist in SSE[1-3].
+}
+} // Predicates = [HasAVX2]
+
+let Constraints = "$src1 = $dst" in {
+defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+                           VR128, v8i16, v8i16, bc_v8i16,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+                           VR128, v4i32, v4i32, bc_v4i32,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+                           VR128, v2i64, v2i64, bc_v2i64,
+                           SSE_INTSHIFT_ITINS_P>;
+
+defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+                           VR128, v8i16, v8i16, bc_v8i16,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+                           VR128, v4i32, v4i32, bc_v4i32,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+                           VR128, v2i64, v2i64, bc_v2i64,
+                           SSE_INTSHIFT_ITINS_P>;
+
+defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+                           VR128, v8i16, v8i16, bc_v8i16,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+                           VR128, v4i32, v4i32, bc_v4i32,
+                           SSE_INTSHIFT_ITINS_P>;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+  // 128-bit logical shifts.
+  def PSLLDQri : PDIi8<0x73, MRM7r,
+                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       "pslldq\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
+                         IIC_SSE_INTSHDQ_P_RI>;
+  def PSRLDQri : PDIi8<0x73, MRM3r,
+                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       "psrldq\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
+                         IIC_SSE_INTSHDQ_P_RI>;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
+            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
+            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
+            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
+            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
+            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
+            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+                         SDNode OpNode> {
+let Predicates = [HasAVX] in {
+  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, i8imm:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [(set VR128:$dst,
+                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+                      (ins i128mem:$src1, i8imm:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR128:$dst,
+                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
+                  Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX2] in {
+  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+                       (ins VR256:$src1, i8imm:$src2),
+                       !strconcat("v", OpcodeStr,
+                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       [(set VR256:$dst,
+                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+                       (ins i256mem:$src1, i8imm:$src2),
+                       !strconcat("v", OpcodeStr,
+                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [(set VR256:$dst,
+                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
+                   Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+  def ri : Ii8<0x70, MRMSrcReg,
+               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR128:$dst,
+                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
+  def mi : Ii8<0x70, MRMSrcMem,
+               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR128:$dst,
+                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
+           Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
+            (VPSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+            (PSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (PSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+                     bit Is2Addr = 1> {
+  def rr : PDI<opc, MRMSrcReg,
+               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+               !if(Is2Addr,
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+               [(set VR128:$dst,
+                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+               Sched<[WriteShuffle]>;
+  def rm : PDI<opc, MRMSrcMem,
+               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+               !if(Is2Addr,
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+               [(set VR128:$dst,
+                     (OutVT (OpNode VR128:$src1,
+                                    (bc_frag (memopv2i64 addr:$src2)))))]>,
+               Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                       ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+  def Yrr : PDI<opc, MRMSrcReg,
+                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+                !strconcat(OpcodeStr,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR256:$dst,
+                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+                Sched<[WriteShuffle]>;
+  def Yrm : PDI<opc, MRMSrcMem,
+                (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+                !strconcat(OpcodeStr,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR256:$dst,
+                      (OutVT (OpNode VR256:$src1,
+                                     (bc_frag (memopv4i64 addr:$src2)))))]>,
+                Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+                     bit Is2Addr = 1> {
+  def rr : SS48I<opc, MRMSrcReg,
+                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                 !if(Is2Addr,
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     !strconcat(OpcodeStr,
+                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+                 [(set VR128:$dst,
+                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+                 Sched<[WriteShuffle]>;
+  def rm : SS48I<opc, MRMSrcMem,
+                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                 !if(Is2Addr,
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     !strconcat(OpcodeStr,
+                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+                 [(set VR128:$dst,
+                       (OutVT (OpNode VR128:$src1,
+                                      (bc_frag (memopv2i64 addr:$src2)))))]>,
+                 Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+  def Yrr : SS48I<opc, MRMSrcReg,
+                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+                  !strconcat(OpcodeStr,
+                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR256:$dst,
+                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+                  Sched<[WriteShuffle]>;
+  def Yrm : SS48I<opc, MRMSrcMem,
+                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+                  !strconcat(OpcodeStr,
+                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR256:$dst,
+                        (OutVT (OpNode VR256:$src1,
+                                       (bc_frag (memopv4i64 addr:$src2)))))]>,
+                  Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
+                             bc_v8i16, 0>, VEX_4V;
+  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
+                             bc_v4i32, 0>, VEX_4V;
+
+  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
+                             bc_v8i16, 0>, VEX_4V;
+  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
+                             bc_v4i32, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+  defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
+                               bc_v16i16>, VEX_4V, VEX_L;
+  defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
+                               bc_v8i32>, VEX_4V, VEX_L;
+
+  defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
+                               bc_v16i16>, VEX_4V, VEX_L;
+  defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
+                               bc_v8i32>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
+                            bc_v8i16>;
+  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
+                            bc_v4i32>;
+
+  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
+                            bc_v8i16>;
+
+  let Predicates = [HasSSE41] in
+  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
+                            bc_v4i32>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
+  def rr : PDI<opc, MRMSrcReg,
+      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+  def rm : PDI<opc, MRMSrcMem,
+      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set VR128:$dst, (OpNode VR128:$src1,
+                                  (bc_frag (memopv2i64
+                                               addr:$src2))))],
+                                               IIC_SSE_UNPCK>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+                         SDNode OpNode, PatFrag bc_frag> {
+  def Yrr : PDI<opc, MRMSrcReg,
+      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+      Sched<[WriteShuffle]>;
+  def Yrm : PDI<opc, MRMSrcMem,
+      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (OpNode VR256:$src1,
+                                  (bc_frag (memopv4i64 addr:$src2))))]>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
+                                 bc_v16i8, 0>, VEX_4V;
+  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
+                                 bc_v8i16, 0>, VEX_4V;
+  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+                                 bc_v4i32, 0>, VEX_4V;
+  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+                                 bc_v2i64, 0>, VEX_4V;
+
+  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
+                                 bc_v16i8, 0>, VEX_4V;
+  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
+                                 bc_v8i16, 0>, VEX_4V;
+  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
+                                 bc_v4i32, 0>, VEX_4V;
+  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
+                                 bc_v2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
+                                   bc_v32i8>, VEX_4V, VEX_L;
+  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
+                                   bc_v16i16>, VEX_4V, VEX_L;
+  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
+                                   bc_v8i32>, VEX_4V, VEX_L;
+  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
+                                   bc_v4i64>, VEX_4V, VEX_L;
+
+  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
+                                   bc_v32i8>, VEX_4V, VEX_L;
+  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
+                                   bc_v16i16>, VEX_4V, VEX_L;
+  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
+                                   bc_v8i32>, VEX_4V, VEX_L;
+  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
+                                   bc_v4i64>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
+                                bc_v16i8>;
+  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
+                                bc_v8i16>;
+  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
+                                bc_v4i32>;
+  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
+                                bc_v2i64>;
+
+  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
+                                bc_v16i8>;
+  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
+                                bc_v8i16>;
+  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
+                                bc_v4i32>;
+  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
+                                bc_v2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+  def rri : Ii8<0xC4, MRMSrcReg,
+       (outs VR128:$dst), (ins VR128:$src1,
+        GR32orGR64:$src2, i32i8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
+  def rmi : Ii8<0xC4, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1,
+                        i16mem:$src2, i32i8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+                    imm:$src3))], IIC_SSE_PINSRW>,
+       Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Extract
+let Predicates = [HasAVX] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))]>, PD, VEX,
+                Sched<[WriteShuffle]>;
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))], IIC_SSE_PEXTRW>,
+               Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+// Insert
+let Predicates = [HasAVX] in
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
+
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, PD;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+
+def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           IIC_SSE_MOVMSK>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR256:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
+           VEX, VEX_L;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           IIC_SSE_MOVMSK>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+
+let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+           IIC_SSE_MASKMOV>, VEX;
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+           IIC_SSE_MASKMOV>, VEX;
+
+let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+           IIC_SSE_MASKMOV>;
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+           IIC_SSE_MASKMOV>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteMove]>;
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>,
+                      VEX, Sched<[WriteLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                  Sched<[WriteMove]>;
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+let isCodeGenOnly = 1 in {
+  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteLoad]>;
+  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+}
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+                    Sched<[WriteMove]>;
+def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                                     VEX, Sched<[WriteStore]>;
+def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                   Sched<[WriteMove]>;
+def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)],
+                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+let SchedRW = [WriteMove] in {
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                                           (iPTR 0)))],
+                                                           IIC_SSE_MOVD_ToGP>,
+                      VEX;
+
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                                         (iPTR 0)))],
+                                                         IIC_SSE_MOVD_ToGP>;
+} //SchedRW
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let isCodeGenOnly = 1 in {
+  let Predicates = [UseAVX] in
+  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                          VEX, Sched<[WriteLoad]>;
+  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst, (bitconvert FR64:$src))],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                         "mov{d|q}\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64:$src))],
+                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Double Int
+//
+let isCodeGenOnly = 1 in {
+  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
+
+//===---------------------------------------------------------------------===//
+// Patterns and instructions to describe movd/movq to XMM register zero-extends
+//
+let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
+let AddedComplexity = 15 in {
+def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
+                       [(set VR128:$dst, (v2i64 (X86vzmovl
+                                      (v2i64 (scalar_to_vector GR64:$src)))))],
+                                      IIC_SSE_MOVDQ>,
+                                      VEX, VEX_W;
+def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+                       [(set VR128:$dst, (v2i64 (X86vzmovl
+                                      (v2i64 (scalar_to_vector GR64:$src)))))],
+                                      IIC_SSE_MOVDQ>;
+}
+} // isCodeGenOnly, SchedRW
+
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIrr GR32:$src)>;
+
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVDI2PDIrm addr:$src)>;
+  }
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (MOVDI2PDIrr GR32:$src)>;
+
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+  }
+}
+
+// These are the correct encodings of the instructions so that we know how to
+// read correct assembly, even though we continue to emit the wrong ones for
+// compatibility with Darwin's buggy assembler.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+let SchedRW = [WriteLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                    VEX, Requires<[UseAVX]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+                      IIC_SSE_MOVDQ>, XS,
+                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+let SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)],
+                                    IIC_SSE_MOVDQ>, VEX;
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)],
+                                    IIC_SSE_MOVDQ>;
+} // SchedRW
+
+// For disassembler only
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteVecLogic] in {
+def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+}
+
+//===---------------------------------------------------------------------===//
+// Store / copy lower 64-bits of a XMM register.
+//
+let Predicates = [UseAVX] in
+def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
+          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
+          (MOVPQI2QImr addr:$dst, VR128:$src)>;
+
+let isCodeGenOnly = 1, AddedComplexity = 20 in {
+def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "vmovq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))],
+                                                 IIC_SSE_MOVDQ>,
+                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
+
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))],
+                                                 IIC_SSE_MOVDQ>,
+                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+}
+
+let Predicates = [UseAVX], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (MOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (alignedX86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
+// IA32 document. movq xmm1, xmm2 does clear the high bits.
+//
+let SchedRW = [WriteVecLogic] in {
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+                    IIC_SSE_MOVQ_RR>,
+                      XS, VEX, Requires<[UseAVX]>;
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+                    IIC_SSE_MOVQ_RR>,
+                      XS, Requires<[UseSSE2]>;
+} // SchedRW
+
+let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))],
+                                             IIC_SSE_MOVDQ>,
+                      XS, VEX, Requires<[UseAVX]>;
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))],
+                                             IIC_SSE_MOVDQ>,
+                      XS, Requires<[UseSSE2]>;
+}
+} // isCodeGenOnly, SchedRW
+
+let AddedComplexity = 20 in {
+  let Predicates = [UseAVX] in {
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (VMOVZPQILo2PQIrr VR128:$src)>;
+  }
+  let Predicates = [UseSSE2] in {
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (MOVZPQILo2PQIrr VR128:$src)>;
+  }
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))],
+                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+                      IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
+  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
+  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (VMOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VMOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (VMOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VMOVSLDUPrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+            (VMOVSHDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+            (VMOVSHDUPYrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+            (VMOVSLDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+            (VMOVSLDUPYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (MOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (MOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_dfp<string OpcodeStr> {
+let neverHasSideEffects = 1 in
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (v2f64 (X86Movddup
+                              (scalar_to_vector (loadf64 addr:$src)))))],
+                              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+// FIXME: Merge with above classe when there're patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+                    Sched<[WriteFShuffle]>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst,
+                      (v4f64 (X86Movddup
+                              (scalar_to_vector (loadf64 addr:$src)))))]>,
+                    Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
+  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (MOVDDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in {
+  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "vlddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "vlddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+                   VEX, VEX_L;
+}
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "lddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+                   IIC_SSE_LDDQU>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, OpndItins itins,
+                       bit Is2Addr = 1> {
+  def rr : I<0xD0, MRMSrcReg,
+       (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : I<0xD0, MRMSrcMem,
+       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let ExeDomain = SSEPackedSingle in {
+    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
+                                 f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V;
+    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
+                               f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
+                                 f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V;
+    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
+                           f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L;
+  }
+}
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+  let ExeDomain = SSEPackedSingle in
+  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
+                              f128mem, SSE_ALU_F32P>, XD;
+  let ExeDomain = SSEPackedDouble in
+  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
+                              f128mem, SSE_ALU_F64P>, PD;
+}
+
+// Patterns used to select 'addsub' instructions.
+let Predicates = [HasAVX] in {
+  // Constant 170 corresponds to the binary mask '10101010'.
+  // When used as a blend mask, it allows selecting eight elements from two
+  // input vectors as follow:
+  // - Even-numbered values in the destination are copied from
+  //   the corresponding elements in the first input vector;
+  // - Odd-numbered values in the destination are copied from
+  //   the corresponding elements in the second input vector.
+
+  def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)),
+                              (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))),
+            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+
+  // Constant 10 corresponds to the binary mask '1010'.
+  // In the two pattens below, constant 10 is used as a blend mask to select
+  // - the 1st and 3rd element from the first input vector (the 'fsub' node);
+  // - the 2nd and 4th element from the second input vector (the 'fadd' node).
+
+  def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+                             (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+                              (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+                              (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+                              (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), 
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+                             (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
+let Predicates = [UseSSE3] in {
+  // Constant 10 corresponds to the binary mask '1010'.
+  // In the pattern below, it is used as a blend mask to select:
+  // - the 1st and 3rd element from the first input vector (the fsub node);
+  // - the 2nd and 4th element from the second input vector (the fadd node).
+
+  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+                              (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+                              (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), 
+            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+                             (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+      Sched<[WriteFAdd]>;
+
+  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+      Sched<[WriteFAdd]>;
+
+  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let ExeDomain = SSEPackedSingle in {
+    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+                            X86fhadd, 0>, VEX_4V;
+    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+                            X86fhsub, 0>, VEX_4V;
+    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+                            X86fhadd, 0>, VEX_4V, VEX_L;
+    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+                            X86fhsub, 0>, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
+                            X86fhadd, 0>, VEX_4V;
+    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
+                            X86fhsub, 0>, VEX_4V;
+    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
+                            X86fhadd, 0>, VEX_4V, VEX_L;
+    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
+                            X86fhsub, 0>, VEX_4V, VEX_L;
+  }
+}
+
+let Constraints = "$src1 = $dst" in {
+  let ExeDomain = SSEPackedSingle in {
+    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
+  }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+
+/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
+                            Intrinsic IntId128> {
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
+                    Sched<[WriteVecALU]>;
+
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins i128mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (IntId128
+                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
+                    Sched<[WriteVecALULd]>;
+}
+
+/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId256> {
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+                    (ins VR256:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
+                    Sched<[WriteVecALU]>;
+
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+                    (ins i256mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst,
+                      (IntId256
+                       (bitconvert (memopv4i64 addr:$src))))]>,
+                    Sched<[WriteVecALULd]>;
+}
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+                                               VR128:$src))>;
+def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
+def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+                                               VR256:$src))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
+
+let Predicates = [HasAVX] in {
+  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
+                                  int_x86_ssse3_pabs_b_128>, VEX;
+  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
+                                  int_x86_ssse3_pabs_w_128>, VEX;
+  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
+                                  int_x86_ssse3_pabs_d_128>, VEX;
+
+  def : Pat<(xor
+            (bc_v2i64 (v16i1sextv16i8)),
+            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+            (VPABSBrr128 VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v8i1sextv8i16)),
+            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+            (VPABSWrr128 VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v4i1sextv4i32)),
+            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+            (VPABSDrr128 VR128:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
+                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
+  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
+                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
+  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
+                                    int_x86_avx2_pabs_d>, VEX, VEX_L;
+
+  def : Pat<(xor
+            (bc_v4i64 (v32i1sextv32i8)),
+            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
+            (VPABSBrr256 VR256:$src)>;
+  def : Pat<(xor
+            (bc_v4i64 (v16i1sextv16i16)),
+            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
+            (VPABSWrr256 VR256:$src)>;
+  def : Pat<(xor
+            (bc_v4i64 (v8i1sextv8i32)),
+            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
+            (VPABSDrr256 VR256:$src)>;
+}
+
+defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
+                              int_x86_ssse3_pabs_b_128>;
+defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
+                              int_x86_ssse3_pabs_w_128>;
+defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
+                              int_x86_ssse3_pabs_d_128>;
+
+let Predicates = [HasSSSE3] in {
+  def : Pat<(xor
+            (bc_v2i64 (v16i1sextv16i8)),
+            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+            (PABSBrr128 VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v8i1sextv8i16)),
+            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+            (PABSWrr128 VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v4i1sextv4i32)),
+            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+            (PABSDrr128 VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def SSE_PHADDSUBD : OpndItins<
+  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+}
+let Sched = WriteShuffle in
+def SSE_PSHUFB : OpndItins<
+  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+let Sched = WriteVecALU in
+def SSE_PSIGN : OpndItins<
+  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+let Sched = WriteVecIMul in
+def SSE_PMULHRSW : OpndItins<
+  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                         X86MemOperand x86memop, OpndItins itins,
+                         bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1,
+          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                             Intrinsic IntId128, OpndItins itins,
+                             bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+       Sched<[itins.Sched]>;
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1,
+          (bitconvert (memopv2i64 addr:$src2))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId256,
+                               X86FoldableSchedWrite Sched> {
+  let isCommutable = 1 in
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+       (ins VR256:$src1, VR256:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+       Sched<[Sched]>;
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+       (ins VR256:$src1, i256mem:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst,
+         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+       Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V;
+  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V;
+  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V;
+  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V;
+  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PSIGN, 0>, VEX_4V;
+  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PSIGN, 0>, VEX_4V;
+  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PSIGN, 0>, VEX_4V;
+  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PSHUFB, 0>, VEX_4V;
+  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
+                                      int_x86_ssse3_phadd_sw_128,
+                                      SSE_PHADDSUBSW, 0>, VEX_4V;
+  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
+                                      int_x86_ssse3_phsub_sw_128,
+                                      SSE_PHADDSUBSW, 0>, VEX_4V;
+  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
+                                      int_x86_ssse3_pmadd_ub_sw_128,
+                                      SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
+                                      int_x86_ssse3_pmul_hr_sw_128,
+                                      SSE_PMULHRSW, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+                                        int_x86_avx2_phadd_sw,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+                                        int_x86_avx2_phsub_sw,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
+                                       int_x86_avx2_pmadd_ub_sw,
+                                        WriteVecIMul>, VEX_4V, VEX_L;
+}
+defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
+                                        int_x86_avx2_pmul_hr_sw,
+                                        WriteVecIMul>, VEX_4V, VEX_L;
+}
+
+// None of these have i8 immediate fields.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
+  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
+  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
+  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
+  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
+                                 memopv2i64, i128mem, SSE_PSIGN>;
+  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PSIGN>;
+  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PSIGN>;
+  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+                                 memopv2i64, i128mem, SSE_PSHUFB>;
+  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
+                                     int_x86_ssse3_phadd_sw_128,
+                                     SSE_PHADDSUBSW>;
+  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
+                                     int_x86_ssse3_phsub_sw_128,
+                                     SSE_PHADDSUBSW>;
+  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
+                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
+}
+defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmul_hr_sw_128,
+                                     SSE_PMULHRSW>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+  let neverHasSideEffects = 1 in {
+  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
+  let mayLoad = 1 in
+  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  }
+}
+
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
+  let neverHasSideEffects = 1 in {
+  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, Sched<[WriteShuffle]>;
+  let mayLoad = 1 in
+  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  }
+}
+
+let Predicates = [HasAVX] in
+  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+  defm PALIGN : ssse3_palignr<"palignr">;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSSE3] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+                Requires<[HasSSE3]>;
+}
+
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+                 TB, Requires<[HasSSE3]>;
+let Uses = [ECX, EAX] in
+def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
+                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
+                TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+      Requires<[Not64BitMode]>;
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+      Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
+                 Sched<[itins.Sched]>;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR128:$dst,
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
+         itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
+
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
+                  Sched<[Sched.Folded]>;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
+                                     int_x86_sse41_pmovsxbw,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
+                                     int_x86_sse41_pmovsxwd,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
+                                     int_x86_sse41_pmovsxdq,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
+                                     int_x86_sse41_pmovzxbw,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
+                                     int_x86_sse41_pmovzxwd,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
+                                     int_x86_sse41_pmovzxdq,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
+                                        int_x86_avx2_pmovsxbw,
+                                        WriteShuffle>, VEX, VEX_L;
+defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
+                                        int_x86_avx2_pmovsxwd,
+                                        WriteShuffle>, VEX, VEX_L;
+defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
+                                        int_x86_avx2_pmovsxdq,
+                                        WriteShuffle>, VEX, VEX_L;
+defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
+                                        int_x86_avx2_pmovzxbw,
+                                        WriteShuffle>, VEX, VEX_L;
+defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
+                                        int_x86_avx2_pmovzxwd,
+                                        WriteShuffle>, VEX, VEX_L;
+defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
+                                        int_x86_avx2_pmovzxdq,
+                                        WriteShuffle>, VEX, VEX_L;
+}
+
+defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVSXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVSXDQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVZXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVZXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (PMOVSXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (PMOVSXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (PMOVSXDQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (PMOVZXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (PMOVZXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
+                 Sched<[itins.Sched]>;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR128:$dst,
+         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
+         itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
+
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR256:$dst,
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+         Sched<[Sched.Folded]>;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq,
+                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
+                                       int_x86_avx2_pmovsxbd, WriteShuffle>,
+                                       VEX, VEX_L;
+defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
+                                       int_x86_avx2_pmovsxwq, WriteShuffle>,
+                                       VEX, VEX_L;
+defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
+                                       int_x86_avx2_pmovzxbd, WriteShuffle>,
+                                       VEX, VEX_L;
+defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
+                                       int_x86_avx2_pmovzxwq, WriteShuffle>,
+                                       VEX, VEX_L;
+}
+
+defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
+                                      SSE_INTALU_ITINS_SHUFF_P>;
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXWQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXWQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+            (PMOVSXWQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+            (PMOVZXWQrm addr:$src)>;
+}
+
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               X86FoldableSchedWrite Sched> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
+
+  // Expecting a i16 load any extended to i32 value.
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId (bitconvert
+                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+                 Sched<[Sched.Folded]>;
+}
+
+multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
+
+  // Expecting a i16 load any extended to i32 value.
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId (bitconvert
+                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+                 Sched<[Sched.Folded]>;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq,
+                                     WriteShuffle>, VEX;
+defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq,
+                                     WriteShuffle>, VEX;
+}
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq,
+                                       WriteShuffle>, VEX, VEX_L;
+defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq,
+                                       WriteShuffle>, VEX, VEX_L;
+}
+defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
+                                      WriteShuffle>;
+defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
+                                      WriteShuffle>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
+  def : Pat<(v8i32  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64  (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+
+  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
+            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
+            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
+            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXWDYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXDQYrm addr:$src)>;
+
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
+  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
+
+  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
+            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
+            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
+            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (VPMOVZXDQrm addr:$src)>;
+
+  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+                                         imm:$src2))]>,
+                  Sched<[WriteShuffle]>;
+  let neverHasSideEffects = 1, mayStore = 1,
+      SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
+						 imm:$src2)))), addr:$dst)]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+
+defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, Sched<[WriteShuffle]>;
+
+  let neverHasSideEffects = 1, mayStore = 1,
+      SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
+						  imm:$src2)))), addr:$dst)]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst,
+                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
+                  Sched<[WriteShuffle]>;
+  let SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+                          addr:$dst)]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR64:$dst,
+                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
+                  Sched<[WriteShuffle]>, REX_W;
+  let SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+                          addr:$dst)]>, REX_W;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+                            OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32orGR64:$dst,
+                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+                    itins.rr>, Sched<[WriteFBlend]>;
+  let SchedRW = [WriteFBlendLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+                          addr:$dst)], itins.rm>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [UseAVX] in
+    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[HasAVX]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[UseSSE41]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+      Sched<[WriteShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
+
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+      Sched<[WriteShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+      Sched<[WriteShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes, there's the first two here below which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+                           OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+      Sched<[WriteFShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insertps VR128:$src1,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                    imm:$src3))], itins.rm>,
+      Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [UseAVX] in
+    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+  let Constraints = "$src1 = $dst" in
+    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
+}
+
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                   imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag32, PatFrag mem_frag64,
+                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr : SS4AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+  // Vector intrinsic operation, mem
+  def PSm : SS4AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : SS4AIi8<opcpd, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+  // Vector intrinsic operation, mem
+  def PDm : SS4AIi8<opcpd, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain in {
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      []>, Sched<[WriteFAdd]>;
+
+  // Intrinsic operation, reg.
+  let isCodeGenOnly = 1 in
+  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+        Sched<[WriteFAdd]>;
+
+  // Intrinsic operation, mem.
+  def SSm : SS4AIi8<opcss, MRMSrcMem,
+        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst,
+             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+        Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SDr : SS4AIi8<opcsd, MRMSrcReg,
+        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        []>, Sched<[WriteFAdd]>;
+
+  // Intrinsic operation, reg.
+  let isCodeGenOnly = 1 in
+  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+        Sched<[WriteFAdd]>;
+
+  // Intrinsic operation, mem.
+  def SDm : SS4AIi8<opcsd, MRMSrcMem,
+        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst,
+              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+        Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+let Predicates = [HasAVX] in {
+  // Intrinsic form
+  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
+                                  loadv4f32, loadv2f64,
+                                  int_x86_sse41_round_ps,
+                                  int_x86_sse41_round_pd>, VEX;
+  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
+                                  loadv8f32, loadv4f64,
+                                  int_x86_avx_round_ps_256,
+                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
+  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
+                                  int_x86_sse41_round_ss,
+                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+
+  def : Pat<(ffloor FR32:$src),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v4f32 (fnearbyint VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0xC))>;
+  def : Pat<(v4f32 (fceil VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x2))>;
+  def : Pat<(v4f32 (frint VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x4))>;
+  def : Pat<(v4f32 (ftrunc VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x3))>;
+
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (fnearbyint VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0xC))>;
+  def : Pat<(v2f64 (fceil VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x2))>;
+  def : Pat<(v2f64 (frint VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x4))>;
+  def : Pat<(v2f64 (ftrunc VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x3))>;
+
+  def : Pat<(v8f32 (ffloor VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x1))>;
+  def : Pat<(v8f32 (fnearbyint VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0xC))>;
+  def : Pat<(v8f32 (fceil VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x2))>;
+  def : Pat<(v8f32 (frint VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x4))>;
+  def : Pat<(v8f32 (ftrunc VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x3))>;
+
+  def : Pat<(v4f64 (ffloor VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x1))>;
+  def : Pat<(v4f64 (fnearbyint VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0xC))>;
+  def : Pat<(v4f64 (fceil VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x2))>;
+  def : Pat<(v4f64 (frint VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x4))>;
+  def : Pat<(v4f64 (ftrunc VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x3))>;
+}
+
+defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
+                               memopv4f32, memopv2f64,
+                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+let Constraints = "$src1 = $dst" in
+defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(ffloor FR32:$src),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v4f32 (fnearbyint VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0xC))>;
+  def : Pat<(v4f32 (fceil VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x2))>;
+  def : Pat<(v4f32 (frint VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x4))>;
+  def : Pat<(v4f32 (ftrunc VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x3))>;
+
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (fnearbyint VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0xC))>;
+  def : Pat<(v2f64 (fceil VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x2))>;
+  def : Pat<(v2f64 (frint VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x4))>;
+  def : Pat<(v2f64 (ftrunc VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x3))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction we'll lower to this in X86ISelLowering primarily from
+// the intel intrinsic that corresponds to this.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+                Sched<[WriteVecLogic]>, VEX;
+def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
+                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+                Sched<[WriteVecLogic]>, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
+                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+              Sched<[WriteVecLogic]>;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+              Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+// The bit test instructions below are AVX only
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
+            Sched<[WriteVecLogic]>, VEX;
+  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
+                            VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
+                            VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+                     OpSize16, XS;
+  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+                      Sched<[WriteFAddLd]>, OpSize16, XS;
+
+  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+                     OpSize32, XS;
+
+  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+                      Sched<[WriteFAddLd]>, OpSize32, XS;
+
+  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
+  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+                       Sched<[WriteFAddLd]>, XS;
+}
+
+
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId128,
+                                 X86FoldableSchedWrite Sched> {
+  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
+                    Sched<[Sched]>;
+  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                     (ins i128mem:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set VR128:$dst,
+                       (IntId128 (bitconvert (memopv2i64 addr:$src))))]>,
+                    Sched<[Sched.Folded]>;
+}
+
+// PHMIN has the same profile as PSAD, thus we use the same scheduling
+// model, although the naming is misleading.
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+                                         int_x86_sse41_phminposuw,
+                                         WriteVecIMul>, VEX;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+                                         int_x86_sse41_phminposuw,
+                                         WriteVecIMul>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId128, bit Is2Addr = 1,
+                              OpndItins itins = DEFAULT_ITINS> {
+  let isCommutable = 1 in
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
+       itins.rr>, Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))],
+       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+                                Intrinsic IntId256,
+                                X86FoldableSchedWrite Sched> {
+  let isCommutable = 1 in
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
+       (ins VR256:$src1, VR256:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+       Sched<[Sched]>;
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
+       (ins VR256:$src1, i256mem:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst,
+         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+       Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop, bit Is2Addr = 1,
+                          OpndItins itins = SSE_INTALU_ITINS_P> {
+  let isCommutable = 1 in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0, bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let isCommutable = 0 in
+  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+                                   VR128, loadv2i64, i128mem,
+                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+  let isCommutable = 0 in
+  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in
+  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+                                  VR128, memopv2i64, i128mem,
+                                  SSE_INTMUL_ITINS_P, 1>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+                                 memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                 VEX_4V;
+  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+                                 memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                 VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+                                  memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+                                  memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
+  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+                 X86MemOperand x86memop, bit Is2Addr = 1,
+                 OpndItins itins = DEFAULT_ITINS> {
+  let isCommutable = 1 in
+  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
+        Sched<[itins.Sched]>;
+  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst,
+          (IntId RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
+        Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let isCommutable = 0 in {
+    let ExeDomain = SSEPackedSingle in {
+    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
+                                        VR128, loadv4f32, f128mem, 0,
+                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
+                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
+                                    f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
+                                    VEX_4V, VEX_L;
+    }
+    let ExeDomain = SSEPackedDouble in {
+    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
+                                        VR128, loadv2f64, f128mem, 0,
+                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
+                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
+                                     f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
+                                     VEX_4V, VEX_L;
+    }
+  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
+                                      VR128, loadv2i64, i128mem, 0,
+                                      DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+                                      VR128, loadv2i64, i128mem, 0,
+                                      DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+  }
+  let ExeDomain = SSEPackedSingle in
+  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+                                   VR128, loadv4f32, f128mem, 0,
+                                   SSE_DPPS_ITINS>, VEX_4V;
+  let ExeDomain = SSEPackedDouble in
+  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+                                   VR128, loadv2f64, f128mem, 0,
+                                   SSE_DPPS_ITINS>, VEX_4V;
+  let ExeDomain = SSEPackedSingle in
+  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+                                    VR256, loadv8f32, i256mem, 0,
+                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+  let isCommutable = 0 in {
+  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
+                                  VR256, loadv4i64, i256mem, 0,
+                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+                                  VR256, loadv4i64, i256mem, 0,
+                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+  }
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in {
+  let ExeDomain = SSEPackedSingle in
+  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
+                                     VR128, memopv4f32, f128mem,
+                                     1, SSE_INTALU_ITINS_FBLEND_P>;
+  let ExeDomain = SSEPackedDouble in
+  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
+                                     VR128, memopv2f64, f128mem,
+                                     1, SSE_INTALU_ITINS_FBLEND_P>;
+  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_INTALU_ITINS_FBLEND_P>;
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_MPSADBW_ITINS>;
+  }
+  let ExeDomain = SSEPackedSingle in
+  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+                                  VR128, memopv4f32, f128mem, 1,
+                                  SSE_DPPS_ITINS>;
+  let ExeDomain = SSEPackedDouble in
+  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+                                  VR128, memopv2f64, f128mem, 1,
+                                  SSE_DPPD_ITINS>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+                                    RegisterClass RC, X86MemOperand x86memop,
+                                    PatFrag mem_frag, Intrinsic IntId,
+                                    X86FoldableSchedWrite Sched> {
+  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                  (ins RC:$src1, RC:$src2, RC:$src3),
+                  !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+                Sched<[Sched]>;
+
+  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                  (ins RC:$src1, x86memop:$src2, RC:$src3),
+                  !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                  [(set RC:$dst,
+                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                               RC:$src3))],
+                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+                Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+                                           loadv2f64, int_x86_sse41_blendvpd,
+                                           WriteFVarBlend>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+                                  loadv4f64, int_x86_avx_blendv_pd_256,
+                                  WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+                                           loadv4f32, int_x86_sse41_blendvps,
+                                           WriteFVarBlend>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+                                  loadv8f32, int_x86_avx_blendv_ps_256,
+                                  WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+                                           loadv2i64, int_x86_sse41_pblendvb,
+                                           WriteVarBlend>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+                                      loadv4i64, int_x86_avx2_pblendvb,
+                                      WriteVarBlend>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+                            (v8i32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+                            (v8f32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+                            (v4i64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+                            (v4f64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+
+  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
+  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
+                            (v32i8 VR256:$src2))),
+            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
+                               (imm:$mask))),
+            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+}
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                               X86MemOperand x86memop, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
+    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{$src2, $dst|$dst, $src2}"),
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+                    itins.rr>;
+
+    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins VR128:$src1, x86memop:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{$src2, $dst|$dst, $src2}"),
+                    [(set VR128:$dst,
+                      (IntId VR128:$src1,
+                       (bitconvert (mem_frag addr:$src2)), XMM0))],
+                       itins.rm>;
+  }
+}
+
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+                                  int_x86_sse41_blendvpd>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+                                  int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+                                  int_x86_sse41_pblendvb>;
+
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+
+  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+
+}
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "vmovntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+                       VEX;
+let Predicates = [HasAVX2] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                         "vmovntdqa\t{$src, $dst|$dst, $src}",
+                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+                         VEX, VEX_L;
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "movntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop, bit Is2Addr = 1> {
+  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
+  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+                                 loadv2i64, i128mem, 0>, VEX_4V;
+
+let Predicates = [HasAVX2] in
+  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in
+  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+                                memopv2i64, i128mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed Compare Implicit Length Strings, Return Mask
+multiclass pseudo_pcmpistrm<string asm> {
+  def REG : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+                                                  imm:$src3))]>;
+  def MEM : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
+}
+
+multiclass pcmpistrm_SS42AI<string asm> {
+  def rr : SS42AI<0x62, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrM]>;
+  let mayLoad = 1 in
+  def rm :SS42AI<0x62, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
+}
+
+let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+multiclass pseudo_pcmpestrm<string asm> {
+  def REG : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestrm<string asm> {
+  def rr : SS42AI<0x60, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrM]>;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x60, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+multiclass pseudo_pcmpistri<string asm> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+  def rr : SS42AI<0x63, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrI]>;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x63, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+multiclass pseudo_pcmpestri<string asm> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+       imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+  def rr : SS42AI<0x61, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrI]>;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x61, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// crc intrinsic instruction
+// This set of instructions are only rm, the only difference is the size
+// of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+                   RegisterClass RCIn, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
+         Sched<[WriteFAdd]>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+                   X86MemOperand x86memop, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+                                 int_x86_sse42_crc32_32_16>, OpSize16;
+  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+                                 int_x86_sse42_crc32_32_16>, OpSize16;
+  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+                                 int_x86_sse42_crc32_32_32>, OpSize32;
+  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+                                 int_x86_sse42_crc32_32_32>, OpSize32;
+  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  let hasSideEffects = 0 in {
+    let mayLoad = 1 in
+    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+                                   null_frag>, REX_W;
+    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+                                   null_frag>, REX_W;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+                      bit UsesXMM0 = 0> {
+  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+                            (i8 imm:$src3)))]>, TA;
+  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1,
+                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (i8 imm:$src3)))]>, TA;
+
+  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+  let Uses=[XMM0] in
+  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
+}
+
+// Aliases with explicit %xmm0
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId128, bit Is2Addr = 1> {
+  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+       Sched<[WriteAESDecEnc]>;
+  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
+       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+let Predicates = [HasAVX, HasAES] in {
+  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
+                         int_x86_aesni_aesenc, 0>, VEX_4V;
+  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
+                         int_x86_aesni_aesenclast, 0>, VEX_4V;
+  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
+                         int_x86_aesni_aesdec, 0>, VEX_4V;
+  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
+                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
+                         int_x86_aesni_aesenc>;
+  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
+                         int_x86_aesni_aesenclast>;
+  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
+                         int_x86_aesni_aesdec>;
+  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
+                         int_x86_aesni_aesdeclast>;
+}
+
+// Perform the AES InvMixColumn Transformation
+let Predicates = [HasAVX, HasAES] in {
+  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1),
+      "vaesimc\t{$src1, $dst|$dst, $src1}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
+      VEX;
+  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+      (ins i128mem:$src1),
+      "vaesimc\t{$src1, $dst|$dst, $src1}",
+      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+      Sched<[WriteAESIMCLd]>, VEX;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+  (ins VR128:$src1),
+  "aesimc\t{$src1, $dst|$dst, $src1}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+  (ins i128mem:$src1),
+  "aesimc\t{$src1, $dst|$dst, $src1}",
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+  Sched<[WriteAESIMCLd]>;
+
+// AES Round Key Generation Assist
+let Predicates = [HasAVX, HasAES] in {
+  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, i8imm:$src2),
+      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+      Sched<[WriteAESKeyGen]>, VEX;
+  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+      (ins i128mem:$src1, i8imm:$src2),
+      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+      Sched<[WriteAESKeyGenLd]>, VEX;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+  (ins VR128:$src1, i8imm:$src2),
+  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+  Sched<[WriteAESKeyGen]>;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+  (ins i128mem:$src1, i8imm:$src2),
+  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+  Sched<[WriteAESKeyGenLd]>;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// AVX carry-less Multiplication instructions
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           [(set VR128:$dst,
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+           Sched<[WriteCLMul]>;
+
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+                              (loadv2i64 addr:$src2), imm:$src3))]>,
+           Sched<[WriteCLMulLd, ReadAfterLd]>;
+
+// Carry-less Multiplication instructions
+let Constraints = "$src1 = $dst" in {
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           [(set VR128:$dst,
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+                              (memopv2i64 addr:$src2), imm:$src3))],
+                              IIC_SSE_PCLMULQDQ_RM>,
+           Sched<[WriteCLMulLd, ReadAfterLd]>;
+} // Constraints = "$src1 = $dst"
+
+
+multiclass pclmul_alias<string asm, int immop> {
+  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
+
+  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
+
+  def : InstAlias<!strconcat("vpclmul", asm,
+                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+                  0>;
+
+  def : InstAlias<!strconcat("vpclmul", asm,
+                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+                  0>;
+}
+defm : pclmul_alias<"hqhq", 0x11>;
+defm : pclmul_alias<"hqlq", 0x01>;
+defm : pclmul_alias<"lqhq", 0x10>;
+defm : pclmul_alias<"lqlq", 0x00>;
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
+                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
+                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+                                    imm:$idx))]>, PD;
+def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src, VR128:$mask),
+              "extrq\t{$mask, $src|$src, $mask}",
+              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+                                 VR128:$mask))]>, PD;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
+                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
+                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
+def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
+                 (ins VR128:$src, VR128:$mask),
+                 "insertq\t{$mask, $src|$src, $mask}",
+                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+                                    VR128:$mask))]>, XD;
+}
+
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+                "movntss\t{$src, $dst|$dst, $src}",
+                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                "movntsd\t{$src, $dst|$dst, $src}",
+                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+//              destination operand
+//
+class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                    X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
+  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+        [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
+
+class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                           X86MemOperand x86memop, ValueType VT,
+                           PatFrag ld_frag, SchedWrite Sched> :
+  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+        Sched<[Sched]>, VEX {
+    let mayLoad = 1;
+}
+
+// AVX2 adds register forms
+class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         Intrinsic Int, SchedWrite Sched> :
+  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+         [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
+
+let ExeDomain = SSEPackedSingle in {
+  def VBROADCASTSSrm  : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
+                                             f32mem, v4f32, loadf32, WriteLoad>;
+  def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
+                                             f32mem, v8f32, loadf32,
+                                             WriteFShuffleLd>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrm  : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
+                                    v4f64, loadf64, WriteFShuffleLd>, VEX_L;
+def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
+                                   int_x86_avx_vbroadcastf128_pd_256,
+                                   WriteFShuffleLd>, VEX_L;
+
+let ExeDomain = SSEPackedSingle in {
+  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
+                                           int_x86_avx2_vbroadcast_ss_ps,
+                                           WriteFShuffle>;
+  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
+                                      int_x86_avx2_vbroadcast_ss_ps_256,
+                                      WriteFShuffle256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+                                      int_x86_avx2_vbroadcast_sd_pd_256,
+                                      WriteFShuffle256>, VEX_L;
+
+let Predicates = [HasAVX2] in
+def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
+                                   int_x86_avx2_vbroadcasti128, WriteLoad>,
+                                   VEX_L;
+
+let Predicates = [HasAVX] in
+def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+          (VBROADCASTF128 addr:$src)>;
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
+          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (loadv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (loadv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (loadv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+          (ins VR256:$src1, i8imm:$src2),
+          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
+          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, Sched<[WriteStore]>, VEX, VEX_L;
+}
+
+// AVX1 patterns
+let Predicates = [HasAVX] in {
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v4f32 (VEXTRACTF128rr
+                    (v8f32 VR256:$src1),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v2f64 (VEXTRACTF128rr
+                    (v4f64 VR256:$src1),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+
+def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v2i64 (VEXTRACTF128rr
+                  (v4i64 VR256:$src1),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v4i32 (VEXTRACTF128rr
+                  (v8i32 VR256:$src1),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v8i16 (VEXTRACTF128rr
+                  (v16i16 VR256:$src1),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v16i8 (VEXTRACTF128rr
+                  (v32i8 VR256:$src1),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+                          Intrinsic IntLd, Intrinsic IntLd256,
+                          Intrinsic IntSt, Intrinsic IntSt256> {
+  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, f128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+             VEX_4V;
+  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, f256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
+  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
+             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+                                 int_x86_avx_maskload_ps,
+                                 int_x86_avx_maskload_ps_256,
+                                 int_x86_avx_maskstore_ps,
+                                 int_x86_avx_maskstore_ps_256>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+                                 int_x86_avx_maskload_pd,
+                                 int_x86_avx_maskload_pd_256,
+                                 int_x86_avx_maskstore_pd,
+                                 int_x86_avx_maskstore_pd_256>;
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+                      RegisterClass RC, X86MemOperand x86memop_f,
+                      X86MemOperand x86memop_i, PatFrag i_frag,
+                      Intrinsic IntVar, ValueType vt> {
+  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
+             Sched<[WriteFShuffle]>;
+  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, x86memop_i:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (IntVar RC:$src1,
+                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
+             Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, i8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX,
+             Sched<[WriteFShuffle]>;
+  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+             (ins x86memop_f:$src1, i8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
+             Sched<[WriteFShuffleLd]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
+                               (i8 imm:$imm))),
+          (VPERMILPSYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
+          (VPERMILPDYmi addr:$src1, imm:$imm)>;
+
+def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
+          (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
+          (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+let ExeDomain = SSEPackedSingle in {
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+          Sched<[WriteFShuffle]>;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
+                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+          Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+  // Zero All YMM registers
+  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+
+  // Zero Upper bits of YMM registers
+  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             [(set RC:$dst, (Int VR128:$src))]>,
+             T8PD, VEX, Sched<[WriteCvtF2F]>;
+  let neverHasSideEffects = 1, mayLoad = 1 in
+  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
+             Sched<[WriteCvtF2FLd]>;
+}
+
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+               (ins RC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+               TAPD, VEX, Sched<[WriteCvtF2F]>;
+  let neverHasSideEffects = 1, mayStore = 1,
+      SchedRW = [WriteCvtF2FLd, WriteRMW] in
+  def mr : Ii8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+               TAPD, VEX;
+}
+
+let Predicates = [HasF16C] in {
+  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
+  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+  // Pattern match vcvtph2ps of a scalar i64 load.
+  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
+  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
+}
+
+// Patterns for  matching conversions from float to half-float and vice versa.
+let Predicates = [HasF16C] in {
+  def : Pat<(fp_to_f16 FR32:$src),
+            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
+              (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
+
+  def : Pat<(f16_to_fp GR16:$src),
+            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+
+  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
+            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+              (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
+multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
+                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+                 X86MemOperand x86memop> {
+  let isCommutable = 1 in
+  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+        Sched<[WriteBlend]>, VEX_4V;
+  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set RC:$dst,
+          (IntId RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+}
+
+let isCommutable = 0 in {
+defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
+                                   VR128, loadv2i64, i128mem>;
+defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
+                                    VR256, loadv4i64, i256mem>, VEX_L;
+}
+
+def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+                  imm:$mask)),
+          (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
+                  imm:$mask)),
+          (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+//               destination operand
+//
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+                          X86MemOperand x86memop, PatFrag ld_frag,
+                          Intrinsic Int128, Intrinsic Int256> {
+  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR128:$dst, (Int128 VR128:$src))]>,
+                  Sched<[WriteShuffle]>, VEX;
+  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR128:$dst,
+                    (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
+                  Sched<[WriteLoad]>, VEX;
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR256:$dst, (Int256 VR128:$src))]>,
+                   Sched<[WriteShuffle256]>, VEX, VEX_L;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR256:$dst,
+                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+                   Sched<[WriteLoad]>, VEX, VEX_L;
+}
+
+defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+                                    int_x86_avx2_pbroadcastb_128,
+                                    int_x86_avx2_pbroadcastb_256>;
+defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+                                    int_x86_avx2_pbroadcastw_128,
+                                    int_x86_avx2_pbroadcastw_256>;
+defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+                                    int_x86_avx2_pbroadcastd_128,
+                                    int_x86_avx2_pbroadcastd_256>;
+defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+                                    int_x86_avx2_pbroadcastq_128,
+                                    int_x86_avx2_pbroadcastq_256>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
+          (VPBROADCASTBrm addr:$src)>;
+  def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
+          (VPBROADCASTBYrm addr:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+          (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+          (VPBROADCASTWYrm addr:$src)>;
+  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VPBROADCASTDrm addr:$src)>;
+  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VPBROADCASTDYrm addr:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VPBROADCASTQrm addr:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VPBROADCASTQYrm addr:$src)>;
+
+  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+          (VPBROADCASTBrr VR128:$src)>;
+  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+          (VPBROADCASTBYrr VR128:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+          (VPBROADCASTWrr VR128:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+          (VPBROADCASTWYrr VR128:$src)>;
+  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+          (VPBROADCASTDrr VR128:$src)>;
+  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+          (VPBROADCASTDYrr VR128:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+          (VPBROADCASTQrr VR128:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+          (VPBROADCASTQYrr VR128:$src)>;
+  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+          (VBROADCASTSSrr VR128:$src)>;
+  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+          (VBROADCASTSSYrr VR128:$src)>;
+  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+          (VPBROADCASTQrr VR128:$src)>;
+  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+          (VBROADCASTSDYrr VR128:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+          (VPBROADCASTBrr (COPY_TO_REGCLASS
+                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+                           VR128))>;
+    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+          (VPBROADCASTBYrr (COPY_TO_REGCLASS
+                            (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+                            VR128))>;
+
+    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+          (VPBROADCASTWrr (COPY_TO_REGCLASS
+                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+                           VR128))>;
+    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+          (VPBROADCASTWYrr (COPY_TO_REGCLASS
+                            (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+                            VR128))>;
+
+    // The patterns for VPBROADCASTD are not needed because they would match
+    // the exact same thing as VBROADCASTSS patterns.
+
+    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+          (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+    // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
+  }
+}
+
+// AVX1 broadcast patterns
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+  // 128bit broadcasts:
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                     ValueType OpVT> {
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR256:$dst,
+                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+                   Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+                   (ins VR256:$src1, i256mem:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR256:$dst,
+                     (OpVT (X86VPermv VR256:$src1,
+                            (bitconvert (mem_frag addr:$src2)))))]>,
+                   Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
+
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                         ValueType OpVT> {
+  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+                     (ins VR256:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+                     Sched<[WriteShuffle256]>, VEX, VEX_L;
+  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+                     (ins i256mem:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermi (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>,
+                     Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L;
+}
+
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
+//
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+          VEX_4V, VEX_L;
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
+                             (i8 imm:$src3)))]>,
+          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+let neverHasSideEffects = 1 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
+          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (loadv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (loadv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (loadv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+          (ins VR256:$src1, i8imm:$src2),
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128:$dst,
+            (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
+          Sched<[WriteShuffle256]>, VEX, VEX_L;
+let neverHasSideEffects = 1, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+          Sched<[WriteStore]>, VEX, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v2i64 (VEXTRACTI128rr
+                    (v4i64 VR256:$src1),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v4i32 (VEXTRACTI128rr
+                    (v8i32 VR256:$src1),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v8i16 (VEXTRACTI128rr
+                    (v16i16 VR256:$src1),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v16i8 (VEXTRACTI128rr
+                    (v32i8 VR256:$src1),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+
+def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+multiclass avx2_pmovmask<string OpcodeStr,
+                         Intrinsic IntLd128, Intrinsic IntLd256,
+                         Intrinsic IntSt128, Intrinsic IntSt256> {
+  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
+  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
+             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+                                int_x86_avx2_maskload_d,
+                                int_x86_avx2_maskload_d_256,
+                                int_x86_avx2_maskstore_d,
+                                int_x86_avx2_maskstore_d_256>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+                                int_x86_avx2_maskload_q,
+                                int_x86_avx2_maskload_q_256,
+                                int_x86_avx2_maskstore_q,
+                                int_x86_avx2_maskstore_q_256>, VEX_W;
+
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+//
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType vt128, ValueType vt256> {
+  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst,
+               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+             VEX_4V, Sched<[WriteVarVecShift]>;
+  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst,
+               (vt128 (OpNode VR128:$src1,
+                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst,
+               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst,
+               (vt256 (OpNode VR256:$src1,
+                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
+             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+}
+
+defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+                       X86MemOperand memop128, X86MemOperand memop256> {
+  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+            (ins VR128:$src1, memop128:$src2, VR128:$mask),
+            !strconcat(OpcodeStr,
+              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+            []>, VEX_4VOp3;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+            (ins RC256:$src1, memop256:$src2, RC256:$mask),
+            !strconcat(OpcodeStr,
+              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+            []>, VEX_4VOp3, VEX_L;
+}
+
+let mayLoad = 1, Constraints
+  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+  in {
+  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
+  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
+  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
+  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+
+  let ExeDomain = SSEPackedDouble in {
+    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+  }
+
+  let ExeDomain = SSEPackedSingle in {
+    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 0000000..c847be7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+                "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+                "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+                "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+                "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+                "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+                "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+                "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+                "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 0000000..d0bb523
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,969 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "shl{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                  "shl{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHL8ri   : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "shl{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def SHL16ri  : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "shl{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+                   OpSize16;
+def SHL32ri  : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "shl{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+                   OpSize32;
+def SHL64ri  : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+                    (ins GR64:$src1, i8imm:$src2),
+                    "shl{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
+let hasSideEffects = 0 in {
+def SHL8r1   : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shl{b}\t$dst", [], IIC_SR>;
+def SHL16r1  : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t$dst", [], IIC_SR>, OpSize16;
+def SHL32r1  : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t$dst", [], IIC_SR>, OpSize32;
+def SHL64r1  : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
+} // isConvertibleToThreeAddress = 1
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL] in {
+def SHL8mCL  : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+                 "shl{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+                 "shl{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize16;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+                 "shl{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize32;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t{%cl, $dst|$dst, cl}",
+                  [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHL8mi   : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "shl{b}\t{$src, $dst|$dst, $src}",
+                [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SHL16mi  : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "shl{w}\t{$src, $dst|$dst, $src}",
+               [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHL32mi  : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "shl{l}\t{$src, $dst|$dst, $src}",
+               [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+                  "shl{q}\t{$src, $dst|$dst, $src}",
+                 [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SHL8m1   : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+                 "shl{b}\t$dst",
+                [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SHL16m1  : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+                 "shl{w}\t$dst",
+               [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHL32m1  : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+                 "shl{l}\t$dst",
+               [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t$dst",
+                 [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "shr{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                  "shr{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+}
+
+def SHR8ri   : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "shr{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHR16ri  : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "shr{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def SHR32ri  : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "shr{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+                  "shr{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+// Shift right by 1
+def SHR8r1   : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shr{b}\t$dst",
+                 [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+def SHR16r1  : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t$dst",
+                 [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
+def SHR32r1  : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t$dst",
+                 [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
+def SHR64r1  : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shr{q}\t$dst",
+                 [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHR8mCL  : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+                 "shr{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+                 "shr{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize16;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+                 "shr{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize32;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t{%cl, $dst|$dst, cl}",
+                  [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHR8mi   : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "shr{b}\t{$src, $dst|$dst, $src}",
+                [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SHR16mi  : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "shr{w}\t{$src, $dst|$dst, $src}",
+               [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHR32mi  : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "shr{l}\t{$src, $dst|$dst, $src}",
+               [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+                  "shr{q}\t{$src, $dst|$dst, $src}",
+                 [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SHR8m1   : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+                 "shr{b}\t$dst",
+                [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SHR16m1  : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+                 "shr{w}\t$dst",
+               [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHR32m1  : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+                 "shr{l}\t$dst",
+               [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t$dst",
+                 [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (sra GR8:$src1, CL))],
+                 IIC_SR>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (sra GR16:$src1, CL))],
+                 IIC_SR>, OpSize16;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (sra GR32:$src1, CL))],
+                 IIC_SR>, OpSize32;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t{%cl, $dst|$dst, cl}",
+                 [(set GR64:$dst, (sra GR64:$src1, CL))],
+                 IIC_SR>;
+}
+
+def SAR8ri   : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "sar{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def SAR16ri  : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "sar{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def SAR32ri  : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "sar{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def SAR64ri  : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+                    (ins GR64:$src1, i8imm:$src2),
+                    "sar{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Shift by 1
+def SAR8r1   : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t$dst",
+                 [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def SAR16r1  : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t$dst",
+                 [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize16;
+def SAR32r1  : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t$dst",
+                 [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+                 IIC_SR>, OpSize32;
+def SAR64r1  : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t$dst",
+                 [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+                 IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SAR8mCL  : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+                 "sar{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+                 "sar{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize16;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), 
+                 "sar{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize32;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), 
+                 "sar{q}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+}
+def SAR8mi   : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "sar{b}\t{$src, $dst|$dst, $src}",
+                [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SAR16mi  : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "sar{w}\t{$src, $dst|$dst, $src}",
+               [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SAR32mi  : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "sar{l}\t{$src, $dst|$dst, $src}",
+               [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SAR64mi  : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+                    "sar{q}\t{$src, $dst|$dst, $src}",
+                 [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SAR8m1   : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+                 "sar{b}\t$dst",
+                [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SAR16m1  : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+                 "sar{w}\t$dst",
+               [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SAR32m1  : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+                 "sar{l}\t$dst",
+               [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+                  "sar{q}\t$dst",
+                 [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+               "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+                "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+  
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+                "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+                "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+                 "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+               "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+                "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+  
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+                "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+                "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+                 
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+                 "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+} // Constraints = "$src = $dst"
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+               "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+                "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+                "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+                 "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+               "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+                "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+                "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+                 "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+let Uses = [CL] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+                "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+                 "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+                 "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+                  "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+                "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+                 "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+                 "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+                  "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+}
+} // SchedRW
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in {
+def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rol{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROL8ri   : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "rol{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROL16ri  : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "rol{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def ROL32ri  : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "rol{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def ROL64ri  : RIi8<0xC1, MRM0r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i8imm:$src2),
+                    "rol{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Rotate by 1
+def ROL8r1   : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t$dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROL16r1  : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t$dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize16;
+def ROL32r1  : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t$dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+                 IIC_SR>, OpSize32;
+def ROL64r1  : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rol{q}\t$dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+                  IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROL8mCL  : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+                 "rol{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+                 "rol{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize16;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+                 "rol{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize32;
+def ROL64mCL :  RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+                   "rol{q}\t{%cl, $dst|$dst, cl}",
+                   [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+                   IIC_SR>;
+}
+def ROL8mi   : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1),
+                   "rol{b}\t{$src1, $dst|$dst, $src1}",
+               [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+               IIC_SR>;
+def ROL16mi  : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1),
+                   "rol{w}\t{$src1, $dst|$dst, $src1}",
+              [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROL32mi  : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1),
+                   "rol{l}\t{$src1, $dst|$dst, $src1}",
+              [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROL64mi  : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1),
+                    "rol{q}\t{$src1, $dst|$dst, $src1}",
+                [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+                IIC_SR>;
+
+// Rotate by 1
+def ROL8m1   : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+                 "rol{b}\t$dst",
+               [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def ROL16m1  : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+                 "rol{w}\t$dst",
+              [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROL32m1  : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+                 "rol{l}\t$dst",
+              [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROL64m1  : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+                 "rol{q}\t$dst",
+               [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+                  "ror{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROR8ri   : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "ror{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROR16ri  : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "ror{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def ROR32ri  : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "ror{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i8imm:$src2),
+                    "ror{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Rotate by 1
+def ROR8r1   : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t$dst",
+                 [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROR16r1  : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t$dst",
+                 [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize16;
+def ROR32r1  : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t$dst",
+                 [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
+                 IIC_SR>, OpSize32;
+def ROR64r1  : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+                  "ror{q}\t$dst",
+                  [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
+                  IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROR8mCL  : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+                 "ror{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+                 "ror{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize16;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), 
+                 "ror{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize32;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), 
+                  "ror{q}\t{%cl, $dst|$dst, cl}",
+                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+                  IIC_SR>;
+}
+def ROR8mi   : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "ror{b}\t{$src, $dst|$dst, $src}",
+               [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>;
+def ROR16mi  : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "ror{w}\t{$src, $dst|$dst, $src}",
+              [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROR32mi  : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "ror{l}\t{$src, $dst|$dst, $src}",
+              [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROR64mi  : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+                    "ror{q}\t{$src, $dst|$dst, $src}",
+                [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+
+// Rotate by 1
+def ROR8m1   : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+                 "ror{b}\t$dst",
+               [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def ROR16m1  : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+                 "ror{w}\t$dst",
+              [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+                 "ror{l}\t$dst",
+              [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+                 "ror{q}\t$dst",
+               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), 
+                   (ins GR16:$src1, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+                    IIC_SHD16_REG_CL>,
+                   TB, OpSize16;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), 
+                   (ins GR16:$src1, GR16:$src2),
+                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+                    IIC_SHD16_REG_CL>,
+                   TB, OpSize16;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), 
+                   (ins GR32:$src1, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+                    IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+                   (ins GR32:$src1, GR32:$src2),
+                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+                   IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), 
+                    (ins GR64:$src1, GR64:$src2),
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+                    IIC_SHD64_REG_CL>, 
+                    TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), 
+                    (ins GR64:$src1, GR64:$src2),
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+                    IIC_SHD64_REG_CL>, 
+                    TB;
+}
+
+let isCommutable = 1 in {  // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR16:$dst), 
+                     (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+                     TB, OpSize16;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR16:$dst), 
+                     (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+                     TB, OpSize16;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR32:$dst), 
+                     (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+                 TB, OpSize32;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR32:$dst), 
+                     (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+                 TB, OpSize32;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+                      (outs GR64:$dst), 
+                      (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+                                       (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+                 TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+                      (outs GR64:$dst), 
+                      (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+                                       (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+                 TB;
+}
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+                     addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                  "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                  [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+                    addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                     addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                  "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                  [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+                    addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+                    
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+                      addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+                      addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+}
+
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                    (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+                    "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD16_MEM_IM>,
+                    TB, OpSize16;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+                     (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD16_MEM_IM>,
+                     TB, OpSize16;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+                    (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+                    "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD32_MEM_IM>,
+                    TB, OpSize32;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
+                     (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD32_MEM_IM>,
+                     TB, OpSize32;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+                      (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD64_MEM_IM>,
+                 TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem, 
+                      (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD64_MEM_IM>,
+                 TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+def ROT32L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+  return getI8Imm(32 - N->getZExtValue());
+}]>;
+
+def ROT64L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+  return getI8Imm(64 - N->getZExtValue());
+}]>;
+
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+  def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
+               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>, TAXD, VEX, Sched<[WriteShift]>;
+  let mayLoad = 1 in
+  def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+               (ins x86memop:$src1, i8imm:$src2),
+               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+}
+
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+  def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX_4VOp3, Sched<[WriteShift]>;
+  let mayLoad = 1 in
+  def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX_4VOp3,
+             Sched<[WriteShiftLd,
+                    // x86memop:$src1
+                    ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                    ReadDefault,
+                    // RC:$src1
+                    ReadAfterLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+  defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+  defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+  defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+  defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+  defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+  defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+  defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
+  defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W;
+
+  // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+  let AddedComplexity = 10 in {
+    def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+    def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+  }
+
+  def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+  def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+  // immedidate shift, i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, $esi
+  //
+  // over
+  //
+  //  movb $imm %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 0000000..5402780
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,569 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes.  These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+  def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+              TB;
+
+let Defs = [RAX, RCX, RDX] in
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+
+// CPU flow control instructions
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def TRAP    : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+  def UD2B    : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+  def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
+              [(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
+
+def : Pat<(debugtrap),
+          (INT3)>;
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
+              [(int_x86_int imm:$trap)], IIC_INT>;
+
+
+def SYSCALL  : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET   : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+               Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+                 IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT   : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+                 IIC_SYS_ENTER_EXIT>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
+                 IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
+
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>,
+             OpSize32;
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
+             Requires<[In64BitMode]>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+//  Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr  : I<0xEC, RawFrm, (outs), (ins),
+               "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+               "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>,  OpSize16;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+               "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
+
+let Defs = [AL] in
+def IN8ri  : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
+                  "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+                  "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+                  "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
+
+let Uses = [DX, AL] in
+def OUT8rr  : I<0xEE, RawFrm, (outs), (ins),
+                "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
+
+let Uses = [AL] in
+def OUT8ir  : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
+                   "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+                   "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+                  "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
+
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+                Requires<[In64BitMode]>;
+
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+                Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+                Requires<[In64BitMode]>;
+
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+                Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
+def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 
+                "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+                OpSize16;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+                OpSize16;
+
+// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), 
+                "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+                OpSize32;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+                OpSize32;
+// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), 
+                 "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                 "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+                OpSize16;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+                OpSize16;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+                OpSize32;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+                OpSize32;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; 
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+               [], IIC_INVLPG>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+               "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+               "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+                "str{q}\t$dst", [], IIC_STR>, TB;
+def STRm   : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+               "str{w}\t$dst", [], IIC_STR>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+             "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+             "ltr{w}\t$src", [], IIC_LTR>, TB;
+             
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
+                 "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
+                 "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
+                 "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
+                 "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
+                 "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
+                 "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[Not64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[In64BitMode]>;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins),
+                "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+              OpSize16, Requires<[Not64BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins),
+                "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+              OpSize32, Requires<[Not64BitMode]>;
+
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
+                "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
+              OpSize16, Requires<[Not64BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
+                "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
+              OpSize32, Requires<[Not64BitMode]>;
+
+def POPES16 : I<0x07, RawFrm, (outs), (ins),
+                "pop{w}\t{%es|es}", [], IIC_POP_SR>,
+              OpSize16, Requires<[Not64BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins),
+                "pop{l}\t{%es|es}", [], IIC_POP_SR>,
+              OpSize32, Requires<[Not64BitMode]>;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[In64BitMode]>;
+
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
+                
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+                
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
+                
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+                
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+                
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+              "verr\t$seg", [], IIC_VERR>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+              "verr\t$seg", [], IIC_VERR>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+              "verw\t$seg", [], IIC_VERW_MEM>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+              "verw\t$seg", [], IIC_VERW_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+              "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+              "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs opaque80mem:$dst), (ins),
+              "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+              "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+              "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs opaque80mem:$dst), (ins),
+              "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+                "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+                "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+                "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
+                
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+//   extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+                 "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+                 "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
+              "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
+              "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+                "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+                "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+  def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
+              TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), 
+                "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), 
+                "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), 
+                 "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+                "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+
+let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
+  def CPUID32 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB,
+  Requires<[Not64BitMode]>;
+let Defs = [RAX, RBX, RCX, RDX], Uses = [RAX, RCX] in
+  def CPUID64 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB,
+  Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Defs = [RDX, RAX], Uses = [RCX] in
+  def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+
+let Uses = [RDX, RAX, RCX] in
+  def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
+
+let Uses = [RDX, RAX] in {
+  def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+               "xsave\t$dst", []>, TB;
+  def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+                 "xsave{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
+  def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+               "xrstor\t$dst", []>, TB;
+  def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+                 "xrstor{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
+  def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+                  "xsaveopt\t$dst", []>, TB;
+  def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+                    "xsaveopt{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+  def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+  def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
+  def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
+  def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
+  def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
+  def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+  def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
+  def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+  def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode] in {
+  def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+                   "rdfsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+  def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+                     "rdfsbase{q}\t$dst",
+                     [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+  def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+                   "rdgsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+  def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+                     "rdgsbase{q}\t$dst",
+                     [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+  def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+                   "wrfsbase{l}\t$src",
+                   [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+  def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+                      "wrfsbase{q}\t$src",
+                      [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+  def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+                   "wrgsbase{l}\t$src",
+                   [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+  def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+                      "wrgsbase{q}\t$src",
+                      [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[Not64BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS], Uses = [EFLAGS] in {
+  def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+  def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 0000000..4940efc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,47 @@
+//===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+                     [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+               "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+             Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
+                         "xbegin\t$dst", []>, Requires<[HasRTM]>;
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+             "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+              "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+                 "xabort\t$imm",
+                 [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
+// HLE prefixes
+
+let isAsmParserOnly = 1 in {
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 0000000..79afe9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,66 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+               "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+               Requires<[Not64BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+               "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+               Requires<[In64BitMode]>;
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[Not64BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[In64BitMode]>;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmclear\t$vmcs", []>, PD;
+// OF 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmptrld\t$vmcs", []>, PS;
+def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+  "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+  "vmxon\t$vmxon", []>, XS;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 0000000..45e2ff0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,289 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
+
+// Scalar load 2 addr operand instructions
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     Operand memop, ComplexPattern mem_cpat> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
+}
+
+defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+                 ssmem, sse_load_f32>;
+defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+                 sdmem, sse_load_f64>;
+
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     PatFrag memop> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     PatFrag memop> {
+  def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L;
+  def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
+}
+
+defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>;
+defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>;
+
+multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
+           XOP_4V, VEX_W;
+  def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins i128mem:$src1, VR128:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
+             XOP_4VOp3;
+}
+
+defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP;
+  def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins i128mem:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+             (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, XOP;
+}
+
+defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
+
+// Instruction where second source can be memory, but third must be register
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM;
+  def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              VR128:$src3))]>, XOP_4V, VEX_I8IMM;
+}
+
+defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+
+// Instruction where second source can be memory, third must be imm8
+multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+           XOP_4V;
+  def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              imm:$src3))]>, XOP_4V;
+}
+
+defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
+
+// Instruction where either second or third source can be memory
+multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
+           XOP_4V, VEX_I8IMM;
+  def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (Int VR128:$src1, VR128:$src2,
+              (bitconvert (memopv2i64 addr:$src3))))]>,
+           XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+  def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              VR128:$src3))]>,
+           XOP_4V, VEX_I8IMM;
+}
+
+defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+
+multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
+           XOP_4V, VEX_I8IMM, VEX_L;
+  def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (Int VR256:$src1, VR256:$src2,
+              (bitconvert (memopv4i64 addr:$src3))))]>,
+           XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+  def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
+              VR256:$src3))]>,
+           XOP_4V, VEX_I8IMM, VEX_L;
+}
+
+defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+
+multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
+                  Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
+  def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
+  def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+        VEX_W, MemOp4;
+  def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+        (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
+  def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+          (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
+  def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+          (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
+        VEX_W, MemOp4, VEX_L;
+  def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+        (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+           (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+        VEX_L;
+}
+
+defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
+                         int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>;
+defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
+                         int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..a082c4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,588 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Valgrind.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned char *OldByte = (unsigned char *)Old;
+  *OldByte++ = 0xE9;                // Emit JMP opcode.
+  unsigned *OldWord = (unsigned *)OldByte;
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)OldWord;
+  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+
+  // X86 doesn't need to invalidate the processor cache, so just invalidate
+  // Valgrind's cache directly.
+  sys::ValgrindDiscardTranslations(Old, 5);
+}
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way for disabling usage of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is pretty old and doesn't support these directives)
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+//        need to add some workarounds for compilers, which support
+//        only subset of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+  // No need to save EAX/EDX for X86-64.
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+    TYPE_FUNCTION(X86CompilationCallback)
+  ASMPREFIX "X86CompilationCallback:\n"
+    CFI(".cfi_startproc\n")
+    // Save RBP
+    "pushq   %rbp\n"
+    CFI(".cfi_def_cfa_offset 16\n")
+    CFI(".cfi_offset %rbp, -16\n")
+    // Save RSP
+    "movq    %rsp, %rbp\n"
+    CFI(".cfi_def_cfa_register %rbp\n")
+    // Save all int arg registers
+    "pushq   %rdi\n"
+    CFI(".cfi_rel_offset %rdi, 0\n")
+    "pushq   %rsi\n"
+    CFI(".cfi_rel_offset %rsi, 8\n")
+    "pushq   %rdx\n"
+    CFI(".cfi_rel_offset %rdx, 16\n")
+    "pushq   %rcx\n"
+    CFI(".cfi_rel_offset %rcx, 24\n")
+    "pushq   %r8\n"
+    CFI(".cfi_rel_offset %r8, 32\n")
+    "pushq   %r9\n"
+    CFI(".cfi_rel_offset %r9, 40\n")
+    // Align stack on 16-byte boundary. ESP might not be properly aligned
+    // (8 byte) if this is called from an indirect stub.
+    "andq    $-16, %rsp\n"
+    // Save all XMM arg registers
+    "subq    $128, %rsp\n"
+    "movaps  %xmm0, (%rsp)\n"
+    "movaps  %xmm1, 16(%rsp)\n"
+    "movaps  %xmm2, 32(%rsp)\n"
+    "movaps  %xmm3, 48(%rsp)\n"
+    "movaps  %xmm4, 64(%rsp)\n"
+    "movaps  %xmm5, 80(%rsp)\n"
+    "movaps  %xmm6, 96(%rsp)\n"
+    "movaps  %xmm7, 112(%rsp)\n"
+    // JIT callee
+#if defined(_WIN64) || defined(__CYGWIN__)
+    "subq    $32, %rsp\n"
+    "movq    %rbp, %rcx\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rdx\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+    "addq    $32, %rsp\n"
+#else
+    "movq    %rbp, %rdi\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rsi\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+#endif
+    // Restore all XMM arg registers
+    "movaps  112(%rsp), %xmm7\n"
+    "movaps  96(%rsp), %xmm6\n"
+    "movaps  80(%rsp), %xmm5\n"
+    "movaps  64(%rsp), %xmm4\n"
+    "movaps  48(%rsp), %xmm3\n"
+    "movaps  32(%rsp), %xmm2\n"
+    "movaps  16(%rsp), %xmm1\n"
+    "movaps  (%rsp), %xmm0\n"
+    // Restore RSP
+    "movq    %rbp, %rsp\n"
+    CFI(".cfi_def_cfa_register %rsp\n")
+    // Restore all int arg registers
+    "subq    $48, %rsp\n"
+    CFI(".cfi_adjust_cfa_offset 48\n")
+    "popq    %r9\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %r9\n")
+    "popq    %r8\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %r8\n")
+    "popq    %rcx\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rcx\n")
+    "popq    %rdx\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rdx\n")
+    "popq    %rsi\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rsi\n")
+    "popq    %rdi\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rdi\n")
+    // Restore RBP
+    "popq    %rbp\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rbp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback)
+  );
+# else
+  // No inline assembler support on this platform. The routine is in external
+  // file.
+  void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+    TYPE_FUNCTION(X86CompilationCallback)
+  ASMPREFIX "X86CompilationCallback:\n"
+    CFI(".cfi_startproc\n")
+    "pushl   %ebp\n"
+    CFI(".cfi_def_cfa_offset 8\n")
+    CFI(".cfi_offset %ebp, -8\n")
+    "movl    %esp, %ebp\n"    // Standard prologue
+    CFI(".cfi_def_cfa_register %ebp\n")
+    "pushl   %eax\n"
+    CFI(".cfi_rel_offset %eax, 0\n")
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    CFI(".cfi_rel_offset %edx, 4\n")
+    "pushl   %ecx\n"
+    CFI(".cfi_rel_offset %ecx, 8\n")
+#  if defined(__APPLE__)
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+#  endif
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    CFI(".cfi_def_cfa_register %esp\n")
+    "subl    $12, %esp\n"
+    CFI(".cfi_adjust_cfa_offset 12\n")
+    "popl    %ecx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ecx\n")
+    "popl    %edx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %edx\n")
+    "popl    %eax\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %eax\n")
+    "popl    %ebp\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ebp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback)
+  );
+
+  // Same as X86CompilationCallback but also saves XMM argument registers.
+  void X86CompilationCallback_SSE(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+    TYPE_FUNCTION(X86CompilationCallback_SSE)
+  ASMPREFIX "X86CompilationCallback_SSE:\n"
+    CFI(".cfi_startproc\n")
+    "pushl   %ebp\n"
+    CFI(".cfi_def_cfa_offset 8\n")
+    CFI(".cfi_offset %ebp, -8\n")
+    "movl    %esp, %ebp\n"    // Standard prologue
+    CFI(".cfi_def_cfa_register %ebp\n")
+    "pushl   %eax\n"
+    CFI(".cfi_rel_offset %eax, 0\n")
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    CFI(".cfi_rel_offset %edx, 4\n")
+    "pushl   %ecx\n"
+    CFI(".cfi_rel_offset %ecx, 8\n")
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+    // Save all XMM arg registers
+    "subl    $64, %esp\n"
+    // FIXME: provide frame move information for xmm registers.
+    // This can be tricky, because CFA register is ebp (unaligned)
+    // and we need to produce offsets relative to it.
+    "movaps  %xmm0, (%esp)\n"
+    "movaps  %xmm1, 16(%esp)\n"
+    "movaps  %xmm2, 32(%esp)\n"
+    "movaps  %xmm3, 48(%esp)\n"
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+    "addl    $16, %esp\n"
+    "movaps  48(%esp), %xmm3\n"
+    CFI(".cfi_restore %xmm3\n")
+    "movaps  32(%esp), %xmm2\n"
+    CFI(".cfi_restore %xmm2\n")
+    "movaps  16(%esp), %xmm1\n"
+    CFI(".cfi_restore %xmm1\n")
+    "movaps  (%esp), %xmm0\n"
+    CFI(".cfi_restore %xmm0\n")
+    "movl    %ebp, %esp\n"    // Restore ESP
+    CFI(".cfi_def_cfa_register esp\n")
+    "subl    $12, %esp\n"
+    CFI(".cfi_adjust_cfa_offset 12\n")
+    "popl    %ecx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ecx\n")
+    "popl    %edx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %edx\n")
+    "popl    %eax\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %eax\n")
+    "popl    %ebp\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ebp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback_SSE)
+  );
+# else
+  void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+  _declspec(naked) void X86CompilationCallback(void) {
+    __asm {
+      push  ebp
+      mov   ebp, esp
+      push  eax
+      push  edx
+      push  ecx
+      and   esp, -16
+      sub   esp, 16
+      mov   eax, dword ptr [ebp+4]
+      mov   dword ptr [esp+4], eax
+      mov   dword ptr [esp], ebp
+      call  LLVMX86CompilationCallback2
+      mov   esp, ebp
+      sub   esp, 12
+      pop   ecx
+      pop   edx
+      pop   eax
+      pop   ebp
+      ret
+    }
+  }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+  void X86CompilationCallback() {
+    llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
+  }
+#endif
+}
+
+/// This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call.  This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" {
+LLVM_ATTRIBUTE_USED // Referenced from inline asm.
+LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
+                                                         intptr_t RetAddr) {
+  intptr_t *RetAddrLoc = &StackPtr[1];
+  // We are reading raw stack data here. Tell MemorySanitizer that it is
+  // sufficiently initialized.
+  __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc));
+  assert(*RetAddrLoc == RetAddr &&
+         "Could not find return address on the stack!");
+
+  // It's a stub if there is an interrupt marker after the call.
+  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
+
+  // The call instruction should have pushed the return value onto the stack...
+#if defined (X86_64_JIT)
+  RetAddr--;     // Backtrack to the reference itself...
+#else
+  RetAddr -= 4;  // Backtrack to the reference itself...
+#endif
+
+#if 0
+  DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr
+               << " ESP=" << (void*)StackPtr
+               << ": Resolving call to function: "
+               << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
+#endif
+
+  // Sanity check to make sure this really is a call instruction.
+#if defined (X86_64_JIT)
+  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call.
+#if defined (X86_64_JIT)
+  assert(isStub &&
+         "X86-64 doesn't support rewriting non-stub lazy compilation calls:"
+         " the call instruction varies too much.");
+#else
+  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+  if (isStub) {
+    // If this is a stub, rewrite the call into an unconditional branch
+    // instruction so that two return addresses are not pushed onto the stack
+    // when the requested function finally gets called.  This also makes the
+    // 0xCE byte (interrupt) dead, so the marker doesn't effect anything.
+#if defined (X86_64_JIT)
+    // If the target address is within 32-bit range of the stub, use a
+    // PC-relative branch instead of loading the actual address.  (This is
+    // considerably shorter than the 64-bit immediate load already there.)
+    // We assume here intptr_t is 64 bits.
+    intptr_t diff = NewVal-RetAddr+7;
+    if (diff >= -2147483648LL && diff <= 2147483647LL) {
+      *(unsigned char*)(RetAddr-0xc) = 0xE9;
+      *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+    } else {
+      *(intptr_t *)(RetAddr - 0xa) = NewVal;
+      ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+    }
+    sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd);
+#else
+    ((unsigned char*)RetAddr)[-1] = 0xE9;
+    sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5);
+#endif
+  }
+
+  // Change the return address to reexecute the call instruction...
+#if defined (X86_64_JIT)
+  *RetAddrLoc -= 0xd;
+#else
+  *RetAddrLoc -= 5;
+#endif
+}
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  TsanIgnoreWritesBegin();
+  JITCompilerFunction = F;
+  TsanIgnoreWritesEnd();
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+#if defined(__SSE__)
+  // SSE Callback should be called for SSE-enabled LLVM.
+  return X86CompilationCallback_SSE;
+#else
+  if (useSSE)
+    return X86CompilationCallback_SSE;
+#endif
+#endif
+
+  return X86CompilationCallback;
+}
+
+X86JITInfo::X86JITInfo(bool UseSSE) {
+  useSSE = UseSSE;
+  useGOT = 0;
+  TLSOffset = nullptr;
+}
+
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                             JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+  const unsigned Alignment = 8;
+  uint8_t Buffer[8];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr);
+  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32));
+#else
+  const unsigned Alignment = 4;
+  uint8_t Buffer[4];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr);
+#endif
+  return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment);
+}
+
+TargetJITInfo::StubLayout X86JITInfo::getStubLayout() {
+  // The 64-bit stub contains:
+  //   movabs r10 <- 8-byte-target-address  # 10 bytes
+  //   call|jmp *r10  # 3 bytes
+  // The 32-bit stub contains a 5-byte call|jmp.
+  // If the stub is a call to the compilation callback, an extra byte is added
+  // to mark it as a stub.
+  StubLayout Result = {14, 4};
+  return Result;
+}
+
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
+                                   JITCodeEmitter &JCE) {
+  // Note, we cast to intptr_t here to silence a -pedantic warning that
+  // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+  bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback &&
+                Target != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+  bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback;
+#endif
+  JCE.emitAlignment(4);
+  void *Result = (void*)JCE.getCurrentPCValue();
+  if (NotCC) {
+#if defined (X86_64_JIT)
+    JCE.emitByte(0x49);          // REX prefix
+    JCE.emitByte(0xB8+2);        // movabsq r10
+    JCE.emitWordLE((unsigned)(intptr_t)Target);
+    JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+    JCE.emitByte(0x41);          // REX prefix
+    JCE.emitByte(0xFF);          // jmpq *r10
+    JCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+    JCE.emitByte(0xE9);
+    JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+    return Result;
+  }
+
+#if defined (X86_64_JIT)
+  JCE.emitByte(0x49);          // REX prefix
+  JCE.emitByte(0xB8+2);        // movabsq r10
+  JCE.emitWordLE((unsigned)(intptr_t)Target);
+  JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+  JCE.emitByte(0x41);          // REX prefix
+  JCE.emitByte(0xFF);          // callq *r10
+  JCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+  JCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...
+
+  JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+
+  // This used to use 0xCD, but that value is used by JITMemoryManager to
+  // initialize the buffer with garbage, which means it may follow a
+  // noreturn function call, confusing LLVMX86CompilationCallback2.  PR 4929.
+  JCE.emitByte(0xCE);   // Interrupt - Just a marker identifying the stub!
+  return Result;
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specific basic block.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+  return BB - Entry;
+#else
+  return BB - PICBase;
+#endif
+}
+
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
+  T Value;
+  std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
+              sizeof(T));
+  Value += Delta;
+  std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value),
+              sizeof(T));
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((X86::RelocationType)MR->getRelocationType()) {
+    case X86::reloc_pcrel_word: {
+      // PC relative relocation, add the relocated value to the value already in
+      // memory, after we adjust it for where the PC is.
+      ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
+      break;
+    }
+    case X86::reloc_picrel_word: {
+      // PIC base relative relocation, add the relocated value to the value
+      // already in memory, after we adjust it for where the PIC base is.
+      ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
+      break;
+    }
+    case X86::reloc_absolute_word:
+    case X86::reloc_absolute_word_sext:
+      // Absolute relocation, just add the relocated value to the value already
+      // in memory.
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
+      break;
+    case X86::reloc_absolute_dword:
+      addUnaligned<intptr_t>(RelocPos, ResultPtr);
+      break;
+    }
+  }
+}
+
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+  TLSOffset -= size;
+  return TLSOffset;
+#else
+  llvm_unreachable("Cannot allocate thread local storage on this arch!");
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..564343f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,79 @@
+//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class X86Subtarget;
+
+  class X86JITInfo : public TargetJITInfo {
+    uintptr_t PICBase;
+    char *TLSOffset;
+    bool useSSE;
+  public:
+    explicit X86JITInfo(bool UseSSE);
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    void replaceMachineCodeForFunction(void *Old, void *New) override;
+
+    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+    /// to emit an indirect symbol which contains the address of the specified
+    /// ptr.
+    void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                     JITCodeEmitter &JCE) override;
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on X86.
+    StubLayout getStubLayout() override;
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    void *emitFunctionStub(const Function* F, void *Target,
+                           JITCodeEmitter &JCE) override;
+
+    /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+    /// specific basic block.
+    uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase) override;
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    void relocate(void *Function, MachineRelocation *MR,
+                  unsigned NumRelocs, unsigned char* GOTBase) override;
+
+    /// allocateThreadLocalMemory - Each target has its own way of
+    /// handling thread local variables. This method returns a value only
+    /// meaningful to the target.
+    char* allocateThreadLocalMemory(size_t size) override;
+
+    /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+    /// PIC jumptable entry.
+    void setPICBase(uintptr_t Base) { PICBase = Base; }
+    uintptr_t getPICBase() const { return PICBase; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 0000000..2bd70a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,926 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class X86MCInstLower {
+  MCContext &Ctx;
+  const MachineFunction &MF;
+  const TargetMachine &TM;
+  const MCAsmInfo &MAI;
+  X86AsmPrinter &AsmPrinter;
+public:
+  X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
+
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+  Mangler *getMang() const {
+    return AsmPrinter.Mang;
+  }
+};
+
+} // end anonymous namespace
+
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
+                               X86AsmPrinter &asmprinter)
+: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()),
+  MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+  return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+  const DataLayout *DL = TM.getDataLayout();
+  assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+
+  SmallString<128> Name;
+  StringRef Suffix;
+
+  switch (MO.getTargetFlags()) {
+  case X86II::MO_DLLIMPORT:
+    // Handle dllimport linkage.
+    Name += "__imp_";
+    break;
+  case X86II::MO_DARWIN_STUB:
+    Suffix = "$stub";
+    break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    Suffix = "$non_lazy_ptr";
+    break;
+  }
+
+  if (!Suffix.empty())
+    Name += DL->getPrivateGlobalPrefix();
+
+  unsigned PrefixLen = Name.size();
+
+  if (MO.isGlobal()) {
+    const GlobalValue *GV = MO.getGlobal();
+    AsmPrinter.getNameWithPrefix(Name, GV);
+  } else if (MO.isSymbol()) {
+    getMang()->getNameWithPrefix(Name, MO.getSymbolName());
+  } else if (MO.isMBB()) {
+    Name += MO.getMBB()->getSymbol()->getName();
+  }
+  unsigned OrigLen = Name.size() - PrefixLen;
+
+  Name += Suffix;
+  MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+
+  StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
+
+  // If the target flags on the operand changes the name of the symbol, do that
+  // before we return the symbol.
+  switch (MO.getTargetFlags()) {
+  default: break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getGVStubEntry(Sym);
+    if (!StubSym.getPointer()) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    }
+    break;
+  }
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getHiddenGVStubEntry(Sym);
+    if (!StubSym.getPointer()) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    }
+    break;
+  }
+  case X86II::MO_DARWIN_STUB: {
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getFnStubEntry(Sym);
+    if (StubSym.getPointer())
+      return Sym;
+
+    if (MO.isGlobal()) {
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    } else {
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(Ctx.GetOrCreateSymbol(OrigName), false);
+    }
+    break;
+  }
+  }
+
+  return Sym;
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = nullptr;
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+  // These affect the name of the symbol, not any suffix.
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+  case X86II::MO_DARWIN_STUB:
+    break;
+
+  case X86II::MO_TLVP:      RefKind = MCSymbolRefExpr::VK_TLVP; break;
+  case X86II::MO_TLVP_PIC_BASE:
+    Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::CreateSub(Expr,
+                                  MCSymbolRefExpr::Create(MF.getPICBaseSymbol(),
+                                                           Ctx),
+                                   Ctx);
+    break;
+  case X86II::MO_SECREL:    RefKind = MCSymbolRefExpr::VK_SECREL; break;
+  case X86II::MO_TLSGD:     RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+  case X86II::MO_TLSLD:     RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+  case X86II::MO_TLSLDM:    RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
+  case X86II::MO_GOTTPOFF:  RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+  case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+  case X86II::MO_TPOFF:     RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+  case X86II::MO_DTPOFF:    RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
+  case X86II::MO_NTPOFF:    RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+  case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
+  case X86II::MO_GOTPCREL:  RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+  case X86II::MO_GOT:       RefKind = MCSymbolRefExpr::VK_GOT; break;
+  case X86II::MO_GOTOFF:    RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+  case X86II::MO_PLT:       RefKind = MCSymbolRefExpr::VK_PLT; break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::CreateSub(Expr,
+                            MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx),
+                                   Ctx);
+    if (MO.isJTI() && MAI.hasSetDirective()) {
+      // If .set directive is supported, use it to reduce the number of
+      // relocations the assembler will generate for differences between
+      // local labels. This is only safe when the symbols are in the same
+      // section so we are restricting it to jumptable references.
+      MCSymbol *Label = Ctx.CreateTempSymbol();
+      AsmPrinter.OutStreamer.EmitAssignment(Label, Expr);
+      Expr = MCSymbolRefExpr::Create(Label, Ctx);
+    }
+    break;
+  }
+
+  if (!Expr)
+    Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+
+  if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// a short fixed-register form.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+  unsigned ImmOp = Inst.getNumOperands() - 1;
+  assert(Inst.getOperand(0).isReg() &&
+         (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+         ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+           Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+          Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+  // Check whether the destination register can be fixed.
+  unsigned Reg = Inst.getOperand(0).getReg();
+  if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+    return;
+
+  // If so, rewrite the instruction.
+  MCOperand Saved = Inst.getOperand(ImmOp);
+  Inst = MCInst();
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(Saved);
+}
+
+/// \brief If a movsx instruction has a shorter encoding for the used register
+/// simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+  unsigned NewOpcode = 0;
+  unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+  switch (Inst.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction!");
+  case X86::MOVSX16rr8:  // movsbw %al, %ax   --> cbtw
+    if (Op0 == X86::AX && Op1 == X86::AL)
+      NewOpcode = X86::CBW;
+    break;
+  case X86::MOVSX32rr16: // movswl %ax, %eax  --> cwtl
+    if (Op0 == X86::EAX && Op1 == X86::AX)
+      NewOpcode = X86::CWDE;
+    break;
+  case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+    if (Op0 == X86::RAX && Op1 == X86::EAX)
+      NewOpcode = X86::CDQE;
+    break;
+  }
+
+  if (NewOpcode != 0) {
+    Inst = MCInst();
+    Inst.setOpcode(NewOpcode);
+  }
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+                                  unsigned Opcode) {
+  // Don't make these simplifications in 64-bit mode; other assemblers don't
+  // perform them because they make the code larger.
+  if (Printer.getSubtarget().is64Bit())
+    return;
+
+  bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+  unsigned AddrBase = IsStore;
+  unsigned RegOp = IsStore ? 0 : 5;
+  unsigned AddrOp = AddrBase + 3;
+  assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+         Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+         Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+         Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+         Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+         (Inst.getOperand(AddrOp).isExpr() ||
+          Inst.getOperand(AddrOp).isImm()) &&
+         "Unexpected instruction!");
+
+  // Check whether the destination register can be fixed.
+  unsigned Reg = Inst.getOperand(RegOp).getReg();
+  if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+    return;
+
+  // Check whether this is an absolute address.
+  // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+  // to do this here.
+  bool Absolute = true;
+  if (Inst.getOperand(AddrOp).isExpr()) {
+    const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+        Absolute = false;
+  }
+
+  if (Absolute &&
+      (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
+       Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
+       Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
+    return;
+
+  // If so, rewrite the instruction.
+  MCOperand Saved = Inst.getOperand(AddrOp);
+  MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
+  Inst = MCInst();
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(Saved);
+  Inst.addOperand(Seg);
+}
+
+static unsigned getRetOpcode(const X86Subtarget &Subtarget)
+{
+	return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO,
+                     AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      // Ignore call clobbers.
+      continue;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+
+  // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
+  switch (OutMI.getOpcode()) {
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+  case X86::LEA16r:
+  case X86::LEA32r:
+    // LEA should have a segment register, but it must be empty.
+    assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+           "Unexpected # of LEA operands");
+    assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+           "LEA has segment specified!");
+    break;
+
+  case X86::MOV32ri64:
+    OutMI.setOpcode(X86::MOV32ri);
+    break;
+
+  // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+  // if one of the registers is extended, but other isn't.
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVAPDrr:  NewOpc = X86::VMOVAPDrr_REV;  break;
+      case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+      case X86::VMOVAPSrr:  NewOpc = X86::VMOVAPSrr_REV;  break;
+      case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+      case X86::VMOVDQArr:  NewOpc = X86::VMOVDQArr_REV;  break;
+      case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+      case X86::VMOVDQUrr:  NewOpc = X86::VMOVDQUrr_REV;  break;
+      case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+      case X86::VMOVUPDrr:  NewOpc = X86::VMOVUPDrr_REV;  break;
+      case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+      case X86::VMOVUPSrr:  NewOpc = X86::VMOVUPSrr_REV;  break;
+      case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVSDrr:   NewOpc = X86::VMOVSDrr_REV;   break;
+      case X86::VMOVSSrr:   NewOpc = X86::VMOVSSrr_REV;   break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+
+  // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
+  // inputs modeled as normal uses instead of implicit uses.  As such, truncate
+  // off all but the first operand (the callee).  FIXME: Change isel.
+  case X86::TAILJMPr64:
+  case X86::CALL64r:
+  case X86::CALL64pcrel32: {
+    unsigned Opcode = OutMI.getOpcode();
+    MCOperand Saved = OutMI.getOperand(0);
+    OutMI = MCInst();
+    OutMI.setOpcode(Opcode);
+    OutMI.addOperand(Saved);
+    break;
+  }
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    OutMI = MCInst();
+    OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+    break;
+  }
+
+  // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
+  case X86::TAILJMPr:
+  case X86::TAILJMPd:
+  case X86::TAILJMPd64: {
+    unsigned Opcode;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::TAILJMPr: Opcode = X86::JMP32r; break;
+    case X86::TAILJMPd:
+    case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
+    }
+
+    MCOperand Saved = OutMI.getOperand(0);
+    OutMI = MCInst();
+    OutMI.setOpcode(Opcode);
+    OutMI.addOperand(Saved);
+    break;
+  }
+
+  // These are pseudo-ops for OR to help with the OR->ADD transformation.  We do
+  // this with an ugly goto in case the resultant OR uses EAX and needs the
+  // short form.
+  case X86::ADD16rr_DB:   OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+  case X86::ADD32rr_DB:   OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+  case X86::ADD64rr_DB:   OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+  case X86::ADD16ri_DB:   OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+  case X86::ADD32ri_DB:   OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+  case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
+  case X86::ADD16ri8_DB:  OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+  case X86::ADD32ri8_DB:  OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+  case X86::ADD64ri8_DB:  OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+
+  // The assembler backend wants to see branches in their small form and relax
+  // them to their large form.  The JIT can only handle the large form because
+  // it does not do relaxation.  For now, translate the large form to the
+  // small one here.
+  case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break;
+  case X86::JO_4:  OutMI.setOpcode(X86::JO_1); break;
+  case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break;
+  case X86::JB_4:  OutMI.setOpcode(X86::JB_1); break;
+  case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break;
+  case X86::JE_4:  OutMI.setOpcode(X86::JE_1); break;
+  case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break;
+  case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break;
+  case X86::JA_4:  OutMI.setOpcode(X86::JA_1); break;
+  case X86::JS_4:  OutMI.setOpcode(X86::JS_1); break;
+  case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break;
+  case X86::JP_4:  OutMI.setOpcode(X86::JP_1); break;
+  case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break;
+  case X86::JL_4:  OutMI.setOpcode(X86::JL_1); break;
+  case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break;
+  case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
+  case X86::JG_4:  OutMI.setOpcode(X86::JG_1); break;
+
+  // Atomic load and store require a separate pseudo-inst because Acquire
+  // implies mayStore and Release implies mayLoad; fix these to regular MOV
+  // instructions here
+  case X86::ACQUIRE_MOV8rm:  OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+  case X86::RELEASE_MOV8mr:  OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+  case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+  case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+  case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+
+  // We don't currently select the correct instruction form for instructions
+  // which have a short %eax, etc. form. Handle this by custom lowering, for
+  // now.
+  //
+  // Note, we are currently not handling the following instructions:
+  // MOV64ao8, MOV64o8a
+  // XCHG16ar, XCHG32ar, XCHG64ar
+  case X86::MOV8mr_NOREX:
+  case X86::MOV8mr:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break;
+  case X86::MOV8rm_NOREX:
+  case X86::MOV8rm:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break;
+  case X86::MOV16mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break;
+  case X86::MOV16rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break;
+  case X86::MOV32mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
+  case X86::MOV32rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
+
+  case X86::ADC8ri:     SimplifyShortImmForm(OutMI, X86::ADC8i8);    break;
+  case X86::ADC16ri:    SimplifyShortImmForm(OutMI, X86::ADC16i16);  break;
+  case X86::ADC32ri:    SimplifyShortImmForm(OutMI, X86::ADC32i32);  break;
+  case X86::ADC64ri32:  SimplifyShortImmForm(OutMI, X86::ADC64i32);  break;
+  case X86::ADD8ri:     SimplifyShortImmForm(OutMI, X86::ADD8i8);    break;
+  case X86::ADD16ri:    SimplifyShortImmForm(OutMI, X86::ADD16i16);  break;
+  case X86::ADD32ri:    SimplifyShortImmForm(OutMI, X86::ADD32i32);  break;
+  case X86::ADD64ri32:  SimplifyShortImmForm(OutMI, X86::ADD64i32);  break;
+  case X86::AND8ri:     SimplifyShortImmForm(OutMI, X86::AND8i8);    break;
+  case X86::AND16ri:    SimplifyShortImmForm(OutMI, X86::AND16i16);  break;
+  case X86::AND32ri:    SimplifyShortImmForm(OutMI, X86::AND32i32);  break;
+  case X86::AND64ri32:  SimplifyShortImmForm(OutMI, X86::AND64i32);  break;
+  case X86::CMP8ri:     SimplifyShortImmForm(OutMI, X86::CMP8i8);    break;
+  case X86::CMP16ri:    SimplifyShortImmForm(OutMI, X86::CMP16i16);  break;
+  case X86::CMP32ri:    SimplifyShortImmForm(OutMI, X86::CMP32i32);  break;
+  case X86::CMP64ri32:  SimplifyShortImmForm(OutMI, X86::CMP64i32);  break;
+  case X86::OR8ri:      SimplifyShortImmForm(OutMI, X86::OR8i8);     break;
+  case X86::OR16ri:     SimplifyShortImmForm(OutMI, X86::OR16i16);   break;
+  case X86::OR32ri:     SimplifyShortImmForm(OutMI, X86::OR32i32);   break;
+  case X86::OR64ri32:   SimplifyShortImmForm(OutMI, X86::OR64i32);   break;
+  case X86::SBB8ri:     SimplifyShortImmForm(OutMI, X86::SBB8i8);    break;
+  case X86::SBB16ri:    SimplifyShortImmForm(OutMI, X86::SBB16i16);  break;
+  case X86::SBB32ri:    SimplifyShortImmForm(OutMI, X86::SBB32i32);  break;
+  case X86::SBB64ri32:  SimplifyShortImmForm(OutMI, X86::SBB64i32);  break;
+  case X86::SUB8ri:     SimplifyShortImmForm(OutMI, X86::SUB8i8);    break;
+  case X86::SUB16ri:    SimplifyShortImmForm(OutMI, X86::SUB16i16);  break;
+  case X86::SUB32ri:    SimplifyShortImmForm(OutMI, X86::SUB32i32);  break;
+  case X86::SUB64ri32:  SimplifyShortImmForm(OutMI, X86::SUB64i32);  break;
+  case X86::TEST8ri:    SimplifyShortImmForm(OutMI, X86::TEST8i8);   break;
+  case X86::TEST16ri:   SimplifyShortImmForm(OutMI, X86::TEST16i16); break;
+  case X86::TEST32ri:   SimplifyShortImmForm(OutMI, X86::TEST32i32); break;
+  case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break;
+  case X86::XOR8ri:     SimplifyShortImmForm(OutMI, X86::XOR8i8);    break;
+  case X86::XOR16ri:    SimplifyShortImmForm(OutMI, X86::XOR16i16);  break;
+  case X86::XOR32ri:    SimplifyShortImmForm(OutMI, X86::XOR32i32);  break;
+  case X86::XOR64ri32:  SimplifyShortImmForm(OutMI, X86::XOR64i32);  break;
+
+  // Try to shrink some forms of movsx.
+  case X86::MOVSX16rr8:
+  case X86::MOVSX32rr16:
+  case X86::MOVSX64rr32:
+    SimplifyMOVSX(OutMI);
+    break;
+  }
+}
+
+static void LowerTlsAddr(MCStreamer &OutStreamer,
+                         X86MCInstLower &MCInstLowering,
+                         const MachineInstr &MI,
+                         const MCSubtargetInfo& STI) {
+
+  bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+                  MI.getOpcode() == X86::TLS_base_addr64;
+
+  bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
+  MCContext &context = OutStreamer.getContext();
+
+  if (needsPadding)
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
+
+  MCSymbolRefExpr::VariantKind SRVK;
+  switch (MI.getOpcode()) {
+    case X86::TLS_addr32:
+    case X86::TLS_addr64:
+      SRVK = MCSymbolRefExpr::VK_TLSGD;
+      break;
+    case X86::TLS_base_addr32:
+      SRVK = MCSymbolRefExpr::VK_TLSLDM;
+      break;
+    case X86::TLS_base_addr64:
+      SRVK = MCSymbolRefExpr::VK_TLSLD;
+      break;
+    default:
+      llvm_unreachable("unexpected opcode");
+  }
+
+  MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
+  const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context);
+
+  MCInst LEA;
+  if (is64Bits) {
+    LEA.setOpcode(X86::LEA64r);
+    LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest
+    LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base
+    LEA.addOperand(MCOperand::CreateImm(1));        // scale
+    LEA.addOperand(MCOperand::CreateReg(0));        // index
+    LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::CreateReg(0));        // seg
+  } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+    LEA.setOpcode(X86::LEA32r);
+    LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+    LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base
+    LEA.addOperand(MCOperand::CreateImm(1));        // scale
+    LEA.addOperand(MCOperand::CreateReg(0));        // index
+    LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::CreateReg(0));        // seg
+  } else {
+    LEA.setOpcode(X86::LEA32r);
+    LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+    LEA.addOperand(MCOperand::CreateReg(0));        // base
+    LEA.addOperand(MCOperand::CreateImm(1));        // scale
+    LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index
+    LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::CreateReg(0));        // seg
+  }
+  OutStreamer.EmitInstruction(LEA, STI);
+
+  if (needsPadding) {
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX), STI);
+  }
+
+  StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
+  MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name);
+  const MCSymbolRefExpr *tlsRef =
+    MCSymbolRefExpr::Create(tlsGetAddr,
+                            MCSymbolRefExpr::VK_PLT,
+                            context);
+
+  OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+                                                     : X86::CALLpcrel32)
+    .addExpr(tlsRef), STI);
+}
+
+/// \brief Emit the optimal amount of multi-byte nops on X86.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSubtargetInfo &STI) {
+  // This works only for 64bit. For 32bit we have to do additional checking if
+  // the CPU supports multi-byte nops.
+  assert(Is64Bit && "EmitNops only supports X86-64");
+  while (NumBytes) {
+    unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+    Opc = IndexReg = Displacement = SegmentReg = 0;
+    BaseReg = X86::RAX; ScaleVal = 1;
+    switch (NumBytes) {
+    case  0: llvm_unreachable("Zero nops?"); break;
+    case  1: NumBytes -=  1; Opc = X86::NOOP; break;
+    case  2: NumBytes -=  2; Opc = X86::XCHG16ar; break;
+    case  3: NumBytes -=  3; Opc = X86::NOOPL; break;
+    case  4: NumBytes -=  4; Opc = X86::NOOPL; Displacement = 8; break;
+    case  5: NumBytes -=  5; Opc = X86::NOOPL; Displacement = 8;
+             IndexReg = X86::RAX; break;
+    case  6: NumBytes -=  6; Opc = X86::NOOPW; Displacement = 8;
+             IndexReg = X86::RAX; break;
+    case  7: NumBytes -=  7; Opc = X86::NOOPL; Displacement = 512; break;
+    case  8: NumBytes -=  8; Opc = X86::NOOPL; Displacement = 512;
+             IndexReg = X86::RAX; break;
+    case  9: NumBytes -=  9; Opc = X86::NOOPW; Displacement = 512;
+             IndexReg = X86::RAX; break;
+    default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512;
+             IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+    }
+
+    unsigned NumPrefixes = std::min(NumBytes, 5U);
+    NumBytes -= NumPrefixes;
+    for (unsigned i = 0; i != NumPrefixes; ++i)
+      OS.EmitBytes("\x66");
+
+    switch (Opc) {
+    default: llvm_unreachable("Unexpected opcode"); break;
+    case X86::NOOP:
+      OS.EmitInstruction(MCInstBuilder(Opc), STI);
+      break;
+    case X86::XCHG16ar:
+      OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+      break;
+    case X86::NOOPL:
+    case X86::NOOPW:
+      OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal)
+                                           .addReg(IndexReg)
+                                           .addImm(Displacement)
+                                           .addReg(SegmentReg), STI);
+      break;
+    }
+  } // while (NumBytes)
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM,
+                          const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) {
+  unsigned NumBytes = MI.getOperand(1).getImm();
+  SM.recordStackMap(MI);
+  // Emit padding.
+  // FIXME: These nops ensure that the stackmap's shadow is covered by
+  // instructions from the same basic block, but the nops should not be
+  // necessary if instructions from the same block follow the stackmap.
+  EmitNops(OS, NumBytes, Is64Bit, STI);
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM,
+                            const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) {
+  assert(Is64Bit && "Patchpoint currently only supports X86-64");
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers opers(&MI);
+  unsigned ScratchIdx = opers.getNextScratchIdx();
+  unsigned EncodedBytes = 0;
+  int64_t CallTarget = opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  if (CallTarget) {
+    // Emit MOV to materialize the target address and the CALL to target.
+    // This is encoded with 12-13 bytes, depending on which register is used.
+    unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+    if (X86II::isX86_64ExtendedReg(ScratchReg))
+      EncodedBytes = 13;
+    else
+      EncodedBytes = 12;
+    OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg)
+                                                  .addImm(CallTarget), STI);
+    OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg), STI);
+  }
+  // Emit padding.
+  unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+
+  EmitNops(OS, NumBytes - EncodedBytes, Is64Bit, STI);
+}
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  X86MCInstLower MCInstLowering(*MF, *this);
+  const X86RegisterInfo *RI =
+      static_cast<const X86RegisterInfo *>(TM.getRegisterInfo());
+
+  switch (MI->getOpcode()) {
+  case TargetOpcode::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+
+  // Emit nothing here but a comment if we can.
+  case X86::Int_MemBarrier:
+    OutStreamer.emitRawComment("MEMBARRIER");
+    return;
+
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    // Lower these as normal, but add some comments.
+    unsigned Reg = MI->getOperand(0).getReg();
+    OutStreamer.AddComment(StringRef("eh_return, addr: %") +
+                           X86ATTInstPrinter::getRegisterName(Reg));
+    break;
+  }
+  case X86::TAILJMPr:
+  case X86::TAILJMPd:
+  case X86::TAILJMPd64:
+    // Lower these as normal, but add some comments.
+    OutStreamer.AddComment("TAILCALL");
+    break;
+
+  case X86::TLS_addr32:
+  case X86::TLS_addr64:
+  case X86::TLS_base_addr32:
+  case X86::TLS_base_addr64:
+    return LowerTlsAddr(OutStreamer, MCInstLowering, *MI, getSubtargetInfo());
+
+  case X86::MOVPC32r: {
+    // This is a pseudo op for a two instruction sequence with a label, which
+    // looks like:
+    //     call "L1$pb"
+    // "L1$pb":
+    //     popl %esi
+
+    // Emit the call.
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+    // FIXME: We would like an efficient form for this, so we don't have to do a
+    // lot of extra uniquing.
+    EmitToStreamer(OutStreamer, MCInstBuilder(X86::CALLpcrel32)
+      .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+
+    // Emit the label.
+    OutStreamer.EmitLabel(PICBase);
+
+    // popl $reg
+    EmitToStreamer(OutStreamer, MCInstBuilder(X86::POP32r)
+      .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+
+  case X86::ADD32ri: {
+    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+      break;
+
+    // Okay, we have something like:
+    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+    // For this, we want to print something like:
+    //   MYGLOBAL + (. - PICBASE)
+    // However, we can't generate a ".", so just emit a new label here and refer
+    // to it.
+    MCSymbol *DotSym = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(DotSym);
+
+    // Now that we have emitted the label, lower the complex operand expression.
+    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+    const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+    const MCExpr *PICBase =
+      MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext);
+    DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+
+    DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+                                      DotExpr, OutContext);
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(X86::ADD32ri)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addExpr(DotExpr));
+    return;
+  }
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo());
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo());
+
+  case X86::MORESTACK_RET:
+    EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget)));
+    return;
+
+  case X86::MORESTACK_RET_RESTORE_R10:
+    // Return, then restore R10.
+    EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget)));
+    EmitToStreamer(OutStreamer, MCInstBuilder(X86::MOV64rr)
+      .addReg(X86::R10)
+      .addReg(X86::RAX));
+    return;
+
+  case X86::SEH_PushReg:
+    OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
+    return;
+
+  case X86::SEH_SaveReg:
+    OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+                                  MI->getOperand(1).getImm());
+    return;
+
+  case X86::SEH_SaveXMM:
+    OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+                                  MI->getOperand(1).getImm());
+    return;
+
+  case X86::SEH_StackAlloc:
+    OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+    return;
+
+  case X86::SEH_SetFrame:
+    OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+                                   MI->getOperand(1).getImm());
+    return;
+
+  case X86::SEH_PushFrame:
+    OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+    return;
+
+  case X86::SEH_EndPrologue:
+    OutStreamer.EmitWinCFIEndProlog();
+    return;
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(OutStreamer, TmpInst);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 0000000..568dc22
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..78d20ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,145 @@
+//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// ForceFramePointer - True if the function is required to use of frame
+  /// pointer for reasons other than it containing dynamic allocation or
+  /// that FP eliminatation is turned off. For example, Cygwin main function
+  /// contains stack pointer re-alignment code which requires FP.
+  bool ForceFramePointer;
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+  /// to the space used by the return address).
+  /// Used on windows platform for stdcall & fastcall name decoration
+  unsigned BytesToPopOnReturn;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// TailCallReturnAddrDelta - The number of bytes by which return address
+  /// stack slot is moved as the result of tail call optimization.
+  int TailCallReturnAddrDelta;
+
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// RegSaveFrameIndex - X86-64 vararg func register save area.
+  int RegSaveFrameIndex;
+  /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+  unsigned VarArgsGPOffset;
+  /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+  unsigned VarArgsFPOffset;
+  /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+  /// being passed on the stack.
+  unsigned ArgumentStackSize;
+  /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+  unsigned NumLocalDynamics;
+
+public:
+  X86MachineFunctionInfo() : ForceFramePointer(false),
+                             CalleeSavedFrameSize(0),
+                             BytesToPopOnReturn(0),
+                             ReturnAddrIndex(0),
+                             TailCallReturnAddrDelta(0),
+                             SRetReturnReg(0),
+                             GlobalBaseReg(0),
+                             VarArgsFrameIndex(0),
+                             RegSaveFrameIndex(0),
+                             VarArgsGPOffset(0),
+                             VarArgsFPOffset(0),
+                             ArgumentStackSize(0),
+                             NumLocalDynamics(0) {}
+
+  explicit X86MachineFunctionInfo(MachineFunction &MF)
+    : ForceFramePointer(false),
+      CalleeSavedFrameSize(0),
+      BytesToPopOnReturn(0),
+      ReturnAddrIndex(0),
+      TailCallReturnAddrDelta(0),
+      SRetReturnReg(0),
+      GlobalBaseReg(0),
+      VarArgsFrameIndex(0),
+      RegSaveFrameIndex(0),
+      VarArgsGPOffset(0),
+      VarArgsFPOffset(0),
+      ArgumentStackSize(0),
+      NumLocalDynamics(0) {}
+
+  bool getForceFramePointer() const { return ForceFramePointer;}
+  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+  void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+  int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+  void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+  unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+  void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+  unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+  void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 0000000..6639875
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,217 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+  struct VisitedBBInfo {
+    // HasReturn - Whether the BB contains a return instruction
+    bool HasReturn;
+
+    // Cycles - Number of cycles until return if HasReturn is true, otherwise
+    // number of cycles until end of the BB
+    unsigned int Cycles;
+
+    VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+    VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+      : HasReturn(HasReturn), Cycles(Cycles) {}
+  };
+
+  struct PadShortFunc : public MachineFunctionPass {
+    static char ID;
+    PadShortFunc() : MachineFunctionPass(ID)
+                   , Threshold(4), TM(nullptr), TII(nullptr) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    const char *getPassName() const override {
+      return "X86 Atom pad short functions";
+    }
+
+  private:
+    void findReturns(MachineBasicBlock *MBB,
+                     unsigned int Cycles = 0);
+
+    bool cyclesUntilReturn(MachineBasicBlock *MBB,
+                           unsigned int &Cycles);
+
+    void addPadding(MachineBasicBlock *MBB,
+                    MachineBasicBlock::iterator &MBBI,
+                    unsigned int NOOPsToAdd);
+
+    const unsigned int Threshold;
+
+    // ReturnBBs - Maps basic blocks that return to the minimum number of
+    // cycles until the return, starting from the entry block.
+    DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+    // VisitedBBs - Cache of previously visited BBs.
+    DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+    const TargetMachine *TM;
+    const TargetInstrInfo *TII;
+  };
+
+  char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+  return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+  const AttributeSet &FnAttrs = MF.getFunction()->getAttributes();
+  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                           Attribute::OptimizeForSize) ||
+      FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                           Attribute::MinSize)) {
+    return false;
+  }
+
+  TM = &MF.getTarget();
+  if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
+    return false;
+
+  TII = TM->getInstrInfo();
+
+  // Search through basic blocks and mark the ones that have early returns
+  ReturnBBs.clear();
+  VisitedBBs.clear();
+  findReturns(MF.begin());
+
+  bool MadeChange = false;
+
+  MachineBasicBlock *MBB;
+  unsigned int Cycles = 0;
+
+  // Pad the identified basic blocks with NOOPs
+  for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+       I != ReturnBBs.end(); ++I) {
+    MBB = I->first;
+    Cycles = I->second;
+
+    if (Cycles < Threshold) {
+      // BB ends in a return. Skip over any DBG_VALUE instructions
+      // trailing the terminator.
+      assert(MBB->size() > 0 &&
+             "Basic block should contain at least a RET but is empty");
+      MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+      while (ReturnLoc->isDebugValue())
+        --ReturnLoc;
+      assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+             "Basic block does not end with RET");
+
+      addPadding(MBB, ReturnLoc, Threshold - Cycles);
+      NumBBsPadded++;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// findReturn - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+  // If this BB has a return, note how many cycles it takes to get there.
+  bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+  if (Cycles >= Threshold)
+    return;
+
+  if (hasReturn) {
+    ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+    return;
+  }
+
+  // Follow branches in BB and look for returns
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+       I != MBB->succ_end(); ++I) {
+    if (*I == MBB)
+      continue;
+    findReturns(*I, Cycles);
+  }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+                                     unsigned int &Cycles) {
+  // Return cached result if BB was previously visited
+  DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+    = VisitedBBs.find(MBB);
+  if (it != VisitedBBs.end()) {
+    VisitedBBInfo BBInfo = it->second;
+    Cycles += BBInfo.Cycles;
+    return BBInfo.HasReturn;
+  }
+
+  unsigned int CyclesToEnd = 0;
+
+  for (MachineBasicBlock::iterator MBBI = MBB->begin();
+        MBBI != MBB->end(); ++MBBI) {
+    MachineInstr *MI = MBBI;
+    // Mark basic blocks with a return instruction. Calls to other
+    // functions do not count because the called function will be padded,
+    // if necessary.
+    if (MI->isReturn() && !MI->isCall()) {
+      VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+      Cycles += CyclesToEnd;
+      return true;
+    }
+
+    CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
+  }
+
+  VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+  Cycles += CyclesToEnd;
+  return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned int NOOPsToAdd) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  while (NOOPsToAdd-- > 0) {
+    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..e8a7e84
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,717 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+cl::opt<bool>
+ForceStackAlign("force-align-stack",
+                 cl::desc("Force align the stack to the minimum alignment"
+                           " needed for the function."),
+                 cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+          cl::desc("Enable use of a base pointer for complex stack frames"));
+
+X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI)
+    : X86GenRegisterInfo(
+          (STI.is64Bit() ? X86::RIP : X86::EIP),
+          X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false),
+          X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true),
+          (STI.is64Bit() ? X86::RIP : X86::EIP)),
+      Subtarget(STI) {
+  X86_MC::InitLLVM2SEHRegisterMapping(this);
+
+  // Cache some information.
+  Is64Bit = Subtarget.is64Bit();
+  IsWin64 = Subtarget.isTargetWin64();
+
+  if (Is64Bit) {
+    SlotSize = 8;
+    StackPtr = X86::RSP;
+    FramePtr = X86::RBP;
+  } else {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+  }
+  // Use a callee-saved register as the base pointer.  These registers must
+  // not conflict with any ABI requirements.  For example, in 32-bit mode PIC
+  // requires GOT in the EBX register before function calls via PLT GOT pointer.
+  BasePtr = Is64Bit ? X86::RBX : X86::ESI;
+}
+
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  // ExeDepsFixer and PostRAScheduler require liveness.
+  return true;
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+  return getEncodingValue(i);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+                                       unsigned Idx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  // It behaves just like the sub_8bit_hi index.
+  if (!Is64Bit && Idx == X86::sub_8bit)
+    Idx = X86::sub_8bit_hi;
+
+  // Forward to TableGen's default version.
+  return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                          const TargetRegisterClass *B,
+                                          unsigned SubIdx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  if (!Is64Bit && SubIdx == X86::sub_8bit) {
+    A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+    if (!A)
+      return nullptr;
+  }
+  return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass*
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+  // Don't allow super-classes of GR8_NOREX.  This class is only used after
+  // extrating sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
+  // to the full GR8 register class in 64-bit mode, so we cannot allow the
+  // reigster class inflation.
+  //
+  // The GR8_NOREX class is always used in a way that won't be constrained to a
+  // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+  // full GR8 class.
+  if (RC == &X86::GR8_NOREXRegClass)
+    return RC;
+
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+  do {
+    switch (Super->getID()) {
+    case X86::GR8RegClassID:
+    case X86::GR16RegClassID:
+    case X86::GR32RegClassID:
+    case X86::GR64RegClassID:
+    case X86::FR32RegClassID:
+    case X86::FR64RegClassID:
+    case X86::RFP32RegClassID:
+    case X86::RFP64RegClassID:
+    case X86::RFP80RegClassID:
+    case X86::VR128RegClassID:
+    case X86::VR256RegClassID:
+      // Don't return a super-class that would shrink the spill size.
+      // That can happen with the vector and float classes.
+      if (Super->getSize() == RC->getSize())
+        return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                    unsigned Kind) const {
+  switch (Kind) {
+  default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+  case 0: // Normal GPRs.
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64RegClass;
+    return &X86::GR32RegClass;
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64_NOSPRegClass;
+    return &X86::GR32_NOSPRegClass;
+  case 2: // Available for tailcall (not callee-saved GPRs).
+    if (Subtarget.isTargetWin64())
+      return &X86::GR64_TCW64RegClass;
+    else if (Subtarget.is64Bit())
+      return &X86::GR64_TCRegClass;
+
+    const Function *F = MF.getFunction();
+    bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+    if (hasHipeCC)
+      return &X86::GR32RegClass;
+    return &X86::GR32_TCRegClass;
+  }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &X86::CCRRegClass) {
+    if (Is64Bit)
+      return &X86::GR64RegClass;
+    else
+      return &X86::GR32RegClass;
+  }
+  return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                     MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case X86::GR32RegClassID:
+    return 4 - FPDiff;
+  case X86::GR64RegClassID:
+    return 12 - FPDiff;
+  case X86::VR128RegClassID:
+    return Subtarget.is64Bit() ? 10 : 4;
+  case X86::VR64RegClassID:
+    return 4;
+  }
+}
+
+const MCPhysReg *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+
+  assert(MF && "MachineFunction required");
+  switch (MF->getFunction()->getCallingConv()) {
+  case CallingConv::GHC:
+  case CallingConv::HiPE:
+    return CSR_NoRegs_SaveList;
+  case CallingConv::AnyReg:
+    if (HasAVX)
+      return CSR_64_AllRegs_AVX_SaveList;
+    return CSR_64_AllRegs_SaveList;
+  case CallingConv::PreserveMost:
+    return CSR_64_RT_MostRegs_SaveList;
+  case CallingConv::PreserveAll:
+    if (HasAVX)
+      return CSR_64_RT_AllRegs_AVX_SaveList;
+    return CSR_64_RT_AllRegs_SaveList;
+  case CallingConv::Intel_OCL_BI: {
+    if (HasAVX512 && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+    if (HasAVX512 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX512_SaveList;
+    if (HasAVX && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+    if (HasAVX && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX_SaveList;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_SaveList;
+    break;
+  }
+  case CallingConv::Cold:
+    if (Is64Bit)
+      return CSR_64_MostRegs_SaveList;
+    break;
+  default:
+    break;
+  }
+
+  bool CallsEHReturn = MF->getMMI().callsEHReturn();
+  if (Is64Bit) {
+    if (IsWin64)
+      return CSR_Win64_SaveList;
+    if (CallsEHReturn)
+      return CSR_64EHRet_SaveList;
+    return CSR_64_SaveList;
+  }
+  if (CallsEHReturn)
+    return CSR_32EHRet_SaveList;
+  return CSR_32_SaveList;
+}
+
+const uint32_t*
+X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+
+  switch (CC) {
+  case CallingConv::GHC:
+  case CallingConv::HiPE:
+    return CSR_NoRegs_RegMask;
+  case CallingConv::AnyReg:
+    if (HasAVX)
+      return CSR_64_AllRegs_AVX_RegMask;
+    return CSR_64_AllRegs_RegMask;
+  case CallingConv::PreserveMost:
+    return CSR_64_RT_MostRegs_RegMask;
+  case CallingConv::PreserveAll:
+    if (HasAVX)
+      return CSR_64_RT_AllRegs_AVX_RegMask;
+    return CSR_64_RT_AllRegs_RegMask;
+  case CallingConv::Intel_OCL_BI: {
+    if (HasAVX512 && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+    if (HasAVX512 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX512_RegMask;
+    if (HasAVX && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+    if (HasAVX && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX_RegMask;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_RegMask;
+    break;
+  }
+  case CallingConv::Cold:
+    if (Is64Bit)
+      return CSR_64_MostRegs_RegMask;
+    break;
+  default:
+    break;
+  }
+
+  // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+  // callsEHReturn().
+  if (Is64Bit) {
+    if (IsWin64)
+      return CSR_Win64_RegMask;
+    return CSR_64_RegMask;
+  }
+  return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // Set the stack-pointer register and its aliases as reserved.
+  for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
+    Reserved.set(*I);
+
+  // Set the instruction pointer register and its aliases as reserved.
+  for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
+    Reserved.set(*I);
+
+  // Set the frame-pointer register and its aliases as reserved if needed.
+  if (TFI->hasFP(MF)) {
+    for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+         ++I)
+      Reserved.set(*I);
+  }
+
+  // Set the base-pointer register and its aliases as reserved if needed.
+  if (hasBasePointer(MF)) {
+    CallingConv::ID CC = MF.getFunction()->getCallingConv();
+    const uint32_t* RegMask = getCallPreservedMask(CC);
+    if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+      report_fatal_error(
+        "Stack realignment in presence of dynamic allocas is not supported with"
+        "this calling convention.");
+
+    for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true);
+         I.isValid(); ++I)
+      Reserved.set(*I);
+  }
+
+  // Mark the segment registers as reserved.
+  Reserved.set(X86::CS);
+  Reserved.set(X86::SS);
+  Reserved.set(X86::DS);
+  Reserved.set(X86::ES);
+  Reserved.set(X86::FS);
+  Reserved.set(X86::GS);
+
+  // Mark the floating point stack registers as reserved.
+  for (unsigned n = 0; n != 8; ++n)
+    Reserved.set(X86::ST0 + n);
+
+  // Reserve the registers that only exist in 64-bit mode.
+  if (!Is64Bit) {
+    // These 8-bit registers are part of the x86-64 extension even though their
+    // super-registers are old 32-bits.
+    Reserved.set(X86::SIL);
+    Reserved.set(X86::DIL);
+    Reserved.set(X86::BPL);
+    Reserved.set(X86::SPL);
+
+    for (unsigned n = 0; n != 8; ++n) {
+      // R8, R9, ...
+      for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+
+      // XMM8, XMM9, ...
+      for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+  if (!Is64Bit || !Subtarget.hasAVX512()) {
+    for (unsigned n = 16; n != 32; ++n) {
+      for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+   const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+   if (!EnableBasePointer)
+     return false;
+
+   // When we need stack realignment, we can't address the stack from the frame
+   // pointer.  When we have dynamic allocas or stack-adjusting inline asm, we
+   // can't address variables from the stack pointer.  MS inline asm can
+   // reference locals while also adjusting the stack pointer.  When we can't
+   // use both the SP and the FP, we need a separate base pointer register.
+   bool CantUseFP = needsStackRealignment(MF);
+   bool CantUseSP =
+       MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust();
+   return CantUseFP && CantUseSP;
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+    return false;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  // Stack realignment requires a frame pointer.  If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(FramePtr))
+    return false;
+
+  // If a base pointer is necessary.  Check that it isn't too late to reserve
+  // it.
+  if (MFI->hasVarSizedObjects())
+    return MRI->canReserveReg(BasePtr);
+  return true;
+}
+
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackAlignment));
+
+  // If we've requested that we force align the stack do so now.
+  if (ForceStackAlign)
+    return canRealignStack(MF);
+
+  return requiresRealignment && canRealignStack(MF);
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+                                           unsigned Reg, int &FrameIdx) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (Reg == FramePtr && TFI->hasFP(MF)) {
+    FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
+    return true;
+  }
+  return false;
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, unsigned FIOperandNum,
+                                     RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned BasePtr;
+
+  unsigned Opc = MI.getOpcode();
+  bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
+  if (hasBasePointer(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+  else if (needsStackRealignment(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+  else if (AfterFPPop)
+    BasePtr = StackPtr;
+  else
+    BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex with base register with EBP.  Add an offset to the offset.
+  MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+  // Now add the frame object offset to the offset from EBP.
+  int FIOffset;
+  if (AfterFPPop) {
+    // Tail call jmp happens after FP is popped.
+    const MachineFrameInfo *MFI = MF.getFrameInfo();
+    FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+  } else
+    FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+
+  // The frame index format for stackmaps and patchpoints is different from the
+  // X86 format. It only has a FI and an offset.
+  if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+    assert(BasePtr == FramePtr && "Expected the FP as base register");
+    int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  if (MI.getOperand(FIOperandNum+3).isImm()) {
+    // Offset is a 32-bit integer.
+    int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+    int Offset = FIOffset + Imm;
+    assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+           "Requesting 64-bit offset in 32-bit immediate!");
+    MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+  } else {
+    // Offset is symbolic. This is extremely rare.
+    uint64_t Offset = FIOffset +
+      (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+    MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+  }
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+namespace llvm {
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
+                                bool High) {
+  switch (VT) {
+  default: llvm_unreachable("Unexpected VT");
+  case MVT::i8:
+    if (High) {
+      switch (Reg) {
+      default: return getX86SubSuperRegister(Reg, MVT::i64);
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SI;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DI;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BP;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SP;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AH;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DH;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CH;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BH;
+      }
+    } else {
+      switch (Reg) {
+      default: llvm_unreachable("Unexpected register");
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AL;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DL;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CL;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BL;
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SIL;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DIL;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BPL;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SPL;
+      case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+        return X86::R8B;
+      case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+        return X86::R9B;
+      case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+        return X86::R10B;
+      case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+        return X86::R11B;
+      case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+        return X86::R12B;
+      case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+        return X86::R13B;
+      case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+        return X86::R14B;
+      case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+        return X86::R15B;
+      }
+    }
+  case MVT::i16:
+    switch (Reg) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::AX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::DX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::CX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::BX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::SI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::DI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::BP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::SP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8W;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9W;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10W;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11W;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12W;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13W;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14W;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15W;
+    }
+  case MVT::i32:
+    switch (Reg) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::EAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::EDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::ECX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::EBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::ESI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::EDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::EBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::ESP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8D;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9D;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10D;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11D;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12D;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13D;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14D;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15D;
+    }
+  case MVT::i64:
+    switch (Reg) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::RAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::RDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::RCX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::RBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::RSI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::RDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::RBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::RSP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15;
+    }
+  }
+}
+
+unsigned get512BitSuperRegister(unsigned Reg) {
+  if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
+    return X86::ZMM0 + (Reg - X86::XMM0);
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
+    return X86::ZMM0 + (Reg - X86::YMM0);
+  if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
+    return Reg;
+  llvm_unreachable("Unexpected SIMD register");
+}
+
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..74efd1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,141 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+  class Type;
+  class TargetInstrInfo;
+  class X86Subtarget;
+
+class X86RegisterInfo final : public X86GenRegisterInfo {
+public:
+  const X86Subtarget &Subtarget;
+
+private:
+  /// Is64Bit - Is the target 64-bits.
+  ///
+  bool Is64Bit;
+
+  /// IsWin64 - Is the target on of win64 flavours
+  ///
+  bool IsWin64;
+
+  /// SlotSize - Stack slot size in bytes.
+  ///
+  unsigned SlotSize;
+
+  /// StackPtr - X86 physical register used as stack ptr.
+  ///
+  unsigned StackPtr;
+
+  /// FramePtr - X86 physical register used as frame ptr.
+  ///
+  unsigned FramePtr;
+
+  /// BasePtr - X86 physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
+public:
+  X86RegisterInfo(const X86Subtarget &STI);
+
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const;
+
+  /// Code Generation virtual methods...
+  ///
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  /// getMatchingSuperRegClass - Return a subclass of the specified register
+  /// class A so that each register in it has a sub-register of the
+  /// specified sub-register index which is in the specified register class B.
+  const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B,
+                           unsigned Idx) const override;
+
+  const TargetRegisterClass *
+  getSubClassWithSubReg(const TargetRegisterClass *RC,
+                        unsigned Idx) const override;
+
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
+
+  /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+  /// values.
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to copy
+  /// between a two registers of the specified class.
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  /// getCalleeSavedRegs - Return a null-terminated list of all of the
+  /// callee-save registers on this target.
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction* MF) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+  const uint32_t *getNoPreservedMask() const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses and
+  /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+  /// register scavenger to determine what registers are free.
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const;
+
+  bool needsStackRealignment(const MachineFunction &MF) const override;
+
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+                            int &FrameIdx) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getStackRegister() const { return StackPtr; }
+  unsigned getBaseRegister() const { return BasePtr; }
+  // FIXME: Move to FrameInfok
+  unsigned getSlotSize() const { return SlotSize; }
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
+unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
+
+//get512BitRegister - X86 utility - returns 512-bit super register
+unsigned get512BitSuperRegister(unsigned Reg);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..0da9863
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,481 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+  let Namespace = "X86";
+  let HWEncoding = Enc;
+  let SubRegs = subregs;
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+  def sub_8bit    : SubRegIndex<8>;
+  def sub_8bit_hi : SubRegIndex<8, 8>;
+  def sub_16bit   : SubRegIndex<16>;
+  def sub_32bit   : SubRegIndex<32>;
+  def sub_xmm     : SubRegIndex<128>;
+  def sub_ymm     : SubRegIndex<256>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others.  We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliased AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
+// and debug information on X86-32/Darwin)
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL  : X86Reg<"sil",   6>;
+def DIL  : X86Reg<"dil",   7>;
+def BPL  : X86Reg<"bpl",   5>;
+def SPL  : X86Reg<"spl",   4>;
+def R8B  : X86Reg<"r8b",   8>;
+def R9B  : X86Reg<"r9b",   9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W  : X86Reg<"r8w",   8, [R8B]>;
+def R9W  : X86Reg<"r9w",   9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D  : X86Reg<"r8d",   8, [R8W]>;
+def R9D  : X86Reg<"r9d",   9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8  : X86Reg<"r8",   8, [R8D]>,  DwarfRegNum<[ 8, -2, -2]>;
+def R9  : X86Reg<"r9",   9, [R9D]>,  DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip",  0, [EIP]>,  DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8:  X86Reg<"xmm8",   8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9:  X86Reg<"xmm9",   9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16:  X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17:  X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18:  X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19:  X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20:  X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21:  X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22:  X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23:  X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24:  X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25:  X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26:  X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27:  X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28:  X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29:  X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30:  X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31:  X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
+} // CostPerUse
+
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_xmm] in {
+  foreach  Index = 0-31 in {
+    def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+                    DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+  }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+  foreach  Index = 0-31 in {
+    def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+                    DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+  }
+}
+
+  // Mask Registers, used by AVX-512 instructions.
+  def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+  def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+  def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+  def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+  def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+  def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+  def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+  def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+
+class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
+  let Aliases = A;
+}
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : STRegister<"st(0)", 0, []>,    DwarfRegNum<[33, 12, 11]>;
+def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+
+// Control registers
+def CR0  : X86Reg<"cr0",   0>;
+def CR1  : X86Reg<"cr1",   1>;
+def CR2  : X86Reg<"cr2",   2>;
+def CR3  : X86Reg<"cr3",   3>;
+def CR4  : X86Reg<"cr4",   4>;
+def CR5  : X86Reg<"cr5",   5>;
+def CR6  : X86Reg<"cr6",   6>;
+def CR7  : X86Reg<"cr7",   7>;
+def CR8  : X86Reg<"cr8",   8>;
+def CR9  : X86Reg<"cr9",   9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8],  8,
+                        (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+                             R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+  let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+  let AltOrderSelect = [{
+    return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+  }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+                         (add AX, CX, DX, SI, DI, BX, BP, SP,
+                              R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+                         (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                              R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+def GR64 : RegisterClass<"X86", [i64], 64,
+                         (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                              RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+//   segment register as one operand.  Always contain a 16-bit segment
+//   descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+                                                     R8, R9, R11, RIP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+                                                      R8, R9, R11)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+                              (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+  let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+  let AltOrderSelect = [{
+    return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+  }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+                               (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+                               (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+                            (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
+// to clear upper 32-bits of RAX so is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+                                    (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+                                    (and GR64_NOREX, GR64_NOSP)>;
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values.  This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware.  In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+  let isAllocatable = 0;
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          256, (sequence "YMM%u", 0, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512,
+    (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          256, (sequence "YMM%u", 0, 31)>;
+
+// Mask registers
+def VK1     : RegisterClass<"X86", [i1],    16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2     : RegisterClass<"X86", [v2i1],  16, (add VK1)> {let Size = 16;}
+def VK4     : RegisterClass<"X86", [v4i1],  16, (add VK2)> {let Size = 16;}
+def VK8     : RegisterClass<"X86", [v8i1],  16, (add VK4)> {let Size = 16;}
+def VK16    : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+def VK32    : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+def VK64    : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+
+def VK1WM   : RegisterClass<"X86", [i1],    16, (sub VK1, K0)> {let Size = 16;}
+def VK2WM   : RegisterClass<"X86", [v2i1],  16, (sub VK2, K0)> {let Size = 16;}
+def VK4WM   : RegisterClass<"X86", [v4i1],  16, (sub VK4, K0)> {let Size = 16;}
+def VK8WM   : RegisterClass<"X86", [v8i1],  16, (sub VK8, K0)> {let Size = 16;}
+def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>   {let Size = 16;}
+def VK32WM  : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..0333056
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,52 @@
+//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace X86 {
+    /// RelocationType - An enum for the x86 relocation codes. Note that
+    /// the terminology here doesn't follow x86 convention - word means
+    /// 32-bit and dword means 64-bit. The relocations will be treated
+    /// by JIT or ObjectCode emitters, this is transparent to the x86 code
+    /// emitter but JIT and ObjectCode will treat them differently
+    enum RelocationType {
+      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+      /// the value already in memory, after we adjust it for where the PC is.
+      reloc_pcrel_word = 0,
+
+      /// reloc_picrel_word - PIC base relative relocation, add the relocated
+      /// value to the value already in memory, after we adjust it for where the
+      /// PIC base is.
+      reloc_picrel_word = 1,
+
+      /// reloc_absolute_word - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_word = 2,
+
+      /// reloc_absolute_word_sext - absolute relocation, just add the relocated
+      /// value to the value already in memory. In object files, it represents a
+      /// value which must be sign-extended when resolving the relocation.
+      reloc_absolute_word_sext = 3,
+
+      /// reloc_absolute_dword - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_dword = 4
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 0000000..6966d61
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,264 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+  // instructions per cycle.
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 192; // Based on the reorder buffer.
+  let LoadLatency = 4;
+  let MispredictPenalty = 16;
+
+  // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+  let LoopMicroOpBufferSize = 50;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort23  : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort05  : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>;
+def HWPort15  : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort16  : ProcResGroup<[HWPort1, HWPort6]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+                              HWPort5, HWPort6, HWPort7]> {
+  let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
+     let Latency = !add(Lat, 4);
+  }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+// Store_addr on 237.
+// Store_data on 4.
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad,  [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove,  [HWPort0156]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : HWWriteResPair<WriteALU,   HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul,  HWPort1,   3>;
+def  : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : HWWriteResPair<WriteShift, HWPort06,  1>;
+defm : HWWriteResPair<WriteJump,  HWPort06,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd,   HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv,   HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt,  HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+defm : HWWriteResPair<WriteFShuffle,  HWPort5,  1>;
+defm : HWWriteResPair<WriteFBlend,  HWPort015,  1>;
+defm : HWWriteResPair<WriteFShuffle256,  HWPort5,  3>;
+
+def : WriteRes<WriteFVarBlend, [HWPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort0,  1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU,   HWPort15,  1>;
+defm : HWWriteResPair<WriteVecIMul,  HWPort0,   5>;
+defm : HWWriteResPair<WriteShuffle,  HWPort5,  1>;
+defm : HWWriteResPair<WriteBlend,  HWPort15,  1>;
+defm : HWWriteResPair<WriteShuffle256,  HWPort5,  3>;
+
+def : WriteRes<WriteVarBlend, [HWPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 2];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [HWPort0]> {
+  let Latency = 10;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
+  let Latency = 10;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
+  let Latency = 10;
+  let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
+  let Latency = 10;
+  let ResourceCycles = [6, 2, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [HWPort0]> {
+  let Latency = 11;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
+  let Latency = 11;
+  let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [HWPort5]> {
+  let Latency = 7;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [HWPort5]> {
+  let Latency = 14;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
+  let Latency = 14;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
+  let Latency = 7;
+  let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteSystem,     [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteFence,  [HWPort23, HWPort4]>;
+def : WriteRes<WriteNop, []>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 0000000..83f0534
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,249 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+  // instructions per cycle.
+  // FIXME: Identify instructions that aren't a single fused micro-op.
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 168; // Based on the reorder buffer.
+  let LoadLatency = 4;
+  let MispredictPenalty = 16;
+
+  // Based on the LSD (loop-stream detector) queue size.
+  let LoopMicroOpBufferSize = 28;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05  : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15  : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+  let BufferSize=54;
+}
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+     let Latency = !add(Lat, 4);
+  }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad,  [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove,  [SBPort015]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : SBWriteResPair<WriteALU,   SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul,  SBPort1,   3>;
+def  : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : SBWriteResPair<WriteShift, SBPort05,  1>;
+defm : SBWriteResPair<WriteJump,  SBPort5,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd,   SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv,   SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt,  SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+defm : SBWriteResPair<WriteFShuffle,  SBPort5,  1>;
+defm : SBWriteResPair<WriteFBlend,  SBPort05,  1>;
+def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1];
+}
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05,  1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU,   SBPort15,  1>;
+defm : SBWriteResPair<WriteVecIMul,  SBPort0,   5>;
+defm : SBWriteResPair<WriteShuffle,  SBPort15,  1>;
+defm : SBWriteResPair<WriteBlend,  SBPort15,  1>;
+def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+  let Latency = 11;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SBPort015]> {
+  let Latency = 11;
+  let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [7, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SBPort015]> {
+  let Latency = 3;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> {
+  let Latency = 3;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SBPort015]> {
+  let Latency = 4;
+  let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
+  let Latency = 4;
+  let ResourceCycles = [7, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [SBPort015]> {
+  let Latency = 8;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> {
+  let Latency = 8;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESIMC, [SBPort015]> {
+  let Latency = 8;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> {
+  let Latency = 8;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SBPort015]> {
+  let Latency = 8;
+  let ResourceCycles = [11];
+}
+def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
+  let Latency = 8;
+  let ResourceCycles = [10, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SBPort015]> {
+  let Latency = 14;
+  let ResourceCycles = [18];
+}
+def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
+  let Latency = 14;
+  let ResourceCycles = [17, 1];
+}
+
+
+def : WriteRes<WriteSystem,     [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX2 is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+defm : SBWriteResPair<WriteFShuffle256, SBPort0,  1>;
+defm : SBWriteResPair<WriteShuffle256, SBPort0,  1>;
+defm : SBWriteResPair<WriteVarVecShift, SBPort0,  1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 0000000..b76850a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,642 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+  // The SchedWrite to use when a load is folded into the instruction.
+  SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+  // Register-Memory operation.
+  def Ld : SchedWrite;
+  // Register-Register operation.
+  def NAME : X86FoldableSchedWrite {
+    let Folded = !cast<SchedWrite>(NAME#"Ld");
+  }
+}
+
+// Arithmetic.
+defm WriteALU  : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def  WriteIMulH : SchedWrite;       // Integer multiplication, high part.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def  WriteLEA  : SchedWrite;        // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad  : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove  : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd  : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul  : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv  : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp  : X86SchedWritePair; // Floating point reciprocal.
+defm WriteFMA   : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle  : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend  : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend  : X86SchedWritePair; // Fp vector variable blends.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU   : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul  : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle  : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend  : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend  : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// Strings instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
+// Old microcoded instructions that nobody use.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful expect it provides a model for nops!
+def WriteNop : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM     : InstrItinClass;
+def IIC_ALU_NONMEM  : InstrItinClass;
+def IIC_LEA         : InstrItinClass;
+def IIC_LEA_16      : InstrItinClass;
+def IIC_MUL8        : InstrItinClass;
+def IIC_MUL16_MEM   : InstrItinClass;
+def IIC_MUL16_REG   : InstrItinClass;
+def IIC_MUL32_MEM   : InstrItinClass;
+def IIC_MUL32_REG   : InstrItinClass;
+def IIC_MUL64       : InstrItinClass;
+// imul by al, ax, eax, tax
+def IIC_IMUL8       : InstrItinClass;
+def IIC_IMUL16_MEM  : InstrItinClass;
+def IIC_IMUL16_REG  : InstrItinClass;
+def IIC_IMUL32_MEM  : InstrItinClass;
+def IIC_IMUL32_REG  : InstrItinClass;
+def IIC_IMUL64      : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM   : InstrItinClass;
+def IIC_IMUL16_RR   : InstrItinClass;
+def IIC_IMUL32_RM   : InstrItinClass;
+def IIC_IMUL32_RR   : InstrItinClass;
+def IIC_IMUL64_RM   : InstrItinClass;
+def IIC_IMUL64_RR   : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI  : InstrItinClass;
+def IIC_IMUL16_RRI  : InstrItinClass;
+def IIC_IMUL32_RMI  : InstrItinClass;
+def IIC_IMUL32_RRI  : InstrItinClass;
+def IIC_IMUL64_RMI  : InstrItinClass;
+def IIC_IMUL64_RRI  : InstrItinClass;
+// div
+def IIC_DIV8_MEM    : InstrItinClass;
+def IIC_DIV8_REG    : InstrItinClass;
+def IIC_DIV16       : InstrItinClass;
+def IIC_DIV32       : InstrItinClass;
+def IIC_DIV64       : InstrItinClass;
+// idiv
+def IIC_IDIV8       : InstrItinClass;
+def IIC_IDIV16      : InstrItinClass;
+def IIC_IDIV32      : InstrItinClass;
+def IIC_IDIV64      : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG   : InstrItinClass;
+def IIC_UNARY_MEM   : InstrItinClass;
+// add/sub/and/or/xor/sbc/cmp/test
+def IIC_BIN_MEM     : InstrItinClass;
+def IIC_BIN_NONMEM  : InstrItinClass;
+// adc/sbc
+def IIC_BIN_CARRY_MEM     : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM  : InstrItinClass;
+// shift/rotate
+def IIC_SR          : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+//sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+//zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+def IIC_REP_MOVS : InstrItinClass;
+def IIC_REP_STOS : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR  : InstrItinClass;
+def IIC_SSE_BIT_P_RM  : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR  : InstrItinClass;
+def IIC_SSE_INTALU_P_RM  : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR  : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM  : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
+def IIC_CMPX_LOCK : InstrItinClass;
+def IIC_CMPX_LOCK_8 : InstrItinClass;
+def IIC_CMPX_LOCK_8B : InstrItinClass;
+def IIC_CMPX_LOCK_16B : InstrItinClass;
+
+def IIC_XADD_LOCK_MEM : InstrItinClass;
+def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+def IIC_AES   : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW   : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendents, including Nehalem and SandyBridge have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are bufferred
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel {
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 32;
+  let LoadLatency = 4;
+  let HighLatency = 10;
+  let PostRAScheduler = 0;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 0000000..c8820aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,544 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// in order (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+//    Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+                      // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+                      // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+  [ Port0, Port1 ],
+  [], [
+  // P0 only
+  // InstrItinData<class, [InstrStage<N, [P0]>] >,
+  // P0 or P1
+  // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+  // P0 and P1
+  // InstrItinData<class, [InstrStage<N, [P0], 0>,  InstrStage<N, [P1]>] >,
+  //
+  // Default is 1 cycle, port0 or port1
+  InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+  // mul
+  InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+  // imul by al, ax, eax, rax
+  InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+  // imul reg by reg|mem
+  InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+  // imul reg = reg/mem * imm
+  InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+  // idiv
+  InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+  // div
+  InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+  // neg/not/inc/dec
+  InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+  // add/sub/and/or/xor/cmp/test
+  InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+  // adc/sbc
+  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
+  // shift/rotate
+  InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+  // shift double
+  InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+  // cmov
+  InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+  // set
+  InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+  // jcc
+  InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+  // jcxz/jecxz/jrcxz
+  InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+  // jmp rel
+  InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+  // jmp indirect
+  InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  // jmp far
+  InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+  // loop/loope/loopne
+  InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+  // call - all but reg/imm
+  InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+                              InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+  InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+  InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+  //ret
+  InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+  InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>,  InstrStage<1, [Port1]>] >,
+  //sign extension movs
+  InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
+  InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+  //zero extension movs
+  InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
+  InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
+
+  // SSE binary operations
+  // arithmetic fp scalar
+  InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+  // arithmetic fp parallel
+  InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+  // bitwise parallel
+  InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+  // arithmetic int parallel
+  InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+  // multiply int parallel
+  InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+  // shift parallel
+  InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+  // conversions
+  // to/from PD ...
+  InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+  // to/from PS except to/from PD and PS2PI
+  InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+  // MMX MOVs
+  InstrItinData<IIC_MMX_MOV_MM_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+  // other MMX
+  InstrItinData<IIC_MMX_ALU_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_ALU_RR,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PSADBW,   [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PCK_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PCK_RR,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PSHUF,   [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PEXTR,   [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PINSRW,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+  // conversions
+  // from/to PD
+  InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+  // from/to PI
+  InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+                                    InstrStage<5, [Port1]>]>,
+
+  InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_FLD,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FST,   [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FIST,  [InstrStage<6, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FLDZ,   [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FUCOM,  [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMI,  [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDCW,  [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+  InstrItinData<IIC_FFREE,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_WAIT,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXAM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FNOP,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDL,  [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_F2XM1,  [InstrStage<99, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2X,  [InstrStage<146, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPTAN,  [InstrStage<168, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPATAN,  [InstrStage<183, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXTRACT,  [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM1,  [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPSTP,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM,  [InstrStage<55, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2XP1,  [InstrStage<147, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSINCOS,  [InstrStage<174, [Port0, Port1]>] >,
+  InstrItinData<IIC_FRNDINT,  [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSCALE,  [InstrStage<77, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMPP,  [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FXSAVE,  [InstrStage<140, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXRSTOR,  [InstrStage<141, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT,   [InstrStage<127, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT3,  [InstrStage<130, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVD,  [InstrStage<1003, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_IRET,  [InstrStage<109, [Port0, Port1]>] >,
+  InstrItinData<IIC_HLT,   [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_LXS,   [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_LTR,   [InstrStage<83, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+  InstrItinData<IIC_RSM,   [InstrStage<741, [Port0, Port1]>] >,
+  InstrItinData<IIC_SIDT,  [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SGDT,  [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SLDT,  [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STR,    [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_IN_RR,  [InstrStage<94, [Port0, Port1]>] >,
+  InstrItinData<IIC_IN_RI,  [InstrStage<92, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+  InstrItinData<IIC_INS,    [InstrStage<59, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+  // worst case for mov REG_CRx
+  InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+  // LAR
+  InstrItinData<IIC_LAR_RM,  [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_LAR_RR,  [InstrStage<54, [Port0, Port1]>] >,
+  // LSL
+  InstrItinData<IIC_LSL_RM,  [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_LSL_RR,  [InstrStage<49, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+  InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+  InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+  InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+  // push control register, segment registers
+  InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+  // pop control register, segment registers
+  InstrItinData<IIC_POP_SR,    [InstrStage<29, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+  // VERR, VERW
+  InstrItinData<IIC_VERR,     [InstrStage<41, [Port0, Port1]>] >,
+  InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+  InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+  // WRMSR, RDMSR
+  InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+  // SMSW, LMSW
+  InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+  InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+                            InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+                               InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+                               InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+  InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+  InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+  InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+  InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+  ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
+  let MicroOpBufferSize = 0; // In-order execution, always hide latency.
+  let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
+  let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+  let PostRAScheduler = 1;
+
+  let Itineraries = AtomItineraries;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 0000000..90d8587
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,232 @@
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SLMModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+  // instructions per cycle.
+  let IssueWidth = 2;
+  let MicroOpBufferSize = 32; // Based on the reorder buffer.
+  let LoadLatency = 3;
+  let MispredictPenalty = 10;
+  let PostRAScheduler = 1;
+
+  // For small loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
+  // FIXME: SSE4 is unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops
+
+def IEC_RSV0 : ProcResource<1>;
+def IEC_RSV1 : ProcResource<1>;
+def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def MEC_RSV  : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def IEC_RSV01  : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
+def FPC_RSV01  : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+
+def SMDivider      : ProcResource<1>;
+def SMFPMultiplier : ProcResource<1>;
+def SMFPDivider    : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
+     let Latency = !add(Lat, 3);
+  }
+}
+
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [MEC_RSV]>;
+
+def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteLoad,  [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove,  [IEC_RSV01]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : SMWriteResPair<WriteALU,   IEC_RSV01, 1>;
+defm : SMWriteResPair<WriteIMul,  IEC_RSV1,  3>;
+defm : SMWriteResPair<WriteShift, IEC_RSV0,  1>;
+defm : SMWriteResPair<WriteJump,  IEC_RSV1,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [IEC_RSV1]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 25];
+}
+
+// Scalar and vector floating point.
+defm : SMWriteResPair<WriteFAdd,   FPC_RSV1, 3>;
+defm : SMWriteResPair<WriteFRcp,   FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFSqrt,  FPC_RSV0, 15>;
+defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteFShuffle,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteFBlend,  FPC_RSV0,  1>;
+
+// This is quite rough, latency depends on precision
+def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
+  let Latency = 5;
+  let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 1, 2];
+}
+
+def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
+  let Latency = 37;
+  let ResourceCycles = [1, 1, 34];
+}
+
+// Vector integer operations.
+defm : SMWriteResPair<WriteVecShift, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecALU,   FPC_RSV01,  1>;
+defm : SMWriteResPair<WriteVecIMul,  FPC_RSV0,   4>;
+defm : SMWriteResPair<WriteShuffle,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteBlend,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteMPSAD,  FPC_RSV0,  7>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+  let Latency = 13;
+  let ResourceCycles = [13];
+}
+def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 13;
+  let ResourceCycles = [13, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 17;
+  let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 17;
+  let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+  let Latency = 21;
+  let ResourceCycles = [21];
+}
+def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 21;
+  let ResourceCycles = [21, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+  let Latency = 10;
+  let ResourceCycles = [10];
+}
+def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 10;
+  let ResourceCycles = [10, 1];
+}
+
+
+def : WriteRes<WriteSystem,     [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+def  : WriteRes<WriteIMulH, [FPC_RSV0]>;
+defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteShuffle256, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0,  1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 0000000..a83dd9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,267 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+
+X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
+                                             SDValue Chain,
+                                             SDValue Dst, SDValue Src,
+                                             SDValue Size, unsigned Align,
+                                             bool isVolatile,
+                                         MachinePointerInfo DstPtrInfo) const {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
+  if ((Align & 3) != 0 || !ConstantSize ||
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+    // Check to see if there is a specialized entry-point for memory zeroing.
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+    if (const char *bzeroEntry =  V &&
+        V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+      EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy();
+      Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+      TargetLowering::ArgListTy Args;
+      TargetLowering::ArgListEntry Entry;
+      Entry.Node = Dst;
+      Entry.Ty = IntPtrTy;
+      Args.push_back(Entry);
+      Entry.Node = Size;
+      Args.push_back(Entry);
+
+      TargetLowering::CallLoweringInfo CLI(DAG);
+      CLI.setDebugLoc(dl).setChain(Chain)
+        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args),
+                   0)
+        .setDiscardResult();
+
+      std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI);
+      return CallResult.second;
+    }
+
+    // Otherwise have the target-independent code call memset.
+    return SDValue();
+  }
+
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InFlag;
+  EVT AVT;
+  SDValue Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+  unsigned BytesLeft = 0;
+  bool TwoRepStos = false;
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal);
+      break;
+    }
+
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+      BytesLeft = SizeVal % UBytes;
+    }
+
+    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+                              InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count  = DAG.getIntPtrConstant(SizeVal);
+    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+                           Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+                           Dst, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+  if (TwoRepStos) {
+    InFlag = Chain.getValue(1);
+    Count  = Size;
+    EVT CVT = Count.getValueType();
+    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+    Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
+                                                             X86::ECX,
+                              Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
+    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+  } else if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT AddrVT = Dst.getValueType();
+    EVT SizeVT = Size.getValueType();
+
+    Chain = DAG.getMemset(Chain, dl,
+                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                      DAG.getConstant(Offset, AddrVT)),
+                          Src,
+                          DAG.getConstant(BytesLeft, SizeVT),
+                          Align, isVolatile, DstPtrInfo.getWithOffset(Offset));
+  }
+
+  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+  return Chain;
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                                        SDValue Chain, SDValue Dst, SDValue Src,
+                                        SDValue Size, unsigned Align,
+                                        bool isVolatile, bool AlwaysInline,
+                                         MachinePointerInfo DstPtrInfo,
+                                         MachinePointerInfo SrcPtrInfo) const {
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
+
+  /// If not DWORD aligned, it is more efficient to call the library.  However
+  /// if calling the library is not allowed (AlwaysInline), then soldier on as
+  /// the code generated here is better than the long load-store sequence we
+  /// would otherwise get.
+  if (!AlwaysInline && (Align & 3) != 0)
+    return SDValue();
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256 ||
+      SrcPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // ESI might be used as a base pointer, in that case we can't simply overwrite
+  // the register.  Fall back to generic code.
+  const X86RegisterInfo *TRI =
+      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
+  if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+      TRI->getBaseRegister() == X86::ESI)
+    return SDValue();
+
+  MVT AVT;
+  if (Align & 1)
+    AVT = MVT::i8;
+  else if (Align & 2)
+    AVT = MVT::i16;
+  else if (Align & 4)
+    // DWORD aligned
+    AVT = MVT::i32;
+  else
+    // QWORD aligned
+    AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+
+  unsigned UBytes = AVT.getSizeInBits() / 8;
+  unsigned CountVal = SizeVal / UBytes;
+  SDValue Count = DAG.getIntPtrConstant(CountVal);
+  unsigned BytesLeft = SizeVal % UBytes;
+
+  SDValue InFlag;
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX :
+                                                              X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI :
+                                                              X86::EDI,
+                            Dst, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI :
+                                                              X86::ESI,
+                            Src, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+
+  SmallVector<SDValue, 4> Results;
+  Results.push_back(RepMovs);
+  if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT DstVT = Dst.getValueType();
+    EVT SrcVT = Src.getValueType();
+    EVT SizeVT = Size.getValueType();
+    Results.push_back(DAG.getMemcpy(Chain, dl,
+                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+                                                DAG.getConstant(Offset, DstVT)),
+                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+                                                DAG.getConstant(Offset, SrcVT)),
+                                    DAG.getConstant(BytesLeft, SizeVT),
+                                    Align, isVolatile, AlwaysInline,
+                                    DstPtrInfo.getWithOffset(Offset),
+                                    SrcPtrInfo.getWithOffset(Offset)));
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 0000000..c12555a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,48 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SELECTIONDAGINFO_H
+#define X86SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit X86SelectionDAGInfo(const DataLayout &DL);
+  ~X86SelectionDAGInfo();
+
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..41551a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,364 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+               cl::desc("Enable early if-conversion on X86"));
+
+
+/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
+  if (isPICStyleGOT())    // 32-bit ELF targets.
+    return X86II::MO_GOTOFF;
+
+  if (isPICStyleStubPIC())   // Darwin/32 in PIC mode.
+    return X86II::MO_PIC_BASE_OFFSET;
+
+  // Direct static reference to label.
+  return X86II::MO_NO_FLAG;
+}
+
+/// ClassifyGlobalReference - Classify a global variable reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+  // DLLImport only exists on windows, it is implemented as a load from a
+  // DLLIMPORT stub.
+  if (GV->hasDLLImportStorageClass())
+    return X86II::MO_DLLIMPORT;
+
+  // Determine whether this is a reference to a definition or a declaration.
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
+
+  // X86-64 in PIC mode.
+  if (isPICStyleRIPRel()) {
+    // Large model never uses stubs.
+    if (TM.getCodeModel() == CodeModel::Large)
+      return X86II::MO_NO_FLAG;
+
+    if (isTargetDarwin()) {
+      // If symbol visibility is hidden, the extra load is not needed if
+      // target is x86-64 or the symbol is definitely defined in the current
+      // translation unit.
+      if (GV->hasDefaultVisibility() &&
+          (isDecl || GV->isWeakForLinker()))
+        return X86II::MO_GOTPCREL;
+    } else if (!isTargetWin64()) {
+      assert(isTargetELF() && "Unknown rip-relative target");
+
+      // Extra load is needed for all externally visible.
+      if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+        return X86II::MO_GOTPCREL;
+    }
+
+    return X86II::MO_NO_FLAG;
+  }
+
+  if (isPICStyleGOT()) {   // 32-bit ELF targets.
+    // Extra load is needed for all externally visible.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return X86II::MO_GOTOFF;
+    return X86II::MO_GOT;
+  }
+
+  if (isPICStyleStubPIC()) {  // Darwin/32 in PIC mode.
+    // Determine whether we have a stub reference and/or whether the reference
+    // is relative to the PIC base or not.
+
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (!isDecl && !GV->isWeakForLinker())
+      return X86II::MO_PIC_BASE_OFFSET;
+
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+    // If symbol visibility is hidden, we have a stub for common symbol
+    // references and external declarations.
+    if (isDecl || GV->hasCommonLinkage()) {
+      // Hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+    }
+
+    // Otherwise, no stub.
+    return X86II::MO_PIC_BASE_OFFSET;
+  }
+
+  if (isPICStyleStubNoDynamic()) {  // Darwin/32 in -mdynamic-no-pic mode.
+    // Determine whether we have a stub reference.
+
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (!isDecl && !GV->isWeakForLinker())
+      return X86II::MO_NO_FLAG;
+
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_NONLAZY;
+
+    // Otherwise, no stub.
+    return X86II::MO_NO_FLAG;
+  }
+
+  // Direct static reference to global.
+  return X86II::MO_NO_FLAG;
+}
+
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+  // Darwin 10 has a __bzero entry point for this purpose.
+  if (getTargetTriple().isMacOSX() &&
+      !getTargetTriple().isMacOSXVersionLT(10, 6))
+    return "__bzero";
+
+  return nullptr;
+}
+
+bool X86Subtarget::hasSinCos() const {
+  return getTargetTriple().isMacOSX() &&
+    !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+    is64Bit();
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+  // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
+  // but WinCOFFObjectWriter::RecordRelocation cannot emit them.  Once it does,
+  // the following check for Win32 should be removed.
+  if (In64BitMode || isTargetWin32())
+    return false;
+  return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+  std::string CPU =
+      !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
+  std::string FS =
+      !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+  if (!FS.empty()) {
+    initializeEnvironment();
+    resetSubtargetFeatures(CPU, FS);
+  }
+}
+
+void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  // Make sure 64-bit features are available in 64-bit mode. (But make sure
+  // SSE2 can be turned off explicitly.)
+  std::string FullFS = FS;
+  if (In64BitMode) {
+    if (!FullFS.empty())
+      FullFS = "+64bit,+sse2," + FullFS;
+    else
+      FullFS = "+64bit,+sse2";
+  }
+
+  // If feature string is not empty, parse features string.
+  ParseSubtargetFeatures(CPUName, FullFS);
+
+  // Make sure the right MCSchedModel is used.
+  InitCPUSchedModel(CPUName);
+
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // It's important to keep the MCSubtargetInfo feature bits in sync with
+  // target data structure which is shared with MC code emitter, etc.
+  if (In64BitMode)
+    ToggleFeature(X86::Mode64Bit);
+  else if (In32BitMode)
+    ToggleFeature(X86::Mode32Bit);
+  else if (In16BitMode)
+    ToggleFeature(X86::Mode16Bit);
+  else
+    llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
+
+  DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+               << ", 3DNowLevel " << X863DNowLevel
+               << ", 64bit " << HasX86_64 << "\n");
+  assert((!In64BitMode || HasX86_64) &&
+         "64-bit code requested on a subtarget that doesn't support it!");
+
+  // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
+  // 32 and 64 bit) and for all 64-bit targets.
+  if (StackAlignOverride)
+    stackAlignment = StackAlignOverride;
+  else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+           In64BitMode)
+    stackAlignment = 16;
+}
+
+void X86Subtarget::initializeEnvironment() {
+  X86SSELevel = NoMMXSSE;
+  X863DNowLevel = NoThreeDNow;
+  HasCMov = false;
+  HasX86_64 = false;
+  HasPOPCNT = false;
+  HasSSE4A = false;
+  HasAES = false;
+  HasPCLMUL = false;
+  HasFMA = false;
+  HasFMA4 = false;
+  HasXOP = false;
+  HasTBM = false;
+  HasMOVBE = false;
+  HasRDRAND = false;
+  HasF16C = false;
+  HasFSGSBase = false;
+  HasLZCNT = false;
+  HasBMI = false;
+  HasBMI2 = false;
+  HasRTM = false;
+  HasHLE = false;
+  HasERI = false;
+  HasCDI = false;
+  HasPFI = false;
+  HasDQI = false;
+  HasBWI = false;
+  HasVLX = false;
+  HasADX = false;
+  HasSHA = false;
+  HasPRFCHW = false;
+  HasRDSEED = false;
+  IsBTMemSlow = false;
+  IsSHLDSlow = false;
+  IsUAMemFast = false;
+  HasVectorUAMem = false;
+  HasCmpxchg16b = false;
+  UseLeaForSP = false;
+  HasSlowDivide = false;
+  PadShortFunctions = false;
+  CallRegIndirect = false;
+  LEAUsesAG = false;
+  SlowLEA = false;
+  SlowIncDec = false;
+  stackAlignment = 4;
+  // FIXME: this is a known good value for Yonah. How about others?
+  MaxInlineSizeThreshold = 128;
+}
+
+static std::string computeDataLayout(const X86Subtarget &ST) {
+  // X86 is little endian
+  std::string Ret = "e";
+
+  Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+  // X86 and x32 have 32 bit pointers.
+  if (ST.isTarget64BitILP32() || !ST.is64Bit())
+    Ret += "-p:32:32";
+
+  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+  if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl())
+    Ret += "-i64:64";
+  else
+    Ret += "-f64:32:64";
+
+  // Some ABIs align long double to 128 bits, others to 32.
+  if (ST.isTargetNaCl())
+    ; // No f80
+  else if (ST.is64Bit() || ST.isTargetDarwin())
+    Ret += "-f80:128";
+  else
+    Ret += "-f80:32";
+
+  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+  if (ST.is64Bit())
+    Ret += "-n8:16:32:64";
+  else
+    Ret += "-n8:16:32";
+
+  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+  if (!ST.is64Bit() && ST.isOSWindows())  
+    Ret += "-S32";
+  else
+    Ret += "-S128";
+
+  return Ret;
+}
+
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                            StringRef FS) {
+  initializeEnvironment();
+  resetSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, X86TargetMachine &TM,
+                           unsigned StackAlignOverride)
+    : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+      PICStyle(PICStyles::None), TargetTriple(TT),
+      StackAlignOverride(StackAlignOverride),
+      In64BitMode(TargetTriple.getArch() == Triple::x86_64),
+      In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+                  TargetTriple.getEnvironment() != Triple::CODE16),
+      In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+                  TargetTriple.getEnvironment() == Triple::CODE16),
+      DL(computeDataLayout(*this)), TSInfo(DL),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+      FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(),
+                    is64Bit() ? -8 : -4),
+      JITInfo(hasSSE1()) {}
+
+bool X86Subtarget::enableEarlyIfConversion() const {
+  return hasCMov() && X86EarlyIfConv;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..5f5df5e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,478 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetMachine;
+
+/// PICStyles - The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+  StubPIC,          // Used on i386-darwin in -fPIC mode.
+  StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+  GOT,              // Used on many 32-bit unices in -fPIC mode.
+  RIPRel,           // Used on X86-64 when not in -static mode.
+  None              // Set when in -static mode (not PIC or DynamicNoPIC mode).
+};
+}
+
+class X86Subtarget final : public X86GenSubtargetInfo {
+
+protected:
+  enum X86SSEEnum {
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, ThreeDNow, ThreeDNowA
+  };
+
+  enum X86ProcFamilyEnum {
+    Others, IntelAtom, IntelSLM
+  };
+
+  /// X86ProcFamily - X86 processor family: Intel Atom, and others
+  X86ProcFamilyEnum X86ProcFamily;
+
+  /// PICStyle - Which PIC style to use
+  ///
+  PICStyles::Style PICStyle;
+
+  /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+  /// none supported.
+  X86SSEEnum X86SSELevel;
+
+  /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+  ///
+  X863DNowEnum X863DNowLevel;
+
+  /// HasCMov - True if this processor has conditional move instructions
+  /// (generally pentium pro+).
+  bool HasCMov;
+
+  /// HasX86_64 - True if the processor supports X86-64 instructions.
+  ///
+  bool HasX86_64;
+
+  /// HasPOPCNT - True if the processor supports POPCNT.
+  bool HasPOPCNT;
+
+  /// HasSSE4A - True if the processor supports SSE4A instructions.
+  bool HasSSE4A;
+
+  /// HasAES - Target has AES instructions
+  bool HasAES;
+
+  /// HasPCLMUL - Target has carry-less multiplication
+  bool HasPCLMUL;
+
+  /// HasFMA - Target has 3-operand fused multiply-add
+  bool HasFMA;
+
+  /// HasFMA4 - Target has 4-operand fused multiply-add
+  bool HasFMA4;
+
+  /// HasXOP - Target has XOP instructions
+  bool HasXOP;
+
+  /// HasTBM - Target has TBM instructions.
+  bool HasTBM;
+
+  /// HasMOVBE - True if the processor has the MOVBE instruction.
+  bool HasMOVBE;
+
+  /// HasRDRAND - True if the processor has the RDRAND instruction.
+  bool HasRDRAND;
+
+  /// HasF16C - Processor has 16-bit floating point conversion instructions.
+  bool HasF16C;
+
+  /// HasFSGSBase - Processor has FS/GS base insturctions.
+  bool HasFSGSBase;
+
+  /// HasLZCNT - Processor has LZCNT instruction.
+  bool HasLZCNT;
+
+  /// HasBMI - Processor has BMI1 instructions.
+  bool HasBMI;
+
+  /// HasBMI2 - Processor has BMI2 instructions.
+  bool HasBMI2;
+
+  /// HasRTM - Processor has RTM instructions.
+  bool HasRTM;
+
+  /// HasHLE - Processor has HLE.
+  bool HasHLE;
+
+  /// HasADX - Processor has ADX instructions.
+  bool HasADX;
+
+  /// HasSHA - Processor has SHA instructions.
+  bool HasSHA;
+
+  /// HasPRFCHW - Processor has PRFCHW instructions.
+  bool HasPRFCHW;
+
+  /// HasRDSEED - Processor has RDSEED instructions.
+  bool HasRDSEED;
+
+  /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+  bool IsBTMemSlow;
+
+  /// IsSHLDSlow - True if SHLD instructions are slow.
+  bool IsSHLDSlow;
+
+  /// IsUAMemFast - True if unaligned memory access is fast.
+  bool IsUAMemFast;
+
+  /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+  /// operands. This may require setting a feature bit in the processor.
+  bool HasVectorUAMem;
+
+  /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+  /// this is true for most x86-64 chips, but not the first AMD chips.
+  bool HasCmpxchg16b;
+
+  /// UseLeaForSP - True if the LEA instruction should be used for adjusting
+  /// the stack pointer. This is an optimization for Intel Atom processors.
+  bool UseLeaForSP;
+
+  /// HasSlowDivide - True if smaller divides are significantly faster than
+  /// full divides and should be used when possible.
+  bool HasSlowDivide;
+
+  /// PadShortFunctions - True if the short functions should be padded to prevent
+  /// a stall when returning too early.
+  bool PadShortFunctions;
+
+  /// CallRegIndirect - True if the Calls with memory reference should be converted
+  /// to a register-based indirect call.
+  bool CallRegIndirect;
+  /// LEAUsesAG - True if the LEA instruction inputs have to be ready at
+  ///             address generation (AG) time.
+  bool LEAUsesAG;
+
+  /// SlowLEA - True if the LEA instruction with certain arguments is slow
+  bool SlowLEA;
+
+  /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
+  bool SlowIncDec;
+
+  /// Processor has AVX-512 PreFetch Instructions
+  bool HasPFI;
+
+  /// Processor has AVX-512 Exponential and Reciprocal Instructions
+  bool HasERI;
+
+  /// Processor has AVX-512 Conflict Detection Instructions
+  bool HasCDI;
+
+  /// Processor has AVX-512 Doubleword and Quadword instructions
+  bool HasDQI;
+
+  /// Processor has AVX-512 Byte and Word instructions
+  bool HasBWI;
+
+  /// Processor has AVX-512 Vector Length eXtenstions
+  bool HasVLX;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+  ///
+  unsigned MaxInlineSizeThreshold;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// Instruction itineraries for scheduling
+  InstrItineraryData InstrItins;
+
+private:
+  /// StackAlignOverride - Override the stack alignment.
+  unsigned StackAlignOverride;
+
+  /// In64BitMode - True if compiling for 64-bit, false for 16-bit or 32-bit.
+  bool In64BitMode;
+
+  /// In32BitMode - True if compiling for 32-bit, false for 16-bit or 64-bit.
+  bool In32BitMode;
+
+  /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit.
+  bool In16BitMode;
+
+  // Calculates type size & alignment
+  const DataLayout DL;
+  X86SelectionDAGInfo TSInfo;
+  // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
+  // X86TargetLowering needs.
+  X86InstrInfo InstrInfo;
+  X86TargetLowering TLInfo;
+  X86FrameLowering FrameLowering;
+  X86JITInfo JITInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  X86Subtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS, X86TargetMachine &TM,
+               unsigned StackAlignOverride);
+
+  const X86TargetLowering *getTargetLowering() const { return &TLInfo; }
+  const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const DataLayout *getDataLayout() const { return &DL; }
+  const X86FrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  X86JITInfo *getJITInfo() { return &JITInfo; }
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// \brief Reset the features for the X86 target.
+  void resetSubtargetFeatures(const MachineFunction *MF) override;
+private:
+  /// \brief Initialize the full set of dependencies so we can use an initializer
+  /// list for X86Subtarget.
+  X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+  void initializeEnvironment();
+  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+  /// Is this x86_64? (disregarding specific ABI / programming model)
+  bool is64Bit() const {
+    return In64BitMode;
+  }
+
+  bool is32Bit() const {
+    return In32BitMode;
+  }
+
+  bool is16Bit() const {
+    return In16BitMode;
+  }
+
+  /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+  bool isTarget64BitILP32() const {
+    return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
+                           TargetTriple.getOS() == Triple::NaCl);
+  }
+
+  /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+  bool isTarget64BitLP64() const {
+    return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32);
+  }
+
+  PICStyles::Style getPICStyle() const { return PICStyle; }
+  void setPICStyle(PICStyles::Style Style)  { PICStyle = Style; }
+
+  bool hasCMov() const { return HasCMov; }
+  bool hasMMX() const { return X86SSELevel >= MMX; }
+  bool hasSSE1() const { return X86SSELevel >= SSE1; }
+  bool hasSSE2() const { return X86SSELevel >= SSE2; }
+  bool hasSSE3() const { return X86SSELevel >= SSE3; }
+  bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool hasSSE41() const { return X86SSELevel >= SSE41; }
+  bool hasSSE42() const { return X86SSELevel >= SSE42; }
+  bool hasAVX() const { return X86SSELevel >= AVX; }
+  bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasAVX512() const { return X86SSELevel >= AVX512F; }
+  bool hasFp256() const { return hasAVX(); }
+  bool hasInt256() const { return hasAVX2(); }
+  bool hasSSE4A() const { return HasSSE4A; }
+  bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+  bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+  bool hasPOPCNT() const { return HasPOPCNT; }
+  bool hasAES() const { return HasAES; }
+  bool hasPCLMUL() const { return HasPCLMUL; }
+  bool hasFMA() const { return HasFMA; }
+  // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+  bool hasFMA4() const { return HasFMA4 && !HasFMA; }
+  bool hasXOP() const { return HasXOP; }
+  bool hasTBM() const { return HasTBM; }
+  bool hasMOVBE() const { return HasMOVBE; }
+  bool hasRDRAND() const { return HasRDRAND; }
+  bool hasF16C() const { return HasF16C; }
+  bool hasFSGSBase() const { return HasFSGSBase; }
+  bool hasLZCNT() const { return HasLZCNT; }
+  bool hasBMI() const { return HasBMI; }
+  bool hasBMI2() const { return HasBMI2; }
+  bool hasRTM() const { return HasRTM; }
+  bool hasHLE() const { return HasHLE; }
+  bool hasADX() const { return HasADX; }
+  bool hasSHA() const { return HasSHA; }
+  bool hasPRFCHW() const { return HasPRFCHW; }
+  bool hasRDSEED() const { return HasRDSEED; }
+  bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool isSHLDSlow() const { return IsSHLDSlow; }
+  bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+  bool hasVectorUAMem() const { return HasVectorUAMem; }
+  bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+  bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasSlowDivide() const { return HasSlowDivide; }
+  bool padShortFunctions() const { return PadShortFunctions; }
+  bool callRegIndirect() const { return CallRegIndirect; }
+  bool LEAusesAG() const { return LEAUsesAG; }
+  bool slowLEA() const { return SlowLEA; }
+  bool slowIncDec() const { return SlowIncDec; }
+  bool hasCDI() const { return HasCDI; }
+  bool hasPFI() const { return HasPFI; }
+  bool hasERI() const { return HasERI; }
+  bool hasDQI() const { return HasDQI; }
+  bool hasBWI() const { return HasBWI; }
+  bool hasVLX() const { return HasVLX; }
+
+  bool isAtom() const { return X86ProcFamily == IntelAtom; }
+  bool isSLM() const { return X86ProcFamily == IntelSLM; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetFreeBSD() const {
+    return TargetTriple.getOS() == Triple::FreeBSD;
+  }
+  bool isTargetSolaris() const {
+    return TargetTriple.getOS() == Triple::Solaris;
+  }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+  bool isTargetMacho() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+  bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+
+  bool isTargetWindowsMSVC() const {
+    return TargetTriple.isWindowsMSVCEnvironment();
+  }
+
+  bool isTargetKnownWindowsMSVC() const {
+    return TargetTriple.isKnownWindowsMSVCEnvironment();
+  }
+
+  bool isTargetWindowsCygwin() const {
+    return TargetTriple.isWindowsCygwinEnvironment();
+  }
+
+  bool isTargetWindowsGNU() const {
+    return TargetTriple.isWindowsGNUEnvironment();
+  }
+
+  bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+
+  bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
+  bool isTargetWin64() const {
+    return In64BitMode && TargetTriple.isOSWindows();
+  }
+
+  bool isTargetWin32() const {
+    return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC());
+  }
+
+  bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+  bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+  bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+  bool isPICStyleStubPIC() const {
+    return PICStyle == PICStyles::StubPIC;
+  }
+
+  bool isPICStyleStubNoDynamic() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC;
+  }
+  bool isPICStyleStubAny() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC ||
+           PICStyle == PICStyles::StubPIC;
+  }
+
+  bool isCallingConvWin64(CallingConv::ID CC) const {
+    return (isTargetWin64() && CC != CallingConv::X86_64_SysV) ||
+           CC == CallingConv::X86_64_Win64;
+  }
+
+  /// ClassifyGlobalReference - Classify a global variable reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM)const;
+
+  /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyBlockAddressReference() const;
+
+  /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+  /// to immediate address.
+  bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered prefereable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
+
+  /// Enable the MachineScheduler pass for all X86 subtargets.
+  bool enableMachineScheduler() const override { return true; }
+
+  bool enableEarlyIfConversion() const override;
+
+  /// getInstrItins = Return the instruction itineraries based on the
+  /// subtarget selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  AntiDepBreakMode getAntiDepBreakMode() const override {
+    return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..f12140f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,186 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeX86Target() {
+  // Register the target.
+  RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
+  RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
+}
+
+void X86TargetMachine::anchor() { }
+
+/// X86TargetMachine ctor - Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                                   StringRef FS, const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
+  // Determine the PICStyle based on the target selected.
+  if (getRelocationModel() == Reloc::Static) {
+    // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+    Subtarget.setPICStyle(PICStyles::None);
+  } else if (Subtarget.is64Bit()) {
+    // PIC in 64 bit mode is always rip-rel.
+    Subtarget.setPICStyle(PICStyles::RIPRel);
+  } else if (Subtarget.isTargetCOFF()) {
+    Subtarget.setPICStyle(PICStyles::None);
+  } else if (Subtarget.isTargetDarwin()) {
+    if (getRelocationModel() == Reloc::PIC_)
+      Subtarget.setPICStyle(PICStyles::StubPIC);
+    else {
+      assert(getRelocationModel() == Reloc::DynamicNoPIC);
+      Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
+    }
+  } else if (Subtarget.isTargetELF()) {
+    Subtarget.setPICStyle(PICStyles::GOT);
+  }
+
+  // default to hard float ABI
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType = FloatABI::Hard;
+
+  // Windows stack unwinder gets confused when execution flow "falls through"
+  // after a call to 'noreturn' function.
+  // To prevent that, we emit a trap for 'unreachable' IR instructions.
+  // (which on X86, happens to be the 'ud2' instruction)
+  if (Subtarget.isTargetWin64())
+    this->Options.TrapUnreachable = true;
+
+  initAsmInfo();
+}
+
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// X86 Analysis Pass Setup
+//===----------------------------------------------------------------------===//
+
+void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our X86 pass. This
+  // allows the X86 pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createX86TargetTransformInfoPass(this));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+  X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  X86TargetMachine &getX86TargetMachine() const {
+    return getTM<X86TargetMachine>();
+  }
+
+  const X86Subtarget &getX86Subtarget() const {
+    return *getX86TargetMachine().getSubtargetImpl();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addILPOpts() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new X86PassConfig(this, PM);
+}
+
+void X86PassConfig::addIRPasses() {
+  addPass(createX86AtomicExpandPass(&getX86TargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool X86PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses.
+  if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+    addPass(createCleanupLocalDynamicTLSPass());
+
+  addPass(createX86GlobalBaseRegPass());
+
+  return false;
+}
+
+bool X86PassConfig::addILPOpts() {
+  addPass(&EarlyIfConverterID);
+  return true;
+}
+
+bool X86PassConfig::addPreRegAlloc() {
+  return false;  // -print-machineinstr shouldn't print after this.
+}
+
+bool X86PassConfig::addPostRegAlloc() {
+  addPass(createX86FloatingPointStackifierPass());
+  return true;  // -print-machineinstr should print after this.
+}
+
+bool X86PassConfig::addPreEmitPass() {
+  bool ShouldPrint = false;
+  if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
+    addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+    ShouldPrint = true;
+  }
+
+  if (UseVZeroUpper) {
+    addPass(createX86IssueVZeroUpperPass());
+    ShouldPrint = true;
+  }
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(createX86PadShortFunctions());
+    addPass(createX86FixupLEAs());
+    ShouldPrint = true;
+  }
+
+  return ShouldPrint;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      JITCodeEmitter &JCE) {
+  PM.add(createX86JITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..41d5157
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,70 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class StringRef;
+
+class X86TargetMachine final : public LLVMTargetMachine {
+  virtual void anchor();
+  X86Subtarget       Subtarget;
+
+public:
+  X86TargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+  const X86InstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
+  const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const X86TargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  const X86RegisterInfo  *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &getSubtargetImpl()->getInstrItineraryData();
+  }
+
+  /// \brief Register X86 analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
+
+  // Set up the pass pipeline.
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 0000000..f8bcd61
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,171 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+    const TargetMachine &TM, MachineModuleInfo *MMI,
+    MCStreamer &Streamer) const {
+
+  // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+  // is an indirect pc-relative reference.
+  if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
+    const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+    const MCExpr *Res =
+      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+    const MCExpr *Four = MCConstantExpr::Create(4, getContext());
+    return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+  }
+
+  return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+      GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV, Mang);
+}
+
+void
+X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCExpr *
+X86LinuxTargetObjectFile::getDebugThreadLocalSymbol(
+    const MCSymbol *Sym) const {
+  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
+
+const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
+    const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const {
+  // We are looking for the difference of two symbols, need a subtraction
+  // operation.
+  const SubOperator *Sub = dyn_cast<SubOperator>(CE);
+  if (!Sub)
+    return nullptr;
+
+  // Symbols must first be numbers before we can subtract them, we need to see a
+  // ptrtoint on both subtraction operands.
+  const PtrToIntOperator *SubLHS =
+      dyn_cast<PtrToIntOperator>(Sub->getOperand(0));
+  const PtrToIntOperator *SubRHS =
+      dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
+  if (!SubLHS || !SubRHS)
+    return nullptr;
+
+  // Our symbols should exist in address space zero, cowardly no-op if
+  // otherwise.
+  if (SubLHS->getPointerAddressSpace() != 0 ||
+      SubRHS->getPointerAddressSpace() != 0)
+    return nullptr;
+
+  // Both ptrtoint instructions must wrap global variables:
+  // - Only global variables are eligible for image relative relocations.
+  // - The subtrahend refers to the special symbol __ImageBase, a global.
+  const GlobalVariable *GVLHS =
+      dyn_cast<GlobalVariable>(SubLHS->getPointerOperand());
+  const GlobalVariable *GVRHS =
+      dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
+  if (!GVLHS || !GVRHS)
+    return nullptr;
+
+  // We expect __ImageBase to be a global variable without a section, externally
+  // defined.
+  //
+  // It should look something like this: @__ImageBase = external constant i8
+  if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" ||
+      !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() ||
+      GVRHS->hasSection())
+    return nullptr;
+
+  // An image-relative, thread-local, symbol makes no sense.
+  if (GVLHS->isThreadLocal())
+    return nullptr;
+
+  return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang),
+                                 MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                 getContext());
+}
+
+static std::string APIntToHexString(const APInt &AI) {
+  unsigned Width = (AI.getBitWidth() / 8) * 2;
+  std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+  unsigned Size = HexString.size();
+  assert(Width >= Size && "hex string is too large!");
+  HexString.insert(HexString.begin(), Width - Size, '0');
+
+  return HexString;
+}
+
+
+static std::string scalarConstantToHexString(const Constant *C) {
+  Type *Ty = C->getType();
+  APInt AI;
+  if (isa<UndefValue>(C)) {
+    AI = APInt(Ty->getPrimitiveSizeInBits(), /*val=*/0);
+  } else if (Ty->isFloatTy() || Ty->isDoubleTy()) {
+    const auto *CFP = cast<ConstantFP>(C);
+    AI = CFP->getValueAPF().bitcastToAPInt();
+  } else if (Ty->isIntegerTy()) {
+    const auto *CI = cast<ConstantInt>(C);
+    AI = CI->getValue();
+  } else {
+    llvm_unreachable("unexpected constant pool element type!");
+  }
+  return APIntToHexString(AI);
+}
+
+const MCSection *
+X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind,
+                                                  const Constant *C) const {
+  if (Kind.isReadOnly()) {
+    if (C) {
+      Type *Ty = C->getType();
+      SmallString<32> COMDATSymName;
+      if (Ty->isFloatTy() || Ty->isDoubleTy()) {
+        COMDATSymName = "__real@";
+        COMDATSymName += scalarConstantToHexString(C);
+      } else if (const auto *VTy = dyn_cast<VectorType>(Ty)) {
+        uint64_t NumBits = VTy->getBitWidth();
+        if (NumBits == 128 || NumBits == 256) {
+          COMDATSymName = NumBits == 128 ? "__xmm@" : "__ymm@";
+          for (int I = VTy->getNumElements() - 1, E = -1; I != E; --I)
+            COMDATSymName +=
+                scalarConstantToHexString(C->getAggregateElement(I));
+        }
+      }
+      if (!COMDATSymName.empty()) {
+        unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                   COFF::IMAGE_SCN_MEM_READ |
+                                   COFF::IMAGE_SCN_LNK_COMDAT;
+        return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+                                           COMDATSymName,
+                                           COFF::IMAGE_COMDAT_SELECT_ANY);
+      }
+    }
+  }
+
+  return TargetLoweringObjectFile::getSectionForConstant(Kind, C);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 0000000..4a10b7e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,58 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+  /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+  /// x86-64.
+  class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+  public:
+    const MCExpr *
+    getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+                            Mangler &Mang, const TargetMachine &TM,
+                            MachineModuleInfo *MMI,
+                            MCStreamer &Streamer) const override;
+
+    // getCFIPersonalitySymbol - The symbol that gets passed to
+    // .cfi_personality.
+    MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+                                      const TargetMachine &TM,
+                                      MachineModuleInfo *MMI) const override;
+  };
+
+  /// X86LinuxTargetObjectFile - This implementation is used for linux x86
+  /// and x86-64.
+  class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    /// \brief Describe a TLS variable address within debug info.
+    const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+  };
+
+  /// \brief This implementation is used for Windows targets on x86 and x86-64.
+  class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
+    const MCExpr *
+    getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang,
+                                const TargetMachine &TM) const override;
+
+    /// \brief Given a mergeable constant with the specified size and relocation
+    /// information, return a section that it should be placed in.
+    const MCSection *getSectionForConstant(SectionKind Kind,
+                                           const Constant *C) const override;
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 0000000..c961e2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,1064 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeX86TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class X86TTI final : public ImmutablePass, public TargetTransformInfo {
+  const X86Subtarget *ST;
+  const X86TargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  X86TTI(const X86TargetMachine *TM)
+    : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+      TLI(TM->getTargetLowering()) {
+    initializeX86TTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  void initializePass() override {
+    pushTTIStack(this);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const override;
+  unsigned getRegisterBitWidth(bool Vector) const override;
+  unsigned getMaximumUnrollFactor() const override;
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+                                  OperandValueKind) const override;
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index, Type *SubTp) const override;
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                            Type *Src) const override;
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy) const override;
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index) const override;
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+
+  unsigned getAddressComputationCost(Type *PtrTy,
+                                     bool IsComplex) const override;
+
+  unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                            bool IsPairwiseForm) const override;
+
+  unsigned getIntImmCost(int64_t) const;
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
+                   "X86 Target Transform Info", true, true, false)
+char X86TTI::ID = 0;
+
+ImmutablePass *
+llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
+  return new X86TTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  // TODO: Currently the __builtin_popcount() implementation using SSE3
+  //   instructions is inefficient. Once the problem is fixed, we should
+  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
+  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
+}
+
+unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
+  if (Vector && !ST->hasSSE1())
+    return 0;
+
+  if (ST->is64Bit()) {
+    if (Vector && ST->hasAVX512())
+      return 32;
+    return 16;
+  }
+  return 8;
+}
+
+unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
+  if (Vector) {
+    if (ST->hasAVX512()) return 512;
+    if (ST->hasAVX()) return 256;
+    if (ST->hasSSE1()) return 128;
+    return 0;
+  }
+
+  if (ST->is64Bit())
+    return 64;
+  return 32;
+
+}
+
+unsigned X86TTI::getMaximumUnrollFactor() const {
+  if (ST->isAtom())
+    return 1;
+
+  // Sandybridge and Haswell have multiple execution ports and pipelined
+  // vector units.
+  if (ST->hasAVX())
+    return 4;
+
+  return 2;
+}
+
+unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                        OperandValueKind Op1Info,
+                                        OperandValueKind Op2Info) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry<MVT::SimpleValueType>
+  AVX2UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX2()) {
+    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
+    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+    // customize them to detect the cases where shift amount is a scalar one.
+    { ISD::SHL,     MVT::v4i32,    1 },
+    { ISD::SRL,     MVT::v4i32,    1 },
+    { ISD::SRA,     MVT::v4i32,    1 },
+    { ISD::SHL,     MVT::v8i32,    1 },
+    { ISD::SRL,     MVT::v8i32,    1 },
+    { ISD::SRA,     MVT::v8i32,    1 },
+    { ISD::SHL,     MVT::v2i64,    1 },
+    { ISD::SRL,     MVT::v2i64,    1 },
+    { ISD::SHL,     MVT::v4i64,    1 },
+    { ISD::SRL,     MVT::v4i64,    1 },
+
+    { ISD::SHL,  MVT::v32i8,  42 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v16i16,  16*10 }, // Scalarized.
+
+    { ISD::SRL,  MVT::v32i8,  32*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v16i16,  8*10 }, // Scalarized.
+
+    { ISD::SRA,  MVT::v32i8,  32*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v16i16,  16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v4i64,  4*10 }, // Scalarized.
+
+    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+    { ISD::SDIV,  MVT::v32i8,  32*20 },
+    { ISD::SDIV,  MVT::v16i16, 16*20 },
+    { ISD::SDIV,  MVT::v8i32,  8*20 },
+    { ISD::SDIV,  MVT::v4i64,  4*20 },
+    { ISD::UDIV,  MVT::v32i8,  32*20 },
+    { ISD::UDIV,  MVT::v16i16, 16*20 },
+    { ISD::UDIV,  MVT::v8i32,  8*20 },
+    { ISD::UDIV,  MVT::v4i64,  4*20 },
+  };
+
+  // Look for AVX2 lowering tricks.
+  if (ST->hasAVX2()) {
+    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+      // On AVX2, a packed v16i16 shift left by a constant build_vector
+      // is lowered into a vector multiply (vpmullw).
+      return LT.first;
+
+    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2CostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT::SimpleValueType>
+  SSE2UniformConstCostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // Constant splats are cheaper for the following instructions.
+    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
+    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
+    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
+    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
+
+    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
+    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
+
+    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
+    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+
+    { ISD::SDIV, MVT::v8i16,  6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v8i16,  6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2()) {
+    // pmuldq sequence.
+    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 15;
+
+    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+  }
+
+  if (ISD == ISD::SHL &&
+      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+    EVT VT = LT.second;
+    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+        (VT == MVT::v4i32 && ST->hasSSE41()))
+      // Vector shift left by non uniform constant can be lowered
+      // into vector multiply (pmullw/pmulld).
+      return LT.first;
+    if (VT == MVT::v4i32 && ST->hasSSE2())
+      // A vector shift left by non uniform constant is converted
+      // into a vector multiply; the new multiply is eventually
+      // lowered into a sequence of shuffles and 2 x pmuludq.
+      ISD = ISD::MUL;
+  }
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // For some cases, where the shift amount is a scalar we would be able
+    // to generate better code. Unfortunately, when this is the case the value
+    // (the splat) will get hoisted out of the loop, thereby making it invisible
+    // to ISel. The cost model must return worst case assumptions because it is
+    // used for vectorization and we don't want to make vectorized code worse
+    // than scalar code.
+    { ISD::SHL,  MVT::v16i8,  30 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SHL,  MVT::v4i32,  2*5 }, // We optimized this using mul.
+    { ISD::SHL,  MVT::v2i64,  2*10 }, // Scalarized.
+    { ISD::SHL,  MVT::v4i64,  4*10 }, // Scalarized. 
+
+    { ISD::SRL,  MVT::v16i8,  16*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v4i32,  4*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    { ISD::SRA,  MVT::v16i8,  16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v4i32,  4*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    // It is not a good idea to vectorize division. We have to scalarize it and
+    // in the process we will often end up having to spilling regular
+    // registers. The overhead of division is going to dominate most kernels
+    // anyways so try hard to prevent vectorization of division - it is
+    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+    // to hide "20 cycles" for each lane.
+    { ISD::SDIV,  MVT::v16i8,  16*20 },
+    { ISD::SDIV,  MVT::v8i16,  8*20 },
+    { ISD::SDIV,  MVT::v4i32,  4*20 },
+    { ISD::SDIV,  MVT::v2i64,  2*20 },
+    { ISD::UDIV,  MVT::v16i8,  16*20 },
+    { ISD::UDIV,  MVT::v8i16,  8*20 },
+    { ISD::UDIV,  MVT::v4i32,  4*20 },
+    { ISD::UDIV,  MVT::v2i64,  2*20 },
+  };
+
+  if (ST->hasSSE2()) {
+    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2CostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations and we only need to extract the upper YMM half.
+    // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL,     MVT::v16i16,   4 },
+    { ISD::MUL,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v8i32,    4 },
+    { ISD::ADD,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v4i64,    4 },
+    { ISD::ADD,     MVT::v4i64,    4 },
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+    // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
+    // Because we believe v4i64 to be a legal type, we must also include the
+    // split factor of two in the cost table. Therefore, the cost here is 18
+    // instead of 9.
+    { ISD::MUL,     MVT::v4i64,    18 },
+  };
+
+  // Look for AVX1 lowering tricks.
+  if (ST->hasAVX() && !ST->hasAVX2()) {
+    EVT VT = LT.second;
+
+    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+    // sequence of extract + two vector multiply + insert.
+    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
+        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
+      ISD = ISD::MUL;
+
+    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
+    if (Idx != -1)
+      return LT.first * AVX1CostTable[Idx].Cost;
+  }
+
+  // Custom lowering of vectors.
+  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
+    // A v2i64/v4i64 and multiply is custom lowered as a series of long
+    // multiplies(3), shifts(4) and adds(2).
+    { ISD::MUL,     MVT::v2i64,    9 },
+    { ISD::MUL,     MVT::v4i64,    9 },
+  };
+  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
+  if (Idx != -1)
+    return LT.first * CustomLowered[Idx].Cost;
+
+  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+  // 2x pmuludq, 2x shuffle.
+  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+      !ST->hasSSE41())
+    return LT.first * 6;
+
+  // Fallback to the default implementation.
+  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+                                                     Op2Info);
+}
+
+unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
+
+    // Multiple by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    // The backend knows how to generate a single VEX.256 version of
+    // instruction VPBLENDW if the target supports AVX2.
+    if (ST->hasAVX2() && LT.second == MVT::v16i16)
+      return LT.first;
+
+    static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
+      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps
+
+      // This shuffle is custom lowered into a sequence of:
+      //  2x  vextractf128 , 2x vpblendw , 1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+      // This shuffle is custom lowered into a long sequence of:
+      //  2x vextractf128 , 4x vpshufb , 2x vpor ,  1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+    };
+
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * AVXAltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+      // These are lowered into movsd.
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+      // packed float vectors with four elements are lowered into BLENDI dag
+      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+      // This shuffle generates a single pshufw.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+      // There is no instruction that matches a v16i8 alternate shuffle.
+      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+    };
+
+    if (ST->hasSSE41()) {
+      int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * SSE41AltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+      // the sequence 'shufps + pshufd'
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}  // pshufb + pshufb + or
+    };
+ 
+    if (ST->hasSSSE3()) {
+      int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+ 
+      // This is expanded into a long sequence of four extract + four insert.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+
+      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+    };
+
+    // Fall-back (SSE3 and SSE2). 
+    int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx != -1)
+      return LT.first * SSEAltShuffleTbl[Idx].Cost;
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
+  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
+
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  SSE2ConvTbl[] = {
+    // These are somewhat magic numbers justified by looking at the output of
+    // Intel's IACA, running some kernels and making sure when we take
+    // legalization into account the throughput will be overestimated.
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+    // There are faster sequences for float conversions.
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+  };
+
+  if (ST->hasSSE2() && !ST->hasAVX()) {
+    int Idx =
+        ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
+    if (Idx != -1)
+      return LTSrc.first * SSE2ConvTbl[Idx].Cost;
+  }
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  // The function getSimpleVT only handles simple value types.
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVX2ConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
+
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
+    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
+    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
+    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
+  };
+
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
+
+    { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
+    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  4 },
+    { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
+    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
+    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  9 },
+
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
+
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
+    // The generic code to compute the scalar overhead is currently broken.
+    // Workaround this limitation by estimating the scalarization overhead
+    // here. We have roughly 10 instructions per scalar element.
+    // Multiply that by the vector width.
+    // FIXME: remove that when PR19268 is fixed.
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 4*10 },
+
+    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
+    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
+    // This node is expanded into scalarized operations but BasicTTI is overly
+    // optimistic estimating its cost.  It computes 3 per element (one
+    // vector-extract, one scalar conversion and one vector-insert).  The
+    // problem is that the inserts form a read-modify-write chain so latency
+    // should be factored in too.  Inflating the cost per element by 1.
+    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
+  };
+
+  if (ST->hasAVX2()) {
+    int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return AVX2ConversionTbl[Idx].Cost;
+  }
+
+  if (ST->hasAVX()) {
+    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
+                                     SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return AVXConversionTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   1 },
+    { ISD::SETCC,   MVT::v4f32,   1 },
+    { ISD::SETCC,   MVT::v2i64,   1 },
+    { ISD::SETCC,   MVT::v4i32,   1 },
+    { ISD::SETCC,   MVT::v8i16,   1 },
+    { ISD::SETCC,   MVT::v16i8,   1 },
+  };
+
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
+    { ISD::SETCC,   MVT::v4f64,   1 },
+    { ISD::SETCC,   MVT::v8f32,   1 },
+    // AVX1 does not support 8-wide integer compare.
+    { ISD::SETCC,   MVT::v4i64,   4 },
+    { ISD::SETCC,   MVT::v8i32,   4 },
+    { ISD::SETCC,   MVT::v16i16,  4 },
+    { ISD::SETCC,   MVT::v32i8,   4 },
+  };
+
+  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
+    { ISD::SETCC,   MVT::v4i64,   1 },
+    { ISD::SETCC,   MVT::v8i32,   1 },
+    { ISD::SETCC,   MVT::v16i16,  1 },
+    { ISD::SETCC,   MVT::v32i8,   1 },
+  };
+
+  if (ST->hasAVX2()) {
+    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX2CostTbl[Idx].Cost;
+  }
+
+  if (ST->hasAVX()) {
+    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX1CostTbl[Idx].Cost;
+  }
+
+  if (ST->hasSSE42()) {
+    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
+    if (Idx != -1)
+      return LT.first * SSE42CostTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                    unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // Floating point scalars are already located in index #0.
+    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+      return 0;
+  }
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
+                                            bool Extract) const {
+  assert (Ty->isVectorTy() && "Can only scalarize vectors");
+  unsigned Cost = 0;
+
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    if (Insert)
+      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (Extract)
+      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+
+  return Cost;
+}
+
+unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  // Handle non-power-of-two vectors such as <3 x float>
+  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+    unsigned NumElem = VTy->getVectorNumElements();
+
+    // Handle a few common cases:
+    // <3 x float>
+    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+      // Cost = 64 bit store + extract + 32 bit store.
+      return 3;
+
+    // <3 x double>
+    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+      // Cost = 128 bit store + unpack + 64 bit store.
+      return 3;
+
+    // Assume that all other non-power-of-two numbers are scalarized.
+    if (!isPowerOf2_32(NumElem)) {
+      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
+                                                           VTy->getScalarType(),
+                                                           Alignment,
+                                                           AddressSpace);
+      unsigned SplitCost = getScalarizationOverhead(Src,
+                                                    Opcode == Instruction::Load,
+                                                    Opcode==Instruction::Store);
+      return NumElem * Cost + SplitCost;
+    }
+  }
+
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  // Each load/store unit costs 1.
+  unsigned Cost = LT.first * 1;
+
+  // On Sandybridge 256bit load/stores are double pumped
+  // (but not on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
+    Cost*=2;
+
+  return Cost;
+}
+
+unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
+}
+
+unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
+                                  bool IsPairwise) const {
+  
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+  
+  MVT MTy = LT.second;
+  
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+ 
+  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 
+  // and make it as the cost. 
+ 
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v8i16,   5 },
+  };
+ 
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::FADD,  MVT::v4f64,   5 },
+    { ISD::FADD,  MVT::v8f32,   7 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
+    { ISD::ADD,   MVT::v8i16,   5 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
+    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
+  };
+  
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   3 },
+    { ISD::FADD,  MVT::v4f64,   3 },
+    { ISD::FADD,  MVT::v8f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
+    { ISD::ADD,   MVT::v4i64,   3 },
+    { ISD::ADD,   MVT::v8i16,   4 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+  
+  if (IsPairwise) {
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * AVX1CostTblPairWise[Idx].Cost;
+    }
+  
+    if (ST->hasSSE42()) {
+      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * SSE42CostTblPairWise[Idx].Cost;
+    }
+  } else {
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
+    }
+    
+    if (ST->hasSSE42()) {
+      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
+    }
+  }
+
+  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned X86TTI::getIntImmCost(int64_t Val) const {
+  if (Val == 0)
+    return TCC_Free;
+
+  if (isInt<32>(Val))
+    return TCC_Basic;
+
+  return 2 * TCC_Basic;
+}
+
+unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Never hoist constants larger than 128bit, because this might lead to
+  // incorrect code generation or assertions in codegen.
+  // Fixme: Create a cost model for types larger than i128 once the codegen
+  // issues have been fixed.
+  if (BitSize > 128)
+    return TCC_Free;
+
+  if (Imm == 0)
+    return TCC_Free;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialze the constant.
+  return std::max(1U, Cost);
+}
+
+unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                               Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default: return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+      ? static_cast<unsigned>(TCC_Free)
+      : Cost;
+  }
+
+  return X86TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                               const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  switch (IID) {
+  default: return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  }
+  return X86TTI::getIntImmCost(Imm, Ty);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000..0bb5f99
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,316 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when tranfering control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-vzeroupper"
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+
+  class VZeroUpperInserter : public MachineFunctionPass {
+  public:
+
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+    bool runOnMachineFunction(MachineFunction &MF) override;
+    const char *getPassName() const override {return "X86 vzeroupper inserter";}
+
+  private:
+
+    void processBasicBlock(MachineBasicBlock &MBB);
+    void insertVZeroUpper(MachineBasicBlock::iterator I,
+                          MachineBasicBlock &MBB);
+    void addDirtySuccessor(MachineBasicBlock &MBB);
+
+    typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
+    static const char* getBlockExitStateName(BlockExitState ST);
+
+    // Core algorithm state:
+    // BlockState - Each block is either:
+    //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
+    //                   vzeroupper instructions in this block.
+    //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
+    //                  block that will ensure that YMM is clean on exit.
+    //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
+    //                  subsequent vzeroupper in the block clears it.
+    //
+    // AddedToDirtySuccessors - This flag is raised when a block is added to the
+    //                          DirtySuccessors list to ensure that it's not
+    //                          added multiple times.
+    //
+    // FirstUnguardedCall - Records the location of the first unguarded call in
+    //                      each basic block that may need to be guarded by a
+    //                      vzeroupper. We won't know whether it actually needs
+    //                      to be guarded until we discover a predecessor that
+    //                      is DIRTY_OUT.
+    struct BlockState {
+      BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
+      BlockExitState ExitState;
+      bool AddedToDirtySuccessors;
+      MachineBasicBlock::iterator FirstUnguardedCall;
+    };
+    typedef SmallVector<BlockState, 8> BlockStateMap;
+    typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
+
+    BlockStateMap BlockStates;
+    DirtySuccessorsWorkList DirtySuccessors;
+    bool EverMadeChange;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+  };
+
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter();
+}
+
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+  switch (ST) {
+    case PASS_THROUGH: return "Pass-through";
+    case EXITS_DIRTY: return "Exits-dirty";
+    case EXITS_CLEAN: return "Exits-clean";
+  }
+  llvm_unreachable("Invalid block exit state.");
+}
+
+static bool isYmmReg(unsigned Reg) {
+  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+       E = MRI.livein_end(); I != E; ++I)
+    if (isYmmReg(I->first))
+      return true;
+
+  return false;
+}
+
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
+  return true;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+      return true;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDebug())
+      continue;
+    if (isYmmReg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
+/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
+/// instruction.
+static bool callClobbersAnyYmmReg(MachineInstr *MI) {
+  assert(MI->isCall() && "Can only be called on call instructions.");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isRegMask())
+      continue;
+    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+      if (MO.clobbersPhysReg(reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+// Insert a vzeroupper instruction before I.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+                                              MachineBasicBlock &MBB) {
+  DebugLoc dl = I->getDebugLoc();
+  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+  ++NumVZU;
+  EverMadeChange = true;
+}
+
+// Add MBB to the DirtySuccessors list if it hasn't already been added.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
+    DirtySuccessors.push_back(&MBB);
+    BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
+  }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzero upper instructions before function calls.
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
+
+  // Start by assuming that the block PASS_THROUGH, which implies no unguarded
+  // calls.
+  BlockExitState CurState = PASS_THROUGH;
+  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+    MachineInstr *MI = I;
+    bool isControlFlow = MI->isCall() || MI->isReturn();
+
+    // Shortcut: don't need to check regular instructions in dirty state.
+    if (!isControlFlow && CurState == EXITS_DIRTY)
+      continue;
+
+    if (hasYmmReg(MI)) {
+      // We found a ymm-using instruction; this could be an AVX instruction,
+      // or it could be control flow.
+      CurState = EXITS_DIRTY;
+      continue;
+    }
+
+    // Check for control-flow out of the current function (which might
+    // indirectly execute SSE instructions).
+    if (!isControlFlow)
+      continue;
+
+    // If the call won't clobber any YMM register, skip it as well. It usually
+    // happens on helper function calls (such as '_chkstk', '_ftol2') where
+    // standard calling convention is not used (RegMask is not used to mark
+    // register clobbered and register usage (def/imp-def/use) is well-dfined
+    // and explicitly specified.
+    if (MI->isCall() && !callClobbersAnyYmmReg(MI))
+      continue;
+
+    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+    // registers. This instruction has zero latency. In addition, the processor
+    // changes back to Clean state, after which execution of Intel SSE
+    // instructions or Intel AVX instructions has no transition penalty. Add
+    // the VZEROUPPER instruction before any function call/return that might
+    // execute SSE code.
+    // FIXME: In some cases, we may want to move the VZEROUPPER into a
+    // predecessor block.
+    if (CurState == EXITS_DIRTY) {
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM may appear before other subsequent calls or even before
+      // the end of the BB.
+      insertVZeroUpper(I, MBB);
+      CurState = EXITS_CLEAN;
+    } else if (CurState == PASS_THROUGH) {
+      // If this block is currently in pass-through state and we encounter a
+      // call then whether we need a vzeroupper or not depends on whether this
+      // block has successors that exit dirty. Record the location of the call,
+      // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
+      // It will be inserted later if necessary.
+      BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
+      CurState = EXITS_CLEAN;
+    }
+  }
+
+  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+               << getBlockExitStateName(CurState) << '\n');
+
+  if (CurState == EXITS_DIRTY)
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                          SE = MBB.succ_end();
+         SI != SE; ++SI)
+      addDirtySuccessor(**SI);
+
+  BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+  if (!ST.hasAVX() || ST.hasAVX512())
+    return false;
+  TII = MF.getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  EverMadeChange = false;
+
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = false;
+  const TargetRegisterClass *RC = &X86::VR256RegClass;
+  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+       i != e; i++) {
+    if (!MRI.reg_nodbg_empty(*i)) {
+      YMMUsed = true;
+      break;
+    }
+  }
+  if (!YMMUsed) {
+    return false;
+  }
+
+  assert(BlockStates.empty() && DirtySuccessors.empty() &&
+         "X86VZeroUpper state should be clear");
+  BlockStates.resize(MF.getNumBlockIDs());
+
+  // Process all blocks. This will compute block exit states, record the first
+  // unguarded call in each block, and add successors of dirty blocks to the
+  // DirtySuccessors list.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    processBasicBlock(*I);
+
+  // If any YMM regs are live in to this function, add the entry block to the
+  // DirtySuccessors list
+  if (checkFnHasLiveInYmm(MRI))
+    addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add
+  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+  // through PASS_THROUGH blocks.
+  while (!DirtySuccessors.empty()) {
+    MachineBasicBlock &MBB = *DirtySuccessors.back();
+    DirtySuccessors.pop_back();
+    BlockState &BBState = BlockStates[MBB.getNumber()];
+
+    // MBB is a successor of a dirty block, so its first call needs to be
+    // guarded.
+    if (BBState.FirstUnguardedCall != MBB.end())
+      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+    // If this successor was a pass-through block then it is now dirty, and its
+    // successors need to be added to the worklist (if they haven't been
+    // already).
+    if (BBState.ExitState == PASS_THROUGH) {
+      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+                   << " was Pass-through, is now Dirty-out.\n");
+      for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                            SE = MBB.succ_end();
+           SI != SE; ++SI)
+        addDirtySuccessor(**SI);
+    }
+  }
+
+  BlockStates.clear();
+  return EverMadeChange;
+}
diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
new file mode 100644
index 0000000..7fef796
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -0,0 +1,802 @@
+//===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file is part of the XCore Disassembler.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreRegisterInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// \brief A disassembler class for XCore.
+class XCoreDisassembler : public MCDisassembler {
+public:
+  XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {}
+
+  /// \brief See MCDisassembler.
+  virtual DecodeStatus getInstruction(MCInst &instr,
+                                      uint64_t &size,
+                                      const MemoryObject &region,
+                                      uint64_t address,
+                                      raw_ostream &vStream,
+                                      raw_ostream &cStream) const override;
+
+};
+}
+
+static bool readInstruction16(const MemoryObject &region,
+                              uint64_t address,
+                              uint64_t &size,
+                              uint16_t &insn) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 2 Bytes of data.
+  if (region.readBytes(address, 2, Bytes) == -1) {
+    size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 16-bit word in the stream.
+  insn = (Bytes[0] <<  0) | (Bytes[1] <<  8);
+  return true;
+}
+
+static bool readInstruction32(const MemoryObject &region,
+                              uint64_t address,
+                              uint64_t &size,
+                              uint32_t &insn) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 4 Bytes of data.
+  if (region.readBytes(address, 4, Bytes) == -1) {
+    size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 32-bit word in the stream.
+  insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
+         (Bytes[3] << 24);
+  return true;
+}
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const XCoreDisassembler *Dis = static_cast<const XCoreDisassembler*>(D);
+  const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
+  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+}
+
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
+
+static DecodeStatus Decode2RInstruction(MCInst &Inst,
+                                        unsigned Insn,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+static DecodeStatus Decode2RImmInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeR2RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeRUSInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst,
+                                             unsigned Insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst,
+                                                   unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeL2RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeLR2RInstruction(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus Decode3RInstruction(MCInst &Inst,
+                                        unsigned Insn,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+static DecodeStatus Decode3RImmInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus Decode2RUSInstruction(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeL3RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL6RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL5RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst,
+                                                     unsigned Insn,
+                                                     uint64_t Address,
+                                                     const void *Decoder);
+
+#include "XCoreGenDisassemblerTables.inc"
+
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder)
+{
+  if (RegNo > 11)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder)
+{
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  if (Val > 11)
+    return MCDisassembler::Fail;
+  static unsigned Values[] = {
+    32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32
+  };
+  Inst.addOperand(MCOperand::CreateImm(Values[Val]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(-(int64_t)Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) {
+  unsigned Combined = fieldFromInstruction(Insn, 6, 5);
+  if (Combined < 27)
+    return MCDisassembler::Fail;
+  if (fieldFromInstruction(Insn, 5, 1)) {
+    if (Combined == 31)
+      return MCDisassembler::Fail;
+    Combined += 5;
+  }
+  Combined -= 27;
+  unsigned Op1High = Combined % 3;
+  unsigned Op2High = Combined / 3;
+  Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2,
+                     unsigned &Op3) {
+  unsigned Combined = fieldFromInstruction(Insn, 6, 5);
+  if (Combined >= 27)
+    return MCDisassembler::Fail;
+
+  unsigned Op1High = Combined % 3;
+  unsigned Op2High = (Combined / 3) % 3;
+  unsigned Op3High = Combined / 9;
+  Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 4, 2);
+  Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op3 = (Op3High << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  // Try and decode as a 3R instruction.
+  unsigned Opcode = fieldFromInstruction(Insn, 11, 5);
+  switch (Opcode) {
+  case 0x0:
+    Inst.setOpcode(XCore::STW_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x1:
+    Inst.setOpcode(XCore::LDW_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x2:
+    Inst.setOpcode(XCore::ADD_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x3:
+    Inst.setOpcode(XCore::SUB_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x4:
+    Inst.setOpcode(XCore::SHL_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x5:
+    Inst.setOpcode(XCore::SHR_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x6:
+    Inst.setOpcode(XCore::EQ_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x7:
+    Inst.setOpcode(XCore::AND_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x8:
+    Inst.setOpcode(XCore::OR_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x9:
+    Inst.setOpcode(XCore::LDW_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x10:
+    Inst.setOpcode(XCore::LD16S_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x11:
+    Inst.setOpcode(XCore::LD8U_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x12:
+    Inst.setOpcode(XCore::ADD_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x13:
+    Inst.setOpcode(XCore::SUB_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x14:
+    Inst.setOpcode(XCore::SHL_2rus);
+    return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x15:
+    Inst.setOpcode(XCore::SHR_2rus);
+    return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x16:
+    Inst.setOpcode(XCore::EQ_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x17:
+    Inst.setOpcode(XCore::TSETR_3r);
+    return Decode3RImmInstruction(Inst, Insn, Address, Decoder);
+  case 0x18:
+    Inst.setOpcode(XCore::LSS_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x19:
+    Inst.setOpcode(XCore::LSU_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(Op1));
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op2, Op1);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(Op2));
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeBitpOperand(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeBitpOperand(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  // Try and decode as a L3R / L2RUS instruction.
+  unsigned Opcode = fieldFromInstruction(Insn, 16, 4) |
+                    fieldFromInstruction(Insn, 27, 5) << 4;
+  switch (Opcode) {
+  case 0x0c:
+    Inst.setOpcode(XCore::STW_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x1c:
+    Inst.setOpcode(XCore::XOR_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x2c:
+    Inst.setOpcode(XCore::ASHR_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x3c:
+    Inst.setOpcode(XCore::LDAWF_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x4c:
+    Inst.setOpcode(XCore::LDAWB_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x5c:
+    Inst.setOpcode(XCore::LDA16F_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x6c:
+    Inst.setOpcode(XCore::LDA16B_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x7c:
+    Inst.setOpcode(XCore::MUL_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x8c:
+    Inst.setOpcode(XCore::DIVS_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x9c:
+    Inst.setOpcode(XCore::DIVU_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x10c:
+    Inst.setOpcode(XCore::ST16_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x11c:
+    Inst.setOpcode(XCore::ST8_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x12c:
+    Inst.setOpcode(XCore::ASHR_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x12d:
+    Inst.setOpcode(XCore::OUTPW_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x12e:
+    Inst.setOpcode(XCore::INPW_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x13c:
+    Inst.setOpcode(XCore::LDAWF_l2rus);
+    return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x14c:
+    Inst.setOpcode(XCore::LDAWB_l2rus);
+    return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x15c:
+    Inst.setOpcode(XCore::CRC_l3r);
+    return DecodeL3RSrcDstInstruction(Inst, Insn, Address, Decoder);
+  case 0x18c:
+    Inst.setOpcode(XCore::REMS_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x19c:
+    Inst.setOpcode(XCore::REMU_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16),
+                                        Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16),
+                                        Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    Inst.addOperand(MCOperand::CreateImm(Op1));
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::CreateImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::CreateImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3, Op4, Op5, Op6;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S != MCDisassembler::Success)
+    return S;
+  S = Decode3OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5, Op6);
+  if (S != MCDisassembler::Success)
+    return S;
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op6, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  // Try and decode as a L6R instruction.
+  Inst.clear();
+  unsigned Opcode = fieldFromInstruction(Insn, 27, 5);
+  switch (Opcode) {
+  case 0x00:
+    Inst.setOpcode(XCore::LMUL_l6r);
+    return DecodeL6RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3, Op4, Op5;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S != MCDisassembler::Success)
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+  S = Decode2OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5);
+  if (S != MCDisassembler::Success)
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  unsigned Op4 = fieldFromInstruction(Insn, 16, 4);
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  }
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  unsigned Op4 = fieldFromInstruction(Insn, 16, 4);
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  }
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+MCDisassembler::DecodeStatus
+XCoreDisassembler::getInstruction(MCInst &instr,
+                                  uint64_t &Size,
+                                  const MemoryObject &Region,
+                                  uint64_t Address,
+                                  raw_ostream &vStream,
+                                  raw_ostream &cStream) const {
+  uint16_t insn16;
+
+  if (!readInstruction16(Region, Address, Size, insn16)) {
+    return Fail;
+  }
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus Result = decodeInstruction(DecoderTable16, instr, insn16,
+                                          Address, this, STI);
+  if (Result != Fail) {
+    Size = 2;
+    return Result;
+  }
+
+  uint32_t insn32;
+
+  if (!readInstruction32(Region, Address, Size, insn32)) {
+    return Fail;
+  }
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTable32, instr, insn32, Address, this, STI);
+  if (Result != Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return Fail;
+}
+
+namespace llvm {
+  extern Target TheXCoreTarget;
+}
+
+static MCDisassembler *createXCoreDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new XCoreDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeXCoreDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheXCoreTarget,
+                                         createXCoreDisassembler);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
new file mode 100644
index 0000000..215fe89
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -0,0 +1,87 @@
+//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an XCore MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstPrinter.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "XCoreGenAsmWriter.inc"
+
+void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                 StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void XCoreInstPrinter::
+printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) {
+  report_fatal_error("can't handle InlineJT");
+}
+
+void XCoreInstPrinter::
+printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) {
+  report_fatal_error("can't handle InlineJT32");
+}
+
+static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
+  int Offset = 0;
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
+    assert(SRE && CE && "Binary expression must be sym+const.");
+    Offset = CE->getValue();
+  } else {
+    SRE = dyn_cast<MCSymbolRefExpr>(Expr);
+    assert(SRE && "Unexpected MCExpr type.");
+  }
+  assert(SRE->getKind() == MCSymbolRefExpr::VK_None);
+
+  OS << SRE->getSymbol();
+
+  if (Offset) {
+    if (Offset > 0)
+      OS << '+';
+    OS << Offset;
+  }
+}
+
+void XCoreInstPrinter::
+printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  printExpr(Op.getExpr(), O);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
new file mode 100644
index 0000000..98e7c98
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -0,0 +1,44 @@
+//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the XCoreInstPrinter class,
+/// which is used to print XCore MCInst to a .s file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREINSTPRINTER_H
+#define XCOREINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class TargetMachine;
+
+class XCoreInstPrinter : public MCInstPrinter {
+public:
+  XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+private:
+  void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
+  void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
new file mode 100644
index 0000000..5665911
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -0,0 +1,35 @@
+//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+#include "llvm/ADT/StringRef.h"
+using namespace llvm;
+
+void XCoreMCAsmInfo::anchor() { }
+
+XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
+  SupportsDebugInformation = true;
+  Data16bitsDirective = "\t.short\t";
+  Data32bitsDirective = "\t.long\t";
+  Data64bitsDirective = nullptr;
+  ZeroDirective = "\t.space\t";
+  CommentString = "#";
+
+  AscizDirective = ".asciiz";
+
+  HiddenVisibilityAttr = MCSA_Invalid;
+  HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
+
+  // Debug
+  HasLEB128 = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  DwarfRegNumForCFI = true;
+}
+
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
new file mode 100644
index 0000000..da2689a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- XCoreMCAsmInfo.h - XCore asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETASMINFO_H
+#define XCORETARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+  class StringRef;
+  class Target;
+
+  class XCoreMCAsmInfo : public MCAsmInfoELF {
+    void anchor() override;
+  public:
+    explicit XCoreMCAsmInfo(StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
new file mode 100644
index 0000000..d54e94f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -0,0 +1,164 @@
+//===-- XCoreMCTargetDesc.cpp - XCore Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCTargetDesc.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetStreamer.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "XCoreGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+static MCInstrInfo *createXCoreMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitXCoreMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitXCoreMCRegisterInfo(X, XCore::LR);
+  return X;
+}
+
+static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitXCoreMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
+                                       StringRef TT) {
+  MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
+
+  // Initial state of the frame pointer is SP.
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, XCore::SP, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default) {
+    RM = Reloc::Static;
+  }
+  if (CM == CodeModel::Default) {
+    CM = CodeModel::Small;
+  }
+  if (CM != CodeModel::Small && CM != CodeModel::Large)
+    report_fatal_error("Target only supports CodeModel Small or Large");
+
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createXCoreMCInstPrinter(const Target &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI) {
+  return new XCoreInstPrinter(MAI, MII, MRI);
+}
+
+XCoreTargetStreamer::XCoreTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+XCoreTargetStreamer::~XCoreTargetStreamer() {}
+
+namespace {
+
+class XCoreTargetAsmStreamer : public XCoreTargetStreamer {
+  formatted_raw_ostream &OS;
+public:
+  XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  virtual void emitCCTopData(StringRef Name) override;
+  virtual void emitCCTopFunction(StringRef Name) override;
+  virtual void emitCCBottomData(StringRef Name) override;
+  virtual void emitCCBottomFunction(StringRef Name) override;
+};
+
+XCoreTargetAsmStreamer::XCoreTargetAsmStreamer(MCStreamer &S,
+                                               formatted_raw_ostream &OS)
+    : XCoreTargetStreamer(S), OS(OS) {}
+
+void XCoreTargetAsmStreamer::emitCCTopData(StringRef Name) {
+  OS << "\t.cc_top " << Name << ".data," << Name << '\n';
+}
+
+void XCoreTargetAsmStreamer::emitCCTopFunction(StringRef Name) {
+  OS << "\t.cc_top " << Name << ".function," << Name << '\n';
+}
+
+void XCoreTargetAsmStreamer::emitCCBottomData(StringRef Name) {
+  OS << "\t.cc_bottom " << Name << ".data\n";
+}
+
+void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
+  OS << "\t.cc_bottom " << Name << ".function\n";
+}
+}
+
+static MCStreamer *
+createXCoreMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                         bool isVerboseAsm, bool useDwarfDirectory,
+                         MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                         MCAsmBackend *TAB, bool ShowInst) {
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+  new XCoreTargetAsmStreamer(*S, OS);
+  return S;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheXCoreTarget, createXCoreMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheXCoreTarget,
+                                        createXCoreMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheXCoreTarget, createXCoreMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget,
+                                          createXCoreMCSubtargetInfo);
+
+  // Register the MCInstPrinter
+  TargetRegistry::RegisterMCInstPrinter(TheXCoreTarget,
+                                        createXCoreMCInstPrinter);
+
+  TargetRegistry::RegisterAsmStreamer(TheXCoreTarget, createXCoreMCAsmStreamer);
+}
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
new file mode 100644
index 0000000..a255adb
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -0,0 +1,38 @@
+//===-- XCoreMCTargetDesc.h - XCore Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMCTARGETDESC_H
+#define XCOREMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheXCoreTarget;
+
+} // End llvm namespace
+
+// Defines symbolic names for XCore registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "XCoreGenRegisterInfo.inc"
+
+// Defines symbolic names for the XCore instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "XCoreGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
new file mode 100644
index 0000000..00e34e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheXCoreTarget;
+
+extern "C" void LLVMInitializeXCoreTargetInfo() { 
+  RegisterTarget<Triple::xcore> X(TheXCoreTarget, "xcore", "XCore");
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCore.h b/contrib/llvm/lib/Target/XCore/XCore.h
new file mode 100644
index 0000000..d707edc
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.h
@@ -0,0 +1,39 @@
+//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// XCore back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_XCORE_H
+#define TARGET_XCORE_H
+
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class ModulePass;
+  class TargetMachine;
+  class XCoreTargetMachine;
+  class formatted_raw_ostream;
+
+  void initializeXCoreLowerThreadLocalPass(PassRegistry &p);
+
+  FunctionPass *createXCoreFrameToArgsOffsetEliminationPass();
+  FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+  ModulePass *createXCoreLowerThreadLocalPass();
+
+  ImmutablePass *createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCore.td b/contrib/llvm/lib/Target/XCore/XCore.td
new file mode 100644
index 0000000..04a1dd5
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.td
@@ -0,0 +1,47 @@
+//===-- XCore.td - Describe the XCore Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Descriptions
+//===----------------------------------------------------------------------===//
+
+include "XCoreRegisterInfo.td"
+include "XCoreInstrInfo.td"
+include "XCoreCallingConv.td"
+
+def XCoreInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// XCore processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",      []>;
+def : Proc<"xs1b-generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def XCore : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = XCoreInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
new file mode 100644
index 0000000..e98d4f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -0,0 +1,307 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMCInstLower.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+  class XCoreAsmPrinter : public AsmPrinter {
+    const XCoreSubtarget &Subtarget;
+    XCoreMCInstLower MCInstLowering;
+    XCoreTargetStreamer &getTargetStreamer();
+
+  public:
+    explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()),
+        MCInstLowering(*this) {}
+
+    const char *getPassName() const override {
+      return "XCore Assembly Printer";
+    }
+
+    void printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const std::string &directive = ".jmptable");
+    void printInlineJT32(const MachineInstr *MI, int opNum, raw_ostream &O) {
+      printInlineJT(MI, opNum, O, ".jmptable32");
+    }
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) override;
+
+    void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
+    void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+    void EmitFunctionEntryLabel() override;
+    void EmitInstruction(const MachineInstr *MI) override;
+    void EmitFunctionBodyStart() override;
+    void EmitFunctionBodyEnd() override;
+  };
+} // end of anonymous namespace
+
+XCoreTargetStreamer &XCoreAsmPrinter::getTargetStreamer() {
+  return static_cast<XCoreTargetStreamer&>(*OutStreamer.getTargetStreamer());
+}
+
+void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
+  assert( ( GV->hasExternalLinkage() || GV->hasWeakLinkage() ||
+            GV->hasLinkOnceLinkage() || GV->hasCommonLinkage() ) &&
+          "Unexpected linkage");
+  if (ArrayType *ATy = dyn_cast<ArrayType>(
+                        cast<PointerType>(GV->getType())->getElementType())) {
+
+    MCSymbol *SymGlob = OutContext.GetOrCreateSymbol(
+                          Twine(Sym->getName() + StringRef(".globound")));
+    OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Global);
+    OutStreamer.EmitAssignment(SymGlob,
+                               MCConstantExpr::Create(ATy->getNumElements(),
+                                                      OutContext));
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+        GV->hasCommonLinkage()) {
+      // TODO Use COMDAT groups for LinkOnceLinkage
+      OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Weak);
+    }
+  }
+}
+
+void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  // Check to see if this is a special global used by LLVM, if so, emit it.
+  if (!GV->hasInitializer() ||
+      EmitSpecialLLVMGlobal(GV))
+    return;
+
+  const DataLayout *TD = TM.getDataLayout();
+  OutStreamer.SwitchSection(
+      getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
+
+  MCSymbol *GVSym = getSymbol(GV);
+  const Constant *C = GV->getInitializer();
+  unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
+  
+  // Mark the start of the global
+  getTargetStreamer().emitCCTopData(GVSym->getName());
+
+  switch (GV->getLinkage()) {
+  case GlobalValue::AppendingLinkage:
+    report_fatal_error("AppendingLinkage is not supported by this target!");
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:
+  case GlobalValue::ExternalLinkage:
+  case GlobalValue::CommonLinkage:
+    emitArrayBound(GVSym, GV);
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
+
+    // TODO Use COMDAT groups for LinkOnceLinkage
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+        GV->hasCommonLinkage())
+      OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
+    // FALL THROUGH
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+  default:
+    llvm_unreachable("Unknown linkage type!");
+  }
+
+  EmitAlignment(Align > 2 ? Align : 2, GV);
+  
+  if (GV->isThreadLocal()) {
+    report_fatal_error("TLS is not supported by this target!");
+  }
+  unsigned Size = TD->getTypeAllocSize(C->getType());
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
+    OutStreamer.EmitELFSize(GVSym, MCConstantExpr::Create(Size, OutContext));
+  }
+  OutStreamer.EmitLabel(GVSym);
+  
+  EmitGlobalConstant(C);
+  // The ABI requires that unsigned scalar types smaller than 32 bits
+  // are padded to 32 bits.
+  if (Size < 4)
+    OutStreamer.EmitZeros(4 - Size);
+  
+  // Mark the end of the global
+  getTargetStreamer().emitCCBottomData(GVSym->getName());
+}
+
+void XCoreAsmPrinter::EmitFunctionBodyStart() {
+  MCInstLowering.Initialize(Mang, &MF->getContext());
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void XCoreAsmPrinter::EmitFunctionBodyEnd() {
+  // Emit function end directives
+  getTargetStreamer().emitCCBottomFunction(CurrentFnSym->getName());
+}
+
+void XCoreAsmPrinter::EmitFunctionEntryLabel() {
+  // Mark the start of the function
+  getTargetStreamer().emitCCTopFunction(CurrentFnSym->getName());
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+void XCoreAsmPrinter::
+printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+              const std::string &directive) {
+  unsigned JTI = MI->getOperand(opNum).getIndex();
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  O << "\t" << directive << " ";
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    if (i > 0)
+      O << ",";
+    O << *MBB->getSymbol();
+  }
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const DataLayout *DL = TM.getDataLayout();
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << XCoreInstPrinter::getRegisterName(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+  case MachineOperand::MO_BlockAddress:
+    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+  default:
+    llvm_unreachable("not implemented");
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,const char *ExtraCode,
+                                      raw_ostream &O) {
+  // Print the operand if there is no operand modifier.
+  if (!ExtraCode || !ExtraCode[0]) {
+    printOperand(MI, OpNo, O);
+    return false;
+  }
+
+  // Otherwise fallback on the default implementation.
+  return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+}
+
+bool XCoreAsmPrinter::
+PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                      unsigned AsmVariant, const char *ExtraCode,
+                      raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    return true; // Unknown modifier.
+  }
+  printOperand(MI, OpNum, O);
+  O << '[';
+  printOperand(MI, OpNum + 1, O);
+  O << ']';
+  return false;
+}
+
+void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  switch (MI->getOpcode()) {
+  case XCore::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+  case XCore::ADD_2rus:
+    if (MI->getOperand(2).getImm() == 0) {
+      O << "\tmov "
+        << XCoreInstPrinter::getRegisterName(MI->getOperand(0).getReg()) << ", "
+        << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg());
+      OutStreamer.EmitRawText(O.str());
+      return;
+    }
+    break;
+  case XCore::BR_JT:
+  case XCore::BR_JT32:
+    O << "\tbru "
+      << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg()) << '\n';
+    if (MI->getOpcode() == XCore::BR_JT)
+      printInlineJT(MI, 0, O);
+    else
+      printInlineJT32(MI, 0, O);
+    O << '\n';
+    OutStreamer.EmitRawText(O.str());
+    return;
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+
+  EmitToStreamer(OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreAsmPrinter() { 
+  RegisterAsmPrinter<XCoreAsmPrinter> X(TheXCoreTarget);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
new file mode 100644
index 0000000..e149e6d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
@@ -0,0 +1,40 @@
+//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for XCore architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XCore Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_XCore : CallingConv<[
+  // i32 are returned in registers R0, R1, R2, R3
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// XCore Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_XCore : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R11.
+  CCIfNest<CCAssignToReg<[R11]>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
new file mode 100644
index 0000000..e694736
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -0,0 +1,582 @@
+//===-- XCoreFrameLowering.cpp - Frame info for XCore Target --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreFrameLowering.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>    // std::sort
+
+using namespace llvm;
+
+static const unsigned FramePtr = XCore::R10;
+static const int MaxImmU16 = (1<<16) - 1;
+
+// helper functions. FIXME: Eliminate.
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+// Helper structure with compare function for handling stack slots.
+namespace {
+struct StackSlotInfo {
+  int FI;
+  int Offset;
+  unsigned Reg;
+  StackSlotInfo(int f, int o, int r) : FI(f), Offset(o), Reg(r){};
+};
+}  // end anonymous namespace
+
+static bool CompareSSIOffset(const StackSlotInfo& a, const StackSlotInfo& b) {
+  return a.Offset < b.Offset;
+}
+
+
+static void EmitDefCfaRegister(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI, DebugLoc dl,
+                               const TargetInstrInfo &TII,
+                               MachineModuleInfo *MMI, unsigned DRegNum) {
+  unsigned CFIIndex = MMI->addFrameInst(
+      MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+static void EmitDefCfaOffset(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI, DebugLoc dl,
+                             const TargetInstrInfo &TII,
+                             MachineModuleInfo *MMI, int Offset) {
+  unsigned CFIIndex =
+      MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+static void EmitCfiOffset(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, DebugLoc dl,
+                          const TargetInstrInfo &TII, MachineModuleInfo *MMI,
+                          unsigned DRegNum, int Offset) {
+  unsigned CFIIndex = MMI->addFrameInst(
+      MCCFIInstruction::createOffset(nullptr, DRegNum, Offset));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the
+/// frame. During these steps, it may be necessary to spill registers.
+/// IfNeededExtSP emits the necessary EXTSP instructions to move the SP only
+/// as far as to make 'OffsetFromBottom' reachable using an STWSP_lru6.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] Adjusted the current SP offset from the top of the frame.
+static void IfNeededExtSP(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, DebugLoc dl,
+                          const TargetInstrInfo &TII, MachineModuleInfo *MMI,
+                          int OffsetFromTop, int &Adjusted, int FrameSize,
+                          bool emitFrameMoves) {
+  while (OffsetFromTop > Adjusted) {
+    assert(Adjusted < FrameSize && "OffsetFromTop is beyond FrameSize");
+    int remaining = FrameSize - Adjusted;
+    int OpImm = (remaining > MaxImmU16) ? MaxImmU16 : remaining;
+    int Opcode = isImmU6(OpImm) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(OpImm);
+    Adjusted += OpImm;
+    if (emitFrameMoves)
+      EmitDefCfaOffset(MBB, MBBI, dl, TII, MMI, Adjusted*4);
+  }
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the top of the
+/// frame. During these steps, it may be necessary to re-load registers.
+/// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only
+/// as far as to make 'OffsetFromTop' reachable using an LDAWSP_lru6.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] RemainingAdj the current SP offset from the top of the
+/// frame.
+static void IfNeededLDAWSP(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, DebugLoc dl,
+                           const TargetInstrInfo &TII, int OffsetFromTop,
+                           int &RemainingAdj) {
+  while (OffsetFromTop < RemainingAdj - MaxImmU16) {
+    assert(RemainingAdj && "OffsetFromTop is beyond FrameSize");
+    int OpImm = (RemainingAdj > MaxImmU16) ? MaxImmU16 : RemainingAdj;
+    int Opcode = isImmU6(OpImm) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(OpImm);
+    RemainingAdj -= OpImm;
+  }
+}
+
+/// Creates an ordered list of registers that are spilled
+/// during the emitPrologue/emitEpilogue.
+/// Registers are ordered according to their frame offset.
+/// As offsets are negative, the largest offsets will be first.
+static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+                         MachineFrameInfo *MFI, XCoreFunctionInfo *XFI,
+                         bool fetchLR, bool fetchFP) {
+  if (fetchLR) {
+    int Offset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+    SpillList.push_back(StackSlotInfo(XFI->getLRSpillSlot(),
+                                      Offset,
+                                      XCore::LR));
+  }
+  if (fetchFP) {
+    int Offset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+    SpillList.push_back(StackSlotInfo(XFI->getFPSpillSlot(),
+                                      Offset,
+                                      FramePtr));
+  }
+  std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+/// Creates an ordered list of EH info register 'spills'.
+/// These slots are only used by the unwinder and calls to llvm.eh.return().
+/// Registers are ordered according to their frame offset.
+/// As offsets are negative, the largest offsets will be first.
+static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+                           MachineFrameInfo *MFI, XCoreFunctionInfo *XFI,
+                           const TargetLowering *TL) {
+  assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots");
+  const int* EHSlot = XFI->getEHSpillSlot();
+  SpillList.push_back(StackSlotInfo(EHSlot[0],
+                                    MFI->getObjectOffset(EHSlot[0]),
+                                    TL->getExceptionPointerRegister()));
+  SpillList.push_back(StackSlotInfo(EHSlot[0],
+                                    MFI->getObjectOffset(EHSlot[1]),
+                                    TL->getExceptionSelectorRegister()));
+  std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+
+static MachineMemOperand *
+getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) {
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
+                             flags, MFI.getObjectSize(FrameIndex),
+                             MFI.getObjectAlignment(FrameIndex));
+  return MMO;
+}
+
+
+/// Restore clobbered registers with their spill slot value.
+/// The SP will be adjusted at the same time, thus the SpillList must be ordered
+/// with the largest (negative) offsets first.
+static void
+RestoreSpillList(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                 DebugLoc dl, const TargetInstrInfo &TII, int &RemainingAdj,
+                 SmallVectorImpl<StackSlotInfo> &SpillList) {
+  for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+    assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+    assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+    int OffsetFromTop = - SpillList[i].Offset/4;
+    IfNeededLDAWSP(MBB, MBBI, dl, TII, OffsetFromTop, RemainingAdj);
+    int Offset = RemainingAdj - OffsetFromTop;
+    int Opcode = isImmU6(Offset) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode), SpillList[i].Reg)
+      .addImm(Offset)
+      .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+                                      MachineMemOperand::MOLoad));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// XCoreFrameLowering:
+//===----------------------------------------------------------------------===//
+
+XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
+  : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) {
+  // Do nothing
+}
+
+bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const {
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+         MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = &MF.getMMI();
+  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+  const XCoreInstrInfo &TII =
+    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+
+  if (MFI->getMaxAlignment() > getStackAlignment())
+    report_fatal_error("emitPrologue unsupported alignment: "
+                       + Twine(MFI->getMaxAlignment()));
+
+  const AttributeSet &PAL = MF.getFunction()->getAttributes();
+  if (PAL.hasAttrSomewhere(Attribute::Nest))
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
+    // FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
+
+  // Work out frame sizes.
+  // We will adjust the SP in stages towards the final FrameSize.
+  assert(MFI->getStackSize()%4 == 0 && "Misaligned frame size");
+  const int FrameSize = MFI->getStackSize() / 4;
+  int Adjusted = 0;
+
+  bool saveLR = XFI->hasLRSpillSlot();
+  bool UseENTSP = saveLR && FrameSize
+                  && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
+  if (UseENTSP)
+    saveLR = false;
+  bool FP = hasFP(MF);
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
+
+  if (UseENTSP) {
+    // Allocate space on the stack at the same time as saving LR.
+    Adjusted = (FrameSize > MaxImmU16) ? MaxImmU16 : FrameSize;
+    int Opcode = isImmU6(Adjusted) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+    MBB.addLiveIn(XCore::LR);
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode));
+    MIB.addImm(Adjusted);
+    MIB->addRegisterKilled(XCore::LR, MF.getTarget().getRegisterInfo(), true);
+    if (emitFrameMoves) {
+      EmitDefCfaOffset(MBB, MBBI, dl, TII, MMI, Adjusted*4);
+      unsigned DRegNum = MRI->getDwarfRegNum(XCore::LR, true);
+      EmitCfiOffset(MBB, MBBI, dl, TII, MMI, DRegNum, 0);
+    }
+  }
+
+  // If necessary, save LR and FP to the stack, as we EXTSP.
+  SmallVector<StackSlotInfo,2> SpillList;
+  GetSpillList(SpillList, MFI, XFI, saveLR, FP);
+  // We want the nearest (negative) offsets first, so reverse list.
+  std::reverse(SpillList.begin(), SpillList.end());
+  for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+    assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+    assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+    int OffsetFromTop = - SpillList[i].Offset/4;
+    IfNeededExtSP(MBB, MBBI, dl, TII, MMI, OffsetFromTop, Adjusted, FrameSize,
+                  emitFrameMoves);
+    int Offset = Adjusted - OffsetFromTop;
+    int Opcode = isImmU6(Offset) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+    MBB.addLiveIn(SpillList[i].Reg);
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+      .addReg(SpillList[i].Reg, RegState::Kill)
+      .addImm(Offset)
+      .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+                                      MachineMemOperand::MOStore));
+    if (emitFrameMoves) {
+      unsigned DRegNum = MRI->getDwarfRegNum(SpillList[i].Reg, true);
+      EmitCfiOffset(MBB, MBBI, dl, TII, MMI, DRegNum, SpillList[i].Offset);
+    }
+  }
+
+  // Complete any remaining Stack adjustment.
+  IfNeededExtSP(MBB, MBBI, dl, TII, MMI, FrameSize, Adjusted, FrameSize,
+                emitFrameMoves);
+  assert(Adjusted==FrameSize && "IfNeededExtSP has not completed adjustment");
+
+  if (FP) {
+    // Set the FP from the SP.
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
+    if (emitFrameMoves)
+      EmitDefCfaRegister(MBB, MBBI, dl, TII, MMI,
+                         MRI->getDwarfRegNum(FramePtr, true));
+  }
+
+  if (emitFrameMoves) {
+    // Frame moves for callee saved.
+    auto SpillLabels = XFI->getSpillLabels();
+    for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
+      MachineBasicBlock::iterator Pos = SpillLabels[I].first;
+      ++Pos;
+      CalleeSavedInfo &CSI = SpillLabels[I].second;
+      int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
+      unsigned DRegNum = MRI->getDwarfRegNum(CSI.getReg(), true);
+      EmitCfiOffset(MBB, Pos, dl, TII, MMI, DRegNum, Offset);
+    }
+    if (XFI->hasEHSpillSlot()) {
+      // The unwinder requires stack slot & CFI offsets for the exception info.
+      // We do not save/spill these registers.
+      SmallVector<StackSlotInfo,2> SpillList;
+      GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
+      assert(SpillList.size()==2 && "Unexpected SpillList size");
+      EmitCfiOffset(MBB, MBBI, dl, TII, MMI,
+                    MRI->getDwarfRegNum(SpillList[0].Reg, true),
+                    SpillList[0].Offset);
+      EmitCfiOffset(MBB, MBBI, dl, TII, MMI,
+                    MRI->getDwarfRegNum(SpillList[1].Reg, true),
+                    SpillList[1].Offset);
+    }
+  }
+}
+
+void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const XCoreInstrInfo &TII =
+    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  DebugLoc dl = MBBI->getDebugLoc();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  // Work out frame sizes.
+  // We will adjust the SP in stages towards the final FrameSize.
+  int RemainingAdj = MFI->getStackSize();
+  assert(RemainingAdj%4 == 0 && "Misaligned frame size");
+  RemainingAdj /= 4;
+
+  if (RetOpcode == XCore::EH_RETURN) {
+    // 'Restore' the exception info the unwinder has placed into the stack
+    // slots.
+    SmallVector<StackSlotInfo,2> SpillList;
+    GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
+    RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+    // Return to the landing pad.
+    unsigned EhStackReg = MBBI->getOperand(0).getReg();
+    unsigned EhHandlerReg = MBBI->getOperand(1).getReg();
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg);
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg);
+    MBB.erase(MBBI);  // Erase the previous return instruction.
+    return;
+  }
+
+  bool restoreLR = XFI->hasLRSpillSlot();
+  bool UseRETSP = restoreLR && RemainingAdj
+                  && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
+  if (UseRETSP)
+    restoreLR = false;
+  bool FP = hasFP(MF);
+
+  if (FP) // Restore the stack pointer.
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(FramePtr);
+
+  // If necessary, restore LR and FP from the stack, as we EXTSP.
+  SmallVector<StackSlotInfo,2> SpillList;
+  GetSpillList(SpillList, MFI, XFI, restoreLR, FP);
+  RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+  if (RemainingAdj) {
+    // Complete all but one of the remaining Stack adjustments.
+    IfNeededLDAWSP(MBB, MBBI, dl, TII, 0, RemainingAdj);
+    if (UseRETSP) {
+      // Fold prologue into return instruction
+      assert(RetOpcode == XCore::RETSP_u6
+             || RetOpcode == XCore::RETSP_lu6);
+      int Opcode = isImmU6(RemainingAdj) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+                                  .addImm(RemainingAdj);
+      for (unsigned i = 3, e = MBBI->getNumOperands(); i < e; ++i)
+        MIB->addOperand(MBBI->getOperand(i)); // copy any variadic operands
+      MBB.erase(MBBI);  // Erase the previous return instruction.
+    } else {
+      int Opcode = isImmU6(RemainingAdj) ? XCore::LDAWSP_ru6 :
+                                           XCore::LDAWSP_lru6;
+      BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(RemainingAdj);
+      // Don't erase the return instruction.
+    }
+  } // else Don't erase the return instruction.
+}
+
+bool XCoreFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return true;
+
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
+
+  DebugLoc DL;
+  if (MI != MBB.end() && !MI->isDebugValue())
+    DL = MI->getDebugLoc();
+
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    unsigned Reg = it->getReg();
+    assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+           "LR & FP are always handled in emitPrologue");
+
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, it->getFrameIdx(), RC, TRI);
+    if (emitFrameMoves) {
+      auto Store = MI;
+      --Store;
+      XFI->getSpillLabels().push_back(std::make_pair(Store, *it));
+    }
+  }
+  return true;
+}
+
+bool XCoreFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const{
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  bool AtStart = MI == MBB.begin();
+  MachineBasicBlock::iterator BeforeI = MI;
+  if (!AtStart)
+    --BeforeI;
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    unsigned Reg = it->getReg();
+    assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+           "LR & FP are always handled in emitEpilogue");
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, it->getFrameIdx(), RC, TRI);
+    assert(MI != MBB.begin() &&
+           "loadRegFromStackSlot didn't insert any code!");
+    // Insert in reverse order.  loadRegFromStackSlot can insert multiple
+    // instructions.
+    if (AtStart)
+      MI = MBB.begin();
+    else {
+      MI = BeforeI;
+      ++MI;
+    }
+  }
+  return true;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void XCoreFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const XCoreInstrInfo &TII =
+    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
+    // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      assert(Amount%4 == 0);
+      Amount /= 4;
+
+      bool isU6 = isImmU6(Amount);
+      if (!isU6 && !isImmU16(Amount)) {
+        // FIX could emit multiple instructions in this case.
+#ifndef NDEBUG
+        errs() << "eliminateCallFramePseudoInstr size too big: "
+               << Amount << "\n";
+#endif
+        llvm_unreachable(nullptr);
+      }
+
+      MachineInstr *New;
+      if (Old->getOpcode() == XCore::ADJCALLSTACKDOWN) {
+        int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode))
+          .addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == XCore::ADJCALLSTACKUP);
+        int Opcode = isU6 ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode), XCore::SP)
+          .addImm(Amount);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+void XCoreFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+  bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
+
+  if (!LRUsed && !MF.getFunction()->isVarArg() &&
+      MF.getFrameInfo()->estimateStackSize(MF))
+    // If we need to extend the stack it is more efficient to use entsp / retsp.
+    // We force the LR to be saved so these instructions are used.
+    LRUsed = true;
+
+  if (MF.getMMI().callsUnwindInit() || MF.getMMI().callsEHReturn()) {
+    // The unwinder expects to find spill slots for the exception info regs R0
+    // & R1. These are used during llvm.eh.return() to 'restore' the exception
+    // info. N.B. we do not spill or restore R0, R1 during normal operation.
+    XFI->createEHSpillSlot(MF);
+    // As we will  have a stack, we force the LR to be saved.
+    LRUsed = true;
+  }
+
+  if (LRUsed) {
+    // We will handle the LR in the prologue/epilogue
+    // and allocate space on the stack ourselves.
+    MF.getRegInfo().setPhysRegUnused(XCore::LR);
+    XFI->createLRSpillSlot(MF);
+  }
+
+  if (hasFP(MF))
+    // A callee save register is used to hold the FP.
+    // This needs saving / restoring in the epilogue / prologue.
+    XFI->createFPSpillSlot(MF);
+}
+
+void XCoreFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                    RegScavenger *RS) const {
+  assert(RS && "requiresRegisterScavenging failed");
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  // Reserve slots close to SP or frame pointer for Scavenging spills.
+  // When using SP for small frames, we don't need any scratch registers.
+  // When using SP for large frames, we may need 2 scratch registers.
+  // When using FP, for large or small frames, we may need 1 scratch register.
+  if (XFI->isLargeFrame(MF) || hasFP(MF))
+    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+  if (XFI->isLargeFrame(MF) && !hasFP(MF))
+    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
new file mode 100644
index 0000000..e4f806a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
@@ -0,0 +1,62 @@
+//===-- XCoreFrameLowering.h - Frame info for XCore Target ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREFRAMEINFO_H
+#define XCOREFRAMEINFO_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class XCoreSubtarget;
+
+  class XCoreFrameLowering: public TargetFrameLowering {
+  public:
+    XCoreFrameLowering(const XCoreSubtarget &STI);
+
+    /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+    /// the function.
+    void emitPrologue(MachineFunction &MF) const override;
+    void emitEpilogue(MachineFunction &MF,
+                      MachineBasicBlock &MBB) const override;
+
+    bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+    bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+    void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+    bool hasFP(const MachineFunction &MF) const override;
+
+    void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+
+    void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+
+    //! Stack slot size (4 bytes)
+    static int stackSlotSize() {
+      return 4;
+    }
+  };
+}
+
+#endif // XCOREFRAMEINFO_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
new file mode 100644
index 0000000..30c7b59
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -0,0 +1,62 @@
+//===-- XCoreFrameToArgsOffsetElim.cpp ----------------------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace Pseudo FRAME_TO_ARGS_OFFSET with the appropriate real offset.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+  struct XCoreFTAOElim : public MachineFunctionPass {
+    static char ID;
+    XCoreFTAOElim() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "XCore FRAME_TO_ARGS_OFFSET Elimination";
+    }
+  };
+  char XCoreFTAOElim::ID = 0;
+}
+
+/// createXCoreFrameToArgsOffsetEliminationPass - returns an instance of the
+/// Frame to args offset elimination pass
+FunctionPass *llvm::createXCoreFrameToArgsOffsetEliminationPass() {
+  return new XCoreFTAOElim();
+}
+
+bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
+  const XCoreInstrInfo &TII =
+          *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  unsigned StackSize = MF.getFrameInfo()->getStackSize();
+  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (MachineBasicBlock::iterator MBBI = MBB.begin(), EE = MBB.end();
+         MBBI != EE; ++MBBI) {
+      if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) {
+        MachineInstr *OldInst = MBBI;
+        unsigned Reg = OldInst->getOperand(0).getReg();
+        MBBI = TII.loadImmediate(MBB, MBBI, Reg, StackSize);
+        OldInst->eraseFromParent();
+      }
+    }
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
new file mode 100644
index 0000000..86bc6f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -0,0 +1,274 @@
+//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+/// XCoreDAGToDAGISel - XCore specific code to select XCore machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class XCoreDAGToDAGISel : public SelectionDAGISel {
+    const XCoreSubtarget &Subtarget;
+
+  public:
+    XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    SDNode *Select(SDNode *N) override;
+    SDNode *SelectBRIND(SDNode *N);
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    inline bool immMskBitp(SDNode *inN) const {
+      ConstantSDNode *N = cast<ConstantSDNode>(inN);
+      uint32_t value = (uint32_t)N->getZExtValue();
+      if (!isMask_32(value)) {
+        return false;
+      }
+      int msksize = 32 - countLeadingZeros(value);
+      return (msksize >= 1 && msksize <= 8) ||
+              msksize == 16 || msksize == 24 || msksize == 32;
+    }
+
+    // Complex Pattern Selectors.
+    bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override;
+
+    const char *getPassName() const override {
+      return "XCore DAG->DAG Pattern Instruction Selection";
+    } 
+    
+    // Include the pieces autogenerated from the target description.
+  #include "XCoreGenDAGISel.inc"
+  };
+}  // end anonymous namespace
+
+/// createXCoreISelDag - This pass converts a legalized DAG into a 
+/// XCore-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel) {
+  return new XCoreDAGToDAGISel(TM, OptLevel);
+}
+
+bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  FrameIndexSDNode *FIN = nullptr;
+  if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = nullptr;
+    if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant positive word offset from frame index
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool XCoreDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Reg;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm': // Memory.
+    switch (Op.getOpcode()) {
+    default: return true;
+    case XCoreISD::CPRelativeWrapper:
+      Reg = CurDAG->getRegister(XCore::CP, MVT::i32);
+      break;
+    case XCoreISD::DPRelativeWrapper:
+      Reg = CurDAG->getRegister(XCore::DP, MVT::i32);
+      break;
+    }
+  }
+  OutOps.push_back(Reg);
+  OutOps.push_back(Op.getOperand(0));
+  return false;
+}
+
+SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue();
+    if (immMskBitp(N)) {
+      // Transformation function: get the size of a mask
+      // Look for the first non-zero bit
+      SDValue MskSize = getI32Imm(32 - countLeadingZeros((uint32_t)Val));
+      return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+                                    MVT::i32, MskSize);
+    }
+    else if (!isUInt<16>(Val)) {
+      SDValue CPIdx =
+        CurDAG->getTargetConstantPool(ConstantInt::get(
+                              Type::getInt32Ty(*CurDAG->getContext()), Val),
+                                      getTargetLowering()->getPointerTy());
+      SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+                                            MVT::Other, CPIdx,
+                                            CurDAG->getEntryNode());
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = MF->getMachineMemOperand(
+        MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4);      
+      cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
+      return node;
+    }
+    break;
+  }
+  case XCoreISD::LADD: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        N->getOperand(2) };
+    return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
+                                  Ops);
+  }
+  case XCoreISD::LSUB: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        N->getOperand(2) };
+    return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
+                                  Ops);
+  }
+  case XCoreISD::MACCU: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32,
+                                  Ops);
+  }
+  case XCoreISD::MACCS: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32,
+                                  Ops);
+  }
+  case XCoreISD::LMUL: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32,
+                                  Ops);
+  }
+  case XCoreISD::CRC8: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+    return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32,
+                                  Ops);
+  }
+  case ISD::BRIND:
+    if (SDNode *ResNode = SelectBRIND(N))
+      return ResNode;
+    break;
+  // Other cases are autogenerated.
+  }
+  return SelectCode(N);
+}
+
+/// Given a chain return a new chain where any appearance of Old is replaced
+/// by New. There must be at most one instruction between Old and Chain and
+/// this instruction must be a TokenFactor. Returns an empty SDValue if 
+/// these conditions don't hold.
+static SDValue
+replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
+{
+  if (Chain == Old)
+    return New;
+  if (Chain->getOpcode() != ISD::TokenFactor)
+    return SDValue();
+  SmallVector<SDValue, 8> Ops;
+  bool found = false;
+  for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i) {
+    if (Chain->getOperand(i) == Old) {
+      Ops.push_back(New);
+      found = true;
+    } else {
+      Ops.push_back(Chain->getOperand(i));
+    }
+  }
+  if (!found)
+    return SDValue();
+  return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Ops);
+}
+
+SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
+  SDLoc dl(N);
+  // (brind (int_xcore_checkevent (addr)))
+  SDValue Chain = N->getOperand(0);
+  SDValue Addr = N->getOperand(1);
+  if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return nullptr;
+  unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
+  if (IntNo != Intrinsic::xcore_checkevent)
+    return nullptr;
+  SDValue nextAddr = Addr->getOperand(2);
+  SDValue CheckEventChainOut(Addr.getNode(), 1);
+  if (!CheckEventChainOut.use_empty()) {
+    // If the chain out of the checkevent intrinsic is an operand of the
+    // indirect branch or used in a TokenFactor which is the operand of the
+    // indirect branch then build a new chain which uses the chain coming into
+    // the checkevent intrinsic instead.
+    SDValue CheckEventChainIn = Addr->getOperand(0);
+    SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
+                                      CheckEventChainIn);
+    if (!NewChain.getNode())
+      return nullptr;
+    Chain = NewChain;
+  }
+  // Enable events on the thread using setsr 1 and then disable them immediately
+  // after with clrsr 1. If any resources owned by the thread are ready an event
+  // will be taken. If no resource is ready we branch to the address which was
+  // the operand to the checkevent intrinsic.
+  SDValue constOne = getI32Imm(1);
+  SDValue Glue =
+    SDValue(CurDAG->getMachineNode(XCore::SETSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Chain), 0);
+  Glue =
+    SDValue(CurDAG->getMachineNode(XCore::CLRSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Glue), 0);
+  if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
+      nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
+    return CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+                                nextAddr->getOperand(0), Glue);
+  }
+  return CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
new file mode 100644
index 0000000..be7ef64
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -0,0 +1,1966 @@
+//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreISelLowering.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-lower"
+
+const char *XCoreTargetLowering::
+getTargetNodeName(unsigned Opcode) const
+{
+  switch (Opcode)
+  {
+    case XCoreISD::BL                : return "XCoreISD::BL";
+    case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
+    case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
+    case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
+    case XCoreISD::LDWSP             : return "XCoreISD::LDWSP";
+    case XCoreISD::STWSP             : return "XCoreISD::STWSP";
+    case XCoreISD::RETSP             : return "XCoreISD::RETSP";
+    case XCoreISD::LADD              : return "XCoreISD::LADD";
+    case XCoreISD::LSUB              : return "XCoreISD::LSUB";
+    case XCoreISD::LMUL              : return "XCoreISD::LMUL";
+    case XCoreISD::MACCU             : return "XCoreISD::MACCU";
+    case XCoreISD::MACCS             : return "XCoreISD::MACCS";
+    case XCoreISD::CRC8              : return "XCoreISD::CRC8";
+    case XCoreISD::BR_JT             : return "XCoreISD::BR_JT";
+    case XCoreISD::BR_JT32           : return "XCoreISD::BR_JT32";
+    case XCoreISD::FRAME_TO_ARGS_OFFSET : return "XCoreISD::FRAME_TO_ARGS_OFFSET";
+    case XCoreISD::EH_RETURN         : return "XCoreISD::EH_RETURN";
+    case XCoreISD::MEMBARRIER        : return "XCoreISD::MEMBARRIER";
+    default                          : return nullptr;
+  }
+}
+
+XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM)
+    : TargetLowering(TM, new XCoreTargetObjectFile()), TM(TM),
+      Subtarget(TM.getSubtarget<XCoreSubtarget>()) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Division is expensive
+  setIntDivIsCheap(false);
+
+  setStackPointerRegisterToSaveRestore(XCore::SP);
+
+  setSchedulingPreference(Sched::Source);
+
+  // Use i32 for setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // XCore does not have the NodeTypes below.
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32,   Expand);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+
+  // 64bit
+  setOperationAction(ISD::ADD, MVT::i64, Custom);
+  setOperationAction(ISD::SUB, MVT::i64, Custom);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::MULHS, MVT::i32, Expand);
+  setOperationAction(ISD::MULHU, MVT::i32, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // Bit Manipulation
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Jump tables.
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32 , Custom);
+
+  // Conversion of i64 -> double produces constantpool nodes
+  setOperationAction(ISD::ConstantPool, MVT::i32,   Custom);
+
+  // Loads
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
+
+  // Custom expand misaligned loads / stores.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+  // Varargs
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+  // Dynamic stack
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  // Exception handling
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+  setExceptionPointerRegister(XCore::R0);
+  setExceptionSelectorRegister(XCore::R1);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+
+  // Atomic operations
+  // We request a fence for ATOMIC_* instructions, to reduce them to Monotonic.
+  // As we are always Sequential Consistent, an ATOMIC_FENCE becomes a no OP.
+  setInsertFencesForAtomic(true);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 4;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize
+    = MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 2;
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
+}
+
+bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+    return true;
+  }
+
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode())
+  {
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
+  case ISD::LOAD:               return LowerLOAD(Op, DAG);
+  case ISD::STORE:              return LowerSTORE(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::SMUL_LOHI:          return LowerSMUL_LOHI(Op, DAG);
+  case ISD::UMUL_LOHI:          return LowerUMUL_LOHI(Op, DAG);
+  // FIXME: Remove these when LegalizeDAGTypes lands.
+  case ISD::ADD:
+  case ISD::SUB:                return ExpandADDSUB(Op.getNode(), DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
+  case ISD::ATOMIC_LOAD:        return LowerATOMIC_LOAD(Op, DAG);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::ADD:
+  case ISD::SUB:
+    Results.push_back(ExpandADDSUB(N, DAG));
+    return;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA,
+                                                     const GlobalValue *GV,
+                                                     SelectionDAG &DAG) const {
+  // FIXME there is no actual debug info here
+  SDLoc dl(GA);
+
+  if (GV->getType()->getElementType()->isFunctionTy())
+    return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+
+  const auto *GVar = dyn_cast<GlobalVariable>(GV);
+  if ((GV->hasSection() && StringRef(GV->getSection()).startswith(".cp.")) ||
+      (GVar && GVar->isConstant() && GV->hasLocalLinkage()))
+    return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
+
+  return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
+}
+
+static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) {
+  if (XTL.getTargetMachine().getCodeModel() == CodeModel::Small)
+    return true;
+
+  Type *ObjType = GV->getType()->getPointerElementType();
+  if (!ObjType->isSized())
+    return false;
+
+  unsigned ObjSize = XTL.getDataLayout()->getTypeAllocSize(ObjType);
+  return ObjSize < CodeModelLargeSize && ObjSize != 0;
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
+  SDLoc DL(GN);
+  int64_t Offset = GN->getOffset();
+  if (IsSmallObject(GV, *this)) {
+    // We can only fold positive offsets that are a multiple of the word size.
+    int64_t FoldedOffset = std::max(Offset & ~3, (int64_t)0);
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, FoldedOffset);
+    GA = getGlobalAddressWrapper(GA, GV, DAG);
+    // Handle the rest of the offset.
+    if (Offset != FoldedOffset) {
+      SDValue Remaining = DAG.getConstant(Offset - FoldedOffset, MVT::i32);
+      GA = DAG.getNode(ISD::ADD, DL, MVT::i32, GA, Remaining);
+    }
+    return GA;
+  } else {
+    // Ideally we would not fold in offset with an index <= 11.
+    Type *Ty = Type::getInt8PtrTy(*DAG.getContext());
+    Constant *GA = ConstantExpr::getBitCast(const_cast<GlobalValue*>(GV), Ty);
+    Ty = Type::getInt32Ty(*DAG.getContext());
+    Constant *Idx = ConstantInt::get(Ty, Offset);
+    Constant *GAI = ConstantExpr::getGetElementPtr(GA, Idx);
+    SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
+    return DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), CP,
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+}
+
+SDValue XCoreTargetLowering::
+LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc DL(Op);
+
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
+}
+
+SDValue XCoreTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really debug info here
+  SDLoc dl(CP);
+  EVT PtrVT = Op.getValueType();
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry()) {
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment(), CP->getOffset());
+  } else {
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment(), CP->getOffset());
+  }
+  return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
+}
+
+unsigned XCoreTargetLowering::getJumpTableEncoding() const {
+  return MachineJumpTableInfo::EK_Inline;
+}
+
+SDValue XCoreTargetLowering::
+LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc dl(Op);
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  unsigned JTI = JT->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+
+  unsigned NumEntries = MJTI->getJumpTables()[JTI].MBBs.size();
+  if (NumEntries <= 32) {
+    return DAG.getNode(XCoreISD::BR_JT, dl, MVT::Other, Chain, TargetJT, Index);
+  }
+  assert((NumEntries >> 31) == 0);
+  SDValue ScaledIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
+                                    DAG.getConstant(1, MVT::i32));
+  return DAG.getNode(XCoreISD::BR_JT32, dl, MVT::Other, Chain, TargetJT,
+                     ScaledIndex);
+}
+
+SDValue XCoreTargetLowering::
+lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
+                                       int64_t Offset, SelectionDAG &DAG) const
+{
+  if ((Offset & 0x3) == 0) {
+    return DAG.getLoad(getPointerTy(), DL, Chain, Base, MachinePointerInfo(),
+                       false, false, false, 0);
+  }
+  // Lower to pair of consecutive word aligned loads plus some bit shifting.
+  int32_t HighOffset = RoundUpToAlignment(Offset, 4);
+  int32_t LowOffset = HighOffset - 4;
+  SDValue LowAddr, HighAddr;
+  if (GlobalAddressSDNode *GASD =
+        dyn_cast<GlobalAddressSDNode>(Base.getNode())) {
+    LowAddr = DAG.getGlobalAddress(GASD->getGlobal(), DL, Base.getValueType(),
+                                   LowOffset);
+    HighAddr = DAG.getGlobalAddress(GASD->getGlobal(), DL, Base.getValueType(),
+                                    HighOffset);
+  } else {
+    LowAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+                          DAG.getConstant(LowOffset, MVT::i32));
+    HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+                           DAG.getConstant(HighOffset, MVT::i32));
+  }
+  SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, MVT::i32);
+  SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, MVT::i32);
+
+  SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
+                            LowAddr, MachinePointerInfo(),
+                            false, false, false, 0);
+  SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
+                             HighAddr, MachinePointerInfo(),
+                             false, false, false, 0);
+  SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
+  SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
+  SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, LowShifted, HighShifted);
+  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+                      High.getValue(1));
+  SDValue Ops[] = { Result, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
+{
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(Value, KnownZero, KnownOne);
+  return KnownZero.countTrailingOnes() >= 2;
+}
+
+SDValue XCoreTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Unexpected extension type");
+  assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
+  if (allowsUnalignedMemoryAccesses(LD->getMemoryVT()))
+    return SDValue();
+
+  unsigned ABIAlignment = getDataLayout()->
+    getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned load alone.
+  if (LD->getAlignment() >= ABIAlignment)
+    return SDValue();
+
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  SDLoc DL(Op);
+
+  if (!LD->isVolatile()) {
+    const GlobalValue *GV;
+    int64_t Offset = 0;
+    if (DAG.isBaseWithConstantOffset(BasePtr) &&
+        isWordAligned(BasePtr->getOperand(0), DAG)) {
+      SDValue NewBasePtr = BasePtr->getOperand(0);
+      Offset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+      return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+                                                    Offset, DAG);
+    }
+    if (TLI.isGAPlusOffset(BasePtr.getNode(), GV, Offset) &&
+        MinAlign(GV->getAlignment(), 4) == 4) {
+      SDValue NewBasePtr = DAG.getGlobalAddress(GV, DL,
+                                                BasePtr->getValueType(0));
+      return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+                                                    Offset, DAG);
+    }
+  }
+
+  if (LD->getAlignment() == 2) {
+    SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain,
+                                 BasePtr, LD->getPointerInfo(), MVT::i16,
+                                 LD->isVolatile(), LD->isNonTemporal(), 2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue High = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+                                  HighAddr,
+                                  LD->getPointerInfo().getWithOffset(2),
+                                  MVT::i16, LD->isVolatile(),
+                                  LD->isNonTemporal(), 2);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+                             High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  // Lower to a call to __misaligned_load(BasePtr).
+  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain)
+    .setCallee(CallingConv::C, IntPtrTy,
+               DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
+               std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  SDValue Ops[] = { CallResult.first, CallResult.second };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue XCoreTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG) const
+{
+  StoreSDNode *ST = cast<StoreSDNode>(Op);
+  assert(!ST->isTruncatingStore() && "Unexpected store type");
+  assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
+  if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+    return SDValue();
+  }
+  unsigned ABIAlignment = getDataLayout()->
+    getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned store alone.
+  if (ST->getAlignment() >= ABIAlignment) {
+    return SDValue();
+  }
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  SDLoc dl(Op);
+
+  if (ST->getAlignment() == 2) {
+    SDValue Low = Value;
+    SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue StoreLow = DAG.getTruncStore(Chain, dl, Low, BasePtr,
+                                         ST->getPointerInfo(), MVT::i16,
+                                         ST->isVolatile(), ST->isNonTemporal(),
+                                         2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue StoreHigh = DAG.getTruncStore(Chain, dl, High, HighAddr,
+                                          ST->getPointerInfo().getWithOffset(2),
+                                          MVT::i16, ST->isVolatile(),
+                                          ST->isNonTemporal(), 2);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+  }
+
+  // Lower to a call to __misaligned_store(BasePtr, Value).
+  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+
+  Entry.Node = Value;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
+               std::move(Args), 0);
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.second;
+}
+
+SDValue XCoreTargetLowering::
+LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::SMUL_LOHI &&
+         "Unexpected operand to lower!");
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue Hi = DAG.getNode(XCoreISD::MACCS, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), Zero, Zero,
+                           LHS, RHS);
+  SDValue Lo(Hi.getNode(), 1);
+  SDValue Ops[] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue XCoreTargetLowering::
+LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::UMUL_LOHI &&
+         "Unexpected operand to lower!");
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), LHS, RHS,
+                           Zero, Zero);
+  SDValue Lo(Hi.getNode(), 1);
+  SDValue Ops[] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// isADDADDMUL - Return whether Op is in a form that is equivalent to
+/// add(add(mul(x,y),a),b). If requireIntermediatesHaveOneUse is true then
+/// each intermediate result in the calculation must also have a single use.
+/// If the Op is in the correct form the constituent parts are written to Mul0,
+/// Mul1, Addend0 and Addend1.
+static bool
+isADDADDMUL(SDValue Op, SDValue &Mul0, SDValue &Mul1, SDValue &Addend0,
+            SDValue &Addend1, bool requireIntermediatesHaveOneUse)
+{
+  if (Op.getOpcode() != ISD::ADD)
+    return false;
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue AddOp;
+  SDValue OtherOp;
+  if (N0.getOpcode() == ISD::ADD) {
+    AddOp = N0;
+    OtherOp = N1;
+  } else if (N1.getOpcode() == ISD::ADD) {
+    AddOp = N1;
+    OtherOp = N0;
+  } else {
+    return false;
+  }
+  if (requireIntermediatesHaveOneUse && !AddOp.hasOneUse())
+    return false;
+  if (OtherOp.getOpcode() == ISD::MUL) {
+    // add(add(a,b),mul(x,y))
+    if (requireIntermediatesHaveOneUse && !OtherOp.hasOneUse())
+      return false;
+    Mul0 = OtherOp.getOperand(0);
+    Mul1 = OtherOp.getOperand(1);
+    Addend0 = AddOp.getOperand(0);
+    Addend1 = AddOp.getOperand(1);
+    return true;
+  }
+  if (AddOp.getOperand(0).getOpcode() == ISD::MUL) {
+    // add(add(mul(x,y),a),b)
+    if (requireIntermediatesHaveOneUse && !AddOp.getOperand(0).hasOneUse())
+      return false;
+    Mul0 = AddOp.getOperand(0).getOperand(0);
+    Mul1 = AddOp.getOperand(0).getOperand(1);
+    Addend0 = AddOp.getOperand(1);
+    Addend1 = OtherOp;
+    return true;
+  }
+  if (AddOp.getOperand(1).getOpcode() == ISD::MUL) {
+    // add(add(a,mul(x,y)),b)
+    if (requireIntermediatesHaveOneUse && !AddOp.getOperand(1).hasOneUse())
+      return false;
+    Mul0 = AddOp.getOperand(1).getOperand(0);
+    Mul1 = AddOp.getOperand(1).getOperand(1);
+    Addend0 = AddOp.getOperand(0);
+    Addend1 = OtherOp;
+    return true;
+  }
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+TryExpandADDWithMul(SDNode *N, SelectionDAG &DAG) const
+{
+  SDValue Mul;
+  SDValue Other;
+  if (N->getOperand(0).getOpcode() == ISD::MUL) {
+    Mul = N->getOperand(0);
+    Other = N->getOperand(1);
+  } else if (N->getOperand(1).getOpcode() == ISD::MUL) {
+    Mul = N->getOperand(1);
+    Other = N->getOperand(0);
+  } else {
+    return SDValue();
+  }
+  SDLoc dl(N);
+  SDValue LL, RL, AddendL, AddendH;
+  LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(0),  DAG.getConstant(0, MVT::i32));
+  RL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(1),  DAG.getConstant(0, MVT::i32));
+  AddendL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                        Other,  DAG.getConstant(0, MVT::i32));
+  AddendH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                        Other,  DAG.getConstant(1, MVT::i32));
+  APInt HighMask = APInt::getHighBitsSet(64, 32);
+  unsigned LHSSB = DAG.ComputeNumSignBits(Mul.getOperand(0));
+  unsigned RHSSB = DAG.ComputeNumSignBits(Mul.getOperand(1));
+  if (DAG.MaskedValueIsZero(Mul.getOperand(0), HighMask) &&
+      DAG.MaskedValueIsZero(Mul.getOperand(1), HighMask)) {
+    // The inputs are both zero-extended.
+    SDValue Hi = DAG.getNode(XCoreISD::MACCU, dl,
+                             DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                             AddendL, LL, RL);
+    SDValue Lo(Hi.getNode(), 1);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+  }
+  if (LHSSB > 32 && RHSSB > 32) {
+    // The inputs are both sign-extended.
+    SDValue Hi = DAG.getNode(XCoreISD::MACCS, dl,
+                             DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                             AddendL, LL, RL);
+    SDValue Lo(Hi.getNode(), 1);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+  }
+  SDValue LH, RH;
+  LH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(0),  DAG.getConstant(1, MVT::i32));
+  RH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(1),  DAG.getConstant(1, MVT::i32));
+  SDValue Hi = DAG.getNode(XCoreISD::MACCU, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                           AddendL, LL, RL);
+  SDValue Lo(Hi.getNode(), 1);
+  RH = DAG.getNode(ISD::MUL, dl, MVT::i32, LL, RH);
+  LH = DAG.getNode(ISD::MUL, dl, MVT::i32, LH, RL);
+  Hi = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, RH);
+  Hi = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, LH);
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
+{
+  assert(N->getValueType(0) == MVT::i64 &&
+         (N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+        "Unknown operand to lower!");
+
+  if (N->getOpcode() == ISD::ADD) {
+    SDValue Result = TryExpandADDWithMul(N, DAG);
+    if (Result.getNode())
+      return Result;
+  }
+
+  SDLoc dl(N);
+
+  // Extract components
+  SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            N->getOperand(0),  DAG.getConstant(0, MVT::i32));
+  SDValue LHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            N->getOperand(0),  DAG.getConstant(1, MVT::i32));
+  SDValue RHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1), DAG.getConstant(0, MVT::i32));
+  SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1), DAG.getConstant(1, MVT::i32));
+
+  // Expand
+  unsigned Opcode = (N->getOpcode() == ISD::ADD) ? XCoreISD::LADD :
+                                                   XCoreISD::LSUB;
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSL, RHSL, Zero);
+  SDValue Carry(Lo.getNode(), 1);
+
+  SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSH, RHSH, Carry);
+  SDValue Ignored(Hi.getNode(), 1);
+  // Merge the pieces
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+LowerVAARG(SDValue Op, SelectionDAG &DAG) const
+{
+  // Whist llvm does not support aggregate varargs we can ignore
+  // the possibility of the ValueType being an implicit byVal vararg.
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0); // not an aggregate
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  EVT PtrVT = VAListPtr.getValueType();
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc dl(Node);
+  SDValue VAList = DAG.getLoad(PtrVT, dl, InChain,
+                               VAListPtr, MachinePointerInfo(SV),
+                               false, false, false, 0);
+  // Increment the pointer, VAList, to the next vararg
+  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAList,
+                                DAG.getIntPtrConstant(VT.getSizeInBits() / 8));
+  // Store the incremented VAList to the legalized pointer
+  InChain = DAG.getStore(VAList.getValue(1), dl, nextPtr, VAListPtr,
+                         MachinePointerInfo(SV), false, false, 0);
+  // Load the actual argument out of the pointer VAList
+  return DAG.getLoad(VT, dl, InChain, VAList, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerVASTART(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc dl(Op);
+  // vastart stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument
+  MachineFunction &MF = DAG.getMachineFunction();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  SDValue Addr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(), MVT::i32);
+  return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1),
+                      MachinePointerInfo(), false, false, 0);
+}
+
+SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // This nodes represent llvm.frameaddress on the DAG.
+  // It takes one operand, the index of the frame address to return.
+  // An index of zero corresponds to the current function's frame address.
+  // An index of one to the parent's frame address, and so on.
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op),
+                            RegInfo->getFrameRegister(MF), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  // This nodes represent llvm.returnaddress on the DAG.
+  // It takes one operand, the index of the return address to return.
+  // An index of zero corresponds to the current function's return address.
+  // An index of one to the parent's return address, and so on.
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  int FI = XFI->createLRSpillSlot(MF);
+  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+  return DAG.getLoad(getPointerTy(), SDLoc(Op), DAG.getEntryNode(), FIN,
+                     MachinePointerInfo::getFixedStack(FI), false, false,
+                     false, 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const {
+  // This node represents offset from frame pointer to first on-stack argument.
+  // This is needed for correct stack adjustment during unwind.
+  // However, we don't know the offset until after the frame has be finalised.
+  // This is done during the XCoreFTAOElim pass.
+  return DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, SDLoc(Op), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  // OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER)
+  // This node represents 'eh_return' gcc dwarf builtin, which is used to
+  // return from exception. The general meaning is: adjust stack by OFFSET and
+  // pass execution to HANDLER.
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  // Absolute SP = (FP + FrameToArgs) + Offset
+  const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+  SDValue Stack = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                            RegInfo->getFrameRegister(MF), MVT::i32);
+  SDValue FrameToArgs = DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, dl,
+                                    MVT::i32);
+  Stack = DAG.getNode(ISD::ADD, dl, MVT::i32, Stack, FrameToArgs);
+  Stack = DAG.getNode(ISD::ADD, dl, MVT::i32, Stack, Offset);
+
+  // R0=ExceptionPointerRegister R1=ExceptionSelectorRegister
+  // which leaves 2 caller saved registers, R2 & R3 for us to use.
+  unsigned StackReg = XCore::R2;
+  unsigned HandlerReg = XCore::R3;
+
+  SDValue OutChains[] = {
+    DAG.getCopyToReg(Chain, dl, StackReg, Stack),
+    DAG.getCopyToReg(Chain, dl, HandlerReg, Handler)
+  };
+
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+
+  return DAG.getNode(XCoreISD::EH_RETURN, dl, MVT::Other, Chain,
+                     DAG.getRegister(StackReg, MVT::i32),
+                     DAG.getRegister(HandlerReg, MVT::i32));
+
+}
+
+SDValue XCoreTargetLowering::
+LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue XCoreTargetLowering::
+LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // .align 4
+  // LDAPF_u10 r11, nest
+  // LDW_2rus r11, r11[0]
+  // STWSP_ru6 r11, sp[0]
+  // LDAPF_u10 r11, fptr
+  // LDW_2rus r11, r11[0]
+  // BAU_1r r11
+  // nest:
+  // .word nest
+  // fptr:
+  // .word fptr
+  SDValue OutChains[5];
+
+  SDValue Addr = Trmp;
+
+  SDLoc dl(Op);
+  OutChains[0] = DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, MVT::i32),
+                              Addr, MachinePointerInfo(TrmpAddr), false, false,
+                              0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(4, MVT::i32));
+  OutChains[1] = DAG.getStore(Chain, dl, DAG.getConstant(0xd80456c0, MVT::i32),
+                              Addr, MachinePointerInfo(TrmpAddr, 4), false,
+                              false, 0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(8, MVT::i32));
+  OutChains[2] = DAG.getStore(Chain, dl, DAG.getConstant(0x27fb0a3c, MVT::i32),
+                              Addr, MachinePointerInfo(TrmpAddr, 8), false,
+                              false, 0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(12, MVT::i32));
+  OutChains[3] = DAG.getStore(Chain, dl, Nest, Addr,
+                              MachinePointerInfo(TrmpAddr, 12), false, false,
+                              0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(16, MVT::i32));
+  OutChains[4] = DAG.getStore(Chain, dl, FPtr, Addr,
+                              MachinePointerInfo(TrmpAddr, 16), false, false,
+                              0);
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+SDValue XCoreTargetLowering::
+LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (IntNo) {
+    case Intrinsic::xcore_crc8:
+      EVT VT = Op.getValueType();
+      SDValue Data =
+        DAG.getNode(XCoreISD::CRC8, DL, DAG.getVTList(VT, VT),
+                    Op.getOperand(1), Op.getOperand(2) , Op.getOperand(3));
+      SDValue Crc(Data.getNode(), 1);
+      SDValue Results[] = { Crc, Data };
+      return DAG.getMergeValues(Results, DL);
+  }
+  return SDValue();
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *N = cast<AtomicSDNode>(Op);
+  assert(N->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP");
+  assert(N->getOrdering() <= Monotonic &&
+         "setInsertFencesForAtomic(true) and yet greater than Monotonic");
+  if (N->getMemoryVT() == MVT::i32) {
+    if (N->getAlignment() < 4)
+      report_fatal_error("atomic load must be aligned");
+    return DAG.getLoad(getPointerTy(), SDLoc(Op), N->getChain(),
+                       N->getBasePtr(), N->getPointerInfo(),
+                       N->isVolatile(), N->isNonTemporal(),
+                       N->isInvariant(), N->getAlignment(),
+                       N->getTBAAInfo(), N->getRanges());
+  }
+  if (N->getMemoryVT() == MVT::i16) {
+    if (N->getAlignment() < 2)
+      report_fatal_error("atomic load must be aligned");
+    return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
+                          N->getBasePtr(), N->getPointerInfo(), MVT::i16,
+                          N->isVolatile(), N->isNonTemporal(),
+                          N->getAlignment(), N->getTBAAInfo());
+  }
+  if (N->getMemoryVT() == MVT::i8)
+    return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
+                          N->getBasePtr(), N->getPointerInfo(), MVT::i8,
+                          N->isVolatile(), N->isNonTemporal(),
+                          N->getAlignment(), N->getTBAAInfo());
+  return SDValue();
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *N = cast<AtomicSDNode>(Op);
+  assert(N->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP");
+  assert(N->getOrdering() <= Monotonic &&
+         "setInsertFencesForAtomic(true) and yet greater than Monotonic");
+  if (N->getMemoryVT() == MVT::i32) {
+    if (N->getAlignment() < 4)
+      report_fatal_error("atomic store must be aligned");
+    return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(),
+                        N->getBasePtr(), N->getPointerInfo(),
+                        N->isVolatile(), N->isNonTemporal(),
+                        N->getAlignment(), N->getTBAAInfo());
+  }
+  if (N->getMemoryVT() == MVT::i16) {
+    if (N->getAlignment() < 2)
+      report_fatal_error("atomic store must be aligned");
+    return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
+                             N->getBasePtr(), N->getPointerInfo(), MVT::i16,
+                             N->isVolatile(), N->isNonTemporal(),
+                             N->getAlignment(), N->getTBAAInfo());
+  }
+  if (N->getMemoryVT() == MVT::i8)
+    return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
+                             N->getBasePtr(), N->getPointerInfo(), MVT::i8,
+                             N->isVolatile(), N->isNonTemporal(),
+                             N->getAlignment(), N->getTBAAInfo());
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "XCoreGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore call implementation
+SDValue
+XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // XCore target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // For now, only CallingConv::C implemented
+  switch (CallConv)
+  {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::Fast:
+    case CallingConv::C:
+      return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                            Outs, OutVals, Ins, dl, DAG, InVals);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers / memory locations.
+static SDValue
+LowerCallResult(SDValue Chain, SDValue InFlag,
+                const SmallVectorImpl<CCValAssign> &RVLocs,
+                SDLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals) {
+  SmallVector<std::pair<int, unsigned>, 4> ResultMemLocs;
+  // Copy results out of physical registers.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = RVLocs[i];
+    if (VA.isRegLoc()) {
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getValVT(),
+                                 InFlag).getValue(1);
+      InFlag = Chain.getValue(2);
+      InVals.push_back(Chain.getValue(0));
+    } else {
+      assert(VA.isMemLoc());
+      ResultMemLocs.push_back(std::make_pair(VA.getLocMemOffset(),
+                                             InVals.size()));
+      // Reserve space for this result.
+      InVals.push_back(SDValue());
+    }
+  }
+
+  // Copy results out of memory.
+  SmallVector<SDValue, 4> MemOpChains;
+  for (unsigned i = 0, e = ResultMemLocs.size(); i != e; ++i) {
+    int offset = ResultMemLocs[i].first;
+    unsigned index = ResultMemLocs[i].second;
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+    SDValue Ops[] = { Chain, DAG.getConstant(offset / 4, MVT::i32) };
+    SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops);
+    InVals[index] = load;
+    MemOpChains.push_back(load.getValue(1));
+  }
+
+  // Transform all loads nodes into one single node because
+  // all load nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  return Chain;
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isTailCall, sret.
+SDValue
+XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // The ABI dictates there should be one stack slot available to the callee
+  // on function entry (for saving lr).
+  CCInfo.AllocateStack(4, 4);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  // Analyze return values to determine the number of bytes of stack required.
+  CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext());
+  RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
+  RetCCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = RetCCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes,
+                                 getPointerTy(), true), dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      int Offset = VA.getLocMemOffset();
+
+      MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other,
+                                        Chain, Arg,
+                                        DAG.getConstant(Offset/4, MVT::i32)));
+    }
+  }
+
+  // Transform all store nodes into one single node because
+  // all store nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  // XCoreBranchLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(XCoreISD::BL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, RVLocs, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct ArgDataPair { SDValue SDV; ISD::ArgFlagsTy Flags; };
+}
+
+/// XCore formal arguments implementation
+SDValue
+XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
+                                          CallingConv::ID CallConv,
+                                          bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                          SDLoc dl,
+                                          SelectionDAG &DAG,
+                                          SmallVectorImpl<SDValue> &InVals)
+                                            const {
+  switch (CallConv)
+  {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::C:
+    case CallingConv::Fast:
+      return LowerCCCArguments(Chain, CallConv, isVarArg,
+                               Ins, dl, DAG, InVals);
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments places on the stack.
+/// TODO: sret
+SDValue
+XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
+                                       CallingConv::ID CallConv,
+                                       bool isVarArg,
+                                       const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                       SDLoc dl,
+                                       SelectionDAG &DAG,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
+
+  unsigned StackSlotSize = XCoreFrameLowering::stackSlotSize();
+
+  unsigned LRSaveSize = StackSlotSize;
+
+  if (!isVarArg)
+    XFI->setReturnStackOffset(CCInfo.getNextStackOffset() + LRSaveSize);
+
+  // All getCopyFromReg ops must precede any getMemcpys to prevent the
+  // scheduler clobbering a register before it has been copied.
+  // The stages are:
+  // 1. CopyFromReg (and load) arg & vararg registers.
+  // 2. Chain CopyFromReg nodes into a TokenFactor.
+  // 3. Memcpy 'byVal' args & push final InVals.
+  // 4. Chain mem ops nodes into a TokenFactor.
+  SmallVector<SDValue, 4> CFRegNode;
+  SmallVector<ArgDataPair, 4> ArgData;
+  SmallVector<SDValue, 4> MemOps;
+
+  // 1a. CopyFromReg (and load) arg registers.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+    CCValAssign &VA = ArgLocs[i];
+    SDValue ArgIn;
+
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+                 << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+          llvm_unreachable(nullptr);
+        }
+      case MVT::i32:
+        unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+        CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
+      }
+    } else {
+      // sanity check
+      assert(VA.isMemLoc());
+      // Load the argument to a virtual register
+      unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+      if (ObjSize > StackSlotSize) {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << EVT(VA.getLocVT()).getEVTString()
+               << "\n";
+      }
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      LRSaveSize + VA.getLocMemOffset(),
+                                      true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      //from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                          MachinePointerInfo::getFixedStack(FI),
+                          false, false, false, 0);
+    }
+    const ArgDataPair ADP = { ArgIn, Ins[i].Flags };
+    ArgData.push_back(ADP);
+  }
+
+  // 1b. CopyFromReg vararg registers.
+  if (isVarArg) {
+    // Argument registers
+    static const MCPhysReg ArgRegs[] = {
+      XCore::R0, XCore::R1, XCore::R2, XCore::R3
+    };
+    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+    unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs,
+                                                     array_lengthof(ArgRegs));
+    if (FirstVAReg < array_lengthof(ArgRegs)) {
+      int offset = 0;
+      // Save remaining registers, storing higher register numbers at a higher
+      // address
+      for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
+        // Create a stack slot
+        int FI = MFI->CreateFixedObject(4, offset, true);
+        if (i == (int)FirstVAReg) {
+          XFI->setVarArgsFrameIndex(FI);
+        }
+        offset -= StackSlotSize;
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+        // Move argument from phys reg -> virt reg
+        unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+        RegInfo.addLiveIn(ArgRegs[i], VReg);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
+        // Move argument from virt reg -> stack
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOps.push_back(Store);
+      }
+    } else {
+      // This will point to the next argument passed via stack.
+      XFI->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset(),
+                               true));
+    }
+  }
+
+  // 2. chain CopyFromReg nodes into a TokenFactor.
+  if (!CFRegNode.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, CFRegNode);
+
+  // 3. Memcpy 'byVal' args & push final InVals.
+  // Aggregates passed "byVal" need to be copied by the callee.
+  // The callee will use a pointer to this copy, rather than the original
+  // pointer.
+  for (SmallVectorImpl<ArgDataPair>::const_iterator ArgDI = ArgData.begin(),
+                                                    ArgDE = ArgData.end();
+       ArgDI != ArgDE; ++ArgDI) {
+    if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
+      unsigned Size = ArgDI->Flags.getByValSize();
+      unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
+      // Create a new object on the stack and copy the pointee into it.
+      int FI = MFI->CreateStackObject(Size, Align, false);
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(FIN);
+      MemOps.push_back(DAG.getMemcpy(Chain, dl, FIN, ArgDI->SDV,
+                                     DAG.getConstant(Size, MVT::i32),
+                                     Align, false, false,
+                                     MachinePointerInfo(),
+                                     MachinePointerInfo()));
+    } else {
+      InVals.push_back(ArgDI->SDV);
+    }
+  }
+
+  // 4, chain mem ops nodes into a TokenFactor.
+  if (!MemOps.empty()) {
+    MemOps.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool XCoreTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+               bool isVarArg,
+               const SmallVectorImpl<ISD::OutputArg> &Outs,
+               LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  if (!CCInfo.CheckReturn(Outs, RetCC_XCore))
+    return false;
+  if (CCInfo.getNextStackOffset() != 0 && isVarArg)
+    return false;
+  return true;
+}
+
+SDValue
+XCoreTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 SDLoc dl, SelectionDAG &DAG) const {
+
+  XCoreFunctionInfo *XFI =
+    DAG.getMachineFunction().getInfo<XCoreFunctionInfo>();
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  if (!isVarArg)
+    CCInfo.AllocateStack(XFI->getReturnStackOffset(), 4);
+
+  CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Return on XCore is always a "retsp 0"
+  RetOps.push_back(DAG.getConstant(0, MVT::i32));
+
+  SmallVector<SDValue, 4> MemOpChains;
+  // Handle return values that must be copied to memory.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    if (VA.isRegLoc())
+      continue;
+    assert(VA.isMemLoc());
+    if (isVarArg) {
+      report_fatal_error("Can't return value from vararg function in memory");
+    }
+
+    int Offset = VA.getLocMemOffset();
+    unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
+    // Create the frame index object for the memory location.
+    int FI = MFI->CreateFixedObject(ObjSize, Offset, false);
+
+    // Create a SelectionDAG node corresponding to a store
+    // to this memory location.
+    SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+    MemOpChains.push_back(DAG.getStore(Chain, dl, OutVals[i], FIN,
+                          MachinePointerInfo::getFixedStack(FI), false, false,
+                          0));
+  }
+
+  // Transform all store nodes into one single node because
+  // all stores are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Now handle return values copied to registers.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    if (!VA.isRegLoc())
+      continue;
+    // Copy the result values into the output registers.
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+    // guarantee that all emitted copies are
+    // stuck together, avoiding something bad
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  assert((MI->getOpcode() == XCore::SELECT_CC) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
+    .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(*BB, BB->begin(), dl,
+          TII.get(XCore::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::INTRINSIC_VOID:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::xcore_outt:
+    case Intrinsic::xcore_outct:
+    case Intrinsic::xcore_chkct: {
+      SDValue OutVal = N->getOperand(3);
+      // These instructions ignore the high bits.
+      if (OutVal.hasOneUse()) {
+        unsigned BitWidth = OutVal.getValueSizeInBits();
+        APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+        APInt KnownZero, KnownOne;
+        TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                              !DCI.isBeforeLegalizeOps());
+        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+        if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
+            TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
+                                     TLO))
+          DCI.CommitTargetLoweringOpt(TLO);
+      }
+      break;
+    }
+    case Intrinsic::xcore_setpt: {
+      SDValue Time = N->getOperand(3);
+      // This instruction ignores the high bits.
+      if (Time.hasOneUse()) {
+        unsigned BitWidth = Time.getValueSizeInBits();
+        APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+        APInt KnownZero, KnownOne;
+        TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                              !DCI.isBeforeLegalizeOps());
+        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+        if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
+            TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
+                                     TLO))
+          DCI.CommitTargetLoweringOpt(TLO);
+      }
+      break;
+    }
+    }
+    break;
+  case XCoreISD::LADD: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+
+    // canonicalize constant to RHS
+    if (N0C && !N1C)
+      return DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N1, N0, N2);
+
+    // fold (ladd 0, 0, x) -> 0, x & 1
+    if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+      SDValue Carry = DAG.getConstant(0, VT);
+      SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2,
+                                   DAG.getConstant(1, VT));
+      SDValue Ops[] = { Result, Carry };
+      return DAG.getMergeValues(Ops, dl);
+    }
+
+    // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
+    // low bit set
+    if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Carry = DAG.getConstant(0, VT);
+        SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
+        SDValue Ops[] = { Result, Carry };
+        return DAG.getMergeValues(Ops, dl);
+      }
+    }
+  }
+  break;
+  case XCoreISD::LSUB: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+
+    // fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set
+    if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Borrow = N2;
+        SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
+                                     DAG.getConstant(0, VT), N2);
+        SDValue Ops[] = { Result, Borrow };
+        return DAG.getMergeValues(Ops, dl);
+      }
+    }
+
+    // fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the
+    // low bit set
+    if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Borrow = DAG.getConstant(0, VT);
+        SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
+        SDValue Ops[] = { Result, Borrow };
+        return DAG.getMergeValues(Ops, dl);
+      }
+    }
+  }
+  break;
+  case XCoreISD::LMUL: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+    // Canonicalize multiplicative constant to RHS. If both multiplicative
+    // operands are constant canonicalize smallest to RHS.
+    if ((N0C && !N1C) ||
+        (N0C && N1C && N0C->getZExtValue() < N1C->getZExtValue()))
+      return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT),
+                         N1, N0, N2, N3);
+
+    // lmul(x, 0, a, b)
+    if (N1C && N1C->isNullValue()) {
+      // If the high result is unused fold to add(a, b)
+      if (N->hasNUsesOfValue(0, 0)) {
+        SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3);
+        SDValue Ops[] = { Lo, Lo };
+        return DAG.getMergeValues(Ops, dl);
+      }
+      // Otherwise fold to ladd(a, b, 0)
+      SDValue Result =
+        DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1);
+      SDValue Carry(Result.getNode(), 1);
+      SDValue Ops[] = { Carry, Result };
+      return DAG.getMergeValues(Ops, dl);
+    }
+  }
+  break;
+  case ISD::ADD: {
+    // Fold 32 bit expressions such as add(add(mul(x,y),a),b) ->
+    // lmul(x, y, a, b). The high result of lmul will be ignored.
+    // This is only profitable if the intermediate results are unused
+    // elsewhere.
+    SDValue Mul0, Mul1, Addend0, Addend1;
+    if (N->getValueType(0) == MVT::i32 &&
+        isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, true)) {
+      SDValue Ignored = DAG.getNode(XCoreISD::LMUL, dl,
+                                    DAG.getVTList(MVT::i32, MVT::i32), Mul0,
+                                    Mul1, Addend0, Addend1);
+      SDValue Result(Ignored.getNode(), 1);
+      return Result;
+    }
+    APInt HighMask = APInt::getHighBitsSet(64, 32);
+    // Fold 64 bit expression such as add(add(mul(x,y),a),b) ->
+    // lmul(x, y, a, b) if all operands are zero-extended. We do this
+    // before type legalization as it is messy to match the operands after
+    // that.
+    if (N->getValueType(0) == MVT::i64 &&
+        isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, false) &&
+        DAG.MaskedValueIsZero(Mul0, HighMask) &&
+        DAG.MaskedValueIsZero(Mul1, HighMask) &&
+        DAG.MaskedValueIsZero(Addend0, HighMask) &&
+        DAG.MaskedValueIsZero(Addend1, HighMask)) {
+      SDValue Mul0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                  Mul0, DAG.getConstant(0, MVT::i32));
+      SDValue Mul1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                  Mul1, DAG.getConstant(0, MVT::i32));
+      SDValue Addend0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                     Addend0, DAG.getConstant(0, MVT::i32));
+      SDValue Addend1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                     Addend1, DAG.getConstant(0, MVT::i32));
+      SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+                               DAG.getVTList(MVT::i32, MVT::i32), Mul0L, Mul1L,
+                               Addend0L, Addend1L);
+      SDValue Lo(Hi.getNode(), 1);
+      return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+    }
+  }
+  break;
+  case ISD::STORE: {
+    // Replace unaligned store of unaligned load with memmove.
+    StoreSDNode *ST  = cast<StoreSDNode>(N);
+    if (!DCI.isBeforeLegalize() ||
+        allowsUnalignedMemoryAccesses(ST->getMemoryVT()) ||
+        ST->isVolatile() || ST->isIndexed()) {
+      break;
+    }
+    SDValue Chain = ST->getChain();
+
+    unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
+    if (StoreBits % 8) {
+      break;
+    }
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(
+        ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
+    unsigned Alignment = ST->getAlignment();
+    if (Alignment >= ABIAlignment) {
+      break;
+    }
+
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
+      if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
+        LD->getAlignment() == Alignment &&
+        !LD->isVolatile() && !LD->isIndexed() &&
+        Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
+        return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
+                              LD->getBasePtr(),
+                              DAG.getConstant(StoreBits/8, MVT::i32),
+                              Alignment, false, ST->getPointerInfo(),
+                              LD->getPointerInfo());
+      }
+    }
+    break;
+  }
+  }
+  return SDValue();
+}
+
+void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                        APInt &KnownZero,
+                                                        APInt &KnownOne,
+                                                        const SelectionDAG &DAG,
+                                                        unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case XCoreISD::LADD:
+  case XCoreISD::LSUB:
+    if (Op.getResNo() == 1) {
+      // Top bits of carry / borrow are clear.
+      KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                        KnownZero.getBitWidth() - 1);
+    }
+    break;
+  case ISD::INTRINSIC_W_CHAIN:
+    {
+      unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      case Intrinsic::xcore_getts:
+        // High bits are known to be zero.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 16);
+        break;
+      case Intrinsic::xcore_int:
+      case Intrinsic::xcore_inct:
+        // High bits are known to be zero.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 8);
+        break;
+      case Intrinsic::xcore_testct:
+        // Result is either 0 or 1.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 1);
+        break;
+      case Intrinsic::xcore_testwct:
+        // Result is in the range 0 - 4.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 3);
+        break;
+      }
+    }
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing mode description hooks
+//===----------------------------------------------------------------------===//
+
+static inline bool isImmUs(int64_t val)
+{
+  return (val >= 0 && val <= 11);
+}
+
+static inline bool isImmUs2(int64_t val)
+{
+  return (val%2 == 0 && isImmUs(val/2));
+}
+
+static inline bool isImmUs4(int64_t val)
+{
+  return (val%4 == 0 && isImmUs(val/4));
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool
+XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
+
+  const DataLayout *TD = TM.getDataLayout();
+  unsigned Size = TD->getTypeAllocSize(Ty);
+  if (AM.BaseGV) {
+    return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
+                 AM.BaseOffs%4 == 0;
+  }
+
+  switch (Size) {
+  case 1:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs(AM.BaseOffs);
+    }
+    // reg + reg
+    return AM.Scale == 1 && AM.BaseOffs == 0;
+  case 2:
+  case 3:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs2(AM.BaseOffs);
+    }
+    // reg + reg<<1
+    return AM.Scale == 2 && AM.BaseOffs == 0;
+  default:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs4(AM.BaseOffs);
+    }
+    // reg + reg<<2
+    return AM.Scale == 4 && AM.BaseOffs == 0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           XCore Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass*>
+XCoreTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default : break;
+    case 'r':
+      return std::make_pair(0U, &XCore::GRRegsRegClass);
+    }
+  }
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
new file mode 100644
index 0000000..62b89c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -0,0 +1,218 @@
+//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that XCore uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREISELLOWERING_H
+#define XCOREISELLOWERING_H
+
+#include "XCore.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+  // Forward delcarations
+  class XCoreSubtarget;
+  class XCoreTargetMachine;
+
+  namespace XCoreISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Branch and link (call)
+      BL,
+
+      // pc relative address
+      PCRelativeWrapper,
+
+      // dp relative address
+      DPRelativeWrapper,
+
+      // cp relative address
+      CPRelativeWrapper,
+
+      // Load word from stack
+      LDWSP,
+
+      // Store word to stack
+      STWSP,
+
+      // Corresponds to retsp instruction
+      RETSP,
+
+      // Corresponds to LADD instruction
+      LADD,
+
+      // Corresponds to LSUB instruction
+      LSUB,
+
+      // Corresponds to LMUL instruction
+      LMUL,
+
+      // Corresponds to MACCU instruction
+      MACCU,
+
+      // Corresponds to MACCS instruction
+      MACCS,
+
+      // Corresponds to CRC8 instruction
+      CRC8,
+
+      // Jumptable branch.
+      BR_JT,
+
+      // Jumptable branch using long branches for each entry.
+      BR_JT32,
+
+      // Offset from frame pointer to the first (possible) on-stack argument
+      FRAME_TO_ARGS_OFFSET,
+
+      // Exception handler return. The stack is restored to the first
+      // followed by a jump to the second argument.
+      EH_RETURN,
+
+      // Memory barrier.
+      MEMBARRIER
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class XCoreTargetLowering : public TargetLowering
+  {
+  public:
+
+    explicit XCoreTargetLowering(const TargetMachine &TM);
+
+    using TargetLowering::isZExtFree;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+
+    unsigned getJumpTableEncoding() const override;
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const override;
+
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+  private:
+    const TargetMachine &TM;
+    const XCoreSubtarget &Subtarget;
+
+    // Lower Operand helpers
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SDLoc dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+    SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
+                                    SelectionDAG &DAG) const;
+    SDValue lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain,
+                                                   SDValue Base, int64_t Offset,
+                                                   SelectionDAG &DAG) const;
+
+    // Lower Operand specifics
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+
+    // Inline asm support
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 MVT VT) const override;
+
+    // Expand specifics
+    SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
+    SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           SDLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  SDLoc dl, SelectionDAG &DAG) const override;
+
+    bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                     bool isVarArg,
+                     const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+                     LLVMContext &Context) const override;
+  };
+}
+
+#endif // XCOREISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
new file mode 100644
index 0000000..379cc39
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
@@ -0,0 +1,277 @@
+//===-- XCoreInstrFormats.td - XCore Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+class InstXCore<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "XCore";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+  let Size = sz;
+  field bits<32> SoftFail = 0;
+}
+
+// XCore pseudo instructions format
+class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstXCore<0, outs, ins, asmstr, pattern> {
+  let isPseudo = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
+class _F3R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc;
+  let DecoderMethod = "Decode3RInstruction";
+}
+
+// 3R with first operand as an immediate. Used for TSETR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F3RImm<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F3R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode3RImmInstruction";
+}
+
+class _FL3R<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{8-4};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{3-0};
+
+  let Inst{15-11} = 0b11111;
+  let DecoderMethod = "DecodeL3RInstruction";
+}
+
+// L3R with first operand as both a source and a destination.
+class _FL3RSrcDst<bits<9> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern> : _FL3R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL3RSrcDstInstruction";
+}
+
+class _F2RUS<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc;
+  let DecoderMethod = "Decode2RUSInstruction";
+}
+
+// 2RUS with bitp operand
+class _F2RUSBitp<bits<5> opc, dag outs, dag ins, string asmstr,
+                 list<dag> pattern>
+    : _F2RUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RUSBitpInstruction";
+}
+
+class _FL2RUS<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{8-4};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{3-0};
+
+  let Inst{15-11} = 0b11111;
+  let DecoderMethod = "DecodeL2RUSInstruction";
+}
+
+// L2RUS with bitp operand
+class _FL2RUSBitp<bits<9> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern>
+    : _FL2RUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL2RUSBitpInstruction";
+}
+
+class _FRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<4> a;
+  bits<6> b;
+
+  let Inst{15-10} = opc;
+  let Inst{9-6} = a;
+  let Inst{5-0} = b;
+}
+
+class _FLRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<4> a;
+  bits<16> b;
+
+  let Inst{31-26} = opc;
+  let Inst{25-22} = a;
+  let Inst{21-16} = b{5-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = b{15-6};
+}
+
+class _FU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<6> a;
+
+  let Inst{15-6} = opc;
+  let Inst{5-0} = a;
+}
+
+class _FLU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<16> a;
+
+  let Inst{31-22} = opc;
+  let Inst{21-16} = a{5-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = a{15-6};
+}
+
+class _FU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<10> a;
+
+  let Inst{15-10} = opc;
+  let Inst{9-0} = a;
+}
+
+class _FLU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<20> a;
+
+  let Inst{31-26} = opc;
+  let Inst{25-16} = a{9-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = a{19-10};
+}
+
+class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{5-1};
+  let Inst{4} = opc{0};
+  let DecoderMethod = "Decode2RInstruction";
+}
+
+// 2R with first operand as an immediate. Used for TSETMR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F2RImm<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RImmInstruction";
+}
+
+// 2R with first operand as both a source and a destination.
+class _F2RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                 list<dag> pattern> : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RSrcDstInstruction";
+}
+
+// Same as 2R with last two operands swapped
+class _FR2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeR2RInstruction";
+}
+
+class _FRUS<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{5-1};
+  let Inst{4} = opc{0};
+  let DecoderMethod = "DecodeRUSInstruction";
+}
+
+// RUS with bitp operand
+class _FRUSBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+                list<dag> pattern>
+    : _FRUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeRUSBitpInstruction";
+}
+
+// RUS with first operand as both a source and a destination and a bitp second
+// operand
+class _FRUSSrcDstBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+                      list<dag> pattern>
+    : _FRUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeRUSSrcDstBitpInstruction";
+}
+
+class _FL2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{9-5};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{4-1};
+
+  let Inst{15-11} = 0b11111;
+  let Inst{4} = opc{0};
+  let DecoderMethod = "DecodeL2RInstruction";
+}
+
+// Same as L2R with last two operands swapped
+class _FLR2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _FL2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeLR2RInstruction";
+}
+
+class _F1R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<4> a;
+
+  let Inst{15-11} = opc{5-1};
+  let Inst{10-5} = 0b111111;
+  let Inst{4} = opc{0};
+  let Inst{3-0} = a;
+}
+
+class _F0R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{9-5};
+  let Inst{10-5} = 0b111111;
+  let Inst{4-0} = opc{4-0};
+}
+
+class _FL4R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<4> d;
+
+  let Inst{31-27} = opc{5-1};
+  let Inst{26-21} = 0b111111;
+  let Inst{20} = opc{0};
+  let Inst{19-16} = d;
+  let Inst{15-11} = 0b11111;
+}
+
+// L4R with 4th operand as both a source and a destination.
+class _FL4RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern>
+    : _FL4R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL4RSrcDstInstruction";
+}
+
+// L4R with 1st and 4th operand as both a source and a destination.
+class _FL4RSrcDstSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                        list<dag> pattern>
+    : _FL4R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL4RSrcDstSrcDstInstruction";
+}
+
+class _FL5R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{5-1};
+  let Inst{20} = opc{0};
+  let Inst{15-11} = 0b11111;
+
+  let DecoderMethod = "DecodeL5RInstruction";
+}
+
+class _FL6R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc;
+  let Inst{15-11} = 0b11111;
+
+  let DecoderMethod = "DecodeL6RInstruction";
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
new file mode 100644
index 0000000..36ea9a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -0,0 +1,461 @@
+//===-- XCoreInstrInfo.cpp - XCore Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstrInfo.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+namespace XCore {
+
+  // XCore Condition Codes
+  enum CondCode {
+    COND_TRUE,
+    COND_FALSE,
+    COND_INVALID
+  };
+}
+}
+
+// Pin the vtable to this file.
+void XCoreInstrInfo::anchor() {}
+
+XCoreInstrInfo::XCoreInstrInfo()
+  : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    RI() {
+}
+
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned
+XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const{
+  int Opcode = MI->getOpcode();
+  if (Opcode == XCore::LDWFI) 
+  {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) 
+    {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+unsigned
+XCoreInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                   int &FrameIndex) const {
+  int Opcode = MI->getOpcode();
+  if (Opcode == XCore::STWFI)
+  {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2))))
+    {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+static inline bool IsBRU(unsigned BrOpc) {
+  return BrOpc == XCore::BRFU_u6
+      || BrOpc == XCore::BRFU_lu6
+      || BrOpc == XCore::BRBU_u6
+      || BrOpc == XCore::BRBU_lu6;
+}
+
+static inline bool IsBRT(unsigned BrOpc) {
+  return BrOpc == XCore::BRFT_ru6
+      || BrOpc == XCore::BRFT_lru6
+      || BrOpc == XCore::BRBT_ru6
+      || BrOpc == XCore::BRBT_lru6;
+}
+
+static inline bool IsBRF(unsigned BrOpc) {
+  return BrOpc == XCore::BRFF_ru6
+      || BrOpc == XCore::BRFF_lru6
+      || BrOpc == XCore::BRBF_ru6
+      || BrOpc == XCore::BRBF_lru6;
+}
+
+static inline bool IsCondBranch(unsigned BrOpc) {
+  return IsBRF(BrOpc) || IsBRT(BrOpc);
+}
+
+static inline bool IsBR_JT(unsigned BrOpc) {
+  return BrOpc == XCore::BR_JT
+      || BrOpc == XCore::BR_JT32;
+}
+
+/// GetCondFromBranchOpc - Return the XCore CC that matches 
+/// the correspondent Branch instruction opcode.
+static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) 
+{
+  if (IsBRT(BrOpc)) {
+    return XCore::COND_TRUE;
+  } else if (IsBRF(BrOpc)) {
+    return XCore::COND_FALSE;
+  } else {
+    return XCore::COND_INVALID;
+  }
+}
+
+/// GetCondBranchFromCond - Return the Branch instruction
+/// opcode that matches the cc.
+static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) 
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case XCore::COND_TRUE   : return XCore::BRFT_lru6;
+  case XCore::COND_FALSE  : return XCore::BRFF_lru6;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified 
+/// condition, e.g. turning COND_E to COND_NE.
+static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case XCore::COND_TRUE   : return XCore::COND_FALSE;
+  case XCore::COND_FALSE  : return XCore::COND_TRUE;
+  }
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with an conditional branch and it falls through to
+///    an successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with an conditional branch and an unconditional
+///    block, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool
+XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                              MachineBasicBlock *&FBB,
+                              SmallVectorImpl<MachineOperand> &Cond,
+                              bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (IsBRU(LastInst->getOpcode())) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    
+    XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == XCore::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+    
+    // Conditional branch
+    // Block ends with fall-through condbranch.
+
+    TBB = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(LastInst->getOperand(0));
+    return false;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+  
+  unsigned SecondLastOpc    = SecondLastInst->getOpcode();
+  XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+  
+  // If the block ends with conditional branch followed by unconditional,
+  // handle it.
+  if (BranchCode != XCore::COND_INVALID
+    && IsBRU(LastInst->getOpcode())) {
+
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(SecondLastInst->getOperand(0));
+
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (IsBRU(SecondLastInst->getOpcode()) && 
+      IsBRU(LastInst->getOpcode())) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Likewise if it ends with a branch table followed by an unconditional branch.
+  if (IsBR_JT(SecondLastInst->getOpcode()) && IsBRU(LastInst->getOpcode())) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned
+XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL)const{
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "Unexpected number of components!");
+  
+  if (!FBB) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch
+      BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB);
+    } else {
+      // Conditional branch.
+      unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+      BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg())
+                             .addMBB(TBB);
+    }
+    return 1;
+  }
+  
+  // Two-way Conditional branch.
+  assert(Cond.size() == 2 && "Unexpected number of components!");
+  unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+  BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg())
+                         .addMBB(TBB);
+  BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(FBB);
+  return 2;
+}
+
+unsigned
+XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode()))
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!IsCondBranch(I->getOpcode()))
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I, DebugLoc DL,
+                                 unsigned DestReg, unsigned SrcReg,
+                                 bool KillSrc) const {
+  bool GRDest = XCore::GRRegsRegClass.contains(DestReg);
+  bool GRSrc  = XCore::GRRegsRegClass.contains(SrcReg);
+
+  if (GRDest && GRSrc) {
+    BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addImm(0);
+    return;
+  }
+  
+  if (GRDest && SrcReg == XCore::SP) {
+    BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg).addImm(0);
+    return;
+  }
+
+  if (DestReg == XCore::SP && GRSrc) {
+    BuildMI(MBB, I, DL, get(XCore::SETSP_1r))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const
+{
+  DebugLoc DL;
+  if (I != MBB.end() && !I->isDebugValue())
+    DL = I->getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
+                             MachineMemOperand::MOStore,
+                             MFI.getObjectSize(FrameIndex),
+                             MFI.getObjectAlignment(FrameIndex));
+  BuildMI(MBB, I, DL, get(XCore::STWFI))
+    .addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIndex)
+    .addImm(0)
+    .addMemOperand(MMO);
+}
+
+void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          unsigned DestReg, int FrameIndex,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const
+{
+  DebugLoc DL;
+  if (I != MBB.end() && !I->isDebugValue())
+    DL = I->getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
+                             MachineMemOperand::MOLoad,
+                             MFI.getObjectSize(FrameIndex),
+                             MFI.getObjectAlignment(FrameIndex));
+  BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
+    .addFrameIndex(FrameIndex)
+    .addImm(0)
+    .addMemOperand(MMO);
+}
+
+/// ReverseBranchCondition - Return the inverse opcode of the 
+/// specified Branch instruction.
+bool XCoreInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert((Cond.size() == 2) && 
+          "Invalid XCore branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm()));
+  return false;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+static bool isImmMskBitp(unsigned val) {
+  if (!isMask_32(val)) {
+    return false;
+  }
+  int N = Log2_32(val) + 1;
+  return (N >= 1 && N <= 8) || N == 16 || N == 24 || N == 32;
+}
+
+MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
+                                              MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator MI,
+                                              unsigned Reg, uint64_t Value) const {
+  DebugLoc dl;
+  if (MI != MBB.end() && !MI->isDebugValue())
+    dl = MI->getDebugLoc();
+  if (isImmMskBitp(Value)) {
+    int N = Log2_32(Value) + 1;
+    return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N);
+  }
+  if (isImmU16(Value)) {
+    int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+    return BuildMI(MBB, MI, dl, get(Opcode), Reg).addImm(Value);
+  }
+  MachineConstantPool *ConstantPool = MBB.getParent()->getConstantPool();
+  const Constant *C = ConstantInt::get(
+        Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Value);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+  return BuildMI(MBB, MI, dl, get(XCore::LDWCP_lru6), Reg)
+            .addConstantPoolIndex(Idx);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
new file mode 100644
index 0000000..e0be96b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -0,0 +1,94 @@
+//===-- XCoreInstrInfo.h - XCore Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREINSTRUCTIONINFO_H
+#define XCOREINSTRUCTIONINFO_H
+
+#include "XCoreRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+
+class XCoreInstrInfo : public XCoreGenInstrInfo {
+  const XCoreRegisterInfo RI;
+  virtual void anchor();
+public:
+  XCoreInstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool ReverseBranchCondition(
+                          SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  // Emit code before MBBI to load immediate value into physical register Reg.
+  // Returns an iterator to the new instruction.
+  MachineBasicBlock::iterator loadImmediate(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MI,
+                                            unsigned Reg, uint64_t Value) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
new file mode 100644
index 0000000..00cb705
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -0,0 +1,1319 @@
+//===-- XCoreInstrInfo.td - Target Description for XCore ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the XCore instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Uses of CP, DP are not currently reflected in the patterns, since
+// having a physical register as an operand prevents loop hoisting and
+// since the value of these registers never changes during the life of the
+// function.
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass.
+//===----------------------------------------------------------------------===//
+
+include "XCoreInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// XCore specific DAG Nodes.
+//
+
+// Call
+def SDT_XCoreBranchLink : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def XCoreBranchLink     : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
+def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad, SDNPVariadic]>;
+
+def SDT_XCoreEhRet : SDTypeProfile<0, 2,
+                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def XCoreEhRet       : SDNode<"XCoreISD::EH_RETURN", SDT_XCoreEhRet,
+                         [SDNPHasChain, SDNPOptInGlue]>;
+
+def SDT_XCoreBR_JT    : SDTypeProfile<0, 2,
+                                      [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def XCoreBR_JT : SDNode<"XCoreISD::BR_JT", SDT_XCoreBR_JT,
+                        [SDNPHasChain]>;
+
+def XCoreBR_JT32 : SDNode<"XCoreISD::BR_JT32", SDT_XCoreBR_JT,
+                        [SDNPHasChain]>;
+
+def SDT_XCoreAddress    : SDTypeProfile<1, 1,
+                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def pcrelwrapper : SDNode<"XCoreISD::PCRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def frametoargsoffset : SDNode<"XCoreISD::FRAME_TO_ARGS_OFFSET", SDTIntLeaf,
+                               []>;
+
+def SDT_XCoreStwsp    : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
+def XCoreStwsp        : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
+                               [SDNPHasChain, SDNPMayStore]>;
+
+def SDT_XCoreLdwsp    : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def XCoreLdwsp        : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
+                               [SDNPHasChain, SDNPMayLoad]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_XCoreCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_XCoreMEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def XCoreMemBarrier : SDNode<"XCoreISD::MEMBARRIER", SDT_XCoreMEMBARRIER,
+                             [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def div4_xform : SDNodeXForm<imm, [{
+  // Transformation function: imm/4
+  assert(N->getZExtValue() % 4 == 0);
+  return getI32Imm(N->getZExtValue()/4);
+}]>;
+
+def msksize_xform : SDNodeXForm<imm, [{
+  // Transformation function: get the size of a mask
+  assert(isMask_32(N->getZExtValue()));
+  // look for the first non-zero bit
+  return getI32Imm(32 - countLeadingZeros((uint32_t)N->getZExtValue()));
+}]>;
+
+def neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(-value);
+}]>;
+
+def bpwsub_xform : SDNodeXForm<imm, [{
+  // Transformation function: 32-imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(32-value);
+}]>;
+
+def div4neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm/4
+  uint32_t value = N->getZExtValue();
+  assert(-value % 4 == 0);
+  return getI32Imm(-value/4);
+}]>;
+
+def immUs4Neg : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (-value)%4 == 0 && (-value)/4 <= 11;
+}]>;
+
+def immUs4 : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return value%4 == 0 && value/4 <= 11;
+}]>;
+
+def immUsNeg : PatLeaf<(imm), [{
+  return -((uint32_t)N->getZExtValue()) <= 11;
+}]>;
+
+def immUs : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() <= 11;
+}]>;
+
+def immU6 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 6);
+}]>;
+
+def immU10 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 10);
+}]>;
+
+def immU16 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 16);
+}]>;
+
+def immU20 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 20);
+}]>;
+
+def immMskBitp : PatLeaf<(imm), [{ return immMskBitp(N); }]>;
+
+def immBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 1 && value <= 8)
+          || value == 16
+          || value == 24
+          || value == 32;
+}]>;
+
+def immBpwSubBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 24 && value <= 31)
+          || value == 16
+          || value == 8
+          || value == 0;
+}]>;
+
+def lda16f : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 1))>;
+def lda16b : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 1))>;
+def ldawf : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 2))>;
+def ldawb : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 2))>;
+
+// Instruction operand types
+def pcrel_imm  : Operand<i32>;
+def pcrel_imm_neg  : Operand<i32> {
+  let DecoderMethod = "DecodeNegImmOperand";
+}
+def brtarget : Operand<OtherVT>;
+def brtarget_neg : Operand<OtherVT> {
+  let DecoderMethod = "DecodeNegImmOperand";
+}
+
+// Addressing modes
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+// Jump tables.
+def InlineJT : Operand<i32> {
+  let PrintMethod = "printInlineJT";
+}
+
+def InlineJT32 : Operand<i32> {
+  let PrintMethod = "printInlineJT32";
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+
+multiclass F3R_2RUS<bits<5> opc1, bits<5> opc2, string OpcStr, SDNode OpNode> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"),
+                [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                     !strconcat(OpcStr, " $dst, $b, $c"),
+                     [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+multiclass F3R_2RUS_np<bits<5> opc1, bits<5> opc2, string OpcStr> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"), []>;
+  def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                     !strconcat(OpcStr, " $dst, $b, $c"), []>;
+}
+
+multiclass F3R_2RBITP<bits<5> opc1, bits<5> opc2, string OpcStr,
+                      SDNode OpNode> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"),
+                [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                         !strconcat(OpcStr, " $dst, $b, $c"),
+                         [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class F3R<bits<5> opc, string OpcStr, SDNode OpNode> :
+  _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+       !strconcat(OpcStr, " $dst, $b, $c"),
+       [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+class F3R_np<bits<5> opc, string OpcStr> :
+  _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+       !strconcat(OpcStr, " $dst, $b, $c"), []>;
+// Three operand long
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RUS<bits<9> opc1, bits<9> opc2, string OpcStr,
+                      SDNode OpNode> {
+  def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                  !strconcat(OpcStr, " $dst, $b, $c"),
+                  [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                       !strconcat(OpcStr, " $dst, $b, $c"),
+                       [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RBITP<bits<9> opc1, bits<9> opc2, string OpcStr,
+                        SDNode OpNode> {
+  def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                  !strconcat(OpcStr, " $dst, $b, $c"),
+                  [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                           !strconcat(OpcStr, " $dst, $b, $c"),
+                           [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class FL3R<bits<9> opc, string OpcStr, SDNode OpNode> :
+  _FL3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+        !strconcat(OpcStr, " $dst, $b, $c"),
+        [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+// Register - U6
+// Operand register - U6
+multiclass FRU6_LRU6_branch<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                  !strconcat(OpcStr, " $a, $b"), []>;
+  def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                    !strconcat(OpcStr, " $a, $b"), []>;
+}
+
+multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+                  !strconcat(OpcStr, " $a, $b"), []>;
+  def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+                    !strconcat(OpcStr, " $a, $b"), []>;
+}
+
+
+// U6
+multiclass FU6_LU6<bits<10> opc, string OpcStr, SDNode OpNode> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                [(OpNode immU6:$a)]>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                  [(OpNode immU16:$a)]>;
+}
+
+multiclass FU6_LU6_int<bits<10> opc, string OpcStr, Intrinsic Int> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                [(Int immU6:$a)]>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                  [(Int immU16:$a)]>;
+}
+
+multiclass FU6_LU6_np<bits<10> opc, string OpcStr> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+}
+
+// Two operand short
+
+class F2R_np<bits<6> opc, string OpcStr> :
+  _F2R<opc, (outs GRRegs:$dst), (ins GRRegs:$b),
+       !strconcat(OpcStr, " $dst, $b"), []>;
+
+// Two operand long
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
+                               "# ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "# ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let isReMaterializable = 1 in
+def FRAME_TO_ARGS_OFFSET : PseudoInstXCore<(outs GRRegs:$dst), (ins),
+                               "# FRAME_TO_ARGS_OFFSET $dst",
+                               [(set GRRegs:$dst, (frametoargsoffset))]>;
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in
+def EH_RETURN : PseudoInstXCore<(outs), (ins GRRegs:$s, GRRegs:$handler),
+                               "# EH_RETURN $s, $handler",
+                               [(XCoreEhRet GRRegs:$s, GRRegs:$handler)]>;
+
+def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "# LDWFI $dst, $addr",
+                             [(set GRRegs:$dst, (load ADDRspii:$addr))]>;
+
+def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "# LDAWFI $dst, $addr",
+                             [(set GRRegs:$dst, ADDRspii:$addr)]>;
+
+def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr),
+                            "# STWFI $src, $addr",
+                            [(store GRRegs:$src, ADDRspii:$addr)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1 in {
+  def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst),
+                              (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F),
+                              "# SELECT_CC PSEUDO!",
+                              [(set GRRegs:$dst,
+                                 (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
+}
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
+                                     [(XCoreMemBarrier)]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+defm ADD : F3R_2RUS<0b00010, 0b10010, "add", add>;
+defm SUB : F3R_2RUS<0b00011, 0b10011, "sub", sub>;
+let neverHasSideEffects = 1 in {
+defm EQ : F3R_2RUS_np<0b00110, 0b10110, "eq">;
+def LSS_3r : F3R_np<0b11000, "lss">;
+def LSU_3r : F3R_np<0b11001, "lsu">;
+}
+def AND_3r : F3R<0b00111, "and", and>;
+def OR_3r : F3R<0b01000, "or", or>;
+
+let mayLoad=1 in {
+def LDW_3r : _F3R<0b01001, (outs GRRegs:$dst),
+                  (ins GRRegs:$addr, GRRegs:$offset),
+                  "ldw $dst, $addr[$offset]", []>;
+
+def LDW_2rus : _F2RUS<0b00001, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, i32imm:$offset),
+                      "ldw $dst, $addr[$offset]", []>;
+
+def LD16S_3r :  _F3R<0b10000, (outs GRRegs:$dst),
+                     (ins GRRegs:$addr, GRRegs:$offset),
+                     "ld16s $dst, $addr[$offset]", []>;
+
+def LD8U_3r :  _F3R<0b10001, (outs GRRegs:$dst),
+                    (ins GRRegs:$addr, GRRegs:$offset),
+                    "ld8u $dst, $addr[$offset]", []>;
+}
+
+let mayStore=1 in {
+def STW_l3r : _FL3R<0b000001100, (outs),
+                    (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                    "stw $val, $addr[$offset]", []>;
+
+def STW_2rus : _F2RUS<0b0000, (outs),
+                      (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
+                      "stw $val, $addr[$offset]", []>;
+}
+
+defm SHL : F3R_2RBITP<0b00100, 0b10100, "shl", shl>;
+defm SHR : F3R_2RBITP<0b00101, 0b10101, "shr", srl>;
+
+// The first operand is treated as an immediate since it refers to a register
+// number in another thread.
+def TSETR_3r : _F3RImm<0b10111, (outs), (ins i32imm:$a, GRRegs:$b, GRRegs:$c),
+                       "set t[$c]:r$a, $b", []>;
+
+// Three operand long
+def LDAWF_l3r : _FL3R<0b000111100, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, GRRegs:$offset),
+                      "ldaw $dst, $addr[$offset]",
+                      [(set GRRegs:$dst,
+                         (ldawf GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWF_l2rus : _FL2RUS<0b100111100, (outs GRRegs:$dst),
+                          (ins GRRegs:$addr, i32imm:$offset),
+                          "ldaw $dst, $addr[$offset]", []>;
+
+def LDAWB_l3r : _FL3R<0b001001100, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, GRRegs:$offset),
+                      "ldaw $dst, $addr[-$offset]",
+                      [(set GRRegs:$dst,
+                         (ldawb GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWB_l2rus : _FL2RUS<0b101001100, (outs GRRegs:$dst),
+                         (ins GRRegs:$addr, i32imm:$offset),
+                         "ldaw $dst, $addr[-$offset]", []>;
+
+def LDA16F_l3r : _FL3R<0b001011100, (outs GRRegs:$dst),
+                       (ins GRRegs:$addr, GRRegs:$offset),
+                       "lda16 $dst, $addr[$offset]",
+                       [(set GRRegs:$dst,
+                          (lda16f GRRegs:$addr, GRRegs:$offset))]>;
+
+def LDA16B_l3r : _FL3R<0b001101100, (outs GRRegs:$dst),
+                       (ins GRRegs:$addr, GRRegs:$offset),
+                       "lda16 $dst, $addr[-$offset]",
+                       [(set GRRegs:$dst,
+                          (lda16b GRRegs:$addr, GRRegs:$offset))]>;
+
+def MUL_l3r : FL3R<0b001111100, "mul", mul>;
+// Instructions which may trap are marked as side effecting.
+let hasSideEffects = 1 in {
+def DIVS_l3r : FL3R<0b010001100, "divs", sdiv>;
+def DIVU_l3r : FL3R<0b010011100, "divu", udiv>;
+def REMS_l3r : FL3R<0b110001100, "rems", srem>;
+def REMU_l3r : FL3R<0b110011100, "remu", urem>;
+}
+def XOR_l3r : FL3R<0b000011100, "xor", xor>;
+defm ASHR : FL3R_L2RBITP<0b000101100, 0b100101100, "ashr", sra>;
+
+let Constraints = "$src1 = $dst" in
+def CRC_l3r : _FL3RSrcDst<0b101011100, (outs GRRegs:$dst),
+                          (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                          "crc32 $dst, $src2, $src3",
+                          [(set GRRegs:$dst,
+                             (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2,
+                                              GRRegs:$src3))]>;
+
+let mayStore=1 in {
+def ST16_l3r : _FL3R<0b100001100, (outs),
+                     (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                     "st16 $val, $addr[$offset]", []>;
+
+def ST8_l3r : _FL3R<0b100011100, (outs),
+                    (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                    "st8 $val, $addr[$offset]", []>;
+}
+
+def INPW_l2rus : _FL2RUSBitp<0b100101110, (outs GRRegs:$a),
+                             (ins GRRegs:$b, i32imm:$c), "inpw $a, res[$b], $c",
+                             []>;
+
+def OUTPW_l2rus : _FL2RUSBitp<0b100101101, (outs),
+                              (ins GRRegs:$a, GRRegs:$b, i32imm:$c),
+                              "outpw res[$b], $a, $c", []>;
+
+// Four operand long
+let Constraints = "$e = $a,$f = $b" in {
+def MACCU_l4r : _FL4RSrcDstSrcDst<
+  0b000001, (outs GRRegs:$a, GRRegs:$b),
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d", []>;
+
+def MACCS_l4r : _FL4RSrcDstSrcDst<
+  0b000010, (outs GRRegs:$a, GRRegs:$b),
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d", []>;
+}
+
+let Constraints = "$e = $b" in
+def CRC8_l4r : _FL4RSrcDst<0b000000, (outs GRRegs:$a, GRRegs:$b),
+                           (ins GRRegs:$e, GRRegs:$c, GRRegs:$d),
+                           "crc8 $b, $a, $c, $d", []>;
+
+// Five operand long
+
+def LADD_l5r : _FL5R<0b000001, (outs GRRegs:$dst1, GRRegs:$dst2),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "ladd $dst2, $dst1, $src1, $src2, $src3",
+                     []>;
+
+def LSUB_l5r : _FL5R<0b000010, (outs GRRegs:$dst1, GRRegs:$dst2),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "lsub $dst2, $dst1, $src1, $src2, $src3", []>;
+
+def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2),
+                      (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                      "ldivu $dst1, $dst2, $src3, $src1, $src2", []>;
+
+// Six operand long
+
+def LMUL_l6r : _FL6R<
+  0b00000, (outs GRRegs:$dst1, GRRegs:$dst2),
+  (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, GRRegs:$src4),
+  "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", []>;
+
+// Register - U6
+
+//let Uses = [DP] in ...
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldaw $a, dp[$b]", []>;
+
+let isReMaterializable = 1 in                    
+def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
+                        "ldaw $a, dp[$b]",
+                        [(set RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
+
+let mayLoad=1 in
+def LDWDP_ru6: _FRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
+                     "ldw $a, dp[$b]", []>;
+
+def LDWDP_lru6: _FLRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldw $a, dp[$b]",
+                       [(set RRegs:$a, (load (dprelwrapper tglobaladdr:$b)))]>;
+
+let mayStore=1 in
+def STWDP_ru6 : _FRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
+                      "stw $a, dp[$b]", []>;
+
+def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
+                        "stw $a, dp[$b]",
+                        [(store RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
+
+//let Uses = [CP] in ..
+let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in {
+def LDWCP_ru6 : _FRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldw $a, cp[$b]", []>;
+def LDWCP_lru6: _FLRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldw $a, cp[$b]",
+                       [(set RRegs:$a, (load (cprelwrapper tglobaladdr:$b)))]>;
+}
+
+let Uses = [SP] in {
+let mayStore=1 in {
+def STWSP_ru6 : _FRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+                      "stw $a, sp[$b]",
+                      [(XCoreStwsp RRegs:$a, immU6:$b)]>;
+
+def STWSP_lru6 : _FLRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+                        "stw $a, sp[$b]",
+                        [(XCoreStwsp RRegs:$a, immU16:$b)]>;
+}
+
+let mayLoad=1 in {
+def LDWSP_ru6 : _FRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldw $a, sp[$b]",
+                      [(set RRegs:$a, (XCoreLdwsp immU6:$b))]>;
+
+def LDWSP_lru6 : _FLRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+                        "ldw $a, sp[$b]",
+                        [(set RRegs:$a, (XCoreLdwsp immU16:$b))]>;
+}
+
+let neverHasSideEffects = 1 in {
+def LDAWSP_ru6 : _FRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldaw $a, sp[$b]", []>;
+
+def LDAWSP_lru6 : _FLRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+                         "ldaw $a, sp[$b]", []>;
+}
+}
+
+let isReMaterializable = 1 in {
+def LDC_ru6 : _FRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+                    "ldc $a, $b", [(set RRegs:$a, immU6:$b)]>;
+
+def LDC_lru6 : _FLRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldc $a, $b", [(set RRegs:$a, immU16:$b)]>;
+}
+
+def SETC_ru6 : _FRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b),
+                     "setc res[$a], $b",
+                     [(int_xcore_setc GRRegs:$a, immU6:$b)]>;
+
+def SETC_lru6 : _FLRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b),
+                       "setc res[$a], $b",
+                       [(int_xcore_setc GRRegs:$a, immU16:$b)]>;
+
+// Operand register - U6
+let isBranch = 1, isTerminator = 1 in {
+defm BRFT: FRU6_LRU6_branch<0b011100, "bt">;
+defm BRBT: FRU6_LRU6_backwards_branch<0b011101, "bt">;
+defm BRFF: FRU6_LRU6_branch<0b011110, "bf">;
+defm BRBF: FRU6_LRU6_backwards_branch<0b011111, "bf">;
+}
+
+// U6
+let Defs = [SP], Uses = [SP] in {
+let neverHasSideEffects = 1 in
+defm EXTSP : FU6_LU6_np<0b0111011110, "extsp">;
+
+let mayStore = 1 in
+defm ENTSP : FU6_LU6_np<0b0111011101, "entsp">;
+
+let isReturn = 1, isTerminator = 1, mayLoad = 1, isBarrier = 1 in {
+defm RETSP : FU6_LU6<0b0111011111, "retsp", XCoreRetsp>;
+}
+}
+
+let neverHasSideEffects = 1 in
+defm EXTDP : FU6_LU6_np<0b0111001110, "extdp">;
+
+let Uses = [R11], isCall=1 in
+defm BLAT : FU6_LU6_np<0b0111001101, "blat">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
+
+def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
+
+def BRFU_u6 : _FU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+
+def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+}
+
+//let Uses = [CP] in ...
+let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+                    []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+                      [(set R11, (cprelwrapper tglobaladdr:$a))]>;
+
+let Defs = [R11] in
+defm GETSR : FU6_LU6_np<0b0111111100, "getsr r11,">;
+
+defm SETSR : FU6_LU6_int<0b0111101101, "setsr", int_xcore_setsr>;
+
+defm CLRSR : FU6_LU6_int<0b0111101100, "clrsr", int_xcore_clrsr>;
+
+// setsr may cause a branch if it is used to enable events. clrsr may
+// branch if it is executed while events are enabled.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+    isCodeGenOnly = 1 in {
+defm SETSR_branch : FU6_LU6_np<0b0111101101, "setsr">;
+defm CLRSR_branch : FU6_LU6_np<0b0111101100, "clrsr">;
+}
+
+defm KCALL : FU6_LU6_np<0b0111001111, "kcall">;
+
+let Uses = [SP], Defs = [SP], mayStore = 1 in
+defm KENTSP : FU6_LU6_np<0b0111101110, "kentsp">;
+
+let Uses = [SP], Defs = [SP], mayLoad = 1 in
+defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">;
+
+// U10
+
+let Defs = [R11], isReMaterializable = 1 in {
+let neverHasSideEffects = 1 in
+def LDAPF_u10 : _FU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a", []>;
+
+def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+                        [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
+
+let neverHasSideEffects = 1 in
+def LDAPB_u10 : _FU10<0b110111, (outs), (ins pcrel_imm_neg:$a), "ldap r11, $a",
+                      []>;
+
+let neverHasSideEffects = 1 in
+def LDAPB_lu10 : _FLU10<0b110111, (outs), (ins pcrel_imm_neg:$a),
+                        "ldap r11, $a",
+                        [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
+
+let isCodeGenOnly = 1 in
+def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+                           [(set R11, (pcrelwrapper tblockaddress:$a))]>;
+}
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+def BLRF_u10 : _FU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
+                     []>;
+
+def BLRF_lu10 : _FLU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
+                       [(XCoreBranchLink tglobaladdr:$a)]>;
+
+def BLRB_u10 : _FU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+
+def BLRB_lu10 : _FLU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+}
+
+let Defs = [R11], mayLoad = 1, isReMaterializable = 1,
+    neverHasSideEffects = 1 in {
+def LDWCP_u10 : _FU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", []>;
+
+def LDWCP_lu10 : _FLU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]",
+                        []>;
+}
+
+// Two operand short
+def NOT : _F2R<0b100010, (outs GRRegs:$dst), (ins GRRegs:$b),
+                "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>;
+
+def NEG : _F2R<0b100100, (outs GRRegs:$dst), (ins GRRegs:$b),
+                "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
+
+let Constraints = "$src1 = $dst" in {
+def SEXT_rus :
+  _FRUSSrcDstBitp<0b001101, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                  "sext $dst, $src2",
+                  [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+                                                     immBitp:$src2))]>;
+
+def SEXT_2r :
+  _F2RSrcDst<0b001100, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "sext $dst, $src2",
+             [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ZEXT_rus :
+  _FRUSSrcDstBitp<0b010001, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                  "zext $dst, $src2",
+                  [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+                                                     immBitp:$src2))]>;
+
+def ZEXT_2r :
+  _F2RSrcDst<0b010000, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "zext $dst, $src2",
+             [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ANDNOT_2r :
+  _F2RSrcDst<0b001010, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "andnot $dst, $src2",
+             [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
+}
+
+let isReMaterializable = 1, neverHasSideEffects = 1 in
+def MKMSK_rus : _FRUSBitp<0b101001, (outs GRRegs:$dst), (ins i32imm:$size),
+                          "mkmsk $dst, $size", []>;
+
+def MKMSK_2r : _F2R<0b101000, (outs GRRegs:$dst), (ins GRRegs:$size),
+                    "mkmsk $dst, $size",
+                    [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>;
+
+def GETR_rus : _FRUS<0b100000, (outs GRRegs:$dst), (ins i32imm:$type),
+                     "getr $dst, $type",
+                     [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>;
+
+def GETTS_2r : _F2R<0b001110, (outs GRRegs:$dst), (ins GRRegs:$r),
+                    "getts $dst, res[$r]",
+                    [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>;
+
+def SETPT_2r : _FR2R<0b001111, (outs), (ins GRRegs:$r, GRRegs:$val),
+                     "setpt res[$r], $val",
+                     [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_2r : _F2R<0b010010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "outct res[$r], $val",
+                    [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_rus : _FRUS<0b010011, (outs), (ins GRRegs:$r, i32imm:$val),
+                       "outct res[$r], $val",
+                       [(int_xcore_outct GRRegs:$r, immUs:$val)]>;
+
+def OUTT_2r : _FR2R<0b000011, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "outt res[$r], $val",
+                    [(int_xcore_outt GRRegs:$r, GRRegs:$val)]>;
+
+def OUT_2r : _FR2R<0b101010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                   "out res[$r], $val",
+                   [(int_xcore_out GRRegs:$r, GRRegs:$val)]>;
+
+let Constraints = "$src = $dst" in
+def OUTSHR_2r :
+  _F2RSrcDst<0b101011, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+             "outshr res[$r], $src",
+             [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>;
+
+def INCT_2r : _F2R<0b100001, (outs GRRegs:$dst), (ins GRRegs:$r),
+                   "inct $dst, res[$r]",
+                   [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>;
+
+def INT_2r : _F2R<0b100011, (outs GRRegs:$dst), (ins GRRegs:$r),
+                  "int $dst, res[$r]",
+                  [(set GRRegs:$dst, (int_xcore_int GRRegs:$r))]>;
+
+def IN_2r : _F2R<0b101100, (outs GRRegs:$dst), (ins GRRegs:$r),
+                 "in $dst, res[$r]",
+                 [(set GRRegs:$dst, (int_xcore_in GRRegs:$r))]>;
+
+let Constraints = "$src = $dst" in
+def INSHR_2r :
+  _F2RSrcDst<0b101101, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+             "inshr $dst, res[$r]",
+             [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>;
+
+def CHKCT_2r : _F2R<0b110010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "chkct res[$r], $val",
+                    [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>;
+
+def CHKCT_rus : _FRUSBitp<0b110011, (outs), (ins GRRegs:$r, i32imm:$val),
+                          "chkct res[$r], $val",
+                          [(int_xcore_chkct GRRegs:$r, immUs:$val)]>;
+
+def TESTCT_2r : _F2R<0b101111, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "testct $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>;
+
+def TESTWCT_2r : _F2R<0b110001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                      "testwct $dst, res[$src]",
+                      [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>;
+
+def SETD_2r : _FR2R<0b000101, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "setd res[$r], $val",
+                    [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>;
+
+def SETPSC_2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                      "setpsc res[$src1], $src2",
+                      [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>;
+
+def GETST_2r : _F2R<0b000001, (outs GRRegs:$dst), (ins GRRegs:$r),
+                    "getst $dst, res[$r]",
+                    [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>;
+
+def INITSP_2r : _F2R<0b000100, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:sp, $src",
+                     [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>;
+
+def INITPC_2r : _F2R<0b000000, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:pc, $src",
+                     [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>;
+
+def INITCP_2r : _F2R<0b000110, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:cp, $src",
+                     [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>;
+
+def INITDP_2r : _F2R<0b000010, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:dp, $src",
+                     [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>;
+
+def PEEK_2r : _F2R<0b101110, (outs GRRegs:$dst), (ins GRRegs:$src),
+                    "peek $dst, res[$src]",
+                    [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>;
+
+def ENDIN_2r : _F2R<0b100101, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "endin $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>;
+
+def EEF_2r : _F2R<0b001011, (outs), (ins GRRegs:$a, GRRegs:$b),
+                  "eef $a, res[$b]", []>;
+
+def EET_2r : _F2R<0b001001, (outs), (ins GRRegs:$a, GRRegs:$b),
+                  "eet $a, res[$b]", []>;
+
+def TSETMR_2r : _F2RImm<0b000111, (outs), (ins i32imm:$a, GRRegs:$b),
+                        "tsetmr r$a, $b", []>;
+
+// Two operand long
+def BITREV_l2r : _FL2R<0b0000011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                       "bitrev $dst, $src",
+                       [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>;
+
+def BYTEREV_l2r : _FL2R<0b0000011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                        "byterev $dst, $src",
+                        [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
+
+def CLZ_l2r : _FL2R<0b000111000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                    "clz $dst, $src",
+                    [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
+
+def GETD_l2r : _FL2R<0b0001111001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "getd $dst, res[$src]", []>;
+
+def GETN_l2r : _FL2R<0b0011011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "getn $dst, res[$src]", []>;
+
+def SETC_l2r : _FL2R<0b0010111001, (outs), (ins GRRegs:$r, GRRegs:$val),
+                     "setc res[$r], $val",
+                     [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>;
+
+def SETTW_l2r : _FLR2R<0b0010011001, (outs), (ins GRRegs:$r, GRRegs:$val),
+                       "settw res[$r], $val",
+                       [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>;
+
+def GETPS_l2r : _FL2R<0b0001011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                      "get $dst, ps[$src]",
+                      [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>;
+
+def SETPS_l2r : _FLR2R<0b0001111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                       "set ps[$src1], $src2",
+                       [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>;
+
+def INITLR_l2r : _FL2R<0b0001011000, (outs), (ins GRRegs:$src, GRRegs:$t),
+                       "init t[$t]:lr, $src",
+                       [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>;
+
+def SETCLK_l2r : _FLR2R<0b0000111001, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                        "setclk res[$src1], $src2",
+                        [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>;
+
+def SETN_l2r : _FLR2R<0b0011011000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                      "setn res[$src1], $src2", []>;
+
+def SETRDY_l2r : _FLR2R<0b0010111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                        "setrdy res[$src1], $src2",
+                        [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>;
+
+def TESTLCL_l2r : _FL2R<0b0010011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                        "testlcl $dst, res[$src]", []>;
+
+// One operand short
+def MSYNC_1r : _F1R<0b000111, (outs), (ins GRRegs:$a),
+                    "msync res[$a]",
+                    [(int_xcore_msync GRRegs:$a)]>;
+def MJOIN_1r : _F1R<0b000101, (outs), (ins GRRegs:$a),
+                    "mjoin res[$a]",
+                    [(int_xcore_mjoin GRRegs:$a)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BAU_1r : _F1R<0b001001, (outs), (ins GRRegs:$a),
+                 "bau $a",
+                 [(brind GRRegs:$a)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT : PseudoInstXCore<(outs), (ins InlineJT:$t, GRRegs:$i),
+                            "bru $i\n$t",
+                            [(XCoreBR_JT tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i),
+                              "bru $i\n$t",
+                              [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BRU_1r : _F1R<0b001010, (outs), (ins GRRegs:$a), "bru $a", []>;
+
+let Defs=[SP], neverHasSideEffects=1 in
+def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), "set sp, $a", []>;
+
+let neverHasSideEffects=1 in
+def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>;
+
+let neverHasSideEffects=1 in
+def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>;
+
+let hasCtrlDep = 1 in 
+def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a),
+                 "ecallt $a",
+                 []>;
+
+let hasCtrlDep = 1 in 
+def ECALLF_1r : _F1R<0b010010, (outs), (ins GRRegs:$a),
+                 "ecallf $a",
+                 []>;
+
+let isCall=1, 
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLA_1r : _F1R<0b001000, (outs), (ins GRRegs:$a),
+                 "bla $a",
+                 [(XCoreBranchLink GRRegs:$a)]>;
+}
+
+def SYNCR_1r : _F1R<0b100001, (outs), (ins GRRegs:$a),
+                 "syncr res[$a]",
+                 [(int_xcore_syncr GRRegs:$a)]>;
+
+def FREER_1r : _F1R<0b000100, (outs), (ins GRRegs:$a),
+               "freer res[$a]",
+               [(int_xcore_freer GRRegs:$a)]>;
+
+let Uses=[R11] in {
+def SETV_1r : _F1R<0b010001, (outs), (ins GRRegs:$a),
+                   "setv res[$a], r11",
+                   [(int_xcore_setv GRRegs:$a, R11)]>;
+
+def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a),
+                    "setev res[$a], r11",
+                    [(int_xcore_setev GRRegs:$a, R11)]>;
+}
+
+def DGETREG_1r : _F1R<0b001110, (outs GRRegs:$a), (ins), "dgetreg $a", []>;
+
+def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]",
+                  [(int_xcore_edu GRRegs:$a)]>;
+
+def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a),
+               "eeu res[$a]",
+               [(int_xcore_eeu GRRegs:$a)]>;
+
+def KCALL_1r : _F1R<0b010000, (outs), (ins GRRegs:$a), "kcall $a", []>;
+
+def WAITEF_1R : _F1R<0b000011, (outs), (ins GRRegs:$a), "waitef $a", []>;
+
+def WAITET_1R : _F1R<0b000010, (outs), (ins GRRegs:$a), "waitet $a", []>;
+
+def TSTART_1R : _F1R<0b000110, (outs), (ins GRRegs:$a), "start t[$a]", []>;
+
+def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]",
+                    [(int_xcore_clrpt GRRegs:$a)]>;
+
+// Zero operand short
+
+def CLRE_0R : _F0R<0b0000001101, (outs), (ins), "clre", [(int_xcore_clre)]>;
+
+def DCALL_0R : _F0R<0b0000011100, (outs), (ins), "dcall", []>;
+
+let Defs = [SP], Uses = [SP] in
+def DENTSP_0R : _F0R<0b0001001100, (outs), (ins), "dentsp", []>;
+
+let Defs = [SP] in
+def DRESTSP_0R : _F0R<0b0001001101, (outs), (ins), "drestsp", []>;
+
+def DRET_0R : _F0R<0b0000011110, (outs), (ins), "dret", []>;
+
+def FREET_0R : _F0R<0b0000001111, (outs), (ins), "freet", []>;
+
+let Defs = [R11] in {
+def GETID_0R : _F0R<0b0001001110, (outs), (ins),
+                    "get r11, id",
+                    [(set R11, (int_xcore_getid))]>;
+
+def GETED_0R : _F0R<0b0000111110, (outs), (ins),
+                    "get r11, ed",
+                    [(set R11, (int_xcore_geted))]>;
+
+def GETET_0R : _F0R<0b0000111111, (outs), (ins),
+                    "get r11, et",
+                    [(set R11, (int_xcore_getet))]>;
+
+def GETKEP_0R : _F0R<0b0001001111, (outs), (ins),
+                     "get r11, kep", []>;
+
+def GETKSP_0R : _F0R<0b0001011100, (outs), (ins),
+                     "get r11, ksp", []>;
+}
+
+let Defs = [SP] in
+def KRET_0R : _F0R<0b0000011101, (outs), (ins), "kret", []>;
+
+let Uses = [SP], mayLoad = 1 in {
+def LDET_0R : _F0R<0b0001011110, (outs), (ins), "ldw et, sp[4]", []>;
+
+def LDSED_0R : _F0R<0b0001011101, (outs), (ins), "ldw sed, sp[3]", []>;
+
+def LDSPC_0R : _F0R<0b0000101100, (outs), (ins), "ldw spc, sp[1]", []>;
+
+def LDSSR_0R : _F0R<0b0000101110, (outs), (ins), "ldw ssr, sp[2]", []>;
+}
+
+let Uses=[R11] in
+def SETKEP_0R : _F0R<0b0000011111, (outs), (ins), "set kep, r11", []>;
+
+def SSYNC_0r : _F0R<0b0000001110, (outs), (ins),
+                    "ssync",
+                    [(int_xcore_ssync)]>;
+
+let Uses = [SP], mayStore = 1 in {
+def STET_0R : _F0R<0b0000111101, (outs), (ins), "stw et, sp[4]", []>;
+
+def STSED_0R : _F0R<0b0000111100, (outs), (ins), "stw sed, sp[3]", []>;
+
+def STSPC_0R : _F0R<0b0000101101, (outs), (ins), "stw spc, sp[1]", []>;
+
+def STSSR_0R : _F0R<0b0000101111, (outs), (ins), "stw ssr, sp[2]", []>;
+}
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+    hasSideEffects = 1 in
+def WAITEU_0R : _F0R<0b0000001100, (outs), (ins),
+                     "waiteu",
+                     [(brind (int_xcore_waitevent))]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(XCoreBranchLink texternalsym:$addr), (BLRF_lu10 texternalsym:$addr)>;
+
+/// sext_inreg
+def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>;
+def : Pat<(sext_inreg GRRegs:$b, i8), (SEXT_rus GRRegs:$b, 8)>;
+def : Pat<(sext_inreg GRRegs:$b, i16), (SEXT_rus GRRegs:$b, 16)>;
+
+/// loads
+def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(load (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (LDW_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(load (add GRRegs:$addr, immUs4:$offset)),
+          (LDW_2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(load GRRegs:$addr), (LDW_2rus GRRegs:$addr, 0)>;
+
+/// anyext
+def : Pat<(extloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+def : Pat<(extloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+/// stores
+def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+          
+def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (STW_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(store GRRegs:$val, GRRegs:$addr),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+
+/// cttz
+def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
+
+/// trap
+def : Pat<(trap), (ECALLF_1r (LDC_ru6 0))>;
+
+///
+/// branch patterns
+///
+
+// unconditional branch
+def : Pat<(br bb:$addr), (BRFU_lu6 bb:$addr)>;
+
+// direct match equal/notequal zero brcond
+def : Pat<(brcond (setne GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 GRRegs:$lhs, bb:$dst)>;
+def : Pat<(brcond (seteq GRRegs:$lhs, 0), bb:$dst),
+          (BRFF_lru6 GRRegs:$lhs, bb:$dst)>;
+
+def : Pat<(brcond (setle GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setule GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setuge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, immUs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_2rus GRRegs:$lhs, immUs:$rhs), bb:$dst)>;
+
+// generic brcond pattern
+def : Pat<(brcond GRRegs:$cond, bb:$addr), (BRFT_lru6 GRRegs:$cond, bb:$addr)>;
+
+
+///
+/// Select patterns
+///
+
+// direct match equal/notequal zero select
+def : Pat<(select (setne GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (seteq GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(select (setle GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setule GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setuge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, immUs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_2rus GRRegs:$lhs, immUs:$rhs), GRRegs:$F, GRRegs:$T)>;
+
+///
+/// setcc patterns, only matched when none of the above brcond
+/// patterns match
+///
+
+// setcc 2 register operands
+def : Pat<(setle GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+def : Pat<(setule GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+
+def : Pat<(setgt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$rhs, GRRegs:$lhs)>;
+def : Pat<(setugt GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$rhs, GRRegs:$lhs)>;
+
+def : Pat<(setge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+def : Pat<(setuge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(setlt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$lhs, GRRegs:$rhs)>;
+def : Pat<(setult GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+def : Pat<(setne GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (EQ_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(seteq GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+// setcc reg/imm operands
+def : Pat<(seteq GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus GRRegs:$lhs, immUs:$rhs)>;
+def : Pat<(setne GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus (EQ_2rus GRRegs:$lhs, immUs:$rhs), 0)>;
+
+// misc
+def : Pat<(add GRRegs:$addr, immUs4:$offset),
+          (LDAWF_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(sub GRRegs:$addr, immUs4:$offset),
+          (LDAWB_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(and GRRegs:$val, immMskBitp:$mask),
+          (ZEXT_rus GRRegs:$val, (msksize_xform immMskBitp:$mask))>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+def : Pat<(add GRRegs:$src1, immUsNeg:$src2),
+          (SUB_2rus GRRegs:$src1, (neg_xform immUsNeg:$src2))>;
+
+def : Pat<(add GRRegs:$src1, immUs4Neg:$src2),
+          (LDAWB_l2rus GRRegs:$src1, (div4neg_xform immUs4Neg:$src2))>;
+
+///
+/// Some peepholes
+///
+
+def : Pat<(mul GRRegs:$src, 3),
+          (LDA16F_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, 5),
+          (LDAWF_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, -3),
+          (LDAWB_l3r GRRegs:$src, GRRegs:$src)>;
+
+// ashr X, 32 is equivalent to ashr X, 31 on the XCore.
+def : Pat<(sra GRRegs:$src, 31),
+          (ASHR_l2rus GRRegs:$src, 32)>;
+
+def : Pat<(brcond (setlt GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+// setge X, 0 is canonicalized to setgt X, -1
+def : Pat<(brcond (setgt GRRegs:$lhs, -1), bb:$dst),
+          (BRFF_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+def : Pat<(select (setlt GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (setgt GRRegs:$lhs, -1), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(setgt GRRegs:$lhs, -1),
+          (EQ_2rus (ASHR_l2rus GRRegs:$lhs, 32), 0)>;
+
+def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
+          (SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
+
+def : Pat<(load (cprelwrapper tconstpool:$b)),
+          (LDWCP_lru6 tconstpool:$b)>;
+
+def : Pat<(cprelwrapper tconstpool:$b),
+          (LDAWCP_lu6 tconstpool:$b)>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
new file mode 100644
index 0000000..ac3bae5
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -0,0 +1,239 @@
+//===-- XCoreLowerThreadLocal - Lower thread local variables --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a pass that lowers thread local variables on the
+///        XCore.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "xcore-lower-thread-local"
+
+using namespace llvm;
+
+static cl::opt<unsigned> MaxThreads(
+  "xcore-max-threads", cl::Optional,
+  cl::desc("Maximum number of threads (for emulation thread-local storage)"),
+  cl::Hidden, cl::value_desc("number"), cl::init(8));
+
+namespace {
+  /// Lowers thread local variables on the XCore. Each thread local variable is
+  /// expanded to an array of n elements indexed by the thread ID where n is the
+  /// fixed number hardware threads supported by the device.
+  struct XCoreLowerThreadLocal : public ModulePass {
+    static char ID;
+
+    XCoreLowerThreadLocal() : ModulePass(ID) {
+      initializeXCoreLowerThreadLocalPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool lowerGlobal(GlobalVariable *GV);
+
+    bool runOnModule(Module &M) override;
+  };
+}
+
+char XCoreLowerThreadLocal::ID = 0;
+
+INITIALIZE_PASS(XCoreLowerThreadLocal, "xcore-lower-thread-local",
+                "Lower thread local variables", false, false)
+
+ModulePass *llvm::createXCoreLowerThreadLocalPass() {
+  return new XCoreLowerThreadLocal();
+}
+
+static ArrayType *createLoweredType(Type *OriginalType) {
+  return ArrayType::get(OriginalType, MaxThreads);
+}
+
+static Constant *
+createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
+  SmallVector<Constant *, 8> Elements(MaxThreads);
+  for (unsigned i = 0; i != MaxThreads; ++i) {
+    Elements[i] = OriginalInitializer;
+  }
+  return ConstantArray::get(NewType, Elements);
+}
+
+static Instruction *
+createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
+  IRBuilder<true,NoFolder> Builder(Instr);
+  unsigned OpCode = CE->getOpcode();
+  switch (OpCode) {
+    case Instruction::GetElementPtr: {
+      SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
+      ArrayRef<Value *> CEOps(CEOpVec);
+      return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(CEOps[0],
+                                                             CEOps.slice(1)));
+    }
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      return dyn_cast<Instruction>(
+                  Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
+                                      CE->getOperand(0), CE->getOperand(1),
+                                      CE->getName()));
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return dyn_cast<Instruction>(
+                  Builder.CreateCast((Instruction::CastOps)OpCode,
+                                     CE->getOperand(0), CE->getType(),
+                                     CE->getName()));
+    default:
+      llvm_unreachable("Unhandled constant expression!\n");
+  }
+}
+
+static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
+  do {
+    SmallVector<WeakVH,8> WUsers(CE->user_begin(), CE->user_end());
+    std::sort(WUsers.begin(), WUsers.end());
+    WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+    while (!WUsers.empty())
+      if (WeakVH WU = WUsers.pop_back_val()) {
+        if (PHINode *PN = dyn_cast<PHINode>(WU)) {
+          for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
+            if (PN->getIncomingValue(I) == CE) {
+              BasicBlock *PredBB = PN->getIncomingBlock(I);
+              if (PredBB->getTerminator()->getNumSuccessors() > 1)
+                PredBB = SplitEdge(PredBB, PN->getParent(), P);
+              Instruction *InsertPos = PredBB->getTerminator();
+              Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+              PN->setOperand(I, NewInst);
+            }
+        } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
+          Instruction *NewInst = createReplacementInstr(CE, Instr);
+          Instr->replaceUsesOfWith(CE, NewInst);
+        } else {
+          ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
+          if (!CExpr || !replaceConstantExprOp(CExpr, P))
+            return false;
+        }
+      }
+  } while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
+  // sibling may have used 'CE' when createReplacementInstr was called.
+  CE->destroyConstant();
+  return true;
+}
+
+static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
+  SmallVector<WeakVH,8> WUsers;
+  for (User *U : GV->users())
+    if (!isa<Instruction>(U))
+      WUsers.push_back(WeakVH(U));
+  while (!WUsers.empty())
+    if (WeakVH WU = WUsers.pop_back_val()) {
+      ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
+      if (!CE || !replaceConstantExprOp(CE, P))
+        return false;
+    }
+  return true;
+}
+
+static bool isZeroLengthArray(Type *Ty) {
+  ArrayType *AT = dyn_cast<ArrayType>(Ty);
+  return AT && (AT->getNumElements() == 0);
+}
+
+bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
+  Module *M = GV->getParent();
+  LLVMContext &Ctx = M->getContext();
+  if (!GV->isThreadLocal())
+    return false;
+
+  // Skip globals that we can't lower and leave it for the backend to error.
+  if (!rewriteNonInstructionUses(GV, this) ||
+      !GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
+    return false;
+
+  // Create replacement global.
+  ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
+  Constant *NewInitializer = nullptr;
+  if (GV->hasInitializer())
+    NewInitializer = createLoweredInitializer(NewType,
+                                              GV->getInitializer());
+  GlobalVariable *NewGV =
+    new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
+                       NewInitializer, "", nullptr,
+                       GlobalVariable::NotThreadLocal,
+                       GV->getType()->getAddressSpace(),
+                       GV->isExternallyInitialized());
+
+  // Update uses.
+  SmallVector<User *, 16> Users(GV->user_begin(), GV->user_end());
+  for (unsigned I = 0, E = Users.size(); I != E; ++I) {
+    User *U = Users[I];
+    Instruction *Inst = cast<Instruction>(U);
+    IRBuilder<> Builder(Inst);
+    Function *GetID = Intrinsic::getDeclaration(GV->getParent(),
+                                                Intrinsic::xcore_getid);
+    Value *ThreadID = Builder.CreateCall(GetID);
+    SmallVector<Value *, 2> Indices;
+    Indices.push_back(Constant::getNullValue(Type::getInt64Ty(Ctx)));
+    Indices.push_back(ThreadID);
+    Value *Addr = Builder.CreateInBoundsGEP(NewGV, Indices);
+    U->replaceUsesOfWith(GV, Addr);
+  }
+
+  // Remove old global.
+  NewGV->takeName(GV);
+  GV->eraseFromParent();
+  return true;
+}
+
+bool XCoreLowerThreadLocal::runOnModule(Module &M) {
+  // Find thread local globals.
+  bool MadeChange = false;
+  SmallVector<GlobalVariable *, 16> ThreadLocalGlobals;
+  for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+       GVI != E; ++GVI) {
+    GlobalVariable *GV = GVI;
+    if (GV->isThreadLocal())
+      ThreadLocalGlobals.push_back(GV);
+  }
+  for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) {
+    MadeChange |= lowerGlobal(ThreadLocalGlobals[I]);
+  }
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
new file mode 100644
index 0000000..dfdadcf
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -0,0 +1,117 @@
+//===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower XCore MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+#include "XCoreMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+XCoreMCInstLower::XCoreMCInstLower(class AsmPrinter &asmprinter)
+: Printer(asmprinter) {}
+
+void XCoreMCInstLower::Initialize(Mangler *M, MCContext *C) {
+  Mang = M;
+  Ctx = C;
+}
+
+MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                               MachineOperandType MOTy,
+                                               unsigned Offset) const {
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+  const MCSymbol *Symbol;
+
+  switch (MOTy) {
+    case MachineOperand::MO_MachineBasicBlock:
+      Symbol = MO.getMBB()->getSymbol();
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      Symbol = Printer.getSymbol(MO.getGlobal());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Symbol = Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      Symbol = Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      Symbol = Printer.GetJTISymbol(MO.getIndex());
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      Symbol = Printer.GetCPISymbol(MO.getIndex());
+      Offset += MO.getOffset();
+      break;
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx);
+
+  if (!Offset)
+    return MCOperand::CreateExpr(MCSym);
+
+  // Assume offset is never negative.
+  assert(Offset > 0);
+
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::Create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+  return MCOperand::CreateExpr(Add);
+}
+
+MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO,
+                                         unsigned offset) const {
+  MachineOperandType MOTy = MO.getType();
+
+  switch (MOTy) {
+    default: llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) break;
+      return MCOperand::CreateReg(MO.getReg());
+    case MachineOperand::MO_Immediate:
+      return MCOperand::CreateImm(MO.getImm() + offset);
+    case MachineOperand::MO_MachineBasicBlock:
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+    case MachineOperand::MO_JumpTableIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_BlockAddress:
+      return LowerSymbolOperand(MO, MOTy, offset);
+    case MachineOperand::MO_RegisterMask:
+      break;
+  }
+
+  return MCOperand();
+}
+
+void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MO);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
new file mode 100644
index 0000000..28e702b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -0,0 +1,42 @@
+//===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMCINSTLOWER_H
+#define XCOREMCINSTLOWER_H
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MachineInstr;
+  class MachineFunction;
+  class Mangler;
+  class AsmPrinter;
+
+/// \brief This class is used to lower an MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower {
+  typedef MachineOperand::MachineOperandType MachineOperandType;
+  MCContext *Ctx;
+  Mangler *Mang;
+  AsmPrinter &Printer;
+public:
+  XCoreMCInstLower(class AsmPrinter &asmprinter);
+  void Initialize(Mangler *mang, MCContext *C);
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+  MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+  MCOperand LowerSymbolOperand(const MachineOperand &MO,
+                               MachineOperandType MOTy, unsigned Offset) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
new file mode 100644
index 0000000..9ef9752
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -0,0 +1,72 @@
+//===-- XCoreMachineFuctionInfo.cpp - XCore machine function info ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreInstrInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+void XCoreFunctionInfo::anchor() { }
+
+bool XCoreFunctionInfo::isLargeFrame(const MachineFunction &MF) const {
+  if (CachedEStackSize == -1) {
+    CachedEStackSize = MF.getFrameInfo()->estimateStackSize(MF);
+  }
+  // isLargeFrame() is used when deciding if spill slots should be added to
+  // allow eliminateFrameIndex() to scavenge registers.
+  // This is only required when there is no FP and offsets are greater than
+  // ~256KB (~64Kwords). Thus only for code run on the emulator!
+  //
+  // The arbitrary value of 0xf000 allows frames of up to ~240KB before spill
+  // slots are added for the use of eliminateFrameIndex() register scavenging.
+  // For frames less than 240KB, it is assumed that there will be less than
+  // 16KB of function arguments.
+  return CachedEStackSize > 0xf000;
+}
+
+int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) {
+  if (LRSpillSlotSet) {
+    return LRSpillSlot;
+  }
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (! MF.getFunction()->isVarArg()) {
+    // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+    LRSpillSlot = MFI->CreateFixedObject(RC->getSize(), 0, true);
+  } else {
+    LRSpillSlot = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  }
+  LRSpillSlotSet = true;
+  return LRSpillSlot;
+}
+
+int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) {
+  if (FPSpillSlotSet) {
+    return FPSpillSlot;
+  }
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  FPSpillSlot = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  FPSpillSlotSet = true;
+  return FPSpillSlot;
+}
+
+const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) {
+  if (EHSpillSlotSet) {
+    return EHSpillSlot;
+  }
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  EHSpillSlot[0] = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  EHSpillSlot[1] = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  EHSpillSlotSet = true;
+  return EHSpillSlot;
+}
+
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
new file mode 100644
index 0000000..212a5cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -0,0 +1,106 @@
+//===-- XCoreMachineFuctionInfo.h - XCore machine function info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XCore-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMACHINEFUNCTIONINFO_H
+#define XCOREMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+// Forward declarations
+class Function;
+
+/// XCoreFunctionInfo - This class is derived from MachineFunction private
+/// XCore target-specific information for each MachineFunction.
+class XCoreFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  bool LRSpillSlotSet;
+  int LRSpillSlot;
+  bool FPSpillSlotSet;
+  int FPSpillSlot;
+  bool EHSpillSlotSet;
+  int EHSpillSlot[2];
+  unsigned ReturnStackOffset;
+  bool ReturnStackOffsetSet;
+  int VarArgsFrameIndex;
+  mutable int CachedEStackSize;
+  std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>>
+  SpillLabels;
+
+public:
+  XCoreFunctionInfo() :
+    LRSpillSlotSet(false),
+    FPSpillSlotSet(false),
+    EHSpillSlotSet(false),
+    ReturnStackOffsetSet(false),
+    VarArgsFrameIndex(0),
+    CachedEStackSize(-1) {}
+  
+  explicit XCoreFunctionInfo(MachineFunction &MF) :
+    LRSpillSlotSet(false),
+    FPSpillSlotSet(false),
+    EHSpillSlotSet(false),
+    ReturnStackOffsetSet(false),
+    VarArgsFrameIndex(0),
+    CachedEStackSize(-1) {}
+  
+  ~XCoreFunctionInfo() {}
+  
+  void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+
+  int createLRSpillSlot(MachineFunction &MF);
+  bool hasLRSpillSlot() { return LRSpillSlotSet; }
+  int getLRSpillSlot() const {
+    assert(LRSpillSlotSet && "LR Spill slot not set");
+    return LRSpillSlot;
+  }
+
+  int createFPSpillSlot(MachineFunction &MF);
+  bool hasFPSpillSlot() { return FPSpillSlotSet; }
+  int getFPSpillSlot() const {
+    assert(FPSpillSlotSet && "FP Spill slot not set");
+    return FPSpillSlot;
+  }
+
+  const int* createEHSpillSlot(MachineFunction &MF);
+  bool hasEHSpillSlot() { return EHSpillSlotSet; }
+  const int* getEHSpillSlot() const {
+    assert(EHSpillSlotSet && "EH Spill slot not set");
+    return EHSpillSlot;
+  }
+
+  void setReturnStackOffset(unsigned value) {
+    assert(!ReturnStackOffsetSet && "Return stack offset set twice");
+    ReturnStackOffset = value;
+    ReturnStackOffsetSet = true;
+  }
+
+  unsigned getReturnStackOffset() const {
+    assert(ReturnStackOffsetSet && "Return stack offset not set");
+    return ReturnStackOffset;
+  }
+
+  bool isLargeFrame(const MachineFunction &MF) const;
+
+  std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>> &
+  getSpillLabels() {
+    return SpillLabels;
+  }
+};
+} // End llvm namespace
+
+#endif // XCOREMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
new file mode 100644
index 0000000..316c82c
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -0,0 +1,329 @@
+//===-- XCoreRegisterInfo.cpp - XCore Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreRegisterInfo.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+XCoreRegisterInfo::XCoreRegisterInfo()
+  : XCoreGenRegisterInfo(XCore::LR) {
+}
+
+// helper functions
+static inline bool isImmUs(unsigned val) {
+  return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+
+static void InsertFPImmInst(MachineBasicBlock::iterator II,
+                            const XCoreInstrInfo &TII,
+                            unsigned Reg, unsigned FrameReg, int Offset ) {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  switch (MI.getOpcode()) {
+  case XCore::LDWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+          .addReg(FrameReg)
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addReg(FrameReg)
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+          .addReg(FrameReg)
+          .addImm(Offset);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+static void InsertFPConstInst(MachineBasicBlock::iterator II,
+                              const XCoreInstrInfo &TII,
+                              unsigned Reg, unsigned FrameReg,
+                              int Offset, RegScavenger *RS ) {
+  assert(RS && "requiresRegisterScavenging failed");
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+  RS->setUsed(ScratchOffset);
+  TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+
+  switch (MI.getOpcode()) {
+  case XCore::LDWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+          .addReg(FrameReg)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addReg(FrameReg)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+          .addReg(FrameReg)
+          .addReg(ScratchOffset, RegState::Kill);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+static void InsertSPImmInst(MachineBasicBlock::iterator II,
+                            const XCoreInstrInfo &TII,
+                            unsigned Reg, int Offset) {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  bool isU6 = isImmU6(Offset);
+
+  switch (MI.getOpcode()) {
+  int NewOpcode;
+  case XCore::LDWFI:
+    NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+    BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+    BuildMI(MBB, II, dl, TII.get(NewOpcode))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+    BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+          .addImm(Offset);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+static void InsertSPConstInst(MachineBasicBlock::iterator II,
+                                const XCoreInstrInfo &TII,
+                                unsigned Reg, int Offset, RegScavenger *RS ) {
+  assert(RS && "requiresRegisterScavenging failed");
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  unsigned OpCode = MI.getOpcode();
+
+  unsigned ScratchBase;
+  if (OpCode==XCore::STWFI) {
+    ScratchBase = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+    RS->setUsed(ScratchBase);
+  } else
+    ScratchBase = Reg;
+  BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0);
+  unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+  RS->setUsed(ScratchOffset);
+  TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+
+  switch (OpCode) {
+  case XCore::LDWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+          .addReg(ScratchBase, RegState::Kill)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addReg(ScratchBase, RegState::Kill)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+          .addReg(ScratchBase, RegState::Kill)
+          .addReg(ScratchOffset, RegState::Kill);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
+  return MF.getMMI().hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
+}
+
+const MCPhysReg* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  // The callee saved registers LR & FP are explicitly handled during
+  // emitPrologue & emitEpilogue and related functions.
+  static const MCPhysReg CalleeSavedRegs[] = {
+    XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+    XCore::R8, XCore::R9, XCore::R10,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsFP[] = {
+    XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+    XCore::R8, XCore::R9,
+    0
+  };
+  const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
+  if (TFI->hasFP(*MF))
+    return CalleeSavedRegsFP;
+  return CalleeSavedRegs;
+}
+
+BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  Reserved.set(XCore::CP);
+  Reserved.set(XCore::DP);
+  Reserved.set(XCore::SP);
+  Reserved.set(XCore::LR);
+  if (TFI->hasFP(MF)) {
+    Reserved.set(XCore::R10);
+  }
+  return Reserved;
+}
+
+bool
+XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  return false;
+}
+
+void
+XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+  MachineInstr &MI = *II;
+  MachineOperand &FrameOp = MI.getOperand(FIOperandNum);
+  int FrameIndex = FrameOp.getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const XCoreInstrInfo &TII =
+          *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+  int StackSize = MF.getFrameInfo()->getStackSize();
+
+  #ifndef NDEBUG
+  DEBUG(errs() << "\nFunction         : " 
+        << MF.getName() << "\n");
+  DEBUG(errs() << "<--------->\n");
+  DEBUG(MI.print(errs()));
+  DEBUG(errs() << "FrameIndex         : " << FrameIndex << "\n");
+  DEBUG(errs() << "FrameOffset        : " << Offset << "\n");
+  DEBUG(errs() << "StackSize          : " << StackSize << "\n");
+  #endif
+
+  Offset += StackSize;
+
+  unsigned FrameReg = getFrameRegister(MF);
+
+  // Special handling of DBG_VALUE instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // fold constant into offset.
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+  
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+  DEBUG(errs() << "Offset             : " << Offset << "\n" << "<--------->\n");
+  Offset/=4;
+  
+  unsigned Reg = MI.getOperand(0).getReg();
+  assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
+
+  if (TFI->hasFP(MF)) {
+    if (isImmUs(Offset))
+      InsertFPImmInst(II, TII, Reg, FrameReg, Offset);
+    else
+      InsertFPConstInst(II, TII, Reg, FrameReg, Offset, RS);
+  } else {
+    if (isImmU16(Offset))
+      InsertSPImmInst(II, TII, Reg, Offset);
+    else
+      InsertSPConstInst(II, TII, Reg, Offset, RS);
+  }
+  // Erase old instruction.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MBB.erase(II);
+}
+
+
+unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
new file mode 100644
index 0000000..aa617a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -0,0 +1,56 @@
+//===-- XCoreRegisterInfo.h - XCore Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREREGISTERINFO_H
+#define XCOREREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "XCoreGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
+public:
+  XCoreRegisterInfo();
+
+  /// Code Generation virtual methods...
+
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  //! Return whether to emit frame moves
+  static bool needsFrameMoves(const MachineFunction &MF);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
new file mode 100644
index 0000000..6694b28
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
@@ -0,0 +1,59 @@
+//===-- XCoreRegisterInfo.td - XCore Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the XCore register file 
+//===----------------------------------------------------------------------===//
+
+class XCoreReg<string n> : Register<n> {
+  field bits<4> Num;
+  let Namespace = "XCore";
+}
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<4> num, string n> : XCoreReg<n> {
+  let Num = num;
+}
+
+// CPU registers
+def R0  : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1  : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2  : Ri< 2, "r2">, DwarfRegNum<[2]>; 
+def R3  : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4  : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5  : Ri< 5, "r5">, DwarfRegNum<[5]>; 
+def R6  : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7  : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8  : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9  : Ri< 9, "r9">, DwarfRegNum<[9]>; 
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>; 
+def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
+def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
+def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
+
+// Register classes.
+//
+def GRRegs : RegisterClass<"XCore", [i32], 32,
+  // Return values and arguments
+  (add R0, R1, R2, R3,
+  // Callee save
+  R4, R5, R6, R7, R8, R9, R10,
+  // Not preserved across procedure calls
+  R11)>;
+
+// Reserved
+def RRegs : RegisterClass<"XCore", [i32], 32,
+  (add R0, R1, R2, R3,
+   R4, R5, R6, R7, R8, R9, R10,
+   R11, CP, DP, SP, LR)> {
+  let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
new file mode 100644
index 0000000..91b33fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -0,0 +1,58 @@
+//===-- XCoreSelectionDAGInfo.cpp - XCore SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-selectiondag-info"
+
+XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL)
+    : TargetSelectionDAGInfo(&DL) {}
+
+XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
+}
+
+SDValue XCoreSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+                        SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+                        bool isVolatile, bool AlwaysInline,
+                        MachinePointerInfo DstPtrInfo,
+                        MachinePointerInfo SrcPtrInfo) const
+{
+  unsigned SizeBitWidth = Size.getValueType().getSizeInBits();
+  // Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
+  if (!AlwaysInline && (Align & 3) == 0 &&
+      DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
+    const TargetLowering &TLI = *DAG.getTarget().getTargetLowering();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst; Args.push_back(Entry);
+    Entry.Node = Src; Args.push_back(Entry);
+    Entry.Node = Size; Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl).setChain(Chain)
+      .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                 Type::getVoidTy(*DAG.getContext()),
+                 DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
+                 std::move(Args), 0)
+      .setDiscardResult();
+
+    std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+    return CallResult.second;
+  }
+
+  // Otherwise have the target-independent code call memcpy.
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
new file mode 100644
index 0000000..0079de1
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -0,0 +1,40 @@
+//===-- XCoreSelectionDAGInfo.h - XCore SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the XCore subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORESELECTIONDAGINFO_H
+#define XCORESELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class XCoreTargetMachine;
+
+class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit XCoreSelectionDAGInfo(const DataLayout &DL);
+  ~XCoreSelectionDAGInfo();
+
+  SDValue
+  EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                          SDValue Chain,
+                          SDValue Op1, SDValue Op2,
+                          SDValue Op3, unsigned Align, bool isVolatile,
+                          bool AlwaysInline,
+                          MachinePointerInfo DstPtrInfo,
+                          MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
new file mode 100644
index 0000000..7227411
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -0,0 +1,32 @@
+//===-- XCoreSubtarget.cpp - XCore Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreSubtarget.h"
+#include "XCore.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "XCoreGenSubtargetInfo.inc"
+
+void XCoreSubtarget::anchor() { }
+
+XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &CPU,
+                               const std::string &FS, const TargetMachine &TM)
+    : XCoreGenSubtargetInfo(TT, CPU, FS),
+      DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
+      InstrInfo(), FrameLowering(*this), TLInfo(TM), TSInfo(DL) {}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
new file mode 100644
index 0000000..1e9810b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -0,0 +1,62 @@
+//===-- XCoreSubtarget.h - Define Subtarget for the XCore -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORESUBTARGET_H
+#define XCORESUBTARGET_H
+
+#include "XCoreFrameLowering.h"
+#include "XCoreISelLowering.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "XCoreGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class XCoreSubtarget : public XCoreGenSubtargetInfo {
+  virtual void anchor();
+  const DataLayout DL;       // Calculates type size & alignment
+  XCoreInstrInfo InstrInfo;
+  XCoreFrameLowering FrameLowering;
+  XCoreTargetLowering TLInfo;
+  XCoreSelectionDAGInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  XCoreSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS, const TargetMachine &TM);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const XCoreFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const XCoreTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const XCoreSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const DataLayout *getDataLayout() const { return &DL; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
new file mode 100644
index 0000000..8d8bb38
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -0,0 +1,80 @@
+//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+#include "XCore.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+/// XCoreTargetMachine ctor - Create an ILP32 architecture model
+///
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+
+namespace {
+/// XCore Code Generator Pass Configuration Options.
+class XCorePassConfig : public TargetPassConfig {
+public:
+  XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  XCoreTargetMachine &getXCoreTargetMachine() const {
+    return getTM<XCoreTargetMachine>();
+  }
+
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new XCorePassConfig(this, PM);
+}
+
+bool XCorePassConfig::addPreISel() {
+  addPass(createXCoreLowerThreadLocalPass());
+  return false;
+}
+
+bool XCorePassConfig::addInstSelector() {
+  addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool XCorePassConfig::addPreEmitPass() {
+  addPass(createXCoreFrameToArgsOffsetEliminationPass());
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTarget() {
+  RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
+}
+
+void XCoreTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our XCore pass. This
+  // allows the XCore pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createXCoreTargetTransformInfoPass(this));
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
new file mode 100644
index 0000000..14c43bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
@@ -0,0 +1,58 @@
+//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETMACHINE_H
+#define XCORETARGETMACHINE_H
+
+#include "XCoreSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class XCoreTargetMachine : public LLVMTargetMachine {
+  XCoreSubtarget Subtarget;
+public:
+  XCoreTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+
+  const XCoreInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
+  const XCoreFrameLowering *getFrameLowering() const override {
+    return getSubtargetImpl()->getFrameLowering();
+  }
+  const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const XCoreTargetLowering *getTargetLowering() const override {
+    return getSubtargetImpl()->getTargetLowering();
+  }
+  const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override {
+    return getSubtargetImpl()->getSelectionDAGInfo();
+  }
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
+  }
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  void addAnalysisPasses(PassManagerBase &PM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
new file mode 100644
index 0000000..cfd3302
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -0,0 +1,179 @@
+//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetObjectFile.h"
+#include "XCoreSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+
+void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+  BSSSection =
+    Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getBSS());
+  BSSSectionLarge =
+    Ctx.getELFSection(".dp.bss.large", ELF::SHT_NOBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getBSS());
+  DataSection =
+    Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getDataRel());
+  DataSectionLarge =
+    Ctx.getELFSection(".dp.data.large", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getDataRel());
+  DataRelROSection =
+    Ctx.getELFSection(".dp.rodata", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getReadOnlyWithRel());
+  DataRelROSectionLarge =
+    Ctx.getELFSection(".dp.rodata.large", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getReadOnlyWithRel());
+  ReadOnlySection =
+    Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getReadOnlyWithRel());
+  ReadOnlySectionLarge =
+    Ctx.getELFSection(".cp.rodata.large", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getReadOnlyWithRel());
+  MergeableConst4Section = 
+    Ctx.getELFSection(".cp.rodata.cst4", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getMergeableConst4());
+  MergeableConst8Section = 
+    Ctx.getELFSection(".cp.rodata.cst8", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getMergeableConst8());
+  MergeableConst16Section = 
+    Ctx.getELFSection(".cp.rodata.cst16", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getMergeableConst16());
+  CStringSection =
+    Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getReadOnlyWithRel());
+  // TextSection       - see MObjectFileInfo.cpp
+  // StaticCtorSection - see MObjectFileInfo.cpp
+  // StaticDtorSection - see MObjectFileInfo.cpp
+ }
+
+static unsigned getXCoreSectionType(SectionKind K) {
+  if (K.isBSS())
+    return ELF::SHT_NOBITS;
+  return ELF::SHT_PROGBITS;
+}
+
+static unsigned getXCoreSectionFlags(SectionKind K, bool IsCPRel) {
+  unsigned Flags = 0;
+
+  if (!K.isMetadata())
+    Flags |= ELF::SHF_ALLOC;
+
+  if (K.isText())
+    Flags |= ELF::SHF_EXECINSTR;
+  else if (IsCPRel)
+    Flags |= ELF::XCORE_SHF_CP_SECTION;
+  else
+    Flags |= ELF::XCORE_SHF_DP_SECTION;
+
+  if (K.isWriteable())
+    Flags |= ELF::SHF_WRITE;
+
+  if (K.isMergeableCString() || K.isMergeableConst4() ||
+      K.isMergeableConst8() || K.isMergeableConst16())
+    Flags |= ELF::SHF_MERGE;
+
+  if (K.isMergeableCString())
+    Flags |= ELF::SHF_STRINGS;
+
+  return Flags;
+}
+
+const MCSection *
+XCoreTargetObjectFile::getExplicitSectionGlobal(const GlobalValue *GV,
+                                                SectionKind Kind, Mangler &Mang,
+                                                const TargetMachine &TM) const {
+  StringRef SectionName = GV->getSection();
+  // Infer section flags from the section name if we can.
+  bool IsCPRel = SectionName.startswith(".cp.");
+  if (IsCPRel && !Kind.isReadOnly())
+    report_fatal_error("Using .cp. section for writeable object.");
+  return getContext().getELFSection(SectionName, getXCoreSectionType(Kind),
+                                    getXCoreSectionFlags(Kind, IsCPRel), Kind);
+}
+
+const MCSection *XCoreTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+                       const TargetMachine &TM) const{
+
+  bool UseCPRel = GV->isLocalLinkage(GV->getLinkage());
+
+  if (Kind.isText())                    return TextSection;
+  if (UseCPRel) {
+    if (Kind.isMergeable1ByteCString()) return CStringSection;
+    if (Kind.isMergeableConst4())       return MergeableConst4Section;
+    if (Kind.isMergeableConst8())       return MergeableConst8Section;
+    if (Kind.isMergeableConst16())      return MergeableConst16Section;
+  }
+  Type *ObjType = GV->getType()->getPointerElementType();
+  if (TM.getCodeModel() == CodeModel::Small ||
+      !ObjType->isSized() ||
+      TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) {
+    if (Kind.isReadOnly())              return UseCPRel? ReadOnlySection
+                                                       : DataRelROSection;
+    if (Kind.isBSS() || Kind.isCommon())return BSSSection;
+    if (Kind.isDataRel())               return DataSection;
+    if (Kind.isReadOnlyWithRel())       return DataRelROSection;
+  } else {
+    if (Kind.isReadOnly())              return UseCPRel? ReadOnlySectionLarge
+                                                       : DataRelROSectionLarge;
+    if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge;
+    if (Kind.isDataRel())               return DataSectionLarge;
+    if (Kind.isReadOnlyWithRel())       return DataRelROSectionLarge;
+  }
+
+  assert((Kind.isThreadLocal() || Kind.isCommon()) && "Unknown section kind");
+  report_fatal_error("Target does not support TLS or Common sections");
+}
+
+const MCSection *
+XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind,
+                                             const Constant *C) const {
+  if (Kind.isMergeableConst4())           return MergeableConst4Section;
+  if (Kind.isMergeableConst8())           return MergeableConst8Section;
+  if (Kind.isMergeableConst16())          return MergeableConst16Section;
+  assert((Kind.isReadOnly() || Kind.isReadOnlyWithRel()) &&
+         "Unknown section kind");
+  // We assume the size of the object is never greater than CodeModelLargeSize.
+  // To handle CodeModelLargeSize changes to AsmPrinter would be required.
+  return ReadOnlySection;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
new file mode 100644
index 0000000..d389e55
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -0,0 +1,42 @@
+//===-- XCoreTargetObjectFile.h - XCore Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+#define LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+static const unsigned CodeModelLargeSize = 256;
+
+  class XCoreTargetObjectFile : public TargetLoweringObjectFileELF {
+   const MCSection *BSSSectionLarge;
+   const MCSection *DataSectionLarge;
+   const MCSection *ReadOnlySectionLarge;
+   const MCSection *DataRelROSectionLarge;
+  public:
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    const MCSection *
+      getExplicitSectionGlobal(const GlobalValue *GV,
+                               SectionKind Kind, Mangler &Mang,
+                               const TargetMachine &TM) const override;
+
+    const MCSection *
+      SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                             Mangler &Mang,
+                             const TargetMachine &TM) const override;
+
+    const MCSection *getSectionForConstant(SectionKind Kind,
+                                           const Constant *C) const override;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
new file mode 100644
index 0000000..0a394da
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- XCoreTargetStreamer.h - XCore Target Streamer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETSTREAMER_H
+#define XCORETARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class XCoreTargetStreamer : public MCTargetStreamer {
+public:
+  XCoreTargetStreamer(MCStreamer &S);
+  virtual ~XCoreTargetStreamer();
+  virtual void emitCCTopData(StringRef Name) = 0;
+  virtual void emitCCTopFunction(StringRef Name) = 0;
+  virtual void emitCCBottomData(StringRef Name) = 0;
+  virtual void emitCCBottomFunction(StringRef Name) = 0;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.cpp
new file mode 100644
index 0000000..80d193d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.cpp
@@ -0,0 +1,80 @@
+//===-- XCoreTargetTransformInfo.cpp - XCore specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// XCore target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "xcoretti"
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeXCoreTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class XCoreTTI final : public ImmutablePass, public TargetTransformInfo {
+public:
+  XCoreTTI() : ImmutablePass(ID) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  XCoreTTI(const XCoreTargetMachine *TM)
+      : ImmutablePass(ID) {
+    initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() override {
+    pushTTIStack(this);
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector) const override {
+    if (Vector) {
+       return 0;
+    }
+    return 12;
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(XCoreTTI, TargetTransformInfo, "xcoretti",
+                   "XCore Target Transform Info", true, true, false)
+char XCoreTTI::ID = 0;
+
+
+ImmutablePass *
+llvm::createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM) {
+  return new XCoreTTI(TM);
+}